// Utf8.java, created Mon Feb  5 23:23:22 2001 by joewhaley
// Copyright (C) 2001-3 John Whaley <jwhaley@alum.mit.edu>
// Licensed under the terms of the GNU LGPL; see COPYING for details.
package jwutil.strings;

6   /***
7    * Utf8 conversion routines
8    *
9    * @author  John Whaley <jwhaley@alum.mit.edu>
10   * @version $Id: Utf8.java 2461 2006-06-06 08:02:27Z joewhaley $
11   */
12  public abstract class Utf8 {
13  
14      //// Utf8 conversion routines
15      
16      /***
17       * Strictly check the format of the utf8/pseudo-utf8 byte array in
18       * fromUtf8.
19       */
20      static final boolean STRICTLY_CHECK_FORMAT = false;
21      /***
22       * Set fromUtf8 to not throw an exception when given a normal utf8
23       * byte array.
24       */
25      static final boolean ALLOW_NORMAL_UTF8 = false;
26      /***
27       * Set fromUtf8 to not throw an exception when given a pseudo utf8
28       * byte array.
29       */
30      static final boolean ALLOW_PSEUDO_UTF8 = true;
31      /***
32       * Set toUtf8 to write in pseudo-utf8 (rather than normal utf8).
33       */
34      static final boolean WRITE_PSEUDO_UTF8 = true;
35  
36      /***
37       * Convert the given sequence of (pseudo-)utf8 formatted bytes
38       * into a String.
39       *
40       * The acceptable input formats are controlled by the
41       * STRICTLY_CHECK_FORMAT, ALLOW_NORMAL_UTF8, and ALLOW_PSEUDO_UTF8
42       * flags.
43       *
44       * @param utf8 (pseudo-)utf8 byte array
45       * @throws UTFDataFormatError if the (pseudo-)utf8 byte array is not valid (pseudo-)utf8
46       * @return unicode string
47       */
48      public static String fromUtf8(byte[] utf8)
49      throws UTFDataFormatError {
50          char[] result = new char[utf8.length];
51          int result_index = 0;
52          for (int i=0, n=utf8.length; i<n; ) {
53              byte b = utf8[i++];
54              if (STRICTLY_CHECK_FORMAT && !ALLOW_NORMAL_UTF8)
55                  if (b == 0)
56                      throw new UTFDataFormatError("0 byte encountered at location "+(i-1));
57              if (b >= 0) {  // < 0x80 unsigned
58                  // in the range '\001' to '\177'
59                  result[result_index++] = (char)b;
60                  continue;
61              }
62              try {
63                  byte nb = utf8[i++];
64                  if (b < -32) {  // < 0xe0 unsigned
65                      // '\000' or in the range '\200' to '\u07FF'
66                      char c = result[result_index++] =
67                          (char)(((b & 0x1f) << 6) | (nb & 0x3f));
68                      if (STRICTLY_CHECK_FORMAT) {
69                          if (((b & 0xe0) != 0xc0) ||
70                              ((nb & 0xc0) != 0x80))
71                              throw new UTFDataFormatError("invalid marker bits for double byte char at location "+(i-2));
72                          if (c < '\200') {
73                              if (!ALLOW_PSEUDO_UTF8 || (c != '\000'))
74                                  throw new UTFDataFormatError("encountered double byte char that should have been single byte at location "+(i-2));
75                          } else if (c > '\u07FF')
76                              throw new UTFDataFormatError("encountered double byte char that should have been triple byte at location "+(i-2));
77                      }
78                  } else {
79                      byte nnb = utf8[i++];
80                      // in the range '\u0800' to '\uFFFF'
81                      char c = result[result_index++] =
82                          (char)(((b & 0x0f) << 12) |
83                                 ((nb & 0x3f) << 6) |
84                                 (nnb & 0x3f));
85                      if (STRICTLY_CHECK_FORMAT) {
86                          if (((b & 0xf0) != 0xe0) ||
87                              ((nb & 0xc0) != 0x80) ||
88                              ((nnb & 0xc0) != 0x80))
89                              throw new UTFDataFormatError("invalid marker bits for triple byte char at location "+(i-3));
90                          if (c < '\u0800')
91                              throw new UTFDataFormatError("encountered triple byte char that should have been fewer bytes at location "+(i-3));
92                      }
93                  }
94              } catch (ArrayIndexOutOfBoundsException e) {
95                  throw new UTFDataFormatError("unexpected end at location "+i);
96              }
97          }
98          return new String(result, 0, result_index);
99      }
100 
101     /***
102      * Convert the given String into a sequence of (pseudo-)utf8
103      * formatted bytes.
104      *
105      * The output format is controlled by the WRITE_PSEUDO_UTF8 flag.
106      *
107      * @param s String to convert
108      * @return array containing sequence of (pseudo-)utf8 formatted bytes
109      */
110     public static byte[] toUtf8(String s) {
111         byte[] result = new byte[lengthUtf8(s)];
112         int result_index = 0;
113         for (int i = 0, n = s.length(); i < n; ++i) {
114             char c = (char)s.charAt(i);
115             // in all shifts below, c is an (unsigned) char,
116             // so either >>> or >> is ok
117             if (((!WRITE_PSEUDO_UTF8) || (c >= 0x0001)) && (c <= 0x007F))
118                 result[result_index++] = (byte)c;
119             else if (c > 0x07FF) {
120                 result[result_index++] = (byte)(0xe0 | (byte)(c >> 12));
121                 result[result_index++] = (byte)(0x80 | ((c & 0xfc0) >> 6));
122                 result[result_index++] = (byte)(0x80 | (c & 0x3f));
123             } else {
124                 result[result_index++] = (byte)(0xc0 | (byte)(c >> 6));
125                 result[result_index++] = (byte)(0x80 | (c & 0x3f));
126             }
127         }
128         return result;
129     }
130 
131     /***
132      * Converts a character to utf8 in the given byte array.
133      * Returns the new offset in the byte array.
134      */
135     public static int toUtf8(char c, byte[] to, int off, int end) {
136         if ((c >= 0x0001) && (c <= 0x007F)) {
137             to[off++] = (byte) c;
138         } else {
139             if (c > 0x07FF) {
140                 to[off++] = (byte)(0xe0 | (byte)(c >> 12));
141                 if (off == end) return -1;
142                 to[off++] = (byte)(0x80 | ((c & 0xfc0) >> 6));
143                 if (off == end) return -1;
144                 to[off++] = (byte)(0x80 | (c & 0x3f));
145             } else {
146                 to[off++] = (byte)(0xc0 | (byte)(c >> 6));
147                 if (off == end) return -1;
148                 to[off++] = (byte)(0x80 | (c & 0x3f));
149             }
150         }
151         return off;
152     }
153     
154     /***
155      * Returns the length of a string's utf8 encoded form.
156      */
157     public static int lengthUtf8(String s) {
158         int utflen = 0;
159         for (int i = 0, n = s.length(); i < n; ++i) {
160             int c = s.charAt(i);
161             if (((!WRITE_PSEUDO_UTF8) || (c >= 0x0001)) && (c <= 0x007F))
162                 ++utflen;
163             else if (c > 0x07FF)
164                 utflen += 3;
165             else
166                 utflen += 2;
167         }
168         return utflen;
169     }
170 
171     /***
172      * Returns the length of a string's utf8 encoded form.
173      */
174     public static int lengthUtf8(char[] cs, int off, int len) {
175         int result = 0;
176         for (int i = 0; i < len; ++i) {
177             char c = cs[off + i];
178             if ((c >= 0x0001) && (c <= 0x007F)) {
179                 ++result;
180             } else {
181                 if (c > 0x07FF) {
182                     result += 3;
183                 } else {
184                     result += 2;
185                 }
186             }
187         }
188         return result;
189     }
190     
191     /***
192      * Check whether the given sequence of bytes is valid (pseudo-)utf8.
193      *
194      * @param bytes byte array to check
195      * @return true iff the given sequence is valid (pseudo-)utf8.
196      */
197     public static boolean checkUtf8(byte[] bytes) {
198         for (int i=0, n=bytes.length; i<n; ) {
199             byte b = bytes[i++];
200             if (STRICTLY_CHECK_FORMAT && !ALLOW_NORMAL_UTF8)
201                 if (b == 0) return false;
202             if (b >= 0) {  // < 0x80 unsigned
203                 // in the range '\001' to '\177'
204                 continue;
205             }
206             try {
207                 byte nb = bytes[i++];
208                 if (b < -32) {  // < 0xe0 unsigned
209                     // '\000' or in the range '\200' to '\u07FF'
210                     char c = (char)(((b & 0x1f) << 6) | (nb & 0x3f));
211                     if (STRICTLY_CHECK_FORMAT) {
212                         if (((b & 0xe0) != 0xc0) ||
213                             ((nb & 0xc0) != 0x80))
214                             return false;
215                         if (c < '\200') {
216                             if (!ALLOW_PSEUDO_UTF8 || (c != '\000'))
217                                 return false;
218                             } else if (c > '\u07FF')
219                                 return false;
220                     }
221                 } else {
222                     byte nnb = bytes[i++];
223                     // in the range '\u0800' to '\uFFFF'
224                     char c = (char)(((b & 0x0f) << 12) |
225                                     ((nb & 0x3f) << 6) |
226                                     (nnb & 0x3f));
227                     if (STRICTLY_CHECK_FORMAT) {
228                         if (((b & 0xf0) != 0xe0) ||
229                             ((nb & 0xc0) != 0x80) ||
230                             ((nnb & 0xc0) != 0x80))
231                             return false;
232                         if (c < '\u0800')
233                             return false;
234                     }
235                 }
236             } catch (ArrayIndexOutOfBoundsException e) {
237                 return false;
238             }
239         }
240         return true;
241     }
242 
243 }