1
2
3
4 package jwutil.strings;
5
6 /***
7 * Utf8 conversion routines
8 *
9 * @author John Whaley <jwhaley@alum.mit.edu>
10 * @version $Id: Utf8.java 2461 2006-06-06 08:02:27Z joewhaley $
11 */
12 public abstract class Utf8 {
13
14
15
16 /***
17 * Strictly check the format of the utf8/pseudo-utf8 byte array in
18 * fromUtf8.
19 */
20 static final boolean STRICTLY_CHECK_FORMAT = false;
21 /***
22 * Set fromUtf8 to not throw an exception when given a normal utf8
23 * byte array.
24 */
25 static final boolean ALLOW_NORMAL_UTF8 = false;
26 /***
27 * Set fromUtf8 to not throw an exception when given a pseudo utf8
28 * byte array.
29 */
30 static final boolean ALLOW_PSEUDO_UTF8 = true;
31 /***
32 * Set toUtf8 to write in pseudo-utf8 (rather than normal utf8).
33 */
34 static final boolean WRITE_PSEUDO_UTF8 = true;
35
36 /***
37 * Convert the given sequence of (pseudo-)utf8 formatted bytes
38 * into a String.
39 *
40 * The acceptable input formats are controlled by the
41 * STRICTLY_CHECK_FORMAT, ALLOW_NORMAL_UTF8, and ALLOW_PSEUDO_UTF8
42 * flags.
43 *
44 * @param utf8 (pseudo-)utf8 byte array
45 * @throws UTFDataFormatError if the (pseudo-)utf8 byte array is not valid (pseudo-)utf8
46 * @return unicode string
47 */
48 public static String fromUtf8(byte[] utf8)
49 throws UTFDataFormatError {
50 char[] result = new char[utf8.length];
51 int result_index = 0;
52 for (int i=0, n=utf8.length; i<n; ) {
53 byte b = utf8[i++];
54 if (STRICTLY_CHECK_FORMAT && !ALLOW_NORMAL_UTF8)
55 if (b == 0)
56 throw new UTFDataFormatError("0 byte encountered at location "+(i-1));
57 if (b >= 0) {
58
59 result[result_index++] = (char)b;
60 continue;
61 }
62 try {
63 byte nb = utf8[i++];
64 if (b < -32) {
65
66 char c = result[result_index++] =
67 (char)(((b & 0x1f) << 6) | (nb & 0x3f));
68 if (STRICTLY_CHECK_FORMAT) {
69 if (((b & 0xe0) != 0xc0) ||
70 ((nb & 0xc0) != 0x80))
71 throw new UTFDataFormatError("invalid marker bits for double byte char at location "+(i-2));
72 if (c < '\200') {
73 if (!ALLOW_PSEUDO_UTF8 || (c != '\000'))
74 throw new UTFDataFormatError("encountered double byte char that should have been single byte at location "+(i-2));
75 } else if (c > '\u07FF')
76 throw new UTFDataFormatError("encountered double byte char that should have been triple byte at location "+(i-2));
77 }
78 } else {
79 byte nnb = utf8[i++];
80
81 char c = result[result_index++] =
82 (char)(((b & 0x0f) << 12) |
83 ((nb & 0x3f) << 6) |
84 (nnb & 0x3f));
85 if (STRICTLY_CHECK_FORMAT) {
86 if (((b & 0xf0) != 0xe0) ||
87 ((nb & 0xc0) != 0x80) ||
88 ((nnb & 0xc0) != 0x80))
89 throw new UTFDataFormatError("invalid marker bits for triple byte char at location "+(i-3));
90 if (c < '\u0800')
91 throw new UTFDataFormatError("encountered triple byte char that should have been fewer bytes at location "+(i-3));
92 }
93 }
94 } catch (ArrayIndexOutOfBoundsException e) {
95 throw new UTFDataFormatError("unexpected end at location "+i);
96 }
97 }
98 return new String(result, 0, result_index);
99 }
100
101 /***
102 * Convert the given String into a sequence of (pseudo-)utf8
103 * formatted bytes.
104 *
105 * The output format is controlled by the WRITE_PSEUDO_UTF8 flag.
106 *
107 * @param s String to convert
108 * @return array containing sequence of (pseudo-)utf8 formatted bytes
109 */
110 public static byte[] toUtf8(String s) {
111 byte[] result = new byte[lengthUtf8(s)];
112 int result_index = 0;
113 for (int i = 0, n = s.length(); i < n; ++i) {
114 char c = (char)s.charAt(i);
115
116
117 if (((!WRITE_PSEUDO_UTF8) || (c >= 0x0001)) && (c <= 0x007F))
118 result[result_index++] = (byte)c;
119 else if (c > 0x07FF) {
120 result[result_index++] = (byte)(0xe0 | (byte)(c >> 12));
121 result[result_index++] = (byte)(0x80 | ((c & 0xfc0) >> 6));
122 result[result_index++] = (byte)(0x80 | (c & 0x3f));
123 } else {
124 result[result_index++] = (byte)(0xc0 | (byte)(c >> 6));
125 result[result_index++] = (byte)(0x80 | (c & 0x3f));
126 }
127 }
128 return result;
129 }
130
131 /***
132 * Converts a character to utf8 in the given byte array.
133 * Returns the new offset in the byte array.
134 */
135 public static int toUtf8(char c, byte[] to, int off, int end) {
136 if ((c >= 0x0001) && (c <= 0x007F)) {
137 to[off++] = (byte) c;
138 } else {
139 if (c > 0x07FF) {
140 to[off++] = (byte)(0xe0 | (byte)(c >> 12));
141 if (off == end) return -1;
142 to[off++] = (byte)(0x80 | ((c & 0xfc0) >> 6));
143 if (off == end) return -1;
144 to[off++] = (byte)(0x80 | (c & 0x3f));
145 } else {
146 to[off++] = (byte)(0xc0 | (byte)(c >> 6));
147 if (off == end) return -1;
148 to[off++] = (byte)(0x80 | (c & 0x3f));
149 }
150 }
151 return off;
152 }
153
154 /***
155 * Returns the length of a string's utf8 encoded form.
156 */
157 public static int lengthUtf8(String s) {
158 int utflen = 0;
159 for (int i = 0, n = s.length(); i < n; ++i) {
160 int c = s.charAt(i);
161 if (((!WRITE_PSEUDO_UTF8) || (c >= 0x0001)) && (c <= 0x007F))
162 ++utflen;
163 else if (c > 0x07FF)
164 utflen += 3;
165 else
166 utflen += 2;
167 }
168 return utflen;
169 }
170
171 /***
172 * Returns the length of a string's utf8 encoded form.
173 */
174 public static int lengthUtf8(char[] cs, int off, int len) {
175 int result = 0;
176 for (int i = 0; i < len; ++i) {
177 char c = cs[off + i];
178 if ((c >= 0x0001) && (c <= 0x007F)) {
179 ++result;
180 } else {
181 if (c > 0x07FF) {
182 result += 3;
183 } else {
184 result += 2;
185 }
186 }
187 }
188 return result;
189 }
190
191 /***
192 * Check whether the given sequence of bytes is valid (pseudo-)utf8.
193 *
194 * @param bytes byte array to check
195 * @return true iff the given sequence is valid (pseudo-)utf8.
196 */
197 public static boolean checkUtf8(byte[] bytes) {
198 for (int i=0, n=bytes.length; i<n; ) {
199 byte b = bytes[i++];
200 if (STRICTLY_CHECK_FORMAT && !ALLOW_NORMAL_UTF8)
201 if (b == 0) return false;
202 if (b >= 0) {
203
204 continue;
205 }
206 try {
207 byte nb = bytes[i++];
208 if (b < -32) {
209
210 char c = (char)(((b & 0x1f) << 6) | (nb & 0x3f));
211 if (STRICTLY_CHECK_FORMAT) {
212 if (((b & 0xe0) != 0xc0) ||
213 ((nb & 0xc0) != 0x80))
214 return false;
215 if (c < '\200') {
216 if (!ALLOW_PSEUDO_UTF8 || (c != '\000'))
217 return false;
218 } else if (c > '\u07FF')
219 return false;
220 }
221 } else {
222 byte nnb = bytes[i++];
223
224 char c = (char)(((b & 0x0f) << 12) |
225 ((nb & 0x3f) << 6) |
226 (nnb & 0x3f));
227 if (STRICTLY_CHECK_FORMAT) {
228 if (((b & 0xf0) != 0xe0) ||
229 ((nb & 0xc0) != 0x80) ||
230 ((nnb & 0xc0) != 0x80))
231 return false;
232 if (c < '\u0800')
233 return false;
234 }
235 }
236 } catch (ArrayIndexOutOfBoundsException e) {
237 return false;
238 }
239 }
240 return true;
241 }
242
243 }