1
2
3
4 package joeq.UTF;
5
6 import java.io.DataOutput;
7 import java.io.IOException;
8 import joeq.Class.jq_ClassFileConstants;
9 import joeq.Runtime.Debug;
10 import jwutil.collections.UnmodifiableIterator;
11 import jwutil.util.Assert;
12
/**
 * TODO: add synchronization. The static table, size and chains are
 * read and modified without any locking.
 *
 * @author John Whaley <jwhaley@alum.mit.edu>
 * @version $Id: Utf8.java 1931 2004-09-22 22:17:47Z joewhaley $
 */
19 public class Utf8 implements jq_ClassFileConstants {
20
21 public static
22
23 public static final int STARTING_TABLE_SIZE = 16384;
24 public static final int STARTING_HASH_SIZE = 9999;
25 public static final int STARTING_CHAIN_SIZE = 4;
26
27 public static Utf8[] table = new Utf8[STARTING_TABLE_SIZE];
28 public static int size = -1;
29 public static int[][] chains = new int[STARTING_HASH_SIZE][];
30
31 public static final Utf8 BYTE_DESC = Utf8.get((char)TC_BYTE+"");
32 public static final Utf8 CHAR_DESC = Utf8.get((char)TC_CHAR+"");
33 public static final Utf8 DOUBLE_DESC = Utf8.get((char)TC_DOUBLE+"");
34 public static final Utf8 FLOAT_DESC = Utf8.get((char)TC_FLOAT+"");
35 public static final Utf8 INT_DESC = Utf8.get((char)TC_INT+"");
36 public static final Utf8 LONG_DESC = Utf8.get((char)TC_LONG+"");
37 public static final Utf8 SHORT_DESC = Utf8.get((char)TC_SHORT+"");
38 public static final Utf8 BOOLEAN_DESC = Utf8.get((char)TC_BOOLEAN+"");
39 public static final Utf8 VOID_DESC = Utf8.get((char)TC_VOID+"");
40
41 public static Utf8 get(String s) {
42 return get(toUtf8(s));
43 }
44
45 public static Utf8 get(byte[] b) {
46 int id = getID(b);
47 return table[id];
48 }
49
50 public static Utf8 get(byte[] b, int startIndex, int endIndex) {
51 int id = getID(b, startIndex, endIndex);
52 return table[id];
53 }
54
55 public static int getID(byte[] b) {
56 int hash = hashCode(b);
57 int chain_index = Math.abs(hash) % chains.length;
58 int[] chain = chains[chain_index];
59 if (chain == null) {
60 chains[chain_index] = chain = new int[STARTING_CHAIN_SIZE];
61 return addToTable_helper(b, hash, chain, 0);
62 }
63 for (int i=0; i<chain.length; ++i) {
64 int id = chain[i]-1;
65 if (id == -1) {
66 return addToTable_helper(b, hash, chain, i);
67 }
68 Utf8 utf8 = table[id];
69
70 if (memcmp(utf8.data, b)) {
71
72 return id;
73 }
74 }
75 int[] newchain = new int[chain.length<<1];
76 System.arraycopy(chain, 0, newchain, 0, chain.length);
77 chains[chain_index] = newchain;
78 return addToTable_helper(b, hash, newchain, chain.length);
79
80
81
82 }
83
84 public static int getID(byte[] b, int startIndex, int endIndex) {
85 int hash = hashCode(b, startIndex, endIndex);
86 int chain_index = Math.abs(hash) % chains.length;
87 int[] chain = chains[chain_index];
88 if (chain == null) {
89 chains[chain_index] = chain = new int[STARTING_CHAIN_SIZE];
90 byte[] b2 = new byte[endIndex-startIndex];
91 System.arraycopy(b, startIndex, b2, 0, endIndex-startIndex);
92 return addToTable_helper(b2, hash, chain, 0);
93 }
94 for (int i=0; i<chain.length; ++i) {
95 int id = chain[i]-1;
96 if (id == -1) {
97 byte[] b2 = new byte[endIndex-startIndex];
98 System.arraycopy(b, startIndex, b2, 0, endIndex-startIndex);
99 return addToTable_helper(b2, hash, chain, i);
100 }
101 Utf8 utf8 = table[id];
102
103 if (memcmp(utf8.data, b, startIndex, endIndex)) {
104
105 return id;
106 }
107 }
108 int[] newchain = new int[chain.length<<1];
109 System.arraycopy(chain, 0, newchain, 0, chain.length);
110 chains[chain_index] = newchain;
111 byte[] b2 = new byte[endIndex-startIndex];
112 System.arraycopy(b, startIndex, b2, 0, endIndex-startIndex);
113 return addToTable_helper(b2, hash, newchain, chain.length);
114
115
116
117 }
118
119 public boolean isValidMethodDescriptor() {
120 if (data.length < 3)
121 return false;
122 if (data[0] != TC_PARAM)
123 return false;
124 int i=1;
125 while (data[i] != TC_PARAMEND) {
126 here:
127 switch (data[i]) {
128 case TC_BYTE:
129 case TC_CHAR:
130 case TC_DOUBLE:
131 case TC_FLOAT:
132 case TC_INT:
133 case TC_LONG:
134 case TC_SHORT:
135 case TC_BOOLEAN:
136 case TC_ARRAY:
137 break;
138 case TC_CLASS:
139 for (;;) {
140 if (++i == data.length)
141 return false;
142 if (data[i] == TC_CLASSEND)
143 break here;
144 }
145 default:
146 return false;
147 }
148 if (++i == data.length)
149 return false;
150 }
151 ++i;
152 return ((data[i] == TC_VOID) && (i == data.length-1)) ||
153 isValidTypeDescriptor(i);
154 }
155 public boolean isValidTypeDescriptor() {
156 return isValidTypeDescriptor(0);
157 }
158 private boolean isValidTypeDescriptor(int i) {
159 for (;;) {
160 if (data.length == i)
161 return false;
162 switch (data[i]) {
163 case TC_BYTE:
164 case TC_CHAR:
165 case TC_DOUBLE:
166 case TC_FLOAT:
167 case TC_INT:
168 case TC_LONG:
169 case TC_SHORT:
170 case TC_BOOLEAN:
171 return i == data.length-1;
172 case TC_ARRAY:
173 ++i; continue;
174 case TC_CLASS:
175 for (;;) {
176 if (++i == data.length)
177 return false;
178 if (data[i] == TC_CLASSEND)
179 return i == data.length-1;
180 }
181 default:
182 return false;
183 }
184 }
185 }
186
187 public boolean isDescriptor(byte desc) {
188 return (data.length > 0) && (data[0] == desc);
189 }
190
191 public Utf8 getArrayElementDescriptor() {
192 Assert._assert(isDescriptor(TC_ARRAY));
193 return get(data, 1, data.length);
194 }
195
196 public Utf8 getClassName() {
197 Assert._assert(isDescriptor(TC_CLASS));
198 return get(data, 1, data.length-1);
199 }
200
201 public Utf8 getAsArrayDescriptor() {
202 Assert._assert(isValidTypeDescriptor());
203
204 byte[] b = new byte[data.length+1];
205 b[0] = TC_ARRAY;
206 System.arraycopy(data, 0, b, 1, data.length);
207 return get(b);
208 }
209
210 public Utf8 getAsClassDescriptor() {
211 Assert._assert(data[0] != TC_ARRAY);
212
213 byte[] b = new byte[data.length+2];
214 b[0] = TC_CLASS;
215 System.arraycopy(data, 0, b, 1, data.length);
216 b[data.length+1] = TC_CLASSEND;
217 return get(b);
218 }
219
220 public MethodDescriptorIterator getParamDescriptors() {
221 return new MethodDescriptorIterator();
222 }
223
224 public class MethodDescriptorIterator extends UnmodifiableIterator {
225 int currentIndex;
226 MethodDescriptorIterator() {
227 Assert._assert(isDescriptor(TC_PARAM));
228 currentIndex = 0;
229 }
230 public boolean hasNext() {
231 return data[currentIndex+1] != TC_PARAMEND;
232 }
233 public Object next() { return nextUtf8(); }
234 public Utf8 nextUtf8() {
235 byte b = data[++currentIndex];
236 int startIndex = currentIndex;
237 while (b == TC_ARRAY) {
238 b = data[++currentIndex];
239 }
240 if (b == TC_CLASS) {
241 while (b != TC_CLASSEND) {
242 b = data[++currentIndex];
243 }
244 } else {
245 if ((b != TC_BYTE) &&
246 (b != TC_CHAR) &&
247 (b != TC_DOUBLE) &&
248 (b != TC_FLOAT) &&
249 (b != TC_INT) &&
250 (b != TC_LONG) &&
251 (b != TC_SHORT) &&
252 (b != TC_BOOLEAN)) {
253 throw new ClassFormatError("bad method descriptor: "+fromUtf8(data)+" index: "+currentIndex);
254 }
255 }
256 return get(data, startIndex, currentIndex+1);
257 }
258 public Utf8 getReturnDescriptor() {
259 Assert._assert(!hasNext());
260 return get(data, currentIndex+2, data.length);
261 }
262 }
263
264
265
266
267 private static boolean memcmp(byte[] b1, byte[] b2) {
268 if (b1.length != b2.length) return false;
269 for (int i=b1.length; --i>=0; ) {
270 if (b1[i] != b2[i]) return false;
271 }
272 return true;
273 }
274 private static boolean memcmp(byte[] b1, byte[] b2, int startIndex, int endIndex) {
275 if (b1.length != (endIndex-startIndex)) return false;
276 for (int i=b1.length; --i>=0; ) {
277 if (b1[i] != b2[i+startIndex]) return false;
278 }
279 return true;
280 }
281
282 public static boolean NO_NEW = false;
283
284
285 private static int addToTable_helper(byte[] b, int hash, int[] chain, int index) {
286 if (NO_NEW) {
287 throw new IllegalStateException("Trying to add Utf8 "+fromUtf8(b));
288 }
289 if (++size == table.length) growTable_helper();
290 if (!checkUtf8(b)) {
291 fromUtf8(b);
292 Assert.UNREACHABLE();
293 }
294 table[size] = new Utf8(b, hash);
295 chain[index] = size+1;
296 if (TRACE) System.out.println("allocated Utf8 ["+size+"] = \""+table[size]+"\"");
297 return size;
298 }
299
300
301 private static void growTable_helper() {
302 Utf8[] newtable = new Utf8[size<<1];
303 System.arraycopy(table, 0, newtable, 0, size);
304 if (TRACE) System.out.println("Growing Utf8 table from "+table.length+" to "+newtable.length);
305 table = newtable;
306 }
307
308 /*** Private constructor. Use the get() method to create a Utf8 object. */
309 private Utf8(byte[] data, int hash) {
310 this.data = data; this.hash = hash;
311 if (DEBUG) cache = fromUtf8(data);
312 }
313 private byte[] data;
314 private int hash;
315
316 public static int hashCode(byte[] data) {
317 int h = 4999;
318 int i=data.length;
319 while(--i>=0) {
320 h = 2999*h + data[i];
321 }
322 return h;
323 }
324
325 public static int hashCode(byte[] data, int startIndex, int endIndex) {
326 int h = 4999;
327 int i=endIndex;
328 while(--i>=startIndex) {
329 h = 2999*h + data[i];
330 }
331 return h;
332 }
333
334 public int hashCode() {
335 return hash;
336 }
337
338 public static final boolean USE_CACHE = true;
339 public static final boolean DEBUG = true;
340 private String cache;
341 public String toString() {
342 if (USE_CACHE) {
343 if (cache != null) return cache;
344 return cache = fromUtf8(data);
345 } else {
346 return fromUtf8(data);
347 }
348 }
349
350 public void dump(DataOutput out) throws IOException {
351 Assert._assert(data.length <= Character.MAX_VALUE);
352 out.writeChar(data.length);
353 out.write(data);
354 }
355
356 public void debugWrite() {
357 Debug.write(data, data.length);
358 }
359
360
361
/**
 * Strictly check the format of the utf8/pseudo-utf8 byte array in
 * fromUtf8 and checkUtf8: marker bits of multi-byte sequences are
 * verified and encodings longer than necessary are rejected.
 */
static final boolean STRICTLY_CHECK_FORMAT = false;
/**
 * Set fromUtf8 to not throw an exception when given a normal utf8
 * byte array. When this is false and strict checking is on, a zero
 * byte is rejected.
 */
static final boolean ALLOW_NORMAL_UTF8 = false;
/**
 * Set fromUtf8 to not throw an exception when given a pseudo utf8
 * byte array. When this is true, strict checking still accepts a
 * two-byte sequence that decodes to the zero character.
 */
static final boolean ALLOW_PSEUDO_UTF8 = true;
/**
 * Set toUtf8 to write in pseudo-utf8 (rather than normal utf8): the
 * zero character is written as two bytes instead of one.
 */
static final boolean WRITE_PSEUDO_UTF8 = true;
381
382 /***
383 * Convert the given sequence of (pseudo-)utf8 formatted bytes
384 * into a String.
385 *
386 * The acceptable input formats are controlled by the
387 * STRICTLY_CHECK_FORMAT, ALLOW_NORMAL_UTF8, and ALLOW_PSEUDO_UTF8
388 * flags.
389 *
390 * @param utf8 (pseudo-)utf8 byte array
391 * @throws UTFDataFormatError if the (pseudo-)utf8 byte array is not valid (pseudo-)utf8
392 * @return unicode string
393 */
394 public static String fromUtf8(byte[] utf8)
395 throws UTFDataFormatError {
396 char[] result = new char[utf8.length];
397 int result_index = 0;
398 for (int i=0, n=utf8.length; i<n; ) {
399 byte b = utf8[i++];
400 if (STRICTLY_CHECK_FORMAT && !ALLOW_NORMAL_UTF8)
401 if (b == 0)
402 throw new UTFDataFormatError("0 byte encountered at location "+(i-1));
403 if (b >= 0) {
404
405 result[result_index++] = (char)b;
406 continue;
407 }
408 try {
409 byte nb = utf8[i++];
410 if (b < -32) {
411
412 char c = result[result_index++] =
413 (char)(((b & 0x1f) << 6) | (nb & 0x3f));
414 if (STRICTLY_CHECK_FORMAT) {
415 if (((b & 0xe0) != 0xc0) ||
416 ((nb & 0xc0) != 0x80))
417 throw new UTFDataFormatError("invalid marker bits for double byte char at location "+(i-2));
418 if (c < '\200') {
419 if (!ALLOW_PSEUDO_UTF8 || (c != '\000'))
420 throw new UTFDataFormatError("encountered double byte char that should have been single byte at location "+(i-2));
421 } else if (c > '\u07FF')
422 throw new UTFDataFormatError("encountered double byte char that should have been triple byte at location "+(i-2));
423 }
424 } else {
425 byte nnb = utf8[i++];
426
427 char c = result[result_index++] =
428 (char)(((b & 0x0f) << 12) |
429 ((nb & 0x3f) << 6) |
430 (nnb & 0x3f));
431 if (STRICTLY_CHECK_FORMAT) {
432 if (((b & 0xf0) != 0xe0) ||
433 ((nb & 0xc0) != 0x80) ||
434 ((nnb & 0xc0) != 0x80))
435 throw new UTFDataFormatError("invalid marker bits for triple byte char at location "+(i-3));
436 if (c < '\u0800')
437 throw new UTFDataFormatError("encountered triple byte char that should have been fewer bytes at location "+(i-3));
438 }
439 }
440 } catch (ArrayIndexOutOfBoundsException e) {
441 throw new UTFDataFormatError("unexpected end at location "+i);
442 }
443 }
444 return new String(result, 0, result_index);
445 }
446
447 /***
448 * Convert the given String into a sequence of (pseudo-)utf8
449 * formatted bytes.
450 *
451 * The output format is controlled by the WRITE_PSEUDO_UTF8 flag.
452 *
453 * @param s String to convert
454 * @return array containing sequence of (pseudo-)utf8 formatted bytes
455 */
456 public static byte[] toUtf8(String s) {
457 byte[] result = new byte[lengthUtf8(s)];
458 int result_index = 0;
459 for (int i = 0, n = s.length(); i < n; ++i) {
460 char c = (char)s.charAt(i);
461
462
463 if (((!WRITE_PSEUDO_UTF8) || (c >= 0x0001)) && (c <= 0x007F))
464 result[result_index++] = (byte)c;
465 else if (c > 0x07FF) {
466 result[result_index++] = (byte)(0xe0 | (byte)(c >> 12));
467 result[result_index++] = (byte)(0x80 | ((c & 0xfc0) >> 6));
468 result[result_index++] = (byte)(0x80 | (c & 0x3f));
469 } else {
470 result[result_index++] = (byte)(0xc0 | (byte)(c >> 6));
471 result[result_index++] = (byte)(0x80 | (c & 0x3f));
472 }
473 }
474 return result;
475 }
476
477 /***
478 * Returns the length of a string's utf8 encoded form.
479 */
480 public static int lengthUtf8(String s) {
481 int utflen = 0;
482 for (int i = 0, n = s.length(); i < n; ++i) {
483 int c = s.charAt(i);
484 if (((!WRITE_PSEUDO_UTF8) || (c >= 0x0001)) && (c <= 0x007F))
485 ++utflen;
486 else if (c > 0x07FF)
487 utflen += 3;
488 else
489 utflen += 2;
490 }
491 return utflen;
492 }
493
494 /***
495 * Check whether the given sequence of bytes is valid (pseudo-)utf8.
496 *
497 * @param bytes byte array to check
498 * @return true iff the given sequence is valid (pseudo-)utf8.
499 */
500 public static boolean checkUtf8(byte[] bytes) {
501 for (int i=0, n=bytes.length; i<n; ) {
502 byte b = bytes[i++];
503 if (STRICTLY_CHECK_FORMAT && !ALLOW_NORMAL_UTF8)
504 if (b == 0) return false;
505 if (b >= 0) {
506
507 continue;
508 }
509 try {
510 byte nb = bytes[i++];
511 if (b < -32) {
512
513 char c = (char)(((b & 0x1f) << 6) | (nb & 0x3f));
514 if (STRICTLY_CHECK_FORMAT) {
515 if (((b & 0xe0) != 0xc0) ||
516 ((nb & 0xc0) != 0x80))
517 return false;
518 if (c < '\200') {
519 if (!ALLOW_PSEUDO_UTF8 || (c != '\000'))
520 return false;
521 } else if (c > '\u07FF')
522 return false;
523 }
524 } else {
525 byte nnb = bytes[i++];
526
527 char c = (char)(((b & 0x0f) << 12) |
528 ((nb & 0x3f) << 6) |
529 (nnb & 0x3f));
530 if (STRICTLY_CHECK_FORMAT) {
531 if (((b & 0xf0) != 0xe0) ||
532 ((nb & 0xc0) != 0x80) ||
533 ((nnb & 0xc0) != 0x80))
534 return false;
535 if (c < '\u0800')
536 return false;
537 }
538 }
539 } catch (ArrayIndexOutOfBoundsException e) {
540 return false;
541 }
542 }
543 return true;
544 }
545
546 }