Utf8 xref

View Javadoc

1   // Utf8.java, created Mon Feb  5 23:23:22 2001 by joewhaley
2   // Copyright (C) 2001-3 John Whaley <jwhaley@alum.mit.edu>
3   // Licensed under the terms of the GNU LGPL; see COPYING for details.
4   package joeq.UTF;
5   
6   import java.io.DataOutput;
7   import java.io.IOException;
8   import joeq.Class.jq_ClassFileConstants;
9   import joeq.Runtime.Debug;
10  import jwutil.collections.UnmodifiableIterator;
11  import jwutil.util.Assert;
12  
13  /***
14   * TODO: ummm, synchronization?!
15   *
16   * @author  John Whaley <jwhaley@alum.mit.edu>
17   * @version $Id: Utf8.java 1931 2004-09-22 22:17:47Z joewhaley $
18   */
19  public class Utf8 implements jq_ClassFileConstants {
20  
21      public static /*final*/ boolean TRACE = false;
22      
23      public static final int STARTING_TABLE_SIZE = 16384;
24      public static final int STARTING_HASH_SIZE = 9999;
25      public static final int STARTING_CHAIN_SIZE = 4;
26      
27      public static Utf8[] table = new Utf8[STARTING_TABLE_SIZE];
28      public static int size = -1;
29      public static int[][] chains = new int[STARTING_HASH_SIZE][]; // for hashing
30      
31      public static final Utf8 BYTE_DESC      = Utf8.get((char)TC_BYTE+"");
32      public static final Utf8 CHAR_DESC      = Utf8.get((char)TC_CHAR+"");
33      public static final Utf8 DOUBLE_DESC    = Utf8.get((char)TC_DOUBLE+"");
34      public static final Utf8 FLOAT_DESC     = Utf8.get((char)TC_FLOAT+"");
35      public static final Utf8 INT_DESC       = Utf8.get((char)TC_INT+"");
36      public static final Utf8 LONG_DESC      = Utf8.get((char)TC_LONG+"");
37      public static final Utf8 SHORT_DESC     = Utf8.get((char)TC_SHORT+"");
38      public static final Utf8 BOOLEAN_DESC   = Utf8.get((char)TC_BOOLEAN+"");
39      public static final Utf8 VOID_DESC      = Utf8.get((char)TC_VOID+"");
40  
41      public static Utf8 get(String s) {
42          return get(toUtf8(s));
43      }
44  
45      public static Utf8 get(byte[] b) {
46          int id = getID(b);
47          return table[id];
48      }
49      
50      public static Utf8 get(byte[] b, int startIndex, int endIndex) {
51          int id = getID(b, startIndex, endIndex);
52          return table[id];
53      }
54      
55      public static int getID(byte[] b) {
56          int hash = hashCode(b);
57          int chain_index = Math.abs(hash) % chains.length;
58          int[] chain = chains[chain_index];
59          if (chain == null) {
60              chains[chain_index] = chain = new int[STARTING_CHAIN_SIZE];
61              return addToTable_helper(b, hash, chain, 0);
62          }
63          for (int i=0; i<chain.length; ++i) {
64              int id = chain[i]-1;
65              if (id == -1) { // end of chain
66                  return addToTable_helper(b, hash, chain, i);
67              }
68              Utf8 utf8 = table[id];
69              // ??? check hash before memcmp ???
70              if (memcmp(utf8.data, b)) {
71                  // ??? swap first one and this one ???
72                  return id;
73              }
74          }
75          int[] newchain = new int[chain.length<<1];
76          System.arraycopy(chain, 0, newchain, 0, chain.length);
77          chains[chain_index] = newchain;
78          return addToTable_helper(b, hash, newchain, chain.length);
79          // free chain
80          
81          // todo: rehash when the table gets too full...
82      }
83      
84      public static int getID(byte[] b, int startIndex, int endIndex) {
85          int hash = hashCode(b, startIndex, endIndex);
86          int chain_index = Math.abs(hash) % chains.length;
87          int[] chain = chains[chain_index];
88          if (chain == null) {
89              chains[chain_index] = chain = new int[STARTING_CHAIN_SIZE];
90              byte[] b2 = new byte[endIndex-startIndex];
91              System.arraycopy(b, startIndex, b2, 0, endIndex-startIndex);
92              return addToTable_helper(b2, hash, chain, 0);
93          }
94          for (int i=0; i<chain.length; ++i) {
95              int id = chain[i]-1;
96              if (id == -1) { // end of chain
97                  byte[] b2 = new byte[endIndex-startIndex];
98                  System.arraycopy(b, startIndex, b2, 0, endIndex-startIndex);
99                  return addToTable_helper(b2, hash, chain, i);
100             }
101             Utf8 utf8 = table[id];
102             // ??? check hash before memcmp ???
103             if (memcmp(utf8.data, b, startIndex, endIndex)) {
104                 // ??? swap first one and this one ???
105                 return id;
106             }
107         }
108         int[] newchain = new int[chain.length<<1];
109         System.arraycopy(chain, 0, newchain, 0, chain.length);
110         chains[chain_index] = newchain;
111         byte[] b2 = new byte[endIndex-startIndex];
112         System.arraycopy(b, startIndex, b2, 0, endIndex-startIndex);
113         return addToTable_helper(b2, hash, newchain, chain.length);
114         // free(chain)
115         
116         // todo: rehash when the table gets too full...
117     }
118     
119     public boolean isValidMethodDescriptor() {
120         if (data.length < 3)
121             return false;
122         if (data[0] != TC_PARAM)
123             return false;
124         int i=1;
125         while (data[i] != TC_PARAMEND) {
126 here:
127             switch (data[i]) {
128                 case TC_BYTE:
129                 case TC_CHAR:
130                 case TC_DOUBLE:
131                 case TC_FLOAT:
132                 case TC_INT:
133                 case TC_LONG:
134                 case TC_SHORT:
135                 case TC_BOOLEAN:
136                 case TC_ARRAY:
137                     break;
138                 case TC_CLASS:
139                     for (;;) {
140                         if (++i == data.length)
141                             return false;
142                         if (data[i] == TC_CLASSEND)
143                             break here;
144                     }
145                 default:
146                     return false;
147             }
148             if (++i == data.length)
149                 return false;
150         }
151         ++i;
152         return ((data[i] == TC_VOID) && (i == data.length-1)) ||
153                isValidTypeDescriptor(i);
154     }
155     public boolean isValidTypeDescriptor() {
156         return isValidTypeDescriptor(0);
157     }
158     private boolean isValidTypeDescriptor(int i) {
159         for (;;) {
160             if (data.length == i)
161                 return false;
162             switch (data[i]) {
163                 case TC_BYTE:
164                 case TC_CHAR:
165                 case TC_DOUBLE:
166                 case TC_FLOAT:
167                 case TC_INT:
168                 case TC_LONG:
169                 case TC_SHORT:
170                 case TC_BOOLEAN:
171                     return i == data.length-1;
172                 case TC_ARRAY:
173                     ++i; continue;
174                 case TC_CLASS:
175                     for (;;) {
176                         if (++i == data.length)
177                             return false;
178                         if (data[i] == TC_CLASSEND)
179                             return i == data.length-1;
180                     }
181             default:
182                 return false;
183             }
184         }
185     }
186     
187     public boolean isDescriptor(byte desc) {
188         return (data.length > 0) && (data[0] == desc);
189     }
190 
191     public Utf8 getArrayElementDescriptor() {
192         Assert._assert(isDescriptor(TC_ARRAY));
193         return get(data, 1, data.length);
194     }
195     
196     public Utf8 getClassName() {
197         Assert._assert(isDescriptor(TC_CLASS));
198         return get(data, 1, data.length-1);
199     }
200     
201     public Utf8 getAsArrayDescriptor() {
202         Assert._assert(isValidTypeDescriptor());
203         // todo: might need to reevaluate making a new array on every query.
204         byte[] b = new byte[data.length+1];
205         b[0] = TC_ARRAY;
206         System.arraycopy(data, 0, b, 1, data.length);
207         return get(b);
208     }
209     
210     public Utf8 getAsClassDescriptor() {
211         Assert._assert(data[0] != TC_ARRAY);
212         // todo: might need to reevaluate making a new array on every query.
213         byte[] b = new byte[data.length+2];
214         b[0] = TC_CLASS;
215         System.arraycopy(data, 0, b, 1, data.length);
216         b[data.length+1] = TC_CLASSEND;
217         return get(b);
218     }
219 
220     public MethodDescriptorIterator getParamDescriptors() {
221         return new MethodDescriptorIterator();
222     }
223     
224     public class MethodDescriptorIterator extends UnmodifiableIterator {
225         int currentIndex;
226         MethodDescriptorIterator() {
227             Assert._assert(isDescriptor(TC_PARAM));
228             currentIndex = 0;
229         }
230         public boolean hasNext() {
231             return data[currentIndex+1] != TC_PARAMEND;
232         }
233         public Object next() { return nextUtf8(); }
234         public Utf8 nextUtf8() {
235             byte b = data[++currentIndex];
236             int startIndex = currentIndex;
237             while (b == TC_ARRAY) {
238                 b = data[++currentIndex];
239             }
240             if (b == TC_CLASS) {
241                 while (b != TC_CLASSEND) {
242                     b = data[++currentIndex];
243                 }
244             } else {
245                 if ((b != TC_BYTE) &&
246                     (b != TC_CHAR) &&
247                     (b != TC_DOUBLE) &&
248                     (b != TC_FLOAT) &&
249                     (b != TC_INT) &&
250                     (b != TC_LONG) &&
251                     (b != TC_SHORT) &&
252                     (b != TC_BOOLEAN)) {
253                     throw new ClassFormatError("bad method descriptor: "+fromUtf8(data)+" index: "+currentIndex);
254                 }
255             }
256             return get(data, startIndex, currentIndex+1);
257         }
258         public Utf8 getReturnDescriptor() {
259             Assert._assert(!hasNext());
260             return get(data, currentIndex+2, data.length);
261         }
262     }
263     
264     //// Implementation stuff below.
265     
266     // Helper function.
267     private static boolean memcmp(byte[] b1, byte[] b2) {
268         if (b1.length != b2.length) return false;
269         for (int i=b1.length; --i>=0; ) {
270             if (b1[i] != b2[i]) return false;
271         }
272         return true;
273     }
274     private static boolean memcmp(byte[] b1, byte[] b2, int startIndex, int endIndex) {
275         if (b1.length != (endIndex-startIndex)) return false;
276         for (int i=b1.length; --i>=0; ) {
277             if (b1[i] != b2[i+startIndex]) return false;
278         }
279         return true;
280     }
281     
282     public static boolean NO_NEW = false;
283 
284     // Helper function.
285     private static int addToTable_helper(byte[] b, int hash, int[] chain, int index) {
286         if (NO_NEW) {
287             throw new IllegalStateException("Trying to add Utf8 "+fromUtf8(b));
288         }
289         if (++size == table.length) growTable_helper();
290         if (!checkUtf8(b)) {
291             fromUtf8(b); // fromUtf8 has more informative error messages.
292             Assert.UNREACHABLE(); // fromUtf8 should have thrown an exception.
293         }
294         table[size] = new Utf8(b, hash);
295         chain[index] = size+1;
296         if (TRACE) System.out.println("allocated Utf8 ["+size+"] = \""+table[size]+"\"");
297         return size;
298     }
299     
300     // Helper function.
301     private static void growTable_helper() {
302         Utf8[] newtable = new Utf8[size<<1];
303         System.arraycopy(table, 0, newtable, 0, size);
304         if (TRACE) System.out.println("Growing Utf8 table from "+table.length+" to "+newtable.length);
305         table = newtable;
306     }
307     
308     /*** Private constructor.  Use the get() method to create a Utf8 object. */
309     private Utf8(byte[] data, int hash) {
310         this.data = data; this.hash = hash;
311         if (DEBUG) cache = fromUtf8(data);
312     }
313     private byte[] data;
314     private int hash;
315     
316     public static int hashCode(byte[] data) {
317         int h = 4999;
318         int i=data.length;
319         while(--i>=0) {
320             h = 2999*h + data[i];
321         }
322         return h;
323     }
324     
325     public static int hashCode(byte[] data, int startIndex, int endIndex) {
326         int h = 4999;
327         int i=endIndex;
328         while(--i>=startIndex) {
329             h = 2999*h + data[i];
330         }
331         return h;
332     }
333     
334     public int hashCode() {
335         return hash;
336     }
337     
338     public static final boolean USE_CACHE = true;
339     public static final boolean DEBUG = true;
340     private String cache;
341     public String toString() {
342         if (USE_CACHE) {
343             if (cache != null) return cache;
344             return cache = fromUtf8(data);
345         } else {
346             return fromUtf8(data);
347         }
348     }
349 
350     public void dump(DataOutput out) throws IOException {
351         Assert._assert(data.length <= Character.MAX_VALUE);
352         out.writeChar(data.length);
353         out.write(data);
354     }
355     
356     public void debugWrite() {
357         Debug.write(data, data.length);
358     }
359     
360     //// Utf8 conversion routines
361     
362     /***
363      * Strictly check the format of the utf8/pseudo-utf8 byte array in
364      * fromUtf8.
365      */
366     static final boolean STRICTLY_CHECK_FORMAT = false;
367     /***
368      * Set fromUtf8 to not throw an exception when given a normal utf8
369      * byte array.
370      */
371     static final boolean ALLOW_NORMAL_UTF8 = false;
372     /***
373      * Set fromUtf8 to not throw an exception when given a pseudo utf8
374      * byte array.
375      */
376     static final boolean ALLOW_PSEUDO_UTF8 = true;
377     /***
378      * Set toUtf8 to write in pseudo-utf8 (rather than normal utf8).
379      */
380     static final boolean WRITE_PSEUDO_UTF8 = true;
381 
382     /***
383      * Convert the given sequence of (pseudo-)utf8 formatted bytes
384      * into a String.
385      *
386      * The acceptable input formats are controlled by the
387      * STRICTLY_CHECK_FORMAT, ALLOW_NORMAL_UTF8, and ALLOW_PSEUDO_UTF8
388      * flags.
389      *
390      * @param utf8 (pseudo-)utf8 byte array
391      * @throws UTFDataFormatError if the (pseudo-)utf8 byte array is not valid (pseudo-)utf8
392      * @return unicode string
393      */
394     public static String fromUtf8(byte[] utf8)
395     throws UTFDataFormatError {
396         char[] result = new char[utf8.length];
397         int result_index = 0;
398         for (int i=0, n=utf8.length; i<n; ) {
399             byte b = utf8[i++];
400             if (STRICTLY_CHECK_FORMAT && !ALLOW_NORMAL_UTF8)
401                 if (b == 0)
402                     throw new UTFDataFormatError("0 byte encountered at location "+(i-1));
403             if (b >= 0) {  // < 0x80 unsigned
404                 // in the range '\001' to '\177'
405                 result[result_index++] = (char)b;
406                 continue;
407             }
408             try {
409                 byte nb = utf8[i++];
410                 if (b < -32) {  // < 0xe0 unsigned
411                     // '\000' or in the range '\200' to '\u07FF'
412                     char c = result[result_index++] =
413                         (char)(((b & 0x1f) << 6) | (nb & 0x3f));
414                     if (STRICTLY_CHECK_FORMAT) {
415                         if (((b & 0xe0) != 0xc0) ||
416                             ((nb & 0xc0) != 0x80))
417                             throw new UTFDataFormatError("invalid marker bits for double byte char at location "+(i-2));
418                         if (c < '\200') {
419                             if (!ALLOW_PSEUDO_UTF8 || (c != '\000'))
420                                 throw new UTFDataFormatError("encountered double byte char that should have been single byte at location "+(i-2));
421                         } else if (c > '\u07FF')
422                             throw new UTFDataFormatError("encountered double byte char that should have been triple byte at location "+(i-2));
423                     }
424                 } else {
425                     byte nnb = utf8[i++];
426                     // in the range '\u0800' to '\uFFFF'
427                     char c = result[result_index++] =
428                         (char)(((b & 0x0f) << 12) |
429                                ((nb & 0x3f) << 6) |
430                                (nnb & 0x3f));
431                     if (STRICTLY_CHECK_FORMAT) {
432                         if (((b & 0xf0) != 0xe0) ||
433                             ((nb & 0xc0) != 0x80) ||
434                             ((nnb & 0xc0) != 0x80))
435                             throw new UTFDataFormatError("invalid marker bits for triple byte char at location "+(i-3));
436                         if (c < '\u0800')
437                             throw new UTFDataFormatError("encountered triple byte char that should have been fewer bytes at location "+(i-3));
438                     }
439                 }
440             } catch (ArrayIndexOutOfBoundsException e) {
441                 throw new UTFDataFormatError("unexpected end at location "+i);
442             }
443         }
444         return new String(result, 0, result_index);
445     }
446 
447     /***
448      * Convert the given String into a sequence of (pseudo-)utf8
449      * formatted bytes.
450      *
451      * The output format is controlled by the WRITE_PSEUDO_UTF8 flag.
452      *
453      * @param s String to convert
454      * @return array containing sequence of (pseudo-)utf8 formatted bytes
455      */
456     public static byte[] toUtf8(String s) {
457         byte[] result = new byte[lengthUtf8(s)];
458         int result_index = 0;
459         for (int i = 0, n = s.length(); i < n; ++i) {
460             char c = (char)s.charAt(i);
461             // in all shifts below, c is an (unsigned) char,
462             // so either >>> or >> is ok
463             if (((!WRITE_PSEUDO_UTF8) || (c >= 0x0001)) && (c <= 0x007F))
464                 result[result_index++] = (byte)c;
465             else if (c > 0x07FF) {
466                 result[result_index++] = (byte)(0xe0 | (byte)(c >> 12));
467                 result[result_index++] = (byte)(0x80 | ((c & 0xfc0) >> 6));
468                 result[result_index++] = (byte)(0x80 | (c & 0x3f));
469             } else {
470                 result[result_index++] = (byte)(0xc0 | (byte)(c >> 6));
471                 result[result_index++] = (byte)(0x80 | (c & 0x3f));
472             }
473         }
474         return result;
475     }
476 
477     /***
478      * Returns the length of a string's utf8 encoded form.
479      */
480     public static int lengthUtf8(String s) {
481         int utflen = 0;
482         for (int i = 0, n = s.length(); i < n; ++i) {
483             int c = s.charAt(i);
484             if (((!WRITE_PSEUDO_UTF8) || (c >= 0x0001)) && (c <= 0x007F))
485                 ++utflen;
486             else if (c > 0x07FF)
487                 utflen += 3;
488             else
489                 utflen += 2;
490         }
491         return utflen;
492     }
493 
494     /***
495      * Check whether the given sequence of bytes is valid (pseudo-)utf8.
496      *
497      * @param bytes byte array to check
498      * @return true iff the given sequence is valid (pseudo-)utf8.
499      */
500     public static boolean checkUtf8(byte[] bytes) {
501         for (int i=0, n=bytes.length; i<n; ) {
502             byte b = bytes[i++];
503             if (STRICTLY_CHECK_FORMAT && !ALLOW_NORMAL_UTF8)
504                 if (b == 0) return false;
505             if (b >= 0) {  // < 0x80 unsigned
506                 // in the range '\001' to '\177'
507                 continue;
508             }
509             try {
510                 byte nb = bytes[i++];
511                 if (b < -32) {  // < 0xe0 unsigned
512                     // '\000' or in the range '\200' to '\u07FF'
513                     char c = (char)(((b & 0x1f) << 6) | (nb & 0x3f));
514                     if (STRICTLY_CHECK_FORMAT) {
515                         if (((b & 0xe0) != 0xc0) ||
516                             ((nb & 0xc0) != 0x80))
517                             return false;
518                         if (c < '\200') {
519                             if (!ALLOW_PSEUDO_UTF8 || (c != '\000'))
520                                 return false;
521                             } else if (c > '\u07FF')
522                                 return false;
523                     }
524                 } else {
525                     byte nnb = bytes[i++];
526                     // in the range '\u0800' to '\uFFFF'
527                     char c = (char)(((b & 0x0f) << 12) |
528                                     ((nb & 0x3f) << 6) |
529                                     (nnb & 0x3f));
530                     if (STRICTLY_CHECK_FORMAT) {
531                         if (((b & 0xf0) != 0xe0) ||
532                             ((nb & 0xc0) != 0x80) ||
533                             ((nnb & 0xc0) != 0x80))
534                             return false;
535                         if (c < '\u0800')
536                             return false;
537                     }
538                 }
539             } catch (ArrayIndexOutOfBoundsException e) {
540                 return false;
541             }
542         }
543         return true;
544     }
545 
546 }