001    /* NumericShaper.java
002       Copyright (C) 2003 Free Software Foundation, Inc.
003    
004    This file is part of GNU Classpath.
005    
006    GNU Classpath is free software; you can redistribute it and/or modify
007    it under the terms of the GNU General Public License as published by
008    the Free Software Foundation; either version 2, or (at your option)
009    any later version.
010    
011    GNU Classpath is distributed in the hope that it will be useful, but
012    WITHOUT ANY WARRANTY; without even the implied warranty of
013    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
014    General Public License for more details.
015    
016    You should have received a copy of the GNU General Public License
017    along with GNU Classpath; see the file COPYING.  If not, write to the
018    Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
019    02110-1301 USA.
020    
021    Linking this library statically or dynamically with other modules is
022    making a combined work based on this library.  Thus, the terms and
023    conditions of the GNU General Public License cover the whole
024    combination.
025    
026    As a special exception, the copyright holders of this library give you
027    permission to link this library with independent modules to produce an
028    executable, regardless of the license terms of these independent
029    modules, and to copy and distribute the resulting executable under
030    terms of your choice, provided that you also meet, for each linked
031    independent module, the terms and conditions of the license of that
032    module.  An independent module is a module which is not derived from
033    or based on this library.  If you modify this library, you may extend
034    this exception to your version of the library, but you are not
035    obligated to do so.  If you do not wish to do so, delete this
036    exception statement from your version. */
037    
038    
039    package java.awt.font;
040    
041    import java.io.Serializable;
042    import java.lang.Character.UnicodeBlock;
043    
044    /**
045     * This class handles numeric shaping.  A shaper can either be contextual
046     * or not.  A non-contextual shaper will always translate ASCII digits
047     * in its input into the target Unicode range.  A contextual shaper will
048     * change the target Unicode range depending on the characters it has
049     * previously processed.
050     *
051     * @author Michael Koch
052     * @author Tom Tromey
053     *
054     * @since 1.4
055     * @specnote This class does not handle LIMBU or OSMANYA.
056     * @specnote The JDK does not seem to properly handle ranges without a
057     * digit zero, such as TAMIL.  This implementation does.
058     */
059    public final class NumericShaper implements Serializable
060    {
061      private static final long serialVersionUID = -8022764705923730308L;
062    
063      /** Convenience constant representing all the valid Unicode ranges.  */
064      public static final int ALL_RANGES  = 524287;
065    
066      /**
067       * Constant representing the Unicode ARABIC range.  Shaping done
068       * using this range will translate to the arabic decimal characters.
069       * Use EASTERN_ARABIC if you want to shape to the eastern arabic
070       * (also known as the extended arabic) decimal characters.
071       */
072      public static final int ARABIC  = 2;
073    
074      /** Constant representing the Unicode BENGALI range.  */
075      public static final int BENGALI  = 16;
076    
077      /** Constant representing the Unicode DEVANAGARI range.  */
078      public static final int DEVANAGARI  = 8;
079    
080      /**
081       * Constant representing the Unicode extended arabic range.
082       * In Unicode there are two different sets of arabic digits;
083       * this selects the extended or eastern set.
084       */
085      public static final int EASTERN_ARABIC  = 4;
086    
087      /**
088       * Constant representing the Unicode ETHIOPIC range.  Note that
089       * there is no digit zero in this range; an ASCII digit zero
090       * is left unchanged when shaping to this range.
091       */
092      public static final int ETHIOPIC  = 65536;
093    
094      /**
095       * Constant representing the Unicode EUROPEAN range.  For
096       * contextual shaping purposes, characters in the various
097       * extended Latin character blocks are recognized as EUROPEAN.
098       */
099      public static final int EUROPEAN  = 1;
100    
101      /** Constant representing the Unicode GUJARATI range.  */
102      public static final int GUJARATI  = 64;
103    
104      /** Constant representing the Unicode GURMUKHI range.  */
105      public static final int GURMUKHI  = 32;
106    
107      /** Constant representing the Unicode KANNADA range.  */
108      public static final int KANNADA  = 1024;
109    
110      /** Constant representing the Unicode KHMER range.  */
111      public static final int KHMER  = 131072;
112    
113      /** Constant representing the Unicode LAO range.  */
114      public static final int LAO  = 8192;
115    
116      /** Constant representing the Unicode MALAYALAM range.  */
117      public static final int MALAYALAM  = 2048;
118    
119      /** Constant representing the Unicode MONGOLIAN range.  */
120      public static final int MONGOLIAN  = 262144;
121    
122      /** Constant representing the Unicode MYANMAR range.  */
123      public static final int MYANMAR  = 32768;
124    
125      /** Constant representing the Unicode ORIYA range.  */
126      public static final int ORIYA  = 128;
127    
128      /**
129       * Constant representing the Unicode TAMIL range.  Note that
130       * there is no digit zero in this range; an ASCII digit zero
131       * is left unchanged when shaping to this range.
132       */
133      public static final int TAMIL  = 256;
134    
135      /** Constant representing the Unicode TELUGU range.  */
136      public static final int TELUGU  = 512;
137    
138      /** Constant representing the Unicode THAI range.  */
139      public static final int THAI  = 4096;
140    
141      /** Constant representing the Unicode TIBETAN range.  */
142      public static final int TIBETAN  = 16384;
143    
144      /**
145       * This table holds the zero digits for each language.  This is hard-coded
146       * because the values will not change and the table layout is tied to the
147       * other constants in this class in any case.  In the two places where a
148       * language does not have a zero digit, the character immediately preceeding
149       * the one digit is used instead.  These languages are special-cased in
150       * the shaping code.
151       */
152      private static final char[] zeroDigits =
153      {
154        '0',      // EUROPEAN
155        '\u0660', // ARABIC
156        '\u06f0', // EASTERN_ARABIC
157        '\u0966', // DEVANAGARI
158        '\u09e6', // BENGALI
159        '\u0a66', // GURMUKHI
160        '\u0ae6', // GUJARATI
161        '\u0b66', // ORIYA
162        '\u0be6', // TAMIL - special case as there is no digit zero
163        '\u0c66', // TELUGU
164        '\u0ce6', // KANNADA
165        '\u0d66', // MALAYALAM
166        '\u0e50', // THAI
167        '\u0ed0', // LAO
168        '\u0f20', // TIBETAN
169        '\u1040', // MYANMAR
170        '\u1368', // ETHIOPIC - special case as there is no digit zero
171        '\u17e0', // KHMER
172        '\u1810'  // MONGOLIAN
173      };
174    
175      /**
176       * The default initial context for this shaper, specified as
177       * an integer from 0 to 18.
178       */
179      private int key;
180    
181      /**
182       * The target ranges handled by this shaper.  If the shaper
183       * is not contextual, the high bit of this field will be set.
184       * @specnote This was discovered by reading the serialization spec
185       */
186      private int mask;
187    
188      /**
189       * Create a new numeric shaper.  The key given is a constant from
190       * this class, the constructor turns it into its internal form.
191       * @param key the key to use, as one of the manifest constants
192       * @param mask a mask of languages to shape for
193       */
194      private NumericShaper (int key, int mask)
195      {
196        // This internal form is a bit goofy, but it is specified by
197        // the serialization spec.
198        this.key = Integer.numberOfTrailingZeros(key);
199        this.mask = mask;
200      }
201    
202      /**
203       * Return an integer representing all the languages for which this
204       * shaper will shape.  The result is taken by "or"ing together
205       * the constants representing the various languages.
206       */
207      public int getRanges ()
208      {
209        return mask & ALL_RANGES;
210      }
211    
212      /**
213       * Return true if this shaper is contextual, false if it is not.
214       */
215      public boolean isContextual ()
216      {
217        return mask > 0;
218      }
219    
220      /**
221       * Shape the text in the given array.  The starting context will
222       * be the context passed to the shaper at creation time.
223       * @param text the text to shape
224       * @param start the index of the starting character of the array
225       * @param count the number of characters in the array
226       */
227      public void shape (char[] text, int start, int count)
228      {
229        shape (text, start, count, 1 << key);
230      }
231    
232      /**
233       * Given a unicode block object, return corresponding language constant.
234       * If the block is not recognized, returns zero.  Note that as there
235       * is no separate ARABIC block in Character, this case must
236       * be specially handled by the caller; EASTERN_ARABIC is preferred when
237       * both are specified.
238       * @param b the unicode block to classify
239       * @return the language constant, or zero if not recognized
240       */
241      private int classify(UnicodeBlock b)
242      {
243        if (b == null)
244          return 0;
245        // ARABIC is handled by the caller; from testing we know
246        // that EASTERN_ARABIC takes precedence.
247        if (b == UnicodeBlock.ARABIC)
248          return EASTERN_ARABIC;
249        if (b == UnicodeBlock.BENGALI)
250          return BENGALI;
251        if (b == UnicodeBlock.DEVANAGARI)
252          return DEVANAGARI;
253        if (b == UnicodeBlock.ETHIOPIC)
254          return ETHIOPIC;
255        if (b == UnicodeBlock.BASIC_LATIN
256            || b == UnicodeBlock.LATIN_1_SUPPLEMENT
257            || b == UnicodeBlock.LATIN_EXTENDED_A
258            || b == UnicodeBlock.LATIN_EXTENDED_ADDITIONAL
259            || b == UnicodeBlock.LATIN_EXTENDED_B)
260          return EUROPEAN;
261        if (b == UnicodeBlock.GUJARATI)
262          return GUJARATI;
263        if (b == UnicodeBlock.GURMUKHI)
264          return GURMUKHI;
265        if (b == UnicodeBlock.KANNADA)
266          return KANNADA;
267        if (b == UnicodeBlock.KHMER)
268          return KHMER;
269        if (b == UnicodeBlock.LAO)
270          return LAO;
271        if (b == UnicodeBlock.MALAYALAM)
272          return MALAYALAM;
273        if (b == UnicodeBlock.MONGOLIAN)
274          return MONGOLIAN;
275        if (b == UnicodeBlock.MYANMAR)
276          return MYANMAR;
277        if (b == UnicodeBlock.ORIYA)
278          return ORIYA;
279        if (b == UnicodeBlock.TAMIL)
280          return TAMIL;
281        if (b == UnicodeBlock.TELUGU)
282          return TELUGU;
283        if (b == UnicodeBlock.THAI)
284          return THAI;
285        if (b == UnicodeBlock.TIBETAN)
286          return TIBETAN;
287        return 0;
288      }
289    
290      /**
291       * Shape the given text, using the indicated initial context.
292       * If this shaper is not a contextual shaper, then the given context
293       * will be ignored.
294       * @param text the text to shape
295       * @param start the index of the first character of the text to shape
296       * @param count the number of characters to shape in the text
297       * @param context the initial context
298       * @throws IllegalArgumentException if the initial context is invalid
299       */
300      public void shape (char[] text, int start, int count, int context)
301      {
302        int currentContext;
303        if (isContextual())
304          {
305            if (Integer.bitCount(context) != 1 || (context & ~ALL_RANGES) != 0)
306              throw new IllegalArgumentException("invalid context argument");
307            // If the indicated context is not one we are handling, reset it.
308            if ((context & mask) == 0)
309              currentContext = -1;
310            else
311              currentContext = Integer.numberOfTrailingZeros(context);
312          }
313        else
314          currentContext = key;
315    
316        for (int i = 0; i < count; ++i)
317          {
318            char c = text[start + i];
319            if (c >= '0' && c <= '9')
320              {
321                if (currentContext >= 0)
322                  {
323                    // Shape into the current context.
324                    if (c == '0'
325                      && ((1 << currentContext) == TAMIL
326                          || (1 << currentContext) == ETHIOPIC))
327                      {
328                        // No digit 0 in this context; do nothing.
329                      }
330                    else
331                      text[start + i]
332                        = (char) (zeroDigits[currentContext] + c - '0');
333                  }
334              }
335            else if (isContextual())
336              {
337                // if c is in a group, set currentContext; else reset it.
338                int group = classify(UnicodeBlock.of(c));
339                // Specially handle ARABIC.
340                if (group == EASTERN_ARABIC && (mask & EASTERN_ARABIC) == 0
341                    && (mask & ARABIC) != 0)
342                  group = ARABIC;
343                if ((mask & group) != 0)
344                  {
345                    // The character was classified as being in a group
346                    // we recognize, and it was selected by the shaper.
347                    // So, change the context.
348                    currentContext = Integer.numberOfTrailingZeros(group);
349                  }
350              }
351          }
352      }
353    
354      public boolean equals (Object obj)
355      {
356        if (! (obj instanceof NumericShaper))
357          return false;
358        NumericShaper tmp = (NumericShaper) obj;
359        return key == tmp.key && mask == tmp.mask;
360      }
361    
362      public int hashCode ()
363      {
364        return key ^ mask;
365      }
366    
367      public String toString ()
368      {
369        // For debugging only.
370        return "key=" + key + "; mask=" + mask;
371      }
372    
373      /**
374       * Return a non-contextual shaper which can shape to a single range.
375       * All ASCII digits in the input text are translated to this language.
376       * @param singleRange the target language
377       * @return a non-contextual shaper for this language
378       * @throws IllegalArgumentException if the argument does not name a
379       * single language, as specified by the constants declared in this class
380       */
381      public static NumericShaper getShaper (int singleRange)
382      {
383        if (Integer.bitCount(singleRange) != 1)
384          throw new IllegalArgumentException("more than one bit set in argument");
385        if ((singleRange & ~ALL_RANGES) != 0)
386          throw new IllegalArgumentException("argument out of range");
387        return new NumericShaper(singleRange, Integer.MIN_VALUE | singleRange);
388      }
389    
390      /**
391       * Return a contextual shaper which can shape to any of the indicated
392       * languages.  The default initial context for this shaper is EUROPEAN.
393       * @param ranges the ranges to shape to
394       * @return a contextual shaper which will target any of these ranges
395       * @throws IllegalArgumentException if the argument specifies an
396       * unrecognized range
397       */
398      public static NumericShaper getContextualShaper (int ranges)
399      {
400        if ((ranges & ~ALL_RANGES) != 0)
401          throw new IllegalArgumentException("argument out of range");
402        return new NumericShaper(EUROPEAN, ranges);
403      }
404    
405      /**
406       * Return a contextual shaper which can shape to any of the indicated
407       * languages.  The default initial context for this shaper is given as
408       * an argument.
409       * @param ranges the ranges to shape to
410       * @param defaultContext the default initial context
411       * @return a contextual shaper which will target any of these ranges
412       * @throws IllegalArgumentException if the ranges argument specifies an
413       * unrecognized range, or if the defaultContext argument does not specify
414       * a single valid range
415       */
416      public static NumericShaper getContextualShaper (int ranges,
417                                                       int defaultContext)
418      {
419        if (Integer.bitCount(defaultContext) != 1)
420          throw new IllegalArgumentException("more than one bit set in context");
421        if ((ranges & ~ALL_RANGES) != 0 || (defaultContext & ~ALL_RANGES) != 0)
422          throw new IllegalArgumentException("argument out of range");
423        return new NumericShaper(defaultContext, ranges);
424      }
425    }