001    /* Matcher.java -- Instance of a regular expression applied to a char sequence.
002       Copyright (C) 2002, 2004, 2006 Free Software Foundation, Inc.
003    
004    This file is part of GNU Classpath.
005    
006    GNU Classpath is free software; you can redistribute it and/or modify
007    it under the terms of the GNU General Public License as published by
008    the Free Software Foundation; either version 2, or (at your option)
009    any later version.
010    
011    GNU Classpath is distributed in the hope that it will be useful, but
012    WITHOUT ANY WARRANTY; without even the implied warranty of
013    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
014    General Public License for more details.
015    
016    You should have received a copy of the GNU General Public License
017    along with GNU Classpath; see the file COPYING.  If not, write to the
018    Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
019    02110-1301 USA.
020    
021    Linking this library statically or dynamically with other modules is
022    making a combined work based on this library.  Thus, the terms and
023    conditions of the GNU General Public License cover the whole
024    combination.
025    
026    As a special exception, the copyright holders of this library give you
027    permission to link this library with independent modules to produce an
028    executable, regardless of the license terms of these independent
029    modules, and to copy and distribute the resulting executable under
030    terms of your choice, provided that you also meet, for each linked
031    independent module, the terms and conditions of the license of that
032    module.  An independent module is a module which is not derived from
033    or based on this library.  If you modify this library, you may extend
034    this exception to your version of the library, but you are not
035    obligated to do so.  If you do not wish to do so, delete this
036    exception statement from your version. */
037    
038    
039    package java.util.regex;
040    
041    import gnu.java.lang.CPStringBuilder;
042    
043    import gnu.java.util.regex.CharIndexed;
044    import gnu.java.util.regex.RE;
045    import gnu.java.util.regex.REMatch;
046    
047    /**
048     * Instance of a regular expression applied to a char sequence.
049     *
050     * @since 1.4
051     */
052    public final class Matcher implements MatchResult
053    {
054      private Pattern pattern;
055      private CharSequence input;
056      // We use CharIndexed as an input object to the getMatch method in order
057      // that /\G/ (the end of the previous match) may work.  The information
058      // of the previous match is stored in the CharIndexed object.
059      private CharIndexed inputCharIndexed;
060      private int position;
061      private int appendPosition;
062      private REMatch match;
063    
064      /**
065       * The start of the region of the input on which to match.
066       */
067      private int regionStart;
068    
069      /**
070       * The end of the region of the input on which to match.
071       */
072      private int regionEnd;
073      
074      /**
075       * True if the match process should look beyond the 
076       * region marked by regionStart to regionEnd when
077       * performing lookAhead, lookBehind and boundary
078       * matching.
079       */
080      private boolean transparentBounds;
081    
082      /**
083       * The flags that affect the anchoring bounds.
084       * If {@link #hasAnchoringBounds()} is {@code true},
085       * the match process will honour the
086       * anchoring bounds: ^, \A, \Z, \z and $.  If
087       * {@link #hasAnchoringBounds()} is {@code false},
088       * the anchors are ignored and appropriate flags,
089       * stored in this variable, are used to provide this
090       * behaviour.
091       */
092      private int anchoringBounds;
093    
094      Matcher(Pattern pattern, CharSequence input)
095      {
096        this.pattern = pattern;
097        this.input = input;
098        this.inputCharIndexed = RE.makeCharIndexed(input, 0);
099        regionStart = 0;
100        regionEnd = input.length();
101        transparentBounds = false;
102        anchoringBounds = 0;
103      }
104      
105      /**
106       * @param sb The target string buffer
107       * @param replacement The replacement string
108       *
109       * @exception IllegalStateException If no match has yet been attempted,
110       * or if the previous match operation failed
111       * @exception IndexOutOfBoundsException If the replacement string refers
112       * to a capturing group that does not exist in the pattern
113       */
114      public Matcher appendReplacement (StringBuffer sb, String replacement)
115        throws IllegalStateException
116      {
117        assertMatchOp();
118        sb.append(input.subSequence(appendPosition,
119                                    match.getStartIndex()).toString());
120        sb.append(RE.getReplacement(replacement, match,
121            RE.REG_REPLACE_USE_BACKSLASHESCAPE));
122        appendPosition = match.getEndIndex();
123        return this;
124      }
125    
126      /**
127       * @param sb The target string buffer
128       */
129      public StringBuffer appendTail (StringBuffer sb)
130      {
131        sb.append(input.subSequence(appendPosition, input.length()).toString());
132        return sb;
133      }
134     
135      /**
136       * @exception IllegalStateException If no match has yet been attempted,
137       * or if the previous match operation failed
138       */
139      public int end ()
140        throws IllegalStateException
141      {
142        assertMatchOp();
143        return match.getEndIndex();
144      }
145      
146      /**
147       * @param group The index of a capturing group in this matcher's pattern
148       *
149       * @exception IllegalStateException If no match has yet been attempted,
150       * or if the previous match operation failed
151       * @exception IndexOutOfBoundsException If the replacement string refers
152       * to a capturing group that does not exist in the pattern
153       */
154      public int end (int group)
155        throws IllegalStateException
156      {
157        assertMatchOp();
158        return match.getEndIndex(group);
159      }
160     
161      public boolean find ()
162      {
163        boolean first = (match == null);
164        if (transparentBounds || (regionStart == 0 && regionEnd == input.length()))
165          match = pattern.getRE().getMatch(inputCharIndexed, position, anchoringBounds);
166        else
167          match = pattern.getRE().getMatch(input.subSequence(regionStart, regionEnd),
168                                           position, anchoringBounds);
169        if (match != null)
170          {
171            int endIndex = match.getEndIndex();
172            // Are we stuck at the same position?
173            if (!first && endIndex == position)
174              {         
175                match = null;
176                // Not at the end of the input yet?
177                if (position < input.length() - 1)
178                  {
179                    position++;
180                    return find(position);
181                  }
182                else
183                  return false;
184              }
185            position = endIndex;
186            return true;
187          }
188        return false;
189      } 
190    
191      /**
192       * @param start The index to start the new pattern matching
193       *
194       * @exception IndexOutOfBoundsException If the replacement string refers
195       * to a capturing group that does not exist in the pattern
196       */
197      public boolean find (int start)
198      {
199        if (transparentBounds || (regionStart == 0 && regionEnd == input.length()))
200          match = pattern.getRE().getMatch(inputCharIndexed, start, anchoringBounds);
201        else
202          match = pattern.getRE().getMatch(input.subSequence(regionStart, regionEnd),
203                                           start, anchoringBounds);
204        if (match != null)
205          {
206            position = match.getEndIndex();
207            return true;
208          }
209        return false;
210      }
211     
212      /**
213       * @exception IllegalStateException If no match has yet been attempted,
214       * or if the previous match operation failed
215       */
216      public String group ()
217      {
218        assertMatchOp();
219        return match.toString();
220      }
221      
222      /**
223       * @param group The index of a capturing group in this matcher's pattern
224       *
225       * @exception IllegalStateException If no match has yet been attempted,
226       * or if the previous match operation failed
227       * @exception IndexOutOfBoundsException If the replacement string refers
228       * to a capturing group that does not exist in the pattern
229       */
230      public String group (int group)
231        throws IllegalStateException
232      {
233        assertMatchOp();
234        return match.toString(group);
235      }
236    
237      /**
238       * @param replacement The replacement string
239       */
240      public String replaceFirst (String replacement)
241      {
242        reset();
243        // Semantics might not quite match
244        return pattern.getRE().substitute(input, replacement, position,
245            RE.REG_REPLACE_USE_BACKSLASHESCAPE);
246      }
247    
248      /**
249       * @param replacement The replacement string
250       */
251      public String replaceAll (String replacement)
252      {
253        reset();
254        return pattern.getRE().substituteAll(input, replacement, position,
255            RE.REG_REPLACE_USE_BACKSLASHESCAPE);
256      }
257      
258      public int groupCount ()
259      {
260        return pattern.getRE().getNumSubs();
261      }
262     
263      public boolean lookingAt ()
264      {
265        if (transparentBounds || (regionStart == 0 && regionEnd == input.length()))
266          match = pattern.getRE().getMatch(inputCharIndexed, regionStart,
267                                           anchoringBounds|RE.REG_FIX_STARTING_POSITION|RE.REG_ANCHORINDEX);
268        else
269          match = pattern.getRE().getMatch(input.subSequence(regionStart, regionEnd), 0,
270                                           anchoringBounds|RE.REG_FIX_STARTING_POSITION);
271        if (match != null)
272          {
273            if (match.getStartIndex() == 0)
274              {
275                position = match.getEndIndex();
276                return true;
277              }
278            match = null;
279          }
280        return false;
281      }
282      
283      /**
284       * Attempts to match the entire input sequence against the pattern. 
285       *
286       * If the match succeeds then more information can be obtained via the
287       * start, end, and group methods.
288       *
289       * @see #start()
290       * @see #end()
291       * @see #group()
292       */
293      public boolean matches ()
294      {
295        if (transparentBounds || (regionStart == 0 && regionEnd == input.length()))
296          match = pattern.getRE().getMatch(inputCharIndexed, regionStart,
297                                           anchoringBounds|RE.REG_TRY_ENTIRE_MATCH|RE.REG_FIX_STARTING_POSITION|RE.REG_ANCHORINDEX);
298        else
299          match = pattern.getRE().getMatch(input.subSequence(regionStart, regionEnd), 0,
300                                           anchoringBounds|RE.REG_TRY_ENTIRE_MATCH|RE.REG_FIX_STARTING_POSITION);
301        if (match != null)
302          {
303            if (match.getStartIndex() == 0)
304              {
305                position = match.getEndIndex();
306                if (position == input.length())
307                    return true;
308              }
309            match = null;
310          }
311        return false;
312      }
313      
314      /**
315       * Returns the Pattern that is interpreted by this Matcher
316       */
317      public Pattern pattern ()
318      {
319        return pattern;
320      }
321      
322      /**
323       * Resets the internal state of the matcher, including
324       * resetting the region to its default state of encompassing
325       * the whole input.  The state of {@link #hasTransparentBounds()}
326       * and {@link #hasAnchoringBounds()} are unaffected.
327       *
328       * @return a reference to this matcher.
329       * @see #regionStart()
330       * @see #regionEnd()
331       * @see #hasTransparentBounds()
332       * @see #hasAnchoringBounds()
333       */
334      public Matcher reset ()
335      {
336        position = 0;
337        match = null;
338        regionStart = 0;
339        regionEnd = input.length();
340        appendPosition = 0;
341        return this;
342      }
343      
344      /**
345       * Resets the internal state of the matcher, including
346       * resetting the region to its default state of encompassing
347       * the whole input.  The state of {@link #hasTransparentBounds()}
348       * and {@link #hasAnchoringBounds()} are unaffected.
349       *
350       * @param input The new input character sequence.
351       * @return a reference to this matcher.
352       * @see #regionStart()
353       * @see #regionEnd()
354       * @see #hasTransparentBounds()
355       * @see #hasAnchoringBounds()
356       */
357      public Matcher reset (CharSequence input)
358      {
359        this.input = input;
360        this.inputCharIndexed = RE.makeCharIndexed(input, 0);
361        return reset();
362      }
363      
364      /**
365       * @return the index of a capturing group in this matcher's pattern
366       *
367       * @exception IllegalStateException If no match has yet been attempted,
368       * or if the previous match operation failed
369       */
370      public int start ()
371        throws IllegalStateException
372      {
373        assertMatchOp();
374        return match.getStartIndex();
375      }
376    
377      /**
378       * @param group The index of a capturing group in this matcher's pattern
379       *
380       * @exception IllegalStateException If no match has yet been attempted,
381       * or if the previous match operation failed
382       * @exception IndexOutOfBoundsException If the replacement string refers
383       * to a capturing group that does not exist in the pattern
384       */
385      public int start (int group)
386        throws IllegalStateException
387      {
388        assertMatchOp();
389        return match.getStartIndex(group);
390      }
391    
392      /**
393       * @return True if and only if the matcher hit the end of input.
394       * @since 1.5
395       */
396      public boolean hitEnd()
397      {
398        return inputCharIndexed.hitEnd();
399      }
400    
401      /**
402       * @return A string expression of this matcher.
403       */
404      public String toString()
405      {
406        CPStringBuilder sb = new CPStringBuilder();
407        sb.append(this.getClass().getName())
408          .append("[pattern=").append(pattern.pattern())
409          .append(" region=").append(regionStart).append(",").append(regionEnd)
410          .append(" anchoringBounds=").append(anchoringBounds == 0)
411          .append(" transparentBounds=").append(transparentBounds)
412          .append(" lastmatch=").append(match == null ? "" : match.toString())
413          .append("]");
414        return sb.toString();
415      }
416    
417      private void assertMatchOp()
418      {
419        if (match == null) throw new IllegalStateException();
420      }
421    
422      /**
423       * <p>
424       * Defines the region of the input on which to match.
425       * By default, the {@link Matcher} attempts to match
426       * the whole string (from 0 to the length of the input),
427       * but a region between {@code start} (inclusive) and
428       * {@code end} (exclusive) on which to match may instead
429       * be defined using this method.
430       * </p>
431       * <p>
432       * The behaviour of region matching is further affected
433       * by the use of transparent or opaque bounds (see
434       * {@link #useTransparentBounds(boolean)}) and whether or not
435       * anchors ({@code ^} and {@code $}) are in use
436       * (see {@link #useAnchoringBounds(boolean)}).  With transparent
437       * bounds, the matcher is aware of input outside the bounds
438       * set by this method, whereas, with opaque bounds (the default)
439       * only the input within the bounds is used.  The use of
440       * anchors are affected by this setting; with transparent
441       * bounds, anchors will match the beginning of the real input,
442       * while with opaque bounds they match the beginning of the
443       * region.  {@link #useAnchoringBounds(boolean)} can be used
444       * to turn on or off the matching of anchors.
445       * </p>
446       *
447       * @param start the start of the region (inclusive).
448       * @param end the end of the region (exclusive).
449       * @return a reference to this matcher.
450       * @throws IndexOutOfBoundsException if either {@code start} or
451       *                                   {@code end} are less than zero,
452       *                                   if either {@code start} or
453       *                                   {@code end} are greater than the
454       *                                   length of the input, or if
455       *                                   {@code start} is greater than
456       *                                   {@code end}.
457       * @see #regionStart()
458       * @see #regionEnd()
459       * @see #hasTransparentBounds()
460       * @see #useTransparentBounds(boolean)
461       * @see #hasAnchoringBounds()
462       * @see #useAnchoringBounds(boolean)
463       * @since 1.5
464       */
465      public Matcher region(int start, int end)
466      {
467        int length = input.length();
468        if (start < 0)
469          throw new IndexOutOfBoundsException("The start position was less than zero.");
470        if (start >= length)
471          throw new IndexOutOfBoundsException("The start position is after the end of the input.");
472        if (end < 0)
473          throw new IndexOutOfBoundsException("The end position was less than zero.");
474        if (end > length)
475          throw new IndexOutOfBoundsException("The end position is after the end of the input.");
476        if (start > end)
477          throw new IndexOutOfBoundsException("The start position is after the end position.");
478        reset();
479        regionStart = start;
480        regionEnd = end;
481        return this;
482      }
483    
484      /**
485       * The start of the region on which to perform matches (inclusive).
486       *
487       * @return the start index of the region.
488       * @see #region(int,int)
489       * #see #regionEnd()
490       * @since 1.5
491       */
492      public int regionStart()
493      {
494        return regionStart;
495      }
496      
497      /**
498       * The end of the region on which to perform matches (exclusive).
499       *
500       * @return the end index of the region.
501       * @see #region(int,int)
502       * @see #regionStart()
503       * @since 1.5
504       */
505      public int regionEnd()
506      {
507        return regionEnd;
508      }
509    
510      /**
511       * Returns true if the bounds of the region marked by
512       * {@link #regionStart()} and {@link #regionEnd()} are
513       * transparent.  When these bounds are transparent, the
514       * matching process can look beyond them to perform
515       * lookahead, lookbehind and boundary matching operations.
516       * By default, the bounds are opaque.
517       *
518       * @return true if the bounds of the matching region are
519       *         transparent.
520       * @see #useTransparentBounds(boolean)
521       * @see #region(int,int)
522       * @see #regionStart()
523       * @see #regionEnd()
524       * @since 1.5
525       */
526      public boolean hasTransparentBounds()
527      {
528        return transparentBounds;
529      }
530    
531      /**
532       * Sets the transparency of the bounds of the region
533       * marked by {@link #regionStart()} and {@link #regionEnd()}.
534       * A value of {@code true} makes the bounds transparent,
535       * so the matcher can see beyond them to perform lookahead,
536       * lookbehind and boundary matching operations.  A value
537       * of {@code false} (the default) makes the bounds opaque,
538       * restricting the match to the input region denoted
539       * by {@link #regionStart()} and {@link #regionEnd()}.
540       *
541       * @param transparent true if the bounds should be transparent.
542       * @return a reference to this matcher.
543       * @see #hasTransparentBounds()
544       * @see #region(int,int)
545       * @see #regionStart()
546       * @see #regionEnd()
547       * @since 1.5
548       */
549      public Matcher useTransparentBounds(boolean transparent)
550      {
551        transparentBounds = transparent;
552        return this;
553      }
554    
555      /**
556       * Returns true if the matcher will honour the use of
557       * the anchoring bounds: {@code ^}, {@code \A}, {@code \Z},
558       * {@code \z} and {@code $}.  By default, the anchors
559       * are used.  Note that the effect of the anchors is
560       * also affected by {@link #hasTransparentBounds()}.
561       *
562       * @return true if the matcher will attempt to match
563       *         the anchoring bounds.
564       * @see #useAnchoringBounds(boolean)
565       * @see #hasTransparentBounds()
566       * @since 1.5
567       */
568      public boolean hasAnchoringBounds()
569      {
570        return anchoringBounds == 0;
571      }
572    
573      /**
574       * Enables or disables the use of the anchoring bounds:
575       * {@code ^}, {@code \A}, {@code \Z}, {@code \z} and
576       * {@code $}. By default, their use is enabled.  When
577       * disabled, the matcher will not attempt to match
578       * the anchors.
579       *
580       * @param useAnchors true if anchoring bounds should be used.
581       * @return a reference to this matcher.
582       * @since 1.5
583       * @see #hasAnchoringBounds()
584       */
585      public Matcher useAnchoringBounds(boolean useAnchors)
586      {
587        if (useAnchors)
588          anchoringBounds = 0;
589        else
590          anchoringBounds = RE.REG_NOTBOL|RE.REG_NOTEOL;
591        return this;
592      }
593    
594      /**
595       * Returns a read-only snapshot of the current state of
596       * the {@link Matcher} as a {@link MatchResult}.  Any
597       * subsequent changes to this instance are not reflected
598       * in the returned {@link MatchResult}.
599       *
600       * @return a {@link MatchResult} instance representing the
601       *         current state of the {@link Matcher}.
602       */
603      public MatchResult toMatchResult()
604      {
605        Matcher snapshot = new Matcher(pattern, input);
606        snapshot.match = (REMatch) match.clone();
607        return snapshot;
608      }
609    
610    }