ICU 4.8.1.1  4.8.1.1
 All Data Structures Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
regex.h
Go to the documentation of this file.
1 /*
2 **********************************************************************
3 * Copyright (C) 2002-2011, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 **********************************************************************
6 * file name: regex.h
7 * encoding: US-ASCII
8 * indentation:4
9 *
10 * created on: 2002oct22
11 * created by: Andy Heninger
12 *
13 * ICU Regular Expressions, API for C++
14 */
15 
16 #ifndef REGEX_H
17 #define REGEX_H
18 
19 //#define REGEX_DEBUG
20 
45 #include "unicode/utypes.h"
46 
47 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
48 
49 #include "unicode/uobject.h"
50 #include "unicode/unistr.h"
51 #include "unicode/utext.h"
52 #include "unicode/parseerr.h"
53 
54 #include "unicode/uregex.h"
55 
57 
58 
59 // Forward Declarations...
60 
61 class RegexMatcher;
62 class RegexPattern;
63 class UVector;
64 class UVector32;
65 class UVector64;
66 class UnicodeSet;
67 struct REStackFrame;
68 struct Regex8BitSet;
70 class RegexCImpl;
71 
72 
73 
74 
79 #ifdef REGEX_DEBUG
81  RegexPatternDump(const RegexPattern *pat);
82 #else
83  #undef RegexPatternDump
84  #define RegexPatternDump(pat)
85 #endif
86 
87 
88 
101 public:
102 
110  RegexPattern();
111 
118  RegexPattern(const RegexPattern &source);
119 
125  virtual ~RegexPattern();
126 
135  UBool operator==(const RegexPattern& that) const;
136 
145  inline UBool operator!=(const RegexPattern& that) const {
         // Inequality is the logical negation of this class's operator==.
         const UBool isEqual = this->operator==(that);
         return !isEqual;
     }
146 
152  RegexPattern &operator =(const RegexPattern &source);
153 
161  virtual RegexPattern *clone() const;
162 
163 
188  static RegexPattern * U_EXPORT2 compile( const UnicodeString &regex,
189  UParseError &pe,
190  UErrorCode &status);
191 
192 
219  static RegexPattern * U_EXPORT2 compile( UText *regex,
220  UParseError &pe,
221  UErrorCode &status);
222 
247  static RegexPattern * U_EXPORT2 compile( const UnicodeString &regex,
248  uint32_t flags,
249  UParseError &pe,
250  UErrorCode &status);
251 
252 
279  static RegexPattern * U_EXPORT2 compile( UText *regex,
280  uint32_t flags,
281  UParseError &pe,
282  UErrorCode &status);
283 
284 
307  static RegexPattern * U_EXPORT2 compile( const UnicodeString &regex,
308  uint32_t flags,
309  UErrorCode &status);
310 
311 
336  static RegexPattern * U_EXPORT2 compile( UText *regex,
337  uint32_t flags,
338  UErrorCode &status);
339 
340 
346  virtual uint32_t flags() const;
347 
365  virtual RegexMatcher *matcher(const UnicodeString &input,
366  UErrorCode &status) const;
367 
368 private:
382  RegexMatcher *matcher(const UChar *input,
383  UErrorCode &status) const;
384 public:
385 
386 
398  virtual RegexMatcher *matcher(UErrorCode &status) const;
399 
400 
415  static UBool U_EXPORT2 matches(const UnicodeString &regex,
416  const UnicodeString &input,
417  UParseError &pe,
418  UErrorCode &status);
419 
420 
435  static UBool U_EXPORT2 matches(UText *regex,
436  UText *input,
437  UParseError &pe,
438  UErrorCode &status);
439 
440 
449  virtual UnicodeString pattern() const;
450 
451 
462  virtual UText *patternText(UErrorCode &status) const;
463 
464 
503  virtual int32_t split(const UnicodeString &input,
504  UnicodeString dest[],
505  int32_t destCapacity,
506  UErrorCode &status) const;
507 
508 
547  virtual int32_t split(UText *input,
548  UText *dest[],
549  int32_t destCapacity,
550  UErrorCode &status) const;
551 
552 
558  virtual UClassID getDynamicClassID() const;
559 
565  static UClassID U_EXPORT2 getStaticClassID();
566 
567 private:
568  //
569  // Implementation Data
570  //
571  UText *fPattern; // The original pattern string.
572  UnicodeString *fPatternString; // The original pattern UnicodeString, if relevant.
573  uint32_t fFlags; // The flags used when compiling the pattern.
574  //
575  UVector64 *fCompiledPat; // The compiled pattern p-code.
576  UnicodeString fLiteralText; // Any literal string data from the pattern,
577  // after un-escaping, for use during the match.
578 
579  UVector *fSets; // Any UnicodeSets referenced from the pattern.
580  Regex8BitSet *fSets8; // (and fast sets for latin-1 range.)
581 
582 
583  UErrorCode fDeferredStatus; // status if some prior error has left this
584  // RegexPattern in an unusable state.
585 
586  int32_t fMinMatchLen; // Minimum Match Length. All matches will have length
587  // >= this value. For some patterns, this calculated
588  // value may be less than the true shortest
589  // possible match.
590 
591  int32_t fFrameSize; // Size of a state stack frame in the
592  // execution engine.
593 
594  int32_t fDataSize; // The size of the data needed by the pattern that
595  // does not go on the state stack, but has just
596  // a single copy per matcher.
597 
598  UVector32 *fGroupMap; // Map from capture group number to position of
599  // the group's variables in the matcher stack frame.
600 
601  int32_t fMaxCaptureDigits;
602 
603  UnicodeSet **fStaticSets; // Ptr to static (shared) sets for predefined
604  // regex character classes, e.g. Word.
605 
606  Regex8BitSet *fStaticSets8; // Ptr to the static (shared) latin-1 only
607  // sets for predefined regex classes.
608 
609  int32_t fStartType; // Info on how a match must start.
610  int32_t fInitialStringIdx; //
611  int32_t fInitialStringLen;
612  UnicodeSet *fInitialChars;
613  UChar32 fInitialChar;
614  Regex8BitSet *fInitialChars8;
615  UBool fNeedsAltInput;
616 
617  friend class RegexCompile;
618  friend class RegexMatcher;
619  friend class RegexCImpl;
620 
621  //
622  // Implementation Methods
623  //
624  void init(); // Common initialization, for use by constructors.
625  void zap(); // Common cleanup
626 #ifdef REGEX_DEBUG
627  void dumpOp(int32_t index) const;
628  friend void U_EXPORT2 RegexPatternDump(const RegexPattern *);
629 #endif
630 
631 };
632 
633 
634 
645 public:
646 
661  RegexMatcher(const UnicodeString &regexp, uint32_t flags, UErrorCode &status);
662 
678  RegexMatcher(UText *regexp, uint32_t flags, UErrorCode &status);
679 
701  RegexMatcher(const UnicodeString &regexp, const UnicodeString &input,
702  uint32_t flags, UErrorCode &status);
703 
725  RegexMatcher(UText *regexp, UText *input,
726  uint32_t flags, UErrorCode &status);
727 
728 private:
742  RegexMatcher(const UnicodeString &regexp, const UChar *input,
743  uint32_t flags, UErrorCode &status);
744 public:
745 
746 
752  virtual ~RegexMatcher();
753 
754 
761  virtual UBool matches(UErrorCode &status);
762 
763 
774  virtual UBool matches(int64_t startIndex, UErrorCode &status);
775 
776 
790  virtual UBool lookingAt(UErrorCode &status);
791 
792 
806  virtual UBool lookingAt(int64_t startIndex, UErrorCode &status);
807 
808 
821  virtual UBool find();
822 
823 
833  virtual UBool find(int64_t start, UErrorCode &status);
834 
835 
845  virtual UnicodeString group(UErrorCode &status) const;
846 
847 
860  virtual UnicodeString group(int32_t groupNum, UErrorCode &status) const;
861 
862 
868  virtual int32_t groupCount() const;
869 
870 
885  virtual UText *group(UText *dest, int64_t &group_len, UErrorCode &status) const;
886 
902  virtual UText *group(int32_t groupNum, UText *dest, int64_t &group_len, UErrorCode &status) const;
903 
919  virtual UText *group(int32_t groupNum, UText *dest, UErrorCode &status) const;
920 
921 
929  virtual int32_t start(UErrorCode &status) const;
930 
938  virtual int64_t start64(UErrorCode &status) const;
939 
940 
954  virtual int32_t start(int32_t group, UErrorCode &status) const;
955 
969  virtual int64_t start64(int32_t group, UErrorCode &status) const;
970 
971 
985  virtual int32_t end(UErrorCode &status) const;
986 
1000  virtual int64_t end64(UErrorCode &status) const;
1001 
1002 
1020  virtual int32_t end(int32_t group, UErrorCode &status) const;
1021 
1039  virtual int64_t end64(int32_t group, UErrorCode &status) const;
1040 
1041 
1050  virtual RegexMatcher &reset();
1051 
1052 
1068  virtual RegexMatcher &reset(int64_t index, UErrorCode &status);
1069 
1070 
1088  virtual RegexMatcher &reset(const UnicodeString &input);
1089 
1090 
1104  virtual RegexMatcher &reset(UText *input);
1105 
1106 
1131  virtual RegexMatcher &refreshInputText(UText *input, UErrorCode &status);
1132 
1133 private:
1147  RegexMatcher &reset(const UChar *input);
1148 public:
1149 
1157  virtual const UnicodeString &input() const;
1158 
1167  virtual UText *inputText() const;
1168 
1179  virtual UText *getInput(UText *dest, UErrorCode &status) const;
1180 
1181 
1200  virtual RegexMatcher &region(int64_t start, int64_t limit, UErrorCode &status);
1201 
1213  virtual RegexMatcher &region(int64_t regionStart, int64_t regionLimit, int64_t startIndex, UErrorCode &status);
1214 
1223  virtual int32_t regionStart() const;
1224 
1233  virtual int64_t regionStart64() const;
1234 
1235 
1244  virtual int32_t regionEnd() const;
1245 
1254  virtual int64_t regionEnd64() const;
1255 
1264  virtual UBool hasTransparentBounds() const;
1265 
1284  virtual RegexMatcher &useTransparentBounds(UBool b);
1285 
1286 
1294  virtual UBool hasAnchoringBounds() const;
1295 
1296 
1309  virtual RegexMatcher &useAnchoringBounds(UBool b);
1310 
1311 
1324  virtual UBool hitEnd() const;
1325 
1335  virtual UBool requireEnd() const;
1336 
1337 
1343  virtual const RegexPattern &pattern() const;
1344 
1345 
1362  virtual UnicodeString replaceAll(const UnicodeString &replacement, UErrorCode &status);
1363 
1364 
1385  virtual UText *replaceAll(UText *replacement, UText *dest, UErrorCode &status);
1386 
1387 
1408  virtual UnicodeString replaceFirst(const UnicodeString &replacement, UErrorCode &status);
1409 
1410 
1435  virtual UText *replaceFirst(UText *replacement, UText *dest, UErrorCode &status);
1436 
1437 
1465  virtual RegexMatcher &appendReplacement(UnicodeString &dest,
1466  const UnicodeString &replacement, UErrorCode &status);
1467 
1468 
1496  virtual RegexMatcher &appendReplacement(UText *dest,
1497  UText *replacement, UErrorCode &status);
1498 
1499 
1510  virtual UnicodeString &appendTail(UnicodeString &dest);
1511 
1512 
1526  virtual UText *appendTail(UText *dest, UErrorCode &status);
1527 
1528 
1552  virtual int32_t split(const UnicodeString &input,
1553  UnicodeString dest[],
1554  int32_t destCapacity,
1555  UErrorCode &status);
1556 
1557 
1581  virtual int32_t split(UText *input,
1582  UText *dest[],
1583  int32_t destCapacity,
1584  UErrorCode &status);
1585 
1607  virtual void setTimeLimit(int32_t limit, UErrorCode &status);
1608 
1615  virtual int32_t getTimeLimit() const;
1616 
1638  virtual void setStackLimit(int32_t limit, UErrorCode &status);
1639 
1647  virtual int32_t getStackLimit() const;
1648 
1649 
1663  virtual void setMatchCallback(URegexMatchCallback *callback,
1664  const void *context,
1665  UErrorCode &status);
1666 
1667 
1678  virtual void getMatchCallback(URegexMatchCallback *&callback,
1679  const void *&context,
1680  UErrorCode &status);
1681 
1682 
1696  virtual void setFindProgressCallback(URegexFindProgressCallback *callback,
1697  const void *context,
1698  UErrorCode &status);
1699 
1700 
1711  virtual void getFindProgressCallback(URegexFindProgressCallback *&callback,
1712  const void *&context,
1713  UErrorCode &status);
1714 
1715 
1721  void setTrace(UBool state);
1722 
1723 
1729  static UClassID U_EXPORT2 getStaticClassID();
1730 
1736  virtual UClassID getDynamicClassID() const;
1737 
1738 private:
1739  // Constructors and other object boilerplate are private.
1740  // Instances of RegexMatcher can not be assigned, copied, cloned, etc.
1741  RegexMatcher(); // default constructor not implemented
1742  RegexMatcher(const RegexPattern *pat);
1743  RegexMatcher(const RegexMatcher &other);
1744  RegexMatcher &operator =(const RegexMatcher &rhs);
1745  void init(UErrorCode &status); // Common initialization
1746  void init2(UText *t, UErrorCode &e); // Common initialization, part 2.
1747 
1748  friend class RegexPattern;
1749  friend class RegexCImpl;
1750 public:
1752  void resetPreserveRegion(); // Reset matcher state, but preserve any region.
1753 private:
1754 
1755  //
1756  // MatchAt: the internal interface to the match engine itself.
1757  // Match status comes back in matcher member variables.
1758  //
1759  void MatchAt(int64_t startIdx, UBool toEnd, UErrorCode &status);
1760  inline void backTrack(int64_t &inputIdx, int32_t &patIdx);
1761  UBool isWordBoundary(int64_t pos); // perform Perl-like \b test
1762  UBool isUWordBoundary(int64_t pos); // perform RBBI based \b test
1763  REStackFrame *resetStack();
1764  inline REStackFrame *StateSave(REStackFrame *fp, int64_t savePatIdx, UErrorCode &status);
1765  void IncrementTime(UErrorCode &status);
1766  UBool ReportFindProgress(int64_t matchIndex, UErrorCode &status);
1767 
1768  int64_t appendGroup(int32_t groupNum, UText *dest, UErrorCode &status) const;
1769 
1770  UBool findUsingChunk();
1771  void MatchChunkAt(int32_t startIdx, UBool toEnd, UErrorCode &status);
1772  UBool isChunkWordBoundary(int32_t pos);
1773 
1774  const RegexPattern *fPattern;
1775  RegexPattern *fPatternOwned; // Non-NULL if this matcher owns the pattern, and
1776  // should delete it when through.
1777 
1778  const UnicodeString *fInput; // The string being matched. Only used for input()
1779  UText *fInputText; // The text being matched. Is never NULL.
1780  UText *fAltInputText; // A shallow copy of the text being matched.
1781  // Only created if the pattern contains backreferences.
1782  int64_t fInputLength; // Full length of the input text.
1783  int32_t fFrameSize; // The size of a frame in the backtrack stack.
1784 
1785  int64_t fRegionStart; // Start of the input region, default = 0.
1786  int64_t fRegionLimit; // End of input region, default to input.length.
1787 
1788  int64_t fAnchorStart; // Region bounds for anchoring operations (^ or $).
1789  int64_t fAnchorLimit; // See useAnchoringBounds
1790 
1791  int64_t fLookStart; // Region bounds for look-ahead/behind and
1792  int64_t fLookLimit; // other boundary tests. See
1793  // useTransparentBounds
1794 
1795  int64_t fActiveStart; // Currently active bounds for matching.
1796  int64_t fActiveLimit; // Usually is the same as region, but
1797  // is changed to fLookStart/Limit when
1798  // entering look around regions.
1799 
1800  UBool fTransparentBounds; // True if using transparent bounds.
1801  UBool fAnchoringBounds; // True if using anchoring bounds.
1802 
1803  UBool fMatch; // True if the last attempted match was successful.
1804  int64_t fMatchStart; // Position of the start of the most recent match
1805  int64_t fMatchEnd; // First position after the end of the most recent match
1806  // Zero if no previous match, even when a region
1807  // is active.
1808  int64_t fLastMatchEnd; // First position after the end of the previous match,
1809  // or -1 if there was no previous match.
1810  int64_t fAppendPosition; // First position after the end of the previous
1811  // appendReplacement(). As described by the
1812  // JavaDoc for Java Matcher, where it is called
1813  // "append position"
1814  UBool fHitEnd; // True if the last match touched the end of input.
1815  UBool fRequireEnd; // True if the last match required end-of-input
1816  // (matched $ or Z)
1817 
1818  UVector64 *fStack;
1819  REStackFrame *fFrame; // After finding a match, the last active stack frame,
1820  // which will contain the capture group results.
1821  // NOT valid while match engine is running.
1822 
1823  int64_t *fData; // Data area for use by the compiled pattern.
1824  int64_t fSmallData[8]; // Use this for data if it's enough.
1825 
1826  int32_t fTimeLimit; // Max time (in arbitrary steps) to let the
1827  // match engine run. Zero for unlimited.
1828 
1829  int32_t fTime; // Match time, accumulates while matching.
1830  int32_t fTickCounter; // Low bits counter for time. Counts down StateSaves.
1831  // Kept separately from fTime to keep as much
1832  // code as possible out of the inline
1833  // StateSave function.
1834 
1835  int32_t fStackLimit; // Maximum memory size to use for the backtrack
1836  // stack, in bytes. Zero for unlimited.
1837 
1838  URegexMatchCallback *fCallbackFn; // Pointer to match progress callback funct.
1839  // NULL if there is no callback.
1840  const void *fCallbackContext; // User Context ptr for callback function.
1841 
1842  URegexFindProgressCallback *fFindProgressCallbackFn; // Pointer to find progress callback funct.
1843  // NULL if there is no callback.
1844  const void *fFindProgressCallbackContext; // User Context ptr for callback function.
1845 
1846 
1847  UBool fInputUniStrMaybeMutable; // Set when fInputText wraps a UnicodeString that may be mutable - compatibility.
1848 
1849  UBool fTraceDebug; // Set true for debug tracing of match engine.
1850 
1851  UErrorCode fDeferredStatus; // Save error state that cannot be immediately
1852  // reported, or that permanently disables this matcher.
1853 
1854  RuleBasedBreakIterator *fWordBreakItr;
1855 
1856 
1857 };
1858 
1860 #endif // UCONFIG_NO_REGULAR_EXPRESSIONS
1861 #endif