import cl @41939
diff --git a/i18n/regexcmp.cpp b/i18n/regexcmp.cpp
index 09da39c..2c84e3d 100644
--- a/i18n/regexcmp.cpp
+++ b/i18n/regexcmp.cpp
@@ -313,7 +313,6 @@
// Optimization passes
//
matchStartType();
- OptDotStar();
stripNOPs();
//
@@ -515,14 +514,29 @@
case doOpenLookAhead:
// Positive Look-ahead (?= stuff )
+ //
+ // Note: Addition of transparent input regions, with the need to
+ // restore the original regions when failing out of a lookahead
+ // block, complicated this sequence. Some combined opcodes
+ // might make sense - or might not, lookaheads aren't that common.
+ //
+ // Caution: min match length optimization knows about this
+ // sequence; don't change without making updates there too.
+ //
// Compiles to
- // 1 START_LA dataLoc
- // 2. NOP reserved for use by quantifiers on the block.
+ // 1 START_LA dataLoc Saves SP, Input Pos
+ // 2. STATE_SAVE 4 on failure of lookahead, goto 4
+ // 3 JMP 6 continue ...
+ //
+ // 4. LA_END Look Ahead failed. Restore regions.
+ // 5. BACKTRACK and back track again.
+ //
+ // 6. NOP reserved for use by quantifiers on the block.
// Look-ahead can't have quantifiers, but paren stack
// compile time conventions require the slot anyhow.
- // 3. NOP may be replaced if there is are '|' ops in the block.
- // 4. code for parenthesized stuff.
- // 5. ENDLA
+ // 7. NOP may be replaced if there are '|' ops in the block.
+ // 8. code for parenthesized stuff.
+ // 9. LA_END
//
// Two data slots are reserved, for saving the stack ptr and the input position.
{
@@ -531,6 +545,18 @@
int32_t op = URX_BUILD(URX_LA_START, dataLoc);
fRXPat->fCompiledPat->addElement(op, *fStatus);
+ op = URX_BUILD(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+ 2);
+ fRXPat->fCompiledPat->addElement(op, *fStatus);
+
+ op = URX_BUILD(URX_JMP, fRXPat->fCompiledPat->size()+ 3);
+ fRXPat->fCompiledPat->addElement(op, *fStatus);
+
+ op = URX_BUILD(URX_LA_END, dataLoc);
+ fRXPat->fCompiledPat->addElement(op, *fStatus);
+
+ op = URX_BUILD(URX_BACKTRACK, 0);
+ fRXPat->fCompiledPat->addElement(op, *fStatus);
+
op = URX_BUILD(URX_NOP, 0);
fRXPat->fCompiledPat->addElement(op, *fStatus);
fRXPat->fCompiledPat->addElement(op, *fStatus);
@@ -554,7 +580,8 @@
// 4. code for parenthesized stuff.
// 5. END_LA // Cut back stack, remove saved state from step 2.
// 6. FAIL // code in block succeeded, so neg. lookahead fails.
- // 7. ...
+ // 7. END_LA // Restore match region, in case look-ahead was using
+ // an alternate (transparent) region.
{
int32_t dataLoc = fRXPat->fDataSize;
fRXPat->fDataSize += 2;
@@ -570,7 +597,7 @@
// On the Parentheses stack, start a new frame and add the postions
// of the StateSave and NOP.
fParenStack.push(fModeFlags, *fStatus); // Match mode state
- fParenStack.push( negLookAhead, *fStatus); // Frame type
+ fParenStack.push(negLookAhead, *fStatus); // Frame type
fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The STATE_SAVE location
fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The second NOP location
@@ -750,13 +777,17 @@
}
if (URX_TYPE(repeatedOp) == URX_DOTANY ||
- URX_TYPE(repeatedOp) == URX_DOTANY_ALL) {
+ URX_TYPE(repeatedOp) == URX_DOTANY_ALL ||
+ URX_TYPE(repeatedOp) == URX_DOTANY_UNIX) {
// Emit Optimized code for .+ operations.
int32_t loopOpI = URX_BUILD(URX_LOOP_DOT_I, 0);
if (URX_TYPE(repeatedOp) == URX_DOTANY_ALL) {
- // URX_LOOP_DOT_I operand is a flag indicating . matches any mode.
+ // URX_LOOP_DOT_I operand is a flag indicating ". matches any" mode.
loopOpI |= 1;
}
+ if (fModeFlags & UREGEX_UNIX_LINES) {
+ loopOpI |= 2;
+ }
fRXPat->fCompiledPat->addElement(loopOpI, *fStatus);
frameLoc = fRXPat->fFrameSize;
fRXPat->fFrameSize++;
@@ -889,13 +920,17 @@
}
if (URX_TYPE(repeatedOp) == URX_DOTANY ||
- URX_TYPE(repeatedOp) == URX_DOTANY_ALL) {
+ URX_TYPE(repeatedOp) == URX_DOTANY_ALL ||
+ URX_TYPE(repeatedOp) == URX_DOTANY_UNIX) {
// Emit Optimized code for .* operations.
int32_t loopOpI = URX_BUILD(URX_LOOP_DOT_I, 0);
if (URX_TYPE(repeatedOp) == URX_DOTANY_ALL) {
// URX_LOOP_DOT_I operand is a flag indicating . matches any mode.
loopOpI |= 1;
}
+ if ((fModeFlags & UREGEX_UNIX_LINES) != 0) {
+ loopOpI |= 2;
+ }
fRXPat->fCompiledPat->setElementAt(loopOpI, topLoc);
dataLoc = fRXPat->fFrameSize;
fRXPat->fFrameSize++;
@@ -1068,6 +1103,8 @@
int32_t op;
if (fModeFlags & UREGEX_DOTALL) {
op = URX_BUILD(URX_DOTANY_ALL, 0);
+ } else if (fModeFlags & UREGEX_UNIX_LINES) {
+ op = URX_BUILD(URX_DOTANY_UNIX, 0);
} else {
op = URX_BUILD(URX_DOTANY, 0);
}
@@ -1077,15 +1114,35 @@
case doCaret:
{
- int32_t op = (fModeFlags & UREGEX_MULTILINE)? URX_CARET_M : URX_CARET;
+ // Select the opcode for '^' from the MULTILINE and UNIX_LINES mode flags:
+ //   neither flag     -> URX_CARET
+ //   MULTILINE only   -> URX_CARET_M
+ //   UNIX_LINES only  -> URX_CARET   (only testing true start of input)
+ //   both flags       -> URX_CARET_M_UNIX
+ // UNIX_LINES alone leaves '^' unchanged, so URX_CARET is the default and
+ // only the MULTILINE case needs a branch. Initializing op where it is
+ // declared means it can never be read before it is assigned.
+ int32_t op = URX_CARET;
+ if ((fModeFlags & UREGEX_MULTILINE) != 0) {
+ // Multi-line mode: UNIX_LINES picks the UNIX variant of the opcode.
+ op = ((fModeFlags & UREGEX_UNIX_LINES) != 0)? URX_CARET_M_UNIX : URX_CARET_M;
+ }
fRXPat->fCompiledPat->addElement(URX_BUILD(op, 0), *fStatus);
}
break;
-
case doDollar:
{
- int32_t op = (fModeFlags & UREGEX_MULTILINE)? URX_DOLLAR_M : URX_DOLLAR;
+ int32_t op; // Opcode for '$', chosen from the MULTILINE and UNIX_LINES mode flags.
+ if ( (fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & UREGEX_UNIX_LINES) == 0) {
+ op = URX_DOLLAR;
+ } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UREGEX_UNIX_LINES) == 0) {
+ op = URX_DOLLAR_M;
+ } else if ((fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & UREGEX_UNIX_LINES) != 0) {
+ op = URX_DOLLAR_D;
+ } else { // Both MULTILINE and UNIX_LINES set; a plain else keeps op assigned on every path.
+ op = URX_DOLLAR_MD;
+ }
fRXPat->fCompiledPat->addElement(URX_BUILD(op, 0), *fStatus);
}
break;
@@ -1340,6 +1397,7 @@
int32_t bit = 0;
switch (fC.fChar) {
case 0x69: /* 'i' */ bit = UREGEX_CASE_INSENSITIVE; break;
+ case 0x64: /* 'd' */ bit = UREGEX_UNIX_LINES; break;
case 0x6d: /* 'm' */ bit = UREGEX_MULTILINE; break;
case 0x73: /* 's' */ bit = UREGEX_DOTALL; break;
case 0x77: /* 'w' */ bit = UREGEX_UWORD; break;
@@ -2062,7 +2120,7 @@
case lookAhead:
{
- int32_t startOp = fRXPat->fCompiledPat->elementAti(fMatchOpenParen-1);
+ int32_t startOp = fRXPat->fCompiledPat->elementAti(fMatchOpenParen-5);
U_ASSERT(URX_TYPE(startOp) == URX_LA_START);
int32_t dataLoc = URX_VAL(startOp);
int32_t op = URX_BUILD(URX_LA_END, dataLoc);
@@ -2078,13 +2136,16 @@
int32_t dataLoc = URX_VAL(startOp);
int32_t op = URX_BUILD(URX_LA_END, dataLoc);
fRXPat->fCompiledPat->addElement(op, *fStatus);
- op = URX_BUILD(URX_FAIL, 0);
+ op = URX_BUILD(URX_BACKTRACK, 0);
+ fRXPat->fCompiledPat->addElement(op, *fStatus);
+ op = URX_BUILD(URX_LA_END, dataLoc); // Same data location as the block's LA_START, like the other LA_END ops.
fRXPat->fCompiledPat->addElement(op, *fStatus);
// Patch the URX_SAVE near the top of the block.
+ // The destination of the SAVE is the final LA_END that was just added.
int32_t saveOp = fRXPat->fCompiledPat->elementAti(fMatchOpenParen);
U_ASSERT(URX_TYPE(saveOp) == URX_STATE_SAVE);
- int32_t dest = fRXPat->fCompiledPat->size();
+ int32_t dest = fRXPat->fCompiledPat->size()-1;
saveOp = URX_BUILD(URX_STATE_SAVE, dest);
fRXPat->fCompiledPat->setElementAt(saveOp, fMatchOpenParen);
}
@@ -2392,7 +2453,6 @@
// If the op we are now at was the destination of a branch in the pattern,
// and that path has a shorter minimum length than the current accumulated value,
// replace the current accumulated value.
- U_ASSERT(currentLen>=0 && currentLen < INT32_MAX);
if (forwardedLength.elementAti(loc) < currentLen) {
currentLen = forwardedLength.elementAti(loc);
U_ASSERT(currentLen>=0 && currentLen < INT32_MAX);
@@ -2402,6 +2462,7 @@
// Ops that don't change the total length matched
case URX_RESERVED_OP:
case URX_END:
+ case URX_FAIL:
case URX_STRING_LEN:
case URX_NOP:
case URX_START_CAPTURE:
@@ -2411,10 +2472,11 @@
case URX_BACKSLASH_G:
case URX_BACKSLASH_Z:
case URX_DOLLAR:
+ case URX_DOLLAR_M:
+ case URX_DOLLAR_D:
+ case URX_DOLLAR_MD:
case URX_RELOC_OPRND:
case URX_STO_INP_LOC:
- case URX_DOLLAR_M:
- case URX_BACKTRACK:
case URX_BACKREF: // BackRef. Must assume that it might be a zero length match
case URX_BACKREF_I:
@@ -2429,6 +2491,7 @@
break;
case URX_CARET_M:
+ case URX_CARET_M_UNIX:
if (atStart) {
fRXPat->fStartType = START_LINE;
}
@@ -2553,8 +2616,7 @@
case URX_BACKSLASH_X: // Grahpeme Cluster. Minimum is 1, max unbounded.
case URX_DOTANY_ALL: // . matches one or two.
case URX_DOTANY:
- case URX_DOTANY_ALL_PL:
- case URX_DOTANY_PL:
+ case URX_DOTANY_UNIX:
if (currentLen == 0) {
// These constructs are all bad news when they appear at the start
// of a match. Any character can begin the match.
@@ -2595,7 +2657,7 @@
atStart = FALSE;
break;
- case URX_FAIL:
+ case URX_BACKTRACK:
// Fails are kind of like a branch, except that the min length was
// propagated already, by the state save.
currentLen = forwardedLength.elementAti(loc+1);
@@ -2718,18 +2780,25 @@
{
// Look-around. Scan forward until the matching look-ahead end,
// without processing the look-around block. This is overly pessimistic.
- int32_t depth = 0;
+
+ // Keep track of the nesting depth of look-around blocks. Boilerplate code for
+ // lookahead contains two LA_END instructions, so count goes up by two
+ // for each LA_START.
+ int32_t depth = (opType == URX_LA_START? 2: 1);
for (;;) {
loc++;
op = fRXPat->fCompiledPat->elementAti(loc);
- if (URX_TYPE(op) == URX_LA_START || URX_TYPE(op) == URX_LB_START) {
+ if (URX_TYPE(op) == URX_LA_START) {
+ depth+=2;
+ }
+ if (URX_TYPE(op) == URX_LB_START) {
depth++;
}
if (URX_TYPE(op) == URX_LA_END || URX_TYPE(op)==URX_LBN_END) {
+ depth--;
if (depth == 0) {
break;
}
- depth--;
}
if (URX_TYPE(op) == URX_STATE_SAVE) {
// Need this because neg lookahead blocks will FAIL to outside
@@ -2863,7 +2932,8 @@
// If the op we are now at was the destination of a branch in the pattern,
// and that path has a shorter minimum length than the current accumulated value,
// replace the current accumulated value.
- U_ASSERT(currentLen>=0 && currentLen < INT32_MAX);
+ // U_ASSERT(currentLen>=0 && currentLen < INT32_MAX); // MinLength == INT32_MAX for some
+ // no-match-possible cases.
if (forwardedLength.elementAti(loc) < currentLen) {
currentLen = forwardedLength.elementAti(loc);
U_ASSERT(currentLen>=0 && currentLen < INT32_MAX);
@@ -2883,11 +2953,13 @@
case URX_BACKSLASH_Z:
case URX_CARET:
case URX_DOLLAR:
+ case URX_DOLLAR_M:
+ case URX_DOLLAR_D:
+ case URX_DOLLAR_MD:
case URX_RELOC_OPRND:
case URX_STO_INP_LOC:
- case URX_DOLLAR_M:
case URX_CARET_M:
- case URX_BACKTRACK:
+ case URX_CARET_M_UNIX:
case URX_BACKREF: // BackRef. Must assume that it might be a zero length match
case URX_BACKREF_I:
@@ -2910,8 +2982,7 @@
case URX_BACKSLASH_X: // Grahpeme Cluster. Minimum is 1, max unbounded.
case URX_DOTANY_ALL: // . matches one or two.
case URX_DOTANY:
- case URX_DOTANY_PL:
- case URX_DOTANY_ALL_PL:
+ case URX_DOTANY_UNIX:
currentLen++;
break;
@@ -2936,12 +3007,11 @@
}
break;
- case URX_FAIL:
+ case URX_BACKTRACK:
{
- // Fails are kind of like a branch, except that the min length was
+ // Back-tracks are kind of like a branch, except that the min length was
// propagated already, by the state save.
currentLen = forwardedLength.elementAti(loc+1);
- U_ASSERT(currentLen>=0 && currentLen < INT32_MAX);
}
break;
@@ -3008,21 +3078,33 @@
case URX_LB_START:
{
// Look-around. Scan forward until the matching look-ahead end,
- // without processing the look-around block. This is overly pessimistic.
+ // without processing the look-around block. This is overly pessimistic for look-ahead,
+ // it assumes that the look-ahead match might be zero-length.
// TODO: Positive lookahead could recursively do the block, then continue
// with the longer of the block or the value coming in.
- int32_t depth = 0;
+ int32_t depth = (opType == URX_LA_START? 2: 1); // Look-ahead boilerplate holds two LA_END ops, so LA_START counts as two.
for (;;) {
loc++;
op = fRXPat->fCompiledPat->elementAti(loc);
- if (URX_TYPE(op) == URX_LA_START || URX_TYPE(op) == URX_LB_START) {
+ if (URX_TYPE(op) == URX_LA_START) {
+ // The boilerplate for look-ahead includes two LA_END instructions;
+ // depth will be decremented by each one when it is seen.
+ depth += 2;
+ }
+ if (URX_TYPE(op) == URX_LB_START) {
depth++;
}
- if (URX_TYPE(op) == URX_LA_END || URX_TYPE(op)==URX_LBN_END) {
+ if (URX_TYPE(op) == URX_LA_END) {
+ depth--;
if (depth == 0) {
break;
}
+ }
+ if (URX_TYPE(op)==URX_LBN_END) {
depth--;
+ if (depth == 0) {
+ break;
+ }
}
if (URX_TYPE(op) == URX_STATE_SAVE) {
// Need this because neg lookahead blocks will FAIL to outside
@@ -3034,7 +3116,9 @@
}
}
}
-
+ if (loc > end) {
+ RegexPatternDump(fRXPat);
+ }
U_ASSERT(loc <= end);
}
}
@@ -3123,11 +3207,13 @@
case URX_BACKSLASH_Z:
case URX_CARET:
case URX_DOLLAR:
+ case URX_DOLLAR_M:
+ case URX_DOLLAR_D:
+ case URX_DOLLAR_MD:
case URX_RELOC_OPRND:
case URX_STO_INP_LOC:
- case URX_DOLLAR_M:
case URX_CARET_M:
- case URX_BACKTRACK:
+ case URX_CARET_M_UNIX:
case URX_STO_SP: // Setup for atomic or possessive blocks. Doesn't change what can match.
case URX_LD_SP:
@@ -3145,8 +3231,6 @@
case URX_BACKREF: // BackRef. Must assume that it might be a zero length match
case URX_BACKREF_I:
case URX_BACKSLASH_X: // Grahpeme Cluster. Minimum is 1, max unbounded.
- case URX_DOTANY_PL:
- case URX_DOTANY_ALL_PL:
currentLen = INT32_MAX;
break;
@@ -3160,6 +3244,7 @@
case URX_ONECHAR_I:
case URX_DOTANY_ALL:
case URX_DOTANY:
+ case URX_DOTANY_UNIX:
currentLen+=2;
break;
@@ -3193,8 +3278,8 @@
}
break;
- case URX_FAIL:
- // Fails are kind of like a branch, except that the max length was
+ case URX_BACKTRACK:
+ // back-tracks are kind of like a branch, except that the max length was
// propagated already, by the state save.
currentLen = forwardedLength.elementAti(loc+1);
break;
@@ -3377,13 +3462,12 @@
case URX_BACKSLASH_X:
case URX_BACKSLASH_Z:
case URX_DOTANY_ALL:
- case URX_DOTANY_ALL_PL:
- case URX_DOTANY_PL:
case URX_BACKSLASH_D:
case URX_CARET:
case URX_DOLLAR:
case URX_CTR_INIT:
case URX_CTR_INIT_NG:
+ case URX_DOTANY_UNIX:
case URX_STO_SP:
case URX_LD_SP:
case URX_BACKREF:
@@ -3395,6 +3479,7 @@
case URX_BACKREF_I:
case URX_DOLLAR_M:
case URX_CARET_M:
+ case URX_CARET_M_UNIX:
case URX_LB_START:
case URX_LB_CONT:
case URX_LB_END:
@@ -3403,6 +3488,8 @@
case URX_LOOP_SR_I:
case URX_LOOP_DOT_I:
case URX_LOOP_C:
+ case URX_DOLLAR_D:
+ case URX_DOLLAR_MD:
// These instructions are unaltered by the relocation.
fRXPat->fCompiledPat->setElementAt(op, dst);
dst++;
@@ -3423,83 +3510,6 @@
//------------------------------------------------------------------------------
//
-// OptDotStar Optimize patterns that end with a '.*' or '.+' to
-// just advance the input to the end.
-//
-// Transform this compiled sequence
-// [DOT_ANY | DOT_ANY_ALL]
-// JMP_SAV to previous instruction
-// [NOP | END_CAPTURE | DOLLAR | BACKSLASH_Z]*
-// END
-//
-// To
-// NOP
-// [DOT_ANY_PL | DOT_ANY_ALL_PL]
-// [NOP | END_CAPTURE | DOLLAR | BACKSLASH_Z]*
-// END
-//
-//------------------------------------------------------------------------------
-void RegexCompile::OptDotStar() {
- // Scan backwards in the pattern, looking for a JMP_SAV near the end.
- int32_t jmpLoc;
- int32_t op = 0;
- int32_t opType;
- for (jmpLoc=fRXPat->fCompiledPat->size(); jmpLoc--;) {
- U_ASSERT(jmpLoc>0);
- op = fRXPat->fCompiledPat->elementAti(jmpLoc);
- opType = URX_TYPE(op);
- switch(opType) {
-
-
- case URX_END:
- case URX_NOP:
- case URX_END_CAPTURE:
- case URX_DOLLAR_M:
- case URX_DOLLAR:
- case URX_BACKSLASH_Z:
- // These ops may follow the JMP_SAV without preventing us from
- // doing this optimization.
- continue;
-
- case URX_JMP_SAV:
- // Got a trailing JMP_SAV that's a candidate for optimization.
- break;
-
- default:
- // This optimization not possible.
- return;
- }
- break; // from the for loop.
- }
-
- // We found in URX_JMP_SAV near the end that is a candidate for optimizing.
- // Is the target address the previous instruction?
- // Is the previous instruction a flavor of URX_DOTANY
- int32_t loopTopLoc = URX_VAL(op);
- if (loopTopLoc != jmpLoc-1) {
- return;
- }
- int32_t newOp;
- int32_t oldOp = fRXPat->fCompiledPat->elementAti(loopTopLoc);
- int32_t oldOpType = opType = URX_TYPE(oldOp);
- if (oldOpType == URX_DOTANY) {
- newOp = URX_BUILD(URX_DOTANY_PL, 0);
- }
- else if (oldOpType == URX_DOTANY_ALL) {
- newOp = URX_BUILD(URX_DOTANY_ALL_PL, 0);
- } else {
- return; // Sequence we were looking for isn't there.
- }
-
- // Substitute the new instructions into the pattern.
- // The NOP will be removed in a later optimization step.
- fRXPat->fCompiledPat->setElementAt(URX_BUILD(URX_NOP, 0), loopTopLoc);
- fRXPat->fCompiledPat->setElementAt(newOp, jmpLoc);
-}
-
-
-//------------------------------------------------------------------------------
-//
// Error Report a rule parse error.
// Only report it if no previous error has been recorded.
//
@@ -3701,6 +3711,10 @@
for (index=0; index<3; index++) {
int32_t ch = peekCharLL();
if (ch<chDigit0 || ch>chDigit7) {
+ if (index==0) {
+ // \0 is not followed by any octal digits.
+ error(U_REGEX_BAD_ESCAPE_SEQUENCE);
+ }
break;
}
c.fChar <<= 3;
@@ -3975,13 +3989,27 @@
//
// The property as it was didn't work.
- // Do an emergency fixe -
+ // Do emergency fixes -
// InGreek -> InGreek or Coptic, that being the official Unicode name for that block.
+ // InCombiningMarksforSymbols -> InCombiningDiacriticalMarksforSymbols.
+ //
+ // Note on Spaces: either "InCombiningMarksForSymbols" or "InCombining Marks for Symbols"
+ // is accepted by Java. The property part of the name is compared
+ // case-insensitively. The spaces must be exactly as shown, either
+ // all there, or all omitted, with exactly one at each position
+ // if they are present. From checking against JDK 1.6
+ //
+ // This code should be removed when ICU properties support the Java compatibility names
+ // (ICU 4.0?)
//
UnicodeString mPropName = propName;
if (mPropName.caseCompare(UnicodeString("InGreek", -1, UnicodeString::kInvariant), 0) == 0) {
mPropName = UnicodeString("InGreek and Coptic", -1 ,UnicodeString::kInvariant);
}
+ else if (mPropName.caseCompare(UnicodeString("InCombining Marks for Symbols", -1, UnicodeString::kInvariant), 0) == 0 || // spelling with spaces
+ mPropName.caseCompare(UnicodeString("InCombiningMarksforSymbols", -1, UnicodeString::kInvariant), 0) == 0) { // spelling without spaces
+ mPropName = UnicodeString("InCombining Diacritical Marks for Symbols", -1 ,UnicodeString::kInvariant); // else-if: at most one rename in this chain can apply
+ }
else if (mPropName.compare(UnicodeString("all", -1, UnicodeString::kInvariant)) == 0) {
mPropName = UnicodeString("javaValidCodePoint", -1 ,UnicodeString::kInvariant);
}
diff --git a/i18n/regexcmp.h b/i18n/regexcmp.h
index a0248a3..00d932b 100644
--- a/i18n/regexcmp.h
+++ b/i18n/regexcmp.h
@@ -109,7 +109,6 @@
int32_t end);
void matchStartType();
void stripNOPs();
- void OptDotStar();
void setEval(int32_t op);
void setPushOp(int32_t op);
diff --git a/i18n/regexcst.h b/i18n/regexcst.h
index 5d31937..8c75310 100644
--- a/i18n/regexcst.h
+++ b/i18n/regexcst.h
@@ -133,20 +133,20 @@
, {doPatStart, 255, 2,0, FALSE} // 1 start
, {doLiteralChar, 254, 14,0, TRUE} // 2 term
, {doLiteralChar, 129, 14,0, TRUE} // 3
- , {doSetBegin, 91 /* [ */, 100, 178, TRUE} // 4
+ , {doSetBegin, 91 /* [ */, 102, 180, TRUE} // 4
, {doNOP, 40 /* ( */, 27,0, TRUE} // 5
, {doDotAny, 46 /* . */, 14,0, TRUE} // 6
, {doCaret, 94 /* ^ */, 14,0, TRUE} // 7
, {doDollar, 36 /* $ */, 14,0, TRUE} // 8
- , {doNOP, 92 /* \ */, 80,0, TRUE} // 9
+ , {doNOP, 92 /* \ */, 82,0, TRUE} // 9
, {doOrOperator, 124 /* | */, 2,0, TRUE} // 10
, {doCloseParen, 41 /* ) */, 255,0, TRUE} // 11
, {doPatFinish, 253, 2,0, FALSE} // 12
- , {doRuleError, 255, 179,0, FALSE} // 13
- , {doNOP, 42 /* * */, 59,0, TRUE} // 14 expr-quant
- , {doNOP, 43 /* + */, 62,0, TRUE} // 15
- , {doNOP, 63 /* ? */, 65,0, TRUE} // 16
- , {doIntervalInit, 123 /* { */, 68,0, TRUE} // 17
+ , {doRuleError, 255, 181,0, FALSE} // 13
+ , {doNOP, 42 /* * */, 61,0, TRUE} // 14 expr-quant
+ , {doNOP, 43 /* + */, 64,0, TRUE} // 15
+ , {doNOP, 63 /* ? */, 67,0, TRUE} // 16
+ , {doIntervalInit, 123 /* { */, 70,0, TRUE} // 17
, {doNOP, 40 /* ( */, 23,0, TRUE} // 18
, {doNOP, 255, 20,0, FALSE} // 19
, {doOrOperator, 124 /* | */, 2,0, TRUE} // 20 expr-cont
@@ -154,7 +154,7 @@
, {doNOP, 255, 2,0, FALSE} // 22
, {doSuppressComments, 63 /* ? */, 25,0, TRUE} // 23 open-paren-quant
, {doNOP, 255, 27,0, FALSE} // 24
- , {doNOP, 35 /* # */, 47, 14, TRUE} // 25 open-paren-quant2
+ , {doNOP, 35 /* # */, 48, 14, TRUE} // 25 open-paren-quant2
, {doNOP, 255, 29,0, FALSE} // 26
, {doSuppressComments, 63 /* ? */, 29,0, TRUE} // 27 open-paren
, {doOpenCaptureParen, 255, 2, 14, FALSE} // 28
@@ -162,153 +162,155 @@
, {doOpenAtomicParen, 62 /* > */, 2, 14, TRUE} // 30
, {doOpenLookAhead, 61 /* = */, 2, 20, TRUE} // 31
, {doOpenLookAheadNeg, 33 /* ! */, 2, 20, TRUE} // 32
- , {doNOP, 60 /* < */, 44,0, TRUE} // 33
- , {doNOP, 35 /* # */, 47, 2, TRUE} // 34
- , {doBeginMatchMode, 105 /* i */, 50,0, FALSE} // 35
- , {doBeginMatchMode, 109 /* m */, 50,0, FALSE} // 36
- , {doBeginMatchMode, 115 /* s */, 50,0, FALSE} // 37
- , {doBeginMatchMode, 119 /* w */, 50,0, FALSE} // 38
- , {doBeginMatchMode, 120 /* x */, 50,0, FALSE} // 39
- , {doBeginMatchMode, 45 /* - */, 50,0, FALSE} // 40
- , {doConditionalExpr, 40 /* ( */, 179,0, TRUE} // 41
- , {doPerlInline, 123 /* { */, 179,0, TRUE} // 42
- , {doBadOpenParenType, 255, 179,0, FALSE} // 43
- , {doOpenLookBehind, 61 /* = */, 2, 20, TRUE} // 44 open-paren-lookbehind
- , {doOpenLookBehindNeg, 33 /* ! */, 2, 20, TRUE} // 45
- , {doBadOpenParenType, 255, 179,0, FALSE} // 46
- , {doNOP, 41 /* ) */, 255,0, TRUE} // 47 paren-comment
- , {doMismatchedParenErr, 253, 179,0, FALSE} // 48
- , {doNOP, 255, 47,0, TRUE} // 49
- , {doMatchMode, 105 /* i */, 50,0, TRUE} // 50 paren-flag
- , {doMatchMode, 109 /* m */, 50,0, TRUE} // 51
- , {doMatchMode, 115 /* s */, 50,0, TRUE} // 52
- , {doMatchMode, 119 /* w */, 50,0, TRUE} // 53
- , {doMatchMode, 120 /* x */, 50,0, TRUE} // 54
- , {doMatchMode, 45 /* - */, 50,0, TRUE} // 55
- , {doSetMatchMode, 41 /* ) */, 2,0, TRUE} // 56
- , {doMatchModeParen, 58 /* : */, 2, 14, TRUE} // 57
- , {doBadModeFlag, 255, 179,0, FALSE} // 58
- , {doNGStar, 63 /* ? */, 20,0, TRUE} // 59 quant-star
- , {doPossessiveStar, 43 /* + */, 20,0, TRUE} // 60
- , {doStar, 255, 20,0, FALSE} // 61
- , {doNGPlus, 63 /* ? */, 20,0, TRUE} // 62 quant-plus
- , {doPossessivePlus, 43 /* + */, 20,0, TRUE} // 63
- , {doPlus, 255, 20,0, FALSE} // 64
- , {doNGOpt, 63 /* ? */, 20,0, TRUE} // 65 quant-opt
- , {doPossessiveOpt, 43 /* + */, 20,0, TRUE} // 66
- , {doOpt, 255, 20,0, FALSE} // 67
- , {doNOP, 128, 70,0, FALSE} // 68 interval-open
- , {doIntervalError, 255, 179,0, FALSE} // 69
- , {doIntevalLowerDigit, 128, 70,0, TRUE} // 70 interval-lower
- , {doNOP, 44 /* , */, 74,0, TRUE} // 71
- , {doIntervalSame, 125 /* } */, 77,0, TRUE} // 72
- , {doIntervalError, 255, 179,0, FALSE} // 73
- , {doIntervalUpperDigit, 128, 74,0, TRUE} // 74 interval-upper
- , {doNOP, 125 /* } */, 77,0, TRUE} // 75
- , {doIntervalError, 255, 179,0, FALSE} // 76
- , {doNGInterval, 63 /* ? */, 20,0, TRUE} // 77 interval-type
- , {doPossessiveInterval, 43 /* + */, 20,0, TRUE} // 78
- , {doInterval, 255, 20,0, FALSE} // 79
- , {doBackslashA, 65 /* A */, 2,0, TRUE} // 80 backslash
- , {doBackslashB, 66 /* B */, 2,0, TRUE} // 81
- , {doBackslashb, 98 /* b */, 2,0, TRUE} // 82
- , {doBackslashd, 100 /* d */, 14,0, TRUE} // 83
- , {doBackslashD, 68 /* D */, 14,0, TRUE} // 84
- , {doBackslashG, 71 /* G */, 2,0, TRUE} // 85
- , {doNamedChar, 78 /* N */, 14,0, FALSE} // 86
- , {doProperty, 112 /* p */, 14,0, FALSE} // 87
- , {doProperty, 80 /* P */, 14,0, FALSE} // 88
- , {doEnterQuoteMode, 81 /* Q */, 2,0, TRUE} // 89
- , {doBackslashS, 83 /* S */, 14,0, TRUE} // 90
- , {doBackslashs, 115 /* s */, 14,0, TRUE} // 91
- , {doBackslashW, 87 /* W */, 14,0, TRUE} // 92
- , {doBackslashw, 119 /* w */, 14,0, TRUE} // 93
- , {doBackslashX, 88 /* X */, 14,0, TRUE} // 94
- , {doBackslashZ, 90 /* Z */, 2,0, TRUE} // 95
- , {doBackslashz, 122 /* z */, 2,0, TRUE} // 96
- , {doBackRef, 128, 14,0, TRUE} // 97
- , {doEscapeError, 253, 179,0, FALSE} // 98
- , {doEscapedLiteralChar, 255, 14,0, TRUE} // 99
- , {doSetNegate, 94 /* ^ */, 103,0, TRUE} // 100 set-open
- , {doSetPosixProp, 58 /* : */, 105,0, FALSE} // 101
- , {doNOP, 255, 103,0, FALSE} // 102
- , {doSetLiteral, 93 /* ] */, 118,0, TRUE} // 103 set-open2
- , {doNOP, 255, 108,0, FALSE} // 104
- , {doSetEnd, 93 /* ] */, 255,0, TRUE} // 105 set-posix
- , {doNOP, 58 /* : */, 108,0, FALSE} // 106
- , {doRuleError, 255, 179,0, FALSE} // 107
- , {doSetEnd, 93 /* ] */, 255,0, TRUE} // 108 set-start
- , {doSetBeginUnion, 91 /* [ */, 100, 125, TRUE} // 109
- , {doNOP, 92 /* \ */, 168,0, TRUE} // 110
- , {doNOP, 45 /* - */, 114,0, TRUE} // 111
- , {doNOP, 38 /* & */, 116,0, TRUE} // 112
- , {doSetLiteral, 255, 118,0, TRUE} // 113
- , {doRuleError, 45 /* - */, 179,0, FALSE} // 114 set-start-dash
- , {doSetAddDash, 255, 118,0, FALSE} // 115
- , {doRuleError, 38 /* & */, 179,0, FALSE} // 116 set-start-amp
- , {doSetAddAmp, 255, 118,0, FALSE} // 117
- , {doSetEnd, 93 /* ] */, 255,0, TRUE} // 118 set-after-lit
- , {doSetBeginUnion, 91 /* [ */, 100, 125, TRUE} // 119
- , {doNOP, 45 /* - */, 155,0, TRUE} // 120
- , {doNOP, 38 /* & */, 146,0, TRUE} // 121
- , {doNOP, 92 /* \ */, 168,0, TRUE} // 122
- , {doSetNoCloseError, 253, 179,0, FALSE} // 123
- , {doSetLiteral, 255, 118,0, TRUE} // 124
- , {doSetEnd, 93 /* ] */, 255,0, TRUE} // 125 set-after-set
- , {doSetBeginUnion, 91 /* [ */, 100, 125, TRUE} // 126
- , {doNOP, 45 /* - */, 148,0, TRUE} // 127
- , {doNOP, 38 /* & */, 143,0, TRUE} // 128
- , {doNOP, 92 /* \ */, 168,0, TRUE} // 129
- , {doSetNoCloseError, 253, 179,0, FALSE} // 130
- , {doSetLiteral, 255, 118,0, TRUE} // 131
- , {doSetEnd, 93 /* ] */, 255,0, TRUE} // 132 set-after-range
- , {doSetBeginUnion, 91 /* [ */, 100, 125, TRUE} // 133
- , {doNOP, 45 /* - */, 151,0, TRUE} // 134
- , {doNOP, 38 /* & */, 153,0, TRUE} // 135
- , {doNOP, 92 /* \ */, 168,0, TRUE} // 136
- , {doSetNoCloseError, 253, 179,0, FALSE} // 137
- , {doSetLiteral, 255, 118,0, TRUE} // 138
- , {doSetBeginUnion, 91 /* [ */, 100, 125, TRUE} // 139 set-after-op
- , {doSetOpError, 93 /* ] */, 179,0, FALSE} // 140
- , {doNOP, 92 /* \ */, 168,0, TRUE} // 141
- , {doSetLiteral, 255, 118,0, TRUE} // 142
- , {doSetBeginIntersection1, 91 /* [ */, 100, 125, TRUE} // 143 set-set-amp
- , {doSetIntersection2, 38 /* & */, 139,0, TRUE} // 144
- , {doSetAddAmp, 255, 118,0, FALSE} // 145
- , {doSetIntersection2, 38 /* & */, 139,0, TRUE} // 146 set-lit-amp
- , {doSetAddAmp, 255, 118,0, FALSE} // 147
- , {doSetBeginDifference1, 91 /* [ */, 100, 125, TRUE} // 148 set-set-dash
- , {doSetDifference2, 45 /* - */, 139,0, TRUE} // 149
- , {doSetAddDash, 255, 118,0, FALSE} // 150
- , {doSetDifference2, 45 /* - */, 139,0, TRUE} // 151 set-range-dash
- , {doSetAddDash, 255, 118,0, FALSE} // 152
- , {doSetIntersection2, 38 /* & */, 139,0, TRUE} // 153 set-range-amp
- , {doSetAddAmp, 255, 118,0, FALSE} // 154
- , {doSetDifference2, 45 /* - */, 139,0, TRUE} // 155 set-lit-dash
- , {doSetAddDash, 91 /* [ */, 118,0, FALSE} // 156
- , {doSetAddDash, 93 /* ] */, 118,0, FALSE} // 157
- , {doNOP, 92 /* \ */, 160,0, TRUE} // 158
- , {doSetRange, 255, 132,0, TRUE} // 159
- , {doSetAddDash, 115 /* s */, 168,0, FALSE} // 160 set-lit-dash-escape
- , {doSetAddDash, 83 /* S */, 168,0, FALSE} // 161
- , {doSetAddDash, 119 /* w */, 168,0, FALSE} // 162
- , {doSetAddDash, 87 /* W */, 168,0, FALSE} // 163
- , {doSetAddDash, 100 /* d */, 168,0, FALSE} // 164
- , {doSetAddDash, 68 /* D */, 168,0, FALSE} // 165
- , {doSetNamedRange, 78 /* N */, 132,0, FALSE} // 166
- , {doSetRange, 255, 132,0, TRUE} // 167
- , {doSetProp, 112 /* p */, 125,0, FALSE} // 168 set-escape
- , {doSetProp, 80 /* P */, 125,0, FALSE} // 169
- , {doSetNamedChar, 78 /* N */, 118,0, FALSE} // 170
- , {doSetBackslash_s, 115 /* s */, 132,0, TRUE} // 171
- , {doSetBackslash_S, 83 /* S */, 132,0, TRUE} // 172
- , {doSetBackslash_w, 119 /* w */, 132,0, TRUE} // 173
- , {doSetBackslash_W, 87 /* W */, 132,0, TRUE} // 174
- , {doSetBackslash_d, 100 /* d */, 132,0, TRUE} // 175
- , {doSetBackslash_D, 68 /* D */, 132,0, TRUE} // 176
- , {doSetLiteralEscaped, 255, 118,0, TRUE} // 177
- , {doSetFinish, 255, 14,0, FALSE} // 178 set-finish
- , {doExit, 255, 179,0, TRUE} // 179 errorDeath
+ , {doNOP, 60 /* < */, 45,0, TRUE} // 33
+ , {doNOP, 35 /* # */, 48, 2, TRUE} // 34
+ , {doBeginMatchMode, 105 /* i */, 51,0, FALSE} // 35
+ , {doBeginMatchMode, 100 /* d */, 51,0, FALSE} // 36
+ , {doBeginMatchMode, 109 /* m */, 51,0, FALSE} // 37
+ , {doBeginMatchMode, 115 /* s */, 51,0, FALSE} // 38
+ , {doBeginMatchMode, 119 /* w */, 51,0, FALSE} // 39
+ , {doBeginMatchMode, 120 /* x */, 51,0, FALSE} // 40
+ , {doBeginMatchMode, 45 /* - */, 51,0, FALSE} // 41
+ , {doConditionalExpr, 40 /* ( */, 181,0, TRUE} // 42
+ , {doPerlInline, 123 /* { */, 181,0, TRUE} // 43
+ , {doBadOpenParenType, 255, 181,0, FALSE} // 44
+ , {doOpenLookBehind, 61 /* = */, 2, 20, TRUE} // 45 open-paren-lookbehind
+ , {doOpenLookBehindNeg, 33 /* ! */, 2, 20, TRUE} // 46
+ , {doBadOpenParenType, 255, 181,0, FALSE} // 47
+ , {doNOP, 41 /* ) */, 255,0, TRUE} // 48 paren-comment
+ , {doMismatchedParenErr, 253, 181,0, FALSE} // 49
+ , {doNOP, 255, 48,0, TRUE} // 50
+ , {doMatchMode, 105 /* i */, 51,0, TRUE} // 51 paren-flag
+ , {doMatchMode, 100 /* d */, 51,0, TRUE} // 52
+ , {doMatchMode, 109 /* m */, 51,0, TRUE} // 53
+ , {doMatchMode, 115 /* s */, 51,0, TRUE} // 54
+ , {doMatchMode, 119 /* w */, 51,0, TRUE} // 55
+ , {doMatchMode, 120 /* x */, 51,0, TRUE} // 56
+ , {doMatchMode, 45 /* - */, 51,0, TRUE} // 57
+ , {doSetMatchMode, 41 /* ) */, 2,0, TRUE} // 58
+ , {doMatchModeParen, 58 /* : */, 2, 14, TRUE} // 59
+ , {doBadModeFlag, 255, 181,0, FALSE} // 60
+ , {doNGStar, 63 /* ? */, 20,0, TRUE} // 61 quant-star
+ , {doPossessiveStar, 43 /* + */, 20,0, TRUE} // 62
+ , {doStar, 255, 20,0, FALSE} // 63
+ , {doNGPlus, 63 /* ? */, 20,0, TRUE} // 64 quant-plus
+ , {doPossessivePlus, 43 /* + */, 20,0, TRUE} // 65
+ , {doPlus, 255, 20,0, FALSE} // 66
+ , {doNGOpt, 63 /* ? */, 20,0, TRUE} // 67 quant-opt
+ , {doPossessiveOpt, 43 /* + */, 20,0, TRUE} // 68
+ , {doOpt, 255, 20,0, FALSE} // 69
+ , {doNOP, 128, 72,0, FALSE} // 70 interval-open
+ , {doIntervalError, 255, 181,0, FALSE} // 71
+ , {doIntevalLowerDigit, 128, 72,0, TRUE} // 72 interval-lower
+ , {doNOP, 44 /* , */, 76,0, TRUE} // 73
+ , {doIntervalSame, 125 /* } */, 79,0, TRUE} // 74
+ , {doIntervalError, 255, 181,0, FALSE} // 75
+ , {doIntervalUpperDigit, 128, 76,0, TRUE} // 76 interval-upper
+ , {doNOP, 125 /* } */, 79,0, TRUE} // 77
+ , {doIntervalError, 255, 181,0, FALSE} // 78
+ , {doNGInterval, 63 /* ? */, 20,0, TRUE} // 79 interval-type
+ , {doPossessiveInterval, 43 /* + */, 20,0, TRUE} // 80
+ , {doInterval, 255, 20,0, FALSE} // 81
+ , {doBackslashA, 65 /* A */, 2,0, TRUE} // 82 backslash
+ , {doBackslashB, 66 /* B */, 2,0, TRUE} // 83
+ , {doBackslashb, 98 /* b */, 2,0, TRUE} // 84
+ , {doBackslashd, 100 /* d */, 14,0, TRUE} // 85
+ , {doBackslashD, 68 /* D */, 14,0, TRUE} // 86
+ , {doBackslashG, 71 /* G */, 2,0, TRUE} // 87
+ , {doNamedChar, 78 /* N */, 14,0, FALSE} // 88
+ , {doProperty, 112 /* p */, 14,0, FALSE} // 89
+ , {doProperty, 80 /* P */, 14,0, FALSE} // 90
+ , {doEnterQuoteMode, 81 /* Q */, 2,0, TRUE} // 91
+ , {doBackslashS, 83 /* S */, 14,0, TRUE} // 92
+ , {doBackslashs, 115 /* s */, 14,0, TRUE} // 93
+ , {doBackslashW, 87 /* W */, 14,0, TRUE} // 94
+ , {doBackslashw, 119 /* w */, 14,0, TRUE} // 95
+ , {doBackslashX, 88 /* X */, 14,0, TRUE} // 96
+ , {doBackslashZ, 90 /* Z */, 2,0, TRUE} // 97
+ , {doBackslashz, 122 /* z */, 2,0, TRUE} // 98
+ , {doBackRef, 128, 14,0, TRUE} // 99
+ , {doEscapeError, 253, 181,0, FALSE} // 100
+ , {doEscapedLiteralChar, 255, 14,0, TRUE} // 101
+ , {doSetNegate, 94 /* ^ */, 105,0, TRUE} // 102 set-open
+ , {doSetPosixProp, 58 /* : */, 107,0, FALSE} // 103
+ , {doNOP, 255, 105,0, FALSE} // 104
+ , {doSetLiteral, 93 /* ] */, 120,0, TRUE} // 105 set-open2
+ , {doNOP, 255, 110,0, FALSE} // 106
+ , {doSetEnd, 93 /* ] */, 255,0, TRUE} // 107 set-posix
+ , {doNOP, 58 /* : */, 110,0, FALSE} // 108
+ , {doRuleError, 255, 181,0, FALSE} // 109
+ , {doSetEnd, 93 /* ] */, 255,0, TRUE} // 110 set-start
+ , {doSetBeginUnion, 91 /* [ */, 102, 127, TRUE} // 111
+ , {doNOP, 92 /* \ */, 170,0, TRUE} // 112
+ , {doNOP, 45 /* - */, 116,0, TRUE} // 113
+ , {doNOP, 38 /* & */, 118,0, TRUE} // 114
+ , {doSetLiteral, 255, 120,0, TRUE} // 115
+ , {doRuleError, 45 /* - */, 181,0, FALSE} // 116 set-start-dash
+ , {doSetAddDash, 255, 120,0, FALSE} // 117
+ , {doRuleError, 38 /* & */, 181,0, FALSE} // 118 set-start-amp
+ , {doSetAddAmp, 255, 120,0, FALSE} // 119
+ , {doSetEnd, 93 /* ] */, 255,0, TRUE} // 120 set-after-lit
+ , {doSetBeginUnion, 91 /* [ */, 102, 127, TRUE} // 121
+ , {doNOP, 45 /* - */, 157,0, TRUE} // 122
+ , {doNOP, 38 /* & */, 148,0, TRUE} // 123
+ , {doNOP, 92 /* \ */, 170,0, TRUE} // 124
+ , {doSetNoCloseError, 253, 181,0, FALSE} // 125
+ , {doSetLiteral, 255, 120,0, TRUE} // 126
+ , {doSetEnd, 93 /* ] */, 255,0, TRUE} // 127 set-after-set
+ , {doSetBeginUnion, 91 /* [ */, 102, 127, TRUE} // 128
+ , {doNOP, 45 /* - */, 150,0, TRUE} // 129
+ , {doNOP, 38 /* & */, 145,0, TRUE} // 130
+ , {doNOP, 92 /* \ */, 170,0, TRUE} // 131
+ , {doSetNoCloseError, 253, 181,0, FALSE} // 132
+ , {doSetLiteral, 255, 120,0, TRUE} // 133
+ , {doSetEnd, 93 /* ] */, 255,0, TRUE} // 134 set-after-range
+ , {doSetBeginUnion, 91 /* [ */, 102, 127, TRUE} // 135
+ , {doNOP, 45 /* - */, 153,0, TRUE} // 136
+ , {doNOP, 38 /* & */, 155,0, TRUE} // 137
+ , {doNOP, 92 /* \ */, 170,0, TRUE} // 138
+ , {doSetNoCloseError, 253, 181,0, FALSE} // 139
+ , {doSetLiteral, 255, 120,0, TRUE} // 140
+ , {doSetBeginUnion, 91 /* [ */, 102, 127, TRUE} // 141 set-after-op
+ , {doSetOpError, 93 /* ] */, 181,0, FALSE} // 142
+ , {doNOP, 92 /* \ */, 170,0, TRUE} // 143
+ , {doSetLiteral, 255, 120,0, TRUE} // 144
+ , {doSetBeginIntersection1, 91 /* [ */, 102, 127, TRUE} // 145 set-set-amp
+ , {doSetIntersection2, 38 /* & */, 141,0, TRUE} // 146
+ , {doSetAddAmp, 255, 120,0, FALSE} // 147
+ , {doSetIntersection2, 38 /* & */, 141,0, TRUE} // 148 set-lit-amp
+ , {doSetAddAmp, 255, 120,0, FALSE} // 149
+ , {doSetBeginDifference1, 91 /* [ */, 102, 127, TRUE} // 150 set-set-dash
+ , {doSetDifference2, 45 /* - */, 141,0, TRUE} // 151
+ , {doSetAddDash, 255, 120,0, FALSE} // 152
+ , {doSetDifference2, 45 /* - */, 141,0, TRUE} // 153 set-range-dash
+ , {doSetAddDash, 255, 120,0, FALSE} // 154
+ , {doSetIntersection2, 38 /* & */, 141,0, TRUE} // 155 set-range-amp
+ , {doSetAddAmp, 255, 120,0, FALSE} // 156
+ , {doSetDifference2, 45 /* - */, 141,0, TRUE} // 157 set-lit-dash
+ , {doSetAddDash, 91 /* [ */, 120,0, FALSE} // 158
+ , {doSetAddDash, 93 /* ] */, 120,0, FALSE} // 159
+ , {doNOP, 92 /* \ */, 162,0, TRUE} // 160
+ , {doSetRange, 255, 134,0, TRUE} // 161
+ , {doSetOpError, 115 /* s */, 181,0, FALSE} // 162 set-lit-dash-escape
+ , {doSetOpError, 83 /* S */, 181,0, FALSE} // 163
+ , {doSetOpError, 119 /* w */, 181,0, FALSE} // 164
+ , {doSetOpError, 87 /* W */, 181,0, FALSE} // 165
+ , {doSetOpError, 100 /* d */, 181,0, FALSE} // 166
+ , {doSetOpError, 68 /* D */, 181,0, FALSE} // 167
+ , {doSetNamedRange, 78 /* N */, 134,0, FALSE} // 168
+ , {doSetRange, 255, 134,0, TRUE} // 169
+ , {doSetProp, 112 /* p */, 127,0, FALSE} // 170 set-escape
+ , {doSetProp, 80 /* P */, 127,0, FALSE} // 171
+ , {doSetNamedChar, 78 /* N */, 120,0, FALSE} // 172
+ , {doSetBackslash_s, 115 /* s */, 134,0, TRUE} // 173
+ , {doSetBackslash_S, 83 /* S */, 134,0, TRUE} // 174
+ , {doSetBackslash_w, 119 /* w */, 134,0, TRUE} // 175
+ , {doSetBackslash_W, 87 /* W */, 134,0, TRUE} // 176
+ , {doSetBackslash_d, 100 /* d */, 134,0, TRUE} // 177
+ , {doSetBackslash_D, 68 /* D */, 134,0, TRUE} // 178
+ , {doSetLiteralEscaped, 255, 120,0, TRUE} // 179
+ , {doSetFinish, 255, 14,0, FALSE} // 180 set-finish
+ , {doExit, 255, 181,0, TRUE} // 181 errorDeath
};
static const char * const RegexStateNames[] = { 0,
"start",
@@ -354,6 +356,7 @@
0,
0,
0,
+ 0,
"open-paren-lookbehind",
0,
0,
@@ -369,6 +372,7 @@
0,
0,
0,
+ 0,
"quant-star",
0,
0,
diff --git a/i18n/regexcst.txt b/i18n/regexcst.txt
index 888a0c4..304ac57 100644
--- a/i18n/regexcst.txt
+++ b/i18n/regexcst.txt
@@ -133,6 +133,7 @@
'<' n open-paren-lookbehind
'#' n paren-comment ^term
'i' paren-flag doBeginMatchMode
+ 'd' paren-flag doBeginMatchMode
'm' paren-flag doBeginMatchMode
's' paren-flag doBeginMatchMode
'w' paren-flag doBeginMatchMode
@@ -161,6 +162,7 @@
#
paren-flag:
'i' n paren-flag doMatchMode
+ 'd' n paren-flag doMatchMode
'm' n paren-flag doMatchMode
's' n paren-flag doMatchMode
'w' n paren-flag doMatchMode
@@ -398,7 +400,7 @@
# set-lit-dash
# Have scanned "[literals-" Could be a range or a -- operator or a literal
# In [abc-[def]], the '-' is a literal (confirmed with a Java test)
-# [abc-\p{xx} the '-' is a literal
+# [abc-\p{xx} the '-' is an error
# [abc-] the '-' is a literal
# [ab-xy] the '-' is a range
#
@@ -416,12 +418,12 @@
# Could be a literal '-', if the '\' introduces a set-like construct e.g. \s aut \p{...}
#
set-lit-dash-escape:
- 's' set-escape doSetAddDash
- 'S' set-escape doSetAddDash
- 'w' set-escape doSetAddDash
- 'W' set-escape doSetAddDash
- 'd' set-escape doSetAddDash
- 'D' set-escape doSetAddDash
+ 's' errorDeath doSetOpError
+ 'S' errorDeath doSetOpError
+ 'w' errorDeath doSetOpError
+ 'W' errorDeath doSetOpError
+ 'd' errorDeath doSetOpError
+ 'D' errorDeath doSetOpError
'N' set-after-range doSetNamedRange
default n set-after-range doSetRange
diff --git a/i18n/regeximp.h b/i18n/regeximp.h
index 6944c08..225822d 100644
--- a/i18n/regeximp.h
+++ b/i18n/regeximp.h
@@ -1,5 +1,5 @@
//
-// Copyright (C) 2002-2005 International Business Machines Corporation
+// Copyright (C) 2002-2007 International Business Machines Corporation
// and others. All rights reserved.
//
// file: regeximp.h
@@ -57,7 +57,7 @@
enum {
URX_RESERVED_OP = 0, // For multi-operand ops, most non-first words.
URX_RESERVED_OP_N = 255, // For multi-operand ops, negative operand values.
- URX_BACKTRACK = 1,
+ URX_BACKTRACK = 1, // Force a backtrack, as if a match test had failed.
URX_END = 2,
URX_ONECHAR = 3, // Value field is the 21 bit unicode char to match
URX_STRING = 4, // Value field is index of string start
@@ -96,13 +96,14 @@
// 3rd Operand: Minimum count.
// 4th Operand: Max count, -1 for unbounded.
- URX_DOTANY_PL = 27, // .+, match rest of the line. Fail already at end.
+ URX_DOTANY_UNIX = 27, // '.' operator in UNIX_LINES mode, only \n marks end of line.
URX_CTR_LOOP = 28, // Loop Ops for {interval} loops.
URX_CTR_LOOP_NG = 29, // Also in three flavors.
// Operand is loc of corresponding CTR_INIT.
- URX_DOTANY_ALL_PL = 30, // .+, match rest of the Input. Fail if already at end
+ URX_CARET_M_UNIX = 30, // '^' operator, test for start of line in multi-line
+ // plus UNIX_LINES mode.
URX_RELOC_OPRND = 31, // Operand value in multi-operand ops that refers
// back into compiled pattern code, and thus must
@@ -166,10 +167,16 @@
// Must always immediately follow LOOP_x_I instruction.
URX_LOOP_DOT_I = 52, // .*, initialization of the optimized loop.
// Operand value:
- // 0: Normal (. doesn't match new-line) mode.
- // 1: . matches new-line mode.
- URX_BACKSLASH_BU = 53 // \b or \B in UREGEX_UWORD mode, using Unicode style
+ // bit 0:
+ // 0: Normal (. doesn't match new-line) mode.
+ // 1: . matches new-line mode.
+ // bit 1: controls what new-lines are recognized by this operation.
+ // 0: All Unicode New-lines
+ // 1: UNIX_LINES, \u000a only.
+ URX_BACKSLASH_BU = 53, // \b or \B in UREGEX_UWORD mode, using Unicode style
// word boundaries.
+ URX_DOLLAR_D = 54, // $ end of input test, in UNIX_LINES mode.
+ URX_DOLLAR_MD = 55 // $ end of input test, in MULTI_LINE and UNIX_LINES mode.
};
@@ -203,10 +210,10 @@
"DOLLAR", \
"CTR_INIT", \
"CTR_INIT_NG", \
- "DOTANY_PL", \
+ "DOTANY_UNIX", \
"CTR_LOOP", \
"CTR_LOOP_NG", \
- "DOTANY_ALL_PL", \
+ "URX_CARET_M_UNIX", \
"RELOC_OPRND", \
"STO_SP", \
"LD_SP", \
@@ -229,7 +236,9 @@
"LOOP_SR_I", \
"LOOP_C", \
"LOOP_DOT_I", \
- "BACKSLASH_BU"
+ "BACKSLASH_BU", \
+ "DOLLAR_D", \
+ "DOLLAR_MD"
//
diff --git a/i18n/rematch.cpp b/i18n/rematch.cpp
index 2fe46a9..9439e8a 100644
--- a/i18n/rematch.cpp
+++ b/i18n/rematch.cpp
@@ -324,7 +324,7 @@
// Watch for interactions with replace operations when fixing.
int32_t startPos = fMatchEnd;
if (startPos==0) {
- startPos = fRegionStart;
+ startPos = fActiveStart;
}
if (fMatch) {
@@ -334,8 +334,9 @@
if (fMatchStart == fMatchEnd) {
// Previous match had zero length. Move start position up one position
// to avoid sending find() into a loop on zero-length matches.
- if (startPos >= fRegionLimit) {
+ if (startPos >= fActiveLimit) {
fMatch = FALSE;
+ fHitEnd = TRUE;
return FALSE;
}
startPos = fInput->moveIndex32(startPos, 1);
@@ -345,6 +346,7 @@
// A previous find() failed to match. Don't try again.
// (without this test, a pattern with a zero-length match
// could match again at the end of an input string.)
+ fHitEnd = TRUE;
return FALSE;
}
}
@@ -352,9 +354,12 @@
// Compute the position in the input string beyond which a match can not begin, because
// the minimum length match would extend past the end of the input.
- int32_t testLen = fRegionLimit - fPattern->fMinMatchLen;
+ // Note: some patterns that cannot match anything will have fMinMatchLen==Max Int.
+ // Be aware of possible overflows if making changes here.
+ int32_t testLen = fActiveLimit - fPattern->fMinMatchLen;
if (startPos > testLen) {
fMatch = FALSE;
+ fHitEnd = TRUE;
return FALSE;
}
@@ -378,7 +383,7 @@
fHitEnd = TRUE;
return FALSE;
}
- U16_FWD_1(inputBuf, startPos, fRegionLimit);
+ U16_FWD_1(inputBuf, startPos, fActiveLimit);
// Note that it's perfectly OK for a pattern to have a zero-length
// match at the end of a string, so we must make sure that the loop
// runs with startPos == testLen the last time through.
@@ -388,7 +393,7 @@
case START_START:
// Matches are only possible at the start of the input string
// (pattern begins with ^ or \A)
- if (startPos > fRegionStart) {
+ if (startPos > fActiveStart) {
fMatch = FALSE;
return FALSE;
}
@@ -406,7 +411,7 @@
U_ASSERT(fPattern->fMinMatchLen > 0);
for (;;) {
int32_t pos = startPos;
- U16_NEXT(inputBuf, startPos, fRegionLimit, c); // like c = inputBuf[startPos++];
+ U16_NEXT(inputBuf, startPos, fActiveLimit, c); // like c = inputBuf[startPos++];
if (c<256 && fPattern->fInitialChars8->contains(c) ||
c>=256 && fPattern->fInitialChars->contains(c)) {
MatchAt(pos, FALSE, fDeferredStatus);
@@ -419,6 +424,7 @@
}
if (pos >= testLen) {
fMatch = FALSE;
+ fHitEnd = TRUE;
return FALSE;
}
}
@@ -433,7 +439,7 @@
UChar32 theChar = fPattern->fInitialChar;
for (;;) {
int32_t pos = startPos;
- U16_NEXT(inputBuf, startPos, fRegionLimit, c); // like c = inputBuf[startPos++];
+ U16_NEXT(inputBuf, startPos, fActiveLimit, c); // like c = inputBuf[startPos++];
if (c == theChar) {
MatchAt(pos, FALSE, fDeferredStatus);
if (U_FAILURE(fDeferredStatus)) {
@@ -445,6 +451,7 @@
}
if (pos >= testLen) {
fMatch = FALSE;
+ fHitEnd = TRUE;
return FALSE;
}
}
@@ -454,7 +461,7 @@
case START_LINE:
{
UChar32 c;
- if (startPos == 0) {
+ if (startPos == fAnchorStart) {
MatchAt(startPos, FALSE, fDeferredStatus);
if (U_FAILURE(fDeferredStatus)) {
return FALSE;
@@ -462,32 +469,57 @@
if (fMatch) {
return TRUE;
}
- U16_NEXT(inputBuf, startPos, fRegionLimit, c); // like c = inputBuf[startPos++];
+ U16_NEXT(inputBuf, startPos, fActiveLimit, c); // like c = inputBuf[startPos++];
}
- for (;;) {
- c = inputBuf[startPos-1];
- if (((c & 0x7f) <= 0x29) && // First quickly bypass as many chars as possible
- ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029 )) {
- if (c == 0x0d && startPos < fRegionLimit && inputBuf[startPos] == 0x0a) {
- startPos++;
- }
- MatchAt(startPos, FALSE, fDeferredStatus);
- if (U_FAILURE(fDeferredStatus)) {
- return FALSE;
- }
- if (fMatch) {
- return TRUE;
- }
+ if (fPattern->fFlags & UREGEX_UNIX_LINES) {
+ for (;;) {
+ c = inputBuf[startPos-1];
+ if (c == 0x0a) {
+ MatchAt(startPos, FALSE, fDeferredStatus);
+ if (U_FAILURE(fDeferredStatus)) {
+ return FALSE;
+ }
+ if (fMatch) {
+ return TRUE;
+ }
+ }
+ if (startPos >= testLen) {
+ fMatch = FALSE;
+ fHitEnd = TRUE;
+ return FALSE;
+ }
+ U16_NEXT(inputBuf, startPos, fActiveLimit, c); // like c = inputBuf[startPos++];
+ // Note that it's perfectly OK for a pattern to have a zero-length
+ // match at the end of a string, so we must make sure that the loop
+ // runs with startPos == testLen the last time through.
}
- if (startPos >= testLen) {
- fMatch = FALSE;
- return FALSE;
+ } else {
+ for (;;) {
+ c = inputBuf[startPos-1];
+ if (((c & 0x7f) <= 0x29) && // First quickly bypass as many chars as possible
+ ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029 )) {
+ if (c == 0x0d && startPos < fActiveLimit && inputBuf[startPos] == 0x0a) {
+ startPos++;
+ }
+ MatchAt(startPos, FALSE, fDeferredStatus);
+ if (U_FAILURE(fDeferredStatus)) {
+ return FALSE;
+ }
+ if (fMatch) {
+ return TRUE;
+ }
+ }
+ if (startPos >= testLen) {
+ fMatch = FALSE;
+ fHitEnd = TRUE;
+ return FALSE;
+ }
+ U16_NEXT(inputBuf, startPos, fActiveLimit, c); // like c = inputBuf[startPos++];
+ // Note that it's perfectly OK for a pattern to have a zero-length
+ // match at the end of a string, so we must make sure that the loop
+ // runs with startPos == testLen the last time through.
}
- U16_NEXT(inputBuf, startPos, fRegionLimit, c); // like c = inputBuf[startPos++];
- // Note that it's perfectly OK for a pattern to have a zero-length
- // match at the end of a string, so we must make sure that the loop
- // runs with startPos == testLen the last time through.
}
}
@@ -511,7 +543,7 @@
}
this->reset(); // Note: Reset() is specified by Java Matcher documentation.
// This will reset the region to be the full input length.
- if (start < fRegionStart || start > fRegionLimit) {
+ if (start < fActiveStart || start > fActiveLimit) {
status = U_INDEX_OUTOFBOUNDS_ERROR;
return FALSE;
}
@@ -613,7 +645,7 @@
return FALSE;
}
resetPreserveRegion();
- MatchAt(fRegionStart, FALSE, status);
+ MatchAt(fActiveStart, FALSE, status);
return fMatch;
}
@@ -627,7 +659,7 @@
return FALSE;
}
reset();
- if (start < fRegionStart || start > fRegionLimit) {
+ if (start < fActiveStart || start > fActiveLimit) {
status = U_INDEX_OUTOFBOUNDS_ERROR;
return FALSE;
}
@@ -651,7 +683,7 @@
return FALSE;
}
resetPreserveRegion();
- MatchAt(fRegionStart, TRUE, status);
+ MatchAt(fActiveStart, TRUE, status);
return fMatch;
}
@@ -665,7 +697,7 @@
return FALSE;
}
reset();
- if (start < fRegionStart || start > fRegionLimit) {
+ if (start < fActiveStart || start > fActiveLimit) {
status = U_INDEX_OUTOFBOUNDS_ERROR;
return FALSE;
}
@@ -701,6 +733,8 @@
this->reset();
fRegionStart = start;
fRegionLimit = limit;
+ fActiveStart = start;
+ fActiveLimit = limit;
if (!fTransparentBounds) {
fLookStart = start;
fLookLimit = limit;
@@ -805,6 +839,8 @@
RegexMatcher &RegexMatcher::reset() {
fRegionStart = 0;
fRegionLimit = fInput->length();
+ fActiveStart = 0;
+ fActiveLimit = fRegionLimit;
fAnchorStart = 0;
fAnchorLimit = fRegionLimit;
fLookStart = 0;
@@ -849,7 +885,7 @@
return *this;
}
reset(); // Reset also resets the region to be the entire string.
- if (position < 0 || position >= fRegionLimit) {
+ if (position < 0 || position >= fActiveLimit) {
status = U_INDEX_OUTOFBOUNDS_ERROR;
return *this;
}
@@ -899,7 +935,7 @@
//
reset(input);
int32_t nextOutputStringStart = 0;
- if (fRegionLimit == 0) {
+ if (fActiveLimit == 0) {
return 0;
}
@@ -917,7 +953,7 @@
// last capture group saved in favor of the unprocessed remainder of the
// input string.)
i = destCapacity-1;
- int32_t remainingLength = fRegionLimit-nextOutputStringStart;
+ int32_t remainingLength = fActiveLimit-nextOutputStringStart;
if (remainingLength > 0) {
dest[i].setTo(input, nextOutputStringStart, remainingLength);
}
@@ -941,7 +977,7 @@
dest[i] = group(groupNum, status);
}
- if (nextOutputStringStart == fRegionLimit) {
+ if (nextOutputStringStart == fActiveLimit) {
// The delimiter was at the end of the string. We're done.
break;
}
@@ -951,7 +987,7 @@
{
// We ran off the end of the input while looking for the next delimiter.
// All the remaining text goes into the current output string.
- dest[i].setTo(input, nextOutputStringStart, fRegionLimit-nextOutputStringStart);
+ dest[i].setTo(input, nextOutputStringStart, fActiveLimit-nextOutputStringStart);
break;
}
}
@@ -1290,9 +1326,9 @@
case URX_ONECHAR:
- if (fp->fInputIdx < fRegionLimit) {
+ if (fp->fInputIdx < fActiveLimit) {
UChar32 c;
- U16_NEXT(inputBuf, fp->fInputIdx, fRegionLimit, c);
+ U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
if (c == opValue) {
break;
}
@@ -1318,7 +1354,7 @@
U_ASSERT(opType == URX_STRING_LEN);
U_ASSERT(stringLen >= 2);
- if (fp->fInputIdx + stringLen > fRegionLimit) {
+ if (fp->fInputIdx + stringLen > fActiveLimit) {
// No match. String is longer than the remaining input text.
// TODO: Should fHitEnd only be set if the string matches for whatever amount
// of input is actually available? Probably, although one could argue
@@ -1360,7 +1396,7 @@
case URX_END:
// The match loop will exit via this path on a successful match,
// when we reach the end of the pattern.
- if (toEnd && fp->fInputIdx != fRegionLimit) {
+ if (toEnd && fp->fInputIdx != fActiveLimit) {
// The pattern matched, but not to the end of input. Try some more.
fp = (REStackFrame *)fStack->popFrame(frameSize);
break;
@@ -1392,6 +1428,7 @@
// or for position before new line at end of input
if (fp->fInputIdx < fAnchorLimit-2) {
// We are no where near the end of input. Fail.
+ // This is the common case. Keep it first.
fp = (REStackFrame *)fStack->popFrame(frameSize);
break;
}
@@ -1405,23 +1442,22 @@
// end of input, succeed.
if (fp->fInputIdx == fAnchorLimit-1) {
UChar32 c = fInput->char32At(fp->fInputIdx);
- if ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029) {
+ if ((c>=0x0a && c<=0x0d) || c==0x85 || c==0x2028 || c==0x2029) {
// If not in the middle of a CR/LF sequence
if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && inputBuf[fp->fInputIdx-1]==0x0d)) {
- break;
+ // At new-line at end of input. Success
fHitEnd = TRUE;
fRequireEnd = TRUE;
- // At new-line at end of input. Success
+ break;
}
}
}
- if (fp->fInputIdx == fAnchorLimit-2) {
- if (fInput->char32At(fp->fInputIdx) == 0x0d && fInput->char32At(fp->fInputIdx+1) == 0x0a) {
+ if (fp->fInputIdx == fAnchorLimit-2 &&
+ fInput->char32At(fp->fInputIdx) == 0x0d && fInput->char32At(fp->fInputIdx+1) == 0x0a) {
fHitEnd = TRUE;
fRequireEnd = TRUE;
break; // At CR/LF at end of input. Success
- }
}
fp = (REStackFrame *)fStack->popFrame(frameSize);
@@ -1429,6 +1465,29 @@
break;
+ case URX_DOLLAR_D: // $, test for End of Line, in UNIX_LINES mode.
+ if (fp->fInputIdx >= fAnchorLimit-1) {
+ // Either at the last character of input, or off the end.
+ if (fp->fInputIdx == fAnchorLimit-1) {
+ // At last char of input. Success if it's a new line.
+ if (fInput->char32At(fp->fInputIdx) == 0x0a) {
+ fHitEnd = TRUE;
+ fRequireEnd = TRUE;
+ break;
+ }
+ } else {
+ // Off the end of input. Success.
+ fHitEnd = TRUE;
+ fRequireEnd = TRUE;
+ break;
+ }
+ }
+
+ // Not at end of input. Back-track out.
+ fp = (REStackFrame *)fStack->popFrame(frameSize);
+ break;
+
+
case URX_DOLLAR_M: // $, test for End of line in multi-line mode
{
if (fp->fInputIdx >= fAnchorLimit) {
@@ -1440,7 +1499,7 @@
// If we are positioned just before a new-line, succeed.
// It makes no difference where the new-line is within the input.
UChar32 c = inputBuf[fp->fInputIdx];
- if ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029) {
+ if ((c>=0x0a && c<=0x0d) || c==0x85 ||c==0x2028 || c==0x2029) {
// At a line end, except for the odd chance of being in the middle of a CR/LF sequence
// In multi-line mode, hitting a new-line just before the end of input does not
// set the hitEnd or requireEnd flags
@@ -1454,6 +1513,23 @@
break;
+ case URX_DOLLAR_MD: // $, test for End of line in multi-line and UNIX_LINES mode
+ {
+ if (fp->fInputIdx >= fAnchorLimit) {
+ // We really are at the end of input. Success.
+ fHitEnd = TRUE;
+ fRequireEnd = TRUE; // TODO: should require end be set in multi-line mode?
+ break;
+ }
+ // If we are not positioned just before a new-line, the test fails; backtrack out.
+ // It makes no difference where the new-line is within the input.
+ if (inputBuf[fp->fInputIdx] != 0x0a) {
+ fp = (REStackFrame *)fStack->popFrame(frameSize);
+ }
+ }
+ break;
+
+
case URX_CARET: // ^, test for start of line
if (fp->fInputIdx != fAnchorStart) {
fp = (REStackFrame *)fStack->popFrame(frameSize);
@@ -1473,6 +1549,7 @@
if ((fp->fInputIdx < fAnchorLimit) &&
((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) {
// It's a new-line. ^ is true. Success.
+ // TODO: what should be done with positions between a CR and LF?
break;
}
// Not at the start of a line. Fail.
@@ -1481,6 +1558,23 @@
break;
+ case URX_CARET_M_UNIX: // ^, test for start of line in multi-line + Unix-line mode
+ {
+ U_ASSERT(fp->fInputIdx >= fAnchorStart);
+ if (fp->fInputIdx <= fAnchorStart) {
+ // We are at the start of input. Success.
+ break;
+ }
+ // Check whether character just before the current pos is a new-line
+ U_ASSERT(fp->fInputIdx <= fAnchorLimit);
+ UChar c = inputBuf[fp->fInputIdx - 1];
+ if (c != 0x0a) {
+ // Not at the start of a line. Back-track out.
+ fp = (REStackFrame *)fStack->popFrame(frameSize);
+ }
+ }
+ break;
+
case URX_BACKSLASH_B: // Test for word boundaries
{
UBool success = isWordBoundary(fp->fInputIdx);
@@ -1505,7 +1599,7 @@
case URX_BACKSLASH_D: // Test for decimal digit
{
- if (fp->fInputIdx >= fRegionLimit) {
+ if (fp->fInputIdx >= fActiveLimit) {
fHitEnd = TRUE;
fp = (REStackFrame *)fStack->popFrame(frameSize);
break;
@@ -1525,7 +1619,7 @@
case URX_BACKSLASH_G: // Test for position at end of previous match
- if (!((fMatch && fp->fInputIdx==fMatchEnd) || fMatch==FALSE && fp->fInputIdx==fRegionStart)) {
+ if (!((fMatch && fp->fInputIdx==fMatchEnd) || fMatch==FALSE && fp->fInputIdx==fActiveStart)) {
fp = (REStackFrame *)fStack->popFrame(frameSize);
}
break;
@@ -1538,7 +1632,7 @@
{
// Fail if at end of input
- if (fp->fInputIdx >= fRegionLimit) {
+ if (fp->fInputIdx >= fActiveLimit) {
fHitEnd = TRUE;
fp = (REStackFrame *)fStack->popFrame(frameSize);
break;
@@ -1547,7 +1641,7 @@
// Examine (and consume) the current char.
// Dispatch into a little state machine, based on the char.
UChar32 c;
- U16_NEXT(inputBuf, fp->fInputIdx, fRegionLimit, c);
+ U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
UnicodeSet **sets = fPattern->fStaticSets;
if (sets[URX_GC_NORMAL]->contains(c)) goto GC_Extend;
if (sets[URX_GC_CONTROL]->contains(c)) goto GC_Control;
@@ -1561,8 +1655,8 @@
GC_L:
- if (fp->fInputIdx >= fRegionLimit) goto GC_Done;
- U16_NEXT(inputBuf, fp->fInputIdx, fRegionLimit, c);
+ if (fp->fInputIdx >= fActiveLimit) goto GC_Done;
+ U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
if (sets[URX_GC_L]->contains(c)) goto GC_L;
if (sets[URX_GC_LV]->contains(c)) goto GC_V;
if (sets[URX_GC_LVT]->contains(c)) goto GC_T;
@@ -1571,16 +1665,16 @@
goto GC_Extend;
GC_V:
- if (fp->fInputIdx >= fRegionLimit) goto GC_Done;
- U16_NEXT(inputBuf, fp->fInputIdx, fRegionLimit, c);
+ if (fp->fInputIdx >= fActiveLimit) goto GC_Done;
+ U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
if (sets[URX_GC_V]->contains(c)) goto GC_V;
if (sets[URX_GC_T]->contains(c)) goto GC_T;
U16_PREV(inputBuf, 0, fp->fInputIdx, c);
goto GC_Extend;
GC_T:
- if (fp->fInputIdx >= fRegionLimit) goto GC_Done;
- U16_NEXT(inputBuf, fp->fInputIdx, fRegionLimit, c);
+ if (fp->fInputIdx >= fActiveLimit) goto GC_Done;
+ U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
if (sets[URX_GC_T]->contains(c)) goto GC_T;
U16_PREV(inputBuf, 0, fp->fInputIdx, c);
goto GC_Extend;
@@ -1588,26 +1682,26 @@
GC_Extend:
// Combining characters are consumed here
for (;;) {
- if (fp->fInputIdx >= fRegionLimit) {
+ if (fp->fInputIdx >= fActiveLimit) {
break;
}
- U16_GET(inputBuf, 0, fp->fInputIdx, fRegionLimit, c);
+ U16_GET(inputBuf, 0, fp->fInputIdx, fActiveLimit, c);
if (sets[URX_GC_EXTEND]->contains(c) == FALSE) {
break;
}
- U16_FWD_1(inputBuf, fp->fInputIdx, fRegionLimit);
+ U16_FWD_1(inputBuf, fp->fInputIdx, fActiveLimit);
}
goto GC_Done;
GC_Control:
// Most control chars stand alone (don't combine with combining chars),
// except for that CR/LF sequence is a single grapheme cluster.
- if (c == 0x0d && fp->fInputIdx < fRegionLimit && inputBuf[fp->fInputIdx] == 0x0a) {
+ if (c == 0x0d && fp->fInputIdx < fActiveLimit && inputBuf[fp->fInputIdx] == 0x0a) {
fp->fInputIdx++;
}
GC_Done:
- if (fp->fInputIdx >= fRegionLimit) {
+ if (fp->fInputIdx >= fActiveLimit) {
fHitEnd = TRUE;
}
break;
@@ -1632,7 +1726,7 @@
// The high bit of the op value is a flag for the match polarity.
// 0: success if input char is in set.
// 1: success if input char is not in set.
- if (fp->fInputIdx >= fRegionLimit) {
+ if (fp->fInputIdx >= fActiveLimit) {
fHitEnd = TRUE;
fp = (REStackFrame *)fStack->popFrame(frameSize);
break;
@@ -1642,7 +1736,7 @@
opValue &= ~URX_NEG_SET;
U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
UChar32 c;
- U16_NEXT(inputBuf, fp->fInputIdx, fRegionLimit, c);
+ U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
if (c < 256) {
Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
if (s8->contains(c)) {
@@ -1665,7 +1759,7 @@
{
// Test input character for NOT being a member of one of
// the predefined sets (Word Characters, for example)
- if (fp->fInputIdx >= fRegionLimit) {
+ if (fp->fInputIdx >= fActiveLimit) {
fHitEnd = TRUE;
fp = (REStackFrame *)fStack->popFrame(frameSize);
break;
@@ -1673,7 +1767,7 @@
U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
UChar32 c;
- U16_NEXT(inputBuf, fp->fInputIdx, fRegionLimit, c);
+ U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
if (c < 256) {
Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
if (s8->contains(c) == FALSE) {
@@ -1692,14 +1786,14 @@
case URX_SETREF:
- if (fp->fInputIdx >= fRegionLimit) {
+ if (fp->fInputIdx >= fActiveLimit) {
fHitEnd = TRUE;
fp = (REStackFrame *)fStack->popFrame(frameSize);
break;
}
// There is input left. Pick up one char and test it for set membership.
UChar32 c;
- U16_NEXT(inputBuf, fp->fInputIdx, fRegionLimit, c);
+ U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
U_ASSERT(opValue > 0 && opValue < sets->size());
if (c<256) {
Regex8BitSet *s8 = &fPattern->fSets8[opValue];
@@ -1721,7 +1815,7 @@
case URX_DOTANY:
{
// . matches anything, but stops at end-of-line.
- if (fp->fInputIdx >= fRegionLimit) {
+ if (fp->fInputIdx >= fActiveLimit) {
// At end of input. Match failed. Backtrack out.
fHitEnd = TRUE;
fp = (REStackFrame *)fStack->popFrame(frameSize);
@@ -1729,7 +1823,7 @@
}
// There is input left. Advance over one char, unless we've hit end-of-line
UChar32 c;
- U16_NEXT(inputBuf, fp->fInputIdx, fRegionLimit, c);
+ U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
if (((c & 0x7f) <= 0x29) && // First quickly bypass as many chars as possible
((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) {
// End of line in normal mode. . does not match.
@@ -1738,12 +1832,12 @@
}
}
break;
-
-
+
+
case URX_DOTANY_ALL:
{
// ., in dot-matches-all (including new lines) mode
- if (fp->fInputIdx >= fRegionLimit) {
+ if (fp->fInputIdx >= fActiveLimit) {
// At end of input. Match failed. Backtrack out.
fHitEnd = TRUE;
fp = (REStackFrame *)fStack->popFrame(frameSize);
@@ -1752,8 +1846,8 @@
// There is input left. Advance over one char, except if we are
// at a cr/lf, advance over both of them.
UChar32 c;
- U16_NEXT(inputBuf, fp->fInputIdx, fRegionLimit, c);
- if (c==0x0d && fp->fInputIdx < fRegionLimit) {
+ U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
+ if (c==0x0d && fp->fInputIdx < fActiveLimit) {
// In the case of a CR/LF, we need to advance over both.
UChar nextc = inputBuf[fp->fInputIdx];
if (nextc == 0x0a) {
@@ -1763,51 +1857,24 @@
}
break;
- case URX_DOTANY_PL:
- // Match all up to and end-of-line or end-of-input.
+
+ case URX_DOTANY_UNIX:
{
- // Fail if input already exhausted.
- if (fp->fInputIdx >= fRegionLimit) {
+ // '.' operator, matches all, but stops at end-of-line.
+ // UNIX_LINES mode, so 0x0a is the only recognized line ending.
+ if (fp->fInputIdx >= fActiveLimit) {
+ // At end of input. Match failed. Backtrack out.
fHitEnd = TRUE;
fp = (REStackFrame *)fStack->popFrame(frameSize);
break;
}
-
- // There is input left. Fail if we are at the end of a line.
+ // There is input left. Advance over one char, unless we've hit end-of-line
UChar32 c;
- U16_NEXT(inputBuf, fp->fInputIdx, fRegionLimit, c);
- if (((c & 0x7f) <= 0x29) && // First quickly bypass as many chars as possible
- ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) {
- // End of line in normal mode. . does not match.
+ U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
+ if (c == 0x0a) {
+ // End of line in UNIX_LINES mode. '.' does not match the \n
fp = (REStackFrame *)fStack->popFrame(frameSize);
- break;
}
-
- // There was input left. Consume it until we hit the end of a line,
- // or until it's exhausted.
- for (;;) {
- if (fp->fInputIdx >= fRegionLimit) {
- fHitEnd = TRUE;
- break;
- }
- U16_NEXT(inputBuf, fp->fInputIdx, fRegionLimit, c);
- if (((c & 0x7f) <= 0x29) && // First quickly bypass as many chars as possible
- ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) {
- U16_BACK_1(inputBuf, 0, fp->fInputIdx)
- // Scan has reached a line-end. We are done.
- break;
- }
- }
- }
- break;
-
- case URX_DOTANY_ALL_PL:
- // Match up to end of input. Fail if already at end of input.
- fHitEnd = TRUE;
- if (fp->fInputIdx >= fRegionLimit) {
- fp = (REStackFrame *)fStack->popFrame(frameSize);
- } else {
- fp->fInputIdx = fRegionLimit;
}
break;
@@ -2006,7 +2073,7 @@
}
UBool haveMatch = FALSE;
- if (fp->fInputIdx + len <= fRegionLimit) {
+ if (fp->fInputIdx + len <= fActiveLimit) {
if (opType == URX_BACKREF) {
if (u_strncmp(inputBuf+groupStartIdx, inputBuf+fp->fInputIdx, len) == 0) {
haveMatch = TRUE;
@@ -2054,13 +2121,14 @@
break;
case URX_LA_START:
- // TODO: setup for trnsaparent bounds,
{
// Entering a lookahead block.
// Save Stack Ptr, Input Pos.
U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
fData[opValue] = fStack->size();
fData[opValue+1] = fp->fInputIdx;
+ fActiveStart = fLookStart; // Change the active match region to the
+ fActiveLimit = fLookLimit; // look-around bounds (transparent bounds).
}
break;
@@ -2068,12 +2136,14 @@
{
// Leaving a look-ahead block.
// restore Stack Ptr, Input Pos to positions they had on entry to block.
- // TODO: will need to restore Region bounds as well, for Transparent Bounds.
U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
int32_t stackSize = fStack->size();
int32_t newStackSize = fData[opValue];
U_ASSERT(stackSize >= newStackSize);
if (stackSize > newStackSize) {
+ // Copy the current top frame back to the new (cut back) top frame.
+ // This makes the capture groups from within the look-ahead
+ // expression available.
int32_t *newFP = fStack->getBuffer() + newStackSize - frameSize;
int32_t i;
for (i=0; i<frameSize; i++) {
@@ -2083,13 +2153,18 @@
fStack->setSize(newStackSize);
}
fp->fInputIdx = fData[opValue+1];
+
+ // Restore the active region bounds in the input string; they may have
+ // been changed because of transparent bounds on a Region.
+ fActiveStart = fRegionStart; // TODO: handle nested look-around blocks.
+ fActiveLimit = fRegionLimit;
}
break;
case URX_ONECHAR_I:
- if (fp->fInputIdx < fRegionLimit) {
+ if (fp->fInputIdx < fActiveLimit) {
UChar32 c;
- U16_NEXT(inputBuf, fp->fInputIdx, fRegionLimit, c);
+ U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
if (u_foldCase(c, U_FOLD_CASE_DEFAULT) == opValue) {
break;
}
@@ -2115,7 +2190,7 @@
stringLen = opValue;
int32_t stringEndIndex = fp->fInputIdx + stringLen;
- if (stringEndIndex <= fRegionLimit) {
+ if (stringEndIndex <= fActiveLimit) {
if (u_strncasecmp(inputBuf+fp->fInputIdx, litText+stringStartIdx,
stringLen, U_FOLD_CASE_DEFAULT) == 0) {
// Success. Advance the current input position.
@@ -2144,8 +2219,8 @@
fData[opValue+2] = -1;
// Save input string length, then reset to pin any matches to end at
// the current position.
- fData[opValue+3] = fRegionLimit;
- fRegionLimit = fp->fInputIdx;
+ fData[opValue+3] = fActiveLimit;
+ fActiveLimit = fp->fInputIdx;
}
break;
@@ -2184,9 +2259,9 @@
// Look Behind altogether.
fp = (REStackFrame *)fStack->popFrame(frameSize);
int32_t restoreInputLen = fData[opValue+3];
- U_ASSERT(restoreInputLen >= fRegionLimit);
+ U_ASSERT(restoreInputLen >= fActiveLimit);
U_ASSERT(restoreInputLen <= fInput->length());
- fRegionLimit = restoreInputLen;
+ fActiveLimit = restoreInputLen;
break;
}
@@ -2201,7 +2276,7 @@
// End of a look-behind block, after a successful match.
{
U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
- if (fp->fInputIdx != fRegionLimit) {
+ if (fp->fInputIdx != fActiveLimit) {
// The look-behind expression matched, but the match did not
// extend all the way to the point that we are looking behind from.
// FAIL out of here, which will take us back to the LB_CONT, which
@@ -2215,9 +2290,9 @@
// which had been truncated to pin the end of the lookbehind match to the
// position being looked-behind.
int32_t originalInputLen = fData[opValue+3];
- U_ASSERT(originalInputLen >= fRegionLimit);
+ U_ASSERT(originalInputLen >= fActiveLimit);
U_ASSERT(originalInputLen <= fInput->length());
- fRegionLimit = originalInputLen;
+ fActiveLimit = originalInputLen;
}
break;
@@ -2257,9 +2332,9 @@
// getting a match, which means that the negative lookbehind as
// a whole has succeeded. Jump forward to the continue location
int32_t restoreInputLen = fData[opValue+3];
- U_ASSERT(restoreInputLen >= fRegionLimit);
+ U_ASSERT(restoreInputLen >= fActiveLimit);
U_ASSERT(restoreInputLen <= fInput->length());
- fRegionLimit = restoreInputLen;
+ fActiveLimit = restoreInputLen;
fp->fPatIdx = continueLoc;
break;
}
@@ -2275,7 +2350,7 @@
// End of a negative look-behind block, after a successful match.
{
U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
- if (fp->fInputIdx != fRegionLimit) {
+ if (fp->fInputIdx != fActiveLimit) {
// The look-behind expression matched, but the match did not
// extend all the way to the point that we are looking behind from.
// FAIL out of here, which will take us back to the LB_CONT, which
@@ -2292,9 +2367,9 @@
// inorder to pin the end of the lookbehind match
// to the position being looked-behind.
int32_t originalInputLen = fData[opValue+3];
- U_ASSERT(originalInputLen >= fRegionLimit);
+ U_ASSERT(originalInputLen >= fActiveLimit);
U_ASSERT(originalInputLen <= fInput->length());
- fRegionLimit = originalInputLen;
+ fActiveLimit = originalInputLen;
// Restore original stack position, discarding any state saved
// by the successful pattern match.
@@ -2324,12 +2399,12 @@
// we reach a character that is not a member of the set.
int32_t ix = fp->fInputIdx;
for (;;) {
- if (ix >= fRegionLimit) {
+ if (ix >= fActiveLimit) {
fHitEnd = TRUE;
break;
}
UChar32 c;
- U16_NEXT(inputBuf, ix, fRegionLimit, c);
+ U16_NEXT(inputBuf, ix, fActiveLimit, c);
if (c<256) {
if (s8->contains(c) == FALSE) {
U16_BACK_1(inputBuf, 0, ix);
@@ -2377,32 +2452,35 @@
// Loop through input until the input is exhausted (we reach an end-of-line)
// In multi-line mode, we can just go straight to the end of the input.
int32_t ix;
- if (opValue == 1) {
+ if ((opValue & 1) == 1) {
// Multi-line mode.
- ix = fRegionLimit;
+ ix = fActiveLimit;
fHitEnd = TRUE;
} else {
// NOT multi-line mode. Line endings do not match '.'
// Scan forward until a line ending or end of input.
ix = fp->fInputIdx;
for (;;) {
- if (ix >= fRegionLimit) {
+ if (ix >= fActiveLimit) {
fHitEnd = TRUE;
- ix = fRegionLimit;
+ ix = fActiveLimit;
break;
}
UChar32 c;
- U16_NEXT(inputBuf, ix, fRegionLimit, c); // c = inputBuf[ix++]
- if (((c & 0x7f) <= 0x29) &&
- ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) {
- // char is a line ending. Put the input pos back to the
- // line ending char, and exit the scanning loop.
- U16_BACK_1(inputBuf, 0, ix);
- break;
+ U16_NEXT(inputBuf, ix, fActiveLimit, c); // c = inputBuf[ix++]
+ if ((c & 0x7f) <= 0x29) { // Fast filter of non-new-line-s
+ if ((c == 0x0a) || // 0x0a is newline in both modes.
+ (((opValue & 2) == 0) && // Others only if not UNIX_LINES mode
+ ((c<=0x0d && c>=0x0a) || c==0x85 || c==0x2028 || c==0x2029))) {
+ // char is a line ending. Put the input pos back to the
+ // line ending char, and exit the scanning loop.
+ U16_BACK_1(inputBuf, 0, ix);
+ break;
+ }
+ }
}
}
-
+
// If there were no matching characters, skip over the loop altogether.
// The loop doesn't run at all, a * op always succeeds.
if (ix == fp->fInputIdx) {
@@ -2412,7 +2490,7 @@
// Peek ahead in the compiled pattern, to the URX_LOOP_C that
// must follow. It's operand is the stack location
- // that holds the starting input index for the match of this [set]*
+ // that holds the starting input index for the match of this .*
int32_t loopcOp = pat[fp->fPatIdx];
U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
int32_t stackLoc = URX_VAL(loopcOp);
diff --git a/i18n/repattrn.cpp b/i18n/repattrn.cpp
index 8cf55d7..17b721e 100644
--- a/i18n/repattrn.cpp
+++ b/i18n/repattrn.cpp
@@ -244,7 +244,8 @@
}
const uint32_t allFlags = UREGEX_CANON_EQ | UREGEX_CASE_INSENSITIVE | UREGEX_COMMENTS |
- UREGEX_DOTALL | UREGEX_MULTILINE | UREGEX_UWORD | UREGEX_ERROR_ON_UNKNOWN_ESCAPES;
+ UREGEX_DOTALL | UREGEX_MULTILINE | UREGEX_UWORD |
+ UREGEX_ERROR_ON_UNKNOWN_ESCAPES | UREGEX_UNIX_LINES;
if ((flags & ~allFlags) != 0) {
status = U_REGEX_INVALID_FLAG;
@@ -451,8 +452,6 @@
case URX_NOP:
case URX_DOTANY:
case URX_DOTANY_ALL:
- case URX_DOTANY_PL:
- case URX_DOTANY_ALL_PL:
case URX_FAIL:
case URX_CARET:
case URX_DOLLAR:
diff --git a/i18n/unicode/regex.h b/i18n/unicode/regex.h
index e2af308..cabf1a5 100644
--- a/i18n/unicode/regex.h
+++ b/i18n/unicode/regex.h
@@ -16,7 +16,7 @@
#ifndef REGEX_H
#define REGEX_H
-// #define REGEX_DEBUG
+#define REGEX_DEBUG
/**
* \file
@@ -1067,6 +1067,12 @@
int32_t fLookStart; // Region bounds for look-ahead/behind and
int32_t fLookLimit; // and other boundary tests. See
// useTransparentBounds
+
+ int32_t fActiveStart; // Currently active bounds for matching.
+ int32_t fActiveLimit; // Usually is the same as region, but
+ // is changed to fLookStart/Limit when
+ // entering look around regions.
+
UBool fTransparentBounds; // True if using transparent bounds.
UBool fAnchoringBounds; // True if using anchoring bounds.
diff --git a/i18n/unicode/uregex.h b/i18n/unicode/uregex.h
index 660076a..3e04f7a 100644
--- a/i18n/unicode/uregex.h
+++ b/i18n/unicode/uregex.h
@@ -71,7 +71,7 @@
* early (\u) we should still do.
* @draft ICU 4.0
*/
- UREGEG_LITERAL = 16,
+ UREGEX_LITERAL = 16,
/** Control behavior of "$" and "^"
* If set, recognize line terminators within string,
@@ -84,7 +84,7 @@
* in the behavior of ., ^, and $.
* @draft ICU 4.0
*/
- URGEGX_UNIX_LINES = 1,
+ UREGEX_UNIX_LINES = 1,
/** Unicode word boundaries.
* If set, \b uses the Unicode TR 29 definition of word boundaries.
@@ -455,6 +455,7 @@
* uregex_start(), uregex_end() and uregex_group() to return an error
* indicating that there is no match information available. Clears any
* match region that may have been set.
+ * TODO: reset(-1) to preserve regions?
*
* @param regexp The compiled regular expression.
* @param index The position in the text at which a
diff --git a/test/cintltst/reapits.c b/test/cintltst/reapits.c
index 955bdd2..367d555 100644
--- a/test/cintltst/reapits.c
+++ b/test/cintltst/reapits.c
@@ -34,6 +34,22 @@
#define TEST_ASSERT(expr) {if ((expr)==FALSE) { \
log_err("Test Failure at file %s, line %d\n", __FILE__, __LINE__);}}
+#define TEST_SETUP(pattern, testString, flags) { \
+ status = U_ZERO_ERROR; \
+ re = uregex_openC(pattern, flags, NULL, &status); \
+ TEST_ASSERT_SUCCESS(status); \
+ UChar *srcString = (UChar *)malloc((strlen(testString)+2)*sizeof(UChar)); \
+ u_uastrncpy(srcString, testString, strlen(testString)+1); \
+ uregex_setText(re, srcString, -1, &status); \
+ TEST_ASSERT_SUCCESS(status);
+
+#define TEST_TEARDOWN \
+ TEST_ASSERT_SUCCESS(status); \
+ uregex_close(re); \
+ free(srcString); \
+ }
+
+
static void test_assert_string(const char *expected, const UChar *actual, UBool nulTerm, const char *file, int line) {
char buf_inside_macro[120];
int32_t len = (int32_t)strlen(expected);
@@ -548,22 +564,6 @@
/*
* Regions
*/
- #define TEST_SETUP(pattern, testString, flags) { \
- status = U_ZERO_ERROR; \
- re = uregex_openC(pattern, flags, NULL, &status); \
- TEST_ASSERT_SUCCESS(status); \
- UChar *srcString = (UChar *)malloc((strlen(testString)+2)*sizeof(UChar)); \
- u_uastrncpy(srcString, testString, strlen(testString)+1); \
- uregex_setText(re, srcString, -1, &status); \
- TEST_ASSERT_SUCCESS(status);
-
- #define TEST_TEARDOWN \
- TEST_ASSERT_SUCCESS(status); \
- uregex_close(re); \
- free(srcString); \
- }
-
-
// SetRegion(), getRegion() do something
@@ -656,14 +656,41 @@
TEST_ASSERT(uregex_requireEnd(re, &status) == FALSE);
TEST_TEARDOWN;
- // requireEnd
TEST_SETUP("abcd$", "abcd", 0);
TEST_ASSERT(uregex_find(re, 0, &status) == TRUE);
TEST_ASSERT(uregex_requireEnd(re, &status) == TRUE);
TEST_TEARDOWN;
-
-
+ // anchoringBounds
+ TEST_SETUP("abc$", "abcdef", 0);
+ TEST_ASSERT(uregex_hasAnchoringBounds(re, &status) == TRUE);
+ uregex_useAnchoringBounds(re, FALSE, &status);
+ TEST_ASSERT(uregex_hasAnchoringBounds(re, &status) == FALSE);
+
+ TEST_ASSERT(uregex_find(re, -1, &status) == FALSE);
+ uregex_useAnchoringBounds(re, TRUE, &status);
+ uregex_setRegion(re, 0, 3, &status);
+ TEST_ASSERT(uregex_find(re, -1, &status) == TRUE);
+ TEST_ASSERT(uregex_end(re, 0, &status) == 3);
+ TEST_TEARDOWN;
+
+ // Transparent Bounds
+ TEST_SETUP("abc(?=def)", "abcdef", 0);
+ TEST_ASSERT(uregex_hasTransparentBounds(re, &status) == FALSE);
+ uregex_useTransparentBounds(re, TRUE, &status);
+ TEST_ASSERT(uregex_hasTransparentBounds(re, &status) == TRUE);
+
+ uregex_useTransparentBounds(re, FALSE, &status);
+ TEST_ASSERT(uregex_find(re, -1, &status) == TRUE); // No Region
+ uregex_setRegion(re, 0, 3, &status);
+ TEST_ASSERT(uregex_find(re, -1, &status) == FALSE); // with region, opaque bounds
+ uregex_useTransparentBounds(re, TRUE, &status);
+ TEST_ASSERT(uregex_find(re, -1, &status) == TRUE); // with region, transparent bounds
+ TEST_ASSERT(uregex_end(re, 0, &status) == 3);
+ TEST_TEARDOWN;
+
+
+
#if 0
status = U_ZERO_ERROR;
uregex_reset(re, 0, &status);
diff --git a/test/intltest/regextst.cpp b/test/intltest/regextst.cpp
index e984c2a..6ab96db 100644
--- a/test/intltest/regextst.cpp
+++ b/test/intltest/regextst.cpp
@@ -1360,7 +1360,7 @@
RegexMatcher quotedStuffMat("\\s*([\\'\\\"/])(.*?)\\1", 0, status);
RegexMatcher commentMat ("\\s*(#.*)?$", 0, status);
- RegexMatcher flagsMat ("\\s*([ixsmdteEGMvatyYzZ2-9]*)([:letter:]*)", 0, status);
+ RegexMatcher flagsMat ("\\s*([ixsmdteDEGLMvabtyYzZ2-9]*)([:letter:]*)", 0, status);
RegexMatcher lineMat("(.*?)\\r?\\n", testString, 0, status);
UnicodeString testPattern; // The pattern for test from the test file.
@@ -1506,8 +1506,9 @@
int32_t numFinds;
int32_t i;
UBool useMatchesFunc = FALSE;
- int32_t regionStart = -1;
- int32_t regionEnd = -1;
+ UBool useLookingAtFunc = FALSE;
+ int32_t regionStart = -1;
+ int32_t regionEnd = -1;
//
// Compile the caller's pattern
@@ -1529,6 +1530,9 @@
if (flags.indexOf((UChar)0x65) >= 0) { // 'e' flag
bflags |= UREGEX_ERROR_ON_UNKNOWN_ESCAPES;
}
+ if (flags.indexOf((UChar)0x44) >= 0) { // 'D' flag
+ bflags |= UREGEX_UNIX_LINES;
+ }
callerPattern = RegexPattern::compile(pattern, bflags, pe, status);
@@ -1581,6 +1585,9 @@
if (flags.indexOf((UChar)0x4d) >= 0) {
useMatchesFunc = TRUE;
}
+ if (flags.indexOf((UChar)0x4c) >= 0) {
+ useLookingAtFunc = TRUE;
+ }
//
// Find the tags in the input data, remove them, and record the group boundary
@@ -1644,10 +1651,14 @@
//
// Do a find on the de-tagged input using the caller's pattern
+ // TODO: error on count>1 and not find().
+ // error on both matches() and lookingAt().
//
for (i=0; i<numFinds; i++) {
if (useMatchesFunc) {
isMatch = matcher->matches(status);
+ } else if (useLookingAtFunc) {
+ isMatch = matcher->lookingAt(status);
} else {
isMatch = matcher->find();
}
@@ -1702,22 +1713,22 @@
if ((flags.indexOf((UChar)0x59) >= 0) && // 'Y' flag: RequireEnd() == false
matcher->requireEnd() == TRUE) {
- errln("requireEnd() returned TRUE. Expected FALSE");
+ errln("Error at line %d: requireEnd() returned TRUE. Expected FALSE", line);
failed = TRUE;
}
if ((flags.indexOf((UChar)0x79) >= 0) && // 'y' flag: RequireEnd() == true
matcher->requireEnd() == FALSE) {
- errln("requireEnd() returned FALSE. Expected TRUE");
+ errln("Error at line %d: requireEnd() returned FALSE. Expected TRUE", line);
failed = TRUE;
}
if ((flags.indexOf((UChar)0x5A) >= 0) && // 'Z' flag: hitEnd() == false
matcher->hitEnd() == TRUE) {
- errln("hitEnd() returned TRUE. Expected FALSE");
+ errln("Error at line %d: hitEnd() returned TRUE. Expected FALSE", line);
failed = TRUE;
}
if ((flags.indexOf((UChar)0x7A) >= 0) && // 'z' flag: hitEnd() == true
matcher->hitEnd() == FALSE) {
- errln("hitEnd() returned FALSE. Expected TRUE");
+ errln("Error at line %d: hitEnd() returned FALSE. Expected TRUE", line);
failed = TRUE;
}
diff --git a/test/testdata/re_tests.txt b/test/testdata/re_tests.txt
index b863571..c18b638 100644
--- a/test/testdata/re_tests.txt
+++ b/test/testdata/re_tests.txt
@@ -822,7 +822,7 @@
.[X](.+)+[X][X] bbbbXXXaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa ni - -
.[X][X](.+)+[X] bbbbXXXaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa ni - -
tt+$ xxxtt y - -
-([a-\d]+) za-9z y $1 a-9
+([a-\d]+) za-9z yi $1 a-9
([\d-z]+) a0-za y $1 0-z
([\d-\s]+) a0- z y $1 0-
([a-[:digit:]]+) za-9z y $1 a-9
diff --git a/test/testdata/regextst.txt b/test/testdata/regextst.txt
index 3a51386..01a867e 100644
--- a/test/testdata/regextst.txt
+++ b/test/testdata/regextst.txt
@@ -22,6 +22,7 @@
# s dot-matches-all mode
# m multi-line mode.
# ($ and ^ match at embedded new-lines)
+# D Unix Lines mode (only recognize 0x0a as new-line)
# v If icu configured without break iteration, this
# regex test pattern should not compile.
# e set the UREGEX_ERROR_ON_UNKNOWN_ESCAPES flag
@@ -32,6 +33,7 @@
# for the last find() in the sequence.
# G Only check match / no match. Do not check capture groups.
# E Pattern compilation error expected
+# L Use LookingAt() rather than find()
# M Use matches() rather than find().
#
# a Use non-Anchoring Bounds.
@@ -45,6 +47,114 @@
# White space must be present between the flags and the match string.
#
+# Look-ahead expressions
+#
+"abc(?=def)" "<0>abc</0>def"
+"(.*)(?=c)" "<0><1>ab</1></0>cdef"
+
+"(?:.*)(?=c)" "<r>ab</r>cdef"
+"(?:.*)(?=c)" b "<r><0>ab</0></r>cdef" # transparent bounds
+"(?:.*)(?=c)" bM "<r><0>ab</0></r>cdef" # transparent bounds
+
+"(?:.*)(?=(c))" b "<0>ab</0><1>c</1>def" # Capture in look-ahead
+"(?=(.)\1\1)\1" "abcc<0><1>d</1></0>ddefg" # Backrefs to look-ahead capture
+
+".(?!\p{L})" "abc<0>d</0> " # Negated look-ahead
+".(?!(\p{L}))" "abc<0>d</0> " # Negated look-ahead, no capture
+ # visible outside of look-ahead
+
+#
+# Negated Lookahead, various regions and region transparency
+#
+"abc(?!def)" "<0>abc</0>xyz"
+"abc(?!def)" "abcdef"
+"abc(?!def)" "<r><0>abc</0></r>def"
+"abc(?!def)" b "<r>abc</r>def"
+"abc(?!def)" b "<r><0>abc</0></r>xyz"
+
+#
+# Anchoring Bounds
+#
+"^def$" "abc<r><0>def</0></r>ghi" # anchoring (default) bounds
+"^def$" a "abc<r>def</r>ghi" # non-anchoring bounds
+"^def" a "<r><0>def</0></r>ghi" # non-anchoring bounds
+"def$" a "abc<r><0>def</0></r>" # non-anchoring bounds
+
+"^.*$" m "<0>line 1</0>\n line 2"
+"^.*$" m2 "line 1\n<0> line 2</0>"
+"^.*$" m3 "line 1\n line 2"
+"^.*$" m "li<r><0>ne </0></r>1\n line 2" # anchoring bounds
+"^.*$" m2 "li<r>ne </r>1\n line 2" # anchoring bounds
+"^.*$" am "li<r>ne </r>1\n line 2" # non-anchoring bounds
+"^.*$" am "li\n<r><0>ne </0></r>\n1\n line 2" # non-anchoring bounds
+
+#
+# HitEnd and RequireEnd for new-lines just before end-of-input
+#
+"xyz$" yz "<0>xyz</0>\n"
+"xyz$" yz "<0>xyz</0>\x{d}\x{a}"
+
+"xyz$" myz "<0>xyz</0>" # multi-line mode
+"xyz$" mYZ "<0>xyz</0>\n"
+"xyz$" mYZ "<0>xyz</0>\r\n"
+"xyz$" mYZ "<0>xyz</0>\x{85}abcd"
+
+"xyz$" Yz "xyz\nx"
+"xyz$" Yz "xyza"
+"xyz$" yz "<0>xyz</0>"
+
+#
+# All Unicode line endings recognized.
+# 0a, 0b, 0c, 0d, 0x85, 0x2028, 0x2029
+# Multi-line and non-multiline mode take different paths, so repeated tests.
+#
+"^def$" mYZ "abc\x{a}<0>def</0>\x{a}ghi"
+"^def$" mYZ "abc\x{b}<0>def</0>\x{b}ghi"
+"^def$" mYZ "abc\x{c}<0>def</0>\x{c}ghi"
+"^def$" mYZ "abc\x{d}<0>def</0>\x{d}ghi"
+"^def$" mYZ "abc\x{85}<0>def</0>\x{85}ghi"
+"^def$" mYZ "abc\x{2028}<0>def</0>\x{2028}ghi"
+"^def$" mYZ "abc\x{2029}<0>def</0>\x{2029}ghi"
+"^def$" mYZ "abc\r\n<0>def</0>\r\nghi"
+
+"^def$" yz "<0>def</0>\x{a}"
+"^def$" yz "<0>def</0>\x{b}"
+"^def$" yz "<0>def</0>\x{c}"
+"^def$" yz "<0>def</0>\x{d}"
+"^def$" yz "<0>def</0>\x{85}"
+"^def$" yz "<0>def</0>\x{2028}"
+"^def$" yz "<0>def</0>\x{2029}"
+"^def$" yz "<0>def</0>\r\n"
+"^def$" yz "<0>def</0>"
+
+
+"^def$" "<0>def</0>\x{2028" #TODO: should be an error of some sort.
+
+#
+# UNIX_LINES mode
+#
+"abc$" D "<0>abc</0>\n"
+"abc$" D "abc\r"
+"abc$" D "abc\u0085"
+"a.b" D "<0>a\rb</0>"
+"a.b" D "a\nb"
+"(?d)abc$" "<0>abc</0>\n"
+"(?d)abc$" "abc\r"
+"abc$" mD "<0>abc</0>\ndef"
+"abc$" mD "abc\rdef"
+
+".*def" L "abc\r def xyz" # Normal mode, LookingAt() stops at \r
+".*def" DL "<0>abc\r def</0> xyz" # Unix Lines mode, \r not line end.
+".*def" DL "abc\n def xyz"
+
+"(?d)a.b" "a\nb"
+"(?d)a.b" "<0>a\rb</0>"
+
+"^abc" m "xyz\r<0>abc</0>"
+"^abc" Dm "xyz\rabc"
+"^abc" Dm "xyz\n<0>abc</0>"
+
+
# Capturing parens
".(..)." "<0>a<1>bc</1>d</0>"
@@ -442,9 +552,10 @@
#
# Octal Escaping. This conforms to Java conventions, not Perl.
-"\0101\0\03\073\0154\01442" "<0>A\u0000\u0003\u003b\u006c\u0064\u0032</0>"
+"\0101\00\03\073\0154\01442" "<0>A\u0000\u0003\u003b\u006c\u0064\u0032</0>"
"\0776" "<0>\u003f\u0036</0>" # overflow, the 6 is literal.
"\0376xyz" "<0>\u00fexyz</0>"
+"\08" E "<0>\u00008</0>"
#
# \u Surrogate Pairs
@@ -454,6 +565,12 @@
"\ud800\ud800\udc00" "<0>\ud800\U00010000</0>\U00010000\U00010000\U00010001"
"(\ud800)(\udc00)" "\U00010000"
+#
+# hitEnd with find()
+#
+"abc" Z "aa<0>abc</0> abcab"
+"abc" 2Z "aaabc <0>abc</0>ab"
+"abc" 3z "aa>abc abcab"
#
# Bug 3225
@@ -652,7 +769,7 @@
"[abcd-[bc]]+" "<0>bad--dac</0>xyz"
"[abcd-]+" "<0>bad--dac</0>xyz"
-"[abcd-\s]+" "xyz<0>abcd --</0>xyz" # set-lit-dash-esc
+"[abcd-\s]+" E "xyz<0>abcd --</0>xyz" # set-lit-dash-esc
"[abcd-\N{LATIN SMALL LETTER G}]+" "xyz-<0>abcdefg</0>hij-"
"[bcd-\{]+" "a<0>bcdefyz{</0>|}"
@@ -682,6 +799,8 @@
"\p{InBasicLatin}+" "ΓΔΕΖΗΘ<0>hello, world.</0>ニヌネノハバパ"
"\P{InBasicLatin}+" "<0>ΓΔΕΖΗΘ</0>hello, world.ニヌネノハバパ"
"\p{InGreek}+" "<0>ΓΔΕΖΗΘ</0>hello, world.ニヌネノハバパ"
+"\p{InCombining Marks for Symbols}" "<0>\u20d0</0>"
+"\p{Incombiningmarksforsymbols}" "<0>\u20d0</0>"
"\p{javaDefined}+" "\uffff<0>abcd</0>\U00045678"
@@ -717,6 +836,11 @@
"\Q\Y\E" e "<0>\\Y</0>"
#
+# Reported problem
+#
+"[a-\w]" E "x"
+
+#
# Bug 4045
#
"A*" "<0>AAAA</0>"
@@ -785,6 +909,7 @@
# bug 5386 "^.*$" should match empty input
#
"^.*$" "<0></0>"
+"^.*$" m "<0></0>"
"^.*$" "<0></0>\n"
"(?s)^.*$" "<0>\n</0>"