blob: 66a26fbef37d1854f7403f6755a5949e19dbd493 [file] [log] [blame]
/********************************************************************
* COPYRIGHT:
* Copyright (c) 2002-2012, International Business Machines Corporation and
* others. All Rights Reserved.
********************************************************************/
//
// regextst.cpp
//
// ICU Regular Expressions test, part of intltest.
//
/*
NOTE!!
PLEASE be careful about ASCII assumptions in this test.
This test is one of the worst repeat offenders.
If you have questions, contact someone on the ICU PMC
who has access to an EBCDIC system.
*/
#include "intltest.h"
#if !UCONFIG_NO_REGULAR_EXPRESSIONS
#include "unicode/regex.h"
#include "unicode/uchar.h"
#include "unicode/ucnv.h"
#include "unicode/uniset.h"
#include "unicode/ustring.h"
#include "regextst.h"
#include "uvector.h"
#include "util.h"
#include <stdlib.h>
#include <string.h>
#include <stdio.h>
#include "cstring.h"
#include "uinvchar.h"
#define SUPPORT_MUTATING_INPUT_STRING 0
//---------------------------------------------------------------------------
//
// Test class boilerplate
//
//---------------------------------------------------------------------------
// Constructor: performs no initialization of its own.
RegexTest::RegexTest() {}
// Destructor: has nothing of its own to release.
RegexTest::~RegexTest() {}
//
//  runIndexedTest   Map a test index onto its display name and, when exec is
//                   set, invoke the corresponding test function.
//                   An index beyond the last test yields an empty name.
//
//  REGEXTEST_CASE is a local helper that expands to one ordinary switch case:
//  set the name, optionally run the test, break.
//
#define REGEXTEST_CASE(id, label, testFn) \
    case id: \
        name = label; \
        if (exec) { \
            testFn(); \
        } \
        break

void RegexTest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* /*par*/ )
{
    if (exec) {
        logln("TestSuite RegexTest: ");
    }

    switch (index) {
        REGEXTEST_CASE(0,  "Basic",                 Basic);
        REGEXTEST_CASE(1,  "API_Match",             API_Match);
        REGEXTEST_CASE(2,  "API_Replace",           API_Replace);
        REGEXTEST_CASE(3,  "API_Pattern",           API_Pattern);

        // The Extended test is compiled out when file I/O is unavailable;
        // the slot is kept, under the name "skip", so later indexes do not shift.
        case 4:
#if !UCONFIG_NO_FILE_IO
            name = "Extended";
            if (exec) {
                Extended();
            }
#else
            name = "skip";
#endif
            break;

        REGEXTEST_CASE(5,  "Errors",                Errors);
        REGEXTEST_CASE(6,  "PerlTests",             PerlTests);
        REGEXTEST_CASE(7,  "Callbacks",             Callbacks);
        REGEXTEST_CASE(8,  "FindProgressCallbacks", FindProgressCallbacks);
        REGEXTEST_CASE(9,  "Bug 6149",              Bug6149);
        REGEXTEST_CASE(10, "UTextBasic",            UTextBasic);
        REGEXTEST_CASE(11, "API_Match_UTF8",        API_Match_UTF8);
        REGEXTEST_CASE(12, "API_Replace_UTF8",      API_Replace_UTF8);
        REGEXTEST_CASE(13, "API_Pattern_UTF8",      API_Pattern_UTF8);
        REGEXTEST_CASE(14, "PerlTestsUTF8",         PerlTestsUTF8);
        REGEXTEST_CASE(15, "PreAllocatedUTextCAPI", PreAllocatedUTextCAPI);
        REGEXTEST_CASE(16, "Bug 7651",              Bug7651);
        REGEXTEST_CASE(17, "Bug 7740",              Bug7740);
        REGEXTEST_CASE(18, "Bug 8479",              Bug8479);
        REGEXTEST_CASE(19, "Bug 7029",              Bug7029);
        REGEXTEST_CASE(20, "CheckInvBufSize",       CheckInvBufSize);
        REGEXTEST_CASE(21, "Bug 9283",              Bug9283);

        default:
            name = "";
            break;   // needed to end loop
    }
}

#undef REGEXTEST_CASE
/**
* Calls utext_openUTF8 after, potentially, converting invariant text from the compilation codepage
* into ASCII.
* @see utext_openUTF8
*/
static UText* regextst_openUTF8FromInvariant(UText* ut, const char *inv, int64_t length, UErrorCode *status);
//---------------------------------------------------------------------------
//
// Error Checking / Reporting macros used in all of the tests.
//
//---------------------------------------------------------------------------
/**
 * Render the contents of a UText into a NUL-terminated char buffer, for use
 * in log and failure messages.
 *
 * Code points in the range 0x20 .. 0x7d are copied through unchanged; every
 * other code point is replaced by a single '%'.  At most bufLen-1 characters
 * are produced so that the terminating NUL always fits inside the buffer.
 * The native index of the UText is saved on entry and restored on exit.
 *
 * @param buf     destination buffer; left untouched if NULL
 * @param bufLen  capacity of buf in chars, including room for the NUL;
 *                nothing is written if it is not positive
 * @param text    the text to render
 */
static void utextToPrintable(char *buf, int32_t bufLen, UText *text) {
    if (buf == NULL || bufLen <= 0) {
        return;     // no room even for the terminator
    }

    int64_t oldIndex = utext_getNativeIndex(text);
    utext_setNativeIndex(text, 0);

    char *bufPtr = buf;
    // Stop one short of the end: the last slot is reserved for the NUL.
    char *const bufLimit = buf + bufLen - 1;

    UChar32 c = utext_next32From(text, 0);
    while ((c != U_SENTINEL) && (bufPtr < bufLimit)) {
        if (0x000020<=c && c<0x00007e) {
            *bufPtr = (char)c;
        } else {
            *bufPtr = '%';
        }
        bufPtr++;
        c = UTEXT_NEXT32(text);
    }
    *bufPtr = 0;

#if (U_CHARSET_FAMILY==U_EBCDIC_FAMILY)
    // The buffer was filled with ASCII code values; convert it to the
    // platform (EBCDIC) charset so that it prints legibly.
    char *ebuf = (char*)malloc(bufLen);
    if (ebuf != NULL) {
        uprv_eastrncpy((unsigned char*)ebuf, (const unsigned char*)buf, bufLen);
        uprv_strncpy(buf, ebuf, bufLen);
        free((void*)ebuf);
    }
#endif

    utext_setNativeIndex(text, oldIndex);
}
// Shared scratch buffer for extractToAssertBuf().  The returned pointer refers
// to this buffer, so each call overwrites the result of the previous one.
static char ASSERT_BUF[1024];

/**
 * Convert a UnicodeString into a printable, NUL-terminated char string for
 * use in assertion failure messages.
 *
 * The string is first passed through IntlTest::prettify() and then extracted
 * into ASSERT_BUF.  If the extraction leaves the buffer empty, the prettified
 * text is written out as a sequence of \u hex escapes instead, stopping
 * once the buffer cannot hold another complete escape.
 *
 * @param message  the string to render
 * @return pointer to the static ASSERT_BUF; only valid until the next call
 */
const char* RegexTest::extractToAssertBuf(const UnicodeString& message) {
    if(message.length()==0) {
        strcpy(ASSERT_BUF, "[[empty UnicodeString]]");
    } else {
        UnicodeString buf;
        IntlTest::prettify(message,buf);
        if(buf.length()==0) {
            strcpy(ASSERT_BUF, "[[escape() returned 0 chars]]");
        } else {
            buf.extract(0, 0x7FFFFFFF, ASSERT_BUF, sizeof(ASSERT_BUF)-1);
            if(ASSERT_BUF[0]==0) {
                // Extraction produced nothing usable; fall back to hex escapes.
                // One escape is at most "\u" + 4 hex digits + NUL = 7 bytes,
                // so stop as soon as fewer than 7 bytes remain.
                const size_t kMaxEscapeBytes = 7;
                size_t used = 0;
                for(int32_t i=0;i<buf.length();i++) {
                    if (used + kMaxEscapeBytes > sizeof(ASSERT_BUF)) {
                        break;
                    }
                    UChar ch = buf[i];
                    int written = sprintf(ASSERT_BUF+used,"\\u%02x",ch);
                    if (written <= 0) {
                        break;
                    }
                    used += (size_t)written;
                }
            }
        }
    }
    ASSERT_BUF[sizeof(ASSERT_BUF)-1] = 0;
    return ASSERT_BUF;
}
// REGEX_VERBOSE_TEXT(text): log a printable rendering (via utextToPrintable)
// of the UText 'text', tagged with the file, line and the argument's spelling.
#define REGEX_VERBOSE_TEXT(text) {char buf[200];utextToPrintable(buf,sizeof(buf)/sizeof(buf[0]),text);logln("%s:%d: UText %s=\"%s\"", __FILE__, __LINE__, #text, buf);}
// REGEX_CHECK_STATUS: if the variable 'status' in the enclosing scope holds a
// failure, report it with dataerrln() and RETURN from the enclosing function.
#define REGEX_CHECK_STATUS {if (U_FAILURE(status)) {dataerrln("%s:%d: RegexTest failure. status=%s", \
__FILE__, __LINE__, u_errorName(status)); return;}}
// REGEX_ASSERT(expr): report a failure with errln() when expr compares equal
// to FALSE.  Does not return; the enclosing test keeps running.
#define REGEX_ASSERT(expr) {if ((expr)==FALSE) {errln("%s:%d: RegexTest failure: REGEX_ASSERT(%s) failed \n", __FILE__, __LINE__, #expr);};}
// REGEX_ASSERT_FAIL(expr, errcode): evaluate expr against a fresh, macro-local
// 'status' (which shadows any 'status' in the enclosing scope) and report a
// failure if that local status does not end up equal to errcode.
#define REGEX_ASSERT_FAIL(expr, errcode) {UErrorCode status=U_ZERO_ERROR; (expr);\
if (status!=errcode) {dataerrln("RegexTest failure at line %d. Expected status=%s, got %s", \
__LINE__, u_errorName(errcode), u_errorName(status));};}
// REGEX_CHECK_STATUS_L(line): like REGEX_CHECK_STATUS but also reports the
// caller-supplied line number, and does NOT return.
#define REGEX_CHECK_STATUS_L(line) {if (U_FAILURE(status)) {errln( \
"RegexTest failure at line %d, from %d. status=%d\n",__LINE__, (line), status); }}
// REGEX_ASSERT_L(expr, line): like REGEX_ASSERT but also reports the
// caller-supplied line number, and RETURNS from the enclosing function.
#define REGEX_ASSERT_L(expr, line) {if ((expr)==FALSE) { \
errln("RegexTest failure at line %d, from %d.", __LINE__, (line)); return;}}
// REGEX_ASSERT_UNISTR(ustr, inv): report a failure when ustr==inv is false;
// the message shows ustr rendered by extractToAssertBuf() alongside inv.
#define REGEX_ASSERT_UNISTR(ustr,inv) {if (!(ustr==inv)) {errln("%s:%d: RegexTest failure: REGEX_ASSERT_UNISTR(%s,%s) failed \n", __FILE__, __LINE__, extractToAssertBuf(ustr),inv);};}
/**
 * Compare the full contents of two UTexts, code point by code point.
 * Both texts are rewound to native index 0 first, and their iteration
 * positions are left wherever the comparison stopped.
 *
 * @return TRUE if both texts yield the same sequence of code points and
 *         reach their ends together, FALSE at the first difference.
 */
static UBool testUTextEqual(UText *uta, UText *utb) {
    utext_setNativeIndex(uta, 0);
    utext_setNativeIndex(utb, 0);

    for (;;) {
        const UChar32 fromA = utext_next32(uta);
        const UChar32 fromB = utext_next32(utb);
        if (fromA != fromB) {
            return FALSE;       // mismatch, or one text ended before the other
        }
        if (fromA == U_SENTINEL) {
            return TRUE;        // both texts ended at the same point
        }
    }
}
/**
* @param expected expected text in UTF-8 (not platform) codepage
*/
void RegexTest::assertUText(const char *expected, UText *actual, const char *file, int line) {
UErrorCode status = U_ZERO_ERROR;
UText expectedText = UTEXT_INITIALIZER;
utext_openUTF8(&expectedText, expected, -1, &status);
if(U_FAILURE(status)) {
errln("%s:%d: assertUText: error %s calling utext_openUTF8(expected: %d chars)\n", file, line, u_errorName(status), strlen(expected));
return;
}
if(utext_nativeLength(&expectedText)==0 && (strlen(expected)!=0)) {
errln("%s:%d: assertUText: expected is %d utf-8 bytes, but utext_nativeLength(expectedText) returned 0.", file, line, strlen(expected));
return;
}
utext_setNativeIndex(actual, 0);
if (!testUTextEqual(&expectedText, actual)) {
char buf[201 /*21*/];
char expectedBuf[201];
utextToPrintable(buf, sizeof(buf)/sizeof(buf[0]), actual);
utextToPrintable(expectedBuf, sizeof(expectedBuf)/sizeof(expectedBuf[0]), &expectedText);
errln("%s:%d: assertUText: Failure: expected \"%s\" (%d chars), got \"%s\" (%d chars)", file, line, expectedBuf, (int)utext_nativeLength(&expectedText), buf, (int)utext_nativeLength(actual));
}
utext_close(&expectedText);
}
/**
* @param expected invariant (platform local text) input
*/
void RegexTest::assertUTextInvariant(const char *expected, UText *actual, const char *file, int line) {
UErrorCode status = U_ZERO_ERROR;
UText expectedText = UTEXT_INITIALIZER;
regextst_openUTF8FromInvariant(&expectedText, expected, -1, &status);
if(U_FAILURE(status)) {
errln("%s:%d: assertUTextInvariant: error %s calling regextst_openUTF8FromInvariant(expected: %d chars)\n", file, line, u_errorName(status), strlen(expected));
return;
}
utext_setNativeIndex(actual, 0);
if (!testUTextEqual(&expectedText, actual)) {
char buf[201 /*21*/];
char expectedBuf[201];
utextToPrintable(buf, sizeof(buf)/sizeof(buf[0]), actual);
utextToPrintable(expectedBuf, sizeof(expectedBuf)/sizeof(expectedBuf[0]), &expectedText);
errln("%s:%d: assertUTextInvariant: Failure: expected \"%s\" (%d uchars), got \"%s\" (%d chars)", file, line, expectedBuf, (int)utext_nativeLength(&expectedText), buf, (int)utext_nativeLength(actual));
}
utext_close(&expectedText);
}
/**
* Check that the UText 'actual' matches 'expected', reporting the file and
* line of the invocation on failure.  Forwards to assertUText().
* Assumes utf-8 input
*/
#define REGEX_ASSERT_UTEXT_UTF8(expected, actual) assertUText((expected), (actual), __FILE__, __LINE__)
/**
* Check that the UText 'actual' matches 'expected', reporting the file and
* line of the invocation on failure.  Forwards to assertUTextInvariant().
* Assumes Invariant input
*/
#define REGEX_ASSERT_UTEXT_INVARIANT(expected, actual) assertUTextInvariant((expected), (actual), __FILE__, __LINE__)
/**
 * This buffer ( inv_buf ) is used to hold the UTF-8 strings
 * passed into utext_openUTF8. An error will be given if
 * INV_BUFSIZ is too small.  It's only used on EBCDIC systems.
 */
#define INV_BUFSIZ 2048 /* increase this if too small */

// Running total of bytes requested through regextst_openUTF8FromInvariant().
// On non-ASCII platforms it is also the next free offset in inv_buf.
static int64_t inv_next=0;

#if U_CHARSET_FAMILY!=U_ASCII_FAMILY
static char inv_buf[INV_BUFSIZ];
#endif

/**
 * Open a UTF-8 UText over invariant-character text.
 *
 * On ASCII-family platforms the input bytes are handed to utext_openUTF8()
 * directly.  Otherwise they are first converted with uprv_aestrncpy() into
 * the next free region of inv_buf, and the UText is opened over that copy;
 * space in inv_buf is never reclaimed.
 *
 * @param ut      UText to open; passed through to utext_openUTF8()
 * @param inv     the invariant-character text
 * @param length  length of inv in bytes, or -1 if it is NUL-terminated
 * @param status  ICU error code; set to U_MEMORY_ALLOCATION_ERROR if inv_buf
 *                has no room left for the converted text
 * @return the result of utext_openUTF8(), or NULL if inv_buf is exhausted
 */
static UText* regextst_openUTF8FromInvariant(UText *ut, const char *inv, int64_t length, UErrorCode *status) {
    if(length==-1) length=strlen(inv);
#if U_CHARSET_FAMILY==U_ASCII_FAMILY
    inv_next+=length;
    return utext_openUTF8(ut, inv, length, status);
#else
    if(inv_next+length+1>INV_BUFSIZ) {
        // The needed size is an int64_t; cast it so that it matches the %d conversion.
        fprintf(stderr, "%s:%d Error: INV_BUFSIZ #defined to be %d but needs to be at least %d.\n",
                __FILE__, __LINE__, INV_BUFSIZ, (int)(inv_next+length+1));
        *status = U_MEMORY_ALLOCATION_ERROR;
        return NULL;
    }
    unsigned char *buf = (unsigned char*)inv_buf+inv_next;
    uprv_aestrncpy(buf, (const uint8_t*)inv, length);
    inv_next+=length;
#if 0
    fprintf(stderr, " Note: INV_BUFSIZ at %d, used=%d\n", INV_BUFSIZ, (int)inv_next);
#endif
    return utext_openUTF8(ut, (const char*)buf, length, status);
#endif
}
//---------------------------------------------------------------------------
//
// REGEX_TESTLM Macro + invocation function to simplify writing quick tests
// for the LookingAt() and Match() functions.
//
// usage:
// REGEX_TESTLM("pattern", "input text", lookingAt expected, matches expected);
//
// The expected results are UBool - TRUE or FALSE.
// The input text is unescaped. The pattern is not.
//
// Each invocation runs the test twice, once through doRegexLMTest() and
// once through doRegexLMTestUTF8(), passing the __LINE__ of the invocation
// to both so that a failure can be traced back to its test case.
//
//---------------------------------------------------------------------------
#define REGEX_TESTLM(pat, text, looking, match) {doRegexLMTest(pat, text, looking, match, __LINE__);doRegexLMTestUTF8(pat, text, looking, match, __LINE__);}
/**
 * Compile 'pat', apply it to the unescaped form of 'text' held in a
 * UnicodeString, and check the results of lookingAt() and matches() against
 * the expected values.  Every mismatch or ICU error is reported with errln /
 * dataerrln, tagged with 'line'.
 *
 * @param pat      the pattern, invariant characters; used as-is (not unescaped)
 * @param text     the input text, invariant characters; unescape() is applied
 * @param looking  expected result of RegexMatcher::lookingAt()
 * @param match    expected result of RegexMatcher::matches()
 * @param line     source line of the test case, for failure messages
 * @return TRUE if everything behaved as expected, FALSE otherwise
 */
UBool RegexTest::doRegexLMTest(const char *pat, const char *text, UBool looking, UBool match, int32_t line) {
    UErrorCode status = U_ZERO_ERROR;
    UParseError pe;
    UBool retVal = TRUE;

    const UnicodeString patString(pat, -1, US_INV);
    RegexPattern *REPattern = RegexPattern::compile(patString, 0, pe, status);
    if (U_FAILURE(status)) {
        dataerrln("RegexTest failure in RegexPattern::compile() at line %d. Status = %s",
            line, u_errorName(status));
        delete REPattern;
        return FALSE;
    }

    // unEscapedInput is handed to the matcher by reference, so it is kept
    // alive as a named local until after the matcher has been deleted.
    const UnicodeString inputString(text, -1, US_INV);
    const UnicodeString unEscapedInput = inputString.unescape();
    RegexMatcher *REMatcher = REPattern->matcher(unEscapedInput, status);
    if (U_FAILURE(status)) {
        errln("RegexTest failure in REPattern::matcher() at line %d. Status = %s\n",
            line, u_errorName(status));
        // Release what was created so far; this path used to leak the pattern.
        delete REMatcher;
        delete REPattern;
        return FALSE;
    }

    UBool actualmatch;
    actualmatch = REMatcher->lookingAt(status);
    if (U_FAILURE(status)) {
        errln("RegexTest failure in lookingAt() at line %d. Status = %s\n",
            line, u_errorName(status));
        retVal = FALSE;
    }
    if (actualmatch != looking) {
        errln("RegexTest: wrong return from lookingAt() at line %d.\n", line);
        retVal = FALSE;
    }

    // matches() is checked independently of any error from lookingAt().
    status = U_ZERO_ERROR;
    actualmatch = REMatcher->matches(status);
    if (U_FAILURE(status)) {
        errln("RegexTest failure in matches() at line %d. Status = %s\n",
            line, u_errorName(status));
        retVal = FALSE;
    }
    if (actualmatch != match) {
        errln("RegexTest: wrong return from matches() at line %d.\n", line);
        retVal = FALSE;
    }

    if (retVal == FALSE) {
        RegexPatternDump(REPattern);
    }

    delete REMatcher;
    delete REPattern;
    return retVal;
}
/**
 * UTF-8 / UText flavor of doRegexLMTest(): compile 'pat' from a UTF-8 UText,
 * convert the unescaped 'text' to UTF-8, and check lookingAt() and matches()
 * against the expected values.
 *
 * If the input cannot be converted to UTF-8 the case is logged and treated as
 * passing, since that is not a failure of the regex engine.
 *
 * @param pat      the pattern, invariant characters; used as-is (not unescaped)
 * @param text     the input text, invariant characters; unescape() is applied
 * @param looking  expected result of RegexMatcher::lookingAt()
 * @param match    expected result of RegexMatcher::matches()
 * @param line     source line of the test case, for failure messages
 * @return TRUE if everything behaved as expected (or the input could not be
 *         converted), FALSE otherwise
 */
UBool RegexTest::doRegexLMTestUTF8(const char *pat, const char *text, UBool looking, UBool match, int32_t line) {
    UText pattern = UTEXT_INITIALIZER;
    UText inputText = UTEXT_INITIALIZER;
    char *textChars = NULL;
    UErrorCode status = U_ZERO_ERROR;
    UParseError pe;
    RegexPattern *REPattern = NULL;
    RegexMatcher *REMatcher = NULL;
    UBool retVal = TRUE;

    // Single-pass block: every early exit is a 'break', so that all paths run
    // through the one cleanup sequence at the bottom of the function.
    do {
        regextst_openUTF8FromInvariant(&pattern, pat, -1, &status);
        REPattern = RegexPattern::compile(&pattern, 0, pe, status);
        if (U_FAILURE(status)) {
            dataerrln("RegexTest failure in RegexPattern::compile() at line %d (UTF8). Status = %s\n",
                line, u_errorName(status));
            retVal = FALSE;
            break;
        }

        UnicodeString inputString(text, -1, US_INV);
        UnicodeString unEscapedInput = inputString.unescape();
        LocalUConverterPointer UTF8Converter(ucnv_open("UTF8", &status));
        ucnv_setFromUCallBack(UTF8Converter.getAlias(), UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &status);

        // Preflight: find out how many UTF-8 bytes the input needs.
        int32_t inputUTF8Length = unEscapedInput.extract(NULL, 0, UTF8Converter.getAlias(), status);
        if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR) {
            // UTF-8 does not allow unpaired surrogates, so this could actually happen
            logln("RegexTest unable to convert input to UTF8 at line %d. Status = %s\n", line, u_errorName(status));
            break;  // retVal is still TRUE: not a failure of the Regex engine
        }
        status = U_ZERO_ERROR; // buffer overflow

        textChars = new char[inputUTF8Length+1];
        unEscapedInput.extract(textChars, inputUTF8Length+1, UTF8Converter.getAlias(), status);
        utext_openUTF8(&inputText, textChars, inputUTF8Length, &status);

        // Check the matcher before using it instead of chaining ->reset()
        // straight onto the result of matcher().
        REMatcher = REPattern->matcher(status);
        if (U_FAILURE(status) || REMatcher == NULL) {
            errln("RegexTest failure in REPattern::matcher() at line %d (UTF8). Status = %s\n",
                line, u_errorName(status));
            retVal = FALSE;
            break;
        }
        REMatcher->reset(&inputText);

        UBool actualmatch;
        actualmatch = REMatcher->lookingAt(status);
        if (U_FAILURE(status)) {
            errln("RegexTest failure in lookingAt() at line %d (UTF8). Status = %s\n",
                line, u_errorName(status));
            retVal = FALSE;
        }
        if (actualmatch != looking) {
            errln("RegexTest: wrong return from lookingAt() at line %d (UTF8).\n", line);
            retVal = FALSE;
        }

        // matches() is checked independently of any error from lookingAt().
        status = U_ZERO_ERROR;
        actualmatch = REMatcher->matches(status);
        if (U_FAILURE(status)) {
            errln("RegexTest failure in matches() at line %d (UTF8). Status = %s\n",
                line, u_errorName(status));
            retVal = FALSE;
        }
        if (actualmatch != match) {
            errln("RegexTest: wrong return from matches() at line %d (UTF8).\n", line);
            retVal = FALSE;
        }

        if (retVal == FALSE) {
            RegexPatternDump(REPattern);
        }
    } while (FALSE);

    // Common cleanup.  Each call is safe on a NULL pointer or on a UText that
    // was never opened.
    delete REMatcher;
    delete REPattern;
    utext_close(&inputText);
    utext_close(&pattern);
    delete[] textChars;
    return retVal;
}
//---------------------------------------------------------------------------
//
// REGEX_ERR Macro + invocation function to simplify writing
// regex tests for incorrect patterns
//
// usage:
// REGEX_ERR("pattern", expected error line, column, expected status);
//
// The macro forwards to regex_err(), adding the __LINE__ of the invocation.
// Note that the expansion already ends with a semicolon.
//
//---------------------------------------------------------------------------
#define REGEX_ERR(pat, line, col, status) regex_err(pat, line, col, status, __LINE__);
/**
 * Compile a pattern that is expected to produce a particular status and
 * verify the resulting UErrorCode and, for errors, the line/offset reported
 * in UParseError.  The check is made twice: once compiling from a
 * UnicodeString and once compiling from a UTF-8 based UText.
 *
 * @param pat             the pattern to compile
 * @param errLine         expected UParseError.line when an error is reported
 * @param errCol          expected UParseError.offset when an error is reported
 * @param expectedStatus  the status the compile is expected to produce
 * @param line            source line of the test case, for failure messages
 */
void RegexTest::regex_err(const char *pat, int32_t errLine, int32_t errCol,
                          UErrorCode expectedStatus, int32_t line) {
    UErrorCode status = U_ZERO_ERROR;
    UParseError pe;
    RegexPattern *callerPattern = NULL;

    //
    // Compile the caller's pattern
    //
    UnicodeString patString(pat);
    callerPattern = RegexPattern::compile(patString, 0, pe, status);
    if (status != expectedStatus) {
        dataerrln("Line %d: unexpected error %s compiling pattern.", line, u_errorName(status));
    } else {
        if (status != U_ZERO_ERROR) {
            if (pe.line != errLine || pe.offset != errCol) {
                errln("Line %d: incorrect line/offset from UParseError. Expected %d/%d; got %d/%d.\n",
                    line, errLine, errCol, pe.line, pe.offset);
            }
        }
    }
    delete callerPattern;

    //
    // Compile again, using a UTF-8-based UText
    //
    // Start from a clean status.  Without this, the error left behind by the
    // first compile would be carried into the calls below, and the check that
    // follows would just re-examine the results of the first compile.
    status = U_ZERO_ERROR;
    UText patternText = UTEXT_INITIALIZER;
    regextst_openUTF8FromInvariant(&patternText, pat, -1, &status);
    callerPattern = RegexPattern::compile(&patternText, 0, pe, status);
    if (status != expectedStatus) {
        dataerrln("Line %d: unexpected error %s compiling pattern.", line, u_errorName(status));
    } else {
        if (status != U_ZERO_ERROR) {
            if (pe.line != errLine || pe.offset != errCol) {
                errln("Line %d: incorrect line/offset from UParseError. Expected %d/%d; got %d/%d.\n",
                    line, errLine, errCol, pe.line, pe.offset);
            }
        }
    }
    delete callerPattern;
    utext_close(&patternText);
}
//---------------------------------------------------------------------------
//
// Basic Check for basic functionality of regex pattern matching.
// Avoid the use of REGEX_FIND test macro, which has
// substantial dependencies on basic Regex functionality.
//
//---------------------------------------------------------------------------
// Each REGEX_TESTLM line below is one test case:
// (pattern, input text, expected lookingAt() result, expected matches() result).
// The macro runs every case through both doRegexLMTest() and doRegexLMTestUTF8().
void RegexTest::Basic() {
//
// Debug - slide failing test cases early
//
#if 0
{
// REGEX_TESTLM("a\N{LATIN SMALL LETTER B}c", "abc", FALSE, FALSE);
UParseError pe;
UErrorCode status = U_ZERO_ERROR;
RegexPattern *pattern;
pattern = RegexPattern::compile(UNICODE_STRING_SIMPLE("a\\u00dfx").unescape(), UREGEX_CASE_INSENSITIVE, pe, status);
RegexPatternDump(pattern);
RegexMatcher *m = pattern->matcher(UNICODE_STRING_SIMPLE("a\\u00dfxzzz").unescape(), status);
UBool result = m->find();
printf("result = %d\n", result);
// REGEX_FIND("", "<0>ab<1>cc</1><2>ccc</2></0>ddd");
// REGEX_FIND("(X([abc=X]+)+X)|(y[abc=]+)", "=XX====================");
}
exit(1);
#endif
//
// Pattern with parentheses
//
REGEX_TESTLM("st(abc)ring", "stabcring thing", TRUE, FALSE);
REGEX_TESTLM("st(abc)ring", "stabcring", TRUE, TRUE);
REGEX_TESTLM("st(abc)ring", "stabcrung", FALSE, FALSE);
//
// Patterns with *
//
REGEX_TESTLM("st(abc)*ring", "string", TRUE, TRUE);
REGEX_TESTLM("st(abc)*ring", "stabcring", TRUE, TRUE);
REGEX_TESTLM("st(abc)*ring", "stabcabcring", TRUE, TRUE);
REGEX_TESTLM("st(abc)*ring", "stabcabcdring", FALSE, FALSE);
REGEX_TESTLM("st(abc)*ring", "stabcabcabcring etc.", TRUE, FALSE);
REGEX_TESTLM("a*", "", TRUE, TRUE);
REGEX_TESTLM("a*", "b", TRUE, FALSE);
//
// Patterns with "."
//
REGEX_TESTLM(".", "abc", TRUE, FALSE);
REGEX_TESTLM("...", "abc", TRUE, TRUE);
REGEX_TESTLM("....", "abc", FALSE, FALSE);
REGEX_TESTLM(".*", "abcxyz123", TRUE, TRUE);
REGEX_TESTLM("ab.*xyz", "abcdefghij", FALSE, FALSE);
REGEX_TESTLM("ab.*xyz", "abcdefg...wxyz", TRUE, TRUE);
REGEX_TESTLM("ab.*xyz", "abcde...wxyz...abc..xyz", TRUE, TRUE);
REGEX_TESTLM("ab.*xyz", "abcde...wxyz...abc..xyz...", TRUE, FALSE);
//
// Patterns with * applied to chars at end of literal string
//
REGEX_TESTLM("abc*", "ab", TRUE, TRUE);
REGEX_TESTLM("abc*", "abccccc", TRUE, TRUE);
//
// Supplemental chars match as single chars, not a pair of surrogates.
//
REGEX_TESTLM(".", "\\U00011000", TRUE, TRUE);
REGEX_TESTLM("...", "\\U00011000x\\U00012002", TRUE, TRUE);
REGEX_TESTLM("...", "\\U00011000x\\U00012002y", TRUE, FALSE);
//
// UnicodeSets in the pattern
//
REGEX_TESTLM("[1-6]", "1", TRUE, TRUE);
REGEX_TESTLM("[1-6]", "3", TRUE, TRUE);
REGEX_TESTLM("[1-6]", "7", FALSE, FALSE);
REGEX_TESTLM("a[1-6]", "a3", TRUE, TRUE);
REGEX_TESTLM("a[1-6]", "a3", TRUE, TRUE);
REGEX_TESTLM("a[1-6]b", "a3b", TRUE, TRUE);
REGEX_TESTLM("a[0-9]*b", "a123b", TRUE, TRUE);
REGEX_TESTLM("a[0-9]*b", "abc", TRUE, FALSE);
REGEX_TESTLM("[\\p{Nd}]*", "123456", TRUE, TRUE);
REGEX_TESTLM("[\\p{Nd}]*", "a123456", TRUE, FALSE); // note that * matches 0 occurrences.
REGEX_TESTLM("[a][b][[:Zs:]]*", "ab ", TRUE, TRUE);
//
// OR operator in patterns
//
REGEX_TESTLM("(a|b)", "a", TRUE, TRUE);
REGEX_TESTLM("(a|b)", "b", TRUE, TRUE);
REGEX_TESTLM("(a|b)", "c", FALSE, FALSE);
REGEX_TESTLM("a|b", "b", TRUE, TRUE);
REGEX_TESTLM("(a|b|c)*", "aabcaaccbcabc", TRUE, TRUE);
REGEX_TESTLM("(a|b|c)*", "aabcaaccbcabdc", TRUE, FALSE);
REGEX_TESTLM("(a(b|c|d)(x|y|z)*|123)", "ac", TRUE, TRUE);
REGEX_TESTLM("(a(b|c|d)(x|y|z)*|123)", "123", TRUE, TRUE);
REGEX_TESTLM("(a|(1|2)*)(b|c|d)(x|y|z)*|123", "123", TRUE, TRUE);
REGEX_TESTLM("(a|(1|2)*)(b|c|d)(x|y|z)*|123", "222211111czzzzw", TRUE, FALSE);
//
// +
//
REGEX_TESTLM("ab+", "abbc", TRUE, FALSE);
REGEX_TESTLM("ab+c", "ac", FALSE, FALSE);
REGEX_TESTLM("b+", "", FALSE, FALSE);
REGEX_TESTLM("(abc|def)+", "defabc", TRUE, TRUE);
REGEX_TESTLM(".+y", "zippity dooy dah ", TRUE, FALSE);
REGEX_TESTLM(".+y", "zippity dooy", TRUE, TRUE);
//
// ?
//
REGEX_TESTLM("ab?", "ab", TRUE, TRUE);
REGEX_TESTLM("ab?", "a", TRUE, TRUE);
REGEX_TESTLM("ab?", "ac", TRUE, FALSE);
REGEX_TESTLM("ab?", "abb", TRUE, FALSE);
REGEX_TESTLM("a(b|c)?d", "abd", TRUE, TRUE);
REGEX_TESTLM("a(b|c)?d", "acd", TRUE, TRUE);
REGEX_TESTLM("a(b|c)?d", "ad", TRUE, TRUE);
REGEX_TESTLM("a(b|c)?d", "abcd", FALSE, FALSE);
REGEX_TESTLM("a(b|c)?d", "ab", FALSE, FALSE);
//
// Escape sequences that become single literal chars, handled internally
// by ICU's Unescape.
//
// REGEX_TESTLM("\101\142", "Ab", TRUE, TRUE); // Octal TODO: not implemented yet.
REGEX_TESTLM("\\a", "\\u0007", TRUE, TRUE); // BEL
REGEX_TESTLM("\\cL", "\\u000c", TRUE, TRUE); // Control-L
REGEX_TESTLM("\\e", "\\u001b", TRUE, TRUE); // Escape
REGEX_TESTLM("\\f", "\\u000c", TRUE, TRUE); // Form Feed
REGEX_TESTLM("\\n", "\\u000a", TRUE, TRUE); // new line
REGEX_TESTLM("\\r", "\\u000d", TRUE, TRUE); // CR
REGEX_TESTLM("\\t", "\\u0009", TRUE, TRUE); // Tab
REGEX_TESTLM("\\u1234", "\\u1234", TRUE, TRUE);
REGEX_TESTLM("\\U00001234", "\\u1234", TRUE, TRUE);
REGEX_TESTLM(".*\\Ax", "xyz", TRUE, FALSE); // \A matches only at the beginning of input
REGEX_TESTLM(".*\\Ax", " xyz", FALSE, FALSE); // \A matches only at the beginning of input
// Escape of special chars in patterns
REGEX_TESTLM("\\\\\\|\\(\\)\\[\\{\\~\\$\\*\\+\\?\\.", "\\\\|()[{~$*+?.", TRUE, TRUE);
}
//---------------------------------------------------------------------------
//
// UTextBasic Check for quirks that are specific to the UText
// implementation.
//
//---------------------------------------------------------------------------
void RegexTest::UTextBasic() {
const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
UErrorCode status = U_ZERO_ERROR;
UText pattern = UTEXT_INITIALIZER;
utext_openUTF8(&pattern, str_abc, -1, &status);
RegexMatcher matcher(&pattern, 0, status);
REGEX_CHECK_STATUS;
UText input = UTEXT_INITIALIZER;
utext_openUTF8(&input, str_abc, -1, &status);
REGEX_CHECK_STATUS;
matcher.reset(&input);
REGEX_CHECK_STATUS;
REGEX_ASSERT_UTEXT_UTF8(str_abc, matcher.inputText());
matcher.reset(matcher.inputText());
REGEX_CHECK_STATUS;
REGEX_ASSERT_UTEXT_UTF8(str_abc, matcher.inputText());
utext_close(&pattern);
utext_close(&input);
}
//---------------------------------------------------------------------------
//
// API_Match Test that the API for class RegexMatcher
// is present and nominally working, but excluding functions
// implementing replace operations.
//
//---------------------------------------------------------------------------
void RegexTest::API_Match() {
UParseError pe;
UErrorCode status=U_ZERO_ERROR;
int32_t flags = 0;
//
// Debug - slide failing test cases early
//
#if 0
{
}
return;
#endif
//
// Simple pattern compilation
//
{
UnicodeString re("abc");
RegexPattern *pat2;
pat2 = RegexPattern::compile(re, flags, pe, status);
REGEX_CHECK_STATUS;
UnicodeString inStr1 = "abcdef this is a test";
UnicodeString instr2 = "not abc";
UnicodeString empty = "";
//
// Matcher creation and reset.
//
RegexMatcher *m1 = pat2->matcher(inStr1, status);
REGEX_CHECK_STATUS;
REGEX_ASSERT(m1->lookingAt(status) == TRUE);
REGEX_ASSERT(m1->input() == inStr1);
m1->reset(instr2);
REGEX_ASSERT(m1->lookingAt(status) == FALSE);
REGEX_ASSERT(m1->input() == instr2);
m1->reset(inStr1);
REGEX_ASSERT(m1->input() == inStr1);
REGEX_ASSERT(m1->lookingAt(status) == TRUE);
m1->reset(empty);
REGEX_ASSERT(m1->lookingAt(status) == FALSE);
REGEX_ASSERT(m1->input() == empty);
REGEX_ASSERT(&m1->pattern() == pat2);
//
// reset(pos, status)
//
m1->reset(inStr1);
m1->reset(4, status);
REGEX_CHECK_STATUS;
REGEX_ASSERT(m1->input() == inStr1);
REGEX_ASSERT(m1->lookingAt(status) == TRUE);
m1->reset(-1, status);
REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
status = U_ZERO_ERROR;
m1->reset(0, status);
REGEX_CHECK_STATUS;
status = U_ZERO_ERROR;
int32_t len = m1->input().length();
m1->reset(len-1, status);
REGEX_CHECK_STATUS;
status = U_ZERO_ERROR;
m1->reset(len, status);
REGEX_CHECK_STATUS;
status = U_ZERO_ERROR;
m1->reset(len+1, status);
REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
status = U_ZERO_ERROR;
//
// match(pos, status)
//
m1->reset(instr2);
REGEX_ASSERT(m1->matches(4, status) == TRUE);
m1->reset();
REGEX_ASSERT(m1->matches(3, status) == FALSE);
m1->reset();
REGEX_ASSERT(m1->matches(5, status) == FALSE);
REGEX_ASSERT(m1->matches(4, status) == TRUE);
REGEX_ASSERT(m1->matches(-1, status) == FALSE);
REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
// Match() at end of string should fail, but should not
// be an error.
status = U_ZERO_ERROR;
len = m1->input().length();
REGEX_ASSERT(m1->matches(len, status) == FALSE);
REGEX_CHECK_STATUS;
// Match beyond end of string should fail with an error.
status = U_ZERO_ERROR;
REGEX_ASSERT(m1->matches(len+1, status) == FALSE);
REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
// Successful match at end of string.
{
status = U_ZERO_ERROR;
RegexMatcher m("A?", 0, status); // will match zero length string.
REGEX_CHECK_STATUS;
m.reset(inStr1);
len = inStr1.length();
REGEX_ASSERT(m.matches(len, status) == TRUE);
REGEX_CHECK_STATUS;
m.reset(empty);
REGEX_ASSERT(m.matches(0, status) == TRUE);
REGEX_CHECK_STATUS;
}
//
// lookingAt(pos, status)
//
status = U_ZERO_ERROR;
m1->reset(instr2); // "not abc"
REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
REGEX_ASSERT(m1->lookingAt(5, status) == FALSE);
REGEX_ASSERT(m1->lookingAt(3, status) == FALSE);
REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
REGEX_ASSERT(m1->lookingAt(-1, status) == FALSE);
REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
status = U_ZERO_ERROR;
len = m1->input().length();
REGEX_ASSERT(m1->lookingAt(len, status) == FALSE);
REGEX_CHECK_STATUS;
REGEX_ASSERT(m1->lookingAt(len+1, status) == FALSE);
REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
delete m1;
delete pat2;
}
//
// Capture Group.
// RegexMatcher::start();
// RegexMatcher::end();
// RegexMatcher::groupCount();
//
{
int32_t flags=0;
UParseError pe;
UErrorCode status=U_ZERO_ERROR;
UnicodeString re("01(23(45)67)(.*)");
RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
REGEX_CHECK_STATUS;
UnicodeString data = "0123456789";
RegexMatcher *matcher = pat->matcher(data, status);
REGEX_CHECK_STATUS;
REGEX_ASSERT(matcher->lookingAt(status) == TRUE);
static const int32_t matchStarts[] = {0, 2, 4, 8};
static const int32_t matchEnds[] = {10, 8, 6, 10};
int32_t i;
for (i=0; i<4; i++) {
int32_t actualStart = matcher->start(i, status);
REGEX_CHECK_STATUS;
if (actualStart != matchStarts[i]) {
errln("RegexTest failure at line %d, index %d. Expected %d, got %d\n",
__LINE__, i, matchStarts[i], actualStart);
}
int32_t actualEnd = matcher->end(i, status);
REGEX_CHECK_STATUS;
if (actualEnd != matchEnds[i]) {
errln("RegexTest failure at line %d index %d. Expected %d, got %d\n",
__LINE__, i, matchEnds[i], actualEnd);
}
}
REGEX_ASSERT(matcher->start(0, status) == matcher->start(status));
REGEX_ASSERT(matcher->end(0, status) == matcher->end(status));
REGEX_ASSERT_FAIL(matcher->start(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
REGEX_ASSERT_FAIL(matcher->start( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
matcher->reset();
REGEX_ASSERT_FAIL(matcher->start( 0, status), U_REGEX_INVALID_STATE);
matcher->lookingAt(status);
REGEX_ASSERT(matcher->group(status) == "0123456789");
REGEX_ASSERT(matcher->group(0, status) == "0123456789");
REGEX_ASSERT(matcher->group(1, status) == "234567" );
REGEX_ASSERT(matcher->group(2, status) == "45" );
REGEX_ASSERT(matcher->group(3, status) == "89" );
REGEX_CHECK_STATUS;
REGEX_ASSERT_FAIL(matcher->group(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
REGEX_ASSERT_FAIL(matcher->group( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
matcher->reset();
REGEX_ASSERT_FAIL(matcher->group( 0, status), U_REGEX_INVALID_STATE);
delete matcher;
delete pat;
}
//
// find
//
{
int32_t flags=0;
UParseError pe;
UErrorCode status=U_ZERO_ERROR;
UnicodeString re("abc");
RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
REGEX_CHECK_STATUS;
UnicodeString data = ".abc..abc...abc..";
// 012345678901234567
RegexMatcher *matcher = pat->matcher(data, status);
REGEX_CHECK_STATUS;
REGEX_ASSERT(matcher->find());
REGEX_ASSERT(matcher->start(status) == 1);
REGEX_ASSERT(matcher->find());
REGEX_ASSERT(matcher->start(status) == 6);
REGEX_ASSERT(matcher->find());
REGEX_ASSERT(matcher->start(status) == 12);
REGEX_ASSERT(matcher->find() == FALSE);
REGEX_ASSERT(matcher->find() == FALSE);
matcher->reset();
REGEX_ASSERT(matcher->find());
REGEX_ASSERT(matcher->start(status) == 1);
REGEX_ASSERT(matcher->find(0, status));
REGEX_ASSERT(matcher->start(status) == 1);
REGEX_ASSERT(matcher->find(1, status));
REGEX_ASSERT(matcher->start(status) == 1);
REGEX_ASSERT(matcher->find(2, status));
REGEX_ASSERT(matcher->start(status) == 6);
REGEX_ASSERT(matcher->find(12, status));
REGEX_ASSERT(matcher->start(status) == 12);
REGEX_ASSERT(matcher->find(13, status) == FALSE);
REGEX_ASSERT(matcher->find(16, status) == FALSE);
REGEX_ASSERT(matcher->find(17, status) == FALSE);
REGEX_ASSERT_FAIL(matcher->start(status), U_REGEX_INVALID_STATE);
status = U_ZERO_ERROR;
REGEX_ASSERT_FAIL(matcher->find(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
status = U_ZERO_ERROR;
REGEX_ASSERT_FAIL(matcher->find(18, status), U_INDEX_OUTOFBOUNDS_ERROR);
REGEX_ASSERT(matcher->groupCount() == 0);
delete matcher;
delete pat;
}
//
// find, with \G in pattern (true if at the end of a previous match).
//
{
int32_t flags=0;
UParseError pe;
UErrorCode status=U_ZERO_ERROR;
UnicodeString re(".*?(?:(\\Gabc)|(abc))", -1, US_INV);
RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
REGEX_CHECK_STATUS;
UnicodeString data = ".abcabc.abc..";
// 012345678901234567
RegexMatcher *matcher = pat->matcher(data, status);
REGEX_CHECK_STATUS;
REGEX_ASSERT(matcher->find());
REGEX_ASSERT(matcher->start(status) == 0);
REGEX_ASSERT(matcher->start(1, status) == -1);
REGEX_ASSERT(matcher->start(2, status) == 1);
REGEX_ASSERT(matcher->find());
REGEX_ASSERT(matcher->start(status) == 4);
REGEX_ASSERT(matcher->start(1, status) == 4);
REGEX_ASSERT(matcher->start(2, status) == -1);
REGEX_CHECK_STATUS;
delete matcher;
delete pat;
}
//
// find with zero length matches, match position should bump ahead
// to prevent loops.
//
{
int32_t i;
UErrorCode status=U_ZERO_ERROR;
RegexMatcher m("(?= ?)", 0, status); // This pattern will zero-length matches anywhere,
// using an always-true look-ahead.
REGEX_CHECK_STATUS;
UnicodeString s(" ");
m.reset(s);
for (i=0; ; i++) {
if (m.find() == FALSE) {
break;
}
REGEX_ASSERT(m.start(status) == i);
REGEX_ASSERT(m.end(status) == i);
}
REGEX_ASSERT(i==5);
// Check that the bump goes over surrogate pairs OK
s = UNICODE_STRING_SIMPLE("\\U00010001\\U00010002\\U00010003\\U00010004");
s = s.unescape();
m.reset(s);
for (i=0; ; i+=2) {
if (m.find() == FALSE) {
break;
}
REGEX_ASSERT(m.start(status) == i);
REGEX_ASSERT(m.end(status) == i);
}
REGEX_ASSERT(i==10);
}
{
// find() loop breaking test.
// with pattern of /.?/, should see a series of one char matches, then a single
// match of zero length at the end of the input string.
int32_t i;
UErrorCode status=U_ZERO_ERROR;
RegexMatcher m(".?", 0, status);
REGEX_CHECK_STATUS;
UnicodeString s(" ");
m.reset(s);
for (i=0; ; i++) {
if (m.find() == FALSE) {
break;
}
REGEX_ASSERT(m.start(status) == i);
REGEX_ASSERT(m.end(status) == (i<4 ? i+1 : i));
}
REGEX_ASSERT(i==5);
}
//
// Matchers with no input string behave as if they had an empty input string.
//
{
UErrorCode status = U_ZERO_ERROR;
RegexMatcher m(".?", 0, status);
REGEX_CHECK_STATUS;
REGEX_ASSERT(m.find());
REGEX_ASSERT(m.start(status) == 0);
REGEX_ASSERT(m.input() == "");
}
{
UErrorCode status = U_ZERO_ERROR;
RegexPattern *p = RegexPattern::compile(".", 0, status);
RegexMatcher *m = p->matcher(status);
REGEX_CHECK_STATUS;
REGEX_ASSERT(m->find() == FALSE);
REGEX_ASSERT(m->input() == "");
delete m;
delete p;
}
//
// Regions
//
{
UErrorCode status = U_ZERO_ERROR;
UnicodeString testString("This is test data");
RegexMatcher m(".*", testString, 0, status);
REGEX_CHECK_STATUS;
REGEX_ASSERT(m.regionStart() == 0);
REGEX_ASSERT(m.regionEnd() == testString.length());
REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
m.region(2,4, status);
REGEX_CHECK_STATUS;
REGEX_ASSERT(m.matches(status));
REGEX_ASSERT(m.start(status)==2);
REGEX_ASSERT(m.end(status)==4);
REGEX_CHECK_STATUS;
m.reset();
REGEX_ASSERT(m.regionStart() == 0);
REGEX_ASSERT(m.regionEnd() == testString.length());
UnicodeString shorterString("short");
m.reset(shorterString);
REGEX_ASSERT(m.regionStart() == 0);
REGEX_ASSERT(m.regionEnd() == shorterString.length());
REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
REGEX_ASSERT(&m == &m.useAnchoringBounds(FALSE));
REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
REGEX_ASSERT(&m == &m.reset());
REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
REGEX_ASSERT(&m == &m.useAnchoringBounds(TRUE));
REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
REGEX_ASSERT(&m == &m.reset());
REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
REGEX_ASSERT(&m == &m.useTransparentBounds(TRUE));
REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
REGEX_ASSERT(&m == &m.reset());
REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
REGEX_ASSERT(&m == &m.useTransparentBounds(FALSE));
REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
REGEX_ASSERT(&m == &m.reset());
REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
}
//
// hitEnd() and requireEnd()
//
{
UErrorCode status = U_ZERO_ERROR;
UnicodeString testString("aabb");
RegexMatcher m1(".*", testString, 0, status);
REGEX_ASSERT(m1.lookingAt(status) == TRUE);
REGEX_ASSERT(m1.hitEnd() == TRUE);
REGEX_ASSERT(m1.requireEnd() == FALSE);
REGEX_CHECK_STATUS;
status = U_ZERO_ERROR;
RegexMatcher m2("a*", testString, 0, status);
REGEX_ASSERT(m2.lookingAt(status) == TRUE);
REGEX_ASSERT(m2.hitEnd() == FALSE);
REGEX_ASSERT(m2.requireEnd() == FALSE);
REGEX_CHECK_STATUS;
status = U_ZERO_ERROR;
RegexMatcher m3(".*$", testString, 0, status);
REGEX_ASSERT(m3.lookingAt(status) == TRUE);
REGEX_ASSERT(m3.hitEnd() == TRUE);
REGEX_ASSERT(m3.requireEnd() == TRUE);
REGEX_CHECK_STATUS;
}
//
// Compilation error on reset with UChar *
// These were a hazard that people were stumbling over with runtime errors.
// Changed them to compiler errors by adding private methods that more closely
// matched the incorrect use of the functions.
//
#if 0
{
UErrorCode status = U_ZERO_ERROR;
UChar ucharString[20];
RegexMatcher m(".", 0, status);
m.reset(ucharString); // should not compile.
RegexPattern *p = RegexPattern::compile(".", 0, status);
RegexMatcher *m2 = p->matcher(ucharString, status); // should not compile.
RegexMatcher m3(".", ucharString, 0, status); // Should not compile
}
#endif
//
// Time Outs.
// Note: These tests will need to be changed when the regexp engine is
// able to detect and cut short the exponential time behavior on
// this type of match.
//
{
UErrorCode status = U_ZERO_ERROR;
// Enough 'a's in the string to cause the match to time out.
// (Each additional 'a' doubles the time)
UnicodeString testString("aaaaaaaaaaaaaaaaaaaaa");
RegexMatcher matcher("(a+)+b", testString, 0, status);
REGEX_CHECK_STATUS;
REGEX_ASSERT(matcher.getTimeLimit() == 0);
matcher.setTimeLimit(100, status);
REGEX_ASSERT(matcher.getTimeLimit() == 100);
REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
REGEX_ASSERT(status == U_REGEX_TIME_OUT);
}
{
UErrorCode status = U_ZERO_ERROR;
// Few enough 'a's to slip in under the time limit.
UnicodeString testString("aaaaaaaaaaaaaaaaaa");
RegexMatcher matcher("(a+)+b", testString, 0, status);
REGEX_CHECK_STATUS;
matcher.setTimeLimit(100, status);
REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
REGEX_CHECK_STATUS;
}
//
// Stack Limits
//
{
UErrorCode status = U_ZERO_ERROR;
UnicodeString testString(1000000, 0x41, 1000000); // Length 1,000,000, filled with 'A'
// Adding the capturing parentheses to the pattern "(A)+A$" inhibits optimizations
// of the '+', and makes the stack frames larger.
RegexMatcher matcher("(A)+A$", testString, 0, status);
// With the default stack, this match should fail to run
REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
REGEX_ASSERT(status == U_REGEX_STACK_OVERFLOW);
// With unlimited stack, it should run
status = U_ZERO_ERROR;
matcher.setStackLimit(0, status);
REGEX_CHECK_STATUS;
REGEX_ASSERT(matcher.lookingAt(status) == TRUE);
REGEX_CHECK_STATUS;
REGEX_ASSERT(matcher.getStackLimit() == 0);
// With a limited stack, the match should fail
status = U_ZERO_ERROR;
matcher.setStackLimit(10000, status);
REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
REGEX_ASSERT(status == U_REGEX_STACK_OVERFLOW);
REGEX_ASSERT(matcher.getStackLimit() == 10000);
}
// A pattern that doesn't save state should work with
// a minimal sized stack
{
UErrorCode status = U_ZERO_ERROR;
UnicodeString testString = "abc";
RegexMatcher matcher("abc", testString, 0, status);
REGEX_CHECK_STATUS;
matcher.setStackLimit(30, status);
REGEX_CHECK_STATUS;
REGEX_ASSERT(matcher.matches(status) == TRUE);
REGEX_CHECK_STATUS;
REGEX_ASSERT(matcher.getStackLimit() == 30);
// Negative stack sizes should fail
status = U_ZERO_ERROR;
matcher.setStackLimit(1000, status);
REGEX_CHECK_STATUS;
matcher.setStackLimit(-1, status);
REGEX_ASSERT(status == U_ILLEGAL_ARGUMENT_ERROR);
REGEX_ASSERT(matcher.getStackLimit() == 1000);
}
}
//---------------------------------------------------------------------------
//
// API_Replace API test for class RegexMatcher, testing the
// Replace family of functions.
//
//---------------------------------------------------------------------------
void RegexTest::API_Replace() {
    //
    //  Exercises RegexMatcher::replaceFirst(), replaceAll(), appendReplacement()
    //  and appendTail() on UnicodeString input.  Two pattern/matcher pairs are
    //  created up front and shared by the sections below:
    //     pat  / matcher    pattern "abc"     (no capture groups)
    //     pat2 / matcher2   pattern "a(..)"   (one capture group)
    //  All four objects are deleted at the end of the function.
    //

    //
    //  Replace
    //
    int32_t             flags=0;
    UParseError         pe;
    UErrorCode          status=U_ZERO_ERROR;

    UnicodeString       re("abc");
    RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
    REGEX_CHECK_STATUS;
    UnicodeString data = ".abc..abc...abc..";
    //                    012345678901234567
    RegexMatcher *matcher = pat->matcher(data, status);
    // Check the status of matcher creation before the matcher is used,
    // mirroring the check that follows the creation of matcher2 below.
    REGEX_CHECK_STATUS;

    //
    //  Plain vanilla matches.
    //  Each replace call below is checked against a result computed from the
    //  full, unmodified input string.
    //
    UnicodeString  dest;
    dest = matcher->replaceFirst("yz", status);
    REGEX_CHECK_STATUS;
    REGEX_ASSERT(dest == ".yz..abc...abc..");

    dest = matcher->replaceAll("yz", status);
    REGEX_CHECK_STATUS;
    REGEX_ASSERT(dest == ".yz..yz...yz..");

    //
    //  Plain vanilla non-matches.  The input is expected back unchanged.
    //
    UnicodeString d2 = ".abx..abx...abx..";
    matcher->reset(d2);
    dest = matcher->replaceFirst("yz", status);
    REGEX_CHECK_STATUS;
    REGEX_ASSERT(dest == ".abx..abx...abx..");

    dest = matcher->replaceAll("yz", status);
    REGEX_CHECK_STATUS;
    REGEX_ASSERT(dest == ".abx..abx...abx..");

    //
    // Empty source string
    //
    UnicodeString d3 = "";
    matcher->reset(d3);
    dest = matcher->replaceFirst("yz", status);
    REGEX_CHECK_STATUS;
    REGEX_ASSERT(dest == "");

    dest = matcher->replaceAll("yz", status);
    REGEX_CHECK_STATUS;
    REGEX_ASSERT(dest == "");

    //
    // Empty substitution string
    //
    matcher->reset(data);              // ".abc..abc...abc.."
    dest = matcher->replaceFirst("", status);
    REGEX_CHECK_STATUS;
    REGEX_ASSERT(dest == "...abc...abc..");

    dest = matcher->replaceAll("", status);
    REGEX_CHECK_STATUS;
    REGEX_ASSERT(dest == "........");

    //
    // match whole string
    //
    UnicodeString d4 = "abc";
    matcher->reset(d4);
    dest = matcher->replaceFirst("xyz", status);
    REGEX_CHECK_STATUS;
    REGEX_ASSERT(dest == "xyz");

    dest = matcher->replaceAll("xyz", status);
    REGEX_CHECK_STATUS;
    REGEX_ASSERT(dest == "xyz");

    //
    // Capture Group, simple case
    //
    UnicodeString       re2("a(..)");
    RegexPattern *pat2 = RegexPattern::compile(re2, flags, pe, status);
    REGEX_CHECK_STATUS;
    UnicodeString d5 = "abcdefg";
    RegexMatcher *matcher2 = pat2->matcher(d5, status);
    REGEX_CHECK_STATUS;

    // "$1" in the replacement text is expected to expand to capture group 1 ("bc").
    dest = matcher2->replaceFirst("$1$1", status);
    REGEX_CHECK_STATUS;
    REGEX_ASSERT(dest == "bcbcdefg");

    // A backslash-escaped "$" is expected to come through as a literal dollar sign.
    dest = matcher2->replaceFirst(UNICODE_STRING_SIMPLE("The value of \\$1 is $1."), status);
    REGEX_CHECK_STATUS;
    REGEX_ASSERT(dest == "The value of $1 is bc.defg");

    // A "$" that is not followed by a group number is expected to be copied literally.
    dest = matcher2->replaceFirst("$ by itself, no group number $$$", status);
    REGEX_CHECK_STATUS;
    REGEX_ASSERT(dest == "$ by itself, no group number $$$defg");

    // The group number is written with the supplementary code point U+1D7CF
    // rather than an ASCII '1'; the expected result still substitutes group 1.
    UnicodeString replacement = UNICODE_STRING_SIMPLE("Supplemental Digit 1 $\\U0001D7CF.");
    replacement = replacement.unescape();
    dest = matcher2->replaceFirst(replacement, status);
    REGEX_CHECK_STATUS;
    REGEX_ASSERT(dest == "Supplemental Digit 1 bc.defg");

    // The pattern has only one capture group, so a reference to group 5 must fail.
    REGEX_ASSERT_FAIL(matcher2->replaceFirst("bad capture group number $5...",status), U_INDEX_OUTOFBOUNDS_ERROR);

    //
    // Replacement String with \u hex escapes
    //
    {
        UnicodeString  src = "abc 1 abc 2 abc 3";
        UnicodeString  substitute = UNICODE_STRING_SIMPLE("--\\u0043--");
        matcher->reset(src);
        UnicodeString  result = matcher->replaceAll(substitute, status);
        REGEX_CHECK_STATUS;
        REGEX_ASSERT(result == "--C-- 1 --C-- 2 --C-- 3");
    }
    {
        UnicodeString  src = "abc !";
        UnicodeString  substitute = UNICODE_STRING_SIMPLE("--\\U00010000--");
        matcher->reset(src);
        UnicodeString  result = matcher->replaceAll(substitute, status);
        REGEX_CHECK_STATUS;
        // Build the expected text by appending the supplementary code point directly.
        UnicodeString expected = UnicodeString("--");
        expected.append((UChar32)0x10000);
        expected.append("-- !");
        REGEX_ASSERT(result == expected);
    }
    // TODO:  need more thorough testing of capture substitutions.

    // Bug 4057
    //
    {
        status = U_ZERO_ERROR;
        UnicodeString s = "The matches start with ss and end with ee ss stuff ee fin";
        RegexMatcher m("ss(.*?)ee", 0, status);
        REGEX_CHECK_STATUS;
        UnicodeString result;

        // Multiple finds do NOT bump up the previous appendReplacement position.
        // The find() results are asserted: the expected output below is only
        // meaningful if both matches were actually found.
        m.reset(s);
        REGEX_ASSERT(m.find());
        REGEX_ASSERT(m.find());
        m.appendReplacement(result, "ooh", status);
        REGEX_CHECK_STATUS;
        REGEX_ASSERT(result == "The matches start with ss and end with ee ooh");

        // After a reset into the interior of a string, appendReplacement still starts at beginning.
        status = U_ZERO_ERROR;
        result.truncate(0);
        m.reset(10, status);
        REGEX_ASSERT(m.find());
        REGEX_ASSERT(m.find());
        m.appendReplacement(result, "ooh", status);
        REGEX_CHECK_STATUS;
        REGEX_ASSERT(result == "The matches start with ss and end with ee ooh");

        // find() at interior of string, appendReplacement still starts at beginning.
        status = U_ZERO_ERROR;
        result.truncate(0);
        m.reset();
        REGEX_ASSERT(m.find(10, status));
        REGEX_ASSERT(m.find());
        m.appendReplacement(result, "ooh", status);
        REGEX_CHECK_STATUS;
        REGEX_ASSERT(result == "The matches start with ss and end with ee ooh");

        // appendTail() adds the input text that follows the last match.
        m.appendTail(result);
        REGEX_ASSERT(result == "The matches start with ss and end with ee ooh fin");
    }

    delete matcher2;
    delete pat2;
    delete matcher;
    delete pat;
}
//---------------------------------------------------------------------------
//
// API_Pattern Test that the API for class RegexPattern is
// present and nominally working.
//
//---------------------------------------------------------------------------
void RegexTest::API_Pattern() {
    //
    //  Exercises the RegexPattern API: default construction, equality,
    //  assignment, copy construction, compile() with and without flags,
    //  clone(), the static matches() convenience function, split(),
    //  pattern() and the class ID functions.
    //
    RegexPattern        pata;    // Test default constructor to not crash.
    RegexPattern        patb;

    REGEX_ASSERT(pata == patb);
    REGEX_ASSERT(pata == pata);

    UnicodeString re1("abc[a-l][m-z]");
    UnicodeString re2("def");
    UErrorCode    status = U_ZERO_ERROR;
    UParseError   pe;

    RegexPattern        *pat1 = RegexPattern::compile(re1, 0, pe, status);
    RegexPattern        *pat2 = RegexPattern::compile(re2, 0, pe, status);
    REGEX_CHECK_STATUS;
    REGEX_ASSERT(*pat1 == *pat1);
    REGEX_ASSERT(*pat1 != pata);

    // Assign
    patb = *pat1;
    REGEX_ASSERT(patb == *pat1);

    // Copy Construct
    RegexPattern patc(*pat1);
    REGEX_ASSERT(patc == *pat1);
    REGEX_ASSERT(patb == patc);
    // Compare the pattern objects, not the pointers: two distinct heap
    // allocations always have different addresses, so a pointer comparison
    // would pass without testing RegexPattern's operator != at all.
    REGEX_ASSERT(*pat1 != *pat2);
    patb = *pat2;
    REGEX_ASSERT(patb != patc);
    REGEX_ASSERT(patb == *pat2);

    // Compile with no flags.
    RegexPattern         *pat1a = RegexPattern::compile(re1, pe, status);
    // Check the compile status before pat1a is dereferenced.
    REGEX_CHECK_STATUS;
    REGEX_ASSERT(*pat1a == *pat1);

    REGEX_ASSERT(pat1a->flags() == 0);

    // Compile with different flags should be not equal
    RegexPattern        *pat1b = RegexPattern::compile(re1, UREGEX_CASE_INSENSITIVE, pe, status);
    REGEX_CHECK_STATUS;

    REGEX_ASSERT(*pat1b != *pat1a);
    REGEX_ASSERT(pat1b->flags() == UREGEX_CASE_INSENSITIVE);
    REGEX_ASSERT(pat1a->flags() == 0);
    delete pat1b;

    // clone
    RegexPattern *pat1c = pat1->clone();
    REGEX_ASSERT(*pat1c == *pat1);
    REGEX_ASSERT(*pat1c != *pat2);

    delete pat1c;
    delete pat1a;
    delete pat1;
    delete pat2;

    //
    //   Verify that a matcher created from a cloned pattern works.
    //     (Jitterbug 3423)
    //
    {
        UErrorCode     status     = U_ZERO_ERROR;
        RegexPattern  *pSource    = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\p{L}+"), 0, status);
        // Check the compile status before pSource is dereferenced.
        REGEX_CHECK_STATUS;
        RegexPattern  *pClone     = pSource->clone();
        // The source pattern is deleted first, so the matcher below is
        // created from and run against the clone alone.
        delete         pSource;
        RegexMatcher  *mFromClone = pClone->matcher(status);
        REGEX_CHECK_STATUS;
        UnicodeString s = "Hello World";
        mFromClone->reset(s);
        REGEX_ASSERT(mFromClone->find() == TRUE);
        REGEX_ASSERT(mFromClone->group(status) == "Hello");
        REGEX_ASSERT(mFromClone->find() == TRUE);
        REGEX_ASSERT(mFromClone->group(status) == "World");
        REGEX_ASSERT(mFromClone->find() == FALSE);
        delete mFromClone;
        delete pClone;
    }

    //
    //   matches convenience API
    //   The assertions below expect TRUE only when the pattern matches the
    //   entire input string.
    //
    REGEX_ASSERT(RegexPattern::matches(".*", "random input", pe, status) == TRUE);
    REGEX_CHECK_STATUS;
    REGEX_ASSERT(RegexPattern::matches("abc", "random input", pe, status) == FALSE);
    REGEX_CHECK_STATUS;
    REGEX_ASSERT(RegexPattern::matches(".*nput", "random input", pe, status) == TRUE);
    REGEX_CHECK_STATUS;
    REGEX_ASSERT(RegexPattern::matches("random input", "random input", pe, status) == TRUE);
    REGEX_CHECK_STATUS;
    REGEX_ASSERT(RegexPattern::matches(".*u", "random input", pe, status) == FALSE);
    REGEX_CHECK_STATUS;

    // With an error status on entry, matches() is expected to return FALSE
    // and to leave the incoming error code untouched.
    status = U_INDEX_OUTOFBOUNDS_ERROR;
    REGEX_ASSERT(RegexPattern::matches("abc", "abc", pe, status) == FALSE);
    REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);

    //
    // Split()
    //
    status = U_ZERO_ERROR;
    pat1 = RegexPattern::compile(" +",  pe, status);
    REGEX_CHECK_STATUS;
    UnicodeString  fields[10];

    int32_t n;
    n = pat1->split("Now is the time", fields, 10, status);
    REGEX_CHECK_STATUS;
    REGEX_ASSERT(n==4);
    REGEX_ASSERT(fields[0]=="Now");
    REGEX_ASSERT(fields[1]=="is");
    REGEX_ASSERT(fields[2]=="the");
    REGEX_ASSERT(fields[3]=="time");
    REGEX_ASSERT(fields[4]=="");

    // With a capacity of 2, the last field receives all of the remaining input.
    n = pat1->split("Now is the time", fields, 2, status);
    REGEX_CHECK_STATUS;
    REGEX_ASSERT(n==2);
    REGEX_ASSERT(fields[0]=="Now");
    REGEX_ASSERT(fields[1]=="is the time");
    REGEX_ASSERT(fields[2]=="the");   // left over from previous test

    // With a capacity of 1, the whole input goes to fields[0]; the marker
    // stored in fields[1] must not be overwritten.
    fields[1] = "*";
    status = U_ZERO_ERROR;
    n = pat1->split("Now is the time", fields, 1, status);
    REGEX_CHECK_STATUS;
    REGEX_ASSERT(n==1);
    REGEX_ASSERT(fields[0]=="Now is the time");
    REGEX_ASSERT(fields[1]=="*");
    status = U_ZERO_ERROR;

    // Leading and trailing delimiters produce empty first and last fields.
    n = pat1->split(" Now is the time ", fields, 10, status);
    REGEX_CHECK_STATUS;
    REGEX_ASSERT(n==6);
    REGEX_ASSERT(fields[0]=="");
    REGEX_ASSERT(fields[1]=="Now");
    REGEX_ASSERT(fields[2]=="is");
    REGEX_ASSERT(fields[3]=="the");
    REGEX_ASSERT(fields[4]=="time");
    REGEX_ASSERT(fields[5]=="");

    // Input consisting only of a delimiter.
    n = pat1->split(" ", fields, 10, status);
    REGEX_CHECK_STATUS;
    REGEX_ASSERT(n==2);
    REGEX_ASSERT(fields[0]=="");
    REGEX_ASSERT(fields[1]=="");

    // Empty input: no fields are produced and the destination is left alone.
    fields[0] = "foo";
    n = pat1->split("", fields, 10, status);
    REGEX_CHECK_STATUS;
    REGEX_ASSERT(n==0);
    REGEX_ASSERT(fields[0]=="foo");

    delete pat1;

    //  split, with a pattern with (capture)
    //  The captured text of each delimiter is expected as a field of its own.
    pat1 = RegexPattern::compile(UNICODE_STRING_SIMPLE("<(\\w*)>"),  pe, status);
    REGEX_CHECK_STATUS;

    status = U_ZERO_ERROR;
    n = pat1->split("<a>Now is <b>the time<c>", fields, 10, status);
    REGEX_CHECK_STATUS;
    REGEX_ASSERT(n==7);
    REGEX_ASSERT(fields[0]=="");
    REGEX_ASSERT(fields[1]=="a");
    REGEX_ASSERT(fields[2]=="Now is ");
    REGEX_ASSERT(fields[3]=="b");
    REGEX_ASSERT(fields[4]=="the time");
    REGEX_ASSERT(fields[5]=="c");
    REGEX_ASSERT(fields[6]=="");
    REGEX_ASSERT(status==U_ZERO_ERROR);

    n = pat1->split(" <a>Now is <b>the time<c>", fields, 10, status);
    REGEX_CHECK_STATUS;
    REGEX_ASSERT(n==7);
    REGEX_ASSERT(fields[0]==" ");
    REGEX_ASSERT(fields[1]=="a");
    REGEX_ASSERT(fields[2]=="Now is ");
    REGEX_ASSERT(fields[3]=="b");
    REGEX_ASSERT(fields[4]=="the time");
    REGEX_ASSERT(fields[5]=="c");
    REGEX_ASSERT(fields[6]=="");

    // Capacity 6: fields[6] holds a marker that must survive the call.
    status = U_ZERO_ERROR;
    fields[6] = "foo";
    n = pat1->split(" <a>Now is <b>the time<c>", fields, 6, status);
    REGEX_CHECK_STATUS;
    REGEX_ASSERT(n==6);
    REGEX_ASSERT(fields[0]==" ");
    REGEX_ASSERT(fields[1]=="a");
    REGEX_ASSERT(fields[2]=="Now is ");
    REGEX_ASSERT(fields[3]=="b");
    REGEX_ASSERT(fields[4]=="the time");
    REGEX_ASSERT(fields[5]=="");  // All text following "<c>" field delimiter.
    REGEX_ASSERT(fields[6]=="foo");

    // Capacity 5: the last field keeps the unsplit remainder, delimiter included.
    status = U_ZERO_ERROR;
    fields[5] = "foo";
    n = pat1->split(" <a>Now is <b>the time<c>", fields, 5, status);
    REGEX_CHECK_STATUS;
    REGEX_ASSERT(n==5);
    REGEX_ASSERT(fields[0]==" ");
    REGEX_ASSERT(fields[1]=="a");
    REGEX_ASSERT(fields[2]=="Now is ");
    REGEX_ASSERT(fields[3]=="b");
    REGEX_ASSERT(fields[4]=="the time<c>");
    REGEX_ASSERT(fields[5]=="foo");

    // Same capacity, but the input has no trailing delimiter.
    status = U_ZERO_ERROR;
    fields[5] = "foo";
    n = pat1->split(" <a>Now is <b>the time", fields, 5, status);
    REGEX_CHECK_STATUS;
    REGEX_ASSERT(n==5);
    REGEX_ASSERT(fields[0]==" ");
    REGEX_ASSERT(fields[1]=="a");
    REGEX_ASSERT(fields[2]=="Now is ");
    REGEX_ASSERT(fields[3]=="b");
    REGEX_ASSERT(fields[4]=="the time");
    REGEX_ASSERT(fields[5]=="foo");

    // Capacity 4: the expected last field is the text after "<b>", so the
    // captured "b" does not appear as a field of its own here.
    status = U_ZERO_ERROR;
    n = pat1->split(" <a>Now is <b>the time<c>", fields, 4, status);
    REGEX_CHECK_STATUS;
    REGEX_ASSERT(n==4);
    REGEX_ASSERT(fields[0]==" ");
    REGEX_ASSERT(fields[1]=="a");
    REGEX_ASSERT(fields[2]=="Now is ");
    REGEX_ASSERT(fields[3]=="the time<c>");
    status = U_ZERO_ERROR;
    delete pat1;

    // The whole delimiter is a capture group, so each delimiter shows up as a field.
    pat1 = RegexPattern::compile("([-,])",  pe, status);
    REGEX_CHECK_STATUS;
    n = pat1->split("1-10,20", fields, 10, status);
    REGEX_CHECK_STATUS;
    REGEX_ASSERT(n==5);
    REGEX_ASSERT(fields[0]=="1");
    REGEX_ASSERT(fields[1]=="-");
    REGEX_ASSERT(fields[2]=="10");
    REGEX_ASSERT(fields[3]==",");
    REGEX_ASSERT(fields[4]=="20");
    delete pat1;

    // Test split of string with empty trailing fields
    pat1 = RegexPattern::compile(",", pe, status);
    REGEX_CHECK_STATUS;
    n = pat1->split("a,b,c,", fields, 10, status);
    REGEX_CHECK_STATUS;
    REGEX_ASSERT(n==4);
    REGEX_ASSERT(fields[0]=="a");
    REGEX_ASSERT(fields[1]=="b");
    REGEX_ASSERT(fields[2]=="c");
    REGEX_ASSERT(fields[3]=="");

    n = pat1->split("a,,,", fields, 10, status);
    REGEX_CHECK_STATUS;
    REGEX_ASSERT(n==4);
    REGEX_ASSERT(fields[0]=="a");
    REGEX_ASSERT(fields[1]=="");
    REGEX_ASSERT(fields[2]=="");
    REGEX_ASSERT(fields[3]=="");
    delete pat1;

    // Split Separator with zero length match.
    pat1 = RegexPattern::compile(":?", pe, status);
    REGEX_CHECK_STATUS;
    n = pat1->split("abc", fields, 10, status);
    REGEX_CHECK_STATUS;
    REGEX_ASSERT(n==5);
    REGEX_ASSERT(fields[0]=="");
    REGEX_ASSERT(fields[1]=="a");
    REGEX_ASSERT(fields[2]=="b");
    REGEX_ASSERT(fields[3]=="c");
    REGEX_ASSERT(fields[4]=="");

    delete pat1;

    //
    // RegexPattern::pattern()
    //
    pat1 = new RegexPattern();
    REGEX_ASSERT(pat1->pattern() == "");
    delete pat1;

    pat1 = RegexPattern::compile("(Hello, world)*",  pe, status);
    REGEX_CHECK_STATUS;
    REGEX_ASSERT(pat1->pattern() == "(Hello, world)*");
    delete pat1;

    //
    // classID functions
    //
    pat1 = RegexPattern::compile("(Hello, world)*",  pe, status);
    REGEX_CHECK_STATUS;
    REGEX_ASSERT(pat1->getDynamicClassID() == RegexPattern::getStaticClassID());
    REGEX_ASSERT(pat1->getDynamicClassID() != NULL);
    UnicodeString Hello("Hello, world.");
    RegexMatcher *m = pat1->matcher(Hello, status);
    // Check the status of matcher creation before m is dereferenced.
    REGEX_CHECK_STATUS;
    REGEX_ASSERT(pat1->getDynamicClassID() != m->getDynamicClassID());
    REGEX_ASSERT(m->getDynamicClassID() == RegexMatcher::getStaticClassID());
    REGEX_ASSERT(m->getDynamicClassID() != NULL);
    delete m;
    delete pat1;

}
//---------------------------------------------------------------------------
//
// API_Match_UTF8 Test that the alternate engine for class RegexMatcher
// is present and working, but excluding functions
// implementing replace operations.
//
//---------------------------------------------------------------------------
void RegexTest::API_Match_UTF8() {
UParseError pe;
UErrorCode status=U_ZERO_ERROR;
int32_t flags = 0;
//
// Debug - slide failing test cases early
//
#if 0
{
}
return;
#endif
//
// Simple pattern compilation
//
{
UText re = UTEXT_INITIALIZER;
regextst_openUTF8FromInvariant(&re, "abc", -1, &status);
REGEX_VERBOSE_TEXT(&re);
RegexPattern *pat2;
pat2 = RegexPattern::compile(&re, flags, pe, status);
REGEX_CHECK_STATUS;
UText input1 = UTEXT_INITIALIZER;
UText input2 = UTEXT_INITIALIZER;
UText empty = UTEXT_INITIALIZER;
regextst_openUTF8FromInvariant(&input1, "abcdef this is a test", -1, &status);
REGEX_VERBOSE_TEXT(&input1);
regextst_openUTF8FromInvariant(&input2, "not abc", -1, &status);
REGEX_VERBOSE_TEXT(&input2);
utext_openUChars(&empty, NULL, 0, &status);
int32_t input1Len = strlen("abcdef this is a test"); /* TODO: why not nativelen (input1) ? */
int32_t input2Len = strlen("not abc");
//
// Matcher creation and reset.
//
RegexMatcher *m1 = &pat2->matcher(status)->reset(&input1);
REGEX_CHECK_STATUS;
REGEX_ASSERT(m1->lookingAt(status) == TRUE);
const char str_abcdefthisisatest[] = { 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x20, 0x74, 0x68, 0x69, 0x73, 0x20, 0x69, 0x73, 0x20, 0x61, 0x20, 0x74, 0x65, 0x73, 0x74, 0x00 }; /* abcdef this is a test */
REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest, m1->inputText());
m1->reset(&input2);
REGEX_ASSERT(m1->lookingAt(status) == FALSE);
const char str_notabc[] = { 0x6e, 0x6f, 0x74, 0x20, 0x61, 0x62, 0x63, 0x00 }; /* not abc */
REGEX_ASSERT_UTEXT_UTF8(str_notabc, m1->inputText());
m1->reset(&input1);
REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest, m1->inputText());
REGEX_ASSERT(m1->lookingAt(status) == TRUE);
m1->reset(&empty);
REGEX_ASSERT(m1->lookingAt(status) == FALSE);
REGEX_ASSERT(utext_nativeLength(&empty) == 0);
//
// reset(pos, status)
//
m1->reset(&input1);
m1->reset(4, status);
REGEX_CHECK_STATUS;
REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest, m1->inputText());
REGEX_ASSERT(m1->lookingAt(status) == TRUE);
m1->reset(-1, status);
REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
status = U_ZERO_ERROR;
m1->reset(0, status);
REGEX_CHECK_STATUS;
status = U_ZERO_ERROR;
m1->reset(input1Len-1, status);
REGEX_CHECK_STATUS;
status = U_ZERO_ERROR;
m1->reset(input1Len, status);
REGEX_CHECK_STATUS;
status = U_ZERO_ERROR;
m1->reset(input1Len+1, status);
REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
status = U_ZERO_ERROR;
//
// match(pos, status)
//
m1->reset(&input2);
REGEX_ASSERT(m1->matches(4, status) == TRUE);
m1->reset();
REGEX_ASSERT(m1->matches(3, status) == FALSE);
m1->reset();
REGEX_ASSERT(m1->matches(5, status) == FALSE);
REGEX_ASSERT(m1->matches(4, status) == TRUE);
REGEX_ASSERT(m1->matches(-1, status) == FALSE);
REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
// Match() at end of string should fail, but should not
// be an error.
status = U_ZERO_ERROR;
REGEX_ASSERT(m1->matches(input2Len, status) == FALSE);
REGEX_CHECK_STATUS;
// Match beyond end of string should fail with an error.
status = U_ZERO_ERROR;
REGEX_ASSERT(m1->matches(input2Len+1, status) == FALSE);
REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
// Successful match at end of string.
{
status = U_ZERO_ERROR;
RegexMatcher m("A?", 0, status); // will match zero length string.
REGEX_CHECK_STATUS;
m.reset(&input1);
REGEX_ASSERT(m.matches(input1Len, status) == TRUE);
REGEX_CHECK_STATUS;
m.reset(&empty);
REGEX_ASSERT(m.matches(0, status) == TRUE);
REGEX_CHECK_STATUS;
}
//
// lookingAt(pos, status)
//
status = U_ZERO_ERROR;
m1->reset(&input2); // "not abc"
REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
REGEX_ASSERT(m1->lookingAt(5, status) == FALSE);
REGEX_ASSERT(m1->lookingAt(3, status) == FALSE);
REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
REGEX_ASSERT(m1->lookingAt(-1, status) == FALSE);
REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
status = U_ZERO_ERROR;
REGEX_ASSERT(m1->lookingAt(input2Len, status) == FALSE);
REGEX_CHECK_STATUS;
REGEX_ASSERT(m1->lookingAt(input2Len+1, status) == FALSE);
REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
delete m1;
delete pat2;
utext_close(&re);
utext_close(&input1);
utext_close(&input2);
utext_close(&empty);
}
//
// Capture Group.
// RegexMatcher::start();
// RegexMatcher::end();
// RegexMatcher::groupCount();
//
{
int32_t flags=0;
UParseError pe;
UErrorCode status=U_ZERO_ERROR;
UText re=UTEXT_INITIALIZER;
const char str_01234567_pat[] = { 0x30, 0x31, 0x28, 0x32, 0x33, 0x28, 0x34, 0x35, 0x29, 0x36, 0x37, 0x29, 0x28, 0x2e, 0x2a, 0x29, 0x00 }; /* 01(23(45)67)(.*) */
utext_openUTF8(&re, str_01234567_pat, -1, &status);
RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
REGEX_CHECK_STATUS;
UText input = UTEXT_INITIALIZER;
const char str_0123456789[] = { 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x00 }; /* 0123456789 */
utext_openUTF8(&input, str_0123456789, -1, &status);
RegexMatcher *matcher = &pat->matcher(status)->reset(&input);
REGEX_CHECK_STATUS;
REGEX_ASSERT(matcher->lookingAt(status) == TRUE);
static const int32_t matchStarts[] = {0, 2, 4, 8};
static const int32_t matchEnds[] = {10, 8, 6, 10};
int32_t i;
for (i=0; i<4; i++) {
int32_t actualStart = matcher->start(i, status);
REGEX_CHECK_STATUS;
if (actualStart != matchStarts[i]) {
errln("RegexTest failure at %s:%d, index %d. Expected %d, got %d\n",
__FILE__, __LINE__, i, matchStarts[i], actualStart);
}
int32_t actualEnd = matcher->end(i, status);
REGEX_CHECK_STATUS;
if (actualEnd != matchEnds[i]) {
errln("RegexTest failure at %s:%d index %d. Expected %d, got %d\n",
__FILE__, __LINE__, i, matchEnds[i], actualEnd);
}
}
REGEX_ASSERT(matcher->start(0, status) == matcher->start(status));
REGEX_ASSERT(matcher->end(0, status) == matcher->end(status));
REGEX_ASSERT_FAIL(matcher->start(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
REGEX_ASSERT_FAIL(matcher->start( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
matcher->reset();
REGEX_ASSERT_FAIL(matcher->start( 0, status), U_REGEX_INVALID_STATE);
matcher->lookingAt(status);
UnicodeString dest;
UText destText = UTEXT_INITIALIZER;
utext_openUnicodeString(&destText, &dest, &status);
UText *result;
//const char str_0123456789[] = { 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x00 }; /* 0123456789 */
// Test shallow-clone API
int64_t group_len;
result = matcher->group((UText *)NULL, group_len, status);
REGEX_CHECK_STATUS;
REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result);
utext_close(result);
result = matcher->group(0, &destText, group_len, status);
REGEX_CHECK_STATUS;
REGEX_ASSERT(result == &destText);
REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result);
// destText is now immutable, reopen it
utext_close(&destText);
utext_openUnicodeString(&destText, &dest, &status);
result = matcher->group(0, NULL, status);
REGEX_CHECK_STATUS;
REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result);
utext_close(result);
result = matcher->group(0, &destText, status);
REGEX_CHECK_STATUS;
REGEX_ASSERT(result == &destText);
REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result);
result = matcher->group(1, NULL, status);
REGEX_CHECK_STATUS;
const char str_234567[] = { 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x00 }; /* 234567 */
REGEX_ASSERT_UTEXT_UTF8(str_234567, result);
utext_close(result);
result = matcher->group(1, &destText, status);
REGEX_CHECK_STATUS;
REGEX_ASSERT(result == &destText);
REGEX_ASSERT_UTEXT_UTF8(str_234567, result);
result = matcher->group(2, NULL, status);
REGEX_CHECK_STATUS;
const char str_45[] = { 0x34, 0x35, 0x00 }; /* 45 */
REGEX_ASSERT_UTEXT_UTF8(str_45, result);
utext_close(result);
result = matcher->group(2, &destText, status);
REGEX_CHECK_STATUS;
REGEX_ASSERT(result == &destText);
REGEX_ASSERT_UTEXT_UTF8(str_45, result);
result = matcher->group(3, NULL, status);
REGEX_CHECK_STATUS;
const char str_89[] = { 0x38, 0x39, 0x00 }; /* 89 */
REGEX_ASSERT_UTEXT_UTF8(str_89, result);
utext_close(result);
result = matcher->group(3, &destText, status);
REGEX_CHECK_STATUS;
REGEX_ASSERT(result == &destText);
REGEX_ASSERT_UTEXT_UTF8(str_89, result);
REGEX_ASSERT_FAIL(matcher->group(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
REGEX_ASSERT_FAIL(matcher->group( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
matcher->reset();
REGEX_ASSERT_FAIL(matcher->group( 0, status), U_REGEX_INVALID_STATE);
delete matcher;
delete pat;
utext_close(&destText);
utext_close(&input);
utext_close(&re);
}
//
// find
//
{
int32_t flags=0;
UParseError pe;
UErrorCode status=U_ZERO_ERROR;
UText re=UTEXT_INITIALIZER;
const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
utext_openUTF8(&re, str_abc, -1, &status);
RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
REGEX_CHECK_STATUS;
UText input = UTEXT_INITIALIZER;
const char str_abcabcabc[] = { 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abc..abc...abc.. */
utext_openUTF8(&input, str_abcabcabc, -1, &status);
// 012345678901234567
RegexMatcher *matcher = &pat->matcher(status)->reset(&input);
REGEX_CHECK_STATUS;
REGEX_ASSERT(matcher->find());
REGEX_ASSERT(matcher->start(status) == 1);
REGEX_ASSERT(matcher->find());
REGEX_ASSERT(matcher->start(status) == 6);
REGEX_ASSERT(matcher->find());
REGEX_ASSERT(matcher->start(status) == 12);
REGEX_ASSERT(matcher->find() == FALSE);
REGEX_ASSERT(matcher->find() == FALSE);
matcher->reset();
REGEX_ASSERT(matcher->find());
REGEX_ASSERT(matcher->start(status) == 1);
REGEX_ASSERT(matcher->find(0, status));
REGEX_ASSERT(matcher->start(status) == 1);
REGEX_ASSERT(matcher->find(1, status));
REGEX_ASSERT(matcher->start(status) == 1);
REGEX_ASSERT(matcher->find(2, status));
REGEX_ASSERT(matcher->start(status) == 6);
REGEX_ASSERT(matcher->find(12, status));
REGEX_ASSERT(matcher->start(status) == 12);
REGEX_ASSERT(matcher->find(13, status) == FALSE);
REGEX_ASSERT(matcher->find(16, status) == FALSE);
REGEX_ASSERT(matcher->find(17, status) == FALSE);
REGEX_ASSERT_FAIL(matcher->start(status), U_REGEX_INVALID_STATE);
status = U_ZERO_ERROR;
REGEX_ASSERT_FAIL(matcher->find(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
status = U_ZERO_ERROR;
REGEX_ASSERT_FAIL(matcher->find(18, status), U_INDEX_OUTOFBOUNDS_ERROR);
REGEX_ASSERT(matcher->groupCount() == 0);
delete matcher;
delete pat;
utext_close(&input);
utext_close(&re);
}
//
// find, with \G in pattern (true if at the end of a previous match).
//
{
int32_t flags=0;
UParseError pe;
UErrorCode status=U_ZERO_ERROR;
UText re=UTEXT_INITIALIZER;
const char str_Gabcabc[] = { 0x2e, 0x2a, 0x3f, 0x28, 0x3f, 0x3a, 0x28, 0x5c, 0x47, 0x61, 0x62, 0x63, 0x29, 0x7c, 0x28, 0x61, 0x62, 0x63, 0x29, 0x29, 0x00 }; /* .*?(?:(\\Gabc)|(abc)) */
utext_openUTF8(&re, str_Gabcabc, -1, &status);
RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
REGEX_CHECK_STATUS;
UText input = UTEXT_INITIALIZER;
const char str_abcabcabc[] = { 0x2e, 0x61, 0x62, 0x63, 0x61, 0x62, 0x63, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abcabc.abc.. */
utext_openUTF8(&input, str_abcabcabc, -1, &status);
// 012345678901234567
RegexMatcher *matcher = &pat->matcher(status)->reset(&input);
REGEX_CHECK_STATUS;
REGEX_ASSERT(matcher->find());
REGEX_ASSERT(matcher->start(status) == 0);
REGEX_ASSERT(matcher->start(1, status) == -1);
REGEX_ASSERT(matcher->start(2, status) == 1);
REGEX_ASSERT(matcher->find());
REGEX_ASSERT(matcher->start(status) == 4);
REGEX_ASSERT(matcher->start(1, status) == 4);
REGEX_ASSERT(matcher->start(2, status) == -1);
REGEX_CHECK_STATUS;
delete matcher;
delete pat;
utext_close(&input);
utext_close(&re);
}
//
// find with zero length matches, match position should bump ahead
// to prevent loops.
//
{
int32_t i;
UErrorCode status=U_ZERO_ERROR;
RegexMatcher m("(?= ?)", 0, status); // This pattern will zero-length matches anywhere,
// using an always-true look-ahead.
REGEX_CHECK_STATUS;
UText s = UTEXT_INITIALIZER;
utext_openUTF8(&s, " ", -1, &status);
m.reset(&s);
for (i=0; ; i++) {
if (m.find() == FALSE) {
break;
}
REGEX_ASSERT(m.start(status) == i);
REGEX_ASSERT(m.end(status) == i);
}
REGEX_ASSERT(i==5);
// Check that the bump goes over characters outside the BMP OK
// "\\U00010001\\U00010002\\U00010003\\U00010004".unescape()...in UTF-8
unsigned char aboveBMP[] = {0xF0, 0x90, 0x80, 0x81, 0xF0, 0x90, 0x80, 0x82, 0xF0, 0x90, 0x80, 0x83, 0xF0, 0x90, 0x80, 0x84, 0x00};
utext_openUTF8(&s, (char *)aboveBMP, -1, &status);
m.reset(&s);
for (i=0; ; i+=4) {
if (m.find() == FALSE) {
break;
}
REGEX_ASSERT(m.start(status) == i);
REGEX_ASSERT(m.end(status) == i);
}
REGEX_ASSERT(i==20);
utext_close(&s);
}
{
    // find() loop breaking test.
    // with pattern of /.?/, should see a series of one char matches, then a single
    // match of zero length at the end of the input string.
    int32_t i;
    UErrorCode status=U_ZERO_ERROR;
    RegexMatcher m(".?", 0, status);
    REGEX_CHECK_STATUS;
    UText s = UTEXT_INITIALIZER;
    // The input is four space characters, written as explicit UTF-8 byte values
    // in the same way as the other utext_openUTF8() inputs in this file.
    // The assertions below need exactly four characters: four one-character
    // matches at offsets 0..3, then one empty match at offset 4.
    const char str_4spaces[] = { 0x20, 0x20, 0x20, 0x20, 0x00 }; /* four U+0020 SPACE characters */
    utext_openUTF8(&s, str_4spaces, -1, &status);
    m.reset(&s);
    for (i=0; ; i++) {
        if (m.find() == FALSE) {
            break;
        }
        REGEX_ASSERT(m.start(status) == i);
        // Matches 0..3 consume one character each; the fifth match (i == 4) is empty.
        REGEX_ASSERT(m.end(status) == (i<4 ? i+1 : i));
    }
    REGEX_ASSERT(i==5);
    utext_close(&s);
}
//
// Matchers with no input string behave as if they had an empty input string.
//
{
// A matcher built from a pattern string only; no input is ever supplied via reset().
UErrorCode status = U_ZERO_ERROR;
RegexMatcher m(".?", 0, status);
REGEX_CHECK_STATUS;
// ".?" can match zero characters, so find() is expected to succeed, at offset 0.
REGEX_ASSERT(m.find());
REGEX_ASSERT(m.start(status) == 0);
// The matcher's input is expected to compare equal to the empty string.
REGEX_ASSERT(m.input() == "");
}
{
    // Same "no input string" scenario, this time with a matcher obtained from a
    // compiled RegexPattern and inspected through the UText accessor.
    UErrorCode status = U_ZERO_ERROR;
    RegexPattern *p = RegexPattern::compile(".", 0, status);
    // Check the compile result before p is dereferenced, as the other
    // compile() call sites in this file do.
    REGEX_CHECK_STATUS;
    RegexMatcher *m = p->matcher(status);
    REGEX_CHECK_STATUS;
    // "." needs one character to match, so find() is expected to fail when
    // the matcher has no input.
    REGEX_ASSERT(m->find() == FALSE);
    REGEX_ASSERT(utext_nativeLength(m->inputText()) == 0);
    delete m;
    delete p;
}
//
// Regions
//
{
// Exercises the matcher's region accessors and the anchoring/transparent
// bounds flags, using UText-based pattern and input.
UErrorCode status = U_ZERO_ERROR;
UText testPattern = UTEXT_INITIALIZER;
UText testText = UTEXT_INITIALIZER;
regextst_openUTF8FromInvariant(&testPattern, ".*", -1, &status);
REGEX_VERBOSE_TEXT(&testPattern);
regextst_openUTF8FromInvariant(&testText, "This is test data", -1, &status);
REGEX_VERBOSE_TEXT(&testText);
RegexMatcher m(&testPattern, &testText, 0, status);
REGEX_CHECK_STATUS;
// Expected state of a freshly constructed matcher: the region spans the
// whole input, transparent bounds are off and anchoring bounds are on.
REGEX_ASSERT(m.regionStart() == 0);
REGEX_ASSERT(m.regionEnd() == (int32_t)strlen("This is test data"));
REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
// Restrict matching to the region 2..4; ".*" is then expected to match
// exactly that region.
m.region(2,4, status);
REGEX_CHECK_STATUS;
REGEX_ASSERT(m.matches(status));
REGEX_ASSERT(m.start(status)==2);
REGEX_ASSERT(m.end(status)==4);
REGEX_CHECK_STATUS;
// reset() with no argument is expected to restore the region to the whole input.
m.reset();
REGEX_ASSERT(m.regionStart() == 0);
REGEX_ASSERT(m.regionEnd() == (int32_t)strlen("This is test data"));
// Re-open testText on a different string and hand it to the matcher; the
// region is expected to cover the new input.
regextst_openUTF8FromInvariant(&testText, "short", -1, &status);
REGEX_VERBOSE_TEXT(&testText);
m.reset(&testText);
REGEX_ASSERT(m.regionStart() == 0);
REGEX_ASSERT(m.regionEnd() == (int32_t)strlen("short"));
// Anchoring bounds: the setter and reset() are each expected to return the
// matcher itself (compared by address), and the flag value is expected to
// survive reset().
REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
REGEX_ASSERT(&m == &m.useAnchoringBounds(FALSE));
REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
REGEX_ASSERT(&m == &m.reset());
REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
REGEX_ASSERT(&m == &m.useAnchoringBounds(TRUE));
REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
REGEX_ASSERT(&m == &m.reset());
REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
// Transparent bounds: the same set / reset / re-check sequence as above.
REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
REGEX_ASSERT(&m == &m.useTransparentBounds(TRUE));
REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
REGEX_ASSERT(&m == &m.reset());
REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
REGEX_ASSERT(&m == &m.useTransparentBounds(FALSE));
REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
REGEX_ASSERT(&m == &m.reset());
REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
utext_close(&testText);
utext_close(&testPattern);
}
//
// hitEnd() and requireEnd()
//
{
// Checks hitEnd() and requireEnd() after lookingAt() for three patterns run
// against the same input text. testPattern is re-opened on a new pattern
// string before each matcher is constructed; testText is shared by all three.
UErrorCode status = U_ZERO_ERROR;
UText testPattern = UTEXT_INITIALIZER;
UText testText = UTEXT_INITIALIZER;
const char str_[] = { 0x2e, 0x2a, 0x00 }; /* .* */
const char str_aabb[] = { 0x61, 0x61, 0x62, 0x62, 0x00 }; /* aabb */
utext_openUTF8(&testPattern, str_, -1, &status);
utext_openUTF8(&testText, str_aabb, -1, &status);
// Case 1: ".*" -- expected to match, with hitEnd() TRUE and requireEnd() FALSE.
RegexMatcher m1(&testPattern, &testText, 0, status);
REGEX_ASSERT(m1.lookingAt(status) == TRUE);
REGEX_ASSERT(m1.hitEnd() == TRUE);
REGEX_ASSERT(m1.requireEnd() == FALSE);
REGEX_CHECK_STATUS;
// Clear status so the next case starts from a clean error code.
status = U_ZERO_ERROR;
// Case 2: "a*" -- expected to match, with hitEnd() FALSE and requireEnd() FALSE.
const char str_a[] = { 0x61, 0x2a, 0x00 }; /* a* */
utext_openUTF8(&testPattern, str_a, -1, &status);
RegexMatcher m2(&testPattern, &testText, 0, status);
REGEX_ASSERT(m2.lookingAt(status) == TRUE);
REGEX_ASSERT(m2.hitEnd() == FALSE);
REGEX_ASSERT(m2.requireEnd() == FALSE);
REGEX_CHECK_STATUS;
status = U_ZERO_ERROR;
// Case 3: ".*$" -- expected to match, with both hitEnd() and requireEnd() TRUE.
const char str_dotstardollar[] = { 0x2e, 0x2a, 0x24, 0x00 }; /* .*$ */
utext_openUTF8(&testPattern, str_dotstardollar, -1, &status);
RegexMatcher m3(&testPattern, &testText, 0, status);
REGEX_ASSERT(m3.lookingAt(status) == TRUE);
REGEX_ASSERT(m3.hitEnd() == TRUE);
REGEX_ASSERT(m3.requireEnd() == TRUE);
REGEX_CHECK_STATUS;
utext_close(&testText);
utext_close(&testPattern);
}
}
//---------------------------------------------------------------------------
//
// API_Replace_UTF8 API test for class RegexMatcher, testing the
// Replace family of functions.
//
//---------------------------------------------------------------------------
void RegexTest::API_Replace_UTF8() {
//
// Replace
//
int32_t flags=0;
UParseError pe;
UErrorCode status=U_ZERO_ERROR;
UText re=UTEXT_INITIALIZER;
regextst_openUTF8FromInvariant(&re, "abc", -1, &status);
REGEX_VERBOSE_TEXT(&re);
RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
REGEX_CHECK_STATUS;
char data[] = { 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abc..abc...abc.. */
// 012345678901234567
UText dataText = UTEXT_INITIALIZER;
utext_openUTF8(&dataText, data, -1, &status);
REGEX_CHECK_STATUS;
REGEX_VERBOSE_TEXT(&dataText);
RegexMatcher *matcher = &pat->matcher(status)->reset(&dataText);
//
// Plain vanilla matches.
//
UnicodeString dest;
UText destText = UTEXT_INITIALIZER;
utext_openUnicodeString(&destText, &dest, &status);
UText *result;
UText replText = UTEXT_INITIALIZER;
const char str_yz[] = { 0x79, 0x7a, 0x00 }; /* yz */
utext_openUTF8(&replText, str_yz, -1, &status);
REGEX_VERBOSE_TEXT(&replText);
result = matcher->replaceFirst(&replText, NULL, status);
REGEX_CHECK_STATUS;
const char str_yzabcabc[] = { 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .yz..abc...abc.. */
REGEX_ASSERT_UTEXT_UTF8(str_yzabcabc, result);
utext_close(result);
result = matcher->replaceFirst(&replText, &destText, status);
REGEX_CHECK_STATUS;
REGEX_ASSERT(result == &destText);
REGEX_ASSERT_UTEXT_UTF8(str_yzabcabc, result);
result = matcher->replaceAll(&replText, NULL, status);
REGEX_CHECK_STATUS;
const char str_yzyzyz[] = { 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x00 }; /* .yz..yz...yz.. */
REGEX_ASSERT_UTEXT_UTF8(str_yzyzyz, result);
utext_close(result);
utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
result = matcher->replaceAll(&replText, &destText, status);
REGEX_CHECK_STATUS;
REGEX_ASSERT(result == &destText);
REGEX_ASSERT_UTEXT_UTF8(str_yzyzyz, result);
//
// Plain vanilla non-matches.
//
const char str_abxabxabx[] = { 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x00 }; /* .abx..abx...abx.. */
utext_openUTF8(&dataText, str_abxabxabx, -1, &status);
matcher->reset(&dataText);
result = matcher->replaceFirst(&replText, NULL, status);
REGEX_CHECK_STATUS;
REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
utext_close(result);
result = matcher->replaceFirst(&replText, &destText, status);
REGEX_CHECK_STATUS;
REGEX_ASSERT(result == &destText);
REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
result = matcher->replaceAll(&replText, NULL, status);
REGEX_CHECK_STATUS;
REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
utext_close(result);
utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
result = matcher->replaceAll(&replText, &destText, status);
REGEX_CHECK_STATUS;
REGEX_ASSERT(result == &destText);
REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
//
// Empty source string
//
utext_openUTF8(&dataText, NULL, 0, &status);
matcher->reset(&dataText);
result = matcher->replaceFirst(&replText, NULL, status);
REGEX_CHECK_STATUS;
REGEX_ASSERT_UTEXT_UTF8("", result);
utext_close(result);
result = matcher->replaceFirst(&replText, &destText, status);
REGEX_CHECK_STATUS;
REGEX_ASSERT(result == &destText);
REGEX_ASSERT_UTEXT_UTF8("", result);
result = matcher->replaceAll(&replText, NULL, status);
REGEX_CHECK_STATUS;
REGEX_ASSERT_UTEXT_UTF8("", result);
utext_close(result);
result = matcher->replaceAll(&replText, &destText, status);
REGEX_CHECK_STATUS;
REGEX_ASSERT(result == &destText);
REGEX_ASSERT_UTEXT_UTF8("", result);
//
// Empty substitution string
//