Introduce an optional dependency on ICU.
Users will be able to build against ICU in order to have full Unicode
properties support.
Change-Id: I22e722392a63fd292f23be221a1e7b807d70c56b
Reviewed-on: https://code-review.googlesource.com/4153
Reviewed-by: Paul Wankadia <junyer@google.com>
diff --git a/Makefile b/Makefile
index 8d866a5..92fbfc3 100644
--- a/Makefile
+++ b/Makefile
@@ -2,15 +2,20 @@
# Use of this source code is governed by a BSD-style
# license that can be found in the LICENSE file.
-# to build against PCRE for testing or benchmarking,
-# uncomment the next two lines
+# To build against ICU for full Unicode properties support,
+# uncomment the next two lines:
+# CCICU=$(shell pkg-config icu-uc --cflags) -DRE2_USE_ICU
+# LDICU=$(shell pkg-config icu-uc --libs)
+
+# To build against PCRE for testing or benchmarking,
+# uncomment the next two lines:
# CCPCRE=-I/usr/local/include -DUSEPCRE
# LDPCRE=-L/usr/local/lib -lpcre
CXX?=g++
CXXFLAGS?=-std=c++11 -O3 -g -pthread # can override
-RE2_CXXFLAGS?=-Wall -Wextra -Wno-unused-parameter -Wno-missing-field-initializers -I. $(CCPCRE) # required
-LDFLAGS?=-pthread
+RE2_CXXFLAGS?=-Wall -Wextra -Wno-unused-parameter -Wno-missing-field-initializers -I. $(CCICU) $(CCPCRE) # required
+LDFLAGS?=-pthread $(LDICU)
AR?=ar
ARFLAGS?=rsc
NM?=nm
diff --git a/re2/parse.cc b/re2/parse.cc
index f51e589..bd4f9c8 100644
--- a/re2/parse.cc
+++ b/re2/parse.cc
@@ -23,6 +23,12 @@
#include "re2/unicode_groups.h"
#include "re2/walker-inl.h"
+#if defined(RE2_USE_ICU)
+#include "unicode/uniset.h"
+#include "unicode/unistr.h"
+#include "unicode/utypes.h"
+#endif
+
namespace re2 {
// Regular expression parse state.
@@ -1488,11 +1494,6 @@
return NULL;
}
-// Fake UGroup containing all Runes
-static URange16 any16[] = { { 0, 65535 } };
-static URange32 any32[] = { { 65536, Runemax } };
-static UGroup anygroup = { "Any", +1, any16, 1, any32, 1 };
-
// Look for a POSIX group with the given name (e.g., "[:^alpha:]")
static const UGroup* LookupPosixGroup(const StringPiece& name) {
return LookupGroup(name, posix_groups, num_posix_groups);
@@ -1502,6 +1503,12 @@
return LookupGroup(name, perl_groups, num_perl_groups);
}
+#if !defined(RE2_USE_ICU)
+// Fake UGroup containing all Runes
+static URange16 any16[] = { { 0, 65535 } };
+static URange32 any32[] = { { 65536, Runemax } };
+static UGroup anygroup = { "Any", +1, any16, 1, any32, 1 };
+
// Look for a Unicode group with the given name (e.g., "Han")
static const UGroup* LookupUnicodeGroup(const StringPiece& name) {
// Special case: "Any" means any.
@@ -1509,6 +1516,7 @@
return &anygroup;
return LookupGroup(name, unicode_groups, num_unicode_groups);
}
+#endif
// Add a UGroup or its negation to the character class.
static void AddUGroup(CharClassBuilder *cc, const UGroup *g, int sign,
@@ -1600,7 +1608,7 @@
// Committed to parse. Results:
int sign = +1; // -1 = negated char class
if (c == 'P')
- sign = -1;
+ sign = -sign;
StringPiece seq = *s; // \p{Han} or \pL
StringPiece name; // Han or L
s->remove_prefix(2); // '\\', 'p'
@@ -1630,11 +1638,13 @@
// Chop seq where s now begins.
seq = StringPiece(seq.begin(), static_cast<int>(s->begin() - seq.begin()));
- // Look up group
if (name.size() > 0 && name[0] == '^') {
sign = -sign;
name.remove_prefix(1); // '^'
}
+
+#if !defined(RE2_USE_ICU)
+ // Look up the group in the RE2 Unicode data.
const UGroup *g = LookupUnicodeGroup(name);
if (g == NULL) {
status->set_code(kRegexpBadCharRange);
@@ -1643,6 +1653,31 @@
}
AddUGroup(cc, g, sign, parse_flags);
+#else
+ // Look up the group in the ICU Unicode data. Because ICU provides full
+ // Unicode properties support, this could be more than a lookup by name.
+ ::icu::UnicodeString ustr = ::icu::UnicodeString::fromUTF8(
+ string("\\p{") + name.ToString() + string("}"));
+ UErrorCode uerr = U_ZERO_ERROR;
+ ::icu::UnicodeSet uset(ustr, uerr);
+ if (U_FAILURE(uerr)) {  // ICU warning codes (< U_ZERO_ERROR) are not failures.
+ status->set_code(kRegexpBadCharRange);
+ status->set_error_arg(seq);
+ return kParseError;
+ }
+
+ // Copy the UnicodeSet's ranges into a temporary URange32 array wrapped in a
+ int nr = uset.getRangeCount();  // UGroup, the form that AddUGroup() accepts.
+ URange32* r = new URange32[nr];
+ for (int i = 0; i < nr; i++) {
+ r[i].lo = uset.getRangeStart(i);
+ r[i].hi = uset.getRangeEnd(i);
+ }
+ UGroup g = {"", +1, 0, 0, r, nr};
+ AddUGroup(cc, &g, sign, parse_flags);
+ delete[] r;
+#endif
+
return kParseOk;
}