LANG-1020: Improve performance of StringUtils.normalizeSpace. Thanks to Libor Ondrusek. This closes #27 from GitHub.
git-svn-id: https://svn.apache.org/repos/asf/commons/proper/lang/trunk@1620317 13f79535-47bb-0310-9956-ffa450edef68
diff --git a/src/changes/changes.xml b/src/changes/changes.xml
index 7d8e2a7..b38dbce 100644
--- a/src/changes/changes.xml
+++ b/src/changes/changes.xml
@@ -22,6 +22,7 @@
<body>
<release version="3.4" date="tba" description="tba">
+ <action issue="LANG-1020" type="update" dev="britter" due-to="Libor Ondrusek">Improve performance of normalize space</action>
<action issue="LANG-1033" type="add" dev="ggregory">Add StringUtils.countMatches(CharSequence, char)</action>
<action issue="LANG-1027" type="update" dev="rmannibucau">org.apache.commons.lang3.SystemUtils#isJavaVersionAtLeast should return true by default</action>
<action issue="LANG-1021" type="add" dev="britter" due-to="Alexander Müller">Provide methods to retrieve all fields/methods annotated with a specific type</action>
diff --git a/src/main/java/org/apache/commons/lang3/StringUtils.java b/src/main/java/org/apache/commons/lang3/StringUtils.java
index 0bb44ba..cd13b37 100644
--- a/src/main/java/org/apache/commons/lang3/StringUtils.java
+++ b/src/main/java/org/apache/commons/lang3/StringUtils.java
@@ -173,16 +173,6 @@
private static final int PAD_LIMIT = 8192;
/**
- * A regex pattern for recognizing blocks of whitespace characters.
- * The apparent convolutedness of the pattern serves the purpose of
- * ignoring "blocks" consisting of only a single space: the pattern
- * is used only to normalize whitespace, condensing "blocks" down to a
- * single space, thus matching the same would likely cause a great
- * many noop replacements.
- */
- private static final Pattern WHITESPACE_PATTERN = Pattern.compile("(?: |\\u00A0|\\s|[\\s&&[^ ]])\\s*");
-
- /**
* <p>{@code StringUtils} instances should NOT be constructed in
* standard programming. Instead, the class should be used as
* {@code StringUtils.trim(" foo ");}.</p>
@@ -7477,10 +7467,34 @@
* @since 3.0
*/
public static String normalizeSpace(final String str) {
- if (str == null) {
- return null;
+ // LANG-1020: Improved performance significantly by normalizing manually instead of using regex
+ // See https://github.com/librucha/commons-lang-normalizespaces-benchmark for performance test
+ if (isEmpty(str)) {
+ return str;
}
- return WHITESPACE_PATTERN.matcher(trim(str)).replaceAll(SPACE);
+ final int size = str.length();
+ final char[] newChars = new char[size];
+ int count = 0;
+ int whitespacesCount = 0;
+ boolean startWhitespaces = true;
+ for (int i = 0; i < size; i++) {
+ char actualChar = str.charAt(i);
+ boolean isWhitespace = Character.isWhitespace(actualChar);
+ if (!isWhitespace) {
+ startWhitespaces = false;
+ newChars[count++] = (actualChar == 160 ? 32 : actualChar);
+ whitespacesCount = 0;
+ } else {
+ if (whitespacesCount == 0 && !startWhitespaces) {
+ newChars[count++] = SPACE.charAt(0);
+ }
+ whitespacesCount++;
+ }
+ }
+ if (startWhitespaces) {
+ return EMPTY;
+ }
+ return new String(newChars, 0, count - (whitespacesCount > 0 ? 1 : 0));
}
/**