blob: 2853bbe39db84eba5ecc9c1a724d51266c906246 [file] [log] [blame]
/*
* Copyright (C) 2007 The Android Open Source Project
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package java.util.regex;
import java.io.Serializable;
import java.util.ArrayList;
import com.ibm.icu4jni.regex.NativeRegEx;
/**
* Represents a pattern used for matching, searching, or replacing strings.
* {@code Pattern}s are specified in terms of regular expressions and compiled
* using an instance of this class. They are then used in conjunction with a
* {@link Matcher} to perform the actual search.
* <p/>
* A typical use case looks like this:
* <p/>
* <pre>
* Pattern p = Pattern.compile("Hello, A[a-z]*!");
*
* Matcher m = p.matcher("Hello, Android!");
* boolean b1 = m.matches(); // true
*
* m.setInput("Hello, Robot!");
* boolean b2 = m.matches(); // false
* </pre>
* <p/>
* The above code could also be written in a more compact fashion, though this
* variant is less efficient, since {@code Pattern} and {@code Matcher} objects
* are created on the fly instead of being reused:
* <pre>
* boolean b1 = Pattern.matches("Hello, A[a-z]*!", "Hello, Android!"); // true
* boolean b2 = Pattern.matches("Hello, A[a-z]*!", "Hello, Robot!"); // false
* </pre>
* <p/>
* Please consult the <a href="package.html">package documentation</a> for an
* overview of the regular expression syntax used in this class as well as
* Android-specific implementation details.
*
* @see Matcher
* @since Android 1.0
*/
public final class Pattern implements Serializable {

    private static final long serialVersionUID = 5073258162644648461L;

    /**
     * This constant specifies that only the Unix line ending ('\n') is
     * recognized as a line terminator by the '.', '^', and '$' meta
     * characters.
     *
     * @since Android 1.0
     */
    public static final int UNIX_LINES = 0x01;

    /**
     * This constant specifies that a {@code Pattern} is matched
     * case-insensitively. That is, the patterns "a+" and "A+" would both match
     * the string "aAaAaA".
     * <p>
     * Note: For Android, the {@code CASE_INSENSITIVE} constant
     * (currently) always includes the meaning of the {@link #UNICODE_CASE}
     * constant. So if case insensitivity is enabled, this automatically extends
     * to all Unicode characters. The {@code UNICODE_CASE} constant itself has
     * no special consequences.
     *
     * @since Android 1.0
     */
    public static final int CASE_INSENSITIVE = 0x02;

    /**
     * This constant specifies that a {@code Pattern} may contain whitespace or
     * comments. Otherwise comments and whitespace are taken as literal
     * characters.
     *
     * @since Android 1.0
     */
    public static final int COMMENTS = 0x04;

    /**
     * This constant specifies that the meta characters '^' and '$' match at
     * the beginning and end of each input line, respectively. Normally, they
     * match only at the beginning and the end of the complete input.
     *
     * @since Android 1.0
     */
    public static final int MULTILINE = 0x08;

    /**
     * This constant specifies that the whole {@code Pattern} is to be taken
     * literally, that is, all meta characters lose their meanings.
     *
     * @since Android 1.0
     */
    public static final int LITERAL = 0x10;

    /**
     * This constant specifies that the '.' meta character matches arbitrary
     * characters, including line endings, which is normally not the case.
     *
     * @since Android 1.0
     */
    public static final int DOTALL = 0x20;

    /**
     * This constant specifies that a {@code Pattern} is matched
     * case-insensitively with regard to all Unicode characters. It is used in
     * conjunction with the {@link #CASE_INSENSITIVE} constant to extend its
     * meaning to all Unicode characters.
     * <p>
     * Note: For Android, the {@code CASE_INSENSITIVE} constant
     * (currently) always includes the meaning of the {@code UNICODE_CASE}
     * constant. So if case insensitivity is enabled, this automatically extends
     * to all Unicode characters. The {@code UNICODE_CASE} constant then has no
     * special consequences.
     *
     * @since Android 1.0
     */
    public static final int UNICODE_CASE = 0x40;

    /**
     * This constant specifies that a character in a {@code Pattern} and a
     * character in the input string only match if they are canonically
     * equivalent. It is (currently) not supported in Android; passing it to
     * {@link #compile(String, int)} results in an
     * {@link UnsupportedOperationException}.
     *
     * @since Android 1.0
     */
    public static final int CANON_EQ = 0x80;

    /**
     * Holds the regular expression.
     */
    private String pattern;

    /**
     * Holds the flags used when compiling this pattern.
     */
    private int flags;

    /**
     * Holds a handle (a pointer, actually) for the native ICU pattern.
     * Not serialized; rebuilt by {@link #compileImpl(String, int)}.
     */
    transient int mNativePattern;

    /**
     * Holds the number of groups in the pattern.
     * Not serialized; rebuilt by {@link #compileImpl(String, int)}.
     */
    transient int mGroupCount;

    /**
     * Compiles a regular expression, creating a new Pattern instance in the
     * process. This is actually a convenience method that calls {@link
     * #compile(String, int)} with a {@code flags} value of zero.
     *
     * @param pattern
     *            the regular expression.
     *
     * @return the new {@code Pattern} instance.
     *
     * @throws PatternSyntaxException
     *             if the regular expression is syntactically incorrect.
     * @throws NullPointerException
     *             if {@code pattern} is {@code null}.
     *
     * @since Android 1.0
     */
    public static Pattern compile(String pattern) throws PatternSyntaxException {
        return compile(pattern, 0);
    }

    /**
     * Compiles a regular expression, creating a new {@code Pattern} instance in
     * the process. Allows setting flags that modify the behavior of the
     * {@code Pattern}.
     *
     * @param pattern
     *            the regular expression.
     * @param flags
     *            the flags to set. Basically, any combination of the constants
     *            defined in this class is valid.
     *            <p>
     *            Note: Currently, the {@link #CASE_INSENSITIVE} and
     *            {@link #UNICODE_CASE} constants have slightly special behavior
     *            in Android, and the {@link #CANON_EQ} constant is not
     *            supported at all.
     *
     * @return the new {@code Pattern} instance.
     *
     * @throws PatternSyntaxException
     *             if the regular expression is syntactically incorrect.
     * @throws UnsupportedOperationException
     *             if {@code flags} contains {@link #CANON_EQ}.
     * @throws NullPointerException
     *             if {@code pattern} is {@code null}.
     *
     * @see #CANON_EQ
     * @see #CASE_INSENSITIVE
     * @see #COMMENTS
     * @see #DOTALL
     * @see #LITERAL
     * @see #MULTILINE
     * @see #UNICODE_CASE
     * @see #UNIX_LINES
     *
     * @since Android 1.0
     */
    public static Pattern compile(String pattern, int flags) throws PatternSyntaxException {
        return new Pattern(pattern, flags);
    }

    /**
     * Creates a new {@code Pattern} instance from a given regular expression
     * and flags.
     *
     * @param pattern
     *            the regular expression.
     * @param flags
     *            the flags to set. Any combination of the constants defined in
     *            this class is valid, except {@link #CANON_EQ}.
     *
     * @throws PatternSyntaxException
     *             if the regular expression is syntactically incorrect.
     * @throws UnsupportedOperationException
     *             if {@code flags} contains {@link #CANON_EQ}.
     * @throws NullPointerException
     *             if {@code pattern} is {@code null}.
     */
    private Pattern(String pattern, int flags) throws PatternSyntaxException {
        if ((flags & CANON_EQ) != 0) {
            throw new UnsupportedOperationException("CANON_EQ flag not supported");
        }

        this.pattern = pattern;
        this.flags = flags;

        compileImpl(pattern, flags);
    }

    /**
     * Compiles the given regular expression using the given flags and stores
     * the resulting native handle and group count in this instance. Used
     * internally only, by the constructor and by deserialization.
     *
     * @param pattern
     *            the regular expression.
     * @param flags
     *            the flags.
     *
     * @throws PatternSyntaxException
     *             if the regular expression is syntactically incorrect.
     * @throws NullPointerException
     *             if {@code pattern} is {@code null}.
     */
    private void compileImpl(String pattern, int flags) throws PatternSyntaxException {
        if (pattern == null) {
            throw new NullPointerException();
        }

        // LITERAL is not passed to the native layer; it is emulated by
        // wrapping the whole expression in \Q...\E.
        if ((flags & LITERAL) != 0) {
            pattern = quote(pattern);
        }

        // These are the flags natively supported by ICU.
        // They even have the same value in native code.
        flags = flags & (CASE_INSENSITIVE | COMMENTS | MULTILINE | DOTALL | UNIX_LINES);

        mNativePattern = NativeRegEx.open(pattern, flags);
        mGroupCount = NativeRegEx.groupCount(mNativePattern);
    }

    /**
     * Returns the regular expression that was compiled into this
     * {@code Pattern}.
     *
     * @return the regular expression.
     *
     * @since Android 1.0
     */
    public String pattern() {
        return pattern;
    }

    /**
     * Returns the flags that have been set for this {@code Pattern}.
     *
     * @return the flags that have been set. A combination of the constants
     *         defined in this class.
     *
     * @see #CANON_EQ
     * @see #CASE_INSENSITIVE
     * @see #COMMENTS
     * @see #DOTALL
     * @see #LITERAL
     * @see #MULTILINE
     * @see #UNICODE_CASE
     * @see #UNIX_LINES
     *
     * @since Android 1.0
     */
    public int flags() {
        return flags;
    }

    /**
     * Returns a {@link Matcher} for the {@code Pattern} and a given input. The
     * {@code Matcher} can be used to match the {@code Pattern} against the
     * whole input, find occurrences of the {@code Pattern} in the input, or
     * replace parts of the input.
     *
     * @param input
     *            the input to process.
     *
     * @return the resulting {@code Matcher}.
     *
     * @since Android 1.0
     */
    public Matcher matcher(CharSequence input) {
        return new Matcher(this, input);
    }

    /**
     * Tries to match a given regular expression against a given input. This is
     * actually nothing but a convenience method that compiles the regular
     * expression into a {@code Pattern}, builds a {@link Matcher} for it, and
     * then does the match. If the same regular expression is used for multiple
     * operations, it is recommended to compile it into a {@code Pattern}
     * explicitly and request a reusable {@code Matcher}.
     *
     * @param regex
     *            the regular expression.
     * @param input
     *            the input to process.
     *
     * @return true if and only if the {@code Pattern} matches the input.
     *
     * @throws PatternSyntaxException
     *             if the regular expression is syntactically incorrect.
     *
     * @see Pattern#compile(java.lang.String, int)
     * @see Matcher#matches()
     *
     * @since Android 1.0
     */
    public static boolean matches(String regex, CharSequence input) {
        return compile(regex).matcher(input).matches();
    }

    /**
     * Splits a given input around occurrences of a regular expression. This is
     * a convenience method that is equivalent to calling the method
     * {@link #split(java.lang.CharSequence, int)} with a limit of 0.
     *
     * @param input
     *            the input sequence.
     *
     * @return the resulting array.
     *
     * @since Android 1.0
     */
    public String[] split(CharSequence input) {
        return split(input, 0);
    }

    /**
     * Splits the given input sequence at occurrences of this {@code Pattern}.
     * <p>
     * If the input is the empty string, the result is always an array
     * containing exactly one empty string, regardless of {@code limit}.
     * <p>
     * If this {@code Pattern} does not occur in the input, the result is an
     * array containing the input (converted from a {@code CharSequence} to
     * a {@code String}).
     * <p>
     * Otherwise, the {@code limit} parameter controls the contents of the
     * returned array as described below.
     *
     * @param inputSeq
     *            the input sequence.
     * @param limit
     *            Determines the maximum number of entries in the resulting
     *            array, and the treatment of trailing empty strings.
     *            <ul>
     *            <li>For n &gt; 0, the resulting array contains at most n
     *            entries. If this is fewer than the number of matches, the
     *            final entry will contain all remaining input.
     *            <li>For n &lt; 0, the length of the resulting array is
     *            exactly the number of occurrences of the {@code Pattern}
     *            plus one for the text after the final separator.
     *            All entries are included.
     *            <li>For n == 0, the result is as for n &lt; 0, except
     *            trailing empty strings will not be returned. (Note that
     *            the case where the input is itself an empty string is
     *            special, as described above, and the limit parameter does
     *            not apply there.)
     *            </ul>
     *
     * @return the resulting array.
     *
     * @throws NullPointerException
     *             if {@code inputSeq} is {@code null}.
     *
     * @since Android 1.0
     */
    public String[] split(CharSequence inputSeq, int limit) {
        if (inputSeq.length() == 0) {
            // Unlike Perl, which considers the result of splitting the empty
            // string to be the empty array, Java returns an array containing
            // the empty string.
            return new String[] { "" };
        }

        int maxLength = limit <= 0 ? Integer.MAX_VALUE : limit;

        String input = inputSeq.toString();
        ArrayList<String> list = new ArrayList<String>();

        Matcher matcher = new Matcher(this, inputSeq);
        int savedPos = 0;

        // Add text preceding each occurrence while at least one slot remains
        // for the trailing text. The size check comes first so that no search
        // is performed once the limit has been reached.
        while (list.size() + 1 < maxLength && matcher.find()) {
            list.add(input.substring(savedPos, matcher.start()));
            savedPos = matcher.end();
        }

        // Add the trailing text. The loop above always leaves room for it, and
        // substring() yields "" when the last match ended at the end of input.
        list.add(input.substring(savedPos));

        // Remove trailing empty strings in the limit == 0 case.
        if (limit == 0) {
            int end = list.size();
            while (end > 0 && list.get(end - 1).length() == 0) {
                end--;
            }
            list.subList(end, list.size()).clear();
        }

        return list.toArray(new String[list.size()]);
    }

    /**
     * Quotes a given string using "\Q" and "\E", so that all other
     * meta-characters lose their special meaning. If the string is used for a
     * {@code Pattern} afterwards, it can only be matched literally.
     * <p>
     * Any "\E" already contained in the string would end the quoted section
     * prematurely, so each one is followed by an escaped literal "\\E" and a
     * new "\Q" that reopens the quoting.
     *
     * @param s
     *            the string to quote.
     *
     * @return the quoted string.
     *
     * @throws NullPointerException
     *             if {@code s} is {@code null}.
     *
     * @since Android 1.0
     */
    public static String quote(String s) {
        StringBuilder sb = new StringBuilder().append("\\Q");
        int apos = 0;

        int k;
        while ((k = s.indexOf("\\E", apos)) >= 0) {
            // Copy everything up to and including the embedded "\E", then
            // emit a literal "\E" and reopen the quoted section.
            sb.append(s, apos, k + 2).append("\\\\E\\Q");
            apos = k + 2;
        }

        return sb.append(s, apos, s.length()).append("\\E").toString();
    }

    /**
     * Returns the regular expression this {@code Pattern} was created from,
     * exactly as {@link #pattern()} does.
     *
     * @return the regular expression.
     */
    @Override
    public String toString() {
        return pattern;
    }

    /**
     * Releases the native ICU pattern, if one was opened, when this object is
     * garbage collected.
     */
    @Override
    protected void finalize() throws Throwable {
        try {
            if (mNativePattern != 0) {
                NativeRegEx.close(mNativePattern);
            }
        }
        finally {
            super.finalize();
        }
    }

    /**
     * Provides serialization support. Only the expression and the flags are
     * stored in the stream; the transient native state is rebuilt here by
     * compiling the expression again.
     */
    private void readObject(java.io.ObjectInputStream s)
            throws java.io.IOException, ClassNotFoundException {
        s.defaultReadObject();
        compileImpl(pattern, flags);
    }
}