blob: fcda11bfe804ef5de2d96b3dc0d1604d76a1e414 [file] [log] [blame]
/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
* JFlex 1.4.3 *
* Copyright (C) 1998-2009 Gerwin Klein <lsf@jflex.de> *
* All rights reserved. *
* *
* This program is free software; you can redistribute it and/or modify *
* it under the terms of the GNU General Public License. See the file *
* COPYRIGHT for more information. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU General Public License for more details. *
* *
* You should have received a copy of the GNU General Public License along *
* with this program; if not, write to the Free Software Foundation, Inc., *
* 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA *
* *
* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
package JFlex;
import java.util.*;
/**
 * Maintains a list of character classes, each stored as an IntCharSet.
 *
 * A new instance holds a single class covering every character from 0 to
 * {@link #maxChar}; the <code>makeClass</code> methods split existing classes
 * so that a given set of characters is separated from all other characters.
 * The index of a class in the internal list is its class code.
 *
 * @author Gerwin Klein
 * @version $Revision: 1.4.3 $, $Date: 2009/12/21 15:58:48 $
 */
public class CharClasses {

  /** debug flag (for char classes only) */
  private static final boolean DEBUG = false;

  /** the largest character that can be used in char classes */
  public static final char maxChar = '\uFFFF';

  /** the char classes; the position of a set in this Vector is its class code */
  private Vector /* of IntCharSet */ classes;

  /** the largest character actually used in a specification */
  private char maxCharUsed;


  /**
   * Creates a CharClasses object in which all characters belong to class 0.
   *
   * The initial class always spans 0 to {@link #maxChar}; the argument is
   * only recorded as the largest character code in use.
   *
   * @param maxCharCode  the last character code to be
   *                     considered. (127 for 7bit Lexers,
   *                     255 for 8bit Lexers and 0xFFFF
   *                     for Unicode Lexers).
   *
   * @throws IllegalArgumentException if maxCharCode is not in 0..0xFFFF
   */
  public CharClasses(int maxCharCode) {
    maxCharUsed = requireCharCode(maxCharCode);

    classes = new Vector();
    IntCharSet allChars = new IntCharSet(new Interval((char) 0, maxChar));
    classes.addElement(allChars);
  }


  /**
   * Converts an int to a char after verifying that it lies in 0..0xFFFF.
   *
   * @throws IllegalArgumentException if the value is outside that range
   */
  private static char requireCharCode(int code) {
    boolean inRange = 0 <= code && code <= 0xFFFF;
    if (!inRange) {
      throw new IllegalArgumentException();
    }
    return (char) code;
  }


  /**
   * Returns the greatest Unicode value of the current input character set.
   */
  public char getMaxCharCode() {
    return maxCharUsed;
  }


  /**
   * Sets the largest Unicode value of the current input character set.
   *
   * @param charCode   the largest character code, used for the scanner
   *                   (i.e. %7bit, %8bit, %16bit etc.)
   *
   * @throws IllegalArgumentException if charCode is not in 0..0xFFFF
   */
  public void setMaxCharCode(int charCode) {
    maxCharUsed = requireCharCode(charCode);
  }


  /**
   * Returns the current number of character classes.
   */
  public int getNumClasses() {
    return classes.size();
  }


  /**
   * Refines the current partition so that the characters of <code>set</code>
   * end up in classes that contain no characters from outside <code>set</code>.
   *
   * Every class that only partly overlaps <code>set</code> is split: the
   * overlap is removed from the old class and appended as a new class.
   *
   * Note that <code>sub</code> is invoked on the working set, so the object
   * passed in may be modified by this call (when <code>caseless</code> is
   * true the working set is the result of <code>set.getCaseless()</code>).
   *
   * @param set       the set of characters to distinguish from the rest
   * @param caseless  if true upper/lower/title case are considered equivalent
   */
  public void makeClass(IntCharSet set, boolean caseless) {
    if (caseless) {
      set = set.getCaseless();
    }

    if (DEBUG) {
      Out.dump("makeClass("+set+")");
      dump();
    }

    // Only the classes that exist on entry are examined; classes appended
    // during this call are not visited again.
    final int initialCount = classes.size();

    for (int index = 0; index < initialCount; index++) {
      IntCharSet current = (IntCharSet) classes.elementAt(index);

      // the requested set already is a class of its own: nothing to do
      if (current.equals(set)) {
        return;
      }

      IntCharSet common = current.and(set);
      if (!common.containsElements()) {
        continue;
      }

      // the class lies completely inside the set: keep the class as it is
      // and go on with what is left of the set
      if (current.equals(common)) {
        set.sub(common);
        continue;
      }

      // the set lies completely inside this class: split it off and stop
      if (set.equals(common)) {
        current.sub(common);
        classes.addElement(common);
        dumpFinished();
        return;
      }

      // partial overlap: the common part becomes a class of its own
      set.sub(common);
      current.sub(common);
      classes.addElement(common);
    }

    dumpFinished();
  }


  /**
   * Writes the "finished" trace of makeClass if debugging is switched on.
   */
  private void dumpFinished() {
    if (DEBUG) {
      Out.dump("makeClass(..) finished");
      dump();
    }
  }


  /**
   * Returns the code of the character class the specified character belongs to.
   *
   * The classes are searched in order of their codes; the code of the first
   * class containing the character is returned. If no class contains the
   * character, the lookup runs past the end of the class list.
   */
  public int getClassCode(char letter) {
    int code = 0;
    while (!((IntCharSet) classes.elementAt(code)).contains(letter)) {
      code++;
    }
    return code;
  }


  /**
   * Dump charclasses to the dump output stream
   */
  public void dump() {
    Out.dump(this.toString());
  }


  /**
   * Return a string representation of one char class
   *
   * @param theClass  the index of the class to convert to a String
   */
  public String toString(int theClass) {
    Object charClass = classes.elementAt(theClass);
    return charClass.toString();
  }


  /**
   * Return a string representation of the char classes
   * stored in this class.
   *
   * Enumerates the classes by index.
   */
  public String toString() {
    StringBuffer text = new StringBuffer("CharClasses:");
    text.append(Out.NL);

    int count = classes.size();
    for (int code = 0; code < count; code++) {
      text.append("class ").append(code).append(":").append(Out.NL);
      text.append(classes.elementAt(code)).append(Out.NL);
    }

    return text.toString();
  }


  /**
   * Creates a new character class for the single character <code>singleChar</code>.
   *
   * @param singleChar  the character that gets a class of its own
   * @param caseless    if true upper/lower/title case are considered equivalent
   */
  public void makeClass(char singleChar, boolean caseless) {
    IntCharSet single = new IntCharSet(singleChar);
    makeClass(single, caseless);
  }


  /**
   * Creates a new character class for each character of the specified String.
   *
   * @param str       the String whose characters each get their own class
   * @param caseless  if true upper/lower/title case are considered equivalent
   */
  public void makeClass(String str, boolean caseless) {
    char [] letters = str.toCharArray();
    for (int pos = 0; pos < letters.length; pos++) {
      makeClass(letters[pos], caseless);
    }
  }


  /**
   * Updates the current partition, so that the specified set of characters
   * gets a new character class.
   *
   * Characters that are elements of the set <code>v</code> are not in the same
   * equivalence class with characters that are not elements of the set <code>v</code>.
   *
   * @param v   a Vector of Interval objects.
   *            This Vector represents a set of characters. The set of characters is
   *            the union of all intervals in the Vector.
   *
   * @param caseless  if true upper/lower/title case are considered equivalent
   */
  public void makeClass(Vector /* Interval */ v, boolean caseless) {
    IntCharSet chars = new IntCharSet(v);
    makeClass(chars, caseless);
  }


  /**
   * Updates the current partition, so that the set of all characters not contained in the specified
   * set of characters gets a new character class.
   *
   * Separating the complement of <code>v</code> from the rest gives the same
   * partition as separating <code>v</code> itself, so this method performs
   * exactly the same steps as <code>makeClass(v, caseless)</code>.
   *
   * @param v   a Vector of Interval objects.
   *            This Vector represents a set of characters. The set of characters is
   *            the union of all intervals in the Vector.
   *
   * @param caseless  if true upper/lower/title case are considered equivalent
   */
  public void makeClassNot(Vector v, boolean caseless) {
    IntCharSet chars = new IntCharSet(v);
    makeClass(chars, caseless);
  }


  /**
   * Collects the codes of all classes that share at least one character with
   * <code>set</code> or, if <code>negate</code> is true, of all classes that
   * share no character with it.
   *
   * @param set     the set of characters to look up
   * @param negate  if true, select the classes that do not intersect the set
   *
   * @return the selected class codes in ascending order
   */
  private int [] getClassCodes(IntCharSet set, boolean negate) {

    if (DEBUG) {
      Out.dump("getting class codes for "+set);
      if (negate) {
        Out.dump("[negated]");
      }
    }

    int count = classes.size();

    // [fixme: optimize]
    // scratch array large enough for the case that every class is selected
    int [] selected = new int [count];
    int found = 0;

    for (int code = 0; code < count; code++) {
      IntCharSet candidate = (IntCharSet) classes.elementAt(code);
      boolean overlaps = set.and(candidate).containsElements();

      // plain lookup keeps overlapping classes, negated lookup the others
      if (overlaps != negate) {
        selected[found++] = code;
        if (DEBUG) Out.dump("code "+code);
      }
    }

    int [] codes = new int [found];
    System.arraycopy(selected, 0, codes, 0, found);

    return codes;
  }


  /**
   * Returns an array that contains the character class codes of all characters
   * in the specified set of input characters.
   *
   * @param intervallVec   a Vector of Intervals, the set of characters to get
   *                       the class codes for
   *
   * @return an array with the class codes for intervallVec
   */
  public int [] getClassCodes(Vector /* Interval */ intervallVec) {
    IntCharSet chars = new IntCharSet(intervallVec);
    return getClassCodes(chars, false);
  }


  /**
   * Returns an array that contains the character class codes of all characters
   * that are <strong>not</strong> in the specified set of input characters.
   *
   * @param intervallVec   a Vector of Intervals, the complement of the
   *                       set of characters to get the class codes for
   *
   * @return an array with the class codes for the complement of intervallVec
   */
  public int [] getNotClassCodes(Vector /* Interval */ intervallVec) {
    IntCharSet chars = new IntCharSet(intervallVec);
    return getClassCodes(chars, true);
  }


  /**
   * Check consistency of the stored classes [debug].
   *
   * Reports every pair of classes that is not disjoint on standard output and
   * then looks up the class code of every character from 0 to
   * {@link #maxChar}, printing a progress dot every 100 characters.
   */
  public void check() {
    for (int first = 0; first < classes.size(); first++) {
      IntCharSet a = (IntCharSet) classes.elementAt(first);

      for (int second = first + 1; second < classes.size(); second++) {
        IntCharSet b = (IntCharSet) classes.elementAt(second);

        if (a.and(b).containsElements()) {
          System.out.println("Error: non disjoint char classes "+first+" and "+second);
          System.out.println("class "+first+": "+a);
          System.out.println("class "+second+": "+b);
        }
      }
    }

    // check if each character has a classcode
    // (= if getClassCode terminates)
    // an int counter is used so that the loop can include maxChar itself
    for (int code = 0; code <= maxChar; code++) {
      getClassCode((char) code);
      if (code < maxChar && code % 100 == 0) {
        System.out.print(".");
      }
    }
  }


  /**
   * Returns an array of all CharClassIntervalls in this
   * char class collection.
   *
   * The array is ordered by char code, i.e.
   * <code>result[i+1].start = result[i].end+1</code>
   *
   * Each CharClassInterval contains the number of the
   * char class it belongs to.
   */
  public CharClassInterval [] getIntervals() {
    int total = 0;
    for (int code = 0; code < classes.size(); code++) {
      IntCharSet charClass = (IntCharSet) classes.elementAt(code);
      total += charClass.numIntervalls();
    }

    CharClassInterval [] intervals = new CharClassInterval[total];

    // first character that is not yet covered by an emitted interval
    int nextChar = 0;

    for (int filled = 0; filled < total; filled++) {
      int code = getClassCode((char) nextChar);
      IntCharSet owner = (IntCharSet) classes.elementAt(code);

      // NOTE(review): relies on getNext() handing out the intervals of a set
      // one after another in ascending order - confirm in IntCharSet
      Interval span = owner.getNext();

      intervals[filled] = new CharClassInterval(span.start, span.end, code);
      nextChar = span.end + 1;
    }

    return intervals;
  }
}