| package com.fasterxml.jackson.core.sym; |
| |
| import java.util.Arrays; |
| import java.util.BitSet; |
| import java.util.concurrent.atomic.AtomicReference; |
| |
| import com.fasterxml.jackson.core.JsonFactory; |
| import com.fasterxml.jackson.core.util.InternCache; |
| |
| /** |
| * A caching symbol table implementation used for canonicalizing JSON field |
| * names (as {@link Name}s which are constructed directly from a byte-based |
| * input source). |
| * Complications arise from trying to do efficient reuse and merging of |
| * symbol tables, to be able to make use of usually shared vocabulary |
| * of subsequent parsing runs. |
| * |
| * @author Tatu Saloranta |
| */ |
| public final class BytesToNameCanonicalizer |
| { |
| private static final int DEFAULT_T_SIZE = 64; |
| |
| /** |
| * Let's not expand symbol tables past some maximum size; |
| * this should protected against OOMEs caused by large documents |
| * with unique (~= random) names. |
| */ |
| private static final int MAX_T_SIZE = 0x10000; // 64k entries == 256k mem |
| |
| /** |
| * Let's only share reasonably sized symbol tables. Max size set to 3/4 of 16k; |
| * this corresponds to 64k main hash index. This should allow for enough distinct |
| * names for almost any case. |
| */ |
| private final static int MAX_ENTRIES_FOR_REUSE = 6000; |
| |
| /** |
| * Also: to thwart attacks based on hash collisions (which may or may not |
| * be cheap to calculate), we will need to detect "too long" |
| * collision chains. |
| *<p> |
| * Note: longest chain we have been able to produce without malicious |
| * intent has been 10 (with "com.fasterxml.jackson.core.sym.TestSymbolTables"); |
| * our setting should be reasonable here. Also note that overflow |
| * chains are shared between multiple primary cells, which could cause |
| * problems for lower values. |
| *<p> |
| * Also note that value was lowered from 255 (2.3 and earlier) to 100 for 2.4 |
| * |
| * @since 2.1 |
| */ |
| private final static int MAX_COLL_CHAIN_LENGTH = 100; |
| |
| /** |
| * No point in trying to construct tiny tables, just need to resize soon. |
| */ |
| final static int MIN_HASH_SIZE = 16; |
| |
| /** |
| * We will also need to define initial size for collision list, |
| * when copying it. |
| */ |
| final static int INITIAL_COLLISION_LEN = 32; |
| |
| /** |
| * Bucket index is 8 bits, and value 0 is reserved to represent |
| * 'empty' status. |
| */ |
| final static int LAST_VALID_BUCKET = 0xFE; |
| |
| /* |
| /********************************************************** |
| /* Linkage, needed for merging symbol tables |
| /********************************************************** |
| */ |
| |
| /** |
| * Reference to the root symbol table, for child tables, so |
| * that they can merge table information back as necessary. |
| */ |
| final protected BytesToNameCanonicalizer _parent; |
| |
| /** |
| * Member that is only used by the root table instance: root |
| * passes immutable state into child instances, and children |
| * may return new state if they add entries to the table. |
| * Child tables do NOT use the reference. |
| */ |
| final protected AtomicReference<TableInfo> _tableInfo; |
| |
| /** |
| * Seed value we use as the base to make hash codes non-static between |
| * different runs, but still stable for lifetime of a single symbol table |
| * instance. |
| * This is done for security reasons, to avoid potential DoS attack via |
| * hash collisions. |
| * |
| * @since 2.1 |
| */ |
| final private int _seed; |
| |
| /* |
| /********************************************************** |
| /* Configuration |
| /********************************************************** |
| */ |
| |
| /** |
| * Whether canonical symbol Strings are to be intern()ed before added |
| * to the table or not. |
| *<p> |
| * NOTE: non-final to allow disabling intern()ing in case of excessive |
| * collisions. |
| */ |
| protected boolean _intern; |
| |
| /** |
| * Flag that indicates whether we should throw an exception if enough |
| * hash collisions are detected (true); or just worked around (false). |
| * |
| * @since 2.4 |
| */ |
| protected final boolean _failOnDoS; |
| |
| /* |
| /********************************************************** |
| /* Main table state |
| /********************************************************** |
| */ |
| |
| // // // First, global information |
| |
| /** |
| * Total number of Names in the symbol table; |
| * only used for child tables. |
| */ |
| protected int _count; |
| |
| /** |
| * We need to keep track of the longest collision list; this is needed |
| * both to indicate problems with attacks and to allow flushing for |
| * other cases. |
| * |
| * @since 2.1 |
| */ |
| protected int _longestCollisionList; |
| |
| // // // Then information regarding primary hash array and its |
| // // // matching Name array |
| |
| /** |
| * Mask used to truncate 32-bit hash value to current hash array |
| * size; essentially, hash array size - 1 (since hash array sizes |
| * are 2^N). |
| */ |
| protected int _hashMask; |
| |
| /** |
| * Array of 2^N size, which contains combination |
| * of 24-bits of hash (0 to indicate 'empty' slot), |
| * and 8-bit collision bucket index (0 to indicate empty |
| * collision bucket chain; otherwise subtract one from index) |
| */ |
| protected int[] _hash; |
| |
| /** |
| * Array that contains <code>Name</code> instances matching |
| * entries in <code>_mainHash</code>. Contains nulls for unused |
| * entries. |
| */ |
| protected Name[] _mainNames; |
| |
| // // // Then the collision/spill-over area info |
| |
| /** |
| * Array of heads of collision bucket chains; size dynamically |
| */ |
| protected Bucket[] _collList; |
| |
| /** |
| * Total number of Names in collision buckets (included in |
| * <code>_count</code> along with primary entries) |
| */ |
| protected int _collCount; |
| |
| /** |
| * Index of the first unused collision bucket entry (== size of |
| * the used portion of collision list): less than |
| * or equal to 0xFF (255), since max number of entries is 255 |
| * (8-bit, minus 0 used as 'empty' marker) |
| */ |
| protected int _collEnd; |
| |
| // // // Info regarding pending rehashing... |
| |
| /** |
| * This flag is set if, after adding a new entry, it is deemed |
| * that a rehash is warranted if any more entries are to be added. |
| */ |
| private transient boolean _needRehash; |
| |
| /* |
| /********************************************************** |
| /* Sharing, versioning |
| /********************************************************** |
| */ |
| |
| // // // Which of the buffers may be shared (and are copy-on-write)? |
| |
| /** |
| * Flag that indicates whether underlying data structures for |
| * the main hash area are shared or not. If they are, then they |
| * need to be handled in copy-on-write way, i.e. if they need |
| * to be modified, a copy needs to be made first; at this point |
| * it will not be shared any more, and can be modified. |
| *<p> |
| * This flag needs to be checked both when adding new main entries, |
| * and when adding new collision list queues (i.e. creating a new |
| * collision list head entry) |
| */ |
| private boolean _hashShared; |
| |
| private boolean _namesShared; |
| |
| /** |
| * Flag that indicates whether underlying data structures for |
| * the collision list are shared or not. If they are, then they |
| * need to be handled in copy-on-write way, i.e. if they need |
| * to be modified, a copy needs to be made first; at this point |
| * it will not be shared any more, and can be modified. |
| *<p> |
| * This flag needs to be checked when adding new collision entries. |
| */ |
| private boolean _collListShared; |
| |
| /* |
| /********************************************************** |
| /* Bit of DoS detection goodness |
| /********************************************************** |
| */ |
| |
| /** |
| * Lazily constructed structure that is used to keep track of |
| * collision buckets that have overflowed once: this is used |
| * to detect likely attempts at denial-of-service attacks that |
| * uses hash collisions. |
| * |
| * @since 2.4 |
| */ |
| protected BitSet _overflows; |
| |
| /* |
| /********************************************************** |
| /* Life-cycle: constructors |
| /********************************************************** |
| */ |
| |
| /** |
| * Constructor used for creating per-<code>JsonFactory</code> "root" |
| * symbol tables: ones used for merging and sharing common symbols |
| * |
| * @param sz Initial hash area size |
| * @param intern Whether Strings contained should be {@link String#intern}ed |
| * @param seed Random seed valued used to make it more difficult to cause |
| * collisions (used for collision-based DoS attacks). |
| */ |
| private BytesToNameCanonicalizer(int sz, boolean intern, int seed, boolean failOnDoS) { |
| _parent = null; |
| _seed = seed; |
| _intern = intern; |
| _failOnDoS = failOnDoS; |
| // Sanity check: let's now allow hash sizes below certain minimum value |
| if (sz < MIN_HASH_SIZE) { |
| sz = MIN_HASH_SIZE; |
| } else { |
| /* Also; size must be 2^N; otherwise hash algorithm won't |
| * work... so let's just pad it up, if so |
| */ |
| if ((sz & (sz - 1)) != 0) { // only true if it's 2^N |
| int curr = MIN_HASH_SIZE; |
| while (curr < sz) { |
| curr += curr; |
| } |
| sz = curr; |
| } |
| } |
| _tableInfo = new AtomicReference<TableInfo>(initTableInfo(sz)); |
| } |
| |
| /** |
| * Constructor used when creating a child instance |
| */ |
| private BytesToNameCanonicalizer(BytesToNameCanonicalizer parent, boolean intern, |
| int seed, boolean failOnDoS, TableInfo state) |
| { |
| _parent = parent; |
| _seed = seed; |
| _intern = intern; |
| _failOnDoS = failOnDoS; |
| _tableInfo = null; // not used by child tables |
| |
| // Then copy shared state |
| _count = state.count; |
| _hashMask = state.mainHashMask; |
| _hash = state.mainHash; |
| _mainNames = state.mainNames; |
| _collList = state.collList; |
| _collCount = state.collCount; |
| _collEnd = state.collEnd; |
| _longestCollisionList = state.longestCollisionList; |
| |
| // and then set other state to reflect sharing status |
| _needRehash = false; |
| _hashShared = true; |
| _namesShared = true; |
| _collListShared = true; |
| } |
| |
| /* |
| public TableInfo(int count, int mainHashMask, int[] mainHash, Name[] mainNames, |
| Bucket[] collList, int collCount, int collEnd, int longestCollisionList) |
| */ |
| private TableInfo initTableInfo(int sz) { |
| return new TableInfo(0, // count |
| sz - 1, // mainHashMask |
| new int[sz], // mainHash |
| new Name[sz], // mainNames |
| null, // collList |
| 0, // collCount, |
| 0, // collEnd |
| 0 // longestCollisionList |
| ); |
| } |
| |
| /* |
| /********************************************************** |
| /* Life-cycle: factory methods, merging |
| /********************************************************** |
| */ |
| |
| /** |
| * Factory method to call to create a symbol table instance with a |
| * randomized seed value. |
| */ |
| public static BytesToNameCanonicalizer createRoot() { |
| /* [Issue-21]: Need to use a variable seed, to thwart hash-collision |
| * based attacks. |
| */ |
| long now = System.currentTimeMillis(); |
| // ensure it's not 0; and might as well require to be odd so: |
| int seed = (((int) now) + ((int) (now >>> 32))) | 1; |
| return createRoot(seed); |
| } |
| |
| /** |
| * Factory method that should only be called from unit tests, where seed |
| * value should remain the same. |
| */ |
| protected static BytesToNameCanonicalizer createRoot(int seed) { |
| return new BytesToNameCanonicalizer(DEFAULT_T_SIZE, true, seed, true); |
| } |
| |
| /** |
| * Factory method used to create actual symbol table instance to |
| * use for parsing. |
| */ |
| public BytesToNameCanonicalizer makeChild(int flags) { |
| return new BytesToNameCanonicalizer(this, |
| JsonFactory.Feature.INTERN_FIELD_NAMES.enabledIn(flags), |
| _seed, |
| JsonFactory.Feature.FAIL_ON_SYMBOL_HASH_OVERFLOW.enabledIn(flags), |
| _tableInfo.get()); |
| } |
| |
| @Deprecated // since 2.4 |
| public BytesToNameCanonicalizer makeChild(boolean canonicalize, boolean intern) { |
| return new BytesToNameCanonicalizer(this, intern, _seed, |
| true, // JsonFactory.Feature.FAIL_ON_SYMBOL_HASH_OVERFLOW |
| _tableInfo.get()); |
| } |
| |
| /** |
| * Method called by the using code to indicate it is done |
| * with this instance. This lets instance merge accumulated |
| * changes into parent (if need be), safely and efficiently, |
| * and without calling code having to know about parent |
| * information |
| */ |
| public void release() |
| { |
| // we will try to merge if child table has new entries |
| if (_parent != null && maybeDirty()) { |
| _parent.mergeChild(new TableInfo(this)); |
| /* Let's also mark this instance as dirty, so that just in |
| * case release was too early, there's no corruption of possibly shared data. |
| */ |
| _hashShared = true; |
| _namesShared = true; |
| _collListShared = true; |
| } |
| } |
| |
| private void mergeChild(TableInfo childState) |
| { |
| final int childCount = childState.count; |
| TableInfo currState = _tableInfo.get(); |
| |
| /* Should usually grow; but occasionally could also shrink if |
| * (but only if) collision list overflow ends up clearing |
| * some collision lists. |
| */ |
| if (childCount == currState.count) { |
| return; |
| } |
| |
| /* One caveat: let's try to avoid problems with |
| * degenerate cases of documents with generated "random" |
| * names: for these, symbol tables would bloat indefinitely. |
| * One way to do this is to just purge tables if they grow |
| * too large, and that's what we'll do here. |
| */ |
| if (childCount > MAX_ENTRIES_FOR_REUSE) { |
| /* Should there be a way to get notified about this |
| * event, to log it or such? (as it's somewhat abnormal |
| * thing to happen) |
| */ |
| // At any rate, need to clean up the tables |
| childState = initTableInfo(DEFAULT_T_SIZE); |
| } |
| _tableInfo.compareAndSet(currState, childState); |
| } |
| |
| /* |
| /********************************************************** |
| /* API, accessors |
| /********************************************************** |
| */ |
| |
| public int size() |
| { |
| if (_tableInfo != null) { // root table |
| return _tableInfo.get().count; |
| } |
| // nope, child table |
| return _count; |
| } |
| |
| /** |
| * @since 2.1 |
| */ |
| public int bucketCount() { return _hash.length; } |
| |
| /** |
| * Method called to check to quickly see if a child symbol table |
| * may have gotten additional entries. Used for checking to see |
| * if a child table should be merged into shared table. |
| */ |
| public boolean maybeDirty() { return !_hashShared; } |
| |
| /** |
| * @since 2.1 |
| */ |
| public int hashSeed() { return _seed; } |
| |
| /** |
| * Method mostly needed by unit tests; calculates number of |
| * entries that are in collision list. Value can be at most |
| * ({@link #size} - 1), but should usually be much lower, ideally 0. |
| * |
| * @since 2.1 |
| */ |
| public int collisionCount() { return _collCount; } |
| |
| /** |
| * Method mostly needed by unit tests; calculates length of the |
| * longest collision chain. This should typically be a low number, |
| * but may be up to {@link #size} - 1 in the pathological case |
| * |
| * @since 2.1 |
| */ |
| public int maxCollisionLength() { |
| return _longestCollisionList; |
| } |
| |
| /* |
| /********************************************************** |
| /* Public API, accessing symbols: |
| /********************************************************** |
| */ |
| |
| public static Name getEmptyName() { |
| return Name1.getEmptyName(); |
| } |
| |
| /** |
| * Finds and returns name matching the specified symbol, if such |
| * name already exists in the table. |
| * If not, will return null. |
| *<p> |
| * Note: separate methods to optimize common case of |
| * short element/attribute names (4 or less ascii characters) |
| * |
| * @param q1 int32 containing first 4 bytes of the name; |
| * if the whole name less than 4 bytes, padded with zero bytes |
| * in front (zero MSBs, ie. right aligned) |
| * |
| * @return Name matching the symbol passed (or constructed for |
| * it) |
| */ |
| public Name findName(int q1) |
| { |
| int hash = calcHash(q1); |
| int ix = (hash & _hashMask); |
| int val = _hash[ix]; |
| |
| /* High 24 bits of the value are low 24 bits of hash (low 8 bits |
| * are bucket index)... match? |
| */ |
| if ((((val >> 8) ^ hash) << 8) == 0) { // match |
| // Ok, but do we have an actual match? |
| Name name = _mainNames[ix]; |
| if (name == null) { // main slot empty; can't find |
| return null; |
| } |
| if (name.equals(q1)) { |
| return name; |
| } |
| } else if (val == 0) { // empty slot? no match |
| return null; |
| } |
| // Maybe a spill-over? |
| val &= 0xFF; |
| if (val > 0) { // 0 means 'empty' |
| val -= 1; // to convert from 1-based to 0... |
| Bucket bucket = _collList[val]; |
| if (bucket != null) { |
| return bucket.find(hash, q1, 0); |
| } |
| } |
| // Nope, no match whatsoever |
| return null; |
| } |
| |
| /** |
| * Finds and returns name matching the specified symbol, if such |
| * name already exists in the table. |
| * If not, will return null. |
| *<p> |
| * Note: separate methods to optimize common case of relatively |
| * short element/attribute names (8 or less ascii characters) |
| * |
| * @param q1 int32 containing first 4 bytes of the name. |
| * @param q2 int32 containing bytes 5 through 8 of the |
| * name; if less than 8 bytes, padded with up to 3 zero bytes |
| * in front (zero MSBs, ie. right aligned) |
| * |
| * @return Name matching the symbol passed (or constructed for it) |
| */ |
| public Name findName(int q1, int q2) |
| { |
| int hash = (q2 == 0) ? calcHash(q1) : calcHash(q1, q2); |
| int ix = (hash & _hashMask); |
| int val = _hash[ix]; |
| |
| /* High 24 bits of the value are low 24 bits of hash (low 8 bits |
| * are bucket index)... match? |
| */ |
| if ((((val >> 8) ^ hash) << 8) == 0) { // match |
| // Ok, but do we have an actual match? |
| Name name = _mainNames[ix]; |
| if (name == null) { // main slot empty; can't find |
| return null; |
| } |
| if (name.equals(q1, q2)) { |
| return name; |
| } |
| } else if (val == 0) { // empty slot? no match |
| return null; |
| } |
| // Maybe a spill-over? |
| val &= 0xFF; |
| if (val > 0) { // 0 means 'empty' |
| val -= 1; // to convert from 1-based to 0... |
| Bucket bucket = _collList[val]; |
| if (bucket != null) { |
| return bucket.find(hash, q1, q2); |
| } |
| } |
| // Nope, no match whatsoever |
| return null; |
| } |
| |
| /** |
| * Finds and returns name matching the specified symbol, if such |
| * name already exists in the table; or if not, creates name object, |
| * adds to the table, and returns it. |
| *<p> |
| * Note: this is the general purpose method that can be called for |
| * names of any length. However, if name is less than 9 bytes long, |
| * it is preferable to call the version optimized for short |
| * names. |
| * |
| * @param q Array of int32s, each of which contain 4 bytes of |
| * encoded name |
| * @param qlen Number of int32s, starting from index 0, in quads |
| * parameter |
| * |
| * @return Name matching the symbol passed (or constructed for it) |
| */ |
| public Name findName(int[] q, int qlen) |
| { |
| if (qlen < 3) { // another sanity check |
| return findName(q[0], (qlen < 2) ? 0 : q[1]); |
| } |
| int hash = calcHash(q, qlen); |
| // (for rest of comments regarding logic, see method above) |
| int ix = (hash & _hashMask); |
| int val = _hash[ix]; |
| if ((((val >> 8) ^ hash) << 8) == 0) { |
| Name name = _mainNames[ix]; |
| if (name == null // main slot empty; no collision list then either |
| || name.equals(q, qlen)) { // should be match, let's verify |
| return name; |
| } |
| } else if (val == 0) { // empty slot? no match |
| return null; |
| } |
| val &= 0xFF; |
| if (val > 0) { // 0 means 'empty' |
| val -= 1; // to convert from 1-based to 0... |
| Bucket bucket = _collList[val]; |
| if (bucket != null) { |
| return bucket.find(hash, q, qlen); |
| } |
| } |
| return null; |
| } |
| |
| /* |
| /********************************************************** |
| /* API, mutators |
| /********************************************************** |
| */ |
| |
| public Name addName(String name, int q1, int q2) |
| { |
| if (_intern) { |
| name = InternCache.instance.intern(name); |
| } |
| int hash = (q2 == 0) ? calcHash(q1) : calcHash(q1, q2); |
| Name symbol = constructName(hash, name, q1, q2); |
| _addSymbol(hash, symbol); |
| return symbol; |
| } |
| |
| public Name addName(String name, int[] q, int qlen) |
| { |
| if (_intern) { |
| name = InternCache.instance.intern(name); |
| } |
| int hash; |
| if (qlen < 3) { |
| hash = (qlen == 1) ? calcHash(q[0]) : calcHash(q[0], q[1]); |
| } else { |
| hash = calcHash(q, qlen); |
| } |
| Name symbol = constructName(hash, name, q, qlen); |
| _addSymbol(hash, symbol); |
| return symbol; |
| } |
| |
| /* |
| /********************************************************** |
| /* Helper methods |
| /********************************************************** |
| */ |
| |
| /* Note on hash calculation: we try to make it more difficult to |
| * generate collisions automatically; part of this is to avoid |
| * simple "multiply-add" algorithm (like JDK String.hashCode()), |
| * and add bit of shifting. And other part is to make this |
| * non-linear, at least for shorter symbols. |
| */ |
| |
| // JDK uses 31; other fine choices are 33 and 65599, let's use 33 |
| // as it seems to give fewest collisions for us |
| // (see [http://www.cse.yorku.ca/~oz/hash.html] for details) |
| private final static int MULT = 33; |
| private final static int MULT2 = 65599; |
| private final static int MULT3 = 31; |
| |
| public int calcHash(int q1) |
| { |
| int hash = q1 ^ _seed; |
| hash += (hash >>> 15); // to xor hi- and low- 16-bits |
| hash ^= (hash >>> 9); // as well as lowest 2 bytes |
| return hash; |
| } |
| |
| public int calcHash(int q1, int q2) |
| { |
| /* For two quads, let's change algorithm a bit, to spice |
| * things up (can do bit more processing anyway) |
| */ |
| int hash = q1; |
| hash ^= (hash >>> 15); // try mixing first and second byte pairs first |
| hash += (q2 * MULT); // then add second quad |
| hash ^= _seed; |
| hash += (hash >>> 7); // and shuffle some more |
| return hash; |
| } |
| |
| public int calcHash(int[] q, int qlen) |
| { |
| // Note: may be called for qlen < 3; but has at least one int |
| if (qlen < 3) { |
| throw new IllegalArgumentException(); |
| } |
| |
| /* And then change handling again for "multi-quad" case; mostly |
| * to make calculation of collisions less fun. For example, |
| * add seed bit later in the game, and switch plus/xor around, |
| * use different shift lengths. |
| */ |
| int hash = q[0] ^ _seed; |
| hash += (hash >>> 9); |
| hash *= MULT; |
| hash += q[1]; |
| hash *= MULT2; |
| hash += (hash >>> 15); |
| hash ^= q[2]; |
| hash += (hash >>> 17); |
| |
| for (int i = 3; i < qlen; ++i) { |
| hash = (hash * MULT3) ^ q[i]; |
| // for longer entries, mess a bit in-between too |
| hash += (hash >>> 3); |
| hash ^= (hash << 7); |
| } |
| // and finally shuffle some more once done |
| hash += (hash >>> 15); // to get high-order bits to mix more |
| hash ^= (hash << 9); // as well as lowest 2 bytes |
| return hash; |
| } |
| |
| // Method only used by unit tests |
| protected static int[] calcQuads(byte[] wordBytes) { |
| int blen = wordBytes.length; |
| int[] result = new int[(blen + 3) / 4]; |
| for (int i = 0; i < blen; ++i) { |
| int x = wordBytes[i] & 0xFF; |
| |
| if (++i < blen) { |
| x = (x << 8) | (wordBytes[i] & 0xFF); |
| if (++i < blen) { |
| x = (x << 8) | (wordBytes[i] & 0xFF); |
| if (++i < blen) { |
| x = (x << 8) | (wordBytes[i] & 0xFF); |
| } |
| } |
| } |
| result[i >> 2] = x; |
| } |
| return result; |
| } |
| |
| /* |
| /********************************************************** |
| /* Standard methods |
| /********************************************************** |
| */ |
| |
| /* |
| @Override |
| public String toString() |
| { |
| StringBuilder sb = new StringBuilder(); |
| sb.append("[BytesToNameCanonicalizer, size: "); |
| sb.append(_count); |
| sb.append('/'); |
| sb.append(_mainHash.length); |
| sb.append(", "); |
| sb.append(_collCount); |
| sb.append(" coll; avg length: "); |
| |
| // Average length: minimum of 1 for all (1 == primary hit); |
| // and then 1 per each traversal for collisions/buckets |
| //int maxDist = 1; |
| int pathCount = _count; |
| for (int i = 0; i < _collEnd; ++i) { |
| int spillLen = _collList[i].length(); |
| for (int j = 1; j <= spillLen; ++j) { |
| pathCount += j; |
| } |
| } |
| double avgLength; |
| |
| if (_count == 0) { |
| avgLength = 0.0; |
| } else { |
| avgLength = (double) pathCount / (double) _count; |
| } |
| // let's round up a bit (two 2 decimal places) |
| //avgLength -= (avgLength % 0.01); |
| |
| sb.append(avgLength); |
| sb.append(']'); |
| return sb.toString(); |
| } |
| */ |
| |
| /* |
| /********************************************************** |
| /* Internal methods |
| /********************************************************** |
| */ |
| |
| private void _addSymbol(int hash, Name symbol) |
| { |
| if (_hashShared) { // always have to modify main entry |
| unshareMain(); |
| } |
| // First, do we need to rehash? |
| if (_needRehash) { |
| rehash(); |
| } |
| |
| ++_count; |
| |
| /* Ok, enough about set up: now we need to find the slot to add |
| * symbol in: |
| */ |
| int ix = (hash & _hashMask); |
| if (_mainNames[ix] == null) { // primary empty? |
| _hash[ix] = (hash << 8); |
| if (_namesShared) { |
| unshareNames(); |
| } |
| _mainNames[ix] = symbol; |
| } else { // nope, it's a collision, need to spill over |
| /* How about spill-over area... do we already know the bucket |
| * (is the case if it's not the first collision) |
| */ |
| if (_collListShared) { |
| unshareCollision(); // also allocates if list was null |
| } |
| ++_collCount; |
| int entryValue = _hash[ix]; |
| int bucket = entryValue & 0xFF; |
| if (bucket == 0) { // first spill over? |
| if (_collEnd <= LAST_VALID_BUCKET) { // yup, still unshared bucket |
| bucket = _collEnd; |
| ++_collEnd; |
| // need to expand? |
| if (bucket >= _collList.length) { |
| expandCollision(); |
| } |
| } else { // nope, have to share... let's find shortest? |
| bucket = findBestBucket(); |
| } |
| // Need to mark the entry... and the spill index is 1-based |
| _hash[ix] = (entryValue & ~0xFF) | (bucket + 1); |
| } else { |
| --bucket; // 1-based index in value |
| } |
| |
| // And then just need to link the new bucket entry in |
| Bucket newB = new Bucket(symbol, _collList[bucket]); |
| int collLen = newB.length; |
| if (collLen > MAX_COLL_CHAIN_LENGTH) { |
| /* 23-May-2014, tatu: Instead of throwing an exception right away, let's handle |
| * in bit smarter way. |
| */ |
| _handleSpillOverflow(bucket, newB); |
| } else { |
| _collList[bucket] = newB; |
| // but, be careful wrt attacks |
| _longestCollisionList = Math.max(newB.length, _longestCollisionList); |
| } |
| } |
| |
| /* Ok. Now, do we need a rehash next time? Need to have at least |
| * 50% fill rate no matter what: |
| */ |
| { |
| int hashSize = _hash.length; |
| if (_count > (hashSize >> 1)) { |
| int hashQuarter = (hashSize >> 2); |
| /* And either strictly above 75% (the usual) or |
| * just 50%, and collision count >= 25% of total hash size |
| */ |
| if (_count > (hashSize - hashQuarter)) { |
| _needRehash = true; |
| } else if (_collCount >= hashQuarter) { |
| _needRehash = true; |
| } |
| } |
| } |
| } |
| |
| private void _handleSpillOverflow(int bindex, Bucket newBucket) |
| { |
| if (_overflows == null) { |
| _overflows = new BitSet(); |
| _overflows.set(bindex); |
| } else { |
| if (_overflows.get(bindex)) { |
| // Has happened once already, so not a coincident... |
| if (_failOnDoS) { |
| reportTooManyCollisions(MAX_COLL_CHAIN_LENGTH); |
| } |
| // but even if we don't fail, we will stop intern()ing |
| _intern = false; |
| } else { |
| _overflows.set(bindex); |
| } |
| } |
| // regardless, if we get this far, clear up the bucket, adjust size appropriately. |
| _collList[bindex] = null; |
| _count -= (newBucket.length); |
| // we could calculate longest; but for now just mark as invalid |
| _longestCollisionList = -1; |
| } |
| |
| private void rehash() |
| { |
| _needRehash = false; |
| // Note: since we'll make copies, no need to unshare, can just mark as such: |
| _namesShared = false; |
| |
| /* And then we can first deal with the main hash area. Since we |
| * are expanding linearly (double up), we know there'll be no |
| * collisions during this phase. |
| */ |
| int[] oldMainHash = _hash; |
| int len = oldMainHash.length; |
| int newLen = len+len; |
| |
| /* 13-Mar-2010, tatu: Let's guard against OOME that could be caused by |
| * large documents with unique (or mostly so) names |
| */ |
| if (newLen > MAX_T_SIZE) { |
| nukeSymbols(); |
| return; |
| } |
| |
| _hash = new int[newLen]; |
| _hashMask = (newLen - 1); |
| Name[] oldNames = _mainNames; |
| _mainNames = new Name[newLen]; |
| int symbolsSeen = 0; // let's do a sanity check |
| for (int i = 0; i < len; ++i) { |
| Name symbol = oldNames[i]; |
| if (symbol != null) { |
| ++symbolsSeen; |
| int hash = symbol.hashCode(); |
| int ix = (hash & _hashMask); |
| _mainNames[ix] = symbol; |
| _hash[ix] = hash << 8; // will clear spill index |
| } |
| } |
| |
| /* And then the spill area. This may cause collisions, although |
| * not necessarily as many as there were earlier. Let's allocate |
| * same amount of space, however |
| */ |
| int oldEnd = _collEnd; |
| if (oldEnd == 0) { // no prior collisions... |
| _longestCollisionList = 0; |
| return; |
| } |
| |
| _collCount = 0; |
| _collEnd = 0; |
| _collListShared = false; |
| |
| int maxColl = 0; |
| |
| Bucket[] oldBuckets = _collList; |
| _collList = new Bucket[oldBuckets.length]; |
| for (int i = 0; i < oldEnd; ++i) { |
| for (Bucket curr = oldBuckets[i]; curr != null; curr = curr.next) { |
| ++symbolsSeen; |
| Name symbol = curr.name; |
| int hash = symbol.hashCode(); |
| int ix = (hash & _hashMask); |
| int val = _hash[ix]; |
| if (_mainNames[ix] == null) { // no primary entry? |
| _hash[ix] = (hash << 8); |
| _mainNames[ix] = symbol; |
| } else { // nope, it's a collision, need to spill over |
| ++_collCount; |
| int bucket = val & 0xFF; |
| if (bucket == 0) { // first spill over? |
| if (_collEnd <= LAST_VALID_BUCKET) { // yup, still unshared bucket |
| bucket = _collEnd; |
| ++_collEnd; |
| // need to expand? |
| if (bucket >= _collList.length) { |
| expandCollision(); |
| } |
| } else { // nope, have to share... let's find shortest? |
| bucket = findBestBucket(); |
| } |
| // Need to mark the entry... and the spill index is 1-based |
| _hash[ix] = (val & ~0xFF) | (bucket + 1); |
| } else { |
| --bucket; // 1-based index in value |
| } |
| // And then just need to link the new bucket entry in |
| Bucket newB = new Bucket(symbol, _collList[bucket]); |
| _collList[bucket] = newB; |
| maxColl = Math.max(maxColl, newB.length); |
| } |
| } // for (... buckets in the chain ...) |
| } // for (... list of bucket heads ... ) |
| |
| _longestCollisionList = maxColl; |
| |
| if (symbolsSeen != _count) { // sanity check |
| throw new RuntimeException("Internal error: count after rehash "+symbolsSeen+"; should be "+_count); |
| } |
| } |
| |
| /** |
| * Helper method called to empty all shared symbols, but to leave |
| * arrays allocated |
| */ |
| private void nukeSymbols() { |
| _count = 0; |
| _longestCollisionList = 0; |
| Arrays.fill(_hash, 0); |
| Arrays.fill(_mainNames, null); |
| Arrays.fill(_collList, null); |
| _collCount = 0; |
| _collEnd = 0; |
| } |
| |
| /** |
| * Method called to find the best bucket to spill a Name over to: |
| * usually the first bucket that has only one entry, but in general |
| * first one of the buckets with least number of entries |
| */ |
| private int findBestBucket() { |
| Bucket[] buckets = _collList; |
| int bestCount = Integer.MAX_VALUE; |
| int bestIx = -1; |
| |
| for (int i = 0, len = _collEnd; i < len; ++i) { |
| Bucket b = buckets[i]; |
| // [#145] may become null due to long overflow chain |
| if (b == null) { |
| return i; |
| } |
| int count = b.length; |
| if (count < bestCount) { |
| if (count == 1) { // best possible |
| return i; |
| } |
| bestCount = count; |
| bestIx = i; |
| } |
| } |
| return bestIx; |
| } |
| |
| /** |
| * Method that needs to be called, if the main hash structure |
| * is (may be) shared. This happens every time something is added, |
| * even if addition is to the collision list (since collision list |
| * index comes from lowest 8 bits of the primary hash entry) |
| */ |
| private void unshareMain() { |
| final int[] old = _hash; |
| _hash = Arrays.copyOf(old, old.length); |
| _hashShared = false; |
| } |
| |
| private void unshareCollision() { |
| Bucket[] old = _collList; |
| if (old == null) { |
| _collList = new Bucket[INITIAL_COLLISION_LEN]; |
| } else { |
| _collList = Arrays.copyOf(old, old.length); |
| } |
| _collListShared = false; |
| } |
| |
| private void unshareNames() { |
| final Name[] old = _mainNames; |
| _mainNames = Arrays.copyOf(old, old.length); |
| _namesShared = false; |
| } |
| |
| private void expandCollision() { |
| final Bucket[] old = _collList; |
| _collList = Arrays.copyOf(old, old.length * 2); |
| } |
| |
| /* |
| /********************************************************** |
| /* Constructing name objects |
| /********************************************************** |
| */ |
| |
| private static Name constructName(int hash, String name, int q1, int q2) { |
| if (q2 == 0) { // one quad only? |
| return new Name1(name, hash, q1); |
| } |
| return new Name2(name, hash, q1, q2); |
| } |
| |
| private static Name constructName(int hash, String name, int[] quads, int qlen) { |
| if (qlen < 4) { // Need to check for 3 quad one, can do others too |
| switch (qlen) { |
| case 1: |
| return new Name1(name, hash, quads[0]); |
| case 2: |
| return new Name2(name, hash, quads[0], quads[1]); |
| case 3: |
| return new Name3(name, hash, quads[0], quads[1], quads[2]); |
| default: |
| } |
| } |
| return NameN.construct(name, hash, quads, qlen); |
| } |
| |
| /* |
| /********************************************************** |
| /* Other helper methods |
| /********************************************************** |
| */ |
| |
| /** |
| * @since 2.1 |
| */ |
| protected void reportTooManyCollisions(int maxLen) |
| { |
| throw new IllegalStateException("Longest collision chain in symbol table (of size "+_count |
| +") now exceeds maximum, "+maxLen+" -- suspect a DoS attack based on hash collisions"); |
| } |
| |
| /* |
| /********************************************************** |
| /* Helper classes |
| /********************************************************** |
| */ |
| |
| /** |
| * Immutable value class used for sharing information as efficiently |
| * as possible, by only require synchronization of reference manipulation |
| * but not access to contents. |
| * |
| * @since 2.1 |
| */ |
| private final static class TableInfo |
| { |
| public final int count; |
| public final int mainHashMask; |
| public final int[] mainHash; |
| public final Name[] mainNames; |
| public final Bucket[] collList; |
| public final int collCount; |
| public final int collEnd; |
| public final int longestCollisionList; |
| |
| public TableInfo(int count, int mainHashMask, int[] mainHash, Name[] mainNames, |
| Bucket[] collList, int collCount, int collEnd, int longestCollisionList) |
| { |
| this.count = count; |
| this.mainHashMask = mainHashMask; |
| this.mainHash = mainHash; |
| this.mainNames = mainNames; |
| this.collList = collList; |
| this.collCount = collCount; |
| this.collEnd = collEnd; |
| this.longestCollisionList = longestCollisionList; |
| } |
| |
| public TableInfo(BytesToNameCanonicalizer src) |
| { |
| count = src._count; |
| mainHashMask = src._hashMask; |
| mainHash = src._hash; |
| mainNames = src._mainNames; |
| collList = src._collList; |
| collCount = src._collCount; |
| collEnd = src._collEnd; |
| longestCollisionList = src._longestCollisionList; |
| } |
| } |
| |
| final private static class Bucket |
| { |
| public final Name name; |
| public final Bucket next; |
| public final int hash; |
| public final int length; |
| |
| Bucket(Name name, Bucket next) { |
| this.name = name; |
| this.next = next; |
| length = (next == null) ? 1 : next.length+1; |
| hash = name.hashCode(); |
| } |
| |
| public Name find(int h, int firstQuad, int secondQuad) { |
| if (hash == h) { |
| if (name.equals(firstQuad, secondQuad)) { |
| return name; |
| } |
| } |
| for (Bucket curr = next; curr != null; curr = curr.next) { |
| if (curr.hash == h) { |
| Name currName = curr.name; |
| if (currName.equals(firstQuad, secondQuad)) { |
| return currName; |
| } |
| } |
| } |
| return null; |
| } |
| |
| public Name find(int h, int[] quads, int qlen) { |
| if (hash == h) { |
| if (name.equals(quads, qlen)) { |
| return name; |
| } |
| } |
| for (Bucket curr = next; curr != null; curr = curr.next) { |
| if (curr.hash == h) { |
| Name currName = curr.name; |
| if (currName.equals(quads, qlen)) { |
| return currName; |
| } |
| } |
| } |
| return null; |
| } |
| } |
| } |