blob: 8c593a37038bc0fd04a56a618187bccf64008d3a [file] [log] [blame]
/*
* Copyright (c) 2014, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation. Oracle designates this
* particular file as subject to the "Classpath" exception as provided
* by Oracle in the LICENSE file that accompanied this code.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
package jdk.tools.jlink.internal;
import java.lang.reflect.Array;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import jdk.internal.jimage.ImageStringsReader;
/*
* The algorithm used here is outlined in Applications of Finite Automata
* Representing Large Vocabularies - Claudio L Lucchesi and Tomasz Kowaltowski,
* 1992, and A Practical Minimal Perfect Hashing Method - Fabiano C. Botelho,
* Yoshiharu Kohayakawa, and Nivio Ziviani, 2005.
*
* The primary JDK use of this algorithm is managing the jimage location index.
*
* The goal of PerfectHashBuilder is to construct an automaton which maps a
* string key to a unique index 0..N-1, where N is the number of key-value pairs.
* What makes MPHM effective is that the size of the lookup table is N or very
* near N, and a lookup needs one hash computation in the best case and two in
* the worst case.
*
* The result of PerfectHashBuilder is two integer arrays, redirect and order.
* The redirect table provides a 1-1 mapping to the order table, using the
* reader algorithm described further on. The order table provides a mapping
* to entries. If entries are fixed size and can be put in a direct table, then
* the order table can be used to construct the direct table and then discarded.
*
* The steps for constructing the lookup tables are as follows;
*
* - Compute an MPHM hash for each key, based on a fixed base value modulo N.
* Note, the hash is based on the modified UTF-8 of the key, simplifying
* computation in native code.
*
* - Combine keys that map to the same hash code (collisions) into bucket
* chains.
*
* - Sort bucket chains by length of chains, longest first (most collisions.)
* Sorting is done to pack the redirect table with the worst collision
* offenders first.
*
* - For each chain, recompute the hash of each key using a new base value.
* Recomputation should give a different key distribution. A tally is kept
* of where the key maps, using the order table. The tally is used to detect
* new collisions. If there are further collisions, then restart
* redistribution using a different hash base value. If a chain is
* successfully distributed, then the base value used to compute the hash
* is recorded in the redirect table.
*
* - Once all colliding chains are resolved (length > 1), then the chains with
* only one entry are used to fill in the empty slots in the order table.
* These keys are recorded in the redirect table using the twos complement
* of the order index.
*
* - It is possible that a given set of keys cannot be packed into a table of
* size N. If this situation occurs then the size of the table is
* adjusted so that keys distribute differently.
*
* Reader's algorithm;
*
* - Compute the hash for the key using the fixed base value modulo N. This
* will provide an index into the redirect table. The integer value in the
* redirect table will determine the next step.
*
* - If the value in the redirect table is positive, then that value is used
* to rehash the key to get the index into the order table.
*
* - If the value in the redirect table is negative, then that value is the
* twos complement of the index into the order table.
*
* - If the value in the redirect table is zero, then there is no matching
* entry.
*
* - Note that the resulting entry needs to be validated to ensure a match.
* This is typically done by comparing the key with the key in entry.
*/
/**
 * Builds the {@code redirect} and {@code order} tables of a perfect hash over
 * a set of string keys.
 *
 * <p>Typical use: add entries with {@link #put(String, Object)}, call
 * {@link #generate()}, then read the tables with {@link #getRedirect()} and
 * {@link #getOrder()}.
 *
 * @param <E> type of the value associated with each key
 */
public class PerfectHashBuilder<E> {
    /** Maximum number of reseed attempts per bucket before the table is grown. */
    private static final int RETRY_LIMIT = 1000;

    /** Clears the sign bit so a hash can safely be reduced modulo the table size. */
    private static final int POSITIVE_MASK = 0x7FFFFFFF;

    /** Runtime component type used to allocate the order table. */
    private final Class<?> entryComponent;

    /** Runtime component type used to allocate bucket arrays. */
    private final Class<?> bucketComponent;

    /** Entries by key, in insertion order. */
    private final Map<String, Entry<E>> map = new LinkedHashMap<>();

    private int[] redirect;
    private Entry<E>[] order;

    /**
     * Current table size. Starts as the number of distinct keys added and is
     * bumped by {@link #generate()} whenever packing fails.
     */
    private int count = 0;

    /**
     * A key/value pair. Equality and hashing are based on the key only.
     *
     * @param <E> type of the value
     */
    @SuppressWarnings("EqualsAndHashcode")
    public static class Entry<E> {
        private final String key;
        private final E value;

        /** Creates an entry with an empty key and a {@code null} value. */
        Entry() {
            this("", null);
        }

        Entry(String key, E value) {
            this.key = key;
            this.value = value;
        }

        String getKey() {
            return key;
        }

        E getValue() {
            return value;
        }

        /** Hashes the key with the given seed via {@code ImageStringsReader.hashCode}. */
        int hashCode(int seed) {
            return ImageStringsReader.hashCode(key, seed);
        }

        /** Hashes the key with the default seed of {@code ImageStringsReader.hashCode}. */
        @Override
        public int hashCode() {
            return ImageStringsReader.hashCode(key);
        }

        @Override
        public boolean equals(Object other) {
            if (other == this) {
                return true;
            }
            if (!(other instanceof Entry)) {
                return false;
            }
            Entry<?> entry = (Entry<?>) other;
            return entry.key.equals(key);
        }
    }

    /**
     * A chain of entries whose keys hash to the same table slot. Buckets order
     * themselves longest chain first.
     *
     * @param <E> type of the entry values
     */
    static class Bucket<E> implements Comparable<Bucket<E>> {
        final List<Entry<E>> list = new ArrayList<>();

        void add(Entry<E> entry) {
            list.add(entry);
        }

        int getSize() {
            return list.size();
        }

        List<Entry<E>> getList() {
            return list;
        }

        Entry<E> getFirst() {
            assert !list.isEmpty() : "bucket should never be empty";
            return list.get(0);
        }

        /** The bucket's hash is the hash of its first entry. */
        @Override
        public int hashCode() {
            return getFirst().hashCode();
        }

        /** Buckets are only equal to themselves. */
        @Override
        @SuppressWarnings("EqualsWhichDoesntCheckParameterClass")
        public boolean equals(Object obj) {
            return this == obj;
        }

        /** Orders buckets by descending size (longest chain first). */
        @Override
        public int compareTo(Bucket<E> o) {
            return Integer.compare(o.getSize(), getSize());
        }
    }

    /**
     * Creates an empty builder.
     *
     * @param entryComponent  component type used to allocate the order table
     * @param bucketComponent component type used to allocate bucket arrays
     */
    public PerfectHashBuilder(Class<?> entryComponent, Class<?> bucketComponent) {
        this.entryComponent = entryComponent;
        this.bucketComponent = bucketComponent;
    }

    /**
     * Returns the number of distinct keys added so far. This can be smaller
     * than the length of the generated tables, because {@link #generate()}
     * grows the table when the keys cannot be packed.
     *
     * @return number of distinct keys
     */
    public int getCount() {
        return map.size();
    }

    /**
     * Returns a copy of the redirect table.
     *
     * @return copy of the redirect table
     * @throws NullPointerException if {@link #generate()} has not been called
     */
    public int[] getRedirect() {
        return redirect.clone();
    }

    /**
     * Returns a copy of the order table. Slots that hold no entry are
     * {@code null}.
     *
     * @return copy of the order table
     * @throws NullPointerException if {@link #generate()} has not been called
     */
    public Entry<E>[] getOrder() {
        return order.clone();
    }

    /**
     * Adds or replaces the entry for {@code key}.
     *
     * @param key   entry key
     * @param value entry value
     * @return the entry previously stored under {@code key}, or {@code null}
     */
    public Entry<E> put(String key, E value) {
        return put(new Entry<>(key, value));
    }

    /**
     * Adds or replaces an entry, keyed by the entry's own key.
     *
     * @param entry entry to store
     * @return the entry previously stored under the same key, or {@code null}
     */
    public Entry<E> put(Entry<E> entry) {
        Entry<E> old = map.put(entry.key, entry);
        // Only a brand new key enlarges the table.
        if (old == null) {
            count++;
        }
        return old;
    }

    /**
     * Builds the redirect and order tables from the entries added so far.
     * If the entries cannot be packed at the current table size, the size is
     * increased and packing restarts from scratch.
     *
     * <p>An empty builder produces empty (zero length) tables.
     */
    @SuppressWarnings("unchecked")
    public void generate() {
        // Nothing to pack: publish empty tables so the getters stay usable.
        if (count == 0) {
            redirect = new int[0];
            order = (Entry<E>[]) Array.newInstance(entryComponent, 0);
            return;
        }

        // Repeat until a valid packing is achieved.
        boolean redo = true;
        while (redo) {
            redo = false;

            // Allocate the resulting redirect and order tables.
            redirect = new int[count];
            order = (Entry<E>[]) Array.newInstance(entryComponent, count);

            // Place all the entries in bucket chains based on hash, sorted
            // by length of chain.
            Bucket<E>[] sorted = createBuckets();
            int free = 0;

            // Iterate through the chains, longest first.
            for (Bucket<E> bucket : sorted) {
                if (bucket.getSize() != 1) {
                    // Attempt to pack entries until no collisions occur.
                    if (!collidedEntries(bucket, count)) {
                        // Failed to pack. Need to grow table.
                        redo = true;
                        break;
                    }
                } else {
                    // A no collision entry (bucket.getSize() == 1). Find a
                    // free spot in the order table.
                    while (free < count && order[free] != null) {
                        free++;
                    }
                    // If none found, then grow table.
                    if (free >= count) {
                        redo = true;
                        break;
                    }
                    // Store entry in order table.
                    order[free] = bucket.getFirst();
                    // The order index is stored in the redirect table
                    // encoded as -1 - index, which is always negative.
                    redirect[slot(bucket.hashCode(), count)] = -1 - free;
                    // Update free slot index.
                    free++;
                }
            }

            // If packing failed, then bump table size. Make odd to increase
            // chances of being relatively prime.
            if (redo) {
                count = (count + 1) | 1;
            }
        }
    }

    /**
     * Reduces a hash to a table index in {@code [0, size)}. The sign bit is
     * cleared first so the remainder can never be negative.
     */
    private static int slot(int hash, int size) {
        return (hash & POSITIVE_MASK) % size;
    }

    /**
     * Groups all entries into buckets by table slot and returns the non-empty
     * buckets sorted longest chain first.
     */
    @SuppressWarnings("unchecked")
    private Bucket<E>[] createBuckets() {
        // Build bucket chains based on key hash. Collisions end up in same chain.
        Bucket<E>[] buckets = (Bucket<E>[]) Array.newInstance(bucketComponent, count);
        for (Entry<E> entry : map.values()) {
            int index = slot(entry.hashCode(), count);
            Bucket<E> bucket = buckets[index];
            if (bucket == null) {
                buckets[index] = bucket = new Bucket<>();
            }
            bucket.add(entry);
        }

        // Drop unused slots and sort chains, longest first.
        return Arrays.stream(buckets)
                .filter(bucket -> bucket != null)
                .sorted()
                .toArray(length -> (Bucket<E>[]) Array.newInstance(bucketComponent, length));
    }

    /**
     * Tries to place every entry of a colliding bucket into a distinct free
     * slot of the order table by rehashing with successive seeds. On success
     * the seed is recorded in the redirect table at the bucket's slot.
     *
     * @param bucket chain of entries that share a table slot
     * @param count  current table size
     * @return {@code true} if the chain was packed, {@code false} if the
     *         retry limit was exceeded at this table size
     */
    private boolean collidedEntries(Bucket<E> bucket, int count) {
        // Track packing attempts.
        List<Integer> undo = new ArrayList<>();
        // Start with a new hash seed.
        int seed = ImageStringsReader.HASH_MULTIPLIER + 1;
        int retry = 0;

        // Attempt to pack all the entries in a single chain.
        redo:
        while (true) {
            for (Entry<E> entry : bucket.getList()) {
                // Compute new hash. Masked like every other index computation
                // in this class, so a negative hash cannot yield a negative
                // index.
                // NOTE(review): ImageStringsReader.hashCode(key, seed) is
                // presumably already non-negative - confirm; if so the mask
                // is a no-op.
                int index = slot(entry.hashCode(seed), count);

                // If a collision is detected.
                if (order[index] != null) {
                    // Only retry so many times with current table size.
                    if (++retry > RETRY_LIMIT) {
                        return false;
                    }
                    // Undo the attempted packing.
                    for (int packed : undo) {
                        order[packed] = null;
                    }
                    // Reset the undo list and bump up the hash seed.
                    undo.clear();
                    seed++;
                    // Zero seed is not valid.
                    if (seed == 0) {
                        seed = 1;
                    }
                    // Try again.
                    continue redo;
                }

                // No collision.
                order[index] = entry;
                undo.add(index);
            }

            // Entire chain packed. Record hash seed used.
            redirect[slot(bucket.hashCode(), count)] = seed;
            break;
        }
        return true;
    }
}