| // Copyright 2022 Google LLC |
| // |
| // Licensed under the Apache License, Version 2.0 (the "License"); |
| // you may not use this file except in compliance with the License. |
| // You may obtain a copy of the License at |
| // |
| // https://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, software |
| // distributed under the License is distributed on an "AS IS" BASIS, |
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| // See the License for the specific language governing permissions and |
| // limitations under the License. |
| |
| package com.google.setfilters.cuckoofilter; |
| |
| import com.google.common.hash.Funnel; |
| import com.google.common.hash.HashCode; |
| import java.util.ArrayList; |
| import java.util.List; |
| import java.util.Optional; |
| import java.util.Random; |
| |
| /** |
| * A space efficient, probabilistic multiset data structure that supports membership check, |
| * insertion, and deletion of the elements. |
| * |
| * <p>Cuckoo filter enables tradeoffs between its space efficiency and the false positive |
| * probability of the membership check. |
| * |
| * <p>See the original paper https://www.cs.cmu.edu/~dga/papers/cuckoo-conext2014.pdf for more |
| * details. |
| * |
| * <p>This class is not thread-safe. |
| */ |
| public final class CuckooFilter<T> { |
| private final CuckooFilterConfig config; |
| private final CuckooFilterTable table; |
| private final Funnel<? super T> funnel; |
| private final Random random; |
| |
| /** Counts the total number of elements in the cuckoo filter. */ |
| private long count; |
| |
| /** Instantiates a new cuckoo filter. */ |
| public static <T> CuckooFilter<T> createNew(CuckooFilterConfig config, Funnel<? super T> funnel) { |
| Random random = new Random(); |
| CuckooFilterTable table = |
| CuckooFilterTable.create(config.size(), config.useSpaceOptimization(), random); |
| return new CuckooFilter<T>(config, table, funnel, random); |
| } |
| |
| /** |
| * Instantiates a cuckoo filter from serialized cuckoo filter table. |
| * |
| * <p>Note that {@link SerializedCuckooFilterTable} does not contain any data on {@link |
| * CuckooFilterConfig.HashFunction}, {@link CuckooFilterConfig.Strategy}, or {@link Funnel} used, |
| * so it is up to the user to supply appropriate hash function, strategy, and funnel that were |
| * used to generate the {@link SerializedCuckooFilterTable}. |
| */ |
| public static <T> CuckooFilter<T> createFromSerializedTable( |
| SerializedCuckooFilterTable serializedTable, |
| CuckooFilterConfig.HashFunction hashFunction, |
| CuckooFilterConfig.Strategy strategy, |
| Funnel<? super T> funnel) { |
| Random random = new Random(); |
| CuckooFilterTable table = CuckooFilterTable.createFromSerialization(serializedTable, random); |
| return new CuckooFilter<T>( |
| CuckooFilterConfig.newBuilder() |
| .setSize(table.size()) |
| .setHashFunction(hashFunction) |
| .setStrategy(strategy) |
| .build(), |
| table, |
| funnel, |
| random); |
| } |
| |
| private CuckooFilter( |
| CuckooFilterConfig config, CuckooFilterTable table, Funnel<? super T> funnel, Random random) { |
| this.config = config; |
| this.table = table; |
| this.funnel = funnel; |
| this.random = random; |
| count = 0; |
| } |
| |
| /** |
| * Returns true if {@code element} is in the cuckoo filter. |
| * |
| * <p>By the probabilistic nature of the cuckoo filter data structure, this method may return a |
| * false positive result. In other words, this method may incorrectly return true for an element |
| * that was actually never inserted. This probability can depend on various factors, including the |
| * size of the cuckoo filter and the hash function used. |
| * |
| * <p>However, it is guaranteed that this method never returns a false negative result, as long as |
| * {@code delete} method is called on an element that exists in the filter. Please see {@code |
| * delete} method for more details. |
| */ |
| public boolean contains(T element) { |
| HashCode hash = config.hashFunction().hash(element, funnel); |
| long fingerprint = |
| config.strategy().computeFingerprint(hash, config.size().fingerprintLength()); |
| int bucketIndex = config.strategy().computeBucketIndex(hash, config.size().bucketCount()); |
| int otherBucketIndex = |
| config |
| .strategy() |
| .computeOtherBucketIndex( |
| fingerprint, bucketIndex, config.size().bucketCount(), config.hashFunction()); |
| return table.contains(bucketIndex, fingerprint) |
| || table.contains(otherBucketIndex, fingerprint); |
| } |
| |
| /** |
| * Inserts {@code element} to the cuckoo filter, returning true if the element was inserted |
| * successfully. |
| * |
| * <p>Insertion of {@code element} will fail if there is no room for {@code element}. Note that |
| * even when the insertion of {@code element} fails, it is possible for another element to be |
| * inserted successfully. Even then, the insertion failure should be a good indicator that the |
| * filter is getting close to its maximum capacity. |
| */ |
| public boolean insert(T element) { |
| HashCode hash = config.hashFunction().hash(element, funnel); |
| long fingerprint = |
| config.strategy().computeFingerprint(hash, config.size().fingerprintLength()); |
| int bucketIndex = config.strategy().computeBucketIndex(hash, config.size().bucketCount()); |
| int otherBucketIndex = |
| config |
| .strategy() |
| .computeOtherBucketIndex( |
| fingerprint, bucketIndex, config.size().bucketCount(), config.hashFunction()); |
| |
| // First attempt to insert the fingerprint to one of the two assigned buckets. |
| if (attemptInsertion(fingerprint, bucketIndex, otherBucketIndex)) { |
| count++; |
| return true; |
| } |
| |
| // If both buckets are full, execute insertion with repeated replacements algorithm. |
| int startBucketIndex = (random.nextInt(2) == 0) ? bucketIndex : otherBucketIndex; |
| boolean inserted = insertWithRepeatedReplacements(fingerprint, startBucketIndex); |
| if (inserted) { |
| count++; |
| } |
| return inserted; |
| } |
| |
| /** |
| * Deletes {@code element} from the cuckoo filter, returning true if the element was deleted |
| * successfully. |
| * |
| * <p>It is critical for {@code delete} to be called on an already existing element. Otherwise, |
| * the filter may incorrectly delete a wrong element. When this happens, it is possible for {@code |
| * contains} method to return a false negative result. |
| */ |
| public boolean delete(T element) { |
| HashCode hash = config.hashFunction().hash(element, funnel); |
| long fingerprint = |
| config.strategy().computeFingerprint(hash, config.size().fingerprintLength()); |
| int bucketIndex = config.strategy().computeBucketIndex(hash, config.size().bucketCount()); |
| int otherBucketIndex = |
| config |
| .strategy() |
| .computeOtherBucketIndex( |
| fingerprint, bucketIndex, config.size().bucketCount(), config.hashFunction()); |
| boolean deleted = |
| table.delete(bucketIndex, fingerprint) || table.delete(otherBucketIndex, fingerprint); |
| if (deleted) { |
| count--; |
| } |
| return deleted; |
| } |
| |
| /** Returns the size of the cuckoo filter. */ |
| public CuckooFilterConfig.Size size() { |
| return config.size(); |
| } |
| |
| /** Returns the count of the elements in the cuckoo filter. */ |
| public long count() { |
| return count; |
| } |
| |
| /** |
| * Returns the ratio of the total number of elements in the cuckoo filter and the theoretical max |
| * capacity. |
| * |
| * <p>The returned value is in range [0, 1]. |
| */ |
| public double load() { |
| return count / ((double) config.size().bucketCount() * config.size().bucketCapacity()); |
| } |
| |
| /** |
| * Serializes the state of the cuckoo filter table. |
| * |
| * <p>Note that this method does not serialize hash function, strategy, and funnel. When |
| * instantiating a cuckoo filter from the returned {@link SerializedCuckooFilterTable}, it is up |
| * to the user to supply appropriate hash function, strategy, and funnel that were used. |
| */ |
| public SerializedCuckooFilterTable serializeTable() { |
| return table.serialize(); |
| } |
| |
| /** |
| * Attempts to insert {@code fingerprint} to one of the buckets with indices {@code bucketIndex} |
| * and {@code otherBucketIndex}, returning true when successful. Returns false if both buckets are |
| * full and the insertion failed. |
| */ |
| private boolean attemptInsertion(long fingerprint, int bucketIndex, int otherBucketIndex) { |
| if (!table.isFull(bucketIndex)) { |
| table.insertWithReplacement(bucketIndex, fingerprint); |
| return true; |
| } |
| if (!table.isFull(otherBucketIndex)) { |
| table.insertWithReplacement(otherBucketIndex, fingerprint); |
| return true; |
| } |
| return false; |
| } |
| |
| /** |
| * Randomly traverses the cuckoo graph to find an available bucket for insertion. |
| * |
| * <p>At a high level, this algorithm starts at vertex {@code bucketIndex} and performs a random |
| * walk of length at most {@link CuckooFilterConfig.Strategy#maxReplacementCount}. If an available |
| * bucket is found, the algorithm "pushes" all the fingerprints (edges) that are visited (note |
| * that in the cuckoo graph, the edges are the fingerprints) to their alternate buckets, and make |
| * room for {@code fingerprint} to be inserted. |
| * |
| * <p>If during the random walk an available bucket is not found, the insertion fails and the |
| * method returns false. |
| * |
| * <p>Note that it is possible to deterministically find an available bucket by performing breadth |
| * first search in the cuckoo graph, but this is usually slower and the extra chance of successful |
| * insertion is negligibly small in practice. |
| */ |
| private boolean insertWithRepeatedReplacements(long fingerprint, int bucketIndex) { |
| List<Integer> visitedBucketIndices = new ArrayList<>(); |
| List<Long> replacedFingerprints = new ArrayList<>(); |
| |
| long currFingerprint = fingerprint; |
| int currBucketIndex = bucketIndex; |
| visitedBucketIndices.add(-1); // Just for index alignment purpose. |
| replacedFingerprints.add(currFingerprint); |
| for (int i = 0; i < config.strategy().maxReplacementCount(); i++) { |
| Optional<Long> replacedFingerprint = |
| table.insertWithReplacement(currBucketIndex, currFingerprint); |
| // Found an available bucket, and the insertion is successful. |
| if (replacedFingerprint.isEmpty()) { |
| return true; |
| } |
| |
| visitedBucketIndices.add(currBucketIndex); |
| replacedFingerprints.add(replacedFingerprint.get()); |
| |
| currFingerprint = replacedFingerprint.get(); |
| currBucketIndex = |
| config |
| .strategy() |
| .computeOtherBucketIndex( |
| currFingerprint, |
| currBucketIndex, |
| config.size().bucketCount(), |
| config.hashFunction()); |
| } |
| |
| // Failed to find a bucket to insert. Reverse the replacements and declare that the insertion |
| // failed. |
| for (int i = visitedBucketIndices.size() - 1; i > 0; i--) { |
| int previousBucketIndex = visitedBucketIndices.get(i); |
| table.delete(previousBucketIndex, replacedFingerprints.get(i - 1)); |
| table.insertWithReplacement(previousBucketIndex, replacedFingerprints.get(i)); |
| } |
| return false; |
| } |
| } |