/*
* Copyright (C) 2017 The Android Open Source Project
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once

#include "common.h"  // project header assumed to define SLICER_CHECK

#include <algorithm>  // std::max
#include <cassert>
#include <cstdint>
#include <cstdio>     // printf
#include <memory>
#include <vector>

namespace slicer {

// A specialized Key -> T* map (note that, unlike the std:: containers,
// the values are always pointers here, and we don't explicitly store
// the lookup keys)
//
// Implemented as an incrementally resizable hash table: the logical hash
// table is split into two internal fixed-size tables, the "full table" and
// an "insertion table". When the insertion table overflows, we allocate a
// larger partition to replace it: the old "full table" is rehashed into the
// new partition, which becomes the new insertion table, while the old
// insertion table becomes the "full table".
//
// Similar to open addressing hash tables, all the buckets live in a single,
// contiguous array, but the table can grow incrementally and collisions are
// still handled as chains (using indexes instead of pointers).
//
// The result is faster than std::unordered_map and uses roughly 25% of
// the memory used by a std::unordered_map<const char*, String*>
//
// The Hash template argument is a policy type which must implement:
//  1. hash function   : uint32_t Hash(const Key& key)
//  2. key compare     : bool Compare(const Key& key, T* value)
//  3. key extraction  : Key GetKey(T* value)
//  4. copy semantics
// (a sketch of such a policy follows below)
//
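// For illustration only, a minimal sketch of such a policy: it assumes a
// hypothetical String value type which exposes its key through a c_str()
// accessor (neither CStrHash nor String is part of this header):
//
//   struct CStrHash {
//     uint32_t Hash(const char* key) const {
//       uint32_t h = 2166136261u;  // 32-bit FNV-1a
//       while (*key) h = (h ^ uint8_t(*key++)) * 16777619u;
//       return h;
//     }
//     bool Compare(const char* key, String* value) const {
//       return std::strcmp(key, value->c_str()) == 0;
//     }
//     const char* GetKey(String* value) const { return value->c_str(); }
//   };
//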
template<class Key, class T, class Hash>
class HashTable {
 private:
  // the index type inside the bucket array
  using Index = uint32_t;

  static constexpr Index kInitialHashBuckets = (1 << 7) - 1;
  static constexpr Index kAvgChainLength = 2;
  static constexpr Index kInvalidIndex = static_cast<Index>(-1);
  static constexpr double kResizeFactor = 1.6;

  // packed to avoid padding: 12 bytes instead of 16 on typical 64-bit targets
  struct __attribute__((packed)) Bucket {
    T* value = nullptr;
    Index next = kInvalidIndex;
  };

  class Partition {
   public:
    Partition(Index size, const Hash& hasher);
    bool Insert(T* value);
    T* Lookup(const Key& key, uint32_t hash_value) const;
    Index HashBuckets() const { return hash_buckets_; }
    void InsertAll(const Partition& src);
    void PrintStats(const char* name, bool verbose);

   private:
    std::vector<Bucket> buckets_;
    const Index hash_buckets_;
    Hash hasher_;
  };

 public:
  explicit HashTable(const Hash& hasher = Hash()) : hasher_(hasher) {
    // we start with full_table_ == nullptr
    insertion_table_.reset(new Partition(kInitialHashBuckets, hasher_));
  }

  ~HashTable() = default;

  // No move or copy semantics
  HashTable(const HashTable&) = delete;
  HashTable& operator=(const HashTable&) = delete;

  // Insert a new, non-nullptr T* into the hash table
  // (we only store unique values, so the new value must
  //  not be in the table already)
  void Insert(T* value);

  // Lookup an existing value
  // (returns nullptr if the value is not found)
  T* Lookup(const Key& key) const;

  void PrintStats(const char* name, bool verbose);

 private:
  std::unique_ptr<Partition> full_table_;
  std::unique_ptr<Partition> insertion_table_;
  Hash hasher_;
};
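
// Example usage (building on the hypothetical CStrHash policy sketched above):
//
//   slicer::HashTable<const char*, String, CStrHash> table;
//   table.Insert(str);                    // str must not be in the table yet
//   String* found = table.Lookup("foo");  // nullptr if "foo" is not found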

template<class Key, class T, class Hash>
HashTable<Key, T, Hash>::Partition::Partition(Index size, const Hash& hasher)
    : hash_buckets_(size), hasher_(hasher) {
  // reserve space for the home buckets plus the expected collision-chain
  // buckets (hash_buckets_ * kAvgChainLength entries in total)
  buckets_.reserve(hash_buckets_ * kAvgChainLength);
  buckets_.resize(hash_buckets_);
}

// Similar to the "cellar" version of coalesced hashing, the buckets array is
// divided into a fixed set of "home" buckets directly addressable by the
// hash value, [0 .. hash_buckets_), and extra buckets for the collision
// chains, [hash_buckets_ .. buckets_.size()). Unlike coalesced hashing, our
// "cellar" grows on demand, so we never actually have to coalesce any chains.
//
// Returns true if the insertion succeeded, false if the table overflows
// (we never insert more than the pre-reserved capacity)
//
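// For example (a hypothetical scenario, with hash_buckets_ == 127), inserting
// three values v1, v2, v3 which all hash to home bucket 5 threads a chain
// through the overflow area (each new entry is linked in right after the
// home bucket):
//
//   buckets_[5]   = { value: v1, next: 128 }
//   buckets_[128] = { value: v3, next: 127 }
//   buckets_[127] = { value: v2, next: kInvalidIndex }
//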
template<class Key, class T, class Hash>
bool HashTable<Key, T, Hash>::Partition::Insert(T* value) {
  SLICER_CHECK(value != nullptr);
  // overflow? (each partition has a fixed, pre-reserved capacity;
  // the owning HashTable handles overflow by allocating a larger partition)
  if (buckets_.size() + 1 > buckets_.capacity()) {
    return false;
  }
  auto key = hasher_.GetKey(value);
  Index bucket_index = hasher_.Hash(key) % hash_buckets_;
  if (buckets_[bucket_index].value == nullptr) {
    // the home bucket is empty: store the value directly
    buckets_[bucket_index].value = value;
  } else {
    // collision: link a new chain bucket right after the home bucket
    Bucket new_bucket = {};
    new_bucket.value = value;
    new_bucket.next = buckets_[bucket_index].next;
    buckets_[bucket_index].next = static_cast<Index>(buckets_.size());
    buckets_.push_back(new_bucket);
  }
  return true;
}

template<class Key, class T, class Hash>
T* HashTable<Key, T, Hash>::Partition::Lookup(const Key& key, uint32_t hash_value) const {
  assert(hash_value == hasher_.Hash(key));
  Index bucket_index = hash_value % hash_buckets_;
  // walk the home bucket and its collision chain
  for (Index index = bucket_index; index != kInvalidIndex; index = buckets_[index].next) {
    auto value = buckets_[index].value;
    if (value == nullptr) {
      // only a home bucket can be empty (chain buckets always hold a value)
      assert(index < hash_buckets_);
      break;
    } else if (hasher_.Compare(key, value)) {
      return value;
    }
  }
  return nullptr;
}

template<class Key, class T, class Hash>
void HashTable<Key, T, Hash>::Partition::InsertAll(const Partition& src) {
  for (const auto& bucket : src.buckets_) {
    if (bucket.value != nullptr) {
      SLICER_CHECK(Insert(bucket.value));
    }
  }
}

// Try to insert into the "insertion table". If that overflows, we allocate
// a new, larger partition and rehash the "full table" values into it: the
// new partition becomes the "insertion table" and the old "insertion table"
// becomes the new "full table".
template<class Key, class T, class Hash>
void HashTable<Key, T, Hash>::Insert(T* value) {
  // we only store unique values
  assert(Lookup(hasher_.GetKey(value)) == nullptr);
  if (!insertion_table_->Insert(value)) {
    // resize: the new, larger partition takes over as the insertion table
    std::unique_ptr<Partition> new_hash_table(
        new Partition(insertion_table_->HashBuckets() * kResizeFactor, hasher_));
    if (full_table_) {
      new_hash_table->InsertAll(*full_table_);
    }
    SLICER_CHECK(new_hash_table->Insert(value));
    full_table_ = std::move(insertion_table_);
    insertion_table_ = std::move(new_hash_table);
  }
}

// First look into the "full table"; if the value is not found there,
// try the "insertion table" next.
template<class Key, class T, class Hash>
T* HashTable<Key, T, Hash>::Lookup(const Key& key) const {
  auto hash_value = hasher_.Hash(key);
  if (full_table_) {
    auto value = full_table_->Lookup(key, hash_value);
    if (value != nullptr) {
      return value;
    }
  }
  return insertion_table_->Lookup(key, hash_value);
}

template<class Key, class T, class Hash>
void HashTable<Key, T, Hash>::Partition::PrintStats(const char* name, bool verbose) {
  int max_chain_length = 0;
  int sum_chain_length = 0;
  int used_buckets = 0;
  for (Index i = 0; i < hash_buckets_; ++i) {
    if (verbose) printf("%4u : ", i);
    if (buckets_[i].value != nullptr) {
      ++used_buckets;
      // count the chain links hanging off this home bucket
      int chain_length = 0;
      for (Index ci = i; buckets_[ci].next != kInvalidIndex; ci = buckets_[ci].next) {
        SLICER_CHECK(buckets_[ci].value != nullptr);
        ++chain_length;
        if (verbose) printf("*");
      }
      max_chain_length = std::max(max_chain_length, chain_length);
      sum_chain_length += chain_length;
    }
    if (verbose) printf("\n");
  }
  int avg_chain_length = used_buckets ? sum_chain_length / used_buckets : 0;
  printf("\nHash table partition (%s):\n", name);
  printf(" hash_buckets : %u\n", hash_buckets_);
  printf(" size/capacity : %zu / %zu\n", buckets_.size(), buckets_.capacity());
  printf(" used_buckets : %d\n", used_buckets);
  printf(" max_chain_length : %d\n", max_chain_length);
  printf(" avg_chain_length : %d\n", avg_chain_length);
}

template<class Key, class T, class Hash>
void HashTable<Key, T, Hash>::PrintStats(const char* name, bool verbose) {
  printf("\nHash table stats (%s)\n", name);
  if (full_table_) {
    full_table_->PrintStats("full_table", verbose);
  }
  insertion_table_->PrintStats("insertion_table", verbose);
}

} // namespace slicer