Import foldhash 0.1.3
For CL Reviewers: go/android3p#cl-review
Bug: http://b/384489060
Test: mm, treehugger
Change-Id: I79a590827b7bc6ec05067bc6ea7d194628d0a919
diff --git a/crates/foldhash/.android-checksum.json b/crates/foldhash/.android-checksum.json
new file mode 100644
index 0000000..5d9d33a
--- /dev/null
+++ b/crates/foldhash/.android-checksum.json
@@ -0,0 +1 @@
+{"package":null,"files":{".cargo-checksum.json":"17157d0fd87f28e79025933ea52b87726d6b11bcaba8e174f6e7cd05b8261f21","README.md":"7001e5e97345c350ea56c617a4977a655f9cbe2de0dcdf1d354b4b19b7640607","LICENSE":"8be19fe391cb15f9cfd08f2d3a6b18c531eb3949a5077724f425953a65b33e9f","src/seed.rs":"999b6432c5a5def0aabefce4e9c5724962383f59c4d13ca2e365f6c4a9f0598a","Cargo.toml":"b01184d21495eb88dfcd0bdb3ba995a63f7fe1ad86ec49aa772d866f38afbb99","src/lib.rs":"a65700a106bf354ca4bd9eee25d4eead96871c4ca8b26389d4db0e2ea87d3c2a","src/convenience.rs":"0b8e6016da16d68148d690822c3f438ff8a423e9a88a9d177968fd8363e14df2","MODULE_LICENSE_ZLIB":"0d6f8afa3940b7f06bebee651376d43bc8b0d5b437337be2696d30377451e93a","Android.bp":"62b0b823f274f9e3441371e00e4f310074f219aa44961d455282379bfbe529e7","METADATA":"07558e7453064f3a550163bc0482b368b36be905ea8f6647eede6ae3d41fbe81","cargo_embargo.json":"0be745f01ba4955b20f2cb5011168c4d8cd396af75c007035ec693c67b17bce7"}}
\ No newline at end of file
diff --git a/crates/foldhash/.cargo-checksum.json b/crates/foldhash/.cargo-checksum.json
new file mode 100644
index 0000000..9c33333
--- /dev/null
+++ b/crates/foldhash/.cargo-checksum.json
@@ -0,0 +1 @@
+{"files":{"Cargo.toml":"128e508349ef346f424d0092228c47f7f6f6459fc8ceafc509e3918ef31cd717","LICENSE":"b1181a40b2a7b25cf66fd01481713bc1005df082c53ef73e851e55071b102744","README.md":"fe47dcae2123a581799544a577b3a464962f3b51323b5495c53903b3bd3cd4ed","src/convenience.rs":"5e1b3e9fc7e89f35680fc87719ffc4bd8b98e9d5b443ff5abe1749f235a1f601","src/lib.rs":"d0d4fddddaa2353b9cb934535583d170dfe2fe8b3d1816edd667f86218d90e13","src/seed.rs":"90b1bb8f5117a9d46e83d10fc04cc5ad33fe9950727ad5f0633357aa7330e86b"},"package":"f81ec6369c545a7d40e4589b5597581fa1c441fe1cce96dd1de43159910a36a2"}
\ No newline at end of file
diff --git a/crates/foldhash/Android.bp b/crates/foldhash/Android.bp
new file mode 100644
index 0000000..acf32de
--- /dev/null
+++ b/crates/foldhash/Android.bp
@@ -0,0 +1,34 @@
+// This file is generated by cargo_embargo.
+// Do not modify this file because the changes will be overridden on upgrade.
+
+package {
+ default_applicable_licenses: ["external_rust_crates_foldhash_license"],
+ default_team: "trendy_team_android_rust",
+}
+
+license {
+ name: "external_rust_crates_foldhash_license",
+ visibility: [":__subpackages__"],
+ license_kinds: ["SPDX-license-identifier-Zlib"],
+ license_text: ["LICENSE"],
+}
+
+rust_library {
+ name: "libfoldhash",
+ host_supported: true,
+ crate_name: "foldhash",
+ cargo_env_compat: true,
+ cargo_pkg_version: "0.1.3",
+ crate_root: "src/lib.rs",
+ edition: "2021",
+ features: [
+ "default",
+ "std",
+ ],
+ apex_available: [
+ "//apex_available:platform",
+ "//apex_available:anyapex",
+ ],
+ product_available: true,
+ vendor_available: true,
+}
diff --git a/crates/foldhash/Cargo.toml b/crates/foldhash/Cargo.toml
new file mode 100644
index 0000000..f7a751e
--- /dev/null
+++ b/crates/foldhash/Cargo.toml
@@ -0,0 +1,74 @@
+# THIS FILE IS AUTOMATICALLY GENERATED BY CARGO
+#
+# When uploading crates to the registry Cargo will automatically
+# "normalize" Cargo.toml files for maximal compatibility
+# with all versions of Cargo and also rewrite `path` dependencies
+# to registry (e.g., crates.io) dependencies.
+#
+# If you are reading this file be aware that the original Cargo.toml
+# will likely look very different (and much more reasonable).
+# See Cargo.toml.orig for the original contents.
+
+[package]
+edition = "2021"
+name = "foldhash"
+version = "0.1.3"
+authors = ["Orson Peters <orsonpeters@gmail.com>"]
+build = false
+exclude = [
+ "benches",
+ "tools",
+ "assets",
+]
+autobins = false
+autoexamples = false
+autotests = false
+autobenches = false
+description = "A fast, non-cryptographic, minimally DoS-resistant hashing algorithm."
+readme = "README.md"
+keywords = [
+ "hash",
+ "hasher",
+ "no-std",
+]
+categories = [
+ "algorithms",
+ "no-std",
+]
+license = "Zlib"
+repository = "https://github.com/orlp/foldhash"
+
+[profile.release]
+lto = "thin"
+
+[lib]
+name = "foldhash"
+path = "src/lib.rs"
+bench = false
+
+[dependencies]
+
+[dev-dependencies.ahash]
+version = "0.8"
+
+[dev-dependencies.chrono]
+version = "0.4"
+
+[dev-dependencies.criterion]
+version = "0.5"
+
+[dev-dependencies.fxhash]
+version = "0.2"
+
+[dev-dependencies.hashbrown]
+version = "0.14"
+
+[dev-dependencies.rand]
+version = "0.8"
+
+[dev-dependencies.uuid]
+version = "1.8"
+
+[features]
+default = ["std"]
+std = []
diff --git a/crates/foldhash/LICENSE b/crates/foldhash/LICENSE
new file mode 100644
index 0000000..2f65b0a
--- /dev/null
+++ b/crates/foldhash/LICENSE
@@ -0,0 +1,19 @@
+Copyright (c) 2024 Orson Peters
+
+This software is provided 'as-is', without any express or implied warranty. In
+no event will the authors be held liable for any damages arising from the use of
+this software.
+
+Permission is granted to anyone to use this software for any purpose, including
+commercial applications, and to alter it and redistribute it freely, subject to
+the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim
+ that you wrote the original software. If you use this software in a product,
+ an acknowledgment in the product documentation would be appreciated but is
+ not required.
+
+2. Altered source versions must be plainly marked as such, and must not be
+ misrepresented as being the original software.
+
+3. This notice may not be removed or altered from any source distribution.
\ No newline at end of file
diff --git a/crates/foldhash/METADATA b/crates/foldhash/METADATA
new file mode 100644
index 0000000..d8f863f
--- /dev/null
+++ b/crates/foldhash/METADATA
@@ -0,0 +1,17 @@
+name: "foldhash"
+description: "A fast, non-cryptographic, minimally DoS-resistant hashing algorithm."
+third_party {
+ version: "0.1.3"
+ license_type: NOTICE
+ last_upgrade_date {
+ year: 2024
+ month: 12
+ day: 16
+ }
+ homepage: "https://crates.io/crates/foldhash"
+ identifier {
+ type: "Archive"
+ value: "https://static.crates.io/crates/foldhash/foldhash-0.1.3.crate"
+ version: "0.1.3"
+ }
+}
diff --git a/crates/foldhash/MODULE_LICENSE_ZLIB b/crates/foldhash/MODULE_LICENSE_ZLIB
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/crates/foldhash/MODULE_LICENSE_ZLIB
diff --git a/crates/foldhash/README.md b/crates/foldhash/README.md
new file mode 100644
index 0000000..706cd50
--- /dev/null
+++ b/crates/foldhash/README.md
@@ -0,0 +1,277 @@
+# Foldhash
+
+This repository contains foldhash, a fast, non-cryptographic, minimally
+DoS-resistant hashing algorithm implemented in Rust, designed for computational
+uses such as hash maps, bloom filters, count sketching, etc.
+
+When should you **not** use foldhash:
+
+- You are afraid of people studying your long-running program's behavior to
+ reverse engineer its internal random state and using this knowledge to create
+ many colliding inputs for computational complexity attacks. For more details
+ see the section "HashDoS resistance".
+
+- You expect foldhash to have a consistent output across versions or
+ platforms, such as for persistent file formats or communication protocols.
+
+- You are relying on foldhash's properties for any kind of security.
+ Foldhash is **not appropriate for any cryptographic purpose**.
+
+Foldhash has two variants, one optimized for speed which is ideal for data
+structures such as hash maps and bloom filters, and one optimized for
+statistical quality which is ideal for algorithms such as
+[HyperLogLog](https://en.wikipedia.org/wiki/HyperLogLog) and
+[MinHash](https://en.wikipedia.org/wiki/MinHash).
+
+Foldhash can be used in a `#![no_std]` environment by disabling its default
+`"std"` feature.
+
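+For a quick look at the API, typical hash map usage looks like this (this is the
+same example as in the crate documentation; the `HashMap` alias and the
+`HashMapExt` trait come from foldhash's convenience module):
+
+```rust
+use foldhash::{HashMap, HashMapExt};
+
+// `HashMap` here is std's HashMap, pre-wired with foldhash's fast RandomState.
+let mut hm = HashMap::new();
+hm.insert(42, "hello");
+```
+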
+
+## Performance
+
+We evaluated foldhash against three commonly used hashes in Rust:
+[aHash](https://github.com/tkaitchuck/aHash) v0.8.11,
+[fxhash](https://github.com/cbreeden/fxhash) v0.2.1, and
+[SipHash-1-3](https://en.wikipedia.org/wiki/SipHash), the default hash algorithm
+in Rust at the time of writing. We evaluated both variants foldhash provides,
+`foldhash-f` and `foldhash-q`, which correspond to `foldhash::fast` and
+`foldhash::quality` in the crate respectively.
+
+First we note that hashers with random state inflate the size of your `HashMap`,
+which may or may not be important for your performance:
+```rust
+std::mem::size_of::<foldhash::HashMap<u32, u32>>() = 40 // (both variants)
+std::mem::size_of::<ahash::HashMap<u32, u32>>() = 64
+std::mem::size_of::<fxhash::FxHashMap<u32, u32>>() = 32
+std::mem::size_of::<std::collections::HashMap<u32, u32>>() = 48
+```
+
+We tested runtime performance on two machines, one with a 2023 Apple M2 CPU, one
+with a 2023 Intel Xeon Platinum 8481C server CPU, both with stable Rust 1.80.1.
+Since one of our competitors (aHash) is reliant on AES-based instructions for
+optimal performance we have included both a benchmark with and without
+`-C target-cpu=native` for the Intel machine.
+
+We tested across a wide variety of data types we consider representative of
+types / distributions one might hash in the real world, in the context of a hash
+table key:
+
+- `u32` - random 32-bit unsigned integers,
+- `u32pair` - pairs of random 32-bit unsigned integers,
+- `u64` - random 64-bit unsigned integers,
+- `u64pair` - pairs of random 64-bit unsigned integers,
+- `u64lobits` - 64-bit unsigned integers where only the bottom 16 bits vary,
+- `u64hibits` - 64-bit unsigned integers where only the top 16 bits vary,
+- `ipv4` - [`std::net::Ipv4Addr`](https://doc.rust-lang.org/std/net/struct.Ipv4Addr.html), which is equivalent to `[u8; 4]`,
+- `ipv6` - [`std::net::Ipv6Addr`](https://doc.rust-lang.org/std/net/struct.Ipv6Addr.html), which is equivalent to `[u8; 16]`,
+- `rgba` - random `(u8, u8, u8, u8)` tuples,
+- `strenglishword` - strings containing words sampled uniformly from the top 10,000 most common English words,
+- `struuid` - random UUIDs, hashed in string representation,
+- `strurl` - strings containing URLs sampled uniformly from a corpus of 10,000 URLs,
+- `strdate` - random `YYYY-MM-DD` date strings,
+- `accesslog` - `(u128, u32, chrono::NaiveDate, bool)`, meant to simulate a typical
+ larger compound type, in this case `(resource_id, user_id, date, success)`
+ for an access log.
+- `kilobyte` - random bytestrings one kilobyte in length,
+- `tenkilobyte` - random bytestrings ten kilobytes in length.
+
+We tested the performance of hashing the above data types in the following four contexts:
+
+- `hashonly` - only the time it takes to hash the value (a rough sketch of such
+a measurement follows this list),
+- `lookupmiss` - the time it takes to do a lookup in a 1,000 element hash map
+of random elements, sampling only keys that we know are not in the hash map,
+- `lookuphit` - similar to `lookupmiss`, except the keys are sampled from keys
+known to be in the hash map,
+- `setbuild` - the time it takes to construct a `HashSet` of 1,000 elements
+from 1,000 randomly sampled elements each repeated 10 times (so 10,000 inserts,
+with ~90% duplicates).
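+
+As an illustration of what the `hashonly` numbers measure, a minimal
+[Criterion](https://docs.rs/criterion/) benchmark for the `u64` case could look
+roughly like the sketch below. This is only an assumed setup (the crate lists
+`criterion` as a dev-dependency), not the actual benchmark harness:
+
+```rust
+use std::hash::BuildHasher;
+use std::hint::black_box;
+
+use criterion::{criterion_group, criterion_main, Criterion};
+use foldhash::fast::RandomState;
+
+fn hashonly_u64(c: &mut Criterion) {
+    // Fixed pseudo-random keys; the real benchmark presumably samples properly.
+    let keys: Vec<u64> = (0..1000u64)
+        .map(|i| i.wrapping_mul(0x9e3779b97f4a7c15))
+        .collect();
+    let state = RandomState::default();
+    c.bench_function("u64 hashonly", |b| {
+        let mut i = 0;
+        b.iter(|| {
+            let h = state.hash_one(black_box(keys[i % keys.len()]));
+            i += 1;
+            h
+        })
+    });
+}
+
+criterion_group!(benches, hashonly_u64);
+criterion_main!(benches);
+```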
+
+All times are reported as expected time per operation, so one hash, one lookup,
+or one insert respectively. The full results [can be found
+here](https://gist.github.com/orlp/1271ad5b8b775c651cc55773888858eb). To
+summarize, we will only show the results for `u64` and `strenglishword` here, as
+well as the observed geometric mean and average rank over the full benchmark.
+
+```
+Xeon 8481c
+
+┌────────────────┬────────────┬────────────┬────────────┬─────────┬─────────┬─────────┐
+│ avg_rank ┆ 1.58 ┆ 2.66 ┆ 2.09 ┆ 3.70 ┆ 4.97 │
+│ geometric_mean ┆ 6.21 ┆ 7.01 ┆ 7.56 ┆ 8.74 ┆ 28.70 │
+╞════════════════╪════════════╪════════════╪════════════╪═════════╪═════════╪═════════╡
+│ distr ┆ bench ┆ foldhash-f ┆ foldhash-q ┆ fxhash ┆ ahash ┆ siphash │
+╞════════════════╪════════════╪════════════╪════════════╪═════════╪═════════╪═════════╡
+│ u64 ┆ hashonly ┆ 0.79 ┆ 1.03 ┆ 0.67 ┆ 1.23 ┆ 9.09 │
+│ u64 ┆ lookupmiss ┆ 2.01 ┆ 2.44 ┆ 1.73 ┆ 2.73 ┆ 12.03 │
+│ u64 ┆ lookuphit ┆ 3.04 ┆ 3.59 ┆ 2.64 ┆ 3.84 ┆ 12.65 │
+│ u64 ┆ setbuild ┆ 6.13 ┆ 6.52 ┆ 4.88 ┆ 6.66 ┆ 17.80 │
+| ... ┆ ... ┆ ... ┆ ... ┆ ... ┆ ... ┆ ... |
+│ strenglishword ┆ hashonly ┆ 2.63 ┆ 2.98 ┆ 3.24 ┆ 3.57 ┆ 11.87 │
+│ strenglishword ┆ lookupmiss ┆ 4.63 ┆ 5.03 ┆ 4.51 ┆ 5.86 ┆ 15.19 │
+│ strenglishword ┆ lookuphit ┆ 8.62 ┆ 9.25 ┆ 8.28 ┆ 10.06 ┆ 21.35 │
+│ strenglishword ┆ setbuild ┆ 14.77 ┆ 15.57 ┆ 18.86 ┆ 15.72 ┆ 35.36 │
+└────────────────┴────────────┴────────────┴────────────┴─────────┴─────────┴─────────┘
+
+Xeon 8481c with RUSTFLAGS="-C target-cpu=native"
+
+┌────────────────┬────────────┬────────────┬────────────┬─────────┬─────────┬─────────┐
+│ avg_rank ┆ 1.89 ┆ 3.12 ┆ 2.25 ┆ 2.77 ┆ 4.97 │
+│ geometric_mean ┆ 6.00 ┆ 6.82 ┆ 7.39 ┆ 6.94 ┆ 29.49 │
+╞════════════════╪════════════╪════════════╪════════════╪═════════╪═════════╪═════════╡
+│ distr ┆ bench ┆ foldhash-f ┆ foldhash-q ┆ fxhash ┆ ahash ┆ siphash │
+╞════════════════╪════════════╪════════════╪════════════╪═════════╪═════════╪═════════╡
+│ u64 ┆ hashonly ┆ 0.79 ┆ 1.01 ┆ 0.67 ┆ 1.34 ┆ 9.24 │
+│ u64 ┆ lookupmiss ┆ 1.68 ┆ 2.12 ┆ 1.62 ┆ 1.96 ┆ 12.04 │
+│ u64 ┆ lookuphit ┆ 2.68 ┆ 3.19 ┆ 2.28 ┆ 3.16 ┆ 13.09 │
+│ u64 ┆ setbuild ┆ 6.16 ┆ 6.42 ┆ 4.75 ┆ 7.03 ┆ 18.88 │
+| ... ┆ ... ┆ ... ┆ ... ┆ ... ┆ ... ┆ ... |
+│ strenglishword ┆ hashonly ┆ 2.60 ┆ 2.97 ┆ 3.25 ┆ 3.04 ┆ 11.58 │
+│ strenglishword ┆ lookupmiss ┆ 4.41 ┆ 4.96 ┆ 4.82 ┆ 4.79 ┆ 32.31 │
+│ strenglishword ┆ lookuphit ┆ 8.68 ┆ 9.35 ┆ 8.46 ┆ 8.63 ┆ 21.48 │
+│ strenglishword ┆ setbuild ┆ 15.01 ┆ 16.34 ┆ 19.34 ┆ 15.37 ┆ 35.22 │
+└────────────────┴────────────┴────────────┴────────────┴─────────┴─────────┴─────────┘
+
+Apple M2
+
+┌────────────────┬────────────┬────────────┬────────────┬─────────┬─────────┬─────────┐
+│ avg_rank ┆ 1.62 ┆ 2.81 ┆ 2.02 ┆ 3.58 ┆ 4.97 │
+│ geometric_mean ┆ 4.41 ┆ 4.86 ┆ 5.39 ┆ 5.71 ┆ 21.94 │
+╞════════════════╪════════════╪════════════╪════════════╪═════════╪═════════╪═════════╡
+│ distr ┆ bench ┆ foldhash-f ┆ foldhash-q ┆ fxhash ┆ ahash ┆ siphash │
+╞════════════════╪════════════╪════════════╪════════════╪═════════╪═════════╪═════════╡
+│ u64 ┆ hashonly ┆ 0.60 ┆ 0.70 ┆ 0.41 ┆ 0.78 ┆ 6.61 │
+│ u64 ┆ lookupmiss ┆ 1.50 ┆ 1.61 ┆ 1.23 ┆ 1.65 ┆ 8.28 │
+│ u64 ┆ lookuphit ┆ 1.78 ┆ 2.10 ┆ 1.57 ┆ 2.25 ┆ 8.53 │
+│ u64 ┆ setbuild ┆ 4.74 ┆ 5.19 ┆ 3.61 ┆ 5.38 ┆ 15.36 │
+| ... ┆ ... ┆ ... ┆ ... ┆ ... ┆ ... ┆ ... |
+│ strenglishword ┆ hashonly ┆ 1.84 ┆ 2.13 ┆ 1.85 ┆ 2.13 ┆ 11.61 │
+│ strenglishword ┆ lookupmiss ┆ 2.71 ┆ 2.96 ┆ 2.47 ┆ 2.99 ┆ 9.27 │
+│ strenglishword ┆ lookuphit ┆ 7.54 ┆ 8.77 ┆ 7.83 ┆ 8.77 ┆ 18.65 │
+│ strenglishword ┆ setbuild ┆ 16.61 ┆ 17.09 ┆ 14.83 ┆ 16.52 ┆ 26.42 │
+└────────────────┴────────────┴────────────┴────────────┴─────────┴─────────┴─────────┘
+```
+
+We note from the above benchmark that for hash table performance the extra
+quality that `foldhash-q` provides is almost never actually worth the small but
+also non-negligible computational overhead it has over `foldhash-f`. This is our
+justification for providing `foldhash::fast` as a default choice for hash
+tables, even though it has measurable biases (see also the "Quality" section).
+
+fxhash generally does fairly well for small inputs on the benchmarks; however,
+it has structural weaknesses as a hash which make it ill-advised to use as a
+general-purpose hash function in our opinion. For example the `lookuphit`
+benchmark on Apple M2 for `u64hibits` takes 1.77 nanoseconds per lookup for
+foldhash, but 67.72 nanoseconds for fxhash (due to everything colliding - the
+effects would be even worse with a larger hash map). In our opinion foldhash-f
+strikes the right balance between quality and performance for hash tables,
+whereas fxhash flies a bit too close to the sun.
+
+aHash is faster than foldhash for medium-long strings when compiled with
+AES instruction support, but is slower in almost every other scenario or when
+AES instructions are unavailable.
+
+
+## Quality
+
+Foldhash-f is a fairly strong hash in terms of collisions *on its full 64-bit
+output*. However, statistical tests such as
+[SMHasher3](https://gitlab.com/fwojcik/smhasher3) can distinguish it from an ideal
+hash function in tests that focus on the relationship between individual
+input/output bits. One such property is avalanching: with foldhash-f, changing
+a single bit of the input does not flip each output bit with 50% probability,
+as it would if the hash behaved like a proper random oracle.
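+
+To make the avalanche criterion concrete, below is a rough (and statistically
+naive) probe one could run against foldhash's public API: flip each input bit of
+a 64-bit key and tally how often every output bit flips, where an ideal hash
+would sit at 50%. This is only an illustrative sketch, not the SMHasher3
+methodology referenced below.
+
+```rust
+use std::hash::BuildHasher;
+use foldhash::fast::RandomState;
+
+fn main() {
+    let state = RandomState::default();
+    let samples = 10_000u64;
+    for in_bit in 0..64u32 {
+        let mut flips = [0u32; 64];
+        for x in 0..samples {
+            // Hash x and x with one input bit flipped, then compare the outputs.
+            let d = state.hash_one(x) ^ state.hash_one(x ^ (1u64 << in_bit));
+            for (out_bit, count) in flips.iter_mut().enumerate() {
+                *count += ((d >> out_bit) & 1) as u32;
+            }
+        }
+        // Worst deviation from the ideal 50% flip rate across all output bits.
+        let worst = flips
+            .iter()
+            .map(|&f| (f as f64 / samples as f64 - 0.5).abs())
+            .fold(0.0_f64, f64::max);
+        println!("input bit {in_bit:2}: worst output-bit bias {worst:.3}");
+    }
+}
+```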
+
+As the benchmarks above show, spending more effort in foldhash-f to improve the
+hash quality does not lead to better hash table performance. However, there are
+also use cases for hash functions where it is important that (each bit of) the
+hash is unbiased and a random function of all bits of the input, such as in
+algorithms like HyperLogLog or MinHash.
+
+For this purpose we also provide foldhash-q, which is simply a post-processed
+version of foldhash-f to properly avalanche all the bits. Foldhash-q passes the
+[SMHasher3](https://gitlab.com/fwojcik/smhasher3) test suite [without any
+failures](https://github.com/orlp/foldhash_smhasher3). You can also plot the
+worst-case probability (where 50% is ideal) that any particular output bit flips
+if you flip an input bit, which nicely visualizes how fxhash and foldhash-f
+fail this avalanche property but foldhash-q and SipHash-1-3 pass:
+
+
+| FxHash | Foldhash-f | Foldhash-q | SipHash-1-3 |
+|--------|------------|------------|-------------|
+| <img src="assets/avalanche-fxhash.png" width=300> | <img src="assets/avalanche-foldhash-fast.png" width=300> | <img src="assets/avalanche-foldhash-quality.png" width=300> | <img src="assets/avalanche-siphash.png" width=300>
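+
+In code, opting into the quality variant only changes the module path; for
+example, hashing a value directly through the standard `BuildHasher` interface
+(this mirrors the example in the crate documentation):
+
+```rust
+use std::hash::BuildHasher;
+use foldhash::quality::RandomState;
+
+let random_state = RandomState::default();
+let hash = random_state.hash_one("hello world");
+```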
+
+
+## Background
+
+The name foldhash is derived from the *folded multiply*. This technique
+compresses two 64-bit words into a single 64-bit word while simultaneously
+thoroughly mixing the bits. It does this using a 64 x 64 bit -> 128 bit
+multiplication followed by folding the two halves of the 128-bit product
+together using a XOR operation:
+
+```rust
+let full = (x as u128) * (y as u128);
+let lo = full as u64;
+let hi = (full >> 64) as u64;
+let folded = lo ^ hi;
+```
+
+We're not aware of a formal analysis of this operation, but empirically it works
+very well. An informal intuition for why it should work is that multiplication
+can be seen as the sum of many shifted copies of one of the arguments, only
+including those shifted copies where the other argument has set bits, e.g. for
+multiplying 4-bit words `abcd` and `efgh`:
+
+```
+abcd * efgh =
+
+ abcd * e
+ abcd * f
+ abcd * g
+ abcd * h
+--------------- +
+```
+
+Note that the middle bits of the product are a function of many of the input
+bits, whereas the top-most and bottom-most bits are impacted by fewer of the
+input bits. By folding the top half back onto the bottom half these effects
+compensate each other, making all the output bits affected by much of the input.
+
+We did not invent the folded multiply; it was previously used in essentially the
+same way in [aHash](https://github.com/tkaitchuck/aHash),
+[wyhash](https://github.com/wangyi-fudan/wyhash), and
+[xxhash3](https://github.com/Cyan4973/xxHash). The operation was also used
+in [mum-hash](https://github.com/vnmakarov/mum-hash), and probably others.
+We do not know who originally invented it; the earliest reference
+we could find was Steven Fuerst [blogging about it](https://web.archive.org/web/20121213174842/http://locklessinc.com/articles/crypto_hash/)
+in 2012.
+
+
+## HashDoS resistance
+
+The folded multiply has a fairly glaring flaw: if one of its inputs is zero, the
+output is zero. This makes it trivial to create a large number of hash
+collisions (even by accident, as zeroes are a common input to hashes). To combat
+this, every folded multiply in foldhash has the following form:
+
+```rust
+folded_multiply(input1 ^ secret1, input2 ^ secret2)
+```
+
+Here `secret1` and `secret2` are either secret random numbers generated by
+foldhash beforehand, or partial hash results that were influenced by such a
+secret earlier in the computation.
+This (plus other careful design throughout the hash function) ensures that it is
+not possible to create a list of inputs that collide for every instance of
+foldhash, and also prevents certain access patterns on hash tables from going
+quadratic by ensuring that each hash table uses a different seed and thus a
+different access pattern. It is these two properties that we refer to when we
+claim foldhash is "minimally DoS-resistant": it does the bare minimum to defeat
+very simple attacks.
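+
+The per-instance seeding is easy to observe: two independently created
+`RandomState`s will, with overwhelming probability, hash the same key to
+different values. A small illustrative snippet:
+
+```rust
+use std::hash::BuildHasher;
+use foldhash::fast::RandomState;
+
+let a = RandomState::default();
+let b = RandomState::default();
+// Each RandomState gets its own per-hasher seed, so the same key almost
+// surely hashes to different values under the two states.
+assert_ne!(a.hash_one("hello"), b.hash_one("hello"));
+```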
+
+However, to be crystal clear, **foldhash does not claim to provide HashDoS
+resistance against interactive attackers**. For a student of cryptography it
+should be trivial to derive the secret values from direct observation of hash
+outputs, and feasible to derive the secret values from indirect observation of
+hashes, such as through timing attacks or hash table iteration. Once an attacker
+knows the secret values, they can once again create infinite hash collisions
+with ease.
diff --git a/crates/foldhash/cargo_embargo.json b/crates/foldhash/cargo_embargo.json
new file mode 100644
index 0000000..9e26dfe
--- /dev/null
+++ b/crates/foldhash/cargo_embargo.json
@@ -0,0 +1 @@
+{}
\ No newline at end of file
diff --git a/crates/foldhash/src/convenience.rs b/crates/foldhash/src/convenience.rs
new file mode 100644
index 0000000..d515d0b
--- /dev/null
+++ b/crates/foldhash/src/convenience.rs
@@ -0,0 +1,65 @@
+use super::fast::{FixedState, RandomState};
+
+/// Type alias for [`std::collections::HashMap<K, V, foldhash::fast::RandomState>`].
+pub type HashMap<K, V> = std::collections::HashMap<K, V, RandomState>;
+
+/// Type alias for [`std::collections::HashSet<T, foldhash::fast::RandomState>`].
+pub type HashSet<T> = std::collections::HashSet<T, RandomState>;
+
+/// A convenience extension trait to enable [`HashMap::new`] for hash maps that use `foldhash`.
+pub trait HashMapExt {
+ /// Creates an empty `HashMap`.
+ fn new() -> Self;
+
+ /// Creates an empty `HashMap` with at least the specified capacity.
+ fn with_capacity(capacity: usize) -> Self;
+}
+
+impl<K, V> HashMapExt for std::collections::HashMap<K, V, RandomState> {
+ fn new() -> Self {
+ Self::with_hasher(RandomState::default())
+ }
+
+ fn with_capacity(capacity: usize) -> Self {
+ Self::with_capacity_and_hasher(capacity, RandomState::default())
+ }
+}
+
+impl<K, V> HashMapExt for std::collections::HashMap<K, V, FixedState> {
+ fn new() -> Self {
+ Self::with_hasher(FixedState::default())
+ }
+
+ fn with_capacity(capacity: usize) -> Self {
+ Self::with_capacity_and_hasher(capacity, FixedState::default())
+ }
+}
+
+/// A convenience extension trait to enable [`HashSet::new`] for hash sets that use `foldhash`.
+pub trait HashSetExt {
+ /// Creates an empty `HashSet`.
+ fn new() -> Self;
+
+ /// Creates an empty `HashSet` with at least the specified capacity.
+ fn with_capacity(capacity: usize) -> Self;
+}
+
+impl<T> HashSetExt for std::collections::HashSet<T, RandomState> {
+ fn new() -> Self {
+ Self::with_hasher(RandomState::default())
+ }
+
+ fn with_capacity(capacity: usize) -> Self {
+ Self::with_capacity_and_hasher(capacity, RandomState::default())
+ }
+}
+
+impl<T> HashSetExt for std::collections::HashSet<T, FixedState> {
+ fn new() -> Self {
+ Self::with_hasher(FixedState::default())
+ }
+
+ fn with_capacity(capacity: usize) -> Self {
+ Self::with_capacity_and_hasher(capacity, FixedState::default())
+ }
+}
diff --git a/crates/foldhash/src/lib.rs b/crates/foldhash/src/lib.rs
new file mode 100644
index 0000000..deb1636
--- /dev/null
+++ b/crates/foldhash/src/lib.rs
@@ -0,0 +1,397 @@
+//! This crate provides foldhash, a fast, non-cryptographic, minimally
+//! DoS-resistant hashing algorithm designed for computational uses such as
+//! hashmaps, bloom filters, count sketching, etc.
+//!
+//! When should you **not** use foldhash:
+//!
+//! - You are afraid of people studying your long-running program's behavior
+//! to reverse engineer its internal random state and using this knowledge to
+//! create many colliding inputs for computational complexity attacks.
+//!
+//! - You expect foldhash to have a consistent output across versions or
+//! platforms, such as for persistent file formats or communication protocols.
+//!
+//! - You are relying on foldhash's properties for any kind of security.
+//! Foldhash is **not appropriate for any cryptographic purpose**.
+//!
+//! Foldhash has two variants, one optimized for speed which is ideal for data
+//! structures such as hash maps and bloom filters, and one optimized for
+//! statistical quality which is ideal for algorithms such as
+//! [HyperLogLog](https://en.wikipedia.org/wiki/HyperLogLog) and
+//! [MinHash](https://en.wikipedia.org/wiki/MinHash).
+//!
+//! Foldhash can be used in a `#![no_std]` environment by disabling its default
+//! `"std"` feature.
+//!
+//! # Usage
+//!
+//! The easiest way to use this crate with the standard library [`HashMap`] or
+//! [`HashSet`] is to import them from `foldhash` instead, along with the
+//! extension traits to make [`HashMap::new`] and [`HashMap::with_capacity`]
+//! work out-of-the-box:
+//!
+//! ```rust
+//! use foldhash::{HashMap, HashMapExt};
+//!
+//! let mut hm = HashMap::new();
+//! hm.insert(42, "hello");
+//! ```
+//!
+//! You can also avoid the convenience types and do it manually by initializing
+//! a [`RandomState`](fast::RandomState), for example if you are using a different hash map
+//! implementation like [`hashbrown`](https://docs.rs/hashbrown/):
+//!
+//! ```rust
+//! use hashbrown::HashMap;
+//! use foldhash::fast::RandomState;
+//!
+//! let mut hm = HashMap::with_hasher(RandomState::default());
+//! hm.insert("foo", "bar");
+//! ```
+//!
+//! The above methods are the recommended way to use foldhash, which will
+//! automatically create a randomly seeded hasher instance for you. If you
+//! absolutely must have determinism you can use [`FixedState`](fast::FixedState)
+//! instead, but note that this makes you trivially vulnerable to HashDoS
+//! attacks and might lead to quadratic runtime when moving data from one
+//! hashmap/set into another:
+//!
+//! ```rust
+//! use std::collections::HashSet;
+//! use foldhash::fast::FixedState;
+//!
+//! let mut hm = HashSet::with_hasher(FixedState::with_seed(42));
+//! hm.insert([1, 10, 100]);
+//! ```
+//!
+//! If you rely on statistical properties of the hash for the correctness of
+//! your algorithm, such as in [HyperLogLog](https://en.wikipedia.org/wiki/HyperLogLog),
+//! it is suggested to use the [`RandomState`](quality::RandomState)
+//! or [`FixedState`](quality::FixedState) from the [`quality`] module instead
+//! of the [`fast`] module. The latter is optimized purely for speed in hash
+//! tables and has known statistical imperfections.
+//!
+//! Finally, you can also directly use the [`RandomState`](quality::RandomState)
+//! or [`FixedState`](quality::FixedState) to manually hash items using the
+//! [`BuildHasher`](std::hash::BuildHasher) trait:
+//! ```rust
+//! use std::hash::BuildHasher;
+//! use foldhash::quality::RandomState;
+//!
+//! let random_state = RandomState::default();
+//! let hash = random_state.hash_one("hello world");
+//! ```
+
+#![cfg_attr(all(not(test), not(feature = "std")), no_std)]
+#![warn(missing_docs)]
+
+use core::hash::Hasher;
+
+#[cfg(feature = "std")]
+mod convenience;
+mod seed;
+
+#[cfg(feature = "std")]
+pub use convenience::*;
+
+// Arbitrary constants with high entropy. Hexadecimal digits of pi were used.
+const ARBITRARY0: u64 = 0x243f6a8885a308d3;
+const ARBITRARY1: u64 = 0x13198a2e03707344;
+const ARBITRARY2: u64 = 0xa4093822299f31d0;
+const ARBITRARY3: u64 = 0x082efa98ec4e6c89;
+const ARBITRARY4: u64 = 0x452821e638d01377;
+const ARBITRARY5: u64 = 0xbe5466cf34e90c6c;
+const ARBITRARY6: u64 = 0xc0ac29b7c97c50dd;
+const ARBITRARY7: u64 = 0x3f84d5b5b5470917;
+const ARBITRARY8: u64 = 0x9216d5d98979fb1b;
+const ARBITRARY9: u64 = 0xd1310ba698dfb5ac;
+
+#[inline(always)]
+const fn folded_multiply(x: u64, y: u64) -> u64 {
+ #[cfg(target_pointer_width = "64")]
+ {
+ // We compute the full u64 x u64 -> u128 product, this is a single mul
+ // instruction on x86-64, one mul plus one mulhi on ARM64.
+ let full = (x as u128) * (y as u128);
+ let lo = full as u64;
+ let hi = (full >> 64) as u64;
+
+ // The middle bits of the full product fluctuate the most with small
+ // changes in the input. This is the top bits of lo and the bottom bits
+ // of hi. We can thus make the entire output fluctuate with small
+ // changes to the input by XOR'ing these two halves.
+ lo ^ hi
+ }
+
+ #[cfg(target_pointer_width = "32")]
+ {
+ // u64 x u64 -> u128 product is prohibitively expensive on 32-bit.
+ // Decompose into 32-bit parts.
+ let lx = x as u32;
+ let ly = y as u32;
+ let hx = (x >> 32) as u32;
+ let hy = (y >> 32) as u32;
+
+ // u32 x u32 -> u64 the low bits of one with the high bits of the other.
+ let afull = (lx as u64) * (hy as u64);
+ let bfull = (hx as u64) * (ly as u64);
+
+ // Combine, swapping low/high of one of them so the upper bits of the
+ // product of one combine with the lower bits of the other.
+ afull ^ bfull.rotate_right(32)
+ }
+}
+
+/// The foldhash implementation optimized for speed.
+pub mod fast {
+ use super::*;
+
+ pub use seed::fast::{FixedState, RandomState};
+
+ /// A [`Hasher`] instance implementing foldhash, optimized for speed.
+ ///
+ /// It can't be created directly, see [`RandomState`] or [`FixedState`].
+ #[derive(Clone)]
+ pub struct FoldHasher {
+ accumulator: u64,
+ sponge: u128,
+ sponge_len: u8,
+ fold_seed: u64,
+ expand_seed: u64,
+ expand_seed2: u64,
+ expand_seed3: u64,
+ }
+
+ impl FoldHasher {
+ pub(crate) fn with_seed(per_hasher_seed: u64, global_seed: &[u64; 4]) -> FoldHasher {
+ FoldHasher {
+ accumulator: per_hasher_seed,
+ sponge: 0,
+ sponge_len: 0,
+ fold_seed: global_seed[0],
+ expand_seed: global_seed[1],
+ expand_seed2: global_seed[2],
+ expand_seed3: global_seed[3],
+ }
+ }
+
+ #[inline(always)]
+ fn write_num<T: Into<u128>>(&mut self, x: T) {
+ let bits: usize = 8 * core::mem::size_of::<T>();
+ if self.sponge_len as usize + bits > 128 {
+ let lo = self.sponge as u64;
+ let hi = (self.sponge >> 64) as u64;
+ self.accumulator = folded_multiply(lo ^ self.accumulator, hi ^ self.fold_seed);
+ self.sponge = x.into();
+ self.sponge_len = 0;
+ } else {
+ self.sponge |= x.into() << self.sponge_len;
+ self.sponge_len += bits as u8;
+ }
+ }
+ }
+
+ impl Hasher for FoldHasher {
+ #[inline(always)]
+ fn write(&mut self, bytes: &[u8]) {
+ let mut s0 = self.accumulator;
+ let mut s1 = self.expand_seed;
+ let len = bytes.len();
+ if len <= 16 {
+ // XOR the input into s0, s1, then multiply and fold.
+ if len >= 8 {
+ s0 ^= u64::from_ne_bytes(bytes[0..8].try_into().unwrap());
+ s1 ^= u64::from_ne_bytes(bytes[len - 8..].try_into().unwrap());
+ } else if len >= 4 {
+ s0 ^= u32::from_ne_bytes(bytes[0..4].try_into().unwrap()) as u64;
+ s1 ^= u32::from_ne_bytes(bytes[len - 4..].try_into().unwrap()) as u64;
+ } else if len > 0 {
+ let lo = bytes[0];
+ let mid = bytes[len / 2];
+ let hi = bytes[len - 1];
+ s0 ^= lo as u64;
+ s1 ^= ((hi as u64) << 8) | mid as u64;
+ }
+ self.accumulator = folded_multiply(s0, s1);
+ } else if len < 256 {
+ self.accumulator = hash_bytes_medium(bytes, s0, s1, self.fold_seed);
+ } else {
+ self.accumulator = hash_bytes_long(
+ bytes,
+ s0,
+ s1,
+ self.expand_seed2,
+ self.expand_seed3,
+ self.fold_seed,
+ );
+ }
+ }
+
+ #[inline(always)]
+ fn write_u8(&mut self, i: u8) {
+ self.write_num(i);
+ }
+
+ #[inline(always)]
+ fn write_u16(&mut self, i: u16) {
+ self.write_num(i);
+ }
+
+ #[inline(always)]
+ fn write_u32(&mut self, i: u32) {
+ self.write_num(i);
+ }
+
+ #[inline(always)]
+ fn write_u64(&mut self, i: u64) {
+ self.write_num(i);
+ }
+
+ #[inline(always)]
+ fn write_u128(&mut self, i: u128) {
+ let lo = i as u64;
+ let hi = (i >> 64) as u64;
+ self.accumulator = folded_multiply(lo ^ self.accumulator, hi ^ self.fold_seed);
+ }
+
+ #[inline(always)]
+ fn write_usize(&mut self, i: usize) {
+ // u128 doesn't implement From<usize>.
+ #[cfg(target_pointer_width = "32")]
+ self.write_num(i as u32);
+ #[cfg(target_pointer_width = "64")]
+ self.write_num(i as u64);
+ }
+
+ #[inline(always)]
+ fn finish(&self) -> u64 {
+ if self.sponge_len > 0 {
+ let lo = self.sponge as u64;
+ let hi = (self.sponge >> 64) as u64;
+ folded_multiply(lo ^ self.accumulator, hi ^ self.fold_seed)
+ } else {
+ self.accumulator
+ }
+ }
+ }
+}
+
+/// The foldhash implementation optimized for quality.
+pub mod quality {
+ use super::*;
+
+ pub use seed::quality::{FixedState, RandomState};
+
+ /// A [`Hasher`] instance implementing foldhash, optimized for quality.
+ ///
+ /// It can't be created directly, see [`RandomState`] or [`FixedState`].
+ #[derive(Clone)]
+ pub struct FoldHasher {
+ pub(crate) inner: fast::FoldHasher,
+ }
+
+ impl Hasher for FoldHasher {
+ #[inline(always)]
+ fn write(&mut self, bytes: &[u8]) {
+ self.inner.write(bytes);
+ }
+
+ #[inline(always)]
+ fn write_u8(&mut self, i: u8) {
+ self.inner.write_u8(i);
+ }
+
+ #[inline(always)]
+ fn write_u16(&mut self, i: u16) {
+ self.inner.write_u16(i);
+ }
+
+ #[inline(always)]
+ fn write_u32(&mut self, i: u32) {
+ self.inner.write_u32(i);
+ }
+
+ #[inline(always)]
+ fn write_u64(&mut self, i: u64) {
+ self.inner.write_u64(i);
+ }
+
+ #[inline(always)]
+ fn write_u128(&mut self, i: u128) {
+ self.inner.write_u128(i);
+ }
+
+ #[inline(always)]
+ fn write_usize(&mut self, i: usize) {
+ self.inner.write_usize(i);
+ }
+
+ #[inline(always)]
+ fn finish(&self) -> u64 {
+ folded_multiply(self.inner.finish(), ARBITRARY0)
+ }
+ }
+}
+
+/// Hashes strings >= 16 bytes, has unspecified behavior when bytes.len() < 16.
+fn hash_bytes_medium(bytes: &[u8], mut s0: u64, mut s1: u64, fold_seed: u64) -> u64 {
+ // Process 32 bytes per iteration, 16 bytes from the start, 16 bytes from
+ // the end. On the last iteration these two chunks can overlap, but that is
+ // perfectly fine.
+ let left_to_right = bytes.chunks_exact(16);
+ let mut right_to_left = bytes.rchunks_exact(16);
+ for lo in left_to_right {
+ let hi = right_to_left.next().unwrap();
+ let unconsumed_start = lo.as_ptr();
+ let unconsumed_end = hi.as_ptr_range().end;
+ if unconsumed_start >= unconsumed_end {
+ break;
+ }
+
+ let a = u64::from_ne_bytes(lo[0..8].try_into().unwrap());
+ let b = u64::from_ne_bytes(lo[8..16].try_into().unwrap());
+ let c = u64::from_ne_bytes(hi[0..8].try_into().unwrap());
+ let d = u64::from_ne_bytes(hi[8..16].try_into().unwrap());
+ s0 = folded_multiply(a ^ s0, c ^ fold_seed);
+ s1 = folded_multiply(b ^ s1, d ^ fold_seed);
+ }
+
+ s0 ^ s1
+}
+
+/// Hashes strings >= 16 bytes, has unspecified behavior when bytes.len() < 16.
+#[cold]
+#[inline(never)]
+fn hash_bytes_long(
+ bytes: &[u8],
+ mut s0: u64,
+ mut s1: u64,
+ mut s2: u64,
+ mut s3: u64,
+ fold_seed: u64,
+) -> u64 {
+ let chunks = bytes.chunks_exact(64);
+ let remainder = chunks.remainder().len();
+ for chunk in chunks {
+ let a = u64::from_ne_bytes(chunk[0..8].try_into().unwrap());
+ let b = u64::from_ne_bytes(chunk[8..16].try_into().unwrap());
+ let c = u64::from_ne_bytes(chunk[16..24].try_into().unwrap());
+ let d = u64::from_ne_bytes(chunk[24..32].try_into().unwrap());
+ let e = u64::from_ne_bytes(chunk[32..40].try_into().unwrap());
+ let f = u64::from_ne_bytes(chunk[40..48].try_into().unwrap());
+ let g = u64::from_ne_bytes(chunk[48..56].try_into().unwrap());
+ let h = u64::from_ne_bytes(chunk[56..64].try_into().unwrap());
+ s0 = folded_multiply(a ^ s0, e ^ fold_seed);
+ s1 = folded_multiply(b ^ s1, f ^ fold_seed);
+ s2 = folded_multiply(c ^ s2, g ^ fold_seed);
+ s3 = folded_multiply(d ^ s3, h ^ fold_seed);
+ }
+ s0 ^= s2;
+ s1 ^= s3;
+
+ if remainder > 0 {
+ hash_bytes_medium(&bytes[bytes.len() - remainder.max(16)..], s0, s1, fold_seed)
+ } else {
+ s0 ^ s1
+ }
+}
diff --git a/crates/foldhash/src/seed.rs b/crates/foldhash/src/seed.rs
new file mode 100644
index 0000000..f74d2a6
--- /dev/null
+++ b/crates/foldhash/src/seed.rs
@@ -0,0 +1,326 @@
+use core::hash::BuildHasher;
+
+// These constants may end up unused depending on platform support.
+#[allow(unused)]
+use crate::{ARBITRARY1, ARBITRARY9};
+
+use super::{
+ folded_multiply, ARBITRARY2, ARBITRARY3, ARBITRARY4, ARBITRARY5, ARBITRARY6, ARBITRARY7,
+ ARBITRARY8,
+};
+
+/// Used for FixedState, and RandomState if atomics for dynamic init are unavailable.
+const FIXED_GLOBAL_SEED: [u64; 4] = [ARBITRARY4, ARBITRARY5, ARBITRARY6, ARBITRARY7];
+
+pub mod fast {
+ use super::*;
+ use crate::fast::FoldHasher;
+
+ /// A [`BuildHasher`] for [`fast::FoldHasher`]s that are randomly initialized.
+ #[derive(Copy, Clone, Debug)]
+ pub struct RandomState {
+ per_hasher_seed: u64,
+ global_seed: global::GlobalSeed,
+ }
+
+ impl Default for RandomState {
+ fn default() -> Self {
+ let per_hasher_seed;
+
+ // If we have the standard library available we use a thread-local
+ // counter for the per-hasher seed.
+ #[cfg(feature = "std")]
+ {
+ use std::cell::Cell;
+ thread_local! {
+ static PER_HASHER_NONDETERMINISM: Cell<u64> = const { Cell::new(0) };
+ }
+
+ let mut nondeterminism = PER_HASHER_NONDETERMINISM.get();
+ nondeterminism = nondeterminism.wrapping_add(ARBITRARY1 | 1); // Ensure number is odd for maximum period.
+ PER_HASHER_NONDETERMINISM.set(nondeterminism);
+ per_hasher_seed = folded_multiply(nondeterminism, ARBITRARY2);
+ };
+
+ // If we don't have the standard library we use our current stack
+ // address in combination with a global PER_HASHER_NONDETERMINISM to
+ // create a new value that is very likely to have never been used as
+ // a random state before.
+ //
+ // PER_HASHER_NONDETERMINISM is loaded and updated in a racy manner,
+ // but this doesn't matter in practice - it is impossible that two
+ // different threads have the same stack location, so they'll almost
+ // surely generate different seeds, and provide a different possible
+ // update for PER_HASHER_NONDETERMINISM. If we would use a proper
+ // fetch_add atomic update then there is a larger chance of
+ // problematic contention.
+ //
+ // We use usize instead of 64-bit atomics for best platform support.
+ #[cfg(not(feature = "std"))]
+ {
+ use core::sync::atomic::{AtomicUsize, Ordering};
+ static PER_HASHER_NONDETERMINISM: AtomicUsize = AtomicUsize::new(0);
+
+ let nondeterminism = PER_HASHER_NONDETERMINISM.load(Ordering::Relaxed) as u64;
+ let stack_ptr = &nondeterminism as *const _ as u64;
+ per_hasher_seed = folded_multiply(nondeterminism ^ stack_ptr, ARBITRARY2);
+ PER_HASHER_NONDETERMINISM.store(per_hasher_seed as usize, Ordering::Relaxed);
+ }
+
+ Self {
+ per_hasher_seed,
+ global_seed: global::GlobalSeed::new(),
+ }
+ }
+ }
+
+ impl BuildHasher for RandomState {
+ type Hasher = FoldHasher;
+
+ fn build_hasher(&self) -> FoldHasher {
+ FoldHasher::with_seed(self.per_hasher_seed, self.global_seed.get())
+ }
+ }
+
+ /// A [`BuildHasher`] for [`fast::FoldHasher`]s that all have the same fixed seed.
+ ///
+ /// Not recommended unless you absolutely need determinism.
+ #[derive(Copy, Clone, Debug)]
+ pub struct FixedState {
+ per_hasher_seed: u64,
+ }
+
+ impl FixedState {
+ /// Creates a [`FixedState`] with the given seed.
+ pub const fn with_seed(seed: u64) -> Self {
+ // XOR with ARBITRARY3 such that with_seed(0) matches default.
+ Self {
+ per_hasher_seed: seed ^ ARBITRARY3,
+ }
+ }
+ }
+
+ impl Default for FixedState {
+ fn default() -> Self {
+ Self {
+ per_hasher_seed: ARBITRARY3,
+ }
+ }
+ }
+
+ impl BuildHasher for FixedState {
+ type Hasher = FoldHasher;
+
+ fn build_hasher(&self) -> FoldHasher {
+ FoldHasher::with_seed(self.per_hasher_seed, &FIXED_GLOBAL_SEED)
+ }
+ }
+}
+
+pub mod quality {
+ use super::*;
+ use crate::quality::FoldHasher;
+
+ /// A [`BuildHasher`] for [`quality::FoldHasher`]s that are randomly initialized.
+ #[derive(Copy, Clone, Default, Debug)]
+ pub struct RandomState {
+ inner: fast::RandomState,
+ }
+
+ impl BuildHasher for RandomState {
+ type Hasher = FoldHasher;
+
+ fn build_hasher(&self) -> FoldHasher {
+ FoldHasher {
+ inner: self.inner.build_hasher(),
+ }
+ }
+ }
+
+ /// A [`BuildHasher`] for [`quality::FoldHasher`]s that all have the same fixed seed.
+ ///
+ /// Not recommended unless you absolutely need determinism.
+ #[derive(Copy, Clone, Default, Debug)]
+ pub struct FixedState {
+ inner: fast::FixedState,
+ }
+
+ impl FixedState {
+ /// Creates a [`FixedState`] with the given seed.
+ pub const fn with_seed(seed: u64) -> Self {
+ Self {
+ // We do an additional folded multiply with the seed here for
+ // the quality hash to ensure better independence between seed
+ // and hash. If the seed is zero the folded multiply is zero,
+ // preserving with_seed(0) == default().
+ inner: fast::FixedState::with_seed(folded_multiply(seed, ARBITRARY8)),
+ }
+ }
+ }
+
+ impl BuildHasher for FixedState {
+ type Hasher = FoldHasher;
+
+ fn build_hasher(&self) -> FoldHasher {
+ FoldHasher {
+ inner: self.inner.build_hasher(),
+ }
+ }
+ }
+}
+
+#[cfg(target_has_atomic = "8")]
+mod global {
+ use super::*;
+ use core::cell::UnsafeCell;
+ use core::sync::atomic::{AtomicU8, Ordering};
+
+ fn generate_global_seed() -> [u64; 4] {
+ let mix = |seed: u64, x: u64| folded_multiply(seed ^ x, ARBITRARY9);
+
+ // Use address space layout randomization as our main randomness source.
+ // This isn't great, but we don't advertise HashDoS resistance in the first
+ // place. This is a whole lot better than nothing, at near zero cost with
+ // no dependencies.
+ let mut seed = 0;
+ let stack_ptr = &seed as *const _;
+ let func_ptr = generate_global_seed;
+ let static_ptr = &GLOBAL_SEED_STORAGE as *const _;
+ seed = mix(seed, stack_ptr as usize as u64);
+ seed = mix(seed, func_ptr as usize as u64);
+ seed = mix(seed, static_ptr as usize as u64);
+
+ // If we have the standard library available, augment entropy with the
+ // current time and an address from the allocator.
+ #[cfg(feature = "std")]
+ {
+ #[cfg(not(all(target_family = "wasm", target_os = "unknown")))]
+ if let Ok(duration) = std::time::UNIX_EPOCH.elapsed() {
+ seed = mix(seed, duration.subsec_nanos() as u64);
+ seed = mix(seed, duration.as_secs());
+ }
+
+ let box_ptr = &*Box::new(0u8) as *const _;
+ seed = mix(seed, box_ptr as usize as u64);
+ }
+
+ let seed_a = mix(seed, 0);
+ let seed_b = mix(mix(mix(seed_a, 0), 0), 0);
+ let seed_c = mix(mix(mix(seed_b, 0), 0), 0);
+ let seed_d = mix(mix(mix(seed_c, 0), 0), 0);
+
+ // Zeroes form a weak-point for the multiply-mix, and zeroes tend to be
+ // a common input. So we want our global seeds that are XOR'ed with the
+ // input to always be non-zero. To also ensure there is always a good spread
+ // of bits, we give up 3 bits of entropy and simply force some bits on.
+ const FORCED_ONES: u64 = (1 << 63) | (1 << 31) | 1;
+ [
+ seed_a | FORCED_ONES,
+ seed_b | FORCED_ONES,
+ seed_c | FORCED_ONES,
+ seed_d | FORCED_ONES,
+ ]
+ }
+
+ // Now all the below code purely exists to cache the above seed as
+ // efficiently as possible. Even if we weren't a no_std crate and had access to
+ // OnceLock, we don't want to check whether the global is set each time we
+ // hash an object, so we hand-roll a global storage where type safety allows us
+ // to assume the storage is initialized after construction.
+ struct GlobalSeedStorage {
+ state: AtomicU8,
+ seed: UnsafeCell<[u64; 4]>,
+ }
+
+ const UNINIT: u8 = 0;
+ const LOCKED: u8 = 1;
+ const INIT: u8 = 2;
+
+ // SAFETY: we only mutate the UnsafeCells when state is in the thread-exclusive
+ // LOCKED state, and only read the UnsafeCells when state is in the
+ // once-achieved-eternally-preserved state INIT.
+ unsafe impl Sync for GlobalSeedStorage {}
+
+ static GLOBAL_SEED_STORAGE: GlobalSeedStorage = GlobalSeedStorage {
+ state: AtomicU8::new(UNINIT),
+ seed: UnsafeCell::new([0; 4]),
+ };
+
+ /// An object representing an initialized global seed.
+ ///
+ /// Does not actually store the seed inside itself, it is a zero-sized type.
+ /// This prevents inflating the RandomState size and in turn HashMap's size.
+ #[derive(Copy, Clone, Debug)]
+ pub struct GlobalSeed {
+ // So we can't accidentally type GlobalSeed { } within this crate.
+ _no_accidental_unsafe_init: (),
+ }
+
+ impl GlobalSeed {
+ #[inline(always)]
+ pub fn new() -> Self {
+ if GLOBAL_SEED_STORAGE.state.load(Ordering::Acquire) != INIT {
+ Self::init_slow()
+ }
+ Self {
+ _no_accidental_unsafe_init: (),
+ }
+ }
+
+ #[cold]
+ #[inline(never)]
+ fn init_slow() {
+ // Generate seed outside of critical section.
+ let seed = generate_global_seed();
+
+ loop {
+ match GLOBAL_SEED_STORAGE.state.compare_exchange_weak(
+ UNINIT,
+ LOCKED,
+ Ordering::Relaxed,
+ Ordering::Acquire,
+ ) {
+ Ok(_) => unsafe {
+ // SAFETY: we just acquired an exclusive lock.
+ *GLOBAL_SEED_STORAGE.seed.get() = seed;
+ GLOBAL_SEED_STORAGE.state.store(INIT, Ordering::Release);
+ return;
+ },
+
+ Err(INIT) => return,
+
+ // Yes, it's a spin loop. We need to support no_std (so no easy
+ // access to proper locks), this is a one-time-per-program
+ // initialization, and the critical section is only a few
+ // store instructions, so it'll be fine.
+ _ => core::hint::spin_loop(),
+ }
+ }
+ }
+
+ #[inline(always)]
+ pub fn get(self) -> &'static [u64; 4] {
+ // SAFETY: our constructor ensured we are in the INIT state and thus
+ // this raw read does not race with any write.
+ unsafe { &*GLOBAL_SEED_STORAGE.seed.get() }
+ }
+ }
+}
+
+#[cfg(not(target_has_atomic = "8"))]
+mod global {
+ #[derive(Copy, Clone, Debug)]
+ pub struct GlobalSeed {}
+
+ impl GlobalSeed {
+ #[inline(always)]
+ pub fn new() -> Self {
+ Self {}
+ }
+
+ #[inline(always)]
+ pub fn get(self) -> &'static [u64; 4] {
+ &super::FIXED_GLOBAL_SEED
+ }
+ }
+}
diff --git a/pseudo_crate/Cargo.lock b/pseudo_crate/Cargo.lock
index 1e2bfb7..2d2263a 100644
--- a/pseudo_crate/Cargo.lock
+++ b/pseudo_crate/Cargo.lock
@@ -193,6 +193,7 @@
"flagset",
"flate2",
"fnv",
+ "foldhash",
"foreign-types 0.3.1",
"foreign-types-shared 0.1.0",
"form_urlencoded",
@@ -1974,6 +1975,12 @@
checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1"
[[package]]
+name = "foldhash"
+version = "0.1.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f81ec6369c545a7d40e4589b5597581fa1c441fe1cce96dd1de43159910a36a2"
+
+[[package]]
name = "font-kit"
version = "0.14.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
diff --git a/pseudo_crate/Cargo.toml b/pseudo_crate/Cargo.toml
index 889d293..defc96a 100644
--- a/pseudo_crate/Cargo.toml
+++ b/pseudo_crate/Cargo.toml
@@ -108,6 +108,7 @@
flagset = "=0.4.6"
flate2 = "=1.0.35"
fnv = "=1.0.7"
+foldhash = "=0.1.3"
foreign-types = "=0.3.1"
foreign-types-shared = "=0.1.0"
form_urlencoded = "=1.2.1"
diff --git a/pseudo_crate/crate-list.txt b/pseudo_crate/crate-list.txt
index 4fe0ee6..8b7fa2c 100644
--- a/pseudo_crate/crate-list.txt
+++ b/pseudo_crate/crate-list.txt
@@ -100,6 +100,7 @@
flagset
flate2
fnv
+foldhash
foreign-types
foreign-types-shared
form_urlencoded