Upgrade rust/crates/memchr to 2.4.1 am: 1e1ba1afe0 am: 83f6d7e5a5 Original change: https://android-review.googlesource.com/c/platform/external/rust/crates/memchr/+/1832701 Change-Id: Ice5124fe90d1e36646c6427d72a99c6cb47d3312

commit: 66c4d02086d468953ab6802cb7dc27d3cba83269 [log] [tgz]
author: Joel Galenson <jgalenson@google.com> Tue Oct 05 02:14:54 2021 +0000
committer: Automerger Merge Worker <android-build-automerger-merge-worker@system.gserviceaccount.com> Tue Oct 05 02:14:54 2021 +0000
tree: 7be84b30235be3873090ff421bd1268f8a562088
parent: cc65e6b515e48f872d16e0736992b9eec049895d [diff]
parent: 83f6d7e5a5c0153fa2c01727461d62ea178b318e [diff]
diff --git a/.cargo_vcs_info.json b/.cargo_vcs_info.json
index ee47bc5..2507469 100644
--- a/.cargo_vcs_info.json
+++ b/.cargo_vcs_info.json

@@ -1,5 +1,5 @@
 {
   "git": {
-    "sha1": "0ba6735559ae9c1cb7e6470a97834a95cc06fbbb"
+    "sha1": "8e1da98fee06d66c13e66c330e3a3dd6ccf0e3a0"
   }
 }

diff --git a/Android.bp b/Android.bp
index 1ce90ac..bc7a1c2 100644
--- a/Android.bp
+++ b/Android.bp

@@ -42,6 +42,8 @@
     name: "libmemchr",
     host_supported: true,
     crate_name: "memchr",
+    cargo_env_compat: true,
+    cargo_pkg_version: "2.4.1",
     srcs: ["src/lib.rs"],
     edition: "2018",
     features: [

diff --git a/Cargo.toml b/Cargo.toml
index ed97e9f..e739019 100644
--- a/Cargo.toml
+++ b/Cargo.toml

@@ -3,26 +3,25 @@
 # When uploading crates to the registry Cargo will automatically
 # "normalize" Cargo.toml files for maximal compatibility
 # with all versions of Cargo and also rewrite `path` dependencies
-# to registry (e.g., crates.io) dependencies
+# to registry (e.g., crates.io) dependencies.
 #
-# If you believe there's an error in this file please file an
-# issue against the rust-lang/cargo repository. If you're
-# editing this file be aware that the upstream Cargo.toml
-# will likely look very different (and much more reasonable)
+# If you are reading this file be aware that the original Cargo.toml
+# will likely look very different (and much more reasonable).
+# See Cargo.toml.orig for the original contents.
 
 [package]
 edition = "2018"
 name = "memchr"
-version = "2.4.0"
+version = "2.4.1"
 authors = ["Andrew Gallant <jamslam@gmail.com>", "bluss"]
 exclude = ["/bench", "/.github", "/fuzz"]
 description = "Safe interface to memchr."
-homepage = "https://github.com/BurntSushi/rust-memchr"
+homepage = "https://github.com/BurntSushi/memchr"
 documentation = "https://docs.rs/memchr/"
 readme = "README.md"
 keywords = ["memchr", "char", "scan", "strchr", "string"]
 license = "Unlicense/MIT"
-repository = "https://github.com/BurntSushi/rust-memchr"
+repository = "https://github.com/BurntSushi/memchr"
 [profile.bench]
 debug = true
 
@@ -36,6 +35,15 @@
 [lib]
 name = "memchr"
 bench = false
+[dependencies.compiler_builtins]
+version = "0.1.2"
+optional = true
+
+[dependencies.core]
+version = "1.0.0"
+optional = true
+package = "rustc-std-workspace-core"
+
 [dependencies.libc]
 version = "0.2.18"
 optional = true
@@ -46,5 +54,6 @@
 
 [features]
 default = ["std"]
+rustc-dep-of-std = ["core", "compiler_builtins"]
 std = []
 use_std = ["std"]

diff --git a/Cargo.toml.orig b/Cargo.toml.orig
index 6218b17..2348487 100644
--- a/Cargo.toml.orig
+++ b/Cargo.toml.orig

@@ -1,11 +1,11 @@
 [package]
 name = "memchr"
-version = "2.4.0"  #:version
+version = "2.4.1"  #:version
 authors = ["Andrew Gallant <jamslam@gmail.com>", "bluss"]
 description = "Safe interface to memchr."
 documentation = "https://docs.rs/memchr/"
-homepage = "https://github.com/BurntSushi/rust-memchr"
-repository = "https://github.com/BurntSushi/rust-memchr"
+homepage = "https://github.com/BurntSushi/memchr"
+repository = "https://github.com/BurntSushi/memchr"
 readme = "README.md"
 keywords = ["memchr", "char", "scan", "strchr", "string"]
 license = "Unlicense/MIT"
@@ -31,9 +31,18 @@
 # then, it is alias for the 'std' feature.
 use_std = ["std"]
 
+# Internal feature, only used when building as part of libstd, not part of the
+# stable interface of this crate.
+rustc-dep-of-std = ['core', 'compiler_builtins']
+
 [dependencies]
 libc = { version = "0.2.18", default-features = false, optional = true }
 
+# Internal feature, only used when building as part of libstd, not part of the
+# stable interface of this crate.
+core = { version = '1.0.0', optional = true, package = 'rustc-std-workspace-core' }
+compiler_builtins = { version = '0.1.2', optional = true }
+
 [dev-dependencies]
 quickcheck = { version = "1.0.3", default-features = false }
 

diff --git a/METADATA b/METADATA
index 0944612..2112cd0 100644
--- a/METADATA
+++ b/METADATA

@@ -7,13 +7,13 @@
   }
   url {
     type: ARCHIVE
-    value: "https://static.crates.io/crates/memchr/memchr-2.4.0.crate"
+    value: "https://static.crates.io/crates/memchr/memchr-2.4.1.crate"
   }
-  version: "2.4.0"
+  version: "2.4.1"
   license_type: NOTICE
   last_upgrade_date {
     year: 2021
-    month: 5
-    day: 19
+    month: 9
+    day: 22
   }
 }

diff --git a/README.md b/README.md
index 3b92e18..df75816 100644
--- a/README.md
+++ b/README.md

@@ -2,7 +2,7 @@
 ======
 This library provides heavily optimized routines for string search primitives.
 
-[![Build status](https://github.com/BurntSushi/rust-memchr/workflows/ci/badge.svg)](https://github.com/BurntSushi/rust-memchr/actions)
+[![Build status](https://github.com/BurntSushi/memchr/workflows/ci/badge.svg)](https://github.com/BurntSushi/memchr/actions)
 [![](https://meritbadge.herokuapp.com/memchr)](https://crates.io/crates/memchr)
 
 Dual-licensed under MIT or the [UNLICENSE](https://unlicense.org/).
@@ -84,4 +84,24 @@
 * A huge suite of benchmarks that are also run as tests. Benchmarks always
   confirm that the expected result occurs.
 
-Improvements to the testing infrastructue are very welcome.
+Improvements to the testing infrastructure are very welcome.
+
+
+### Algorithms used
+
+At time of writing, this crate's implementation of substring search actually
+has a few different algorithms to choose from depending on the situation.
+
+* For very small haystacks,
+  [Rabin-Karp](https://en.wikipedia.org/wiki/Rabin%E2%80%93Karp_algorithm)
+  is used to reduce latency. Rabin-Karp has very small overhead and can often
+  complete before other searchers have even been constructed.
+* For small needles, a variant of the
+  ["Generic SIMD"](http://0x80.pl/articles/simd-strfind.html#algorithm-1-generic-simd)
+  algorithm is used. Instead of using the first and last bytes, a heuristic is
+  used to select bytes based on a background distribution of byte frequencies.
+* In all other cases,
+  [Two-Way](https://en.wikipedia.org/wiki/Two-way_string-matching_algorithm)
+  is used. If possible, a prefilter based on the "Generic SIMD" algorithm
+  linked above is used to find candidates quickly. A dynamic heuristic is used
+  to detect if the prefilter is ineffective, and if so, disables it.

diff --git a/scripts/make-byte-frequency-table b/scripts/make-byte-frequency-table
new file mode 100755
index 0000000..37eeca7
--- /dev/null
+++ b/scripts/make-byte-frequency-table

@@ -0,0 +1,74 @@
+#!/usr/bin/env python
+
+# This does simple normalized frequency analysis on UTF-8 encoded text. The
+# result of the analysis is translated to a ranked list, where every byte is
+# assigned a rank. This list is written to src/freqs.rs.
+#
+# Currently, the frequencies are generated from the following corpuses:
+#
+#   * The CIA world fact book
+#   * The source code of rustc
+#   * Septuaginta
+
+from __future__ import absolute_import, division, print_function
+
+import argparse
+from collections import Counter
+import sys
+
+preamble = '''
+// NOTE: The following code was generated by "scripts/frequencies.py", do not
+// edit directly
+'''.lstrip()
+
+
+def eprint(*args, **kwargs):
+    kwargs['file'] = sys.stderr
+    print(*args, **kwargs)
+
+
+def main():
+    p = argparse.ArgumentParser()
+    p.add_argument('corpus', metavar='FILE', nargs='+')
+    args = p.parse_args()
+
+    # Get frequency counts of each byte.
+    freqs = Counter()
+    for i in range(0, 256):
+        freqs[i] = 0
+
+    eprint('reading entire corpus into memory')
+    corpus = []
+    for fpath in args.corpus:
+        corpus.append(open(fpath, 'rb').read())
+
+    eprint('computing byte frequencies')
+    for c in corpus:
+        for byte in c:
+            freqs[byte] += 1.0 / float(len(c))
+
+    eprint('writing Rust code')
+    # Get the rank of each byte. A lower rank => lower relative frequency.
+    rank = [0] * 256
+    for i, (byte, _) in enumerate(freqs.most_common()):
+        # print(byte)
+        rank[byte] = 255 - i
+
+    # Forcefully set the highest rank possible for bytes that start multi-byte
+    # UTF-8 sequences. The idea here is that a continuation byte will be more
+    # discerning in a homogenous haystack.
+    for byte in range(0xC0, 0xFF + 1):
+        rank[byte] = 255
+
+    # Now write Rust.
+    olines = ['pub const BYTE_FREQUENCIES: [u8; 256] = [']
+    for byte in range(256):
+        olines.append('    %3d, // %r' % (rank[byte], chr(byte)))
+    olines.append('];')
+
+    print(preamble)
+    print('\n'.join(olines))
+
+
+if __name__ == '__main__':
+    main()

diff --git a/src/cow.rs b/src/cow.rs
index 95a0824..0b7d0da 100644
--- a/src/cow.rs
+++ b/src/cow.rs

@@ -5,7 +5,7 @@
 /// The purpose of this type is to permit usage of a "borrowed or owned
 /// byte string" in a way that keeps std/no-std compatibility. That is, in
 /// no-std mode, this type devolves into a simple &[u8] with no owned variant
-/// availble. We can't just use a plain Cow because Cow is not in core.
+/// available. We can't just use a plain Cow because Cow is not in core.
 #[derive(Clone, Debug)]
 pub struct CowBytes<'a>(Imp<'a>);
 

diff --git a/src/lib.rs b/src/lib.rs
index c3154b6..e0b4ce3 100644
--- a/src/lib.rs
+++ b/src/lib.rs

@@ -160,7 +160,7 @@
 #![cfg_attr(miri, allow(dead_code, unused_macros))]
 
 // Supporting 8-bit (or others) would be fine. If you need it, please submit a
-// bug report at https://github.com/BurntSushi/rust-memchr
+// bug report at https://github.com/BurntSushi/memchr
 #[cfg(not(any(
     target_pointer_width = "16",
     target_pointer_width = "32",

diff --git a/src/memmem/genericsimd.rs b/src/memmem/genericsimd.rs
index 2417f91..28bfdab 100644
--- a/src/memmem/genericsimd.rs
+++ b/src/memmem/genericsimd.rs

@@ -195,7 +195,9 @@
 }
 
 /// Search for an occurrence of two rare bytes from the needle in the chunk
-/// pointed to by ptr, with the end of the haystack pointed to by end_ptr.
+/// pointed to by ptr, with the end of the haystack pointed to by end_ptr. When
+/// an occurrence is found, memcmp is run to check if a match occurs at the
+/// corresponding position.
 ///
 /// rare1chunk and rare2chunk correspond to vectors with the rare1 and rare2
 /// bytes repeated in each 8-bit lane, respectively.
@@ -210,7 +212,7 @@
 ///
 /// It must be safe to do an unaligned read of size(V) bytes starting at both
 /// (ptr + rare1i) and (ptr + rare2i). It must also be safe to do unaligned
-/// loads on ptr up to end_ptr.
+/// loads on ptr up to (end_ptr - needle.len()).
 #[inline(always)]
 unsafe fn fwd_find_in_chunk<V: Vector>(
     fwd: &Forward,

diff --git a/src/memmem/prefilter/fallback.rs b/src/memmem/prefilter/fallback.rs
index 8ae1f4e..ae1bbcc 100644
--- a/src/memmem/prefilter/fallback.rs
+++ b/src/memmem/prefilter/fallback.rs

@@ -20,7 +20,7 @@
 
 This fallback implementation was originally formulated in regex many moons ago:
 https://github.com/rust-lang/regex/blob/3db8722d0b204a85380fe2a65e13d7065d7dd968/src/literal/imp.rs#L370-L501
-Prior to that, I'm not aware of anyone using this technique in any prominant
+Prior to that, I'm not aware of anyone using this technique in any prominent
 substring search implementation. Although, I'm sure folks have had this same
 insight long before me.
 

diff --git a/src/memmem/prefilter/mod.rs b/src/memmem/prefilter/mod.rs
index 2481cfe..6461f33 100644
--- a/src/memmem/prefilter/mod.rs
+++ b/src/memmem/prefilter/mod.rs

@@ -18,7 +18,7 @@
 /// instead of needing to pass around all three as distinct function
 /// parameters.
 pub(crate) struct Pre<'a> {
-    /// State that tracks the effectivess of a prefilter.
+    /// State that tracks the effectiveness of a prefilter.
     pub(crate) state: &'a mut PrefilterState,
     /// The actual prefilter function.
     pub(crate) prefn: PrefilterFn,
@@ -146,7 +146,7 @@
 /// The use of prefilters in this implementation does use a heuristic to detect
 /// when a prefilter might not be carrying its weight, and will dynamically
 /// disable its use. Nevertheless, this configuration option gives callers
-/// the ability to disable pefilters if you have knowledge that they won't be
+/// the ability to disable prefilters if you have knowledge that they won't be
 /// useful.
 #[derive(Clone, Copy, Debug)]
 #[non_exhaustive]

diff --git a/src/tests/memchr/testdata.rs b/src/tests/memchr/testdata.rs
index acb1fd8..6dda524 100644
--- a/src/tests/memchr/testdata.rs
+++ b/src/tests/memchr/testdata.rs

@@ -160,7 +160,7 @@
         //
         // You might think this would cause most needles to not be found, but
         // we actually expand our tests to include corpus sizes all the way up
-        // to >500 bytes, so we should exericse most branches.
+        // to >500 bytes, so we should exercise most branches.
         for align in 0..130 {
             let corpus = self.corpus(align);
             assert_eq!(
commit	66c4d02086d468953ab6802cb7dc27d3cba83269	[log] [tgz]
author	Joel Galenson <jgalenson@google.com>	Tue Oct 05 02:14:54 2021 +0000
committer	Automerger Merge Worker <android-build-automerger-merge-worker@system.gserviceaccount.com>	Tue Oct 05 02:14:54 2021 +0000
tree	7be84b30235be3873090ff421bd1268f8a562088
parent	cc65e6b515e48f872d16e0736992b9eec049895d [diff]
parent	83f6d7e5a5c0153fa2c01727461d62ea178b318e [diff]