Snap for 6435660 from 3c640bc02706768a64797b2167b6dfa9c54cd796 to sdk-release

Change-Id: I26271686112acf3df7354f9ea81ef5bbbab51206
diff --git a/.cargo_vcs_info.json b/.cargo_vcs_info.json
new file mode 100644
index 0000000..88d9580
--- /dev/null
+++ b/.cargo_vcs_info.json
@@ -0,0 +1,5 @@
+{
+  "git": {
+    "sha1": "378bb1caecac7079b867fa6e6059c0d8ea244e03"
+  }
+}
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..ab067f2
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,10 @@
+.*.swp
+doc
+tags
+examples/ss10pusa.csv
+build
+target
+/Cargo.lock
+scratch*
+bench_large/huge
+tmp/
diff --git a/Android.bp b/Android.bp
new file mode 100644
index 0000000..599094d
--- /dev/null
+++ b/Android.bp
@@ -0,0 +1,18 @@
+// This file is generated by cargo2android.py.
+
+rust_library_host_rlib {
+    name: "libmemchr",
+    crate_name: "memchr",
+    srcs: ["src/lib.rs"],
+    edition: "2015",
+    features: [
+        "default",
+        "std",
+    ],
+    flags: [
+        "--cfg memchr_runtime_avx",
+        "--cfg memchr_runtime_simd",
+        "--cfg memchr_runtime_sse2",
+        "--cfg memchr_runtime_sse42",
+    ],
+}
diff --git a/COPYING b/COPYING
new file mode 100644
index 0000000..bb9c20a
--- /dev/null
+++ b/COPYING
@@ -0,0 +1,3 @@
+This project is dual-licensed under the Unlicense and MIT licenses.
+
+You may use this code under the terms of either license.
diff --git a/Cargo.toml b/Cargo.toml
new file mode 100644
index 0000000..6cdb3a1
--- /dev/null
+++ b/Cargo.toml
@@ -0,0 +1,42 @@
+# THIS FILE IS AUTOMATICALLY GENERATED BY CARGO
+#
+# When uploading crates to the registry Cargo will automatically
+# "normalize" Cargo.toml files for maximal compatibility
+# with all versions of Cargo and also rewrite `path` dependencies
+# to registry (e.g., crates.io) dependencies
+#
+# If you believe there's an error in this file please file an
+# issue against the rust-lang/cargo repository. If you're
+# editing this file be aware that the upstream Cargo.toml
+# will likely look very different (and much more reasonable)
+
+[package]
+name = "memchr"
+version = "2.3.3"
+authors = ["Andrew Gallant <jamslam@gmail.com>", "bluss"]
+exclude = ["/ci/*", "/.travis.yml", "/Makefile", "/appveyor.yml"]
+description = "Safe interface to memchr."
+homepage = "https://github.com/BurntSushi/rust-memchr"
+documentation = "https://docs.rs/memchr/"
+readme = "README.md"
+keywords = ["memchr", "char", "scan", "strchr", "string"]
+license = "Unlicense/MIT"
+repository = "https://github.com/BurntSushi/rust-memchr"
+[profile.test]
+opt-level = 3
+
+[lib]
+name = "memchr"
+bench = false
+[dependencies.libc]
+version = "0.2.18"
+optional = true
+default-features = false
+[dev-dependencies.quickcheck]
+version = "0.9"
+default-features = false
+
+[features]
+default = ["std"]
+std = []
+use_std = ["std"]
diff --git a/Cargo.toml.orig b/Cargo.toml.orig
new file mode 100644
index 0000000..0cae0f4
--- /dev/null
+++ b/Cargo.toml.orig
@@ -0,0 +1,37 @@
+[package]
+name = "memchr"
+version = "2.3.3"  #:version
+authors = ["Andrew Gallant <jamslam@gmail.com>", "bluss"]
+description = "Safe interface to memchr."
+documentation = "https://docs.rs/memchr/"
+homepage = "https://github.com/BurntSushi/rust-memchr"
+repository = "https://github.com/BurntSushi/rust-memchr"
+readme = "README.md"
+keywords = ["memchr", "char", "scan", "strchr", "string"]
+license = "Unlicense/MIT"
+exclude = ["/ci/*", "/.travis.yml", "/Makefile", "/appveyor.yml"]
+
+[lib]
+name = "memchr"
+bench = false
+
+[features]
+default = ["std"]
+
+# The 'std' feature permits the memchr crate to use the standard library. This
+# permits this crate to use runtime CPU feature detection to automatically
+# accelerate searching via vector instructions. Without the standard library,
+# this automatic detection is not possible.
+std = []
+# The 'use_std' feature is DEPRECATED. It will be removed in memchr 3. Until
+# then, it is alias for the 'std' feature.
+use_std = ["std"]
+
+[dependencies]
+libc = { version = "0.2.18", default-features = false, optional = true }
+
+[dev-dependencies]
+quickcheck = { version = "0.9", default-features = false }
+
+[profile.test]
+opt-level = 3
diff --git a/LICENSE b/LICENSE
new file mode 120000
index 0000000..7f9a88e
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1 @@
+LICENSE-MIT
\ No newline at end of file
diff --git a/LICENSE-MIT b/LICENSE-MIT
new file mode 100644
index 0000000..3b0a5dc
--- /dev/null
+++ b/LICENSE-MIT
@@ -0,0 +1,21 @@
+The MIT License (MIT)
+
+Copyright (c) 2015 Andrew Gallant
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
diff --git a/METADATA b/METADATA
new file mode 100644
index 0000000..3a73002
--- /dev/null
+++ b/METADATA
@@ -0,0 +1,17 @@
+name: "memchr"
+description:
+    "Safe interface to memchr."
+
+third_party {
+  url {
+    type: HOMEPAGE
+    value: "https://crates.io/crates/memchr"
+  }
+  url {
+    type: GIT
+    value: "https://github.com/BurntSushi/rust-memchr"
+  }
+  version: "2.3.3"
+  last_upgrade_date { year: 2020 month: 3 day: 17 }
+  license_type: NOTICE
+}
diff --git a/MODULE_LICENSE_MIT b/MODULE_LICENSE_MIT
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/MODULE_LICENSE_MIT
diff --git a/NOTICE b/NOTICE
new file mode 120000
index 0000000..7a694c9
--- /dev/null
+++ b/NOTICE
@@ -0,0 +1 @@
+LICENSE
\ No newline at end of file
diff --git a/OWNERS b/OWNERS
new file mode 100644
index 0000000..46fc303
--- /dev/null
+++ b/OWNERS
@@ -0,0 +1 @@
+include platform/prebuilts/rust:/OWNERS
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..f78a5a5
--- /dev/null
+++ b/README.md
@@ -0,0 +1,79 @@
+memchr
+======
+The `memchr` crate provides heavily optimized routines for searching bytes.
+
+[![Build status](https://github.com/BurntSushi/rust-memchr/workflows/ci/badge.svg)](https://github.com/BurntSushi/rust-memchr/actions)
+[![](http://meritbadge.herokuapp.com/memchr)](https://crates.io/crates/memchr)
+
+Dual-licensed under MIT or the [UNLICENSE](http://unlicense.org).
+
+
+### Documentation
+
+[https://docs.rs/memchr](https://docs.rs/memchr)
+
+
+### Overview
+
+The `memchr` function is traditionally provided by libc, but its
+performance can vary significantly depending on the specific
+implementation of libc that is used. Implementations range from manually
+tuned Assembly (like that found in GNU's libc) all the way to
+non-vectorized C (like that found in MUSL).
+
+To smooth out the differences between implementations of libc, at least
+on `x86_64` for Rust 1.27+, this crate provides its own implementation of
+`memchr` that should perform competitively with the one found in GNU's libc.
+The implementation is in pure Rust and has no dependency on a C compiler or an
+Assembler.
+
+Additionally, GNU libc provides an extension, `memrchr`. This crate
+provides its own implementation of `memrchr` as well, along with `memchr2`,
+`memchr3`, `memrchr2` and `memrchr3`. The difference between `memchr` and
+`memchr2` is that `memchr2` permits searching for either of two bytes
+instead of one. Similarly for `memchr3`.
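+
+For example (a small sketch using the crate's public API):
+
+```rust
+use memchr::{memchr, memchr2, memrchr};
+
+let haystack = b"the quick brown fox";
+assert_eq!(memchr(b'q', haystack), Some(4));
+assert_eq!(memchr2(b'k', b'q', haystack), Some(4));
+assert_eq!(memrchr(b'o', haystack), Some(17));
+```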
+
+### Compiling without the standard library
+
+memchr links to the standard library by default, but you can disable the
+`std` feature if you want to use it in a `#![no_std]` crate:
+
+```toml
+[dependencies]
+memchr = { version = "2", default-features = false }
+```
+
+On x86 platforms, when the `std` feature is disabled, the SSE2
+implementation of memchr will be used if the compiler supports it. When
+`std` is enabled, the AVX implementation of memchr will be used if the CPU
+is determined to support it at runtime.
+
+### Using libc
+
+`memchr` is a routine that is part of libc, although this crate does not use
+libc by default. Instead, it uses its own routines, which are either vectorized
+or generic fallback routines. In general, these should be competitive with
+what's in libc, although this has not been tested for all architectures. If
+using `memchr` from libc is desirable and a vectorized routine is not otherwise
+available in this crate, then enabling the `libc` feature will use libc's
+version of `memchr`.
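+
+For example, one way to opt into libc is to enable this crate's `libc`
+feature (a minimal sketch of the dependency declaration):
+
+```toml
+[dependencies]
+memchr = { version = "2", features = ["libc"] }
+```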
+
+The rest of the functions in this crate, e.g., `memchr2` or `memrchr3`, are not
+a standard part of libc, so they will always use the implementations in this
+crate. One exception to this is `memrchr`, which is an extension commonly found
+on Linux. On Linux, libc's `memrchr` is used under precisely the same
+conditions as libc's `memchr`, as described above.
+
+
+### Minimum Rust version policy
+
+This crate's minimum supported `rustc` version is `1.28.0`.
+
+The current policy is that the minimum Rust version required to use this crate
+can be increased in minor version updates. For example, if `crate 1.0` requires
+Rust 1.20.0, then `crate 1.0.z` for all values of `z` will also require Rust
+1.20.0 or newer. However, `crate 1.y` for `y > 0` may require a newer minimum
+version of Rust.
+
+In general, this crate will be conservative with respect to the minimum
+supported version of Rust.
diff --git a/UNLICENSE b/UNLICENSE
new file mode 100644
index 0000000..68a49da
--- /dev/null
+++ b/UNLICENSE
@@ -0,0 +1,24 @@
+This is free and unencumbered software released into the public domain.
+
+Anyone is free to copy, modify, publish, use, compile, sell, or
+distribute this software, either in source code form or as a compiled
+binary, for any purpose, commercial or non-commercial, and by any
+means.
+
+In jurisdictions that recognize copyright laws, the author or authors
+of this software dedicate any and all copyright interest in the
+software to the public domain. We make this dedication for the benefit
+of the public at large and to the detriment of our heirs and
+successors. We intend this dedication to be an overt act of
+relinquishment in perpetuity of all present and future rights to this
+software under copyright law.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+OTHER DEALINGS IN THE SOFTWARE.
+
+For more information, please refer to <http://unlicense.org/>
diff --git a/build.rs b/build.rs
new file mode 100644
index 0000000..4ae3184
--- /dev/null
+++ b/build.rs
@@ -0,0 +1,61 @@
+use std::env;
+
+fn main() {
+    enable_simd_optimizations();
+    enable_libc();
+}
+
+// This adds various simd cfgs if this compiler supports it.
+//
+// This can be disabled with RUSTFLAGS="--cfg memchr_disable_auto_simd", but
+// this is generally only intended for testing.
+fn enable_simd_optimizations() {
+    if is_env_set("CARGO_CFG_MEMCHR_DISABLE_AUTO_SIMD") {
+        return;
+    }
+    println!("cargo:rustc-cfg=memchr_runtime_simd");
+    println!("cargo:rustc-cfg=memchr_runtime_sse2");
+    println!("cargo:rustc-cfg=memchr_runtime_sse42");
+    println!("cargo:rustc-cfg=memchr_runtime_avx");
+}
+
+// This adds a `memchr_libc` cfg if and only if libc can be used and no
+// better option is available.
+//
+// This could be performed in the source code, but it's simpler to do it once
+// here and consolidate it into one cfg knob.
+//
+// Basically, we use libc only if it's enabled and we aren't targeting a
+// known bad platform. For example, wasm32 doesn't have a libc and the
+// performance of memchr on Windows is seemingly worse than the fallback
+// implementation.
+fn enable_libc() {
+    const NO_ARCH: &'static [&'static str] = &["wasm32", "windows"];
+    const NO_ENV: &'static [&'static str] = &["sgx"];
+
+    if !is_feature_set("LIBC") {
+        return;
+    }
+
+    let arch = match env::var("CARGO_CFG_TARGET_ARCH") {
+        Err(_) => return,
+        Ok(arch) => arch,
+    };
+    let env = match env::var("CARGO_CFG_TARGET_ENV") {
+        Err(_) => return,
+        Ok(env) => env,
+    };
+    if NO_ARCH.contains(&&*arch) || NO_ENV.contains(&&*env) {
+        return;
+    }
+
+    println!("cargo:rustc-cfg=memchr_libc");
+}
+
+fn is_feature_set(name: &str) -> bool {
+    is_env_set(&format!("CARGO_FEATURE_{}", name))
+}
+
+fn is_env_set(name: &str) -> bool {
+    env::var_os(name).is_some()
+}
diff --git a/rustfmt.toml b/rustfmt.toml
new file mode 100644
index 0000000..aa37a21
--- /dev/null
+++ b/rustfmt.toml
@@ -0,0 +1,2 @@
+max_width = 79
+use_small_heuristics = "max"
diff --git a/src/c.rs b/src/c.rs
new file mode 100644
index 0000000..63feca9
--- /dev/null
+++ b/src/c.rs
@@ -0,0 +1,44 @@
+// This module defines safe wrappers around memchr (POSIX) and memrchr (GNU
+// extension).
+
+#![allow(dead_code)]
+
+extern crate libc;
+
+use self::libc::{c_int, c_void, size_t};
+
+pub fn memchr(needle: u8, haystack: &[u8]) -> Option<usize> {
+    let p = unsafe {
+        libc::memchr(
+            haystack.as_ptr() as *const c_void,
+            needle as c_int,
+            haystack.len() as size_t,
+        )
+    };
+    if p.is_null() {
+        None
+    } else {
+        Some(p as usize - (haystack.as_ptr() as usize))
+    }
+}
+
+// memrchr is a GNU extension. We know it's available on Linux, so start there.
+#[cfg(target_os = "linux")]
+pub fn memrchr(needle: u8, haystack: &[u8]) -> Option<usize> {
+    // GNU's memrchr() will - unlike memchr() - error if haystack is empty.
+    if haystack.is_empty() {
+        return None;
+    }
+    let p = unsafe {
+        libc::memrchr(
+            haystack.as_ptr() as *const c_void,
+            needle as c_int,
+            haystack.len() as size_t,
+        )
+    };
+    if p.is_null() {
+        None
+    } else {
+        Some(p as usize - (haystack.as_ptr() as usize))
+    }
+}
diff --git a/src/fallback.rs b/src/fallback.rs
new file mode 100644
index 0000000..8bc32b2
--- /dev/null
+++ b/src/fallback.rs
@@ -0,0 +1,330 @@
+// This module defines pure Rust platform independent implementations of all
+// the memchr routines. We do our best to make them fast. Some of them may even
+// get auto-vectorized.
+
+use core::cmp;
+use core::usize;
+
+#[cfg(target_pointer_width = "16")]
+const USIZE_BYTES: usize = 2;
+
+#[cfg(target_pointer_width = "32")]
+const USIZE_BYTES: usize = 4;
+
+#[cfg(target_pointer_width = "64")]
+const USIZE_BYTES: usize = 8;
+
+// The number of bytes processed in one unrolled iteration of memchr/memrchr.
+const LOOP_SIZE: usize = 2 * USIZE_BYTES;
+
+/// Return `true` if `x` contains any zero byte.
+///
+/// From *Matters Computational*, J. Arndt
+///
+/// "The idea is to subtract one from each of the bytes and then look for
+/// bytes where the borrow propagated all the way to the most significant
+/// bit."
+#[inline(always)]
+fn contains_zero_byte(x: usize) -> bool {
+    const LO_U64: u64 = 0x0101010101010101;
+    const HI_U64: u64 = 0x8080808080808080;
+
+    const LO_USIZE: usize = LO_U64 as usize;
+    const HI_USIZE: usize = HI_U64 as usize;
+
+    x.wrapping_sub(LO_USIZE) & !x & HI_USIZE != 0
+}
+
+/// Repeat the given byte into a word-sized number. That is, every 8 bits
+/// is equivalent to the given byte. For example, if `b` is `\x4E` or
+/// `01001110` in binary, then the returned value on a 32-bit system would be:
+/// `01001110_01001110_01001110_01001110`.
+#[inline(always)]
+fn repeat_byte(b: u8) -> usize {
+    (b as usize) * (usize::MAX / 255)
+}
+
+pub fn memchr(n1: u8, haystack: &[u8]) -> Option<usize> {
+    let vn1 = repeat_byte(n1);
+    let confirm = |byte| byte == n1;
+    let loop_size = cmp::min(LOOP_SIZE, haystack.len());
+    let align = USIZE_BYTES - 1;
+    let start_ptr = haystack.as_ptr();
+    let end_ptr = haystack[haystack.len()..].as_ptr();
+    let mut ptr = start_ptr;
+
+    unsafe {
+        if haystack.len() < USIZE_BYTES {
+            return forward_search(start_ptr, end_ptr, ptr, confirm);
+        }
+
+        let chunk = (ptr as *const usize).read_unaligned();
+        if contains_zero_byte(chunk ^ vn1) {
+            return forward_search(start_ptr, end_ptr, ptr, confirm);
+        }
+
+        ptr = ptr.add(USIZE_BYTES - (start_ptr as usize & align));
+        debug_assert!(ptr > start_ptr);
+        debug_assert!(end_ptr.sub(USIZE_BYTES) >= start_ptr);
+        while loop_size == LOOP_SIZE && ptr <= end_ptr.sub(loop_size) {
+            debug_assert_eq!(0, (ptr as usize) % USIZE_BYTES);
+
+            let a = *(ptr as *const usize);
+            let b = *(ptr.add(USIZE_BYTES) as *const usize);
+            let eqa = contains_zero_byte(a ^ vn1);
+            let eqb = contains_zero_byte(b ^ vn1);
+            if eqa || eqb {
+                break;
+            }
+            ptr = ptr.add(LOOP_SIZE);
+        }
+        forward_search(start_ptr, end_ptr, ptr, confirm)
+    }
+}
+
+/// Like `memchr`, but searches for two bytes instead of one.
+pub fn memchr2(n1: u8, n2: u8, haystack: &[u8]) -> Option<usize> {
+    let vn1 = repeat_byte(n1);
+    let vn2 = repeat_byte(n2);
+    let confirm = |byte| byte == n1 || byte == n2;
+    let align = USIZE_BYTES - 1;
+    let start_ptr = haystack.as_ptr();
+    let end_ptr = haystack[haystack.len()..].as_ptr();
+    let mut ptr = start_ptr;
+
+    unsafe {
+        if haystack.len() < USIZE_BYTES {
+            return forward_search(start_ptr, end_ptr, ptr, confirm);
+        }
+
+        let chunk = (ptr as *const usize).read_unaligned();
+        let eq1 = contains_zero_byte(chunk ^ vn1);
+        let eq2 = contains_zero_byte(chunk ^ vn2);
+        if eq1 || eq2 {
+            return forward_search(start_ptr, end_ptr, ptr, confirm);
+        }
+
+        ptr = ptr.add(USIZE_BYTES - (start_ptr as usize & align));
+        debug_assert!(ptr > start_ptr);
+        debug_assert!(end_ptr.sub(USIZE_BYTES) >= start_ptr);
+        while ptr <= end_ptr.sub(USIZE_BYTES) {
+            debug_assert_eq!(0, (ptr as usize) % USIZE_BYTES);
+
+            let chunk = *(ptr as *const usize);
+            let eq1 = contains_zero_byte(chunk ^ vn1);
+            let eq2 = contains_zero_byte(chunk ^ vn2);
+            if eq1 || eq2 {
+                break;
+            }
+            ptr = ptr.add(USIZE_BYTES);
+        }
+        forward_search(start_ptr, end_ptr, ptr, confirm)
+    }
+}
+
+/// Like `memchr`, but searches for three bytes instead of one.
+pub fn memchr3(n1: u8, n2: u8, n3: u8, haystack: &[u8]) -> Option<usize> {
+    let vn1 = repeat_byte(n1);
+    let vn2 = repeat_byte(n2);
+    let vn3 = repeat_byte(n3);
+    let confirm = |byte| byte == n1 || byte == n2 || byte == n3;
+    let align = USIZE_BYTES - 1;
+    let start_ptr = haystack.as_ptr();
+    let end_ptr = haystack[haystack.len()..].as_ptr();
+    let mut ptr = start_ptr;
+
+    unsafe {
+        if haystack.len() < USIZE_BYTES {
+            return forward_search(start_ptr, end_ptr, ptr, confirm);
+        }
+
+        let chunk = (ptr as *const usize).read_unaligned();
+        let eq1 = contains_zero_byte(chunk ^ vn1);
+        let eq2 = contains_zero_byte(chunk ^ vn2);
+        let eq3 = contains_zero_byte(chunk ^ vn3);
+        if eq1 || eq2 || eq3 {
+            return forward_search(start_ptr, end_ptr, ptr, confirm);
+        }
+
+        ptr = ptr.add(USIZE_BYTES - (start_ptr as usize & align));
+        debug_assert!(ptr > start_ptr);
+        debug_assert!(end_ptr.sub(USIZE_BYTES) >= start_ptr);
+        while ptr <= end_ptr.sub(USIZE_BYTES) {
+            debug_assert_eq!(0, (ptr as usize) % USIZE_BYTES);
+
+            let chunk = *(ptr as *const usize);
+            let eq1 = contains_zero_byte(chunk ^ vn1);
+            let eq2 = contains_zero_byte(chunk ^ vn2);
+            let eq3 = contains_zero_byte(chunk ^ vn3);
+            if eq1 || eq2 || eq3 {
+                break;
+            }
+            ptr = ptr.add(USIZE_BYTES);
+        }
+        forward_search(start_ptr, end_ptr, ptr, confirm)
+    }
+}
+
+/// Return the last index matching the byte `x` in `text`.
+pub fn memrchr(n1: u8, haystack: &[u8]) -> Option<usize> {
+    let vn1 = repeat_byte(n1);
+    let confirm = |byte| byte == n1;
+    let loop_size = cmp::min(LOOP_SIZE, haystack.len());
+    let align = USIZE_BYTES - 1;
+    let start_ptr = haystack.as_ptr();
+    let end_ptr = haystack[haystack.len()..].as_ptr();
+    let mut ptr = end_ptr;
+
+    unsafe {
+        if haystack.len() < USIZE_BYTES {
+            return reverse_search(start_ptr, end_ptr, ptr, confirm);
+        }
+
+        let chunk = (ptr.sub(USIZE_BYTES) as *const usize).read_unaligned();
+        if contains_zero_byte(chunk ^ vn1) {
+            return reverse_search(start_ptr, end_ptr, ptr, confirm);
+        }
+
+        ptr = (end_ptr as usize & !align) as *const u8;
+        debug_assert!(start_ptr <= ptr && ptr <= end_ptr);
+        while loop_size == LOOP_SIZE && ptr >= start_ptr.add(loop_size) {
+            debug_assert_eq!(0, (ptr as usize) % USIZE_BYTES);
+
+            let a = *(ptr.sub(2 * USIZE_BYTES) as *const usize);
+            let b = *(ptr.sub(1 * USIZE_BYTES) as *const usize);
+            let eqa = contains_zero_byte(a ^ vn1);
+            let eqb = contains_zero_byte(b ^ vn1);
+            if eqa || eqb {
+                break;
+            }
+            ptr = ptr.sub(loop_size);
+        }
+        reverse_search(start_ptr, end_ptr, ptr, confirm)
+    }
+}
+
+/// Like `memrchr`, but searches for two bytes instead of one.
+pub fn memrchr2(n1: u8, n2: u8, haystack: &[u8]) -> Option<usize> {
+    let vn1 = repeat_byte(n1);
+    let vn2 = repeat_byte(n2);
+    let confirm = |byte| byte == n1 || byte == n2;
+    let align = USIZE_BYTES - 1;
+    let start_ptr = haystack.as_ptr();
+    let end_ptr = haystack[haystack.len()..].as_ptr();
+    let mut ptr = end_ptr;
+
+    unsafe {
+        if haystack.len() < USIZE_BYTES {
+            return reverse_search(start_ptr, end_ptr, ptr, confirm);
+        }
+
+        let chunk = (ptr.sub(USIZE_BYTES) as *const usize).read_unaligned();
+        let eq1 = contains_zero_byte(chunk ^ vn1);
+        let eq2 = contains_zero_byte(chunk ^ vn2);
+        if eq1 || eq2 {
+            return reverse_search(start_ptr, end_ptr, ptr, confirm);
+        }
+
+        ptr = (end_ptr as usize & !align) as *const u8;
+        debug_assert!(start_ptr <= ptr && ptr <= end_ptr);
+        while ptr >= start_ptr.add(USIZE_BYTES) {
+            debug_assert_eq!(0, (ptr as usize) % USIZE_BYTES);
+
+            let chunk = *(ptr.sub(USIZE_BYTES) as *const usize);
+            let eq1 = contains_zero_byte(chunk ^ vn1);
+            let eq2 = contains_zero_byte(chunk ^ vn2);
+            if eq1 || eq2 {
+                break;
+            }
+            ptr = ptr.sub(USIZE_BYTES);
+        }
+        reverse_search(start_ptr, end_ptr, ptr, confirm)
+    }
+}
+
+/// Like `memrchr`, but searches for three bytes instead of one.
+pub fn memrchr3(n1: u8, n2: u8, n3: u8, haystack: &[u8]) -> Option<usize> {
+    let vn1 = repeat_byte(n1);
+    let vn2 = repeat_byte(n2);
+    let vn3 = repeat_byte(n3);
+    let confirm = |byte| byte == n1 || byte == n2 || byte == n3;
+    let align = USIZE_BYTES - 1;
+    let start_ptr = haystack.as_ptr();
+    let end_ptr = haystack[haystack.len()..].as_ptr();
+    let mut ptr = end_ptr;
+
+    unsafe {
+        if haystack.len() < USIZE_BYTES {
+            return reverse_search(start_ptr, end_ptr, ptr, confirm);
+        }
+
+        let chunk = (ptr.sub(USIZE_BYTES) as *const usize).read_unaligned();
+        let eq1 = contains_zero_byte(chunk ^ vn1);
+        let eq2 = contains_zero_byte(chunk ^ vn2);
+        let eq3 = contains_zero_byte(chunk ^ vn3);
+        if eq1 || eq2 || eq3 {
+            return reverse_search(start_ptr, end_ptr, ptr, confirm);
+        }
+
+        ptr = (end_ptr as usize & !align) as *const u8;
+        debug_assert!(start_ptr <= ptr && ptr <= end_ptr);
+        while ptr >= start_ptr.add(USIZE_BYTES) {
+            debug_assert_eq!(0, (ptr as usize) % USIZE_BYTES);
+
+            let chunk = *(ptr.sub(USIZE_BYTES) as *const usize);
+            let eq1 = contains_zero_byte(chunk ^ vn1);
+            let eq2 = contains_zero_byte(chunk ^ vn2);
+            let eq3 = contains_zero_byte(chunk ^ vn3);
+            if eq1 || eq2 || eq3 {
+                break;
+            }
+            ptr = ptr.sub(USIZE_BYTES);
+        }
+        reverse_search(start_ptr, end_ptr, ptr, confirm)
+    }
+}
+
+#[inline(always)]
+unsafe fn forward_search<F: Fn(u8) -> bool>(
+    start_ptr: *const u8,
+    end_ptr: *const u8,
+    mut ptr: *const u8,
+    confirm: F,
+) -> Option<usize> {
+    debug_assert!(start_ptr <= ptr);
+    debug_assert!(ptr <= end_ptr);
+
+    while ptr < end_ptr {
+        if confirm(*ptr) {
+            return Some(sub(ptr, start_ptr));
+        }
+        ptr = ptr.offset(1);
+    }
+    None
+}
+
+#[inline(always)]
+unsafe fn reverse_search<F: Fn(u8) -> bool>(
+    start_ptr: *const u8,
+    end_ptr: *const u8,
+    mut ptr: *const u8,
+    confirm: F,
+) -> Option<usize> {
+    debug_assert!(start_ptr <= ptr);
+    debug_assert!(ptr <= end_ptr);
+
+    while ptr > start_ptr {
+        ptr = ptr.offset(-1);
+        if confirm(*ptr) {
+            return Some(sub(ptr, start_ptr));
+        }
+    }
+    None
+}
+
+/// Subtract `b` from `a` and return the difference. `a` should be greater than
+/// or equal to `b`.
+fn sub(a: *const u8, b: *const u8) -> usize {
+    debug_assert!(a >= b);
+    (a as usize) - (b as usize)
+}
diff --git a/src/iter.rs b/src/iter.rs
new file mode 100644
index 0000000..6217ae4
--- /dev/null
+++ b/src/iter.rs
@@ -0,0 +1,173 @@
+use {memchr, memchr2, memchr3, memrchr, memrchr2, memrchr3};
+
+macro_rules! iter_next {
+    // Common code for the memchr iterators:
+    // update haystack and position and produce the index
+    //
+    // self: &mut Self where Self is the iterator
+    // search_result: Option<usize> which is the result of the corresponding
+    // memchr function.
+    //
+    // Returns Option<usize> (the next iterator element)
+    ($self_:expr, $search_result:expr) => {
+        $search_result.map(move |index| {
+            // split and take the remaining back half
+            $self_.haystack = $self_.haystack.split_at(index + 1).1;
+            let found_position = $self_.position + index;
+            $self_.position = found_position + 1;
+            found_position
+        })
+    };
+}
+
+macro_rules! iter_next_back {
+    ($self_:expr, $search_result:expr) => {
+        $search_result.map(move |index| {
+            // split and take the remaining front half
+            $self_.haystack = $self_.haystack.split_at(index).0;
+            $self_.position + index
+        })
+    };
+}
+
+/// An iterator for `memchr`.
+pub struct Memchr<'a> {
+    needle: u8,
+    // The haystack to iterate over
+    haystack: &'a [u8],
+    // The index
+    position: usize,
+}
+
+impl<'a> Memchr<'a> {
+    /// Creates a new iterator that yields all positions of needle in haystack.
+    #[inline]
+    pub fn new(needle: u8, haystack: &[u8]) -> Memchr {
+        Memchr { needle: needle, haystack: haystack, position: 0 }
+    }
+}
+
+impl<'a> Iterator for Memchr<'a> {
+    type Item = usize;
+
+    #[inline]
+    fn next(&mut self) -> Option<usize> {
+        iter_next!(self, memchr(self.needle, self.haystack))
+    }
+
+    #[inline]
+    fn size_hint(&self) -> (usize, Option<usize>) {
+        (0, Some(self.haystack.len()))
+    }
+}
+
+impl<'a> DoubleEndedIterator for Memchr<'a> {
+    #[inline]
+    fn next_back(&mut self) -> Option<Self::Item> {
+        iter_next_back!(self, memrchr(self.needle, self.haystack))
+    }
+}
+
+/// An iterator for `memchr2`.
+pub struct Memchr2<'a> {
+    needle1: u8,
+    needle2: u8,
+    // The haystack to iterate over
+    haystack: &'a [u8],
+    // The index
+    position: usize,
+}
+
+impl<'a> Memchr2<'a> {
+    /// Creates a new iterator that yields all positions of either needle
+    /// in the haystack.
+    #[inline]
+    pub fn new(needle1: u8, needle2: u8, haystack: &[u8]) -> Memchr2 {
+        Memchr2 {
+            needle1: needle1,
+            needle2: needle2,
+            haystack: haystack,
+            position: 0,
+        }
+    }
+}
+
+impl<'a> Iterator for Memchr2<'a> {
+    type Item = usize;
+
+    #[inline]
+    fn next(&mut self) -> Option<usize> {
+        iter_next!(self, memchr2(self.needle1, self.needle2, self.haystack))
+    }
+
+    #[inline]
+    fn size_hint(&self) -> (usize, Option<usize>) {
+        (0, Some(self.haystack.len()))
+    }
+}
+
+impl<'a> DoubleEndedIterator for Memchr2<'a> {
+    #[inline]
+    fn next_back(&mut self) -> Option<Self::Item> {
+        iter_next_back!(
+            self,
+            memrchr2(self.needle1, self.needle2, self.haystack)
+        )
+    }
+}
+
+/// An iterator for `memchr3`.
+pub struct Memchr3<'a> {
+    needle1: u8,
+    needle2: u8,
+    needle3: u8,
+    // The haystack to iterate over
+    haystack: &'a [u8],
+    // The index
+    position: usize,
+}
+
+impl<'a> Memchr3<'a> {
+    /// Creates a new iterator that yields all positions of any of the
+    /// three needles in the haystack.
+    #[inline]
+    pub fn new(
+        needle1: u8,
+        needle2: u8,
+        needle3: u8,
+        haystack: &[u8],
+    ) -> Memchr3 {
+        Memchr3 {
+            needle1: needle1,
+            needle2: needle2,
+            needle3: needle3,
+            haystack: haystack,
+            position: 0,
+        }
+    }
+}
+
+impl<'a> Iterator for Memchr3<'a> {
+    type Item = usize;
+
+    #[inline]
+    fn next(&mut self) -> Option<usize> {
+        iter_next!(
+            self,
+            memchr3(self.needle1, self.needle2, self.needle3, self.haystack)
+        )
+    }
+
+    #[inline]
+    fn size_hint(&self) -> (usize, Option<usize>) {
+        (0, Some(self.haystack.len()))
+    }
+}
+
+impl<'a> DoubleEndedIterator for Memchr3<'a> {
+    #[inline]
+    fn next_back(&mut self) -> Option<Self::Item> {
+        iter_next_back!(
+            self,
+            memrchr3(self.needle1, self.needle2, self.needle3, self.haystack)
+        )
+    }
+}
diff --git a/src/lib.rs b/src/lib.rs
new file mode 100644
index 0000000..fed7108
--- /dev/null
+++ b/src/lib.rs
@@ -0,0 +1,451 @@
+/*!
+The `memchr` crate provides heavily optimized routines for searching bytes.
+
+The `memchr` function is traditionally provided by libc; however, the
+performance of `memchr` can vary significantly depending on the specific
+implementation of libc that is used. Implementations range from manually
+tuned Assembly (like that found in GNU's libc) all the way to
+non-vectorized C (like that found in MUSL).
+
+To smooth out the differences between implementations of libc, at least
+on `x86_64` for Rust 1.27+, this crate provides its own implementation of
+`memchr` that should perform competitively with the one found in GNU's libc.
+The implementation is in pure Rust and has no dependency on a C compiler or an
+Assembler.
+
+Additionally, GNU libc provides an extension, `memrchr`. This crate
+provides its own implementation of `memrchr` as well, along with `memchr2`,
+`memchr3`, `memrchr2` and `memrchr3`. The difference between `memchr` and
+`memchr2` is that `memchr2` permits searching for either of two bytes
+instead of one. Similarly for `memchr3`.
+*/
+
+#![cfg_attr(not(feature = "std"), no_std)]
+#![deny(missing_docs)]
+#![doc(html_root_url = "https://docs.rs/memchr/2.0.0")]
+
+// Supporting 8-bit (or others) would be fine. If you need it, please submit a
+// bug report at https://github.com/BurntSushi/rust-memchr
+#[cfg(not(any(
+    target_pointer_width = "16",
+    target_pointer_width = "32",
+    target_pointer_width = "64"
+)))]
+compile_error!("memchr currently only supports 16-, 32- and 64-bit targets");
+
+#[cfg(feature = "std")]
+extern crate core;
+
+#[cfg(all(test, all(not(miri), feature = "std")))]
+#[macro_use]
+extern crate quickcheck;
+
+use core::iter::Rev;
+
+pub use iter::{Memchr, Memchr2, Memchr3};
+
+// N.B. If you're looking for the cfg knobs for libc, see build.rs.
+#[cfg(memchr_libc)]
+mod c;
+#[allow(dead_code)]
+mod fallback;
+mod iter;
+mod naive;
+#[cfg(all(test, all(not(miri), feature = "std")))]
+mod tests;
+#[cfg(all(test, any(miri, not(feature = "std"))))]
+#[path = "tests/miri.rs"]
+mod tests;
+#[cfg(all(not(miri), target_arch = "x86_64", memchr_runtime_simd))]
+mod x86;
+
+/// An iterator over all occurrences of the needle in a haystack.
+#[inline]
+pub fn memchr_iter(needle: u8, haystack: &[u8]) -> Memchr {
+    Memchr::new(needle, haystack)
+}
+
+/// An iterator over all occurrences of the needles in a haystack.
+#[inline]
+pub fn memchr2_iter(needle1: u8, needle2: u8, haystack: &[u8]) -> Memchr2 {
+    Memchr2::new(needle1, needle2, haystack)
+}
+
+/// An iterator over all occurrences of the needles in a haystack.
+#[inline]
+pub fn memchr3_iter(
+    needle1: u8,
+    needle2: u8,
+    needle3: u8,
+    haystack: &[u8],
+) -> Memchr3 {
+    Memchr3::new(needle1, needle2, needle3, haystack)
+}
+
+/// An iterator over all occurrences of the needle in a haystack, in reverse.
+#[inline]
+pub fn memrchr_iter(needle: u8, haystack: &[u8]) -> Rev<Memchr> {
+    Memchr::new(needle, haystack).rev()
+}
+
+/// An iterator over all occurrences of the needles in a haystack, in reverse.
+#[inline]
+pub fn memrchr2_iter(
+    needle1: u8,
+    needle2: u8,
+    haystack: &[u8],
+) -> Rev<Memchr2> {
+    Memchr2::new(needle1, needle2, haystack).rev()
+}
+
+/// An iterator over all occurrences of the needles in a haystack, in reverse.
+#[inline]
+pub fn memrchr3_iter(
+    needle1: u8,
+    needle2: u8,
+    needle3: u8,
+    haystack: &[u8],
+) -> Rev<Memchr3> {
+    Memchr3::new(needle1, needle2, needle3, haystack).rev()
+}
+
+/// Search for the first occurrence of a byte in a slice.
+///
+/// This returns the index corresponding to the first occurrence of `needle` in
+/// `haystack`, or `None` if one is not found.
+///
+/// While this is operationally the same as something like
+/// `haystack.iter().position(|&b| b == needle)`, `memchr` will use a highly
+/// optimized routine that can be up to an order of magnitude faster in some
+/// cases.
+///
+/// # Example
+///
+/// This shows how to find the first position of a byte in a byte string.
+///
+/// ```
+/// use memchr::memchr;
+///
+/// let haystack = b"the quick brown fox";
+/// assert_eq!(memchr(b'k', haystack), Some(8));
+/// ```
+#[inline]
+pub fn memchr(needle: u8, haystack: &[u8]) -> Option<usize> {
+    #[cfg(miri)]
+    #[inline(always)]
+    fn imp(n1: u8, haystack: &[u8]) -> Option<usize> {
+        naive::memchr(n1, haystack)
+    }
+
+    #[cfg(all(target_arch = "x86_64", memchr_runtime_simd, not(miri)))]
+    #[inline(always)]
+    fn imp(n1: u8, haystack: &[u8]) -> Option<usize> {
+        x86::memchr(n1, haystack)
+    }
+
+    #[cfg(all(
+        memchr_libc,
+        not(all(target_arch = "x86_64", memchr_runtime_simd)),
+        not(miri),
+    ))]
+    #[inline(always)]
+    fn imp(n1: u8, haystack: &[u8]) -> Option<usize> {
+        c::memchr(n1, haystack)
+    }
+
+    #[cfg(all(
+        not(memchr_libc),
+        not(all(target_arch = "x86_64", memchr_runtime_simd)),
+        not(miri),
+    ))]
+    #[inline(always)]
+    fn imp(n1: u8, haystack: &[u8]) -> Option<usize> {
+        fallback::memchr(n1, haystack)
+    }
+
+    if haystack.is_empty() {
+        None
+    } else {
+        imp(needle, haystack)
+    }
+}
+
+/// Like `memchr`, but searches for either of two bytes instead of just one.
+///
+/// This returns the index corresponding to the first occurrence of `needle1`
+/// or the first occurrence of `needle2` in `haystack` (whichever occurs
+/// earlier), or `None` if neither one is found.
+///
+/// While this is operationally the same as something like
+/// `haystack.iter().position(|&b| b == needle1 || b == needle2)`, `memchr2`
+/// will use a highly optimized routine that can be up to an order of magnitude
+/// faster in some cases.
+///
+/// # Example
+///
+/// This shows how to find the first position of either of two bytes in a byte
+/// string.
+///
+/// ```
+/// use memchr::memchr2;
+///
+/// let haystack = b"the quick brown fox";
+/// assert_eq!(memchr2(b'k', b'q', haystack), Some(4));
+/// ```
+#[inline]
+pub fn memchr2(needle1: u8, needle2: u8, haystack: &[u8]) -> Option<usize> {
+    #[cfg(miri)]
+    #[inline(always)]
+    fn imp(n1: u8, n2: u8, haystack: &[u8]) -> Option<usize> {
+        naive::memchr2(n1, n2, haystack)
+    }
+
+    #[cfg(all(target_arch = "x86_64", memchr_runtime_simd, not(miri)))]
+    #[inline(always)]
+    fn imp(n1: u8, n2: u8, haystack: &[u8]) -> Option<usize> {
+        x86::memchr2(n1, n2, haystack)
+    }
+
+    #[cfg(all(
+        not(all(target_arch = "x86_64", memchr_runtime_simd)),
+        not(miri),
+    ))]
+    #[inline(always)]
+    fn imp(n1: u8, n2: u8, haystack: &[u8]) -> Option<usize> {
+        fallback::memchr2(n1, n2, haystack)
+    }
+
+    if haystack.is_empty() {
+        None
+    } else {
+        imp(needle1, needle2, haystack)
+    }
+}
+
+/// Like `memchr`, but searches for any of three bytes instead of just one.
+///
+/// This returns the index corresponding to the first occurrence of `needle1`,
+/// the first occurrence of `needle2`, or the first occurrence of `needle3` in
+/// `haystack` (whichever occurs earliest), or `None` if none are found.
+///
+/// While this is operationally the same as something like
+/// `haystack.iter().position(|&b| b == needle1 || b == needle2 ||
+/// b == needle3)`, `memchr3` will use a highly optimized routine that can be
+/// up to an order of magnitude faster in some cases.
+///
+/// # Example
+///
+/// This shows how to find the first position of any of three bytes in a byte
+/// string.
+///
+/// ```
+/// use memchr::memchr3;
+///
+/// let haystack = b"the quick brown fox";
+/// assert_eq!(memchr3(b'k', b'q', b'e', haystack), Some(2));
+/// ```
+#[inline]
+pub fn memchr3(
+    needle1: u8,
+    needle2: u8,
+    needle3: u8,
+    haystack: &[u8],
+) -> Option<usize> {
+    #[cfg(miri)]
+    #[inline(always)]
+    fn imp(n1: u8, n2: u8, n3: u8, haystack: &[u8]) -> Option<usize> {
+        naive::memchr3(n1, n2, n3, haystack)
+    }
+
+    #[cfg(all(target_arch = "x86_64", memchr_runtime_simd, not(miri)))]
+    #[inline(always)]
+    fn imp(n1: u8, n2: u8, n3: u8, haystack: &[u8]) -> Option<usize> {
+        x86::memchr3(n1, n2, n3, haystack)
+    }
+
+    #[cfg(all(
+        not(all(target_arch = "x86_64", memchr_runtime_simd)),
+        not(miri),
+    ))]
+    #[inline(always)]
+    fn imp(n1: u8, n2: u8, n3: u8, haystack: &[u8]) -> Option<usize> {
+        fallback::memchr3(n1, n2, n3, haystack)
+    }
+
+    if haystack.is_empty() {
+        None
+    } else {
+        imp(needle1, needle2, needle3, haystack)
+    }
+}
+
+/// Search for the last occurrence of a byte in a slice.
+///
+/// This returns the index corresponding to the last occurrence of `needle` in
+/// `haystack`, or `None` if one is not found.
+///
+/// While this is operationally the same as something like
+/// `haystack.iter().rposition(|&b| b == needle)`, `memrchr` will use a highly
+/// optimized routine that can be up to an order of magnitude faster in some
+/// cases.
+///
+/// # Example
+///
+/// This shows how to find the last position of a byte in a byte string.
+///
+/// ```
+/// use memchr::memrchr;
+///
+/// let haystack = b"the quick brown fox";
+/// assert_eq!(memrchr(b'o', haystack), Some(17));
+/// ```
+#[inline]
+pub fn memrchr(needle: u8, haystack: &[u8]) -> Option<usize> {
+    #[cfg(miri)]
+    #[inline(always)]
+    fn imp(n1: u8, haystack: &[u8]) -> Option<usize> {
+        naive::memrchr(n1, haystack)
+    }
+
+    #[cfg(all(target_arch = "x86_64", memchr_runtime_simd, not(miri)))]
+    #[inline(always)]
+    fn imp(n1: u8, haystack: &[u8]) -> Option<usize> {
+        x86::memrchr(n1, haystack)
+    }
+
+    #[cfg(all(
+        memchr_libc,
+        target_os = "linux",
+        not(all(target_arch = "x86_64", memchr_runtime_simd)),
+        not(miri)
+    ))]
+    #[inline(always)]
+    fn imp(n1: u8, haystack: &[u8]) -> Option<usize> {
+        c::memrchr(n1, haystack)
+    }
+
+    #[cfg(all(
+        not(all(memchr_libc, target_os = "linux")),
+        not(all(target_arch = "x86_64", memchr_runtime_simd)),
+        not(miri),
+    ))]
+    #[inline(always)]
+    fn imp(n1: u8, haystack: &[u8]) -> Option<usize> {
+        fallback::memrchr(n1, haystack)
+    }
+
+    if haystack.is_empty() {
+        None
+    } else {
+        imp(needle, haystack)
+    }
+}
+
+/// Like `memrchr`, but searches for either of two bytes instead of just one.
+///
+/// This returns the index corresponding to the last occurrence of `needle1`
+/// or the last occurrence of `needle2` in `haystack` (whichever occurs later),
+/// or `None` if neither one is found.
+///
+/// While this is operationally the same as something like
+/// `haystack.iter().rposition(|&b| b == needle1 || b == needle2)`, `memrchr2`
+/// will use a highly optimized routine that can be up to an order of magnitude
+/// faster in some cases.
+///
+/// # Example
+///
+/// This shows how to find the last position of either of two bytes in a byte
+/// string.
+///
+/// ```
+/// use memchr::memrchr2;
+///
+/// let haystack = b"the quick brown fox";
+/// assert_eq!(memrchr2(b'k', b'q', haystack), Some(8));
+/// ```
+#[inline]
+pub fn memrchr2(needle1: u8, needle2: u8, haystack: &[u8]) -> Option<usize> {
+    #[cfg(miri)]
+    #[inline(always)]
+    fn imp(n1: u8, n2: u8, haystack: &[u8]) -> Option<usize> {
+        naive::memrchr2(n1, n2, haystack)
+    }
+
+    #[cfg(all(target_arch = "x86_64", memchr_runtime_simd, not(miri)))]
+    #[inline(always)]
+    fn imp(n1: u8, n2: u8, haystack: &[u8]) -> Option<usize> {
+        x86::memrchr2(n1, n2, haystack)
+    }
+
+    #[cfg(all(
+        not(all(target_arch = "x86_64", memchr_runtime_simd)),
+        not(miri),
+    ))]
+    #[inline(always)]
+    fn imp(n1: u8, n2: u8, haystack: &[u8]) -> Option<usize> {
+        fallback::memrchr2(n1, n2, haystack)
+    }
+
+    if haystack.is_empty() {
+        None
+    } else {
+        imp(needle1, needle2, haystack)
+    }
+}
+
+/// Like `memrchr`, but searches for any of three bytes instead of just one.
+///
+/// This returns the index corresponding to the last occurrence of `needle1`,
+/// the last occurrence of `needle2`, or the last occurrence of `needle3` in
+/// `haystack` (whichever occurs later), or `None` if none are found.
+///
+/// While this is operationally the same as something like
+/// `haystack.iter().rposition(|&b| b == needle1 || b == needle2 ||
+/// b == needle3)`, `memrchr3` will use a highly optimized routine that can be
+/// up to an order of magnitude faster in some cases.
+///
+/// # Example
+///
+/// This shows how to find the last position of any of three bytes in a byte
+/// string.
+///
+/// ```
+/// use memchr::memrchr3;
+///
+/// let haystack = b"the quick brown fox";
+/// assert_eq!(memrchr3(b'k', b'q', b'e', haystack), Some(8));
+/// ```
+#[inline]
+pub fn memrchr3(
+    needle1: u8,
+    needle2: u8,
+    needle3: u8,
+    haystack: &[u8],
+) -> Option<usize> {
+    #[cfg(miri)]
+    #[inline(always)]
+    fn imp(n1: u8, n2: u8, n3: u8, haystack: &[u8]) -> Option<usize> {
+        naive::memrchr3(n1, n2, n3, haystack)
+    }
+
+    #[cfg(all(target_arch = "x86_64", memchr_runtime_simd, not(miri)))]
+    #[inline(always)]
+    fn imp(n1: u8, n2: u8, n3: u8, haystack: &[u8]) -> Option<usize> {
+        x86::memrchr3(n1, n2, n3, haystack)
+    }
+
+    #[cfg(all(
+        not(all(target_arch = "x86_64", memchr_runtime_simd)),
+        not(miri),
+    ))]
+    #[inline(always)]
+    fn imp(n1: u8, n2: u8, n3: u8, haystack: &[u8]) -> Option<usize> {
+        fallback::memrchr3(n1, n2, n3, haystack)
+    }
+
+    if haystack.is_empty() {
+        None
+    } else {
+        imp(needle1, needle2, needle3, haystack)
+    }
+}
diff --git a/src/naive.rs b/src/naive.rs
new file mode 100644
index 0000000..3f3053d
--- /dev/null
+++ b/src/naive.rs
@@ -0,0 +1,25 @@
+#![allow(dead_code)]
+
+pub fn memchr(n1: u8, haystack: &[u8]) -> Option<usize> {
+    haystack.iter().position(|&b| b == n1)
+}
+
+pub fn memchr2(n1: u8, n2: u8, haystack: &[u8]) -> Option<usize> {
+    haystack.iter().position(|&b| b == n1 || b == n2)
+}
+
+pub fn memchr3(n1: u8, n2: u8, n3: u8, haystack: &[u8]) -> Option<usize> {
+    haystack.iter().position(|&b| b == n1 || b == n2 || b == n3)
+}
+
+pub fn memrchr(n1: u8, haystack: &[u8]) -> Option<usize> {
+    haystack.iter().rposition(|&b| b == n1)
+}
+
+pub fn memrchr2(n1: u8, n2: u8, haystack: &[u8]) -> Option<usize> {
+    haystack.iter().rposition(|&b| b == n1 || b == n2)
+}
+
+pub fn memrchr3(n1: u8, n2: u8, n3: u8, haystack: &[u8]) -> Option<usize> {
+    haystack.iter().rposition(|&b| b == n1 || b == n2 || b == n3)
+}
diff --git a/src/tests/iter.rs b/src/tests/iter.rs
new file mode 100644
index 0000000..8f33500
--- /dev/null
+++ b/src/tests/iter.rs
@@ -0,0 +1,229 @@
+use tests::memchr_tests;
+use {Memchr, Memchr2, Memchr3};
+
+#[test]
+fn memchr1_iter() {
+    for test in memchr_tests() {
+        test.iter_one(false, Memchr::new);
+    }
+}
+
+#[test]
+fn memchr2_iter() {
+    for test in memchr_tests() {
+        test.iter_two(false, Memchr2::new);
+    }
+}
+
+#[test]
+fn memchr3_iter() {
+    for test in memchr_tests() {
+        test.iter_three(false, Memchr3::new);
+    }
+}
+
+#[test]
+fn memrchr1_iter() {
+    for test in memchr_tests() {
+        test.iter_one(true, |n1, corpus| Memchr::new(n1, corpus).rev());
+    }
+}
+
+#[test]
+fn memrchr2_iter() {
+    for test in memchr_tests() {
+        test.iter_two(true, |n1, n2, corpus| {
+            Memchr2::new(n1, n2, corpus).rev()
+        })
+    }
+}
+
+#[test]
+fn memrchr3_iter() {
+    for test in memchr_tests() {
+        test.iter_three(true, |n1, n2, n3, corpus| {
+            Memchr3::new(n1, n2, n3, corpus).rev()
+        })
+    }
+}
+
+quickcheck! {
+    fn qc_memchr_double_ended_iter(
+        needle: u8, data: Vec<u8>, take_side: Vec<bool>
+    ) -> bool {
+        // make nonempty
+        let mut take_side = take_side;
+        if take_side.is_empty() { take_side.push(true) };
+
+        let iter = Memchr::new(needle, &data);
+        let all_found = double_ended_take(
+            iter, take_side.iter().cycle().cloned());
+
+        all_found.iter().cloned().eq(positions1(needle, &data))
+    }
+
+    fn qc_memchr2_double_ended_iter(
+        needle1: u8, needle2: u8, data: Vec<u8>, take_side: Vec<bool>
+    ) -> bool {
+        // make nonempty
+        let mut take_side = take_side;
+        if take_side.is_empty() { take_side.push(true) };
+
+        let iter = Memchr2::new(needle1, needle2, &data);
+        let all_found = double_ended_take(
+            iter, take_side.iter().cycle().cloned());
+
+        all_found.iter().cloned().eq(positions2(needle1, needle2, &data))
+    }
+
+    fn qc_memchr3_double_ended_iter(
+        needle1: u8, needle2: u8, needle3: u8,
+        data: Vec<u8>, take_side: Vec<bool>
+    ) -> bool {
+        // make nonempty
+        let mut take_side = take_side;
+        if take_side.is_empty() { take_side.push(true) };
+
+        let iter = Memchr3::new(needle1, needle2, needle3, &data);
+        let all_found = double_ended_take(
+            iter, take_side.iter().cycle().cloned());
+
+        all_found
+            .iter()
+            .cloned()
+            .eq(positions3(needle1, needle2, needle3, &data))
+    }
+
+    fn qc_memchr1_iter(data: Vec<u8>) -> bool {
+        let needle = 0;
+        let answer = positions1(needle, &data);
+        answer.eq(Memchr::new(needle, &data))
+    }
+
+    fn qc_memchr1_rev_iter(data: Vec<u8>) -> bool {
+        let needle = 0;
+        let answer = positions1(needle, &data);
+        answer.rev().eq(Memchr::new(needle, &data).rev())
+    }
+
+    fn qc_memchr2_iter(data: Vec<u8>) -> bool {
+        let needle1 = 0;
+        let needle2 = 1;
+        let answer = positions2(needle1, needle2, &data);
+        answer.eq(Memchr2::new(needle1, needle2, &data))
+    }
+
+    fn qc_memchr2_rev_iter(data: Vec<u8>) -> bool {
+        let needle1 = 0;
+        let needle2 = 1;
+        let answer = positions2(needle1, needle2, &data);
+        answer.rev().eq(Memchr2::new(needle1, needle2, &data).rev())
+    }
+
+    fn qc_memchr3_iter(data: Vec<u8>) -> bool {
+        let needle1 = 0;
+        let needle2 = 1;
+        let needle3 = 2;
+        let answer = positions3(needle1, needle2, needle3, &data);
+        answer.eq(Memchr3::new(needle1, needle2, needle3, &data))
+    }
+
+    fn qc_memchr3_rev_iter(data: Vec<u8>) -> bool {
+        let needle1 = 0;
+        let needle2 = 1;
+        let needle3 = 2;
+        let answer = positions3(needle1, needle2, needle3, &data);
+        answer.rev().eq(Memchr3::new(needle1, needle2, needle3, &data).rev())
+    }
+
+    fn qc_memchr1_iter_size_hint(data: Vec<u8>) -> bool {
+        // test that the size hint is within reasonable bounds
+        let needle = 0;
+        let mut iter = Memchr::new(needle, &data);
+        let mut real_count = data
+            .iter()
+            .filter(|&&elt| elt == needle)
+            .count();
+
+        while let Some(index) = iter.next() {
+            real_count -= 1;
+            let (lower, upper) = iter.size_hint();
+            assert!(lower <= real_count);
+            assert!(upper.unwrap() >= real_count);
+            assert!(upper.unwrap() <= data.len() - index);
+        }
+        true
+    }
+}
+
+// take items from a DEI, taking front for each true and back for each false.
+// Return a vector with the concatenation of the fronts and the reverse of the
+// backs.
+fn double_ended_take<I, J>(mut iter: I, take_side: J) -> Vec<I::Item>
+where
+    I: DoubleEndedIterator,
+    J: Iterator<Item = bool>,
+{
+    let mut found_front = Vec::new();
+    let mut found_back = Vec::new();
+
+    for take_front in take_side {
+        if take_front {
+            if let Some(pos) = iter.next() {
+                found_front.push(pos);
+            } else {
+                break;
+            }
+        } else {
+            if let Some(pos) = iter.next_back() {
+                found_back.push(pos);
+            } else {
+                break;
+            }
+        };
+    }
+
+    let mut all_found = found_front;
+    all_found.extend(found_back.into_iter().rev());
+    all_found
+}
+
+// return an iterator of the 0-based indices of haystack that match the needle
+fn positions1<'a>(
+    n1: u8,
+    haystack: &'a [u8],
+) -> Box<dyn DoubleEndedIterator<Item = usize> + 'a> {
+    let it = haystack
+        .iter()
+        .enumerate()
+        .filter(move |&(_, &b)| b == n1)
+        .map(|t| t.0);
+    Box::new(it)
+}
+
+fn positions2<'a>(
+    n1: u8,
+    n2: u8,
+    haystack: &'a [u8],
+) -> Box<dyn DoubleEndedIterator<Item = usize> + 'a> {
+    let it = haystack
+        .iter()
+        .enumerate()
+        .filter(move |&(_, &b)| b == n1 || b == n2)
+        .map(|t| t.0);
+    Box::new(it)
+}
+
+fn positions3<'a>(
+    n1: u8,
+    n2: u8,
+    n3: u8,
+    haystack: &'a [u8],
+) -> Box<dyn DoubleEndedIterator<Item = usize> + 'a> {
+    let it = haystack
+        .iter()
+        .enumerate()
+        .filter(move |&(_, &b)| b == n1 || b == n2 || b == n3)
+        .map(|t| t.0);
+    Box::new(it)
+}
diff --git a/src/tests/memchr.rs b/src/tests/memchr.rs
new file mode 100644
index 0000000..87d3d14
--- /dev/null
+++ b/src/tests/memchr.rs
@@ -0,0 +1,131 @@
+use fallback;
+use naive;
+use {memchr, memchr2, memchr3, memrchr, memrchr2, memrchr3};
+
+use tests::memchr_tests;
+
+#[test]
+fn memchr1_find() {
+    for test in memchr_tests() {
+        test.one(false, memchr);
+    }
+}
+
+#[test]
+fn memchr1_fallback_find() {
+    for test in memchr_tests() {
+        test.one(false, fallback::memchr);
+    }
+}
+
+#[test]
+fn memchr2_find() {
+    for test in memchr_tests() {
+        test.two(false, memchr2);
+    }
+}
+
+#[test]
+fn memchr2_fallback_find() {
+    for test in memchr_tests() {
+        test.two(false, fallback::memchr2);
+    }
+}
+
+#[test]
+fn memchr3_find() {
+    for test in memchr_tests() {
+        test.three(false, memchr3);
+    }
+}
+
+#[test]
+fn memchr3_fallback_find() {
+    for test in memchr_tests() {
+        test.three(false, fallback::memchr3);
+    }
+}
+
+#[test]
+fn memrchr1_find() {
+    for test in memchr_tests() {
+        test.one(true, memrchr);
+    }
+}
+
+#[test]
+fn memrchr1_fallback_find() {
+    for test in memchr_tests() {
+        test.one(true, fallback::memrchr);
+    }
+}
+
+#[test]
+fn memrchr2_find() {
+    for test in memchr_tests() {
+        test.two(true, memrchr2);
+    }
+}
+
+#[test]
+fn memrchr2_fallback_find() {
+    for test in memchr_tests() {
+        test.two(true, fallback::memrchr2);
+    }
+}
+
+#[test]
+fn memrchr3_find() {
+    for test in memchr_tests() {
+        test.three(true, memrchr3);
+    }
+}
+
+#[test]
+fn memrchr3_fallback_find() {
+    for test in memchr_tests() {
+        test.three(true, fallback::memrchr3);
+    }
+}
+
+quickcheck! {
+    fn qc_memchr1_matches_naive(n1: u8, corpus: Vec<u8>) -> bool {
+        memchr(n1, &corpus) == naive::memchr(n1, &corpus)
+    }
+}
+
+quickcheck! {
+    fn qc_memchr2_matches_naive(n1: u8, n2: u8, corpus: Vec<u8>) -> bool {
+        memchr2(n1, n2, &corpus) == naive::memchr2(n1, n2, &corpus)
+    }
+}
+
+quickcheck! {
+    fn qc_memchr3_matches_naive(
+        n1: u8, n2: u8, n3: u8,
+        corpus: Vec<u8>
+    ) -> bool {
+        memchr3(n1, n2, n3, &corpus) == naive::memchr3(n1, n2, n3, &corpus)
+    }
+}
+
+quickcheck! {
+    fn qc_memrchr1_matches_naive(n1: u8, corpus: Vec<u8>) -> bool {
+        memrchr(n1, &corpus) == naive::memrchr(n1, &corpus)
+    }
+}
+
+quickcheck! {
+    fn qc_memrchr2_matches_naive(n1: u8, n2: u8, corpus: Vec<u8>) -> bool {
+        memrchr2(n1, n2, &corpus) == naive::memrchr2(n1, n2, &corpus)
+    }
+}
+
+quickcheck! {
+    fn qc_memrchr3_matches_naive(
+        n1: u8, n2: u8, n3: u8,
+        corpus: Vec<u8>
+    ) -> bool {
+        memrchr3(n1, n2, n3, &corpus) == naive::memrchr3(n1, n2, n3, &corpus)
+    }
+}
diff --git a/src/tests/miri.rs b/src/tests/miri.rs
new file mode 100644
index 0000000..879ef93
--- /dev/null
+++ b/src/tests/miri.rs
@@ -0,0 +1,19 @@
+// Simple tests using MIRI
+
+use crate::{memchr, memchr2, memchr3, memrchr, memrchr2, memrchr3};
+
+#[test]
+fn test_with_miri() {
+    assert_eq!(memchr(b'a', b"abcda"), Some(0));
+    assert_eq!(memchr(b'z', b"abcda"), None);
+    assert_eq!(memchr2(b'a', b'z', b"abcda"), Some(0));
+    assert_eq!(memchr2(b'z', b'y', b"abcda"), None);
+    assert_eq!(memchr3(b'a', b'z', b'b', b"abcda"), Some(0));
+    assert_eq!(memchr3(b'z', b'y', b'x', b"abcda"), None);
+    assert_eq!(memrchr(b'a', b"abcda"), Some(4));
+    assert_eq!(memrchr(b'z', b"abcda"), None);
+    assert_eq!(memrchr2(b'a', b'z', b"abcda"), Some(4));
+    assert_eq!(memrchr2(b'z', b'y', b"abcda"), None);
+    assert_eq!(memrchr3(b'a', b'z', b'b', b"abcda"), Some(4));
+    assert_eq!(memrchr3(b'z', b'y', b'x', b"abcda"), None);
+}
diff --git a/src/tests/mod.rs b/src/tests/mod.rs
new file mode 100644
index 0000000..82c1a24
--- /dev/null
+++ b/src/tests/mod.rs
@@ -0,0 +1,362 @@
+use std::iter::repeat;
+
+mod iter;
+mod memchr;
+
+#[cfg(target_endian = "little")]
+#[test]
+fn byte_order() {
+    eprintln!("LITTLE ENDIAN");
+}
+
+#[cfg(target_endian = "big")]
+#[test]
+fn byte_order() {
+    eprintln!("BIG ENDIAN");
+}
+
+/// Create a sequence of tests that should be run by memchr implementations.
+fn memchr_tests() -> Vec<MemchrTest> {
+    let mut tests = Vec::new();
+    for statict in MEMCHR_TESTS {
+        assert!(!statict.corpus.contains("%"), "% is not allowed in corpora");
+        assert!(!statict.corpus.contains("#"), "# is not allowed in corpora");
+        assert!(!statict.needles.contains(&b'%'), "% is an invalid needle");
+        assert!(!statict.needles.contains(&b'#'), "# is an invalid needle");
+
+        let t = MemchrTest {
+            corpus: statict.corpus.to_string(),
+            needles: statict.needles.to_vec(),
+            positions: statict.positions.to_vec(),
+        };
+        tests.push(t.clone());
+        tests.extend(t.expand());
+    }
+    tests
+}
+
+/// A set of tests for memchr-like functions.
+///
+/// These tests mostly try to cover the short string cases. We cover the longer
+/// string cases via the benchmarks (which are tests themselves), via
+/// quickcheck tests and via automatic expansion of each test case (by
+/// increasing the corpus size). Finally, we cover different alignment cases
+/// in the tests by varying the starting point of the slice.
+const MEMCHR_TESTS: &[MemchrTestStatic] = &[
+    // one needle (applied to memchr + memchr2 + memchr3)
+    MemchrTestStatic { corpus: "a", needles: &[b'a'], positions: &[0] },
+    MemchrTestStatic { corpus: "aa", needles: &[b'a'], positions: &[0, 1] },
+    MemchrTestStatic {
+        corpus: "aaa",
+        needles: &[b'a'],
+        positions: &[0, 1, 2],
+    },
+    MemchrTestStatic { corpus: "", needles: &[b'a'], positions: &[] },
+    MemchrTestStatic { corpus: "z", needles: &[b'a'], positions: &[] },
+    MemchrTestStatic { corpus: "zz", needles: &[b'a'], positions: &[] },
+    MemchrTestStatic { corpus: "zza", needles: &[b'a'], positions: &[2] },
+    MemchrTestStatic { corpus: "zaza", needles: &[b'a'], positions: &[1, 3] },
+    MemchrTestStatic { corpus: "zzza", needles: &[b'a'], positions: &[3] },
+    MemchrTestStatic { corpus: "\x00a", needles: &[b'a'], positions: &[1] },
+    MemchrTestStatic { corpus: "\x00", needles: &[b'\x00'], positions: &[0] },
+    MemchrTestStatic {
+        corpus: "\x00\x00",
+        needles: &[b'\x00'],
+        positions: &[0, 1],
+    },
+    MemchrTestStatic {
+        corpus: "\x00a\x00",
+        needles: &[b'\x00'],
+        positions: &[0, 2],
+    },
+    MemchrTestStatic {
+        corpus: "zzzzzzzzzzzzzzzza",
+        needles: &[b'a'],
+        positions: &[16],
+    },
+    MemchrTestStatic {
+        corpus: "zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzza",
+        needles: &[b'a'],
+        positions: &[32],
+    },
+    // two needles (applied to memchr2 + memchr3)
+    MemchrTestStatic {
+        corpus: "az",
+        needles: &[b'a', b'z'],
+        positions: &[0, 1],
+    },
+    MemchrTestStatic {
+        corpus: "az",
+        needles: &[b'a', b'z'],
+        positions: &[0, 1],
+    },
+    MemchrTestStatic { corpus: "az", needles: &[b'x', b'y'], positions: &[] },
+    MemchrTestStatic { corpus: "az", needles: &[b'a', b'y'], positions: &[0] },
+    MemchrTestStatic { corpus: "az", needles: &[b'x', b'z'], positions: &[1] },
+    MemchrTestStatic {
+        corpus: "yyyyaz",
+        needles: &[b'a', b'z'],
+        positions: &[4, 5],
+    },
+    MemchrTestStatic {
+        corpus: "yyyyaz",
+        needles: &[b'z', b'a'],
+        positions: &[4, 5],
+    },
+    // three needles (applied to memchr3)
+    MemchrTestStatic {
+        corpus: "xyz",
+        needles: &[b'x', b'y', b'z'],
+        positions: &[0, 1, 2],
+    },
+    MemchrTestStatic {
+        corpus: "zxy",
+        needles: &[b'x', b'y', b'z'],
+        positions: &[0, 1, 2],
+    },
+    MemchrTestStatic {
+        corpus: "zxy",
+        needles: &[b'x', b'a', b'z'],
+        positions: &[0, 1],
+    },
+    MemchrTestStatic {
+        corpus: "zxy",
+        needles: &[b't', b'a', b'z'],
+        positions: &[0],
+    },
+    MemchrTestStatic {
+        corpus: "yxz",
+        needles: &[b't', b'a', b'z'],
+        positions: &[2],
+    },
+];
+
+/// A description of a test on a memchr-like function.
+#[derive(Clone, Debug)]
+struct MemchrTest {
+    /// The thing to search. We use `&str` instead of `&[u8]` because they
+    /// are nicer to write in tests, and we don't miss much since memchr
+    /// doesn't care about UTF-8.
+    ///
+    /// Corpora cannot contain either '%' or '#'. We use these bytes when
+    /// expanding test cases into many test cases, and we assume they are not
+    /// used. If they are used, `memchr_tests` will panic.
+    corpus: String,
+    /// The needles to search for. This is intended to be an "alternation" of
+    /// needles. The number of needles may cause this test to be skipped for
+    /// some memchr variants. For example, a test with 2 needles cannot be used
+    /// to test `memchr`, but can be used to test `memchr2` and `memchr3`.
+    /// However, a test with only 1 needle can be used to test all of `memchr`,
+    /// `memchr2` and `memchr3`. We achieve this by filling in the needles with
+    /// bytes that we never use in the corpus (such as '#').
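+    ///
+    /// For instance (an illustrative reading of `needles()` below): a test
+    /// with the single needle b'a' is exercised against `memchr3` as
+    /// `memchr3(b'a', b'#', b'#', corpus)`, since '#' never appears in any
+    /// corpus.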
+    needles: Vec<u8>,
+    /// The positions expected to match for all of the needles.
+    positions: Vec<usize>,
+}
+
+/// Like MemchrTest, but easier to define as a constant.
+#[derive(Clone, Debug)]
+struct MemchrTestStatic {
+    corpus: &'static str,
+    needles: &'static [u8],
+    positions: &'static [usize],
+}
+
+impl MemchrTest {
+    fn one<F: Fn(u8, &[u8]) -> Option<usize>>(&self, reverse: bool, f: F) {
+        let needles = match self.needles(1) {
+            None => return,
+            Some(needles) => needles,
+        };
+        // We test different alignments here. Since some implementations use
+        // AVX2, which can read 32 bytes at a time, we test at least that.
+        // Moreover, with loop unrolling, we sometimes process 64 (sse2) or 128
+        // (avx) bytes at a time, so we include that in our offsets as well.
+        //
+        // You might think this would cause most needles to not be found, but
+        // we actually expand our tests to include corpus sizes all the way up
+        // to >500 bytes, so we should exercise most branches.
+        for align in 0..130 {
+            let corpus = self.corpus(align);
+            assert_eq!(
+                self.positions(align, reverse).get(0).cloned(),
+                f(needles[0], corpus.as_bytes()),
+                "search for {:?} failed in: {:?} (len: {}, alignment: {})",
+                needles[0] as char,
+                corpus,
+                corpus.len(),
+                align
+            );
+        }
+    }
+
+    fn two<F: Fn(u8, u8, &[u8]) -> Option<usize>>(&self, reverse: bool, f: F) {
+        let needles = match self.needles(2) {
+            None => return,
+            Some(needles) => needles,
+        };
+        for align in 0..130 {
+            let corpus = self.corpus(align);
+            assert_eq!(
+                self.positions(align, reverse).get(0).cloned(),
+                f(needles[0], needles[1], corpus.as_bytes()),
+                "search for {:?}|{:?} failed in: {:?} \
+                 (len: {}, alignment: {})",
+                needles[0] as char,
+                needles[1] as char,
+                corpus,
+                corpus.len(),
+                align
+            );
+        }
+    }
+
+    fn three<F: Fn(u8, u8, u8, &[u8]) -> Option<usize>>(
+        &self,
+        reverse: bool,
+        f: F,
+    ) {
+        let needles = match self.needles(3) {
+            None => return,
+            Some(needles) => needles,
+        };
+        for align in 0..130 {
+            let corpus = self.corpus(align);
+            assert_eq!(
+                self.positions(align, reverse).get(0).cloned(),
+                f(needles[0], needles[1], needles[2], corpus.as_bytes()),
+                "search for {:?}|{:?}|{:?} failed in: {:?} \
+                 (len: {}, alignment: {})",
+                needles[0] as char,
+                needles[1] as char,
+                needles[2] as char,
+                corpus,
+                corpus.len(),
+                align
+            );
+        }
+    }
+
+    fn iter_one<'a, I, F>(&'a self, reverse: bool, f: F)
+    where
+        F: FnOnce(u8, &'a [u8]) -> I,
+        I: Iterator<Item = usize>,
+    {
+        if let Some(ns) = self.needles(1) {
+            self.iter(reverse, f(ns[0], self.corpus.as_bytes()));
+        }
+    }
+
+    fn iter_two<'a, I, F>(&'a self, reverse: bool, f: F)
+    where
+        F: FnOnce(u8, u8, &'a [u8]) -> I,
+        I: Iterator<Item = usize>,
+    {
+        if let Some(ns) = self.needles(2) {
+            self.iter(reverse, f(ns[0], ns[1], self.corpus.as_bytes()));
+        }
+    }
+
+    fn iter_three<'a, I, F>(&'a self, reverse: bool, f: F)
+    where
+        F: FnOnce(u8, u8, u8, &'a [u8]) -> I,
+        I: Iterator<Item = usize>,
+    {
+        if let Some(ns) = self.needles(3) {
+            self.iter(reverse, f(ns[0], ns[1], ns[2], self.corpus.as_bytes()));
+        }
+    }
+
+    /// Test that the positions yielded by the given iterator match the
+    /// positions in this test. If reverse is true, then reverse the positions
+    /// before comparing them.
+    fn iter<I: Iterator<Item = usize>>(&self, reverse: bool, it: I) {
+        assert_eq!(
+            self.positions(0, reverse),
+            it.collect::<Vec<usize>>(),
+            r"search for {:?} failed in: {:?}",
+            self.needles.iter().map(|&b| b as char).collect::<Vec<char>>(),
+            self.corpus
+        );
+    }
+
+    /// Expand this test into many variations of the same test.
+    ///
+    /// In particular, this will generate more tests with larger corpus sizes.
+    /// The expected positions are updated to maintain the integrity of the
+    /// test.
+    ///
+    /// This is important in testing a memchr implementation, because there are
+    /// often different cases depending on the length of the corpus.
+    ///
+    /// Note that we extend the corpus by adding `%` bytes, which we
+    /// don't otherwise use as a needle.
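+    ///
+    /// As a rough illustration: the test { corpus: "zza", positions: [2] }
+    /// produces variants such as { corpus: "%zza", positions: [3] } and
+    /// { corpus: "zza%", positions: [2] }, with padding lengths running from
+    /// 1 to 514 on each side.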
+    fn expand(&self) -> Vec<MemchrTest> {
+        let mut more = Vec::new();
+
+        // Add bytes to the start of the corpus.
+        for i in 1..515 {
+            let mut t = self.clone();
+            let mut new_corpus: String = repeat('%').take(i).collect();
+            new_corpus.push_str(&t.corpus);
+            t.corpus = new_corpus;
+            t.positions = t.positions.into_iter().map(|p| p + i).collect();
+            more.push(t);
+        }
+        // Add bytes to the end of the corpus.
+        for i in 1..515 {
+            let mut t = self.clone();
+            let padding: String = repeat('%').take(i).collect();
+            t.corpus.push_str(&padding);
+            more.push(t);
+        }
+
+        more
+    }
+
+    /// Return the corpus at the given alignment.
+    ///
+    /// If the alignment exceeds the length of the corpus, then this returns
+    /// an empty slice.
+    fn corpus(&self, align: usize) -> &str {
+        self.corpus.get(align..).unwrap_or("")
+    }
+
+    /// Return exactly `count` needles from this test. If this test has fewer
+    /// than `count` needles, then add `#` until the number of needles
+    /// matches `count`. If this test has more than `count` needles, then
+    /// return `None` (because there is no way to use this test data for a
+    /// search using fewer needles).
+    fn needles(&self, count: usize) -> Option<Vec<u8>> {
+        if self.needles.len() > count {
+            return None;
+        }
+
+        let mut needles = self.needles.to_vec();
+        for _ in needles.len()..count {
+            // we assume # is never used in tests.
+            needles.push(b'#');
+        }
+        Some(needles)
+    }
+
+    /// Return the positions in this test, reversed if `reverse` is true.
+    ///
+    /// If alignment is given, then all positions greater than or equal to that
+    /// alignment are offset by the alignment. Positions less than the
+    /// alignment are dropped.
+    fn positions(&self, align: usize, reverse: bool) -> Vec<usize> {
+        let positions = if reverse {
+            let mut positions = self.positions.to_vec();
+            positions.reverse();
+            positions
+        } else {
+            self.positions.to_vec()
+        };
+        positions
+            .into_iter()
+            .filter(|&p| p >= align)
+            .map(|p| p - align)
+            .collect()
+    }
+}
diff --git a/src/x86/avx.rs b/src/x86/avx.rs
new file mode 100644
index 0000000..e3d8e89
--- /dev/null
+++ b/src/x86/avx.rs
@@ -0,0 +1,703 @@
+use core::arch::x86_64::*;
+use core::cmp;
+use core::mem::size_of;
+
+use x86::sse2;
+
+const VECTOR_SIZE: usize = size_of::<__m256i>();
+const VECTOR_ALIGN: usize = VECTOR_SIZE - 1;
+
+// The number of bytes to loop at in one iteration of memchr/memrchr.
+const LOOP_SIZE: usize = 4 * VECTOR_SIZE;
+
+// The number of bytes to loop at in one iteration of memchr2/memrchr2 and
+// memchr3/memrchr3. There was no observable difference between 128 and 64
+// bytes in benchmarks. memchr3 in particular only gets a very slight speed up
+// from the loop unrolling.
+const LOOP_SIZE2: usize = 2 * VECTOR_SIZE;
+
+#[target_feature(enable = "avx2")]
+pub unsafe fn memchr(n1: u8, haystack: &[u8]) -> Option<usize> {
+    // For a high level explanation for how this algorithm works, see the
+    // sse2 implementation. The avx implementation here is the same, but with
+    // 256-bit vectors instead of 128-bit vectors.
+
+    let start_ptr = haystack.as_ptr();
+    let end_ptr = haystack[haystack.len()..].as_ptr();
+    let mut ptr = start_ptr;
+
+    if haystack.len() < VECTOR_SIZE {
+        // For small haystacks, defer to the SSE2 implementation. Codegen
+        // suggests this completely avoids touching the AVX vectors.
+        return sse2::memchr(n1, haystack);
+    }
+
+    let vn1 = _mm256_set1_epi8(n1 as i8);
+    let loop_size = cmp::min(LOOP_SIZE, haystack.len());
+    if let Some(i) = forward_search1(start_ptr, end_ptr, ptr, vn1) {
+        return Some(i);
+    }
+
+    ptr = ptr.add(VECTOR_SIZE - (start_ptr as usize & VECTOR_ALIGN));
+    debug_assert!(ptr > start_ptr && end_ptr.sub(VECTOR_SIZE) >= start_ptr);
+    while loop_size == LOOP_SIZE && ptr <= end_ptr.sub(loop_size) {
+        debug_assert_eq!(0, (ptr as usize) % VECTOR_SIZE);
+
+        let a = _mm256_load_si256(ptr as *const __m256i);
+        let b = _mm256_load_si256(ptr.add(VECTOR_SIZE) as *const __m256i);
+        let c = _mm256_load_si256(ptr.add(2 * VECTOR_SIZE) as *const __m256i);
+        let d = _mm256_load_si256(ptr.add(3 * VECTOR_SIZE) as *const __m256i);
+        let eqa = _mm256_cmpeq_epi8(vn1, a);
+        let eqb = _mm256_cmpeq_epi8(vn1, b);
+        let eqc = _mm256_cmpeq_epi8(vn1, c);
+        let eqd = _mm256_cmpeq_epi8(vn1, d);
+        let or1 = _mm256_or_si256(eqa, eqb);
+        let or2 = _mm256_or_si256(eqc, eqd);
+        let or3 = _mm256_or_si256(or1, or2);
+        if _mm256_movemask_epi8(or3) != 0 {
+            let mut at = sub(ptr, start_ptr);
+            let mask = _mm256_movemask_epi8(eqa);
+            if mask != 0 {
+                return Some(at + forward_pos(mask));
+            }
+
+            at += VECTOR_SIZE;
+            let mask = _mm256_movemask_epi8(eqb);
+            if mask != 0 {
+                return Some(at + forward_pos(mask));
+            }
+
+            at += VECTOR_SIZE;
+            let mask = _mm256_movemask_epi8(eqc);
+            if mask != 0 {
+                return Some(at + forward_pos(mask));
+            }
+
+            at += VECTOR_SIZE;
+            let mask = _mm256_movemask_epi8(eqd);
+            debug_assert!(mask != 0);
+            return Some(at + forward_pos(mask));
+        }
+        ptr = ptr.add(loop_size);
+    }
+    while ptr <= end_ptr.sub(VECTOR_SIZE) {
+        debug_assert!(sub(end_ptr, ptr) >= VECTOR_SIZE);
+
+        if let Some(i) = forward_search1(start_ptr, end_ptr, ptr, vn1) {
+            return Some(i);
+        }
+        ptr = ptr.add(VECTOR_SIZE);
+    }
+    if ptr < end_ptr {
+        debug_assert!(sub(end_ptr, ptr) < VECTOR_SIZE);
+        ptr = ptr.sub(VECTOR_SIZE - sub(end_ptr, ptr));
+        debug_assert_eq!(sub(end_ptr, ptr), VECTOR_SIZE);
+
+        return forward_search1(start_ptr, end_ptr, ptr, vn1);
+    }
+    None
+}
+
+#[target_feature(enable = "avx2")]
+pub unsafe fn memchr2(n1: u8, n2: u8, haystack: &[u8]) -> Option<usize> {
+    let vn1 = _mm256_set1_epi8(n1 as i8);
+    let vn2 = _mm256_set1_epi8(n2 as i8);
+    let len = haystack.len();
+    let loop_size = cmp::min(LOOP_SIZE2, len);
+    let start_ptr = haystack.as_ptr();
+    let end_ptr = haystack[haystack.len()..].as_ptr();
+    let mut ptr = start_ptr;
+
+    if haystack.len() < VECTOR_SIZE {
+        while ptr < end_ptr {
+            if *ptr == n1 || *ptr == n2 {
+                return Some(sub(ptr, start_ptr));
+            }
+            ptr = ptr.offset(1);
+        }
+        return None;
+    }
+
+    if let Some(i) = forward_search2(start_ptr, end_ptr, ptr, vn1, vn2) {
+        return Some(i);
+    }
+
+    ptr = ptr.add(VECTOR_SIZE - (start_ptr as usize & VECTOR_ALIGN));
+    debug_assert!(ptr > start_ptr && end_ptr.sub(VECTOR_SIZE) >= start_ptr);
+    while loop_size == LOOP_SIZE2 && ptr <= end_ptr.sub(loop_size) {
+        debug_assert_eq!(0, (ptr as usize) % VECTOR_SIZE);
+
+        let a = _mm256_load_si256(ptr as *const __m256i);
+        let b = _mm256_load_si256(ptr.add(VECTOR_SIZE) as *const __m256i);
+        let eqa1 = _mm256_cmpeq_epi8(vn1, a);
+        let eqb1 = _mm256_cmpeq_epi8(vn1, b);
+        let eqa2 = _mm256_cmpeq_epi8(vn2, a);
+        let eqb2 = _mm256_cmpeq_epi8(vn2, b);
+        let or1 = _mm256_or_si256(eqa1, eqb1);
+        let or2 = _mm256_or_si256(eqa2, eqb2);
+        let or3 = _mm256_or_si256(or1, or2);
+        if _mm256_movemask_epi8(or3) != 0 {
+            let mut at = sub(ptr, start_ptr);
+            let mask1 = _mm256_movemask_epi8(eqa1);
+            let mask2 = _mm256_movemask_epi8(eqa2);
+            if mask1 != 0 || mask2 != 0 {
+                return Some(at + forward_pos2(mask1, mask2));
+            }
+
+            at += VECTOR_SIZE;
+            let mask1 = _mm256_movemask_epi8(eqb1);
+            let mask2 = _mm256_movemask_epi8(eqb2);
+            return Some(at + forward_pos2(mask1, mask2));
+        }
+        ptr = ptr.add(loop_size);
+    }
+    while ptr <= end_ptr.sub(VECTOR_SIZE) {
+        if let Some(i) = forward_search2(start_ptr, end_ptr, ptr, vn1, vn2) {
+            return Some(i);
+        }
+        ptr = ptr.add(VECTOR_SIZE);
+    }
+    if ptr < end_ptr {
+        debug_assert!(sub(end_ptr, ptr) < VECTOR_SIZE);
+        ptr = ptr.sub(VECTOR_SIZE - sub(end_ptr, ptr));
+        debug_assert_eq!(sub(end_ptr, ptr), VECTOR_SIZE);
+
+        return forward_search2(start_ptr, end_ptr, ptr, vn1, vn2);
+    }
+    None
+}
+
+#[target_feature(enable = "avx2")]
+pub unsafe fn memchr3(
+    n1: u8,
+    n2: u8,
+    n3: u8,
+    haystack: &[u8],
+) -> Option<usize> {
+    let vn1 = _mm256_set1_epi8(n1 as i8);
+    let vn2 = _mm256_set1_epi8(n2 as i8);
+    let vn3 = _mm256_set1_epi8(n3 as i8);
+    let len = haystack.len();
+    let loop_size = cmp::min(LOOP_SIZE2, len);
+    let start_ptr = haystack.as_ptr();
+    let end_ptr = haystack[haystack.len()..].as_ptr();
+    let mut ptr = start_ptr;
+
+    if haystack.len() < VECTOR_SIZE {
+        while ptr < end_ptr {
+            if *ptr == n1 || *ptr == n2 || *ptr == n3 {
+                return Some(sub(ptr, start_ptr));
+            }
+            ptr = ptr.offset(1);
+        }
+        return None;
+    }
+
+    if let Some(i) = forward_search3(start_ptr, end_ptr, ptr, vn1, vn2, vn3) {
+        return Some(i);
+    }
+
+    ptr = ptr.add(VECTOR_SIZE - (start_ptr as usize & VECTOR_ALIGN));
+    debug_assert!(ptr > start_ptr && end_ptr.sub(VECTOR_SIZE) >= start_ptr);
+    while loop_size == LOOP_SIZE2 && ptr <= end_ptr.sub(loop_size) {
+        debug_assert_eq!(0, (ptr as usize) % VECTOR_SIZE);
+
+        let a = _mm256_load_si256(ptr as *const __m256i);
+        let b = _mm256_load_si256(ptr.add(VECTOR_SIZE) as *const __m256i);
+        let eqa1 = _mm256_cmpeq_epi8(vn1, a);
+        let eqb1 = _mm256_cmpeq_epi8(vn1, b);
+        let eqa2 = _mm256_cmpeq_epi8(vn2, a);
+        let eqb2 = _mm256_cmpeq_epi8(vn2, b);
+        let eqa3 = _mm256_cmpeq_epi8(vn3, a);
+        let eqb3 = _mm256_cmpeq_epi8(vn3, b);
+        let or1 = _mm256_or_si256(eqa1, eqb1);
+        let or2 = _mm256_or_si256(eqa2, eqb2);
+        let or3 = _mm256_or_si256(eqa3, eqb3);
+        let or4 = _mm256_or_si256(or1, or2);
+        let or5 = _mm256_or_si256(or3, or4);
+        if _mm256_movemask_epi8(or5) != 0 {
+            let mut at = sub(ptr, start_ptr);
+            let mask1 = _mm256_movemask_epi8(eqa1);
+            let mask2 = _mm256_movemask_epi8(eqa2);
+            let mask3 = _mm256_movemask_epi8(eqa3);
+            if mask1 != 0 || mask2 != 0 || mask3 != 0 {
+                return Some(at + forward_pos3(mask1, mask2, mask3));
+            }
+
+            at += VECTOR_SIZE;
+            let mask1 = _mm256_movemask_epi8(eqb1);
+            let mask2 = _mm256_movemask_epi8(eqb2);
+            let mask3 = _mm256_movemask_epi8(eqb3);
+            return Some(at + forward_pos3(mask1, mask2, mask3));
+        }
+        ptr = ptr.add(loop_size);
+    }
+    while ptr <= end_ptr.sub(VECTOR_SIZE) {
+        if let Some(i) =
+            forward_search3(start_ptr, end_ptr, ptr, vn1, vn2, vn3)
+        {
+            return Some(i);
+        }
+        ptr = ptr.add(VECTOR_SIZE);
+    }
+    if ptr < end_ptr {
+        debug_assert!(sub(end_ptr, ptr) < VECTOR_SIZE);
+        ptr = ptr.sub(VECTOR_SIZE - sub(end_ptr, ptr));
+        debug_assert_eq!(sub(end_ptr, ptr), VECTOR_SIZE);
+
+        return forward_search3(start_ptr, end_ptr, ptr, vn1, vn2, vn3);
+    }
+    None
+}
+
+#[target_feature(enable = "avx2")]
+pub unsafe fn memrchr(n1: u8, haystack: &[u8]) -> Option<usize> {
+    let vn1 = _mm256_set1_epi8(n1 as i8);
+    let len = haystack.len();
+    let loop_size = cmp::min(LOOP_SIZE, len);
+    let start_ptr = haystack.as_ptr();
+    let end_ptr = haystack[haystack.len()..].as_ptr();
+    let mut ptr = end_ptr;
+
+    if haystack.len() < VECTOR_SIZE {
+        while ptr > start_ptr {
+            ptr = ptr.offset(-1);
+            if *ptr == n1 {
+                return Some(sub(ptr, start_ptr));
+            }
+        }
+        return None;
+    }
+
+    ptr = ptr.sub(VECTOR_SIZE);
+    if let Some(i) = reverse_search1(start_ptr, end_ptr, ptr, vn1) {
+        return Some(i);
+    }
+
+    ptr = (end_ptr as usize & !VECTOR_ALIGN) as *const u8;
+    debug_assert!(start_ptr <= ptr && ptr <= end_ptr);
+    while loop_size == LOOP_SIZE && ptr >= start_ptr.add(loop_size) {
+        debug_assert_eq!(0, (ptr as usize) % VECTOR_SIZE);
+
+        ptr = ptr.sub(loop_size);
+        let a = _mm256_load_si256(ptr as *const __m256i);
+        let b = _mm256_load_si256(ptr.add(VECTOR_SIZE) as *const __m256i);
+        let c = _mm256_load_si256(ptr.add(2 * VECTOR_SIZE) as *const __m256i);
+        let d = _mm256_load_si256(ptr.add(3 * VECTOR_SIZE) as *const __m256i);
+        let eqa = _mm256_cmpeq_epi8(vn1, a);
+        let eqb = _mm256_cmpeq_epi8(vn1, b);
+        let eqc = _mm256_cmpeq_epi8(vn1, c);
+        let eqd = _mm256_cmpeq_epi8(vn1, d);
+        let or1 = _mm256_or_si256(eqa, eqb);
+        let or2 = _mm256_or_si256(eqc, eqd);
+        let or3 = _mm256_or_si256(or1, or2);
+        if _mm256_movemask_epi8(or3) != 0 {
+            let mut at = sub(ptr.add(3 * VECTOR_SIZE), start_ptr);
+            let mask = _mm256_movemask_epi8(eqd);
+            if mask != 0 {
+                return Some(at + reverse_pos(mask));
+            }
+
+            at -= VECTOR_SIZE;
+            let mask = _mm256_movemask_epi8(eqc);
+            if mask != 0 {
+                return Some(at + reverse_pos(mask));
+            }
+
+            at -= VECTOR_SIZE;
+            let mask = _mm256_movemask_epi8(eqb);
+            if mask != 0 {
+                return Some(at + reverse_pos(mask));
+            }
+
+            at -= VECTOR_SIZE;
+            let mask = _mm256_movemask_epi8(eqa);
+            debug_assert!(mask != 0);
+            return Some(at + reverse_pos(mask));
+        }
+    }
+    while ptr >= start_ptr.add(VECTOR_SIZE) {
+        ptr = ptr.sub(VECTOR_SIZE);
+        if let Some(i) = reverse_search1(start_ptr, end_ptr, ptr, vn1) {
+            return Some(i);
+        }
+    }
+    if ptr > start_ptr {
+        debug_assert!(sub(ptr, start_ptr) < VECTOR_SIZE);
+        return reverse_search1(start_ptr, end_ptr, start_ptr, vn1);
+    }
+    None
+}
+
+#[target_feature(enable = "avx2")]
+pub unsafe fn memrchr2(n1: u8, n2: u8, haystack: &[u8]) -> Option<usize> {
+    let vn1 = _mm256_set1_epi8(n1 as i8);
+    let vn2 = _mm256_set1_epi8(n2 as i8);
+    let len = haystack.len();
+    let loop_size = cmp::min(LOOP_SIZE2, len);
+    let start_ptr = haystack.as_ptr();
+    let end_ptr = haystack[haystack.len()..].as_ptr();
+    let mut ptr = end_ptr;
+
+    if haystack.len() < VECTOR_SIZE {
+        while ptr > start_ptr {
+            ptr = ptr.offset(-1);
+            if *ptr == n1 || *ptr == n2 {
+                return Some(sub(ptr, start_ptr));
+            }
+        }
+        return None;
+    }
+
+    ptr = ptr.sub(VECTOR_SIZE);
+    if let Some(i) = reverse_search2(start_ptr, end_ptr, ptr, vn1, vn2) {
+        return Some(i);
+    }
+
+    ptr = (end_ptr as usize & !VECTOR_ALIGN) as *const u8;
+    debug_assert!(start_ptr <= ptr && ptr <= end_ptr);
+    while loop_size == LOOP_SIZE2 && ptr >= start_ptr.add(loop_size) {
+        debug_assert_eq!(0, (ptr as usize) % VECTOR_SIZE);
+
+        ptr = ptr.sub(loop_size);
+        let a = _mm256_load_si256(ptr as *const __m256i);
+        let b = _mm256_load_si256(ptr.add(VECTOR_SIZE) as *const __m256i);
+        let eqa1 = _mm256_cmpeq_epi8(vn1, a);
+        let eqb1 = _mm256_cmpeq_epi8(vn1, b);
+        let eqa2 = _mm256_cmpeq_epi8(vn2, a);
+        let eqb2 = _mm256_cmpeq_epi8(vn2, b);
+        let or1 = _mm256_or_si256(eqa1, eqb1);
+        let or2 = _mm256_or_si256(eqa2, eqb2);
+        let or3 = _mm256_or_si256(or1, or2);
+        if _mm256_movemask_epi8(or3) != 0 {
+            let mut at = sub(ptr.add(VECTOR_SIZE), start_ptr);
+            let mask1 = _mm256_movemask_epi8(eqb1);
+            let mask2 = _mm256_movemask_epi8(eqb2);
+            if mask1 != 0 || mask2 != 0 {
+                return Some(at + reverse_pos2(mask1, mask2));
+            }
+
+            at -= VECTOR_SIZE;
+            let mask1 = _mm256_movemask_epi8(eqa1);
+            let mask2 = _mm256_movemask_epi8(eqa2);
+            return Some(at + reverse_pos2(mask1, mask2));
+        }
+    }
+    while ptr >= start_ptr.add(VECTOR_SIZE) {
+        ptr = ptr.sub(VECTOR_SIZE);
+        if let Some(i) = reverse_search2(start_ptr, end_ptr, ptr, vn1, vn2) {
+            return Some(i);
+        }
+    }
+    if ptr > start_ptr {
+        debug_assert!(sub(ptr, start_ptr) < VECTOR_SIZE);
+        return reverse_search2(start_ptr, end_ptr, start_ptr, vn1, vn2);
+    }
+    None
+}
+
+#[target_feature(enable = "avx2")]
+pub unsafe fn memrchr3(
+    n1: u8,
+    n2: u8,
+    n3: u8,
+    haystack: &[u8],
+) -> Option<usize> {
+    let vn1 = _mm256_set1_epi8(n1 as i8);
+    let vn2 = _mm256_set1_epi8(n2 as i8);
+    let vn3 = _mm256_set1_epi8(n3 as i8);
+    let len = haystack.len();
+    let loop_size = cmp::min(LOOP_SIZE2, len);
+    let start_ptr = haystack.as_ptr();
+    let end_ptr = haystack[haystack.len()..].as_ptr();
+    let mut ptr = end_ptr;
+
+    if haystack.len() < VECTOR_SIZE {
+        while ptr > start_ptr {
+            ptr = ptr.offset(-1);
+            if *ptr == n1 || *ptr == n2 || *ptr == n3 {
+                return Some(sub(ptr, start_ptr));
+            }
+        }
+        return None;
+    }
+
+    ptr = ptr.sub(VECTOR_SIZE);
+    if let Some(i) = reverse_search3(start_ptr, end_ptr, ptr, vn1, vn2, vn3) {
+        return Some(i);
+    }
+
+    ptr = (end_ptr as usize & !VECTOR_ALIGN) as *const u8;
+    debug_assert!(start_ptr <= ptr && ptr <= end_ptr);
+    while loop_size == LOOP_SIZE2 && ptr >= start_ptr.add(loop_size) {
+        debug_assert_eq!(0, (ptr as usize) % VECTOR_SIZE);
+
+        ptr = ptr.sub(loop_size);
+        let a = _mm256_load_si256(ptr as *const __m256i);
+        let b = _mm256_load_si256(ptr.add(VECTOR_SIZE) as *const __m256i);
+        let eqa1 = _mm256_cmpeq_epi8(vn1, a);
+        let eqb1 = _mm256_cmpeq_epi8(vn1, b);
+        let eqa2 = _mm256_cmpeq_epi8(vn2, a);
+        let eqb2 = _mm256_cmpeq_epi8(vn2, b);
+        let eqa3 = _mm256_cmpeq_epi8(vn3, a);
+        let eqb3 = _mm256_cmpeq_epi8(vn3, b);
+        let or1 = _mm256_or_si256(eqa1, eqb1);
+        let or2 = _mm256_or_si256(eqa2, eqb2);
+        let or3 = _mm256_or_si256(eqa3, eqb3);
+        let or4 = _mm256_or_si256(or1, or2);
+        let or5 = _mm256_or_si256(or3, or4);
+        if _mm256_movemask_epi8(or5) != 0 {
+            let mut at = sub(ptr.add(VECTOR_SIZE), start_ptr);
+            let mask1 = _mm256_movemask_epi8(eqb1);
+            let mask2 = _mm256_movemask_epi8(eqb2);
+            let mask3 = _mm256_movemask_epi8(eqb3);
+            if mask1 != 0 || mask2 != 0 || mask3 != 0 {
+                return Some(at + reverse_pos3(mask1, mask2, mask3));
+            }
+
+            at -= VECTOR_SIZE;
+            let mask1 = _mm256_movemask_epi8(eqa1);
+            let mask2 = _mm256_movemask_epi8(eqa2);
+            let mask3 = _mm256_movemask_epi8(eqa3);
+            return Some(at + reverse_pos3(mask1, mask2, mask3));
+        }
+    }
+    while ptr >= start_ptr.add(VECTOR_SIZE) {
+        ptr = ptr.sub(VECTOR_SIZE);
+        if let Some(i) =
+            reverse_search3(start_ptr, end_ptr, ptr, vn1, vn2, vn3)
+        {
+            return Some(i);
+        }
+    }
+    if ptr > start_ptr {
+        debug_assert!(sub(ptr, start_ptr) < VECTOR_SIZE);
+        return reverse_search3(start_ptr, end_ptr, start_ptr, vn1, vn2, vn3);
+    }
+    None
+}
+
+#[target_feature(enable = "avx2")]
+unsafe fn forward_search1(
+    start_ptr: *const u8,
+    end_ptr: *const u8,
+    ptr: *const u8,
+    vn1: __m256i,
+) -> Option<usize> {
+    debug_assert!(sub(end_ptr, start_ptr) >= VECTOR_SIZE);
+    debug_assert!(start_ptr <= ptr);
+    debug_assert!(ptr <= end_ptr.sub(VECTOR_SIZE));
+
+    let chunk = _mm256_loadu_si256(ptr as *const __m256i);
+    let mask = _mm256_movemask_epi8(_mm256_cmpeq_epi8(chunk, vn1));
+    if mask != 0 {
+        Some(sub(ptr, start_ptr) + forward_pos(mask))
+    } else {
+        None
+    }
+}
+
+#[target_feature(enable = "avx2")]
+unsafe fn forward_search2(
+    start_ptr: *const u8,
+    end_ptr: *const u8,
+    ptr: *const u8,
+    vn1: __m256i,
+    vn2: __m256i,
+) -> Option<usize> {
+    debug_assert!(sub(end_ptr, start_ptr) >= VECTOR_SIZE);
+    debug_assert!(start_ptr <= ptr);
+    debug_assert!(ptr <= end_ptr.sub(VECTOR_SIZE));
+
+    let chunk = _mm256_loadu_si256(ptr as *const __m256i);
+    let eq1 = _mm256_cmpeq_epi8(chunk, vn1);
+    let eq2 = _mm256_cmpeq_epi8(chunk, vn2);
+    if _mm256_movemask_epi8(_mm256_or_si256(eq1, eq2)) != 0 {
+        let mask1 = _mm256_movemask_epi8(eq1);
+        let mask2 = _mm256_movemask_epi8(eq2);
+        Some(sub(ptr, start_ptr) + forward_pos2(mask1, mask2))
+    } else {
+        None
+    }
+}
+
+#[target_feature(enable = "avx2")]
+unsafe fn forward_search3(
+    start_ptr: *const u8,
+    end_ptr: *const u8,
+    ptr: *const u8,
+    vn1: __m256i,
+    vn2: __m256i,
+    vn3: __m256i,
+) -> Option<usize> {
+    debug_assert!(sub(end_ptr, start_ptr) >= VECTOR_SIZE);
+    debug_assert!(start_ptr <= ptr);
+    debug_assert!(ptr <= end_ptr.sub(VECTOR_SIZE));
+
+    let chunk = _mm256_loadu_si256(ptr as *const __m256i);
+    let eq1 = _mm256_cmpeq_epi8(chunk, vn1);
+    let eq2 = _mm256_cmpeq_epi8(chunk, vn2);
+    let eq3 = _mm256_cmpeq_epi8(chunk, vn3);
+    let or = _mm256_or_si256(eq1, eq2);
+    if _mm256_movemask_epi8(_mm256_or_si256(or, eq3)) != 0 {
+        let mask1 = _mm256_movemask_epi8(eq1);
+        let mask2 = _mm256_movemask_epi8(eq2);
+        let mask3 = _mm256_movemask_epi8(eq3);
+        Some(sub(ptr, start_ptr) + forward_pos3(mask1, mask2, mask3))
+    } else {
+        None
+    }
+}
+
+#[target_feature(enable = "avx2")]
+unsafe fn reverse_search1(
+    start_ptr: *const u8,
+    end_ptr: *const u8,
+    ptr: *const u8,
+    vn1: __m256i,
+) -> Option<usize> {
+    debug_assert!(sub(end_ptr, start_ptr) >= VECTOR_SIZE);
+    debug_assert!(start_ptr <= ptr);
+    debug_assert!(ptr <= end_ptr.sub(VECTOR_SIZE));
+
+    let chunk = _mm256_loadu_si256(ptr as *const __m256i);
+    let mask = _mm256_movemask_epi8(_mm256_cmpeq_epi8(vn1, chunk));
+    if mask != 0 {
+        Some(sub(ptr, start_ptr) + reverse_pos(mask))
+    } else {
+        None
+    }
+}
+
+#[target_feature(enable = "avx2")]
+unsafe fn reverse_search2(
+    start_ptr: *const u8,
+    end_ptr: *const u8,
+    ptr: *const u8,
+    vn1: __m256i,
+    vn2: __m256i,
+) -> Option<usize> {
+    debug_assert!(sub(end_ptr, start_ptr) >= VECTOR_SIZE);
+    debug_assert!(start_ptr <= ptr);
+    debug_assert!(ptr <= end_ptr.sub(VECTOR_SIZE));
+
+    let chunk = _mm256_loadu_si256(ptr as *const __m256i);
+    let eq1 = _mm256_cmpeq_epi8(chunk, vn1);
+    let eq2 = _mm256_cmpeq_epi8(chunk, vn2);
+    if _mm256_movemask_epi8(_mm256_or_si256(eq1, eq2)) != 0 {
+        let mask1 = _mm256_movemask_epi8(eq1);
+        let mask2 = _mm256_movemask_epi8(eq2);
+        Some(sub(ptr, start_ptr) + reverse_pos2(mask1, mask2))
+    } else {
+        None
+    }
+}
+
+#[target_feature(enable = "avx2")]
+unsafe fn reverse_search3(
+    start_ptr: *const u8,
+    end_ptr: *const u8,
+    ptr: *const u8,
+    vn1: __m256i,
+    vn2: __m256i,
+    vn3: __m256i,
+) -> Option<usize> {
+    debug_assert!(sub(end_ptr, start_ptr) >= VECTOR_SIZE);
+    debug_assert!(start_ptr <= ptr);
+    debug_assert!(ptr <= end_ptr.sub(VECTOR_SIZE));
+
+    let chunk = _mm256_loadu_si256(ptr as *const __m256i);
+    let eq1 = _mm256_cmpeq_epi8(chunk, vn1);
+    let eq2 = _mm256_cmpeq_epi8(chunk, vn2);
+    let eq3 = _mm256_cmpeq_epi8(chunk, vn3);
+    let or = _mm256_or_si256(eq1, eq2);
+    if _mm256_movemask_epi8(_mm256_or_si256(or, eq3)) != 0 {
+        let mask1 = _mm256_movemask_epi8(eq1);
+        let mask2 = _mm256_movemask_epi8(eq2);
+        let mask3 = _mm256_movemask_epi8(eq3);
+        Some(sub(ptr, start_ptr) + reverse_pos3(mask1, mask2, mask3))
+    } else {
+        None
+    }
+}
+
+/// Compute the position of the first matching byte from the given mask. The
+/// position returned is always in the range [0, 31].
+///
+/// The mask given is expected to be the result of _mm256_movemask_epi8.
+fn forward_pos(mask: i32) -> usize {
+    // We are dealing with little endian here, where the most significant byte
+    // is at a higher address. That means the least significant bit that is set
+    // corresponds to the position of our first matching byte. That position
+    // corresponds to the number of zeros after the least significant bit.
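+    //
+    // For example (illustrative): if only the byte at offset 3 of the 32 byte
+    // chunk matched, then mask == 0b1000, mask.trailing_zeros() == 3 and we
+    // report position 3.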
+    mask.trailing_zeros() as usize
+}
+
+/// Compute the position of the first matching byte from the given masks. The
+/// position returned is always in the range [0, 31]. Each mask corresponds to
+/// the equality comparison of a single byte.
+///
+/// The masks given are expected to be the result of _mm256_movemask_epi8,
+/// where at least one of the masks is non-zero (i.e., indicates a match).
+fn forward_pos2(mask1: i32, mask2: i32) -> usize {
+    debug_assert!(mask1 != 0 || mask2 != 0);
+
+    forward_pos(mask1 | mask2)
+}
+
+/// Compute the position of the first matching byte from the given masks. The
+/// position returned is always in the range [0, 31]. Each mask corresponds to
+/// the equality comparison of a single byte.
+///
+/// The masks given are expected to be the result of _mm256_movemask_epi8,
+/// where at least one of the masks is non-zero (i.e., indicates a match).
+fn forward_pos3(mask1: i32, mask2: i32, mask3: i32) -> usize {
+    debug_assert!(mask1 != 0 || mask2 != 0 || mask3 != 0);
+
+    forward_pos(mask1 | mask2 | mask3)
+}
+
+/// Compute the position of the last matching byte from the given mask. The
+/// position returned is always in the range [0, 31].
+///
+/// The mask given is expected to be the result of _mm256_movemask_epi8.
+fn reverse_pos(mask: i32) -> usize {
+    // We are dealing with little endian here, where the most significant byte
+    // is at a higher address. That means the most significant bit that is set
+    // corresponds to the position of our last matching byte. The position from
+    // the end of the mask is therefore the number of leading zeros in a 32
+    // bit integer, and the position from the start of the mask is therefore
+    // 32 - (leading zeros) - 1.
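+    //
+    // For example (illustrative): if the last matching byte sits at offset 5,
+    // then bit 5 is the highest set bit, (mask as u32).leading_zeros() == 26
+    // and 32 - 26 - 1 == 5.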
+    VECTOR_SIZE - (mask as u32).leading_zeros() as usize - 1
+}
+
+/// Compute the position of the last matching byte from the given masks. The
+/// position returned is always in the range [0, 31]. Each mask corresponds to
+/// the equality comparison of a single byte.
+///
+/// The masks given are expected to be the result of _mm256_movemask_epi8,
+/// where at least one of the masks is non-zero (i.e., indicates a match).
+fn reverse_pos2(mask1: i32, mask2: i32) -> usize {
+    debug_assert!(mask1 != 0 || mask2 != 0);
+
+    reverse_pos(mask1 | mask2)
+}
+
+/// Compute the position of the last matching byte from the given masks. The
+/// position returned is always in the range [0, 31]. Each mask corresponds to
+/// the equality comparison of a single byte.
+///
+/// The masks given are expected to be the result of _mm256_movemask_epi8,
+/// where at least one of the masks is non-zero (i.e., indicates a match).
+fn reverse_pos3(mask1: i32, mask2: i32, mask3: i32) -> usize {
+    debug_assert!(mask1 != 0 || mask2 != 0 || mask3 != 0);
+
+    reverse_pos(mask1 | mask2 | mask3)
+}
+
+/// Subtract `b` from `a` and return the difference. `a` should be greater than
+/// or equal to `b`.
+fn sub(a: *const u8, b: *const u8) -> usize {
+    debug_assert!(a >= b);
+    (a as usize) - (b as usize)
+}
diff --git a/src/x86/mod.rs b/src/x86/mod.rs
new file mode 100644
index 0000000..855dc8b
--- /dev/null
+++ b/src/x86/mod.rs
@@ -0,0 +1,119 @@
+use fallback;
+
+// We only use AVX when we can detect at runtime whether it's available, which
+// requires std.
+#[cfg(feature = "std")]
+mod avx;
+mod sse2;
+
+// This macro employs a gcc-like "ifunc" trick whereby, upon first calling
+// `memchr` (for example), CPU feature detection will be performed at runtime
+// to determine the best implementation to use. After CPU feature detection
+// is done, we replace `memchr`'s function pointer with the selection. Upon
+// subsequent invocations, the CPU-specific routine is invoked directly, which
+// skips the CPU feature detection and subsequent branch that's required.
+//
+// While this typically doesn't matter for rare occurrences or when used on
+// larger haystacks, `memchr` can be called in tight loops where the overhead
+// of this branch can actually add up *and is measurable*. This trick was
+// necessary to bring this implementation up to glibc's speeds for the 'tiny'
+// benchmarks, for example.
+//
+// At some point, I expect the Rust ecosystem will get a nice macro for doing
+// exactly this, at which point, we can replace our hand-jammed version of it.
+//
+// N.B. The ifunc strategy does prevent function inlining of course, but on
+// modern CPUs, you'll probably end up with the AVX2 implementation, which
+// probably can't be inlined anyway---unless you've compiled your entire
+// program with AVX2 enabled. However, even then, the various memchr
+// implementations aren't exactly small, so inlining might not help anyway!
+#[cfg(feature = "std")]
+macro_rules! ifunc {
+    ($fnty:ty, $name:ident, $haystack:ident, $($needle:ident),+) => {{
+        use std::mem;
+        use std::sync::atomic::{AtomicPtr, Ordering};
+
+        type FnRaw = *mut ();
+
+        static FN: AtomicPtr<()> = AtomicPtr::new(detect as FnRaw);
+
+        fn detect($($needle: u8),+, haystack: &[u8]) -> Option<usize> {
+            let fun =
+                if cfg!(memchr_runtime_avx) && is_x86_feature_detected!("avx2") {
+                    avx::$name as FnRaw
+                } else if cfg!(memchr_runtime_sse2) {
+                    sse2::$name as FnRaw
+                } else {
+                    fallback::$name as FnRaw
+                };
+            FN.store(fun as FnRaw, Ordering::Relaxed);
+            unsafe {
+                mem::transmute::<FnRaw, $fnty>(fun)($($needle),+, haystack)
+            }
+        }
+
+        unsafe {
+            let fun = FN.load(Ordering::Relaxed);
+            mem::transmute::<FnRaw, $fnty>(fun)($($needle),+, $haystack)
+        }
+    }}
+}
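+
+// Roughly, a call like `memchr(n1, haystack)` goes through the macro above as
+// follows (an illustrative sketch, not the literal expansion): the first call
+// lands in `detect`, which picks `avx::memchr`, `sse2::memchr` or
+// `fallback::memchr`, stores that pointer in `FN` and then invokes it; every
+// subsequent call loads `FN` and transmutes it back to
+// `fn(u8, &[u8]) -> Option<usize>` before calling it directly.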
+
+// When std isn't available to provide runtime CPU feature detection, or if
+// runtime CPU feature detection has been explicitly disabled, then just call
+// our optimized SSE2 routine directly. SSE2 is available on all x86_64 targets,
+// so no CPU feature detection is necessary.
+#[cfg(not(feature = "std"))]
+macro_rules! ifunc {
+    ($fnty:ty, $name:ident, $haystack:ident, $($needle:ident),+) => {{
+        if cfg!(memchr_runtime_sse2) {
+            unsafe { sse2::$name($($needle),+, $haystack) }
+        } else {
+            fallback::$name($($needle),+, $haystack)
+        }
+    }}
+}
+
+#[inline(always)]
+pub fn memchr(n1: u8, haystack: &[u8]) -> Option<usize> {
+    ifunc!(fn(u8, &[u8]) -> Option<usize>, memchr, haystack, n1)
+}
+
+#[inline(always)]
+pub fn memchr2(n1: u8, n2: u8, haystack: &[u8]) -> Option<usize> {
+    ifunc!(fn(u8, u8, &[u8]) -> Option<usize>, memchr2, haystack, n1, n2)
+}
+
+#[inline(always)]
+pub fn memchr3(n1: u8, n2: u8, n3: u8, haystack: &[u8]) -> Option<usize> {
+    ifunc!(
+        fn(u8, u8, u8, &[u8]) -> Option<usize>,
+        memchr3,
+        haystack,
+        n1,
+        n2,
+        n3
+    )
+}
+
+#[inline(always)]
+pub fn memrchr(n1: u8, haystack: &[u8]) -> Option<usize> {
+    ifunc!(fn(u8, &[u8]) -> Option<usize>, memrchr, haystack, n1)
+}
+
+#[inline(always)]
+pub fn memrchr2(n1: u8, n2: u8, haystack: &[u8]) -> Option<usize> {
+    ifunc!(fn(u8, u8, &[u8]) -> Option<usize>, memrchr2, haystack, n1, n2)
+}
+
+#[inline(always)]
+pub fn memrchr3(n1: u8, n2: u8, n3: u8, haystack: &[u8]) -> Option<usize> {
+    ifunc!(
+        fn(u8, u8, u8, &[u8]) -> Option<usize>,
+        memrchr3,
+        haystack,
+        n1,
+        n2,
+        n3
+    )
+}
diff --git a/src/x86/sse2.rs b/src/x86/sse2.rs
new file mode 100644
index 0000000..76f5a78
--- /dev/null
+++ b/src/x86/sse2.rs
@@ -0,0 +1,793 @@
+use core::arch::x86_64::*;
+use core::cmp;
+use core::mem::size_of;
+
+const VECTOR_SIZE: usize = size_of::<__m128i>();
+const VECTOR_ALIGN: usize = VECTOR_SIZE - 1;
+
+// The number of bytes to loop at in one iteration of memchr/memrchr.
+const LOOP_SIZE: usize = 4 * VECTOR_SIZE;
+
+// The number of bytes to loop at in one iteration of memchr2/memrchr2 and
+// memchr3/memrchr3. There was no observable difference between 64 and 32 bytes
+// in benchmarks. memchr3 in particular only gets a very slight speed up from
+// the loop unrolling.
+const LOOP_SIZE2: usize = 2 * VECTOR_SIZE;
+
+#[target_feature(enable = "sse2")]
+pub unsafe fn memchr(n1: u8, haystack: &[u8]) -> Option<usize> {
+    // What follows is a fast SSE2-only algorithm to detect the position of
+    // `n1` in `haystack` if it exists. From what I know, this is the "classic"
+    // algorithm. I believe it can be found in places like glibc and Go's
+    // standard library. It appears to be well known and is elaborated on in
+    // more detail here: https://gms.tf/stdfind-and-memchr-optimizations.html
+    //
+    // While this routine is very long, the basic idea is actually very simple
+    // and can be expressed straightforwardly in pseudocode:
+    //
+    //     needle = (n1 << 15) | (n1 << 14) | ... | (n1 << 1) | n1
+    //     // Note: shift amount in bytes
+    //
+    //     while i <= haystack.len() - 16:
+    //       // A 16 byte vector. Each byte in chunk corresponds to a byte in
+    //       // the haystack.
+    //       chunk = haystack[i:i+16]
+    //       // Compare bytes in needle with bytes in chunk. The result is a 16
+    //       // byte chunk where each byte is 0xFF if the corresponding bytes
+    //       // in needle and chunk were equal, or 0x00 otherwise.
+    //       eqs = cmpeq(needle, chunk)
+    //       // Return a 32 bit integer where the most significant 16 bits
+    //       // are always 0 and the lower 16 bits correspond to whether the
+    //       // most significant bit in the corresponding byte in `eqs` is set.
+    //       // In other words, `mask as u16` has bit i set if and only if
+    //       // needle[i] == chunk[i].
+    //       mask = movemask(eqs)
+    //
+    //       // Mask is 0 if there is no match, and non-zero otherwise.
+    //       if mask != 0:
+    //         // trailing_zeros tells us the position of the least significant
+    //         // bit that is set.
+    //         return i + trailing_zeros(mask)
+    //
+    //     // haystack length may not be a multiple of 16, so search the rest.
+    //     while i < haystack.len():
+    //       if haystack[i] == n1:
+    //         return i
+    //
+    //     // No match found.
+    //     return NULL
+    //
+    // In fact, we could loosely translate the above code to Rust line-for-line
+    // and it would be a pretty fast algorithm. But, we pull out all the stops
+    // to go as fast as possible:
+    //
+    // 1. We use aligned loads. That is, we do some finagling to make sure our
+    //    primary loop not only proceeds in increments of 16 bytes, but that
+    //    the address of haystack's pointer that we dereference is aligned to
+    //    16 bytes. 16 is a magic number here because it is the size of SSE2
+    //    128-bit vector. (For the AVX2 algorithm, 32 is the magic number.)
+    //    Therefore, to get aligned loads, our pointer's address must be evenly
+    //    divisible by 16.
+    // 2. Our primary loop proceeds 64 bytes at a time instead of 16. It's
+    //    kind of like loop unrolling, but we combine the equality comparisons
+    //    using a vector OR such that we only need to extract a single mask to
+    //    determine whether a match exists or not. If so, then we do some
+    //    book-keeping to determine the precise location but otherwise mush on.
+    // 3. We use our "chunk" comparison routine in as many places as possible,
+    //    even if it means using unaligned loads. In particular, if haystack
+    //    starts with an unaligned address, then we do an unaligned load to
+    //    search the first 16 bytes. We then start our primary loop at the
+    //    smallest subsequent aligned address, which will actually overlap with
+    //    previously searched bytes. But we're OK with that. We do a similar
+    //    dance at the end of our primary loop. Finally, to avoid a
+    //    byte-at-a-time loop at the end, we do a final 16 byte unaligned load
+    //    that may overlap with a previous load. This is OK because it converts
+    //    a loop into a small number of very fast vector instructions.
+    //
+    // The primary downside of this algorithm is that it's effectively
+    // completely unsafe. Therefore, we have to be super careful to avoid
+    // undefined behavior:
+    //
+    // 1. We use raw pointers everywhere. Not only does dereferencing a pointer
+    //    require the pointer to be valid, but we actually can't even store the
+    //    address of an invalid pointer (unless it's 1 past the end of
+    //    haystack) without sacrificing performance.
+    // 2. _mm_loadu_si128 is used when you don't care about alignment, and
+    //    _mm_load_si128 is used when you do care. You cannot use the latter
+    //    on unaligned pointers.
+    // 3. We make liberal use of debug_assert! to check assumptions.
+    // 4. We make a concerted effort to stick with pointers instead of indices.
+    //    Indices are nicer because there's less to worry about with them (see
+    //    above about pointer offsets), but I could not get the compiler to
+    //    produce code as good as what the code below produces. In any case,
+    //    pointers are what we really care about here, and alignment is
+    //    expressed a bit more naturally with them.
+    //
+    // In general, most of the algorithms in this crate have a similar
+    // structure to what you see below, so this comment applies fairly well to
+    // all of them.
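+    //
+    // A concrete (illustrative) example of the alignment dance in (1) and
+    // (3): if `start_ptr` were 0x1003, then
+    // `VECTOR_SIZE - (start_ptr as usize & VECTOR_ALIGN)` is 16 - 3 = 13, so
+    // the aligned loop starts at 0x1010. The initial unaligned load already
+    // covered 0x1003..0x1013, so the first aligned iteration re-scans three
+    // bytes, which is harmless since we only report the first match found.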
+
+    let vn1 = _mm_set1_epi8(n1 as i8);
+    let len = haystack.len();
+    let loop_size = cmp::min(LOOP_SIZE, len);
+    let start_ptr = haystack.as_ptr();
+    let end_ptr = haystack[haystack.len()..].as_ptr();
+    let mut ptr = start_ptr;
+
+    if haystack.len() < VECTOR_SIZE {
+        while ptr < end_ptr {
+            if *ptr == n1 {
+                return Some(sub(ptr, start_ptr));
+            }
+            ptr = ptr.offset(1);
+        }
+        return None;
+    }
+
+    if let Some(i) = forward_search1(start_ptr, end_ptr, ptr, vn1) {
+        return Some(i);
+    }
+
+    ptr = ptr.add(VECTOR_SIZE - (start_ptr as usize & VECTOR_ALIGN));
+    debug_assert!(ptr > start_ptr && end_ptr.sub(VECTOR_SIZE) >= start_ptr);
+    while loop_size == LOOP_SIZE && ptr <= end_ptr.sub(loop_size) {
+        debug_assert_eq!(0, (ptr as usize) % VECTOR_SIZE);
+
+        let a = _mm_load_si128(ptr as *const __m128i);
+        let b = _mm_load_si128(ptr.add(VECTOR_SIZE) as *const __m128i);
+        let c = _mm_load_si128(ptr.add(2 * VECTOR_SIZE) as *const __m128i);
+        let d = _mm_load_si128(ptr.add(3 * VECTOR_SIZE) as *const __m128i);
+        let eqa = _mm_cmpeq_epi8(vn1, a);
+        let eqb = _mm_cmpeq_epi8(vn1, b);
+        let eqc = _mm_cmpeq_epi8(vn1, c);
+        let eqd = _mm_cmpeq_epi8(vn1, d);
+        let or1 = _mm_or_si128(eqa, eqb);
+        let or2 = _mm_or_si128(eqc, eqd);
+        let or3 = _mm_or_si128(or1, or2);
+        if _mm_movemask_epi8(or3) != 0 {
+            let mut at = sub(ptr, start_ptr);
+            let mask = _mm_movemask_epi8(eqa);
+            if mask != 0 {
+                return Some(at + forward_pos(mask));
+            }
+
+            at += VECTOR_SIZE;
+            let mask = _mm_movemask_epi8(eqb);
+            if mask != 0 {
+                return Some(at + forward_pos(mask));
+            }
+
+            at += VECTOR_SIZE;
+            let mask = _mm_movemask_epi8(eqc);
+            if mask != 0 {
+                return Some(at + forward_pos(mask));
+            }
+
+            at += VECTOR_SIZE;
+            let mask = _mm_movemask_epi8(eqd);
+            debug_assert!(mask != 0);
+            return Some(at + forward_pos(mask));
+        }
+        ptr = ptr.add(loop_size);
+    }
+    while ptr <= end_ptr.sub(VECTOR_SIZE) {
+        debug_assert!(sub(end_ptr, ptr) >= VECTOR_SIZE);
+
+        if let Some(i) = forward_search1(start_ptr, end_ptr, ptr, vn1) {
+            return Some(i);
+        }
+        ptr = ptr.add(VECTOR_SIZE);
+    }
+    if ptr < end_ptr {
+        debug_assert!(sub(end_ptr, ptr) < VECTOR_SIZE);
+        ptr = ptr.sub(VECTOR_SIZE - sub(end_ptr, ptr));
+        debug_assert_eq!(sub(end_ptr, ptr), VECTOR_SIZE);
+
+        return forward_search1(start_ptr, end_ptr, ptr, vn1);
+    }
+    None
+}
+
+#[target_feature(enable = "sse2")]
+pub unsafe fn memchr2(n1: u8, n2: u8, haystack: &[u8]) -> Option<usize> {
+    let vn1 = _mm_set1_epi8(n1 as i8);
+    let vn2 = _mm_set1_epi8(n2 as i8);
+    let len = haystack.len();
+    let loop_size = cmp::min(LOOP_SIZE2, len);
+    let start_ptr = haystack.as_ptr();
+    let end_ptr = haystack[haystack.len()..].as_ptr();
+    let mut ptr = start_ptr;
+
+    if haystack.len() < VECTOR_SIZE {
+        while ptr < end_ptr {
+            if *ptr == n1 || *ptr == n2 {
+                return Some(sub(ptr, start_ptr));
+            }
+            ptr = ptr.offset(1);
+        }
+        return None;
+    }
+
+    if let Some(i) = forward_search2(start_ptr, end_ptr, ptr, vn1, vn2) {
+        return Some(i);
+    }
+
+    ptr = ptr.add(VECTOR_SIZE - (start_ptr as usize & VECTOR_ALIGN));
+    debug_assert!(ptr > start_ptr && end_ptr.sub(VECTOR_SIZE) >= start_ptr);
+    while loop_size == LOOP_SIZE2 && ptr <= end_ptr.sub(loop_size) {
+        debug_assert_eq!(0, (ptr as usize) % VECTOR_SIZE);
+
+        let a = _mm_load_si128(ptr as *const __m128i);
+        let b = _mm_load_si128(ptr.add(VECTOR_SIZE) as *const __m128i);
+        let eqa1 = _mm_cmpeq_epi8(vn1, a);
+        let eqb1 = _mm_cmpeq_epi8(vn1, b);
+        let eqa2 = _mm_cmpeq_epi8(vn2, a);
+        let eqb2 = _mm_cmpeq_epi8(vn2, b);
+        let or1 = _mm_or_si128(eqa1, eqb1);
+        let or2 = _mm_or_si128(eqa2, eqb2);
+        let or3 = _mm_or_si128(or1, or2);
+        if _mm_movemask_epi8(or3) != 0 {
+            let mut at = sub(ptr, start_ptr);
+            let mask1 = _mm_movemask_epi8(eqa1);
+            let mask2 = _mm_movemask_epi8(eqa2);
+            if mask1 != 0 || mask2 != 0 {
+                return Some(at + forward_pos2(mask1, mask2));
+            }
+
+            at += VECTOR_SIZE;
+            let mask1 = _mm_movemask_epi8(eqb1);
+            let mask2 = _mm_movemask_epi8(eqb2);
+            return Some(at + forward_pos2(mask1, mask2));
+        }
+        ptr = ptr.add(loop_size);
+    }
+    while ptr <= end_ptr.sub(VECTOR_SIZE) {
+        if let Some(i) = forward_search2(start_ptr, end_ptr, ptr, vn1, vn2) {
+            return Some(i);
+        }
+        ptr = ptr.add(VECTOR_SIZE);
+    }
+    if ptr < end_ptr {
+        debug_assert!(sub(end_ptr, ptr) < VECTOR_SIZE);
+        ptr = ptr.sub(VECTOR_SIZE - sub(end_ptr, ptr));
+        debug_assert_eq!(sub(end_ptr, ptr), VECTOR_SIZE);
+
+        return forward_search2(start_ptr, end_ptr, ptr, vn1, vn2);
+    }
+    None
+}
+
+#[target_feature(enable = "sse2")]
+pub unsafe fn memchr3(
+    n1: u8,
+    n2: u8,
+    n3: u8,
+    haystack: &[u8],
+) -> Option<usize> {
+    let vn1 = _mm_set1_epi8(n1 as i8);
+    let vn2 = _mm_set1_epi8(n2 as i8);
+    let vn3 = _mm_set1_epi8(n3 as i8);
+    let len = haystack.len();
+    let loop_size = cmp::min(LOOP_SIZE2, len);
+    let start_ptr = haystack.as_ptr();
+    let end_ptr = haystack[haystack.len()..].as_ptr();
+    let mut ptr = start_ptr;
+
+    if haystack.len() < VECTOR_SIZE {
+        while ptr < end_ptr {
+            if *ptr == n1 || *ptr == n2 || *ptr == n3 {
+                return Some(sub(ptr, start_ptr));
+            }
+            ptr = ptr.offset(1);
+        }
+        return None;
+    }
+
+    if let Some(i) = forward_search3(start_ptr, end_ptr, ptr, vn1, vn2, vn3) {
+        return Some(i);
+    }
+
+    ptr = ptr.add(VECTOR_SIZE - (start_ptr as usize & VECTOR_ALIGN));
+    debug_assert!(ptr > start_ptr && end_ptr.sub(VECTOR_SIZE) >= start_ptr);
+    while loop_size == LOOP_SIZE2 && ptr <= end_ptr.sub(loop_size) {
+        debug_assert_eq!(0, (ptr as usize) % VECTOR_SIZE);
+
+        let a = _mm_load_si128(ptr as *const __m128i);
+        let b = _mm_load_si128(ptr.add(VECTOR_SIZE) as *const __m128i);
+        let eqa1 = _mm_cmpeq_epi8(vn1, a);
+        let eqb1 = _mm_cmpeq_epi8(vn1, b);
+        let eqa2 = _mm_cmpeq_epi8(vn2, a);
+        let eqb2 = _mm_cmpeq_epi8(vn2, b);
+        let eqa3 = _mm_cmpeq_epi8(vn3, a);
+        let eqb3 = _mm_cmpeq_epi8(vn3, b);
+        let or1 = _mm_or_si128(eqa1, eqb1);
+        let or2 = _mm_or_si128(eqa2, eqb2);
+        let or3 = _mm_or_si128(eqa3, eqb3);
+        let or4 = _mm_or_si128(or1, or2);
+        let or5 = _mm_or_si128(or3, or4);
+        if _mm_movemask_epi8(or5) != 0 {
+            let mut at = sub(ptr, start_ptr);
+            let mask1 = _mm_movemask_epi8(eqa1);
+            let mask2 = _mm_movemask_epi8(eqa2);
+            let mask3 = _mm_movemask_epi8(eqa3);
+            if mask1 != 0 || mask2 != 0 || mask3 != 0 {
+                return Some(at + forward_pos3(mask1, mask2, mask3));
+            }
+
+            at += VECTOR_SIZE;
+            let mask1 = _mm_movemask_epi8(eqb1);
+            let mask2 = _mm_movemask_epi8(eqb2);
+            let mask3 = _mm_movemask_epi8(eqb3);
+            return Some(at + forward_pos3(mask1, mask2, mask3));
+        }
+        ptr = ptr.add(loop_size);
+    }
+    while ptr <= end_ptr.sub(VECTOR_SIZE) {
+        if let Some(i) =
+            forward_search3(start_ptr, end_ptr, ptr, vn1, vn2, vn3)
+        {
+            return Some(i);
+        }
+        ptr = ptr.add(VECTOR_SIZE);
+    }
+    if ptr < end_ptr {
+        debug_assert!(sub(end_ptr, ptr) < VECTOR_SIZE);
+        ptr = ptr.sub(VECTOR_SIZE - sub(end_ptr, ptr));
+        debug_assert_eq!(sub(end_ptr, ptr), VECTOR_SIZE);
+
+        return forward_search3(start_ptr, end_ptr, ptr, vn1, vn2, vn3);
+    }
+    None
+}
+
+#[target_feature(enable = "sse2")]
+pub unsafe fn memrchr(n1: u8, haystack: &[u8]) -> Option<usize> {
+    let vn1 = _mm_set1_epi8(n1 as i8);
+    let len = haystack.len();
+    let loop_size = cmp::min(LOOP_SIZE, len);
+    let start_ptr = haystack.as_ptr();
+    let end_ptr = haystack[haystack.len()..].as_ptr();
+    let mut ptr = end_ptr;
+
+    if haystack.len() < VECTOR_SIZE {
+        while ptr > start_ptr {
+            ptr = ptr.offset(-1);
+            if *ptr == n1 {
+                return Some(sub(ptr, start_ptr));
+            }
+        }
+        return None;
+    }
+
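+    // Search the last VECTOR_SIZE bytes with one unaligned load before the
+    // main loop below switches to aligned loads.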
+    ptr = ptr.sub(VECTOR_SIZE);
+    if let Some(i) = reverse_search1(start_ptr, end_ptr, ptr, vn1) {
+        return Some(i);
+    }
+
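+    // Round `end_ptr` down to the previous 16-byte boundary; the bytes from
+    // that boundary up to `end_ptr` were just covered above.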
+    ptr = (end_ptr as usize & !VECTOR_ALIGN) as *const u8;
+    debug_assert!(start_ptr <= ptr && ptr <= end_ptr);
+    while loop_size == LOOP_SIZE && ptr >= start_ptr.add(loop_size) {
+        debug_assert_eq!(0, (ptr as usize) % VECTOR_SIZE);
+
+        ptr = ptr.sub(loop_size);
+        let a = _mm_load_si128(ptr as *const __m128i);
+        let b = _mm_load_si128(ptr.add(VECTOR_SIZE) as *const __m128i);
+        let c = _mm_load_si128(ptr.add(2 * VECTOR_SIZE) as *const __m128i);
+        let d = _mm_load_si128(ptr.add(3 * VECTOR_SIZE) as *const __m128i);
+        let eqa = _mm_cmpeq_epi8(vn1, a);
+        let eqb = _mm_cmpeq_epi8(vn1, b);
+        let eqc = _mm_cmpeq_epi8(vn1, c);
+        let eqd = _mm_cmpeq_epi8(vn1, d);
+        let or1 = _mm_or_si128(eqa, eqb);
+        let or2 = _mm_or_si128(eqc, eqd);
+        let or3 = _mm_or_si128(or1, or2);
+        if _mm_movemask_epi8(or3) != 0 {
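+            // At least one of the four 16-byte chunks contains a match.
+            // Check their masks from the last chunk (d) back to the first
+            // (a) so that the rightmost match is the one reported.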
+            let mut at = sub(ptr.add(3 * VECTOR_SIZE), start_ptr);
+            let mask = _mm_movemask_epi8(eqd);
+            if mask != 0 {
+                return Some(at + reverse_pos(mask));
+            }
+
+            at -= VECTOR_SIZE;
+            let mask = _mm_movemask_epi8(eqc);
+            if mask != 0 {
+                return Some(at + reverse_pos(mask));
+            }
+
+            at -= VECTOR_SIZE;
+            let mask = _mm_movemask_epi8(eqb);
+            if mask != 0 {
+                return Some(at + reverse_pos(mask));
+            }
+
+            at -= VECTOR_SIZE;
+            let mask = _mm_movemask_epi8(eqa);
+            debug_assert!(mask != 0);
+            return Some(at + reverse_pos(mask));
+        }
+    }
+    while ptr >= start_ptr.add(VECTOR_SIZE) {
+        ptr = ptr.sub(VECTOR_SIZE);
+        if let Some(i) = reverse_search1(start_ptr, end_ptr, ptr, vn1) {
+            return Some(i);
+        }
+    }
+    if ptr > start_ptr {
+        debug_assert!(sub(ptr, start_ptr) < VECTOR_SIZE);
+        return reverse_search1(start_ptr, end_ptr, start_ptr, vn1);
+    }
+    None
+}
+
+#[target_feature(enable = "sse2")]
+pub unsafe fn memrchr2(n1: u8, n2: u8, haystack: &[u8]) -> Option<usize> {
+    let vn1 = _mm_set1_epi8(n1 as i8);
+    let vn2 = _mm_set1_epi8(n2 as i8);
+    let len = haystack.len();
+    let loop_size = cmp::min(LOOP_SIZE2, len);
+    let start_ptr = haystack.as_ptr();
+    let end_ptr = haystack[haystack.len()..].as_ptr();
+    let mut ptr = end_ptr;
+
+    if haystack.len() < VECTOR_SIZE {
+        while ptr > start_ptr {
+            ptr = ptr.offset(-1);
+            if *ptr == n1 || *ptr == n2 {
+                return Some(sub(ptr, start_ptr));
+            }
+        }
+        return None;
+    }
+
+    ptr = ptr.sub(VECTOR_SIZE);
+    if let Some(i) = reverse_search2(start_ptr, end_ptr, ptr, vn1, vn2) {
+        return Some(i);
+    }
+
+    ptr = (end_ptr as usize & !VECTOR_ALIGN) as *const u8;
+    debug_assert!(start_ptr <= ptr && ptr <= end_ptr);
+    while loop_size == LOOP_SIZE2 && ptr >= start_ptr.add(loop_size) {
+        debug_assert_eq!(0, (ptr as usize) % VECTOR_SIZE);
+
+        ptr = ptr.sub(loop_size);
+        let a = _mm_load_si128(ptr as *const __m128i);
+        let b = _mm_load_si128(ptr.add(VECTOR_SIZE) as *const __m128i);
+        let eqa1 = _mm_cmpeq_epi8(vn1, a);
+        let eqb1 = _mm_cmpeq_epi8(vn1, b);
+        let eqa2 = _mm_cmpeq_epi8(vn2, a);
+        let eqb2 = _mm_cmpeq_epi8(vn2, b);
+        let or1 = _mm_or_si128(eqa1, eqb1);
+        let or2 = _mm_or_si128(eqa2, eqb2);
+        let or3 = _mm_or_si128(or1, or2);
+        if _mm_movemask_epi8(or3) != 0 {
+            let mut at = sub(ptr.add(VECTOR_SIZE), start_ptr);
+            let mask1 = _mm_movemask_epi8(eqb1);
+            let mask2 = _mm_movemask_epi8(eqb2);
+            if mask1 != 0 || mask2 != 0 {
+                return Some(at + reverse_pos2(mask1, mask2));
+            }
+
+            at -= VECTOR_SIZE;
+            let mask1 = _mm_movemask_epi8(eqa1);
+            let mask2 = _mm_movemask_epi8(eqa2);
+            return Some(at + reverse_pos2(mask1, mask2));
+        }
+    }
+    while ptr >= start_ptr.add(VECTOR_SIZE) {
+        ptr = ptr.sub(VECTOR_SIZE);
+        if let Some(i) = reverse_search2(start_ptr, end_ptr, ptr, vn1, vn2) {
+            return Some(i);
+        }
+    }
+    if ptr > start_ptr {
+        debug_assert!(sub(ptr, start_ptr) < VECTOR_SIZE);
+        return reverse_search2(start_ptr, end_ptr, start_ptr, vn1, vn2);
+    }
+    None
+}
+
+#[target_feature(enable = "sse2")]
+pub unsafe fn memrchr3(
+    n1: u8,
+    n2: u8,
+    n3: u8,
+    haystack: &[u8],
+) -> Option<usize> {
+    let vn1 = _mm_set1_epi8(n1 as i8);
+    let vn2 = _mm_set1_epi8(n2 as i8);
+    let vn3 = _mm_set1_epi8(n3 as i8);
+    let len = haystack.len();
+    let loop_size = cmp::min(LOOP_SIZE2, len);
+    let start_ptr = haystack.as_ptr();
+    let end_ptr = haystack[haystack.len()..].as_ptr();
+    let mut ptr = end_ptr;
+
+    if haystack.len() < VECTOR_SIZE {
+        while ptr > start_ptr {
+            ptr = ptr.offset(-1);
+            if *ptr == n1 || *ptr == n2 || *ptr == n3 {
+                return Some(sub(ptr, start_ptr));
+            }
+        }
+        return None;
+    }
+
+    ptr = ptr.sub(VECTOR_SIZE);
+    if let Some(i) = reverse_search3(start_ptr, end_ptr, ptr, vn1, vn2, vn3) {
+        return Some(i);
+    }
+
+    ptr = (end_ptr as usize & !VECTOR_ALIGN) as *const u8;
+    debug_assert!(start_ptr <= ptr && ptr <= end_ptr);
+    while loop_size == LOOP_SIZE2 && ptr >= start_ptr.add(loop_size) {
+        debug_assert_eq!(0, (ptr as usize) % VECTOR_SIZE);
+
+        ptr = ptr.sub(loop_size);
+        let a = _mm_load_si128(ptr as *const __m128i);
+        let b = _mm_load_si128(ptr.add(VECTOR_SIZE) as *const __m128i);
+        let eqa1 = _mm_cmpeq_epi8(vn1, a);
+        let eqb1 = _mm_cmpeq_epi8(vn1, b);
+        let eqa2 = _mm_cmpeq_epi8(vn2, a);
+        let eqb2 = _mm_cmpeq_epi8(vn2, b);
+        let eqa3 = _mm_cmpeq_epi8(vn3, a);
+        let eqb3 = _mm_cmpeq_epi8(vn3, b);
+        let or1 = _mm_or_si128(eqa1, eqb1);
+        let or2 = _mm_or_si128(eqa2, eqb2);
+        let or3 = _mm_or_si128(eqa3, eqb3);
+        let or4 = _mm_or_si128(or1, or2);
+        let or5 = _mm_or_si128(or3, or4);
+        if _mm_movemask_epi8(or5) != 0 {
+            let mut at = sub(ptr.add(VECTOR_SIZE), start_ptr);
+            let mask1 = _mm_movemask_epi8(eqb1);
+            let mask2 = _mm_movemask_epi8(eqb2);
+            let mask3 = _mm_movemask_epi8(eqb3);
+            if mask1 != 0 || mask2 != 0 || mask3 != 0 {
+                return Some(at + reverse_pos3(mask1, mask2, mask3));
+            }
+
+            at -= VECTOR_SIZE;
+            let mask1 = _mm_movemask_epi8(eqa1);
+            let mask2 = _mm_movemask_epi8(eqa2);
+            let mask3 = _mm_movemask_epi8(eqa3);
+            return Some(at + reverse_pos3(mask1, mask2, mask3));
+        }
+    }
+    while ptr >= start_ptr.add(VECTOR_SIZE) {
+        ptr = ptr.sub(VECTOR_SIZE);
+        if let Some(i) =
+            reverse_search3(start_ptr, end_ptr, ptr, vn1, vn2, vn3)
+        {
+            return Some(i);
+        }
+    }
+    if ptr > start_ptr {
+        debug_assert!(sub(ptr, start_ptr) < VECTOR_SIZE);
+        return reverse_search3(start_ptr, end_ptr, start_ptr, vn1, vn2, vn3);
+    }
+    None
+}
+
+#[target_feature(enable = "sse2")]
+pub unsafe fn forward_search1(
+    start_ptr: *const u8,
+    end_ptr: *const u8,
+    ptr: *const u8,
+    vn1: __m128i,
+) -> Option<usize> {
+    debug_assert!(sub(end_ptr, start_ptr) >= VECTOR_SIZE);
+    debug_assert!(start_ptr <= ptr);
+    debug_assert!(ptr <= end_ptr.sub(VECTOR_SIZE));
+
+    let chunk = _mm_loadu_si128(ptr as *const __m128i);
+    let mask = _mm_movemask_epi8(_mm_cmpeq_epi8(chunk, vn1));
+    if mask != 0 {
+        Some(sub(ptr, start_ptr) + forward_pos(mask))
+    } else {
+        None
+    }
+}
+
+#[target_feature(enable = "sse2")]
+unsafe fn forward_search2(
+    start_ptr: *const u8,
+    end_ptr: *const u8,
+    ptr: *const u8,
+    vn1: __m128i,
+    vn2: __m128i,
+) -> Option<usize> {
+    debug_assert!(sub(end_ptr, start_ptr) >= VECTOR_SIZE);
+    debug_assert!(start_ptr <= ptr);
+    debug_assert!(ptr <= end_ptr.sub(VECTOR_SIZE));
+
+    let chunk = _mm_loadu_si128(ptr as *const __m128i);
+    let eq1 = _mm_cmpeq_epi8(chunk, vn1);
+    let eq2 = _mm_cmpeq_epi8(chunk, vn2);
+    if _mm_movemask_epi8(_mm_or_si128(eq1, eq2)) != 0 {
+        let mask1 = _mm_movemask_epi8(eq1);
+        let mask2 = _mm_movemask_epi8(eq2);
+        Some(sub(ptr, start_ptr) + forward_pos2(mask1, mask2))
+    } else {
+        None
+    }
+}
+
+#[target_feature(enable = "sse2")]
+pub unsafe fn forward_search3(
+    start_ptr: *const u8,
+    end_ptr: *const u8,
+    ptr: *const u8,
+    vn1: __m128i,
+    vn2: __m128i,
+    vn3: __m128i,
+) -> Option<usize> {
+    debug_assert!(sub(end_ptr, start_ptr) >= VECTOR_SIZE);
+    debug_assert!(start_ptr <= ptr);
+    debug_assert!(ptr <= end_ptr.sub(VECTOR_SIZE));
+
+    let chunk = _mm_loadu_si128(ptr as *const __m128i);
+    let eq1 = _mm_cmpeq_epi8(chunk, vn1);
+    let eq2 = _mm_cmpeq_epi8(chunk, vn2);
+    let eq3 = _mm_cmpeq_epi8(chunk, vn3);
+    let or = _mm_or_si128(eq1, eq2);
+    if _mm_movemask_epi8(_mm_or_si128(or, eq3)) != 0 {
+        let mask1 = _mm_movemask_epi8(eq1);
+        let mask2 = _mm_movemask_epi8(eq2);
+        let mask3 = _mm_movemask_epi8(eq3);
+        Some(sub(ptr, start_ptr) + forward_pos3(mask1, mask2, mask3))
+    } else {
+        None
+    }
+}
+
+#[target_feature(enable = "sse2")]
+unsafe fn reverse_search1(
+    start_ptr: *const u8,
+    end_ptr: *const u8,
+    ptr: *const u8,
+    vn1: __m128i,
+) -> Option<usize> {
+    debug_assert!(sub(end_ptr, start_ptr) >= VECTOR_SIZE);
+    debug_assert!(start_ptr <= ptr);
+    debug_assert!(ptr <= end_ptr.sub(VECTOR_SIZE));
+
+    let chunk = _mm_loadu_si128(ptr as *const __m128i);
+    let mask = _mm_movemask_epi8(_mm_cmpeq_epi8(vn1, chunk));
+    if mask != 0 {
+        Some(sub(ptr, start_ptr) + reverse_pos(mask))
+    } else {
+        None
+    }
+}
+
+#[target_feature(enable = "sse2")]
+unsafe fn reverse_search2(
+    start_ptr: *const u8,
+    end_ptr: *const u8,
+    ptr: *const u8,
+    vn1: __m128i,
+    vn2: __m128i,
+) -> Option<usize> {
+    debug_assert!(sub(end_ptr, start_ptr) >= VECTOR_SIZE);
+    debug_assert!(start_ptr <= ptr);
+    debug_assert!(ptr <= end_ptr.sub(VECTOR_SIZE));
+
+    let chunk = _mm_loadu_si128(ptr as *const __m128i);
+    let eq1 = _mm_cmpeq_epi8(chunk, vn1);
+    let eq2 = _mm_cmpeq_epi8(chunk, vn2);
+    if _mm_movemask_epi8(_mm_or_si128(eq1, eq2)) != 0 {
+        let mask1 = _mm_movemask_epi8(eq1);
+        let mask2 = _mm_movemask_epi8(eq2);
+        Some(sub(ptr, start_ptr) + reverse_pos2(mask1, mask2))
+    } else {
+        None
+    }
+}
+
+#[target_feature(enable = "sse2")]
+unsafe fn reverse_search3(
+    start_ptr: *const u8,
+    end_ptr: *const u8,
+    ptr: *const u8,
+    vn1: __m128i,
+    vn2: __m128i,
+    vn3: __m128i,
+) -> Option<usize> {
+    debug_assert!(sub(end_ptr, start_ptr) >= VECTOR_SIZE);
+    debug_assert!(start_ptr <= ptr);
+    debug_assert!(ptr <= end_ptr.sub(VECTOR_SIZE));
+
+    let chunk = _mm_loadu_si128(ptr as *const __m128i);
+    let eq1 = _mm_cmpeq_epi8(chunk, vn1);
+    let eq2 = _mm_cmpeq_epi8(chunk, vn2);
+    let eq3 = _mm_cmpeq_epi8(chunk, vn3);
+    let or = _mm_or_si128(eq1, eq2);
+    if _mm_movemask_epi8(_mm_or_si128(or, eq3)) != 0 {
+        let mask1 = _mm_movemask_epi8(eq1);
+        let mask2 = _mm_movemask_epi8(eq2);
+        let mask3 = _mm_movemask_epi8(eq3);
+        Some(sub(ptr, start_ptr) + reverse_pos3(mask1, mask2, mask3))
+    } else {
+        None
+    }
+}
+
+/// Compute the position of the first matching byte from the given mask. The
+/// position returned is always in the range [0, 15].
+///
+/// The mask given is expected to be the result of _mm_movemask_epi8.
+fn forward_pos(mask: i32) -> usize {
+    // Bit i of the mask is set exactly when byte i of the chunk compared
+    // equal to the needle. Since x86 is little endian, byte i of the chunk
+    // is the byte at offset i from the load address, so the least
+    // significant set bit gives the position of the first matching byte.
+    // That position equals the number of trailing zeros in the mask.
+    mask.trailing_zeros() as usize
+}
+
+/// Compute the position of the first matching byte from the given masks. The
+/// position returned is always in the range [0, 15]. Each mask corresponds to
+/// the comparison of the haystack chunk against a single needle byte.
+///
+/// The masks given are expected to be the result of _mm_movemask_epi8, where
+/// at least one of the masks is non-zero (i.e., indicates a match).
+fn forward_pos2(mask1: i32, mask2: i32) -> usize {
+    debug_assert!(mask1 != 0 || mask2 != 0);
+
+    forward_pos(mask1 | mask2)
+}
+
+/// Compute the position of the first matching byte from the given masks. The
+/// position returned is always in the range [0, 15]. Each mask corresponds to
+/// the comparison of the haystack chunk against a single needle byte.
+///
+/// The masks given are expected to be the result of _mm_movemask_epi8, where
+/// at least one of the masks is non-zero (i.e., indicates a match).
+fn forward_pos3(mask1: i32, mask2: i32, mask3: i32) -> usize {
+    debug_assert!(mask1 != 0 || mask2 != 0 || mask3 != 0);
+
+    forward_pos(mask1 | mask2 | mask3)
+}
+
+/// Compute the position of the last matching byte from the given mask. The
+/// position returned is always in the range [0, 15].
+///
+/// The mask given is expected to be the result of _mm_movemask_epi8.
+fn reverse_pos(mask: i32) -> usize {
+    // We are dealing with little endian here, where the most significant byte
+    // is at a higher address. That means the most significant bit that is set
+    // corresponds to the position of our last matching byte. The position from
+    // the end of the mask is therefore the number of leading zeros in a 16
+    // bit integer, and the position from the start of the mask is therefore
+    // 16 - (leading zeros) - 1.
+    VECTOR_SIZE - (mask as u16).leading_zeros() as usize - 1
+}
+
+/// Compute the position of the last matching byte from the given masks. The
+/// position returned is always in the range [0, 15]. Each mask corresponds to
+/// the comparison of the haystack chunk against a single needle byte.
+///
+/// The masks given are expected to be the result of _mm_movemask_epi8, where
+/// at least one of the masks is non-zero (i.e., indicates a match).
+fn reverse_pos2(mask1: i32, mask2: i32) -> usize {
+    debug_assert!(mask1 != 0 || mask2 != 0);
+
+    reverse_pos(mask1 | mask2)
+}
+
+/// Compute the position of the last matching byte from the given masks. The
+/// position returned is always in the range [0, 15]. Each mask corresponds to
+/// the comparison of the haystack chunk against a single needle byte.
+///
+/// The masks given are expected to be the result of _mm_movemask_epi8, where
+/// at least one of the masks is non-zero (i.e., indicates a match).
+fn reverse_pos3(mask1: i32, mask2: i32, mask3: i32) -> usize {
+    debug_assert!(mask1 != 0 || mask2 != 0 || mask3 != 0);
+
+    reverse_pos(mask1 | mask2 | mask3)
+}
+
+/// Subtract `b` from `a` and return the difference. `a` should be greater than
+/// or equal to `b`.
+fn sub(a: *const u8, b: *const u8) -> usize {
+    debug_assert!(a >= b);
+    (a as usize) - (b as usize)
+}
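+
+// A minimal sanity-check sketch (not part of the upstream crate) showing how
+// the mask-to-position helpers above behave: bit i of an `_mm_movemask_epi8`
+// result corresponds to byte i of the 16-byte chunk, so the expected
+// positions below can be read straight off hand-built masks. The module name
+// is illustrative only.
+#[cfg(test)]
+mod mask_pos_sketch {
+    use super::{forward_pos, forward_pos2, reverse_pos, reverse_pos2};
+
+    #[test]
+    fn forward_pos_reports_lowest_set_bit() {
+        // Byte 3 matched.
+        assert_eq!(forward_pos(0b0000_1000), 3);
+        // Bytes 3 and 9 matched; the earliest (lowest bit) wins.
+        assert_eq!(forward_pos(0b10_0000_1000), 3);
+    }
+
+    #[test]
+    fn reverse_pos_reports_highest_set_bit() {
+        // Bytes 3 and 9 matched; the latest (highest bit) wins.
+        assert_eq!(reverse_pos(0b10_0000_1000), 9);
+    }
+
+    #[test]
+    fn multi_needle_masks_are_ored_together() {
+        // Needle 1 matched at byte 5, needle 2 at byte 2.
+        assert_eq!(forward_pos2(0b10_0000, 0b100), 2);
+        assert_eq!(reverse_pos2(0b10_0000, 0b100), 5);
+    }
+}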
diff --git a/src/x86/sse42.rs b/src/x86/sse42.rs
new file mode 100644
index 0000000..78a9b37
--- /dev/null
+++ b/src/x86/sse42.rs
@@ -0,0 +1,75 @@
+// This code is unused. PCMPESTRI is gratuitously slow. I imagine it might
+// start winning with a hypothetical memchr4 (or greater). This technique might
+// also be good for exposing searches over ranges of bytes, but that departs
+// from the standard memchr API, so it's not clear whether we actually want
+// that or not.
+//
+// N.B. PCMPISTRI appears to be about twice as fast as PCMPESTRI, which is kind
+// of neat. Unfortunately, UTF-8 strings can contain NUL bytes, which means
+// I don't see a way of effectively using PCMPISTRI unless there's some fast
+// way to replace zero bytes with a byte that is not a needle byte.
+
+use core::arch::x86_64::*;
+use core::mem::size_of;
+
+use x86::sse2;
+
+const VECTOR_SIZE: usize = size_of::<__m128i>();
+const CONTROL_ANY: i32 =
+    _SIDD_UBYTE_OPS
+    | _SIDD_CMP_EQUAL_ANY
+    | _SIDD_POSITIVE_POLARITY
+    | _SIDD_LEAST_SIGNIFICANT;
+
+#[target_feature(enable = "sse4.2")]
+pub unsafe fn memchr3(
+    n1: u8, n2: u8, n3: u8,
+    haystack: &[u8]
+) -> Option<usize> {
+    let vn1 = _mm_set1_epi8(n1 as i8);
+    let vn2 = _mm_set1_epi8(n2 as i8);
+    let vn3 = _mm_set1_epi8(n3 as i8);
+    let vn = _mm_setr_epi8(
+        n1 as i8, n2 as i8, n3 as i8, 0,
+        0, 0, 0, 0,
+        0, 0, 0, 0,
+        0, 0, 0, 0,
+    );
+    let len = haystack.len();
+    let start_ptr = haystack.as_ptr();
+    let end_ptr = haystack[haystack.len()..].as_ptr();
+    let mut ptr = start_ptr;
+
+    if haystack.len() < VECTOR_SIZE {
+        while ptr < end_ptr {
+            if *ptr == n1 || *ptr == n2 || *ptr == n3 {
+                return Some(sub(ptr, start_ptr));
+            }
+            ptr = ptr.offset(1);
+        }
+        return None;
+    }
+    while ptr <= end_ptr.sub(VECTOR_SIZE) {
+        let chunk = _mm_loadu_si128(ptr as *const __m128i);
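+        // With _SIDD_CMP_EQUAL_ANY, PCMPESTRI compares each byte of `chunk`
+        // against the first 3 bytes of `vn` (the three needles) and returns
+        // the index of the first match, or 16 when there is none.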
+        let res = _mm_cmpestri(vn, 3, chunk, 16, CONTROL_ANY);
+        if res < 16 {
+            return Some(sub(ptr, start_ptr) + res as usize);
+        }
+        ptr = ptr.add(VECTOR_SIZE);
+    }
+    if ptr < end_ptr {
+        debug_assert!(sub(end_ptr, ptr) < VECTOR_SIZE);
+        ptr = ptr.sub(VECTOR_SIZE - sub(end_ptr, ptr));
+        debug_assert_eq!(sub(end_ptr, ptr), VECTOR_SIZE);
+
+        return sse2::forward_search3(start_ptr, end_ptr, ptr, vn1, vn2, vn3);
+    }
+    None
+}
+
+/// Subtract `b` from `a` and return the difference. `a` should be greater than
+/// or equal to `b`.
+fn sub(a: *const u8, b: *const u8) -> usize {
+    debug_assert!(a >= b);
+    (a as usize) - (b as usize)
+}
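+
+// A hypothetical sketch (not part of the upstream crate) of the "searches
+// over ranges of bytes" idea mentioned in the comment at the top of this
+// file: with _SIDD_CMP_RANGES, PCMPESTRI treats the needle register as
+// (low, high) pairs and reports the first haystack byte falling inside any
+// pair. It handles a single 16-byte chunk only and is not wired into the
+// crate's API; the function name is illustrative.
+#[allow(dead_code)]
+#[target_feature(enable = "sse4.2")]
+unsafe fn find_in_range_16(
+    low: u8,
+    high: u8,
+    chunk16: &[u8; 16],
+) -> Option<usize> {
+    const CONTROL_RANGE: i32 =
+        _SIDD_UBYTE_OPS
+        | _SIDD_CMP_RANGES
+        | _SIDD_POSITIVE_POLARITY
+        | _SIDD_LEAST_SIGNIFICANT;
+    // One (low, high) pair occupies the first two bytes of the needle.
+    let vn = _mm_setr_epi8(
+        low as i8, high as i8, 0, 0,
+        0, 0, 0, 0,
+        0, 0, 0, 0,
+        0, 0, 0, 0,
+    );
+    let chunk = _mm_loadu_si128(chunk16.as_ptr() as *const __m128i);
+    let res = _mm_cmpestri(vn, 2, chunk, 16, CONTROL_RANGE);
+    if res < 16 {
+        Some(res as usize)
+    } else {
+        None
+    }
+}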