blob: ae304a7d64ec763aafa9a3e8bd8edd4b3edd7746 [file]
/*
* Copyright (C) 2025 The Android Open Source Project
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
* COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
* OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
* AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <stdbit.h>
#include <stdint.h>
#include <string.h>
#include <wchar.h>
#include "portable_simd_detail.h"
#include "portable_simd_exports.h"
extern "C" wchar_t* __wmemchr_misaligned(const wchar_t*, wchar_t, size_t);
namespace portable_simd {
namespace {
struct CharTraits {
// Highway doesn't support direct `char` usage in vector types, presumably
// because it can vary in signed-ness. `memchr` does not care at all; choose
// signed because hwy docs say it's slightly more efficient on older x86_64
// CPUs. If there's any better argument for unsigned, that should probably be
// preferred.
using CharType = int8_t;
using VectorTag = portable_simd::FullVector<CharType>;
};
struct WcharTraits {
using CharType = int32_t;
static_assert(sizeof(CharType) == sizeof(wchar_t));
using VectorTag = portable_simd::FullVector<CharType>;
};
// All functions here share a 'driver' implementation, with a `Traits` object
// to encapsulate their differences.
template <typename Traits>
struct CharAgnosticMemchrTraits {
using CharType = Traits::CharType;
using VectorTag = Traits::VectorTag;
// As of C11, calling `memchr` with a length greater than the size of the
// buffer pointed to is well-defined _if_ the needle can be found in that
// buffer. `wmemchr` provides no such guarantee, but treat it similarly
// anyway for simplicity.
constexpr static bool kIsMaxLengthGuaranteed = false;
// Advance `ptr` in the direction of this memchr.
PSIMD_FLATTEN static const CharType* advance_ptr(const CharType* p, size_t n = 1) {
constexpr VectorTag d;
return p + n * d.MaxLanes();
}
// Performs adjustment on `p` before starting the algorithm. Memchr starts at `p`.
PSIMD_FLATTEN static const CharType* initial_adjust_ptr(const CharType* p, size_t) { return p; }
// Given a scalar mask indicating which vector element(s) matched our target
// character, return the offset from the vector pointer of the closest
// match.
PSIMD_FLATTEN static size_t lane_offset_of_first(ScalarMaskForD<VectorTag> mask) {
return stdc_trailing_zeros(mask);
}
// Returns the pointer to the closest element in `val` equal to `ch`,
// provided `val` was loaded from `ptr - lanes_to_skip`. If `count` is
// non-empty, this will ignore any elements in the highest `count` lanes of
// `val`.
PSIMD_FLATTEN static optional<const void*> ptr_of_first(const CharType* ptr,
hn::VFromD<VectorTag> val, CharType ch,
optional<size_t> count = {},
optional<size_t> lanes_to_skip = {}) {
constexpr VectorTag d;
const auto all_ch = Set(d, ch);
const size_t raw_eq_mask = BitsFromMask(d, all_ch == val);
size_t eq_mask = raw_eq_mask >> lanes_to_skip.unwrap_or(0);
if (count) {
const size_t inbounds_mask = ~(kMaxSizeT << *count);
eq_mask &= inbounds_mask;
}
if (!eq_mask) {
return {};
}
return optional<const void*>{ptr + lane_offset_of_first(eq_mask)};
}
// `align_forward_to_vec`.
PSIMD_FLATTEN static auto align_ptr_to_vec(const CharType* s, auto f) {
return align_forward_to_vec<VectorTag>(s, f);
}
// Since the max length is never guaranteed, 'known safe' isn't possible.
__attribute__((diagnose_if(true, "This is unsafe, and should never be called",
"error"))) static auto
align_ptr_to_vec_known_safe(auto...) {
__builtin_unreachable();
}
};
// Use `struct`s instead of `using`s so error messages and stack traces are
// prettier.
struct MemchrTraits : CharAgnosticMemchrTraits<CharTraits> {};
struct WmemchrTraits : CharAgnosticMemchrTraits<WcharTraits> {};
struct MemrchrTraits : CharTraits {
// memrchr always walks to its maximum length, so we can treat it as a
// guarantee.
constexpr static bool kIsMaxLengthGuaranteed = true;
// Advance `ptr` in the direction of this memchr.
PSIMD_FLATTEN static const CharType* advance_ptr(const CharType* p, size_t n = 1) {
constexpr VectorTag d;
return p - n * d.MaxLanes();
}
// Performs adjustment on `p` before starting the algorithm.
// Memrchr starts at the end and immediately calls alignment functions that
// expect the pointer to be adjusted backwards by one vector width.
PSIMD_FLATTEN static const CharType* initial_adjust_ptr(const CharType* p, size_t count) {
constexpr VectorTag d;
return p + count - d.MaxLanes();
}
// Given a scalar mask indicating which vector element(s) matched our target
// character, return the offset from the vector pointer of the farthest
// match.
PSIMD_FLATTEN static size_t lane_offset_of_first(ScalarMaskForD<VectorTag> mask) {
constexpr VectorTag d;
const size_t leading_zeros = stdc_leading_zeros(mask);
const size_t distance_of_one = leading_zeros + 1;
const size_t offset_from_start = d.MaxLanes() - distance_of_one;
return offset_from_start;
}
// Returns the pointer to the farthest element in `val` equal to `ch`,
// provided `val` was loaded from `ptr - lanes_to_skip`. If `count` is
// non-empty, this will ignore any elements in the lowest `count` lanes of `val`.
PSIMD_FLATTEN static optional<const void*> ptr_of_first(const CharType* ptr,
hn::VFromD<VectorTag> val, CharType ch,
optional<size_t> count = {},
optional<size_t> lanes_to_skip = {}) {
constexpr VectorTag d;
const auto all_ch = Set(d, ch);
// NOTE: The size of this type is important; this code ends up simpler if we
// can rely on this mask only ever containing bits that correspond with
// vector lanes.
ScalarMaskForD<VectorTag> eq_mask = BitsFromMask(d, all_ch == val);
// TODO(gbiv): This was written expecting that `count` and `lanes_to_skip`
// would both trivially `has_value` or not, so there'll no be actual
// branches here. It could be nice to either:
// - have an assertion of that here (though attempts like
// `if (!__builtin_constant_p(has_value())) warning_func();` have failed,
// despite no branches existing in the LLVM IR related to the optionals),
// or
// - template this on some kind of `const_optional` type so we can `if
// constexpr` these branches
if (lanes_to_skip) {
eq_mask <<= *lanes_to_skip;
}
if (count) {
const size_t count_mask = kMaxSizeT << (d.MaxLanes() - *count);
eq_mask &= count_mask;
}
if (!eq_mask) {
return {};
}
return optional<const void*>{ptr + lane_offset_of_first(eq_mask)};
}
// `align_backward_to_vec`.
PSIMD_FLATTEN static auto align_ptr_to_vec(const CharType* s, auto f) {
return align_backward_to_vec<VectorTag>(s, f);
}
// `align_backward_to_vec_known_safe`.
PSIMD_FLATTEN static auto align_ptr_to_vec_known_safe(const CharType* s, auto f) {
return align_backward_to_vec_known_safe<VectorTag>(s, f);
}
};
template <typename Traits>
PSIMD_FLATTEN static const void* memchr_vectorized(const typename Traits::CharType* s,
typename Traits::CharType ch, size_t count) {
using CharType = Traits::CharType;
using VectorTag = Traits::VectorTag;
constexpr VectorTag d;
s = Traits::initial_adjust_ptr(s, count);
const auto result_from_final_vec = [ch](const CharType* ptr, const auto vec_val, size_t count,
optional<size_t> lanes_to_skip = {}) -> const void* {
// If `count == 0`, we loaded `vec_val` when we shouldn't have. This is a
// correctness issue, since `ptr` might've been at the start of a new page.
PSIMD_DCHECK(count != 0);
return Traits::ptr_of_first(ptr, vec_val, ch, optional<size_t>{count}, lanes_to_skip)
.unwrap_or(nullptr);
};
if (count <= d.MaxLanes()) {
// Unlikely because it seems rare that people would depend on 0-sized
// memrchrs being a very fast case.
if (count == 0) [[unlikely]] {
return nullptr;
}
// We know for certain that we need 2 or fewer loads to service this request.
const auto [ptr, maybe_result] = Traits::align_ptr_to_vec(
s,
[&](const auto val, optional<size_t> bytes_to_skip, optional<size_t>)
PSIMD_FLATTEN -> optional<const void*> {
// If we loaded `ptr` directly, one vector op is all this will
// take.
if (!bytes_to_skip.has_value()) {
return optional{result_from_final_vec(s, val, count)};
}
// If pointers are properly aligned, we should never load a partial
// CharType.
PSIMD_DCHECK(*bytes_to_skip % sizeof(CharType) == 0);
const size_t lanes_to_skip = *bytes_to_skip / sizeof(CharType);
// Reiterating from `align_forward_to_vec`, this is expected to
// be inlined such that `lanes_to_skip.has_value()` always
// trivially folds to a constant.
if (const optional<const void*> x = Traits::ptr_of_first(
s, val, ch, optional<size_t>{count}, optional<size_t>{lanes_to_skip})) {
return x;
}
const auto lanes_read = d.MaxLanes() - lanes_to_skip;
if (lanes_read >= count) {
return optional<const void*>{nullptr};
}
count -= lanes_read;
return {};
});
if (maybe_result) {
return *maybe_result;
}
return result_from_final_vec(ptr, Load(d, ptr), count);
}
auto align_func = [&](auto val, optional<size_t> bytes_to_skip, optional<size_t> overlap_bytes)
PSIMD_FLATTEN -> optional<const void*> {
// If pointers are properly aligned, we should never load a partial
// CharType, since vector alignments are all multiples of `char`
// and `wchar_t`.
optional<size_t> lanes_to_skip;
optional<size_t> overlap_lanes;
size_t lanes_consumed = d.MaxLanes();
if (bytes_to_skip) {
PSIMD_DCHECK(*bytes_to_skip % sizeof(CharType) == 0);
lanes_to_skip = optional<size_t>{*bytes_to_skip / sizeof(CharType)};
lanes_consumed -= *lanes_to_skip;
} else {
PSIMD_DCHECK(overlap_bytes.has_value());
PSIMD_DCHECK(*overlap_bytes % sizeof(CharType) == 0);
overlap_lanes = optional<size_t>{*overlap_bytes / sizeof(CharType)};
lanes_consumed -= *overlap_lanes;
}
if (const optional<const void*> x =
Traits::ptr_of_first(s, val, ch, /*count=*/{}, lanes_to_skip)) {
// No need to bounds-check, due to `count`'s size.
return optional<const void*>{*x};
}
count -= lanes_consumed;
return {};
};
GenericAlignResult<VectorTag, optional<const void*>> first_align_result;
if constexpr (Traits::kIsMaxLengthGuaranteed) {
first_align_result = Traits::align_ptr_to_vec_known_safe(s, align_func);
} else {
first_align_result = Traits::align_ptr_to_vec(s, align_func);
}
auto [ptr, maybe_result] = first_align_result;
if (maybe_result) {
return *maybe_result;
}
const auto check_ptr_and_advance = [&]() PSIMD_FLATTEN -> optional<const void*> {
if (const optional<const void*> x = Traits::ptr_of_first(ptr, Load(d, ptr), ch)) {
return optional{*x};
}
ptr = Traits::advance_ptr(ptr);
return {};
};
// Can't do fancy unrolling tricks if we can't read ahead.
if constexpr (!kReadAheadToPageBoundaryIsOK) {
while (count >= d.MaxLanes()) {
if (const optional<const void*> x = check_ptr_and_advance()) {
return *x;
}
count -= d.MaxLanes();
}
if (const size_t residual_count = count % d.MaxLanes()) {
return result_from_final_vec(ptr, Load(d, ptr), residual_count);
}
return nullptr;
}
// The simplest implementation from here would be:
//
// while (true) {
// // check for char, return if found
// ++ptr;
// }
//
// `perf` says that x86_64 CPUs stall on 'check for char, return if found'
// branches really badly, so it's a better balance if we can work in batches.
// Work in batches until heuristics say that running down an unrolled
// loop is likely to be better.
size_t full_vector_loads_remaining = count / d.MaxLanes();
constexpr size_t unrolled_loop_size = 4;
if (full_vector_loads_remaining >= unrolled_loop_size) {
const auto needle = Set(d, ch);
// NOTE: "3 checks per loop," was chosen based on experimentation on Brya,
// which ships with chips like the 2024 Intel Core 3 100U. 2 loads was as
// much as 1.1x slower on very long inputs. There was no obvious
// improvement in doing 4 per loop.
//
// That said, do 4 checks per loop if the size isn't a guaranteed
// upper-bound. It needs to be _some_ power of two, since the batches below
// can't safely cross a page boundary.
constexpr size_t checks_per_loop = Traits::kIsMaxLengthGuaranteed ? 3 : 4;
if constexpr (!Traits::kIsMaxLengthGuaranteed) {
constexpr size_t misaligned_mask = vector_align(d) * checks_per_loop - 1;
// Bring us to a correct multiple-of-vector-size alignment...
#pragma unroll
for (size_t i = 0; i < checks_per_loop - 1; ++i) {
bool is_misaligned = reinterpret_cast<uintptr_t>(ptr) & misaligned_mask;
if (!is_misaligned) {
break;
}
if (const optional<const void*> x = check_ptr_and_advance()) {
return *x;
}
full_vector_loads_remaining -= 1;
}
PSIMD_DCHECK(!(reinterpret_cast<uintptr_t>(ptr) & misaligned_mask));
}
while (full_vector_loads_remaining >= unrolled_loop_size) {
// So highway may represent masks as _either_:
// - a vector which you can convert to a scalar through MaskFromVec(), or
// - a scalar.
//
// It does not allow `operator|` on masks.
//
// This implementation was written assuming:
// - they're vectors (thus converting mask -> vector is free), and
// - this loop's hot path involves looping (so operations on that path
// should be minimized).
//
// When that no longer holds, it should be trivial to refactor a bit.
static_assert(sizeof(hn::MFromD<VectorTag>) == sizeof(hn::VFromD<VectorTag>));
hn::MFromD<VectorTag> equal_results[checks_per_loop];
#pragma unroll
for (size_t i = 0; i < checks_per_loop; ++i) {
const auto v = Load(d, Traits::advance_ptr(ptr, i));
equal_results[i] = v == needle;
}
auto merged_eq = Or(equal_results[0], equal_results[1]);
#pragma unroll
for (size_t i = 2; i < checks_per_loop; ++i) {
merged_eq = Or(merged_eq, equal_results[i]);
}
const size_t eq_bits = BitsFromMask(d, merged_eq);
// `[[likely]]` keeps this loop tight.
if (!eq_bits) [[likely]] {
full_vector_loads_remaining -= checks_per_loop;
ptr = Traits::advance_ptr(ptr, checks_per_loop);
continue;
}
#pragma unroll
for (size_t i = 0; i < checks_per_loop - 1; ++i) {
if (const size_t m = BitsFromMask(d, equal_results[i])) {
return ptr + Traits::lane_offset_of_first(m);
}
ptr = Traits::advance_ptr(ptr, 1);
}
// If earlier masks were empty, `eq_bits` only contains bits relevant to
// the last vector.
return ptr + Traits::lane_offset_of_first(eq_bits);
}
}
switch (full_vector_loads_remaining) {
case 3:
if (const optional<const void*> x = check_ptr_and_advance()) {
return *x;
}
[[fallthrough]];
case 2:
if (const optional<const void*> x = check_ptr_and_advance()) {
return *x;
}
[[fallthrough]];
case 1:
if (const optional<const void*> x = check_ptr_and_advance()) {
return *x;
}
[[fallthrough]];
case 0:
if (const size_t residual_count = count % d.MaxLanes()) {
return result_from_final_vec(ptr, Load(d, ptr), residual_count);
}
return nullptr;
default:
__builtin_unreachable();
}
}
} // namespace
} // namespace portable_simd
PSIMD_LIBC_FUNCTION(void*, memchr, const void* ptr, int ch, size_t count) {
using portable_simd::MemchrTraits;
return const_cast<void*>(portable_simd::memchr_vectorized<MemchrTraits>(
reinterpret_cast<const MemchrTraits::CharType*>(ptr), ch, count));
}
PSIMD_LIBC_FUNCTION(void*, memrchr, const void* ptr, int ch, size_t count) {
using portable_simd::MemrchrTraits;
return const_cast<void*>(portable_simd::memchr_vectorized<MemrchrTraits>(
reinterpret_cast<const MemrchrTraits::CharType*>(ptr), ch, count));
}
PSIMD_LIBC_FUNCTION(size_t, strnlen, const char* ptr, size_t count) {
// strnlen is just `memchr(ptr, '\0', count)` with an extra step.
const auto* s = static_cast<const char*>(PSIMD_LIBC_FUNCTION_NAME(memchr)(ptr, '\0', count));
if (!s) {
return count;
}
return static_cast<size_t>(s - ptr);
}
PSIMD_LIBC_FUNCTION(wchar_t*, wmemchr, const wchar_t* ptr, wchar_t ch, size_t count) {
using portable_simd::WmemchrTraits;
// We might have received a misaligned pointer. Support that with a slow case
// if needed. It's expected that the 99% case will be properly-aligned, so no
// meaningful effort is put into making the misaligned case fast.
if (reinterpret_cast<uintptr_t>(ptr) % alignof(wchar_t)) [[unlikely]] {
return __wmemchr_misaligned(ptr, ch, count);
}
return const_cast<wchar_t*>(
static_cast<const wchar_t*>(portable_simd::memchr_vectorized<WmemchrTraits>(
reinterpret_cast<const WmemchrTraits::CharType*>(ptr), ch, count)));
}
#if defined(__aarch64__)
PSIMD_MAYBE_STRONG_ALIAS(wmemchr);
#elif defined(__x86_64__)
PSIMD_MAYBE_STRONG_ALIAS(memchr);
PSIMD_MAYBE_STRONG_ALIAS(memrchr);
PSIMD_MAYBE_STRONG_ALIAS(strnlen);
PSIMD_MAYBE_STRONG_ALIAS(wmemchr);
#endif