blob: 4dbd771df7328b6ffe5f6fbbd1dfe6ffe5f72189 [file]
/*
* Copyright (C) 2025 The Android Open Source Project
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
* COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
* OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
* AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <stdbit.h>
#include <stdint.h>
#include <string.h>
#include "portable_simd_detail.h"
#include "portable_simd_exports.h"
namespace portable_simd {
namespace {
struct StrlenTraits {
// Highway doesn't support direct `char` usage in vector types, presumably
// because it can vary in signed-ness. `strlen` uses `min` to find `\0`, so
// unsigned is simplest.
//
// It's notable that the highway quick_reference advises against unsigned
// `min` operations for older CPUs:
// https://google.github.io/highway/en/master/quick_reference.html#speeding-up-code-for-older-x86-platforms
//
// But it's unclear why; instruction timing tables indicate that unsigned and
// signed bytewise `min` have identical timings.
using CharType = uint8_t;
using VectorTag = portable_simd::FullVector<CharType>;
using VectorType = hn::VFromD<VectorTag>;
};
struct WcslenTraits {
using CharType = uint32_t;
using VectorTag = portable_simd::FullVector<CharType>;
using VectorType = hn::VFromD<VectorTag>;
};
template <typename Traits>
PSIMD_FLATTEN static optional<size_t> index_of_nul(typename Traits::VectorType val,
size_t chars_to_skip = 0) {
constexpr typename Traits::VectorTag d;
const auto all_zeroes = Zero(d);
// NOTE: The mask that highway generates here is lane-wise.
const size_t raw_zero_mask = BitsFromMask(d, all_zeroes == val);
const size_t zero_mask = raw_zero_mask >> chars_to_skip;
if (!zero_mask) {
return {};
}
const size_t lane_index = stdc_trailing_zeros(zero_mask);
return optional{lane_index};
}
template <typename Traits>
PSIMD_FLATTEN static optional<const typename Traits::CharType*> ptr_of_nul(
const void* ptr, typename Traits::VectorType val) {
if (const optional<size_t> x = index_of_nul<Traits>(val)) {
return optional{*x + static_cast<const Traits::CharType*>(ptr)};
}
return {};
}
template <typename Traits>
PSIMD_FLATTEN static size_t strlen_vectorized(const typename Traits::CharType* s) {
using CharType = Traits::CharType;
using VectorTag = Traits::VectorTag;
constexpr VectorTag d;
auto [ptr, nul_distance] = align_forward_to_vec<VectorTag>(
s, [&](auto val, optional<size_t> bytes_to_skip, optional<size_t>) -> optional<size_t> {
size_t chars_to_skip = 0;
if (bytes_to_skip) {
// All wide-char pointers should be suitably aligned.
PSIMD_DCHECK(*bytes_to_skip % sizeof(CharType) == 0);
chars_to_skip = *bytes_to_skip / sizeof(CharType);
}
if (const optional<size_t> x = index_of_nul<Traits>(val, chars_to_skip)) {
return optional{*x};
}
return {};
});
if (nul_distance) {
return *nul_distance;
}
// The simplest implementation from here would be:
//
// while (true) {
// // check for nul, return if found
// ++ptr;
// }
//
// `perf` says that x86_64 CPUs stall on the 'check for nul, return if found'
// branch really badly, so it's a better balance if we can work in batches.
// Since batch size must be a power of two, work in batches of 4.
const auto check_ptr_and_inc = [&]() -> optional<size_t> {
if (const optional<const CharType*> x = ptr_of_nul<Traits>(ptr, Load(d, ptr))) {
return optional{static_cast<size_t>(*x - s)};
}
ptr += d.MaxLanes();
return {};
};
// Of course, we can only work in batches if it's safe to read more than one
// vector at a time.
if (!kReadAheadToPageBoundaryIsOK) {
while (true) {
if (const optional<size_t> x = check_ptr_and_inc()) {
return *x;
}
}
}
// Here, we have a trade-off to make: "how many very simple checks do we want
// to do before hitting the loop that's really fast for long strings?"
//
// Between here and the "big string" loop, there are between 0 and 3
// `check_ptr_and_inc`s to bring us up to alignment.
//
// We've checked between 1 and kVectorSize bytes so far, and it seems bad to
// dive into the 'big' case having checked as little as 1 byte, so add a
// few checks beforehand.
constexpr size_t kMinBytesUntilStringIsBig = 128;
// Worst case, we've only read 1 byte so far. Add 1 check to round up.
constexpr size_t kExtraChecksNeeded = 1 + (kMinBytesUntilStringIsBig - 1) / d.MaxBytes();
#pragma unroll
for (size_t i = 0; i < kExtraChecksNeeded; ++i) {
if (const optional<size_t> x = check_ptr_and_inc()) {
return *x;
}
}
// Now bring ourselves to 4*kVectorAlign alignment.
constexpr size_t kFourVecAlign = 4 * vector_align(d);
static_assert(kPageSize % kFourVecAlign == 0);
const size_t vector_width_from_prev_align =
(reinterpret_cast<uintptr_t>(ptr) & (kFourVecAlign - 1)) / vector_align(d);
switch (vector_width_from_prev_align) {
case 1:
if (const optional<size_t> x = check_ptr_and_inc()) {
return *x;
}
[[fallthrough]];
case 2:
if (const optional<size_t> x = check_ptr_and_inc()) {
return *x;
}
[[fallthrough]];
case 3:
if (const optional<size_t> x = check_ptr_and_inc()) {
return *x;
}
break;
default:
PSIMD_DCHECK(vector_width_from_prev_align == 0);
}
while (true) {
const auto vec1 = Load(d, ptr);
const auto vec2 = Load(d, ptr + d.MaxLanes());
const auto min12 = Min(vec1, vec2);
const auto vec3 = Load(d, ptr + d.MaxLanes() * 2);
const auto vec4 = Load(d, ptr + d.MaxLanes() * 3);
const auto min34 = Min(vec3, vec4);
const auto min_all = Min(min12, min34);
const optional<size_t> maybe_index = index_of_nul<Traits>(min_all);
if (!maybe_index) [[likely]] {
ptr += 4 * d.MaxLanes();
continue;
}
const size_t index_of_first_zero = *maybe_index;
// We're in the 'longer string' case, so there's no reason to assume `vec1`
// is most likely to have the `\0`. Assuming all cases are equally likely,
// prefer a constant 2 `ptr_of_nul` cost, rather than 1-3.
if (const optional<const CharType*> x = ptr_of_nul<Traits>(ptr, min12)) {
if (const optional<const CharType*> x2 = ptr_of_nul<Traits>(ptr, vec1)) {
return static_cast<size_t>(*x2 - s);
}
// If vec1 had no zeroes, then we can assume the min 0 is from vec2.
return static_cast<size_t>(*x - s) + d.MaxLanes();
}
ptr += d.MaxLanes() * 2;
if (const optional<const CharType*> x = ptr_of_nul<Traits>(ptr, vec3)) {
return static_cast<size_t>(*x - s);
}
// Since nothing else has nul in it, it's guaranteed that the nul is
// vec4's, so we can use the above `index_of_nul` result.
ptr += d.MaxLanes();
return static_cast<size_t>(reinterpret_cast<const CharType*>(ptr) - s) + index_of_first_zero;
}
}
} // namespace
} // namespace portable_simd
PSIMD_LIBC_FUNCTION(size_t, strlen, const char* s) {
using portable_simd::StrlenTraits;
return portable_simd::strlen_vectorized<StrlenTraits>(
reinterpret_cast<const StrlenTraits::CharType*>(s));
}
static size_t simplistic_misaligned_wcslen(const wchar_t* s) {
size_t len = 0;
while (*s) {
++s;
++len;
}
return len;
}
PSIMD_LIBC_FUNCTION(size_t, wcslen, const wchar_t* s) {
using portable_simd::WcslenTraits;
// We might have received a misaligned pointer. Support that with a slow case
// if needed. It's expected that the 99% case will be properly-aligned, so no
// meaningful effort is put into making the misaligned case fast.
if (reinterpret_cast<uintptr_t>(s) % alignof(wchar_t)) [[unlikely]] {
return simplistic_misaligned_wcslen(s);
}
return portable_simd::strlen_vectorized<WcslenTraits>(
reinterpret_cast<const WcslenTraits::CharType*>(s));
}
#if defined(__x86_64__)
PSIMD_MAYBE_STRONG_ALIAS(strlen);
PSIMD_MAYBE_STRONG_ALIAS(wcslen);
#elif defined(__aarch64__)
PSIMD_MAYBE_STRONG_ALIAS(wcslen);
#endif