libc/portable-simd/strspn.cpp - platform/bionic - Git at Google

 /*
  * Copyright (C) 2026 The Android Open Source Project
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  *  * Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  *  * Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in
  *    the documentation and/or other materials provided with the
  *    distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
  * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
  * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */

 #include "portable_simd_detail.h"
 #include "portable_simd_exports.h"

 namespace portable_simd {
 namespace {

 // strspn exclusively uses 16B vectors, since:
 // 1. The 'large' implementation is written assuming 16B vectors, and would
 //    need to be rewritten for larger vector types.
 // 2. Larger vectors for the 'small' path only really help as we traverse deep
 //    into the haystack; it seems likely that most 'hits' will be early on in
 //    the haystack.
 //
 // Moreover, "it's simpler this way, and probably more than sufficiently fast."
 using ElemTy = int8_t;
 using FullVec = hn::Full128<ElemTy>;
 using FullVecTy = hn::VFromD<FullVec>;
 using FullVecM = hn::MFromD<FullVec>;

 constexpr static size_t kMaxSmallSize = 4;

 // How many bytes to scan before with a naive n^2 algorithm before building a
 // bitset, assuming the keep/reject set has more than kMaxSmallSize elements.
 //
 // The main disadvantage of the bitset is that it takes quite a while to
 // construct; the goal is to balance the extra throughput the bet confers with
 // this.
 //
 // To help inform one's intuition, on arm64 (mustang), a fully n^2
 // implementation (no bitset fallback) on Bionic's benchmarks is:
 // - Up to 3.5x slower than bitset fallback for strspn_medium (10 byte 'keep'
 //   set, 131KB to scan).
 // - Up to *9x* slower for strcspn_rare (31 byte 'reject' set, 131KB to scan).
 //
 // On the other hand, this n^2 approach is >3x faster for both of the above
 // cases in the smallest instance (8 bytes before a match).
 constexpr static size_t kMaxBytesBeforeBitset = 128;

 // Runs `strspn`-like functions, where `f` provides the implementation. Assumes
 // `haystack` is aligned properly.
 //
 // The signature of `f` should be compatible with:
 // optional<size_t> f(FullVecTy v, optional<size_t> ignore_first = {});
 //
 // where:
 //   - `v` is a vector of chars to scan,
 //   - `ignore_first` is the number of chars in `v` to ignore (starting at the
 //     0th lane)
 //   - The return is an empty optional if the algo should keep scanning;
 //     otherwise, it's the lane number of the char the algo should stop at in
 //     `v` (less `ignore_first`, if provided).
 template <typename Fn>
 PSIMD_FLATTEN size_t strspn_driver_aligned(const ElemTy* haystack, Fn f) {
   constexpr FullVec d;
   PSIMD_DCHECK(hn::IsAligned(d, haystack));

   size_t total_scanned = 0;
   while (true) {
     const auto this_vec = Load(d, haystack + total_scanned);
     if (const optional<size_t> index = f(this_vec)) {
       return total_scanned + *index;
     }
     total_scanned += d.MaxLanes();
   }
 }

 template <typename Fn>
 PSIMD_FLATTEN size_t strspn_driver(const ElemTy* haystack, Fn f) {
   constexpr FullVec d;

   const auto shift_back_bytes = reinterpret_cast<uintptr_t>(haystack) & (d.MaxBytes() - 1);
   size_t scanned = 0;
   if (shift_back_bytes) {
     static_assert(sizeof(ElemTy) == 1, "Adjust shift_back_bytes calculation");
     const auto this_vec = LoadU(d, haystack - shift_back_bytes);
     if (const optional<size_t> index = f(this_vec, optional<size_t>{shift_back_bytes})) {
       return *index;
     }
     scanned = d.MaxBytes() - shift_back_bytes;
     haystack += scanned;
   }

   return strspn_driver_aligned(haystack, f) + scanned;
 }

 // Traits that allow for incrementally finding needles/non-needles for strspn
 // and strcspn. These build up a mask with 'true' elements meaning "this
 // element doesn't end the function's logic," and 'false' meaning "this does."
 struct SharedTraits {
   // Returns the index of the element that strspn/strcspn should stop at, or
   // empty if none was found. Ignores the first `ignore_first` chars.
   PSIMD_FLATTEN static optional<size_t> find_first_false(FullVecM eq_mask,
                                                          optional<size_t> ignore_first) {
     constexpr FullVec d;
     if (ignore_first) {
       eq_mask = Or(eq_mask, FirstN(d, *ignore_first));
     }
     if (AllTrue(d, eq_mask)) {
       return {};
     }

     size_t index = FindKnownFirstTrue(d, Not(eq_mask));
     if (ignore_first) {
       index -= *ignore_first;
     }
     return optional<size_t>{index};
   }
 };

 struct StrspnSmallTraits : SharedTraits {
   constexpr static bool kIsRejectSet = false;

   PSIMD_FLATTEN static FullVecM init_mask(const FullVecTy& haystack, const FullVecTy& first_vec) {
     return haystack == first_vec;
   }

   PSIMD_FLATTEN static FullVecM combine_mask(const FullVecM& mask, const FullVecTy& haystack,
                                              const FullVecTy& next_vec) {
     return Or(mask, haystack == next_vec);
   }
 };

 struct StrcspnSmallTraits : SharedTraits {
   constexpr static bool kIsRejectSet = true;

   PSIMD_FLATTEN static FullVecM init_mask(const FullVecTy& haystack, const FullVecTy& first_vec) {
     constexpr FullVec d;
     auto mask = haystack != Zero(d);
     return combine_mask(mask, haystack, first_vec);
   }

   PSIMD_FLATTEN static FullVecM combine_mask(const FullVecM& mask, const FullVecTy& haystack,
                                              const FullVecTy& next_vec) {
     return And(mask, haystack != next_vec);
   }
 };

 // Implements `strspn` or `strcspn` given traits that can be used to create and
 // combine masks.
 template <typename Traits, size_t SmallSize>
 PSIMD_FLATTEN size_t strspnish_small(const ElemTy* haystack, const FullVecTy set[SmallSize]) {
   const auto find_index = [&](const FullVecTy& v, optional<size_t> ignore_first = {})
                               PSIMD_FLATTEN -> optional<size_t> {
     FullVecM eq_mask = Traits::init_mask(v, set[0]);
 #pragma unroll
     for (size_t i = 1; i < SmallSize; ++i) {
       eq_mask = Traits::combine_mask(eq_mask, v, set[i]);
     }

     return Traits::find_first_false(eq_mask, ignore_first);
   };

   return strspn_driver(haystack, find_index);
 }

 struct TryDispatchResult {
   // If this is nonempty, it's the result of the strcspn or strspn call.
   optional<size_t> result;

   // If `result` has a value, these have no well-defined value. Otherwise,
   // `haystack_scan_offset` is the offset of the `haystack` pointer to start
   // scanning from to discover the result.
   //
   // `haystack + haystack_scan_offset` is guaranteed to be aligned to 16B.
   size_t haystack_scan_offset;
   // This is the `strlen` of the keep/reject set.
   size_t set_len;

   PSIMD_FLATTEN static TryDispatchResult of_result(size_t result) {
     return TryDispatchResult{
         .result = optional{result},
     };
   }

   PSIMD_FLATTEN static TryDispatchResult of_offset_and_set_len(size_t offset, size_t set_len) {
     return TryDispatchResult{
         .haystack_scan_offset = offset,
         .set_len = set_len,
     };
   }

   PSIMD_FLATTEN TryDispatchResult advance_result(size_t offset) const {
     return TryDispatchResult::of_result(*result + offset);
   }
 };

 // Encapsulates logic for calculating `strspn`-like results incrementally.
 template <typename Traits>
 struct IncrementalStrspnishResult {
   PSIMD_FLATTEN static IncrementalStrspnishResult maybe_unaligned(const ElemTy* haystack,
                                                                   const FullVecTy& first_set_elem) {
     constexpr FullVec d;
     const auto shift_back_bytes = reinterpret_cast<uintptr_t>(haystack) & (d.MaxBytes() - 1);
     return IncrementalStrspnishResult(haystack, first_set_elem, optional<size_t>{shift_back_bytes});
   }

   PSIMD_FLATTEN static IncrementalStrspnishResult known_aligned(const ElemTy* haystack,
                                                                 const FullVecTy& first_set_elem) {
     constexpr FullVec d;
     PSIMD_DCHECK((reinterpret_cast<uintptr_t>(haystack) & (d.MaxBytes() - 1)) == 0);
     return IncrementalStrspnishResult(haystack, first_set_elem, /*shift_back_bytes=*/{});
   }

   PSIMD_FLATTEN void push_set_elem(const FullVecTy& v) {
     mask_ = Traits::combine_mask(mask_, loaded_haystack_, v);
   }

   // Returns the offset from `haystack` to use if scanning `haystack` further
   // is necessary.
   PSIMD_FLATTEN size_t haystack_offset() const {
     constexpr FullVec d;
     return d.MaxBytes() - haystack_ignore_.unwrap_or(0);
   }

   // Returns a `TryDispatchResult::of_result` if any matches were found
   // incrementally. Otherwise, returns an empty optional.
   PSIMD_FLATTEN optional<TryDispatchResult> try_to_result() const {
     if (const optional<size_t> result = Traits::find_first_false(mask_, haystack_ignore_)) {
       return optional{TryDispatchResult::of_result(*result)};
     }
     return {};
   }

  private:
   FullVecTy loaded_haystack_;
   // Subtle: This is an optional since there's a decent amount of
   // special-casing and optimization that can be done if this is statically
   // guaranteed to be 0. There's an entire class of usages where that's the
   // case, so represent it as empty for ease of LLVM analysis.
   optional<size_t> haystack_ignore_;
   FullVecM mask_;

   PSIMD_FLATTEN IncrementalStrspnishResult(const ElemTy* haystack, const FullVecTy& first_set_elem,
                                            optional<size_t> shift_back_bytes) {
     constexpr FullVec d;
     loaded_haystack_ = Load(d, haystack - shift_back_bytes.unwrap_or(0));
     haystack_ignore_ = shift_back_bytes;
     mask_ = Traits::init_mask(loaded_haystack_, first_set_elem);
   }
 };

 // Tries to determine the result of `strspn` or `strcspn` if `str` is small.
 template <typename Traits>
 PSIMD_FLATTEN TryDispatchResult try_dispatch_to_small_strspnish(const ElemTy* haystack,
                                                                 const ElemTy* str) {
   constexpr FullVec d;

   if (!str[0]) [[unlikely]] {
     if (Traits::kIsRejectSet) {
       return TryDispatchResult::of_result(
           __builtin_strlen(reinterpret_cast<const char*>(haystack)));
     }
     return TryDispatchResult::of_result(0);
   }

   FullVecTy checks[kMaxSmallSize];
   checks[0] = Set(d, str[0]);
 #pragma unroll
   for (size_t i = 1; i < kMaxSmallSize; ++i) {
     checks[i] = checks[0];
   }

   bool is_small = false;
 #pragma unroll
   for (size_t i = 0; i < kMaxSmallSize; ++i) {
     if (str[i] == 0) {
       // If this is strcspn with a single elem in `str`, we can trivially defer to
       // `strchrnul`, which we have optimized implementations of for the 64-bit
       // architectures.
       //
       // (Note that this branch is expected to fold away after unrolling.)
       if (i == 1 && Traits::kIsRejectSet) {
         const auto* res_ptr = reinterpret_cast<const ElemTy*>(
             strchrnul(reinterpret_cast<const char*>(haystack), str[0]));
         return TryDispatchResult::of_result(res_ptr - haystack);
       }

       // NOTE: This is not written as `return strspn_small_fn(haystack,
       // checks)` because LLVM's optimizations are more likely to turn that
       // into N inlined function calls to `strspn_small_fn`.
       //
       // Instead, have precisely one unrolled callsite, and N jumps to that,
       // which should be much more compact and closer to what we 'actually'
       // want here.
       is_small = true;
       break;
     }
     checks[i] = Set(d, str[i]);
   }

   if (is_small) [[likely]] {
     return TryDispatchResult::of_result(strspnish_small<Traits, kMaxSmallSize>(haystack, checks));
   }

   // It's possible that, even though this set of needles is large, the amount
   // we'll need to scan into `haystack` is small.
   //
   // Do an n^2 loop for the first handful of vectors into `haystack`, since
   // falling back to bitset building, while algorithmically optimal (& optimal
   // in practice for very large inputs), has a high fixed cost.
   auto incremental_result =
       IncrementalStrspnishResult<Traits>::maybe_unaligned(haystack, checks[0]);
 #pragma unroll
   for (size_t i = 1; i < kMaxSmallSize; ++i) {
     incremental_result.push_set_elem(checks[i]);
   }

   size_t str_len = kMaxSmallSize;
   for (; str[str_len]; ++str_len) {
     incremental_result.push_set_elem(Set(d, str[str_len]));
   }

   if (const auto m = incremental_result.try_to_result()) {
     return *m;
   }

   size_t haystack_offset = incremental_result.haystack_offset();
   PSIMD_DCHECK(haystack_offset < kMaxBytesBeforeBitset);
   do {
     incremental_result =
         IncrementalStrspnishResult<Traits>::known_aligned(haystack + haystack_offset, checks[0]);
 #pragma unroll
     for (size_t i = 1; i < kMaxSmallSize; ++i) {
       incremental_result.push_set_elem(checks[i]);
     }

     for (size_t i = kMaxSmallSize; i < str_len; ++i) {
       incremental_result.push_set_elem(Set(d, str[i]));
     }

     if (const auto m = incremental_result.try_to_result()) {
       return (*m).advance_result(haystack_offset);
     }
     PSIMD_DCHECK(incremental_result.haystack_offset() == d.MaxBytes());
     haystack_offset += d.MaxBytes();
   } while (haystack_offset < kMaxBytesBeforeBitset);

   return TryDispatchResult::of_offset_and_set_len(haystack_offset, str_len);
 }

 struct VecBitSet {
   PSIMD_FLATTEN static VecBitSet from_large_string(const ElemTy* keep, size_t keep_len,
                                                    bool include_nul) {
     constexpr FullVec d;
     // On Brya (mobile Intel from ~2022), four approaches were tested. Listed
     // in order of observed performance on Bionic's benchmarks (earlier is
     // better performance):
     //
     // 1. this one
     // 2. a direct bitset stored in `uint8_t[256 / 8]`
     // 3. maintaining a bitset directly in 128B vector registers
     // 4. a bitset from `uint8_t[256]` (which matches `strspn/strcspn`'s
     //    current implementation), which was later 'compressed' into two
     //    vectors.
     uint64_t lo_lo = 0;
     uint64_t lo_hi = 0;
     uint64_t hi_lo = 0;
     uint64_t hi_hi = 0;

     auto push_bits_into = [](uint8_t c, uint64_t& hi, uint64_t& lo) PSIMD_FLATTEN {
       PSIMD_DCHECK(c <= 0x7F);
       // It's expected that storing to `hi` or `lo` will be unpredictable, so
       // store to both.
       //
       // `bit` represents the bit to set, `is_hi` is either all 1s or 0s,
       // depending on the value of `c & 0x40`.
       const uint64_t bit = 1ULL << (c & 0x3F);
       const uint64_t is_hi = -static_cast<uint64_t>(c >> 6);
       hi |= bit & is_hi;
       lo |= bit & ~is_hi;
     };

     auto push_char = [&](uint8_t c) PSIMD_FLATTEN {
       if (c >= 0x80) [[unlikely]] {
         push_bits_into(c ^ 0x80, hi_hi, hi_lo);
       } else {
         push_bits_into(c, lo_hi, lo_lo);
       }
     };

     size_t i = 0;
 #pragma unroll
     for (; i < kMaxSmallSize; ++i) {
       PSIMD_DCHECK(keep[i]);
       push_char(static_cast<uint8_t>(keep[i]));
     }

     for (; i < keep_len; ++i) {
       PSIMD_DCHECK(keep[i]);
       push_char(static_cast<uint8_t>(keep[i]));
     }
     PSIMD_DCHECK(!keep[i]);
     if (include_nul) {
       push_char(0);
     }

     constexpr hn::Full128<uint64_t> d64;

     const auto lo = Dup128VecFromValues(d64, lo_lo, lo_hi);
     const auto hi = Dup128VecFromValues(d64, hi_lo, hi_hi);
     return VecBitSet{BitCast(d, hi), BitCast(d, lo)};
   }

   PSIMD_FLATTEN void flip_all_bits() {
     // Technically this is pointless if `!hi_relevant_`, but the extra check is
     // likely more expensive than just flipping the bits unconditionally.
     hi_ = Not(hi_);
     lo_ = Not(lo_);
     hi_flipped_ = !hi_flipped_;
   }

   PSIMD_FLATTEN optional<size_t> index_of_first_unset(FullVecTy v,
                                                       optional<size_t> ignore_first = {}) const {
     constexpr FullVec d;

     FullVecM eq_mask = test_membership(v, lo_);
     // If `hi_relevant_` is set, then we have chars outside of the standard
     // ASCII range. Technically permitted, but should be incredibly rare.
     if (hi_relevant_) [[unlikely]] {
       const FullVecTy hi_bits = Set(d, static_cast<ElemTy>(0x80));
       eq_mask = Or(eq_mask, test_membership(v ^ hi_bits, hi_));
     } else if (hi_flipped_) {
       eq_mask = Or(eq_mask, v < Zero(d));
     }
     if (ignore_first) {
       eq_mask = Or(eq_mask, FirstN(d, *ignore_first));
     }

     if (AllTrue(d, eq_mask)) {
       return {};
     }

     size_t index = FindKnownFirstTrue(d, Not(eq_mask));
     if (ignore_first) {
       index -= *ignore_first;
     }
     return optional<size_t>{index};
   }

  private:
   PSIMD_FLATTEN VecBitSet(FullVecTy hi, FullVecTy lo)
       : hi_(hi), lo_(lo), hi_relevant_(!AllBits0(FullVec(), hi)), hi_flipped_(false) {}

   // Does membership tests for each lane in v, returning a mask where lane i is
   // set if v[i] is in the bitset represented by lookups.
   //
   // All lanes of v must be in [0, 127]. Anything outside of that is zeroed in
   // the mask.
   PSIMD_FLATTEN static FullVecM test_membership(FullVecTy v, FullVecTy lookups) {
     constexpr FullVec d;

     // These are 128-bit vectors. Bit [i] indicates whether [i] is set.
     static_assert(sizeof(FullVecTy) == 16, "Expected 128-bit vectors");

     // This is a standard bit-set. Bit `i` will be in lane (i / 8) of the
     // lookup vector, at bit (i % 8).

     // ...To that end, move the bitset's lane `(i / 8)` into the lanes
     // corresponding to `v`.
     const auto index_parts = ShiftRight<3>(v);
     const auto lookup_masks = TableLookupBytesOr0(lookups, index_parts);

     // ...And then figure out the bit in each `v[i]` that needs to be set.
     const auto mask_shift_amounts = v & Set(d, 7);
     auto nth_bits = [&](auto v) PSIMD_FLATTEN {
       // The instruction sequence to emulate a bytewise `1 << v[i]` in SSE and
       // AVX are pretty massive; just look up from a table instead.
       if constexpr (kTargetIsX86OrX86_64) {
         alignas(d.MaxBytes()) static const int8_t bits[d.MaxLanes()] = {
             1u << 0, 1u << 1, 1u << 2, 1u << 3, 1u << 4, 1u << 5, 1u << 6, (int8_t)(1u << 7),
         };
         const auto loaded = Load(d, bits);
         return TableLookupBytes(loaded, v);
       } else {
         return Set(d, 1) << v;
       }
     };
     const auto shifted_masks = nth_bits(mask_shift_amounts);

     // ...Finally, it's as simple as `&`.
     return (lookup_masks & shifted_masks) != Zero(d);
   }

   FullVecTy hi_;
   FullVecTy lo_;
   // `hi_relevant_` tracks whether `hi_` is just all unset (or all set, in the
   // case of `hi_flipped_`).
   //
   // Testing this is trivial, should be consistently `false`, and lets us skip
   // a decent number of instructions if it is `false`.
   const bool hi_relevant_;
   // If `!hi_relevant_`, this indicates whether `hi` should be treated as all
   // `true` values.
   bool hi_flipped_;
 };

 // Mark this `noinline` because it's the uncommon case, and it adds a lot of
 // register pressure to its caller on x86_64.
 //
 // Visually inspecting `objdump`, `noinline` allows `strspn` to operate without
 // touching the stack unless it _has_ to call this function. With
 // `PSIMD_FLATTEN`, we end up unconditionally saving/restoring 3 registers for
 // no reason.
 __attribute__((noinline)) size_t strspn_large(const ElemTy* haystack, const ElemTy* keep,
                                               size_t keep_len, size_t start_offset) {
   const auto bitset = VecBitSet::from_large_string(keep, keep_len, /*include_nul=*/false);
   return start_offset +
          strspn_driver_aligned(haystack + start_offset,
                                [&](FullVecTy v, optional<size_t> ignore_first = {}) PSIMD_FLATTEN {
                                  return bitset.index_of_first_unset(v, ignore_first);
                                });
 }

 // Mark this `noinline` because it's the uncommon case, and it adds a lot of
 // register pressure to its caller on x86_64.
 //
 // Visually inspecting `objdump`, `noinline` allows `strcspn` to operate
 // without touching the stack unless it _has_ to call this function. With
 // `PSIMD_FLATTEN`, we end up unconditionally saving/restoring 4 registers for
 // no reason.
 __attribute__((noinline)) size_t strcspn_large(const ElemTy* haystack, const ElemTy* reject,
                                                size_t reject_len, size_t start_offset) {
   // `include_nul` since we're going to run `strspn`, but with membership flipped.
   auto bitset = VecBitSet::from_large_string(reject, reject_len, /*include_nul=*/true);
   bitset.flip_all_bits();
   return start_offset +
          strspn_driver_aligned(haystack + start_offset,
                                [&](FullVecTy v, optional<size_t> ignore_first = {}) PSIMD_FLATTEN {
                                  return bitset.index_of_first_unset(v, ignore_first);
                                });
 }
 }  // namespace
 }  // namespace portable_simd

 PSIMD_LIBC_FUNCTION(size_t, strspn, const char* raw_haystack, const char* raw_keep) {
   const auto* haystack = reinterpret_cast<const portable_simd::ElemTy*>(raw_haystack);
   const auto* keep = reinterpret_cast<const portable_simd::ElemTy*>(raw_keep);
   const auto small =
       portable_simd::try_dispatch_to_small_strspnish<portable_simd::StrspnSmallTraits>(haystack,
                                                                                        keep);
   if (small.result) [[likely]] {
     return *small.result;
   }
   return portable_simd::strspn_large(haystack, keep, small.set_len, small.haystack_scan_offset);
 }

 PSIMD_LIBC_FUNCTION(size_t, strcspn, const char* raw_haystack, const char* raw_reject) {
   const auto* haystack = reinterpret_cast<const portable_simd::ElemTy*>(raw_haystack);
   const auto* reject = reinterpret_cast<const portable_simd::ElemTy*>(raw_reject);
   const auto small =
       portable_simd::try_dispatch_to_small_strspnish<portable_simd::StrcspnSmallTraits>(haystack,
                                                                                         reject);
   if (small.result) [[likely]] {
     return *small.result;
   }
   return portable_simd::strcspn_large(haystack, reject, small.set_len, small.haystack_scan_offset);
 }

 PSIMD_STRONG_ALIAS(strspn);
 PSIMD_STRONG_ALIAS(strcspn);