libc/portable-simd/strlen.cpp - platform/bionic - Git at Google

 /*
  * Copyright (C) 2025 The Android Open Source Project
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  *  * Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  *  * Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in
  *    the documentation and/or other materials provided with the
  *    distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
  * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
  * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */

 #include <stdbit.h>
 #include <stdint.h>
 #include <string.h>

 #include "portable_simd_detail.h"
 #include "portable_simd_exports.h"

 namespace portable_simd {
 namespace {

 struct StrlenTraits {
   // Highway doesn't support direct `char` usage in vector types, presumably
   // because it can vary in signed-ness. `strlen` uses `min` to find `\0`, so
   // unsigned is simplest.
   //
   // It's notable that the highway quick_reference advises against unsigned
   // `min` operations for older CPUs:
   // https://google.github.io/highway/en/master/quick_reference.html#speeding-up-code-for-older-x86-platforms
   //
   // But it's unclear why; instruction timing tables indicate that unsigned and
   // signed bytewise `min` have identical timings.
   using CharType = uint8_t;
   using VectorTag = portable_simd::FullVector<CharType>;
   using VectorType = hn::VFromD<VectorTag>;
 };

 struct WcslenTraits {
   using CharType = uint32_t;
   using VectorTag = portable_simd::FullVector<CharType>;
   using VectorType = hn::VFromD<VectorTag>;
 };

 template <typename Traits>
 PSIMD_FLATTEN static optional<size_t> index_of_nul(typename Traits::VectorType val,
                                                    size_t chars_to_skip = 0) {
   constexpr typename Traits::VectorTag d;
   const auto all_zeroes = Zero(d);
   // NOTE: The mask that highway generates here is lane-wise.
   const size_t raw_zero_mask = BitsFromMask(d, all_zeroes == val);
   const size_t zero_mask = raw_zero_mask >> chars_to_skip;
   if (!zero_mask) {
     return {};
   }

   const size_t lane_index = stdc_trailing_zeros(zero_mask);
   return optional{lane_index};
 }

 template <typename Traits>
 PSIMD_FLATTEN static optional<const typename Traits::CharType*> ptr_of_nul(
     const void* ptr, typename Traits::VectorType val) {
   if (const optional<size_t> x = index_of_nul<Traits>(val)) {
     return optional{*x + static_cast<const Traits::CharType*>(ptr)};
   }
   return {};
 }

 template <typename Traits>
 PSIMD_FLATTEN static size_t strlen_vectorized(const typename Traits::CharType* s) {
   using CharType = Traits::CharType;
   using VectorTag = Traits::VectorTag;

   constexpr VectorTag d;

   auto [ptr, nul_distance] = align_forward_to_vec<VectorTag>(
       s, [&](auto val, optional<size_t> bytes_to_skip, optional<size_t>) -> optional<size_t> {
         size_t chars_to_skip = 0;
         if (bytes_to_skip) {
           // All wide-char pointers should be suitably aligned.
           PSIMD_DCHECK(*bytes_to_skip % sizeof(CharType) == 0);
           chars_to_skip = *bytes_to_skip / sizeof(CharType);
         }
         if (const optional<size_t> x = index_of_nul<Traits>(val, chars_to_skip)) {
           return optional{*x};
         }
         return {};
       });
   if (nul_distance) {
     return *nul_distance;
   }

   // The simplest implementation from here would be:
   //
   // while (true) {
   //   // check for nul, return if found
   //   ++ptr;
   // }
   //
   // `perf` says that x86_64 CPUs stall on the 'check for nul, return if found'
   // branch really badly, so it's a better balance if we can work in batches.
   // Since batch size must be a power of two, work in batches of 4.
   const auto check_ptr_and_inc = [&]() -> optional<size_t> {
     if (const optional<const CharType*> x = ptr_of_nul<Traits>(ptr, Load(d, ptr))) {
       return optional{static_cast<size_t>(*x - s)};
     }
     ptr += d.MaxLanes();
     return {};
   };

   // Of course, we can only work in batches if it's safe to read more than one
   // vector at a time.
   if (!kReadAheadToPageBoundaryIsOK) {
     while (true) {
       if (const optional<size_t> x = check_ptr_and_inc()) {
         return *x;
       }
     }
   }

   // Here, we have a trade-off to make: "how many very simple checks do we want
   // to do before hitting the loop that's really fast for long strings?"
   //
   // Between here and the "big string" loop, there are between 0 and 3
   // `check_ptr_and_inc`s to bring us up to alignment.
   //
   // We've checked between 1 and kVectorSize bytes so far, and it seems bad to
   // dive into the 'big' case having checked as little as 1 byte, so add a
   // few checks beforehand.
   constexpr size_t kMinBytesUntilStringIsBig = 128;
   // Worst case, we've only read 1 byte so far. Add 1 check to round up.
   constexpr size_t kExtraChecksNeeded = 1 + (kMinBytesUntilStringIsBig - 1) / d.MaxBytes();

 #pragma unroll
   for (size_t i = 0; i < kExtraChecksNeeded; ++i) {
     if (const optional<size_t> x = check_ptr_and_inc()) {
       return *x;
     }
   }

   // Now bring ourselves to 4*kVectorAlign alignment.
   constexpr size_t kFourVecAlign = 4 * vector_align(d);
   static_assert(kPageSize % kFourVecAlign == 0);
   const size_t vector_width_from_prev_align =
       (reinterpret_cast<uintptr_t>(ptr) & (kFourVecAlign - 1)) / vector_align(d);
   switch (vector_width_from_prev_align) {
     case 1:
       if (const optional<size_t> x = check_ptr_and_inc()) {
         return *x;
       }
       [[fallthrough]];
     case 2:
       if (const optional<size_t> x = check_ptr_and_inc()) {
         return *x;
       }
       [[fallthrough]];
     case 3:
       if (const optional<size_t> x = check_ptr_and_inc()) {
         return *x;
       }
       break;
     default:
       PSIMD_DCHECK(vector_width_from_prev_align == 0);
   }

   while (true) {
     const auto vec1 = Load(d, ptr);
     const auto vec2 = Load(d, ptr + d.MaxLanes());
     const auto min12 = Min(vec1, vec2);
     const auto vec3 = Load(d, ptr + d.MaxLanes() * 2);
     const auto vec4 = Load(d, ptr + d.MaxLanes() * 3);
     const auto min34 = Min(vec3, vec4);
     const auto min_all = Min(min12, min34);

     const optional<size_t> maybe_index = index_of_nul<Traits>(min_all);
     if (!maybe_index) [[likely]] {
       ptr += 4 * d.MaxLanes();
       continue;
     }

     const size_t index_of_first_zero = *maybe_index;
     // We're in the 'longer string' case, so there's no reason to assume `vec1`
     // is most likely to have the `\0`. Assuming all cases are equally likely,
     // prefer a constant 2 `ptr_of_nul` cost, rather than 1-3.
     if (const optional<const CharType*> x = ptr_of_nul<Traits>(ptr, min12)) {
       if (const optional<const CharType*> x2 = ptr_of_nul<Traits>(ptr, vec1)) {
         return static_cast<size_t>(*x2 - s);
       }
       // If vec1 had no zeroes, then we can assume the min 0 is from vec2.
       return static_cast<size_t>(*x - s) + d.MaxLanes();
     }

     ptr += d.MaxLanes() * 2;
     if (const optional<const CharType*> x = ptr_of_nul<Traits>(ptr, vec3)) {
       return static_cast<size_t>(*x - s);
     }

     // Since nothing else has nul in it, it's guaranteed that the nul is
     // vec4's, so we can use the above `index_of_nul` result.
     ptr += d.MaxLanes();
     return static_cast<size_t>(reinterpret_cast<const CharType*>(ptr) - s) + index_of_first_zero;
   }
 }
 }  // namespace
 }  // namespace portable_simd

 PSIMD_LIBC_FUNCTION(size_t, strlen, const char* s) {
   using portable_simd::StrlenTraits;
   return portable_simd::strlen_vectorized<StrlenTraits>(
       reinterpret_cast<const StrlenTraits::CharType*>(s));
 }

 static size_t simplistic_misaligned_wcslen(const wchar_t* s) {
   size_t len = 0;
   while (*s) {
     ++s;
     ++len;
   }
   return len;
 }

 PSIMD_LIBC_FUNCTION(size_t, wcslen, const wchar_t* s) {
   using portable_simd::WcslenTraits;

   // We might have received a misaligned pointer. Support that with a slow case
   // if needed. It's expected that the 99% case will be properly-aligned, so no
   // meaningful effort is put into making the misaligned case fast.
   if (reinterpret_cast<uintptr_t>(s) % alignof(wchar_t)) [[unlikely]] {
     return simplistic_misaligned_wcslen(s);
   }

   return portable_simd::strlen_vectorized<WcslenTraits>(
       reinterpret_cast<const WcslenTraits::CharType*>(s));
 }

 #if defined(__x86_64__)
 PSIMD_MAYBE_STRONG_ALIAS(strlen);
 PSIMD_MAYBE_STRONG_ALIAS(wcslen);
 #elif defined(__aarch64__)
 PSIMD_MAYBE_STRONG_ALIAS(wcslen);
 #endif
	/*
	* Copyright (C) 2025 The Android Open Source Project
	* All rights reserved.
	*
	* Redistribution and use in source and binary forms, with or without
	* modification, are permitted provided that the following conditions
	* are met:
	* * Redistributions of source code must retain the above copyright
	* notice, this list of conditions and the following disclaimer.
	* * Redistributions in binary form must reproduce the above copyright
	* notice, this list of conditions and the following disclaimer in
	* the documentation and/or other materials provided with the
	* distribution.
	*
	* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
	* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
	* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
	* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
	* COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
	* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
	* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
	* OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
	* AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
	* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
	* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
	* SUCH DAMAGE.
	*/

	#include <stdbit.h>
	#include <stdint.h>
	#include <string.h>

	#include "portable_simd_detail.h"
	#include "portable_simd_exports.h"

	namespace portable_simd {
	namespace {

	struct StrlenTraits {
	// Highway doesn't support direct `char` usage in vector types, presumably
	// because it can vary in signed-ness. `strlen` uses `min` to find `\0`, so
	// unsigned is simplest.
	//
	// It's notable that the highway quick_reference advises against unsigned
	// `min` operations for older CPUs:
	// https://google.github.io/highway/en/master/quick_reference.html#speeding-up-code-for-older-x86-platforms
	//
	// But it's unclear why; instruction timing tables indicate that unsigned and
	// signed bytewise `min` have identical timings.
	using CharType = uint8_t;
	using VectorTag = portable_simd::FullVector<CharType>;
	using VectorType = hn::VFromD<VectorTag>;
	};

	struct WcslenTraits {
	using CharType = uint32_t;
	using VectorTag = portable_simd::FullVector<CharType>;
	using VectorType = hn::VFromD<VectorTag>;
	};

	template <typename Traits>
	PSIMD_FLATTEN static optional<size_t> index_of_nul(typename Traits::VectorType val,
	size_t chars_to_skip = 0) {
	constexpr typename Traits::VectorTag d;
	const auto all_zeroes = Zero(d);
	// NOTE: The mask that highway generates here is lane-wise.
	const size_t raw_zero_mask = BitsFromMask(d, all_zeroes == val);
	const size_t zero_mask = raw_zero_mask >> chars_to_skip;
	if (!zero_mask) {
	return {};
	}

	const size_t lane_index = stdc_trailing_zeros(zero_mask);
	return optional{lane_index};
	}

	template <typename Traits>
	PSIMD_FLATTEN static optional<const typename Traits::CharType*> ptr_of_nul(
	const void* ptr, typename Traits::VectorType val) {
	if (const optional<size_t> x = index_of_nul<Traits>(val)) {
	return optional{x + static_cast<const Traits::CharType>(ptr)};
	}
	return {};
	}

	template <typename Traits>
	PSIMD_FLATTEN static size_t strlen_vectorized(const typename Traits::CharType* s) {
	using CharType = Traits::CharType;
	using VectorTag = Traits::VectorTag;

	constexpr VectorTag d;

	auto [ptr, nul_distance] = align_forward_to_vec<VectorTag>(
	s, [&](auto val, optional<size_t> bytes_to_skip, optional<size_t>) -> optional<size_t> {
	size_t chars_to_skip = 0;
	if (bytes_to_skip) {
	// All wide-char pointers should be suitably aligned.
	PSIMD_DCHECK(*bytes_to_skip % sizeof(CharType) == 0);
	chars_to_skip = *bytes_to_skip / sizeof(CharType);
	}
	if (const optional<size_t> x = index_of_nul<Traits>(val, chars_to_skip)) {
	return optional{*x};
	}
	return {};
	});
	if (nul_distance) {
	return *nul_distance;
	}

	// The simplest implementation from here would be:
	//
	// while (true) {
	// // check for nul, return if found
	// ++ptr;
	// }
	//
	// `perf` says that x86_64 CPUs stall on the 'check for nul, return if found'
	// branch really badly, so it's a better balance if we can work in batches.
	// Since batch size must be a power of two, work in batches of 4.
	const auto check_ptr_and_inc = [&]() -> optional<size_t> {
	if (const optional<const CharType*> x = ptr_of_nul<Traits>(ptr, Load(d, ptr))) {
	return optional{static_cast<size_t>(*x - s)};
	}
	ptr += d.MaxLanes();
	return {};
	};

	// Of course, we can only work in batches if it's safe to read more than one
	// vector at a time.
	if (!kReadAheadToPageBoundaryIsOK) {
	while (true) {
	if (const optional<size_t> x = check_ptr_and_inc()) {
	return *x;
	}
	}
	}

	// Here, we have a trade-off to make: "how many very simple checks do we want
	// to do before hitting the loop that's really fast for long strings?"
	//
	// Between here and the "big string" loop, there are between 0 and 3
	// `check_ptr_and_inc`s to bring us up to alignment.
	//
	// We've checked between 1 and kVectorSize bytes so far, and it seems bad to
	// dive into the 'big' case having checked as little as 1 byte, so add a
	// few checks beforehand.
	constexpr size_t kMinBytesUntilStringIsBig = 128;
	// Worst case, we've only read 1 byte so far. Add 1 check to round up.
	constexpr size_t kExtraChecksNeeded = 1 + (kMinBytesUntilStringIsBig - 1) / d.MaxBytes();

	#pragma unroll
	for (size_t i = 0; i < kExtraChecksNeeded; ++i) {
	if (const optional<size_t> x = check_ptr_and_inc()) {
	return *x;
	}
	}

	// Now bring ourselves to 4*kVectorAlign alignment.
	constexpr size_t kFourVecAlign = 4 * vector_align(d);
	static_assert(kPageSize % kFourVecAlign == 0);
	const size_t vector_width_from_prev_align =
	(reinterpret_cast<uintptr_t>(ptr) & (kFourVecAlign - 1)) / vector_align(d);
	switch (vector_width_from_prev_align) {
	case 1:
	if (const optional<size_t> x = check_ptr_and_inc()) {
	return *x;
	}
	[[fallthrough]];
	case 2:
	if (const optional<size_t> x = check_ptr_and_inc()) {
	return *x;
	}
	[[fallthrough]];
	case 3:
	if (const optional<size_t> x = check_ptr_and_inc()) {
	return *x;
	}
	break;
	default:
	PSIMD_DCHECK(vector_width_from_prev_align == 0);
	}

	while (true) {
	const auto vec1 = Load(d, ptr);
	const auto vec2 = Load(d, ptr + d.MaxLanes());
	const auto min12 = Min(vec1, vec2);
	const auto vec3 = Load(d, ptr + d.MaxLanes() * 2);
	const auto vec4 = Load(d, ptr + d.MaxLanes() * 3);
	const auto min34 = Min(vec3, vec4);
	const auto min_all = Min(min12, min34);

	const optional<size_t> maybe_index = index_of_nul<Traits>(min_all);
	if (!maybe_index) [[likely]] {
	ptr += 4 * d.MaxLanes();
	continue;
	}

	const size_t index_of_first_zero = *maybe_index;
	// We're in the 'longer string' case, so there's no reason to assume `vec1`
	// is most likely to have the `\0`. Assuming all cases are equally likely,
	// prefer a constant 2 `ptr_of_nul` cost, rather than 1-3.
	if (const optional<const CharType*> x = ptr_of_nul<Traits>(ptr, min12)) {
	if (const optional<const CharType*> x2 = ptr_of_nul<Traits>(ptr, vec1)) {
	return static_cast<size_t>(*x2 - s);
	}
	// If vec1 had no zeroes, then we can assume the min 0 is from vec2.
	return static_cast<size_t>(*x - s) + d.MaxLanes();
	}

	ptr += d.MaxLanes() * 2;
	if (const optional<const CharType*> x = ptr_of_nul<Traits>(ptr, vec3)) {
	return static_cast<size_t>(*x - s);
	}

	// Since nothing else has nul in it, it's guaranteed that the nul is
	// vec4's, so we can use the above `index_of_nul` result.
	ptr += d.MaxLanes();
	return static_cast<size_t>(reinterpret_cast<const CharType*>(ptr) - s) + index_of_first_zero;
	}
	}
	} // namespace
	} // namespace portable_simd

	PSIMD_LIBC_FUNCTION(size_t, strlen, const char* s) {
	using portable_simd::StrlenTraits;
	return portable_simd::strlen_vectorized<StrlenTraits>(
	reinterpret_cast<const StrlenTraits::CharType*>(s));
	}

	static size_t simplistic_misaligned_wcslen(const wchar_t* s) {
	size_t len = 0;
	while (*s) {
	++s;
	++len;
	}
	return len;
	}

	PSIMD_LIBC_FUNCTION(size_t, wcslen, const wchar_t* s) {
	using portable_simd::WcslenTraits;

	// We might have received a misaligned pointer. Support that with a slow case
	// if needed. It's expected that the 99% case will be properly-aligned, so no
	// meaningful effort is put into making the misaligned case fast.
	if (reinterpret_cast<uintptr_t>(s) % alignof(wchar_t)) [[unlikely]] {
	return simplistic_misaligned_wcslen(s);
	}

	return portable_simd::strlen_vectorized<WcslenTraits>(
	reinterpret_cast<const WcslenTraits::CharType*>(s));
	}

	#if defined(__x86_64__)
	PSIMD_MAYBE_STRONG_ALIAS(strlen);
	PSIMD_MAYBE_STRONG_ALIAS(wcslen);
	#elif defined(__aarch64__)
	PSIMD_MAYBE_STRONG_ALIAS(wcslen);
	#endif