Merge "Upgrade arm-optimized-routines to e112794669739057178f5ae8c94ccf0f8ca59c60"
diff --git a/METADATA b/METADATA
index bbb8e3f..f2b01ff 100644
--- a/METADATA
+++ b/METADATA
@@ -9,11 +9,11 @@
type: GIT
value: "https://github.com/ARM-software/optimized-routines.git"
}
- version: "30c1ada57d6af777f44826ae31f92ceeffcbe02b"
+ version: "e112794669739057178f5ae8c94ccf0f8ca59c60"
license_type: NOTICE
last_upgrade_date {
year: 2020
- month: 4
+ month: 5
day: 1
}
}
diff --git a/config.mk.dist b/config.mk.dist
index 2336c52..cac40eb 100644
--- a/config.mk.dist
+++ b/config.mk.dist
@@ -60,6 +60,9 @@
#math-ulpflags = -q -f
#math-testflags = -nostatus
+# Remove GNU Property Notes from asm files.
+#string-cflags += -DWANT_GNU_PROPERTY=0
+
# Enable assertion checks.
#networking-cflags += -DWANT_ASSERT
diff --git a/string/Dir.mk b/string/Dir.mk
index bb881a3..400eda8 100644
--- a/string/Dir.mk
+++ b/string/Dir.mk
@@ -22,7 +22,7 @@
build/lib/libstringlib.so \
build/lib/libstringlib.a \
-string-tools := \
+string-tests := \
build/bin/test/memcpy \
build/bin/test/memmove \
build/bin/test/memset \
@@ -53,11 +53,11 @@
string-files := \
$(string-objs) \
$(string-libs) \
- $(string-tools) \
+ $(string-tests) \
$(string-benches) \
$(string-includes) \
-all-string: $(string-libs) $(string-tools) $(string-benches) $(string-includes)
+all-string: $(string-libs) $(string-tests) $(string-benches) $(string-includes)
$(string-objs): $(string-includes)
$(string-objs): CFLAGS_ALL += $(string-cflags)
@@ -82,21 +82,14 @@
build/bin/%.sh: $(S)/test/%.sh
cp $< $@
-check-string: $(string-tools)
- $(EMULATOR) build/bin/test/memcpy
- $(EMULATOR) build/bin/test/memmove
- $(EMULATOR) build/bin/test/memset
- $(EMULATOR) build/bin/test/memchr
- $(EMULATOR) build/bin/test/memcmp
- $(EMULATOR) build/bin/test/strcpy
- $(EMULATOR) build/bin/test/stpcpy
- $(EMULATOR) build/bin/test/strcmp
- $(EMULATOR) build/bin/test/strchr
- $(EMULATOR) build/bin/test/strrchr
- $(EMULATOR) build/bin/test/strchrnul
- $(EMULATOR) build/bin/test/strlen
- $(EMULATOR) build/bin/test/strnlen
- $(EMULATOR) build/bin/test/strncmp
+string-tests-out = $(string-tests:build/bin/test/%=build/string/test/%.out)
+
+build/string/test/%.out: build/bin/test/%
+ $(EMULATOR) $^ | tee $@.tmp
+ mv $@.tmp $@
+
+check-string: $(string-tests-out)
+ ! grep FAIL $^
bench-string: $(string-benches)
$(EMULATOR) build/bin/bench/memcpy
diff --git a/string/aarch64/check-arch.S b/string/aarch64/check-arch.S
index d6775a4..4537aab 100644
--- a/string/aarch64/check-arch.S
+++ b/string/aarch64/check-arch.S
@@ -8,3 +8,6 @@
#if !__aarch64__
# error ARCH setting does not match the compiler.
#endif
+
+#include "../asmdefs.h"
+END_FILE
diff --git a/string/aarch64/memchr-mte.S b/string/aarch64/memchr-mte.S
new file mode 100644
index 0000000..0a869c7
--- /dev/null
+++ b/string/aarch64/memchr-mte.S
@@ -0,0 +1,150 @@
+/*
+ * memchr - find a character in a memory zone
+ *
+ * Copyright (c) 2014-2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64
+ * Neon Available.
+ */
+
+#include "../asmdefs.h"
+
+/* Arguments and results. */
+#define srcin x0
+#define chrin w1
+#define cntin x2
+
+#define result x0
+
+#define src x3
+#define tmp x4
+#define tmp2 x5
+#define wtmp2 w5
+#define synd x6
+#define soff x9
+#define cntrem x10
+
+#define vrepchr v0
+#define qdata q1
+#define vdata v1
+#define vhas_chr v2
+#define vrepmask v3
+#define vend v4
+
+/*
+ * Core algorithm:
+ *
+ * For each 16-byte chunk we calculate a 64-bit syndrome value, with four bits
+ * per byte. For each tuple, bit 0 is set if the relevant byte matched the
+ * requested character and bits 1, 2 and 3 are not used (faster than using a lower
+ * bit syndrome). Since the bits in the syndrome reflect exactly the order in
+ * which things occur in the original string, counting trailing zeros allows us
+ * to identify exactly which byte has matched.
+ */
+
+ENTRY (__memchr_aarch64_mte)
+ /* Do not dereference srcin if no bytes to compare. */
+ cbz cntin, L(zero_length)
+ /*
+ * Magic constant 0x10011001 allows us to identify which lane matches
+ * the requested byte.
+ */
+ mov wtmp2, #0x1001
+ movk wtmp2, #0x1001, lsl #16
+ dup vrepchr.16b, chrin
+ /* Work with aligned 16-byte chunks */
+ bic src, srcin, #15
+ dup vrepmask.4s, wtmp2
+ ands soff, srcin, #15
+ and cntrem, cntin, #15
+ b.eq L(aligned_start)
+
+ /*
+ * Input string is not 16-byte aligned. We calculate the syndrome
+ * value for the aligned 16 bytes block containing the first bytes
+ * and mask the irrelevant part.
+ */
+
+ ld1 {vdata.16b}, [src], #16
+ sub tmp, soff, #16
+ adds cntin, cntin, tmp
+ cmeq vhas_chr.16b, vdata.16b, vrepchr.16b
+ lsl tmp, soff, #2
+ mov tmp2, #~0
+ and vhas_chr.16b, vhas_chr.16b, vrepmask.16b
+ lsl tmp, tmp2, tmp
+ addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */
+ mov synd, vend.d[0]
+ /* Clear the soff*4 lower bits */
+ and synd, synd, tmp
+ /* The first block can also be the last */
+ b.ls L(masklast)
+ /* Have we found something already? */
+ cbnz synd, L(tail)
+
+L(aligned_start):
+ /* Make sure that it won't overread by a 16-byte chunk */
+ add tmp, cntin, #15
+ tbnz tmp, 4, L(loop32_2)
+
+L(loop32):
+ ld1 {vdata.16b}, [src], #16
+ subs cntin, cntin, #16
+ cmeq vhas_chr.16b, vdata.16b, vrepchr.16b
+ addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */
+ mov synd, vend.d[0]
+ cbnz synd, L(end)
+
+L(loop32_2):
+ ld1 {vdata.16b}, [src], #16
+ subs cntin, cntin, #16
+ cmeq vhas_chr.16b, vdata.16b, vrepchr.16b
+ b.ls L(end)
+ addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */
+ mov synd, vend.d[0]
+ /* We haven't found the character, loop with 32 byte chunks */
+ cbz synd, L(loop32)
+
+L(end):
+ /* Termination condition found, let's calculate the syndrome value */
+ and vhas_chr.16b, vhas_chr.16b, vrepmask.16b
+ addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */
+ mov synd, vend.d[0]
+ /* Only do the clear for the last possible block */
+ b.hs L(tail)
+
+L(masklast):
+ /* Clear the (16 - ((cntrem + soff) % 16)) * 4 upper bits */
+ add tmp, cntrem, soff
+ and tmp, tmp, #15
+ sub tmp, tmp, #16
+ neg tmp, tmp, lsl #2
+ lsl synd, synd, tmp
+ lsr synd, synd, tmp
+
+L(tail):
+ /* Count the trailing zeros using bit reversing */
+ rbit synd, synd
+ /* Compensate the last post-increment */
+ sub src, src, #16
+ /* Check that we have found a character */
+ cmp synd, #0
+ /* And count the leading zeros */
+ clz synd, synd
+ /* Compute the potential result */
+ add result, src, synd, lsr #2
+ /* Select result or NULL */
+ csel result, xzr, result, eq
+ ret
+
+L(zero_length):
+ mov result, #0
+ ret
+
+END (__memchr_aarch64_mte)
+
+END_FILE
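
The new memchr-mte.S above leans on the syndrome scheme its header comment describes, but the decode is spread across several branches. As a reading aid, a scalar C sketch of the two key steps follows; it is an illustration only (the helper names are invented and __builtin_ctzll assumes a GCC/Clang-style compiler), not part of the patch:

    #include <stdint.h>

    /* Illustrative scalar model of the syndrome handling in memchr-mte.S.
       "syndrome" packs 4 bits per byte of a 16-byte chunk, with bit 4*i set
       iff byte i matched the requested character (bits 4*i+1..4*i+3 unused). */

    /* Mirror of "Clear the soff*4 lower bits": drop matches that fall in the
       soff bytes loaded from before the start of the buffer.  soff is 0..15. */
    static inline uint64_t mask_leading_bytes (uint64_t syndrome, unsigned soff)
    {
      return syndrome & (~0ULL << (4 * soff));
    }

    /* Mirror of the rbit/clz decode in L(tail): index of the first match in
       the chunk, or 16 if the chunk contained no match. */
    static inline unsigned first_match_index (uint64_t syndrome)
    {
      return syndrome ? (unsigned) __builtin_ctzll (syndrome) / 4 : 16;
    }

Applying first_match_index to the masked syndrome reproduces what the rbit/clz/add sequence in L(tail) computes.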
diff --git a/string/aarch64/memchr-sve.S b/string/aarch64/memchr-sve.S
index 58badd2..53efd4c 100644
--- a/string/aarch64/memchr-sve.S
+++ b/string/aarch64/memchr-sve.S
@@ -5,6 +5,8 @@
* SPDX-License-Identifier: MIT
*/
+#include "../asmdefs.h"
+
#if __ARM_FEATURE_SVE
/* Assumptions:
*
@@ -15,10 +17,7 @@
.arch armv8-a+sve
.text
- .globl __memchr_aarch64_sve
- .type __memchr_aarch64_sve, %function
- .p2align 4
-__memchr_aarch64_sve:
+ENTRY_ALIGN(__memchr_aarch64_sve, 4)
dup z1.b, w1 /* duplicate c to a vector */
setffr /* initialize FFR */
mov x3, 0 /* initialize off */
@@ -60,5 +59,8 @@
9: mov x0, 0 /* return null */
ret
- .size __memchr_aarch64_sve, . - __memchr_aarch64_sve
+END (__memchr_aarch64_sve)
+
#endif
+
+END_FILE
diff --git a/string/aarch64/memchr.S b/string/aarch64/memchr.S
index 10be49e..f5538bd 100644
--- a/string/aarch64/memchr.S
+++ b/string/aarch64/memchr.S
@@ -110,7 +110,7 @@
addp vend.16b, vend.16b, vend.16b /* 128->64 */
mov synd, vend.d[0]
/* Only do the clear for the last possible block */
- b.hi L(tail)
+ b.hs L(tail)
L(masklast):
/* Clear the (32 - ((cntrem + soff) % 32)) * 2 upper bits */
@@ -141,3 +141,5 @@
ret
END (__memchr_aarch64)
+
+END_FILE
diff --git a/string/aarch64/memcmp-sve.S b/string/aarch64/memcmp-sve.S
index c216103..07512ba 100644
--- a/string/aarch64/memcmp-sve.S
+++ b/string/aarch64/memcmp-sve.S
@@ -5,6 +5,8 @@
* SPDX-License-Identifier: MIT
*/
+#include "../asmdefs.h"
+
#if __ARM_FEATURE_SVE
/* Assumptions:
*
@@ -15,10 +17,7 @@
.arch armv8-a+sve
.text
- .globl __memcmp_aarch64_sve
- .type __memcmp_aarch64_sve, %function
- .p2align 4
-__memcmp_aarch64_sve:
+ENTRY_ALIGN (__memcmp_aarch64_sve, 4)
mov x3, 0 /* initialize off */
0: whilelo p0.b, x3, x2 /* while off < max */
@@ -46,5 +45,8 @@
9: mov x0, 0 /* return equality */
ret
- .size __memcmp_aarch64_sve, . - __memcmp_aarch64_sve
+END (__memcmp_aarch64_sve)
+
#endif
+
+END_FILE
diff --git a/string/aarch64/memcmp.S b/string/aarch64/memcmp.S
index 6722516..4be23de 100644
--- a/string/aarch64/memcmp.S
+++ b/string/aarch64/memcmp.S
@@ -131,3 +131,5 @@
ret
END (__memcmp_aarch64)
+
+END_FILE
diff --git a/string/aarch64/memcpy-advsimd.S b/string/aarch64/memcpy-advsimd.S
index f7dce55..844cc41 100644
--- a/string/aarch64/memcpy-advsimd.S
+++ b/string/aarch64/memcpy-advsimd.S
@@ -50,8 +50,8 @@
The loop tail is handled by always copying 64 bytes from the end.
*/
-ENTRY (__memcpy_aarch64_simd)
ENTRY_ALIAS (__memmove_aarch64_simd)
+ENTRY (__memcpy_aarch64_simd)
add srcend, src, count
add dstend, dstin, count
cmp count, 128
@@ -199,3 +199,5 @@
ret
END (__memcpy_aarch64_simd)
+
+END_FILE
diff --git a/string/aarch64/memcpy.S b/string/aarch64/memcpy.S
index 060b794..c91f6e5 100644
--- a/string/aarch64/memcpy.S
+++ b/string/aarch64/memcpy.S
@@ -53,8 +53,8 @@
The loop tail is handled by always copying 64 bytes from the end.
*/
-ENTRY (__memcpy_aarch64)
ENTRY_ALIAS (__memmove_aarch64)
+ENTRY (__memcpy_aarch64)
add srcend, src, count
add dstend, dstin, count
cmp count, 128
@@ -237,3 +237,5 @@
ret
END (__memcpy_aarch64)
+
+END_FILE
diff --git a/string/aarch64/memset.S b/string/aarch64/memset.S
index aa580df..7c3e4f4 100644
--- a/string/aarch64/memset.S
+++ b/string/aarch64/memset.S
@@ -112,3 +112,5 @@
ret
END (__memset_aarch64)
+
+END_FILE
diff --git a/string/aarch64/strchr-mte.S b/string/aarch64/strchr-mte.S
index 2041e73..b9a5e71 100644
--- a/string/aarch64/strchr-mte.S
+++ b/string/aarch64/strchr-mte.S
@@ -1,130 +1,105 @@
/*
* strchr - find a character in a string
*
- * Copyright (c) 2014-2020, Arm Limited.
+ * Copyright (c) 2020, Arm Limited.
* SPDX-License-Identifier: MIT
*/
/* Assumptions:
*
- * ARMv8-a, AArch64
- * Neon Available.
+ * ARMv8-a, AArch64, Advanced SIMD.
+ * MTE compatible.
*/
#include "../asmdefs.h"
-/* Arguments and results. */
#define srcin x0
#define chrin w1
-
#define result x0
#define src x2
-#define tmp1 x3
-#define wtmp2 w4
-#define tmp3 x5
+#define tmp1 x1
+#define wtmp2 w3
+#define tmp3 x3
#define vrepchr v0
-#define qdata q1
#define vdata v1
+#define qdata q1
#define vhas_nul v2
#define vhas_chr v3
-#define vrepmask_0 v4
-#define vrepmask_c v5
+#define vrepmask v4
+#define vrepmask2 v5
#define vend v6
-
-#define L(l) .L ## l
+#define dend d6
/* Core algorithm.
- For each 16-byte chunk we calculate a 64-bit syndrome value, with
- four bits per byte (LSB is always in bits 0 and 1, for both big
- and little-endian systems). For each tuple, bit 0 is set if
- the relevant byte matched the requested character; bit 1 is set
- if the relevant byte matched the NUL end of string (we trigger
- off bit0 for the special case of looking for NUL) and bits 2 and 3
- are not used.
- Since the bits in the syndrome reflect exactly the order in which
- things occur in the original string a count_trailing_zeros()
- operation will identify exactly which byte is causing the termination,
- and why. */
+ For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
+ per byte. For even bytes, bits 0-1 are set if the relevant byte matched the
+ requested character, bits 2-3 are set if the byte is NUL (or matched), and
+   bits 4-7 are not used and must be zero if none of bits 0-3 are set. Odd
+ bytes set bits 4-7 so that adjacent bytes can be merged. Since the bits
+ in the syndrome reflect the order in which things occur in the original
+ string, counting trailing zeros identifies exactly which byte matched. */
-/* Locals and temporaries. */
-
-ENTRY(__strchr_aarch64_mte)
- /* Magic constant 0x10011001 to allow us to identify which lane
- matches the requested byte. Magic constant 0x20022002 used
- similarly for NUL termination. */
- mov wtmp2, #0x1001
- movk wtmp2, #0x1001, lsl #16
+ENTRY (__strchr_aarch64_mte)
+ bic src, srcin, 15
dup vrepchr.16b, chrin
- bic src, srcin, #15 /* Work with aligned 16-byte chunks. */
- dup vrepmask_c.4s, wtmp2
- ands tmp1, srcin, #15
- add vrepmask_0.4s, vrepmask_c.4s, vrepmask_c.4s /* equiv: lsl #1 */
- b.eq L(loop)
-
- /* Input string is not 16-byte aligned. Rather than forcing
- the padding bytes to a safe value, we calculate the syndrome
- for all the bytes, but then mask off those bits of the
- syndrome that are related to the padding. */
- ldr qdata, [src], #16
- cmeq vhas_nul.16b, vdata.16b, #0
+ ld1 {vdata.16b}, [src]
+ mov wtmp2, 0x3003
+ dup vrepmask.8h, wtmp2
+ cmeq vhas_nul.16b, vdata.16b, 0
cmeq vhas_chr.16b, vdata.16b, vrepchr.16b
- and vhas_nul.16b, vhas_nul.16b, vrepmask_0.16b
- and vhas_chr.16b, vhas_chr.16b, vrepmask_c.16b
- lsl tmp1, tmp1, #2
- orr vend.16b, vhas_nul.16b, vhas_chr.16b
- mov tmp3, #~0
- addp vend.16b, vend.16b, vend.16b /* 128->64 */
- lsl tmp1, tmp3, tmp1
+ mov wtmp2, 0xf00f
+ dup vrepmask2.8h, wtmp2
- mov tmp3, vend.d[0]
- ands tmp1, tmp3, tmp1 /* Mask padding bits. */
- b.ne L(tail)
+ bit vhas_nul.16b, vhas_chr.16b, vrepmask.16b
+ and vhas_nul.16b, vhas_nul.16b, vrepmask2.16b
+ lsl tmp3, srcin, 2
+ addp vend.16b, vhas_nul.16b, vhas_nul.16b /* 128->64 */
-L(loop):
- ldr qdata, [src], #32
- cmeq vhas_nul.16b, vdata.16b, #0
- cmeq vhas_chr.16b, vdata.16b, vrepchr.16b
- /* Use a fast check for the termination condition. */
- orr vend.16b, vhas_nul.16b, vhas_chr.16b
- addp vend.16b, vend.16b, vend.16b /* 128->64 */
- mov tmp1, vend.d[0]
- cbnz tmp1, L(end)
-
- ldr qdata, [src, #-16]
- cmeq vhas_nul.16b, vdata.16b, #0
- cmeq vhas_chr.16b, vdata.16b, vrepchr.16b
- /* Use a fast check for the termination condition. */
- orr vend.16b, vhas_nul.16b, vhas_chr.16b
- addp vend.16b, vend.16b, vend.16b /* 128->64 */
- mov tmp1, vend.d[0]
+ fmov tmp1, dend
+ lsr tmp1, tmp1, tmp3
cbz tmp1, L(loop)
- /* Adjust src for next two subtractions. */
- add src, src, #16
-L(end):
- /* Termination condition found. Now need to establish exactly why
- we terminated. */
- and vhas_nul.16b, vhas_nul.16b, vrepmask_0.16b
- and vhas_chr.16b, vhas_chr.16b, vrepmask_c.16b
- sub src, src, #16
- orr vend.16b, vhas_nul.16b, vhas_chr.16b
- addp vend.16b, vend.16b, vend.16b /* 128->64 */
-
- mov tmp1, vend.d[0]
-L(tail):
- /* Count the trailing zeros, by bit reversing... */
rbit tmp1, tmp1
- /* Re-bias source. */
- sub src, src, #16
- clz tmp1, tmp1 /* And counting the leading zeros. */
- /* Tmp1 is even if the target character was found first. Otherwise
- we've found the end of string and we weren't looking for NUL. */
- tst tmp1, #1
- add result, src, tmp1, lsr #2
+ clz tmp1, tmp1
+	/* Bit 1 of tmp1 is clear if the target character was found first;
+	   otherwise we have found the end of the string. */
+ tst tmp1, 2
+ add result, srcin, tmp1, lsr 2
csel result, result, xzr, eq
ret
-END(__strchr_aarch64_mte)
+ .p2align 4
+L(loop):
+ ldr qdata, [src, 16]!
+ cmeq vhas_chr.16b, vdata.16b, vrepchr.16b
+ cmhs vhas_nul.16b, vhas_chr.16b, vdata.16b
+ umaxp vend.16b, vhas_nul.16b, vhas_nul.16b
+ fmov tmp1, dend
+ cbz tmp1, L(loop)
+
+#ifdef __AARCH64EB__
+ bif vhas_nul.16b, vhas_chr.16b, vrepmask.16b
+ and vhas_nul.16b, vhas_nul.16b, vrepmask2.16b
+ addp vend.16b, vhas_nul.16b, vhas_nul.16b /* 128->64 */
+ fmov tmp1, dend
+#else
+ bit vhas_nul.16b, vhas_chr.16b, vrepmask.16b
+ and vhas_nul.16b, vhas_nul.16b, vrepmask2.16b
+ addp vend.16b, vhas_nul.16b, vhas_nul.16b /* 128->64 */
+ fmov tmp1, dend
+ rbit tmp1, tmp1
+#endif
+ clz tmp1, tmp1
+	/* Bit 1 of tmp1 is clear if the target character was found first;
+	   otherwise we have found the end of the string. */
+ tst tmp1, 2
+ add result, src, tmp1, lsr 2
+ csel result, result, xzr, eq
+ ret
+
+END (__strchr_aarch64_mte)
+
+END_FILE
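
The rewritten strchr-mte.S packs two events per byte into the syndrome: the low pair of each 4-bit group records a character match, the high pair a NUL (or match). The "tst tmp1, 2" after the trailing-zero count is what separates the two cases. A little-endian C model of that decode, with invented names and purely for illustration:

    #include <stdint.h>

    /* Little-endian model of the final decode in strchr-mte.S: within each
       byte's 4-bit syndrome group, bits 0-1 mark a character match and bits
       2-3 mark a NUL (or a match).  Precondition: syndrome != 0, i.e. the
       loop already saw a match or the end of the string. */
    static inline const char *strchr_decode (const char *chunk, uint64_t syndrome)
    {
      unsigned tz = (unsigned) __builtin_ctzll (syndrome);
      if (tz & 2)                /* the NUL half of the group was hit first */
        return 0;
      return chunk + (tz >> 2);  /* 4 syndrome bits per byte */
    }

The csel in the assembly performs the same NULL-vs-pointer selection without a branch.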
diff --git a/string/aarch64/strchr-sve.S b/string/aarch64/strchr-sve.S
index 35d5dd9..ba02bf6 100644
--- a/string/aarch64/strchr-sve.S
+++ b/string/aarch64/strchr-sve.S
@@ -5,6 +5,8 @@
* SPDX-License-Identifier: MIT
*/
+#include "../asmdefs.h"
+
#if __ARM_FEATURE_SVE
/* Assumptions:
*
@@ -22,10 +24,7 @@
#define FUNC __strchr_aarch64_sve
#endif
- .globl FUNC
- .type FUNC, %function
- .p2align 4
-FUNC:
+ENTRY_ALIGN (FUNC, 4)
dup z1.b, w1 /* replicate byte across vector */
setffr /* initialize FFR */
ptrue p1.b /* all ones; loop invariant */
@@ -67,5 +66,8 @@
incp x0, p0.b
b 0b
- .size FUNC, . - FUNC
+END (FUNC)
+
#endif
+
+END_FILE
diff --git a/string/aarch64/strchr.S b/string/aarch64/strchr.S
index 00d9be3..39241a3 100644
--- a/string/aarch64/strchr.S
+++ b/string/aarch64/strchr.S
@@ -51,11 +51,11 @@
/* Locals and temporaries. */
ENTRY (__strchr_aarch64)
- /* Magic constant 0x40100401 to allow us to identify which lane
- matches the requested byte. Magic constant 0x80200802 used
- similarly for NUL termination. */
- mov wtmp2, #0x0401
- movk wtmp2, #0x4010, lsl #16
+ /* Magic constant 0xc0300c03 to allow us to identify which lane
+ matches the requested byte. Even bits are set if the character
+ matches, odd bits if either the char is NUL or matches. */
+ mov wtmp2, 0x0c03
+ movk wtmp2, 0xc030, lsl 16
dup vrepchr.16b, chrin
bic src, srcin, #31 /* Work with aligned 32-byte hunks. */
dup vrepmask_c.4s, wtmp2
@@ -73,12 +73,10 @@
cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b
cmeq vhas_nul2.16b, vdata2.16b, #0
cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b
- and vhas_nul1.16b, vhas_nul1.16b, vrepmask_0.16b
- and vhas_nul2.16b, vhas_nul2.16b, vrepmask_0.16b
- and vhas_chr1.16b, vhas_chr1.16b, vrepmask_c.16b
- and vhas_chr2.16b, vhas_chr2.16b, vrepmask_c.16b
- orr vend1.16b, vhas_nul1.16b, vhas_chr1.16b
- orr vend2.16b, vhas_nul2.16b, vhas_chr2.16b
+ bif vhas_nul1.16b, vhas_chr1.16b, vrepmask_0.16b
+ bif vhas_nul2.16b, vhas_chr2.16b, vrepmask_0.16b
+ and vend1.16b, vhas_nul1.16b, vrepmask_c.16b
+ and vend2.16b, vhas_nul2.16b, vrepmask_c.16b
lsl tmp1, tmp1, #1
addp vend1.16b, vend1.16b, vend2.16b // 256->128
mov tmp3, #~0
@@ -89,31 +87,26 @@
bic tmp1, tmp3, tmp1 // Mask padding bits.
cbnz tmp1, L(tail)
+ .p2align 4
L(loop):
ld1 {vdata1.16b, vdata2.16b}, [src], #32
- cmeq vhas_nul1.16b, vdata1.16b, #0
cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b
- cmeq vhas_nul2.16b, vdata2.16b, #0
cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b
- /* Use a fast check for the termination condition. */
- orr vend1.16b, vhas_nul1.16b, vhas_chr1.16b
- orr vend2.16b, vhas_nul2.16b, vhas_chr2.16b
- orr vend1.16b, vend1.16b, vend2.16b
- addp vend1.2d, vend1.2d, vend1.2d
+ cmhs vhas_nul1.16b, vhas_chr1.16b, vdata1.16b
+ cmhs vhas_nul2.16b, vhas_chr2.16b, vdata2.16b
+ orr vend1.16b, vhas_nul1.16b, vhas_nul2.16b
+ umaxp vend1.16b, vend1.16b, vend1.16b
mov tmp1, vend1.d[0]
cbz tmp1, L(loop)
/* Termination condition found. Now need to establish exactly why
we terminated. */
- and vhas_nul1.16b, vhas_nul1.16b, vrepmask_0.16b
- and vhas_nul2.16b, vhas_nul2.16b, vrepmask_0.16b
- and vhas_chr1.16b, vhas_chr1.16b, vrepmask_c.16b
- and vhas_chr2.16b, vhas_chr2.16b, vrepmask_c.16b
- orr vend1.16b, vhas_nul1.16b, vhas_chr1.16b
- orr vend2.16b, vhas_nul2.16b, vhas_chr2.16b
+ bif vhas_nul1.16b, vhas_chr1.16b, vrepmask_0.16b
+ bif vhas_nul2.16b, vhas_chr2.16b, vrepmask_0.16b
+ and vend1.16b, vhas_nul1.16b, vrepmask_c.16b
+ and vend2.16b, vhas_nul2.16b, vrepmask_c.16b
addp vend1.16b, vend1.16b, vend2.16b // 256->128
addp vend1.16b, vend1.16b, vend2.16b // 128->64
-
mov tmp1, vend1.d[0]
L(tail):
/* Count the trailing zeros, by bit reversing... */
@@ -129,3 +122,5 @@
ret
END (__strchr_aarch64)
+
+END_FILE
diff --git a/string/aarch64/strchrnul-mte.S b/string/aarch64/strchrnul-mte.S
new file mode 100644
index 0000000..c813939
--- /dev/null
+++ b/string/aarch64/strchrnul-mte.S
@@ -0,0 +1,84 @@
+/*
+ * strchrnul - find a character or nul in a string
+ *
+ * Copyright (c) 2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64, Advanced SIMD.
+ * MTE compatible.
+ */
+
+#include "../asmdefs.h"
+
+#define srcin x0
+#define chrin w1
+#define result x0
+
+#define src x2
+#define tmp1 x1
+#define tmp2 x3
+#define tmp2w w3
+
+#define vrepchr v0
+#define vdata v1
+#define qdata q1
+#define vhas_nul v2
+#define vhas_chr v3
+#define vrepmask v4
+#define vend v5
+#define dend d5
+
+/* Core algorithm:
+
+ For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
+ per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
+ requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are
+ set likewise for odd bytes so that adjacent bytes can be merged. Since the
+ bits in the syndrome reflect the order in which things occur in the original
+ string, counting trailing zeros identifies exactly which byte matched. */
+
+ENTRY (__strchrnul_aarch64_mte)
+ bic src, srcin, 15
+ dup vrepchr.16b, chrin
+ ld1 {vdata.16b}, [src]
+ mov tmp2w, 0xf00f
+ dup vrepmask.8h, tmp2w
+ cmeq vhas_chr.16b, vdata.16b, vrepchr.16b
+ cmhs vhas_chr.16b, vhas_chr.16b, vdata.16b
+ lsl tmp2, srcin, 2
+ and vhas_chr.16b, vhas_chr.16b, vrepmask.16b
+ addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */
+ fmov tmp1, dend
+ lsr tmp1, tmp1, tmp2 /* Mask padding bits. */
+ cbz tmp1, L(loop)
+
+ rbit tmp1, tmp1
+ clz tmp1, tmp1
+ add result, srcin, tmp1, lsr 2
+ ret
+
+ .p2align 4
+L(loop):
+ ldr qdata, [src, 16]!
+ cmeq vhas_chr.16b, vdata.16b, vrepchr.16b
+ cmhs vhas_chr.16b, vhas_chr.16b, vdata.16b
+ umaxp vend.16b, vhas_chr.16b, vhas_chr.16b
+ fmov tmp1, dend
+ cbz tmp1, L(loop)
+
+ and vhas_chr.16b, vhas_chr.16b, vrepmask.16b
+ addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */
+ fmov tmp1, dend
+#ifndef __AARCH64EB__
+ rbit tmp1, tmp1
+#endif
+ clz tmp1, tmp1
+ add result, src, tmp1, lsr 2
+ ret
+
+END (__strchrnul_aarch64_mte)
+
+END_FILE
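
strchrnul-mte.S masks the bytes that precede the string by shifting the whole syndrome right rather than ANDing with a mask, exploiting that AArch64 variable shifts only use the low six bits of the count (so "srcin << 2" acts as "(srcin % 16) * 4"). A small C restatement of that step, written only as a sketch with assumed names:

    #include <stdint.h>

    /* Model of the start-up masking in strchrnul-mte.S.  The first load comes
       from the 16-byte-aligned address below srcin, so the syndrome bits of
       the bytes in front of the string are discarded by shifting them out. */
    static inline uint64_t drop_bytes_before_start (uint64_t syndrome, uintptr_t srcin)
    {
      unsigned shift = (unsigned) (srcin % 16) * 4;   /* 4 bits per byte */
      return syndrome >> shift;
    }

After the shift, a non-zero syndrome means the hit is in the first chunk and the trailing-zero count divided by four is already the offset from srcin itself, which is why the early-return path adds to srcin rather than to the aligned src pointer.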
diff --git a/string/aarch64/strchrnul.S b/string/aarch64/strchrnul.S
index 81264ea..0e08d82 100644
--- a/string/aarch64/strchrnul.S
+++ b/string/aarch64/strchrnul.S
@@ -63,14 +63,12 @@
syndrome that are related to the padding. */
ld1 {vdata1.16b, vdata2.16b}, [src], #32
neg tmp1, tmp1
- cmeq vhas_nul1.16b, vdata1.16b, #0
cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b
- cmeq vhas_nul2.16b, vdata2.16b, #0
cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b
- orr vhas_chr1.16b, vhas_chr1.16b, vhas_nul1.16b
- orr vhas_chr2.16b, vhas_chr2.16b, vhas_nul2.16b
- and vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
- and vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
+ cmhs vhas_nul1.16b, vhas_chr1.16b, vdata1.16b
+ cmhs vhas_nul2.16b, vhas_chr2.16b, vdata2.16b
+ and vhas_chr1.16b, vhas_nul1.16b, vrepmask.16b
+ and vhas_chr2.16b, vhas_nul2.16b, vrepmask.16b
lsl tmp1, tmp1, #1
addp vend1.16b, vhas_chr1.16b, vhas_chr2.16b // 256->128
mov tmp3, #~0
@@ -81,24 +79,22 @@
bic tmp1, tmp3, tmp1 // Mask padding bits.
cbnz tmp1, L(tail)
+ .p2align 4
L(loop):
ld1 {vdata1.16b, vdata2.16b}, [src], #32
- cmeq vhas_nul1.16b, vdata1.16b, #0
cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b
- cmeq vhas_nul2.16b, vdata2.16b, #0
cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b
- /* Use a fast check for the termination condition. */
- orr vhas_chr1.16b, vhas_nul1.16b, vhas_chr1.16b
- orr vhas_chr2.16b, vhas_nul2.16b, vhas_chr2.16b
- orr vend1.16b, vhas_chr1.16b, vhas_chr2.16b
- addp vend1.2d, vend1.2d, vend1.2d
+ cmhs vhas_nul1.16b, vhas_chr1.16b, vdata1.16b
+ cmhs vhas_nul2.16b, vhas_chr2.16b, vdata2.16b
+ orr vend1.16b, vhas_nul1.16b, vhas_nul2.16b
+ umaxp vend1.16b, vend1.16b, vend1.16b
mov tmp1, vend1.d[0]
cbz tmp1, L(loop)
/* Termination condition found. Now need to establish exactly why
we terminated. */
- and vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
- and vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
+ and vhas_chr1.16b, vhas_nul1.16b, vrepmask.16b
+ and vhas_chr2.16b, vhas_nul2.16b, vrepmask.16b
addp vend1.16b, vhas_chr1.16b, vhas_chr2.16b // 256->128
addp vend1.16b, vend1.16b, vend1.16b // 128->64
@@ -114,3 +110,5 @@
ret
END (__strchrnul_aarch64)
+
+END_FILE
diff --git a/string/aarch64/strcmp-mte.S b/string/aarch64/strcmp-mte.S
new file mode 100644
index 0000000..28efce2
--- /dev/null
+++ b/string/aarch64/strcmp-mte.S
@@ -0,0 +1,246 @@
+/*
+ * strcmp - compare two strings
+ *
+ * Copyright (c) 2012-2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64
+ */
+
+#include "../asmdefs.h"
+
+#define REP8_01 0x0101010101010101
+#define REP8_7f 0x7f7f7f7f7f7f7f7f
+
+/* Parameters and result. */
+#define src1 x0
+#define src2 x1
+#define result x0
+
+/* Internal variables. */
+#define data1 x2
+#define data1w w2
+#define data2 x3
+#define data2w w3
+#define has_nul x4
+#define diff x5
+#define syndrome x6
+#define tmp1 x7
+#define tmp2 x8
+#define tmp3 x9
+#define zeroones x10
+#define pos x11
+#define offset x12
+#define neg_offset x13
+#define mask x14
+
+/* Define endian dependent shift operations.
+ On big-endian early bytes are at MSB and on little-endian LSB.
+ LS_FW means shifting towards early bytes.
+ LS_BK means shifting towards later bytes.
+ */
+#ifdef __AARCH64EB__
+#define LS_FW lsl
+#define LS_BK lsr
+#else
+#define LS_FW lsr
+#define LS_BK lsl
+#endif
+
+ /* Start of performance-critical section -- one 64B cache line. */
+ENTRY (__strcmp_aarch64_mte)
+ eor tmp1, src1, src2
+ mov zeroones, #REP8_01
+ tst tmp1, #7
+ b.ne L(misaligned8)
+ ands tmp1, src1, #7
+ b.ne L(mutual_align)
+ /* NUL detection works on the principle that (X - 1) & (~X) & 0x80
+ (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
+ can be done in parallel across the entire word. */
+L(loop_aligned):
+ ldr data1, [src1], #8
+ ldr data2, [src2], #8
+L(start_realigned):
+ sub tmp1, data1, zeroones
+ orr tmp2, data1, #REP8_7f
+ eor diff, data1, data2 /* Non-zero if differences found. */
+ bic has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */
+ orr syndrome, diff, has_nul
+ cbz syndrome, L(loop_aligned)
+ /* End of performance-critical section -- one 64B cache line. */
+
+L(end):
+#ifdef __AARCH64EB__
+ /* For big-endian we cannot use the trick with the syndrome value
+ as carry-propagation can corrupt the upper bits if the trailing
+ bytes in the string contain 0x01. */
+ /* However, if there is no NUL byte in the dword, we can generate
+ the result directly. We can't just subtract the bytes as the
+ MSB might be significant. */
+ cbnz has_nul, 1f
+ cmp data1, data2
+ cset result, ne
+ cneg result, result, lo
+ ret
+1:
+ /* Re-compute the NUL-byte detection, using a byte-reversed value. */
+ rev tmp3, data1
+ sub tmp1, tmp3, zeroones
+ orr tmp2, tmp3, #REP8_7f
+ bic has_nul, tmp1, tmp2
+ rev has_nul, has_nul
+ orr syndrome, diff, has_nul
+ clz pos, syndrome
+ /* The most-significant-non-zero bit of the syndrome marks either the
+ first bit that is different, or the top bit of the first zero byte.
+ Shifting left now will bring the critical information into the
+ top bits. */
+ lsl data1, data1, pos
+ lsl data2, data2, pos
+ /* But we need to zero-extend (char is unsigned) the value and then
+ perform a signed 32-bit subtraction. */
+ lsr data1, data1, #56
+ sub result, data1, data2, lsr #56
+ ret
+#endif
+
+L(end_quick):
+#ifndef __AARCH64EB__
+ rev syndrome, syndrome
+ rev data1, data1
+#endif
+ /* The most-significant-non-zero bit of the syndrome marks either the
+ first bit that is different, or the top bit of the first zero byte.
+ Shifting left now will bring the critical information into the
+ top bits. */
+ clz pos, syndrome
+#ifndef __AARCH64EB__
+ rev data2, data2
+#endif
+ lsl data1, data1, pos
+ lsl data2, data2, pos
+ /* But we need to zero-extend (char is unsigned) the value and then
+ perform a signed 32-bit subtraction. */
+ lsr data1, data1, #56
+ sub result, data1, data2, lsr #56
+ ret
+
+L(mutual_align):
+ /* Sources are mutually aligned, but are not currently at an
+ alignment boundary. Round down the addresses and then mask off
+ the bytes that precede the start point. */
+ bic src1, src1, #7
+ bic src2, src2, #7
+ lsl tmp1, tmp1, #3 /* Bytes beyond alignment -> bits. */
+ ldr data1, [src1], #8
+ neg tmp1, tmp1 /* Bits to alignment -64. */
+ ldr data2, [src2], #8
+ mov tmp2, #~0
+ LS_FW tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */
+ orr data1, data1, tmp2
+ orr data2, data2, tmp2
+ b L(start_realigned)
+
+ /* The following diagram explains the comparison of misaligned strings.
+ The bytes are shown in natural order. For little-endian, it is
+ reversed in the registers. The "x" bytes are before the string.
+ The "|" separates data that is loaded at one time.
+ src1 | a a a a a a a a | b b b c c c c c | . . .
+ src2 | x x x x x a a a a a a a a b b b | c c c c c . . .
+
+ After shifting in each step, the data looks like this:
+ STEP_A STEP_B STEP_C
+ data1 a a a a a a a a b b b c c c c c b b b c c c c c
+ data2 a a a a a a a a b b b 0 0 0 0 0 0 0 0 c c c c c
+
+ The bytes with "0" are eliminated from the syndrome via mask. */
+
+L(misaligned8):
+ /* Align SRC1 to 8 bytes and then compare 8 bytes at a time, always
+ checking to make sure that we don't access beyond page boundary in
+ SRC2. */
+ tst src1, #7
+ b.eq L(src1_aligned)
+L(do_misaligned):
+ ldrb data1w, [src1], #1
+ ldrb data2w, [src2], #1
+ cmp data1w, #1
+ ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */
+ b.ne L(done)
+ tst src1, #7
+ b.ne L(do_misaligned)
+
+ /* Align SRC2 down to 16 bytes. This way we can read 16 bytes at a
+ time from SRC2. The comparison happens in 3 steps. After each step
+ the loop can exit, or read from SRC1 or SRC2. */
+L(src1_aligned):
+ /* Calculate offset from 8 byte alignment to string start in bits. No
+ need to mask offset since shifts are ignoring upper bits. */
+ lsl offset, src2, #3
+ bic src2, src2, #0xf
+ mov mask, -1
+ neg neg_offset, offset
+ ldr data1, [src1], #8
+ ldp tmp1, tmp2, [src2], #16
+ LS_BK mask, mask, neg_offset
+ /* Skip the first compare if data in tmp1 is irrelevant. */
+ tbnz offset, 6, L(misaligned_mid_loop)
+
+L(loop_misaligned):
+ /* STEP_A: Compare full 8 bytes when there is enough data from SRC2.*/
+ LS_FW data2, tmp1, offset
+ LS_BK tmp1, tmp2, neg_offset
+ sub has_nul, data1, zeroones
+ orr data2, data2, tmp1 /* 8 bytes from SRC2 combined from two regs.*/
+ orr tmp3, data1, #REP8_7f
+ eor diff, data2, data1 /* Non-zero if differences found. */
+ bic has_nul, has_nul, tmp3 /* Non-zero if NUL byte found in SRC1. */
+ orr syndrome, diff, has_nul
+ cbnz syndrome, L(end)
+
+ ldr data1, [src1], #8
+L(misaligned_mid_loop):
+ /* STEP_B: Compare first part of data1 to second part of tmp2. */
+ LS_FW data2, tmp2, offset
+#ifdef __AARCH64EB__
+ /* For big-endian we do a byte reverse to avoid carry-propagation
+ problem described above. This way we can reuse the has_nul in the
+ next step and also use syndrome value trick at the end. */
+ rev tmp3, data1
+ #define data1_fixed tmp3
+#else
+ #define data1_fixed data1
+#endif
+ sub has_nul, data1_fixed, zeroones
+ orr tmp3, data1_fixed, #REP8_7f
+ eor diff, data2, data1 /* Non-zero if differences found. */
+ bic has_nul, has_nul, tmp3 /* Non-zero if NUL terminator. */
+#ifdef __AARCH64EB__
+ rev has_nul, has_nul
+#endif
+ orr syndrome, diff, has_nul
+ bics syndrome, syndrome, mask /* Ignore later bytes. */
+ b.ne L(end_quick)
+
+ /* STEP_C: Compare second part of data1 to first part of tmp1. */
+ ldp tmp1, tmp2, [src2], #16
+ LS_BK data2, tmp1, neg_offset
+ eor diff, data2, data1 /* Non-zero if differences found. */
+ orr syndrome, diff, has_nul
+ ands syndrome, syndrome, mask /* Ignore earlier bytes. */
+ b.ne L(end_quick)
+
+ ldr data1, [src1], #8
+ b L(loop_misaligned)
+
+L(done):
+ sub result, data1, data2
+ ret
+
+END (__strcmp_aarch64_mte)
+
+END_FILE
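
The aligned loop of the new strcmp-mte.S uses the word-at-a-time zero-byte test quoted in its comment: (X - 1) & ~(X | 0x7f), applied per byte, is non-zero iff some byte of X is zero. A minimal C restatement of the per-iteration check (names are illustrative; this is a sketch, not the patch's code):

    #include <stdint.h>

    #define REP8_01 0x0101010101010101ULL
    #define REP8_7f 0x7f7f7f7f7f7f7f7fULL

    /* C restatement of the aligned-loop check in strcmp-mte.S: has_nul is
       non-zero iff one of the eight bytes of data1 is zero, diff is non-zero
       iff the two words differ; either condition leaves the 8-byte loop. */
    static inline int chunk_terminates (uint64_t data1, uint64_t data2)
    {
      uint64_t has_nul = (data1 - REP8_01) & ~(data1 | REP8_7f);
      uint64_t diff    = data1 ^ data2;
      return (diff | has_nul) != 0;
    }

On big-endian the syndrome cannot be decoded directly because the subtraction can propagate a borrow past trailing 0x01 bytes, which is why L(end) recomputes has_nul from a byte-reversed value.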
diff --git a/string/aarch64/strcmp-sve.S b/string/aarch64/strcmp-sve.S
index 8e0b1a7..62a2bb1 100644
--- a/string/aarch64/strcmp-sve.S
+++ b/string/aarch64/strcmp-sve.S
@@ -5,6 +5,8 @@
* SPDX-License-Identifier: MIT
*/
+#include "../asmdefs.h"
+
#if __ARM_FEATURE_SVE
/* Assumptions:
*
@@ -15,10 +17,7 @@
.arch armv8-a+sve
.text
- .globl __strcmp_aarch64_sve
- .type __strcmp_aarch64_sve, %function
- .p2align 4
-__strcmp_aarch64_sve:
+ENTRY_ALIGN (__strcmp_aarch64_sve, 4)
setffr /* initialize FFR */
ptrue p1.b, all /* all ones; loop invariant */
mov x2, 0 /* initialize offset */
@@ -55,5 +54,8 @@
b.none 0b
b 1b
- .size __strcmp_aarch64_sve, . - __strcmp_aarch64_sve
+END (__strcmp_aarch64_sve)
+
#endif
+
+END_FILE
diff --git a/string/aarch64/strcmp.S b/string/aarch64/strcmp.S
index 65af5ce..a6de6e8 100644
--- a/string/aarch64/strcmp.S
+++ b/string/aarch64/strcmp.S
@@ -1,7 +1,7 @@
/*
* strcmp - compare two strings
*
- * Copyright (c) 2012, Arm Limited.
+ * Copyright (c) 2012-2020, Arm Limited.
* SPDX-License-Identifier: MIT
*/
@@ -168,3 +168,5 @@
ret
END (__strcmp_aarch64)
+
+END_FILE
diff --git a/string/aarch64/strcpy-sve.S b/string/aarch64/strcpy-sve.S
index 1029542..ee0b92e 100644
--- a/string/aarch64/strcpy-sve.S
+++ b/string/aarch64/strcpy-sve.S
@@ -5,6 +5,8 @@
* SPDX-License-Identifier: MIT
*/
+#include "../asmdefs.h"
+
#if __ARM_FEATURE_SVE
/* Assumptions:
*
@@ -22,10 +24,7 @@
#define FUNC __strcpy_aarch64_sve
#endif
- .globl FUNC
- .type FUNC, %function
- .p2align 4
-FUNC:
+ENTRY_ALIGN (FUNC, 4)
setffr /* initialize FFR */
ptrue p2.b, all /* all ones; loop invariant */
mov x2, 0 /* initialize offset */
@@ -67,5 +66,8 @@
#endif
ret
- .size FUNC, . - FUNC
+END (FUNC)
+
#endif
+
+END_FILE
diff --git a/string/aarch64/strcpy.S b/string/aarch64/strcpy.S
index 4edffcf..079baef 100644
--- a/string/aarch64/strcpy.S
+++ b/string/aarch64/strcpy.S
@@ -306,3 +306,5 @@
b L(fp_gt8)
END (STRCPY)
+
+END_FILE
diff --git a/string/aarch64/strlen-mte.S b/string/aarch64/strlen-mte.S
index d2bb79c..ef5b207 100644
--- a/string/aarch64/strlen-mte.S
+++ b/string/aarch64/strlen-mte.S
@@ -183,3 +183,5 @@
b L(tail)
END(__strlen_aarch64_mte)
+
+END_FILE
diff --git a/string/aarch64/strlen-sve.S b/string/aarch64/strlen-sve.S
index 82a1e85..13a4319 100644
--- a/string/aarch64/strlen-sve.S
+++ b/string/aarch64/strlen-sve.S
@@ -5,6 +5,8 @@
* SPDX-License-Identifier: MIT
*/
+#include "../asmdefs.h"
+
#if __ARM_FEATURE_SVE
/* Assumptions:
*
@@ -15,10 +17,7 @@
.arch armv8-a+sve
.text
- .globl __strlen_aarch64_sve
- .type __strlen_aarch64_sve, %function
- .p2align 4
-__strlen_aarch64_sve:
+ENTRY_ALIGN (__strlen_aarch64_sve, 4)
setffr /* initialize FFR */
ptrue p2.b /* all ones; loop invariant */
mov x1, 0 /* initialize length */
@@ -53,5 +52,8 @@
incp x1, p0.b
b 0b
- .size __strlen_aarch64_sve, . - __strlen_aarch64_sve
+END (__strlen_aarch64_sve)
+
#endif
+
+END_FILE
diff --git a/string/aarch64/strlen.S b/string/aarch64/strlen.S
index 2293f73..3176b5e 100644
--- a/string/aarch64/strlen.S
+++ b/string/aarch64/strlen.S
@@ -204,3 +204,5 @@
b L(page_cross_entry)
END (__strlen_aarch64)
+
+END_FILE
diff --git a/string/aarch64/strncmp-mte.S b/string/aarch64/strncmp-mte.S
new file mode 100644
index 0000000..ca1adba
--- /dev/null
+++ b/string/aarch64/strncmp-mte.S
@@ -0,0 +1,310 @@
+/*
+ * strncmp - compare two strings
+ *
+ * Copyright (c) 2013-2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64
+ */
+
+#include "../asmdefs.h"
+
+#define REP8_01 0x0101010101010101
+#define REP8_7f 0x7f7f7f7f7f7f7f7f
+
+/* Parameters and result. */
+#define src1 x0
+#define src2 x1
+#define limit x2
+#define result x0
+
+/* Internal variables. */
+#define data1 x3
+#define data1w w3
+#define data2 x4
+#define data2w w4
+#define has_nul x5
+#define diff x6
+#define syndrome x7
+#define tmp1 x8
+#define tmp2 x9
+#define tmp3 x10
+#define zeroones x11
+#define pos x12
+#define mask x13
+#define endloop x14
+#define count mask
+#define offset pos
+#define neg_offset x15
+
+/* Define endian dependent shift operations.
+ On big-endian early bytes are at MSB and on little-endian LSB.
+ LS_FW means shifting towards early bytes.
+ LS_BK means shifting towards later bytes.
+ */
+#ifdef __AARCH64EB__
+#define LS_FW lsl
+#define LS_BK lsr
+#else
+#define LS_FW lsr
+#define LS_BK lsl
+#endif
+
+ .text
+ .p2align 6
+ .rep 9
+ nop /* Pad so that the loop below fits a cache line. */
+ .endr
+ENTRY_ALIGN (__strncmp_aarch64_mte, 0)
+ cbz limit, L(ret0)
+ eor tmp1, src1, src2
+ mov zeroones, #REP8_01
+ tst tmp1, #7
+ and count, src1, #7
+ b.ne L(misaligned8)
+ cbnz count, L(mutual_align)
+
+ /* NUL detection works on the principle that (X - 1) & (~X) & 0x80
+ (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
+ can be done in parallel across the entire word. */
+ /* Start of performance-critical section -- one 64B cache line. */
+L(loop_aligned):
+ ldr data1, [src1], #8
+ ldr data2, [src2], #8
+L(start_realigned):
+ subs limit, limit, #8
+ sub tmp1, data1, zeroones
+ orr tmp2, data1, #REP8_7f
+ eor diff, data1, data2 /* Non-zero if differences found. */
+ csinv endloop, diff, xzr, hi /* Last Dword or differences. */
+ bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */
+ ccmp endloop, #0, #0, eq
+ b.eq L(loop_aligned)
+ /* End of performance-critical section -- one 64B cache line. */
+
+L(full_check):
+#ifndef __AARCH64EB__
+ orr syndrome, diff, has_nul
+ add limit, limit, 8 /* Rewind limit to before last subs. */
+L(syndrome_check):
+ /* Limit was reached. Check if the NUL byte or the difference
+ is before the limit. */
+ rev syndrome, syndrome
+ rev data1, data1
+ clz pos, syndrome
+ rev data2, data2
+ lsl data1, data1, pos
+ cmp limit, pos, lsr #3
+ lsl data2, data2, pos
+ /* But we need to zero-extend (char is unsigned) the value and then
+ perform a signed 32-bit subtraction. */
+ lsr data1, data1, #56
+ sub result, data1, data2, lsr #56
+ csel result, result, xzr, hi
+ ret
+#else
+ /* Not reached the limit, must have found the end or a diff. */
+ tbz limit, #63, L(not_limit)
+ add tmp1, limit, 8
+ cbz limit, L(not_limit)
+
+ lsl limit, tmp1, #3 /* Bits -> bytes. */
+ mov mask, #~0
+ lsr mask, mask, limit
+ bic data1, data1, mask
+ bic data2, data2, mask
+
+ /* Make sure that the NUL byte is marked in the syndrome. */
+ orr has_nul, has_nul, mask
+
+L(not_limit):
+ /* For big-endian we cannot use the trick with the syndrome value
+ as carry-propagation can corrupt the upper bits if the trailing
+ bytes in the string contain 0x01. */
+ /* However, if there is no NUL byte in the dword, we can generate
+ the result directly. We can't just subtract the bytes as the
+ MSB might be significant. */
+ cbnz has_nul, 1f
+ cmp data1, data2
+ cset result, ne
+ cneg result, result, lo
+ ret
+1:
+ /* Re-compute the NUL-byte detection, using a byte-reversed value. */
+ rev tmp3, data1
+ sub tmp1, tmp3, zeroones
+ orr tmp2, tmp3, #REP8_7f
+ bic has_nul, tmp1, tmp2
+ rev has_nul, has_nul
+ orr syndrome, diff, has_nul
+ clz pos, syndrome
+ /* The most-significant-non-zero bit of the syndrome marks either the
+ first bit that is different, or the top bit of the first zero byte.
+ Shifting left now will bring the critical information into the
+ top bits. */
+L(end_quick):
+ lsl data1, data1, pos
+ lsl data2, data2, pos
+ /* But we need to zero-extend (char is unsigned) the value and then
+ perform a signed 32-bit subtraction. */
+ lsr data1, data1, #56
+ sub result, data1, data2, lsr #56
+ ret
+#endif
+
+L(mutual_align):
+ /* Sources are mutually aligned, but are not currently at an
+ alignment boundary. Round down the addresses and then mask off
+ the bytes that precede the start point.
+ We also need to adjust the limit calculations, but without
+ overflowing if the limit is near ULONG_MAX. */
+ bic src1, src1, #7
+ bic src2, src2, #7
+ ldr data1, [src1], #8
+ neg tmp3, count, lsl #3 /* 64 - bits(bytes beyond align). */
+ ldr data2, [src2], #8
+ mov tmp2, #~0
+ and count, count, #0x3f
+ LS_FW tmp2, tmp2, tmp3 /* Shift (count & 63). */
+ /* Adjust the limit. Only low 3 bits used, so overflow irrelevant. */
+ add limit, limit, count
+ orr data1, data1, tmp2
+ orr data2, data2, tmp2
+ b L(start_realigned)
+
+ .p2align 6
+ /* Don't bother with dwords for up to 16 bytes. */
+L(misaligned8):
+ cmp limit, #16
+ b.hs L(try_misaligned_words)
+
+L(byte_loop):
+ /* Perhaps we can do better than this. */
+ ldrb data1w, [src1], #1
+ ldrb data2w, [src2], #1
+ subs limit, limit, #1
+ ccmp data1w, #1, #0, hi /* NZCV = 0b0000. */
+ ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */
+ b.eq L(byte_loop)
+L(done):
+ sub result, data1, data2
+ ret
+ /* Align the SRC1 to a dword by doing a bytewise compare and then do
+ the dword loop. */
+L(try_misaligned_words):
+ cbz count, L(src1_aligned)
+
+ neg count, count
+ and count, count, #7
+ sub limit, limit, count
+
+L(page_end_loop):
+ ldrb data1w, [src1], #1
+ ldrb data2w, [src2], #1
+ cmp data1w, #1
+ ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */
+ b.ne L(done)
+ subs count, count, #1
+ b.hi L(page_end_loop)
+
+ /* The following diagram explains the comparison of misaligned strings.
+ The bytes are shown in natural order. For little-endian, it is
+ reversed in the registers. The "x" bytes are before the string.
+ The "|" separates data that is loaded at one time.
+ src1 | a a a a a a a a | b b b c c c c c | . . .
+ src2 | x x x x x a a a a a a a a b b b | c c c c c . . .
+
+ After shifting in each step, the data looks like this:
+ STEP_A STEP_B STEP_C
+ data1 a a a a a a a a b b b c c c c c b b b c c c c c
+ data2 a a a a a a a a b b b 0 0 0 0 0 0 0 0 c c c c c
+
+ The bytes with "0" are eliminated from the syndrome via mask.
+
+ Align SRC2 down to 16 bytes. This way we can read 16 bytes at a
+ time from SRC2. The comparison happens in 3 steps. After each step
+ the loop can exit, or read from SRC1 or SRC2. */
+L(src1_aligned):
+ /* Calculate offset from 8 byte alignment to string start in bits. No
+ need to mask offset since shifts are ignoring upper bits. */
+ lsl offset, src2, #3
+ bic src2, src2, #0xf
+ mov mask, -1
+ neg neg_offset, offset
+ ldr data1, [src1], #8
+ ldp tmp1, tmp2, [src2], #16
+ LS_BK mask, mask, neg_offset
+ and neg_offset, neg_offset, #63 /* Need actual value for cmp later. */
+ /* Skip the first compare if data in tmp1 is irrelevant. */
+ tbnz offset, 6, L(misaligned_mid_loop)
+
+L(loop_misaligned):
+ /* STEP_A: Compare full 8 bytes when there is enough data from SRC2.*/
+ LS_FW data2, tmp1, offset
+ LS_BK tmp1, tmp2, neg_offset
+ subs limit, limit, #8
+ orr data2, data2, tmp1 /* 8 bytes from SRC2 combined from two regs.*/
+ sub has_nul, data1, zeroones
+ eor diff, data1, data2 /* Non-zero if differences found. */
+ orr tmp3, data1, #REP8_7f
+ csinv endloop, diff, xzr, hi /* If limit, set to all ones. */
+ bic has_nul, has_nul, tmp3 /* Non-zero if NUL byte found in SRC1. */
+ orr tmp3, endloop, has_nul
+ cbnz tmp3, L(full_check)
+
+ ldr data1, [src1], #8
+L(misaligned_mid_loop):
+ /* STEP_B: Compare first part of data1 to second part of tmp2. */
+ LS_FW data2, tmp2, offset
+#ifdef __AARCH64EB__
+ /* For big-endian we do a byte reverse to avoid carry-propagation
+ problem described above. This way we can reuse the has_nul in the
+ next step and also use syndrome value trick at the end. */
+ rev tmp3, data1
+ #define data1_fixed tmp3
+#else
+ #define data1_fixed data1
+#endif
+ sub has_nul, data1_fixed, zeroones
+ orr tmp3, data1_fixed, #REP8_7f
+ eor diff, data2, data1 /* Non-zero if differences found. */
+ bic has_nul, has_nul, tmp3 /* Non-zero if NUL terminator. */
+#ifdef __AARCH64EB__
+ rev has_nul, has_nul
+#endif
+ cmp limit, neg_offset, lsr #3
+ orr syndrome, diff, has_nul
+ bic syndrome, syndrome, mask /* Ignore later bytes. */
+ csinv tmp3, syndrome, xzr, hi /* If limit, set to all ones. */
+ cbnz tmp3, L(syndrome_check)
+
+ /* STEP_C: Compare second part of data1 to first part of tmp1. */
+ ldp tmp1, tmp2, [src2], #16
+ cmp limit, #8
+ LS_BK data2, tmp1, neg_offset
+ eor diff, data2, data1 /* Non-zero if differences found. */
+ orr syndrome, diff, has_nul
+ and syndrome, syndrome, mask /* Ignore earlier bytes. */
+ csinv tmp3, syndrome, xzr, hi /* If limit, set to all ones. */
+ cbnz tmp3, L(syndrome_check)
+
+ ldr data1, [src1], #8
+ sub limit, limit, #8
+ b L(loop_misaligned)
+
+#ifdef __AARCH64EB__
+L(syndrome_check):
+ clz pos, syndrome
+ cmp pos, limit, lsl #3
+ b.lo L(end_quick)
+#endif
+
+L(ret0):
+ mov result, #0
+ ret
+END(__strncmp_aarch64_mte)
+
+END_FILE
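
Both strcmp-mte.S and strncmp-mte.S carry the same diagram for the misaligned case: SRC2 is read 16 aligned bytes at a time and each 8-byte comparison word is stitched together from two of those registers with the endian-dependent LS_FW/LS_BK shifts. A little-endian C sketch of that stitching step (assumed names, illustration only):

    #include <stdint.h>

    /* Little-endian model of STEP_A in L(loop_misaligned): eight bytes of the
       misaligned SRC2 stream are rebuilt from two aligned 8-byte loads (tmp1,
       tmp2) by shifting the early bytes down and the later bytes up, then
       ORing.  offset_bits is the byte misalignment of src2 times 8.  In the
       assembly the sources are known to be mutually misaligned, so the count
       is never 0 modulo 64; the guard here only keeps the C model defined.
       On big-endian the shift directions swap, which is exactly what the
       LS_FW/LS_BK macros express. */
    static inline uint64_t combine_src2 (uint64_t tmp1, uint64_t tmp2,
                                         unsigned offset_bits)
    {
      if (offset_bits == 0)
        return tmp1;
      return (tmp1 >> offset_bits) | (tmp2 << (64 - offset_bits));
    }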
diff --git a/string/aarch64/strncmp-sve.S b/string/aarch64/strncmp-sve.S
index c4ec813..c8fbf32 100644
--- a/string/aarch64/strncmp-sve.S
+++ b/string/aarch64/strncmp-sve.S
@@ -5,6 +5,8 @@
* SPDX-License-Identifier: MIT
*/
+#include "../asmdefs.h"
+
#if __ARM_FEATURE_SVE
/* Assumptions:
*
@@ -15,10 +17,7 @@
.arch armv8-a+sve
.text
- .globl __strncmp_aarch64_sve
- .type __strncmp_aarch64_sve, %function
- .p2align 4
-__strncmp_aarch64_sve:
+ENTRY_ALIGN (__strncmp_aarch64_sve, 4)
setffr /* initialize FFR */
mov x3, 0 /* initialize off */
@@ -64,5 +63,8 @@
9: mov x0, 0 /* return equal */
ret
- .size __strncmp_aarch64_sve, . - __strncmp_aarch64_sve
+END (__strncmp_aarch64_sve)
+
#endif
+
+END_FILE
diff --git a/string/aarch64/strncmp.S b/string/aarch64/strncmp.S
index fbd08ee..766524b 100644
--- a/string/aarch64/strncmp.S
+++ b/string/aarch64/strncmp.S
@@ -42,7 +42,7 @@
.text
.p2align 6
- .rep 7
+ .rep 6
nop /* Pad so that the loop below fits a cache line. */
.endr
ENTRY_ALIGN (__strncmp_aarch64, 0)
@@ -259,3 +259,5 @@
ret
END ( __strncmp_aarch64)
+
+END_FILE
diff --git a/string/aarch64/strnlen-sve.S b/string/aarch64/strnlen-sve.S
index e80d26e..cf293f6 100644
--- a/string/aarch64/strnlen-sve.S
+++ b/string/aarch64/strnlen-sve.S
@@ -5,6 +5,8 @@
* SPDX-License-Identifier: MIT
*/
+#include "../asmdefs.h"
+
#if __ARM_FEATURE_SVE
/* Assumptions:
*
@@ -15,10 +17,7 @@
.arch armv8-a+sve
.text
- .globl __strnlen_aarch64_sve
- .type __strnlen_aarch64_sve, %function
- .p2align 4
-__strnlen_aarch64_sve:
+ENTRY_ALIGN (__strnlen_aarch64_sve, 4)
setffr /* initialize FFR */
mov x2, 0 /* initialize len */
b 1f
@@ -70,5 +69,8 @@
9: mov x0, x2
ret
- .size __strnlen_aarch64_sve, . - __strnlen_aarch64_sve
+END (__strnlen_aarch64_sve)
+
#endif
+
+END_FILE
diff --git a/string/aarch64/strnlen.S b/string/aarch64/strnlen.S
index df66b60..202c401 100644
--- a/string/aarch64/strnlen.S
+++ b/string/aarch64/strnlen.S
@@ -40,7 +40,7 @@
.p2align 6
L(start):
/* Pre-pad to ensure critical loop begins an icache line. */
- .rep 7
+ .rep 6
nop
.endr
/* Put this code here to avoid wasting more space with pre-padding. */
@@ -153,3 +153,5 @@
b L(realigned)
END (__strnlen_aarch64)
+
+END_FILE
diff --git a/string/aarch64/strrchr-mte.S b/string/aarch64/strrchr-mte.S
new file mode 100644
index 0000000..bd1296d
--- /dev/null
+++ b/string/aarch64/strrchr-mte.S
@@ -0,0 +1,134 @@
+/*
+ * strrchr - find last position of a character in a string.
+ *
+ * Copyright (c) 2014-2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64
+ * Neon Available.
+ */
+
+#include "../asmdefs.h"
+
+/* Arguments and results. */
+#define srcin x0
+#define chrin w1
+
+#define result x0
+
+#define src x2
+#define tmp1 x3
+#define wtmp2 w4
+#define tmp3 x5
+#define src_match x6
+#define src_offset x7
+#define const_m1 x8
+#define tmp4 x9
+#define nul_match x10
+#define chr_match x11
+
+#define vrepchr v0
+#define vdata v1
+#define vhas_nul v2
+#define vhas_chr v3
+#define vrepmask_0 v4
+#define vrepmask_c v16
+#define vend v17
+
+/* Core algorithm.
+
+ For each 16-byte chunk we calculate a 64-bit syndrome value, with
+ four bits per byte (LSB is always in bits 0 and 1, for both big
+ and little-endian systems). For each tuple, bit 0 is set if
+ the relevant byte matched the requested character; bit 1 is set
+ if the relevant byte matched the NUL end of string (we trigger
+ off bit0 for the special case of looking for NUL) and bits 2 and 3
+ are not used.
+ Since the bits in the syndrome reflect exactly the order in which
+ things occur in the original string a count_trailing_zeros()
+ operation will identify exactly which byte is causing the termination,
+ and why. */
+
+ENTRY (__strrchr_aarch64_mte)
+ /* Magic constant 0x10011001 to allow us to identify which lane
+ matches the requested byte. Magic constant 0x20022002 used
+ similarly for NUL termination. */
+ mov wtmp2, #0x1001
+ movk wtmp2, #0x1001, lsl #16
+ dup vrepchr.16b, chrin
+ bic src, srcin, #15 /* Work with aligned 16-byte chunks. */
+ dup vrepmask_c.4s, wtmp2
+ mov src_offset, #0
+ ands tmp1, srcin, #15
+ add vrepmask_0.4s, vrepmask_c.4s, vrepmask_c.4s /* equiv: lsl #1 */
+ b.eq L(aligned)
+
+ /* Input string is not 16-byte aligned. Rather than forcing
+ the padding bytes to a safe value, we calculate the syndrome
+ for all the bytes, but then mask off those bits of the
+ syndrome that are related to the padding. */
+ ld1 {vdata.16b}, [src], #16
+ neg tmp1, tmp1
+ cmeq vhas_nul.16b, vdata.16b, #0
+ cmeq vhas_chr.16b, vdata.16b, vrepchr.16b
+ and vhas_nul.16b, vhas_nul.16b, vrepmask_0.16b
+ and vhas_chr.16b, vhas_chr.16b, vrepmask_c.16b
+ addp vhas_nul.16b, vhas_nul.16b, vhas_nul.16b // 128->64
+ addp vhas_chr.16b, vhas_chr.16b, vhas_chr.16b // 128->64
+ mov nul_match, vhas_nul.d[0]
+ lsl tmp1, tmp1, #2
+ mov const_m1, #~0
+ mov chr_match, vhas_chr.d[0]
+ lsr tmp3, const_m1, tmp1
+
+ bic nul_match, nul_match, tmp3 // Mask padding bits.
+ bic chr_match, chr_match, tmp3 // Mask padding bits.
+ cbnz nul_match, L(tail)
+
+L(loop):
+ cmp chr_match, #0
+ csel src_match, src, src_match, ne
+ csel src_offset, chr_match, src_offset, ne
+L(aligned):
+ ld1 {vdata.16b}, [src], #16
+ cmeq vhas_nul.16b, vdata.16b, #0
+ cmeq vhas_chr.16b, vdata.16b, vrepchr.16b
+ addp vend.16b, vhas_nul.16b, vhas_nul.16b // 128->64
+ and vhas_chr.16b, vhas_chr.16b, vrepmask_c.16b
+ addp vhas_chr.16b, vhas_chr.16b, vhas_chr.16b // 128->64
+ mov nul_match, vend.d[0]
+ mov chr_match, vhas_chr.d[0]
+ cbz nul_match, L(loop)
+
+ and vhas_nul.16b, vhas_nul.16b, vrepmask_0.16b
+ addp vhas_nul.16b, vhas_nul.16b, vhas_nul.16b
+ mov nul_match, vhas_nul.d[0]
+
+L(tail):
+ /* Work out exactly where the string ends. */
+ sub tmp4, nul_match, #1
+ eor tmp4, tmp4, nul_match
+ ands chr_match, chr_match, tmp4
+ /* And pick the values corresponding to the last match. */
+ csel src_match, src, src_match, ne
+ csel src_offset, chr_match, src_offset, ne
+
+ /* Count down from the top of the syndrome to find the last match. */
+ clz tmp3, src_offset
+ /* Src_match points beyond the word containing the match, so we can
+ simply subtract half the bit-offset into the syndrome. Because
+ we are counting down, we need to go back one more character. */
+ add tmp3, tmp3, #2
+ sub result, src_match, tmp3, lsr #2
+ /* But if the syndrome shows no match was found, then return NULL. */
+ cmp src_offset, #0
+ csel result, result, xzr, ne
+
+ ret
+
+END (__strrchr_aarch64_mte)
+
+END_FILE
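
strrchr-mte.S has to ignore character matches that fall after the terminating NUL of the final chunk. The L(tail) sequence does this with the lowest-set-bit trick shown below in C form (a sketch with invented names, not part of the patch):

    #include <stdint.h>

    /* Model of the L(tail) masking in strrchr-mte.S: nul_match has a syndrome
       bit set for the terminating NUL, chr_match one for every character hit
       in the same chunk.  ((nul_match - 1) ^ nul_match) is all-ones up to and
       including the lowest set bit of nul_match, so the AND discards matches
       that lie beyond the end of the string; the surviving last match is then
       located from the top with clz.  Requires nul_match != 0. */
    static inline uint64_t matches_before_nul (uint64_t chr_match, uint64_t nul_match)
    {
      return chr_match & ((nul_match - 1) ^ nul_match);
    }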
diff --git a/string/aarch64/strrchr-sve.S b/string/aarch64/strrchr-sve.S
index 4047a8e..fda9a43 100644
--- a/string/aarch64/strrchr-sve.S
+++ b/string/aarch64/strrchr-sve.S
@@ -5,6 +5,8 @@
* SPDX-License-Identifier: MIT
*/
+#include "../asmdefs.h"
+
#if __ARM_FEATURE_SVE
/* Assumptions:
*
@@ -15,10 +17,7 @@
.arch armv8-a+sve
.text
- .globl __strrchr_aarch64_sve
- .type __strrchr_aarch64_sve, %function
- .p2align 4
-__strrchr_aarch64_sve:
+ENTRY_ALIGN (__strrchr_aarch64_sve, 4)
dup z1.b, w1 /* replicate byte across vector */
setffr /* initialize FFR */
ptrue p1.b /* all ones; loop invariant */
@@ -81,5 +80,8 @@
5: mov x0, 0
ret
- .size __strrchr_aarch64_sve, . - __strrchr_aarch64_sve
+END (__strrchr_aarch64_sve)
+
#endif
+
+END_FILE
diff --git a/string/aarch64/strrchr.S b/string/aarch64/strrchr.S
index 1b4caac..726aa83 100644
--- a/string/aarch64/strrchr.S
+++ b/string/aarch64/strrchr.S
@@ -84,38 +84,38 @@
and vhas_chr2.16b, vhas_chr2.16b, vrepmask_c.16b
addp vhas_nul1.16b, vhas_nul1.16b, vhas_nul2.16b // 256->128
addp vhas_chr1.16b, vhas_chr1.16b, vhas_chr2.16b // 256->128
- addp vhas_nul1.16b, vhas_nul1.16b, vhas_nul1.16b // 128->64
- addp vhas_chr1.16b, vhas_chr1.16b, vhas_chr1.16b // 128->64
- mov nul_match, vhas_nul1.d[0]
+ addp vend1.16b, vhas_nul1.16b, vhas_chr1.16b // 128->64
+ mov nul_match, vend1.d[0]
lsl tmp1, tmp1, #1
mov const_m1, #~0
- mov chr_match, vhas_chr1.d[0]
lsr tmp3, const_m1, tmp1
+ mov chr_match, vend1.d[1]
bic nul_match, nul_match, tmp3 // Mask padding bits.
bic chr_match, chr_match, tmp3 // Mask padding bits.
cbnz nul_match, L(tail)
+ .p2align 4
L(loop):
cmp chr_match, #0
csel src_match, src, src_match, ne
csel src_offset, chr_match, src_offset, ne
L(aligned):
ld1 {vdata1.16b, vdata2.16b}, [src], #32
- cmeq vhas_nul1.16b, vdata1.16b, #0
cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b
- cmeq vhas_nul2.16b, vdata2.16b, #0
cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b
- addp vend1.16b, vhas_nul1.16b, vhas_nul2.16b // 256->128
+ uminp vend1.16b, vdata1.16b, vdata2.16b
and vhas_chr1.16b, vhas_chr1.16b, vrepmask_c.16b
and vhas_chr2.16b, vhas_chr2.16b, vrepmask_c.16b
+ cmeq vend1.16b, vend1.16b, 0
addp vhas_chr1.16b, vhas_chr1.16b, vhas_chr2.16b // 256->128
- addp vend1.16b, vend1.16b, vend1.16b // 128->64
- addp vhas_chr1.16b, vhas_chr1.16b, vhas_chr1.16b // 128->64
+ addp vend1.16b, vend1.16b, vhas_chr1.16b // 128->64
mov nul_match, vend1.d[0]
- mov chr_match, vhas_chr1.d[0]
+ mov chr_match, vend1.d[1]
cbz nul_match, L(loop)
+ cmeq vhas_nul1.16b, vdata1.16b, #0
+ cmeq vhas_nul2.16b, vdata2.16b, #0
and vhas_nul1.16b, vhas_nul1.16b, vrepmask_0.16b
and vhas_nul2.16b, vhas_nul2.16b, vrepmask_0.16b
addp vhas_nul1.16b, vhas_nul1.16b, vhas_nul2.16b
@@ -145,3 +145,5 @@
ret
END (__strrchr_aarch64)
+
+END_FILE
diff --git a/string/asmdefs.h b/string/asmdefs.h
index 7d143a9..c7fcb08 100644
--- a/string/asmdefs.h
+++ b/string/asmdefs.h
@@ -8,6 +8,55 @@
#ifndef _ASMDEFS_H
#define _ASMDEFS_H
+#if defined(__aarch64__)
+
+/* Branch Target Identitication support. */
+#define BTI_C hint 34
+#define BTI_J hint 36
+/* Return address signing support (pac-ret). */
+#define PACIASP hint 25; .cfi_window_save
+#define AUTIASP hint 29; .cfi_window_save
+
+#define FEATURE_1_BTI 1
+#define FEATURE_1_PAC 2
+
+/* Add a GNU_PROPERTY_AARCH64_FEATURE_1_AND note. */
+#define GNU_PROPERTY(features) \
+ .section .note.gnu.property, "a"; \
+ .p2align 3; \
+ .word 4; \
+ .word 16; \
+ .word 5; \
+ .asciz "GNU"; \
+ .word 0xc0000000; \
+ .word 4; \
+ .word features; \
+ .word 0;
+
+/* If set then the GNU Property Note section will be added to
+ mark objects to support BTI and PAC-RET. */
+#ifndef WANT_GNU_PROPERTY
+#define WANT_GNU_PROPERTY 1
+#endif
+
+#if WANT_GNU_PROPERTY
+#define END_FILE GNU_PROPERTY(FEATURE_1_BTI|FEATURE_1_PAC)
+#else
+#define END_FILE
+#endif
+
+#define ENTRY_ALIGN(name, alignment) \
+ .global name; \
+ .type name,%function; \
+ .align alignment; \
+ name: \
+ .cfi_startproc; \
+ BTI_C;
+
+#else
+
+#define END_FILE
+
#define ENTRY_ALIGN(name, alignment) \
.global name; \
.type name,%function; \
@@ -15,6 +64,8 @@
name: \
.cfi_startproc;
+#endif
+
#define ENTRY(name) ENTRY_ALIGN(name, 6)
#define ENTRY_ALIAS(name) \
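
For reference, the .word/.asciz sequence emitted by GNU_PROPERTY(features) corresponds to the following layout; the struct is a C mirror written purely for illustration, with field names taken from the ELF note and GNU property conventions:

    #include <stdint.h>

    /* C mirror of the note emitted by GNU_PROPERTY(features) in asmdefs.h. */
    struct aarch64_feature_note {
      uint32_t namesz;     /* 4: sizeof "GNU" including the NUL */
      uint32_t descsz;     /* 16: size of the four words below */
      uint32_t type;       /* 5: NT_GNU_PROPERTY_TYPE_0 */
      char     name[4];    /* "GNU" */
      uint32_t pr_type;    /* 0xc0000000: GNU_PROPERTY_AARCH64_FEATURE_1_AND */
      uint32_t pr_datasz;  /* 4 */
      uint32_t pr_data;    /* FEATURE_1_BTI | FEATURE_1_PAC */
      uint32_t pr_pad;     /* 0, pads the descriptor to 8-byte alignment */
    };

Defining WANT_GNU_PROPERTY=0 (the new knob documented in config.mk.dist above) makes END_FILE empty so no such section is emitted.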
diff --git a/string/include/stringlib.h b/string/include/stringlib.h
index 25a4475..34e5a29 100644
--- a/string/include/stringlib.h
+++ b/string/include/stringlib.h
@@ -27,8 +27,13 @@
size_t __strlen_aarch64 (const char *);
size_t __strnlen_aarch64 (const char *, size_t);
int __strncmp_aarch64 (const char *, const char *, size_t);
+void *__memchr_aarch64_mte (const void *, int, size_t);
char *__strchr_aarch64_mte (const char *, int);
+char *__strchrnul_aarch64_mte (const char *, int);
size_t __strlen_aarch64_mte (const char *);
+char *__strrchr_aarch64_mte (const char *, int);
+int __strcmp_aarch64_mte (const char *, const char *);
+int __strncmp_aarch64_mte (const char *, const char *, size_t);
#if __ARM_NEON
void *__memcpy_aarch64_simd (void *__restrict, const void *__restrict, size_t);
void *__memmove_aarch64_simd (void *, const void *, size_t);
diff --git a/string/test/memchr.c b/string/test/memchr.c
index 1ebc6d6..15531c6 100644
--- a/string/test/memchr.c
+++ b/string/test/memchr.c
@@ -11,6 +11,7 @@
#include <string.h>
#include <limits.h>
#include "stringlib.h"
+#include "stringtest.h"
static const struct fun
{
@@ -21,6 +22,7 @@
F(memchr)
#if __aarch64__
F(__memchr_aarch64)
+F(__memchr_aarch64_mte)
# if __ARM_FEATURE_SVE
F(__memchr_aarch64_sve)
# endif
@@ -31,12 +33,11 @@
{0, 0}
};
-static int test_status;
-#define ERR(...) (test_status=1, printf(__VA_ARGS__))
-
#define A 32
#define SP 512
#define LEN 250000
+#define MAX_LEN SIZE_MAX
+
static unsigned char sbuf[LEN+2*A];
static void *alignup(void *p)
@@ -44,30 +45,34 @@
return (void*)(((uintptr_t)p + A-1) & -A);
}
-static void test(const struct fun *fun, int align, int seekpos, int len)
+static void test(const struct fun *fun, int align, size_t seekpos,
+ size_t array_len, size_t param_len)
{
unsigned char *src = alignup(sbuf);
unsigned char *s = src + align;
- unsigned char *f = len ? s + seekpos : 0;
+ unsigned char *f = array_len ? s + seekpos : 0;
int seekchar = 0x1;
int i;
void *p;
- if (len > LEN || seekpos >= len || align >= A)
+ if (err_count >= ERR_LIMIT)
+ return;
+ if (array_len > LEN || seekpos >= array_len || align >= A)
abort();
for (i = 0; i < seekpos; i++)
s[i] = 'a' + i%23;
s[i++] = seekchar;
- for (; i < len; i++)
+ for (; i < array_len; i++)
s[i] = 'a' + i%23;
- p = fun->fun(s, seekchar, len);
+ p = fun->fun(s, seekchar, param_len);
if (p != f) {
- ERR("%s(%p,0x%02x,%d) returned %p\n", fun->name, s, seekchar, len, p);
- ERR("expected: %p\n", f);
- abort();
+ ERR("%s(%p,0x%02x,%zu) returned %p\n",
+ fun->name, s, seekchar, param_len, p);
+ printf("expected: %p\n", f);
+ quote("str", s, param_len);
}
}
@@ -75,18 +80,21 @@
{
int r = 0;
for (int i=0; funtab[i].name; i++) {
- test_status = 0;
+ err_count = 0;
for (int a = 0; a < A; a++) {
for (int n = 0; n < 100; n++)
for (int sp = 0; sp < n-1; sp++)
- test(funtab+i, a, sp, n);
+ test(funtab+i, a, sp, n, n);
for (int n = 100; n < LEN; n *= 2) {
- test(funtab+i, a, n-1, n);
- test(funtab+i, a, n/2, n);
+ test(funtab+i, a, n-1, n, n);
+ test(funtab+i, a, n/2, n, n);
+ }
+ for (int n = 0; n < 100; n++) {
+ test(funtab+i, a, LEN-1-n, LEN, MAX_LEN-n);
}
}
- printf("%s %s\n", test_status ? "FAIL" : "PASS", funtab[i].name);
- if (test_status)
+ printf("%s %s\n", err_count ? "FAIL" : "PASS", funtab[i].name);
+ if (err_count)
r = -1;
}
return r;
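
The new array_len/param_len split lets the test pass lengths far larger than the buffer: memchr is specified to stop reading as soon as it finds a match, so an oversized length is valid provided the seek byte is known to lie inside the array, which is exactly what the MAX_LEN cases exercise. A minimal sketch of that calling pattern (buffer and marker are hypothetical):

  #include <string.h>
  #include <stdint.h>

  /* marker is known to occur in buf, so SIZE_MAX is an acceptable length:
     a conforming memchr stops at the first matching byte. */
  void *find_marker(const unsigned char *buf, unsigned char marker)
  {
      return memchr(buf, marker, SIZE_MAX);
  }
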
diff --git a/string/test/memcmp.c b/string/test/memcmp.c
index 114f1d7..28160ef 100644
--- a/string/test/memcmp.c
+++ b/string/test/memcmp.c
@@ -10,6 +10,7 @@
#include <stdlib.h>
#include <string.h>
#include "stringlib.h"
+#include "stringtest.h"
static const struct fun
{
@@ -28,9 +29,6 @@
{0, 0}
};
-static int test_status;
-#define ERR(...) (test_status=1, printf(__VA_ARGS__))
-
#define A 32
#define LEN 250000
static unsigned char s1buf[LEN+2*A];
@@ -41,7 +39,7 @@
return (void*)(((uintptr_t)p + A-1) & -A);
}
-static void test(const struct fun *fun, int s1align, int s2align, int len, int diffpos)
+static void test(const struct fun *fun, int s1align, int s2align, int len, int diffpos, int delta)
{
unsigned char *src1 = alignup(s1buf);
unsigned char *src2 = alignup(s2buf);
@@ -49,25 +47,29 @@
unsigned char *s2 = src2 + s2align;
int r;
+ if (err_count >= ERR_LIMIT)
+ return;
if (len > LEN || s1align >= A || s2align >= A)
abort();
- if (diffpos && diffpos >= len)
+ if (diffpos >= len)
+ abort();
+ if ((diffpos < 0) != (delta == 0))
abort();
for (int i = 0; i < len+A; i++)
src1[i] = src2[i] = '?';
for (int i = 0; i < len; i++)
s1[i] = s2[i] = 'a' + i%23;
- if (diffpos)
- s1[diffpos]++;
+ if (delta)
+ s1[diffpos] += delta;
r = fun->fun(s1, s2, len);
- if ((!diffpos && r != 0) || (diffpos && r == 0)) {
+ if ((delta == 0 && r != 0) || (delta > 0 && r <= 0) || (delta < 0 && r >= 0)) {
ERR("%s(align %d, align %d, %d) failed, returned %d\n",
fun->name, s1align, s2align, len, r);
- ERR("src1: %.*s\n", s1align+len+1, src1);
- ERR("src2: %.*s\n", s2align+len+1, src2);
+ quoteat("src1", src1, len+A, diffpos);
+ quoteat("src2", src2, len+A, diffpos);
}
}
@@ -75,21 +77,27 @@
{
int r = 0;
for (int i=0; funtab[i].name; i++) {
- test_status = 0;
+ err_count = 0;
for (int d = 0; d < A; d++)
for (int s = 0; s < A; s++) {
int n;
- for (n = 0; n < 100; n++) {
- test(funtab+i, d, s, n, 0);
- test(funtab+i, d, s, n, n / 2);
+ test(funtab+i, d, s, 0, -1, 0);
+ test(funtab+i, d, s, 1, -1, 0);
+ test(funtab+i, d, s, 1, 0, -1);
+ test(funtab+i, d, s, 1, 0, 1);
+ for (n = 2; n < 100; n++) {
+ test(funtab+i, d, s, n, -1, 0);
+ test(funtab+i, d, s, n, 0, -1);
+ test(funtab+i, d, s, n, n - 1, -1);
+ test(funtab+i, d, s, n, n / 2, 1);
}
for (; n < LEN; n *= 2) {
- test(funtab+i, d, s, n, 0);
- test(funtab+i, d, s, n, n / 2);
+ test(funtab+i, d, s, n, -1, 0);
+ test(funtab+i, d, s, n, n / 2, -1);
}
}
- printf("%s %s\n", test_status ? "FAIL" : "PASS", funtab[i].name);
- if (test_status)
+ printf("%s %s\n", err_count ? "FAIL" : "PASS", funtab[i].name);
+ if (err_count)
r = -1;
}
return r;
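
With the extra delta argument the test can now check the sign of the result, not just zero versus non-zero: delta is added to s1[diffpos], so the expected sign of the return value follows the sign of delta. A standalone sketch of that check (names mirror the test, but the helper itself is illustrative and assumes the modified byte does not wrap):

  #include <string.h>
  #include <stddef.h>

  static int result_sign_ok(const void *s1, const void *s2, size_t len, int delta)
  {
      int r = memcmp(s1, s2, len);
      if (delta == 0)
          return r == 0;  /* buffers identical */
      if (delta > 0)
          return r > 0;   /* s1[diffpos] was incremented */
      return r < 0;       /* s1[diffpos] was decremented */
  }
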
diff --git a/string/test/memcpy.c b/string/test/memcpy.c
index 8572452..bf1bbae 100644
--- a/string/test/memcpy.c
+++ b/string/test/memcpy.c
@@ -10,6 +10,7 @@
#include <stdlib.h>
#include <string.h>
#include "stringlib.h"
+#include "stringtest.h"
static const struct fun
{
@@ -30,9 +31,6 @@
{0, 0}
};
-static int test_status;
-#define ERR(...) (test_status=1, printf(__VA_ARGS__))
-
#define A 32
#define LEN 250000
static unsigned char dbuf[LEN+2*A];
@@ -55,6 +53,8 @@
void *p;
int i;
+ if (err_count >= ERR_LIMIT)
+ return;
if (len > LEN || dalign >= A || salign >= A)
abort();
for (i = 0; i < len+A; i++) {
@@ -70,8 +70,8 @@
for (i = 0; i < len+A; i++) {
if (dst[i] != want[i]) {
ERR("%s(align %d, align %d, %d) failed\n", fun->name, dalign, salign, len);
- ERR("got : %.*s\n", dalign+len+1, dst);
- ERR("want: %.*s\n", dalign+len+1, want);
+ quoteat("got", dst, len+A, i);
+ quoteat("want", want, len+A, i);
break;
}
}
@@ -81,7 +81,7 @@
{
int r = 0;
for (int i=0; funtab[i].name; i++) {
- test_status = 0;
+ err_count = 0;
for (int d = 0; d < A; d++)
for (int s = 0; s < A; s++) {
int n;
@@ -90,8 +90,8 @@
for (; n < LEN; n *= 2)
test(funtab+i, d, s, n);
}
- printf("%s %s\n", test_status ? "FAIL" : "PASS", funtab[i].name);
- if (test_status)
+ printf("%s %s\n", err_count ? "FAIL" : "PASS", funtab[i].name);
+ if (err_count)
r = -1;
}
return r;
diff --git a/string/test/memmove.c b/string/test/memmove.c
index 7891b14..04f4c3c 100644
--- a/string/test/memmove.c
+++ b/string/test/memmove.c
@@ -10,6 +10,7 @@
#include <stdlib.h>
#include <string.h>
#include "stringlib.h"
+#include "stringtest.h"
static const struct fun
{
@@ -28,9 +29,6 @@
{0, 0}
};
-static int test_status;
-#define ERR(...) (test_status=1, printf(__VA_ARGS__))
-
#define A 32
#define LEN 250000
static unsigned char dbuf[LEN+2*A];
@@ -53,6 +51,8 @@
void *p;
int i;
+ if (err_count >= ERR_LIMIT)
+ return;
if (len > LEN || dalign >= A || salign >= A)
abort();
for (i = 0; i < len+A; i++) {
@@ -68,8 +68,8 @@
for (i = 0; i < len+A; i++) {
if (dst[i] != want[i]) {
ERR("%s(align %d, align %d, %d) failed\n", fun->name, dalign, salign, len);
- ERR("got : %.*s\n", dalign+len+1, dst);
- ERR("want: %.*s\n", dalign+len+1, want);
+ quoteat("got", dst, len+A, i);
+ quoteat("want", want, len+A, i);
break;
}
}
@@ -78,13 +78,15 @@
static void test_overlap(const struct fun *fun, int dalign, int salign, int len)
{
unsigned char *src = alignup(sbuf);
- unsigned char *dst = alignup(sbuf);
+ unsigned char *dst = src;
unsigned char *want = wbuf;
unsigned char *s = src + salign;
unsigned char *d = dst + dalign;
unsigned char *w = wbuf + dalign;
void *p;
+ if (err_count >= ERR_LIMIT)
+ return;
if (len > LEN || dalign >= A || salign >= A)
abort();
@@ -92,16 +94,9 @@
src[i] = want[i] = '?';
for (int i = 0; i < len; i++)
- s[i] = w[i] = 'a' + i%23;
-
- /* Copy the potential overlap range. */
- if (s < d) {
- for (int i = 0; i < (uintptr_t)d-(uintptr_t)s; i++)
- want[salign+i] = src[salign+i];
- } else {
- for (int i = 0; i < (uintptr_t)s-(uintptr_t)d; i++)
- want[len + dalign + i] = src[len + dalign + i];
- }
+ s[i] = want[salign+i] = 'a' + i%23;
+ for (int i = 0; i < len; i++)
+ w[i] = s[i];
p = fun->fun(d, s, len);
if (p != d)
@@ -109,9 +104,8 @@
for (int i = 0; i < len+A; i++) {
if (dst[i] != want[i]) {
ERR("%s(align %d, align %d, %d) failed\n", fun->name, dalign, salign, len);
- ERR("got : %.*s\n", dalign+len+1, dst);
- ERR("want: %.*s\n", dalign+len+1, want);
- abort();
+ quoteat("got", dst, len+A, i);
+ quoteat("want", want, len+A, i);
break;
}
}
@@ -119,11 +113,9 @@
int main()
{
- test_overlap(funtab+0, 2, 1, 1);
-
int r = 0;
for (int i=0; funtab[i].name; i++) {
- test_status = 0;
+ err_count = 0;
for (int d = 0; d < A; d++)
for (int s = 0; s < A; s++) {
int n;
@@ -136,8 +128,8 @@
test_overlap(funtab+i, d, s, n);
}
}
- printf("%s %s\n", test_status ? "FAIL" : "PASS", funtab[i].name);
- if (test_status)
+ printf("%s %s\n", err_count ? "FAIL" : "PASS", funtab[i].name);
+ if (err_count)
r = -1;
}
return r;
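
The reworked overlap test makes dst alias src and builds the expected image directly: want first records the pattern at the source offset, then the same pattern at the destination offset, which is what a correct memmove leaves behind whichever way the regions overlap. A compact sketch of that construction (the helper is illustrative; salign, dalign and len are the test parameters):

  /* Expected buffer contents after memmove(buf+dalign, buf+salign, len):
     the moved bytes and any surviving source bytes hold the same pattern. */
  static void build_want(unsigned char *want, int salign, int dalign, int len)
  {
      for (int i = 0; i < len; i++)
          want[salign + i] = 'a' + i % 23;  /* original source image */
      for (int i = 0; i < len; i++)
          want[dalign + i] = 'a' + i % 23;  /* bytes written by the move */
  }
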
diff --git a/string/test/memset.c b/string/test/memset.c
index 48c10fa..8b05bd6 100644
--- a/string/test/memset.c
+++ b/string/test/memset.c
@@ -10,6 +10,7 @@
#include <stdlib.h>
#include <string.h>
#include "stringlib.h"
+#include "stringtest.h"
static const struct fun
{
@@ -27,9 +28,6 @@
{0, 0}
};
-static int test_status;
-#define ERR(...) (test_status=1, printf(__VA_ARGS__))
-
#define A 32
#define LEN 250000
static unsigned char sbuf[LEN+2*A];
@@ -39,12 +37,6 @@
return (void*)(((uintptr_t)p + A-1) & -A);
}
-static void err(const char *name, unsigned char *src, int salign, int c, int len)
-{
- ERR("%s(align %d, %d, %d) failed\n", name, salign, c, len);
- ERR("got : %.*s\n", salign+len+1, src);
-}
-
static void test(const struct fun *fun, int salign, int c, int len)
{
unsigned char *src = alignup(sbuf);
@@ -52,14 +44,14 @@
void *p;
int i;
+ if (err_count >= ERR_LIMIT)
+ return;
if (len > LEN || salign >= A)
abort();
for (i = 0; i < len+A; i++)
src[i] = '?';
for (i = 0; i < len; i++)
s[i] = 'a' + i%23;
- for (; i<len%A; i++)
- s[i] = '*';
p = fun->fun(s, c, len);
if (p != s)
@@ -67,19 +59,22 @@
for (i = 0; i < salign; i++) {
if (src[i] != '?') {
- err(fun->name, src, salign, c, len);
+ ERR("%s(align %d, %d, %d) failed\n", fun->name, salign, c, len);
+ quoteat("got", src, len+A, i);
return;
}
}
- for (i = salign; i < len; i++) {
+ for (; i < salign+len; i++) {
if (src[i] != (unsigned char)c) {
- err(fun->name, src, salign, c, len);
+ ERR("%s(align %d, %d, %d) failed\n", fun->name, salign, c, len);
+ quoteat("got", src, len+A, i);
return;
}
}
- for (; i < len%A; i++) {
- if (src[i] != '*') {
- err(fun->name, src, salign, c, len);
+ for (; i < len+A; i++) {
+ if (src[i] != '?') {
+ ERR("%s(align %d, %d, %d) failed\n", fun->name, salign, c, len);
+ quoteat("got", src, len+A, i);
return;
}
}
@@ -89,7 +84,7 @@
{
int r = 0;
for (int i=0; funtab[i].name; i++) {
- test_status = 0;
+ err_count = 0;
for (int s = 0; s < A; s++) {
int n;
for (n = 0; n < 100; n++) {
@@ -103,8 +98,8 @@
test(funtab+i, s, 0xaa25, n);
}
}
- printf("%s %s\n", test_status ? "FAIL" : "PASS", funtab[i].name);
- if (test_status)
+ printf("%s %s\n", err_count ? "FAIL" : "PASS", funtab[i].name);
+ if (err_count)
r = -1;
}
return r;
diff --git a/string/test/stpcpy.c b/string/test/stpcpy.c
index 9050227..9001057 100644
--- a/string/test/stpcpy.c
+++ b/string/test/stpcpy.c
@@ -11,6 +11,7 @@
#include <stdlib.h>
#include <string.h>
#include "stringlib.h"
+#include "stringtest.h"
static const struct fun
{
@@ -29,14 +30,11 @@
{0, 0}
};
-static int test_status;
-#define ERR(...) (test_status=1, printf(__VA_ARGS__))
-
#define A 32
#define LEN 250000
-static char dbuf[LEN+2*A];
-static char sbuf[LEN+2*A];
-static char wbuf[LEN+2*A];
+static char dbuf[LEN+2*A+1];
+static char sbuf[LEN+2*A+1];
+static char wbuf[LEN+2*A+1];
static void *alignup(void *p)
{
@@ -54,6 +52,8 @@
void *p;
int i;
+ if (err_count >= ERR_LIMIT)
+ return;
if (len > LEN || dalign >= A || salign >= A)
abort();
for (i = 0; i < len+A; i++) {
@@ -62,7 +62,7 @@
}
for (i = 0; i < len; i++)
s[i] = w[i] = 'a' + i%23;
- s[i] = w[i] = '\0';
+ s[len] = w[len] = '\0';
p = fun->fun(d, s);
if (p != d + len)
@@ -70,8 +70,8 @@
for (i = 0; i < len+A; i++) {
if (dst[i] != want[i]) {
ERR("%s(align %d, align %d, %d) failed\n", fun->name, dalign, salign, len);
- ERR("got : %.*s\n", dalign+len+1, dst);
- ERR("want: %.*s\n", dalign+len+1, want);
+ quoteat("got", dst, len+A, i);
+ quoteat("want", want, len+A, i);
break;
}
}
@@ -81,7 +81,7 @@
{
int r = 0;
for (int i=0; funtab[i].name; i++) {
- test_status = 0;
+ err_count = 0;
for (int d = 0; d < A; d++)
for (int s = 0; s < A; s++) {
int n;
@@ -90,8 +90,8 @@
for (; n < LEN; n *= 2)
test(funtab+i, d, s, n);
}
- printf("%s %s\n", test_status ? "FAIL" : "PASS", funtab[i].name);
- if (test_status)
+ printf("%s %s\n", err_count ? "FAIL" : "PASS", funtab[i].name);
+ if (err_count)
r = -1;
}
return r;
diff --git a/string/test/strchr.c b/string/test/strchr.c
index 80a454a..1d90c85 100644
--- a/string/test/strchr.c
+++ b/string/test/strchr.c
@@ -11,6 +11,7 @@
#include <string.h>
#include <limits.h>
#include "stringlib.h"
+#include "stringtest.h"
static const struct fun
{
@@ -30,13 +31,9 @@
{0, 0}
};
-static int test_status;
-#define ERR(...) (test_status=1, printf(__VA_ARGS__))
-
#define A 32
-#define SP 512
-#define LEN 250000
-static char sbuf[LEN+2*A];
+#define LEN 512
+static char sbuf[LEN+3*A];
static void *alignup(void *p)
{
@@ -51,25 +48,33 @@
int seekchar = 0x1;
void *p;
- if (len > LEN || seekpos >= len - 1 || align >= A)
- abort();
- if (seekchar >= 'a' && seekchar <= 'a' + 23)
+ if (err_count >= ERR_LIMIT)
+ return;
+ if (len > LEN || seekpos >= len || align >= A)
abort();
- for (int i = 0; i < len + A; i++)
- src[i] = '?';
- for (int i = 0; i < len - 2; i++)
- s[i] = 'a' + i%23;
+ for (int i = 0; src + i < s; i++)
+ src[i] = i & 1 ? seekchar : 0;
+ for (int i = 1; i < A; i++)
+ s[len+i] = i & 1 ? seekchar : 0;
+ for (int i = 0; i < len; i++)
+ s[i] = 'a' + i%32;
if (seekpos != -1)
- s[seekpos] = seekchar;
- s[len - 1] = '\0';
+ s[seekpos] = s[seekpos+2] = seekchar;
+ s[len] = '\0';
p = fun->fun(s, seekchar);
-
if (p != f) {
- ERR("%s(%p,0x%02x,%d) returned %p\n", fun->name, s, seekchar, len, p);
- ERR("expected: %p\n", f);
- abort();
+ ERR("%s(%p,0x%02x) len %d returned %p, expected %p pos %d\n",
+ fun->name, s, seekchar, len, p, f, seekpos);
+ quote("input", s, len);
+ }
+
+ p = fun->fun(s, 0);
+ if (p != s + len) {
+ ERR("%s(%p,0x%02x) len %d returned %p, expected %p pos %d\n",
+ fun->name, s, seekchar, len, p, s + len, len);
+ quote("input", s, len);
}
}
@@ -77,21 +82,17 @@
{
int r = 0;
for (int i=0; funtab[i].name; i++) {
- test_status = 0;
+ err_count = 0;
for (int a = 0; a < A; a++) {
int n;
- for (n = 1; n < 100; n++) {
- for (int sp = 0; sp < n - 1; sp++)
+ for (n = 1; n < LEN; n++) {
+ for (int sp = 0; sp < n; sp++)
test(funtab+i, a, sp, n);
test(funtab+i, a, -1, n);
}
- for (; n < LEN; n *= 2) {
- test(funtab+i, a, -1, n);
- test(funtab+i, a, n / 2, n);
- }
}
- printf("%s %s\n", test_status ? "FAIL" : "PASS", funtab[i].name);
- if (test_status)
+ printf("%s %s\n", err_count ? "FAIL" : "PASS", funtab[i].name);
+ if (err_count)
r = -1;
}
return r;
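
The rewritten strchr test surrounds the string with guard bytes: everything in front of s and everything after the terminator alternates between NUL and the seek character, so an implementation that reads and acts on out-of-bounds bytes tends to return a visibly wrong pointer, and a second call searching for '\0' must land exactly on the terminator. A sketch of just the guard placement (helper name and parameters are illustrative):

  /* Guard bytes around the test string: alternating NUL/seekchar in front of
     s and behind s[len], matching the loops in the test above. */
  static void place_guards(char *src, char *s, int len, int pad, char seekchar)
  {
      for (int i = 0; src + i < s; i++)
          src[i] = i & 1 ? seekchar : 0;      /* before the string */
      for (int i = 1; i < pad; i++)
          s[len + i] = i & 1 ? seekchar : 0;  /* after the terminator */
  }
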
diff --git a/string/test/strchrnul.c b/string/test/strchrnul.c
index 814dd1e..b103568 100644
--- a/string/test/strchrnul.c
+++ b/string/test/strchrnul.c
@@ -13,6 +13,7 @@
#include <string.h>
#include <limits.h>
#include "stringlib.h"
+#include "stringtest.h"
static const struct fun
{
@@ -23,6 +24,7 @@
F(strchrnul)
#if __aarch64__
F(__strchrnul_aarch64)
+F(__strchrnul_aarch64_mte)
# if __ARM_FEATURE_SVE
F(__strchrnul_aarch64_sve)
# endif
@@ -31,13 +33,9 @@
{0, 0}
};
-static int test_status;
-#define ERR(...) (test_status=1, printf(__VA_ARGS__))
-
#define A 32
-#define SP 512
-#define LEN 250000
-static char sbuf[LEN+2*A];
+#define LEN 512
+static char sbuf[LEN+3*A];
static void *alignup(void *p)
{
@@ -48,29 +46,37 @@
{
char *src = alignup(sbuf);
char *s = src + align;
- char *f = seekpos != -1 ? s + seekpos : s + len - 1;
+ char *f = seekpos != -1 ? s + seekpos : s + len;
int seekchar = 0x1;
void *p;
- if (len > LEN || seekpos >= len - 1 || align >= A)
- abort();
- if (seekchar >= 'a' && seekchar <= 'a' + 23)
+ if (err_count >= ERR_LIMIT)
+ return;
+ if (len > LEN || seekpos >= len || align >= A)
abort();
- for (int i = 0; i < len + A; i++)
- src[i] = '?';
- for (int i = 0; i < len - 2; i++)
- s[i] = 'a' + i%23;
+ for (int i = 0; src + i < s; i++)
+ src[i] = i & 1 ? seekchar : 0;
+ for (int i = 1; i < A; i++)
+ s[len+i] = i & 1 ? seekchar : 0;
+ for (int i = 0; i < len; i++)
+ s[i] = 'a' + i%32;
if (seekpos != -1)
- s[seekpos] = seekchar;
- s[len - 1] = '\0';
+ s[seekpos] = s[seekpos+2] = seekchar;
+ s[len] = '\0';
p = fun->fun(s, seekchar);
-
if (p != f) {
- ERR("%s(%p,0x%02x,%d) returned %p\n", fun->name, s, seekchar, len, p);
- ERR("expected: %p\n", f);
- abort();
+ ERR("%s(%p,0x%02x) len %d returned %p, expected %p pos %d\n",
+ fun->name, s, seekchar, len, p, f, seekpos);
+ quote("input", s, len);
+ }
+
+ p = fun->fun(s, 0);
+ if (p != s + len) {
+ ERR("%s(%p,0x%02x) len %d returned %p, expected %p pos %d\n",
+ fun->name, s, seekchar, len, p, s + len, len);
+ quote("input", s, len);
}
}
@@ -78,21 +84,17 @@
{
int r = 0;
for (int i=0; funtab[i].name; i++) {
- test_status = 0;
+ err_count = 0;
for (int a = 0; a < A; a++) {
int n;
- for (n = 1; n < 100; n++) {
- for (int sp = 0; sp < n - 1; sp++)
+ for (n = 1; n < LEN; n++) {
+ for (int sp = 0; sp < n; sp++)
test(funtab+i, a, sp, n);
test(funtab+i, a, -1, n);
}
- for (; n < LEN; n *= 2) {
- test(funtab+i, a, -1, n);
- test(funtab+i, a, n / 2, n);
- }
}
- printf("%s %s\n", test_status ? "FAIL" : "PASS", funtab[i].name);
- if (test_status)
+ printf("%s %s\n", err_count ? "FAIL" : "PASS", funtab[i].name);
+ if (err_count)
r = -1;
}
return r;
diff --git a/string/test/strcmp.c b/string/test/strcmp.c
index 91fa9dd..078fb1b 100644
--- a/string/test/strcmp.c
+++ b/string/test/strcmp.c
@@ -10,6 +10,7 @@
#include <stdlib.h>
#include <string.h>
#include "stringlib.h"
+#include "stringtest.h"
static const struct fun
{
@@ -20,6 +21,7 @@
F(strcmp)
#if __aarch64__
F(__strcmp_aarch64)
+F(__strcmp_aarch64_mte)
# if __ARM_FEATURE_SVE
F(__strcmp_aarch64_sve)
# endif
@@ -34,20 +36,17 @@
{0, 0}
};
-static int test_status;
-#define ERR(...) (test_status=1, printf(__VA_ARGS__))
-
#define A 32
#define LEN 250000
-static char s1buf[LEN+2*A];
-static char s2buf[LEN+2*A];
+static char s1buf[LEN+2*A+1];
+static char s2buf[LEN+2*A+1];
static void *alignup(void *p)
{
return (void*)(((uintptr_t)p + A-1) & -A);
}
-static void test(const struct fun *fun, int s1align, int s2align, int len, int diffpos)
+static void test(const struct fun *fun, int s1align, int s2align, int len, int diffpos, int delta)
{
char *src1 = alignup(s1buf);
char *src2 = alignup(s2buf);
@@ -55,26 +54,30 @@
char *s2 = src2 + s2align;
int r;
+ if (err_count >= ERR_LIMIT)
+ return;
if (len > LEN || s1align >= A || s2align >= A)
abort();
- if (diffpos > 1 && diffpos >= len-1)
+ if (diffpos >= len)
+ abort();
+ if ((diffpos < 0) != (delta == 0))
abort();
for (int i = 0; i < len+A; i++)
src1[i] = src2[i] = '?';
- for (int i = 0; i < len-1; i++)
+ for (int i = 0; i < len; i++)
s1[i] = s2[i] = 'a' + i%23;
- if (diffpos > 1)
- s1[diffpos]++;
+ if (delta)
+ s1[diffpos] += delta;
s1[len] = s2[len] = '\0';
r = fun->fun(s1, s2);
- if (((diffpos <= 1) && r != 0) || (diffpos > 1 && r == 0)) {
+ if ((delta == 0 && r != 0) || (delta > 0 && r <= 0) || (delta < 0 && r >= 0)) {
ERR("%s(align %d, align %d, %d) failed, returned %d\n",
fun->name, s1align, s2align, len, r);
- ERR("src1: %.*s\n", s1align+len+1, src1);
- ERR("src2: %.*s\n", s2align+len+1, src2);
+ quoteat("src1", src1, len+A, diffpos);
+ quoteat("src2", src2, len+A, diffpos);
}
}
@@ -82,21 +85,26 @@
{
int r = 0;
for (int i=0; funtab[i].name; i++) {
- test_status = 0;
+ err_count = 0;
for (int d = 0; d < A; d++)
for (int s = 0; s < A; s++) {
int n;
- for (n = 0; n < 100; n++) {
- test(funtab+i, d, s, n, 0);
- test(funtab+i, d, s, n, n / 2);
+ test(funtab+i, d, s, 0, -1, 0);
+ test(funtab+i, d, s, 1, -1, 0);
+ test(funtab+i, d, s, 1, 0, 1);
+ test(funtab+i, d, s, 1, 0, -1);
+ for (n = 2; n < 100; n++) {
+ test(funtab+i, d, s, n, -1, 0);
+ test(funtab+i, d, s, n, n - 1, -1);
+ test(funtab+i, d, s, n, n / 2, 1);
}
for (; n < LEN; n *= 2) {
- test(funtab+i, d, s, n, 0);
- test(funtab+i, d, s, n, n / 2);
+ test(funtab+i, d, s, n, -1, 0);
+ test(funtab+i, d, s, n, n / 2, -1);
}
}
- printf("%s %s\n", test_status ? "FAIL" : "PASS", funtab[i].name);
- if (test_status)
+ printf("%s %s\n", err_count ? "FAIL" : "PASS", funtab[i].name);
+ if (err_count)
r = -1;
}
return r;
diff --git a/string/test/strcpy.c b/string/test/strcpy.c
index ea74c9e..68fc76f 100644
--- a/string/test/strcpy.c
+++ b/string/test/strcpy.c
@@ -10,6 +10,7 @@
#include <stdlib.h>
#include <string.h>
#include "stringlib.h"
+#include "stringtest.h"
static const struct fun
{
@@ -30,14 +31,11 @@
{0, 0}
};
-static int test_status;
-#define ERR(...) (test_status=1, printf(__VA_ARGS__))
-
#define A 32
#define LEN 250000
-static char dbuf[LEN+2*A];
-static char sbuf[LEN+2*A];
-static char wbuf[LEN+2*A];
+static char dbuf[LEN+2*A+1];
+static char sbuf[LEN+2*A+1];
+static char wbuf[LEN+2*A+1];
static void *alignup(void *p)
{
@@ -55,6 +53,8 @@
void *p;
int i;
+ if (err_count >= ERR_LIMIT)
+ return;
if (len > LEN || dalign >= A || salign >= A)
abort();
for (i = 0; i < len+A; i++) {
@@ -63,7 +63,7 @@
}
for (i = 0; i < len; i++)
s[i] = w[i] = 'a' + i%23;
- s[i] = w[i] = '\0';
+ s[len] = w[len] = '\0';
p = fun->fun(d, s);
if (p != d)
@@ -71,8 +71,8 @@
for (i = 0; i < len+A; i++) {
if (dst[i] != want[i]) {
ERR("%s(align %d, align %d, %d) failed\n", fun->name, dalign, salign, len);
- ERR("got : %.*s\n", dalign+len+1, dst);
- ERR("want: %.*s\n", dalign+len+1, want);
+ quoteat("got", dst, len+A, i);
+ quoteat("want", want, len+A, i);
break;
}
}
@@ -82,7 +82,7 @@
{
int r = 0;
for (int i=0; funtab[i].name; i++) {
- test_status = 0;
+ err_count = 0;
for (int d = 0; d < A; d++)
for (int s = 0; s < A; s++) {
int n;
@@ -91,8 +91,8 @@
for (; n < LEN; n *= 2)
test(funtab+i, d, s, n);
}
- printf("%s %s\n", test_status ? "FAIL" : "PASS", funtab[i].name);
- if (test_status)
+ printf("%s %s\n", err_count ? "FAIL" : "PASS", funtab[i].name);
+ if (err_count)
r = -1;
}
return r;
diff --git a/string/test/stringtest.h b/string/test/stringtest.h
new file mode 100644
index 0000000..b9c034a
--- /dev/null
+++ b/string/test/stringtest.h
@@ -0,0 +1,50 @@
+/*
+ * Common string test code.
+ *
+ * Copyright (c) 2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#include <ctype.h>
+#include <stdio.h>
+
+/* Error accounting for a test case. */
+static int err_count;
+#define ERR_LIMIT 10
+#define ERR(...) (err_count++, printf(__VA_ARGS__))
+
+static inline void quotechar(unsigned char c)
+{
+ if (isprint(c))
+ putchar(c);
+ else
+ printf("\\x%02x", c);
+}
+
+/* Quoted print of the bytes around position at, or of the entire string if at < 0. */
+static void quoteat(const char *prefix, const void *p, int len, int at)
+{
+ static const int CTXLEN = 15;
+ int i;
+ const char *pre="\"";
+ const char *post="\"";
+ const char *s = p;
+ if (at > CTXLEN) {
+ s += at - CTXLEN;
+ len -= at - CTXLEN;
+ pre = "...\"";
+ }
+ if (at >= 0 && len > 2*CTXLEN + 1) {
+ len = 2*CTXLEN + 1;
+ post = "\"...";
+ }
+ printf("%4s: %s", prefix, pre);
+ for (i = 0; i < len; i++)
+ quotechar(s[i]);
+ printf("%s\n", post);
+}
+
+static inline void quote(const char *prefix, const void *p, int len)
+{
+ quoteat(prefix, p, len, -1);
+}
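
Taken together, err_count, ERR, quote and quoteat give every test the same reporting shape: count the failure, print a message, and quote the offending bytes with context instead of aborting on the first mismatch. A minimal usage sketch (buffers and helper name are hypothetical, but the pattern matches the tests in this patch):

  #include "stringtest.h"

  static void check_bytes(const unsigned char *got, const unsigned char *want, int len)
  {
      if (err_count >= ERR_LIMIT)
          return;                            /* stop after ERR_LIMIT failures */
      for (int i = 0; i < len; i++)
          if (got[i] != want[i]) {
              ERR("mismatch at %d\n", i);    /* bumps err_count and prints */
              quoteat("got", got, len, i);   /* quoted context around i */
              quoteat("want", want, len, i);
              break;
          }
  }
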
diff --git a/string/test/strlen.c b/string/test/strlen.c
index 96e6cd6..b2e2ffa 100644
--- a/string/test/strlen.c
+++ b/string/test/strlen.c
@@ -11,6 +11,7 @@
#include <string.h>
#include <limits.h>
#include "stringlib.h"
+#include "stringtest.h"
static const struct fun
{
@@ -34,13 +35,10 @@
{0, 0}
};
-static int test_status;
-#define ERR(...) (test_status=1, printf(__VA_ARGS__))
-
#define A 32
#define SP 512
#define LEN 250000
-static char sbuf[LEN+2*A];
+static char sbuf[LEN+2*A+1];
static void *alignup(void *p)
{
@@ -53,21 +51,22 @@
char *s = src + align;
size_t r;
+ if (err_count >= ERR_LIMIT)
+ return;
if (len > LEN || align >= A)
abort();
for (int i = 0; i < len + A; i++)
src[i] = '?';
- for (int i = 0; i < len - 2; i++)
+ for (int i = 0; i < len; i++)
s[i] = 'a' + i%23;
- s[len - 1] = '\0';
+ s[len] = '\0';
r = fun->fun(s);
- if (r != len-1) {
+ if (r != len) {
ERR("%s(%p) returned %zu\n", fun->name, s, r);
- ERR("input: %.*s\n", align+len+1, src);
- ERR("expected: %d\n", len);
- abort();
+ quote("input", src, len);
+ printf("expected: %d\n", len);
}
}
@@ -75,7 +74,7 @@
{
int r = 0;
for (int i=0; funtab[i].name; i++) {
- test_status = 0;
+ err_count = 0;
for (int a = 0; a < A; a++) {
int n;
for (n = 1; n < 100; n++)
@@ -83,8 +82,8 @@
for (; n < LEN; n *= 2)
test(funtab+i, a, n);
}
- printf("%s %s\n", test_status ? "FAIL" : "PASS", funtab[i].name);
- if (test_status)
+ printf("%s %s\n", err_count ? "FAIL" : "PASS", funtab[i].name);
+ if (err_count)
r = -1;
}
return r;
diff --git a/string/test/strncmp.c b/string/test/strncmp.c
index 43f941d..8ed21bd 100644
--- a/string/test/strncmp.c
+++ b/string/test/strncmp.c
@@ -10,6 +10,7 @@
#include <stdlib.h>
#include <string.h>
#include "stringlib.h"
+#include "stringtest.h"
static const struct fun
{
@@ -20,6 +21,7 @@
F(strncmp)
#if __aarch64__
F(__strncmp_aarch64)
+F(__strncmp_aarch64_mte)
# if __ARM_FEATURE_SVE
F(__strncmp_aarch64_sve)
# endif
@@ -28,20 +30,17 @@
{0, 0}
};
-static int test_status;
-#define ERR(...) (test_status=1, printf(__VA_ARGS__))
-
#define A 32
#define LEN 250000
-static char s1buf[LEN+2*A];
-static char s2buf[LEN+2*A];
+static char s1buf[LEN+2*A+1];
+static char s2buf[LEN+2*A+1];
static void *alignup(void *p)
{
return (void*)(((uintptr_t)p + A-1) & -A);
}
-static void test(const struct fun *fun, int s1align, int s2align, int maxlen, int diffpos, int len)
+static void test(const struct fun *fun, int s1align, int s2align, int maxlen, int diffpos, int len, int delta)
{
char *src1 = alignup(s1buf);
char *src2 = alignup(s2buf);
@@ -49,28 +48,34 @@
char *s2 = src2 + s2align;
int r;
+ if (err_count >= ERR_LIMIT)
+ return;
if (len > LEN || s1align >= A || s2align >= A)
abort();
- if (diffpos > 1 && diffpos >= len-1)
+ if (diffpos >= len)
+ abort();
+ if ((diffpos < 0) != (delta == 0))
abort();
for (int i = 0; i < len+A; i++)
src1[i] = src2[i] = '?';
- for (int i = 0; i < len-1; i++)
+ for (int i = 0; i < len; i++)
s1[i] = s2[i] = 'a' + i%23;
- if (diffpos > 1)
- s1[diffpos]++;
+ if (delta)
+ s1[diffpos] += delta;
s1[len] = s2[len] = '\0';
r = fun->fun(s1, s2, maxlen);
- diffpos = maxlen <= diffpos ? 0 : diffpos;
-
- if (((diffpos <= 1) && r != 0) || (diffpos > 1 && r == 0)) {
- ERR("%s(align %d, align %d, %d (%d)) failed, returned %d (%d)\n",
- fun->name, s1align, s2align, maxlen, len, r, diffpos);
- ERR("src1: %.*s\n", s1align+len+1, src1);
- ERR("src2: %.*s\n", s2align+len+1, src2);
+ if (diffpos >= maxlen) {
+ diffpos = -1;
+ delta = 0;
+ }
+ if ((delta == 0 && r != 0) || (delta > 0 && r <= 0) || (delta < 0 && r >= 0)) {
+ ERR("%s(align %d, align %d, %d) (len=%d, diffpos=%d) failed, returned %d\n",
+ fun->name, s1align, s2align, maxlen, len, diffpos, r);
+ quoteat("src1", src1, len+A, diffpos);
+ quoteat("src2", src2, len+A, diffpos);
}
}
@@ -78,25 +83,32 @@
{
int r = 0;
for (int i=0; funtab[i].name; i++) {
- test_status = 0;
+ err_count = 0;
for (int d = 0; d < A; d++)
for (int s = 0; s < A; s++) {
int n;
- for (n = 0; n < 100; n++) {
- test(funtab+i, d, s, n, 0, n);
- test(funtab+i, d, s, n, n/2, n);
- test(funtab+i, d, s, n/2, 0, n);
- test(funtab+i, d, s, n/2, n/2, n);
+ test(funtab+i, d, s, 0, -1, 0, 0);
+ test(funtab+i, d, s, 1, -1, 0, 0);
+ test(funtab+i, d, s, 0, -1, 1, 0);
+ test(funtab+i, d, s, 1, -1, 1, 0);
+ test(funtab+i, d, s, 2, -1, 1, 0);
+ test(funtab+i, d, s, 1, 0, 1, 1);
+ test(funtab+i, d, s, 1, 0, 1, -1);
+ for (n = 2; n < 100; n++) {
+ test(funtab+i, d, s, n, -1, n, 0);
+ test(funtab+i, d, s, n, n/2, n, 1);
+ test(funtab+i, d, s, n/2, -1, n, 0);
+ test(funtab+i, d, s, n/2, n/2, n, -1);
}
for (; n < LEN; n *= 2) {
- test(funtab+i, d, s, n, 0, n);
- test(funtab+i, d, s, n, n/2, n);
- test(funtab+i, d, s, n/2, 0, n);
- test(funtab+i, d, s, n/2, n/2, n);
+ test(funtab+i, d, s, n, -1, n, 0);
+ test(funtab+i, d, s, n, n/2, n, -1);
+ test(funtab+i, d, s, n/2, -1, n, 0);
+ test(funtab+i, d, s, n/2, n/2, n, 1);
}
}
- printf("%s %s\n", test_status ? "FAIL" : "PASS", funtab[i].name);
- if (test_status)
+ printf("%s %s\n", err_count ? "FAIL" : "PASS", funtab[i].name);
+ if (err_count)
r = -1;
}
return r;
diff --git a/string/test/strnlen.c b/string/test/strnlen.c
index db41f2a..29f85a0 100644
--- a/string/test/strnlen.c
+++ b/string/test/strnlen.c
@@ -13,6 +13,7 @@
#include <string.h>
#include <limits.h>
#include "stringlib.h"
+#include "stringtest.h"
static const struct fun
{
@@ -31,13 +32,10 @@
{0, 0}
};
-static int test_status;
-#define ERR(...) (test_status=1, printf(__VA_ARGS__))
-
#define A 32
#define SP 512
#define LEN 250000
-static char sbuf[LEN+2*A];
+static char sbuf[LEN+2*A+1];
static void *alignup(void *p)
{
@@ -49,23 +47,24 @@
char *src = alignup(sbuf);
char *s = src + align;
size_t r;
- size_t e = maxlen < len ? maxlen : len - 1;
+ size_t e = maxlen < len ? maxlen : len;
+ if (err_count >= ERR_LIMIT)
+ return;
if (len > LEN || align >= A)
abort();
for (int i = 0; i < len + A; i++)
src[i] = '?';
- for (int i = 0; i < len - 2; i++)
+ for (int i = 0; i < len; i++)
s[i] = 'a' + i%23;
- s[len - 1] = '\0';
+ s[len] = '\0';
r = fun->fun(s, maxlen);
if (r != e) {
- ERR("%s(%p) returned %zu\n", fun->name, s, r);
- ERR("input: %.*s\n", align+len+1, src);
- ERR("expected: %d\n", len);
- abort();
+ ERR("%s(%p, %d) returned %zu\n", fun->name, s, maxlen, r);
+ quote("input", src, len+A);
+ printf("expected: %zu\n", e);
}
}
@@ -73,7 +72,7 @@
{
int r = 0;
for (int i=0; funtab[i].name; i++) {
- test_status = 0;
+ err_count = 0;
for (int a = 0; a < A; a++) {
int n;
for (n = 1; n < 100; n++)
@@ -85,8 +84,8 @@
test(funtab+i, a, n/2, n);
}
}
- printf("%s %s\n", test_status ? "FAIL" : "PASS", funtab[i].name);
- if (test_status)
+ printf("%s %s\n", err_count ? "FAIL" : "PASS", funtab[i].name);
+ if (err_count)
r = -1;
}
return r;
diff --git a/string/test/strrchr.c b/string/test/strrchr.c
index 532fa51..1ad1f3a 100644
--- a/string/test/strrchr.c
+++ b/string/test/strrchr.c
@@ -11,6 +11,7 @@
#include <string.h>
#include <limits.h>
#include "stringlib.h"
+#include "stringtest.h"
static const struct fun
{
@@ -21,6 +22,7 @@
F(strrchr)
#if __aarch64__
F(__strrchr_aarch64)
+F(__strrchr_aarch64_mte)
# if __ARM_FEATURE_SVE
F(__strrchr_aarch64_sve)
# endif
@@ -29,13 +31,9 @@
{0, 0}
};
-static int test_status;
-#define ERR(...) (test_status=1, printf(__VA_ARGS__))
-
#define A 32
-#define SP 512
-#define LEN 250000
-static char sbuf[LEN+2*A];
+#define LEN 512
+static char sbuf[LEN+3*A];
static void *alignup(void *p)
{
@@ -50,25 +48,35 @@
int seekchar = 0x1;
void *p;
- if (len > LEN || seekpos >= len - 1 || align >= A)
- abort();
- if (seekchar >= 'a' && seekchar <= 'a' + 23)
+ if (err_count >= ERR_LIMIT)
+ return;
+ if (len > LEN || seekpos >= len || align >= A)
abort();
- for (int i = 0; i < len + A; i++)
- src[i] = '?';
- for (int i = 0; i < len - 2; i++)
- s[i] = 'a' + i%23;
- if (seekpos != -1)
+ for (int i = 0; src + i < s; i++)
+ src[i] = i & 1 ? seekchar : 0;
+ for (int i = 1; i < A; i++)
+ s[len+i] = i & 1 ? seekchar : 0;
+ for (int i = 0; i < len; i++)
+ s[i] = 'a' + i%32;
+ if (seekpos != -1) {
s[seekpos/2] = s[seekpos] = seekchar;
- s[len - 1] = '\0';
+ s[seekpos - (seekpos & 15)] = s[seekpos & 7] = seekchar;
+ }
+ s[len] = '\0';
p = fun->fun(s, seekchar);
-
if (p != f) {
- ERR("%s(%p,0x%02x,%d) returned %p\n", fun->name, s, seekchar, len, p);
- ERR("expected: %p\n", f);
- abort();
+ ERR("%s(%p,0x%02x) len %d returned %p, expected %p pos %d\n",
+ fun->name, s, seekchar, len, p, f, seekpos);
+ quote("input", s, len);
+ }
+
+ p = fun->fun(s, 0);
+ if (p != s + len) {
+ ERR("%s(%p,0x%02x) len %d returned %p, expected %p pos %d\n",
+ fun->name, s, seekchar, len, p, s + len, len);
+ quote("input", s, len);
}
}
@@ -76,21 +84,17 @@
{
int r = 0;
for (int i=0; funtab[i].name; i++) {
- test_status = 0;
+ err_count = 0;
for (int a = 0; a < A; a++) {
int n;
- for (n = 1; n < 100; n++) {
- for (int sp = 0; sp < n - 1; sp++)
+ for (n = 1; n < LEN; n++) {
+ for (int sp = 0; sp < n; sp++)
test(funtab+i, a, sp, n);
test(funtab+i, a, -1, n);
}
- for (; n < LEN; n *= 2) {
- test(funtab+i, a, -1, n);
- test(funtab+i, a, n / 2, n);
- }
}
- printf("%s %s\n", test_status ? "FAIL" : "PASS", funtab[i].name);
- if (test_status)
+ printf("%s %s\n", err_count ? "FAIL" : "PASS", funtab[i].name);
+ if (err_count)
r = -1;
}
return r;