Merge "Upgrade arm-optimized-routines to e112794669739057178f5ae8c94ccf0f8ca59c60"
diff --git a/METADATA b/METADATA
index bbb8e3f..f2b01ff 100644
--- a/METADATA
+++ b/METADATA
@@ -9,11 +9,11 @@
     type: GIT
     value: "https://github.com/ARM-software/optimized-routines.git"
   }
-  version: "30c1ada57d6af777f44826ae31f92ceeffcbe02b"
+  version: "e112794669739057178f5ae8c94ccf0f8ca59c60"
   license_type: NOTICE
   last_upgrade_date {
     year: 2020
-    month: 4
+    month: 5
     day: 1
   }
 }
diff --git a/config.mk.dist b/config.mk.dist
index 2336c52..cac40eb 100644
--- a/config.mk.dist
+++ b/config.mk.dist
@@ -60,6 +60,9 @@
 #math-ulpflags = -q -f
 #math-testflags = -nostatus
 
+# Remove GNU Property Notes from asm files.
+#string-cflags += -DWANT_GNU_PROPERTY=0
+
 # Enable assertion checks.
 #networking-cflags += -DWANT_ASSERT
 
diff --git a/string/Dir.mk b/string/Dir.mk
index bb881a3..400eda8 100644
--- a/string/Dir.mk
+++ b/string/Dir.mk
@@ -22,7 +22,7 @@
 	build/lib/libstringlib.so \
 	build/lib/libstringlib.a \
 
-string-tools := \
+string-tests := \
 	build/bin/test/memcpy \
 	build/bin/test/memmove \
 	build/bin/test/memset \
@@ -53,11 +53,11 @@
 string-files := \
 	$(string-objs) \
 	$(string-libs) \
-	$(string-tools) \
+	$(string-tests) \
 	$(string-benches) \
 	$(string-includes) \
 
-all-string: $(string-libs) $(string-tools) $(string-benches) $(string-includes)
+all-string: $(string-libs) $(string-tests) $(string-benches) $(string-includes)
 
 $(string-objs): $(string-includes)
 $(string-objs): CFLAGS_ALL += $(string-cflags)
@@ -82,21 +82,14 @@
 build/bin/%.sh: $(S)/test/%.sh
 	cp $< $@
 
-check-string: $(string-tools)
-	$(EMULATOR) build/bin/test/memcpy
-	$(EMULATOR) build/bin/test/memmove
-	$(EMULATOR) build/bin/test/memset
-	$(EMULATOR) build/bin/test/memchr
-	$(EMULATOR) build/bin/test/memcmp
-	$(EMULATOR) build/bin/test/strcpy
-	$(EMULATOR) build/bin/test/stpcpy
-	$(EMULATOR) build/bin/test/strcmp
-	$(EMULATOR) build/bin/test/strchr
-	$(EMULATOR) build/bin/test/strrchr
-	$(EMULATOR) build/bin/test/strchrnul
-	$(EMULATOR) build/bin/test/strlen
-	$(EMULATOR) build/bin/test/strnlen
-	$(EMULATOR) build/bin/test/strncmp
+string-tests-out = $(string-tests:build/bin/test/%=build/string/test/%.out)
+
+build/string/test/%.out: build/bin/test/%
+	$(EMULATOR) $^ | tee $@.tmp
+	mv $@.tmp $@
+
+check-string: $(string-tests-out)
+	! grep FAIL $^
 
 bench-string: $(string-benches)
 	$(EMULATOR) build/bin/bench/memcpy
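
Note on the new check-string rule: piping each test binary through tee discards the test's exit status (the pipeline reports tee's), so failures are detected by grepping the captured logs for FAIL instead; writing to $@.tmp and renaming only on success keeps an interrupted run from leaving a truncated .out file that make would treat as up to date.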
diff --git a/string/aarch64/check-arch.S b/string/aarch64/check-arch.S
index d6775a4..4537aab 100644
--- a/string/aarch64/check-arch.S
+++ b/string/aarch64/check-arch.S
@@ -8,3 +8,6 @@
 #if !__aarch64__
 # error ARCH setting does not match the compiler.
 #endif
+
+#include "../asmdefs.h"
+END_FILE
diff --git a/string/aarch64/memchr-mte.S b/string/aarch64/memchr-mte.S
new file mode 100644
index 0000000..0a869c7
--- /dev/null
+++ b/string/aarch64/memchr-mte.S
@@ -0,0 +1,150 @@
+/*
+ * memchr - find a character in a memory zone
+ *
+ * Copyright (c) 2014-2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64
+ * Neon Available.
+ */
+
+#include "../asmdefs.h"
+
+/* Arguments and results.  */
+#define srcin		x0
+#define chrin		w1
+#define cntin		x2
+
+#define result		x0
+
+#define src		x3
+#define	tmp		x4
+#define	tmp2		x5
+#define wtmp2		w5
+#define synd		x6
+#define soff		x9
+#define cntrem		x10
+
+#define vrepchr		v0
+#define qdata		q1
+#define vdata		v1
+#define vhas_chr	v2
+#define vrepmask	v3
+#define vend		v4
+
+/*
+ * Core algorithm:
+ *
+ * For each 16-byte chunk we calculate a 64-bit syndrome value, with four bits
+ * per byte. For each tuple, bit 0 is set if the relevant byte matched the
+ * requested character and bits 1, 2 and 3 are not used (faster than using a
+ * lower bit syndrome). Since the bits in the syndrome reflect exactly the
+ * order in which things occur in the original string, counting trailing
+ * zeros allows us to identify exactly which byte has matched.
+ */
+
+ENTRY (__memchr_aarch64_mte)
+	/* Do not dereference srcin if no bytes to compare.  */
+	cbz	cntin, L(zero_length)
+	/*
+	 * Magic constant 0x10011001 allows us to identify which lane matches
+	 * the requested byte.
+	 */
+	mov	wtmp2, #0x1001
+	movk	wtmp2, #0x1001, lsl #16
+	dup	vrepchr.16b, chrin
+	/* Work with aligned 16-byte chunks */
+	bic	src, srcin, #15
+	dup	vrepmask.4s, wtmp2
+	ands	soff, srcin, #15
+	and	cntrem, cntin, #15
+	b.eq	L(aligned_start)
+
+	/*
+	 * Input string is not 16-byte aligned. We calculate the syndrome
+	 * value for the aligned 16 bytes block containing the first bytes
+	 * and mask the irrelevant part.
+	 */
+
+	ld1	{vdata.16b}, [src], #16
+	sub	tmp, soff, #16
+	adds	cntin, cntin, tmp
+	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
+	lsl	tmp, soff, #2
+	mov	tmp2, #~0
+	and	vhas_chr.16b, vhas_chr.16b, vrepmask.16b
+	lsl	tmp, tmp2, tmp
+	addp	vend.16b, vhas_chr.16b, vhas_chr.16b		/* 128->64 */
+	mov	synd, vend.d[0]
+	/* Clear the soff*4 lower bits */
+	and	synd, synd, tmp
+	/* The first block can also be the last */
+	b.ls	L(masklast)
+	/* Have we found something already? */
+	cbnz	synd, L(tail)
+
+L(aligned_start):
+	/* Make sure that it won't overread by a 16-byte chunk */
+	add	tmp, cntin, #15
+	tbnz	tmp, 4, L(loop32_2)
+
+L(loop32):
+	ld1	{vdata.16b}, [src], #16
+	subs	cntin, cntin, #16
+	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
+	addp	vend.16b, vhas_chr.16b, vhas_chr.16b		/* 128->64 */
+	mov	synd, vend.d[0]
+	cbnz	synd, L(end)
+
+L(loop32_2):
+	ld1	{vdata.16b}, [src], #16
+	subs	cntin, cntin, #16
+	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
+	b.ls	L(end)
+	addp	vend.16b, vhas_chr.16b, vhas_chr.16b		/* 128->64 */
+	mov	synd, vend.d[0]
+	/* We haven't found the character, loop with 32 byte chunks */
+	cbz	synd, L(loop32)
+
+L(end):
+	/* Termination condition found, let's calculate the syndrome value */
+	and	vhas_chr.16b, vhas_chr.16b, vrepmask.16b
+	addp	vend.16b, vhas_chr.16b, vhas_chr.16b		/* 128->64 */
+	mov	synd, vend.d[0]
+	/* Only do the clear for the last possible block */
+	b.hs	L(tail)
+
+L(masklast):
+	/* Clear the (16 - ((cntrem + soff) % 16)) * 4 upper bits */
+	add	tmp, cntrem, soff
+	and	tmp, tmp, #15
+	sub	tmp, tmp, #16
+	neg	tmp, tmp, lsl #2
+	lsl	synd, synd, tmp
+	lsr	synd, synd, tmp
+
+L(tail):
+	/* Count the trailing zeros using bit reversing */
+	rbit	synd, synd
+	/* Compensate the last post-increment */
+	sub	src, src, #16
+	/* Check that we have found a character */
+	cmp	synd, #0
+	/* And count the leading zeros */
+	clz	synd, synd
+	/* Compute the potential result */
+	add	result, src, synd, lsr #2
+	/* Select result or NULL */
+	csel	result, xzr, result, eq
+	ret
+
+L(zero_length):
+	mov	result, #0
+	ret
+
+END (__memchr_aarch64_mte)
+
+END_FILE
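
As a scalar sketch of the syndrome scheme described in the comment above (illustrative C, not part of the patch; memchr_chunk_model is a hypothetical name): each byte of the chunk owns one nibble of a 64-bit word, a match sets bit 0 of that nibble, and counting trailing zeros recovers the index of the first match — the same result the cmeq/and/addp sequence followed by rbit+clz computes in vector form.

    #include <stdint.h>
    #include <stddef.h>

    /* One 16-byte chunk of memchr, modelled with a 4-bits-per-byte syndrome. */
    static const void *memchr_chunk_model(const unsigned char *s, int c, size_t n)
    {
        uint64_t synd = 0;
        for (size_t i = 0; i < 16 && i < n; i++)
            if (s[i] == (unsigned char)c)
                synd |= 1ull << (4 * i);        /* bit 0 of nibble i */
        if (synd == 0)
            return 0;                           /* no match in this chunk */
        return s + __builtin_ctzll(synd) / 4;   /* first set nibble = first match */
    }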
diff --git a/string/aarch64/memchr-sve.S b/string/aarch64/memchr-sve.S
index 58badd2..53efd4c 100644
--- a/string/aarch64/memchr-sve.S
+++ b/string/aarch64/memchr-sve.S
@@ -5,6 +5,8 @@
  * SPDX-License-Identifier: MIT
  */
 
+#include "../asmdefs.h"
+
 #if __ARM_FEATURE_SVE
 /* Assumptions:
  *
@@ -15,10 +17,7 @@
 	.arch	armv8-a+sve
 	.text
 
-	.globl	__memchr_aarch64_sve
-	.type	__memchr_aarch64_sve, %function
-	.p2align 4
-__memchr_aarch64_sve:
+ENTRY_ALIGN(__memchr_aarch64_sve, 4)
 	dup	z1.b, w1			/* duplicate c to a vector */
 	setffr					/* initialize FFR */
 	mov	x3, 0				/* initialize off */
@@ -60,5 +59,8 @@
 9:	mov	x0, 0			/* return null */
 	ret
 
-	.size	__memchr_aarch64_sve, . - __memchr_aarch64_sve
+END (__memchr_aarch64_sve)
+
 #endif
+
+END_FILE
diff --git a/string/aarch64/memchr.S b/string/aarch64/memchr.S
index 10be49e..f5538bd 100644
--- a/string/aarch64/memchr.S
+++ b/string/aarch64/memchr.S
@@ -110,7 +110,7 @@
 	addp	vend.16b, vend.16b, vend.16b			/* 128->64 */
 	mov	synd, vend.d[0]
 	/* Only do the clear for the last possible block */
-	b.hi	L(tail)
+	b.hs	L(tail)
 
 L(masklast):
 	/* Clear the (32 - ((cntrem + soff) % 32)) * 2 upper bits */
@@ -141,3 +141,5 @@
 	ret
 
 END (__memchr_aarch64)
+
+END_FILE
diff --git a/string/aarch64/memcmp-sve.S b/string/aarch64/memcmp-sve.S
index c216103..07512ba 100644
--- a/string/aarch64/memcmp-sve.S
+++ b/string/aarch64/memcmp-sve.S
@@ -5,6 +5,8 @@
  * SPDX-License-Identifier: MIT
  */
 
+#include "../asmdefs.h"
+
 #if __ARM_FEATURE_SVE
 /* Assumptions:
  *
@@ -15,10 +17,7 @@
 	.arch	armv8-a+sve
 	.text
 
-	.globl	__memcmp_aarch64_sve
-	.type	__memcmp_aarch64_sve, %function
-	.p2align 4
-__memcmp_aarch64_sve:
+ENTRY_ALIGN (__memcmp_aarch64_sve, 4)
 	mov	x3, 0			/* initialize off */
 
 0:	whilelo	p0.b, x3, x2		/* while off < max */
@@ -46,5 +45,8 @@
 9:	mov	x0, 0			/* return equality */
 	ret
 
-	.size	__memcmp_aarch64_sve, . - __memcmp_aarch64_sve
+END (__memcmp_aarch64_sve)
+
 #endif
+
+END_FILE
diff --git a/string/aarch64/memcmp.S b/string/aarch64/memcmp.S
index 6722516..4be23de 100644
--- a/string/aarch64/memcmp.S
+++ b/string/aarch64/memcmp.S
@@ -131,3 +131,5 @@
 	ret
 
 END (__memcmp_aarch64)
+
+END_FILE
diff --git a/string/aarch64/memcpy-advsimd.S b/string/aarch64/memcpy-advsimd.S
index f7dce55..844cc41 100644
--- a/string/aarch64/memcpy-advsimd.S
+++ b/string/aarch64/memcpy-advsimd.S
@@ -50,8 +50,8 @@
    The loop tail is handled by always copying 64 bytes from the end.
 */
 
-ENTRY (__memcpy_aarch64_simd)
 ENTRY_ALIAS (__memmove_aarch64_simd)
+ENTRY (__memcpy_aarch64_simd)
 	add	srcend, src, count
 	add	dstend, dstin, count
 	cmp	count, 128
@@ -199,3 +199,5 @@
 	ret
 
 END (__memcpy_aarch64_simd)
+
+END_FILE
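
The ENTRY_ALIAS/ENTRY swap here (and in memcpy.S below) appears to be motivated by the new BTI support in asmdefs.h: ENTRY now emits a BTI landing pad immediately after the label it defines, so placing the plain alias label first lets __memmove_aarch64_simd fall through onto __memcpy_aarch64_simd's landing pad, whereas with the old order an indirect branch to the alias would have landed past the BTI instruction.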
diff --git a/string/aarch64/memcpy.S b/string/aarch64/memcpy.S
index 060b794..c91f6e5 100644
--- a/string/aarch64/memcpy.S
+++ b/string/aarch64/memcpy.S
@@ -53,8 +53,8 @@
    The loop tail is handled by always copying 64 bytes from the end.
 */
 
-ENTRY (__memcpy_aarch64)
 ENTRY_ALIAS (__memmove_aarch64)
+ENTRY (__memcpy_aarch64)
 	add	srcend, src, count
 	add	dstend, dstin, count
 	cmp	count, 128
@@ -237,3 +237,5 @@
 	ret
 
 END (__memcpy_aarch64)
+
+END_FILE
diff --git a/string/aarch64/memset.S b/string/aarch64/memset.S
index aa580df..7c3e4f4 100644
--- a/string/aarch64/memset.S
+++ b/string/aarch64/memset.S
@@ -112,3 +112,5 @@
 	ret
 
 END (__memset_aarch64)
+
+END_FILE
diff --git a/string/aarch64/strchr-mte.S b/string/aarch64/strchr-mte.S
index 2041e73..b9a5e71 100644
--- a/string/aarch64/strchr-mte.S
+++ b/string/aarch64/strchr-mte.S
@@ -1,130 +1,105 @@
 /*
  * strchr - find a character in a string
  *
- * Copyright (c) 2014-2020, Arm Limited.
+ * Copyright (c) 2020, Arm Limited.
  * SPDX-License-Identifier: MIT
  */
 
 /* Assumptions:
  *
- * ARMv8-a, AArch64
- * Neon Available.
+ * ARMv8-a, AArch64, Advanced SIMD.
+ * MTE compatible.
  */
 
 #include "../asmdefs.h"
 
-/* Arguments and results.  */
 #define srcin		x0
 #define chrin		w1
-
 #define result		x0
 
 #define src		x2
-#define	tmp1		x3
-#define wtmp2		w4
-#define tmp3		x5
+#define tmp1		x1
+#define wtmp2		w3
+#define tmp3		x3
 
 #define vrepchr		v0
-#define qdata		q1
 #define vdata		v1
+#define qdata		q1
 #define vhas_nul	v2
 #define vhas_chr	v3
-#define vrepmask_0	v4
-#define vrepmask_c	v5
+#define vrepmask	v4
+#define vrepmask2	v5
 #define vend		v6
-
-#define L(l) .L ## l
+#define dend		d6
 
 /* Core algorithm.
 
-   For each 16-byte chunk we calculate a 64-bit syndrome value, with
-   four bits per byte (LSB is always in bits 0 and 1, for both big
-   and little-endian systems).  For each tuple, bit 0 is set if
-   the relevant byte matched the requested character; bit 1 is set
-   if the relevant byte matched the NUL end of string (we trigger
-   off bit0 for the special case of looking for NUL) and bits 2 and 3
-   are not used.
-   Since the bits in the syndrome reflect exactly the order in which
-   things occur in the original string a count_trailing_zeros()
-   operation will identify exactly which byte is causing the termination,
-   and why. */
+   For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
+   per byte. For even bytes, bits 0-1 are set if the relevant byte matched the
+   requested character, bits 2-3 are set if the byte is NUL (or matched), and
+   bits 4-7 are not used (and must be zero if none of bits 0-3 are set). Odd
+   bytes set bits 4-7 so that adjacent bytes can be merged. Since the bits
+   in the syndrome reflect the order in which things occur in the original
+   string, counting trailing zeros identifies exactly which byte matched.  */
 
-/* Locals and temporaries. */
-
-ENTRY(__strchr_aarch64_mte)
-	/* Magic constant 0x10011001 to allow us to identify which lane
-	   matches the requested byte.  Magic constant 0x20022002 used
-	   similarly for NUL termination. */
-	mov	wtmp2, #0x1001
-	movk	wtmp2, #0x1001, lsl #16
+ENTRY (__strchr_aarch64_mte)
+	bic	src, srcin, 15
 	dup	vrepchr.16b, chrin
-	bic	src, srcin, #15		/* Work with aligned 16-byte chunks. */
-	dup	vrepmask_c.4s, wtmp2
-	ands	tmp1, srcin, #15
-	add	vrepmask_0.4s, vrepmask_c.4s, vrepmask_c.4s /* equiv: lsl #1 */
-	b.eq	L(loop)
-
-	/* Input string is not 16-byte aligned.  Rather than forcing
-	   the padding bytes to a safe value, we calculate the syndrome
-	   for all the bytes, but then mask off those bits of the
-	   syndrome that are related to the padding.  */
-	ldr	qdata, [src], #16
-	cmeq	vhas_nul.16b, vdata.16b, #0
+	ld1	{vdata.16b}, [src]
+	mov	wtmp2, 0x3003
+	dup	vrepmask.8h, wtmp2
+	cmeq	vhas_nul.16b, vdata.16b, 0
 	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
-	and	vhas_nul.16b, vhas_nul.16b, vrepmask_0.16b
-	and	vhas_chr.16b, vhas_chr.16b, vrepmask_c.16b
-	lsl	tmp1, tmp1, #2
-	orr	vend.16b, vhas_nul.16b, vhas_chr.16b
-	mov	tmp3, #~0
-	addp	vend.16b, vend.16b, vend.16b		/* 128->64 */
-	lsl	tmp1, tmp3, tmp1
+	mov	wtmp2, 0xf00f
+	dup	vrepmask2.8h, wtmp2
 
-	mov	tmp3, vend.d[0]
-	ands	tmp1, tmp3, tmp1	/* Mask padding bits. */
-	b.ne	L(tail)
+	bit	vhas_nul.16b, vhas_chr.16b, vrepmask.16b
+	and	vhas_nul.16b, vhas_nul.16b, vrepmask2.16b
+	lsl	tmp3, srcin, 2
+	addp	vend.16b, vhas_nul.16b, vhas_nul.16b		/* 128->64 */
 
-L(loop):
-	ldr	qdata, [src], #32
-	cmeq	vhas_nul.16b, vdata.16b, #0
-	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
-	/* Use a fast check for the termination condition.  */
-	orr	vend.16b, vhas_nul.16b, vhas_chr.16b
-	addp	vend.16b, vend.16b, vend.16b		/* 128->64 */
-	mov	tmp1, vend.d[0]
-	cbnz	tmp1, L(end)
-
-	ldr	qdata, [src, #-16]
-	cmeq	vhas_nul.16b, vdata.16b, #0
-	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
-	/* Use a fast check for the termination condition.  */
-	orr	vend.16b, vhas_nul.16b, vhas_chr.16b
-	addp	vend.16b, vend.16b, vend.16b		/* 128->64 */
-	mov	tmp1, vend.d[0]
+	fmov	tmp1, dend
+	lsr	tmp1, tmp1, tmp3
 	cbz	tmp1, L(loop)
 
-	/* Adjust src for next two subtractions. */
-	add	src, src, #16
-L(end):
-	/* Termination condition found.  Now need to establish exactly why
-	   we terminated.  */
-	and	vhas_nul.16b, vhas_nul.16b, vrepmask_0.16b
-	and	vhas_chr.16b, vhas_chr.16b, vrepmask_c.16b
-	sub	src, src, #16
-	orr	vend.16b, vhas_nul.16b, vhas_chr.16b
-	addp	vend.16b, vend.16b, vend.16b		/* 128->64 */
-
-	mov	tmp1, vend.d[0]
-L(tail):
-	/* Count the trailing zeros, by bit reversing...  */
 	rbit	tmp1, tmp1
-	/* Re-bias source.  */
-	sub	src, src, #16
-	clz	tmp1, tmp1	/* And counting the leading zeros.  */
-	/* Tmp1 is even if the target character was found first.  Otherwise
-	   we've found the end of string and we weren't looking for NUL.  */
-	tst	tmp1, #1
-	add	result, src, tmp1, lsr #2
+	clz	tmp1, tmp1
+	/* Tmp1 is a multiple of 4 if the target character was
+	   found first. Otherwise we've found the end of string.  */
+	tst	tmp1, 2
+	add	result, srcin, tmp1, lsr 2
 	csel	result, result, xzr, eq
 	ret
 
-END(__strchr_aarch64_mte)
+	.p2align 4
+L(loop):
+	ldr	qdata, [src, 16]!
+	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
+	cmhs	vhas_nul.16b, vhas_chr.16b, vdata.16b
+	umaxp	vend.16b, vhas_nul.16b, vhas_nul.16b
+	fmov	tmp1, dend
+	cbz	tmp1, L(loop)
+
+#ifdef __AARCH64EB__
+	bif	vhas_nul.16b, vhas_chr.16b, vrepmask.16b
+	and	vhas_nul.16b, vhas_nul.16b, vrepmask2.16b
+	addp	vend.16b, vhas_nul.16b, vhas_nul.16b		/* 128->64 */
+	fmov	tmp1, dend
+#else
+	bit	vhas_nul.16b, vhas_chr.16b, vrepmask.16b
+	and	vhas_nul.16b, vhas_nul.16b, vrepmask2.16b
+	addp	vend.16b, vhas_nul.16b, vhas_nul.16b		/* 128->64 */
+	fmov	tmp1, dend
+	rbit	tmp1, tmp1
+#endif
+	clz	tmp1, tmp1
+	/* Tmp1 is a multiple of 4 if the target character was
+	   found first. Otherwise we've found the end of string.  */
+	tst	tmp1, 2
+	add	result, src, tmp1, lsr 2
+	csel	result, result, xzr, eq
+	ret
+
+END (__strchr_aarch64_mte)
+
+END_FILE
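
A scalar sketch of the merged char/NUL syndrome described above (illustrative C, not part of the patch): bits 0-1 of a byte's nibble flag a character match, bits 2-3 flag NUL-or-match, so the position of the lowest set bit identifies the first event and bit 1 of that position tells which kind it was — the same test the assembly performs with rbit+clz and tst tmp1, 2.

    #include <stdint.h>

    static char *strchr_chunk_model(const char *s, int c)
    {
        uint64_t synd = 0;
        for (int i = 0; i < 16; i++) {
            if (s[i] == (char)c)
                synd |= 0x3ull << (4 * i);      /* bits 0-1: char match */
            if (s[i] == 0 || s[i] == (char)c)
                synd |= 0xcull << (4 * i);      /* bits 2-3: NUL or match */
            if (s[i] == 0)
                break;                          /* chunk ends at the NUL */
        }
        if (synd == 0)
            return 0;                           /* no event: scan next chunk */
        unsigned pos = __builtin_ctzll(synd);
        return (pos & 2) ? 0 : (char *)s + pos / 4;  /* NUL first => NULL */
    }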
diff --git a/string/aarch64/strchr-sve.S b/string/aarch64/strchr-sve.S
index 35d5dd9..ba02bf6 100644
--- a/string/aarch64/strchr-sve.S
+++ b/string/aarch64/strchr-sve.S
@@ -5,6 +5,8 @@
  * SPDX-License-Identifier: MIT
  */
 
+#include "../asmdefs.h"
+
 #if __ARM_FEATURE_SVE
 /* Assumptions:
  *
@@ -22,10 +24,7 @@
 #define FUNC  __strchr_aarch64_sve
 #endif
 
-	.globl	FUNC
-	.type	FUNC, %function
-	.p2align 4
-FUNC:
+ENTRY_ALIGN (FUNC, 4)
 	dup	z1.b, w1		/* replicate byte across vector */
 	setffr				/* initialize FFR */
 	ptrue	p1.b			/* all ones; loop invariant */
@@ -67,5 +66,8 @@
 	incp	x0, p0.b
 	b	0b
 
-	.size	FUNC, . - FUNC
+END (FUNC)
+
 #endif
+
+END_FILE
diff --git a/string/aarch64/strchr.S b/string/aarch64/strchr.S
index 00d9be3..39241a3 100644
--- a/string/aarch64/strchr.S
+++ b/string/aarch64/strchr.S
@@ -51,11 +51,11 @@
 /* Locals and temporaries.  */
 
 ENTRY (__strchr_aarch64)
-	/* Magic constant 0x40100401 to allow us to identify which lane
-	   matches the requested byte.  Magic constant 0x80200802 used
-	   similarly for NUL termination.  */
-	mov	wtmp2, #0x0401
-	movk	wtmp2, #0x4010, lsl #16
+	/* Magic constant 0xc0300c03 to allow us to identify which lane
+	   matches the requested byte.  Even bits are set if the character
+	   matches, odd bits if either the char is NUL or matches.  */
+	mov	wtmp2, 0x0c03
+	movk	wtmp2, 0xc030, lsl 16
 	dup	vrepchr.16b, chrin
 	bic	src, srcin, #31		/* Work with aligned 32-byte hunks.  */
 	dup	vrepmask_c.4s, wtmp2
@@ -73,12 +73,10 @@
 	cmeq	vhas_chr1.16b, vdata1.16b, vrepchr.16b
 	cmeq	vhas_nul2.16b, vdata2.16b, #0
 	cmeq	vhas_chr2.16b, vdata2.16b, vrepchr.16b
-	and	vhas_nul1.16b, vhas_nul1.16b, vrepmask_0.16b
-	and	vhas_nul2.16b, vhas_nul2.16b, vrepmask_0.16b
-	and	vhas_chr1.16b, vhas_chr1.16b, vrepmask_c.16b
-	and	vhas_chr2.16b, vhas_chr2.16b, vrepmask_c.16b
-	orr	vend1.16b, vhas_nul1.16b, vhas_chr1.16b
-	orr	vend2.16b, vhas_nul2.16b, vhas_chr2.16b
+	bif	vhas_nul1.16b, vhas_chr1.16b, vrepmask_0.16b
+	bif	vhas_nul2.16b, vhas_chr2.16b, vrepmask_0.16b
+	and	vend1.16b, vhas_nul1.16b, vrepmask_c.16b
+	and	vend2.16b, vhas_nul2.16b, vrepmask_c.16b
 	lsl	tmp1, tmp1, #1
 	addp	vend1.16b, vend1.16b, vend2.16b		// 256->128
 	mov	tmp3, #~0
@@ -89,31 +87,26 @@
 	bic	tmp1, tmp3, tmp1	// Mask padding bits.
 	cbnz	tmp1, L(tail)
 
+	.p2align 4
 L(loop):
 	ld1	{vdata1.16b, vdata2.16b}, [src], #32
-	cmeq	vhas_nul1.16b, vdata1.16b, #0
 	cmeq	vhas_chr1.16b, vdata1.16b, vrepchr.16b
-	cmeq	vhas_nul2.16b, vdata2.16b, #0
 	cmeq	vhas_chr2.16b, vdata2.16b, vrepchr.16b
-	/* Use a fast check for the termination condition.  */
-	orr	vend1.16b, vhas_nul1.16b, vhas_chr1.16b
-	orr	vend2.16b, vhas_nul2.16b, vhas_chr2.16b
-	orr	vend1.16b, vend1.16b, vend2.16b
-	addp	vend1.2d, vend1.2d, vend1.2d
+	cmhs	vhas_nul1.16b, vhas_chr1.16b, vdata1.16b
+	cmhs	vhas_nul2.16b, vhas_chr2.16b, vdata2.16b
+	orr	vend1.16b, vhas_nul1.16b, vhas_nul2.16b
+	umaxp	vend1.16b, vend1.16b, vend1.16b
 	mov	tmp1, vend1.d[0]
 	cbz	tmp1, L(loop)
 
 	/* Termination condition found.  Now need to establish exactly why
 	   we terminated.  */
-	and	vhas_nul1.16b, vhas_nul1.16b, vrepmask_0.16b
-	and	vhas_nul2.16b, vhas_nul2.16b, vrepmask_0.16b
-	and	vhas_chr1.16b, vhas_chr1.16b, vrepmask_c.16b
-	and	vhas_chr2.16b, vhas_chr2.16b, vrepmask_c.16b
-	orr	vend1.16b, vhas_nul1.16b, vhas_chr1.16b
-	orr	vend2.16b, vhas_nul2.16b, vhas_chr2.16b
+	bif	vhas_nul1.16b, vhas_chr1.16b, vrepmask_0.16b
+	bif	vhas_nul2.16b, vhas_chr2.16b, vrepmask_0.16b
+	and	vend1.16b, vhas_nul1.16b, vrepmask_c.16b
+	and	vend2.16b, vhas_nul2.16b, vrepmask_c.16b
 	addp	vend1.16b, vend1.16b, vend2.16b		// 256->128
 	addp	vend1.16b, vend1.16b, vend2.16b		// 128->64
-
 	mov	tmp1, vend1.d[0]
 L(tail):
 	/* Count the trailing zeros, by bit reversing...  */
@@ -129,3 +122,5 @@
 	ret
 
 END (__strchr_aarch64)
+
+END_FILE
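
The cmhs rework in the loop above folds the NUL test into the match result: vhas_chr bytes are 0xff after a character match and 0x00 otherwise, so an unsigned "vhas_chr >= vdata" compare is true exactly when the byte matched (0xff >= anything) or the byte itself is NUL (0x00 >= 0), saving one cmeq per vector. A one-line scalar sketch (illustrative C, not part of the patch):

    /* chr_cmp is 0xff on a character match, else 0. */
    static int match_or_nul(unsigned char byte, unsigned char chr_cmp)
    {
        return chr_cmp >= byte;   /* 0xff >= x always; 0 >= x only for x == 0 */
    }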
diff --git a/string/aarch64/strchrnul-mte.S b/string/aarch64/strchrnul-mte.S
new file mode 100644
index 0000000..c813939
--- /dev/null
+++ b/string/aarch64/strchrnul-mte.S
@@ -0,0 +1,84 @@
+/*
+ * strchrnul - find a character or nul in a string
+ *
+ * Copyright (c) 2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64, Advanced SIMD.
+ * MTE compatible.
+ */
+
+#include "../asmdefs.h"
+
+#define srcin		x0
+#define chrin		w1
+#define result		x0
+
+#define src		x2
+#define tmp1		x1
+#define tmp2		x3
+#define tmp2w		w3
+
+#define vrepchr		v0
+#define vdata		v1
+#define qdata		q1
+#define vhas_nul	v2
+#define vhas_chr	v3
+#define vrepmask	v4
+#define vend		v5
+#define dend		d5
+
+/* Core algorithm:
+
+   For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
+   per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
+   requested character or the byte is NUL, and bits 4-7 must be zero. Odd
+   bytes set bits 4-7 likewise so that adjacent bytes can be merged. Since the
+   bits in the syndrome reflect the order in which things occur in the original
+   string, counting trailing zeros identifies exactly which byte matched.  */
+
+ENTRY (__strchrnul_aarch64_mte)
+	bic	src, srcin, 15
+	dup	vrepchr.16b, chrin
+	ld1	{vdata.16b}, [src]
+	mov	tmp2w, 0xf00f
+	dup	vrepmask.8h, tmp2w
+	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
+	cmhs	vhas_chr.16b, vhas_chr.16b, vdata.16b
+	lsl	tmp2, srcin, 2
+	and	vhas_chr.16b, vhas_chr.16b, vrepmask.16b
+	addp	vend.16b, vhas_chr.16b, vhas_chr.16b		/* 128->64 */
+	fmov	tmp1, dend
+	lsr	tmp1, tmp1, tmp2	/* Mask padding bits.  */
+	cbz	tmp1, L(loop)
+
+	rbit	tmp1, tmp1
+	clz	tmp1, tmp1
+	add	result, srcin, tmp1, lsr 2
+	ret
+
+	.p2align 4
+L(loop):
+	ldr	qdata, [src, 16]!
+	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
+	cmhs	vhas_chr.16b, vhas_chr.16b, vdata.16b
+	umaxp	vend.16b, vhas_chr.16b, vhas_chr.16b
+	fmov	tmp1, dend
+	cbz	tmp1, L(loop)
+
+	and	vhas_chr.16b, vhas_chr.16b, vrepmask.16b
+	addp	vend.16b, vhas_chr.16b, vhas_chr.16b		/* 128->64 */
+	fmov	tmp1, dend
+#ifndef __AARCH64EB__
+	rbit	tmp1, tmp1
+#endif
+	clz	tmp1, tmp1
+	add	result, src, tmp1, lsr 2
+	ret
+
+END (__strchrnul_aarch64_mte)
+
+END_FILE
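
strchrnul needs only one event kind, so its syndrome is simpler still; a scalar sketch (illustrative C, not part of the patch):

    #include <stdint.h>

    static char *strchrnul_chunk_model(const char *s, int c)
    {
        uint64_t synd = 0;
        for (int i = 0; i < 16; i++) {
            if (s[i] == (char)c || s[i] == 0)
                synd |= 0xfull << (4 * i);      /* match or NUL, same answer */
            if (s[i] == 0)
                break;
        }
        if (synd == 0)
            return 0;                           /* no event: scan next chunk */
        return (char *)s + __builtin_ctzll(synd) / 4;
    }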
diff --git a/string/aarch64/strchrnul.S b/string/aarch64/strchrnul.S
index 81264ea..0e08d82 100644
--- a/string/aarch64/strchrnul.S
+++ b/string/aarch64/strchrnul.S
@@ -63,14 +63,12 @@
 	   syndrome that are related to the padding.  */
 	ld1	{vdata1.16b, vdata2.16b}, [src], #32
 	neg	tmp1, tmp1
-	cmeq	vhas_nul1.16b, vdata1.16b, #0
 	cmeq	vhas_chr1.16b, vdata1.16b, vrepchr.16b
-	cmeq	vhas_nul2.16b, vdata2.16b, #0
 	cmeq	vhas_chr2.16b, vdata2.16b, vrepchr.16b
-	orr	vhas_chr1.16b, vhas_chr1.16b, vhas_nul1.16b
-	orr	vhas_chr2.16b, vhas_chr2.16b, vhas_nul2.16b
-	and	vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
-	and	vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
+	cmhs	vhas_nul1.16b, vhas_chr1.16b, vdata1.16b
+	cmhs	vhas_nul2.16b, vhas_chr2.16b, vdata2.16b
+	and	vhas_chr1.16b, vhas_nul1.16b, vrepmask.16b
+	and	vhas_chr2.16b, vhas_nul2.16b, vrepmask.16b
 	lsl	tmp1, tmp1, #1
 	addp	vend1.16b, vhas_chr1.16b, vhas_chr2.16b	// 256->128
 	mov	tmp3, #~0
@@ -81,24 +79,22 @@
 	bic	tmp1, tmp3, tmp1	// Mask padding bits.
 	cbnz	tmp1, L(tail)
 
+	.p2align 4
 L(loop):
 	ld1	{vdata1.16b, vdata2.16b}, [src], #32
-	cmeq	vhas_nul1.16b, vdata1.16b, #0
 	cmeq	vhas_chr1.16b, vdata1.16b, vrepchr.16b
-	cmeq	vhas_nul2.16b, vdata2.16b, #0
 	cmeq	vhas_chr2.16b, vdata2.16b, vrepchr.16b
-	/* Use a fast check for the termination condition.  */
-	orr	vhas_chr1.16b, vhas_nul1.16b, vhas_chr1.16b
-	orr	vhas_chr2.16b, vhas_nul2.16b, vhas_chr2.16b
-	orr	vend1.16b, vhas_chr1.16b, vhas_chr2.16b
-	addp	vend1.2d, vend1.2d, vend1.2d
+	cmhs	vhas_nul1.16b, vhas_chr1.16b, vdata1.16b
+	cmhs	vhas_nul2.16b, vhas_chr2.16b, vdata2.16b
+	orr	vend1.16b, vhas_nul1.16b, vhas_nul2.16b
+	umaxp	vend1.16b, vend1.16b, vend1.16b
 	mov	tmp1, vend1.d[0]
 	cbz	tmp1, L(loop)
 
 	/* Termination condition found.  Now need to establish exactly why
 	   we terminated.  */
-	and	vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
-	and	vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
+	and	vhas_chr1.16b, vhas_nul1.16b, vrepmask.16b
+	and	vhas_chr2.16b, vhas_nul2.16b, vrepmask.16b
 	addp	vend1.16b, vhas_chr1.16b, vhas_chr2.16b		// 256->128
 	addp	vend1.16b, vend1.16b, vend1.16b		// 128->64
 
@@ -114,3 +110,5 @@
 	ret
 
 END (__strchrnul_aarch64)
+
+END_FILE
diff --git a/string/aarch64/strcmp-mte.S b/string/aarch64/strcmp-mte.S
new file mode 100644
index 0000000..28efce2
--- /dev/null
+++ b/string/aarch64/strcmp-mte.S
@@ -0,0 +1,246 @@
+/*
+ * strcmp - compare two strings
+ *
+ * Copyright (c) 2012-2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64
+ */
+
+#include "../asmdefs.h"
+
+#define REP8_01 0x0101010101010101
+#define REP8_7f 0x7f7f7f7f7f7f7f7f
+
+/* Parameters and result.  */
+#define src1		x0
+#define src2		x1
+#define result		x0
+
+/* Internal variables.  */
+#define data1		x2
+#define data1w		w2
+#define data2		x3
+#define data2w		w3
+#define has_nul		x4
+#define diff		x5
+#define syndrome	x6
+#define tmp1		x7
+#define tmp2		x8
+#define tmp3		x9
+#define zeroones	x10
+#define pos		x11
+#define offset		x12
+#define neg_offset	x13
+#define mask		x14
+
+/* Define endian dependent shift operations.
+   On big-endian early bytes are at MSB and on little-endian LSB.
+   LS_FW means shifting towards early bytes.
+   LS_BK means shifting towards later bytes.
+   */
+#ifdef __AARCH64EB__
+#define LS_FW lsl
+#define LS_BK lsr
+#else
+#define LS_FW lsr
+#define LS_BK lsl
+#endif
+
+	/* Start of performance-critical section  -- one 64B cache line.  */
+ENTRY (__strcmp_aarch64_mte)
+	eor	tmp1, src1, src2
+	mov	zeroones, #REP8_01
+	tst	tmp1, #7
+	b.ne	L(misaligned8)
+	ands	tmp1, src1, #7
+	b.ne	L(mutual_align)
+	/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
+	   (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
+	   can be done in parallel across the entire word.  */
+L(loop_aligned):
+	ldr	data1, [src1], #8
+	ldr	data2, [src2], #8
+L(start_realigned):
+	sub	tmp1, data1, zeroones
+	orr	tmp2, data1, #REP8_7f
+	eor	diff, data1, data2	/* Non-zero if differences found.  */
+	bic	has_nul, tmp1, tmp2	/* Non-zero if NUL terminator.  */
+	orr	syndrome, diff, has_nul
+	cbz	syndrome, L(loop_aligned)
+	/* End of performance-critical section  -- one 64B cache line.  */
+
+L(end):
+#ifdef __AARCH64EB__
+	/* For big-endian we cannot use the trick with the syndrome value
+	   as carry-propagation can corrupt the upper bits if the trailing
+	   bytes in the string contain 0x01.  */
+	/* However, if there is no NUL byte in the dword, we can generate
+	   the result directly.  We can't just subtract the bytes as the
+	   MSB might be significant.  */
+	cbnz	has_nul, 1f
+	cmp	data1, data2
+	cset	result, ne
+	cneg	result, result, lo
+	ret
+1:
+	/* Re-compute the NUL-byte detection, using a byte-reversed value.  */
+	rev	tmp3, data1
+	sub	tmp1, tmp3, zeroones
+	orr	tmp2, tmp3, #REP8_7f
+	bic	has_nul, tmp1, tmp2
+	rev	has_nul, has_nul
+	orr	syndrome, diff, has_nul
+	clz	pos, syndrome
+	/* The most-significant-non-zero bit of the syndrome marks either the
+	   first bit that is different, or the top bit of the first zero byte.
+	   Shifting left now will bring the critical information into the
+	   top bits.  */
+	lsl	data1, data1, pos
+	lsl	data2, data2, pos
+	/* But we need to zero-extend (char is unsigned) the value and then
+	   perform a signed 32-bit subtraction.  */
+	lsr	data1, data1, #56
+	sub	result, data1, data2, lsr #56
+	ret
+#endif
+
+L(end_quick):
+#ifndef __AARCH64EB__
+	rev	syndrome, syndrome
+	rev	data1, data1
+#endif
+	/* The most-significant-non-zero bit of the syndrome marks either the
+	   first bit that is different, or the top bit of the first zero byte.
+	   Shifting left now will bring the critical information into the
+	   top bits.  */
+	clz	pos, syndrome
+#ifndef __AARCH64EB__
+	rev	data2, data2
+#endif
+	lsl	data1, data1, pos
+	lsl	data2, data2, pos
+	/* But we need to zero-extend (char is unsigned) the value and then
+	   perform a signed 32-bit subtraction.  */
+	lsr	data1, data1, #56
+	sub	result, data1, data2, lsr #56
+	ret
+
+L(mutual_align):
+	/* Sources are mutually aligned, but are not currently at an
+	   alignment boundary.  Round down the addresses and then mask off
+	   the bytes that precede the start point.  */
+	bic	src1, src1, #7
+	bic	src2, src2, #7
+	lsl	tmp1, tmp1, #3		/* Bytes beyond alignment -> bits.  */
+	ldr	data1, [src1], #8
+	neg	tmp1, tmp1		/* Bits to alignment -64.  */
+	ldr	data2, [src2], #8
+	mov	tmp2, #~0
+	LS_FW	tmp2, tmp2, tmp1	/* Shift (tmp1 & 63).  */
+	orr	data1, data1, tmp2
+	orr	data2, data2, tmp2
+	b	L(start_realigned)
+
+	/* The following diagram explains the comparison of misaligned strings.
+	   The bytes are shown in natural order. For little-endian, it is
+	   reversed in the registers. The "x" bytes are before the string.
+	   The "|" separates data that is loaded at one time.
+	   src1     | a a a a a a a a | b b b c c c c c | . . .
+	   src2     | x x x x x a a a   a a a a a b b b | c c c c c . . .
+
+	   After shifting in each step, the data looks like this:
+	                STEP_A              STEP_B              STEP_C
+	   data1    a a a a a a a a     b b b c c c c c     b b b c c c c c
+	   data2    a a a a a a a a     b b b 0 0 0 0 0     0 0 0 c c c c c
+
+	   The bytes with "0" are eliminated from the syndrome via mask. */
+
+L(misaligned8):
+	/* Align SRC1 to 8 bytes and then compare 8 bytes at a time, always
+	   checking to make sure that we don't access beyond page boundary in
+	   SRC2.  */
+	tst	src1, #7
+	b.eq	L(src1_aligned)
+L(do_misaligned):
+	ldrb	data1w, [src1], #1
+	ldrb	data2w, [src2], #1
+	cmp	data1w, #1
+	ccmp	data1w, data2w, #0, cs	/* NZCV = 0b0000.  */
+	b.ne	L(done)
+	tst	src1, #7
+	b.ne	L(do_misaligned)
+
+	/* Align SRC2 down to 16 bytes. This way we can read 16 bytes at a
+	   time from SRC2. The comparison happens in 3 steps. After each step
+	   the loop can exit, or read from SRC1 or SRC2. */
+L(src1_aligned):
+	/* Calculate offset from 8 byte alignment to string start in bits. No
+	   need to mask offset since shifts are ignoring upper bits. */
+	lsl	offset, src2, #3
+	bic	src2, src2, #0xf
+	mov	mask, -1
+	neg	neg_offset, offset
+	ldr	data1, [src1], #8
+	ldp	tmp1, tmp2, [src2], #16
+	LS_BK	mask, mask, neg_offset
+	/* Skip the first compare if data in tmp1 is irrelevant. */
+	tbnz	offset, 6, L(misaligned_mid_loop)
+
+L(loop_misaligned):
+	/* STEP_A: Compare full 8 bytes when there is enough data from SRC2.*/
+	LS_FW	data2, tmp1, offset
+	LS_BK	tmp1, tmp2, neg_offset
+	sub	has_nul, data1, zeroones
+	orr	data2, data2, tmp1	/* 8 bytes from SRC2 combined from two regs.*/
+	orr	tmp3, data1, #REP8_7f
+	eor	diff, data2, data1	/* Non-zero if differences found. */
+	bic	has_nul, has_nul, tmp3	/* Non-zero if NUL byte found in SRC1. */
+	orr	syndrome, diff, has_nul
+	cbnz	syndrome, L(end)
+
+	ldr	data1, [src1], #8
+L(misaligned_mid_loop):
+	/* STEP_B: Compare first part of data1 to second part of tmp2. */
+	LS_FW	data2, tmp2, offset
+#ifdef __AARCH64EB__
+	/* For big-endian we do a byte reverse to avoid carry-propagation
+	problem described above. This way we can reuse the has_nul in the
+	next step and also use syndrome value trick at the end. */
+	rev	tmp3, data1
+	#define data1_fixed tmp3
+#else
+	#define data1_fixed data1
+#endif
+	sub	has_nul, data1_fixed, zeroones
+	orr	tmp3, data1_fixed, #REP8_7f
+	eor	diff, data2, data1	/* Non-zero if differences found.  */
+	bic	has_nul, has_nul, tmp3	/* Non-zero if NUL terminator.  */
+#ifdef __AARCH64EB__
+	rev	has_nul, has_nul
+#endif
+	orr	syndrome, diff, has_nul
+	bics	syndrome, syndrome, mask	/* Ignore later bytes. */
+	b.ne	L(end_quick)
+
+	/* STEP_C: Compare second part of data1 to first part of tmp1. */
+	ldp	tmp1, tmp2, [src2], #16
+	LS_BK	data2, tmp1, neg_offset
+	eor	diff, data2, data1	/* Non-zero if differences found.  */
+	orr	syndrome, diff, has_nul
+	ands	syndrome, syndrome, mask	/* Ignore earlier bytes. */
+	b.ne	L(end_quick)
+
+	ldr	data1, [src1], #8
+	b	L(loop_misaligned)
+
+L(done):
+	sub	result, data1, data2
+	ret
+
+END (__strcmp_aarch64_mte)
+
+END_FILE
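
The (X - 1) & ~(X | 0x7f) NUL test and the diff-or-nul syndrome of the aligned loop can be modelled in scalar C roughly as below (little-endian sketch, not part of the patch; it assumes both pointers are 8-byte aligned and readable a word at a time, which the real code guarantees with its alignment prologue):

    #include <stdint.h>
    #include <string.h>

    #define REP8_01 0x0101010101010101ull
    #define REP8_7f 0x7f7f7f7f7f7f7f7full

    /* A word is "interesting" if it differs from the other string or contains
       a NUL byte; the lowest set byte of the syndrome is the first such byte. */
    static int strcmp_aligned_model(const char *s1, const char *s2)
    {
        for (;; s1 += 8, s2 += 8) {
            uint64_t d1, d2;
            memcpy(&d1, s1, 8);
            memcpy(&d2, s2, 8);
            uint64_t has_nul = (d1 - REP8_01) & ~(d1 | REP8_7f);
            uint64_t synd = (d1 ^ d2) | has_nul;
            if (synd) {
                unsigned sh = __builtin_ctzll(synd) & ~7u;  /* first odd byte */
                return (int)((d1 >> sh) & 0xff) - (int)((d2 >> sh) & 0xff);
            }
        }
    }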
diff --git a/string/aarch64/strcmp-sve.S b/string/aarch64/strcmp-sve.S
index 8e0b1a7..62a2bb1 100644
--- a/string/aarch64/strcmp-sve.S
+++ b/string/aarch64/strcmp-sve.S
@@ -5,6 +5,8 @@
  * SPDX-License-Identifier: MIT
  */
 
+#include "../asmdefs.h"
+
 #if __ARM_FEATURE_SVE
 /* Assumptions:
  *
@@ -15,10 +17,7 @@
 	.arch	armv8-a+sve
 	.text
 
-	.globl	__strcmp_aarch64_sve
-	.type	__strcmp_aarch64_sve, %function
-	.p2align 4
-__strcmp_aarch64_sve:
+ENTRY_ALIGN (__strcmp_aarch64_sve, 4)
 	setffr				/* initialize FFR */
 	ptrue	p1.b, all		/* all ones; loop invariant */
 	mov	x2, 0			/* initialize offset */
@@ -55,5 +54,8 @@
 	b.none	0b
 	b	1b
 
-	.size	__strcmp_aarch64_sve, . - __strcmp_aarch64_sve
+END (__strcmp_aarch64_sve)
+
 #endif
+
+END_FILE
diff --git a/string/aarch64/strcmp.S b/string/aarch64/strcmp.S
index 65af5ce..a6de6e8 100644
--- a/string/aarch64/strcmp.S
+++ b/string/aarch64/strcmp.S
@@ -1,7 +1,7 @@
 /*
  * strcmp - compare two strings
  *
- * Copyright (c) 2012, Arm Limited.
+ * Copyright (c) 2012-2020, Arm Limited.
  * SPDX-License-Identifier: MIT
  */
 
@@ -168,3 +168,5 @@
 	ret
 
 END (__strcmp_aarch64)
+
+END_FILE
diff --git a/string/aarch64/strcpy-sve.S b/string/aarch64/strcpy-sve.S
index 1029542..ee0b92e 100644
--- a/string/aarch64/strcpy-sve.S
+++ b/string/aarch64/strcpy-sve.S
@@ -5,6 +5,8 @@
  * SPDX-License-Identifier: MIT
  */
 
+#include "../asmdefs.h"
+
 #if __ARM_FEATURE_SVE
 /* Assumptions:
  *
@@ -22,10 +24,7 @@
 #define FUNC  __strcpy_aarch64_sve
 #endif
 
-	.globl	FUNC
-	.type	FUNC, %function
-	.p2align 4
-FUNC:
+ENTRY_ALIGN (FUNC, 4)
 	setffr				/* initialize FFR */
 	ptrue	p2.b, all		/* all ones; loop invariant */
 	mov	x2, 0			/* initialize offset */
@@ -67,5 +66,8 @@
 #endif
 	ret
 
-	.size	FUNC, . - FUNC
+END (FUNC)
+
 #endif
+
+END_FILE
diff --git a/string/aarch64/strcpy.S b/string/aarch64/strcpy.S
index 4edffcf..079baef 100644
--- a/string/aarch64/strcpy.S
+++ b/string/aarch64/strcpy.S
@@ -306,3 +306,5 @@
 	b	L(fp_gt8)
 
 END (STRCPY)
+
+END_FILE
diff --git a/string/aarch64/strlen-mte.S b/string/aarch64/strlen-mte.S
index d2bb79c..ef5b207 100644
--- a/string/aarch64/strlen-mte.S
+++ b/string/aarch64/strlen-mte.S
@@ -183,3 +183,5 @@
 	b	L(tail)
 
 END(__strlen_aarch64_mte)
+
+END_FILE
diff --git a/string/aarch64/strlen-sve.S b/string/aarch64/strlen-sve.S
index 82a1e85..13a4319 100644
--- a/string/aarch64/strlen-sve.S
+++ b/string/aarch64/strlen-sve.S
@@ -5,6 +5,8 @@
  * SPDX-License-Identifier: MIT
  */
 
+#include "../asmdefs.h"
+
 #if __ARM_FEATURE_SVE
 /* Assumptions:
  *
@@ -15,10 +17,7 @@
 	.arch	armv8-a+sve
 	.text
 
-	.globl	__strlen_aarch64_sve
-	.type	__strlen_aarch64_sve, %function
-	.p2align 4
-__strlen_aarch64_sve:
+ENTRY_ALIGN (__strlen_aarch64_sve, 4)
 	setffr			/* initialize FFR */
 	ptrue	p2.b		/* all ones; loop invariant */
 	mov	x1, 0		/* initialize length */
@@ -53,5 +52,8 @@
 	incp	x1, p0.b
 	b	0b
 
-	.size	__strlen_aarch64_sve, . - __strlen_aarch64_sve
+END (__strlen_aarch64_sve)
+
 #endif
+
+END_FILE
diff --git a/string/aarch64/strlen.S b/string/aarch64/strlen.S
index 2293f73..3176b5e 100644
--- a/string/aarch64/strlen.S
+++ b/string/aarch64/strlen.S
@@ -204,3 +204,5 @@
 	b	L(page_cross_entry)
 
 END (__strlen_aarch64)
+
+END_FILE
diff --git a/string/aarch64/strncmp-mte.S b/string/aarch64/strncmp-mte.S
new file mode 100644
index 0000000..ca1adba
--- /dev/null
+++ b/string/aarch64/strncmp-mte.S
@@ -0,0 +1,310 @@
+/*
+ * strncmp - compare two strings
+ *
+ * Copyright (c) 2013-2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64
+ */
+
+#include "../asmdefs.h"
+
+#define REP8_01 0x0101010101010101
+#define REP8_7f 0x7f7f7f7f7f7f7f7f
+
+/* Parameters and result.  */
+#define src1		x0
+#define src2		x1
+#define limit		x2
+#define result		x0
+
+/* Internal variables.  */
+#define data1		x3
+#define data1w		w3
+#define data2		x4
+#define data2w		w4
+#define has_nul		x5
+#define diff		x6
+#define syndrome	x7
+#define tmp1		x8
+#define tmp2		x9
+#define tmp3		x10
+#define zeroones	x11
+#define pos		x12
+#define mask		x13
+#define endloop		x14
+#define count		mask
+#define offset		pos
+#define neg_offset	x15
+
+/* Define endian dependent shift operations.
+   On big-endian early bytes are at MSB and on little-endian LSB.
+   LS_FW means shifting towards early bytes.
+   LS_BK means shifting towards later bytes.
+   */
+#ifdef __AARCH64EB__
+#define LS_FW lsl
+#define LS_BK lsr
+#else
+#define LS_FW lsr
+#define LS_BK lsl
+#endif
+
+	.text
+	.p2align 6
+	.rep 9
+	nop	/* Pad so that the loop below fits a cache line.  */
+	.endr
+ENTRY_ALIGN (__strncmp_aarch64_mte, 0)
+	cbz	limit, L(ret0)
+	eor	tmp1, src1, src2
+	mov	zeroones, #REP8_01
+	tst	tmp1, #7
+	and	count, src1, #7
+	b.ne	L(misaligned8)
+	cbnz	count, L(mutual_align)
+
+	/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
+	   (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
+	   can be done in parallel across the entire word.  */
+	/* Start of performance-critical section  -- one 64B cache line.  */
+L(loop_aligned):
+	ldr	data1, [src1], #8
+	ldr	data2, [src2], #8
+L(start_realigned):
+	subs	limit, limit, #8
+	sub	tmp1, data1, zeroones
+	orr	tmp2, data1, #REP8_7f
+	eor	diff, data1, data2	/* Non-zero if differences found.  */
+	csinv	endloop, diff, xzr, hi	/* Last Dword or differences.  */
+	bics	has_nul, tmp1, tmp2	/* Non-zero if NUL terminator.  */
+	ccmp	endloop, #0, #0, eq
+	b.eq	L(loop_aligned)
+	/* End of performance-critical section  -- one 64B cache line.  */
+
+L(full_check):
+#ifndef __AARCH64EB__
+	orr	syndrome, diff, has_nul
+	add	limit, limit, 8	/* Rewind limit to before last subs. */
+L(syndrome_check):
+	/* Limit was reached. Check if the NUL byte or the difference
+	   is before the limit. */
+	rev	syndrome, syndrome
+	rev	data1, data1
+	clz	pos, syndrome
+	rev	data2, data2
+	lsl	data1, data1, pos
+	cmp	limit, pos, lsr #3
+	lsl	data2, data2, pos
+	/* But we need to zero-extend (char is unsigned) the value and then
+	   perform a signed 32-bit subtraction.  */
+	lsr	data1, data1, #56
+	sub	result, data1, data2, lsr #56
+	csel	result, result, xzr, hi
+	ret
+#else
+	/* Not reached the limit, must have found the end or a diff.  */
+	tbz	limit, #63, L(not_limit)
+	add	tmp1, limit, 8
+	cbz	limit, L(not_limit)
+
+	lsl	limit, tmp1, #3	/* Bits -> bytes.  */
+	mov	mask, #~0
+	lsr	mask, mask, limit
+	bic	data1, data1, mask
+	bic	data2, data2, mask
+
+	/* Make sure that the NUL byte is marked in the syndrome.  */
+	orr	has_nul, has_nul, mask
+
+L(not_limit):
+	/* For big-endian we cannot use the trick with the syndrome value
+	   as carry-propagation can corrupt the upper bits if the trailing
+	   bytes in the string contain 0x01.  */
+	/* However, if there is no NUL byte in the dword, we can generate
+	   the result directly.  We can't just subtract the bytes as the
+	   MSB might be significant.  */
+	cbnz	has_nul, 1f
+	cmp	data1, data2
+	cset	result, ne
+	cneg	result, result, lo
+	ret
+1:
+	/* Re-compute the NUL-byte detection, using a byte-reversed value.  */
+	rev	tmp3, data1
+	sub	tmp1, tmp3, zeroones
+	orr	tmp2, tmp3, #REP8_7f
+	bic	has_nul, tmp1, tmp2
+	rev	has_nul, has_nul
+	orr	syndrome, diff, has_nul
+	clz	pos, syndrome
+	/* The most-significant-non-zero bit of the syndrome marks either the
+	   first bit that is different, or the top bit of the first zero byte.
+	   Shifting left now will bring the critical information into the
+	   top bits.  */
+L(end_quick):
+	lsl	data1, data1, pos
+	lsl	data2, data2, pos
+	/* But we need to zero-extend (char is unsigned) the value and then
+	   perform a signed 32-bit subtraction.  */
+	lsr	data1, data1, #56
+	sub	result, data1, data2, lsr #56
+	ret
+#endif
+
+L(mutual_align):
+	/* Sources are mutually aligned, but are not currently at an
+	   alignment boundary.  Round down the addresses and then mask off
+	   the bytes that precede the start point.
+	   We also need to adjust the limit calculations, but without
+	   overflowing if the limit is near ULONG_MAX.  */
+	bic	src1, src1, #7
+	bic	src2, src2, #7
+	ldr	data1, [src1], #8
+	neg	tmp3, count, lsl #3	/* 64 - bits(bytes beyond align). */
+	ldr	data2, [src2], #8
+	mov	tmp2, #~0
+	and	count, count, #0x3f
+	LS_FW	tmp2, tmp2, tmp3	/* Shift (count & 63).  */
+	/* Adjust the limit. Only low 3 bits used, so overflow irrelevant.  */
+	add	limit, limit, count
+	orr	data1, data1, tmp2
+	orr	data2, data2, tmp2
+	b	L(start_realigned)
+
+	.p2align 6
+	/* Don't bother with dwords for up to 16 bytes.  */
+L(misaligned8):
+	cmp	limit, #16
+	b.hs	L(try_misaligned_words)
+
+L(byte_loop):
+	/* Perhaps we can do better than this.  */
+	ldrb	data1w, [src1], #1
+	ldrb	data2w, [src2], #1
+	subs	limit, limit, #1
+	ccmp	data1w, #1, #0, hi	/* NZCV = 0b0000.  */
+	ccmp	data1w, data2w, #0, cs	/* NZCV = 0b0000.  */
+	b.eq	L(byte_loop)
+L(done):
+	sub	result, data1, data2
+	ret
+	/* Align the SRC1 to a dword by doing a bytewise compare and then do
+	   the dword loop.  */
+L(try_misaligned_words):
+	cbz	count, L(src1_aligned)
+
+	neg	count, count
+	and	count, count, #7
+	sub	limit, limit, count
+
+L(page_end_loop):
+	ldrb	data1w, [src1], #1
+	ldrb	data2w, [src2], #1
+	cmp	data1w, #1
+	ccmp	data1w, data2w, #0, cs	/* NZCV = 0b0000.  */
+	b.ne	L(done)
+	subs	count, count, #1
+	b.hi	L(page_end_loop)
+
+	/* The following diagram explains the comparison of misaligned strings.
+	   The bytes are shown in natural order. For little-endian, it is
+	   reversed in the registers. The "x" bytes are before the string.
+	   The "|" separates data that is loaded at one time.
+	   src1     | a a a a a a a a | b b b c c c c c | . . .
+	   src2     | x x x x x a a a   a a a a a b b b | c c c c c . . .
+
+	   After shifting in each step, the data looks like this:
+	                STEP_A              STEP_B              STEP_C
+	   data1    a a a a a a a a     b b b c c c c c     b b b c c c c c
+	   data2    a a a a a a a a     b b b 0 0 0 0 0     0 0 0 c c c c c
+
+	   The bytes with "0" are eliminated from the syndrome via mask.
+
+	   Align SRC2 down to 16 bytes. This way we can read 16 bytes at a
+	   time from SRC2. The comparison happens in 3 steps. After each step
+	   the loop can exit, or read from SRC1 or SRC2. */
+L(src1_aligned):
+	/* Calculate offset from 8 byte alignment to string start in bits. No
+	   need to mask offset since shifts are ignoring upper bits. */
+	lsl	offset, src2, #3
+	bic	src2, src2, #0xf
+	mov	mask, -1
+	neg	neg_offset, offset
+	ldr	data1, [src1], #8
+	ldp	tmp1, tmp2, [src2], #16
+	LS_BK	mask, mask, neg_offset
+	and	neg_offset, neg_offset, #63	/* Need actual value for cmp later. */
+	/* Skip the first compare if data in tmp1 is irrelevant. */
+	tbnz	offset, 6, L(misaligned_mid_loop)
+
+L(loop_misaligned):
+	/* STEP_A: Compare full 8 bytes when there is enough data from SRC2.*/
+	LS_FW	data2, tmp1, offset
+	LS_BK	tmp1, tmp2, neg_offset
+	subs	limit, limit, #8
+	orr	data2, data2, tmp1	/* 8 bytes from SRC2 combined from two regs.*/
+	sub	has_nul, data1, zeroones
+	eor	diff, data1, data2	/* Non-zero if differences found.  */
+	orr	tmp3, data1, #REP8_7f
+	csinv	endloop, diff, xzr, hi	/* If limit, set to all ones. */
+	bic	has_nul, has_nul, tmp3	/* Non-zero if NUL byte found in SRC1. */
+	orr	tmp3, endloop, has_nul
+	cbnz	tmp3, L(full_check)
+
+	ldr	data1, [src1], #8
+L(misaligned_mid_loop):
+	/* STEP_B: Compare first part of data1 to second part of tmp2. */
+	LS_FW	data2, tmp2, offset
+#ifdef __AARCH64EB__
+	/* For big-endian we do a byte reverse to avoid carry-propagation
+	problem described above. This way we can reuse the has_nul in the
+	next step and also use syndrome value trick at the end. */
+	rev	tmp3, data1
+	#define data1_fixed tmp3
+#else
+	#define data1_fixed data1
+#endif
+	sub	has_nul, data1_fixed, zeroones
+	orr	tmp3, data1_fixed, #REP8_7f
+	eor	diff, data2, data1	/* Non-zero if differences found.  */
+	bic	has_nul, has_nul, tmp3	/* Non-zero if NUL terminator.  */
+#ifdef __AARCH64EB__
+	rev	has_nul, has_nul
+#endif
+	cmp	limit, neg_offset, lsr #3
+	orr	syndrome, diff, has_nul
+	bic	syndrome, syndrome, mask	/* Ignore later bytes. */
+	csinv	tmp3, syndrome, xzr, hi	/* If limit, set to all ones. */
+	cbnz	tmp3, L(syndrome_check)
+
+	/* STEP_C: Compare second part of data1 to first part of tmp1. */
+	ldp	tmp1, tmp2, [src2], #16
+	cmp	limit, #8
+	LS_BK	data2, tmp1, neg_offset
+	eor	diff, data2, data1	/* Non-zero if differences found.  */
+	orr	syndrome, diff, has_nul
+	and	syndrome, syndrome, mask	/* Ignore earlier bytes. */
+	csinv	tmp3, syndrome, xzr, hi	/* If limit, set to all ones. */
+	cbnz	tmp3, L(syndrome_check)
+
+	ldr	data1, [src1], #8
+	sub	limit, limit, #8
+	b	L(loop_misaligned)
+
+#ifdef	__AARCH64EB__
+L(syndrome_check):
+	clz	pos, syndrome
+	cmp	pos, limit, lsl #3
+	b.lo	L(end_quick)
+#endif
+
+L(ret0):
+	mov	result, #0
+	ret
+END (__strncmp_aarch64_mte)
+
+END_FILE
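
strncmp layers the limit on top of the same syndrome; in scalar terms the aligned path is roughly the following (little-endian sketch, not part of the patch — the assembly instead always loads full words and compares the clz position against the remaining limit, so it needs no byte-wise tail on aligned inputs):

    #include <stdint.h>
    #include <string.h>

    #define REP8_01 0x0101010101010101ull
    #define REP8_7f 0x7f7f7f7f7f7f7f7full

    static int strncmp_aligned_model(const char *s1, const char *s2, size_t n)
    {
        size_t done = 0;
        while (n - done >= 8) {                 /* a full word of budget left */
            uint64_t d1, d2;
            memcpy(&d1, s1 + done, 8);
            memcpy(&d2, s2 + done, 8);
            uint64_t has_nul = (d1 - REP8_01) & ~(d1 | REP8_7f);
            uint64_t synd = (d1 ^ d2) | has_nul;
            if (synd) {
                unsigned sh = __builtin_ctzll(synd) & ~7u;
                return (int)((d1 >> sh) & 0xff) - (int)((d2 >> sh) & 0xff);
            }
            done += 8;
        }
        for (; done < n; done++) {              /* tail, byte at a time */
            unsigned char a = s1[done], b = s2[done];
            if (a != b || a == 0)
                return (int)a - (int)b;
        }
        return 0;
    }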
diff --git a/string/aarch64/strncmp-sve.S b/string/aarch64/strncmp-sve.S
index c4ec813..c8fbf32 100644
--- a/string/aarch64/strncmp-sve.S
+++ b/string/aarch64/strncmp-sve.S
@@ -5,6 +5,8 @@
  * SPDX-License-Identifier: MIT
  */
 
+#include "../asmdefs.h"
+
 #if __ARM_FEATURE_SVE
 /* Assumptions:
  *
@@ -15,10 +17,7 @@
 	.arch	armv8-a+sve
 	.text
 
-	.globl	__strncmp_aarch64_sve
-	.type	__strncmp_aarch64_sve, %function
-	.p2align 4
-__strncmp_aarch64_sve:
+ENTRY_ALIGN (__strncmp_aarch64_sve, 4)
 	setffr				/* initialize FFR */
 	mov	x3, 0			/* initialize off */
 
@@ -64,5 +63,8 @@
 9:	mov	x0, 0			/* return equal */
 	ret
 
-	.size	__strncmp_aarch64_sve, . - __strncmp_aarch64_sve
+END (__strncmp_aarch64_sve)
+
 #endif
+
+END_FILE
diff --git a/string/aarch64/strncmp.S b/string/aarch64/strncmp.S
index fbd08ee..766524b 100644
--- a/string/aarch64/strncmp.S
+++ b/string/aarch64/strncmp.S
@@ -42,7 +42,7 @@
 
 	.text
 	.p2align 6
-	.rep 7
+	.rep 6
 	nop	/* Pad so that the loop below fits a cache line.  */
 	.endr
 ENTRY_ALIGN (__strncmp_aarch64, 0)
@@ -259,3 +259,5 @@
 	ret
 
 END ( __strncmp_aarch64)
+
+END_FILE
diff --git a/string/aarch64/strnlen-sve.S b/string/aarch64/strnlen-sve.S
index e80d26e..cf293f6 100644
--- a/string/aarch64/strnlen-sve.S
+++ b/string/aarch64/strnlen-sve.S
@@ -5,6 +5,8 @@
  * SPDX-License-Identifier: MIT
  */
 
+#include "../asmdefs.h"
+
 #if __ARM_FEATURE_SVE
 /* Assumptions:
  *
@@ -15,10 +17,7 @@
 	.arch	armv8-a+sve
 	.text
 
-	.globl	__strnlen_aarch64_sve
-	.type	__strnlen_aarch64_sve, %function
-	.p2align 4
-__strnlen_aarch64_sve:
+ENTRY_ALIGN (__strnlen_aarch64_sve, 4)
 	setffr				/* initialize FFR */
 	mov	x2, 0			/* initialize len */
 	b	1f
@@ -70,5 +69,8 @@
 9:	mov	x0, x2
 	ret
 
-	.size	__strnlen_aarch64_sve, . - __strnlen_aarch64_sve
+END (__strnlen_aarch64_sve)
+
 #endif
+
+END_FILE
diff --git a/string/aarch64/strnlen.S b/string/aarch64/strnlen.S
index df66b60..202c401 100644
--- a/string/aarch64/strnlen.S
+++ b/string/aarch64/strnlen.S
@@ -40,7 +40,7 @@
 	.p2align	6
 L(start):
 	/* Pre-pad to ensure critical loop begins an icache line.  */
-	.rep 7
+	.rep 6
 	nop
 	.endr
 	/* Put this code here to avoid wasting more space with pre-padding.  */
@@ -153,3 +153,5 @@
 	b	L(realigned)
 
 END (__strnlen_aarch64)
+
+END_FILE
diff --git a/string/aarch64/strrchr-mte.S b/string/aarch64/strrchr-mte.S
new file mode 100644
index 0000000..bd1296d
--- /dev/null
+++ b/string/aarch64/strrchr-mte.S
@@ -0,0 +1,134 @@
+/*
+ * strrchr - find last position of a character in a string.
+ *
+ * Copyright (c) 2014-2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64
+ * Neon Available.
+ */
+
+#include "../asmdefs.h"
+
+/* Arguments and results.  */
+#define srcin		x0
+#define chrin		w1
+
+#define result		x0
+
+#define src		x2
+#define	tmp1		x3
+#define wtmp2		w4
+#define tmp3		x5
+#define src_match	x6
+#define src_offset	x7
+#define const_m1	x8
+#define tmp4		x9
+#define nul_match	x10
+#define chr_match	x11
+
+#define vrepchr		v0
+#define vdata		v1
+#define vhas_nul	v2
+#define vhas_chr	v3
+#define vrepmask_0	v4
+#define vrepmask_c	v16
+#define vend		v17
+
+/* Core algorithm.
+
+   For each 16-byte chunk we calculate a 64-bit syndrome value, with
+   four bits per byte (LSB is always in bits 0 and 1, for both big
+   and little-endian systems).  For each tuple, bit 0 is set if
+   the relevant byte matched the requested character; bit 1 is set
+   if the relevant byte matched the NUL end of string (we trigger
+   off bit0 for the special case of looking for NUL) and bits 2 and 3
+   are not used.
+   Since the bits in the syndrome reflect exactly the order in which
+   things occur in the original string a count_trailing_zeros()
+   operation will identify exactly which byte is causing the termination,
+   and why. */
+
+ENTRY (__strrchr_aarch64_mte)
+	/* Magic constant 0x10011001 to allow us to identify which lane
+	   matches the requested byte.  Magic constant 0x20022002 used
+	   similarly for NUL termination. */
+	mov	wtmp2, #0x1001
+	movk	wtmp2, #0x1001, lsl #16
+	dup	vrepchr.16b, chrin
+	bic	src, srcin, #15		/* Work with aligned 16-byte chunks. */
+	dup	vrepmask_c.4s, wtmp2
+	mov	src_offset, #0
+	ands	tmp1, srcin, #15
+	add	vrepmask_0.4s, vrepmask_c.4s, vrepmask_c.4s /* equiv: lsl #1 */
+	b.eq	L(aligned)
+
+	/* Input string is not 16-byte aligned.  Rather than forcing
+	   the padding bytes to a safe value, we calculate the syndrome
+	   for all the bytes, but then mask off those bits of the
+	   syndrome that are related to the padding.  */
+	ld1	{vdata.16b}, [src], #16
+	neg	tmp1, tmp1
+	cmeq	vhas_nul.16b, vdata.16b, #0
+	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
+	and	vhas_nul.16b, vhas_nul.16b, vrepmask_0.16b
+	and	vhas_chr.16b, vhas_chr.16b, vrepmask_c.16b
+	addp	vhas_nul.16b, vhas_nul.16b, vhas_nul.16b	// 128->64
+	addp	vhas_chr.16b, vhas_chr.16b, vhas_chr.16b	// 128->64
+	mov	nul_match, vhas_nul.d[0]
+	lsl	tmp1, tmp1, #2
+	mov	const_m1, #~0
+	mov	chr_match, vhas_chr.d[0]
+	lsr	tmp3, const_m1, tmp1
+
+	bic	nul_match, nul_match, tmp3	// Mask padding bits.
+	bic	chr_match, chr_match, tmp3	// Mask padding bits.
+	cbnz	nul_match, L(tail)
+
+L(loop):
+	cmp	chr_match, #0
+	csel	src_match, src, src_match, ne
+	csel	src_offset, chr_match, src_offset, ne
+L(aligned):
+	ld1	{vdata.16b}, [src], #16
+	cmeq	vhas_nul.16b, vdata.16b, #0
+	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
+	addp	vend.16b, vhas_nul.16b, vhas_nul.16b	// 128->64
+	and	vhas_chr.16b, vhas_chr.16b, vrepmask_c.16b
+	addp	vhas_chr.16b, vhas_chr.16b, vhas_chr.16b	// 128->64
+	mov	nul_match, vend.d[0]
+	mov	chr_match, vhas_chr.d[0]
+	cbz	nul_match, L(loop)
+
+	and	vhas_nul.16b, vhas_nul.16b, vrepmask_0.16b
+	addp	vhas_nul.16b, vhas_nul.16b, vhas_nul.16b
+	mov	nul_match, vhas_nul.d[0]
+
+L(tail):
+	/* Work out exactly where the string ends.  */
+	sub	tmp4, nul_match, #1
+	eor	tmp4, tmp4, nul_match
+	ands	chr_match, chr_match, tmp4
+	/* And pick the values corresponding to the last match.  */
+	csel	src_match, src, src_match, ne
+	csel	src_offset, chr_match, src_offset, ne
+
+	/* Count down from the top of the syndrome to find the last match.  */
+	clz	tmp3, src_offset
+	/* Src_match points beyond the word containing the match, so we can
+	   simply subtract half the bit-offset into the syndrome.  Because
+	   we are counting down, we need to go back one more character.  */
+	add	tmp3, tmp3, #2
+	sub	result, src_match, tmp3, lsr #2
+	/* But if the syndrome shows no match was found, then return NULL.  */
+	cmp	src_offset, #0
+	csel	result, result, xzr, ne
+
+	ret
+
+END (__strrchr_aarch64_mte)
+
+END_FILE
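
The L(tail) sequence above uses (nul_match - 1) ^ nul_match to keep only character matches at or before the terminating NUL; a scalar sketch of the bit trick (illustrative C, not part of the patch):

    #include <stdint.h>

    /* (x - 1) ^ x sets every bit up to and including the lowest set bit of x,
       so ANDing it with the char syndrome discards matches past the NUL.
       Note nul_match == 0 yields an all-ones mask, so every match survives
       while no NUL has been seen yet. */
    static int last_match_bit(uint64_t chr_match, uint64_t nul_match)
    {
        uint64_t keep = (nul_match - 1) ^ nul_match;
        uint64_t m = chr_match & keep;
        if (m == 0)
            return -1;                          /* no match at all: NULL */
        return 63 - __builtin_clzll(m);         /* highest surviving bit wins */
    }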
diff --git a/string/aarch64/strrchr-sve.S b/string/aarch64/strrchr-sve.S
index 4047a8e..fda9a43 100644
--- a/string/aarch64/strrchr-sve.S
+++ b/string/aarch64/strrchr-sve.S
@@ -5,6 +5,8 @@
  * SPDX-License-Identifier: MIT
  */
 
+#include "../asmdefs.h"
+
 #if __ARM_FEATURE_SVE
 /* Assumptions:
  *
@@ -15,10 +17,7 @@
 	.arch	armv8-a+sve
 	.text
 
-	.globl	__strrchr_aarch64_sve
-	.type	__strrchr_aarch64_sve, %function
-	.p2align 4
-__strrchr_aarch64_sve:
+ENTRY_ALIGN (__strrchr_aarch64_sve, 4)
 	dup	z1.b, w1		/* replicate byte across vector */
 	setffr				/* initialize FFR */
 	ptrue	p1.b			/* all ones; loop invariant */
@@ -81,5 +80,8 @@
 5:	mov	x0, 0
 	ret
 
-	.size	__strrchr_aarch64_sve, . - __strrchr_aarch64_sve
+END (__strrchr_aarch64_sve)
+
 #endif
+
+END_FILE
diff --git a/string/aarch64/strrchr.S b/string/aarch64/strrchr.S
index 1b4caac..726aa83 100644
--- a/string/aarch64/strrchr.S
+++ b/string/aarch64/strrchr.S
@@ -84,38 +84,38 @@
 	and	vhas_chr2.16b, vhas_chr2.16b, vrepmask_c.16b
 	addp	vhas_nul1.16b, vhas_nul1.16b, vhas_nul2.16b	// 256->128
 	addp	vhas_chr1.16b, vhas_chr1.16b, vhas_chr2.16b	// 256->128
-	addp	vhas_nul1.16b, vhas_nul1.16b, vhas_nul1.16b	// 128->64
-	addp	vhas_chr1.16b, vhas_chr1.16b, vhas_chr1.16b	// 128->64
-	mov	nul_match, vhas_nul1.d[0]
+	addp	vend1.16b, vhas_nul1.16b, vhas_chr1.16b		// 128->64
+	mov	nul_match, vend1.d[0]
 	lsl	tmp1, tmp1, #1
 	mov	const_m1, #~0
-	mov	chr_match, vhas_chr1.d[0]
 	lsr	tmp3, const_m1, tmp1
+	mov	chr_match, vend1.d[1]
 
 	bic	nul_match, nul_match, tmp3	// Mask padding bits.
 	bic	chr_match, chr_match, tmp3	// Mask padding bits.
 	cbnz	nul_match, L(tail)
 
+	.p2align 4
 L(loop):
 	cmp	chr_match, #0
 	csel	src_match, src, src_match, ne
 	csel	src_offset, chr_match, src_offset, ne
 L(aligned):
 	ld1	{vdata1.16b, vdata2.16b}, [src], #32
-	cmeq	vhas_nul1.16b, vdata1.16b, #0
 	cmeq	vhas_chr1.16b, vdata1.16b, vrepchr.16b
-	cmeq	vhas_nul2.16b, vdata2.16b, #0
 	cmeq	vhas_chr2.16b, vdata2.16b, vrepchr.16b
-	addp	vend1.16b, vhas_nul1.16b, vhas_nul2.16b	// 256->128
+	uminp	vend1.16b, vdata1.16b, vdata2.16b
 	and	vhas_chr1.16b, vhas_chr1.16b, vrepmask_c.16b
 	and	vhas_chr2.16b, vhas_chr2.16b, vrepmask_c.16b
+	cmeq	vend1.16b, vend1.16b, 0
 	addp	vhas_chr1.16b, vhas_chr1.16b, vhas_chr2.16b	// 256->128
-	addp	vend1.16b, vend1.16b, vend1.16b	// 128->64
-	addp	vhas_chr1.16b, vhas_chr1.16b, vhas_chr1.16b	// 128->64
+	addp	vend1.16b, vend1.16b, vhas_chr1.16b		// 128->64
 	mov	nul_match, vend1.d[0]
-	mov	chr_match, vhas_chr1.d[0]
+	mov	chr_match, vend1.d[1]
 	cbz	nul_match, L(loop)
 
+	cmeq	vhas_nul1.16b, vdata1.16b, #0
+	cmeq	vhas_nul2.16b, vdata2.16b, #0
 	and	vhas_nul1.16b, vhas_nul1.16b, vrepmask_0.16b
 	and	vhas_nul2.16b, vhas_nul2.16b, vrepmask_0.16b
 	addp	vhas_nul1.16b, vhas_nul1.16b, vhas_nul2.16b
@@ -145,3 +145,5 @@
 	ret
 
 END (__strrchr_aarch64)
+
+END_FILE
diff --git a/string/asmdefs.h b/string/asmdefs.h
index 7d143a9..c7fcb08 100644
--- a/string/asmdefs.h
+++ b/string/asmdefs.h
@@ -8,6 +8,55 @@
 #ifndef _ASMDEFS_H
 #define _ASMDEFS_H
 
+#if defined(__aarch64__)
+
+/* Branch Target Identification support.  */
+#define BTI_C		hint	34
+#define BTI_J		hint	36
+/* Return address signing support (pac-ret).  */
+#define PACIASP		hint	25; .cfi_window_save
+#define AUTIASP		hint	29; .cfi_window_save
+
+#define FEATURE_1_BTI 1
+#define FEATURE_1_PAC 2
+
+/* Add a GNU_PROPERTY_AARCH64_FEATURE_1_AND note.  */
+#define GNU_PROPERTY(features)		\
+  .section .note.gnu.property, "a";	\
+  .p2align 3;				\
+  .word 4;				\
+  .word 16;				\
+  .word 5;				\
+  .asciz "GNU";				\
+  .word 0xc0000000;			\
+  .word 4;				\
+  .word features;			\
+  .word 0;
+
+/* If set, the GNU Property Note section will be added to
+   mark objects as supporting BTI and PAC-RET.  */
+#ifndef WANT_GNU_PROPERTY
+#define WANT_GNU_PROPERTY 1
+#endif
+
+#if WANT_GNU_PROPERTY
+#define END_FILE GNU_PROPERTY(FEATURE_1_BTI|FEATURE_1_PAC)
+#else
+#define END_FILE
+#endif
+
+#define ENTRY_ALIGN(name, alignment)	\
+  .global name;		\
+  .type name,%function;	\
+  .align alignment;		\
+  name:			\
+  .cfi_startproc;	\
+  BTI_C;
+
+#else
+
+#define END_FILE
+
 #define ENTRY_ALIGN(name, alignment)	\
   .global name;		\
   .type name,%function;	\
@@ -15,6 +64,8 @@
   name:			\
   .cfi_startproc;
 
+#endif
+
 #define ENTRY(name)	ENTRY_ALIGN(name, 6)
 
 #define ENTRY_ALIAS(name)	\
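
For reference, the note emitted by GNU_PROPERTY has the following layout (a C rendering of the .word/.asciz sequence above; the struct is illustrative, not part of the patch). The linker ANDs this property across all input objects, which is why every .S file now has to end in END_FILE: a single object missing the note would clear the BTI/PAC markings for the whole output.

    #include <stdint.h>

    /* .note.gnu.property as emitted by GNU_PROPERTY(features). */
    struct gnu_property_note {
        uint32_t namesz;     /* 4: sizeof "GNU" including the NUL */
        uint32_t descsz;     /* 16: size of the descriptor below */
        uint32_t type;       /* 5: NT_GNU_PROPERTY_TYPE_0 */
        char     name[4];    /* "GNU" */
        uint32_t pr_type;    /* 0xc0000000: GNU_PROPERTY_AARCH64_FEATURE_1_AND */
        uint32_t pr_datasz;  /* 4 */
        uint32_t pr_data;    /* FEATURE_1_BTI | FEATURE_1_PAC */
        uint32_t pr_pad;     /* 0: pads the descriptor to 8 bytes */
    };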
diff --git a/string/include/stringlib.h b/string/include/stringlib.h
index 25a4475..34e5a29 100644
--- a/string/include/stringlib.h
+++ b/string/include/stringlib.h
@@ -27,8 +27,13 @@
 size_t __strlen_aarch64 (const char *);
 size_t __strnlen_aarch64 (const char *, size_t);
 int __strncmp_aarch64 (const char *, const char *, size_t);
+void *__memchr_aarch64_mte (const void *, int, size_t);
 char *__strchr_aarch64_mte (const char *, int);
+char *__strchrnul_aarch64_mte (const char *, int);
 size_t __strlen_aarch64_mte (const char *);
+char *__strrchr_aarch64_mte (const char *, int);
+int __strcmp_aarch64_mte (const char *, const char *);
+int __strncmp_aarch64_mte (const char *, const char *, size_t);
 #if __ARM_NEON
 void *__memcpy_aarch64_simd (void *__restrict, const void *__restrict, size_t);
 void *__memmove_aarch64_simd (void *, const void *, size_t);
diff --git a/string/test/memchr.c b/string/test/memchr.c
index 1ebc6d6..15531c6 100644
--- a/string/test/memchr.c
+++ b/string/test/memchr.c
@@ -11,6 +11,7 @@
 #include <string.h>
 #include <limits.h>
 #include "stringlib.h"
+#include "stringtest.h"
 
 static const struct fun
 {
@@ -21,6 +22,7 @@
 F(memchr)
 #if __aarch64__
 F(__memchr_aarch64)
+F(__memchr_aarch64_mte)
 # if __ARM_FEATURE_SVE
 F(__memchr_aarch64_sve)
 # endif
@@ -31,12 +33,11 @@
 	{0, 0}
 };
 
-static int test_status;
-#define ERR(...) (test_status=1, printf(__VA_ARGS__))
-
 #define A 32
 #define SP 512
 #define LEN 250000
+#define MAX_LEN SIZE_MAX
+
 static unsigned char sbuf[LEN+2*A];
 
 static void *alignup(void *p)
@@ -44,30 +45,34 @@
 	return (void*)(((uintptr_t)p + A-1) & -A);
 }
 
-static void test(const struct fun *fun, int align, int seekpos, int len)
+static void test(const struct fun *fun, int align, size_t seekpos,
+	size_t array_len, size_t param_len)
 {
 	unsigned char *src = alignup(sbuf);
 	unsigned char *s = src + align;
-	unsigned char *f = len ? s + seekpos : 0;
+	unsigned char *f = array_len ? s + seekpos : 0;
 	int seekchar = 0x1;
 	int i;
 	void *p;
 
-	if (len > LEN || seekpos >= len || align >= A)
+	if (err_count >= ERR_LIMIT)
+		return;
+	if (array_len > LEN || seekpos >= array_len || align >= A)
 		abort();
 
 	for (i = 0; i < seekpos; i++)
 		s[i] = 'a' + i%23;
 	s[i++] = seekchar;
-	for (; i < len; i++)
+	for (; i < array_len; i++)
 		s[i] = 'a' + i%23;
 
-	p = fun->fun(s, seekchar, len);
+	p = fun->fun(s, seekchar, param_len);
 
 	if (p != f) {
-		ERR("%s(%p,0x%02x,%d) returned %p\n", fun->name, s, seekchar, len, p);
-		ERR("expected: %p\n", f);
-		abort();
+		ERR("%s(%p,0x%02x,%zu) returned %p\n",
+			fun->name, s, seekchar, param_len, p);
+		printf("expected: %p\n", f);
+		quote("str", s, param_len);
 	}
 }
 
@@ -75,18 +80,21 @@
 {
 	int r = 0;
 	for (int i=0; funtab[i].name; i++) {
-		test_status = 0;
+		err_count = 0;
 		for (int a = 0; a < A; a++) {
 			for (int n = 0; n < 100; n++)
 				for (int sp = 0; sp < n-1; sp++)
-					test(funtab+i, a, sp, n);
+					test(funtab+i, a, sp, n, n);
 			for (int n = 100; n < LEN; n *= 2) {
-				test(funtab+i, a, n-1, n);
-				test(funtab+i, a, n/2, n);
+				test(funtab+i, a, n-1, n, n);
+				test(funtab+i, a, n/2, n, n);
+			}
+			for (int n = 0; n < 100; n++) {
+				test(funtab+i, a, LEN-1-n, LEN, MAX_LEN-n);
 			}
 		}
-		printf("%s %s\n", test_status ? "FAIL" : "PASS", funtab[i].name);
-		if (test_status)
+		printf("%s %s\n", err_count ? "FAIL" : "PASS", funtab[i].name);
+		if (err_count)
 			r = -1;
 	}
 	return r;
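
The new MAX_LEN cases pass a length near SIZE_MAX while the match sits inside the real buffer, so an implementation that computes s + len (or otherwise wraps the count) fails. A minimal sketch of the property being exercised, assuming memchr stops at the first match:

#include <assert.h>
#include <stdint.h>
#include <string.h>

int main(void)
{
	char buf[16] = "aaaaaaaXaaaaaaa";
	/* Huge count, but the match is at offset 7; must not overflow. */
	assert(memchr(buf, 'X', SIZE_MAX) == &buf[7]);
	return 0;
}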
diff --git a/string/test/memcmp.c b/string/test/memcmp.c
index 114f1d7..28160ef 100644
--- a/string/test/memcmp.c
+++ b/string/test/memcmp.c
@@ -10,6 +10,7 @@
 #include <stdlib.h>
 #include <string.h>
 #include "stringlib.h"
+#include "stringtest.h"
 
 static const struct fun
 {
@@ -28,9 +29,6 @@
 	{0, 0}
 };
 
-static int test_status;
-#define ERR(...) (test_status=1, printf(__VA_ARGS__))
-
 #define A 32
 #define LEN 250000
 static unsigned char s1buf[LEN+2*A];
@@ -41,7 +39,7 @@
 	return (void*)(((uintptr_t)p + A-1) & -A);
 }
 
-static void test(const struct fun *fun, int s1align, int s2align, int len, int diffpos)
+static void test(const struct fun *fun, int s1align, int s2align, int len, int diffpos, int delta)
 {
 	unsigned char *src1 = alignup(s1buf);
 	unsigned char *src2 = alignup(s2buf);
@@ -49,25 +47,29 @@
 	unsigned char *s2 = src2 + s2align;
 	int r;
 
+	if (err_count >= ERR_LIMIT)
+		return;
 	if (len > LEN || s1align >= A || s2align >= A)
 		abort();
-	if (diffpos && diffpos >= len)
+	if (diffpos >= len)
+		abort();
+	if ((diffpos < 0) != (delta == 0))
 		abort();
 
 	for (int i = 0; i < len+A; i++)
 		src1[i] = src2[i] = '?';
 	for (int i = 0; i < len; i++)
 		s1[i] = s2[i] = 'a' + i%23;
-	if (diffpos)
-		s1[diffpos]++;
+	if (delta)
+		s1[diffpos] += delta;
 
 	r = fun->fun(s1, s2, len);
 
-	if ((!diffpos && r != 0) || (diffpos && r == 0)) {
+	if ((delta == 0 && r != 0) || (delta > 0 && r <= 0) || (delta < 0 && r >= 0)) {
 		ERR("%s(align %d, align %d, %d) failed, returned %d\n",
 			fun->name, s1align, s2align, len, r);
-		ERR("src1: %.*s\n", s1align+len+1, src1);
-		ERR("src2: %.*s\n", s2align+len+1, src2);
+		quoteat("src1", src1, len+A, diffpos);
+		quoteat("src2", src2, len+A, diffpos);
 	}
 }
 
@@ -75,21 +77,27 @@
 {
 	int r = 0;
 	for (int i=0; funtab[i].name; i++) {
-		test_status = 0;
+		err_count = 0;
 		for (int d = 0; d < A; d++)
 			for (int s = 0; s < A; s++) {
 				int n;
-				for (n = 0; n < 100; n++) {
-					test(funtab+i, d, s, n, 0);
-					test(funtab+i, d, s, n, n / 2);
+				test(funtab+i, d, s, 0, -1,  0);
+				test(funtab+i, d, s, 1, -1,  0);
+				test(funtab+i, d, s, 1,  0, -1);
+				test(funtab+i, d, s, 1,  0,  1);
+				for (n = 2; n < 100; n++) {
+					test(funtab+i, d, s, n, -1, 0);
+					test(funtab+i, d, s, n, 0, -1);
+					test(funtab+i, d, s, n, n - 1, -1);
+					test(funtab+i, d, s, n, n / 2, 1);
 				}
 				for (; n < LEN; n *= 2) {
-					test(funtab+i, d, s, n, 0);
-					test(funtab+i, d, s, n, n / 2);
+					test(funtab+i, d, s, n, -1, 0);
+					test(funtab+i, d, s, n, n / 2, -1);
 				}
 			}
-		printf("%s %s\n", test_status ? "FAIL" : "PASS", funtab[i].name);
-		if (test_status)
+		printf("%s %s\n", err_count ? "FAIL" : "PASS", funtab[i].name);
+		if (err_count)
 			r = -1;
 	}
 	return r;
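
The tests now inject a signed difference (delta) at diffpos and check the sign of the result rather than just zero versus non-zero; the strcmp and strncmp tests below get the same treatment. A minimal sketch:

#include <assert.h>
#include <string.h>

int main(void)
{
	char a[4] = "abc", b[4] = "abc";
	a[1] -= 1;                      /* delta = -1 at diffpos 1 */
	assert(memcmp(a, b, 3) < 0);    /* sign must match the delta */
	assert(memcmp(b, a, 3) > 0);
	return 0;
}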
diff --git a/string/test/memcpy.c b/string/test/memcpy.c
index 8572452..bf1bbae 100644
--- a/string/test/memcpy.c
+++ b/string/test/memcpy.c
@@ -10,6 +10,7 @@
 #include <stdlib.h>
 #include <string.h>
 #include "stringlib.h"
+#include "stringtest.h"
 
 static const struct fun
 {
@@ -30,9 +31,6 @@
 	{0, 0}
 };
 
-static int test_status;
-#define ERR(...) (test_status=1, printf(__VA_ARGS__))
-
 #define A 32
 #define LEN 250000
 static unsigned char dbuf[LEN+2*A];
@@ -55,6 +53,8 @@
 	void *p;
 	int i;
 
+	if (err_count >= ERR_LIMIT)
+		return;
 	if (len > LEN || dalign >= A || salign >= A)
 		abort();
 	for (i = 0; i < len+A; i++) {
@@ -70,8 +70,8 @@
 	for (i = 0; i < len+A; i++) {
 		if (dst[i] != want[i]) {
 			ERR("%s(align %d, align %d, %d) failed\n", fun->name, dalign, salign, len);
-			ERR("got : %.*s\n", dalign+len+1, dst);
-			ERR("want: %.*s\n", dalign+len+1, want);
+			quoteat("got", dst, len+A, i);
+			quoteat("want", want, len+A, i);
 			break;
 		}
 	}
@@ -81,7 +81,7 @@
 {
 	int r = 0;
 	for (int i=0; funtab[i].name; i++) {
-		test_status = 0;
+		err_count = 0;
 		for (int d = 0; d < A; d++)
 			for (int s = 0; s < A; s++) {
 				int n;
@@ -90,8 +90,8 @@
 				for (; n < LEN; n *= 2)
 					test(funtab+i, d, s, n);
 			}
-		printf("%s %s\n", test_status ? "FAIL" : "PASS", funtab[i].name);
-		if (test_status)
+		printf("%s %s\n", err_count ? "FAIL" : "PASS", funtab[i].name);
+		if (err_count)
 			r = -1;
 	}
 	return r;
diff --git a/string/test/memmove.c b/string/test/memmove.c
index 7891b14..04f4c3c 100644
--- a/string/test/memmove.c
+++ b/string/test/memmove.c
@@ -10,6 +10,7 @@
 #include <stdlib.h>
 #include <string.h>
 #include "stringlib.h"
+#include "stringtest.h"
 
 static const struct fun
 {
@@ -28,9 +29,6 @@
 	{0, 0}
 };
 
-static int test_status;
-#define ERR(...) (test_status=1, printf(__VA_ARGS__))
-
 #define A 32
 #define LEN 250000
 static unsigned char dbuf[LEN+2*A];
@@ -53,6 +51,8 @@
 	void *p;
 	int i;
 
+	if (err_count >= ERR_LIMIT)
+		return;
 	if (len > LEN || dalign >= A || salign >= A)
 		abort();
 	for (i = 0; i < len+A; i++) {
@@ -68,8 +68,8 @@
 	for (i = 0; i < len+A; i++) {
 		if (dst[i] != want[i]) {
 			ERR("%s(align %d, align %d, %d) failed\n", fun->name, dalign, salign, len);
-			ERR("got : %.*s\n", dalign+len+1, dst);
-			ERR("want: %.*s\n", dalign+len+1, want);
+			quoteat("got", dst, len+A, i);
+			quoteat("want", want, len+A, i);
 			break;
 		}
 	}
@@ -78,13 +78,15 @@
 static void test_overlap(const struct fun *fun, int dalign, int salign, int len)
 {
 	unsigned char *src = alignup(sbuf);
-	unsigned char *dst = alignup(sbuf);
+	unsigned char *dst = src;
 	unsigned char *want = wbuf;
 	unsigned char *s = src + salign;
 	unsigned char *d = dst + dalign;
 	unsigned char *w = wbuf + dalign;
 	void *p;
 
+	if (err_count >= ERR_LIMIT)
+		return;
 	if (len > LEN || dalign >= A || salign >= A)
 		abort();
 
@@ -92,16 +94,9 @@
 		src[i] = want[i] = '?';
 
 	for (int i = 0; i < len; i++)
-		s[i] = w[i] = 'a' + i%23;
-
-	/* Copy the potential overlap range.  */
-	if (s < d) {
-		for (int i = 0; i < (uintptr_t)d-(uintptr_t)s; i++)
-			want[salign+i] = src[salign+i];
-	} else {
-		for (int i = 0; i < (uintptr_t)s-(uintptr_t)d; i++)
-			want[len + dalign + i] = src[len + dalign + i];
-	}
+		s[i] = want[salign+i] = 'a' + i%23;
+	for (int i = 0; i < len; i++)
+		w[i] = s[i];
 
 	p = fun->fun(d, s, len);
 	if (p != d)
@@ -109,9 +104,8 @@
 	for (int i = 0; i < len+A; i++) {
 		if (dst[i] != want[i]) {
 			ERR("%s(align %d, align %d, %d) failed\n", fun->name, dalign, salign, len);
-			ERR("got : %.*s\n", dalign+len+1, dst);
-			ERR("want: %.*s\n", dalign+len+1, want);
-			abort();
+			quoteat("got", dst, len+A, i);
+			quoteat("want", want, len+A, i);
 			break;
 		}
 	}
@@ -119,11 +113,9 @@
 
 int main()
 {
-	test_overlap(funtab+0, 2, 1, 1);
-
 	int r = 0;
 	for (int i=0; funtab[i].name; i++) {
-		test_status = 0;
+		err_count = 0;
 		for (int d = 0; d < A; d++)
 			for (int s = 0; s < A; s++) {
 				int n;
@@ -136,8 +128,8 @@
 					test_overlap(funtab+i, d, s, n);
 				}
 			}
-		printf("%s %s\n", test_status ? "FAIL" : "PASS", funtab[i].name);
-		if (test_status)
+		printf("%s %s\n", err_count ? "FAIL" : "PASS", funtab[i].name);
+		if (err_count)
 			r = -1;
 	}
 	return r;
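
test_overlap now builds the expected image directly: the pattern is written at the source offset, then copied byte-for-byte to the destination offset, which is exactly the result memmove guarantees for overlapping ranges. A minimal sketch:

#include <assert.h>
#include <string.h>

int main(void)
{
	char buf[] = "??abcde??";
	memmove(buf + 1, buf + 2, 5);               /* overlapping move */
	assert(memcmp(buf, "?abcdee??", 9) == 0);   /* source bytes land at dest */
	return 0;
}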
diff --git a/string/test/memset.c b/string/test/memset.c
index 48c10fa..8b05bd6 100644
--- a/string/test/memset.c
+++ b/string/test/memset.c
@@ -10,6 +10,7 @@
 #include <stdlib.h>
 #include <string.h>
 #include "stringlib.h"
+#include "stringtest.h"
 
 static const struct fun
 {
@@ -27,9 +28,6 @@
 	{0, 0}
 };
 
-static int test_status;
-#define ERR(...) (test_status=1, printf(__VA_ARGS__))
-
 #define A 32
 #define LEN 250000
 static unsigned char sbuf[LEN+2*A];
@@ -39,12 +37,6 @@
 	return (void*)(((uintptr_t)p + A-1) & -A);
 }
 
-static void err(const char *name, unsigned char *src, int salign, int c, int len)
-{
-	ERR("%s(align %d, %d, %d) failed\n", name, salign, c, len);
-	ERR("got : %.*s\n", salign+len+1, src);
-}
-
 static void test(const struct fun *fun, int salign, int c, int len)
 {
 	unsigned char *src = alignup(sbuf);
@@ -52,14 +44,14 @@
 	void *p;
 	int i;
 
+	if (err_count >= ERR_LIMIT)
+		return;
 	if (len > LEN || salign >= A)
 		abort();
 	for (i = 0; i < len+A; i++)
 		src[i] = '?';
 	for (i = 0; i < len; i++)
 		s[i] = 'a' + i%23;
-	for (; i<len%A; i++)
-		s[i] = '*';
 
 	p = fun->fun(s, c, len);
 	if (p != s)
@@ -67,19 +59,22 @@
 
 	for (i = 0; i < salign; i++) {
 		if (src[i] != '?') {
-			err(fun->name, src, salign, c, len);
+			ERR("%s(align %d, %d, %d) failed\n", fun->name, salign, c, len);
+			quoteat("got", src, len+A, i);
 			return;
 		}
 	}
-	for (i = salign; i < len; i++) {
+	for (; i < salign+len; i++) {
 		if (src[i] != (unsigned char)c) {
-			err(fun->name, src, salign, c, len);
+			ERR("%s(align %d, %d, %d) failed\n", fun->name, salign, c, len);
+			quoteat("got", src, len+A, i);
 			return;
 		}
 	}
-	for (; i < len%A; i++) {
-		if (src[i] != '*') {
-			err(fun->name, src, salign, c, len);
+	for (; i < len+A; i++) {
+		if (src[i] != '?') {
+			ERR("%s(align %d, %d, %d) failed\n", fun->name, salign, c, len);
+			quoteat("got", src, len+A, i);
 			return;
 		}
 	}
@@ -89,7 +84,7 @@
 {
 	int r = 0;
 	for (int i=0; funtab[i].name; i++) {
-		test_status = 0;
+		err_count = 0;
 		for (int s = 0; s < A; s++) {
 			int n;
 			for (n = 0; n < 100; n++) {
@@ -103,8 +98,8 @@
 				test(funtab+i, s, 0xaa25, n);
 			}
 		}
-		printf("%s %s\n", test_status ? "FAIL" : "PASS", funtab[i].name);
-		if (test_status)
+		printf("%s %s\n", err_count ? "FAIL" : "PASS", funtab[i].name);
+		if (err_count)
 			r = -1;
 	}
 	return r;
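
The verification loop now walks all three regions: the untouched head, the fill from salign to salign+len, and the untouched tail. A minimal sketch of that layout:

#include <assert.h>
#include <string.h>

int main(void)
{
	char buf[8];
	memset(buf, '?', 8);
	memset(buf + 2, 'x', 3);                    /* align 2, len 3 */
	assert(memcmp(buf, "??xxx???", 8) == 0);    /* head / fill / tail */
	return 0;
}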
diff --git a/string/test/stpcpy.c b/string/test/stpcpy.c
index 9050227..9001057 100644
--- a/string/test/stpcpy.c
+++ b/string/test/stpcpy.c
@@ -11,6 +11,7 @@
 #include <stdlib.h>
 #include <string.h>
 #include "stringlib.h"
+#include "stringtest.h"
 
 static const struct fun
 {
@@ -29,14 +30,11 @@
 	{0, 0}
 };
 
-static int test_status;
-#define ERR(...) (test_status=1, printf(__VA_ARGS__))
-
 #define A 32
 #define LEN 250000
-static char dbuf[LEN+2*A];
-static char sbuf[LEN+2*A];
-static char wbuf[LEN+2*A];
+static char dbuf[LEN+2*A+1];
+static char sbuf[LEN+2*A+1];
+static char wbuf[LEN+2*A+1];
 
 static void *alignup(void *p)
 {
@@ -54,6 +52,8 @@
 	void *p;
 	int i;
 
+	if (err_count >= ERR_LIMIT)
+		return;
 	if (len > LEN || dalign >= A || salign >= A)
 		abort();
 	for (i = 0; i < len+A; i++) {
@@ -62,7 +62,7 @@
 	}
 	for (i = 0; i < len; i++)
 		s[i] = w[i] = 'a' + i%23;
-	s[i] = w[i] = '\0';
+	s[len] = w[len] = '\0';
 
 	p = fun->fun(d, s);
 	if (p != d + len)
@@ -70,8 +70,8 @@
 	for (i = 0; i < len+A; i++) {
 		if (dst[i] != want[i]) {
 			ERR("%s(align %d, align %d, %d) failed\n", fun->name, dalign, salign, len);
-			ERR("got : %.*s\n", dalign+len+1, dst);
-			ERR("want: %.*s\n", dalign+len+1, want);
+			quoteat("got", dst, len+A, i);
+			quoteat("want", want, len+A, i);
 			break;
 		}
 	}
@@ -81,7 +81,7 @@
 {
 	int r = 0;
 	for (int i=0; funtab[i].name; i++) {
-		test_status = 0;
+		err_count = 0;
 		for (int d = 0; d < A; d++)
 			for (int s = 0; s < A; s++) {
 				int n;
@@ -90,8 +90,8 @@
 				for (; n < LEN; n *= 2)
 					test(funtab+i, d, s, n);
 			}
-		printf("%s %s\n", test_status ? "FAIL" : "PASS", funtab[i].name);
-		if (test_status)
+		printf("%s %s\n", err_count ? "FAIL" : "PASS", funtab[i].name);
+		if (err_count)
 			r = -1;
 	}
 	return r;
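
The test asserts stpcpy's return contract, a pointer to the new terminator (d + len). A minimal sketch (stpcpy needs POSIX.1-2008):

#define _POSIX_C_SOURCE 200809L
#include <assert.h>
#include <string.h>

int main(void)
{
	char dst[8];
	char *end = stpcpy(dst, "abc");
	assert(end == dst + 3 && *end == '\0');
	return 0;
}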
diff --git a/string/test/strchr.c b/string/test/strchr.c
index 80a454a..1d90c85 100644
--- a/string/test/strchr.c
+++ b/string/test/strchr.c
@@ -11,6 +11,7 @@
 #include <string.h>
 #include <limits.h>
 #include "stringlib.h"
+#include "stringtest.h"
 
 static const struct fun
 {
@@ -30,13 +31,9 @@
 	{0, 0}
 };
 
-static int test_status;
-#define ERR(...) (test_status=1, printf(__VA_ARGS__))
-
 #define A 32
-#define SP 512
-#define LEN 250000
-static char sbuf[LEN+2*A];
+#define LEN 512
+static char sbuf[LEN+3*A];
 
 static void *alignup(void *p)
 {
@@ -51,25 +48,33 @@
 	int seekchar = 0x1;
 	void *p;
 
-	if (len > LEN || seekpos >= len - 1 || align >= A)
-		abort();
-	if (seekchar >= 'a' && seekchar <= 'a' + 23)
+	if (err_count >= ERR_LIMIT)
+		return;
+	if (len > LEN || seekpos >= len || align >= A)
 		abort();
 
-	for (int i = 0; i < len + A; i++)
-		src[i] = '?';
-	for (int i = 0; i < len - 2; i++)
-		s[i] = 'a' + i%23;
+	for (int i = 0; src + i < s; i++)
+		src[i] = i & 1 ? seekchar : 0;
+	for (int i = 1; i < A; i++)
+		s[len+i] = i & 1 ? seekchar : 0;
+	for (int i = 0; i < len; i++)
+		s[i] = 'a' + i%32;
 	if (seekpos != -1)
-		s[seekpos] = seekchar;
-	s[len - 1] = '\0';
+		s[seekpos] = s[seekpos+2] = seekchar;
+	s[len] = '\0';
 
 	p = fun->fun(s, seekchar);
-
 	if (p != f) {
-		ERR("%s(%p,0x%02x,%d) returned %p\n", fun->name, s, seekchar, len, p);
-		ERR("expected: %p\n", f);
-		abort();
+		ERR("%s(%p,0x%02x) len %d returned %p, expected %p pos %d\n",
+			fun->name, s, seekchar, len, p, f, seekpos);
+		quote("input", s, len);
+	}
+
+	p = fun->fun(s, 0);
+	if (p != s + len) {
+		ERR("%s(%p,0x%02x) len %d returned %p, expected %p pos %d\n",
+			fun->name, s, seekchar, len, p, s + len, len);
+		quote("input", s, len);
 	}
 }
 
@@ -77,21 +82,17 @@
 {
 	int r = 0;
 	for (int i=0; funtab[i].name; i++) {
-		test_status = 0;
+		err_count = 0;
 		for (int a = 0; a < A; a++) {
 			int n;
-			for (n = 1; n < 100; n++) {
-				for (int sp = 0; sp < n - 1; sp++)
+			for (n = 1; n < LEN; n++) {
+				for (int sp = 0; sp < n; sp++)
 					test(funtab+i, a, sp, n);
 				test(funtab+i, a, -1, n);
 			}
-			for (; n < LEN; n *= 2) {
-				test(funtab+i, a, -1, n);
-				test(funtab+i, a, n / 2, n);
-			}
 		}
-		printf("%s %s\n", test_status ? "FAIL" : "PASS", funtab[i].name);
-		if (test_status)
+		printf("%s %s\n", err_count ? "FAIL" : "PASS", funtab[i].name);
+		if (err_count)
 			r = -1;
 	}
 	return r;
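
The rewritten strchr test poisons the bytes before and after the string with alternating seekchar/NUL values, so any read outside [s, s+len] changes the answer, and it additionally checks searching for NUL. A minimal sketch of the two properties:

#include <assert.h>
#include <string.h>

int main(void)
{
	/* 1 is the seekchar; buf[5] is poison past the terminator. */
	char buf[8] = { 'a', 'b', 1, 'c', '\0', 1, 0, 1 };
	assert(strchr(buf, 1) == &buf[2]);   /* first match, poison ignored */
	assert(strchr(buf, 0) == &buf[4]);   /* NUL search returns s + len */
	return 0;
}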
diff --git a/string/test/strchrnul.c b/string/test/strchrnul.c
index 814dd1e..b103568 100644
--- a/string/test/strchrnul.c
+++ b/string/test/strchrnul.c
@@ -13,6 +13,7 @@
 #include <string.h>
 #include <limits.h>
 #include "stringlib.h"
+#include "stringtest.h"
 
 static const struct fun
 {
@@ -23,6 +24,7 @@
 F(strchrnul)
 #if __aarch64__
 F(__strchrnul_aarch64)
+F(__strchrnul_aarch64_mte)
 # if __ARM_FEATURE_SVE
 F(__strchrnul_aarch64_sve)
 # endif
@@ -31,13 +33,9 @@
 	{0, 0}
 };
 
-static int test_status;
-#define ERR(...) (test_status=1, printf(__VA_ARGS__))
-
 #define A 32
-#define SP 512
-#define LEN 250000
-static char sbuf[LEN+2*A];
+#define LEN 512
+static char sbuf[LEN+3*A];
 
 static void *alignup(void *p)
 {
@@ -48,29 +46,37 @@
 {
 	char *src = alignup(sbuf);
 	char *s = src + align;
-	char *f = seekpos != -1 ? s + seekpos : s + len - 1;
+	char *f = seekpos != -1 ? s + seekpos : s + len;
 	int seekchar = 0x1;
 	void *p;
 
-	if (len > LEN || seekpos >= len - 1 || align >= A)
-		abort();
-	if (seekchar >= 'a' && seekchar <= 'a' + 23)
+	if (err_count >= ERR_LIMIT)
+		return;
+	if (len > LEN || seekpos >= len || align >= A)
 		abort();
 
-	for (int i = 0; i < len + A; i++)
-		src[i] = '?';
-	for (int i = 0; i < len - 2; i++)
-		s[i] = 'a' + i%23;
+	for (int i = 0; src + i < s; i++)
+		src[i] = i & 1 ? seekchar : 0;
+	for (int i = 1; i < A; i++)
+		s[len+i] = i & 1 ? seekchar : 0;
+	for (int i = 0; i < len; i++)
+		s[i] = 'a' + i%32;
 	if (seekpos != -1)
-		s[seekpos] = seekchar;
-	s[len - 1] = '\0';
+		s[seekpos] = s[seekpos+2] = seekchar;
+	s[len] = '\0';
 
 	p = fun->fun(s, seekchar);
-
 	if (p != f) {
-		ERR("%s(%p,0x%02x,%d) returned %p\n", fun->name, s, seekchar, len, p);
-		ERR("expected: %p\n", f);
-		abort();
+		ERR("%s(%p,0x%02x) len %d returned %p, expected %p pos %d\n",
+			fun->name, s, seekchar, len, p, f, seekpos);
+		quote("input", s, len);
+	}
+
+	p = fun->fun(s, 0);
+	if (p != s + len) {
+		ERR("%s(%p,0x%02x) len %d returned %p, expected %p pos %d\n",
+			fun->name, s, seekchar, len, p, s + len, len);
+		quote("input", s, len);
 	}
 }
 
@@ -78,21 +84,17 @@
 {
 	int r = 0;
 	for (int i=0; funtab[i].name; i++) {
-		test_status = 0;
+		err_count = 0;
 		for (int a = 0; a < A; a++) {
 			int n;
-			for (n = 1; n < 100; n++) {
-				for (int sp = 0; sp < n - 1; sp++)
+			for (n = 1; n < LEN; n++) {
+				for (int sp = 0; sp < n; sp++)
 					test(funtab+i, a, sp, n);
 				test(funtab+i, a, -1, n);
 			}
-			for (; n < LEN; n *= 2) {
-				test(funtab+i, a, -1, n);
-				test(funtab+i, a, n / 2, n);
-			}
 		}
-		printf("%s %s\n", test_status ? "FAIL" : "PASS", funtab[i].name);
-		if (test_status)
+		printf("%s %s\n", err_count ? "FAIL" : "PASS", funtab[i].name);
+		if (err_count)
 			r = -1;
 	}
 	return r;
diff --git a/string/test/strcmp.c b/string/test/strcmp.c
index 91fa9dd..078fb1b 100644
--- a/string/test/strcmp.c
+++ b/string/test/strcmp.c
@@ -10,6 +10,7 @@
 #include <stdlib.h>
 #include <string.h>
 #include "stringlib.h"
+#include "stringtest.h"
 
 static const struct fun
 {
@@ -20,6 +21,7 @@
 F(strcmp)
 #if __aarch64__
 F(__strcmp_aarch64)
+F(__strcmp_aarch64_mte)
 # if __ARM_FEATURE_SVE
 F(__strcmp_aarch64_sve)
 # endif
@@ -34,20 +36,17 @@
 	{0, 0}
 };
 
-static int test_status;
-#define ERR(...) (test_status=1, printf(__VA_ARGS__))
-
 #define A 32
 #define LEN 250000
-static char s1buf[LEN+2*A];
-static char s2buf[LEN+2*A];
+static char s1buf[LEN+2*A+1];
+static char s2buf[LEN+2*A+1];
 
 static void *alignup(void *p)
 {
 	return (void*)(((uintptr_t)p + A-1) & -A);
 }
 
-static void test(const struct fun *fun, int s1align, int s2align, int len, int diffpos)
+static void test(const struct fun *fun, int s1align, int s2align, int len, int diffpos, int delta)
 {
 	char *src1 = alignup(s1buf);
 	char *src2 = alignup(s2buf);
@@ -55,26 +54,30 @@
 	char *s2 = src2 + s2align;
 	int r;
 
+	if (err_count >= ERR_LIMIT)
+		return;
 	if (len > LEN || s1align >= A || s2align >= A)
 		abort();
-	if (diffpos > 1 && diffpos >= len-1)
+	if (diffpos >= len)
+		abort();
+	if ((diffpos < 0) != (delta == 0))
 		abort();
 
 	for (int i = 0; i < len+A; i++)
 		src1[i] = src2[i] = '?';
-	for (int i = 0; i < len-1; i++)
+	for (int i = 0; i < len; i++)
 		s1[i] = s2[i] = 'a' + i%23;
-	if (diffpos > 1)
-		s1[diffpos]++;
+	if (delta)
+		s1[diffpos] += delta;
 	s1[len] = s2[len] = '\0';
 
 	r = fun->fun(s1, s2);
 
-	if (((diffpos <= 1) && r != 0) || (diffpos > 1 && r == 0)) {
+	if ((delta == 0 && r != 0) || (delta > 0 && r <= 0) || (delta < 0 && r >= 0)) {
 		ERR("%s(align %d, align %d, %d) failed, returned %d\n",
 			fun->name, s1align, s2align, len, r);
-		ERR("src1: %.*s\n", s1align+len+1, src1);
-		ERR("src2: %.*s\n", s2align+len+1, src2);
+		quoteat("src1", src1, len+A, diffpos);
+		quoteat("src2", src2, len+A, diffpos);
 	}
 }
 
@@ -82,21 +85,26 @@
 {
 	int r = 0;
 	for (int i=0; funtab[i].name; i++) {
-		test_status = 0;
+		err_count = 0;
 		for (int d = 0; d < A; d++)
 			for (int s = 0; s < A; s++) {
 				int n;
-				for (n = 0; n < 100; n++) {
-					test(funtab+i, d, s, n, 0);
-					test(funtab+i, d, s, n, n / 2);
+				test(funtab+i, d, s, 0, -1, 0);
+				test(funtab+i, d, s, 1, -1, 0);
+				test(funtab+i, d, s, 1,  0, 1);
+				test(funtab+i, d, s, 1,  0, -1);
+				for (n = 2; n < 100; n++) {
+					test(funtab+i, d, s, n, -1, 0);
+					test(funtab+i, d, s, n, n - 1, -1);
+					test(funtab+i, d, s, n, n / 2, 1);
 				}
 				for (; n < LEN; n *= 2) {
-					test(funtab+i, d, s, n, 0);
-					test(funtab+i, d, s, n, n / 2);
+					test(funtab+i, d, s, n, -1, 0);
+					test(funtab+i, d, s, n, n / 2, -1);
 				}
 			}
-		printf("%s %s\n", test_status ? "FAIL" : "PASS", funtab[i].name);
-		if (test_status)
+		printf("%s %s\n", err_count ? "FAIL" : "PASS", funtab[i].name);
+		if (err_count)
 			r = -1;
 	}
 	return r;
diff --git a/string/test/strcpy.c b/string/test/strcpy.c
index ea74c9e..68fc76f 100644
--- a/string/test/strcpy.c
+++ b/string/test/strcpy.c
@@ -10,6 +10,7 @@
 #include <stdlib.h>
 #include <string.h>
 #include "stringlib.h"
+#include "stringtest.h"
 
 static const struct fun
 {
@@ -30,14 +31,11 @@
 	{0, 0}
 };
 
-static int test_status;
-#define ERR(...) (test_status=1, printf(__VA_ARGS__))
-
 #define A 32
 #define LEN 250000
-static char dbuf[LEN+2*A];
-static char sbuf[LEN+2*A];
-static char wbuf[LEN+2*A];
+static char dbuf[LEN+2*A+1];
+static char sbuf[LEN+2*A+1];
+static char wbuf[LEN+2*A+1];
 
 static void *alignup(void *p)
 {
@@ -55,6 +53,8 @@
 	void *p;
 	int i;
 
+	if (err_count >= ERR_LIMIT)
+		return;
 	if (len > LEN || dalign >= A || salign >= A)
 		abort();
 	for (i = 0; i < len+A; i++) {
@@ -63,7 +63,7 @@
 	}
 	for (i = 0; i < len; i++)
 		s[i] = w[i] = 'a' + i%23;
-	s[i] = w[i] = '\0';
+	s[len] = w[len] = '\0';
 
 	p = fun->fun(d, s);
 	if (p != d)
@@ -71,8 +71,8 @@
 	for (i = 0; i < len+A; i++) {
 		if (dst[i] != want[i]) {
 			ERR("%s(align %d, align %d, %d) failed\n", fun->name, dalign, salign, len);
-			ERR("got : %.*s\n", dalign+len+1, dst);
-			ERR("want: %.*s\n", dalign+len+1, want);
+			quoteat("got", dst, len+A, i);
+			quoteat("want", want, len+A, i);
 			break;
 		}
 	}
@@ -82,7 +82,7 @@
 {
 	int r = 0;
 	for (int i=0; funtab[i].name; i++) {
-		test_status = 0;
+		err_count = 0;
 		for (int d = 0; d < A; d++)
 			for (int s = 0; s < A; s++) {
 				int n;
@@ -91,8 +91,8 @@
 				for (; n < LEN; n *= 2)
 					test(funtab+i, d, s, n);
 			}
-		printf("%s %s\n", test_status ? "FAIL" : "PASS", funtab[i].name);
-		if (test_status)
+		printf("%s %s\n", err_count ? "FAIL" : "PASS", funtab[i].name);
+		if (err_count)
 			r = -1;
 	}
 	return r;
diff --git a/string/test/stringtest.h b/string/test/stringtest.h
new file mode 100644
index 0000000..b9c034a
--- /dev/null
+++ b/string/test/stringtest.h
@@ -0,0 +1,50 @@
+/*
+ * Common string test code.
+ *
+ * Copyright (c) 2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#include <ctype.h>
+#include <stdio.h>
+
+/* Error accounting for a test case.  */
+static int err_count;
+#define ERR_LIMIT 10
+#define ERR(...) (err_count++, printf(__VA_ARGS__))
+
+static inline void quotechar(unsigned char c)
+{
+	if (isprint(c))
+		putchar(c);
+	else
+		printf("\\x%02x", c);
+}
+
+/* Print a quoted window around offset at, or the entire string if at < 0.  */
+static void quoteat(const char *prefix, const void *p, int len, int at)
+{
+	static const int CTXLEN = 15;
+	int i;
+	const char *pre="\"";
+	const char *post="\"";
+	const char *s = p;
+	if (at > CTXLEN) {
+		s += at - CTXLEN;
+		len -= at - CTXLEN;
+		pre = "...\"";
+	}
+	if (at >= 0 && len > 2*CTXLEN + 1) {
+		len = 2*CTXLEN + 1;
+		post = "\"...";
+	}
+	printf("%4s: %s", prefix, pre);
+	for (i = 0; i < len; i++)
+		quotechar(s[i]);
+	printf("%s\n", post);
+}
+
+static inline void quote(const char *prefix, const void *p, int len)
+{
+	quoteat(prefix, p, len, -1);
+}
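
A usage sketch of the new helpers (a hypothetical reporting function; the buffers and offsets are illustrative):

#include "stringtest.h"

static void report(const unsigned char *got, const unsigned char *want,
	int len, int at)
{
	ERR("mismatch at %d\n", at);    /* bumps err_count and prints */
	quoteat("got", got, len, at);   /* ~31-byte escaped window around at */
	quoteat("want", want, len, at);
	quote("str", got, len);         /* entire buffer, non-printables as \xNN */
}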
diff --git a/string/test/strlen.c b/string/test/strlen.c
index 96e6cd6..b2e2ffa 100644
--- a/string/test/strlen.c
+++ b/string/test/strlen.c
@@ -11,6 +11,7 @@
 #include <string.h>
 #include <limits.h>
 #include "stringlib.h"
+#include "stringtest.h"
 
 static const struct fun
 {
@@ -34,13 +35,10 @@
 	{0, 0}
 };
 
-static int test_status;
-#define ERR(...) (test_status=1, printf(__VA_ARGS__))
-
 #define A 32
 #define SP 512
 #define LEN 250000
-static char sbuf[LEN+2*A];
+static char sbuf[LEN+2*A+1];
 
 static void *alignup(void *p)
 {
@@ -53,21 +51,22 @@
 	char *s = src + align;
 	size_t r;
 
+	if (err_count >= ERR_LIMIT)
+		return;
 	if (len > LEN || align >= A)
 		abort();
 
 	for (int i = 0; i < len + A; i++)
 		src[i] = '?';
-	for (int i = 0; i < len - 2; i++)
+	for (int i = 0; i < len; i++)
 		s[i] = 'a' + i%23;
-	s[len - 1] = '\0';
+	s[len] = '\0';
 
 	r = fun->fun(s);
-	if (r != len-1) {
+	if (r != len) {
 		ERR("%s(%p) returned %zu\n", fun->name, s, r);
-		ERR("input:    %.*s\n", align+len+1, src);
-		ERR("expected: %d\n", len);
-		abort();
+		quote("input", src, len);
+		printf("expected: %d\n", len);
 	}
 }
 
@@ -75,7 +74,7 @@
 {
 	int r = 0;
 	for (int i=0; funtab[i].name; i++) {
-		test_status = 0;
+		err_count = 0;
 		for (int a = 0; a < A; a++) {
 			int n;
 			for (n = 1; n < 100; n++)
@@ -83,8 +82,8 @@
 			for (; n < LEN; n *= 2)
 				test(funtab+i, a, n);
 		}
-		printf("%s %s\n", test_status ? "FAIL" : "PASS", funtab[i].name);
-		if (test_status)
+		printf("%s %s\n", err_count ? "FAIL" : "PASS", funtab[i].name);
+		if (err_count)
 			r = -1;
 	}
 	return r;
diff --git a/string/test/strncmp.c b/string/test/strncmp.c
index 43f941d..8ed21bd 100644
--- a/string/test/strncmp.c
+++ b/string/test/strncmp.c
@@ -10,6 +10,7 @@
 #include <stdlib.h>
 #include <string.h>
 #include "stringlib.h"
+#include "stringtest.h"
 
 static const struct fun
 {
@@ -20,6 +21,7 @@
 F(strncmp)
 #if __aarch64__
 F(__strncmp_aarch64)
+F(__strncmp_aarch64_mte)
 # if __ARM_FEATURE_SVE
 F(__strncmp_aarch64_sve)
 # endif
@@ -28,20 +30,17 @@
 	{0, 0}
 };
 
-static int test_status;
-#define ERR(...) (test_status=1, printf(__VA_ARGS__))
-
 #define A 32
 #define LEN 250000
-static char s1buf[LEN+2*A];
-static char s2buf[LEN+2*A];
+static char s1buf[LEN+2*A+1];
+static char s2buf[LEN+2*A+1];
 
 static void *alignup(void *p)
 {
 	return (void*)(((uintptr_t)p + A-1) & -A);
 }
 
-static void test(const struct fun *fun, int s1align, int s2align, int maxlen, int diffpos, int len)
+static void test(const struct fun *fun, int s1align, int s2align, int maxlen, int diffpos, int len, int delta)
 {
 	char *src1 = alignup(s1buf);
 	char *src2 = alignup(s2buf);
@@ -49,28 +48,34 @@
 	char *s2 = src2 + s2align;
 	int r;
 
+	if (err_count >= ERR_LIMIT)
+		return;
 	if (len > LEN || s1align >= A || s2align >= A)
 		abort();
-	if (diffpos > 1 && diffpos >= len-1)
+	if (diffpos >= len)
+		abort();
+	if ((diffpos < 0) != (delta == 0))
 		abort();
 
 	for (int i = 0; i < len+A; i++)
 		src1[i] = src2[i] = '?';
-	for (int i = 0; i < len-1; i++)
+	for (int i = 0; i < len; i++)
 		s1[i] = s2[i] = 'a' + i%23;
-	if (diffpos > 1)
-		s1[diffpos]++;
+	if (delta)
+		s1[diffpos] += delta;
 	s1[len] = s2[len] = '\0';
 
 	r = fun->fun(s1, s2, maxlen);
 
-	diffpos = maxlen <= diffpos ? 0 : diffpos;
-
-	if (((diffpos <= 1) && r != 0) || (diffpos > 1 && r == 0)) {
-		ERR("%s(align %d, align %d, %d (%d)) failed, returned %d (%d)\n",
-			fun->name, s1align, s2align, maxlen, len, r, diffpos);
-		ERR("src1: %.*s\n", s1align+len+1, src1);
-		ERR("src2: %.*s\n", s2align+len+1, src2);
+	if (diffpos >= maxlen) {
+		diffpos = -1;
+		delta = 0;
+	}
+	if ((delta == 0 && r != 0) || (delta > 0 && r <= 0) || (delta < 0 && r >= 0)) {
+		ERR("%s(align %d, align %d, %d) (len=%d, diffpos=%d) failed, returned %d\n",
+			fun->name, s1align, s2align, maxlen, len, diffpos, r);
+		quoteat("src1", src1, len+A, diffpos);
+		quoteat("src2", src2, len+A, diffpos);
 	}
 }
 
@@ -78,25 +83,32 @@
 {
 	int r = 0;
 	for (int i=0; funtab[i].name; i++) {
-		test_status = 0;
+		err_count = 0;
 		for (int d = 0; d < A; d++)
 			for (int s = 0; s < A; s++) {
 				int n;
-				for (n = 0; n < 100; n++) {
-					test(funtab+i, d, s, n,   0,   n);
-					test(funtab+i, d, s, n,   n/2, n);
-					test(funtab+i, d, s, n/2, 0,   n);
-					test(funtab+i, d, s, n/2, n/2, n);
+				test(funtab+i, d, s, 0,   -1,  0, 0);
+				test(funtab+i, d, s, 1,   -1,  0, 0);
+				test(funtab+i, d, s, 0,   -1,  1, 0);
+				test(funtab+i, d, s, 1,   -1,  1, 0);
+				test(funtab+i, d, s, 2,   -1,  1, 0);
+				test(funtab+i, d, s, 1,    0,  1, 1);
+				test(funtab+i, d, s, 1,    0,  1, -1);
+				for (n = 2; n < 100; n++) {
+					test(funtab+i, d, s, n,   -1,  n, 0);
+					test(funtab+i, d, s, n,   n/2, n, 1);
+					test(funtab+i, d, s, n/2, -1,  n, 0);
+					test(funtab+i, d, s, n/2, n/2, n, -1);
 				}
 				for (; n < LEN; n *= 2) {
-					test(funtab+i, d, s, n,   0,   n);
-					test(funtab+i, d, s, n,   n/2, n);
-					test(funtab+i, d, s, n/2, 0,   n);
-					test(funtab+i, d, s, n/2, n/2, n);
+					test(funtab+i, d, s, n,   -1,  n, 0);
+					test(funtab+i, d, s, n,   n/2, n, -1);
+					test(funtab+i, d, s, n/2, -1,  n, 0);
+					test(funtab+i, d, s, n/2, n/2, n, 1);
 				}
 			}
-		printf("%s %s\n", test_status ? "FAIL" : "PASS", funtab[i].name);
-		if (test_status)
+		printf("%s %s\n", err_count ? "FAIL" : "PASS", funtab[i].name);
+		if (err_count)
 			r = -1;
 	}
 	return r;
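
When the injected difference lies at or beyond maxlen, the expected result is reset to equal, matching strncmp's contract of comparing at most maxlen bytes. A minimal sketch:

#include <assert.h>
#include <string.h>

int main(void)
{
	char a[8] = "abcdef", b[8] = "abcdef";
	a[4] += 1;                        /* difference at position 4 */
	assert(strncmp(a, b, 4) == 0);    /* diff outside the limit */
	assert(strncmp(a, b, 5) > 0);     /* diff inside the limit */
	return 0;
}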
diff --git a/string/test/strnlen.c b/string/test/strnlen.c
index db41f2a..29f85a0 100644
--- a/string/test/strnlen.c
+++ b/string/test/strnlen.c
@@ -13,6 +13,7 @@
 #include <string.h>
 #include <limits.h>
 #include "stringlib.h"
+#include "stringtest.h"
 
 static const struct fun
 {
@@ -31,13 +32,10 @@
 	{0, 0}
 };
 
-static int test_status;
-#define ERR(...) (test_status=1, printf(__VA_ARGS__))
-
 #define A 32
 #define SP 512
 #define LEN 250000
-static char sbuf[LEN+2*A];
+static char sbuf[LEN+2*A+1];
 
 static void *alignup(void *p)
 {
@@ -49,23 +47,24 @@
 	char *src = alignup(sbuf);
 	char *s = src + align;
 	size_t r;
-	size_t e = maxlen < len ? maxlen : len - 1;
+	size_t e = maxlen < len ? maxlen : len;
 
+	if (err_count >= ERR_LIMIT)
+		return;
 	if (len > LEN || align >= A)
 		abort();
 
 	for (int i = 0; i < len + A; i++)
 		src[i] = '?';
-	for (int i = 0; i < len - 2; i++)
+	for (int i = 0; i < len; i++)
 		s[i] = 'a' + i%23;
-	s[len - 1] = '\0';
+	s[len] = '\0';
 
 	r = fun->fun(s, maxlen);
 	if (r != e) {
-		ERR("%s(%p) returned %zu\n", fun->name, s, r);
-		ERR("input:    %.*s\n", align+len+1, src);
-		ERR("expected: %d\n", len);
-		abort();
+		ERR("%s(%p, %d) returned %zu\n", fun->name, s, maxlen, r);
+		quote("input", src, len+A);
+		printf("expected: %zu\n", e);
 	}
 }
 
@@ -73,7 +72,7 @@
 {
 	int r = 0;
 	for (int i=0; funtab[i].name; i++) {
-		test_status = 0;
+		err_count = 0;
 		for (int a = 0; a < A; a++) {
 			int n;
 			for (n = 1; n < 100; n++)
@@ -85,8 +84,8 @@
 				test(funtab+i, a, n/2, n);
 			}
 		}
-		printf("%s %s\n", test_status ? "FAIL" : "PASS", funtab[i].name);
-		if (test_status)
+		printf("%s %s\n", err_count ? "FAIL" : "PASS", funtab[i].name);
+		if (err_count)
 			r = -1;
 	}
 	return r;
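
The expected value is now min(maxlen, len), with the string terminated at s[len] rather than s[len-1]. A minimal sketch (strnlen needs POSIX.1-2008):

#define _POSIX_C_SOURCE 200809L
#include <assert.h>
#include <string.h>

int main(void)
{
	assert(strnlen("abc", 2) == 2);    /* capped by maxlen */
	assert(strnlen("abc", 10) == 3);   /* capped by the NUL */
	return 0;
}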
diff --git a/string/test/strrchr.c b/string/test/strrchr.c
index 532fa51..1ad1f3a 100644
--- a/string/test/strrchr.c
+++ b/string/test/strrchr.c
@@ -11,6 +11,7 @@
 #include <string.h>
 #include <limits.h>
 #include "stringlib.h"
+#include "stringtest.h"
 
 static const struct fun
 {
@@ -21,6 +22,7 @@
 F(strrchr)
 #if __aarch64__
 F(__strrchr_aarch64)
+F(__strrchr_aarch64_mte)
 # if __ARM_FEATURE_SVE
 F(__strrchr_aarch64_sve)
 # endif
@@ -29,13 +31,9 @@
 	{0, 0}
 };
 
-static int test_status;
-#define ERR(...) (test_status=1, printf(__VA_ARGS__))
-
 #define A 32
-#define SP 512
-#define LEN 250000
-static char sbuf[LEN+2*A];
+#define LEN 512
+static char sbuf[LEN+3*A];
 
 static void *alignup(void *p)
 {
@@ -50,25 +48,35 @@
 	int seekchar = 0x1;
 	void *p;
 
-	if (len > LEN || seekpos >= len - 1 || align >= A)
-		abort();
-	if (seekchar >= 'a' && seekchar <= 'a' + 23)
+	if (err_count >= ERR_LIMIT)
+		return;
+	if (len > LEN || seekpos >= len || align >= A)
 		abort();
 
-	for (int i = 0; i < len + A; i++)
-		src[i] = '?';
-	for (int i = 0; i < len - 2; i++)
-		s[i] = 'a' + i%23;
-	if (seekpos != -1)
+	for (int i = 0; src + i < s; i++)
+		src[i] = i & 1 ? seekchar : 0;
+	for (int i = 1; i < A; i++)
+		s[len+i] = i & 1 ? seekchar : 0;
+	for (int i = 0; i < len; i++)
+		s[i] = 'a' + i%32;
+	if (seekpos != -1) {
 		s[seekpos/2] = s[seekpos] = seekchar;
-	s[len - 1] = '\0';
+		s[seekpos - (seekpos & 15)] = s[seekpos & 7] = seekchar;
+	}
+	s[len] = '\0';
 
 	p = fun->fun(s, seekchar);
-
 	if (p != f) {
-		ERR("%s(%p,0x%02x,%d) returned %p\n", fun->name, s, seekchar, len, p);
-		ERR("expected: %p\n", f);
-		abort();
+		ERR("%s(%p,0x%02x) len %d returned %p, expected %p pos %d\n",
+			fun->name, s, seekchar, len, p, f, seekpos);
+		quote("input", s, len);
+	}
+
+	p = fun->fun(s, 0);
+	if (p != s + len) {
+		ERR("%s(%p,0x%02x) len %d returned %p, expected %p pos %d\n",
+			fun->name, s, seekchar, len, p, s + len, len);
+		quote("input", s, len);
 	}
 }
 
@@ -76,21 +84,17 @@
 {
 	int r = 0;
 	for (int i=0; funtab[i].name; i++) {
-		test_status = 0;
+		err_count = 0;
 		for (int a = 0; a < A; a++) {
 			int n;
-			for (n = 1; n < 100; n++) {
-				for (int sp = 0; sp < n - 1; sp++)
+			for (n = 1; n < LEN; n++) {
+				for (int sp = 0; sp < n; sp++)
 					test(funtab+i, a, sp, n);
 				test(funtab+i, a, -1, n);
 			}
-			for (; n < LEN; n *= 2) {
-				test(funtab+i, a, -1, n);
-				test(funtab+i, a, n / 2, n);
-			}
 		}
-		printf("%s %s\n", test_status ? "FAIL" : "PASS", funtab[i].name);
-		if (test_status)
+		printf("%s %s\n", err_count ? "FAIL" : "PASS", funtab[i].name);
+		if (err_count)
 			r = -1;
 	}
 	return r;
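
The strrchr test now plants the seekchar at several earlier positions as well, so only the final occurrence at seekpos is the correct answer, and it also checks that searching for NUL returns the terminator. A minimal sketch:

#include <assert.h>
#include <string.h>

int main(void)
{
	char buf[] = "aXbXc";
	assert(strrchr(buf, 'X') == &buf[3]);   /* last occurrence wins */
	assert(strrchr(buf, '\0') == buf + 5);  /* terminator is part of the string */
	return 0;
}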