diff --git a/sysdeps/aarch64/strchr.S b/sysdeps/aarch64/strchr.S
index 4a75e73945..fd1b941666 100644
--- a/sysdeps/aarch64/strchr.S
+++ b/sysdeps/aarch64/strchr.S
@@ -22,118 +22,98 @@
 /* Assumptions:
  *
- * ARMv8-a, AArch64
+ * ARMv8-a, AArch64, Advanced SIMD.
+ * MTE compatible.
  */
 
-/* Arguments and results.  */
 #define srcin		x0
 #define chrin		w1
-
 #define result		x0
 
 #define src		x2
-#define tmp1		x3
-#define wtmp2		w4
-#define tmp3		x5
+#define tmp1		x1
+#define wtmp2		w3
+#define tmp3		x3
 
 #define vrepchr		v0
-#define vdata1		v1
-#define vdata2		v2
-#define vhas_nul1	v3
-#define vhas_nul2	v4
-#define vhas_chr1	v5
-#define vhas_chr2	v6
-#define vrepmask_0	v7
-#define vrepmask_c	v16
-#define vend1		v17
-#define vend2		v18
-
-	/* Core algorithm.
-	   For each 32-byte hunk we calculate a 64-bit syndrome value, with
-	   two bits per byte (LSB is always in bits 0 and 1, for both big
-	   and little-endian systems).  Bit 0 is set iff the relevant byte
-	   matched the requested character.  Bit 1 is set iff the
-	   relevant byte matched the NUL end of string (we trigger off bit0
-	   for the special case of looking for NUL).  Since the bits
-	   in the syndrome reflect exactly the order in which things occur
-	   in the original string a count_trailing_zeros() operation will
-	   identify exactly which byte is causing the termination, and why.  */
-
-/* Locals and temporaries.  */
+#define vdata		v1
+#define qdata		q1
+#define vhas_nul	v2
+#define vhas_chr	v3
+#define vrepmask	v4
+#define vrepmask2	v5
+#define vend		v6
+#define dend		d6
+
+/* Core algorithm.
+
+   For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
+   per byte.  For even bytes, bits 0-1 are set if the relevant byte matched the
+   requested character, bits 2-3 are set if the byte is NUL (or matched), and
+   bits 4-7 are not used and must be zero if none of bits 0-3 are set.  Odd
+   bytes set bits 4-7 so that adjacent bytes can be merged.  Since the bits
+   in the syndrome reflect the order in which things occur in the original
+   string, counting trailing zeros identifies exactly which byte matched.  */
 
 ENTRY (strchr)
 	DELOUSE (0)
-	mov	wtmp2, #0x0401
-	movk	wtmp2, #0x4010, lsl #16
+	bic	src, srcin, 15
 	dup	vrepchr.16b, chrin
-	bic	src, srcin, #31
-	dup	vrepmask_c.4s, wtmp2
-	ands	tmp1, srcin, #31
-	add	vrepmask_0.4s, vrepmask_c.4s, vrepmask_c.4s	// lsl #1
-	b.eq	L(loop)
-
-	/* Input string is not 32-byte aligned.  Rather than forcing
-	   the padding bytes to a safe value, we calculate the syndrome
-	   for all the bytes, but then mask off those bits of the
-	   syndrome that are related to the padding.  */
-	ld1	{vdata1.16b, vdata2.16b}, [src], #32
-	neg	tmp1, tmp1
-	cmeq	vhas_nul1.16b, vdata1.16b, #0
-	cmeq	vhas_chr1.16b, vdata1.16b, vrepchr.16b
-	cmeq	vhas_nul2.16b, vdata2.16b, #0
-	cmeq	vhas_chr2.16b, vdata2.16b, vrepchr.16b
-	and	vhas_nul1.16b, vhas_nul1.16b, vrepmask_0.16b
-	and	vhas_nul2.16b, vhas_nul2.16b, vrepmask_0.16b
-	and	vhas_chr1.16b, vhas_chr1.16b, vrepmask_c.16b
-	and	vhas_chr2.16b, vhas_chr2.16b, vrepmask_c.16b
-	orr	vend1.16b, vhas_nul1.16b, vhas_chr1.16b
-	orr	vend2.16b, vhas_nul2.16b, vhas_chr2.16b
-	lsl	tmp1, tmp1, #1
-	addp	vend1.16b, vend1.16b, vend2.16b		// 256->128
-	mov	tmp3, #~0
-	addp	vend1.16b, vend1.16b, vend2.16b		// 128->64
-	lsr	tmp1, tmp3, tmp1
-
-	mov	tmp3, vend1.2d[0]
-	bic	tmp1, tmp3, tmp1	// Mask padding bits.
-	cbnz	tmp1, L(tail)
+	ld1	{vdata.16b}, [src]
+	mov	wtmp2, 0x3003
+	dup	vrepmask.8h, wtmp2
+	cmeq	vhas_nul.16b, vdata.16b, 0
+	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
+	mov	wtmp2, 0xf00f
+	dup	vrepmask2.8h, wtmp2
+
+	bit	vhas_nul.16b, vhas_chr.16b, vrepmask.16b
+	and	vhas_nul.16b, vhas_nul.16b, vrepmask2.16b
+	lsl	tmp3, srcin, 2
+	addp	vend.16b, vhas_nul.16b, vhas_nul.16b	/* 128->64 */
+
+	fmov	tmp1, dend
+	lsr	tmp1, tmp1, tmp3
+	cbz	tmp1, L(loop)
+
+	rbit	tmp1, tmp1
+	clz	tmp1, tmp1
+	/* Tmp1 is an even multiple of 2 if the target character was
+	   found first.  Otherwise we've found the end of string.  */
+	tst	tmp1, 2
+	add	result, srcin, tmp1, lsr 2
+	csel	result, result, xzr, eq
+	ret
+
+	.p2align 4
 L(loop):
-	ld1	{vdata1.16b, vdata2.16b}, [src], #32
-	cmeq	vhas_nul1.16b, vdata1.16b, #0
-	cmeq	vhas_chr1.16b, vdata1.16b, vrepchr.16b
-	cmeq	vhas_nul2.16b, vdata2.16b, #0
-	cmeq	vhas_chr2.16b, vdata2.16b, vrepchr.16b
-	/* Use a fast check for the termination condition.  */
-	orr	vend1.16b, vhas_nul1.16b, vhas_chr1.16b
-	orr	vend2.16b, vhas_nul2.16b, vhas_chr2.16b
-	orr	vend1.16b, vend1.16b, vend2.16b
-	addp	vend1.2d, vend1.2d, vend1.2d
-	mov	tmp1, vend1.2d[0]
+	ldr	qdata, [src, 16]!
+	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
+	cmhs	vhas_nul.16b, vhas_chr.16b, vdata.16b
+	umaxp	vend.16b, vhas_nul.16b, vhas_nul.16b
+	fmov	tmp1, dend
 	cbz	tmp1, L(loop)
 
-	/* Termination condition found.  Now need to establish exactly why
-	   we terminated.  */
-	and	vhas_nul1.16b, vhas_nul1.16b, vrepmask_0.16b
-	and	vhas_nul2.16b, vhas_nul2.16b, vrepmask_0.16b
-	and	vhas_chr1.16b, vhas_chr1.16b, vrepmask_c.16b
-	and	vhas_chr2.16b, vhas_chr2.16b, vrepmask_c.16b
-	orr	vend1.16b, vhas_nul1.16b, vhas_chr1.16b
-	orr	vend2.16b, vhas_nul2.16b, vhas_chr2.16b
-	addp	vend1.16b, vend1.16b, vend2.16b		// 256->128
-	addp	vend1.16b, vend1.16b, vend2.16b		// 128->64
-
-	mov	tmp1, vend1.2d[0]
-L(tail):
-	sub	src, src, #32
+#ifdef __AARCH64EB__
+	bif	vhas_nul.16b, vhas_chr.16b, vrepmask.16b
+	and	vhas_nul.16b, vhas_nul.16b, vrepmask2.16b
+	addp	vend.16b, vhas_nul.16b, vhas_nul.16b	/* 128->64 */
+	fmov	tmp1, dend
+#else
+	bit	vhas_nul.16b, vhas_chr.16b, vrepmask.16b
+	and	vhas_nul.16b, vhas_nul.16b, vrepmask2.16b
+	addp	vend.16b, vhas_nul.16b, vhas_nul.16b	/* 128->64 */
+	fmov	tmp1, dend
 	rbit	tmp1, tmp1
+#endif
 	clz	tmp1, tmp1
-	/* Tmp1 is even if the target charager was found first.  Otherwise
-	   we've found the end of string and we weren't looking for NUL.  */
-	tst	tmp1, #1
-	add	result, src, tmp1, lsr #1
+	/* Tmp1 is an even multiple of 2 if the target character was
+	   found first.  Otherwise we've found the end of string.  */
+	tst	tmp1, 2
+	add	result, src, tmp1, lsr 2
 	csel	result, result, xzr, eq
 	ret
+
 END (strchr)
 
 libc_hidden_builtin_def (strchr)
 weak_alias (strchr, index)
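
For readers unfamiliar with the syndrome trick described in the new header comment, below is a rough scalar C model of the encoding.  It is purely illustrative and not part of the patch: the real code builds the syndrome with Advanced SIMD (cmeq/cmhs/bit/and/addp), only ever loads whole aligned 16-byte chunks (which is what makes it MTE safe), and shifts out the bytes before srcin on the first iteration.  The function name strchr_model and the per-byte loop are hypothetical simplifications.

/* Scalar sketch of the 4-bits-per-byte syndrome used by the new strchr.
   Illustrative only: it ignores alignment and may read past the
   terminator, which the real SIMD code carefully avoids.  */
#include <stdint.h>
#include <stddef.h>

static char *
strchr_model (const char *s, int c)
{
  const unsigned char ch = (unsigned char) c;
  const unsigned char *p = (const unsigned char *) s;

  for (;;)
    {
      /* Build a 64-bit syndrome for the next 16 bytes, one nibble per
	 byte, mirroring the vrepmask/vrepmask2 constants in the patch.  */
      uint64_t syndrome = 0;
      for (int i = 0; i < 16; i++)
	{
	  unsigned nibble = 0;
	  if (p[i] == ch)
	    nibble |= 0x3;	/* Bits 0-1: character match.  */
	  if (p[i] == 0 || p[i] == ch)
	    nibble |= 0xc;	/* Bits 2-3: NUL (or match).  */
	  syndrome |= (uint64_t) nibble << (i * 4);
	}

      if (syndrome != 0)
	{
	  /* Counting trailing zeros finds the first interesting byte.
	     A multiple of 4 means the match bit fired first; a remainder
	     of 2 means we hit the terminating NUL before any match.  */
	  int tz = __builtin_ctzll (syndrome);
	  return (tz & 2) ? NULL : (char *) (p + (tz >> 2));
	}
      p += 16;
    }
}

Note how the tz & 2 test and the tz >> 2 byte offset correspond directly to the "tst tmp1, 2" / "add result, src, tmp1, lsr 2" pair in the assembly, and how a NUL search (ch == 0) still returns a pointer to the terminator because the match bits are set as well.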