[PATCH, NEWLIB/ARM] Optimise memchr for NEON-enabled processors

Prakhar Bahuguna prakhar.bahuguna@arm.com
Wed Apr 5 12:39:00 GMT 2017


This patch provides an optimised implementation of memchr that uses NEON
instructions to improve performance, especially over longer search regions.
In benchmarks it was at least twice as fast as the existing Thumb2+DSP
optimised code, and up to four times as fast for larger inputs. The NEON
code also wins when the input is small (fewer than 8 bytes) by falling back
to a simple byte-by-byte search, which avoids the overhead of filling two
quadword registers from memory.
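
For reference, the syndrome scheme the assembly uses can be sketched in
plain C roughly as follows. This is an illustrative model only: the
function name is invented, and the real code vectorises the comparison
with NEON and keeps its 32-byte loads aligned.

#include <stddef.h>
#include <stdint.h>

void *
memchr_model (const void *s, int c, size_t n)
{
  const unsigned char *p = s;
  unsigned char ch = c;

  while (n >= 32)
    {
      uint32_t synd = 0;
      /* One syndrome bit per byte of the chunk, set on a match
	 (vceq + vand + vpadd in the NEON code).  */
      for (int i = 0; i < 32; i++)
	synd |= (uint32_t) (p[i] == ch) << i;
      /* rbit + clz in the NEON code computes this trailing zero count.  */
      if (synd != 0)
	return (void *) (p + __builtin_ctz (synd));
      p += 32;
      n -= 32;
    }
  /* Plain byte-by-byte search, as .Lsmallstr does for short inputs.  */
  for (; n > 0; n--, p++)
    if (*p == ch)
      return (void *) p;
  return NULL;
}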

newlib/ChangeLog:

2017-01-26  Prakhar Bahuguna  <prakhar.bahuguna@arm.com>

	* newlib/libc/machine/arm/memchr-stub.c: Add __ARM_NEON__ ifdef.
	Change if to elif.
	* newlib/libc/machine/arm/memchr.S: Add __ARM_NEON__ ifdef.
	Add NEON-optimised memchr implementation.
	Change if to elif.

Testing done: Ran regression tests for arm-none-eabi, and benchmark tests on
hardware.

Okay for master?

--

Prakhar Bahuguna
-------------- next part --------------
From 52e23fde584c161989ffaf8ea5b6b02edbe23592 Mon Sep 17 00:00:00 2001
From: Prakhar Bahuguna <prakhar.bahuguna@arm.com>
Date: Thu, 26 Jan 2017 10:06:10 +0000
Subject: [PATCH] Optimise memchr for NEON-enabled processors

---
 newlib/libc/machine/arm/memchr-stub.c |   4 +-
 newlib/libc/machine/arm/memchr.S      | 183 +++++++++++++++++++++++++++++++++-
 2 files changed, 185 insertions(+), 2 deletions(-)

diff --git a/newlib/libc/machine/arm/memchr-stub.c b/newlib/libc/machine/arm/memchr-stub.c
index 21ffbbd96..5c7881b9c 100644
--- a/newlib/libc/machine/arm/memchr-stub.c
+++ b/newlib/libc/machine/arm/memchr-stub.c
@@ -29,7 +29,9 @@
 
 #include "acle-compat.h"
 
-#if __ARM_ARCH_ISA_THUMB >= 2 && defined (__ARM_FEATURE_DSP)
+#if defined (__ARM_NEON__) || defined (__ARM_NEON)
+/* Defined in memchr.S.  */
+#elif __ARM_ARCH_ISA_THUMB >= 2 && defined (__ARM_FEATURE_DSP)
 /* Defined in memchr.S.  */
 #else
 # include "../../string/memchr.c"
diff --git a/newlib/libc/machine/arm/memchr.S b/newlib/libc/machine/arm/memchr.S
index ef2d6d08b..b5dcf83c0 100644
--- a/newlib/libc/machine/arm/memchr.S
+++ b/newlib/libc/machine/arm/memchr.S
@@ -78,7 +78,188 @@
 #include "acle-compat.h"
 
 @ NOTE: This ifdef MUST match the one in memchr-stub.c
-#if __ARM_ARCH_ISA_THUMB >= 2 && defined (__ARM_FEATURE_DSP)
+#if defined (__ARM_NEON__) || defined (__ARM_NEON)
+	.arch	armv7-a
+	.fpu	neon
+
+
+/* Arguments */
+#define srcin		r0
+#define chrin		r1
+#define cntin		r2
+
+/* Retval */
+#define result		r0	/* Live range does not overlap with srcin */
+
+/* Working registers */
+#define src		r1	/* Live range does not overlap with chrin */
+#define tmp		r3
+#define synd		r0	/* No overlap with srcin or result */
+#define soff		r12
+
+/* Working NEON registers */
+#define vrepchr		q0
+#define vdata0		q1
+#define vdata0_0	d2	/* Lower half of vdata0 */
+#define vdata0_1	d3	/* Upper half of vdata0 */
+#define vdata1		q2
+#define vdata1_0	d4	/* Lower half of vdata1 */
+#define vdata1_1	d5	/* Upper half of vdata1 */
+#define vrepmask	q3
+#define vrepmask0	d6
+#define vrepmask1	d7
+#define vend		q4
+#define vend0		d8
+#define vend1		d9
+
+/*
+ * Core algorithm:
+ *
+ * For each 32-byte chunk we calculate a 32-bit syndrome value, with one bit per
+ * byte. Each bit is set if the relevant byte matched the requested character
+ * and cleared otherwise. Since the bits in the syndrome reflect exactly the
+ * order in which the bytes occur in the original string, counting trailing
+ * zeros allows us to identify exactly which byte has matched.
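+ *
+ * For example, matches at byte offsets 2 and 17 of a chunk produce the
+ * syndrome 0x00020004, and its trailing zero count, 2, is the offset of
+ * the first match.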
+ */
+
+	.text
+	.thumb_func
+	.align 4
+	.p2align 4,,15
+	.global memchr
+	.type memchr,%function
+
+memchr:
+	.cfi_sections .debug_frame
+	.cfi_startproc
+	/* Use a simple loop if there are fewer than 8 bytes to search.  */
+	cmp	cntin, #7
+	bhi	.Llargestr
+
+.Lsmallstr:
+	subs	cntin, cntin, #1
+	blt	.Lnotfound	/* Return not found if reached end.  */
+	ldrb	tmp, [srcin], #1
+	cmp	tmp, chrin
+	bne	.Lsmallstr	/* Loop again if not found.  */
+	/* Otherwise fixup address and return.  */
+	sub	result, result, #1
+	bx	lr
+
+
+.Llargestr:
+	vdup.8	vrepchr, chrin	/* Duplicate char across all lanes. */
+	/*
+	 * Magic constant 0x8040201008040201 allows us to identify which lane
+	 * matches the requested byte.
+	 */
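+	/*
+	 * Each byte of the constant has exactly one bit set (0x01 up to
+	 * 0x80, repeated in each half), so after ANDing it with the
+	 * 0xff/0x00 comparison results, successive pairwise byte additions
+	 * (vpadd) fold the 32 bytes into a 32-bit syndrome, one bit per
+	 * input byte, without the sums carrying into neighbouring lanes.
+	 */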
+	movw	tmp, #0x0201
+	movt	tmp, #0x0804
+	lsl	soff, tmp, #4
+	vmov	vrepmask0, tmp, soff
+	vmov	vrepmask1, tmp, soff
+	/* Work with aligned 32-byte chunks */
+	bic	src, srcin, #31
+	ands	soff, srcin, #31
+	beq	.Lloopintro	/* Go straight to main loop if it's aligned. */
+
+	/*
+	 * Input string is not 32-byte aligned. We calculate the syndrome
+	 * value for the aligned 32 bytes block containing the first bytes
+	 * and mask the irrelevant part.
+	 */
+	vld1.8		{vdata0, vdata1}, [src:256]!
+	sub		tmp, soff, #32
+	adds		cntin, cntin, tmp
+	vceq.i8		vdata0, vdata0, vrepchr
+	vceq.i8		vdata1, vdata1, vrepchr
+	vand		vdata0, vdata0, vrepmask
+	vand		vdata1, vdata1, vrepmask
+	vpadd.i8	vdata0_0, vdata0_0, vdata0_1
+	vpadd.i8	vdata1_0, vdata1_0, vdata1_1
+	vpadd.i8	vdata0_0, vdata0_0, vdata1_0
+	vpadd.i8	vdata0_0, vdata0_0, vdata0_0
+	vmov		synd, vdata0_0[0]
+
+	/* Clear the soff lower bits */
+	lsr		synd, synd, soff
+	lsl		synd, synd, soff
+	/* The first block can also be the last: the flags set by the adds
+	   above tell us whether the string ends within it. */
+	bls		.Lmasklast
+	/* Have we found something already? */
+	cbnz		synd, .Ltail
+
+
+.Lloopintro:
+	vpush	{vend}
+	/* 264/265 correspond to d8/d9 for q4 */
+	.cfi_adjust_cfa_offset	16
+	.cfi_rel_offset	264, 0
+	.cfi_rel_offset	265, 8
+	.p2align 3,,7
+.Lloop:
+	vld1.8		{vdata0, vdata1}, [src:256]!
+	subs		cntin, cntin, #32
+	vceq.i8		vdata0, vdata0, vrepchr
+	vceq.i8		vdata1, vdata1, vrepchr
+	/* If we're out of data we finish regardless of the result. */
+	bls		.Lend
+	/* Use a fast check for the termination condition. */
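+	/* OR all 32 comparison bytes down to a single doubleword, move it
+	   into a pair of core registers and OR those together: Z is set
+	   only if no byte in the block matched. */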
+	vorr		vend, vdata0, vdata1
+	vorr		vend0, vend0, vend1
+	vmov		synd, tmp, vend0
+	orrs		synd, synd, tmp
+	/* We're not out of data, loop if we haven't found the character. */
+	beq		.Lloop
+
+.Lend:
+	vpop		{vend}
+	.cfi_adjust_cfa_offset	-16
+	.cfi_restore	264
+	.cfi_restore	265
+
+	/* Termination condition found, let's calculate the syndrome value. */
+	vand		vdata0, vdata0, vrepmask
+	vand		vdata1, vdata1, vrepmask
+	vpadd.i8	vdata0_0, vdata0_0, vdata0_1
+	vpadd.i8	vdata1_0, vdata1_0, vdata1_1
+	vpadd.i8	vdata0_0, vdata0_0, vdata1_0
+	vpadd.i8	vdata0_0, vdata0_0, vdata0_0
+	vmov		synd, vdata0_0[0]
+	cbz		synd, .Lnotfound
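+	/* The hi condition still reflects the count check in the main loop:
+	   if input remains, the match is genuine; otherwise the block may
+	   run past the end of the string, so mask out-of-bounds bits. */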
+	bhi		.Ltail
+
+
+.Lmasklast:
+	/* Clear the (-cntin) upper bits to avoid out-of-bounds matches. */
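+	/* E.g. if only 29 bytes of this block lie inside the string, cntin
+	   is -3 at this point, and the shifts below clear syndrome bits
+	   29 to 31. */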
+	neg	cntin, cntin
+	lsl	synd, synd, cntin
+	lsrs	synd, synd, cntin
+	it	eq
+	moveq	src, #0	/* If no match, set src to 0 so the retval is 0. */
+
+
+.Ltail:
+	/* Count the trailing zeros using bit reversing */
+	rbit	synd, synd
+	/* Compensate for the last post-increment */
+	sub	src, src, #32
+	/* Count the leading zeros */
+	clz	synd, synd
+	/* Compute the potential result and return */
+	add	result, src, synd
+	bx	lr
+
+
+.Lnotfound:
+	/* Set result to NULL if not found and return */
+	mov	result, #0
+	bx	lr
+
+	.cfi_endproc
+	.size	memchr, . - memchr
+
+#elif __ARM_ARCH_ISA_THUMB >= 2 && defined (__ARM_FEATURE_DSP)
 
 #if __ARM_ARCH_PROFILE == 'M'
        .arch armv7e-m
-- 
2.11.0


