This is the mail archive of the libc-alpha@sourceware.org mailing list for the glibc project.

Index Nav:	[Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav:	[Date Prev] [Date Next]	[Thread Prev] [Thread Next]
Other format:	[Raw text]

[PATCH v2] aarch64: Optimized implementation of strcpy

From: Xuelei Zhang <zhangxuelei4 at huawei dot com>
To: <libc-alpha at sourceware dot org>, <siddhesh at gotplt dot org>, <Szabolcs dot Nagy at arm dot com>, <Wilco dot Dijkstra at arm dot com>, <jiangyikun at huawei dot com>, <yikunkero at gmail dot com>
Date: Tue, 22 Oct 2019 17:39:30 +0800
Subject: [PATCH v2] aarch64: Optimized implementation of strcpy

Optimize the strcpy implementation by using vector loads and operations
in the main loop.  Compared to aarch64/strcpy.S, it reduces latency of cases
in bench-strcpy by 5%~18% when the length of src is greater than 64
bytes, with gains throughout the benchmark.
---
 sysdeps/aarch64/strcpy.S | 59 ++++++++++++++++++++++--------------------------
 1 file changed, 27 insertions(+), 32 deletions(-)

diff --git a/sysdeps/aarch64/strcpy.S b/sysdeps/aarch64/strcpy.S
index edc16252f68..290bcf8d236 100644
--- a/sysdeps/aarch64/strcpy.S
+++ b/sysdeps/aarch64/strcpy.S
@@ -53,6 +53,12 @@
 #define len		x16
 #define to_align	x17
 
+/* NEON register */
+#define dataq		q2
+#define datav		v2
+#define datab2		b3
+#define datav2		v3
+
 #ifdef BUILD_STPCPY
 #define STRCPY __stpcpy
 #else
@@ -199,7 +205,6 @@ L(fp_lt2):
 #endif
 	ret
 
-	.p2align 6
 	/* Aligning here ensures that the entry code and main loop all lies
 	   within one 64-byte cache line.  */
 L(bulk_entry):
@@ -214,46 +219,36 @@ L(bulk_entry):
 	   especially on cores with a high number of issue slots per
 	   cycle, as we get much better parallelism out of the operations.  */
 L(main_loop):
-	stp	data1, data2, [dst], #16
+	str	dataq, [dst], #16
 L(entry_no_page_cross):
-	ldp	data1, data2, [src], #16
-	sub	tmp1, data1, zeroones
-	orr	tmp2, data1, #REP8_7f
-	sub	tmp3, data2, zeroones
-	orr	tmp4, data2, #REP8_7f
-	bic	has_nul1, tmp1, tmp2
-	bics	has_nul2, tmp3, tmp4
-	ccmp	has_nul1, #0, #0, eq	/* NZCV = 0000  */
-	b.eq	L(main_loop)
+	ldr	dataq, [src], #16
+	uminv	datab2, datav.16b
+	mov	tmp3, datav2.d[0]
+	cbnz	tmp3, L(main_loop)
 
 	/* Since we know we are copying at least 16 bytes, the fastest way
 	   to deal with the tail is to determine the location of the
 	   trailing NUL, then (re)copy the 16 bytes leading up to that.  */
-	cmp	has_nul1, #0
 #ifdef __AARCH64EB__
-	/* For big-endian, carry propagation (if the final byte in the
-	   string is 0x01) means we cannot use has_nul directly.  The
-	   easiest way to get the correct byte is to byte-swap the data
-	   and calculate the syndrome a second time.  */
+	rev64	datav.16b, datav.16b
+#endif
+	/* Calculate the location of the trailing NUL.  */
+	cmeq	datav.16b, datav.16b, #0
+	mov	data1, datav.d[0]
+	mov	data2, datav.d[1]
+	cmp	data1, 0
 	csel	data1, data1, data2, ne
+	mov	pos, 8
 	rev	data1, data1
-	sub	tmp1, data1, zeroones
-	orr	tmp2, data1, #REP8_7f
-	bic	has_nul1, tmp1, tmp2
-#else
-	csel	has_nul1, has_nul1, has_nul2, ne
-#endif
-	rev	has_nul1, has_nul1
-	clz	pos, has_nul1
-	add	tmp1, pos, #72
-	add	pos, pos, #8
-	csel	pos, pos, tmp1, ne
-	add	src, src, pos, lsr #3
-	add	dst, dst, pos, lsr #3
-	ldp	data1, data2, [src, #-32]
-	stp	data1, data2, [dst, #-16]
+	clz	tmp1, data1
+	csel	pos, xzr, pos, ne
+	add	pos, pos, tmp1, lsr 3
+	add	src, src, pos
+	add	dst, dst, pos
+	ldr	dataq,[src, #-31]
+	str	dataq,[dst, #-15]
 #ifdef BUILD_STPCPY
-	sub	dstin, dst, #1
+	mov	dstin, dst
 #endif
 	ret
 
-- 
2.14.1.windows.1

Follow-Ups:
- Re: [PATCH v2] aarch64: Optimized implementation of strcpy
  - From: Wilco Dijkstra

Index Nav:	[Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav:	[Date Prev] [Date Next]	[Thread Prev] [Thread Next]