This is the mail archive of the glibc-cvs@sourceware.org mailing list for the glibc project.
Index Nav:	[Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav:	[Date Prev] [Date Next]	[Thread Prev] [Thread Next]
Other format:	[Raw text]
[glibc/release/2.26/master] [AArch64] Optimized memcmp.

From: Wilco Dijkstra <wilco at sourceware dot org>
To: glibc-cvs at sourceware dot org
Date: 6 Sep 2019 16:26:12 -0000
Subject: [glibc/release/2.26/master] [AArch64] Optimized memcmp.
https://sourceware.org/git/gitweb.cgi?p=glibc.git;h=1896de3d926d299a1ed5c9f0a4f03f5a81969200

commit 1896de3d926d299a1ed5c9f0a4f03f5a81969200
Author: Wilco Dijkstra <wdijkstr@arm.com>
Date:   Thu Aug 10 17:00:38 2017 +0100

    [AArch64] Optimized memcmp.
    
    This is an optimized memcmp for AArch64.  This is a complete rewrite
    using a different algorithm.  The previous version split into cases
    where both inputs were aligned, the inputs were mutually aligned and
    unaligned using a byte loop.  The new version combines all these cases,
    while small inputs of less than 8 bytes are handled separately.
    
    This allows the main code to be sped up using unaligned loads since
    there are now at least 8 bytes to be compared.  After the first 8 bytes,
    align the first input.  This ensures each iteration does at most one
    unaligned access and mutually aligned inputs behave as aligned.
    After the main loop, process the last 8 bytes using unaligned accesses.
    
    This improves performance of (mutually) aligned cases by 25% and
    unaligned by >500% (yes >6 times faster) on large inputs.
    
    	* sysdeps/aarch64/memcmp.S (memcmp):
    	Rewrite of optimized memcmp.
    
    (cherry picked from commit 922369032c604b4dcfd535e1bcddd4687e7126a5)

Diff:
---
 ChangeLog                |   5 ++
 sysdeps/aarch64/memcmp.S | 176 +++++++++++++++++++----------------------------
 2 files changed, 76 insertions(+), 105 deletions(-)

diff --git a/ChangeLog b/ChangeLog
index 4cd747a..5783090 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,8 @@
+2019-09-06  Wilco Dijkstra  <wdijkstr@arm.com>
+
+	* sysdeps/aarch64/memcmp.S (memcmp):
+	Rewrite of optimized memcmp.
+
 2019-07-12  Adhemerval Zanella  <adhemerval.zanella@linaro.org>
 
 	[BZ #24699]
diff --git a/sysdeps/aarch64/memcmp.S b/sysdeps/aarch64/memcmp.S
index 4cfcb89..b99c081 100644
--- a/sysdeps/aarch64/memcmp.S
+++ b/sysdeps/aarch64/memcmp.S
@@ -22,132 +22,98 @@
 
 /* Assumptions:
  *
- * ARMv8-a, AArch64
+ * ARMv8-a, AArch64, unaligned accesses.
  */
 
 /* Parameters and result.  */
 #define src1		x0
 #define src2		x1
 #define limit		x2
-#define result		x0
+#define result		w0
 
 /* Internal variables.  */
 #define data1		x3
 #define data1w		w3
 #define data2		x4
 #define data2w		w4
-#define has_nul		x5
-#define diff		x6
-#define endloop		x7
-#define tmp1		x8
-#define tmp2		x9
-#define tmp3		x10
-#define pos		x11
-#define limit_wd	x12
-#define mask		x13
+#define tmp1		x5
 
 ENTRY_ALIGN (memcmp, 6)
 	DELOUSE (0)
 	DELOUSE (1)
 	DELOUSE (2)
-	cbz	limit, L(ret0)
-	eor	tmp1, src1, src2
-	tst	tmp1, #7
-	b.ne	L(misaligned8)
-	ands	tmp1, src1, #7
-	b.ne	L(mutual_align)
-	add	limit_wd, limit, #7
-	lsr	limit_wd, limit_wd, #3
-	/* Start of performance-critical section  -- one 64B cache line.  */
-L(loop_aligned):
-	ldr	data1, [src1], #8
-	ldr	data2, [src2], #8
-L(start_realigned):
-	subs	limit_wd, limit_wd, #1
-	eor	diff, data1, data2	/* Non-zero if differences found.  */
-	csinv	endloop, diff, xzr, ne	/* Last Dword or differences.  */
-	cbz	endloop, L(loop_aligned)
-	/* End of performance-critical section  -- one 64B cache line.  */
-
-	/* Not reached the limit, must have found a diff.  */
-	cbnz	limit_wd, L(not_limit)
-
-	/* Limit % 8 == 0 => all bytes significant.  */
-	ands	limit, limit, #7
-	b.eq	L(not_limit)
-
-	lsl	limit, limit, #3	/* Bits -> bytes.  */
-	mov	mask, #~0
-#ifdef __AARCH64EB__
-	lsr	mask, mask, limit
-#else
-	lsl	mask, mask, limit
-#endif
-	bic	data1, data1, mask
-	bic	data2, data2, mask
-
-	orr	diff, diff, mask
-L(not_limit):
 
-#ifndef	__AARCH64EB__
-	rev	diff, diff
+	subs	limit, limit, 8
+	b.lo	.Lless8
+
+	/* Limit >= 8, so check first 8 bytes using unaligned loads.  */
+	ldr	data1, [src1], 8
+	ldr	data2, [src2], 8
+	and	tmp1, src1, 7
+	add	limit, limit, tmp1
+	cmp	data1, data2
+	bne	.Lreturn
+
+	/* Align src1 and adjust src2 with bytes not yet done.  */
+	sub	src1, src1, tmp1
+	sub	src2, src2, tmp1
+
+	subs	limit, limit, 8
+	b.ls	.Llast_bytes
+
+	/* Loop performing 8 bytes per iteration using aligned src1.
+	   Limit is pre-decremented by 8 and must be larger than zero.
+	   Exit if <= 8 bytes left to do or if the data is not equal.  */
+	.p2align 4
+.Lloop8:
+	ldr	data1, [src1], 8
+	ldr	data2, [src2], 8
+	subs	limit, limit, 8
+	ccmp	data1, data2, 0, hi  /* NZCV = 0b0000.  */
+	b.eq	.Lloop8
+
+	cmp	data1, data2
+	bne	.Lreturn
+
+	/* Compare last 1-8 bytes using unaligned access.  */
+.Llast_bytes:
+	ldr	data1, [src1, limit]
+	ldr	data2, [src2, limit]
+
+	/* Compare data bytes and set return value to 0, -1 or 1.  */
+.Lreturn:
+#ifndef __AARCH64EB__
 	rev	data1, data1
 	rev	data2, data2
 #endif
-	/* The MS-non-zero bit of DIFF marks either the first bit
-	   that is different, or the end of the significant data.
-	   Shifting left now will bring the critical information into the
-	   top bits.  */
-	clz	pos, diff
-	lsl	data1, data1, pos
-	lsl	data2, data2, pos
-	/* But we need to zero-extend (char is unsigned) the value and then
-	   perform a signed 32-bit subtraction.  */
-	lsr	data1, data1, #56
-	sub	result, data1, data2, lsr #56
-	RET
-
-L(mutual_align):
-	/* Sources are mutually aligned, but are not currently at an
-	   alignment boundary.  Round down the addresses and then mask off
-	   the bytes that precede the start point.  */
-	bic	src1, src1, #7
-	bic	src2, src2, #7
-	add	limit, limit, tmp1	/* Adjust the limit for the extra.  */
-	lsl	tmp1, tmp1, #3		/* Bytes beyond alignment -> bits.  */
-	ldr	data1, [src1], #8
-	neg	tmp1, tmp1		/* Bits to alignment -64.  */
-	ldr	data2, [src2], #8
-	mov	tmp2, #~0
-#ifdef __AARCH64EB__
-	/* Big-endian.  Early bytes are at MSB.  */
-	lsl	tmp2, tmp2, tmp1	/* Shift (tmp1 & 63).  */
-#else
-	/* Little-endian.  Early bytes are at LSB.  */
-	lsr	tmp2, tmp2, tmp1	/* Shift (tmp1 & 63).  */
-#endif
-	add	limit_wd, limit, #7
-	orr	data1, data1, tmp2
-	orr	data2, data2, tmp2
-	lsr	limit_wd, limit_wd, #3
-	b	L(start_realigned)
-
-L(ret0):
-	mov	result, #0
-	RET
-
-	.p2align 6
-L(misaligned8):
-	sub	limit, limit, #1
-1:
-	/* Perhaps we can do better than this.  */
-	ldrb	data1w, [src1], #1
-	ldrb	data2w, [src2], #1
-	subs	limit, limit, #1
-	ccmp	data1w, data2w, #0, cs	/* NZCV = 0b0000.  */
-	b.eq	1b
-	sub	result, data1, data2
-	RET
+	cmp     data1, data2
+.Lret_eq:
+	cset	result, ne
+	cneg	result, result, lo
+	ret
+
+	.p2align 4
+	/* Compare up to 8 bytes.  Limit is [-8..-1].  */
+.Lless8:
+	adds	limit, limit, 4
+	b.lo	.Lless4
+	ldr	data1w, [src1], 4
+	ldr	data2w, [src2], 4
+	cmp	data1w, data2w
+	b.ne	.Lreturn
+	sub	limit, limit, 4
+.Lless4:
+	adds	limit, limit, 4
+	beq	.Lret_eq
+.Lbyte_loop:
+	ldrb	data1w, [src1], 1
+	ldrb	data2w, [src2], 1
+	subs	limit, limit, 1
+	ccmp	data1w, data2w, 0, ne	/* NZCV = 0b0000.  */
+	b.eq	.Lbyte_loop
+	sub	result, data1w, data2w
+	ret
+
 END (memcmp)
 #undef bcmp
 weak_alias (memcmp, bcmp)
Index Nav:	[Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav:	[Date Prev] [Date Next]	[Thread Prev] [Thread Next]