This is the mail archive of the glibc-cvs@sourceware.org mailing list for the glibc project.
Index Nav:	[Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav:	[Date Prev] [Date Next]	[Thread Prev] [Thread Next]
Other format:	[Raw text]
GNU C Library master sources branch master updated. glibc-2.27.9000-191-g7108f1f

From: siddhesh at sourceware dot org
To: glibc-cvs at sourceware dot org
Date: 13 Mar 2018 18:28:33 -0000
Subject: GNU C Library master sources branch master updated. glibc-2.27.9000-191-g7108f1f
This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "GNU C Library master sources".

The branch, master has been updated
       via  7108f1f944792ac68332967015d5e6418c5ccc88 (commit)
      from  2cc7bad0ae0a412e75270be5ed41d45c03e7a931 (commit)

Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.

- Log -----------------------------------------------------------------
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=7108f1f944792ac68332967015d5e6418c5ccc88

commit 7108f1f944792ac68332967015d5e6418c5ccc88
Author: Siddhesh Poyarekar <siddhesh@sourceware.org>
Date:   Tue Mar 13 23:57:03 2018 +0530

    aarch64: Improve strncmp for mutually misaligned inputs
    
    The mutually misaligned inputs on aarch64 are compared with a simple
    byte copy, which is not very efficient.  Enhance the comparison
    similar to strcmp by loading a double-word at a time.  The peak
    performance improvement (i.e. 4k maxlen comparisons) due to this on
    the strncmp microbenchmark is as follows:
    
    falkor: 3.5x (up to 72% time reduction)
    cortex-a73: 3.5x (up to 71% time reduction)
    cortex-a53: 3.5x (up to 71% time reduction)
    
    All mutually misaligned inputs from 16 bytes maxlen onwards show
    upwards of 15% improvement and there is no measurable effect on the
    performance of aligned/mutually aligned inputs.
    
    	* sysdeps/aarch64/strncmp.S (count): New macro.
    	(strncmp): Store misaligned length in SRC1 in COUNT.
    	(mutual_align): Adjust.
    	(misaligned8): Load dword at a time when it is safe.

diff --git a/ChangeLog b/ChangeLog
index 203d274..4153cb2 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,10 @@
+2018-03-13  Siddhesh Poyarekar  <siddhesh@sourceware.org>
+
+	* sysdeps/aarch64/strncmp.S (count): New macro.
+	(strncmp): Store misaligned length in SRC1 in COUNT.
+	(mutual_align): Adjust.
+	(misaligned8): Load dword at a time when it is safe.
+
 2018-03-12  Zack Weinberg  <zackw@panix.com>
 
 	[BZ #1190]
diff --git a/sysdeps/aarch64/strncmp.S b/sysdeps/aarch64/strncmp.S
index a08d2c0..20c7ec8 100644
--- a/sysdeps/aarch64/strncmp.S
+++ b/sysdeps/aarch64/strncmp.S
@@ -49,6 +49,7 @@
 #define limit_wd	x13
 #define mask		x14
 #define endloop		x15
+#define count		mask
 
 ENTRY_ALIGN_AND_PAD (strncmp, 6, 7)
 	DELOUSE (0)
@@ -58,9 +59,9 @@ ENTRY_ALIGN_AND_PAD (strncmp, 6, 7)
 	eor	tmp1, src1, src2
 	mov	zeroones, #REP8_01
 	tst	tmp1, #7
+	and	count, src1, #7
 	b.ne	L(misaligned8)
-	ands	tmp1, src1, #7
-	b.ne	L(mutual_align)
+	cbnz	count, L(mutual_align)
 	/* Calculate the number of full and partial words -1.  */
 	sub	limit_wd, limit, #1	/* limit != 0, so no underflow.  */
 	lsr	limit_wd, limit_wd, #3	/* Convert to Dwords.  */
@@ -165,43 +166,107 @@ L(mutual_align):
 	bic	src1, src1, #7
 	bic	src2, src2, #7
 	ldr	data1, [src1], #8
-	neg	tmp3, tmp1, lsl #3	/* 64 - bits(bytes beyond align). */
+	neg	tmp3, count, lsl #3	/* 64 - bits(bytes beyond align). */
 	ldr	data2, [src2], #8
 	mov	tmp2, #~0
 	sub	limit_wd, limit, #1	/* limit != 0, so no underflow.  */
 #ifdef __AARCH64EB__
 	/* Big-endian.  Early bytes are at MSB.  */
-	lsl	tmp2, tmp2, tmp3	/* Shift (tmp1 & 63).  */
+	lsl	tmp2, tmp2, tmp3	/* Shift (count & 63).  */
 #else
 	/* Little-endian.  Early bytes are at LSB.  */
-	lsr	tmp2, tmp2, tmp3	/* Shift (tmp1 & 63).  */
+	lsr	tmp2, tmp2, tmp3	/* Shift (count & 63).  */
 #endif
 	and	tmp3, limit_wd, #7
 	lsr	limit_wd, limit_wd, #3
 	/* Adjust the limit. Only low 3 bits used, so overflow irrelevant.  */
-	add	limit, limit, tmp1
-	add	tmp3, tmp3, tmp1
+	add	limit, limit, count
+	add	tmp3, tmp3, count
 	orr	data1, data1, tmp2
 	orr	data2, data2, tmp2
 	add	limit_wd, limit_wd, tmp3, lsr #3
 	b	L(start_realigned)
 
-L(ret0):
-	mov	result, #0
-	RET
-
 	.p2align 6
+	/* Don't bother with dwords for up to 16 bytes.  */
 L(misaligned8):
-	sub	limit, limit, #1
-1:
+	cmp	limit, #16
+	b.hs	L(try_misaligned_words)
+
+L(byte_loop):
 	/* Perhaps we can do better than this.  */
 	ldrb	data1w, [src1], #1
 	ldrb	data2w, [src2], #1
 	subs	limit, limit, #1
-	ccmp	data1w, #1, #0, cs	/* NZCV = 0b0000.  */
+	ccmp	data1w, #1, #0, hi	/* NZCV = 0b0000.  */
 	ccmp	data1w, data2w, #0, cs	/* NZCV = 0b0000.  */
-	b.eq	1b
+	b.eq	L(byte_loop)
+L(done):
 	sub	result, data1, data2
 	RET
+
+	/* Align the SRC1 to a dword by doing a bytewise compare and then do
+	   the dword loop.  */
+L(try_misaligned_words):
+	mov	limit_wd, limit, lsr #3
+	cbz	count, L(do_misaligned)
+
+	neg	count, count
+	and	count, count, #7
+	sub	limit, limit, count
+	mov	limit_wd, limit, lsr #3
+
+L(page_end_loop):
+	ldrb	data1w, [src1], #1
+	ldrb	data2w, [src2], #1
+	cmp	data1w, #1
+	ccmp	data1w, data2w, #0, cs	/* NZCV = 0b0000.  */
+	b.ne	L(done)
+	subs	count, count, #1
+	b.hi	L(page_end_loop)
+
+L(do_misaligned):
+	/* Prepare ourselves for the next page crossing.  Unlike the aligned
+	   loop, we fetch 1 less dword because we risk crossing bounds on
+	   SRC2.  */
+	mov	count, #8
+	subs	limit_wd, limit_wd, #1
+	b.lo	L(done_loop)
+L(loop_misaligned):
+	and	tmp2, src2, #0xff8
+	eor	tmp2, tmp2, #0xff8
+	cbz	tmp2, L(page_end_loop)
+
+	ldr	data1, [src1], #8
+	ldr	data2, [src2], #8
+	sub	tmp1, data1, zeroones
+	orr	tmp2, data1, #REP8_7f
+	eor	diff, data1, data2	/* Non-zero if differences found.  */
+	bics	has_nul, tmp1, tmp2	/* Non-zero if NUL terminator.  */
+	ccmp	diff, #0, #0, eq
+	b.ne	L(not_limit)
+	subs	limit_wd, limit_wd, #1
+	b.pl	L(loop_misaligned)
+
+L(done_loop):
+	/* We found a difference or a NULL before the limit was reached.  */
+	and	limit, limit, #7
+	cbz	limit, L(not_limit)
+	/* Read the last word.  */
+	sub	src1, src1, 8
+	sub	src2, src2, 8
+	ldr	data1, [src1, limit]
+	ldr	data2, [src2, limit]
+	sub	tmp1, data1, zeroones
+	orr	tmp2, data1, #REP8_7f
+	eor	diff, data1, data2	/* Non-zero if differences found.  */
+	bics	has_nul, tmp1, tmp2	/* Non-zero if NUL terminator.  */
+	ccmp	diff, #0, #0, eq
+	b.ne	L(not_limit)
+
+L(ret0):
+	mov	result, #0
+	RET
+
 END (strncmp)
 libc_hidden_builtin_def (strncmp)

-----------------------------------------------------------------------

Summary of changes:
 ChangeLog                 |    7 +++
 sysdeps/aarch64/strncmp.S |   95 +++++++++++++++++++++++++++++++++++++-------
 2 files changed, 87 insertions(+), 15 deletions(-)


hooks/post-receive
-- 
GNU C Library master sources
Index Nav:	[Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav:	[Date Prev] [Date Next]	[Thread Prev] [Thread Next]