[PATCH] Optimize SSE 4.1 x86_64 memcmp
- From: Florian Weimer <fweimer at redhat dot com>
- To: GNU C Library <libc-alpha at sourceware dot org>
- Date: Fri, 31 Jan 2014 16:09:54 +0100
- Subject: [PATCH] Optimize SSE 4.1 x86_64 memcmp
This patch vectorizes the difference extraction in memcmp. It does so
by loading all relevant input bytes, converting them to big-endian using
BSWAP, and comparing the result.
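In C terms, the 5-byte case works roughly like the sketch below (illustrative only, not part of the patch; a little-endian host is assumed, and memcpy plus __builtin_bswap64 stand in for the unaligned loads and the BSWAP instruction):

#include <stdint.h>
#include <string.h>

static int
memcmp5_sketch (const void *s1, const void *s2)
{
  const unsigned char *p1 = s1;
  const unsigned char *p2 = s2;
  uint64_t a = 0, b = 0;
  memcpy (&a, p1, 4);             /* Bytes 1-4 in the low half.  */
  memcpy (&b, p2, 4);
  a |= (uint64_t) p1[4] << 32;    /* Byte 5 above them.  */
  b |= (uint64_t) p2[4] << 32;
  a = __builtin_bswap64 (a);      /* Big-endian, so the comparison is lexicographic.  */
  b = __builtin_bswap64 (b);
  if (a == b)
    return 0;
  return a < b ? -1 : 1;          /* Sign only, not a byte difference.  */
}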
This changes the return value of memcmp, but stays within the interface
contract. Some corner cases in the old implementation do not return
actual byte differences, either, so I hope this will not break applications.
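For illustration (not part of the patch): ISO C only specifies the sign of the result, so a byte difference and a plain -1/+1 are equally conforming, and portable callers must only test against zero:

#include <assert.h>
#include <string.h>

int
main (void)
{
  int r = memcmp ("abcX", "abcZ", 4);
  assert (r < 0);                 /* What portable callers may rely on.  */
  /* assert (r == 'X' - 'Z');        NOT guaranteed by ISO C.  */
  return 0;
}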
On my i7-2620M, some lengths (mostly 5, 6, 7) are about one cycle slower
than the old implementation, but only if all branches in the old
implementation are correctly predicted. If there is misprediction (as
there is bound to be when sorting strings), the vectorized version is
consistently faster. The glibc benchtests favor the old
implementation[*], but the speed difference between tests slightly
favors the new implementation (though the difference is not clearly
significant). I could probably game that by using a data-dependent
branch for those lengths, but I think the fully vectorized version is
better for real-world input data.
In case BSWAP turns out to be too slow for some CPUs, it could be
replaced with BSF and a shift (both constant time in current
processors), like we already do in some of the existing memcmp
implementations.
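As a sketch of that alternative (again illustrative only), the extraction from two little-endian words would look roughly like this in C, with __builtin_ctzll standing in for BSF:

#include <stdint.h>

/* LEFT and RIGHT hold 8 bytes each, loaded little-endian; LEFT != RIGHT.  */
static int
diffin8bytes_bsf (uint64_t left, uint64_t right)
{
  /* The lowest differing bit marks the first differing input byte;
     round down to a byte boundary and compare just that byte.  */
  unsigned int shift = __builtin_ctzll (left ^ right) & ~7u;
  return (int) ((left >> shift) & 0xff) - (int) ((right >> shift) & 0xff);
}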
This work was prompted by a desire to make the time taken by memcmp
independent of the length of the shared prefix of the arguments as far
as possible. The current implementation can still be used to confirm a
correct guess of a 4-byte or 8-byte prefix, but this is extremely
unlikely and not suitable for use in a timing oracle (at least the
8-byte guess). The new implementation should not act as a cache oracle
either because it does not perform data-dependent single-byte loads.
If such a change is acceptable in principle, I will work on similar
adjustments to the other x86 implementations.
[*] Tight loop over identical inputs, after bumping the inner loop count
from 64 to 4096. If I don't do this, the tests are very fragile, but
the new implementation seems to be roughly twice as fast across the
board. Using qsort to sort random strings of fixed length is about 10%
faster overall using the new implementation. I still need to try to
sort a random permutation of /usr/dict/words and compare the results.
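For reference, the qsort experiment has roughly the following shape (a sketch with made-up key length and count, not the actual benchmark):

#include <stdlib.h>
#include <string.h>

#define KEY_LEN 16
#define NKEYS   (1 << 20)

static unsigned char keys[NKEYS][KEY_LEN];

static int
cmp_key (const void *a, const void *b)
{
  return memcmp (a, b, KEY_LEN);
}

int
main (void)
{
  /* Fill the keys with pseudo-random bytes, then sort them.  */
  for (size_t i = 0; i < sizeof keys; i++)
    ((unsigned char *) keys)[i] = rand () & 0xff;
  qsort (keys, NKEYS, KEY_LEN, cmp_key);
  return 0;
}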
--
Florian Weimer / Red Hat Product Security Team
2014-01-31 Florian Weimer <fweimer@redhat.com>
* sysdeps/x86_64/multiarch/memcmp-sse4.S: Vectorize the comparison
of differing bytes.
diff --git a/sysdeps/x86_64/multiarch/memcmp-sse4.S b/sysdeps/x86_64/multiarch/memcmp-sse4.S
index e753d62..d9fc424 100644
--- a/sysdeps/x86_64/multiarch/memcmp-sse4.S
+++ b/sysdeps/x86_64/multiarch/memcmp-sse4.S
@@ -875,13 +875,18 @@ L(13bytes):
.p2align 4
L(5bytes):
+ /* Bytes 12345 loaded as ...54321. */
mov -5(%rdi), %eax
mov -5(%rsi), %ecx
- cmp %eax, %ecx
- jne L(diffin4bytes)
- movzbl -1(%rdi), %eax
- movzbl -1(%rsi), %edx
- sub %edx, %eax
+ movzbl -1(%rdi), %edi
+ movzbl -1(%rsi), %esi
+ shl $32, %rdi
+ shl $32, %rsi
+ or %rdi, %rax
+ or %rsi, %rcx
+ cmp %rax, %rcx
+ jne L(diffin8bytes)
+ xor %eax, %eax
ret
.p2align 4
@@ -916,12 +921,16 @@ L(10bytes):
mov -10(%rsi), %rcx
cmp %rax, %rcx
jne L(diffin8bytes)
- movzwl -2(%rdi), %eax
- movzwl -2(%rsi), %ecx
- cmp %cl, %al
- jne L(end)
- and $0xffff, %eax
- and $0xffff, %ecx
+L(2bytes):
+ /* Bytes 12 loaded as ..12. */
+ movzbl -2(%rdi), %eax
+ movzbl -2(%rsi), %ecx
+ shl $8, %eax
+ movzbl -1(%rdi), %edi
+ shl $8, %ecx
+ movzbl -1(%rsi), %esi
+ or %edi, %eax
+ or %esi, %ecx
sub %ecx, %eax
ret
@@ -940,18 +949,18 @@ L(14bytes):
.p2align 4
L(6bytes):
- mov -6(%rdi), %eax
- mov -6(%rsi), %ecx
- cmp %eax, %ecx
- jne L(diffin4bytes)
-L(2bytes):
- movzwl -2(%rsi), %ecx
+ /* Bytes 123456 loaded as ..654321. */
movzwl -2(%rdi), %eax
- cmp %cl, %al
- jne L(end)
- and $0xffff, %eax
- and $0xffff, %ecx
- sub %ecx, %eax
+ movzwl -2(%rsi), %ecx
+ mov -6(%rdi), %edi
+ mov -6(%rsi), %esi
+ shl $32, %rax
+ shl $32, %rcx
+ or %rdi, %rax
+ or %rsi, %rcx
+ cmp %rax, %rcx
+ jne L(diffin8bytes)
+ xor %eax, %eax
ret
.p2align 4
@@ -1021,10 +1030,21 @@ L(7bytes):
.p2align 4
L(3bytes):
+ /* Bytes 123 loaded as .123. */
movzwl -3(%rdi), %eax
+ bswap %eax
movzwl -3(%rsi), %ecx
- cmp %eax, %ecx
- jne L(diffin2bytes)
+ movzbl -1(%rdi), %edi
+ bswap %ecx
+ shr $8, %eax
+ movzbl -1(%rsi), %esi
+ shr $8, %ecx
+ or %edi, %eax
+ or %esi, %ecx
+ sub %ecx, %eax
+ ret
+
+ .p2align 4
L(1bytes):
movzbl -1(%rdi), %eax
movzbl -1(%rsi), %ecx
@@ -1296,7 +1316,10 @@ L(26bytes):
jne L(diffin8bytes)
movzwl -2(%rdi), %eax
movzwl -2(%rsi), %ecx
- jmp L(diffin2bytes)
+ cmp %eax, %ecx
+ jne L(diffin4bytes)
+ xor %eax, %eax
+ ret
.p2align 4
L(75bytes):
@@ -1553,43 +1576,44 @@ L(less16bytes):
jne L(diffin8bytes)
mov 8(%rsi, %rdx), %rcx
mov 8(%rdi, %rdx), %rax
+ cmp %rax, %rcx
+ jne L(diffin8bytes)
+ xor %eax, %eax
+ ret
+
+ .p2align 4
L(diffin8bytes):
+# ifndef USE_AS_WMEMCMP
+/* Expects left word in %rax, right word in %rcx; they must be different. */
+ bswap %rcx
+ bswap %rax
+ cmp %rcx, %rax
+ sbb %eax, %eax
+ or $1, %eax
+ ret
+
+ .p2align 4
+L(diffin4bytes):
+/* Expects left word in %eax, right word in %ecx; they must be different. */
+ bswap %ecx
+ bswap %eax
+ cmp %ecx, %eax
+ sbb %eax, %eax
+ or $1, %eax
+ ret
+
+# else
+/* for wmemcmp */
cmp %eax, %ecx
jne L(diffin4bytes)
shr $32, %rcx
shr $32, %rax
-
-# ifdef USE_AS_WMEMCMP
-/* for wmemcmp */
cmp %eax, %ecx
jne L(diffin4bytes)
xor %eax, %eax
ret
-# endif
-
L(diffin4bytes):
-# ifndef USE_AS_WMEMCMP
- cmp %cx, %ax
- jne L(diffin2bytes)
- shr $16, %ecx
- shr $16, %eax
-L(diffin2bytes):
- cmp %cl, %al
- jne L(end)
- and $0xffff, %eax
- and $0xffff, %ecx
- sub %ecx, %eax
- ret
- .p2align 4
-L(end):
- and $0xff, %eax
- and $0xff, %ecx
- sub %ecx, %eax
- ret
-# else
-
-/* for wmemcmp */
mov $1, %eax
jl L(nequal_bigger)
neg %eax