This is the mail archive of the libc-alpha@sourceware.org mailing list for the glibc project.



[PATCH] Optimize SSE 4.1 x86_64 memcmp


This patch vectorizes the difference extraction in memcmp. It does so by loading all relevant input bytes, converting them to big-endian using BSWAP, and comparing the result.
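
For illustration, here is a minimal C sketch of the idea (not the glibc code; memcmp_le8 is an illustrative name, and it assumes a little-endian machine and GCC/Clang builtins):

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Compare up to 8 bytes.  Sketch only; the assembly uses two overlapping
   loads instead of memcpy, but the assembled words are equivalent.  */
static int
memcmp_le8 (const void *s1, const void *s2, size_t n)
{
  uint64_t a = 0, b = 0;
  memcpy (&a, s1, n);
  memcpy (&b, s2, n);
  if (a == b)
    return 0;
  /* BSWAP puts the first input byte into the most significant position,
     so an unsigned comparison yields lexicographic order.  */
  a = __builtin_bswap64 (a);
  b = __builtin_bswap64 (b);
  /* Corresponds to the CMP/SBB/OR $1 sequence below: return -1 or 1.  */
  return a < b ? -1 : 1;
}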

This changes the return value of memcmp, but stays within the interface contract. Some corner cases in the old implementation do not return actual byte differences, either, so I hope this will not break applications.
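
To illustrate the contract point: C only specifies the sign of memcmp's result, so a conforming caller such as the qsort comparator sketched below behaves identically whether memcmp returns an actual byte difference or just -1/0/1 (compare_records and RECORD_SIZE are made up for the example):

#include <string.h>

enum { RECORD_SIZE = 16 };	/* arbitrary, for the example */

/* qsort comparator over fixed-size records: only the sign of the
   memcmp result matters, not its magnitude.  */
static int
compare_records (const void *a, const void *b)
{
  return memcmp (a, b, RECORD_SIZE);
}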

On my i7-2620M, some lengths (mostly 5, 6, 7) are about one cycle slower than the old implementation, but only if all branches in the old implementation are correctly predicted. If there is misprediction (as there is bound to be when sorting strings), the vectorized version is consistently faster. The glibc benchtests favor the old implementation[*], but the speed difference between tests slightly favors the new implementation (though the difference is not clearly significant). I could probably game that by using a data-dependent branch for those lengths, but I think the fully vectorized version is better for real-world input data.

In case BSWAP turns out to be too slow on some CPUs, it could be replaced with BSF and a shift (both constant time on current processors), as we already do in some of the existing memcmp implementations.
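
For completeness, a C sketch of that alternative (again only illustrative, assuming little-endian loads and GCC/Clang builtins): XOR the two differing 8-byte words, use the lowest set bit (BSF, here __builtin_ctzll) to locate the first differing byte in memory order, then shift it down and compare just that byte.

#include <stdint.h>

/* a and b are the differing 8-byte chunks, loaded little-endian.  */
static int
diffin8bytes_bsf (uint64_t a, uint64_t b)
{
  /* First differing bit, rounded down to a byte boundary; on a
     little-endian load the lowest byte is the first byte in memory.  */
  unsigned int shift = __builtin_ctzll (a ^ b) & ~7u;
  unsigned int byte_a = (a >> shift) & 0xff;
  unsigned int byte_b = (b >> shift) & 0xff;
  return byte_a < byte_b ? -1 : 1;
}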

This work was prompted by a desire to make the time taken by memcmp independent of the length of the shared prefix of the arguments, as far as possible. The current implementation can still be used to confirm a correct guess of a 4-byte or 8-byte prefix, but a correct guess is extremely unlikely, so this is not suitable for use as a timing oracle (at least for the 8-byte guess). The new implementation should not act as a cache oracle either, because it does not perform data-dependent single-byte loads.

If such a change is acceptable in principle, I will work on similar adjustments to the other x86 implementations.

[*] Tight loop over identical inputs, after bumping the inner loop count from 64 to 4096. Without that change the test results are very fragile, but the new implementation then seems to be roughly twice as fast across the board. Using qsort to sort random strings of fixed length is about 10% faster overall with the new implementation. I still need to try to sort a random permutation of /usr/dict/word and compare the results.

--
Florian Weimer / Red Hat Product Security Team
2014-01-31  Florian Weimer  <fweimer@redhat.com>

	* sysdeps/x86_64/multiarch/memcmp-sse4.S: Vectorize the comparison
	of differing bytes.

diff --git a/sysdeps/x86_64/multiarch/memcmp-sse4.S b/sysdeps/x86_64/multiarch/memcmp-sse4.S
index e753d62..d9fc424 100644
--- a/sysdeps/x86_64/multiarch/memcmp-sse4.S
+++ b/sysdeps/x86_64/multiarch/memcmp-sse4.S
@@ -875,13 +875,18 @@ L(13bytes):
 
 	.p2align 4
 L(5bytes):
+	/* Bytes 12345 loaded as ...54321.  */
 	mov	-5(%rdi), %eax
 	mov	-5(%rsi), %ecx
-	cmp	%eax, %ecx
-	jne	L(diffin4bytes)
-	movzbl	-1(%rdi), %eax
-	movzbl	-1(%rsi), %edx
-	sub	%edx, %eax
+	movzbl	-1(%rdi), %edi
+	movzbl	-1(%rsi), %esi
+	shl	$32, %rdi
+	shl	$32, %rsi
+	or	%rdi, %rax
+	or	%rsi, %rcx
+	cmp	%rax, %rcx
+	jne	L(diffin8bytes)
+	xor	%eax, %eax
 	ret
 
 	.p2align 4
@@ -916,12 +921,16 @@ L(10bytes):
 	mov	-10(%rsi), %rcx
 	cmp	%rax, %rcx
 	jne	L(diffin8bytes)
-	movzwl	-2(%rdi), %eax
-	movzwl	-2(%rsi), %ecx
-	cmp	%cl, %al
-	jne	L(end)
-	and	$0xffff, %eax
-	and	$0xffff, %ecx
+L(2bytes):
+	/* Bytes 12 loaded as ..12.  */
+	movzbl  -2(%rdi), %eax
+	movzbl  -2(%rsi), %ecx
+	shl	$8, %eax
+	movzbl  -1(%rdi), %edi
+	shl	$8, %ecx
+	movzbl  -1(%rsi), %esi
+	or	%edi, %eax
+	or	%esi, %ecx
 	sub	%ecx, %eax
 	ret
 
@@ -940,18 +949,18 @@ L(14bytes):
 
 	.p2align 4
 L(6bytes):
-	mov	-6(%rdi), %eax
-	mov	-6(%rsi), %ecx
-	cmp	%eax, %ecx
-	jne	L(diffin4bytes)
-L(2bytes):
-	movzwl	-2(%rsi), %ecx
+	/* Bytes 123456 loaded as ..654321.  */
 	movzwl	-2(%rdi), %eax
-	cmp	%cl, %al
-	jne	L(end)
-	and	$0xffff, %eax
-	and	$0xffff, %ecx
-	sub	%ecx, %eax
+	movzwl	-2(%rsi), %ecx
+	mov	-6(%rdi), %edi
+	mov	-6(%rsi), %esi
+	shl	$32, %rax
+	shl	$32, %rcx
+	or	%rdi, %rax
+	or	%rsi, %rcx
+	cmp	%rax, %rcx
+	jne	L(diffin8bytes)
+	xor	%eax, %eax
 	ret
 
 	.p2align 4
@@ -1021,10 +1030,21 @@ L(7bytes):
 
 	.p2align 4
 L(3bytes):
+	/* Bytes 123 loaded as .123.  */
 	movzwl	-3(%rdi), %eax
+	bswap	%eax
 	movzwl	-3(%rsi), %ecx
-	cmp	%eax, %ecx
-	jne	L(diffin2bytes)
+	movzbl	-1(%rdi), %edi
+	bswap	%ecx
+	shr	$8, %eax
+	movzbl	-1(%rsi), %esi
+	shr	$8, %ecx
+	or	%edi, %eax
+	or	%esi, %ecx
+	sub	%ecx, %eax
+	ret
+
+	.p2align 4
 L(1bytes):
 	movzbl	-1(%rdi), %eax
 	movzbl	-1(%rsi), %ecx
@@ -1296,7 +1316,10 @@ L(26bytes):
 	jne	L(diffin8bytes)
 	movzwl	-2(%rdi), %eax
 	movzwl	-2(%rsi), %ecx
-	jmp	L(diffin2bytes)
+	cmp	%eax, %ecx
+	jne	L(diffin4bytes)
+	xor	%eax, %eax
+	ret
 
 	.p2align 4
 L(75bytes):
@@ -1553,43 +1576,44 @@ L(less16bytes):
 	jne	L(diffin8bytes)
 	mov	8(%rsi, %rdx), %rcx
 	mov	8(%rdi, %rdx), %rax
+	cmp	%rax, %rcx
+	jne	L(diffin8bytes)
+	xor	%eax, %eax
+	ret
+
+	.p2align 4
 L(diffin8bytes):
+# ifndef USE_AS_WMEMCMP
+/* Expects left word in %rax, right word in %rcx, and they must be different.  */
+	bswap	%rcx
+	bswap	%rax
+	cmp	%rcx, %rax
+	sbb	%eax, %eax
+	or	$1, %eax
+	ret
+
+	.p2align 4
+L(diffin4bytes):
+/* Expects left word in %eax, right word in %ecx, and they must be different.  */
+	bswap	%ecx
+	bswap	%eax
+	cmp	%ecx, %eax
+	sbb	%eax, %eax
+	or	$1, %eax
+	ret
+
+# else
+/* for wmemcmp */
 	cmp	%eax, %ecx
 	jne	L(diffin4bytes)
 	shr	$32, %rcx
 	shr	$32, %rax
-
-# ifdef USE_AS_WMEMCMP
-/* for wmemcmp */
 	cmp	%eax, %ecx
 	jne	L(diffin4bytes)
 	xor	%eax, %eax
 	ret
-# endif
-
 L(diffin4bytes):
-# ifndef USE_AS_WMEMCMP
-	cmp	%cx, %ax
-	jne	L(diffin2bytes)
-	shr	$16, %ecx
-	shr	$16, %eax
-L(diffin2bytes):
-	cmp	%cl, %al
-	jne	L(end)
-	and	$0xffff, %eax
-	and	$0xffff, %ecx
-	sub	%ecx, %eax
-	ret
 
-	.p2align 4
-L(end):
-	and	$0xff, %eax
-	and	$0xff, %ecx
-	sub	%ecx, %eax
-	ret
-# else
-
-/* for wmemcmp */
 	mov	$1, %eax
 	jl	L(nequal_bigger)
 	neg	%eax
