[PATCH] Optimize SSE 4.1 x86_64 memcmp
- From: Florian Weimer <fweimer at redhat dot com>
- To: GNU C Library <libc-alpha at sourceware dot org>
- Date: Fri, 31 Jan 2014 16:09:54 +0100
- Subject: [PATCH] Optimize SSE 4.1 x86_64 memcmp
This patch vectorizes the difference extraction in memcmp. It does so
by loading all relevant input bytes, converting them to big-endian using
BSWAP, and comparing the result.
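In C terms, the 5-byte case works roughly like the sketch below (illustrative only, not part of the patch; a little-endian host is assumed, and memcpy plus __builtin_bswap64 stand in for the unaligned loads and the BSWAP instruction):

#include <stdint.h>
#include <string.h>

static int
memcmp5_sketch (const void *s1, const void *s2)
{
  const unsigned char *p1 = s1;
  const unsigned char *p2 = s2;
  uint64_t a = 0, b = 0;
  memcpy (&a, p1, 4);             /* Bytes 1-4 in the low half.  */
  memcpy (&b, p2, 4);
  a |= (uint64_t) p1[4] << 32;    /* Byte 5 above them.  */
  b |= (uint64_t) p2[4] << 32;
  a = __builtin_bswap64 (a);      /* Big-endian, so the comparison is lexicographic.  */
  b = __builtin_bswap64 (b);
  if (a == b)
    return 0;
  return a < b ? -1 : 1;          /* Sign only, not a byte difference.  */
}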
This changes the return value of memcmp, but stays within the interface
contract. Some corner cases in the old implementation do not return
actual byte differences, either, so I hope this will not break applications.
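For illustration (not part of the patch): ISO C only specifies the sign of the result, so a byte difference and a plain -1/+1 are equally conforming, and portable callers must only test against zero:

#include <assert.h>
#include <string.h>

int
main (void)
{
  int r = memcmp ("abcX", "abcZ", 4);
  assert (r < 0);                 /* What portable callers may rely on.  */
  /* assert (r == 'X' - 'Z');        NOT guaranteed by ISO C.  */
  return 0;
}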
On my i7-2620M, some lengths (mostly 5, 6, 7) are about one cycle slower
than the old implementation, but only if all branches in the old
implementation are correctly predicted. If there is misprediction (as
there is bound to be when sorting strings), the vectorized version is
consistently faster. The glibc benchtests favor the old
implementation[*], but the speed difference between tests slightly
favors the new implementation (though the difference is not clearly
significant). I could probably game that by using a data-dependent
branch for those lengths, but I think the fully vectorized version is
better for real-world input data.
In case BSWAP turns out to be too slow for some CPUs, it could be
replaced with BSF and a shift (both constant time in current
processors), like we already do in some of the existing memcmp
implementations.
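As a sketch of that alternative (again illustrative only), the extraction from two little-endian words would look roughly like this in C, with __builtin_ctzll standing in for BSF:

#include <stdint.h>

/* LEFT and RIGHT hold 8 bytes each, loaded little-endian; LEFT != RIGHT.  */
static int
diffin8bytes_bsf (uint64_t left, uint64_t right)
{
  /* The lowest differing bit marks the first differing input byte;
     round down to a byte boundary and compare just that byte.  */
  unsigned int shift = __builtin_ctzll (left ^ right) & ~7u;
  return (int) ((left >> shift) & 0xff) - (int) ((right >> shift) & 0xff);
}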
This work was prompted by a desire to make the time taken by memcmp
independent of the length of the shared prefix of the arguments as far
as possible. The current implementation can still be used to confirm a
correct guess of a 4-byte or 8-byte prefix, but this is extremely
unlikely and not suitable for use in a timing oracle (at least the
8-byte guess). The new implementation should not act as a cache oracle
either because it does not perform data-dependent single-byte loads.
If such a change is acceptable in principle, I will work on similar
adjustments to the other x86 implementations.
[*] Tight loop over identical inputs, after bumping the inner loop count
from 64 to 4096. If I don't do this, the tests are very fragile, but
the new implementation seems to be roughly twice as fast across the
board. Using qsort to sort random strings of fixed length is about 10%
faster overall using the new implementation. I still need to try to
sort a random permutation of /usr/dict/words and compare the results.
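For reference, the qsort experiment has roughly the following shape (a sketch with made-up key length and count, not the actual benchmark):

#include <stdlib.h>
#include <string.h>

#define KEY_LEN 16
#define NKEYS   (1 << 20)

static unsigned char keys[NKEYS][KEY_LEN];

static int
cmp_key (const void *a, const void *b)
{
  return memcmp (a, b, KEY_LEN);
}

int
main (void)
{
  /* Fill the keys with pseudo-random bytes, then sort them.  */
  for (size_t i = 0; i < sizeof keys; i++)
    ((unsigned char *) keys)[i] = rand () & 0xff;
  qsort (keys, NKEYS, KEY_LEN, cmp_key);
  return 0;
}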
--
Florian Weimer / Red Hat Product Security Team
2014-01-31 Florian Weimer <fweimer@redhat.com>
* sysdeps/x86_64/multiarch/memcmp-sse4.S: Vectorize the comparison
of differing bytes.
diff --git a/sysdeps/x86_64/multiarch/memcmp-sse4.S b/sysdeps/x86_64/multiarch/memcmp-sse4.S
index e753d62..d9fc424 100644
--- a/sysdeps/x86_64/multiarch/memcmp-sse4.S
+++ b/sysdeps/x86_64/multiarch/memcmp-sse4.S
@@ -875,13 +875,18 @@ L(13bytes):
.p2align 4
L(5bytes):
+ /* Bytes 12345 loaded as ...54321. */
mov -5(%rdi), %eax
mov -5(%rsi), %ecx
- cmp %eax, %ecx
- jne L(diffin4bytes)
- movzbl -1(%rdi), %eax
- movzbl -1(%rsi), %edx
- sub %edx, %eax
+ movzbl -1(%rdi), %edi
+ movzbl -1(%rsi), %esi
+ shl $32, %rdi
+ shl $32, %rsi
+ or %rdi, %rax
+ or %rsi, %rcx
+ cmp %rax, %rcx
+ jne L(diffin8bytes)
+ xor %eax, %eax
ret
.p2align 4
@@ -916,12 +921,16 @@ L(10bytes):
mov -10(%rsi), %rcx
cmp %rax, %rcx
jne L(diffin8bytes)
- movzwl -2(%rdi), %eax
- movzwl -2(%rsi), %ecx
- cmp %cl, %al
- jne L(end)
- and $0xffff, %eax
- and $0xffff, %ecx
+L(2bytes):
+ /* Bytes 12 loaded as ..12. */
+ movzbl -2(%rdi), %eax
+ movzbl -2(%rsi), %ecx
+ shl $8, %eax
+ movzbl -1(%rdi), %edi
+ shl $8, %ecx
+ movzbl -1(%rsi), %esi
+ or %edi, %eax
+ or %esi, %ecx
sub %ecx, %eax
ret
@@ -940,18 +949,18 @@ L(14bytes):
.p2align 4
L(6bytes):
- mov -6(%rdi), %eax
- mov -6(%rsi), %ecx
- cmp %eax, %ecx
- jne L(diffin4bytes)
-L(2bytes):
- movzwl -2(%rsi), %ecx
+ /* Bytes 123456 loaded as ..654321. */
movzwl -2(%rdi), %eax
- cmp %cl, %al
- jne L(end)
- and $0xffff, %eax
- and $0xffff, %ecx
- sub %ecx, %eax
+ movzwl -2(%rsi), %ecx
+ mov -6(%rdi), %edi
+ mov -6(%rsi), %esi
+ shl $32, %rax
+ shl $32, %rcx
+ or %rdi, %rax
+ or %rsi, %rcx
+ cmp %rax, %rcx
+ jne L(diffin8bytes)
+ xor %eax, %eax
ret
.p2align 4
@@ -1021,10 +1030,21 @@ L(7bytes):
.p2align 4
L(3bytes):
+ /* Bytes 123 loaded as .123. */
movzwl -3(%rdi), %eax
+ bswap %eax
movzwl -3(%rsi), %ecx
- cmp %eax, %ecx
- jne L(diffin2bytes)
+ movzbl -1(%rdi), %edi
+ bswap %ecx
+ shr $8, %eax
+ movzbl -1(%rsi), %esi
+ shr $8, %ecx
+ or %edi, %eax
+ or %esi, %ecx
+ sub %ecx, %eax
+ ret
+
+ .p2align 4
L(1bytes):
movzbl -1(%rdi), %eax
movzbl -1(%rsi), %ecx
@@ -1296,7 +1316,10 @@ L(26bytes):
jne L(diffin8bytes)
movzwl -2(%rdi), %eax
movzwl -2(%rsi), %ecx
- jmp L(diffin2bytes)
+ cmp %eax, %ecx
+ jne L(diffin4bytes)
+ xor %eax, %eax
+ ret
.p2align 4
L(75bytes):
@@ -1553,43 +1576,44 @@ L(less16bytes):
jne L(diffin8bytes)
mov 8(%rsi, %rdx), %rcx
mov 8(%rdi, %rdx), %rax
+ cmp %rax, %rcx
+ jne L(diffin8bytes)
+ xor %eax, %eax
+ ret
+
+ .p2align 4
L(diffin8bytes):
+# ifndef USE_AS_WMEMCMP
+/* Expects left word in %rax, right word in %rcx; they must be different. */
+ bswap %rcx
+ bswap %rax
+ cmp %rcx, %rax
+ sbb %eax, %eax
+ or $1, %eax
+ ret
+
+ .p2align 4
+L(diffin4bytes):
+/* Expects left word in %eax, right word in %ecx; they must be different. */
+ bswap %ecx
+ bswap %eax
+ cmp %ecx, %eax
+ sbb %eax, %eax
+ or $1, %eax
+ ret
+
+# else
+/* for wmemcmp */
cmp %eax, %ecx
jne L(diffin4bytes)
shr $32, %rcx
shr $32, %rax
-
-# ifdef USE_AS_WMEMCMP
-/* for wmemcmp */
cmp %eax, %ecx
jne L(diffin4bytes)
xor %eax, %eax
ret
-# endif
-
L(diffin4bytes):
-# ifndef USE_AS_WMEMCMP
- cmp %cx, %ax
- jne L(diffin2bytes)
- shr $16, %ecx
- shr $16, %eax
-L(diffin2bytes):
- cmp %cl, %al
- jne L(end)
- and $0xffff, %eax
- and $0xffff, %ecx
- sub %ecx, %eax
- ret
- .p2align 4
-L(end):
- and $0xff, %eax
- and $0xff, %ecx
- sub %ecx, %eax
- ret
-# else
-
-/* for wmemcmp */
mov $1, %eax
jl L(nequal_bigger)
neg %eax