This is the mail archive of the libc-alpha@sourceware.org mailing list for the glibc project.
[PATCH 2/2] Improve strcpy: Faster unaligned loads.
- From: Ondřej Bílka <neleai at seznam dot cz>
- To: libc-alpha at sourceware dot org
- Date: Mon, 9 Sep 2013 18:11:12 +0200
- Subject: [PATCH 2/2] Improve strcpy: Faster unaligned loads.
- Authentication-results: sourceware.org; auth=none
- References: <20130909153051 dot GA23047 at domone dot kolej dot mff dot cuni dot cz>
This is the actual implementation. We use an optimized header that makes calls
around 50 cycles faster on Nehalem and Ivy Bridge.
Currently this improves strcpy, stpcpy and strcat; I keep the old implementation
of strncpy/strncat.
The header that I use improves speed by 10% on most processors for the gcc
workload. Separate loops that use ssse3/shifts are still needed, as this
implementation is slower at large sizes on processors without fast
unaligned loads.
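For readers less familiar with the assembly below, here is a rough,
illustrative C sketch (not part of the patch) of the idea behind the
unaligned header: guard against crossing a page, do an unaligned 16-byte
load, find the terminator with pcmpeqb/pmovmskb, and either copy the whole
chunk or the terminated tail. The name strcpy_sketch and the 16-byte
granularity are my own simplifications; the real code works on 64-byte
chunks and has tuned exits for short copies.

/* Illustrative sketch only, using SSE2 intrinsics.  */
#include <emmintrin.h>
#include <stdint.h>
#include <string.h>

char *
strcpy_sketch (char *dst, const char *src)
{
  char *ret = dst;
  for (;;)
    {
      /* Only do a 16-byte unaligned load when it cannot cross a page.  */
      if (((uintptr_t) src & 4095) <= 4096 - 16)
	{
	  __m128i chunk = _mm_loadu_si128 ((const __m128i *) src);
	  __m128i eq0 = _mm_cmpeq_epi8 (chunk, _mm_setzero_si128 ());
	  int mask = _mm_movemask_epi8 (eq0); /* like pcmpeqb + pmovmskb */
	  if (mask == 0)
	    {
	      /* No terminator in this chunk: copy it whole and continue.  */
	      _mm_storeu_si128 ((__m128i *) dst, chunk);
	      src += 16;
	      dst += 16;
	      continue;
	    }
	  /* Terminator found: copy it and everything before it.  */
	  size_t len = __builtin_ctz (mask);
	  memcpy (dst, src, len + 1);
	  return ret;
	}
      /* Too close to a page boundary: copy byte by byte instead.  */
      if ((*dst++ = *src++) == '\0')
	return ret;
    }
}

The actual assembly differs mainly in checking 64 bytes per iteration,
realigning the source to 64 bytes after the header, and handling the
near-page-end case by reading from an aligned 64-byte block and shifting
the resulting mask.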
Results were obtained with the following benchmark:
http://kam.mff.cuni.cz/~ondra/benchmark_string/strcpy_profile.html
http://kam.mff.cuni.cz/~ondra/benchmark_string/strcpy_profile90913.tar.bz2
* sysdeps/x86_64/multiarch/strcpy-sse2-unaligned-new.S: New implementation.
* sysdeps/x86_64/multiarch/stpcpy-sse2-unaligned.S: Use new implementation.
* sysdeps/x86_64/multiarch/strcat-sse2-unaligned.S: Do a tail call
to __strcpy_sse2_unaligned_tail.
---
sysdeps/x86_64/multiarch/Makefile | 2 +-
sysdeps/x86_64/multiarch/stpcpy-sse2-unaligned.S | 2 +-
sysdeps/x86_64/multiarch/strcat-sse2-unaligned.S | 6 +-
.../x86_64/multiarch/strcpy-sse2-unaligned-new.S | 280 +++++++++++++++++++++
4 files changed, 287 insertions(+), 3 deletions(-)
create mode 100644 sysdeps/x86_64/multiarch/strcpy-sse2-unaligned-new.S
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index 5ab950a..a1cb692 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -14,7 +14,7 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 \
memmove-ssse3-back strcasestr-nonascii strcasecmp_l-ssse3 \
strncase_l-ssse3 strcat-ssse3 strncat-ssse3\
strcpy-ssse3 strncpy-ssse3 stpcpy-ssse3 stpncpy-ssse3 \
- strcpy-sse2-unaligned strncpy-sse2-unaligned \
+ strcpy-sse2-unaligned-new strncpy-sse2-unaligned \
stpcpy-sse2-unaligned stpncpy-sse2-unaligned \
strcat-sse2-unaligned strncat-sse2-unaligned \
strrchr-sse2-no-bsf strchr-sse2-no-bsf memcmp-ssse3
diff --git a/sysdeps/x86_64/multiarch/stpcpy-sse2-unaligned.S b/sysdeps/x86_64/multiarch/stpcpy-sse2-unaligned.S
index 34231f8..160bf7d 100644
--- a/sysdeps/x86_64/multiarch/stpcpy-sse2-unaligned.S
+++ b/sysdeps/x86_64/multiarch/stpcpy-sse2-unaligned.S
@@ -1,3 +1,3 @@
#define USE_AS_STPCPY
#define STRCPY __stpcpy_sse2_unaligned
-#include "strcpy-sse2-unaligned.S"
+#include "strcpy-sse2-unaligned-new.S"
diff --git a/sysdeps/x86_64/multiarch/strcat-sse2-unaligned.S b/sysdeps/x86_64/multiarch/strcat-sse2-unaligned.S
index 028c6d3..d26db8a 100644
--- a/sysdeps/x86_64/multiarch/strcat-sse2-unaligned.S
+++ b/sysdeps/x86_64/multiarch/strcat-sse2-unaligned.S
@@ -273,7 +273,11 @@ L(StartStrcpyPart):
test %r8, %r8
jz L(ExitZero)
# define USE_AS_STRNCPY
+# include "strcpy-sse2-unaligned.S"
+
+# else
+ jmp __strcpy_sse2_unaligned_tail
+ END (STRCAT)
# endif
-# include "strcpy-sse2-unaligned.S"
#endif
diff --git a/sysdeps/x86_64/multiarch/strcpy-sse2-unaligned-new.S b/sysdeps/x86_64/multiarch/strcpy-sse2-unaligned-new.S
new file mode 100644
index 0000000..ac9ac55
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strcpy-sse2-unaligned-new.S
@@ -0,0 +1,280 @@
+/* strcpy with SSE2 and unaligned load
+ Copyright (C) 2013 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#ifndef NOT_IN_libc
+
+# include <sysdep.h>
+
+# ifndef STRCPY
+# define STRCPY __strcpy_sse2_unaligned
+# endif
+
+# define ALIGN(x) .p2align x
+
+
+ENTRY (STRCPY)
+ movq %rsi, %rdx
+ pxor %xmm4, %xmm4
+ movq %rdi, %rax
+ pxor %xmm5, %xmm5
+ andl $4095, %edx
+ pxor %xmm6, %xmm6
+ cmpq $4032, %rdx
+ ja L(cross_page)
+ movdqu (%rsi), %xmm1
+ pxor %xmm7, %xmm7
+ movdqu 16(%rsi), %xmm2
+ pcmpeqb %xmm1, %xmm4
+ pmovmskb %xmm4, %ecx
+ pcmpeqb %xmm2, %xmm5
+ pmovmskb %xmm5, %edx
+ salq $16, %rdx
+ orq %rcx, %rdx
+ jne L(less_32_bytes)
+ movdqu 32(%rsi), %xmm3
+ movdqu 48(%rsi), %xmm4
+ pcmpeqb %xmm3, %xmm6
+ pcmpeqb %xmm4, %xmm7
+ pmovmskb %xmm6, %edx
+ pmovmskb %xmm7, %ecx
+ salq $32, %rdx
+ salq $48, %rcx
+ orq %rcx, %rdx
+ jne L(less_64_bytes)
+ movdqu %xmm1, (%rdi)
+ movdqu %xmm2, 16(%rdi)
+ movdqu %xmm3, 32(%rdi)
+ movdqu %xmm4, 48(%rdi)
+L(prepare_loop):
+ leaq 64(%rsi), %rdx
+ andq $-64, %rdx
+ addq %rdx, %rdi
+ pxor %xmm5, %xmm5
+ subq %rsi, %rdi
+ movq %rdx, %rsi
+ jmp L(loop_entry)
+
+ ALIGN (4)
+L(loop):
+ movdqu %xmm1, (%rdi)
+ addq $64, %rsi
+ movdqu %xmm2, 16(%rdi)
+ movdqu %xmm3, 32(%rdi)
+ movdqu %xmm4, 48(%rdi)
+ addq $64, %rdi
+L(loop_entry):
+ movdqa 32(%rsi), %xmm3
+ movdqa 48(%rsi), %xmm4
+ movdqa %xmm3, %xmm0
+ movdqa 16(%rsi), %xmm2
+ pminub %xmm4, %xmm0
+ movdqa (%rsi), %xmm1
+ pminub %xmm2, %xmm0
+ pminub %xmm1, %xmm0
+ pcmpeqb %xmm5, %xmm0
+ pmovmskb %xmm0, %edx
+ testq %rdx, %rdx
+ je L(loop)
+
+ salq $48, %rdx
+ pcmpeqb %xmm1, %xmm5
+ pcmpeqb %xmm2, %xmm6
+ pmovmskb %xmm5, %ecx
+ pmovmskb %xmm6, %r8d
+ pcmpeqb %xmm3, %xmm7
+ orq %rcx, %rdx
+ pmovmskb %xmm7, %r9d
+ salq $16, %r8
+ orq %r8, %rdx
+ salq $32, %r9
+ orq %r9, %rdx
+ bsfq %rdx, %rcx
+#ifdef USE_AS_STPCPY
+ lea (%rdi, %rcx), %rax
+#endif
+ cmpq $32, %rcx
+ jb L(less_32_bytes)
+ movdqu -31(%rsi,%rcx), %xmm3
+ movdqu -15(%rsi,%rcx), %xmm4
+ movdqu %xmm1, (%rdi)
+ movdqu %xmm2, 16(%rdi)
+ movdqu %xmm3, -31(%rdi,%rcx)
+ movdqu %xmm4, -15(%rdi,%rcx)
+ ret
+
+
+ ALIGN (3)
+L(between_16_31_bytes):
+ movdqu -15(%rsi,%rdx), %xmm2
+ movdqu %xmm1, (%rdi)
+ movdqu %xmm2, -15(%rdi,%rdx)
+ ret
+
+ ALIGN (3)
+L(less_32_bytes):
+ bsfq %rdx, %rdx
+#ifdef USE_AS_STPCPY
+ lea (%rdi, %rdx), %rax
+#endif
+ cmpq $15, %rdx
+ jae L(between_16_31_bytes)
+ cmpq $7, %rdx
+ jae L(between_8_15_bytes)
+ cmpq $3, %rdx
+ jae L(between_4_7_bytes)
+ cmpq $1, %rdx
+ jb L(between_1_1_bytes) /* We need to write terminating zero. */
+ movzwl -1(%rsi,%rdx), %ecx
+ movzwl (%rsi), %esi
+ movw %si, (%rdi)
+ movw %cx, -1(%rdi,%rdx)
+ ret
+
+ ALIGN (3)
+L(less_64_bytes):
+ bsfq %rdx, %rdx
+#ifdef USE_AS_STPCPY
+ lea (%rdi, %rdx), %rax
+#endif
+ movdqu -31(%rsi,%rdx), %xmm3
+ movdqu -15(%rsi,%rdx), %xmm0
+ movdqu %xmm1, (%rdi)
+ movdqu %xmm2, 16(%rdi)
+ movdqu %xmm3, -31(%rdi,%rdx)
+ movdqu %xmm0, -15(%rdi,%rdx)
+ ret
+
+ ALIGN (3)
+L(between_8_15_bytes):
+ movq -7(%rsi,%rdx), %rcx
+ movq (%rsi), %rsi
+ movq %rsi, (%rdi)
+ movq %rcx, -7(%rdi,%rdx)
+ ret
+
+ ALIGN (3)
+L(between_4_7_bytes):
+ movl -3(%rsi,%rdx), %ecx
+ movl (%rsi), %esi
+ movl %esi, (%rdi)
+ movl %ecx, -3(%rdi,%rdx)
+ ret
+
+L(between_1_1_bytes):
+ movzbl (%rsi), %edx
+ movb %dl, (%rdi)
+ ret
+
+ ALIGN(4)
+L(cross_page):
+ movq %rsi, %rcx
+ pxor %xmm0, %xmm0
+ andq $-64, %rcx
+ movabsq $-9223372036854775808, %r10
+ movdqa (%rcx), %xmm4
+ movdqa 16(%rcx), %xmm3
+ pcmpeqb %xmm0, %xmm4
+ movdqa 32(%rcx), %xmm2
+ pcmpeqb %xmm0, %xmm3
+ pmovmskb %xmm4, %edx
+ movdqa 48(%rcx), %xmm1
+ pcmpeqb %xmm0, %xmm2
+ pcmpeqb %xmm0, %xmm1
+ orq %r10, %rdx
+ pmovmskb %xmm3, %r10d
+ pmovmskb %xmm2, %r9d
+ salq $16, %r10
+ orq %r10, %rdx
+ pmovmskb %xmm1, %r8d
+ salq $32, %r9
+ orq %r9, %rdx
+ salq $48, %r8
+ orq %r8, %rdx
+ movq %rsi, %r10
+ subq %rcx, %r10
+ movq %r10, %rcx
+ shrq %cl, %rdx
+ bsfq %rdx, %rdx
+#ifdef USE_AS_STPCPY
+ lea (%rdi, %rdx), %rax
+#endif
+ cmpq $15, %rdx
+ jbe L(copy_less_16_bytes)
+ cmpq $31, %rdx
+ jbe L(copy_16_32_bytes)
+ movdqu (%rsi), %xmm3
+ movdqu 16(%rsi), %xmm2
+ movdqu -31(%rsi,%rdx), %xmm1
+ movdqu -15(%rsi,%rdx), %xmm0
+ movdqu %xmm3, (%rdi)
+ movdqu %xmm2, 16(%rdi)
+ movdqu %xmm1, -31(%rdi,%rdx)
+ movdqu %xmm0, -15(%rdi,%rdx)
+L(copied_cross_page):
+ pxor %xmm4, %xmm4
+ pxor %xmm5, %xmm5
+ pxor %xmm6, %xmm6
+ pxor %xmm7, %xmm7
+ cmpb $0, (%rsi,%rdx)
+ jne L(prepare_loop)
+ ret
+
+ ALIGN (3)
+L(copy_less_16_bytes):
+ cmpq $7, %rdx
+ jae L(copy_8_15_bytes)
+ cmpq $3, %rdx
+ jae L(copy_4_7_bytes)
+ cmpq $1, %rdx
+ jb L(copy_1_byte)
+ movzwl (%rsi), %ecx
+ movw %cx, (%rdi)
+ movzwl -1(%rsi,%rdx), %ecx
+ movw %cx, -1(%rdi,%rdx)
+ jmp L(copied_cross_page)
+
+ ALIGN (3)
+L(copy_16_32_bytes):
+ movdqu (%rsi), %xmm1
+ movdqu -15(%rsi,%rdx), %xmm0
+ movdqu %xmm1, (%rdi)
+ movdqu %xmm0, -15(%rdi,%rdx)
+ jmp L(copied_cross_page)
+
+L(copy_8_15_bytes):
+ movq (%rsi), %r9
+ movq -7(%rsi,%rdx), %rcx
+ movq %r9, (%rdi)
+ movq %rcx, -7(%rdi,%rdx)
+ jmp L(copied_cross_page)
+
+L(copy_4_7_bytes):
+ movl (%rsi), %r9d
+ movl -3(%rsi,%rdx), %ecx
+ movl %r9d, (%rdi)
+ movl %ecx, -3(%rdi,%rdx)
+ jmp L(copied_cross_page)
+
+L(copy_1_byte):
+ movzbl (%rsi), %ecx
+ movb %cl, (%rdi)
+ jmp L(copied_cross_page)
+
+END (STRCPY)
+
+#endif
--
1.8.3.2