This is the mail archive of the
libc-alpha@sourceware.org
mailing list for the glibc project.
PATCH: Optimize strchr/strrchr with SSE4.2
- From: "H.J. Lu" <hongjiu dot lu at intel dot com>
- To: GNU C Library <libc-alpha at sourceware dot org>
- Date: Thu, 22 Oct 2009 11:07:06 -0700
- Subject: PATCH: Optimize strchr/strrchr with SSE4.2
- Reply-to: "H.J. Lu" <hjl dot tools at gmail dot com>
Hi,
This patch adds SSE4.2 optimized strchr/strrchr. It can speed up
strchr/strrchr by up to 2X on Intel Core i7.
Thanks.
H.J.
---
2009-10-21 H.J. Lu <hongjiu.lu@intel.com>
* sysdeps/x86_64/multiarch/Makefile (sysdep_routines): Add
strend-sse4.
* sysdeps/x86_64/multiarch/strchr.S: New.
* sysdeps/x86_64/multiarch/strend-sse4.S: Likewise.
* sysdeps/x86_64/multiarch/strrchr.S: Likewise.
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index 0ded3b3..364e7bb 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -4,7 +4,8 @@ gen-as-const-headers += ifunc-defines.sym
endif
ifeq ($(subdir),string)
-sysdep_routines += stpncpy-c strncpy-c strcmp-ssse3 strncmp-ssse3
+sysdep_routines += stpncpy-c strncpy-c strcmp-ssse3 strncmp-ssse3 \
+ strend-sse4
ifeq (yes,$(config-cflags-sse4))
sysdep_routines += strcspn-c strpbrk-c strspn-c strstr-c strcasestr-c
CFLAGS-strcspn-c.c += -msse4
diff --git a/sysdeps/x86_64/multiarch/strchr.S b/sysdeps/x86_64/multiarch/strchr.S
new file mode 100644
index 0000000..a77cc1c
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strchr.S
@@ -0,0 +1,177 @@
+/* strchr with SSE4.2
+ Copyright (C) 2009 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, write to the Free
+ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ 02111-1307 USA. */
+
+#include <sysdep.h>
+#include <ifunc-defines.h>
+
+
+/* Define multiple versions only for the definition in libc. */
+#ifndef NOT_IN_libc
+ .text
+ENTRY(strchr)
+ .type strchr, @gnu_indirect_function
+ cmpl $0, __cpu_features+KIND_OFFSET(%rip)
+ jne 1f
+ call __init_cpu_features
+1: leaq __strchr_sse2(%rip), %rax
+ testl $(1<<20), __cpu_features+CPUID_OFFSET+COMMON_CPUID_INDEX_1*CPUID_SIZE+CPUID_ECX_OFFSET(%rip)
+ jz 2f
+ leaq __strchr_sse42(%rip), %rax
+2: ret
+END(strchr)
+
+
+/*
+ This implementation uses SSE4 instructions to compare up to 16 bytes
+ at a time looking for the first occurrence of the character c in the
+ string s:
+
+ char *strchr (const char *s, int c);
+
+ We use 0xa:
+ _SIDD_SBYTE_OPS
+ | _SIDD_CMP_EQUAL_EACH
+ | _SIDD_LEAST_SIGNIFICANT
+ on pcmpistri to compare xmm/mem128
+
+ 0 1 2 3 4 5 6 7 8 9 A B C D E F
+ X X X X X X X X X X X X X X X X
+
+ against xmm
+
+ 0 1 2 3 4 5 6 7 8 9 A B C D E F
+ C C C C C C C C C C C C C C C C
+
+ to find out if the first 16byte data element has a byte C and the
+ offset of the first byte. There are 3 cases:
+
+ 1. The first 16byte data element has the byte C at the offset X.
+ 2. The first 16byte data element has EOS and doesn't have the byte C.
+ 3. The first 16byte data element is valid and doesn't have the byte C.
+
+ Here is the table of ECX, CFlag, ZFlag and SFlag for 3 cases:
+
+ case ECX CFlag ZFlag SFlag
+ 1 X 1 0/1 0
+ 2 16 0 1 0
+ 3 16 0 0 0
+
+ We exit from the loop for cases 1 and 2 with jbe which branches
+ when either CFlag or ZFlag is 1. If CFlag == 1, ECX has the offset
+ X for case 1. */
+
+ .section .text.sse4.2,"ax",@progbits
+ .align 16
+ .type __strchr_sse42, @function
+__strchr_sse42:
+ cfi_startproc
+ CALL_MCOUNT
+ testb %sil, %sil
+ je __strend_sse4
+ pxor %xmm2, %xmm2
+ movd %esi, %xmm1
+ movl %edi, %ecx
+ andl $15, %ecx
+ movq %rdi, %r8
+ je L(aligned_start)
+
+/* Handle unaligned string. */
+ andq $-16, %r8
+ pshufb %xmm2, %xmm1
+ movdqa (%r8), %xmm0
+ pcmpeqb %xmm0, %xmm2
+ pcmpeqb %xmm1, %xmm0
+ /* Find where NULL is. */
+ pmovmskb %xmm2, %edx
+ /* Check if there is a match. */
+ pmovmskb %xmm0, %esi
+ /* Remove the leading bytes. */
+ sarl %cl, %edx
+ sarl %cl, %esi
+ testl %esi, %esi
+ je L(unaligned_no_match)
+ /* Check which byte is a match. */
+ bsfl %esi, %eax
+ /* Is there a NULL? */
+ testl %edx, %edx
+ je L(unaligned_match)
+ bsfl %edx, %esi
+ cmpl %esi, %eax
+ /* Return NULL if NULL comes first. */
+ ja L(return_null)
+L(unaligned_match):
+ addq %rdi, %rax
+ ret
+
+ .p2align 4
+L(unaligned_no_match):
+ testl %edx, %edx
+ jne L(return_null)
+
+/* Loop start on aligned string. */
+L(loop):
+ addq $16, %r8
+L(aligned_start):
+ pcmpistri $0x2, (%r8), %xmm1
+ jbe L(wrap)
+ addq $16, %r8
+ pcmpistri $0x2, (%r8), %xmm1
+ jbe L(wrap)
+ addq $16, %r8
+ pcmpistri $0x2, (%r8), %xmm1
+ jbe L(wrap)
+ addq $16, %r8
+ pcmpistri $0x2, (%r8), %xmm1
+ jbe L(wrap)
+ jmp L(loop)
+L(wrap):
+ jc L(loop_exit)
+
+/* Return NULL. */
+L(return_null):
+ xorl %eax, %eax
+ ret
+
+/* Loop exit. */
+ .p2align 4
+L(loop_exit):
+ leaq (%r8,%rcx), %rax
+ ret
+ cfi_endproc
+ .size __strchr_sse42, .-__strchr_sse42
+
+
+# undef ENTRY
+# define ENTRY(name) \
+ .type __strchr_sse2, @function; \
+ .align 16; \
+ __strchr_sse2: cfi_startproc; \
+ CALL_MCOUNT
+# undef END
+# define END(name) \
+ cfi_endproc; .size __strchr_sse2, .-__strchr_sse2
+# undef libc_hidden_builtin_def
+/* It doesn't make sense to send libc-internal strchr calls through a PLT.
+ The speedup we get from using SSE4.2 instruction is likely eaten away
+ by the indirect call in the PLT. */
+# define libc_hidden_builtin_def(name) \
+ .globl __GI_strchr; __GI_strchr = __strchr_sse2
+#endif
+
+#include "../strchr.S"
diff --git a/sysdeps/x86_64/multiarch/strend-sse4.S b/sysdeps/x86_64/multiarch/strend-sse4.S
new file mode 100644
index 0000000..c3220b3
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strend-sse4.S
@@ -0,0 +1,49 @@
+/* Return the pointer to the end of string, using SSE4.2
+ Copyright (C) 2009 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, write to the Free
+ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ 02111-1307 USA. */
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
+ .section .text.sse4.2,"ax",@progbits
+ENTRY (__strend_sse4)
+ pxor %xmm2, %xmm2
+ movq %rdi, %rcx
+ andq $~15, %rdi
+ movdqa %xmm2, %xmm1
+ pcmpeqb (%rdi), %xmm2
+ orl $0xffffffff, %esi
+ subq %rdi, %rcx
+ shll %cl, %esi
+ pmovmskb %xmm2, %edx
+ andl %esi, %edx
+ jnz 1f
+
+2: pcmpistri $0x08, 16(%rdi), %xmm1
+ leaq 16(%rdi), %rdi
+ jnz 2b
+
+ leaq (%rdi,%rcx), %rax
+ ret
+
+1: bsfl %edx, %eax
+ addq %rdi, %rax
+ ret
+
+END (__strend_sse4)
diff --git a/sysdeps/x86_64/multiarch/strrchr.S b/sysdeps/x86_64/multiarch/strrchr.S
new file mode 100644
index 0000000..a8c28a4
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strrchr.S
@@ -0,0 +1,278 @@
+/* strrchr with SSE4.2
+ Copyright (C) 2009 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, write to the Free
+ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ 02111-1307 USA. */
+
+#include <sysdep.h>
+#include <ifunc-defines.h>
+
+
+/* Define multiple versions only for the definition in libc and for
+ the DSO. In static binaries we need strrchr before the initialization
+ happened. */
+#if defined SHARED && !defined NOT_IN_libc
+ .text
+ENTRY(strrchr)
+ .type strrchr, @gnu_indirect_function
+ cmpl $0, __cpu_features+KIND_OFFSET(%rip)
+ jne 1f
+ call __init_cpu_features
+1: leaq __strrchr_sse2(%rip), %rax
+ testl $(1<<20), __cpu_features+CPUID_OFFSET+COMMON_CPUID_INDEX_1*CPUID_SIZE+CPUID_ECX_OFFSET(%rip)
+ jz 2f
+ leaq __strrchr_sse42(%rip), %rax
+2: ret
+END(strrchr)
+
+/*
+ This implementation uses SSE4 instructions to compare up to 16 bytes
+ at a time looking for the last occurrence of the character c in the
+ string s:
+
+ char *strrchr (const char *s, int c);
+
+ We use 0x4a:
+ _SIDD_SBYTE_OPS
+ | _SIDD_CMP_EQUAL_EACH
+ | _SIDD_MOST_SIGNIFICANT
+ on pcmpistri to compare xmm/mem128
+
+ 0 1 2 3 4 5 6 7 8 9 A B C D E F
+ X X X X X X X X X X X X X X X X
+
+ against xmm
+
+ 0 1 2 3 4 5 6 7 8 9 A B C D E F
+ C C C C C C C C C C C C C C C C
+
+ to find out if the first 16byte data element has a byte C and the
+ last offset. There are 4 cases:
+
+ 1. The first 16byte data element has EOS and has the byte C at the
+ last offset X.
+ 2. The first 16byte data element is valid and has the byte C at the
+ last offset X.
+ 3. The first 16byte data element has EOS and doesn't have the byte C.
+ 4. The first 16byte data element is valid and doesn't have the byte C.
+
+ Here is the table of ECX, CFlag, ZFlag and SFlag for 3 cases:
+
+ case ECX CFlag ZFlag SFlag
+ 1 X 1 1 0
+ 2 X 1 0 0
+ 3 16 0 1 0
+ 4 16 0 0 0
+
+ We exit from the loop for cases 1 and 3 with jz which branches
+ when ZFlag is 1. If CFlag == 1, ECX has the offset X for case 1. */
+
+
+ .section .text.sse4.2,"ax",@progbits
+ .align 16
+ .type __strrchr_sse42, @function
+__strrchr_sse42:
+ cfi_startproc
+ CALL_MCOUNT
+ testb %sil, %sil
+ je __strend_sse4
+ xor %eax,%eax /* RAX has the last occurrence of s. */
+ movd %esi, %xmm1
+ punpcklbw %xmm1, %xmm1
+ movl %edi, %esi
+ punpcklbw %xmm1, %xmm1
+ andl $15, %esi
+ pshufd $0, %xmm1, %xmm1
+ movq %rdi, %r8
+ je L(loop)
+
+/* Handle unaligned string using psrldq. */
+ leaq L(psrldq_table)(%rip), %rdx
+ andq $-16, %r8
+ movslq (%rdx,%rsi,4),%r9
+ movdqa (%r8), %xmm0
+ addq %rdx, %r9
+ jmp *%r9
+
+/* Handle unaligned string with offset 1 using psrldq. */
+ .p2align 4
+L(psrldq_1):
+ psrldq $1, %xmm0
+
+ .p2align 4
+L(unaligned_pcmpistri):
+ pcmpistri $0x4a, %xmm1, %xmm0
+ jnc L(unaligned_no_byte)
+ leaq (%rdi,%rcx), %rax
+L(unaligned_no_byte):
+ /* Find the length of the unaligned string. */
+ pcmpistri $0x3a, %xmm0, %xmm0
+ movl $16, %edx
+ subl %esi, %edx
+ cmpl %ecx, %edx
+ /* Return RAX if the unaligned fragment to next 16B already
+ contain the NULL terminator. */
+ jg L(exit)
+ addq $16, %r8
+
+/* Loop start on aligned string. */
+ .p2align 4
+L(loop):
+ pcmpistri $0x4a, (%r8), %xmm1
+ jbe L(match_or_eos)
+ addq $16, %r8
+ jmp L(loop)
+ .p2align 4
+L(match_or_eos):
+ je L(had_eos)
+L(match_no_eos):
+ leaq (%r8,%rcx), %rax
+ addq $16, %r8
+ jmp L(loop)
+ .p2align 4
+L(had_eos):
+ jnc L(exit)
+ leaq (%r8,%rcx), %rax
+ .p2align 4
+L(exit):
+ ret
+
+/* Handle unaligned string with offset 15 using psrldq. */
+ .p2align 4
+L(psrldq_15):
+ psrldq $15, %xmm0
+ jmp L(unaligned_pcmpistri)
+
+/* Handle unaligned string with offset 14 using psrldq. */
+ .p2align 4
+L(psrldq_14):
+ psrldq $14, %xmm0
+ jmp L(unaligned_pcmpistri)
+
+/* Handle unaligned string with offset 13 using psrldq. */
+ .p2align 4
+L(psrldq_13):
+ psrldq $13, %xmm0
+ jmp L(unaligned_pcmpistri)
+
+/* Handle unaligned string with offset 12 using psrldq. */
+ .p2align 4
+L(psrldq_12):
+ psrldq $12, %xmm0
+ jmp L(unaligned_pcmpistri)
+
+/* Handle unaligned string with offset 11 using psrldq. */
+ .p2align 4
+L(psrldq_11):
+ psrldq $11, %xmm0
+ jmp L(unaligned_pcmpistri)
+
+/* Handle unaligned string with offset 10 using psrldq. */
+ .p2align 4
+L(psrldq_10):
+ psrldq $10, %xmm0
+ jmp L(unaligned_pcmpistri)
+
+/* Handle unaligned string with offset 9 using psrldq. */
+ .p2align 4
+L(psrldq_9):
+ psrldq $9, %xmm0
+ jmp L(unaligned_pcmpistri)
+
+/* Handle unaligned string with offset 8 using psrldq. */
+ .p2align 4
+L(psrldq_8):
+ psrldq $8, %xmm0
+ jmp L(unaligned_pcmpistri)
+
+/* Handle unaligned string with offset 7 using psrldq. */
+ .p2align 4
+L(psrldq_7):
+ psrldq $7, %xmm0
+ jmp L(unaligned_pcmpistri)
+
+/* Handle unaligned string with offset 6 using psrldq. */
+ .p2align 4
+L(psrldq_6):
+ psrldq $6, %xmm0
+ jmp L(unaligned_pcmpistri)
+
+/* Handle unaligned string with offset 5 using psrldq. */
+ .p2align 4
+L(psrldq_5):
+ psrldq $5, %xmm0
+ jmp L(unaligned_pcmpistri)
+
+/* Handle unaligned string with offset 4 using psrldq. */
+ .p2align 4
+L(psrldq_4):
+ psrldq $4, %xmm0
+ jmp L(unaligned_pcmpistri)
+
+/* Handle unaligned string with offset 3 using psrldq. */
+ .p2align 4
+L(psrldq_3):
+ psrldq $3, %xmm0
+ jmp L(unaligned_pcmpistri)
+
+/* Handle unaligned string with offset 2 using psrldq. */
+ .p2align 4
+L(psrldq_2):
+ psrldq $2, %xmm0
+ jmp L(unaligned_pcmpistri)
+
+ cfi_endproc
+ .size __strrchr_sse42, .-__strrchr_sse42
+
+ .section .rodata.sse4.2,"a",@progbits
+ .p2align 4
+L(psrldq_table):
+ .int L(loop) - L(psrldq_table)
+ .int L(psrldq_1) - L(psrldq_table)
+ .int L(psrldq_2) - L(psrldq_table)
+ .int L(psrldq_3) - L(psrldq_table)
+ .int L(psrldq_4) - L(psrldq_table)
+ .int L(psrldq_5) - L(psrldq_table)
+ .int L(psrldq_6) - L(psrldq_table)
+ .int L(psrldq_7) - L(psrldq_table)
+ .int L(psrldq_8) - L(psrldq_table)
+ .int L(psrldq_9) - L(psrldq_table)
+ .int L(psrldq_10) - L(psrldq_table)
+ .int L(psrldq_11) - L(psrldq_table)
+ .int L(psrldq_12) - L(psrldq_table)
+ .int L(psrldq_13) - L(psrldq_table)
+ .int L(psrldq_14) - L(psrldq_table)
+ .int L(psrldq_15) - L(psrldq_table)
+
+
+# undef ENTRY
+# define ENTRY(name) \
+ .type __strrchr_sse2, @function; \
+ .align 16; \
+ __strrchr_sse2: cfi_startproc; \
+ CALL_MCOUNT
+# undef END
+# define END(name) \
+ cfi_endproc; .size __strrchr_sse2, .-__strrchr_sse2
+# undef libc_hidden_builtin_def
+/* It doesn't make sense to send libc-internal strrchr calls through a PLT.
+ The speedup we get from using SSE4.2 instruction is likely eaten away
+ by the indirect call in the PLT. */
+# define libc_hidden_builtin_def(name) \
+ .globl __GI_strrchr; __GI_strrchr = __strrchr_sse2
+#endif
+
+#include "../strrchr.S"