From dc1a95c730699bdccbafa85f189b814107f409b5 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Ond=C5=99ej=20B=C3=ADlka?= Date: Thu, 26 Sep 2013 18:54:09 +0200 Subject: [PATCH] Faster strrchr. --- ChangeLog | 9 + sysdeps/x86_64/multiarch/Makefile | 4 +- sysdeps/x86_64/multiarch/ifunc-impl-list.c | 6 - sysdeps/x86_64/multiarch/strend-sse4.S | 48 -- .../x86_64/multiarch/strrchr-sse2-no-bsf.S | 555 ------------------ sysdeps/x86_64/multiarch/strrchr.S | 288 --------- sysdeps/x86_64/strrchr.S | 241 ++++++-- 7 files changed, 208 insertions(+), 943 deletions(-) delete mode 100644 sysdeps/x86_64/multiarch/strend-sse4.S delete mode 100644 sysdeps/x86_64/multiarch/strrchr-sse2-no-bsf.S delete mode 100644 sysdeps/x86_64/multiarch/strrchr.S diff --git a/ChangeLog b/ChangeLog index 0fa412cdf4..7025fa9ca9 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,12 @@ +2013-09-26 Ondřej Bílka + + * sysdeps/x86_64/multiarch/Makefile (sysdep_routines): Update. + * sysdeps/x86_64/multiarch/ifunc-impl-list.c: Remove strrchr ifunc. + * sysdeps/x86_64/multiarch/strend-sse4.S Remove. + * sysdeps/x86_64/multiarch/strrchr-sse2-no-bsf.S Likewise. + * sysdeps/x86_64/multiarch/strrchr.S: Likewise. + * sysdeps/x86_64/strrchr.S (strrchr): Use optimized implementation. + 2013-09-25 Adhemerval Zanella * sysdeps/powerpc/powerpc64/stackguard-macros.h (POINTER_CHK_GUARD: diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile index 5ab950a53f..9fd0fd64c5 100644 --- a/sysdeps/x86_64/multiarch/Makefile +++ b/sysdeps/x86_64/multiarch/Makefile @@ -8,7 +8,7 @@ ifeq ($(subdir),string) sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 \ strcmp-sse2-unaligned strncmp-ssse3 \ - strend-sse4 memcmp-sse4 memcpy-ssse3 \ + memcmp-sse4 memcpy-ssse3 \ memcpy-sse2-unaligned mempcpy-ssse3 \ memmove-ssse3 memcpy-ssse3-back mempcpy-ssse3-back \ memmove-ssse3-back strcasestr-nonascii strcasecmp_l-ssse3 \ @@ -17,7 +17,7 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 \ strcpy-sse2-unaligned strncpy-sse2-unaligned \ stpcpy-sse2-unaligned stpncpy-sse2-unaligned \ strcat-sse2-unaligned strncat-sse2-unaligned \ - strrchr-sse2-no-bsf strchr-sse2-no-bsf memcmp-ssse3 + strchr-sse2-no-bsf memcmp-ssse3 ifeq (yes,$(config-cflags-sse4)) sysdep_routines += strcspn-c strpbrk-c strspn-c strstr-c strcasestr-c varshift CFLAGS-varshift.c += -msse4 diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c index 1a65ac04ff..71beab82e4 100644 --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c @@ -176,12 +176,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, __strpbrk_sse42) IFUNC_IMPL_ADD (array, i, strpbrk, 1, __strpbrk_sse2)) - /* Support sysdeps/x86_64/multiarch/strrchr.S. */ - IFUNC_IMPL (i, name, strrchr, - IFUNC_IMPL_ADD (array, i, strrchr, HAS_SSE4_2, - __strrchr_sse42) - IFUNC_IMPL_ADD (array, i, strrchr, 1, __strrchr_sse2_no_bsf) - IFUNC_IMPL_ADD (array, i, strrchr, 1, __strrchr_sse2)) /* Support sysdeps/x86_64/multiarch/strspn.S. */ IFUNC_IMPL (i, name, strspn, diff --git a/sysdeps/x86_64/multiarch/strend-sse4.S b/sysdeps/x86_64/multiarch/strend-sse4.S deleted file mode 100644 index c5a7ae28a6..0000000000 --- a/sysdeps/x86_64/multiarch/strend-sse4.S +++ /dev/null @@ -1,48 +0,0 @@ -/* Return the pointer to the end of string, using SSE4.2 - Copyright (C) 2009-2013 Free Software Foundation, Inc. - Contributed by Intel Corporation. - This file is part of the GNU C Library. 
- - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - . */ - -#include -#include "asm-syntax.h" - - .section .text.sse4.2,"ax",@progbits -ENTRY (__strend_sse4) - pxor %xmm2, %xmm2 - movq %rdi, %rcx - andq $~15, %rdi - movdqa %xmm2, %xmm1 - pcmpeqb (%rdi), %xmm2 - orl $0xffffffff, %esi - subq %rdi, %rcx - shll %cl, %esi - pmovmskb %xmm2, %edx - andl %esi, %edx - jnz 1f - -2: pcmpistri $0x08, 16(%rdi), %xmm1 - leaq 16(%rdi), %rdi - jnz 2b - - leaq (%rdi,%rcx), %rax - ret - -1: bsfl %edx, %eax - addq %rdi, %rax - ret - -END (__strend_sse4) diff --git a/sysdeps/x86_64/multiarch/strrchr-sse2-no-bsf.S b/sysdeps/x86_64/multiarch/strrchr-sse2-no-bsf.S deleted file mode 100644 index fcef610dbc..0000000000 --- a/sysdeps/x86_64/multiarch/strrchr-sse2-no-bsf.S +++ /dev/null @@ -1,555 +0,0 @@ -/* strrchr with SSE2 without bsf and bsr - Copyright (C) 2011-2013 Free Software Foundation, Inc. - Contributed by Intel Corporation. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - . */ - -#if defined SHARED && !defined NOT_IN_libc - -# include -# include "asm-syntax.h" - - atom_text_section -ENTRY (__strrchr_sse2_no_bsf) - - movd %rsi, %xmm1 - pxor %xmm2, %xmm2 - mov %rdi, %rcx - punpcklbw %xmm1, %xmm1 - punpcklbw %xmm1, %xmm1 - /* ECX has OFFSET. */ - and $63, %rcx - cmp $48, %rcx - pshufd $0, %xmm1, %xmm1 - ja L(crosscache) - -/* unaligned string. */ - movdqu (%rdi), %xmm0 - pcmpeqb %xmm0, %xmm2 - pcmpeqb %xmm1, %xmm0 - /* Find where NULL is. */ - pmovmskb %xmm2, %rcx - /* Check if there is a match. */ - pmovmskb %xmm0, %rax - add $16, %rdi - - test %rax, %rax - jnz L(unaligned_match1) - - test %rcx, %rcx - jnz L(return_null) - - and $-16, %rdi - xor %r8, %r8 - jmp L(loop) - - .p2align 4 -L(unaligned_match1): - test %rcx, %rcx - jnz L(prolog_find_zero_1) - - mov %rax, %r8 - mov %rdi, %rsi - and $-16, %rdi - jmp L(loop) - - .p2align 4 -L(crosscache): -/* Hancle unaligned string. */ - and $15, %rcx - and $-16, %rdi - pxor %xmm3, %xmm3 - movdqa (%rdi), %xmm0 - pcmpeqb %xmm0, %xmm3 - pcmpeqb %xmm1, %xmm0 - /* Find where NULL is. */ - pmovmskb %xmm3, %rdx - /* Check if there is a match. */ - pmovmskb %xmm0, %rax - /* Remove the leading bytes. 
*/ - shr %cl, %rdx - shr %cl, %rax - add $16, %rdi - - test %rax, %rax - jnz L(unaligned_match) - - test %rdx, %rdx - jnz L(return_null) - - xor %r8, %r8 - jmp L(loop) - - .p2align 4 -L(unaligned_match): - test %rdx, %rdx - jnz L(prolog_find_zero) - - mov %rax, %r8 - lea (%rdi, %rcx), %rsi - -/* Loop start on aligned string. */ - .p2align 4 -L(loop): - movdqa (%rdi), %xmm0 - pcmpeqb %xmm0, %xmm2 - add $16, %rdi - pcmpeqb %xmm1, %xmm0 - pmovmskb %xmm2, %rcx - pmovmskb %xmm0, %rax - or %rax, %rcx - jnz L(matches) - - movdqa (%rdi), %xmm0 - pcmpeqb %xmm0, %xmm2 - add $16, %rdi - pcmpeqb %xmm1, %xmm0 - pmovmskb %xmm2, %rcx - pmovmskb %xmm0, %rax - or %rax, %rcx - jnz L(matches) - - movdqa (%rdi), %xmm0 - pcmpeqb %xmm0, %xmm2 - add $16, %rdi - pcmpeqb %xmm1, %xmm0 - pmovmskb %xmm2, %rcx - pmovmskb %xmm0, %rax - or %rax, %rcx - jnz L(matches) - - movdqa (%rdi), %xmm0 - pcmpeqb %xmm0, %xmm2 - add $16, %rdi - pcmpeqb %xmm1, %xmm0 - pmovmskb %xmm2, %rcx - pmovmskb %xmm0, %rax - or %rax, %rcx - jz L(loop) - -L(matches): - test %rax, %rax - jnz L(match) -L(return_value): - test %r8, %r8 - jz L(return_null) - mov %r8, %rax - mov %rsi, %rdi - jmp L(match_exit) - - .p2align 4 -L(match): - pmovmskb %xmm2, %rcx - test %rcx, %rcx - jnz L(find_zero) - mov %rax, %r8 - mov %rdi, %rsi - jmp L(loop) - - .p2align 4 -L(find_zero): - test %cl, %cl - jz L(find_zero_high) - mov %cl, %dl - and $15, %dl - jz L(find_zero_8) - test $0x01, %cl - jnz L(FindZeroExit1) - test $0x02, %cl - jnz L(FindZeroExit2) - test $0x04, %cl - jnz L(FindZeroExit3) - and $1 << 4 - 1, %rax - jz L(return_value) - jmp L(match_exit) - - .p2align 4 -L(find_zero_8): - test $0x10, %cl - jnz L(FindZeroExit5) - test $0x20, %cl - jnz L(FindZeroExit6) - test $0x40, %cl - jnz L(FindZeroExit7) - and $1 << 8 - 1, %rax - jz L(return_value) - jmp L(match_exit) - - .p2align 4 -L(find_zero_high): - mov %ch, %dh - and $15, %dh - jz L(find_zero_high_8) - test $0x01, %ch - jnz L(FindZeroExit9) - test $0x02, %ch - jnz L(FindZeroExit10) - test $0x04, %ch - jnz L(FindZeroExit11) - and $1 << 12 - 1, %rax - jz L(return_value) - jmp L(match_exit) - - .p2align 4 -L(find_zero_high_8): - test $0x10, %ch - jnz L(FindZeroExit13) - test $0x20, %ch - jnz L(FindZeroExit14) - test $0x40, %ch - jnz L(FindZeroExit15) - and $1 << 16 - 1, %rax - jz L(return_value) - jmp L(match_exit) - - .p2align 4 -L(FindZeroExit1): - and $1, %rax - jz L(return_value) - jmp L(match_exit) - - .p2align 4 -L(FindZeroExit2): - and $1 << 2 - 1, %rax - jz L(return_value) - jmp L(match_exit) - - .p2align 4 -L(FindZeroExit3): - and $1 << 3 - 1, %rax - jz L(return_value) - jmp L(match_exit) - - .p2align 4 -L(FindZeroExit5): - and $1 << 5 - 1, %rax - jz L(return_value) - jmp L(match_exit) - - .p2align 4 -L(FindZeroExit6): - and $1 << 6 - 1, %rax - jz L(return_value) - jmp L(match_exit) - - .p2align 4 -L(FindZeroExit7): - and $1 << 7 - 1, %rax - jz L(return_value) - jmp L(match_exit) - - .p2align 4 -L(FindZeroExit9): - and $1 << 9 - 1, %rax - jz L(return_value) - jmp L(match_exit) - - .p2align 4 -L(FindZeroExit10): - and $1 << 10 - 1, %rax - jz L(return_value) - jmp L(match_exit) - - .p2align 4 -L(FindZeroExit11): - and $1 << 11 - 1, %rax - jz L(return_value) - jmp L(match_exit) - - .p2align 4 -L(FindZeroExit13): - and $1 << 13 - 1, %rax - jz L(return_value) - jmp L(match_exit) - - .p2align 4 -L(FindZeroExit14): - and $1 << 14 - 1, %rax - jz L(return_value) - jmp L(match_exit) - - .p2align 4 -L(FindZeroExit15): - and $1 << 15 - 1, %rax - jz L(return_value) - - .p2align 4 -L(match_exit): - test %ah, %ah - 
jnz L(match_exit_high) - mov %al, %dl - and $15 << 4, %dl - jnz L(match_exit_8) - test $0x08, %al - jnz L(Exit4) - test $0x04, %al - jnz L(Exit3) - test $0x02, %al - jnz L(Exit2) - lea -16(%rdi), %rax - ret - - .p2align 4 -L(match_exit_8): - test $0x80, %al - jnz L(Exit8) - test $0x40, %al - jnz L(Exit7) - test $0x20, %al - jnz L(Exit6) - lea -12(%rdi), %rax - ret - - .p2align 4 -L(match_exit_high): - mov %ah, %dh - and $15 << 4, %dh - jnz L(match_exit_high_8) - test $0x08, %ah - jnz L(Exit12) - test $0x04, %ah - jnz L(Exit11) - test $0x02, %ah - jnz L(Exit10) - lea -8(%rdi), %rax - ret - - .p2align 4 -L(match_exit_high_8): - test $0x80, %ah - jnz L(Exit16) - test $0x40, %ah - jnz L(Exit15) - test $0x20, %ah - jnz L(Exit14) - lea -4(%rdi), %rax - ret - - .p2align 4 -L(Exit2): - lea -15(%rdi), %rax - ret - - .p2align 4 -L(Exit3): - lea -14(%rdi), %rax - ret - - .p2align 4 -L(Exit4): - lea -13(%rdi), %rax - ret - - .p2align 4 -L(Exit6): - lea -11(%rdi), %rax - ret - - .p2align 4 -L(Exit7): - lea -10(%rdi), %rax - ret - - .p2align 4 -L(Exit8): - lea -9(%rdi), %rax - ret - - .p2align 4 -L(Exit10): - lea -7(%rdi), %rax - ret - - .p2align 4 -L(Exit11): - lea -6(%rdi), %rax - ret - - .p2align 4 -L(Exit12): - lea -5(%rdi), %rax - ret - - .p2align 4 -L(Exit14): - lea -3(%rdi), %rax - ret - - .p2align 4 -L(Exit15): - lea -2(%rdi), %rax - ret - - .p2align 4 -L(Exit16): - lea -1(%rdi), %rax - ret - -/* Return NULL. */ - .p2align 4 -L(return_null): - xor %rax, %rax - ret - - .p2align 4 -L(prolog_find_zero): - add %rcx, %rdi - mov %rdx, %rcx -L(prolog_find_zero_1): - test %cl, %cl - jz L(prolog_find_zero_high) - mov %cl, %dl - and $15, %dl - jz L(prolog_find_zero_8) - test $0x01, %cl - jnz L(PrologFindZeroExit1) - test $0x02, %cl - jnz L(PrologFindZeroExit2) - test $0x04, %cl - jnz L(PrologFindZeroExit3) - and $1 << 4 - 1, %rax - jnz L(match_exit) - xor %rax, %rax - ret - - .p2align 4 -L(prolog_find_zero_8): - test $0x10, %cl - jnz L(PrologFindZeroExit5) - test $0x20, %cl - jnz L(PrologFindZeroExit6) - test $0x40, %cl - jnz L(PrologFindZeroExit7) - and $1 << 8 - 1, %rax - jnz L(match_exit) - xor %rax, %rax - ret - - .p2align 4 -L(prolog_find_zero_high): - mov %ch, %dh - and $15, %dh - jz L(prolog_find_zero_high_8) - test $0x01, %ch - jnz L(PrologFindZeroExit9) - test $0x02, %ch - jnz L(PrologFindZeroExit10) - test $0x04, %ch - jnz L(PrologFindZeroExit11) - and $1 << 12 - 1, %rax - jnz L(match_exit) - xor %rax, %rax - ret - - .p2align 4 -L(prolog_find_zero_high_8): - test $0x10, %ch - jnz L(PrologFindZeroExit13) - test $0x20, %ch - jnz L(PrologFindZeroExit14) - test $0x40, %ch - jnz L(PrologFindZeroExit15) - and $1 << 16 - 1, %rax - jnz L(match_exit) - xor %rax, %rax - ret - - .p2align 4 -L(PrologFindZeroExit1): - and $1, %rax - jnz L(match_exit) - xor %rax, %rax - ret - - .p2align 4 -L(PrologFindZeroExit2): - and $1 << 2 - 1, %rax - jnz L(match_exit) - xor %rax, %rax - ret - - .p2align 4 -L(PrologFindZeroExit3): - and $1 << 3 - 1, %rax - jnz L(match_exit) - xor %rax, %rax - ret - - .p2align 4 -L(PrologFindZeroExit5): - and $1 << 5 - 1, %rax - jnz L(match_exit) - xor %rax, %rax - ret - - .p2align 4 -L(PrologFindZeroExit6): - and $1 << 6 - 1, %rax - jnz L(match_exit) - xor %rax, %rax - ret - - .p2align 4 -L(PrologFindZeroExit7): - and $1 << 7 - 1, %rax - jnz L(match_exit) - xor %rax, %rax - ret - - .p2align 4 -L(PrologFindZeroExit9): - and $1 << 9 - 1, %rax - jnz L(match_exit) - xor %rax, %rax - ret - - .p2align 4 -L(PrologFindZeroExit10): - and $1 << 10 - 1, %rax - jnz L(match_exit) - xor %rax, %rax - 
ret - - .p2align 4 -L(PrologFindZeroExit11): - and $1 << 11 - 1, %rax - jnz L(match_exit) - xor %rax, %rax - ret - - .p2align 4 -L(PrologFindZeroExit13): - and $1 << 13 - 1, %rax - jnz L(match_exit) - xor %rax, %rax - ret - - .p2align 4 -L(PrologFindZeroExit14): - and $1 << 14 - 1, %rax - jnz L(match_exit) - xor %rax, %rax - ret - - .p2align 4 -L(PrologFindZeroExit15): - and $1 << 15 - 1, %rax - jnz L(match_exit) - xor %rax, %rax - ret - -END (__strrchr_sse2_no_bsf) -#endif diff --git a/sysdeps/x86_64/multiarch/strrchr.S b/sysdeps/x86_64/multiarch/strrchr.S deleted file mode 100644 index 3f92a41ef9..0000000000 --- a/sysdeps/x86_64/multiarch/strrchr.S +++ /dev/null @@ -1,288 +0,0 @@ -/* Multiple versions of strrchr - All versions must be listed in ifunc-impl-list.c. - Copyright (C) 2009-2013 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - . */ - -#include -#include - - -/* Define multiple versions only for the definition in libc and for - the DSO. In static binaries we need strrchr before the initialization - happened. */ -#if defined SHARED && !defined NOT_IN_libc - .text -ENTRY(strrchr) - .type strrchr, @gnu_indirect_function - cmpl $0, __cpu_features+KIND_OFFSET(%rip) - jne 1f - call __init_cpu_features -1: leaq __strrchr_sse2(%rip), %rax - testl $bit_Slow_SSE4_2, __cpu_features+CPUID_OFFSET+index_Slow_SSE4_2(%rip) - jnz 2f - testl $bit_SSE4_2, __cpu_features+CPUID_OFFSET+index_SSE4_2(%rip) - jz 2f - leaq __strrchr_sse42(%rip), %rax - ret -2: testl $bit_Slow_BSF, __cpu_features+FEATURE_OFFSET+index_Slow_BSF(%rip) - jz 3f - leaq __strrchr_sse2_no_bsf(%rip), %rax -3: ret -END(strrchr) - -/* - This implementation uses SSE4 instructions to compare up to 16 bytes - at a time looking for the last occurrence of the character c in the - string s: - - char *strrchr (const char *s, int c); - - We use 0x4a: - _SIDD_SBYTE_OPS - | _SIDD_CMP_EQUAL_EACH - | _SIDD_MOST_SIGNIFICANT - on pcmpistri to compare xmm/mem128 - - 0 1 2 3 4 5 6 7 8 9 A B C D E F - X X X X X X X X X X X X X X X X - - against xmm - - 0 1 2 3 4 5 6 7 8 9 A B C D E F - C C C C C C C C C C C C C C C C - - to find out if the first 16byte data element has a byte C and the - last offset. There are 4 cases: - - 1. The first 16byte data element has EOS and has the byte C at the - last offset X. - 2. The first 16byte data element is valid and has the byte C at the - last offset X. - 3. The first 16byte data element has EOS and doesn't have the byte C. - 4. The first 16byte data element is valid and doesn't have the byte C. - - Here is the table of ECX, CFlag, ZFlag and SFlag for 3 cases: - - case ECX CFlag ZFlag SFlag - 1 X 1 1 0 - 2 X 1 0 0 - 3 16 0 1 0 - 4 16 0 0 0 - - We exit from the loop for cases 1 and 3 with jz which branches - when ZFlag is 1. If CFlag == 1, ECX has the offset X for case 1. 
*/ - - - .section .text.sse4.2,"ax",@progbits - .align 16 - .type __strrchr_sse42, @function - .globl __strrchr_sse42 - .hidden __strrchr_sse42 -__strrchr_sse42: - cfi_startproc - CALL_MCOUNT - testb %sil, %sil - je __strend_sse4 - xor %eax,%eax /* RAX has the last occurrence of s. */ - movd %esi, %xmm1 - punpcklbw %xmm1, %xmm1 - movl %edi, %esi - punpcklbw %xmm1, %xmm1 - andl $15, %esi - pshufd $0, %xmm1, %xmm1 - movq %rdi, %r8 - je L(loop) - -/* Handle unaligned string using psrldq. */ - leaq L(psrldq_table)(%rip), %rdx - andq $-16, %r8 - movslq (%rdx,%rsi,4),%r9 - movdqa (%r8), %xmm0 - addq %rdx, %r9 - jmp *%r9 - -/* Handle unaligned string with offset 1 using psrldq. */ - .p2align 4 -L(psrldq_1): - psrldq $1, %xmm0 - - .p2align 4 -L(unaligned_pcmpistri): - pcmpistri $0x4a, %xmm1, %xmm0 - jnc L(unaligned_no_byte) - leaq (%rdi,%rcx), %rax -L(unaligned_no_byte): - /* Find the length of the unaligned string. */ - pcmpistri $0x3a, %xmm0, %xmm0 - movl $16, %edx - subl %esi, %edx - cmpl %ecx, %edx - /* Return RAX if the unaligned fragment to next 16B already - contain the NULL terminator. */ - jg L(exit) - addq $16, %r8 - -/* Loop start on aligned string. */ - .p2align 4 -L(loop): - pcmpistri $0x4a, (%r8), %xmm1 - jbe L(match_or_eos) - addq $16, %r8 - jmp L(loop) - .p2align 4 -L(match_or_eos): - je L(had_eos) -L(match_no_eos): - leaq (%r8,%rcx), %rax - addq $16, %r8 - jmp L(loop) - .p2align 4 -L(had_eos): - jnc L(exit) - leaq (%r8,%rcx), %rax - .p2align 4 -L(exit): - ret - -/* Handle unaligned string with offset 15 using psrldq. */ - .p2align 4 -L(psrldq_15): - psrldq $15, %xmm0 - jmp L(unaligned_pcmpistri) - -/* Handle unaligned string with offset 14 using psrldq. */ - .p2align 4 -L(psrldq_14): - psrldq $14, %xmm0 - jmp L(unaligned_pcmpistri) - -/* Handle unaligned string with offset 13 using psrldq. */ - .p2align 4 -L(psrldq_13): - psrldq $13, %xmm0 - jmp L(unaligned_pcmpistri) - -/* Handle unaligned string with offset 12 using psrldq. */ - .p2align 4 -L(psrldq_12): - psrldq $12, %xmm0 - jmp L(unaligned_pcmpistri) - -/* Handle unaligned string with offset 11 using psrldq. */ - .p2align 4 -L(psrldq_11): - psrldq $11, %xmm0 - jmp L(unaligned_pcmpistri) - -/* Handle unaligned string with offset 10 using psrldq. */ - .p2align 4 -L(psrldq_10): - psrldq $10, %xmm0 - jmp L(unaligned_pcmpistri) - -/* Handle unaligned string with offset 9 using psrldq. */ - .p2align 4 -L(psrldq_9): - psrldq $9, %xmm0 - jmp L(unaligned_pcmpistri) - -/* Handle unaligned string with offset 8 using psrldq. */ - .p2align 4 -L(psrldq_8): - psrldq $8, %xmm0 - jmp L(unaligned_pcmpistri) - -/* Handle unaligned string with offset 7 using psrldq. */ - .p2align 4 -L(psrldq_7): - psrldq $7, %xmm0 - jmp L(unaligned_pcmpistri) - -/* Handle unaligned string with offset 6 using psrldq. */ - .p2align 4 -L(psrldq_6): - psrldq $6, %xmm0 - jmp L(unaligned_pcmpistri) - -/* Handle unaligned string with offset 5 using psrldq. */ - .p2align 4 -L(psrldq_5): - psrldq $5, %xmm0 - jmp L(unaligned_pcmpistri) - -/* Handle unaligned string with offset 4 using psrldq. */ - .p2align 4 -L(psrldq_4): - psrldq $4, %xmm0 - jmp L(unaligned_pcmpistri) - -/* Handle unaligned string with offset 3 using psrldq. */ - .p2align 4 -L(psrldq_3): - psrldq $3, %xmm0 - jmp L(unaligned_pcmpistri) - -/* Handle unaligned string with offset 2 using psrldq. 
*/ - .p2align 4 -L(psrldq_2): - psrldq $2, %xmm0 - jmp L(unaligned_pcmpistri) - - cfi_endproc - .size __strrchr_sse42, .-__strrchr_sse42 - - .section .rodata.sse4.2,"a",@progbits - .p2align 4 -L(psrldq_table): - .int L(loop) - L(psrldq_table) - .int L(psrldq_1) - L(psrldq_table) - .int L(psrldq_2) - L(psrldq_table) - .int L(psrldq_3) - L(psrldq_table) - .int L(psrldq_4) - L(psrldq_table) - .int L(psrldq_5) - L(psrldq_table) - .int L(psrldq_6) - L(psrldq_table) - .int L(psrldq_7) - L(psrldq_table) - .int L(psrldq_8) - L(psrldq_table) - .int L(psrldq_9) - L(psrldq_table) - .int L(psrldq_10) - L(psrldq_table) - .int L(psrldq_11) - L(psrldq_table) - .int L(psrldq_12) - L(psrldq_table) - .int L(psrldq_13) - L(psrldq_table) - .int L(psrldq_14) - L(psrldq_table) - .int L(psrldq_15) - L(psrldq_table) - - -# undef ENTRY -# define ENTRY(name) \ - .type __strrchr_sse2, @function; \ - .align 16; \ - .globl __strrchr_sse2; \ - .hidden __strrchr_sse2; \ - __strrchr_sse2: cfi_startproc; \ - CALL_MCOUNT -# undef END -# define END(name) \ - cfi_endproc; .size __strrchr_sse2, .-__strrchr_sse2 -# undef libc_hidden_builtin_def -/* It doesn't make sense to send libc-internal strrchr calls through a PLT. - The speedup we get from using SSE4.2 instruction is likely eaten away - by the indirect call in the PLT. */ -# define libc_hidden_builtin_def(name) \ - .globl __GI_strrchr; __GI_strrchr = __strrchr_sse2 -#endif - -#include "../strrchr.S" diff --git a/sysdeps/x86_64/strrchr.S b/sysdeps/x86_64/strrchr.S index e413b07438..514765b87f 100644 --- a/sysdeps/x86_64/strrchr.S +++ b/sysdeps/x86_64/strrchr.S @@ -1,6 +1,5 @@ /* strrchr (str, ch) -- Return pointer to last occurrence of CH in STR. - For AMD x86-64. - Copyright (C) 2009-2013 Free Software Foundation, Inc. + Copyright (C) 2013 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -17,63 +16,217 @@ License along with the GNU C Library; if not, see . 
*/ + #include +# ifndef ALIGN +# define ALIGN(n) .p2align n +# endif + .text ENTRY (strrchr) movd %esi, %xmm1 - movq %rdi, %rcx - punpcklbw %xmm1, %xmm1 - andq $~15, %rdi - pxor %xmm2, %xmm2 - punpcklbw %xmm1, %xmm1 - orl $0xffffffff, %esi - movdqa (%rdi), %xmm0 + movq %rdi, %rax + andl $4095, %eax + punpcklbw %xmm1, %xmm1 + cmpq $4032, %rax + punpcklwd %xmm1, %xmm1 pshufd $0, %xmm1, %xmm1 - subq %rdi, %rcx + ja L(cross_page) + movdqu (%rdi), %xmm0 + pxor %xmm2, %xmm2 movdqa %xmm0, %xmm3 - leaq 16(%rdi), %rdi pcmpeqb %xmm1, %xmm0 pcmpeqb %xmm2, %xmm3 - shl %cl, %esi - pmovmskb %xmm0, %edx - pmovmskb %xmm3, %ecx - andl %esi, %edx - andl %esi, %ecx - xorl %eax, %eax - movl %edx, %esi - orl %ecx, %esi - jnz 1f + pmovmskb %xmm0, %ecx + pmovmskb %xmm3, %edx + testq %rdx, %rdx + je L(next_48_bytes) + leaq -1(%rdx), %rax + xorq %rdx, %rax + andq %rcx, %rax + je L(exit) + bsrq %rax, %rax + addq %rdi, %rax + ret -2: movdqa (%rdi), %xmm0 - leaq 16(%rdi), %rdi - movdqa %xmm0, %xmm3 + ALIGN(4) +L(next_48_bytes): + movdqu 16(%rdi), %xmm4 + movdqa %xmm4, %xmm5 + movdqu 32(%rdi), %xmm3 + pcmpeqb %xmm1, %xmm4 + pcmpeqb %xmm2, %xmm5 + movdqu 48(%rdi), %xmm0 + pmovmskb %xmm5, %edx + movdqa %xmm3, %xmm5 + pcmpeqb %xmm1, %xmm3 + pcmpeqb %xmm2, %xmm5 + pcmpeqb %xmm0, %xmm2 + salq $16, %rdx + pmovmskb %xmm3, %r8d + pmovmskb %xmm5, %eax + pmovmskb %xmm2, %esi + salq $32, %r8 + salq $32, %rax pcmpeqb %xmm1, %xmm0 - pcmpeqb %xmm2, %xmm3 - pmovmskb %xmm0, %edx - pmovmskb %xmm3, %ecx - movl %edx, %esi - orl %ecx, %esi - jz 2b + orq %rdx, %rax + movq %rsi, %rdx + pmovmskb %xmm4, %esi + salq $48, %rdx + salq $16, %rsi + orq %r8, %rsi + orq %rcx, %rsi + pmovmskb %xmm0, %ecx + salq $48, %rcx + orq %rcx, %rsi + orq %rdx, %rax + je L(loop_header2) + leaq -1(%rax), %rcx + xorq %rax, %rcx + andq %rcx, %rsi + je L(exit) + bsrq %rsi, %rsi + leaq (%rdi,%rsi), %rax + ret -1: bsfl %ecx, %r9d - movl $0xffffffff, %r8d - movl $31, %ecx - jnz 5f + ALIGN(4) +L(loop_header2): + testq %rsi, %rsi + movq %rdi, %rcx + je L(no_c_found) +L(loop_header): + addq $64, %rdi + pxor %xmm7, %xmm7 + andq $-64, %rdi + jmp L(loop_entry) + + ALIGN(4) +L(loop64): + testq %rdx, %rdx + cmovne %rdx, %rsi + cmovne %rdi, %rcx + addq $64, %rdi +L(loop_entry): + movdqa 32(%rdi), %xmm3 + pxor %xmm6, %xmm6 + movdqa 48(%rdi), %xmm2 + movdqa %xmm3, %xmm0 + movdqa 16(%rdi), %xmm4 + pminub %xmm2, %xmm0 + movdqa (%rdi), %xmm5 + pminub %xmm4, %xmm0 + pminub %xmm5, %xmm0 + pcmpeqb %xmm7, %xmm0 + pmovmskb %xmm0, %eax + movdqa %xmm5, %xmm0 + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm0, %r9d + movdqa %xmm4, %xmm0 + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm0, %edx + movdqa %xmm3, %xmm0 + pcmpeqb %xmm1, %xmm0 + salq $16, %rdx + pmovmskb %xmm0, %r10d + movdqa %xmm2, %xmm0 + pcmpeqb %xmm1, %xmm0 + salq $32, %r10 + orq %r10, %rdx + pmovmskb %xmm0, %r8d + orq %r9, %rdx + salq $48, %r8 + orq %r8, %rdx + testl %eax, %eax + je L(loop64) + pcmpeqb %xmm6, %xmm4 + pcmpeqb %xmm6, %xmm3 + pcmpeqb %xmm6, %xmm5 + pmovmskb %xmm4, %eax + pmovmskb %xmm3, %r10d + pcmpeqb %xmm6, %xmm2 + pmovmskb %xmm5, %r9d + salq $32, %r10 + salq $16, %rax + pmovmskb %xmm2, %r8d + orq %r10, %rax + orq %r9, %rax + salq $48, %r8 + orq %r8, %rax + leaq -1(%rax), %r8 + xorq %rax, %r8 + andq %r8, %rdx + cmovne %rdi, %rcx + cmovne %rdx, %rsi + bsrq %rsi, %rsi + leaq (%rcx,%rsi), %rax + ret - bsrl %edx, %edx - jz 2b - leaq -16(%rdi,%rdx), %rax - jmp 2b + ALIGN(4) +L(no_c_found): + movl $1, %esi + xorl %ecx, %ecx + jmp L(loop_header) + + ALIGN(4) +L(exit): + xorl %eax, %eax + ret -5: subl %r9d, %ecx - shrl %cl, %r8d - andl 
%r8d, %edx - bsrl %edx, %edx - jz 4f - leaq -16(%rdi,%rdx), %rax -4: ret + ALIGN(4) +L(cross_page): + movq %rdi, %rax + pxor %xmm0, %xmm0 + andq $-64, %rax + movdqu (%rax), %xmm5 + movdqa %xmm5, %xmm6 + movdqu 16(%rax), %xmm4 + pcmpeqb %xmm1, %xmm5 + pcmpeqb %xmm0, %xmm6 + movdqu 32(%rax), %xmm3 + pmovmskb %xmm6, %esi + movdqa %xmm4, %xmm6 + movdqu 48(%rax), %xmm2 + pcmpeqb %xmm1, %xmm4 + pcmpeqb %xmm0, %xmm6 + pmovmskb %xmm6, %edx + movdqa %xmm3, %xmm6 + pcmpeqb %xmm1, %xmm3 + pcmpeqb %xmm0, %xmm6 + pcmpeqb %xmm2, %xmm0 + salq $16, %rdx + pmovmskb %xmm3, %r9d + pmovmskb %xmm6, %r8d + pmovmskb %xmm0, %ecx + salq $32, %r9 + salq $32, %r8 + pcmpeqb %xmm1, %xmm2 + orq %r8, %rdx + salq $48, %rcx + pmovmskb %xmm5, %r8d + orq %rsi, %rdx + pmovmskb %xmm4, %esi + orq %rcx, %rdx + pmovmskb %xmm2, %ecx + salq $16, %rsi + salq $48, %rcx + orq %r9, %rsi + orq %r8, %rsi + orq %rcx, %rsi + movl %edi, %ecx + subl %eax, %ecx + shrq %cl, %rdx + shrq %cl, %rsi + testq %rdx, %rdx + je L(loop_header2) + leaq -1(%rdx), %rax + xorq %rdx, %rax + andq %rax, %rsi + je L(exit) + bsrq %rsi, %rax + addq %rdi, %rax + ret END (strrchr) weak_alias (strrchr, rindex) -- 2.43.5
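
Some notes for reviewers on how the new sysdeps/x86_64/strrchr.S works.
The startup path answers "last occurrence of C at or before the first
NUL" with bit arithmetic instead of byte-wise backtracking: it builds one
bitmask of positions equal to C and one of positions equal to NUL, then
uses the fact that (m - 1) ^ m sets every bit up to and including the
lowest set bit of m.  Roughly, in C with SSE2 intrinsics -- a sketch
only, with illustrative function names that are not glibc interfaces:

#include <emmintrin.h>
#include <stddef.h>

/* Sketch of the first-16-bytes fast path.  Assumes the caller already
   checked (s & 4095) <= 4032, so reading 64 bytes starting at S cannot
   touch the next page.  */
static const char *
strrchr_head_sketch (const char *s, int c)
{
  __m128i ch   = _mm_set1_epi8 ((char) c);
  __m128i zero = _mm_setzero_si128 ();
  __m128i data = _mm_loadu_si128 ((const __m128i *) s);

  unsigned int match = _mm_movemask_epi8 (_mm_cmpeq_epi8 (data, ch));
  unsigned int nul   = _mm_movemask_epi8 (_mm_cmpeq_epi8 (data, zero));

  if (nul == 0)
    /* No terminator in these 16 bytes; the real code continues with
       L(next_48_bytes) and then the aligned loop.  */
    return NULL;

  /* (nul - 1) ^ nul keeps every bit from bit 0 up to and including the
     lowest set bit of nul, e.g. nul = 0b0100100 -> 0b0000111.  The mask
     is inclusive, so c == '\0' correctly finds the terminator itself.  */
  match &= (nul - 1) ^ nul;
  if (match == 0)
    return NULL;

  /* bsr: the highest surviving bit is the last occurrence.  */
  return s + (31 - __builtin_clz (match));
}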
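
The aligned main loop covers 64 bytes per iteration and leans on two
ideas.  First, pminub folds the four 16-byte vectors into one, so a
single pcmpeqb/pmovmskb pair tests the whole block for a NUL byte.
Second, instead of branching on every candidate match, two cmovne
instructions record the mask and address of the most recent block that
contained a match.  A comparable sketch, again with illustrative names
and slightly reordered control flow; the assembly seeds the last-match
pair from the first 64 bytes before entering the loop:

#include <emmintrin.h>
#include <stdint.h>

/* P must be 64-byte aligned, so the block containing the terminator can
   always be read in full without faulting.  LAST_MATCH/LAST_BLOCK carry
   any match already seen in the first 64 bytes (0/NULL if none).  */
static const char *
strrchr_loop_sketch (const char *p, int c,
                     uint64_t last_match, const char *last_block)
{
  __m128i ch   = _mm_set1_epi8 ((char) c);
  __m128i zero = _mm_setzero_si128 ();

  for (;; p += 64)
    {
      __m128i v0 = _mm_load_si128 ((const __m128i *) p);
      __m128i v1 = _mm_load_si128 ((const __m128i *) (p + 16));
      __m128i v2 = _mm_load_si128 ((const __m128i *) (p + 32));
      __m128i v3 = _mm_load_si128 ((const __m128i *) (p + 48));

      /* The byte-wise minimum of the four vectors is zero somewhere
         iff one of the 64 bytes is zero (the pminub trick).  */
      __m128i min = _mm_min_epu8 (_mm_min_epu8 (v0, v1),
                                  _mm_min_epu8 (v2, v3));
      int any_nul = _mm_movemask_epi8 (_mm_cmpeq_epi8 (min, zero));

      uint64_t match =
          (uint64_t) _mm_movemask_epi8 (_mm_cmpeq_epi8 (v0, ch))
        | (uint64_t) _mm_movemask_epi8 (_mm_cmpeq_epi8 (v1, ch)) << 16
        | (uint64_t) _mm_movemask_epi8 (_mm_cmpeq_epi8 (v2, ch)) << 32
        | (uint64_t) _mm_movemask_epi8 (_mm_cmpeq_epi8 (v3, ch)) << 48;

      if (any_nul == 0)
        {
          /* The assembly does this with cmovne, not a branch.  */
          if (match != 0)
            {
              last_match = match;
              last_block = p;
            }
          continue;
        }

      /* Terminator block: keep only matches at or before the first
         NUL, exactly as in the startup path.  */
      uint64_t nul =
          (uint64_t) _mm_movemask_epi8 (_mm_cmpeq_epi8 (v0, zero))
        | (uint64_t) _mm_movemask_epi8 (_mm_cmpeq_epi8 (v1, zero)) << 16
        | (uint64_t) _mm_movemask_epi8 (_mm_cmpeq_epi8 (v2, zero)) << 32
        | (uint64_t) _mm_movemask_epi8 (_mm_cmpeq_epi8 (v3, zero)) << 48;

      match &= (nul - 1) ^ nul;
      if (match != 0)
        {
          last_match = match;
          last_block = p;
        }
      return last_match != 0
             ? last_block + (63 - __builtin_clzll (last_match))
             : NULL;
    }
}

The cmov scheme is also why L(no_c_found) loads the pair (1, 0): with no
recorded match, the final bsrq of 1 yields index 0 and the closing lea
then produces address 0, i.e. NULL, without an extra branch.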
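
Finally, the entry test (andl $4095, %eax; cmpq $4032, %rax) exists
because the startup path reads 64 bytes from an arbitrary pointer: if S
sits in the last 64 bytes of a page, movdqu 48(%rdi) could fault on an
unmapped following page.  L(cross_page) avoids that by reading the
enclosing 64-byte-aligned block, which never crosses a page, and
shifting both masks right by the misalignment so that bit 0 again
corresponds to S[0].  A sketch of that path, under the same caveats as
the ones above:

#include <emmintrin.h>
#include <stdint.h>

static const char *
strrchr_cross_page_sketch (const char *s, int c)
{
  __m128i ch   = _mm_set1_epi8 ((char) c);
  __m128i zero = _mm_setzero_si128 ();
  const char *base = (const char *) ((uintptr_t) s & ~(uintptr_t) 63);
  unsigned int shift = (unsigned int) (s - base);

  __m128i v0 = _mm_load_si128 ((const __m128i *) base);
  __m128i v1 = _mm_load_si128 ((const __m128i *) (base + 16));
  __m128i v2 = _mm_load_si128 ((const __m128i *) (base + 32));
  __m128i v3 = _mm_load_si128 ((const __m128i *) (base + 48));

  uint64_t match =
      (uint64_t) _mm_movemask_epi8 (_mm_cmpeq_epi8 (v0, ch))
    | (uint64_t) _mm_movemask_epi8 (_mm_cmpeq_epi8 (v1, ch)) << 16
    | (uint64_t) _mm_movemask_epi8 (_mm_cmpeq_epi8 (v2, ch)) << 32
    | (uint64_t) _mm_movemask_epi8 (_mm_cmpeq_epi8 (v3, ch)) << 48;
  uint64_t nul =
      (uint64_t) _mm_movemask_epi8 (_mm_cmpeq_epi8 (v0, zero))
    | (uint64_t) _mm_movemask_epi8 (_mm_cmpeq_epi8 (v1, zero)) << 16
    | (uint64_t) _mm_movemask_epi8 (_mm_cmpeq_epi8 (v2, zero)) << 32
    | (uint64_t) _mm_movemask_epi8 (_mm_cmpeq_epi8 (v3, zero)) << 48;

  /* Discard the mask bits for the bytes before S.  */
  match >>= shift;
  nul >>= shift;

  if (nul == 0)
    /* No terminator yet; the real code falls into the aligned loop,
       carrying MATCH as the seed described above.  */
    return NULL;

  match &= (nul - 1) ^ nul;
  if (match == 0)
    return NULL;
  return s + (63 - __builtin_clzll (match));
}

Once the masks are shifted, the tail logic is identical to the startup
path, which is why the assembly can share L(loop_header2) and L(exit)
between the two entries.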