This is the mail archive of the
libc-alpha@sourceware.org
mailing list for the glibc project.
Re: [PATCH v2.1] Fix strrchr regression.
- From: Liubov Dmitrieva <liubov dot dmitrieva at gmail dot com>
- To: Ondřej Bílka <neleai at seznam dot cz>
- Cc: GNU C Library <libc-alpha at sourceware dot org>
- Date: Mon, 2 Sep 2013 14:34:04 +0400
- Subject: Re: [PATCH v2.1] Fix strrchr regression.
- Authentication-results: sourceware.org; auth=none
- References: <20130805173346 dot GA4978 at domone dot kolej dot mff dot cuni dot cz> <20130806083430 dot GA6122 at domone dot kolej dot mff dot cuni dot cz> <20130816110627 dot GA23280 at domone dot kolej dot mff dot cuni dot cz> <20130816121457 dot GA26552 at domone dot kolej dot mff dot cuni dot cz> <20130826134649 dot GB6065 at domone dot kolej dot mff dot cuni dot cz> <20130902092044 dot GC11034 at domone dot kolej dot mff dot cuni dot cz> <CAHjhQ93D6cf8CLcMjuHM6MZB9ou_Bp1z=rm1o9DqwJHs62AzYQ at mail dot gmail dot com> <20130902101701 dot GA12752 at domone dot kolej dot mff dot cuni dot cz>
It's ok for me now.
--
Liubov
Intel Corporation
On Mon, Sep 2, 2013 at 2:17 PM, Ondřej Bílka <neleai@seznam.cz> wrote:
> On Mon, Sep 02, 2013 at 01:39:52PM +0400, Liubov Dmitrieva wrote:
>> It looks good to me except several small issues.
>>
>> You mentioned Intel Corporation. :)
>>
>> > +/* strrchr with SSE2 without bsf and bsr
>> > + Copyright (C) 2011-2013 Free Software Foundation, Inc.
>> > + Contributed by Intel Corporation.
>>
>> And what was the purpose to remove .text?
>> > - .text
>>
>>
>> I don't see what is the purpose to include "asm-syntax.h".
>> > +#include "asm-syntax.h"
>>
> These were left over from a skeleton that I used and forgot to delete.
> Here is fixed version.
>
>
> * sysdeps/x86_64/multiarch/Makefile (sysdep_routines): Update.
> * sysdeps/x86_64/multiarch/ifunc-impl-list.c: Remove strrchr ifunc.
> * sysdeps/x86_64/multiarch/strend-sse4.S Remove.
> * sysdeps/x86_64/multiarch/strrchr-sse2-no-bsf.S Likewise.
> * sysdeps/x86_64/multiarch/strrchr.S: Likewise.
> * sysdeps/x86_64/strrchr.S (strrchr): Use optimized implementation.
>
>
> ---
> sysdeps/x86_64/multiarch/Makefile | 4 +-
> sysdeps/x86_64/multiarch/ifunc-impl-list.c | 6 -
> sysdeps/x86_64/multiarch/strend-sse4.S | 48 ---
> sysdeps/x86_64/multiarch/strrchr-sse2-no-bsf.S | 555 -------------------------
> sysdeps/x86_64/multiarch/strrchr.S | 288 -------------
> sysdeps/x86_64/strrchr.S | 241 +++++++++--
> 6 files changed, 199 insertions(+), 943 deletions(-)
> delete mode 100644 sysdeps/x86_64/multiarch/strend-sse4.S
> delete mode 100644 sysdeps/x86_64/multiarch/strrchr-sse2-no-bsf.S
> delete mode 100644 sysdeps/x86_64/multiarch/strrchr.S
>
> diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
> index 203d16e..b99e8d1 100644
> --- a/sysdeps/x86_64/multiarch/Makefile
> +++ b/sysdeps/x86_64/multiarch/Makefile
> @@ -7,7 +7,7 @@ endif
> ifeq ($(subdir),string)
>
> sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 strncmp-ssse3 \
> - strend-sse4 memcmp-sse4 memcpy-ssse3 memcpy-sse2-unaligned mempcpy-ssse3 \
> + memcmp-sse4 memcpy-ssse3 memcpy-sse2-unaligned mempcpy-ssse3 \
> memmove-ssse3 memcpy-ssse3-back mempcpy-ssse3-back \
> memmove-ssse3-back strcasestr-nonascii strcasecmp_l-ssse3 \
> strncase_l-ssse3 strcat-ssse3 strncat-ssse3\
> @@ -15,7 +15,7 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 strncmp-ssse3 \
> strcpy-sse2-unaligned strncpy-sse2-unaligned \
> stpcpy-sse2-unaligned stpncpy-sse2-unaligned \
> strcat-sse2-unaligned strncat-sse2-unaligned \
> - strrchr-sse2-no-bsf strchr-sse2-no-bsf memcmp-ssse3
> + strchr-sse2-no-bsf memcmp-ssse3
> ifeq (yes,$(config-cflags-sse4))
> sysdep_routines += strcspn-c strpbrk-c strspn-c strstr-c strcasestr-c varshift
> CFLAGS-varshift.c += -msse4
> diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> index d0d95f3..c0a4cfa 100644
> --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> @@ -179,12 +179,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> __strpbrk_sse42)
> IFUNC_IMPL_ADD (array, i, strpbrk, 1, __strpbrk_sse2))
>
> - /* Support sysdeps/x86_64/multiarch/strrchr.S. */
> - IFUNC_IMPL (i, name, strrchr,
> - IFUNC_IMPL_ADD (array, i, strrchr, HAS_SSE4_2,
> - __strrchr_sse42)
> - IFUNC_IMPL_ADD (array, i, strrchr, 1, __strrchr_sse2_no_bsf)
> - IFUNC_IMPL_ADD (array, i, strrchr, 1, __strrchr_sse2))
>
> /* Support sysdeps/x86_64/multiarch/strspn.S. */
> IFUNC_IMPL (i, name, strspn,
> diff --git a/sysdeps/x86_64/multiarch/strend-sse4.S b/sysdeps/x86_64/multiarch/strend-sse4.S
> deleted file mode 100644
> index c5a7ae2..0000000
> --- a/sysdeps/x86_64/multiarch/strend-sse4.S
> +++ /dev/null
> @@ -1,48 +0,0 @@
> -/* Return the pointer to the end of string, using SSE4.2
> - Copyright (C) 2009-2013 Free Software Foundation, Inc.
> - Contributed by Intel Corporation.
> - This file is part of the GNU C Library.
> -
> - The GNU C Library is free software; you can redistribute it and/or
> - modify it under the terms of the GNU Lesser General Public
> - License as published by the Free Software Foundation; either
> - version 2.1 of the License, or (at your option) any later version.
> -
> - The GNU C Library is distributed in the hope that it will be useful,
> - but WITHOUT ANY WARRANTY; without even the implied warranty of
> - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> - Lesser General Public License for more details.
> -
> - You should have received a copy of the GNU Lesser General Public
> - License along with the GNU C Library; if not, see
> - <http://www.gnu.org/licenses/>. */
> -
> -#include <sysdep.h>
> -#include "asm-syntax.h"
> -
> - .section .text.sse4.2,"ax",@progbits
> -ENTRY (__strend_sse4)
> - pxor %xmm2, %xmm2
> - movq %rdi, %rcx
> - andq $~15, %rdi
> - movdqa %xmm2, %xmm1
> - pcmpeqb (%rdi), %xmm2
> - orl $0xffffffff, %esi
> - subq %rdi, %rcx
> - shll %cl, %esi
> - pmovmskb %xmm2, %edx
> - andl %esi, %edx
> - jnz 1f
> -
> -2: pcmpistri $0x08, 16(%rdi), %xmm1
> - leaq 16(%rdi), %rdi
> - jnz 2b
> -
> - leaq (%rdi,%rcx), %rax
> - ret
> -
> -1: bsfl %edx, %eax
> - addq %rdi, %rax
> - ret
> -
> -END (__strend_sse4)
> diff --git a/sysdeps/x86_64/multiarch/strrchr-sse2-no-bsf.S b/sysdeps/x86_64/multiarch/strrchr-sse2-no-bsf.S
> deleted file mode 100644
> index fcef610..0000000
> --- a/sysdeps/x86_64/multiarch/strrchr-sse2-no-bsf.S
> +++ /dev/null
> @@ -1,555 +0,0 @@
> -/* strrchr with SSE2 without bsf and bsr
> - Copyright (C) 2011-2013 Free Software Foundation, Inc.
> - Contributed by Intel Corporation.
> - This file is part of the GNU C Library.
> -
> - The GNU C Library is free software; you can redistribute it and/or
> - modify it under the terms of the GNU Lesser General Public
> - License as published by the Free Software Foundation; either
> - version 2.1 of the License, or (at your option) any later version.
> -
> - The GNU C Library is distributed in the hope that it will be useful,
> - but WITHOUT ANY WARRANTY; without even the implied warranty of
> - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> - Lesser General Public License for more details.
> -
> - You should have received a copy of the GNU Lesser General Public
> - License along with the GNU C Library; if not, see
> - <http://www.gnu.org/licenses/>. */
> -
> -#if defined SHARED && !defined NOT_IN_libc
> -
> -# include <sysdep.h>
> -# include "asm-syntax.h"
> -
> - atom_text_section
> -ENTRY (__strrchr_sse2_no_bsf)
> -
> - movd %rsi, %xmm1
> - pxor %xmm2, %xmm2
> - mov %rdi, %rcx
> - punpcklbw %xmm1, %xmm1
> - punpcklbw %xmm1, %xmm1
> - /* ECX has OFFSET. */
> - and $63, %rcx
> - cmp $48, %rcx
> - pshufd $0, %xmm1, %xmm1
> - ja L(crosscache)
> -
> -/* unaligned string. */
> - movdqu (%rdi), %xmm0
> - pcmpeqb %xmm0, %xmm2
> - pcmpeqb %xmm1, %xmm0
> - /* Find where NULL is. */
> - pmovmskb %xmm2, %rcx
> - /* Check if there is a match. */
> - pmovmskb %xmm0, %rax
> - add $16, %rdi
> -
> - test %rax, %rax
> - jnz L(unaligned_match1)
> -
> - test %rcx, %rcx
> - jnz L(return_null)
> -
> - and $-16, %rdi
> - xor %r8, %r8
> - jmp L(loop)
> -
> - .p2align 4
> -L(unaligned_match1):
> - test %rcx, %rcx
> - jnz L(prolog_find_zero_1)
> -
> - mov %rax, %r8
> - mov %rdi, %rsi
> - and $-16, %rdi
> - jmp L(loop)
> -
> - .p2align 4
> -L(crosscache):
> -/* Hancle unaligned string. */
> - and $15, %rcx
> - and $-16, %rdi
> - pxor %xmm3, %xmm3
> - movdqa (%rdi), %xmm0
> - pcmpeqb %xmm0, %xmm3
> - pcmpeqb %xmm1, %xmm0
> - /* Find where NULL is. */
> - pmovmskb %xmm3, %rdx
> - /* Check if there is a match. */
> - pmovmskb %xmm0, %rax
> - /* Remove the leading bytes. */
> - shr %cl, %rdx
> - shr %cl, %rax
> - add $16, %rdi
> -
> - test %rax, %rax
> - jnz L(unaligned_match)
> -
> - test %rdx, %rdx
> - jnz L(return_null)
> -
> - xor %r8, %r8
> - jmp L(loop)
> -
> - .p2align 4
> -L(unaligned_match):
> - test %rdx, %rdx
> - jnz L(prolog_find_zero)
> -
> - mov %rax, %r8
> - lea (%rdi, %rcx), %rsi
> -
> -/* Loop start on aligned string. */
> - .p2align 4
> -L(loop):
> - movdqa (%rdi), %xmm0
> - pcmpeqb %xmm0, %xmm2
> - add $16, %rdi
> - pcmpeqb %xmm1, %xmm0
> - pmovmskb %xmm2, %rcx
> - pmovmskb %xmm0, %rax
> - or %rax, %rcx
> - jnz L(matches)
> -
> - movdqa (%rdi), %xmm0
> - pcmpeqb %xmm0, %xmm2
> - add $16, %rdi
> - pcmpeqb %xmm1, %xmm0
> - pmovmskb %xmm2, %rcx
> - pmovmskb %xmm0, %rax
> - or %rax, %rcx
> - jnz L(matches)
> -
> - movdqa (%rdi), %xmm0
> - pcmpeqb %xmm0, %xmm2
> - add $16, %rdi
> - pcmpeqb %xmm1, %xmm0
> - pmovmskb %xmm2, %rcx
> - pmovmskb %xmm0, %rax
> - or %rax, %rcx
> - jnz L(matches)
> -
> - movdqa (%rdi), %xmm0
> - pcmpeqb %xmm0, %xmm2
> - add $16, %rdi
> - pcmpeqb %xmm1, %xmm0
> - pmovmskb %xmm2, %rcx
> - pmovmskb %xmm0, %rax
> - or %rax, %rcx
> - jz L(loop)
> -
> -L(matches):
> - test %rax, %rax
> - jnz L(match)
> -L(return_value):
> - test %r8, %r8
> - jz L(return_null)
> - mov %r8, %rax
> - mov %rsi, %rdi
> - jmp L(match_exit)
> -
> - .p2align 4
> -L(match):
> - pmovmskb %xmm2, %rcx
> - test %rcx, %rcx
> - jnz L(find_zero)
> - mov %rax, %r8
> - mov %rdi, %rsi
> - jmp L(loop)
> -
> - .p2align 4
> -L(find_zero):
> - test %cl, %cl
> - jz L(find_zero_high)
> - mov %cl, %dl
> - and $15, %dl
> - jz L(find_zero_8)
> - test $0x01, %cl
> - jnz L(FindZeroExit1)
> - test $0x02, %cl
> - jnz L(FindZeroExit2)
> - test $0x04, %cl
> - jnz L(FindZeroExit3)
> - and $1 << 4 - 1, %rax
> - jz L(return_value)
> - jmp L(match_exit)
> -
> - .p2align 4
> -L(find_zero_8):
> - test $0x10, %cl
> - jnz L(FindZeroExit5)
> - test $0x20, %cl
> - jnz L(FindZeroExit6)
> - test $0x40, %cl
> - jnz L(FindZeroExit7)
> - and $1 << 8 - 1, %rax
> - jz L(return_value)
> - jmp L(match_exit)
> -
> - .p2align 4
> -L(find_zero_high):
> - mov %ch, %dh
> - and $15, %dh
> - jz L(find_zero_high_8)
> - test $0x01, %ch
> - jnz L(FindZeroExit9)
> - test $0x02, %ch
> - jnz L(FindZeroExit10)
> - test $0x04, %ch
> - jnz L(FindZeroExit11)
> - and $1 << 12 - 1, %rax
> - jz L(return_value)
> - jmp L(match_exit)
> -
> - .p2align 4
> -L(find_zero_high_8):
> - test $0x10, %ch
> - jnz L(FindZeroExit13)
> - test $0x20, %ch
> - jnz L(FindZeroExit14)
> - test $0x40, %ch
> - jnz L(FindZeroExit15)
> - and $1 << 16 - 1, %rax
> - jz L(return_value)
> - jmp L(match_exit)
> -
> - .p2align 4
> -L(FindZeroExit1):
> - and $1, %rax
> - jz L(return_value)
> - jmp L(match_exit)
> -
> - .p2align 4
> -L(FindZeroExit2):
> - and $1 << 2 - 1, %rax
> - jz L(return_value)
> - jmp L(match_exit)
> -
> - .p2align 4
> -L(FindZeroExit3):
> - and $1 << 3 - 1, %rax
> - jz L(return_value)
> - jmp L(match_exit)
> -
> - .p2align 4
> -L(FindZeroExit5):
> - and $1 << 5 - 1, %rax
> - jz L(return_value)
> - jmp L(match_exit)
> -
> - .p2align 4
> -L(FindZeroExit6):
> - and $1 << 6 - 1, %rax
> - jz L(return_value)
> - jmp L(match_exit)
> -
> - .p2align 4
> -L(FindZeroExit7):
> - and $1 << 7 - 1, %rax
> - jz L(return_value)
> - jmp L(match_exit)
> -
> - .p2align 4
> -L(FindZeroExit9):
> - and $1 << 9 - 1, %rax
> - jz L(return_value)
> - jmp L(match_exit)
> -
> - .p2align 4
> -L(FindZeroExit10):
> - and $1 << 10 - 1, %rax
> - jz L(return_value)
> - jmp L(match_exit)
> -
> - .p2align 4
> -L(FindZeroExit11):
> - and $1 << 11 - 1, %rax
> - jz L(return_value)
> - jmp L(match_exit)
> -
> - .p2align 4
> -L(FindZeroExit13):
> - and $1 << 13 - 1, %rax
> - jz L(return_value)
> - jmp L(match_exit)
> -
> - .p2align 4
> -L(FindZeroExit14):
> - and $1 << 14 - 1, %rax
> - jz L(return_value)
> - jmp L(match_exit)
> -
> - .p2align 4
> -L(FindZeroExit15):
> - and $1 << 15 - 1, %rax
> - jz L(return_value)
> -
> - .p2align 4
> -L(match_exit):
> - test %ah, %ah
> - jnz L(match_exit_high)
> - mov %al, %dl
> - and $15 << 4, %dl
> - jnz L(match_exit_8)
> - test $0x08, %al
> - jnz L(Exit4)
> - test $0x04, %al
> - jnz L(Exit3)
> - test $0x02, %al
> - jnz L(Exit2)
> - lea -16(%rdi), %rax
> - ret
> -
> - .p2align 4
> -L(match_exit_8):
> - test $0x80, %al
> - jnz L(Exit8)
> - test $0x40, %al
> - jnz L(Exit7)
> - test $0x20, %al
> - jnz L(Exit6)
> - lea -12(%rdi), %rax
> - ret
> -
> - .p2align 4
> -L(match_exit_high):
> - mov %ah, %dh
> - and $15 << 4, %dh
> - jnz L(match_exit_high_8)
> - test $0x08, %ah
> - jnz L(Exit12)
> - test $0x04, %ah
> - jnz L(Exit11)
> - test $0x02, %ah
> - jnz L(Exit10)
> - lea -8(%rdi), %rax
> - ret
> -
> - .p2align 4
> -L(match_exit_high_8):
> - test $0x80, %ah
> - jnz L(Exit16)
> - test $0x40, %ah
> - jnz L(Exit15)
> - test $0x20, %ah
> - jnz L(Exit14)
> - lea -4(%rdi), %rax
> - ret
> -
> - .p2align 4
> -L(Exit2):
> - lea -15(%rdi), %rax
> - ret
> -
> - .p2align 4
> -L(Exit3):
> - lea -14(%rdi), %rax
> - ret
> -
> - .p2align 4
> -L(Exit4):
> - lea -13(%rdi), %rax
> - ret
> -
> - .p2align 4
> -L(Exit6):
> - lea -11(%rdi), %rax
> - ret
> -
> - .p2align 4
> -L(Exit7):
> - lea -10(%rdi), %rax
> - ret
> -
> - .p2align 4
> -L(Exit8):
> - lea -9(%rdi), %rax
> - ret
> -
> - .p2align 4
> -L(Exit10):
> - lea -7(%rdi), %rax
> - ret
> -
> - .p2align 4
> -L(Exit11):
> - lea -6(%rdi), %rax
> - ret
> -
> - .p2align 4
> -L(Exit12):
> - lea -5(%rdi), %rax
> - ret
> -
> - .p2align 4
> -L(Exit14):
> - lea -3(%rdi), %rax
> - ret
> -
> - .p2align 4
> -L(Exit15):
> - lea -2(%rdi), %rax
> - ret
> -
> - .p2align 4
> -L(Exit16):
> - lea -1(%rdi), %rax
> - ret
> -
> -/* Return NULL. */
> - .p2align 4
> -L(return_null):
> - xor %rax, %rax
> - ret
> -
> - .p2align 4
> -L(prolog_find_zero):
> - add %rcx, %rdi
> - mov %rdx, %rcx
> -L(prolog_find_zero_1):
> - test %cl, %cl
> - jz L(prolog_find_zero_high)
> - mov %cl, %dl
> - and $15, %dl
> - jz L(prolog_find_zero_8)
> - test $0x01, %cl
> - jnz L(PrologFindZeroExit1)
> - test $0x02, %cl
> - jnz L(PrologFindZeroExit2)
> - test $0x04, %cl
> - jnz L(PrologFindZeroExit3)
> - and $1 << 4 - 1, %rax
> - jnz L(match_exit)
> - xor %rax, %rax
> - ret
> -
> - .p2align 4
> -L(prolog_find_zero_8):
> - test $0x10, %cl
> - jnz L(PrologFindZeroExit5)
> - test $0x20, %cl
> - jnz L(PrologFindZeroExit6)
> - test $0x40, %cl
> - jnz L(PrologFindZeroExit7)
> - and $1 << 8 - 1, %rax
> - jnz L(match_exit)
> - xor %rax, %rax
> - ret
> -
> - .p2align 4
> -L(prolog_find_zero_high):
> - mov %ch, %dh
> - and $15, %dh
> - jz L(prolog_find_zero_high_8)
> - test $0x01, %ch
> - jnz L(PrologFindZeroExit9)
> - test $0x02, %ch
> - jnz L(PrologFindZeroExit10)
> - test $0x04, %ch
> - jnz L(PrologFindZeroExit11)
> - and $1 << 12 - 1, %rax
> - jnz L(match_exit)
> - xor %rax, %rax
> - ret
> -
> - .p2align 4
> -L(prolog_find_zero_high_8):
> - test $0x10, %ch
> - jnz L(PrologFindZeroExit13)
> - test $0x20, %ch
> - jnz L(PrologFindZeroExit14)
> - test $0x40, %ch
> - jnz L(PrologFindZeroExit15)
> - and $1 << 16 - 1, %rax
> - jnz L(match_exit)
> - xor %rax, %rax
> - ret
> -
> - .p2align 4
> -L(PrologFindZeroExit1):
> - and $1, %rax
> - jnz L(match_exit)
> - xor %rax, %rax
> - ret
> -
> - .p2align 4
> -L(PrologFindZeroExit2):
> - and $1 << 2 - 1, %rax
> - jnz L(match_exit)
> - xor %rax, %rax
> - ret
> -
> - .p2align 4
> -L(PrologFindZeroExit3):
> - and $1 << 3 - 1, %rax
> - jnz L(match_exit)
> - xor %rax, %rax
> - ret
> -
> - .p2align 4
> -L(PrologFindZeroExit5):
> - and $1 << 5 - 1, %rax
> - jnz L(match_exit)
> - xor %rax, %rax
> - ret
> -
> - .p2align 4
> -L(PrologFindZeroExit6):
> - and $1 << 6 - 1, %rax
> - jnz L(match_exit)
> - xor %rax, %rax
> - ret
> -
> - .p2align 4
> -L(PrologFindZeroExit7):
> - and $1 << 7 - 1, %rax
> - jnz L(match_exit)
> - xor %rax, %rax
> - ret
> -
> - .p2align 4
> -L(PrologFindZeroExit9):
> - and $1 << 9 - 1, %rax
> - jnz L(match_exit)
> - xor %rax, %rax
> - ret
> -
> - .p2align 4
> -L(PrologFindZeroExit10):
> - and $1 << 10 - 1, %rax
> - jnz L(match_exit)
> - xor %rax, %rax
> - ret
> -
> - .p2align 4
> -L(PrologFindZeroExit11):
> - and $1 << 11 - 1, %rax
> - jnz L(match_exit)
> - xor %rax, %rax
> - ret
> -
> - .p2align 4
> -L(PrologFindZeroExit13):
> - and $1 << 13 - 1, %rax
> - jnz L(match_exit)
> - xor %rax, %rax
> - ret
> -
> - .p2align 4
> -L(PrologFindZeroExit14):
> - and $1 << 14 - 1, %rax
> - jnz L(match_exit)
> - xor %rax, %rax
> - ret
> -
> - .p2align 4
> -L(PrologFindZeroExit15):
> - and $1 << 15 - 1, %rax
> - jnz L(match_exit)
> - xor %rax, %rax
> - ret
> -
> -END (__strrchr_sse2_no_bsf)
> -#endif
> diff --git a/sysdeps/x86_64/multiarch/strrchr.S b/sysdeps/x86_64/multiarch/strrchr.S
> deleted file mode 100644
> index 3f92a41..0000000
> --- a/sysdeps/x86_64/multiarch/strrchr.S
> +++ /dev/null
> @@ -1,288 +0,0 @@
> -/* Multiple versions of strrchr
> - All versions must be listed in ifunc-impl-list.c.
> - Copyright (C) 2009-2013 Free Software Foundation, Inc.
> - This file is part of the GNU C Library.
> -
> - The GNU C Library is free software; you can redistribute it and/or
> - modify it under the terms of the GNU Lesser General Public
> - License as published by the Free Software Foundation; either
> - version 2.1 of the License, or (at your option) any later version.
> -
> - The GNU C Library is distributed in the hope that it will be useful,
> - but WITHOUT ANY WARRANTY; without even the implied warranty of
> - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> - Lesser General Public License for more details.
> -
> - You should have received a copy of the GNU Lesser General Public
> - License along with the GNU C Library; if not, see
> - <http://www.gnu.org/licenses/>. */
> -
> -#include <sysdep.h>
> -#include <init-arch.h>
> -
> -
> -/* Define multiple versions only for the definition in libc and for
> - the DSO. In static binaries we need strrchr before the initialization
> - happened. */
> -#if defined SHARED && !defined NOT_IN_libc
> - .text
> -ENTRY(strrchr)
> - .type strrchr, @gnu_indirect_function
> - cmpl $0, __cpu_features+KIND_OFFSET(%rip)
> - jne 1f
> - call __init_cpu_features
> -1: leaq __strrchr_sse2(%rip), %rax
> - testl $bit_Slow_SSE4_2, __cpu_features+CPUID_OFFSET+index_Slow_SSE4_2(%rip)
> - jnz 2f
> - testl $bit_SSE4_2, __cpu_features+CPUID_OFFSET+index_SSE4_2(%rip)
> - jz 2f
> - leaq __strrchr_sse42(%rip), %rax
> - ret
> -2: testl $bit_Slow_BSF, __cpu_features+FEATURE_OFFSET+index_Slow_BSF(%rip)
> - jz 3f
> - leaq __strrchr_sse2_no_bsf(%rip), %rax
> -3: ret
> -END(strrchr)
> -
> -/*
> - This implementation uses SSE4 instructions to compare up to 16 bytes
> - at a time looking for the last occurrence of the character c in the
> - string s:
> -
> - char *strrchr (const char *s, int c);
> -
> - We use 0x4a:
> - _SIDD_SBYTE_OPS
> - | _SIDD_CMP_EQUAL_EACH
> - | _SIDD_MOST_SIGNIFICANT
> - on pcmpistri to compare xmm/mem128
> -
> - 0 1 2 3 4 5 6 7 8 9 A B C D E F
> - X X X X X X X X X X X X X X X X
> -
> - against xmm
> -
> - 0 1 2 3 4 5 6 7 8 9 A B C D E F
> - C C C C C C C C C C C C C C C C
> -
> - to find out if the first 16byte data element has a byte C and the
> - last offset. There are 4 cases:
> -
> - 1. The first 16byte data element has EOS and has the byte C at the
> - last offset X.
> - 2. The first 16byte data element is valid and has the byte C at the
> - last offset X.
> - 3. The first 16byte data element has EOS and doesn't have the byte C.
> - 4. The first 16byte data element is valid and doesn't have the byte C.
> -
> - Here is the table of ECX, CFlag, ZFlag and SFlag for 3 cases:
> -
> - case ECX CFlag ZFlag SFlag
> - 1 X 1 1 0
> - 2 X 1 0 0
> - 3 16 0 1 0
> - 4 16 0 0 0
> -
> - We exit from the loop for cases 1 and 3 with jz which branches
> - when ZFlag is 1. If CFlag == 1, ECX has the offset X for case 1. */
> -
> -
> - .section .text.sse4.2,"ax",@progbits
> - .align 16
> - .type __strrchr_sse42, @function
> - .globl __strrchr_sse42
> - .hidden __strrchr_sse42
> -__strrchr_sse42:
> - cfi_startproc
> - CALL_MCOUNT
> - testb %sil, %sil
> - je __strend_sse4
> - xor %eax,%eax /* RAX has the last occurrence of s. */
> - movd %esi, %xmm1
> - punpcklbw %xmm1, %xmm1
> - movl %edi, %esi
> - punpcklbw %xmm1, %xmm1
> - andl $15, %esi
> - pshufd $0, %xmm1, %xmm1
> - movq %rdi, %r8
> - je L(loop)
> -
> -/* Handle unaligned string using psrldq. */
> - leaq L(psrldq_table)(%rip), %rdx
> - andq $-16, %r8
> - movslq (%rdx,%rsi,4),%r9
> - movdqa (%r8), %xmm0
> - addq %rdx, %r9
> - jmp *%r9
> -
> -/* Handle unaligned string with offset 1 using psrldq. */
> - .p2align 4
> -L(psrldq_1):
> - psrldq $1, %xmm0
> -
> - .p2align 4
> -L(unaligned_pcmpistri):
> - pcmpistri $0x4a, %xmm1, %xmm0
> - jnc L(unaligned_no_byte)
> - leaq (%rdi,%rcx), %rax
> -L(unaligned_no_byte):
> - /* Find the length of the unaligned string. */
> - pcmpistri $0x3a, %xmm0, %xmm0
> - movl $16, %edx
> - subl %esi, %edx
> - cmpl %ecx, %edx
> - /* Return RAX if the unaligned fragment to next 16B already
> - contain the NULL terminator. */
> - jg L(exit)
> - addq $16, %r8
> -
> -/* Loop start on aligned string. */
> - .p2align 4
> -L(loop):
> - pcmpistri $0x4a, (%r8), %xmm1
> - jbe L(match_or_eos)
> - addq $16, %r8
> - jmp L(loop)
> - .p2align 4
> -L(match_or_eos):
> - je L(had_eos)
> -L(match_no_eos):
> - leaq (%r8,%rcx), %rax
> - addq $16, %r8
> - jmp L(loop)
> - .p2align 4
> -L(had_eos):
> - jnc L(exit)
> - leaq (%r8,%rcx), %rax
> - .p2align 4
> -L(exit):
> - ret
> -
> -/* Handle unaligned string with offset 15 using psrldq. */
> - .p2align 4
> -L(psrldq_15):
> - psrldq $15, %xmm0
> - jmp L(unaligned_pcmpistri)
> -
> -/* Handle unaligned string with offset 14 using psrldq. */
> - .p2align 4
> -L(psrldq_14):
> - psrldq $14, %xmm0
> - jmp L(unaligned_pcmpistri)
> -
> -/* Handle unaligned string with offset 13 using psrldq. */
> - .p2align 4
> -L(psrldq_13):
> - psrldq $13, %xmm0
> - jmp L(unaligned_pcmpistri)
> -
> -/* Handle unaligned string with offset 12 using psrldq. */
> - .p2align 4
> -L(psrldq_12):
> - psrldq $12, %xmm0
> - jmp L(unaligned_pcmpistri)
> -
> -/* Handle unaligned string with offset 11 using psrldq. */
> - .p2align 4
> -L(psrldq_11):
> - psrldq $11, %xmm0
> - jmp L(unaligned_pcmpistri)
> -
> -/* Handle unaligned string with offset 10 using psrldq. */
> - .p2align 4
> -L(psrldq_10):
> - psrldq $10, %xmm0
> - jmp L(unaligned_pcmpistri)
> -
> -/* Handle unaligned string with offset 9 using psrldq. */
> - .p2align 4
> -L(psrldq_9):
> - psrldq $9, %xmm0
> - jmp L(unaligned_pcmpistri)
> -
> -/* Handle unaligned string with offset 8 using psrldq. */
> - .p2align 4
> -L(psrldq_8):
> - psrldq $8, %xmm0
> - jmp L(unaligned_pcmpistri)
> -
> -/* Handle unaligned string with offset 7 using psrldq. */
> - .p2align 4
> -L(psrldq_7):
> - psrldq $7, %xmm0
> - jmp L(unaligned_pcmpistri)
> -
> -/* Handle unaligned string with offset 6 using psrldq. */
> - .p2align 4
> -L(psrldq_6):
> - psrldq $6, %xmm0
> - jmp L(unaligned_pcmpistri)
> -
> -/* Handle unaligned string with offset 5 using psrldq. */
> - .p2align 4
> -L(psrldq_5):
> - psrldq $5, %xmm0
> - jmp L(unaligned_pcmpistri)
> -
> -/* Handle unaligned string with offset 4 using psrldq. */
> - .p2align 4
> -L(psrldq_4):
> - psrldq $4, %xmm0
> - jmp L(unaligned_pcmpistri)
> -
> -/* Handle unaligned string with offset 3 using psrldq. */
> - .p2align 4
> -L(psrldq_3):
> - psrldq $3, %xmm0
> - jmp L(unaligned_pcmpistri)
> -
> -/* Handle unaligned string with offset 2 using psrldq. */
> - .p2align 4
> -L(psrldq_2):
> - psrldq $2, %xmm0
> - jmp L(unaligned_pcmpistri)
> -
> - cfi_endproc
> - .size __strrchr_sse42, .-__strrchr_sse42
> -
> - .section .rodata.sse4.2,"a",@progbits
> - .p2align 4
> -L(psrldq_table):
> - .int L(loop) - L(psrldq_table)
> - .int L(psrldq_1) - L(psrldq_table)
> - .int L(psrldq_2) - L(psrldq_table)
> - .int L(psrldq_3) - L(psrldq_table)
> - .int L(psrldq_4) - L(psrldq_table)
> - .int L(psrldq_5) - L(psrldq_table)
> - .int L(psrldq_6) - L(psrldq_table)
> - .int L(psrldq_7) - L(psrldq_table)
> - .int L(psrldq_8) - L(psrldq_table)
> - .int L(psrldq_9) - L(psrldq_table)
> - .int L(psrldq_10) - L(psrldq_table)
> - .int L(psrldq_11) - L(psrldq_table)
> - .int L(psrldq_12) - L(psrldq_table)
> - .int L(psrldq_13) - L(psrldq_table)
> - .int L(psrldq_14) - L(psrldq_table)
> - .int L(psrldq_15) - L(psrldq_table)
> -
> -
> -# undef ENTRY
> -# define ENTRY(name) \
> - .type __strrchr_sse2, @function; \
> - .align 16; \
> - .globl __strrchr_sse2; \
> - .hidden __strrchr_sse2; \
> - __strrchr_sse2: cfi_startproc; \
> - CALL_MCOUNT
> -# undef END
> -# define END(name) \
> - cfi_endproc; .size __strrchr_sse2, .-__strrchr_sse2
> -# undef libc_hidden_builtin_def
> -/* It doesn't make sense to send libc-internal strrchr calls through a PLT.
> - The speedup we get from using SSE4.2 instruction is likely eaten away
> - by the indirect call in the PLT. */
> -# define libc_hidden_builtin_def(name) \
> - .globl __GI_strrchr; __GI_strrchr = __strrchr_sse2
> -#endif
> -
> -#include "../strrchr.S"
> diff --git a/sysdeps/x86_64/strrchr.S b/sysdeps/x86_64/strrchr.S
> index e413b07..514765b 100644
> --- a/sysdeps/x86_64/strrchr.S
> +++ b/sysdeps/x86_64/strrchr.S
> @@ -1,6 +1,5 @@
> /* strrchr (str, ch) -- Return pointer to last occurrence of CH in STR.
> - For AMD x86-64.
> - Copyright (C) 2009-2013 Free Software Foundation, Inc.
> + Copyright (C) 2013 Free Software Foundation, Inc.
> This file is part of the GNU C Library.
>
> The GNU C Library is free software; you can redistribute it and/or
> @@ -17,63 +16,217 @@
> License along with the GNU C Library; if not, see
> <http://www.gnu.org/licenses/>. */
>
> +
> #include <sysdep.h>
>
> +# ifndef ALIGN
> +# define ALIGN(n) .p2align n
> +# endif
> +
>
> .text
> ENTRY (strrchr)
> movd %esi, %xmm1
> - movq %rdi, %rcx
> - punpcklbw %xmm1, %xmm1
> - andq $~15, %rdi
> - pxor %xmm2, %xmm2
> - punpcklbw %xmm1, %xmm1
> - orl $0xffffffff, %esi
> - movdqa (%rdi), %xmm0
> + movq %rdi, %rax
> + andl $4095, %eax
> + punpcklbw %xmm1, %xmm1
> + cmpq $4032, %rax
> + punpcklwd %xmm1, %xmm1
> pshufd $0, %xmm1, %xmm1
> - subq %rdi, %rcx
> + ja L(cross_page)
> + movdqu (%rdi), %xmm0
> + pxor %xmm2, %xmm2
> movdqa %xmm0, %xmm3
> - leaq 16(%rdi), %rdi
> pcmpeqb %xmm1, %xmm0
> pcmpeqb %xmm2, %xmm3
> - shl %cl, %esi
> - pmovmskb %xmm0, %edx
> - pmovmskb %xmm3, %ecx
> - andl %esi, %edx
> - andl %esi, %ecx
> - xorl %eax, %eax
> - movl %edx, %esi
> - orl %ecx, %esi
> - jnz 1f
> + pmovmskb %xmm0, %ecx
> + pmovmskb %xmm3, %edx
> + testq %rdx, %rdx
> + je L(next_48_bytes)
> + leaq -1(%rdx), %rax
> + xorq %rdx, %rax
> + andq %rcx, %rax
> + je L(exit)
> + bsrq %rax, %rax
> + addq %rdi, %rax
> + ret
>
> -2: movdqa (%rdi), %xmm0
> - leaq 16(%rdi), %rdi
> - movdqa %xmm0, %xmm3
> + ALIGN(4)
> +L(next_48_bytes):
> + movdqu 16(%rdi), %xmm4
> + movdqa %xmm4, %xmm5
> + movdqu 32(%rdi), %xmm3
> + pcmpeqb %xmm1, %xmm4
> + pcmpeqb %xmm2, %xmm5
> + movdqu 48(%rdi), %xmm0
> + pmovmskb %xmm5, %edx
> + movdqa %xmm3, %xmm5
> + pcmpeqb %xmm1, %xmm3
> + pcmpeqb %xmm2, %xmm5
> + pcmpeqb %xmm0, %xmm2
> + salq $16, %rdx
> + pmovmskb %xmm3, %r8d
> + pmovmskb %xmm5, %eax
> + pmovmskb %xmm2, %esi
> + salq $32, %r8
> + salq $32, %rax
> pcmpeqb %xmm1, %xmm0
> - pcmpeqb %xmm2, %xmm3
> - pmovmskb %xmm0, %edx
> - pmovmskb %xmm3, %ecx
> - movl %edx, %esi
> - orl %ecx, %esi
> - jz 2b
> + orq %rdx, %rax
> + movq %rsi, %rdx
> + pmovmskb %xmm4, %esi
> + salq $48, %rdx
> + salq $16, %rsi
> + orq %r8, %rsi
> + orq %rcx, %rsi
> + pmovmskb %xmm0, %ecx
> + salq $48, %rcx
> + orq %rcx, %rsi
> + orq %rdx, %rax
> + je L(loop_header2)
> + leaq -1(%rax), %rcx
> + xorq %rax, %rcx
> + andq %rcx, %rsi
> + je L(exit)
> + bsrq %rsi, %rsi
> + leaq (%rdi,%rsi), %rax
> + ret
>
> -1: bsfl %ecx, %r9d
> - movl $0xffffffff, %r8d
> - movl $31, %ecx
> - jnz 5f
> + ALIGN(4)
> +L(loop_header2):
> + testq %rsi, %rsi
> + movq %rdi, %rcx
> + je L(no_c_found)
> +L(loop_header):
> + addq $64, %rdi
> + pxor %xmm7, %xmm7
> + andq $-64, %rdi
> + jmp L(loop_entry)
> +
> + ALIGN(4)
> +L(loop64):
> + testq %rdx, %rdx
> + cmovne %rdx, %rsi
> + cmovne %rdi, %rcx
> + addq $64, %rdi
> +L(loop_entry):
> + movdqa 32(%rdi), %xmm3
> + pxor %xmm6, %xmm6
> + movdqa 48(%rdi), %xmm2
> + movdqa %xmm3, %xmm0
> + movdqa 16(%rdi), %xmm4
> + pminub %xmm2, %xmm0
> + movdqa (%rdi), %xmm5
> + pminub %xmm4, %xmm0
> + pminub %xmm5, %xmm0
> + pcmpeqb %xmm7, %xmm0
> + pmovmskb %xmm0, %eax
> + movdqa %xmm5, %xmm0
> + pcmpeqb %xmm1, %xmm0
> + pmovmskb %xmm0, %r9d
> + movdqa %xmm4, %xmm0
> + pcmpeqb %xmm1, %xmm0
> + pmovmskb %xmm0, %edx
> + movdqa %xmm3, %xmm0
> + pcmpeqb %xmm1, %xmm0
> + salq $16, %rdx
> + pmovmskb %xmm0, %r10d
> + movdqa %xmm2, %xmm0
> + pcmpeqb %xmm1, %xmm0
> + salq $32, %r10
> + orq %r10, %rdx
> + pmovmskb %xmm0, %r8d
> + orq %r9, %rdx
> + salq $48, %r8
> + orq %r8, %rdx
> + testl %eax, %eax
> + je L(loop64)
> + pcmpeqb %xmm6, %xmm4
> + pcmpeqb %xmm6, %xmm3
> + pcmpeqb %xmm6, %xmm5
> + pmovmskb %xmm4, %eax
> + pmovmskb %xmm3, %r10d
> + pcmpeqb %xmm6, %xmm2
> + pmovmskb %xmm5, %r9d
> + salq $32, %r10
> + salq $16, %rax
> + pmovmskb %xmm2, %r8d
> + orq %r10, %rax
> + orq %r9, %rax
> + salq $48, %r8
> + orq %r8, %rax
> + leaq -1(%rax), %r8
> + xorq %rax, %r8
> + andq %r8, %rdx
> + cmovne %rdi, %rcx
> + cmovne %rdx, %rsi
> + bsrq %rsi, %rsi
> + leaq (%rcx,%rsi), %rax
> + ret
>
> - bsrl %edx, %edx
> - jz 2b
> - leaq -16(%rdi,%rdx), %rax
> - jmp 2b
> + ALIGN(4)
> +L(no_c_found):
> + movl $1, %esi
> + xorl %ecx, %ecx
> + jmp L(loop_header)
> +
> + ALIGN(4)
> +L(exit):
> + xorl %eax, %eax
> + ret
>
> -5: subl %r9d, %ecx
> - shrl %cl, %r8d
> - andl %r8d, %edx
> - bsrl %edx, %edx
> - jz 4f
> - leaq -16(%rdi,%rdx), %rax
> -4: ret
> + ALIGN(4)
> +L(cross_page):
> + movq %rdi, %rax
> + pxor %xmm0, %xmm0
> + andq $-64, %rax
> + movdqu (%rax), %xmm5
> + movdqa %xmm5, %xmm6
> + movdqu 16(%rax), %xmm4
> + pcmpeqb %xmm1, %xmm5
> + pcmpeqb %xmm0, %xmm6
> + movdqu 32(%rax), %xmm3
> + pmovmskb %xmm6, %esi
> + movdqa %xmm4, %xmm6
> + movdqu 48(%rax), %xmm2
> + pcmpeqb %xmm1, %xmm4
> + pcmpeqb %xmm0, %xmm6
> + pmovmskb %xmm6, %edx
> + movdqa %xmm3, %xmm6
> + pcmpeqb %xmm1, %xmm3
> + pcmpeqb %xmm0, %xmm6
> + pcmpeqb %xmm2, %xmm0
> + salq $16, %rdx
> + pmovmskb %xmm3, %r9d
> + pmovmskb %xmm6, %r8d
> + pmovmskb %xmm0, %ecx
> + salq $32, %r9
> + salq $32, %r8
> + pcmpeqb %xmm1, %xmm2
> + orq %r8, %rdx
> + salq $48, %rcx
> + pmovmskb %xmm5, %r8d
> + orq %rsi, %rdx
> + pmovmskb %xmm4, %esi
> + orq %rcx, %rdx
> + pmovmskb %xmm2, %ecx
> + salq $16, %rsi
> + salq $48, %rcx
> + orq %r9, %rsi
> + orq %r8, %rsi
> + orq %rcx, %rsi
> + movl %edi, %ecx
> + subl %eax, %ecx
> + shrq %cl, %rdx
> + shrq %cl, %rsi
> + testq %rdx, %rdx
> + je L(loop_header2)
> + leaq -1(%rdx), %rax
> + xorq %rdx, %rax
> + andq %rax, %rsi
> + je L(exit)
> + bsrq %rsi, %rax
> + addq %rdi, %rax
> + ret
> END (strrchr)
>
> weak_alias (strrchr, rindex)
> --
> 1.8.3.2
>