This is the mail archive of the glibc-cvs@sourceware.org mailing list for the glibc project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

GNU C Library master sources branch master updated. glibc-2.18-174-gdc1a95c


This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "GNU C Library master sources".

The branch, master has been updated
       via  dc1a95c730699bdccbafa85f189b814107f409b5 (commit)
      from  5ebbff8fd1529aec13ac4d2906c1a36f3e738519 (commit)

Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.

- Log -----------------------------------------------------------------
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=dc1a95c730699bdccbafa85f189b814107f409b5

commit dc1a95c730699bdccbafa85f189b814107f409b5
Author: OndÅ?ej Bílka <neleai@seznam.cz>
Date:   Thu Sep 26 18:54:09 2013 +0200

    Faster strrchr.

diff --git a/ChangeLog b/ChangeLog
index 0fa412c..7025fa9 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,12 @@
+2013-09-26  OndÅ?ej Bílka  <neleai@seznam.cz>
+
+	* sysdeps/x86_64/multiarch/Makefile (sysdep_routines): Update.
+	* sysdeps/x86_64/multiarch/ifunc-impl-list.c: Remove strrchr ifunc.
+	* sysdeps/x86_64/multiarch/strend-sse4.S Remove.
+	* sysdeps/x86_64/multiarch/strrchr-sse2-no-bsf.S Likewise.
+	* sysdeps/x86_64/multiarch/strrchr.S: Likewise.
+	* sysdeps/x86_64/strrchr.S (strrchr): Use optimized implementation.
+
 2013-09-25  Adhemerval Zanella  <azanella@linux.vnet.ibm.com>
 
 	* sysdeps/powerpc/powerpc64/stackguard-macros.h (POINTER_CHK_GUARD:
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index 5ab950a..9fd0fd6 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -8,7 +8,7 @@ ifeq ($(subdir),string)
 
 sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 \
 		   strcmp-sse2-unaligned strncmp-ssse3 \
-		   strend-sse4 memcmp-sse4 memcpy-ssse3 \
+		   memcmp-sse4 memcpy-ssse3 \
 		   memcpy-sse2-unaligned mempcpy-ssse3 \
 		   memmove-ssse3 memcpy-ssse3-back mempcpy-ssse3-back \
 		   memmove-ssse3-back strcasestr-nonascii strcasecmp_l-ssse3 \
@@ -17,7 +17,7 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 \
 		   strcpy-sse2-unaligned strncpy-sse2-unaligned \
 		   stpcpy-sse2-unaligned stpncpy-sse2-unaligned \
 		   strcat-sse2-unaligned strncat-sse2-unaligned \
-		   strrchr-sse2-no-bsf strchr-sse2-no-bsf memcmp-ssse3
+		   strchr-sse2-no-bsf memcmp-ssse3
 ifeq (yes,$(config-cflags-sse4))
 sysdep_routines += strcspn-c strpbrk-c strspn-c strstr-c strcasestr-c varshift
 CFLAGS-varshift.c += -msse4
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index 1a65ac0..71beab8 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -176,12 +176,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			      __strpbrk_sse42)
 	      IFUNC_IMPL_ADD (array, i, strpbrk, 1, __strpbrk_sse2))
 
-  /* Support sysdeps/x86_64/multiarch/strrchr.S.  */
-  IFUNC_IMPL (i, name, strrchr,
-	      IFUNC_IMPL_ADD (array, i, strrchr, HAS_SSE4_2,
-			      __strrchr_sse42)
-	      IFUNC_IMPL_ADD (array, i, strrchr, 1, __strrchr_sse2_no_bsf)
-	      IFUNC_IMPL_ADD (array, i, strrchr, 1, __strrchr_sse2))
 
   /* Support sysdeps/x86_64/multiarch/strspn.S.  */
   IFUNC_IMPL (i, name, strspn,
diff --git a/sysdeps/x86_64/multiarch/strend-sse4.S b/sysdeps/x86_64/multiarch/strend-sse4.S
deleted file mode 100644
index c5a7ae2..0000000
--- a/sysdeps/x86_64/multiarch/strend-sse4.S
+++ /dev/null
@@ -1,48 +0,0 @@
-/* Return the pointer to the end of string, using SSE4.2
-   Copyright (C) 2009-2013 Free Software Foundation, Inc.
-   Contributed by Intel Corporation.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <http://www.gnu.org/licenses/>.  */
-
-#include <sysdep.h>
-#include "asm-syntax.h"
-
-	.section .text.sse4.2,"ax",@progbits
-ENTRY (__strend_sse4)
-	pxor	%xmm2, %xmm2
-	movq	%rdi, %rcx
-	andq	$~15, %rdi
-	movdqa	%xmm2, %xmm1
-	pcmpeqb	(%rdi), %xmm2
-	orl	$0xffffffff, %esi
-	subq	%rdi, %rcx
-	shll	%cl, %esi
-	pmovmskb %xmm2, %edx
-	andl	%esi, %edx
-	jnz	1f
-
-2:	pcmpistri $0x08, 16(%rdi), %xmm1
-	leaq	16(%rdi), %rdi
-	jnz	2b
-
-	leaq	(%rdi,%rcx), %rax
-	ret
-
-1:	bsfl	%edx, %eax
-	addq	%rdi, %rax
-	ret
-
-END (__strend_sse4)
diff --git a/sysdeps/x86_64/multiarch/strrchr-sse2-no-bsf.S b/sysdeps/x86_64/multiarch/strrchr-sse2-no-bsf.S
deleted file mode 100644
index fcef610..0000000
--- a/sysdeps/x86_64/multiarch/strrchr-sse2-no-bsf.S
+++ /dev/null
@@ -1,555 +0,0 @@
-/* strrchr with SSE2 without bsf and bsr
-   Copyright (C) 2011-2013 Free Software Foundation, Inc.
-   Contributed by Intel Corporation.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <http://www.gnu.org/licenses/>.  */
-
-#if defined SHARED && !defined NOT_IN_libc
-
-# include <sysdep.h>
-# include "asm-syntax.h"
-
-	atom_text_section
-ENTRY (__strrchr_sse2_no_bsf)
-
-	movd	%rsi, %xmm1
-	pxor	%xmm2, %xmm2
-	mov	%rdi, %rcx
-	punpcklbw %xmm1, %xmm1
-	punpcklbw %xmm1, %xmm1
-	/* ECX has OFFSET. */
-	and	$63, %rcx
-	cmp	$48, %rcx
-	pshufd	$0, %xmm1, %xmm1
-	ja	L(crosscache)
-
-/* unaligned string. */
-	movdqu	(%rdi), %xmm0
-	pcmpeqb	%xmm0, %xmm2
-	pcmpeqb	%xmm1, %xmm0
-	/* Find where NULL is.  */
-	pmovmskb %xmm2, %rcx
-	/* Check if there is a match.  */
-	pmovmskb %xmm0, %rax
-	add	$16, %rdi
-
-	test	%rax, %rax
-	jnz	L(unaligned_match1)
-
-	test	%rcx, %rcx
-	jnz	L(return_null)
-
-	and	$-16, %rdi
-	xor	%r8, %r8
-	jmp	L(loop)
-
-	.p2align 4
-L(unaligned_match1):
-	test	%rcx, %rcx
-	jnz	L(prolog_find_zero_1)
-
-	mov	%rax, %r8
-	mov	%rdi, %rsi
-	and	$-16, %rdi
-	jmp	L(loop)
-
-	.p2align 4
-L(crosscache):
-/* Hancle unaligned string.  */
-	and	$15, %rcx
-	and	$-16, %rdi
-	pxor	%xmm3, %xmm3
-	movdqa	(%rdi), %xmm0
-	pcmpeqb	%xmm0, %xmm3
-	pcmpeqb	%xmm1, %xmm0
-	/* Find where NULL is.  */
-	pmovmskb %xmm3, %rdx
-	/* Check if there is a match.  */
-	pmovmskb %xmm0, %rax
-	/* Remove the leading bytes.  */
-	shr	%cl, %rdx
-	shr	%cl, %rax
-	add	$16, %rdi
-
-	test	%rax, %rax
-	jnz	L(unaligned_match)
-
-	test	%rdx, %rdx
-	jnz	L(return_null)
-
-	xor	%r8, %r8
-	jmp	L(loop)
-
-	.p2align 4
-L(unaligned_match):
-	test	%rdx, %rdx
-	jnz	L(prolog_find_zero)
-
-	mov	%rax, %r8
-	lea	(%rdi, %rcx), %rsi
-
-/* Loop start on aligned string.  */
-	.p2align 4
-L(loop):
-	movdqa	(%rdi), %xmm0
-	pcmpeqb	%xmm0, %xmm2
-	add	$16, %rdi
-	pcmpeqb	%xmm1, %xmm0
-	pmovmskb %xmm2, %rcx
-	pmovmskb %xmm0, %rax
-	or	%rax, %rcx
-	jnz	L(matches)
-
-	movdqa	(%rdi), %xmm0
-	pcmpeqb	%xmm0, %xmm2
-	add	$16, %rdi
-	pcmpeqb	%xmm1, %xmm0
-	pmovmskb %xmm2, %rcx
-	pmovmskb %xmm0, %rax
-	or	%rax, %rcx
-	jnz	L(matches)
-
-	movdqa	(%rdi), %xmm0
-	pcmpeqb	%xmm0, %xmm2
-	add	$16, %rdi
-	pcmpeqb	%xmm1, %xmm0
-	pmovmskb %xmm2, %rcx
-	pmovmskb %xmm0, %rax
-	or	%rax, %rcx
-	jnz	L(matches)
-
-	movdqa	(%rdi), %xmm0
-	pcmpeqb	%xmm0, %xmm2
-	add	$16, %rdi
-	pcmpeqb	%xmm1, %xmm0
-	pmovmskb %xmm2, %rcx
-	pmovmskb %xmm0, %rax
-	or	%rax, %rcx
-	jz	L(loop)
-
-L(matches):
-	test	%rax, %rax
-	jnz	L(match)
-L(return_value):
-	test	%r8, %r8
-	jz	L(return_null)
-	mov	%r8, %rax
-	mov	%rsi, %rdi
-	jmp	L(match_exit)
-
-	.p2align 4
-L(match):
-	pmovmskb %xmm2, %rcx
-	test	%rcx, %rcx
-	jnz	L(find_zero)
-	mov	%rax, %r8
-	mov	%rdi, %rsi
-	jmp	L(loop)
-
-	.p2align 4
-L(find_zero):
-	test	%cl, %cl
-	jz	L(find_zero_high)
-	mov	%cl, %dl
-	and	$15, %dl
-	jz	L(find_zero_8)
-	test	$0x01, %cl
-	jnz	L(FindZeroExit1)
-	test	$0x02, %cl
-	jnz	L(FindZeroExit2)
-	test	$0x04, %cl
-	jnz	L(FindZeroExit3)
-	and	$1 << 4 - 1, %rax
-	jz	L(return_value)
-	jmp	L(match_exit)
-
-	.p2align 4
-L(find_zero_8):
-	test	$0x10, %cl
-	jnz	L(FindZeroExit5)
-	test	$0x20, %cl
-	jnz	L(FindZeroExit6)
-	test	$0x40, %cl
-	jnz	L(FindZeroExit7)
-	and	$1 << 8 - 1, %rax
-	jz	L(return_value)
-	jmp	L(match_exit)
-
-	.p2align 4
-L(find_zero_high):
-	mov	%ch, %dh
-	and	$15, %dh
-	jz	L(find_zero_high_8)
-	test	$0x01, %ch
-	jnz	L(FindZeroExit9)
-	test	$0x02, %ch
-	jnz	L(FindZeroExit10)
-	test	$0x04, %ch
-	jnz	L(FindZeroExit11)
-	and	$1 << 12 - 1, %rax
-	jz	L(return_value)
-	jmp	L(match_exit)
-
-	.p2align 4
-L(find_zero_high_8):
-	test	$0x10, %ch
-	jnz	L(FindZeroExit13)
-	test	$0x20, %ch
-	jnz	L(FindZeroExit14)
-	test	$0x40, %ch
-	jnz	L(FindZeroExit15)
-	and	$1 << 16 - 1, %rax
-	jz	L(return_value)
-	jmp	L(match_exit)
-
-	.p2align 4
-L(FindZeroExit1):
-	and	$1, %rax
-	jz	L(return_value)
-	jmp	L(match_exit)
-
-	.p2align 4
-L(FindZeroExit2):
-	and	$1 << 2 - 1, %rax
-	jz	L(return_value)
-	jmp	L(match_exit)
-
-	.p2align 4
-L(FindZeroExit3):
-	and	$1 << 3 - 1, %rax
-	jz	L(return_value)
-	jmp	L(match_exit)
-
-	.p2align 4
-L(FindZeroExit5):
-	and	$1 << 5 - 1, %rax
-	jz	L(return_value)
-	jmp	L(match_exit)
-
-	.p2align 4
-L(FindZeroExit6):
-	and	$1 << 6 - 1, %rax
-	jz	L(return_value)
-	jmp	L(match_exit)
-
-	.p2align 4
-L(FindZeroExit7):
-	and	$1 << 7 - 1, %rax
-	jz	L(return_value)
-	jmp	L(match_exit)
-
-	.p2align 4
-L(FindZeroExit9):
-	and	$1 << 9 - 1, %rax
-	jz	L(return_value)
-	jmp	L(match_exit)
-
-	.p2align 4
-L(FindZeroExit10):
-	and	$1 << 10 - 1, %rax
-	jz	L(return_value)
-	jmp	L(match_exit)
-
-	.p2align 4
-L(FindZeroExit11):
-	and	$1 << 11 - 1, %rax
-	jz	L(return_value)
-	jmp	L(match_exit)
-
-	.p2align 4
-L(FindZeroExit13):
-	and	$1 << 13 - 1, %rax
-	jz	L(return_value)
-	jmp	L(match_exit)
-
-	.p2align 4
-L(FindZeroExit14):
-	and	$1 << 14 - 1, %rax
-	jz	L(return_value)
-	jmp	L(match_exit)
-
-	.p2align 4
-L(FindZeroExit15):
-	and	$1 << 15 - 1, %rax
-	jz	L(return_value)
-
-	.p2align 4
-L(match_exit):
-	test	%ah, %ah
-	jnz	L(match_exit_high)
-	mov	%al, %dl
-	and	$15 << 4, %dl
-	jnz	L(match_exit_8)
-	test	$0x08, %al
-	jnz	L(Exit4)
-	test	$0x04, %al
-	jnz	L(Exit3)
-	test	$0x02, %al
-	jnz	L(Exit2)
-	lea	-16(%rdi), %rax
-	ret
-
-	.p2align 4
-L(match_exit_8):
-	test	$0x80, %al
-	jnz	L(Exit8)
-	test	$0x40, %al
-	jnz	L(Exit7)
-	test	$0x20, %al
-	jnz	L(Exit6)
-	lea	-12(%rdi), %rax
-	ret
-
-	.p2align 4
-L(match_exit_high):
-	mov	%ah, %dh
-	and	$15 << 4, %dh
-	jnz	L(match_exit_high_8)
-	test	$0x08, %ah
-	jnz	L(Exit12)
-	test	$0x04, %ah
-	jnz	L(Exit11)
-	test	$0x02, %ah
-	jnz	L(Exit10)
-	lea	-8(%rdi), %rax
-	ret
-
-	.p2align 4
-L(match_exit_high_8):
-	test	$0x80, %ah
-	jnz	L(Exit16)
-	test	$0x40, %ah
-	jnz	L(Exit15)
-	test	$0x20, %ah
-	jnz	L(Exit14)
-	lea	-4(%rdi), %rax
-	ret
-
-	.p2align 4
-L(Exit2):
-	lea	-15(%rdi), %rax
-	ret
-
-	.p2align 4
-L(Exit3):
-	lea	-14(%rdi), %rax
-	ret
-
-	.p2align 4
-L(Exit4):
-	lea	-13(%rdi), %rax
-	ret
-
-	.p2align 4
-L(Exit6):
-	lea	-11(%rdi), %rax
-	ret
-
-	.p2align 4
-L(Exit7):
-	lea	-10(%rdi), %rax
-	ret
-
-	.p2align 4
-L(Exit8):
-	lea	-9(%rdi), %rax
-	ret
-
-	.p2align 4
-L(Exit10):
-	lea	-7(%rdi), %rax
-	ret
-
-	.p2align 4
-L(Exit11):
-	lea	-6(%rdi), %rax
-	ret
-
-	.p2align 4
-L(Exit12):
-	lea	-5(%rdi), %rax
-	ret
-
-	.p2align 4
-L(Exit14):
-	lea	-3(%rdi), %rax
-	ret
-
-	.p2align 4
-L(Exit15):
-	lea	-2(%rdi), %rax
-	ret
-
-	.p2align 4
-L(Exit16):
-	lea	-1(%rdi), %rax
-	ret
-
-/* Return NULL.  */
-	.p2align 4
-L(return_null):
-	xor	%rax, %rax
-	ret
-
-	.p2align 4
-L(prolog_find_zero):
-	add	%rcx, %rdi
-	mov     %rdx, %rcx
-L(prolog_find_zero_1):
-	test	%cl, %cl
-	jz	L(prolog_find_zero_high)
-	mov	%cl, %dl
-	and	$15, %dl
-	jz	L(prolog_find_zero_8)
-	test	$0x01, %cl
-	jnz	L(PrologFindZeroExit1)
-	test	$0x02, %cl
-	jnz	L(PrologFindZeroExit2)
-	test	$0x04, %cl
-	jnz	L(PrologFindZeroExit3)
-	and	$1 << 4 - 1, %rax
-	jnz	L(match_exit)
-	xor	%rax, %rax
-	ret
-
-	.p2align 4
-L(prolog_find_zero_8):
-	test	$0x10, %cl
-	jnz	L(PrologFindZeroExit5)
-	test	$0x20, %cl
-	jnz	L(PrologFindZeroExit6)
-	test	$0x40, %cl
-	jnz	L(PrologFindZeroExit7)
-	and	$1 << 8 - 1, %rax
-	jnz	L(match_exit)
-	xor	%rax, %rax
-	ret
-
-	.p2align 4
-L(prolog_find_zero_high):
-	mov	%ch, %dh
-	and	$15, %dh
-	jz	L(prolog_find_zero_high_8)
-	test	$0x01, %ch
-	jnz	L(PrologFindZeroExit9)
-	test	$0x02, %ch
-	jnz	L(PrologFindZeroExit10)
-	test	$0x04, %ch
-	jnz	L(PrologFindZeroExit11)
-	and	$1 << 12 - 1, %rax
-	jnz	L(match_exit)
-	xor	%rax, %rax
-	ret
-
-	.p2align 4
-L(prolog_find_zero_high_8):
-	test	$0x10, %ch
-	jnz	L(PrologFindZeroExit13)
-	test	$0x20, %ch
-	jnz	L(PrologFindZeroExit14)
-	test	$0x40, %ch
-	jnz	L(PrologFindZeroExit15)
-	and	$1 << 16 - 1, %rax
-	jnz	L(match_exit)
-	xor	%rax, %rax
-	ret
-
-	.p2align 4
-L(PrologFindZeroExit1):
-	and	$1, %rax
-	jnz	L(match_exit)
-	xor	%rax, %rax
-	ret
-
-	.p2align 4
-L(PrologFindZeroExit2):
-	and	$1 << 2 - 1, %rax
-	jnz	L(match_exit)
-	xor	%rax, %rax
-	ret
-
-	.p2align 4
-L(PrologFindZeroExit3):
-	and	$1 << 3 - 1, %rax
-	jnz	L(match_exit)
-	xor	%rax, %rax
-	ret
-
-	.p2align 4
-L(PrologFindZeroExit5):
-	and	$1 << 5 - 1, %rax
-	jnz	L(match_exit)
-	xor	%rax, %rax
-	ret
-
-	.p2align 4
-L(PrologFindZeroExit6):
-	and	$1 << 6 - 1, %rax
-	jnz	L(match_exit)
-	xor	%rax, %rax
-	ret
-
-	.p2align 4
-L(PrologFindZeroExit7):
-	and	$1 << 7 - 1, %rax
-	jnz	L(match_exit)
-	xor	%rax, %rax
-	ret
-
-	.p2align 4
-L(PrologFindZeroExit9):
-	and	$1 << 9 - 1, %rax
-	jnz	L(match_exit)
-	xor	%rax, %rax
-	ret
-
-	.p2align 4
-L(PrologFindZeroExit10):
-	and	$1 << 10 - 1, %rax
-	jnz	L(match_exit)
-	xor	%rax, %rax
-	ret
-
-	.p2align 4
-L(PrologFindZeroExit11):
-	and	$1 << 11 - 1, %rax
-	jnz	L(match_exit)
-	xor	%rax, %rax
-	ret
-
-	.p2align 4
-L(PrologFindZeroExit13):
-	and	$1 << 13 - 1, %rax
-	jnz	L(match_exit)
-	xor	%rax, %rax
-	ret
-
-	.p2align 4
-L(PrologFindZeroExit14):
-	and	$1 << 14 - 1, %rax
-	jnz	L(match_exit)
-	xor	%rax, %rax
-	ret
-
-	.p2align 4
-L(PrologFindZeroExit15):
-	and	$1 << 15 - 1, %rax
-	jnz	L(match_exit)
-	xor	%rax, %rax
-	ret
-
-END (__strrchr_sse2_no_bsf)
-#endif
diff --git a/sysdeps/x86_64/multiarch/strrchr.S b/sysdeps/x86_64/multiarch/strrchr.S
deleted file mode 100644
index 3f92a41..0000000
--- a/sysdeps/x86_64/multiarch/strrchr.S
+++ /dev/null
@@ -1,288 +0,0 @@
-/* Multiple versions of strrchr
-   All versions must be listed in ifunc-impl-list.c.
-   Copyright (C) 2009-2013 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <http://www.gnu.org/licenses/>.  */
-
-#include <sysdep.h>
-#include <init-arch.h>
-
-
-/* Define multiple versions only for the definition in libc and for
-   the DSO.  In static binaries we need strrchr before the initialization
-   happened.  */
-#if defined SHARED && !defined NOT_IN_libc
-	.text
-ENTRY(strrchr)
-	.type	strrchr, @gnu_indirect_function
-	cmpl	$0, __cpu_features+KIND_OFFSET(%rip)
-	jne	1f
-	call	__init_cpu_features
-1:	leaq	__strrchr_sse2(%rip), %rax
-	testl	$bit_Slow_SSE4_2, __cpu_features+CPUID_OFFSET+index_Slow_SSE4_2(%rip)
-	jnz	2f
-	testl	$bit_SSE4_2, __cpu_features+CPUID_OFFSET+index_SSE4_2(%rip)
-	jz	2f
-	leaq	__strrchr_sse42(%rip), %rax
-	ret
-2:	testl	$bit_Slow_BSF, __cpu_features+FEATURE_OFFSET+index_Slow_BSF(%rip)
-	jz	3f
-	leaq    __strrchr_sse2_no_bsf(%rip), %rax
-3:	ret
-END(strrchr)
-
-/*
-   This implementation uses SSE4 instructions to compare up to 16 bytes
-   at a time looking for the last occurrence of the character c in the
-   string s:
-
-   char *strrchr (const char *s, int c);
-
-   We use 0x4a:
-	_SIDD_SBYTE_OPS
-	| _SIDD_CMP_EQUAL_EACH
-	| _SIDD_MOST_SIGNIFICANT
-   on pcmpistri to compare xmm/mem128
-
-   0 1 2 3 4 5 6 7 8 9 A B C D E F
-   X X X X X X X X X X X X X X X X
-
-   against xmm
-
-   0 1 2 3 4 5 6 7 8 9 A B C D E F
-   C C C C C C C C C C C C C C C C
-
-   to find out if the first 16byte data element has a byte C and the
-   last offset.  There are 4 cases:
-
-   1. The first 16byte data element has EOS and has the byte C at the
-      last offset X.
-   2. The first 16byte data element is valid and has the byte C at the
-      last offset X.
-   3. The first 16byte data element has EOS and doesn't have the byte C.
-   4. The first 16byte data element is valid and doesn't have the byte C.
-
-   Here is the table of ECX, CFlag, ZFlag and SFlag for 3 cases:
-
-   case		ECX	CFlag	ZFlag	SFlag
-    1		 X	  1	  1	  0
-    2		 X	  1	  0	  0
-    3		16	  0	  1	  0
-    4		16	  0	  0	  0
-
-   We exit from the loop for cases 1 and 3 with jz which branches
-   when ZFlag is 1.  If CFlag == 1, ECX has the offset X for case 1.  */
-
-
-	.section .text.sse4.2,"ax",@progbits
-	.align	16
-	.type	__strrchr_sse42, @function
-	.globl	__strrchr_sse42
-	.hidden	__strrchr_sse42
-__strrchr_sse42:
-	cfi_startproc
-	CALL_MCOUNT
-	testb	%sil, %sil
-	je	__strend_sse4
-	xor	%eax,%eax	/* RAX has the last occurrence of s.  */
-	movd	%esi, %xmm1
-	punpcklbw	%xmm1, %xmm1
-	movl	%edi, %esi
-	punpcklbw	%xmm1, %xmm1
-	andl	$15, %esi
-	pshufd	$0, %xmm1, %xmm1
-	movq	%rdi, %r8
-	je	L(loop)
-
-/* Handle unaligned string using psrldq.  */
-	leaq	L(psrldq_table)(%rip), %rdx
-	andq	$-16, %r8
-	movslq	(%rdx,%rsi,4),%r9
-	movdqa	(%r8), %xmm0
-	addq	%rdx, %r9
-	jmp	*%r9
-
-/* Handle unaligned string with offset 1 using psrldq.  */
-	.p2align 4
-L(psrldq_1):
-	psrldq	$1, %xmm0
-
-	.p2align 4
-L(unaligned_pcmpistri):
-	pcmpistri	$0x4a, %xmm1, %xmm0
-	jnc	L(unaligned_no_byte)
-	leaq	(%rdi,%rcx), %rax
-L(unaligned_no_byte):
-	/* Find the length of the unaligned string.  */
-	pcmpistri	$0x3a, %xmm0, %xmm0
-	movl	$16, %edx
-	subl	%esi, %edx
-	cmpl	%ecx, %edx
-	/* Return RAX if the unaligned fragment to next 16B already
-	   contain the NULL terminator.  */
-	jg	L(exit)
-	addq	$16, %r8
-
-/* Loop start on aligned string.  */
-	.p2align 4
-L(loop):
-	pcmpistri	$0x4a, (%r8), %xmm1
-	jbe	L(match_or_eos)
-	addq	$16, %r8
-	jmp	L(loop)
-	.p2align 4
-L(match_or_eos):
-	je	L(had_eos)
-L(match_no_eos):
-	leaq	(%r8,%rcx), %rax
-	addq	$16, %r8
-	jmp     L(loop)
-	.p2align 4
-L(had_eos):
-	jnc     L(exit)
-	leaq	(%r8,%rcx), %rax
-	.p2align 4
-L(exit):
-	ret
-
-/* Handle unaligned string with offset 15 using psrldq.  */
-	.p2align 4
-L(psrldq_15):
-	psrldq	$15, %xmm0
-	jmp	L(unaligned_pcmpistri)
-
-/* Handle unaligned string with offset 14 using psrldq.  */
-	.p2align 4
-L(psrldq_14):
-	psrldq	$14, %xmm0
-	jmp	L(unaligned_pcmpistri)
-
-/* Handle unaligned string with offset 13 using psrldq.  */
-	.p2align 4
-L(psrldq_13):
-	psrldq	$13, %xmm0
-	jmp	L(unaligned_pcmpistri)
-
-/* Handle unaligned string with offset 12 using psrldq.  */
-	.p2align 4
-L(psrldq_12):
-	psrldq	$12, %xmm0
-	jmp	L(unaligned_pcmpistri)
-
-/* Handle unaligned string with offset 11 using psrldq.  */
-	.p2align 4
-L(psrldq_11):
-	psrldq	$11, %xmm0
-	jmp	L(unaligned_pcmpistri)
-
-/* Handle unaligned string with offset 10 using psrldq.  */
-	.p2align 4
-L(psrldq_10):
-	psrldq	$10, %xmm0
-	jmp	L(unaligned_pcmpistri)
-
-/* Handle unaligned string with offset 9 using psrldq.  */
-	.p2align 4
-L(psrldq_9):
-	psrldq	$9, %xmm0
-	jmp	L(unaligned_pcmpistri)
-
-/* Handle unaligned string with offset 8 using psrldq.  */
-	.p2align 4
-L(psrldq_8):
-	psrldq	$8, %xmm0
-	jmp	L(unaligned_pcmpistri)
-
-/* Handle unaligned string with offset 7 using psrldq.  */
-	.p2align 4
-L(psrldq_7):
-	psrldq	$7, %xmm0
-	jmp	L(unaligned_pcmpistri)
-
-/* Handle unaligned string with offset 6 using psrldq.  */
-	.p2align 4
-L(psrldq_6):
-	psrldq	$6, %xmm0
-	jmp	L(unaligned_pcmpistri)
-
-/* Handle unaligned string with offset 5 using psrldq.  */
-	.p2align 4
-L(psrldq_5):
-	psrldq	$5, %xmm0
-	jmp	L(unaligned_pcmpistri)
-
-/* Handle unaligned string with offset 4 using psrldq.  */
-	.p2align 4
-L(psrldq_4):
-	psrldq	$4, %xmm0
-	jmp	L(unaligned_pcmpistri)
-
-/* Handle unaligned string with offset 3 using psrldq.  */
-	.p2align 4
-L(psrldq_3):
-	psrldq	$3, %xmm0
-	jmp	L(unaligned_pcmpistri)
-
-/* Handle unaligned string with offset 2 using psrldq.  */
-	.p2align 4
-L(psrldq_2):
-	psrldq	$2, %xmm0
-	jmp	L(unaligned_pcmpistri)
-
-	cfi_endproc
-	.size	__strrchr_sse42, .-__strrchr_sse42
-
-	.section .rodata.sse4.2,"a",@progbits
-	.p2align 4
-L(psrldq_table):
-	.int	L(loop) - L(psrldq_table)
-	.int	L(psrldq_1) - L(psrldq_table)
-	.int	L(psrldq_2) - L(psrldq_table)
-	.int	L(psrldq_3) - L(psrldq_table)
-	.int	L(psrldq_4) - L(psrldq_table)
-	.int	L(psrldq_5) - L(psrldq_table)
-	.int	L(psrldq_6) - L(psrldq_table)
-	.int	L(psrldq_7) - L(psrldq_table)
-	.int	L(psrldq_8) - L(psrldq_table)
-	.int	L(psrldq_9) - L(psrldq_table)
-	.int	L(psrldq_10) - L(psrldq_table)
-	.int	L(psrldq_11) - L(psrldq_table)
-	.int	L(psrldq_12) - L(psrldq_table)
-	.int	L(psrldq_13) - L(psrldq_table)
-	.int	L(psrldq_14) - L(psrldq_table)
-	.int	L(psrldq_15) - L(psrldq_table)
-
-
-# undef ENTRY
-# define ENTRY(name) \
-	.type __strrchr_sse2, @function; \
-	.align 16; \
-	.globl __strrchr_sse2; \
-	.hidden __strrchr_sse2; \
-	__strrchr_sse2: cfi_startproc; \
-	CALL_MCOUNT
-# undef END
-# define END(name) \
-	cfi_endproc; .size __strrchr_sse2, .-__strrchr_sse2
-# undef libc_hidden_builtin_def
-/* It doesn't make sense to send libc-internal strrchr calls through a PLT.
-   The speedup we get from using SSE4.2 instruction is likely eaten away
-   by the indirect call in the PLT.  */
-# define libc_hidden_builtin_def(name) \
-	.globl __GI_strrchr; __GI_strrchr = __strrchr_sse2
-#endif
-
-#include "../strrchr.S"
diff --git a/sysdeps/x86_64/strrchr.S b/sysdeps/x86_64/strrchr.S
index e413b07..514765b 100644
--- a/sysdeps/x86_64/strrchr.S
+++ b/sysdeps/x86_64/strrchr.S
@@ -1,6 +1,5 @@
 /* strrchr (str, ch) -- Return pointer to last occurrence of CH in STR.
-   For AMD x86-64.
-   Copyright (C) 2009-2013 Free Software Foundation, Inc.
+   Copyright (C) 2013 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
@@ -17,63 +16,217 @@
    License along with the GNU C Library; if not, see
    <http://www.gnu.org/licenses/>.  */
 
+
 #include <sysdep.h>
 
+# ifndef ALIGN
+#  define ALIGN(n)	.p2align n
+# endif
+
 
 	.text
 ENTRY (strrchr)
 	movd	%esi, %xmm1
-	movq	%rdi, %rcx
-	punpcklbw %xmm1, %xmm1
-	andq	$~15, %rdi
-	pxor	%xmm2, %xmm2
-	punpcklbw %xmm1, %xmm1
-	orl	$0xffffffff, %esi
-	movdqa	(%rdi), %xmm0
+	movq	%rdi, %rax
+	andl	$4095, %eax
+	punpcklbw	%xmm1, %xmm1
+	cmpq	$4032, %rax
+	punpcklwd	%xmm1, %xmm1
 	pshufd	$0, %xmm1, %xmm1
-	subq	%rdi, %rcx
+	ja	L(cross_page)
+	movdqu	(%rdi), %xmm0
+	pxor	%xmm2, %xmm2
 	movdqa	%xmm0, %xmm3
-	leaq	16(%rdi), %rdi
 	pcmpeqb	%xmm1, %xmm0
 	pcmpeqb	%xmm2, %xmm3
-	shl	%cl, %esi
-	pmovmskb %xmm0, %edx
-	pmovmskb %xmm3, %ecx
-	andl	%esi, %edx
-	andl	%esi, %ecx
-	xorl	%eax, %eax
-	movl	%edx, %esi
-	orl	%ecx, %esi
-	jnz	1f
+	pmovmskb	%xmm0, %ecx
+	pmovmskb	%xmm3, %edx
+	testq	%rdx, %rdx
+	je	L(next_48_bytes)
+	leaq	-1(%rdx), %rax
+	xorq	%rdx, %rax
+	andq	%rcx, %rax
+	je	L(exit)
+	bsrq	%rax, %rax
+	addq	%rdi, %rax
+	ret
 
-2:	movdqa	(%rdi), %xmm0
-	leaq	16(%rdi), %rdi
-	movdqa	%xmm0, %xmm3
+	ALIGN(4)
+L(next_48_bytes):
+	movdqu	16(%rdi), %xmm4
+	movdqa	%xmm4, %xmm5
+	movdqu	32(%rdi), %xmm3
+	pcmpeqb	%xmm1, %xmm4
+	pcmpeqb	%xmm2, %xmm5
+	movdqu	48(%rdi), %xmm0
+	pmovmskb	%xmm5, %edx
+	movdqa	%xmm3, %xmm5
+	pcmpeqb	%xmm1, %xmm3
+	pcmpeqb	%xmm2, %xmm5
+	pcmpeqb	%xmm0, %xmm2
+	salq	$16, %rdx
+	pmovmskb	%xmm3, %r8d
+	pmovmskb	%xmm5, %eax
+	pmovmskb	%xmm2, %esi
+	salq	$32, %r8
+	salq	$32, %rax
 	pcmpeqb	%xmm1, %xmm0
-	pcmpeqb	%xmm2, %xmm3
-	pmovmskb %xmm0, %edx
-	pmovmskb %xmm3, %ecx
-	movl	%edx, %esi
-	orl	%ecx, %esi
-	jz	2b
+	orq	%rdx, %rax
+	movq	%rsi, %rdx
+	pmovmskb	%xmm4, %esi
+	salq	$48, %rdx
+	salq	$16, %rsi
+	orq	%r8, %rsi
+	orq	%rcx, %rsi
+	pmovmskb	%xmm0, %ecx
+	salq	$48, %rcx
+	orq	%rcx, %rsi
+	orq	%rdx, %rax
+	je	L(loop_header2)
+	leaq	-1(%rax), %rcx
+	xorq	%rax, %rcx
+	andq	%rcx, %rsi
+	je	L(exit)
+	bsrq	%rsi, %rsi
+	leaq	(%rdi,%rsi), %rax
+	ret
 
-1:	bsfl	%ecx, %r9d
-	movl	$0xffffffff, %r8d
-	movl	$31, %ecx
-	jnz	5f
+	ALIGN(4)
+L(loop_header2):
+	testq	%rsi, %rsi
+	movq	%rdi, %rcx
+	je	L(no_c_found)
+L(loop_header):
+	addq	$64, %rdi
+	pxor	%xmm7, %xmm7
+	andq	$-64, %rdi
+	jmp	L(loop_entry)
+
+	ALIGN(4)
+L(loop64):
+	testq	%rdx, %rdx
+	cmovne	%rdx, %rsi
+	cmovne	%rdi, %rcx
+	addq	$64, %rdi
+L(loop_entry):
+	movdqa	32(%rdi), %xmm3
+	pxor	%xmm6, %xmm6
+	movdqa	48(%rdi), %xmm2
+	movdqa	%xmm3, %xmm0
+	movdqa	16(%rdi), %xmm4
+	pminub	%xmm2, %xmm0
+	movdqa	(%rdi), %xmm5
+	pminub	%xmm4, %xmm0
+	pminub	%xmm5, %xmm0
+	pcmpeqb	%xmm7, %xmm0
+	pmovmskb	%xmm0, %eax
+	movdqa	%xmm5, %xmm0
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb	%xmm0, %r9d
+	movdqa	%xmm4, %xmm0
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb	%xmm0, %edx
+	movdqa	%xmm3, %xmm0
+	pcmpeqb	%xmm1, %xmm0
+	salq	$16, %rdx
+	pmovmskb	%xmm0, %r10d
+	movdqa	%xmm2, %xmm0
+	pcmpeqb	%xmm1, %xmm0
+	salq	$32, %r10
+	orq	%r10, %rdx
+	pmovmskb	%xmm0, %r8d
+	orq	%r9, %rdx
+	salq	$48, %r8
+	orq	%r8, %rdx
+	testl	%eax, %eax
+	je	L(loop64)
+	pcmpeqb	%xmm6, %xmm4
+	pcmpeqb	%xmm6, %xmm3
+	pcmpeqb	%xmm6, %xmm5
+	pmovmskb	%xmm4, %eax
+	pmovmskb	%xmm3, %r10d
+	pcmpeqb	%xmm6, %xmm2
+	pmovmskb	%xmm5, %r9d
+	salq	$32, %r10
+	salq	$16, %rax
+	pmovmskb	%xmm2, %r8d
+	orq	%r10, %rax
+	orq	%r9, %rax
+	salq	$48, %r8
+	orq	%r8, %rax
+	leaq	-1(%rax), %r8
+	xorq	%rax, %r8
+	andq	%r8, %rdx
+	cmovne	%rdi, %rcx
+	cmovne	%rdx, %rsi
+	bsrq	%rsi, %rsi
+	leaq	(%rcx,%rsi), %rax
+	ret
 
-	bsrl	%edx, %edx
-	jz	2b
-	leaq	-16(%rdi,%rdx), %rax
-	jmp	2b
+	ALIGN(4)
+L(no_c_found):
+	movl	$1, %esi
+	xorl	%ecx, %ecx
+	jmp	L(loop_header)
+
+	ALIGN(4)
+L(exit):
+	xorl	%eax, %eax
+	ret
 
-5:	subl	%r9d, %ecx
-	shrl	%cl, %r8d
-	andl	%r8d, %edx
-	bsrl	%edx, %edx
-	jz	4f
-	leaq	-16(%rdi,%rdx), %rax
-4:	ret
+	ALIGN(4)
+L(cross_page):
+	movq	%rdi, %rax
+	pxor	%xmm0, %xmm0
+	andq	$-64, %rax
+	movdqu	(%rax), %xmm5
+	movdqa	%xmm5, %xmm6
+	movdqu	16(%rax), %xmm4
+	pcmpeqb	%xmm1, %xmm5
+	pcmpeqb	%xmm0, %xmm6
+	movdqu	32(%rax), %xmm3
+	pmovmskb	%xmm6, %esi
+	movdqa	%xmm4, %xmm6
+	movdqu	48(%rax), %xmm2
+	pcmpeqb	%xmm1, %xmm4
+	pcmpeqb	%xmm0, %xmm6
+	pmovmskb	%xmm6, %edx
+	movdqa	%xmm3, %xmm6
+	pcmpeqb	%xmm1, %xmm3
+	pcmpeqb	%xmm0, %xmm6
+	pcmpeqb	%xmm2, %xmm0
+	salq	$16, %rdx
+	pmovmskb	%xmm3, %r9d
+	pmovmskb	%xmm6, %r8d
+	pmovmskb	%xmm0, %ecx
+	salq	$32, %r9
+	salq	$32, %r8
+	pcmpeqb	%xmm1, %xmm2
+	orq	%r8, %rdx
+	salq	$48, %rcx
+	pmovmskb	%xmm5, %r8d
+	orq	%rsi, %rdx
+	pmovmskb	%xmm4, %esi
+	orq	%rcx, %rdx
+	pmovmskb	%xmm2, %ecx
+	salq	$16, %rsi
+	salq	$48, %rcx
+	orq	%r9, %rsi
+	orq	%r8, %rsi
+	orq	%rcx, %rsi
+	movl	%edi, %ecx
+	subl	%eax, %ecx
+	shrq	%cl, %rdx
+	shrq	%cl, %rsi
+	testq	%rdx, %rdx
+	je	L(loop_header2)
+	leaq	-1(%rdx), %rax
+	xorq	%rdx, %rax
+	andq	%rax, %rsi
+	je	L(exit)
+	bsrq	%rsi, %rax
+	addq	%rdi, %rax
+	ret
 END (strrchr)
 
 weak_alias (strrchr, rindex)

-----------------------------------------------------------------------

Summary of changes:
 ChangeLog                                      |    9 +
 sysdeps/x86_64/multiarch/Makefile              |    4 +-
 sysdeps/x86_64/multiarch/ifunc-impl-list.c     |    6 -
 sysdeps/x86_64/multiarch/strend-sse4.S         |   48 --
 sysdeps/x86_64/multiarch/strrchr-sse2-no-bsf.S |  555 ------------------------
 sysdeps/x86_64/multiarch/strrchr.S             |  288 ------------
 sysdeps/x86_64/strrchr.S                       |  241 +++++++++--
 7 files changed, 208 insertions(+), 943 deletions(-)
 delete mode 100644 sysdeps/x86_64/multiarch/strend-sse4.S
 delete mode 100644 sysdeps/x86_64/multiarch/strrchr-sse2-no-bsf.S
 delete mode 100644 sysdeps/x86_64/multiarch/strrchr.S


hooks/post-receive
-- 
GNU C Library master sources


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]