This is the mail archive of the glibc-cvs@sourceware.org mailing list for the glibc project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

GNU C Library master sources branch hjl/wcslen/master created. glibc-2.25-361-gdd72a28


This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "GNU C Library master sources".

The branch, hjl/wcslen/master has been created
        at  dd72a28e8bf51d1c1957a5745df1f892fc7e171c (commit)

- Log -----------------------------------------------------------------
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=dd72a28e8bf51d1c1957a5745df1f892fc7e171c

commit dd72a28e8bf51d1c1957a5745df1f892fc7e171c
Author: H.J. Lu <hjl.tools@gmail.com>
Date:   Sun May 21 09:22:44 2017 -0700

    x86-64: Add wmemset optimized with SSE2/AVX2/AVX512
    
    The difference between memset and wmemset is byte vs int.  Add a stub
    to SSE2/AVX2/AVX512 memset for wmemset with updated constant and size.
    Since the ERMS versions of wmemset require "rep stosl" instead of
    "rep stosb", only the vector store stubs of SSE2/AVX2/AVX512 wmemset
    are added.  The SSE2 wmemset is about 3X faster with a stub like:
    
    SSE2 wmemset:
    	shl    $0x2,%rdx
    	movd   %esi,%xmm0
    	mov    %rdi,%rax
    	pshufd $0x0,%xmm0,%xmm0
    	jmp	entry_from_wmemset
    
    SSE2 memset:
    	movd   %esi,%xmm0
    	mov    %rdi,%rax
    	punpcklbw %xmm0,%xmm0
    	punpcklwd %xmm0,%xmm0
    	pshufd $0x0,%xmm0,%xmm0
    entry_from_wmemset:
    
    	* include/wchar.h (__wmemset_chk): New.
    	* sysdeps/x86_64/memset.S (VDUP_TO_VEC0_AND_SET_RETURN): Renamed
    	to MEMSET_VDUP_TO_VEC0_AND_SET_RETURN.
    	(WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN): New.
    	(WMEMSET_CHK_SYMBOL): Likewise.
    	(WMEMSET_SYMBOL): Likewise.
    	(__wmemset): Add hidden definition.
    	(wmemset): Add weak hidden definition.
    	* sysdeps/x86_64/multiarch/ifunc-impl-list.c
    	(__libc_ifunc_impl_list): Add __wmemset_sse2_unaligned,
    	__wmemset_avx2_unaligned, __wmemset_avx512_unaligned,
    	__wmemset_chk_sse2_unaligned, __wmemset_chk_avx2_unaligned
    	and __wmemset_chk_avx512_unaligned.
    	* sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
    	(VDUP_TO_VEC0_AND_SET_RETURN): Renamed to ...
    	(MEMSET_VDUP_TO_VEC0_AND_SET_RETURN): This.
    	(WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN): New.
    	(WMEMSET_SYMBOL): Likewise.
    	* sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
    	(VDUP_TO_VEC0_AND_SET_RETURN): Renamed to ...
    	(MEMSET_VDUP_TO_VEC0_AND_SET_RETURN): This.
    	(WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN): New.
    	(WMEMSET_SYMBOL): Likewise.
    	* sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S: Updated.
    	(WMEMSET_CHK_SYMBOL): New.
    	(WMEMSET_CHK_SYMBOL (__wmemset_chk, unaligned)): Likewise.
    	(WMEMSET_SYMBOL (__wmemset, unaligned)): Likewise.
    	* sysdeps/x86_64/multiarch/memset.S (WMEMSET_SYMBOL): New.
    	(libc_hidden_builtin_def): Also define __GI_wmemset and
    	__GI___wmemset.
    	(weak_alias): New.
    	* sysdeps/x86_64/multiarch/wmemset.S: New file.
    	* sysdeps/x86_64/multiarch/wmemset_chk.S: Likewise.
    	* sysdeps/x86_64/wmemset.S: Likewise.
    	* sysdeps/x86_64/wmemset_chk.S: Likewise.

diff --git a/include/wchar.h b/include/wchar.h
index e2579a1..a773d56 100644
--- a/include/wchar.h
+++ b/include/wchar.h
@@ -157,6 +157,9 @@ extern wchar_t *__wmemmove (wchar_t *__s1, const wchar_t *__s2,
 extern wchar_t *__wcschrnul (const wchar_t *__s, wchar_t __wc)
      __attribute_pure__;
 
+extern wchar_t *__wmemset_chk (wchar_t *__s, wchar_t __c, size_t __n,
+			       size_t __ns) __THROW;
+
 extern int __vfwscanf (__FILE *__restrict __s,
 		       const wchar_t *__restrict __format,
 		       __gnuc_va_list __arg)
diff --git a/sysdeps/x86_64/memset.S b/sysdeps/x86_64/memset.S
index 69ed509..4127878 100644
--- a/sysdeps/x86_64/memset.S
+++ b/sysdeps/x86_64/memset.S
@@ -26,13 +26,18 @@
 #define VMOVU		movdqu
 #define VMOVA		movdqa
 
-#define VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
+#define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
   movd d, %xmm0; \
   movq r, %rax; \
   punpcklbw %xmm0, %xmm0; \
   punpcklwd %xmm0, %xmm0; \
   pshufd $0, %xmm0, %xmm0
 
+#define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
+  movd d, %xmm0; \
+  movq r, %rax; \
+  pshufd $0, %xmm0, %xmm0
+
 #define SECTION(p)		p
 
 #ifndef MEMSET_SYMBOL
@@ -40,10 +45,21 @@
 # define MEMSET_SYMBOL(p,s)	memset
 #endif
 
+#ifndef WMEMSET_SYMBOL
+# define WMEMSET_CHK_SYMBOL(p,s) p
+# define WMEMSET_SYMBOL(p,s)	__wmemset
+#endif
+
 #include "multiarch/memset-vec-unaligned-erms.S"
 
 libc_hidden_builtin_def (memset)
 
+#if IS_IN (libc)
+libc_hidden_def (__wmemset)
+weak_alias (__wmemset, wmemset)
+libc_hidden_weak (wmemset)
+#endif
+
 #if defined SHARED && IS_IN (libc) && !defined USE_MULTIARCH
 strong_alias (__memset_chk, __memset_zero_constant_len_parameter)
 	.section .gnu.warning.__memset_zero_constant_len_parameter
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index 06d9a9d..a91d2f9 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -300,6 +300,17 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			      __wmemcmp_ssse3)
 	      IFUNC_IMPL_ADD (array, i, wmemcmp, 1, __wmemcmp_sse2))
 
+  /* Support sysdeps/x86_64/multiarch/wmemset.S.  */
+  IFUNC_IMPL (i, name, wmemset,
+	      IFUNC_IMPL_ADD (array, i, wmemset, 1,
+			      __wmemset_sse2_unaligned)
+	      IFUNC_IMPL_ADD (array, i, wmemset,
+			      HAS_ARCH_FEATURE (AVX2_Usable),
+			      __wmemset_avx2_unaligned)
+	      IFUNC_IMPL_ADD (array, i, wmemset,
+			      HAS_ARCH_FEATURE (AVX512F_Usable),
+			      __wmemset_avx512_unaligned))
+
 #ifdef SHARED
   /* Support sysdeps/x86_64/multiarch/memcpy_chk.S.  */
   IFUNC_IMPL (i, name, __memcpy_chk,
@@ -417,6 +428,17 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 	      IFUNC_IMPL_ADD (array, i, strncmp, HAS_CPU_FEATURE (SSSE3),
 			      __strncmp_ssse3)
 	      IFUNC_IMPL_ADD (array, i, strncmp, 1, __strncmp_sse2))
+
+  /* Support sysdeps/x86_64/multiarch/wmemset_chk.S.  */
+  IFUNC_IMPL (i, name, __wmemset_chk,
+	      IFUNC_IMPL_ADD (array, i, __wmemset_chk, 1,
+			      __wmemset_chk_sse2_unaligned)
+	      IFUNC_IMPL_ADD (array, i, __wmemset_chk,
+			      HAS_ARCH_FEATURE (AVX2_Usable),
+			      __wmemset_chk_avx2_unaligned)
+	      IFUNC_IMPL_ADD (array, i, __wmemset_chk,
+			      HAS_ARCH_FEATURE (AVX512F_Usable),
+			      __wmemset_chk_avx512_unaligned))
 #endif
 
   return i;
diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
index 79975e0..7ab3d89 100644
--- a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
@@ -4,13 +4,19 @@
 # define VMOVU		vmovdqu
 # define VMOVA		vmovdqa
 
-# define VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
+# define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
   vmovd d, %xmm0; \
   movq r, %rax; \
   vpbroadcastb %xmm0, %ymm0
 
+# define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
+  vmovd d, %xmm0; \
+  movq r, %rax; \
+  vpbroadcastd %xmm0, %ymm0
+
 # define SECTION(p)		p##.avx
 # define MEMSET_SYMBOL(p,s)	p##_avx2_##s
+# define WMEMSET_SYMBOL(p,s)	p##_avx2_##s
 
 # include "memset-vec-unaligned-erms.S"
 #endif
diff --git a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
index a5ec349..0783979 100644
--- a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
@@ -4,14 +4,21 @@
 # define VMOVU		vmovdqu64
 # define VMOVA		vmovdqa64
 
-# define VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
+# define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
   vmovd d, %xmm0; \
   movq r, %rax; \
   vpbroadcastb %xmm0, %xmm0; \
   vpbroadcastq %xmm0, %zmm0
 
+# define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
+  vmovd d, %xmm0; \
+  movq r, %rax; \
+  vpbroadcastd %xmm0, %xmm0; \
+  vpbroadcastq %xmm0, %zmm0
+
 # define SECTION(p)		p##.avx512
 # define MEMSET_SYMBOL(p,s)	p##_avx512_##s
+# define WMEMSET_SYMBOL(p,s)	p##_avx512_##s
 
 # include "memset-vec-unaligned-erms.S"
 #endif
diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
index 704eed9..2eb9e37 100644
--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
@@ -30,6 +30,10 @@
 # define MEMSET_CHK_SYMBOL(p,s)		MEMSET_SYMBOL(p, s)
 #endif
 
+#ifndef WMEMSET_CHK_SYMBOL
+# define WMEMSET_CHK_SYMBOL(p,s)	WMEMSET_SYMBOL(p, s)
+#endif
+
 #ifndef VZEROUPPER
 # if VEC_SIZE > 16
 #  define VZEROUPPER			vzeroupper
@@ -79,6 +83,21 @@ END (__bzero)
 weak_alias (__bzero, bzero)
 #endif
 
+#if IS_IN (libc)
+# if defined SHARED
+ENTRY_CHK (WMEMSET_CHK_SYMBOL (__wmemset_chk, unaligned))
+	cmpq	%rdx, %rcx
+	jb	HIDDEN_JUMPTARGET (__chk_fail)
+END_CHK (WMEMSET_CHK_SYMBOL (__wmemset_chk, unaligned))
+# endif
+
+ENTRY (WMEMSET_SYMBOL (__wmemset, unaligned))
+	shlq	$2, %rdx
+	WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
+	jmp	L(entry_from_bzero)
+END (WMEMSET_SYMBOL (__wmemset, unaligned))
+#endif
+
 #if defined SHARED && IS_IN (libc)
 ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned))
 	cmpq	%rdx, %rcx
@@ -87,8 +106,7 @@ END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned))
 #endif
 
 ENTRY (MEMSET_SYMBOL (__memset, unaligned))
-L(memset_entry):
-	VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
+	MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
 L(entry_from_bzero):
 	cmpq	$VEC_SIZE, %rdx
 	jb	L(less_vec)
@@ -132,7 +150,7 @@ END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms))
 # endif
 
 ENTRY (MEMSET_SYMBOL (__memset, unaligned_erms))
-	VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
+	MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
 	cmpq	$VEC_SIZE, %rdx
 	jb	L(less_vec)
 	cmpq	$(VEC_SIZE * 2), %rdx
diff --git a/sysdeps/x86_64/multiarch/memset.S b/sysdeps/x86_64/multiarch/memset.S
index 9d33118..2b10869 100644
--- a/sysdeps/x86_64/multiarch/memset.S
+++ b/sysdeps/x86_64/multiarch/memset.S
@@ -58,18 +58,24 @@ END(memset)
 
 #if IS_IN (libc)
 # define MEMSET_SYMBOL(p,s)	p##_sse2_##s
+# define WMEMSET_SYMBOL(p,s)	p##_sse2_##s
 
 # ifdef SHARED
-# undef libc_hidden_builtin_def
+#  undef libc_hidden_builtin_def
 /* It doesn't make sense to send libc-internal memset calls through a PLT.
    The speedup we get from using SSE2 instructions is likely eaten away
    by the indirect call in the PLT.  */
-# define libc_hidden_builtin_def(name) \
-	.globl __GI_memset; __GI_memset = __memset_sse2_unaligned
+#  define libc_hidden_builtin_def(name) \
+	.globl __GI_memset; __GI_memset = __memset_sse2_unaligned; \
+	.globl __GI_wmemset; __GI_wmemset = __wmemset_sse2_unaligned; \
+	.globl __GI___wmemset; __GI___wmemset = __wmemset_sse2_unaligned
 # endif
 
 # undef strong_alias
 # define strong_alias(original, alias)
+
+# undef weak_alias
+# define weak_alias(original, alias)
 #endif
 
 #include "../memset.S"
diff --git a/sysdeps/x86_64/multiarch/wmemset.S b/sysdeps/x86_64/multiarch/wmemset.S
new file mode 100644
index 0000000..a59f6c9
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wmemset.S
@@ -0,0 +1,45 @@
+/* Multiple versions of wmemset
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* Define multiple versions only for the definition in libc. */
+#if IS_IN (libc)
+
+#include <sysdep.h>
+#include <shlib-compat.h>
+#include <init-arch.h>
+
+ENTRY(__wmemset)
+	.type	__wmemset, @gnu_indirect_function
+	LOAD_RTLD_GLOBAL_RO_RDX
+	lea	__wmemset_sse2_unaligned(%rip), %RAX_LP
+	HAS_ARCH_FEATURE (AVX2_Usable)
+	jz	1f
+	lea	__wmemset_avx2_unaligned(%rip), %RAX_LP
+	HAS_ARCH_FEATURE (Prefer_No_AVX512)
+	jnz	1f
+	HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER)
+	jnz	1f
+	HAS_ARCH_FEATURE (AVX512F_Usable)
+	jz	1f
+	lea	__wmemset_avx512_unaligned(%rip), %RAX_LP
+1:	ret
+END(__wmemset)
+
+weak_alias (__wmemset, wmemset)
+#endif
diff --git a/sysdeps/x86_64/multiarch/wmemset_chk.S b/sysdeps/x86_64/multiarch/wmemset_chk.S
new file mode 100644
index 0000000..c76fcb1
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wmemset_chk.S
@@ -0,0 +1,46 @@
+/* Multiple versions of wmemset_chk
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <shlib-compat.h>
+#include <init-arch.h>
+
+/* Define multiple versions only for the definition in libc. */
+#if IS_IN (libc)
+# ifdef SHARED
+ENTRY(__wmemset_chk)
+	.type	__wmemset_chk, @gnu_indirect_function
+	LOAD_RTLD_GLOBAL_RO_RDX
+	lea	__wmemset_chk_sse2_unaligned(%rip), %RAX_LP
+	HAS_ARCH_FEATURE (AVX2_Usable)
+	jz	1f
+	lea	__wmemset_chk_avx2_unaligned(%rip), %RAX_LP
+	HAS_ARCH_FEATURE (Prefer_No_AVX512)
+	jnz	1f
+	HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER)
+	jnz	1f
+	HAS_ARCH_FEATURE (AVX512F_Usable)
+	jz	1f
+	lea	__wmemset_chk_avx512_unaligned(%rip), %RAX_LP
+1:	ret
+END(__wmemset_chk)
+# else
+#  include "../wmemset_chk.S"
+# endif
+#endif
diff --git a/sysdeps/x86_64/wmemset.S b/sysdeps/x86_64/wmemset.S
new file mode 100644
index 0000000..f96d567
--- /dev/null
+++ b/sysdeps/x86_64/wmemset.S
@@ -0,0 +1 @@
+/* Implemented in memset.S.  */
diff --git a/sysdeps/x86_64/wmemset_chk.S b/sysdeps/x86_64/wmemset_chk.S
new file mode 100644
index 0000000..64c2774
--- /dev/null
+++ b/sysdeps/x86_64/wmemset_chk.S
@@ -0,0 +1,33 @@
+/* Checking wmemset for x86-64.
+   Copyright (C) 2004-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
+#ifndef SHARED
+	/* For libc.so this is defined in wmemset.S.
+	   For libc.a, this is a separate source to avoid
+	   wmemset bringing in __chk_fail and all routines
+	   it calls.  */
+        .text
+ENTRY (__wmemset_chk)
+	cmpq	%rdx, %rcx
+	jb	__chk_fail
+	jmp	wmemset
+END (__wmemset_chk)
+#endif

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=2e9bca4211cfb79b86d315d12f6d9d4a41bb2dc1

commit 2e9bca4211cfb79b86d315d12f6d9d4a41bb2dc1
Author: H.J. Lu <hjl.tools@gmail.com>
Date:   Sat May 20 06:48:04 2017 -0700

    x86-64: Update strlen.S to support wcslen/wcsnlen
    
    The difference between strlen and wcslen is byte vs int.  We can
    replace pminub and pcmpeqb with pminud and pcmpeqd to turn strlen
    into wcslen.  Tested on Ivy Bridge with benchtests/bench-wcslen.c,
    the new strlen based wcslen is as fast as the old wcslen.
    
    	* sysdeps/x86_64/strlen.S (PMINU): New.
    	(PCMPEQ): Likewise.
    	(SHIFT_RETURN): Likewise.
    	(FIND_ZERO): Replace pcmpeqb with PCMPEQ.
    	(strlen): Add SHIFT_RETURN before ret.  Replace pcmpeqb and
    	pminub with PCMPEQ and PMINU.
    	* sysdeps/x86_64/wcslen.S: Define AS_WCSLEN and strlen.
    	Include "strlen.S".
    	* sysdeps/x86_64/wcsnlen.S: New file.

diff --git a/sysdeps/x86_64/strlen.S b/sysdeps/x86_64/strlen.S
index 5896e6b..b5ab117 100644
--- a/sysdeps/x86_64/strlen.S
+++ b/sysdeps/x86_64/strlen.S
@@ -1,4 +1,4 @@
-/* SSE2 version of strlen.
+/* SSE2 version of strlen/wcslen.
    Copyright (C) 2012-2017 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
@@ -18,6 +18,16 @@
 
 #include <sysdep.h>
 
+#ifdef AS_WCSLEN
+# define PMINU		pminud
+# define PCMPEQ		pcmpeqd
+# define SHIFT_RETURN	shrq $2, %rax
+#else
+# define PMINU		pminub
+# define PCMPEQ		pcmpeqb
+# define SHIFT_RETURN
+#endif
+
 /* Long lived register in strlen(s), strnlen(s, n) are:
 
 	%xmm3 - zero
@@ -32,10 +42,10 @@ ENTRY(strlen)
 
 /* Test 64 bytes from %rax for zero. Save result as bitmask in %rdx.  */
 #define FIND_ZERO	\
-	pcmpeqb	(%rax), %xmm0;	\
-	pcmpeqb	16(%rax), %xmm1;	\
-	pcmpeqb	32(%rax), %xmm2;	\
-	pcmpeqb	48(%rax), %xmm3;	\
+	PCMPEQ	(%rax), %xmm0;	\
+	PCMPEQ	16(%rax), %xmm1;	\
+	PCMPEQ	32(%rax), %xmm2;	\
+	PCMPEQ	48(%rax), %xmm3;	\
 	pmovmskb	%xmm0, %esi;	\
 	pmovmskb	%xmm1, %edx;	\
 	pmovmskb	%xmm2, %r8d;	\
@@ -54,6 +64,9 @@ ENTRY(strlen)
 	xor	%rax, %rax
 	ret
 L(n_nonzero):
+# ifdef AS_WCSLEN
+	shlq	$2, %rsi
+# endif
 
 /* Initialize long lived registers.  */
 
@@ -96,6 +109,7 @@ L(n_nonzero):
 	test	%rdx, %rdx;	\
 	je	L(lab);	\
 	bsfq	%rdx, %rax;	\
+	SHIFT_RETURN;		\
 	ret
 
 #ifdef AS_STRNLEN
@@ -104,19 +118,20 @@ L(n_nonzero):
 #else
 	/* Test first 16 bytes unaligned.  */
 	movdqu	(%rax), %xmm4
-	pcmpeqb	%xmm0, %xmm4
+	PCMPEQ	%xmm0, %xmm4
 	pmovmskb	%xmm4, %edx
 	test	%edx, %edx
 	je 	L(next48_bytes)
 	bsf	%edx, %eax /* If eax is zeroed 16bit bsf can be used.  */
+	SHIFT_RETURN
 	ret
 
 L(next48_bytes):
 /* Same as FIND_ZERO except we do not check first 16 bytes.  */
 	andq	$-16, %rax
-	pcmpeqb 16(%rax), %xmm1
-	pcmpeqb 32(%rax), %xmm2
-	pcmpeqb 48(%rax), %xmm3
+	PCMPEQ 16(%rax), %xmm1
+	PCMPEQ 32(%rax), %xmm2
+	PCMPEQ 48(%rax), %xmm3
 	pmovmskb	%xmm1, %edx
 	pmovmskb	%xmm2, %r8d
 	pmovmskb	%xmm3, %ecx
@@ -145,6 +160,7 @@ L(strnlen_ret):
 	test	%rdx, %rdx
 	je	L(loop_init)
 	bsfq	%rdx, %rax
+	SHIFT_RETURN
 	ret
 #endif
 	.p2align 4
@@ -161,10 +177,10 @@ L(loop):
 	je	L(exit_end)
 
 	movdqa	(%rax), %xmm0
-	pminub	16(%rax), %xmm0
-	pminub	32(%rax), %xmm0
-	pminub	48(%rax), %xmm0
-	pcmpeqb	%xmm3, %xmm0
+	PMINU	16(%rax), %xmm0
+	PMINU	32(%rax), %xmm0
+	PMINU	48(%rax), %xmm0
+	PCMPEQ	%xmm3, %xmm0
 	pmovmskb	%xmm0, %edx
 	testl	%edx, %edx
 	jne	L(exit)
@@ -182,6 +198,7 @@ L(first):
 	bsfq	%rdx, %rdx
 	addq	%rdx, %rax
 	subq	%rdi, %rax
+	SHIFT_RETURN
 	ret
 
 	.p2align 4
@@ -192,6 +209,7 @@ L(exit):
 	bsfq	%rdx, %rdx
 	addq	%rdx, %rax
 	subq	%rdi, %rax
+	SHIFT_RETURN
 	ret
 
 #else
@@ -201,10 +219,10 @@ L(exit):
 L(loop):
 
 	movdqa	64(%rax), %xmm0
-	pminub	80(%rax), %xmm0
-	pminub	96(%rax), %xmm0
-	pminub	112(%rax), %xmm0
-	pcmpeqb	%xmm3, %xmm0
+	PMINU	80(%rax), %xmm0
+	PMINU	96(%rax), %xmm0
+	PMINU	112(%rax), %xmm0
+	PCMPEQ	%xmm3, %xmm0
 	pmovmskb	%xmm0, %edx
 	testl	%edx, %edx
 	jne	L(exit64)
@@ -212,10 +230,10 @@ L(loop):
 	subq	$-128, %rax
 
 	movdqa	(%rax), %xmm0
-	pminub	16(%rax), %xmm0
-	pminub	32(%rax), %xmm0
-	pminub	48(%rax), %xmm0
-	pcmpeqb	%xmm3, %xmm0
+	PMINU	16(%rax), %xmm0
+	PMINU	32(%rax), %xmm0
+	PMINU	48(%rax), %xmm0
+	PCMPEQ	%xmm3, %xmm0
 	pmovmskb	%xmm0, %edx
 	testl	%edx, %edx
 	jne	L(exit0)
@@ -231,6 +249,7 @@ L(exit0):
 	bsfq	%rdx, %rdx
 	addq	%rdx, %rax
 	subq	%rdi, %rax
+	SHIFT_RETURN
 	ret
 
 #endif
diff --git a/sysdeps/x86_64/wcslen.S b/sysdeps/x86_64/wcslen.S
index c6081a4..88ecdb2 100644
--- a/sysdeps/x86_64/wcslen.S
+++ b/sysdeps/x86_64/wcslen.S
@@ -1,238 +1,6 @@
-/* Optimized wcslen for x86-64 with SSE2.
-   Copyright (C) 2011-2017 Free Software Foundation, Inc.
-   Contributed by Intel Corporation.
-   This file is part of the GNU C Library.
+#define AS_WCSLEN
+#define strlen	__wcslen
 
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <http://www.gnu.org/licenses/>.  */
-
-#include <sysdep.h>
-
-	.text
-ENTRY (__wcslen)
-	cmpl	$0, (%rdi)
-	jz	L(exit_tail0)
-	cmpl	$0, 4(%rdi)
-	jz	L(exit_tail1)
-	cmpl	$0, 8(%rdi)
-	jz	L(exit_tail2)
-	cmpl	$0, 12(%rdi)
-	jz	L(exit_tail3)
-	cmpl	$0, 16(%rdi)
-	jz	L(exit_tail4)
-	cmpl	$0, 20(%rdi)
-	jz	L(exit_tail5)
-	cmpl	$0, 24(%rdi)
-	jz	L(exit_tail6)
-	cmpl	$0, 28(%rdi)
-	jz	L(exit_tail7)
-
-	pxor	%xmm0, %xmm0
-
-	lea	32(%rdi), %rax
-	lea	16(%rdi), %rcx
-	and	$-16, %rax
-
-	pcmpeqd	(%rax), %xmm0
-	pmovmskb %xmm0, %edx
-	pxor	%xmm1, %xmm1
-	test	%edx, %edx
-	lea	16(%rax), %rax
-	jnz	L(exit)
-
-	pcmpeqd	(%rax), %xmm1
-	pmovmskb %xmm1, %edx
-	pxor	%xmm2, %xmm2
-	test	%edx, %edx
-	lea	16(%rax), %rax
-	jnz	L(exit)
-
-	pcmpeqd	(%rax), %xmm2
-	pmovmskb %xmm2, %edx
-	pxor	%xmm3, %xmm3
-	test	%edx, %edx
-	lea	16(%rax), %rax
-	jnz	L(exit)
-
-	pcmpeqd	(%rax), %xmm3
-	pmovmskb %xmm3, %edx
-	test	%edx, %edx
-	lea	16(%rax), %rax
-	jnz	L(exit)
-
-	pcmpeqd	(%rax), %xmm0
-	pmovmskb %xmm0, %edx
-	test	%edx, %edx
-	lea	16(%rax), %rax
-	jnz	L(exit)
-
-	pcmpeqd	(%rax), %xmm1
-	pmovmskb %xmm1, %edx
-	test	%edx, %edx
-	lea	16(%rax), %rax
-	jnz	L(exit)
-
-	pcmpeqd	(%rax), %xmm2
-	pmovmskb %xmm2, %edx
-	test	%edx, %edx
-	lea	16(%rax), %rax
-	jnz	L(exit)
-
-	pcmpeqd	(%rax), %xmm3
-	pmovmskb %xmm3, %edx
-	test	%edx, %edx
-	lea	16(%rax), %rax
-	jnz	L(exit)
-
-	pcmpeqd	(%rax), %xmm0
-	pmovmskb %xmm0, %edx
-	test	%edx, %edx
-	lea	16(%rax), %rax
-	jnz	L(exit)
-
-	pcmpeqd	(%rax), %xmm1
-	pmovmskb %xmm1, %edx
-	test	%edx, %edx
-	lea	16(%rax), %rax
-	jnz	L(exit)
-
-	pcmpeqd	(%rax), %xmm2
-	pmovmskb %xmm2, %edx
-	test	%edx, %edx
-	lea	16(%rax), %rax
-	jnz	L(exit)
-
-	pcmpeqd	(%rax), %xmm3
-	pmovmskb %xmm3, %edx
-	test	%edx, %edx
-	lea	16(%rax), %rax
-	jnz	L(exit)
-
-	and	$-0x40, %rax
-
-	.p2align 4
-L(aligned_64_loop):
-	movaps	(%rax), %xmm0
-	movaps	16(%rax), %xmm1
-	movaps	32(%rax), %xmm2
-	movaps	48(%rax), %xmm6
-
-	pminub	%xmm1, %xmm0
-	pminub	%xmm6, %xmm2
-	pminub	%xmm0, %xmm2
-	pcmpeqd	%xmm3, %xmm2
-	pmovmskb %xmm2, %edx
-	test	%edx, %edx
-	lea	64(%rax), %rax
-	jz	L(aligned_64_loop)
-
-	pcmpeqd	-64(%rax), %xmm3
-	pmovmskb %xmm3, %edx
-	test	%edx, %edx
-	lea	48(%rcx), %rcx
-	jnz	L(exit)
-
-	pcmpeqd	%xmm1, %xmm3
-	pmovmskb %xmm3, %edx
-	test	%edx, %edx
-	lea	-16(%rcx), %rcx
-	jnz	L(exit)
-
-	pcmpeqd	-32(%rax), %xmm3
-	pmovmskb %xmm3, %edx
-	test	%edx, %edx
-	lea	-16(%rcx), %rcx
-	jnz	L(exit)
-
-	pcmpeqd	%xmm6, %xmm3
-	pmovmskb %xmm3, %edx
-	test	%edx, %edx
-	lea	-16(%rcx), %rcx
-	jnz	L(exit)
-
-	jmp	L(aligned_64_loop)
-
-	.p2align 4
-L(exit):
-	sub	%rcx, %rax
-	shr	$2, %rax
-	test	%dl, %dl
-	jz	L(exit_high)
-
-	mov	%dl, %cl
-	and	$15, %cl
-	jz	L(exit_1)
-	ret
-
-	.p2align 4
-L(exit_high):
-	mov	%dh, %ch
-	and	$15, %ch
-	jz	L(exit_3)
-	add	$2, %rax
-	ret
-
-	.p2align 4
-L(exit_1):
-	add	$1, %rax
-	ret
-
-	.p2align 4
-L(exit_3):
-	add	$3, %rax
-	ret
-
-	.p2align 4
-L(exit_tail0):
-	xor	%rax, %rax
-	ret
-
-	.p2align 4
-L(exit_tail1):
-	mov	$1, %rax
-	ret
-
-	.p2align 4
-L(exit_tail2):
-	mov	$2, %rax
-	ret
-
-	.p2align 4
-L(exit_tail3):
-	mov	$3, %rax
-	ret
-
-	.p2align 4
-L(exit_tail4):
-	mov	$4, %rax
-	ret
-
-	.p2align 4
-L(exit_tail5):
-	mov	$5, %rax
-	ret
-
-	.p2align 4
-L(exit_tail6):
-	mov	$6, %rax
-	ret
-
-	.p2align 4
-L(exit_tail7):
-	mov	$7, %rax
-	ret
-
-END (__wcslen)
+#include "strlen.S"
 
 weak_alias(__wcslen, wcslen)
diff --git a/sysdeps/x86_64/wcsnlen.S b/sysdeps/x86_64/wcsnlen.S
new file mode 100644
index 0000000..968bb69
--- /dev/null
+++ b/sysdeps/x86_64/wcsnlen.S
@@ -0,0 +1,7 @@
+#define AS_WCSLEN
+#define AS_STRNLEN
+#define strlen	__wcsnlen
+
+#include "strlen.S"
+
+weak_alias(__wcsnlen, wcsnlen)

-----------------------------------------------------------------------


hooks/post-receive
-- 
GNU C Library master sources


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]