
Re: [PING] [PATCH] x86_32: memcpy, mempcpy, memmove, memcpy_chk, mempcpy_chk, memmove_chk optimized with SSE2 unaligned loads/stores


On Tue, Dec 23, 2014 at 08:15:00PM +0100, Ondřej Bílka wrote:
> On Mon, Dec 22, 2014 at 02:56:10PM +0300, Andrew Senkevich wrote:
> > Hi,
> > 
> > this is a ping for the patch that was discussed at the beginning of
> > August; the end of that discussion is
> > https://sourceware.org/ml/libc-alpha/2014-08/msg00078.html
> > 
> > Is it OK for trunk?
> > 
> It looked OK to me performance-wise.  I planned to check it for bugs,
> which is still on my TODO list.  If somebody else has checked it, it
> would be OK to commit.

I checked it in with a couple of fixes (sketched in C after the list):

1. Add __bcopy_sse2_unaligned.
2. Check HAS_SSE2 in ifunc-impl-list.c.
3. Replace !defined NOT_IN_libc with IS_IN (libc).
4. Check bit_SSE2 before using __xxx_sse2_unaligned.
5. Replace CPUID_OFFSET with FEATURE_OFFSET in bit_Fast_Unaligned_Load
check.
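
Fixes 4 and 5 change the selection order that each assembly stub
implements.  In C terms it is roughly the sketch below.  This is
illustrative only, not glibc source: the has_* flags and memcpy_*
stubs are hypothetical stand-ins for the __cpu_features bit tests and
the real implementation variants.

#include <stddef.h>
#include <string.h>

/* Hypothetical stand-ins for the real implementation variants; here
   they all defer to the libc memcpy.  */
static void *
memcpy_ia32 (void *d, const void *s, size_t n)
{
  return memcpy (d, s, n);
}

static void *
memcpy_sse2_unaligned (void *d, const void *s, size_t n)
{
  return memcpy (d, s, n);
}

static void *
memcpy_ssse3 (void *d, const void *s, size_t n)
{
  return memcpy (d, s, n);
}

/* Hypothetical stand-ins for the testl checks against __cpu_features:
   bit_SSE2 and bit_SSSE3 live at CPUID_OFFSET, while
   bit_Fast_Unaligned_Load lives at FEATURE_OFFSET (fix 5).  */
static int has_sse2 = 1;
static int has_fast_unaligned_load = 1;
static int has_ssse3 = 0;

typedef void *(*memcpy_fn) (void *, const void *, size_t);

/* The selection order now implemented by ENTRY(memcpy) and friends.  */
static memcpy_fn
select_memcpy (void)
{
  memcpy_fn impl = memcpy_ia32;	/* Safe default loaded into %eax.  */
  if (!has_sse2)		/* Fix 4: no SSE2, keep ia32.  */
    return impl;
  impl = memcpy_sse2_unaligned;
  if (has_fast_unaligned_load)	/* Fast unaligned loads: done.  */
    return impl;
  if (has_ssse3)		/* Otherwise prefer SSSE3; the real stubs
				   refine this further (e.g. the _rep
				   variants), omitted here.  */
    impl = memcpy_ssse3;
  return impl;
}

int
main (void)
{
  char src[16] = "hello", dst[16];
  select_memcpy () (dst, src, sizeof src);
  return 0;
}

The point of fix 4 is that %eax always holds a valid implementation
before any feature bit is tested, so a CPU without SSE2 can never be
routed to an SSE2 variant.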

Andrew, please double-check my check-in.

Thanks.

H.J.
--
diff --git a/sysdeps/i386/i686/multiarch/bcopy.S b/sysdeps/i386/i686/multiarch/bcopy.S
index a0fca88..4041eed 100644
--- a/sysdeps/i386/i686/multiarch/bcopy.S
+++ b/sysdeps/i386/i686/multiarch/bcopy.S
@@ -35,6 +35,11 @@ ENTRY(bcopy)
 	jne	1f
 	call	__init_cpu_features
 1:	leal	__bcopy_ia32@GOTOFF(%ebx), %eax
+	testl	$bit_SSE2, CPUID_OFFSET+index_SSE2+__cpu_features@GOTOFF(%ebx)
+	jz	2f
+	leal	__bcopy_sse2_unaligned@GOTOFF(%ebx), %eax
+	testl	$bit_Fast_Unaligned_Load, FEATURE_OFFSET+index_Fast_Unaligned_Load+__cpu_features@GOTOFF(%ebx)
+	jnz	2f
 	testl	$bit_SSSE3, CPUID_OFFSET+index_SSSE3+__cpu_features@GOTOFF(%ebx)
 	jz	2f
 	leal	__bcopy_ssse3@GOTOFF(%ebx), %eax
diff --git a/sysdeps/i386/i686/multiarch/ifunc-impl-list.c b/sysdeps/i386/i686/multiarch/ifunc-impl-list.c
index 8a6c4a8..4efa9c5 100644
--- a/sysdeps/i386/i686/multiarch/ifunc-impl-list.c
+++ b/sysdeps/i386/i686/multiarch/ifunc-impl-list.c
@@ -41,6 +41,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 	      IFUNC_IMPL_ADD (array, i, bcopy, HAS_SSSE3,
 			      __bcopy_ssse3_rep)
 	      IFUNC_IMPL_ADD (array, i, bcopy, HAS_SSSE3, __bcopy_ssse3)
+	      IFUNC_IMPL_ADD (array, i, bcopy, HAS_SSE2,
+			      __bcopy_sse2_unaligned)
 	      IFUNC_IMPL_ADD (array, i, bcopy, 1, __bcopy_ia32))
 
   /* Support sysdeps/i386/i686/multiarch/bzero.S.  */
@@ -69,7 +71,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			      __memmove_chk_ssse3_rep)
 	      IFUNC_IMPL_ADD (array, i, __memmove_chk, HAS_SSSE3,
 			      __memmove_chk_ssse3)
-	      IFUNC_IMPL_ADD (array, i, __memmove_chk, HAS_SSSE3,
+	      IFUNC_IMPL_ADD (array, i, __memmove_chk, HAS_SSE2,
 			      __memmove_chk_sse2_unaligned)
 	      IFUNC_IMPL_ADD (array, i, __memmove_chk, 1,
 			      __memmove_chk_ia32))
@@ -80,7 +82,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			      __memmove_ssse3_rep)
 	      IFUNC_IMPL_ADD (array, i, memmove, HAS_SSSE3,
 			      __memmove_ssse3)
-	      IFUNC_IMPL_ADD (array, i, memmove, 1,
+	      IFUNC_IMPL_ADD (array, i, memmove, HAS_SSE2,
 			      __memmove_sse2_unaligned)
 	      IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_ia32))
 
@@ -272,7 +274,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			      __memcpy_chk_ssse3_rep)
 	      IFUNC_IMPL_ADD (array, i, __memcpy_chk, HAS_SSSE3,
 			      __memcpy_chk_ssse3)
-	      IFUNC_IMPL_ADD (array, i, __memcpy_chk, 1,
+	      IFUNC_IMPL_ADD (array, i, __memcpy_chk, HAS_SSE2,
 			      __memcpy_chk_sse2_unaligned)
 	      IFUNC_IMPL_ADD (array, i, __memcpy_chk, 1,
 			      __memcpy_chk_ia32))
@@ -282,7 +284,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 	      IFUNC_IMPL_ADD (array, i, memcpy, HAS_SSSE3,
 			      __memcpy_ssse3_rep)
 	      IFUNC_IMPL_ADD (array, i, memcpy, HAS_SSSE3, __memcpy_ssse3)
-	      IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_sse2_unaligned)
+	      IFUNC_IMPL_ADD (array, i, memcpy, HAS_SSE2,
+			      __memcpy_sse2_unaligned)
 	      IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_ia32))
 
   /* Support sysdeps/i386/i686/multiarch/mempcpy_chk.S.  */
@@ -291,7 +294,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			      __mempcpy_chk_ssse3_rep)
 	      IFUNC_IMPL_ADD (array, i, __mempcpy_chk, HAS_SSSE3,
 			      __mempcpy_chk_ssse3)
-	      IFUNC_IMPL_ADD (array, i, __mempcpy_chk, HAS_SSSE3,
+	      IFUNC_IMPL_ADD (array, i, __mempcpy_chk, HAS_SSE2,
 			      __mempcpy_chk_sse2_unaligned)
 	      IFUNC_IMPL_ADD (array, i, __mempcpy_chk, 1,
 			      __mempcpy_chk_ia32))
@@ -302,7 +305,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			      __mempcpy_ssse3_rep)
 	      IFUNC_IMPL_ADD (array, i, mempcpy, HAS_SSSE3,
 			      __mempcpy_ssse3)
-	      IFUNC_IMPL_ADD (array, i, mempcpy, 1, __mempcpy_sse2_unaligned)
+	      IFUNC_IMPL_ADD (array, i, mempcpy, HAS_SSE2,
+			      __mempcpy_sse2_unaligned)
 	      IFUNC_IMPL_ADD (array, i, mempcpy, 1, __mempcpy_ia32))
 
   /* Support sysdeps/i386/i686/multiarch/strlen.S.  */
diff --git a/sysdeps/i386/i686/multiarch/memcpy-sse2-unaligned.S b/sysdeps/i386/i686/multiarch/memcpy-sse2-unaligned.S
index fc85c18..ff89de2 100644
--- a/sysdeps/i386/i686/multiarch/memcpy-sse2-unaligned.S
+++ b/sysdeps/i386/i686/multiarch/memcpy-sse2-unaligned.S
@@ -1,5 +1,5 @@
-/* memcpy optimized with sse2 unaligned memory access instructions.
-   Copyright (C) 2010-2014 Free Software Foundation, Inc.
+/* memcpy optimized with SSE2 unaligned memory access instructions.
+   Copyright (C) 2014 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
@@ -16,7 +16,7 @@
    License along with the GNU C Library; if not, see
    <http://www.gnu.org/licenses/>.  */
 
-#if !defined NOT_IN_libc \
+#if IS_IN (libc) \
     && (defined SHARED \
 	|| defined USE_AS_MEMMOVE \
 	|| !defined USE_MULTIARCH)
@@ -29,9 +29,15 @@
 #  define MEMCPY_CHK	__memcpy_chk_sse2_unaligned
 # endif
 
-# define DEST		PARMS
-# define SRC		DEST+4
-# define LEN		SRC+4
+# ifdef USE_AS_BCOPY
+#  define SRC		PARMS
+#  define DEST		SRC+4
+#  define LEN		DEST+4
+# else
+#  define DEST		PARMS
+#  define SRC		DEST+4
+#  define LEN		SRC+4
+# endif
 
 # define CFI_PUSH(REG)		\
   cfi_adjust_cfa_offset (4);		\
@@ -665,7 +671,7 @@ L(len_5_8_bytes):
 
 L(return):
 	movl	%edx, %eax
-# ifdef USE_AS_MEMPCPY
+# if !defined USE_AS_BCOPY && defined USE_AS_MEMPCPY
 	movl	LEN(%esp), %ecx
 	add	%ecx, %eax
 # endif
diff --git a/sysdeps/i386/i686/multiarch/memcpy.S b/sysdeps/i386/i686/multiarch/memcpy.S
index 6e70730..845492c 100644
--- a/sysdeps/i386/i686/multiarch/memcpy.S
+++ b/sysdeps/i386/i686/multiarch/memcpy.S
@@ -35,10 +35,12 @@ ENTRY(memcpy)
 	cmpl	$0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx)
 	jne	1f
 	call	__init_cpu_features
-1:	leal	__memcpy_sse2_unaligned@GOTOFF(%ebx), %eax
-	testl	$bit_Fast_Unaligned_Load, CPUID_OFFSET+index_Fast_Unaligned_Load+__cpu_features@GOTOFF(%ebx)
+1:	leal	__memcpy_ia32@GOTOFF(%ebx), %eax
+	testl	$bit_SSE2, CPUID_OFFSET+index_SSE2+__cpu_features@GOTOFF(%ebx)
+	jz	2f
+	leal	__memcpy_sse2_unaligned@GOTOFF(%ebx), %eax
+	testl	$bit_Fast_Unaligned_Load, FEATURE_OFFSET+index_Fast_Unaligned_Load+__cpu_features@GOTOFF(%ebx)
 	jnz	2f
-	leal	__memcpy_ia32@GOTOFF(%ebx), %eax
 	testl	$bit_SSSE3, CPUID_OFFSET+index_SSSE3+__cpu_features@GOTOFF(%ebx)
 	jz	2f
 	leal	__memcpy_ssse3@GOTOFF(%ebx), %eax
diff --git a/sysdeps/i386/i686/multiarch/memcpy_chk.S b/sysdeps/i386/i686/multiarch/memcpy_chk.S
index 142ba14..415d910 100644
--- a/sysdeps/i386/i686/multiarch/memcpy_chk.S
+++ b/sysdeps/i386/i686/multiarch/memcpy_chk.S
@@ -36,10 +36,12 @@ ENTRY(__memcpy_chk)
 	cmpl	$0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx)
 	jne	1f
 	call	__init_cpu_features
-1:	leal	__memcpy_chk_sse2_unaligned@GOTOFF(%ebx), %eax
-	testl	$bit_Fast_Unaligned_Load, CPUID_OFFSET+index_Fast_Unaligned_Load+__cpu_features@GOTOFF(%ebx)
+1:	leal	__memcpy_chk_ia32@GOTOFF(%ebx), %eax
+	testl	$bit_SSE2, CPUID_OFFSET+index_SSE2+__cpu_features@GOTOFF(%ebx)
+	jz	2f
+	leal	__memcpy_chk_sse2_unaligned@GOTOFF(%ebx), %eax
+	testl	$bit_Fast_Unaligned_Load, FEATURE_OFFSET+index_Fast_Unaligned_Load+__cpu_features@GOTOFF(%ebx)
 	jnz	2f
-	leal	__memcpy_chk_ia32@GOTOFF(%ebx), %eax
 	testl	$bit_SSSE3, CPUID_OFFSET+index_SSSE3+__cpu_features@GOTOFF(%ebx)
 	jz	2f
 	leal	__memcpy_chk_ssse3@GOTOFF(%ebx), %eax
diff --git a/sysdeps/i386/i686/multiarch/memmove.S b/sysdeps/i386/i686/multiarch/memmove.S
index 3975545..29644dd 100644
--- a/sysdeps/i386/i686/multiarch/memmove.S
+++ b/sysdeps/i386/i686/multiarch/memmove.S
@@ -34,10 +34,12 @@ ENTRY(memmove)
 	cmpl	$0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx)
 	jne	1f
 	call	__init_cpu_features
-1:	leal	__memmove_sse2_unaligned@GOTOFF(%ebx), %eax
-	testl	$bit_Fast_Unaligned_Load, CPUID_OFFSET+index_Fast_Unaligned_Load+__cpu_features@GOTOFF(%ebx)
+1:	leal	__memmove_ia32@GOTOFF(%ebx), %eax
+	testl	$bit_SSE2, CPUID_OFFSET+index_SSE2+__cpu_features@GOTOFF(%ebx)
+	jz	2f
+	leal	__memmove_sse2_unaligned@GOTOFF(%ebx), %eax
+	testl	$bit_Fast_Unaligned_Load, FEATURE_OFFSET+index_Fast_Unaligned_Load+__cpu_features@GOTOFF(%ebx)
 	jnz	2f
-	leal	__memmove_ia32@GOTOFF(%ebx), %eax
 	testl	$bit_SSSE3, CPUID_OFFSET+index_SSSE3+__cpu_features@GOTOFF(%ebx)
 	jz	2f
 	leal	__memmove_ssse3@GOTOFF(%ebx), %eax
@@ -65,10 +67,12 @@ ENTRY(memmove)
 	cmpl	$0, KIND_OFFSET+__cpu_features
 	jne	1f
 	call	__init_cpu_features
-1:	leal	__memmove_sse2_unaligned, %eax
-	testl	$bit_Fast_Unaligned_Load, CPUID_OFFSET+index_Fast_Unaligned_Load+__cpu_features
+1:	leal	__memmove_ia32, %eax
+	testl	$bit_SSE2, CPUID_OFFSET+index_SSE2+__cpu_features
+	jz	2f
+	leal	__memmove_sse2_unaligned, %eax
+	testl	$bit_Fast_Unaligned_Load, FEATURE_OFFSET+index_Fast_Unaligned_Load+__cpu_features
 	jnz	2f
-	leal	__memmove_ia32, %eax
 	testl	$bit_SSSE3, CPUID_OFFSET+index_SSSE3+__cpu_features
 	jz	2f
 	leal	__memmove_ssse3, %eax
diff --git a/sysdeps/i386/i686/multiarch/memmove_chk.S b/sysdeps/i386/i686/multiarch/memmove_chk.S
index 0b560f9..fea9b54 100644
--- a/sysdeps/i386/i686/multiarch/memmove_chk.S
+++ b/sysdeps/i386/i686/multiarch/memmove_chk.S
@@ -34,10 +34,12 @@ ENTRY(__memmove_chk)
 	cmpl	$0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx)
 	jne	1f
 	call	__init_cpu_features
-1:	leal	__memmove_chk_sse2_unaligned@GOTOFF(%ebx), %eax
-	testl	$bit_Fast_Unaligned_Load, CPUID_OFFSET+index_Fast_Unaligned_Load+__cpu_features@GOTOFF(%ebx)
+1:	leal	__memmove_chk_ia32@GOTOFF(%ebx), %eax
+	testl	$bit_SSE2, CPUID_OFFSET+index_SSE2+__cpu_features@GOTOFF(%ebx)
+	jz	2f
+	leal	__memmove_chk_sse2_unaligned@GOTOFF(%ebx), %eax
+	testl	$bit_Fast_Unaligned_Load, FEATURE_OFFSET+index_Fast_Unaligned_Load+__cpu_features@GOTOFF(%ebx)
 	jnz	2f
-	leal	__memmove_chk_ia32@GOTOFF(%ebx), %eax
 	testl	$bit_SSSE3, CPUID_OFFSET+index_SSSE3+__cpu_features@GOTOFF(%ebx)
 	jz	2f
 	leal	__memmove_chk_ssse3@GOTOFF(%ebx), %eax
@@ -56,10 +58,12 @@ ENTRY(__memmove_chk)
 	cmpl	$0, KIND_OFFSET+__cpu_features
 	jne	1f
 	call	__init_cpu_features
-1:	leal	__memmove_chk_sse2_unaligned, %eax
-	testl	$bit_Fast_Unaligned_Load, CPUID_OFFSET+index_Fast_Unaligned_Load+__cpu_features
+1:	leal	__memmove_chk_ia32, %eax
+	testl	$bit_SSE2, CPUID_OFFSET+index_SSE2+__cpu_features
+	jz	2f
+	leal	__memmove_chk_sse2_unaligned, %eax
+	testl	$bit_Fast_Unaligned_Load, FEATURE_OFFSET+index_Fast_Unaligned_Load+__cpu_features
 	jnz	2f
-	leal	__memmove_chk_ia32, %eax
 	testl	$bit_SSSE3, CPUID_OFFSET+index_SSSE3+__cpu_features
 	jz	2f
 	leal	__memmove_chk_ssse3, %eax
diff --git a/sysdeps/i386/i686/multiarch/mempcpy.S b/sysdeps/i386/i686/multiarch/mempcpy.S
index dab21e0..fd8b82c 100644
--- a/sysdeps/i386/i686/multiarch/mempcpy.S
+++ b/sysdeps/i386/i686/multiarch/mempcpy.S
@@ -35,10 +35,12 @@ ENTRY(__mempcpy)
 	cmpl	$0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx)
 	jne	1f
 	call	__init_cpu_features
-1:	leal	__mempcpy_sse2_unaligned@GOTOFF(%ebx), %eax
-	testl	$bit_Fast_Unaligned_Load, CPUID_OFFSET+index_Fast_Unaligned_Load+__cpu_features@GOTOFF(%ebx)
+1:	leal	__mempcpy_ia32@GOTOFF(%ebx), %eax
+	testl	$bit_SSE2, CPUID_OFFSET+index_SSE2+__cpu_features@GOTOFF(%ebx)
+	jz	2f
+	leal	__mempcpy_sse2_unaligned@GOTOFF(%ebx), %eax
+	testl	$bit_Fast_Unaligned_Load, FEATURE_OFFSET+index_Fast_Unaligned_Load+__cpu_features@GOTOFF(%ebx)
 	jnz	2f
-	leal	__mempcpy_ia32@GOTOFF(%ebx), %eax
 	testl	$bit_SSSE3, CPUID_OFFSET+index_SSSE3+__cpu_features@GOTOFF(%ebx)
 	jz	2f
 	leal	__mempcpy_ssse3@GOTOFF(%ebx), %eax
diff --git a/sysdeps/i386/i686/multiarch/mempcpy_chk.S b/sysdeps/i386/i686/multiarch/mempcpy_chk.S
index 6a408ab..ed23b1b 100644
--- a/sysdeps/i386/i686/multiarch/mempcpy_chk.S
+++ b/sysdeps/i386/i686/multiarch/mempcpy_chk.S
@@ -36,10 +36,12 @@ ENTRY(__mempcpy_chk)
 	cmpl	$0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx)
 	jne	1f
 	call	__init_cpu_features
-1:	leal	__mempcpy_chk_sse2_unaligned@GOTOFF(%ebx), %eax
-	testl	$bit_Fast_Unaligned_Load, CPUID_OFFSET+index_Fast_Unaligned_Load+__cpu_features@GOTOFF(%ebx)
-	jnz	2f
 	leal	__mempcpy_chk_ia32@GOTOFF(%ebx), %eax
+1:	testl	$bit_SSE2, CPUID_OFFSET+index_SSE2+__cpu_features@GOTOFF(%ebx)
+	jz	2f
+	leal	__mempcpy_chk_sse2_unaligned@GOTOFF(%ebx), %eax
+	testl	$bit_Fast_Unaligned_Load, FEATURE_OFFSET+index_Fast_Unaligned_Load+__cpu_features@GOTOFF(%ebx)
+	jnz	2f
 	testl	$bit_SSSE3, CPUID_OFFSET+index_SSSE3+__cpu_features@GOTOFF(%ebx)
 	jz	2f
 	leal	__mempcpy_chk_ssse3@GOTOFF(%ebx), %eax

Attachment: 0001-i386-memcpy-functions-with-SSE2-unaligned-load-store.patch

