This is the mail archive of the glibc-cvs@sourceware.org mailing list for the glibc project.


GNU C Library master sources branch hjl/erms/hybrid created. glibc-2.23-137-g6ce43b9


This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "GNU C Library master sources".

The branch, hjl/erms/hybrid has been created
        at  6ce43b9abbfa089e80ae8fa3508b7ecdfb56c265 (commit)

- Log -----------------------------------------------------------------
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=6ce43b9abbfa089e80ae8fa3508b7ecdfb56c265

commit 6ce43b9abbfa089e80ae8fa3508b7ecdfb56c265
Author: H.J. Lu <hjl.tools@gmail.com>
Date:   Fri Mar 18 12:36:03 2016 -0700

    Add Hybrid_ERMS and use it in memset.S

diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
index c8f81ef..b8f0b82 100644
--- a/sysdeps/x86/cpu-features.c
+++ b/sysdeps/x86/cpu-features.c
@@ -192,6 +192,12 @@ init_cpu_features (struct cpu_features *cpu_features)
 	    }
 	}
 
+      /* Enable optimization of hybrid Enhanced REP MOVSB/STOSB with
+	 SSE/AVX.  */
+	if (CPU_FEATURES_CPU_P (cpu_features, ERMS))
+	  cpu_features->feature[index_arch_Hybrid_ERMS]
+	    |= bit_arch_Hybrid_ERMS;
+
       /* Unaligned load with 256-bit AVX registers are faster on
 	 Intel processors with AVX2.  */
       if (CPU_FEATURES_ARCH_P (cpu_features, AVX2_Usable))
diff --git a/sysdeps/x86/cpu-features.h b/sysdeps/x86/cpu-features.h
index 1c09712..f1df917 100644
--- a/sysdeps/x86/cpu-features.h
+++ b/sysdeps/x86/cpu-features.h
@@ -36,6 +36,7 @@
 #define bit_arch_Prefer_MAP_32BIT_EXEC		(1 << 16)
 #define bit_arch_Prefer_No_VZEROUPPER		(1 << 17)
 #define bit_arch_Fast_ERMS			(1 << 18)
+#define bit_arch_Hybrid_ERMS			(1 << 19)
 
 /* CPUID Feature flags.  */
 
@@ -105,6 +106,7 @@
 # define index_arch_Prefer_MAP_32BIT_EXEC FEATURE_INDEX_1*FEATURE_SIZE
 # define index_arch_Prefer_No_VZEROUPPER FEATURE_INDEX_1*FEATURE_SIZE
 # define index_arch_Fast_ERMS		FEATURE_INDEX_1*FEATURE_SIZE
+# define index_arch_Hybrid_ERMS		FEATURE_INDEX_1*FEATURE_SIZE
 
 
 # if defined (_LIBC) && !IS_IN (nonlib)
@@ -272,6 +274,7 @@ extern const struct cpu_features *__get_cpu_features (void)
 # define index_arch_Prefer_MAP_32BIT_EXEC FEATURE_INDEX_1
 # define index_arch_Prefer_No_VZEROUPPER FEATURE_INDEX_1
 # define index_arch_Fast_ERMS		FEATURE_INDEX_1
+# define index_arch_Hybrid_ERMS		FEATURE_INDEX_1
 
 #endif	/* !__ASSEMBLER__ */
 
diff --git a/sysdeps/x86_64/multiarch/memset.S b/sysdeps/x86_64/multiarch/memset.S
index dda8185..7364747 100644
--- a/sysdeps/x86_64/multiarch/memset.S
+++ b/sysdeps/x86_64/multiarch/memset.S
@@ -29,10 +29,18 @@ ENTRY(memset)
 	leaq	__memset_erms(%rip), %rax
 	HAS_ARCH_FEATURE (Fast_ERMS)
 	jnz	2f
+	leaq	__memset_sse2_erms(%rip), %rax
+	HAS_ARCH_FEATURE (Hybrid_ERMS)
+	jnz	1f
 	leaq	__memset_sse2(%rip), %rax
+1:
 	HAS_ARCH_FEATURE (AVX2_Usable)
 	jz	2f
+	leaq	__memset_avx2_erms(%rip), %rax
+	HAS_ARCH_FEATURE (Hybrid_ERMS)
+	jnz	3f
 	leaq	__memset_avx2(%rip), %rax
+3:
 #ifdef HAVE_AVX512_ASM_SUPPORT
 	HAS_ARCH_FEATURE (AVX512F_Usable)
 	jz	2f
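
In effect, Hybrid_ERMS selects memset variants that keep the SSE2/AVX2 code
for short fills but switch to "rep stosb" once the size crosses a threshold
(REP_STOSB_THRESHOLD, added later in this series).  A rough C sketch of that
shape -- the byte loop merely stands in for the vector body, and the function
names are illustrative, not glibc API:

    #include <stddef.h>

    #define REP_STOSB_THRESHOLD 1024  /* value this series adds to sysdep.h */

    /* Placeholder for the SSE2/AVX2 small-size path.  */
    static void *
    memset_vector (void *dst, int c, size_t n)
    {
      unsigned char *p = dst;
      while (n--)
        *p++ = (unsigned char) c;
      return dst;
    }

    /* Hybrid variant: vector code below the threshold, rep stosb above.  */
    static void *
    memset_hybrid_erms (void *dst, int c, size_t n)
    {
      if (n <= REP_STOSB_THRESHOLD)
        return memset_vector (dst, c, n);
      void *ret = dst;
      __asm__ volatile ("rep stosb"           /* ERMS fast string store */
                        : "+D" (dst), "+c" (n)
                        : "a" (c)
                        : "memory");
      return ret;
    }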

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=9ad76210d60ab512bd329824ccf24add63c6a4b7

commit 9ad76210d60ab512bd329824ccf24add63c6a4b7
Author: H.J. Lu <hjl.tools@gmail.com>
Date:   Fri Mar 18 10:34:07 2016 -0700

    Add __memset_avx2_erms and __memset_chk_avx2_erms

diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index 002800e..cd8a2c1 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -95,6 +95,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 	      IFUNC_IMPL_ADD (array, i, __memset_chk,
 			      HAS_ARCH_FEATURE (AVX2_Usable),
 			      __memset_chk_avx2)
+	      IFUNC_IMPL_ADD (array, i, __memset_chk,
+			      HAS_ARCH_FEATURE (AVX2_Usable),
+			      __memset_chk_avx2_erms)
 #ifdef HAVE_AVX512_ASM_SUPPORT
 	      IFUNC_IMPL_ADD (array, i, __memset_chk,
 			      HAS_ARCH_FEATURE (AVX512F_Usable),
@@ -110,6 +113,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 	      IFUNC_IMPL_ADD (array, i, memset,
 			      HAS_ARCH_FEATURE (AVX2_Usable),
 			      __memset_avx2)
+	      IFUNC_IMPL_ADD (array, i, memset,
+			      HAS_ARCH_FEATURE (AVX2_Usable),
+			      __memset_avx2_erms)
 #ifdef HAVE_AVX512_ASM_SUPPORT
 	      IFUNC_IMPL_ADD (array, i, memset,
 			      HAS_ARCH_FEATURE (AVX512F_Usable),
diff --git a/sysdeps/x86_64/multiarch/memset-avx2.S b/sysdeps/x86_64/multiarch/memset-avx2.S
index df63472..9a565a9 100644
--- a/sysdeps/x86_64/multiarch/memset-avx2.S
+++ b/sysdeps/x86_64/multiarch/memset-avx2.S
@@ -28,6 +28,24 @@
 
 	.section .text.avx2,"ax",@progbits
 #if defined PIC
+ENTRY_CHK (__memset_chk_avx2_erms)
+	cmpq	%rdx, %rcx
+	jb	HIDDEN_JUMPTARGET (__chk_fail)
+END_CHK (__memset_chk_avx2_erms)
+# endif
+
+ENTRY (__memset_avx2_erms)
+	cmpq	$REP_STOSB_THRESHOLD, %rdx
+	jbe	L(start)
+	movq	%rdx, %rcx
+	movzbl	%sil, %eax
+	movq	%rdi, %rdx
+	rep stosb
+	movq	%rdx, %rax
+	ret
+END (__memset_avx2_erms)
+
+#if defined PIC
 ENTRY (MEMSET_CHK)
 	cmpq	%rdx, %rcx
 	jb	HIDDEN_JUMPTARGET (__chk_fail)
@@ -35,6 +53,7 @@ END (MEMSET_CHK)
 #endif
 
 ENTRY (MEMSET)
+L(start):
 	vpxor	%xmm0, %xmm0, %xmm0
 	vmovd	%esi, %xmm1
 	lea	(%rdi, %rdx), %rsi

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=eeedc55e73367131c49cc84749006fd7cf0d2f41

commit eeedc55e73367131c49cc84749006fd7cf0d2f41
Author: H.J. Lu <hjl.tools@gmail.com>
Date:   Fri Mar 18 10:27:58 2016 -0700

    Add avx_unaligned_erms versions of memcpy/mempcpy

diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index 543c637..002800e 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -281,6 +281,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			      HAS_ARCH_FEATURE (AVX_Usable),
 			      __memcpy_chk_avx_unaligned)
 	      IFUNC_IMPL_ADD (array, i, __memcpy_chk,
+			      HAS_ARCH_FEATURE (AVX_Usable),
+			      __memcpy_chk_avx_unaligned_erms)
+	      IFUNC_IMPL_ADD (array, i, __memcpy_chk,
 			      HAS_CPU_FEATURE (SSSE3),
 			      __memcpy_chk_ssse3_back)
 	      IFUNC_IMPL_ADD (array, i, __memcpy_chk,
@@ -300,6 +303,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 	      IFUNC_IMPL_ADD (array, i, memcpy,
 			      HAS_ARCH_FEATURE (AVX_Usable),
 			      __memcpy_avx_unaligned)
+	      IFUNC_IMPL_ADD (array, i, memcpy,
+			      HAS_ARCH_FEATURE (AVX_Usable),
+			      __memcpy_avx_unaligned_erms)
 	      IFUNC_IMPL_ADD (array, i, memcpy, HAS_CPU_FEATURE (SSSE3),
 			      __memcpy_ssse3_back)
 	      IFUNC_IMPL_ADD (array, i, memcpy, HAS_CPU_FEATURE (SSSE3),
@@ -326,6 +332,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			      HAS_ARCH_FEATURE (AVX_Usable),
 			      __mempcpy_chk_avx_unaligned)
 	      IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
+			      HAS_ARCH_FEATURE (AVX_Usable),
+			      __mempcpy_chk_avx_unaligned_erms)
+	      IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
 			      HAS_CPU_FEATURE (SSSE3),
 			      __mempcpy_chk_ssse3_back)
 	      IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
@@ -350,6 +359,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 	      IFUNC_IMPL_ADD (array, i, mempcpy,
 			      HAS_ARCH_FEATURE (AVX_Usable),
 			      __mempcpy_avx_unaligned)
+	      IFUNC_IMPL_ADD (array, i, mempcpy,
+			      HAS_ARCH_FEATURE (AVX_Usable),
+			      __mempcpy_avx_unaligned_erms)
 	      IFUNC_IMPL_ADD (array, i, mempcpy, HAS_CPU_FEATURE (SSSE3),
 			      __mempcpy_ssse3_back)
 	      IFUNC_IMPL_ADD (array, i, mempcpy, HAS_CPU_FEATURE (SSSE3),
diff --git a/sysdeps/x86_64/multiarch/memcpy-avx-unaligned.S b/sysdeps/x86_64/multiarch/memcpy-avx-unaligned.S
index c2df9da..577b80c 100644
--- a/sysdeps/x86_64/multiarch/memcpy-avx-unaligned.S
+++ b/sysdeps/x86_64/multiarch/memcpy-avx-unaligned.S
@@ -33,6 +33,32 @@
 
 	.section .text.avx,"ax",@progbits
 #if !defined USE_AS_MEMPCPY && !defined USE_AS_MEMMOVE
+ENTRY (__mempcpy_chk_avx_unaligned_erms)
+	cmpq	%rdx, %rcx
+	jb	HIDDEN_JUMPTARGET (__chk_fail)
+END (__mempcpy_chk_avx_unaligned_erms)
+
+ENTRY (__mempcpy_avx_unaligned_erms)
+	mov	%rdi, %rax
+	add	%rdx, %rax
+	jmp	L(start_erms)
+END (__mempcpy_avx_unaligned_erms)
+
+ENTRY (__memcpy_chk_avx_unaligned_erms)
+	cmpq	%rdx, %rcx
+	jb	HIDDEN_JUMPTARGET (__chk_fail)
+END (__memcpy_chk_avx_unaligned_erms)
+
+ENTRY(__memcpy_avx_unaligned_erms)
+	movq	%rdi, %rax
+L(start_erms):
+	cmpq	$REP_MOVSB_THRESHOLD, %rdx
+	jbe	L(start)
+	mov	%rdx, %rcx
+	rep movsb
+	ret
+END (__memcpy_avx_unaligned_erms)
+
 ENTRY (MEMPCPY_CHK)
 	cmpq	%rdx, %rcx
 	jb	HIDDEN_JUMPTARGET (__chk_fail)

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=21a60509c4d164f7c2ac21add165e9868da0aab0

commit 21a60509c4d164f7c2ac21add165e9868da0aab0
Author: H.J. Lu <hjl.tools@gmail.com>
Date:   Fri Mar 18 10:07:48 2016 -0700

    Remove mempcpy-*.S

diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index 2f29a2a..fdb8448 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -8,10 +8,10 @@ ifeq ($(subdir),string)
 sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 \
 		   strcmp-sse2-unaligned strncmp-ssse3 \
 		   memcmp-sse4 memcpy-ssse3 memcpy-sse2-unaligned \
-		   memcpy-avx512-no-vzeroupper mempcpy-ssse3 memmove-ssse3 \
-		   memcpy-ssse3-back mempcpy-ssse3-back memmove-avx-unaligned \
-		   memcpy-avx-unaligned mempcpy-avx-unaligned \
-		   mempcpy-avx512-no-vzeroupper memmove-ssse3-back \
+		   memcpy-avx512-no-vzeroupper memmove-ssse3 \
+		   memcpy-ssse3-back memmove-avx-unaligned \
+		   memcpy-avx-unaligned \
+		   memmove-ssse3-back \
 		   memmove-avx512-no-vzeroupper strcasecmp_l-ssse3 \
 		   strncase_l-ssse3 strcat-ssse3 strncat-ssse3\
 		   strcpy-ssse3 strncpy-ssse3 stpcpy-ssse3 stpncpy-ssse3 \
diff --git a/sysdeps/x86_64/multiarch/memcpy-avx-unaligned.S b/sysdeps/x86_64/multiarch/memcpy-avx-unaligned.S
index 4f4bf45..c2df9da 100644
--- a/sysdeps/x86_64/multiarch/memcpy-avx-unaligned.S
+++ b/sysdeps/x86_64/multiarch/memcpy-avx-unaligned.S
@@ -27,8 +27,8 @@
 #ifndef MEMCPY
 # define MEMCPY		__memcpy_avx_unaligned
 # define MEMCPY_CHK	__memcpy_chk_avx_unaligned
-# define MEMPCPY	__mempcpy_avx_unaligned_1
-# define MEMPCPY_CHK	__mempcpy_chk_avx_unaligned_1
+# define MEMPCPY	__mempcpy_avx_unaligned
+# define MEMPCPY_CHK	__mempcpy_chk_avx_unaligned
 #endif
 
 	.section .text.avx,"ax",@progbits
@@ -39,12 +39,8 @@ ENTRY (MEMPCPY_CHK)
 END (MEMPCPY_CHK)
 
 ENTRY (MEMPCPY)
-#if 0
-	lea	(%rdi, %rdx), %rax
-#else
 	mov	%rdi, %rax
 	add	%rdx, %rax
-#endif
 	jmp	L(start)
 END (MEMPCPY)
 #endif
diff --git a/sysdeps/x86_64/multiarch/memcpy-avx512-no-vzeroupper.S b/sysdeps/x86_64/multiarch/memcpy-avx512-no-vzeroupper.S
index 3a57b73..7babb47 100644
--- a/sysdeps/x86_64/multiarch/memcpy-avx512-no-vzeroupper.S
+++ b/sysdeps/x86_64/multiarch/memcpy-avx512-no-vzeroupper.S
@@ -27,8 +27,8 @@
 #ifndef MEMCPY
 # define MEMCPY		__memcpy_avx512_no_vzeroupper
 # define MEMCPY_CHK	__memcpy_chk_avx512_no_vzeroupper
-# define MEMPCPY	__mempcpy_avx512_no_vzeroupper_1
-# define MEMPCPY_CHK	__mempcpy_chk_avx512_no_vzeroupper_1
+# define MEMPCPY	__mempcpy_avx512_no_vzeroupper
+# define MEMPCPY_CHK	__mempcpy_chk_avx512_no_vzeroupper
 #endif
 
 	.section .text.avx512,"ax",@progbits
@@ -39,12 +39,8 @@ ENTRY (MEMPCPY_CHK)
 END (MEMPCPY_CHK)
 
 ENTRY (MEMPCPY)
-#if 0
-	lea	(%rdi, %rdx), %rax
-#else
 	mov	%rdi, %rax
 	add	%rdx, %rax
-#endif
 	jmp	L(start)
 END (MEMPCPY)
 #endif
diff --git a/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S b/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S
index 6184e4e..9a872d3 100644
--- a/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S
+++ b/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S
@@ -29,8 +29,8 @@
 #ifndef MEMCPY
 # define MEMCPY		__memcpy_ssse3_back
 # define MEMCPY_CHK	__memcpy_chk_ssse3_back
-# define MEMPCPY	__mempcpy_ssse3_back_1
-# define MEMPCPY_CHK	__mempcpy_chk_ssse3_back_1
+# define MEMPCPY	__mempcpy_ssse3_back
+# define MEMPCPY_CHK	__mempcpy_chk_ssse3_back
 #endif
 
 #define JMPTBL(I, B)	I - B
diff --git a/sysdeps/x86_64/multiarch/memcpy-ssse3.S b/sysdeps/x86_64/multiarch/memcpy-ssse3.S
index 127afaa..643f322 100644
--- a/sysdeps/x86_64/multiarch/memcpy-ssse3.S
+++ b/sysdeps/x86_64/multiarch/memcpy-ssse3.S
@@ -29,8 +29,8 @@
 #ifndef MEMCPY
 # define MEMCPY		__memcpy_ssse3
 # define MEMCPY_CHK	__memcpy_chk_ssse3
-# define MEMPCPY	__mempcpy_ssse3_1
-# define MEMPCPY_CHK	__mempcpy_chk_ssse3_1
+# define MEMPCPY	__mempcpy_ssse3
+# define MEMPCPY_CHK	__mempcpy_chk_ssse3
 #endif
 
 #define JMPTBL(I, B)	I - B
diff --git a/sysdeps/x86_64/multiarch/mempcpy-avx-unaligned.S b/sysdeps/x86_64/multiarch/mempcpy-avx-unaligned.S
deleted file mode 100644
index 241378e..0000000
--- a/sysdeps/x86_64/multiarch/mempcpy-avx-unaligned.S
+++ /dev/null
@@ -1,22 +0,0 @@
-/* mempcpy with AVX
-   Copyright (C) 2014-2016 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <http://www.gnu.org/licenses/>.  */
-
-#define USE_AS_MEMPCPY
-#define MEMCPY		__mempcpy_avx_unaligned
-#define MEMCPY_CHK	__mempcpy_chk_avx_unaligned
-#include "memcpy-avx-unaligned.S"
diff --git a/sysdeps/x86_64/multiarch/mempcpy-avx512-no-vzeroupper.S b/sysdeps/x86_64/multiarch/mempcpy-avx512-no-vzeroupper.S
deleted file mode 100644
index fcc0945..0000000
--- a/sysdeps/x86_64/multiarch/mempcpy-avx512-no-vzeroupper.S
+++ /dev/null
@@ -1,22 +0,0 @@
-/* mempcpy optimized with AVX512 for KNL hardware.
-   Copyright (C) 2016 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <http://www.gnu.org/licenses/>.  */
-
-#define USE_AS_MEMPCPY
-#define MEMCPY		__mempcpy_avx512_no_vzeroupper
-#define MEMCPY_CHK	__mempcpy_chk_avx512_no_vzeroupper
-#include "memcpy-avx512-no-vzeroupper.S"
diff --git a/sysdeps/x86_64/multiarch/mempcpy-ssse3-back.S b/sysdeps/x86_64/multiarch/mempcpy-ssse3-back.S
deleted file mode 100644
index 82ffacb..0000000
--- a/sysdeps/x86_64/multiarch/mempcpy-ssse3-back.S
+++ /dev/null
@@ -1,4 +0,0 @@
-#define USE_AS_MEMPCPY
-#define MEMCPY		__mempcpy_ssse3_back
-#define MEMCPY_CHK	__mempcpy_chk_ssse3_back
-#include "memcpy-ssse3-back.S"
diff --git a/sysdeps/x86_64/multiarch/mempcpy-ssse3.S b/sysdeps/x86_64/multiarch/mempcpy-ssse3.S
deleted file mode 100644
index 822d98e..0000000
--- a/sysdeps/x86_64/multiarch/mempcpy-ssse3.S
+++ /dev/null
@@ -1,4 +0,0 @@
-#define USE_AS_MEMPCPY
-#define MEMCPY		__mempcpy_ssse3
-#define MEMCPY_CHK	__mempcpy_chk_ssse3
-#include "memcpy-ssse3.S"

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=dce73065103d970a518ea7a869e218864c946198

commit dce73065103d970a518ea7a869e218864c946198
Author: H.J. Lu <hjl.tools@gmail.com>
Date:   Sun Mar 6 13:37:31 2016 -0800

    Merge memcpy with mempcpy

diff --git a/sysdeps/x86_64/multiarch/memcpy-avx-unaligned.S b/sysdeps/x86_64/multiarch/memcpy-avx-unaligned.S
index 74fed18..4f4bf45 100644
--- a/sysdeps/x86_64/multiarch/memcpy-avx-unaligned.S
+++ b/sysdeps/x86_64/multiarch/memcpy-avx-unaligned.S
@@ -25,11 +25,30 @@
 
 #include "asm-syntax.h"
 #ifndef MEMCPY
-# define MEMCPY	__memcpy_avx_unaligned
+# define MEMCPY		__memcpy_avx_unaligned
 # define MEMCPY_CHK	__memcpy_chk_avx_unaligned
+# define MEMPCPY	__mempcpy_avx_unaligned_1
+# define MEMPCPY_CHK	__mempcpy_chk_avx_unaligned_1
 #endif
 
 	.section .text.avx,"ax",@progbits
+#if !defined USE_AS_MEMPCPY && !defined USE_AS_MEMMOVE
+ENTRY (MEMPCPY_CHK)
+	cmpq	%rdx, %rcx
+	jb	HIDDEN_JUMPTARGET (__chk_fail)
+END (MEMPCPY_CHK)
+
+ENTRY (MEMPCPY)
+#if 0
+	lea	(%rdi, %rdx), %rax
+#else
+	mov	%rdi, %rax
+	add	%rdx, %rax
+#endif
+	jmp	L(start)
+END (MEMPCPY)
+#endif
+
 #if !defined USE_AS_BCOPY
 ENTRY (MEMCPY_CHK)
 	cmpq	%rdx, %rcx
@@ -42,6 +61,7 @@ ENTRY (MEMCPY)
 #ifdef USE_AS_MEMPCPY
 	add	%rdx, %rax
 #endif
+L(start):
 	cmp	$256, %rdx
 	jae	L(256bytesormore)
 	cmp	$16, %dl
diff --git a/sysdeps/x86_64/multiarch/memcpy-avx512-no-vzeroupper.S b/sysdeps/x86_64/multiarch/memcpy-avx512-no-vzeroupper.S
index 3d567fc..3a57b73 100644
--- a/sysdeps/x86_64/multiarch/memcpy-avx512-no-vzeroupper.S
+++ b/sysdeps/x86_64/multiarch/memcpy-avx512-no-vzeroupper.S
@@ -27,9 +27,28 @@
 #ifndef MEMCPY
 # define MEMCPY		__memcpy_avx512_no_vzeroupper
 # define MEMCPY_CHK	__memcpy_chk_avx512_no_vzeroupper
+# define MEMPCPY	__mempcpy_avx512_no_vzeroupper_1
+# define MEMPCPY_CHK	__mempcpy_chk_avx512_no_vzeroupper_1
 #endif
 
 	.section .text.avx512,"ax",@progbits
+#if !defined USE_AS_MEMPCPY && !defined USE_AS_MEMMOVE
+ENTRY (MEMPCPY_CHK)
+	cmpq	%rdx, %rcx
+	jb	HIDDEN_JUMPTARGET (__chk_fail)
+END (MEMPCPY_CHK)
+
+ENTRY (MEMPCPY)
+#if 0
+	lea	(%rdi, %rdx), %rax
+#else
+	mov	%rdi, %rax
+	add	%rdx, %rax
+#endif
+	jmp	L(start)
+END (MEMPCPY)
+#endif
+
 #if !defined USE_AS_BCOPY
 ENTRY (MEMCPY_CHK)
 	cmpq	%rdx, %rcx
@@ -42,6 +61,7 @@ ENTRY (MEMCPY)
 #ifdef USE_AS_MEMPCPY
 	add	%rdx, %rax
 #endif
+L(start):
 	lea	(%rsi, %rdx), %rcx
 	lea	(%rdi, %rdx), %r9
 	cmp	$512, %rdx
diff --git a/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S b/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S
index 08b41e9..6184e4e 100644
--- a/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S
+++ b/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S
@@ -29,6 +29,8 @@
 #ifndef MEMCPY
 # define MEMCPY		__memcpy_ssse3_back
 # define MEMCPY_CHK	__memcpy_chk_ssse3_back
+# define MEMPCPY	__mempcpy_ssse3_back_1
+# define MEMPCPY_CHK	__mempcpy_chk_ssse3_back_1
 #endif
 
 #define JMPTBL(I, B)	I - B
@@ -44,6 +46,23 @@
   ud2
 
 	.section .text.ssse3,"ax",@progbits
+#if !defined USE_AS_MEMPCPY && !defined USE_AS_MEMMOVE
+ENTRY (MEMPCPY_CHK)
+	cmpq	%rdx, %rcx
+	jb	HIDDEN_JUMPTARGET (__chk_fail)
+END (MEMPCPY_CHK)
+
+ENTRY (MEMPCPY)
+#if 0
+	lea	(%rdi, %rdx), %rax
+#else
+	mov	%rdi, %rax
+	add	%rdx, %rax
+#endif
+	jmp	L(start)
+END (MEMPCPY)
+#endif
+
 #if !defined USE_AS_BCOPY
 ENTRY (MEMCPY_CHK)
 	cmpq	%rdx, %rcx
@@ -66,6 +85,7 @@ ENTRY (MEMCPY)
 	BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
 L(copy_forward):
 #endif
+L(start):
 	cmp	$144, %rdx
 	jae	L(144bytesormore)
 
diff --git a/sysdeps/x86_64/multiarch/memcpy-ssse3.S b/sysdeps/x86_64/multiarch/memcpy-ssse3.S
index 95de969..127afaa 100644
--- a/sysdeps/x86_64/multiarch/memcpy-ssse3.S
+++ b/sysdeps/x86_64/multiarch/memcpy-ssse3.S
@@ -29,6 +29,8 @@
 #ifndef MEMCPY
 # define MEMCPY		__memcpy_ssse3
 # define MEMCPY_CHK	__memcpy_chk_ssse3
+# define MEMPCPY	__mempcpy_ssse3_1
+# define MEMPCPY_CHK	__mempcpy_chk_ssse3_1
 #endif
 
 #define JMPTBL(I, B)	I - B
@@ -44,6 +46,23 @@
   ud2
 
 	.section .text.ssse3,"ax",@progbits
+#if !defined USE_AS_MEMPCPY && !defined USE_AS_MEMMOVE
+ENTRY (MEMPCPY_CHK)
+	cmpq	%rdx, %rcx
+	jb	HIDDEN_JUMPTARGET (__chk_fail)
+END (MEMPCPY_CHK)
+
+ENTRY (MEMPCPY)
+#if 0
+	lea	(%rdi, %rdx), %rax
+#else
+	mov	%rdi, %rax
+	add	%rdx, %rax
+#endif
+	jmp	L(start)
+END (MEMPCPY)
+#endif
+
 #if !defined USE_AS_BCOPY
 ENTRY (MEMCPY_CHK)
 	cmpq	%rdx, %rcx
@@ -66,6 +85,7 @@ ENTRY (MEMCPY)
 	jmp	L(copy_backward)
 L(copy_forward):
 #endif
+L(start):
 	cmp	$79, %rdx
 	lea     L(table_less_80bytes)(%rip), %r11
 	ja	L(80bytesormore)
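
The merge works because mempcpy is memcpy with a different return value: each
MEMPCPY entry computes dst + n into RAX and jumps to the shared body at
L(start), which never touches RAX again.  The equivalence, in C:

    #include <string.h>

    /* mempcpy returns one past the last byte written; the copy itself is
       identical to memcpy, which is why the two bodies can be shared.  */
    void *
    my_mempcpy (void *dst, const void *src, size_t n)
    {
      memcpy (dst, src, n);
      return (char *) dst + n;
    }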

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=ad023b590b32dcde6d1b641d41758e531290b6fd

commit ad023b590b32dcde6d1b641d41758e531290b6fd
Author: H.J. Lu <hjl.tools@gmail.com>
Date:   Fri Mar 18 09:22:56 2016 -0700

    Add __memset_sse2_erms and __memset_chk_sse2_erms
    
    	* sysdeps/x86_64/memset.S (__memset_chk_sse2_erms): New
    	function.
    	(__memset_sse2_erms): Likewise.
    	* sysdeps/x86_64/multiarch/ifunc-impl-list.c
    	(__libc_ifunc_impl_list): Test __memset_chk_sse2_erms and
    	__memset_sse2_erms.
    	* sysdeps/x86_64/sysdep.h (REP_STOSB_THRESHOLD): New.

diff --git a/sysdeps/x86_64/memset.S b/sysdeps/x86_64/memset.S
index 4cf0da0..71796a7 100644
--- a/sysdeps/x86_64/memset.S
+++ b/sysdeps/x86_64/memset.S
@@ -42,6 +42,26 @@ ENTRY(__memset_tail)
 END(__memset_tail)
 #endif
 
+#ifdef USE_MULTIARCH
+# if defined PIC && IS_IN (libc)
+ENTRY_CHK (__memset_chk_sse2_erms)
+	cmpq	%rdx, %rcx
+	jb	HIDDEN_JUMPTARGET (__chk_fail)
+END_CHK (__memset_chk_sse2_erms)
+# endif
+
+ENTRY (__memset_sse2_erms)
+	cmpq	$REP_STOSB_THRESHOLD, %rdx
+	jbe	L(memset_entry)
+	movq	%rdx, %rcx
+	movzbl	%sil, %eax
+	movq	%rdi, %rdx
+	rep stosb
+	movq	%rdx, %rax
+	ret
+END (__memset_sse2_erms)
+#endif
+
 #if defined PIC && IS_IN (libc)
 ENTRY_CHK (__memset_chk)
 	cmpq	%rdx, %rcx
@@ -50,6 +70,7 @@ END_CHK (__memset_chk)
 #endif
 
 ENTRY (memset)
+L(memset_entry):
 	movd	%esi, %xmm0
 	movq	%rdi, %rax
 	punpcklbw	%xmm0, %xmm0
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index ba3202e..543c637 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -89,6 +89,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 	      IFUNC_IMPL_ADD (array, i, __memset_chk, 1,
 			      __memset_chk_sse2)
 	      IFUNC_IMPL_ADD (array, i, __memset_chk, 1,
+			      __memset_chk_sse2_erms)
+	      IFUNC_IMPL_ADD (array, i, __memset_chk, 1,
 			      __memset_chk_erms)
 	      IFUNC_IMPL_ADD (array, i, __memset_chk,
 			      HAS_ARCH_FEATURE (AVX2_Usable),
@@ -103,6 +105,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
   /* Support sysdeps/x86_64/multiarch/memset.S.  */
   IFUNC_IMPL (i, name, memset,
 	      IFUNC_IMPL_ADD (array, i, memset, 1, __memset_sse2)
+	      IFUNC_IMPL_ADD (array, i, memset, 1, __memset_sse2_erms)
 	      IFUNC_IMPL_ADD (array, i, memset, 1, __memset_erms)
 	      IFUNC_IMPL_ADD (array, i, memset,
 			      HAS_ARCH_FEATURE (AVX2_Usable),
diff --git a/sysdeps/x86_64/sysdep.h b/sysdeps/x86_64/sysdep.h
index 2444d63..38d3a32 100644
--- a/sysdeps/x86_64/sysdep.h
+++ b/sysdeps/x86_64/sysdep.h
@@ -133,6 +133,9 @@ lose:									      \
 /* Threshold to use Enhanced REP MOVSB.  */
 #define REP_MOVSB_THRESHOLD	2048
 
+/* Threshold to use Enhanced REP STOSB.  */
+#define REP_STOSB_THRESHOLD	1024
+
 #else	/* __ASSEMBLER__ */
 
 /* Long and pointer size in bytes.  */
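
The two-instruction ENTRY_CHK prologue is the entire fortify check: RCX holds
the destination object size the compiler determined, RDX the requested
length, and the code falls through into the ordinary entry when the write
fits.  Semantically, with abort standing in for the jump to __chk_fail:

    #include <stdlib.h>
    #include <string.h>

    /* Sketch of the __memset_chk contract; dstlen is the compiler-known
       size of the destination object (__builtin_object_size).  */
    void *
    my_memset_chk (void *dst, int c, size_t len, size_t dstlen)
    {
      if (dstlen < len)
        abort ();                 /* glibc jumps to __chk_fail here */
      return memset (dst, c, len);
    }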

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=f0bc5d68ff40abe2fe3ea1f2515811f9636f8ac9

commit f0bc5d68ff40abe2fe3ea1f2515811f9636f8ac9
Author: H.J. Lu <hjl.tools@gmail.com>
Date:   Fri Mar 18 08:32:05 2016 -0700

    Add sse2_unaligned_erms versions of memcpy/mempcpy
    
    	* sysdeps/x86_64/multiarch/ifunc-impl-list.c
    	(__libc_ifunc_impl_list): Test __memcpy_chk_sse2_unaligned_erms,
    	__memcpy_sse2_unaligned_erms, __mempcpy_chk_sse2_unaligned_erms
    	and __mempcpy_sse2_unaligned_erms.
    	* sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S
    	(__mempcpy_chk_sse2_unaligned_erms): New function.
    	(__mempcpy_sse2_unaligned_erms): Likewise.
    	(__memcpy_chk_sse2_unaligned_erms): Likewise.
    	(__memcpy_sse2_unaligned_erms): Likewise.
    	* sysdeps/x86_64/sysdep.h (REP_MOVSB_THRESHOLD): New.

diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index 26b4137..ba3202e 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -286,6 +286,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 	      IFUNC_IMPL_ADD (array, i, __memcpy_chk, 1,
 			      __memcpy_chk_sse2_unaligned)
 	      IFUNC_IMPL_ADD (array, i, __memcpy_chk, 1,
+			      __memcpy_chk_sse2_unaligned_erms)
+	      IFUNC_IMPL_ADD (array, i, __memcpy_chk, 1,
 			      __memcpy_chk_erms)
 	      IFUNC_IMPL_ADD (array, i, __memcpy_chk, 1,
 			      __memcpy_chk_sse2))
@@ -305,6 +307,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			      __memcpy_avx512_no_vzeroupper)
 #endif
 	      IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_sse2_unaligned)
+	      IFUNC_IMPL_ADD (array, i, memcpy, 1,
+			      __memcpy_sse2_unaligned_erms)
 	      IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_erms)
 	      IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_sse2))
 
@@ -327,6 +331,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 	      IFUNC_IMPL_ADD (array, i, __mempcpy_chk, 1,
 			      __mempcpy_chk_sse2_unaligned)
 	      IFUNC_IMPL_ADD (array, i, __mempcpy_chk, 1,
+			      __mempcpy_chk_sse2_unaligned_erms)
+	      IFUNC_IMPL_ADD (array, i, __mempcpy_chk, 1,
 			      __mempcpy_chk_erms)
 	      IFUNC_IMPL_ADD (array, i, __mempcpy_chk, 1,
 			      __mempcpy_chk_sse2))
@@ -347,6 +353,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			      __mempcpy_ssse3)
 	      IFUNC_IMPL_ADD (array, i, mempcpy, 1,
 			      __mempcpy_sse2_unaligned)
+	      IFUNC_IMPL_ADD (array, i, mempcpy, 1,
+			      __mempcpy_sse2_unaligned_erms)
 	      IFUNC_IMPL_ADD (array, i, mempcpy, 1, __mempcpy_erms)
 	      IFUNC_IMPL_ADD (array, i, mempcpy, 1, __mempcpy_sse2))
 
diff --git a/sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S b/sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S
index 947c50f..53e9464 100644
--- a/sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S
+++ b/sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S
@@ -23,6 +23,51 @@
 #include "asm-syntax.h"
 
 # ifdef SHARED
+ENTRY (__mempcpy_chk_sse2_unaligned_erms)
+	cmpq	%rdx, %rcx
+	jb	HIDDEN_JUMPTARGET (__chk_fail)
+END (__mempcpy_chk_sse2_unaligned_erms)
+# endif
+
+ENTRY (__mempcpy_sse2_unaligned_erms)
+	mov	%rdi, %rax
+	add	%rdx, %rax
+	jmp	L(start_erms)
+END (__mempcpy_sse2_unaligned_erms)
+
+# ifdef SHARED
+ENTRY (__memcpy_chk_sse2_unaligned_erms)
+	cmpq	%rdx, %rcx
+	jb	HIDDEN_JUMPTARGET (__chk_fail)
+END (__memcpy_chk_sse2_unaligned_erms)
+# endif
+
+ENTRY(__memcpy_sse2_unaligned_erms)
+	movq	%rdi, %rax
+L(start_erms):
+	testq	%rdx, %rdx
+	je	L(return)
+	cmpq	$16, %rdx
+	jbe	L(less_16)
+	cmpq	$REP_MOVSB_THRESHOLD, %rdx
+	ja	.Lerms
+	movdqu	(%rsi), %xmm8
+	cmpq	$32, %rdx
+	movdqu	%xmm8, (%rdi)
+	movdqu	-16(%rsi,%rdx), %xmm8
+	movdqu	%xmm8, -16(%rdi,%rdx)
+	ja	.L31
+	ret
+
+	.p2align 4,,10
+	.p2align 4
+.Lerms:
+	mov	%rdx, %rcx
+	rep movsb
+	ret
+END (__memcpy_sse2_unaligned_erms)
+
+# ifdef SHARED
 ENTRY (__mempcpy_chk_sse2_unaligned)
 	cmpq	%rdx, %rcx
 	jb	HIDDEN_JUMPTARGET (__chk_fail)
diff --git a/sysdeps/x86_64/sysdep.h b/sysdeps/x86_64/sysdep.h
index fbe3560..2444d63 100644
--- a/sysdeps/x86_64/sysdep.h
+++ b/sysdeps/x86_64/sysdep.h
@@ -130,6 +130,9 @@ lose:									      \
 #define R14_LP	r14
 #define R15_LP	r15
 
+/* Threshold to use Enhanced REP MOVSB.  */
+#define REP_MOVSB_THRESHOLD	2048
+
 #else	/* __ASSEMBLER__ */
 
 /* Long and pointer size in bytes.  */
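
__memcpy_sse2_unaligned_erms mirrors the memset variant: sizes up to
REP_MOVSB_THRESHOLD take the existing unaligned SSE2 body, anything larger
branches to the .Lerms tail (mov %rdx, %rcx; rep movsb).  A hedged C sketch
of that split, with a byte loop standing in for the SSE2 body:

    #include <stddef.h>

    #define REP_MOVSB_THRESHOLD 2048  /* value added to sysdep.h above */

    static void *
    memcpy_erms_sketch (void *dst, const void *src, size_t n)
    {
      void *ret = dst;
      if (n <= REP_MOVSB_THRESHOLD)
        {
          /* Stand-in for the SSE2 unaligned-load path.  */
          unsigned char *d = dst;
          const unsigned char *s = src;
          while (n--)
            *d++ = *s++;
          return ret;
        }
      __asm__ volatile ("rep movsb"           /* ERMS fast string copy */
                        : "+D" (dst), "+S" (src), "+c" (n)
                        :
                        : "memory");
      return ret;
    }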

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=438ed4b59ada763de874ba3be10bb18ea52d5643

commit 438ed4b59ada763de874ba3be10bb18ea52d5643
Author: H.J. Lu <hjl.tools@gmail.com>
Date:   Mon Mar 7 05:47:26 2016 -0800

    Enable __memcpy_chk_sse2_unaligned
    
    Check Fast_Unaligned_Load for __memcpy_chk_sse2_unaligned. The new
    selection order is:
    
    1. __memcpy_chk_avx_unaligned if AVX_Fast_Unaligned_Load bit is set.
    2. __memcpy_chk_sse2_unaligned if Fast_Unaligned_Load bit is set.
    3. __memcpy_chk_sse2 if SSSE3 isn't available.
    4. __memcpy_chk_ssse3_back if Fast_Copy_Backward bit is set.
    5. __memcpy_chk_ssse3
    
    	[BZ #19776]
    	* sysdeps/x86_64/multiarch/memcpy_chk.S (__memcpy_chk): Check
    	Fast_Unaligned_Load to enable __memcpy_chk_sse2_unaligned.

diff --git a/sysdeps/x86_64/multiarch/memcpy_chk.S b/sysdeps/x86_64/multiarch/memcpy_chk.S
index 0e21c09..9e218c0 100644
--- a/sysdeps/x86_64/multiarch/memcpy_chk.S
+++ b/sysdeps/x86_64/multiarch/memcpy_chk.S
@@ -35,22 +35,25 @@ ENTRY(__memcpy_chk)
 	jnz	2f
 #ifdef HAVE_AVX512_ASM_SUPPORT
 	HAS_ARCH_FEATURE (AVX512F_Usable)
-	jz      1f
+	jz	1f
 	HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER)
-	jz      1f
-	leaq    __memcpy_chk_avx512_no_vzeroupper(%rip), %rax
+	jz	1f
+	lea	__memcpy_chk_avx512_no_vzeroupper(%rip), %RAX_LP
 	ret
 #endif
-1:	leaq	__memcpy_chk_sse2(%rip), %rax
+1:	lea	__memcpy_chk_avx_unaligned(%rip), %RAX_LP
+	HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load)
+	jnz	2f
+	lea	__memcpy_chk_sse2_unaligned(%rip), %RAX_LP
+	HAS_ARCH_FEATURE (Fast_Unaligned_Load)
+	jnz	2f
+	lea	__memcpy_chk_sse2(%rip), %RAX_LP
 	HAS_CPU_FEATURE (SSSE3)
 	jz	2f
-	leaq	__memcpy_chk_ssse3(%rip), %rax
+	lea	__memcpy_chk_ssse3_back(%rip), %RAX_LP
 	HAS_ARCH_FEATURE (Fast_Copy_Backward)
-	jz	2f
-	leaq	__memcpy_chk_ssse3_back(%rip), %rax
-	HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load)
-	jz  2f
-	leaq    __memcpy_chk_avx_unaligned(%rip), %rax
+	jnz	2f
+	lea	__memcpy_chk_ssse3(%rip), %RAX_LP
 2:	ret
 END(__memcpy_chk)
 # else
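
The rewritten selector reads as a preference ladder, most capable variant
first (the AVX512 branch ahead of label 1 is unchanged and elided here); the
same ladder is applied to __mempcpy_chk and __mempcpy in the two commits
below.  An equivalent C rendering of the new order, with the feature bits
modeled as plain booleans:

    #include <stdbool.h>

    struct cpu
    {
      bool avx_fast_unaligned_load;   /* AVX_Fast_Unaligned_Load */
      bool fast_unaligned_load;       /* Fast_Unaligned_Load */
      bool ssse3;                     /* SSSE3 */
      bool fast_copy_backward;        /* Fast_Copy_Backward */
    };

    static const char *
    select_memcpy_chk (const struct cpu *c)
    {
      if (c->avx_fast_unaligned_load)
        return "__memcpy_chk_avx_unaligned";
      if (c->fast_unaligned_load)
        return "__memcpy_chk_sse2_unaligned";
      if (!c->ssse3)
        return "__memcpy_chk_sse2";
      if (c->fast_copy_backward)
        return "__memcpy_chk_ssse3_back";
      return "__memcpy_chk_ssse3";
    }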

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=fe0ea5b0874b4ae4255c6ad3b2e0f1aaf94a97ff

commit fe0ea5b0874b4ae4255c6ad3b2e0f1aaf94a97ff
Author: H.J. Lu <hjl.tools@gmail.com>
Date:   Mon Mar 7 05:44:58 2016 -0800

    Enable __mempcpy_chk_sse2_unaligned
    
    Check Fast_Unaligned_Load for __mempcpy_chk_sse2_unaligned. The new
    selection order is:
    
    1. __mempcpy_chk_avx_unaligned if AVX_Fast_Unaligned_Load bit is set.
    2. __mempcpy_chk_sse2_unaligned if Fast_Unaligned_Load bit is set.
    3. __mempcpy_chk_sse2 if SSSE3 isn't available.
    4. __mempcpy_chk_ssse3_back if Fast_Copy_Backward bit is set.
    5. __mempcpy_chk_ssse3
    
    	[BZ #19776]
    	* sysdeps/x86_64/multiarch/mempcpy_chk.S (__mempcpy_chk): Check
    	Fast_Unaligned_Load to enable __mempcpy_chk_sse2_unaligned.

diff --git a/sysdeps/x86_64/multiarch/mempcpy_chk.S b/sysdeps/x86_64/multiarch/mempcpy_chk.S
index de888f3..7c888d3 100644
--- a/sysdeps/x86_64/multiarch/mempcpy_chk.S
+++ b/sysdeps/x86_64/multiarch/mempcpy_chk.S
@@ -38,19 +38,22 @@ ENTRY(__mempcpy_chk)
 	jz	1f
 	HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER)
 	jz	1f
-	leaq    __mempcpy_chk_avx512_no_vzeroupper(%rip), %rax
+	lea	__mempcpy_chk_avx512_no_vzeroupper(%rip), %RAX_LP
 	ret
 #endif
-1:	leaq	__mempcpy_chk_sse2(%rip), %rax
+1:	lea	__mempcpy_chk_avx_unaligned(%rip), %RAX_LP
+	HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load)
+	jnz	2f
+	lea	__mempcpy_chk_sse2_unaligned(%rip), %RAX_LP
+	HAS_ARCH_FEATURE (Fast_Unaligned_Load)
+	jnz	2f
+	lea	__mempcpy_chk_sse2(%rip), %RAX_LP
 	HAS_CPU_FEATURE (SSSE3)
 	jz	2f
-	leaq	__mempcpy_chk_ssse3(%rip), %rax
+	lea	__mempcpy_chk_ssse3_back(%rip), %RAX_LP
 	HAS_ARCH_FEATURE (Fast_Copy_Backward)
-	jz	2f
-	leaq	__mempcpy_chk_ssse3_back(%rip), %rax
-	HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load)
-	jz	2f
-	leaq	__mempcpy_chk_avx_unaligned(%rip), %rax
+	jnz	2f
+	lea	__mempcpy_chk_ssse3(%rip), %RAX_LP
 2:	ret
 END(__mempcpy_chk)
 # else

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=175e5a4dcf431f9820dd483b0ba93d165d8652b2

commit 175e5a4dcf431f9820dd483b0ba93d165d8652b2
Author: H.J. Lu <hjl.tools@gmail.com>
Date:   Mon Mar 7 05:42:46 2016 -0800

    Enable __mempcpy_sse2_unaligned
    
    Check Fast_Unaligned_Load for __mempcpy_sse2_unaligned.  The new
    selection order is:
    
    1. __mempcpy_avx_unaligned if AVX_Fast_Unaligned_Load bit is set.
    2. __mempcpy_sse2_unaligned if Fast_Unaligned_Load bit is set.
    3. __mempcpy_sse2 if SSSE3 isn't available.
    4. __mempcpy_ssse3_back if Fast_Copy_Backward bit is set.
    5. __mempcpy_ssse3
    
    	[BZ #19776]
    	* sysdeps/x86_64/multiarch/mempcpy.S (__mempcpy): Check
    	Fast_Unaligned_Load to enable __mempcpy_sse2_unaligned.

diff --git a/sysdeps/x86_64/multiarch/mempcpy.S b/sysdeps/x86_64/multiarch/mempcpy.S
index b85cf27..05c1fc8 100644
--- a/sysdeps/x86_64/multiarch/mempcpy.S
+++ b/sysdeps/x86_64/multiarch/mempcpy.S
@@ -36,19 +36,22 @@ ENTRY(__mempcpy)
 	jz	1f
 	HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER)
 	jz	1f
-	leaq    __mempcpy_avx512_no_vzeroupper(%rip), %rax
+	lea	__mempcpy_avx512_no_vzeroupper(%rip), %RAX_LP
 	ret
 #endif
-1:	leaq	__mempcpy_sse2(%rip), %rax
+1:	lea	__mempcpy_avx_unaligned(%rip), %RAX_LP
+	HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load)
+	jnz	2f
+	lea	__mempcpy_sse2_unaligned(%rip), %RAX_LP
+	HAS_ARCH_FEATURE (Fast_Unaligned_Load)
+	jnz	2f
+	lea	__mempcpy_sse2(%rip), %RAX_LP
 	HAS_CPU_FEATURE (SSSE3)
 	jz	2f
-	leaq	__mempcpy_ssse3(%rip), %rax
+	lea	__mempcpy_ssse3_back(%rip), %RAX_LP
 	HAS_ARCH_FEATURE (Fast_Copy_Backward)
-	jz	2f
-	leaq	__mempcpy_ssse3_back(%rip), %rax
-	HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load)
-	jz	2f
-	leaq	__mempcpy_avx_unaligned(%rip), %rax
+	jnz	2f
+	lea	__mempcpy_ssse3(%rip), %RAX_LP
 2:	ret
 END(__mempcpy)
 

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=32920004ade2b9fd8ac3b3a0e177c0687b536a04

commit 32920004ade2b9fd8ac3b3a0e177c0687b536a04
Author: H.J. Lu <hjl.tools@gmail.com>
Date:   Sun Mar 6 17:06:41 2016 -0800

    Add entry points for __mempcpy_sse2_unaligned and _chk functions
    
    Add entry points for __mempcpy_chk_sse2_unaligned,
    __mempcpy_sse2_unaligned and __memcpy_chk_sse2_unaligned.
    
    	[BZ #19776]
    	* sysdeps/x86_64/multiarch/ifunc-impl-list.c
    	(__libc_ifunc_impl_list): Test __memcpy_chk_sse2_unaligned,
    	__mempcpy_chk_sse2_unaligned and __mempcpy_sse2_unaligned.
    	* sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S
    	(__mempcpy_chk_sse2_unaligned): New.
    	(__mempcpy_sse2_unaligned): Likewise.
    	(__memcpy_chk_sse2_unaligned): Likewise.
    	(L(start)): New label.

diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index b0d300d..26b4137 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -284,6 +284,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			      HAS_CPU_FEATURE (SSSE3),
 			      __memcpy_chk_ssse3)
 	      IFUNC_IMPL_ADD (array, i, __memcpy_chk, 1,
+			      __memcpy_chk_sse2_unaligned)
+	      IFUNC_IMPL_ADD (array, i, __memcpy_chk, 1,
 			      __memcpy_chk_erms)
 	      IFUNC_IMPL_ADD (array, i, __memcpy_chk, 1,
 			      __memcpy_chk_sse2))
@@ -323,6 +325,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			      HAS_CPU_FEATURE (SSSE3),
 			      __mempcpy_chk_ssse3)
 	      IFUNC_IMPL_ADD (array, i, __mempcpy_chk, 1,
+			      __mempcpy_chk_sse2_unaligned)
+	      IFUNC_IMPL_ADD (array, i, __mempcpy_chk, 1,
 			      __mempcpy_chk_erms)
 	      IFUNC_IMPL_ADD (array, i, __mempcpy_chk, 1,
 			      __mempcpy_chk_sse2))
@@ -341,6 +345,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			      __mempcpy_ssse3_back)
 	      IFUNC_IMPL_ADD (array, i, mempcpy, HAS_CPU_FEATURE (SSSE3),
 			      __mempcpy_ssse3)
+	      IFUNC_IMPL_ADD (array, i, mempcpy, 1,
+			      __mempcpy_sse2_unaligned)
 	      IFUNC_IMPL_ADD (array, i, mempcpy, 1, __mempcpy_erms)
 	      IFUNC_IMPL_ADD (array, i, mempcpy, 1, __mempcpy_sse2))
 
diff --git a/sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S b/sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S
index 335a498..947c50f 100644
--- a/sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S
+++ b/sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S
@@ -22,9 +22,29 @@
 
 #include "asm-syntax.h"
 
+# ifdef SHARED
+ENTRY (__mempcpy_chk_sse2_unaligned)
+	cmpq	%rdx, %rcx
+	jb	HIDDEN_JUMPTARGET (__chk_fail)
+END (__mempcpy_chk_sse2_unaligned)
+# endif
+
+ENTRY (__mempcpy_sse2_unaligned)
+	mov	%rdi, %rax
+	add	%rdx, %rax
+	jmp	L(start)
+END (__mempcpy_sse2_unaligned)
+
+# ifdef SHARED
+ENTRY (__memcpy_chk_sse2_unaligned)
+	cmpq	%rdx, %rcx
+	jb	HIDDEN_JUMPTARGET (__chk_fail)
+END (__memcpy_chk_sse2_unaligned)
+# endif
 
 ENTRY(__memcpy_sse2_unaligned)
 	movq	%rdi, %rax
+L(start):
 	testq	%rdx, %rdx
 	je	L(return)
 	cmpq	$16, %rdx

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=12cf609fa5ffb50af921f634c6e548b50960948d

commit 12cf609fa5ffb50af921f634c6e548b50960948d
Author: H.J. Lu <hjl.tools@gmail.com>
Date:   Sun Mar 6 16:52:53 2016 -0800

    Remove L(overlapping) from memcpy-sse2-unaligned.S
    
    Since memcpy doesn't need to check for overlapping source and
    destination, we can remove L(overlapping).
    
    	[BZ #19776]
    	* sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S
    	(L(overlapping)): Removed.

diff --git a/sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S b/sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S
index 19d8aa6..335a498 100644
--- a/sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S
+++ b/sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S
@@ -25,12 +25,8 @@
 
 ENTRY(__memcpy_sse2_unaligned)
 	movq	%rdi, %rax
-	movq	%rsi, %r11
-	leaq	(%rdx,%rdx), %rcx
-	subq	%rdi, %r11
-	subq	%rdx, %r11
-	cmpq	%rcx, %r11
-	jb	L(overlapping)
+	testq	%rdx, %rdx
+	je	L(return)
 	cmpq	$16, %rdx
 	jbe	L(less_16)
 	movdqu	(%rsi), %xmm8
@@ -89,45 +85,6 @@ L(loop):
 	cmpq	%rcx, %rdx
 	jne	L(loop)
 	ret
-L(overlapping):
-	testq	%rdx, %rdx
-	.p2align 4,,5
-	je	L(return)
-	movq	%rdx, %r9
-	leaq	16(%rsi), %rcx
-	leaq	16(%rdi), %r8
-	shrq	$4, %r9
-	movq	%r9, %r11
-	salq	$4, %r11
-	cmpq	%rcx, %rdi
-	setae	%cl
-	cmpq	%r8, %rsi
-	setae	%r8b
-	orl	%r8d, %ecx
-	cmpq	$15, %rdx
-	seta	%r8b
-	testb	%r8b, %cl
-	je	.L21
-	testq	%r11, %r11
-	je	.L21
-	xorl	%ecx, %ecx
-	xorl	%r8d, %r8d
-.L7:
-	movdqu	(%rsi,%rcx), %xmm8
-	addq	$1, %r8
-	movdqu	%xmm8, (%rdi,%rcx)
-	addq	$16, %rcx
-	cmpq	%r8, %r9
-	ja	.L7
-	cmpq	%r11, %rdx
-	je	L(return)
-.L21:
-	movzbl	(%rsi,%r11), %ecx
-	movb	%cl, (%rdi,%r11)
-	addq	$1, %r11
-	cmpq	%r11, %rdx
-	ja	.L21
-	ret
 L(less_16):
 	testb	$24, %dl
 	jne	L(between_9_16)
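
The justification comes from memcpy's contract: ISO C declares its pointer
arguments restrict, so overlapping ranges are undefined behavior and the
implementation may drop overlap detection entirely; memmove remains the
overlap-safe interface.  In C:

    #include <string.h>

    void
    overlap_example (char *buf)
    {
      /* Undefined: memcpy's operands are restrict-qualified, so the
         implementation need not detect the overlap.  */
      /* memcpy (buf, buf + 8, 16); */

      /* Defined: memmove must handle overlapping ranges.  */
      memmove (buf, buf + 8, 16);
    }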

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=849e297b6803c989f33c05f5be8f93dbb3274bb1

commit 849e297b6803c989f33c05f5be8f93dbb3274bb1
Author: H.J. Lu <hjl.tools@gmail.com>
Date:   Sun Mar 6 13:46:54 2016 -0800

    Don't use RAX as scratch register
    
    To prepare for sharing code with mempcpy, don't use RAX as a scratch
    register so that RAX can be set to the return value on entry.
    
    	[BZ #19776]
    	* sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S: Don't use
    	RAX as scratch register.

diff --git a/sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S b/sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S
index 7207753..19d8aa6 100644
--- a/sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S
+++ b/sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S
@@ -24,11 +24,12 @@
 
 
 ENTRY(__memcpy_sse2_unaligned)
-	movq	%rsi, %rax
+	movq	%rdi, %rax
+	movq	%rsi, %r11
 	leaq	(%rdx,%rdx), %rcx
-	subq	%rdi, %rax
-	subq	%rdx, %rax
-	cmpq	%rcx, %rax
+	subq	%rdi, %r11
+	subq	%rdx, %r11
+	cmpq	%rcx, %r11
 	jb	L(overlapping)
 	cmpq	$16, %rdx
 	jbe	L(less_16)
@@ -39,7 +40,6 @@ ENTRY(__memcpy_sse2_unaligned)
 	movdqu	%xmm8, -16(%rdi,%rdx)
 	ja	.L31
 L(return):
-	movq	%rdi, %rax
 	ret
 	.p2align 4,,10
 	.p2align 4
@@ -64,16 +64,16 @@ L(return):
 	addq	%rdi, %rdx
 	andq	$-64, %rdx
 	andq	$-64, %rcx
-	movq	%rcx, %rax
-	subq	%rdi, %rax
-	addq	%rax, %rsi
+	movq	%rcx, %r11
+	subq	%rdi, %r11
+	addq	%r11, %rsi
 	cmpq	%rdx, %rcx
 	je	L(return)
 	movq	%rsi, %r10
 	subq	%rcx, %r10
 	leaq	16(%r10), %r9
 	leaq	32(%r10), %r8
-	leaq	48(%r10), %rax
+	leaq	48(%r10), %r11
 	.p2align 4,,10
 	.p2align 4
 L(loop):
@@ -83,12 +83,12 @@ L(loop):
 	movdqa	%xmm8, 16(%rcx)
 	movdqu	(%rcx,%r8), %xmm8
 	movdqa	%xmm8, 32(%rcx)
-	movdqu	(%rcx,%rax), %xmm8
+	movdqu	(%rcx,%r11), %xmm8
 	movdqa	%xmm8, 48(%rcx)
 	addq	$64, %rcx
 	cmpq	%rcx, %rdx
 	jne	L(loop)
-	jmp	L(return)
+	ret
 L(overlapping):
 	testq	%rdx, %rdx
 	.p2align 4,,5
@@ -97,8 +97,8 @@ L(overlapping):
 	leaq	16(%rsi), %rcx
 	leaq	16(%rdi), %r8
 	shrq	$4, %r9
-	movq	%r9, %rax
-	salq	$4, %rax
+	movq	%r9, %r11
+	salq	$4, %r11
 	cmpq	%rcx, %rdi
 	setae	%cl
 	cmpq	%r8, %rsi
@@ -107,9 +107,9 @@ L(overlapping):
 	cmpq	$15, %rdx
 	seta	%r8b
 	testb	%r8b, %cl
-	je	.L16
-	testq	%rax, %rax
-	je	.L16
+	je	.L21
+	testq	%r11, %r11
+	je	.L21
 	xorl	%ecx, %ecx
 	xorl	%r8d, %r8d
 .L7:
@@ -119,15 +119,15 @@ L(overlapping):
 	addq	$16, %rcx
 	cmpq	%r8, %r9
 	ja	.L7
-	cmpq	%rax, %rdx
+	cmpq	%r11, %rdx
 	je	L(return)
 .L21:
-	movzbl	(%rsi,%rax), %ecx
-	movb	%cl, (%rdi,%rax)
-	addq	$1, %rax
-	cmpq	%rax, %rdx
+	movzbl	(%rsi,%r11), %ecx
+	movb	%cl, (%rdi,%r11)
+	addq	$1, %r11
+	cmpq	%r11, %rdx
 	ja	.L21
-	jmp	L(return)
+	ret
 L(less_16):
 	testb	$24, %dl
 	jne	L(between_9_16)
@@ -137,28 +137,25 @@ L(less_16):
 	testq	%rdx, %rdx
 	.p2align 4,,2
 	je	L(return)
-	movzbl	(%rsi), %eax
+	movzbl	(%rsi), %ecx
 	testb	$2, %dl
-	movb	%al, (%rdi)
+	movb	%cl, (%rdi)
 	je	L(return)
-	movzwl	-2(%rsi,%rdx), %eax
-	movw	%ax, -2(%rdi,%rdx)
-	jmp	L(return)
+	movzwl	-2(%rsi,%rdx), %ecx
+	movw	%cx, -2(%rdi,%rdx)
+	ret
 L(between_9_16):
-	movq	(%rsi), %rax
-	movq	%rax, (%rdi)
-	movq	-8(%rsi,%rdx), %rax
-	movq	%rax, -8(%rdi,%rdx)
-	jmp	L(return)
-.L16:
-	xorl	%eax, %eax
-	jmp	.L21
+	movq	(%rsi), %rcx
+	movq	%rcx, (%rdi)
+	movq	-8(%rsi,%rdx), %rcx
+	movq	%rcx, -8(%rdi,%rdx)
+	ret
 L(between_5_8):
-	movl	(%rsi), %eax
-	movl	%eax, (%rdi)
-	movl	-4(%rsi,%rdx), %eax
-	movl	%eax, -4(%rdi,%rdx)
-	jmp	L(return)
+	movl	(%rsi), %ecx
+	movl	%ecx, (%rdi)
+	movl	-4(%rsi,%rdx), %ecx
+	movl	%ecx, -4(%rdi,%rdx)
+	ret
 END(__memcpy_sse2_unaligned)
 
 #endif

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=4891daae057485dfaa319971afd1d4fe52fbc929

commit 4891daae057485dfaa319971afd1d4fe52fbc929
Author: H.J. Lu <hjl.tools@gmail.com>
Date:   Sun Mar 6 14:16:32 2016 -0800

    Remove dead code from memcpy-sse2-unaligned.S
    
    The function begins with
    
    ENTRY(__memcpy_sse2_unaligned)
       movq  %rsi, %rax
       leaq  (%rdx,%rdx), %rcx
       subq  %rdi, %rax
       subq  %rdx, %rax
       cmpq  %rcx, %rax
       jb L(overlapping)
    
    When the branch to L(overlapping) is taken,
    
       cmpq  %rsi, %rdi
       jae   .L3
    
    will never be taken.  We can remove the dead code.
    
    	[BZ #19776]
    	* sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S (.L3) Removed.

diff --git a/sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S b/sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S
index c450983..7207753 100644
--- a/sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S
+++ b/sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S
@@ -90,8 +90,6 @@ L(loop):
 	jne	L(loop)
 	jmp	L(return)
 L(overlapping):
-	cmpq	%rsi, %rdi
-	jae	.L3
 	testq	%rdx, %rdx
 	.p2align 4,,5
 	je	L(return)
@@ -146,15 +144,6 @@ L(less_16):
 	movzwl	-2(%rsi,%rdx), %eax
 	movw	%ax, -2(%rdi,%rdx)
 	jmp	L(return)
-.L3:
-	leaq	-1(%rdx), %rax
-	.p2align 4,,10
-	.p2align 4
-.L11:
-	movzbl	(%rsi,%rax), %edx
-	movb	%dl, (%rdi,%rax)
-	subq	$1, %rax
-	jmp	.L11
 L(between_9_16):
 	movq	(%rsi), %rax
 	movq	%rax, (%rdi)

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=735a17465774fc651dfe26cf27e3cf6cdf1883f6

commit 735a17465774fc651dfe26cf27e3cf6cdf1883f6
Author: H.J. Lu <hjl.tools@gmail.com>
Date:   Fri Apr 11 08:51:16 2014 -0700

    Test 32-bit ERMS memcpy/memset
    
    	* sysdeps/i386/i686/multiarch/ifunc-impl-list.c
    	(__libc_ifunc_impl_list): Add __bcopy_erms, __bzero_erms,
    	__memmove_chk_erms, __memmove_erms, __memset_chk_erms,
    	__memset_erms, __memcpy_chk_erms, __memcpy_erms,
    	__mempcpy_chk_erms and __mempcpy_erms.

diff --git a/sysdeps/i386/i686/multiarch/ifunc-impl-list.c b/sysdeps/i386/i686/multiarch/ifunc-impl-list.c
index ef30a95..f3cbca0 100644
--- a/sysdeps/i386/i686/multiarch/ifunc-impl-list.c
+++ b/sysdeps/i386/i686/multiarch/ifunc-impl-list.c
@@ -44,6 +44,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			      __bcopy_ssse3)
 	      IFUNC_IMPL_ADD (array, i, bcopy, HAS_CPU_FEATURE (SSE2),
 			      __bcopy_sse2_unaligned)
+	      IFUNC_IMPL_ADD (array, i, bcopy, 1, __bcopy_erms)
 	      IFUNC_IMPL_ADD (array, i, bcopy, 1, __bcopy_ia32))
 
   /* Support sysdeps/i386/i686/multiarch/bzero.S.  */
@@ -52,6 +53,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			      __bzero_sse2_rep)
 	      IFUNC_IMPL_ADD (array, i, bzero, HAS_CPU_FEATURE (SSE2),
 			      __bzero_sse2)
+	      IFUNC_IMPL_ADD (array, i, bzero, 1, __bzero_erms)
 	      IFUNC_IMPL_ADD (array, i, bzero, 1, __bzero_ia32))
 
   /* Support sysdeps/i386/i686/multiarch/memchr.S.  */
@@ -82,6 +84,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			      HAS_CPU_FEATURE (SSE2),
 			      __memmove_chk_sse2_unaligned)
 	      IFUNC_IMPL_ADD (array, i, __memmove_chk, 1,
+			      __memmove_chk_erms)
+	      IFUNC_IMPL_ADD (array, i, __memmove_chk, 1,
 			      __memmove_chk_ia32))
 
   /* Support sysdeps/i386/i686/multiarch/memmove.S.  */
@@ -92,6 +96,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			      __memmove_ssse3)
 	      IFUNC_IMPL_ADD (array, i, memmove, HAS_CPU_FEATURE (SSE2),
 			      __memmove_sse2_unaligned)
+	      IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_erms)
 	      IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_ia32))
 
   /* Support sysdeps/i386/i686/multiarch/memrchr.S.  */
@@ -111,6 +116,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			      HAS_CPU_FEATURE (SSE2),
 			      __memset_chk_sse2)
 	      IFUNC_IMPL_ADD (array, i, __memset_chk, 1,
+			      __memset_chk_erms)
+	      IFUNC_IMPL_ADD (array, i, __memset_chk, 1,
 			      __memset_chk_ia32))
 
   /* Support sysdeps/i386/i686/multiarch/memset.S.  */
@@ -119,6 +126,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			      __memset_sse2_rep)
 	      IFUNC_IMPL_ADD (array, i, memset, HAS_CPU_FEATURE (SSE2),
 			      __memset_sse2)
+	      IFUNC_IMPL_ADD (array, i, memset, 1, __memset_erms)
 	      IFUNC_IMPL_ADD (array, i, memset, 1, __memset_ia32))
 
   /* Support sysdeps/i386/i686/multiarch/rawmemchr.S.  */
@@ -319,6 +327,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			      HAS_CPU_FEATURE (SSE2),
 			      __memcpy_chk_sse2_unaligned)
 	      IFUNC_IMPL_ADD (array, i, __memcpy_chk, 1,
+			      __memcpy_chk_erms)
+	      IFUNC_IMPL_ADD (array, i, __memcpy_chk, 1,
 			      __memcpy_chk_ia32))
 
   /* Support sysdeps/i386/i686/multiarch/memcpy.S.  */
@@ -329,6 +339,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			      __memcpy_ssse3)
 	      IFUNC_IMPL_ADD (array, i, memcpy, HAS_CPU_FEATURE (SSE2),
 			      __memcpy_sse2_unaligned)
+	      IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_erms)
 	      IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_ia32))
 
   /* Support sysdeps/i386/i686/multiarch/mempcpy_chk.S.  */
@@ -343,6 +354,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			      HAS_CPU_FEATURE (SSE2),
 			      __mempcpy_chk_sse2_unaligned)
 	      IFUNC_IMPL_ADD (array, i, __mempcpy_chk, 1,
+			      __mempcpy_chk_erms)
+	      IFUNC_IMPL_ADD (array, i, __mempcpy_chk, 1,
 			      __mempcpy_chk_ia32))
 
   /* Support sysdeps/i386/i686/multiarch/mempcpy.S.  */
@@ -353,6 +366,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			      __mempcpy_ssse3)
 	      IFUNC_IMPL_ADD (array, i, mempcpy, HAS_CPU_FEATURE (SSE2),
 			      __mempcpy_sse2_unaligned)
+	      IFUNC_IMPL_ADD (array, i, mempcpy, 1, __mempcpy_erms)
 	      IFUNC_IMPL_ADD (array, i, mempcpy, 1, __mempcpy_ia32))
 
   /* Support sysdeps/i386/i686/multiarch/strlen.S.  */

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=665d19ea1eaa11d1a37d2e8fe679f92075583bf8

commit 665d19ea1eaa11d1a37d2e8fe679f92075583bf8
Author: H.J. Lu <hjl.tools@gmail.com>
Date:   Fri Apr 11 08:25:17 2014 -0700

    Test 64-bit ERMS memcpy/memset
    
    	* sysdeps/x86_64/multiarch/ifunc-impl-list.c
    	(__libc_ifunc_impl_list): Add __memmove_chk_erms,
    	__memmove_erms, __memset_erms, __memset_chk_erms,
    	__memcpy_chk_erms, __memcpy_erms, __mempcpy_chk_erms and
    	__mempcpy_erms.

diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index 188b6d3..b0d300d 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -63,6 +63,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			      HAS_CPU_FEATURE (SSSE3),
 			      __memmove_chk_ssse3)
 	      IFUNC_IMPL_ADD (array, i, __memmove_chk, 1,
+			      __memmove_chk_erms)
+	      IFUNC_IMPL_ADD (array, i, __memmove_chk, 1,
 			      __memmove_chk_sse2))
 
   /* Support sysdeps/x86_64/multiarch/memmove.S.  */
@@ -79,12 +81,15 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			      __memmove_ssse3_back)
 	      IFUNC_IMPL_ADD (array, i, memmove, HAS_CPU_FEATURE (SSSE3),
 			      __memmove_ssse3)
+	      IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_erms)
 	      IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_sse2))
 
   /* Support sysdeps/x86_64/multiarch/memset_chk.S.  */
   IFUNC_IMPL (i, name, __memset_chk,
 	      IFUNC_IMPL_ADD (array, i, __memset_chk, 1,
 			      __memset_chk_sse2)
+	      IFUNC_IMPL_ADD (array, i, __memset_chk, 1,
+			      __memset_chk_erms)
 	      IFUNC_IMPL_ADD (array, i, __memset_chk,
 			      HAS_ARCH_FEATURE (AVX2_Usable),
 			      __memset_chk_avx2)
@@ -98,6 +103,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
   /* Support sysdeps/x86_64/multiarch/memset.S.  */
   IFUNC_IMPL (i, name, memset,
 	      IFUNC_IMPL_ADD (array, i, memset, 1, __memset_sse2)
+	      IFUNC_IMPL_ADD (array, i, memset, 1, __memset_erms)
 	      IFUNC_IMPL_ADD (array, i, memset,
 			      HAS_ARCH_FEATURE (AVX2_Usable),
 			      __memset_avx2)
@@ -278,6 +284,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			      HAS_CPU_FEATURE (SSSE3),
 			      __memcpy_chk_ssse3)
 	      IFUNC_IMPL_ADD (array, i, __memcpy_chk, 1,
+			      __memcpy_chk_erms)
+	      IFUNC_IMPL_ADD (array, i, __memcpy_chk, 1,
 			      __memcpy_chk_sse2))
 
   /* Support sysdeps/x86_64/multiarch/memcpy.S.  */
@@ -295,6 +303,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			      __memcpy_avx512_no_vzeroupper)
 #endif
 	      IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_sse2_unaligned)
+	      IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_erms)
 	      IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_sse2))
 
   /* Support sysdeps/x86_64/multiarch/mempcpy_chk.S.  */
@@ -314,6 +323,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			      HAS_CPU_FEATURE (SSSE3),
 			      __mempcpy_chk_ssse3)
 	      IFUNC_IMPL_ADD (array, i, __mempcpy_chk, 1,
+			      __mempcpy_chk_erms)
+	      IFUNC_IMPL_ADD (array, i, __mempcpy_chk, 1,
 			      __mempcpy_chk_sse2))
 
   /* Support sysdeps/x86_64/multiarch/mempcpy.S.  */
@@ -330,6 +341,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			      __mempcpy_ssse3_back)
 	      IFUNC_IMPL_ADD (array, i, mempcpy, HAS_CPU_FEATURE (SSSE3),
 			      __mempcpy_ssse3)
+	      IFUNC_IMPL_ADD (array, i, mempcpy, 1, __mempcpy_erms)
 	      IFUNC_IMPL_ADD (array, i, mempcpy, 1, __mempcpy_sse2))
 
   /* Support sysdeps/x86_64/multiarch/strncmp.S.  */

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=ef0a94fe0d1f51db1532b7f530841bf5ad2e2ba5

commit ef0a94fe0d1f51db1532b7f530841bf5ad2e2ba5
Author: H.J. Lu <hjl.tools@gmail.com>
Date:   Wed Sep 21 15:21:28 2011 -0700

    Add 32-bit ERMS memcpy/memset
    
    	* sysdeps/i386/i686/multiarch/Makefile (sysdep_routines): Add
    	bcopy-erms, memcpy-erms, memmove-erms, mempcpy-erms, bzero-erms
    	and memset-erms.
    	* sysdeps/i386/i686/multiarch/bcopy-erms.S: New file.
    	* sysdeps/i386/i686/multiarch/bzero-erms.S: Likewise.
    	* sysdeps/i386/i686/multiarch/memcpy-erms.S: Likewise.
    	* sysdeps/i386/i686/multiarch/memmove-erms.S: Likewise.
    	* sysdeps/i386/i686/multiarch/mempcpy-erms.S: Likewise.
    	* sysdeps/i386/i686/multiarch/memset-erms.S: Likewise.
    	* sysdeps/i386/i686/multiarch/ifunc-defines.sym: Add
    	COMMON_CPUID_INDEX_7.
    	* sysdeps/i386/i686/multiarch/bcopy.S: Enable ERMS optimization
    	for Fast_ERMS.
    	* sysdeps/i386/i686/multiarch/bzero.S: Likewise.
    	* sysdeps/i386/i686/multiarch/memcpy.S: Likewise.
    	* sysdeps/i386/i686/multiarch/memcpy_chk.S: Likewise.
    	* sysdeps/i386/i686/multiarch/memmove.S: Likewise.
    	* sysdeps/i386/i686/multiarch/memmove_chk.S: Likewise.
    	* sysdeps/i386/i686/multiarch/mempcpy.S: Likewise.
    	* sysdeps/i386/i686/multiarch/mempcpy_chk.S: Likewise.
    	* sysdeps/i386/i686/multiarch/memset.S: Likewise.
    	* sysdeps/i386/i686/multiarch/memset_chk.S: Likewise.

diff --git a/sysdeps/i386/i686/multiarch/Makefile b/sysdeps/i386/i686/multiarch/Makefile
index 700010d..6bcef4c 100644
--- a/sysdeps/i386/i686/multiarch/Makefile
+++ b/sysdeps/i386/i686/multiarch/Makefile
@@ -25,7 +25,9 @@ sysdep_routines += bzero-sse2 memset-sse2 memcpy-ssse3 mempcpy-ssse3 \
 		   strcasecmp_l-sse4 strncase_l-sse4 \
 		   bcopy-sse2-unaligned memcpy-sse2-unaligned \
 		   mempcpy-sse2-unaligned memmove-sse2-unaligned \
-		   strcspn-c strpbrk-c strspn-c
+		   strcspn-c strpbrk-c strspn-c \
+		   bcopy-erms memcpy-erms memmove-erms mempcpy-erms \
+		   bzero-erms memset-erms
 CFLAGS-varshift.c += -msse4
 CFLAGS-strcspn-c.c += -msse4
 CFLAGS-strpbrk-c.c += -msse4
diff --git a/sysdeps/i386/i686/multiarch/bcopy-erms.S b/sysdeps/i386/i686/multiarch/bcopy-erms.S
new file mode 100644
index 0000000..da9e160
--- /dev/null
+++ b/sysdeps/i386/i686/multiarch/bcopy-erms.S
@@ -0,0 +1,4 @@
+#define USE_AS_MEMMOVE
+#define USE_AS_BCOPY
+#define MEMCPY		__bcopy_erms
+#include "memcpy-erms.S"
diff --git a/sysdeps/i386/i686/multiarch/bcopy.S b/sysdeps/i386/i686/multiarch/bcopy.S
index ce6661b..04f5a3a 100644
--- a/sysdeps/i386/i686/multiarch/bcopy.S
+++ b/sysdeps/i386/i686/multiarch/bcopy.S
@@ -27,6 +27,9 @@
 ENTRY(bcopy)
 	.type	bcopy, @gnu_indirect_function
 	LOAD_GOT_AND_RTLD_GLOBAL_RO
+	LOAD_FUNC_GOT_EAX (__bcopy_erms)
+	HAS_ARCH_FEATURE (Fast_ERMS)
+	jnz	2f
 	LOAD_FUNC_GOT_EAX (__bcopy_ia32)
 	HAS_CPU_FEATURE (SSE2)
 	jz	2f
diff --git a/sysdeps/i386/i686/multiarch/bzero-erms.S b/sysdeps/i386/i686/multiarch/bzero-erms.S
new file mode 100644
index 0000000..2c3bed6
--- /dev/null
+++ b/sysdeps/i386/i686/multiarch/bzero-erms.S
@@ -0,0 +1,3 @@
+#define USE_AS_BZERO
+#define __memset_erms __bzero_erms
+#include "memset-erms.S"
diff --git a/sysdeps/i386/i686/multiarch/bzero.S b/sysdeps/i386/i686/multiarch/bzero.S
index 738ca69..a61b5d2 100644
--- a/sysdeps/i386/i686/multiarch/bzero.S
+++ b/sysdeps/i386/i686/multiarch/bzero.S
@@ -27,6 +27,9 @@
 ENTRY(__bzero)
 	.type	__bzero, @gnu_indirect_function
 	LOAD_GOT_AND_RTLD_GLOBAL_RO
+	LOAD_FUNC_GOT_EAX (__bzero_erms)
+	HAS_ARCH_FEATURE (Fast_ERMS)
+	jnz	2f
 	LOAD_FUNC_GOT_EAX (__bzero_ia32)
 	HAS_CPU_FEATURE (SSE2)
 	jz	2f
diff --git a/sysdeps/i386/i686/multiarch/ifunc-defines.sym b/sysdeps/i386/i686/multiarch/ifunc-defines.sym
index 96e9cfa..3df946f 100644
--- a/sysdeps/i386/i686/multiarch/ifunc-defines.sym
+++ b/sysdeps/i386/i686/multiarch/ifunc-defines.sym
@@ -16,4 +16,5 @@ FEATURE_OFFSET		offsetof (struct cpu_features, feature)
 FEATURE_SIZE		sizeof (unsigned int)
 
 COMMON_CPUID_INDEX_1
+COMMON_CPUID_INDEX_7
 FEATURE_INDEX_1
diff --git a/sysdeps/i386/i686/multiarch/memcpy-erms.S b/sysdeps/i386/i686/multiarch/memcpy-erms.S
new file mode 100644
index 0000000..df0c801
--- /dev/null
+++ b/sysdeps/i386/i686/multiarch/memcpy-erms.S
@@ -0,0 +1,100 @@
+/* memcpy with Enhanced REP MOVSB/STOSB
+   Copyright (C) 2014 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
+
+#include <sysdep.h>
+
+#if !defined NOT_IN_libc \
+    && (defined SHARED \
+	|| defined USE_AS_MEMMOVE \
+	|| !defined USE_MULTIARCH)
+
+#include "asm-syntax.h"
+
+#ifndef MEMCPY
+# define MEMCPY		__memcpy_erms
+# define MEMCPY_CHK	__memcpy_chk_erms
+#endif
+
+#ifdef USE_AS_BCOPY
+# define STR2		12
+# define STR1		STR2+4
+# define N     		STR1+4
+#else
+# define STR1		12
+# define STR2		STR1+4
+# define N     		STR2+4
+#endif
+
+#define CFI_PUSH(REG)						\
+  cfi_adjust_cfa_offset (4);					\
+  cfi_rel_offset (REG, 0)
+
+#define CFI_POP(REG)						\
+  cfi_adjust_cfa_offset (-4);					\
+  cfi_restore (REG)
+
+#define PUSH(REG)	pushl REG; CFI_PUSH (REG)
+#define POP(REG)	popl REG; CFI_POP (REG)
+
+	.section .text.erms,"ax",@progbits
+#if !defined USE_AS_BCOPY
+ENTRY (MEMCPY_CHK)
+	movl	12(%esp), %eax
+	cmpl	%eax, 16(%esp)
+	jb	HIDDEN_JUMPTARGET (__chk_fail)
+END (MEMCPY_CHK)
+#endif
+ENTRY (MEMCPY)
+	PUSH	(%esi)
+	PUSH	(%edi)
+	movl	N(%esp), %ecx
+	movl	STR1(%esp), %edi
+	movl	STR2(%esp), %esi
+	mov	%edi, %eax
+#ifdef USE_AS_MEMPCPY
+	add	%ecx, %eax
+#endif
+
+#ifdef USE_AS_MEMMOVE
+	cmp	%esi, %edi
+	ja	L(copy_backward)
+	je	L(bwd_write_0bytes)
+#endif
+
+	rep	movsb
+	POP	(%edi)
+	POP	(%esi)
+	ret
+
+#ifdef USE_AS_MEMMOVE
+L(copy_backward):
+	lea	-1(%edi,%ecx), %edi
+	lea	-1(%esi,%ecx), %esi
+	std
+	rep	movsb
+	cld
+L(bwd_write_0bytes):
+	POP	(%edi)
+	POP	(%esi)
+	ret
+#endif
+
+END (MEMCPY)
+
+#endif
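
The memcpy-erms.S file above serves all four entry points through the
USE_AS_* knobs. For the USE_AS_MEMMOVE build, overlap is handled by
comparing the pointers: equal means nothing to do, destination above
source means the copy runs backward with the direction flag set
(std ... cld). A rough, byte-at-a-time C rendering of that logic
(illustrative only; the real code uses rep movsb in both directions):

    #include <stddef.h>

    static void *
    erms_style_move (unsigned char *dst, const unsigned char *src,
                     size_t n)
    {
      if (dst > src)              /* overlapping tail: go backward */
        while (n--)
          dst[n] = src[n];
      else if (dst < src)         /* safe to copy forward */
        for (size_t i = 0; i < n; i++)
          dst[i] = src[i];
      return dst;                 /* mempcpy returns dst + n instead */
    }
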
diff --git a/sysdeps/i386/i686/multiarch/memcpy.S b/sysdeps/i386/i686/multiarch/memcpy.S
index 652b5a2..79ae41f 100644
--- a/sysdeps/i386/i686/multiarch/memcpy.S
+++ b/sysdeps/i386/i686/multiarch/memcpy.S
@@ -29,6 +29,9 @@
 ENTRY(memcpy)
 	.type	memcpy, @gnu_indirect_function
 	LOAD_GOT_AND_RTLD_GLOBAL_RO
+	LOAD_FUNC_GOT_EAX (__memcpy_erms)
+	HAS_ARCH_FEATURE (Fast_ERMS)
+	jnz	2f
 	LOAD_FUNC_GOT_EAX (__memcpy_ia32)
 	HAS_CPU_FEATURE (SSE2)
 	jz	2f
diff --git a/sysdeps/i386/i686/multiarch/memcpy_chk.S b/sysdeps/i386/i686/multiarch/memcpy_chk.S
index 0eee32c..dd1d38a 100644
--- a/sysdeps/i386/i686/multiarch/memcpy_chk.S
+++ b/sysdeps/i386/i686/multiarch/memcpy_chk.S
@@ -30,6 +30,9 @@
 ENTRY(__memcpy_chk)
 	.type	__memcpy_chk, @gnu_indirect_function
 	LOAD_GOT_AND_RTLD_GLOBAL_RO
+	LOAD_FUNC_GOT_EAX (__memcpy_chk_erms)
+	HAS_ARCH_FEATURE (Fast_ERMS)
+	jnz	2f
 	LOAD_FUNC_GOT_EAX (__memcpy_chk_ia32)
 	HAS_CPU_FEATURE (SSE2)
 	jz	2f
diff --git a/sysdeps/i386/i686/multiarch/memmove-erms.S b/sysdeps/i386/i686/multiarch/memmove-erms.S
new file mode 100644
index 0000000..357289a
--- /dev/null
+++ b/sysdeps/i386/i686/multiarch/memmove-erms.S
@@ -0,0 +1,4 @@
+#define USE_AS_MEMMOVE
+#define MEMCPY		__memmove_erms
+#define MEMCPY_CHK	__memmove_chk_erms
+#include "memcpy-erms.S"
diff --git a/sysdeps/i386/i686/multiarch/memmove.S b/sysdeps/i386/i686/multiarch/memmove.S
index 725a421..13223b3 100644
--- a/sysdeps/i386/i686/multiarch/memmove.S
+++ b/sysdeps/i386/i686/multiarch/memmove.S
@@ -27,6 +27,9 @@
 ENTRY(memmove)
 	.type	memmove, @gnu_indirect_function
 	LOAD_GOT_AND_RTLD_GLOBAL_RO
+	LOAD_FUNC_GOT_EAX (__memmove_erms)
+	HAS_ARCH_FEATURE (Fast_ERMS)
+	jnz	2f
 	LOAD_FUNC_GOT_EAX (__memmove_ia32)
 	HAS_CPU_FEATURE (SSE2)
 	jz	2f
diff --git a/sysdeps/i386/i686/multiarch/memmove_chk.S b/sysdeps/i386/i686/multiarch/memmove_chk.S
index a29bbc9..ed000ee 100644
--- a/sysdeps/i386/i686/multiarch/memmove_chk.S
+++ b/sysdeps/i386/i686/multiarch/memmove_chk.S
@@ -27,6 +27,9 @@
 ENTRY(__memmove_chk)
 	.type	__memmove_chk, @gnu_indirect_function
 	LOAD_GOT_AND_RTLD_GLOBAL_RO
+	LOAD_FUNC_GOT_EAX (__memmove_chk_erms)
+	HAS_ARCH_FEATURE (Fast_ERMS)
+	jnz	2f
 	LOAD_FUNC_GOT_EAX (__memmove_chk_ia32)
 	HAS_CPU_FEATURE (SSE2)
 	jz	2f
@@ -90,5 +93,17 @@ __memmove_chk_ia32:
 	jmp	__memmove_ia32
 	cfi_endproc
 	.size __memmove_chk_ia32, .-__memmove_chk_ia32
+
+	.type __memmove_chk_erms, @function
+	.p2align 4;
+__memmove_chk_erms:
+	cfi_startproc
+	CALL_MCOUNT
+	movl	12(%esp), %eax
+	cmpl	%eax, 16(%esp)
+	jb	__chk_fail
+	jmp	__memmove_erms
+	cfi_endproc
+	.size __memmove_chk_erms, .-__memmove_chk_erms
 # endif
 #endif
diff --git a/sysdeps/i386/i686/multiarch/mempcpy-erms.S b/sysdeps/i386/i686/multiarch/mempcpy-erms.S
new file mode 100644
index 0000000..01d3bf8
--- /dev/null
+++ b/sysdeps/i386/i686/multiarch/mempcpy-erms.S
@@ -0,0 +1,4 @@
+#define USE_AS_MEMPCPY
+#define MEMCPY		__mempcpy_erms
+#define MEMCPY_CHK	__mempcpy_chk_erms
+#include "memcpy-erms.S"
diff --git a/sysdeps/i386/i686/multiarch/mempcpy.S b/sysdeps/i386/i686/multiarch/mempcpy.S
index b46f3fc..cceae9b 100644
--- a/sysdeps/i386/i686/multiarch/mempcpy.S
+++ b/sysdeps/i386/i686/multiarch/mempcpy.S
@@ -29,6 +29,9 @@
 ENTRY(__mempcpy)
 	.type	__mempcpy, @gnu_indirect_function
 	LOAD_GOT_AND_RTLD_GLOBAL_RO
+	LOAD_FUNC_GOT_EAX (__mempcpy_erms)
+	HAS_ARCH_FEATURE (Fast_ERMS)
+	jnz	2f
 	LOAD_FUNC_GOT_EAX (__mempcpy_ia32)
 	HAS_CPU_FEATURE (SSE2)
 	jz	2f
diff --git a/sysdeps/i386/i686/multiarch/mempcpy_chk.S b/sysdeps/i386/i686/multiarch/mempcpy_chk.S
index 30f3629..97d5179 100644
--- a/sysdeps/i386/i686/multiarch/mempcpy_chk.S
+++ b/sysdeps/i386/i686/multiarch/mempcpy_chk.S
@@ -30,6 +30,9 @@
 ENTRY(__mempcpy_chk)
 	.type	__mempcpy_chk, @gnu_indirect_function
 	LOAD_GOT_AND_RTLD_GLOBAL_RO
+	LOAD_FUNC_GOT_EAX (__mempcpy_chk_erms)
+	HAS_ARCH_FEATURE (Fast_ERMS)
+	jnz	2f
 	LOAD_FUNC_GOT_EAX (__mempcpy_chk_ia32)
 	HAS_CPU_FEATURE (SSE2)
 	jz	2f
diff --git a/sysdeps/i386/i686/multiarch/memset-erms.S b/sysdeps/i386/i686/multiarch/memset-erms.S
new file mode 100644
index 0000000..807a6e4
--- /dev/null
+++ b/sysdeps/i386/i686/multiarch/memset-erms.S
@@ -0,0 +1,69 @@
+/* memset with Enhanced REP MOVSB/STOSB
+   Copyright (C) 2014 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
+
+#ifndef NOT_IN_libc
+
+#include <sysdep.h>
+
+#define CFI_PUSH(REG)						\
+  cfi_adjust_cfa_offset (4);					\
+  cfi_rel_offset (REG, 0)
+
+#define CFI_POP(REG)						\
+  cfi_adjust_cfa_offset (-4);					\
+  cfi_restore (REG)
+
+#define PUSH(REG)	pushl REG; CFI_PUSH (REG)
+#define POP(REG)	popl REG; CFI_POP (REG)
+
+#define STR1  8
+#ifdef USE_AS_BZERO
+#define N     STR1+4
+#else
+#define STR2  STR1+4
+#define N     STR2+4
+#endif
+
+	.section .text.erms,"ax",@progbits
+#if defined SHARED && !defined NOT_IN_libc && !defined USE_AS_BZERO
+ENTRY (__memset_chk_erms)
+	movl	12(%esp), %eax
+	cmpl	%eax, 16(%esp)
+	jb	HIDDEN_JUMPTARGET (__chk_fail)
+END (__memset_chk_erms)
+#endif
+ENTRY (__memset_erms)
+	PUSH    (%edi)
+	movl	N(%esp), %ecx
+	movl	STR1(%esp), %edi
+#ifdef USE_AS_BZERO
+	xor	%eax, %eax
+#else
+	movzbl	STR2(%esp), %eax
+	mov	%edi, %edx
+#endif
+	rep	stosb
+#ifndef USE_AS_BZERO
+	mov	%edx, %eax
+#endif
+	POP     (%edi)
+	ret
+END (__memset_erms)
+
+#endif
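
__memset_erms itself is just argument marshalling around rep stosb:
ecx holds the count, al the fill byte, edi the destination, and the
original pointer is parked in edx so it can be returned (bzero has no
return value, so that save is compiled out). The mapping is perhaps
clearest in GCC extended asm; a hedged sketch for an x86 target (not
glibc code, and it assumes DF is clear on entry, as the ABI
guarantees):

    #include <stddef.h>

    static void *
    stosb_memset (void *dst, int c, size_t n)
    {
      void *d = dst;
      asm volatile ("rep stosb"
                    : "+D" (d), "+c" (n)   /* edi/rdi, ecx/rcx */
                    : "a" (c)              /* al supplies the byte */
                    : "memory");
      return dst;                          /* original pointer */
    }
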
diff --git a/sysdeps/i386/i686/multiarch/memset.S b/sysdeps/i386/i686/multiarch/memset.S
index 14180e4..3c11b91 100644
--- a/sysdeps/i386/i686/multiarch/memset.S
+++ b/sysdeps/i386/i686/multiarch/memset.S
@@ -27,6 +27,9 @@
 ENTRY(memset)
 	.type	memset, @gnu_indirect_function
 	LOAD_GOT_AND_RTLD_GLOBAL_RO
+	LOAD_FUNC_GOT_EAX (__memset_erms)
+	HAS_ARCH_FEATURE (Fast_ERMS)
+	jnz	2f
 	LOAD_FUNC_GOT_EAX (__memset_ia32)
 	HAS_CPU_FEATURE (SSE2)
 	jz	2f
diff --git a/sysdeps/i386/i686/multiarch/memset_chk.S b/sysdeps/i386/i686/multiarch/memset_chk.S
index d73f202..fa1c5fb 100644
--- a/sysdeps/i386/i686/multiarch/memset_chk.S
+++ b/sysdeps/i386/i686/multiarch/memset_chk.S
@@ -27,6 +27,9 @@
 ENTRY(__memset_chk)
 	.type	__memset_chk, @gnu_indirect_function
 	LOAD_GOT_AND_RTLD_GLOBAL_RO
+	LOAD_FUNC_GOT_EAX (__memset_chk_erms)
+	HAS_ARCH_FEATURE (Fast_ERMS)
+	jnz	2f
 	LOAD_FUNC_GOT_EAX (__memset_chk_ia32)
 	HAS_CPU_FEATURE (SSE2)
 	jz	2f
@@ -78,5 +81,17 @@ __memset_chk_ia32:
 	jmp	__memset_ia32
 	cfi_endproc
 	.size __memset_chk_ia32, .-__memset_chk_ia32
+
+	.type __memset_chk_erms, @function
+	.p2align 4;
+__memset_chk_erms:
+	cfi_startproc
+	CALL_MCOUNT
+	movl	12(%esp), %eax
+	cmpl	%eax, 16(%esp)
+	jb	__chk_fail
+	jmp	__memset_erms
+	cfi_endproc
+	.size __memset_chk_erms, .-__memset_chk_erms
 # endif
 #endif

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=ef81a3a93a95dd0017fcefe4207306d6875c3794

commit ef81a3a93a95dd0017fcefe4207306d6875c3794
Author: H.J. Lu <hjl.tools@gmail.com>
Date:   Thu Sep 15 16:16:10 2011 -0700

    Add 64-bit ERMS memcpy and memset
    
    	* sysdeps/x86_64/multiarch/Makefile (sysdep_routines): Add
    	memcpy-erms, mempcpy-erms, memmove-erms and memset-erms.
    	* sysdeps/x86_64/multiarch/memcpy-erms.S: New.
    	* sysdeps/x86_64/multiarch/memmove-erms.S: Likewise.
    	* sysdeps/x86_64/multiarch/mempcpy-erms.S: Likewise.
    	* sysdeps/x86_64/multiarch/memset-erms.S: Likewise.
    	* sysdeps/x86_64/multiarch/memcpy.S: Enable ERMS optimization
    	for Fast_ERMS.
    	* sysdeps/x86_64/multiarch/memcpy_chk.S: Likewise.
    	* sysdeps/x86_64/multiarch/memmove.c: Likewise.
    	* sysdeps/x86_64/multiarch/memmove_chk.c: Likewise.
    	* sysdeps/x86_64/multiarch/mempcpy.S: Likewise.
    	* sysdeps/x86_64/multiarch/mempcpy_chk.S: Likewise.
    	* sysdeps/x86_64/multiarch/memset.S: Likewise.
    	* sysdeps/x86_64/multiarch/memset_chk.S: Likewise.

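The 64-bit versions are shorter than their i386 counterparts because
the SysV AMD64 ABI already delivers the arguments in the registers rep
movsb consumes: rdi (destination), rsi (source) and rdx (length, moved
to rcx). No stack loads or push/pop bookkeeping are needed. An
illustrative GCC extended-asm equivalent of the __memcpy_erms kernel
(a sketch, not the glibc implementation):

    #include <stddef.h>

    static void *
    movsb_memcpy (void *dst, const void *src, size_t n)
    {
      void *d = dst;
      asm volatile ("rep movsb"
                    : "+D" (d), "+S" (src), "+c" (n)
                    :
                    : "memory");
      return dst;   /* memcpy returns the original destination */
    }
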
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index d234f4a..2f29a2a 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -20,7 +20,9 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 \
 		   strcat-sse2-unaligned strncat-sse2-unaligned \
 		   strchr-sse2-no-bsf memcmp-ssse3 strstr-sse2-unaligned \
 		   strcspn-c strpbrk-c strspn-c varshift memset-avx2 \
-		   memset-avx512-no-vzeroupper
+		   memset-avx512-no-vzeroupper \
+		   memcpy-erms mempcpy-erms memmove-erms \
+		   memset-erms
 CFLAGS-varshift.c += -msse4
 CFLAGS-strcspn-c.c += -msse4
 CFLAGS-strpbrk-c.c += -msse4
diff --git a/sysdeps/x86_64/multiarch/memcpy-erms.S b/sysdeps/x86_64/multiarch/memcpy-erms.S
new file mode 100644
index 0000000..f0595d6
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memcpy-erms.S
@@ -0,0 +1,71 @@
+/* memcpy with Enhanced REP MOVSB/STOSB
+   Copyright (C) 2014 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
+
+#include <sysdep.h>
+
+#if !defined NOT_IN_libc \
+    && (defined SHARED \
+        || defined USE_AS_MEMMOVE \
+	|| !defined USE_MULTIARCH)
+
+#  include "asm-syntax.h"
+
+#  ifndef MEMCPY
+#   define MEMCPY	__memcpy_erms
+#   define MEMCPY_CHK	__memcpy_chk_erms
+#  endif
+
+	.section .text.erms,"ax",@progbits
+# if !defined USE_AS_BCOPY
+ENTRY (MEMCPY_CHK)
+	cmpq	%rdx, %rcx
+	jb	HIDDEN_JUMPTARGET (__chk_fail)
+END (MEMCPY_CHK)
+# endif
+
+ENTRY (MEMCPY)
+	mov	%rdi, %rax
+	mov	%rdx, %rcx
+# ifdef USE_AS_MEMPCPY
+	add	%rdx, %rax
+# endif
+
+# ifdef USE_AS_MEMMOVE
+	cmp	%rsi, %rdi
+	ja	L(copy_backward)
+	je	L(bwd_write_0bytes)
+# endif
+
+	rep movsb
+	ret
+
+# ifdef USE_AS_MEMMOVE
+L(copy_backward):
+	lea	-1(%rdi,%rdx), %rdi
+	lea	-1(%rsi,%rdx), %rsi
+	std
+	rep movsb
+	cld
+L(bwd_write_0bytes):
+	ret
+# endif
+
+END (MEMCPY)
+
+#endif
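
Note how the USE_AS_MEMPCPY knob is the only difference between memcpy
and mempcpy here: one extra add so the returned pointer is dst + n
rather than dst. In C terms (a sketch; the name my_mempcpy is ours,
not glibc's):

    #include <string.h>

    static void *
    my_mempcpy (void *dst, const void *src, size_t n)
    {
      memcpy (dst, src, n);
      return (char *) dst + n;   /* what `add %rdx, %rax` computes */
    }
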
diff --git a/sysdeps/x86_64/multiarch/memcpy.S b/sysdeps/x86_64/multiarch/memcpy.S
index 8882590..58d9223 100644
--- a/sysdeps/x86_64/multiarch/memcpy.S
+++ b/sysdeps/x86_64/multiarch/memcpy.S
@@ -30,6 +30,9 @@
 ENTRY(__new_memcpy)
 	.type	__new_memcpy, @gnu_indirect_function
 	LOAD_RTLD_GLOBAL_RO_RDX
+	lea    __memcpy_erms(%rip), %RAX_LP
+	HAS_ARCH_FEATURE (Fast_ERMS)
+	jnz	2f
 #ifdef HAVE_AVX512_ASM_SUPPORT
 	HAS_ARCH_FEATURE (AVX512F_Usable)
 	jz	1f
diff --git a/sysdeps/x86_64/multiarch/memcpy_chk.S b/sysdeps/x86_64/multiarch/memcpy_chk.S
index 648217e..0e21c09 100644
--- a/sysdeps/x86_64/multiarch/memcpy_chk.S
+++ b/sysdeps/x86_64/multiarch/memcpy_chk.S
@@ -30,6 +30,9 @@
 ENTRY(__memcpy_chk)
 	.type	__memcpy_chk, @gnu_indirect_function
 	LOAD_RTLD_GLOBAL_RO_RDX
+	leaq	__memcpy_chk_erms(%rip), %rax
+	HAS_ARCH_FEATURE (Fast_ERMS)
+	jnz	2f
 #ifdef HAVE_AVX512_ASM_SUPPORT
 	HAS_ARCH_FEATURE (AVX512F_Usable)
 	jz      1f
diff --git a/sysdeps/x86_64/multiarch/memmove-erms.S b/sysdeps/x86_64/multiarch/memmove-erms.S
new file mode 100644
index 0000000..357289a
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memmove-erms.S
@@ -0,0 +1,4 @@
+#define USE_AS_MEMMOVE
+#define MEMCPY		__memmove_erms
+#define MEMCPY_CHK	__memmove_chk_erms
+#include "memcpy-erms.S"
diff --git a/sysdeps/x86_64/multiarch/memmove.c b/sysdeps/x86_64/multiarch/memmove.c
index 8da5640..3777bea 100644
--- a/sysdeps/x86_64/multiarch/memmove.c
+++ b/sysdeps/x86_64/multiarch/memmove.c
@@ -35,6 +35,7 @@
 extern __typeof (__redirect_memmove) __memmove_sse2 attribute_hidden;
 extern __typeof (__redirect_memmove) __memmove_ssse3 attribute_hidden;
 extern __typeof (__redirect_memmove) __memmove_ssse3_back attribute_hidden;
+extern __typeof (__redirect_memmove) __memmove_erms attribute_hidden;
 extern __typeof (__redirect_memmove) __memmove_avx_unaligned attribute_hidden;
 # ifdef HAVE_AVX512_ASM_SUPPORT
   extern __typeof (__redirect_memmove) __memmove_avx512_no_vzeroupper attribute_hidden;
@@ -52,6 +53,9 @@ extern __typeof (__redirect_memmove) __memmove_avx_unaligned attribute_hidden;
    ifunc symbol properly.  */
 extern __typeof (__redirect_memmove) __libc_memmove;
 libc_ifunc (__libc_memmove,
+	    HAS_ARCH_FEATURE (Fast_ERMS)
+	    ? __memmove_erms
+	    : (
 #ifdef HAVE_AVX512_ASM_SUPPORT
 	    HAS_ARCH_FEATURE (AVX512F_Usable)
 	      && HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER)
@@ -63,7 +67,7 @@ libc_ifunc (__libc_memmove,
 	    : (HAS_CPU_FEATURE (SSSE3)
 	       ? (HAS_ARCH_FEATURE (Fast_Copy_Backward)
 	          ? __memmove_ssse3_back : __memmove_ssse3)
-	       : __memmove_sse2)));
+	       : __memmove_sse2))));
 
 strong_alias (__libc_memmove, memmove)
 
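The added "? __memmove_erms : (" and the matching ")" at the end keep
the ternary chain balanced; the net effect is simply that Fast_ERMS is
tested before everything else. Written out flat, the resulting
selection looks roughly like this (stand-in flag and pointer names;
the AVX-512/AVX branches are elided for brevity):

    #include <stddef.h>

    typedef void *(*memmove_fn) (void *, const void *, size_t);
    extern memmove_fn memmove_erms_p, memmove_ssse3_back_p,
                      memmove_ssse3_p, memmove_sse2_p;
    extern int fast_erms, ssse3, fast_copy_backward;

    static memmove_fn
    pick_memmove (void)
    {
      if (fast_erms)                  /* new check, tried first */
        return memmove_erms_p;
      /* ... AVX-512 / AVX unaligned branches elided ... */
      if (ssse3)
        return fast_copy_backward ? memmove_ssse3_back_p
                                  : memmove_ssse3_p;
      return memmove_sse2_p;
    }
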
diff --git a/sysdeps/x86_64/multiarch/memmove_chk.c b/sysdeps/x86_64/multiarch/memmove_chk.c
index f64da63..4cd360a 100644
--- a/sysdeps/x86_64/multiarch/memmove_chk.c
+++ b/sysdeps/x86_64/multiarch/memmove_chk.c
@@ -25,6 +25,7 @@
 extern __typeof (__memmove_chk) __memmove_chk_sse2 attribute_hidden;
 extern __typeof (__memmove_chk) __memmove_chk_ssse3 attribute_hidden;
 extern __typeof (__memmove_chk) __memmove_chk_ssse3_back attribute_hidden;
+extern __typeof (__memmove_chk) __memmove_chk_erms attribute_hidden;
 extern __typeof (__memmove_chk) __memmove_chk_avx_unaligned attribute_hidden;
 # ifdef HAVE_AVX512_ASM_SUPPORT
   extern __typeof (__memmove_chk) __memmove_chk_avx512_no_vzeroupper attribute_hidden;
@@ -33,6 +34,9 @@ extern __typeof (__memmove_chk) __memmove_chk_avx_unaligned attribute_hidden;
 #include "debug/memmove_chk.c"
 
 libc_ifunc (__memmove_chk,
+	    HAS_ARCH_FEATURE (Fast_ERMS)
+	    ? __memmove_chk_erms
+	    : (
 #ifdef HAVE_AVX512_ASM_SUPPORT
 	    HAS_ARCH_FEATURE (AVX512F_Usable)
 	      && HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER)
@@ -43,4 +47,4 @@ libc_ifunc (__memmove_chk,
 	    (HAS_CPU_FEATURE (SSSE3)
 	    ? (HAS_ARCH_FEATURE (Fast_Copy_Backward)
 	       ? __memmove_chk_ssse3_back : __memmove_chk_ssse3)
-	    : __memmove_chk_sse2));
+	    : __memmove_chk_sse2)));
diff --git a/sysdeps/x86_64/multiarch/mempcpy-erms.S b/sysdeps/x86_64/multiarch/mempcpy-erms.S
new file mode 100644
index 0000000..01d3bf8
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/mempcpy-erms.S
@@ -0,0 +1,4 @@
+#define USE_AS_MEMPCPY
+#define MEMCPY		__mempcpy_erms
+#define MEMCPY_CHK	__mempcpy_chk_erms
+#include "memcpy-erms.S"
diff --git a/sysdeps/x86_64/multiarch/mempcpy.S b/sysdeps/x86_64/multiarch/mempcpy.S
index ed78623..b85cf27 100644
--- a/sysdeps/x86_64/multiarch/mempcpy.S
+++ b/sysdeps/x86_64/multiarch/mempcpy.S
@@ -28,6 +28,9 @@
 ENTRY(__mempcpy)
 	.type	__mempcpy, @gnu_indirect_function
 	LOAD_RTLD_GLOBAL_RO_RDX
+	leaq	__mempcpy_erms(%rip), %rax
+	HAS_ARCH_FEATURE (Fast_ERMS)
+	jnz	2f
 #ifdef HAVE_AVX512_ASM_SUPPORT
 	HAS_ARCH_FEATURE (AVX512F_Usable)
 	jz	1f
diff --git a/sysdeps/x86_64/multiarch/mempcpy_chk.S b/sysdeps/x86_64/multiarch/mempcpy_chk.S
index 6e8a89d..de888f3 100644
--- a/sysdeps/x86_64/multiarch/mempcpy_chk.S
+++ b/sysdeps/x86_64/multiarch/mempcpy_chk.S
@@ -30,6 +30,9 @@
 ENTRY(__mempcpy_chk)
 	.type	__mempcpy_chk, @gnu_indirect_function
 	LOAD_RTLD_GLOBAL_RO_RDX
+	leaq	__mempcpy_chk_erms(%rip), %rax
+	HAS_ARCH_FEATURE (Fast_ERMS)
+	jnz	2f
 #ifdef HAVE_AVX512_ASM_SUPPORT
 	HAS_ARCH_FEATURE (AVX512F_Usable)
 	jz	1f
diff --git a/sysdeps/x86_64/multiarch/memset-erms.S b/sysdeps/x86_64/multiarch/memset-erms.S
new file mode 100644
index 0000000..af9f80b
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memset-erms.S
@@ -0,0 +1,40 @@
+/* memset with Enhanced REP MOVSB/STOSB
+   Copyright (C) 2014 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
+
+#include <sysdep.h>
+
+#ifndef NOT_IN_libc
+
+	.text
+# ifdef SHARED
+ENTRY (__memset_chk_erms)
+	cmpq	%rdx, %rcx
+	jb	HIDDEN_JUMPTARGET (__chk_fail)
+END (__memset_chk_erms)
+# endif
+
+ENTRY (__memset_erms)
+	mov	%rdx, %rcx
+	movzbl	%sil, %eax
+	mov	%rdi, %rdx
+	rep stosb
+	mov %rdx, %rax
+	ret
+END (__memset_erms)
+#endif
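
The _chk entry point is four instructions that fall straight through
into __memset_erms: in the x86-64 ABI the fourth argument (the
destination buffer's size) arrives in rcx and the fill length in rdx,
so `cmpq %rdx, %rcx; jb __chk_fail` aborts whenever the buffer is too
small. A hedged C equivalent (abort() stands in for glibc's
__chk_fail):

    #include <stdlib.h>
    #include <string.h>

    static void *
    memset_chk_sketch (void *dst, int c, size_t n, size_t destlen)
    {
      if (destlen < n)
        abort ();              /* glibc jumps to __chk_fail here */
      return memset (dst, c, n);
    }
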
diff --git a/sysdeps/x86_64/multiarch/memset.S b/sysdeps/x86_64/multiarch/memset.S
index 8e3b9b9..dda8185 100644
--- a/sysdeps/x86_64/multiarch/memset.S
+++ b/sysdeps/x86_64/multiarch/memset.S
@@ -26,6 +26,9 @@
 ENTRY(memset)
 	.type	memset, @gnu_indirect_function
 	LOAD_RTLD_GLOBAL_RO_RDX
+	leaq	__memset_erms(%rip), %rax
+	HAS_ARCH_FEATURE (Fast_ERMS)
+	jnz	2f
 	leaq	__memset_sse2(%rip), %rax
 	HAS_ARCH_FEATURE (AVX2_Usable)
 	jz	2f
diff --git a/sysdeps/x86_64/multiarch/memset_chk.S b/sysdeps/x86_64/multiarch/memset_chk.S
index 9a7b270..b8c9940 100644
--- a/sysdeps/x86_64/multiarch/memset_chk.S
+++ b/sysdeps/x86_64/multiarch/memset_chk.S
@@ -26,6 +26,9 @@
 ENTRY(__memset_chk)
 	.type	__memset_chk, @gnu_indirect_function
 	LOAD_RTLD_GLOBAL_RO_RDX
+	leaq	__memset_chk_erms(%rip), %rax
+	HAS_ARCH_FEATURE (Fast_ERMS)
+	jnz	2f
 	leaq	__memset_chk_sse2(%rip), %rax
 	HAS_ARCH_FEATURE (AVX2_Usable)
 	jz	2f

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=cd8fa6c74a5f122a242c412b5ea526a31f13855a

commit cd8fa6c74a5f122a242c412b5ea526a31f13855a
Author: H.J. Lu <hjl.tools@gmail.com>
Date:   Thu Sep 15 15:47:01 2011 -0700

    Initial ERMS support
    
    	* sysdeps/x86/cpu-features.h (bit_arch_Fast_ERMS): New.
    	(bit_cpu_ERMS): Likewise.
    	(index_cpu_ERMS): Likewise.
    	(index_arch_Fast_ERMS): Likewise.
    	(reg_ERMS): Likewise.

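The new definitions mirror the CPUID encoding: ERMS is reported in
leaf 7 (subleaf 0), EBX bit 9, which is exactly what bit_cpu_ERMS,
index_cpu_ERMS and reg_ERMS below spell out. A standalone check,
assuming a compiler that ships <cpuid.h> (GCC/Clang):

    #include <cpuid.h>
    #include <stdio.h>

    int
    main (void)
    {
      unsigned int eax, ebx, ecx, edx;
      __cpuid (0, eax, ebx, ecx, edx);            /* max basic leaf */
      if (eax >= 7)
        {
          __cpuid_count (7, 0, eax, ebx, ecx, edx);
          if (ebx & (1 << 9))                     /* bit_cpu_ERMS */
            {
              puts ("ERMS supported");
              return 0;
            }
        }
      puts ("ERMS not supported");
      return 0;
    }
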
diff --git a/sysdeps/x86/cpu-features.h b/sysdeps/x86/cpu-features.h
index e06eb7e..1c09712 100644
--- a/sysdeps/x86/cpu-features.h
+++ b/sysdeps/x86/cpu-features.h
@@ -35,6 +35,7 @@
 #define bit_arch_I686				(1 << 15)
 #define bit_arch_Prefer_MAP_32BIT_EXEC		(1 << 16)
 #define bit_arch_Prefer_No_VZEROUPPER		(1 << 17)
+#define bit_arch_Fast_ERMS			(1 << 18)
 
 /* CPUID Feature flags.  */
 
@@ -52,6 +53,7 @@
 #define bit_cpu_FMA4		(1 << 16)
 
 /* COMMON_CPUID_INDEX_7.  */
+#define bit_cpu_ERMS		(1 << 9)
 #define bit_cpu_RTM		(1 << 11)
 #define bit_cpu_AVX2		(1 << 5)
 #define bit_cpu_AVX512F		(1 << 16)
@@ -83,6 +85,7 @@
 # define index_cpu_SSE4_2 COMMON_CPUID_INDEX_1*CPUID_SIZE+CPUID_ECX_OFFSET
 # define index_cpu_AVX	COMMON_CPUID_INDEX_1*CPUID_SIZE+CPUID_ECX_OFFSET
 # define index_cpu_AVX2	COMMON_CPUID_INDEX_7*CPUID_SIZE+CPUID_EBX_OFFSET
+# define index_cpu_ERMS	COMMON_CPUID_INDEX_7*CPUID_SIZE+CPUID_EBX_OFFSET
 
 # define index_arch_Fast_Rep_String	FEATURE_INDEX_1*FEATURE_SIZE
 # define index_arch_Fast_Copy_Backward	FEATURE_INDEX_1*FEATURE_SIZE
@@ -101,6 +104,7 @@
 # define index_arch_I686		FEATURE_INDEX_1*FEATURE_SIZE
 # define index_arch_Prefer_MAP_32BIT_EXEC FEATURE_INDEX_1*FEATURE_SIZE
 # define index_arch_Prefer_No_VZEROUPPER FEATURE_INDEX_1*FEATURE_SIZE
+# define index_arch_Fast_ERMS		FEATURE_INDEX_1*FEATURE_SIZE
 
 
 # if defined (_LIBC) && !IS_IN (nonlib)
@@ -226,6 +230,7 @@ extern const struct cpu_features *__get_cpu_features (void)
 # define index_cpu_AVX2		COMMON_CPUID_INDEX_7
 # define index_cpu_AVX512F	COMMON_CPUID_INDEX_7
 # define index_cpu_AVX512DQ	COMMON_CPUID_INDEX_7
+# define index_cpu_ERMS		COMMON_CPUID_INDEX_7
 # define index_cpu_RTM		COMMON_CPUID_INDEX_7
 # define index_cpu_FMA		COMMON_CPUID_INDEX_1
 # define index_cpu_FMA4		COMMON_CPUID_INDEX_80000001
@@ -242,6 +247,7 @@ extern const struct cpu_features *__get_cpu_features (void)
 # define reg_AVX2		ebx
 # define reg_AVX512F		ebx
 # define reg_AVX512DQ		ebx
+# define reg_ERMS		ebx
 # define reg_RTM		ebx
 # define reg_FMA		ecx
 # define reg_FMA4		ecx
@@ -265,6 +271,7 @@ extern const struct cpu_features *__get_cpu_features (void)
 # define index_arch_I686		FEATURE_INDEX_1
 # define index_arch_Prefer_MAP_32BIT_EXEC FEATURE_INDEX_1
 # define index_arch_Prefer_No_VZEROUPPER FEATURE_INDEX_1
+# define index_arch_Fast_ERMS		FEATURE_INDEX_1
 
 #endif	/* !__ASSEMBLER__ */
 

-----------------------------------------------------------------------


hooks/post-receive
-- 
GNU C Library master sources

