This is the mail archive of the
glibc-cvs@sourceware.org
mailing list for the glibc project.
GNU C Library master sources branch hjl/erms/hybrid created. glibc-2.23-137-g6ce43b9
- From: hjl at sourceware dot org
- To: glibc-cvs at sourceware dot org
- Date: 22 Mar 2016 15:13:46 -0000
- Subject: GNU C Library master sources branch hjl/erms/hybrid created. glibc-2.23-137-g6ce43b9
This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "GNU C Library master sources".
The branch, hjl/erms/hybrid has been created
at 6ce43b9abbfa089e80ae8fa3508b7ecdfb56c265 (commit)
- Log -----------------------------------------------------------------
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=6ce43b9abbfa089e80ae8fa3508b7ecdfb56c265
commit 6ce43b9abbfa089e80ae8fa3508b7ecdfb56c265
Author: H.J. Lu <hjl.tools@gmail.com>
Date: Fri Mar 18 12:36:03 2016 -0700
Add Hybrid_ERMS and use it in memset.S
diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
index c8f81ef..b8f0b82 100644
--- a/sysdeps/x86/cpu-features.c
+++ b/sysdeps/x86/cpu-features.c
@@ -192,6 +192,12 @@ init_cpu_features (struct cpu_features *cpu_features)
}
}
+ /* Enable optimization of hybrid Enhanced REP MOVSB/STOSB with
+ SSE/AVX. */
+ if (CPU_FEATURES_CPU_P (cpu_features, ERMS))
+ cpu_features->feature[index_arch_Hybrid_ERMS]
+ |= bit_arch_Hybrid_ERMS;
+
/* Unaligned load with 256-bit AVX registers are faster on
Intel processors with AVX2. */
if (CPU_FEATURES_ARCH_P (cpu_features, AVX2_Usable))
diff --git a/sysdeps/x86/cpu-features.h b/sysdeps/x86/cpu-features.h
index 1c09712..f1df917 100644
--- a/sysdeps/x86/cpu-features.h
+++ b/sysdeps/x86/cpu-features.h
@@ -36,6 +36,7 @@
#define bit_arch_Prefer_MAP_32BIT_EXEC (1 << 16)
#define bit_arch_Prefer_No_VZEROUPPER (1 << 17)
#define bit_arch_Fast_ERMS (1 << 18)
+#define bit_arch_Hybrid_ERMS (1 << 19)
/* CPUID Feature flags. */
@@ -105,6 +106,7 @@
# define index_arch_Prefer_MAP_32BIT_EXEC FEATURE_INDEX_1*FEATURE_SIZE
# define index_arch_Prefer_No_VZEROUPPER FEATURE_INDEX_1*FEATURE_SIZE
# define index_arch_Fast_ERMS FEATURE_INDEX_1*FEATURE_SIZE
+# define index_arch_Hybrid_ERMS FEATURE_INDEX_1*FEATURE_SIZE
# if defined (_LIBC) && !IS_IN (nonlib)
@@ -272,6 +274,7 @@ extern const struct cpu_features *__get_cpu_features (void)
# define index_arch_Prefer_MAP_32BIT_EXEC FEATURE_INDEX_1
# define index_arch_Prefer_No_VZEROUPPER FEATURE_INDEX_1
# define index_arch_Fast_ERMS FEATURE_INDEX_1
+# define index_arch_Hybrid_ERMS FEATURE_INDEX_1
#endif /* !__ASSEMBLER__ */
diff --git a/sysdeps/x86_64/multiarch/memset.S b/sysdeps/x86_64/multiarch/memset.S
index dda8185..7364747 100644
--- a/sysdeps/x86_64/multiarch/memset.S
+++ b/sysdeps/x86_64/multiarch/memset.S
@@ -29,10 +29,18 @@ ENTRY(memset)
leaq __memset_erms(%rip), %rax
HAS_ARCH_FEATURE (Fast_ERMS)
jnz 2f
+ leaq __memset_sse2_erms(%rip), %rax
+ HAS_ARCH_FEATURE (Hybrid_ERMS)
+ jnz 1f
leaq __memset_sse2(%rip), %rax
+1:
HAS_ARCH_FEATURE (AVX2_Usable)
jz 2f
+ leaq __memset_avx2_erms(%rip), %rax
+ HAS_ARCH_FEATURE (Hybrid_ERMS)
+ jnz 3f
leaq __memset_avx2(%rip), %rax
+3:
#ifdef HAVE_AVX512_ASM_SUPPORT
HAS_ARCH_FEATURE (AVX512F_Usable)
jz 2f
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=9ad76210d60ab512bd329824ccf24add63c6a4b7
commit 9ad76210d60ab512bd329824ccf24add63c6a4b7
Author: H.J. Lu <hjl.tools@gmail.com>
Date: Fri Mar 18 10:34:07 2016 -0700
Add __memset_avx2_erms and __memset_chk_avx2_erms
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index 002800e..cd8a2c1 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -95,6 +95,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, __memset_chk,
HAS_ARCH_FEATURE (AVX2_Usable),
__memset_chk_avx2)
+ IFUNC_IMPL_ADD (array, i, __memset_chk,
+ HAS_ARCH_FEATURE (AVX2_Usable),
+ __memset_chk_avx2_erms)
#ifdef HAVE_AVX512_ASM_SUPPORT
IFUNC_IMPL_ADD (array, i, __memset_chk,
HAS_ARCH_FEATURE (AVX512F_Usable),
@@ -110,6 +113,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, memset,
HAS_ARCH_FEATURE (AVX2_Usable),
__memset_avx2)
+ IFUNC_IMPL_ADD (array, i, memset,
+ HAS_ARCH_FEATURE (AVX2_Usable),
+ __memset_avx2_erms)
#ifdef HAVE_AVX512_ASM_SUPPORT
IFUNC_IMPL_ADD (array, i, memset,
HAS_ARCH_FEATURE (AVX512F_Usable),
diff --git a/sysdeps/x86_64/multiarch/memset-avx2.S b/sysdeps/x86_64/multiarch/memset-avx2.S
index df63472..9a565a9 100644
--- a/sysdeps/x86_64/multiarch/memset-avx2.S
+++ b/sysdeps/x86_64/multiarch/memset-avx2.S
@@ -28,6 +28,24 @@
.section .text.avx2,"ax",@progbits
#if defined PIC
+ENTRY_CHK (__memset_chk_avx2_erms)
+ cmpq %rdx, %rcx
+ jb HIDDEN_JUMPTARGET (__chk_fail)
+END_CHK (__memset_chk_avx2_erms)
+# endif
+
+ENTRY (__memset_avx2_erms)
+ cmpq $REP_STOSB_THRESHOLD, %rdx
+ jbe L(start)
+ movq %rdx, %rcx
+ movzbl %sil, %eax
+ movq %rdi, %rdx
+ rep stosb
+ movq %rdx, %rax
+ ret
+END (__memset_avx2_erms)
+
+#if defined PIC
ENTRY (MEMSET_CHK)
cmpq %rdx, %rcx
jb HIDDEN_JUMPTARGET (__chk_fail)
@@ -35,6 +53,7 @@ END (MEMSET_CHK)
#endif
ENTRY (MEMSET)
+L(start):
vpxor %xmm0, %xmm0, %xmm0
vmovd %esi, %xmm1
lea (%rdi, %rdx), %rsi
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=eeedc55e73367131c49cc84749006fd7cf0d2f41
commit eeedc55e73367131c49cc84749006fd7cf0d2f41
Author: H.J. Lu <hjl.tools@gmail.com>
Date: Fri Mar 18 10:27:58 2016 -0700
Add avx_unaligned_erms versions of memcpy/mempcpy
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index 543c637..002800e 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -281,6 +281,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
HAS_ARCH_FEATURE (AVX_Usable),
__memcpy_chk_avx_unaligned)
IFUNC_IMPL_ADD (array, i, __memcpy_chk,
+ HAS_ARCH_FEATURE (AVX_Usable),
+ __memcpy_chk_avx_unaligned_erms)
+ IFUNC_IMPL_ADD (array, i, __memcpy_chk,
HAS_CPU_FEATURE (SSSE3),
__memcpy_chk_ssse3_back)
IFUNC_IMPL_ADD (array, i, __memcpy_chk,
@@ -300,6 +303,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, memcpy,
HAS_ARCH_FEATURE (AVX_Usable),
__memcpy_avx_unaligned)
+ IFUNC_IMPL_ADD (array, i, memcpy,
+ HAS_ARCH_FEATURE (AVX_Usable),
+ __memcpy_avx_unaligned_erms)
IFUNC_IMPL_ADD (array, i, memcpy, HAS_CPU_FEATURE (SSSE3),
__memcpy_ssse3_back)
IFUNC_IMPL_ADD (array, i, memcpy, HAS_CPU_FEATURE (SSSE3),
@@ -326,6 +332,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
HAS_ARCH_FEATURE (AVX_Usable),
__mempcpy_chk_avx_unaligned)
IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
+ HAS_ARCH_FEATURE (AVX_Usable),
+ __mempcpy_chk_avx_unaligned_erms)
+ IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
HAS_CPU_FEATURE (SSSE3),
__mempcpy_chk_ssse3_back)
IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
@@ -350,6 +359,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, mempcpy,
HAS_ARCH_FEATURE (AVX_Usable),
__mempcpy_avx_unaligned)
+ IFUNC_IMPL_ADD (array, i, mempcpy,
+ HAS_ARCH_FEATURE (AVX_Usable),
+ __mempcpy_avx_unaligned_erms)
IFUNC_IMPL_ADD (array, i, mempcpy, HAS_CPU_FEATURE (SSSE3),
__mempcpy_ssse3_back)
IFUNC_IMPL_ADD (array, i, mempcpy, HAS_CPU_FEATURE (SSSE3),
diff --git a/sysdeps/x86_64/multiarch/memcpy-avx-unaligned.S b/sysdeps/x86_64/multiarch/memcpy-avx-unaligned.S
index c2df9da..577b80c 100644
--- a/sysdeps/x86_64/multiarch/memcpy-avx-unaligned.S
+++ b/sysdeps/x86_64/multiarch/memcpy-avx-unaligned.S
@@ -33,6 +33,32 @@
.section .text.avx,"ax",@progbits
#if !defined USE_AS_MEMPCPY && !defined USE_AS_MEMMOVE
+ENTRY (__mempcpy_chk_avx_unaligned_erms)
+ cmpq %rdx, %rcx
+ jb HIDDEN_JUMPTARGET (__chk_fail)
+END (__mempcpy_chk_avx_unaligned_erms)
+
+ENTRY (__mempcpy_avx_unaligned_erms)
+ mov %rdi, %rax
+ add %rdx, %rax
+ jmp L(start_erms)
+END (__mempcpy_avx_unaligned_erms)
+
+ENTRY (__memcpy_chk_avx_unaligned_erms)
+ cmpq %rdx, %rcx
+ jb HIDDEN_JUMPTARGET (__chk_fail)
+END (__memcpy_chk_avx_unaligned_erms)
+
+ENTRY(__memcpy_avx_unaligned_erms)
+ movq %rdi, %rax
+L(start_erms):
+ cmpq $REP_MOVSB_THRESHOLD, %rdx
+ jbe L(start)
+ mov %rdx, %rcx
+ rep movsb
+ ret
+END (__memcpy_avx_unaligned_erms)
+
ENTRY (MEMPCPY_CHK)
cmpq %rdx, %rcx
jb HIDDEN_JUMPTARGET (__chk_fail)
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=21a60509c4d164f7c2ac21add165e9868da0aab0
commit 21a60509c4d164f7c2ac21add165e9868da0aab0
Author: H.J. Lu <hjl.tools@gmail.com>
Date: Fri Mar 18 10:07:48 2016 -0700
Remove mempcpy-*.S
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index 2f29a2a..fdb8448 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -8,10 +8,10 @@ ifeq ($(subdir),string)
sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 \
strcmp-sse2-unaligned strncmp-ssse3 \
memcmp-sse4 memcpy-ssse3 memcpy-sse2-unaligned \
- memcpy-avx512-no-vzeroupper mempcpy-ssse3 memmove-ssse3 \
- memcpy-ssse3-back mempcpy-ssse3-back memmove-avx-unaligned \
- memcpy-avx-unaligned mempcpy-avx-unaligned \
- mempcpy-avx512-no-vzeroupper memmove-ssse3-back \
+ memcpy-avx512-no-vzeroupper memmove-ssse3 \
+ memcpy-ssse3-back memmove-avx-unaligned \
+ memcpy-avx-unaligned \
+ memmove-ssse3-back \
memmove-avx512-no-vzeroupper strcasecmp_l-ssse3 \
strncase_l-ssse3 strcat-ssse3 strncat-ssse3\
strcpy-ssse3 strncpy-ssse3 stpcpy-ssse3 stpncpy-ssse3 \
diff --git a/sysdeps/x86_64/multiarch/memcpy-avx-unaligned.S b/sysdeps/x86_64/multiarch/memcpy-avx-unaligned.S
index 4f4bf45..c2df9da 100644
--- a/sysdeps/x86_64/multiarch/memcpy-avx-unaligned.S
+++ b/sysdeps/x86_64/multiarch/memcpy-avx-unaligned.S
@@ -27,8 +27,8 @@
#ifndef MEMCPY
# define MEMCPY __memcpy_avx_unaligned
# define MEMCPY_CHK __memcpy_chk_avx_unaligned
-# define MEMPCPY __mempcpy_avx_unaligned_1
-# define MEMPCPY_CHK __mempcpy_chk_avx_unaligned_1
+# define MEMPCPY __mempcpy_avx_unaligned
+# define MEMPCPY_CHK __mempcpy_chk_avx_unaligned
#endif
.section .text.avx,"ax",@progbits
@@ -39,12 +39,8 @@ ENTRY (MEMPCPY_CHK)
END (MEMPCPY_CHK)
ENTRY (MEMPCPY)
-#if 0
- lea (%rdi, %rdx), %rax
-#else
mov %rdi, %rax
add %rdx, %rax
-#endif
jmp L(start)
END (MEMPCPY)
#endif
diff --git a/sysdeps/x86_64/multiarch/memcpy-avx512-no-vzeroupper.S b/sysdeps/x86_64/multiarch/memcpy-avx512-no-vzeroupper.S
index 3a57b73..7babb47 100644
--- a/sysdeps/x86_64/multiarch/memcpy-avx512-no-vzeroupper.S
+++ b/sysdeps/x86_64/multiarch/memcpy-avx512-no-vzeroupper.S
@@ -27,8 +27,8 @@
#ifndef MEMCPY
# define MEMCPY __memcpy_avx512_no_vzeroupper
# define MEMCPY_CHK __memcpy_chk_avx512_no_vzeroupper
-# define MEMPCPY __mempcpy_avx512_no_vzeroupper_1
-# define MEMPCPY_CHK __mempcpy_chk_avx512_no_vzeroupper_1
+# define MEMPCPY __mempcpy_avx512_no_vzeroupper
+# define MEMPCPY_CHK __mempcpy_chk_avx512_no_vzeroupper
#endif
.section .text.avx512,"ax",@progbits
@@ -39,12 +39,8 @@ ENTRY (MEMPCPY_CHK)
END (MEMPCPY_CHK)
ENTRY (MEMPCPY)
-#if 0
- lea (%rdi, %rdx), %rax
-#else
mov %rdi, %rax
add %rdx, %rax
-#endif
jmp L(start)
END (MEMPCPY)
#endif
diff --git a/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S b/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S
index 6184e4e..9a872d3 100644
--- a/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S
+++ b/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S
@@ -29,8 +29,8 @@
#ifndef MEMCPY
# define MEMCPY __memcpy_ssse3_back
# define MEMCPY_CHK __memcpy_chk_ssse3_back
-# define MEMPCPY __mempcpy_ssse3_back_1
-# define MEMPCPY_CHK __mempcpy_chk_ssse3_back_1
+# define MEMPCPY __mempcpy_ssse3_back
+# define MEMPCPY_CHK __mempcpy_chk_ssse3_back
#endif
#define JMPTBL(I, B) I - B
diff --git a/sysdeps/x86_64/multiarch/memcpy-ssse3.S b/sysdeps/x86_64/multiarch/memcpy-ssse3.S
index 127afaa..643f322 100644
--- a/sysdeps/x86_64/multiarch/memcpy-ssse3.S
+++ b/sysdeps/x86_64/multiarch/memcpy-ssse3.S
@@ -29,8 +29,8 @@
#ifndef MEMCPY
# define MEMCPY __memcpy_ssse3
# define MEMCPY_CHK __memcpy_chk_ssse3
-# define MEMPCPY __mempcpy_ssse3_1
-# define MEMPCPY_CHK __mempcpy_chk_ssse3_1
+# define MEMPCPY __mempcpy_ssse3
+# define MEMPCPY_CHK __mempcpy_chk_ssse3
#endif
#define JMPTBL(I, B) I - B
diff --git a/sysdeps/x86_64/multiarch/mempcpy-avx-unaligned.S b/sysdeps/x86_64/multiarch/mempcpy-avx-unaligned.S
deleted file mode 100644
index 241378e..0000000
--- a/sysdeps/x86_64/multiarch/mempcpy-avx-unaligned.S
+++ /dev/null
@@ -1,22 +0,0 @@
-/* mempcpy with AVX
- Copyright (C) 2014-2016 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-#define USE_AS_MEMPCPY
-#define MEMCPY __mempcpy_avx_unaligned
-#define MEMCPY_CHK __mempcpy_chk_avx_unaligned
-#include "memcpy-avx-unaligned.S"
diff --git a/sysdeps/x86_64/multiarch/mempcpy-avx512-no-vzeroupper.S b/sysdeps/x86_64/multiarch/mempcpy-avx512-no-vzeroupper.S
deleted file mode 100644
index fcc0945..0000000
--- a/sysdeps/x86_64/multiarch/mempcpy-avx512-no-vzeroupper.S
+++ /dev/null
@@ -1,22 +0,0 @@
-/* mempcpy optimized with AVX512 for KNL hardware.
- Copyright (C) 2016 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-#define USE_AS_MEMPCPY
-#define MEMCPY __mempcpy_avx512_no_vzeroupper
-#define MEMCPY_CHK __mempcpy_chk_avx512_no_vzeroupper
-#include "memcpy-avx512-no-vzeroupper.S"
diff --git a/sysdeps/x86_64/multiarch/mempcpy-ssse3-back.S b/sysdeps/x86_64/multiarch/mempcpy-ssse3-back.S
deleted file mode 100644
index 82ffacb..0000000
--- a/sysdeps/x86_64/multiarch/mempcpy-ssse3-back.S
+++ /dev/null
@@ -1,4 +0,0 @@
-#define USE_AS_MEMPCPY
-#define MEMCPY __mempcpy_ssse3_back
-#define MEMCPY_CHK __mempcpy_chk_ssse3_back
-#include "memcpy-ssse3-back.S"
diff --git a/sysdeps/x86_64/multiarch/mempcpy-ssse3.S b/sysdeps/x86_64/multiarch/mempcpy-ssse3.S
deleted file mode 100644
index 822d98e..0000000
--- a/sysdeps/x86_64/multiarch/mempcpy-ssse3.S
+++ /dev/null
@@ -1,4 +0,0 @@
-#define USE_AS_MEMPCPY
-#define MEMCPY __mempcpy_ssse3
-#define MEMCPY_CHK __mempcpy_chk_ssse3
-#include "memcpy-ssse3.S"
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=dce73065103d970a518ea7a869e218864c946198
commit dce73065103d970a518ea7a869e218864c946198
Author: H.J. Lu <hjl.tools@gmail.com>
Date: Sun Mar 6 13:37:31 2016 -0800
Merge memcpy with mempcpy
diff --git a/sysdeps/x86_64/multiarch/memcpy-avx-unaligned.S b/sysdeps/x86_64/multiarch/memcpy-avx-unaligned.S
index 74fed18..4f4bf45 100644
--- a/sysdeps/x86_64/multiarch/memcpy-avx-unaligned.S
+++ b/sysdeps/x86_64/multiarch/memcpy-avx-unaligned.S
@@ -25,11 +25,30 @@
#include "asm-syntax.h"
#ifndef MEMCPY
-# define MEMCPY __memcpy_avx_unaligned
+# define MEMCPY __memcpy_avx_unaligned
# define MEMCPY_CHK __memcpy_chk_avx_unaligned
+# define MEMPCPY __mempcpy_avx_unaligned_1
+# define MEMPCPY_CHK __mempcpy_chk_avx_unaligned_1
#endif
.section .text.avx,"ax",@progbits
+#if !defined USE_AS_MEMPCPY && !defined USE_AS_MEMMOVE
+ENTRY (MEMPCPY_CHK)
+ cmpq %rdx, %rcx
+ jb HIDDEN_JUMPTARGET (__chk_fail)
+END (MEMPCPY_CHK)
+
+ENTRY (MEMPCPY)
+#if 0
+ lea (%rdi, %rdx), %rax
+#else
+ mov %rdi, %rax
+ add %rdx, %rax
+#endif
+ jmp L(start)
+END (MEMPCPY)
+#endif
+
#if !defined USE_AS_BCOPY
ENTRY (MEMCPY_CHK)
cmpq %rdx, %rcx
@@ -42,6 +61,7 @@ ENTRY (MEMCPY)
#ifdef USE_AS_MEMPCPY
add %rdx, %rax
#endif
+L(start):
cmp $256, %rdx
jae L(256bytesormore)
cmp $16, %dl
diff --git a/sysdeps/x86_64/multiarch/memcpy-avx512-no-vzeroupper.S b/sysdeps/x86_64/multiarch/memcpy-avx512-no-vzeroupper.S
index 3d567fc..3a57b73 100644
--- a/sysdeps/x86_64/multiarch/memcpy-avx512-no-vzeroupper.S
+++ b/sysdeps/x86_64/multiarch/memcpy-avx512-no-vzeroupper.S
@@ -27,9 +27,28 @@
#ifndef MEMCPY
# define MEMCPY __memcpy_avx512_no_vzeroupper
# define MEMCPY_CHK __memcpy_chk_avx512_no_vzeroupper
+# define MEMPCPY __mempcpy_avx512_no_vzeroupper_1
+# define MEMPCPY_CHK __mempcpy_chk_avx512_no_vzeroupper_1
#endif
.section .text.avx512,"ax",@progbits
+#if !defined USE_AS_MEMPCPY && !defined USE_AS_MEMMOVE
+ENTRY (MEMPCPY_CHK)
+ cmpq %rdx, %rcx
+ jb HIDDEN_JUMPTARGET (__chk_fail)
+END (MEMPCPY_CHK)
+
+ENTRY (MEMPCPY)
+#if 0
+ lea (%rdi, %rdx), %rax
+#else
+ mov %rdi, %rax
+ add %rdx, %rax
+#endif
+ jmp L(start)
+END (MEMPCPY)
+#endif
+
#if !defined USE_AS_BCOPY
ENTRY (MEMCPY_CHK)
cmpq %rdx, %rcx
@@ -42,6 +61,7 @@ ENTRY (MEMCPY)
#ifdef USE_AS_MEMPCPY
add %rdx, %rax
#endif
+L(start):
lea (%rsi, %rdx), %rcx
lea (%rdi, %rdx), %r9
cmp $512, %rdx
diff --git a/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S b/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S
index 08b41e9..6184e4e 100644
--- a/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S
+++ b/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S
@@ -29,6 +29,8 @@
#ifndef MEMCPY
# define MEMCPY __memcpy_ssse3_back
# define MEMCPY_CHK __memcpy_chk_ssse3_back
+# define MEMPCPY __mempcpy_ssse3_back_1
+# define MEMPCPY_CHK __mempcpy_chk_ssse3_back_1
#endif
#define JMPTBL(I, B) I - B
@@ -44,6 +46,23 @@
ud2
.section .text.ssse3,"ax",@progbits
+#if !defined USE_AS_MEMPCPY && !defined USE_AS_MEMMOVE
+ENTRY (MEMPCPY_CHK)
+ cmpq %rdx, %rcx
+ jb HIDDEN_JUMPTARGET (__chk_fail)
+END (MEMPCPY_CHK)
+
+ENTRY (MEMPCPY)
+#if 0
+ lea (%rdi, %rdx), %rax
+#else
+ mov %rdi, %rax
+ add %rdx, %rax
+#endif
+ jmp L(start)
+END (MEMPCPY)
+#endif
+
#if !defined USE_AS_BCOPY
ENTRY (MEMCPY_CHK)
cmpq %rdx, %rcx
@@ -66,6 +85,7 @@ ENTRY (MEMCPY)
BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
L(copy_forward):
#endif
+L(start):
cmp $144, %rdx
jae L(144bytesormore)
diff --git a/sysdeps/x86_64/multiarch/memcpy-ssse3.S b/sysdeps/x86_64/multiarch/memcpy-ssse3.S
index 95de969..127afaa 100644
--- a/sysdeps/x86_64/multiarch/memcpy-ssse3.S
+++ b/sysdeps/x86_64/multiarch/memcpy-ssse3.S
@@ -29,6 +29,8 @@
#ifndef MEMCPY
# define MEMCPY __memcpy_ssse3
# define MEMCPY_CHK __memcpy_chk_ssse3
+# define MEMPCPY __mempcpy_ssse3_1
+# define MEMPCPY_CHK __mempcpy_chk_ssse3_1
#endif
#define JMPTBL(I, B) I - B
@@ -44,6 +46,23 @@
ud2
.section .text.ssse3,"ax",@progbits
+#if !defined USE_AS_MEMPCPY && !defined USE_AS_MEMMOVE
+ENTRY (MEMPCPY_CHK)
+ cmpq %rdx, %rcx
+ jb HIDDEN_JUMPTARGET (__chk_fail)
+END (MEMPCPY_CHK)
+
+ENTRY (MEMPCPY)
+#if 0
+ lea (%rdi, %rdx), %rax
+#else
+ mov %rdi, %rax
+ add %rdx, %rax
+#endif
+ jmp L(start)
+END (MEMPCPY)
+#endif
+
#if !defined USE_AS_BCOPY
ENTRY (MEMCPY_CHK)
cmpq %rdx, %rcx
@@ -66,6 +85,7 @@ ENTRY (MEMCPY)
jmp L(copy_backward)
L(copy_forward):
#endif
+L(start):
cmp $79, %rdx
lea L(table_less_80bytes)(%rip), %r11
ja L(80bytesormore)
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=ad023b590b32dcde6d1b641d41758e531290b6fd
commit ad023b590b32dcde6d1b641d41758e531290b6fd
Author: H.J. Lu <hjl.tools@gmail.com>
Date: Fri Mar 18 09:22:56 2016 -0700
Add __memset_sse2_erms and __memset_chk_sse2_erms
* sysdeps/x86_64/memset.S (__memset_chk_sse2_erms): New
function.
(__memset_sse2_erms): Likewise.
* sysdeps/x86_64/multiarch/ifunc-impl-list.c
(__libc_ifunc_impl_list): Test __memset_chk_sse2_erms and
__memset_sse2_erms.
* sysdeps/x86_64/sysdep.h (REP_STOSB_THRESHOLD): New.
diff --git a/sysdeps/x86_64/memset.S b/sysdeps/x86_64/memset.S
index 4cf0da0..71796a7 100644
--- a/sysdeps/x86_64/memset.S
+++ b/sysdeps/x86_64/memset.S
@@ -42,6 +42,26 @@ ENTRY(__memset_tail)
END(__memset_tail)
#endif
+#ifdef USE_MULTIARCH
+# if defined PIC && IS_IN (libc)
+ENTRY_CHK (__memset_chk_sse2_erms)
+ cmpq %rdx, %rcx
+ jb HIDDEN_JUMPTARGET (__chk_fail)
+END_CHK (__memset_chk_sse2_erms)
+# endif
+
+ENTRY (__memset_sse2_erms)
+ cmpq $REP_STOSB_THRESHOLD, %rdx
+ jbe L(memset_entry)
+ movq %rdx, %rcx
+ movzbl %sil, %eax
+ movq %rdi, %rdx
+ rep stosb
+ movq %rdx, %rax
+ ret
+END (__memset_sse2_erms)
+#endif
+
#if defined PIC && IS_IN (libc)
ENTRY_CHK (__memset_chk)
cmpq %rdx, %rcx
@@ -50,6 +70,7 @@ END_CHK (__memset_chk)
#endif
ENTRY (memset)
+L(memset_entry):
movd %esi, %xmm0
movq %rdi, %rax
punpcklbw %xmm0, %xmm0
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index ba3202e..543c637 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -89,6 +89,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, __memset_chk, 1,
__memset_chk_sse2)
IFUNC_IMPL_ADD (array, i, __memset_chk, 1,
+ __memset_chk_sse2_erms)
+ IFUNC_IMPL_ADD (array, i, __memset_chk, 1,
__memset_chk_erms)
IFUNC_IMPL_ADD (array, i, __memset_chk,
HAS_ARCH_FEATURE (AVX2_Usable),
@@ -103,6 +105,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/x86_64/multiarch/memset.S. */
IFUNC_IMPL (i, name, memset,
IFUNC_IMPL_ADD (array, i, memset, 1, __memset_sse2)
+ IFUNC_IMPL_ADD (array, i, memset, 1, __memset_sse2_erms)
IFUNC_IMPL_ADD (array, i, memset, 1, __memset_erms)
IFUNC_IMPL_ADD (array, i, memset,
HAS_ARCH_FEATURE (AVX2_Usable),
diff --git a/sysdeps/x86_64/sysdep.h b/sysdeps/x86_64/sysdep.h
index 2444d63..38d3a32 100644
--- a/sysdeps/x86_64/sysdep.h
+++ b/sysdeps/x86_64/sysdep.h
@@ -133,6 +133,9 @@ lose: \
/* Threshold to use Enhanced REP MOVSB. */
#define REP_MOVSB_THRESHOLD 2048
+/* Threshold to use Enhanced REP STOSB. */
+#define REP_STOSB_THRESHOLD 1024
+
#else /* __ASSEMBLER__ */
/* Long and pointer size in bytes. */
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=f0bc5d68ff40abe2fe3ea1f2515811f9636f8ac9
commit f0bc5d68ff40abe2fe3ea1f2515811f9636f8ac9
Author: H.J. Lu <hjl.tools@gmail.com>
Date: Fri Mar 18 08:32:05 2016 -0700
Add sse2_unaligned_erms versions of memcpy/mempcpy
* sysdeps/x86_64/multiarch/ifunc-impl-list.c
(__libc_ifunc_impl_list): Test __memcpy_chk_sse2_unaligned_erms,
__memcpy_sse2_unaligned_erms, __mempcpy_chk_sse2_unaligned_erms
and __mempcpy_sse2_unaligned_erms.
* sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S
(__mempcpy_chk_sse2_unaligned_erms): New function.
(__mempcpy_sse2_unaligned_erms): Likewise.
(__memcpy_chk_sse2_unaligned_erms): Likewise.
(__memcpy_sse2_unaligned_erms): Likewise.
* sysdeps/x86_64/sysdep.h (REP_MOVSB_THRESHOLD): New.
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index 26b4137..ba3202e 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -286,6 +286,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, __memcpy_chk, 1,
__memcpy_chk_sse2_unaligned)
IFUNC_IMPL_ADD (array, i, __memcpy_chk, 1,
+ __memcpy_chk_sse2_unaligned_erms)
+ IFUNC_IMPL_ADD (array, i, __memcpy_chk, 1,
__memcpy_chk_erms)
IFUNC_IMPL_ADD (array, i, __memcpy_chk, 1,
__memcpy_chk_sse2))
@@ -305,6 +307,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
__memcpy_avx512_no_vzeroupper)
#endif
IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_sse2_unaligned)
+ IFUNC_IMPL_ADD (array, i, memcpy, 1,
+ __memcpy_sse2_unaligned_erms)
IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_erms)
IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_sse2))
@@ -327,6 +331,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, __mempcpy_chk, 1,
__mempcpy_chk_sse2_unaligned)
IFUNC_IMPL_ADD (array, i, __mempcpy_chk, 1,
+ __mempcpy_chk_sse2_unaligned_erms)
+ IFUNC_IMPL_ADD (array, i, __mempcpy_chk, 1,
__mempcpy_chk_erms)
IFUNC_IMPL_ADD (array, i, __mempcpy_chk, 1,
__mempcpy_chk_sse2))
@@ -347,6 +353,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
__mempcpy_ssse3)
IFUNC_IMPL_ADD (array, i, mempcpy, 1,
__mempcpy_sse2_unaligned)
+ IFUNC_IMPL_ADD (array, i, mempcpy, 1,
+ __mempcpy_sse2_unaligned_erms)
IFUNC_IMPL_ADD (array, i, mempcpy, 1, __mempcpy_erms)
IFUNC_IMPL_ADD (array, i, mempcpy, 1, __mempcpy_sse2))
diff --git a/sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S b/sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S
index 947c50f..53e9464 100644
--- a/sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S
+++ b/sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S
@@ -23,6 +23,51 @@
#include "asm-syntax.h"
# ifdef SHARED
+ENTRY (__mempcpy_chk_sse2_unaligned_erms)
+ cmpq %rdx, %rcx
+ jb HIDDEN_JUMPTARGET (__chk_fail)
+END (__mempcpy_chk_sse2_unaligned_erms)
+# endif
+
+ENTRY (__mempcpy_sse2_unaligned_erms)
+ mov %rdi, %rax
+ add %rdx, %rax
+ jmp L(start_erms)
+END (__mempcpy_sse2_unaligned_erms)
+
+# ifdef SHARED
+ENTRY (__memcpy_chk_sse2_unaligned_erms)
+ cmpq %rdx, %rcx
+ jb HIDDEN_JUMPTARGET (__chk_fail)
+END (__memcpy_chk_sse2_unaligned_erms)
+# endif
+
+ENTRY(__memcpy_sse2_unaligned_erms)
+ movq %rdi, %rax
+L(start_erms):
+ testq %rdx, %rdx
+ je L(return)
+ cmpq $16, %rdx
+ jbe L(less_16)
+ cmpq $REP_MOVSB_THRESHOLD, %rdx
+ ja .Lerms
+ movdqu (%rsi), %xmm8
+ cmpq $32, %rdx
+ movdqu %xmm8, (%rdi)
+ movdqu -16(%rsi,%rdx), %xmm8
+ movdqu %xmm8, -16(%rdi,%rdx)
+ ja .L31
+ ret
+
+ .p2align 4,,10
+ .p2align 4
+.Lerms:
+ mov %rdx, %rcx
+ rep movsb
+ ret
+END (__memcpy_sse2_unaligned_erms)
+
+# ifdef SHARED
ENTRY (__mempcpy_chk_sse2_unaligned)
cmpq %rdx, %rcx
jb HIDDEN_JUMPTARGET (__chk_fail)
diff --git a/sysdeps/x86_64/sysdep.h b/sysdeps/x86_64/sysdep.h
index fbe3560..2444d63 100644
--- a/sysdeps/x86_64/sysdep.h
+++ b/sysdeps/x86_64/sysdep.h
@@ -130,6 +130,9 @@ lose: \
#define R14_LP r14
#define R15_LP r15
+/* Threshold to use Enhanced REP MOVSB. */
+#define REP_MOVSB_THRESHOLD 2048
+
#else /* __ASSEMBLER__ */
/* Long and pointer size in bytes. */
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=438ed4b59ada763de874ba3be10bb18ea52d5643
commit 438ed4b59ada763de874ba3be10bb18ea52d5643
Author: H.J. Lu <hjl.tools@gmail.com>
Date: Mon Mar 7 05:47:26 2016 -0800
Enable __memcpy_chk_sse2_unaligned
Check Fast_Unaligned_Load for __memcpy_chk_sse2_unaligned. The new
selection order is:
1. __memcpy_chk_avx_unaligned if AVX_Fast_Unaligned_Load bit is set.
2. __memcpy_chk_sse2_unaligned if Fast_Unaligned_Load bit is set.
3. __memcpy_chk_sse2 if SSSE3 isn't available.
4. __memcpy_chk_ssse3_back if Fast_Copy_Backward bit is set.
5. __memcpy_chk_ssse3
[BZ #19776]
* sysdeps/x86_64/multiarch/mempcpy_chk.S (__mempcpy_chk): Check
Fast_Unaligned_Load to enable __mempcpy_chk_sse2_unaligned.
diff --git a/sysdeps/x86_64/multiarch/memcpy_chk.S b/sysdeps/x86_64/multiarch/memcpy_chk.S
index 0e21c09..9e218c0 100644
--- a/sysdeps/x86_64/multiarch/memcpy_chk.S
+++ b/sysdeps/x86_64/multiarch/memcpy_chk.S
@@ -35,22 +35,25 @@ ENTRY(__memcpy_chk)
jnz 2f
#ifdef HAVE_AVX512_ASM_SUPPORT
HAS_ARCH_FEATURE (AVX512F_Usable)
- jz 1f
+ jz 1f
HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER)
- jz 1f
- leaq __memcpy_chk_avx512_no_vzeroupper(%rip), %rax
+ jz 1f
+ lea __memcpy_chk_avx512_no_vzeroupper(%rip), %RAX_LP
ret
#endif
-1: leaq __memcpy_chk_sse2(%rip), %rax
+1: lea __memcpy_chk_avx_unaligned(%rip), %RAX_LP
+ HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load)
+ jnz 2f
+ lea __memcpy_chk_sse2_unaligned(%rip), %RAX_LP
+ HAS_ARCH_FEATURE (Fast_Unaligned_Load)
+ jnz 2f
+ lea __memcpy_chk_sse2(%rip), %RAX_LP
HAS_CPU_FEATURE (SSSE3)
jz 2f
- leaq __memcpy_chk_ssse3(%rip), %rax
+ lea __memcpy_chk_ssse3_back(%rip), %RAX_LP
HAS_ARCH_FEATURE (Fast_Copy_Backward)
- jz 2f
- leaq __memcpy_chk_ssse3_back(%rip), %rax
- HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load)
- jz 2f
- leaq __memcpy_chk_avx_unaligned(%rip), %rax
+ jnz 2f
+ lea __memcpy_chk_ssse3(%rip), %RAX_LP
2: ret
END(__memcpy_chk)
# else
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=fe0ea5b0874b4ae4255c6ad3b2e0f1aaf94a97ff
commit fe0ea5b0874b4ae4255c6ad3b2e0f1aaf94a97ff
Author: H.J. Lu <hjl.tools@gmail.com>
Date: Mon Mar 7 05:44:58 2016 -0800
Enable __mempcpy_chk_sse2_unaligned
Check Fast_Unaligned_Load for __mempcpy_chk_sse2_unaligned. The new
selection order is:
1. __mempcpy_chk_avx_unaligned if AVX_Fast_Unaligned_Load bit is set.
2. __mempcpy_chk_sse2_unaligned if Fast_Unaligned_Load bit is set.
3. __mempcpy_chk_sse2 if SSSE3 isn't available.
4. __mempcpy_chk_ssse3_back if Fast_Copy_Backward bit is set.
5. __mempcpy_chk_ssse3
[BZ #19776]
* sysdeps/x86_64/multiarch/mempcpy_chk.S (__mempcpy_chk): Check
Fast_Unaligned_Load to enable __mempcpy_chk_sse2_unaligned.
diff --git a/sysdeps/x86_64/multiarch/mempcpy_chk.S b/sysdeps/x86_64/multiarch/mempcpy_chk.S
index de888f3..7c888d3 100644
--- a/sysdeps/x86_64/multiarch/mempcpy_chk.S
+++ b/sysdeps/x86_64/multiarch/mempcpy_chk.S
@@ -38,19 +38,22 @@ ENTRY(__mempcpy_chk)
jz 1f
HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER)
jz 1f
- leaq __mempcpy_chk_avx512_no_vzeroupper(%rip), %rax
+ lea __mempcpy_chk_avx512_no_vzeroupper(%rip), %RAX_LP
ret
#endif
-1: leaq __mempcpy_chk_sse2(%rip), %rax
+1: lea __mempcpy_chk_avx_unaligned(%rip), %RAX_LP
+ HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load)
+ jnz 2f
+ lea __mempcpy_chk_sse2_unaligned(%rip), %RAX_LP
+ HAS_ARCH_FEATURE (Fast_Unaligned_Load)
+ jnz 2f
+ lea __mempcpy_chk_sse2(%rip), %RAX_LP
HAS_CPU_FEATURE (SSSE3)
jz 2f
- leaq __mempcpy_chk_ssse3(%rip), %rax
+ lea __mempcpy_chk_ssse3_back(%rip), %RAX_LP
HAS_ARCH_FEATURE (Fast_Copy_Backward)
- jz 2f
- leaq __mempcpy_chk_ssse3_back(%rip), %rax
- HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load)
- jz 2f
- leaq __mempcpy_chk_avx_unaligned(%rip), %rax
+ jnz 2f
+ lea __mempcpy_chk_ssse3(%rip), %RAX_LP
2: ret
END(__mempcpy_chk)
# else
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=175e5a4dcf431f9820dd483b0ba93d165d8652b2
commit 175e5a4dcf431f9820dd483b0ba93d165d8652b2
Author: H.J. Lu <hjl.tools@gmail.com>
Date: Mon Mar 7 05:42:46 2016 -0800
Enable __mempcpy_sse2_unaligned
Check Fast_Unaligned_Load for __mempcpy_sse2_unaligned. The new
selection order is:
1. __mempcpy_avx_unaligned if AVX_Fast_Unaligned_Load bit is set.
2. __mempcpy_sse2_unaligned if Fast_Unaligned_Load bit is set.
3. __mempcpy_sse2 if SSSE3 isn't available.
4. __mempcpy_ssse3_back if Fast_Copy_Backward bit is set.
5. __mempcpy_ssse3
[BZ #19776]
* sysdeps/x86_64/multiarch/mempcpy.S (__mempcpy): Check
Fast_Unaligned_Load to enable __mempcpy_sse2_unaligned.
diff --git a/sysdeps/x86_64/multiarch/mempcpy.S b/sysdeps/x86_64/multiarch/mempcpy.S
index b85cf27..05c1fc8 100644
--- a/sysdeps/x86_64/multiarch/mempcpy.S
+++ b/sysdeps/x86_64/multiarch/mempcpy.S
@@ -36,19 +36,22 @@ ENTRY(__mempcpy)
jz 1f
HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER)
jz 1f
- leaq __mempcpy_avx512_no_vzeroupper(%rip), %rax
+ lea __mempcpy_avx512_no_vzeroupper(%rip), %RAX_LP
ret
#endif
-1: leaq __mempcpy_sse2(%rip), %rax
+1: lea __mempcpy_avx_unaligned(%rip), %RAX_LP
+ HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load)
+ jnz 2f
+ lea __mempcpy_sse2_unaligned(%rip), %RAX_LP
+ HAS_ARCH_FEATURE (Fast_Unaligned_Load)
+ jnz 2f
+ lea __mempcpy_sse2(%rip), %RAX_LP
HAS_CPU_FEATURE (SSSE3)
jz 2f
- leaq __mempcpy_ssse3(%rip), %rax
+ lea __mempcpy_ssse3_back(%rip), %RAX_LP
HAS_ARCH_FEATURE (Fast_Copy_Backward)
- jz 2f
- leaq __mempcpy_ssse3_back(%rip), %rax
- HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load)
- jz 2f
- leaq __mempcpy_avx_unaligned(%rip), %rax
+ jnz 2f
+ lea __mempcpy_ssse3(%rip), %RAX_LP
2: ret
END(__mempcpy)
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=32920004ade2b9fd8ac3b3a0e177c0687b536a04
commit 32920004ade2b9fd8ac3b3a0e177c0687b536a04
Author: H.J. Lu <hjl.tools@gmail.com>
Date: Sun Mar 6 17:06:41 2016 -0800
Add entry points for __mempcpy_sse2_unaligned and _chk functions
Add entry points for __mempcpy_chk_sse2_unaligned,
__mempcpy_sse2_unaligned and __memcpy_chk_sse2_unaligned.
[BZ #19776]
* sysdeps/x86_64/multiarch/ifunc-impl-list.c
(__libc_ifunc_impl_list): Test __memcpy_chk_sse2_unaligned,
__mempcpy_chk_sse2_unaligned and __mempcpy_sse2_unaligned.
* sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S
(__mempcpy_chk_sse2_unaligned): New.
(__mempcpy_sse2_unaligned): Likewise.
(__memcpy_chk_sse2_unaligned): Likewise.
(L(start)): New label.
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index b0d300d..26b4137 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -284,6 +284,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
HAS_CPU_FEATURE (SSSE3),
__memcpy_chk_ssse3)
IFUNC_IMPL_ADD (array, i, __memcpy_chk, 1,
+ __memcpy_chk_sse2_unaligned)
+ IFUNC_IMPL_ADD (array, i, __memcpy_chk, 1,
__memcpy_chk_erms)
IFUNC_IMPL_ADD (array, i, __memcpy_chk, 1,
__memcpy_chk_sse2))
@@ -323,6 +325,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
HAS_CPU_FEATURE (SSSE3),
__mempcpy_chk_ssse3)
IFUNC_IMPL_ADD (array, i, __mempcpy_chk, 1,
+ __mempcpy_chk_sse2_unaligned)
+ IFUNC_IMPL_ADD (array, i, __mempcpy_chk, 1,
__mempcpy_chk_erms)
IFUNC_IMPL_ADD (array, i, __mempcpy_chk, 1,
__mempcpy_chk_sse2))
@@ -341,6 +345,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
__mempcpy_ssse3_back)
IFUNC_IMPL_ADD (array, i, mempcpy, HAS_CPU_FEATURE (SSSE3),
__mempcpy_ssse3)
+ IFUNC_IMPL_ADD (array, i, mempcpy, 1,
+ __mempcpy_sse2_unaligned)
IFUNC_IMPL_ADD (array, i, mempcpy, 1, __mempcpy_erms)
IFUNC_IMPL_ADD (array, i, mempcpy, 1, __mempcpy_sse2))
diff --git a/sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S b/sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S
index 335a498..947c50f 100644
--- a/sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S
+++ b/sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S
@@ -22,9 +22,29 @@
#include "asm-syntax.h"
+# ifdef SHARED
+ENTRY (__mempcpy_chk_sse2_unaligned)
+ cmpq %rdx, %rcx
+ jb HIDDEN_JUMPTARGET (__chk_fail)
+END (__mempcpy_chk_sse2_unaligned)
+# endif
+
+ENTRY (__mempcpy_sse2_unaligned)
+ mov %rdi, %rax
+ add %rdx, %rax
+ jmp L(start)
+END (__mempcpy_sse2_unaligned)
+
+# ifdef SHARED
+ENTRY (__memcpy_chk_sse2_unaligned)
+ cmpq %rdx, %rcx
+ jb HIDDEN_JUMPTARGET (__chk_fail)
+END (__memcpy_chk_sse2_unaligned)
+# endif
ENTRY(__memcpy_sse2_unaligned)
movq %rdi, %rax
+L(start):
testq %rdx, %rdx
je L(return)
cmpq $16, %rdx
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=12cf609fa5ffb50af921f634c6e548b50960948d
commit 12cf609fa5ffb50af921f634c6e548b50960948d
Author: H.J. Lu <hjl.tools@gmail.com>
Date: Sun Mar 6 16:52:53 2016 -0800
Remove L(overlapping) from memcpy-sse2-unaligned.S
Since memcpy doesn't need to check overlapping source and destination,
we can remove L(overlapping).
[BZ #19776]
* sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S
(L(overlapping)): Removed.
diff --git a/sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S b/sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S
index 19d8aa6..335a498 100644
--- a/sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S
+++ b/sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S
@@ -25,12 +25,8 @@
ENTRY(__memcpy_sse2_unaligned)
movq %rdi, %rax
- movq %rsi, %r11
- leaq (%rdx,%rdx), %rcx
- subq %rdi, %r11
- subq %rdx, %r11
- cmpq %rcx, %r11
- jb L(overlapping)
+ testq %rdx, %rdx
+ je L(return)
cmpq $16, %rdx
jbe L(less_16)
movdqu (%rsi), %xmm8
@@ -89,45 +85,6 @@ L(loop):
cmpq %rcx, %rdx
jne L(loop)
ret
-L(overlapping):
- testq %rdx, %rdx
- .p2align 4,,5
- je L(return)
- movq %rdx, %r9
- leaq 16(%rsi), %rcx
- leaq 16(%rdi), %r8
- shrq $4, %r9
- movq %r9, %r11
- salq $4, %r11
- cmpq %rcx, %rdi
- setae %cl
- cmpq %r8, %rsi
- setae %r8b
- orl %r8d, %ecx
- cmpq $15, %rdx
- seta %r8b
- testb %r8b, %cl
- je .L21
- testq %r11, %r11
- je .L21
- xorl %ecx, %ecx
- xorl %r8d, %r8d
-.L7:
- movdqu (%rsi,%rcx), %xmm8
- addq $1, %r8
- movdqu %xmm8, (%rdi,%rcx)
- addq $16, %rcx
- cmpq %r8, %r9
- ja .L7
- cmpq %r11, %rdx
- je L(return)
-.L21:
- movzbl (%rsi,%r11), %ecx
- movb %cl, (%rdi,%r11)
- addq $1, %r11
- cmpq %r11, %rdx
- ja .L21
- ret
L(less_16):
testb $24, %dl
jne L(between_9_16)
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=849e297b6803c989f33c05f5be8f93dbb3274bb1
commit 849e297b6803c989f33c05f5be8f93dbb3274bb1
Author: H.J. Lu <hjl.tools@gmail.com>
Date: Sun Mar 6 13:46:54 2016 -0800
Don't use RAX as scratch register
To prepare sharing code with mempcpy, don't use RAX as scratch register
so that RAX can be set to the return value at entrance.
[BZ #19776]
* sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S: Don't use
RAX as scratch register.
diff --git a/sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S b/sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S
index 7207753..19d8aa6 100644
--- a/sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S
+++ b/sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S
@@ -24,11 +24,12 @@
ENTRY(__memcpy_sse2_unaligned)
- movq %rsi, %rax
+ movq %rdi, %rax
+ movq %rsi, %r11
leaq (%rdx,%rdx), %rcx
- subq %rdi, %rax
- subq %rdx, %rax
- cmpq %rcx, %rax
+ subq %rdi, %r11
+ subq %rdx, %r11
+ cmpq %rcx, %r11
jb L(overlapping)
cmpq $16, %rdx
jbe L(less_16)
@@ -39,7 +40,6 @@ ENTRY(__memcpy_sse2_unaligned)
movdqu %xmm8, -16(%rdi,%rdx)
ja .L31
L(return):
- movq %rdi, %rax
ret
.p2align 4,,10
.p2align 4
@@ -64,16 +64,16 @@ L(return):
addq %rdi, %rdx
andq $-64, %rdx
andq $-64, %rcx
- movq %rcx, %rax
- subq %rdi, %rax
- addq %rax, %rsi
+ movq %rcx, %r11
+ subq %rdi, %r11
+ addq %r11, %rsi
cmpq %rdx, %rcx
je L(return)
movq %rsi, %r10
subq %rcx, %r10
leaq 16(%r10), %r9
leaq 32(%r10), %r8
- leaq 48(%r10), %rax
+ leaq 48(%r10), %r11
.p2align 4,,10
.p2align 4
L(loop):
@@ -83,12 +83,12 @@ L(loop):
movdqa %xmm8, 16(%rcx)
movdqu (%rcx,%r8), %xmm8
movdqa %xmm8, 32(%rcx)
- movdqu (%rcx,%rax), %xmm8
+ movdqu (%rcx,%r11), %xmm8
movdqa %xmm8, 48(%rcx)
addq $64, %rcx
cmpq %rcx, %rdx
jne L(loop)
- jmp L(return)
+ ret
L(overlapping):
testq %rdx, %rdx
.p2align 4,,5
@@ -97,8 +97,8 @@ L(overlapping):
leaq 16(%rsi), %rcx
leaq 16(%rdi), %r8
shrq $4, %r9
- movq %r9, %rax
- salq $4, %rax
+ movq %r9, %r11
+ salq $4, %r11
cmpq %rcx, %rdi
setae %cl
cmpq %r8, %rsi
@@ -107,9 +107,9 @@ L(overlapping):
cmpq $15, %rdx
seta %r8b
testb %r8b, %cl
- je .L16
- testq %rax, %rax
- je .L16
+ je .L21
+ testq %r11, %r11
+ je .L21
xorl %ecx, %ecx
xorl %r8d, %r8d
.L7:
@@ -119,15 +119,15 @@ L(overlapping):
addq $16, %rcx
cmpq %r8, %r9
ja .L7
- cmpq %rax, %rdx
+ cmpq %r11, %rdx
je L(return)
.L21:
- movzbl (%rsi,%rax), %ecx
- movb %cl, (%rdi,%rax)
- addq $1, %rax
- cmpq %rax, %rdx
+ movzbl (%rsi,%r11), %ecx
+ movb %cl, (%rdi,%r11)
+ addq $1, %r11
+ cmpq %r11, %rdx
ja .L21
- jmp L(return)
+ ret
L(less_16):
testb $24, %dl
jne L(between_9_16)
@@ -137,28 +137,25 @@ L(less_16):
testq %rdx, %rdx
.p2align 4,,2
je L(return)
- movzbl (%rsi), %eax
+ movzbl (%rsi), %ecx
testb $2, %dl
- movb %al, (%rdi)
+ movb %cl, (%rdi)
je L(return)
- movzwl -2(%rsi,%rdx), %eax
- movw %ax, -2(%rdi,%rdx)
- jmp L(return)
+ movzwl -2(%rsi,%rdx), %ecx
+ movw %cx, -2(%rdi,%rdx)
+ ret
L(between_9_16):
- movq (%rsi), %rax
- movq %rax, (%rdi)
- movq -8(%rsi,%rdx), %rax
- movq %rax, -8(%rdi,%rdx)
- jmp L(return)
-.L16:
- xorl %eax, %eax
- jmp .L21
+ movq (%rsi), %rcx
+ movq %rcx, (%rdi)
+ movq -8(%rsi,%rdx), %rcx
+ movq %rcx, -8(%rdi,%rdx)
+ ret
L(between_5_8):
- movl (%rsi), %eax
- movl %eax, (%rdi)
- movl -4(%rsi,%rdx), %eax
- movl %eax, -4(%rdi,%rdx)
- jmp L(return)
+ movl (%rsi), %ecx
+ movl %ecx, (%rdi)
+ movl -4(%rsi,%rdx), %ecx
+ movl %ecx, -4(%rdi,%rdx)
+ ret
END(__memcpy_sse2_unaligned)
#endif
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=4891daae057485dfaa319971afd1d4fe52fbc929
commit 4891daae057485dfaa319971afd1d4fe52fbc929
Author: H.J. Lu <hjl.tools@gmail.com>
Date: Sun Mar 6 14:16:32 2016 -0800
Remove dead code from memcpy-sse2-unaligned.S
There are
ENTRY(__memcpy_sse2_unaligned)
movq %rsi, %rax
leaq (%rdx,%rdx), %rcx
subq %rdi, %rax
subq %rdx, %rax
cmpq %rcx, %rax
jb L(overlapping)
When branch is taken,
cmpq %rsi, %rdi
jae .L3
will never be taken. We can remove the dead code.
[BZ #19776]
* sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S (.L3): Removed.
diff --git a/sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S b/sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S
index c450983..7207753 100644
--- a/sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S
+++ b/sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S
@@ -90,8 +90,6 @@ L(loop):
jne L(loop)
jmp L(return)
L(overlapping):
- cmpq %rsi, %rdi
- jae .L3
testq %rdx, %rdx
.p2align 4,,5
je L(return)
@@ -146,15 +144,6 @@ L(less_16):
movzwl -2(%rsi,%rdx), %eax
movw %ax, -2(%rdi,%rdx)
jmp L(return)
-.L3:
- leaq -1(%rdx), %rax
- .p2align 4,,10
- .p2align 4
-.L11:
- movzbl (%rsi,%rax), %edx
- movb %dl, (%rdi,%rax)
- subq $1, %rax
- jmp .L11
L(between_9_16):
movq (%rsi), %rax
movq %rax, (%rdi)
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=735a17465774fc651dfe26cf27e3cf6cdf1883f6
commit 735a17465774fc651dfe26cf27e3cf6cdf1883f6
Author: H.J. Lu <hjl.tools@gmail.com>
Date: Fri Apr 11 08:51:16 2014 -0700
Test 32-bit ERMS memcpy/memset
* sysdeps/i386/i686/multiarch/ifunc-impl-list.c
(__libc_ifunc_impl_list): Add __bcopy_erms, __bzero_erms,
__memmove_chk_erms, __memmove_erms, __memset_chk_erms,
__memset_erms, __memcpy_chk_erms, __memcpy_erms,
__mempcpy_chk_erms and __mempcpy_erms.
diff --git a/sysdeps/i386/i686/multiarch/ifunc-impl-list.c b/sysdeps/i386/i686/multiarch/ifunc-impl-list.c
index ef30a95..f3cbca0 100644
--- a/sysdeps/i386/i686/multiarch/ifunc-impl-list.c
+++ b/sysdeps/i386/i686/multiarch/ifunc-impl-list.c
@@ -44,6 +44,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
__bcopy_ssse3)
IFUNC_IMPL_ADD (array, i, bcopy, HAS_CPU_FEATURE (SSE2),
__bcopy_sse2_unaligned)
+ IFUNC_IMPL_ADD (array, i, bcopy, 1, __bcopy_erms)
IFUNC_IMPL_ADD (array, i, bcopy, 1, __bcopy_ia32))
/* Support sysdeps/i386/i686/multiarch/bzero.S. */
@@ -52,6 +53,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
__bzero_sse2_rep)
IFUNC_IMPL_ADD (array, i, bzero, HAS_CPU_FEATURE (SSE2),
__bzero_sse2)
+ IFUNC_IMPL_ADD (array, i, bzero, 1, __bzero_erms)
IFUNC_IMPL_ADD (array, i, bzero, 1, __bzero_ia32))
/* Support sysdeps/i386/i686/multiarch/memchr.S. */
@@ -82,6 +84,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
HAS_CPU_FEATURE (SSE2),
__memmove_chk_sse2_unaligned)
IFUNC_IMPL_ADD (array, i, __memmove_chk, 1,
+ __memmove_chk_erms)
+ IFUNC_IMPL_ADD (array, i, __memmove_chk, 1,
__memmove_chk_ia32))
/* Support sysdeps/i386/i686/multiarch/memmove.S. */
@@ -92,6 +96,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
__memmove_ssse3)
IFUNC_IMPL_ADD (array, i, memmove, HAS_CPU_FEATURE (SSE2),
__memmove_sse2_unaligned)
+ IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_erms)
IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_ia32))
/* Support sysdeps/i386/i686/multiarch/memrchr.S. */
@@ -111,6 +116,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
HAS_CPU_FEATURE (SSE2),
__memset_chk_sse2)
IFUNC_IMPL_ADD (array, i, __memset_chk, 1,
+ __memset_chk_erms)
+ IFUNC_IMPL_ADD (array, i, __memset_chk, 1,
__memset_chk_ia32))
/* Support sysdeps/i386/i686/multiarch/memset.S. */
@@ -119,6 +126,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
__memset_sse2_rep)
IFUNC_IMPL_ADD (array, i, memset, HAS_CPU_FEATURE (SSE2),
__memset_sse2)
+ IFUNC_IMPL_ADD (array, i, memset, 1, __memset_erms)
IFUNC_IMPL_ADD (array, i, memset, 1, __memset_ia32))
/* Support sysdeps/i386/i686/multiarch/rawmemchr.S. */
@@ -319,6 +327,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
HAS_CPU_FEATURE (SSE2),
__memcpy_chk_sse2_unaligned)
IFUNC_IMPL_ADD (array, i, __memcpy_chk, 1,
+ __memcpy_chk_erms)
+ IFUNC_IMPL_ADD (array, i, __memcpy_chk, 1,
__memcpy_chk_ia32))
/* Support sysdeps/i386/i686/multiarch/memcpy.S. */
@@ -329,6 +339,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
__memcpy_ssse3)
IFUNC_IMPL_ADD (array, i, memcpy, HAS_CPU_FEATURE (SSE2),
__memcpy_sse2_unaligned)
+ IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_erms)
IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_ia32))
/* Support sysdeps/i386/i686/multiarch/mempcpy_chk.S. */
@@ -343,6 +354,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
HAS_CPU_FEATURE (SSE2),
__mempcpy_chk_sse2_unaligned)
IFUNC_IMPL_ADD (array, i, __mempcpy_chk, 1,
+ __mempcpy_chk_erms)
+ IFUNC_IMPL_ADD (array, i, __mempcpy_chk, 1,
__mempcpy_chk_ia32))
/* Support sysdeps/i386/i686/multiarch/mempcpy.S. */
@@ -353,6 +366,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
__mempcpy_ssse3)
IFUNC_IMPL_ADD (array, i, mempcpy, HAS_CPU_FEATURE (SSE2),
__mempcpy_sse2_unaligned)
+ IFUNC_IMPL_ADD (array, i, mempcpy, 1, __mempcpy_erms)
IFUNC_IMPL_ADD (array, i, mempcpy, 1, __mempcpy_ia32))
/* Support sysdeps/i386/i686/multiarch/strlen.S. */
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=665d19ea1eaa11d1a37d2e8fe679f92075583bf8
commit 665d19ea1eaa11d1a37d2e8fe679f92075583bf8
Author: H.J. Lu <hjl.tools@gmail.com>
Date: Fri Apr 11 08:25:17 2014 -0700
Test 64-bit ERMS memcpy/memset
* sysdeps/x86_64/multiarch/ifunc-impl-list.c
(__libc_ifunc_impl_list): Add __memmove_chk_erms,
__memmove_erms, __memset_erms, __memset_chk_erms,
__memcpy_chk_erms, __memcpy_erms, __mempcpy_chk_erms and
__mempcpy_erms.
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index 188b6d3..b0d300d 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -63,6 +63,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
HAS_CPU_FEATURE (SSSE3),
__memmove_chk_ssse3)
IFUNC_IMPL_ADD (array, i, __memmove_chk, 1,
+ __memmove_chk_erms)
+ IFUNC_IMPL_ADD (array, i, __memmove_chk, 1,
__memmove_chk_sse2))
/* Support sysdeps/x86_64/multiarch/memmove.S. */
@@ -79,12 +81,15 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
__memmove_ssse3_back)
IFUNC_IMPL_ADD (array, i, memmove, HAS_CPU_FEATURE (SSSE3),
__memmove_ssse3)
+ IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_erms)
IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_sse2))
/* Support sysdeps/x86_64/multiarch/memset_chk.S. */
IFUNC_IMPL (i, name, __memset_chk,
IFUNC_IMPL_ADD (array, i, __memset_chk, 1,
__memset_chk_sse2)
+ IFUNC_IMPL_ADD (array, i, __memset_chk, 1,
+ __memset_chk_erms)
IFUNC_IMPL_ADD (array, i, __memset_chk,
HAS_ARCH_FEATURE (AVX2_Usable),
__memset_chk_avx2)
@@ -98,6 +103,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/x86_64/multiarch/memset.S. */
IFUNC_IMPL (i, name, memset,
IFUNC_IMPL_ADD (array, i, memset, 1, __memset_sse2)
+ IFUNC_IMPL_ADD (array, i, memset, 1, __memset_erms)
IFUNC_IMPL_ADD (array, i, memset,
HAS_ARCH_FEATURE (AVX2_Usable),
__memset_avx2)
@@ -278,6 +284,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
HAS_CPU_FEATURE (SSSE3),
__memcpy_chk_ssse3)
IFUNC_IMPL_ADD (array, i, __memcpy_chk, 1,
+ __memcpy_chk_erms)
+ IFUNC_IMPL_ADD (array, i, __memcpy_chk, 1,
__memcpy_chk_sse2))
/* Support sysdeps/x86_64/multiarch/memcpy.S. */
@@ -295,6 +303,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
__memcpy_avx512_no_vzeroupper)
#endif
IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_sse2_unaligned)
+ IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_erms)
IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_sse2))
/* Support sysdeps/x86_64/multiarch/mempcpy_chk.S. */
@@ -314,6 +323,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
HAS_CPU_FEATURE (SSSE3),
__mempcpy_chk_ssse3)
IFUNC_IMPL_ADD (array, i, __mempcpy_chk, 1,
+ __mempcpy_chk_erms)
+ IFUNC_IMPL_ADD (array, i, __mempcpy_chk, 1,
__mempcpy_chk_sse2))
/* Support sysdeps/x86_64/multiarch/mempcpy.S. */
@@ -330,6 +341,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
__mempcpy_ssse3_back)
IFUNC_IMPL_ADD (array, i, mempcpy, HAS_CPU_FEATURE (SSSE3),
__mempcpy_ssse3)
+ IFUNC_IMPL_ADD (array, i, mempcpy, 1, __mempcpy_erms)
IFUNC_IMPL_ADD (array, i, mempcpy, 1, __mempcpy_sse2))
/* Support sysdeps/x86_64/multiarch/strncmp.S. */
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=ef0a94fe0d1f51db1532b7f530841bf5ad2e2ba5
commit ef0a94fe0d1f51db1532b7f530841bf5ad2e2ba5
Author: H.J. Lu <hjl.tools@gmail.com>
Date: Wed Sep 21 15:21:28 2011 -0700
Add 32-bit ERMS memcpy/memset
* sysdeps/i386/i686/multiarch/Makefile (sysdep_routines): Add
bcopy-erms, memcpy-erms, memmove-erms, mempcpy-erms, bzero-erms
and memset-erms.
* sysdeps/i386/i686/multiarch/bcopy-erms.S: New file.
* sysdeps/i386/i686/multiarch/bzero-erms.S: Likewise.
* sysdeps/i386/i686/multiarch/memcpy-erms.S: Likewise.
* sysdeps/i386/i686/multiarch/memmove-erms.S: Likewise.
* sysdeps/i386/i686/multiarch/mempcpy-erms.S: Likewise.
* sysdeps/i386/i686/multiarch/memset-erms.S: Likewise.
* sysdeps/i386/i686/multiarch/ifunc-defines.sym: Add
COMMON_CPUID_INDEX_7.
* sysdeps/i386/i686/multiarch/bcopy.S: Enable ERMS optimization
for Fast_ERMS.
* sysdeps/i386/i686/multiarch/bzero.S: Likewise.
* sysdeps/i386/i686/multiarch/memcpy.S: Likewise.
* sysdeps/i386/i686/multiarch/memcpy_chk.S: Likewise.
* sysdeps/i386/i686/multiarch/memmove.S: Likewise.
* sysdeps/i386/i686/multiarch/memmove_chk.S: Likewise.
* sysdeps/i386/i686/multiarch/mempcpy.S: Likewise.
* sysdeps/i386/i686/multiarch/mempcpy_chk.S: Likewise.
* sysdeps/i386/i686/multiarch/memset.S: Likewise.
* sysdeps/i386/i686/multiarch/memset_chk.S: Likewise.
diff --git a/sysdeps/i386/i686/multiarch/Makefile b/sysdeps/i386/i686/multiarch/Makefile
index 700010d..6bcef4c 100644
--- a/sysdeps/i386/i686/multiarch/Makefile
+++ b/sysdeps/i386/i686/multiarch/Makefile
@@ -25,7 +25,9 @@ sysdep_routines += bzero-sse2 memset-sse2 memcpy-ssse3 mempcpy-ssse3 \
strcasecmp_l-sse4 strncase_l-sse4 \
bcopy-sse2-unaligned memcpy-sse2-unaligned \
mempcpy-sse2-unaligned memmove-sse2-unaligned \
- strcspn-c strpbrk-c strspn-c
+ strcspn-c strpbrk-c strspn-c \
+ bcopy-erms memcpy-erms memmove-erms mempcpy-erms \
+ bzero-erms memset-erms
CFLAGS-varshift.c += -msse4
CFLAGS-strcspn-c.c += -msse4
CFLAGS-strpbrk-c.c += -msse4
diff --git a/sysdeps/i386/i686/multiarch/bcopy-erms.S b/sysdeps/i386/i686/multiarch/bcopy-erms.S
new file mode 100644
index 0000000..da9e160
--- /dev/null
+++ b/sysdeps/i386/i686/multiarch/bcopy-erms.S
@@ -0,0 +1,4 @@
+#define USE_AS_MEMMOVE
+#define USE_AS_BCOPY
+#define MEMCPY __bcopy_erms
+#include "memcpy-erms.S"
diff --git a/sysdeps/i386/i686/multiarch/bcopy.S b/sysdeps/i386/i686/multiarch/bcopy.S
index ce6661b..04f5a3a 100644
--- a/sysdeps/i386/i686/multiarch/bcopy.S
+++ b/sysdeps/i386/i686/multiarch/bcopy.S
@@ -27,6 +27,9 @@
ENTRY(bcopy)
.type bcopy, @gnu_indirect_function
LOAD_GOT_AND_RTLD_GLOBAL_RO
+ LOAD_FUNC_GOT_EAX (__bcopy_erms)
+ HAS_ARCH_FEATURE (Fast_ERMS)
+ jnz 2f
LOAD_FUNC_GOT_EAX (__bcopy_ia32)
HAS_CPU_FEATURE (SSE2)
jz 2f
diff --git a/sysdeps/i386/i686/multiarch/bzero-erms.S b/sysdeps/i386/i686/multiarch/bzero-erms.S
new file mode 100644
index 0000000..2c3bed6
--- /dev/null
+++ b/sysdeps/i386/i686/multiarch/bzero-erms.S
@@ -0,0 +1,3 @@
+#define USE_AS_BZERO
+#define __memset_erms __bzero_erms
+#include "memset-erms.S"
diff --git a/sysdeps/i386/i686/multiarch/bzero.S b/sysdeps/i386/i686/multiarch/bzero.S
index 738ca69..a61b5d2 100644
--- a/sysdeps/i386/i686/multiarch/bzero.S
+++ b/sysdeps/i386/i686/multiarch/bzero.S
@@ -27,6 +27,9 @@
ENTRY(__bzero)
.type __bzero, @gnu_indirect_function
LOAD_GOT_AND_RTLD_GLOBAL_RO
+ LOAD_FUNC_GOT_EAX (__bzero_erms)
+ HAS_ARCH_FEATURE (Fast_ERMS)
+ jnz 2f
LOAD_FUNC_GOT_EAX (__bzero_ia32)
HAS_CPU_FEATURE (SSE2)
jz 2f
diff --git a/sysdeps/i386/i686/multiarch/ifunc-defines.sym b/sysdeps/i386/i686/multiarch/ifunc-defines.sym
index 96e9cfa..3df946f 100644
--- a/sysdeps/i386/i686/multiarch/ifunc-defines.sym
+++ b/sysdeps/i386/i686/multiarch/ifunc-defines.sym
@@ -16,4 +16,5 @@ FEATURE_OFFSET offsetof (struct cpu_features, feature)
FEATURE_SIZE sizeof (unsigned int)
COMMON_CPUID_INDEX_1
+COMMON_CPUID_INDEX_7
FEATURE_INDEX_1
diff --git a/sysdeps/i386/i686/multiarch/memcpy-erms.S b/sysdeps/i386/i686/multiarch/memcpy-erms.S
new file mode 100644
index 0000000..df0c801
--- /dev/null
+++ b/sysdeps/i386/i686/multiarch/memcpy-erms.S
@@ -0,0 +1,100 @@
+/* memcpy with Enhanced REP MOVSB/STOSB
+ Copyright (C) 2014 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, write to the Free
+ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ 02111-1307 USA. */
+
+#include <sysdep.h>
+
+#if !defined NOT_IN_libc \
+ && (defined SHARED \
+ || defined USE_AS_MEMMOVE \
+ || !defined USE_MULTIARCH)
+
+#include "asm-syntax.h"
+
+#ifndef MEMCPY
+# define MEMCPY __memcpy_erms
+# define MEMCPY_CHK __memcpy_chk_erms
+#endif
+
+#ifdef USE_AS_BCOPY
+# define STR2 12
+# define STR1 STR2+4
+# define N STR1+4
+#else
+# define STR1 12
+# define STR2 STR1+4
+# define N STR2+4
+#endif
+
+#define CFI_PUSH(REG) \
+ cfi_adjust_cfa_offset (4); \
+ cfi_rel_offset (REG, 0)
+
+#define CFI_POP(REG) \
+ cfi_adjust_cfa_offset (-4); \
+ cfi_restore (REG)
+
+#define PUSH(REG) pushl REG; CFI_PUSH (REG)
+#define POP(REG) popl REG; CFI_POP (REG)
+
+ .section .text.erms,"ax",@progbits
+#if !defined USE_AS_BCOPY
+ENTRY (MEMCPY_CHK)
+ movl 12(%esp), %eax
+ cmpl %eax, 16(%esp)
+ jb HIDDEN_JUMPTARGET (__chk_fail)
+END (MEMCPY_CHK)
+#endif
+ENTRY (MEMCPY)
+ PUSH (%esi)
+ PUSH (%edi)
+ movl N(%esp), %ecx
+ movl STR1(%esp), %edi
+ movl STR2(%esp), %esi
+ mov %edi, %eax
+#ifdef USE_AS_MEMPCPY
+ add %ecx, %eax
+#endif
+
+#ifdef USE_AS_MEMMOVE
+ cmp %esi, %edi
+ ja L(copy_backward)
+ je L(bwd_write_0bytes)
+#endif
+
+ rep movsb
+ POP (%edi)
+ POP (%esi)
+ ret
+
+#ifdef USE_AS_MEMMOVE
+L(copy_backward):
+ lea -1(%edi,%ecx), %edi
+ lea -1(%esi,%ecx), %esi
+ std
+ rep movsb
+ cld
+L(bwd_write_0bytes):
+ POP (%edi)
+ POP (%esi)
+ ret
+#endif
+
+END (MEMCPY)
+
+#endif
diff --git a/sysdeps/i386/i686/multiarch/memcpy.S b/sysdeps/i386/i686/multiarch/memcpy.S
index 652b5a2..79ae41f 100644
--- a/sysdeps/i386/i686/multiarch/memcpy.S
+++ b/sysdeps/i386/i686/multiarch/memcpy.S
@@ -29,6 +29,9 @@
ENTRY(memcpy)
.type memcpy, @gnu_indirect_function
LOAD_GOT_AND_RTLD_GLOBAL_RO
+ LOAD_FUNC_GOT_EAX (__memcpy_erms)
+ HAS_ARCH_FEATURE (Fast_ERMS)
+ jnz 2f
LOAD_FUNC_GOT_EAX (__memcpy_ia32)
HAS_CPU_FEATURE (SSE2)
jz 2f
diff --git a/sysdeps/i386/i686/multiarch/memcpy_chk.S b/sysdeps/i386/i686/multiarch/memcpy_chk.S
index 0eee32c..dd1d38a 100644
--- a/sysdeps/i386/i686/multiarch/memcpy_chk.S
+++ b/sysdeps/i386/i686/multiarch/memcpy_chk.S
@@ -30,6 +30,9 @@
ENTRY(__memcpy_chk)
.type __memcpy_chk, @gnu_indirect_function
LOAD_GOT_AND_RTLD_GLOBAL_RO
+ LOAD_FUNC_GOT_EAX (__memcpy_chk_erms)
+ HAS_ARCH_FEATURE (Fast_ERMS)
+ jnz 2f
LOAD_FUNC_GOT_EAX (__memcpy_chk_ia32)
HAS_CPU_FEATURE (SSE2)
jz 2f
diff --git a/sysdeps/i386/i686/multiarch/memmove-erms.S b/sysdeps/i386/i686/multiarch/memmove-erms.S
new file mode 100644
index 0000000..357289a
--- /dev/null
+++ b/sysdeps/i386/i686/multiarch/memmove-erms.S
@@ -0,0 +1,4 @@
+#define USE_AS_MEMMOVE
+#define MEMCPY __memmove_erms
+#define MEMCPY_CHK __memmove_chk_erms
+#include "memcpy-erms.S"
diff --git a/sysdeps/i386/i686/multiarch/memmove.S b/sysdeps/i386/i686/multiarch/memmove.S
index 725a421..13223b3 100644
--- a/sysdeps/i386/i686/multiarch/memmove.S
+++ b/sysdeps/i386/i686/multiarch/memmove.S
@@ -27,6 +27,9 @@
ENTRY(memmove)
.type memmove, @gnu_indirect_function
LOAD_GOT_AND_RTLD_GLOBAL_RO
+ LOAD_FUNC_GOT_EAX (__memmove_erms)
+ HAS_ARCH_FEATURE (Fast_ERMS)
+ jnz 2f
LOAD_FUNC_GOT_EAX (__memmove_ia32)
HAS_CPU_FEATURE (SSE2)
jz 2f
diff --git a/sysdeps/i386/i686/multiarch/memmove_chk.S b/sysdeps/i386/i686/multiarch/memmove_chk.S
index a29bbc9..ed000ee 100644
--- a/sysdeps/i386/i686/multiarch/memmove_chk.S
+++ b/sysdeps/i386/i686/multiarch/memmove_chk.S
@@ -27,6 +27,9 @@
ENTRY(__memmove_chk)
.type __memmove_chk, @gnu_indirect_function
LOAD_GOT_AND_RTLD_GLOBAL_RO
+ LOAD_FUNC_GOT_EAX (__memmove_chk_erms)
+ HAS_ARCH_FEATURE (Fast_ERMS)
+ jnz 2f
LOAD_FUNC_GOT_EAX (__memmove_chk_ia32)
HAS_CPU_FEATURE (SSE2)
jz 2f
@@ -90,5 +93,17 @@ __memmove_chk_ia32:
jmp __memmove_ia32
cfi_endproc
.size __memmove_chk_ia32, .-__memmove_chk_ia32
+
+ .type __memmove_chk_erms, @function
+ .p2align 4;
+__memmove_chk_erms:
+ cfi_startproc
+ CALL_MCOUNT
+ movl 12(%esp), %eax
+ cmpl %eax, 16(%esp)
+ jb __chk_fail
+ jmp __memmove_erms
+ cfi_endproc
+ .size __memmove_chk_erms, .-__memmove_chk_erms
# endif
#endif
diff --git a/sysdeps/i386/i686/multiarch/mempcpy-erms.S b/sysdeps/i386/i686/multiarch/mempcpy-erms.S
new file mode 100644
index 0000000..01d3bf8
--- /dev/null
+++ b/sysdeps/i386/i686/multiarch/mempcpy-erms.S
@@ -0,0 +1,4 @@
+#define USE_AS_MEMPCPY
+#define MEMCPY __mempcpy_erms
+#define MEMCPY_CHK __mempcpy_chk_erms
+#include "memcpy-erms.S"
diff --git a/sysdeps/i386/i686/multiarch/mempcpy.S b/sysdeps/i386/i686/multiarch/mempcpy.S
index b46f3fc..cceae9b 100644
--- a/sysdeps/i386/i686/multiarch/mempcpy.S
+++ b/sysdeps/i386/i686/multiarch/mempcpy.S
@@ -29,6 +29,9 @@
ENTRY(__mempcpy)
.type __mempcpy, @gnu_indirect_function
LOAD_GOT_AND_RTLD_GLOBAL_RO
+ LOAD_FUNC_GOT_EAX (__mempcpy_erms)
+ HAS_ARCH_FEATURE (Fast_ERMS)
+ jnz 2f
LOAD_FUNC_GOT_EAX (__mempcpy_ia32)
HAS_CPU_FEATURE (SSE2)
jz 2f
diff --git a/sysdeps/i386/i686/multiarch/mempcpy_chk.S b/sysdeps/i386/i686/multiarch/mempcpy_chk.S
index 30f3629..97d5179 100644
--- a/sysdeps/i386/i686/multiarch/mempcpy_chk.S
+++ b/sysdeps/i386/i686/multiarch/mempcpy_chk.S
@@ -30,6 +30,9 @@
ENTRY(__mempcpy_chk)
.type __mempcpy_chk, @gnu_indirect_function
LOAD_GOT_AND_RTLD_GLOBAL_RO
+ LOAD_FUNC_GOT_EAX (__mempcpy_chk_erms)
+ HAS_ARCH_FEATURE (Fast_ERMS)
+ jnz 2f
LOAD_FUNC_GOT_EAX (__mempcpy_chk_ia32)
HAS_CPU_FEATURE (SSE2)
jz 2f
diff --git a/sysdeps/i386/i686/multiarch/memset-erms.S b/sysdeps/i386/i686/multiarch/memset-erms.S
new file mode 100644
index 0000000..807a6e4
--- /dev/null
+++ b/sysdeps/i386/i686/multiarch/memset-erms.S
@@ -0,0 +1,69 @@
+/* memset with Enhanced REP MOVSB/STOSB
+ Copyright (C) 2014 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, write to the Free
+ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ 02111-1307 USA. */
+
+#ifndef NOT_IN_libc
+
+#include <sysdep.h>
+
+#define CFI_PUSH(REG) \
+ cfi_adjust_cfa_offset (4); \
+ cfi_rel_offset (REG, 0)
+
+#define CFI_POP(REG) \
+ cfi_adjust_cfa_offset (-4); \
+ cfi_restore (REG)
+
+#define PUSH(REG) pushl REG; CFI_PUSH (REG)
+#define POP(REG) popl REG; CFI_POP (REG)
+
+#define STR1 8
+#ifdef USE_AS_BZERO
+#define N STR1+4
+#else
+#define STR2 STR1+4
+#define N STR2+4
+#endif
+
+ .section .text.erms,"ax",@progbits
+#if defined SHARED && !defined NOT_IN_libc && !defined USE_AS_BZERO
+ENTRY (__memset_chk_erms)
+ movl 12(%esp), %eax
+ cmpl %eax, 16(%esp)
+ jb HIDDEN_JUMPTARGET (__chk_fail)
+END (__memset_chk_erms)
+#endif
+ENTRY (__memset_erms)
+ PUSH (%edi)
+ movl N(%esp), %ecx
+ movl STR1(%esp), %edi
+#ifdef USE_AS_BZERO
+ xor %eax, %eax
+#else
+ movzbl STR2(%esp), %eax
+ mov %edi, %edx
+#endif
+ rep stosb
+#ifndef USE_AS_BZERO
+ mov %edx, %eax
+#endif
+ POP (%edi)
+ ret
+END (__memset_erms)
+
+#endif
diff --git a/sysdeps/i386/i686/multiarch/memset.S b/sysdeps/i386/i686/multiarch/memset.S
index 14180e4..3c11b91 100644
--- a/sysdeps/i386/i686/multiarch/memset.S
+++ b/sysdeps/i386/i686/multiarch/memset.S
@@ -27,6 +27,9 @@
ENTRY(memset)
.type memset, @gnu_indirect_function
LOAD_GOT_AND_RTLD_GLOBAL_RO
+ LOAD_FUNC_GOT_EAX (__memset_erms)
+ HAS_ARCH_FEATURE (Fast_ERMS)
+ jnz 2f
LOAD_FUNC_GOT_EAX (__memset_ia32)
HAS_CPU_FEATURE (SSE2)
jz 2f
diff --git a/sysdeps/i386/i686/multiarch/memset_chk.S b/sysdeps/i386/i686/multiarch/memset_chk.S
index d73f202..fa1c5fb 100644
--- a/sysdeps/i386/i686/multiarch/memset_chk.S
+++ b/sysdeps/i386/i686/multiarch/memset_chk.S
@@ -27,6 +27,9 @@
ENTRY(__memset_chk)
.type __memset_chk, @gnu_indirect_function
LOAD_GOT_AND_RTLD_GLOBAL_RO
+ LOAD_FUNC_GOT_EAX (__memset_chk_erms)
+ HAS_ARCH_FEATURE (Fast_ERMS)
+ jnz 2f
LOAD_FUNC_GOT_EAX (__memset_chk_ia32)
HAS_CPU_FEATURE (SSE2)
jz 2f
@@ -78,5 +81,17 @@ __memset_chk_ia32:
jmp __memset_ia32
cfi_endproc
.size __memset_chk_ia32, .-__memset_chk_ia32
+
+ .type __memset_chk_erms, @function
+ .p2align 4;
+__memset_chk_erms:
+ cfi_startproc
+ CALL_MCOUNT
+ movl 12(%esp), %eax
+ cmpl %eax, 16(%esp)
+ jb __chk_fail
+ jmp __memset_erms
+ cfi_endproc
+ .size __memset_chk_erms, .-__memset_chk_erms
# endif
#endif
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=ef81a3a93a95dd0017fcefe4207306d6875c3794
commit ef81a3a93a95dd0017fcefe4207306d6875c3794
Author: H.J. Lu <hjl.tools@gmail.com>
Date: Thu Sep 15 16:16:10 2011 -0700
Add 64-bit ERMS memcpy and memset
* sysdeps/x86_64/multiarch/Makefile (sysdep_routines): Add
memcpy-erms, mempcpy-erms, memmove-erms and memset-erms.
* sysdeps/x86_64/multiarch/memcpy-erms.S: New.
* sysdeps/x86_64/multiarch/memmove-erms.S: Likewise.
* sysdeps/x86_64/multiarch/mempcpy-erms.S: Likewise.
* sysdeps/x86_64/multiarch/memset-erms.S: Likewise.
* sysdeps/x86_64/multiarch/memcpy.S: Enable ERMS optimization
for Fast_ERMS.
* sysdeps/x86_64/multiarch/memcpy_chk.S: Likewise.
* sysdeps/x86_64/multiarch/memmove.c: Likewise.
* sysdeps/x86_64/multiarch/memmove_chk.c: Likewise.
* sysdeps/x86_64/multiarch/mempcpy.S: Likewise.
* sysdeps/x86_64/multiarch/mempcpy_chk.S: Likewise.
* sysdeps/x86_64/multiarch/memset.S: Likewise.
* sysdeps/x86_64/multiarch/memset_chk.S: Likewise.
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index d234f4a..2f29a2a 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -20,7 +20,9 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 \
strcat-sse2-unaligned strncat-sse2-unaligned \
strchr-sse2-no-bsf memcmp-ssse3 strstr-sse2-unaligned \
strcspn-c strpbrk-c strspn-c varshift memset-avx2 \
- memset-avx512-no-vzeroupper
+ memset-avx512-no-vzeroupper \
+ memcpy-erms mempcpy-erms memmove-erms \
+ memset-erms
CFLAGS-varshift.c += -msse4
CFLAGS-strcspn-c.c += -msse4
CFLAGS-strpbrk-c.c += -msse4
diff --git a/sysdeps/x86_64/multiarch/memcpy-erms.S b/sysdeps/x86_64/multiarch/memcpy-erms.S
new file mode 100644
index 0000000..f0595d6
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memcpy-erms.S
@@ -0,0 +1,71 @@
+/* memcpy with Enhanced REP MOVSB/STOSB
+ Copyright (C) 2014 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, write to the Free
+ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ 02111-1307 USA. */
+
+#include <sysdep.h>
+
+#if !defined NOT_IN_libc \
+ && (defined SHARED \
+ || defined USE_AS_MEMMOVE \
+ || !defined USE_MULTIARCH)
+
+# include "asm-syntax.h"
+
+# ifndef MEMCPY
+# define MEMCPY __memcpy_erms
+# define MEMCPY_CHK __memcpy_chk_erms
+# endif
+
+ .section .text.erms,"ax",@progbits
+# if !defined USE_AS_BCOPY
+ENTRY (MEMCPY_CHK)
+ cmpq %rdx, %rcx
+ jb HIDDEN_JUMPTARGET (__chk_fail)
+END (MEMCPY_CHK)
+# endif
+
+ENTRY (MEMCPY)
+ mov %rdi, %rax
+ mov %rdx, %rcx
+# ifdef USE_AS_MEMPCPY
+ add %rdx, %rax
+# endif
+
+# ifdef USE_AS_MEMMOVE
+ cmp %rsi, %rdi
+ ja L(copy_backward)
+ je L(bwd_write_0bytes)
+# endif
+
+ rep movsb
+ ret
+
+# ifdef USE_AS_MEMMOVE
+L(copy_backward):
+ lea -1(%rdi,%rdx), %rdi
+ lea -1(%rsi,%rdx), %rsi
+ std
+ rep movsb
+ cld
+L(bwd_write_0bytes):
+ ret
+# endif
+
+END (MEMCPY)
+
+#endif
diff --git a/sysdeps/x86_64/multiarch/memcpy.S b/sysdeps/x86_64/multiarch/memcpy.S
index 8882590..58d9223 100644
--- a/sysdeps/x86_64/multiarch/memcpy.S
+++ b/sysdeps/x86_64/multiarch/memcpy.S
@@ -30,6 +30,9 @@
ENTRY(__new_memcpy)
.type __new_memcpy, @gnu_indirect_function
LOAD_RTLD_GLOBAL_RO_RDX
+ lea __memcpy_erms(%rip), %RAX_LP
+ HAS_ARCH_FEATURE (Fast_ERMS)
+ jnz 2f
#ifdef HAVE_AVX512_ASM_SUPPORT
HAS_ARCH_FEATURE (AVX512F_Usable)
jz 1f
diff --git a/sysdeps/x86_64/multiarch/memcpy_chk.S b/sysdeps/x86_64/multiarch/memcpy_chk.S
index 648217e..0e21c09 100644
--- a/sysdeps/x86_64/multiarch/memcpy_chk.S
+++ b/sysdeps/x86_64/multiarch/memcpy_chk.S
@@ -30,6 +30,9 @@
ENTRY(__memcpy_chk)
.type __memcpy_chk, @gnu_indirect_function
LOAD_RTLD_GLOBAL_RO_RDX
+ leaq __memcpy_chk_erms(%rip), %rax
+ HAS_ARCH_FEATURE (Fast_ERMS)
+ jnz 2f
#ifdef HAVE_AVX512_ASM_SUPPORT
HAS_ARCH_FEATURE (AVX512F_Usable)
jz 1f
diff --git a/sysdeps/x86_64/multiarch/memmove-erms.S b/sysdeps/x86_64/multiarch/memmove-erms.S
new file mode 100644
index 0000000..357289a
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memmove-erms.S
@@ -0,0 +1,4 @@
+#define USE_AS_MEMMOVE
+#define MEMCPY __memmove_erms
+#define MEMCPY_CHK __memmove_chk_erms
+#include "memcpy-erms.S"
diff --git a/sysdeps/x86_64/multiarch/memmove.c b/sysdeps/x86_64/multiarch/memmove.c
index 8da5640..3777bea 100644
--- a/sysdeps/x86_64/multiarch/memmove.c
+++ b/sysdeps/x86_64/multiarch/memmove.c
@@ -35,6 +35,7 @@
extern __typeof (__redirect_memmove) __memmove_sse2 attribute_hidden;
extern __typeof (__redirect_memmove) __memmove_ssse3 attribute_hidden;
extern __typeof (__redirect_memmove) __memmove_ssse3_back attribute_hidden;
+extern __typeof (__redirect_memmove) __memmove_erms attribute_hidden;
extern __typeof (__redirect_memmove) __memmove_avx_unaligned attribute_hidden;
# ifdef HAVE_AVX512_ASM_SUPPORT
extern __typeof (__redirect_memmove) __memmove_avx512_no_vzeroupper attribute_hidden;
@@ -52,6 +53,9 @@ extern __typeof (__redirect_memmove) __memmove_avx_unaligned attribute_hidden;
ifunc symbol properly. */
extern __typeof (__redirect_memmove) __libc_memmove;
libc_ifunc (__libc_memmove,
+ HAS_ARCH_FEATURE (Fast_ERMS)
+ ? __memmove_erms
+ : (
#ifdef HAVE_AVX512_ASM_SUPPORT
HAS_ARCH_FEATURE (AVX512F_Usable)
&& HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER)
@@ -63,7 +67,7 @@ libc_ifunc (__libc_memmove,
: (HAS_CPU_FEATURE (SSSE3)
? (HAS_ARCH_FEATURE (Fast_Copy_Backward)
? __memmove_ssse3_back : __memmove_ssse3)
- : __memmove_sse2)));
+ : __memmove_sse2))));
strong_alias (__libc_memmove, memmove)
diff --git a/sysdeps/x86_64/multiarch/memmove_chk.c b/sysdeps/x86_64/multiarch/memmove_chk.c
index f64da63..4cd360a 100644
--- a/sysdeps/x86_64/multiarch/memmove_chk.c
+++ b/sysdeps/x86_64/multiarch/memmove_chk.c
@@ -25,6 +25,7 @@
extern __typeof (__memmove_chk) __memmove_chk_sse2 attribute_hidden;
extern __typeof (__memmove_chk) __memmove_chk_ssse3 attribute_hidden;
extern __typeof (__memmove_chk) __memmove_chk_ssse3_back attribute_hidden;
+extern __typeof (__memmove_chk) __memmove_chk_erms attribute_hidden;
extern __typeof (__memmove_chk) __memmove_chk_avx_unaligned attribute_hidden;
# ifdef HAVE_AVX512_ASM_SUPPORT
extern __typeof (__memmove_chk) __memmove_chk_avx512_no_vzeroupper attribute_hidden;
@@ -33,6 +34,9 @@ extern __typeof (__memmove_chk) __memmove_chk_avx_unaligned attribute_hidden;
#include "debug/memmove_chk.c"
libc_ifunc (__memmove_chk,
+ HAS_ARCH_FEATURE (Fast_ERMS)
+ ? __memmove_chk_erms
+ : (
#ifdef HAVE_AVX512_ASM_SUPPORT
HAS_ARCH_FEATURE (AVX512F_Usable)
&& HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER)
@@ -43,4 +47,4 @@ libc_ifunc (__memmove_chk,
(HAS_CPU_FEATURE (SSSE3)
? (HAS_ARCH_FEATURE (Fast_Copy_Backward)
? __memmove_chk_ssse3_back : __memmove_chk_ssse3)
- : __memmove_chk_sse2));
+ : __memmove_chk_sse2)));
diff --git a/sysdeps/x86_64/multiarch/mempcpy-erms.S b/sysdeps/x86_64/multiarch/mempcpy-erms.S
new file mode 100644
index 0000000..01d3bf8
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/mempcpy-erms.S
@@ -0,0 +1,4 @@
+#define USE_AS_MEMPCPY
+#define MEMCPY __mempcpy_erms
+#define MEMCPY_CHK __mempcpy_chk_erms
+#include "memcpy-erms.S"
diff --git a/sysdeps/x86_64/multiarch/mempcpy.S b/sysdeps/x86_64/multiarch/mempcpy.S
index ed78623..b85cf27 100644
--- a/sysdeps/x86_64/multiarch/mempcpy.S
+++ b/sysdeps/x86_64/multiarch/mempcpy.S
@@ -28,6 +28,9 @@
ENTRY(__mempcpy)
.type __mempcpy, @gnu_indirect_function
LOAD_RTLD_GLOBAL_RO_RDX
+ leaq __mempcpy_erms(%rip), %rax
+ HAS_ARCH_FEATURE (Fast_ERMS)
+ jnz 2f
#ifdef HAVE_AVX512_ASM_SUPPORT
HAS_ARCH_FEATURE (AVX512F_Usable)
jz 1f
diff --git a/sysdeps/x86_64/multiarch/mempcpy_chk.S b/sysdeps/x86_64/multiarch/mempcpy_chk.S
index 6e8a89d..de888f3 100644
--- a/sysdeps/x86_64/multiarch/mempcpy_chk.S
+++ b/sysdeps/x86_64/multiarch/mempcpy_chk.S
@@ -30,6 +30,9 @@
ENTRY(__mempcpy_chk)
.type __mempcpy_chk, @gnu_indirect_function
LOAD_RTLD_GLOBAL_RO_RDX
+ leaq __mempcpy_chk_erms(%rip), %rax
+ HAS_ARCH_FEATURE (Fast_ERMS)
+ jnz 2f
#ifdef HAVE_AVX512_ASM_SUPPORT
HAS_ARCH_FEATURE (AVX512F_Usable)
jz 1f
diff --git a/sysdeps/x86_64/multiarch/memset-erms.S b/sysdeps/x86_64/multiarch/memset-erms.S
new file mode 100644
index 0000000..af9f80b
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memset-erms.S
@@ -0,0 +1,40 @@
+/* memset with Enhanced REP MOVSB/STOSB
+ Copyright (C) 2014 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, write to the Free
+ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ 02111-1307 USA. */
+
+#include <sysdep.h>
+
+#ifndef NOT_IN_libc
+
+ .text
+# ifdef SHARED
+ENTRY (__memset_chk_erms)
+ cmpq %rdx, %rcx
+ jb HIDDEN_JUMPTARGET (__chk_fail)
+END (__memset_chk_erms)
+# endif
+
+ENTRY (__memset_erms)
+ mov %rdx, %rcx
+ movzbl %sil, %eax
+ mov %rdi, %rdx
+ rep stosb
+ mov %rdx, %rax
+ ret
+END (__memset_erms)
+#endif
diff --git a/sysdeps/x86_64/multiarch/memset.S b/sysdeps/x86_64/multiarch/memset.S
index 8e3b9b9..dda8185 100644
--- a/sysdeps/x86_64/multiarch/memset.S
+++ b/sysdeps/x86_64/multiarch/memset.S
@@ -26,6 +26,9 @@
ENTRY(memset)
.type memset, @gnu_indirect_function
LOAD_RTLD_GLOBAL_RO_RDX
+ leaq __memset_erms(%rip), %rax
+ HAS_ARCH_FEATURE (Fast_ERMS)
+ jnz 2f
leaq __memset_sse2(%rip), %rax
HAS_ARCH_FEATURE (AVX2_Usable)
jz 2f
diff --git a/sysdeps/x86_64/multiarch/memset_chk.S b/sysdeps/x86_64/multiarch/memset_chk.S
index 9a7b270..b8c9940 100644
--- a/sysdeps/x86_64/multiarch/memset_chk.S
+++ b/sysdeps/x86_64/multiarch/memset_chk.S
@@ -26,6 +26,9 @@
ENTRY(__memset_chk)
.type __memset_chk, @gnu_indirect_function
LOAD_RTLD_GLOBAL_RO_RDX
+ leaq __memset_chk_erms(%rip), %rax
+ HAS_ARCH_FEATURE (Fast_ERMS)
+ jnz 2f
leaq __memset_chk_sse2(%rip), %rax
HAS_ARCH_FEATURE (AVX2_Usable)
jz 2f
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=cd8fa6c74a5f122a242c412b5ea526a31f13855a
commit cd8fa6c74a5f122a242c412b5ea526a31f13855a
Author: H.J. Lu <hjl.tools@gmail.com>
Date: Thu Sep 15 15:47:01 2011 -0700
Initial ERMS support
* sysdeps/x86/cpu-features.h (bit_arch_Fast_ERMS): New.
(bit_cpu_ERMS): Likewise.
(index_cpu_ERMS): Likewise.
(index_arch_Fast_ERMS): Likewise.
(reg_ERMS): Likewise.
diff --git a/sysdeps/x86/cpu-features.h b/sysdeps/x86/cpu-features.h
index e06eb7e..1c09712 100644
--- a/sysdeps/x86/cpu-features.h
+++ b/sysdeps/x86/cpu-features.h
@@ -35,6 +35,7 @@
#define bit_arch_I686 (1 << 15)
#define bit_arch_Prefer_MAP_32BIT_EXEC (1 << 16)
#define bit_arch_Prefer_No_VZEROUPPER (1 << 17)
+#define bit_arch_Fast_ERMS (1 << 18)
/* CPUID Feature flags. */
@@ -52,6 +53,7 @@
#define bit_cpu_FMA4 (1 << 16)
/* COMMON_CPUID_INDEX_7. */
+#define bit_cpu_ERMS (1 << 9)
#define bit_cpu_RTM (1 << 11)
#define bit_cpu_AVX2 (1 << 5)
#define bit_cpu_AVX512F (1 << 16)
@@ -83,6 +85,7 @@
# define index_cpu_SSE4_2 COMMON_CPUID_INDEX_1*CPUID_SIZE+CPUID_ECX_OFFSET
# define index_cpu_AVX COMMON_CPUID_INDEX_1*CPUID_SIZE+CPUID_ECX_OFFSET
# define index_cpu_AVX2 COMMON_CPUID_INDEX_7*CPUID_SIZE+CPUID_EBX_OFFSET
+# define index_cpu_ERMS COMMON_CPUID_INDEX_7*CPUID_SIZE+CPUID_EBX_OFFSET
# define index_arch_Fast_Rep_String FEATURE_INDEX_1*FEATURE_SIZE
# define index_arch_Fast_Copy_Backward FEATURE_INDEX_1*FEATURE_SIZE
@@ -101,6 +104,7 @@
# define index_arch_I686 FEATURE_INDEX_1*FEATURE_SIZE
# define index_arch_Prefer_MAP_32BIT_EXEC FEATURE_INDEX_1*FEATURE_SIZE
# define index_arch_Prefer_No_VZEROUPPER FEATURE_INDEX_1*FEATURE_SIZE
+# define index_arch_Fast_ERMS FEATURE_INDEX_1*FEATURE_SIZE
# if defined (_LIBC) && !IS_IN (nonlib)
@@ -226,6 +230,7 @@ extern const struct cpu_features *__get_cpu_features (void)
# define index_cpu_AVX2 COMMON_CPUID_INDEX_7
# define index_cpu_AVX512F COMMON_CPUID_INDEX_7
# define index_cpu_AVX512DQ COMMON_CPUID_INDEX_7
+# define index_cpu_ERMS COMMON_CPUID_INDEX_7
# define index_cpu_RTM COMMON_CPUID_INDEX_7
# define index_cpu_FMA COMMON_CPUID_INDEX_1
# define index_cpu_FMA4 COMMON_CPUID_INDEX_80000001
@@ -242,6 +247,7 @@ extern const struct cpu_features *__get_cpu_features (void)
# define reg_AVX2 ebx
# define reg_AVX512F ebx
# define reg_AVX512DQ ebx
+# define reg_ERMS ebx
# define reg_RTM ebx
# define reg_FMA ecx
# define reg_FMA4 ecx
@@ -265,6 +271,7 @@ extern const struct cpu_features *__get_cpu_features (void)
# define index_arch_I686 FEATURE_INDEX_1
# define index_arch_Prefer_MAP_32BIT_EXEC FEATURE_INDEX_1
# define index_arch_Prefer_No_VZEROUPPER FEATURE_INDEX_1
+# define index_arch_Fast_ERMS FEATURE_INDEX_1
#endif /* !__ASSEMBLER__ */
-----------------------------------------------------------------------
hooks/post-receive
--
GNU C Library master sources