This is the mail archive of the
glibc-cvs@sourceware.org
mailing list for the glibc project.
GNU C Library master sources branch hjl/erms/master created. glibc-2.23-131-g0f7985d
- From: hjl at sourceware dot org
- To: glibc-cvs at sourceware dot org
- Date: 28 Mar 2016 13:16:48 -0000
- Subject: GNU C Library master sources branch hjl/erms/master created. glibc-2.23-131-g0f7985d
This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "GNU C Library master sources".
The branch, hjl/erms/master has been created
at 0f7985d681251d1a2c4a0e8a1bee97092a7dfccf (commit)
- Log -----------------------------------------------------------------
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=0f7985d681251d1a2c4a0e8a1bee97092a7dfccf
commit 0f7985d681251d1a2c4a0e8a1bee97092a7dfccf
Author: H.J. Lu <hjl.tools@gmail.com>
Date: Fri Mar 25 08:20:17 2016 -0700
Add x86-64 memset with vector unaliged stores and rep stosb
* sysdeps/x86_64/multiarch/Makefile (sysdep_routines): Add
memset-sse2-unaligned-erms, memset-avx2-unaligned-erms and
memset-avx512-unaligned-erms.
* sysdeps/x86_64/multiarch/ifunc-impl-list.c
(__libc_ifunc_impl_list): Test __memset_chk_sse2_unaligned,
__memset_chk_sse2_unaligned_erms, __memset_chk_avx2_unaligned,
__memset_chk_avx2_unaligned_erms, __memset_chk_avx512_unaligned,
__memset_chk_avx512_unaligned_erms, __memset_sse2_unaligned,
__memset_sse2_unaligned_erms, __memset_erms,
__memset_avx2_unaligned, __memset_avx2_unaligned_erms,
__memset_avx512_unaligned_erms and __memset_avx512_unaligned.
* sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S: New
file.
* sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S:
Likewise.
* sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S:
Likewise.
* sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S:
Likewise.
* sysdeps/x86_64/multiarch/memset.S (memset): Support ERMS.
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index ef4dbc0..8878efb 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -23,7 +23,10 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 \
memset-avx512-no-vzeroupper \
memmove-sse2-unaligned-erms \
memmove-avx-unaligned-erms \
- memmove-avx512-unaligned-erms
+ memmove-avx512-unaligned-erms \
+ memset-sse2-unaligned-erms \
+ memset-avx2-unaligned-erms \
+ memset-avx512-unaligned-erms
CFLAGS-varshift.c += -msse4
CFLAGS-strcspn-c.c += -msse4
CFLAGS-strpbrk-c.c += -msse4
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index 9204da4..1e880f6 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -118,12 +118,28 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL (i, name, __memset_chk,
IFUNC_IMPL_ADD (array, i, __memset_chk, 1,
__memset_chk_sse2)
+ IFUNC_IMPL_ADD (array, i, __memset_chk, 1,
+ __memset_chk_sse2_unaligned)
+ IFUNC_IMPL_ADD (array, i, __memset_chk, 1,
+ __memset_chk_sse2_unaligned_erms)
IFUNC_IMPL_ADD (array, i, __memset_chk,
HAS_ARCH_FEATURE (AVX2_Usable),
__memset_chk_avx2)
+ IFUNC_IMPL_ADD (array, i, __memset_chk,
+ HAS_ARCH_FEATURE (AVX2_Usable),
+ __memset_chk_avx2_unaligned)
+ IFUNC_IMPL_ADD (array, i, __memset_chk,
+ HAS_ARCH_FEATURE (AVX2_Usable),
+ __memset_chk_avx2_unaligned_erms)
#ifdef HAVE_AVX512_ASM_SUPPORT
IFUNC_IMPL_ADD (array, i, __memset_chk,
HAS_ARCH_FEATURE (AVX512F_Usable),
+ __memset_chk_avx512_unaligned_erms)
+ IFUNC_IMPL_ADD (array, i, __memset_chk,
+ HAS_ARCH_FEATURE (AVX512F_Usable),
+ __memset_chk_avx512_unaligned)
+ IFUNC_IMPL_ADD (array, i, __memset_chk,
+ HAS_ARCH_FEATURE (AVX512F_Usable),
__memset_chk_avx512_no_vzeroupper)
#endif
)
@@ -131,12 +147,29 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/x86_64/multiarch/memset.S. */
IFUNC_IMPL (i, name, memset,
IFUNC_IMPL_ADD (array, i, memset, 1, __memset_sse2)
+ IFUNC_IMPL_ADD (array, i, memset, 1,
+ __memset_sse2_unaligned)
+ IFUNC_IMPL_ADD (array, i, memset, 1,
+ __memset_sse2_unaligned_erms)
+ IFUNC_IMPL_ADD (array, i, memset, 1, __memset_erms)
IFUNC_IMPL_ADD (array, i, memset,
HAS_ARCH_FEATURE (AVX2_Usable),
__memset_avx2)
+ IFUNC_IMPL_ADD (array, i, memset,
+ HAS_ARCH_FEATURE (AVX2_Usable),
+ __memset_avx2_unaligned)
+ IFUNC_IMPL_ADD (array, i, memset,
+ HAS_ARCH_FEATURE (AVX2_Usable),
+ __memset_avx2_unaligned_erms)
#ifdef HAVE_AVX512_ASM_SUPPORT
IFUNC_IMPL_ADD (array, i, memset,
HAS_ARCH_FEATURE (AVX512F_Usable),
+ __memset_avx512_unaligned_erms)
+ IFUNC_IMPL_ADD (array, i, memset,
+ HAS_ARCH_FEATURE (AVX512F_Usable),
+ __memset_avx512_unaligned)
+ IFUNC_IMPL_ADD (array, i, memset,
+ HAS_ARCH_FEATURE (AVX512F_Usable),
__memset_avx512_no_vzeroupper)
#endif
)
diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
new file mode 100644
index 0000000..e0dc565
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
@@ -0,0 +1,14 @@
+#define VEC_SIZE 32
+#define VEC(i) ymm##i
+#define VMOVU vmovdqu
+#define VMOVA vmovdqa
+
+#define VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
+ vmovd d, %xmm0; \
+ movq r, %rax; \
+ vpbroadcastb %xmm0, %ymm0
+
+#define SECTION(p) p##.avx
+#define MEMSET_SYMBOL(p,s) p##_avx2_##s
+
+#include "memset-vec-unaligned-erms.S"
diff --git a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
new file mode 100644
index 0000000..72f4095
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
@@ -0,0 +1,17 @@
+#ifdef HAVE_AVX512_ASM_SUPPORT
+# define VEC_SIZE 64
+# define VEC(i) zmm##i
+# define VMOVU vmovdqu64
+# define VMOVA vmovdqa64
+
+# define VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
+ vmovd d, %xmm0; \
+ movq r, %rax; \
+ vpbroadcastb %xmm0, %xmm0; \
+ vpbroadcastq %xmm0, %zmm0
+
+# define SECTION(p) p##.avx512
+# define MEMSET_SYMBOL(p,s) p##_avx512_##s
+
+# include "memset-vec-unaligned-erms.S"
+#endif
diff --git a/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S
new file mode 100644
index 0000000..437a858
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S
@@ -0,0 +1,16 @@
+#define VEC_SIZE 16
+#define VEC(i) xmm##i
+#define VMOVU movdqu
+#define VMOVA movdqa
+
+#define VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
+ movd d, %xmm0; \
+ movq r, %rax; \
+ punpcklbw %xmm0, %xmm0; \
+ punpcklwd %xmm0, %xmm0; \
+ pshufd $0, %xmm0, %xmm0
+
+#define SECTION(p) p
+#define MEMSET_SYMBOL(p,s) p##_sse2_##s
+
+#include "memset-vec-unaligned-erms.S"
diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
new file mode 100644
index 0000000..b2ae18b
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
@@ -0,0 +1,234 @@
+/* memset/bzero with vector unaliged loads and rep stosb
+ Copyright (C) 2016 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+/* memset is implemented as:
+ 1. Overlap one integer/VEC register store to avoid branch.
+ 2. Force 32-bit displacement for branches to avoid long nop between
+ instructions.
+ 3. If size is less than VEC, use integer register stores.
+ 4. If size is from VEC_SIZE to 2 * VEC_SIZE, use 2 VEC stores.
+ 5. If size is from 2 * VEC_SIZE to 4 * VEC_SIZE, use 4 VEC stores.
+ 6. If size is more to 4 * VEC_SIZE, align to 4 * VEC_SIZE with
+ 4 VEC stores and store 4 * VEC at a time until done.
+ */
+#include <sysdep.h>
+
+#ifndef VZEROUPPER
+# if VEC_SIZE > 16
+# define MOVQ vmovq
+# define VZEROUPPER vzeroupper
+# define VZEROUPPER_SHORT_RETURN vzeroupper
+# else
+# define MOVQ movq
+# define VZEROUPPER
+# define VZEROUPPER_SHORT_RETURN rep
+# endif
+#endif
+
+/* Threshold to use Enhanced REP STOSB. */
+#ifndef REP_STOSB_THRESHOLD
+# define REP_STOSB_THRESHOLD (1024 * (VEC_SIZE / 16))
+#endif
+
+#ifndef SECTION
+# error SECTION is not defined!
+#endif
+
+#if !defined USE_MULTIARCH && IS_IN (libc)
+ .section SECTION(.text),"ax",@progbits
+ENTRY (__bzero)
+ movq %rdi, %rax /* Set return value. */
+ movq %rsi, %rdx /* Set n. */
+ pxor %xmm0, %xmm0
+ jmp L(entry_from_bzero)
+END (__bzero)
+weak_alias (__bzero, bzero)
+#endif
+
+#if defined SHARED && IS_IN (libc)
+ENTRY_CHK (MEMSET_SYMBOL (__memset_chk, unaligned))
+ cmpq %rdx, %rcx
+ jb HIDDEN_JUMPTARGET (__chk_fail)
+END_CHK (MEMSET_SYMBOL (__memset_chk, unaligned))
+#endif
+
+ENTRY (MEMSET_SYMBOL (__memset, unaligned))
+L(memset_entry):
+ VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
+L(entry_from_bzero):
+ cmpq $VEC_SIZE, %rdx
+ jb L(less_vec)
+ cmpq $(VEC_SIZE * 2), %rdx
+ ja L(more_2x_vec)
+ /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */
+ VMOVU %VEC(0), -VEC_SIZE(%rdi,%rdx)
+ VMOVU %VEC(0), (%rdi)
+ VZEROUPPER
+ ret
+END (MEMSET_SYMBOL (__memset, unaligned))
+
+#if VEC_SIZE == 16
+/* Only used to measure performance of REP STOSB. */
+ENTRY (__memset_erms)
+#else
+/* Provide a symbol to debugger. */
+ENTRY (MEMSET_SYMBOL (__memset, erms))
+#endif
+L(stosb):
+ movq %rdx, %rcx
+ movzbl %sil, %eax
+ movq %rdi, %rdx
+ rep stosb
+ movq %rdx, %rax
+ ret
+#if VEC_SIZE == 16
+END (__memset_erms)
+#else
+END (MEMSET_SYMBOL (__memset, erms))
+#endif
+
+#if defined SHARED && IS_IN (libc)
+ENTRY_CHK (MEMSET_SYMBOL (__memset_chk, unaligned_erms))
+ cmpq %rdx, %rcx
+ jb HIDDEN_JUMPTARGET (__chk_fail)
+END_CHK (MEMSET_SYMBOL (__memset_chk, unaligned_erms))
+#endif
+
+ENTRY (MEMSET_SYMBOL (__memset, unaligned_erms))
+ VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
+ cmpq $VEC_SIZE, %rdx
+ jb L(less_vec)
+ cmpq $(VEC_SIZE * 2), %rdx
+ ja L(stosb_more_2x_vec)
+ /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */
+ VMOVU %VEC(0), -VEC_SIZE(%rdi,%rdx)
+ VMOVU %VEC(0), (%rdi)
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(stosb_more_2x_vec):
+ cmpq $REP_STOSB_THRESHOLD, %rdx
+ /* Force 32-bit displacement to avoid long nop between
+ instructions. */
+ ja.d32 L(stosb)
+ .p2align 4
+L(more_2x_vec):
+ cmpq $(VEC_SIZE * 4), %rdx
+ ja L(loop_start)
+ VMOVU %VEC(0), (%rdi)
+ VMOVU %VEC(0), VEC_SIZE(%rdi)
+ VMOVU %VEC(0), -VEC_SIZE(%rdi,%rdx)
+ VMOVU %VEC(0), -(VEC_SIZE * 2)(%rdi,%rdx)
+L(return):
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(loop_start):
+ leaq (VEC_SIZE * 4)(%rdi), %rcx
+ VMOVU %VEC(0), (%rdi)
+ andq $-(VEC_SIZE * 4), %rcx
+ VMOVU %VEC(0), -VEC_SIZE(%rdi,%rdx)
+ VMOVU %VEC(0), VEC_SIZE(%rdi)
+ VMOVU %VEC(0), -(VEC_SIZE * 2)(%rdi,%rdx)
+ VMOVU %VEC(0), (VEC_SIZE * 2)(%rdi)
+ VMOVU %VEC(0), -(VEC_SIZE * 3)(%rdi,%rdx)
+ VMOVU %VEC(0), (VEC_SIZE * 3)(%rdi)
+ VMOVU %VEC(0), -(VEC_SIZE * 4)(%rdi,%rdx)
+ addq %rdi, %rdx
+ andq $-(VEC_SIZE * 4), %rdx
+ cmpq %rdx, %rcx
+# if VEC_SIZE == 32 || VEC_SIZE == 64
+ /* Force 32-bit displacement to avoid long nop between
+ instructions. */
+ je.d32 L(return)
+# else
+ je L(return)
+# endif
+ .p2align 4
+L(loop):
+ VMOVA %VEC(0), (%rcx)
+ VMOVA %VEC(0), VEC_SIZE(%rcx)
+ VMOVA %VEC(0), (VEC_SIZE * 2)(%rcx)
+ VMOVA %VEC(0), (VEC_SIZE * 3)(%rcx)
+ addq $(VEC_SIZE * 4), %rcx
+ cmpq %rcx, %rdx
+ jne L(loop)
+ VZEROUPPER_SHORT_RETURN
+ ret
+L(less_vec):
+ /* Less than 1 VEC. */
+# if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
+# error Unsupported VEC_SIZE!
+# endif
+# if VEC_SIZE > 32
+ cmpb $32, %dl
+ jae L(between_32_63)
+# endif
+# if VEC_SIZE > 16
+ cmpb $16, %dl
+ jae L(between_16_31)
+# endif
+ MOVQ %xmm0, %rcx
+ cmpb $8, %dl
+ jae L(between_8_15)
+ cmpb $4, %dl
+ jae L(between_4_7)
+ cmpb $1, %dl
+ ja L(between_2_3)
+ jb 1f
+ movb %cl, (%rdi)
+1:
+ VZEROUPPER
+ ret
+# if VEC_SIZE > 32
+ /* From 32 to 63. No branch when size == 32. */
+L(between_32_63):
+ vmovdqu %ymm0, -32(%rdi,%rdx)
+ vmovdqu %ymm0, (%rdi)
+ VZEROUPPER
+ ret
+# endif
+# if VEC_SIZE > 16
+ /* From 16 to 31. No branch when size == 16. */
+L(between_16_31):
+ vmovdqu %xmm0, -16(%rdi,%rdx)
+ vmovdqu %xmm0, (%rdi)
+ VZEROUPPER
+ ret
+# endif
+ /* From 8 to 15. No branch when size == 8. */
+L(between_8_15):
+ movq %rcx, -8(%rdi,%rdx)
+ movq %rcx, (%rdi)
+ VZEROUPPER
+ ret
+L(between_4_7):
+ /* From 4 to 7. No branch when size == 4. */
+ movl %ecx, -4(%rdi,%rdx)
+ movl %ecx, (%rdi)
+ VZEROUPPER
+ ret
+L(between_2_3):
+ /* From 2 to 3. No branch when size == 2. */
+ movw %cx, -2(%rdi,%rdx)
+ movw %cx, (%rdi)
+ VZEROUPPER
+ ret
+END (MEMSET_SYMBOL (__memset, unaligned_erms))
diff --git a/sysdeps/x86_64/multiarch/memset.S b/sysdeps/x86_64/multiarch/memset.S
index 8e3b9b9..7080e2d 100644
--- a/sysdeps/x86_64/multiarch/memset.S
+++ b/sysdeps/x86_64/multiarch/memset.S
@@ -26,10 +26,18 @@
ENTRY(memset)
.type memset, @gnu_indirect_function
LOAD_RTLD_GLOBAL_RO_RDX
+ leaq __memset_sse2_unaligned_erms(%rip), %rax
+ HAS_CPU_FEATURE (ERMS)
+ jnz 1f
leaq __memset_sse2(%rip), %rax
+1:
HAS_ARCH_FEATURE (AVX2_Usable)
jz 2f
+ leaq __memset_avx2_unaligned_erms(%rip), %rax
+ HAS_CPU_FEATURE (ERMS)
+ jnz L(AVX512F)
leaq __memset_avx2(%rip), %rax
+L(AVX512F):
#ifdef HAVE_AVX512_ASM_SUPPORT
HAS_ARCH_FEATURE (AVX512F_Usable)
jz 2f
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=70579c5ae8056ff4e0c30a063ace064b7b45f50a
commit 70579c5ae8056ff4e0c30a063ace064b7b45f50a
Author: H.J. Lu <hjl.tools@gmail.com>
Date: Fri Mar 18 12:36:03 2016 -0700
Add x86-64 memmove with vector unaliged loads and rep movsb
1. __mempcpy_avx_unaligned if AVX_Fast_Unaligned_Load bit is set.
2. __mempcpy_sse2_unaligned if Fast_Unaligned_Load bit is set.
3. __mempcpy_sse2 if SSSE3 isn't available.
4. __mempcpy_ssse3_back if Fast_Copy_Backward bit it set.
5. __mempcpy_ssse3
[BZ #19776]
* sysdeps/x86/cpu-features.c (init_cpu_features): Set
bit_arch_Hybrid_ERMS if ERMS is supported.
* sysdeps/x86/cpu-features.h (bit_arch_Hybrid_ERMS): New.
(index_arch_Hybrid_ERMS): Likewise.
* sysdeps/x86_64/multiarch/Makefile (sysdep_routines): Add
memmove-sse2-unaligned-erms, memmove-avx-unaligned-erms and
memmove-avx512-unaligned-erms.
* sysdeps/x86_64/multiarch/ifunc-impl-list.c
(__libc_ifunc_impl_list): Test
__memmove_chk_avx512_unaligned_2,
__memmove_chk_avx512_unaligned_erms,
__memmove_chk_avx_unaligned_2, __memmove_chk_avx_unaligned_erms,
__memmove_chk_sse2_unaligned_2,
__memmove_chk_sse2_unaligned_erms, __memmove_avx_unaligned_2,
__memmove_avx_unaligned_erms, __memmove_avx512_unaligned_2,
__memmove_avx512_unaligned_erms, __memmove_erms,
__memmove_sse2_unaligned_2, __memmove_sse2_unaligned_erms,
__memcpy_chk_avx512_unaligned_2,
__memcpy_chk_avx512_unaligned_erms,
__memcpy_chk_avx_unaligned_2, __memcpy_chk_avx_unaligned_erms,
__memcpy_chk_sse2_unaligned_2,
__memcpy_chk_sse2_unaligned_erms, __memcpy_avx_unaligned_2,
__memcpy_avx_unaligned_erms, __memcpy_avx512_unaligned_2,
__memcpy_avx512_unaligned_erms, __memcpy_sse2_unaligned_2,
__memcpy_sse2_unaligned_erms, __memcpy_erms,
__mempcpy_chk_avx512_unaligned_2,
__mempcpy_chk_avx512_unaligned_erms,
__mempcpy_chk_avx_unaligned_2, __mempcpy_chk_avx_unaligned_erms,
__mempcpy_chk_sse2_unaligned_2, __mempcpy_chk_sse2_unaligned_erms,
__mempcpy_avx512_unaligned_2, __mempcpy_avx512_unaligned_erms,
__mempcpy_avx_unaligned_2, __mempcpy_avx_unaligned_erms,
__mempcpy_sse2_unaligned_2, __mempcpy_sse2_unaligned_erms
and __mempcpy_erms.
* sysdeps/x86_64/multiarch/memcpy (__new_memcpy): Support ERMS.
* sysdeps/x86_64/multiarch/memcpy_chk.S (__memcpy_chk): Likewise.
* sysdeps/x86_64/multiarch/mempcpy.S (__mempcpy): Likewise.
* sysdeps/x86_64/multiarch/mempcpy_chk.S (__mempcpy_chk):
Likwise.
* sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S: New
file.
* sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S:
Likwise.
* sysdeps/x86_64/multiarch/memmove-sse2-unaligned-erms.S:
Likwise.
* sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S:
Likwise.
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index 7fc89c2..ef4dbc0 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -20,7 +20,10 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 \
strcat-sse2-unaligned strncat-sse2-unaligned \
strchr-sse2-no-bsf memcmp-ssse3 strstr-sse2-unaligned \
strcspn-c strpbrk-c strspn-c varshift memset-avx2 \
- memset-avx512-no-vzeroupper
+ memset-avx512-no-vzeroupper \
+ memmove-sse2-unaligned-erms \
+ memmove-avx-unaligned-erms \
+ memmove-avx512-unaligned-erms
CFLAGS-varshift.c += -msse4
CFLAGS-strcspn-c.c += -msse4
CFLAGS-strpbrk-c.c += -msse4
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index 188b6d3..9204da4 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -52,17 +52,33 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, __memmove_chk,
HAS_ARCH_FEATURE (AVX512F_Usable),
__memmove_chk_avx512_no_vzeroupper)
+ IFUNC_IMPL_ADD (array, i, __memmove_chk,
+ HAS_ARCH_FEATURE (AVX512F_Usable),
+ __memmove_chk_avx512_unaligned_2)
+ IFUNC_IMPL_ADD (array, i, __memmove_chk,
+ HAS_ARCH_FEATURE (AVX512F_Usable),
+ __memmove_chk_avx512_unaligned_erms)
#endif
IFUNC_IMPL_ADD (array, i, __memmove_chk,
HAS_ARCH_FEATURE (AVX_Usable),
__memmove_chk_avx_unaligned)
IFUNC_IMPL_ADD (array, i, __memmove_chk,
+ HAS_ARCH_FEATURE (AVX_Usable),
+ __memmove_chk_avx_unaligned_2)
+ IFUNC_IMPL_ADD (array, i, __memmove_chk,
+ HAS_ARCH_FEATURE (AVX_Usable),
+ __memmove_chk_avx_unaligned_erms)
+ IFUNC_IMPL_ADD (array, i, __memmove_chk,
HAS_CPU_FEATURE (SSSE3),
__memmove_chk_ssse3_back)
IFUNC_IMPL_ADD (array, i, __memmove_chk,
HAS_CPU_FEATURE (SSSE3),
__memmove_chk_ssse3)
IFUNC_IMPL_ADD (array, i, __memmove_chk, 1,
+ __memmove_chk_sse2_unaligned_2)
+ IFUNC_IMPL_ADD (array, i, __memmove_chk, 1,
+ __memmove_chk_sse2_unaligned_erms)
+ IFUNC_IMPL_ADD (array, i, __memmove_chk, 1,
__memmove_chk_sse2))
/* Support sysdeps/x86_64/multiarch/memmove.S. */
@@ -70,15 +86,32 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, memmove,
HAS_ARCH_FEATURE (AVX_Usable),
__memmove_avx_unaligned)
+ IFUNC_IMPL_ADD (array, i, memmove,
+ HAS_ARCH_FEATURE (AVX_Usable),
+ __memmove_avx_unaligned_2)
+ IFUNC_IMPL_ADD (array, i, memmove,
+ HAS_ARCH_FEATURE (AVX_Usable),
+ __memmove_avx_unaligned_erms)
#ifdef HAVE_AVX512_ASM_SUPPORT
IFUNC_IMPL_ADD (array, i, memmove,
HAS_ARCH_FEATURE (AVX512F_Usable),
__memmove_avx512_no_vzeroupper)
+ IFUNC_IMPL_ADD (array, i, memmove,
+ HAS_ARCH_FEATURE (AVX512F_Usable),
+ __memmove_avx512_unaligned_2)
+ IFUNC_IMPL_ADD (array, i, memmove,
+ HAS_ARCH_FEATURE (AVX512F_Usable),
+ __memmove_avx512_unaligned_erms)
#endif
IFUNC_IMPL_ADD (array, i, memmove, HAS_CPU_FEATURE (SSSE3),
__memmove_ssse3_back)
IFUNC_IMPL_ADD (array, i, memmove, HAS_CPU_FEATURE (SSSE3),
__memmove_ssse3)
+ IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_erms)
+ IFUNC_IMPL_ADD (array, i, memmove, 1,
+ __memmove_sse2_unaligned_2)
+ IFUNC_IMPL_ADD (array, i, memmove, 1,
+ __memmove_sse2_unaligned_erms)
IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_sse2))
/* Support sysdeps/x86_64/multiarch/memset_chk.S. */
@@ -267,17 +300,33 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, __memcpy_chk,
HAS_ARCH_FEATURE (AVX512F_Usable),
__memcpy_chk_avx512_no_vzeroupper)
+ IFUNC_IMPL_ADD (array, i, __memcpy_chk,
+ HAS_ARCH_FEATURE (AVX512F_Usable),
+ __memcpy_chk_avx512_unaligned_2)
+ IFUNC_IMPL_ADD (array, i, __memcpy_chk,
+ HAS_ARCH_FEATURE (AVX512F_Usable),
+ __memcpy_chk_avx512_unaligned_erms)
#endif
IFUNC_IMPL_ADD (array, i, __memcpy_chk,
HAS_ARCH_FEATURE (AVX_Usable),
__memcpy_chk_avx_unaligned)
IFUNC_IMPL_ADD (array, i, __memcpy_chk,
+ HAS_ARCH_FEATURE (AVX_Usable),
+ __memcpy_chk_avx_unaligned_2)
+ IFUNC_IMPL_ADD (array, i, __memcpy_chk,
+ HAS_ARCH_FEATURE (AVX_Usable),
+ __memcpy_chk_avx_unaligned_erms)
+ IFUNC_IMPL_ADD (array, i, __memcpy_chk,
HAS_CPU_FEATURE (SSSE3),
__memcpy_chk_ssse3_back)
IFUNC_IMPL_ADD (array, i, __memcpy_chk,
HAS_CPU_FEATURE (SSSE3),
__memcpy_chk_ssse3)
IFUNC_IMPL_ADD (array, i, __memcpy_chk, 1,
+ __memcpy_chk_sse2_unaligned_2)
+ IFUNC_IMPL_ADD (array, i, __memcpy_chk, 1,
+ __memcpy_chk_sse2_unaligned_erms)
+ IFUNC_IMPL_ADD (array, i, __memcpy_chk, 1,
__memcpy_chk_sse2))
/* Support sysdeps/x86_64/multiarch/memcpy.S. */
@@ -285,6 +334,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, memcpy,
HAS_ARCH_FEATURE (AVX_Usable),
__memcpy_avx_unaligned)
+ IFUNC_IMPL_ADD (array, i, memcpy,
+ HAS_ARCH_FEATURE (AVX_Usable),
+ __memcpy_avx_unaligned_2)
+ IFUNC_IMPL_ADD (array, i, memcpy,
+ HAS_ARCH_FEATURE (AVX_Usable),
+ __memcpy_avx_unaligned_erms)
IFUNC_IMPL_ADD (array, i, memcpy, HAS_CPU_FEATURE (SSSE3),
__memcpy_ssse3_back)
IFUNC_IMPL_ADD (array, i, memcpy, HAS_CPU_FEATURE (SSSE3),
@@ -293,8 +348,19 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, memcpy,
HAS_ARCH_FEATURE (AVX512F_Usable),
__memcpy_avx512_no_vzeroupper)
+ IFUNC_IMPL_ADD (array, i, memcpy,
+ HAS_ARCH_FEATURE (AVX512F_Usable),
+ __memcpy_avx512_unaligned_2)
+ IFUNC_IMPL_ADD (array, i, memcpy,
+ HAS_ARCH_FEATURE (AVX512F_Usable),
+ __memcpy_avx512_unaligned_erms)
#endif
IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_sse2_unaligned)
+ IFUNC_IMPL_ADD (array, i, memcpy, 1,
+ __memcpy_sse2_unaligned_2)
+ IFUNC_IMPL_ADD (array, i, memcpy, 1,
+ __memcpy_sse2_unaligned_erms)
+ IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_erms)
IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_sse2))
/* Support sysdeps/x86_64/multiarch/mempcpy_chk.S. */
@@ -303,17 +369,33 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
HAS_ARCH_FEATURE (AVX512F_Usable),
__mempcpy_chk_avx512_no_vzeroupper)
+ IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
+ HAS_ARCH_FEATURE (AVX512F_Usable),
+ __mempcpy_chk_avx512_unaligned_2)
+ IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
+ HAS_ARCH_FEATURE (AVX512F_Usable),
+ __mempcpy_chk_avx512_unaligned_erms)
#endif
IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
HAS_ARCH_FEATURE (AVX_Usable),
__mempcpy_chk_avx_unaligned)
IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
+ HAS_ARCH_FEATURE (AVX_Usable),
+ __mempcpy_chk_avx_unaligned_2)
+ IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
+ HAS_ARCH_FEATURE (AVX_Usable),
+ __mempcpy_chk_avx_unaligned_erms)
+ IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
HAS_CPU_FEATURE (SSSE3),
__mempcpy_chk_ssse3_back)
IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
HAS_CPU_FEATURE (SSSE3),
__mempcpy_chk_ssse3)
IFUNC_IMPL_ADD (array, i, __mempcpy_chk, 1,
+ __mempcpy_chk_sse2_unaligned_2)
+ IFUNC_IMPL_ADD (array, i, __mempcpy_chk, 1,
+ __mempcpy_chk_sse2_unaligned_erms)
+ IFUNC_IMPL_ADD (array, i, __mempcpy_chk, 1,
__mempcpy_chk_sse2))
/* Support sysdeps/x86_64/multiarch/mempcpy.S. */
@@ -322,14 +404,31 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, mempcpy,
HAS_ARCH_FEATURE (AVX512F_Usable),
__mempcpy_avx512_no_vzeroupper)
+ IFUNC_IMPL_ADD (array, i, mempcpy,
+ HAS_ARCH_FEATURE (AVX512F_Usable),
+ __mempcpy_avx512_unaligned_2)
+ IFUNC_IMPL_ADD (array, i, mempcpy,
+ HAS_ARCH_FEATURE (AVX512F_Usable),
+ __mempcpy_avx512_unaligned_erms)
#endif
IFUNC_IMPL_ADD (array, i, mempcpy,
HAS_ARCH_FEATURE (AVX_Usable),
__mempcpy_avx_unaligned)
+ IFUNC_IMPL_ADD (array, i, mempcpy,
+ HAS_ARCH_FEATURE (AVX_Usable),
+ __mempcpy_avx_unaligned_2)
+ IFUNC_IMPL_ADD (array, i, mempcpy,
+ HAS_ARCH_FEATURE (AVX_Usable),
+ __mempcpy_avx_unaligned_erms)
IFUNC_IMPL_ADD (array, i, mempcpy, HAS_CPU_FEATURE (SSSE3),
__mempcpy_ssse3_back)
IFUNC_IMPL_ADD (array, i, mempcpy, HAS_CPU_FEATURE (SSSE3),
__mempcpy_ssse3)
+ IFUNC_IMPL_ADD (array, i, mempcpy, 1,
+ __mempcpy_sse2_unaligned_2)
+ IFUNC_IMPL_ADD (array, i, mempcpy, 1,
+ __mempcpy_sse2_unaligned_erms)
+ IFUNC_IMPL_ADD (array, i, mempcpy, 1, __mempcpy_erms)
IFUNC_IMPL_ADD (array, i, mempcpy, 1, __mempcpy_sse2))
/* Support sysdeps/x86_64/multiarch/strncmp.S. */
diff --git a/sysdeps/x86_64/multiarch/memcpy.S b/sysdeps/x86_64/multiarch/memcpy.S
index 5b045d7..f0b0b92 100644
--- a/sysdeps/x86_64/multiarch/memcpy.S
+++ b/sysdeps/x86_64/multiarch/memcpy.S
@@ -38,12 +38,22 @@ ENTRY(__new_memcpy)
lea __memcpy_avx512_no_vzeroupper(%rip), %RAX_LP
ret
#endif
-1: lea __memcpy_avx_unaligned(%rip), %RAX_LP
+1: lea __memcpy_avx_unaligned_erms(%rip), %RAX_LP
HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load)
+ jz L(Fast_Unaligned_Load)
+ HAS_CPU_FEATURE (ERMS)
jnz 2f
- lea __memcpy_sse2_unaligned(%rip), %RAX_LP
+ lea __memcpy_avx_unaligned(%rip), %RAX_LP
+ ret
+L(Fast_Unaligned_Load):
+ lea __memcpy_sse2_unaligned_erms(%rip), %RAX_LP
HAS_ARCH_FEATURE (Fast_Unaligned_Copy)
+ jz L(SSE2)
+ HAS_CPU_FEATURE (ERMS)
jnz 2f
+ lea __memcpy_sse2_unaligned(%rip), %RAX_LP
+ ret
+L(SSE2):
lea __memcpy_sse2(%rip), %RAX_LP
HAS_CPU_FEATURE (SSSE3)
jz 2f
diff --git a/sysdeps/x86_64/multiarch/memcpy_chk.S b/sysdeps/x86_64/multiarch/memcpy_chk.S
index 648217e..03ec90c 100644
--- a/sysdeps/x86_64/multiarch/memcpy_chk.S
+++ b/sysdeps/x86_64/multiarch/memcpy_chk.S
@@ -32,22 +32,25 @@ ENTRY(__memcpy_chk)
LOAD_RTLD_GLOBAL_RO_RDX
#ifdef HAVE_AVX512_ASM_SUPPORT
HAS_ARCH_FEATURE (AVX512F_Usable)
- jz 1f
+ jz 1f
HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER)
- jz 1f
- leaq __memcpy_chk_avx512_no_vzeroupper(%rip), %rax
+ jz 1f
+ lea __memcpy_chk_avx512_no_vzeroupper(%rip), %RAX_LP
ret
#endif
-1: leaq __memcpy_chk_sse2(%rip), %rax
+1: lea __memcpy_chk_avx_unaligned(%rip), %RAX_LP
+ HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load)
+ jnz 2f
+ lea __memcpy_chk_sse2_unaligned_2(%rip), %RAX_LP
+ HAS_ARCH_FEATURE (Fast_Unaligned_Load)
+ jnz 2f
+ lea __memcpy_chk_sse2(%rip), %RAX_LP
HAS_CPU_FEATURE (SSSE3)
jz 2f
- leaq __memcpy_chk_ssse3(%rip), %rax
+ lea __memcpy_chk_ssse3_back(%rip), %RAX_LP
HAS_ARCH_FEATURE (Fast_Copy_Backward)
- jz 2f
- leaq __memcpy_chk_ssse3_back(%rip), %rax
- HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load)
- jz 2f
- leaq __memcpy_chk_avx_unaligned(%rip), %rax
+ jnz 2f
+ lea __memcpy_chk_ssse3(%rip), %RAX_LP
2: ret
END(__memcpy_chk)
# else
diff --git a/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S
new file mode 100644
index 0000000..3a72c7e
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S
@@ -0,0 +1,9 @@
+#define VEC_SIZE 32
+#define VEC(i) ymm##i
+#define VMOVU vmovdqu
+#define VMOVA vmovdqa
+
+#define SECTION(p) p##.avx
+#define MEMMOVE_SYMBOL(p,s) p##_avx_##s
+
+#include "memmove-vec-unaligned-erms.S"
diff --git a/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S
new file mode 100644
index 0000000..38358fa
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S
@@ -0,0 +1,11 @@
+#ifdef HAVE_AVX512_ASM_SUPPORT
+# define VEC_SIZE 64
+# define VEC(i) zmm##i
+# define VMOVU vmovdqu64
+# define VMOVA vmovdqa64
+
+# define SECTION(p) p##.avx512
+# define MEMMOVE_SYMBOL(p,s) p##_avx512_##s
+
+# include "memmove-vec-unaligned-erms.S"
+#endif
diff --git a/sysdeps/x86_64/multiarch/memmove-sse2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-sse2-unaligned-erms.S
new file mode 100644
index 0000000..52b9ae0
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memmove-sse2-unaligned-erms.S
@@ -0,0 +1,9 @@
+#define VEC_SIZE 16
+#define VEC(i) xmm##i
+#define VMOVU movdqu
+#define VMOVA movdqa
+
+#define SECTION(p) p
+#define MEMMOVE_SYMBOL(p,s) p##_sse2_##s
+
+#include "memmove-vec-unaligned-erms.S"
diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
new file mode 100644
index 0000000..10cdc10
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
@@ -0,0 +1,461 @@
+/* memmove/memcpy/mempcpy with vector unaliged loads and rep movsb
+ Copyright (C) 2016 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+/* memmove/memcpy/mempcpy is implemented as:
+ 1. Overlap one register load/store to avoid branch.
+ 2. Use 8-bit or 32-bit displacements for branches and nop paddings
+ to avoid long nop between instructions.
+ 3. Load all sources into registers and store them together to avoid
+ possible address overflap between source and destination.
+ 4. If size is 2 * VEC_SIZE or less, load all sources into registers
+ and store them together.
+ 5. If there is no address overflap, copy from both ends with
+ 4 * VEC_SIZE at a time.
+ 6. If size is 8 * VEC_SIZE or less, load all sources into registers
+ and store them together.
+ 7. If address of destination > address of source, backward copy
+ 8 * VEC_SIZE at a time.
+ 8. Otherwise, forward copy 8 * VEC_SIZE at a time.
+ */
+
+#if IS_IN (libc)
+
+# include <sysdep.h>
+# include "asm-syntax.h"
+
+# ifndef VZEROUPPER
+# if VEC_SIZE > 16
+# define VZEROUPPER vzeroupper
+# else
+# define VZEROUPPER
+# endif
+# endif
+
+/* Threshold to use Enhanced REP MOVSB. */
+# ifndef REP_MOVSB_THRESHOLD
+# define REP_MOVSB_THRESHOLD (2048 * (VEC_SIZE / 16))
+# endif
+
+# define MEMMOVE_DEBUG(m) m##_debug
+# define MEMMOVE_ENTRY(m) MEMMOVE_DEBUG(m)
+
+# ifndef SECTION
+# error SECTION is not defined!
+# endif
+ .section SECTION(.text),"ax",@progbits
+
+# ifdef SHARED
+ENTRY (MEMMOVE_SYMBOL (__mempcpy_chk, unaligned_2))
+ cmpq %rdx, %rcx
+ jb HIDDEN_JUMPTARGET (__chk_fail)
+END (MEMMOVE_SYMBOL (__mempcpy_chk, unaligned_2))
+
+ENTRY (MEMMOVE_SYMBOL (__mempcpy, unaligned_2))
+ movq %rdi, %rax
+ addq %rdx, %rax
+ jmp L(start)
+END (MEMMOVE_SYMBOL (__mempcpy, unaligned_2))
+
+ENTRY (MEMMOVE_SYMBOL (__memmove_chk, unaligned_2))
+ cmpq %rdx, %rcx
+ jb HIDDEN_JUMPTARGET (__chk_fail)
+END (MEMMOVE_SYMBOL (__memmove_chk, unaligned_2))
+# endif
+
+ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned_2))
+ movq %rdi, %rax
+L(start):
+ cmpq $VEC_SIZE, %rdx
+ jb L(less_vec)
+ cmpq $(VEC_SIZE * 2), %rdx
+ ja L(more_2x_vec)
+ /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */
+ VMOVU (%rsi), %VEC(0)
+ VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(1)
+ VMOVU %VEC(0), (%rdi)
+ VMOVU %VEC(1), -VEC_SIZE(%rdi,%rdx)
+ VZEROUPPER
+ ret
+END (MEMMOVE_SYMBOL (__memmove, unaligned_2))
+
+# ifdef SHARED
+ENTRY (MEMMOVE_SYMBOL (__mempcpy_chk, unaligned_erms))
+ cmpq %rdx, %rcx
+ jb HIDDEN_JUMPTARGET (__chk_fail)
+END (MEMMOVE_SYMBOL (__mempcpy_chk, unaligned_erms))
+# endif
+
+ENTRY (MEMMOVE_SYMBOL (__mempcpy, unaligned_erms))
+ movq %rdi, %rax
+ addq %rdx, %rax
+ jmp L(start_erms)
+END (MEMMOVE_SYMBOL (__mempcpy, unaligned_erms))
+
+# ifdef SHARED
+ENTRY (MEMMOVE_SYMBOL (__memmove_chk, unaligned_erms))
+ cmpq %rdx, %rcx
+ jb HIDDEN_JUMPTARGET (__chk_fail)
+END (MEMMOVE_SYMBOL (__memmove_chk, unaligned_erms))
+# endif
+
+# if VEC_SIZE == 16
+/* Only used to measure performance of REP MOVSB. */
+# ifdef SHARED
+ENTRY (__mempcpy_erms)
+ movq %rdi, %rax
+ addq %rdx, %rax
+ jmp L(movsb)
+END (__mempcpy_erms)
+# endif
+
+ENTRY (__memmove_erms)
+ movq %rdi, %rax
+ movq %rdx, %rcx
+ cmpq %rsi, %rdi
+ jbe 1f
+ leaq (%rsi,%rcx), %rdx
+ cmpq %rdx, %rdi
+ jb L(movsb_backward)
+1:
+ rep movsb
+ ret
+L(movsb_backward):
+ leaq -1(%rdi,%rcx), %rdi
+ leaq -1(%rsi,%rcx), %rsi
+ std
+ rep movsb
+ cld
+ ret
+END (__memmove_erms)
+strong_alias (__memmove_erms, __memcpy_erms)
+# endif
+
+ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned_erms))
+ movq %rdi, %rax
+L(start_erms):
+ cmpq $VEC_SIZE, %rdx
+ jb L(less_vec)
+ cmpq $(VEC_SIZE * 2), %rdx
+ ja L(movsb_more_2x_vec)
+L(last_2x_vec):
+ /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */
+ VMOVU (%rsi), %VEC(0)
+ VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(1)
+ VMOVU %VEC(0), (%rdi)
+ VMOVU %VEC(1), -VEC_SIZE(%rdi,%rdx)
+L(return):
+ VZEROUPPER
+ ret
+
+L(movsb):
+ cmpq %rsi, %rdi
+ jbe 1f
+ leaq (%rsi,%rdx), %r9
+ cmpq %r9, %rdi
+ /* Avoid slow backward REP MOVSB. */
+# if REP_MOVSB_THRESHOLD <= (VEC_SIZE * 8)
+# error Unsupported REP_MOVSB_THRESHOLD and VEC_SIZE!
+# endif
+ jb L(more_8x_vec_backward)
+1:
+ movq %rdx, %rcx
+ rep movsb
+ ret
+
+ .p2align 4
+L(movsb_more_2x_vec):
+ cmpq $REP_MOVSB_THRESHOLD, %rdx
+ /* Force 32-bit displacement to avoid long nop between
+ instructions. */
+ ja.d32 L(movsb)
+ .p2align 4
+L(more_2x_vec):
+ /* More than 2 * VEC. */
+ cmpq %rsi, %rdi
+ jbe L(copy_forward)
+ leaq (%rsi,%rdx), %rcx
+ cmpq %rcx, %rdi
+ jb L(more_2x_vec_overlap)
+L(copy_forward):
+ leaq (%rdi,%rdx), %rcx
+ cmpq %rcx, %rsi
+ jb L(more_2x_vec_overlap)
+ VMOVU (%rsi), %VEC(0)
+ VMOVU VEC_SIZE(%rsi), %VEC(1)
+ VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(2)
+ VMOVU -(VEC_SIZE * 2)(%rsi,%rdx), %VEC(3)
+ VMOVU %VEC(0), (%rdi)
+ VMOVU %VEC(1), VEC_SIZE(%rdi)
+ VMOVU %VEC(2), -VEC_SIZE(%rdi,%rdx)
+ VMOVU %VEC(3), -(VEC_SIZE * 2)(%rdi,%rdx)
+ cmpq $(VEC_SIZE * 4), %rdx
+ /* Force 32-bit displacement to avoid long nop between
+ instructions. */
+ jbe.d32 L(return)
+ VMOVU (VEC_SIZE * 2)(%rsi), %VEC(0)
+ VMOVU (VEC_SIZE * 3)(%rsi), %VEC(1)
+ VMOVU -(VEC_SIZE * 3)(%rsi,%rdx), %VEC(2)
+ VMOVU -(VEC_SIZE * 4)(%rsi,%rdx), %VEC(3)
+ VMOVU %VEC(0), (VEC_SIZE * 2)(%rdi)
+ VMOVU %VEC(1), (VEC_SIZE * 3)(%rdi)
+ VMOVU %VEC(2), -(VEC_SIZE * 3)(%rdi,%rdx)
+ VMOVU %VEC(3), -(VEC_SIZE * 4)(%rdi,%rdx)
+ cmpq $(VEC_SIZE * 8), %rdx
+# if VEC_SIZE == 16
+ jbe L(return)
+# else
+ /* Use 8-bit displacement to avoid long nop between
+ instructions. */
+ jbe L(return_disp8)
+# endif
+ leaq (VEC_SIZE * 4)(%rdi), %rcx
+ addq %rdi, %rdx
+ andq $-(VEC_SIZE * 4), %rdx
+ andq $-(VEC_SIZE * 4), %rcx
+ movq %rcx, %r11
+ subq %rdi, %r11
+ addq %r11, %rsi
+ cmpq %rdx, %rcx
+# if VEC_SIZE == 16
+ je L(return)
+# else
+ /* Use 8-bit displacement to avoid long nop between
+ instructions. */
+ je L(return_disp8)
+# endif
+ movq %rsi, %r10
+ subq %rcx, %r10
+ leaq VEC_SIZE(%r10), %r9
+ leaq (VEC_SIZE * 2)(%r10), %r8
+ leaq (VEC_SIZE * 3)(%r10), %r11
+ .p2align 4
+L(loop):
+ VMOVU (%rcx,%r10), %VEC(0)
+ VMOVU (%rcx,%r9), %VEC(1)
+ VMOVU (%rcx,%r8), %VEC(2)
+ VMOVU (%rcx,%r11), %VEC(3)
+ VMOVA %VEC(0), (%rcx)
+ VMOVA %VEC(1), VEC_SIZE(%rcx)
+ VMOVA %VEC(2), (VEC_SIZE * 2)(%rcx)
+ VMOVA %VEC(3), (VEC_SIZE * 3)(%rcx)
+ addq $(VEC_SIZE * 4), %rcx
+ cmpq %rcx, %rdx
+ jne L(loop)
+L(return_disp8):
+ VZEROUPPER
+ ret
+L(less_vec):
+ /* Less than 1 VEC. */
+# if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
+# error Unsupported VEC_SIZE!
+# endif
+# if VEC_SIZE > 32
+ cmpb $32, %dl
+ jae L(between_32_63)
+# endif
+# if VEC_SIZE > 16
+ cmpb $16, %dl
+ jae L(between_16_31)
+# endif
+ cmpb $8, %dl
+ jae L(between_8_15)
+ cmpb $4, %dl
+ jae L(between_4_7)
+ cmpb $1, %dl
+ ja L(between_2_3)
+ jb 1f
+ movzbl (%rsi), %ecx
+ movb %cl, (%rdi)
+1:
+ ret
+# if VEC_SIZE > 32
+L(between_32_63):
+ /* From 32 to 63. No branch when size == 32. */
+ vmovdqu (%rsi), %ymm0
+ vmovdqu -32(%rsi,%rdx), %ymm1
+ vmovdqu %ymm0, (%rdi)
+ vmovdqu %ymm1, -32(%rdi,%rdx)
+ VZEROUPPER
+ ret
+# endif
+# if VEC_SIZE > 16
+ /* From 16 to 31. No branch when size == 16. */
+L(between_16_31):
+ vmovdqu (%rsi), %xmm0
+ vmovdqu -16(%rsi,%rdx), %xmm1
+ vmovdqu %xmm0, (%rdi)
+ vmovdqu %xmm1, -16(%rdi,%rdx)
+ ret
+# endif
+L(between_8_15):
+ /* From 8 to 15. No branch when size == 8. */
+ movq -8(%rsi,%rdx), %rcx
+ movq (%rsi), %rsi
+ movq %rcx, -8(%rdi,%rdx)
+ movq %rsi, (%rdi)
+ ret
+L(between_4_7):
+ /* From 4 to 7. No branch when size == 4. */
+ movl -4(%rsi,%rdx), %ecx
+ movl (%rsi), %esi
+ movl %ecx, -4(%rdi,%rdx)
+ movl %esi, (%rdi)
+ ret
+L(between_2_3):
+ /* From 2 to 3. No branch when size == 2. */
+ movzwl -2(%rsi,%rdx), %ecx
+ movzwl (%rsi), %esi
+ movw %cx, -2(%rdi,%rdx)
+ movw %si, (%rdi)
+ ret
+
+# if VEC_SIZE > 16
+ /* Align to 16 bytes to avoid long nop between instructions. */
+ .p2align 4
+# endif
+L(more_2x_vec_overlap):
+ /* More than 2 * VEC and there is overlap bewteen destination
+ and source. */
+ cmpq $(VEC_SIZE * 8), %rdx
+ ja L(more_8x_vec)
+ cmpq $(VEC_SIZE * 4), %rdx
+ jb L(last_4x_vec)
+L(between_4x_vec_and_8x_vec):
+ /* Copy from 4 * VEC to 8 * VEC, inclusively. */
+ VMOVU (%rsi), %VEC(0)
+ VMOVU VEC_SIZE(%rsi), %VEC(1)
+ VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2)
+ VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3)
+ VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(4)
+ VMOVU -(VEC_SIZE * 2)(%rsi,%rdx), %VEC(5)
+ VMOVU -(VEC_SIZE * 3)(%rsi,%rdx), %VEC(6)
+ VMOVU -(VEC_SIZE * 4)(%rsi,%rdx), %VEC(7)
+ VMOVU %VEC(0), (%rdi)
+ VMOVU %VEC(1), VEC_SIZE(%rdi)
+ VMOVU %VEC(2), (VEC_SIZE * 2)(%rdi)
+ VMOVU %VEC(3), (VEC_SIZE * 3)(%rdi)
+ VMOVU %VEC(4), -VEC_SIZE(%rdi,%rdx)
+ VMOVU %VEC(5), -(VEC_SIZE * 2)(%rdi,%rdx)
+ VMOVU %VEC(6), -(VEC_SIZE * 3)(%rdi,%rdx)
+ VMOVU %VEC(7), -(VEC_SIZE * 4)(%rdi,%rdx)
+ VZEROUPPER
+ ret
+L(last_4x_vec):
+ /* Copy from 2 * VEC to 4 * VEC. */
+ VMOVU (%rsi), %VEC(0)
+ VMOVU VEC_SIZE(%rsi), %VEC(1)
+ VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(2)
+ VMOVU -(VEC_SIZE * 2)(%rsi,%rdx), %VEC(3)
+ VMOVU %VEC(0), (%rdi)
+ VMOVU %VEC(1), VEC_SIZE(%rdi)
+ VMOVU %VEC(2), -VEC_SIZE(%rdi,%rdx)
+ VMOVU %VEC(3), -(VEC_SIZE * 2)(%rdi,%rdx)
+ VZEROUPPER
+ ret
+L(between_0_and_4x_vec):
+ /* Copy from 0 to 4 * VEC. */
+ cmpl $(VEC_SIZE * 2), %edx
+ jae L(last_4x_vec)
+ /* Copy from 0 to 2 * VEC. */
+ cmpl $VEC_SIZE, %edx
+ jae L(last_2x_vec)
+ /* Copy from 0 to VEC. */
+ VZEROUPPER
+ jmp L(less_vec)
+L(more_8x_vec):
+ cmpq %rsi, %rdi
+ ja L(more_8x_vec_backward)
+
+ .p2align 4
+L(loop_8x_vec_forward):
+ /* Copy 8 * VEC a time forward. */
+ VMOVU (%rsi), %VEC(0)
+ VMOVU VEC_SIZE(%rsi), %VEC(1)
+ VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2)
+ VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3)
+ VMOVU (VEC_SIZE * 4)(%rsi), %VEC(4)
+ VMOVU (VEC_SIZE * 5)(%rsi), %VEC(5)
+ VMOVU (VEC_SIZE * 6)(%rsi), %VEC(6)
+ VMOVU (VEC_SIZE * 7)(%rsi), %VEC(7)
+ VMOVU %VEC(0), (%rdi)
+ VMOVU %VEC(1), VEC_SIZE(%rdi)
+ VMOVU %VEC(2), (VEC_SIZE * 2)(%rdi)
+ VMOVU %VEC(3), (VEC_SIZE * 3)(%rdi)
+ VMOVU %VEC(4), (VEC_SIZE * 4)(%rdi)
+ VMOVU %VEC(5), (VEC_SIZE * 5)(%rdi)
+ VMOVU %VEC(6), (VEC_SIZE * 6)(%rdi)
+ VMOVU %VEC(7), (VEC_SIZE * 7)(%rdi)
+ addq $(VEC_SIZE * 8), %rdi
+ addq $(VEC_SIZE * 8), %rsi
+ subq $(VEC_SIZE * 8), %rdx
+ cmpq $(VEC_SIZE * 8), %rdx
+ je L(between_4x_vec_and_8x_vec)
+ ja L(loop_8x_vec_forward)
+ /* Less than 8 * VEC to copy. */
+ cmpq $(VEC_SIZE * 4), %rdx
+ jb L(between_0_and_4x_vec)
+ jmp L(between_4x_vec_and_8x_vec)
+
+ .p2align 4
+L(more_8x_vec_backward):
+ leaq -VEC_SIZE(%rsi, %rdx), %rcx
+ leaq -VEC_SIZE(%rdi, %rdx), %r9
+
+ .p2align 4
+L(loop_8x_vec_backward):
+ /* Copy 8 * VEC a time backward. */
+ VMOVU (%rcx), %VEC(0)
+ VMOVU -VEC_SIZE(%rcx), %VEC(1)
+ VMOVU -(VEC_SIZE * 2)(%rcx), %VEC(2)
+ VMOVU -(VEC_SIZE * 3)(%rcx), %VEC(3)
+ VMOVU -(VEC_SIZE * 4)(%rcx), %VEC(4)
+ VMOVU -(VEC_SIZE * 5)(%rcx), %VEC(5)
+ VMOVU -(VEC_SIZE * 6)(%rcx), %VEC(6)
+ VMOVU -(VEC_SIZE * 7)(%rcx), %VEC(7)
+ VMOVU %VEC(0), (%r9)
+ VMOVU %VEC(1), -VEC_SIZE(%r9)
+ VMOVU %VEC(2), -(VEC_SIZE * 2)(%r9)
+ VMOVU %VEC(3), -(VEC_SIZE * 3)(%r9)
+ VMOVU %VEC(4), -(VEC_SIZE * 4)(%r9)
+ VMOVU %VEC(5), -(VEC_SIZE * 5)(%r9)
+ VMOVU %VEC(6), -(VEC_SIZE * 6)(%r9)
+ VMOVU %VEC(7), -(VEC_SIZE * 7)(%r9)
+ subq $(VEC_SIZE * 8), %rcx
+ subq $(VEC_SIZE * 8), %r9
+ subq $(VEC_SIZE * 8), %rdx
+ cmpq $(VEC_SIZE * 8), %rdx
+ je L(between_4x_vec_and_8x_vec)
+ ja L(loop_8x_vec_backward)
+ /* Less than 8 * VEC to copy. */
+ cmpq $(VEC_SIZE * 4), %rdx
+ jb L(between_0_and_4x_vec)
+ jmp L(between_4x_vec_and_8x_vec)
+END (MEMMOVE_SYMBOL (__memmove, unaligned_erms))
+
+# ifdef SHARED
+strong_alias (MEMMOVE_SYMBOL (__memmove, unaligned_erms),
+ MEMMOVE_SYMBOL (__memcpy, unaligned_erms))
+strong_alias (MEMMOVE_SYMBOL (__memmove_chk, unaligned_erms),
+ MEMMOVE_SYMBOL (__memcpy_chk, unaligned_erms))
+strong_alias (MEMMOVE_SYMBOL (__memmove, unaligned_2),
+ MEMMOVE_SYMBOL (__memcpy, unaligned_2))
+strong_alias (MEMMOVE_SYMBOL (__memmove_chk, unaligned_2),
+ MEMMOVE_SYMBOL (__memcpy_chk, unaligned_2))
+# endif
+
+#endif
diff --git a/sysdeps/x86_64/multiarch/mempcpy.S b/sysdeps/x86_64/multiarch/mempcpy.S
index ed78623..b83e1e3 100644
--- a/sysdeps/x86_64/multiarch/mempcpy.S
+++ b/sysdeps/x86_64/multiarch/mempcpy.S
@@ -33,19 +33,32 @@ ENTRY(__mempcpy)
jz 1f
HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER)
jz 1f
- leaq __mempcpy_avx512_no_vzeroupper(%rip), %rax
+ lea __mempcpy_avx512_no_vzeroupper(%rip), %RAX_LP
ret
#endif
-1: leaq __mempcpy_sse2(%rip), %rax
+1: lea __mempcpy_avx_unaligned_erms(%rip), %RAX_LP
+ HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load)
+ jz L(Fast_Unaligned_Load)
+ HAS_CPU_FEATURE (ERMS)
+ jnz 2f
+ lea __mempcpy_avx_unaligned(%rip), %RAX_LP
+ ret
+L(Fast_Unaligned_Load):
+ lea __mempcpy_sse2_unaligned_erms(%rip), %RAX_LP
+ HAS_ARCH_FEATURE (Fast_Unaligned_Load)
+ jz L(SSE2)
+ HAS_CPU_FEATURE (ERMS)
+ jnz 2f
+ lea __mempcpy_sse2_unaligned_2(%rip), %RAX_LP
+ ret
+L(SSE2):
+ lea __mempcpy_sse2(%rip), %RAX_LP
HAS_CPU_FEATURE (SSSE3)
jz 2f
- leaq __mempcpy_ssse3(%rip), %rax
+ lea __mempcpy_ssse3_back(%rip), %RAX_LP
HAS_ARCH_FEATURE (Fast_Copy_Backward)
- jz 2f
- leaq __mempcpy_ssse3_back(%rip), %rax
- HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load)
- jz 2f
- leaq __mempcpy_avx_unaligned(%rip), %rax
+ jnz 2f
+ lea __mempcpy_ssse3(%rip), %RAX_LP
2: ret
END(__mempcpy)
diff --git a/sysdeps/x86_64/multiarch/mempcpy_chk.S b/sysdeps/x86_64/multiarch/mempcpy_chk.S
index 6e8a89d..18d4c00 100644
--- a/sysdeps/x86_64/multiarch/mempcpy_chk.S
+++ b/sysdeps/x86_64/multiarch/mempcpy_chk.S
@@ -35,19 +35,22 @@ ENTRY(__mempcpy_chk)
jz 1f
HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER)
jz 1f
- leaq __mempcpy_chk_avx512_no_vzeroupper(%rip), %rax
+ lea __mempcpy_chk_avx512_no_vzeroupper(%rip), %RAX_LP
ret
#endif
-1: leaq __mempcpy_chk_sse2(%rip), %rax
+1: lea __mempcpy_chk_avx_unaligned(%rip), %RAX_LP
+ HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load)
+ jnz 2f
+ lea __mempcpy_chk_sse2_unaligned_2(%rip), %RAX_LP
+ HAS_ARCH_FEATURE (Fast_Unaligned_Load)
+ jnz 2f
+ lea __mempcpy_chk_sse2(%rip), %RAX_LP
HAS_CPU_FEATURE (SSSE3)
jz 2f
- leaq __mempcpy_chk_ssse3(%rip), %rax
+ lea __mempcpy_chk_ssse3_back(%rip), %RAX_LP
HAS_ARCH_FEATURE (Fast_Copy_Backward)
- jz 2f
- leaq __mempcpy_chk_ssse3_back(%rip), %rax
- HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load)
- jz 2f
- leaq __mempcpy_chk_avx_unaligned(%rip), %rax
+ jnz 2f
+ lea __mempcpy_chk_ssse3(%rip), %RAX_LP
2: ret
END(__mempcpy_chk)
# else
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=81641f1f3aafc33f33f20e02f88e696480578195
commit 81641f1f3aafc33f33f20e02f88e696480578195
Author: H.J. Lu <hjl.tools@gmail.com>
Date: Thu Sep 15 15:47:01 2011 -0700
Initial Enhanced REP MOVSB/STOSB (ERMS) support
The newer Intel processors support Enhanced REP MOVSB/STOSB (ERMS) which
has a feature bit in CPUID. This patch adds the Enhanced REP MOVSB/STOSB
(ERMS) bit to x86 cpu-features.
* sysdeps/x86/cpu-features.h (bit_cpu_ERMS): New.
(index_cpu_ERMS): Likewise.
(reg_ERMS): Likewise.
diff --git a/sysdeps/x86/cpu-features.h b/sysdeps/x86/cpu-features.h
index bfe1f4c..8f946c4 100644
--- a/sysdeps/x86/cpu-features.h
+++ b/sysdeps/x86/cpu-features.h
@@ -53,6 +53,7 @@
#define bit_cpu_FMA4 (1 << 16)
/* COMMON_CPUID_INDEX_7. */
+#define bit_cpu_ERMS (1 << 9)
#define bit_cpu_RTM (1 << 11)
#define bit_cpu_AVX2 (1 << 5)
#define bit_cpu_AVX512F (1 << 16)
@@ -84,6 +85,7 @@
# define index_cpu_SSE4_2 COMMON_CPUID_INDEX_1*CPUID_SIZE+CPUID_ECX_OFFSET
# define index_cpu_AVX COMMON_CPUID_INDEX_1*CPUID_SIZE+CPUID_ECX_OFFSET
# define index_cpu_AVX2 COMMON_CPUID_INDEX_7*CPUID_SIZE+CPUID_EBX_OFFSET
+# define index_cpu_ERMS COMMON_CPUID_INDEX_7*CPUID_SIZE+CPUID_EBX_OFFSET
# define index_arch_Fast_Rep_String FEATURE_INDEX_1*FEATURE_SIZE
# define index_arch_Fast_Copy_Backward FEATURE_INDEX_1*FEATURE_SIZE
@@ -228,6 +230,7 @@ extern const struct cpu_features *__get_cpu_features (void)
# define index_cpu_AVX2 COMMON_CPUID_INDEX_7
# define index_cpu_AVX512F COMMON_CPUID_INDEX_7
# define index_cpu_AVX512DQ COMMON_CPUID_INDEX_7
+# define index_cpu_ERMS COMMON_CPUID_INDEX_7
# define index_cpu_RTM COMMON_CPUID_INDEX_7
# define index_cpu_FMA COMMON_CPUID_INDEX_1
# define index_cpu_FMA4 COMMON_CPUID_INDEX_80000001
@@ -244,6 +247,7 @@ extern const struct cpu_features *__get_cpu_features (void)
# define reg_AVX2 ebx
# define reg_AVX512F ebx
# define reg_AVX512DQ ebx
+# define reg_ERMS ebx
# define reg_RTM ebx
# define reg_FMA ecx
# define reg_FMA4 ecx
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=c6a93f91c66e4de17d1e1e25a87ac8fc460a02b0
commit c6a93f91c66e4de17d1e1e25a87ac8fc460a02b0
Author: H.J. Lu <hjl.tools@gmail.com>
Date: Fri Mar 25 06:32:44 2016 -0700
Make __memcpy_avx512_no_vzeroupper an alias
Since x86-64 memcpy-avx512-no-vzeroupper.S implements memmove, we can make
__memcpy_avx512_no_vzeroupper an alias of __memmove_avx512_no_vzeroupper
to reduce code size of libc.so.
* sysdeps/x86_64/multiarch/Makefile (sysdep_routines): Remove
memcpy-avx512-no-vzeroupper.
* sysdeps/x86_64/multiarch/memcpy-avx512-no-vzeroupper.S: Renamed
to ...
* sysdeps/x86_64/multiarch/memmove-avx512-no-vzeroupper.S: This.
(MEMCPY): Don't define.
(MEMCPY_CHK): Likewise.
(MEMPCPY): Likewise.
(MEMPCPY_CHK): Likewise.
(MEMPCPY_CHK): Renamed to ...
(__mempcpy_chk_avx512_no_vzeroupper): This.
(MEMPCPY_CHK): Renamed to ...
(__mempcpy_chk_avx512_no_vzeroupper): This.
(MEMCPY_CHK): Renamed to ...
(__memmove_chk_avx512_no_vzeroupper): This.
(MEMCPY): Renamed to ...
(__memmove_avx512_no_vzeroupper): This.
(__memcpy_avx512_no_vzeroupper): New alias.
(__memcpy_chk_avx512_no_vzeroupper): Likewise.
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index 39c0905..7fc89c2 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -8,7 +8,7 @@ ifeq ($(subdir),string)
sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 \
strcmp-sse2-unaligned strncmp-ssse3 \
memcmp-sse4 memcpy-ssse3 memcpy-sse2-unaligned \
- memcpy-avx512-no-vzeroupper memmove-ssse3 \
+ memmove-ssse3 \
memcpy-ssse3-back memmove-avx-unaligned \
memcpy-avx-unaligned \
memmove-ssse3-back \
diff --git a/sysdeps/x86_64/multiarch/memcpy-avx512-no-vzeroupper.S b/sysdeps/x86_64/multiarch/memcpy-avx512-no-vzeroupper.S
deleted file mode 100644
index 285bb83..0000000
--- a/sysdeps/x86_64/multiarch/memcpy-avx512-no-vzeroupper.S
+++ /dev/null
@@ -1,424 +0,0 @@
-/* memcpy optimized with AVX512 for KNL hardware.
- Copyright (C) 2016 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-#include <sysdep.h>
-
-#if defined HAVE_AVX512_ASM_SUPPORT && IS_IN (libc) \
- && (defined SHARED \
- || defined USE_AS_MEMMOVE \
- || !defined USE_MULTIARCH)
-
-#include "asm-syntax.h"
-#ifndef MEMCPY
-# define MEMCPY __memcpy_avx512_no_vzeroupper
-# define MEMCPY_CHK __memcpy_chk_avx512_no_vzeroupper
-# define MEMPCPY __mempcpy_avx512_no_vzeroupper
-# define MEMPCPY_CHK __mempcpy_chk_avx512_no_vzeroupper
-#endif
-
- .section .text.avx512,"ax",@progbits
-#if !defined USE_AS_MEMPCPY && !defined USE_AS_MEMMOVE
-ENTRY (MEMPCPY_CHK)
- cmpq %rdx, %rcx
- jb HIDDEN_JUMPTARGET (__chk_fail)
-END (MEMPCPY_CHK)
-
-ENTRY (MEMPCPY)
- movq %rdi, %rax
- addq %rdx, %rax
- jmp L(start)
-END (MEMPCPY)
-#endif
-
-#if !defined USE_AS_BCOPY
-ENTRY (MEMCPY_CHK)
- cmpq %rdx, %rcx
- jb HIDDEN_JUMPTARGET (__chk_fail)
-END (MEMCPY_CHK)
-#endif
-
-ENTRY (MEMCPY)
- mov %rdi, %rax
-#ifdef USE_AS_MEMPCPY
- add %rdx, %rax
-#endif
-L(start):
- lea (%rsi, %rdx), %rcx
- lea (%rdi, %rdx), %r9
- cmp $512, %rdx
- ja L(512bytesormore)
-
-L(check):
- cmp $16, %rdx
- jbe L(less_16bytes)
- cmp $256, %rdx
- jb L(less_256bytes)
- vmovups (%rsi), %zmm0
- vmovups 0x40(%rsi), %zmm1
- vmovups 0x80(%rsi), %zmm2
- vmovups 0xC0(%rsi), %zmm3
- vmovups -0x100(%rcx), %zmm4
- vmovups -0xC0(%rcx), %zmm5
- vmovups -0x80(%rcx), %zmm6
- vmovups -0x40(%rcx), %zmm7
- vmovups %zmm0, (%rdi)
- vmovups %zmm1, 0x40(%rdi)
- vmovups %zmm2, 0x80(%rdi)
- vmovups %zmm3, 0xC0(%rdi)
- vmovups %zmm4, -0x100(%r9)
- vmovups %zmm5, -0xC0(%r9)
- vmovups %zmm6, -0x80(%r9)
- vmovups %zmm7, -0x40(%r9)
- ret
-
-L(less_256bytes):
- cmp $128, %dl
- jb L(less_128bytes)
- vmovups (%rsi), %zmm0
- vmovups 0x40(%rsi), %zmm1
- vmovups -0x80(%rcx), %zmm2
- vmovups -0x40(%rcx), %zmm3
- vmovups %zmm0, (%rdi)
- vmovups %zmm1, 0x40(%rdi)
- vmovups %zmm2, -0x80(%r9)
- vmovups %zmm3, -0x40(%r9)
- ret
-
-L(less_128bytes):
- cmp $64, %dl
- jb L(less_64bytes)
- vmovdqu (%rsi), %ymm0
- vmovdqu 0x20(%rsi), %ymm1
- vmovdqu -0x40(%rcx), %ymm2
- vmovdqu -0x20(%rcx), %ymm3
- vmovdqu %ymm0, (%rdi)
- vmovdqu %ymm1, 0x20(%rdi)
- vmovdqu %ymm2, -0x40(%r9)
- vmovdqu %ymm3, -0x20(%r9)
- ret
-
-L(less_64bytes):
- cmp $32, %dl
- jb L(less_32bytes)
- vmovdqu (%rsi), %ymm0
- vmovdqu -0x20(%rcx), %ymm1
- vmovdqu %ymm0, (%rdi)
- vmovdqu %ymm1, -0x20(%r9)
- ret
-
-L(less_32bytes):
- vmovdqu (%rsi), %xmm0
- vmovdqu -0x10(%rcx), %xmm1
- vmovdqu %xmm0, (%rdi)
- vmovdqu %xmm1, -0x10(%r9)
- ret
-
-L(less_16bytes):
- cmp $8, %dl
- jb L(less_8bytes)
- movq (%rsi), %rsi
- movq -0x8(%rcx), %rcx
- movq %rsi, (%rdi)
- movq %rcx, -0x8(%r9)
- ret
-
-L(less_8bytes):
- cmp $4, %dl
- jb L(less_4bytes)
- mov (%rsi), %esi
- mov -0x4(%rcx), %ecx
- mov %esi, (%rdi)
- mov %ecx, -0x4(%r9)
- ret
-
-L(less_4bytes):
- cmp $2, %dl
- jb L(less_2bytes)
- mov (%rsi), %si
- mov -0x2(%rcx), %cx
- mov %si, (%rdi)
- mov %cx, -0x2(%r9)
- ret
-
-L(less_2bytes):
- cmp $1, %dl
- jb L(less_1bytes)
- mov (%rsi), %cl
- mov %cl, (%rdi)
-L(less_1bytes):
- ret
-
-L(512bytesormore):
-#ifdef SHARED_CACHE_SIZE_HALF
- mov $SHARED_CACHE_SIZE_HALF, %r8
-#else
- mov __x86_shared_cache_size_half(%rip), %r8
-#endif
- cmp %r8, %rdx
- jae L(preloop_large)
- cmp $1024, %rdx
- ja L(1024bytesormore)
- prefetcht1 (%rsi)
- prefetcht1 0x40(%rsi)
- prefetcht1 0x80(%rsi)
- prefetcht1 0xC0(%rsi)
- prefetcht1 0x100(%rsi)
- prefetcht1 0x140(%rsi)
- prefetcht1 0x180(%rsi)
- prefetcht1 0x1C0(%rsi)
- prefetcht1 -0x200(%rcx)
- prefetcht1 -0x1C0(%rcx)
- prefetcht1 -0x180(%rcx)
- prefetcht1 -0x140(%rcx)
- prefetcht1 -0x100(%rcx)
- prefetcht1 -0xC0(%rcx)
- prefetcht1 -0x80(%rcx)
- prefetcht1 -0x40(%rcx)
- vmovups (%rsi), %zmm0
- vmovups 0x40(%rsi), %zmm1
- vmovups 0x80(%rsi), %zmm2
- vmovups 0xC0(%rsi), %zmm3
- vmovups 0x100(%rsi), %zmm4
- vmovups 0x140(%rsi), %zmm5
- vmovups 0x180(%rsi), %zmm6
- vmovups 0x1C0(%rsi), %zmm7
- vmovups -0x200(%rcx), %zmm8
- vmovups -0x1C0(%rcx), %zmm9
- vmovups -0x180(%rcx), %zmm10
- vmovups -0x140(%rcx), %zmm11
- vmovups -0x100(%rcx), %zmm12
- vmovups -0xC0(%rcx), %zmm13
- vmovups -0x80(%rcx), %zmm14
- vmovups -0x40(%rcx), %zmm15
- vmovups %zmm0, (%rdi)
- vmovups %zmm1, 0x40(%rdi)
- vmovups %zmm2, 0x80(%rdi)
- vmovups %zmm3, 0xC0(%rdi)
- vmovups %zmm4, 0x100(%rdi)
- vmovups %zmm5, 0x140(%rdi)
- vmovups %zmm6, 0x180(%rdi)
- vmovups %zmm7, 0x1C0(%rdi)
- vmovups %zmm8, -0x200(%r9)
- vmovups %zmm9, -0x1C0(%r9)
- vmovups %zmm10, -0x180(%r9)
- vmovups %zmm11, -0x140(%r9)
- vmovups %zmm12, -0x100(%r9)
- vmovups %zmm13, -0xC0(%r9)
- vmovups %zmm14, -0x80(%r9)
- vmovups %zmm15, -0x40(%r9)
- ret
-
-L(1024bytesormore):
- cmp %rsi, %rdi
- ja L(1024bytesormore_bkw)
- sub $512, %r9
- vmovups -0x200(%rcx), %zmm8
- vmovups -0x1C0(%rcx), %zmm9
- vmovups -0x180(%rcx), %zmm10
- vmovups -0x140(%rcx), %zmm11
- vmovups -0x100(%rcx), %zmm12
- vmovups -0xC0(%rcx), %zmm13
- vmovups -0x80(%rcx), %zmm14
- vmovups -0x40(%rcx), %zmm15
- prefetcht1 (%rsi)
- prefetcht1 0x40(%rsi)
- prefetcht1 0x80(%rsi)
- prefetcht1 0xC0(%rsi)
- prefetcht1 0x100(%rsi)
- prefetcht1 0x140(%rsi)
- prefetcht1 0x180(%rsi)
- prefetcht1 0x1C0(%rsi)
-
-/* Loop with unaligned memory access. */
-L(gobble_512bytes_loop):
- vmovups (%rsi), %zmm0
- vmovups 0x40(%rsi), %zmm1
- vmovups 0x80(%rsi), %zmm2
- vmovups 0xC0(%rsi), %zmm3
- vmovups 0x100(%rsi), %zmm4
- vmovups 0x140(%rsi), %zmm5
- vmovups 0x180(%rsi), %zmm6
- vmovups 0x1C0(%rsi), %zmm7
- add $512, %rsi
- prefetcht1 (%rsi)
- prefetcht1 0x40(%rsi)
- prefetcht1 0x80(%rsi)
- prefetcht1 0xC0(%rsi)
- prefetcht1 0x100(%rsi)
- prefetcht1 0x140(%rsi)
- prefetcht1 0x180(%rsi)
- prefetcht1 0x1C0(%rsi)
- vmovups %zmm0, (%rdi)
- vmovups %zmm1, 0x40(%rdi)
- vmovups %zmm2, 0x80(%rdi)
- vmovups %zmm3, 0xC0(%rdi)
- vmovups %zmm4, 0x100(%rdi)
- vmovups %zmm5, 0x140(%rdi)
- vmovups %zmm6, 0x180(%rdi)
- vmovups %zmm7, 0x1C0(%rdi)
- add $512, %rdi
- cmp %r9, %rdi
- jb L(gobble_512bytes_loop)
- vmovups %zmm8, (%r9)
- vmovups %zmm9, 0x40(%r9)
- vmovups %zmm10, 0x80(%r9)
- vmovups %zmm11, 0xC0(%r9)
- vmovups %zmm12, 0x100(%r9)
- vmovups %zmm13, 0x140(%r9)
- vmovups %zmm14, 0x180(%r9)
- vmovups %zmm15, 0x1C0(%r9)
- ret
-
-L(1024bytesormore_bkw):
- add $512, %rdi
- vmovups 0x1C0(%rsi), %zmm8
- vmovups 0x180(%rsi), %zmm9
- vmovups 0x140(%rsi), %zmm10
- vmovups 0x100(%rsi), %zmm11
- vmovups 0xC0(%rsi), %zmm12
- vmovups 0x80(%rsi), %zmm13
- vmovups 0x40(%rsi), %zmm14
- vmovups (%rsi), %zmm15
- prefetcht1 -0x40(%rcx)
- prefetcht1 -0x80(%rcx)
- prefetcht1 -0xC0(%rcx)
- prefetcht1 -0x100(%rcx)
- prefetcht1 -0x140(%rcx)
- prefetcht1 -0x180(%rcx)
- prefetcht1 -0x1C0(%rcx)
- prefetcht1 -0x200(%rcx)
-
-/* Backward loop with unaligned memory access. */
-L(gobble_512bytes_loop_bkw):
- vmovups -0x40(%rcx), %zmm0
- vmovups -0x80(%rcx), %zmm1
- vmovups -0xC0(%rcx), %zmm2
- vmovups -0x100(%rcx), %zmm3
- vmovups -0x140(%rcx), %zmm4
- vmovups -0x180(%rcx), %zmm5
- vmovups -0x1C0(%rcx), %zmm6
- vmovups -0x200(%rcx), %zmm7
- sub $512, %rcx
- prefetcht1 -0x40(%rcx)
- prefetcht1 -0x80(%rcx)
- prefetcht1 -0xC0(%rcx)
- prefetcht1 -0x100(%rcx)
- prefetcht1 -0x140(%rcx)
- prefetcht1 -0x180(%rcx)
- prefetcht1 -0x1C0(%rcx)
- prefetcht1 -0x200(%rcx)
- vmovups %zmm0, -0x40(%r9)
- vmovups %zmm1, -0x80(%r9)
- vmovups %zmm2, -0xC0(%r9)
- vmovups %zmm3, -0x100(%r9)
- vmovups %zmm4, -0x140(%r9)
- vmovups %zmm5, -0x180(%r9)
- vmovups %zmm6, -0x1C0(%r9)
- vmovups %zmm7, -0x200(%r9)
- sub $512, %r9
- cmp %rdi, %r9
- ja L(gobble_512bytes_loop_bkw)
- vmovups %zmm8, -0x40(%rdi)
- vmovups %zmm9, -0x80(%rdi)
- vmovups %zmm10, -0xC0(%rdi)
- vmovups %zmm11, -0x100(%rdi)
- vmovups %zmm12, -0x140(%rdi)
- vmovups %zmm13, -0x180(%rdi)
- vmovups %zmm14, -0x1C0(%rdi)
- vmovups %zmm15, -0x200(%rdi)
- ret
-
-L(preloop_large):
- cmp %rsi, %rdi
- ja L(preloop_large_bkw)
- vmovups (%rsi), %zmm4
- vmovups 0x40(%rsi), %zmm5
-
-/* Align destination for access with non-temporal stores in the loop. */
- mov %rdi, %r8
- and $-0x80, %rdi
- add $0x80, %rdi
- sub %rdi, %r8
- sub %r8, %rsi
- add %r8, %rdx
-L(gobble_256bytes_nt_loop):
- prefetcht1 0x200(%rsi)
- prefetcht1 0x240(%rsi)
- prefetcht1 0x280(%rsi)
- prefetcht1 0x2C0(%rsi)
- prefetcht1 0x300(%rsi)
- prefetcht1 0x340(%rsi)
- prefetcht1 0x380(%rsi)
- prefetcht1 0x3C0(%rsi)
- vmovdqu64 (%rsi), %zmm0
- vmovdqu64 0x40(%rsi), %zmm1
- vmovdqu64 0x80(%rsi), %zmm2
- vmovdqu64 0xC0(%rsi), %zmm3
- vmovntdq %zmm0, (%rdi)
- vmovntdq %zmm1, 0x40(%rdi)
- vmovntdq %zmm2, 0x80(%rdi)
- vmovntdq %zmm3, 0xC0(%rdi)
- sub $256, %rdx
- add $256, %rsi
- add $256, %rdi
- cmp $256, %rdx
- ja L(gobble_256bytes_nt_loop)
- sfence
- vmovups %zmm4, (%rax)
- vmovups %zmm5, 0x40(%rax)
- jmp L(check)
-
-L(preloop_large_bkw):
- vmovups -0x80(%rcx), %zmm4
- vmovups -0x40(%rcx), %zmm5
-
-/* Align end of destination for access with non-temporal stores. */
- mov %r9, %r8
- and $-0x80, %r9
- sub %r9, %r8
- sub %r8, %rcx
- sub %r8, %rdx
- add %r9, %r8
-L(gobble_256bytes_nt_loop_bkw):
- prefetcht1 -0x400(%rcx)
- prefetcht1 -0x3C0(%rcx)
- prefetcht1 -0x380(%rcx)
- prefetcht1 -0x340(%rcx)
- prefetcht1 -0x300(%rcx)
- prefetcht1 -0x2C0(%rcx)
- prefetcht1 -0x280(%rcx)
- prefetcht1 -0x240(%rcx)
- vmovdqu64 -0x100(%rcx), %zmm0
- vmovdqu64 -0xC0(%rcx), %zmm1
- vmovdqu64 -0x80(%rcx), %zmm2
- vmovdqu64 -0x40(%rcx), %zmm3
- vmovntdq %zmm0, -0x100(%r9)
- vmovntdq %zmm1, -0xC0(%r9)
- vmovntdq %zmm2, -0x80(%r9)
- vmovntdq %zmm3, -0x40(%r9)
- sub $256, %rdx
- sub $256, %rcx
- sub $256, %r9
- cmp $256, %rdx
- ja L(gobble_256bytes_nt_loop_bkw)
- sfence
- vmovups %zmm4, -0x80(%r8)
- vmovups %zmm5, -0x40(%r8)
- jmp L(check)
-END (MEMCPY)
-#endif
diff --git a/sysdeps/x86_64/multiarch/memmove-avx512-no-vzeroupper.S b/sysdeps/x86_64/multiarch/memmove-avx512-no-vzeroupper.S
index 518d1fe..5b8ff57 100644
--- a/sysdeps/x86_64/multiarch/memmove-avx512-no-vzeroupper.S
+++ b/sysdeps/x86_64/multiarch/memmove-avx512-no-vzeroupper.S
@@ -1,4 +1,4 @@
-/* memmove optimized with AVX512 for KNL hardware.
+/* memmove/memcpy/mempcpy optimized with AVX512 for KNL hardware.
Copyright (C) 2016 Free Software Foundation, Inc.
This file is part of the GNU C Library.
@@ -16,7 +16,405 @@
License along with the GNU C Library; if not, see
<http://www.gnu.org/licenses/>. */
-#define USE_AS_MEMMOVE
-#define MEMCPY __memmove_avx512_no_vzeroupper
-#define MEMCPY_CHK __memmove_chk_avx512_no_vzeroupper
-#include "memcpy-avx512-no-vzeroupper.S"
+#include <sysdep.h>
+
+#if defined HAVE_AVX512_ASM_SUPPORT && IS_IN (libc)
+
+# include "asm-syntax.h"
+
+ .section .text.avx512,"ax",@progbits
+# if defined SHARED && !defined USE_AS_MEMPCPY && !defined USE_AS_MEMMOVE
+ENTRY (__mempcpy_chk_avx512_no_vzeroupper)
+ cmpq %rdx, %rcx
+ jb HIDDEN_JUMPTARGET (__chk_fail)
+END (__mempcpy_chk_avx512_no_vzeroupper)
+
+ENTRY (__mempcpy_avx512_no_vzeroupper)
+ movq %rdi, %rax
+ addq %rdx, %rax
+ jmp L(start)
+END (__mempcpy_avx512_no_vzeroupper)
+# endif
+
+# ifdef SHARED
+ENTRY (__memmove_chk_avx512_no_vzeroupper)
+ cmpq %rdx, %rcx
+ jb HIDDEN_JUMPTARGET (__chk_fail)
+END (__memmove_chk_avx512_no_vzeroupper)
+# endif
+
+ENTRY (__memmove_avx512_no_vzeroupper)
+ mov %rdi, %rax
+# ifdef USE_AS_MEMPCPY
+ add %rdx, %rax
+# endif
+L(start):
+ lea (%rsi, %rdx), %rcx
+ lea (%rdi, %rdx), %r9
+ cmp $512, %rdx
+ ja L(512bytesormore)
+
+L(check):
+ cmp $16, %rdx
+ jbe L(less_16bytes)
+ cmp $256, %rdx
+ jb L(less_256bytes)
+ vmovups (%rsi), %zmm0
+ vmovups 0x40(%rsi), %zmm1
+ vmovups 0x80(%rsi), %zmm2
+ vmovups 0xC0(%rsi), %zmm3
+ vmovups -0x100(%rcx), %zmm4
+ vmovups -0xC0(%rcx), %zmm5
+ vmovups -0x80(%rcx), %zmm6
+ vmovups -0x40(%rcx), %zmm7
+ vmovups %zmm0, (%rdi)
+ vmovups %zmm1, 0x40(%rdi)
+ vmovups %zmm2, 0x80(%rdi)
+ vmovups %zmm3, 0xC0(%rdi)
+ vmovups %zmm4, -0x100(%r9)
+ vmovups %zmm5, -0xC0(%r9)
+ vmovups %zmm6, -0x80(%r9)
+ vmovups %zmm7, -0x40(%r9)
+ ret
+
+L(less_256bytes):
+ cmp $128, %dl
+ jb L(less_128bytes)
+ vmovups (%rsi), %zmm0
+ vmovups 0x40(%rsi), %zmm1
+ vmovups -0x80(%rcx), %zmm2
+ vmovups -0x40(%rcx), %zmm3
+ vmovups %zmm0, (%rdi)
+ vmovups %zmm1, 0x40(%rdi)
+ vmovups %zmm2, -0x80(%r9)
+ vmovups %zmm3, -0x40(%r9)
+ ret
+
+L(less_128bytes):
+ cmp $64, %dl
+ jb L(less_64bytes)
+ vmovdqu (%rsi), %ymm0
+ vmovdqu 0x20(%rsi), %ymm1
+ vmovdqu -0x40(%rcx), %ymm2
+ vmovdqu -0x20(%rcx), %ymm3
+ vmovdqu %ymm0, (%rdi)
+ vmovdqu %ymm1, 0x20(%rdi)
+ vmovdqu %ymm2, -0x40(%r9)
+ vmovdqu %ymm3, -0x20(%r9)
+ ret
+
+L(less_64bytes):
+ cmp $32, %dl
+ jb L(less_32bytes)
+ vmovdqu (%rsi), %ymm0
+ vmovdqu -0x20(%rcx), %ymm1
+ vmovdqu %ymm0, (%rdi)
+ vmovdqu %ymm1, -0x20(%r9)
+ ret
+
+L(less_32bytes):
+ vmovdqu (%rsi), %xmm0
+ vmovdqu -0x10(%rcx), %xmm1
+ vmovdqu %xmm0, (%rdi)
+ vmovdqu %xmm1, -0x10(%r9)
+ ret
+
+L(less_16bytes):
+ cmp $8, %dl
+ jb L(less_8bytes)
+ movq (%rsi), %rsi
+ movq -0x8(%rcx), %rcx
+ movq %rsi, (%rdi)
+ movq %rcx, -0x8(%r9)
+ ret
+
+L(less_8bytes):
+ cmp $4, %dl
+ jb L(less_4bytes)
+ mov (%rsi), %esi
+ mov -0x4(%rcx), %ecx
+ mov %esi, (%rdi)
+ mov %ecx, -0x4(%r9)
+ ret
+
+L(less_4bytes):
+ cmp $2, %dl
+ jb L(less_2bytes)
+ mov (%rsi), %si
+ mov -0x2(%rcx), %cx
+ mov %si, (%rdi)
+ mov %cx, -0x2(%r9)
+ ret
+
+L(less_2bytes):
+ cmp $1, %dl
+ jb L(less_1bytes)
+ mov (%rsi), %cl
+ mov %cl, (%rdi)
+L(less_1bytes):
+ ret
+
+L(512bytesormore):
+# ifdef SHARED_CACHE_SIZE_HALF
+ mov $SHARED_CACHE_SIZE_HALF, %r8
+# else
+ mov __x86_shared_cache_size_half(%rip), %r8
+# endif
+ cmp %r8, %rdx
+ jae L(preloop_large)
+ cmp $1024, %rdx
+ ja L(1024bytesormore)
+ prefetcht1 (%rsi)
+ prefetcht1 0x40(%rsi)
+ prefetcht1 0x80(%rsi)
+ prefetcht1 0xC0(%rsi)
+ prefetcht1 0x100(%rsi)
+ prefetcht1 0x140(%rsi)
+ prefetcht1 0x180(%rsi)
+ prefetcht1 0x1C0(%rsi)
+ prefetcht1 -0x200(%rcx)
+ prefetcht1 -0x1C0(%rcx)
+ prefetcht1 -0x180(%rcx)
+ prefetcht1 -0x140(%rcx)
+ prefetcht1 -0x100(%rcx)
+ prefetcht1 -0xC0(%rcx)
+ prefetcht1 -0x80(%rcx)
+ prefetcht1 -0x40(%rcx)
+ vmovups (%rsi), %zmm0
+ vmovups 0x40(%rsi), %zmm1
+ vmovups 0x80(%rsi), %zmm2
+ vmovups 0xC0(%rsi), %zmm3
+ vmovups 0x100(%rsi), %zmm4
+ vmovups 0x140(%rsi), %zmm5
+ vmovups 0x180(%rsi), %zmm6
+ vmovups 0x1C0(%rsi), %zmm7
+ vmovups -0x200(%rcx), %zmm8
+ vmovups -0x1C0(%rcx), %zmm9
+ vmovups -0x180(%rcx), %zmm10
+ vmovups -0x140(%rcx), %zmm11
+ vmovups -0x100(%rcx), %zmm12
+ vmovups -0xC0(%rcx), %zmm13
+ vmovups -0x80(%rcx), %zmm14
+ vmovups -0x40(%rcx), %zmm15
+ vmovups %zmm0, (%rdi)
+ vmovups %zmm1, 0x40(%rdi)
+ vmovups %zmm2, 0x80(%rdi)
+ vmovups %zmm3, 0xC0(%rdi)
+ vmovups %zmm4, 0x100(%rdi)
+ vmovups %zmm5, 0x140(%rdi)
+ vmovups %zmm6, 0x180(%rdi)
+ vmovups %zmm7, 0x1C0(%rdi)
+ vmovups %zmm8, -0x200(%r9)
+ vmovups %zmm9, -0x1C0(%r9)
+ vmovups %zmm10, -0x180(%r9)
+ vmovups %zmm11, -0x140(%r9)
+ vmovups %zmm12, -0x100(%r9)
+ vmovups %zmm13, -0xC0(%r9)
+ vmovups %zmm14, -0x80(%r9)
+ vmovups %zmm15, -0x40(%r9)
+ ret
+
+L(1024bytesormore):
+ cmp %rsi, %rdi
+ ja L(1024bytesormore_bkw)
+ sub $512, %r9
+ vmovups -0x200(%rcx), %zmm8
+ vmovups -0x1C0(%rcx), %zmm9
+ vmovups -0x180(%rcx), %zmm10
+ vmovups -0x140(%rcx), %zmm11
+ vmovups -0x100(%rcx), %zmm12
+ vmovups -0xC0(%rcx), %zmm13
+ vmovups -0x80(%rcx), %zmm14
+ vmovups -0x40(%rcx), %zmm15
+ prefetcht1 (%rsi)
+ prefetcht1 0x40(%rsi)
+ prefetcht1 0x80(%rsi)
+ prefetcht1 0xC0(%rsi)
+ prefetcht1 0x100(%rsi)
+ prefetcht1 0x140(%rsi)
+ prefetcht1 0x180(%rsi)
+ prefetcht1 0x1C0(%rsi)
+
+/* Loop with unaligned memory access. */
+L(gobble_512bytes_loop):
+ vmovups (%rsi), %zmm0
+ vmovups 0x40(%rsi), %zmm1
+ vmovups 0x80(%rsi), %zmm2
+ vmovups 0xC0(%rsi), %zmm3
+ vmovups 0x100(%rsi), %zmm4
+ vmovups 0x140(%rsi), %zmm5
+ vmovups 0x180(%rsi), %zmm6
+ vmovups 0x1C0(%rsi), %zmm7
+ add $512, %rsi
+ prefetcht1 (%rsi)
+ prefetcht1 0x40(%rsi)
+ prefetcht1 0x80(%rsi)
+ prefetcht1 0xC0(%rsi)
+ prefetcht1 0x100(%rsi)
+ prefetcht1 0x140(%rsi)
+ prefetcht1 0x180(%rsi)
+ prefetcht1 0x1C0(%rsi)
+ vmovups %zmm0, (%rdi)
+ vmovups %zmm1, 0x40(%rdi)
+ vmovups %zmm2, 0x80(%rdi)
+ vmovups %zmm3, 0xC0(%rdi)
+ vmovups %zmm4, 0x100(%rdi)
+ vmovups %zmm5, 0x140(%rdi)
+ vmovups %zmm6, 0x180(%rdi)
+ vmovups %zmm7, 0x1C0(%rdi)
+ add $512, %rdi
+ cmp %r9, %rdi
+ jb L(gobble_512bytes_loop)
+ vmovups %zmm8, (%r9)
+ vmovups %zmm9, 0x40(%r9)
+ vmovups %zmm10, 0x80(%r9)
+ vmovups %zmm11, 0xC0(%r9)
+ vmovups %zmm12, 0x100(%r9)
+ vmovups %zmm13, 0x140(%r9)
+ vmovups %zmm14, 0x180(%r9)
+ vmovups %zmm15, 0x1C0(%r9)
+ ret
+
+L(1024bytesormore_bkw):
+ add $512, %rdi
+ vmovups 0x1C0(%rsi), %zmm8
+ vmovups 0x180(%rsi), %zmm9
+ vmovups 0x140(%rsi), %zmm10
+ vmovups 0x100(%rsi), %zmm11
+ vmovups 0xC0(%rsi), %zmm12
+ vmovups 0x80(%rsi), %zmm13
+ vmovups 0x40(%rsi), %zmm14
+ vmovups (%rsi), %zmm15
+ prefetcht1 -0x40(%rcx)
+ prefetcht1 -0x80(%rcx)
+ prefetcht1 -0xC0(%rcx)
+ prefetcht1 -0x100(%rcx)
+ prefetcht1 -0x140(%rcx)
+ prefetcht1 -0x180(%rcx)
+ prefetcht1 -0x1C0(%rcx)
+ prefetcht1 -0x200(%rcx)
+
+/* Backward loop with unaligned memory access. */
+L(gobble_512bytes_loop_bkw):
+ vmovups -0x40(%rcx), %zmm0
+ vmovups -0x80(%rcx), %zmm1
+ vmovups -0xC0(%rcx), %zmm2
+ vmovups -0x100(%rcx), %zmm3
+ vmovups -0x140(%rcx), %zmm4
+ vmovups -0x180(%rcx), %zmm5
+ vmovups -0x1C0(%rcx), %zmm6
+ vmovups -0x200(%rcx), %zmm7
+ sub $512, %rcx
+ prefetcht1 -0x40(%rcx)
+ prefetcht1 -0x80(%rcx)
+ prefetcht1 -0xC0(%rcx)
+ prefetcht1 -0x100(%rcx)
+ prefetcht1 -0x140(%rcx)
+ prefetcht1 -0x180(%rcx)
+ prefetcht1 -0x1C0(%rcx)
+ prefetcht1 -0x200(%rcx)
+ vmovups %zmm0, -0x40(%r9)
+ vmovups %zmm1, -0x80(%r9)
+ vmovups %zmm2, -0xC0(%r9)
+ vmovups %zmm3, -0x100(%r9)
+ vmovups %zmm4, -0x140(%r9)
+ vmovups %zmm5, -0x180(%r9)
+ vmovups %zmm6, -0x1C0(%r9)
+ vmovups %zmm7, -0x200(%r9)
+ sub $512, %r9
+ cmp %rdi, %r9
+ ja L(gobble_512bytes_loop_bkw)
+ vmovups %zmm8, -0x40(%rdi)
+ vmovups %zmm9, -0x80(%rdi)
+ vmovups %zmm10, -0xC0(%rdi)
+ vmovups %zmm11, -0x100(%rdi)
+ vmovups %zmm12, -0x140(%rdi)
+ vmovups %zmm13, -0x180(%rdi)
+ vmovups %zmm14, -0x1C0(%rdi)
+ vmovups %zmm15, -0x200(%rdi)
+ ret
+
+L(preloop_large):
+ cmp %rsi, %rdi
+ ja L(preloop_large_bkw)
+ vmovups (%rsi), %zmm4
+ vmovups 0x40(%rsi), %zmm5
+
+/* Align destination for access with non-temporal stores in the loop. */
+ mov %rdi, %r8
+ and $-0x80, %rdi
+ add $0x80, %rdi
+ sub %rdi, %r8
+ sub %r8, %rsi
+ add %r8, %rdx
+L(gobble_256bytes_nt_loop):
+ prefetcht1 0x200(%rsi)
+ prefetcht1 0x240(%rsi)
+ prefetcht1 0x280(%rsi)
+ prefetcht1 0x2C0(%rsi)
+ prefetcht1 0x300(%rsi)
+ prefetcht1 0x340(%rsi)
+ prefetcht1 0x380(%rsi)
+ prefetcht1 0x3C0(%rsi)
+ vmovdqu64 (%rsi), %zmm0
+ vmovdqu64 0x40(%rsi), %zmm1
+ vmovdqu64 0x80(%rsi), %zmm2
+ vmovdqu64 0xC0(%rsi), %zmm3
+ vmovntdq %zmm0, (%rdi)
+ vmovntdq %zmm1, 0x40(%rdi)
+ vmovntdq %zmm2, 0x80(%rdi)
+ vmovntdq %zmm3, 0xC0(%rdi)
+ sub $256, %rdx
+ add $256, %rsi
+ add $256, %rdi
+ cmp $256, %rdx
+ ja L(gobble_256bytes_nt_loop)
+ sfence
+ vmovups %zmm4, (%rax)
+ vmovups %zmm5, 0x40(%rax)
+ jmp L(check)
+
+L(preloop_large_bkw):
+ vmovups -0x80(%rcx), %zmm4
+ vmovups -0x40(%rcx), %zmm5
+
+/* Align end of destination for access with non-temporal stores. */
+ mov %r9, %r8
+ and $-0x80, %r9
+ sub %r9, %r8
+ sub %r8, %rcx
+ sub %r8, %rdx
+ add %r9, %r8
+L(gobble_256bytes_nt_loop_bkw):
+ prefetcht1 -0x400(%rcx)
+ prefetcht1 -0x3C0(%rcx)
+ prefetcht1 -0x380(%rcx)
+ prefetcht1 -0x340(%rcx)
+ prefetcht1 -0x300(%rcx)
+ prefetcht1 -0x2C0(%rcx)
+ prefetcht1 -0x280(%rcx)
+ prefetcht1 -0x240(%rcx)
+ vmovdqu64 -0x100(%rcx), %zmm0
+ vmovdqu64 -0xC0(%rcx), %zmm1
+ vmovdqu64 -0x80(%rcx), %zmm2
+ vmovdqu64 -0x40(%rcx), %zmm3
+ vmovntdq %zmm0, -0x100(%r9)
+ vmovntdq %zmm1, -0xC0(%r9)
+ vmovntdq %zmm2, -0x80(%r9)
+ vmovntdq %zmm3, -0x40(%r9)
+ sub $256, %rdx
+ sub $256, %rcx
+ sub $256, %r9
+ cmp $256, %rdx
+ ja L(gobble_256bytes_nt_loop_bkw)
+ sfence
+ vmovups %zmm4, -0x80(%r8)
+ vmovups %zmm5, -0x40(%r8)
+ jmp L(check)
+END (__memmove_avx512_no_vzeroupper)
+
+# ifdef SHARED
+strong_alias (__memmove_avx512_no_vzeroupper, __memcpy_avx512_no_vzeroupper)
+strong_alias (__memmove_chk_avx512_no_vzeroupper, __memcpy_chk_avx512_no_vzeroupper)
+# endif
+#endif
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=85f40c56ef1e0a1716a85943bfd2824663b976c0
commit 85f40c56ef1e0a1716a85943bfd2824663b976c0
Author: H.J. Lu <hjl.tools@gmail.com>
Date: Sun Mar 6 13:37:31 2016 -0800
Implement x86-64 multiarch mempcpy in memcpy
Implement x86-64 multiarch mempcpy in memcpy to share most of code.
It will reduce code size of libc.so.
[BZ #18858]
* sysdeps/x86_64/multiarch/Makefile (sysdep_routines): Remove
mempcpy-ssse3, mempcpy-ssse3-back, mempcpy-avx-unaligned
and mempcpy-avx512-no-vzeroupper.
* sysdeps/x86_64/multiarch/memcpy-avx-unaligned.S (MEMPCPY_CHK):
New.
(MEMPCPY): Likewise.
* sysdeps/x86_64/multiarch/memcpy-avx512-no-vzeroupper.S
(MEMPCPY_CHK): New.
(MEMPCPY): Likewise.
* sysdeps/x86_64/multiarch/memcpy-ssse3-back.S (MEMPCPY_CHK): New.
(MEMPCPY): Likewise.
* sysdeps/x86_64/multiarch/memcpy-ssse3.S (MEMPCPY_CHK): New.
(MEMPCPY): Likewise.
* sysdeps/x86_64/multiarch/mempcpy-avx-unaligned.S: Removed.
* sysdeps/x86_64/multiarch/mempcpy-avx512-no-vzeroupper.S:
Likewise.
* sysdeps/x86_64/multiarch/mempcpy-ssse3-back.S: Likewise.
* sysdeps/x86_64/multiarch/mempcpy-ssse3.S: Likewise.
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index d234f4a..39c0905 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -8,10 +8,10 @@ ifeq ($(subdir),string)
sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 \
strcmp-sse2-unaligned strncmp-ssse3 \
memcmp-sse4 memcpy-ssse3 memcpy-sse2-unaligned \
- memcpy-avx512-no-vzeroupper mempcpy-ssse3 memmove-ssse3 \
- memcpy-ssse3-back mempcpy-ssse3-back memmove-avx-unaligned \
- memcpy-avx-unaligned mempcpy-avx-unaligned \
- mempcpy-avx512-no-vzeroupper memmove-ssse3-back \
+ memcpy-avx512-no-vzeroupper memmove-ssse3 \
+ memcpy-ssse3-back memmove-avx-unaligned \
+ memcpy-avx-unaligned \
+ memmove-ssse3-back \
memmove-avx512-no-vzeroupper strcasecmp_l-ssse3 \
strncase_l-ssse3 strcat-ssse3 strncat-ssse3\
strcpy-ssse3 strncpy-ssse3 stpcpy-ssse3 stpncpy-ssse3 \
diff --git a/sysdeps/x86_64/multiarch/memcpy-avx-unaligned.S b/sysdeps/x86_64/multiarch/memcpy-avx-unaligned.S
index b615d06..dd4187f 100644
--- a/sysdeps/x86_64/multiarch/memcpy-avx-unaligned.S
+++ b/sysdeps/x86_64/multiarch/memcpy-avx-unaligned.S
@@ -25,11 +25,26 @@
#include "asm-syntax.h"
#ifndef MEMCPY
-# define MEMCPY __memcpy_avx_unaligned
+# define MEMCPY __memcpy_avx_unaligned
# define MEMCPY_CHK __memcpy_chk_avx_unaligned
+# define MEMPCPY __mempcpy_avx_unaligned
+# define MEMPCPY_CHK __mempcpy_chk_avx_unaligned
#endif
.section .text.avx,"ax",@progbits
+#if !defined USE_AS_MEMPCPY && !defined USE_AS_MEMMOVE
+ENTRY (MEMPCPY_CHK)
+ cmpq %rdx, %rcx
+ jb HIDDEN_JUMPTARGET (__chk_fail)
+END (MEMPCPY_CHK)
+
+ENTRY (MEMPCPY)
+ movq %rdi, %rax
+ addq %rdx, %rax
+ jmp L(start)
+END (MEMPCPY)
+#endif
+
#if !defined USE_AS_BCOPY
ENTRY (MEMCPY_CHK)
cmpq %rdx, %rcx
@@ -42,6 +57,7 @@ ENTRY (MEMCPY)
#ifdef USE_AS_MEMPCPY
add %rdx, %rax
#endif
+L(start):
cmp $256, %rdx
jae L(256bytesormore)
cmp $16, %dl
diff --git a/sysdeps/x86_64/multiarch/memcpy-avx512-no-vzeroupper.S b/sysdeps/x86_64/multiarch/memcpy-avx512-no-vzeroupper.S
index 3d567fc..285bb83 100644
--- a/sysdeps/x86_64/multiarch/memcpy-avx512-no-vzeroupper.S
+++ b/sysdeps/x86_64/multiarch/memcpy-avx512-no-vzeroupper.S
@@ -27,9 +27,24 @@
#ifndef MEMCPY
# define MEMCPY __memcpy_avx512_no_vzeroupper
# define MEMCPY_CHK __memcpy_chk_avx512_no_vzeroupper
+# define MEMPCPY __mempcpy_avx512_no_vzeroupper
+# define MEMPCPY_CHK __mempcpy_chk_avx512_no_vzeroupper
#endif
.section .text.avx512,"ax",@progbits
+#if !defined USE_AS_MEMPCPY && !defined USE_AS_MEMMOVE
+ENTRY (MEMPCPY_CHK)
+ cmpq %rdx, %rcx
+ jb HIDDEN_JUMPTARGET (__chk_fail)
+END (MEMPCPY_CHK)
+
+ENTRY (MEMPCPY)
+ movq %rdi, %rax
+ addq %rdx, %rax
+ jmp L(start)
+END (MEMPCPY)
+#endif
+
#if !defined USE_AS_BCOPY
ENTRY (MEMCPY_CHK)
cmpq %rdx, %rcx
@@ -42,6 +57,7 @@ ENTRY (MEMCPY)
#ifdef USE_AS_MEMPCPY
add %rdx, %rax
#endif
+L(start):
lea (%rsi, %rdx), %rcx
lea (%rdi, %rdx), %r9
cmp $512, %rdx
diff --git a/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S b/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S
index 08b41e9..b4890f4 100644
--- a/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S
+++ b/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S
@@ -29,6 +29,8 @@
#ifndef MEMCPY
# define MEMCPY __memcpy_ssse3_back
# define MEMCPY_CHK __memcpy_chk_ssse3_back
+# define MEMPCPY __mempcpy_ssse3_back
+# define MEMPCPY_CHK __mempcpy_chk_ssse3_back
#endif
#define JMPTBL(I, B) I - B
@@ -44,6 +46,19 @@
ud2
.section .text.ssse3,"ax",@progbits
+#if !defined USE_AS_MEMPCPY && !defined USE_AS_MEMMOVE
+ENTRY (MEMPCPY_CHK)
+ cmpq %rdx, %rcx
+ jb HIDDEN_JUMPTARGET (__chk_fail)
+END (MEMPCPY_CHK)
+
+ENTRY (MEMPCPY)
+ movq %rdi, %rax
+ addq %rdx, %rax
+ jmp L(start)
+END (MEMPCPY)
+#endif
+
#if !defined USE_AS_BCOPY
ENTRY (MEMCPY_CHK)
cmpq %rdx, %rcx
@@ -66,6 +81,7 @@ ENTRY (MEMCPY)
BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
L(copy_forward):
#endif
+L(start):
cmp $144, %rdx
jae L(144bytesormore)
diff --git a/sysdeps/x86_64/multiarch/memcpy-ssse3.S b/sysdeps/x86_64/multiarch/memcpy-ssse3.S
index 95de969..1ca88c0 100644
--- a/sysdeps/x86_64/multiarch/memcpy-ssse3.S
+++ b/sysdeps/x86_64/multiarch/memcpy-ssse3.S
@@ -29,6 +29,8 @@
#ifndef MEMCPY
# define MEMCPY __memcpy_ssse3
# define MEMCPY_CHK __memcpy_chk_ssse3
+# define MEMPCPY __mempcpy_ssse3
+# define MEMPCPY_CHK __mempcpy_chk_ssse3
#endif
#define JMPTBL(I, B) I - B
@@ -44,6 +46,19 @@
ud2
.section .text.ssse3,"ax",@progbits
+#if !defined USE_AS_MEMPCPY && !defined USE_AS_MEMMOVE
+ENTRY (MEMPCPY_CHK)
+ cmpq %rdx, %rcx
+ jb HIDDEN_JUMPTARGET (__chk_fail)
+END (MEMPCPY_CHK)
+
+ENTRY (MEMPCPY)
+ movq %rdi, %rax
+ addq %rdx, %rax
+ jmp L(start)
+END (MEMPCPY)
+#endif
+
#if !defined USE_AS_BCOPY
ENTRY (MEMCPY_CHK)
cmpq %rdx, %rcx
@@ -66,6 +81,7 @@ ENTRY (MEMCPY)
jmp L(copy_backward)
L(copy_forward):
#endif
+L(start):
cmp $79, %rdx
lea L(table_less_80bytes)(%rip), %r11
ja L(80bytesormore)
diff --git a/sysdeps/x86_64/multiarch/mempcpy-avx-unaligned.S b/sysdeps/x86_64/multiarch/mempcpy-avx-unaligned.S
deleted file mode 100644
index 241378e..0000000
--- a/sysdeps/x86_64/multiarch/mempcpy-avx-unaligned.S
+++ /dev/null
@@ -1,22 +0,0 @@
-/* mempcpy with AVX
- Copyright (C) 2014-2016 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-#define USE_AS_MEMPCPY
-#define MEMCPY __mempcpy_avx_unaligned
-#define MEMCPY_CHK __mempcpy_chk_avx_unaligned
-#include "memcpy-avx-unaligned.S"
diff --git a/sysdeps/x86_64/multiarch/mempcpy-avx512-no-vzeroupper.S b/sysdeps/x86_64/multiarch/mempcpy-avx512-no-vzeroupper.S
deleted file mode 100644
index fcc0945..0000000
--- a/sysdeps/x86_64/multiarch/mempcpy-avx512-no-vzeroupper.S
+++ /dev/null
@@ -1,22 +0,0 @@
-/* mempcpy optimized with AVX512 for KNL hardware.
- Copyright (C) 2016 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-#define USE_AS_MEMPCPY
-#define MEMCPY __mempcpy_avx512_no_vzeroupper
-#define MEMCPY_CHK __mempcpy_chk_avx512_no_vzeroupper
-#include "memcpy-avx512-no-vzeroupper.S"
diff --git a/sysdeps/x86_64/multiarch/mempcpy-ssse3-back.S b/sysdeps/x86_64/multiarch/mempcpy-ssse3-back.S
deleted file mode 100644
index 82ffacb..0000000
--- a/sysdeps/x86_64/multiarch/mempcpy-ssse3-back.S
+++ /dev/null
@@ -1,4 +0,0 @@
-#define USE_AS_MEMPCPY
-#define MEMCPY __mempcpy_ssse3_back
-#define MEMCPY_CHK __mempcpy_chk_ssse3_back
-#include "memcpy-ssse3-back.S"
diff --git a/sysdeps/x86_64/multiarch/mempcpy-ssse3.S b/sysdeps/x86_64/multiarch/mempcpy-ssse3.S
deleted file mode 100644
index 822d98e..0000000
--- a/sysdeps/x86_64/multiarch/mempcpy-ssse3.S
+++ /dev/null
@@ -1,4 +0,0 @@
-#define USE_AS_MEMPCPY
-#define MEMCPY __mempcpy_ssse3
-#define MEMCPY_CHK __mempcpy_chk_ssse3
-#include "memcpy-ssse3.S"
-----------------------------------------------------------------------
hooks/post-receive
--
GNU C Library master sources