This is the mail archive of the libc-alpha@sourceware.org mailing list for the glibc project.
[committed, PATCH] X86-64: Prepare memmove-vec-unaligned-erms.S
- From: "H.J. Lu" <hongjiu dot lu at intel dot com>
- To: GNU C Library <libc-alpha at sourceware dot org>
- Date: Wed, 6 Apr 2016 10:26:13 -0700
- Subject: [committed, PATCH] X86-64: Prepare memmove-vec-unaligned-erms.S
- Reply-to: "H.J. Lu" <hjl dot tools at gmail dot com>
Prepare memmove-vec-unaligned-erms.S to make the SSE2 version the
default memcpy, mempcpy and memmove.
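
The new MEMCPY_SYMBOL, MEMPCPY_SYMBOL and MEMMOVE_CHK_SYMBOL macros all
fall back to MEMMOVE_SYMBOL when they are not defined, so a plain
(non-multiarch) build can later use this file as the default
implementation simply by pre-defining the symbol macros before
including it.  A minimal sketch of such a wrapper (the file name and
macro values are illustrative assumptions, not part of this patch):

    /* Hypothetical sysdeps/x86_64/memmove.S: build the SSE2 code in
       memmove-vec-unaligned-erms.S as the default symbols.  */
    #define VEC_SIZE			16
    #define VEC(i)			xmm##i
    #define VMOVU			movups
    #define VMOVA			movaps
    #define SECTION(p)			p
    #define MEMMOVE_SYMBOL(p,s)		memmove
    #define MEMMOVE_CHK_SYMBOL(p,s)	p
    #define MEMPCPY_SYMBOL(p,s)		__mempcpy
    #define MEMCPY_SYMBOL(p,s)		memcpy
    #include "multiarch/memmove-vec-unaligned-erms.S"

With that mapping the unaligned_2 entry points come out as memmove,
__mempcpy, __memmove_chk and __mempcpy_chk, and the new strong_alias at
the end of the file provides memcpy.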
Tested on x86-64. Checked in.
H.J.
---
* sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
(MEMCPY_SYMBOL): New.
(MEMPCPY_SYMBOL): Likewise.
(MEMMOVE_CHK_SYMBOL): Likewise.
Replace MEMMOVE_SYMBOL with MEMMOVE_CHK_SYMBOL on __mempcpy_chk
symbols. Replace MEMMOVE_SYMBOL with MEMPCPY_SYMBOL on
__mempcpy symbols. Provide alias for __memcpy_chk in libc.a.
Provide alias for memcpy in libc.a and ld.so.
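
These ChangeLog items add one level of indirection without changing the
symbols the existing multiarch wrappers generate: a wrapper that only
defines MEMMOVE_SYMBOL still produces the same names, because the new
macros default to it.  A hypothetical AVX wrapper, shown only to
illustrate the expansion (not taken from the tree):

    #define VEC_SIZE			32
    #define VEC(i)			ymm##i
    #define VMOVU			vmovdqu
    #define VMOVA			vmovdqa
    #define SECTION(p)			p##.avx
    #define MEMMOVE_SYMBOL(p,s)		p##_avx_##s
    #include "memmove-vec-unaligned-erms.S"

    /* MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_2) and
       MEMPCPY_SYMBOL (__mempcpy, unaligned_2) still expand to
       __mempcpy_chk_avx_unaligned_2 and __mempcpy_avx_unaligned_2,
       exactly as MEMMOVE_SYMBOL did before.  */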
---
ChangeLog | 11 ++
.../x86_64/multiarch/memmove-vec-unaligned-erms.S | 138 +++++++++++++--------
2 files changed, 95 insertions(+), 54 deletions(-)
diff --git a/ChangeLog b/ChangeLog
index c801aff..a10b8c4 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,16 @@
2016-04-06 H.J. Lu <hongjiu.lu@intel.com>
+ * sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
+ (MEMCPY_SYMBOL): New.
+ (MEMPCPY_SYMBOL): Likewise.
+ (MEMMOVE_CHK_SYMBOL): Likewise.
+ Replace MEMMOVE_SYMBOL with MEMMOVE_CHK_SYMBOL on __mempcpy_chk
+ symbols. Replace MEMMOVE_SYMBOL with MEMPCPY_SYMBOL on
+ __mempcpy symbols. Provide alias for __memcpy_chk in libc.a.
+ Provide alias for memcpy in libc.a and ld.so.
+
+2016-04-06 H.J. Lu <hongjiu.lu@intel.com>
+
* sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
(MEMSET_CHK_SYMBOL): New. Define if not defined.
(__bzero): Check VEC_SIZE == 16 instead of USE_MULTIARCH.
diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
index 66779a3..8a60d0f 100644
--- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
@@ -32,18 +32,27 @@
8 * VEC_SIZE at a time.
8. Otherwise, forward copy 8 * VEC_SIZE at a time. */
-#if IS_IN (libc)
+#include <sysdep.h>
-# include <sysdep.h>
-# include "asm-syntax.h"
+#ifndef MEMCPY_SYMBOL
+# define MEMCPY_SYMBOL(p,s) MEMMOVE_SYMBOL(p, s)
+#endif
-# ifndef VZEROUPPER
-# if VEC_SIZE > 16
-# define VZEROUPPER vzeroupper
-# else
-# define VZEROUPPER
-# endif
+#ifndef MEMPCPY_SYMBOL
+# define MEMPCPY_SYMBOL(p,s) MEMMOVE_SYMBOL(p, s)
+#endif
+
+#ifndef MEMMOVE_CHK_SYMBOL
+# define MEMMOVE_CHK_SYMBOL(p,s) MEMMOVE_SYMBOL(p, s)
+#endif
+
+#ifndef VZEROUPPER
+# if VEC_SIZE > 16
+# define VZEROUPPER vzeroupper
+# else
+# define VZEROUPPER
# endif
+#endif
/* Threshold to use Enhanced REP MOVSB. Since there is overhead to set
up REP MOVSB operation, REP MOVSB isn't faster on short data. The
@@ -52,32 +61,36 @@
on processors with Enhanced REP MOVSB. Since larger register size
can move more data with a single load and store, the threshold is
higher with larger register size. */
-# ifndef REP_MOVSB_THRESHOLD
-# define REP_MOVSB_THRESHOLD (2048 * (VEC_SIZE / 16))
-# endif
+#ifndef REP_MOVSB_THRESHOLD
+# define REP_MOVSB_THRESHOLD (2048 * (VEC_SIZE / 16))
+#endif
-# ifndef SECTION
-# error SECTION is not defined!
-# endif
- .section SECTION(.text),"ax",@progbits
+#ifndef SECTION
+# error SECTION is not defined!
+#endif
-# ifdef SHARED
-ENTRY (MEMMOVE_SYMBOL (__mempcpy_chk, unaligned_2))
+ .section SECTION(.text),"ax",@progbits
+#if defined SHARED && IS_IN (libc)
+ENTRY (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_2))
cmpq %rdx, %rcx
jb HIDDEN_JUMPTARGET (__chk_fail)
-END (MEMMOVE_SYMBOL (__mempcpy_chk, unaligned_2))
+END (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_2))
+#endif
-ENTRY (MEMMOVE_SYMBOL (__mempcpy, unaligned_2))
+#if VEC_SIZE == 16 || defined SHARED
+ENTRY (MEMPCPY_SYMBOL (__mempcpy, unaligned_2))
movq %rdi, %rax
addq %rdx, %rax
jmp L(start)
-END (MEMMOVE_SYMBOL (__mempcpy, unaligned_2))
+END (MEMPCPY_SYMBOL (__mempcpy, unaligned_2))
+#endif
-ENTRY (MEMMOVE_SYMBOL (__memmove_chk, unaligned_2))
+#if defined SHARED && IS_IN (libc)
+ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_2))
cmpq %rdx, %rcx
jb HIDDEN_JUMPTARGET (__chk_fail)
-END (MEMMOVE_SYMBOL (__memmove_chk, unaligned_2))
-# endif
+END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_2))
+#endif
ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned_2))
movq %rdi, %rax
@@ -86,24 +99,29 @@ L(start):
jb L(less_vec)
cmpq $(VEC_SIZE * 2), %rdx
ja L(more_2x_vec)
+#if !defined USE_MULTIARCH || !IS_IN (libc)
+L(last_2x_vec):
+#endif
/* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */
VMOVU (%rsi), %VEC(0)
VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(1)
VMOVU %VEC(0), (%rdi)
VMOVU %VEC(1), -VEC_SIZE(%rdi,%rdx)
VZEROUPPER
+#if !defined USE_MULTIARCH || !IS_IN (libc)
+L(nop):
+#endif
ret
+#if defined USE_MULTIARCH && IS_IN (libc)
END (MEMMOVE_SYMBOL (__memmove, unaligned_2))
-# if VEC_SIZE == 16
+# if VEC_SIZE == 16 && defined SHARED
/* Only used to measure performance of REP MOVSB. */
-# ifdef SHARED
ENTRY (__mempcpy_erms)
movq %rdi, %rax
addq %rdx, %rax
jmp L(start_movsb)
END (__mempcpy_erms)
-# endif
ENTRY (__memmove_erms)
movq %rdi, %rax
@@ -132,11 +150,10 @@ strong_alias (__memmove_erms, __memcpy_erms)
# endif
# ifdef SHARED
-ENTRY (MEMMOVE_SYMBOL (__mempcpy_chk, unaligned_erms))
+ENTRY (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_erms))
cmpq %rdx, %rcx
jb HIDDEN_JUMPTARGET (__chk_fail)
-END (MEMMOVE_SYMBOL (__mempcpy_chk, unaligned_erms))
-# endif
+END (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_erms))
ENTRY (MEMMOVE_SYMBOL (__mempcpy, unaligned_erms))
movq %rdi, %rax
@@ -144,11 +161,10 @@ ENTRY (MEMMOVE_SYMBOL (__mempcpy, unaligned_erms))
jmp L(start_erms)
END (MEMMOVE_SYMBOL (__mempcpy, unaligned_erms))
-# ifdef SHARED
-ENTRY (MEMMOVE_SYMBOL (__memmove_chk, unaligned_erms))
+ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms))
cmpq %rdx, %rcx
jb HIDDEN_JUMPTARGET (__chk_fail)
-END (MEMMOVE_SYMBOL (__memmove_chk, unaligned_erms))
+END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms))
# endif
ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned_erms))
@@ -192,6 +208,7 @@ L(movsb_more_2x_vec):
/* Force 32-bit displacement to avoid long nop between
instructions. */
ja.d32 L(movsb)
+#endif
.p2align 4
L(more_2x_vec):
/* More than 2 * VEC. */
@@ -227,13 +244,19 @@ L(copy_forward):
VMOVU %VEC(2), -(VEC_SIZE * 3)(%rdi,%rdx)
VMOVU %VEC(3), -(VEC_SIZE * 4)(%rdi,%rdx)
cmpq $(VEC_SIZE * 8), %rdx
-# if VEC_SIZE == 16
+#if VEC_SIZE == 16
+# if defined USE_MULTIARCH && IS_IN (libc)
jbe L(return)
# else
+ /* Use 32-bit displacement to avoid long nop between
+ instructions. */
+ jbe.d32 L(return)
+# endif
+#else
/* Use 8-bit displacement to avoid long nop between
instructions. */
jbe L(return_disp8)
-# endif
+#endif
leaq (VEC_SIZE * 4)(%rdi), %rcx
addq %rdi, %rdx
andq $-(VEC_SIZE * 4), %rdx
@@ -263,22 +286,25 @@ L(loop):
addq $(VEC_SIZE * 4), %rcx
cmpq %rcx, %rdx
jne L(loop)
+#if !defined USE_MULTIARCH || !IS_IN (libc)
+L(return):
+#endif
L(return_disp8):
VZEROUPPER
ret
L(less_vec):
/* Less than 1 VEC. */
-# if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
-# error Unsupported VEC_SIZE!
-# endif
-# if VEC_SIZE > 32
+#if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
+# error Unsupported VEC_SIZE!
+#endif
+#if VEC_SIZE > 32
cmpb $32, %dl
jae L(between_32_63)
-# endif
-# if VEC_SIZE > 16
+#endif
+#if VEC_SIZE > 16
cmpb $16, %dl
jae L(between_16_31)
-# endif
+#endif
cmpb $8, %dl
jae L(between_8_15)
cmpb $4, %dl
@@ -290,7 +316,7 @@ L(less_vec):
movb %cl, (%rdi)
1:
ret
-# if VEC_SIZE > 32
+#if VEC_SIZE > 32
L(between_32_63):
/* From 32 to 63. No branch when size == 32. */
vmovdqu (%rsi), %ymm0
@@ -299,8 +325,8 @@ L(between_32_63):
vmovdqu %ymm1, -32(%rdi,%rdx)
VZEROUPPER
ret
-# endif
-# if VEC_SIZE > 16
+#endif
+#if VEC_SIZE > 16
/* From 16 to 31. No branch when size == 16. */
L(between_16_31):
vmovdqu (%rsi), %xmm0
@@ -308,7 +334,7 @@ L(between_16_31):
vmovdqu %xmm0, (%rdi)
vmovdqu %xmm1, -16(%rdi,%rdx)
ret
-# endif
+#endif
L(between_8_15):
/* From 8 to 15. No branch when size == 8. */
movq -8(%rsi,%rdx), %rcx
@@ -331,10 +357,10 @@ L(between_2_3):
movw %si, (%rdi)
ret
-# if VEC_SIZE > 16
+#if VEC_SIZE > 16
/* Align to 16 bytes to avoid long nop between instructions. */
.p2align 4
-# endif
+#endif
L(more_2x_vec_overlap):
/* More than 2 * VEC and there is overlap bewteen destination
and source. */
@@ -454,15 +480,19 @@ L(loop_8x_vec_backward):
jmp L(between_4x_vec_and_8x_vec)
END (MEMMOVE_SYMBOL (__memmove, unaligned_erms))
-# ifdef SHARED
+#ifdef SHARED
+# if IS_IN (libc)
+# ifdef USE_MULTIARCH
strong_alias (MEMMOVE_SYMBOL (__memmove, unaligned_erms),
MEMMOVE_SYMBOL (__memcpy, unaligned_erms))
strong_alias (MEMMOVE_SYMBOL (__memmove_chk, unaligned_erms),
MEMMOVE_SYMBOL (__memcpy_chk, unaligned_erms))
-strong_alias (MEMMOVE_SYMBOL (__memmove, unaligned_2),
- MEMMOVE_SYMBOL (__memcpy, unaligned_2))
-strong_alias (MEMMOVE_SYMBOL (__memmove_chk, unaligned_2),
- MEMMOVE_SYMBOL (__memcpy_chk, unaligned_2))
+# endif
+strong_alias (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_2),
+ MEMMOVE_CHK_SYMBOL (__memcpy_chk, unaligned_2))
# endif
-
+#endif
+#if VEC_SIZE == 16 || defined SHARED
+strong_alias (MEMMOVE_SYMBOL (__memmove, unaligned_2),
+ MEMCPY_SYMBOL (__memcpy, unaligned_2))
#endif
--
2.5.5