This is the mail archive of the libc-alpha@sourceware.org mailing list for the glibc project.
[committed, PATCH] X86-64: Prepare memmove-vec-unaligned-erms.S
- From: "H.J. Lu" <hongjiu dot lu at intel dot com>
- To: GNU C Library <libc-alpha at sourceware dot org>
- Date: Wed, 6 Apr 2016 10:26:13 -0700
- Subject: [committed, PATCH] X86-64: Prepare memmove-vec-unaligned-erms.S
- Reply-to: "H.J. Lu" <hjl dot tools at gmail dot com>
Prepare memmove-vec-unaligned-erms.S to make the SSE2 version the
default memcpy, mempcpy and memmove.
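
The new MEMCPY_SYMBOL, MEMPCPY_SYMBOL and MEMMOVE_CHK_SYMBOL macros all
fall back to MEMMOVE_SYMBOL when they are not defined, so a plain
(non-multiarch) build can later use this file as the default
implementation simply by pre-defining the symbol macros before
including it.  A minimal sketch of such a wrapper (the file name and
macro values are illustrative assumptions, not part of this patch):

    /* Hypothetical sysdeps/x86_64/memmove.S: build the SSE2 code in
       memmove-vec-unaligned-erms.S as the default symbols.  */
    #define VEC_SIZE			16
    #define VEC(i)			xmm##i
    #define VMOVU			movups
    #define VMOVA			movaps
    #define SECTION(p)			p
    #define MEMMOVE_SYMBOL(p,s)		memmove
    #define MEMMOVE_CHK_SYMBOL(p,s)	p
    #define MEMPCPY_SYMBOL(p,s)		__mempcpy
    #define MEMCPY_SYMBOL(p,s)		memcpy
    #include "multiarch/memmove-vec-unaligned-erms.S"

With that mapping the unaligned_2 entry points come out as memmove,
__mempcpy, __memmove_chk and __mempcpy_chk, and the new strong_alias at
the end of the file provides memcpy.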
Tested on x86-64. Checked in.
H.J.
---
* sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
(MEMCPY_SYMBOL): New.
(MEMPCPY_SYMBOL): Likewise.
(MEMMOVE_CHK_SYMBOL): Likewise.
Replace MEMMOVE_SYMBOL with MEMMOVE_CHK_SYMBOL on __mempcpy_chk
symbols. Replace MEMMOVE_SYMBOL with MEMPCPY_SYMBOL on
__mempcpy symbols. Provide alias for __memcpy_chk in libc.a.
Provide alias for memcpy in libc.a and ld.so.
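
These ChangeLog items add one level of indirection without changing the
symbols the existing multiarch wrappers generate: a wrapper that only
defines MEMMOVE_SYMBOL still produces the same names, because the new
macros default to it.  A hypothetical AVX wrapper, shown only to
illustrate the expansion (not taken from the tree):

    #define VEC_SIZE			32
    #define VEC(i)			ymm##i
    #define VMOVU			vmovdqu
    #define VMOVA			vmovdqa
    #define SECTION(p)			p##.avx
    #define MEMMOVE_SYMBOL(p,s)		p##_avx_##s
    #include "memmove-vec-unaligned-erms.S"

    /* MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_2) and
       MEMPCPY_SYMBOL (__mempcpy, unaligned_2) still expand to
       __mempcpy_chk_avx_unaligned_2 and __mempcpy_avx_unaligned_2,
       exactly as MEMMOVE_SYMBOL did before.  */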
---
ChangeLog | 11 ++
.../x86_64/multiarch/memmove-vec-unaligned-erms.S | 138 +++++++++++++--------
2 files changed, 95 insertions(+), 54 deletions(-)
diff --git a/ChangeLog b/ChangeLog
index c801aff..a10b8c4 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,16 @@
2016-04-06 H.J. Lu <hongjiu.lu@intel.com>
+ * sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
+ (MEMCPY_SYMBOL): New.
+ (MEMPCPY_SYMBOL): Likewise.
+ (MEMMOVE_CHK_SYMBOL): Likewise.
+ Replace MEMMOVE_SYMBOL with MEMMOVE_CHK_SYMBOL on __mempcpy_chk
+ symbols. Replace MEMMOVE_SYMBOL with MEMPCPY_SYMBOL on
+ __mempcpy symbols. Provide alias for __memcpy_chk in libc.a.
+ Provide alias for memcpy in libc.a and ld.so.
+
+2016-04-06 H.J. Lu <hongjiu.lu@intel.com>
+
* sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
(MEMSET_CHK_SYMBOL): New. Define if not defined.
(__bzero): Check VEC_SIZE == 16 instead of USE_MULTIARCH.
diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
index 66779a3..8a60d0f 100644
--- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
@@ -32,18 +32,27 @@
8 * VEC_SIZE at a time.
8. Otherwise, forward copy 8 * VEC_SIZE at a time. */
-#if IS_IN (libc)
+#include <sysdep.h>
-# include <sysdep.h>
-# include "asm-syntax.h"
+#ifndef MEMCPY_SYMBOL
+# define MEMCPY_SYMBOL(p,s) MEMMOVE_SYMBOL(p, s)
+#endif
-# ifndef VZEROUPPER
-# if VEC_SIZE > 16
-# define VZEROUPPER vzeroupper
-# else
-# define VZEROUPPER
-# endif
+#ifndef MEMPCPY_SYMBOL
+# define MEMPCPY_SYMBOL(p,s) MEMMOVE_SYMBOL(p, s)
+#endif
+
+#ifndef MEMMOVE_CHK_SYMBOL
+# define MEMMOVE_CHK_SYMBOL(p,s) MEMMOVE_SYMBOL(p, s)
+#endif
+
+#ifndef VZEROUPPER
+# if VEC_SIZE > 16
+# define VZEROUPPER vzeroupper
+# else
+# define VZEROUPPER
# endif
+#endif
/* Threshold to use Enhanced REP MOVSB. Since there is overhead to set
up REP MOVSB operation, REP MOVSB isn't faster on short data. The
@@ -52,32 +61,36 @@
on processors with Enhanced REP MOVSB. Since larger register size
can move more data with a single load and store, the threshold is
higher with larger register size. */
-# ifndef REP_MOVSB_THRESHOLD
-# define REP_MOVSB_THRESHOLD (2048 * (VEC_SIZE / 16))
-# endif
+#ifndef REP_MOVSB_THRESHOLD
+# define REP_MOVSB_THRESHOLD (2048 * (VEC_SIZE / 16))
+#endif
-# ifndef SECTION
-# error SECTION is not defined!
-# endif
- .section SECTION(.text),"ax",@progbits
+#ifndef SECTION
+# error SECTION is not defined!
+#endif
-# ifdef SHARED
-ENTRY (MEMMOVE_SYMBOL (__mempcpy_chk, unaligned_2))
+ .section SECTION(.text),"ax",@progbits
+#if defined SHARED && IS_IN (libc)
+ENTRY (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_2))
cmpq %rdx, %rcx
jb HIDDEN_JUMPTARGET (__chk_fail)
-END (MEMMOVE_SYMBOL (__mempcpy_chk, unaligned_2))
+END (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_2))
+#endif
-ENTRY (MEMMOVE_SYMBOL (__mempcpy, unaligned_2))
+#if VEC_SIZE == 16 || defined SHARED
+ENTRY (MEMPCPY_SYMBOL (__mempcpy, unaligned_2))
movq %rdi, %rax
addq %rdx, %rax
jmp L(start)
-END (MEMMOVE_SYMBOL (__mempcpy, unaligned_2))
+END (MEMPCPY_SYMBOL (__mempcpy, unaligned_2))
+#endif
-ENTRY (MEMMOVE_SYMBOL (__memmove_chk, unaligned_2))
+#if defined SHARED && IS_IN (libc)
+ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_2))
cmpq %rdx, %rcx
jb HIDDEN_JUMPTARGET (__chk_fail)
-END (MEMMOVE_SYMBOL (__memmove_chk, unaligned_2))
-# endif
+END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_2))
+#endif
ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned_2))
movq %rdi, %rax
@@ -86,24 +99,29 @@ L(start):
jb L(less_vec)
cmpq $(VEC_SIZE * 2), %rdx
ja L(more_2x_vec)
+#if !defined USE_MULTIARCH || !IS_IN (libc)
+L(last_2x_vec):
+#endif
/* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */
VMOVU (%rsi), %VEC(0)
VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(1)
VMOVU %VEC(0), (%rdi)
VMOVU %VEC(1), -VEC_SIZE(%rdi,%rdx)
VZEROUPPER
+#if !defined USE_MULTIARCH || !IS_IN (libc)
+L(nop):
+#endif
ret
+#if defined USE_MULTIARCH && IS_IN (libc)
END (MEMMOVE_SYMBOL (__memmove, unaligned_2))
-# if VEC_SIZE == 16
+# if VEC_SIZE == 16 && defined SHARED
/* Only used to measure performance of REP MOVSB. */
-# ifdef SHARED
ENTRY (__mempcpy_erms)
movq %rdi, %rax
addq %rdx, %rax
jmp L(start_movsb)
END (__mempcpy_erms)
-# endif
ENTRY (__memmove_erms)
movq %rdi, %rax
@@ -132,11 +150,10 @@ strong_alias (__memmove_erms, __memcpy_erms)
# endif
# ifdef SHARED
-ENTRY (MEMMOVE_SYMBOL (__mempcpy_chk, unaligned_erms))
+ENTRY (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_erms))
cmpq %rdx, %rcx
jb HIDDEN_JUMPTARGET (__chk_fail)
-END (MEMMOVE_SYMBOL (__mempcpy_chk, unaligned_erms))
-# endif
+END (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_erms))
ENTRY (MEMMOVE_SYMBOL (__mempcpy, unaligned_erms))
movq %rdi, %rax
@@ -144,11 +161,10 @@ ENTRY (MEMMOVE_SYMBOL (__mempcpy, unaligned_erms))
jmp L(start_erms)
END (MEMMOVE_SYMBOL (__mempcpy, unaligned_erms))
-# ifdef SHARED
-ENTRY (MEMMOVE_SYMBOL (__memmove_chk, unaligned_erms))
+ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms))
cmpq %rdx, %rcx
jb HIDDEN_JUMPTARGET (__chk_fail)
-END (MEMMOVE_SYMBOL (__memmove_chk, unaligned_erms))
+END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms))
# endif
ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned_erms))
@@ -192,6 +208,7 @@ L(movsb_more_2x_vec):
/* Force 32-bit displacement to avoid long nop between
instructions. */
ja.d32 L(movsb)
+#endif
.p2align 4
L(more_2x_vec):
/* More than 2 * VEC. */
@@ -227,13 +244,19 @@ L(copy_forward):
VMOVU %VEC(2), -(VEC_SIZE * 3)(%rdi,%rdx)
VMOVU %VEC(3), -(VEC_SIZE * 4)(%rdi,%rdx)
cmpq $(VEC_SIZE * 8), %rdx
-# if VEC_SIZE == 16
+#if VEC_SIZE == 16
+# if defined USE_MULTIARCH && IS_IN (libc)
jbe L(return)
# else
+ /* Use 32-bit displacement to avoid long nop between
+ instructions. */
+ jbe.d32 L(return)
+# endif
+#else
/* Use 8-bit displacement to avoid long nop between
instructions. */
jbe L(return_disp8)
-# endif
+#endif
leaq (VEC_SIZE * 4)(%rdi), %rcx
addq %rdi, %rdx
andq $-(VEC_SIZE * 4), %rdx
@@ -263,22 +286,25 @@ L(loop):
addq $(VEC_SIZE * 4), %rcx
cmpq %rcx, %rdx
jne L(loop)
+#if !defined USE_MULTIARCH || !IS_IN (libc)
+L(return):
+#endif
L(return_disp8):
VZEROUPPER
ret
L(less_vec):
/* Less than 1 VEC. */
-# if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
-# error Unsupported VEC_SIZE!
-# endif
-# if VEC_SIZE > 32
+#if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
+# error Unsupported VEC_SIZE!
+#endif
+#if VEC_SIZE > 32
cmpb $32, %dl
jae L(between_32_63)
-# endif
-# if VEC_SIZE > 16
+#endif
+#if VEC_SIZE > 16
cmpb $16, %dl
jae L(between_16_31)
-# endif
+#endif
cmpb $8, %dl
jae L(between_8_15)
cmpb $4, %dl
@@ -290,7 +316,7 @@ L(less_vec):
movb %cl, (%rdi)
1:
ret
-# if VEC_SIZE > 32
+#if VEC_SIZE > 32
L(between_32_63):
/* From 32 to 63. No branch when size == 32. */
vmovdqu (%rsi), %ymm0
@@ -299,8 +325,8 @@ L(between_32_63):
vmovdqu %ymm1, -32(%rdi,%rdx)
VZEROUPPER
ret
-# endif
-# if VEC_SIZE > 16
+#endif
+#if VEC_SIZE > 16
/* From 16 to 31. No branch when size == 16. */
L(between_16_31):
vmovdqu (%rsi), %xmm0
@@ -308,7 +334,7 @@ L(between_16_31):
vmovdqu %xmm0, (%rdi)
vmovdqu %xmm1, -16(%rdi,%rdx)
ret
-# endif
+#endif
L(between_8_15):
/* From 8 to 15. No branch when size == 8. */
movq -8(%rsi,%rdx), %rcx
@@ -331,10 +357,10 @@ L(between_2_3):
movw %si, (%rdi)
ret
-# if VEC_SIZE > 16
+#if VEC_SIZE > 16
/* Align to 16 bytes to avoid long nop between instructions. */
.p2align 4
-# endif
+#endif
L(more_2x_vec_overlap):
/* More than 2 * VEC and there is overlap bewteen destination
and source. */
@@ -454,15 +480,19 @@ L(loop_8x_vec_backward):
jmp L(between_4x_vec_and_8x_vec)
END (MEMMOVE_SYMBOL (__memmove, unaligned_erms))
-# ifdef SHARED
+#ifdef SHARED
+# if IS_IN (libc)
+# ifdef USE_MULTIARCH
strong_alias (MEMMOVE_SYMBOL (__memmove, unaligned_erms),
MEMMOVE_SYMBOL (__memcpy, unaligned_erms))
strong_alias (MEMMOVE_SYMBOL (__memmove_chk, unaligned_erms),
MEMMOVE_SYMBOL (__memcpy_chk, unaligned_erms))
-strong_alias (MEMMOVE_SYMBOL (__memmove, unaligned_2),
- MEMMOVE_SYMBOL (__memcpy, unaligned_2))
-strong_alias (MEMMOVE_SYMBOL (__memmove_chk, unaligned_2),
- MEMMOVE_SYMBOL (__memcpy_chk, unaligned_2))
+# endif
+strong_alias (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_2),
+ MEMMOVE_CHK_SYMBOL (__memcpy_chk, unaligned_2))
# endif
-
+#endif
+#if VEC_SIZE == 16 || defined SHARED
+strong_alias (MEMMOVE_SYMBOL (__memmove, unaligned_2),
+ MEMCPY_SYMBOL (__memcpy, unaligned_2))
#endif
--
2.5.5