[PATCH RFC V2] Improve 64bit memcpy/memmove for Corei7 with unaligned AVX instructions
- From: ling dot ma dot program at gmail dot com
- To: libc-alpha at sourceware dot org
- Cc: liubov dot dmitrieva at gmail dot com, neleai at seznam dot cz, Ma Ling <ling dot ml at alibaba-inc dot com>
- Date: Thu, 11 Jul 2013 08:51:36 -0400
- Subject: [PATCH RFC V2] Improve 64bit memcpy/memmove for Corei7 with unaligned AVX instructions
From: Ma Ling <ling.ml@alibaba-inc.com>
We avoid branch instructions and force the destination to be aligned by using
unaligned AVX instructions. We modified gcc.403 so that it measures only the
memcpy function; the gcc.403 benchmarks indicate this version improves
performance by 4% to 16% across the different cases.
Best Regards
Ling
---
In this version we did clean-up work; thanks to Liubov.
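For reviewers who want the idea in one place before reading the assembly, here
is a rough C sketch of the strategy the 256-bytes-and-larger path follows (the
function name and the 32-byte tail are illustrative only and not part of the
patch; the real code keeps a 128-byte unaligned tail in xmm8-xmm15): the
unaligned head and tail are loaded up front, the destination is rounded up to
a 32-byte boundary, and the body is copied with aligned 32-byte stores, so the
main loop needs no edge-case branches.

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Illustrative sketch only, assuming n >= 256 and non-overlapping
   buffers; it is not part of the patch.  */
void *
sketch_copy_large (void *dst, const void *src, size_t n)
{
  unsigned char *d = dst;
  const unsigned char *s = src;
  unsigned char head[32], tail[32];

  /* Save the unaligned head and tail first; they are stored last,
     so the aligned loop needs no edge handling.  */
  memcpy (head, s, 32);
  memcpy (tail, s + n - 32, 32);

  /* Round the destination up to the next 32-byte boundary and copy
     the aligned body.  The assembly does this with unaligned vmovups
     loads and aligned vmovaps stores, 128 bytes per iteration.  */
  size_t skew = 32 - ((uintptr_t) d & 31);
  size_t body = (n - skew) & ~(size_t) 31;
  memcpy (d + skew, s + skew, body);

  memcpy (d, head, 32);           /* unaligned head store */
  memcpy (d + n - 32, tail, 32);  /* unaligned tail store */
  return dst;
}

In the assembly below the body copies use vmovups loads paired with vmovaps
stores, switching to rep movsb or non-temporal vmovntdq stores once the size
crosses the shared-cache-size threshold.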
sysdeps/x86_64/multiarch/Makefile | 5 +-
sysdeps/x86_64/multiarch/ifunc-defines.sym | 2 +
sysdeps/x86_64/multiarch/ifunc-impl-list.c | 11 +
sysdeps/x86_64/multiarch/memcpy-avx-unaligned.S | 409 +++++++++++++++++++++++
sysdeps/x86_64/multiarch/memmove-avx-unaligned.S | 4 +
sysdeps/x86_64/multiarch/mempcpy-avx-unaligned.S | 4 +
6 files changed, 433 insertions(+), 2 deletions(-)
create mode 100644 sysdeps/x86_64/multiarch/memcpy-avx-unaligned.S
create mode 100644 sysdeps/x86_64/multiarch/memmove-avx-unaligned.S
create mode 100644 sysdeps/x86_64/multiarch/mempcpy-avx-unaligned.S
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index dd6c27d..f92cf18 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -8,8 +8,9 @@ ifeq ($(subdir),string)
sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 strncmp-ssse3 \
strend-sse4 memcmp-sse4 memcpy-ssse3 mempcpy-ssse3 \
- memmove-ssse3 memcpy-ssse3-back mempcpy-ssse3-back \
- memmove-ssse3-back strcasestr-nonascii strcasecmp_l-ssse3 \
+ memmove-ssse3 memcpy-ssse3-back mempcpy-ssse3-back memmove-ssse3-back \
+ memcpy-avx-unaligned mempcpy-avx-unaligned memmove-avx-unaligned \
+ strcasestr-nonascii strcasecmp_l-ssse3 \
strncase_l-ssse3 strlen-sse4 strlen-sse2-no-bsf memset-x86-64 \
strcpy-ssse3 strncpy-ssse3 stpcpy-ssse3 stpncpy-ssse3 \
strcpy-sse2-unaligned strncpy-sse2-unaligned \
diff --git a/sysdeps/x86_64/multiarch/ifunc-defines.sym b/sysdeps/x86_64/multiarch/ifunc-defines.sym
index eb1538a..448b8c4 100644
--- a/sysdeps/x86_64/multiarch/ifunc-defines.sym
+++ b/sysdeps/x86_64/multiarch/ifunc-defines.sym
@@ -17,4 +17,6 @@ FEATURE_OFFSET offsetof (struct cpu_features, feature)
FEATURE_SIZE sizeof (unsigned int)
COMMON_CPUID_INDEX_1
+COMMON_CPUID_INDEX_7
FEATURE_INDEX_1
+FEATURE_INDEX_7
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index 332a60d..5639702 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -50,6 +50,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
__memmove_chk_ssse3_back)
IFUNC_IMPL_ADD (array, i, __memmove_chk, HAS_SSSE3,
__memmove_chk_ssse3)
+ IFUNC_IMPL_ADD (array, i, __memmove_chk, HAS_AVX,
+ __memmove_chk_avx_unaligned)
IFUNC_IMPL_ADD (array, i, __memmove_chk, 1,
__memmove_chk_sse2))
@@ -59,6 +61,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
__memmove_ssse3_back)
IFUNC_IMPL_ADD (array, i, memmove, HAS_SSSE3,
__memmove_ssse3)
+ IFUNC_IMPL_ADD (array, i, memmove, HAS_AVX,
+ __memmove_avx_unaligned)
IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_sse2))
/* Support sysdeps/x86_64/multiarch/memset_chk.S. */
@@ -235,6 +239,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
__memcpy_chk_ssse3_back)
IFUNC_IMPL_ADD (array, i, __memcpy_chk, HAS_SSSE3,
__memcpy_chk_ssse3)
+ IFUNC_IMPL_ADD (array, i, __memcpy_chk, HAS_AVX,
+ __memcpy_chk_avx_unaligned)
IFUNC_IMPL_ADD (array, i, __memcpy_chk, 1,
__memcpy_chk_sse2))
@@ -243,6 +249,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, memcpy, HAS_SSSE3,
__memcpy_ssse3_back)
IFUNC_IMPL_ADD (array, i, memcpy, HAS_SSSE3, __memcpy_ssse3)
+ IFUNC_IMPL_ADD (array, i, memcpy, HAS_AVX, __memcpy_avx_unaligned)
IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_sse2))
/* Support sysdeps/x86_64/multiarch/mempcpy_chk.S. */
@@ -251,6 +258,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
__mempcpy_chk_ssse3_back)
IFUNC_IMPL_ADD (array, i, __mempcpy_chk, HAS_SSSE3,
__mempcpy_chk_ssse3)
+ IFUNC_IMPL_ADD (array, i, __mempcpy_chk, HAS_AVX,
+ __mempcpy_chk_avx_unaligned)
IFUNC_IMPL_ADD (array, i, __mempcpy_chk, 1,
__mempcpy_chk_sse2))
@@ -260,6 +269,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
__mempcpy_ssse3_back)
IFUNC_IMPL_ADD (array, i, mempcpy, HAS_SSSE3,
__mempcpy_ssse3)
+ IFUNC_IMPL_ADD (array, i, mempcpy, HAS_AVX,
+ __mempcpy_avx_unaligned)
IFUNC_IMPL_ADD (array, i, mempcpy, 1, __mempcpy_sse2))
/* Support sysdeps/x86_64/multiarch/strlen.S. */
diff --git a/sysdeps/x86_64/multiarch/memcpy-avx-unaligned.S b/sysdeps/x86_64/multiarch/memcpy-avx-unaligned.S
new file mode 100644
index 0000000..68901f6
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memcpy-avx-unaligned.S
@@ -0,0 +1,409 @@
+/* memcpy with AVX
+ Copyright (C) 2013 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+#if !defined NOT_IN_libc \
+ && (defined SHARED \
+ || defined USE_AS_MEMMOVE \
+ || !defined USE_MULTIARCH)
+
+#include "asm-syntax.h"
+#ifndef ALIGN
+# define ALIGN(n) .p2align n
+#endif
+#ifndef MEMCPY
+# define MEMCPY __memcpy_avx_unaligned
+# define MEMCPY_CHK __memcpy_chk_avx_unaligned
+#endif
+
+ .section .text.avx,"ax",@progbits
+#if !defined USE_AS_BCOPY
+ENTRY (MEMCPY_CHK)
+ cmpq %rdx, %rcx
+ jb HIDDEN_JUMPTARGET (__chk_fail)
+END (MEMCPY_CHK)
+#endif
+
+ENTRY (MEMCPY)
+ vzeroupper
+ mov %rdi, %rax
+
+#ifdef USE_AS_MEMPCPY
+ add %rdx, %rax
+#endif
+
+ lea (%rsi, %rdx), %r8
+ lea (%rdi, %rdx), %r9
+ cmp $256, %rdx
+ ja L(256bytesormore)
+ cmp $128, %edx
+ jb L(less_128bytes)
+ vmovups (%rsi), %xmm0
+ vmovups 0x10(%rsi), %xmm1
+ vmovups 0x20(%rsi), %xmm2
+ vmovups 0x30(%rsi), %xmm3
+ vmovups 0x40(%rsi), %xmm4
+ vmovups 0x50(%rsi), %xmm5
+ vmovups 0x60(%rsi), %xmm6
+ vmovups 0x70(%rsi), %xmm7
+ vmovups -0x80(%r8), %xmm8
+ vmovups -0x70(%r8), %xmm9
+ vmovups -0x60(%r8), %xmm10
+ vmovups -0x50(%r8), %xmm11
+ vmovups -0x40(%r8), %xmm12
+ vmovups -0x30(%r8), %xmm13
+ vmovups -0x20(%r8), %xmm14
+ vmovups -0x10(%r8), %xmm15
+ vmovups %xmm0, (%rdi)
+ vmovups %xmm1, 0x10(%rdi)
+ vmovups %xmm2, 0x20(%rdi)
+ vmovups %xmm3, 0x30(%rdi)
+ vmovups %xmm4, 0x40(%rdi)
+ vmovups %xmm5, 0x50(%rdi)
+ vmovups %xmm6, 0x60(%rdi)
+ vmovups %xmm7, 0x70(%rdi)
+ vmovups %xmm8, -0x80(%r9)
+ vmovups %xmm9, -0x70(%r9)
+ vmovups %xmm10, -0x60(%r9)
+ vmovups %xmm11, -0x50(%r9)
+ vmovups %xmm12, -0x40(%r9)
+ vmovups %xmm13, -0x30(%r9)
+ vmovups %xmm14, -0x20(%r9)
+ vmovups %xmm15, -0x10(%r9)
+ ret
+ ALIGN(4)
+L(less_128bytes):
+ cmp $64, %edx
+ jb L(less_64bytes)
+ vmovups (%rsi), %xmm0
+ vmovups 0x10(%rsi), %xmm1
+ vmovups 0x20(%rsi), %xmm2
+ vmovups 0x30(%rsi), %xmm3
+ vmovups -0x40(%r8), %xmm4
+ vmovups -0x30(%r8), %xmm5
+ vmovups -0x20(%r8), %xmm6
+ vmovups -0x10(%r8), %xmm7
+ vmovups %xmm0, (%rdi)
+ vmovups %xmm1, 0x10(%rdi)
+ vmovups %xmm2, 0x20(%rdi)
+ vmovups %xmm3, 0x30(%rdi)
+ vmovups %xmm4, -0x40(%r9)
+ vmovups %xmm5, -0x30(%r9)
+ vmovups %xmm6, -0x20(%r9)
+ vmovups %xmm7, -0x10(%r9)
+ ret
+ ALIGN(4)
+L(less_64bytes):
+ cmp $32, %edx
+ jb L(less_32bytes)
+ vmovups (%rsi), %xmm0
+ vmovups 0x10(%rsi), %xmm1
+ vmovups -0x20(%r8), %xmm6
+ vmovups -0x10(%r8), %xmm7
+ vmovups %xmm0, (%rdi)
+ vmovups %xmm1, 0x10(%rdi)
+ vmovups %xmm6, -0x20(%r9)
+ vmovups %xmm7, -0x10(%r9)
+ ret
+ ALIGN(4)
+L(less_32bytes):
+ cmp $16, %edx
+ jb L(less_16bytes)
+ vmovups (%rsi), %xmm0
+ vmovups -0x10(%r8), %xmm7
+ vmovups %xmm0, (%rdi)
+ vmovups %xmm7, -0x10(%r9)
+ ret
+ ALIGN(4)
+L(less_16bytes):
+ cmp $8, %edx
+ jb L(less_8bytes)
+ movq (%rsi), %rcx
+ movq -0x08(%r8), %r10
+ movq %rcx, (%rdi)
+ movq %r10, -0x08(%r9)
+ ret
+ ALIGN(4)
+L(less_8bytes):
+ cmp $4, %edx
+ jb L(less_4bytes)
+ mov (%rsi), %ecx
+ mov -0x04(%r8), %edx
+ mov %ecx, (%rdi)
+ mov %edx, -0x04(%r9)
+ ret
+ ALIGN(4)
+L(less_4bytes):
+ cmp $2, %edx
+ jb L(less_2bytes)
+ mov (%rsi), %cx
+ mov -0x02(%r8), %dx
+ mov %cx, (%rdi)
+ mov %dx, -0x02(%r9)
+ ret
+ ALIGN(4)
+L(less_2bytes):
+ cmp $1, %rdx
+ jb L(less_0bytes)
+ mov (%rsi), %cl
+ mov %cl, (%rdi)
+L(less_0bytes):
+ ret
+
+ ALIGN(4)
+L(256bytesormore):
+
+#ifdef USE_AS_MEMMOVE
+ cmp %rsi, %rdi
+ jae L(copy_backward)
+#endif
+ cmp $2048, %rdx
+ jae L(gobble_data_movsb)
+
+ vmovups -0x80(%r8), %xmm8
+ vmovups -0x70(%r8), %xmm9
+ vmovups -0x60(%r8), %xmm10
+ vmovups -0x50(%r8), %xmm11
+ vmovups -0x40(%r8), %xmm12
+ vmovups -0x30(%r8), %xmm13
+ vmovups -0x20(%r8), %xmm14
+ vmovups -0x10(%r8), %xmm15
+ vmovups (%rsi), %ymm4
+ mov %rdi, %r10
+ and $-32, %rdi
+ add $32, %rdi
+ mov %rdi, %r11
+ sub %r10, %r11
+ sub %r11, %rdx
+ add %r11, %rsi
+ sub $0x80, %rdx
+L(gobble_128_loop):
+ vmovups (%rsi), %ymm0
+ vmovups 0x20(%rsi), %ymm1
+ vmovups 0x40(%rsi), %ymm2
+ vmovups 0x60(%rsi), %ymm3
+ lea 0x80(%rsi), %rsi
+ vmovaps %ymm0, (%rdi)
+ vmovaps %ymm1, 0x20(%rdi)
+ vmovaps %ymm2, 0x40(%rdi)
+ vmovaps %ymm3, 0x60(%rdi)
+ lea 0x80(%rdi), %rdi
+ sub $0x80, %rdx
+ jae L(gobble_128_loop)
+ vmovups %ymm4, (%r10)
+ vzeroupper
+ vmovups %xmm8, -0x80(%r9)
+ vmovups %xmm9, -0x70(%r9)
+ vmovups %xmm10, -0x60(%r9)
+ vmovups %xmm11, -0x50(%r9)
+ vmovups %xmm12, -0x40(%r9)
+ vmovups %xmm13, -0x30(%r9)
+ vmovups %xmm14, -0x20(%r9)
+ vmovups %xmm15, -0x10(%r9)
+ ret
+
+L(gobble_data_movsb):
+
+#ifdef SHARED_CACHE_SIZE_HALF
+ mov $SHARED_CACHE_SIZE_HALF, %rcx
+#else
+ mov __x86_64_shared_cache_size_half(%rip), %rcx
+#endif
+ shl $3, %rcx
+
+#ifdef USE_AS_MEMMOVE
+ mov %rsi, %r10
+ sub %rdi, %r10
+ cmp %rdx, %r10
+ jae L(memmove_use_memcpy_fwd)
+ cmp %rcx, %r10
+ jae L(memmove_use_memcpy_fwd)
+ jmp L(gobble_mem_fwd_llc_start)
+L(memmove_use_memcpy_fwd):
+#endif
+
+ cmp %rcx, %rdx
+ ja L(gobble_big_data_fwd)
+
+#ifdef USE_AS_MEMMOVE
+L(gobble_mem_fwd_llc_start):
+#endif
+ mov %rdx, %rcx
+ rep movsb
+ ret
+
+L(gobble_big_data_fwd):
+ vmovups (%rsi), %ymm4
+ vmovups -0x80(%r8), %xmm5
+ vmovups -0x70(%r8), %xmm6
+ vmovups -0x60(%r8), %xmm7
+ vmovups -0x50(%r8), %xmm8
+ vmovups -0x40(%r8), %xmm9
+ vmovups -0x30(%r8), %xmm10
+ vmovups -0x20(%r8), %xmm11
+ vmovups -0x10(%r8), %xmm12
+ mov %rdi, %r8
+ and $-32, %rdi
+ add $32, %rdi
+ mov %rdi, %r10
+ sub %r8, %r10
+ sub %r10, %rdx
+ add %r10, %rsi
+ sub $0x80, %rdx
+L(gobble_mem_fwd_loop):
+ prefetcht0 0x1c0(%rsi)
+ prefetcht0 0x280(%rsi)
+ vmovups (%rsi), %xmm0
+ vmovups 0x10(%rsi), %xmm1
+ vmovups 0x20(%rsi), %xmm2
+ vmovups 0x30(%rsi), %xmm3
+ vmovntdq %xmm0, (%rdi)
+ vmovntdq %xmm1, 0x10(%rdi)
+ vmovntdq %xmm2, 0x20(%rdi)
+ vmovntdq %xmm3, 0x30(%rdi)
+ vmovups 0x40(%rsi), %xmm0
+ vmovups 0x50(%rsi), %xmm1
+ vmovups 0x60(%rsi), %xmm2
+ vmovups 0x70(%rsi), %xmm3
+ lea 0x80(%rsi), %rsi
+ vmovntdq %xmm0, 0x40(%rdi)
+ vmovntdq %xmm1, 0x50(%rdi)
+ vmovntdq %xmm2, 0x60(%rdi)
+ vmovntdq %xmm3, 0x70(%rdi)
+ lea 0x80(%rdi), %rdi
+ sub $0x80, %rdx
+ jae L(gobble_mem_fwd_loop)
+ sfence
+ vmovups %ymm4, (%r8)
+ vzeroupper
+ vmovups %xmm5, -0x80(%r9)
+ vmovups %xmm6, -0x70(%r9)
+ vmovups %xmm7, -0x60(%r9)
+ vmovups %xmm8, -0x50(%r9)
+ vmovups %xmm9, -0x40(%r9)
+ vmovups %xmm10, -0x30(%r9)
+ vmovups %xmm11, -0x20(%r9)
+ vmovups %xmm12, -0x10(%r9)
+ ret
+
+ ALIGN (4)
+L(copy_backward):
+#ifdef SHARED_CACHE_SIZE_HALF
+ mov $SHARED_CACHE_SIZE_HALF, %rcx
+#else
+ mov __x86_64_shared_cache_size_half(%rip), %rcx
+#endif
+ shl $3, %rcx
+ vmovups (%rsi), %xmm8
+ vmovups 0x10(%rsi), %xmm9
+ vmovups 0x20(%rsi), %xmm10
+ vmovups 0x30(%rsi), %xmm11
+ vmovups 0x40(%rsi), %xmm12
+ vmovups 0x50(%rsi), %xmm13
+ vmovups 0x60(%rsi), %xmm14
+ vmovups 0x70(%rsi), %xmm15
+ mov %rdi, %r9
+ add %rdx, %rsi
+ add %rdx, %rdi
+ vmovups -0x20(%rsi), %ymm4
+ lea -0x20(%rdi), %r10
+ mov %rdi, %r11
+ and $0x1f, %r11
+ xor %r11, %rdi
+ sub %r11, %rsi
+ sub %r11, %rdx
+#ifdef USE_AS_MEMMOVE
+ mov %rdi, %r11
+ sub %rsi, %r11
+ cmp %rdx, %r11
+ jae L(memmove_use_memcpy_bwd)
+ cmp %rcx, %r11
+ jae L(memmove_use_memcpy_bwd)
+ jmp L(gobble_mem_bwd_llc_start)
+#endif
+L(memmove_use_memcpy_bwd):
+ cmp %rcx, %rdx
+ ja L(gobble_big_data_bwd)
+L(gobble_mem_bwd_llc_start):
+ sub $0x80, %rdx
+L(gobble_mem_bwd_llc):
+ vmovups -0x20(%rsi), %ymm0
+ vmovups -0x40(%rsi), %ymm1
+ vmovups -0x60(%rsi), %ymm2
+ vmovups -0x80(%rsi), %ymm3
+ lea -0x80(%rsi), %rsi
+ vmovaps %ymm0, -0x20(%rdi)
+ vmovaps %ymm1, -0x40(%rdi)
+ vmovaps %ymm2, -0x60(%rdi)
+ vmovaps %ymm3, -0x80(%rdi)
+ lea -0x80(%rdi), %rdi
+ sub $0x80, %rdx
+ jae L(gobble_mem_bwd_llc)
+ vmovups %ymm4, (%r10)
+ vzeroupper
+ vmovups %xmm8, (%r9)
+ vmovups %xmm9, 0x10(%r9)
+ vmovups %xmm10, 0x20(%r9)
+ vmovups %xmm11, 0x30(%r9)
+ vmovups %xmm12, 0x40(%r9)
+ vmovups %xmm13, 0x50(%r9)
+ vmovups %xmm14, 0x60(%r9)
+ vmovups %xmm15, 0x70(%r9)
+ ret
+
+L(gobble_big_data_bwd):
+ sub $0x80, %rdx
+L(gobble_mem_bwd_loop):
+ prefetcht0 -0x1c0(%rsi)
+ prefetcht0 -0x280(%rsi)
+ vmovups -0x10(%rsi), %xmm0
+ vmovups -0x20(%rsi), %xmm1
+ vmovups -0x30(%rsi), %xmm2
+ vmovups -0x40(%rsi), %xmm3
+ vmovntdq %xmm0, -0x10(%rdi)
+ vmovntdq %xmm1, -0x20(%rdi)
+ vmovntdq %xmm2, -0x30(%rdi)
+ vmovntdq %xmm3, -0x40(%rdi)
+ vmovups -0x50(%rsi), %xmm0
+ vmovups -0x60(%rsi), %xmm1
+ vmovups -0x70(%rsi), %xmm2
+ vmovups -0x80(%rsi), %xmm3
+ lea -0x80(%rsi), %rsi
+ vmovntdq %xmm0, -0x50(%rdi)
+ vmovntdq %xmm1, -0x60(%rdi)
+ vmovntdq %xmm2, -0x70(%rdi)
+ vmovntdq %xmm3, -0x80(%rdi)
+ lea -0x80(%rdi), %rdi
+ sub $0x80, %rdx
+ jae L(gobble_mem_bwd_loop)
+ sfence
+ vmovups %ymm4, (%r10)
+ vzeroupper
+ vmovups %xmm8, (%r9)
+ vmovups %xmm9, 0x10(%r9)
+ vmovups %xmm10, 0x20(%r9)
+ vmovups %xmm11, 0x30(%r9)
+ vmovups %xmm12, 0x40(%r9)
+ vmovups %xmm13, 0x50(%r9)
+ vmovups %xmm14, 0x60(%r9)
+ vmovups %xmm15, 0x70(%r9)
+ ret
+END (MEMCPY)
+#endif
diff --git a/sysdeps/x86_64/multiarch/memmove-avx-unaligned.S b/sysdeps/x86_64/multiarch/memmove-avx-unaligned.S
new file mode 100644
index 0000000..352a2c3
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memmove-avx-unaligned.S
@@ -0,0 +1,4 @@
+#define USE_AS_MEMMOVE
+#define MEMCPY __memmove_avx_unaligned
+#define MEMCPY_CHK __memmove_chk_avx_unaligned
+#include "memcpy-avx-unaligned.S"
diff --git a/sysdeps/x86_64/multiarch/mempcpy-avx-unaligned.S b/sysdeps/x86_64/multiarch/mempcpy-avx-unaligned.S
new file mode 100644
index 0000000..b31394e
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/mempcpy-avx-unaligned.S
@@ -0,0 +1,4 @@
+#define USE_AS_MEMPCPY
+#define MEMCPY __mempcpy_avx_unaligned
+#define MEMCPY_CHK __mempcpy_chk_avx_unaligned
+#include "memcpy-avx-unaligned.S"
--
1.8.1.4