This is the mail archive of the libc-alpha@sourceware.org mailing list for the glibc project.
Re: [PATCH RFC] Improve 64bit memcpy/memmove for Corei7 with unaligned avx2 instruction
- From: Ondřej Bílka <neleai at seznam dot cz>
- To: Ling Ma <ling dot ma dot program at gmail dot com>
- Cc: Ma Ling <ling dot ml at alibaba-inc dot com>, libc-alpha at sourceware dot org
- Date: Tue, 9 Jul 2013 21:11:37 +0200
- Subject: Re: [PATCH RFC] Improve 64bit memcpy/memmove for Corei7 with unaligned avx2 instruction
- References: <1373254693-9582-1-git-send-email-ling dot ma dot program at gmail dot com> <CAOGi=dOnn_GzHwZmo2=7Zocxsgw8LRq-rd1fGp2QfpVyPJ=JEw at mail dot gmail dot com>
On Mon, Jul 08, 2013 at 11:53:24AM +0800, Ling Ma wrote:
> Attached memcpy_profile result for __mempcpy_avx2_unaligned.
>
> Thanks
> Ling
>
>
> 2013/7/8, ling.ma.program@gmail.com <ling.ma.program@gmail.com>:
> > From: Ma Ling <ling.ml@alibaba-inc.com>
> >
> > In this version we manage to avoid branch instructions and force the
> > destination to be aligned using AVX2 instructions. We modified gcc.403 so
> > that it measures only the memcpy function; the gcc.403 benchmarks indicate
> > the new version improves performance by 4% to 16% on different cases.
> >
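For context, the two ideas described above can be sketched in C (illustrative only, not part of the patch; the helper names are made up): small copies load both the head and the tail of the buffer before storing either, so overlapping unaligned accesses replace a branch chain over sizes, and the large-copy loop rounds the destination up to a 32-byte boundary, with the skipped head bytes covered by an unaligned 32-byte store saved in advance.

    #include <stdint.h>
    #include <string.h>

    /* Branchless copy for 16 <= n <= 32 bytes: both 16-byte chunks are
       loaded before either is stored, so they may overlap in the middle
       and no per-size dispatch is needed.  */
    static void
    copy_16_to_32 (char *dst, const char *src, size_t n)
    {
      char head[16], tail[16];
      memcpy (head, src, 16);
      memcpy (tail, src + n - 16, 16);
      memcpy (dst, head, 16);
      memcpy (dst + n - 16, tail, 16);
    }

    /* Destination alignment for the main loop, mirroring the patch's
       "and $-32, %rdi; add $32, %rdi": round the destination up to the
       next 32-byte boundary (advancing by 1..32 bytes) and adjust the
       source pointer and remaining length by the same amount.  The
       skipped head bytes are later written from an unaligned copy of
       the first 32 source bytes taken before this adjustment.  */
    static void
    align_dst_32 (char **dst, const char **src, size_t *n)
    {
      size_t adjust = 32 - ((uintptr_t) *dst & 31);
      *dst += adjust;
      *src += adjust;
      *n -= adjust;
    }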
> > Ondra, I will send out results from your memcpy_profile.
> >
Nice, it improved performance, and at a glance the patch looks good. I will review it when the glibc freeze ends.
> > Best Regards
> > Ling
> > ---
> > sysdeps/x86_64/multiarch/Makefile | 5 +-
> > sysdeps/x86_64/multiarch/ifunc-defines.sym | 2 +
> > sysdeps/x86_64/multiarch/ifunc-impl-list.c | 11 +
> > sysdeps/x86_64/multiarch/memcpy-avx2-unaligned.S | 438 ++++++++++++++++++++++
> > sysdeps/x86_64/multiarch/memmove-avx2-unaligned.S | 4 +
> > sysdeps/x86_64/multiarch/mempcpy-avx2-unaligned.S | 4 +
> > 6 files changed, 462 insertions(+), 2 deletions(-)
> > create mode 100644 sysdeps/x86_64/multiarch/memcpy-avx2-unaligned.S
> > create mode 100644 sysdeps/x86_64/multiarch/memmove-avx2-unaligned.S
> > create mode 100644 sysdeps/x86_64/multiarch/mempcpy-avx2-unaligned.S
> >
> > diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
> > index dd6c27d..02c0a2a 100644
> > --- a/sysdeps/x86_64/multiarch/Makefile
> > +++ b/sysdeps/x86_64/multiarch/Makefile
> > @@ -8,8 +8,9 @@ ifeq ($(subdir),string)
> >
> > sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 strncmp-ssse3 \
> > strend-sse4 memcmp-sse4 memcpy-ssse3 mempcpy-ssse3 \
> > - memmove-ssse3 memcpy-ssse3-back mempcpy-ssse3-back \
> > - memmove-ssse3-back strcasestr-nonascii strcasecmp_l-ssse3 \
> > + memmove-ssse3 memcpy-ssse3-back mempcpy-ssse3-back memmove-ssse3-back \
> > + memcpy-avx2-unaligned mempcpy-avx2-unaligned memmove-avx2-unaligned \
> > + strcasestr-nonascii strcasecmp_l-ssse3 \
> > strncase_l-ssse3 strlen-sse4 strlen-sse2-no-bsf memset-x86-64 \
> > strcpy-ssse3 strncpy-ssse3 stpcpy-ssse3 stpncpy-ssse3 \
> > strcpy-sse2-unaligned strncpy-sse2-unaligned \
> > diff --git a/sysdeps/x86_64/multiarch/ifunc-defines.sym b/sysdeps/x86_64/multiarch/ifunc-defines.sym
> > index eb1538a..448b8c4 100644
> > --- a/sysdeps/x86_64/multiarch/ifunc-defines.sym
> > +++ b/sysdeps/x86_64/multiarch/ifunc-defines.sym
> > @@ -17,4 +17,6 @@ FEATURE_OFFSET offsetof (struct cpu_features, feature)
> > FEATURE_SIZE sizeof (unsigned int)
> >
> > COMMON_CPUID_INDEX_1
> > +COMMON_CPUID_INDEX_7
> > FEATURE_INDEX_1
> > +FEATURE_INDEX_7
> > diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > index 332a60d..5fb5663 100644
> > --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > @@ -50,6 +50,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> > __memmove_chk_ssse3_back)
> > IFUNC_IMPL_ADD (array, i, __memmove_chk, HAS_SSSE3,
> > __memmove_chk_ssse3)
> > + IFUNC_IMPL_ADD (array, i, __memmove_chk, HAS_AVX2,
> > + __memmove_chk_avx2_unaligned)
> > IFUNC_IMPL_ADD (array, i, __memmove_chk, 1,
> > __memmove_chk_sse2))
> >
> > @@ -59,6 +61,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> > __memmove_ssse3_back)
> > IFUNC_IMPL_ADD (array, i, memmove, HAS_SSSE3,
> > __memmove_ssse3)
> > + IFUNC_IMPL_ADD (array, i, memmove, HAS_AVX2,
> > + __memmove_avx2_unaligned)
> > IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_sse2))
> >
> > /* Support sysdeps/x86_64/multiarch/memset_chk.S. */
> > @@ -235,6 +239,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> > __memcpy_chk_ssse3_back)
> > IFUNC_IMPL_ADD (array, i, __memcpy_chk, HAS_SSSE3,
> > __memcpy_chk_ssse3)
> > + IFUNC_IMPL_ADD (array, i, __memcpy_chk, HAS_AVX2,
> > + __memcpy_chk_avx2_unaligned)
> > IFUNC_IMPL_ADD (array, i, __memcpy_chk, 1,
> > __memcpy_chk_sse2))
> >
> > @@ -243,6 +249,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> > IFUNC_IMPL_ADD (array, i, memcpy, HAS_SSSE3,
> > __memcpy_ssse3_back)
> > IFUNC_IMPL_ADD (array, i, memcpy, HAS_SSSE3, __memcpy_ssse3)
> > + IFUNC_IMPL_ADD (array, i, memcpy, HAS_AVX2, __memcpy_avx2_unaligned)
> > IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_sse2))
> >
> > /* Support sysdeps/x86_64/multiarch/mempcpy_chk.S. */
> > @@ -251,6 +258,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> > __mempcpy_chk_ssse3_back)
> > IFUNC_IMPL_ADD (array, i, __mempcpy_chk, HAS_SSSE3,
> > __mempcpy_chk_ssse3)
> > + IFUNC_IMPL_ADD (array, i, __mempcpy_chk, HAS_AVX2,
> > + __mempcpy_chk_avx2_unaligned)
> > IFUNC_IMPL_ADD (array, i, __mempcpy_chk, 1,
> > __mempcpy_chk_sse2))
> >
> > @@ -260,6 +269,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> > __mempcpy_ssse3_back)
> > IFUNC_IMPL_ADD (array, i, mempcpy, HAS_SSSE3,
> > __mempcpy_ssse3)
> > + IFUNC_IMPL_ADD (array, i, mempcpy, HAS_AVX2,
> > + __mempcpy_avx2_unaligned)
> > IFUNC_IMPL_ADD (array, i, mempcpy, 1, __mempcpy_sse2))
> >
> > /* Support sysdeps/x86_64/multiarch/strlen.S. */
> > diff --git a/sysdeps/x86_64/multiarch/memcpy-avx2-unaligned.S b/sysdeps/x86_64/multiarch/memcpy-avx2-unaligned.S
> > new file mode 100644
> > index 0000000..d32cfad
> > --- /dev/null
> > +++ b/sysdeps/x86_64/multiarch/memcpy-avx2-unaligned.S
> > @@ -0,0 +1,438 @@
> > +/* memcpy with AVX2
> > + Copyright (C) 2010 Free Software Foundation, Inc.
> > + Contributed by Intel Corporation.
> > + This file is part of the GNU C Library.
> > +
> > + The GNU C Library is free software; you can redistribute it and/or
> > + modify it under the terms of the GNU Lesser General Public
> > + License as published by the Free Software Foundation; either
> > + version 2.1 of the License, or (at your option) any later version.
> > +
> > + The GNU C Library is distributed in the hope that it will be useful,
> > + but WITHOUT ANY WARRANTY; without even the implied warranty of
> > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> > + Lesser General Public License for more details.
> > +
> > + You should have received a copy of the GNU Lesser General Public
> > + License along with the GNU C Library; if not, see
> > + <http://www.gnu.org/licenses/>. */
> > +
> > +#include <sysdep.h>
> > +
> > +#if !defined NOT_IN_libc \
> > + && (defined SHARED \
> > + || defined USE_AS_MEMMOVE \
> > + || !defined USE_MULTIARCH)
> > +
> > +#include "asm-syntax.h"
> > +
> > +#ifndef MEMCPY
> > +# define MEMCPY __memcpy_avx2_unaligned
> > +# define MEMCPY_CHK __memcpy_chk_avx2_unaligned
> > +#endif
> > +
> > +#ifndef L
> > +# define L(label) .L##label
> > +#endif
> > +
> > +#ifndef ALIGN
> > +# define ALIGN(n) .p2align n
> > +#endif
> > +
> > +#ifndef cfi_startproc
> > +# define cfi_startproc .cfi_startproc
> > +#endif
> > +
> > +#ifndef cfi_endproc
> > +# define cfi_endproc .cfi_endproc
> > +#endif
> > +
> > +#ifndef ENTRY
> > +# define ENTRY(name) \
> > + .type name, @function; \
> > + .globl name; \
> > + ALIGN(4); \
> > +name: \
> > + cfi_startproc
> > +#endif
> > +
> > +#ifndef END
> > +# define END(name) \
> > + cfi_endproc; \
> > + .size name, .-name
> > +#endif
> > +
> > + .section .text.avx2,"ax",@progbits
> > +#if !defined USE_AS_BCOPY
> > +ENTRY (MEMCPY_CHK)
> > + cmpq %rdx, %rcx
> > + jb HIDDEN_JUMPTARGET (__chk_fail)
> > +END (MEMCPY_CHK)
> > +#endif
> > +
> > +ENTRY (MEMCPY)
> > + vzeroupper
> > + mov %rdi, %rax
> > +
> > +#ifdef USE_AS_MEMPCPY
> > + add %rdx, %rax
> > +#endif
> > +
> > + lea (%rsi, %rdx), %r8
> > + lea (%rdi, %rdx), %r9
> > + cmp $256, %rdx
> > + ja L(256bytesormore)
> > + cmp $128, %edx
> > + jb L(less_128bytes)
> > + vmovups (%rsi), %xmm0
> > + vmovups 0x10(%rsi), %xmm1
> > + vmovups 0x20(%rsi), %xmm2
> > + vmovups 0x30(%rsi), %xmm3
> > + vmovups 0x40(%rsi), %xmm4
> > + vmovups 0x50(%rsi), %xmm5
> > + vmovups 0x60(%rsi), %xmm6
> > + vmovups 0x70(%rsi), %xmm7
> > + vmovups -0x80(%r8), %xmm8
> > + vmovups -0x70(%r8), %xmm9
> > + vmovups -0x60(%r8), %xmm10
> > + vmovups -0x50(%r8), %xmm11
> > + vmovups -0x40(%r8), %xmm12
> > + vmovups -0x30(%r8), %xmm13
> > + vmovups -0x20(%r8), %xmm14
> > + vmovups -0x10(%r8), %xmm15
> > + vmovups %xmm0, (%rdi)
> > + vmovups %xmm1, 0x10(%rdi)
> > + vmovups %xmm2, 0x20(%rdi)
> > + vmovups %xmm3, 0x30(%rdi)
> > + vmovups %xmm4, 0x40(%rdi)
> > + vmovups %xmm5, 0x50(%rdi)
> > + vmovups %xmm6, 0x60(%rdi)
> > + vmovups %xmm7, 0x70(%rdi)
> > + vmovups %xmm8, -0x80(%r9)
> > + vmovups %xmm9, -0x70(%r9)
> > + vmovups %xmm10, -0x60(%r9)
> > + vmovups %xmm11, -0x50(%r9)
> > + vmovups %xmm12, -0x40(%r9)
> > + vmovups %xmm13, -0x30(%r9)
> > + vmovups %xmm14, -0x20(%r9)
> > + vmovups %xmm15, -0x10(%r9)
> > + ret
> > + ALIGN(4)
> > +L(less_128bytes):
> > + cmp $64, %edx
> > + jb L(less_64bytes)
> > + vmovups (%rsi), %xmm0
> > + vmovups 0x10(%rsi), %xmm1
> > + vmovups 0x20(%rsi), %xmm2
> > + vmovups 0x30(%rsi), %xmm3
> > + vmovups -0x40(%r8), %xmm4
> > + vmovups -0x30(%r8), %xmm5
> > + vmovups -0x20(%r8), %xmm6
> > + vmovups -0x10(%r8), %xmm7
> > + vmovups %xmm0, (%rdi)
> > + vmovups %xmm1, 0x10(%rdi)
> > + vmovups %xmm2, 0x20(%rdi)
> > + vmovups %xmm3, 0x30(%rdi)
> > + vmovups %xmm4, -0x40(%r9)
> > + vmovups %xmm5, -0x30(%r9)
> > + vmovups %xmm6, -0x20(%r9)
> > + vmovups %xmm7, -0x10(%r9)
> > + ret
> > + ALIGN(4)
> > +L(less_64bytes):
> > + cmp $32, %edx
> > + jb L(less_32bytes)
> > + vmovups (%rsi), %xmm0
> > + vmovups 0x10(%rsi), %xmm1
> > + vmovups -0x20(%r8), %xmm6
> > + vmovups -0x10(%r8), %xmm7
> > + vmovups %xmm0, (%rdi)
> > + vmovups %xmm1, 0x10(%rdi)
> > + vmovups %xmm6, -0x20(%r9)
> > + vmovups %xmm7, -0x10(%r9)
> > + retq
> > + ALIGN(4)
> > +L(less_32bytes):
> > + cmp $16, %edx
> > + jb L(less_16bytes)
> > + vmovups (%rsi), %xmm0
> > + vmovups -0x10(%r8), %xmm7
> > + vmovups %xmm0, (%rdi)
> > + vmovups %xmm7, -0x10(%r9)
> > + retq
> > + ALIGN(4)
> > +L(less_16bytes):
> > + cmp $8, %edx
> > + jb L(less_8bytes)
> > + movq (%rsi), %rcx
> > + movq -0x08(%r8), %r10
> > + movq %rcx, (%rdi)
> > + movq %r10, -0x08(%r9)
> > + retq
> > + ALIGN(4)
> > +L(less_8bytes):
> > + cmp $4, %edx
> > + jb L(less_4bytes)
> > + mov (%rsi), %ecx
> > + mov -0x04(%r8), %edx
> > + mov %ecx, (%rdi)
> > + mov %edx, -0x04(%r9)
> > + ret
> > + ALIGN(4)
> > +L(less_4bytes):
> > + cmp $2, %edx
> > + jb L(less_2bytes)
> > + mov (%rsi), %cx
> > + mov -0x02(%r8), %dx
> > + mov %cx, (%rdi)
> > + mov %dx, -0x02(%r9)
> > + ret
> > + ALIGN(4)
> > +L(less_2bytes):
> > + cmp $1, %rdx
> > + jb L(less_0bytes)
> > + mov (%rsi), %cl
> > + mov %cl, (%rdi)
> > +L(less_0bytes):
> > + retq
> > +
> > + ALIGN(4)
> > +L(256bytesormore):
> > +
> > +#ifdef USE_AS_MEMMOVE
> > + cmp %rsi, %rdi
> > + jae L(copy_backward)
> > +#endif
> > + cmp $2048, %rdx
> > + jae L(gobble_data_movsb)
> > +
> > + vmovups -0x80(%r8), %xmm8
> > + vmovups -0x70(%r8), %xmm9
> > + vmovups -0x60(%r8), %xmm10
> > + vmovups -0x50(%r8), %xmm11
> > + vmovups -0x40(%r8), %xmm12
> > + vmovups -0x30(%r8), %xmm13
> > + vmovups -0x20(%r8), %xmm14
> > + vmovups -0x10(%r8), %xmm15
> > + vmovups (%rsi), %ymm4
> > + mov %rdi, %r10
> > + and $-32, %rdi
> > + add $32, %rdi
> > + mov %rdi, %r11
> > + sub %r10, %r11
> > + sub %r11, %rdx
> > + add %r11, %rsi
> > + sub $0x80, %rdx
> > +L(gobble_128_loop):
> > + vmovups (%rsi), %ymm0
> > + vmovups 0x20(%rsi), %ymm1
> > + vmovups 0x40(%rsi), %ymm2
> > + vmovups 0x60(%rsi), %ymm3
> > + lea 0x80(%rsi), %rsi
> > + vmovaps %ymm0, (%rdi)
> > + vmovaps %ymm1, 0x20(%rdi)
> > + vmovaps %ymm2, 0x40(%rdi)
> > + vmovaps %ymm3, 0x60(%rdi)
> > + lea 0x80(%rdi), %rdi
> > + sub $0x80, %rdx
> > + jae L(gobble_128_loop)
> > + vmovups %ymm4, (%r10)
> > + vzeroupper
> > + vmovups %xmm8, -0x80(%r9)
> > + vmovups %xmm9, -0x70(%r9)
> > + vmovups %xmm10, -0x60(%r9)
> > + vmovups %xmm11, -0x50(%r9)
> > + vmovups %xmm12, -0x40(%r9)
> > + vmovups %xmm13, -0x30(%r9)
> > + vmovups %xmm14, -0x20(%r9)
> > + vmovups %xmm15, -0x10(%r9)
> > + ret
> > +
> > +L(gobble_data_movsb):
> > +
> > +#ifdef SHARED_CACHE_SIZE_HALF
> > + mov $SHARED_CACHE_SIZE_HALF, %rcx
> > +#else
> > + mov __x86_64_shared_cache_size_half(%rip), %rcx
> > +#endif
> > + shl $3, %rcx
> > +
> > +#ifdef USE_AS_MEMMOVE
> > + mov %rsi, %r10
> > + sub %rdi, %r10
> > + cmp %rdx, %r10
> > + jae L(memmove_use_memcpy_fwd)
> > + cmp %rcx, %r10
> > + jae L(memmove_use_memcpy_fwd)
> > + jmp L(gobble_mem_fwd_llc_start)
> > +L(memmove_use_memcpy_fwd):
> > +#endif
> > +
> > + cmp %rcx, %rdx
> > + ja L(gobble_big_data_fwd)
> > +
> > +#ifdef USE_AS_MEMMOVE
> > +L(gobble_mem_fwd_llc_start):
> > +#endif
> > + mov %rdx, %rcx
> > + rep movsb
> > + ret
> > +
> > +L(gobble_big_data_fwd):
> > + vmovups (%rsi), %ymm4
> > + vmovups -0x80(%r8), %xmm5
> > + vmovups -0x70(%r8), %xmm6
> > + vmovups -0x60(%r8), %xmm7
> > + vmovups -0x50(%r8), %xmm8
> > + vmovups -0x40(%r8), %xmm9
> > + vmovups -0x30(%r8), %xmm10
> > + vmovups -0x20(%r8), %xmm11
> > + vmovups -0x10(%r8), %xmm12
> > + mov %rdi, %r8
> > + and $-32, %rdi
> > + add $32, %rdi
> > + mov %rdi, %r10
> > + sub %r8, %r10
> > + sub %r10, %rdx
> > + add %r10, %rsi
> > + sub $0x80, %rdx
> > +L(gobble_mem_fwd_loop):
> > + prefetcht0 0x1c0(%rsi)
> > + prefetcht0 0x280(%rsi)
> > + vmovups (%rsi), %xmm0
> > + vmovups 0x10(%rsi), %xmm1
> > + vmovups 0x20(%rsi), %xmm2
> > + vmovups 0x30(%rsi), %xmm3
> > + vmovntdq %xmm0, (%rdi)
> > + vmovntdq %xmm1, 0x10(%rdi)
> > + vmovntdq %xmm2, 0x20(%rdi)
> > + vmovntdq %xmm3, 0x30(%rdi)
> > + vmovups 0x40(%rsi), %xmm0
> > + vmovups 0x50(%rsi), %xmm1
> > + vmovups 0x60(%rsi), %xmm2
> > + vmovups 0x70(%rsi), %xmm3
> > + lea 0x80(%rsi), %rsi
> > + vmovntdq %xmm0, 0x40(%rdi)
> > + vmovntdq %xmm1, 0x50(%rdi)
> > + vmovntdq %xmm2, 0x60(%rdi)
> > + vmovntdq %xmm3, 0x70(%rdi)
> > + lea 0x80(%rdi), %rdi
> > + sub $0x80, %rdx
> > + jae L(gobble_mem_fwd_loop)
> > + sfence
> > + vmovups %ymm4, (%r8)
> > + vzeroupper
> > + vmovups %xmm5, -0x80(%r9)
> > + vmovups %xmm6, -0x70(%r9)
> > + vmovups %xmm7, -0x60(%r9)
> > + vmovups %xmm8, -0x50(%r9)
> > + vmovups %xmm9, -0x40(%r9)
> > + vmovups %xmm10, -0x30(%r9)
> > + vmovups %xmm11, -0x20(%r9)
> > + vmovups %xmm12, -0x10(%r9)
> > + ret
> > +
> > + ALIGN (4)
> > +L(copy_backward):
> > +#ifdef SHARED_CACHE_SIZE_HALF
> > + mov $SHARED_CACHE_SIZE_HALF, %rcx
> > +#else
> > + mov __x86_64_shared_cache_size_half(%rip), %rcx
> > +#endif
> > + shl $3, %rcx
> > + vmovups (%rsi), %xmm8
> > + vmovups 0x10(%rsi), %xmm9
> > + vmovups 0x20(%rsi), %xmm10
> > + vmovups 0x30(%rsi), %xmm11
> > + vmovups 0x40(%rsi), %xmm12
> > + vmovups 0x50(%rsi), %xmm13
> > + vmovups 0x60(%rsi), %xmm14
> > + vmovups 0x70(%rsi), %xmm15
> > + mov %rdi, %r9
> > + add %rdx, %rsi
> > + add %rdx, %rdi
> > + vmovups -0x20(%rsi), %ymm4
> > + lea -0x20(%rdi), %r10
> > + mov %rdi, %r11
> > + and $0x1f, %r11
> > + xor %r11, %rdi
> > + sub %r11, %rsi
> > + sub %r11, %rdx
> > +#ifdef USE_AS_MEMMOVE
> > + mov %rdi, %r11
> > + sub %rsi, %r11
> > + cmp %rdx, %r11
> > + jae L(memmove_use_memcpy_bwd)
> > + cmp %rcx, %r11
> > + jae L(memmove_use_memcpy_bwd)
> > + jmp L(gobble_mem_bwd_llc_start)
> > +#endif
> > +L(memmove_use_memcpy_bwd):
> > + cmp %rcx, %rdx
> > + ja L(gobble_big_data_bwd)
> > +L(gobble_mem_bwd_llc_start):
> > + sub $0x80, %rdx
> > +L(gobble_mem_bwd_llc):
> > + vmovups -0x20(%rsi), %ymm0
> > + vmovups -0x40(%rsi), %ymm1
> > + vmovups -0x60(%rsi), %ymm2
> > + vmovups -0x80(%rsi), %ymm3
> > + lea -0x80(%rsi), %rsi
> > + vmovaps %ymm0, -0x20(%rdi)
> > + vmovaps %ymm1, -0x40(%rdi)
> > + vmovaps %ymm2, -0x60(%rdi)
> > + vmovaps %ymm3, -0x80(%rdi)
> > + lea -0x80(%rdi), %rdi
> > + sub $0x80, %rdx
> > + jae L(gobble_mem_bwd_llc)
> > + vmovups %ymm4, (%r10)
> > + vzeroupper
> > + vmovups %xmm8, (%r9)
> > + vmovups %xmm9, 0x10(%r9)
> > + vmovups %xmm10, 0x20(%r9)
> > + vmovups %xmm11, 0x30(%r9)
> > + vmovups %xmm12, 0x40(%r9)
> > + vmovups %xmm13, 0x50(%r9)
> > + vmovups %xmm14, 0x60(%r9)
> > + vmovups %xmm15, 0x70(%r9)
> > + ret
> > +
> > +L(gobble_big_data_bwd):
> > + sub $0x80, %rdx
> > +L(gobble_mem_bwd_loop):
> > + prefetcht0 -0x1c0(%rsi)
> > + prefetcht0 -0x280(%rsi)
> > + vmovups -0x10(%rsi), %xmm0
> > + vmovups -0x20(%rsi), %xmm1
> > + vmovups -0x30(%rsi), %xmm2
> > + vmovups -0x40(%rsi), %xmm3
> > + vmovntdq %xmm0, -0x10(%rdi)
> > + vmovntdq %xmm1, -0x20(%rdi)
> > + vmovntdq %xmm2, -0x30(%rdi)
> > + vmovntdq %xmm3, -0x40(%rdi)
> > + vmovups -0x50(%rsi), %xmm0
> > + vmovups -0x60(%rsi), %xmm1
> > + vmovups -0x70(%rsi), %xmm2
> > + vmovups -0x80(%rsi), %xmm3
> > + lea -0x80(%rsi), %rsi
> > + vmovntdq %xmm0, -0x50(%rdi)
> > + vmovntdq %xmm1, -0x60(%rdi)
> > + vmovntdq %xmm2, -0x70(%rdi)
> > + vmovntdq %xmm3, -0x80(%rdi)
> > + lea -0x80(%rdi), %rdi
> > + sub $0x80, %rdx
> > + jae L(gobble_mem_bwd_loop)
> > + sfence
> > + vmovups %ymm4, (%r10)
> > + vzeroupper
> > + vmovups %xmm8, (%r9)
> > + vmovups %xmm9, 0x10(%r9)
> > + vmovups %xmm10, 0x20(%r9)
> > + vmovups %xmm11, 0x30(%r9)
> > + vmovups %xmm12, 0x40(%r9)
> > + vmovups %xmm13, 0x50(%r9)
> > + vmovups %xmm14, 0x60(%r9)
> > + vmovups %xmm15, 0x70(%r9)
> > + ret
> > +END (MEMCPY)
> > +#endif
> > diff --git a/sysdeps/x86_64/multiarch/memmove-avx2-unaligned.S b/sysdeps/x86_64/multiarch/memmove-avx2-unaligned.S
> > new file mode 100644
> > index 0000000..ddb2090
> > --- /dev/null
> > +++ b/sysdeps/x86_64/multiarch/memmove-avx2-unaligned.S
> > @@ -0,0 +1,4 @@
> > +#define USE_AS_MEMMOVE
> > +#define MEMCPY __memmove_avx2_unaligned
> > +#define MEMCPY_CHK __memmove_chk_avx2_unaligned
> > +#include "memcpy-avx2-unaligned.S"
> > diff --git a/sysdeps/x86_64/multiarch/mempcpy-avx2-unaligned.S b/sysdeps/x86_64/multiarch/mempcpy-avx2-unaligned.S
> > new file mode 100644
> > index 0000000..a2f4af9
> > --- /dev/null
> > +++ b/sysdeps/x86_64/multiarch/mempcpy-avx2-unaligned.S
> > @@ -0,0 +1,4 @@
> > +#define USE_AS_MEMPCPY
> > +#define MEMCPY __mempcpy_avx2_unaligned
> > +#define MEMCPY_CHK __mempcpy_chk_avx2_unaligned
> > +#include "memcpy-avx2-unaligned.S"
> > --
> > 1.8.1.4
> >
> >
--
We'll fix that in the next (upgrade, update, patch release, service pack).