This is the mail archive of the libc-alpha@sourceware.org mailing list for the glibc project.



Re: [PATCH RFC] Improve 64bit memcpy performance for Haswell CPU with AVX instruction


If there are still any issues with the latest memcpy and memset, please
let us know.

Thanks
Ling

2014-04-21 12:52 GMT+08:00, ling.ma.program@gmail.com <ling.ma.program@gmail.com>:
> From: Ling Ma <ling.ml@alibaba-inc.com>
>
> In this patch we take advantage of HSW (Haswell) memory bandwidth, reduce
> branch mispredictions by avoiding branch instructions, and force the
> destination to be aligned so that aligned AVX stores can be used.
>
> The CPU2006 403.gcc benchmark indicates this patch improves performance
> by 6% to 14%.
>
> This version only jumps backward for the memmove overlap case.
> Thanks to Ondra for his comments, and to Yuriy for the C code hint he
> gave me.
> ---
>  ChangeLog                                        |  16 +
>  sysdeps/x86_64/multiarch/Makefile                |   1 +
>  sysdeps/x86_64/multiarch/ifunc-impl-list.c       |  12 +
>  sysdeps/x86_64/multiarch/memcpy-avx-unaligned.S  | 399 +++++++++++++++++++++++
>  sysdeps/x86_64/multiarch/memcpy.S                |   4 +
>  sysdeps/x86_64/multiarch/memcpy_chk.S            |   3 +
>  sysdeps/x86_64/multiarch/memmove-avx-unaligned.S |  22 ++
>  sysdeps/x86_64/multiarch/memmove.c               |   7 +-
>  sysdeps/x86_64/multiarch/memmove_chk.c           |   6 +-
>  sysdeps/x86_64/multiarch/mempcpy-avx-unaligned.S |  22 ++
>  sysdeps/x86_64/multiarch/mempcpy.S               |   3 +
>  sysdeps/x86_64/multiarch/mempcpy_chk.S           |   3 +
>  12 files changed, 494 insertions(+), 4 deletions(-)
>  create mode 100644 sysdeps/x86_64/multiarch/memcpy-avx-unaligned.S
>  create mode 100644 sysdeps/x86_64/multiarch/memmove-avx-unaligned.S
>  create mode 100644 sysdeps/x86_64/multiarch/mempcpy-avx-unaligned.S
>
> diff --git a/ChangeLog b/ChangeLog
> index 9bb48ab..b8638e9 100644
> --- a/ChangeLog
> +++ b/ChangeLog
> @@ -1,4 +1,20 @@
>  2014-04-21  Ling Ma  <ling.ml@alibaba-inc.com>
> +
> +	* sysdeps/x86_64/multiarch/Makefile: Add avx memcpy/mempcpy/memmove
> +	* sysdeps/x86_64/multiarch/ifunc-impl-list.c: Add support for related
> +	files with avx memcpy
> +	* sysdeps/x86_64/multiarch/memcpy.S: Add support for avx memcpy
> +	* sysdeps/x86_64/multiarch/memcpy_chk.S: Add support for avx memcpy_chk
> +	* sysdeps/x86_64/multiarch/memmove.c: Add support for avx memmove
> +	* sysdeps/x86_64/multiarch/memmove_chk.c: Add support for avx memmove_chk
> +	* sysdeps/x86_64/multiarch/mempcpy.S: Add support for avx mempcpy
> +	* sysdeps/x86_64/multiarch/mempcpy_chk.S: Add support for avx mempcpy_chk
> +	* sysdeps/x86_64/multiarch/memcpy-avx-unaligned.S: New file for avx
> +	memcpy
> +	* sysdeps/x86_64/multiarch/mempcpy-avx-unaligned.S: New file for avx
> +	mempcpy
> +	* sysdeps/x86_64/multiarch/memmove-avx-unaligned.S: New file for avx
> +	memmove
> +
> +2014-04-21  Ling Ma  <ling.ml@alibaba-inc.com>
>
>  	* sysdeps/x86_64/multiarch/Makefile: Add memset-avx2
>  	* sysdeps/x86_64/multiarch/memset-avx2.S: New file for AVX2 memset
> diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
> index 42df96f..5902098 100644
> --- a/sysdeps/x86_64/multiarch/Makefile
> +++ b/sysdeps/x86_64/multiarch/Makefile
> @@ -11,6 +11,7 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 \
>  		   memcmp-sse4 memcpy-ssse3 \
>  		   memcpy-sse2-unaligned mempcpy-ssse3 \
>  		   memmove-ssse3 memcpy-ssse3-back mempcpy-ssse3-back \
> +		   memmove-avx-unaligned memcpy-avx-unaligned mempcpy-avx-unaligned \
>  		   memmove-ssse3-back strcasecmp_l-ssse3 \
>  		   strncase_l-ssse3 strcat-ssse3 strncat-ssse3\
>  		   strcpy-ssse3 strncpy-ssse3 stpcpy-ssse3 stpncpy-ssse3 \
> diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> index 6da9be1..34db79e 100644
> --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> @@ -46,6 +46,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>
>    /* Support sysdeps/x86_64/multiarch/memmove_chk.S.  */
>    IFUNC_IMPL (i, name, __memmove_chk,
> +	      IFUNC_IMPL_ADD (array, i, __memmove_chk, HAS_AVX,
> +			      __memmove_chk_avx_unaligned)
>  	      IFUNC_IMPL_ADD (array, i, __memmove_chk, HAS_SSSE3,
>  			      __memmove_chk_ssse3_back)
>  	      IFUNC_IMPL_ADD (array, i, __memmove_chk, HAS_SSSE3,
> @@ -55,6 +57,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>
>    /* Support sysdeps/x86_64/multiarch/memmove.S.  */
>    IFUNC_IMPL (i, name, memmove,
> +	      IFUNC_IMPL_ADD (array, i, memmove, HAS_AVX,
> +			      __memmove_avx_unaligned)
>  	      IFUNC_IMPL_ADD (array, i, memmove, HAS_SSSE3,
>  			      __memmove_ssse3_back)
>  	      IFUNC_IMPL_ADD (array, i, memmove, HAS_SSSE3,
> @@ -201,6 +205,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>  #ifdef SHARED
>    /* Support sysdeps/x86_64/multiarch/memcpy_chk.S.  */
>    IFUNC_IMPL (i, name, __memcpy_chk,
> +	      IFUNC_IMPL_ADD (array, i, __memcpy_chk, HAS_AVX,
> +			      __memcpy_chk_avx_unaligned)
>  	      IFUNC_IMPL_ADD (array, i, __memcpy_chk, HAS_SSSE3,
>  			      __memcpy_chk_ssse3_back)
>  	      IFUNC_IMPL_ADD (array, i, __memcpy_chk, HAS_SSSE3,
> @@ -210,6 +216,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>
>    /* Support sysdeps/x86_64/multiarch/memcpy.S.  */
>    IFUNC_IMPL (i, name, memcpy,
> +	      IFUNC_IMPL_ADD (array, i, memcpy, HAS_AVX,
> +			      __memcpy_avx_unaligned)
>  	      IFUNC_IMPL_ADD (array, i, memcpy, HAS_SSSE3,
>  			      __memcpy_ssse3_back)
>  	      IFUNC_IMPL_ADD (array, i, memcpy, HAS_SSSE3, __memcpy_ssse3)
> @@ -218,6 +226,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>
>    /* Support sysdeps/x86_64/multiarch/mempcpy_chk.S.  */
>    IFUNC_IMPL (i, name, __mempcpy_chk,
> +	      IFUNC_IMPL_ADD (array, i, __mempcpy_chk, HAS_AVX,
> +			      __mempcpy_chk_avx_unaligned)
>  	      IFUNC_IMPL_ADD (array, i, __mempcpy_chk, HAS_SSSE3,
>  			      __mempcpy_chk_ssse3_back)
>  	      IFUNC_IMPL_ADD (array, i, __mempcpy_chk, HAS_SSSE3,
> @@ -227,6 +237,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>
>    /* Support sysdeps/x86_64/multiarch/mempcpy.S.  */
>    IFUNC_IMPL (i, name, mempcpy,
> +	      IFUNC_IMPL_ADD (array, i, mempcpy, HAS_AVX,
> +			      __mempcpy_avx_unaligned)
>  	      IFUNC_IMPL_ADD (array, i, mempcpy, HAS_SSSE3,
>  			      __mempcpy_ssse3_back)
>  	      IFUNC_IMPL_ADD (array, i, mempcpy, HAS_SSSE3,
> diff --git a/sysdeps/x86_64/multiarch/memcpy-avx-unaligned.S b/sysdeps/x86_64/multiarch/memcpy-avx-unaligned.S
> new file mode 100644
> index 0000000..c74e306
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/memcpy-avx-unaligned.S
> @@ -0,0 +1,399 @@
> +/* memcpy with AVX
> +   Copyright (C) 2014 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <http://www.gnu.org/licenses/>.  */
> +
> +#include <sysdep.h>
> +
> +#if !defined NOT_IN_libc \
> +    && (defined SHARED \
> +        || defined USE_AS_MEMMOVE \
> +	|| !defined USE_MULTIARCH)
> +
> +#include "asm-syntax.h"
> +#ifndef MEMCPY
> +# define MEMCPY	__memcpy_avx_unaligned
> +# define MEMCPY_CHK	__memcpy_chk_avx_unaligned
> +#endif
> +
> +	.section .text.avx,"ax",@progbits
> +#if !defined USE_AS_BCOPY
> +ENTRY (MEMCPY_CHK)
> +	cmpq	%rdx, %rcx
> +	jb	HIDDEN_JUMPTARGET (__chk_fail)
> +END (MEMCPY_CHK)
> +#endif
> +
> +ENTRY (MEMCPY)
> +	mov	%rdi, %rax
> +#ifdef USE_AS_MEMPCPY
> +	add	%rdx, %rax
> +#endif
> +	cmp	$256, %rdx
> +	jae	L(256bytesormore)
> +	cmp	$128, %dl
> +	jb	L(less_128bytes)
> +	vmovdqu (%rsi), %xmm0
> +	lea	(%rsi, %rdx), %rcx
> +	vmovdqu 0x10(%rsi), %xmm1
> +	vmovdqu 0x20(%rsi), %xmm2
> +	vmovdqu 0x30(%rsi), %xmm3
> +	vmovdqu 0x40(%rsi), %xmm4
> +	vmovdqu 0x50(%rsi), %xmm5
> +	vmovdqu 0x60(%rsi), %xmm6
> +	vmovdqu 0x70(%rsi), %xmm7
> +	vmovdqu -0x80(%rcx), %xmm8
> +	vmovdqu -0x70(%rcx), %xmm9
> +	vmovdqu -0x60(%rcx), %xmm10
> +	vmovdqu -0x50(%rcx), %xmm11
> +	vmovdqu -0x40(%rcx), %xmm12
> +	vmovdqu -0x30(%rcx), %xmm13
> +	vmovdqu -0x20(%rcx), %xmm14
> +	vmovdqu -0x10(%rcx), %xmm15
> +	lea	(%rdi, %rdx), %rdx
> +	vmovdqu %xmm0, (%rdi)
> +	vmovdqu %xmm1, 0x10(%rdi)
> +	vmovdqu %xmm2, 0x20(%rdi)
> +	vmovdqu %xmm3, 0x30(%rdi)
> +	vmovdqu %xmm4, 0x40(%rdi)
> +	vmovdqu %xmm5, 0x50(%rdi)
> +	vmovdqu %xmm6, 0x60(%rdi)
> +	vmovdqu %xmm7, 0x70(%rdi)
> +	vmovdqu %xmm8, -0x80(%rdx)
> +	vmovdqu %xmm9, -0x70(%rdx)
> +	vmovdqu %xmm10, -0x60(%rdx)
> +	vmovdqu %xmm11, -0x50(%rdx)
> +	vmovdqu %xmm12, -0x40(%rdx)
> +	vmovdqu %xmm13, -0x30(%rdx)
> +	vmovdqu %xmm14, -0x20(%rdx)
> +	vmovdqu %xmm15, -0x10(%rdx)
> +	ret
> +
> +	.p2align 4
> +L(less_128bytes):
> +	cmp	$64, %dl
> +	jb	L(less_64bytes)
> +	vmovdqu (%rsi), %xmm0
> +	lea	(%rsi, %rdx), %rcx
> +	vmovdqu 0x10(%rsi), %xmm1
> +	vmovdqu 0x20(%rsi), %xmm2
> +	lea	(%rdi, %rdx), %rdx
> +	vmovdqu 0x30(%rsi), %xmm3
> +	vmovdqu -0x40(%rcx), %xmm4
> +	vmovdqu -0x30(%rcx), %xmm5
> +	vmovdqu -0x20(%rcx), %xmm6
> +	vmovdqu -0x10(%rcx), %xmm7
> +	vmovdqu %xmm0, (%rdi)
> +	vmovdqu %xmm1, 0x10(%rdi)
> +	vmovdqu %xmm2, 0x20(%rdi)
> +	vmovdqu %xmm3, 0x30(%rdi)
> +	vmovdqu %xmm4, -0x40(%rdx)
> +	vmovdqu %xmm5, -0x30(%rdx)
> +	vmovdqu %xmm6, -0x20(%rdx)
> +	vmovdqu %xmm7, -0x10(%rdx)
> +	ret
> +
> +	.p2align 4
> +L(less_64bytes):
> +	cmp	$32, %dl
> +	jb	L(less_32bytes)
> +	vmovdqu (%rsi), %xmm0
> +	vmovdqu 0x10(%rsi), %xmm1
> +	vmovdqu -0x20(%rsi, %rdx), %xmm6
> +	vmovdqu -0x10(%rsi, %rdx), %xmm7
> +	vmovdqu %xmm0, (%rdi)
> +	vmovdqu %xmm1, 0x10(%rdi)
> +	vmovdqu %xmm6, -0x20(%rdi, %rdx)
> +	vmovdqu %xmm7, -0x10(%rdi, %rdx)
> +	ret
> +
> +	.p2align 4
> +L(less_32bytes):
> +	cmp	$16, %dl
> +	jb	L(less_16bytes)
> +	vmovdqu (%rsi), %xmm0
> +	vmovdqu -0x10(%rsi, %rdx), %xmm7
> +	vmovdqu %xmm0, (%rdi)
> +	vmovdqu %xmm7, -0x10(%rdi, %rdx)
> +	ret
> +
> +	.p2align 4
> +L(less_16bytes):
> +	cmp	$8, %dl
> +	jb	L(less_8bytes)
> +	movq -0x08(%rsi, %rdx),	%rcx
> +	movq (%rsi),	%rsi
> +	movq %rsi, (%rdi)
> +	movq %rcx, -0x08(%rdi, %rdx)
> +	ret
> +
> +	.p2align 4
> +L(less_8bytes):
> +	cmp	$4, %dl
> +	jb	L(less_4bytes)
> +	mov -0x04(%rsi, %rdx), %ecx
> +	mov (%rsi),	%esi
> +	mov %esi, (%rdi)
> +	mov %ecx, -0x04(%rdi, %rdx)
> +	ret
> +
> +L(less_4bytes):
> +	cmp	$1, %dl
> +	jbe	L(less_2bytes)
> +	mov -0x02(%rsi, %rdx),	%cx
> +	mov (%rsi),	%si
> +	mov %si, (%rdi)
> +	mov %cx, -0x02(%rdi, %rdx)
> +	ret
> +
> +L(less_2bytes):
> +	jb	L(less_0bytes)
> +	mov	(%rsi), %cl
> +	mov	%cl,	(%rdi)
> +L(less_0bytes):
> +	ret
> +
> +	.p2align 4
> +L(256bytesormore):
> +#ifdef USE_AS_MEMMOVE
> +	mov	%rdi, %rcx
> +	sub	%rsi, %rcx
> +	cmp	%rdx, %rcx
> +	jc	L(copy_backward)
> +#endif
> +	cmp	$2048, %rdx
> +	jae	L(gobble_data_movsb)
> +	mov	%rax, %r8
> +	lea	(%rsi, %rdx), %rcx
> +	mov	%rdi, %r10
> +	vmovdqu -0x80(%rcx), %xmm5
> +	vmovdqu -0x70(%rcx), %xmm6
> +	mov	$0x80, %rax
> +	and	$-32, %rdi
> +	add	$32, %rdi
> +	vmovdqu -0x60(%rcx), %xmm7
> +	vmovdqu -0x50(%rcx), %xmm8
> +	mov	%rdi, %r11
> +	sub	%r10, %r11
> +	vmovdqu -0x40(%rcx), %xmm9
> +	vmovdqu -0x30(%rcx), %xmm10
> +	sub	%r11, %rdx
> +	vmovdqu -0x20(%rcx), %xmm11
> +	vmovdqu -0x10(%rcx), %xmm12
> +	vmovdqu	(%rsi), %ymm4
> +	add	%r11, %rsi
> +	sub	%eax, %edx
> +L(gobble_128_loop):
> +	vmovdqu (%rsi), %ymm0
> +	vmovdqu 0x20(%rsi), %ymm1
> +	vmovdqu 0x40(%rsi), %ymm2
> +	vmovdqu 0x60(%rsi), %ymm3
> +	add	%rax, %rsi
> +	vmovdqa %ymm0, (%rdi)
> +	vmovdqa %ymm1, 0x20(%rdi)
> +	vmovdqa %ymm2, 0x40(%rdi)
> +	vmovdqa %ymm3, 0x60(%rdi)
> +	add	%rax, %rdi
> +	sub	%eax, %edx
> +	jae	L(gobble_128_loop)
> +	add	%eax, %edx
> +	add	%rdi, %rdx
> +	vmovdqu	%ymm4, (%r10)
> +	vzeroupper
> +	vmovdqu %xmm5, -0x80(%rdx)
> +	vmovdqu %xmm6, -0x70(%rdx)
> +	vmovdqu %xmm7, -0x60(%rdx)
> +	vmovdqu %xmm8, -0x50(%rdx)
> +	vmovdqu %xmm9, -0x40(%rdx)
> +	vmovdqu %xmm10, -0x30(%rdx)
> +	vmovdqu %xmm11, -0x20(%rdx)
> +	vmovdqu %xmm12, -0x10(%rdx)
> +	mov	%r8, %rax
> +	ret
> +
> +	.p2align 4
> +L(gobble_data_movsb):
> +
> +#ifdef SHARED_CACHE_SIZE_HALF
> +	mov	$SHARED_CACHE_SIZE_HALF, %rcx
> +#else
> +	mov	__x86_shared_cache_size_half(%rip), %rcx
> +#endif
> +	shl	$3, %rcx
> +
> +#ifdef USE_AS_MEMMOVE
> +	mov	%rsi, %r10
> +	sub	%rdi, %r10
> +	cmp	%rdx, %r10
> +	jae	L(memmove_use_memcpy_fwd)
> +	cmp	%rcx, %r10
> +	jae	L(memmove_use_memcpy_fwd)
> +	jmp L(gobble_mem_fwd_llc_start)
> +L(memmove_use_memcpy_fwd):
> +#endif
> +	cmp	%rcx, %rdx
> +	jae	L(gobble_big_data_fwd)
> +#ifdef USE_AS_MEMMOVE
> +L(gobble_mem_fwd_llc_start):
> +#endif
> +	mov	%rdx, %rcx
> +	rep	movsb
> +	ret
> +
> +	.p2align 4
> +L(gobble_big_data_fwd):
> +	lea	(%rsi, %rdx), %rcx
> +	vmovdqu	(%rsi), %ymm4
> +	vmovdqu -0x80(%rsi,%rdx), %xmm5
> +	vmovdqu -0x70(%rcx), %xmm6
> +	vmovdqu -0x60(%rcx), %xmm7
> +	vmovdqu -0x50(%rcx), %xmm8
> +	vmovdqu -0x40(%rcx), %xmm9
> +	vmovdqu -0x30(%rcx), %xmm10
> +	vmovdqu -0x20(%rcx), %xmm11
> +	vmovdqu -0x10(%rcx), %xmm12
> +	mov	%rdi, %r8
> +	and	$-32, %rdi
> +	add	$32, %rdi
> +	mov	%rdi, %r10
> +	sub	%r8, %r10
> +	sub	%r10, %rdx
> +	add	%r10, %rsi
> +	lea	(%rdi, %rdx), %rcx
> +	sub	$0x80, %rdx
> +L(gobble_mem_fwd_loop):
> +	prefetchnta 0x1c0(%rsi)
> +	prefetchnta 0x280(%rsi)
> +	vmovdqu	(%rsi), %ymm0
> +	vmovdqu	0x20(%rsi), %ymm1
> +	vmovdqu	0x40(%rsi), %ymm2
> +	vmovdqu	0x60(%rsi), %ymm3
> +	lea	0x80(%rsi), %rsi
> +	vmovntdq	%ymm0, (%rdi)
> +	vmovntdq	%ymm1, 0x20(%rdi)
> +	vmovntdq	%ymm2, 0x40(%rdi)
> +	vmovntdq	%ymm3, 0x60(%rdi)
> +	lea	0x80(%rdi), %rdi
> +	sub	$0x80, %rdx
> +	jae	L(gobble_mem_fwd_loop)
> +	sfence
> +	vmovdqu	%ymm4, (%r8)
> +	vzeroupper
> +	vmovdqu %xmm5, -0x80(%rcx)
> +	vmovdqu %xmm6, -0x70(%rcx)
> +	vmovdqu %xmm7, -0x60(%rcx)
> +	vmovdqu %xmm8, -0x50(%rcx)
> +	vmovdqu %xmm9, -0x40(%rcx)
> +	vmovdqu %xmm10, -0x30(%rcx)
> +	vmovdqu %xmm11, -0x20(%rcx)
> +	vmovdqu %xmm12, -0x10(%rcx)
> +	ret
> +
> +#ifdef USE_AS_MEMMOVE
> +	.p2align 4
> +L(copy_backward):
> +#ifdef SHARED_CACHE_SIZE_HALF
> +	mov	$SHARED_CACHE_SIZE_HALF, %rcx
> +#else
> +	mov	__x86_shared_cache_size_half(%rip), %rcx
> +#endif
> +	shl	$3, %rcx
> +	mov	%rdi, %r9
> +	vmovdqu (%rsi), %xmm8
> +	vmovdqu 0x10(%rsi), %xmm9
> +	add	%rdx, %rdi
> +	vmovdqu 0x20(%rsi), %xmm10
> +	vmovdqu 0x30(%rsi), %xmm11
> +	lea	-0x20(%rdi), %r10
> +	mov %rdi, %r11
> +	vmovdqu 0x40(%rsi), %xmm12
> +	vmovdqu 0x50(%rsi), %xmm13
> +	and	$0x1f, %r11
> +	vmovdqu 0x60(%rsi), %xmm14
> +	vmovdqu 0x70(%rsi), %xmm15
> +	xor	%r11, %rdi
> +	add	%rdx, %rsi
> +	vmovdqu	-0x20(%rsi), %ymm4
> +	sub	%r11, %rsi
> +	sub	%r11, %rdx
> +	mov	%rdi, %r11
> +	sub	%rsi, %r11
> +	cmp	%rdx, %r11
> +	jae	L(memmove_use_memcpy_bwd)
> +	cmp	%rcx, %r11
> +	jae	L(memmove_use_memcpy_bwd)
> +	jmp L(gobble_mem_bwd_llc_start)
> +L(memmove_use_memcpy_bwd):
> +	cmp	%rcx, %rdx
> +	ja	L(gobble_big_data_bwd)
> +L(gobble_mem_bwd_llc_start):
> +	sub	$0x80, %rdx
> +L(gobble_mem_bwd_llc):
> +	vmovdqu	-0x20(%rsi), %ymm0
> +	vmovdqu	-0x40(%rsi), %ymm1
> +	vmovdqu	-0x60(%rsi), %ymm2
> +	vmovdqu	-0x80(%rsi), %ymm3
> +	lea	-0x80(%rsi), %rsi
> +	vmovdqa	%ymm0, -0x20(%rdi)
> +	vmovdqa	%ymm1, -0x40(%rdi)
> +	vmovdqa	%ymm2, -0x60(%rdi)
> +	vmovdqa	%ymm3, -0x80(%rdi)
> +	lea	-0x80(%rdi), %rdi
> +	sub	$0x80, %rdx
> +	jae	L(gobble_mem_bwd_llc)
> +	vmovdqu	%ymm4, (%r10)
> +	vzeroupper
> +	vmovdqu %xmm8, (%r9)
> +	vmovdqu %xmm9, 0x10(%r9)
> +	vmovdqu %xmm10, 0x20(%r9)
> +	vmovdqu %xmm11, 0x30(%r9)
> +	vmovdqu %xmm12, 0x40(%r9)
> +	vmovdqu %xmm13, 0x50(%r9)
> +	vmovdqu %xmm14, 0x60(%r9)
> +	vmovdqu %xmm15, 0x70(%r9)
> +	ret
> +L(gobble_big_data_bwd):
> +	sub	$0x80, %rdx
> +L(gobble_mem_bwd_loop):
> +	prefetchnta -0x1c0(%rsi)
> +	prefetchnta -0x280(%rsi)
> +	vmovdqu	-0x20(%rsi), %ymm0
> +	vmovdqu	-0x40(%rsi), %ymm1
> +	vmovdqu	-0x60(%rsi), %ymm2
> +	vmovdqu	-0x80(%rsi), %ymm3
> +	lea	-0x80(%rsi), %rsi
> +	vmovntdq	%ymm0, -0x20(%rdi)
> +	vmovntdq	%ymm1, -0x40(%rdi)
> +	vmovntdq	%ymm2, -0x60(%rdi)
> +	vmovntdq	%ymm3, -0x80(%rdi)
> +	lea	-0x80(%rdi), %rdi
> +	sub	$0x80, %rdx
> +	jae	L(gobble_mem_bwd_loop)
> +	sfence
> +	vmovdqu	%ymm4, (%r10)
> +	vzeroupper
> +	vmovdqu %xmm8, (%r9)
> +	vmovdqu %xmm9, 0x10(%r9)
> +	vmovdqu %xmm10, 0x20(%r9)
> +	vmovdqu %xmm11, 0x30(%r9)
> +	vmovdqu %xmm12, 0x40(%r9)
> +	vmovdqu %xmm13, 0x50(%r9)
> +	vmovdqu %xmm14, 0x60(%r9)
> +	vmovdqu %xmm15, 0x70(%r9)
> +	ret
> +#endif
> +END (MEMCPY)
> +#endif
> diff --git a/sysdeps/x86_64/multiarch/memcpy.S b/sysdeps/x86_64/multiarch/memcpy.S
> index 40ae926..b92f986 100644
> --- a/sysdeps/x86_64/multiarch/memcpy.S
> +++ b/sysdeps/x86_64/multiarch/memcpy.S
> @@ -32,6 +32,10 @@ ENTRY(__new_memcpy)
>  	cmpl	$0, KIND_OFFSET+__cpu_features(%rip)
>  	jne	1f
>  	call	__init_cpu_features
> +1:	leaq	__memcpy_avx_unaligned(%rip), %rax
> +	testl	$bit_AVX_Usable, __cpu_features+FEATURE_OFFSET+index_AVX_Usable(%rip)
> +	jz 1f
> +	ret
>  1:	leaq	__memcpy_sse2(%rip), %rax
>  	testl	$bit_Slow_BSF, __cpu_features+FEATURE_OFFSET+index_Slow_BSF(%rip)
>  	jnz	2f
> diff --git a/sysdeps/x86_64/multiarch/memcpy_chk.S b/sysdeps/x86_64/multiarch/memcpy_chk.S
> index 3c0270f..076b19a 100644
> --- a/sysdeps/x86_64/multiarch/memcpy_chk.S
> +++ b/sysdeps/x86_64/multiarch/memcpy_chk.S
> @@ -39,6 +39,9 @@ ENTRY(__memcpy_chk)
>  	testl	$bit_Fast_Copy_Backward, __cpu_features+FEATURE_OFFSET+index_Fast_Copy_Backward(%rip)
>  	jz	2f
>  	leaq	__memcpy_chk_ssse3_back(%rip), %rax
> +	testl   $bit_AVX_Usable, __cpu_features+FEATURE_OFFSET+index_AVX_Usable(%rip)
> +	jz  2f
> +	leaq    __memcpy_chk_avx_unaligned(%rip), %rax
>  2:	ret
>  END(__memcpy_chk)
>  # else
> diff --git a/sysdeps/x86_64/multiarch/memmove-avx-unaligned.S b/sysdeps/x86_64/multiarch/memmove-avx-unaligned.S
> new file mode 100644
> index 0000000..faed9fb
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/memmove-avx-unaligned.S
> @@ -0,0 +1,22 @@
> +/* memmove with AVX
> +   Copyright (C) 2014 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <http://www.gnu.org/licenses/>.  */
> +
> +#define USE_AS_MEMMOVE
> +#define MEMCPY		__memmove_avx_unaligned
> +#define MEMCPY_CHK	__memmove_chk_avx_unaligned
> +#include "memcpy-avx-unaligned.S"
> diff --git a/sysdeps/x86_64/multiarch/memmove.c b/sysdeps/x86_64/multiarch/memmove.c
> index ba86e7b..ee73f23 100644
> --- a/sysdeps/x86_64/multiarch/memmove.c
> +++ b/sysdeps/x86_64/multiarch/memmove.c
> @@ -35,6 +35,8 @@
>  extern __typeof (__redirect_memmove) __memmove_sse2 attribute_hidden;
>  extern __typeof (__redirect_memmove) __memmove_ssse3 attribute_hidden;
>  extern __typeof (__redirect_memmove) __memmove_ssse3_back attribute_hidden;
> +extern __typeof (__redirect_memmove) __memmove_avx_unaligned attribute_hidden;
> +
>  #endif
>
>  #include "string/memmove.c"
> @@ -47,10 +49,11 @@ extern __typeof (__redirect_memmove) __memmove_ssse3_back attribute_hidden;
>     ifunc symbol properly.  */
>  extern __typeof (__redirect_memmove) __libc_memmove;
>  libc_ifunc (__libc_memmove,
> -	    HAS_SSSE3
> +	    HAS_AVX ? __memmove_avx_unaligned :
> +	    (HAS_SSSE3
>  	    ? (HAS_FAST_COPY_BACKWARD
>  	       ? __memmove_ssse3_back : __memmove_ssse3)
> -	    : __memmove_sse2)
> +	    : __memmove_sse2));
>
>  strong_alias (__libc_memmove, memmove)
>
> diff --git a/sysdeps/x86_64/multiarch/memmove_chk.c b/sysdeps/x86_64/multiarch/memmove_chk.c
> index cb1acb6..44344f2 100644
> --- a/sysdeps/x86_64/multiarch/memmove_chk.c
> +++ b/sysdeps/x86_64/multiarch/memmove_chk.c
> @@ -25,11 +25,13 @@
>  extern __typeof (__memmove_chk) __memmove_chk_sse2 attribute_hidden;
>  extern __typeof (__memmove_chk) __memmove_chk_ssse3 attribute_hidden;
>  extern __typeof (__memmove_chk) __memmove_chk_ssse3_back attribute_hidden;
> +extern __typeof (__memmove_chk) __memmove_chk_avx_unaligned attribute_hidden;
>
>  #include "debug/memmove_chk.c"
>
>  libc_ifunc (__memmove_chk,
> -	    HAS_SSSE3
> +	    HAS_AVX ? __memmove_chk_avx_unaligned :
> +	    (HAS_SSSE3
>  	    ? (HAS_FAST_COPY_BACKWARD
>  	       ? __memmove_chk_ssse3_back : __memmove_chk_ssse3)
> -	    : __memmove_chk_sse2);
> +	    : __memmove_chk_sse2));
> diff --git a/sysdeps/x86_64/multiarch/mempcpy-avx-unaligned.S b/sysdeps/x86_64/multiarch/mempcpy-avx-unaligned.S
> new file mode 100644
> index 0000000..438bda3
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/mempcpy-avx-unaligned.S
> @@ -0,0 +1,22 @@
> +/* mempcpy with AVX
> +   Copyright (C) 2014 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <http://www.gnu.org/licenses/>.  */
> +
> +#define USE_AS_MEMPCPY
> +#define MEMCPY		__mempcpy_avx_unaligned
> +#define MEMCPY_CHK	__mempcpy_chk_avx_unaligned
> +#include "memcpy-avx-unaligned.S"
> diff --git a/sysdeps/x86_64/multiarch/mempcpy.S b/sysdeps/x86_64/multiarch/mempcpy.S
> index b9f04c2..7589d8c 100644
> --- a/sysdeps/x86_64/multiarch/mempcpy.S
> +++ b/sysdeps/x86_64/multiarch/mempcpy.S
> @@ -37,6 +37,9 @@ ENTRY(__mempcpy)
>  	testl	$bit_Fast_Copy_Backward, __cpu_features+FEATURE_OFFSET+index_Fast_Copy_Backward(%rip)
>  	jz	2f
>  	leaq	__mempcpy_ssse3_back(%rip), %rax
> +	testl	$bit_AVX_Usable, __cpu_features+FEATURE_OFFSET+index_AVX_Usable(%rip)
> +	jz	2f
> +	leaq	__mempcpy_avx_unaligned(%rip), %rax
>  2:	ret
>  END(__mempcpy)
>
> diff --git a/sysdeps/x86_64/multiarch/mempcpy_chk.S b/sysdeps/x86_64/multiarch/mempcpy_chk.S
> index c28473a..88e0b74 100644
> --- a/sysdeps/x86_64/multiarch/mempcpy_chk.S
> +++ b/sysdeps/x86_64/multiarch/mempcpy_chk.S
> @@ -39,6 +39,9 @@ ENTRY(__mempcpy_chk)
>  	testl	$bit_Fast_Copy_Backward, __cpu_features+FEATURE_OFFSET+index_Fast_Copy_Backward(%rip)
>  	jz	2f
>  	leaq	__mempcpy_chk_ssse3_back(%rip), %rax
> +	testl	$bit_AVX_Usable, __cpu_features+FEATURE_OFFSET+index_AVX_Usable(%rip)
> +	jz	2f
> +	leaq	__mempcpy_chk_avx_unaligned(%rip), %rax
>  2:	ret
>  END(__mempcpy_chk)
>  # else
> --
> 1.8.1.4
>
>

