This is the mail archive of the libc-alpha@sourceware.org mailing list for the glibc project.
Index Nav:	[Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav:	[Date Prev] [Date Next]	[Thread Prev] [Thread Next]
Other format:	[Raw text]
Re: [PATCH RFC V2] Improve 64bit memcpy/memove for Corei7 with unaligned avx instruction

From: Liubov Dmitrieva <liubov dot dmitrieva at gmail dot com>
To: Ling Ma <ling dot ma dot program at gmail dot com>
Cc: GNU C Library <libc-alpha at sourceware dot org>, Ondrej Bilka <neleai at seznam dot cz>, Ma Ling <ling dot ml at alibaba-inc dot com>
Date: Thu, 11 Jul 2013 17:59:30 +0400
Subject: Re: [PATCH RFC V2] Improve 64bit memcpy/memove for Corei7 with unaligned avx instruction
References: <1373547096-8095-1-git-send-email-ling dot ma dot program at gmail dot com>
We need to check performance on Core i7 with AVX before installing this.
As far as I understand, you checked on Haswell only? But AVX is supported
on more architectures than AVX2.

You forgot to fix the copyright year: s/2010/2013

--
Liubov

On Thu, Jul 11, 2013 at 4:51 PM,  <ling.ma.program@gmail.com> wrote:
> From: Ma Ling <ling.ml@alibaba-inc.com>
>
> We manage to avoid branch instructions, and force the destination to be aligned
> for the AVX instructions. We modified gcc.403 so that we measure only the memcpy function;
> the gcc.403 benchmarks indicate this version improves performance by 4% to 16% in different cases.
>
> Best Regards
> Ling
> ---
> In this version we did clean-up work, thanks Liubov.
>
>  sysdeps/x86_64/multiarch/Makefile                |   5 +-
>  sysdeps/x86_64/multiarch/ifunc-defines.sym       |   2 +
>  sysdeps/x86_64/multiarch/ifunc-impl-list.c       |  11 +
>  sysdeps/x86_64/multiarch/memcpy-avx-unaligned.S  | 409 +++++++++++++++++++++++
>  sysdeps/x86_64/multiarch/memmove-avx-unaligned.S |   4 +
>  sysdeps/x86_64/multiarch/mempcpy-avx-unaligned.S |   4 +
>  6 files changed, 433 insertions(+), 2 deletions(-)
>  create mode 100644 sysdeps/x86_64/multiarch/memcpy-avx-unaligned.S
>  create mode 100644 sysdeps/x86_64/multiarch/memmove-avx-unaligned.S
>  create mode 100644 sysdeps/x86_64/multiarch/mempcpy-avx-unaligned.S
>
> diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
> index dd6c27d..f92cf18 100644
> --- a/sysdeps/x86_64/multiarch/Makefile
> +++ b/sysdeps/x86_64/multiarch/Makefile
> @@ -8,8 +8,9 @@ ifeq ($(subdir),string)
>
>  sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 strncmp-ssse3 \
>                    strend-sse4 memcmp-sse4 memcpy-ssse3 mempcpy-ssse3 \
> -                  memmove-ssse3 memcpy-ssse3-back mempcpy-ssse3-back \
> -                  memmove-ssse3-back strcasestr-nonascii strcasecmp_l-ssse3 \
> +                  memmove-ssse3 memcpy-ssse3-back mempcpy-ssse3-back memmove-ssse3-back \
> +                  memcpy-avx-unaligned mempcpy-avx-unaligned memmove-avx-unaligned \
> +                  strcasestr-nonascii strcasecmp_l-ssse3 \
>                    strncase_l-ssse3 strlen-sse4 strlen-sse2-no-bsf memset-x86-64 \
>                    strcpy-ssse3 strncpy-ssse3 stpcpy-ssse3 stpncpy-ssse3 \
>                    strcpy-sse2-unaligned strncpy-sse2-unaligned \
> diff --git a/sysdeps/x86_64/multiarch/ifunc-defines.sym b/sysdeps/x86_64/multiarch/ifunc-defines.sym
> index eb1538a..448b8c4 100644
> --- a/sysdeps/x86_64/multiarch/ifunc-defines.sym
> +++ b/sysdeps/x86_64/multiarch/ifunc-defines.sym
> @@ -17,4 +17,6 @@ FEATURE_OFFSET                offsetof (struct cpu_features, feature)
>  FEATURE_SIZE           sizeof (unsigned int)
>
>  COMMON_CPUID_INDEX_1
> +COMMON_CPUID_INDEX_7
>  FEATURE_INDEX_1
> +FEATURE_INDEX_7
> diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> index 332a60d..5639702 100644
> --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> @@ -50,6 +50,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>                               __memmove_chk_ssse3_back)
>               IFUNC_IMPL_ADD (array, i, __memmove_chk, HAS_SSSE3,
>                               __memmove_chk_ssse3)
> +             IFUNC_IMPL_ADD (array, i, __memmove_chk, HAS_AVX,
> +                         __memmove_chk_avx_unaligned)
>               IFUNC_IMPL_ADD (array, i, __memmove_chk, 1,
>                               __memmove_chk_sse2))
>
> @@ -59,6 +61,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>                               __memmove_ssse3_back)
>               IFUNC_IMPL_ADD (array, i, memmove, HAS_SSSE3,
>                               __memmove_ssse3)
> +             IFUNC_IMPL_ADD (array, i, memmove, HAS_AVX,
> +                         __memmove_avx_unaligned)
>               IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_sse2))
>
>    /* Support sysdeps/x86_64/multiarch/memset_chk.S.  */
> @@ -235,6 +239,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>                               __memcpy_chk_ssse3_back)
>               IFUNC_IMPL_ADD (array, i, __memcpy_chk, HAS_SSSE3,
>                               __memcpy_chk_ssse3)
> +             IFUNC_IMPL_ADD (array, i, __memcpy_chk, HAS_AVX,
> +                             __memcpy_chk_avx_unaligned)
>               IFUNC_IMPL_ADD (array, i, __memcpy_chk, 1,
>                               __memcpy_chk_sse2))
>
> @@ -243,6 +249,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>               IFUNC_IMPL_ADD (array, i, memcpy, HAS_SSSE3,
>                               __memcpy_ssse3_back)
>               IFUNC_IMPL_ADD (array, i, memcpy, HAS_SSSE3, __memcpy_ssse3)
> +             IFUNC_IMPL_ADD (array, i, memcpy, HAS_AVX, __memcpy_avx_unaligned)
>               IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_sse2))
>
>    /* Support sysdeps/x86_64/multiarch/mempcpy_chk.S.  */
> @@ -251,6 +258,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>                               __mempcpy_chk_ssse3_back)
>               IFUNC_IMPL_ADD (array, i, __mempcpy_chk, HAS_SSSE3,
>                               __mempcpy_chk_ssse3)
> +             IFUNC_IMPL_ADD (array, i, __mempcpy_chk, HAS_AVX,
> +                             __mempcpy_chk_avx_unaligned)
>               IFUNC_IMPL_ADD (array, i, __mempcpy_chk, 1,
>                               __mempcpy_chk_sse2))
>
> @@ -260,6 +269,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>                               __mempcpy_ssse3_back)
>               IFUNC_IMPL_ADD (array, i, mempcpy, HAS_SSSE3,
>                               __mempcpy_ssse3)
> +             IFUNC_IMPL_ADD (array, i, mempcpy, HAS_AVX,
> +                             __mempcpy_avx_unaligned)
>               IFUNC_IMPL_ADD (array, i, mempcpy, 1, __mempcpy_sse2))
>
>    /* Support sysdeps/x86_64/multiarch/strlen.S.  */
> diff --git a/sysdeps/x86_64/multiarch/memcpy-avx-unaligned.S b/sysdeps/x86_64/multiarch/memcpy-avx-unaligned.S
> new file mode 100644
> index 0000000..68901f6
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/memcpy-avx-unaligned.S
> @@ -0,0 +1,409 @@
> +/* memcpy with AVX
> +   Copyright (C) 2010 Free Software Foundation, Inc.
> +   Contributed by Intel Corporation.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <http://www.gnu.org/licenses/>.  */
> +
> +#include <sysdep.h>
> +
> +#if !defined NOT_IN_libc \
> +    && (defined SHARED \
> +        || defined USE_AS_MEMMOVE \
> +       || !defined USE_MULTIARCH)
> +
> +#include "asm-syntax.h"
> +#ifndef ALIGN
> +# define ALIGN(n)      .p2align n
> +#endif
> +#ifndef MEMCPY
> +# define MEMCPY        __memcpy_avx_unaligned
> +# define MEMCPY_CHK    __memcpy_chk_avx_unaligned
> +#endif
> +
> +       .section .text.avx,"ax",@progbits
> +#if !defined USE_AS_BCOPY
> +ENTRY (MEMCPY_CHK)
> +       cmpq    %rdx, %rcx
> +       jb      HIDDEN_JUMPTARGET (__chk_fail)
> +END (MEMCPY_CHK)
> +#endif
> +
> +ENTRY (MEMCPY)
> +       vzeroupper
> +       mov     %rdi, %rax
> +
> +#ifdef USE_AS_MEMPCPY
> +       add     %rdx, %rax
> +#endif
> +
> +       lea     (%rsi, %rdx), %r8
> +       lea     (%rdi, %rdx), %r9
> +       cmp     $256, %rdx
> +       ja      L(256bytesormore)
> +       cmp     $128, %edx
> +       jb      L(less_128bytes)
> +       vmovups (%rsi), %xmm0
> +       vmovups 0x10(%rsi), %xmm1
> +       vmovups 0x20(%rsi), %xmm2
> +       vmovups 0x30(%rsi), %xmm3
> +       vmovups 0x40(%rsi), %xmm4
> +       vmovups 0x50(%rsi), %xmm5
> +       vmovups 0x60(%rsi), %xmm6
> +       vmovups 0x70(%rsi), %xmm7
> +       vmovups -0x80(%r8), %xmm8
> +       vmovups -0x70(%r8), %xmm9
> +       vmovups -0x60(%r8), %xmm10
> +       vmovups -0x50(%r8), %xmm11
> +       vmovups -0x40(%r8), %xmm12
> +       vmovups -0x30(%r8), %xmm13
> +       vmovups -0x20(%r8), %xmm14
> +       vmovups -0x10(%r8), %xmm15
> +       vmovups %xmm0, (%rdi)
> +       vmovups %xmm1, 0x10(%rdi)
> +       vmovups %xmm2, 0x20(%rdi)
> +       vmovups %xmm3, 0x30(%rdi)
> +       vmovups %xmm4, 0x40(%rdi)
> +       vmovups %xmm5, 0x50(%rdi)
> +       vmovups %xmm6, 0x60(%rdi)
> +       vmovups %xmm7, 0x70(%rdi)
> +       vmovups %xmm8, -0x80(%r9)
> +       vmovups %xmm9, -0x70(%r9)
> +       vmovups %xmm10, -0x60(%r9)
> +       vmovups %xmm11, -0x50(%r9)
> +       vmovups %xmm12, -0x40(%r9)
> +       vmovups %xmm13, -0x30(%r9)
> +       vmovups %xmm14, -0x20(%r9)
> +       vmovups %xmm15, -0x10(%r9)
> +       ret
> +       ALIGN(4)
> +L(less_128bytes):
> +       cmp     $64, %edx
> +       jb      L(less_64bytes)
> +       vmovups (%rsi), %xmm0
> +       vmovups 0x10(%rsi), %xmm1
> +       vmovups 0x20(%rsi), %xmm2
> +       vmovups 0x30(%rsi), %xmm3
> +       vmovups -0x40(%r8), %xmm4
> +       vmovups -0x30(%r8), %xmm5
> +       vmovups -0x20(%r8), %xmm6
> +       vmovups -0x10(%r8), %xmm7
> +       vmovups %xmm0, (%rdi)
> +       vmovups %xmm1, 0x10(%rdi)
> +       vmovups %xmm2, 0x20(%rdi)
> +       vmovups %xmm3, 0x30(%rdi)
> +       vmovups %xmm4, -0x40(%r9)
> +       vmovups %xmm5, -0x30(%r9)
> +       vmovups %xmm6, -0x20(%r9)
> +       vmovups %xmm7, -0x10(%r9)
> +       ret
> +       ALIGN(4)
> +L(less_64bytes):
> +       cmp     $32, %edx
> +       jb      L(less_32bytes)
> +       vmovups (%rsi), %xmm0
> +       vmovups 0x10(%rsi), %xmm1
> +       vmovups -0x20(%r8), %xmm6
> +       vmovups -0x10(%r8), %xmm7
> +       vmovups %xmm0, (%rdi)
> +       vmovups %xmm1, 0x10(%rdi)
> +       vmovups %xmm6, -0x20(%r9)
> +       vmovups %xmm7, -0x10(%r9)
> +       ret
> +       ALIGN(4)
> +L(less_32bytes):
> +       cmp     $16, %edx
> +       jb      L(less_16bytes)
> +       vmovups (%rsi), %xmm0
> +       vmovups -0x10(%r8), %xmm7
> +       vmovups %xmm0, (%rdi)
> +       vmovups %xmm7, -0x10(%r9)
> +       ret
> +       ALIGN(4)
> +L(less_16bytes):
> +       cmp     $8, %edx
> +       jb      L(less_8bytes)
> +       movq (%rsi),    %rcx
> +       movq -0x08(%r8),        %r10
> +       movq %rcx, (%rdi)
> +       movq %r10, -0x08(%r9)
> +       ret
> +       ALIGN(4)
> +L(less_8bytes):
> +       cmp     $4, %edx
> +       jb      L(less_4bytes)
> +       mov (%rsi),     %ecx
> +       mov -0x04(%r8), %edx
> +       mov %ecx, (%rdi)
> +       mov %edx, -0x04(%r9)
> +       ret
> +       ALIGN(4)
> +L(less_4bytes):
> +       cmp     $2, %edx
> +       jb      L(less_2bytes)
> +       mov (%rsi),     %cx
> +       mov -0x02(%r8), %dx
> +       mov %cx, (%rdi)
> +       mov %dx, -0x02(%r9)
> +       ret
> +       ALIGN(4)
> +L(less_2bytes):
> +       cmp     $1, %rdx
> +       jb      L(less_0bytes)
> +       mov     (%rsi), %cl
> +       mov     %cl,    (%rdi)
> +L(less_0bytes):
> +       ret
> +
> +       ALIGN(4)
> +L(256bytesormore):
> +
> +#ifdef USE_AS_MEMMOVE
> +       cmp     %rsi, %rdi
> +       jae     L(copy_backward)
> +#endif
> +       cmp     $2048, %rdx
> +       jae     L(gobble_data_movsb)
> +
> +       vmovups -0x80(%r8), %xmm8
> +       vmovups -0x70(%r8), %xmm9
> +       vmovups -0x60(%r8), %xmm10
> +       vmovups -0x50(%r8), %xmm11
> +       vmovups -0x40(%r8), %xmm12
> +       vmovups -0x30(%r8), %xmm13
> +       vmovups -0x20(%r8), %xmm14
> +       vmovups -0x10(%r8), %xmm15
> +       vmovups (%rsi), %ymm4
> +       mov     %rdi, %r10
> +       and     $-32, %rdi
> +       add     $32, %rdi
> +       mov     %rdi, %r11
> +       sub     %r10, %r11
> +       sub     %r11, %rdx
> +       add     %r11, %rsi
> +       sub     $0x80, %rdx
> +L(goble_128_loop):
> +       vmovups (%rsi), %ymm0
> +       vmovups 0x20(%rsi), %ymm1
> +       vmovups 0x40(%rsi), %ymm2
> +       vmovups 0x60(%rsi), %ymm3
> +       lea     0x80(%rsi), %rsi
> +       vmovaps %ymm0, (%rdi)
> +       vmovaps %ymm1, 0x20(%rdi)
> +       vmovaps %ymm2, 0x40(%rdi)
> +       vmovaps %ymm3, 0x60(%rdi)
> +       lea     0x80(%rdi), %rdi
> +       sub     $0x80, %rdx
> +       jae     L(goble_128_loop)
> +       vmovups %ymm4, (%r10)
> +       vzeroupper
> +       vmovups %xmm8, -0x80(%r9)
> +       vmovups %xmm9, -0x70(%r9)
> +       vmovups %xmm10, -0x60(%r9)
> +       vmovups %xmm11, -0x50(%r9)
> +       vmovups %xmm12, -0x40(%r9)
> +       vmovups %xmm13, -0x30(%r9)
> +       vmovups %xmm14, -0x20(%r9)
> +       vmovups %xmm15, -0x10(%r9)
> +       ret
> +
> +L(gobble_data_movsb):
> +
> +#ifdef SHARED_CACHE_SIZE_HALF
> +       mov     $SHARED_CACHE_SIZE_HALF, %rcx
> +#else
> +       mov     __x86_64_shared_cache_size_half(%rip), %rcx
> +#endif
> +       shl     $3, %rcx
> +
> +#ifdef USE_AS_MEMMOVE
> +       mov     %rsi, %r10
> +       sub     %rdi, %r10
> +       cmp     %rdx, %r10
> +       jae     L(memmove_use_memcpy_fwd)
> +       cmp     %rcx, %r10
> +       jae     L(memmove_use_memcpy_fwd)
> +       jmp L(gobble_mem_fwd_llc_start)
> +L(memmove_use_memcpy_fwd):
> +#endif
> +
> +       cmp     %rcx, %rdx
> +       ja      L(gobble_big_data_fwd)
> +
> +#ifdef USE_AS_MEMMOVE
> +L(gobble_mem_fwd_llc_start):
> +#endif
> +       mov     %rdx, %rcx
> +       rep     movsb
> +       ret
> +
> +L(gobble_big_data_fwd):
> +       vmovups (%rsi), %ymm4
> +       vmovups -0x80(%r8), %xmm5
> +       vmovups -0x70(%r8), %xmm6
> +       vmovups -0x60(%r8), %xmm7
> +       vmovups -0x50(%r8), %xmm8
> +       vmovups -0x40(%r8), %xmm9
> +       vmovups -0x30(%r8), %xmm10
> +       vmovups -0x20(%r8), %xmm11
> +       vmovups -0x10(%r8), %xmm12
> +       mov     %rdi, %r8
> +       and     $-32, %rdi
> +       add     $32, %rdi
> +       mov     %rdi, %r10
> +       sub     %r8, %r10
> +       sub     %r10, %rdx
> +       add     %r10, %rsi
> +       sub     $0x80, %rdx
> +L(gobble_mem_fwd_loop):
> +       prefetcht0 0x1c0(%rsi)
> +       prefetcht0 0x280(%rsi)
> +       vmovups (%rsi), %xmm0
> +       vmovups 0x10(%rsi), %xmm1
> +       vmovups 0x20(%rsi), %xmm2
> +       vmovups 0x30(%rsi), %xmm3
> +       vmovntdq        %xmm0, (%rdi)
> +       vmovntdq        %xmm1, 0x10(%rdi)
> +       vmovntdq        %xmm2, 0x20(%rdi)
> +       vmovntdq        %xmm3, 0x30(%rdi)
> +       vmovups 0x40(%rsi), %xmm0
> +       vmovups 0x50(%rsi), %xmm1
> +       vmovups 0x60(%rsi), %xmm2
> +       vmovups 0x70(%rsi), %xmm3
> +       lea     0x80(%rsi), %rsi
> +       vmovntdq        %xmm0, 0x40(%rdi)
> +       vmovntdq        %xmm1, 0x50(%rdi)
> +       vmovntdq        %xmm2, 0x60(%rdi)
> +       vmovntdq        %xmm3, 0x70(%rdi)
> +       lea     0x80(%rdi), %rdi
> +       sub     $0x80, %rdx
> +       jae     L(gobble_mem_fwd_loop)
> +       sfence
> +       vmovups %ymm4, (%r8)
> +       vzeroupper
> +       vmovups %xmm5, -0x80(%r9)
> +       vmovups %xmm6, -0x70(%r9)
> +       vmovups %xmm7, -0x60(%r9)
> +       vmovups %xmm8, -0x50(%r9)
> +       vmovups %xmm9, -0x40(%r9)
> +       vmovups %xmm10, -0x30(%r9)
> +       vmovups %xmm11, -0x20(%r9)
> +       vmovups %xmm12, -0x10(%r9)
> +       ret
> +
> +       ALIGN (4)
> +L(copy_backward):
> +#ifdef SHARED_CACHE_SIZE_HALF
> +       mov     $SHARED_CACHE_SIZE_HALF, %rcx
> +#else
> +       mov     __x86_64_shared_cache_size_half(%rip), %rcx
> +#endif
> +       shl     $3, %rcx
> +       vmovups (%rsi), %xmm8
> +       vmovups 0x10(%rsi), %xmm9
> +       vmovups 0x20(%rsi), %xmm10
> +       vmovups 0x30(%rsi), %xmm11
> +       vmovups 0x40(%rsi), %xmm12
> +       vmovups 0x50(%rsi), %xmm13
> +       vmovups 0x60(%rsi), %xmm14
> +       vmovups 0x70(%rsi), %xmm15
> +       mov     %rdi, %r9
> +       add     %rdx, %rsi
> +       add     %rdx, %rdi
> +       vmovups -0x20(%rsi), %ymm4
> +       lea     -0x20(%rdi), %r10
> +       mov %rdi, %r11
> +       and     $0x1f, %r11
> +       xor     %r11, %rdi
> +       sub     %r11, %rsi
> +       sub     %r11, %rdx
> +#ifdef USE_AS_MEMMOVE
> +       mov     %rdi, %r11
> +       sub     %rsi, %r11
> +       cmp     %rdx, %r11
> +       jae     L(memmove_use_memcpy_bwd)
> +       cmp     %rcx, %r11
> +       jae     L(memmove_use_memcpy_bwd)
> +       jmp L(gobble_mem_bwd_llc_start)
> +#endif
> +L(memmove_use_memcpy_bwd):
> +       cmp     %rcx, %rdx
> +       ja      L(gobble_big_data_bwd)
> +L(gobble_mem_bwd_llc_start):
> +       sub     $0x80, %rdx
> +L(gobble_mem_bwd_llc):
> +       vmovups -0x20(%rsi), %ymm0
> +       vmovups -0x40(%rsi), %ymm1
> +       vmovups -0x60(%rsi), %ymm2
> +       vmovups -0x80(%rsi), %ymm3
> +       lea     -0x80(%rsi), %rsi
> +       vmovaps %ymm0, -0x20(%rdi)
> +       vmovaps %ymm1, -0x40(%rdi)
> +       vmovaps %ymm2, -0x60(%rdi)
> +       vmovaps %ymm3, -0x80(%rdi)
> +       lea     -0x80(%rdi), %rdi
> +       sub     $0x80, %rdx
> +       jae     L(gobble_mem_bwd_llc)
> +       vmovups %ymm4, (%r10)
> +       vzeroupper
> +       vmovups %xmm8, (%r9)
> +       vmovups %xmm9, 0x10(%r9)
> +       vmovups %xmm10, 0x20(%r9)
> +       vmovups %xmm11, 0x30(%r9)
> +       vmovups %xmm12, 0x40(%r9)
> +       vmovups %xmm13, 0x50(%r9)
> +       vmovups %xmm14, 0x60(%r9)
> +       vmovups %xmm15, 0x70(%r9)
> +       ret
> +
> +L(gobble_big_data_bwd):
> +       sub     $0x80, %rdx
> +L(gobble_mem_bwd_loop):
> +       prefetcht0 -0x1c0(%rsi)
> +       prefetcht0 -0x280(%rsi)
> +       vmovups -0x10(%rsi), %xmm0
> +       vmovups -0x20(%rsi), %xmm1
> +       vmovups -0x30(%rsi), %xmm2
> +       vmovups -0x40(%rsi), %xmm3
> +       vmovntdq        %xmm0, -0x10(%rdi)
> +       vmovntdq        %xmm1, -0x20(%rdi)
> +       vmovntdq        %xmm2, -0x30(%rdi)
> +       vmovntdq        %xmm3, -0x40(%rdi)
> +       vmovups -0x50(%rsi), %xmm0
> +       vmovups -0x60(%rsi), %xmm1
> +       vmovups -0x70(%rsi), %xmm2
> +       vmovups -0x80(%rsi), %xmm3
> +       lea     -0x80(%rsi), %rsi
> +       vmovntdq        %xmm0, -0x50(%rdi)
> +       vmovntdq        %xmm1, -0x60(%rdi)
> +       vmovntdq        %xmm2, -0x70(%rdi)
> +       vmovntdq        %xmm3, -0x80(%rdi)
> +       lea     -0x80(%rdi), %rdi
> +       sub     $0x80, %rdx
> +       jae     L(gobble_mem_bwd_loop)
> +       sfence
> +       vmovups %ymm4, (%r10)
> +       vzeroupper
> +       vmovups %xmm8, (%r9)
> +       vmovups %xmm9, 0x10(%r9)
> +       vmovups %xmm10, 0x20(%r9)
> +       vmovups %xmm11, 0x30(%r9)
> +       vmovups %xmm12, 0x40(%r9)
> +       vmovups %xmm13, 0x50(%r9)
> +       vmovups %xmm14, 0x60(%r9)
> +       vmovups %xmm15, 0x70(%r9)
> +       ret
> +END (MEMCPY)
> +#endif
> diff --git a/sysdeps/x86_64/multiarch/memmove-avx-unaligned.S b/sysdeps/x86_64/multiarch/memmove-avx-unaligned.S
> new file mode 100644
> index 0000000..352a2c3
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/memmove-avx-unaligned.S
> @@ -0,0 +1,4 @@
> +#define USE_AS_MEMMOVE
> +#define MEMCPY         __memmove_avx_unaligned
> +#define MEMCPY_CHK     __memmove_chk_avx_unaligned
> +#include "memcpy-avx-unaligned.S"
> diff --git a/sysdeps/x86_64/multiarch/mempcpy-avx-unaligned.S b/sysdeps/x86_64/multiarch/mempcpy-avx-unaligned.S
> new file mode 100644
> index 0000000..b31394e
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/mempcpy-avx-unaligned.S
> @@ -0,0 +1,4 @@
> +#define USE_AS_MEMPCPY
> +#define MEMCPY         __mempcpy_avx_unaligned
> +#define MEMCPY_CHK     __mempcpy_chk_avx_unaligned
> +#include "memcpy-avx-unaligned.S"
> --
> 1.8.1.4
>
Follow-Ups:
- Re: [PATCH RFC V2] Improve 64bit memcpy/memove for Corei7 with unaligned avx instruction
  - From: Ondřej Bílka
- Re: [PATCH RFC V2] Improve 64bit memcpy/memove for Corei7 with unaligned avx instruction
  - From: Ling Ma
References:
- [PATCH RFC V2] Improve 64bit memcpy/memove for Corei7 with unaligned avx instruction
  - From: ling . ma . program
Index Nav:	[Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav:	[Date Prev] [Date Next]	[Thread Prev] [Thread Next]