This is the mail archive of the
libc-alpha@sourceware.org
mailing list for the glibc project.
[PING] [PATCH] faster memcpy on x64.
- From: Ondřej Bílka <neleai at seznam dot cz>
- To: libc-alpha at sourceware dot org
- Date: Mon, 6 May 2013 09:43:15 +0200
- Subject: [PING] [PATCH] faster memcpy on x64.
- References: <20130427221620 dot GA16537 at domone dot kolej dot mff dot cuni dot cz>
ping
On Sun, Apr 28, 2013 at 12:16:20AM +0200, Ondřej Bílka wrote:
> Hi,
>
> I spent the last few weeks analyzing memcpy and memset, and I have
> better implementations than the current ones. This patch covers memcpy.
>
> Benchmark results are at
> http://kam.mff.cuni.cz/~ondra/memcpy_profile.html
> or archived at
> http://kam.mff.cuni.cz/~ondra/memcpy_profile_result27_04_13.tar.bz2
>
> I also tried to adapt this implementation for memmove and found that
> the additional cost is close to zero when the buffers do not overlap,
> so it could be aliased to memmove.
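>
> In outline, the dispatch looks like this (a simplified sketch with
> byte-wise fallbacks standing in for the real fast path; the complete
> version is in the attached file):
>
> #include <stdint.h>
> #include <stddef.h>
>
> static void *copy_dispatch (char *dest, char *src, size_t n)
> {
>   size_t i;
>   /* One unsigned compare flags the distances the fast path must not
>      handle; everything else may use wide unaligned loads/stores.  */
>   if ((uint64_t) ((src - dest) - n) < 2 * n)
>     {
>       if (dest < src)
>         for (i = 0; i < n; i++)
>           dest[i] = src[i];
>       else
>         for (i = n; i-- > 0;)
>           dest[i] = src[i];
>       return dest;
>     }
>   for (i = 0; i < n; i++) /* placeholder for the vectorized path */
>     dest[i] = src[i];
>   return dest;
> }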
>
> An important part is the test of memcpy inside a hooked gcc, which
> shows a small but real speedup. memcpy_new 1) is faster on newer
> processors, while memcpy_new_small is faster on older ones.
>
> Could we test 2) on a wider range of use cases and report the results?
>
> Here we run into the fact that strings in practice are small, so we
> mostly pay the latency of fetching the data.
>
> The microbenchmark results look much better.
>
> The main speedup comes from avoiding computed loops and simplifying the
> control flow for better speculative execution. 1)
>
> This gives a 20% speedup for 32-1000 byte strings.
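>
> As an illustration of avoiding a computed loop: for 16-32 bytes the
> copy is just two possibly-overlapping 16-byte moves, with no trip
> count to compute (a sketch; the attached file extends this pattern up
> to 128 bytes):
>
> #include <emmintrin.h>
> #include <stddef.h>
>
> /* Valid for 16 <= n <= 32; the two chunks may overlap in the middle
>    instead of being sized exactly.  */
> static void copy_16_to_32 (char *dest, const char *src, size_t n)
> {
>   __m128i head = _mm_loadu_si128 ((const __m128i *) src);
>   __m128i tail = _mm_loadu_si128 ((const __m128i *) (src + n - 16));
>   _mm_storeu_si128 ((__m128i *) dest, head);
>   _mm_storeu_si128 ((__m128i *) (dest + n - 16), tail);
> }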
>
> Second, the loop that I use is on most architectures asymptotically
> faster than the gcc one for data in the L1, L2, or L3 cache.
> When the data is in main memory, memory itself is the bottleneck and
> the choice of implementation can make at most a 1% difference.
>
> I tested an avx version, which is slower on current processors because
> it is faster to load the high and low halves separately.
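>
> Concretely (an illustrative sketch; the avx variant needs -mavx):
>
> #include <immintrin.h>
>
> /* One 256-bit load/store pair...  */
> static void copy32_avx (char *dest, const char *src)
> {
>   _mm256_storeu_si256 ((__m256i *) dest,
>                        _mm256_loadu_si256 ((const __m256i *) src));
> }
>
> /* ...versus the same 32 bytes moved as two 128-bit halves, the form
>    that measured faster for me.  */
> static void copy32_halves (char *dest, const char *src)
> {
>   _mm_storeu_si128 ((__m128i *) dest,
>                     _mm_loadu_si128 ((const __m128i *) src));
>   _mm_storeu_si128 ((__m128i *) (dest + 16),
>                     _mm_loadu_si128 ((const __m128i *) (src + 16)));
> }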
>
> 1) Except core2 and athlon, where I need even simpler control flow
> (memcpy_new_small) to get that speedup.
>
> I attached the file from which I generated this patch. There are a few
> mistakes made by gcc; I could post a diff against the vanilla version.
>
> I have not tried to optimize for atom yet, so I keep the ifunc for it.
>
> Passes testsuite. OK for 2.18?
>
> Ondra
>
> 1) File variant/memcpy_new_small.s in 2)
> 2) http://kam.mff.cuni.cz/~ondra/memcpy_profile27_04_13.tar.bz2
>
>
> * sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S: New file.
> * sysdeps/x86_64/multiarch/memcpy.S (__new_memcpy): Add
> __memcpy_sse2_unaligned ifunc selection.
> * sysdeps/x86_64/multiarch/Makefile (sysdep_routines):
> Add memcpy-sse2-unaligned.S.
> 	* sysdeps/x86_64/multiarch/ifunc-impl-list.c (__libc_ifunc_impl_list):
> 	Add __memcpy_sse2_unaligned.
>
> ---
> sysdeps/x86_64/multiarch/Makefile | 2 +-
> sysdeps/x86_64/multiarch/ifunc-impl-list.c | 1 +
> sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S | 176 ++++++++++++++++++++++
> sysdeps/x86_64/multiarch/memcpy.S | 15 +-
> 4 files changed, 186 insertions(+), 8 deletions(-)
> create mode 100644 sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S
>
> diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
> index 86787ee..203d16e 100644
> --- a/sysdeps/x86_64/multiarch/Makefile
> +++ b/sysdeps/x86_64/multiarch/Makefile
> @@ -7,7 +7,7 @@ endif
> ifeq ($(subdir),string)
>
> sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 strncmp-ssse3 \
> - strend-sse4 memcmp-sse4 memcpy-ssse3 mempcpy-ssse3 \
> + strend-sse4 memcmp-sse4 memcpy-ssse3 memcpy-sse2-unaligned mempcpy-ssse3 \
> memmove-ssse3 memcpy-ssse3-back mempcpy-ssse3-back \
> memmove-ssse3-back strcasestr-nonascii strcasecmp_l-ssse3 \
> strncase_l-ssse3 strcat-ssse3 strncat-ssse3\
> diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> index 05315fd..28d3579 100644
> --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> @@ -227,6 +227,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> IFUNC_IMPL_ADD (array, i, memcpy, HAS_SSSE3,
> __memcpy_ssse3_back)
> IFUNC_IMPL_ADD (array, i, memcpy, HAS_SSSE3, __memcpy_ssse3)
> + IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_sse2_unaligned)
> IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_sse2))
>
> /* Support sysdeps/x86_64/multiarch/mempcpy_chk.S. */
> diff --git a/sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S b/sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S
> new file mode 100644
> index 0000000..1b86c41
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S
> @@ -0,0 +1,176 @@
> +/* memcpy with unaligned loads
> + Copyright (C) 2013 Free Software Foundation, Inc.
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library; if not, see
> + <http://www.gnu.org/licenses/>. */
> +
> +#include <sysdep.h>
> +
> +#include "asm-syntax.h"
> +
> +#ifndef ALIGN
> +# define ALIGN(n) .p2align n
> +#endif
> +
> +
> +ENTRY(__memcpy_sse2_unaligned)
> + movq %rsi, %rax
> + leaq (%rdx,%rdx), %rcx
> + subq %rdi, %rax
> + subq %rdx, %rax
> + cmpq %rcx, %rax
> + jb L(overlapping)
> + cmpq $16, %rdx
> + jbe L(less_16)
> + movdqu (%rsi), %xmm8
> + cmpq $32, %rdx
> + movdqu %xmm8, (%rdi)
> + movdqu -16(%rsi,%rdx), %xmm8
> + movdqu %xmm8, -16(%rdi,%rdx)
> + ja .L31
> +L(return):
> + movq %rdi, %rax
> + ret
> + .p2align 4,,10
> + ALIGN(4)
> +.L31:
> + movdqu 16(%rsi), %xmm8
> + cmpq $64, %rdx
> + movdqu %xmm8, 16(%rdi)
> + movdqu -32(%rsi,%rdx), %xmm8
> + movdqu %xmm8, -32(%rdi,%rdx)
> + jbe L(return)
> + movdqu 32(%rsi), %xmm8
> + cmpq $128, %rdx
> + movdqu %xmm8, 32(%rdi)
> + movdqu -48(%rsi,%rdx), %xmm8
> + movdqu %xmm8, -48(%rdi,%rdx)
> + movdqu 48(%rsi), %xmm8
> + movdqu %xmm8, 48(%rdi)
> + movdqu -64(%rsi,%rdx), %xmm8
> + movdqu %xmm8, -64(%rdi,%rdx)
> + jbe L(return)
> + leaq 64(%rdi), %rcx
> + addq %rdi, %rdx
> + andq $-64, %rdx
> + andq $-64, %rcx
> + movq %rcx, %rax
> + subq %rdi, %rax
> + addq %rax, %rsi
> + cmpq %rdx, %rcx
> + je L(return)
> + movq %rsi, %r10
> + subq %rcx, %r10
> + leaq 16(%r10), %r9
> + leaq 32(%r10), %r8
> + leaq 48(%r10), %rax
> + .p2align 4,,10
> + ALIGN(4)
> +L(loop):
> + movdqu (%rcx,%r10), %xmm8
> + movdqa %xmm8, (%rcx)
> + movdqu (%rcx,%r9), %xmm8
> + movdqa %xmm8, 16(%rcx)
> + movdqu (%rcx,%r8), %xmm8
> + movdqa %xmm8, 32(%rcx)
> + movdqu (%rcx,%rax), %xmm8
> + movdqa %xmm8, 48(%rcx)
> + addq $64, %rcx
> + cmpq %rcx, %rdx
> + jne L(loop)
> + jmp L(return)
> +L(overlapping):
> + cmpq %rsi, %rdi
> + jae .L3
> + testq %rdx, %rdx
> + .p2align 4,,5
> + je L(return)
> + movq %rdx, %r9
> + leaq 16(%rsi), %rcx
> + leaq 16(%rdi), %r8
> + shrq $4, %r9
> + movq %r9, %rax
> + salq $4, %rax
> + cmpq %rcx, %rdi
> + setae %cl
> + cmpq %r8, %rsi
> + setae %r8b
> + orl %r8d, %ecx
> + cmpq $15, %rdx
> + seta %r8b
> + testb %r8b, %cl
> + je .L16
> + testq %rax, %rax
> + je .L16
> + xorl %ecx, %ecx
> + xorl %r8d, %r8d
> +.L7:
> + movdqu (%rsi,%rcx), %xmm8
> + addq $1, %r8
> + movdqu %xmm8, (%rdi,%rcx)
> + addq $16, %rcx
> + cmpq %r8, %r9
> + ja .L7
> + cmpq %rax, %rdx
> + je L(return)
> +.L21:
> + movzbl (%rsi,%rax), %ecx
> + movb %cl, (%rdi,%rax)
> + addq $1, %rax
> + cmpq %rax, %rdx
> + ja .L21
> + jmp L(return)
> +L(less_16):
> + testb $24, %dl
> + jne L(between_9_16)
> + testb $4, %dl
> + .p2align 4,,5
> + jne L(between_5_8)
> + testq %rdx, %rdx
> + .p2align 4,,2
> + je L(return)
> + movzbl (%rsi), %eax
> + testb $2, %dl
> + movb %al, (%rdi)
> + je L(return)
> + movzwl -2(%rsi,%rdx), %eax
> + movw %ax, -2(%rdi,%rdx)
> + jmp L(return)
> +.L3:
> + leaq -1(%rdx), %rax
> + .p2align 4,,10
> + ALIGN(4)
> +.L11:
> + movzbl (%rsi,%rax), %edx
> + movb %dl, (%rdi,%rax)
> + subq $1, %rax
> + jmp .L11
> +L(between_9_16):
> + movq (%rsi), %rax
> + movq %rax, (%rdi)
> + movq -8(%rsi,%rdx), %rax
> + movq %rax, -8(%rdi,%rdx)
> + jmp L(return)
> +.L16:
> + xorl %eax, %eax
> + jmp .L21
> +L(between_5_8):
> + movl (%rsi), %eax
> + movl %eax, (%rdi)
> + movl -4(%rsi,%rdx), %eax
> + movl %eax, -4(%rdi,%rdx)
> + jmp L(return)
> +END(__memcpy_sse2_unaligned)
> +
> diff --git a/sysdeps/x86_64/multiarch/memcpy.S b/sysdeps/x86_64/multiarch/memcpy.S
> index b452f53..a1e5031 100644
> --- a/sysdeps/x86_64/multiarch/memcpy.S
> +++ b/sysdeps/x86_64/multiarch/memcpy.S
> @@ -33,13 +33,14 @@ ENTRY(__new_memcpy)
> jne 1f
> call __init_cpu_features
> 1: leaq __memcpy_sse2(%rip), %rax
> - testl $bit_SSSE3, __cpu_features+CPUID_OFFSET+index_SSSE3(%rip)
> - jz 2f
> - leaq __memcpy_ssse3(%rip), %rax
> - testl $bit_Fast_Copy_Backward, __cpu_features+FEATURE_OFFSET+index_Fast_Copy_Backward(%rip)
> - jz 2f
> - leaq __memcpy_ssse3_back(%rip), %rax
> -2: ret
> + testl $bit_Slow_BSF, __cpu_features+FEATURE_OFFSET+index_Slow_BSF(%rip)
> + jnz 2f
> + leaq __memcpy_sse2_unaligned(%rip), %rax
> + ret
> +2: testl $bit_SSSE3, __cpu_features+CPUID_OFFSET+index_SSSE3(%rip)
> + jz 3f
> + leaq __memcpy_ssse3(%rip), %rax
> +3: ret
> END(__new_memcpy)
>
> # undef ENTRY
> --
> 1.7.4.4
>
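> For readability, the new selection logic in memcpy.S above amounts to
> the following (a C rendering, not part of the patch; the two booleans
> stand for the feature bits tested in the assembly):
>
> #include <stdbool.h>
> #include <stddef.h>
>
> extern void *__memcpy_sse2 (void *, const void *, size_t);
> extern void *__memcpy_sse2_unaligned (void *, const void *, size_t);
> extern void *__memcpy_ssse3 (void *, const void *, size_t);
>
> typedef void *(*memcpy_fn) (void *, const void *, size_t);
>
> static memcpy_fn
> select_memcpy (bool slow_bsf, bool has_ssse3)
> {
>   if (!slow_bsf)
>     return __memcpy_sse2_unaligned;
>   if (has_ssse3)
>     return __memcpy_ssse3;
>   return __memcpy_sse2;
> }
>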
> #include <emmintrin.h>
> #include <stdint.h>
> #include <stdlib.h>
> typedef __m128i tp_vector;
> typedef uint64_t tp_mask;
>
> #define LOAD(x) _mm_load_si128 ((tp_vector *) (x))
> #define LOADU(x) _mm_loadu_si128 ((tp_vector *) (x))
> #define STORE(x, y) _mm_store_si128 ((tp_vector *) (x), (y))
> #define STOREU(x, y) _mm_storeu_si128 ((tp_vector *) (x), (y))
> #define CONCAT(x, y, n) _mm_alignr_epi8 (y , x ,n)
> #define MIN _mm_min_epu8
> #define EQ _mm_cmpeq_epi8
> #define OR _mm_or_si128
>
> #define BROADCAST(x) _mm_set1_epi8 (x)
> #define get_mask(x) ((uint64_t) _mm_movemask_epi8 (x))
>
> /* Align VALUE down by ALIGN bytes. */
> #define ALIGN_DOWN(value, align) \
> ALIGN_DOWN_M1(value, align - 1)
> /* Align VALUE down by ALIGN_M1 + 1 bytes.
> Useful if you have precomputed ALIGN - 1. */
> #define ALIGN_DOWN_M1(value, align_m1) \
> (void *)((uintptr_t)(value) \
> & ~(uintptr_t)(align_m1))
>
> /* Align VALUE up by ALIGN bytes. */
> #define ALIGN_UP(value, align) \
> ALIGN_UP_M1(value, align - 1)
> /* Align VALUE up by ALIGN_M1 + 1 bytes.
> Useful if you have precomputed ALIGN - 1. */
> #define ALIGN_UP_M1(value, align_m1) \
> (void *)(((uintptr_t)(value) + (uintptr_t)(align_m1)) \
> & ~(uintptr_t)(align_m1))
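>
> /* Example (worked): with value == 0x1234 and align == 64, ALIGN_DOWN
>    yields 0x1200 and ALIGN_UP yields 0x1240.  */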
>
>
> static char *__memcpy_overlap(char *dest,char *src,size_t n)
> {
> size_t i;
> if (dest < src)
> for (i=0;i < n ;i++)
> dest[i]=src[i];
> else
> for (i = n; i-- > 0;) /* i is unsigned, so test i-- > 0, not i >= 0. */
> dest[i]=src[i];
> return dest;
> }
> char *memcpy_libc(char *x,char *y,size_t n);
>
> static char *memcpy_small (char *dest, char *src, size_t no, char *ret);
> void *memcpy_new_u(char *dest, char *src, size_t n)
> {
> char *from,*mid,*to;
> char *dest2= dest;
> if (__builtin_expect((uint64_t)((src - dest)-n) < 2*n,0))
> return __memcpy_overlap(dest,src,n);
> if (__builtin_expect(n <= 128,1))
> {
> if (n <= 16)
> return memcpy_small(dest, src, n, dest2);
> STOREU(dest, LOADU(src));
> STOREU(dest + n - 16, LOADU(src + n - 16));
> if (n <= 32) return dest2;
> STOREU(dest+16, LOADU(src+16));
> STOREU(dest + n - 32, LOADU(src + n - 32));
> if (n <= 64) return dest2;
> STOREU(dest+32, LOADU(src+32));
> STOREU(dest + n - 48, LOADU(src + n - 48));
> STOREU(dest+48, LOADU(src+48));
> STOREU(dest + n - 64, LOADU(src + n - 64));
> return dest2;
> }
> STOREU(dest, LOADU(src));
> STOREU(dest + n - 16, LOADU(src + n - 16));
> STOREU(dest+16, LOADU(src+16));
> STOREU(dest + n - 32, LOADU(src + n - 32));
> STOREU(dest+32, LOADU(src+32));
> STOREU(dest + n - 48, LOADU(src + n - 48));
> STOREU(dest+48, LOADU(src+48));
> STOREU(dest + n - 64, LOADU(src + n - 64));
>
> from = ALIGN_DOWN(dest + 64, 64);
> to = ALIGN_DOWN(dest + n , 64);
> src += from - dest;
> while (from != to)
> {
> STORE(from, LOADU(src));
> STORE(from+16, LOADU(src+16));
> STORE(from+32, LOADU(src+32));
> STORE(from+48, LOADU(src+48));
> from += 64;
> src += 64;
> }
> return dest2;
> }
>
> static char *memcpy_small (char *dest, char *src, size_t no, char *ret)
> {
> if (no & (8 + 16)) /* 8 <= no <= 16: two 8-byte moves that may overlap. */
> {
> ((uint64_t *) dest)[0] = ((uint64_t *) src)[0];
> ((uint64_t *)(dest + no - 8))[0] = ((uint64_t *)(src + no - 8))[0];
> return ret;
> }
> if (no & 4) /* 4 <= no <= 7: two 4-byte moves that may overlap. */
> {
> ((uint32_t *) dest)[0] = ((uint32_t *) src)[0];
> ((uint32_t *)(dest + no - 4))[0] = ((uint32_t *)(src + no - 4))[0];
> return ret;
> }
> if (no) /* 1 <= no <= 3 from here on: copy the first byte. */
> dest[0] = src[0];
> if (no & 2)
> {
> ((uint16_t *)(dest + no - 2))[0] = ((uint16_t *)(src + no - 2))[0];
> return ret;
> }
> return ret;
> }
>
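> To sanity-check memcpy_new_u against the system memcpy, a harness
> along these lines could be appended to the attached file (a sketch;
> the sizes and offsets exercised are arbitrary):
>
> #ifdef TEST
> #include <assert.h>
> #include <stdio.h>
> #include <string.h>
>
> int main (void)
> {
>   static char src[4096], dst[4096], ref[4096];
>   size_t n, off, i;
>
>   for (i = 0; i < sizeof src; i++)
>     src[i] = (char) (i * 31 + 7);
>   for (n = 0; n <= 512; n++)
>     for (off = 0; off < 16; off++)
>       {
>         memset (dst, 0, sizeof dst);
>         memset (ref, 0, sizeof ref);
>         memcpy_new_u (dst + off, src + off, n);
>         memcpy (ref + off, src + off, n);
>         assert (memcmp (dst, ref, sizeof dst) == 0);
>       }
>   puts ("ok");
>   return 0;
> }
> #endif
>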
--
divide-by-zero error