[PATCH RFC] Improve 64-bit memset performance for Haswell CPUs with AVX2 instructions
Marko Myllynen
myllynen@redhat.com
Tue Jul 1 09:03:00 GMT 2014
Hi,
On 2014-04-04 10:34, ling.ma.program@gmail.com wrote:
> From: Ling Ma <ling.ml@alibaba-inc.com>
>
> In this patch we reduce branch mispredictions by avoiding branch
> instructions and by forcing the destination to be aligned using AVX
> instructions.
>
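As I read the patch, the core trick is that each size class below 256
bytes is handled by unconditional stores issued from both ends of the
buffer, so the overlapping stores absorb the odd remainder and no tail
loop or extra branch is needed. A minimal standalone sketch of that
idea for the 32..63-byte class (my illustration, not part of the
patch; register use follows the patch's convention):

	/* void *set_32_to_63 (void *dst, int c, size_t n), 32 <= n < 64 */
	.globl	set_32_to_63
set_32_to_63:
	vpxor	%xmm0, %xmm0, %xmm0	/* xmm0 = 0, used as shuffle control */
	vmovd	%esi, %xmm1		/* low byte of xmm1 = the fill byte */
	vpshufb	%xmm0, %xmm1, %xmm0	/* broadcast that byte to all 16 lanes */
	lea	(%rdi, %rdx), %r8	/* r8 = one past the end of dst */
	mov	%rdi, %rax		/* return value is dst */
	vmovups	%xmm0, (%rdi)		/* two stores cover the first 32 bytes */
	vmovups	%xmm0, 0x10(%rdi)
	vmovups	%xmm0, -0x20(%r8)	/* two stores cover the last 32 bytes; */
	vmovups	%xmm0, -0x10(%r8)	/* they overlap the first pair when n < 64 */
	ret

The same pattern, scaled up, covers the 64-127 and 128-255 byte cases
in the patch below.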
> ---
> In this version we removed the prefetch and appended vmovd.
>
> ChangeLog | 9 ++
> sysdeps/x86_64/multiarch/Makefile | 4 +-
> sysdeps/x86_64/multiarch/memset-avx2.S | 192 +++++++++++++++++++++++++++++++++
> sysdeps/x86_64/multiarch/memset.S | 59 ++++++++++
> sysdeps/x86_64/multiarch/memset_chk.S | 44 ++++++++
> 5 files changed, 307 insertions(+), 1 deletion(-)
> create mode 100644 sysdeps/x86_64/multiarch/memset-avx2.S
> create mode 100644 sysdeps/x86_64/multiarch/memset.S
> create mode 100644 sysdeps/x86_64/multiarch/memset_chk.S
>
> diff --git a/sysdeps/x86_64/multiarch/memset-avx2.S b/sysdeps/x86_64/multiarch/memset-avx2.S
> new file mode 100644
> index 0000000..08e8ee8
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/memset-avx2.S
> @@ -0,0 +1,192 @@
> +/* memset with AVX2
> + Copyright (C) 2014 Free Software Foundation, Inc.
> + Contributed by Alibaba Group.
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library; if not, see
> + <http://www.gnu.org/licenses/>. */
> +
> +#include <sysdep.h>
> +
> +#if !defined NOT_IN_libc
> +
> +#include "asm-syntax.h"
> +#ifndef ALIGN
> +# define ALIGN(n) .p2align n
> +#endif
> +#ifndef MEMSET
> +# define MEMSET __memset_avx2
> +# define MEMSET_CHK __memset_chk_avx2
> +#endif
> +
> + .section .text.avx2,"ax",@progbits
> +#if defined PIC
> +ENTRY (MEMSET_CHK)
> + cmpq %rdx, %rcx
> + jb HIDDEN_JUMPTARGET (__chk_fail)
> +END (MEMSET_CHK)
> +#endif
> +
> +ENTRY (MEMSET)
> + vpxor %xmm0, %xmm0, %xmm0
> + vmovd %esi, %xmm1
> + lea (%rdi, %rdx), %r8
> + vpshufb %xmm0, %xmm1, %xmm0
> + mov %rdi, %rax
> + cmp $256, %rdx
> + jae L(256bytesormore)
> + vmovd %xmm0, %rcx
> + cmp $128, %rdx
> + jb L(less_128bytes)
> + vmovups %xmm0, (%rdi)
> + vmovups %xmm0, 0x10(%rdi)
> + vmovups %xmm0, 0x20(%rdi)
> + vmovups %xmm0, 0x30(%rdi)
> + vmovups %xmm0, 0x40(%rdi)
> + vmovups %xmm0, 0x50(%rdi)
> + vmovups %xmm0, 0x60(%rdi)
> + vmovups %xmm0, 0x70(%rdi)
> + vmovups %xmm0, -0x80(%r8)
> + vmovups %xmm0, -0x70(%r8)
> + vmovups %xmm0, -0x60(%r8)
> + vmovups %xmm0, -0x50(%r8)
> + vmovups %xmm0, -0x40(%r8)
> + vmovups %xmm0, -0x30(%r8)
> + vmovups %xmm0, -0x20(%r8)
> + vmovups %xmm0, -0x10(%r8)
> + ret
> + ALIGN(4)
> +L(less_128bytes):
> + cmp $64, %edx
> + jb L(less_64bytes)
> + vmovups %xmm0, (%rdi)
> + vmovups %xmm0, 0x10(%rdi)
> + vmovups %xmm0, 0x20(%rdi)
> + vmovups %xmm0, 0x30(%rdi)
> + vmovups %xmm0, -0x40(%r8)
> + vmovups %xmm0, -0x30(%r8)
> + vmovups %xmm0, -0x20(%r8)
> + vmovups %xmm0, -0x10(%r8)
> + ret
> + ALIGN(4)
> +L(less_64bytes):
> + cmp $32, %edx
> + jb L(less_32bytes)
> + vmovups %xmm0, (%rdi)
> + vmovups %xmm0, 0x10(%rdi)
> + vmovups %xmm0, -0x20(%r8)
> + vmovups %xmm0, -0x10(%r8)
> + ret
> + ALIGN(4)
> +L(less_32bytes):
> + cmp $16, %edx
> + jb L(less_16bytes)
> + vmovups %xmm0, (%rdi)
> + vmovups %xmm0, -0x10(%r8)
> + ret
> + ALIGN(4)
> +L(less_16bytes):
> + cmp $8, %edx
> + jb L(less_8bytes)
> + mov %rcx, (%rdi)
> + mov %rcx, -0x08(%r8)
> + ret
> + ALIGN(4)
> +L(less_8bytes):
> + cmp $4, %edx
> + jb L(less_4bytes)
> + mov %ecx, (%rdi)
> + mov %ecx, -0x04(%r8)
> + ret
> + ALIGN(4)
> +L(less_4bytes):
> + cmp $2, %edx
> + jb L(less_2bytes)
> + mov %cx, (%rdi)
> + mov %cx, -0x02(%r8)
> + ret
> + ALIGN(4)
> +L(less_2bytes):
> + cmp $1, %edx
> + jb L(less_1bytes)
> + mov %cl, (%rdi)
> +L(less_1bytes):
> + ret
> +
> + ALIGN(4)
> +L(256bytesormore):
> + vinserti128 $1, %xmm0, %ymm0, %ymm0
this breaks the build on RHEL 6 x86_64:
../sysdeps/x86_64/multiarch/memset-avx2.S: Assembler messages:
../sysdeps/x86_64/multiarch/memset-avx2.S:132: Error: no such
instruction: `vinserti128 $1,%xmm0,%ymm0,%ymm0'
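
FWIW, vinserti128 is an AVX2-only instruction, and the assembler
shipped with RHEL 6 predates AVX2 support, hence the failure. If
building with older binutils is supposed to keep working, one possible
workaround (untested, just a thought) would be the AVX1 lane insert,
which moves the same bits here since %ymm0 only holds an integer byte
pattern:

	vinsertf128	$1, %xmm0, %ymm0, %ymm0

Alternatively, configure could probe the assembler for AVX2 support
and enable this file only when the probe succeeds.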
Cheers,
--
Marko Myllynen