This is the mail archive of the
libc-alpha@sourceware.org
mailing list for the glibc project.
[PATCH RFC V2] Improve 64bit memset for Corei7 with avx2 instruction
- From: ling dot ma dot program at gmail dot com
- To: libc-alpha at sourceware dot org
- Cc: neleai at seznam dot cz, liubov dot dmitrieva at gmail dot com, aj at suse dot com, Ma Ling <ling dot ml at alibaba-inc dot com>
- Date: Tue, 16 Jul 2013 09:37:41 -0400
- Subject: [PATCH RFC V2] Improve 64bit memset for Corei7 with avx2 instruction
From: Ma Ling <ling.ml@alibaba-inc.com>
In this patch we use a similar approach to the one taken for memcpy: avoid branch instructions
and force the destination to be aligned, using AVX instructions.
Profiling the gcc.403 benchmark we find memset spends 5~10 times more time than memcpy.
The benchmark also indicates this patch improves performance by 30% to 100%
compared with the original __memset_sse2.
Ondra, I sent the gcc.403 test suite, the patch for glibc, and readme.txt as well.
Thanks
Ling
---
In this version we add the vzeroupper instruction to avoid the AVX/SSE transition (upper-state save & restore) penalty.
vpshufb needs only one cycle to fill the xmm0 register; thanks Ondra.
sysdeps/x86_64/multiarch/Makefile | 2 +-
sysdeps/x86_64/multiarch/ifunc-impl-list.c | 2 +
sysdeps/x86_64/multiarch/memset-avx2.S | 202 +++++++++++++++++++++++++++++
3 files changed, 205 insertions(+), 1 deletion(-)
create mode 100644 sysdeps/x86_64/multiarch/memset-avx2.S
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index f92cf18..ae666bf 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -18,7 +18,7 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 strncmp-ssse3 \
strcat-sse2-unaligned strncat-sse2-unaligned \
strcat-ssse3 strncat-ssse3 strlen-sse2-pminub \
strnlen-sse2-no-bsf strrchr-sse2-no-bsf strchr-sse2-no-bsf \
- memcmp-ssse3
+ memcmp-ssse3 memset-avx2
ifeq (yes,$(config-cflags-sse4))
sysdep_routines += strcspn-c strpbrk-c strspn-c strstr-c strcasestr-c varshift
CFLAGS-varshift.c += -msse4
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index 5639702..24d05d7 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -67,12 +67,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/x86_64/multiarch/memset_chk.S. */
IFUNC_IMPL (i, name, __memset_chk,
+ IFUNC_IMPL_ADD (array, i, __memset_chk, HAS_AVX2, __memset_chk_avx2)
IFUNC_IMPL_ADD (array, i, __memset_chk, 1, __memset_chk_sse2)
IFUNC_IMPL_ADD (array, i, __memset_chk, 1,
__memset_chk_x86_64))
/* Support sysdeps/x86_64/multiarch/memset.S. */
IFUNC_IMPL (i, name, memset,
+ IFUNC_IMPL_ADD (array, i, memset, HAS_AVX2, __memset_avx2)
IFUNC_IMPL_ADD (array, i, memset, 1, __memset_sse2)
IFUNC_IMPL_ADD (array, i, memset, 1, __memset_x86_64))
diff --git a/sysdeps/x86_64/multiarch/memset-avx2.S b/sysdeps/x86_64/multiarch/memset-avx2.S
new file mode 100644
index 0000000..dc778c8
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memset-avx2.S
@@ -0,0 +1,202 @@
+/* memset with AVX2
+ Copyright (C) 2013 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+#if !defined NOT_IN_libc
+
+#include "asm-syntax.h"
+#ifndef ALIGN
+# define ALIGN(n) .p2align n
+#endif
+#ifndef MEMSET
+# define MEMSET __memset_avx2
+# define MEMSET_CHK __memset_chk_avx2
+#endif
+
+ .section .text.avx2,"ax",@progbits
+#if defined PIC
+ENTRY (MEMSET_CHK)	/* __memset_chk_avx2 (dst, byte, len, dstlen): checks len, then falls through into MEMSET below.  */
+ cmpq %rdx, %rcx	/* If dstlen (%rcx) < len (%rdx), object would overflow.  */
+ jb HIDDEN_JUMPTARGET (__chk_fail)	/* Abort via __chk_fail on overflow.  */
+END (MEMSET_CHK)	/* Fall through into ENTRY (MEMSET).  */
+#endif
+
+ENTRY (MEMSET)	/* void *memset (void *dst=%rdi, int byte=%esi, size_t len=%rdx); returns dst in %rax.  */
+ vpxor %xmm0, %xmm0, %xmm0	/* xmm0 = 0: all-zero shuffle control for vpshufb below.  */
+ vmovd %esi, %xmm1	/* Low byte of xmm1 = fill byte.  */
+ lea (%rdi, %rdx), %r8	/* r8 = one past the end of the buffer; tail stores are addressed off it.  */
+ vpshufb %xmm0, %xmm1, %xmm0	/* Broadcast fill byte to all 16 lanes of xmm0 (zero control selects byte 0).  */
+ mov %rdi, %rax	/* Return value: the original dst.  */
+ cmp $256, %rdx
+ jae L(256bytesormore)	/* Large sizes handled with ymm / rep stosb / NT stores.  */
+ xor %ecx, %ecx	/* Build a scalar replicated pattern for the sub-16-byte stores...  */
+ mov %sil, %cl
+ mov %cl, %ch	/* ...cx = fill byte repeated twice.  */
+ cmp $128, %rdx
+ jb L(less_128bytes)
+ vmovups %xmm0, (%rdi)	/* 128..255 bytes: eight 16B stores from the front...  */
+ vmovups %xmm0, 0x10(%rdi)
+ vmovups %xmm0, 0x20(%rdi)
+ vmovups %xmm0, 0x30(%rdi)
+ vmovups %xmm0, 0x40(%rdi)
+ vmovups %xmm0, 0x50(%rdi)
+ vmovups %xmm0, 0x60(%rdi)
+ vmovups %xmm0, 0x70(%rdi)
+ vmovups %xmm0, -0x80(%r8)	/* ...and eight from the back; overlap in the middle covers any length.  */
+ vmovups %xmm0, -0x70(%r8)
+ vmovups %xmm0, -0x60(%r8)
+ vmovups %xmm0, -0x50(%r8)
+ vmovups %xmm0, -0x40(%r8)
+ vmovups %xmm0, -0x30(%r8)
+ vmovups %xmm0, -0x20(%r8)
+ vmovups %xmm0, -0x10(%r8)
+ ret
+ ALIGN(4)
+L(less_128bytes):
+ xor %esi, %esi	/* NOTE(review): redundant — the next mov fully overwrites %esi.  */
+ mov %ecx, %esi	/* esi = 16-bit pattern (0x0000BBBB).  */
+ shl $16, %ecx	/* ecx = pattern shifted to the high half (0xBBBB0000).  */
+ cmp $64, %edx
+ jb L(less_64bytes)
+ vmovups %xmm0, (%rdi)	/* 64..127 bytes: four stores from the front, four from the back.  */
+ vmovups %xmm0, 0x10(%rdi)
+ vmovups %xmm0, 0x20(%rdi)
+ vmovups %xmm0, 0x30(%rdi)
+ vmovups %xmm0, -0x40(%r8)
+ vmovups %xmm0, -0x30(%r8)
+ vmovups %xmm0, -0x20(%r8)
+ vmovups %xmm0, -0x10(%r8)
+ ret
+ ALIGN(4)
+L(less_64bytes):
+ orl %esi, %ecx	/* ecx = 32-bit replicated pattern (0xBBBBBBBB).  */
+ mov %ecx, %esi	/* esi = 32-bit pattern, kept for building the 64-bit pattern.  */
+ cmp $32, %edx
+ jb L(less_32bytes)
+ vmovups %xmm0, (%rdi)	/* 32..63 bytes: two front + two back overlapping stores.  */
+ vmovups %xmm0, 0x10(%rdi)
+ vmovups %xmm0, -0x20(%r8)
+ vmovups %xmm0, -0x10(%r8)
+ ret
+ ALIGN(4)
+L(less_32bytes):
+ shl $32, %rcx	/* rcx = 32-bit pattern in the high half.  */
+ cmp $16, %edx
+ jb L(less_16bytes)
+ vmovups %xmm0, (%rdi)	/* 16..31 bytes: one front + one back overlapping store.  */
+ vmovups %xmm0, -0x10(%r8)
+ ret
+ ALIGN(4)
+L(less_16bytes):
+ or %rsi, %rcx	/* rcx = full 64-bit replicated pattern.  */
+ cmp $8, %edx
+ jb L(less_8bytes)
+ mov %rcx, (%rdi)	/* 8..15 bytes via two overlapping 8-byte stores.  */
+ mov %rcx, -0x08(%r8)
+ ret
+ ALIGN(4)
+L(less_8bytes):
+ cmp $4, %edx
+ jb L(less_4bytes)
+ mov %ecx, (%rdi)	/* 4..7 bytes via two overlapping 4-byte stores.  */
+ mov %ecx, -0x04(%r8)	/* NOTE(review): missing `ret` here — falls through into L(less_4bytes) and redundantly stores 2 more bytes of the same pattern; harmless but wasteful.  */
+ ALIGN(4)
+L(less_4bytes):
+ cmp $2, %edx
+ jb L(less_2bytes)
+ mov %cx, (%rdi)	/* 2..3 bytes via two overlapping 2-byte stores.  */
+ mov %cx, -0x02(%r8)
+ ret
+ ALIGN(4)
+L(less_2bytes):
+ cmp $1, %edx
+ jb L(less_1bytes)	/* len == 0: nothing to do.  */
+ mov %cl, (%rdi)	/* len == 1.  */
+L(less_1bytes):
+ ret
+
+ ALIGN(4)
+L(256bytesormore):
+ vinserti128 $1, %xmm0, %ymm0, %ymm0	/* Widen pattern to 32 bytes: copy low xmm into the high lane.  */
+ vmovups %ymm0, (%rdi)	/* Unaligned head store covers bytes up to the alignment boundary.  */
+ mov %rdi, %r9
+ and $-0x20, %rdi	/* Round dst down to 32...  */
+ add $32, %rdi	/* ...then up: rdi = next 32-byte-aligned address past dst.  */
+ sub %rdi, %r9	/* r9 = old_dst - aligned_dst, in [-32, -1].  */
+ add %r9, %rdx	/* rdx = bytes remaining from the aligned pointer to the end.  */
+ cmp $4096, %rdx
+ ja L(gobble_data)	/* Bigger than a page: pick rep stosb or non-temporal stores.  */
+
+ sub $0x80, %rdx	/* Loop while at least 128 bytes remain (jae tests the borrow).  */
+L(gobble_128_loop):
+ prefetcht0 0x1c0(%rdi)	/* Prefetch ahead of the store stream.  */
+ vmovaps %ymm0, (%rdi)	/* Aligned 128-byte block of four 32B stores.  */
+ prefetcht0 0x280(%rdi)
+ vmovaps %ymm0, 0x20(%rdi)
+ vmovaps %ymm0, 0x40(%rdi)
+ vmovaps %ymm0, 0x60(%rdi)
+ lea 0x80(%rdi), %rdi
+ sub $0x80, %rdx
+ jae L(gobble_128_loop)
+ vmovups %ymm0, -0x80(%r8)	/* Overlapping unaligned tail: last 128 bytes.  */
+ vmovups %ymm0, -0x60(%r8)
+ vmovups %ymm0, -0x40(%r8)
+ vmovups %ymm0, -0x20(%r8)
+ vzeroupper	/* Avoid AVX->SSE transition penalty in the caller.  */
+ ret
+
+ ALIGN(4)
+L(gobble_data):
+#ifdef SHARED_CACHE_SIZE_HALF
+ mov $SHARED_CACHE_SIZE_HALF, %r9	/* Compile-time half shared-cache size.  */
+#else
+ mov __x86_64_shared_cache_size_half(%rip), %r9	/* Runtime-detected half shared-cache size.  */
+#endif
+ shl $4, %r9	/* r9 = 16 * (cache/2) = 8x the shared cache size: NT-store threshold.  */
+ cmp %r9, %rdx
+ ja L(gobble_big_data)	/* Way beyond cache: bypass it with non-temporal stores.  */
+ mov %rax, %r9	/* Save return value; rep stosb clobbers %rax via %al.  */
+ mov %esi, %eax	/* al = fill byte (original %esi argument, untouched on this path).  */
+ mov %rdx, %rcx	/* rcx = remaining byte count from the aligned pointer.  */
+ rep stosb	/* ERMSB fill of the rest of the buffer.  */
+ mov %r9, %rax	/* Restore return value.  */
+ vzeroupper
+ ret
+
+ ALIGN(4)
+L(gobble_big_data):
+ sub $0x80, %rdx	/* Same 128-bytes-at-a-time loop shape as above...  */
+L(gobble_big_data_loop):
+ vmovntdq %ymm0, (%rdi)	/* ...but with non-temporal stores to avoid cache pollution.  */
+ vmovntdq %ymm0, 0x20(%rdi)
+ vmovntdq %ymm0, 0x40(%rdi)
+ vmovntdq %ymm0, 0x60(%rdi)
+ lea 0x80(%rdi), %rdi
+ sub $0x80, %rdx
+ jae L(gobble_big_data_loop)
+ vmovups %ymm0, -0x80(%r8)	/* Overlapping (cached) tail stores for the last 128 bytes.  */
+ vmovups %ymm0, -0x60(%r8)
+ vmovups %ymm0, -0x40(%r8)
+ vmovups %ymm0, -0x20(%r8)
+ vzeroupper
+ sfence	/* Make the non-temporal stores globally visible before returning.  */
+ ret
+
+END (MEMSET)
+#endif
--
1.8.1.4