This is the mail archive of the libc-alpha@sourceware.org mailing list for the glibc project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

Re: Subject: [PATCH] Add x86-64 memset with unaligned store and rep stosb


On Wed, Mar 30, 2016 at 4:18 PM, Carlos O'Donell <carlos@redhat.com> wrote:
> On 03/29/2016 03:23 PM, H.J. Lu wrote:
>> The goal of this patch is to replace SSE2 and AVX2 memset.S
>> with faster and smaller alternatives, also support 64-byte vector
>> register size.  bench-memset data on various Intel and AMD
>> processors is at
>>
>> https://sourceware.org/bugzilla/show_bug.cgi?id=19881
>>
>> Any comments, feedbacks?
>
> Caveats about Penryn being slower apply here, and I expect your answer
> is the same: the selection of the ifunc will not change, and so Penryn
> will not use the newer versions.
>
> This looks good to me.
>
> Again, same question about thresholding below.
>

Here is the updated patch I am going to check in.

Thanks.

-- 
H.J.
From 7df7c6a195d6bc6ffdd90db0786d5de9c67d037a Mon Sep 17 00:00:00 2001
From: "H.J. Lu" <hjl.tools@gmail.com>
Date: Fri, 25 Mar 2016 08:20:17 -0700
Subject: [PATCH] Add x86-64 memset with unaligned store and rep stosb

Implement x86-64 memset with unaligned store and rep movsb.  Support
16-byte, 32-byte and 64-byte vector register sizes.  A single file
provides 2 implementations of memset, one with rep stosb and the other
without rep stosb.  They share the same codes when size is between 2
times of vector register size and REP_STOSB_THRESHOLD which defaults
to 2KB.

Key features:

1. Use overlapping store to avoid branch.
2. For size <= 4 times of vector register size, fully unroll the loop.
3. For size > 4 times of vector register size, store 4 times of vector
register size at a time.

	[BZ #19881]
	* sysdeps/x86_64/multiarch/Makefile (sysdep_routines): Add
	memset-sse2-unaligned-erms, memset-avx2-unaligned-erms and
	memset-avx512-unaligned-erms.
	* sysdeps/x86_64/multiarch/ifunc-impl-list.c
	(__libc_ifunc_impl_list): Test __memset_chk_sse2_unaligned,
	__memset_chk_sse2_unaligned_erms, __memset_chk_avx2_unaligned,
	__memset_chk_avx2_unaligned_erms, __memset_chk_avx512_unaligned,
	__memset_chk_avx512_unaligned_erms, __memset_sse2_unaligned,
	__memset_sse2_unaligned_erms, __memset_erms,
	__memset_avx2_unaligned, __memset_avx2_unaligned_erms,
	__memset_avx512_unaligned_erms and __memset_avx512_unaligned.
	* sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S: New
	file.
	* sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S:
	Likewise.
	* sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S:
	Likewise.
	* sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S:
	Likewise.
---
 sysdeps/x86_64/multiarch/Makefile                  |   5 +-
 sysdeps/x86_64/multiarch/ifunc-impl-list.c         |  33 +++
 .../x86_64/multiarch/memset-avx2-unaligned-erms.S  |  14 ++
 .../multiarch/memset-avx512-unaligned-erms.S       |  17 ++
 .../x86_64/multiarch/memset-sse2-unaligned-erms.S  |  16 ++
 .../x86_64/multiarch/memset-vec-unaligned-erms.S   | 251 +++++++++++++++++++++
 6 files changed, 335 insertions(+), 1 deletion(-)
 create mode 100644 sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
 create mode 100644 sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
 create mode 100644 sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S
 create mode 100644 sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S

diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index ef4dbc0..8878efb 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -23,7 +23,10 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 \
 		   memset-avx512-no-vzeroupper \
 		   memmove-sse2-unaligned-erms \
 		   memmove-avx-unaligned-erms \
-		   memmove-avx512-unaligned-erms
+		   memmove-avx512-unaligned-erms \
+		   memset-sse2-unaligned-erms \
+		   memset-avx2-unaligned-erms \
+		   memset-avx512-unaligned-erms
 CFLAGS-varshift.c += -msse4
 CFLAGS-strcspn-c.c += -msse4
 CFLAGS-strpbrk-c.c += -msse4
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index 9204da4..1e880f6 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -118,12 +118,28 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
   IFUNC_IMPL (i, name, __memset_chk,
 	      IFUNC_IMPL_ADD (array, i, __memset_chk, 1,
 			      __memset_chk_sse2)
+	      IFUNC_IMPL_ADD (array, i, __memset_chk, 1,
+			      __memset_chk_sse2_unaligned)
+	      IFUNC_IMPL_ADD (array, i, __memset_chk, 1,
+			      __memset_chk_sse2_unaligned_erms)
 	      IFUNC_IMPL_ADD (array, i, __memset_chk,
 			      HAS_ARCH_FEATURE (AVX2_Usable),
 			      __memset_chk_avx2)
+	      IFUNC_IMPL_ADD (array, i, __memset_chk,
+			      HAS_ARCH_FEATURE (AVX2_Usable),
+			      __memset_chk_avx2_unaligned)
+	      IFUNC_IMPL_ADD (array, i, __memset_chk,
+			      HAS_ARCH_FEATURE (AVX2_Usable),
+			      __memset_chk_avx2_unaligned_erms)
 #ifdef HAVE_AVX512_ASM_SUPPORT
 	      IFUNC_IMPL_ADD (array, i, __memset_chk,
 			      HAS_ARCH_FEATURE (AVX512F_Usable),
+			      __memset_chk_avx512_unaligned_erms)
+	      IFUNC_IMPL_ADD (array, i, __memset_chk,
+			      HAS_ARCH_FEATURE (AVX512F_Usable),
+			      __memset_chk_avx512_unaligned)
+	      IFUNC_IMPL_ADD (array, i, __memset_chk,
+			      HAS_ARCH_FEATURE (AVX512F_Usable),
 			      __memset_chk_avx512_no_vzeroupper)
 #endif
 	      )
@@ -131,12 +147,29 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
   /* Support sysdeps/x86_64/multiarch/memset.S.  */
   IFUNC_IMPL (i, name, memset,
 	      IFUNC_IMPL_ADD (array, i, memset, 1, __memset_sse2)
+	      IFUNC_IMPL_ADD (array, i, memset, 1,
+			      __memset_sse2_unaligned)
+	      IFUNC_IMPL_ADD (array, i, memset, 1,
+			      __memset_sse2_unaligned_erms)
+	      IFUNC_IMPL_ADD (array, i, memset, 1, __memset_erms)
 	      IFUNC_IMPL_ADD (array, i, memset,
 			      HAS_ARCH_FEATURE (AVX2_Usable),
 			      __memset_avx2)
+	      IFUNC_IMPL_ADD (array, i, memset,
+			      HAS_ARCH_FEATURE (AVX2_Usable),
+			      __memset_avx2_unaligned)
+	      IFUNC_IMPL_ADD (array, i, memset,
+			      HAS_ARCH_FEATURE (AVX2_Usable),
+			      __memset_avx2_unaligned_erms)
 #ifdef HAVE_AVX512_ASM_SUPPORT
 	      IFUNC_IMPL_ADD (array, i, memset,
 			      HAS_ARCH_FEATURE (AVX512F_Usable),
+			      __memset_avx512_unaligned_erms)
+	      IFUNC_IMPL_ADD (array, i, memset,
+			      HAS_ARCH_FEATURE (AVX512F_Usable),
+			      __memset_avx512_unaligned)
+	      IFUNC_IMPL_ADD (array, i, memset,
+			      HAS_ARCH_FEATURE (AVX512F_Usable),
 			      __memset_avx512_no_vzeroupper)
 #endif
 	     )
diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
new file mode 100644
index 0000000..e0dc565
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
@@ -0,0 +1,14 @@
+#define VEC_SIZE	32
+#define VEC(i)		ymm##i
+#define VMOVU		vmovdqu
+#define VMOVA		vmovdqa
+
+#define VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
+  vmovd d, %xmm0; \
+  movq r, %rax; \
+  vpbroadcastb %xmm0, %ymm0
+
+#define SECTION(p)		p##.avx
+#define MEMSET_SYMBOL(p,s)	p##_avx2_##s
+
+#include "memset-vec-unaligned-erms.S"
diff --git a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
new file mode 100644
index 0000000..72f4095
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
@@ -0,0 +1,17 @@
+#ifdef HAVE_AVX512_ASM_SUPPORT
+# define VEC_SIZE	64
+# define VEC(i)		zmm##i
+# define VMOVU		vmovdqu64
+# define VMOVA		vmovdqa64
+
+# define VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
+  vmovd d, %xmm0; \
+  movq r, %rax; \
+  vpbroadcastb %xmm0, %xmm0; \
+  vpbroadcastq %xmm0, %zmm0
+
+# define SECTION(p)		p##.avx512
+# define MEMSET_SYMBOL(p,s)	p##_avx512_##s
+
+# include "memset-vec-unaligned-erms.S"
+#endif
diff --git a/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S
new file mode 100644
index 0000000..437a858
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S
@@ -0,0 +1,16 @@
+#define VEC_SIZE	16
+#define VEC(i)		xmm##i
+#define VMOVU		movdqu
+#define VMOVA		movdqa
+
+#define VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
+  movd d, %xmm0; \
+  movq r, %rax; \
+  punpcklbw %xmm0, %xmm0; \
+  punpcklwd %xmm0, %xmm0; \
+  pshufd $0, %xmm0, %xmm0
+
+#define SECTION(p)		p
+#define MEMSET_SYMBOL(p,s)	p##_sse2_##s
+
+#include "memset-vec-unaligned-erms.S"
diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
new file mode 100644
index 0000000..9383517
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
@@ -0,0 +1,251 @@
+/* memset/bzero with unaligned store and rep stosb
+   Copyright (C) 2016 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* memset is implemented as:
+   1. Use overlapping store to avoid branch.
+   2. Force 32-bit displacement for branches to avoid long nop between
+      instructions.
+   3. If size is less than VEC, use integer register stores.
+   4. If size is from VEC_SIZE to 2 * VEC_SIZE, use 2 VEC stores.
+   5. If size is from 2 * VEC_SIZE to 4 * VEC_SIZE, use 4 VEC stores.
+   6. If size is more to 4 * VEC_SIZE, align to 4 * VEC_SIZE with
+      4 VEC stores and store 4 * VEC at a time until done.  */
+
+#include <sysdep.h>
+
+#ifndef VZEROUPPER
+# if VEC_SIZE > 16
+#  define VZEROUPPER			vzeroupper
+# else
+#  define VZEROUPPER
+# endif
+#endif
+
+#ifndef VZEROUPPER_SHORT_RETURN
+# if VEC_SIZE > 16
+#  define VZEROUPPER_SHORT_RETURN	vzeroupper
+# else
+#  define VZEROUPPER_SHORT_RETURN	rep
+# endif
+#endif
+
+#ifndef MOVQ
+# if VEC_SIZE > 16
+#  define MOVQ				vmovq
+# else
+#  define MOVQ				movq
+# endif
+#endif
+
+/* Threshold to use Enhanced REP STOSB.  Since there is overhead to set
+   up REP STOSB operation, REP STOSB isn't faster on short data.  The
+   memset micro benchmark in glibc shows that 2KB is the approximate
+   value above which REP STOSB becomes faster on processors with
+   Enhanced REP STOSB.  Since the stored value is fixed, larger register
+   size has minimal impact on threshold.  */
+#ifndef REP_STOSB_THRESHOLD
+# define REP_STOSB_THRESHOLD		2048
+#endif
+
+#ifndef SECTION
+# error SECTION is not defined!
+#endif
+
+#if !defined USE_MULTIARCH && IS_IN (libc)
+	.section SECTION(.text),"ax",@progbits
+ENTRY (__bzero)
+	movq	%rdi, %rax /* Set return value.  */
+	movq	%rsi, %rdx /* Set n.  */
+	pxor	%xmm0, %xmm0
+	jmp	L(entry_from_bzero)
+END (__bzero)
+weak_alias (__bzero, bzero)
+#endif
+
+#if defined SHARED && IS_IN (libc)
+ENTRY_CHK (MEMSET_SYMBOL (__memset_chk, unaligned))
+	cmpq	%rdx, %rcx
+	jb	HIDDEN_JUMPTARGET (__chk_fail)
+END_CHK (MEMSET_SYMBOL (__memset_chk, unaligned))
+#endif
+
+ENTRY (MEMSET_SYMBOL (__memset, unaligned))
+L(memset_entry):
+	VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
+L(entry_from_bzero):
+	cmpq	$VEC_SIZE, %rdx
+	jb	L(less_vec)
+	cmpq	$(VEC_SIZE * 2), %rdx
+	ja	L(more_2x_vec)
+	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.  */
+	VMOVU	%VEC(0), -VEC_SIZE(%rdi,%rdx)
+	VMOVU	%VEC(0), (%rdi)
+	VZEROUPPER
+	ret
+END (MEMSET_SYMBOL (__memset, unaligned))
+
+#if VEC_SIZE == 16
+/* Only used to measure performance of REP STOSB.  */
+ENTRY (__memset_erms)
+#else
+/* Provide a symbol to debugger.  */
+ENTRY (MEMSET_SYMBOL (__memset, erms))
+#endif
+L(stosb):
+	movq	%rdx, %rcx
+	movzbl	%sil, %eax
+	movq	%rdi, %rdx
+	rep stosb
+	movq	%rdx, %rax
+	ret
+#if VEC_SIZE == 16
+END (__memset_erms)
+#else
+END (MEMSET_SYMBOL (__memset, erms))
+#endif
+
+#if defined SHARED && IS_IN (libc)
+ENTRY_CHK (MEMSET_SYMBOL (__memset_chk, unaligned_erms))
+	cmpq	%rdx, %rcx
+	jb	HIDDEN_JUMPTARGET (__chk_fail)
+END_CHK (MEMSET_SYMBOL (__memset_chk, unaligned_erms))
+#endif
+
+ENTRY (MEMSET_SYMBOL (__memset, unaligned_erms))
+	VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
+	cmpq	$VEC_SIZE, %rdx
+	jb	L(less_vec)
+	cmpq	$(VEC_SIZE * 2), %rdx
+	ja	L(stosb_more_2x_vec)
+	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.  */
+	VMOVU	%VEC(0), -VEC_SIZE(%rdi,%rdx)
+	VMOVU	%VEC(0), (%rdi)
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(stosb_more_2x_vec):
+	cmpq	$REP_STOSB_THRESHOLD, %rdx
+	/* Force 32-bit displacement to avoid long nop between
+	   instructions.  */
+	ja.d32	L(stosb)
+	.p2align 4
+L(more_2x_vec):
+	cmpq  $(VEC_SIZE * 4), %rdx
+	ja	L(loop_start)
+	VMOVU	%VEC(0), (%rdi)
+	VMOVU	%VEC(0), VEC_SIZE(%rdi)
+	VMOVU	%VEC(0), -VEC_SIZE(%rdi,%rdx)
+	VMOVU	%VEC(0), -(VEC_SIZE * 2)(%rdi,%rdx)
+L(return):
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(loop_start):
+	leaq	(VEC_SIZE * 4)(%rdi), %rcx
+	VMOVU	%VEC(0), (%rdi)
+	andq	$-(VEC_SIZE * 4), %rcx
+	VMOVU	%VEC(0), -VEC_SIZE(%rdi,%rdx)
+	VMOVU	%VEC(0), VEC_SIZE(%rdi)
+	VMOVU	%VEC(0), -(VEC_SIZE * 2)(%rdi,%rdx)
+	VMOVU	%VEC(0), (VEC_SIZE * 2)(%rdi)
+	VMOVU	%VEC(0), -(VEC_SIZE * 3)(%rdi,%rdx)
+	VMOVU	%VEC(0), (VEC_SIZE * 3)(%rdi)
+	VMOVU	%VEC(0), -(VEC_SIZE * 4)(%rdi,%rdx)
+	addq	%rdi, %rdx
+	andq	$-(VEC_SIZE * 4), %rdx
+	cmpq	%rdx, %rcx
+# if VEC_SIZE == 32 || VEC_SIZE == 64
+	/* Force 32-bit displacement to avoid long nop between
+	   instructions.  */
+	je.d32	L(return)
+# else
+	je	L(return)
+# endif
+	.p2align 4
+L(loop):
+	VMOVA	%VEC(0), (%rcx)
+	VMOVA	%VEC(0), VEC_SIZE(%rcx)
+	VMOVA	%VEC(0), (VEC_SIZE * 2)(%rcx)
+	VMOVA	%VEC(0), (VEC_SIZE * 3)(%rcx)
+	addq	$(VEC_SIZE * 4), %rcx
+	cmpq	%rcx, %rdx
+	jne	L(loop)
+	VZEROUPPER_SHORT_RETURN
+	ret
+L(less_vec):
+	/* Less than 1 VEC.  */
+# if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
+#  error Unsupported VEC_SIZE!
+# endif
+# if VEC_SIZE > 32
+	cmpb	$32, %dl
+	jae	L(between_32_63)
+# endif
+# if VEC_SIZE > 16
+	cmpb	$16, %dl
+	jae	L(between_16_31)
+# endif
+	MOVQ	%xmm0, %rcx
+	cmpb	$8, %dl
+	jae	L(between_8_15)
+	cmpb	$4, %dl
+	jae	L(between_4_7)
+	cmpb	$1, %dl
+	ja	L(between_2_3)
+	jb	1f
+	movb	%cl, (%rdi)
+1:
+	VZEROUPPER
+	ret
+# if VEC_SIZE > 32
+	/* From 32 to 63.  No branch when size == 32.  */
+L(between_32_63):
+	vmovdqu	%ymm0, -32(%rdi,%rdx)
+	vmovdqu	%ymm0, (%rdi)
+	VZEROUPPER
+	ret
+# endif
+# if VEC_SIZE > 16
+	/* From 16 to 31.  No branch when size == 16.  */
+L(between_16_31):
+	vmovdqu	%xmm0, -16(%rdi,%rdx)
+	vmovdqu	%xmm0, (%rdi)
+	VZEROUPPER
+	ret
+# endif
+	/* From 8 to 15.  No branch when size == 8.  */
+L(between_8_15):
+	movq	%rcx, -8(%rdi,%rdx)
+	movq	%rcx, (%rdi)
+	VZEROUPPER
+	ret
+L(between_4_7):
+	/* From 4 to 7.  No branch when size == 4.  */
+	movl	%ecx, -4(%rdi,%rdx)
+	movl	%ecx, (%rdi)
+	VZEROUPPER
+	ret
+L(between_2_3):
+	/* From 2 to 3.  No branch when size == 2.  */
+	movw	%cx, -2(%rdi,%rdx)
+	movw	%cx, (%rdi)
+	VZEROUPPER
+	ret
+END (MEMSET_SYMBOL (__memset, unaligned_erms))
-- 
2.5.5


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]