x86: Improve vec generation in memset-vec-unaligned-erms.S

author Noah Goldstein <goldstein.w.n@gmail.com>

Sun, 6 Feb 2022 06:54:18 +0000 (00:54 -0600)

committer Noah Goldstein <goldstein.w.n@gmail.com>

Mon, 7 Feb 2022 02:58:07 +0000 (20:58 -0600)
author Noah Goldstein <goldstein.w.n@gmail.com>
Sun, 6 Feb 2022 06:54:18 +0000 (00:54 -0600)
committer Noah Goldstein <goldstein.w.n@gmail.com>
Mon, 7 Feb 2022 02:58:07 +0000 (20:58 -0600)
diff --git a/sysdeps/x86_64/memset.S b/sysdeps/x86_64/memset.S

index 65c09bd0ac5acad371e19b429854e5729e9a1af9..ccf036be534f7dedb5125cd9a6c5b98a28739af3 100644 (file)
--- a/sysdeps/x86_64/memset.S
+++ b/sysdeps/x86_64/memset.S
@@ -28,17 +28,22 @@
  #define VMOVU     movups
  #define VMOVA     movaps
  
-#define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
+# define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
    movd d, %xmm0; \
-  movq r, %rax; \
-  punpcklbw %xmm0, %xmm0; \
-  punpcklwd %xmm0, %xmm0; \
-  pshufd $0, %xmm0, %xmm0
+  pxor %xmm1, %xmm1; \
+  pshufb %xmm1, %xmm0; \
+  movq r, %rax
  
-#define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
+# define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
    movd d, %xmm0; \
-  movq r, %rax; \
-  pshufd $0, %xmm0, %xmm0
+  pshufd $0, %xmm0, %xmm0; \
+  movq r, %rax
+
+# define MEMSET_VDUP_TO_VEC0_HIGH()
+# define MEMSET_VDUP_TO_VEC0_LOW()
+
+# define WMEMSET_VDUP_TO_VEC0_HIGH()
+# define WMEMSET_VDUP_TO_VEC0_LOW()
  
  #define SECTION(p)             p
  
diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S

index 1af668af0aeda59ee2d7740b875881f6fc24e98f..c0bf2875d03d51ab6c0c759bbde876c0afc6bc0f 100644 (file)
--- a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
@@ -10,15 +10,18 @@
  # define VMOVU     vmovdqu
  # define VMOVA     vmovdqa
  
-# define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
+# define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
    vmovd d, %xmm0; \
-  movq r, %rax; \
-  vpbroadcastb %xmm0, %ymm0
+  movq r, %rax;
  
-# define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
-  vmovd d, %xmm0; \
-  movq r, %rax; \
-  vpbroadcastd %xmm0, %ymm0
+# define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
+  MEMSET_SET_VEC0_AND_SET_RETURN(d, r)
+
+# define MEMSET_VDUP_TO_VEC0_HIGH() vpbroadcastb %xmm0, %ymm0
+# define MEMSET_VDUP_TO_VEC0_LOW() vpbroadcastb %xmm0, %xmm0
+
+# define WMEMSET_VDUP_TO_VEC0_HIGH() vpbroadcastd %xmm0, %ymm0
+# define WMEMSET_VDUP_TO_VEC0_LOW() vpbroadcastd %xmm0, %xmm0
  
  # ifndef SECTION
  #  define SECTION(p)           p##.avx
@@ -30,5 +33,6 @@
  #  define WMEMSET_SYMBOL(p,s)  p##_avx2_##s
  # endif
  
+# define USE_XMM_LESS_VEC
  # include "memset-vec-unaligned-erms.S"
  #endif
diff --git a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S

index f14d6f8493c21a360d613f61d6a4745c0cbfd0a8..5241216a77bf72b714f342bef0919fb87a07bfbd 100644 (file)
--- a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
@@ -15,13 +15,19 @@
  
  # define VZEROUPPER
  
-# define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
-  movq r, %rax; \
-  vpbroadcastb d, %VEC0
+# define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
+  vpbroadcastb d, %VEC0; \
+  movq r, %rax
  
-# define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
-  movq r, %rax; \
-  vpbroadcastd d, %VEC0
+# define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
+  vpbroadcastd d, %VEC0; \
+  movq r, %rax
+
+# define MEMSET_VDUP_TO_VEC0_HIGH()
+# define MEMSET_VDUP_TO_VEC0_LOW()
+
+# define WMEMSET_VDUP_TO_VEC0_HIGH()
+# define WMEMSET_VDUP_TO_VEC0_LOW()
  
  # define SECTION(p)            p##.evex512
  # define MEMSET_SYMBOL(p,s)    p##_avx512_##s
diff --git a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S

index 64b09e77cc20cc42c1ebd2b61cbd674b0ca26e5d..637002150659123c7fce8612e03eb6b041cfba05 100644 (file)
--- a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
@@ -15,13 +15,19 @@
  
  # define VZEROUPPER
  
-# define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
-  movq r, %rax; \
-  vpbroadcastb d, %VEC0
+# define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
+  vpbroadcastb d, %VEC0; \
+  movq r, %rax
  
-# define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
-  movq r, %rax; \
-  vpbroadcastd d, %VEC0
+# define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
+  vpbroadcastd d, %VEC0; \
+  movq r, %rax
+
+# define MEMSET_VDUP_TO_VEC0_HIGH()
+# define MEMSET_VDUP_TO_VEC0_LOW()
+
+# define WMEMSET_VDUP_TO_VEC0_HIGH()
+# define WMEMSET_VDUP_TO_VEC0_LOW()
  
  # define SECTION(p)            p##.evex
  # define MEMSET_SYMBOL(p,s)    p##_evex_##s
diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S

index 1e0511c79abcf3840bea74d3a2fc62d36ad24d8e..1b502b78e424b8b39b11d8a1b4670818bb11862e 100644 (file)
--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
@@ -58,8 +58,10 @@
  #ifndef MOVQ
  # if VEC_SIZE > 16
  #  define MOVQ                         vmovq
+#  define MOVD                         vmovd
  # else
  #  define MOVQ                         movq
+#  define MOVD                         movd
  # endif
  #endif
  
@@ -72,9 +74,17 @@
  #if defined USE_WITH_EVEX || defined USE_WITH_AVX512
  # define END_REG       rcx
  # define LOOP_REG      rdi
+# define LESS_VEC_REG  rax
  #else
  # define END_REG       rdi
  # define LOOP_REG      rdx
+# define LESS_VEC_REG  rdi
+#endif
+
+#ifdef USE_XMM_LESS_VEC
+# define XMM_SMALL     1
+#else
+# define XMM_SMALL     0
  #endif
  
  #define PAGE_SIZE 4096
@@ -110,8 +120,12 @@ END_CHK (WMEMSET_CHK_SYMBOL (__wmemset_chk, unaligned))
  
  ENTRY (WMEMSET_SYMBOL (__wmemset, unaligned))
         shl     $2, %RDX_LP
-       WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
-       jmp     L(entry_from_bzero)
+       WMEMSET_SET_VEC0_AND_SET_RETURN (%esi, %rdi)
+       WMEMSET_VDUP_TO_VEC0_LOW()
+       cmpq    $VEC_SIZE, %rdx
+       jb      L(less_vec_no_vdup)
+       WMEMSET_VDUP_TO_VEC0_HIGH()
+       jmp     L(entry_from_wmemset)
  END (WMEMSET_SYMBOL (__wmemset, unaligned))
  #endif
  
@@ -123,7 +137,7 @@ END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned))
  #endif
  
  ENTRY (MEMSET_SYMBOL (__memset, unaligned))
-       MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
+       MEMSET_SET_VEC0_AND_SET_RETURN (%esi, %rdi)
  # ifdef __ILP32__
         /* Clear the upper 32 bits.  */
         mov     %edx, %edx
@@ -131,6 +145,8 @@ ENTRY (MEMSET_SYMBOL (__memset, unaligned))
  L(entry_from_bzero):
         cmpq    $VEC_SIZE, %rdx
         jb      L(less_vec)
+       MEMSET_VDUP_TO_VEC0_HIGH()
+L(entry_from_wmemset):
         cmpq    $(VEC_SIZE * 2), %rdx
         ja      L(more_2x_vec)
         /* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.  */
@@ -179,27 +195,27 @@ END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms))
  # endif
  
  ENTRY_P2ALIGN (MEMSET_SYMBOL (__memset, unaligned_erms), 6)
-       MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
+       MEMSET_SET_VEC0_AND_SET_RETURN (%esi, %rdi)
  # ifdef __ILP32__
         /* Clear the upper 32 bits.  */
         mov     %edx, %edx
  # endif
         cmp     $VEC_SIZE, %RDX_LP
         jb      L(less_vec)
+       MEMSET_VDUP_TO_VEC0_HIGH ()
         cmp     $(VEC_SIZE * 2), %RDX_LP
         ja      L(stosb_more_2x_vec)
-       /* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.
-        */
-       VMOVU   %VEC(0), (%rax)
-       VMOVU   %VEC(0), -VEC_SIZE(%rax, %rdx)
+       /* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.  */
+       VMOVU   %VEC(0), (%rdi)
+       VMOVU   %VEC(0), (VEC_SIZE * -1)(%rdi, %rdx)
         VZEROUPPER_RETURN
  #endif
  
-       .p2align 4,, 10
+       .p2align 4,, 4
  L(last_2x_vec):
  #ifdef USE_LESS_VEC_MASK_STORE
-       VMOVU   %VEC(0), (VEC_SIZE * 2 + LOOP_4X_OFFSET)(%rcx)
-       VMOVU   %VEC(0), (VEC_SIZE * 3 + LOOP_4X_OFFSET)(%rcx)
+       VMOVU   %VEC(0), (VEC_SIZE * -2)(%rdi, %rdx)
+       VMOVU   %VEC(0), (VEC_SIZE * -1)(%rdi, %rdx)
  #else
         VMOVU   %VEC(0), (VEC_SIZE * -2)(%rdi)
         VMOVU   %VEC(0), (VEC_SIZE * -1)(%rdi)
@@ -212,6 +228,7 @@ L(last_2x_vec):
  #ifdef USE_LESS_VEC_MASK_STORE
         .p2align 4,, 10
  L(less_vec):
+L(less_vec_no_vdup):
         /* Less than 1 VEC.  */
  # if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
  #  error Unsupported VEC_SIZE!
@@ -262,28 +279,18 @@ L(stosb_more_2x_vec):
         /* Fallthrough goes to L(loop_4x_vec). Tests for memset (2x, 4x]
            and (4x, 8x] jump to target.  */
  L(more_2x_vec):
-
-       /* Two different methods of setting up pointers / compare. The
-          two methods are based on the fact that EVEX/AVX512 mov
-          instructions take more bytes then AVX2/SSE2 mov instructions. As
-          well that EVEX/AVX512 machines also have fast LEA_BID. Both
-          setup and END_REG to avoid complex address mode. For EVEX/AVX512
-          this saves code size and keeps a few targets in one fetch block.
-          For AVX2/SSE2 this helps prevent AGU bottlenecks.  */
-#if defined USE_WITH_EVEX || defined USE_WITH_AVX512
-       /* If EVEX/AVX512 compute END_REG - (VEC_SIZE * 4 +
-          LOOP_4X_OFFSET) with LEA_BID.  */
-
-       /* END_REG is rcx for EVEX/AVX512.  */
-       leaq    -(VEC_SIZE * 4 + LOOP_4X_OFFSET)(%rdi, %rdx), %END_REG
-#endif
-
-       /* Stores to first 2x VEC before cmp as any path forward will
-          require it.  */
-       VMOVU   %VEC(0), (%rax)
-       VMOVU   %VEC(0), VEC_SIZE(%rax)
+       /* Store next 2x vec regardless.  */
+       VMOVU   %VEC(0), (%rdi)
+       VMOVU   %VEC(0), (VEC_SIZE * 1)(%rdi)
  
  
+       /* Two different methods of setting up pointers / compare. The two
+          methods are based on the fact that EVEX/AVX512 mov instructions take
+          more bytes then AVX2/SSE2 mov instructions. As well that EVEX/AVX512
+          machines also have fast LEA_BID. Both setup and END_REG to avoid complex
+          address mode. For EVEX/AVX512 this saves code size and keeps a few
+          targets in one fetch block. For AVX2/SSE2 this helps prevent AGU
+          bottlenecks.  */
  #if !(defined USE_WITH_EVEX || defined USE_WITH_AVX512)
         /* If AVX2/SSE2 compute END_REG (rdi) with ALU.  */
         addq    %rdx, %END_REG
@@ -292,6 +299,15 @@ L(more_2x_vec):
         cmpq    $(VEC_SIZE * 4), %rdx
         jbe     L(last_2x_vec)
  
+
+#if defined USE_WITH_EVEX || defined USE_WITH_AVX512
+       /* If EVEX/AVX512 compute END_REG - (VEC_SIZE * 4 + LOOP_4X_OFFSET) with
+          LEA_BID.  */
+
+       /* END_REG is rcx for EVEX/AVX512.  */
+       leaq    -(VEC_SIZE * 4 + LOOP_4X_OFFSET)(%rdi, %rdx), %END_REG
+#endif
+
         /* Store next 2x vec regardless.  */
         VMOVU   %VEC(0), (VEC_SIZE * 2)(%rax)
         VMOVU   %VEC(0), (VEC_SIZE * 3)(%rax)
@@ -355,65 +371,93 @@ L(stosb_local):
         /* Define L(less_vec) only if not otherwise defined.  */
         .p2align 4
  L(less_vec):
+       /* Broadcast esi to partial register (i.e VEC_SIZE == 32 broadcast to
+          xmm). This is only does anything for AVX2.  */
+       MEMSET_VDUP_TO_VEC0_LOW ()
+L(less_vec_no_vdup):
  #endif
  L(cross_page):
  #if VEC_SIZE > 32
         cmpl    $32, %edx
-       jae     L(between_32_63)
+       jge     L(between_32_63)
  #endif
  #if VEC_SIZE > 16
         cmpl    $16, %edx
-       jae     L(between_16_31)
+       jge     L(between_16_31)
+#endif
+#ifndef USE_XMM_LESS_VEC
+       MOVQ    %XMM0, %rcx
  #endif
-       MOVQ    %XMM0, %rdi
         cmpl    $8, %edx
-       jae     L(between_8_15)
+       jge     L(between_8_15)
         cmpl    $4, %edx
-       jae     L(between_4_7)
+       jge     L(between_4_7)
         cmpl    $1, %edx
-       ja      L(between_2_3)
-       jb      L(return)
-       movb    %sil, (%rax)
-       VZEROUPPER_RETURN
+       jg      L(between_2_3)
+       jl      L(between_0_0)
+       movb    %sil, (%LESS_VEC_REG)
+L(between_0_0):
+       ret
  
-       /* Align small targets only if not doing so would cross a fetch
-          line.  */
+       /* Align small targets only if not doing so would cross a fetch line.
+        */
  #if VEC_SIZE > 32
         .p2align 4,, SMALL_MEMSET_ALIGN(MOV_SIZE, RET_SIZE)
         /* From 32 to 63.  No branch when size == 32.  */
  L(between_32_63):
-       VMOVU   %YMM0, (%rax)
-       VMOVU   %YMM0, -32(%rax, %rdx)
+       VMOVU   %YMM0, (%LESS_VEC_REG)
+       VMOVU   %YMM0, -32(%LESS_VEC_REG, %rdx)
         VZEROUPPER_RETURN
  #endif
  
  #if VEC_SIZE >= 32
-       .p2align 4,, SMALL_MEMSET_ALIGN(MOV_SIZE, RET_SIZE)
+       .p2align 4,, SMALL_MEMSET_ALIGN(MOV_SIZE, 1)
  L(between_16_31):
         /* From 16 to 31.  No branch when size == 16.  */
-       VMOVU   %XMM0, (%rax)
-       VMOVU   %XMM0, -16(%rax, %rdx)
-       VZEROUPPER_RETURN
+       VMOVU   %XMM0, (%LESS_VEC_REG)
+       VMOVU   %XMM0, -16(%LESS_VEC_REG, %rdx)
+       ret
  #endif
  
-       .p2align 4,, SMALL_MEMSET_ALIGN(3, RET_SIZE)
+       /* Move size is 3 for SSE2, EVEX, and AVX512. Move size is 4 for AVX2.
+        */
+       .p2align 4,, SMALL_MEMSET_ALIGN(3 + XMM_SMALL, 1)
  L(between_8_15):
         /* From 8 to 15.  No branch when size == 8.  */
-       movq    %rdi, (%rax)
-       movq    %rdi, -8(%rax, %rdx)
-       VZEROUPPER_RETURN
+#ifdef USE_XMM_LESS_VEC
+       MOVQ    %XMM0, (%rdi)
+       MOVQ    %XMM0, -8(%rdi, %rdx)
+#else
+       movq    %rcx, (%LESS_VEC_REG)
+       movq    %rcx, -8(%LESS_VEC_REG, %rdx)
+#endif
+       ret
  
-       .p2align 4,, SMALL_MEMSET_ALIGN(2, RET_SIZE)
+       /* Move size is 2 for SSE2, EVEX, and AVX512. Move size is 4 for AVX2.
+        */
+       .p2align 4,, SMALL_MEMSET_ALIGN(2 << XMM_SMALL, 1)
  L(between_4_7):
         /* From 4 to 7.  No branch when size == 4.  */
-       movl    %edi, (%rax)
-       movl    %edi, -4(%rax, %rdx)
-       VZEROUPPER_RETURN
+#ifdef USE_XMM_LESS_VEC
+       MOVD    %XMM0, (%rdi)
+       MOVD    %XMM0, -4(%rdi, %rdx)
+#else
+       movl    %ecx, (%LESS_VEC_REG)
+       movl    %ecx, -4(%LESS_VEC_REG, %rdx)
+#endif
+       ret
  
-       .p2align 4,, SMALL_MEMSET_ALIGN(3, RET_SIZE)
+       /* 4 * XMM_SMALL for the third mov for AVX2.  */
+       .p2align 4,, 4 * XMM_SMALL + SMALL_MEMSET_ALIGN(3, 1)
  L(between_2_3):
         /* From 2 to 3.  No branch when size == 2.  */
-       movw    %di, (%rax)
-       movb    %dil, -1(%rax, %rdx)
-       VZEROUPPER_RETURN
+#ifdef USE_XMM_LESS_VEC
+       movb    %sil, (%rdi)
+       movb    %sil, 1(%rdi)
+       movb    %sil, -1(%rdi, %rdx)
+#else
+       movw    %cx, (%LESS_VEC_REG)
+       movb    %sil, -1(%LESS_VEC_REG, %rdx)
+#endif
+       ret
  END (MEMSET_SYMBOL (__memset, unaligned_erms))
author	Noah Goldstein <goldstein.w.n@gmail.com>
	Sun, 6 Feb 2022 06:54:18 +0000 (00:54 -0600)
committer	Noah Goldstein <goldstein.w.n@gmail.com>
	Mon, 7 Feb 2022 02:58:07 +0000 (20:58 -0600)
sysdeps/x86_64/memset.S		patch \| blob \| blame \| history
sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S		patch \| blob \| blame \| history
sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S		patch \| blob \| blame \| history
sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S		patch \| blob \| blame \| history
sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S		patch \| blob \| blame \| history