x86: Optimize strnlen-evex.S and implement with VMM headers

author Noah Goldstein <goldstein.w.n@gmail.com>

Wed, 19 Oct 2022 00:44:05 +0000 (17:44 -0700)

committer Noah Goldstein <goldstein.w.n@gmail.com>

Thu, 20 Oct 2022 00:31:03 +0000 (17:31 -0700)
author Noah Goldstein <goldstein.w.n@gmail.com>
Wed, 19 Oct 2022 00:44:05 +0000 (17:44 -0700)
committer Noah Goldstein <goldstein.w.n@gmail.com>
Thu, 20 Oct 2022 00:31:03 +0000 (17:31 -0700)
diff --git a/sysdeps/x86_64/multiarch/strlen-evex.S b/sysdeps/x86_64/multiarch/strlen-evex.S

index 2109ec2f7a2aa6d18a7fa39375db891386542dbe..487846f09814ffd44ec76cb0cd438a6cf78001f8 100644 (file)
--- a/sysdeps/x86_64/multiarch/strlen-evex.S
+++ b/sysdeps/x86_64/multiarch/strlen-evex.S
@@ -26,466 +26,220 @@
  #  define STRLEN       __strlen_evex
  # endif
  
-# define VMOVA         vmovdqa64
+# ifndef VEC_SIZE
+#  include "x86-evex256-vecs.h"
+# endif
  
  # ifdef USE_AS_WCSLEN
-#  define VPCMP                vpcmpd
+#  define VPCMPEQ      vpcmpeqd
+#  define VPCMPNEQ     vpcmpneqd
+#  define VPTESTN      vptestnmd
+#  define VPTEST       vptestmd
  #  define VPMINU       vpminud
-#  define SHIFT_REG ecx
  #  define CHAR_SIZE    4
+#  define CHAR_SIZE_SHIFT_REG(reg)     sar $2, %reg
  # else
-#  define VPCMP                vpcmpb
+#  define VPCMPEQ      vpcmpeqb
+#  define VPCMPNEQ     vpcmpneqb
+#  define VPTESTN      vptestnmb
+#  define VPTEST       vptestmb
  #  define VPMINU       vpminub
-#  define SHIFT_REG edx
  #  define CHAR_SIZE    1
+#  define CHAR_SIZE_SHIFT_REG(reg)
+
+#  define REG_WIDTH    VEC_SIZE
  # endif
  
-# define XMMZERO       xmm16
-# define YMMZERO       ymm16
-# define YMM1          ymm17
-# define YMM2          ymm18
-# define YMM3          ymm19
-# define YMM4          ymm20
-# define YMM5          ymm21
-# define YMM6          ymm22
-
-# define VEC_SIZE 32
-# define PAGE_SIZE 4096
-# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
-
-       .section .text.evex,"ax",@progbits
-ENTRY (STRLEN)
-# ifdef USE_AS_STRNLEN
-       /* Check zero length.  */
-       test    %RSI_LP, %RSI_LP
-       jz      L(zero)
-#  ifdef __ILP32__
-       /* Clear the upper 32 bits.  */
-       movl    %esi, %esi
-#  endif
-       mov     %RSI_LP, %R8_LP
+# define CHAR_PER_VEC  (VEC_SIZE / CHAR_SIZE)
+
+# include "reg-macros.h"
+
+# if CHAR_PER_VEC == 64
+
+#  define TAIL_RETURN_LBL      first_vec_x2
+#  define TAIL_RETURN_OFFSET   (CHAR_PER_VEC * 2)
+
+#  define FALLTHROUGH_RETURN_LBL       first_vec_x3
+#  define FALLTHROUGH_RETURN_OFFSET    (CHAR_PER_VEC * 3)
+
+# else
+
+#  define TAIL_RETURN_LBL      first_vec_x3
+#  define TAIL_RETURN_OFFSET   (CHAR_PER_VEC * 3)
+
+#  define FALLTHROUGH_RETURN_LBL       first_vec_x2
+#  define FALLTHROUGH_RETURN_OFFSET    (CHAR_PER_VEC * 2)
  # endif
+
+# define XZERO VMM_128(0)
+# define VZERO VMM(0)
+# define PAGE_SIZE     4096
+
+       .section SECTION(.text), "ax", @progbits
+ENTRY_P2ALIGN (STRLEN, 6)
         movl    %edi, %eax
-       vpxorq  %XMMZERO, %XMMZERO, %XMMZERO
-       /* Clear high bits from edi. Only keeping bits relevant to page
-          cross check.  */
+       vpxorq  %XZERO, %XZERO, %XZERO
         andl    $(PAGE_SIZE - 1), %eax
-       /* Check if we may cross page boundary with one vector load.  */
         cmpl    $(PAGE_SIZE - VEC_SIZE), %eax
         ja      L(cross_page_boundary)
  
         /* Check the first VEC_SIZE bytes.  Each bit in K0 represents a
            null byte.  */
-       VPCMP   $0, (%rdi), %YMMZERO, %k0
-       kmovd   %k0, %eax
-# ifdef USE_AS_STRNLEN
-       /* If length < CHAR_PER_VEC handle special.  */
-       cmpq    $CHAR_PER_VEC, %rsi
-       jbe     L(first_vec_x0)
-# endif
-       testl   %eax, %eax
+       VPCMPEQ (%rdi), %VZERO, %k0
+       KMOV    %k0, %VRAX
+       test    %VRAX, %VRAX
         jz      L(aligned_more)
-       tzcntl  %eax, %eax
-       ret
-# ifdef USE_AS_STRNLEN
-L(zero):
-       xorl    %eax, %eax
-       ret
-
-       .p2align 4
-L(first_vec_x0):
-       /* Set bit for max len so that tzcnt will return min of max len
-          and position of first match.  */
-       btsq    %rsi, %rax
-       tzcntl  %eax, %eax
-       ret
-# endif
-
-       .p2align 4
-L(first_vec_x1):
-       tzcntl  %eax, %eax
-       /* Safe to use 32 bit instructions as these are only called for
-          size = [1, 159].  */
-# ifdef USE_AS_STRNLEN
-       /* Use ecx which was computed earlier to compute correct value.
-        */
-       leal    -(CHAR_PER_VEC * 4 + 1)(%rcx, %rax), %eax
-# else
-       subl    %edx, %edi
-#  ifdef USE_AS_WCSLEN
-       /* NB: Divide bytes by 4 to get the wchar_t count.  */
-       sarl    $2, %edi
-#  endif
-       leal    CHAR_PER_VEC(%rdi, %rax), %eax
-# endif
-       ret
-
-       .p2align 4
-L(first_vec_x2):
-       tzcntl  %eax, %eax
-       /* Safe to use 32 bit instructions as these are only called for
-          size = [1, 159].  */
-# ifdef USE_AS_STRNLEN
-       /* Use ecx which was computed earlier to compute correct value.
-        */
-       leal    -(CHAR_PER_VEC * 3 + 1)(%rcx, %rax), %eax
-# else
-       subl    %edx, %edi
-#  ifdef USE_AS_WCSLEN
-       /* NB: Divide bytes by 4 to get the wchar_t count.  */
-       sarl    $2, %edi
-#  endif
-       leal    (CHAR_PER_VEC * 2)(%rdi, %rax), %eax
-# endif
+       bsf     %VRAX, %VRAX
         ret
  
-       .p2align 4
-L(first_vec_x3):
-       tzcntl  %eax, %eax
-       /* Safe to use 32 bit instructions as these are only called for
-          size = [1, 159].  */
-# ifdef USE_AS_STRNLEN
-       /* Use ecx which was computed earlier to compute correct value.
-        */
-       leal    -(CHAR_PER_VEC * 2 + 1)(%rcx, %rax), %eax
-# else
-       subl    %edx, %edi
-#  ifdef USE_AS_WCSLEN
-       /* NB: Divide bytes by 4 to get the wchar_t count.  */
-       sarl    $2, %edi
-#  endif
-       leal    (CHAR_PER_VEC * 3)(%rdi, %rax), %eax
-# endif
-       ret
-
-       .p2align 4
+       .p2align 4,, 8
  L(first_vec_x4):
-       tzcntl  %eax, %eax
-       /* Safe to use 32 bit instructions as these are only called for
-          size = [1, 159].  */
-# ifdef USE_AS_STRNLEN
-       /* Use ecx which was computed earlier to compute correct value.
-        */
-       leal    -(CHAR_PER_VEC + 1)(%rcx, %rax), %eax
-# else
-       subl    %edx, %edi
-#  ifdef USE_AS_WCSLEN
-       /* NB: Divide bytes by 4 to get the wchar_t count.  */
-       sarl    $2, %edi
-#  endif
+       bsf     %VRAX, %VRAX
+       subl    %ecx, %edi
+       CHAR_SIZE_SHIFT_REG (edi)
         leal    (CHAR_PER_VEC * 4)(%rdi, %rax), %eax
-# endif
         ret
  
-       .p2align 5
+
+
+       /* Save the original pointer in rcx (the return labels subtract
+          it from rdi to compute the length) and align rdi down to
+          VEC_SIZE before checking the next 4 VEC.  */
+       .p2align 4,, 10
  L(aligned_more):
-       movq    %rdi, %rdx
-       /* Align data to VEC_SIZE.  */
-       andq    $-(VEC_SIZE), %rdi
+       movq    %rdi, %rcx
+       andq    $(VEC_SIZE * -1), %rdi
  L(cross_page_continue):
-       /* Check the first 4 * VEC_SIZE.  Only one VEC_SIZE at a time
-          since data is only aligned to VEC_SIZE.  */
-# ifdef USE_AS_STRNLEN
-       /* + CHAR_SIZE because it simplies the logic in
-          last_4x_vec_or_less.  */
-       leaq    (VEC_SIZE * 5 + CHAR_SIZE)(%rdi), %rcx
-       subq    %rdx, %rcx
-#  ifdef USE_AS_WCSLEN
-       /* NB: Divide bytes by 4 to get the wchar_t count.  */
-       sarl    $2, %ecx
-#  endif
-# endif
-       /* Load first VEC regardless.  */
-       VPCMP   $0, VEC_SIZE(%rdi), %YMMZERO, %k0
-# ifdef USE_AS_STRNLEN
-       /* Adjust length. If near end handle specially.  */
-       subq    %rcx, %rsi
-       jb      L(last_4x_vec_or_less)
-# endif
-       kmovd   %k0, %eax
-       testl   %eax, %eax
+       /* Check the next 4 * VEC_SIZE.  Only one VEC_SIZE at a time
+          since data is only aligned to VEC_SIZE.  */
+       VPCMPEQ (VEC_SIZE * 1)(%rdi), %VZERO, %k0
+       KMOV    %k0, %VRAX
+       test    %VRAX, %VRAX
         jnz     L(first_vec_x1)
  
-       VPCMP   $0, (VEC_SIZE * 2)(%rdi), %YMMZERO, %k0
-       kmovd   %k0, %eax
-       test    %eax, %eax
+       VPCMPEQ (VEC_SIZE * 2)(%rdi), %VZERO, %k0
+       KMOV    %k0, %VRAX
+       test    %VRAX, %VRAX
         jnz     L(first_vec_x2)
  
-       VPCMP   $0, (VEC_SIZE * 3)(%rdi), %YMMZERO, %k0
-       kmovd   %k0, %eax
-       testl   %eax, %eax
+       VPCMPEQ (VEC_SIZE * 3)(%rdi), %VZERO, %k0
+       KMOV    %k0, %VRAX
+       test    %VRAX, %VRAX
         jnz     L(first_vec_x3)
  
-       VPCMP   $0, (VEC_SIZE * 4)(%rdi), %YMMZERO, %k0
-       kmovd   %k0, %eax
-       testl   %eax, %eax
+       VPCMPEQ (VEC_SIZE * 4)(%rdi), %VZERO, %k0
+       KMOV    %k0, %VRAX
+       test    %VRAX, %VRAX
         jnz     L(first_vec_x4)
  
-       addq    $VEC_SIZE, %rdi
-# ifdef USE_AS_STRNLEN
-       /* Check if at last VEC_SIZE * 4 length.  */
-       cmpq    $(CHAR_PER_VEC * 4 - 1), %rsi
-       jbe     L(last_4x_vec_or_less_load)
-       movl    %edi, %ecx
-       andl    $(VEC_SIZE * 4 - 1), %ecx
-#  ifdef USE_AS_WCSLEN
-       /* NB: Divide bytes by 4 to get the wchar_t count.  */
-       sarl    $2, %ecx
-#  endif
-       /* Readjust length.  */
-       addq    %rcx, %rsi
-# endif
-       /* Align data to VEC_SIZE * 4.  */
+       subq    $(VEC_SIZE * -1), %rdi
+
+# if CHAR_PER_VEC == 64
+       /* No partial register stalls on processors that we use evex512
+          on and this saves code size.  */
+       xorb    %dil, %dil
+# else
         andq    $-(VEC_SIZE * 4), %rdi
+# endif
+
+
  
         /* Compare 4 * VEC at a time forward.  */
         .p2align 4
  L(loop_4x_vec):
-       /* Load first VEC regardless.  */
-       VMOVA   (VEC_SIZE * 4)(%rdi), %YMM1
-# ifdef USE_AS_STRNLEN
-       /* Break if at end of length.  */
-       subq    $(CHAR_PER_VEC * 4), %rsi
-       jb      L(last_4x_vec_or_less_cmpeq)
-# endif
-       /* Save some code size by microfusing VPMINU with the load. Since
-          the matches in ymm2/ymm4 can only be returned if there where no
-          matches in ymm1/ymm3 respectively there is no issue with overlap.
-        */
-       VPMINU  (VEC_SIZE * 5)(%rdi), %YMM1, %YMM2
-       VMOVA   (VEC_SIZE * 6)(%rdi), %YMM3
-       VPMINU  (VEC_SIZE * 7)(%rdi), %YMM3, %YMM4
+       VMOVA   (VEC_SIZE * 4)(%rdi), %VMM(1)
+       VPMINU  (VEC_SIZE * 5)(%rdi), %VMM(1), %VMM(2)
+       VMOVA   (VEC_SIZE * 6)(%rdi), %VMM(3)
+       VPMINU  (VEC_SIZE * 7)(%rdi), %VMM(3), %VMM(4)
+       VPTESTN %VMM(2), %VMM(2), %k0
+       VPTESTN %VMM(4), %VMM(4), %k2
  
-       VPCMP   $0, %YMM2, %YMMZERO, %k0
-       VPCMP   $0, %YMM4, %YMMZERO, %k1
         subq    $-(VEC_SIZE * 4), %rdi
-       kortestd        %k0, %k1
+       KORTEST %k0, %k2
         jz      L(loop_4x_vec)
  
-       /* Check if end was in first half.  */
-       kmovd   %k0, %eax
-       subq    %rdx, %rdi
-# ifdef USE_AS_WCSLEN
-       shrq    $2, %rdi
-# endif
-       testl   %eax, %eax
-       jz      L(second_vec_return)
+       VPTESTN %VMM(1), %VMM(1), %k1
+       KMOV    %k1, %VRAX
+       test    %VRAX, %VRAX
+       jnz     L(first_vec_x0)
  
-       VPCMP   $0, %YMM1, %YMMZERO, %k2
-       kmovd   %k2, %edx
-       /* Combine VEC1 matches (edx) with VEC2 matches (eax).  */
-# ifdef USE_AS_WCSLEN
-       sall    $CHAR_PER_VEC, %eax
-       orl     %edx, %eax
-       tzcntl  %eax, %eax
-# else
-       salq    $CHAR_PER_VEC, %rax
-       orq     %rdx, %rax
-       tzcntq  %rax, %rax
-# endif
-       addq    %rdi, %rax
-       ret
-
-
-# ifdef USE_AS_STRNLEN
-
-L(last_4x_vec_or_less_load):
-       /* Depending on entry adjust rdi / prepare first VEC in YMM1.  */
-       VMOVA   (VEC_SIZE * 4)(%rdi), %YMM1
-L(last_4x_vec_or_less_cmpeq):
-       VPCMP   $0, %YMM1, %YMMZERO, %k0
-       addq    $(VEC_SIZE * 3), %rdi
-L(last_4x_vec_or_less):
-       kmovd   %k0, %eax
-       /* If remaining length > VEC_SIZE * 2. This works if esi is off by
-          VEC_SIZE * 4.  */
-       testl   $(CHAR_PER_VEC * 2), %esi
-       jnz     L(last_4x_vec)
-
-       /* length may have been negative or positive by an offset of
-          CHAR_PER_VEC * 4 depending on where this was called from. This
-          fixes that.  */
-       andl    $(CHAR_PER_VEC * 4 - 1), %esi
-       testl   %eax, %eax
-       jnz     L(last_vec_x1_check)
+       KMOV    %k0, %VRAX
+       test    %VRAX, %VRAX
+       jnz     L(first_vec_x1)
  
-       /* Check the end of data.  */
-       subl    $CHAR_PER_VEC, %esi
-       jb      L(max)
+       VPTESTN %VMM(3), %VMM(3), %k0
  
-       VPCMP   $0, (VEC_SIZE * 2)(%rdi), %YMMZERO, %k0
-       kmovd   %k0, %eax
-       tzcntl  %eax, %eax
-       /* Check the end of data.  */
-       cmpl    %eax, %esi
-       jb      L(max)
-
-       subq    %rdx, %rdi
-#  ifdef USE_AS_WCSLEN
-       /* NB: Divide bytes by 4 to get the wchar_t count.  */
-       sarq    $2, %rdi
-#  endif
-       leaq    (CHAR_PER_VEC * 2)(%rdi, %rax), %rax
-       ret
-L(max):
-       movq    %r8, %rax
-       ret
-# endif
-
-       /* Placed here in strnlen so that the jcc L(last_4x_vec_or_less)
-          in the 4x VEC loop can use 2 byte encoding.  */
-       .p2align 4
-L(second_vec_return):
-       VPCMP   $0, %YMM3, %YMMZERO, %k0
-       /* Combine YMM3 matches (k0) with YMM4 matches (k1).  */
-# ifdef USE_AS_WCSLEN
-       kunpckbw        %k0, %k1, %k0
-       kmovd   %k0, %eax
-       tzcntl  %eax, %eax
+# if CHAR_PER_VEC == 64
+       KMOV    %k0, %VRAX
+       test    %VRAX, %VRAX
+       jnz     L(first_vec_x2)
+       KMOV    %k2, %VRAX
  # else
-       kunpckdq        %k0, %k1, %k0
-       kmovq   %k0, %rax
-       tzcntq  %rax, %rax
+       /* We can only combine last 2x VEC masks if CHAR_PER_VEC <= 32.
+        */
+       kmovd   %k2, %edx
+       kmovd   %k0, %eax
+       salq    $CHAR_PER_VEC, %rdx
+       orq     %rdx, %rax
  # endif
-       leaq    (CHAR_PER_VEC * 2)(%rdi, %rax), %rax
-       ret
  
-
-# ifdef USE_AS_STRNLEN
-L(last_vec_x1_check):
-       tzcntl  %eax, %eax
-       /* Check the end of data.  */
-       cmpl    %eax, %esi
-       jb      L(max)
-       subq    %rdx, %rdi
-#  ifdef USE_AS_WCSLEN
-       /* NB: Divide bytes by 4 to get the wchar_t count.  */
-       sarq    $2, %rdi
-#  endif
-       leaq    (CHAR_PER_VEC)(%rdi, %rax), %rax
+       /* first_vec_x3 for strlen-ZMM and first_vec_x2 for strlen-YMM.
+        */
+       .p2align 4,, 2
+L(FALLTHROUGH_RETURN_LBL):
+       bsfq    %rax, %rax
+       subq    %rcx, %rdi
+       CHAR_SIZE_SHIFT_REG (rdi)
+       leaq    (FALLTHROUGH_RETURN_OFFSET)(%rdi, %rax), %rax
         ret
  
-       .p2align 4
-L(last_4x_vec):
-       /* Test first 2x VEC normally.  */
-       testl   %eax, %eax
-       jnz     L(last_vec_x1)
-
-       VPCMP   $0, (VEC_SIZE * 2)(%rdi), %YMMZERO, %k0
-       kmovd   %k0, %eax
-       testl   %eax, %eax
-       jnz     L(last_vec_x2)
-
-       /* Normalize length.  */
-       andl    $(CHAR_PER_VEC * 4 - 1), %esi
-       VPCMP   $0, (VEC_SIZE * 3)(%rdi), %YMMZERO, %k0
-       kmovd   %k0, %eax
-       testl   %eax, %eax
-       jnz     L(last_vec_x3)
-
-       /* Check the end of data.  */
-       subl    $(CHAR_PER_VEC * 3), %esi
-       jb      L(max)
-
-       VPCMP   $0, (VEC_SIZE * 4)(%rdi), %YMMZERO, %k0
-       kmovd   %k0, %eax
-       tzcntl  %eax, %eax
-       /* Check the end of data.  */
-       cmpl    %eax, %esi
-       jb      L(max_end)
-
-       subq    %rdx, %rdi
-#  ifdef USE_AS_WCSLEN
-       /* NB: Divide bytes by 4 to get the wchar_t count.  */
-       sarq    $2, %rdi
-#  endif
-       leaq    (CHAR_PER_VEC * 4)(%rdi, %rax), %rax
+       .p2align 4,, 8
+L(first_vec_x0):
+       bsf     %VRAX, %VRAX
+       sub     %rcx, %rdi
+       CHAR_SIZE_SHIFT_REG (rdi)
+       addq    %rdi, %rax
         ret
  
-       .p2align 4
-L(last_vec_x1):
-       tzcntl  %eax, %eax
-       subq    %rdx, %rdi
-#  ifdef USE_AS_WCSLEN
-       /* NB: Divide bytes by 4 to get the wchar_t count.  */
-       sarq    $2, %rdi
-#  endif
+       .p2align 4,, 10
+L(first_vec_x1):
+       bsf     %VRAX, %VRAX
+       sub     %rcx, %rdi
+       CHAR_SIZE_SHIFT_REG (rdi)
         leaq    (CHAR_PER_VEC)(%rdi, %rax), %rax
         ret
  
-       .p2align 4
-L(last_vec_x2):
-       tzcntl  %eax, %eax
-       subq    %rdx, %rdi
-#  ifdef USE_AS_WCSLEN
-       /* NB: Divide bytes by 4 to get the wchar_t count.  */
-       sarq    $2, %rdi
-#  endif
-       leaq    (CHAR_PER_VEC * 2)(%rdi, %rax), %rax
-       ret
-
-       .p2align 4
-L(last_vec_x3):
-       tzcntl  %eax, %eax
-       subl    $(CHAR_PER_VEC * 2), %esi
-       /* Check the end of data.  */
-       cmpl    %eax, %esi
-       jb      L(max_end)
-       subq    %rdx, %rdi
-#  ifdef USE_AS_WCSLEN
-       /* NB: Divide bytes by 4 to get the wchar_t count.  */
-       sarq    $2, %rdi
-#  endif
-       leaq    (CHAR_PER_VEC * 3)(%rdi, %rax), %rax
-       ret
-L(max_end):
-       movq    %r8, %rax
+       .p2align 4,, 10
+       /* first_vec_x2 for strlen-ZMM and first_vec_x3 for strlen-YMM.
+        */
+L(TAIL_RETURN_LBL):
+       bsf     %VRAX, %VRAX
+       sub     %VRCX, %VRDI
+       CHAR_SIZE_SHIFT_REG (VRDI)
+       lea     (TAIL_RETURN_OFFSET)(%rdi, %rax), %VRAX
         ret
-# endif
  
-       /* Cold case for crossing page with first load.  */
-       .p2align 4
+       .p2align 4,, 8
  L(cross_page_boundary):
-       movq    %rdi, %rdx
+       movq    %rdi, %rcx
         /* Align data to VEC_SIZE.  */
         andq    $-VEC_SIZE, %rdi
-       VPCMP   $0, (%rdi), %YMMZERO, %k0
-       kmovd   %k0, %eax
-       /* Remove the leading bytes.  */
+
+       VPCMPEQ (%rdi), %VZERO, %k0
+
+       KMOV    %k0, %VRAX
  # ifdef USE_AS_WCSLEN
-       /* NB: Divide shift count by 4 since each bit in K0 represent 4
-          bytes.  */
-       movl    %edx, %ecx
-       shrl    $2, %ecx
-       andl    $(CHAR_PER_VEC - 1), %ecx
-# endif
-       /* SHIFT_REG is ecx for USE_AS_WCSLEN and edx otherwise.  */
-       sarxl   %SHIFT_REG, %eax, %eax
+       movl    %ecx, %edx
+       shrl    $2, %edx
+       andl    $(CHAR_PER_VEC - 1), %edx
+       shrx    %edx, %eax, %eax
         testl   %eax, %eax
-# ifndef USE_AS_STRNLEN
-       jz      L(cross_page_continue)
-       tzcntl  %eax, %eax
-       ret
  # else
-       jnz     L(cross_page_less_vec)
-#  ifndef USE_AS_WCSLEN
-       movl    %edx, %ecx
-       andl    $(CHAR_PER_VEC - 1), %ecx
-#  endif
-       movl    $CHAR_PER_VEC, %eax
-       subl    %ecx, %eax
-       /* Check the end of data.  */
-       cmpq    %rax, %rsi
-       ja      L(cross_page_continue)
-       movl    %esi, %eax
-       ret
-L(cross_page_less_vec):
-       tzcntl  %eax, %eax
-       /* Select min of length and position of first null.  */
-       cmpq    %rax, %rsi
-       cmovb   %esi, %eax
-       ret
+       shr     %cl, %VRAX
  # endif
+       jz      L(cross_page_continue)
+       bsf     %VRAX, %VRAX
+       ret
  
  END (STRLEN)
  #endif
diff --git a/sysdeps/x86_64/multiarch/strnlen-evex.S b/sysdeps/x86_64/multiarch/strnlen-evex.S

index 64a9fc26064db18aa788df65104244dc20a37ec2..443a32a7496450c44fef78e3a00cb4781166b8cb 100644 (file)
--- a/sysdeps/x86_64/multiarch/strnlen-evex.S
+++ b/sysdeps/x86_64/multiarch/strnlen-evex.S
@@ -1,8 +1,423 @@
-#ifndef STRNLEN
-# define STRNLEN __strnlen_evex
-#endif
+/* strnlen/wcsnlen optimized with 256-bit EVEX instructions.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <isa-level.h>
+#include <sysdep.h>
+
+#if ISA_SHOULD_BUILD (4)
+
+# ifndef VEC_SIZE
+#  include "x86-evex256-vecs.h"
+# endif
+
+
+# ifndef STRNLEN
+#  define STRNLEN      __strnlen_evex
+# endif
+
+# ifdef USE_AS_WCSLEN
+#  define VPCMPEQ      vpcmpeqd
+#  define VPCMPNEQ     vpcmpneqd
+#  define VPTESTN      vptestnmd
+#  define VPTEST       vptestmd
+#  define VPMINU       vpminud
+#  define CHAR_SIZE    4
+
+# else
+#  define VPCMPEQ      vpcmpeqb
+#  define VPCMPNEQ     vpcmpneqb
+#  define VPTESTN      vptestnmb
+#  define VPTEST       vptestmb
+#  define VPMINU       vpminub
+#  define CHAR_SIZE    1
+
+#  define REG_WIDTH    VEC_SIZE
+# endif
+
+# define CHAR_PER_VEC  (VEC_SIZE / CHAR_SIZE)
+
+# include "reg-macros.h"
+
+# if CHAR_PER_VEC == 32
+#  define SUB_SHORT(imm, reg)  subb $(imm), %VGPR_SZ(reg, 8)
+# else
+#  define SUB_SHORT(imm, reg)  subl $(imm), %VGPR_SZ(reg, 32)
+# endif
+
+
+
+# if CHAR_PER_VEC == 64
+#  define FALLTHROUGH_RETURN_OFFSET    (CHAR_PER_VEC * 3)
+# else
+#  define FALLTHROUGH_RETURN_OFFSET    (CHAR_PER_VEC * 2)
+# endif
+
+
+# define XZERO VMM_128(0)
+# define VZERO VMM(0)
+# define PAGE_SIZE     4096
+
+       .section SECTION(.text), "ax", @progbits
+ENTRY_P2ALIGN (STRNLEN, 6)
+       /* Check zero length.  */
+       test    %RSI_LP, %RSI_LP
+       jz      L(zero)
+# ifdef __ILP32__
+       /* Clear the upper 32 bits.  */
+       movl    %esi, %esi
+# endif
+
+       movl    %edi, %eax
+       vpxorq  %XZERO, %XZERO, %XZERO
+       andl    $(PAGE_SIZE - 1), %eax
+       cmpl    $(PAGE_SIZE - VEC_SIZE), %eax
+       ja      L(cross_page_boundary)
+
+       /* Check the first VEC_SIZE bytes.  Each bit in K0 represents a
+          null byte.  */
+       VPCMPEQ (%rdi), %VZERO, %k0
+
+       KMOV    %k0, %VRCX
+       movq    %rsi, %rax
+
+       /* If src (rcx) is zero, bsf does not change the result.  NB:
+          Must use 64-bit bsf here so that upper bits of len are not
+          cleared.  */
+       bsfq    %rcx, %rax
+       /* If rax > CHAR_PER_VEC then rcx must have been zero (no null
+          CHAR) and rsi must be > CHAR_PER_VEC.  */
+       cmpq    $CHAR_PER_VEC, %rax
+       ja      L(more_1x_vec)
+       /* Check if first match in bounds.  */
+       cmpq    %rax, %rsi
+       cmovb   %esi, %eax
+       ret
+
+
+# if CHAR_PER_VEC != 32
+       .p2align 4,, 2
+L(zero):
+L(max_0):
+       movl    %esi, %eax
+       ret
+# endif
+
+       /* Aligned more for strnlen compares remaining length vs 2 *
+          CHAR_PER_VEC, 4 * CHAR_PER_VEC, and 8 * CHAR_PER_VEC before
+          going to the loop.  */
+       .p2align 4,, 10
+L(more_1x_vec):
+L(cross_page_continue):
+       /* Compute number of words checked after aligning.  */
+# ifdef USE_AS_WCSLEN
+       /* Need to compute directly for wcslen as CHAR_SIZE * rsi can
+          overflow.  */
+       movq    %rdi, %rax
+       andq    $(VEC_SIZE * -1), %rdi
+       subq    %rdi, %rax
+       sarq    $2, %rax
+       leaq    -(CHAR_PER_VEC * 1)(%rax, %rsi), %rax
+# else
+       leaq    (VEC_SIZE * -1)(%rsi, %rdi), %rax
+       andq    $(VEC_SIZE * -1), %rdi
+       subq    %rdi, %rax
+# endif
+
+
+       VPCMPEQ VEC_SIZE(%rdi), %VZERO, %k0
+
+       cmpq    $(CHAR_PER_VEC * 2), %rax
+       ja      L(more_2x_vec)
+
+L(last_2x_vec_or_less):
+       KMOV    %k0, %VRDX
+       test    %VRDX, %VRDX
+       jnz     L(last_vec_check)
+
+       /* Check the end of data.  */
+       SUB_SHORT (CHAR_PER_VEC, rax)
+       jbe     L(max_0)
+       VPCMPEQ (VEC_SIZE * 2)(%rdi), %VZERO, %k0
+       KMOV    %k0, %VRDX
+       test    %VRDX, %VRDX
+       jz      L(max_0)
+       /* Best place for LAST_VEC_CHECK if ZMM.  */
+       .p2align 4,, 8
+L(last_vec_check):
+       bsf     %VRDX, %VRDX
+       sub     %eax, %edx
+       lea     (%rsi, %rdx), %eax
+       cmovae  %esi, %eax
+       ret
+
+# if CHAR_PER_VEC == 32
+       .p2align 4,, 2
+L(zero):
+L(max_0):
+       movl    %esi, %eax
+       ret
+# endif
+
+       .p2align 4,, 8
+L(last_4x_vec_or_less):
+       addl    $(CHAR_PER_VEC * -4), %eax
+       VPCMPEQ (VEC_SIZE * 5)(%rdi), %VZERO, %k0
+       subq    $(VEC_SIZE * -4), %rdi
+       cmpl    $(CHAR_PER_VEC * 2), %eax
+       jbe     L(last_2x_vec_or_less)
+
+       .p2align 4,, 6
+L(more_2x_vec):
+       /* Remaining length >= 2 * CHAR_PER_VEC so do VEC0/VEC1 without
+          rechecking bounds.  */
  
-#define USE_AS_STRNLEN 1
-#define STRLEN STRNLEN
+       KMOV    %k0, %VRDX
  
-#include "strlen-evex.S"
+       test    %VRDX, %VRDX
+       jnz     L(first_vec_x1)
+
+       VPCMPEQ (VEC_SIZE * 2)(%rdi), %VZERO, %k0
+       KMOV    %k0, %VRDX
+       test    %VRDX, %VRDX
+       jnz     L(first_vec_x2)
+
+       cmpq    $(CHAR_PER_VEC * 4), %rax
+       ja      L(more_4x_vec)
+
+
+       VPCMPEQ (VEC_SIZE * 3)(%rdi), %VZERO, %k0
+       KMOV    %k0, %VRDX
+       addl    $(CHAR_PER_VEC * -2), %eax
+       test    %VRDX, %VRDX
+       jnz     L(last_vec_check)
+
+       subl    $(CHAR_PER_VEC), %eax
+       jbe     L(max_1)
+
+       VPCMPEQ (VEC_SIZE * 4)(%rdi), %VZERO, %k0
+       KMOV    %k0, %VRDX
+
+       test    %VRDX, %VRDX
+       jnz     L(last_vec_check)
+L(max_1):
+       movl    %esi, %eax
+       ret
+
+       .p2align 4,, 3
+L(first_vec_x2):
+# if VEC_SIZE == 64
+       /* If VEC_SIZE == 64 we can fit logic for full return label in
+          spare bytes before next cache line.  */
+       bsf     %VRDX, %VRDX
+       sub     %eax, %esi
+       leal    (CHAR_PER_VEC * 1)(%rsi, %rdx), %eax
+       ret
+       .p2align 4,, 6
+# else
+       addl    $CHAR_PER_VEC, %esi
+# endif
+L(first_vec_x1):
+       bsf     %VRDX, %VRDX
+       sub     %eax, %esi
+       leal    (CHAR_PER_VEC * 0)(%rsi, %rdx), %eax
+       ret
+
+
+       .p2align 4,, 6
+L(first_vec_x4):
+# if VEC_SIZE == 64
+       /* If VEC_SIZE == 64 we can fit logic for full return label in
+          spare bytes before next cache line.  */
+       bsf     %VRDX, %VRDX
+       sub     %eax, %esi
+       leal    (CHAR_PER_VEC * 3)(%rsi, %rdx), %eax
+       ret
+       .p2align 4,, 6
+# else
+       addl    $CHAR_PER_VEC, %esi
+# endif
+L(first_vec_x3):
+       bsf     %VRDX, %VRDX
+       sub     %eax, %esi
+       leal    (CHAR_PER_VEC * 2)(%rsi, %rdx), %eax
+       ret
+
+       .p2align 4,, 5
+L(more_4x_vec):
+       VPCMPEQ (VEC_SIZE * 3)(%rdi), %VZERO, %k0
+       KMOV    %k0, %VRDX
+       test    %VRDX, %VRDX
+       jnz     L(first_vec_x3)
+
+       VPCMPEQ (VEC_SIZE * 4)(%rdi), %VZERO, %k0
+       KMOV    %k0, %VRDX
+       test    %VRDX, %VRDX
+       jnz     L(first_vec_x4)
+
+       /* Check if at last VEC_SIZE * 4 length before aligning for the
+          loop.  */
+       cmpq    $(CHAR_PER_VEC * 8), %rax
+       jbe     L(last_4x_vec_or_less)
+
+
+       /* Compute number of words checked after aligning.  */
+# ifdef USE_AS_WCSLEN
+       /* Need to compute directly for wcslen as CHAR_SIZE * rsi can
+          overflow.  */
+       leaq    (VEC_SIZE * -3)(%rdi), %rdx
+# else
+       leaq    (VEC_SIZE * -3)(%rdi, %rax), %rax
+# endif
+
+       subq    $(VEC_SIZE * -1), %rdi
+
+       /* Align data to VEC_SIZE * 4.  */
+# if VEC_SIZE == 64
+       /* Saves code size.  No evex512 processor has partial register
+          stalls.  If that changes this can be replaced with `andq
+          $-(VEC_SIZE * 4), %rdi`.  */
+       xorb    %dil, %dil
+# else
+       andq    $-(VEC_SIZE * 4), %rdi
+# endif
+
+# ifdef USE_AS_WCSLEN
+       subq    %rdi, %rdx
+       sarq    $2, %rdx
+       addq    %rdx, %rax
+# else
+       subq    %rdi, %rax
+# endif
+       /* Compare 4 * VEC at a time forward.  */
+       .p2align 4,, 11
+L(loop_4x_vec):
+       VMOVA   (VEC_SIZE * 4)(%rdi), %VMM(1)
+       VPMINU  (VEC_SIZE * 5)(%rdi), %VMM(1), %VMM(2)
+       VMOVA   (VEC_SIZE * 6)(%rdi), %VMM(3)
+       VPMINU  (VEC_SIZE * 7)(%rdi), %VMM(3), %VMM(4)
+       VPTESTN %VMM(2), %VMM(2), %k0
+       VPTESTN %VMM(4), %VMM(4), %k2
+       subq    $-(VEC_SIZE * 4), %rdi
+       /* Break if at end of length.  */
+       subq    $(CHAR_PER_VEC * 4), %rax
+       jbe     L(loop_len_end)
+
+
+       KORTEST %k0, %k2
+       jz      L(loop_4x_vec)
+
+
+L(loop_last_4x_vec):
+       movq    %rsi, %rcx
+       subq    %rax, %rsi
+       VPTESTN %VMM(1), %VMM(1), %k1
+       KMOV    %k1, %VRDX
+       test    %VRDX, %VRDX
+       jnz     L(last_vec_x0)
+
+       KMOV    %k0, %VRDX
+       test    %VRDX, %VRDX
+       jnz     L(last_vec_x1)
+
+       VPTESTN %VMM(3), %VMM(3), %k0
+
+       /* Separate logic for VEC_SIZE == 64 and VEC_SIZE == 32 for
+          returning last 2x VEC. For VEC_SIZE == 64 we test each VEC
+          individually, for VEC_SIZE == 32 we combine them in a single
+          64-bit GPR.  */
+# if CHAR_PER_VEC == 64
+       KMOV    %k0, %VRDX
+       test    %VRDX, %VRDX
+       jnz     L(last_vec_x2)
+       KMOV    %k2, %VRDX
+# else
+       /* We can only combine last 2x VEC masks if CHAR_PER_VEC <= 32.
+        */
+       kmovd   %k2, %edx
+       kmovd   %k0, %eax
+       salq    $CHAR_PER_VEC, %rdx
+       orq     %rax, %rdx
+# endif
+
+       /* Fallthrough return: the x3 VEC for strnlen-ZMM and the
+          combined x2/x3 VEC mask for strnlen-YMM.  */
+       bsfq    %rdx, %rdx
+       leaq    (FALLTHROUGH_RETURN_OFFSET - CHAR_PER_VEC * 4)(%rsi, %rdx), %rax
+       cmpq    %rax, %rcx
+       cmovb   %rcx, %rax
+       ret
+
+       /* Handle last 4x VEC after loop. All VECs have been loaded.  */
+       .p2align 4,, 4
+L(loop_len_end):
+       KORTEST %k0, %k2
+       jnz     L(loop_last_4x_vec)
+       movq    %rsi, %rax
+       ret
+
+
+# if CHAR_PER_VEC == 64
+       /* Since we can't combine the last 2x VEC for VEC_SIZE == 64 we
+          need a return label for it.  */
+       .p2align 4,, 8
+L(last_vec_x2):
+       bsf     %VRDX, %VRDX
+       leaq    (CHAR_PER_VEC * -2)(%rsi, %rdx), %rax
+       cmpq    %rax, %rcx
+       cmovb   %rcx, %rax
+       ret
+# endif
+
+
+       .p2align 4,, 10
+L(last_vec_x1):
+       addq    $CHAR_PER_VEC, %rsi
+L(last_vec_x0):
+       bsf     %VRDX, %VRDX
+       leaq    (CHAR_PER_VEC * -4)(%rsi, %rdx), %rax
+       cmpq    %rax, %rcx
+       cmovb   %rcx, %rax
+       ret
+
+
+       .p2align 4,, 8
+L(cross_page_boundary):
+       /* Reached when a VEC_SIZE load from rdi could cross a page.  On
+          entry eax = rdi & (PAGE_SIZE - 1) and rsi = max length.  Load
+          the VEC_SIZE aligned VEC containing rdi instead; rdi itself
+          is left unaligned for L(cross_page_continue).  */
+       movq    %rdi, %rcx
+       andq    $-VEC_SIZE, %rcx
+       VPCMPEQ (%rcx), %VZERO, %k0
+
+       KMOV    %k0, %VRCX
+# ifdef USE_AS_WCSLEN
+       /* Each mask bit covers CHAR_SIZE bytes, so convert the byte
+          offset into a CHAR offset within the VEC.  */
+       shrl    $2, %eax
+       andl    $(CHAR_PER_VEC - 1), %eax
+# endif
+       /* Shift out the mask bits for CHARs that precede rdi.  */
+       shrx    %VRAX, %VRCX, %VRCX
+
+       /* eax = number of CHARs from rdi to the end of this VEC (modulo
+          CHAR_PER_VEC).  */
+       negl    %eax
+       andl    $(CHAR_PER_VEC - 1), %eax
+       movq    %rsi, %rdx
+       /* If src (rcx) is zero, bsf does not change the result so rdx
+          keeps the length.  NB: Must use 64-bit bsf here, as in the
+          non page cross path, so that upper bits of len are not
+          cleared.  shrx zero-extends rcx when VRCX is a 32-bit
+          register so a found position is unchanged.  */
+       bsfq    %rcx, %rdx
+       /* rdx = position of first null CHAR if one was found, else len.
+          If that is past the CHARs left in this VEC keep searching.  */
+       cmpq    %rax, %rdx
+       ja      L(cross_page_continue)
+       /* Return min (rdx, len).  */
+       movl    %edx, %eax
+       cmpq    %rdx, %rsi
+       cmovb   %esi, %eax
+       ret
+END (STRNLEN)
+#endif
diff --git a/sysdeps/x86_64/multiarch/wcsnlen-evex.S b/sysdeps/x86_64/multiarch/wcsnlen-evex.S

index e2aad94c1e8dd13aacdcfe7880d8f6c4f33fbafb..57a7e93fbf2e67afcb7ccdc428ecdd1cdd615454 100644 (file)
--- a/sysdeps/x86_64/multiarch/wcsnlen-evex.S
+++ b/sysdeps/x86_64/multiarch/wcsnlen-evex.S
@@ -2,8 +2,7 @@
  # define WCSNLEN       __wcsnlen_evex
  #endif
  
-#define STRLEN WCSNLEN
+#define STRNLEN        WCSNLEN
  #define USE_AS_WCSLEN 1
-#define USE_AS_STRNLEN 1
  
-#include "strlen-evex.S"
+#include "strnlen-evex.S"
author	Noah Goldstein <goldstein.w.n@gmail.com>
	Wed, 19 Oct 2022 00:44:05 +0000 (17:44 -0700)
committer	Noah Goldstein <goldstein.w.n@gmail.com>
	Thu, 20 Oct 2022 00:31:03 +0000 (17:31 -0700)
sysdeps/x86_64/multiarch/strlen-evex.S		patch \| blob \| blame \| history
sysdeps/x86_64/multiarch/strnlen-evex.S		patch \| blob \| blame \| history
sysdeps/x86_64/multiarch/wcsnlen-evex.S		patch \| blob \| blame \| history