[PATCH v1 08/23] x86: Optimize strspn in strspn-c.c

Noah Goldstein goldstein.w.n@gmail.com
Wed Mar 23 21:57:24 GMT 2022


Use _mm_cmpeq_epi8 and _mm_movemask_epi8 to get strlen instead of
_mm_cmpistri. Also change offset to unsigned to avoid unnecessary
sign extensions.

geometric_mean(N=20) of all benchmarks that don't fall back on
sse2; New / Original: .901

All string/memory tests pass.
---
Geometric Mean N=20 runs; All functions page aligned
len, align1, align2,  pos, New Time / Old Time
  1,      0,      0,  512,               0.768
  1,      1,      0,  512,               0.666
  1,      0,      1,  512,               1.193
  1,      1,      1,  512,               0.872
  2,      0,      0,  512,               0.698
  2,      2,      0,  512,               0.687
  2,      0,      2,  512,               1.393
  2,      2,      2,  512,               0.944
  3,      0,      0,  512,               0.691
  3,      3,      0,  512,               0.676
  3,      0,      3,  512,               1.388
  3,      3,      3,  512,               0.948
  4,      0,      0,  512,                0.74
  4,      4,      0,  512,               0.678
  4,      0,      4,  512,               1.421
  4,      4,      4,  512,               0.943
  5,      0,      0,  512,               0.691
  5,      5,      0,  512,               0.675
  5,      0,      5,  512,               1.348
  5,      5,      5,  512,               0.952
  6,      0,      0,  512,               0.685
  6,      6,      0,  512,                0.67
  6,      0,      6,  512,               1.333
  6,      6,      6,  512,                0.95
  7,      0,      0,  512,               0.688
  7,      7,      0,  512,               0.675
  7,      0,      7,  512,               1.344
  7,      7,      7,  512,               0.919
  8,      0,      0,  512,               0.716
  8,      0,      8,  512,               0.935
  9,      0,      0,  512,               0.716
  9,      1,      0,  512,               0.712
  9,      0,      9,  512,               0.956
  9,      1,      9,  512,               0.992
 10,      0,      0,  512,               0.699
 10,      2,      0,  512,                0.68
 10,      0,     10,  512,               0.952
 10,      2,     10,  512,               0.932
 11,      0,      0,  512,               0.705
 11,      3,      0,  512,               0.685
 11,      0,     11,  512,               0.956
 11,      3,     11,  512,               0.927
 12,      0,      0,  512,               0.695
 12,      4,      0,  512,               0.675
 12,      0,     12,  512,               0.948
 12,      4,     12,  512,               0.928
 13,      0,      0,  512,                 0.7
 13,      5,      0,  512,               0.678
 13,      0,     13,  512,               0.944
 13,      5,     13,  512,               0.931
 14,      0,      0,  512,               0.703
 14,      6,      0,  512,               0.678
 14,      0,     14,  512,               0.949
 14,      6,     14,  512,                0.93
 15,      0,      0,  512,               0.694
 15,      7,      0,  512,               0.678
 15,      0,     15,  512,               0.953
 15,      7,     15,  512,               0.924
 16,      0,      0,  512,               1.021
 16,      0,     16,  512,               1.067
 17,      0,      0,  512,               0.991
 17,      1,      0,  512,               0.984
 17,      0,     17,  512,               0.979
 17,      1,     17,  512,               0.993
 18,      0,      0,  512,               0.992
 18,      2,      0,  512,               1.008
 18,      0,     18,  512,               1.016
 18,      2,     18,  512,               0.993
 19,      0,      0,  512,               0.984
 19,      3,      0,  512,               0.985
 19,      0,     19,  512,               1.007
 19,      3,     19,  512,               1.006
 20,      0,      0,  512,               0.969
 20,      4,      0,  512,               0.968
 20,      0,     20,  512,               0.975
 20,      4,     20,  512,               0.975
 21,      0,      0,  512,               0.992
 21,      5,      0,  512,               0.992
 21,      0,     21,  512,                0.98
 21,      5,     21,  512,                0.97
 22,      0,      0,  512,               0.989
 22,      6,      0,  512,               0.987
 22,      0,     22,  512,                0.99
 22,      6,     22,  512,               0.985
 23,      0,      0,  512,               0.989
 23,      7,      0,  512,                0.98
 23,      0,     23,  512,                 1.0
 23,      7,     23,  512,               0.993
 24,      0,      0,  512,                0.99
 24,      0,     24,  512,               0.998
 25,      0,      0,  512,                1.01
 25,      1,      0,  512,                 1.0
 25,      0,     25,  512,                0.97
 25,      1,     25,  512,               0.967
 26,      0,      0,  512,               1.009
 26,      2,      0,  512,               0.986
 26,      0,     26,  512,               0.997
 26,      2,     26,  512,               0.993
 27,      0,      0,  512,               0.984
 27,      3,      0,  512,               0.997
 27,      0,     27,  512,               0.989
 27,      3,     27,  512,               0.976
 28,      0,      0,  512,               0.991
 28,      4,      0,  512,               1.003
 28,      0,     28,  512,               0.986
 28,      4,     28,  512,               0.989
 29,      0,      0,  512,               0.986
 29,      5,      0,  512,               0.985
 29,      0,     29,  512,               0.984
 29,      5,     29,  512,               0.977
 30,      0,      0,  512,               0.991
 30,      6,      0,  512,               0.987
 30,      0,     30,  512,               0.979
 30,      6,     30,  512,               0.974
 31,      0,      0,  512,               0.995
 31,      7,      0,  512,               0.995
 31,      0,     31,  512,               0.994
 31,      7,     31,  512,               0.984
  4,      0,      0,   32,               0.861
  4,      1,      0,   32,               0.864
  4,      0,      1,   32,               0.962
  4,      1,      1,   32,               0.967
  4,      0,      0,   64,               0.884
  4,      2,      0,   64,               0.818
  4,      0,      2,   64,               0.889
  4,      2,      2,   64,               0.918
  4,      0,      0,  128,               0.942
  4,      3,      0,  128,               0.884
  4,      0,      3,  128,               0.931
  4,      3,      3,  128,               0.883
  4,      0,      0,  256,               0.964
  4,      4,      0,  256,               0.922
  4,      0,      4,  256,               0.956
  4,      4,      4,  256,                0.93
  4,      5,      0,  512,               0.833
  4,      0,      5,  512,               1.027
  4,      5,      5,  512,               0.929
  4,      0,      0, 1024,               0.998
  4,      6,      0, 1024,               0.986
  4,      0,      6, 1024,               0.984
  4,      6,      6, 1024,               0.977
  4,      0,      0, 2048,               0.991
  4,      7,      0, 2048,               0.987
  4,      0,      7, 2048,               0.996
  4,      7,      7, 2048,                0.98
 10,      1,      0,   64,               0.826
 10,      1,      1,   64,               0.907
 10,      2,      0,   64,               0.829
 10,      2,      2,   64,                0.91
 10,      3,      0,   64,                0.83
 10,      3,      3,   64,               0.915
 10,      4,      0,   64,                0.83
 10,      4,      4,   64,               0.911
 10,      5,      0,   64,               0.828
 10,      5,      5,   64,               0.905
 10,      6,      0,   64,               0.828
 10,      6,      6,   64,               0.812
 10,      7,      0,   64,                0.83
 10,      7,      7,   64,               0.819
  6,      0,      0,    0,               1.261
  6,      0,      0,    1,               1.252
  6,      0,      1,    1,               0.845
  6,      0,      0,    2,                1.27
  6,      0,      2,    2,                0.85
  6,      0,      0,    3,               1.269
  6,      0,      3,    3,               0.845
  6,      0,      0,    4,               1.287
  6,      0,      4,    4,               0.852
  6,      0,      0,    5,               1.278
  6,      0,      5,    5,               0.851
  6,      0,      0,    6,               1.269
  6,      0,      6,    6,               0.841
  6,      0,      0,    7,               1.268
  6,      0,      7,    7,               0.851
  6,      0,      0,    8,               1.291
  6,      0,      8,    8,               0.837
  6,      0,      0,    9,               1.283
  6,      0,      9,    9,               0.831
  6,      0,      0,   10,               1.252
  6,      0,     10,   10,               0.997
  6,      0,      0,   11,               1.295
  6,      0,     11,   11,               1.046
  6,      0,      0,   12,               1.296
  6,      0,     12,   12,               1.038
  6,      0,      0,   13,               1.287
  6,      0,     13,   13,               1.082
  6,      0,      0,   14,               1.284
  6,      0,     14,   14,               1.001
  6,      0,      0,   15,               1.286
  6,      0,     15,   15,               1.002
  6,      0,      0,   16,               0.894
  6,      0,     16,   16,               0.874
  6,      0,      0,   17,               0.892
  6,      0,     17,   17,               0.974
  6,      0,      0,   18,               0.907
  6,      0,     18,   18,               0.993
  6,      0,      0,   19,               0.909
  6,      0,     19,   19,                0.99
  6,      0,      0,   20,               0.894
  6,      0,     20,   20,               0.978
  6,      0,      0,   21,                0.89
  6,      0,     21,   21,               0.958
  6,      0,      0,   22,               0.893
  6,      0,     22,   22,                0.99
  6,      0,      0,   23,               0.899
  6,      0,     23,   23,               0.986
  6,      0,      0,   24,               0.893
  6,      0,     24,   24,               0.989
  6,      0,      0,   25,               0.889
  6,      0,     25,   25,               0.982
  6,      0,      0,   26,               0.889
  6,      0,     26,   26,               0.852
  6,      0,      0,   27,                0.89
  6,      0,     27,   27,               0.832
  6,      0,      0,   28,                0.89
  6,      0,     28,   28,               0.831
  6,      0,      0,   29,                0.89
  6,      0,     29,   29,               0.838
  6,      0,      0,   30,               0.907
  6,      0,     30,   30,               0.833
  6,      0,      0,   31,               0.888
  6,      0,     31,   31,               0.837
  6,      0,      0,   32,               0.853
  6,      0,     32,   32,               0.828
  6,      0,      0,   33,               0.857
  6,      0,     33,   33,               0.947
  6,      0,      0,   34,               0.847
  6,      0,     34,   34,               0.954
  6,      0,      0,   35,               0.841
  6,      0,     35,   35,                0.94
  6,      0,      0,   36,               0.854
  6,      0,     36,   36,               0.958
  6,      0,      0,   37,               0.856
  6,      0,     37,   37,               0.957
  6,      0,      0,   38,               0.839
  6,      0,     38,   38,               0.962
  6,      0,      0,   39,               0.866
  6,      0,     39,   39,               0.945
  6,      0,      0,   40,               0.845
  6,      0,     40,   40,               0.961
  6,      0,      0,   41,               0.858
  6,      0,     41,   41,               0.961
  6,      0,      0,   42,               0.862
  6,      0,     42,   42,               0.825
  6,      0,      0,   43,               0.864
  6,      0,     43,   43,                0.82
  6,      0,      0,   44,               0.843
  6,      0,     44,   44,                0.81
  6,      0,      0,   45,               0.859
  6,      0,     45,   45,               0.816
  6,      0,      0,   46,               0.866
  6,      0,     46,   46,                0.81
  6,      0,      0,   47,               0.858
  6,      0,     47,   47,               0.807
  6,      0,      0,   48,                0.87
  6,      0,     48,   48,                0.87
  6,      0,      0,   49,               0.871
  6,      0,     49,   49,               0.874
  6,      0,      0,   50,                0.87
  6,      0,     50,   50,               0.881
  6,      0,      0,   51,               0.868
  6,      0,     51,   51,               0.875
  6,      0,      0,   52,               0.873
  6,      0,     52,   52,               0.871
  6,      0,      0,   53,               0.866
  6,      0,     53,   53,               0.882
  6,      0,      0,   54,               0.863
  6,      0,     54,   54,               0.876
  6,      0,      0,   55,               0.851
  6,      0,     55,   55,               0.871
  6,      0,      0,   56,               0.867
  6,      0,     56,   56,               0.888
  6,      0,      0,   57,               0.862
  6,      0,     57,   57,               0.899
  6,      0,      0,   58,               0.873
  6,      0,     58,   58,               0.798
  6,      0,      0,   59,               0.881
  6,      0,     59,   59,               0.785
  6,      0,      0,   60,               0.867
  6,      0,     60,   60,               0.797
  6,      0,      0,   61,               0.872
  6,      0,     61,   61,               0.791
  6,      0,      0,   62,               0.859
  6,      0,     62,   62,                0.79
  6,      0,      0,   63,                0.87
  6,      0,     63,   63,               0.796

 sysdeps/x86_64/multiarch/strspn-c.c | 86 +++++++++++++----------------
 1 file changed, 39 insertions(+), 47 deletions(-)

diff --git a/sysdeps/x86_64/multiarch/strspn-c.c b/sysdeps/x86_64/multiarch/strspn-c.c
index 8fb3aba64d..6124033ceb 100644
--- a/sysdeps/x86_64/multiarch/strspn-c.c
+++ b/sysdeps/x86_64/multiarch/strspn-c.c
@@ -62,81 +62,73 @@ __strspn_sse42 (const char *s, const char *a)
     return 0;
 
   const char *aligned;
-  __m128i mask;
-  int offset = (int) ((size_t) a & 15);
+  __m128i mask, maskz, zero;
+  unsigned int maskz_bits;
+  unsigned int offset = (int) ((size_t) a & 15);
+  zero = _mm_set1_epi8 (0);
   if (offset != 0)
     {
       /* Load masks.  */
       aligned = (const char *) ((size_t) a & -16L);
       __m128i mask0 = _mm_load_si128 ((__m128i *) aligned);
-
-      mask = __m128i_shift_right (mask0, offset);
+      maskz = _mm_cmpeq_epi8 (mask0, zero);
 
       /* Find where the NULL terminator is.  */
-      int length = _mm_cmpistri (mask, mask, 0x3a);
-      if (length == 16 - offset)
-	{
-	  /* There is no NULL terminator.  */
-	  __m128i mask1 = _mm_load_si128 ((__m128i *) (aligned + 16));
-	  int index = _mm_cmpistri (mask1, mask1, 0x3a);
-	  length += index;
-
-	  /* Don't use SSE4.2 if the length of A > 16.  */
-	  if (length > 16)
-	    return __strspn_sse2 (s, a);
-
-	  if (index != 0)
-	    {
-	      /* Combine mask0 and mask1.  We could play games with
-		 palignr, but frankly this data should be in L1 now
-		 so do the merge via an unaligned load.  */
-	      mask = _mm_loadu_si128 ((__m128i *) a);
-	    }
-	}
+      maskz_bits = _mm_movemask_epi8 (maskz) >> offset;
+      if (maskz_bits != 0)
+        {
+          mask = __m128i_shift_right (mask0, offset);
+          offset = (unsigned int) ((size_t) s & 15);
+          if (offset)
+            goto start_unaligned;
+
+          aligned = s;
+          goto start_loop;
+        }
     }
-  else
-    {
-      /* A is aligned.  */
-      mask = _mm_load_si128 ((__m128i *) a);
 
-      /* Find where the NULL terminator is.  */
-      int length = _mm_cmpistri (mask, mask, 0x3a);
-      if (length == 16)
-	{
-	  /* There is no NULL terminator.  Don't use SSE4.2 if the length
-	     of A > 16.  */
-	  if (a[16] != 0)
-	    return __strspn_sse2 (s, a);
-	}
+  /* A is aligned.  */
+  mask = _mm_loadu_si128 ((__m128i *) a);
+
+  /* Find where the NULL terminator is.  */
+  maskz = _mm_cmpeq_epi8 (mask, zero);
+  maskz_bits = _mm_movemask_epi8 (maskz);
+  if (maskz_bits == 0)
+    {
+      /* There is no NULL terminator.  Don't use SSE4.2 if the length
+         of A > 16.  */
+      if (a[16] != 0)
+        return __strspn_sse2 (s, a);
     }
+  aligned = s;
+  offset = (unsigned int) ((size_t) s & 15);
 
-  offset = (int) ((size_t) s & 15);
   if (offset != 0)
     {
+    start_unaligned:
       /* Check partial string.  */
       aligned = (const char *) ((size_t) s & -16L);
       __m128i value = _mm_load_si128 ((__m128i *) aligned);
+      __m128i adj_value = __m128i_shift_right (value, offset);
 
-      value = __m128i_shift_right (value, offset);
-
-      int length = _mm_cmpistri (mask, value, 0x12);
+      unsigned int length = _mm_cmpistri (mask, adj_value, 0x12);
       /* No need to check CFlag since it is always 1.  */
       if (length < 16 - offset)
 	return length;
       /* Find where the NULL terminator is.  */
-      int index = _mm_cmpistri (value, value, 0x3a);
-      if (index < 16 - offset)
+      maskz = _mm_cmpeq_epi8 (value, zero);
+      maskz_bits = _mm_movemask_epi8 (maskz) >> offset;
+      if (maskz_bits != 0)
 	return length;
       aligned += 16;
     }
-  else
-    aligned = s;
 
+start_loop:
   while (1)
     {
       __m128i value = _mm_load_si128 ((__m128i *) aligned);
-      int index = _mm_cmpistri (mask, value, 0x12);
-      int cflag = _mm_cmpistrc (mask, value, 0x12);
+      unsigned int index = _mm_cmpistri (mask, value, 0x12);
+      unsigned int cflag = _mm_cmpistrc (mask, value, 0x12);
       if (cflag)
 	return (size_t) (aligned + index - s);
       aligned += 16;
-- 
2.25.1



More information about the Libc-alpha mailing list