[PATCH v1 17/23] x86: Optimize str{n}casecmp TOLOWER logic in strcmp.S

H.J. Lu hjl.tools@gmail.com
Thu Mar 24 19:02:05 GMT 2022


On Wed, Mar 23, 2022 at 3:01 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> Slightly faster method of doing TOLOWER that saves an
> instruction.
>
> Also replace the hard-coded 5-byte nop with .p2align 4. On builds with
> CET enabled the hard-coded nop misaligned the entry to strcasecmp.
>
> geometric_mean(N=40) of all benchmarks New / Original: .894
>
> All string/memory tests pass.
> ---
> Geometric Mean N=40 runs; All functions page aligned
> length, align1, align2, max_char, New Time / Old Time
>      1,      1,      1,      127,               0.903
>      2,      2,      2,      127,               0.905
>      3,      3,      3,      127,               0.877
>      4,      4,      4,      127,               0.888
>      5,      5,      5,      127,               0.901
>      6,      6,      6,      127,               0.954
>      7,      7,      7,      127,               0.932
>      8,      0,      0,      127,               0.918
>      9,      1,      1,      127,               0.914
>     10,      2,      2,      127,               0.877
>     11,      3,      3,      127,               0.909
>     12,      4,      4,      127,               0.876
>     13,      5,      5,      127,               0.886
>     14,      6,      6,      127,               0.914
>     15,      7,      7,      127,               0.939
>      4,      0,      0,      127,               0.963
>      4,      0,      0,      254,               0.943
>      8,      0,      0,      254,               0.927
>     16,      0,      0,      127,               0.876
>     16,      0,      0,      254,               0.865
>     32,      0,      0,      127,               0.865
>     32,      0,      0,      254,               0.862
>     64,      0,      0,      127,               0.863
>     64,      0,      0,      254,               0.896
>    128,      0,      0,      127,               0.885
>    128,      0,      0,      254,               0.882
>    256,      0,      0,      127,                0.87
>    256,      0,      0,      254,               0.869
>    512,      0,      0,      127,               0.832
>    512,      0,      0,      254,               0.848
>   1024,      0,      0,      127,               0.835
>   1024,      0,      0,      254,               0.843
>     16,      1,      2,      127,               0.914
>     16,      2,      1,      254,               0.949
>     32,      2,      4,      127,               0.955
>     32,      4,      2,      254,               1.004
>     64,      3,      6,      127,               0.844
>     64,      6,      3,      254,               0.905
>    128,      4,      0,      127,               0.889
>    128,      0,      4,      254,               0.845
>    256,      5,      2,      127,               0.929
>    256,      2,      5,      254,               0.907
>    512,      6,      4,      127,               0.837
>    512,      4,      6,      254,               0.862
>   1024,      7,      6,      127,               0.895
>   1024,      6,      7,      254,                0.89
>
>  sysdeps/x86_64/strcmp.S | 64 +++++++++++++++++++----------------------
>  1 file changed, 29 insertions(+), 35 deletions(-)
>
> diff --git a/sysdeps/x86_64/strcmp.S b/sysdeps/x86_64/strcmp.S
> index e2ab59c555..99d8b36f1d 100644
> --- a/sysdeps/x86_64/strcmp.S
> +++ b/sysdeps/x86_64/strcmp.S
> @@ -75,9 +75,8 @@ ENTRY2 (__strcasecmp)
>         movq    __libc_tsd_LOCALE@gottpoff(%rip),%rax
>         mov     %fs:(%rax),%RDX_LP
>
> -       // XXX 5 byte should be before the function
> -       /* 5-byte NOP.  */
> -       .byte   0x0f,0x1f,0x44,0x00,0x00
> +       /* Either 1 or 5 bytes (dependeing if CET is enabled).  */
> +       .p2align 4
>  END2 (__strcasecmp)
>  # ifndef NO_NOLOCALE_ALIAS
>  weak_alias (__strcasecmp, strcasecmp)
> @@ -94,9 +93,8 @@ ENTRY2 (__strncasecmp)
>         movq    __libc_tsd_LOCALE@gottpoff(%rip),%rax
>         mov     %fs:(%rax),%RCX_LP
>
> -       // XXX 5 byte should be before the function
> -       /* 5-byte NOP.  */
> -       .byte   0x0f,0x1f,0x44,0x00,0x00
> +       /* Either 1 or 5 bytes (dependeing if CET is enabled).  */
> +       .p2align 4
>  END2 (__strncasecmp)
>  # ifndef NO_NOLOCALE_ALIAS
>  weak_alias (__strncasecmp, strncasecmp)
> @@ -146,22 +144,22 @@ ENTRY (STRCMP)
>  #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
>         .section .rodata.cst16,"aM",@progbits,16
>         .align 16
> -.Lbelowupper:
> -       .quad   0x4040404040404040
> -       .quad   0x4040404040404040
> -.Ltopupper:
> -       .quad   0x5b5b5b5b5b5b5b5b
> -       .quad   0x5b5b5b5b5b5b5b5b
> -.Ltouppermask:
> +.Llcase_min:
> +       .quad   0x3f3f3f3f3f3f3f3f
> +       .quad   0x3f3f3f3f3f3f3f3f
> +.Llcase_max:
> +       .quad   0x9999999999999999
> +       .quad   0x9999999999999999
> +.Lcase_add:
>         .quad   0x2020202020202020
>         .quad   0x2020202020202020
>         .previous
> -       movdqa  .Lbelowupper(%rip), %xmm5
> -# define UCLOW_reg %xmm5
> -       movdqa  .Ltopupper(%rip), %xmm6
> -# define UCHIGH_reg %xmm6
> -       movdqa  .Ltouppermask(%rip), %xmm7
> -# define LCQWORD_reg %xmm7
> +       movdqa  .Llcase_min(%rip), %xmm5
> +# define LCASE_MIN_reg %xmm5
> +       movdqa  .Llcase_max(%rip), %xmm6
> +# define LCASE_MAX_reg %xmm6
> +       movdqa  .Lcase_add(%rip), %xmm7
> +# define CASE_ADD_reg %xmm7
>  #endif
>         cmp     $0x30, %ecx
>         ja      LABEL(crosscache)       /* rsi: 16-byte load will cross cache line */
> @@ -172,22 +170,18 @@ ENTRY (STRCMP)
>         movhpd  8(%rdi), %xmm1
>         movhpd  8(%rsi), %xmm2
>  #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
> -# define TOLOWER(reg1, reg2) \
> -       movdqa  reg1, %xmm8;                                    \
> -       movdqa  UCHIGH_reg, %xmm9;                              \
> -       movdqa  reg2, %xmm10;                                   \
> -       movdqa  UCHIGH_reg, %xmm11;                             \
> -       pcmpgtb UCLOW_reg, %xmm8;                               \
> -       pcmpgtb reg1, %xmm9;                                    \
> -       pcmpgtb UCLOW_reg, %xmm10;                              \
> -       pcmpgtb reg2, %xmm11;                                   \
> -       pand    %xmm9, %xmm8;                                   \
> -       pand    %xmm11, %xmm10;                                 \
> -       pand    LCQWORD_reg, %xmm8;                             \
> -       pand    LCQWORD_reg, %xmm10;                            \
> -       por     %xmm8, reg1;                                    \
> -       por     %xmm10, reg2
> -       TOLOWER (%xmm1, %xmm2)
> +#  define TOLOWER(reg1, reg2) \
> +       movdqa  LCASE_MIN_reg, %xmm8;                                   \
> +       movdqa  LCASE_MIN_reg, %xmm9;                                   \
> +       paddb   reg1, %xmm8;                                    \
> +       paddb   reg2, %xmm9;                                    \
> +       pcmpgtb LCASE_MAX_reg, %xmm8;                           \
> +       pcmpgtb LCASE_MAX_reg, %xmm9;                           \
> +       pandn   CASE_ADD_reg, %xmm8;                                    \
> +       pandn   CASE_ADD_reg, %xmm9;                                    \
> +       paddb   %xmm8, reg1;                                    \
> +       paddb   %xmm9, reg2
> +       TOLOWER (%xmm1, %xmm2)
>  #else
>  # define TOLOWER(reg1, reg2)
>  #endif
> --
> 2.25.1
>

LGTM.

Reviewed-by: H.J. Lu <hjl.tools@gmail.com>

Thanks.

-- 
H.J.


More information about the Libc-alpha mailing list