[PATCH v1 17/23] x86: Optimize str{n}casecmp TOLOWER logic in strcmp.S
H.J. Lu
hjl.tools@gmail.com
Thu Mar 24 19:02:05 GMT 2022
On Wed, Mar 23, 2022 at 3:01 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> Slightly faster method of doing TOLOWER that saves an
> instruction.
>
> Also replace the hard-coded 5-byte nop with .p2align 4. On builds with
> CET enabled, the hard-coded nop misaligned the entry to strcasecmp.
>
> geometric_mean(N=40) of all benchmarks New / Original: .894
>
> All string/memory tests pass.
> ---
> Geometric Mean N=40 runs; All functions page aligned
> length, align1, align2, max_char, New Time / Old Time
> 1, 1, 1, 127, 0.903
> 2, 2, 2, 127, 0.905
> 3, 3, 3, 127, 0.877
> 4, 4, 4, 127, 0.888
> 5, 5, 5, 127, 0.901
> 6, 6, 6, 127, 0.954
> 7, 7, 7, 127, 0.932
> 8, 0, 0, 127, 0.918
> 9, 1, 1, 127, 0.914
> 10, 2, 2, 127, 0.877
> 11, 3, 3, 127, 0.909
> 12, 4, 4, 127, 0.876
> 13, 5, 5, 127, 0.886
> 14, 6, 6, 127, 0.914
> 15, 7, 7, 127, 0.939
> 4, 0, 0, 127, 0.963
> 4, 0, 0, 254, 0.943
> 8, 0, 0, 254, 0.927
> 16, 0, 0, 127, 0.876
> 16, 0, 0, 254, 0.865
> 32, 0, 0, 127, 0.865
> 32, 0, 0, 254, 0.862
> 64, 0, 0, 127, 0.863
> 64, 0, 0, 254, 0.896
> 128, 0, 0, 127, 0.885
> 128, 0, 0, 254, 0.882
> 256, 0, 0, 127, 0.87
> 256, 0, 0, 254, 0.869
> 512, 0, 0, 127, 0.832
> 512, 0, 0, 254, 0.848
> 1024, 0, 0, 127, 0.835
> 1024, 0, 0, 254, 0.843
> 16, 1, 2, 127, 0.914
> 16, 2, 1, 254, 0.949
> 32, 2, 4, 127, 0.955
> 32, 4, 2, 254, 1.004
> 64, 3, 6, 127, 0.844
> 64, 6, 3, 254, 0.905
> 128, 4, 0, 127, 0.889
> 128, 0, 4, 254, 0.845
> 256, 5, 2, 127, 0.929
> 256, 2, 5, 254, 0.907
> 512, 6, 4, 127, 0.837
> 512, 4, 6, 254, 0.862
> 1024, 7, 6, 127, 0.895
> 1024, 6, 7, 254, 0.89
>
> sysdeps/x86_64/strcmp.S | 64 +++++++++++++++++++----------------------
> 1 file changed, 29 insertions(+), 35 deletions(-)
>
> diff --git a/sysdeps/x86_64/strcmp.S b/sysdeps/x86_64/strcmp.S
> index e2ab59c555..99d8b36f1d 100644
> --- a/sysdeps/x86_64/strcmp.S
> +++ b/sysdeps/x86_64/strcmp.S
> @@ -75,9 +75,8 @@ ENTRY2 (__strcasecmp)
> movq __libc_tsd_LOCALE@gottpoff(%rip),%rax
> mov %fs:(%rax),%RDX_LP
>
> - // XXX 5 byte should be before the function
> - /* 5-byte NOP. */
> - .byte 0x0f,0x1f,0x44,0x00,0x00
> + /* Either 1 or 5 bytes (depending on whether CET is enabled). */
> + .p2align 4
> END2 (__strcasecmp)
> # ifndef NO_NOLOCALE_ALIAS
> weak_alias (__strcasecmp, strcasecmp)
> @@ -94,9 +93,8 @@ ENTRY2 (__strncasecmp)
> movq __libc_tsd_LOCALE@gottpoff(%rip),%rax
> mov %fs:(%rax),%RCX_LP
>
> - // XXX 5 byte should be before the function
> - /* 5-byte NOP. */
> - .byte 0x0f,0x1f,0x44,0x00,0x00
> + /* Either 1 or 5 bytes (depending on whether CET is enabled). */
> + .p2align 4
> END2 (__strncasecmp)
> # ifndef NO_NOLOCALE_ALIAS
> weak_alias (__strncasecmp, strncasecmp)
> @@ -146,22 +144,22 @@ ENTRY (STRCMP)
> #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
> .section .rodata.cst16,"aM",@progbits,16
> .align 16
> -.Lbelowupper:
> - .quad 0x4040404040404040
> - .quad 0x4040404040404040
> -.Ltopupper:
> - .quad 0x5b5b5b5b5b5b5b5b
> - .quad 0x5b5b5b5b5b5b5b5b
> -.Ltouppermask:
> +.Llcase_min:
> + .quad 0x3f3f3f3f3f3f3f3f
> + .quad 0x3f3f3f3f3f3f3f3f
> +.Llcase_max:
> + .quad 0x9999999999999999
> + .quad 0x9999999999999999
> +.Lcase_add:
> .quad 0x2020202020202020
> .quad 0x2020202020202020
> .previous
> - movdqa .Lbelowupper(%rip), %xmm5
> -# define UCLOW_reg %xmm5
> - movdqa .Ltopupper(%rip), %xmm6
> -# define UCHIGH_reg %xmm6
> - movdqa .Ltouppermask(%rip), %xmm7
> -# define LCQWORD_reg %xmm7
> + movdqa .Llcase_min(%rip), %xmm5
> +# define LCASE_MIN_reg %xmm5
> + movdqa .Llcase_max(%rip), %xmm6
> +# define LCASE_MAX_reg %xmm6
> + movdqa .Lcase_add(%rip), %xmm7
> +# define CASE_ADD_reg %xmm7
> #endif
> cmp $0x30, %ecx
> ja LABEL(crosscache) /* rsi: 16-byte load will cross cache line */
> @@ -172,22 +170,18 @@ ENTRY (STRCMP)
> movhpd 8(%rdi), %xmm1
> movhpd 8(%rsi), %xmm2
> #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
> -# define TOLOWER(reg1, reg2) \
> - movdqa reg1, %xmm8; \
> - movdqa UCHIGH_reg, %xmm9; \
> - movdqa reg2, %xmm10; \
> - movdqa UCHIGH_reg, %xmm11; \
> - pcmpgtb UCLOW_reg, %xmm8; \
> - pcmpgtb reg1, %xmm9; \
> - pcmpgtb UCLOW_reg, %xmm10; \
> - pcmpgtb reg2, %xmm11; \
> - pand %xmm9, %xmm8; \
> - pand %xmm11, %xmm10; \
> - pand LCQWORD_reg, %xmm8; \
> - pand LCQWORD_reg, %xmm10; \
> - por %xmm8, reg1; \
> - por %xmm10, reg2
> - TOLOWER (%xmm1, %xmm2)
> +# define TOLOWER(reg1, reg2) \
> + movdqa LCASE_MIN_reg, %xmm8; \
> + movdqa LCASE_MIN_reg, %xmm9; \
> + paddb reg1, %xmm8; \
> + paddb reg2, %xmm9; \
> + pcmpgtb LCASE_MAX_reg, %xmm8; \
> + pcmpgtb LCASE_MAX_reg, %xmm9; \
> + pandn CASE_ADD_reg, %xmm8; \
> + pandn CASE_ADD_reg, %xmm9; \
> + paddb %xmm8, reg1; \
> + paddb %xmm9, reg2
> + TOLOWER (%xmm1, %xmm2)
> #else
> # define TOLOWER(reg1, reg2)
> #endif
> --
> 2.25.1
>
LGTM.
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
Thanks.
--
H.J.
More information about the Libc-alpha
mailing list