[PATCH v1 2/2] x86: Small improvements for wcslen
H.J. Lu
hjl.tools@gmail.com
Mon Mar 28 18:51:59 GMT 2022
On Fri, Mar 25, 2022 at 3:13 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> Just a few QOL changes.
> 1. Prefer `add` over `lea` as it can be dispatched to more
> execution units.
> 2. Don't break macro-fusion between `test` and `jcc`
> 3. Reduce code size by removing gratuitous padding bytes (-90
> bytes).
>
> geometric_mean(N=20) of all benchmarks New / Original: 0.959
>
> All string/memory tests pass.
> ---
> sysdeps/x86_64/wcslen.S | 86 ++++++++++++++++++++---------------------
> 1 file changed, 41 insertions(+), 45 deletions(-)
>
> diff --git a/sysdeps/x86_64/wcslen.S b/sysdeps/x86_64/wcslen.S
> index c9165dbf03..d641141d75 100644
> --- a/sysdeps/x86_64/wcslen.S
> +++ b/sysdeps/x86_64/wcslen.S
> @@ -40,82 +40,82 @@ ENTRY (__wcslen)
> pxor %xmm0, %xmm0
>
> lea 32(%rdi), %rax
> - lea 16(%rdi), %rcx
> + addq $16, %rdi
> and $-16, %rax
>
> pcmpeqd (%rax), %xmm0
> pmovmskb %xmm0, %edx
> pxor %xmm1, %xmm1
> + addq $16, %rax
> test %edx, %edx
> - lea 16(%rax), %rax
> jnz L(exit)
>
> pcmpeqd (%rax), %xmm1
> pmovmskb %xmm1, %edx
> pxor %xmm2, %xmm2
> + addq $16, %rax
> test %edx, %edx
> - lea 16(%rax), %rax
> jnz L(exit)
>
> pcmpeqd (%rax), %xmm2
> pmovmskb %xmm2, %edx
> pxor %xmm3, %xmm3
> + addq $16, %rax
> test %edx, %edx
> - lea 16(%rax), %rax
> jnz L(exit)
>
> pcmpeqd (%rax), %xmm3
> pmovmskb %xmm3, %edx
> + addq $16, %rax
> test %edx, %edx
> - lea 16(%rax), %rax
> jnz L(exit)
>
> pcmpeqd (%rax), %xmm0
> pmovmskb %xmm0, %edx
> + addq $16, %rax
> test %edx, %edx
> - lea 16(%rax), %rax
> jnz L(exit)
>
> pcmpeqd (%rax), %xmm1
> pmovmskb %xmm1, %edx
> + addq $16, %rax
> test %edx, %edx
> - lea 16(%rax), %rax
> jnz L(exit)
>
> pcmpeqd (%rax), %xmm2
> pmovmskb %xmm2, %edx
> + addq $16, %rax
> test %edx, %edx
> - lea 16(%rax), %rax
> jnz L(exit)
>
> pcmpeqd (%rax), %xmm3
> pmovmskb %xmm3, %edx
> + addq $16, %rax
> test %edx, %edx
> - lea 16(%rax), %rax
> jnz L(exit)
>
> pcmpeqd (%rax), %xmm0
> pmovmskb %xmm0, %edx
> + addq $16, %rax
> test %edx, %edx
> - lea 16(%rax), %rax
> jnz L(exit)
>
> pcmpeqd (%rax), %xmm1
> pmovmskb %xmm1, %edx
> + addq $16, %rax
> test %edx, %edx
> - lea 16(%rax), %rax
> jnz L(exit)
>
> pcmpeqd (%rax), %xmm2
> pmovmskb %xmm2, %edx
> + addq $16, %rax
> test %edx, %edx
> - lea 16(%rax), %rax
> jnz L(exit)
>
> pcmpeqd (%rax), %xmm3
> pmovmskb %xmm3, %edx
> + addq $16, %rax
> test %edx, %edx
> - lea 16(%rax), %rax
> jnz L(exit)
>
> and $-0x40, %rax
> @@ -132,104 +132,100 @@ L(aligned_64_loop):
> pminub %xmm0, %xmm2
> pcmpeqd %xmm3, %xmm2
> pmovmskb %xmm2, %edx
> + addq $64, %rax
> test %edx, %edx
> - lea 64(%rax), %rax
> jz L(aligned_64_loop)
>
> pcmpeqd -64(%rax), %xmm3
> pmovmskb %xmm3, %edx
> + addq $48, %rdi
> test %edx, %edx
> - lea 48(%rcx), %rcx
> jnz L(exit)
>
> pcmpeqd %xmm1, %xmm3
> pmovmskb %xmm3, %edx
> + addq $-16, %rdi
> test %edx, %edx
> - lea -16(%rcx), %rcx
> jnz L(exit)
>
> pcmpeqd -32(%rax), %xmm3
> pmovmskb %xmm3, %edx
> + addq $-16, %rdi
> test %edx, %edx
> - lea -16(%rcx), %rcx
> jnz L(exit)
>
> pcmpeqd %xmm6, %xmm3
> pmovmskb %xmm3, %edx
> + addq $-16, %rdi
> test %edx, %edx
> - lea -16(%rcx), %rcx
> - jnz L(exit)
> -
> - jmp L(aligned_64_loop)
> + jz L(aligned_64_loop)
>
> .p2align 4
> L(exit):
> - sub %rcx, %rax
> + sub %rdi, %rax
> shr $2, %rax
> test %dl, %dl
> jz L(exit_high)
>
> - mov %dl, %cl
> - and $15, %cl
> + andl $15, %edx
> jz L(exit_1)
> ret
>
> - .p2align 4
> + /* No align here. Naturally aligned % 16 == 1. */
> L(exit_high):
> - mov %dh, %ch
> - and $15, %ch
> + andl $(15 << 8), %edx
> jz L(exit_3)
> add $2, %rax
> ret
>
> - .p2align 4
> + .p2align 3
> L(exit_1):
> add $1, %rax
> ret
>
> - .p2align 4
> + .p2align 3
> L(exit_3):
> add $3, %rax
> ret
>
> - .p2align 4
> + .p2align 3
> L(exit_tail0):
> - xor %rax, %rax
> + xorl %eax, %eax
> ret
>
> - .p2align 4
> + .p2align 3
> L(exit_tail1):
> - mov $1, %rax
> + movl $1, %eax
> ret
>
> - .p2align 4
> + .p2align 3
> L(exit_tail2):
> - mov $2, %rax
> + movl $2, %eax
> ret
>
> - .p2align 4
> + .p2align 3
> L(exit_tail3):
> - mov $3, %rax
> + movl $3, %eax
> ret
>
> - .p2align 4
> + .p2align 3
> L(exit_tail4):
> - mov $4, %rax
> + movl $4, %eax
> ret
>
> - .p2align 4
> + .p2align 3
> L(exit_tail5):
> - mov $5, %rax
> + movl $5, %eax
> ret
>
> - .p2align 4
> + .p2align 3
> L(exit_tail6):
> - mov $6, %rax
> + movl $6, %eax
> ret
>
> - .p2align 4
> + .p2align 3
> L(exit_tail7):
> - mov $7, %rax
> + movl $7, %eax
> ret
>
> END (__wcslen)
> --
> 2.25.1
>
LGTM.
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
Thanks.
--
H.J.
More information about the Libc-alpha
mailing list