[PATCH v1] x86: Move memrchr SSE2 implementation to multiarch/memrchr-sse2.S
H.J. Lu
hjl.tools@gmail.com
Tue Jul 12 22:58:42 GMT 2022
On Tue, Jul 12, 2022 at 12:29 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> This commit doesn't affect libc.so.6; it's just housekeeping to prepare
> for adding explicit ISA level support.
>
> Tested build on x86_64 and x86_32 with/without multiarch.
> ---
> sysdeps/x86_64/memrchr.S | 332 +----------------------
> sysdeps/x86_64/multiarch/memrchr-sse2.S | 336 +++++++++++++++++++++++-
> 2 files changed, 334 insertions(+), 334 deletions(-)
>
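
As a quick reference for readers (not part of the patch, names are mine): the SSE2 code below implements the usual memrchr semantics, which in portable C look roughly like this:

    #include <stddef.h>

    /* Reference for what the vectorized code computes: return a pointer to
       the last byte equal to (unsigned char) c in the first n bytes of s,
       or NULL if there is none.  */
    static void *
    memrchr_ref (const void *s, int c, size_t n)
    {
      const unsigned char *p = (const unsigned char *) s + n;
      while (n--)
        if (*--p == (unsigned char) c)
          return (void *) p;
      return NULL;
    }
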
> diff --git a/sysdeps/x86_64/memrchr.S b/sysdeps/x86_64/memrchr.S
> index b0dffd2ae2..385e2c5668 100644
> --- a/sysdeps/x86_64/memrchr.S
> +++ b/sysdeps/x86_64/memrchr.S
> @@ -17,334 +17,6 @@
> License along with the GNU C Library; if not, see
> <https://www.gnu.org/licenses/>. */
>
> -#include <sysdep.h>
> -#define VEC_SIZE 16
> -#define PAGE_SIZE 4096
> -
> - .text
> -ENTRY_P2ALIGN(__memrchr, 6)
> -#ifdef __ILP32__
> - /* Clear upper bits. */
> - mov %RDX_LP, %RDX_LP
> -#endif
> - movd %esi, %xmm0
> -
> - /* Get end pointer. */
> - leaq (%rdx, %rdi), %rcx
> -
> - punpcklbw %xmm0, %xmm0
> - punpcklwd %xmm0, %xmm0
> - pshufd $0, %xmm0, %xmm0
> -
> - /* Check if we can load 1x VEC without crossing a page. */
> - testl $(PAGE_SIZE - VEC_SIZE), %ecx
> - jz L(page_cross)
> -
> - /* NB: This load happens regardless of whether rdx (len) is zero. Since
> - it doesn't cross a page and the standard guarantees any pointer has
> - at least one valid byte, this load must be safe. For the entire
> - history of the x86 memrchr implementation this has been possible, so
> - no code "should" be relying on a zero-length check before this load.
> - The zero-length check is moved to the page-cross case because it is
> - 1) pretty cold and 2) including it pushes the hot case (len <= VEC_SIZE)
> - onto two cache lines. */
> - movups -(VEC_SIZE)(%rcx), %xmm1
> - pcmpeqb %xmm0, %xmm1
> - pmovmskb %xmm1, %eax
> -
> - subq $VEC_SIZE, %rdx
> - ja L(more_1x_vec)
> -L(ret_vec_x0_test):
> - /* Zero-flag set if eax (src) is zero. Destination unchanged if src is
> - zero. */
> - bsrl %eax, %eax
> - jz L(ret_0)
> - /* Check if the CHAR match is in bounds. Need to truly zero `eax` here
> - if out of bounds. */
> - addl %edx, %eax
> - jl L(zero_0)
> - /* Since we subtracted VEC_SIZE from rdx earlier we can just add to base
> - ptr. */
> - addq %rdi, %rax
> -L(ret_0):
> - ret
> -
> - .p2align 4,, 5
> -L(ret_vec_x0):
> - bsrl %eax, %eax
> - leaq -(VEC_SIZE)(%rcx, %rax), %rax
> - ret
> -
> - .p2align 4,, 2
> -L(zero_0):
> - xorl %eax, %eax
> - ret
> -
> -
> - .p2align 4,, 8
> -L(more_1x_vec):
> - testl %eax, %eax
> - jnz L(ret_vec_x0)
> -
> - /* Align rcx (pointer to string). */
> - decq %rcx
> - andq $-VEC_SIZE, %rcx
> -
> - movq %rcx, %rdx
> - /* NB: We could consistently save 1 byte in this pattern with `movaps
> - %xmm0, %xmm1; pcmpeq IMM8(r), %xmm1; ...`. The reason against it is
> - that it adds more frontend uops (even if the moves can be eliminated)
> - and, some percentage of the time, actual backend uops. */
> - movaps -(VEC_SIZE)(%rcx), %xmm1
> - pcmpeqb %xmm0, %xmm1
> - subq %rdi, %rdx
> - pmovmskb %xmm1, %eax
> -
> - cmpq $(VEC_SIZE * 2), %rdx
> - ja L(more_2x_vec)
> -L(last_2x_vec):
> - subl $VEC_SIZE, %edx
> - jbe L(ret_vec_x0_test)
> -
> - testl %eax, %eax
> - jnz L(ret_vec_x0)
> -
> - movaps -(VEC_SIZE * 2)(%rcx), %xmm1
> - pcmpeqb %xmm0, %xmm1
> - pmovmskb %xmm1, %eax
> -
> - subl $VEC_SIZE, %edx
> - bsrl %eax, %eax
> - jz L(ret_1)
> - addl %edx, %eax
> - jl L(zero_0)
> - addq %rdi, %rax
> -L(ret_1):
> - ret
> -
> - /* Don't align. Otherwise losing the 2-byte encoding of the jump to
> - L(page_cross) causes the hot path (length <= VEC_SIZE) to span
> - multiple cache lines. Naturally aligned % 16 to 8 bytes. */
> -L(page_cross):
> - /* Zero length check. */
> - testq %rdx, %rdx
> - jz L(zero_0)
> -
> - leaq -1(%rcx), %r8
> - andq $-(VEC_SIZE), %r8
> -
> - movaps (%r8), %xmm1
> - pcmpeqb %xmm0, %xmm1
> - pmovmskb %xmm1, %esi
> - /* Shift out negative alignment (because we are starting from endptr and
> - working backwards). */
> - negl %ecx
> - /* 32-bit shift, but VEC_SIZE=16, so the shift count needs to be
> - masked explicitly. */
> - andl $(VEC_SIZE - 1), %ecx
> - shl %cl, %esi
> - movzwl %si, %eax
> - leaq (%rdi, %rdx), %rcx
> - cmpq %rdi, %r8
> - ja L(more_1x_vec)
> - subl $VEC_SIZE, %edx
> - bsrl %eax, %eax
> - jz L(ret_2)
> - addl %edx, %eax
> - jl L(zero_1)
> - addq %rdi, %rax
> -L(ret_2):
> - ret
> -
> - /* Fits in aligning bytes. */
> -L(zero_1):
> - xorl %eax, %eax
> - ret
> -
> - .p2align 4,, 5
> -L(ret_vec_x1):
> - bsrl %eax, %eax
> - leaq -(VEC_SIZE * 2)(%rcx, %rax), %rax
> - ret
> -
> - .p2align 4,, 8
> -L(more_2x_vec):
> - testl %eax, %eax
> - jnz L(ret_vec_x0)
> -
> - movaps -(VEC_SIZE * 2)(%rcx), %xmm1
> - pcmpeqb %xmm0, %xmm1
> - pmovmskb %xmm1, %eax
> - testl %eax, %eax
> - jnz L(ret_vec_x1)
> -
> -
> - movaps -(VEC_SIZE * 3)(%rcx), %xmm1
> - pcmpeqb %xmm0, %xmm1
> - pmovmskb %xmm1, %eax
> -
> - subq $(VEC_SIZE * 4), %rdx
> - ja L(more_4x_vec)
> -
> - addl $(VEC_SIZE), %edx
> - jle L(ret_vec_x2_test)
> -
> -L(last_vec):
> - testl %eax, %eax
> - jnz L(ret_vec_x2)
> -
> - movaps -(VEC_SIZE * 4)(%rcx), %xmm1
> - pcmpeqb %xmm0, %xmm1
> - pmovmskb %xmm1, %eax
> -
> - subl $(VEC_SIZE), %edx
> - bsrl %eax, %eax
> - jz L(ret_3)
> - addl %edx, %eax
> - jl L(zero_2)
> - addq %rdi, %rax
> -L(ret_3):
> - ret
> -
> - .p2align 4,, 6
> -L(ret_vec_x2_test):
> - bsrl %eax, %eax
> - jz L(zero_2)
> - addl %edx, %eax
> - jl L(zero_2)
> - addq %rdi, %rax
> - ret
> -
> -L(zero_2):
> - xorl %eax, %eax
> - ret
> -
> -
> - .p2align 4,, 5
> -L(ret_vec_x2):
> - bsrl %eax, %eax
> - leaq -(VEC_SIZE * 3)(%rcx, %rax), %rax
> - ret
> -
> - .p2align 4,, 5
> -L(ret_vec_x3):
> - bsrl %eax, %eax
> - leaq -(VEC_SIZE * 4)(%rcx, %rax), %rax
> - ret
> -
> - .p2align 4,, 8
> -L(more_4x_vec):
> - testl %eax, %eax
> - jnz L(ret_vec_x2)
> -
> - movaps -(VEC_SIZE * 4)(%rcx), %xmm1
> - pcmpeqb %xmm0, %xmm1
> - pmovmskb %xmm1, %eax
> -
> - testl %eax, %eax
> - jnz L(ret_vec_x3)
> -
> - addq $-(VEC_SIZE * 4), %rcx
> - cmpq $(VEC_SIZE * 4), %rdx
> - jbe L(last_4x_vec)
> -
> - /* Offset everything by 4x VEC_SIZE here to save a few bytes at the
> - end, keeping the code from spilling onto the next cache line. */
> - addq $(VEC_SIZE * 4 - 1), %rcx
> - andq $-(VEC_SIZE * 4), %rcx
> - leaq (VEC_SIZE * 4)(%rdi), %rdx
> - andq $-(VEC_SIZE * 4), %rdx
> -
> - .p2align 4,, 11
> -L(loop_4x_vec):
> - movaps (VEC_SIZE * -1)(%rcx), %xmm1
> - movaps (VEC_SIZE * -2)(%rcx), %xmm2
> - movaps (VEC_SIZE * -3)(%rcx), %xmm3
> - movaps (VEC_SIZE * -4)(%rcx), %xmm4
> - pcmpeqb %xmm0, %xmm1
> - pcmpeqb %xmm0, %xmm2
> - pcmpeqb %xmm0, %xmm3
> - pcmpeqb %xmm0, %xmm4
> -
> - por %xmm1, %xmm2
> - por %xmm3, %xmm4
> - por %xmm2, %xmm4
> -
> - pmovmskb %xmm4, %esi
> - testl %esi, %esi
> - jnz L(loop_end)
> -
> - addq $-(VEC_SIZE * 4), %rcx
> - cmpq %rdx, %rcx
> - jne L(loop_4x_vec)
> -
> - subl %edi, %edx
> -
> - /* Ends up being 1-byte nop. */
> - .p2align 4,, 2
> -L(last_4x_vec):
> - movaps -(VEC_SIZE)(%rcx), %xmm1
> - pcmpeqb %xmm0, %xmm1
> - pmovmskb %xmm1, %eax
> -
> - cmpl $(VEC_SIZE * 2), %edx
> - jbe L(last_2x_vec)
> -
> - testl %eax, %eax
> - jnz L(ret_vec_x0)
> -
> -
> - movaps -(VEC_SIZE * 2)(%rcx), %xmm1
> - pcmpeqb %xmm0, %xmm1
> - pmovmskb %xmm1, %eax
> -
> - testl %eax, %eax
> - jnz L(ret_vec_end)
> -
> - movaps -(VEC_SIZE * 3)(%rcx), %xmm1
> - pcmpeqb %xmm0, %xmm1
> - pmovmskb %xmm1, %eax
> -
> - subl $(VEC_SIZE * 3), %edx
> - ja L(last_vec)
> - bsrl %eax, %eax
> - jz L(ret_4)
> - addl %edx, %eax
> - jl L(zero_3)
> - addq %rdi, %rax
> -L(ret_4):
> - ret
> -
> - /* Ends up being 1-byte nop. */
> - .p2align 4,, 3
> -L(loop_end):
> - pmovmskb %xmm1, %eax
> - sall $16, %eax
> - jnz L(ret_vec_end)
> -
> - pmovmskb %xmm2, %eax
> - testl %eax, %eax
> - jnz L(ret_vec_end)
> -
> - pmovmskb %xmm3, %eax
> - /* Combine last 2 VEC matches. If eax (VEC3) is zero (no CHAR in VEC3)
> - then it won't affect the result in esi (VEC4). If eax is non-zero
> - then there is a CHAR in VEC3 and bsrl will use that position. */
> - sall $16, %eax
> - orl %esi, %eax
> - bsrl %eax, %eax
> - leaq -(VEC_SIZE * 4)(%rcx, %rax), %rax
> - ret
> -
> -L(ret_vec_end):
> - bsrl %eax, %eax
> - leaq (VEC_SIZE * -2)(%rax, %rcx), %rax
> - ret
> - /* Used in L(last_4x_vec). In the same cache line. These are just
> - spare aligning bytes. */
> -L(zero_3):
> - xorl %eax, %eax
> - ret
> - /* 2-bytes from next cache line. */
> -END(__memrchr)
> +#define MEMRCHR __memrchr
> +#include "multiarch/memrchr-sse2.S"
> weak_alias (__memrchr, memrchr)
> diff --git a/sysdeps/x86_64/multiarch/memrchr-sse2.S b/sysdeps/x86_64/multiarch/memrchr-sse2.S
> index b04202e171..d92a4022dc 100644
> --- a/sysdeps/x86_64/multiarch/memrchr-sse2.S
> +++ b/sysdeps/x86_64/multiarch/memrchr-sse2.S
> @@ -17,10 +17,338 @@
> <https://www.gnu.org/licenses/>. */
>
> #if IS_IN (libc)
> -# define __memrchr __memrchr_sse2
> +# ifndef MEMRCHR
> +# define MEMRCHR __memrchr_sse2
> +# endif
> +#endif
> +
> +#include <sysdep.h>
> +#define VEC_SIZE 16
> +#define PAGE_SIZE 4096
>
> -# undef weak_alias
> -# define weak_alias(__memrchr, memrchr)
> + .text
> +ENTRY_P2ALIGN(MEMRCHR, 6)
> +#ifdef __ILP32__
> + /* Clear upper bits. */
> + mov %RDX_LP, %RDX_LP
> #endif
> + movd %esi, %xmm0
> +
> + /* Get end pointer. */
> + leaq (%rdx, %rdi), %rcx
> +
> + punpcklbw %xmm0, %xmm0
> + punpcklwd %xmm0, %xmm0
> + pshufd $0, %xmm0, %xmm0
> +
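
For readers less used to SSE2 shuffles, a hedged intrinsics sketch (mine, not the patch's) of the broadcast sequence above; it produces the same vector _mm_set1_epi8 ((char) c) would:

    #include <emmintrin.h>

    /* Replicate the low byte of c into all 16 lanes, mirroring
       movd / punpcklbw / punpcklwd / pshufd $0.  */
    static __m128i
    broadcast_char (int c)
    {
      __m128i v = _mm_cvtsi32_si128 (c);   /* movd %esi, %xmm0 */
      v = _mm_unpacklo_epi8 (v, v);        /* punpcklbw %xmm0, %xmm0 */
      v = _mm_unpacklo_epi16 (v, v);       /* punpcklwd %xmm0, %xmm0 */
      return _mm_shuffle_epi32 (v, 0);     /* pshufd $0, %xmm0, %xmm0 */
    }
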
> + /* Check if we can load 1x VEC without crossing a page. */
> + testl $(PAGE_SIZE - VEC_SIZE), %ecx
> + jz L(page_cross)
> +
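
As a side note (my wording, not the patch's): the test above decides whether the unaligned 16-byte load ending at the end pointer can possibly touch a second page. In C it corresponds to roughly this:

    #include <stdint.h>

    /* A 16-byte load ending at `end` stays inside one page exactly when the
       end pointer's offset within its page is at least VEC_SIZE; this is
       the same condition as `testl $(PAGE_SIZE - VEC_SIZE), %ecx`.  */
    static int
    head_load_stays_in_page (const void *end)
    {
      return ((uintptr_t) end & (4096 - 16)) != 0;  /* end % 4096 >= 16 */
    }
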
> + /* NB: This load happens regardless of whether rdx (len) is zero. Since
> + it doesn't cross a page and the standard guarantees any pointer has
> + at least one valid byte, this load must be safe. For the entire
> + history of the x86 memrchr implementation this has been possible, so
> + no code "should" be relying on a zero-length check before this load.
> + The zero-length check is moved to the page-cross case because it is
> + 1) pretty cold and 2) including it pushes the hot case (len <= VEC_SIZE)
> + onto two cache lines. */
> + movups -(VEC_SIZE)(%rcx), %xmm1
> + pcmpeqb %xmm0, %xmm1
> + pmovmskb %xmm1, %eax
> +
> + subq $VEC_SIZE, %rdx
> + ja L(more_1x_vec)
> +L(ret_vec_x0_test):
> + /* Zero-flag set if eax (src) is zero. Destination unchanged if src is
> + zero. */
> + bsrl %eax, %eax
> + jz L(ret_0)
> + /* Check if the CHAR match is in bounds. Need to truly zero `eax` here
> + if out of bounds. */
> + addl %edx, %eax
> + jl L(zero_0)
> + /* Since we subtracted VEC_SIZE from rdx earlier we can just add to base
> + ptr. */
> + addq %rdi, %rax
> +L(ret_0):
> + ret
> +
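
A hedged C model (names are mine) of the L(ret_vec_x0_test) bookkeeping above, as reached from the entry path: `mask` is the compare mask for the 16 bytes ending at start + len, with len <= VEC_SIZE:

    #include <stddef.h>

    /* Pick the last in-bounds match from the head window, or NULL.  */
    static char *
    last_match_in_head (char *start, size_t len, unsigned int mask)
    {
      if (mask == 0)
        return NULL;                        /* bsrl leaves ZF set; jz L(ret_0) */
      int i = 31 - __builtin_clz (mask);    /* highest set bit, like bsrl */
      int pos = i + (int) len - 16;         /* addl %edx, %eax (edx = len - 16) */
      if (pos < 0)
        return NULL;                        /* match precedes the buffer */
      return start + pos;                   /* addq %rdi, %rax */
    }
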
> + .p2align 4,, 5
> +L(ret_vec_x0):
> + bsrl %eax, %eax
> + leaq -(VEC_SIZE)(%rcx, %rax), %rax
> + ret
> +
> + .p2align 4,, 2
> +L(zero_0):
> + xorl %eax, %eax
> + ret
> +
> +
> + .p2align 4,, 8
> +L(more_1x_vec):
> + testl %eax, %eax
> + jnz L(ret_vec_x0)
> +
> + /* Align rcx (pointer to string). */
> + decq %rcx
> + andq $-VEC_SIZE, %rcx
> +
> + movq %rcx, %rdx
> + /* NB: We could consistently save 1 byte in this pattern with `movaps
> + %xmm0, %xmm1; pcmpeq IMM8(r), %xmm1; ...`. The reason against it is
> + that it adds more frontend uops (even if the moves can be eliminated)
> + and, some percentage of the time, actual backend uops. */
> + movaps -(VEC_SIZE)(%rcx), %xmm1
> + pcmpeqb %xmm0, %xmm1
> + subq %rdi, %rdx
> + pmovmskb %xmm1, %eax
> +
> + cmpq $(VEC_SIZE * 2), %rdx
> + ja L(more_2x_vec)
> +L(last_2x_vec):
> + subl $VEC_SIZE, %edx
> + jbe L(ret_vec_x0_test)
> +
> + testl %eax, %eax
> + jnz L(ret_vec_x0)
> +
> + movaps -(VEC_SIZE * 2)(%rcx), %xmm1
> + pcmpeqb %xmm0, %xmm1
> + pmovmskb %xmm1, %eax
> +
> + subl $VEC_SIZE, %edx
> + bsrl %eax, %eax
> + jz L(ret_1)
> + addl %edx, %eax
> + jl L(zero_0)
> + addq %rdi, %rax
> +L(ret_1):
> + ret
> +
> + /* Don't align. Otherwise losing the 2-byte encoding of the jump to
> + L(page_cross) causes the hot path (length <= VEC_SIZE) to span
> + multiple cache lines. Naturally aligned % 16 to 8 bytes. */
> +L(page_cross):
> + /* Zero length check. */
> + testq %rdx, %rdx
> + jz L(zero_0)
> +
> + leaq -1(%rcx), %r8
> + andq $-(VEC_SIZE), %r8
> +
> + movaps (%r8), %xmm1
> + pcmpeqb %xmm0, %xmm1
> + pmovmskb %xmm1, %esi
> + /* Shift out negative alignment (because we are starting from endptr and
> + working backwards). */
> + negl %ecx
> + /* 32-bit shift, but VEC_SIZE=16, so the shift count needs to be
> + masked explicitly. */
> + andl $(VEC_SIZE - 1), %ecx
> + shl %cl, %esi
> + movzwl %si, %eax
> + leaq (%rdi, %rdx), %rcx
> + cmpq %rdi, %r8
> + ja L(more_1x_vec)
> + subl $VEC_SIZE, %edx
> + bsrl %eax, %eax
> + jz L(ret_2)
> + addl %edx, %eax
> + jl L(zero_1)
> + addq %rdi, %rax
> +L(ret_2):
> + ret
> +
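
One more hedged sketch (not from the patch) of the shift trick in L(page_cross) above: the aligned load covers bytes at and past `end`, and shifting the compare mask left by (-end) & 15 then keeping the low 16 bits discards them while lining the surviving bits up with the usual end-relative indexing:

    #include <stdint.h>

    /* raw_mask is pmovmskb of the aligned 16-byte chunk containing end - 1.
       Afterwards bit 15 corresponds to the byte at end - 1, and bits for
       bytes at or beyond `end` are gone.  */
    static unsigned int
    page_cross_mask (unsigned int raw_mask, const void *end)
    {
      unsigned int shift = (unsigned int) (-(uintptr_t) end) & 15;
      return (raw_mask << shift) & 0xffff;  /* shl %cl, %esi; movzwl %si, %eax */
    }
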
> + /* Fits in aligning bytes. */
> +L(zero_1):
> + xorl %eax, %eax
> + ret
> +
> + .p2align 4,, 5
> +L(ret_vec_x1):
> + bsrl %eax, %eax
> + leaq -(VEC_SIZE * 2)(%rcx, %rax), %rax
> + ret
> +
> + .p2align 4,, 8
> +L(more_2x_vec):
> + testl %eax, %eax
> + jnz L(ret_vec_x0)
> +
> + movaps -(VEC_SIZE * 2)(%rcx), %xmm1
> + pcmpeqb %xmm0, %xmm1
> + pmovmskb %xmm1, %eax
> + testl %eax, %eax
> + jnz L(ret_vec_x1)
> +
> +
> + movaps -(VEC_SIZE * 3)(%rcx), %xmm1
> + pcmpeqb %xmm0, %xmm1
> + pmovmskb %xmm1, %eax
> +
> + subq $(VEC_SIZE * 4), %rdx
> + ja L(more_4x_vec)
> +
> + addl $(VEC_SIZE), %edx
> + jle L(ret_vec_x2_test)
> +
> +L(last_vec):
> + testl %eax, %eax
> + jnz L(ret_vec_x2)
> +
> + movaps -(VEC_SIZE * 4)(%rcx), %xmm1
> + pcmpeqb %xmm0, %xmm1
> + pmovmskb %xmm1, %eax
> +
> + subl $(VEC_SIZE), %edx
> + bsrl %eax, %eax
> + jz L(ret_3)
> + addl %edx, %eax
> + jl L(zero_2)
> + addq %rdi, %rax
> +L(ret_3):
> + ret
> +
> + .p2align 4,, 6
> +L(ret_vec_x2_test):
> + bsrl %eax, %eax
> + jz L(zero_2)
> + addl %edx, %eax
> + jl L(zero_2)
> + addq %rdi, %rax
> + ret
> +
> +L(zero_2):
> + xorl %eax, %eax
> + ret
> +
> +
> + .p2align 4,, 5
> +L(ret_vec_x2):
> + bsrl %eax, %eax
> + leaq -(VEC_SIZE * 3)(%rcx, %rax), %rax
> + ret
> +
> + .p2align 4,, 5
> +L(ret_vec_x3):
> + bsrl %eax, %eax
> + leaq -(VEC_SIZE * 4)(%rcx, %rax), %rax
> + ret
> +
> + .p2align 4,, 8
> +L(more_4x_vec):
> + testl %eax, %eax
> + jnz L(ret_vec_x2)
> +
> + movaps -(VEC_SIZE * 4)(%rcx), %xmm1
> + pcmpeqb %xmm0, %xmm1
> + pmovmskb %xmm1, %eax
> +
> + testl %eax, %eax
> + jnz L(ret_vec_x3)
> +
> + addq $-(VEC_SIZE * 4), %rcx
> + cmpq $(VEC_SIZE * 4), %rdx
> + jbe L(last_4x_vec)
> +
> + /* Offset everything by 4x VEC_SIZE here to save a few bytes at the
> + end, keeping the code from spilling onto the next cache line. */
> + addq $(VEC_SIZE * 4 - 1), %rcx
> + andq $-(VEC_SIZE * 4), %rcx
> + leaq (VEC_SIZE * 4)(%rdi), %rdx
> + andq $-(VEC_SIZE * 4), %rdx
> +
> + .p2align 4,, 11
> +L(loop_4x_vec):
> + movaps (VEC_SIZE * -1)(%rcx), %xmm1
> + movaps (VEC_SIZE * -2)(%rcx), %xmm2
> + movaps (VEC_SIZE * -3)(%rcx), %xmm3
> + movaps (VEC_SIZE * -4)(%rcx), %xmm4
> + pcmpeqb %xmm0, %xmm1
> + pcmpeqb %xmm0, %xmm2
> + pcmpeqb %xmm0, %xmm3
> + pcmpeqb %xmm0, %xmm4
> +
> + por %xmm1, %xmm2
> + por %xmm3, %xmm4
> + por %xmm2, %xmm4
> +
> + pmovmskb %xmm4, %esi
> + testl %esi, %esi
> + jnz L(loop_end)
> +
> + addq $-(VEC_SIZE * 4), %rcx
> + cmpq %rdx, %rcx
> + jne L(loop_4x_vec)
> +
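
For orientation, a hedged intrinsics rendering (mine) of one iteration of the L(loop_4x_vec) body above; `p` stands for the current block end that the asm keeps in %rcx and is 16-byte aligned here:

    #include <emmintrin.h>

    /* Compare 64 bytes ending at p against the broadcast character and OR
       the four results so a single movemask/test detects any match, as the
       por/pmovmskb/testl sequence does.  */
    static int
    block_has_match (const char *p, __m128i vchar)
    {
      __m128i v1 = _mm_cmpeq_epi8 (_mm_load_si128 ((const __m128i *) (p - 16)), vchar);
      __m128i v2 = _mm_cmpeq_epi8 (_mm_load_si128 ((const __m128i *) (p - 32)), vchar);
      __m128i v3 = _mm_cmpeq_epi8 (_mm_load_si128 ((const __m128i *) (p - 48)), vchar);
      __m128i v4 = _mm_cmpeq_epi8 (_mm_load_si128 ((const __m128i *) (p - 64)), vchar);
      __m128i any = _mm_or_si128 (_mm_or_si128 (v1, v2), _mm_or_si128 (v3, v4));
      return _mm_movemask_epi8 (any) != 0;
    }
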
> + subl %edi, %edx
> +
> + /* Ends up being 1-byte nop. */
> + .p2align 4,, 2
> +L(last_4x_vec):
> + movaps -(VEC_SIZE)(%rcx), %xmm1
> + pcmpeqb %xmm0, %xmm1
> + pmovmskb %xmm1, %eax
> +
> + cmpl $(VEC_SIZE * 2), %edx
> + jbe L(last_2x_vec)
> +
> + testl %eax, %eax
> + jnz L(ret_vec_x0)
> +
> +
> + movaps -(VEC_SIZE * 2)(%rcx), %xmm1
> + pcmpeqb %xmm0, %xmm1
> + pmovmskb %xmm1, %eax
> +
> + testl %eax, %eax
> + jnz L(ret_vec_end)
> +
> + movaps -(VEC_SIZE * 3)(%rcx), %xmm1
> + pcmpeqb %xmm0, %xmm1
> + pmovmskb %xmm1, %eax
> +
> + subl $(VEC_SIZE * 3), %edx
> + ja L(last_vec)
> + bsrl %eax, %eax
> + jz L(ret_4)
> + addl %edx, %eax
> + jl L(zero_3)
> + addq %rdi, %rax
> +L(ret_4):
> + ret
> +
> + /* Ends up being 1-byte nop. */
> + .p2align 4,, 3
> +L(loop_end):
> + pmovmskb %xmm1, %eax
> + sall $16, %eax
> + jnz L(ret_vec_end)
> +
> + pmovmskb %xmm2, %eax
> + testl %eax, %eax
> + jnz L(ret_vec_end)
> +
> + pmovmskb %xmm3, %eax
> + /* Combine last 2 VEC matches. If eax (VEC3) is zero (no CHAR in VEC3)
> + then it won't affect the result in esi (VEC4). If eax is non-zero
> + then there is a CHAR in VEC3 and bsrl will use that position. */
> + sall $16, %eax
> + orl %esi, %eax
> + bsrl %eax, %eax
> + leaq -(VEC_SIZE * 4)(%rcx, %rax), %rax
> + ret
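
And a last hedged C model (my naming) of the VEC3/VEC4 combine just above, reached only when the two higher-address vectors had no match:

    /* mask3 is pmovmskb of the VEC3 compare; combined (%esi) is the OR of
       all four masks, which here reduces to mask3 | mask4.  Shifting mask3
       into bits 16..31 lets one bsr prefer a VEC3 hit over a VEC4 one.  */
    static char *
    last_match_low_half (char *block_end, unsigned int mask3,
                         unsigned int combined)
    {
      unsigned int m = (mask3 << 16) | combined;  /* sall $16; orl %esi, %eax */
      int i = 31 - __builtin_clz (m);             /* bsrl; m is non-zero here */
      return block_end - 64 + i;                  /* leaq -(4*16)(%rcx, %rax) */
    }
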
>
> -#include "../memrchr.S"
> +L(ret_vec_end):
> + bsrl %eax, %eax
> + leaq (VEC_SIZE * -2)(%rax, %rcx), %rax
> + ret
> + /* Used in L(last_4x_vec). In the same cache line. These are just
> + spare aligning bytes. */
> +L(zero_3):
> + xorl %eax, %eax
> + ret
> + /* 2-bytes from next cache line. */
> +END(MEMRCHR)
> --
> 2.34.1
>
LGTM.
Thanks.
--
H.J.