/* memrchr optimized with 256-bit EVEX instructions.
   Copyright (C) 2021-2024 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <isa-level.h>

#if ISA_SHOULD_BUILD (4)

# include <sysdep.h>
# include "x86-evex256-vecs.h"
# include "reg-macros.h"

# define MEMRCHR	__memrchr_evex
# define PAGE_SIZE	4096
# define VMATCH	VMM(0)

	.section SECTION(.text), "ax", @progbits
ENTRY_P2ALIGN(MEMRCHR, 6)
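	/* memrchr (const void *s, int c, size_t n): s in rdi, c in esi,
	   n in rdx.  Returns a pointer to the last occurrence of
	   (unsigned char) c within the first n bytes of s, or NULL.  */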
# ifdef __ILP32__
	/* Clear upper bits.  */
	and	%RDX_LP, %RDX_LP
# else
	test	%RDX_LP, %RDX_LP
# endif
	jz	L(zero_0)

	/* Get end pointer.  Minus one for three reasons.  1) It is
	   necessary for a correct page cross check.  2) It sets up the
	   end pointer so that the match address can be computed by
	   subtracting the lzcnt of the match mask.  3) It is a
	   necessary step in aligning the pointer.  */
	leaq	-1(%rdi, %rdx), %rax
	vpbroadcastb %esi, %VMATCH
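	/* VMATCH now has CHAR broadcast to every byte lane; the compares
	   below all match against it.  */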

	/* Check if we can load 1x VEC without crossing a page.  */
	testl	$(PAGE_SIZE - VEC_SIZE), %eax
	jz	L(page_cross)
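	/* For VEC_SIZE == 32 the mask above is 0xfe0, so the branch is
	   taken only when (end - 1) % PAGE_SIZE < VEC_SIZE, i.e. when a
	   VEC_SIZE load ending at end - 1 might extend into the
	   preceding page.  */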

	/* Don't use rax for the pointer here because EVEX has better
	   encoding with offset % VEC_SIZE == 0.  */
	vpcmpeqb (VEC_SIZE * -1)(%rdi, %rdx), %VMATCH, %k0
	KMOV	%k0, %VRCX

	/* If rcx is zero then lzcnt -> VEC_SIZE.  NB: there is already
	   a dependency between rcx and rsi so no worries about a
	   false-dep here.  */
	lzcnt	%VRCX, %VRSI
	/* If rdx <= rsi then either 1) rcx was non-zero (there was a
	   match) but it was out of bounds or 2) rcx was zero and rdx
	   was <= VEC_SIZE so we are done scanning.  */
	cmpq	%rsi, %rdx
	/* NB: Use a branch to return zero/non-zero.  Common usage will
	   branch on the result of the function (if the return is
	   null/non-null).  This branch can be used to predict the
	   ensuing one so there is no reason to extend the
	   data-dependency with cmovcc.  */
	jbe	L(zero_0)

	/* If rcx is zero then lzcnt (in rsi) is VEC_SIZE, so len must
	   be > VEC_SIZE and we have to keep scanning.  Otherwise we
	   already tested len vs lzcnt(rcx) (in rsi) above, so we are
	   good to return this match.  */
	test	%VRCX, %VRCX
	jz	L(more_1x_vec)
	subq	%rsi, %rax
	ret

	/* Fits in the aligning bytes of the first cache line for
	   VEC_SIZE == 32.  */
# if VEC_SIZE == 32
L(zero_0):
	xorl	%eax, %eax
	ret
# endif

L(more_1x_vec):
	/* Align rax (pointer to string).  */
	andq	$-VEC_SIZE, %rax
L(page_cross_continue):
	/* Recompute length after aligning.  */
	subq	%rdi, %rax

	cmpq	$(VEC_SIZE * 2), %rax
	ja	L(more_2x_vec)
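	/* From here on rax is the length from rdi up to the
	   VEC_SIZE-aligned end, so (%rdi, %rax) addressing reaches the
	   aligned end of the buffer.  */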

L(last_2x_vec):
	vpcmpeqb (VEC_SIZE * -1)(%rdi, %rax), %VMATCH, %k0
	KMOV	%k0, %VRCX
	test	%VRCX, %VRCX
	jnz	L(ret_vec_x0_test)

	/* If VEC_SIZE == 64 need to subtract because lzcntq won't
	   implicitly add VEC_SIZE to the match position.  */

	/* We adjusted rax (length) for VEC_SIZE == 64 so need separate
	   offsets.  */
# if VEC_SIZE == 64
	vpcmpeqb (VEC_SIZE * -1)(%rdi, %rax), %VMATCH, %k0
# else
	vpcmpeqb (VEC_SIZE * -2)(%rdi, %rax), %VMATCH, %k0
# endif
	KMOV	%k0, %VRCX
	/* NB: 64-bit lzcnt.  This will naturally add 32 to the position
	   for VEC_SIZE == 32.  */
	lzcntq	%rcx, %rcx
	subl	%ecx, %eax
	ja	L(first_vec_x1_ret)
	/* If VEC_SIZE == 64 put L(zero_0) here as we can't fit it in
	   the first cache line (this is the second cache line).  */

	/* NB: Fits in the aligning bytes before the next cache line for
	   VEC_SIZE == 32.  For VEC_SIZE == 64 this is attached to
	   L(ret_vec_x0_test).  */
L(first_vec_x1_ret):
	leaq	-1(%rdi, %rax), %rax
	ret

L(ret_vec_x0_test):
	lzcnt	%VRCX, %VRCX
	subl	%ecx, %eax
	/* Reuse the code at the end of L(ret_vec_x0_test) as we can't
	   fit L(first_vec_x1_ret) in the same cache line as its jmp
	   base, so we might as well save code size.  */
	leaq	-1(%rdi, %rax), %rax
	ret
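	/* lzcnt return math: the compared VEC ends at rdi + rax - 1, so
	   a match in its highest-addressed byte gives lzcnt == 0, rax is
	   unchanged and rdi + rax - 1 is returned; every additional
	   leading zero moves the result one byte lower.  */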

L(loop_last_4x_vec):
	/* Compute remaining length.  */
	subl	%edi, %eax
L(last_4x_vec):
	cmpl	$(VEC_SIZE * 2), %eax
	jle	L(last_2x_vec)

	/* Only align here for VEC_SIZE == 32.  For VEC_SIZE == 64 we
	   need the spare bytes to align the loop properly.  */
L(more_2x_vec):
	/* Length > VEC_SIZE * 2 so check the first 2x VEC for a match
	   and return if either hits.  */
	vpcmpeqb (VEC_SIZE * -1)(%rdi, %rax), %VMATCH, %k0

	vpcmpeqb (VEC_SIZE * -2)(%rdi, %rax), %VMATCH, %k0

	/* Needed no matter what.  */
	vpcmpeqb (VEC_SIZE * -3)(%rdi, %rax), %VMATCH, %k0
	KMOV	%k0, %VRCX

	/* Check if we are near the end.  */
	subq	$(VEC_SIZE * 4), %rax

	test	%VRCX, %VRCX
	jnz	L(first_vec_x2_test)

	/* Adjust length for the final check and check if we are at the
	   end.  */
	addl	$(VEC_SIZE * 1), %eax
	jle	L(zero_1)

	vpcmpeqb (VEC_SIZE * -1)(%rdi, %rax), %VMATCH, %k0
	KMOV	%k0, %VRCX
	lzcnt	%VRCX, %VRCX
	subl	%ecx, %eax
	ja	L(first_vec_x3_ret)
L(zero_1):
	xorl	%eax, %eax
	ret
L(first_vec_x3_ret):
	leaq	-1(%rdi, %rax), %rax
	ret

L(first_vec_x2_test):
	/* Must adjust length before check.  */
	subl	$-(VEC_SIZE * 2 - 1), %eax

	leaq	(VEC_SIZE * -1)(%rdi, %rax), %rax

	/* Fits unobtrusively here.  */

	leaq	(VEC_SIZE * -2)(%rdi, %rax), %rax

	leaq	(VEC_SIZE * 1)(%rdi, %rax), %rax

	vpcmpeqb (%rdi, %rax), %VMATCH, %k0
	KMOV	%k0, %VRCX

	/* Check if near the end before re-aligning (otherwise we might
	   do an unnecessary loop iteration).  */
	cmpq	$(VEC_SIZE * 4), %rax
	jbe	L(last_4x_vec)

	/* NB: We set up the loop to NOT use index-address-mode for the
	   buffer.  This costs some instructions & code size but avoids
	   stalls from unlaminated micro-fused instructions (as used in
	   the loop) being forced to issue in the same group
	   (essentially narrowing the backend width).  */

	/* Get the endptr for the loop in rdx.  NB: Can't just do while
	   rax > rdi because lengths that overflow can be valid and
	   break the comparison.  */
# if VEC_SIZE == 64
	/* Use rdx as an intermediate to compute rax, this gets us imm8
	   encoding which just allows the L(more_4x_vec) block to fit
	   in 1 cache-line.  */
	leaq	(VEC_SIZE * 4)(%rdi), %rdx
	leaq	(VEC_SIZE * -1)(%rdx, %rax), %rax

	/* No evex machine has partial register stalls.  This can be
	   replaced with: `andq $(VEC_SIZE * -4), %rax/%rdx` if that
	   ever changes.  */
	xorb	%al, %al
	xorb	%dl, %dl
# else
	leaq	(VEC_SIZE * 3)(%rdi, %rax), %rax
	andq	$(VEC_SIZE * -4), %rax
	leaq	(VEC_SIZE * 4)(%rdi), %rdx
	andq	$(VEC_SIZE * -4), %rdx
# endif
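	/* Both rax and rdx are now (VEC_SIZE * 4)-aligned: rax is the
	   end pointer the loop steps downward and rdx is the address at
	   which the loop stops and falls to L(loop_last_4x_vec).  */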

L(loop_4x_vec):
	/* NB: We could do the same optimization here as we do for
	   memchr/rawmemchr by using VEX encoding in the loop for access
	   to VEX vpcmpeqb + vpternlogd.  Since memrchr is not as hot as
	   memchr it may not be worth the extra code size, but if the
	   need arises it is an easy ~15% perf improvement to the loop.  */

	cmpq	%rdx, %rax
	je	L(loop_last_4x_vec)
	/* Store 1 where not-equal and 0 where equal in k1 (used to
	   mask later on).  */
	vpcmpb	$4, (VEC_SIZE * -1)(%rax), %VMATCH, %k1

	/* VEC(2/3) will have a zero byte where we found CHAR.  */
	vpxorq	(VEC_SIZE * -2)(%rax), %VMATCH, %VMM(2)
	vpxorq	(VEC_SIZE * -3)(%rax), %VMATCH, %VMM(3)
	vpcmpeqb (VEC_SIZE * -4)(%rax), %VMATCH, %k4

	/* Combine VEC(2/3) with min and maskz with k1 (k1 has a zero
	   bit where CHAR is found and VEC(2/3) have a zero byte where
	   CHAR is found).  */
	vpminub	%VMM(2), %VMM(3), %VMM(3){%k1}{z}
	vptestnmb %VMM(3), %VMM(3), %k2

	addq	$-(VEC_SIZE * 4), %rax

	/* Any 1s and we found CHAR.  */
	KORTEST %k2, %k4
	jz	L(loop_4x_vec)
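	/* The zero-masked min folds three compares into one test: a byte
	   of VMM(3) is zero iff VEC(2) or VEC(3) had CHAR there, or k1
	   zero-masked it because VEC(1) had CHAR, so k2 is non-zero
	   exactly when CHAR occurs in one of the three higher-addressed
	   VECs; k4 covers the lowest VEC.  */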

	/* K1 has non-matches for the first VEC.  inc; jz will overflow
	   rcx iff all bytes were non-matches.  */
	KMOV	%k1, %VRCX
	inc	%VRCX
	jnz	L(first_vec_x0_end)

	vptestnmb %VMM(2), %VMM(2), %k0
	KMOV	%k0, %VRCX
	test	%VRCX, %VRCX
	jnz	L(first_vec_x1_end)
	KMOV	%k2, %VRCX

	/* Separate logic for VEC_SIZE == 64 and VEC_SIZE == 32 for
	   returning the last 2x VEC.  For VEC_SIZE == 64 we test each
	   VEC individually, for VEC_SIZE == 32 we combine them in a
	   single 64-bit GPR.  */
# if VEC_SIZE == 64
	test	%VRCX, %VRCX
	jnz	L(first_vec_x2_end)
	KMOV	%k4, %VRCX
# else
	/* Combine the last 2 VEC matches for VEC_SIZE == 32.  If rcx
	   (from VEC(3)) is zero (no CHAR in VEC(3)) then it won't
	   affect the result in rsi (from VEC(4)).  If rcx is non-zero
	   then CHAR is in VEC(3) and bsrq will use that position.  */
	KMOV	%k4, %VRSI
	salq	$32, %rcx
	orq	%rsi, %rcx
# endif
	bsrq	%rcx, %rcx
	addq	%rcx, %rax
	ret
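	/* After salq/orq, bits 63..32 of rcx hold the VEC(3) matches and
	   bits 31..0 the VEC(4) matches, so the bit index found by bsrq
	   is exactly the byte offset of the last CHAR from rax (rax was
	   already moved down to the start of this 4x VEC group).  */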

L(first_vec_x0_end):
	/* rcx has 1s at non-matches so we need to `not` it.  We used
	   `inc` to test if zero so use `neg` to complete the `not` so
	   the last 1 bit represents a match.  NB: (-(x + 1) == ~x).  */
	neg	%VRCX
	bsr	%VRCX, %VRCX
	leaq	(VEC_SIZE * 3)(%rcx, %rax), %rax
	ret
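	/* e.g. for a 4-bit mask: non-matches x = 0b1011, inc -> 0b1100,
	   neg -> 0b0100 = ~x, and bsr picks bit 2, the last match.  */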

L(first_vec_x1_end):
	bsr	%VRCX, %VRCX
	leaq	(VEC_SIZE * 2)(%rcx, %rax), %rax
	ret

# if VEC_SIZE == 64
	/* Since we can't combine the last 2x VEC for VEC_SIZE == 64 we
	   need a return label for it.  */
L(first_vec_x2_end):
	bsr	%VRCX, %VRCX
	leaq	(VEC_SIZE * 1)(%rcx, %rax), %rax
	ret
# endif

L(page_cross):
	/* Only the lower bits eax[log2(VEC_SIZE):0] can be set, so we
	   can use movzbl to get the number of bytes we are checking
	   here.  */
	movzbl	%al, %ecx
	andq	$-VEC_SIZE, %rax
	vpcmpeqb (%rax), %VMATCH, %k0
	KMOV	%k0, %VRSI

	/* eax was computed as %rdi + %rdx - 1 so we need to add back 1
	   here.  */
	leal	1(%rcx), %r8d

	/* Invert ecx to get the shift count for byte matches out of
	   range.  */
	notl	%ecx
	shlx	%VRCX, %VRSI, %VRSI
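	/* In the bits shlx actually uses, notl yields
	   VEC_SIZE - 1 - ((end - 1) % VEC_SIZE), so the shift moves the
	   bit for byte end - 1 to the top of VRSI and shifts out any
	   matches at or beyond end.  */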

	/* If rdx <= r8 then the entire [buf, buf + len] is handled in
	   this page cross case.  NB: we can't use the trick we use in
	   the non page-cross case because we aren't checking a full
	   VEC_SIZE.  */
	cmpq	%r8, %rdx
	ja	L(page_cross_check)
	lzcnt	%VRSI, %VRSI
	subl	%esi, %edx
	ja	L(page_cross_ret)
	xorl	%eax, %eax
	ret

L(page_cross_check):
	test	%VRSI, %VRSI
	jz	L(page_cross_continue)

	lzcnt	%VRSI, %VRSI
	subq	%rsi, %rdx
L(page_cross_ret):
	leaq	-1(%rdi, %rdx), %rax
	ret
END(MEMRCHR)
#endif