sysdeps/x86_64/multiarch/memrchr-evex.S

   1 /* memrchr optimized with 256-bit EVEX instructions.
   2    Copyright (C) 2021-2024 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4
   5    The GNU C Library is free software; you can redistribute it and/or
   6    modify it under the terms of the GNU Lesser General Public
   7    License as published by the Free Software Foundation; either
   8    version 2.1 of the License, or (at your option) any later version.
   9
  10    The GNU C Library is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13    Lesser General Public License for more details.
  14
  15    You should have received a copy of the GNU Lesser General Public
  16    License along with the GNU C Library; if not, see
  17    <https://www.gnu.org/licenses/>.  */
  18
  19 #include <isa-level.h>
  20
  21 #if ISA_SHOULD_BUILD (4)
  22
  23 # include <sysdep.h>
  24
  25 # ifndef VEC_SIZE
  26 #  include "x86-evex256-vecs.h"
  27 # endif
  28
  29 # include "reg-macros.h"
  30
  31 # ifndef MEMRCHR
  32 #  define MEMRCHR       __memrchr_evex
  33 # endif
  34
  35 # define PAGE_SIZE      4096
  36 # define VMATCH VMM(0)
  37
  38         .section SECTION(.text), "ax", @progbits
  39 ENTRY_P2ALIGN(MEMRCHR, 6)
  40 # ifdef __ILP32__
  41         /* Clear upper bits.  */
  42         and     %RDX_LP, %RDX_LP
  43 # else
  44         test    %RDX_LP, %RDX_LP
  45 # endif
  46         jz      L(zero_0)
  47
  48         /* Get end pointer. Minus one for three reasons. 1) It is
  49            necessary for a correct page cross check and 2) it correctly
  50            sets up end ptr to be subtract by lzcnt aligned. 3) it is a
  51            necessary step in aligning ptr.  */
  52         leaq    -1(%rdi, %rdx), %rax
  53         vpbroadcastb %esi, %VMATCH
  54
  55         /* Check if we can load 1x VEC without cross a page.  */
  56         testl   $(PAGE_SIZE - VEC_SIZE), %eax
  57         jz      L(page_cross)
  58
  59         /* Don't use rax for pointer here because EVEX has better
  60            encoding with offset % VEC_SIZE == 0.  */
  61         vpcmpeqb (VEC_SIZE * -1)(%rdi, %rdx), %VMATCH, %k0
  62         KMOV    %k0, %VRCX
  63
  64         /* If rcx is zero then lzcnt -> VEC_SIZE.  NB: there is a
  65            already a dependency between rcx and rsi so no worries about
  66            false-dep here.  */
  67         lzcnt   %VRCX, %VRSI
  68         /* If rdx <= rsi then either 1) rcx was non-zero (there was a
  69            match) but it was out of bounds or 2) rcx was zero and rdx
  70            was <= VEC_SIZE so we are done scanning.  */
  71         cmpq    %rsi, %rdx
  72         /* NB: Use branch to return zero/non-zero.  Common usage will
  73            branch on result of function (if return is null/non-null).
  74            This branch can be used to predict the ensuing one so there
  75            is no reason to extend the data-dependency with cmovcc.  */
  76         jbe     L(zero_0)
  77
  78         /* If rcx is zero then len must be > RDX, otherwise since we
  79            already tested len vs lzcnt(rcx) (in rsi) we are good to
  80            return this match.  */
  81         test    %VRCX, %VRCX
  82         jz      L(more_1x_vec)
  83         subq    %rsi, %rax
  84         ret
  85
  86         /* Fits in aligning bytes of first cache line for VEC_SIZE ==
  87            32.  */
  88 # if VEC_SIZE == 32
  89         .p2align 4,, 2
  90 L(zero_0):
  91         xorl    %eax, %eax
  92         ret
  93 # endif
  94
  95         .p2align 4,, 10
  96 L(more_1x_vec):
  97         /* Align rax (pointer to string).  */
  98         andq    $-VEC_SIZE, %rax
  99 L(page_cross_continue):
 100         /* Recompute length after aligning.  */
 101         subq    %rdi, %rax
 102
 103         cmpq    $(VEC_SIZE * 2), %rax
 104         ja      L(more_2x_vec)
 105
 106 L(last_2x_vec):
 107         vpcmpeqb (VEC_SIZE * -1)(%rdi, %rax), %VMATCH, %k0
 108         KMOV    %k0, %VRCX
 109
 110         test    %VRCX, %VRCX
 111         jnz     L(ret_vec_x0_test)
 112
 113         /* If VEC_SIZE == 64 need to subtract because lzcntq won't
 114            implicitly add VEC_SIZE to match position.  */
 115 # if VEC_SIZE == 64
 116         subl    $VEC_SIZE, %eax
 117 # else
 118         cmpb    $VEC_SIZE, %al
 119 # endif
 120         jle     L(zero_2)
 121
 122         /* We adjusted rax (length) for VEC_SIZE == 64 so need separate
 123            offsets.  */
 124 # if VEC_SIZE == 64
 125         vpcmpeqb (VEC_SIZE * -1)(%rdi, %rax), %VMATCH, %k0
 126 # else
 127         vpcmpeqb (VEC_SIZE * -2)(%rdi, %rax), %VMATCH, %k0
 128 # endif
 129         KMOV    %k0, %VRCX
 130         /* NB: 64-bit lzcnt. This will naturally add 32 to position for
 131            VEC_SIZE == 32.  */
 132         lzcntq  %rcx, %rcx
 133         subl    %ecx, %eax
 134         ja      L(first_vec_x1_ret)
 135         /* If VEC_SIZE == 64 put L(zero_0) here as we can't fit in the
 136            first cache line (this is the second cache line).  */
 137 # if VEC_SIZE == 64
 138 L(zero_0):
 139 # endif
 140 L(zero_2):
 141         xorl    %eax, %eax
 142         ret
 143
 144         /* NB: Fits in aligning bytes before next cache line for
 145            VEC_SIZE == 32.  For VEC_SIZE == 64 this is attached to
 146            L(first_vec_x0_test).  */
 147 # if VEC_SIZE == 32
 148 L(first_vec_x1_ret):
 149         leaq    -1(%rdi, %rax), %rax
 150         ret
 151 # endif
 152
 153         .p2align 4,, 6
 154 L(ret_vec_x0_test):
 155         lzcnt   %VRCX, %VRCX
 156         subl    %ecx, %eax
 157         jle     L(zero_2)
 158 # if VEC_SIZE == 64
 159         /* Reuse code at the end of L(ret_vec_x0_test) as we can't fit
 160            L(first_vec_x1_ret) in the same cache line as its jmp base
 161            so we might as well save code size.  */
 162 L(first_vec_x1_ret):
 163 # endif
 164         leaq    -1(%rdi, %rax), %rax
 165         ret
 166
 167         .p2align 4,, 6
 168 L(loop_last_4x_vec):
 169         /* Compute remaining length.  */
 170         subl    %edi, %eax
 171 L(last_4x_vec):
 172         cmpl    $(VEC_SIZE * 2), %eax
 173         jle     L(last_2x_vec)
 174 # if VEC_SIZE == 32
 175         /* Only align for VEC_SIZE == 32.  For VEC_SIZE == 64 we need
 176            the spare bytes to align the loop properly.  */
 177         .p2align 4,, 10
 178 # endif
 179 L(more_2x_vec):
 180
 181         /* Length > VEC_SIZE * 2 so check the first 2x VEC for match and
 182            return if either hit.  */
 183         vpcmpeqb (VEC_SIZE * -1)(%rdi, %rax), %VMATCH, %k0
 184         KMOV    %k0, %VRCX
 185
 186         test    %VRCX, %VRCX
 187         jnz     L(first_vec_x0)
 188
 189         vpcmpeqb (VEC_SIZE * -2)(%rdi, %rax), %VMATCH, %k0
 190         KMOV    %k0, %VRCX
 191         test    %VRCX, %VRCX
 192         jnz     L(first_vec_x1)
 193
 194         /* Need no matter what.  */
 195         vpcmpeqb (VEC_SIZE * -3)(%rdi, %rax), %VMATCH, %k0
 196         KMOV    %k0, %VRCX
 197
 198         /* Check if we are near the end.  */
 199         subq    $(VEC_SIZE * 4), %rax
 200         ja      L(more_4x_vec)
 201
 202         test    %VRCX, %VRCX
 203         jnz     L(first_vec_x2_test)
 204
 205         /* Adjust length for final check and check if we are at the end.
 206          */
 207         addl    $(VEC_SIZE * 1), %eax
 208         jle     L(zero_1)
 209
 210         vpcmpeqb (VEC_SIZE * -1)(%rdi, %rax), %VMATCH, %k0
 211         KMOV    %k0, %VRCX
 212
 213         lzcnt   %VRCX, %VRCX
 214         subl    %ecx, %eax
 215         ja      L(first_vec_x3_ret)
 216 L(zero_1):
 217         xorl    %eax, %eax
 218         ret
 219 L(first_vec_x3_ret):
 220         leaq    -1(%rdi, %rax), %rax
 221         ret
 222
 223         .p2align 4,, 6
 224 L(first_vec_x2_test):
 225         /* Must adjust length before check.  */
 226         subl    $-(VEC_SIZE * 2 - 1), %eax
 227         lzcnt   %VRCX, %VRCX
 228         subl    %ecx, %eax
 229         jl      L(zero_4)
 230         addq    %rdi, %rax
 231         ret
 232
 233
 234         .p2align 4,, 10
 235 L(first_vec_x0):
 236         bsr     %VRCX, %VRCX
 237         leaq    (VEC_SIZE * -1)(%rdi, %rax), %rax
 238         addq    %rcx, %rax
 239         ret
 240
 241         /* Fits unobtrusively here.  */
 242 L(zero_4):
 243         xorl    %eax, %eax
 244         ret
 245
 246         .p2align 4,, 10
 247 L(first_vec_x1):
 248         bsr     %VRCX, %VRCX
 249         leaq    (VEC_SIZE * -2)(%rdi, %rax), %rax
 250         addq    %rcx, %rax
 251         ret
 252
 253         .p2align 4,, 8
 254 L(first_vec_x3):
 255         bsr     %VRCX, %VRCX
 256         addq    %rdi, %rax
 257         addq    %rcx, %rax
 258         ret
 259
 260         .p2align 4,, 6
 261 L(first_vec_x2):
 262         bsr     %VRCX, %VRCX
 263         leaq    (VEC_SIZE * 1)(%rdi, %rax), %rax
 264         addq    %rcx, %rax
 265         ret
 266
 267         .p2align 4,, 2
 268 L(more_4x_vec):
 269         test    %VRCX, %VRCX
 270         jnz     L(first_vec_x2)
 271
 272         vpcmpeqb (%rdi, %rax), %VMATCH, %k0
 273         KMOV    %k0, %VRCX
 274
 275         test    %VRCX, %VRCX
 276         jnz     L(first_vec_x3)
 277
 278         /* Check if near end before re-aligning (otherwise might do an
 279            unnecessary loop iteration).  */
 280         cmpq    $(VEC_SIZE * 4), %rax
 281         jbe     L(last_4x_vec)
 282
 283
 284         /* NB: We setup the loop to NOT use index-address-mode for the
 285            buffer.  This costs some instructions & code size but avoids
 286            stalls due to unlaminated micro-fused instructions (as used
 287            in the loop) from being forced to issue in the same group
 288            (essentially narrowing the backend width).  */
 289
 290         /* Get endptr for loop in rdx. NB: Can't just do while rax > rdi
 291            because lengths that overflow can be valid and break the
 292            comparison.  */
 293 # if VEC_SIZE == 64
 294         /* Use rdx as intermediate to compute rax, this gets us imm8
 295            encoding which just allows the L(more_4x_vec) block to fit
 296            in 1 cache-line.  */
 297         leaq    (VEC_SIZE * 4)(%rdi), %rdx
 298         leaq    (VEC_SIZE * -1)(%rdx, %rax), %rax
 299
 300         /* No evex machine has partial register stalls. This can be
 301            replaced with: `andq $(VEC_SIZE * -4), %rax/%rdx` if that
 302            changes.  */
 303         xorb    %al, %al
 304         xorb    %dl, %dl
 305 # else
 306         leaq    (VEC_SIZE * 3)(%rdi, %rax), %rax
 307         andq    $(VEC_SIZE * -4), %rax
 308         leaq    (VEC_SIZE * 4)(%rdi), %rdx
 309         andq    $(VEC_SIZE * -4), %rdx
 310 # endif
 311
 312
 313         .p2align 4
 314 L(loop_4x_vec):
 315         /* NB: We could do the same optimization here as we do for
 316            memchr/rawmemchr by using VEX encoding in the loop for access
 317            to VEX vpcmpeqb + vpternlogd.  Since memrchr is not as hot as
 318            memchr it may not be worth the extra code size, but if the
 319            need arises it an easy ~15% perf improvement to the loop.  */
 320
 321         cmpq    %rdx, %rax
 322         je      L(loop_last_4x_vec)
 323         /* Store 1 were not-equals and 0 where equals in k1 (used to
 324            mask later on).  */
 325         vpcmpb  $4, (VEC_SIZE * -1)(%rax), %VMATCH, %k1
 326
 327         /* VEC(2/3) will have zero-byte where we found a CHAR.  */
 328         vpxorq  (VEC_SIZE * -2)(%rax), %VMATCH, %VMM(2)
 329         vpxorq  (VEC_SIZE * -3)(%rax), %VMATCH, %VMM(3)
 330         vpcmpeqb (VEC_SIZE * -4)(%rax), %VMATCH, %k4
 331
 332         /* Combine VEC(2/3) with min and maskz with k1 (k1 has zero bit
 333            where CHAR is found and VEC(2/3) have zero-byte where CHAR
 334            is found.  */
 335         vpminub %VMM(2), %VMM(3), %VMM(3){%k1}{z}
 336         vptestnmb %VMM(3), %VMM(3), %k2
 337
 338         addq    $-(VEC_SIZE * 4), %rax
 339
 340         /* Any 1s and we found CHAR.  */
 341         KORTEST %k2, %k4
 342         jz      L(loop_4x_vec)
 343
 344
 345         /* K1 has non-matches for first VEC. inc; jz will overflow rcx
 346            iff all bytes where non-matches.  */
 347         KMOV    %k1, %VRCX
 348         inc     %VRCX
 349         jnz     L(first_vec_x0_end)
 350
 351         vptestnmb %VMM(2), %VMM(2), %k0
 352         KMOV    %k0, %VRCX
 353         test    %VRCX, %VRCX
 354         jnz     L(first_vec_x1_end)
 355         KMOV    %k2, %VRCX
 356
 357         /* Separate logic for VEC_SIZE == 64 and VEC_SIZE == 32 for
 358            returning last 2x VEC. For VEC_SIZE == 64 we test each VEC
 359            individually, for VEC_SIZE == 32 we combine them in a single
 360            64-bit GPR.  */
 361 # if VEC_SIZE == 64
 362         test    %VRCX, %VRCX
 363         jnz     L(first_vec_x2_end)
 364         KMOV    %k4, %VRCX
 365 # else
 366         /* Combine last 2 VEC matches for VEC_SIZE == 32. If rcx (from
 367            VEC(3)) is zero (no CHAR in VEC(3)) then it won't affect the
 368            result in rsi (from VEC(4)). If rcx is non-zero then CHAR in
 369            VEC(3) and bsrq will use that position.  */
 370         KMOV    %k4, %VRSI
 371         salq    $32, %rcx
 372         orq     %rsi, %rcx
 373 # endif
 374         bsrq    %rcx, %rcx
 375         addq    %rcx, %rax
 376         ret
 377
 378         .p2align 4,, 4
 379 L(first_vec_x0_end):
 380         /* rcx has 1s at non-matches so we need to `not` it. We used
 381            `inc` to test if zero so use `neg` to complete the `not` so
 382            the last 1 bit represent a match.  NB: (-x + 1 == ~x).  */
 383         neg     %VRCX
 384         bsr     %VRCX, %VRCX
 385         leaq    (VEC_SIZE * 3)(%rcx, %rax), %rax
 386         ret
 387
 388         .p2align 4,, 10
 389 L(first_vec_x1_end):
 390         bsr     %VRCX, %VRCX
 391         leaq    (VEC_SIZE * 2)(%rcx, %rax), %rax
 392         ret
 393
 394 # if VEC_SIZE == 64
 395         /* Since we can't combine the last 2x VEC for VEC_SIZE == 64
 396            need return label for it.  */
 397         .p2align 4,, 4
 398 L(first_vec_x2_end):
 399         bsr     %VRCX, %VRCX
 400         leaq    (VEC_SIZE * 1)(%rcx, %rax), %rax
 401         ret
 402 # endif
 403
 404
 405         .p2align 4,, 4
 406 L(page_cross):
 407         /* only lower bits of eax[log2(VEC_SIZE):0] are set so we can
 408            use movzbl to get the amount of bytes we are checking here.
 409          */
 410         movzbl  %al, %ecx
 411         andq    $-VEC_SIZE, %rax
 412         vpcmpeqb (%rax), %VMATCH, %k0
 413         KMOV    %k0, %VRSI
 414
 415         /* eax was comptued as %rdi + %rdx - 1 so need to add back 1
 416            here.  */
 417         leal    1(%rcx), %r8d
 418
 419         /* Invert ecx to get shift count for byte matches out of range.
 420          */
 421         notl    %ecx
 422         shlx    %VRCX, %VRSI, %VRSI
 423
 424         /* if r8 < rdx then the entire [buf, buf + len] is handled in
 425            the page cross case.  NB: we can't use the trick here we use
 426            in the non page-cross case because we aren't checking full
 427            VEC_SIZE.  */
 428         cmpq    %r8, %rdx
 429         ja      L(page_cross_check)
 430         lzcnt   %VRSI, %VRSI
 431         subl    %esi, %edx
 432         ja      L(page_cross_ret)
 433         xorl    %eax, %eax
 434         ret
 435
 436 L(page_cross_check):
 437         test    %VRSI, %VRSI
 438         jz      L(page_cross_continue)
 439
 440         lzcnt   %VRSI, %VRSI
 441         subl    %esi, %edx
 442 L(page_cross_ret):
 443         leaq    -1(%rdi, %rdx), %rax
 444         ret
 445 END(MEMRCHR)
 446 #endif