diff --git a/sysdeps/aarch64/strchr.S b/sysdeps/aarch64/strchr.S
index 4a75e73945..fd1b941666 100644
--- a/sysdeps/aarch64/strchr.S
+++ b/sysdeps/aarch64/strchr.S
@@ -22,118 +22,98 @@
 /* Assumptions:
  *
- * ARMv8-a, AArch64
+ * ARMv8-a, AArch64, Advanced SIMD.
+ * MTE compatible.
  */
 
-/* Arguments and results.  */
 #define srcin		x0
 #define chrin		w1
-
 #define result		x0
 
 #define src		x2
-#define tmp1		x3
-#define wtmp2		w4
-#define tmp3		x5
+#define tmp1		x1
+#define wtmp2		w3
+#define tmp3		x3
 
 #define vrepchr		v0
-#define vdata1		v1
-#define vdata2		v2
-#define vhas_nul1	v3
-#define vhas_nul2	v4
-#define vhas_chr1	v5
-#define vhas_chr2	v6
-#define vrepmask_0	v7
-#define vrepmask_c	v16
-#define vend1		v17
-#define vend2		v18
-
-	/* Core algorithm.
-	   For each 32-byte hunk we calculate a 64-bit syndrome value, with
-	   two bits per byte (LSB is always in bits 0 and 1, for both big
-	   and little-endian systems).  Bit 0 is set iff the relevant byte
-	   matched the requested character.  Bit 1 is set iff the
-	   relevant byte matched the NUL end of string (we trigger off bit0
-	   for the special case of looking for NUL).  Since the bits
-	   in the syndrome reflect exactly the order in which things occur
-	   in the original string a count_trailing_zeros() operation will
-	   identify exactly which byte is causing the termination, and why.  */
-
-/* Locals and temporaries.  */
+#define vdata		v1
+#define qdata		q1
+#define vhas_nul	v2
+#define vhas_chr	v3
+#define vrepmask	v4
+#define vrepmask2	v5
+#define vend		v6
+#define dend		d6
+
+/* Core algorithm.
+
+   For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
+   per byte.  For even bytes, bits 0-1 are set if the relevant byte matched the
+   requested character, bits 2-3 are set if the byte is NUL (or matched), and
+   bits 4-7 are not used and must be zero if none of bits 0-3 are set.  Odd
+   bytes set bits 4-7 so that adjacent bytes can be merged.  Since the bits
+   in the syndrome reflect the order in which things occur in the original
+   string, counting trailing zeros identifies exactly which byte matched.  */
 
 ENTRY (strchr)
 	DELOUSE (0)
-	mov	wtmp2, #0x0401
-	movk	wtmp2, #0x4010, lsl #16
+	bic	src, srcin, 15
 	dup	vrepchr.16b, chrin
-	bic	src, srcin, #31
-	dup	vrepmask_c.4s, wtmp2
-	ands	tmp1, srcin, #31
-	add	vrepmask_0.4s, vrepmask_c.4s, vrepmask_c.4s	// lsl #1
-	b.eq	L(loop)
-
-	/* Input string is not 32-byte aligned.  Rather than forcing
-	   the padding bytes to a safe value, we calculate the syndrome
-	   for all the bytes, but then mask off those bits of the
-	   syndrome that are related to the padding.  */
-	ld1	{vdata1.16b, vdata2.16b}, [src], #32
-	neg	tmp1, tmp1
-	cmeq	vhas_nul1.16b, vdata1.16b, #0
-	cmeq	vhas_chr1.16b, vdata1.16b, vrepchr.16b
-	cmeq	vhas_nul2.16b, vdata2.16b, #0
-	cmeq	vhas_chr2.16b, vdata2.16b, vrepchr.16b
-	and	vhas_nul1.16b, vhas_nul1.16b, vrepmask_0.16b
-	and	vhas_nul2.16b, vhas_nul2.16b, vrepmask_0.16b
-	and	vhas_chr1.16b, vhas_chr1.16b, vrepmask_c.16b
-	and	vhas_chr2.16b, vhas_chr2.16b, vrepmask_c.16b
-	orr	vend1.16b, vhas_nul1.16b, vhas_chr1.16b
-	orr	vend2.16b, vhas_nul2.16b, vhas_chr2.16b
-	lsl	tmp1, tmp1, #1
-	addp	vend1.16b, vend1.16b, vend2.16b		// 256->128
-	mov	tmp3, #~0
-	addp	vend1.16b, vend1.16b, vend2.16b		// 128->64
-	lsr	tmp1, tmp3, tmp1
-
-	mov	tmp3, vend1.2d[0]
-	bic	tmp1, tmp3, tmp1	// Mask padding bits.
-	cbnz	tmp1, L(tail)
+	ld1	{vdata.16b}, [src]
+	mov	wtmp2, 0x3003
+	dup	vrepmask.8h, wtmp2
+	cmeq	vhas_nul.16b, vdata.16b, 0
+	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
+	mov	wtmp2, 0xf00f
+	dup	vrepmask2.8h, wtmp2
+
+	bit	vhas_nul.16b, vhas_chr.16b, vrepmask.16b
+	and	vhas_nul.16b, vhas_nul.16b, vrepmask2.16b
+	lsl	tmp3, srcin, 2
+	addp	vend.16b, vhas_nul.16b, vhas_nul.16b	/* 128->64 */
+
+	fmov	tmp1, dend
+	lsr	tmp1, tmp1, tmp3
+	cbz	tmp1, L(loop)
+
+	rbit	tmp1, tmp1
+	clz	tmp1, tmp1
+	/* Tmp1 is an even multiple of 2 if the target character was
+	   found first.  Otherwise we've found the end of string.  */
+	tst	tmp1, 2
+	add	result, srcin, tmp1, lsr 2
+	csel	result, result, xzr, eq
+	ret
+
+	.p2align 4
 L(loop):
-	ld1	{vdata1.16b, vdata2.16b}, [src], #32
-	cmeq	vhas_nul1.16b, vdata1.16b, #0
-	cmeq	vhas_chr1.16b, vdata1.16b, vrepchr.16b
-	cmeq	vhas_nul2.16b, vdata2.16b, #0
-	cmeq	vhas_chr2.16b, vdata2.16b, vrepchr.16b
-	/* Use a fast check for the termination condition.  */
-	orr	vend1.16b, vhas_nul1.16b, vhas_chr1.16b
-	orr	vend2.16b, vhas_nul2.16b, vhas_chr2.16b
-	orr	vend1.16b, vend1.16b, vend2.16b
-	addp	vend1.2d, vend1.2d, vend1.2d
-	mov	tmp1, vend1.2d[0]
+	ldr	qdata, [src, 16]!
+	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
+	cmhs	vhas_nul.16b, vhas_chr.16b, vdata.16b
+	umaxp	vend.16b, vhas_nul.16b, vhas_nul.16b
+	fmov	tmp1, dend
 	cbz	tmp1, L(loop)
 
-	/* Termination condition found.  Now need to establish exactly why
-	   we terminated.  */
-	and	vhas_nul1.16b, vhas_nul1.16b, vrepmask_0.16b
-	and	vhas_nul2.16b, vhas_nul2.16b, vrepmask_0.16b
-	and	vhas_chr1.16b, vhas_chr1.16b, vrepmask_c.16b
-	and	vhas_chr2.16b, vhas_chr2.16b, vrepmask_c.16b
-	orr	vend1.16b, vhas_nul1.16b, vhas_chr1.16b
-	orr	vend2.16b, vhas_nul2.16b, vhas_chr2.16b
-	addp	vend1.16b, vend1.16b, vend2.16b		// 256->128
-	addp	vend1.16b, vend1.16b, vend2.16b		// 128->64
-
-	mov	tmp1, vend1.2d[0]
-L(tail):
-	sub	src, src, #32
+#ifdef __AARCH64EB__
+	bif	vhas_nul.16b, vhas_chr.16b, vrepmask.16b
+	and	vhas_nul.16b, vhas_nul.16b, vrepmask2.16b
+	addp	vend.16b, vhas_nul.16b, vhas_nul.16b	/* 128->64 */
+	fmov	tmp1, dend
+#else
+	bit	vhas_nul.16b, vhas_chr.16b, vrepmask.16b
+	and	vhas_nul.16b, vhas_nul.16b, vrepmask2.16b
+	addp	vend.16b, vhas_nul.16b, vhas_nul.16b	/* 128->64 */
+	fmov	tmp1, dend
 	rbit	tmp1, tmp1
+#endif
 	clz	tmp1, tmp1
-	/* Tmp1 is even if the target charager was found first.  Otherwise
-	   we've found the end of string and we weren't looking for NUL.  */
-	tst	tmp1, #1
-	add	result, src, tmp1, lsr #1
+	/* Tmp1 is an even multiple of 2 if the target character was
+	   found first.  Otherwise we've found the end of string.  */
+	tst	tmp1, 2
+	add	result, src, tmp1, lsr 2
 	csel	result, result, xzr, eq
 	ret
+
 END (strchr)
 
 libc_hidden_builtin_def (strchr)
 weak_alias (strchr, index)
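
For readers unfamiliar with the syndrome trick described in the new header comment, below is a rough scalar C model of the encoding.  It is purely illustrative and not part of the patch: the real code builds the syndrome with Advanced SIMD (cmeq/cmhs/bit/and/addp), only ever loads whole aligned 16-byte chunks (which is what makes it MTE safe), and shifts out the bytes before srcin on the first iteration.  The function name strchr_model and the per-byte loop are hypothetical simplifications.

/* Scalar sketch of the 4-bits-per-byte syndrome used by the new strchr.
   Illustrative only: it ignores alignment and may read past the
   terminator, which the real SIMD code carefully avoids.  */
#include <stdint.h>
#include <stddef.h>

static char *
strchr_model (const char *s, int c)
{
  const unsigned char ch = (unsigned char) c;
  const unsigned char *p = (const unsigned char *) s;

  for (;;)
    {
      /* Build a 64-bit syndrome for the next 16 bytes, one nibble per
	 byte, mirroring the vrepmask/vrepmask2 constants in the patch.  */
      uint64_t syndrome = 0;
      for (int i = 0; i < 16; i++)
	{
	  unsigned nibble = 0;
	  if (p[i] == ch)
	    nibble |= 0x3;	/* Bits 0-1: character match.  */
	  if (p[i] == 0 || p[i] == ch)
	    nibble |= 0xc;	/* Bits 2-3: NUL (or match).  */
	  syndrome |= (uint64_t) nibble << (i * 4);
	}

      if (syndrome != 0)
	{
	  /* Counting trailing zeros finds the first interesting byte.
	     A multiple of 4 means the match bit fired first; a remainder
	     of 2 means we hit the terminating NUL before any match.  */
	  int tz = __builtin_ctzll (syndrome);
	  return (tz & 2) ? NULL : (char *) (p + (tz >> 2));
	}
      p += 16;
    }
}

Note how the tz & 2 test and the tz >> 2 byte offset correspond directly to the "tst tmp1, 2" / "add result, src, tmp1, lsr 2" pair in the assembly, and how a NUL search (ch == 0) still returns a pointer to the terminator because the match bits are set as well.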