/* Assumptions:
*
- * ARMv8-a, AArch64
+ * ARMv8-a, AArch64, Advanced SIMD.
+ * MTE compatible.
*/
-/* Arguments and results. */
#define srcin x0
#define chrin w1
-
#define result x0
#define src x2
-#define tmp1 x3
-#define wtmp2 w4
-#define tmp3 x5
+#define tmp1 x1
+#define wtmp2 w3
+#define tmp3 x3
#define vrepchr v0
-#define vdata1 v1
-#define vdata2 v2
-#define vhas_nul1 v3
-#define vhas_nul2 v4
-#define vhas_chr1 v5
-#define vhas_chr2 v6
-#define vrepmask_0 v7
-#define vrepmask_c v16
-#define vend1 v17
-#define vend2 v18
-
- /* Core algorithm.
- For each 32-byte hunk we calculate a 64-bit syndrome value, with
- two bits per byte (LSB is always in bits 0 and 1, for both big
- and little-endian systems). Bit 0 is set iff the relevant byte
- matched the requested character. Bit 1 is set iff the
- relevant byte matched the NUL end of string (we trigger off bit0
- for the special case of looking for NUL). Since the bits
- in the syndrome reflect exactly the order in which things occur
- in the original string a count_trailing_zeros() operation will
- identify exactly which byte is causing the termination, and why. */
-
-/* Locals and temporaries. */
+#define vdata v1
+#define qdata q1
+#define vhas_nul v2
+#define vhas_chr v3
+#define vrepmask v4
+#define vrepmask2 v5
+#define vend v6
+#define dend d6
+
+/* Core algorithm.
+
+ For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
+ per byte. For even bytes, bits 0-1 are set if the relevant byte matched the
+ requested character, bits 2-3 are set if the byte is NUL (or matched), and
+ bits 4-7 are not used and must be zero if none of bits 0-3 are set. Odd
+ bytes set bits 4-7 so that adjacent bytes can be merged. Since the bits
+ in the syndrome reflect the order in which things occur in the original
+ string, counting trailing zeros identifies exactly which byte matched. */
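+
+/* Worked example (little-endian, assuming srcin is 16-byte aligned): when
+   searching for 'b' in "ab...", byte 0 ('a') yields an all-zero nibble and
+   byte 1 ('b') sets its character-match bits, so the lowest set bit of the
+   syndrome is bit 4; RBIT+CLZ give 4, and 4 >> 2 is 1, the offset of 'b'.  */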
ENTRY (strchr)
DELOUSE (0)
- mov wtmp2, #0x0401
- movk wtmp2, #0x4010, lsl #16
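+ /* Start from the aligned 16-byte block containing srcin; MTE tag granules
+    are 16 bytes, so this aligned load never crosses into a granule the
+    string does not reach.  */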
+ bic src, srcin, 15
dup vrepchr.16b, chrin
- bic src, srcin, #31
- dup vrepmask_c.4s, wtmp2
- ands tmp1, srcin, #31
- add vrepmask_0.4s, vrepmask_c.4s, vrepmask_c.4s // lsl #1
- b.eq L(loop)
-
- /* Input string is not 32-byte aligned. Rather than forcing
- the padding bytes to a safe value, we calculate the syndrome
- for all the bytes, but then mask off those bits of the
- syndrome that are related to the padding. */
- ld1 {vdata1.16b, vdata2.16b}, [src], #32
- neg tmp1, tmp1
- cmeq vhas_nul1.16b, vdata1.16b, #0
- cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b
- cmeq vhas_nul2.16b, vdata2.16b, #0
- cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b
- and vhas_nul1.16b, vhas_nul1.16b, vrepmask_0.16b
- and vhas_nul2.16b, vhas_nul2.16b, vrepmask_0.16b
- and vhas_chr1.16b, vhas_chr1.16b, vrepmask_c.16b
- and vhas_chr2.16b, vhas_chr2.16b, vrepmask_c.16b
- orr vend1.16b, vhas_nul1.16b, vhas_chr1.16b
- orr vend2.16b, vhas_nul2.16b, vhas_chr2.16b
- lsl tmp1, tmp1, #1
- addp vend1.16b, vend1.16b, vend2.16b // 256->128
- mov tmp3, #~0
- addp vend1.16b, vend1.16b, vend2.16b // 128->64
- lsr tmp1, tmp3, tmp1
-
- mov tmp3, vend1.2d[0]
- bic tmp1, tmp3, tmp1 // Mask padding bits.
- cbnz tmp1, L(tail)
+ ld1 {vdata.16b}, [src]
+ mov wtmp2, 0x3003
+ dup vrepmask.8h, wtmp2
+ cmeq vhas_nul.16b, vdata.16b, 0
+ cmeq vhas_chr.16b, vdata.16b, vrepchr.16b
+ mov wtmp2, 0xf00f
+ dup vrepmask2.8h, wtmp2
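+ /* vrepmask selects where the character-match bits go (bits 0-1 of even
+    bytes, 4-5 of odd bytes); vrepmask2 keeps the low nibble of even bytes
+    and the high nibble of odd bytes, giving four syndrome bits per byte.  */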
+
+ bit vhas_nul.16b, vhas_chr.16b, vrepmask.16b
+ and vhas_nul.16b, vhas_nul.16b, vrepmask2.16b
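+ /* The string may start part-way through this block: with four syndrome
+    bits per byte, shifting right by 4 * (srcin & 15) drops the bits for
+    bytes before srcin (the variable LSR only uses the low six bits).  */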
+ lsl tmp3, srcin, 2
+ addp vend.16b, vhas_nul.16b, vhas_nul.16b /* 128->64 */
+
+ fmov tmp1, dend
+ lsr tmp1, tmp1, tmp3
+ cbz tmp1, L(loop)
+
+ rbit tmp1, tmp1
+ clz tmp1, tmp1
+ /* Tmp1 is a multiple of 4 if the target character was found
+ first. Otherwise we've found the end of the string. */
+ tst tmp1, 2
+ add result, srcin, tmp1, lsr 2
+ csel result, result, xzr, eq
+ ret
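+
+ /* Main loop: each iteration loads the next aligned 16-byte block, so loads
+    never cross into an MTE granule the string has not reached.  */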
+ .p2align 4
L(loop):
- ld1 {vdata1.16b, vdata2.16b}, [src], #32
- cmeq vhas_nul1.16b, vdata1.16b, #0
- cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b
- cmeq vhas_nul2.16b, vdata2.16b, #0
- cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b
- /* Use a fast check for the termination condition. */
- orr vend1.16b, vhas_nul1.16b, vhas_chr1.16b
- orr vend2.16b, vhas_nul2.16b, vhas_chr2.16b
- orr vend1.16b, vend1.16b, vend2.16b
- addp vend1.2d, vend1.2d, vend1.2d
- mov tmp1, vend1.2d[0]
+ ldr qdata, [src, 16]!
+ cmeq vhas_chr.16b, vdata.16b, vrepchr.16b
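+ /* vhas_chr bytes are 0xff on a match, so an unsigned >= compare against
+    the data flags matching bytes and NUL bytes in one instruction; UMAXP
+    then folds this to 64 bits for a cheap loop-exit test.  */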
+ cmhs vhas_nul.16b, vhas_chr.16b, vdata.16b
+ umaxp vend.16b, vhas_nul.16b, vhas_nul.16b
+ fmov tmp1, dend
cbz tmp1, L(loop)
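+ /* A match or NUL was seen: rebuild the full 4-bit-per-byte syndrome as in
+    the first block. Big-endian swaps which half of each nibble carries the
+    character bits (BIF instead of BIT) and omits the RBIT, because LDR has
+    already byte-reversed the chunk in the register.  */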
- /* Termination condition found. Now need to establish exactly why
- we terminated. */
- and vhas_nul1.16b, vhas_nul1.16b, vrepmask_0.16b
- and vhas_nul2.16b, vhas_nul2.16b, vrepmask_0.16b
- and vhas_chr1.16b, vhas_chr1.16b, vrepmask_c.16b
- and vhas_chr2.16b, vhas_chr2.16b, vrepmask_c.16b
- orr vend1.16b, vhas_nul1.16b, vhas_chr1.16b
- orr vend2.16b, vhas_nul2.16b, vhas_chr2.16b
- addp vend1.16b, vend1.16b, vend2.16b // 256->128
- addp vend1.16b, vend1.16b, vend2.16b // 128->64
-
- mov tmp1, vend1.2d[0]
-L(tail):
- sub src, src, #32
+#ifdef __AARCH64EB__
+ bif vhas_nul.16b, vhas_chr.16b, vrepmask.16b
+ and vhas_nul.16b, vhas_nul.16b, vrepmask2.16b
+ addp vend.16b, vhas_nul.16b, vhas_nul.16b /* 128->64 */
+ fmov tmp1, dend
+#else
+ bit vhas_nul.16b, vhas_chr.16b, vrepmask.16b
+ and vhas_nul.16b, vhas_nul.16b, vrepmask2.16b
+ addp vend.16b, vhas_nul.16b, vhas_nul.16b /* 128->64 */
+ fmov tmp1, dend
rbit tmp1, tmp1
+#endif
clz tmp1, tmp1
- /* Tmp1 is even if the target charager was found first. Otherwise
- we've found the end of string and we weren't looking for NUL. */
- tst tmp1, #1
- add result, src, tmp1, lsr #1
+ /* Tmp1 is a multiple of 4 if the target character was found
+ first. Otherwise we've found the end of the string. */
+ tst tmp1, 2
+ add result, src, tmp1, lsr 2
csel result, result, xzr, eq
ret
+
END (strchr)
libc_hidden_builtin_def (strchr)
weak_alias (strchr, index)