From 138ff9a5d15a9dd6fcf7238074f94632d2a05acb Mon Sep 17 00:00:00 2001
From: Szabolcs Nagy
Date: Tue, 9 Feb 2021 17:56:02 +0000
Subject: [PATCH] aarch64: Optimize __libc_mtag_tag_region

This is a target hook for memory tagging, the original was a naive
implementation. The optimized version relies on "dc gva" to tag 64
bytes at a time for large allocations and optimizes small cases without
adding too many branches. This was not benchmarked on real cpu, but
expected to be faster than the naive implementation.
---
 sysdeps/aarch64/__mtag_tag_region.S | 98 +++++++++++++++++++++++------
 1 file changed, 80 insertions(+), 18 deletions(-)

diff --git a/sysdeps/aarch64/__mtag_tag_region.S b/sysdeps/aarch64/__mtag_tag_region.S
index 9a8a3ffb60..cae0c8f121 100644
--- a/sysdeps/aarch64/__mtag_tag_region.S
+++ b/sysdeps/aarch64/__mtag_tag_region.S
@@ -20,32 +20,94 @@
 
 #ifdef USE_MTAG
 
-/* Use the same register names and assignments as memset.  */
-
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64, MTE, LP64 ABI.
+ *
+ * Interface contract:
+ * Address is 16-byte aligned and size is a multiple of 16.
+ * Returns the passed pointer.
+ * The memory region may remain untagged if tagging is not enabled.
+ */
 	.arch armv8.5-a
 	.arch_extension memtag
 
-/* NB, only supported on variants with 64-bit pointers.  */
+#define dstin	x0	/* Region start (argument 0).  */
+#define count	x1	/* Size in bytes (argument 1).  */
+#define dst	x2	/* Cursor for the long-size loops.  */
+#define dstend	x3	/* dstin + count.  */
+#define tmp	x4	/* Scratch for sizes below 64.  */
+#define zva_val	x4	/* DCZID_EL0 value; shares x4 with tmp.  */
+
+ENTRY (__libc_mtag_tag_region)
+	PTR_ARG (0)
+	SIZE_ARG (1)
+
+	add	dstend, dstin, count	/* One past the last byte.  */
 
-/* FIXME: This is a minimal implementation.  We could do better than
-   this for larger values of COUNT.  */
+	cmp	count, 96
+	b.hi	L(set_long)		/* More than 96 bytes.  */
 
-#define dstin	x0
-#define count	x1
-#define dst	x2
+	tbnz	count, 6, L(set96)	/* Bit 6 set: 64, 80 or 96 bytes.  */
 
-ENTRY_ALIGN(__libc_mtag_tag_region, 6)
+	/* Set 0, 16, 32, or 48 bytes; the stores overlap below 48.  */
+	lsr	tmp, count, 5		/* 1 for 32 or 48 bytes, else 0.  */
+	add	tmp, dstin, tmp, lsl 4	/* dstin + 16 or dstin.  */
+	cbz	count, L(end)		/* Nothing to tag.  */
+	stg	dstin, [dstin]		/* First 16 bytes.  */
+	stg	dstin, [tmp]		/* Second 16 bytes, if any.  */
+	stg	dstin, [dstend, -16]	/* Last 16 bytes.  */
+L(end):
+	ret
+
+	.p2align 4
+	/* Set 64..96 bytes.  Write 64 bytes from the start and
+	   32 bytes from the end.  */
+L(set96):
+	st2g	dstin, [dstin]
+	st2g	dstin, [dstin, 32]
+	st2g	dstin, [dstend, -32]	/* May overlap the stores above.  */
+	ret
 
-	mov	dst, dstin
-L(loop):
-	stg	dst, [dst], #16
-	subs	count, count, 16
-	bne	L(loop)
-#if 0
-	/* This is not currently needed, since for now we are only called
-	   to tag memory that is taggable.  */
-	ldg	dstin, [dstin] // Recover the tag created (might be untagged).
+	.p2align 4
+	/* Size is > 96 bytes.  */
+L(set_long):
+	cmp	count, 160
+	b.lo	L(no_zva)		/* Below 160 bytes: st2g loop only.  */
+
+#ifndef SKIP_ZVA_CHECK
+	mrs	zva_val, dczid_el0
+	and	zva_val, zva_val, 31	/* Keep bits 0-4 only.  */
+	cmp	zva_val, 4		/* ZVA size is 64 bytes.  */
+	b.ne	L(no_zva)
 #endif
+	st2g	dstin, [dstin]		/* Tag the first 64 bytes.  */
+	st2g	dstin, [dstin, 32]
+	bic	dst, dstin, 63		/* Round down to 64 bytes.  */
+	sub	count, dstend, dst	/* Count is now 64 too large.  */
+	sub	count, count, 128	/* Adjust count and bias for loop.  */
+
+	.p2align 4
+L(zva_loop):
+	add	dst, dst, 64
+	dc	gva, dst		/* Tag one 64-byte block.  */
+	subs	count, count, 64
+	b.hi	L(zva_loop)
+	st2g	dstin, [dstend, -64]	/* Tag the last 64 bytes.  */
+	st2g	dstin, [dstend, -32]
 	ret
+
+L(no_zva):
+	sub	dst, dstin, 32		/* Dst is biased by -32.  */
+	sub	count, count, 64	/* Adjust count for loop.  */
+L(no_zva_loop):
+	st2g	dstin, [dst, 32]
+	st2g	dstin, [dst, 64]!	/* Writeback: dst += 64.  */
+	subs	count, count, 64
+	b.hi	L(no_zva_loop)
+	st2g	dstin, [dstend, -64]	/* Tag the last 64 bytes.  */
+	st2g	dstin, [dstend, -32]
+	ret
+
 END (__libc_mtag_tag_region)
 #endif /* USE_MTAG */
-- 
2.43.5