AArch64: Optimize memset

author Wilco Dijkstra <wilco.dijkstra@arm.com>

Mon, 9 Sep 2024 14:26:47 +0000 (15:26 +0100)

committer Wilco Dijkstra <wilco.dijkstra@arm.com>

Mon, 9 Sep 2024 14:30:00 +0000 (15:30 +0100)
author Wilco Dijkstra <wilco.dijkstra@arm.com>
Mon, 9 Sep 2024 14:26:47 +0000 (15:26 +0100)
committer Wilco Dijkstra <wilco.dijkstra@arm.com>
Mon, 9 Sep 2024 14:30:00 +0000 (15:30 +0100)
diff --git a/sysdeps/aarch64/memset.S b/sysdeps/aarch64/memset.S

index 7ef77ee8c926de211a2c6d193a23f49808481a82..caafb019e2b6217b8a7ebc48a453a48a66ce76d3 100644 (file)
--- a/sysdeps/aarch64/memset.S
+++ b/sysdeps/aarch64/memset.S
@@ -1,4 +1,5 @@
-/* Copyright (C) 2012-2024 Free Software Foundation, Inc.
+/* Generic optimized memset using SIMD.
+   Copyright (C) 2012-2024 Free Software Foundation, Inc.
  
     This file is part of the GNU C Library.
  
@@ -17,7 +18,6 @@
     <https://www.gnu.org/licenses/>.  */
  
  #include <sysdep.h>
-#include "memset-reg.h"
  
  #ifndef MEMSET
  # define MEMSET memset
@@ -25,130 +25,132 @@
  
  /* Assumptions:
   *
- * ARMv8-a, AArch64, unaligned accesses
+ * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
   *
   */
  
-ENTRY (MEMSET)
+#define dstin  x0
+#define val    x1
+#define valw   w1
+#define count  x2
+#define dst    x3
+#define dstend x4
+#define zva_val        x5
+#define off    x3
+#define dstend2        x5
  
+ENTRY (MEMSET)
         PTR_ARG (0)
         SIZE_ARG (2)
  
         dup     v0.16B, valw
+       cmp     count, 16
+       b.lo    L(set_small)
+
         add     dstend, dstin, count
+       cmp     count, 64
+       b.hs    L(set_128)
  
-       cmp     count, 96
-       b.hi    L(set_long)
-       cmp     count, 16
-       b.hs    L(set_medium)
-       mov     val, v0.D[0]
+       /* Set 16..63 bytes.  */
+       mov     off, 16
+       and     off, off, count, lsr 1
+       sub     dstend2, dstend, off
+       str     q0, [dstin]
+       str     q0, [dstin, off]
+       str     q0, [dstend2, -16]
+       str     q0, [dstend, -16]
+       ret
  
+       .p2align 4
         /* Set 0..15 bytes.  */
-       tbz     count, 3, 1f
-       str     val, [dstin]
-       str     val, [dstend, -8]
-       ret
-       nop
-1:     tbz     count, 2, 2f
-       str     valw, [dstin]
-       str     valw, [dstend, -4]
+L(set_small):
+       add     dstend, dstin, count
+       cmp     count, 4
+       b.lo    2f
+       lsr     off, count, 3
+       sub     dstend2, dstend, off, lsl 2
+       str     s0, [dstin]
+       str     s0, [dstin, off, lsl 2]
+       str     s0, [dstend2, -4]
+       str     s0, [dstend, -4]
         ret
+
+       /* Set 0..3 bytes.  */
  2:     cbz     count, 3f
+       lsr     off, count, 1
         strb    valw, [dstin]
-       tbz     count, 1, 3f
-       strh    valw, [dstend, -2]
+       strb    valw, [dstin, off]
+       strb    valw, [dstend, -1]
  3:     ret
  
-       /* Set 17..96 bytes.  */
-L(set_medium):
-       str     q0, [dstin]
-       tbnz    count, 6, L(set96)
-       str     q0, [dstend, -16]
-       tbz     count, 5, 1f
-       str     q0, [dstin, 16]
-       str     q0, [dstend, -32]
-1:     ret
-
         .p2align 4
-       /* Set 64..96 bytes.  Write 64 bytes from the start and
-          32 bytes from the end.  */
-L(set96):
-       str     q0, [dstin, 16]
+L(set_128):
+       bic     dst, dstin, 15
+       cmp     count, 128
+       b.hi    L(set_long)
+       stp     q0, q0, [dstin]
         stp     q0, q0, [dstin, 32]
+       stp     q0, q0, [dstend, -64]
         stp     q0, q0, [dstend, -32]
         ret
  
-       .p2align 3
-       nop
+       .p2align 4
  L(set_long):
-       and     valw, valw, 255
-       bic     dst, dstin, 15
         str     q0, [dstin]
-       cmp     count, 256
-       ccmp    valw, 0, 0, cs
-       b.eq    L(try_zva)
-L(no_zva):
-       sub     count, dstend, dst      /* Count is 16 too large.  */
-       sub     dst, dst, 16            /* Dst is biased by -32.  */
-       sub     count, count, 64 + 16   /* Adjust count and bias for loop.  */
-1:     stp     q0, q0, [dst, 32]
-       stp     q0, q0, [dst, 64]!
-L(tail64):
-       subs    count, count, 64
-       b.hi    1b
-2:     stp     q0, q0, [dstend, -64]
+       str     q0, [dst, 16]
+       tst     valw, 255
+       b.ne    L(no_zva)
+#ifndef ZVA64_ONLY
+       mrs     zva_val, dczid_el0
+       and     zva_val, zva_val, 31
+       cmp     zva_val, 4              /* ZVA size is 64 bytes.  */
+       b.ne    L(zva_128)
+#endif
+       stp     q0, q0, [dst, 32]
+       bic     dst, dstin, 63
+       sub     count, dstend, dst      /* Count is now 64 too large.  */
+       sub     count, count, 64 + 64   /* Adjust count and bias for loop.  */
+
+       /* Write last bytes before ZVA loop.  */
+       stp     q0, q0, [dstend, -64]
         stp     q0, q0, [dstend, -32]
+
+       .p2align 4
+L(zva64_loop):
+       add     dst, dst, 64
+       dc      zva, dst
+       subs    count, count, 64
+       b.hi    L(zva64_loop)
         ret
  
-L(try_zva):
-#ifndef ZVA64_ONLY
         .p2align 3
-       mrs     tmp1, dczid_el0
-       tbnz    tmp1w, 4, L(no_zva)
-       and     tmp1w, tmp1w, 15
-       cmp     tmp1w, 4        /* ZVA size is 64 bytes.  */
-       b.ne     L(zva_128)
-       nop
-#endif
-       /* Write the first and last 64 byte aligned block using stp rather
-          than using DC ZVA.  This is faster on some cores.
-        */
-       .p2align 4
-L(zva_64):
-       str     q0, [dst, 16]
+L(no_zva):
+       sub     count, dstend, dst      /* Count is 32 too large.  */
+       sub     count, count, 64 + 32   /* Adjust count and bias for loop.  */
+L(no_zva_loop):
         stp     q0, q0, [dst, 32]
-       bic     dst, dst, 63
         stp     q0, q0, [dst, 64]
-       stp     q0, q0, [dst, 96]
-       sub     count, dstend, dst      /* Count is now 128 too large.  */
-       sub     count, count, 128+64+64 /* Adjust count and bias for loop.  */
-       add     dst, dst, 128
-1:     dc      zva, dst
         add     dst, dst, 64
         subs    count, count, 64
-       b.hi    1b
-       stp     q0, q0, [dst, 0]
-       stp     q0, q0, [dst, 32]
+       b.hi    L(no_zva_loop)
         stp     q0, q0, [dstend, -64]
         stp     q0, q0, [dstend, -32]
         ret
  
  #ifndef ZVA64_ONLY
-       .p2align 3
+       .p2align 4
  L(zva_128):
-       cmp     tmp1w, 5        /* ZVA size is 128 bytes.  */
-       b.ne    L(zva_other)
+       cmp     zva_val, 5              /* ZVA size is 128 bytes.  */
+       b.ne    L(no_zva)
  
-       str     q0, [dst, 16]
         stp     q0, q0, [dst, 32]
         stp     q0, q0, [dst, 64]
         stp     q0, q0, [dst, 96]
         bic     dst, dst, 127
         sub     count, dstend, dst      /* Count is now 128 too large.  */
-       sub     count, count, 128+128   /* Adjust count and bias for loop.  */
-       add     dst, dst, 128
-1:     dc      zva, dst
-       add     dst, dst, 128
+       sub     count, count, 128 + 128 /* Adjust count and bias for loop.  */
+1:     add     dst, dst, 128
+       dc      zva, dst
         subs    count, count, 128
         b.hi    1b
         stp     q0, q0, [dstend, -128]
@@ -156,35 +158,6 @@ L(zva_128):
         stp     q0, q0, [dstend, -64]
         stp     q0, q0, [dstend, -32]
         ret
-
-L(zva_other):
-       mov     tmp2w, 4
-       lsl     zva_lenw, tmp2w, tmp1w
-       add     tmp1, zva_len, 64       /* Max alignment bytes written.  */
-       cmp     count, tmp1
-       blo     L(no_zva)
-
-       sub     tmp2, zva_len, 1
-       add     tmp1, dst, zva_len
-       add     dst, dst, 16
-       subs    count, tmp1, dst        /* Actual alignment bytes to write.  */
-       bic     tmp1, tmp1, tmp2        /* Aligned dc zva start address.  */
-       beq     2f
-1:     stp     q0, q0, [dst], 64
-       stp     q0, q0, [dst, -32]
-       subs    count, count, 64
-       b.hi    1b
-2:     mov     dst, tmp1
-       sub     count, dstend, tmp1     /* Remaining bytes to write.  */
-       subs    count, count, zva_len
-       b.lo    4f
-3:     dc      zva, dst
-       add     dst, dst, zva_len
-       subs    count, count, zva_len
-       b.hs    3b
-4:     add     count, count, zva_len
-       sub     dst, dst, 32            /* Bias dst for tail loop.  */
-       b       L(tail64)
  #endif
  
  END (MEMSET)
author	Wilco Dijkstra <wilco.dijkstra@arm.com>
	Mon, 9 Sep 2024 14:26:47 +0000 (15:26 +0100)
committer	Wilco Dijkstra <wilco.dijkstra@arm.com>
	Mon, 9 Sep 2024 14:30:00 +0000 (15:30 +0100)