This is an optimized memset for AArch64. Memset is split into 4 main cases:

author Wilco Dijkstra <wdijkstr@arm.com>

Thu, 12 May 2016 15:41:00 +0000 (16:41 +0100)

committer Wilco Dijkstra <wdijkstr@arm.com>

Thu, 12 May 2016 15:44:53 +0000 (16:44 +0100)
author Wilco Dijkstra <wdijkstr@arm.com>
Thu, 12 May 2016 15:41:00 +0000 (16:41 +0100)
committer Wilco Dijkstra <wdijkstr@arm.com>
Thu, 12 May 2016 15:44:53 +0000 (16:44 +0100)
diff --git a/ChangeLog b/ChangeLog

index e62324708508b9a99d1af598e479824951524e88..a7dd3a72ff76aa7d2ae859d154d6603a7c4af706 100644 (file)
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,8 @@
+2016-05-12  Wilco Dijkstra  <wdijkstr@arm.com>
+
+       * sysdeps/aarch64/memset.S (__memset):
+       Rewrite of optimized memset.
+
  2016-05-12  Florian Weimer  <fweimer@redhat.com>
  
         [BZ #19703]
diff --git a/sysdeps/aarch64/memset.S b/sysdeps/aarch64/memset.S

index e49f4d6fafc30db1217cc0b22f8ae752859bfaf2..4d222c50aed3ca1be140bec8aa39acb47e410a4e 100644 (file)
--- a/sysdeps/aarch64/memset.S
+++ b/sysdeps/aarch64/memset.S
@@ -16,213 +16,176 @@
     License along with the GNU C Library.  If not, see
     <http://www.gnu.org/licenses/>.  */
  
+#include <sysdep.h>
+
  /* Assumptions:
   *
- * ARMv8-a, AArch64
- * Unaligned accesses
+ * ARMv8-a, AArch64, unaligned accesses
   *
   */
  
-#include <sysdep.h>
-
-/* By default we assume that the DC instruction can be used to zero
-   data blocks more efficiently.  In some circumstances this might be
-   unsafe, for example in an asymmetric multiprocessor environment with
-   different DC clear lengths (neither the upper nor lower lengths are
-   safe to use).  The feature can be disabled by defining DONT_USE_DC.
-
-   If code may be run in a virtualized environment, then define
-   MAYBE_VIRT.  This will cause the code to cache the system register
-   values rather than re-reading them each call.  */
-
-#define dstin          x0
-#define val            w1
-#define count          x2
-#define tmp1           x3
-#define tmp1w          w3
-#define tmp2           x4
-#define tmp2w          w4
-#define zva_len_x      x5
-#define zva_len                w5
-#define zva_bits_x     x6
-
-#define A_l            x7
-#define A_lw           w7
-#define dst            x8
-#define tmp3w          w9
+#define dstin  x0
+#define val    x1
+#define valw   w1
+#define count  x2
+#define dst    x3
+#define dstend x4
+#define tmp1   x5
+#define tmp1w  w5
+#define tmp2   x6
+#define tmp2w  w6
+#define zva_len x7
+#define zva_lenw w7
  
  ENTRY_ALIGN (__memset, 6)
  
-       mov     dst, dstin              /* Preserve return value.  */
-       ands    A_lw, val, #255
-#ifndef DONT_USE_DC
-       b.eq    L(zero_mem)
-#endif
-       orr     A_lw, A_lw, A_lw, lsl #8
-       orr     A_lw, A_lw, A_lw, lsl #16
-       orr     A_l, A_l, A_l, lsl #32
-L(tail_maybe_long):
-       cmp     count, #64
-       b.ge    L(not_short)
-L(tail_maybe_tiny):
-       cmp     count, #15
-       b.le    L(tail15tiny)
-L(tail63):
-       ands    tmp1, count, #0x30
-       b.eq    L(tail15)
-       add     dst, dst, tmp1
-       cmp     tmp1w, #0x20
-       b.eq    1f
-       b.lt    2f
-       stp     A_l, A_l, [dst, #-48]
-1:
-       stp     A_l, A_l, [dst, #-32]
-2:
-       stp     A_l, A_l, [dst, #-16]
-
-L(tail15):
-       and     count, count, #15
-       add     dst, dst, count
-       stp     A_l, A_l, [dst, #-16]   /* Repeat some/all of last store. */
-       RET
-
-L(tail15tiny):
-       /* Set up to 15 bytes.  Does not assume earlier memory
-          being set.  */
-       tbz     count, #3, 1f
-       str     A_l, [dst], #8
-1:
-       tbz     count, #2, 1f
-       str     A_lw, [dst], #4
-1:
-       tbz     count, #1, 1f
-       strh    A_lw, [dst], #2
-1:
-       tbz     count, #0, 1f
-       strb    A_lw, [dst]
-1:
-       RET
-
-       /* Critical loop.  Start at a new cache line boundary.  Assuming
-        * 64 bytes per line, this ensures the entire loop is in one line.  */
-       .p2align 6
-L(not_short):
-       neg     tmp2, dst
-       ands    tmp2, tmp2, #15
-       b.eq    2f
-       /* Bring DST to 128-bit (16-byte) alignment.  We know that there's
-        * more than that to set, so we simply store 16 bytes and advance by
-        * the amount required to reach alignment.  */
-       sub     count, count, tmp2
-       stp     A_l, A_l, [dst]
-       add     dst, dst, tmp2
-       /* There may be less than 63 bytes to go now.  */
-       cmp     count, #63
-       b.le    L(tail63)
-2:
-       sub     dst, dst, #16           /* Pre-bias.  */
-       sub     count, count, #64
-1:
-       stp     A_l, A_l, [dst, #16]
-       stp     A_l, A_l, [dst, #32]
-       stp     A_l, A_l, [dst, #48]
-       stp     A_l, A_l, [dst, #64]!
-       subs    count, count, #64
-       b.ge    1b
-       tst     count, #0x3f
-       add     dst, dst, #16
-       b.ne    L(tail63)
-       RET
-
-#ifndef DONT_USE_DC
-       /* For zeroing memory, check to see if we can use the ZVA feature to
-        * zero entire 'cache' lines.  */
-L(zero_mem):
-       mov     A_l, #0
-       cmp     count, #63
-       b.le    L(tail_maybe_tiny)
-       neg     tmp2, dst
-       ands    tmp2, tmp2, #15
-       b.eq    1f
-       sub     count, count, tmp2
-       stp     A_l, A_l, [dst]
-       add     dst, dst, tmp2
-       cmp     count, #63
-       b.le    L(tail63)
-1:
-       /* For zeroing small amounts of memory, it's not worth setting up
-        * the line-clear code.  */
-       cmp     count, #128
-       b.lt    L(not_short)
-#ifdef MAYBE_VIRT
-       /* For efficiency when virtualized, we cache the ZVA capability.  */
-       adrp    tmp2, L(cache_clear)
-       ldr     zva_len, [tmp2, #:lo12:L(cache_clear)]
-       tbnz    zva_len, #31, L(not_short)
-       cbnz    zva_len, L(zero_by_line)
-       mrs     tmp1, dczid_el0
-       tbz     tmp1, #4, 1f
-       /* ZVA not available.  Remember this for next time.  */
-       mov     zva_len, #~0
-       str     zva_len, [tmp2, #:lo12:L(cache_clear)]
-       b       L(not_short)
-1:
-       mov     tmp3w, #4
-       and     zva_len, tmp1w, #15     /* Safety: other bits reserved.  */
-       lsl     zva_len, tmp3w, zva_len
-       str     zva_len, [tmp2, #:lo12:L(cache_clear)]
-#else
+       dup     v0.16B, valw
+       add     dstend, dstin, count
+
+       cmp     count, 96
+       b.hi    L(set_long)
+       cmp     count, 16
+       b.hs    L(set_medium)
+       mov     val, v0.D[0]
+
+       /* Set 0..15 bytes.  */
+       tbz     count, 3, 1f
+       str     val, [dstin]
+       str     val, [dstend, -8]
+       ret
+       nop
+1:     tbz     count, 2, 2f
+       str     valw, [dstin]
+       str     valw, [dstend, -4]
+       ret
+2:     cbz     count, 3f
+       strb    valw, [dstin]
+       tbz     count, 1, 3f
+       strh    valw, [dstend, -2]
+3:     ret
+
+       /* Set 17..96 bytes.  */
+L(set_medium):
+       str     q0, [dstin]
+       tbnz    count, 6, L(set96)
+       str     q0, [dstend, -16]
+       tbz     count, 5, 1f
+       str     q0, [dstin, 16]
+       str     q0, [dstend, -32]
+1:     ret
+
+       .p2align 4
+       /* Set 64..96 bytes.  Write 64 bytes from the start and
+          32 bytes from the end.  */
+L(set96):
+       str     q0, [dstin, 16]
+       stp     q0, q0, [dstin, 32]
+       stp     q0, q0, [dstend, -32]
+       ret
+
+       .p2align 3
+       nop
+L(set_long):
+       and     valw, valw, 255
+       bic     dst, dstin, 15
+       str     q0, [dstin]
+       cmp     count, 256
+       ccmp    valw, 0, 0, cs
+       b.eq    L(try_zva)
+L(no_zva):
+       sub     count, dstend, dst      /* Count is 16 too large.  */
+       add     dst, dst, 16
+       sub     count, count, 64 + 16   /* Adjust count and bias for loop.  */
+1:     stp     q0, q0, [dst], 64
+       stp     q0, q0, [dst, -32]
+L(tail64):
+       subs    count, count, 64
+       b.hi    1b
+2:     stp     q0, q0, [dstend, -64]
+       stp     q0, q0, [dstend, -32]
+       ret
+
+       .p2align 3
+L(try_zva):
         mrs     tmp1, dczid_el0
-       tbnz    tmp1, #4, L(not_short)
-       mov     tmp3w, #4
-       and     zva_len, tmp1w, #15     /* Safety: other bits reserved.  */
-       lsl     zva_len, tmp3w, zva_len
-#endif
-
-L(zero_by_line):
-       /* Compute how far we need to go to become suitably aligned.  We're
-        * already at quad-word alignment.  */
-       cmp     count, zva_len_x
-       b.lt    L(not_short)            /* Not enough to reach alignment.  */
-       sub     zva_bits_x, zva_len_x, #1
-       neg     tmp2, dst
-       ands    tmp2, tmp2, zva_bits_x
-       b.eq    1f                      /* Already aligned.  */
-       /* Not aligned, check that there's enough to copy after alignment.  */
-       sub     tmp1, count, tmp2
-       cmp     tmp1, #64
-       ccmp    tmp1, zva_len_x, #8, ge /* NZCV=0b1000 */
-       b.lt    L(not_short)
-       /* We know that there's at least 64 bytes to zero and that it's safe
-        * to overrun by 64 bytes.  */
-       mov     count, tmp1
-2:
-       stp     A_l, A_l, [dst]
-       stp     A_l, A_l, [dst, #16]
-       stp     A_l, A_l, [dst, #32]
-       subs    tmp2, tmp2, #64
-       stp     A_l, A_l, [dst, #48]
-       add     dst, dst, #64
-       b.ge    2b
-       /* We've overrun a bit, so adjust dst downwards.  */
-       add     dst, dst, tmp2
-1:
-       sub     count, count, zva_len_x
-3:
-       dc      zva, dst
-       add     dst, dst, zva_len_x
-       subs    count, count, zva_len_x
-       b.ge    3b
-       ands    count, count, zva_bits_x
-       b.ne    L(tail_maybe_long)
-       RET
-#ifdef MAYBE_VIRT
-       .bss
-       .p2align 2
-L(cache_clear):
-       .space 4
-#endif
-#endif /* DONT_USE_DC */
+       tbnz    tmp1w, 4, L(no_zva)
+       and     tmp1w, tmp1w, 15
+       cmp     tmp1w, 4        /* ZVA size is 64 bytes.  */
+       b.ne     L(zva_128)
+
+       /* Write the first and last 64 byte aligned block using stp rather
+          than using DC ZVA.  This is faster on some cores.
+        */
+L(zva_64):
+       str     q0, [dst, 16]
+       stp     q0, q0, [dst, 32]
+       bic     dst, dst, 63
+       stp     q0, q0, [dst, 64]
+       stp     q0, q0, [dst, 96]
+       sub     count, dstend, dst      /* Count is now 128 too large.  */
+       sub     count, count, 128+64+64 /* Adjust count and bias for loop.  */
+       add     dst, dst, 128
+       nop
+1:     dc      zva, dst
+       add     dst, dst, 64
+       subs    count, count, 64
+       b.hi    1b
+       stp     q0, q0, [dst, 0]
+       stp     q0, q0, [dst, 32]
+       stp     q0, q0, [dstend, -64]
+       stp     q0, q0, [dstend, -32]
+       ret
+
+       .p2align 3
+L(zva_128):
+       cmp     tmp1w, 5        /* ZVA size is 128 bytes.  */
+       b.ne    L(zva_other)
+
+       str     q0, [dst, 16]
+       stp     q0, q0, [dst, 32]
+       stp     q0, q0, [dst, 64]
+       stp     q0, q0, [dst, 96]
+       bic     dst, dst, 127
+       sub     count, dstend, dst      /* Count is now 128 too large.  */
+       sub     count, count, 128+128   /* Adjust count and bias for loop.  */
+       add     dst, dst, 128
+1:     dc      zva, dst
+       add     dst, dst, 128
+       subs    count, count, 128
+       b.hi    1b
+       stp     q0, q0, [dstend, -128]
+       stp     q0, q0, [dstend, -96]
+       stp     q0, q0, [dstend, -64]
+       stp     q0, q0, [dstend, -32]
+       ret
+
+L(zva_other):
+       mov     tmp2w, 4
+       lsl     zva_lenw, tmp2w, tmp1w
+       add     tmp1, zva_len, 64       /* Max alignment bytes written.  */
+       cmp     count, tmp1
+       blo     L(no_zva)
+
+       sub     tmp2, zva_len, 1
+       add     tmp1, dst, zva_len
+       add     dst, dst, 16
+       subs    count, tmp1, dst        /* Actual alignment bytes to write.  */
+       bic     tmp1, tmp1, tmp2        /* Aligned dc zva start address.  */
+       beq     2f
+1:     stp     q0, q0, [dst], 64
+       stp     q0, q0, [dst, -32]
+       subs    count, count, 64
+       b.hi    1b
+2:     mov     dst, tmp1
+       sub     count, dstend, tmp1     /* Remaining bytes to write.  */
+       subs    count, count, zva_len
+       b.lo    4f
+3:     dc      zva, dst
+       add     dst, dst, zva_len
+       subs    count, count, zva_len
+       b.hs    3b
+4:     add     count, count, zva_len
+       b       L(tail64)
  
  END (__memset)
  weak_alias (__memset, memset)
author	Wilco Dijkstra <wdijkstr@arm.com>
	Thu, 12 May 2016 15:41:00 +0000 (16:41 +0100)
committer	Wilco Dijkstra <wdijkstr@arm.com>
	Thu, 12 May 2016 15:44:53 +0000 (16:44 +0100)
ChangeLog		patch \| blob \| blame \| history
sysdeps/aarch64/memset.S		patch \| blob \| blame \| history