aarch64: optimized memcpy implementation for thunderx2

author Anton Youdkevitch <anton.youdkevitch@bell-sw.com>

Tue, 16 Oct 2018 18:00:27 +0000 (11:00 -0700)

committer Steve Ellcey <sellcey@caviumnetworks.com>

Tue, 16 Oct 2018 18:00:27 +0000 (11:00 -0700)
author Anton Youdkevitch <anton.youdkevitch@bell-sw.com>
Tue, 16 Oct 2018 18:00:27 +0000 (11:00 -0700)
committer Steve Ellcey <sellcey@caviumnetworks.com>
Tue, 16 Oct 2018 18:00:27 +0000 (11:00 -0700)
diff --git a/ChangeLog b/ChangeLog

index 8a1ca3e1e1463617f6f12132a8bc15804b034165..f3c543aa413ecbce48a95167f5716de98821f57d 100644 (file)
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,9 @@
+2018-10-16 Anton Youdkevitch <anton.youdkevitch@bell-sw.com>
+
+       * sysdeps/aarch64/multiarch/memcpy_thunderx.S: Remove thunderx2 code.
+       * sysdeps/aarch64/multiarch/memcpy_thunderx2.S: New implementation
+       for thunderX2.
+
  2018-10-15  Joseph Myers  <joseph@codesourcery.com>
  
         * sysdeps/unix/sysv/linux/Makefile (sysdep_headers): Add
diff --git a/sysdeps/aarch64/multiarch/memcpy_thunderx.S b/sysdeps/aarch64/multiarch/memcpy_thunderx.S

index de494d933d8c481a2a6938e59bc50b75bc1f873a..6000365e82c143cd6ba31cce9b1e6cde2f1e27f4 100644 (file)
--- a/sysdeps/aarch64/multiarch/memcpy_thunderx.S
+++ b/sysdeps/aarch64/multiarch/memcpy_thunderx.S
@@ -74,13 +74,10 @@
  
  #if IS_IN (libc)
  
-# ifndef USE_THUNDERX2
  #  undef MEMCPY
  #  define MEMCPY __memcpy_thunderx
  #  undef MEMMOVE
  #  define MEMMOVE __memmove_thunderx
-#  define USE_THUNDERX
-# endif
  
  ENTRY_ALIGN (MEMMOVE, 6)
  
@@ -182,8 +179,6 @@ L(copy96):
         .p2align 4
  L(copy_long):
  
-# if defined(USE_THUNDERX) || defined (USE_THUNDERX2)
-
         /* On thunderx, large memcpy's are helped by software prefetching.
            This loop is identical to the one below it but with prefetching
            instructions included.  For loops that are less than 32768 bytes,
@@ -196,11 +191,7 @@ L(copy_long):
         bic     dst, dstin, 15
         ldp     D_l, D_h, [src]
         sub     src, src, tmp1
-#  if defined(USE_THUNDERX)
         prfm    pldl1strm, [src, 384]
-#  elif defined(USE_THUNDERX2)
-       prfm    pldl1strm, [src, 256]
-#  endif
         add     count, count, tmp1      /* Count is now 16 too large.  */
         ldp     A_l, A_h, [src, 16]
         stp     D_l, D_h, [dstin]
@@ -210,13 +201,9 @@ L(copy_long):
         subs    count, count, 128 + 16  /* Test and readjust count.  */
  
  L(prefetch_loop64):
-#  if defined(USE_THUNDERX)
         tbz     src, #6, 1f
         prfm    pldl1strm, [src, 512]
  1:
-#  elif defined(USE_THUNDERX2)
-       prfm    pldl1strm, [src, 256]
-#  endif
         stp     A_l, A_h, [dst, 16]
         ldp     A_l, A_h, [src, 16]
         stp     B_l, B_h, [dst, 32]
@@ -230,7 +217,6 @@ L(prefetch_loop64):
         b       L(last64)
  
  L(copy_long_without_prefetch):
-# endif
  
         and     tmp1, dstin, 15
         bic     dst, dstin, 15
diff --git a/sysdeps/aarch64/multiarch/memcpy_thunderx2.S b/sysdeps/aarch64/multiarch/memcpy_thunderx2.S

index 8501abf7258dc63b97bd45613e4537846df4c0f5..5bdf0f8f21544f7dcfc2539a113d728f1d209184 100644 (file)
--- a/sysdeps/aarch64/multiarch/memcpy_thunderx2.S
+++ b/sysdeps/aarch64/multiarch/memcpy_thunderx2.S
@@ -17,11 +17,610 @@
     License along with the GNU C Library; if not, see
     <http://www.gnu.org/licenses/>.  */
  
-/* The actual code in this memcpy and memmove is in memcpy_thunderx.S.
-   The only real differences are with the prefetching instructions.  */
+#include <sysdep.h>
  
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64, unaligned accesses.
+ *
+ */
+
+#define dstin  x0
+#define src    x1
+#define count  x2
+#define dst    x3
+#define srcend x4
+#define dstend x5
+#define tmp2    x6
+#define tmp3    x7
+#define tmp3w   w7
+#define A_l    x6
+#define A_lw   w6
+#define A_h    x7
+#define A_hw   w7
+#define B_l    x8
+#define B_lw   w8
+#define B_h    x9
+#define C_l    x10
+#define C_h    x11
+#define D_l    x12
+#define D_h    x13
+#define E_l    src
+#define E_h    count
+#define F_l    srcend
+#define F_h    dst
+#define G_l    count
+#define G_h    dst
+#define tmp1   x14
+
+#define A_q     q0
+#define B_q     q1
+#define C_q     q2
+#define D_q     q3
+#define E_q     q4
+#define F_q     q5
+#define G_q     q6
+#define H_q    q7
+#define I_q    q16
+#define J_q    q17
+
+#define A_v     v0
+#define B_v     v1
+#define C_v     v2
+#define D_v     v3
+#define E_v     v4
+#define F_v     v5
+#define G_v     v6
+#define H_v     v7
+#define I_v     v16
+#define J_v    v17
+
+#ifndef MEMMOVE
+# define MEMMOVE memmove
+#endif
+#ifndef MEMCPY
+# define MEMCPY memcpy
+#endif
+
+#if IS_IN (libc)
+
+#undef MEMCPY
+#undef MEMMOVE
  #define MEMCPY __memcpy_thunderx2
  #define MEMMOVE __memmove_thunderx2
-#define USE_THUNDERX2
  
-#include "memcpy_thunderx.S"
+
+/* Moves are split into 3 main cases: small copies of up to 16 bytes,
+   medium copies of 17..96 bytes which are fully unrolled. Large copies
+   of more than 96 bytes align the destination and use an unrolled loop
+   processing 64 bytes per iteration.
+   Overlapping large forward memmoves use a loop that copies backwards.
+*/
+
+ENTRY_ALIGN (MEMMOVE, 6)
+
+       DELOUSE (0)
+       DELOUSE (1)
+       DELOUSE (2)
+
+       sub     tmp1, dstin, src
+       cmp     count, 96
+       ccmp    tmp1, count, 2, hi
+       b.lo    L(move_long)
+
+       prfm    PLDL1KEEP, [src]
+       add     srcend, src, count
+       add     dstend, dstin, count
+       cmp     count, 16
+       b.ls    L(copy16)
+       cmp     count, 96
+       b.hi    L(copy_long)
+
+       /* Medium copies: 17..96 bytes.  */
+       sub     tmp1, count, 1
+       ldp     A_l, A_h, [src]
+       tbnz    tmp1, 6, L(copy96)
+       ldp     D_l, D_h, [srcend, -16]
+       tbz     tmp1, 5, 1f
+       ldp     B_l, B_h, [src, 16]
+       ldp     C_l, C_h, [srcend, -32]
+       stp     B_l, B_h, [dstin, 16]
+       stp     C_l, C_h, [dstend, -32]
+1:
+       stp     A_l, A_h, [dstin]
+       stp     D_l, D_h, [dstend, -16]
+       ret
+
+       .p2align 4
+       /* Small copies: 0..16 bytes.  */
+L(copy16):
+       cmp     count, 8
+       b.lo    1f
+       ldr     A_l, [src]
+       ldr     A_h, [srcend, -8]
+       str     A_l, [dstin]
+       str     A_h, [dstend, -8]
+       ret
+       .p2align 4
+1:
+       tbz     count, 2, 1f
+       ldr     A_lw, [src]
+       ldr     A_hw, [srcend, -4]
+       str     A_lw, [dstin]
+       str     A_hw, [dstend, -4]
+       ret
+
+       /* Copy 0..3 bytes.  Use a branchless sequence that copies the same
+          byte 3 times if count==1, or the 2nd byte twice if count==2.  */
+1:
+       cbz     count, 2f
+       lsr     tmp1, count, 1
+       ldrb    A_lw, [src]
+       ldrb    A_hw, [srcend, -1]
+       ldrb    B_lw, [src, tmp1]
+       strb    A_lw, [dstin]
+       strb    B_lw, [dstin, tmp1]
+       strb    A_hw, [dstend, -1]
+2:     ret
+
+       .p2align 4
+       /* Copy 64..96 bytes.  Copy 64 bytes from the start and
+          32 bytes from the end.  */
+L(copy96):
+       ldp     B_l, B_h, [src, 16]
+       ldp     C_l, C_h, [src, 32]
+       ldp     D_l, D_h, [src, 48]
+       ldp     E_l, E_h, [srcend, -32]
+       ldp     F_l, F_h, [srcend, -16]
+       stp     A_l, A_h, [dstin]
+       stp     B_l, B_h, [dstin, 16]
+       stp     C_l, C_h, [dstin, 32]
+       stp     D_l, D_h, [dstin, 48]
+       stp     E_l, E_h, [dstend, -32]
+       stp     F_l, F_h, [dstend, -16]
+       ret
+
+       /* Align DST to 16 byte alignment so that we don't cross cache line
+          boundaries on both loads and stores.  There are at least 96 bytes
+          to copy, so copy 16 bytes unaligned and then align.  The loop
+          copies 64 bytes per iteration and prefetches one iteration ahead.  */
+
+       .p2align 4
+L(copy_long):
+       and     tmp1, dstin, 15
+       bic     dst, dstin, 15
+       ldp     D_l, D_h, [src]
+       sub     src, src, tmp1
+       add     count, count, tmp1      /* Count is now 16 too large.  */
+       ldp     A_l, A_h, [src, 16]
+       stp     D_l, D_h, [dstin]
+       ldp     B_l, B_h, [src, 32]
+       ldp     C_l, C_h, [src, 48]
+       ldp     D_l, D_h, [src, 64]!
+       subs    count, count, 128 + 16  /* Test and readjust count.  */
+       b.ls    L(last64)
+L(loop64):
+       stp     A_l, A_h, [dst, 16]
+       ldp     A_l, A_h, [src, 16]
+       stp     B_l, B_h, [dst, 32]
+       ldp     B_l, B_h, [src, 32]
+       stp     C_l, C_h, [dst, 48]
+       ldp     C_l, C_h, [src, 48]
+       stp     D_l, D_h, [dst, 64]!
+       ldp     D_l, D_h, [src, 64]!
+       subs    count, count, 64
+       b.hi    L(loop64)
+
+       /* Write the last full set of 64 bytes.  The remainder is at most 64
+          bytes, so it is safe to always copy 64 bytes from the end even if
+          there is just 1 byte left.  */
+L(last64):
+       ldp     E_l, E_h, [srcend, -64]
+       stp     A_l, A_h, [dst, 16]
+       ldp     A_l, A_h, [srcend, -48]
+       stp     B_l, B_h, [dst, 32]
+       ldp     B_l, B_h, [srcend, -32]
+       stp     C_l, C_h, [dst, 48]
+       ldp     C_l, C_h, [srcend, -16]
+       stp     D_l, D_h, [dst, 64]
+       stp     E_l, E_h, [dstend, -64]
+       stp     A_l, A_h, [dstend, -48]
+       stp     B_l, B_h, [dstend, -32]
+       stp     C_l, C_h, [dstend, -16]
+       ret
+
+       .p2align 4
+L(move_long):
+       cbz     tmp1, 3f
+
+       add     srcend, src, count
+       add     dstend, dstin, count
+
+       /* Align dstend to 16 byte alignment so that we don't cross cache line
+          boundaries on both loads and stores.  There are at least 96 bytes
+          to copy, so copy 16 bytes unaligned and then align.  The loop
+          copies 64 bytes per iteration and prefetches one iteration ahead.  */
+
+       and     tmp1, dstend, 15
+       ldp     D_l, D_h, [srcend, -16]
+       sub     srcend, srcend, tmp1
+       sub     count, count, tmp1
+       ldp     A_l, A_h, [srcend, -16]
+       stp     D_l, D_h, [dstend, -16]
+       ldp     B_l, B_h, [srcend, -32]
+       ldp     C_l, C_h, [srcend, -48]
+       ldp     D_l, D_h, [srcend, -64]!
+       sub     dstend, dstend, tmp1
+       subs    count, count, 128
+       b.ls    2f
+
+       nop
+1:
+       stp     A_l, A_h, [dstend, -16]
+       ldp     A_l, A_h, [srcend, -16]
+       stp     B_l, B_h, [dstend, -32]
+       ldp     B_l, B_h, [srcend, -32]
+       stp     C_l, C_h, [dstend, -48]
+       ldp     C_l, C_h, [srcend, -48]
+       stp     D_l, D_h, [dstend, -64]!
+       ldp     D_l, D_h, [srcend, -64]!
+       subs    count, count, 64
+       b.hi    1b
+
+       /* Write the last full set of 64 bytes.  The remainder is at most 64
+          bytes, so it is safe to always copy 64 bytes from the start even if
+          there is just 1 byte left.  */
+2:
+       ldp     G_l, G_h, [src, 48]
+       stp     A_l, A_h, [dstend, -16]
+       ldp     A_l, A_h, [src, 32]
+       stp     B_l, B_h, [dstend, -32]
+       ldp     B_l, B_h, [src, 16]
+       stp     C_l, C_h, [dstend, -48]
+       ldp     C_l, C_h, [src]
+       stp     D_l, D_h, [dstend, -64]
+       stp     G_l, G_h, [dstin, 48]
+       stp     A_l, A_h, [dstin, 32]
+       stp     B_l, B_h, [dstin, 16]
+       stp     C_l, C_h, [dstin]
+3:     ret
+
+END (MEMMOVE)
+libc_hidden_builtin_def (MEMMOVE)
+
+
+/* Copies are split into 3 main cases: small copies of up to 16 bytes,
+   medium copies of 17..96 bytes which are fully unrolled. Large copies
+   of more than 96 bytes align the destination and use load-and-merge
+   approach in the case src and dst addresses are unaligned not evenly,
+   so that, loads and stores are always aligned.
+   Large copies use an unrolled loop processing 64 bytes per iteration.
+   The current optimized memcpy implementation is not compatible with
+   memmove and is separated from it completely.
+
+   memcpy implementation below is not compatible with memmove
+   because of pipelined loads/stores, which are faster, but they
+   can't be used in the case of overlapping memmove arrays */
+
+#define MEMCPY_PREFETCH_LDR 640
+
+ENTRY (MEMCPY)
+       DELOUSE (0)
+       DELOUSE (1)
+       DELOUSE (2)
+
+       add     srcend, src, count
+       cmp     count, 16
+       b.ls    L(memcopy16)
+       ldr     A_q, [src], #16
+       add     dstend, dstin, count
+       and     tmp1, src, 15
+       cmp     count, 96
+       b.hi    L(memcopy_long)
+
+       /* Medium copies: 17..96 bytes.  */
+       ldr     E_q, [srcend, -16]
+       cmp     count, 64
+       b.gt    L(memcpy_copy96)
+       cmp     count, 48
+       b.le    L(bytes_17_to_48)
+       /* 49..64 bytes */
+       ldp     B_q, C_q, [src]
+       str     E_q, [dstend, -16]
+       stp     A_q, B_q, [dstin]
+       str     C_q, [dstin, 32]
+       ret
+
+L(bytes_17_to_48):
+       /* 17..48 bytes*/
+       cmp     count, 32
+       b.gt    L(bytes_32_to_48)
+       /* 17..32 bytes*/
+       str     A_q, [dstin]
+       str     E_q, [dstend, -16]
+       ret
+
+L(bytes_32_to_48):
+       /* 32..48 */
+       ldr     B_q, [src]
+       str     A_q, [dstin]
+       str     E_q, [dstend, -16]
+       str     B_q, [dstin, 16]
+       ret
+
+       .p2align 4
+       /* Small copies: 0..16 bytes.  */
+L(memcopy16):
+       cmp     count, 8
+       b.lo    L(bytes_0_to_8)
+       ldr     A_l, [src]
+       ldr     A_h, [srcend, -8]
+       add     dstend, dstin, count
+       str     A_l, [dstin]
+       str     A_h, [dstend, -8]
+       ret
+       .p2align 4
+
+L(bytes_0_to_8):
+       tbz     count, 2, L(bytes_0_to_3)
+       ldr     A_lw, [src]
+       ldr     A_hw, [srcend, -4]
+       add     dstend, dstin, count
+       str     A_lw, [dstin]
+       str     A_hw, [dstend, -4]
+       ret
+
+       /* Copy 0..3 bytes.  Use a branchless sequence that copies the same
+          byte 3 times if count==1, or the 2nd byte twice if count==2.  */
+L(bytes_0_to_3):
+       cbz     count, L(end)
+       lsr     tmp1, count, 1
+       ldrb    A_lw, [src]
+       ldrb    A_hw, [srcend, -1]
+       add     dstend, dstin, count
+       ldrb    B_lw, [src, tmp1]
+       strb    A_lw, [dstin]
+       strb    B_lw, [dstin, tmp1]
+       strb    A_hw, [dstend, -1]
+L(end): ret
+
+       .p2align 4
+
+L(memcpy_copy96):
+       /* Copying 65..96 bytes. A_q (first 16 bytes) and
+          E_q(last 16 bytes) are already loaded.
+
+          The size is large enough to benefit from aligned
+          loads */
+       bic     src, src, 15
+       ldp     B_q, C_q, [src]
+       str     A_q, [dstin]
+       /* Loaded 64 bytes, second 16-bytes chunk can be
+          overlapping with the first chunk by tmp1 bytes.
+          Stored 16 bytes. */
+       sub     dst, dstin, tmp1
+       add     count, count, tmp1
+       /* The range of count being [65..96] becomes [65..111]
+          after tmp [0..15] gets added to it,
+          count now is <bytes-left-to-load>+48 */
+       cmp     count, 80
+       b.gt    L(copy96_medium)
+       ldr     D_q, [src, 32]
+       stp     B_q, C_q, [dst, 16]
+       str     E_q, [dstend, -16]
+       str     D_q, [dst, 48]
+       ret
+
+       .p2align 4
+L(copy96_medium):
+       ldp     D_q, A_q, [src, 32]
+       str     B_q, [dst, 16]
+       cmp     count, 96
+       b.gt    L(copy96_large)
+       str     E_q, [dstend, -16]
+       stp     C_q, D_q, [dst, 32]
+       str     A_q, [dst, 64]
+       ret
+
+L(copy96_large):
+       ldr     F_q, [src, 64]
+       stp     C_q, D_q, [dst, 32]
+       str     E_q, [dstend, -16]
+       stp     A_q, F_q, [dst, 64]
+       ret
+
+       .p2align 4
+L(memcopy_long):
+       bic     src, src, 15
+       ldp     B_q, C_q, [src], #32
+       str     A_q, [dstin]
+       sub     dst, dstin, tmp1
+       add     count, count, tmp1
+       add     dst, dst, 16
+       and     tmp1, dst, 15
+       ldp     D_q, E_q, [src], #32
+       str     B_q, [dst], #16
+
+       /* Already loaded 64+16 bytes. Check if at
+          least 64 more bytes left */
+       subs    count, count, 64+64+16
+       b.lt    L(loop128_exit2)
+       cmp     count, MEMCPY_PREFETCH_LDR + 64 + 32
+       b.lt    L(loop128)
+       cbnz    tmp1, L(dst_unaligned)
+       sub     count, count, MEMCPY_PREFETCH_LDR + 64 + 32
+
+       .p2align 4
+
+L(loop128_prefetch):
+       str     C_q, [dst], #16
+       prfm    pldl1strm, [src, MEMCPY_PREFETCH_LDR]
+       str     D_q, [dst], #16
+       ldp     F_q, G_q, [src], #32
+       str     E_q, [dst], #16
+       ldp     H_q, A_q, [src], #32
+       str     F_q, [dst], #16
+       prfm    pldl1strm, [src, MEMCPY_PREFETCH_LDR]
+       str     G_q, [dst], #16
+       ldp     B_q, C_q, [src], #32
+       str     H_q, [dst], #16
+       ldp     D_q, E_q, [src], #32
+       stp     A_q, B_q, [dst], #32
+       subs    count, count, 128
+       b.ge    L(loop128_prefetch)
+
+L(preloop128):
+       add     count, count, MEMCPY_PREFETCH_LDR + 64 + 32
+       .p2align 4
+L(loop128):
+       ldp     F_q, G_q, [src], #32
+       str     C_q, [dst], #16
+       ldp     B_q, A_q, [src], #32
+       str     D_q, [dst], #16
+       stp     E_q, F_q, [dst], #32
+       stp     G_q, B_q, [dst], #32
+       subs    count, count, 64
+       b.lt    L(loop128_exit1)
+L(loop128_proceed):
+       ldp     B_q, C_q, [src], #32
+       str     A_q, [dst], #16
+       ldp     D_q, E_q, [src], #32
+       str     B_q, [dst], #16
+       subs    count, count, 64
+       b.ge    L(loop128)
+
+       .p2align 4
+L(loop128_exit2):
+       stp     C_q, D_q, [dst], #32
+       str     E_q, [dst], #16
+       b       L(copy_long_check32);
+
+L(loop128_exit1):
+       /* A_q is still not stored and 0..63 bytes left,
+          so, count is -64..-1.
+          Check if less than 32 bytes left (count < -32) */
+       str     A_q, [dst], #16
+L(copy_long_check32):
+       cmn     count, 64
+       b.eq    L(copy_long_done)
+       cmn     count, 32
+       b.le    L(copy_long_last32)
+       ldp     B_q, C_q, [src]
+       stp     B_q, C_q, [dst]
+
+L(copy_long_last32):
+       ldp     F_q, G_q, [srcend, -32]
+       stp     F_q, G_q, [dstend, -32]
+
+L(copy_long_done):
+       ret
+
+L(dst_unaligned):
+       /* For the unaligned store case the code loads two
+          aligned chunks and then merges them using ext
+          instruction. This can be up to 30% faster than
+          the the simple unaligned store access.
+
+          Current state: tmp1 = dst % 16; C_q, D_q, E_q
+          contains data yet to be stored. src and dst points
+          to next-to-be-processed data. A_q, B_q contains
+          data already stored before, count = bytes left to
+          be load decremented by 64.
+
+          The control is passed here if at least 64 bytes left
+          to be loaded. The code does two aligned loads and then
+          extracts (16-tmp1) bytes from the first register and
+          tmp1 bytes from the next register forming the value
+          for the aligned store.
+
+          As ext instruction can only have it's index encoded
+          as immediate. 15 code chunks process each possible
+          index value. Computed goto is used to reach the
+          required code. */
+       
+       /* Store the 16 bytes to dst and align dst for further
+          operations, several bytes will be stored at this
+          address once more */
+       str     C_q, [dst], #16
+       ldp     F_q, G_q, [src], #32
+       bic     dst, dst, 15
+       adrp    tmp2, L(ext_table)
+       add     tmp2, tmp2, :lo12:L(ext_table)
+       add     tmp2, tmp2, tmp1, LSL #2
+       ldr     tmp3w, [tmp2]
+       add     tmp2, tmp2, tmp3w, SXTW
+       br      tmp2
+
+#define EXT_CHUNK(shft) \
+.p2align 4 ;\
+L(ext_size_ ## shft):;\
+       ext     A_v.16b, C_v.16b, D_v.16b, 16-shft;\
+       ext     B_v.16b, D_v.16b, E_v.16b, 16-shft;\
+       subs    count, count, 32;\
+       b.ge    2f;\
+1:;\
+       stp     A_q, B_q, [dst], #32;\
+       ext     H_v.16b, E_v.16b, F_v.16b, 16-shft;\
+       ext     I_v.16b, F_v.16b, G_v.16b, 16-shft;\
+       stp     H_q, I_q, [dst], #16;\
+       add     dst, dst, tmp1;\
+       str     G_q, [dst], #16;\
+       b       L(copy_long_check32);\
+2:;\
+       stp     A_q, B_q, [dst], #32;\
+       prfm    pldl1strm, [src, MEMCPY_PREFETCH_LDR];\
+       ldp     D_q, J_q, [src], #32;\
+       ext     H_v.16b, E_v.16b, F_v.16b, 16-shft;\
+       ext     I_v.16b, F_v.16b, G_v.16b, 16-shft;\
+       mov     C_v.16b, G_v.16b;\
+       stp     H_q, I_q, [dst], #32;\
+       ldp     F_q, G_q, [src], #32;\
+       ext     A_v.16b, C_v.16b, D_v.16b, 16-shft;\
+       ext     B_v.16b, D_v.16b, J_v.16b, 16-shft;\
+       mov     E_v.16b, J_v.16b;\
+       subs    count, count, 64;\
+       b.ge    2b;\
+       b       1b;\
+
+EXT_CHUNK(1)
+EXT_CHUNK(2)
+EXT_CHUNK(3)
+EXT_CHUNK(4)
+EXT_CHUNK(5)
+EXT_CHUNK(6)
+EXT_CHUNK(7)
+EXT_CHUNK(8)
+EXT_CHUNK(9)
+EXT_CHUNK(10)
+EXT_CHUNK(11)
+EXT_CHUNK(12)
+EXT_CHUNK(13)
+EXT_CHUNK(14)
+EXT_CHUNK(15)
+
+END (MEMCPY)
+       .section        .rodata
+       .p2align        4
+
+L(ext_table):
+       /* The first entry is for the alignment of 0 and is never
+          actually used (could be any value).  */
+       .word   0
+       .word   L(ext_size_1) -.
+       .word   L(ext_size_2) -.
+       .word   L(ext_size_3) -.
+       .word   L(ext_size_4) -.
+       .word   L(ext_size_5) -.
+       .word   L(ext_size_6) -.
+       .word   L(ext_size_7) -.
+       .word   L(ext_size_8) -.
+       .word   L(ext_size_9) -.
+       .word   L(ext_size_10) -.
+       .word   L(ext_size_11) -.
+       .word   L(ext_size_12) -.
+       .word   L(ext_size_13) -.
+       .word   L(ext_size_14) -.
+       .word   L(ext_size_15) -.
+
+libc_hidden_builtin_def (MEMCPY)
+#endif
author	Anton Youdkevitch <anton.youdkevitch@bell-sw.com>
	Tue, 16 Oct 2018 18:00:27 +0000 (11:00 -0700)
committer	Steve Ellcey <sellcey@caviumnetworks.com>
	Tue, 16 Oct 2018 18:00:27 +0000 (11:00 -0700)
ChangeLog		patch \| blob \| blame \| history
sysdeps/aarch64/multiarch/memcpy_thunderx.S		patch \| blob \| blame \| history
sysdeps/aarch64/multiarch/memcpy_thunderx2.S		patch \| blob \| blame \| history