+#define EXT_SIZE 1 /* Byte shift for this copy sequence: each stored vector is merged by EXT from two adjacent vectors at byte index 16-EXT_SIZE. */
+ ext A_v.16b, C_v.16b, D_v.16b, 16-EXT_SIZE /* A = top EXT_SIZE byte(s) of C followed by the low 16-EXT_SIZE bytes of D. C, D, E, F, G are read before being written here, so they must be loaded by the code that enters this sequence. */
+ ext B_v.16b, D_v.16b, E_v.16b, 16-EXT_SIZE /* B = top EXT_SIZE byte(s) of D followed by the low bytes of E. */
+ subs count, count, 32 /* count -= 32 and set flags for the branch below. */
+ b.ge 2f /* Result >= 0 (signed): enter the main loop at 2; otherwise fall through to the tail at 1. */
+1: /* Tail: store the vectors already merged, finish from E/F/G, then leave via L(copy_long_check32). */
+ stp A_q, B_q, [dst], #32 /* Store the 32 bytes in A:B and advance dst by 32. */
+ ext H_v.16b, E_v.16b, F_v.16b, 16-EXT_SIZE /* H = merge of E and F with the same byte shift. */
+ ext I_v.16b, F_v.16b, G_v.16b, 16-EXT_SIZE /* I = merge of F and G with the same byte shift. */
+ stp H_q, I_q, [dst], #16 /* Stores 32 bytes (H:I) but advances dst by only 16. */
+ add dst, dst, tmp1 /* NOTE(review): tmp1 is set outside this view -- presumably the leftover byte offset that positions the final store; confirm against the caller. */
+ str G_q, [dst], #16 /* Store G unshifted at the adjusted dst (may overlap the I store above depending on tmp1), then advance dst by 16. */
+ b L(copy_long_check32) /* Continue at a label defined outside this fragment. */
+2: /* Main loop: each pass stores 64 bytes to dst and loads 64 bytes from src. */
+ stp A_q, B_q, [dst], #32 /* Store the A:B pair merged before the loop or at the end of the previous pass. */
+ prfm pldl1strm, [src, MEMCPY_PREFETCH_LDR] /* Prefetch hint (load, L1, streaming) for src + MEMCPY_PREFETCH_LDR; the constant is defined outside this view. */
+ ldp D_q, J_q, [src], #32 /* Load the next 32 source bytes into D and J; src += 32. */
+ ext H_v.16b, E_v.16b, F_v.16b, 16-EXT_SIZE /* H = merge of E and F (E and F still hold the values from before this pass). */
+ ext I_v.16b, F_v.16b, G_v.16b, 16-EXT_SIZE /* I = merge of F and G. */
+ mov C_v.16b, G_v.16b /* Keep G in C before G is reloaded; C supplies the leading byte(s) of the next A. */
+ stp H_q, I_q, [dst], #32 /* Store H:I and advance dst by 32. */
+ ldp F_q, G_q, [src], #32 /* Load the following 32 source bytes into F and G; src += 32. */
+ ext A_v.16b, C_v.16b, D_v.16b, 16-EXT_SIZE /* Next A = merge of the saved G (in C) and the new D. */
+ ext B_v.16b, D_v.16b, J_v.16b, 16-EXT_SIZE /* Next B = merge of D and J. */
+ mov E_v.16b, J_v.16b /* J becomes E so the next H is merged from J and the new F. */
+ subs count, count, 64 /* count -= 64 for the 64 bytes handled by this pass; sets flags. */
+ b.ge 2b /* Result >= 0 (signed): run another pass. */
+ b 1b /* Otherwise finish through the tail path at 1. */
+#undef EXT_SIZE /* End of the region that uses this EXT_SIZE value. */