This is the mail archive of the
libc-alpha@sourceware.org
mailing list for the glibc project.
[PATCH] powerpc: P9 vector load instruction change in memcpy and memmove
- From: "Tulio Magno Quites Machado Filho" <tuliom at linux dot vnet dot ibm dot com>
- To: libc-alpha at sourceware dot org
- Cc: raji at linux dot vnet dot ibm dot com
- Date: Thu, 19 Oct 2017 13:25:31 -0200
- Subject: [PATCH] powerpc: P9 vector load instruction change in memcpy and memmove
- Authentication-results: sourceware.org; auth=none
From: Rajalakshmi Srinivasaraghavan <raji@linux.vnet.ibm.com>
POWER9 DD2.1 and earlier has an issue where some cache inhibited
vector load traps to the kernel. To handle this in memcpy
and memmove, lvx/stvx is used for aligned addresses instead
of lxvd2x/stxvd2x. The remaining part of the optimization remains
same as existing POWER7 code.
Reference: https://patchwork.ozlabs.org/patch/814059/
Tested on powerpc64le.
2017-10-19 Rajalakshmi Srinivasaraghavan <raji@linux.vnet.ibm.com>
* sysdeps/powerpc/powerpc64/multiarch/Makefile
(sysdep_routines): Add memcpy_power9 and memmove_power9.
* sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
(memcpy): Add __memcpy_power9 to list of memcpy functions.
(memmove): Add __memmove_power9 to list of memmove functions.
(bcopy): Add __bcopy_power9 to list of bcopy functions.
* sysdeps/powerpc/powerpc64/multiarch/memcpy.c
(memcpy): Add __memcpy_power9 to ifunc list.
* sysdeps/powerpc/powerpc64/power9/memcpy.S: New File.
* sysdeps/powerpc/powerpc64/multiarch/memcpy-power9.S: Likewise.
* sysdeps/powerpc/powerpc64/multiarch/bcopy.c
(bcopy): Add __bcopy_power9 to ifunc list.
* sysdeps/powerpc/powerpc64/multiarch/memmove-power7.S
Change bcopy as __bcopy.
* sysdeps/powerpc/powerpc64/multiarch/memmove.c
(memmove): Add __memmove_power9 to ifunc list.
* sysdeps/powerpc/powerpc64/power7/memmove.S:
Alias bcopy only if not defined before.
* sysdeps/powerpc/powerpc64/multiarch/memmove-power9.S:
New file.
* sysdeps/powerpc/powerpc64/power9/memmove.S: Likewise.
---
sysdeps/powerpc/powerpc64/multiarch/Makefile | 7 +-
sysdeps/powerpc/powerpc64/multiarch/bcopy.c | 6 +-
.../powerpc/powerpc64/multiarch/ifunc-impl-list.c | 6 +
.../powerpc/powerpc64/multiarch/memcpy-power9.S | 26 +
sysdeps/powerpc/powerpc64/multiarch/memcpy.c | 3 +
.../powerpc/powerpc64/multiarch/memmove-power7.S | 4 +-
.../powerpc/powerpc64/multiarch/memmove-power9.S | 29 +
sysdeps/powerpc/powerpc64/multiarch/memmove.c | 5 +-
sysdeps/powerpc/powerpc64/power7/memmove.S | 2 +
sysdeps/powerpc/powerpc64/power9/memcpy.S | 429 +++++++++++
sysdeps/powerpc/powerpc64/power9/memmove.S | 837 +++++++++++++++++++++
11 files changed, 1347 insertions(+), 7 deletions(-)
create mode 100644 sysdeps/powerpc/powerpc64/multiarch/memcpy-power9.S
create mode 100644 sysdeps/powerpc/powerpc64/multiarch/memmove-power9.S
create mode 100644 sysdeps/powerpc/powerpc64/power9/memcpy.S
create mode 100644 sysdeps/powerpc/powerpc64/power9/memmove.S
diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile
index dea49ac..82728fa 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/Makefile
+++ b/sysdeps/powerpc/powerpc64/multiarch/Makefile
@@ -1,6 +1,6 @@
ifeq ($(subdir),string)
-sysdep_routines += memcpy-power7 memcpy-a2 memcpy-power6 memcpy-cell \
- memcpy-power4 memcpy-ppc64 \
+sysdep_routines += memcpy-power9 memcpy-power7 memcpy-a2 memcpy-power6 \
+ memcpy-cell memcpy-power4 memcpy-ppc64 \
memcmp-power8 memcmp-power7 memcmp-power4 memcmp-ppc64 \
memset-power7 memset-power6 memset-power4 \
memset-ppc64 memset-power8 \
@@ -24,7 +24,8 @@ sysdep_routines += memcpy-power7 memcpy-a2 memcpy-power6 memcpy-cell \
stpncpy-power8 stpncpy-power7 stpncpy-ppc64 \
strcmp-power9 strcmp-power8 strcmp-power7 strcmp-ppc64 \
strcat-power8 strcat-power7 strcat-ppc64 \
- memmove-power7 memmove-ppc64 wordcopy-ppc64 bcopy-ppc64 \
+ memmove-power9 memmove-power7 memmove-ppc64 \
+ wordcopy-ppc64 bcopy-ppc64 \
strncpy-power8 strstr-power7 strstr-ppc64 \
strspn-power8 strspn-ppc64 strcspn-power8 strcspn-ppc64 \
strlen-power8 strcasestr-power8 strcasestr-ppc64 \
diff --git a/sysdeps/powerpc/powerpc64/multiarch/bcopy.c b/sysdeps/powerpc/powerpc64/multiarch/bcopy.c
index 05d46e2..4a4ee6e 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/bcopy.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/bcopy.c
@@ -22,8 +22,12 @@
extern __typeof (bcopy) __bcopy_ppc attribute_hidden;
/* __bcopy_power7 symbol is implemented at memmove-power7.S */
extern __typeof (bcopy) __bcopy_power7 attribute_hidden;
+/* __bcopy_power9 symbol is implemented at memmove-power9.S. */
+extern __typeof (bcopy) __bcopy_power9 attribute_hidden;
libc_ifunc (bcopy,
- (hwcap & PPC_FEATURE_HAS_VSX)
+ (hwcap2 & PPC_FEATURE2_ARCH_3_00)
+ ? __bcopy_power9
+ : (hwcap & PPC_FEATURE_HAS_VSX)
? __bcopy_power7
: __bcopy_ppc);
diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
index 6a88536..9040bbc 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
@@ -51,6 +51,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
#ifdef SHARED
/* Support sysdeps/powerpc/powerpc64/multiarch/memcpy.c. */
IFUNC_IMPL (i, name, memcpy,
+ IFUNC_IMPL_ADD (array, i, memcpy, hwcap2 & PPC_FEATURE2_ARCH_3_00,
+ __memcpy_power9)
IFUNC_IMPL_ADD (array, i, memcpy, hwcap & PPC_FEATURE_HAS_VSX,
__memcpy_power7)
IFUNC_IMPL_ADD (array, i, memcpy, hwcap & PPC_FEATURE_ARCH_2_06,
@@ -65,6 +67,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/powerpc/powerpc64/multiarch/memmove.c. */
IFUNC_IMPL (i, name, memmove,
+ IFUNC_IMPL_ADD (array, i, memmove, hwcap2 & PPC_FEATURE2_ARCH_3_00,
+ __memmove_power9)
IFUNC_IMPL_ADD (array, i, memmove, hwcap & PPC_FEATURE_HAS_VSX,
__memmove_power7)
IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_ppc))
@@ -168,6 +172,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/powerpc/powerpc64/multiarch/bcopy.c. */
IFUNC_IMPL (i, name, bcopy,
+ IFUNC_IMPL_ADD (array, i, bcopy, hwcap2 & PPC_FEATURE2_ARCH_3_00,
+ __bcopy_power9)
IFUNC_IMPL_ADD (array, i, bcopy, hwcap & PPC_FEATURE_HAS_VSX,
__bcopy_power7)
IFUNC_IMPL_ADD (array, i, bcopy, 1, __bcopy_ppc))
diff --git a/sysdeps/powerpc/powerpc64/multiarch/memcpy-power9.S b/sysdeps/powerpc/powerpc64/multiarch/memcpy-power9.S
new file mode 100644
index 0000000..fbd0788
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/multiarch/memcpy-power9.S
@@ -0,0 +1,26 @@
+/* Optimized memcpy implementation for PowerPC/POWER9.
+ Copyright (C) 2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+#define MEMCPY __memcpy_power9
+
+#undef libc_hidden_builtin_def
+#define libc_hidden_builtin_def(name)
+
+#include <sysdeps/powerpc/powerpc64/power9/memcpy.S>
diff --git a/sysdeps/powerpc/powerpc64/multiarch/memcpy.c b/sysdeps/powerpc/powerpc64/multiarch/memcpy.c
index 9f4286c..4c16fa0 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/memcpy.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/memcpy.c
@@ -35,8 +35,11 @@ extern __typeof (__redirect_memcpy) __memcpy_cell attribute_hidden;
extern __typeof (__redirect_memcpy) __memcpy_power6 attribute_hidden;
extern __typeof (__redirect_memcpy) __memcpy_a2 attribute_hidden;
extern __typeof (__redirect_memcpy) __memcpy_power7 attribute_hidden;
+extern __typeof (__redirect_memcpy) __memcpy_power9 attribute_hidden;
libc_ifunc (__libc_memcpy,
+ (hwcap2 & PPC_FEATURE2_ARCH_3_00)
+ ? __memcpy_power9 :
(hwcap & PPC_FEATURE_HAS_VSX)
? __memcpy_power7 :
(hwcap & PPC_FEATURE_ARCH_2_06)
diff --git a/sysdeps/powerpc/powerpc64/multiarch/memmove-power7.S b/sysdeps/powerpc/powerpc64/multiarch/memmove-power7.S
index a9435fa..0599a39 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/memmove-power7.S
+++ b/sysdeps/powerpc/powerpc64/multiarch/memmove-power7.S
@@ -23,7 +23,7 @@
#undef libc_hidden_builtin_def
#define libc_hidden_builtin_def(name)
-#undef bcopy
-#define bcopy __bcopy_power7
+#undef __bcopy
+#define __bcopy __bcopy_power7
#include <sysdeps/powerpc/powerpc64/power7/memmove.S>
diff --git a/sysdeps/powerpc/powerpc64/multiarch/memmove-power9.S b/sysdeps/powerpc/powerpc64/multiarch/memmove-power9.S
new file mode 100644
index 0000000..16a2267
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/multiarch/memmove-power9.S
@@ -0,0 +1,29 @@
+/* Optimized memmove implementation for PowerPC64/POWER7.
+ Copyright (C) 2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+#define MEMMOVE __memmove_power9
+
+#undef libc_hidden_builtin_def
+#define libc_hidden_builtin_def(name)
+
+#undef __bcopy
+#define __bcopy __bcopy_power9
+
+#include <sysdeps/powerpc/powerpc64/power9/memmove.S>
diff --git a/sysdeps/powerpc/powerpc64/multiarch/memmove.c b/sysdeps/powerpc/powerpc64/multiarch/memmove.c
index db2bbc7..f02498e 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/memmove.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/memmove.c
@@ -31,9 +31,12 @@ extern __typeof (__redirect_memmove) __libc_memmove;
extern __typeof (__redirect_memmove) __memmove_ppc attribute_hidden;
extern __typeof (__redirect_memmove) __memmove_power7 attribute_hidden;
+extern __typeof (__redirect_memmove) __memmove_power9 attribute_hidden;
libc_ifunc (__libc_memmove,
- (hwcap & PPC_FEATURE_HAS_VSX)
+ (hwcap2 & PPC_FEATURE2_ARCH_3_00)
+ ? __memmove_power9
+ : (hwcap & PPC_FEATURE_HAS_VSX)
? __memmove_power7
: __memmove_ppc);
diff --git a/sysdeps/powerpc/powerpc64/power7/memmove.S b/sysdeps/powerpc/powerpc64/power7/memmove.S
index 93baa69..0bb8ddc 100644
--- a/sysdeps/powerpc/powerpc64/power7/memmove.S
+++ b/sysdeps/powerpc/powerpc64/power7/memmove.S
@@ -832,4 +832,6 @@ ENTRY_TOCLESS (__bcopy)
mr r4,r6
b L(_memmove)
END (__bcopy)
+#ifndef __bcopy
weak_alias (__bcopy, bcopy)
+#endif
diff --git a/sysdeps/powerpc/powerpc64/power9/memcpy.S b/sysdeps/powerpc/powerpc64/power9/memcpy.S
new file mode 100644
index 0000000..0731bac
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/power9/memcpy.S
@@ -0,0 +1,429 @@
+/* Optimized memcpy implementation for PowerPC64/POWER9.
+ Copyright (C) 2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+
+/* void * [r3] memcpy (void *dst [r3], void *src [r4], size_t len [r5]);
+ Returns 'dst'. */
+
+#ifndef MEMCPY
+# define MEMCPY memcpy
+#endif
+
+#define dst 11 /* Use r11 so r3 kept unchanged. */
+#define src 4
+#define cnt 5
+
+ .machine power7
+ENTRY_TOCLESS (MEMCPY, 5)
+ CALL_MCOUNT 3
+
+ cmpldi cr1,cnt,31
+ neg 0,3
+ ble cr1, L(copy_LT_32) /* If move < 32 bytes use short move
+ code. */
+
+/* Align copies using VSX instructions to quadword. It is to avoid alignment
+ traps when memcpy is used on non-cacheable memory (for instance, memory
+ mapped I/O). */
+ andi. 10,3,15
+ clrldi 11,4,60
+ cmpld cr6,10,11 /* SRC and DST alignments match? */
+
+ mr dst,3
+ bne cr6,L(copy_GE_32_unaligned)
+ beq L(aligned_copy)
+
+ mtocrf 0x01,0
+ clrldi 0,0,60
+
+/* Get the DST and SRC aligned to 16 bytes. */
+1:
+ bf 31,2f
+ lbz 6,0(src)
+ addi src,src,1
+ stb 6,0(dst)
+ addi dst,dst,1
+2:
+ bf 30,4f
+ lhz 6,0(src)
+ addi src,src,2
+ sth 6,0(dst)
+ addi dst,dst,2
+4:
+ bf 29,8f
+ lwz 6,0(src)
+ addi src,src,4
+ stw 6,0(dst)
+ addi dst,dst,4
+8:
+ bf 28,16f
+ ld 6,0(src)
+ addi src,src,8
+ std 6,0(dst)
+ addi dst,dst,8
+16:
+ subf cnt,0,cnt
+
+/* Main aligned copy loop. Copies 128 bytes at a time. */
+L(aligned_copy):
+ li 6,16
+ li 7,32
+ li 8,48
+ mtocrf 0x02,cnt
+ srdi 12,cnt,7
+ cmpdi 12,0
+ beq L(aligned_tail)
+ lvx 6,0,src
+ lvx 7,src,6
+ mtctr 12
+ b L(aligned_128loop)
+
+ .align 4
+L(aligned_128head):
+ /* for the 2nd + iteration of this loop. */
+ lvx 6,0,src
+ lvx 7,src,6
+L(aligned_128loop):
+ lvx 8,src,7
+ lvx 9,src,8
+ stvx 6,0,dst
+ addi src,src,64
+ stvx 7,dst,6
+ stvx 8,dst,7
+ stvx 9,dst,8
+ lvx 6,0,src
+ lvx 7,src,6
+ addi dst,dst,64
+ lvx 8,src,7
+ lvx 9,src,8
+ addi src,src,64
+ stvx 6,0,dst
+ stvx 7,dst,6
+ stvx 8,dst,7
+ stvx 9,dst,8
+ addi dst,dst,64
+ bdnz L(aligned_128head)
+
+L(aligned_tail):
+ mtocrf 0x01,cnt
+ bf 25,32f
+ lvx 6,0,src
+ lvx 7,src,6
+ lvx 8,src,7
+ lvx 9,src,8
+ addi src,src,64
+ stvx 6,0,dst
+ stvx 7,dst,6
+ stvx 8,dst,7
+ stvx 9,dst,8
+ addi dst,dst,64
+32:
+ bf 26,16f
+ lvx 6,0,src
+ lvx 7,src,6
+ addi src,src,32
+ stvx 6,0,dst
+ stvx 7,dst,6
+ addi dst,dst,32
+16:
+ bf 27,8f
+ lvx 6,0,src
+ addi src,src,16
+ stvx 6,0,dst
+ addi dst,dst,16
+8:
+ bf 28,4f
+ ld 6,0(src)
+ addi src,src,8
+ std 6,0(dst)
+ addi dst,dst,8
+4: /* Copies 4~7 bytes. */
+ bf 29,L(tail2)
+ lwz 6,0(src)
+ stw 6,0(dst)
+ bf 30,L(tail5)
+ lhz 7,4(src)
+ sth 7,4(dst)
+ bflr 31
+ lbz 8,6(src)
+ stb 8,6(dst)
+ /* Return original DST pointer. */
+ blr
+
+
+/* Handle copies of 0~31 bytes. */
+ .align 4
+L(copy_LT_32):
+ mr dst,3
+ cmpldi cr6,cnt,8
+ mtocrf 0x01,cnt
+ ble cr6,L(copy_LE_8)
+
+ /* At least 9 bytes to go. */
+ neg 8,4
+ andi. 0,8,3
+ cmpldi cr1,cnt,16
+ beq L(copy_LT_32_aligned)
+
+ /* Force 4-byte alignment for SRC. */
+ mtocrf 0x01,0
+ subf cnt,0,cnt
+2:
+ bf 30,1f
+ lhz 6,0(src)
+ addi src,src,2
+ sth 6,0(dst)
+ addi dst,dst,2
+1:
+ bf 31,L(end_4bytes_alignment)
+ lbz 6,0(src)
+ addi src,src,1
+ stb 6,0(dst)
+ addi dst,dst,1
+
+ .align 4
+L(end_4bytes_alignment):
+ cmpldi cr1,cnt,16
+ mtocrf 0x01,cnt
+
+L(copy_LT_32_aligned):
+ /* At least 6 bytes to go, and SRC is word-aligned. */
+ blt cr1,8f
+
+ /* Copy 16 bytes. */
+ lwz 6,0(src)
+ lwz 7,4(src)
+ stw 6,0(dst)
+ lwz 8,8(src)
+ stw 7,4(dst)
+ lwz 6,12(src)
+ addi src,src,16
+ stw 8,8(dst)
+ stw 6,12(dst)
+ addi dst,dst,16
+8: /* Copy 8 bytes. */
+ bf 28,L(tail4)
+ lwz 6,0(src)
+ lwz 7,4(src)
+ addi src,src,8
+ stw 6,0(dst)
+ stw 7,4(dst)
+ addi dst,dst,8
+
+ .align 4
+/* Copies 4~7 bytes. */
+L(tail4):
+ bf 29,L(tail2)
+ lwz 6,0(src)
+ stw 6,0(dst)
+ bf 30,L(tail5)
+ lhz 7,4(src)
+ sth 7,4(dst)
+ bflr 31
+ lbz 8,6(src)
+ stb 8,6(dst)
+ /* Return original DST pointer. */
+ blr
+
+ .align 4
+/* Copies 2~3 bytes. */
+L(tail2):
+ bf 30,1f
+ lhz 6,0(src)
+ sth 6,0(dst)
+ bflr 31
+ lbz 7,2(src)
+ stb 7,2(dst)
+ blr
+
+ .align 4
+L(tail5):
+ bflr 31
+ lbz 6,4(src)
+ stb 6,4(dst)
+ blr
+
+ .align 4
+1:
+ bflr 31
+ lbz 6,0(src)
+ stb 6,0(dst)
+ /* Return original DST pointer. */
+ blr
+
+
+/* Handles copies of 0~8 bytes. */
+ .align 4
+L(copy_LE_8):
+ bne cr6,L(tail4)
+
+ /* Though we could've used ld/std here, they are still
+ slow for unaligned cases. */
+
+ lwz 6,0(src)
+ lwz 7,4(src)
+ stw 6,0(dst)
+ stw 7,4(dst)
+ blr
+
+
+/* Handle copies of 32+ bytes where DST is aligned (to quadword) but
+ SRC is not. Use aligned quadword loads from SRC, shifted to realign
+ the data, allowing for aligned DST stores. */
+ .align 4
+L(copy_GE_32_unaligned):
+ clrldi 0,0,60 /* Number of bytes until the 1st dst quadword. */
+ srdi 9,cnt,4 /* Number of full quadwords remaining. */
+
+ beq L(copy_GE_32_unaligned_cont)
+
+ /* DST is not quadword aligned, get it aligned. */
+
+ mtocrf 0x01,0
+ subf cnt,0,cnt
+
+ /* Vector instructions work best when proper alignment (16-bytes)
+ is present. Move 0~15 bytes as needed to get DST quadword-aligned. */
+1:
+ bf 31,2f
+ lbz 6,0(src)
+ addi src,src,1
+ stb 6,0(dst)
+ addi dst,dst,1
+2:
+ bf 30,4f
+ lhz 6,0(src)
+ addi src,src,2
+ sth 6,0(dst)
+ addi dst,dst,2
+4:
+ bf 29,8f
+ lwz 6,0(src)
+ addi src,src,4
+ stw 6,0(dst)
+ addi dst,dst,4
+8:
+ bf 28,0f
+ ld 6,0(src)
+ addi src,src,8
+ std 6,0(dst)
+ addi dst,dst,8
+0:
+ srdi 9,cnt,4 /* Number of full quadwords remaining. */
+
+ /* The proper alignment is present, it is OK to copy the bytes now. */
+L(copy_GE_32_unaligned_cont):
+
+ /* Setup two indexes to speed up the indexed vector operations. */
+ clrldi 10,cnt,60
+ li 6,16 /* Index for 16-bytes offsets. */
+ li 7,32 /* Index for 32-bytes offsets. */
+ cmpldi cr1,10,0
+ srdi 8,cnt,5 /* Setup the loop counter. */
+ mtocrf 0x01,9
+ cmpldi cr6,9,1
+#ifdef __LITTLE_ENDIAN__
+ lvsr 5,0,src
+#else
+ lvsl 5,0,src
+#endif
+ lvx 3,0,src
+ li 0,0
+ bf 31,L(setup_unaligned_loop)
+
+ /* Copy another 16 bytes to align to 32-bytes due to the loop. */
+ lvx 4,src,6
+#ifdef __LITTLE_ENDIAN__
+ vperm 6,4,3,5
+#else
+ vperm 6,3,4,5
+#endif
+ addi src,src,16
+ stvx 6,0,dst
+ addi dst,dst,16
+ vor 3,4,4
+ clrrdi 0,src,60
+
+L(setup_unaligned_loop):
+ mtctr 8
+ ble cr6,L(end_unaligned_loop)
+
+ /* Copy 32 bytes at a time using vector instructions. */
+ .align 4
+L(unaligned_loop):
+
+ /* Note: vr6/vr10 may contain data that was already copied,
+ but in order to get proper alignment, we may have to copy
+ some portions again. This is faster than having unaligned
+ vector instructions though. */
+
+ lvx 4,src,6
+#ifdef __LITTLE_ENDIAN__
+ vperm 6,4,3,5
+#else
+ vperm 6,3,4,5
+#endif
+ lvx 3,src,7
+#ifdef __LITTLE_ENDIAN__
+ vperm 10,3,4,5
+#else
+ vperm 10,4,3,5
+#endif
+ addi src,src,32
+ stvx 6,0,dst
+ stvx 10,dst,6
+ addi dst,dst,32
+ bdnz L(unaligned_loop)
+
+ clrrdi 0,src,60
+
+ .align 4
+L(end_unaligned_loop):
+
+ /* Check for tail bytes. */
+ mtocrf 0x01,cnt
+ beqlr cr1
+
+ add src,src,0
+
+ /* We have 1~15 tail bytes to copy, and DST is quadword aligned. */
+ /* Copy 8 bytes. */
+ bf 28,4f
+ lwz 6,0(src)
+ lwz 7,4(src)
+ addi src,src,8
+ stw 6,0(dst)
+ stw 7,4(dst)
+ addi dst,dst,8
+4: /* Copy 4~7 bytes. */
+ bf 29,L(tail2)
+ lwz 6,0(src)
+ stw 6,0(dst)
+ bf 30,L(tail5)
+ lhz 7,4(src)
+ sth 7,4(dst)
+ bflr 31
+ lbz 8,6(src)
+ stb 8,6(dst)
+ /* Return original DST pointer. */
+ blr
+
+END_GEN_TB (MEMCPY,TB_TOCLESS)
+libc_hidden_builtin_def (memcpy)
diff --git a/sysdeps/powerpc/powerpc64/power9/memmove.S b/sysdeps/powerpc/powerpc64/power9/memmove.S
new file mode 100644
index 0000000..9ed8f77
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/power9/memmove.S
@@ -0,0 +1,837 @@
+/* Optimized memmove implementation for PowerPC64/POWER7.
+ Copyright (C) 2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+
+/* void* [r3] memmove (void *dest [r3], const void *src [r4], size_t len [r5])
+
+ This optimization check if memory 'dest' overlaps with 'src'. If it does
+ not then it calls an optimized memcpy call (similar to memcpy for POWER7,
+ embedded here to gain some cycles).
+ If source and destiny overlaps, a optimized backwards memcpy is used
+ instead. */
+
+#ifndef MEMMOVE
+# define MEMMOVE memmove
+#endif
+ .machine power7
+ENTRY_TOCLESS (MEMMOVE, 5)
+ CALL_MCOUNT 3
+
+L(_memmove):
+ subf r9,r4,r3
+ cmpld cr7,r9,r5
+ blt cr7,L(memmove_bwd)
+
+ cmpldi cr1,r5,31
+ neg 0,3
+ ble cr1, L(copy_LT_32) /* If move < 32 bytes use short move
+ code. */
+
+ andi. 10,3,15
+ clrldi 11,4,60
+ cmpld cr6,10,11 /* SRC and DST alignments match? */
+
+ mr r11,3
+ bne cr6,L(copy_GE_32_unaligned)
+ beq L(aligned_copy)
+
+ mtocrf 0x01,0
+ clrldi 0,0,60
+
+/* Get the DST and SRC aligned to 8 bytes (16 for little-endian). */
+1:
+ bf 31,2f
+ lbz 6,0(r4)
+ addi r4,r4,1
+ stb 6,0(r11)
+ addi r11,r11,1
+2:
+ bf 30,4f
+ lhz 6,0(r4)
+ addi r4,r4,2
+ sth 6,0(r11)
+ addi r11,r11,2
+4:
+ bf 29,8f
+ lwz 6,0(r4)
+ addi r4,r4,4
+ stw 6,0(r11)
+ addi r11,r11,4
+8:
+ bf 28,16f
+ ld 6,0(r4)
+ addi r4,r4,8
+ std 6,0(r11)
+ addi r11,r11,8
+16:
+ subf r5,0,r5
+
+/* Main aligned copy loop. Copies 128 bytes at a time. */
+L(aligned_copy):
+ li 6,16
+ li 7,32
+ li 8,48
+ mtocrf 0x02,r5
+ srdi 12,r5,7
+ cmpdi 12,0
+ beq L(aligned_tail)
+ lvx 6,0,r4
+ lvx 7,r4,6
+ mtctr 12
+ b L(aligned_128loop)
+
+ .align 4
+L(aligned_128head):
+ /* for the 2nd + iteration of this loop. */
+ lvx 6,0,r4
+ lvx 7,r4,6
+L(aligned_128loop):
+ lvx 8,r4,7
+ lvx 9,r4,8
+ stvx 6,0,r11
+ addi r4,r4,64
+ stvx 7,r11,6
+ stvx 8,r11,7
+ stvx 9,r11,8
+ lvx 6,0,r4
+ lvx 7,r4,6
+ addi r11,r11,64
+ lvx 8,r4,7
+ lvx 9,r4,8
+ addi r4,r4,64
+ stvx 6,0,r11
+ stvx 7,r11,6
+ stvx 8,r11,7
+ stvx 9,r11,8
+ addi r11,r11,64
+ bdnz L(aligned_128head)
+
+L(aligned_tail):
+ mtocrf 0x01,r5
+ bf 25,32f
+ lvx 6,0,r4
+ lvx 7,r4,6
+ lvx 8,r4,7
+ lvx 9,r4,8
+ addi r4,r4,64
+ stvx 6,0,r11
+ stvx 7,r11,6
+ stvx 8,r11,7
+ stvx 9,r11,8
+ addi r11,r11,64
+32:
+ bf 26,16f
+ lvx 6,0,r4
+ lvx 7,r4,6
+ addi r4,r4,32
+ stvx 6,0,r11
+ stvx 7,r11,6
+ addi r11,r11,32
+16:
+ bf 27,8f
+ lvx 6,0,r4
+ addi r4,r4,16
+ stvx 6,0,r11
+ addi r11,r11,16
+8:
+ bf 28,4f
+ ld 6,0(r4)
+ addi r4,r4,8
+ std 6,0(r11)
+ addi r11,r11,8
+4: /* Copies 4~7 bytes. */
+ bf 29,L(tail2)
+ lwz 6,0(r4)
+ stw 6,0(r11)
+ bf 30,L(tail5)
+ lhz 7,4(r4)
+ sth 7,4(r11)
+ bflr 31
+ lbz 8,6(r4)
+ stb 8,6(r11)
+ /* Return original DST pointer. */
+ blr
+
+/* Handle copies of 0~31 bytes. */
+ .align 4
+L(copy_LT_32):
+ mr r11,3
+ cmpldi cr6,r5,8
+ mtocrf 0x01,r5
+ ble cr6,L(copy_LE_8)
+
+ /* At least 9 bytes to go. */
+ neg 8,4
+ andi. 0,8,3
+ cmpldi cr1,r5,16
+ beq L(copy_LT_32_aligned)
+
+ /* Force 4-byte alignment for SRC. */
+ mtocrf 0x01,0
+ subf r5,0,r5
+2:
+ bf 30,1f
+ lhz 6,0(r4)
+ addi r4,r4,2
+ sth 6,0(r11)
+ addi r11,r11,2
+1:
+ bf 31,L(end_4bytes_alignment)
+ lbz 6,0(r4)
+ addi r4,r4,1
+ stb 6,0(r11)
+ addi r11,r11,1
+
+ .align 4
+L(end_4bytes_alignment):
+ cmpldi cr1,r5,16
+ mtocrf 0x01,r5
+
+L(copy_LT_32_aligned):
+ /* At least 6 bytes to go, and SRC is word-aligned. */
+ blt cr1,8f
+
+ /* Copy 16 bytes. */
+ lwz 6,0(r4)
+ lwz 7,4(r4)
+ stw 6,0(r11)
+ lwz 8,8(r4)
+ stw 7,4(r11)
+ lwz 6,12(r4)
+ addi r4,r4,16
+ stw 8,8(r11)
+ stw 6,12(r11)
+ addi r11,r11,16
+8: /* Copy 8 bytes. */
+ bf 28,L(tail4)
+ lwz 6,0(r4)
+ lwz 7,4(r4)
+ addi r4,r4,8
+ stw 6,0(r11)
+ stw 7,4(r11)
+ addi r11,r11,8
+
+ .align 4
+/* Copies 4~7 bytes. */
+L(tail4):
+ bf 29,L(tail2)
+ lwz 6,0(r4)
+ stw 6,0(r11)
+ bf 30,L(tail5)
+ lhz 7,4(r4)
+ sth 7,4(r11)
+ bflr 31
+ lbz 8,6(r4)
+ stb 8,6(r11)
+ /* Return original DST pointer. */
+ blr
+
+ .align 4
+/* Copies 2~3 bytes. */
+L(tail2):
+ bf 30,1f
+ lhz 6,0(r4)
+ sth 6,0(r11)
+ bflr 31
+ lbz 7,2(r4)
+ stb 7,2(r11)
+ blr
+
+ .align 4
+L(tail5):
+ bflr 31
+ lbz 6,4(r4)
+ stb 6,4(r11)
+ blr
+
+ .align 4
+1:
+ bflr 31
+ lbz 6,0(r4)
+ stb 6,0(r11)
+ /* Return original DST pointer. */
+ blr
+
+/* Handles copies of 0~8 bytes. */
+ .align 4
+L(copy_LE_8):
+ bne cr6,L(tail4)
+
+ /* Though we could've used ld/std here, they are still
+ slow for unaligned cases. */
+
+ lwz 6,0(r4)
+ lwz 7,4(r4)
+ stw 6,0(r11)
+ stw 7,4(r11)
+ blr
+
+
+/* Handle copies of 32+ bytes where DST is aligned (to quadword) but
+ SRC is not. Use aligned quadword loads from SRC, shifted to realign
+ the data, allowing for aligned DST stores. */
+ .align 4
+L(copy_GE_32_unaligned):
+ clrldi 0,0,60 /* Number of bytes until the 1st r11 quadword. */
+ srdi 9,r5,4 /* Number of full quadwords remaining. */
+
+ beq L(copy_GE_32_unaligned_cont)
+
+ /* DST is not quadword aligned, get it aligned. */
+
+ mtocrf 0x01,0
+ subf r5,0,r5
+
+ /* Vector instructions work best when proper alignment (16-bytes)
+ is present. Move 0~15 bytes as needed to get DST quadword-aligned. */
+1:
+ bf 31,2f
+ lbz 6,0(r4)
+ addi r4,r4,1
+ stb 6,0(r11)
+ addi r11,r11,1
+2:
+ bf 30,4f
+ lhz 6,0(r4)
+ addi r4,r4,2
+ sth 6,0(r11)
+ addi r11,r11,2
+4:
+ bf 29,8f
+ lwz 6,0(r4)
+ addi r4,r4,4
+ stw 6,0(r11)
+ addi r11,r11,4
+8:
+ bf 28,0f
+ ld 6,0(r4)
+ addi r4,r4,8
+ std 6,0(r11)
+ addi r11,r11,8
+0:
+ srdi 9,r5,4 /* Number of full quadwords remaining. */
+
+ /* The proper alignment is present, it is OK to copy the bytes now. */
+L(copy_GE_32_unaligned_cont):
+
+ /* Setup two indexes to speed up the indexed vector operations. */
+ clrldi 10,r5,60
+ li 6,16 /* Index for 16-bytes offsets. */
+ li 7,32 /* Index for 32-bytes offsets. */
+ cmpldi cr1,10,0
+ srdi 8,r5,5 /* Setup the loop counter. */
+ mtocrf 0x01,9
+ cmpldi cr6,9,1
+#ifdef __LITTLE_ENDIAN__
+ lvsr 5,0,r4
+#else
+ lvsl 5,0,r4
+#endif
+ lvx 3,0,r4
+ li 0,0
+ bf 31,L(setup_unaligned_loop)
+
+ /* Copy another 16 bytes to align to 32-bytes due to the loop. */
+ lvx 4,r4,6
+#ifdef __LITTLE_ENDIAN__
+ vperm 6,4,3,5
+#else
+ vperm 6,3,4,5
+#endif
+ addi r4,r4,16
+ stvx 6,0,r11
+ addi r11,r11,16
+ vor 3,4,4
+ clrrdi 0,r4,60
+
+L(setup_unaligned_loop):
+ mtctr 8
+ ble cr6,L(end_unaligned_loop)
+
+ /* Copy 32 bytes at a time using vector instructions. */
+ .align 4
+L(unaligned_loop):
+
+ /* Note: vr6/vr10 may contain data that was already copied,
+ but in order to get proper alignment, we may have to copy
+ some portions again. This is faster than having unaligned
+ vector instructions though. */
+
+ lvx 4,r4,6
+#ifdef __LITTLE_ENDIAN__
+ vperm 6,4,3,5
+#else
+ vperm 6,3,4,5
+#endif
+ lvx 3,r4,7
+#ifdef __LITTLE_ENDIAN__
+ vperm 10,3,4,5
+#else
+ vperm 10,4,3,5
+#endif
+ addi r4,r4,32
+ stvx 6,0,r11
+ stvx 10,r11,6
+ addi r11,r11,32
+ bdnz L(unaligned_loop)
+
+ clrrdi 0,r4,60
+
+ .align 4
+L(end_unaligned_loop):
+
+ /* Check for tail bytes. */
+ mtocrf 0x01,r5
+ beqlr cr1
+
+ add r4,r4,0
+
+ /* We have 1~15 tail bytes to copy, and DST is quadword aligned. */
+ /* Copy 8 bytes. */
+ bf 28,4f
+ lwz 6,0(r4)
+ lwz 7,4(r4)
+ addi r4,r4,8
+ stw 6,0(r11)
+ stw 7,4(r11)
+ addi r11,r11,8
+4: /* Copy 4~7 bytes. */
+ bf 29,L(tail2)
+ lwz 6,0(r4)
+ stw 6,0(r11)
+ bf 30,L(tail5)
+ lhz 7,4(r4)
+ sth 7,4(r11)
+ bflr 31
+ lbz 8,6(r4)
+ stb 8,6(r11)
+ /* Return original DST pointer. */
+ blr
+
+ /* Start to memcpy backward implementation: the algorith first check if
+ src and dest have the same alignment and if it does align both to 16
+ bytes and copy using VSX instructions.
+ If does not, align dest to 16 bytes and use VMX (altivec) instruction
+ to read two 16 bytes at time, shift/permute the bytes read and write
+ aligned to dest. */
+L(memmove_bwd):
+ cmpldi cr1,r5,31
+ /* Copy is done backwards: update the pointers and check alignment. */
+ add r11,r3,r5
+ add r4,r4,r5
+ mr r0,r11
+ ble cr1, L(copy_LT_32_bwd) /* If move < 32 bytes use short move
+ code. */
+
+ andi. r10,r11,15 /* Check if r11 is aligned to 16 bytes */
+ clrldi r9,r4,60 /* Check if r4 is aligned to 16 bytes */
+ cmpld cr6,r10,r9 /* SRC and DST alignments match? */
+
+ bne cr6,L(copy_GE_32_unaligned_bwd)
+ beq L(aligned_copy_bwd)
+
+ mtocrf 0x01,r0
+ clrldi r0,r0,60
+
+/* Get the DST and SRC aligned to 16 bytes. */
+1:
+ bf 31,2f
+ lbz r6,-1(r4)
+ subi r4,r4,1
+ stb r6,-1(r11)
+ subi r11,r11,1
+2:
+ bf 30,4f
+ lhz r6,-2(r4)
+ subi r4,r4,2
+ sth r6,-2(r11)
+ subi r11,r11,2
+4:
+ bf 29,8f
+ lwz r6,-4(r4)
+ subi r4,r4,4
+ stw r6,-4(r11)
+ subi r11,r11,4
+8:
+ bf 28,16f
+ ld r6,-8(r4)
+ subi r4,r4,8
+ std r6,-8(r11)
+ subi r11,r11,8
+16:
+ subf r5,0,r5
+
+/* Main aligned copy loop. Copies 128 bytes at a time. */
+L(aligned_copy_bwd):
+ li r6,-16
+ li r7,-32
+ li r8,-48
+ li r9,-64
+ mtocrf 0x02,r5
+ srdi r12,r5,7
+ cmpdi r12,0
+ beq L(aligned_tail_bwd)
+ lvx v6,r4,r6
+ lvx v7,r4,r7
+ mtctr 12
+ b L(aligned_128loop_bwd)
+
+ .align 4
+L(aligned_128head_bwd):
+ /* for the 2nd + iteration of this loop. */
+ lvx v6,r4,r6
+ lvx v7,r4,r7
+L(aligned_128loop_bwd):
+ lvx v8,r4,r8
+ lvx v9,r4,r9
+ stvx v6,r11,r6
+ subi r4,r4,64
+ stvx v7,r11,r7
+ stvx v8,r11,r8
+ stvx v9,r11,r9
+ lvx v6,r4,r6
+ lvx v7,r4,7
+ subi r11,r11,64
+ lvx v8,r4,r8
+ lvx v9,r4,r9
+ subi r4,r4,64
+ stvx v6,r11,r6
+ stvx v7,r11,r7
+ stvx v8,r11,r8
+ stvx v9,r11,r9
+ subi r11,r11,64
+ bdnz L(aligned_128head_bwd)
+
+L(aligned_tail_bwd):
+ mtocrf 0x01,r5
+ bf 25,32f
+ lvx v6,r4,r6
+ lvx v7,r4,r7
+ lvx v8,r4,r8
+ lvx v9,r4,r9
+ subi r4,r4,64
+ stvx v6,r11,r6
+ stvx v7,r11,r7
+ stvx v8,r11,r8
+ stvx v9,r11,r9
+ subi r11,r11,64
+32:
+ bf 26,16f
+ lvx v6,r4,r6
+ lvx v7,r4,r7
+ subi r4,r4,32
+ stvx v6,r11,r6
+ stvx v7,r11,r7
+ subi r11,r11,32
+16:
+ bf 27,8f
+ lvx v6,r4,r6
+ subi r4,r4,16
+ stvx v6,r11,r6
+ subi r11,r11,16
+8:
+ bf 28,4f
+ ld r6,-8(r4)
+ subi r4,r4,8
+ std r6,-8(r11)
+ subi r11,r11,8
+4: /* Copies 4~7 bytes. */
+ bf 29,L(tail2_bwd)
+ lwz r6,-4(r4)
+ stw r6,-4(r11)
+ bf 30,L(tail5_bwd)
+ lhz r7,-6(r4)
+ sth r7,-6(r11)
+ bflr 31
+ lbz r8,-7(r4)
+ stb r8,-7(r11)
+ /* Return original DST pointer. */
+ blr
+
+/* Handle copies of 0~31 bytes. */
+ .align 4
+L(copy_LT_32_bwd):
+ cmpldi cr6,r5,8
+ mtocrf 0x01,r5
+ ble cr6,L(copy_LE_8_bwd)
+
+ /* At least 9 bytes to go. */
+ neg r8,r4
+ andi. r0,r8,3
+ cmpldi cr1,r5,16
+ beq L(copy_LT_32_aligned_bwd)
+
+ /* Force 4-byte alignment for SRC. */
+ mtocrf 0x01,0
+ subf r5,0,r5
+2:
+ bf 30,1f
+ lhz r6,-2(r4)
+ subi r4,r4,2
+ sth r6,-2(r11)
+ subi r11,r11,2
+1:
+ bf 31,L(end_4bytes_alignment_bwd)
+ lbz 6,-1(r4)
+ subi r4,r4,1
+ stb 6,-1(r11)
+ subi r11,r11,1
+
+ .align 4
+L(end_4bytes_alignment_bwd):
+ cmpldi cr1,r5,16
+ mtocrf 0x01,r5
+
+L(copy_LT_32_aligned_bwd):
+ /* At least 6 bytes to go, and SRC is word-aligned. */
+ blt cr1,8f
+
+ /* Copy 16 bytes. */
+ lwz r6,-4(r4)
+ lwz r7,-8(r4)
+ stw r6,-4(r11)
+ lwz r8,-12(r4)
+ stw r7,-8(r11)
+ lwz r6,-16(r4)
+ subi r4,r4,16
+ stw r8,-12(r11)
+ stw r6,-16(r11)
+ subi r11,r11,16
+8: /* Copy 8 bytes. */
+ bf 28,L(tail4_bwd)
+ lwz r6,-4(r4)
+ lwz r7,-8(r4)
+ subi r4,r4,8
+ stw r6,-4(r11)
+ stw r7,-8(r11)
+ subi r11,r11,8
+
+ .align 4
+/* Copies 4~7 bytes. */
+L(tail4_bwd):
+ bf 29,L(tail2_bwd)
+ lwz 6,-4(r4)
+ stw 6,-4(r11)
+ bf 30,L(tail5_bwd)
+ lhz 7,-6(r4)
+ sth 7,-6(r11)
+ bflr 31
+ lbz 8,-7(r4)
+ stb 8,-7(r11)
+ /* Return original DST pointer. */
+ blr
+
+ .align 4
+/* Copies 2~3 bytes. */
+L(tail2_bwd):
+ bf 30,1f
+ lhz 6,-2(r4)
+ sth 6,-2(r11)
+ bflr 31
+ lbz 7,-3(r4)
+ stb 7,-3(r11)
+ blr
+
+ .align 4
+L(tail5_bwd):
+ bflr 31
+ lbz 6,-5(r4)
+ stb 6,-5(r11)
+ blr
+
+ .align 4
+1:
+ bflr 31
+ lbz 6,-1(r4)
+ stb 6,-1(r11)
+ /* Return original DST pointer. */
+ blr
+
+
+/* Handles copies of 0~8 bytes. */
+ .align 4
+L(copy_LE_8_bwd):
+ bne cr6,L(tail4_bwd)
+
+ /* Though we could've used ld/std here, they are still
+ slow for unaligned cases. */
+ lwz 6,-8(r4)
+ lwz 7,-4(r4)
+ stw 6,-8(r11)
+ stw 7,-4(r11)
+ blr
+
+
+/* Handle copies of 32+ bytes where DST is aligned (to quadword) but
+ SRC is not. Use aligned quadword loads from SRC, shifted to realign
+ the data, allowing for aligned DST stores. */
+ .align 4
+L(copy_GE_32_unaligned_bwd):
+ andi. r10,r11,15 /* Check alignment of DST against 16 bytes.. */
+ srdi r9,r5,4 /* Number of full quadwords remaining. */
+
+ beq L(copy_GE_32_unaligned_cont_bwd)
+
+ /* DST is not quadword aligned and r10 holds the address masked to
+ compare alignments. */
+ mtocrf 0x01,r10
+ subf r5,r10,r5
+
+ /* Vector instructions work best when proper alignment (16-bytes)
+ is present. Move 0~15 bytes as needed to get DST quadword-aligned. */
+1:
+ bf 31,2f
+ lbz r6,-1(r4)
+ subi r4,r4,1
+ stb r6,-1(r11)
+ subi r11,r11,1
+2:
+ bf 30,4f
+ lhz r6,-2(r4)
+ subi r4,r4,2
+ sth r6,-2(r11)
+ subi r11,r11,2
+4:
+ bf 29,8f
+ lwz r6,-4(r4)
+ subi r4,r4,4
+ stw r6,-4(r11)
+ subi r11,r11,4
+8:
+ bf 28,0f
+ ld r6,-8(r4)
+ subi r4,r4,8
+ std r6,-8(r11)
+ subi r11,r11,8
+0:
+ srdi r9,r5,4 /* Number of full quadwords remaining. */
+
+ /* The proper alignment is present, it is OK to copy the bytes now. */
+L(copy_GE_32_unaligned_cont_bwd):
+
+ /* Setup two indexes to speed up the indexed vector operations. */
+ clrldi r10,r5,60
+ li r6,-16 /* Index for 16-bytes offsets. */
+ li r7,-32 /* Index for 32-bytes offsets. */
+ cmpldi cr1,10,0
+ srdi r8,r5,5 /* Setup the loop counter. */
+ mtocrf 0x01,9
+ cmpldi cr6,r9,1
+#ifdef __LITTLE_ENDIAN__
+ lvsr v5,r0,r4
+#else
+ lvsl v5,r0,r4
+#endif
+ lvx v3,0,r4
+ li r0,0
+ bf 31,L(setup_unaligned_loop_bwd)
+
+ /* Copy another 16 bytes to align to 32-bytes due to the loop. */
+ lvx v4,r4,r6
+#ifdef __LITTLE_ENDIAN__
+ vperm v6,v3,v4,v5
+#else
+ vperm v6,v4,v3,v5
+#endif
+ subi r4,r4,16
+ stvx v6,r11,r6
+ subi r11,r11,16
+ vor v3,v4,v4
+ clrrdi r0,r4,60
+
+L(setup_unaligned_loop_bwd):
+ mtctr r8
+ ble cr6,L(end_unaligned_loop_bwd)
+
+ /* Copy 32 bytes at a time using vector instructions. */
+ .align 4
+L(unaligned_loop_bwd):
+
+ /* Note: vr6/vr10 may contain data that was already copied,
+ but in order to get proper alignment, we may have to copy
+ some portions again. This is faster than having unaligned
+ vector instructions though. */
+
+ lvx v4,r4,r6
+#ifdef __LITTLE_ENDIAN__
+ vperm v6,v3,v4,v5
+#else
+ vperm v6,v4,v3,v5
+#endif
+ lvx v3,r4,r7
+#ifdef __LITTLE_ENDIAN__
+ vperm v10,v4,v3,v5
+#else
+ vperm v10,v3,v4,v5
+#endif
+ subi r4,r4,32
+ stvx v6,r11,r6
+ stvx v10,r11,r7
+ subi r11,r11,32
+ bdnz L(unaligned_loop_bwd)
+
+ clrrdi r0,r4,60
+
+ .align 4
+L(end_unaligned_loop_bwd):
+
+ /* Check for tail bytes. */
+ mtocrf 0x01,r5
+ beqlr cr1
+
+ add r4,r4,0
+
+ /* We have 1~15 tail bytes to copy, and DST is quadword aligned. */
+ /* Copy 8 bytes. */
+ bf 28,4f
+ lwz r6,-4(r4)
+ lwz r7,-8(r4)
+ subi r4,r4,8
+ stw r6,-4(r11)
+ stw r7,-8(r11)
+ subi r11,r11,8
+4: /* Copy 4~7 bytes. */
+ bf 29,L(tail2_bwd)
+ lwz r6,-4(r4)
+ stw r6,-4(r11)
+ bf 30,L(tail5_bwd)
+ lhz r7,-6(r4)
+ sth r7,-6(r11)
+ bflr 31
+ lbz r8,-7(r4)
+ stb r8,-7(r11)
+ /* Return original DST pointer. */
+ blr
+END_GEN_TB (MEMMOVE, TB_TOCLESS)
+libc_hidden_builtin_def (memmove)
+
+
+/* void bcopy(const void *src [r3], void *dest [r4], size_t n [r5])
+ Implemented in this file to avoid linker create a stub function call
+ in the branch to '_memmove'. */
+ENTRY_TOCLESS (__bcopy)
+ mr r6,r3
+ mr r3,r4
+ mr r4,r6
+ b L(_memmove)
+END (__bcopy)
+#ifndef __bcopy
+weak_alias (__bcopy, bcopy)
+#endif
--
2.9.5