This is the mail archive of the libc-alpha@sourceware.org mailing list for the glibc project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

[PATCH] powerpc: P9 vector load instruction change in memcpy and memmove


From: Rajalakshmi Srinivasaraghavan <raji@linux.vnet.ibm.com>

POWER9 DD2.1 and earlier has an issue where some cache inhibited
vector load traps to the kernel.  To handle this in memcpy
and memmove, lvx/stvx is used for aligned addresses instead
of lxvd2x/stxvd2x.  The remaining part of the optimization remains
same as existing POWER7 code.

Reference: https://patchwork.ozlabs.org/patch/814059/
Tested on powerpc64le.

2017-10-19  Rajalakshmi Srinivasaraghavan  <raji@linux.vnet.ibm.com>

	* sysdeps/powerpc/powerpc64/multiarch/Makefile
	(sysdep_routines): Add memcpy_power9 and memmove_power9.
	* sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
	(memcpy): Add __memcpy_power9 to list of memcpy functions.
	(memmove): Add __memmove_power9 to list of memmove functions.
	(bcopy): Add __bcopy_power9 to list of bcopy functions.
	* sysdeps/powerpc/powerpc64/multiarch/memcpy.c
	(memcpy): Add __memcpy_power9 to ifunc list.
	* sysdeps/powerpc/powerpc64/power9/memcpy.S: New File.
	* sysdeps/powerpc/powerpc64/multiarch/memcpy-power9.S: Likewise.
 	* sysdeps/powerpc/powerpc64/multiarch/bcopy.c
	(bcopy): Add __bcopy_power9 to ifunc list.
	* sysdeps/powerpc/powerpc64/multiarch/memmove-power7.S
	Change bcopy as __bcopy.
	* sysdeps/powerpc/powerpc64/multiarch/memmove.c
	(memmove): Add __memmove_power9 to ifunc list.
	* sysdeps/powerpc/powerpc64/power7/memmove.S:
	Alias bcopy only if not defined before.
	* sysdeps/powerpc/powerpc64/multiarch/memmove-power9.S:
	New file.
	* sysdeps/powerpc/powerpc64/power9/memmove.S: Likewise.
---
 sysdeps/powerpc/powerpc64/multiarch/Makefile       |   7 +-
 sysdeps/powerpc/powerpc64/multiarch/bcopy.c        |   6 +-
 .../powerpc/powerpc64/multiarch/ifunc-impl-list.c  |   6 +
 .../powerpc/powerpc64/multiarch/memcpy-power9.S    |  26 +
 sysdeps/powerpc/powerpc64/multiarch/memcpy.c       |   3 +
 .../powerpc/powerpc64/multiarch/memmove-power7.S   |   4 +-
 .../powerpc/powerpc64/multiarch/memmove-power9.S   |  29 +
 sysdeps/powerpc/powerpc64/multiarch/memmove.c      |   5 +-
 sysdeps/powerpc/powerpc64/power7/memmove.S         |   2 +
 sysdeps/powerpc/powerpc64/power9/memcpy.S          | 429 +++++++++++
 sysdeps/powerpc/powerpc64/power9/memmove.S         | 837 +++++++++++++++++++++
 11 files changed, 1347 insertions(+), 7 deletions(-)
 create mode 100644 sysdeps/powerpc/powerpc64/multiarch/memcpy-power9.S
 create mode 100644 sysdeps/powerpc/powerpc64/multiarch/memmove-power9.S
 create mode 100644 sysdeps/powerpc/powerpc64/power9/memcpy.S
 create mode 100644 sysdeps/powerpc/powerpc64/power9/memmove.S

diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile
index dea49ac..82728fa 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/Makefile
+++ b/sysdeps/powerpc/powerpc64/multiarch/Makefile
@@ -1,6 +1,6 @@
 ifeq ($(subdir),string)
-sysdep_routines += memcpy-power7 memcpy-a2 memcpy-power6 memcpy-cell \
-		   memcpy-power4 memcpy-ppc64 \
+sysdep_routines += memcpy-power9 memcpy-power7 memcpy-a2 memcpy-power6 \
+		   memcpy-cell memcpy-power4 memcpy-ppc64 \
 		   memcmp-power8 memcmp-power7 memcmp-power4 memcmp-ppc64 \
 		   memset-power7 memset-power6 memset-power4 \
 		   memset-ppc64 memset-power8 \
@@ -24,7 +24,8 @@ sysdep_routines += memcpy-power7 memcpy-a2 memcpy-power6 memcpy-cell \
 		   stpncpy-power8 stpncpy-power7 stpncpy-ppc64 \
 		   strcmp-power9 strcmp-power8 strcmp-power7 strcmp-ppc64 \
 		   strcat-power8 strcat-power7 strcat-ppc64 \
-		   memmove-power7 memmove-ppc64 wordcopy-ppc64 bcopy-ppc64 \
+		   memmove-power9 memmove-power7 memmove-ppc64 \
+		   wordcopy-ppc64 bcopy-ppc64 \
 		   strncpy-power8 strstr-power7 strstr-ppc64 \
 		   strspn-power8 strspn-ppc64 strcspn-power8 strcspn-ppc64 \
 		   strlen-power8 strcasestr-power8 strcasestr-ppc64 \
diff --git a/sysdeps/powerpc/powerpc64/multiarch/bcopy.c b/sysdeps/powerpc/powerpc64/multiarch/bcopy.c
index 05d46e2..4a4ee6e 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/bcopy.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/bcopy.c
@@ -22,8 +22,12 @@
 extern __typeof (bcopy) __bcopy_ppc attribute_hidden;
 /* __bcopy_power7 symbol is implemented at memmove-power7.S  */
 extern __typeof (bcopy) __bcopy_power7 attribute_hidden;
+/* __bcopy_power9 symbol is implemented at memmove-power9.S.  */
+extern __typeof (bcopy) __bcopy_power9 attribute_hidden;
 
 libc_ifunc (bcopy,
-            (hwcap & PPC_FEATURE_HAS_VSX)
+	    (hwcap2 & PPC_FEATURE2_ARCH_3_00)
+	    ? __bcopy_power9
+	    : (hwcap & PPC_FEATURE_HAS_VSX)
             ? __bcopy_power7
             : __bcopy_ppc);
diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
index 6a88536..9040bbc 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
@@ -51,6 +51,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 #ifdef SHARED
   /* Support sysdeps/powerpc/powerpc64/multiarch/memcpy.c.  */
   IFUNC_IMPL (i, name, memcpy,
+	      IFUNC_IMPL_ADD (array, i, memcpy, hwcap2 & PPC_FEATURE2_ARCH_3_00,
+			      __memcpy_power9)
 	      IFUNC_IMPL_ADD (array, i, memcpy, hwcap & PPC_FEATURE_HAS_VSX,
 			      __memcpy_power7)
 	      IFUNC_IMPL_ADD (array, i, memcpy, hwcap & PPC_FEATURE_ARCH_2_06,
@@ -65,6 +67,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 
   /* Support sysdeps/powerpc/powerpc64/multiarch/memmove.c.  */
   IFUNC_IMPL (i, name, memmove,
+	      IFUNC_IMPL_ADD (array, i, memmove, hwcap2 & PPC_FEATURE2_ARCH_3_00,
+			      __memmove_power9)
 	      IFUNC_IMPL_ADD (array, i, memmove, hwcap & PPC_FEATURE_HAS_VSX,
 			      __memmove_power7)
 	      IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_ppc))
@@ -168,6 +172,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 
   /* Support sysdeps/powerpc/powerpc64/multiarch/bcopy.c.  */
   IFUNC_IMPL (i, name, bcopy,
+	      IFUNC_IMPL_ADD (array, i, bcopy, hwcap2 & PPC_FEATURE2_ARCH_3_00,
+			      __bcopy_power9)
 	      IFUNC_IMPL_ADD (array, i, bcopy, hwcap & PPC_FEATURE_HAS_VSX,
 			      __bcopy_power7)
 	      IFUNC_IMPL_ADD (array, i, bcopy, 1, __bcopy_ppc))
diff --git a/sysdeps/powerpc/powerpc64/multiarch/memcpy-power9.S b/sysdeps/powerpc/powerpc64/multiarch/memcpy-power9.S
new file mode 100644
index 0000000..fbd0788
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/multiarch/memcpy-power9.S
@@ -0,0 +1,26 @@
+/* Optimized memcpy implementation for PowerPC/POWER9.
+   Copyright (C) 2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+#define MEMCPY __memcpy_power9
+
+#undef libc_hidden_builtin_def
+#define libc_hidden_builtin_def(name)
+
+#include <sysdeps/powerpc/powerpc64/power9/memcpy.S>
diff --git a/sysdeps/powerpc/powerpc64/multiarch/memcpy.c b/sysdeps/powerpc/powerpc64/multiarch/memcpy.c
index 9f4286c..4c16fa0 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/memcpy.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/memcpy.c
@@ -35,8 +35,11 @@ extern __typeof (__redirect_memcpy) __memcpy_cell attribute_hidden;
 extern __typeof (__redirect_memcpy) __memcpy_power6 attribute_hidden;
 extern __typeof (__redirect_memcpy) __memcpy_a2 attribute_hidden;
 extern __typeof (__redirect_memcpy) __memcpy_power7 attribute_hidden;
+extern __typeof (__redirect_memcpy) __memcpy_power9 attribute_hidden;
 
 libc_ifunc (__libc_memcpy,
+	   (hwcap2 & PPC_FEATURE2_ARCH_3_00)
+	   ? __memcpy_power9 :
             (hwcap & PPC_FEATURE_HAS_VSX)
             ? __memcpy_power7 :
 	      (hwcap & PPC_FEATURE_ARCH_2_06)
diff --git a/sysdeps/powerpc/powerpc64/multiarch/memmove-power7.S b/sysdeps/powerpc/powerpc64/multiarch/memmove-power7.S
index a9435fa..0599a39 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/memmove-power7.S
+++ b/sysdeps/powerpc/powerpc64/multiarch/memmove-power7.S
@@ -23,7 +23,7 @@
 #undef libc_hidden_builtin_def
 #define libc_hidden_builtin_def(name)
 
-#undef bcopy
-#define bcopy __bcopy_power7
+#undef __bcopy
+#define __bcopy __bcopy_power7
 
 #include <sysdeps/powerpc/powerpc64/power7/memmove.S>
diff --git a/sysdeps/powerpc/powerpc64/multiarch/memmove-power9.S b/sysdeps/powerpc/powerpc64/multiarch/memmove-power9.S
new file mode 100644
index 0000000..16a2267
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/multiarch/memmove-power9.S
@@ -0,0 +1,29 @@
+/* Optimized memmove implementation for PowerPC64/POWER7.
+   Copyright (C) 2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+#define MEMMOVE __memmove_power9
+
+#undef libc_hidden_builtin_def
+#define libc_hidden_builtin_def(name)
+
+#undef __bcopy
+#define __bcopy __bcopy_power9
+
+#include <sysdeps/powerpc/powerpc64/power9/memmove.S>
diff --git a/sysdeps/powerpc/powerpc64/multiarch/memmove.c b/sysdeps/powerpc/powerpc64/multiarch/memmove.c
index db2bbc7..f02498e 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/memmove.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/memmove.c
@@ -31,9 +31,12 @@ extern __typeof (__redirect_memmove) __libc_memmove;
 
 extern __typeof (__redirect_memmove) __memmove_ppc attribute_hidden;
 extern __typeof (__redirect_memmove) __memmove_power7 attribute_hidden;
+extern __typeof (__redirect_memmove) __memmove_power9 attribute_hidden;
 
 libc_ifunc (__libc_memmove,
-            (hwcap & PPC_FEATURE_HAS_VSX)
+	    (hwcap2 & PPC_FEATURE2_ARCH_3_00)
+	    ? __memmove_power9
+	    : (hwcap & PPC_FEATURE_HAS_VSX)
             ? __memmove_power7
             : __memmove_ppc);
 
diff --git a/sysdeps/powerpc/powerpc64/power7/memmove.S b/sysdeps/powerpc/powerpc64/power7/memmove.S
index 93baa69..0bb8ddc 100644
--- a/sysdeps/powerpc/powerpc64/power7/memmove.S
+++ b/sysdeps/powerpc/powerpc64/power7/memmove.S
@@ -832,4 +832,6 @@ ENTRY_TOCLESS (__bcopy)
 	mr	r4,r6
 	b	L(_memmove)
 END (__bcopy)
+#ifndef __bcopy
 weak_alias (__bcopy, bcopy)
+#endif
diff --git a/sysdeps/powerpc/powerpc64/power9/memcpy.S b/sysdeps/powerpc/powerpc64/power9/memcpy.S
new file mode 100644
index 0000000..0731bac
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/power9/memcpy.S
@@ -0,0 +1,429 @@
+/* Optimized memcpy implementation for PowerPC64/POWER9.
+   Copyright (C) 2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+
+/* void * [r3] memcpy (void *dst [r3], void *src [r4], size_t len [r5]);
+   Returns 'dst'.  */
+
+#ifndef MEMCPY
+# define MEMCPY memcpy
+#endif
+
+#define dst 11		/* Use r11 so r3 kept unchanged.  */
+#define src 4
+#define cnt 5
+
+	.machine power7
+ENTRY_TOCLESS (MEMCPY, 5)
+	CALL_MCOUNT 3
+
+	cmpldi	cr1,cnt,31
+	neg	0,3
+	ble	cr1, L(copy_LT_32)  /* If move < 32 bytes use short move
+				    code.  */
+
+/* Align copies using VSX instructions to quadword. It is to avoid alignment
+   traps when memcpy is used on non-cacheable memory (for instance, memory
+   mapped I/O).  */
+	andi.	10,3,15
+	clrldi	11,4,60
+	cmpld	cr6,10,11	/* SRC and DST alignments match?  */
+
+	mr	dst,3
+	bne	cr6,L(copy_GE_32_unaligned)
+	beq	L(aligned_copy)
+
+	mtocrf	0x01,0
+	clrldi	0,0,60
+
+/* Get the DST and SRC aligned to 16 bytes.  */
+1:
+	bf	31,2f
+	lbz	6,0(src)
+	addi	src,src,1
+	stb	6,0(dst)
+	addi	dst,dst,1
+2:
+	bf	30,4f
+	lhz	6,0(src)
+	addi	src,src,2
+	sth	6,0(dst)
+	addi	dst,dst,2
+4:
+	bf	29,8f
+	lwz	6,0(src)
+	addi	src,src,4
+	stw	6,0(dst)
+	addi	dst,dst,4
+8:
+	bf	28,16f
+	ld	6,0(src)
+	addi	src,src,8
+	std	6,0(dst)
+	addi	dst,dst,8
+16:
+	subf	cnt,0,cnt
+
+/* Main aligned copy loop. Copies 128 bytes at a time. */
+L(aligned_copy):
+	li	6,16
+	li	7,32
+	li	8,48
+	mtocrf	0x02,cnt
+	srdi	12,cnt,7
+	cmpdi	12,0
+	beq	L(aligned_tail)
+	lvx	6,0,src
+	lvx	7,src,6
+	mtctr	12
+	b	L(aligned_128loop)
+
+	.align  4
+L(aligned_128head):
+	/* for the 2nd + iteration of this loop. */
+	lvx	6,0,src
+	lvx	7,src,6
+L(aligned_128loop):
+	lvx	8,src,7
+	lvx	9,src,8
+	stvx	6,0,dst
+	addi	src,src,64
+	stvx	7,dst,6
+	stvx	8,dst,7
+	stvx	9,dst,8
+	lvx	6,0,src
+	lvx	7,src,6
+	addi	dst,dst,64
+	lvx	8,src,7
+	lvx	9,src,8
+	addi	src,src,64
+	stvx	6,0,dst
+	stvx	7,dst,6
+	stvx	8,dst,7
+	stvx	9,dst,8
+	addi	dst,dst,64
+	bdnz	L(aligned_128head)
+
+L(aligned_tail):
+	mtocrf	0x01,cnt
+	bf	25,32f
+	lvx	6,0,src
+	lvx	7,src,6
+	lvx	8,src,7
+	lvx	9,src,8
+	addi	src,src,64
+	stvx	6,0,dst
+	stvx	7,dst,6
+	stvx	8,dst,7
+	stvx	9,dst,8
+	addi	dst,dst,64
+32:
+	bf	26,16f
+	lvx	6,0,src
+	lvx	7,src,6
+	addi	src,src,32
+	stvx	6,0,dst
+	stvx	7,dst,6
+	addi	dst,dst,32
+16:
+	bf	27,8f
+	lvx	6,0,src
+	addi	src,src,16
+	stvx	6,0,dst
+	addi	dst,dst,16
+8:
+	bf	28,4f
+	ld	6,0(src)
+	addi	src,src,8
+	std     6,0(dst)
+	addi	dst,dst,8
+4:	/* Copies 4~7 bytes.  */
+	bf	29,L(tail2)
+	lwz	6,0(src)
+	stw     6,0(dst)
+	bf      30,L(tail5)
+	lhz     7,4(src)
+	sth     7,4(dst)
+	bflr	31
+	lbz     8,6(src)
+	stb     8,6(dst)
+	/* Return original DST pointer.  */
+	blr
+
+
+/* Handle copies of 0~31 bytes.  */
+	.align	4
+L(copy_LT_32):
+	mr	dst,3
+	cmpldi	cr6,cnt,8
+	mtocrf	0x01,cnt
+	ble	cr6,L(copy_LE_8)
+
+	/* At least 9 bytes to go.  */
+	neg	8,4
+	andi.	0,8,3
+	cmpldi	cr1,cnt,16
+	beq	L(copy_LT_32_aligned)
+
+	/* Force 4-byte alignment for SRC.  */
+	mtocrf	0x01,0
+	subf	cnt,0,cnt
+2:
+	bf	30,1f
+	lhz	6,0(src)
+	addi	src,src,2
+	sth	6,0(dst)
+	addi	dst,dst,2
+1:
+	bf	31,L(end_4bytes_alignment)
+	lbz	6,0(src)
+	addi	src,src,1
+	stb	6,0(dst)
+	addi	dst,dst,1
+
+	.align	4
+L(end_4bytes_alignment):
+	cmpldi	cr1,cnt,16
+	mtocrf	0x01,cnt
+
+L(copy_LT_32_aligned):
+	/* At least 6 bytes to go, and SRC is word-aligned.  */
+	blt	cr1,8f
+
+	/* Copy 16 bytes.  */
+	lwz	6,0(src)
+	lwz	7,4(src)
+	stw	6,0(dst)
+	lwz	8,8(src)
+	stw	7,4(dst)
+	lwz	6,12(src)
+	addi	src,src,16
+	stw	8,8(dst)
+	stw	6,12(dst)
+	addi	dst,dst,16
+8:	/* Copy 8 bytes.  */
+	bf	28,L(tail4)
+	lwz	6,0(src)
+	lwz	7,4(src)
+	addi	src,src,8
+	stw	6,0(dst)
+	stw	7,4(dst)
+	addi	dst,dst,8
+
+	.align	4
+/* Copies 4~7 bytes.  */
+L(tail4):
+	bf	29,L(tail2)
+	lwz	6,0(src)
+	stw	6,0(dst)
+	bf	30,L(tail5)
+	lhz	7,4(src)
+	sth	7,4(dst)
+	bflr	31
+	lbz	8,6(src)
+	stb	8,6(dst)
+	/* Return original DST pointer.  */
+	blr
+
+	.align	4
+/* Copies 2~3 bytes.  */
+L(tail2):
+	bf	30,1f
+	lhz	6,0(src)
+	sth	6,0(dst)
+	bflr	31
+	lbz	7,2(src)
+	stb	7,2(dst)
+	blr
+
+	.align	4
+L(tail5):
+	bflr	31
+	lbz	6,4(src)
+	stb	6,4(dst)
+	blr
+
+	.align	4
+1:
+	bflr	31
+	lbz	6,0(src)
+	stb	6,0(dst)
+	/* Return original DST pointer.  */
+	blr
+
+
+/* Handles copies of 0~8 bytes.  */
+	.align	4
+L(copy_LE_8):
+	bne	cr6,L(tail4)
+
+	/* Though we could've used ld/std here, they are still
+	slow for unaligned cases.  */
+
+	lwz	6,0(src)
+	lwz	7,4(src)
+	stw	6,0(dst)
+	stw	7,4(dst)
+	blr
+
+
+/* Handle copies of 32+ bytes where DST is aligned (to quadword) but
+   SRC is not.	Use aligned quadword loads from SRC, shifted to realign
+   the data, allowing for aligned DST stores.  */
+	.align	4
+L(copy_GE_32_unaligned):
+	clrldi	0,0,60	      /* Number of bytes until the 1st dst quadword.  */
+	srdi	9,cnt,4	      /* Number of full quadwords remaining.  */
+
+	beq	L(copy_GE_32_unaligned_cont)
+
+	/* DST is not quadword aligned, get it aligned.  */
+
+	mtocrf	0x01,0
+	subf	cnt,0,cnt
+
+	/* Vector instructions work best when proper alignment (16-bytes)
+	is present.  Move 0~15 bytes as needed to get DST quadword-aligned.  */
+1:
+	bf	31,2f
+	lbz	6,0(src)
+	addi	src,src,1
+	stb	6,0(dst)
+	addi	dst,dst,1
+2:
+	bf	30,4f
+	lhz	6,0(src)
+	addi	src,src,2
+	sth	6,0(dst)
+	addi	dst,dst,2
+4:
+	bf	29,8f
+	lwz	6,0(src)
+	addi	src,src,4
+	stw	6,0(dst)
+	addi	dst,dst,4
+8:
+	bf	28,0f
+	ld	6,0(src)
+	addi	src,src,8
+	std	6,0(dst)
+	addi	dst,dst,8
+0:
+	srdi	9,cnt,4	      /* Number of full quadwords remaining.  */
+
+	/* The proper alignment is present, it is OK to copy the bytes now.  */
+L(copy_GE_32_unaligned_cont):
+
+	/* Setup two indexes to speed up the indexed vector operations.  */
+	clrldi	10,cnt,60
+	li	6,16	      /* Index for 16-bytes offsets.  */
+	li	7,32	      /* Index for 32-bytes offsets.  */
+	cmpldi	cr1,10,0
+	srdi	8,cnt,5	      /* Setup the loop counter.  */
+	mtocrf	0x01,9
+	cmpldi	cr6,9,1
+#ifdef __LITTLE_ENDIAN__
+	lvsr	5,0,src
+#else
+	lvsl	5,0,src
+#endif
+	lvx	3,0,src
+	li	0,0
+	bf	31,L(setup_unaligned_loop)
+
+	/* Copy another 16 bytes to align to 32-bytes due to the loop.  */
+	lvx	4,src,6
+#ifdef __LITTLE_ENDIAN__
+	vperm	6,4,3,5
+#else
+	vperm	6,3,4,5
+#endif
+	addi	src,src,16
+	stvx	6,0,dst
+	addi	dst,dst,16
+	vor	3,4,4
+	clrrdi	0,src,60
+
+L(setup_unaligned_loop):
+	mtctr	8
+	ble	cr6,L(end_unaligned_loop)
+
+	/* Copy 32 bytes at a time using vector instructions.  */
+	.align	4
+L(unaligned_loop):
+
+	/* Note: vr6/vr10 may contain data that was already copied,
+	but in order to get proper alignment, we may have to copy
+	some portions again. This is faster than having unaligned
+	vector instructions though.  */
+
+	lvx	4,src,6
+#ifdef __LITTLE_ENDIAN__
+	vperm	6,4,3,5
+#else
+	vperm	6,3,4,5
+#endif
+	lvx	3,src,7
+#ifdef __LITTLE_ENDIAN__
+	vperm	10,3,4,5
+#else
+	vperm	10,4,3,5
+#endif
+	addi	src,src,32
+	stvx	6,0,dst
+	stvx	10,dst,6
+	addi	dst,dst,32
+	bdnz	L(unaligned_loop)
+
+	clrrdi	0,src,60
+
+	.align	4
+L(end_unaligned_loop):
+
+	/* Check for tail bytes.  */
+	mtocrf	0x01,cnt
+	beqlr	cr1
+
+	add	src,src,0
+
+	/*  We have 1~15 tail bytes to copy, and DST is quadword aligned.  */
+	/* Copy 8 bytes.  */
+	bf	28,4f
+	lwz	6,0(src)
+	lwz	7,4(src)
+	addi	src,src,8
+	stw	6,0(dst)
+	stw	7,4(dst)
+	addi	dst,dst,8
+4:	/* Copy 4~7 bytes.  */
+	bf	29,L(tail2)
+	lwz	6,0(src)
+	stw	6,0(dst)
+	bf	30,L(tail5)
+	lhz	7,4(src)
+	sth	7,4(dst)
+	bflr	31
+	lbz	8,6(src)
+	stb	8,6(dst)
+	/* Return original DST pointer.  */
+	blr
+
+END_GEN_TB (MEMCPY,TB_TOCLESS)
+libc_hidden_builtin_def (memcpy)
diff --git a/sysdeps/powerpc/powerpc64/power9/memmove.S b/sysdeps/powerpc/powerpc64/power9/memmove.S
new file mode 100644
index 0000000..9ed8f77
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/power9/memmove.S
@@ -0,0 +1,837 @@
+/* Optimized memmove implementation for PowerPC64/POWER7.
+   Copyright (C) 2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+
+/* void* [r3] memmove (void *dest [r3], const void *src [r4], size_t len [r5])
+
+   This optimization check if memory 'dest'  overlaps with 'src'. If it does
+   not then it calls an optimized memcpy call (similar to memcpy for POWER7,
+   embedded here to gain some cycles).
+   If source and destiny overlaps, a optimized backwards memcpy is used
+   instead.  */
+
+#ifndef MEMMOVE
+# define MEMMOVE memmove
+#endif
+	.machine power7
+ENTRY_TOCLESS (MEMMOVE, 5)
+	CALL_MCOUNT 3
+
+L(_memmove):
+	subf    r9,r4,r3
+	cmpld   cr7,r9,r5
+	blt	cr7,L(memmove_bwd)
+
+	cmpldi	cr1,r5,31
+	neg	0,3
+	ble	cr1, L(copy_LT_32)  /* If move < 32 bytes use short move
+				       code.  */
+
+	andi.	10,3,15
+	clrldi	11,4,60
+	cmpld	cr6,10,11	/* SRC and DST alignments match?  */
+
+	mr	r11,3
+	bne	cr6,L(copy_GE_32_unaligned)
+	beq	L(aligned_copy)
+
+	mtocrf	0x01,0
+	clrldi	0,0,60
+
+/* Get the DST and SRC aligned to 8 bytes (16 for little-endian).  */
+1:
+	bf	31,2f
+	lbz	6,0(r4)
+	addi	r4,r4,1
+	stb	6,0(r11)
+	addi	r11,r11,1
+2:
+	bf	30,4f
+	lhz	6,0(r4)
+	addi	r4,r4,2
+	sth	6,0(r11)
+	addi	r11,r11,2
+4:
+	bf	29,8f
+	lwz	6,0(r4)
+	addi	r4,r4,4
+	stw	6,0(r11)
+	addi	r11,r11,4
+8:
+	bf	28,16f
+	ld	6,0(r4)
+	addi	r4,r4,8
+	std	6,0(r11)
+	addi	r11,r11,8
+16:
+	subf	r5,0,r5
+
+/* Main aligned copy loop. Copies 128 bytes at a time. */
+L(aligned_copy):
+	li	6,16
+	li	7,32
+	li	8,48
+	mtocrf	0x02,r5
+	srdi	12,r5,7
+	cmpdi	12,0
+	beq	L(aligned_tail)
+	lvx	6,0,r4
+	lvx	7,r4,6
+	mtctr	12
+	b	L(aligned_128loop)
+
+	.align  4
+L(aligned_128head):
+	/* for the 2nd + iteration of this loop. */
+	lvx	6,0,r4
+	lvx	7,r4,6
+L(aligned_128loop):
+	lvx	8,r4,7
+	lvx	9,r4,8
+	stvx	6,0,r11
+	addi	r4,r4,64
+	stvx	7,r11,6
+	stvx	8,r11,7
+	stvx	9,r11,8
+	lvx	6,0,r4
+	lvx	7,r4,6
+	addi	r11,r11,64
+	lvx	8,r4,7
+	lvx	9,r4,8
+	addi	r4,r4,64
+	stvx	6,0,r11
+	stvx	7,r11,6
+	stvx	8,r11,7
+	stvx	9,r11,8
+	addi	r11,r11,64
+	bdnz	L(aligned_128head)
+
+L(aligned_tail):
+	mtocrf	0x01,r5
+	bf	25,32f
+	lvx	6,0,r4
+	lvx	7,r4,6
+	lvx	8,r4,7
+	lvx	9,r4,8
+	addi	r4,r4,64
+	stvx	6,0,r11
+	stvx	7,r11,6
+	stvx	8,r11,7
+	stvx	9,r11,8
+	addi	r11,r11,64
+32:
+	bf	26,16f
+	lvx	6,0,r4
+	lvx	7,r4,6
+	addi	r4,r4,32
+	stvx	6,0,r11
+	stvx	7,r11,6
+	addi	r11,r11,32
+16:
+	bf	27,8f
+	lvx	6,0,r4
+	addi	r4,r4,16
+	stvx	6,0,r11
+	addi	r11,r11,16
+8:
+	bf	28,4f
+	ld	6,0(r4)
+	addi	r4,r4,8
+	std     6,0(r11)
+	addi	r11,r11,8
+4:	/* Copies 4~7 bytes.  */
+	bf	29,L(tail2)
+	lwz	6,0(r4)
+	stw     6,0(r11)
+	bf      30,L(tail5)
+	lhz     7,4(r4)
+	sth     7,4(r11)
+	bflr	31
+	lbz     8,6(r4)
+	stb     8,6(r11)
+	/* Return original DST pointer.  */
+	blr
+
+/* Handle copies of 0~31 bytes.  */
+	.align	4
+L(copy_LT_32):
+	mr	r11,3
+	cmpldi	cr6,r5,8
+	mtocrf	0x01,r5
+	ble	cr6,L(copy_LE_8)
+
+	/* At least 9 bytes to go.  */
+	neg	8,4
+	andi.	0,8,3
+	cmpldi	cr1,r5,16
+	beq	L(copy_LT_32_aligned)
+
+	/* Force 4-byte alignment for SRC.  */
+	mtocrf	0x01,0
+	subf	r5,0,r5
+2:
+	bf	30,1f
+	lhz	6,0(r4)
+	addi	r4,r4,2
+	sth	6,0(r11)
+	addi	r11,r11,2
+1:
+	bf	31,L(end_4bytes_alignment)
+	lbz	6,0(r4)
+	addi	r4,r4,1
+	stb	6,0(r11)
+	addi	r11,r11,1
+
+	.align	4
+L(end_4bytes_alignment):
+	cmpldi	cr1,r5,16
+	mtocrf	0x01,r5
+
+L(copy_LT_32_aligned):
+	/* At least 6 bytes to go, and SRC is word-aligned.  */
+	blt	cr1,8f
+
+	/* Copy 16 bytes.  */
+	lwz	6,0(r4)
+	lwz	7,4(r4)
+	stw	6,0(r11)
+	lwz	8,8(r4)
+	stw	7,4(r11)
+	lwz	6,12(r4)
+	addi	r4,r4,16
+	stw	8,8(r11)
+	stw	6,12(r11)
+	addi	r11,r11,16
+8:	/* Copy 8 bytes.  */
+	bf	28,L(tail4)
+	lwz	6,0(r4)
+	lwz	7,4(r4)
+	addi	r4,r4,8
+	stw	6,0(r11)
+	stw	7,4(r11)
+	addi	r11,r11,8
+
+	.align	4
+/* Copies 4~7 bytes.  */
+L(tail4):
+	bf	29,L(tail2)
+	lwz	6,0(r4)
+	stw	6,0(r11)
+	bf	30,L(tail5)
+	lhz	7,4(r4)
+	sth	7,4(r11)
+	bflr	31
+	lbz	8,6(r4)
+	stb	8,6(r11)
+	/* Return original DST pointer.  */
+	blr
+
+	.align	4
+/* Copies 2~3 bytes.  */
+L(tail2):
+	bf	30,1f
+	lhz	6,0(r4)
+	sth	6,0(r11)
+	bflr	31
+	lbz	7,2(r4)
+	stb	7,2(r11)
+	blr
+
+	.align	4
+L(tail5):
+	bflr	31
+	lbz	6,4(r4)
+	stb	6,4(r11)
+	blr
+
+	.align	4
+1:
+	bflr	31
+	lbz	6,0(r4)
+	stb	6,0(r11)
+	/* Return original DST pointer.  */
+	blr
+
+/* Handles copies of 0~8 bytes.  */
+	.align	4
+L(copy_LE_8):
+	bne	cr6,L(tail4)
+
+	/* Though we could've used ld/std here, they are still
+	slow for unaligned cases.  */
+
+	lwz	6,0(r4)
+	lwz	7,4(r4)
+	stw	6,0(r11)
+	stw	7,4(r11)
+	blr
+
+
+/* Handle copies of 32+ bytes where DST is aligned (to quadword) but
+   SRC is not.	Use aligned quadword loads from SRC, shifted to realign
+   the data, allowing for aligned DST stores.  */
+	.align	4
+L(copy_GE_32_unaligned):
+	clrldi	0,0,60	      /* Number of bytes until the 1st r11 quadword.  */
+	srdi	9,r5,4	      /* Number of full quadwords remaining.  */
+
+	beq	L(copy_GE_32_unaligned_cont)
+
+	/* DST is not quadword aligned, get it aligned.  */
+
+	mtocrf	0x01,0
+	subf	r5,0,r5
+
+	/* Vector instructions work best when proper alignment (16-bytes)
+	is present.  Move 0~15 bytes as needed to get DST quadword-aligned.  */
+1:
+	bf	31,2f
+	lbz	6,0(r4)
+	addi	r4,r4,1
+	stb	6,0(r11)
+	addi	r11,r11,1
+2:
+	bf	30,4f
+	lhz	6,0(r4)
+	addi	r4,r4,2
+	sth	6,0(r11)
+	addi	r11,r11,2
+4:
+	bf	29,8f
+	lwz	6,0(r4)
+	addi	r4,r4,4
+	stw	6,0(r11)
+	addi	r11,r11,4
+8:
+	bf	28,0f
+	ld	6,0(r4)
+	addi	r4,r4,8
+	std	6,0(r11)
+	addi	r11,r11,8
+0:
+	srdi	9,r5,4	      /* Number of full quadwords remaining.  */
+
+	/* The proper alignment is present, it is OK to copy the bytes now.  */
+L(copy_GE_32_unaligned_cont):
+
+	/* Setup two indexes to speed up the indexed vector operations.  */
+	clrldi	10,r5,60
+	li	6,16	      /* Index for 16-bytes offsets.  */
+	li	7,32	      /* Index for 32-bytes offsets.  */
+	cmpldi	cr1,10,0
+	srdi	8,r5,5	      /* Setup the loop counter.  */
+	mtocrf	0x01,9
+	cmpldi	cr6,9,1
+#ifdef __LITTLE_ENDIAN__
+	lvsr	5,0,r4
+#else
+	lvsl	5,0,r4
+#endif
+	lvx	3,0,r4
+	li	0,0
+	bf	31,L(setup_unaligned_loop)
+
+	/* Copy another 16 bytes to align to 32-bytes due to the loop.  */
+	lvx	4,r4,6
+#ifdef __LITTLE_ENDIAN__
+	vperm	6,4,3,5
+#else
+	vperm	6,3,4,5
+#endif
+	addi	r4,r4,16
+	stvx	6,0,r11
+	addi	r11,r11,16
+	vor	3,4,4
+	clrrdi	0,r4,60
+
+L(setup_unaligned_loop):
+	mtctr	8
+	ble	cr6,L(end_unaligned_loop)
+
+	/* Copy 32 bytes at a time using vector instructions.  */
+	.align	4
+L(unaligned_loop):
+
+	/* Note: vr6/vr10 may contain data that was already copied,
+	but in order to get proper alignment, we may have to copy
+	some portions again. This is faster than having unaligned
+	vector instructions though.  */
+
+	lvx	4,r4,6
+#ifdef __LITTLE_ENDIAN__
+	vperm	6,4,3,5
+#else
+	vperm	6,3,4,5
+#endif
+	lvx	3,r4,7
+#ifdef __LITTLE_ENDIAN__
+	vperm	10,3,4,5
+#else
+	vperm	10,4,3,5
+#endif
+	addi	r4,r4,32
+	stvx	6,0,r11
+	stvx	10,r11,6
+	addi	r11,r11,32
+	bdnz	L(unaligned_loop)
+
+	clrrdi	0,r4,60
+
+	.align	4
+L(end_unaligned_loop):
+
+	/* Check for tail bytes.  */
+	mtocrf	0x01,r5
+	beqlr	cr1
+
+	add	r4,r4,0
+
+	/*  We have 1~15 tail bytes to copy, and DST is quadword aligned.  */
+	/* Copy 8 bytes.  */
+	bf	28,4f
+	lwz	6,0(r4)
+	lwz	7,4(r4)
+	addi	r4,r4,8
+	stw	6,0(r11)
+	stw	7,4(r11)
+	addi	r11,r11,8
+4:	/* Copy 4~7 bytes.  */
+	bf	29,L(tail2)
+	lwz	6,0(r4)
+	stw	6,0(r11)
+	bf	30,L(tail5)
+	lhz	7,4(r4)
+	sth	7,4(r11)
+	bflr	31
+	lbz	8,6(r4)
+	stb	8,6(r11)
+	/* Return original DST pointer.  */
+	blr
+
+	/* Start to memcpy backward implementation: the algorith first check if
+	   src and dest have the same alignment and if it does align both to 16
+	   bytes and copy using VSX instructions.
+	   If does not, align dest to 16 bytes and use VMX (altivec) instruction
+	   to read two 16 bytes at time, shift/permute the bytes read and write
+	   aligned to dest.  */
+L(memmove_bwd):
+	cmpldi	cr1,r5,31
+	/* Copy is done backwards: update the pointers and check alignment.  */
+	add	r11,r3,r5
+	add	r4,r4,r5
+	mr	r0,r11
+	ble	cr1, L(copy_LT_32_bwd)  /* If move < 32 bytes use short move
+				           code.  */
+
+	andi.	r10,r11,15	    /* Check if r11 is aligned to 16 bytes  */
+	clrldi	r9,r4,60	    /* Check if r4 is aligned to 16 bytes  */
+	cmpld	cr6,r10,r9	    /* SRC and DST alignments match?  */
+
+	bne     cr6,L(copy_GE_32_unaligned_bwd)
+	beq     L(aligned_copy_bwd)
+
+	mtocrf	0x01,r0
+	clrldi	r0,r0,60
+
+/* Get the DST and SRC aligned to 16 bytes.  */
+1:
+	bf	31,2f
+	lbz	r6,-1(r4)
+	subi	r4,r4,1
+	stb	r6,-1(r11)
+	subi	r11,r11,1
+2:
+	bf	30,4f
+	lhz	r6,-2(r4)
+	subi	r4,r4,2
+	sth	r6,-2(r11)
+	subi	r11,r11,2
+4:
+	bf	29,8f
+	lwz	r6,-4(r4)
+	subi	r4,r4,4
+	stw	r6,-4(r11)
+	subi	r11,r11,4
+8:
+	bf	28,16f
+	ld	r6,-8(r4)
+	subi	r4,r4,8
+	std	r6,-8(r11)
+	subi	r11,r11,8
+16:
+	subf	r5,0,r5
+
+/* Main aligned copy loop. Copies 128 bytes at a time. */
+L(aligned_copy_bwd):
+	li	r6,-16
+	li	r7,-32
+	li	r8,-48
+	li	r9,-64
+	mtocrf	0x02,r5
+	srdi	r12,r5,7
+	cmpdi	r12,0
+	beq	L(aligned_tail_bwd)
+	lvx	v6,r4,r6
+	lvx	v7,r4,r7
+	mtctr	12
+	b	L(aligned_128loop_bwd)
+
+	.align  4
+L(aligned_128head_bwd):
+	/* for the 2nd + iteration of this loop. */
+	lvx	v6,r4,r6
+	lvx	v7,r4,r7
+L(aligned_128loop_bwd):
+	lvx	v8,r4,r8
+	lvx	v9,r4,r9
+	stvx	v6,r11,r6
+	subi	r4,r4,64
+	stvx	v7,r11,r7
+	stvx	v8,r11,r8
+	stvx	v9,r11,r9
+	lvx	v6,r4,r6
+	lvx	v7,r4,7
+	subi	r11,r11,64
+	lvx	v8,r4,r8
+	lvx	v9,r4,r9
+	subi	r4,r4,64
+	stvx	v6,r11,r6
+	stvx	v7,r11,r7
+	stvx	v8,r11,r8
+	stvx	v9,r11,r9
+	subi	r11,r11,64
+	bdnz	L(aligned_128head_bwd)
+
+L(aligned_tail_bwd):
+	mtocrf	0x01,r5
+	bf	25,32f
+	lvx	v6,r4,r6
+	lvx	v7,r4,r7
+	lvx	v8,r4,r8
+	lvx	v9,r4,r9
+	subi	r4,r4,64
+	stvx	v6,r11,r6
+	stvx	v7,r11,r7
+	stvx	v8,r11,r8
+	stvx	v9,r11,r9
+	subi	r11,r11,64
+32:
+	bf	26,16f
+	lvx	v6,r4,r6
+	lvx	v7,r4,r7
+	subi	r4,r4,32
+	stvx	v6,r11,r6
+	stvx	v7,r11,r7
+	subi	r11,r11,32
+16:
+	bf	27,8f
+	lvx	v6,r4,r6
+	subi	r4,r4,16
+	stvx	v6,r11,r6
+	subi	r11,r11,16
+8:
+	bf	28,4f
+	ld	r6,-8(r4)
+	subi	r4,r4,8
+	std     r6,-8(r11)
+	subi	r11,r11,8
+4:	/* Copies 4~7 bytes.  */
+	bf	29,L(tail2_bwd)
+	lwz	r6,-4(r4)
+	stw     r6,-4(r11)
+	bf      30,L(tail5_bwd)
+	lhz     r7,-6(r4)
+	sth     r7,-6(r11)
+	bflr	31
+	lbz     r8,-7(r4)
+	stb     r8,-7(r11)
+	/* Return original DST pointer.  */
+	blr
+
+/* Handle copies of 0~31 bytes.  */
+	.align	4
+L(copy_LT_32_bwd):
+	cmpldi	cr6,r5,8
+	mtocrf	0x01,r5
+	ble	cr6,L(copy_LE_8_bwd)
+
+	/* At least 9 bytes to go.  */
+	neg	r8,r4
+	andi.	r0,r8,3
+	cmpldi	cr1,r5,16
+	beq	L(copy_LT_32_aligned_bwd)
+
+	/* Force 4-byte alignment for SRC.  */
+	mtocrf	0x01,0
+	subf	r5,0,r5
+2:
+	bf	30,1f
+	lhz	r6,-2(r4)
+	subi	r4,r4,2
+	sth	r6,-2(r11)
+	subi	r11,r11,2
+1:
+	bf	31,L(end_4bytes_alignment_bwd)
+	lbz	6,-1(r4)
+	subi	r4,r4,1
+	stb	6,-1(r11)
+	subi	r11,r11,1
+
+	.align	4
+L(end_4bytes_alignment_bwd):
+	cmpldi	cr1,r5,16
+	mtocrf	0x01,r5
+
+L(copy_LT_32_aligned_bwd):
+	/* At least 6 bytes to go, and SRC is word-aligned.  */
+	blt	cr1,8f
+
+	/* Copy 16 bytes.  */
+	lwz	r6,-4(r4)
+	lwz	r7,-8(r4)
+	stw	r6,-4(r11)
+	lwz	r8,-12(r4)
+	stw	r7,-8(r11)
+	lwz	r6,-16(r4)
+	subi	r4,r4,16
+	stw	r8,-12(r11)
+	stw	r6,-16(r11)
+	subi	r11,r11,16
+8:	/* Copy 8 bytes.  */
+	bf	28,L(tail4_bwd)
+	lwz	r6,-4(r4)
+	lwz	r7,-8(r4)
+	subi	r4,r4,8
+	stw	r6,-4(r11)
+	stw	r7,-8(r11)
+	subi	r11,r11,8
+
+	.align	4
+/* Copies 4~7 bytes.  */
+L(tail4_bwd):
+	bf	29,L(tail2_bwd)
+	lwz	6,-4(r4)
+	stw	6,-4(r11)
+	bf	30,L(tail5_bwd)
+	lhz	7,-6(r4)
+	sth	7,-6(r11)
+	bflr	31
+	lbz	8,-7(r4)
+	stb	8,-7(r11)
+	/* Return original DST pointer.  */
+	blr
+
+	.align	4
+/* Copies 2~3 bytes.  */
+L(tail2_bwd):
+	bf	30,1f
+	lhz	6,-2(r4)
+	sth	6,-2(r11)
+	bflr	31
+	lbz	7,-3(r4)
+	stb	7,-3(r11)
+	blr
+
+	.align	4
+L(tail5_bwd):
+	bflr	31
+	lbz	6,-5(r4)
+	stb	6,-5(r11)
+	blr
+
+	.align	4
+1:
+	bflr	31
+	lbz	6,-1(r4)
+	stb	6,-1(r11)
+	/* Return original DST pointer.  */
+	blr
+
+
+/* Handles copies of 0~8 bytes.  */
+	.align	4
+L(copy_LE_8_bwd):
+	bne	cr6,L(tail4_bwd)
+
+	/* Though we could've used ld/std here, they are still
+	   slow for unaligned cases.  */
+	lwz	6,-8(r4)
+	lwz	7,-4(r4)
+	stw	6,-8(r11)
+	stw	7,-4(r11)
+	blr
+
+
+/* Handle copies of 32+ bytes where DST is aligned (to quadword) but
+   SRC is not.	Use aligned quadword loads from SRC, shifted to realign
+   the data, allowing for aligned DST stores.  */
+	.align	4
+L(copy_GE_32_unaligned_bwd):
+	andi.	r10,r11,15      /* Check alignment of DST against 16 bytes..  */
+	srdi	r9,r5,4		/* Number of full quadwords remaining.  */
+
+	beq	L(copy_GE_32_unaligned_cont_bwd)
+
+	/* DST is not quadword aligned and r10 holds the address masked to
+           compare alignments.  */
+	mtocrf	0x01,r10
+	subf	r5,r10,r5
+
+	/* Vector instructions work best when proper alignment (16-bytes)
+	is present.  Move 0~15 bytes as needed to get DST quadword-aligned.  */
+1:
+	bf	31,2f
+	lbz	r6,-1(r4)
+	subi	r4,r4,1
+	stb	r6,-1(r11)
+	subi	r11,r11,1
+2:
+	bf	30,4f
+	lhz	r6,-2(r4)
+	subi	r4,r4,2
+	sth	r6,-2(r11)
+	subi	r11,r11,2
+4:
+	bf	29,8f
+	lwz	r6,-4(r4)
+	subi	r4,r4,4
+	stw	r6,-4(r11)
+	subi	r11,r11,4
+8:
+	bf	28,0f
+	ld	r6,-8(r4)
+	subi	r4,r4,8
+	std	r6,-8(r11)
+	subi	r11,r11,8
+0:
+	srdi	r9,r5,4	      /* Number of full quadwords remaining.  */
+
+	/* The proper alignment is present, it is OK to copy the bytes now.  */
+L(copy_GE_32_unaligned_cont_bwd):
+
+	/* Setup two indexes to speed up the indexed vector operations.  */
+	clrldi	r10,r5,60
+	li	r6,-16	      /* Index for 16-bytes offsets.  */
+	li	r7,-32	      /* Index for 32-bytes offsets.  */
+	cmpldi	cr1,10,0
+	srdi	r8,r5,5	      /* Setup the loop counter.  */
+	mtocrf	0x01,9
+	cmpldi	cr6,r9,1
+#ifdef __LITTLE_ENDIAN__
+	lvsr	v5,r0,r4
+#else
+	lvsl	v5,r0,r4
+#endif
+	lvx	v3,0,r4
+	li	r0,0
+	bf	31,L(setup_unaligned_loop_bwd)
+
+	/* Copy another 16 bytes to align to 32-bytes due to the loop.  */
+	lvx	v4,r4,r6
+#ifdef __LITTLE_ENDIAN__
+	vperm	v6,v3,v4,v5
+#else
+	vperm	v6,v4,v3,v5
+#endif
+	subi	r4,r4,16
+	stvx	v6,r11,r6
+	subi	r11,r11,16
+	vor	v3,v4,v4
+	clrrdi	r0,r4,60
+
+L(setup_unaligned_loop_bwd):
+	mtctr	r8
+	ble	cr6,L(end_unaligned_loop_bwd)
+
+	/* Copy 32 bytes at a time using vector instructions.  */
+	.align	4
+L(unaligned_loop_bwd):
+
+	/* Note: vr6/vr10 may contain data that was already copied,
+	but in order to get proper alignment, we may have to copy
+	some portions again. This is faster than having unaligned
+	vector instructions though.  */
+
+	lvx	v4,r4,r6
+#ifdef __LITTLE_ENDIAN__
+	vperm	v6,v3,v4,v5
+#else
+	vperm	v6,v4,v3,v5
+#endif
+	lvx	v3,r4,r7
+#ifdef __LITTLE_ENDIAN__
+	vperm	v10,v4,v3,v5
+#else
+	vperm	v10,v3,v4,v5
+#endif
+	subi	r4,r4,32
+	stvx	v6,r11,r6
+	stvx	v10,r11,r7
+	subi	r11,r11,32
+	bdnz	L(unaligned_loop_bwd)
+
+	clrrdi	r0,r4,60
+
+	.align	4
+L(end_unaligned_loop_bwd):
+
+	/* Check for tail bytes.  */
+	mtocrf	0x01,r5
+	beqlr	cr1
+
+	add	r4,r4,0
+
+	/*  We have 1~15 tail bytes to copy, and DST is quadword aligned.  */
+	/* Copy 8 bytes.  */
+	bf	28,4f
+	lwz	r6,-4(r4)
+	lwz	r7,-8(r4)
+	subi	r4,r4,8
+	stw	r6,-4(r11)
+	stw	r7,-8(r11)
+	subi	r11,r11,8
+4:	/* Copy 4~7 bytes.  */
+	bf	29,L(tail2_bwd)
+	lwz	r6,-4(r4)
+	stw	r6,-4(r11)
+	bf	30,L(tail5_bwd)
+	lhz	r7,-6(r4)
+	sth	r7,-6(r11)
+	bflr	31
+	lbz	r8,-7(r4)
+	stb	r8,-7(r11)
+	/* Return original DST pointer.  */
+	blr
+END_GEN_TB (MEMMOVE, TB_TOCLESS)
+libc_hidden_builtin_def (memmove)
+
+
+/* void bcopy(const void *src [r3], void *dest [r4], size_t n [r5])
+   Implemented in this file to avoid linker create a stub function call
+   in the branch to '_memmove'.  */
+ENTRY_TOCLESS (__bcopy)
+	mr	r6,r3
+	mr	r3,r4
+	mr	r4,r6
+	b	L(_memmove)
+END (__bcopy)
+#ifndef __bcopy
+weak_alias (__bcopy, bcopy)
+#endif
-- 
2.9.5


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]