[PATCH 04/36] PowerPC: memset/bzero multiarch for PowerPC32
- From: Adhemerval Zanella <azanella at linux dot vnet dot ibm dot com>
- To: "GNU C. Library" <libc-alpha at sourceware dot org>
- Date: Mon, 19 Aug 2013 18:27:50 -0300
- Subject: [PATCH 04/36] PowerPC: memset/bzero multiarch for PowerPC32
- References: <52127ABC dot 40008 at linux dot vnet dot ibm dot com>
2013-08-19 Adhemerval Zanella <azanella@linux.vnet.ibm.com>
* sysdeps/powerpc/powerpc32/power4/memset.S: Move to ...
* sysdeps/powerpc/powerpc32/multiarch/memset-power4.S: ... here.
(memset): Rename to __memset_power4 and remove the
libc_hidden_builtin_def.
* sysdeps/powerpc/powerpc32/power6/memset.S: Move to ...
* sysdeps/powerpc/powerpc32/multiarch/memset-power6.S: ... here.
(memset): Rename to __memset_power6 and remove the
libc_hidden_builtin_def.
* sysdeps/powerpc/powerpc32/power7/memset.S: Move to ...
* sysdeps/powerpc/powerpc32/multiarch/memset-power7.S: ... here.
(memset): Rename to __memset_power7 and remove the
libc_hidden_builtin_def.
* sysdeps/powerpc/powerpc32/multiarch/memset.c: New file: multiarch
memset for PPC32.
* sysdeps/powerpc/powerpc32/multiarch/memset-ppc32.S: New file:
default memset PPC32 implementation.
* sysdeps/powerpc/powerpc32/multiarch/rtld-memset.S: New file:
default loader memset PPC32 implementation.
* sysdeps/powerpc/powerpc32/multiarch/bzero.c: New file: multiarch
bzero for PPC32.
* sysdeps/powerpc/powerpc32/multiarch/Makefile: Add memset and bzero
multiarch objects.
* sysdeps/powerpc/powerpc32/multiarch/ifunc-impl-list.c
(__libc_ifunc_impl_list): Likewise.
--
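The resolvers added below pick an implementation from the AT_HWCAP bits in
this order: POWER7 (VSX), POWER6 (ISA 2.05), POWER4, then the generic PPC32
code.  The standalone program below illustrates that precedence using the
public getauxval interface; it is only a sketch of the selection order, not
the patch's init-arch machinery, and it assumes a Linux/powerpc host with
glibc 2.16+ and <asm/cputable.h> available.

#include <stdio.h>
#include <sys/auxv.h>       /* getauxval, AT_HWCAP.  */
#include <asm/cputable.h>   /* PPC_FEATURE_* bits.  */

int
main (void)
{
  unsigned long hwcap = getauxval (AT_HWCAP);
  const char *impl;

  /* Same precedence as the libc_ifunc selectors in memset.c/bzero.c.  */
  if (hwcap & PPC_FEATURE_HAS_VSX)
    impl = "__memset_power7";
  else if (hwcap & PPC_FEATURE_ARCH_2_05)
    impl = "__memset_power6";
  else if (hwcap & PPC_FEATURE_POWER4)
    impl = "__memset_power4";
  else
    impl = "__memset_ppc32";

  printf ("memset would resolve to %s\n", impl);
  return 0;
}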
diff --git a/sysdeps/powerpc/powerpc32/multiarch/Makefile b/sysdeps/powerpc/powerpc32/multiarch/Makefile
index c74e600..f3778a7 100644
--- a/sysdeps/powerpc/powerpc32/multiarch/Makefile
+++ b/sysdeps/powerpc/powerpc32/multiarch/Makefile
@@ -1,5 +1,6 @@
ifeq ($(subdir),string)
sysdep_routines += memcpy-power7 memcpy-a2 memcpy-power6 memcpy-cell \
memcpy-power4 memcpy-ppc32 memcmp-power7 memcmp-power4 \
- memcmp-ppc32
+ memcmp-ppc32 memset-power7 memset-power6 memset-power4 \
+ memset-ppc32
endif
diff --git a/sysdeps/powerpc/powerpc32/multiarch/bzero.c b/sysdeps/powerpc/powerpc32/multiarch/bzero.c
new file mode 100644
index 0000000..4a14a18
--- /dev/null
+++ b/sysdeps/powerpc/powerpc32/multiarch/bzero.c
@@ -0,0 +1,68 @@
+/* Multiple versions of bzero.
+ Copyright (C) 2013 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+/* Define multiple versions only for definition in libc. */
+#ifndef NOT_IN_libc
+# include <string.h>
+# include <strings.h>
+# include "init-arch.h"
+
+extern __typeof (memset) __memset_ppc32 attribute_hidden;
+extern __typeof (memset) __memset_power4 attribute_hidden;
+extern __typeof (memset) __memset_power6 attribute_hidden;
+extern __typeof (memset) __memset_power7 attribute_hidden;
+
+__typeof(bzero) __bzero_ppc32 attribute_hidden;
+void
+__bzero_ppc32 (void *s, size_t n)
+{
+ __memset_ppc32 (s, 0, n);
+}
+
+__typeof(bzero) __bzero_power4 attribute_hidden;
+void
+__bzero_power4 (void *s, size_t n)
+{
+ __memset_power4 (s, 0, n);
+}
+
+__typeof(bzero) __bzero_power6 attribute_hidden;
+void
+__bzero_power6 (void *s, size_t n)
+{
+ __memset_power6 (s, 0, n);
+}
+
+__typeof(bzero) __bzero_power7 attribute_hidden;
+void
+__bzero_power7 (void *s, size_t n)
+{
+ __memset_power7 (s, 0, n);
+}
+
+libc_ifunc (__bzero,
+ (hwcap & PPC_FEATURE_HAS_VSX)
+ ? __bzero_power7 :
+ (hwcap & PPC_FEATURE_ARCH_2_05)
+ ? __bzero_power6 :
+ (hwcap & PPC_FEATURE_POWER4)
+ ? __bzero_power4
+ : __bzero_ppc32);
+
+weak_alias (__bzero, bzero)
+#endif
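For reference, the libc_ifunc invocation above expands to an STT_GNU_IFUNC
resolver shaped roughly like the sketch below, written here with the public
ifunc attribute.  On powerpc the dynamic loader passes AT_HWCAP as the
resolver argument.  The stand-in bodies, the hard-coded feature bits, and
the name my_bzero are illustrative only, not the macro's actual expansion.

#include <stddef.h>
#include <string.h>

/* Stand-ins for the __bzero_* routines defined above; illustrative.  */
static void bzero_ppc32 (void *s, size_t n) { memset (s, 0, n); }
static void bzero_power4 (void *s, size_t n) { memset (s, 0, n); }
static void bzero_power6 (void *s, size_t n) { memset (s, 0, n); }
static void bzero_power7 (void *s, size_t n) { memset (s, 0, n); }

/* PPC_FEATURE_* values as defined by the kernel's <asm/cputable.h>.  */
#define PPC_FEATURE_HAS_VSX    0x00000080
#define PPC_FEATURE_ARCH_2_05  0x00001000
#define PPC_FEATURE_POWER4     0x00080000

/* On powerpc, ifunc resolvers receive AT_HWCAP as their argument.  */
static void (*bzero_resolver (unsigned long hwcap)) (void *, size_t)
{
  if (hwcap & PPC_FEATURE_HAS_VSX)
    return bzero_power7;
  if (hwcap & PPC_FEATURE_ARCH_2_05)
    return bzero_power6;
  if (hwcap & PPC_FEATURE_POWER4)
    return bzero_power4;
  return bzero_ppc32;
}

void my_bzero (void *, size_t) __attribute__ ((ifunc ("bzero_resolver")));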
diff --git a/sysdeps/powerpc/powerpc32/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc32/multiarch/ifunc-impl-list.c
index 800aff7..45943cf 100644
--- a/sysdeps/powerpc/powerpc32/multiarch/ifunc-impl-list.c
+++ b/sysdeps/powerpc/powerpc32/multiarch/ifunc-impl-list.c
@@ -46,7 +46,27 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
else if (hwcap & PPC_FEATURE_POWER5)
hwcap |= PPC_FEATURE_POWER4;
+ /* Support sysdeps/powerpc/powerpc32/multiarch/bzero.c. */
+ IFUNC_IMPL (i, name, bzero,
+ IFUNC_IMPL_ADD (array, i, bzero, hwcap & PPC_FEATURE_HAS_VSX,
+ __bzero_power7)
+ IFUNC_IMPL_ADD (array, i, bzero, hwcap & PPC_FEATURE_ARCH_2_05,
+ __bzero_power6)
+ IFUNC_IMPL_ADD (array, i, bzero, hwcap & PPC_FEATURE_POWER4,
+ __bzero_power4)
+ IFUNC_IMPL_ADD (array, i, bzero, 1, __bzero_ppc32))
+
#ifdef SHARED
+ /* Support sysdeps/powerpc/powerpc32/multiarch/memset.c. */
+ IFUNC_IMPL (i, name, memset,
+ IFUNC_IMPL_ADD (array, i, memset, hwcap & PPC_FEATURE_HAS_VSX,
+ __memset_power7)
+ IFUNC_IMPL_ADD (array, i, memset, hwcap & PPC_FEATURE_ARCH_2_05,
+ __memset_power6)
+ IFUNC_IMPL_ADD (array, i, memset, hwcap & PPC_FEATURE_POWER4,
+ __memset_power4)
+ IFUNC_IMPL_ADD (array, i, memset, 1, __memset_ppc32))
+
/* Support sysdeps/powerpc/powerpc32/multiarch/memcpy.c. */
IFUNC_IMPL (i, name, memcpy,
IFUNC_IMPL_ADD (array, i, memcpy, hwcap & PPC_FEATURE_HAS_VSX,
diff --git a/sysdeps/powerpc/powerpc32/multiarch/memset-power4.S b/sysdeps/powerpc/powerpc32/multiarch/memset-power4.S
new file mode 100644
index 0000000..ceb5888
--- /dev/null
+++ b/sysdeps/powerpc/powerpc32/multiarch/memset-power4.S
@@ -0,0 +1,225 @@
+/* Optimized memset implementation for PowerPC32.
+ Copyright (C) 1997-2013 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+/* __ptr_t [r3] memset (__ptr_t s [r3], int c [r4], size_t n [r5]);
+ Returns 's'.
+
+ The memset is done in three sizes: byte (8 bits), word (32 bits),
+ cache line (1024 bits). There is a special case for setting cache lines
+ to 0, to take advantage of the dcbz instruction. */
+
+ .machine power4
+EALIGN (__memset_power4, 5, 0)
+ CALL_MCOUNT
+
+#define rTMP r0
+#define rRTN r3 /* Initial value of 1st argument. */
+#define rMEMP0 r3 /* Original value of 1st arg. */
+#define rCHR r4 /* Char to set in each byte. */
+#define rLEN r5 /* Length of region to set. */
+#define rMEMP r6 /* Address at which we are storing. */
+#define rALIGN r7 /* Number of bytes we are setting now (when aligning). */
+#define rMEMP2 r8
+
+#define rNEG64 r8 /* Constant -64 for clearing with dcbz. */
+#define rCLS r8 /* Cache line size (known to be 128). */
+#define rCLM r9 /* Cache line size mask to check for cache alignment. */
+L(_memset):
+/* Take care of case for size <= 4. */
+ cmplwi cr1, rLEN, 4
+ andi. rALIGN, rMEMP0, 3
+ mr rMEMP, rMEMP0
+ ble- cr1, L(small)
+
+/* Align to word boundary. */
+ cmplwi cr5, rLEN, 31
+ rlwimi rCHR, rCHR, 8, 16, 23 /* Replicate byte to halfword. */
+ beq+ L(aligned)
+ mtcrf 0x01, rMEMP0
+ subfic rALIGN, rALIGN, 4
+ add rMEMP, rMEMP, rALIGN
+ sub rLEN, rLEN, rALIGN
+ bf+ 31, L(g0)
+ stb rCHR, 0(rMEMP0)
+ bt 30, L(aligned)
+L(g0):
+ sth rCHR, -2(rMEMP)
+
+/* Handle the case of size < 31. */
+L(aligned):
+ mtcrf 0x01, rLEN
+ rlwimi rCHR, rCHR, 16, 0, 15 /* Replicate halfword to word. */
+ ble cr5, L(medium)
+/* Align to 32-byte boundary. */
+ andi. rALIGN, rMEMP, 0x1C
+ subfic rALIGN, rALIGN, 0x20
+ beq L(caligned)
+ mtcrf 0x01, rALIGN
+ add rMEMP, rMEMP, rALIGN
+ sub rLEN, rLEN, rALIGN
+ cmplwi cr1, rALIGN, 0x10
+ mr rMEMP2, rMEMP
+ bf 28, L(a1)
+ stw rCHR, -4(rMEMP2)
+ stwu rCHR, -8(rMEMP2)
+L(a1): blt cr1, L(a2)
+ stw rCHR, -4(rMEMP2)
+ stw rCHR, -8(rMEMP2)
+ stw rCHR, -12(rMEMP2)
+ stwu rCHR, -16(rMEMP2)
+L(a2): bf 29, L(caligned)
+ stw rCHR, -4(rMEMP2)
+
+/* Now aligned to a 32 byte boundary. */
+L(caligned):
+ cmplwi cr1, rCHR, 0
+ clrrwi. rALIGN, rLEN, 5
+ mtcrf 0x01, rLEN
+ beq cr1, L(zloopstart) /* Special case for clearing memory using dcbz. */
+L(nondcbz):
+ srwi rTMP, rALIGN, 5
+ mtctr rTMP
+ beq L(medium) /* We may not actually get to do a full line. */
+ clrlwi. rLEN, rLEN, 27
+ add rMEMP, rMEMP, rALIGN
+ li rNEG64, -0x40
+ bdz L(cloopdone)
+
+ .align 4
+L(c3): dcbtst rNEG64, rMEMP
+ stw rCHR, -4(rMEMP)
+ stw rCHR, -8(rMEMP)
+ stw rCHR, -12(rMEMP)
+ stw rCHR, -16(rMEMP)
+ stw rCHR, -20(rMEMP)
+ stw rCHR, -24(rMEMP)
+ stw rCHR, -28(rMEMP)
+ stwu rCHR, -32(rMEMP)
+ bdnz L(c3)
+L(cloopdone):
+ stw rCHR, -4(rMEMP)
+ stw rCHR, -8(rMEMP)
+ stw rCHR, -12(rMEMP)
+ stw rCHR, -16(rMEMP)
+ cmplwi cr1, rLEN, 16
+ stw rCHR, -20(rMEMP)
+ stw rCHR, -24(rMEMP)
+ stw rCHR, -28(rMEMP)
+ stwu rCHR, -32(rMEMP)
+ beqlr
+ add rMEMP, rMEMP, rALIGN
+ b L(medium_tail2)
+
+ .align 5
+/* Clear lines of memory in 128-byte chunks. */
+L(zloopstart):
+/* If the remaining length is less than 32 bytes, don't bother getting
+ the cache line size. */
+ beq L(medium)
+ li rCLS,128 /* cache line size is 128 */
+ dcbt 0,rMEMP
+L(getCacheAligned):
+ cmplwi cr1,rLEN,32
+ andi. rTMP,rMEMP,127
+ blt cr1,L(handletail32)
+ beq L(cacheAligned)
+ addi rMEMP,rMEMP,32
+ addi rLEN,rLEN,-32
+ stw rCHR,-32(rMEMP)
+ stw rCHR,-28(rMEMP)
+ stw rCHR,-24(rMEMP)
+ stw rCHR,-20(rMEMP)
+ stw rCHR,-16(rMEMP)
+ stw rCHR,-12(rMEMP)
+ stw rCHR,-8(rMEMP)
+ stw rCHR,-4(rMEMP)
+ b L(getCacheAligned)
+
+/* Now we are aligned to the cache line and can use dcbz. */
+ .align 4
+L(cacheAligned):
+ cmplw cr1,rLEN,rCLS
+ blt cr1,L(handletail32)
+ dcbz 0,rMEMP
+ subf rLEN,rCLS,rLEN
+ add rMEMP,rMEMP,rCLS
+ b L(cacheAligned)
+
+/* We are here because the cache line size was set and the remainder
+ (rLEN) is less than the actual cache line size.
+ So set up the preconditions for L(nondcbz) and go there. */
+L(handletail32):
+ clrrwi. rALIGN, rLEN, 5
+ b L(nondcbz)
+
+ .align 5
+L(small):
+/* Memset of 4 bytes or less. */
+ cmplwi cr5, rLEN, 1
+ cmplwi cr1, rLEN, 3
+ bltlr cr5
+ stb rCHR, 0(rMEMP)
+ beqlr cr5
+ stb rCHR, 1(rMEMP)
+ bltlr cr1
+ stb rCHR, 2(rMEMP)
+ beqlr cr1
+ stb rCHR, 3(rMEMP)
+ blr
+
+/* Memset of 0-31 bytes. */
+ .align 5
+L(medium):
+ cmplwi cr1, rLEN, 16
+L(medium_tail2):
+ add rMEMP, rMEMP, rLEN
+L(medium_tail):
+ bt- 31, L(medium_31t)
+ bt- 30, L(medium_30t)
+L(medium_30f):
+ bt- 29, L(medium_29t)
+L(medium_29f):
+ bge- cr1, L(medium_27t)
+ bflr- 28
+ stw rCHR, -4(rMEMP)
+ stw rCHR, -8(rMEMP)
+ blr
+
+L(medium_31t):
+ stbu rCHR, -1(rMEMP)
+ bf- 30, L(medium_30f)
+L(medium_30t):
+ sthu rCHR, -2(rMEMP)
+ bf- 29, L(medium_29f)
+L(medium_29t):
+ stwu rCHR, -4(rMEMP)
+ blt- cr1, L(medium_27f)
+L(medium_27t):
+ stw rCHR, -4(rMEMP)
+ stw rCHR, -8(rMEMP)
+ stw rCHR, -12(rMEMP)
+ stwu rCHR, -16(rMEMP)
+L(medium_27f):
+ bflr- 28
+L(medium_28t):
+ stw rCHR, -4(rMEMP)
+ stw rCHR, -8(rMEMP)
+ blr
+END (__memset_power4)
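As a reading aid, the tiered structure described in the header comment of
this file (byte stores to reach word alignment, word stores, cache-line
chunks, with a dcbz special case for zero) corresponds to the plain-C
sketch below.  It is illustrative only: it collapses the 32-byte sector
handling and the dcbz path into a single word loop.

#include <stddef.h>
#include <stdint.h>

void *
memset_tiered (void *s, int c, size_t n)
{
  unsigned char *p = s;
  uint32_t w = (unsigned char) c * 0x01010101u;  /* Replicate byte to word.  */

  /* Byte stores until the pointer is word (4-byte) aligned.  */
  while (n > 0 && ((uintptr_t) p & 3) != 0)
    {
      *p++ = (unsigned char) c;
      n--;
    }

  /* Word stores.  The assembly additionally aligns to the 32-byte
     sector and, when c == 0, clears whole 128-byte lines with dcbz.  */
  while (n >= 4)
    {
      *(uint32_t *) p = w;
      p += 4;
      n -= 4;
    }

  /* Byte tail.  */
  while (n > 0)
    {
      *p++ = (unsigned char) c;
      n--;
    }
  return s;
}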
diff --git a/sysdeps/powerpc/powerpc32/multiarch/memset-power6.S b/sysdeps/powerpc/powerpc32/multiarch/memset-power6.S
new file mode 100644
index 0000000..493c1ef
--- /dev/null
+++ b/sysdeps/powerpc/powerpc32/multiarch/memset-power6.S
@@ -0,0 +1,538 @@
+/* Optimized 32-bit memset implementation for POWER6.
+ Copyright (C) 1997-2013 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+/* __ptr_t [r3] memset (__ptr_t s [r3], int c [r4], size_t n [r5]);
+ Returns 's'.
+
+ The memset is done in three sizes: byte (8 bits), word (32 bits),
+ cache line (1024 bits). There is a special case for setting cache lines
+ to 0, to take advantage of the dcbz instruction. */
+
+ .machine power6
+EALIGN (__memset_power6, 7, 0)
+ CALL_MCOUNT
+
+#define rTMP r0
+#define rRTN r3 /* Initial value of 1st argument. */
+#define rMEMP0 r3 /* Original value of 1st arg. */
+#define rCHR r4 /* Char to set in each byte. */
+#define rLEN r5 /* Length of region to set. */
+#define rMEMP r6 /* Address at which we are storing. */
+#define rALIGN r7 /* Number of bytes we are setting now (when aligning). */
+#define rMEMP2 r8
+
+#define rNEG64 r8 /* Constant -64 for clearing with dcbz. */
+#define rMEMP3 r9 /* Alt mem pointer. */
+L(_memset):
+/* Take care of case for size <= 4. */
+ cmplwi cr1, rLEN, 4
+ andi. rALIGN, rMEMP0, 3
+ mr rMEMP, rMEMP0
+ ble- cr1, L(small)
+/* Align to word boundary. */
+ cmplwi cr5, rLEN, 31
+ rlwimi rCHR, rCHR, 8, 16, 23 /* Replicate byte to halfword. */
+ beq+ L(aligned)
+ mtcrf 0x01, rMEMP0
+ subfic rALIGN, rALIGN, 4
+ add rMEMP, rMEMP, rALIGN
+ sub rLEN, rLEN, rALIGN
+ bf+ 31, L(g0)
+ stb rCHR, 0(rMEMP0)
+ bt 30, L(aligned)
+L(g0):
+ sth rCHR, -2(rMEMP)
+
+ .align 4
+/* Handle the case of size < 31. */
+L(aligned):
+ mtcrf 0x01, rLEN
+ rlwimi rCHR, rCHR, 16, 0, 15 /* Replicate halfword to word. */
+ ble cr5, L(medium)
+/* Align to 32-byte boundary. */
+ andi. rALIGN, rMEMP, 0x1C
+ subfic rALIGN, rALIGN, 0x20
+ beq L(caligned)
+ mtcrf 0x01, rALIGN
+ add rMEMP, rMEMP, rALIGN
+ sub rLEN, rLEN, rALIGN
+ cmplwi cr1, rALIGN, 0x10
+ mr rMEMP2, rMEMP
+ bf 28, L(a1)
+ stw rCHR, -4(rMEMP2)
+ stwu rCHR, -8(rMEMP2)
+ nop
+L(a1): blt cr1, L(a2)
+ stw rCHR, -4(rMEMP2)
+ stw rCHR, -8(rMEMP2)
+ stw rCHR, -12(rMEMP2)
+ stwu rCHR, -16(rMEMP2)
+L(a2): bf 29, L(caligned)
+ stw rCHR, -4(rMEMP2)
+
+ .align 3
+/* Now aligned to a 32 byte boundary. */
+L(caligned):
+ cmplwi cr1, rCHR, 0
+ clrrwi. rALIGN, rLEN, 5
+ mtcrf 0x01, rLEN
+ beq cr1, L(zloopstart) /* Special case for clearing memory using dcbz. */
+L(nondcbz):
+ beq L(medium) /* We may not actually get to do a full line. */
+ nop
+/* Storing a non-zero "c" value. We are aligned at a sector (32-byte)
+ boundary but may not be at a cache line (128-byte) boundary. */
+L(nzloopstart):
+/* memset in 32-byte chunks until we get to a cache line boundary.
+ If rLEN is less than the distance to the next cache-line boundary use
+ cacheAligned1 code to finish the tail. */
+ cmplwi cr1,rLEN,128
+
+ andi. rTMP,rMEMP,127
+ blt cr1,L(cacheAligned1)
+ addi rMEMP3,rMEMP,32
+ beq L(nzCacheAligned)
+ addi rLEN,rLEN,-32
+ stw rCHR,0(rMEMP)
+ stw rCHR,4(rMEMP)
+ stw rCHR,8(rMEMP)
+ stw rCHR,12(rMEMP)
+ stw rCHR,16(rMEMP)
+ stw rCHR,20(rMEMP)
+ addi rMEMP,rMEMP,32
+ andi. rTMP,rMEMP3,127
+ stw rCHR,-8(rMEMP3)
+ stw rCHR,-4(rMEMP3)
+
+ beq L(nzCacheAligned)
+ addi rLEN,rLEN,-32
+ stw rCHR,0(rMEMP3)
+ stw rCHR,4(rMEMP3)
+ addi rMEMP,rMEMP,32
+ stw rCHR,8(rMEMP3)
+ stw rCHR,12(rMEMP3)
+ andi. rTMP,rMEMP,127
+ stw rCHR,16(rMEMP3)
+ stw rCHR,20(rMEMP3)
+ stw rCHR,24(rMEMP3)
+ stw rCHR,28(rMEMP3)
+
+ beq L(nzCacheAligned)
+ addi rLEN,rLEN,-32
+/* At this point we can overrun the store queue (pipe reject) so it is
+ time to slow things down. The store queue can merge two adjacent
+ stores into a single L1/L2 op, but the L2 is clocked at 1/2 the CPU.
+ So we add "group ending nops" to guarantee that we dispatch only two
+ stores every other cycle. */
+ ori r1,r1,0
+ ori r1,r1,0
+ stw rCHR,32(rMEMP3)
+ stw rCHR,36(rMEMP3)
+ addi rMEMP,rMEMP,32
+ cmplwi cr1,rLEN,128
+ ori r1,r1,0
+ ori r1,r1,0
+ stw rCHR,40(rMEMP3)
+ stw rCHR,44(rMEMP3)
+ ori r1,r1,0
+ ori r1,r1,0
+ stw rCHR,48(rMEMP3)
+ stw rCHR,52(rMEMP3)
+ ori r1,r1,0
+ ori r1,r1,0
+ stw rCHR,56(rMEMP3)
+ stw rCHR,60(rMEMP3)
+ blt cr1,L(cacheAligned1)
+ b L(nzCacheAligned)
+
+/* Now we are aligned to the cache line and can use dcbtst. */
+ .align 5
+L(nzCacheAligned):
+ cmplwi cr1,rLEN,128
+ cmplwi cr6,rLEN,256
+ blt cr1,L(cacheAligned1)
+ blt cr6,L(nzCacheAligned128)
+ .align 4
+L(nzCacheAligned128):
+ nop
+ addi rMEMP3,rMEMP,64
+ stw rCHR,0(rMEMP)
+ stw rCHR,4(rMEMP)
+ stw rCHR,8(rMEMP)
+ stw rCHR,12(rMEMP)
+ stw rCHR,16(rMEMP)
+ stw rCHR,20(rMEMP)
+ stw rCHR,24(rMEMP)
+ stw rCHR,28(rMEMP)
+ stw rCHR,32(rMEMP)
+ stw rCHR,36(rMEMP)
+ stw rCHR,40(rMEMP)
+ stw rCHR,44(rMEMP)
+ stw rCHR,48(rMEMP)
+ stw rCHR,52(rMEMP)
+ stw rCHR,56(rMEMP)
+ stw rCHR,60(rMEMP)
+ addi rMEMP,rMEMP3,64
+ addi rLEN,rLEN,-128
+/* At this point we can overrun the store queue (pipe reject) so it is
+ time to slow things down. The store queue can merge two adjacent
+ stores into a single L1/L2 op, but the L2 is clocked at 1/2 the CPU.
+ So we add "group ending nops" to guarantee that we dispatch only one
+ store per cycle. */
+ stw rCHR,0(rMEMP3)
+ ori r1,r1,0
+ stw rCHR,4(rMEMP3)
+ ori r1,r1,0
+ stw rCHR,8(rMEMP3)
+ ori r1,r1,0
+ stw rCHR,12(rMEMP3)
+ ori r1,r1,0
+ stw rCHR,16(rMEMP3)
+ ori r1,r1,0
+ stw rCHR,20(rMEMP3)
+ ori r1,r1,0
+ stw rCHR,24(rMEMP3)
+ ori r1,r1,0
+ stw rCHR,28(rMEMP3)
+ ori r1,r1,0
+ stw rCHR,32(rMEMP3)
+ ori r1,r1,0
+ stw rCHR,36(rMEMP3)
+ ori r1,r1,0
+ stw rCHR,40(rMEMP3)
+ ori r1,r1,0
+ stw rCHR,44(rMEMP3)
+ ori r1,r1,0
+ stw rCHR,48(rMEMP3)
+ ori r1,r1,0
+ stw rCHR,52(rMEMP3)
+ ori r1,r1,0
+ stw rCHR,56(rMEMP3)
+ ori r1,r1,0
+ stw rCHR,60(rMEMP3)
+ blt cr6,L(cacheAligned1)
+#ifndef NOT_IN_libc
+ lfd 0,-128(rMEMP)
+#endif
+ b L(nzCacheAligned256)
+ .align 5
+L(nzCacheAligned256):
+ cmplwi cr1,rLEN,256
+ addi rMEMP3,rMEMP,64
+#ifdef NOT_IN_libc
+/* When we are not in libc we should use only GPRs to avoid the FPU lock
+ interrupt. */
+ stw rCHR,0(rMEMP)
+ stw rCHR,4(rMEMP)
+ stw rCHR,8(rMEMP)
+ stw rCHR,12(rMEMP)
+ stw rCHR,16(rMEMP)
+ stw rCHR,20(rMEMP)
+ stw rCHR,24(rMEMP)
+ stw rCHR,28(rMEMP)
+ stw rCHR,32(rMEMP)
+ stw rCHR,36(rMEMP)
+ stw rCHR,40(rMEMP)
+ stw rCHR,44(rMEMP)
+ stw rCHR,48(rMEMP)
+ stw rCHR,52(rMEMP)
+ stw rCHR,56(rMEMP)
+ stw rCHR,60(rMEMP)
+ addi rMEMP,rMEMP3,64
+ addi rLEN,rLEN,-128
+ stw rCHR,0(rMEMP3)
+ stw rCHR,4(rMEMP3)
+ stw rCHR,8(rMEMP3)
+ stw rCHR,12(rMEMP3)
+ stw rCHR,16(rMEMP3)
+ stw rCHR,20(rMEMP3)
+ stw rCHR,24(rMEMP3)
+ stw rCHR,28(rMEMP3)
+ stw rCHR,32(rMEMP3)
+ stw rCHR,36(rMEMP3)
+ stw rCHR,40(rMEMP3)
+ stw rCHR,44(rMEMP3)
+ stw rCHR,48(rMEMP3)
+ stw rCHR,52(rMEMP3)
+ stw rCHR,56(rMEMP3)
+ stw rCHR,60(rMEMP3)
+#else
+/* We are in libc and this is a long memset so we can use FPRs and can afford
+ occasional FPU locked interrupts. */
+ stfd 0,0(rMEMP)
+ stfd 0,8(rMEMP)
+ stfd 0,16(rMEMP)
+ stfd 0,24(rMEMP)
+ stfd 0,32(rMEMP)
+ stfd 0,40(rMEMP)
+ stfd 0,48(rMEMP)
+ stfd 0,56(rMEMP)
+ addi rMEMP,rMEMP3,64
+ addi rLEN,rLEN,-128
+ stfd 0,0(rMEMP3)
+ stfd 0,8(rMEMP3)
+ stfd 0,16(rMEMP3)
+ stfd 0,24(rMEMP3)
+ stfd 0,32(rMEMP3)
+ stfd 0,40(rMEMP3)
+ stfd 0,48(rMEMP3)
+ stfd 0,56(rMEMP3)
+#endif
+ bge cr1,L(nzCacheAligned256)
+ dcbtst 0,rMEMP
+ b L(cacheAligned1)
+
+ .align 4
+/* Storing a zero "c" value. We are aligned at a sector (32-byte)
+ boundary but may not be at cache line (128-byte) boundary. If the
+ remaining length spans a full cache line we can use the Data cache
+ block zero instruction. */
+L(zloopstart):
+/* memset in 32-byte chunks until we get to a cache line boundary.
+ If rLEN is less than the distance to the next cache-line boundary use
+ cacheAligned1 code to finish the tail. */
+ cmplwi cr1,rLEN,128
+ beq L(medium)
+L(getCacheAligned):
+ andi. rTMP,rMEMP,127
+ blt cr1,L(cacheAligned1)
+ addi rMEMP3,rMEMP,32
+ beq L(cacheAligned)
+ addi rLEN,rLEN,-32
+ stw rCHR,0(rMEMP)
+ stw rCHR,4(rMEMP)
+ stw rCHR,8(rMEMP)
+ stw rCHR,12(rMEMP)
+ stw rCHR,16(rMEMP)
+ stw rCHR,20(rMEMP)
+ addi rMEMP,rMEMP,32
+ andi. rTMP,rMEMP3,127
+ stw rCHR,-8(rMEMP3)
+ stw rCHR,-4(rMEMP3)
+L(getCacheAligned2):
+ beq L(cacheAligned)
+ addi rLEN,rLEN,-32
+ addi rMEMP,rMEMP,32
+ stw rCHR,0(rMEMP3)
+ stw rCHR,4(rMEMP3)
+ stw rCHR,8(rMEMP3)
+ stw rCHR,12(rMEMP3)
+ andi. rTMP,rMEMP,127
+ nop
+ stw rCHR,16(rMEMP3)
+ stw rCHR,20(rMEMP3)
+ stw rCHR,24(rMEMP3)
+ stw rCHR,28(rMEMP3)
+L(getCacheAligned3):
+ beq L(cacheAligned)
+/* At this point we can overrun the store queue (pipe reject) so it is
+ time to slow things down. The store queue can merge two adjacent
+ stores into a single L1/L2 op, but the L2 is clocked at 1/2 the CPU.
+ So we add "group ending nops" to guarantee that we dispatch only two
+ stores every other cycle. */
+ addi rLEN,rLEN,-32
+ ori r1,r1,0
+ ori r1,r1,0
+ stw rCHR,32(rMEMP3)
+ stw rCHR,36(rMEMP3)
+ addi rMEMP,rMEMP,32
+ cmplwi cr1,rLEN,128
+ ori r1,r1,0
+ stw rCHR,40(rMEMP3)
+ stw rCHR,44(rMEMP3)
+ cmplwi cr6,rLEN,256
+ li rMEMP2,128
+ ori r1,r1,0
+ stw rCHR,48(rMEMP3)
+ stw rCHR,52(rMEMP3)
+ ori r1,r1,0
+ ori r1,r1,0
+ stw rCHR,56(rMEMP3)
+ stw rCHR,60(rMEMP3)
+ blt cr1,L(cacheAligned1)
+ blt cr6,L(cacheAligned128)
+ b L(cacheAlignedx)
+
+/* Now we are aligned to the cache line and can use dcbz. */
+ .align 4
+L(cacheAligned):
+ cmplwi cr1,rLEN,128
+ cmplwi cr6,rLEN,256
+ blt cr1,L(cacheAligned1)
+ li rMEMP2,128
+L(cacheAlignedx):
+ cmplwi cr5,rLEN,640
+ blt cr6,L(cacheAligned128)
+ bgt cr5,L(cacheAligned512)
+ cmplwi cr6,rLEN,512
+ dcbz 0,rMEMP
+ cmplwi cr1,rLEN,384
+ dcbz rMEMP2,rMEMP
+ addi rMEMP,rMEMP,256
+ addi rLEN,rLEN,-256
+ blt cr1,L(cacheAligned1)
+ blt cr6,L(cacheAligned128)
+ b L(cacheAligned256)
+ .align 5
+/* A simple loop for the longer (>640 bytes) lengths. This form limits
+ the mispredicted branches to exactly 1 at loop exit. */
+L(cacheAligned512):
+ cmpli cr1,rLEN,128
+ blt cr1,L(cacheAligned1)
+ dcbz 0,rMEMP
+ addi rLEN,rLEN,-128
+ addi rMEMP,rMEMP,128
+ b L(cacheAligned512)
+ .align 5
+L(cacheAligned256):
+ cmplwi cr6,rLEN,512
+ dcbz 0,rMEMP
+ cmplwi cr1,rLEN,384
+ dcbz rMEMP2,rMEMP
+ addi rMEMP,rMEMP,256
+ addi rLEN,rLEN,-256
+ bge cr6,L(cacheAligned256)
+ blt cr1,L(cacheAligned1)
+ .align 4
+L(cacheAligned128):
+ dcbz 0,rMEMP
+ addi rMEMP,rMEMP,128
+ addi rLEN,rLEN,-128
+ .align 4
+L(cacheAligned1):
+ cmplwi cr1,rLEN,32
+ blt cr1,L(handletail32)
+ addi rMEMP3,rMEMP,32
+ addi rLEN,rLEN,-32
+ stw rCHR,0(rMEMP)
+ stw rCHR,4(rMEMP)
+ stw rCHR,8(rMEMP)
+ stw rCHR,12(rMEMP)
+ stw rCHR,16(rMEMP)
+ stw rCHR,20(rMEMP)
+ addi rMEMP,rMEMP,32
+ cmplwi cr1,rLEN,32
+ stw rCHR,-8(rMEMP3)
+ stw rCHR,-4(rMEMP3)
+L(cacheAligned2):
+ blt cr1,L(handletail32)
+ addi rLEN,rLEN,-32
+ stw rCHR,0(rMEMP3)
+ stw rCHR,4(rMEMP3)
+ stw rCHR,8(rMEMP3)
+ stw rCHR,12(rMEMP3)
+ addi rMEMP,rMEMP,32
+ cmplwi cr1,rLEN,32
+ stw rCHR,16(rMEMP3)
+ stw rCHR,20(rMEMP3)
+ stw rCHR,24(rMEMP3)
+ stw rCHR,28(rMEMP3)
+ nop
+L(cacheAligned3):
+ blt cr1,L(handletail32)
+/* At this point we can overrun the store queue (pipe reject) so it is
+ time to slow things down. The store queue can merge two adjacent
+ stores into a single L1/L2 op, but the L2 is clocked at 1/2 the CPU.
+ So we add "group ending nops" to guarantee that we dispatch only two
+ stores every other cycle. */
+ ori r1,r1,0
+ ori r1,r1,0
+ addi rMEMP,rMEMP,32
+ addi rLEN,rLEN,-32
+ ori r1,r1,0
+ ori r1,r1,0
+ stw rCHR,32(rMEMP3)
+ stw rCHR,36(rMEMP3)
+ ori r1,r1,0
+ ori r1,r1,0
+ stw rCHR,40(rMEMP3)
+ stw rCHR,44(rMEMP3)
+ ori r1,r1,0
+ ori r1,r1,0
+ stw rCHR,48(rMEMP3)
+ stw rCHR,52(rMEMP3)
+ ori r1,r1,0
+ ori r1,r1,0
+ stw rCHR,56(rMEMP3)
+ stw rCHR,60(rMEMP3)
+
+/* We are here because the length or remainder (rLEN) is less than the
+ cache line/sector size and does not justify aggressive loop unrolling.
+ So set up the preconditions for L(medium) and go there. */
+ .align 3
+L(handletail32):
+ cmplwi cr1,rLEN,0
+ beqlr cr1
+ b L(medium)
+
+ .align 4
+L(small):
+/* Memset of 4 bytes or less. */
+ cmplwi cr5, rLEN, 1
+ cmplwi cr1, rLEN, 3
+ bltlr cr5
+ stb rCHR, 0(rMEMP)
+ beqlr cr5
+ stb rCHR, 1(rMEMP)
+ bltlr cr1
+ stb rCHR, 2(rMEMP)
+ beqlr cr1
+ stb rCHR, 3(rMEMP)
+ blr
+
+/* Memset of 0-31 bytes. */
+ .align 5
+L(medium):
+ cmplwi cr1, rLEN, 16
+L(medium_tail2):
+ add rMEMP, rMEMP, rLEN
+L(medium_tail):
+ bt- 31, L(medium_31t)
+ bt- 30, L(medium_30t)
+L(medium_30f):
+ bt 29, L(medium_29t)
+L(medium_29f):
+ bge cr1, L(medium_27t)
+ bflr 28
+ stw rCHR, -4(rMEMP)
+ stw rCHR, -8(rMEMP)
+ blr
+
+L(medium_31t):
+ stbu rCHR, -1(rMEMP)
+ bf- 30, L(medium_30f)
+L(medium_30t):
+ sthu rCHR, -2(rMEMP)
+ bf- 29, L(medium_29f)
+L(medium_29t):
+ stwu rCHR, -4(rMEMP)
+ blt cr1, L(medium_27f)
+L(medium_27t):
+ stw rCHR, -4(rMEMP)
+ stw rCHR, -8(rMEMP)
+ stw rCHR, -12(rMEMP)
+ stwu rCHR, -16(rMEMP)
+L(medium_27f):
+ bflr 28
+L(medium_28t):
+ stw rCHR, -4(rMEMP)
+ stw rCHR, -8(rMEMP)
+ blr
+END (__memset_power6)
diff --git a/sysdeps/powerpc/powerpc32/multiarch/memset-power7.S b/sysdeps/powerpc/powerpc32/multiarch/memset-power7.S
new file mode 100644
index 0000000..f8adb0b
--- /dev/null
+++ b/sysdeps/powerpc/powerpc32/multiarch/memset-power7.S
@@ -0,0 +1,429 @@
+/* Optimized memset implementation for PowerPC32/POWER7.
+ Copyright (C) 2010-2013 Free Software Foundation, Inc.
+ Contributed by Luis Machado <luisgpm@br.ibm.com>.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+/* __ptr_t [r3] memset (__ptr_t s [r3], int c [r4], size_t n [r5]);
+ Returns 's'. */
+
+ .machine power7
+EALIGN (__memset_power7, 5, 0)
+ CALL_MCOUNT
+
+ .align 4
+L(_memset):
+ cmplwi cr7,5,31
+ cmplwi cr6,5,8
+ mr 10,3 /* Save original argument for later. */
+ mr 7,1 /* Save original r1 for later. */
+ cfi_offset(31,-8)
+
+ /* Replicate byte to word. */
+ rlwimi 4,4,8,16,23
+ rlwimi 4,4,16,0,15
+
+ ble cr6,L(small) /* If length <= 8, use short copy code. */
+
+ neg 0,3
+ ble cr7,L(medium) /* If length < 32, use medium copy code. */
+
+ /* Save our word twice to create a doubleword that we will later
+ copy to a FPR. */
+ stwu 1,-32(1)
+ andi. 11,10,7 /* Check alignment of DST. */
+ mr 12,5
+ stw 4,24(1)
+ stw 4,28(1)
+ beq L(big_aligned)
+
+ clrlwi 0,0,29
+ mtocrf 0x01,0
+ subf 5,0,5
+
+ /* Get DST aligned to 8 bytes. */
+1: bf 31,2f
+
+ stb 4,0(10)
+ addi 10,10,1
+2: bf 30,4f
+
+ sth 4,0(10)
+ addi 10,10,2
+4: bf 29,L(big_aligned)
+
+ stw 4,0(10)
+ addi 10,10,4
+
+ .align 4
+L(big_aligned):
+ cmplwi cr5,5,255
+ li 0,32
+ cmplwi cr1,5,160
+ dcbtst 0,10
+ cmplwi cr6,4,0
+ srwi 9,5,3 /* Number of full doublewords remaining. */
+ crand 27,26,21
+ mtocrf 0x01,9
+ bt 27,L(huge)
+
+ /* From this point on, we'll copy 32+ bytes and the value
+ isn't 0 (so we can't use dcbz). */
+
+ srwi 8,5,5
+ clrlwi 11,5,29
+ cmplwi cr6,11,0
+ cmplwi cr1,9,4
+ mtctr 8
+
+ /* Copy 1~3 doublewords so the main loop starts
+ at a multiple of 32 bytes. */
+
+ bf 30,1f
+
+ stw 4,0(10)
+ stw 4,4(10)
+ stw 4,8(10)
+ stw 4,12(10)
+ addi 10,10,16
+ bf 31,L(big_loop)
+
+ stw 4,0(10)
+ stw 4,4(10)
+ addi 10,10,8
+ mr 12,10
+ blt cr1,L(tail_bytes)
+
+ b L(big_loop)
+
+ .align 4
+1: /* Copy 1 doubleword. */
+ bf 31,L(big_loop)
+
+ stw 4,0(10)
+ stw 4,4(10)
+ addi 10,10,8
+
+ /* First use a 32-byte loop with stw's to try to avoid the LHS due
+ to the lfd we will do next. Also, ping-pong through r10 and r12
+ to avoid AGEN delays. */
+ .align 4
+L(big_loop):
+ addi 12,10,32
+ stw 4,0(10)
+ stw 4,4(10)
+ stw 4,8(10)
+ stw 4,12(10)
+ stw 4,16(10)
+ stw 4,20(10)
+ stw 4,24(10)
+ stw 4,28(10)
+ bdz L(tail_bytes)
+
+ addi 10,10,64
+ stw 4,0(12)
+ stw 4,4(12)
+ stw 4,8(12)
+ stw 4,12(12)
+ stw 4,16(12)
+ stw 4,20(12)
+ stw 4,24(12)
+ stw 4,28(12)
+ bdnz L(big_loop_fast_setup)
+
+ mr 12,10
+ b L(tail_bytes)
+
+ /* Now that we're probably past the LHS window, use the VSX to
+ speed up the loop. */
+L(big_loop_fast_setup):
+ li 11,24
+ li 6,16
+ lxvdsx 4,1,11
+
+ .align 4
+L(big_loop_fast):
+ addi 12,10,32
+ stxvd2x 4,0,10
+ stxvd2x 4,10,6
+ bdz L(tail_bytes)
+
+ addi 10,10,64
+ stxvd2x 4,0,12
+ stxvd2x 4,12,6
+ bdnz L(big_loop_fast)
+
+ mr 12,10
+
+ .align 4
+L(tail_bytes):
+
+ /* Check for tail bytes. */
+ mr 1,7 /* Restore r1. */
+ beqlr cr6
+
+ clrlwi 0,5,29
+ mtocrf 0x01,0
+
+ /* At this point we have a tail of 0-7 bytes and we know that the
+ destination is doubleword-aligned. */
+4: /* Copy 4 bytes. */
+ bf 29,2f
+
+ stw 4,0(12)
+ addi 12,12,4
+2: /* Copy 2 bytes. */
+ bf 30,1f
+
+ sth 4,0(12)
+ addi 12,12,2
+1: /* Copy 1 byte. */
+ bflr 31
+
+ stb 4,0(12)
+ blr
+
+
+ /* Special case when value is 0 and we have a long length to deal
+ with. Use dcbz to zero out 128 bytes at a time. Before using
+ dcbz though, we need to get the destination 128-byte aligned. */
+ .align 4
+L(huge):
+ lfd 4,24(1)
+ andi. 11,10,127
+ neg 0,10
+ beq L(huge_aligned)
+
+ clrlwi 0,0,25
+ subf 5,0,5
+ srwi 0,0,3
+ mtocrf 0x01,0
+
+ /* Get DST aligned to 128 bytes. */
+8: bf 28,4f
+
+ stfd 4,0(10)
+ stfd 4,8(10)
+ stfd 4,16(10)
+ stfd 4,24(10)
+ stfd 4,32(10)
+ stfd 4,40(10)
+ stfd 4,48(10)
+ stfd 4,56(10)
+ addi 10,10,64
+ .align 4
+4: bf 29,2f
+
+ stfd 4,0(10)
+ stfd 4,8(10)
+ stfd 4,16(10)
+ stfd 4,24(10)
+ addi 10,10,32
+ .align 4
+2: bf 30,1f
+
+ stfd 4,0(10)
+ stfd 4,8(10)
+ addi 10,10,16
+ .align 4
+1: bf 31,L(huge_aligned)
+
+ stfd 4,0(10)
+ addi 10,10,8
+
+L(huge_aligned):
+ srwi 8,5,7
+ clrlwi 11,5,25
+ cmplwi cr6,11,0
+ mtctr 8
+
+ /* Copy 128 bytes at a time. */
+ .align 4
+L(huge_loop):
+ dcbz 0,10
+ addi 10,10,128
+ bdnz L(huge_loop)
+
+ /* We have a tail of 0~127 bytes to handle. */
+ mr 1,7 /* Restore r1. */
+ beqlr cr6
+
+ subf 9,3,10
+ subf 5,9,12
+ srwi 8,5,3
+ cmplwi cr6,8,0
+ mtocrf 0x01,8
+
+ /* We have a tail of 1~127 bytes. Copy up to 15 doublewords for
+ speed. We'll handle the resulting tail bytes later. */
+ beq cr6,L(tail)
+
+8: bf 28,4f
+
+ stfd 4,0(10)
+ stfd 4,8(10)
+ stfd 4,16(10)
+ stfd 4,24(10)
+ stfd 4,32(10)
+ stfd 4,40(10)
+ stfd 4,48(10)
+ stfd 4,56(10)
+ addi 10,10,64
+ .align 4
+4: bf 29,2f
+
+ stfd 4,0(10)
+ stfd 4,8(10)
+ stfd 4,16(10)
+ stfd 4,24(10)
+ addi 10,10,32
+ .align 4
+2: bf 30,1f
+
+ stfd 4,0(10)
+ stfd 4,8(10)
+ addi 10,10,16
+ .align 4
+1: bf 31,L(tail)
+
+ stfd 4,0(10)
+ addi 10,10,8
+
+ /* Handle the rest of the tail bytes here. */
+L(tail):
+ mtocrf 0x01,5
+
+ .align 4
+4: bf 29,2f
+
+ stw 4,0(10)
+ addi 10,10,4
+ .align 4
+2: bf 30,1f
+
+ sth 4,0(10)
+ addi 10,10,2
+ .align 4
+1: bflr 31
+
+ stb 4,0(10)
+ blr
+
+
+ /* Expanded tree to copy tail bytes without increments. */
+ .align 4
+L(copy_tail):
+ bf 29,L(FXX)
+
+ stw 4,0(10)
+ bf 30,L(TFX)
+
+ sth 4,4(10)
+ bflr 31
+
+ stb 4,6(10)
+ blr
+
+ .align 4
+L(FXX): bf 30,L(FFX)
+
+ sth 4,0(10)
+ bflr 31
+
+ stb 4,2(10)
+ blr
+
+ .align 4
+L(TFX): bflr 31
+
+ stb 4,4(10)
+ blr
+
+ .align 4
+L(FFX): bflr 31
+
+ stb 4,0(10)
+ blr
+
+ /* Handle copies of 9~31 bytes. */
+ .align 4
+L(medium):
+ /* At least 9 bytes to go. */
+ andi. 11,10,3
+ clrlwi 0,0,30
+ beq L(medium_aligned)
+
+ /* Force 4-bytes alignment for DST. */
+ mtocrf 0x01,0
+ subf 5,0,5
+1: /* Copy 1 byte. */
+ bf 31,2f
+
+ stb 4,0(10)
+ addi 10,10,1
+2: /* Copy 2 bytes. */
+ bf 30,L(medium_aligned)
+
+ sth 4,0(10)
+ addi 10,10,2
+
+ .align 4
+L(medium_aligned):
+ /* At least 6 bytes to go, and DST is word-aligned. */
+ cmplwi cr1,5,16
+ mtocrf 0x01,5
+ blt cr1,8f
+
+ /* Copy 16 bytes. */
+ stw 4,0(10)
+ stw 4,4(10)
+ stw 4,8(10)
+ stw 4,12(10)
+ addi 10,10,16
+8: /* Copy 8 bytes. */
+ bf 28,4f
+
+ stw 4,0(10)
+ stw 4,4(10)
+ addi 10,10,8
+4: /* Copy 4 bytes. */
+ bf 29,2f
+
+ stw 4,0(10)
+ addi 10,10,4
+2: /* Copy 2-3 bytes. */
+ bf 30,1f
+
+ sth 4,0(10)
+ addi 10,10,2
+1: /* Copy 1 byte. */
+ bflr 31
+
+ stb 4,0(10)
+ blr
+
+ /* Handles copies of 0~8 bytes. */
+ .align 4
+L(small):
+ mtocrf 0x01,5
+ bne cr6,L(copy_tail)
+
+ stw 4,0(10)
+ stw 4,4(10)
+ blr
+END (__memset_power7)
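The setup for the L(huge) path above (neg 0,10 and clrlwi to get the
distance to the next 128-byte boundary, srwi 8,5,7 for the dcbz count,
clrlwi 11,5,25 for the tail) reduces to the arithmetic in this C stand-in.
The helper name and the example values are illustrative.

#include <stdio.h>
#include <stdint.h>
#include <stddef.h>

/* Split a zeroing request into HEAD bytes to reach 128-byte alignment,
   LINES full cache lines (one dcbz each in the assembly), and TAIL.  */
static void
huge_zero_plan (uintptr_t dst, size_t n,
                size_t *head, size_t *lines, size_t *tail)
{
  *head = (size_t) (-dst & 127);   /* neg 0,10; clrlwi 0,0,25  */
  if (*head > n)
    *head = n;
  n -= *head;
  *lines = n >> 7;                 /* srwi 8,5,7  */
  *tail = n & 127;                 /* clrlwi 11,5,25  */
}

int
main (void)
{
  size_t head, lines, tail;
  huge_zero_plan (0x1008, 1000, &head, &lines, &tail);
  printf ("head=%zu lines=%zu tail=%zu\n", head, lines, tail);  /* 120 6 112  */
  return 0;
}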
diff --git a/sysdeps/powerpc/powerpc32/multiarch/memset-ppc32.S b/sysdeps/powerpc/powerpc32/multiarch/memset-ppc32.S
new file mode 100644
index 0000000..775a85a
--- /dev/null
+++ b/sysdeps/powerpc/powerpc32/multiarch/memset-ppc32.S
@@ -0,0 +1,41 @@
+/* Default memset implementation for PowerPC32.
+ Copyright (C) 1997-2013 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+#if defined SHARED && !defined NOT_IN_libc
+# undef EALIGN
+# define EALIGN(name, alignt, words) \
+ .globl C_SYMBOL_NAME(__memset_ppc32); \
+ .type C_SYMBOL_NAME(__memset_ppc32),@function; \
+ .align ALIGNARG(alignt); \
+ EALIGN_W_##words; \
+ C_LABEL(__memset_ppc32) \
+ cfi_startproc;
+
+# undef END
+# define END(name) \
+ cfi_endproc; \
+ ASM_SIZE_DIRECTIVE(__memset_ppc32)
+
+# undef libc_hidden_builtin_def
+# define libc_hidden_builtin_def(name) \
+ .globl __GI_memset; __GI_memset = __memset_ppc32
+#endif
+
+#include <sysdeps/powerpc/powerpc32/memset.S>
diff --git a/sysdeps/powerpc/powerpc32/multiarch/memset.c b/sysdeps/powerpc/powerpc32/multiarch/memset.c
new file mode 100644
index 0000000..b4c63d5
--- /dev/null
+++ b/sysdeps/powerpc/powerpc32/multiarch/memset.c
@@ -0,0 +1,40 @@
+/* Multiple versions of memset.
+ Copyright (C) 2013 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+/* Define multiple versions only for definition in libc. */
+#if defined SHARED && !defined NOT_IN_libc
+# include <string.h>
+# include <shlib-compat.h>
+# include "init-arch.h"
+
+extern __typeof (memset) __memset_ppc32 attribute_hidden;
+extern __typeof (memset) __memset_power4 attribute_hidden;
+extern __typeof (memset) __memset_power6 attribute_hidden;
+extern __typeof (memset) __memset_power7 attribute_hidden;
+
+/* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle
+ ifunc symbols properly. */
+libc_ifunc (memset,
+ (hwcap & PPC_FEATURE_HAS_VSX)
+ ? __memset_power7 :
+ (hwcap & PPC_FEATURE_ARCH_2_05)
+ ? __memset_power6 :
+ (hwcap & PPC_FEATURE_POWER4)
+ ? __memset_power4
+ : __memset_ppc32);
+#endif
diff --git a/sysdeps/powerpc/powerpc32/multiarch/rtld-memset.S b/sysdeps/powerpc/powerpc32/multiarch/rtld-memset.S
new file mode 100644
index 0000000..efd2780
--- /dev/null
+++ b/sysdeps/powerpc/powerpc32/multiarch/rtld-memset.S
@@ -0,0 +1,18 @@
+/* Copyright (C) 2013 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdeps/powerpc/powerpc32/memset.S>
diff --git a/sysdeps/powerpc/powerpc32/power4/memset.S b/sysdeps/powerpc/powerpc32/power4/memset.S
deleted file mode 100644
index c2d288b..0000000
--- a/sysdeps/powerpc/powerpc32/power4/memset.S
+++ /dev/null
@@ -1,226 +0,0 @@
-/* Optimized memset implementation for PowerPC64.
- Copyright (C) 1997-2013 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-#include <sysdep.h>
-
-/* __ptr_t [r3] memset (__ptr_t s [r3], int c [r4], size_t n [r5]));
- Returns 's'.
-
- The memset is done in three sizes: byte (8 bits), word (32 bits),
- cache line (1024 bits). There is a special case for setting cache lines
- to 0, to take advantage of the dcbz instruction. */
-
- .machine power4
-EALIGN (memset, 5, 0)
- CALL_MCOUNT
-
-#define rTMP r0
-#define rRTN r3 /* Initial value of 1st argument. */
-#define rMEMP0 r3 /* Original value of 1st arg. */
-#define rCHR r4 /* Char to set in each byte. */
-#define rLEN r5 /* Length of region to set. */
-#define rMEMP r6 /* Address at which we are storing. */
-#define rALIGN r7 /* Number of bytes we are setting now (when aligning). */
-#define rMEMP2 r8
-
-#define rNEG64 r8 /* Constant -64 for clearing with dcbz. */
-#define rCLS r8 /* Cache line size (known to be 128). */
-#define rCLM r9 /* Cache line size mask to check for cache alignment. */
-L(_memset):
-/* Take care of case for size <= 4. */
- cmplwi cr1, rLEN, 4
- andi. rALIGN, rMEMP0, 3
- mr rMEMP, rMEMP0
- ble- cr1, L(small)
-
-/* Align to word boundary. */
- cmplwi cr5, rLEN, 31
- rlwimi rCHR, rCHR, 8, 16, 23 /* Replicate byte to halfword. */
- beq+ L(aligned)
- mtcrf 0x01, rMEMP0
- subfic rALIGN, rALIGN, 4
- add rMEMP, rMEMP, rALIGN
- sub rLEN, rLEN, rALIGN
- bf+ 31, L(g0)
- stb rCHR, 0(rMEMP0)
- bt 30, L(aligned)
-L(g0):
- sth rCHR, -2(rMEMP)
-
-/* Handle the case of size < 31. */
-L(aligned):
- mtcrf 0x01, rLEN
- rlwimi rCHR, rCHR, 16, 0, 15 /* Replicate halfword to word. */
- ble cr5, L(medium)
-/* Align to 32-byte boundary. */
- andi. rALIGN, rMEMP, 0x1C
- subfic rALIGN, rALIGN, 0x20
- beq L(caligned)
- mtcrf 0x01, rALIGN
- add rMEMP, rMEMP, rALIGN
- sub rLEN, rLEN, rALIGN
- cmplwi cr1, rALIGN, 0x10
- mr rMEMP2, rMEMP
- bf 28, L(a1)
- stw rCHR, -4(rMEMP2)
- stwu rCHR, -8(rMEMP2)
-L(a1): blt cr1, L(a2)
- stw rCHR, -4(rMEMP2)
- stw rCHR, -8(rMEMP2)
- stw rCHR, -12(rMEMP2)
- stwu rCHR, -16(rMEMP2)
-L(a2): bf 29, L(caligned)
- stw rCHR, -4(rMEMP2)
-
-/* Now aligned to a 32 byte boundary. */
-L(caligned):
- cmplwi cr1, rCHR, 0
- clrrwi. rALIGN, rLEN, 5
- mtcrf 0x01, rLEN
- beq cr1, L(zloopstart) /* Special case for clearing memory using dcbz. */
-L(nondcbz):
- srwi rTMP, rALIGN, 5
- mtctr rTMP
- beq L(medium) /* We may not actually get to do a full line. */
- clrlwi. rLEN, rLEN, 27
- add rMEMP, rMEMP, rALIGN
- li rNEG64, -0x40
- bdz L(cloopdone)
-
- .align 4
-L(c3): dcbtst rNEG64, rMEMP
- stw rCHR, -4(rMEMP)
- stw rCHR, -8(rMEMP)
- stw rCHR, -12(rMEMP)
- stw rCHR, -16(rMEMP)
- stw rCHR, -20(rMEMP)
- stw rCHR, -24(rMEMP)
- stw rCHR, -28(rMEMP)
- stwu rCHR, -32(rMEMP)
- bdnz L(c3)
-L(cloopdone):
- stw rCHR, -4(rMEMP)
- stw rCHR, -8(rMEMP)
- stw rCHR, -12(rMEMP)
- stw rCHR, -16(rMEMP)
- cmplwi cr1, rLEN, 16
- stw rCHR, -20(rMEMP)
- stw rCHR, -24(rMEMP)
- stw rCHR, -28(rMEMP)
- stwu rCHR, -32(rMEMP)
- beqlr
- add rMEMP, rMEMP, rALIGN
- b L(medium_tail2)
-
- .align 5
-/* Clear lines of memory in 128-byte chunks. */
-L(zloopstart):
-/* If the remaining length is less the 32 bytes, don't bother getting
- the cache line size. */
- beq L(medium)
- li rCLS,128 /* cache line size is 128 */
- dcbt 0,rMEMP
-L(getCacheAligned):
- cmplwi cr1,rLEN,32
- andi. rTMP,rMEMP,127
- blt cr1,L(handletail32)
- beq L(cacheAligned)
- addi rMEMP,rMEMP,32
- addi rLEN,rLEN,-32
- stw rCHR,-32(rMEMP)
- stw rCHR,-28(rMEMP)
- stw rCHR,-24(rMEMP)
- stw rCHR,-20(rMEMP)
- stw rCHR,-16(rMEMP)
- stw rCHR,-12(rMEMP)
- stw rCHR,-8(rMEMP)
- stw rCHR,-4(rMEMP)
- b L(getCacheAligned)
-
-/* Now we are aligned to the cache line and can use dcbz. */
- .align 4
-L(cacheAligned):
- cmplw cr1,rLEN,rCLS
- blt cr1,L(handletail32)
- dcbz 0,rMEMP
- subf rLEN,rCLS,rLEN
- add rMEMP,rMEMP,rCLS
- b L(cacheAligned)
-
-/* We are here because the cache line size was set and the remainder
- (rLEN) is less than the actual cache line size.
- So set up the preconditions for L(nondcbz) and go there. */
-L(handletail32):
- clrrwi. rALIGN, rLEN, 5
- b L(nondcbz)
-
- .align 5
-L(small):
-/* Memset of 4 bytes or less. */
- cmplwi cr5, rLEN, 1
- cmplwi cr1, rLEN, 3
- bltlr cr5
- stb rCHR, 0(rMEMP)
- beqlr cr5
- stb rCHR, 1(rMEMP)
- bltlr cr1
- stb rCHR, 2(rMEMP)
- beqlr cr1
- stb rCHR, 3(rMEMP)
- blr
-
-/* Memset of 0-31 bytes. */
- .align 5
-L(medium):
- cmplwi cr1, rLEN, 16
-L(medium_tail2):
- add rMEMP, rMEMP, rLEN
-L(medium_tail):
- bt- 31, L(medium_31t)
- bt- 30, L(medium_30t)
-L(medium_30f):
- bt- 29, L(medium_29t)
-L(medium_29f):
- bge- cr1, L(medium_27t)
- bflr- 28
- stw rCHR, -4(rMEMP)
- stw rCHR, -8(rMEMP)
- blr
-
-L(medium_31t):
- stbu rCHR, -1(rMEMP)
- bf- 30, L(medium_30f)
-L(medium_30t):
- sthu rCHR, -2(rMEMP)
- bf- 29, L(medium_29f)
-L(medium_29t):
- stwu rCHR, -4(rMEMP)
- blt- cr1, L(medium_27f)
-L(medium_27t):
- stw rCHR, -4(rMEMP)
- stw rCHR, -8(rMEMP)
- stw rCHR, -12(rMEMP)
- stwu rCHR, -16(rMEMP)
-L(medium_27f):
- bflr- 28
-L(medium_28t):
- stw rCHR, -4(rMEMP)
- stw rCHR, -8(rMEMP)
- blr
-END (memset)
-libc_hidden_builtin_def (memset)
diff --git a/sysdeps/powerpc/powerpc32/power6/memset.S b/sysdeps/powerpc/powerpc32/power6/memset.S
deleted file mode 100644
index ce06630..0000000
--- a/sysdeps/powerpc/powerpc32/power6/memset.S
+++ /dev/null
@@ -1,539 +0,0 @@
-/* Optimized 32-bit memset implementation for POWER6.
- Copyright (C) 1997-2013 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-#include <sysdep.h>
-
-/* __ptr_t [r3] memset (__ptr_t s [r3], int c [r4], size_t n [r5]));
- Returns 's'.
-
- The memset is done in three sizes: byte (8 bits), word (32 bits),
- cache line (1024 bits). There is a special case for setting cache lines
- to 0, to take advantage of the dcbz instruction. */
-
- .machine power6
-EALIGN (memset, 7, 0)
- CALL_MCOUNT
-
-#define rTMP r0
-#define rRTN r3 /* Initial value of 1st argument. */
-#define rMEMP0 r3 /* Original value of 1st arg. */
-#define rCHR r4 /* Char to set in each byte. */
-#define rLEN r5 /* Length of region to set. */
-#define rMEMP r6 /* Address at which we are storing. */
-#define rALIGN r7 /* Number of bytes we are setting now (when aligning). */
-#define rMEMP2 r8
-
-#define rNEG64 r8 /* Constant -64 for clearing with dcbz. */
-#define rMEMP3 r9 /* Alt mem pointer. */
-L(_memset):
-/* Take care of case for size <= 4. */
- cmplwi cr1, rLEN, 4
- andi. rALIGN, rMEMP0, 3
- mr rMEMP, rMEMP0
- ble- cr1, L(small)
-/* Align to word boundary. */
- cmplwi cr5, rLEN, 31
- rlwimi rCHR, rCHR, 8, 16, 23 /* Replicate byte to halfword. */
- beq+ L(aligned)
- mtcrf 0x01, rMEMP0
- subfic rALIGN, rALIGN, 4
- add rMEMP, rMEMP, rALIGN
- sub rLEN, rLEN, rALIGN
- bf+ 31, L(g0)
- stb rCHR, 0(rMEMP0)
- bt 30, L(aligned)
-L(g0):
- sth rCHR, -2(rMEMP)
-
- .align 4
-/* Handle the case of size < 31. */
-L(aligned):
- mtcrf 0x01, rLEN
- rlwimi rCHR, rCHR, 16, 0, 15 /* Replicate halfword to word. */
- ble cr5, L(medium)
-/* Align to 32-byte boundary. */
- andi. rALIGN, rMEMP, 0x1C
- subfic rALIGN, rALIGN, 0x20
- beq L(caligned)
- mtcrf 0x01, rALIGN
- add rMEMP, rMEMP, rALIGN
- sub rLEN, rLEN, rALIGN
- cmplwi cr1, rALIGN, 0x10
- mr rMEMP2, rMEMP
- bf 28, L(a1)
- stw rCHR, -4(rMEMP2)
- stwu rCHR, -8(rMEMP2)
- nop
-L(a1): blt cr1, L(a2)
- stw rCHR, -4(rMEMP2)
- stw rCHR, -8(rMEMP2)
- stw rCHR, -12(rMEMP2)
- stwu rCHR, -16(rMEMP2)
-L(a2): bf 29, L(caligned)
- stw rCHR, -4(rMEMP2)
-
- .align 3
-/* Now aligned to a 32 byte boundary. */
-L(caligned):
- cmplwi cr1, rCHR, 0
- clrrwi. rALIGN, rLEN, 5
- mtcrf 0x01, rLEN
- beq cr1, L(zloopstart) /* Special case for clearing memory using dcbz. */
-L(nondcbz):
- beq L(medium) /* We may not actually get to do a full line. */
- nop
-/* Storing a non-zero "c" value. We are aligned at a sector (32-byte)
- boundary may not be at cache line (128-byte) boundary. */
-L(nzloopstart):
-/* memset in 32-byte chunks until we get to a cache line boundary.
- If rLEN is less then the distance to the next cache-line boundary use
- cacheAligned1 code to finish the tail. */
- cmplwi cr1,rLEN,128
-
- andi. rTMP,rMEMP,127
- blt cr1,L(cacheAligned1)
- addi rMEMP3,rMEMP,32
- beq L(nzCacheAligned)
- addi rLEN,rLEN,-32
- stw rCHR,0(rMEMP)
- stw rCHR,4(rMEMP)
- stw rCHR,8(rMEMP)
- stw rCHR,12(rMEMP)
- stw rCHR,16(rMEMP)
- stw rCHR,20(rMEMP)
- addi rMEMP,rMEMP,32
- andi. rTMP,rMEMP3,127
- stw rCHR,-8(rMEMP3)
- stw rCHR,-4(rMEMP3)
-
- beq L(nzCacheAligned)
- addi rLEN,rLEN,-32
- stw rCHR,0(rMEMP3)
- stw rCHR,4(rMEMP3)
- addi rMEMP,rMEMP,32
- stw rCHR,8(rMEMP3)
- stw rCHR,12(rMEMP3)
- andi. rTMP,rMEMP,127
- stw rCHR,16(rMEMP3)
- stw rCHR,20(rMEMP3)
- stw rCHR,24(rMEMP3)
- stw rCHR,28(rMEMP3)
-
- beq L(nzCacheAligned)
- addi rLEN,rLEN,-32
-/* At this point we can overrun the store queue (pipe reject) so it is
- time to slow things down. The store queue can merge two adjacent
- stores into a single L1/L2 op, but the L2 is clocked at 1/2 the CPU.
- So we add "group ending nops" to guarantee that we dispatch only two
- stores every other cycle. */
- ori r1,r1,0
- ori r1,r1,0
- stw rCHR,32(rMEMP3)
- stw rCHR,36(rMEMP3)
- addi rMEMP,rMEMP,32
- cmplwi cr1,rLEN,128
- ori r1,r1,0
- ori r1,r1,0
- stw rCHR,40(rMEMP3)
- stw rCHR,44(rMEMP3)
- ori r1,r1,0
- ori r1,r1,0
- stw rCHR,48(rMEMP3)
- stw rCHR,52(rMEMP3)
- ori r1,r1,0
- ori r1,r1,0
- stw rCHR,56(rMEMP3)
- stw rCHR,60(rMEMP3)
- blt cr1,L(cacheAligned1)
- b L(nzCacheAligned)
-
-/* Now we are aligned to the cache line and can use dcbtst. */
- .align 5
-L(nzCacheAligned):
- cmplwi cr1,rLEN,128
- cmplwi cr6,rLEN,256
- blt cr1,L(cacheAligned1)
- blt cr6,L(nzCacheAligned128)
- .align 4
-L(nzCacheAligned128):
- nop
- addi rMEMP3,rMEMP,64
- stw rCHR,0(rMEMP)
- stw rCHR,4(rMEMP)
- stw rCHR,8(rMEMP)
- stw rCHR,12(rMEMP)
- stw rCHR,16(rMEMP)
- stw rCHR,20(rMEMP)
- stw rCHR,24(rMEMP)
- stw rCHR,28(rMEMP)
- stw rCHR,32(rMEMP)
- stw rCHR,36(rMEMP)
- stw rCHR,40(rMEMP)
- stw rCHR,44(rMEMP)
- stw rCHR,48(rMEMP)
- stw rCHR,52(rMEMP)
- stw rCHR,56(rMEMP)
- stw rCHR,60(rMEMP)
- addi rMEMP,rMEMP3,64
- addi rLEN,rLEN,-128
-/* At this point we can overrun the store queue (pipe reject) so it is
- time to slow things down. The store queue can merge two adjacent
- stores into a single L1/L2 op, but the L2 is clocked at 1/2 the CPU.
- So we add "group ending nops" to guarantee that we dispatch only one
- store per cycle. */
- stw rCHR,0(rMEMP3)
- ori r1,r1,0
- stw rCHR,4(rMEMP3)
- ori r1,r1,0
- stw rCHR,8(rMEMP3)
- ori r1,r1,0
- stw rCHR,12(rMEMP3)
- ori r1,r1,0
- stw rCHR,16(rMEMP3)
- ori r1,r1,0
- stw rCHR,20(rMEMP3)
- ori r1,r1,0
- stw rCHR,24(rMEMP3)
- ori r1,r1,0
- stw rCHR,28(rMEMP3)
- ori r1,r1,0
- stw rCHR,32(rMEMP3)
- ori r1,r1,0
- stw rCHR,36(rMEMP3)
- ori r1,r1,0
- stw rCHR,40(rMEMP3)
- ori r1,r1,0
- stw rCHR,44(rMEMP3)
- ori r1,r1,0
- stw rCHR,48(rMEMP3)
- ori r1,r1,0
- stw rCHR,52(rMEMP3)
- ori r1,r1,0
- stw rCHR,56(rMEMP3)
- ori r1,r1,0
- stw rCHR,60(rMEMP3)
- blt cr6,L(cacheAligned1)
-#ifndef NOT_IN_libc
- lfd 0,-128(rMEMP)
-#endif
- b L(nzCacheAligned256)
- .align 5
-L(nzCacheAligned256):
- cmplwi cr1,rLEN,256
- addi rMEMP3,rMEMP,64
-#ifdef NOT_IN_libc
-/* When we are not in libc we should use only GPRs to avoid the FPU lock
- interrupt. */
- stw rCHR,0(rMEMP)
- stw rCHR,4(rMEMP)
- stw rCHR,8(rMEMP)
- stw rCHR,12(rMEMP)
- stw rCHR,16(rMEMP)
- stw rCHR,20(rMEMP)
- stw rCHR,24(rMEMP)
- stw rCHR,28(rMEMP)
- stw rCHR,32(rMEMP)
- stw rCHR,36(rMEMP)
- stw rCHR,40(rMEMP)
- stw rCHR,44(rMEMP)
- stw rCHR,48(rMEMP)
- stw rCHR,52(rMEMP)
- stw rCHR,56(rMEMP)
- stw rCHR,60(rMEMP)
- addi rMEMP,rMEMP3,64
- addi rLEN,rLEN,-128
- stw rCHR,0(rMEMP3)
- stw rCHR,4(rMEMP3)
- stw rCHR,8(rMEMP3)
- stw rCHR,12(rMEMP3)
- stw rCHR,16(rMEMP3)
- stw rCHR,20(rMEMP3)
- stw rCHR,24(rMEMP3)
- stw rCHR,28(rMEMP3)
- stw rCHR,32(rMEMP3)
- stw rCHR,36(rMEMP3)
- stw rCHR,40(rMEMP3)
- stw rCHR,44(rMEMP3)
- stw rCHR,48(rMEMP3)
- stw rCHR,52(rMEMP3)
- stw rCHR,56(rMEMP3)
- stw rCHR,60(rMEMP3)
-#else
-/* We are in libc and this is a long memset so we can use FPRs and can afford
- occasional FPU locked interrupts. */
- stfd 0,0(rMEMP)
- stfd 0,8(rMEMP)
- stfd 0,16(rMEMP)
- stfd 0,24(rMEMP)
- stfd 0,32(rMEMP)
- stfd 0,40(rMEMP)
- stfd 0,48(rMEMP)
- stfd 0,56(rMEMP)
- addi rMEMP,rMEMP3,64
- addi rLEN,rLEN,-128
- stfd 0,0(rMEMP3)
- stfd 0,8(rMEMP3)
- stfd 0,16(rMEMP3)
- stfd 0,24(rMEMP3)
- stfd 0,32(rMEMP3)
- stfd 0,40(rMEMP3)
- stfd 0,48(rMEMP3)
- stfd 0,56(rMEMP3)
-#endif
- bge cr1,L(nzCacheAligned256)
- dcbtst 0,rMEMP
- b L(cacheAligned1)
-
- .align 4
-/* Storing a zero "c" value. We are aligned at a sector (32-byte)
- boundary but may not be at cache line (128-byte) boundary. If the
- remaining length spans a full cache line we can use the Data cache
- block zero instruction. */
-L(zloopstart):
-/* memset in 32-byte chunks until we get to a cache line boundary.
- If rLEN is less then the distance to the next cache-line boundary use
- cacheAligned1 code to finish the tail. */
- cmplwi cr1,rLEN,128
- beq L(medium)
-L(getCacheAligned):
- andi. rTMP,rMEMP,127
- blt cr1,L(cacheAligned1)
- addi rMEMP3,rMEMP,32
- beq L(cacheAligned)
- addi rLEN,rLEN,-32
- stw rCHR,0(rMEMP)
- stw rCHR,4(rMEMP)
- stw rCHR,8(rMEMP)
- stw rCHR,12(rMEMP)
- stw rCHR,16(rMEMP)
- stw rCHR,20(rMEMP)
- addi rMEMP,rMEMP,32
- andi. rTMP,rMEMP3,127
- stw rCHR,-8(rMEMP3)
- stw rCHR,-4(rMEMP3)
-L(getCacheAligned2):
- beq L(cacheAligned)
- addi rLEN,rLEN,-32
- addi rMEMP,rMEMP,32
- stw rCHR,0(rMEMP3)
- stw rCHR,4(rMEMP3)
- stw rCHR,8(rMEMP3)
- stw rCHR,12(rMEMP3)
- andi. rTMP,rMEMP,127
- nop
- stw rCHR,16(rMEMP3)
- stw rCHR,20(rMEMP3)
- stw rCHR,24(rMEMP3)
- stw rCHR,28(rMEMP3)
-L(getCacheAligned3):
- beq L(cacheAligned)
-/* At this point we can overrun the store queue (pipe reject) so it is
- time to slow things down. The store queue can merge two adjacent
- stores into a single L1/L2 op, but the L2 is clocked at 1/2 the CPU.
- So we add "group ending nops" to guarantee that we dispatch only two
- stores every other cycle. */
- addi rLEN,rLEN,-32
- ori r1,r1,0
- ori r1,r1,0
- stw rCHR,32(rMEMP3)
- stw rCHR,36(rMEMP3)
- addi rMEMP,rMEMP,32
- cmplwi cr1,rLEN,128
- ori r1,r1,0
- stw rCHR,40(rMEMP3)
- stw rCHR,44(rMEMP3)
- cmplwi cr6,rLEN,256
- li rMEMP2,128
- ori r1,r1,0
- stw rCHR,48(rMEMP3)
- stw rCHR,52(rMEMP3)
- ori r1,r1,0
- ori r1,r1,0
- stw rCHR,56(rMEMP3)
- stw rCHR,60(rMEMP3)
- blt cr1,L(cacheAligned1)
- blt cr6,L(cacheAligned128)
- b L(cacheAlignedx)
-
-/* Now we are aligned to the cache line and can use dcbz. */
- .align 4
-L(cacheAligned):
- cmplwi cr1,rLEN,128
- cmplwi cr6,rLEN,256
- blt cr1,L(cacheAligned1)
- li rMEMP2,128
-L(cacheAlignedx):
- cmplwi cr5,rLEN,640
- blt cr6,L(cacheAligned128)
- bgt cr5,L(cacheAligned512)
- cmplwi cr6,rLEN,512
- dcbz 0,rMEMP
- cmplwi cr1,rLEN,384
- dcbz rMEMP2,rMEMP
- addi rMEMP,rMEMP,256
- addi rLEN,rLEN,-256
- blt cr1,L(cacheAligned1)
- blt cr6,L(cacheAligned128)
- b L(cacheAligned256)
- .align 5
-/* A simple loop for the longer (>640 bytes) lengths. This form limits
- the branch miss-predicted to exactly 1 at loop exit.*/
-L(cacheAligned512):
- cmpli cr1,rLEN,128
- blt cr1,L(cacheAligned1)
- dcbz 0,rMEMP
- addi rLEN,rLEN,-128
- addi rMEMP,rMEMP,128
- b L(cacheAligned512)
- .align 5
-L(cacheAligned256):
- cmplwi cr6,rLEN,512
- dcbz 0,rMEMP
- cmplwi cr1,rLEN,384
- dcbz rMEMP2,rMEMP
- addi rMEMP,rMEMP,256
- addi rLEN,rLEN,-256
- bge cr6,L(cacheAligned256)
- blt cr1,L(cacheAligned1)
- .align 4
-L(cacheAligned128):
- dcbz 0,rMEMP
- addi rMEMP,rMEMP,128
- addi rLEN,rLEN,-128
- .align 4
-L(cacheAligned1):
- cmplwi cr1,rLEN,32
- blt cr1,L(handletail32)
- addi rMEMP3,rMEMP,32
- addi rLEN,rLEN,-32
- stw rCHR,0(rMEMP)
- stw rCHR,4(rMEMP)
- stw rCHR,8(rMEMP)
- stw rCHR,12(rMEMP)
- stw rCHR,16(rMEMP)
- stw rCHR,20(rMEMP)
- addi rMEMP,rMEMP,32
- cmplwi cr1,rLEN,32
- stw rCHR,-8(rMEMP3)
- stw rCHR,-4(rMEMP3)
-L(cacheAligned2):
- blt cr1,L(handletail32)
- addi rLEN,rLEN,-32
- stw rCHR,0(rMEMP3)
- stw rCHR,4(rMEMP3)
- stw rCHR,8(rMEMP3)
- stw rCHR,12(rMEMP3)
- addi rMEMP,rMEMP,32
- cmplwi cr1,rLEN,32
- stw rCHR,16(rMEMP3)
- stw rCHR,20(rMEMP3)
- stw rCHR,24(rMEMP3)
- stw rCHR,28(rMEMP3)
- nop
-L(cacheAligned3):
- blt cr1,L(handletail32)
-/* At this point we can overrun the store queue (pipe reject) so it is
- time to slow things down. The store queue can merge two adjacent
- stores into a single L1/L2 op, but the L2 is clocked at 1/2 the CPU.
- So we add "group ending nops" to guarantee that we dispatch only two
- stores every other cycle. */
- ori r1,r1,0
- ori r1,r1,0
- addi rMEMP,rMEMP,32
- addi rLEN,rLEN,-32
- ori r1,r1,0
- ori r1,r1,0
- stw rCHR,32(rMEMP3)
- stw rCHR,36(rMEMP3)
- ori r1,r1,0
- ori r1,r1,0
- stw rCHR,40(rMEMP3)
- stw rCHR,44(rMEMP3)
- ori r1,r1,0
- ori r1,r1,0
- stw rCHR,48(rMEMP3)
- stw rCHR,52(rMEMP3)
- ori r1,r1,0
- ori r1,r1,0
- stw rCHR,56(rMEMP3)
- stw rCHR,60(rMEMP3)
-
-/* We are here because the length or remainder (rLEN) is less than the
- cache line/sector size and does not justify aggressive loop unrolling.
- So set up the preconditions for L(medium) and go there. */
- .align 3
-L(handletail32):
- cmplwi cr1,rLEN,0
- beqlr cr1
- b L(medium)
-
- .align 4
-L(small):
-/* Memset of 4 bytes or less. */
- cmplwi cr5, rLEN, 1
- cmplwi cr1, rLEN, 3
- bltlr cr5
- stb rCHR, 0(rMEMP)
- beqlr cr5
- stb rCHR, 1(rMEMP)
- bltlr cr1
- stb rCHR, 2(rMEMP)
- beqlr cr1
- stb rCHR, 3(rMEMP)
- blr
-
-/* Memset of 0-31 bytes. */
- .align 5
-L(medium):
- cmplwi cr1, rLEN, 16
-L(medium_tail2):
- add rMEMP, rMEMP, rLEN
-L(medium_tail):
- bt- 31, L(medium_31t)
- bt- 30, L(medium_30t)
-L(medium_30f):
- bt 29, L(medium_29t)
-L(medium_29f):
- bge cr1, L(medium_27t)
- bflr 28
- stw rCHR, -4(rMEMP)
- stw rCHR, -8(rMEMP)
- blr
-
-L(medium_31t):
- stbu rCHR, -1(rMEMP)
- bf- 30, L(medium_30f)
-L(medium_30t):
- sthu rCHR, -2(rMEMP)
- bf- 29, L(medium_29f)
-L(medium_29t):
- stwu rCHR, -4(rMEMP)
- blt cr1, L(medium_27f)
-L(medium_27t):
- stw rCHR, -4(rMEMP)
- stw rCHR, -8(rMEMP)
- stw rCHR, -12(rMEMP)
- stwu rCHR, -16(rMEMP)
-L(medium_27f):
- bflr 28
-L(medium_28t):
- stw rCHR, -4(rMEMP)
- stw rCHR, -8(rMEMP)
- blr
-END (memset)
-libc_hidden_builtin_def (memset)
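
(A side note for reviewers, illustrative only and not part of this patch:
the L(medium) tail code above moves the low bits of the remaining length
into the CR and lets each bit select one backward store, so the tail needs
no loop or extra compares. A rough C equivalent of that technique, with a
hypothetical helper name, might look like this:)

#include <stddef.h>
#include <string.h>

/* Sketch of bit-tested tail handling for n < 32: the pointer is first
   advanced past the end of the region, then each low bit of n selects
   one backward store of the corresponding size.  */
static void *
tail_memset_sketch (void *s, int c, size_t n)
{
  unsigned char *p = (unsigned char *) s + n;	/* One past the end.  */
  unsigned int w = (unsigned char) c;
  w |= w << 8;		/* Replicate the byte into a halfword...  */
  w |= w << 16;		/* ...and into a full 32-bit word.  */

  if (n & 1)  *--p = (unsigned char) c;		/* 1-byte store.  */
  if (n & 2)  { p -= 2; memcpy (p, &w, 2); }	/* 2-byte store.  */
  if (n & 4)  { p -= 4; memcpy (p, &w, 4); }	/* 4-byte store.  */
  if (n & 8)  { p -= 8; memcpy (p, &w, 4);	/* 8 bytes as two words.  */
		memcpy (p + 4, &w, 4); }
  if (n & 16) { p -= 16;			/* 16 bytes as four words.  */
		for (int i = 0; i < 16; i += 4)
		  memcpy (p + i, &w, 4); }
  return s;
}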
diff --git a/sysdeps/powerpc/powerpc32/power7/memset.S b/sysdeps/powerpc/powerpc32/power7/memset.S
deleted file mode 100644
index 360ea71..0000000
--- a/sysdeps/powerpc/powerpc32/power7/memset.S
+++ /dev/null
@@ -1,431 +0,0 @@
-/* Optimized memset implementation for PowerPC32/POWER7.
- Copyright (C) 2010-2013 Free Software Foundation, Inc.
- Contributed by Luis Machado <luisgpm@br.ibm.com>.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-#include <sysdep.h>
-
-/* __ptr_t [r3] memset (__ptr_t s [r3], int c [r4], size_t n [r5]);
- Returns 's'. */
-
- .machine power7
-EALIGN (memset, 5, 0)
- CALL_MCOUNT
-
- .align 4
-L(_memset):
- cmplwi cr7,5,31
- cmplwi cr6,5,8
- mr 10,3 /* Save original argument for later. */
- mr 7,1 /* Save original r1 for later. */
- cfi_offset(31,-8)
-
- /* Replicate byte to word. */
- rlwimi 4,4,8,16,23
- rlwimi 4,4,16,0,15
-
- ble cr6,L(small) /* If length <= 8, use short copy code. */
-
- neg 0,3
- ble cr7,L(medium) /* If length < 32, use medium copy code. */
-
- /* Save our word twice to create a doubleword that we will later
- copy to a FPR. */
- stwu 1,-32(1)
- andi. 11,10,7 /* Check alignment of DST. */
- mr 12,5
- stw 4,24(1)
- stw 4,28(1)
- beq L(big_aligned)
-
- clrlwi 0,0,29
- mtocrf 0x01,0
- subf 5,0,5
-
- /* Get DST aligned to 8 bytes. */
-1: bf 31,2f
-
- stb 4,0(10)
- addi 10,10,1
-2: bf 30,4f
-
- sth 4,0(10)
- addi 10,10,2
-4: bf 29,L(big_aligned)
-
- stw 4,0(10)
- addi 10,10,4
-
- .align 4
-L(big_aligned):
- cmplwi cr5,5,255
- li 0,32
- cmplwi cr1,5,160
- dcbtst 0,10
- cmplwi cr6,4,0
- srwi 9,5,3 /* Number of full doublewords remaining. */
- crand 27,26,21
- mtocrf 0x01,9
- bt 27,L(huge)
-
- /* From this point on, we'll copy 32+ bytes and the value
- isn't 0 (so we can't use dcbz). */
-
- srwi 8,5,5
- clrlwi 11,5,29
- cmplwi cr6,11,0
- cmplwi cr1,9,4
- mtctr 8
-
- /* Copy 1~3 doublewords so the main loop starts
- at a multiple of 32 bytes. */
-
- bf 30,1f
-
- stw 4,0(10)
- stw 4,4(10)
- stw 4,8(10)
- stw 4,12(10)
- addi 10,10,16
- bf 31,L(big_loop)
-
- stw 4,0(10)
- stw 4,4(10)
- addi 10,10,8
- mr 12,10
- blt cr1,L(tail_bytes)
-
- b L(big_loop)
-
- .align 4
-1: /* Copy 1 doubleword. */
- bf 31,L(big_loop)
-
- stw 4,0(10)
- stw 4,4(10)
- addi 10,10,8
-
- /* First use a 32-byte loop with stw's to try to avoid the LHS
- (load-hit-store) hazard from the lfd we will do next. Also,
- ping-pong through r10 and r12 to avoid AGEN delays. */
- .align 4
-L(big_loop):
- addi 12,10,32
- stw 4,0(10)
- stw 4,4(10)
- stw 4,8(10)
- stw 4,12(10)
- stw 4,16(10)
- stw 4,20(10)
- stw 4,24(10)
- stw 4,28(10)
- bdz L(tail_bytes)
-
- addi 10,10,64
- stw 4,0(12)
- stw 4,4(12)
- stw 4,8(12)
- stw 4,12(12)
- stw 4,16(12)
- stw 4,20(12)
- stw 4,24(12)
- stw 4,28(12)
- bdnz L(big_loop_fast_setup)
-
- mr 12,10
- b L(tail_bytes)
-
- /* Now that we're probably past the LHS window, use the VSX unit to
- speed up the loop. */
-L(big_loop_fast_setup):
- li 11,24
- li 6,16
- lxvdsx 4,1,11
-
- .align 4
-L(big_loop_fast):
- addi 12,10,32
- stxvd2x 4,0,10
- stxvd2x 4,10,6
- bdz L(tail_bytes)
-
- addi 10,10,64
- stxvd2x 4,0,12
- stxvd2x 4,12,6
- bdnz L(big_loop_fast)
-
- mr 12,10
-
- .align 4
-L(tail_bytes):
-
- /* Check for tail bytes. */
- mr 1,7 /* Restore r1. */
- beqlr cr6
-
- clrlwi 0,5,29
- mtocrf 0x01,0
-
- /* At this point we have a tail of 0-7 bytes and we know that the
- destination is doubleword-aligned. */
-4: /* Copy 4 bytes. */
- bf 29,2f
-
- stw 4,0(12)
- addi 12,12,4
-2: /* Copy 2 bytes. */
- bf 30,1f
-
- sth 4,0(12)
- addi 12,12,2
-1: /* Copy 1 byte. */
- bflr 31
-
- stb 4,0(12)
- blr
-
-
- /* Special case when value is 0 and we have a long length to deal
- with. Use dcbz to zero out 128 bytes at a time. Before using
- dcbz though, we need to get the destination 128-byte aligned. */
- .align 4
-L(huge):
- lfd 4,24(1)
- andi. 11,10,127
- neg 0,10
- beq L(huge_aligned)
-
- clrlwi 0,0,25
- subf 5,0,5
- srwi 0,0,3
- mtocrf 0x01,0
-
- /* Get DST aligned to 128 bytes. */
-8: bf 28,4f
-
- stfd 4,0(10)
- stfd 4,8(10)
- stfd 4,16(10)
- stfd 4,24(10)
- stfd 4,32(10)
- stfd 4,40(10)
- stfd 4,48(10)
- stfd 4,56(10)
- addi 10,10,64
- .align 4
-4: bf 29,2f
-
- stfd 4,0(10)
- stfd 4,8(10)
- stfd 4,16(10)
- stfd 4,24(10)
- addi 10,10,32
- .align 4
-2: bf 30,1f
-
- stfd 4,0(10)
- stfd 4,8(10)
- addi 10,10,16
- .align 4
-1: bf 31,L(huge_aligned)
-
- stfd 4,0(10)
- addi 10,10,8
-
-L(huge_aligned):
- srwi 8,5,7
- clrlwi 11,5,25
- cmplwi cr6,11,0
- mtctr 8
-
- /* Copies 128 bytes at a time. */
- .align 4
-L(huge_loop):
- dcbz 0,10
- addi 10,10,128
- bdnz L(huge_loop)
-
- /* We have a tail of 0~127 bytes to handle. */
- mr 1,7 /* Restore r1. */
- beqlr cr6
-
- subf 9,3,10
- subf 5,9,12
- srwi 8,5,3
- cmplwi cr6,8,0
- mtocrf 0x01,8
-
- /* We have a tail of 1~127 bytes. Copy up to 15 doublewords for
- speed. We'll handle the resulting tail bytes later. */
- beq cr6,L(tail)
-
-8: bf 28,4f
-
- stfd 4,0(10)
- stfd 4,8(10)
- stfd 4,16(10)
- stfd 4,24(10)
- stfd 4,32(10)
- stfd 4,40(10)
- stfd 4,48(10)
- stfd 4,56(10)
- addi 10,10,64
- .align 4
-4: bf 29,2f
-
- stfd 4,0(10)
- stfd 4,8(10)
- stfd 4,16(10)
- stfd 4,24(10)
- addi 10,10,32
- .align 4
-2: bf 30,1f
-
- stfd 4,0(10)
- stfd 4,8(10)
- addi 10,10,16
- .align 4
-1: bf 31,L(tail)
-
- stfd 4,0(10)
- addi 10,10,8
-
- /* Handle the rest of the tail bytes here. */
-L(tail):
- mtocrf 0x01,5
-
- .align 4
-4: bf 29,2f
-
- stw 4,0(10)
- addi 10,10,4
- .align 4
-2: bf 30,1f
-
- sth 4,0(10)
- addi 10,10,2
- .align 4
-1: bflr 31
-
- stb 4,0(10)
- blr
-
-
- /* Expanded tree to copy tail bytes without increments. */
- .align 4
-L(copy_tail):
- bf 29,L(FXX)
-
- stw 4,0(10)
- bf 30,L(TFX)
-
- sth 4,4(10)
- bflr 31
-
- stb 4,6(10)
- blr
-
- .align 4
-L(FXX): bf 30,L(FFX)
-
- sth 4,0(10)
- bflr 31
-
- stb 4,2(10)
- blr
-
- .align 4
-L(TFX): bflr 31
-
- stb 4,4(10)
- blr
-
- .align 4
-L(FFX): bflr 31
-
- stb 4,0(10)
- blr
-
- /* Handle copies of 9~31 bytes. */
- .align 4
-L(medium):
- /* At least 9 bytes to go. */
- andi. 11,10,3
- clrlwi 0,0,30
- beq L(medium_aligned)
-
- /* Force 4-byte alignment for DST. */
- mtocrf 0x01,0
- subf 5,0,5
-1: /* Copy 1 byte. */
- bf 31,2f
-
- stb 4,0(10)
- addi 10,10,1
-2: /* Copy 2 bytes. */
- bf 30,L(medium_aligned)
-
- sth 4,0(10)
- addi 10,10,2
-
- .align 4
-L(medium_aligned):
- /* At least 6 bytes to go, and DST is word-aligned. */
- cmplwi cr1,5,16
- mtocrf 0x01,5
- blt cr1,8f
-
- /* Copy 16 bytes. */
- stw 4,0(10)
- stw 4,4(10)
- stw 4,8(10)
- stw 4,12(10)
- addi 10,10,16
-8: /* Copy 8 bytes. */
- bf 28,4f
-
- stw 4,0(10)
- stw 4,4(10)
- addi 10,10,8
-4: /* Copy 4 bytes. */
- bf 29,2f
-
- stw 4,0(10)
- addi 10,10,4
-2: /* Copy 2-3 bytes. */
- bf 30,1f
-
- sth 4,0(10)
- addi 10,10,2
-1: /* Copy 1 byte. */
- bflr 31
-
- stb 4,0(10)
- blr
-
- /* Handles copies of 0~8 bytes. */
- .align 4
-L(small):
- mtocrf 0x01,5
- bne cr6,L(copy_tail)
-
- stw 4,0(10)
- stw 4,4(10)
- blr
-
-END (memset)
-libc_hidden_builtin_def (memset)
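
(Likewise illustrative and not part of the patch: the L(huge) path above
is the classic dcbz zeroing pattern -- align the destination to the
128-byte cache line, clear whole lines with dcbz, then store the tail
with ordinary stores. A hedged C sketch of that strategy, valid only for
c == 0 and a 128-byte line size, with a hypothetical helper name:)

#include <stddef.h>
#include <stdint.h>
#include <string.h>

#define LINE_SIZE 128	/* Cache line/sector size the POWER7 code assumes.  */

/* Sketch: zero n >= LINE_SIZE bytes by clearing whole cache lines
   with dcbz, handling the unaligned head and the 0~127 byte tail
   with plain stores.  PowerPC-specific (GCC inline asm).  */
static void
zero_huge_sketch (void *s, size_t n)
{
  unsigned char *p = s;
  size_t head = (uintptr_t) p & (LINE_SIZE - 1);

  if (head)			/* Align to the next line boundary.  */
    {
      head = LINE_SIZE - head;
      memset (p, 0, head);	/* Plain stores up to the boundary.  */
      p += head;
      n -= head;
    }
  for (; n >= LINE_SIZE; n -= LINE_SIZE, p += LINE_SIZE)
    __asm__ volatile ("dcbz 0,%0" : : "r" (p) : "memory");
  if (n)
    memset (p, 0, n);		/* 0~127 tail bytes.  */
}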