This is the mail archive of the libc-alpha@sourceware.org mailing list for the glibc project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

[RFC][PATCH] PowerPC - e6500 optimized memset function


Hello All,

Please find below a patch that adds an optimized implementation of 'memset' for the PowerPC e6500 (32-bit and 64-bit) targets using AltiVec instructions.

2015-08-31  Rohit Arul Raj <rohitarulraj@freescale.com>

	* sysdeps/powerpc/powerpc32/e6500/memset.S: New File: optimized memset
	implementation using altivec instructions.
	* sysdeps/powerpc/powerpc32/e6500/multiarch/rtld-memset.S: New File.
	* sysdeps/powerpc/powerpc32/power4/multiarch/bzero.c: Add
	check for e6500 bzero function.
	* sysdeps/powerpc/powerpc32/power4/multiarch/bzero-e6500.S: New File:
	multiarch e6500 bzero.
	* sysdeps/powerpc/powerpc32/power4/multiarch/ifunc-impl-list.c
	(__libc_ifunc_impl_list): Add check for e6500 memset & bzero function.
	* sysdeps/powerpc/powerpc32/power4/multiarch/Makefile: Add
	memset-e6500 and bzero-e6500 object.
	* sysdeps/powerpc/powerpc32/power4/multiarch/memset.c: Add
	check for e6500 memset function.
	* sysdeps/powerpc/powerpc32/power4/multiarch/memset-e6500.S: New File:
	multiarch e6500 memset.
	* sysdeps/powerpc/powerpc64/e6500/memset.S: New File: optimized memset
	implementation using altivec instructions.
	* sysdeps/powerpc/powerpc64/multiarch/bzero.c: Add
	check for e6500 bzero function.
	* sysdeps/powerpc/powerpc64/multiarch/bzero-e6500.S: New File:
	multiarch e6500 bzero.
	* sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
	(__libc_ifunc_impl_list): Add check for e6500 memset & bzero function.
	* sysdeps/powerpc/powerpc64/multiarch/Makefile: Add
	memset-e6500 and bzero-e6500 object.
	* sysdeps/powerpc/powerpc64/multiarch/memset.c: Add
	check for e6500 memset function.
	* sysdeps/powerpc/powerpc64/multiarch/memset-e6500.S: New File:
	multiarch e6500 memset.

diff -Naur glibc-2.20/sysdeps/powerpc/powerpc32/e6500/memset.S glibc-2.20-e6500-mset/sysdeps/powerpc/powerpc32/e6500/memset.S
--- glibc-2.20/sysdeps/powerpc/powerpc32/e6500/memset.S	1969-12-31 18:00:00.000000000 -0600
+++ glibc-2.20-e6500-mset/sysdeps/powerpc/powerpc32/e6500/memset.S	2015-08-29 16:00:45.023478670 -0500
@@ -0,0 +1,257 @@
+/* Optimized memset implementation for e6500 32-bit PowerPC.
+
+   Copyright (C) 2015 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+/* __ptr_t [r3] memset (__ptr_t s [r3], int c [r4], size_t n [r5]);
+   Returns 's'.  */
+
+#define rTMP	r0	/* scratch: alignment distance, tail offsets, saved LR.  */
+#define rRTN	r3	/* initial value of 1st argument.  */
+#define rMEMP0	r3	/* original value of 1st arg.  */
+#define rCHR	r4	/* char to set in each byte.  */
+#define rLEN	r5	/* length of region to set.  */
+#define rMEMP	r6	/* address at which we are storing.  */
+#define rALIGN	r7	/* no. of bytes we are setting now (when aligning).  */
+#define rPOS16	r7	/* constant +16.  */
+#define rPOS32	r8	/* constant +32.  */
+#define rPOS48	r9	/* constant +48.  */
+#define rGOT	r9	/* Address of the Global Offset Table.  */
+#define rCLS	r9	/* Cache line size obtained from static.  */
+#define rCTR2	r7	/* line count for the dcbzl+dcbf loop; shares r7 with rPOS16.  */
+#define rCTR1	r11	/* block count that is moved into CTR.  */
+#define rTMP1	r12	/* rotated fill word; low 4 bits hold the byte's upper nibble.  */
+#define vCHR	v14	/* char to set in each byte.  */
+#define vTMP1	v15	/* splat of 4, used as the vslb shift count.  */
+#define vTMP2	v16	/* lvsl result with the lower nibble in byte 0.  */
+
+	.section ".text"
+EALIGN (memset, 5, 1)
+	cmplwi	cr1, rLEN, 4		/* cr1: n <= 4 goes to L(small).  */
+	cmplwi	cr5, rLEN, 32		/* cr5: n < 32 goes to L(medium).  */
+	mr	rMEMP, rMEMP0		/* Work on a copy; r3 is never modified.  */
+	ble	cr1, L(small)
+	rlwimi	rCHR, rCHR, 8, 16, 23	/* Replicate the fill byte to a halfword.  */
+	rlwimi	rCHR, rCHR, 16, 0, 15	/* Replicate the halfword to a word.  */
+	blt	cr5, L(medium)
+	neg	rTMP, rMEMP
+	andi.	rTMP, rTMP, 15		/* rTMP = bytes up to the next 16-byte boundary.  */
+	bne	L(nalign16)
+L(align16):
+	cmplwi	7, rLEN, 63
+	rlwinm.	rTMP1, rCHR, 28, 28, 3	/* cr0.eq only when the fill byte is zero.  */
+	li	rPOS16, 16
+	ble	7, L(copy_remaining)	/* Fewer than 64 bytes left: skip the block loops.  */
+	beq	L(check_cache_line_size)	/* Zero fill: try the dcbzl path.  */
+L(vec_nz):
+	srwi	rCTR1, rLEN, 6		/* Number of 64-byte blocks to store.  */
+	rlwinm	rLEN, rLEN, 0, 26, 31	/* Remaining bytes (n mod 64).  */
+	vxor	vCHR, vCHR, vCHR
+	mtctr	rCTR1			/* move count.  */
+	lvsl	vCHR, 0, rTMP1		/* LSU Move upper
+					   nibble to byte 0 of VR.  */
+	vspltisb	vTMP1, 4	/* VPU Splat 0x4 to every byte.  */
+	lvsl	vTMP2, 0, rCHR		/* LSU Move lower
+					   nibble to byte 0 of VR.  */
+	vslb	vCHR, vCHR, vTMP1	/* VIU Move upper nibble to VR[0:3].  */
+	vor	vCHR, vCHR, vTMP2	/* VIU Form FILL byte in VR[0:7].  */
+	vspltb	vCHR, vCHR, 0		/* VPU Splat the fill
+					   byte to all bytes.  */
+	li	rPOS32, 32
+	li	rPOS48, 48
+L(vnz_loop):
+	stvx	vCHR, 0, rMEMP		/* Four 16-byte stores = 64 bytes per iteration.  */
+	stvx	vCHR, rPOS16, rMEMP
+	stvx	vCHR, rPOS32, rMEMP
+	stvx	vCHR, rPOS48, rMEMP
+	addi	rMEMP, rMEMP, 64
+	bdnz	L(vnz_loop)
+L(copy_remaining):
+	srwi.	rCTR1, rLEN, 3		/* Number of 8-byte chunks left (rLEN < 64 here).  */
+	rlwinm	rLEN, rLEN, 0, 29, 31	/* Remaining bytes (n mod 8).  */
+	cmplwi	cr1, rLEN, 1
+	bne	0, L(copy_words)
+L(copy_bytes):
+	bltlr	cr1			/* nb == 0: nothing left, return.  */
+	cmplwi	cr0, rLEN, 4
+	beq	cr1, 2f			/* nb == 1: single byte store.  */
+	bgt	cr0, 1f			/* nb > 4?  (5, 6, 7 bytes).  */
+	addi	rTMP, rLEN, -2		/* 2, 3, 4 bytes.  */
+	sth	rCHR, 0(rMEMP)
+	sthx	rCHR, rMEMP, rTMP	/* Second halfword ends on the last byte.  */
+	blr
+1:
+	addi	rTMP, rLEN, -4		/* 5, 6, 7 bytes.  */
+	stw	rCHR, 0(rMEMP)
+	stwx	rCHR, rMEMP, rTMP	/* Second word ends on the last byte.  */
+	blr
+2:	stb	rCHR, 0(rMEMP)
+	blr
+
+L(copy_words):
+	mtcrf	0x01, rCTR1		/* cr7 = low 4 bits of the 8-byte chunk count.  */
+	bf	cr7*4+1, 16f		/* No 32-byte piece.  */
+	stw	rCHR, 0(rMEMP)
+	stw	rCHR, 4(rMEMP)
+	stw	rCHR, 8(rMEMP)
+	stw	rCHR, 12(rMEMP)
+	stw	rCHR, 16(rMEMP)
+	stw	rCHR, 20(rMEMP)
+	stw	rCHR, 24(rMEMP)
+	stw	rCHR, 28(rMEMP)
+	addi	rMEMP, rMEMP, 32
+16:
+	bf	cr7*4+2, 8f		/* No 16-byte piece.  */
+	stw	rCHR, 0(rMEMP)
+	stw	rCHR, 4(rMEMP)
+	stw	rCHR, 8(rMEMP)
+	stw	rCHR, 12(rMEMP)
+	addi	rMEMP, rMEMP, 16
+8:
+	bf	cr7*4+3, L(copy_bytes)	/* No 8-byte piece.  */
+	stw	rCHR, 0(rMEMP)
+	stw	rCHR, 4(rMEMP)
+	bltlr	cr1			/* No trailing bytes: return.  */
+	addi	rMEMP, rMEMP, 8
+	b	L(copy_bytes)
+
+	.align 5
+L(check_cache_line_size):
+#ifdef	SHARED
+	mflr	rTMP			/* SETUP_GOT_ACCESS clobbers LR; keep a copy.  */
+/* Establishes GOT addressability so we can load __cache_line_size
+   from static.  This value was set from the aux vector during startup.  */
+	SETUP_GOT_ACCESS(rGOT,got_label_1)
+	addis	rGOT, rGOT, __cache_line_size-got_label_1@ha
+	lwz	rCLS, __cache_line_size-got_label_1@l(rGOT)
+	mtlr	rTMP
+#else
+/* Load __cache_line_size from static.  This value was set from the
+   aux vector during startup.  */
+	lis	rCLS, __cache_line_size@ha
+	lwz	rCLS, __cache_line_size@l(rCLS)
+#endif
+	cmplwi	5, rCLS, 64
+	neg	rTMP, rMEMP
+	bne	5, L(vec_nz)		/* Line size is not 64: use the vector loop.  */
+	andi.	rTMP, rTMP, 63		/* rTMP = bytes up to the next 64-byte boundary.  */
+	bne	L(nalign64)
+L(align64):
+	srwi	rCTR1, rLEN, 6		/* Number of 64-byte cache lines to zero.  */
+	cmplwi	7, rCTR1, 32767
+	rlwinm	rLEN, rLEN, 0, 26, 31	/* Remaining bytes (n mod 64).  */
+	mtctr	rCTR1
+	bgt	7, L(vec_zbig)		/* More than 32767 lines: flush the leading ones.  */
+L(vz_loop):
+	dcbzl	0, rMEMP		/* Zero one 64-byte cache line.  */
+	addi	rMEMP, rMEMP, 64
+	bdnz	L(vz_loop)
+	b	L(copy_remaining)
+
+L(vec_zbig):
+	addi	rCTR2, rCTR1, -32767	/* Lines in excess of the final 32767.  */
+	mtctr	rCTR2
+L(vz_big_loop):
+	dcbzl	0, rMEMP
+	dcbf	0, rMEMP		/* Flush the line that was just zeroed.  */
+	addi	rMEMP, rMEMP, 64
+	bdnz	L(vz_big_loop)
+	li	rCTR1, 32767		/* Last 32767 lines are zeroed without flushing.  */
+	mtctr	rCTR1
+	b	L(vz_loop)
+
+L(nalign64):
+	vxor	vCHR, vCHR, vCHR	/* Zero vector; this path is zero fill only.  */
+	subf	rLEN, rTMP, rLEN	/* Discount the bytes up to the 64-byte boundary.  */
+	li	rPOS48, 48
+	li	rPOS32, 32
+	stvx	vCHR, 0, rMEMP		/* Clear 64 bytes from the current position ...  */
+	stvx	vCHR, rPOS16, rMEMP
+	cmplwi	7, rLEN, 64
+	stvx	vCHR, rPOS32, rMEMP
+	stvx	vCHR, rPOS48, rMEMP
+	add	rMEMP, rMEMP, rTMP	/* ... then step to the 64-byte boundary.  */
+	blt	7, L(copy_remaining)
+	b	L(align64)
+
+L(nalign16):
+	stw	rCHR, 0(rMEMP)		/* Store 16 bytes at the unaligned start ...  */
+	stw	rCHR, 4(rMEMP)
+	subf	rLEN, rTMP, rLEN
+	stw	rCHR, 8(rMEMP)
+	stw	rCHR, 12(rMEMP)
+	add	rMEMP, rMEMP, rTMP	/* ... then step to the 16-byte boundary.  */
+	b	L(align16)
+
+	.align 5
+	/* Memset of 0-4 bytes.  Taken from GLIBC default memset.  */
+L(small):
+	cmplwi	cr5, rLEN, 1
+	cmplwi	cr1, rLEN, 3
+	bltlr	cr5			/* n == 0.  */
+	stb	rCHR, 0(rMEMP)
+	beqlr	cr5			/* n == 1.  */
+	nop
+	stb	rCHR, 1(rMEMP)
+	bltlr	cr1			/* n == 2.  */
+	stb	rCHR, 2(rMEMP)
+	beqlr	cr1			/* n == 3.  */
+	nop
+	stb	rCHR, 3(rMEMP)
+	blr
+
+	/* Memset of 5-31 bytes here.  Taken from GLIBC default memset.  */
+	.align 5
+L(medium):
+	mtcrf	0x01, rLEN		/* cr7 = low 4 bits of n, tested bit by bit below.  */
+	cmplwi	cr1, rLEN, 16
+	add	rMEMP, rMEMP, rLEN	/* Point past the end; stores work backwards.  */
+	bt	31, L(medium_31t)
+	bt	30, L(medium_30t)
+L(medium_30f):
+	bt	29, L(medium_29t)
+L(medium_29f):
+	bge	cr1, L(medium_27t)
+	bflr	28
+	stw	rCHR, -4(rMEMP)
+	stw	rCHR, -8(rMEMP)
+	blr
+L(medium_31t):
+	stbu	rCHR, -1(rMEMP)		/* n is odd: one byte.  */
+	bf	30, L(medium_30f)
+L(medium_30t):
+	sthu	rCHR, -2(rMEMP)		/* Bit value 2 set: one halfword.  */
+	bf	29, L(medium_29f)
+L(medium_29t):
+	stwu	rCHR, -4(rMEMP)		/* Bit value 4 set: one word.  */
+	blt	cr1, L(medium_27f)
+L(medium_27t):
+	stw	rCHR, -4(rMEMP)		/* n >= 16: sixteen bytes.  */
+	stw	rCHR, -8(rMEMP)
+	stw	rCHR, -12(rMEMP)
+	stwu	rCHR, -16(rMEMP)
+L(medium_27f):
+	bflr	28
+L(medium_28t):
+	stw	rCHR, -4(rMEMP)		/* Bit value 8 set: eight bytes.  */
+	stw	rCHR, -8(rMEMP)
+	blr
+
+END (memset)
+libc_hidden_builtin_def (memset)
diff -Naur glibc-2.20/sysdeps/powerpc/powerpc32/e6500/multiarch/rtld-memset.S glibc-2.20-e6500-mset/sysdeps/powerpc/powerpc32/e6500/multiarch/rtld-memset.S
--- glibc-2.20/sysdeps/powerpc/powerpc32/e6500/multiarch/rtld-memset.S	1969-12-31 18:00:00.000000000 -0600
+++ glibc-2.20-e6500-mset/sysdeps/powerpc/powerpc32/e6500/multiarch/rtld-memset.S	2015-08-29 16:15:33.250533132 -0500
@@ -0,0 +1,18 @@
+/* Copyright (C) 2015 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdeps/powerpc/powerpc32/e6500/memset.S>
diff -Naur glibc-2.20/sysdeps/powerpc/powerpc32/power4/multiarch/bzero.c glibc-2.20-e6500-mset/sysdeps/powerpc/powerpc32/power4/multiarch/bzero.c
--- glibc-2.20/sysdeps/powerpc/powerpc32/power4/multiarch/bzero.c	2015-08-29 15:45:05.846419682 -0500
+++ glibc-2.20-e6500-mset/sysdeps/powerpc/powerpc32/power4/multiarch/bzero.c	2015-08-29 16:00:45.023478670 -0500
@@ -23,6 +23,7 @@
 # include "init-arch.h"
 
 extern __typeof (bzero) __bzero_ppc attribute_hidden;
+extern __typeof (bzero) __bzero_e6500 attribute_hidden;
 extern __typeof (bzero) __bzero_power6 attribute_hidden;
 extern __typeof (bzero) __bzero_power7 attribute_hidden;
 
@@ -30,7 +31,10 @@
             (hwcap & PPC_FEATURE_HAS_VSX)
             ? __bzero_power7 :
 	      (hwcap & PPC_FEATURE_ARCH_2_05)
-		? __bzero_power6
+		? __bzero_power6 :
+	          (((hwcap & PPC_FEATURE_E6500) == PPC_FEATURE_E6500)
+	          && (hwcap2 & PPC_FEATURE2_HAS_ISEL))
+		    ? __bzero_e6500
             : __bzero_ppc);
 
 weak_alias (__bzero, bzero)
diff -Naur glibc-2.20/sysdeps/powerpc/powerpc32/power4/multiarch/bzero-e6500.S glibc-2.20-e6500-mset/sysdeps/powerpc/powerpc32/power4/multiarch/bzero-e6500.S
--- glibc-2.20/sysdeps/powerpc/powerpc32/power4/multiarch/bzero-e6500.S	1969-12-31 18:00:00.000000000 -0600
+++ glibc-2.20-e6500-mset/sysdeps/powerpc/powerpc32/power4/multiarch/bzero-e6500.S	2015-08-29 16:00:45.023478670 -0500
@@ -0,0 +1,26 @@
+/* Optimized bzero implementation for PowerPC32/e6500.
+   Copyright (C) 2015 Free Software Foundation, Inc.
+   Contributed by Luis Machado <luisgpm@br.ibm.com>.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+ENTRY (__bzero_e6500)
+        mr      r5,r4		/* r4 (presumably bzero's length) becomes memset's n.  */
+        li      r4,0		/* Fill value is zero.  */
+        b       __memset_e6500@local	/* Tail-branch; r3 is left untouched.  */
+END (__bzero_e6500)
diff -Naur glibc-2.20/sysdeps/powerpc/powerpc32/power4/multiarch/ifunc-impl-list.c glibc-2.20-e6500-mset/sysdeps/powerpc/powerpc32/power4/multiarch/ifunc-impl-list.c
--- glibc-2.20/sysdeps/powerpc/powerpc32/power4/multiarch/ifunc-impl-list.c	2015-08-29 15:45:37.044421872 -0500
+++ glibc-2.20-e6500-mset/sysdeps/powerpc/powerpc32/power4/multiarch/ifunc-impl-list.c	2015-08-29 16:00:45.023478670 -0500
@@ -76,6 +76,10 @@
 			      __memset_power7)
 	      IFUNC_IMPL_ADD (array, i, memset, hwcap & PPC_FEATURE_ARCH_2_05,
 			      __memset_power6)
+	      IFUNC_IMPL_ADD (array, i, memset,
+			      (((hwcap & PPC_FEATURE_E6500) == PPC_FEATURE_E6500)
+			      && (hwcap2 & PPC_FEATURE2_HAS_ISEL)),
+			      __memset_e6500)
 	      IFUNC_IMPL_ADD (array, i, memset, 1, __memset_ppc))
 
   /* Support sysdeps/powerpc/powerpc32/power4/multiarch/bzero.c.  */
@@ -84,6 +88,10 @@
 			      __bzero_power7)
 	      IFUNC_IMPL_ADD (array, i, bzero, hwcap & PPC_FEATURE_ARCH_2_05,
 			      __bzero_power6)
+	      IFUNC_IMPL_ADD (array, i, bzero,
+			      (((hwcap & PPC_FEATURE_E6500) == PPC_FEATURE_E6500)
+			      && (hwcap2 & PPC_FEATURE2_HAS_ISEL)),
+			      __bzero_e6500)
 	      IFUNC_IMPL_ADD (array, i, bzero, 1, __bzero_ppc))
 
   /* Support sysdeps/powerpc/powerpc32/power4/multiarch/strlen.c.  */
diff -Naur glibc-2.20/sysdeps/powerpc/powerpc32/power4/multiarch/Makefile glibc-2.20-e6500-mset/sysdeps/powerpc/powerpc32/power4/multiarch/Makefile
--- glibc-2.20/sysdeps/powerpc/powerpc32/power4/multiarch/Makefile	2015-08-29 15:46:34.217426773 -0500
+++ glibc-2.20-e6500-mset/sysdeps/powerpc/powerpc32/power4/multiarch/Makefile	2015-08-29 16:10:18.798514193 -0500
@@ -1,9 +1,9 @@
 ifeq ($(subdir),string)
 sysdep_routines += memcpy-power7 memcpy-a2 memcpy-power6 memcpy-cell \
 		   memcpy-e6500 memcpy-ppc32 memcmp-power7 memcmp-e6500 \
-		   memcmp-ppc32 memset-power7 memset-power6 memset-ppc32 \
-		   bzero-power7 bzero-power6 bzero-ppc32 \
-		   mempcpy-power7 mempcpy-ppc32 memchr-power7 \
+		   memcmp-ppc32 memset-power7 memset-power6 memset-e6500 \
+		   memset-ppc32 bzero-power7 bzero-power6 bzero-e6500 \
+		   bzero-ppc32 mempcpy-power7 mempcpy-ppc32 memchr-power7 \
 		   memchr-ppc32 memrchr-power7 memrchr-ppc32 rawmemchr-power7 \
 		   rawmemchr-ppc32 strlen-power7 strlen-ppc32 strnlen-power7 \
 		   strnlen-ppc32 strncmp-power7 strncmp-ppc32 \
diff -Naur glibc-2.20/sysdeps/powerpc/powerpc32/power4/multiarch/memset.c glibc-2.20-e6500-mset/sysdeps/powerpc/powerpc32/power4/multiarch/memset.c
--- glibc-2.20/sysdeps/powerpc/powerpc32/power4/multiarch/memset.c	2015-08-29 15:45:05.845419682 -0500
+++ glibc-2.20-e6500-mset/sysdeps/powerpc/powerpc32/power4/multiarch/memset.c	2015-08-29 16:00:45.024478649 -0500
@@ -23,6 +23,7 @@
 # include "init-arch.h"
 
 extern __typeof (memset) __memset_ppc attribute_hidden;
+extern __typeof (memset) __memset_e6500 attribute_hidden;
 extern __typeof (memset) __memset_power6 attribute_hidden;
 extern __typeof (memset) __memset_power7 attribute_hidden;
 
@@ -32,6 +33,9 @@
             (hwcap & PPC_FEATURE_HAS_VSX)
             ? __memset_power7 :
 	      (hwcap & PPC_FEATURE_ARCH_2_05)
-		? __memset_power6
+		? __memset_power6 :
+		  (((hwcap & PPC_FEATURE_E6500) == PPC_FEATURE_E6500)
+		  && (hwcap2 & PPC_FEATURE2_HAS_ISEL))
+		    ? __memset_e6500
             : __memset_ppc);
 #endif
diff -Naur glibc-2.20/sysdeps/powerpc/powerpc32/power4/multiarch/memset-e6500.S glibc-2.20-e6500-mset/sysdeps/powerpc/powerpc32/power4/multiarch/memset-e6500.S
--- glibc-2.20/sysdeps/powerpc/powerpc32/power4/multiarch/memset-e6500.S	1969-12-31 18:00:00.000000000 -0600
+++ glibc-2.20-e6500-mset/sysdeps/powerpc/powerpc32/power4/multiarch/memset-e6500.S	2015-08-29 16:00:45.024478649 -0500
@@ -0,0 +1,38 @@
+/* Optimized memset implementation for PowerPC32/e6500.
+   Copyright (C) 2015 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+#undef EALIGN
+#define EALIGN(name, alignt, words)				\
+ .globl C_SYMBOL_NAME(__memset_e6500);				\
+ .type C_SYMBOL_NAME(__memset_e6500),@function;		\
+ .align ALIGNARG(alignt);					\
+ EALIGN_W_##words;						\
+ C_LABEL(__memset_e6500)					\
+ cfi_startproc;
+
+#undef END
+#define END(name)						\
+ cfi_endproc;							\
+ ASM_SIZE_DIRECTIVE(__memset_e6500)
+
+#undef libc_hidden_builtin_def
+#define libc_hidden_builtin_def(name)
+
+#include <sysdeps/powerpc/powerpc32/e6500/memset.S>
diff -Naur glibc-2.20/sysdeps/powerpc/powerpc64/e6500/memset.S glibc-2.20-e6500-mset/sysdeps/powerpc/powerpc64/e6500/memset.S
--- glibc-2.20/sysdeps/powerpc/powerpc64/e6500/memset.S	1969-12-31 18:00:00.000000000 -0600
+++ glibc-2.20-e6500-mset/sysdeps/powerpc/powerpc64/e6500/memset.S	2015-08-29 16:00:45.024478649 -0500
@@ -0,0 +1,262 @@
+/* Optimized memset implementation for e6500 64-bit PowerPC.
+
+   Copyright (C) 2015 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#define rTMP	r0	/* scratch: alignment distance and tail-store offsets.  */
+#define rRTN	r3	/* initial value of 1st argument.  */
+#define rMEMP0	r3	/* original value of 1st arg.  */
+#define rCHR	r4	/* char to set in each byte.  */
+#define rLEN	r5	/* length of region to set.  */
+#define rMEMP	r6	/* address at which we are storing.  */
+#define rALIGN	r7	/* no. of bytes we are setting now (when aligning).  */
+#define rPOS16	r7	/* constant +16.  */
+#define rPOS32	r8	/* constant +32.  */
+#define rPOS48	r9	/* constant +48.  */
+#define rGOT	r9	/* Address of the Global Offset Table.  */
+#define rCLS	r9	/* Cache line size obtained from static.  */
+#define rCTR2	r7	/* line count for the dcbzl+dcbf loop; shares r7 with rPOS16.  */
+#define rCTR1	r11	/* block count that is moved into CTR.  */
+#define rTMP1	r12	/* rotated fill word; low 4 bits hold the byte's upper nibble.  */
+#define vCHR	v14	/* char to set in each byte.  */
+#define vTMP1	v15	/* splat of 4, used as the vslb shift count.  */
+#define vTMP2	v16	/* lvsl result with the lower nibble in byte 0.  */
+
+#include <sysdep.h>
+
+	.section ".toc", "aw"
+.LC0:
+	.tc __cache_line_size[TC], __cache_line_size
+	.section ".text"
+	.align 2
+
+/* __ptr_t [r3] memset (__ptr_t s [r3], int c [r4], size_t n [r5]);
+   Returns 's'.  */
+
+EALIGN (memset, 5, 0)
+	CALL_MCOUNT 3
+L(_memset):
+	cmpldi	cr1, rLEN, 8		/* cr1: n <= 8 goes to L(small).  */
+	cmpldi	cr5, rLEN, 32		/* cr5: n < 32 goes to L(medium).  */
+	mr	rMEMP, rMEMP0		/* Work on a copy; r3 is never modified.  */
+	ble	cr1, L(small)
+	rlwimi	rCHR, rCHR, 8, 16, 23	/* Replicate the fill byte to a halfword.  */
+	rlwimi	rCHR, rCHR, 16, 0, 15	/* Replicate the halfword to a word.  */
+	blt	cr5, L(medium)
+	neg	rTMP, rMEMP
+	andi.	rTMP, rTMP, 15		/* rTMP = bytes up to the next 16-byte boundary.  */
+	bne	L(nalign16)
+L(align16):
+	cmpldi	7, rLEN, 63
+	rlwinm.	rTMP1, rCHR, 28, 28, 3	/* cr0.eq only when the fill byte is zero.  */
+	li	rPOS16, 16
+	ble	7, L(copy_remaining)	/* Fewer than 64 bytes left: skip the block loops.  */
+	beq	L(check_cache_line_size)	/* Zero fill: try the dcbzl path.  */
+L(vec_nz):
+	srdi	rCTR1, rLEN, 6		/* 64-byte block count; 64-bit shift, n may be >= 4 GiB.  */
+	rlwinm	rLEN, rLEN, 0, 26, 31	/* Remaining bytes (n mod 64).  */
+	vxor	vCHR, vCHR, vCHR
+	mtctr	rCTR1			/* move count.  */
+	lvsl	vCHR, 0, rTMP1		/* LSU Move upper nibble
+					   to byte 0 of VR.  */
+	vspltisb	vTMP1, 4	/* VPU Splat 0x4 to every byte.  */
+	lvsl	vTMP2, 0, rCHR		/* LSU Move lower nibble
+					   to byte 0 of VR.  */
+	vslb	vCHR, vCHR, vTMP1	/* VIU Move upper nibble to VR[0:3].  */
+	vor	vCHR, vCHR, vTMP2	/* VIU Form FILL byte in VR[0:7].  */
+	vspltb	vCHR, vCHR, 0		/* VPU Splat the fill
+					   byte to all bytes.  */
+	li	rPOS32, 32
+	li	rPOS48, 48
+L(vnz_loop):
+	stvx	vCHR, 0, rMEMP		/* Four 16-byte stores = 64 bytes per iteration.  */
+	stvx	vCHR, rPOS16, rMEMP
+	stvx	vCHR, rPOS32, rMEMP
+	stvx	vCHR, rPOS48, rMEMP
+	addi	rMEMP, rMEMP, 64
+	bdnz	L(vnz_loop)
+L(copy_remaining):
+	srwi.	rCTR1, rLEN, 3		/* Number of 8-byte chunks left (rLEN < 64 here).  */
+	rlwinm	rLEN, rLEN, 0, 29, 31	/* Remaining bytes (n mod 8).  */
+	cmpldi	cr1, rLEN, 1
+	bne	0, L(copy_words)
+L(copy_bytes):
+	bltlr	cr1			/* nb == 0: nothing left, return.  */
+	cmpldi	cr0, rLEN, 4
+	beq	cr1, 2f			/* nb == 1: single byte store.  */
+	bgt	cr0, 1f			/* nb > 4?  (5, 6, 7 bytes).  */
+	addi	rTMP, rLEN, -2		/* 2, 3, 4 bytes.  */
+	sth	rCHR, 0(rMEMP)
+	sthx	rCHR, rMEMP, rTMP	/* Second halfword ends on the last byte.  */
+	blr
+1:
+	addi	rTMP, rLEN, -4		/* 5, 6, 7 bytes.  */
+	stw	rCHR, 0(rMEMP)
+	stwx	rCHR, rMEMP, rTMP	/* Second word ends on the last byte.  */
+	blr
+2:	stb	rCHR, 0(rMEMP)
+	blr
+
+L(copy_words):
+	mtcrf	0x01, rCTR1		/* cr7 = low 4 bits of the 8-byte chunk count.  */
+	insrdi	rCHR, rCHR, 32, 0	/* Replicate word to double word.  */
+	bf	cr7*4+1, 16f		/* No 32-byte piece.  */
+	std	rCHR, 0(rMEMP)
+	std	rCHR, 8(rMEMP)
+	std	rCHR, 16(rMEMP)
+	std	rCHR, 24(rMEMP)
+	addi	rMEMP, rMEMP, 32
+16:
+	bf	cr7*4+2, 8f		/* No 16-byte piece.  */
+	std	rCHR, 0(rMEMP)
+	std	rCHR, 8(rMEMP)
+	addi	rMEMP, rMEMP, 16
+8:
+	bf	cr7*4+3, L(copy_bytes)	/* No 8-byte piece.  */
+	std	rCHR, 0(rMEMP)
+	addi	rMEMP, rMEMP, 8
+	b	L(copy_bytes)
+
+	.align 5
+L(check_cache_line_size):
+	ld	rCLS, .LC0@toc(r2)	/* Address of __cache_line_size from the TOC.  */
+	lwz	rCLS, 0(rCLS)
+	cmpldi	5, rCLS, 64
+	neg	rTMP, rMEMP
+	bne	5, L(vec_nz)		/* Line size is not 64: use the vector loop.  */
+	andi.	rTMP, rTMP, 63		/* rTMP = bytes up to the next 64-byte boundary.  */
+	bne	L(nalign64)
+L(align64):
+	srdi	rCTR1, rLEN, 6		/* 64-byte line count; 64-bit shift, n may be >= 4 GiB.  */
+	cmpldi	7, rCTR1, 32767
+	rlwinm	rLEN, rLEN, 0, 26, 31	/* Remaining bytes (n mod 64).  */
+	mtctr	rCTR1
+	bgt	7, L(vec_zbig)		/* More than 32767 lines: flush the leading ones.  */
+L(vz_loop):
+	dcbzl	0, rMEMP		/* Zero one 64-byte cache line.  */
+	addi	rMEMP, rMEMP, 64
+	bdnz	L(vz_loop)
+	b	L(copy_remaining)
+
+L(vec_zbig):
+	addi	rCTR2, rCTR1, -32767	/* Lines in excess of the final 32767.  */
+	mtctr	rCTR2
+L(vz_big_loop):
+	dcbzl	0, rMEMP
+	dcbf	0, rMEMP		/* Flush the line that was just zeroed.  */
+	addi	rMEMP, rMEMP, 64
+	bdnz	L(vz_big_loop)
+	li	rCTR1, 32767		/* Last 32767 lines are zeroed without flushing.  */
+	mtctr	rCTR1
+	b	L(vz_loop)
+
+L(nalign64):
+	vxor	vCHR, vCHR, vCHR	/* Zero vector; this path is zero fill only.  */
+	subf	rLEN, rTMP, rLEN	/* Discount the bytes up to the 64-byte boundary.  */
+	li	rPOS48, 48
+	li	rPOS32, 32
+	stvx	vCHR, 0, rMEMP		/* Clear 64 bytes from the current position ...  */
+	stvx	vCHR, rPOS16, rMEMP
+	cmpldi	7, rLEN, 64
+	stvx	vCHR, rPOS32, rMEMP
+	stvx	vCHR, rPOS48, rMEMP
+	add	rMEMP, rMEMP, rTMP	/* ... then step to the 64-byte boundary.  */
+	blt	7, L(copy_remaining)
+	b	L(align64)
+
+L(nalign16):
+	insrdi	rCHR, rCHR, 32, 0	/* Replicate word to double word.  */
+	std	rCHR, 0(rMEMP)		/* Store 16 bytes at the unaligned start ...  */
+	subf	rLEN, rTMP, rLEN
+	std	rCHR, 8(rMEMP)
+	add	rMEMP, rMEMP, rTMP	/* ... then step to the 16-byte boundary.  */
+	b	L(align16)
+
+	/* Memset of 8 bytes or less.  Taken from GLIBC default memset.  */
+	.align 5
+L(small):
+	cmpldi	cr6, rLEN, 4
+	cmpldi	cr5, rLEN, 1
+	ble	cr6, L(le4)
+	subi	rLEN, rLEN, 4		/* 5-8 bytes: store four, then 1-4 more below.  */
+	stb	rCHR, 0(rMEMP)
+	stb	rCHR, 1(rMEMP)
+	stb	rCHR, 2(rMEMP)
+	stb	rCHR, 3(rMEMP)
+	addi	rMEMP, rMEMP, 4
+	cmpldi	cr5, rLEN, 1
+L(le4):
+	cmpldi	cr1, rLEN, 3
+	bltlr	cr5			/* n == 0.  */
+	stb	rCHR, 0(rMEMP)
+	beqlr	cr5			/* n == 1.  */
+	stb	rCHR, 1(rMEMP)
+	bltlr	cr1			/* n == 2.  */
+	stb	rCHR, 2(rMEMP)
+	beqlr	cr1			/* n == 3.  */
+	stb	rCHR, 3(rMEMP)
+	blr
+
+	/* Memset of 9-31 bytes here.  Taken from GLIBC default memset.  */
+	.align 5
+L(medium):
+	mtcrf	0x01, rLEN		/* cr7 = low 4 bits of n, tested bit by bit below.  */
+	insrdi	rCHR, rCHR, 32, 0	/* Replicate word to double word.  */
+	cmpldi	cr1, rLEN, 16
+L(medium_tail2):
+	add	rMEMP, rMEMP, rLEN	/* Point past the end; stores work backwards.  */
+L(medium_tail):
+	bt	31, L(medium_31t)
+	bt	30, L(medium_30t)
+L(medium_30f):
+	bt	29, L(medium_29t)
+L(medium_29f):
+	bge	cr1, L(medium_27t)
+	bflr	28
+	std	rCHR, -8(rMEMP)
+	blr
+L(medium_31t):
+	stbu	rCHR, -1(rMEMP)		/* n is odd: one byte.  */
+	bf	30, L(medium_30f)
+L(medium_30t):
+	sthu	rCHR, -2(rMEMP)		/* Bit value 2 set: one halfword.  */
+	bf	29, L(medium_29f)
+L(medium_29t):
+	stwu	rCHR, -4(rMEMP)		/* Bit value 4 set: one word.  */
+	blt	cr1, L(medium_27f)
+L(medium_27t):
+	std	rCHR, -8(rMEMP)		/* n >= 16: sixteen bytes.  */
+	stdu	rCHR, -16(rMEMP)
+L(medium_27f):
+	bflr	28
+L(medium_28t):
+	std	rCHR, -8(rMEMP)		/* Bit value 8 set: eight bytes.  */
+	blr
+END_GEN_TB (memset,TB_TOCLESS)
+libc_hidden_builtin_def (memset)
+
+#ifndef	NO_BZERO_IMPL
+/* Copied from bzero.S to prevent the linker from inserting a stub
+   between bzero and memset.  */
+ENTRY (__bzero)
+	mr	r5, r4		/* r4 (presumably bzero's length) becomes memset's n.  */
+	li	r4, 0		/* Fill value is zero.  */
+	b	L(_memset)	/* Enter memset after its CALL_MCOUNT; r3 is untouched.  */
+END_GEN_TB (__bzero,TB_TOCLESS)
+
+weak_alias (__bzero, bzero)
+#endif
diff -Naur glibc-2.20/sysdeps/powerpc/powerpc64/multiarch/bzero.c glibc-2.20-e6500-mset/sysdeps/powerpc/powerpc64/multiarch/bzero.c
--- glibc-2.20/sysdeps/powerpc/powerpc64/multiarch/bzero.c	2015-08-29 15:45:05.867419683 -0500
+++ glibc-2.20-e6500-mset/sysdeps/powerpc/powerpc64/multiarch/bzero.c	2015-08-29 16:00:45.024478649 -0500
@@ -23,6 +23,7 @@
 # include "init-arch.h"
 
 extern __typeof (bzero) __bzero_ppc attribute_hidden;
+extern __typeof (bzero) __bzero_e6500 attribute_hidden;
 extern __typeof (bzero) __bzero_power4 attribute_hidden;
 extern __typeof (bzero) __bzero_power6 attribute_hidden;
 extern __typeof (bzero) __bzero_power7 attribute_hidden;
@@ -33,7 +34,10 @@
 	      (hwcap & PPC_FEATURE_ARCH_2_05)
 		? __bzero_power6 :
 		  (hwcap & PPC_FEATURE_POWER4)
-		? __bzero_power4
+		? __bzero_power4 :
+		  (((hwcap & PPC_FEATURE_E6500) == PPC_FEATURE_E6500)
+		  && (hwcap2 & PPC_FEATURE2_HAS_ISEL))
+		? __bzero_e6500
             : __bzero_ppc);
 
 weak_alias (__bzero, bzero)
diff -Naur glibc-2.20/sysdeps/powerpc/powerpc64/multiarch/bzero-e6500.S glibc-2.20-e6500-mset/sysdeps/powerpc/powerpc64/multiarch/bzero-e6500.S
--- glibc-2.20/sysdeps/powerpc/powerpc64/multiarch/bzero-e6500.S	1969-12-31 18:00:00.000000000 -0600
+++ glibc-2.20-e6500-mset/sysdeps/powerpc/powerpc64/multiarch/bzero-e6500.S	2015-08-29 16:00:45.024478649 -0500
@@ -0,0 +1,26 @@
+/* Optimized bzero implementation for PowerPC64/e6500.
+   Copyright (C) 2015 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+ENTRY (__bzero_e6500)
+	CALL_MCOUNT 3
+	mr	r5,r4		/* r4 (presumably bzero's length) becomes memset's n.  */
+	li	r4,0		/* Fill value is zero.  */
+	b	__memset_e6500	/* Tail-branch; r3 is left untouched.  */
+END_GEN_TB (__bzero_e6500,TB_TOCLESS)
diff -Naur glibc-2.20/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c glibc-2.20-e6500-mset/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
--- glibc-2.20/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c	2015-08-29 15:45:37.045421842 -0500
+++ glibc-2.20-e6500-mset/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c	2015-08-29 16:00:45.024478649 -0500
@@ -80,6 +80,10 @@
 			      __memset_power6)
 	      IFUNC_IMPL_ADD (array, i, memset, hwcap & PPC_FEATURE_POWER4,
 			      __memset_power4)
+	      IFUNC_IMPL_ADD (array, i, memset,
+			      (((hwcap & PPC_FEATURE_E6500) == PPC_FEATURE_E6500)
+			      && (hwcap2 & PPC_FEATURE2_HAS_ISEL)),
+			      __memset_e6500)
 	      IFUNC_IMPL_ADD (array, i, memset, 1, __memset_ppc))
 
   /* Support sysdeps/powerpc/powerpc64/multiarch/strcpy.c.  */
@@ -149,6 +153,10 @@
 			      __bzero_power6)
 	      IFUNC_IMPL_ADD (array, i, bzero, hwcap & PPC_FEATURE_POWER4,
 			      __bzero_power4)
+	      IFUNC_IMPL_ADD (array, i, bzero,
+			      (((hwcap & PPC_FEATURE_E6500) == PPC_FEATURE_E6500)
+			      && (hwcap2 & PPC_FEATURE2_HAS_ISEL)),
+			      __bzero_e6500)
 	      IFUNC_IMPL_ADD (array, i, bzero, 1, __bzero_ppc))
 
   /* Support sysdeps/powerpc/powerpc64/multiarch/bcopy.c.  */
diff -Naur glibc-2.20/sysdeps/powerpc/powerpc64/multiarch/Makefile glibc-2.20-e6500-mset/sysdeps/powerpc/powerpc64/multiarch/Makefile
--- glibc-2.20/sysdeps/powerpc/powerpc64/multiarch/Makefile	2015-08-29 15:47:51.985430863 -0500
+++ glibc-2.20-e6500-mset/sysdeps/powerpc/powerpc64/multiarch/Makefile	2015-08-29 16:12:43.814523563 -0500
@@ -1,9 +1,9 @@
 ifeq ($(subdir),string)
 sysdep_routines += memcpy-power7 memcpy-a2 memcpy-power6 memcpy-cell \
 		   memcpy-power4 memcpy-e6500 memcpy-ppc64 memcmp-power7 \
-		   memcmp-power4 memcmp-e6500 memcmp-ppc64 \
-		   memset-power7 memset-power6 memset-power4 \
-		   memset-ppc64 bzero-power4 bzero-power6 bzero-power7 \
+		   memcmp-power4 memcmp-e6500 memcmp-ppc64 memset-power7 \
+		   memset-power6 memset-power4 memset-e6500 memset-ppc64 \
+		   bzero-power4 bzero-power6 bzero-power7 bzero-e6500 \
 		   mempcpy-power7 mempcpy-ppc64 memchr-power7 memchr-ppc64 \
 		   memrchr-power7 memrchr-ppc64 rawmemchr-power7 \
 		   rawmemchr-ppc64 strlen-power7 strlen-ppc64 strnlen-power7 \
diff -Naur glibc-2.20/sysdeps/powerpc/powerpc64/multiarch/memset.c glibc-2.20-e6500-mset/sysdeps/powerpc/powerpc64/multiarch/memset.c
--- glibc-2.20/sysdeps/powerpc/powerpc64/multiarch/memset.c	2015-08-29 15:45:05.866419683 -0500
+++ glibc-2.20-e6500-mset/sysdeps/powerpc/powerpc64/multiarch/memset.c	2015-08-29 16:00:45.025478633 -0500
@@ -29,6 +29,7 @@
 extern __typeof (__redirect_memset) __libc_memset;
 
 extern __typeof (__redirect_memset) __memset_ppc attribute_hidden;
+extern __typeof (__redirect_memset) __memset_e6500 attribute_hidden;
 extern __typeof (__redirect_memset) __memset_power4 attribute_hidden;
 extern __typeof (__redirect_memset) __memset_power6 attribute_hidden;
 extern __typeof (__redirect_memset) __memset_power7 attribute_hidden;
@@ -41,7 +42,10 @@
 	      (hwcap & PPC_FEATURE_ARCH_2_05)
 		? __memset_power6 :
 		  (hwcap & PPC_FEATURE_POWER4)
-		? __memset_power4
+		? __memset_power4 :
+		  (((hwcap & PPC_FEATURE_E6500) == PPC_FEATURE_E6500)
+		  && (hwcap2 & PPC_FEATURE2_HAS_ISEL))
+		? __memset_e6500
             : __memset_ppc);
 
 #undef memset
diff -Naur glibc-2.20/sysdeps/powerpc/powerpc64/multiarch/memset-e6500.S glibc-2.20-e6500-mset/sysdeps/powerpc/powerpc64/multiarch/memset-e6500.S
--- glibc-2.20/sysdeps/powerpc/powerpc64/multiarch/memset-e6500.S	1969-12-31 18:00:00.000000000 -0600
+++ glibc-2.20-e6500-mset/sysdeps/powerpc/powerpc64/multiarch/memset-e6500.S	2015-08-29 16:00:45.025478633 -0500
@@ -0,0 +1,41 @@
+/* Optimized memset implementation for PowerPC64/e6500.
+   Copyright (C) 2015 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+#undef EALIGN
+#define EALIGN(name, alignt, words)				\
+  .section ".text";						\
+  ENTRY_2(__memset_e6500)					\
+  .align ALIGNARG(alignt);					\
+  EALIGN_W_##words;						\
+  BODY_LABEL(__memset_e6500):					\
+  cfi_startproc;						\
+  LOCALENTRY(__memset_e6500)
+
+#undef END_GEN_TB
+#define END_GEN_TB(name, mask)					\
+  cfi_endproc;							\
+  TRACEBACK_MASK(__memset_e6500,mask)				\
+  END_2(__memset_e6500)
+
+#undef libc_hidden_builtin_def
+#define libc_hidden_builtin_def(name)
+
+#define NO_BZERO_IMPL
+#include <sysdeps/powerpc/powerpc64/e6500/memset.S>

The patch was generated on top of the glibc v2.20 source base.
The patch was tested with DejaGnu and the glibc testsuite; there were no regressions.

The benchsuite (both 32-bit and 64-bit) results are attached for your reference.

Please let me know your comments.

Regards,
Rohit


Attachment: benchtest-e6500-64bit-memset.txt
Description: benchtest-e6500-64bit-memset.txt

Attachment: benchtest-e6500-32bit-memset.txt
Description: benchtest-e6500-32bit-memset.txt


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]