This is the mail archive of the libc-alpha@sourceware.org mailing list for the glibc project.
Index Nav: | [Date Index] [Subject Index] [Author Index] [Thread Index] | |
---|---|---|
Message Nav: | [Date Prev] [Date Next] | [Thread Prev] [Thread Next] |
Other format: | [Raw text] |
Hello All, Please find below a patch for an optimized implementation of 'memset' for the PowerPC e6500 (32-bit & 64-bit) target using AltiVec instructions. 2015-08-31 Rohit Arul Raj <rohitarulraj@freescale.com> * sysdeps/powerpc/powerpc32/e6500/memset.S: New File: optimized memset implementation using AltiVec instructions. * sysdeps/powerpc/powerpc32/e6500/multiarch/rtld-memset.S: New File. * sysdeps/powerpc/powerpc32/power4/multiarch/bzero.c: Add check for e6500 bzero function. * sysdeps/powerpc/powerpc32/power4/multiarch/bzero-e6500.S: New File: multiarch e6500 bzero. * sysdeps/powerpc/powerpc32/power4/multiarch/ifunc-impl-list.c (__libc_ifunc_impl_list): Add check for e6500 memset & bzero functions. * sysdeps/powerpc/powerpc32/power4/multiarch/Makefile: Add memset-e6500 and bzero-e6500 objects. * sysdeps/powerpc/powerpc32/power4/multiarch/memset.c: Add check for e6500 memset function. * sysdeps/powerpc/powerpc32/power4/multiarch/memset-e6500.S: New File: multiarch e6500 memset. * sysdeps/powerpc/powerpc64/e6500/memset.S: New File: optimized memset implementation using AltiVec instructions. * sysdeps/powerpc/powerpc64/multiarch/bzero.c: Add check for e6500 bzero function. * sysdeps/powerpc/powerpc64/multiarch/bzero-e6500.S: New File: multiarch e6500 bzero. * sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c (__libc_ifunc_impl_list): Add check for e6500 memset & bzero functions. * sysdeps/powerpc/powerpc64/multiarch/Makefile: Add memset-e6500 and bzero-e6500 objects. * sysdeps/powerpc/powerpc64/multiarch/memset.c: Add check for e6500 memset function. * sysdeps/powerpc/powerpc64/multiarch/memset-e6500.S: New File: multiarch e6500 memset. 
diff -Naur glibc-2.20/sysdeps/powerpc/powerpc32/e6500/memset.S glibc-2.20-e6500-mset/sysdeps/powerpc/powerpc32/e6500/memset.S --- glibc-2.20/sysdeps/powerpc/powerpc32/e6500/memset.S 1969-12-31 18:00:00.000000000 -0600 +++ glibc-2.20-e6500-mset/sysdeps/powerpc/powerpc32/e6500/memset.S 2015-08-29 16:00:45.023478670 -0500 @@ -0,0 +1,257 @@ +/* Optimized memset implementation for e6500 32-bit PowerPC. + + Copyright (C) 2015 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> + +/* __ptr_t [r3] memset (__ptr_t s [r3], int c [r4], size_t n [r5])); + Returns 's'. */ + +#define rTMP r0 +#define rRTN r3 /* initial value of 1st argument. */ +#define rMEMP0 r3 /* original value of 1st arg. */ +#define rCHR r4 /* char to set in each byte. */ +#define rLEN r5 /* length of region to set. */ +#define rMEMP r6 /* address at which we are storing. */ +#define rALIGN r7 /* no. of bytes we are setting now (when aligning). */ +#define rPOS16 r7 /* constant +16. */ +#define rPOS32 r8 /* constant +32. */ +#define rPOS48 r9 /* constant +48. */ +#define rGOT r9 /* Address of the Global Offset Table. */ +#define rCLS r9 /* Cache line size obtained from static. */ +#define rCTR2 r7 +#define rCTR1 r11 +#define rTMP1 r12 +#define vCHR v14 /* char to set in each byte. 
*/ +#define vTMP1 v15 +#define vTMP2 v16 + + .section ".text" +EALIGN (memset, 5, 1) + cmplwi cr1, rLEN, 4 + cmplwi cr5, rLEN, 32 + mr rMEMP, rMEMP0 + ble cr1, L(small) + rlwimi rCHR, rCHR, 8, 16, 23 + rlwimi rCHR, rCHR, 16, 0, 15 + blt cr5, L(medium) + neg rTMP, rMEMP + andi. rTMP, rTMP, 15 + bne L(nalign16) +L(align16): + cmplwi 7, rLEN, 63 + rlwinm. rTMP1, rCHR, 28, 28, 3 + li rPOS16, 16 + ble 7, L(copy_remaining) + beq L(check_cache_line_size) +L(vec_nz): + srwi rCTR1, rLEN, 6 /* No of 64 byte copy count. */ + rlwinm rLEN, rLEN, 0, 26, 31 /* remaining bytes. */ + vxor vCHR, vCHR, vCHR + mtctr rCTR1 /* move count. */ + lvsl vCHR, 0, rTMP1 /* LSU Move upper + nibble to byte 0 of VR. */ + vspltisb vTMP1, 4 /* VPU Splat 0x4 to every byte. */ + lvsl vTMP2, 0, rCHR /* LSU Move lower + nibble to byte 0 of VR. */ + vslb vCHR, vCHR, vTMP1 /* VIU Move upper nibble to VR[0:3]. */ + vor vCHR, vCHR, vTMP2 /* VIU Form FILL byte in VR[0:7]. */ + vspltb vCHR, vCHR, 0 /* VPU Splat the fill + byte to all bytes. */ + li rPOS32, 32 + li rPOS48, 48 +L(vnz_loop): + stvx vCHR, 0, rMEMP + stvx vCHR, rPOS16, rMEMP + stvx vCHR, rPOS32, rMEMP + stvx vCHR, rPOS48, rMEMP + addi rMEMP, rMEMP, 64 + bdnz L(vnz_loop) +L(copy_remaining): + srwi. rCTR1, rLEN, 3 /* No of 8 byte copy count. */ + rlwinm rLEN, rLEN, 0, 29, 31 /* remaining bytes. */ + cmplwi cr1, rLEN, 1 + bne 0, L(copy_words) +L(copy_bytes): + bltlr cr1 + cmplwi cr0, rLEN, 4 + beq cr1, 2f /* nb <= 1? (0, 1 bytes). */ + bgt cr0, 1f /* nb > 4? (5, 6, 7 bytes). */ + addi rTMP, rLEN, -2 /* 2, 3, 4 bytes. */ + sth rCHR, 0(rMEMP) + sthx rCHR, rMEMP, rTMP + blr +1: + addi rTMP, rLEN, -4 /* 5, 6, 7 bytes. 
*/ + stw rCHR, 0(rMEMP) + stwx rCHR, rMEMP, rTMP + blr +2: stb rCHR, 0(rMEMP) + blr + +L(copy_words): + mtcrf 0x01, rCTR1 + bf cr7*4+1, 16f + stw rCHR, 0(rMEMP) + stw rCHR, 4(rMEMP) + stw rCHR, 8(rMEMP) + stw rCHR, 12(rMEMP) + stw rCHR, 16(rMEMP) + stw rCHR, 20(rMEMP) + stw rCHR, 24(rMEMP) + stw rCHR, 28(rMEMP) + addi rMEMP, rMEMP, 32 +16: + bf cr7*4+2, 8f + stw rCHR, 0(rMEMP) + stw rCHR, 4(rMEMP) + stw rCHR, 8(rMEMP) + stw rCHR, 12(rMEMP) + addi rMEMP, rMEMP, 16 +8: + bf cr7*4+3, L(copy_bytes) + stw rCHR, 0(rMEMP) + stw rCHR, 4(rMEMP) + bltlr cr1 + addi rMEMP, rMEMP, 8 + b L(copy_bytes) + + .align 5 +L(check_cache_line_size): +#ifdef SHARED + mflr rTMP +/* Establishes GOT addressability so we can load __cache_line_size + from static. This value was set from the aux vector during startup. */ + SETUP_GOT_ACCESS(rGOT,got_label_1) + addis rGOT, rGOT, __cache_line_size-got_label_1@ha + lwz rCLS, __cache_line_size-got_label_1@l(rGOT) + mtlr rTMP +#else +/* Load __cache_line_size from static. This value was set from the + aux vector during startup. */ + lis rCLS, __cache_line_size@ha + lwz rCLS, __cache_line_size@l(rCLS) +#endif + cmplwi 5, rCLS, 64 + neg rTMP, rMEMP + bne 5, L(vec_nz) + andi. 
rTMP, rTMP, 63 + bne L(nalign64) +L(align64): + srwi rCTR1, rLEN, 6 + cmplwi 7, rCTR1, 32767 + rlwinm rLEN, rLEN, 0, 26, 31 + mtctr rCTR1 + bgt 7, L(vec_zbig) +L(vz_loop): + dcbzl 0, rMEMP + addi rMEMP, rMEMP, 64 + bdnz L(vz_loop) + b L(copy_remaining) + +L(vec_zbig): + addi rCTR2, rCTR1, -32767 + mtctr rCTR2 +L(vz_big_loop): + dcbzl 0, rMEMP + dcbf 0, rMEMP + addi rMEMP, rMEMP, 64 + bdnz L(vz_big_loop) + li rCTR1, 32767 + mtctr rCTR1 + b L(vz_loop) + +L(nalign64): + vxor vCHR, vCHR, vCHR + subf rLEN, rTMP, rLEN + li rPOS48, 48 + li rPOS32, 32 + stvx vCHR, 0, rMEMP + stvx vCHR, rPOS16, rMEMP + cmplwi 7, rLEN, 64 + stvx vCHR, rPOS32, rMEMP + stvx vCHR, rPOS48, rMEMP + add rMEMP, rMEMP, rTMP + blt 7, L(copy_remaining) + b L(align64) + +L(nalign16): + stw rCHR, 0(rMEMP) + stw rCHR, 4(rMEMP) + subf rLEN, rTMP, rLEN + stw rCHR, 8(rMEMP) + stw rCHR, 12(rMEMP) + add rMEMP, rMEMP, rTMP + b L(align16) + + .align 5 + /* Memset of 0-4 bytes. Taken from GLIBC default memset. */ +L(small): + cmplwi cr5, rLEN, 1 + cmplwi cr1, rLEN, 3 + bltlr cr5 + stb rCHR, 0(rMEMP) + beqlr cr5 + nop + stb rCHR, 1(rMEMP) + bltlr cr1 + stb rCHR, 2(rMEMP) + beqlr cr1 + nop + stb rCHR, 3(rMEMP) + blr + + /* Memset of 0-31 bytes. Taken from GLIBC default memset. 
*/ + .align 5 +L(medium): + mtcrf 0x01, rLEN + cmplwi cr1, rLEN, 16 + add rMEMP, rMEMP, rLEN + bt 31, L(medium_31t) + bt 30, L(medium_30t) +L(medium_30f): + bt 29, L(medium_29t) +L(medium_29f): + bge cr1, L(medium_27t) + bflr 28 + stw rCHR, -4(rMEMP) + stw rCHR, -8(rMEMP) + blr +L(medium_31t): + stbu rCHR, -1(rMEMP) + bf 30, L(medium_30f) +L(medium_30t): + sthu rCHR, -2(rMEMP) + bf 29, L(medium_29f) +L(medium_29t): + stwu rCHR, -4(rMEMP) + blt cr1, L(medium_27f) +L(medium_27t): + stw rCHR, -4(rMEMP) + stw rCHR, -8(rMEMP) + stw rCHR, -12(rMEMP) + stwu rCHR, -16(rMEMP) +L(medium_27f): + bflr 28 +L(medium_28t): + stw rCHR, -4(rMEMP) + stw rCHR, -8(rMEMP) + blr + +END (memset) +libc_hidden_builtin_def (memset) diff -Naur glibc-2.20/sysdeps/powerpc/powerpc32/e6500/multiarch/rtld-memset.S glibc-2.20-e6500-mset/sysdeps/powerpc/powerpc32/e6500/multiarch/rtld-memset.S --- glibc-2.20/sysdeps/powerpc/powerpc32/e6500/multiarch/rtld-memset.S 1969-12-31 18:00:00.000000000 -0600 +++ glibc-2.20-e6500-mset/sysdeps/powerpc/powerpc32/e6500/multiarch/rtld-memset.S 2015-08-29 16:15:33.250533132 -0500 @@ -0,0 +1,18 @@ +/* Copyright (C) 2015 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
*/ + +#include <sysdeps/powerpc/powerpc32/e6500/memset.S> diff -Naur glibc-2.20/sysdeps/powerpc/powerpc32/power4/multiarch/bzero.c glibc-2.20-e6500-mset/sysdeps/powerpc/powerpc32/power4/multiarch/bzero.c --- glibc-2.20/sysdeps/powerpc/powerpc32/power4/multiarch/bzero.c 2015-08-29 15:45:05.846419682 -0500 +++ glibc-2.20-e6500-mset/sysdeps/powerpc/powerpc32/power4/multiarch/bzero.c 2015-08-29 16:00:45.023478670 -0500 @@ -23,6 +23,7 @@ # include "init-arch.h" extern __typeof (bzero) __bzero_ppc attribute_hidden; +extern __typeof (bzero) __bzero_e6500 attribute_hidden; extern __typeof (bzero) __bzero_power6 attribute_hidden; extern __typeof (bzero) __bzero_power7 attribute_hidden; @@ -30,7 +31,10 @@ (hwcap & PPC_FEATURE_HAS_VSX) ? __bzero_power7 : (hwcap & PPC_FEATURE_ARCH_2_05) - ? __bzero_power6 + ? __bzero_power6 : + (((hwcap & PPC_FEATURE_E6500) == PPC_FEATURE_E6500) + && (hwcap2 & PPC_FEATURE2_HAS_ISEL)) + ? __bzero_e6500 : __bzero_ppc); weak_alias (__bzero, bzero) diff -Naur glibc-2.20/sysdeps/powerpc/powerpc32/power4/multiarch/bzero-e6500.S glibc-2.20-e6500-mset/sysdeps/powerpc/powerpc32/power4/multiarch/bzero-e6500.S --- glibc-2.20/sysdeps/powerpc/powerpc32/power4/multiarch/bzero-e6500.S 1969-12-31 18:00:00.000000000 -0600 +++ glibc-2.20-e6500-mset/sysdeps/powerpc/powerpc32/power4/multiarch/bzero-e6500.S 2015-08-29 16:00:45.023478670 -0500 @@ -0,0 +1,26 @@ +/* Optimized bzero implementation for PowerPC32/e6500. + Copyright (C) 2015 Free Software Foundation, Inc. + Contributed by Luis Machado <luisgpm@br.ibm.com>. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. 
+ + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> + +ENTRY (__bzero_e6500) + mr r5,r4 + li r4,0 + b __memset_e6500@local +END (__bzero_e6500) diff -Naur glibc-2.20/sysdeps/powerpc/powerpc32/power4/multiarch/ifunc-impl-list.c glibc-2.20-e6500-mset/sysdeps/powerpc/powerpc32/power4/multiarch/ifunc-impl-list.c --- glibc-2.20/sysdeps/powerpc/powerpc32/power4/multiarch/ifunc-impl-list.c 2015-08-29 15:45:37.044421872 -0500 +++ glibc-2.20-e6500-mset/sysdeps/powerpc/powerpc32/power4/multiarch/ifunc-impl-list.c 2015-08-29 16:00:45.023478670 -0500 @@ -76,6 +76,10 @@ __memset_power7) IFUNC_IMPL_ADD (array, i, memset, hwcap & PPC_FEATURE_ARCH_2_05, __memset_power6) + IFUNC_IMPL_ADD (array, i, memset, + (((hwcap & PPC_FEATURE_E6500) == PPC_FEATURE_E6500) + && (hwcap2 & PPC_FEATURE2_HAS_ISEL)), + __memset_e6500) IFUNC_IMPL_ADD (array, i, memset, 1, __memset_ppc)) /* Support sysdeps/powerpc/powerpc32/power4/multiarch/bzero.c. */ @@ -84,6 +88,10 @@ __bzero_power7) IFUNC_IMPL_ADD (array, i, bzero, hwcap & PPC_FEATURE_ARCH_2_05, __bzero_power6) + IFUNC_IMPL_ADD (array, i, bzero, + (((hwcap & PPC_FEATURE_E6500) == PPC_FEATURE_E6500) + && (hwcap2 & PPC_FEATURE2_HAS_ISEL)), + __bzero_e6500) IFUNC_IMPL_ADD (array, i, bzero, 1, __bzero_ppc)) /* Support sysdeps/powerpc/powerpc32/power4/multiarch/strlen.c. 
*/ diff -Naur glibc-2.20/sysdeps/powerpc/powerpc32/power4/multiarch/Makefile glibc-2.20-e6500-mset/sysdeps/powerpc/powerpc32/power4/multiarch/Makefile --- glibc-2.20/sysdeps/powerpc/powerpc32/power4/multiarch/Makefile 2015-08-29 15:46:34.217426773 -0500 +++ glibc-2.20-e6500-mset/sysdeps/powerpc/powerpc32/power4/multiarch/Makefile 2015-08-29 16:10:18.798514193 -0500 @@ -1,9 +1,9 @@ ifeq ($(subdir),string) sysdep_routines += memcpy-power7 memcpy-a2 memcpy-power6 memcpy-cell \ memcpy-e6500 memcpy-ppc32 memcmp-power7 memcmp-e6500 \ - memcmp-ppc32 memset-power7 memset-power6 memset-ppc32 \ - bzero-power7 bzero-power6 bzero-ppc32 \ - mempcpy-power7 mempcpy-ppc32 memchr-power7 \ + memcmp-ppc32 memset-power7 memset-power6 memset-e6500 \ + memset-ppc32 bzero-power7 bzero-power6 bzero-e6500 \ + bzero-ppc32 mempcpy-power7 mempcpy-ppc32 memchr-power7 \ memchr-ppc32 memrchr-power7 memrchr-ppc32 rawmemchr-power7 \ rawmemchr-ppc32 strlen-power7 strlen-ppc32 strnlen-power7 \ strnlen-ppc32 strncmp-power7 strncmp-ppc32 \ diff -Naur glibc-2.20/sysdeps/powerpc/powerpc32/power4/multiarch/memset.c glibc-2.20-e6500-mset/sysdeps/powerpc/powerpc32/power4/multiarch/memset.c --- glibc-2.20/sysdeps/powerpc/powerpc32/power4/multiarch/memset.c 2015-08-29 15:45:05.845419682 -0500 +++ glibc-2.20-e6500-mset/sysdeps/powerpc/powerpc32/power4/multiarch/memset.c 2015-08-29 16:00:45.024478649 -0500 @@ -23,6 +23,7 @@ # include "init-arch.h" extern __typeof (memset) __memset_ppc attribute_hidden; +extern __typeof (memset) __memset_e6500 attribute_hidden; extern __typeof (memset) __memset_power6 attribute_hidden; extern __typeof (memset) __memset_power7 attribute_hidden; @@ -32,6 +33,9 @@ (hwcap & PPC_FEATURE_HAS_VSX) ? __memset_power7 : (hwcap & PPC_FEATURE_ARCH_2_05) - ? __memset_power6 + ? __memset_power6 : + (((hwcap & PPC_FEATURE_E6500) == PPC_FEATURE_E6500) + && (hwcap2 & PPC_FEATURE2_HAS_ISEL)) + ? 
__memset_e6500 : __memset_ppc); #endif diff -Naur glibc-2.20/sysdeps/powerpc/powerpc32/power4/multiarch/memset-e6500.S glibc-2.20-e6500-mset/sysdeps/powerpc/powerpc32/power4/multiarch/memset-e6500.S --- glibc-2.20/sysdeps/powerpc/powerpc32/power4/multiarch/memset-e6500.S 1969-12-31 18:00:00.000000000 -0600 +++ glibc-2.20-e6500-mset/sysdeps/powerpc/powerpc32/power4/multiarch/memset-e6500.S 2015-08-29 16:00:45.024478649 -0500 @@ -0,0 +1,38 @@ +/* Optimized memset implementation for PowerPC32/e6500. + Copyright (C) 2015 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
*/ + +#include <sysdep.h> + +#undef EALIGN +#define EALIGN(name, alignt, words) \ + .globl C_SYMBOL_NAME(__memset_e6500); \ + .type C_SYMBOL_NAME(__memset_e6500),@function; \ + .align ALIGNARG(alignt); \ + EALIGN_W_##words; \ + C_LABEL(__memset_e6500) \ + cfi_startproc; + +#undef END +#define END(name) \ + cfi_endproc; \ + ASM_SIZE_DIRECTIVE(__memset_e6500) + +#undef libc_hidden_builtin_def +#define libc_hidden_builtin_def(name) + +#include <sysdeps/powerpc/powerpc32/e6500/memset.S> diff -Naur glibc-2.20/sysdeps/powerpc/powerpc64/e6500/memset.S glibc-2.20-e6500-mset/sysdeps/powerpc/powerpc64/e6500/memset.S --- glibc-2.20/sysdeps/powerpc/powerpc64/e6500/memset.S 1969-12-31 18:00:00.000000000 -0600 +++ glibc-2.20-e6500-mset/sysdeps/powerpc/powerpc64/e6500/memset.S 2015-08-29 16:00:45.024478649 -0500 @@ -0,0 +1,262 @@ +/* Optimized memset implementation for e6500 64-bit PowerPC. + + Copyright (C) 2015 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#define rTMP r0 +#define rRTN r3 /* initial value of 1st argument. */ +#define rMEMP0 r3 /* original value of 1st arg. */ +#define rCHR r4 /* char to set in each byte. */ +#define rLEN r5 /* length of region to set. */ +#define rMEMP r6 /* address at which we are storing. */ +#define rALIGN r7 /* no. 
of bytes we are setting now (when aligning). */ +#define rPOS16 r7 /* constant +16. */ +#define rPOS32 r8 /* constant +32. */ +#define rPOS48 r9 /* constant +48. */ +#define rGOT r9 /* Address of the Global Offset Table. */ +#define rCLS r9 /* Cache line size obtained from static. */ +#define rCTR2 r7 +#define rCTR1 r11 +#define rTMP1 r12 +#define vCHR v14 /* char to set in each byte. */ +#define vTMP1 v15 +#define vTMP2 v16 + +#include <sysdep.h> + + .section ".toc", "aw" +.LC0: + .tc __cache_line_size[TC], __cache_line_size + .section ".text" + .align 2 + +/* __ptr_t [r3] memset (__ptr_t s [r3], int c [r4], size_t n [r5])); + Returns 's'. */ + +EALIGN (memset, 5, 0) + CALL_MCOUNT 3 +L(_memset): + cmpldi cr1, rLEN, 8 + cmpldi cr5, rLEN, 32 + mr rMEMP, rMEMP0 + ble cr1, L(small) + rlwimi rCHR, rCHR, 8, 16, 23 + rlwimi rCHR, rCHR, 16, 0, 15 + blt cr5, L(medium) + neg rTMP, rMEMP + andi. rTMP, rTMP, 15 + bne L(nalign16) +L(align16): + cmpldi 7, rLEN, 63 + rlwinm. rTMP1, rCHR, 28, 28, 3 + li rPOS16, 16 + ble 7, L(copy_remaining) + beq L(check_cache_line_size) +L(vec_nz): + srwi rCTR1, rLEN, 6 /* No of 64 byte copy count. */ + rlwinm rLEN, rLEN, 0, 26, 31 /* remaining bytes. */ + vxor vCHR, vCHR, vCHR + mtctr rCTR1 /* move count. */ + lvsl vCHR, 0, rTMP1 /* LSU Move upper nibble + to byte 0 of VR. */ + vspltisb vTMP1, 4 /* VPU Splat 0x4 to every byte. */ + lvsl vTMP2, 0, rCHR /* LSU Move lower nibble + to byte 0 of VR. */ + vslb vCHR, vCHR, vTMP1 /* VIU Move upper nibble to VR[0:3]. */ + vor vCHR, vCHR, vTMP2 /* VIU Form FILL byte in VR[0:7]. */ + vspltb vCHR, vCHR, 0 /* VPU Splat the fill + byte to all bytes. */ + li rPOS32, 32 + li rPOS48, 48 +L(vnz_loop): + stvx vCHR, 0, rMEMP + stvx vCHR, rPOS16, rMEMP + stvx vCHR, rPOS32, rMEMP + stvx vCHR, rPOS48, rMEMP + addi rMEMP, rMEMP, 64 + bdnz L(vnz_loop) +L(copy_remaining): + srwi. rCTR1, rLEN, 3 /* No of 8 byte copy count. */ + rlwinm rLEN, rLEN, 0, 29, 31 /* remaining bytes. 
*/ + cmpldi cr1, rLEN, 1 + bne 0, L(copy_words) +L(copy_bytes): + bltlr cr1 + cmpldi cr0, rLEN, 4 + beq cr1, 2f /* nb <= 1? (0, 1 bytes). */ + bgt cr0, 1f /* nb > 4? (5, 6, 7 bytes). */ + addi rTMP, rLEN, -2 /* 2, 3, 4 bytes. */ + sth rCHR, 0(rMEMP) + sthx rCHR, rMEMP, rTMP + blr +1: + addi rTMP, rLEN, -4 /* 5, 6, 7 bytes. */ + stw rCHR, 0(rMEMP) + stwx rCHR, rMEMP, rTMP + blr +2: stb rCHR, 0(rMEMP) + blr + +L(copy_words): + mtcrf 0x01, rCTR1 + insrdi rCHR, rCHR, 32, 0 /* Replicate word to double word. */ + bf cr7*4+1, 16f + std rCHR, 0(rMEMP) + std rCHR, 8(rMEMP) + std rCHR, 16(rMEMP) + std rCHR, 24(rMEMP) + addi rMEMP, rMEMP, 32 +16: + bf cr7*4+2, 8f + std rCHR, 0(rMEMP) + std rCHR, 8(rMEMP) + addi rMEMP, rMEMP, 16 +8: + bf cr7*4+3, L(copy_bytes) + std rCHR, 0(rMEMP) + addi rMEMP, rMEMP, 8 + b L(copy_bytes) + + .align 5 +L(check_cache_line_size): + ld rCLS, .LC0@toc(r2) + lwz rCLS, 0(rCLS) + cmpldi 5, rCLS, 64 + neg rTMP, rMEMP + bne 5, L(vec_nz) + andi. rTMP, rTMP, 63 + bne L(nalign64) +L(align64): + srwi rCTR1, rLEN, 6 + cmpldi 7, rCTR1, 32767 + rlwinm rLEN, rLEN, 0, 26, 31 + mtctr rCTR1 + bgt 7, L(vec_zbig) +L(vz_loop): + dcbzl 0, rMEMP + addi rMEMP, rMEMP, 64 + bdnz L(vz_loop) + b L(copy_remaining) + +L(vec_zbig): + addi rCTR2, rCTR1, -32767 + mtctr rCTR2 +L(vz_big_loop): + dcbzl 0, rMEMP + dcbf 0, rMEMP + addi rMEMP, rMEMP, 64 + bdnz L(vz_big_loop) + li rCTR1, 32767 + mtctr rCTR1 + b L(vz_loop) + +L(nalign64): + vxor vCHR, vCHR, vCHR + subf rLEN, rTMP, rLEN + li rPOS48, 48 + li rPOS32, 32 + stvx vCHR, 0, rMEMP + stvx vCHR, rPOS16, rMEMP + cmpldi 7, rLEN, 64 + stvx vCHR, rPOS32, rMEMP + stvx vCHR, rPOS48, rMEMP + add rMEMP, rMEMP, rTMP + blt 7, L(copy_remaining) + b L(align64) + +L(nalign16): + insrdi rCHR, rCHR, 32, 0 /* Replicate word to double word. */ + std rCHR, 0(rMEMP) + subf rLEN, rTMP, rLEN + std rCHR, 8(rMEMP) + add rMEMP, rMEMP, rTMP + b L(align16) + + /* Memset of 8 bytes or less. Taken from GLIBC default memset. 
*/ + .align 5 +L(small): + cmpldi cr6, rLEN, 4 + cmpldi cr5, rLEN, 1 + ble cr6, L(le4) + subi rLEN, rLEN, 4 + stb rCHR, 0(rMEMP) + stb rCHR, 1(rMEMP) + stb rCHR, 2(rMEMP) + stb rCHR, 3(rMEMP) + addi rMEMP, rMEMP, 4 + cmpldi cr5, rLEN, 1 +L(le4): + cmpldi cr1, rLEN, 3 + bltlr cr5 + stb rCHR, 0(rMEMP) + beqlr cr5 + stb rCHR, 1(rMEMP) + bltlr cr1 + stb rCHR, 2(rMEMP) + beqlr cr1 + stb rCHR, 3(rMEMP) + blr + + /* Memset of 0-31 bytes. Taken from GLIBC default memset. */ + .align 5 +L(medium): + mtcrf 0x01, rLEN + insrdi rCHR, rCHR, 32, 0 /* Replicate word to double word. */ + cmpldi cr1, rLEN, 16 +L(medium_tail2): + add rMEMP, rMEMP, rLEN +L(medium_tail): + bt 31, L(medium_31t) + bt 30, L(medium_30t) +L(medium_30f): + bt 29, L(medium_29t) +L(medium_29f): + bge cr1, L(medium_27t) + bflr 28 + std rCHR, -8(rMEMP) + blr +L(medium_31t): + stbu rCHR, -1(rMEMP) + bf 30, L(medium_30f) +L(medium_30t): + sthu rCHR, -2(rMEMP) + bf 29, L(medium_29f) +L(medium_29t): + stwu rCHR, -4(rMEMP) + blt cr1, L(medium_27f) +L(medium_27t): + std rCHR, -8(rMEMP) + stdu rCHR, -16(rMEMP) +L(medium_27f): + bflr 28 +L(medium_28t): + std rCHR, -8(rMEMP) + blr +END_GEN_TB (memset,TB_TOCLESS) +libc_hidden_builtin_def (memset) + +#ifndef NO_BZERO_IMPL +/* Copied from bzero.S to prevent the linker from inserting a stub + between bzero and memset. 
*/ +ENTRY (__bzero) + mr r5, r4 + li r4, 0 + b L(_memset) +END_GEN_TB (__bzero,TB_TOCLESS) + +weak_alias (__bzero, bzero) +#endif diff -Naur glibc-2.20/sysdeps/powerpc/powerpc64/multiarch/bzero.c glibc-2.20-e6500-mset/sysdeps/powerpc/powerpc64/multiarch/bzero.c --- glibc-2.20/sysdeps/powerpc/powerpc64/multiarch/bzero.c 2015-08-29 15:45:05.867419683 -0500 +++ glibc-2.20-e6500-mset/sysdeps/powerpc/powerpc64/multiarch/bzero.c 2015-08-29 16:00:45.024478649 -0500 @@ -23,6 +23,7 @@ # include "init-arch.h" extern __typeof (bzero) __bzero_ppc attribute_hidden; +extern __typeof (bzero) __bzero_e6500 attribute_hidden; extern __typeof (bzero) __bzero_power4 attribute_hidden; extern __typeof (bzero) __bzero_power6 attribute_hidden; extern __typeof (bzero) __bzero_power7 attribute_hidden; @@ -33,7 +34,10 @@ (hwcap & PPC_FEATURE_ARCH_2_05) ? __bzero_power6 : (hwcap & PPC_FEATURE_POWER4) - ? __bzero_power4 + ? __bzero_power4 : + (((hwcap & PPC_FEATURE_E6500) == PPC_FEATURE_E6500) + && (hwcap2 & PPC_FEATURE2_HAS_ISEL)) + ? __bzero_e6500 : __bzero_ppc); weak_alias (__bzero, bzero) diff -Naur glibc-2.20/sysdeps/powerpc/powerpc64/multiarch/bzero-e6500.S glibc-2.20-e6500-mset/sysdeps/powerpc/powerpc64/multiarch/bzero-e6500.S --- glibc-2.20/sysdeps/powerpc/powerpc64/multiarch/bzero-e6500.S 1969-12-31 18:00:00.000000000 -0600 +++ glibc-2.20-e6500-mset/sysdeps/powerpc/powerpc64/multiarch/bzero-e6500.S 2015-08-29 16:00:45.024478649 -0500 @@ -0,0 +1,26 @@ +/* Optimized bzero implementation for PowerPC64/e6500. + Copyright (C) 2015 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. 
+ + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> + +ENTRY (__bzero_e6500) + CALL_MCOUNT 3 + mr r5,r4 + li r4,0 + b __memset_e6500 +END_GEN_TB (__bzero_e6500,TB_TOCLESS) diff -Naur glibc-2.20/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c glibc-2.20-e6500-mset/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c --- glibc-2.20/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c 2015-08-29 15:45:37.045421842 -0500 +++ glibc-2.20-e6500-mset/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c 2015-08-29 16:00:45.024478649 -0500 @@ -80,6 +80,10 @@ __memset_power6) IFUNC_IMPL_ADD (array, i, memset, hwcap & PPC_FEATURE_POWER4, __memset_power4) + IFUNC_IMPL_ADD (array, i, memset, + (((hwcap & PPC_FEATURE_E6500) == PPC_FEATURE_E6500) + && (hwcap2 & PPC_FEATURE2_HAS_ISEL)), + __memset_e6500) IFUNC_IMPL_ADD (array, i, memset, 1, __memset_ppc)) /* Support sysdeps/powerpc/powerpc64/multiarch/strcpy.c. */ @@ -149,6 +153,10 @@ __bzero_power6) IFUNC_IMPL_ADD (array, i, bzero, hwcap & PPC_FEATURE_POWER4, __bzero_power4) + IFUNC_IMPL_ADD (array, i, bzero, + (((hwcap & PPC_FEATURE_E6500) == PPC_FEATURE_E6500) + && (hwcap2 & PPC_FEATURE2_HAS_ISEL)), + __bzero_e6500) IFUNC_IMPL_ADD (array, i, bzero, 1, __bzero_ppc)) /* Support sysdeps/powerpc/powerpc64/multiarch/bcopy.c. 
*/ diff -Naur glibc-2.20/sysdeps/powerpc/powerpc64/multiarch/Makefile glibc-2.20-e6500-mset/sysdeps/powerpc/powerpc64/multiarch/Makefile --- glibc-2.20/sysdeps/powerpc/powerpc64/multiarch/Makefile 2015-08-29 15:47:51.985430863 -0500 +++ glibc-2.20-e6500-mset/sysdeps/powerpc/powerpc64/multiarch/Makefile 2015-08-29 16:12:43.814523563 -0500 @@ -1,9 +1,9 @@ ifeq ($(subdir),string) sysdep_routines += memcpy-power7 memcpy-a2 memcpy-power6 memcpy-cell \ memcpy-power4 memcpy-e6500 memcpy-ppc64 memcmp-power7 \ - memcmp-power4 memcmp-e6500 memcmp-ppc64 \ - memset-power7 memset-power6 memset-power4 \ - memset-ppc64 bzero-power4 bzero-power6 bzero-power7 \ + memcmp-power4 memcmp-e6500 memcmp-ppc64 memset-power7 \ + memset-power6 memset-power4 memset-e6500 memset-ppc64 \ + bzero-power4 bzero-power6 bzero-power7 bzero-e6500 \ mempcpy-power7 mempcpy-ppc64 memchr-power7 memchr-ppc64 \ memrchr-power7 memrchr-ppc64 rawmemchr-power7 \ rawmemchr-ppc64 strlen-power7 strlen-ppc64 strnlen-power7 \ diff -Naur glibc-2.20/sysdeps/powerpc/powerpc64/multiarch/memset.c glibc-2.20-e6500-mset/sysdeps/powerpc/powerpc64/multiarch/memset.c --- glibc-2.20/sysdeps/powerpc/powerpc64/multiarch/memset.c 2015-08-29 15:45:05.866419683 -0500 +++ glibc-2.20-e6500-mset/sysdeps/powerpc/powerpc64/multiarch/memset.c 2015-08-29 16:00:45.025478633 -0500 @@ -29,6 +29,7 @@ extern __typeof (__redirect_memset) __libc_memset; extern __typeof (__redirect_memset) __memset_ppc attribute_hidden; +extern __typeof (__redirect_memset) __memset_e6500 attribute_hidden; extern __typeof (__redirect_memset) __memset_power4 attribute_hidden; extern __typeof (__redirect_memset) __memset_power6 attribute_hidden; extern __typeof (__redirect_memset) __memset_power7 attribute_hidden; @@ -41,7 +42,10 @@ (hwcap & PPC_FEATURE_ARCH_2_05) ? __memset_power6 : (hwcap & PPC_FEATURE_POWER4) - ? __memset_power4 + ? __memset_power4 : + (((hwcap & PPC_FEATURE_E6500) == PPC_FEATURE_E6500) + && (hwcap2 & PPC_FEATURE2_HAS_ISEL)) + ? 
__memset_e6500 : __memset_ppc); #undef memset diff -Naur glibc-2.20/sysdeps/powerpc/powerpc64/multiarch/memset-e6500.S glibc-2.20-e6500-mset/sysdeps/powerpc/powerpc64/multiarch/memset-e6500.S --- glibc-2.20/sysdeps/powerpc/powerpc64/multiarch/memset-e6500.S 1969-12-31 18:00:00.000000000 -0600 +++ glibc-2.20-e6500-mset/sysdeps/powerpc/powerpc64/multiarch/memset-e6500.S 2015-08-29 16:00:45.025478633 -0500 @@ -0,0 +1,41 @@ +/* Optimized memset implementation for PowerPC64/e6500. + Copyright (C) 2015 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> + +#undef EALIGN +#define EALIGN(name, alignt, words) \ + .section ".text"; \ + ENTRY_2(__memset_e6500) \ + .align ALIGNARG(alignt); \ + EALIGN_W_##words; \ + BODY_LABEL(__memset_e6500): \ + cfi_startproc; \ + LOCALENTRY(__memset_e6500) + +#undef END_GEN_TB +#define END_GEN_TB(name, mask) \ + cfi_endproc; \ + TRACEBACK_MASK(__memset_e6500,mask) \ + END_2(__memset_e6500) + +#undef libc_hidden_builtin_def +#define libc_hidden_builtin_def(name) + +#define NO_BZERO_IMPL +#include <sysdeps/powerpc/powerpc64/e6500/memset.S> The patch was generated on top of glibc v2.20 source base. The patch was tested with dejaGNU and glibc testsuite. There were no regressions. 
The benchsuite (both 32-bit and 64-bit) results are attached for your reference. Please let me know your comments. Regards, Rohit
Attachment:
benchtest-e6500-64bit-memset.txt
Description: benchtest-e6500-64bit-memset.txt
Attachment:
benchtest-e6500-32bit-memset.txt
Description: benchtest-e6500-32bit-memset.txt
Index Nav: | [Date Index] [Subject Index] [Author Index] [Thread Index] | |
---|---|---|
Message Nav: | [Date Prev] [Date Next] | [Thread Prev] [Thread Next] |