This is the mail archive of the libc-alpha@sourceware.org mailing list for the glibc project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

[RFC][PATCH] PowerPC - e6500 optimized memcpy function


Hello All,

Please find below a patch with an optimized implementation of 'memcpy' for the PowerPC e6500 (32-bit & 64-bit) target, using AltiVec instructions.

2015-08-31  Rohit Arul Raj  <rohitarulraj@freescale.com>

                * sysdeps/powerpc/powerpc32/e6500/memcpy.S: New File: optimized memcpy
                implementation using altivec instructions.
                * sysdeps/powerpc/powerpc32/power4/multiarch/ifunc-impl-list.c
                (__libc_ifunc_impl_list): Add check for e6500 memcpy function.
                * sysdeps/powerpc/powerpc32/power4/multiarch/Makefile: Add
                memcpy-e6500 object.
                * sysdeps/powerpc/powerpc32/power4/multiarch/memcpy.c: Add
                check for e6500 memcpy function.
                * sysdeps/powerpc/powerpc32/power4/multiarch/memcpy-e6500.S: New File:
                multiarch e6500 memcpy.
                * sysdeps/powerpc/powerpc64/e6500/memcpy.S: New File: optimized memcpy
                implementation using altivec instructions.
                * sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
                (__libc_ifunc_impl_list): Add check for e6500 memcpy function.
                * sysdeps/powerpc/powerpc64/multiarch/Makefile: Add
                memcpy-e6500 object.
                * sysdeps/powerpc/powerpc64/multiarch/memcpy.c: Add
                check for e6500 memcpy function.
                * sysdeps/powerpc/powerpc64/multiarch/memcpy-e6500.S: New File:
                multiarch e6500 memcpy.

diff -Naur glibc-2.20/sysdeps/powerpc/powerpc32/e6500/memcpy.S glibc-2.20-e6500-mcpy/sysdeps/powerpc/powerpc32/e6500/memcpy.S
--- glibc-2.20/sysdeps/powerpc/powerpc32/e6500/memcpy.S   1969-12-31 18:00:00.000000000 -0600
+++ glibc-2.20-e6500-mcpy/sysdeps/powerpc/powerpc32/e6500/memcpy.S     2015-08-29 15:45:37.044421872 -0500
@@ -0,0 +1,212 @@
+/* Optimized memcpy implementation for e6500 32-bit PowerPC.
+   Copyright (C) 2015 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
+
+#include <sysdep.h>
+
+/* __ptr_t [r3] memcpy (__ptr_t dst [r3], __ptr_t src [r4], size_t len [r5]);
+   Returns 'dst'.
+
+             r3 = destination
+             r4 = source
+             r5 = byte count
+
+             volatile fixed point registers usable:
+             r0, r3-r12
+
+             volatile floating point registers usable:
+             f0-f13.  */
+
+EALIGN (memcpy, 5, 0)
+             cmplw   cr0, r4, r3                             /* if source==destination, return.  */
+             beqlr     cr0
+             /* if number of bytes is less than 16, (optimal value TBD),
+                but greater than zero copy byte-by-byte.  */
+             cmplwi  r5, 16
+             mr          r6, r3
+             blt           L(copy_remaining)
+             neg        r0, r3
+             andi.      r11, r0, 15
+             beq        L(dst_align16)
+             lwz         r0, 0(r4)
+             lwz         r7, 4(r4)
+             lwz         r8, 8(r4)
+             lwz         r9, 12(r4)
+             subf       r5, r11, r5
+             add        r4, r4, r11
+             stw         r0, 0(r6)
+             stw         r7, 4(r6)
+             stw         r8, 8(r6)
+             stw         r9, 12(r6)
+             add        r6, r6, r11
+L(dst_align16):
+             cmplwi  7, r5, 63
+             ble          7, L(copy_remaining)
+             srwi        r11, r5, 6                               /* No of 64 byte copy count.  */
+             rlwinm  r5, r5, 0, 26, 31   /* remaining bytes.  */
+             rlwinm. r0, r4, 0, 28, 31
+             mtctr     r11
+             li              r7, 16
+             li              r8, 32
+             li              r9, 48
+             bne        0, L(src_naligned)
+L(copy_salign16):
+             lvx          v14, 0, r4                              /* copy 64 bytes.  */
+             lvx          v15, r7, r4
+             lvx          v16, r8, r4
+             lvx          v17, r9, r4
+             addi       r4, r4, 64
+             stvx        v14, 0, r6
+             stvx        v15, r7, r6
+             stvx        v16, r8, r6
+             stvx        v17, r9, r6
+             addi       r6, r6, 64
+             bdnz      L(copy_salign16)
+L(copy_remaining):
+             srwi.      r11, r5, 3                               /* No of 8 byte copy count.  */
+             rlwinm  r5, r5, 0, 29, 31   /* remaining bytes.  */
+             beq        0, L(copy_bytes)
+             mtcrf     0x01, r11
+             bf            cr7*4+1, L(cp16b)
+
+             lwz         r0, 0(r4)                                /* copy 32 bytes */
+             lwz         r7, 4(r4)
+             lwz         r8, 8(r4)
+             lwz         r9, 12(r4)
+
+             stw         r0, 0(r6)
+             stw         r7, 4(r6)
+             stw         r8, 8(r6)
+             stw         r9, 12(r6)
+
+             lwz         r0, 16(r4)
+             lwz         r7, 20(r4)
+             lwz         r8, 24(r4)
+             lwz         r9, 28(r4)
+             addi       r4, r4, 32
+
+             stw         r0, 16(r6)
+             stw         r7, 20(r6)
+             stw         r8, 24(r6)
+             stw         r9, 28(r6)
+             addi r6, r6, 32
+L(cp16b):
+             bf            cr7*4+2, L(cp8b)
+             lwz         r0, 0(r4)                                /* copy 16 bytes */
+             lwz         r7, 4(r4)
+             lwz         r8, 8(r4)
+             lwz         r9, 12(r4)
+
+             addi       r4, r4, 16
+
+             stw         r0, 0(r6)
+             stw         r7, 4(r6)
+             stw         r8, 8(r6)
+             stw         r9, 12(r6)
+             addi       r6, r6, 16
+L(cp8b):
+             bf            cr7*4+3, L(copy_bytes)
+             lwz         r0, 0(r4)                                /* copy 8 bytes */
+             lwz         r7, 4(r4)
+             addi       r4, r4, 8
+
+             stw         r0, 0(r6)
+             stw         r7, 4(r6)
+             addi       r6, r6, 8
+L(copy_bytes):
+             cmplwi  cr1, r5, 4
+             cmplwi  cr0, r5, 1
+             bgt         cr1, L(gt4b)                         /* nb > 4?  (5, 6, 7 bytes).  */
+             ble          cr0, L(lt1b)                           /* nb <= 1? (0, 1 bytes).  */
+             addi       r0, r5, -2                               /* 2, 3, 4 bytes.  */
+             lhz          r9, 0(r4)
+             lhzx        r11, r4, r0
+             sth          r9, 0(r6)
+             sthx       r11, r6, r0
+             blr
+L(gt4b):
+             addi       r0, r5, -4                               /* 5, 6, 7 bytes.  */
+             lwz         r9, 0(r4)
+             lwzx       r11, r4, r0
+             stw         r9, 0(r6)
+             stwx      r11, r6, r0
+             blr
+L(lt1b):
+             mtocrf  0x1, r5                                   /* nb == 0 ? return.  */
+             bflr         31
+             lbz          r0, 0(r4)                                /* nb == 1.  */
+             stb          r0, 0(r6)
+             blr
+
+L(src_naligned):
+#ifndef _SOFT_FLOAT
+             rlwinm. r0, r4, 0, 29, 31
+             beq        0, L(copy_salign8)
+#endif
+L(copy_snalign):                                            /* copy 64 bytes.  */
+             lvx          v0, 0, r4                /* load MSQ.  */
+             lvsl          v18, 0, r4                              /* set permute control vector.  */
+             lvx          v19, r7, r4                            /* load LSQ.  */
+             vperm   v14, v0, v19, v18               /* align the data.  */
+             lvx          v0, r7, r4                               /* load MSQ.  */
+             lvsl          v18, r7, r4                            /* set permute control vector.  */
+             lvx          v19, r8, r4                            /* load LSQ.  */
+             vperm   v15, v0, v19, v18               /* align the data.  */
+             lvx          v0, r8, r4                               /* load MSQ.  */
+             lvsl          v18, r8, r4                            /* set permute control vector.  */
+             lvx          v19, r9, r4                            /* load LSQ.  */
+             vperm   v16, v0, v19, v18               /* align the data.  */
+             lvx          v0, r9, r4                               /* load MSQ.  */
+             lvsl          v18, r9, r4                            /* set permute control vector.  */
+             addi       r4, r4, 64
+             lvx          v19, 0, r4                              /* load LSQ.  */
+             vperm   v17, v0, v19, v18               /* align the data.  */
+             stvx        v14, 0, r6
+             stvx        v15, r7, r6
+             stvx        v16, r8, r6
+             stvx        v17, r9, r6
+             addi       r6, r6, 64
+             bdnz      L(copy_snalign)
+             b             L(copy_remaining)
+
+#ifndef _SOFT_FLOAT
+L(copy_salign8):
+             lfd           0, 0(r4)                  /* copy 64 bytes.  */
+             lfd           1, 8(r4)
+             lfd           2, 16(r4)
+             lfd           3, 24(r4)
+             stfd        0, 0(r6)
+             stfd        1, 8(r6)
+             stfd        2, 16(r6)
+             stfd        3, 24(r6)
+             lfd           0, 32(r4)
+             lfd           1, 40(r4)
+             lfd           2, 48(r4)
+             lfd           3, 56(r4)
+             addi       r4, r4, 64
+             stfd        0, 32(r6)
+             stfd        1, 40(r6)
+             stfd        2, 48(r6)
+             stfd        3, 56(r6)
+             addi       r6, r6, 64
+             bdnz      L(copy_salign8)
+             b             L(copy_remaining)
+#endif
+
+END (memcpy)
+libc_hidden_builtin_def (memcpy)
diff -Naur glibc-2.20/sysdeps/powerpc/powerpc32/power4/multiarch/ifunc-impl-list.c glibc-2.20-e6500-mcpy/sysdeps/powerpc/powerpc32/power4/multiarch/ifunc-impl-list.c
--- glibc-2.20/sysdeps/powerpc/powerpc32/power4/multiarch/ifunc-impl-list.c 2015-08-29 15:42:09.769408236 -0500
+++ glibc-2.20-e6500-mcpy/sysdeps/powerpc/powerpc32/power4/multiarch/ifunc-impl-list.c   2015-08-29 15:45:37.044421872 -0500
@@ -58,6 +58,10 @@
                                                     __memcpy_power6)
                     IFUNC_IMPL_ADD (array, i, memcpy, hwcap & PPC_FEATURE_CELL_BE,
                                                     __memcpy_cell)
+                   IFUNC_IMPL_ADD (array, i, memcpy,
+                                                   (((hwcap & PPC_FEATURE_E6500) == PPC_FEATURE_E6500)
+                                                   && (hwcap2 & PPC_FEATURE2_HAS_ISEL)),
+                                                   __memcpy_e6500)
                     IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_ppc))

   /* Support sysdeps/powerpc/powerpc32/power4/multiarch/memmove.c.  */
diff -Naur glibc-2.20/sysdeps/powerpc/powerpc32/power4/multiarch/Makefile glibc-2.20-e6500-mcpy/sysdeps/powerpc/powerpc32/power4/multiarch/Makefile
--- glibc-2.20/sysdeps/powerpc/powerpc32/power4/multiarch/Makefile             2015-08-29 15:42:09.769408236 -0500
+++ glibc-2.20-e6500-mcpy/sysdeps/powerpc/powerpc32/power4/multiarch/Makefile                2015-08-29 15:46:34.217426773 -0500
@@ -1,7 +1,7 @@
ifeq ($(subdir),string)
sysdep_routines += memcpy-power7 memcpy-a2 memcpy-power6 memcpy-cell \
-                                 memcpy-ppc32 memcmp-power7 memcmp-e6500 memcmp-ppc32 \
-                                 memset-power7 memset-power6 memset-ppc32 \
+                                memcpy-e6500 memcpy-ppc32 memcmp-power7 memcmp-e6500 \
+                                memcmp-ppc32 memset-power7 memset-power6 memset-ppc32 \
                                  bzero-power7 bzero-power6 bzero-ppc32 \
                                  mempcpy-power7 mempcpy-ppc32 memchr-power7 \
                                  memchr-ppc32 memrchr-power7 memrchr-ppc32 rawmemchr-power7 \
diff -Naur glibc-2.20/sysdeps/powerpc/powerpc32/power4/multiarch/memcpy.c glibc-2.20-e6500-mcpy/sysdeps/powerpc/powerpc32/power4/multiarch/memcpy.c
--- glibc-2.20/sysdeps/powerpc/powerpc32/power4/multiarch/memcpy.c          2015-08-29 15:41:52.333407557 -0500
+++ glibc-2.20-e6500-mcpy/sysdeps/powerpc/powerpc32/power4/multiarch/memcpy.c             2015-08-29 15:45:37.044421872 -0500
@@ -25,6 +25,7 @@
# include "init-arch.h"

 extern __typeof (memcpy) __memcpy_ppc attribute_hidden;
+extern __typeof (memcpy) __memcpy_e6500 attribute_hidden;
extern __typeof (memcpy) __memcpy_cell attribute_hidden;
extern __typeof (memcpy) __memcpy_power6 attribute_hidden;
extern __typeof (memcpy) __memcpy_a2 attribute_hidden;
@@ -40,6 +41,9 @@
                               (hwcap & PPC_FEATURE_ARCH_2_05)
                               ? __memcpy_power6 :
                                 (hwcap & PPC_FEATURE_CELL_BE)
-                                ? __memcpy_cell
+                               ? __memcpy_cell :
+                                 (((hwcap & PPC_FEATURE_E6500) == PPC_FEATURE_E6500)
+                                 && (hwcap2 & PPC_FEATURE2_HAS_ISEL))
+                                 ? __memcpy_e6500
             : __memcpy_ppc);
#endif
diff -Naur glibc-2.20/sysdeps/powerpc/powerpc32/power4/multiarch/memcpy-e6500.S glibc-2.20-e6500-mcpy/sysdeps/powerpc/powerpc32/power4/multiarch/memcpy-e6500.S
--- glibc-2.20/sysdeps/powerpc/powerpc32/power4/multiarch/memcpy-e6500.S            1969-12-31 18:00:00.000000000 -0600
+++ glibc-2.20-e6500-mcpy/sysdeps/powerpc/powerpc32/power4/multiarch/memcpy-e6500.S               2015-08-29 15:45:37.045421842 -0500
@@ -0,0 +1,38 @@
+/* Optimized memcpy implementation for PowerPC32/e6500.
+   Copyright (C) 2015 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+#undef EALIGN
+#define EALIGN(name, alignt, words)                                                  \
+ .globl C_SYMBOL_NAME(__memcpy_e6500);                                                                \
+ .type C_SYMBOL_NAME(__memcpy_e6500),@function;                          \
+ .align ALIGNARG(alignt);                                                                           \
+ EALIGN_W_##words;                                                                                                \
+ C_LABEL(__memcpy_e6500)                                                                  \
+ cfi_startproc;
+
+#undef END
+#define END(name)                                                                                     \
+ cfi_endproc;                                                                                                   \
+ ASM_SIZE_DIRECTIVE(__memcpy_e6500)
+
+#undef libc_hidden_builtin_def
+#define libc_hidden_builtin_def(name)
+
+#include <sysdeps/powerpc/powerpc32/e6500/memcpy.S>
diff -Naur glibc-2.20/sysdeps/powerpc/powerpc64/e6500/memcpy.S glibc-2.20-e6500-mcpy/sysdeps/powerpc/powerpc64/e6500/memcpy.S
--- glibc-2.20/sysdeps/powerpc/powerpc64/e6500/memcpy.S   1969-12-31 18:00:00.000000000 -0600
+++ glibc-2.20-e6500-mcpy/sysdeps/powerpc/powerpc64/e6500/memcpy.S     2015-08-29 15:45:37.045421842 -0500
@@ -0,0 +1,184 @@
+/* Optimized memcpy implementation for e6500 64-bit PowerPC.
+   Copyright (C) 2015 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
+
+#include <sysdep.h>
+
+/* __ptr_t [r3] memcpy (__ptr_t dst [r3], __ptr_t src [r4], size_t len [r5]);
+   Returns 'dst'.
+
+             r3 = destination
+             r4 = source
+             r5 = byte count
+
+             volatile fixed point registers usable:
+             r0, r3-r12
+
+             volatile floating point registers usable:
+             f0-f13.  */
+
+EALIGN (memcpy, 5, 0)
+             CALL_MCOUNT 3
+             cmpld    cr0, r4, r3                             /* if source==destination, return.  */
+             beqlr     cr0
+             /* if number of bytes is less than 16 but greater than zero,
+                copy byte-by-byte.  */
+             cmpldi   r5, 16
+             mr          r6, r3
+             ble          L(copy_remaining)
+             neg        r0, r3
+             andi.      r11, r0, 15
+             beq        L(dst_align)
+             ld            r12, 0(r4)
+             ld            r0, 8(r4)
+             subf       r5, r11, r5
+             add        r4, r4, r11
+             std          r12, 0(r6)
+             std          r0, 8(r6)
+             add        r6, r6, r11
+L(dst_align):
+             cmpldi   7, r5, 63
+             ble          7, L(copy_remaining)
+             srwi        r11, r5, 6                               /* No of 64 byte copy count.  */
+             rlwinm  r5, r5, 0, 26, 31   /* remaining bytes.  */
+             rlwinm. r0, r4, 0, 28, 31
+             mtctr     r11
+             li              r7, 16
+             li              r8, 32
+             li              r9, 48
+             bne        0, L(src_naligned)
+L(copy_salign):
+             lvx          v14, 0, r4
+             lvx          v15, r7, r4
+             lvx          v16, r8, r4
+             lvx          v17, r9, r4
+             addi       r4, r4, 64
+             stvx        v14, 0, r6
+             stvx        v15, r7, r6
+             stvx        v16, r8, r6
+             stvx        v17, r9, r6
+             addi       r6, r6, 64
+             bdnz      L(copy_salign)
+L(copy_remaining):
+             srwi.      r11, r5, 3                               /* No of 8 byte copy count.  */
+             rlwinm  r5, r5, 0, 29, 31   /* remaining bytes.  */
+             beq        0, L(copy_bytes)
+             mtcrf     0x01, r11
+             bf            cr7*4+1, L(cp16b)
+             ld            r0, 0(r4)                                /* copy 32 bytes.  */
+             ld            r7, 8(r4)
+             ld            r8, 16(r4)
+             ld            r9, 24(r4)
+             addi       r4, r4, 32
+             std          r0, 0(r6)
+             std          r7, 8(r6)
+             std          r8, 16(r6)
+             std          r9, 24(r6)
+             addi       r6, r6, 32
+L(cp16b):
+             bf            cr7*4+2, L(cp8b)
+             ld            r7, 0(r4)                                /* copy 16 bytes.  */
+             ld            r8, 8(r4)
+             addi       r4, r4, 16
+             std          r7, 0(r6)
+             std          r8, 8(r6)
+             addi       r6, r6, 16
+L(cp8b):
+             bf            cr7*4+3, L(copy_bytes)
+             ld            r7, 0(r4)                                /* copy 8 bytes.  */
+             addi       r4, r4, 8
+             std          r7, 0(r6)
+             addi       r6, r6, 8
+L(copy_bytes):
+             cmpldi   cr1, r5, 4
+             cmpldi   cr0, r5, 1
+             bgt         cr1, L(gt4b)                         /* nb > 4?  (5, 6, 7 bytes).  */
+             ble          cr0, L(lt1b)                           /* nb <= 1? (0, 1 bytes).  */
+             addi       r0, r5, -2                               /* 2, 3, 4 bytes.  */
+             lhz          r9, 0(r4)
+             lhzx        r11, r4, r0
+             sth          r9, 0(r6)
+             sthx       r11, r6, r0
+             blr
+L(gt4b):
+             addi       r0, r5, -4                               /* 5, 6, 7 bytes.  */
+             lwz         r9, 0(r4)
+             lwzx       r11, r4, r0
+             stw         r9, 0(r6)
+             stwx      r11, r6, r0
+             blr
+L(lt1b):
+             mtocrf  0x1, r5                                   /* nb == 0 ? return.  */
+             bflr         31
+             lbz          r0, 0(r4)                                /* nb == 1.  */
+             stb          r0, 0(r6)
+             blr
+
+L(src_naligned):
+             rlwinm. r0, r4, 0, 29, 31
+             beq        0, L(copy_salign8)
+L(copy_snalign):
+             lvx          v0, 0, r4                /* load MSQ.  */
+             lvsl          v18, 0, r4                              /* set permute control vector.  */
+             lvx          v19, r7, r4                            /* load LSQ.  */
+             vperm   v14, v0, v19, v18               /* align the data.  */
+             lvx          v0, r7, r4                               /* load MSQ.  */
+             lvsl          v18, r7, r4                            /* set permute control vector.  */
+             lvx          v19, r8, r4                            /* load LSQ.  */
+             vperm   v15, v0, v19, v18               /* align the data.  */
+             lvx          v0, r8, r4                               /* load MSQ.  */
+             lvsl          v18, r8, r4                            /* set permute control vector.  */
+             lvx          v19, r9, r4                            /* load LSQ.  */
+             vperm   v16, v0, v19, v18               /* align the data.  */
+             lvx          v0, r9, r4                               /* load MSQ.  */
+             lvsl          v18, r9, r4                            /* set permute control vector.  */
+             addi       r4, r4, 64
+             lvx          v19, 0, r4                              /* load LSQ.  */
+             vperm   v17, v0, v19, v18               /* align the data.  */
+             stvx        v14, 0, r6
+             stvx        v15, r7, r6
+             stvx        v16, r8, r6
+             stvx        v17, r9, r6
+             addi       r6, r6, 64
+             bdnz      L(copy_snalign)
+             b             L(copy_remaining)
+
+L(copy_salign8):
+             ld            r0, 0(r4)
+             ld            r7, 8(r4)
+             ld            r8, 16(r4)
+             ld            r9, 24(r4)
+             std          r0, 0(r6)
+             std          r7, 8(r6)
+             std          r8, 16(r6)
+             std          r9, 24(r6)
+             ld            r0, 32(r4)
+             ld            r7, 40(r4)
+             ld            r8, 48(r4)
+             ld            r9, 56(r4)
+             addi       r4, r4, 64
+             std          r0, 32(r6)
+             std          r7, 40(r6)
+             std          r8, 48(r6)
+             std          r9, 56(r6)
+             addi       r6, r6, 64
+             bdnz      L(copy_salign8)
+             b             L(copy_remaining)
+
+END_GEN_TB (memcpy,TB_TOCLESS)
+libc_hidden_builtin_def (memcpy)
diff -Naur glibc-2.20/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c glibc-2.20-e6500-mcpy/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
--- glibc-2.20/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c  2015-08-29 15:42:09.771408290 -0500
+++ glibc-2.20-e6500-mcpy/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c     2015-08-29 15:45:37.045421842 -0500
@@ -60,6 +60,10 @@
                                                     __memcpy_cell)
                     IFUNC_IMPL_ADD (array, i, memcpy, hwcap & PPC_FEATURE_POWER4,
                                                     __memcpy_power4)
+                   IFUNC_IMPL_ADD (array, i, memcpy,
+                                                   (((hwcap & PPC_FEATURE_E6500) == PPC_FEATURE_E6500)
+                                                   && (hwcap2 & PPC_FEATURE2_HAS_ISEL)),
+                                                   __memcpy_e6500)
                     IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_ppc))

   /* Support sysdeps/powerpc/powerpc64/multiarch/memmove.c.  */
diff -Naur glibc-2.20/sysdeps/powerpc/powerpc64/multiarch/Makefile glibc-2.20-e6500-mcpy/sysdeps/powerpc/powerpc64/multiarch/Makefile
--- glibc-2.20/sysdeps/powerpc/powerpc64/multiarch/Makefile               2015-08-29 15:42:09.771408290 -0500
+++ glibc-2.20-e6500-mcpy/sysdeps/powerpc/powerpc64/multiarch/Makefile  2015-08-29 15:47:51.985430863 -0500
@@ -1,7 +1,7 @@
ifeq ($(subdir),string)
sysdep_routines += memcpy-power7 memcpy-a2 memcpy-power6 memcpy-cell \
-                                 memcpy-power4 memcpy-ppc64 memcmp-power7 memcmp-power4 \
-                                 memcmp-e6500 memcmp-ppc64 \
+                                memcpy-power4 memcpy-e6500 memcpy-ppc64 memcmp-power7 \
+                                memcmp-power4 memcmp-e6500 memcmp-ppc64 \
                                  memset-power7 memset-power6 memset-power4 \
                                  memset-ppc64 bzero-power4 bzero-power6 bzero-power7 \
                                  mempcpy-power7 mempcpy-ppc64 memchr-power7 memchr-ppc64 \
diff -Naur glibc-2.20/sysdeps/powerpc/powerpc64/multiarch/memcpy.c glibc-2.20-e6500-mcpy/sysdeps/powerpc/powerpc64/multiarch/memcpy.c
--- glibc-2.20/sysdeps/powerpc/powerpc64/multiarch/memcpy.c            2015-08-29 15:41:52.354407558 -0500
+++ glibc-2.20-e6500-mcpy/sysdeps/powerpc/powerpc64/multiarch/memcpy.c               2015-08-29 15:45:37.045421842 -0500
@@ -30,6 +30,7 @@
extern __typeof (__redirect_memcpy) __libc_memcpy;

 extern __typeof (__redirect_memcpy) __memcpy_ppc attribute_hidden;
+extern __typeof (__redirect_memcpy) __memcpy_e6500 attribute_hidden;
extern __typeof (__redirect_memcpy) __memcpy_power4 attribute_hidden;
extern __typeof (__redirect_memcpy) __memcpy_cell attribute_hidden;
extern __typeof (__redirect_memcpy) __memcpy_power6 attribute_hidden;
@@ -46,7 +47,10 @@
                                 (hwcap & PPC_FEATURE_CELL_BE)
                                 ? __memcpy_cell :
                                   (hwcap & PPC_FEATURE_POWER4)
-                                  ? __memcpy_power4
+                                 ? __memcpy_power4 :
+                                   (((hwcap & PPC_FEATURE_E6500) == PPC_FEATURE_E6500)
+                                   && (hwcap2 & PPC_FEATURE2_HAS_ISEL))
+                                   ? __memcpy_e6500
             : __memcpy_ppc);

 #undef memcpy
diff -Naur glibc-2.20/sysdeps/powerpc/powerpc64/multiarch/memcpy-e6500.S glibc-2.20-e6500-mcpy/sysdeps/powerpc/powerpc64/multiarch/memcpy-e6500.S
--- glibc-2.20/sysdeps/powerpc/powerpc64/multiarch/memcpy-e6500.S              1969-12-31 18:00:00.000000000 -0600
+++ glibc-2.20-e6500-mcpy/sysdeps/powerpc/powerpc64/multiarch/memcpy-e6500.S 2015-08-29 15:45:37.045421842 -0500
@@ -0,0 +1,40 @@
+/* Optimized memcpy implementation for PowerPC64/e6500.
+   Copyright (C) 2015 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+#undef EALIGN
+#define EALIGN(name, alignt, words)                                                  \
+  .section ".text";                                                                                           \
+  ENTRY_2(__memcpy_e6500)                                                                \
+  .align ALIGNARG(alignt);                                                                          \
+  EALIGN_W_##words;                                                                                               \
+  BODY_LABEL(__memcpy_e6500):                                                                       \
+  cfi_startproc;                                                                                \
+  LOCALENTRY(__memcpy_e6500)
+
+#undef END_GEN_TB
+#define END_GEN_TB(name, mask)                                                                    \
+  cfi_endproc;                                                                                                  \
+  TRACEBACK_MASK(__memcpy_e6500,mask)                                                               \
+  END_2(__memcpy_e6500)
+
+#undef libc_hidden_builtin_def
+#define libc_hidden_builtin_def(name)
+
+#include <sysdeps/powerpc/powerpc64/e6500/memcpy.S>

The patch was generated on top of glibc v2.20 source base.
The patch was tested with DejaGnu and the glibc testsuite. There were no regressions.

The benchmark suite results (both 32-bit and 64-bit) are attached for your reference.

Please let me know your comments.

Regards,
Rohit

Attachment: benchtest-e6500-64bit-memcpy.txt
Description: benchtest-e6500-64bit-memcpy.txt

Attachment: benchtest-e6500-32bit-memcpy.txt
Description: benchtest-e6500-32bit-memcpy.txt


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]