This is the mail archive of the libc-alpha@sourceware.org mailing list for the glibc project.
Re: [PATCH] powerpc: P9 vector load instruction change in memcpy and memmove
- From: Adhemerval Zanella <adhemerval dot zanella at linaro dot org>
- To: libc-alpha at sourceware dot org
- Date: Thu, 19 Oct 2017 13:52:33 -0200
- Subject: Re: [PATCH] powerpc: P9 vector load instruction change in memcpy and memmove
- Authentication-results: sourceware.org; auth=none
- References: <20171019152531.12064-1-tuliom@linux.vnet.ibm.com>
On 19/10/2017 13:25, Tulio Magno Quites Machado Filho wrote:
> From: Rajalakshmi Srinivasaraghavan <raji@linux.vnet.ibm.com>
>
> POWER9 DD2.1 and earlier have an issue where some cache-inhibited
> vector loads trap to the kernel. To handle this in memcpy
> and memmove, lvx/stvx is used for aligned addresses instead
> of lxvd2x/stxvd2x. The rest of the optimization remains the
> same as the existing POWER7 code.
>
> Reference: https://patchwork.ozlabs.org/patch/814059/
> Tested on powerpc64le.
According to the "POWER8 Processor User’s Manual for the Single-Chip Module"
(it is buried behind a sign-in wall at [1]), both lxvd2x/lvx and stxvd2x/stvx
use the same pipeline and have the same latency and throughput. The only
difference is that lxvd2x/stxvd2x have microcode handling for the unaligned
case and for 4K crossings or 32-byte-crossing L1 misses (which should not
occur with aligned addresses).
Why not change the POWER7 implementation instead of adding another one that
is exactly the same for POWER9?
[1] https://www-355.ibm.com/systems/power/openpower/tgcmDocumentRepository.xhtml?aliasId=POWER8
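For reference, the change boils down to a single opcode substitution in the
copy loops; the surrounding loop structure is unchanged. A minimal sketch of
one 16-byte step of the aligned copy path (register numbers here are
illustrative, not the ones the actual implementations use):

    /* POWER7 variant: VSX copy step; unaligned and line-crossing cases
       are handled by microcode.  */
    lxvd2x  6,0,4        /* load 16 bytes from (r4) into VSR 6  */
    stxvd2x 6,0,11       /* store 16 bytes to (r11)             */

    /* POWER9 variant (this patch): VMX copy step; lvx/stvx ignore the low
       4 address bits, so they only ever perform aligned accesses.  */
    lvx     6,0,4        /* load 16 bytes from (r4) into VR 6   */
    stvx    6,0,11       /* store 16 bytes to (r11)             */

Both pairs move 16 bytes per instruction; for addresses that are already
16-byte aligned the lvx/stvx form never enters the microcoded unaligned
path, which is how the patch sidesteps the cache-inhibited vector load trap
on POWER9 DD2.1.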
>
> 2017-10-19 Rajalakshmi Srinivasaraghavan <raji@linux.vnet.ibm.com>
>
> * sysdeps/powerpc/powerpc64/multiarch/Makefile
> (sysdep_routines): Add memcpy_power9 and memmove_power9.
> * sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
> (memcpy): Add __memcpy_power9 to list of memcpy functions.
> (memmove): Add __memmove_power9 to list of memmove functions.
> (bcopy): Add __bcopy_power9 to list of bcopy functions.
> * sysdeps/powerpc/powerpc64/multiarch/memcpy.c
> (memcpy): Add __memcpy_power9 to ifunc list.
> * sysdeps/powerpc/powerpc64/power9/memcpy.S: New File.
> * sysdeps/powerpc/powerpc64/multiarch/memcpy-power9.S: Likewise.
> * sysdeps/powerpc/powerpc64/multiarch/bcopy.c
> (bcopy): Add __bcopy_power9 to ifunc list.
> * sysdeps/powerpc/powerpc64/multiarch/memmove-power7.S
> Change bcopy as __bcopy.
> * sysdeps/powerpc/powerpc64/multiarch/memmove.c
> (memmove): Add __memmove_power9 to ifunc list.
> * sysdeps/powerpc/powerpc64/power7/memmove.S:
> Alias bcopy only if not defined before.
> * sysdeps/powerpc/powerpc64/multiarch/memmove-power9.S:
> New file.
> * sysdeps/powerpc/powerpc64/power9/memmove.S: Likewise.
> ---
> sysdeps/powerpc/powerpc64/multiarch/Makefile | 7 +-
> sysdeps/powerpc/powerpc64/multiarch/bcopy.c | 6 +-
> .../powerpc/powerpc64/multiarch/ifunc-impl-list.c | 6 +
> .../powerpc/powerpc64/multiarch/memcpy-power9.S | 26 +
> sysdeps/powerpc/powerpc64/multiarch/memcpy.c | 3 +
> .../powerpc/powerpc64/multiarch/memmove-power7.S | 4 +-
> .../powerpc/powerpc64/multiarch/memmove-power9.S | 29 +
> sysdeps/powerpc/powerpc64/multiarch/memmove.c | 5 +-
> sysdeps/powerpc/powerpc64/power7/memmove.S | 2 +
> sysdeps/powerpc/powerpc64/power9/memcpy.S | 429 +++++++++++
> sysdeps/powerpc/powerpc64/power9/memmove.S | 837 +++++++++++++++++++++
> 11 files changed, 1347 insertions(+), 7 deletions(-)
> create mode 100644 sysdeps/powerpc/powerpc64/multiarch/memcpy-power9.S
> create mode 100644 sysdeps/powerpc/powerpc64/multiarch/memmove-power9.S
> create mode 100644 sysdeps/powerpc/powerpc64/power9/memcpy.S
> create mode 100644 sysdeps/powerpc/powerpc64/power9/memmove.S
>
> diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile
> index dea49ac..82728fa 100644
> --- a/sysdeps/powerpc/powerpc64/multiarch/Makefile
> +++ b/sysdeps/powerpc/powerpc64/multiarch/Makefile
> @@ -1,6 +1,6 @@
> ifeq ($(subdir),string)
> -sysdep_routines += memcpy-power7 memcpy-a2 memcpy-power6 memcpy-cell \
> - memcpy-power4 memcpy-ppc64 \
> +sysdep_routines += memcpy-power9 memcpy-power7 memcpy-a2 memcpy-power6 \
> + memcpy-cell memcpy-power4 memcpy-ppc64 \
> memcmp-power8 memcmp-power7 memcmp-power4 memcmp-ppc64 \
> memset-power7 memset-power6 memset-power4 \
> memset-ppc64 memset-power8 \
> @@ -24,7 +24,8 @@ sysdep_routines += memcpy-power7 memcpy-a2 memcpy-power6 memcpy-cell \
> stpncpy-power8 stpncpy-power7 stpncpy-ppc64 \
> strcmp-power9 strcmp-power8 strcmp-power7 strcmp-ppc64 \
> strcat-power8 strcat-power7 strcat-ppc64 \
> - memmove-power7 memmove-ppc64 wordcopy-ppc64 bcopy-ppc64 \
> + memmove-power9 memmove-power7 memmove-ppc64 \
> + wordcopy-ppc64 bcopy-ppc64 \
> strncpy-power8 strstr-power7 strstr-ppc64 \
> strspn-power8 strspn-ppc64 strcspn-power8 strcspn-ppc64 \
> strlen-power8 strcasestr-power8 strcasestr-ppc64 \
> diff --git a/sysdeps/powerpc/powerpc64/multiarch/bcopy.c b/sysdeps/powerpc/powerpc64/multiarch/bcopy.c
> index 05d46e2..4a4ee6e 100644
> --- a/sysdeps/powerpc/powerpc64/multiarch/bcopy.c
> +++ b/sysdeps/powerpc/powerpc64/multiarch/bcopy.c
> @@ -22,8 +22,12 @@
> extern __typeof (bcopy) __bcopy_ppc attribute_hidden;
> /* __bcopy_power7 symbol is implemented at memmove-power7.S */
> extern __typeof (bcopy) __bcopy_power7 attribute_hidden;
> +/* __bcopy_power9 symbol is implemented at memmove-power9.S. */
> +extern __typeof (bcopy) __bcopy_power9 attribute_hidden;
>
> libc_ifunc (bcopy,
> - (hwcap & PPC_FEATURE_HAS_VSX)
> + (hwcap2 & PPC_FEATURE2_ARCH_3_00)
> + ? __bcopy_power9
> + : (hwcap & PPC_FEATURE_HAS_VSX)
> ? __bcopy_power7
> : __bcopy_ppc);
> diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
> index 6a88536..9040bbc 100644
> --- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
> +++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
> @@ -51,6 +51,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> #ifdef SHARED
> /* Support sysdeps/powerpc/powerpc64/multiarch/memcpy.c. */
> IFUNC_IMPL (i, name, memcpy,
> + IFUNC_IMPL_ADD (array, i, memcpy, hwcap2 & PPC_FEATURE2_ARCH_3_00,
> + __memcpy_power9)
> IFUNC_IMPL_ADD (array, i, memcpy, hwcap & PPC_FEATURE_HAS_VSX,
> __memcpy_power7)
> IFUNC_IMPL_ADD (array, i, memcpy, hwcap & PPC_FEATURE_ARCH_2_06,
> @@ -65,6 +67,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>
> /* Support sysdeps/powerpc/powerpc64/multiarch/memmove.c. */
> IFUNC_IMPL (i, name, memmove,
> + IFUNC_IMPL_ADD (array, i, memmove, hwcap2 & PPC_FEATURE2_ARCH_3_00,
> + __memmove_power9)
> IFUNC_IMPL_ADD (array, i, memmove, hwcap & PPC_FEATURE_HAS_VSX,
> __memmove_power7)
> IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_ppc))
> @@ -168,6 +172,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>
> /* Support sysdeps/powerpc/powerpc64/multiarch/bcopy.c. */
> IFUNC_IMPL (i, name, bcopy,
> + IFUNC_IMPL_ADD (array, i, bcopy, hwcap2 & PPC_FEATURE2_ARCH_3_00,
> + __bcopy_power9)
> IFUNC_IMPL_ADD (array, i, bcopy, hwcap & PPC_FEATURE_HAS_VSX,
> __bcopy_power7)
> IFUNC_IMPL_ADD (array, i, bcopy, 1, __bcopy_ppc))
> diff --git a/sysdeps/powerpc/powerpc64/multiarch/memcpy-power9.S b/sysdeps/powerpc/powerpc64/multiarch/memcpy-power9.S
> new file mode 100644
> index 0000000..fbd0788
> --- /dev/null
> +++ b/sysdeps/powerpc/powerpc64/multiarch/memcpy-power9.S
> @@ -0,0 +1,26 @@
> +/* Optimized memcpy implementation for PowerPC/POWER9.
> + Copyright (C) 2017 Free Software Foundation, Inc.
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library; if not, see
> + <http://www.gnu.org/licenses/>. */
> +
> +#include <sysdep.h>
> +
> +#define MEMCPY __memcpy_power9
> +
> +#undef libc_hidden_builtin_def
> +#define libc_hidden_builtin_def(name)
> +
> +#include <sysdeps/powerpc/powerpc64/power9/memcpy.S>
> diff --git a/sysdeps/powerpc/powerpc64/multiarch/memcpy.c b/sysdeps/powerpc/powerpc64/multiarch/memcpy.c
> index 9f4286c..4c16fa0 100644
> --- a/sysdeps/powerpc/powerpc64/multiarch/memcpy.c
> +++ b/sysdeps/powerpc/powerpc64/multiarch/memcpy.c
> @@ -35,8 +35,11 @@ extern __typeof (__redirect_memcpy) __memcpy_cell attribute_hidden;
> extern __typeof (__redirect_memcpy) __memcpy_power6 attribute_hidden;
> extern __typeof (__redirect_memcpy) __memcpy_a2 attribute_hidden;
> extern __typeof (__redirect_memcpy) __memcpy_power7 attribute_hidden;
> +extern __typeof (__redirect_memcpy) __memcpy_power9 attribute_hidden;
>
> libc_ifunc (__libc_memcpy,
> + (hwcap2 & PPC_FEATURE2_ARCH_3_00)
> + ? __memcpy_power9 :
> (hwcap & PPC_FEATURE_HAS_VSX)
> ? __memcpy_power7 :
> (hwcap & PPC_FEATURE_ARCH_2_06)
> diff --git a/sysdeps/powerpc/powerpc64/multiarch/memmove-power7.S b/sysdeps/powerpc/powerpc64/multiarch/memmove-power7.S
> index a9435fa..0599a39 100644
> --- a/sysdeps/powerpc/powerpc64/multiarch/memmove-power7.S
> +++ b/sysdeps/powerpc/powerpc64/multiarch/memmove-power7.S
> @@ -23,7 +23,7 @@
> #undef libc_hidden_builtin_def
> #define libc_hidden_builtin_def(name)
>
> -#undef bcopy
> -#define bcopy __bcopy_power7
> +#undef __bcopy
> +#define __bcopy __bcopy_power7
>
> #include <sysdeps/powerpc/powerpc64/power7/memmove.S>
> diff --git a/sysdeps/powerpc/powerpc64/multiarch/memmove-power9.S b/sysdeps/powerpc/powerpc64/multiarch/memmove-power9.S
> new file mode 100644
> index 0000000..16a2267
> --- /dev/null
> +++ b/sysdeps/powerpc/powerpc64/multiarch/memmove-power9.S
> @@ -0,0 +1,29 @@
> +/* Optimized memmove implementation for PowerPC64/POWER9.
> + Copyright (C) 2017 Free Software Foundation, Inc.
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library; if not, see
> + <http://www.gnu.org/licenses/>. */
> +
> +#include <sysdep.h>
> +
> +#define MEMMOVE __memmove_power9
> +
> +#undef libc_hidden_builtin_def
> +#define libc_hidden_builtin_def(name)
> +
> +#undef __bcopy
> +#define __bcopy __bcopy_power9
> +
> +#include <sysdeps/powerpc/powerpc64/power9/memmove.S>
> diff --git a/sysdeps/powerpc/powerpc64/multiarch/memmove.c b/sysdeps/powerpc/powerpc64/multiarch/memmove.c
> index db2bbc7..f02498e 100644
> --- a/sysdeps/powerpc/powerpc64/multiarch/memmove.c
> +++ b/sysdeps/powerpc/powerpc64/multiarch/memmove.c
> @@ -31,9 +31,12 @@ extern __typeof (__redirect_memmove) __libc_memmove;
>
> extern __typeof (__redirect_memmove) __memmove_ppc attribute_hidden;
> extern __typeof (__redirect_memmove) __memmove_power7 attribute_hidden;
> +extern __typeof (__redirect_memmove) __memmove_power9 attribute_hidden;
>
> libc_ifunc (__libc_memmove,
> - (hwcap & PPC_FEATURE_HAS_VSX)
> + (hwcap2 & PPC_FEATURE2_ARCH_3_00)
> + ? __memmove_power9
> + : (hwcap & PPC_FEATURE_HAS_VSX)
> ? __memmove_power7
> : __memmove_ppc);
>
> diff --git a/sysdeps/powerpc/powerpc64/power7/memmove.S b/sysdeps/powerpc/powerpc64/power7/memmove.S
> index 93baa69..0bb8ddc 100644
> --- a/sysdeps/powerpc/powerpc64/power7/memmove.S
> +++ b/sysdeps/powerpc/powerpc64/power7/memmove.S
> @@ -832,4 +832,6 @@ ENTRY_TOCLESS (__bcopy)
> mr r4,r6
> b L(_memmove)
> END (__bcopy)
> +#ifndef __bcopy
> weak_alias (__bcopy, bcopy)
> +#endif
> diff --git a/sysdeps/powerpc/powerpc64/power9/memcpy.S b/sysdeps/powerpc/powerpc64/power9/memcpy.S
> new file mode 100644
> index 0000000..0731bac
> --- /dev/null
> +++ b/sysdeps/powerpc/powerpc64/power9/memcpy.S
> @@ -0,0 +1,429 @@
> +/* Optimized memcpy implementation for PowerPC64/POWER9.
> + Copyright (C) 2017 Free Software Foundation, Inc.
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library; if not, see
> + <http://www.gnu.org/licenses/>. */
> +
> +#include <sysdep.h>
> +
> +
> +/* void * [r3] memcpy (void *dst [r3], void *src [r4], size_t len [r5]);
> + Returns 'dst'. */
> +
> +#ifndef MEMCPY
> +# define MEMCPY memcpy
> +#endif
> +
> +#define dst 11 /* Use r11 so r3 kept unchanged. */
> +#define src 4
> +#define cnt 5
> +
> + .machine power7
> +ENTRY_TOCLESS (MEMCPY, 5)
> + CALL_MCOUNT 3
> +
> + cmpldi cr1,cnt,31
> + neg 0,3
> + ble cr1, L(copy_LT_32) /* If move < 32 bytes use short move
> + code. */
> +
> +/* Align copies using VSX instructions to quadword. It is to avoid alignment
> + traps when memcpy is used on non-cacheable memory (for instance, memory
> + mapped I/O). */
> + andi. 10,3,15
> + clrldi 11,4,60
> + cmpld cr6,10,11 /* SRC and DST alignments match? */
> +
> + mr dst,3
> + bne cr6,L(copy_GE_32_unaligned)
> + beq L(aligned_copy)
> +
> + mtocrf 0x01,0
> + clrldi 0,0,60
> +
> +/* Get the DST and SRC aligned to 16 bytes. */
> +1:
> + bf 31,2f
> + lbz 6,0(src)
> + addi src,src,1
> + stb 6,0(dst)
> + addi dst,dst,1
> +2:
> + bf 30,4f
> + lhz 6,0(src)
> + addi src,src,2
> + sth 6,0(dst)
> + addi dst,dst,2
> +4:
> + bf 29,8f
> + lwz 6,0(src)
> + addi src,src,4
> + stw 6,0(dst)
> + addi dst,dst,4
> +8:
> + bf 28,16f
> + ld 6,0(src)
> + addi src,src,8
> + std 6,0(dst)
> + addi dst,dst,8
> +16:
> + subf cnt,0,cnt
> +
> +/* Main aligned copy loop. Copies 128 bytes at a time. */
> +L(aligned_copy):
> + li 6,16
> + li 7,32
> + li 8,48
> + mtocrf 0x02,cnt
> + srdi 12,cnt,7
> + cmpdi 12,0
> + beq L(aligned_tail)
> + lvx 6,0,src
> + lvx 7,src,6
> + mtctr 12
> + b L(aligned_128loop)
> +
> + .align 4
> +L(aligned_128head):
> + /* for the 2nd + iteration of this loop. */
> + lvx 6,0,src
> + lvx 7,src,6
> +L(aligned_128loop):
> + lvx 8,src,7
> + lvx 9,src,8
> + stvx 6,0,dst
> + addi src,src,64
> + stvx 7,dst,6
> + stvx 8,dst,7
> + stvx 9,dst,8
> + lvx 6,0,src
> + lvx 7,src,6
> + addi dst,dst,64
> + lvx 8,src,7
> + lvx 9,src,8
> + addi src,src,64
> + stvx 6,0,dst
> + stvx 7,dst,6
> + stvx 8,dst,7
> + stvx 9,dst,8
> + addi dst,dst,64
> + bdnz L(aligned_128head)
> +
> +L(aligned_tail):
> + mtocrf 0x01,cnt
> + bf 25,32f
> + lvx 6,0,src
> + lvx 7,src,6
> + lvx 8,src,7
> + lvx 9,src,8
> + addi src,src,64
> + stvx 6,0,dst
> + stvx 7,dst,6
> + stvx 8,dst,7
> + stvx 9,dst,8
> + addi dst,dst,64
> +32:
> + bf 26,16f
> + lvx 6,0,src
> + lvx 7,src,6
> + addi src,src,32
> + stvx 6,0,dst
> + stvx 7,dst,6
> + addi dst,dst,32
> +16:
> + bf 27,8f
> + lvx 6,0,src
> + addi src,src,16
> + stvx 6,0,dst
> + addi dst,dst,16
> +8:
> + bf 28,4f
> + ld 6,0(src)
> + addi src,src,8
> + std 6,0(dst)
> + addi dst,dst,8
> +4: /* Copies 4~7 bytes. */
> + bf 29,L(tail2)
> + lwz 6,0(src)
> + stw 6,0(dst)
> + bf 30,L(tail5)
> + lhz 7,4(src)
> + sth 7,4(dst)
> + bflr 31
> + lbz 8,6(src)
> + stb 8,6(dst)
> + /* Return original DST pointer. */
> + blr
> +
> +
> +/* Handle copies of 0~31 bytes. */
> + .align 4
> +L(copy_LT_32):
> + mr dst,3
> + cmpldi cr6,cnt,8
> + mtocrf 0x01,cnt
> + ble cr6,L(copy_LE_8)
> +
> + /* At least 9 bytes to go. */
> + neg 8,4
> + andi. 0,8,3
> + cmpldi cr1,cnt,16
> + beq L(copy_LT_32_aligned)
> +
> + /* Force 4-byte alignment for SRC. */
> + mtocrf 0x01,0
> + subf cnt,0,cnt
> +2:
> + bf 30,1f
> + lhz 6,0(src)
> + addi src,src,2
> + sth 6,0(dst)
> + addi dst,dst,2
> +1:
> + bf 31,L(end_4bytes_alignment)
> + lbz 6,0(src)
> + addi src,src,1
> + stb 6,0(dst)
> + addi dst,dst,1
> +
> + .align 4
> +L(end_4bytes_alignment):
> + cmpldi cr1,cnt,16
> + mtocrf 0x01,cnt
> +
> +L(copy_LT_32_aligned):
> + /* At least 6 bytes to go, and SRC is word-aligned. */
> + blt cr1,8f
> +
> + /* Copy 16 bytes. */
> + lwz 6,0(src)
> + lwz 7,4(src)
> + stw 6,0(dst)
> + lwz 8,8(src)
> + stw 7,4(dst)
> + lwz 6,12(src)
> + addi src,src,16
> + stw 8,8(dst)
> + stw 6,12(dst)
> + addi dst,dst,16
> +8: /* Copy 8 bytes. */
> + bf 28,L(tail4)
> + lwz 6,0(src)
> + lwz 7,4(src)
> + addi src,src,8
> + stw 6,0(dst)
> + stw 7,4(dst)
> + addi dst,dst,8
> +
> + .align 4
> +/* Copies 4~7 bytes. */
> +L(tail4):
> + bf 29,L(tail2)
> + lwz 6,0(src)
> + stw 6,0(dst)
> + bf 30,L(tail5)
> + lhz 7,4(src)
> + sth 7,4(dst)
> + bflr 31
> + lbz 8,6(src)
> + stb 8,6(dst)
> + /* Return original DST pointer. */
> + blr
> +
> + .align 4
> +/* Copies 2~3 bytes. */
> +L(tail2):
> + bf 30,1f
> + lhz 6,0(src)
> + sth 6,0(dst)
> + bflr 31
> + lbz 7,2(src)
> + stb 7,2(dst)
> + blr
> +
> + .align 4
> +L(tail5):
> + bflr 31
> + lbz 6,4(src)
> + stb 6,4(dst)
> + blr
> +
> + .align 4
> +1:
> + bflr 31
> + lbz 6,0(src)
> + stb 6,0(dst)
> + /* Return original DST pointer. */
> + blr
> +
> +
> +/* Handles copies of 0~8 bytes. */
> + .align 4
> +L(copy_LE_8):
> + bne cr6,L(tail4)
> +
> + /* Though we could've used ld/std here, they are still
> + slow for unaligned cases. */
> +
> + lwz 6,0(src)
> + lwz 7,4(src)
> + stw 6,0(dst)
> + stw 7,4(dst)
> + blr
> +
> +
> +/* Handle copies of 32+ bytes where DST is aligned (to quadword) but
> + SRC is not. Use aligned quadword loads from SRC, shifted to realign
> + the data, allowing for aligned DST stores. */
> + .align 4
> +L(copy_GE_32_unaligned):
> + clrldi 0,0,60 /* Number of bytes until the 1st dst quadword. */
> + srdi 9,cnt,4 /* Number of full quadwords remaining. */
> +
> + beq L(copy_GE_32_unaligned_cont)
> +
> + /* DST is not quadword aligned, get it aligned. */
> +
> + mtocrf 0x01,0
> + subf cnt,0,cnt
> +
> + /* Vector instructions work best when proper alignment (16-bytes)
> + is present. Move 0~15 bytes as needed to get DST quadword-aligned. */
> +1:
> + bf 31,2f
> + lbz 6,0(src)
> + addi src,src,1
> + stb 6,0(dst)
> + addi dst,dst,1
> +2:
> + bf 30,4f
> + lhz 6,0(src)
> + addi src,src,2
> + sth 6,0(dst)
> + addi dst,dst,2
> +4:
> + bf 29,8f
> + lwz 6,0(src)
> + addi src,src,4
> + stw 6,0(dst)
> + addi dst,dst,4
> +8:
> + bf 28,0f
> + ld 6,0(src)
> + addi src,src,8
> + std 6,0(dst)
> + addi dst,dst,8
> +0:
> + srdi 9,cnt,4 /* Number of full quadwords remaining. */
> +
> + /* The proper alignment is present, it is OK to copy the bytes now. */
> +L(copy_GE_32_unaligned_cont):
> +
> + /* Setup two indexes to speed up the indexed vector operations. */
> + clrldi 10,cnt,60
> + li 6,16 /* Index for 16-bytes offsets. */
> + li 7,32 /* Index for 32-bytes offsets. */
> + cmpldi cr1,10,0
> + srdi 8,cnt,5 /* Setup the loop counter. */
> + mtocrf 0x01,9
> + cmpldi cr6,9,1
> +#ifdef __LITTLE_ENDIAN__
> + lvsr 5,0,src
> +#else
> + lvsl 5,0,src
> +#endif
> + lvx 3,0,src
> + li 0,0
> + bf 31,L(setup_unaligned_loop)
> +
> + /* Copy another 16 bytes to align to 32-bytes due to the loop. */
> + lvx 4,src,6
> +#ifdef __LITTLE_ENDIAN__
> + vperm 6,4,3,5
> +#else
> + vperm 6,3,4,5
> +#endif
> + addi src,src,16
> + stvx 6,0,dst
> + addi dst,dst,16
> + vor 3,4,4
> + clrrdi 0,src,60
> +
> +L(setup_unaligned_loop):
> + mtctr 8
> + ble cr6,L(end_unaligned_loop)
> +
> + /* Copy 32 bytes at a time using vector instructions. */
> + .align 4
> +L(unaligned_loop):
> +
> + /* Note: vr6/vr10 may contain data that was already copied,
> + but in order to get proper alignment, we may have to copy
> + some portions again. This is faster than having unaligned
> + vector instructions though. */
> +
> + lvx 4,src,6
> +#ifdef __LITTLE_ENDIAN__
> + vperm 6,4,3,5
> +#else
> + vperm 6,3,4,5
> +#endif
> + lvx 3,src,7
> +#ifdef __LITTLE_ENDIAN__
> + vperm 10,3,4,5
> +#else
> + vperm 10,4,3,5
> +#endif
> + addi src,src,32
> + stvx 6,0,dst
> + stvx 10,dst,6
> + addi dst,dst,32
> + bdnz L(unaligned_loop)
> +
> + clrrdi 0,src,60
> +
> + .align 4
> +L(end_unaligned_loop):
> +
> + /* Check for tail bytes. */
> + mtocrf 0x01,cnt
> + beqlr cr1
> +
> + add src,src,0
> +
> + /* We have 1~15 tail bytes to copy, and DST is quadword aligned. */
> + /* Copy 8 bytes. */
> + bf 28,4f
> + lwz 6,0(src)
> + lwz 7,4(src)
> + addi src,src,8
> + stw 6,0(dst)
> + stw 7,4(dst)
> + addi dst,dst,8
> +4: /* Copy 4~7 bytes. */
> + bf 29,L(tail2)
> + lwz 6,0(src)
> + stw 6,0(dst)
> + bf 30,L(tail5)
> + lhz 7,4(src)
> + sth 7,4(dst)
> + bflr 31
> + lbz 8,6(src)
> + stb 8,6(dst)
> + /* Return original DST pointer. */
> + blr
> +
> +END_GEN_TB (MEMCPY,TB_TOCLESS)
> +libc_hidden_builtin_def (memcpy)
> diff --git a/sysdeps/powerpc/powerpc64/power9/memmove.S b/sysdeps/powerpc/powerpc64/power9/memmove.S
> new file mode 100644
> index 0000000..9ed8f77
> --- /dev/null
> +++ b/sysdeps/powerpc/powerpc64/power9/memmove.S
> @@ -0,0 +1,837 @@
> +/* Optimized memmove implementation for PowerPC64/POWER9.
> + Copyright (C) 2017 Free Software Foundation, Inc.
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library; if not, see
> + <http://www.gnu.org/licenses/>. */
> +
> +#include <sysdep.h>
> +
> +
> +/* void* [r3] memmove (void *dest [r3], const void *src [r4], size_t len [r5])
> +
> + This optimization checks if memory 'dest' overlaps with 'src'. If it does
> + not, it calls an optimized memcpy (similar to memcpy for POWER7,
> + embedded here to gain some cycles).
> + If source and destination overlap, an optimized backwards memcpy is used
> + instead. */
> +
> +#ifndef MEMMOVE
> +# define MEMMOVE memmove
> +#endif
> + .machine power7
> +ENTRY_TOCLESS (MEMMOVE, 5)
> + CALL_MCOUNT 3
> +
> +L(_memmove):
> + subf r9,r4,r3
> + cmpld cr7,r9,r5
> + blt cr7,L(memmove_bwd)
> +
> + cmpldi cr1,r5,31
> + neg 0,3
> + ble cr1, L(copy_LT_32) /* If move < 32 bytes use short move
> + code. */
> +
> + andi. 10,3,15
> + clrldi 11,4,60
> + cmpld cr6,10,11 /* SRC and DST alignments match? */
> +
> + mr r11,3
> + bne cr6,L(copy_GE_32_unaligned)
> + beq L(aligned_copy)
> +
> + mtocrf 0x01,0
> + clrldi 0,0,60
> +
> +/* Get the DST and SRC aligned to 8 bytes (16 for little-endian). */
> +1:
> + bf 31,2f
> + lbz 6,0(r4)
> + addi r4,r4,1
> + stb 6,0(r11)
> + addi r11,r11,1
> +2:
> + bf 30,4f
> + lhz 6,0(r4)
> + addi r4,r4,2
> + sth 6,0(r11)
> + addi r11,r11,2
> +4:
> + bf 29,8f
> + lwz 6,0(r4)
> + addi r4,r4,4
> + stw 6,0(r11)
> + addi r11,r11,4
> +8:
> + bf 28,16f
> + ld 6,0(r4)
> + addi r4,r4,8
> + std 6,0(r11)
> + addi r11,r11,8
> +16:
> + subf r5,0,r5
> +
> +/* Main aligned copy loop. Copies 128 bytes at a time. */
> +L(aligned_copy):
> + li 6,16
> + li 7,32
> + li 8,48
> + mtocrf 0x02,r5
> + srdi 12,r5,7
> + cmpdi 12,0
> + beq L(aligned_tail)
> + lvx 6,0,r4
> + lvx 7,r4,6
> + mtctr 12
> + b L(aligned_128loop)
> +
> + .align 4
> +L(aligned_128head):
> + /* for the 2nd + iteration of this loop. */
> + lvx 6,0,r4
> + lvx 7,r4,6
> +L(aligned_128loop):
> + lvx 8,r4,7
> + lvx 9,r4,8
> + stvx 6,0,r11
> + addi r4,r4,64
> + stvx 7,r11,6
> + stvx 8,r11,7
> + stvx 9,r11,8
> + lvx 6,0,r4
> + lvx 7,r4,6
> + addi r11,r11,64
> + lvx 8,r4,7
> + lvx 9,r4,8
> + addi r4,r4,64
> + stvx 6,0,r11
> + stvx 7,r11,6
> + stvx 8,r11,7
> + stvx 9,r11,8
> + addi r11,r11,64
> + bdnz L(aligned_128head)
> +
> +L(aligned_tail):
> + mtocrf 0x01,r5
> + bf 25,32f
> + lvx 6,0,r4
> + lvx 7,r4,6
> + lvx 8,r4,7
> + lvx 9,r4,8
> + addi r4,r4,64
> + stvx 6,0,r11
> + stvx 7,r11,6
> + stvx 8,r11,7
> + stvx 9,r11,8
> + addi r11,r11,64
> +32:
> + bf 26,16f
> + lvx 6,0,r4
> + lvx 7,r4,6
> + addi r4,r4,32
> + stvx 6,0,r11
> + stvx 7,r11,6
> + addi r11,r11,32
> +16:
> + bf 27,8f
> + lvx 6,0,r4
> + addi r4,r4,16
> + stvx 6,0,r11
> + addi r11,r11,16
> +8:
> + bf 28,4f
> + ld 6,0(r4)
> + addi r4,r4,8
> + std 6,0(r11)
> + addi r11,r11,8
> +4: /* Copies 4~7 bytes. */
> + bf 29,L(tail2)
> + lwz 6,0(r4)
> + stw 6,0(r11)
> + bf 30,L(tail5)
> + lhz 7,4(r4)
> + sth 7,4(r11)
> + bflr 31
> + lbz 8,6(r4)
> + stb 8,6(r11)
> + /* Return original DST pointer. */
> + blr
> +
> +/* Handle copies of 0~31 bytes. */
> + .align 4
> +L(copy_LT_32):
> + mr r11,3
> + cmpldi cr6,r5,8
> + mtocrf 0x01,r5
> + ble cr6,L(copy_LE_8)
> +
> + /* At least 9 bytes to go. */
> + neg 8,4
> + andi. 0,8,3
> + cmpldi cr1,r5,16
> + beq L(copy_LT_32_aligned)
> +
> + /* Force 4-byte alignment for SRC. */
> + mtocrf 0x01,0
> + subf r5,0,r5
> +2:
> + bf 30,1f
> + lhz 6,0(r4)
> + addi r4,r4,2
> + sth 6,0(r11)
> + addi r11,r11,2
> +1:
> + bf 31,L(end_4bytes_alignment)
> + lbz 6,0(r4)
> + addi r4,r4,1
> + stb 6,0(r11)
> + addi r11,r11,1
> +
> + .align 4
> +L(end_4bytes_alignment):
> + cmpldi cr1,r5,16
> + mtocrf 0x01,r5
> +
> +L(copy_LT_32_aligned):
> + /* At least 6 bytes to go, and SRC is word-aligned. */
> + blt cr1,8f
> +
> + /* Copy 16 bytes. */
> + lwz 6,0(r4)
> + lwz 7,4(r4)
> + stw 6,0(r11)
> + lwz 8,8(r4)
> + stw 7,4(r11)
> + lwz 6,12(r4)
> + addi r4,r4,16
> + stw 8,8(r11)
> + stw 6,12(r11)
> + addi r11,r11,16
> +8: /* Copy 8 bytes. */
> + bf 28,L(tail4)
> + lwz 6,0(r4)
> + lwz 7,4(r4)
> + addi r4,r4,8
> + stw 6,0(r11)
> + stw 7,4(r11)
> + addi r11,r11,8
> +
> + .align 4
> +/* Copies 4~7 bytes. */
> +L(tail4):
> + bf 29,L(tail2)
> + lwz 6,0(r4)
> + stw 6,0(r11)
> + bf 30,L(tail5)
> + lhz 7,4(r4)
> + sth 7,4(r11)
> + bflr 31
> + lbz 8,6(r4)
> + stb 8,6(r11)
> + /* Return original DST pointer. */
> + blr
> +
> + .align 4
> +/* Copies 2~3 bytes. */
> +L(tail2):
> + bf 30,1f
> + lhz 6,0(r4)
> + sth 6,0(r11)
> + bflr 31
> + lbz 7,2(r4)
> + stb 7,2(r11)
> + blr
> +
> + .align 4
> +L(tail5):
> + bflr 31
> + lbz 6,4(r4)
> + stb 6,4(r11)
> + blr
> +
> + .align 4
> +1:
> + bflr 31
> + lbz 6,0(r4)
> + stb 6,0(r11)
> + /* Return original DST pointer. */
> + blr
> +
> +/* Handles copies of 0~8 bytes. */
> + .align 4
> +L(copy_LE_8):
> + bne cr6,L(tail4)
> +
> + /* Though we could've used ld/std here, they are still
> + slow for unaligned cases. */
> +
> + lwz 6,0(r4)
> + lwz 7,4(r4)
> + stw 6,0(r11)
> + stw 7,4(r11)
> + blr
> +
> +
> +/* Handle copies of 32+ bytes where DST is aligned (to quadword) but
> + SRC is not. Use aligned quadword loads from SRC, shifted to realign
> + the data, allowing for aligned DST stores. */
> + .align 4
> +L(copy_GE_32_unaligned):
> + clrldi 0,0,60 /* Number of bytes until the 1st r11 quadword. */
> + srdi 9,r5,4 /* Number of full quadwords remaining. */
> +
> + beq L(copy_GE_32_unaligned_cont)
> +
> + /* DST is not quadword aligned, get it aligned. */
> +
> + mtocrf 0x01,0
> + subf r5,0,r5
> +
> + /* Vector instructions work best when proper alignment (16-bytes)
> + is present. Move 0~15 bytes as needed to get DST quadword-aligned. */
> +1:
> + bf 31,2f
> + lbz 6,0(r4)
> + addi r4,r4,1
> + stb 6,0(r11)
> + addi r11,r11,1
> +2:
> + bf 30,4f
> + lhz 6,0(r4)
> + addi r4,r4,2
> + sth 6,0(r11)
> + addi r11,r11,2
> +4:
> + bf 29,8f
> + lwz 6,0(r4)
> + addi r4,r4,4
> + stw 6,0(r11)
> + addi r11,r11,4
> +8:
> + bf 28,0f
> + ld 6,0(r4)
> + addi r4,r4,8
> + std 6,0(r11)
> + addi r11,r11,8
> +0:
> + srdi 9,r5,4 /* Number of full quadwords remaining. */
> +
> + /* The proper alignment is present, it is OK to copy the bytes now. */
> +L(copy_GE_32_unaligned_cont):
> +
> + /* Setup two indexes to speed up the indexed vector operations. */
> + clrldi 10,r5,60
> + li 6,16 /* Index for 16-bytes offsets. */
> + li 7,32 /* Index for 32-bytes offsets. */
> + cmpldi cr1,10,0
> + srdi 8,r5,5 /* Setup the loop counter. */
> + mtocrf 0x01,9
> + cmpldi cr6,9,1
> +#ifdef __LITTLE_ENDIAN__
> + lvsr 5,0,r4
> +#else
> + lvsl 5,0,r4
> +#endif
> + lvx 3,0,r4
> + li 0,0
> + bf 31,L(setup_unaligned_loop)
> +
> + /* Copy another 16 bytes to align to 32-bytes due to the loop. */
> + lvx 4,r4,6
> +#ifdef __LITTLE_ENDIAN__
> + vperm 6,4,3,5
> +#else
> + vperm 6,3,4,5
> +#endif
> + addi r4,r4,16
> + stvx 6,0,r11
> + addi r11,r11,16
> + vor 3,4,4
> + clrrdi 0,r4,60
> +
> +L(setup_unaligned_loop):
> + mtctr 8
> + ble cr6,L(end_unaligned_loop)
> +
> + /* Copy 32 bytes at a time using vector instructions. */
> + .align 4
> +L(unaligned_loop):
> +
> + /* Note: vr6/vr10 may contain data that was already copied,
> + but in order to get proper alignment, we may have to copy
> + some portions again. This is faster than having unaligned
> + vector instructions though. */
> +
> + lvx 4,r4,6
> +#ifdef __LITTLE_ENDIAN__
> + vperm 6,4,3,5
> +#else
> + vperm 6,3,4,5
> +#endif
> + lvx 3,r4,7
> +#ifdef __LITTLE_ENDIAN__
> + vperm 10,3,4,5
> +#else
> + vperm 10,4,3,5
> +#endif
> + addi r4,r4,32
> + stvx 6,0,r11
> + stvx 10,r11,6
> + addi r11,r11,32
> + bdnz L(unaligned_loop)
> +
> + clrrdi 0,r4,60
> +
> + .align 4
> +L(end_unaligned_loop):
> +
> + /* Check for tail bytes. */
> + mtocrf 0x01,r5
> + beqlr cr1
> +
> + add r4,r4,0
> +
> + /* We have 1~15 tail bytes to copy, and DST is quadword aligned. */
> + /* Copy 8 bytes. */
> + bf 28,4f
> + lwz 6,0(r4)
> + lwz 7,4(r4)
> + addi r4,r4,8
> + stw 6,0(r11)
> + stw 7,4(r11)
> + addi r11,r11,8
> +4: /* Copy 4~7 bytes. */
> + bf 29,L(tail2)
> + lwz 6,0(r4)
> + stw 6,0(r11)
> + bf 30,L(tail5)
> + lhz 7,4(r4)
> + sth 7,4(r11)
> + bflr 31
> + lbz 8,6(r4)
> + stb 8,6(r11)
> + /* Return original DST pointer. */
> + blr
> +
> + /* Start of the backward memcpy implementation: the algorithm first checks
> + if src and dest have the same alignment; if they do, it aligns both to 16
> + bytes and copies using vector instructions.
> + If they do not, it aligns dest to 16 bytes and uses VMX (Altivec)
> + instructions to read two 16-byte chunks at a time, shift/permute the
> + bytes read, and write aligned to dest. */
> +L(memmove_bwd):
> + cmpldi cr1,r5,31
> + /* Copy is done backwards: update the pointers and check alignment. */
> + add r11,r3,r5
> + add r4,r4,r5
> + mr r0,r11
> + ble cr1, L(copy_LT_32_bwd) /* If move < 32 bytes use short move
> + code. */
> +
> + andi. r10,r11,15 /* Check if r11 is aligned to 16 bytes */
> + clrldi r9,r4,60 /* Check if r4 is aligned to 16 bytes */
> + cmpld cr6,r10,r9 /* SRC and DST alignments match? */
> +
> + bne cr6,L(copy_GE_32_unaligned_bwd)
> + beq L(aligned_copy_bwd)
> +
> + mtocrf 0x01,r0
> + clrldi r0,r0,60
> +
> +/* Get the DST and SRC aligned to 16 bytes. */
> +1:
> + bf 31,2f
> + lbz r6,-1(r4)
> + subi r4,r4,1
> + stb r6,-1(r11)
> + subi r11,r11,1
> +2:
> + bf 30,4f
> + lhz r6,-2(r4)
> + subi r4,r4,2
> + sth r6,-2(r11)
> + subi r11,r11,2
> +4:
> + bf 29,8f
> + lwz r6,-4(r4)
> + subi r4,r4,4
> + stw r6,-4(r11)
> + subi r11,r11,4
> +8:
> + bf 28,16f
> + ld r6,-8(r4)
> + subi r4,r4,8
> + std r6,-8(r11)
> + subi r11,r11,8
> +16:
> + subf r5,0,r5
> +
> +/* Main aligned copy loop. Copies 128 bytes at a time. */
> +L(aligned_copy_bwd):
> + li r6,-16
> + li r7,-32
> + li r8,-48
> + li r9,-64
> + mtocrf 0x02,r5
> + srdi r12,r5,7
> + cmpdi r12,0
> + beq L(aligned_tail_bwd)
> + lvx v6,r4,r6
> + lvx v7,r4,r7
> + mtctr 12
> + b L(aligned_128loop_bwd)
> +
> + .align 4
> +L(aligned_128head_bwd):
> + /* for the 2nd + iteration of this loop. */
> + lvx v6,r4,r6
> + lvx v7,r4,r7
> +L(aligned_128loop_bwd):
> + lvx v8,r4,r8
> + lvx v9,r4,r9
> + stvx v6,r11,r6
> + subi r4,r4,64
> + stvx v7,r11,r7
> + stvx v8,r11,r8
> + stvx v9,r11,r9
> + lvx v6,r4,r6
> + lvx v7,r4,7
> + subi r11,r11,64
> + lvx v8,r4,r8
> + lvx v9,r4,r9
> + subi r4,r4,64
> + stvx v6,r11,r6
> + stvx v7,r11,r7
> + stvx v8,r11,r8
> + stvx v9,r11,r9
> + subi r11,r11,64
> + bdnz L(aligned_128head_bwd)
> +
> +L(aligned_tail_bwd):
> + mtocrf 0x01,r5
> + bf 25,32f
> + lvx v6,r4,r6
> + lvx v7,r4,r7
> + lvx v8,r4,r8
> + lvx v9,r4,r9
> + subi r4,r4,64
> + stvx v6,r11,r6
> + stvx v7,r11,r7
> + stvx v8,r11,r8
> + stvx v9,r11,r9
> + subi r11,r11,64
> +32:
> + bf 26,16f
> + lvx v6,r4,r6
> + lvx v7,r4,r7
> + subi r4,r4,32
> + stvx v6,r11,r6
> + stvx v7,r11,r7
> + subi r11,r11,32
> +16:
> + bf 27,8f
> + lvx v6,r4,r6
> + subi r4,r4,16
> + stvx v6,r11,r6
> + subi r11,r11,16
> +8:
> + bf 28,4f
> + ld r6,-8(r4)
> + subi r4,r4,8
> + std r6,-8(r11)
> + subi r11,r11,8
> +4: /* Copies 4~7 bytes. */
> + bf 29,L(tail2_bwd)
> + lwz r6,-4(r4)
> + stw r6,-4(r11)
> + bf 30,L(tail5_bwd)
> + lhz r7,-6(r4)
> + sth r7,-6(r11)
> + bflr 31
> + lbz r8,-7(r4)
> + stb r8,-7(r11)
> + /* Return original DST pointer. */
> + blr
> +
> +/* Handle copies of 0~31 bytes. */
> + .align 4
> +L(copy_LT_32_bwd):
> + cmpldi cr6,r5,8
> + mtocrf 0x01,r5
> + ble cr6,L(copy_LE_8_bwd)
> +
> + /* At least 9 bytes to go. */
> + neg r8,r4
> + andi. r0,r8,3
> + cmpldi cr1,r5,16
> + beq L(copy_LT_32_aligned_bwd)
> +
> + /* Force 4-byte alignment for SRC. */
> + mtocrf 0x01,0
> + subf r5,0,r5
> +2:
> + bf 30,1f
> + lhz r6,-2(r4)
> + subi r4,r4,2
> + sth r6,-2(r11)
> + subi r11,r11,2
> +1:
> + bf 31,L(end_4bytes_alignment_bwd)
> + lbz 6,-1(r4)
> + subi r4,r4,1
> + stb 6,-1(r11)
> + subi r11,r11,1
> +
> + .align 4
> +L(end_4bytes_alignment_bwd):
> + cmpldi cr1,r5,16
> + mtocrf 0x01,r5
> +
> +L(copy_LT_32_aligned_bwd):
> + /* At least 6 bytes to go, and SRC is word-aligned. */
> + blt cr1,8f
> +
> + /* Copy 16 bytes. */
> + lwz r6,-4(r4)
> + lwz r7,-8(r4)
> + stw r6,-4(r11)
> + lwz r8,-12(r4)
> + stw r7,-8(r11)
> + lwz r6,-16(r4)
> + subi r4,r4,16
> + stw r8,-12(r11)
> + stw r6,-16(r11)
> + subi r11,r11,16
> +8: /* Copy 8 bytes. */
> + bf 28,L(tail4_bwd)
> + lwz r6,-4(r4)
> + lwz r7,-8(r4)
> + subi r4,r4,8
> + stw r6,-4(r11)
> + stw r7,-8(r11)
> + subi r11,r11,8
> +
> + .align 4
> +/* Copies 4~7 bytes. */
> +L(tail4_bwd):
> + bf 29,L(tail2_bwd)
> + lwz 6,-4(r4)
> + stw 6,-4(r11)
> + bf 30,L(tail5_bwd)
> + lhz 7,-6(r4)
> + sth 7,-6(r11)
> + bflr 31
> + lbz 8,-7(r4)
> + stb 8,-7(r11)
> + /* Return original DST pointer. */
> + blr
> +
> + .align 4
> +/* Copies 2~3 bytes. */
> +L(tail2_bwd):
> + bf 30,1f
> + lhz 6,-2(r4)
> + sth 6,-2(r11)
> + bflr 31
> + lbz 7,-3(r4)
> + stb 7,-3(r11)
> + blr
> +
> + .align 4
> +L(tail5_bwd):
> + bflr 31
> + lbz 6,-5(r4)
> + stb 6,-5(r11)
> + blr
> +
> + .align 4
> +1:
> + bflr 31
> + lbz 6,-1(r4)
> + stb 6,-1(r11)
> + /* Return original DST pointer. */
> + blr
> +
> +
> +/* Handles copies of 0~8 bytes. */
> + .align 4
> +L(copy_LE_8_bwd):
> + bne cr6,L(tail4_bwd)
> +
> + /* Though we could've used ld/std here, they are still
> + slow for unaligned cases. */
> + lwz 6,-8(r4)
> + lwz 7,-4(r4)
> + stw 6,-8(r11)
> + stw 7,-4(r11)
> + blr
> +
> +
> +/* Handle copies of 32+ bytes where DST is aligned (to quadword) but
> + SRC is not. Use aligned quadword loads from SRC, shifted to realign
> + the data, allowing for aligned DST stores. */
> + .align 4
> +L(copy_GE_32_unaligned_bwd):
> + andi. r10,r11,15 /* Check alignment of DST against 16 bytes. */
> + srdi r9,r5,4 /* Number of full quadwords remaining. */
> +
> + beq L(copy_GE_32_unaligned_cont_bwd)
> +
> + /* DST is not quadword aligned and r10 holds the address masked to
> + compare alignments. */
> + mtocrf 0x01,r10
> + subf r5,r10,r5
> +
> + /* Vector instructions work best when proper alignment (16-bytes)
> + is present. Move 0~15 bytes as needed to get DST quadword-aligned. */
> +1:
> + bf 31,2f
> + lbz r6,-1(r4)
> + subi r4,r4,1
> + stb r6,-1(r11)
> + subi r11,r11,1
> +2:
> + bf 30,4f
> + lhz r6,-2(r4)
> + subi r4,r4,2
> + sth r6,-2(r11)
> + subi r11,r11,2
> +4:
> + bf 29,8f
> + lwz r6,-4(r4)
> + subi r4,r4,4
> + stw r6,-4(r11)
> + subi r11,r11,4
> +8:
> + bf 28,0f
> + ld r6,-8(r4)
> + subi r4,r4,8
> + std r6,-8(r11)
> + subi r11,r11,8
> +0:
> + srdi r9,r5,4 /* Number of full quadwords remaining. */
> +
> + /* The proper alignment is present, it is OK to copy the bytes now. */
> +L(copy_GE_32_unaligned_cont_bwd):
> +
> + /* Setup two indexes to speed up the indexed vector operations. */
> + clrldi r10,r5,60
> + li r6,-16 /* Index for 16-bytes offsets. */
> + li r7,-32 /* Index for 32-bytes offsets. */
> + cmpldi cr1,10,0
> + srdi r8,r5,5 /* Setup the loop counter. */
> + mtocrf 0x01,9
> + cmpldi cr6,r9,1
> +#ifdef __LITTLE_ENDIAN__
> + lvsr v5,r0,r4
> +#else
> + lvsl v5,r0,r4
> +#endif
> + lvx v3,0,r4
> + li r0,0
> + bf 31,L(setup_unaligned_loop_bwd)
> +
> + /* Copy another 16 bytes to align to 32-bytes due to the loop. */
> + lvx v4,r4,r6
> +#ifdef __LITTLE_ENDIAN__
> + vperm v6,v3,v4,v5
> +#else
> + vperm v6,v4,v3,v5
> +#endif
> + subi r4,r4,16
> + stvx v6,r11,r6
> + subi r11,r11,16
> + vor v3,v4,v4
> + clrrdi r0,r4,60
> +
> +L(setup_unaligned_loop_bwd):
> + mtctr r8
> + ble cr6,L(end_unaligned_loop_bwd)
> +
> + /* Copy 32 bytes at a time using vector instructions. */
> + .align 4
> +L(unaligned_loop_bwd):
> +
> + /* Note: vr6/vr10 may contain data that was already copied,
> + but in order to get proper alignment, we may have to copy
> + some portions again. This is faster than having unaligned
> + vector instructions though. */
> +
> + lvx v4,r4,r6
> +#ifdef __LITTLE_ENDIAN__
> + vperm v6,v3,v4,v5
> +#else
> + vperm v6,v4,v3,v5
> +#endif
> + lvx v3,r4,r7
> +#ifdef __LITTLE_ENDIAN__
> + vperm v10,v4,v3,v5
> +#else
> + vperm v10,v3,v4,v5
> +#endif
> + subi r4,r4,32
> + stvx v6,r11,r6
> + stvx v10,r11,r7
> + subi r11,r11,32
> + bdnz L(unaligned_loop_bwd)
> +
> + clrrdi r0,r4,60
> +
> + .align 4
> +L(end_unaligned_loop_bwd):
> +
> + /* Check for tail bytes. */
> + mtocrf 0x01,r5
> + beqlr cr1
> +
> + add r4,r4,0
> +
> + /* We have 1~15 tail bytes to copy, and DST is quadword aligned. */
> + /* Copy 8 bytes. */
> + bf 28,4f
> + lwz r6,-4(r4)
> + lwz r7,-8(r4)
> + subi r4,r4,8
> + stw r6,-4(r11)
> + stw r7,-8(r11)
> + subi r11,r11,8
> +4: /* Copy 4~7 bytes. */
> + bf 29,L(tail2_bwd)
> + lwz r6,-4(r4)
> + stw r6,-4(r11)
> + bf 30,L(tail5_bwd)
> + lhz r7,-6(r4)
> + sth r7,-6(r11)
> + bflr 31
> + lbz r8,-7(r4)
> + stb r8,-7(r11)
> + /* Return original DST pointer. */
> + blr
> +END_GEN_TB (MEMMOVE, TB_TOCLESS)
> +libc_hidden_builtin_def (memmove)
> +
> +
> +/* void bcopy(const void *src [r3], void *dest [r4], size_t n [r5])
> + Implemented in this file to avoid the linker creating a stub function
> + call for the branch to '_memmove'. */
> +ENTRY_TOCLESS (__bcopy)
> + mr r6,r3
> + mr r3,r4
> + mr r4,r6
> + b L(_memmove)
> +END (__bcopy)
> +#ifndef __bcopy
> +weak_alias (__bcopy, bcopy)
> +#endif
>