This is the mail archive of the
libc-alpha@sourceware.org
mailing list for the glibc project.
PowerPC: memset optimization for POWER8/PPC64
- From: Adhemerval Zanella <azanella at linux dot vnet dot ibm dot com>
- To: "GNU C. Library" <libc-alpha at sourceware dot org>
- Date: Fri, 18 Jul 2014 10:27:41 -0300
- Subject: PowerPC: memset optimization for POWER8/PPC64
- Authentication-results: sourceware.org; auth=none
This patch adds an optimized memset implementation for POWER8. For
sizes from 0 to 255 bytes, a word/doubleword algorithm similar to
POWER7 optimized one is used.
For size higher than 255 two strategies are used:
1. If the constant is different than 0, the memory is written with
altivec vector instruction;
2. If constant is 0, dbcz instructions are used. The loop is unrolled
to clear 512 byte at time.
Using vector instructions increases throughput considerable, with a double
performance for sizes larger than 1024. The dcbz loops unrolls also shows
performance improvement, by doubling throughput for sizes larger than
8192 bytes.
Tested on powerpc64 and powerpc64le (POWER8), GLIBC benchmark output attached.
--
* benchtests/bench-memset.c (test_main): Add more test from size
from 32 to 512 bytes.
* sysdeps/powerpc/powerpc64/multiarch/Makefile [sysdep_routines]:
Add POWER8 memset object.
* sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
(__libc_ifunc_impl_list): Add POWER8 memset and bzero implementations.
* sysdeps/powerpc/powerpc64/multiarch/bzero.c (__bzero): Add POWER8
implementation.
* sysdeps/powerpc/powerpc64/multiarch/memset.c (__libc_memset):
Likewise.
* sysdeps/powerpc/powerpc64/multiarch/memset-power8.S: New file:
multiarch POWER8 memset optimization.
* sysdeps/powerpc/powerpc64/power8/memset.S: New file: optimized
POWER8 memset optimization.
---
diff --git a/benchtests/bench-memset.c b/benchtests/bench-memset.c
index 5304113..2026593 100644
--- a/benchtests/bench-memset.c
+++ b/benchtests/bench-memset.c
@@ -150,6 +150,11 @@ test_main (void)
if (i & (i - 1))
do_test (0, c, i);
}
+ for (i = 32; i < 512; i+=32)
+ {
+ do_test (0, c, i);
+ do_test (i, c, i);
+ }
do_test (1, c, 14);
do_test (3, c, 1024);
do_test (4, c, 64);
diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile
index 0de3804..abc9d2e 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/Makefile
+++ b/sysdeps/powerpc/powerpc64/multiarch/Makefile
@@ -2,7 +2,7 @@ ifeq ($(subdir),string)
sysdep_routines += memcpy-power7 memcpy-a2 memcpy-power6 memcpy-cell \
memcpy-power4 memcpy-ppc64 memcmp-power7 memcmp-power4 \
memcmp-ppc64 memset-power7 memset-power6 memset-power4 \
- memset-ppc64 \
+ memset-ppc64 memset-power8 \
mempcpy-power7 mempcpy-ppc64 memchr-power7 memchr-ppc64 \
memrchr-power7 memrchr-ppc64 rawmemchr-power7 \
rawmemchr-ppc64 strlen-power7 strlen-ppc64 strnlen-power7 \
diff --git a/sysdeps/powerpc/powerpc64/multiarch/bzero.c b/sysdeps/powerpc/powerpc64/multiarch/bzero.c
index ed83541..298cf00 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/bzero.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/bzero.c
@@ -26,14 +26,17 @@ extern __typeof (bzero) __bzero_ppc attribute_hidden;
extern __typeof (bzero) __bzero_power4 attribute_hidden;
extern __typeof (bzero) __bzero_power6 attribute_hidden;
extern __typeof (bzero) __bzero_power7 attribute_hidden;
+extern __typeof (bzero) __bzero_power8 attribute_hidden;
libc_ifunc (__bzero,
- (hwcap & PPC_FEATURE_HAS_VSX)
- ? __bzero_power7 :
- (hwcap & PPC_FEATURE_ARCH_2_05)
+ (hwcap2 & PPC_FEATURE2_ARCH_2_07)
+ ? __bzero_power8 :
+ (hwcap & PPC_FEATURE_HAS_VSX)
+ ? __bzero_power7 :
+ (hwcap & PPC_FEATURE_ARCH_2_05)
? __bzero_power6 :
(hwcap & PPC_FEATURE_POWER4)
- ? __bzero_power4
+ ? __bzero_power4
: __bzero_ppc);
weak_alias (__bzero, bzero)
diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
index a574487..06d5be9 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
@@ -34,6 +34,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
size_t i = 0;
unsigned long int hwcap = GLRO(dl_hwcap);
+ unsigned long int hwcap2 = GLRO(dl_hwcap2);
+
/* hwcap contains only the latest supported ISA, the code checks which is
and fills the previous supported ones. */
if (hwcap & PPC_FEATURE_ARCH_2_06)
@@ -69,6 +71,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/powerpc/powerpc64/multiarch/memset.c. */
IFUNC_IMPL (i, name, memset,
+ IFUNC_IMPL_ADD (array, i, memset, hwcap2 & PPC_FEATURE2_ARCH_2_07,
+ __memset_power8)
IFUNC_IMPL_ADD (array, i, memset, hwcap & PPC_FEATURE_HAS_VSX,
__memset_power7)
IFUNC_IMPL_ADD (array, i, memset, hwcap & PPC_FEATURE_ARCH_2_05,
@@ -134,6 +138,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/powerpc/powerpc64/multiarch/bzero.c. */
IFUNC_IMPL (i, name, bzero,
+ IFUNC_IMPL_ADD (array, i, bzero, hwcap2 & PPC_FEATURE2_ARCH_2_07,
+ __bzero_power8)
IFUNC_IMPL_ADD (array, i, bzero, hwcap & PPC_FEATURE_HAS_VSX,
__bzero_power7)
IFUNC_IMPL_ADD (array, i, bzero, hwcap & PPC_FEATURE_ARCH_2_05,
diff --git a/sysdeps/powerpc/powerpc64/multiarch/memset-power8.S b/sysdeps/powerpc/powerpc64/multiarch/memset-power8.S
new file mode 100644
index 0000000..e8a604b
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/multiarch/memset-power8.S
@@ -0,0 +1,43 @@
+/* Optimized memset implementation for PowerPC64/POWER8.
+ Copyright (C) 2014 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+#undef EALIGN
+#define EALIGN(name, alignt, words) \
+ .section ".text"; \
+ ENTRY_2(__memset_power8) \
+ .align ALIGNARG(alignt); \
+ EALIGN_W_##words; \
+ BODY_LABEL(__memset_power8): \
+ cfi_startproc; \
+ LOCALENTRY(__memset_power8)
+
+#undef END_GEN_TB
+#define END_GEN_TB(name, mask) \
+ cfi_endproc; \
+ TRACEBACK_MASK(__memset_power8,mask) \
+ END_2(__memset_power8)
+
+#undef libc_hidden_builtin_def
+#define libc_hidden_builtin_def(name)
+
+#undef __bzero
+#define __bzero __bzero_power8
+
+#include <sysdeps/powerpc/powerpc64/power8/memset.S>
diff --git a/sysdeps/powerpc/powerpc64/multiarch/memset.c b/sysdeps/powerpc/powerpc64/multiarch/memset.c
index aa2ae70..9c7ed10 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/memset.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/memset.c
@@ -32,16 +32,19 @@ extern __typeof (__redirect_memset) __memset_ppc attribute_hidden;
extern __typeof (__redirect_memset) __memset_power4 attribute_hidden;
extern __typeof (__redirect_memset) __memset_power6 attribute_hidden;
extern __typeof (__redirect_memset) __memset_power7 attribute_hidden;
+extern __typeof (__redirect_memset) __memset_power8 attribute_hidden;
/* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle
ifunc symbol properly. */
libc_ifunc (__libc_memset,
- (hwcap & PPC_FEATURE_HAS_VSX)
- ? __memset_power7 :
- (hwcap & PPC_FEATURE_ARCH_2_05)
+ (hwcap2 & PPC_FEATURE2_ARCH_2_07)
+ ? __memset_power8 :
+ (hwcap & PPC_FEATURE_HAS_VSX)
+ ? __memset_power7 :
+ (hwcap & PPC_FEATURE_ARCH_2_05)
? __memset_power6 :
(hwcap & PPC_FEATURE_POWER4)
- ? __memset_power4
+ ? __memset_power4
: __memset_ppc);
#undef memset
diff --git a/sysdeps/powerpc/powerpc64/power8/memset.S b/sysdeps/powerpc/powerpc64/power8/memset.S
new file mode 100644
index 0000000..3379e97
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/power8/memset.S
@@ -0,0 +1,449 @@
+/* Optimized memset implementation for PowerPC64/POWER8.
+ Copyright (C) 2014 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+/* __ptr_t [r3] memset (__ptr_t s [r3], int c [r4], size_t n [r5]));
+ Returns 's'. */
+
+ .machine power8
+EALIGN (memset, 5, 0)
+ CALL_MCOUNT 3
+
+L(_memset):
+ cmpldi cr7,r5,31
+ neg r0,r3
+ mr r10,r3
+
+ insrdi r4,r4,8,48
+ insrdi r4,r4,16,32 /* Replicate byte to word. */
+ ble cr7,L(write_LT_32)
+
+ andi. r11,r10,r15 /* Check alignment of DST. */
+ insrdi r4,r4,32,0 /* Replicate word to double word. */
+
+ beq L(big_aligned)
+
+ mtocrf 0x01,r0
+ clrldi r0,r0,60
+
+ /* Get DST aligned to 16 bytes. */
+1: bf 31,2f
+ stb r4,0(r10)
+ addi r10,r10,1
+
+2: bf 30,4f
+ sth r4,0(r10)
+ addi r10,r10,2
+
+4: bf 29,8f
+ stw r4,0(r10)
+ addi r10,r10,4
+
+8: bf 28,16f
+ std r4,0(r10)
+ addi r10,r10,8
+
+16: subf r5,r0,r5
+
+ .align 4
+L(big_aligned):
+ /* For sizes larger than 255 two possible paths:
+ - if constant is '0', zero full cache lines with dcbz
+ - otherwise uses vector instructions. */
+ cmpldi cr5,r5,255
+ dcbtst 0,r10
+ cmpldi cr6,r4,0
+ crand 27,26,21
+ bt 27,L(huge_dcbz)
+ bge cr5,L(huge_vector)
+
+
+ /* Size betwen 32 and 255 bytes with constant different than 0, use
+ doubleword store instruction to achieve best throughput. */
+ srdi r8,r5,5
+ clrldi r11,r5,59
+ cmpldi cr6,r11,0
+ cmpdi r8,0
+ beq L(tail_bytes)
+ mtctr r8
+
+ /* Main aligned write loop, writes 32-bytes at a time. */
+ .align 4
+L(big_loop):
+ std 4,0(10)
+ std 4,8(10)
+ std 4,16(10)
+ std 4,24(10)
+ addi 10,10,32
+ bdz L(tail_bytes)
+
+ std 4,0(10)
+ std 4,8(10)
+ std 4,16(10)
+ std 4,24(10)
+ addi 10,10,32
+ bdnz L(big_loop)
+
+ b L(tail_bytes)
+
+ /* Write remaining 1~31 bytes. */
+ .align 4
+L(tail_bytes):
+ beqlr cr6
+
+ srdi r7,r11,4
+ clrldi r8,r11,60
+ mtocrf 0x01,r7
+
+ .align 4
+ bf 31,8f
+ std r4,0(r10)
+ std r4,8(r10)
+ addi r10,r10,16
+
+ .align 4
+8: mtocrf 0x1,r8
+ bf 28,4f
+ std r4,0(r10)
+ addi r10,r10,8
+
+ .align 4
+4: bf 29,2f
+ stw 4,0(10)
+ addi 10,10,4
+
+ .align 4
+2: bf 30,1f
+ sth 4,0(10)
+ addi 10,10,2
+
+ .align 4
+1: bflr 31
+ stb 4,0(10)
+ blr
+
+ /* Size larger than 255 bytes with constant different than 0, use
+ vector instruction to achieve best throughput. */
+L(huge_vector):
+ /* Replicate set byte to quardword in VMX register. */
+ mtvsrd v1,r4
+ xxpermdi 32,v0,v1,0
+ vspltb v2,v0,15
+
+ /* Main aligned write loop: 128 bytes at a time. */
+ li r6,16
+ li r7,32
+ li r8,48
+ mtocrf 0x02,r5
+ srdi r12,r5,7
+ cmpdi r12,0
+ beq L(aligned_tail)
+ mtctr r12
+ b L(aligned_128loop)
+
+ .align 4
+L(aligned_128loop):
+ stvx v2,0,r10
+ stvx v2,r10,r6
+ stvx v2,r10,r7
+ stvx v2,r10,r8
+ addi 10,r10,64
+ stvx v2,0,r10
+ stvx v2,r10,r6
+ stvx v2,r10,r7
+ stvx v2,r10,r8
+ addi r10,r10,64
+ bdnz L(aligned_128loop)
+
+ /* Write remaining 1~127 bytes. */
+L(aligned_tail):
+ mtocrf 0x01,r5
+ bf 25,32f
+ stvx v2,0,r10
+ stvx v2,r10,r6
+ stvx v2,r10,r7
+ stvx v2,r10,r8
+ addi r10,r10,64
+
+32: bf 26,16f
+ stvx v2,0,r10
+ stvx v2,r10,r6
+ addi r10,r10,32
+
+16: bf 27,8f
+ stvx v2,0,r10
+ addi r10,r10,16
+
+8: bf 28,4f
+ std r4,0(r10)
+ addi r10,r10,8
+
+ /* Copies 4~7 bytes. */
+4: bf 29,L(tail2)
+ stw r4,0(r10)
+ bf 30,L(tail5)
+ sth r4,4(r10)
+ bflr 31
+ stb r4,6(r10)
+ /* Return original DST pointer. */
+ blr
+
+ /* Special case when value is 0 and we have a long length to deal
+ with. Use dcbz to zero out a full cacheline of 128-bytes at a time.
+ Before using dcbz though, we need to get the destination 128-bytes
+ aligned. */
+ .align 4
+L(huge_dcbz):
+ andi. r11,r10,127
+ neg r0,r10
+ beq L(huge_dcbz_aligned)
+
+ clrldi r0,r0,57
+ subf r5,r0,r5
+ srdi r0,r0,3
+ mtocrf 0x01,r0
+
+ /* Write 1~128 bytes until DST is aligned to 128 bytes. */
+8: bf 28,4f
+
+ std r4,0(r10)
+ std r4,8(r10)
+ std r4,16(r10)
+ std r4,24(r10)
+ std r4,32(r10)
+ std r4,40(r10)
+ std r4,48(r10)
+ std r4,56(r10)
+ addi r10,r10,64
+
+ .align 4
+4: bf 29,2f
+ std r4,0(r10)
+ std r4,8(r10)
+ std r4,16(r10)
+ std r4,24(r10)
+ addi r10,r10,32
+
+ .align 4
+2: bf 30,1f
+ std r4,0(r10)
+ std r4,8(r10)
+ addi r10,r10,16
+
+ .align 4
+1: bf 31,L(huge_dcbz_aligned)
+ std r4,0(r10)
+ addi r10,r10,8
+
+L(huge_dcbz_aligned):
+ /* Setup dcbz unroll offsets and count numbers. */
+ srdi r8,r5,9
+ clrldi r11,r5,55
+ cmpldi cr6,r11,0
+ li r9,128
+ cmpdi r8,0
+ beq L(huge_tail)
+ li r7,256
+ li r6,384
+ mtctr r8
+
+ .align 4
+L(huge_loop):
+ /* Sets 512 bytes to zero in each iteration, the loop unrolling shows
+ a throughput boost for large sizes (2048 bytes or higher). */
+ dcbz 0,r10
+ dcbz r9,r10
+ dcbz r7,r10
+ dcbz r6,r10
+ addi r10,r10,512
+ bdnz L(huge_loop)
+
+ beqlr cr6
+
+L(huge_tail):
+ srdi r6,r11,8
+ srdi r7,r11,4
+ clrldi r8,r11,4
+ cmpldi cr6,r8,0
+ mtocrf 0x01,r6
+
+ beq cr6,L(tail)
+
+ /* We have 1~511 bytes remaining. */
+ .align 4
+32: bf 31,16f
+ dcbz 0,r10
+ dcbz r9,r10
+ addi r10,r10,256
+
+ .align 4
+16: mtocrf 0x01,r7
+ bf 28,8f
+ dcbz 0,r10
+ addi r10,r10,128
+
+ .align 4
+8: bf 29,4f
+ std r4,0(r10)
+ std r4,8(r10)
+ std r4,16(r10)
+ std r4,24(r10)
+ std r4,32(r10)
+ std r4,40(r10)
+ std r4,48(r10)
+ std r4,56(r10)
+ addi r10,r10,64
+
+ .align 4
+4: bf 30,2f
+ std r4,0(r10)
+ std r4,8(r10)
+ std r4,16(r10)
+ std r4,24(r10)
+ addi r10,r10,32
+
+ .align 4
+2: bf 31,L(tail)
+ std r4,0(r10)
+ std r4,8(r10)
+ addi r10,r10,16
+ .align 4
+
+ /* Remaining 1~15 bytes. */
+L(tail):
+ mtocrf 0x01,r8
+
+ .align
+8: bf 28,4f
+ std r4,0(r10)
+ addi r10,r10,8
+
+ .align 4
+4: bf 29,2f
+ stw r4,0(r10)
+ addi r10,r10,4
+
+ .align 4
+2: bf 30,1f
+ sth r4,0(r10)
+ addi r10,r10,2
+
+ .align 4
+1: bflr 31
+ stb r4,0(r10)
+ blr
+
+ /* Handle short copies of 0~31 bytes. Best throughput is achieved
+ by just unrolling all operations. */
+ .align 4
+L(write_LT_32):
+ cmpldi cr6,5,8
+ mtocrf 0x01,5
+ ble cr6,L(write_LE_8)
+
+ /* At least 9 bytes to go. */
+ neg r8,r4
+ andi. r0,r8,3
+ cmpldi cr1,r5,16
+ beq L(write_LT_32_aligned)
+
+ /* Force 4-byte alignment for SRC. */
+ mtocrf 0x01,r0
+ subf r5,r0,r5
+
+2: bf 30,1f
+ sth r4,0(r10)
+ addi r10,r10,2
+
+1: bf 31,L(end_4bytes_alignment)
+ stb r4,0(r10)
+ addi r10,r10,1
+
+ .align 4
+L(end_4bytes_alignment):
+ cmpldi cr1,r5,16
+ mtocrf 0x01,r5
+
+L(write_LT_32_aligned):
+ blt cr1,8f
+
+ stw r4,0(r10)
+ stw r4,4(r10)
+ stw r4,8(r10)
+ stw r4,12(r10)
+ addi r10,r10,16
+
+8: bf 28,L(tail4)
+ stw r4,0(r10)
+ stw r4,4(r10)
+ addi r10,r10,8
+
+ .align 4
+ /* Copies 4~7 bytes. */
+L(tail4):
+ bf 29,L(tail2)
+ stw r4,0(r10)
+ bf 30,L(tail5)
+ sth r4,4(r10)
+ bflr 31
+ stb r4,6(r10)
+ blr
+
+ .align 4
+ /* Copies 2~3 bytes. */
+L(tail2):
+ bf 30,1f
+ sth r4,0(r10)
+ bflr 31
+ stb r4,2(r10)
+ blr
+
+ .align 4
+L(tail5):
+ bflr 31
+ stb r4,4(r10)
+ blr
+
+ .align 4
+1: bflr 31
+ stb r4,0(r10)
+ blr
+
+ /* Handles copies of 0~8 bytes. */
+ .align 4
+L(write_LE_8):
+ bne cr6,L(tail4)
+
+ stw r4,0(r10)
+ stw r4,4(r10)
+ blr
+END_GEN_TB (memset,TB_TOCLESS)
+libc_hidden_builtin_def (memset)
+
+/* Copied from bzero.S to prevent the linker from inserting a stub
+ between bzero and memset. */
+ENTRY (__bzero)
+ CALL_MCOUNT 3
+ mr r5,r4
+ li r4,0
+ b L(_memset)
+END (__bzero)
+#ifndef __bzero
+weak_alias (__bzero, bzero)
+#endif
>From cbd995ed00ca74befbd2ecab26956b90ae627bcd Mon Sep 17 00:00:00 2001
From: Adhemerval Zanella <azanella@linux.vnet.ibm.com>
Date: Tue, 15 Jul 2014 12:19:09 -0400
Subject: [PATCH] PowerPC: memset optimization for POWER8/PPC64
This patch adds an optimized memset implementation for POWER8 by using
vector instruction to write into memory, showing a throughput boost
for sizes large than 256 (with sizes of 8192 showing double
performance).
For constant 0 (bzero) the dcbz loop in unrollep to issue 4 instruction
each interation (512 bytes). It doubles throughput with sizes larger
than 2048 bytes.
---
sysdeps/powerpc/powerpc64/multiarch/Makefile | 2 +-
sysdeps/powerpc/powerpc64/multiarch/bzero.c | 11 +-
.../powerpc/powerpc64/multiarch/ifunc-impl-list.c | 6 +
.../powerpc/powerpc64/multiarch/memset-power8.S | 43 +++
sysdeps/powerpc/powerpc64/multiarch/memset.c | 11 +-
sysdeps/powerpc/powerpc64/power8/memset.S | 389 +++++++++++++++++++++
6 files changed, 453 insertions(+), 9 deletions(-)
create mode 100644 sysdeps/powerpc/powerpc64/multiarch/memset-power8.S
create mode 100644 sysdeps/powerpc/powerpc64/power8/memset.S
diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile
index 82722fb..aeab813 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/Makefile
+++ b/sysdeps/powerpc/powerpc64/multiarch/Makefile
@@ -19,7 +19,7 @@ sysdep_routines += memcpy-power7 memcpy-a2 memcpy-power6 memcpy-cell \
strpbrk-power7 strpbrk-ppc64 strncpy-power7 strncpy-ppc64 \
stpncpy-power7 stpncpy-ppc64 strcmp-power7 strcmp-ppc64 \
strcat-power7 strcat-ppc64 memmove-power7 memmove-ppc64 \
- bcopy-ppc64
+ bcopy-ppc64 memset-power8
CFLAGS-strncase-power7.c += -mcpu=power7 -funroll-loops
CFLAGS-strncase_l-power7.c += -mcpu=power7 -funroll-loops
diff --git a/sysdeps/powerpc/powerpc64/multiarch/bzero.c b/sysdeps/powerpc/powerpc64/multiarch/bzero.c
index ed83541..298cf00 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/bzero.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/bzero.c
@@ -26,14 +26,17 @@ extern __typeof (bzero) __bzero_ppc attribute_hidden;
extern __typeof (bzero) __bzero_power4 attribute_hidden;
extern __typeof (bzero) __bzero_power6 attribute_hidden;
extern __typeof (bzero) __bzero_power7 attribute_hidden;
+extern __typeof (bzero) __bzero_power8 attribute_hidden;
libc_ifunc (__bzero,
- (hwcap & PPC_FEATURE_HAS_VSX)
- ? __bzero_power7 :
- (hwcap & PPC_FEATURE_ARCH_2_05)
+ (hwcap2 & PPC_FEATURE2_ARCH_2_07)
+ ? __bzero_power8 :
+ (hwcap & PPC_FEATURE_HAS_VSX)
+ ? __bzero_power7 :
+ (hwcap & PPC_FEATURE_ARCH_2_05)
? __bzero_power6 :
(hwcap & PPC_FEATURE_POWER4)
- ? __bzero_power4
+ ? __bzero_power4
: __bzero_ppc);
weak_alias (__bzero, bzero)
diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
index a574487..06d5be9 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
@@ -34,6 +34,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
size_t i = 0;
unsigned long int hwcap = GLRO(dl_hwcap);
+ unsigned long int hwcap2 = GLRO(dl_hwcap2);
+
/* hwcap contains only the latest supported ISA, the code checks which is
and fills the previous supported ones. */
if (hwcap & PPC_FEATURE_ARCH_2_06)
@@ -69,6 +71,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/powerpc/powerpc64/multiarch/memset.c. */
IFUNC_IMPL (i, name, memset,
+ IFUNC_IMPL_ADD (array, i, memset, hwcap2 & PPC_FEATURE2_ARCH_2_07,
+ __memset_power8)
IFUNC_IMPL_ADD (array, i, memset, hwcap & PPC_FEATURE_HAS_VSX,
__memset_power7)
IFUNC_IMPL_ADD (array, i, memset, hwcap & PPC_FEATURE_ARCH_2_05,
@@ -134,6 +138,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/powerpc/powerpc64/multiarch/bzero.c. */
IFUNC_IMPL (i, name, bzero,
+ IFUNC_IMPL_ADD (array, i, bzero, hwcap2 & PPC_FEATURE2_ARCH_2_07,
+ __bzero_power8)
IFUNC_IMPL_ADD (array, i, bzero, hwcap & PPC_FEATURE_HAS_VSX,
__bzero_power7)
IFUNC_IMPL_ADD (array, i, bzero, hwcap & PPC_FEATURE_ARCH_2_05,
diff --git a/sysdeps/powerpc/powerpc64/multiarch/memset-power8.S b/sysdeps/powerpc/powerpc64/multiarch/memset-power8.S
new file mode 100644
index 0000000..70d83f0
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/multiarch/memset-power8.S
@@ -0,0 +1,43 @@
+/* Optimized memset implementation for PowerPC64/POWER7.
+ Copyright (C) 2014 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+#undef EALIGN
+#define EALIGN(name, alignt, words) \
+ .section ".text"; \
+ ENTRY_2(__memset_power8) \
+ .align ALIGNARG(alignt); \
+ EALIGN_W_##words; \
+ BODY_LABEL(__memset_power8): \
+ cfi_startproc; \
+ LOCALENTRY(__memset_power8)
+
+#undef END_GEN_TB
+#define END_GEN_TB(name, mask) \
+ cfi_endproc; \
+ TRACEBACK_MASK(__memset_power8,mask) \
+ END_2(__memset_power8)
+
+#undef libc_hidden_builtin_def
+#define libc_hidden_builtin_def(name)
+
+#undef __bzero
+#define __bzero __bzero_power8
+
+#include <sysdeps/powerpc/powerpc64/power8/memset.S>
diff --git a/sysdeps/powerpc/powerpc64/multiarch/memset.c b/sysdeps/powerpc/powerpc64/multiarch/memset.c
index aa2ae70..9c7ed10 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/memset.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/memset.c
@@ -32,16 +32,19 @@ extern __typeof (__redirect_memset) __memset_ppc attribute_hidden;
extern __typeof (__redirect_memset) __memset_power4 attribute_hidden;
extern __typeof (__redirect_memset) __memset_power6 attribute_hidden;
extern __typeof (__redirect_memset) __memset_power7 attribute_hidden;
+extern __typeof (__redirect_memset) __memset_power8 attribute_hidden;
/* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle
ifunc symbol properly. */
libc_ifunc (__libc_memset,
- (hwcap & PPC_FEATURE_HAS_VSX)
- ? __memset_power7 :
- (hwcap & PPC_FEATURE_ARCH_2_05)
+ (hwcap2 & PPC_FEATURE2_ARCH_2_07)
+ ? __memset_power8 :
+ (hwcap & PPC_FEATURE_HAS_VSX)
+ ? __memset_power7 :
+ (hwcap & PPC_FEATURE_ARCH_2_05)
? __memset_power6 :
(hwcap & PPC_FEATURE_POWER4)
- ? __memset_power4
+ ? __memset_power4
: __memset_ppc);
#undef memset
diff --git a/sysdeps/powerpc/powerpc64/power8/memset.S b/sysdeps/powerpc/powerpc64/power8/memset.S
new file mode 100644
index 0000000..ce41a63
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/power8/memset.S
@@ -0,0 +1,389 @@
+/* Optimized memset implementation for PowerPC64/POWER7.
+ Copyright (C) 2010-2014 Free Software Foundation, Inc.
+ Contributed by Luis Machado <luisgpm@br.ibm.com>.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+/* __ptr_t [r3] memset (__ptr_t s [r3], int c [r4], size_t n [r5]));
+ Returns 's'. */
+
+ .machine power8
+EALIGN (memset, 5, 0)
+ CALL_MCOUNT 3
+
+L(_memset):
+ cmpldi cr7,r5,31
+ neg r0,r3
+ mr r10,r3
+
+ /* Replicate byte to word. */
+ insrdi r4,r4,8,48
+ insrdi r4,r4,16,32
+ ble cr7,L(copy_LT_32)
+
+ andi. r11,r10,r15 /* Check alignment of SRC. */
+ insrdi r4,r4,32,0 /* Replicate word to double word. */
+
+ mr r12,r5
+ beq L(big_aligned)
+
+ mtocrf 0x01,r0
+ clrldi r0,r0,60
+
+ /* Get DST aligned to 16 bytes. */
+1: bf 31,2f
+ stb r4,0(r10)
+ addi r10,r10,1
+
+2: bf 30,4f
+ sth r4,0(r10)
+ addi r10,r10,2
+
+4: bf 29,8f
+ stw r4,0(r10)
+ addi r10,r10,4
+
+8: bf 28,16f
+ std r4,0(r10)
+ addi r10,r10,8
+
+16: subf r5,r0,r5
+
+ .align 4
+L(big_aligned):
+ cmpldi cr5,r5,255
+ li r0,32
+ dcbtst 0,r10
+ cmpldi cr6,r4,0
+ srdi r9,r5,3 /* Number of full doublewords remaining. */
+ crand 27,26,21
+ mtocrf 0x01,r9
+ bt 27,L(huge)
+
+ /* From this point on, we'll copy 32+ bytes and the value
+ isn't 0 (so we can't use dcbz). */
+
+ /* Replicate set byte to quardword in VMX register. */
+ mtvsrd v1,r4
+ xxpermdi 32,v0,v1,0
+ vspltb v2,v0,15
+
+ /* Main aligned write loop: 128 bytes at a time. */
+ li r6,16
+ li r7,32
+ li r8,48
+ mtocrf 0x02,r5
+ srdi r12,r5,7
+ cmpdi r12,0
+ beq L(aligned_tail)
+ mtctr r12
+ b L(aligned_128loop)
+
+ .align 4
+L(aligned_128loop):
+ stvx v2,0,r10
+ stvx v2,r10,r6
+ stvx v2,r10,r7
+ stvx v2,r10,r8
+ addi 10,r10,64
+ stvx v2,0,r10
+ stvx v2,r10,r6
+ stvx v2,r10,r7
+ stvx v2,r10,r8
+ addi r10,r10,64
+ bdnz L(aligned_128loop)
+
+ /* Write remaining 1~127 bytes. */
+L(aligned_tail):
+ mtocrf 0x01,r5
+ bf 25,32f
+ stvx v2,0,r10
+ stvx v2,r10,r6
+ stvx v2,r10,r7
+ stvx v2,r10,r8
+ addi r10,r10,64
+
+32: bf 26,16f
+ stvx v2,0,r10
+ stvx v2,r10,r6
+ addi r10,r10,32
+
+16: bf 27,8f
+ stvx v2,0,r10
+ addi r10,r10,16
+
+8: bf 28,4f
+ std r4,0(r10)
+ addi r10,r10,8
+
+ /* Copies 4~7 bytes. */
+4: bf 29,L(tail2)
+ stw r4,0(r10)
+ bf 30,L(tail5)
+ sth r4,4(r10)
+ bflr 31
+ stb r4,6(r10)
+ /* Return original DST pointer. */
+ blr
+
+ /* Special case when value is 0 and we have a long length to deal
+ with. Use dcbz to zero out 128-bytes at a time. Before using
+ dcbz though, we need to get the destination 128-bytes aligned. */
+ .align 4
+L(huge):
+ andi. r11,r10,127
+ neg r0,r10
+ beq L(huge_aligned)
+
+ clrldi r0,r0,57
+ subf r5,r0,r5
+ srdi r0,r0,3
+ mtocrf 0x01,r0
+
+ /* Write 1~128 bytes until DST is aligned to 128 bytes. */
+8: bf 28,4f
+
+ std r4,0(r10)
+ std r4,8(r10)
+ std r4,16(r10)
+ std r4,24(r10)
+ std r4,32(r10)
+ std r4,40(r10)
+ std r4,48(r10)
+ std r4,56(r10)
+ addi r10,r10,64
+
+ .align 4
+4: bf 29,2f
+ std r4,0(r10)
+ std r4,8(r10)
+ std r4,16(r10)
+ std r4,24(r10)
+ addi r10,r10,32
+
+ .align 4
+2: bf 30,1f
+ std r4,0(r10)
+ std r4,8(r10)
+ addi r10,r10,16
+
+ .align 4
+1: bf 31,L(huge_aligned)
+ std r4,0(r10)
+ addi r10,r10,8
+
+L(huge_aligned):
+ srdi r8,r5,9
+ clrldi r11,r5,55
+ cmpldi cr6,r11,0
+ li r9,128
+ cmpdi r8,0
+ beq L(huge_tail)
+ li r7,256
+ li r6,384
+ mtctr r8
+
+ .align 4
+L(huge_loop):
+ /* Sets 512 bytes to zero in each iteration, the loop unrolling shows
+ a throughput boost for large sizes (2048 bytes or higher). */
+ dcbz 0,r10
+ dcbz r9,r10
+ dcbz r7,r10
+ dcbz r6,r10
+ addi r10,r10,512
+ bdnz L(huge_loop)
+
+ beqlr cr6
+
+L(huge_tail):
+ srdi r6,r11,8
+ srdi r7,r11,4
+ clrldi r8,r11,4
+ cmpldi cr6,r8,0
+ mtocrf 0x01,r6
+
+ beq cr6,L(tail)
+
+ /* We have 1~511 bytes remaining. */
+ .align 4
+32: bf 31,16f
+ dcbz 0,r10
+ dcbz r9,r10
+ addi r10,r10,256
+
+ .align 4
+16: mtocrf 0x01,r7
+ bf 28,8f
+ dcbz 0,r10
+ addi r10,r10,128
+
+ .align 4
+8: bf 29,4f
+ std r4,0(r10)
+ std r4,8(r10)
+ std r4,16(r10)
+ std r4,24(r10)
+ std r4,32(r10)
+ std r4,40(r10)
+ std r4,48(r10)
+ std r4,56(r10)
+ addi r10,r10,64
+
+ .align 4
+4: bf 30,2f
+ std r4,0(r10)
+ std r4,8(r10)
+ std r4,16(r10)
+ std r4,24(r10)
+ addi r10,r10,32
+
+ .align 4
+2: bf 31,L(tail)
+ std r4,0(r10)
+ std r4,8(r10)
+ addi r10,r10,16
+ .align 4
+
+ /* Remaining 1~15 bytes. */
+L(tail):
+ mtocrf 0x01,r8
+
+ .align
+8: bf 28,4f
+ std r4,0(r10)
+ addi r10,r10,8
+
+ .align 4
+4: bf 29,2f
+ stw r4,0(r10)
+ addi r10,r10,4
+
+ .align 4
+2: bf 30,1f
+ sth r4,0(r10)
+ addi r10,r10,2
+
+ .align 4
+1: bflr 31
+ stb r4,0(r10)
+ blr
+
+ /* Handle copies of 0~31 bytes. */
+ .align 4
+L(copy_LT_32):
+ cmpldi cr6,5,8
+ mtocrf 0x01,5
+ ble cr6,L(copy_LE_8)
+
+ /* At least 9 bytes to go. */
+ neg r8,r4
+ andi. r0,r8,3
+ cmpldi cr1,r5,16
+ beq L(copy_LT_32_aligned)
+
+ /* Force 4-byte alignment for SRC. */
+ mtocrf 0x01,r0
+ subf r5,r0,r5
+
+2: bf 30,1f
+ sth r4,0(r10)
+ addi r10,r10,2
+
+1: bf 31,L(end_4bytes_alignment)
+ stb r4,0(r10)
+ addi r10,r10,1
+
+ .align 4
+L(end_4bytes_alignment):
+ cmpldi cr1,r5,16
+ mtocrf 0x01,r5
+
+L(copy_LT_32_aligned):
+ /* At least 6 bytes to go, and SRC is word-aligned. */
+ blt cr1,8f
+
+ /* Copy 16 bytes. */
+ stw r4,0(r10)
+ stw r4,4(r10)
+ stw r4,8(r10)
+ stw r4,12(r10)
+ addi r10,r10,16
+
+8: /* Copy 8 bytes. */
+ bf 28,L(tail4)
+ stw r4,0(r10)
+ stw r4,4(r10)
+ addi r10,r10,8
+
+ .align 4
+ /* Copies 4~7 bytes. */
+L(tail4):
+ bf 29,L(tail2)
+ stw r4,0(r10)
+ bf 30,L(tail5)
+ sth r4,4(r10)
+ bflr 31
+ stb r4,6(r10)
+ blr
+
+ .align 4
+ /* Copies 2~3 bytes. */
+L(tail2):
+ bf 30,1f
+ sth r4,0(r10)
+ bflr 31
+ stb r4,2(r10)
+ blr
+
+ .align 4
+L(tail5):
+ bflr 31
+ stb r4,4(r10)
+ blr
+
+ .align 4
+1: bflr 31
+ stb r4,0(r10)
+ /* Return original DST pointer. */
+ blr
+
+ /* Handles copies of 0~8 bytes. */
+ .align 4
+L(copy_LE_8):
+ bne cr6,L(tail4)
+
+ /* Though we could've used ld/std here, they are still
+ slow for unaligned cases. */
+ stw r4,0(r10)
+ stw r4,4(r10)
+ blr
+END_GEN_TB (memset,TB_TOCLESS)
+libc_hidden_builtin_def (memset)
+
+/* Copied from bzero.S to prevent the linker from inserting a stub
+ between bzero and memset. */
+ENTRY (__bzero)
+ CALL_MCOUNT 3
+ mr r5,r4
+ li r4,0
+ b L(_memset)
+END (__bzero)
+#ifndef __bzero
+weak_alias (__bzero, bzero)
+#endif
--
1.8.2.1