+2019-08-01 Raoni Fassina Firmino <raoni@linux.ibm.com>
+
+ * sysdeps/powerpc/powerpc64/power8/memchr.S: Update power8
+ mnemonics and set .machine power8.
+ * sysdeps/powerpc/powerpc64/power8/memcmp.S: Likewise.
+ * sysdeps/powerpc/powerpc64/power8/memrchr.S: Likewise.
+ * sysdeps/powerpc/powerpc64/power8/memset.S: Likewise.
+ * sysdeps/powerpc/powerpc64/power8/strchr.S: Likewise.
+ * sysdeps/powerpc/powerpc64/power8/strlen.S: Likewise.
+ * sysdeps/powerpc/powerpc64/power8/strncmp.S: Likewise.
+ * sysdeps/powerpc/powerpc64/power8/strncpy.S: Likewise.
+ * sysdeps/powerpc/powerpc64/power8/strnlen.S: Likewise.
+ * sysdeps/powerpc/powerpc64/power8/strrchr.S: Likewise.
+ * sysdeps/powerpc/powerpc64/power8/strspn.S: Likewise.
+
2019-08-01 Adhemerval Zanella <adhemerval.zanella@linaro.org>
* sysdeps/hppa/fpu/libm-test-ulps: Update.
/* void *[r3] memchr (const void *s [r3], int c [r4], size_t n [r5]) */
-/* TODO: change these to the actual instructions when the minimum required
- binutils allows it. */
-#define MTVRD(v, r) .long (0x7c000167 | ((v)<<(32-11)) | ((r)<<(32-16)))
-#define MFVRD(r, v) .long (0x7c000067 | ((v)<<(32-11)) | ((r)<<(32-16)))
-#define VBPERMQ(t, a, b) .long (0x1000054c \
- | ((t)<<(32-11)) \
- | ((a)<<(32-16)) \
- | ((b)<<(32-21)) )
-
#ifndef MEMCHR
# define MEMCHR __memchr
#endif
-/* TODO: change this to .machine power8 when the minimum required binutils
- allows it. */
- .machine power7
+ .machine power8
ENTRY_TOCLESS (MEMCHR)
CALL_MCOUNT 3
dcbt 0, r3
li r0, 0
lvsl v11, r0, r0
vslb v10, v11, v10
- MTVRD(v1, r4)
+ mtvrd v1, r4
vspltb v1, v1, 7
cmpldi r5, 64
ble L(tail64)
.align 4
L(found):
/* Permute the first bit of each byte into bits 48-63. */
- VBPERMQ(v6, v6, v10)
- VBPERMQ(v7, v7, v10)
- VBPERMQ(v8, v8, v10)
- VBPERMQ(v9, v9, v10)
+ vbpermq v6, v6, v10
+ vbpermq v7, v7, v10
+ vbpermq v8, v8, v10
+ vbpermq v9, v9, v10
/* Shift each component into its correct position for merging. */
#ifdef __LITTLE_ENDIAN__
vsldoi v7, v7, v7, 2
vor v11, v6, v7
vor v4, v9, v8
vor v4, v11, v4
- MFVRD(r5, v4)
+ mfvrd r5, v4
#ifdef __LITTLE_ENDIAN__
addi r6, r5, -1
andc r6, r6, r5
.align 4
L(found_16B):
/* Permute the first bit of each byte into bits 48-63. */
- VBPERMQ(v6, v6, v10)
+ vbpermq v6, v6, v10
/* Shift each component into its correct position for merging. */
#ifdef __LITTLE_ENDIAN__
- MFVRD(r7, v6)
+ mfvrd r7, v6
addi r6, r7, -1
andc r6, r6, r7
popcntd r6, r6
#else
vsldoi v6, v6, v6, 6
- MFVRD(r7, v6)
+ mfvrd r7, v6
cntlzd r6, r7 /* Count leading zeros before the match. */
#endif
add r3, r8, r6 /* Compute final length. */
const char *s2 [r4],
size_t size [r5]) */
-/* TODO: change these to the actual instructions when the minimum required
- binutils allows it. */
-#define MFVRD(r,v) .long (0x7c000067 | ((v)<<(32-11)) | ((r)<<(32-16)))
#ifndef MEMCMP
# define MEMCMP memcmp
#endif
- .machine power7
+ .machine power8
ENTRY_TOCLESS (MEMCMP, 4)
CALL_MCOUNT 3
vperm v4, v4, v0, v9
vperm v5, v5, v0, v9
#endif
- MFVRD(r7, v4)
- MFVRD(r9, v5)
+ mfvrd r7, v4
+ mfvrd r9, v5
cmpld cr6, r7, r9
bne cr6, L(ret_diff)
/* Difference in second DW. */
vsldoi v4, v4, v4, 8
vsldoi v5, v5, v5, 8
- MFVRD(r7, v4)
- MFVRD(r9, v5)
+ mfvrd r7, v4
+ mfvrd r9, v5
cmpld cr6, r7, r9
L(ret_diff):
li rRTN, 1
vperm v6, v6, v0, v9
vperm v8, v8, v0, v9
#endif
- MFVRD(r7, v6)
- MFVRD(r9, v8)
+ mfvrd r7, v6
+ mfvrd r9, v8
cmpld cr6, r7, r9
bne cr6, L(ret_diff)
/* Difference in second DW. */
vsldoi v6, v6, v6, 8
vsldoi v8, v8, v8, 8
- MFVRD(r7, v6)
- MFVRD(r9, v8)
+ mfvrd r7, v6
+ mfvrd r9, v8
cmpld cr6, r7, r9
li rRTN, 1
bgtlr cr6
/* int [r3] memrchr (char *s [r3], int byte [r4], int size [r5]) */
-/* TODO: change these to the actual instructions when the minimum required
- binutils allows it. */
-#define MTVRD(v, r) .long (0x7c000167 | ((v)<<(32-11)) | ((r)<<(32-16)))
-#define MFVRD(r, v) .long (0x7c000067 | ((v)<<(32-11)) | ((r)<<(32-16)))
-#define VBPERMQ(t, a, b) .long (0x1000054c \
- | ((t)<<(32-11)) \
- | ((a)<<(32-16)) \
- | ((b)<<(32-21)) )
#ifndef MEMRCHR
# define MEMRCHR __memrchr
#endif
- .machine power7
+ .machine power8
ENTRY_TOCLESS (MEMRCHR)
CALL_MCOUNT 3
add r7, r3, r5 /* Calculate the last acceptable address. */
li r0, 0
lvsl v11, r0, r0
vslb v10, v11, v10
- MTVRD(v1, r4)
+ mtvrd v1, r4
vspltb v1, v1, 7
cmpldi r5, 64
ble L(tail64)
.align 4
L(found):
/* Permute the first bit of each byte into bits 48-63. */
- VBPERMQ(v6, v6, v10)
- VBPERMQ(v7, v7, v10)
- VBPERMQ(v8, v8, v10)
- VBPERMQ(v9, v9, v10)
+ vbpermq v6, v6, v10
+ vbpermq v7, v7, v10
+ vbpermq v8, v8, v10
+ vbpermq v9, v9, v10
/* Shift each component into its correct position for merging. */
#ifdef __LITTLE_ENDIAN__
vsldoi v7, v7, v7, 2
vor v11, v6, v7
vor v4, v9, v8
vor v4, v11, v4
- MFVRD(r5, v4)
+ mfvrd r5, v4
#ifdef __LITTLE_ENDIAN__
cntlzd r6, r5 /* Count leading zeros before the match. */
#else
bge L(last)
/* Now discard bytes before starting address. */
sub r9, r10, r8
- MTVRD(v9, r9)
+ mtvrd v9, r9
vspltisb v8, 3
/* Mask unwanted bytes. */
#ifdef __LITTLE_ENDIAN__
#endif
L(last):
/* Permute the first bit of each byte into bits 48-63. */
- VBPERMQ(v6, v6, v10)
+ vbpermq v6, v6, v10
/* Shift each component into its correct position for merging. */
#ifdef __LITTLE_ENDIAN__
vsldoi v6, v6, v6, 6
- MFVRD(r7, v6)
+ mfvrd r7, v6
cntlzd r6, r7 /* Count leading zeros before the match. */
#else
- MFVRD(r7, v6)
+ mfvrd r7, v6
addi r6, r7, -1
andc r6, r6, r7
popcntd r6, r6
#include <sysdep.h>
-#define MTVSRD_V1_R4 .long 0x7c240166 /* mtvsrd v1,r4 */
-
/* void * [r3] memset (void *s [r3], int c [r4], size_t n [r5]));
Returns 's'. */
#ifndef MEMSET
# define MEMSET memset
#endif
-
- /* No need to use .machine power8 since mtvsrd is already
- handled by the define. It avoid breakage on binutils
- that does not support this machine specifier. */
- .machine power7
+ .machine power8
ENTRY_TOCLESS (MEMSET, 5)
CALL_MCOUNT 3
vector instruction to achieve best throughput. */
L(huge_vector):
/* Replicate set byte to quadword in VMX register. */
- MTVSRD_V1_R4
+ mtvsrd v1,r4
xxpermdi 32,v0,v1,0
vspltb v2,v0,15
#endif /* !USE_AS_STRCHRNUL */
/* int [r3] strchr (char *s [r3], int c [r4]) */
-/* TODO: change these to the actual instructions when the minimum required
- binutils allows it. */
-#define MTVRD(v,r) .long (0x7c000167 | ((v)<<(32-11)) | ((r)<<(32-16)))
-#define MFVRD(r,v) .long (0x7c000067 | ((v)<<(32-11)) | ((r)<<(32-16)))
-#define VBPERMQ(t,a,b) .long (0x1000054c \
- | ((t)<<(32-11)) \
- | ((a)<<(32-16)) \
- | ((b)<<(32-21)) )
-/* TODO: change this to .machine power8 when the minimum required binutils
- allows it. */
- .machine power7
+ .machine power8
ENTRY_TOCLESS (FUNC_NAME)
CALL_MCOUNT 2
dcbt 0,r3
vspltisb v10, 3
lvsl v11, r0, r0
vslb v10, v11, v10
- MTVRD(v1,r4)
+ mtvrd v1, r4
li r5, 16
vspltb v1, v1, 7
/* Compare 32 bytes in each loop. */
blt cr6, L(no_match)
#endif
/* Permute the first bit of each byte into bits 48-63. */
- VBPERMQ(v2, v2, v10)
- VBPERMQ(v3, v3, v10)
- VBPERMQ(v6, v6, v10)
- VBPERMQ(v7, v7, v10)
+ vbpermq v2, v2, v10
+ vbpermq v3, v3, v10
+ vbpermq v6, v6, v10
+ vbpermq v7, v7, v10
/* Shift each component into its correct position for merging. */
#ifdef __LITTLE_ENDIAN__
vsldoi v3, v3, v3, 2
vor v1, v3, v2
vor v2, v6, v7
vor v4, v1, v2
- MFVRD(r5, v4)
+ mfvrd r5, v4
#ifdef __LITTLE_ENDIAN__
addi r6, r5, -1
andc r6, r6, r5
blt cr6, L(continue1)
addi r3, r3, -32
L(end1):
- VBPERMQ(v2, v2, v10)
- VBPERMQ(v3, v3, v10)
+ vbpermq v2, v2, v10
+ vbpermq v3, v3, v10
/* Shift each component into its correct position for merging. */
#ifdef __LITTLE_ENDIAN__
vsldoi v3, v3, v3, 2
/* Merge the results and move to a GPR. */
vor v4, v3, v2
- MFVRD(r5, v4)
+ mfvrd r5, v4
#ifdef __LITTLE_ENDIAN__
addi r6, r5, -1
andc r6, r6, r5
#include <sysdep.h>
-/* TODO: change these to the actual instructions when the minimum required
- binutils allows it. */
-#define MFVRD(r,v) .long (0x7c000067 | ((v)<<(32-11)) | ((r)<<(32-16)))
-#define VBPERMQ(t,a,b) .long (0x1000054c \
- | ((t)<<(32-11)) \
- | ((a)<<(32-16)) \
- | ((b)<<(32-21)) )
-
/* int [r3] strlen (char *s [r3]) */
#ifndef STRLEN
# define STRLEN strlen
#endif
-
-/* TODO: change this to .machine power8 when the minimum required binutils
- allows it. */
- .machine power7
+ .machine power8
ENTRY_TOCLESS (STRLEN, 4)
CALL_MCOUNT 1
dcbt 0,r3
vslb v10,v11,v10
/* Permute the first bit of each byte into bits 48-63. */
- VBPERMQ(v1,v1,v10)
- VBPERMQ(v2,v2,v10)
- VBPERMQ(v3,v3,v10)
- VBPERMQ(v4,v4,v10)
+ vbpermq v1,v1,v10
+ vbpermq v2,v2,v10
+ vbpermq v3,v3,v10
+ vbpermq v4,v4,v10
/* Shift each component into its correct position for merging. */
#ifdef __LITTLE_ENDIAN__
vor v1,v2,v1
vor v2,v3,v4
vor v4,v1,v2
- MFVRD(r10,v4)
+ mfvrd r10,v4
/* Adjust address to the begninning of the current 64-byte block. */
addi r4,r4,-64
64K as default, the page cross handling assumes minimum page size of
4k. */
- .machine power7
+ .machine power8
ENTRY_TOCLESS (STRNCMP, 4)
/* Check if size is 0. */
mr. r10,r5
64K as default, the page cross handling assumes minimum page size of
4k. */
- .machine power7
+ .machine power8
#ifdef MEMSET_is_local
ENTRY_TOCLESS (FUNC_NAME, 4)
#else
/* Define default page size to 4KB. */
#define PAGE_SIZE 4096
-/* The following macros implement Power ISA v2.07 opcodes
- that could not be used directly into this code to the keep
- compatibility with older binutils versions. */
-
-/* Move from vector register doubleword. */
-#define MFVRD(r,v) .long (0x7c000067 | ((v)<<(32-11)) | ((r)<<(32-16)))
-
-/* Move to vector register doubleword. */
-#define MTVRD(v,r) .long (0x7c000167 | ((v)<<(32-11)) | ((r)<<(32-16)))
-
-/* Vector Bit Permute Quadword. */
-#define VBPERMQ(t,a,b) .long (0x1000054c \
- | ((t)<<(32-11)) \
- | ((a)<<(32-16)) \
- | ((b)<<(32-21)) )
-
-/* Vector Population Count Halfword. */
-#define VPOPCNTH(t,b) .long (0x10000743 | ((t)<<(32-11)) | ((b)<<(32-21)))
-
-/* Vector Count Leading Zeros Halfword. */
-#define VCLZH(t,b) .long (0x10000742 | ((t)<<(32-11)) | ((b)<<(32-21)))
-
/* int [r3] strnlen (char *s [r3], size_t maxlen [r4]) */
-/* TODO: change to power8 when minimum required binutils allows it. */
- .machine power7
+ .machine power8
ENTRY_TOCLESS (__strnlen)
CALL_MCOUNT 2
dcbt 0,r3
/* Place rounded up number of qw's to check into a vmx
register, and use some vector tricks to minimize
branching. */
- MTVRD(v7,r4) /* Copy maxlen from GPR to vector register. */
+ mtvrd v7,r4 /* Copy maxlen from GPR to vector register. */
vspltisb v5,1
vspltisb v6,15
vspltb v2,v7,7
beq cr6,L(loop_16B) /* If null bytes not found. */
vcmpequb v1,v1,v0
- VBPERMQ(v1,v1,v10)
+ vbpermq v1,v1,v10
#ifdef __LITTLE_ENDIAN__
vsubuhm v2,v1,v5 /* Form a mask of trailing zeros. */
vandc v2,v2,v1
- VPOPCNTH(v1,v2) /* Count of trailing zeros, 16 if none. */
+ vpopcnth v1,v2 /* Count of trailing zeros, 16 if none. */
#else
- VCLZH(v1,v1) /* Count the leading zeros, 16 if none. */
+ vclzh v1,v1 /* Count the leading zeros, 16 if none. */
#endif
/* Truncate to maximum allowable offset. */
vcmpgtub v2,v1,v7 /* Compare and truncate for matches beyond
maxlen. */
vsel v1,v1,v7,v2 /* 0-16 is now in byte 7. */
- MFVRD(r0,v1)
+ mfvrd r0,v1
addi r5,r5,-16 /* Undo speculative bump. */
extsb r0,r0 /* Clear whatever gunk is in the high 56b. */
add r5,r5,r0 /* Add the offset of whatever was found. */
vcmpequb v4,v4,v0
/* Permute the first bit of each byte into bits 48-63. */
- VBPERMQ(v1,v1,v10)
- VBPERMQ(v2,v2,v10)
- VBPERMQ(v3,v3,v10)
- VBPERMQ(v4,v4,v10)
+ vbpermq v1,v1,v10
+ vbpermq v2,v2,v10
+ vbpermq v3,v3,v10
+ vbpermq v4,v4,v10
/* Shift each component into its correct position for merging. */
#ifdef __LITTLE_ENDIAN__
/* Adjust address to the start of the current 64B block. */
addi r5,r5,-64
- MFVRD(r10,v4)
+ mfvrd r10,v4
#ifdef __LITTLE_ENDIAN__
addi r9,r10,-1 /* Form a mask from trailing zeros. */
andc r9,r9,r10
as a preparation for the 64B loop. */
.p2align 4
L(found_aligning64B):
- VBPERMQ(v1,v1,v10)
+ vbpermq v1,v1,v10
#ifdef __LITTLE_ENDIAN__
- MFVRD(r10,v1)
+ mfvrd r10,v1
addi r9,r10,-1 /* Form a mask from trailing zeros. */
andc r9,r9,r10
popcntd r0,r9 /* Count the bits in the mask. */
#else
vsldoi v1,v1,v1,6
- MFVRD(r10,v1)
+ mfvrd r10,v1
cntlzd r0,r10 /* Count leading zeros before the match. */
#endif
addi r5,r5,-16 /* Adjust address to offset of last 16 bytes
#include <sysdep.h>
/* char *[r3] strrchr (char *s [r3], int c [r4]) */
-/* TODO: change these to the actual instructions when the minimum required
- binutils allows it. */
-#define MTVRD(v,r) .long (0x7c000167 | ((v)<<(32-11)) | ((r)<<(32-16)))
-#define MFVRD(r,v) .long (0x7c000067 | ((v)<<(32-11)) | ((r)<<(32-16)))
-#define VBPERMQ(t,a,b) .long (0x1000054c \
- | ((t)<<(32-11)) \
- | ((a)<<(32-16)) \
- | ((b)<<(32-21)) )
-#define VCLZD(r,v) .long (0x100007c2 | ((r)<<(32-11)) | ((v)<<(32-21)))
-#define VPOPCNTD(r,v) .long (0x100007c3 | ((r)<<(32-11)) | ((v)<<(32-21)))
-#define VADDUQM(t,a,b) .long (0x10000100 \
- | ((t)<<(32-11)) \
- | ((a)<<(32-16)) \
- | ((b)<<(32-21)) )
+
#ifdef __LITTLE_ENDIAN__
/* Find the match position from v6 and place result in r6. */
# define CALCULATE_MATCH() \
- VBPERMQ(v6, v6, v10); \
+ vbpermq v6, v6, v10; \
vsldoi v6, v6, v6, 6; \
- MFVRD(r7, v6); \
+ mfvrd r7, v6; \
cntlzd r6, r7; \
subfic r6, r6, 15;
/*
*/
# define FIND_NULL_POS(reg) \
vspltisb v11, -1; \
- VADDUQM(v11, reg, v11); \
+ vadduqm v11, reg, v11; \
vandc v11, v11, reg; \
- VPOPCNTD(v2, v11); \
+ vpopcntd v2, v11; \
vspltb v11, v2, 15; \
vcmpequb. v11, v11, v9; \
blt cr6, 1f; \
vsumsws v2, v2, v0;
#else
# define CALCULATE_MATCH() \
- VBPERMQ(v6, v6, v10); \
- MFVRD(r7, v6); \
+ vbpermq v6, v6, v10; \
+ mfvrd r7, v6; \
addi r6, r7, -1; \
andc r6, r6, r7; \
popcntd r6, r6; \
subfic r6, r6, 15;
# define FIND_NULL_POS(reg) \
- VCLZD(v2, reg); \
+ vclzd v2, reg; \
vspltb v11, v2, 7; \
vcmpequb. v11, v11, v9; \
blt cr6, 1f; \
#ifndef STRRCHR
# define STRRCHR strrchr
#endif
- .machine power7
+ .machine power8
ENTRY_TOCLESS (STRRCHR)
CALL_MCOUNT 2
dcbt 0,r3
vspltisb v10, 3
lvsl v11, r0, r0
vslb v10, v11, v10
- MTVRD(v1, r4)
+ mtvrd v1, r4
li r5, 16
vspltb v1, v1, 7
/* Compare 32 bytes in each loop. */
addi r3, r3, 32
blt cr6, L(continue1)
addi r3, r3, -32
- VBPERMQ(v2, v2, v10)
- VBPERMQ(v3, v3, v10)
+ vbpermq v2, v2, v10
+ vbpermq v3, v3, v10
/* Shift each component into its correct position for merging. */
#ifdef __LITTLE_ENDIAN__
vsldoi v3, v3, v3, 2
#endif
/* Merge the results and move to a GPR. */
vor v4, v3, v2
- MFVRD(r5, v4)
+ mfvrd r5, v4
#ifdef __LITTLE_ENDIAN__
addi r6, r5, -1
andc r6, r6, r5
#define XXVR(insn, vrt, vra, vrb) \
insn 32+vrt, 32+vra, 32+vrb
-/* ISA 2.07B instructions are not all defined for older binutils.
- Macros are defined below for these newer instructions in order
- to maintain compatibility. */
-
-/* Note, TX/SX is always set as VMX regs are the high 32 VSX regs. */
-#define MTVRD(v,r) .long (0x7c000167 | ((v)<<(32-11)) | ((r)<<(32-16)))
-#define MFVRD(r,v) .long (0x7c000067 | ((v)<<(32-11)) | ((r)<<(32-16)))
-
-#define VBPERMQ(t,a,b) .long (0x1000054c \
- | ((t)<<(32-11)) \
- | ((a)<<(32-16)) \
- | ((b)<<(32-21)) )
-
- /* This can be updated to power8 once the minimum version of
- binutils supports power8 and the above instructions. */
- .machine power7
+ .machine power8
ENTRY_TOCLESS (STRSPN, 4)
CALL_MCOUNT 2
L(start_cmp):
/* Move and merge bitmap into 2 VRs. bpermd is slower on P8. */
mr r0, r3 /* Save r3 for final length computation. */
- MTVRD (v5, r5)
- MTVRD (v6, r6)
- MTVRD (v7, r7)
- MTVRD (v8, r8)
+ mtvrd v5, r5
+ mtvrd v6, r6
+ mtvrd v7, r7
+ mtvrd v8, r8
/* Continue interleaved mask generation. */
#ifdef __LITTLE_ENDIAN__
/* Compare the first 1-16B, while masking unwanted bytes. */
clrrdi r3, r3, 4 /* Note, counts from qw boundaries. */
vxor v9, v0, v1 /* Swap high bit. */
- VBPERMQ (v8, v5, v0)
- VBPERMQ (v7, v6, v9)
+ vbpermq v8, v5, v0
+ vbpermq v7, v6, v9
vor v7, v7, v8
vor v7, v7, v11 /* Ignore non-participating bytes. */
vcmpequh. v8, v7, v4
lvx v0, 0, r3
addi r3, r3, 16
vxor v9, v0, v1 /* Swap high bit. */
- VBPERMQ (v8, v5, v0)
- VBPERMQ (v7, v6, v9)
+ vbpermq v8, v5, v0
+ vbpermq v7, v6, v9
vor v7, v7, v8
vcmpequh. v8, v7, v4
blt cr6, L(vec)
addi r3, r3, -16
L(done):
subf r3, r0, r3
- MFVRD (r10, v7)
+ mfvrd r10, v7
#ifdef __LITTLE_ENDIAN__
addi r0, r10, 1 /* Count the trailing 1's. */