This is the mail archive of the
libc-alpha@sourceware.org
mailing list for the glibc project.
Re: [PATCH] powerpc64: strncpy optimization for unaligned string
- From: Adhemerval Zanella <azanella at linux dot vnet dot ibm dot com>
- To: libc-alpha at sourceware dot org
- Date: Thu, 12 Feb 2015 16:21:31 -0200
- Subject: Re: [PATCH] powerpc64: strncpy optimization for unaligned string
- Authentication-results: sourceware.org; auth=none
- References: <1422444072-62407-1-git-send-email-raji at linux dot vnet dot ibm dot com>
Hi Raji,
This patch looks ok, I have pushed it upstream as 98408b95b15 with some
minor cosmetic and typo fixes.
On 28-01-2015 09:21, Rajalakshmi Srinivasaraghavan wrote:
> This patch optimizes strncpy for power7 for unaligned source or
> destination address. The source or destination address is aligned
> to doubleword and data is shifted based on the alignment and
> added with the previous loaded data to be written as a doubleword.
> For each load, cmpb instruction is used for faster null check.
>
> The new optimization shows 10 to 70% of performance improvement
> for longer string though it does not show big difference on string
> size less than 16 due to additional checks. Hence this new algorithm
> is restricted to string greater than 16.
>
> This patch is tested on ppc64 and ppc64le.
>
> * sysdeps/powerpc/powerpc64/power7/strncpy.S: Optimize unaligned path.
> ---
> sysdeps/powerpc/powerpc64/power7/strncpy.S | 380 ++++++++++++++++++++++++++++-
> 1 file changed, 378 insertions(+), 2 deletions(-)
>
> diff --git a/sysdeps/powerpc/powerpc64/power7/strncpy.S b/sysdeps/powerpc/powerpc64/power7/strncpy.S
> index e60fc25..c07b2be 100644
> --- a/sysdeps/powerpc/powerpc64/power7/strncpy.S
> +++ b/sysdeps/powerpc/powerpc64/power7/strncpy.S
> @@ -72,9 +72,9 @@ EALIGN(FUNC_NAME, 4, 0)
>
> mr r9, r3 /* save r3 into r9 for use */
> mr r18, r3 /* save r3 for retCode of strncpy */
> - bne 0, L(byte_by_byte)
> -
> + bne 0, L(unaligned)
>
> +L(aligned):
> srdi r11, r5, 3 /* compute count for CTR ; count = n/8 */
> cmpldi cr7, r11, 3 /* if count > 4 ; perform unrolling 4 times */
> ble 7, L(update1)
> @@ -332,6 +332,382 @@ L(HopBy8):
> addi r5, r5, -8 /* decrement length 'n' by 8 */
> addi r0, r11, -1 /* decrement loop counter */
> b L(dWordUnrollOFF)
> +
> +L(unaligned):
> + cmpdi r5, 16 /* Proceed byte by byte for less than 16 */
> + ble L(byte_by_byte)
> + rldicl r7, r3, 0, 61
> + rldicl r6, r4, 0, 61
> + cmpdi r6, 0 /* Check src alignment */
> + beq L(srcaligndstunalign)
> + /* src is unaligned */
> + rlwinm r10, r4, 3,26,28 /* Calculate padding. */
> + clrrdi r4, r4, 3 /* Align the addr to dw boundary */
> + ld r8, 0(r4) /* Load doubleword from memory. */
> + li r0, 0
> + /* Discard bits not part of the string */
> +#ifdef __LITTLE_ENDIAN__
> + srd r7, r8, r10
> +#else
> + sld r7, r8, r10
> +#endif
> + cmpb r0, r7, r0 /* Compare each byte against null */
> + /* Discard bits not part of the string */
> +#ifdef __LITTLE_ENDIAN__
> + sld r0, r0, r10
> +#else
> + srd r0, r0, r10
> +#endif
> + cmpdi r0, 0
> + bne L(bytebybyte) /* if it has null, copy byte by byte */
> + subfic r6, r6, 8
> + rlwinm r12, r3, 3,26,28 /* Calculate padding in bits. */
> + rldicl r9, r3, 0, 61 /* Calculate padding in bytes. */
> + addi r3, r3, -1
> +
> + cmpdi r12, 0 /* check dest alignment */
> + beq L(srcunaligndstalign)
> +
> + /* both src and dst unaligned */
> +#ifdef __LITTLE_ENDIAN__
> + sld r8, r7, r10
> + mr r11, r10
> + addi r11, r11, -8 /* Adjust byte pointer on loaded dw */
> +#else
> + srd r8, r7, r10
> + subfic r11, r10, 64
> +#endif
> +	/* dst alignment is greater than src alignment? */
> + cmpd cr7, r12, r10
> + ble cr7, L(dst_align_small)
> + /* src alignment is less than dst */
> +
> +	/* Calculate the dst alignment difference */
> + subfic r7, r9, 8
> + mtctr r7
> +
> + /* Write till dst is aligned */
> + cmpdi r0, r7, 4
> + blt L(storebyte1) /* less than 4, store byte by byte */
> +	beq	L(equal1)	/* if it's 4, store word */
> + addi r0, r7, -4 /* greater than 4, so stb and stw */
> + mtctr r0
> +L(storebyte1):
> +#ifdef __LITTLE_ENDIAN__
> + addi r11, r11, 8 /* Adjust byte pointer on loaded dw */
> +#else
> + addi r11, r11, -8
> +#endif
> + srd r7, r8, r11
> + stbu r7, 1(r3)
> + addi r5, r5, -1
> + bdnz L(storebyte1)
> +
> + subfic r7, r9, 8 /* Check the remaining bytes */
> + cmpdi r0, r7, 4
> + blt L(proceed1)
> +
> + .align 4
> +L(equal1):
> +#ifdef __LITTLE_ENDIAN__
> + addi r11, r11, 8 /* Adjust byte pointer on loaded dw */
> + srd r7, r8, r11
> +#else
> + subfic r11, r11, 64
> + sld r7, r8, r11
> + srdi r7, r7, 32
> +#endif
> + stw r7, 1(r3)
> + addi r3, r3, 4
> + addi r5, r5, -4
> +
> +L(proceed1):
> + mr r7, r8
> + /* calculate the Left over bytes to be written */
> + subfic r11, r10, 64
> + subfic r12, r12, 64
> + subf r12, r12, r11 /* remaining bytes on second dw */
> + subfic r10, r12, 64 /* remaining bytes on first dw */
> + subfic r9, r9, 8
> + subf r6, r9, r6 /* recalculate padding */
> +L(srcunaligndstalign):
> + addi r3, r3, 1
> + subfic r12, r10, 64 /* remaining bytes on second dw */
> + addi r4, r4, 8
> + li r0,0
> + b L(storedouble)
> +
> + .align 4
> +L(dst_align_small):
> + mtctr r6
> + /* Write till src is aligned */
> +L(storebyte2):
> +#ifdef __LITTLE_ENDIAN__
> + addi r11, r11, 8 /* Adjust byte pointer on dw */
> +#else
> + addi r11, r11, -8
> +#endif
> + srd r7, r8, r11
> + stbu r7, 1(r3)
> + addi r5, r5, -1
> + bdnz L(storebyte2)
> +
> + addi r4, r4, 8 /* Increment src pointer */
> + addi r3, r3, 1 /* Increment dst pointer */
> + mr r9, r3
> + li r8, 0
> + cmpd cr7, r12, r10
> + beq cr7, L(aligned)
> + rldicl r6, r3, 0, 61 /* Recalculate padding */
> + mr r7, r6
> +
> +	/* src is aligned */
> +L(srcaligndstunalign):
> + mr r9, r3
> + mr r6, r7
> + ld r8, 0(r4)
> + subfic r10, r7, 8
> + mr r7, r8
> + li r0, 0 /* Check null */
> + cmpb r0, r8, r0
> + cmpdi r0, 0
> + bne L(byte_by_byte) /* Do byte by byte if there is NULL */
> + rlwinm r12, r3, 3,26,28 /* Calculate padding */
> + addi r3, r3, -1
> + /* write byte by byte till aligned */
> +#ifdef __LITTLE_ENDIAN__
> + li r11, -8
> +#else
> + li r11, 64
> +#endif
> + mtctr r10
> + cmpdi r0, r10, 4
> + blt L(storebyte)
> + beq L(equal)
> + addi r0, r10, -4
> + mtctr r0
> +L(storebyte):
> +#ifdef __LITTLE_ENDIAN__
> + addi r11, r11, 8 /* Adjust byte pointer on dw */
> +#else
> + addi r11, r11, -8
> +#endif
> + srd r7, r8, r11
> + stbu r7, 1(r3)
> + addi r5, r5, -1
> + bdnz L(storebyte)
> +
> + cmpdi r0, r10, 4
> + blt L(align)
> +
> + .align 4
> +L(equal):
> +#ifdef __LITTLE_ENDIAN__
> + addi r11, r11, 8
> + srd r7, r8, r11
> +#else
> + subfic r11, r11, 64
> + sld r7, r8, r11
> + srdi r7, r7, 32
> +#endif
> + stw r7, 1(r3)
> + addi r5, r5, -4
> + addi r3, r3, 4
> +L(align):
> + addi r3, r3, 1
> + addi r4, r4, 8 /* Increment src pointer */
> + subfic r10, r12, 64
> + li r0, 0
> + /* dst addr aligned to 8 */
> +L(storedouble):
> + cmpdi r5, 8
> + ble L(null1)
> + ld r7, 0(r4) /* load next dw */
> + cmpb r0, r7, r0
> + cmpdi r0, 0 /* check for null on each new dw */
> + bne L(null)
> +#ifdef __LITTLE_ENDIAN__
> + srd r9, r8, r10 /* bytes from first dw */
> + sld r11, r7, r12 /* bytes from second dw */
> +#else
> + sld r9, r8, r10
> + srd r11, r7, r12
> +#endif
> + or r11, r9, r11 /* make as a single dw */
> + std r11, 0(r3) /* store as std on aligned addr */
> + mr r8, r7 /* still few bytes left to be written */
> + addi r3, r3, 8 /* increment dst addr */
> + addi r4, r4, 8 /* increment src addr */
> + addi r5, r5, -8
> + b L(storedouble) /* Loop till NULL */
> +
> + .align 4
> +
> +/* We've hit the end of the string. Do the rest byte-by-byte. */
> +L(null):
> + addi r3, r3, -1
> + mr r10, r12
> + mtctr r6
> +#ifdef __LITTLE_ENDIAN__
> + subfic r10, r10, 64
> + addi r10, r10, -8
> +#endif
> + cmpdi r0, r5, 4
> + blt L(loop)
> + cmpdi r0, r6, 4
> + blt L(loop)
> +
> +	/* we can still use stw if leftover >= 4 */
> +#ifdef __LITTLE_ENDIAN__
> + addi r10, r10, 8
> + srd r11, r8, r10
> +#else
> + subfic r10, r10, 64
> + sld r11, r8, r10
> + srdi r11, r11, 32
> +#endif
> + stw r11, 1(r3)
> + addi r5, r5, -4
> + addi r3, r3, 4
> + cmpdi r0, r5, 0
> + beq L(g1)
> + cmpdi r0, r6, 4
> + beq L(bytebybyte1)
> + addi r10, r10, 32
> +#ifdef __LITTLE_ENDIAN__
> + addi r10, r10, -8
> +#else
> + subfic r10, r10, 64
> +#endif
> + addi r0, r6, -4
> + mtctr r0
> + /* remaining byte by byte part of first dw */
> +L(loop):
> +#ifdef __LITTLE_ENDIAN__
> + addi r10, r10, 8
> +#else
> + addi r10, r10, -8
> +#endif
> + srd r0, r8, r10
> + stbu r0, 1(r3)
> + addi r5, r5, -1
> + cmpdi r0, r5, 0
> + beq L(g1)
> + bdnz L(loop)
> +L(bytebybyte1):
> + addi r3, r3, 1
> + /* remaining byte by byte part of second dw */
> +L(bytebybyte):
> + addi r3, r3, -8
> + addi r4, r4, -1
> +
> +#ifdef __LITTLE_ENDIAN__
> + extrdi. r0, r7, 8, 56
> + stbu r7, 8(r3)
> + addi r5, r5, -1
> + beq L(g2)
> + cmpdi r5, 0
> + beq L(g1)
> + extrdi. r0, r7, 8, 48
> + stbu r0, 1(r3)
> + addi r5, r5, -1
> + beq L(g2)
> + cmpdi r5, 0
> + beq L(g1)
> + extrdi. r0, r7, 8, 40
> + stbu r0, 1(r3)
> + addi r5, r5, -1
> + beq L(g2)
> + cmpdi r5, 0
> + beq L(g1)
> + extrdi. r0, r7, 8, 32
> + stbu r0, 1(r3)
> + addi r5, r5, -1
> + beq L(g2)
> + cmpdi r5, 0
> + beq L(g1)
> + extrdi. r0, r7, 8, 24
> + stbu r0, 1(r3)
> + addi r5, r5, -1
> + beq L(g2)
> + cmpdi r5, 0
> + beq L(g1)
> + extrdi. r0, r7, 8, 16
> + stbu r0, 1(r3)
> + addi r5, r5, -1
> + beq L(g2)
> + cmpdi r5, 0
> + beq L(g1)
> + extrdi. r0, r7, 8, 8
> + stbu r0, 1(r3)
> + addi r5, r5, -1
> + beq L(g2)
> + cmpdi r5, 0
> + beq L(g1)
> + extrdi r0, r7, 8, 0
> + stbu r0, 1(r3)
> + addi r5, r5, -1
> + b L(g2)
> +#else
> + extrdi. r0, r7, 8, 0
> + stbu r0, 8(r3)
> + addi r5, r5, -1
> + beq L(g2)
> + cmpdi r5, 0
> + beq L(g1)
> + extrdi. r0, r7, 8, 8
> + stbu r0, 1(r3)
> + addi r5, r5, -1
> + beq L(g2)
> + cmpdi r5, 0
> + beq L(g1)
> + extrdi. r0, r7, 8, 16
> + stbu r0, 1(r3)
> + addi r5, r5, -1
> + beq L(g2)
> + cmpdi r5, 0
> + beq L(g1)
> + extrdi. r0, r7, 8, 24
> + stbu r0, 1(r3)
> + addi r5, r5, -1
> + beq L(g2)
> + cmpdi r5, 0
> + beq L(g1)
> + extrdi. r0, r7, 8, 32
> + stbu r0, 1(r3)
> + addi r5, r5, -1
> + beq L(g2)
> + cmpdi r5, 0
> + beq L(g1)
> + extrdi. r0, r7, 8, 40
> + stbu r0, 1(r3)
> + addi r5, r5, -1
> + beq L(g2)
> + cmpdi r5, 0
> + beq L(g1)
> + extrdi. r0, r7, 8, 48
> + stbu r0, 1(r3)
> + addi r5, r5, -1
> + beq L(g2)
> + cmpdi r5, 0
> + beq L(g1)
> + stbu r7, 1(r3)
> + addi r5, r5, -1
> + b L(g2)
> +#endif
> +L(g1):
> +#ifdef USE_AS_STPNCPY
> + addi r3, r3, 1
> +#endif
> +L(g2):
> + addi r3, r3, 1
> + mr r19, r3
> + mr r8, r5
> + b L(zeroFill)
> +L(null1):
> + mr r9, r3
> + subf r4, r6, r4
> + b L(byte_by_byte)
> END(FUNC_NAME)
> #ifndef USE_AS_STPNCPY
> libc_hidden_builtin_def (strncpy)