This is the mail archive of the
libc-alpha@sourceware.org
mailing list for the glibc project.
Re: [PATCH 19/27] S390: Optimize strrchr and wcsrchr.
- From: Ondřej Bílka <neleai at seznam dot cz>
- To: Stefan Liebler <stli at linux dot vnet dot ibm dot com>
- Cc: libc-alpha at sourceware dot org
- Date: Fri, 26 Jun 2015 15:46:06 +0200
- Subject: Re: [PATCH 19/27] S390: Optimize strrchr and wcsrchr.
- Authentication-results: sourceware.org; auth=none
- References: <1435319512-22245-1-git-send-email-stli at linux dot vnet dot ibm dot com> <1435319512-22245-20-git-send-email-stli at linux dot vnet dot ibm dot com>
On Fri, Jun 26, 2015 at 01:51:44PM +0200, Stefan Liebler wrote:
> This patch provides optimized versions of strrchr and wcsrchr with the z13
> vector instructions.
>
> +ENTRY(__strrchr_vx)
> + .machine "z13"
> + .machinemode "zarch_nohighgprs"
> +
> + vlvgb %v18,%r3,0 /* Generate vector which elements are all c.
> + if c > 255, c will be truncated. */
> + lghi %r5,0 /* current_len = 0. */
> + vrepb %v18,%v18,0
> + lghi %r1,0 /* Zero out return pointer. */
> +
> + /* Align s to 16byte */
> + risbg %r0,%r2,60,128+63,0 /* Test if s is aligned and
> + %r0 = bits 60-63 'and' 15. */
> + je .Lloop1 /* If s is aligned, loop aligned. */
> + lghi %r4,15
> + slr %r4,%r0 /* Compute highest index to load (15-x). */
> + vll %v16,%r4,0(%r2) /* Load up to 16 byte boundary (vll needs
> + highest index, remaining bytes are 0). */
> + ahi %r4,1 /* Work with loaded byte count. */
> + j .Llt16
> +
> + /* Process s in a 16byte aligned loop. */
> +.Lloop2:
> + aghi %r5,16
> +.Lloop1:
> + vl %v16,0(%r5,%r2) /* Load s. */
> + vfeezbs %v17,%v16,%v18 /* Find element equal with zero search. */
> + jno .Lfound_from_loop /* Found c/zero (cc=0|1|2). */
> + vl %v16,16(%r5,%r2)
> + aghi %r5,16
> + vfeezbs %v17,%v16,%v18
> + jno .Lfound_from_loop
> + vl %v16,16(%r5,%r2)
> + aghi %r5,16
> + vfeezbs %v17,%v16,%v18
> + jno .Lfound_from_loop
> + vl %v16,16(%r5,%r2)
> + aghi %r5,16
> + vfeezbs %v17,%v16,%v18
> + jo .Lloop2 /* No character and no zero -> loop. */
> +
> +
> +.Lfound_from_loop:
> + vlgvb %r0,%v17,7 /* Load byte index of character. */
> + lghi %r4,16 /* Byte count in v16 is 16. */
> +.Lfound:
> + /* Found c/zero in loaded bytes,
> + in %r0 is found index. */
> + je .Lzero /* Found zero, but no c before -> end. */
> +
> +.Lcharacter:
> + /* Found character. */
> + algr %r5,%r0
> + la %r1,0(%r5,%r2) /* Store found character pointer. */
> + aghi %r0,1 /* Start next search behind found character. */
> + aghi %r5,1 /* Start next search behind found character. */
> + clrje %r0,%r4,.Lloop1 /* Found character was the last loaded byte,
> + so load next 16bytes aligned and
> + loop until end of string. */
> + /* Shift left processed bytes in vector register
> + and process remaining bytes without load */
> + slr %r4,%r0 /* Calculate remaining number of bytes =
> + loaded byte count - (found-index + 1) */
> + sll %r0,3 /* Compute byte count for vector shift left. */
> + vlvgb %v17,%r0,7
> + vslb %v16,%v16,%v17 /* Vector shift left by byte by number of bytes
> + specified in bits 1-4 of byte 7 in v17. */
> +
This looks quite slow when you call strrchr("////////////////////////", '/').
I fixed the same mistake on x64. You need to save the last match so that you
determine the index only once, at the end, like the following, where you choose
the initial value c of ptr so that you get NULL when no match was found.
mask = 0;
ptr = c;
while (!has_zero(m = load (s)))
{
if (has_c(m))
{
mask = m;
ptr = s;
}
s+=16;
}
m=mask_bytes_after_zero(m);
if (has_c(m))
{
mask = m;
ptr = s;
}
return ptr + bsr(m);