This is the mail archive of the libc-alpha@sourceware.org mailing list for the glibc project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

Re: [PATCH 19/27] S390: Optimize strrchr and wcsrchr.


On Fri, Jun 26, 2015 at 01:51:44PM +0200, Stefan Liebler wrote:
> This patch provides optimized versions of strrchr and wcsrchr with the z13
> vector instructions.
> 
> +ENTRY(__strrchr_vx)
> +	.machine "z13"
> +	.machinemode "zarch_nohighgprs"
> +
> +	vlvgb	%v18,%r3,0	/* Generate vector which elements are all c.
> +				   if c > 255, c will be truncated.  */
> +	lghi	%r5,0		/* current_len = 0.  */
> +	vrepb	%v18,%v18,0
> +	lghi	%r1,0		/* Zero out return pointer.  */
> +
> +	/* Align s to 16byte */
> +	risbg	%r0,%r2,60,128+63,0 /* Test if s is aligned and
> +				       %r0 = bits 60-63 'and' 15.  */
> +	je	.Lloop1		/* If s is aligned, loop aligned.  */
> +	lghi	%r4,15
> +	slr	%r4,%r0		/* Compute highest index to load (15-x).  */
> +	vll	%v16,%r4,0(%r2) /* Load up to 16 byte boundary (vll needs
> +				   highest index, remaining bytes are 0).  */
> +	ahi	%r4,1		/* Work with loaded byte count.  */
> +	j	.Llt16
> +
> +	/* Process s in a 16byte aligned loop.  */
> +.Lloop2:
> +	aghi	%r5,16
> +.Lloop1:
> +	vl	%v16,0(%r5,%r2)	/* Load s.  */
> +	vfeezbs	%v17,%v16,%v18	/* Find element equal with zero search.  */
> +	jno	.Lfound_from_loop /* Found c/zero (cc=0|1|2).  */
> +	vl	%v16,16(%r5,%r2)
> +	aghi	%r5,16
> +	vfeezbs	%v17,%v16,%v18
> +	jno	.Lfound_from_loop
> +	vl	%v16,16(%r5,%r2)
> +	aghi	%r5,16
> +	vfeezbs	%v17,%v16,%v18
> +	jno	.Lfound_from_loop
> +	vl	%v16,16(%r5,%r2)
> +	aghi	%r5,16
> +	vfeezbs	%v17,%v16,%v18
> +	jo	.Lloop2		/* No character and no zero -> loop.  */
> +
> +
> +.Lfound_from_loop:
> +	vlgvb	%r0,%v17,7	/* Load byte index of character.  */
> +	lghi	%r4,16		/* Byte count in v16 is 16.  */
> +.Lfound:
> +	/* Found c/zero in loaded bytes,
> +				   in %r0 is found index.  */
> +	je	.Lzero		/* Found zero, but no c before -> end.  */
> +
> +.Lcharacter:
> +	/* Found character.  */
> +	algr	%r5,%r0
> +	la	%r1,0(%r5,%r2)	/* Store found character pointer.  */
> +	aghi	%r0,1		/* Start next search behind found character.  */
> +	aghi	%r5,1		/* Start next search behind found character.  */
> +	clrje	%r0,%r4,.Lloop1 /* Found character was the last loaded byte,
> +				    so load next 16bytes aligned and
> +				    loop until end of string.  */
> +	/* Shift left processed bytes in vector register
> +	   and process remaining bytes without load  */
> +	slr	%r4,%r0		/* Calculate remaining number of bytes =
> +				   loaded byte count - (found-index + 1)  */
> +	sll	%r0,3		/* Compute byte count for vector shift left.  */
> +	vlvgb	%v17,%r0,7
> +	vslb	%v16,%v16,%v17	/* Vector shift left by byte by number of bytes
> +				   specified in bits 1-4 of byte 7 in v17.   */
> +

This looks quite slow when you call strrchr("////////////////////////",'/')

I fixed the same mistake on x64. You need to save the last matching mask and
pointer so that the index is determined only once, at the end, like the
following, where you choose c so that you get null when no match was found.

mask = 0;
ptr = c;
while (!has_zero(m = load (s)))
  {
    if (has_c(m))
      {
        mask = m;
        ptr = s;
      }
    s+=16;
  }
m=mask_bytes_after_zero(m);
if (has_c(m))
  {
    mask = m;
    ptr = s;
  }
return ptr + bsr(mask);


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]