This is the mail archive of the newlib@sourceware.org mailing list for the newlib project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

Re: [PATCH] optimized SPU specific code for str[n]cpy and str[n]cat


On Mon, Feb 11, 2008 at 05:20:21PM -0500, Jeff Johnston wrote:
> Patch applied.
>
> -- Jeff J.

Thanks, can you check the header files?

The files straddr.h and strcpy.h do not seem to have made it into cvs.

Let me know if you would like another patch.

-- Patrick Mansfield

>
> Patrick Mansfield wrote:
>> Jeff - can you please apply?
>>
>> Optimized SPU specific code for str[n]cpy and str[n]cat.
>>
>> These decrease (for simple performance tests) execution times compared to
>> current SPU code for these functions by 20 to 50 percent.
>>
>> Generally, code size increases, with a worst case increase of 680 bytes.
>>
>> newlib/ChangeLog:
>>
>> 2008-02-11 Patrick Mansfield <patmans@us.ibm.com>
>>
>> 	* libc/machine/spu/straddr.h: New file, supplies _straddr.
>> 	* libc/machine/spu/strcat.c: Use _straddr and _strncpy.
>> 	* libc/machine/spu/strcpy.c: Use _strncpy.
>> 	* libc/machine/spu/strcpy.h: New file, supplies _strncpy for optimized
>> 	  SPU str[n]cpy and str[n]cat.
>> 	* libc/machine/spu/strncat.c: Use _straddr and _strncpy.
>> 	* libc/machine/spu/strncpy.c: Use _strncpy.
>>
>> Index: quilt/newlib/libc/machine/spu/strcat.c
>> ===================================================================
>> --- quilt.orig/newlib/libc/machine/spu/strcat.c
>> +++ quilt/newlib/libc/machine/spu/strcat.c
>> @@ -1,9 +1,6 @@
>>  /*
>> -  (C) Copyright 2001,2006,
>> +  (C) Copyright 2008
>>    International Business Machines Corporation,
>> -  Sony Computer Entertainment, Incorporated,
>> -  Toshiba Corporation,
>> -
>>    All rights reserved.
>>  
>>    Redistribution and use in source and binary forms, with or without
>> @@ -30,26 +27,19 @@
>>    ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
>>    POSSIBILITY OF SUCH DAMAGE.
>>  */
>> -#include <spu_intrinsics.h>
>> +
>>  #include <stddef.h>
>>  #include <string.h>
>> +#include "straddr.h"
>> +#include "strcpy.h"
>>  
>> -/* Appends the string pointed to by src (up to and including the /0
>> - * character) to the array pointed to by dest (overwriting the
>> +/*
>> + * Appends the string pointed to by src (up to and including the /0
>> + * character) to the array pointed to by dest overwriting the
>>   * /0 character at the end of dest. The strings may not overlap and
>>   * the dest string must have enough space for the result.
>>   */
>> -
>>  char *strcat(char * __restrict__ dest, const char * __restrict__ src)
>>  {
>> -  size_t d_len, s_len;
>> -
>> -  /* Determine the length of the src and dest input arrays.
>> -   */
>> -  d_len = strlen(dest);
>> -  s_len = strlen(src);
>> -
>> -  (void)memcpy((void *)(dest+d_len), (const void *)src, s_len + 1);
>> -
>> -  return ((char *)dest);
>> +  return _strncpy(_straddr(dest), src, 0, 0, 0);
>>  }
>> Index: quilt/newlib/libc/machine/spu/strcpy.h
>> ===================================================================
>> --- /dev/null
>> +++ quilt/newlib/libc/machine/spu/strcpy.h
>> @@ -0,0 +1,180 @@
>> +/*
>> +  (C) Copyright 2008
>> +  International Business Machines Corporation,
>> +  All rights reserved.
>> +
>> +  Redistribution and use in source and binary forms, with or without
>> +  modification, are permitted provided that the following conditions are met:
>> +
>> +    * Redistributions of source code must retain the above copyright notice,
>> +  this list of conditions and the following disclaimer.
>> +    * Redistributions in binary form must reproduce the above copyright
>> +  notice, this list of conditions and the following disclaimer in the
>> +  documentation and/or other materials provided with the distribution.
>> +    * Neither the names of the copyright holders nor the names of their
>> +  contributors may be used to endorse or promote products derived from
>> +  this software without specific prior written permission.
>> +
>> +  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
>> +  IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
>> +  TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
>> +  PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
>> +  OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
>> +  EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
>> +  PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
>> +  PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
>> +  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
>> +  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
>> +  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
>> +*/
>> +
>> +#include <spu_intrinsics.h>
>> +#include "vec_literal.h"
>> +
>> +/*
>> + * Inline _strncpy shared by strcpy/strcat and strncpy/strncat. maxlen is
>> + * read only when checklen is non-zero; lastzero (with checklen) stores a
>> + * final '\0'. Relies on that code being optimized out for constant zeroes.
>> + */
>> +static inline void * _strncpy(char * __restrict__ dest, const char *
>> +                              __restrict__ src, size_t maxlen, int
>> +                              checklen, int lastzero)
>> +{
>> +  int adjust, offset, soffset, doffset, shift;
>> +  vec_uchar16 *vsrc, *vdest;
>> +  vec_uchar16 sdata1, sdata2, sdata, shuffle;
>> +  vec_uchar16 mask1, maskzero, cmp0;
>> +  vec_uint4 nonzeroes, gathered_cmp, vtmp, vtmp2;
>> +  vec_uint4 curlen; /* assumes size_t is 4 bytes */
>> +  const vec_uint4 val31 = { 31, 31, 31, 31 };
>> +  const vec_uint4 val_0123 = { 0x00010203, 0x04050607, 0x08090A0B, 0x0C0D0E0F };
>> +  const vec_uchar16 all_ones = { 0xff,0xff,0xff,0xff, 0xff,0xff,0xff,0xff,
>> +                                 0xff,0xff,0xff,0xff, 0xff,0xff,0xff,0xff };
>> +
>> +  vsrc = (vec_uchar16 *) src;
>> +  vdest = (vec_uchar16 *) dest;
>> +  soffset = (int) src & 15;
>> +  doffset = (int) dest & 15;
>> +
>> +  if (checklen) {
>> +    /*
>> +     * Set curlen so it is the number of bytes we would copy if starting
>> +     * from vdest & ~0xf.
>> +     *
>> +     * curlen could probably be replaced by comparing vdest plus some
>> +     * offset to dest + maxlen, that would help mainly in the while loop
>> +     * but would lose only one instruction (the curlen -= 16).
>> +     */
>> +    curlen = spu_splats((unsigned int) (maxlen + doffset));
>> +  }
>> +
>> +  /*
>> +   * Setup a shuffle pattern to align the source string with the
>> +   * alignment of the destination string.
>> +   */
>> +  vtmp = spu_cmpgt(spu_promote(doffset, 0), spu_promote(soffset, 0));
>> +  adjust = spu_extract(vtmp, 0);
>> +  offset  = soffset - doffset;
>> +  offset += adjust & 16;
>> +  shuffle = spu_splats((unsigned char) offset);
>> +  shuffle = (vec_uchar16) spu_add((vec_uint4) shuffle, val_0123);
>> +
>> +  vsrc += adjust;
>> +  sdata1 = *vsrc++;
>> +  sdata2 = *vsrc++;
>> +  sdata = spu_shuffle(sdata1, sdata2, shuffle);
>> +
>> +  /*
>> +   * Mask out the leading bytes of the first quadword that precede dest.
>> +   */
>> +  mask1 = spu_rlmaskqwbyte(all_ones, -doffset);
>> +
>> +  cmp0 = spu_and(mask1, spu_cmpeq(sdata, 0));
>> +  nonzeroes = spu_cntlz(spu_gather(cmp0));
>> +  /*
>> +   * First element of nonzeroes - 15 is the number of leading non-zero
>> +   * bytes plus 1 for the zero byte.
>> +   */
>> +  if (checklen) {
>> +    vtmp = spu_add(curlen, 15);
>> +    vtmp2 = spu_cmpgt(nonzeroes, vtmp);
>> +    nonzeroes = spu_sel(nonzeroes, vtmp, vtmp2);
>> +  }
>> +
>> +  vtmp = spu_cmpgt(nonzeroes, val31);
>> +  /*
>> +   * Note: using immediate (constant 31) vs a vector value (val31) does
>> +   * not give different results, and we have to have a vector val31 for
>> +   * the spu_sel below, so use val31 everywhere.
>> +   */
>> +  vtmp = spu_sel(nonzeroes, val31, vtmp);
>> +  /*
>> +   * So vtmp is now min(nonzeroes, 31), the number of bytes + 16 that we
>> +   * want to copy from the first 16 bytes of the source.
>> +   */
>> +  if (checklen) {
>> +    curlen = spu_sub(vtmp, curlen);
>> +    curlen = spu_sub(15, curlen);
>> +  }
>> +
>> +  /*
>> +   * We want a right shift of 0xff with fill by ones of (vtmp - 15) bytes,
>> +   * but that doesn't exist, so use spu_slqwbyte to shift all ones left by
>> +   * (31 - vtmp). Note: this can also use spu_rlqwbytebc with spu_rlqw.
>> +   */
>> +  shift = spu_extract(spu_sub(val31, vtmp), 0);
>> +  maskzero = spu_slqwbyte(all_ones, shift);
>> +  maskzero = spu_and(mask1, maskzero);
>> +  *vdest = spu_sel(*vdest, sdata, maskzero);
>> +
>> +  vtmp = spu_cmpgt(nonzeroes, val31);
>> +  if (checklen) {
>> +    vtmp2 = spu_cmpgt(curlen, 0);
>> +    vtmp = spu_and(vtmp, vtmp2);
>> +  }
>> +  if (spu_extract(vtmp, 0)) {
>> +    sdata1 = sdata2;
>> +    sdata2 = *vsrc++;
>> +    sdata = spu_shuffle(sdata1, sdata2, shuffle);
>> +    cmp0 = spu_cmpeq(sdata, 0);
>> +    gathered_cmp = spu_gather(cmp0);
>> +    /*
>> +     * Copy 16 bytes at a time.
>> +     */
>> +    while ((spu_extract(gathered_cmp, 0) == 0) &&
>> +           (!checklen || (spu_extract(curlen, 0) > 15))) {
>> +      if (checklen)
>> +        curlen = spu_add(curlen, -16);
>> +      *++vdest = sdata;
>> +      sdata1 = sdata2;
>> +      sdata2 = *vsrc++;
>> +      sdata = spu_shuffle(sdata1, sdata2, shuffle);
>> +      cmp0 = spu_cmpeq(sdata, 0);
>> +      gathered_cmp = spu_gather(cmp0);
>> +    }
>> +    /*
>> +     * Copy 0 to 15 trailing bytes, either up to the smaller of curlen or
>> +     * the number of non-zero bytes.
>> +     */
>> +    nonzeroes = spu_cntlz(gathered_cmp);
>> +    if (checklen) {
>> +      vtmp = spu_add(curlen, 15);
>> +      vtmp2 = spu_cmpgt(nonzeroes, vtmp);
>> +      nonzeroes = spu_sel(nonzeroes, vtmp, vtmp2);
>> +      curlen = spu_sub(nonzeroes, curlen);
>> +      curlen = spu_sub(15, curlen);
>> +    }
>> +    shift = spu_extract(spu_sub(val31, nonzeroes), 0);
>> +    maskzero = spu_slqwbyte(all_ones, shift);
>> +    ++vdest;
>> +    *vdest = spu_sel(*vdest, sdata, maskzero);
>> +  }
>> +
>> +  if (checklen && lastzero) {
>> +    /*
>> +     * For strncat (called with lastzero set): store the terminating '\0'.
>> +     */
>> +    dest[maxlen - spu_extract(curlen, 0)] = '\0';
>> +  }
>> +  return (dest);
>> +}
>> Index: quilt/newlib/libc/machine/spu/strcpy.c
>> ===================================================================
>> --- quilt.orig/newlib/libc/machine/spu/strcpy.c
>> +++ quilt/newlib/libc/machine/spu/strcpy.c
>> @@ -1,9 +1,6 @@
>>  /*
>> -  (C) Copyright 2001,2006,
>> +  (C) Copyright 2008
>>    International Business Machines Corporation,
>> -  Sony Computer Entertainment, Incorporated,
>> -  Toshiba Corporation,
>> -
>>    All rights reserved.
>>  
>>    Redistribution and use in source and binary forms, with or without
>> @@ -30,19 +27,15 @@
>>    ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
>>    POSSIBILITY OF SUCH DAMAGE.
>>  */
>> -#include <spu_intrinsics.h>
>>  #include <stddef.h>
>> +#include "strcpy.h"
>>  
>> -/* Copy the string pointed to by src (up to and including the /0
>> +/*
>> + * Copy the string pointed to by src (up to and including the /0
>>   * character) into the array pointed to by dest. If copy between
>>   * two arrays that overlap, then behavior is undefined.
>>   */
>> -
>>  char * strcpy(char * __restrict__ dest, const char * __restrict__ src)
>>  {
>> -  /* Due to the need to support all alignment variances, this
>> -   * function can not easily be optimized. As a result, it is
>> -   * serviced using strlen and memcpy.
>> -   */
>> -  return ((char *)memcpy((void *)dest, (const void *)src, strlen(src)+1));
>> +  return _strncpy(dest, src, 0, 0, 0);
>>  }
>> Index: quilt/newlib/libc/machine/spu/strncat.c
>> ===================================================================
>> --- quilt.orig/newlib/libc/machine/spu/strncat.c
>> +++ quilt/newlib/libc/machine/spu/strncat.c
>> @@ -1,9 +1,6 @@
>>  /*
>> -  (C) Copyright 2001,2006,
>> +  (C) Copyright 2008
>>    International Business Machines Corporation,
>> -  Sony Computer Entertainment, Incorporated,
>> -  Toshiba Corporation,
>> -
>>    All rights reserved.
>>  
>>    Redistribution and use in source and binary forms, with or without
>> @@ -30,68 +27,19 @@
>>    ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
>>    POSSIBILITY OF SUCH DAMAGE.
>>  */
>> -#include <spu_intrinsics.h>
>>  #include <stddef.h>
>>  #include <string.h>
>> +#include "straddr.h"
>> +#include "strcpy.h"
>>  
>> -/* Appends the string pointed to by src (up to and including the /0
>> +/*
>> + * Appends the string pointed to by src (up to and including the /0
>>   * character) to the array pointed to by dest (overwriting the
>>   * /0 character at the end of dest. The strings may not overlap and
>>   * the dest string must have enough space for the result.
>>   */
>> -
>> -char * strncat(char * __restrict__ dest, const char * __restrict__ src, size_t n)
>> +char * strncat(char * __restrict__ dest, const char * __restrict__ src,
>> +               size_t n)
>>  {
>> -  unsigned int cmp, skip, mask, len;
>> -  vec_uchar16 *ptr, data;
>> -  vec_uint4 cnt, gt, N;
>> -  char *dst;
>> -
>> -  /* Determine the starting location to begin concatenation.
>> -   */
>> -  dst = dest + strlen(dest);
>> -
>> -  /* Copy the src image until either the src string terminates
>> -   * or n characters are copied.
>> -   */
>> -  N = spu_promote((unsigned int)n, 0);
>> -
>> -  /* Determine the string length, not including termination character,
>> -   * clamped to n characters.
>> -   */
>> -  ptr = (vec_uchar16 *)src;
>> -  skip = (unsigned int)(ptr) & 15;
>> -  mask = 0xFFFF >> skip;
>> -
>> -  data = *ptr++;
>> -  cmp = spu_extract(spu_gather(spu_cmpeq(data, 0)), 0);
>> -  cmp &= mask;
>> -
>> -  cnt = spu_cntlz(spu_promote(cmp, 0));
>> -  len = spu_extract(cnt, 0) - (skip + 16);
>> -
>> -  gt = spu_cmpgt(spu_promote(len, 0), N);
>> -
>> -  while (spu_extract(spu_andc(spu_cmpeq(cnt, 32), gt), 0)) {
>> -    data = *ptr++;
>> -    len -= 16;
>> -    cnt  = spu_cntlz(spu_gather(spu_cmpeq(data, 0)));
>> -    len += spu_extract(cnt, 0);
>> -
>> -    gt = spu_cmpgt(spu_promote(len, 0), N);
>> -  }
>> -
>> -  /* len = MIN(len, n)
>> -   */
>> -  len = spu_extract(spu_sel(spu_promote(len, 0), N, gt), 0);
>> -
>> -  /* Perform a memcpy of the resulting length
>> -   */
>> -  (void)memcpy((void *)dst, (const void *)src, len);
>> -
>> -  /* Terminate the resulting concetenated string.
>> -   */
>> -  dst[len] = '\0';
>> -
>> -  return (dest);
>> +  return _strncpy(_straddr(dest), src, n, 1, 1);
>>  }
>> Index: quilt/newlib/libc/machine/spu/strncpy.c
>> ===================================================================
>> --- quilt.orig/newlib/libc/machine/spu/strncpy.c
>> +++ quilt/newlib/libc/machine/spu/strncpy.c
>> @@ -1,9 +1,6 @@
>>  /*
>> -  (C) Copyright 2001,2006,
>> +  (C) Copyright 2008
>>    International Business Machines Corporation,
>> -  Sony Computer Entertainment, Incorporated,
>> -  Toshiba Corporation,
>> -
>>    All rights reserved.
>>  
>>    Redistribution and use in source and binary forms, with or without
>> @@ -30,58 +27,16 @@
>>    ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
>>    POSSIBILITY OF SUCH DAMAGE.
>>  */
>> -#include <spu_intrinsics.h>
>>  #include <stddef.h>
>>  #include <string.h>
>> +#include "strcpy.h"
>>  
>> -/* Copy the string up to n character from memory area src to
>> - * memory area dest. The memory areas may not overlap. The
>> - * strncpy subroutine returns a pointer to dest.
>> +/*
>> + * Copy the string up to n character from memory area src to memory area
>> + * dest. The memory areas may not overlap. The strncpy subroutine returns
>> + * a pointer to dest.
>>   */
>>  char * strncpy(char * __restrict__ dest, const char * __restrict__ src, size_t n)
>>  {
>> -  unsigned int len;
>> -  unsigned int cmp, skip, mask;
>> -  vec_uchar16 *ptr, data;
>> -  vec_uint4 cnt, gt, N;
>> -
>> -  N = spu_promote((unsigned int)n, 0);
>> -
>> -  /* Determine the string length, including termination character,
>> -   * clamped to n characters.
>> -   */
>> -  ptr = (vec_uchar16 *)src;
>> -  skip = (unsigned int)(ptr) & 15;
>> -  mask = 0xFFFF >> skip;
>> -
>> -  data = *ptr++;
>> -  cmp = spu_extract(spu_gather(spu_cmpeq(data, 0)), 0);
>> -  cmp &= mask;
>> -
>> -  cnt = spu_cntlz(spu_promote(cmp, 0));
>> -  len = spu_extract(cnt, 0) - (skip + 15);
>> -
>> -  gt = spu_cmpgt(spu_promote(len, 0), N);
>> -
>> -  while (spu_extract(spu_andc(spu_cmpeq(cnt, 32), gt), 0)) {
>> -    data = *ptr++;
>> -    len -= 16;
>> -    cnt  = spu_cntlz(spu_gather(spu_cmpeq(data, 0)));
>> -    len += spu_extract(cnt, 0);
>> -
>> -    gt = spu_cmpgt(spu_promote(len, 0), N);
>> -  }
>> -
>> -  /* len = MIN(len, n)
>> -   */
>> -  len = spu_extract(spu_sel(spu_promote((unsigned int)len, 0), N, gt), 0);
>> -
>> -  /* Padding
>> -   */
>> -  if (len != n) {
>> -    memset(dest + len, 0, n - len);
>> -  }
>> -  /* Perform a memcpy of the resulting length
>> -   */
>> -  return ((char *)memcpy((void *)dest, (const void *)src, len));
>> +  return _strncpy(dest, src, n, 1, 0);
>>  }
>> Index: quilt/newlib/libc/machine/spu/straddr.h
>> ===================================================================
>> --- /dev/null
>> +++ quilt/newlib/libc/machine/spu/straddr.h
>> @@ -0,0 +1,75 @@
>> +/*
>> +  (C) Copyright 2008
>> +  International Business Machines Corporation,
>> +  All rights reserved.
>> +
>> +  Redistribution and use in source and binary forms, with or without
>> +  modification, are permitted provided that the following conditions are met:
>> +
>> +    * Redistributions of source code must retain the above copyright notice,
>> +  this list of conditions and the following disclaimer.
>> +    * Redistributions in binary form must reproduce the above copyright
>> +  notice, this list of conditions and the following disclaimer in the
>> +  documentation and/or other materials provided with the distribution.
>> +    * Neither the names of the copyright holders nor the names of their
>> +  contributors may be used to endorse or promote products derived from
>> +  this software without specific prior written permission.
>> +
>> +  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
>> +  IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
>> +  TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
>> +  PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
>> +  OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
>> +  EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
>> +  PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
>> +  PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
>> +  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
>> +  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
>> +  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
>> +*/
>> +
>> +#include <spu_intrinsics.h>
>> +#include "vec_literal.h"
>> +
>> +/*
>> + * Supply the inline _straddr for use by strcat and strncat.
>> + *
>> + * _straddr: search the string s, and return the address of the first byte
>> + * containing zero.
>> + */
>> +static inline char *_straddr(const char *s)
>> +{
>> +  unsigned int cnt, cmp, skip, mask;
>> +  vec_uchar16 *ptr, data;
>> +
>> +  /*
>> +   * Compensate for unaligned strings.
>> +   */
>> +  ptr = (vec_uchar16 *)s; /* implicit (s & ~0xf) */
>> +  skip = (unsigned int)(ptr) & 0xf;
>> +  /*
>> +   * skip the first skip bytes starting at (s & ~0xf).
>> +   */
>> +  mask = 0xFFFF >> skip;
>> +
>> +  data = *ptr;
>> +  cmp = spu_extract(spu_gather(spu_cmpeq(data, 0)), 0);
>> +  cmp &= mask;
>> +
>> +  cnt = spu_extract(spu_cntlz(spu_promote(cmp, 0)), 0);
>> +
>> +  while (cnt == 32) {
>> +    data = *++ptr;
>> +    cnt = spu_extract(spu_cntlz(spu_gather(spu_cmpeq(data, 0))), 0);
>> +    /*
>> +     * The first 16 bits for gather on a byte vector are zero, so if cnt
>> +     * is 32, none of the 16 bytes in data was zero. And, there are (cnt -
>> +     * 16) leading non-zero bytes in data.
>> +     */
>> +  }
>> +  /*
>> +   * The first zero byte is at ptr aligned down plus the (cnt - 16) bytes
>> +   * that precede it in that quadword.
>> +   */
>> +  return ((char*) (((int) ptr & ~0xf) + (cnt - 16)));
>> +}
>>
>>   


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]