Faster strstr

Ondřej Bílka neleai@seznam.cz
Fri Nov 11 06:52:00 GMT 2011


Hello 
I written faster version of strstr.
It uses sse2 and would be easy to convert to mmx.
On core2 it is faster than glibc strstr and for large strings 25 times so.
One of reasons is that strstr switches to two-way too much. When using
 strstr3 which uses strchr is only 5 times faster.

Main speedup is archived by scaning for first two characters instead
only first character.

It has guaranted linear time by counting number of character processed
and switching to strstr when it processed/progress ratio is more than 16

Here are selected runing times of searching in 1<<24 characters divided to groups of given size.
size 8    strstr2 1980828 strstr 1995713 strstr3 2004866 speedup1  1.007514 speedup2 1.012135
size 11   strstr2 1372418 strstr 1579756 strstr3 1399970 speedup1  1.151075 speedup2 1.020076
size 16   strstr2  994560 strstr 1183617 strstr3 1027443 speedup1  1.190091 speedup2 1.033063
size 22   strstr2  753461 strstr  965639 strstr3  759590 speedup1  1.281605 speedup2 1.008134
size 32   strstr2  511804 strstr  659935 strstr3  550006 speedup1  1.289429 speedup2 1.074642
size 45   strstr2  357020 strstr  493989 strstr3  394202 speedup1  1.383645 speedup2 1.104145
size 64   strstr2  280914 strstr  387428 strstr3  320476 speedup1  1.379169 speedup2 1.140833
size 256  strstr2   87191 strstr  186288 strstr3   95205 speedup1  2.136551 speedup2 1.091913
size 1024 strstr2   24216 strstr  143255 strstr3   39694 speedup1  5.915717 speedup2 1.639164
size 8192 strstr2   12646 strstr  149955 strstr3   69072 speedup1 11.857900 speedup2 5.461964
size 32768 strstr2   5583 strstr  115196 strstr3   21459 speedup1 20.633350 speedup2 3.843632
size 524288 strstr2  4173 strstr  114604 strstr3   20739 speedup1 27.463217 speedup2 4.969806

And code

#include <string.h>
#include <emmintrin.h>

/*on little endian architectures rigth shift shifts left. To avoid confusion we define shift up/down where shift up moves bits to rigth.*/
#define SHIFT_DOWN   _mm_srli_si128
#define shift_down   >>
#define SHIFT_UP     _mm_slli_si128
#define shift_up     <<

#define MBTYPE __m128i
#define BYTES_AT_ONCE 16
#define MASK   _mm_movemask_epi8
#define LOAD   _mm_load_si128
#define CMP    _mm_cmpeq_epi8
#define AND    _mm_and_si128
#define OR     _mm_or_si128
#define BROADCAST _mm_set1_epi8

#define ALIGN(_str,_aligned,_offset) long   _offset=((long)_str)&( (BYTES_AT_ONCE-1));\
                                     char *_aligned=_str-_offset;

/*test if first two characters match or first character is zero in paralel.*/
/*note that for last bit match is always false so we test it separately(which is faster than repairing by binary operations)*/
#define MASKMATCH \
  x = LOAD(aligned);\
  mz= MASK(OR(AND(CMP(x,mask0),SHIFT_DOWN(CMP(x,mask1),1)),CMP(x,zeros)));

char* strstr2(char *str,char* pattern) __attribute__((noinline));
char* strstr2(char *str,char* pattern){
  if (!pattern[0]) return str;
  if (!pattern[1]) return strchr(str,pattern[0]);
  int counter=-16;
  int pattern_len=0;
  MBTYPE mask0 =BROADCAST(pattern[0]), mask1 =BROADCAST(pattern[1]), zeros =BROADCAST(0);
  char f=pattern[0];
  ALIGN(str,aligned,offset)
  int i,mz; MBTYPE x;
  MASKMATCH
  mz=(mz shift_down offset) shift_up offset;//ignore matches before start
  while (1){
    if (__builtin_expect(mz,0)){
      for(i=0;i<BYTES_AT_ONCE;i++){
        if (mz&1){
          if (!aligned[i]) return NULL;
          char  *s2=aligned+i+2,*p2=pattern+2;
          if (!*p2) return aligned+i;
          while(*s2==*p2){
            s2++;p2++;
            if (!*p2) return aligned+i;
          }
          counter+=s2-aligned+i;
          if (counter>(aligned-str)*16) return strstr(aligned,pattern);
        }
        mz=mz shift_down 1;
      }
    }
    if (__builtin_expect(aligned[BYTES_AT_ONCE-1]==f,0) &&!strncmp(pattern,aligned+BYTES_AT_ONCE-1,strlen(pattern))) return aligned+BYTES_AT_ONCE-1;
    aligned+=BYTES_AT_ONCE;
    MASKMATCH
  }
}


/*comparsion code*/
#include <stdio.h>
#include <sys/time.h>
#include <math.h>
char *strstr3(char *str,char *pattern){
  while (1){
    str=strchr(str,pattern[0]);
    if (!str) return NULL;
    if (!strncmp(str,pattern,strlen(pattern))) return str;
    str++;
  }
}

void gens(char *s ,int size,int alpha){
  int i;
  for(i=0;i<size;i++){s[i]=rand()%alpha+1;
  }  
  s[size]=0;
}

int main(){
  srand(42);
  int collect;
  int totaltime[3]={0,0,0};
  char needle[111],*hay=malloc(10000000);
  int i;
  int offset=8;
  int alpha=30;
  float size=8;
  while(size<1000000){
    for(i=0;i<(1<<24)/size;i++){
      int k;
      for(k=0;k<3;k++){
        gens(needle,10,alpha),gens(hay,size+offset,alpha);
        struct timeval c1; gettimeofday(&c1,NULL);
        collect+=(long)((k==2)?strstr3(hay+offset,needle):( k==1 ? strstr(hay+offset,needle):strstr2(hay+offset,needle)));
        struct timeval c2; gettimeofday(&c2,NULL);
        int t1=1000000*c1.tv_sec+c1.tv_usec; int t2=1000000*c2.tv_sec+c2.tv_usec;
        totaltime[k]+=t2-t1;
      }
    }
    printf("size %i  strstr2 %i  strstr %i strstr3 %i speedup1 %f  speedup2 %f\n",(int)size,totaltime[0],totaltime[1],totaltime[2],((float)totaltime[1])/totaltime[0],((float)totaltime[2])/totaltime[0]);
    totaltime[0]=totaltime[1]=totaltime[2]=0;
    size*=sqrt(2);
  }
  return collect;
}

-- 

UPS interrupted the server's power





More information about the Libc-help mailing list