This is the mail archive of the
libc-help@sourceware.org
mailing list for the glibc project.
Faster strstr
- From: OndÅej BÃlka <neleai at seznam dot cz>
- To: libc-help at sourceware dot org
- Date: Fri, 11 Nov 2011 08:51:47 +0100
- Subject: Faster strstr
Hello
I written faster version of strstr.
It uses sse2 and would be easy to convert to mmx.
On core2 it is faster than glibc strstr and for large strings 25 times so.
One of reasons is that strstr switches to two-way too much. When using
strstr3 which uses strchr is only 5 times faster.
Main speedup is archived by scaning for first two characters instead
only first character.
It has guaranted linear time by counting number of character processed
and switching to strstr when it processed/progress ratio is more than 16
Here are selected runing times of searching in 1<<24 characters divided to groups of given size.
size 8 strstr2 1980828 strstr 1995713 strstr3 2004866 speedup1 1.007514 speedup2 1.012135
size 11 strstr2 1372418 strstr 1579756 strstr3 1399970 speedup1 1.151075 speedup2 1.020076
size 16 strstr2 994560 strstr 1183617 strstr3 1027443 speedup1 1.190091 speedup2 1.033063
size 22 strstr2 753461 strstr 965639 strstr3 759590 speedup1 1.281605 speedup2 1.008134
size 32 strstr2 511804 strstr 659935 strstr3 550006 speedup1 1.289429 speedup2 1.074642
size 45 strstr2 357020 strstr 493989 strstr3 394202 speedup1 1.383645 speedup2 1.104145
size 64 strstr2 280914 strstr 387428 strstr3 320476 speedup1 1.379169 speedup2 1.140833
size 256 strstr2 87191 strstr 186288 strstr3 95205 speedup1 2.136551 speedup2 1.091913
size 1024 strstr2 24216 strstr 143255 strstr3 39694 speedup1 5.915717 speedup2 1.639164
size 8192 strstr2 12646 strstr 149955 strstr3 69072 speedup1 11.857900 speedup2 5.461964
size 32768 strstr2 5583 strstr 115196 strstr3 21459 speedup1 20.633350 speedup2 3.843632
size 524288 strstr2 4173 strstr 114604 strstr3 20739 speedup1 27.463217 speedup2 4.969806
And code
#include <string.h>
#include <emmintrin.h>
/*on little endian architectures rigth shift shifts left. To avoid confusion we define shift up/down where shift up moves bits to rigth.*/
#define SHIFT_DOWN _mm_srli_si128
#define shift_down >>
#define SHIFT_UP _mm_slli_si128
#define shift_up <<
#define MBTYPE __m128i
#define BYTES_AT_ONCE 16
#define MASK _mm_movemask_epi8
#define LOAD _mm_load_si128
#define CMP _mm_cmpeq_epi8
#define AND _mm_and_si128
#define OR _mm_or_si128
#define BROADCAST _mm_set1_epi8
#define ALIGN(_str,_aligned,_offset) long _offset=((long)_str)&( (BYTES_AT_ONCE-1));\
char *_aligned=_str-_offset;
/*test if first two characters match or first character is zero in paralel.*/
/*note that for last bit match is always false so we test it separately(which is faster than repairing by binary operations)*/
#define MASKMATCH \
x = LOAD(aligned);\
mz= MASK(OR(AND(CMP(x,mask0),SHIFT_DOWN(CMP(x,mask1),1)),CMP(x,zeros)));
char* strstr2(char *str,char* pattern) __attribute__((noinline));
char* strstr2(char *str,char* pattern){
if (!pattern[0]) return str;
if (!pattern[1]) return strchr(str,pattern[0]);
int counter=-16;
int pattern_len=0;
MBTYPE mask0 =BROADCAST(pattern[0]), mask1 =BROADCAST(pattern[1]), zeros =BROADCAST(0);
char f=pattern[0];
ALIGN(str,aligned,offset)
int i,mz; MBTYPE x;
MASKMATCH
mz=(mz shift_down offset) shift_up offset;//ignore matches before start
while (1){
if (__builtin_expect(mz,0)){
for(i=0;i<BYTES_AT_ONCE;i++){
if (mz&1){
if (!aligned[i]) return NULL;
char *s2=aligned+i+2,*p2=pattern+2;
if (!*p2) return aligned+i;
while(*s2==*p2){
s2++;p2++;
if (!*p2) return aligned+i;
}
counter+=s2-aligned+i;
if (counter>(aligned-str)*16) return strstr(aligned,pattern);
}
mz=mz shift_down 1;
}
}
if (__builtin_expect(aligned[BYTES_AT_ONCE-1]==f,0) &&!strncmp(pattern,aligned+BYTES_AT_ONCE-1,strlen(pattern))) return aligned+BYTES_AT_ONCE-1;
aligned+=BYTES_AT_ONCE;
MASKMATCH
}
}
/*comparsion code*/
#include <stdio.h>
#include <sys/time.h>
#include <math.h>
char *strstr3(char *str,char *pattern){
while (1){
str=strchr(str,pattern[0]);
if (!str) return NULL;
if (!strncmp(str,pattern,strlen(pattern))) return str;
str++;
}
}
void gens(char *s ,int size,int alpha){
int i;
for(i=0;i<size;i++){s[i]=rand()%alpha+1;
}
s[size]=0;
}
int main(){
srand(42);
int collect;
int totaltime[3]={0,0,0};
char needle[111],*hay=malloc(10000000);
int i;
int offset=8;
int alpha=30;
float size=8;
while(size<1000000){
for(i=0;i<(1<<24)/size;i++){
int k;
for(k=0;k<3;k++){
gens(needle,10,alpha),gens(hay,size+offset,alpha);
struct timeval c1; gettimeofday(&c1,NULL);
collect+=(long)((k==2)?strstr3(hay+offset,needle):( k==1 ? strstr(hay+offset,needle):strstr2(hay+offset,needle)));
struct timeval c2; gettimeofday(&c2,NULL);
int t1=1000000*c1.tv_sec+c1.tv_usec; int t2=1000000*c2.tv_sec+c2.tv_usec;
totaltime[k]+=t2-t1;
}
}
printf("size %i strstr2 %i strstr %i strstr3 %i speedup1 %f speedup2 %f\n",(int)size,totaltime[0],totaltime[1],totaltime[2],((float)totaltime[1])/totaltime[0],((float)totaltime[2])/totaltime[0]);
totaltime[0]=totaltime[1]=totaltime[2]=0;
size*=sqrt(2);
}
return collect;
}
--
UPS interrupted the server's power