/* SSE2 strlen built from a generic skeleton: the S() macro selects byte or
   32-bit element comparisons, and defining NVERSION generates the
   length-bounded (strnlen-style) variant, which expects an extra byte
   count `no` as a parameter.  */
#define CHAR_BASED
#include <stdint.h>
#include <stddef.h>
#include <emmintrin.h>

typedef __m128i tp_vector;
#ifdef CHAR_BASED
typedef unsigned char uchar;
typedef uchar tp_scalar;
#define S(x) _mm_##x##_epi8
#else
typedef int tp_scalar;
#define S(x) _mm_##x##_epi32
#endif
typedef uint64_t tp_mask;

#define LOAD(x)     _mm_load_si128((tp_vector *) (x))
#define LOADU(x)    _mm_loadu_si128((tp_vector *) (x))
#define STORE(x,y)  _mm_store_si128((tp_vector *) (x), (y))
#define STOREU(x,y) _mm_storeu_si128((tp_vector *) (x), (y))
#define MIN _mm_min_epu8
#define EQ  S(cmpeq)
#define OR  _mm_or_si128
#define shift_down(x,y) ((x) >> (y))
#define shift_up(x,y)   ((x) << (y))
#define PARA (sizeof(tp_vector) / sizeof(tp_scalar))
#define BROADCAST(x) S(set1)(x)
#define get_mask(x) ((tp_mask) _mm_movemask_epi8(x))

/* Position of the first/last set bit, scaled to an element index.  */
static inline tp_mask first_bit(tp_mask x) {
  return __builtin_ctzl(x) / sizeof(tp_scalar);
}
static inline tp_mask last_bit(tp_mask x) {
  return (sizeof(tp_mask) * 8 - 1 - __builtin_clzl(x)) / sizeof(tp_scalar);
}

/* Compare e against 64 aligned bytes at s2; bit i of the result is set
   iff byte i equals e.  */
static inline uint64_t find64(tp_scalar e, void *s2) {
  tp_vector pe = BROADCAST(e);
  tp_vector v0, v1, v2, v3;
  tp_scalar *p = s2;
  v0 = EQ(pe, LOAD(p));
  v1 = EQ(pe, LOAD(p + PARA));
  v2 = EQ(pe, LOAD(p + 2 * PARA));
  v3 = EQ(pe, LOAD(p + 3 * PARA));
  return get_mask(v0) | (get_mask(v1) << 16)
       | (get_mask(v2) << 32) | (get_mask(v3) << 48);
}

#define unroll 4

#ifdef NVERSION
#define NVERSION_S(x,y) x
#else
#define NVERSION_S(x,y) y
#endif

size_t strlen2(const char *_s) {
  uchar *s = (uchar *) _s;
  tp_mask mask;
  tp_scalar *s2;
#ifdef NVERSION
  /* Bounded variant: `no` is the byte-count parameter.  */
  if (!no)
    return 0;
  tp_scalar *end = s + no;
  tp_scalar *s2end = (tp_scalar *)
    (((size_t) end - 1) & ~(unroll * sizeof(tp_vector) - 1));
#endif
  tp_vector pe = BROADCAST(0);
  tp_vector v0, v1, v2, v3;

  /* Header: when a 16-byte-aligned 64-byte read cannot cross a page
     boundary, round s down to 16 bytes and check 64 bytes at once;
     otherwise round down to 64 bytes, which never crosses a page.
     Bits below s are shifted out of the mask.  */
  if ((((size_t) s) & 4095) <= 4096 - 64) {
    s2 = (tp_scalar *) ((size_t) s & ~(sizeof(tp_vector) - 1));
    v0 = EQ(pe, LOAD(s2));
    v1 = EQ(pe, LOAD(s2 + PARA));
    v2 = EQ(pe, LOAD(s2 + 2 * PARA));
    v3 = EQ(pe, LOAD(s2 + 3 * PARA));
    mask = get_mask(v0) | (get_mask(v1) << 16)
         | (get_mask(v2) << 32) | (get_mask(v3) << 48);
    mask = shift_down(mask, s - s2);
#ifdef NVERSION
    if ((size_t) (end - s2) < 64) { /* on x64 strengthen to end - s2 <= 64 */
      /* Drop bits at or past end; relies on x86 shifts being mod 64.  */
      mask = mask & shift_down((tp_mask) -1, -((long) (end - s2)));
      if (mask)
        return first_bit(mask);
      return no;
    }
#endif
    if (mask)
      return first_bit(mask);
    s2 = (tp_scalar *) ((size_t) s & ~(unroll * sizeof(tp_vector) - 1));
    s2 += unroll * PARA;
  } else {
    s2 = (tp_scalar *) ((size_t) s & ~(unroll * sizeof(tp_vector) - 1));
    v0 = EQ(pe, LOAD(s2));
    v1 = EQ(pe, LOAD(s2 + PARA));
    v2 = EQ(pe, LOAD(s2 + 2 * PARA));
    v3 = EQ(pe, LOAD(s2 + 3 * PARA));
    mask = get_mask(v0) | (get_mask(v1) << 16)
         | (get_mask(v2) << 32) | (get_mask(v3) << 48);
    mask = shift_down(mask, s - s2);
#ifdef NVERSION
    if ((size_t) (end - s2) < 64) {
      mask = mask & shift_down((tp_mask) -1, -((long) (end - s2)));
      if (mask)
        return first_bit(mask);
      return no;
    }
#endif
    if (mask)
      return first_bit(mask);
    s2 += unroll * PARA;
  }

  /* Main loop, two 64-byte blocks per iteration: the first block ORs the
     four comparison vectors; the second uses the pminub trick (the
     unsigned min of four data vectors has a zero byte iff one of them
     does).  */
  while (NVERSION_S(s2 != s2end, 1)) {
    v0 = EQ(pe, LOAD(s2));
    v1 = EQ(pe, LOAD(s2 + PARA));
    v2 = EQ(pe, LOAD(s2 + 2 * PARA));
    v3 = EQ(pe, LOAD(s2 + 3 * PARA));
    if (get_mask(OR(v0, OR(v1, OR(v2, v3))))) {
      mask = get_mask(v0) | (get_mask(v1) << 16)
           | (get_mask(v2) << 32) | (get_mask(v3) << 48);
      if (NVERSION_S(s2 != s2end
                     || (mask & shift_down((tp_mask) -1,
                                           -((long) (end - s2)))), 1))
        return s2 + first_bit(mask) - s;
    }
    s2 += unroll * PARA;
    v0 = LOAD(s2);
    v1 = LOAD(s2 + PARA);
    v2 = LOAD(s2 + 2 * PARA);
    v3 = LOAD(s2 + 3 * PARA);
    if (get_mask(EQ(pe, MIN(v0, MIN(v1, MIN(v2, v3)))))) {
      /* The barrier keeps the compiler from reusing the raw loads above,
         so the per-vector comparisons below are recomputed.  */
      asm volatile ("" : : : "memory");
      v0 = EQ(pe, LOAD(s2));
      v1 = EQ(pe, LOAD(s2 + PARA));
      v2 = EQ(pe, LOAD(s2 + 2 * PARA));
      v3 = EQ(pe, LOAD(s2 + 3 * PARA));
      mask = get_mask(v0) | (get_mask(v1) << 16)
           | (get_mask(v2) << 32) | (get_mask(v3) << 48);
      if (NVERSION_S(s2 != s2end
                     || (mask & shift_down((tp_mask) -1,
                                           -((long) (end - s2)))), 1))
        return s2 + first_bit(mask) - s;
    }
    s2 += unroll * PARA;
  }
  return NVERSION_S(no, 1); /* unreachable in the plain version */
}
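
/* A minimal self-test sketch, not part of the routine above; the TEST
   guard, buffer size, and offset/length ranges are arbitrary choices.
   Build with `gcc -O2 -msse2 -DTEST`.  It places strings of every small
   length at every small offset inside a page-aligned buffer, so both
   header paths (the in-page 16-byte-aligned read and the 64-byte-aligned
   read near a page end) are exercised, and cross-checks the result
   against libc strlen.  */
#ifdef TEST
#include <assert.h>
#include <string.h>
#include <stdio.h>

int main(void) {
  /* Page-aligned so the 4095-masking in the header sees real page
     offsets; spare space around the strings absorbs the over-reads.  */
  static uchar buf[3 * 4096] __attribute__((aligned(4096)));

  for (size_t off = 0; off < 192; off++)
    for (size_t len = 0; len < 300; len++) {
      uchar *p = buf + 4096 + off;
      memset(p, 'x', len);
      p[len] = 0;
      assert(strlen2((char *) p) == len);
      assert(strlen2((char *) p) == strlen((char *) p));
    }

  /* Strings ending flush against a page boundary, so some starts fall in
     the last 64 bytes of a page and take the fallback header path.  */
  for (size_t len = 1; len <= 256; len++) {
    uchar *p = buf + 2 * 4096 - len;
    memset(p, 'x', len - 1);
    p[len - 1] = 0;
    assert(strlen2((char *) p) == len - 1);
  }

  puts("all tests passed");
  return 0;
}
#endif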