Implementation of some string.h function using SSE2 instructions

Paweł Sikora pluto@agmk.net
Wed Aug 8 11:27:00 GMT 2007


Wojciech Muła pisze:
> Hi all,
> 
> I just subscribed to the list.
> 
> Does anybody work on SSE2 implementation?

Hi,

A few days ago I started an implementation of strcmp()
for aligned data using <emmintrin.h> builtins.
GCC 4.2/4.3 optimizes such code pretty well,
so a raw asm implementation isn't required.

$ cat sse2_strings.h
/* Include guard: the declarations below are seen at most once per translation unit. */
#ifndef sse2_strings_h
#define sse2_strings_h

/* A char type declared with 16-byte alignment, matching the 128-bit loads
   the implementation performs on these buffers. */
typedef char __attribute__(( aligned( 16 ) )) sse2_aligned_byte;
/* Pointer to read-only, 16-byte-aligned bytes; the string argument type
   taken by the routine declared below. */
typedef sse2_aligned_byte const* sse2_byte_buffer;

/* Compares the NUL-terminated strings s1 and s2 in 16-byte chunks.
   Returns 0 when the strings are equal, otherwise the difference between
   the bytes at the first position where they differ. */
int sse2_strcmp( sse2_byte_buffer s1, sse2_byte_buffer s2 );

#endif

$ cat sse2_strings.c
#include "sse2_strings.h"
#include <emmintrin.h>

/*
 * Bitwise complement of a 128-bit integer vector.
 * Every bit of x is flipped by XOR-ing it against a mask with all bits set.
 */
static inline __m128i not( __m128i x )
{
         /* -1 replicated into each 32-bit lane yields 128 set bits. */
         const __m128i all_set = _mm_set1_epi32( -1 );

         return _mm_xor_si128( all_set, x );
}

/*
 * Compare two NUL-terminated strings 16 bytes at a time using SSE2.
 *
 * s1, s2: strings to compare. Both are read with aligned 128-bit loads, so
 *         each must point to 16-byte-aligned storage. Whole 16-byte chunks
 *         are loaded, which means bytes located after the terminator inside
 *         the final chunk are read too (they never influence the result).
 *
 * Returns 0 when the strings are equal; otherwise the difference between the
 * first pair of differing bytes, interpreted as unsigned char as strcmp()
 * does, so the sign is correct for bytes >= 0x80 even where char is signed.
 */
int sse2_strcmp( sse2_byte_buffer s1, sse2_byte_buffer s2 )
{
         const __m128i zero = _mm_setzero_si128();

         for ( ; ; s1 += sizeof( __m128i ), s2 += sizeof( __m128i ) )
         {
                 const __m128i m1 = _mm_load_si128( ( const __m128i* )s1 );
                 const __m128i m2 = _mm_load_si128( ( const __m128i* )s2 );

                 /* Bit i is set when byte i differs between the two chunks. */
                 int differ = ~_mm_movemask_epi8( _mm_cmpeq_epi8( m1, m2 ) ) & 0xffff;
                 /* Bit i is set when byte i of s1 is the terminator. A
                    terminator present only in s2 is already a difference, so
                    s2 needs no separate check. */
                 int ended = _mm_movemask_epi8( _mm_cmpeq_epi8( m1, zero ) );
                 int mask = differ | ended;

                 if ( mask )
                 {
                         /* Lowest set bit = first byte that ends the comparison. */
                         unsigned index = __builtin_ffs( mask ) - 1;
                         const unsigned char *p1 = ( const unsigned char* )s1;
                         const unsigned char *p2 = ( const unsigned char* )s2;
                         return ( p1[ index ] - p2[ index ] );
                 }
         }
}



More information about the Libc-alpha mailing list