This is the mail archive of the
libc-alpha@sourceware.org
mailing list for the glibc project.
Re: [RFC PATCH v2] Add strcmp, strncmp, memcmp inline implementation.
- From: Andrew Pinski <pinskia at gmail dot com>
- To: OndÅej BÃlka <neleai at seznam dot cz>
- Cc: GNU C Library <libc-alpha at sourceware dot org>
- Date: Mon, 25 May 2015 16:23:52 +0800
- Subject: Re: [RFC PATCH v2] Add strcmp, strncmp, memcmp inline implementation.
- Authentication-results: sourceware.org; auth=none
- References: <20150525015826 dot GA29445 at domone> <20150525081156 dot GA10661 at domone>
On Mon, May 25, 2015 at 4:11 PM, OndÅej BÃlka <neleai@seznam.cz> wrote:
> Sorry for noise,
>
> I wanted to ask if one needs to surround strlen with
> __builtin_constant_p to avoid runtime overhead, and when I read patch I
> realized that I by mistake included older partial patch. What I intended
> is here, please ignore previous one. I know that it had bug when you
> have string literal with zero in middle.
>
> So do I need that to be safe? And other comments?
Yes what will it take to get this into GCC instead of adding one more
hack to glibc for this?
Sorry but I feel like this would be better in GCC than doing this in glibc.
Thanks,
Andrew
>
> On Mon, May 25, 2015 at 03:58:26AM +0200, OndÅej BÃlka wrote:
>> Hi,
>>
>> I just found that on x64 gcc __builtin_strcmp suck a lot. And by lot I
>> mean its around three times slower than libcall by using rep cmpsb which
>> even intel manual says that shouldn't be used.
>>
>> So I decided to write a strcmp inline that works. As reencoding it as
>> gcc pass would be extra effort without any benefit I skipped it.
>>
>> This adds x64 specific implementation for constant arguments less than
>> 16 bytes. These are quite common as programmer often does checks like
>>
>> if (strcmp (x, "foo"))
>>
>> Extending this for 32 bytes and more would be straightforward but
>> wouldn't help much as there aren't lot of that large words.
>>
>> As same trick could be used for strncmp and memcmp with size <= 16
>> with few extra checks as we exploited alignment of string literals.
>>
>> It could be optimized more with cooperation from gcc. A page-cross check
>> could be omitted in most cases using dataflow that gcc already does in
>> fortify_source. A CROSS_PAGE macro could first check for
>> __builtin_valid_range_p(x, x+16) which evaluates to true if gcc can
>> prove that x is more than 16 bytes large.
>>
>>
>> A possible issue would be introducing sse with string.h. How detect gcc
>> -no-sse flag?
>>
>
> * sysdeps/x86_64/bits/string.h: New file.
>
> diff --git a/sysdeps/x86_64/bits/string.h b/sysdeps/x86_64/bits/string.h
> new file mode 100644
> index 0000000..c4d154b
> --- /dev/null
> +++ b/sysdeps/x86_64/bits/string.h
> @@ -0,0 +1,87 @@
> +/* This file should provide inline versions of string functions.
> +
> + Surround GCC-specific parts with #ifdef __GNUC__, and use `__extern_inline'.
> +
> + This file should define __STRING_INLINES if functions are actually defined
> + as inlines. */
> +
> +#ifndef _BITS_STRING_H
> +#define _BITS_STRING_H 1
> +
> +/* Define if architecture can access unaligned multi-byte variables. */
> +#define _STRING_ARCH_unaligned 0
> +
> +#ifdef _USE_GNU
> +# if __GNUC_PREREQ (3, 2)
> +# define _HAVE_STRING_ARCH_strcmp
> +# define _HAVE_STRING_ARCH_strncmp
> +# define _HAVE_STRING_ARCH_memcmp
> +
> +# include <stdint.h>
> +# include <emmintrin.h>
> +# define __LOAD(x) _mm_load_si128 ((__tp_vector *) (x))
> +# define __LOADU(x) _mm_loadu_si128 ((__tp_vector *) (x))
> +# define get_mask(x) ((uint64_t) _mm_movemask_epi8 (x))
> +# define __EQ _mm_cmpeq_epi8
> +typedef __m128i __tp_vector;
> +typedef uint64_t __tp_mask;
> +
> +#define CROSS_PAGE(p) __builtin_expect (((uintptr_t) s) % 4096 \
> + > 4096 - sizeof (__tp_vector) , 0)
> +
> +static inline __attribute__ ((always_inline))
> +int
> +__memcmp_small_a (char *s, char *c, int n)
> +{
> + if (CROSS_PAGE (s))
> + return memcmp (s, c, n);
> + __tp_mask m = get_mask (__EQ (__LOADU (s), __LOAD (c))) | 1UL << n;
> + int found = __builtin_ctzl (m);
> + return s[found] - c[found];
> +}
> +static inline __attribute__ ((always_inline))
> +int
> +__memcmp_small (char *s, char *c, int n)
> +{
> + if (CROSS_PAGE (s) || CROSS_PAGE (c))
> + return memcmp (s, c, n);
> + __tp_mask m = get_mask (__EQ (__LOADU (s), __LOADU (c))) | 1UL << n;
> + int found = __builtin_ctzl (m);
> + return s[found] - c[found];
> +}
> +#define __min(x,y) (x < y ? x : y)
> +
> +/* Dereferencing a pointer arg to run sizeof on it fails for the void
> + pointer case, so we use this instead.
> + Note that __x is evaluated twice. */
> +#define __string2_1bptr_p(__x) __builtin_constant_p (__x) && \
> + ((size_t)(const void *)((__x) + 1) - (size_t)(const void *)(__x) == 1)
> +
> +
> +# define strcmp(s1, s2) \
> + (__extension__ \
> + (__string2_1bptr_p (s1) && strlen (s1) <= 16 \
> + ? __memcmp_small_a (s1, s2, strlen (s1)) \
> + : (__string2_1bptr_p (s2) && strlen (s2) <= 16 \
> + ? - __memcmp_small_a (s2, s1, strlen (s2)) \
> + : strcmp (s1, s2))))
> +
> +# define strncmp(s1, s2, n) \
> + (__extension__ \
> + (__string2_1bptr_p (s1) && strlen (s1) <= 16 \
> + ? __memcmp_small_a (s1, s2, min (n, strlen (s1))) \
> + : (__string2_1bptr_p (s2) && strlen (s2) <= 16 \
> + ? - __memcmp_small_a (s2, s1, min (n, strlen (s2))) \
> + : strncmp (s1, s2, n))))
> +
> +# define memcmp(s1, s2, n) \
> + (__extension__ \
> + (__builtin_constant_p (n <= 16) && n <= 16 \
> + ? n == 0 ? 0 : __memcmp_small (s1, s2, n - 1)) \
> + : memcmp (s1, s2, n))
> +
> +
> +# undef __string2_1bptr_p
> +# endif
> +#endif