This is the mail archive of the libc-alpha@sourceware.org mailing list for the glibc project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

[PING][PATCH] Improve generic strcspn and strpbrk


Ping
On Wed, Jul 01, 2015 at 05:35:15PM +0200, OndÅej BÃlka wrote:
> Hi,
> 
> As I promised previously to improve sse2 strspn I found that I could
> convince gcc to generate reasonable code. Its really needed to use goto,
> otherwise gcc totally messes up a loop which will be 10% slower than
> current. Generated assembly is relatively ok, gcc only increments two
> registers by 4 instead one which doesn't affect performance much.
> 
> So here is a generic strcspn/strpbrk implementation and I will send
> patch to replace x64 code later.
> 
> 
> Carlos as usual this is code where you need to make hard decision based
> on actual profile. This implementation on gcc workload shows 30%
> improvement. That is by exploiting property of workload that match in 4
> bytes is likely. If it isn't a performance will suffer as these checks
> increase cost of subsequent characters.
> 
> This is again case where its unavoidable that some patterns will become
> five times slower. If accept has 128 characters and match is in second
> character then we spend ages with byte-by-byte checks of accept. But if
> match is in first character then strchr will find that quickly. Problem
> is that 99.8% of call have accept less than 8 bytes. So adding just a
> check if size exceeds 8 will harm performance. See following two graphs.
> 
> http://kam.mff.cuni.cz/~ondra/benchmark_string/core2/strpbrk_profile/results_gcc/result.html
> http://kam.mff.cuni.cz/~ondra/benchmark_string/core2/strpbrk_profile/results_rand/result.html
> 
> I cannot avoid that without my generic string framework where I would
> first to vectorized check of first 8 bytes and its dubious if that would
> help 32bit architectures. For x64 I correct solution is use sse2 prolog,
> hard part is tuning as these break even depending on size. For use in
> sse2 I need to add LATE_CHECK as it when called from sse42 routine
> accept size is greater than 16 so early heuristic doesn't make much
> sense.
> 
> Main gain versus current sse2 assembly is that I align s to 4 bytes so I
> could read 4 bytes at start of loop before check. That helps mainly
> older cpu to execute loads earlier, it improves performance by 50% on
> core2 large inputs, but newer are better at speculative loads so
> difference gets smaller until haswell that doesn't show a difference.
> 
> Tested on x64 by including them in test-strcspn/strpbrk
> 
> OK to commit?
> 
> 	* string/strcspn.c(STRCSPN): Check membership by array lookup.
> 	* string/strcspn.c: Include string/strpbrk.c
> 	* string/test-strcspn.c (simple_strcspn): Use string version.
> 	* string/test-strpbrk.c (simple_strpbrk): Use string version.
> 
> ---
>  string/strcspn.c      |  43 +-----------------
>  string/strpbrk.c      | 120 +++++++++++++++++++++++++++++++++++++++++++++-----
>  string/test-strcspn.c |  15 ++-----
>  string/test-strpbrk.c |  14 +-----
>  4 files changed, 116 insertions(+), 76 deletions(-)
> 
> diff --git a/string/strcspn.c b/string/strcspn.c
> index 2694d2a..2a82dd2 100644
> --- a/string/strcspn.c
> +++ b/string/strcspn.c
> @@ -1,41 +1,2 @@
> -/* Copyright (C) 1991-2015 Free Software Foundation, Inc.
> -   This file is part of the GNU C Library.
> -
> -   The GNU C Library is free software; you can redistribute it and/or
> -   modify it under the terms of the GNU Lesser General Public
> -   License as published by the Free Software Foundation; either
> -   version 2.1 of the License, or (at your option) any later version.
> -
> -   The GNU C Library is distributed in the hope that it will be useful,
> -   but WITHOUT ANY WARRANTY; without even the implied warranty of
> -   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> -   Lesser General Public License for more details.
> -
> -   You should have received a copy of the GNU Lesser General Public
> -   License along with the GNU C Library; if not, see
> -   <http://www.gnu.org/licenses/>.  */
> -
> -#include <string.h>
> -
> -#undef strcspn
> -
> -#ifndef STRCSPN
> -# define STRCSPN strcspn
> -#endif
> -
> -/* Return the length of the maximum initial segment of S
> -   which contains no characters from REJECT.  */
> -size_t
> -STRCSPN (const char *s, const char *reject)
> -{
> -  size_t count = 0;
> -
> -  while (*s != '\0')
> -    if (strchr (reject, *s++) == NULL)
> -      ++count;
> -    else
> -      return count;
> -
> -  return count;
> -}
> -libc_hidden_builtin_def (strcspn)
> +#define AS_STRCSPN
> +#include "strpbrk.c"
> diff --git a/string/strpbrk.c b/string/strpbrk.c
> index 4f1d9b7..62808da 100644
> --- a/string/strpbrk.c
> +++ b/string/strpbrk.c
> @@ -16,26 +16,124 @@
>     <http://www.gnu.org/licenses/>.  */
>  
>  #include <string.h>
> -
> +#include <stdint.h>
>  #undef strpbrk
> +#undef strcspn
> +
> +
> +#ifdef AS_STRCSPN
> +# ifndef STRPBRK
> +#  define STRPBRK strcspn
> +# endif
> +# define RETURN_TYPE size_t
> +# define RETURN(c) return c
> +#else
> +# define RETURN_TYPE char *
> +# define RETURN(c) return (char *) (s[c] != '\0' ? s + c : NULL)
> +#endif
>  
>  #ifndef STRPBRK
>  #define STRPBRK strpbrk
>  #endif
>  
> +
>  /* Find the first occurrence in S of any character in ACCEPT.  */
> -char *
> -STRPBRK (const char *s, const char *accept)
> +RETURN_TYPE
> +STRPBRK (const char *_s, const char *_accept)
>  {
> -  while (*s != '\0')
> +  unsigned char *s = (unsigned char *) _s;
> +  unsigned char *a = (unsigned char *) _accept;
> +  
> +#ifndef LATE_CHECK
> +  /* We need to align s to 4 bytes. We do check now to avoid expensive table
> +     construction.  */
> +  do 
> +    {
> +      if (s[0] == *a)
> +        RETURN(0);
> +    }
> +  while (*a++);
> +  a = (unsigned char *) _accept;
> + 
> +  /* We couldn't do these checks in one loop as gcc 
> +     messes up register allocation.  */
> +  do 
> +    {
> +      if (s[1] == *a)
> +        RETURN(1);
> +    }
> +  while (*a++);
> +  a = (unsigned char *) _accept;
> +
> +  do 
> +    {
> +      if (s[2] == *a)
> +        RETURN(2);
> +    }
> +  while (*a++);
> +  a = (unsigned char *) _accept;
> +
> +  do 
> +    {
> +      if (s[3] == *a)
> +        RETURN(3);
> +    }
> +  while (*a++);
> +  a = (unsigned char *) _accept;
> +
> +#endif
> +
> +  unsigned char table[256];
> +  memset (table, 0, 256);
> +  do 
>      {
> -      const char *a = accept;
> -      while (*a != '\0')
> -	if (*a++ == *s)
> -	  return (char *) s;
> -      ++s;
> +      table[*a] = 1;
>      }
> +  while (*a++);
> +  unsigned char s0, s1, s2, s3;
> +  size_t count = 0;
> +#ifdef LATE_CHECK
> +  s0 = s[count + 0];
> +  s1 = s[count + 1];
> +  s2 = s[count + 2];
> +  s3 = s[count + 3];
> +  if (table[s0])
> +    goto ret0;
> +  if (table[s1])
> +    goto ret1;
> +  if (table[s2])
> +    goto ret2;
> +  if (table[s3])
> +    goto ret3;
>  
> -  return NULL;
> +#endif
> +
> +  count = 4 - ((uintptr_t) s) % 4;
> +
> +  while (1)
> +    {
> +      s0 = s[count + 0];
> +      s1 = s[count + 1];
> +      s2 = s[count + 2];
> +      s3 = s[count + 3];
> +      if (table[s0])
> +        goto ret0;
> +      if (table[s1])
> +        goto ret1;
> +      if (table[s2])
> +        goto ret2;
> +      if (table[s3])
> +        goto ret3;
> +      count += 4;
> +    }
> +  ret3:
> +  count++;
> +  ret2:
> +  count++;
> +  ret1:
> +  count++;
> +  ret0:
> +  RETURN(count);
>  }
> -libc_hidden_builtin_def (strpbrk)
> +
> +libc_hidden_builtin_def (STRPBRK)
> diff --git a/string/test-strcspn.c b/string/test-strcspn.c
> index b60a048..50a06e4 100644
> --- a/string/test-strcspn.c
> +++ b/string/test-strcspn.c
> @@ -31,18 +31,9 @@ IMPL (stupid_strcspn, 0)
>  IMPL (simple_strcspn, 0)
>  IMPL (strcspn, 1)
>  
> -size_t
> -simple_strcspn (const char *s, const char *rej)
> -{
> -  const char *r, *str = s;
> -  char c;
> -
> -  while ((c = *s++) != '\0')
> -    for (r = rej; *r != '\0'; ++r)
> -      if (*r == c)
> -	return s - str - 1;
> -  return s - str - 1;
> -}
> +#define AS_STRCSPN
> +#define STRPBRK simple_strcspn
> +#include "string/strpbrk.c"
>  
>  size_t
>  stupid_strcspn (const char *s, const char *rej)
> diff --git a/string/test-strpbrk.c b/string/test-strpbrk.c
> index b4ac389..f389e9d 100644
> --- a/string/test-strpbrk.c
> +++ b/string/test-strpbrk.c
> @@ -32,18 +32,8 @@ IMPL (stupid_strpbrk, 0)
>  IMPL (simple_strpbrk, 0)
>  IMPL (strpbrk, 1)
>  
> -char *
> -simple_strpbrk (const char *s, const char *rej)
> -{
> -  const char *r;
> -  char c;
> -
> -  while ((c = *s++) != '\0')
> -    for (r = rej; *r != '\0'; ++r)
> -      if (*r == c)
> -	return (char *) s - 1;
> -  return NULL;
> -}
> +#define STRPBRK simple_strpbrk
> +#include "string/strpbrk.c"
>  
>  char *
>  stupid_strpbrk (const char *s, const char *rej)
> -- 
> 1.8.4.rc3

-- 

hardware stress fractures


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]