[PATCH v1 09/23] x86: Remove strcspn-sse2.S and use the generic implementation
H.J. Lu
hjl.tools@gmail.com
Thu Mar 24 18:57:06 GMT 2022
On Wed, Mar 23, 2022 at 3:00 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> The generic implementation is faster.
>
> geometric_mean(N=20) of all benchmarks New / Original: 0.678
>
> All string/memory tests pass.
> ---
> Geometric Mean N=20 runs; All functions page aligned
> len, align1, align2, pos, New Time / Old Time
> 0, 0, 0, 512, 0.054
> 1, 0, 0, 512, 0.055
> 1, 1, 0, 512, 0.051
> 1, 0, 1, 512, 0.054
> 1, 1, 1, 512, 0.054
> 2, 0, 0, 512, 0.861
> 2, 2, 0, 512, 0.861
> 2, 0, 2, 512, 0.861
> 2, 2, 2, 512, 0.864
> 3, 0, 0, 512, 0.854
> 3, 3, 0, 512, 0.848
> 3, 0, 3, 512, 0.845
> 3, 3, 3, 512, 0.85
> 4, 0, 0, 512, 0.851
> 4, 4, 0, 512, 0.85
> 4, 0, 4, 512, 0.852
> 4, 4, 4, 512, 0.849
> 5, 0, 0, 512, 0.938
> 5, 5, 0, 512, 0.94
> 5, 0, 5, 512, 0.864
> 5, 5, 5, 512, 0.86
> 6, 0, 0, 512, 0.858
> 6, 6, 0, 512, 0.869
> 6, 0, 6, 512, 0.847
> 6, 6, 6, 512, 0.868
> 7, 0, 0, 512, 0.867
> 7, 7, 0, 512, 0.861
> 7, 0, 7, 512, 0.864
> 7, 7, 7, 512, 0.863
> 8, 0, 0, 512, 0.884
> 8, 0, 8, 512, 0.884
> 9, 0, 0, 512, 0.886
> 9, 1, 0, 512, 0.894
> 9, 0, 9, 512, 0.889
> 9, 1, 9, 512, 0.886
> 10, 0, 0, 512, 0.859
> 10, 2, 0, 512, 0.859
> 10, 0, 10, 512, 0.862
> 10, 2, 10, 512, 0.861
> 11, 0, 0, 512, 0.846
> 11, 3, 0, 512, 0.865
> 11, 0, 11, 512, 0.859
> 11, 3, 11, 512, 0.862
> 12, 0, 0, 512, 0.858
> 12, 4, 0, 512, 0.857
> 12, 0, 12, 512, 0.964
> 12, 4, 12, 512, 0.876
> 13, 0, 0, 512, 0.827
> 13, 5, 0, 512, 0.805
> 13, 0, 13, 512, 0.821
> 13, 5, 13, 512, 0.825
> 14, 0, 0, 512, 0.786
> 14, 6, 0, 512, 0.786
> 14, 0, 14, 512, 0.803
> 14, 6, 14, 512, 0.783
> 15, 0, 0, 512, 0.778
> 15, 7, 0, 512, 0.792
> 15, 0, 15, 512, 0.796
> 15, 7, 15, 512, 0.799
> 16, 0, 0, 512, 0.803
> 16, 0, 16, 512, 0.815
> 17, 0, 0, 512, 0.812
> 17, 1, 0, 512, 0.826
> 17, 0, 17, 512, 0.803
> 17, 1, 17, 512, 0.856
> 18, 0, 0, 512, 0.801
> 18, 2, 0, 512, 0.886
> 18, 0, 18, 512, 0.805
> 18, 2, 18, 512, 0.807
> 19, 0, 0, 512, 0.814
> 19, 3, 0, 512, 0.804
> 19, 0, 19, 512, 0.813
> 19, 3, 19, 512, 0.814
> 20, 0, 0, 512, 0.885
> 20, 4, 0, 512, 0.799
> 20, 0, 20, 512, 0.826
> 20, 4, 20, 512, 0.808
> 21, 0, 0, 512, 0.816
> 21, 5, 0, 512, 0.824
> 21, 0, 21, 512, 0.819
> 21, 5, 21, 512, 0.826
> 22, 0, 0, 512, 0.814
> 22, 6, 0, 512, 0.824
> 22, 0, 22, 512, 0.81
> 22, 6, 22, 512, 0.806
> 23, 0, 0, 512, 0.825
> 23, 7, 0, 512, 0.829
> 23, 0, 23, 512, 0.809
> 23, 7, 23, 512, 0.823
> 24, 0, 0, 512, 0.829
> 24, 0, 24, 512, 0.823
> 25, 0, 0, 512, 0.864
> 25, 1, 0, 512, 0.895
> 25, 0, 25, 512, 0.88
> 25, 1, 25, 512, 0.848
> 26, 0, 0, 512, 0.903
> 26, 2, 0, 512, 0.888
> 26, 0, 26, 512, 0.894
> 26, 2, 26, 512, 0.89
> 27, 0, 0, 512, 0.914
> 27, 3, 0, 512, 0.917
> 27, 0, 27, 512, 0.902
> 27, 3, 27, 512, 0.887
> 28, 0, 0, 512, 0.887
> 28, 4, 0, 512, 0.877
> 28, 0, 28, 512, 0.893
> 28, 4, 28, 512, 0.866
> 29, 0, 0, 512, 0.885
> 29, 5, 0, 512, 0.907
> 29, 0, 29, 512, 0.894
> 29, 5, 29, 512, 0.906
> 30, 0, 0, 512, 0.88
> 30, 6, 0, 512, 0.898
> 30, 0, 30, 512, 0.9
> 30, 6, 30, 512, 0.895
> 31, 0, 0, 512, 0.893
> 31, 7, 0, 512, 0.874
> 31, 0, 31, 512, 0.894
> 31, 7, 31, 512, 0.899
> 4, 0, 0, 32, 0.618
> 4, 1, 0, 32, 0.627
> 4, 0, 1, 32, 0.625
> 4, 1, 1, 32, 0.613
> 4, 0, 0, 64, 0.913
> 4, 2, 0, 64, 0.801
> 4, 0, 2, 64, 0.759
> 4, 2, 2, 64, 0.761
> 4, 0, 0, 128, 0.822
> 4, 3, 0, 128, 0.863
> 4, 0, 3, 128, 0.867
> 4, 3, 3, 128, 0.917
> 4, 0, 0, 256, 0.816
> 4, 4, 0, 256, 0.812
> 4, 0, 4, 256, 0.803
> 4, 4, 4, 256, 0.811
> 4, 5, 0, 512, 0.848
> 4, 0, 5, 512, 0.843
> 4, 5, 5, 512, 0.857
> 4, 0, 0, 1024, 0.886
> 4, 6, 0, 1024, 0.887
> 4, 0, 6, 1024, 0.881
> 4, 6, 6, 1024, 0.873
> 4, 0, 0, 2048, 0.892
> 4, 7, 0, 2048, 0.894
> 4, 0, 7, 2048, 0.89
> 4, 7, 7, 2048, 0.874
> 10, 1, 0, 64, 0.946
> 10, 1, 1, 64, 0.81
> 10, 2, 0, 64, 0.804
> 10, 2, 2, 64, 0.82
> 10, 3, 0, 64, 0.772
> 10, 3, 3, 64, 0.772
> 10, 4, 0, 64, 0.748
> 10, 4, 4, 64, 0.751
> 10, 5, 0, 64, 0.76
> 10, 5, 5, 64, 0.76
> 10, 6, 0, 64, 0.726
> 10, 6, 6, 64, 0.718
> 10, 7, 0, 64, 0.724
> 10, 7, 7, 64, 0.72
> 6, 0, 0, 0, 0.415
> 6, 0, 0, 1, 0.423
> 6, 0, 1, 1, 0.412
> 6, 0, 0, 2, 0.433
> 6, 0, 2, 2, 0.434
> 6, 0, 0, 3, 0.427
> 6, 0, 3, 3, 0.428
> 6, 0, 0, 4, 0.465
> 6, 0, 4, 4, 0.466
> 6, 0, 0, 5, 0.463
> 6, 0, 5, 5, 0.468
> 6, 0, 0, 6, 0.435
> 6, 0, 6, 6, 0.444
> 6, 0, 0, 7, 0.41
> 6, 0, 7, 7, 0.42
> 6, 0, 0, 8, 0.474
> 6, 0, 8, 8, 0.501
> 6, 0, 0, 9, 0.471
> 6, 0, 9, 9, 0.489
> 6, 0, 0, 10, 0.462
> 6, 0, 10, 10, 0.46
> 6, 0, 0, 11, 0.459
> 6, 0, 11, 11, 0.458
> 6, 0, 0, 12, 0.516
> 6, 0, 12, 12, 0.51
> 6, 0, 0, 13, 0.494
> 6, 0, 13, 13, 0.524
> 6, 0, 0, 14, 0.486
> 6, 0, 14, 14, 0.5
> 6, 0, 0, 15, 0.48
> 6, 0, 15, 15, 0.501
> 6, 0, 0, 16, 0.54
> 6, 0, 16, 16, 0.538
> 6, 0, 0, 17, 0.503
> 6, 0, 17, 17, 0.541
> 6, 0, 0, 18, 0.537
> 6, 0, 18, 18, 0.549
> 6, 0, 0, 19, 0.527
> 6, 0, 19, 19, 0.537
> 6, 0, 0, 20, 0.539
> 6, 0, 20, 20, 0.554
> 6, 0, 0, 21, 0.558
> 6, 0, 21, 21, 0.541
> 6, 0, 0, 22, 0.546
> 6, 0, 22, 22, 0.561
> 6, 0, 0, 23, 0.54
> 6, 0, 23, 23, 0.536
> 6, 0, 0, 24, 0.565
> 6, 0, 24, 24, 0.584
> 6, 0, 0, 25, 0.563
> 6, 0, 25, 25, 0.58
> 6, 0, 0, 26, 0.555
> 6, 0, 26, 26, 0.584
> 6, 0, 0, 27, 0.569
> 6, 0, 27, 27, 0.587
> 6, 0, 0, 28, 0.612
> 6, 0, 28, 28, 0.623
> 6, 0, 0, 29, 0.604
> 6, 0, 29, 29, 0.621
> 6, 0, 0, 30, 0.59
> 6, 0, 30, 30, 0.609
> 6, 0, 0, 31, 0.577
> 6, 0, 31, 31, 0.588
> 6, 0, 0, 32, 0.621
> 6, 0, 32, 32, 0.608
> 6, 0, 0, 33, 0.601
> 6, 0, 33, 33, 0.623
> 6, 0, 0, 34, 0.614
> 6, 0, 34, 34, 0.615
> 6, 0, 0, 35, 0.598
> 6, 0, 35, 35, 0.608
> 6, 0, 0, 36, 0.626
> 6, 0, 36, 36, 0.634
> 6, 0, 0, 37, 0.62
> 6, 0, 37, 37, 0.634
> 6, 0, 0, 38, 0.612
> 6, 0, 38, 38, 0.637
> 6, 0, 0, 39, 0.627
> 6, 0, 39, 39, 0.612
> 6, 0, 0, 40, 0.661
> 6, 0, 40, 40, 0.674
> 6, 0, 0, 41, 0.633
> 6, 0, 41, 41, 0.643
> 6, 0, 0, 42, 0.634
> 6, 0, 42, 42, 0.636
> 6, 0, 0, 43, 0.619
> 6, 0, 43, 43, 0.625
> 6, 0, 0, 44, 0.654
> 6, 0, 44, 44, 0.654
> 6, 0, 0, 45, 0.647
> 6, 0, 45, 45, 0.649
> 6, 0, 0, 46, 0.651
> 6, 0, 46, 46, 0.651
> 6, 0, 0, 47, 0.646
> 6, 0, 47, 47, 0.648
> 6, 0, 0, 48, 0.662
> 6, 0, 48, 48, 0.664
> 6, 0, 0, 49, 0.68
> 6, 0, 49, 49, 0.667
> 6, 0, 0, 50, 0.654
> 6, 0, 50, 50, 0.659
> 6, 0, 0, 51, 0.638
> 6, 0, 51, 51, 0.639
> 6, 0, 0, 52, 0.665
> 6, 0, 52, 52, 0.669
> 6, 0, 0, 53, 0.658
> 6, 0, 53, 53, 0.656
> 6, 0, 0, 54, 0.669
> 6, 0, 54, 54, 0.67
> 6, 0, 0, 55, 0.668
> 6, 0, 55, 55, 0.664
> 6, 0, 0, 56, 0.701
> 6, 0, 56, 56, 0.695
> 6, 0, 0, 57, 0.687
> 6, 0, 57, 57, 0.696
> 6, 0, 0, 58, 0.693
> 6, 0, 58, 58, 0.704
> 6, 0, 0, 59, 0.695
> 6, 0, 59, 59, 0.708
> 6, 0, 0, 60, 0.708
> 6, 0, 60, 60, 0.728
> 6, 0, 0, 61, 0.708
> 6, 0, 61, 61, 0.71
> 6, 0, 0, 62, 0.715
> 6, 0, 62, 62, 0.705
> 6, 0, 0, 63, 0.677
> 6, 0, 63, 63, 0.702
>
> .../{strcspn-sse2.S => strcspn-sse2.c} | 8 +-
> sysdeps/x86_64/strcspn.S | 119 ------------------
> 2 files changed, 4 insertions(+), 123 deletions(-)
> rename sysdeps/x86_64/multiarch/{strcspn-sse2.S => strcspn-sse2.c} (85%)
> delete mode 100644 sysdeps/x86_64/strcspn.S
>
> diff --git a/sysdeps/x86_64/multiarch/strcspn-sse2.S b/sysdeps/x86_64/multiarch/strcspn-sse2.c
> similarity index 85%
> rename from sysdeps/x86_64/multiarch/strcspn-sse2.S
> rename to sysdeps/x86_64/multiarch/strcspn-sse2.c
> index f97e856e1f..3a04bb39fc 100644
> --- a/sysdeps/x86_64/multiarch/strcspn-sse2.S
> +++ b/sysdeps/x86_64/multiarch/strcspn-sse2.c
> @@ -1,4 +1,4 @@
> -/* strcspn optimized with SSE2.
> +/* strcspn.
> Copyright (C) 2017-2022 Free Software Foundation, Inc.
> This file is part of the GNU C Library.
>
> @@ -19,10 +19,10 @@
> #if IS_IN (libc)
>
> # include <sysdep.h>
> -# define strcspn __strcspn_sse2
> +# define STRCSPN __strcspn_sse2
>
> # undef libc_hidden_builtin_def
> -# define libc_hidden_builtin_def(strcspn)
> +# define libc_hidden_builtin_def(STRCSPN)
> #endif
>
> -#include <sysdeps/x86_64/strcspn.S>
> +#include <string/strcspn.c>
> diff --git a/sysdeps/x86_64/strcspn.S b/sysdeps/x86_64/strcspn.S
> deleted file mode 100644
> index f3cd86c606..0000000000
> --- a/sysdeps/x86_64/strcspn.S
> +++ /dev/null
> @@ -1,119 +0,0 @@
> -/* strcspn (str, ss) -- Return the length of the initial segment of STR
> - which contains no characters from SS.
> - For AMD x86-64.
> - Copyright (C) 1994-2022 Free Software Foundation, Inc.
> - This file is part of the GNU C Library.
> -
> - The GNU C Library is free software; you can redistribute it and/or
> - modify it under the terms of the GNU Lesser General Public
> - License as published by the Free Software Foundation; either
> - version 2.1 of the License, or (at your option) any later version.
> -
> - The GNU C Library is distributed in the hope that it will be useful,
> - but WITHOUT ANY WARRANTY; without even the implied warranty of
> - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> - Lesser General Public License for more details.
> -
> - You should have received a copy of the GNU Lesser General Public
> - License along with the GNU C Library; if not, see
> - <https://www.gnu.org/licenses/>. */
> -
> -#include <sysdep.h>
> -#include "asm-syntax.h"
> -
> - .text
> -ENTRY (strcspn)
> -
> - movq %rdi, %rdx /* Save SRC. */
> -
> - /* First we create a table with flags for all possible characters.
> - For the ASCII (7bit/8bit) or ISO-8859-X character sets which are
> - supported by the C string functions we have 256 characters.
> - Before inserting marks for the stop characters we clear the whole
> - table. */
> - movq %rdi, %r8 /* Save value. */
> - subq $256, %rsp /* Make space for 256 bytes. */
> - cfi_adjust_cfa_offset(256)
> - movl $32, %ecx /* 32*8 bytes = 256 bytes. */
> - movq %rsp, %rdi
> - xorl %eax, %eax /* We store 0s. */
> - cld
> - rep
> - stosq
> -
> - movq %rsi, %rax /* Setup skipset. */
> -
> -/* For understanding the following code remember that %rcx == 0 now.
> - Although all the following instruction only modify %cl we always
> - have a correct zero-extended 64-bit value in %rcx. */
> -
> - .p2align 4
> -L(2): movb (%rax), %cl /* get byte from skipset */
> - testb %cl, %cl /* is NUL char? */
> - jz L(1) /* yes => start compare loop */
> - movb %cl, (%rsp,%rcx) /* set corresponding byte in skipset table */
> -
> - movb 1(%rax), %cl /* get byte from skipset */
> - testb $0xff, %cl /* is NUL char? */
> - jz L(1) /* yes => start compare loop */
> - movb %cl, (%rsp,%rcx) /* set corresponding byte in skipset table */
> -
> - movb 2(%rax), %cl /* get byte from skipset */
> - testb $0xff, %cl /* is NUL char? */
> - jz L(1) /* yes => start compare loop */
> - movb %cl, (%rsp,%rcx) /* set corresponding byte in skipset table */
> -
> - movb 3(%rax), %cl /* get byte from skipset */
> - addq $4, %rax /* increment skipset pointer */
> - movb %cl, (%rsp,%rcx) /* set corresponding byte in skipset table */
> - testb $0xff, %cl /* is NUL char? */
> - jnz L(2) /* no => process next dword from skipset */
> -
> -L(1): leaq -4(%rdx), %rax /* prepare loop */
> -
> - /* We use a neat trick for the following loop. Normally we would
> - have to test for two termination conditions
> - 1. a character in the skipset was found
> - and
> - 2. the end of the string was found
> - But as a sign that the character is in the skipset we store its
> - value in the table. But the value of NUL is NUL so the loop
> - terminates for NUL in every case. */
> -
> - .p2align 4
> -L(3): addq $4, %rax /* adjust pointer for full loop round */
> -
> - movb (%rax), %cl /* get byte from string */
> - cmpb %cl, (%rsp,%rcx) /* is it contained in skipset? */
> - je L(4) /* yes => return */
> -
> - movb 1(%rax), %cl /* get byte from string */
> - cmpb %cl, (%rsp,%rcx) /* is it contained in skipset? */
> - je L(5) /* yes => return */
> -
> - movb 2(%rax), %cl /* get byte from string */
> - cmpb %cl, (%rsp,%rcx) /* is it contained in skipset? */
> - jz L(6) /* yes => return */
> -
> - movb 3(%rax), %cl /* get byte from string */
> - cmpb %cl, (%rsp,%rcx) /* is it contained in skipset? */
> - jne L(3) /* no => start loop again */
> -
> - incq %rax /* adjust pointer */
> -L(6): incq %rax
> -L(5): incq %rax
> -
> -L(4): addq $256, %rsp /* remove skipset */
> - cfi_adjust_cfa_offset(-256)
> -#ifdef USE_AS_STRPBRK
> - xorl %edx,%edx
> - orb %cl, %cl /* was last character NUL? */
> - cmovzq %rdx, %rax /* Yes: return NULL */
> -#else
> - subq %rdx, %rax /* we have to return the number of valid
> - characters, so compute distance to first
> - non-valid character */
> -#endif
> - ret
> -END (strcspn)
> -libc_hidden_builtin_def (strcspn)
> --
> 2.25.1
>
LGTM.
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
Thanks.
--
H.J.
More information about the Libc-alpha
mailing list