This is the mail archive of the libc-alpha@sourceware.org mailing list for the glibc project.



Re: [PATCH 2/3] aarch64: Optimized memset specific to AmpereComputing skylark


Szabolcs,

    We have run that performance test. For memset_skylark.S, the improvement is larger than 10% in some cases, and those cases are not a minor part. So we would like to add this x-regs version.
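
For anyone who wants to reproduce the comparison, a rough timing harness along
these lines can be used (illustrative only: memset_x_regs and memset_q_regs are
hypothetical names for locally built copies of the two variants, and glibc's
own benchtests remain the proper tool for this):

#include <stdio.h>
#include <stdlib.h>
#include <time.h>

/* Hypothetical entry points for the x-register and q-register variants,
   built locally outside of glibc's ifunc machinery.  */
extern void *memset_x_regs (void *, int, size_t);
extern void *memset_q_regs (void *, int, size_t);

static double
time_one (void *(*fn) (void *, int, size_t), void *buf, size_t len, int iters)
{
  struct timespec t0, t1;
  clock_gettime (CLOCK_MONOTONIC, &t0);
  for (int i = 0; i < iters; i++)
    fn (buf, 0, len);
  clock_gettime (CLOCK_MONOTONIC, &t1);
  return (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) * 1e-9;
}

int
main (void)
{
  size_t sizes[] = { 128, 256, 1024, 65536, 1 << 20 };
  void *buf = malloc (1 << 20);
  if (buf == NULL)
    return 1;
  for (size_t i = 0; i < sizeof sizes / sizeof sizes[0]; i++)
    {
      double tx = time_one (memset_x_regs, buf, sizes[i], 10000);
      double tq = time_one (memset_q_regs, buf, sizes[i], 10000);
      printf ("%8zu bytes: x-regs %.3fs  q-regs %.3fs\n", sizes[i], tx, tq);
    }
  free (buf);
  return 0;
}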

Feng
________________________________________
From: Szabolcs Nagy <szabolcs.nagy@arm.com>
Sent: Monday, October 15, 2018 19:53
To: Xue Feng; libc-alpha@sourceware.org
Cc: nd@arm.com; marcus.shawcroft@linaro.org; Feng Xue
Subject: Re: [PATCH 2/3] aarch64: Optimized memset specific to AmpereComputing skylark

On 12/10/18 12:41, Xue Feng wrote:
> This version uses general-register based memory stores instead of
> vector-register based ones, because the former are faster than the
> latter on skylark.
>

if that's the only difference compared to memset_falkor.S
then it would be nice to know how much it matters for
performance on falkor and skylark to use x vs q regs.

if it's < 10% then don't add another memset variant.
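
(To make the x vs q distinction concrete, here is a minimal illustration in
GNU C inline assembly, not taken from either implementation: a 32-byte store
done with general-purpose register pairs versus a SIMD register.)

/* Illustrative only: roughly the difference under discussion, with the
   skylark variant storing through x registers and the falkor variant
   through q registers.  */
static inline void
store32_xregs (void *p, unsigned long v)
{
  __asm__ volatile ("stp %1, %1, [%0]\n\t"
                    "stp %1, %1, [%0, 16]"
                    : : "r" (p), "r" (v) : "memory");
}

static inline void
store32_qreg (void *p, unsigned char c)
{
  __asm__ volatile ("dup v0.16b, %w1\n\t"
                    "stp q0, q0, [%0]"
                    : : "r" (p), "r" (c) : "v0", "memory");
}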

> The fact that the DC ZVA size on skylark is 64 bytes is used by IFUNC
> dispatch to select this memset, so that the cost of a runtime check on
> the DC ZVA size can be saved.
>
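
(Background on zva_size, for readers following along: the DC ZVA block size
is advertised by the DCZID_EL0 register, which glibc's AArch64 cpu-features
code reads once and caches; the ifunc resolver then only compares the cached
value.  A purely illustrative decoding sketch, not glibc's actual code:)

/* Illustrative: decode DCZID_EL0 into the DC ZVA block size in bytes,
   or 0 if DC ZVA is prohibited.  */
static inline unsigned long
dc_zva_block_size (void)
{
  unsigned long dczid;
  __asm__ ("mrs %0, dczid_el0" : "=r" (dczid));
  if (dczid & 0x10)              /* DZP bit: DC ZVA prohibited.  */
    return 0;
  return 4UL << (dczid & 0xf);   /* BS field: log2 of block size in words.  */
}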
>     * sysdeps/aarch64/multiarch/Makefile (sysdep_routines):
>     Add memset_skylark.
>     * sysdeps/aarch64/multiarch/ifunc-impl-list.c
>     (__libc_ifunc_impl_list): Add __memset_skylark to memset ifunc.
>     * sysdeps/aarch64/multiarch/memset.c (libc_ifunc):
>     Add IS_SKYLARK check for ifunc dispatch.
>     * sysdeps/aarch64/multiarch/memset_skylark.S: New file.
> ---
>  ChangeLog                                   |  10 ++
>  sysdeps/aarch64/multiarch/Makefile          |   3 +-
>  sysdeps/aarch64/multiarch/ifunc-impl-list.c |   1 +
>  sysdeps/aarch64/multiarch/memset.c          |   5 +-
>  sysdeps/aarch64/multiarch/memset_skylark.S  | 176 ++++++++++++++++++++++++++++
>  5 files changed, 193 insertions(+), 2 deletions(-)
>  create mode 100644 sysdeps/aarch64/multiarch/memset_skylark.S
>
> diff --git a/ChangeLog b/ChangeLog
> index 2533c9d..28370f9 100644
> --- a/ChangeLog
> +++ b/ChangeLog
> @@ -1,3 +1,13 @@
> +2018-10-12  Feng Xue  <feng.xue@amperecomputing.com>
> +
> +     * sysdeps/aarch64/multiarch/Makefile (sysdep_routines):
> +     Add memset_skylark.
> +     * sysdeps/aarch64/multiarch/ifunc-impl-list.c
> +     (__libc_ifunc_impl_list): Add __memset_skylark to memset ifunc.
> +     * sysdeps/aarch64/multiarch/memset.c (libc_ifunc):
> +     Add IS_SKYLARK check for ifunc dispatch.
> +     * sysdeps/aarch64/multiarch/memset_skylark.S: New file.
> +
>  2018-10-11  Feng Xue  <feng.xue@amperecomputing.com>
>
>       * manual/tunables.texi (Tunable glibc.cpu.name): Add skylark.
> diff --git a/sysdeps/aarch64/multiarch/Makefile b/sysdeps/aarch64/multiarch/Makefile
> index b1a5f59..828ce4f 100644
> --- a/sysdeps/aarch64/multiarch/Makefile
> +++ b/sysdeps/aarch64/multiarch/Makefile
> @@ -1,5 +1,6 @@
>  ifeq ($(subdir),string)
>  sysdep_routines += memcpy_generic memcpy_thunderx memcpy_thunderx2 \
> -                memcpy_falkor memmove_falkor memset_generic memset_falkor \
> +                memcpy_falkor memmove_falkor \
> +                memset_generic memset_falkor memset_skylark \
>                  strlen_generic strlen_asimd
>  endif
> diff --git a/sysdeps/aarch64/multiarch/ifunc-impl-list.c b/sysdeps/aarch64/multiarch/ifunc-impl-list.c
> index af44665..baf01a0 100644
> --- a/sysdeps/aarch64/multiarch/ifunc-impl-list.c
> +++ b/sysdeps/aarch64/multiarch/ifunc-impl-list.c
> @@ -51,6 +51,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>             /* Enable this on non-falkor processors too so that other cores
>                can do a comparative analysis with __memset_generic.  */
>             IFUNC_IMPL_ADD (array, i, memset, (zva_size == 64), __memset_falkor)
> +           IFUNC_IMPL_ADD (array, i, memset, (zva_size == 64), __memset_skylark)
>             IFUNC_IMPL_ADD (array, i, memset, 1, __memset_generic))
>
>    IFUNC_IMPL (i, name, strlen,
> diff --git a/sysdeps/aarch64/multiarch/memset.c b/sysdeps/aarch64/multiarch/memset.c
> index d74ed3a..b732a7e 100644
> --- a/sysdeps/aarch64/multiarch/memset.c
> +++ b/sysdeps/aarch64/multiarch/memset.c
> @@ -29,12 +29,15 @@
>  extern __typeof (__redirect_memset) __libc_memset;
>
>  extern __typeof (__redirect_memset) __memset_falkor attribute_hidden;
> +extern __typeof (__redirect_memset) __memset_skylark attribute_hidden;
>  extern __typeof (__redirect_memset) __memset_generic attribute_hidden;
>
>  libc_ifunc (__libc_memset,
>           ((IS_FALKOR (midr) || IS_PHECDA (midr)) && zva_size == 64
>            ? __memset_falkor
> -          : __memset_generic));
> +          : (IS_SKYLARK (midr) && zva_size == 64
> +            ? __memset_skylark
> +            : __memset_generic)));
>
>  # undef memset
>  strong_alias (__libc_memset, memset);
> diff --git a/sysdeps/aarch64/multiarch/memset_skylark.S b/sysdeps/aarch64/multiarch/memset_skylark.S
> new file mode 100644
> index 0000000..22bf576
> --- /dev/null
> +++ b/sysdeps/aarch64/multiarch/memset_skylark.S
> @@ -0,0 +1,176 @@
> +/* Optimized memset for AmpereComputing skylark processor.
> +   Copyright (C) 2018 Free Software Foundation, Inc.
> +
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library.  If not, see
> +   <http://www.gnu.org/licenses/>.  */
> +
> +#include <sysdep.h>
> +#include "memset-reg.h"
> +
> +#if IS_IN (libc)
> +# define MEMSET __memset_skylark
> +
> +/* Assumptions:
> + *
> + * ARMv8-a, AArch64, unaligned accesses
> + *
> + */
> +
> +ENTRY_ALIGN (MEMSET, 6)
> +
> +     DELOUSE (0)
> +     DELOUSE (2)
> +
> +     bfi     valw, valw, 8, 8
> +     bfi     valw, valw, 16, 16
> +     bfi     val, val, 32, 32
> +
> +     add     dstend, dstin, count
> +
> +     cmp     count, 96
> +     b.hi    L(set_long)
> +     cmp     count, 16
> +     b.hs    L(set_medium)
> +
> +     /* Set 0..15 bytes.  */
> +     tbz     count, 3, 1f
> +     str     val, [dstin]
> +     str     val, [dstend, -8]
> +     ret
> +
> +     .p2align 3
> +1:   tbz     count, 2, 2f
> +     str     valw, [dstin]
> +     str     valw, [dstend, -4]
> +     ret
> +2:   cbz     count, 3f
> +     strb    valw, [dstin]
> +     tbz     count, 1, 3f
> +     strh    valw, [dstend, -2]
> +3:   ret
> +
> +     .p2align 3
> +     /* Set 16..96 bytes.  */
> +L(set_medium):
> +     stp     val, val, [dstin]
> +     tbnz    count, 6, L(set96)
> +     stp     val, val, [dstend, -16]
> +     tbz     count, 5, 1f
> +     stp     val, val, [dstin, 16]
> +     stp     val, val, [dstend, -32]
> +1:   ret
> +
> +     .p2align 4
> +     /* Set 64..96 bytes.  Write 64 bytes from the start and
> +        32 bytes from the end.  */
> +L(set96):
> +     stp     val, val, [dstin, 16]
> +     stp     val, val, [dstin, 32]
> +     stp     val, val, [dstin, 48]
> +     stp     val, val, [dstend, -32]
> +     stp     val, val, [dstend, -16]
> +     ret
> +
> +     .p2align 4
> +L(set_long):
> +     stp     val, val, [dstin]
> +     cmp     count, 512
> +     ccmp    val, 0, 0, cs
> +     bic     dst, dstin, 15
> +     b.eq    L(zva_64)
> +
> +     /* Small-size or non-zero memset does not use DC ZVA. */
> +     sub     count, dstend, dst
> +
> +     /*
> +      * Adjust count and bias for the loop. By subtracting an extra 1 from
> +      * count, it is easy to use the tbz instruction to check whether the
> +      * trailing count is less than 33 bytes, so as to bypass 2 unnecessary stps.
> +      */
> +     sub     count, count, 64+16+1
> +     nop
> +
> +1:   stp     val, val, [dst, 16]
> +     stp     val, val, [dst, 32]
> +     stp     val, val, [dst, 48]
> +     stp     val, val, [dst, 64]!
> +     subs    count, count, 64
> +     b.hs    1b
> +
> +     tbz     count, 5, 1f    /* Remaining count is less than 33 bytes? */
> +     stp     val, val, [dst, 16]
> +     stp     val, val, [dst, 32]
> +1:   stp     val, val, [dstend, -32]
> +     stp     val, val, [dstend, -16]
> +     ret
> +
> +     .p2align 3
> +L(zva_64):
> +     stp     val, val, [dst, 16]
> +     stp     val, val, [dst, 32]
> +     stp     val, val, [dst, 48]
> +     bic     dst, dst, 63
> +
> +     /*
> +      * Previous memory writes might cross a cache line boundary and leave
> +      * a cache line partially dirty. Zeroing such a cache line with
> +      * DC ZVA would incur extra cost, since it requires loading the
> +      * untouched part of the line from memory before zeroing.
> +      *
> +      * So, write the first 64-byte aligned block using stp to force
> +      * a fully dirty cache line.
> +      */
> +     stp     val, val, [dst, 64]
> +     stp     val, val, [dst, 80]
> +     stp     val, val, [dst, 96]
> +     stp     val, val, [dst, 112]
> +
> +     sub     count, dstend, dst
> +     /*
> +      * Adjust count and bias for the loop. By subtracting an extra 1 from
> +      * count, it is easy to use the tbz instruction to check whether the
> +      * trailing count is less than 33 bytes, so as to bypass 2 unnecessary stps.
> +      */
> +     sub     count, count, 128+64+64+1
> +     add     dst, dst, 128
> +     nop
> +
> +     /* DC ZVA zeroes 64 bytes at a time. */
> +1:   dc      zva, dst
> +     add     dst, dst, 64
> +     subs    count, count, 64
> +     b.hs    1b
> +
> +     /*
> +      * Write the last 64-byte aligned block using stp to force a fully
> +      * dirty cache line.
> +      */
> +     stp     val, val, [dst, 0]
> +     stp     val, val, [dst, 16]
> +     stp     val, val, [dst, 32]
> +     stp     val, val, [dst, 48]
> +
> +     tbz     count, 5, 1f    /* Remaining count is less than 33 bytes? */
> +     stp     val, val, [dst, 64]
> +     stp     val, val, [dst, 80]
> +1:   stp     val, val, [dstend, -32]
> +     stp     val, val, [dstend, -16]
> +     ret
> +
> +END (MEMSET)
> +libc_hidden_builtin_def (MEMSET)
> +
> +#endif
>

