[PATCH] riscv: Fix alignment-ignorant memcpy implementation
Adhemerval Zanella Netto
adhemerval.zanella@linaro.org
Mon Mar 4 18:01:48 GMT 2024
On 04/03/24 14:59, Adhemerval Zanella wrote:
> The memcpy optimization (commit 587a1290a1af7bee6db) has a series
> of mistakes:
>
> - The implementation is wrong: the chunk size calculation is wrong
> leading to invalid memory access.
>
> - It adds ifunc supports as default, so --disable-multi-arch does
> not work as expected for riscv.
>
> - It mixes Linux files (memcpy ifunc selection which requires the
> vDSO/syscall mechanism) with generic support (the memcpy
> optimization itself).
>
> - There is no __libc_ifunc_impl_list, which makes testing only
> check the selected implementation instead of all supported
> by the system.
>
> Also, there is no reason why the implementation can't be coded in C,
> since it uses only normal registers and the compiler is able to
> generate code as good as the assembly implementation. I have not
> checked the performance, but the C implementation uses the same
> strategies, the generate code with gcc 13 seems straightforward, and
> the tail code also avoid byte-operations.
>
> This patch also simplifies the required bits to enable ifunc: there
> is no need to memcopy.h; nor to add Linux-specific files.
>
> Checked on riscv64 and riscv32 by explicitly enabling the function
> on __libc_ifunc_impl_list on qemu-system.
> ---
> sysdeps/riscv/memcpy_noalignment.S | 136 ------------------
> .../multiarch}/memcpy-generic.c | 8 +-
> sysdeps/riscv/multiarch/memcpy_noalignment.c | 100 +++++++++++++
> sysdeps/unix/sysv/linux/riscv/Makefile | 9 --
> sysdeps/unix/sysv/linux/riscv/hwprobe.c | 1 +
> .../sysv/linux/riscv/include/sys/hwprobe.h | 8 ++
> .../unix/sysv/linux/riscv/multiarch/Makefile | 7 +
> .../linux/riscv/multiarch/ifunc-impl-list.c} | 33 +++--
> .../sysv/linux/riscv/multiarch}/memcpy.c | 16 +--
> 9 files changed, 151 insertions(+), 167 deletions(-)
> delete mode 100644 sysdeps/riscv/memcpy_noalignment.S
> rename sysdeps/{unix/sysv/linux/riscv => riscv/multiarch}/memcpy-generic.c (87%)
> create mode 100644 sysdeps/riscv/multiarch/memcpy_noalignment.c
> create mode 100644 sysdeps/unix/sysv/linux/riscv/include/sys/hwprobe.h
> create mode 100644 sysdeps/unix/sysv/linux/riscv/multiarch/Makefile
> rename sysdeps/{riscv/memcopy.h => unix/sysv/linux/riscv/multiarch/ifunc-impl-list.c} (51%)
> rename sysdeps/{riscv => unix/sysv/linux/riscv/multiarch}/memcpy.c (90%)
>
> diff --git a/sysdeps/riscv/memcpy_noalignment.S b/sysdeps/riscv/memcpy_noalignment.S
> deleted file mode 100644
> index 621f8d028f..0000000000
> --- a/sysdeps/riscv/memcpy_noalignment.S
> +++ /dev/null
> @@ -1,136 +0,0 @@
> -/* memcpy for RISC-V, ignoring buffer alignment
> - Copyright (C) 2024 Free Software Foundation, Inc.
> - This file is part of the GNU C Library.
> -
> - The GNU C Library is free software; you can redistribute it and/or
> - modify it under the terms of the GNU Lesser General Public
> - License as published by the Free Software Foundation; either
> - version 2.1 of the License, or (at your option) any later version.
> -
> - The GNU C Library is distributed in the hope that it will be useful,
> - but WITHOUT ANY WARRANTY; without even the implied warranty of
> - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> - Lesser General Public License for more details.
> -
> - You should have received a copy of the GNU Lesser General Public
> - License along with the GNU C Library. If not, see
> - <https://www.gnu.org/licenses/>. */
> -
> -#include <sysdep.h>
> -#include <sys/asm.h>
> -
> -/* void *memcpy(void *, const void *, size_t) */
> -ENTRY (__memcpy_noalignment)
> - move t6, a0 /* Preserve return value */
> -
> - /* Bail if 0 */
> - beqz a2, 7f
> -
> - /* Jump to byte copy if size < SZREG */
> - li a4, SZREG
> - bltu a2, a4, 5f
> -
> - /* Round down to the nearest "page" size */
> - andi a4, a2, ~((16*SZREG)-1)
> - beqz a4, 2f
> - add a3, a1, a4
> -
> - /* Copy the first word to get dest word aligned */
> - andi a5, t6, SZREG-1
> - beqz a5, 1f
> - REG_L a6, (a1)
> - REG_S a6, (t6)
> -
> - /* Align dst up to a word, move src and size as well. */
> - addi t6, t6, SZREG-1
> - andi t6, t6, ~(SZREG-1)
> - sub a5, t6, a0
> - add a1, a1, a5
> - sub a2, a2, a5
> -
> - /* Recompute page count */
> - andi a4, a2, ~((16*SZREG)-1)
> - beqz a4, 2f
> -
> -1:
> - /* Copy "pages" (chunks of 16 registers) */
> - REG_L a4, 0(a1)
> - REG_L a5, SZREG(a1)
> - REG_L a6, 2*SZREG(a1)
> - REG_L a7, 3*SZREG(a1)
> - REG_L t0, 4*SZREG(a1)
> - REG_L t1, 5*SZREG(a1)
> - REG_L t2, 6*SZREG(a1)
> - REG_L t3, 7*SZREG(a1)
> - REG_L t4, 8*SZREG(a1)
> - REG_L t5, 9*SZREG(a1)
> - REG_S a4, 0(t6)
> - REG_S a5, SZREG(t6)
> - REG_S a6, 2*SZREG(t6)
> - REG_S a7, 3*SZREG(t6)
> - REG_S t0, 4*SZREG(t6)
> - REG_S t1, 5*SZREG(t6)
> - REG_S t2, 6*SZREG(t6)
> - REG_S t3, 7*SZREG(t6)
> - REG_S t4, 8*SZREG(t6)
> - REG_S t5, 9*SZREG(t6)
> - REG_L a4, 10*SZREG(a1)
> - REG_L a5, 11*SZREG(a1)
> - REG_L a6, 12*SZREG(a1)
> - REG_L a7, 13*SZREG(a1)
> - REG_L t0, 14*SZREG(a1)
> - REG_L t1, 15*SZREG(a1)
> - addi a1, a1, 16*SZREG
> - REG_S a4, 10*SZREG(t6)
> - REG_S a5, 11*SZREG(t6)
> - REG_S a6, 12*SZREG(t6)
> - REG_S a7, 13*SZREG(t6)
> - REG_S t0, 14*SZREG(t6)
> - REG_S t1, 15*SZREG(t6)
> - addi t6, t6, 16*SZREG
> - bltu a1, a3, 1b
> - andi a2, a2, (16*SZREG)-1 /* Update count */
> -
> -2:
> - /* Remainder is smaller than a page, compute native word count */
> - beqz a2, 7f
> - andi a5, a2, ~(SZREG-1)
> - andi a2, a2, (SZREG-1)
> - add a3, a1, a5
> - /* Jump directly to last word if no words. */
> - beqz a5, 4f
> -
> -3:
> - /* Use single native register copy */
> - REG_L a4, 0(a1)
> - addi a1, a1, SZREG
> - REG_S a4, 0(t6)
> - addi t6, t6, SZREG
> - bltu a1, a3, 3b
> -
> - /* Jump directly out if no more bytes */
> - beqz a2, 7f
> -
> -4:
> - /* Copy the last word unaligned */
> - add a3, a1, a2
> - add a4, t6, a2
> - REG_L a5, -SZREG(a3)
> - REG_S a5, -SZREG(a4)
> - ret
> -
> -5:
> - /* Copy bytes when the total copy is <SZREG */
> - add a3, a1, a2
> -
> -6:
> - lb a4, 0(a1)
> - addi a1, a1, 1
> - sb a4, 0(t6)
> - addi t6, t6, 1
> - bltu a1, a3, 6b
> -
> -7:
> - ret
> -
> -END (__memcpy_noalignment)
> diff --git a/sysdeps/unix/sysv/linux/riscv/memcpy-generic.c b/sysdeps/riscv/multiarch/memcpy-generic.c
> similarity index 87%
> rename from sysdeps/unix/sysv/linux/riscv/memcpy-generic.c
> rename to sysdeps/riscv/multiarch/memcpy-generic.c
> index f06f4bda15..4235d33859 100644
> --- a/sysdeps/unix/sysv/linux/riscv/memcpy-generic.c
> +++ b/sysdeps/riscv/multiarch/memcpy-generic.c
> @@ -18,7 +18,9 @@
>
> #include <string.h>
>
> -extern __typeof (memcpy) __memcpy_generic;
> -hidden_proto (__memcpy_generic)
> -
> +#if IS_IN(libc)
> +# define MEMCPY __memcpy_generic
> +# undef libc_hidden_builtin_def
> +# define libc_hidden_builtin_def(x)
> +#endif
> #include <string/memcpy.c>
> diff --git a/sysdeps/riscv/multiarch/memcpy_noalignment.c b/sysdeps/riscv/multiarch/memcpy_noalignment.c
> new file mode 100644
> index 0000000000..9552ae45fc
> --- /dev/null
> +++ b/sysdeps/riscv/multiarch/memcpy_noalignment.c
> @@ -0,0 +1,100 @@
> +/* Copy memory to memory until the specified number of bytes.
> + Copyright (C) 2024 Free Software Foundation, Inc.
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library; if not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +#include <string.h>
> +#include <stdint.h>
> +#include <string-optype.h>
> +
> +/* memcpy optimization for chips with fast unaligned support
> + (RISCV_HWPROBE_MISALIGNED_FAST). */
> +
> +#define OPSIZ (sizeof (op_t))
> +
> +typedef uint32_t __attribute__ ((__may_alias__)) op32_t;
> +typedef uint16_t __attribute__ ((__may_alias__)) op16_t;
> +
> +void *
> +__memcpy_noalignment (void *dest, const void *src, size_t n)
> +{
> + unsigned char *d = dest;
> + const unsigned char *s = src;
> +
> + if (__glibc_unlikely (n == 0))
> + return dest;
> +
> + if (n < OPSIZ)
> + goto tail;
> +
> +#define COPY_WORD(__d, __s, __i) \
> + *(op_t *)(__d + (__i * OPSIZ)) = *(op_t *)(__s + (__i * OPSIZ))
> +
> + /* Copy the first op_t to get dest word aligned. */
> + COPY_WORD (d, s, 0);
> + size_t off = OPSIZ - (uintptr_t)d % OPSIZ;
> + d += off;
> + s += off;
> + n -= off;
> +
> + /* Copy chunks of 16 registers. */
> + enum { block_size = 16 * OPSIZ };
> +
> + for (; n >= block_size; s += block_size, d += block_size, n -= block_size)
> + {
> +
> + COPY_WORD (d, s, 0);
> + COPY_WORD (d, s, 1);
> + COPY_WORD (d, s, 2);
> + COPY_WORD (d, s, 3);
> + COPY_WORD (d, s, 4);
> + COPY_WORD (d, s, 5);
> + COPY_WORD (d, s, 6);
> + COPY_WORD (d, s, 7);
> + COPY_WORD (d, s, 8);
> + COPY_WORD (d, s, 9);
> + COPY_WORD (d, s, 10);
> + COPY_WORD (d, s, 11);
> + COPY_WORD (d, s, 12);
> + COPY_WORD (d, s, 13);
> + COPY_WORD (d, s, 14);
> + COPY_WORD (d, s, 15);
> + }
> +
> + /* Remainder of chunks, issue a op_t copy until n < OPSIZ. */
> + for (; n >= OPSIZ; s += OPSIZ, d += OPSIZ, n -= OPSIZ)
> + COPY_WORD (d, s, 0);
> +
> +tail:
> + if (n & 4)
And I forgot to add this should be:
if (sizeof (op_t) == sizeof (uint64_t) && n & 4)
Since there is no need to enable this for riscv32.
> + {
> + *(op32_t *)(d) = *(op32_t *)s;
> + s += sizeof (op32_t);
> + d += sizeof (op32_t);
> + }
> +
> + if (n & 2)
> + {
> + *(op16_t *)(d) = *(op16_t *)s;
> + s += sizeof (op16_t);
> + d += sizeof (op16_t);
> + }
> +
> + if (n & 1)
> + *d = *s;
> +
> + return dest;
> +}
> diff --git a/sysdeps/unix/sysv/linux/riscv/Makefile b/sysdeps/unix/sysv/linux/riscv/Makefile
> index 398ff7418b..04abf226ad 100644
> --- a/sysdeps/unix/sysv/linux/riscv/Makefile
> +++ b/sysdeps/unix/sysv/linux/riscv/Makefile
> @@ -15,15 +15,6 @@ ifeq ($(subdir),stdlib)
> gen-as-const-headers += ucontext_i.sym
> endif
>
> -ifeq ($(subdir),string)
> -sysdep_routines += \
> - memcpy \
> - memcpy-generic \
> - memcpy_noalignment \
> - # sysdep_routines
> -
> -endif
> -
> abi-variants := ilp32 ilp32d lp64 lp64d
>
> ifeq (,$(filter $(default-abi),$(abi-variants)))
> diff --git a/sysdeps/unix/sysv/linux/riscv/hwprobe.c b/sysdeps/unix/sysv/linux/riscv/hwprobe.c
> index e64c159eb3..9159045478 100644
> --- a/sysdeps/unix/sysv/linux/riscv/hwprobe.c
> +++ b/sysdeps/unix/sysv/linux/riscv/hwprobe.c
> @@ -34,3 +34,4 @@ int __riscv_hwprobe (struct riscv_hwprobe *pairs, size_t pair_count,
> /* Negate negative errno values to match pthreads API. */
> return -r;
> }
> +libc_hidden_def (__riscv_hwprobe)
> diff --git a/sysdeps/unix/sysv/linux/riscv/include/sys/hwprobe.h b/sysdeps/unix/sysv/linux/riscv/include/sys/hwprobe.h
> new file mode 100644
> index 0000000000..cce91c1b53
> --- /dev/null
> +++ b/sysdeps/unix/sysv/linux/riscv/include/sys/hwprobe.h
> @@ -0,0 +1,8 @@
> +#ifndef _SYS_HWPROBE_H
> +# include_next <sys/hwprobe.h>
> +
> +#ifndef _ISOMAC
> +libc_hidden_proto (__riscv_hwprobe)
> +#endif
> +
> +#endif
> diff --git a/sysdeps/unix/sysv/linux/riscv/multiarch/Makefile b/sysdeps/unix/sysv/linux/riscv/multiarch/Makefile
> new file mode 100644
> index 0000000000..7e567cb95c
> --- /dev/null
> +++ b/sysdeps/unix/sysv/linux/riscv/multiarch/Makefile
> @@ -0,0 +1,7 @@
> +ifeq ($(subdir),string)
> +sysdep_routines += \
> + memcpy \
> + memcpy-generic \
> + memcpy_noalignment \
> + # sysdep_routines
> +endif
> diff --git a/sysdeps/riscv/memcopy.h b/sysdeps/unix/sysv/linux/riscv/multiarch/ifunc-impl-list.c
> similarity index 51%
> rename from sysdeps/riscv/memcopy.h
> rename to sysdeps/unix/sysv/linux/riscv/multiarch/ifunc-impl-list.c
> index 27675964b0..9f806d7a9e 100644
> --- a/sysdeps/riscv/memcopy.h
> +++ b/sysdeps/unix/sysv/linux/riscv/multiarch/ifunc-impl-list.c
> @@ -1,4 +1,4 @@
> -/* memcopy.h -- definitions for memory copy functions. RISC-V version.
> +/* Enumerate available IFUNC implementations of a function. RISCV version.
> Copyright (C) 2024 Free Software Foundation, Inc.
> This file is part of the GNU C Library.
>
> @@ -16,11 +16,28 @@
> License along with the GNU C Library; if not, see
> <https://www.gnu.org/licenses/>. */
>
> -#include <sysdeps/generic/memcopy.h>
> +#include <ifunc-impl-list.h>
> +#include <string.h>
> +#include <sys/hwprobe.h>
>
> -/* Redefine the generic memcpy implementation to __memcpy_generic, so
> - the memcpy ifunc can select between generic and special versions.
> - In rtld, don't bother with all the ifunciness. */
> -#if IS_IN (libc)
> -#define MEMCPY __memcpy_generic
> -#endif
> +size_t
> +__libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> + size_t max)
> +{
> + size_t i = max;
> +
> + bool fast_unaligned = false;
> +
> + struct riscv_hwprobe pair = { .key = RISCV_HWPROBE_KEY_CPUPERF_0 };
> + if (__riscv_hwprobe (&pair, 1, 0, NULL, 0) == 0
> + && (pair.value & RISCV_HWPROBE_MISALIGNED_MASK)
> + == RISCV_HWPROBE_MISALIGNED_FAST)
> + fast_unaligned = true;
> +
> + IFUNC_IMPL (i, name, memcpy,
> + IFUNC_IMPL_ADD (array, i, memcpy, fast_unaligned,
> + __memcpy_noalignment)
> + IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_generic))
> +
> + return 0;
> +}
> diff --git a/sysdeps/riscv/memcpy.c b/sysdeps/unix/sysv/linux/riscv/multiarch/memcpy.c
> similarity index 90%
> rename from sysdeps/riscv/memcpy.c
> rename to sysdeps/unix/sysv/linux/riscv/multiarch/memcpy.c
> index 20f9548c44..51d8ace858 100644
> --- a/sysdeps/riscv/memcpy.c
> +++ b/sysdeps/unix/sysv/linux/riscv/multiarch/memcpy.c
> @@ -28,8 +28,6 @@
> # include <riscv-ifunc.h>
> # include <sys/hwprobe.h>
>
> -# define INIT_ARCH()
> -
> extern __typeof (__redirect_memcpy) __libc_memcpy;
>
> extern __typeof (__redirect_memcpy) __memcpy_generic attribute_hidden;
> @@ -38,14 +36,9 @@ extern __typeof (__redirect_memcpy) __memcpy_noalignment attribute_hidden;
> static inline __typeof (__redirect_memcpy) *
> select_memcpy_ifunc (uint64_t dl_hwcap, __riscv_hwprobe_t hwprobe_func)
> {
> - unsigned long long int value;
> -
> - INIT_ARCH ();
> -
> - if (__riscv_hwprobe_one (hwprobe_func, RISCV_HWPROBE_KEY_CPUPERF_0, &value) != 0)
> - return __memcpy_generic;
> -
> - if ((value & RISCV_HWPROBE_MISALIGNED_MASK) == RISCV_HWPROBE_MISALIGNED_FAST)
> + unsigned long long int v;
> + if (__riscv_hwprobe_one (hwprobe_func, RISCV_HWPROBE_KEY_CPUPERF_0, &v) == 0
> + && (v & RISCV_HWPROBE_MISALIGNED_MASK) == RISCV_HWPROBE_MISALIGNED_FAST)
> return __memcpy_noalignment;
>
> return __memcpy_generic;
> @@ -59,5 +52,6 @@ strong_alias (__libc_memcpy, memcpy);
> __hidden_ver1 (memcpy, __GI_memcpy, __redirect_memcpy)
> __attribute__ ((visibility ("hidden"))) __attribute_copy__ (memcpy);
> # endif
> -
> +#else
> +# include <string/memcpy.c>
> #endif
More information about the Libc-alpha
mailing list