This is the mail archive of the libc-alpha@sourceware.org mailing list for the glibc project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

[PATCH] Add ifunc memcpy and memmove for aarch64


This patch adds ifunc versions of memcpy and memmove for aarch64.  I
know this isn't appropriate for 2.25 but I wanted to submit it and get
it reviewed for 2.26.  The basic change is to include software
prefetching for large memcpy's on thunderx which can speed up those
routines by around 2X.  For memcpy's under 32K bytes I found that the
software prefetching did not help (and sometimes hurt).  I wasn't
really interested in speeding up memmove but since memcpy and memmove
are implemented in one file it seemed easier to make memmove an ifunc
along with memcpy rather than try and split them up.  memmove does get
a speedup when it uses the memcpy code.

The ifunc code depends on the mrs instruction which is a privileged
instruction but the 4.11 version of the linux kernel will have
emulation for it (https://lkml.org/lkml/2017/1/10/816).  Since it is
emulated I added code to save it's value rather than read it everytime
we want to execute an ifunc selection function.  I also saved a flag to
specify if the platform was thunderx or not so that glibc did not have
to do multiple logical operations on the mrs value in each ifunc
selection function to determine if it was on a thunderx platform or
not.

I have attached the bench-memcpy.out, bench-memcpy-large.out, bench-
memmove.out and bench-memmove-large.out files to show the performance
difference, most of the difference is seen in the large versions as the
smaller ones only use prefetching on a couple of inputs.

Steve Ellcey
sellcey@caviumnetworks.com


2017-01-19  Steve Ellcey  <sellcey@caviumnetworks.com>

	* sysdeps/aarch64/memcpy.S (MEMMOVE, MEMCPY): New macros.
	(memmove): Use MEMMOVE for name.
	(memcpy): Use MEMCPY for name.  Add loop with prefetching
	under USE_THUNDERX macro.
	* sysdeps/aarch64/multiarch/Makefile: New file.
	* sysdeps/aarch64/multiarch/ifunc-impl-list.c: Ditto.
	* sysdeps/aarch64/multiarch/init-arch.h: Ditto.
	* sysdeps/aarch64/multiarch/memcpy.c: Ditto.
	* sysdeps/aarch64/multiarch/memcpy_generic.S: Ditto.
	* sysdeps/aarch64/multiarch/memcpy_thunderx.S: Ditto.
	* sysdeps/unix/sysv/linux/aarch64/configure.ac (arch_minimum_kernel):
	Set to 4.11.0 if building with multi_arch.
	* sysdeps/unix/sysv/linux/aarch64/configure: Regenerate.
diff --git a/sysdeps/aarch64/memcpy.S b/sysdeps/aarch64/memcpy.S
index 29af8b1..74444b4 100644
--- a/sysdeps/aarch64/memcpy.S
+++ b/sysdeps/aarch64/memcpy.S
@@ -59,7 +59,14 @@
    Overlapping large forward memmoves use a loop that copies backwards.
 */
 
-ENTRY_ALIGN (memmove, 6)
+#ifndef MEMMOVE
+#  define MEMMOVE memmove
+#endif
+#ifndef MEMCPY
+#  define MEMCPY memcpy
+#endif
+
+ENTRY_ALIGN (MEMMOVE, 6)
 
 	DELOUSE (0)
 	DELOUSE (1)
@@ -71,9 +78,9 @@ ENTRY_ALIGN (memmove, 6)
 	b.lo	L(move_long)
 
 	/* Common case falls through into memcpy.  */
-END (memmove)
-libc_hidden_builtin_def (memmove)
-ENTRY (memcpy)
+END (MEMMOVE)
+libc_hidden_builtin_def (MEMMOVE)
+ENTRY (MEMCPY)
 
 	DELOUSE (0)
 	DELOUSE (1)
@@ -158,10 +165,22 @@ L(copy96):
 
 	.p2align 4
 L(copy_long):
+
+#ifdef USE_THUNDERX
+
+	/* On thunderx, large memcpy's are helped by software prefetching.
+	   This loop is identical to the one below it but with prefetching
+	   instructions included.  For loops that are less than 32768 bytes,
+	   the prefetching does not help and slow the code down so we only
+	   use the prefetching loop for the largest memcpys.  */
+
+	cmp	count, #32768
+	b.lo	L(copy_long_without_prefetch)
 	and	tmp1, dstin, 15
 	bic	dst, dstin, 15
 	ldp	D_l, D_h, [src]
 	sub	src, src, tmp1
+	prfm	pldl1strm, [src, 384]
 	add	count, count, tmp1	/* Count is now 16 too large.  */
 	ldp	A_l, A_h, [src, 16]
 	stp	D_l, D_h, [dstin]
@@ -169,7 +188,10 @@ L(copy_long):
 	ldp	C_l, C_h, [src, 48]
 	ldp	D_l, D_h, [src, 64]!
 	subs	count, count, 128 + 16	/* Test and readjust count.  */
-	b.ls	2f
+
+L(prefetch_loop64):
+	tbz	src, #6, 1f
+	prfm	pldl1strm, [src, 512]
 1:
 	stp	A_l, A_h, [dst, 16]
 	ldp	A_l, A_h, [src, 16]
@@ -180,12 +202,40 @@ L(copy_long):
 	stp	D_l, D_h, [dst, 64]!
 	ldp	D_l, D_h, [src, 64]!
 	subs	count, count, 64
-	b.hi	1b
+	b.hi	L(prefetch_loop64)
+	b	L(last64)
+
+L(copy_long_without_prefetch):
+#endif
+
+	and	tmp1, dstin, 15
+	bic	dst, dstin, 15
+	ldp	D_l, D_h, [src]
+	sub	src, src, tmp1
+	add	count, count, tmp1	/* Count is now 16 too large.  */
+	ldp	A_l, A_h, [src, 16]
+	stp	D_l, D_h, [dstin]
+	ldp	B_l, B_h, [src, 32]
+	ldp	C_l, C_h, [src, 48]
+	ldp	D_l, D_h, [src, 64]!
+	subs	count, count, 128 + 16	/* Test and readjust count.  */
+	b.ls	L(last64)
+L(loop64):
+	stp	A_l, A_h, [dst, 16]
+	ldp	A_l, A_h, [src, 16]
+	stp	B_l, B_h, [dst, 32]
+	ldp	B_l, B_h, [src, 32]
+	stp	C_l, C_h, [dst, 48]
+	ldp	C_l, C_h, [src, 48]
+	stp	D_l, D_h, [dst, 64]!
+	ldp	D_l, D_h, [src, 64]!
+	subs	count, count, 64
+	b.hi	L(loop64)
 
 	/* Write the last full set of 64 bytes.  The remainder is at most 64
 	   bytes, so it is safe to always copy 64 bytes from the end even if
 	   there is just 1 byte left.  */
-2:
+L(last64):
 	ldp	E_l, E_h, [srcend, -64]
 	stp	A_l, A_h, [dst, 16]
 	ldp	A_l, A_h, [srcend, -48]
@@ -256,5 +306,5 @@ L(move_long):
 	stp	C_l, C_h, [dstin]
 3:	ret
 
-END (memcpy)
-libc_hidden_builtin_def (memcpy)
+END (MEMCPY)
+libc_hidden_builtin_def (MEMCPY)
diff --git a/sysdeps/aarch64/multiarch/Makefile b/sysdeps/aarch64/multiarch/Makefile
index e69de29..78d52c7 100644
--- a/sysdeps/aarch64/multiarch/Makefile
+++ b/sysdeps/aarch64/multiarch/Makefile
@@ -0,0 +1,3 @@
+ifeq ($(subdir),string)
+sysdep_routines += memcpy_generic memcpy_thunderx
+endif
diff --git a/sysdeps/aarch64/multiarch/ifunc-impl-list.c b/sysdeps/aarch64/multiarch/ifunc-impl-list.c
index e69de29..c6d63f6 100644
--- a/sysdeps/aarch64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/aarch64/multiarch/ifunc-impl-list.c
@@ -0,0 +1,61 @@
+/* Enumerate available IFUNC implementations of a function.  AARCH64 version.
+   Copyright (C) 2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <assert.h>
+#include <string.h>
+#include <wchar.h>
+#include <ldsodefs.h>
+#include <ifunc-impl-list.h>
+#include <init-arch.h>
+#include <stdio.h>
+
+/* Access to the midr_el1 register is emulated by the linux kernel and
+   is slow so we save it in __midr after it is read once.  We also save
+   the value of IS_THUNDERX in __is_thunderx so it does not need to be
+   recomputed by checking multiple bits from __midr.  */
+
+uint64_t __midr attribute_hidden = 0;
+bool __is_thunderx attribute_hidden;
+
+/* Maximum number of IFUNC implementations.  */
+#define MAX_IFUNC	2
+
+size_t
+__libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+			size_t max)
+{
+  assert (max >= MAX_IFUNC);
+
+  size_t i = 0;
+
+  INIT_ARCH ();
+
+#ifdef SHARED
+  /* Support sysdeps/aarch64/multiarch/memcpy.c.  */
+  IFUNC_IMPL (i, name, memcpy,
+	      IFUNC_IMPL_ADD (array, i, memcpy, __is_thunderx,
+			      __memcpy_thunderx)
+	      IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_generic))
+  IFUNC_IMPL (i, name, memmove,
+	      IFUNC_IMPL_ADD (array, i, memmove, __is_thunderx,
+			      __memmove_thunderx)
+	      IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_generic))
+#endif
+
+  return i;
+}
diff --git a/sysdeps/aarch64/multiarch/init-arch.h b/sysdeps/aarch64/multiarch/init-arch.h
index e69de29..e12ba61 100644
--- a/sysdeps/aarch64/multiarch/init-arch.h
+++ b/sysdeps/aarch64/multiarch/init-arch.h
@@ -0,0 +1,55 @@
+/* This file is part of the GNU C Library.
+   Copyright (C) 2017 Free Software Foundation, Inc.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <elf.h>
+#include <stdint.h>
+#include <stdbool.h>
+
+#define MIDR_REVISION_MASK	0xf
+#define MIDR_REVISION(__midr)	((__midr) & MIDR_REVISION_MASK)
+#define MIDR_PARTNUM_SHIFT	4
+#define MIDR_PARTNUM_MASK	(0xfff << MIDR_PARTNUM_SHIFT)
+#define MIDR_PARTNUM(__midr)	\
+	(((__midr) & MIDR_PARTNUM_MASK) >> MIDR_PARTNUM_SHIFT)
+#define MIDR_ARCHITECTURE_SHIFT	16
+#define MIDR_ARCHITECTURE_MASK	(0xf << MIDR_ARCHITECTURE_SHIFT)
+#define MIDR_ARCHITECTURE(__midr)	\
+	(((__midr) & MIDR_ARCHITECTURE_MASK) >> MIDR_ARCHITECTURE_SHIFT)
+#define MIDR_VARIANT_SHIFT	20
+#define MIDR_VARIANT_MASK	(0xf << MIDR_VARIANT_SHIFT)
+#define MIDR_VARIANT(__midr)	\
+	(((__midr) & MIDR_VARIANT_MASK) >> MIDR_VARIANT_SHIFT)
+#define MIDR_IMPLEMENTOR_SHIFT	24
+#define MIDR_IMPLEMENTOR_MASK	(0xff << MIDR_IMPLEMENTOR_SHIFT)
+#define MIDR_IMPLEMENTOR(__midr)	\
+	(((__midr) & MIDR_IMPLEMENTOR_MASK) >> MIDR_IMPLEMENTOR_SHIFT)
+
+#define IS_THUNDERX(__midr) (MIDR_IMPLEMENTOR(__midr) == 'C'	\
+			     && MIDR_PARTNUM(__midr) == 0x0a1)
+
+
+extern uint64_t __midr attribute_hidden;
+extern bool __is_thunderx attribute_hidden;
+
+#define INIT_ARCH()						\
+  {								\
+    if (__midr == 0)						\
+      {								\
+	asm volatile ("mrs %0, midr_el1" : "=r"(__midr));	\
+	__is_thunderx = IS_THUNDERX(__midr);			\
+      }								\
+  }
diff --git a/sysdeps/aarch64/multiarch/memcpy.c b/sysdeps/aarch64/multiarch/memcpy.c
index e69de29..b2b587b 100644
--- a/sysdeps/aarch64/multiarch/memcpy.c
+++ b/sysdeps/aarch64/multiarch/memcpy.c
@@ -0,0 +1,41 @@
+/* Multiple versions of memcpy. AARCH64 version.
+   Copyright (C) 2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* Define multiple versions only for the definition in lib and for
+   DSO.  In static binaries we need memcpy before the initialization
+   happened.  */
+
+#if defined SHARED && IS_IN (libc)
+/* Redefine memcpy so that the compiler won't complain about the type
+   mismatch with the IFUNC selector in strong_alias, below.  */
+# undef memcpy
+# define memcpy __redirect_memcpy
+# include <string.h>
+# include <init-arch.h>
+
+extern __typeof (__redirect_memcpy) __libc_memcpy;
+
+extern __typeof (__redirect_memcpy) __memcpy_generic attribute_hidden;
+extern __typeof (__redirect_memcpy) __memcpy_thunderx attribute_hidden;
+
+libc_ifunc (__libc_memcpy,
+            __is_thunderx ? __memcpy_thunderx : __memcpy_generic);
+
+#undef memcpy
+strong_alias (__libc_memcpy, memcpy);
+#endif
diff --git a/sysdeps/aarch64/multiarch/memcpy_generic.S b/sysdeps/aarch64/multiarch/memcpy_generic.S
index e69de29..c0e3462 100644
--- a/sysdeps/aarch64/multiarch/memcpy_generic.S
+++ b/sysdeps/aarch64/multiarch/memcpy_generic.S
@@ -0,0 +1,42 @@
+/* A Generic Optimized memcpy implementation for AARCH64.
+   Copyright (C) 2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* The actual memcpy and memmove code is in ../memcpy.S.  If we are
+   building a shared libc using IFUNC this file defines __memcpy_generic
+   and __memmove_generic.  Otherwise the include of ../memcpy.S will
+   define the normal __memcpy and__memmove entry points.  */
+
+#include <sysdep.h>
+
+#if defined SHARED && IS_IN (libc)
+
+#define MEMCPY __memcpy_generic
+#define MEMMOVE __memmove_generic
+
+/* Do not hide the generic versions of memcpy and memmove, we use them
+   internally.  */
+#undef libc_hidden_builtin_def
+#define libc_hidden_builtin_def(name)
+
+/* It doesn't make sense to send libc-internal memcpy calls through a PLT. */
+	.globl __GI_memcpy; __GI_memcpy = __memcpy_generic
+	.globl __GI_memmove; __GI_memmove = __memmove_generic
+
+#endif
+
+#include "../memcpy.S"
diff --git a/sysdeps/aarch64/multiarch/memcpy_thunderx.S b/sysdeps/aarch64/multiarch/memcpy_thunderx.S
index e69de29..df5e959 100644
--- a/sysdeps/aarch64/multiarch/memcpy_thunderx.S
+++ b/sysdeps/aarch64/multiarch/memcpy_thunderx.S
@@ -0,0 +1,33 @@
+/* A Thunderx Optimized memcpy implementation for AARCH64.
+   Copyright (C) 2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* The actual thunderx optimized code is in ../memcpy.S under the USE_THUNDERX
+   ifdef.  If we are not building a shared libc with IFUNC then we do not
+   build anything when compiling this file and __memcpy is defined by
+   memcpy_generic.S.  */
+
+#include <sysdep.h>
+
+#if defined SHARED && IS_IN (libc)
+
+#define MEMCPY __memcpy_thunderx
+#define MEMMOVE __memmove_thunderx
+#define USE_THUNDERX
+#include "../memcpy.S"
+
+#endif
diff --git a/sysdeps/aarch64/multiarch/memmove.c b/sysdeps/aarch64/multiarch/memmove.c
index e69de29..c08c763 100644
--- a/sysdeps/aarch64/multiarch/memmove.c
+++ b/sysdeps/aarch64/multiarch/memmove.c
@@ -0,0 +1,40 @@
+/* Multiple versions of memmove. AARCH64 version.
+   Copyright (C) 2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* Define multiple versions only for the definition in lib and for
+   DSO.  In static binaries we need memmove before the initialization
+   happened.  */
+#if defined SHARED && IS_IN (libc)
+/* Redefine memmove so that the compiler won't complain about the type
+   mismatch with the IFUNC selector in strong_alias, below.  */
+# undef memmove
+# define memmove __redirect_memmove
+# include <string.h>
+# include <init-arch.h>
+
+extern __typeof (__redirect_memmove) __libc_memmove;
+
+extern __typeof (__redirect_memmove) __memmove_generic attribute_hidden;
+extern __typeof (__redirect_memmove) __memmove_thunderx attribute_hidden;
+
+libc_ifunc (__libc_memmove,
+            __is_thunderx ? __memmove_thunderx : __memmove_generic);
+
+#undef memmove
+strong_alias (__libc_memmove, memmove);
+#endif
diff --git a/sysdeps/unix/sysv/linux/aarch64/configure.ac b/sysdeps/unix/sysv/linux/aarch64/configure.ac
index 211fa9c..684cb46 100644
--- a/sysdeps/unix/sysv/linux/aarch64/configure.ac
+++ b/sysdeps/unix/sysv/linux/aarch64/configure.ac
@@ -1,6 +1,11 @@
 GLIBC_PROVIDES dnl See aclocal.m4 in the top level source directory.
 # Local configure fragment for sysdeps/unix/sysv/linux/aarch64.
 
-arch_minimum_kernel=3.7.0
+# For multi-arch support we need a kernel that emulates the mrs instruction.
+if test x$multi_arch = xyes; then
+    arch_minimum_kernel=4.11.0
+else
+    arch_minimum_kernel=3.7.0
+fi
 
 LIBC_SLIBDIR_RTLDDIR([lib64], [lib])

Attachment: bench-memcpy.out
Description: Text document

Attachment: bench-memcpy-large.out
Description: Text document

Attachment: bench-memmove.out
Description: Text document

Attachment: bench-memmove-large.out
Description: Text document


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]