This is the mail archive of the libc-alpha@sourceware.org mailing list for the glibc project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

[Patch][Aarch64] memcpy IFUNC for Cavium ThunderX2


This patch adds a new memcpy ifunc for Cavium ThunderX2.  The difference
between this and the ThunderX version is in the prefetching: ThunderX2
has different cache characteristics and so uses a different prefetching
strategy.  Note that I prefetch past the end of the buffer being copied,
but my understanding is that this is legal and should never generate any
errors.  I tried adding code to avoid prefetching past the end of the
source, but those changes slowed down memcpy, so I did not include them.

Rather than copying memcpy_thunderx.S to memcpy_thunderx2.S, I made
memcpy_thunderx2.S set a few macros and then include memcpy_thunderx.S.
This avoids duplicating code.

I have attached the memcpy benchmark output files from a ThunderX2 run;
the main differences are in bench-memcpy-large.out.

Tested with no regressions.  OK to check in?

Steve Ellcey
sellcey@cavium.com


2018-02-14  Steve Ellcey  <sellcey@cavium.com>

	* sysdeps/aarch64/multiarch/Makefile (sysdep_routines):
	Add memcpy_thunderx2.
	* sysdeps/aarch64/multiarch/ifunc-impl-list.c (MAX_IFUNC):
	Increment to 4.
	(__libc_ifunc_impl_list): Add __memcpy_thunderx2.
	* sysdeps/aarch64/multiarch/memcpy.c (libc_ifunc): Add IS_THUNDERX2
	and IS_THUNDERX2PA checks.
	* sysdeps/aarch64/multiarch/memcpy_thunderx.S (USE_THUNDERX2):
	Use macro to set name appropriately.
	(memcpy): Use USE_THUNDERX2 macro to modify prefetches.
	* sysdeps/aarch64/multiarch/memcpy_thunderx2.S: New file.
	* sysdeps/unix/sysv/linux/aarch64/cpu-features.h (IS_THUNDERX2PA):
	New macro.
	(IS_THUNDERX2): New macro.
diff --git a/sysdeps/aarch64/multiarch/Makefile b/sysdeps/aarch64/multiarch/Makefile
index aa179c4..57ffdf7 100644
--- a/sysdeps/aarch64/multiarch/Makefile
+++ b/sysdeps/aarch64/multiarch/Makefile
@@ -1,4 +1,4 @@
 ifeq ($(subdir),string)
-sysdep_routines += memcpy_generic memcpy_thunderx memcpy_falkor \
-		   memmove_falkor memset_generic memset_falkor
+sysdep_routines += memcpy_generic memcpy_thunderx memcpy_thunderx2 \
+		   memcpy_falkor memmove_falkor memset_generic memset_falkor
 endif
diff --git a/sysdeps/aarch64/multiarch/ifunc-impl-list.c b/sysdeps/aarch64/multiarch/ifunc-impl-list.c
index f84956c..e55be80 100644
--- a/sysdeps/aarch64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/aarch64/multiarch/ifunc-impl-list.c
@@ -25,7 +25,7 @@
 #include <stdio.h>
 
 /* Maximum number of IFUNC implementations.  */
-#define MAX_IFUNC	3
+#define MAX_IFUNC	4
 
 size_t
 __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
@@ -40,6 +40,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
   /* Support sysdeps/aarch64/multiarch/memcpy.c and memmove.c.  */
   IFUNC_IMPL (i, name, memcpy,
 	      IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_thunderx)
+	      IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_thunderx2)
 	      IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_falkor)
 	      IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_generic))
   IFUNC_IMPL (i, name, memmove,
diff --git a/sysdeps/aarch64/multiarch/memcpy.c b/sysdeps/aarch64/multiarch/memcpy.c
index 3efea2c..b94c655 100644
--- a/sysdeps/aarch64/multiarch/memcpy.c
+++ b/sysdeps/aarch64/multiarch/memcpy.c
@@ -30,6 +30,7 @@ extern __typeof (__redirect_memcpy) __libc_memcpy;
 
 extern __typeof (__redirect_memcpy) __memcpy_generic attribute_hidden;
 extern __typeof (__redirect_memcpy) __memcpy_thunderx attribute_hidden;
+extern __typeof (__redirect_memcpy) __memcpy_thunderx2 attribute_hidden;
 extern __typeof (__redirect_memcpy) __memcpy_falkor attribute_hidden;
 
 libc_ifunc (__libc_memcpy,
@@ -37,7 +38,9 @@ libc_ifunc (__libc_memcpy,
 	     ? __memcpy_thunderx
 	     : (IS_FALKOR (midr)
 		? __memcpy_falkor
-		: __memcpy_generic)));
+		: (IS_THUNDERX2 (midr) || IS_THUNDERX2PA (midr)
+		  ? __memcpy_thunderx2
+		  : __memcpy_generic))));
 
 # undef memcpy
 strong_alias (__libc_memcpy, memcpy);
diff --git a/sysdeps/aarch64/multiarch/memcpy_thunderx.S b/sysdeps/aarch64/multiarch/memcpy_thunderx.S
index 4f6921d..de494d9 100644
--- a/sysdeps/aarch64/multiarch/memcpy_thunderx.S
+++ b/sysdeps/aarch64/multiarch/memcpy_thunderx.S
@@ -74,11 +74,13 @@
 
 #if IS_IN (libc)
 
-# undef MEMCPY
-# define MEMCPY __memcpy_thunderx
-# undef MEMMOVE
-# define MEMMOVE __memmove_thunderx
-# define USE_THUNDERX
+# ifndef USE_THUNDERX2
+#  undef MEMCPY
+#  define MEMCPY __memcpy_thunderx
+#  undef MEMMOVE
+#  define MEMMOVE __memmove_thunderx
+#  define USE_THUNDERX
+# endif
 
 ENTRY_ALIGN (MEMMOVE, 6)
 
@@ -180,7 +182,7 @@ L(copy96):
 	.p2align 4
 L(copy_long):
 
-# ifdef USE_THUNDERX
+# if defined(USE_THUNDERX) || defined (USE_THUNDERX2)
 
 	/* On thunderx, large memcpy's are helped by software prefetching.
 	   This loop is identical to the one below it but with prefetching
@@ -194,7 +196,11 @@ L(copy_long):
 	bic	dst, dstin, 15
 	ldp	D_l, D_h, [src]
 	sub	src, src, tmp1
+#  if defined(USE_THUNDERX)
 	prfm	pldl1strm, [src, 384]
+#  elif defined(USE_THUNDERX2)
+	prfm	pldl1strm, [src, 256]
+#  endif
 	add	count, count, tmp1	/* Count is now 16 too large.  */
 	ldp	A_l, A_h, [src, 16]
 	stp	D_l, D_h, [dstin]
@@ -204,9 +210,13 @@ L(copy_long):
 	subs	count, count, 128 + 16	/* Test and readjust count.  */
 
 L(prefetch_loop64):
+#  if defined(USE_THUNDERX)
 	tbz	src, #6, 1f
 	prfm	pldl1strm, [src, 512]
 1:
+#  elif defined(USE_THUNDERX2)
+	prfm	pldl1strm, [src, 256]
+#  endif
 	stp	A_l, A_h, [dst, 16]
 	ldp	A_l, A_h, [src, 16]
 	stp	B_l, B_h, [dst, 32]
diff --git a/sysdeps/aarch64/multiarch/memcpy_thunderx2.S b/sysdeps/aarch64/multiarch/memcpy_thunderx2.S
index e69de29..8501abf 100644
--- a/sysdeps/aarch64/multiarch/memcpy_thunderx2.S
+++ b/sysdeps/aarch64/multiarch/memcpy_thunderx2.S
@@ -0,0 +1,27 @@
+/* A Thunderx2 Optimized memcpy implementation for AARCH64.
+   Copyright (C) 2018 Free Software Foundation, Inc.
+
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* The actual code in this memcpy and memmove is in memcpy_thunderx.S.
+   The only real differences are with the prefetching instructions.  */
+
+#define MEMCPY __memcpy_thunderx2
+#define MEMMOVE __memmove_thunderx2
+#define USE_THUNDERX2
+
+#include "memcpy_thunderx.S"
diff --git a/sysdeps/unix/sysv/linux/aarch64/cpu-features.h b/sysdeps/unix/sysv/linux/aarch64/cpu-features.h
index c646f9d..cde655b 100644
--- a/sysdeps/unix/sysv/linux/aarch64/cpu-features.h
+++ b/sysdeps/unix/sysv/linux/aarch64/cpu-features.h
@@ -41,6 +41,11 @@
 #define IS_THUNDERX(midr) (MIDR_IMPLEMENTOR(midr) == 'C'	\
 			   && MIDR_PARTNUM(midr) == 0x0a1)
 
+#define IS_THUNDERX2PA(midr) (MIDR_IMPLEMENTOR(midr) == 'B'     \
+			   && MIDR_PARTNUM(midr) == 0x516)
+#define IS_THUNDERX2(midr) (MIDR_IMPLEMENTOR(midr) == 'C'       \
+			   && MIDR_PARTNUM(midr) == 0xaf)
+
 #define IS_FALKOR(midr) (MIDR_IMPLEMENTOR(midr) == 'Q'			      \
                         && MIDR_PARTNUM(midr) == 0xc00)
 

Attachment: bench-memcpy-walk.out
Description: Text document

Attachment: bench-memcpy-large.out
Description: Text document

Attachment: bench-memcpy-random.out
Description: Text document

Attachment: bench-memcpy.out
Description: Text document


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]