This is the mail archive of the libc-ports@sources.redhat.com mailing list for the libc-ports project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

[PATCHv2] ARM: NEON optimized implementation of memcpy.


NEON optimizations provide a ~1.5x speedup when copying memory blocks
that are much larger than the L2 cache size. The performance improvement
varies for other block sizes, but the NEON code is always faster than
the code used for older ARM cores.

In order to get NEON code enabled, ASFLAGS needs to be defined as
something like "-mcpu=cortex-a8 -mfloat-abi=softfp -mfpu=neon"
when building glibc.

This is an updated patch, now tuned for all memory block sizes,
including very small ones. The code improvements are mostly the result
of a discussion on the #beagleboard IRC channel with Mans Rullgard, the
author of the following ARM NEON related blog post:
http://hardwarebug.org/2008/12/31/arm-neon-memory-hazards/

Crossover between ARM and NEON parts of the function is carefully
taken into account.

The patch now also optionally supports a configuration that uses
unaligned loads and stores, which are quite a bit faster on Cortex-A8.
However, the code does not use unaligned memory accesses by default.
The intention is to have an absolutely safe drop-in replacement for
the existing memcpy function, guaranteed not to cause any problems.
Maybe this can be tweaked later.
---
 sysdeps/arm/memcpy.S |  132 ++++++++++++++++++++++++++++++++++++++++++++++++++
 1 files changed, 132 insertions(+), 0 deletions(-)

diff --git a/sysdeps/arm/memcpy.S b/sysdeps/arm/memcpy.S
index 61cf33c..d562ef2 100644
--- a/sysdeps/arm/memcpy.S
+++ b/sysdeps/arm/memcpy.S
@@ -2,6 +2,7 @@
    This file is part of the GNU C Library.
 
    Contributed by MontaVista Software, Inc. (written by Nicolas Pitre)
+   NEON code contributed by Nokia Corporation (written by Siarhei Siamashka)
 
    The GNU C Library is free software; you can redistribute it and/or
    modify it under the terms of the GNU Lesser General Public
@@ -20,6 +21,135 @@
 
 #include <sysdep.h>
 
+#ifdef __ARM_NEON__
+		.text
+		.fpu	neon
+
+/*
+ * ENABLE_UNALIGNED_MEM_ACCESSES macro can be defined to permit the use
+ * of unaligned load/store memory accesses supported since ARMv6. This
+ * will further improve performance, but can purely theoretically cause
+ * problems if somebody decides to set SCTLR.A bit in the OS kernel
+ * (to trap each unaligned memory access) or somehow mess with strongly
+ * ordered/device memory.
+ */
+
+#define NEON_MAX_PREFETCH_DISTANCE 320
+
+ENTRY(memcpy)
+		mov	ip, r0		@ ip = dst write cursor; r0 itself is never written below
+		cmp	r2, #16		@ r2 counts the bytes still to copy
+		blt     4f	@ Have less than 16 bytes to copy (signed compare)
+
+		@ First ensure 16 byte alignment for the destination buffer
+		tst	r0, #0xF
+		beq	2f		@ dst is already 16 byte aligned
+		tst	r0, #1
+		ldrneb	r3, [r1], #1	@ dst address is odd: move 1 byte
+		strneb	r3, [ip], #1
+		subne	r2, r2, #1
+		tst	ip, #2		@ bit 1 set: move 2 bytes to reach 4 byte alignment
+#ifdef ENABLE_UNALIGNED_MEM_ACCESSES
+		ldrneh	r3, [r1], #2	@ only dst alignment is tracked, so r1 may be odd here
+		strneh	r3, [ip], #2
+#else
+		ldrneb	r3, [r1], #1	@ same 2 bytes, moved one at a time
+		strneb	r3, [ip], #1
+		ldrneb	r3, [r1], #1
+		strneb	r3, [ip], #1
+#endif
+		subne	r2, r2, #2	@ still predicated on tst ip, 2 (loads/stores keep flags)
+
+		tst	ip, #4		@ bit 2 set: move 4 bytes to reach 8 byte alignment
+		beq	1f
+		vld4.8	{d0[0], d1[0], d2[0], d3[0]}, [r1]!	@ 4 bytes, one into lane 0 of each of d0-d3
+		vst4.8	{d0[0], d1[0], d2[0], d3[0]}, [ip, :32]!	@ :32 hint: dst is 4 byte aligned at this point
+		sub	r2, r2, #4
+1:
+		tst	ip, #8		@ bit 3 set: move 8 bytes to reach 16 byte alignment
+		beq	2f
+		vld1.8	{d0}, [r1]!
+		vst1.8	{d0}, [ip, :64]!	@ :64 hint: dst is 8 byte aligned at this point
+		sub	r2, r2, #8
+2:
+		subs	r2, r2, #32	@ from here on r2 = bytes left minus 32
+		blt	3f		@ fewer than 32 bytes left
+		mov	r3, #32		@ r3 = current prefetch distance in bytes
+
+		@ Main copy loop, 32 bytes are processed per iteration.
+		@ ARM instructions are used for doing fine-grained prefetch,
+		@ increasing prefetch distance progressively up to
+		@ NEON_MAX_PREFETCH_DISTANCE at runtime
+1:
+		vld1.8	{d0-d3}, [r1]!
+		cmp	r3, #(NEON_MAX_PREFETCH_DISTANCE - 32)	@ sets the flags used by addle below
+		pld	[r1, r3]	@ prefetch r3 bytes past the already advanced src pointer
+		addle	r3, r3, #32	@ grow the distance by 32 until it reaches the maximum
+		vst1.8	{d0-d3}, [ip, :128]!	@ :128 hint: dst was 16 byte aligned above
+		sub	r2, r2, #32
+		cmp	r2, r3
+		bge	1b		@ keep prefetching while r2 >= prefetch distance (signed)
+		cmp	r2, #0
+		blt	3f		@ no whole 32 byte block left
+1:		@ Copy the remaining part of the buffer (already prefetched)
+		vld1.8	{d0-d3}, [r1]!
+		subs	r2, r2, #32
+		vst1.8	{d0-d3}, [ip, :128]!
+		bge	1b		@ tests the flags from subs above
+3:		@ Copy up to 31 remaining bytes
+		tst	r2, #16		@ r2 is below zero here; its low 5 bits still hold the bytes left
+		beq	4f
+		vld1.8	{d0, d1}, [r1]!	@ r2 is not adjusted: only its bits 3..0 are read below
+		vst1.8	{d0, d1}, [ip, :128]!
+4:
+		@ Use ARM instructions exclusively for the final trailing part
+		@ not fully fitting into full 16 byte aligned block in order
+		@ to avoid "ARM store after NEON store" hazard. Also NEON
+		@ pipeline will be (mostly) flushed by the time when the
+		@ control returns to the caller, making the use of NEON mostly
+		@ transparent (and avoiding hazards in the caller code)
+
+#ifdef ENABLE_UNALIGNED_MEM_ACCESSES
+		movs	r3, r2, lsl #29	@ C = bit 3 of r2 (8 bytes), N = bit 2 of r2 (4 bytes)
+		ldrcs	r3, [r1], #4	@ r3 is reused as the data scratch; the flags survive
+		strcs	r3, [ip], #4
+		ldrcs	r3, [r1], #4
+		strcs	r3, [ip], #4
+		ldrmi	r3, [r1], #4
+		strmi	r3, [ip], #4
+		movs	r2, r2, lsl #31	@ C = bit 1 of r2 (2 bytes), N = bit 0 of r2 (1 byte)
+		ldrcsh	r3, [r1], #2
+		strcsh	r3, [ip], #2
+		ldrmib	r3, [r1], #1
+		strmib	r3, [ip], #1
+#else
+		movs	r3, r2, lsl #29	@ C = bit 3 of r2 (8 bytes), N = bit 2 of r2 (4 bytes)
+		bcc	1f		@ bit 3 clear: skip the 8 single byte copies
+	.rept	8
+		ldrcsb	r3, [r1], #1	@ r3 is reused as the data scratch; the flags survive
+		strcsb	r3, [ip], #1
+	.endr
+1:
+		bpl	1f		@ bit 2 clear: skip the 4 single byte copies
+	.rept	4
+		ldrmib	r3, [r1], #1
+		strmib	r3, [ip], #1
+	.endr
+1:
+		movs	r2, r2, lsl #31	@ C = bit 1 of r2 (2 bytes), N = bit 0 of r2 (1 byte)
+		ldrcsb	r3, [r1], #1
+		strcsb	r3, [ip], #1
+		ldrcsb	r3, [r1], #1
+		strcsb	r3, [ip], #1
+		ldrmib	r3, [r1], #1
+		strmib	r3, [ip], #1
+#endif
+		bx	lr		@ r0 still holds the original dst pointer
+END(memcpy)
+libc_hidden_builtin_def (memcpy)
+
+#else
+
 /*
  * Data preload for architectures that support it (ARM V5TE and above)
  */
@@ -225,3 +355,5 @@ ENTRY(memcpy)
 
 END(memcpy)
 libc_hidden_builtin_def (memcpy)
+
+#endif
-- 
1.5.6.5


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]