[PATCHv2] ARM: NEON optimized implementation of memcpy.
- From: Siarhei Siamashka <siarhei.siamashka@nokia.com>
- To: libc-ports@sourceware.org
- Date: Sun, 5 Jul 2009 18:21:03 +0300
- Subject: [PATCHv2] ARM: NEON optimized implementation of memcpy.
NEON optimizations provide a ~1.5x speedup when copying memory blocks
that are much larger than the L2 cache size. The performance improvement
varies for other block sizes, but the NEON code is always faster than
the code used for older ARM cores.
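
For reference, the large-block figure above can be reproduced with a
small harness along the following lines (an illustrative sketch, not
part of the patch; the 16 MiB block size and iteration count are
arbitrary, and linking with -lrt may be needed for clock_gettime on
older glibc):

#define _POSIX_C_SOURCE 199309L
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>

int main (void)
{
  /* A block much larger than any L2 cache; the size and the
     iteration count are arbitrary choices.  */
  size_t size = 16 * 1024 * 1024;
  int iterations = 64;
  int i;
  double secs;
  struct timespec t0, t1;
  char *src = malloc (size);
  char *dst = malloc (size);

  if (src == NULL || dst == NULL)
    return 1;
  memset (src, 0x55, size);

  clock_gettime (CLOCK_MONOTONIC, &t0);
  for (i = 0; i < iterations; i++)
    memcpy (dst, src, size);
  clock_gettime (CLOCK_MONOTONIC, &t1);

  /* Report copy throughput; reading dst also keeps the copies from
     being optimized away.  */
  secs = (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1e9;
  printf ("%.1f MB/s (last byte 0x%02x)\n",
          (double) size * iterations / secs / (1024 * 1024),
          (unsigned char) dst[size - 1]);
  free (src);
  free (dst);
  return 0;
}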
In order to enable the NEON code, ASFLAGS needs to be defined as
something like "-mcpu=cortex-a8 -mfloat-abi=softfp -mfpu=neon"
when building glibc.
This is an updated patch, now tuned for all memory block sizes,
including very small ones. The code improvements are mostly the result
of a discussion on the #beagleboard IRC channel with Mans Rullgard, the
author of the following ARM NEON related blog post:
http://hardwarebug.org/2008/12/31/arm-neon-memory-hazards/
The crossover between the ARM and NEON parts of the function is
carefully taken into account.
The patch now also optionally supports a configuration that uses
unaligned loads and stores (the ENABLE_UNALIGNED_MEM_ACCESSES macro
in the code below); they are quite a bit faster on Cortex-A8.
But the code does not use unaligned memory accesses by default.
The intention is to have an absolutely safe drop-in replacement for
the existing memcpy function, guaranteed not to cause any problems.
Maybe this can be tweaked later.
---
sysdeps/arm/memcpy.S | 132 ++++++++++++++++++++++++++++++++++++++++++++++++++
1 files changed, 132 insertions(+), 0 deletions(-)
diff --git a/sysdeps/arm/memcpy.S b/sysdeps/arm/memcpy.S
index 61cf33c..d562ef2 100644
--- a/sysdeps/arm/memcpy.S
+++ b/sysdeps/arm/memcpy.S
@@ -2,6 +2,7 @@
This file is part of the GNU C Library.
Contributed by MontaVista Software, Inc. (written by Nicolas Pitre)
+ NEON code contributed by Nokia Corporation (written by Siarhei Siamashka)
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
@@ -20,6 +21,135 @@
#include <sysdep.h>
+#ifdef __ARM_NEON__
+ .text
+ .fpu neon
+
+/*
+ * The ENABLE_UNALIGNED_MEM_ACCESSES macro can be defined to permit the
+ * use of the unaligned load/store memory accesses supported since ARMv6.
+ * This will further improve performance, but can, in theory, cause
+ * problems if somebody sets the SCTLR.A bit in the OS kernel (to trap
+ * each unaligned memory access) or somehow messes with strongly
+ * ordered/device memory.
+ */
+
+#define NEON_MAX_PREFETCH_DISTANCE 320
+
+ENTRY(memcpy)
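+ @ ip advances through the destination buffer, while r0 stays
+ @ untouched so that it can be returned to the caller unchanged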
+ mov ip, r0
+ cmp r2, #16
+ blt 4f @ Have less than 16 bytes to copy
+
+ @ First ensure 16 byte alignment for the destination buffer
+ tst r0, #0xF
+ beq 2f
+ tst r0, #1
+ ldrneb r3, [r1], #1
+ strneb r3, [ip], #1
+ subne r2, r2, #1
+ tst ip, #2
+#ifdef ENABLE_UNALIGNED_MEM_ACCESSES
+ ldrneh r3, [r1], #2
+ strneh r3, [ip], #2
+#else
+ ldrneb r3, [r1], #1
+ strneb r3, [ip], #1
+ ldrneb r3, [r1], #1
+ strneb r3, [ip], #1
+#endif
+ subne r2, r2, #2
+
+ tst ip, #4
+ beq 1f
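+ @ Copy 4 bytes through one 8-bit lane of each of d0-d3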
+ vld4.8 {d0[0], d1[0], d2[0], d3[0]}, [r1]!
+ vst4.8 {d0[0], d1[0], d2[0], d3[0]}, [ip, :32]!
+ sub r2, r2, #4
+1:
+ tst ip, #8
+ beq 2f
+ vld1.8 {d0}, [r1]!
+ vst1.8 {d0}, [ip, :64]!
+ sub r2, r2, #8
+2:
+ subs r2, r2, #32
+ blt 3f
+ mov r3, #32
+
+ @ Main copy loop; 32 bytes are processed per iteration.
+ @ ARM instructions are used for fine-grained prefetch,
+ @ increasing the prefetch distance progressively up to
+ @ NEON_MAX_PREFETCH_DISTANCE at runtime
+1:
+ vld1.8 {d0-d3}, [r1]!
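+ @ r3 = current prefetch distance, increased by 32 per iteration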
+ cmp r3, #(NEON_MAX_PREFETCH_DISTANCE - 32)
+ pld [r1, r3]
+ addle r3, r3, #32
+ vst1.8 {d0-d3}, [ip, :128]!
+ sub r2, r2, #32
+ cmp r2, r3
+ bge 1b
+ cmp r2, #0
+ blt 3f
+1: @ Copy the remaining part of the buffer (already prefetched)
+ vld1.8 {d0-d3}, [r1]!
+ subs r2, r2, #32
+ vst1.8 {d0-d3}, [ip, :128]!
+ bge 1b
+3: @ Copy up to 31 remaining bytes
+ tst r2, #16
+ beq 4f
+ vld1.8 {d0, d1}, [r1]!
+ vst1.8 {d0, d1}, [ip, :128]!
+4:
+ @ Use ARM instructions exclusively for the final trailing part
+ @ that does not fully fit into a 16 byte aligned block, in order
+ @ to avoid an "ARM store after NEON store" hazard. Also, the
+ @ NEON pipeline will be (mostly) flushed by the time control
+ @ returns to the caller, making the use of NEON mostly
+ @ transparent (and avoiding hazards in the caller's code)
+
+#ifdef ENABLE_UNALIGNED_MEM_ACCESSES
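+ @ The lsl #29 shifts bit 3 of the byte count into the carry flag
+ @ and bit 2 into the sign flag: copy 8 bytes if C is set, then 4
+ @ bytes if N is set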
+ movs r3, r2, lsl #29
+ ldrcs r3, [r1], #4
+ strcs r3, [ip], #4
+ ldrcs r3, [r1], #4
+ strcs r3, [ip], #4
+ ldrmi r3, [r1], #4
+ strmi r3, [ip], #4
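+ @ lsl #31: C = bit 1 of the count (copy a halfword), N = bit 0
+ @ (copy the last byte)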
+ movs r2, r2, lsl #31
+ ldrcsh r3, [r1], #2
+ strcsh r3, [ip], #2
+ ldrmib r3, [r1], #1
+ strmib r3, [ip], #1
+#else
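+ @ The same flag tricks as above, but with byte accesses only,
+ @ since neither pointer is guaranteed to be sufficiently aligned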
+ movs r3, r2, lsl #29
+ bcc 1f
+ .rept 8
+ ldrcsb r3, [r1], #1
+ strcsb r3, [ip], #1
+ .endr
+1:
+ bpl 1f
+ .rept 4
+ ldrmib r3, [r1], #1
+ strmib r3, [ip], #1
+ .endr
+1:
+ movs r2, r2, lsl #31
+ ldrcsb r3, [r1], #1
+ strcsb r3, [ip], #1
+ ldrcsb r3, [r1], #1
+ strcsb r3, [ip], #1
+ ldrmib r3, [r1], #1
+ strmib r3, [ip], #1
+#endif
+ bx lr
+END(memcpy)
+libc_hidden_builtin_def (memcpy)
+
+#else
+
/*
* Data preload for architectures that support it (ARM V5TE and above)
*/
@@ -225,3 +355,5 @@ ENTRY(memcpy)
END(memcpy)
libc_hidden_builtin_def (memcpy)
+
+#endif
--
1.5.6.5