[PATCH 18/21] mips: libc: memcpy prefetches beyond copied memory
Aleksandar Rikalo
arikalo@gmail.com
Thu Oct 31 05:49:34 GMT 2024
From: Faraz Shahbazker <fshahbazker@wavecomp.com>
Fix prefetching in the core loop so that it no longer reads beyond the
memory region being copied. Revert the accidentally changed prefetch
hint back to streaming mode. Refactor various bits and add
pre-processor checks so that the tuning parameters can be overridden
from the compiler command line.
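
As a purely illustrative example (the cross-compiler name and the
values below are hypothetical, not recommendations), the tunables made
overridable by this patch can be supplied as preprocessor definitions
when building memcpy.c:

    # hypothetical compiler name and flag values, for illustration only
    mips-elf-gcc -O2 -c memcpy.c \
        -DLATENCY_CYCLES=100 -DBLOCK_CYCLES=20 \
        -DCACHE_LINE=64 -DBLOCK_SIZE=16 \
        -DENABLE_PREFETCH_CHECK=1

The CACHE_LINE and BLOCK_CYCLES overrides only take effect for targets
that do not already select tuned values, BLOCK_SIZE is restricted to 8
or 16 by the new #error check, and ENABLE_PREFETCH_CHECK=1 turns on the
assert-based bounds check on each prefetch address.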
Signed-off-by: Faraz Shahbazker <fshahbazker@wavecomp.com>
Signed-off-by: Aleksandar Rikalo <arikalo@gmail.com>
---
newlib/libc/machine/mips/memcpy.c | 150 +++++++++++++++++++-----------
1 file changed, 97 insertions(+), 53 deletions(-)
diff --git a/newlib/libc/machine/mips/memcpy.c b/newlib/libc/machine/mips/memcpy.c
index 2d5031814..03ef299b5 100644
--- a/newlib/libc/machine/mips/memcpy.c
+++ b/newlib/libc/machine/mips/memcpy.c
@@ -27,7 +27,9 @@
*/
/* Typical observed latency in cycles in fetching from DRAM. */
-#define LATENCY_CYCLES 63
+#ifndef LATENCY_CYCLES
+ #define LATENCY_CYCLES 63
+#endif
/* Pre-fetch performance is subject to accurate prefetch ahead,
which in turn depends on both the cache-line size and the amount
@@ -44,30 +46,42 @@
#define LATENCY_CYCLES 150
#elif defined(_MIPS_TUNE_I6400) || defined(_MIPS_TUNE_I6500)
#define CACHE_LINE 64
- #define BLOCK_CYCLES 16
+ #define BLOCK_CYCLES 15
#elif defined(_MIPS_TUNE_P6600)
#define CACHE_LINE 32
- #define BLOCK_CYCLES 12
-#elif defined(_MIPS_TUNE_INTERAPTIV) || defined(_MIPS_TUNE_INTERAPTIV_MR2)
+ #define BLOCK_CYCLES 15
+#elif defined(_MIPS_TUNE_INTERAPTIV) || defined(_MIPS_TUNE_INTERAPTIV_MR2)
#define CACHE_LINE 32
#define BLOCK_CYCLES 30
#else
- #define CACHE_LINE 32
- #define BLOCK_CYCLES 11
+ #ifndef CACHE_LINE
+ #define CACHE_LINE 32
+ #endif
+ #ifndef BLOCK_CYCLES
+ #ifdef __nanomips__
+ #define BLOCK_CYCLES 20
+ #else
+ #define BLOCK_CYCLES 11
+ #endif
+ #endif
#endif
/* Pre-fetch look ahead = ceil (latency / block-cycles) */
#define PREF_AHEAD (LATENCY_CYCLES / BLOCK_CYCLES \
+ ((LATENCY_CYCLES % BLOCK_CYCLES) == 0 ? 0 : 1))
-/* Unroll-factor, controls how many words at a time in the core loop. */
-#define BLOCK (CACHE_LINE == 128 ? 16 : 8)
+/* The unroll-factor controls how many words at a time in the core loop. */
+#ifndef BLOCK_SIZE
+ #define BLOCK_SIZE (CACHE_LINE == 128 ? 16 : 8)
+#elif BLOCK_SIZE != 8 && BLOCK_SIZE != 16
+ #error "BLOCK_SIZE must be 8 or 16"
+#endif
#define __overloadable
-#ifndef UNALIGNED_INSTR_SUPPORT
+#if !defined(UNALIGNED_INSTR_SUPPORT)
/* does target have unaligned lw/ld/ualw/uald instructions? */
#define UNALIGNED_INSTR_SUPPORT 0
- #if (__mips_isa_rev < 6 && !__mips1)
+#if (__mips_isa_rev < 6 && !defined(__mips1)) || defined(__nanomips__)
#undef UNALIGNED_INSTR_SUPPORT
#define UNALIGNED_INSTR_SUPPORT 1
#endif
@@ -75,17 +89,35 @@
#if !defined(HW_UNALIGNED_SUPPORT)
/* Does target have hardware support for unaligned accesses? */
#define HW_UNALIGNED_SUPPORT 0
- #if __mips_isa_rev >= 6
+ #if __mips_isa_rev >= 6 && !defined(__nanomips__)
#undef HW_UNALIGNED_SUPPORT
#define HW_UNALIGNED_SUPPORT 1
#endif
#endif
-#define ENABLE_PREFETCH 1
+
+#ifndef ENABLE_PREFETCH
+ #define ENABLE_PREFETCH 1
+#endif
+
+#ifndef ENABLE_PREFETCH_CHECK
+ #define ENABLE_PREFETCH_CHECK 0
+#endif
+
#if ENABLE_PREFETCH
- #define PREFETCH(addr) __builtin_prefetch (addr, 0, 0)
-#else
+ #if ENABLE_PREFETCH_CHECK
+#include <assert.h>
+static char *limit;
+#define PREFETCH(addr) \
+ do { \
+ assert ((char *)(addr) < limit); \
+ __builtin_prefetch ((addr), 0, 1); \
+ } while (0)
+#else /* ENABLE_PREFETCH_CHECK */
+ #define PREFETCH(addr) __builtin_prefetch (addr, 0, 1)
+ #endif /* ENABLE_PREFETCH_CHECK */
+#else /* ENABLE_PREFETCH */
#define PREFETCH(addr)
-#endif
+#endif /* ENABLE_PREFETCH */
#include <string.h>
@@ -95,17 +127,18 @@ typedef struct
{
reg_t B0:8, B1:8, B2:8, B3:8, B4:8, B5:8, B6:8, B7:8;
} bits_t;
-#else
+#else /* __mips64 */
typedef unsigned long reg_t;
typedef struct
{
reg_t B0:8, B1:8, B2:8, B3:8;
} bits_t;
-#endif
+#endif /* __mips64 */
-#define CACHE_LINES_PER_BLOCK ((BLOCK * sizeof (reg_t) > CACHE_LINE) ? \
- (BLOCK * sizeof (reg_t) / CACHE_LINE) \
- : 1)
+#define CACHE_LINES_PER_BLOCK \
+ ((BLOCK_SIZE * sizeof (reg_t) > CACHE_LINE) \
+ ? (BLOCK_SIZE * sizeof (reg_t) / CACHE_LINE) \
+ : 1)
typedef union
{
@@ -116,7 +149,7 @@ typedef union
#define DO_BYTE(a, i) \
a[i] = bw.b.B##i; \
len--; \
- if(!len) return ret; \
+ if (!len) return ret; \
/* This code is called when aligning a pointer, there are remaining bytes
after doing word compares, or architecture does not have some form
@@ -144,7 +177,7 @@ do_bytes_remaining (void *a, const void *b, unsigned long len, void *ret)
{
unsigned char *x = (unsigned char *) a;
bitfields_t bw;
- if(len > 0)
+ if (len > 0)
{
bw.v = *(reg_t *)b;
DO_BYTE(x, 0);
@@ -155,7 +188,7 @@ do_bytes_remaining (void *a, const void *b, unsigned long len, void *ret)
DO_BYTE(x, 4);
DO_BYTE(x, 5);
DO_BYTE(x, 6);
-#endif
+#endif /* __mips64 */
}
return ret;
}
@@ -166,7 +199,7 @@ do_words_remaining (reg_t *a, const reg_t *b, unsigned long words,
{
/* Use a set-back so that load/stores have incremented addresses in
order to promote bonding. */
- int off = (BLOCK - words);
+ int off = (BLOCK_SIZE - words);
a -= off;
b -= off;
switch (off)
@@ -178,7 +211,7 @@ do_words_remaining (reg_t *a, const reg_t *b, unsigned long words,
case 5: a[5] = b[5];
case 6: a[6] = b[6];
case 7: a[7] = b[7];
-#if BLOCK==16
+#if BLOCK_SIZE==16
case 8: a[8] = b[8];
case 9: a[9] = b[9];
case 10: a[10] = b[10];
@@ -187,9 +220,9 @@ do_words_remaining (reg_t *a, const reg_t *b, unsigned long words,
case 13: a[13] = b[13];
case 14: a[14] = b[14];
case 15: a[15] = b[15];
-#endif
+#endif /* BLOCK_SIZE==16 */
}
- return do_bytes_remaining (a + BLOCK, b + BLOCK, bytes, ret);
+ return do_bytes_remaining (a + BLOCK_SIZE, b + BLOCK_SIZE, bytes, ret);
}
#if !HW_UNALIGNED_SUPPORT
@@ -206,7 +239,7 @@ do_uwords_remaining (struct ulw *a, const reg_t *b, unsigned long words,
{
/* Use a set-back so that load/stores have incremented addresses in
order to promote bonding. */
- int off = (BLOCK - words);
+ int off = (BLOCK_SIZE - words);
a -= off;
b -= off;
switch (off)
@@ -218,7 +251,7 @@ do_uwords_remaining (struct ulw *a, const reg_t *b, unsigned long words,
case 5: a[5].uli = b[5];
case 6: a[6].uli = b[6];
case 7: a[7].uli = b[7];
-#if BLOCK==16
+#if BLOCK_SIZE==16
case 8: a[8].uli = b[8];
case 9: a[9].uli = b[9];
case 10: a[10].uli = b[10];
@@ -227,9 +260,9 @@ do_uwords_remaining (struct ulw *a, const reg_t *b, unsigned long words,
case 13: a[13].uli = b[13];
case 14: a[14].uli = b[14];
case 15: a[15].uli = b[15];
-#endif
+#endif /* BLOCK_SIZE==16 */
}
- return do_bytes_remaining (a + BLOCK, b + BLOCK, bytes, ret);
+ return do_bytes_remaining (a + BLOCK_SIZE, b + BLOCK_SIZE, bytes, ret);
}
/* The first pointer is not aligned while second pointer is. */
@@ -238,13 +271,19 @@ unaligned_words (struct ulw *a, const reg_t * b,
unsigned long words, unsigned long bytes, void *ret)
{
unsigned long i, words_by_block, words_by_1;
- words_by_1 = words % BLOCK;
- words_by_block = words / BLOCK;
+ words_by_1 = words % BLOCK_SIZE;
+ words_by_block = words / BLOCK_SIZE;
+
for (; words_by_block > 0; words_by_block--)
{
- if (words_by_block >= PREF_AHEAD - CACHE_LINES_PER_BLOCK)
+ /* This condition is deliberately conservative. One could theoretically
+ pre-fetch another time around in some cases without crossing the page
+ boundary at the limit, but checking for the right conditions here is
+ too expensive to be worth it. */
+ if (words_by_block > PREF_AHEAD)
for (i = 0; i < CACHE_LINES_PER_BLOCK; i++)
- PREFETCH (b + (BLOCK / CACHE_LINES_PER_BLOCK) * (PREF_AHEAD + i));
+ PREFETCH (b + ((BLOCK_SIZE / CACHE_LINES_PER_BLOCK)
+ * (PREF_AHEAD + i)));
reg_t y0 = b[0], y1 = b[1], y2 = b[2], y3 = b[3];
reg_t y4 = b[4], y5 = b[5], y6 = b[6], y7 = b[7];
@@ -256,7 +295,7 @@ unaligned_words (struct ulw *a, const reg_t * b,
a[5].uli = y5;
a[6].uli = y6;
a[7].uli = y7;
-#if BLOCK==16
+#if BLOCK_SIZE==16
y0 = b[8], y1 = b[9], y2 = b[10], y3 = b[11];
y4 = b[12], y5 = b[13], y6 = b[14], y7 = b[15];
a[8].uli = y0;
@@ -267,16 +306,16 @@ unaligned_words (struct ulw *a, const reg_t * b,
a[13].uli = y5;
a[14].uli = y6;
a[15].uli = y7;
-#endif
- a += BLOCK;
- b += BLOCK;
+#endif /* BLOCK_SIZE==16 */
+ a += BLOCK_SIZE;
+ b += BLOCK_SIZE;
}
/* Mop up any remaining bytes. */
return do_uwords_remaining (a, b, words_by_1, bytes, ret);
}
-#else
+#else /* !UNALIGNED_INSTR_SUPPORT */
/* No HW support or unaligned lw/ld/ualw/uald instructions. */
static void *
@@ -294,7 +333,7 @@ unaligned_words (reg_t * a, const reg_t * b,
x[1] = bw.b.B1;
x[2] = bw.b.B2;
x[3] = bw.b.B3;
-#if __mips64
+#ifdef __mips64
x[4] = bw.b.B4;
x[5] = bw.b.B5;
x[6] = bw.b.B6;
@@ -316,13 +355,15 @@ aligned_words (reg_t * a, const reg_t * b,
unsigned long words, unsigned long bytes, void *ret)
{
unsigned long i, words_by_block, words_by_1;
- words_by_1 = words % BLOCK;
- words_by_block = words / BLOCK;
+ words_by_1 = words % BLOCK_SIZE;
+ words_by_block = words / BLOCK_SIZE;
+
for (; words_by_block > 0; words_by_block--)
{
- if(words_by_block >= PREF_AHEAD - CACHE_LINES_PER_BLOCK)
+ if (words_by_block > PREF_AHEAD)
for (i = 0; i < CACHE_LINES_PER_BLOCK; i++)
- PREFETCH (b + ((BLOCK / CACHE_LINES_PER_BLOCK) * (PREF_AHEAD + i)));
+ PREFETCH (b + ((BLOCK_SIZE / CACHE_LINES_PER_BLOCK)
+ * (PREF_AHEAD + i)));
reg_t x0 = b[0], x1 = b[1], x2 = b[2], x3 = b[3];
reg_t x4 = b[4], x5 = b[5], x6 = b[6], x7 = b[7];
@@ -334,7 +375,7 @@ aligned_words (reg_t * a, const reg_t * b,
a[5] = x5;
a[6] = x6;
a[7] = x7;
-#if BLOCK==16
+#if BLOCK_SIZE==16
x0 = b[8], x1 = b[9], x2 = b[10], x3 = b[11];
x4 = b[12], x5 = b[13], x6 = b[14], x7 = b[15];
a[8] = x0;
@@ -345,9 +386,9 @@ aligned_words (reg_t * a, const reg_t * b,
a[13] = x5;
a[14] = x6;
a[15] = x7;
-#endif
- a += BLOCK;
- b += BLOCK;
+#endif /* BLOCK_SIZE==16 */
+ a += BLOCK_SIZE;
+ b += BLOCK_SIZE;
}
/* mop up any remaining bytes. */
@@ -359,13 +400,16 @@ memcpy (void *a, const void *b, size_t len) __overloadable
{
unsigned long bytes, words, i;
void *ret = a;
+#if ENABLE_PREFETCH_CHECK
+ limit = (char *)b + len;
+#endif /* ENABLE_PREFETCH_CHECK */
/* shouldn't hit that often. */
if (len <= 8)
return do_bytes (a, b, len, a);
/* Start pre-fetches ahead of time. */
- if (len > CACHE_LINE * (PREF_AHEAD - 1))
- for (i = 1; i < PREF_AHEAD - 1; i++)
+ if (len > CACHE_LINE * PREF_AHEAD)
+ for (i = 1; i < PREF_AHEAD; i++)
PREFETCH ((char *)b + CACHE_LINE * i);
else
for (i = 1; i < len / CACHE_LINE; i++)
@@ -396,10 +440,10 @@ memcpy (void *a, const void *b, size_t len) __overloadable
#if HW_UNALIGNED_SUPPORT
/* treat possible unaligned first pointer as aligned. */
return aligned_words (a, b, words, bytes, ret);
-#else
+#else /* !HW_UNALIGNED_SUPPORT */
if (((unsigned long) a) % sizeof (reg_t) == 0)
return aligned_words (a, b, words, bytes, ret);
/* need to use unaligned instructions on first pointer. */
return unaligned_words (a, b, words, bytes, ret);
-#endif
+#endif /* HW_UNALIGNED_SUPPORT */
}
--
2.25.1