[PATCH 18/21] mips: libc: memcpy prefetches beyond copied memory

Aleksandar Rikalo <arikalo@gmail.com>
Thu Oct 31 05:49:34 GMT 2024


From: Faraz Shahbazker <fshahbazker@wavecomp.com>

Fix prefetching in the core loop to avoid reading beyond the memory
region being operated on.  Revert an accidentally changed prefetch
hint back to streaming mode.  Refactor various bits and provide
pre-processor checks that allow the tuning parameters to be
overridden from the compiler command line.

Signed-off-by: Faraz Shahbazker <fshahbazker@wavecomp.com>
Signed-off-by: Aleksandar Rikalo <arikalo@gmail.com>
---
 newlib/libc/machine/mips/memcpy.c | 150 +++++++++++++++++++-----------
 1 file changed, 97 insertions(+), 53 deletions(-)
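
Note on usage: with the new #ifndef guards, the tunables
(LATENCY_CYCLES, CACHE_LINE, BLOCK_CYCLES, BLOCK_SIZE, ENABLE_PREFETCH,
ENABLE_PREFETCH_CHECK) can be overridden from the compiler command line
whenever a _MIPS_TUNE_* selection does not pin them.  A hypothetical
invocation (the compiler name and the values are illustrative only, not
tuned recommendations):

  mips-elf-gcc -O2 -DLATENCY_CYCLES=100 -DBLOCK_CYCLES=20 \
      -DENABLE_PREFETCH_CHECK=1 -c memcpy.c

Building with -DENABLE_PREFETCH_CHECK=1 makes every PREFETCH() assert
that the prefetched address stays below the end of the source buffer,
so an out-of-bounds prefetch like the one fixed here trips at run time.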

diff --git a/newlib/libc/machine/mips/memcpy.c b/newlib/libc/machine/mips/memcpy.c
index 2d5031814..03ef299b5 100644
--- a/newlib/libc/machine/mips/memcpy.c
+++ b/newlib/libc/machine/mips/memcpy.c
@@ -27,7 +27,9 @@
 */
 
 /* Typical observed latency in cycles in fetching from DRAM.  */
-#define LATENCY_CYCLES 63
+#ifndef LATENCY_CYCLES
+ #define LATENCY_CYCLES 63
+#endif
 
 /* Pre-fetch performance is subject to accurate prefetch ahead,
    which in turn depends on both the cache-line size and the amount
@@ -44,30 +46,42 @@
  #define LATENCY_CYCLES 150
 #elif defined(_MIPS_TUNE_I6400) || defined(_MIPS_TUNE_I6500)
  #define CACHE_LINE 64
- #define BLOCK_CYCLES 16
+ #define BLOCK_CYCLES 15
 #elif defined(_MIPS_TUNE_P6600)
  #define CACHE_LINE 32
- #define BLOCK_CYCLES 12
-#elif defined(_MIPS_TUNE_INTERAPTIV) ||  defined(_MIPS_TUNE_INTERAPTIV_MR2)
+ #define BLOCK_CYCLES 15
+#elif defined(_MIPS_TUNE_INTERAPTIV) || defined(_MIPS_TUNE_INTERAPTIV_MR2)
  #define CACHE_LINE 32
  #define BLOCK_CYCLES 30
 #else
- #define CACHE_LINE 32
- #define BLOCK_CYCLES 11
+ #ifndef CACHE_LINE
+  #define CACHE_LINE 32
+ #endif
+ #ifndef BLOCK_CYCLES
+  #ifdef __nanomips__
+   #define BLOCK_CYCLES 20
+  #else
+   #define BLOCK_CYCLES 11
+  #endif
+ #endif
 #endif
 
 /* Pre-fetch look ahead = ceil (latency / block-cycles)  */
 #define PREF_AHEAD (LATENCY_CYCLES / BLOCK_CYCLES			\
 		    + ((LATENCY_CYCLES % BLOCK_CYCLES) == 0 ? 0 : 1))
 
-/* Unroll-factor, controls how many words at a time in the core loop.  */
-#define BLOCK (CACHE_LINE == 128 ? 16 : 8)
+/* The unroll-factor controls how many words at a time in the core loop.  */
+#ifndef BLOCK_SIZE
+ #define BLOCK_SIZE (CACHE_LINE == 128 ? 16 : 8)
+#elif BLOCK_SIZE != 8 && BLOCK_SIZE != 16
+ #error "BLOCK_SIZE must be 8 or 16"
+#endif
 
 #define __overloadable
-#ifndef UNALIGNED_INSTR_SUPPORT
+#if !defined(UNALIGNED_INSTR_SUPPORT)
 /* does target have unaligned lw/ld/ualw/uald instructions? */
  #define UNALIGNED_INSTR_SUPPORT 0
- #if (__mips_isa_rev < 6 && !__mips1)
+#if (__mips_isa_rev < 6 && !defined(__mips1)) || defined(__nanomips__)
   #undef UNALIGNED_INSTR_SUPPORT
   #define UNALIGNED_INSTR_SUPPORT 1
  #endif
@@ -75,17 +89,35 @@
 #if !defined(HW_UNALIGNED_SUPPORT)
 /* Does target have hardware support for unaligned accesses?  */
  #define HW_UNALIGNED_SUPPORT 0
- #if __mips_isa_rev >= 6
+ #if __mips_isa_rev >= 6 && !defined(__nanomips__)
   #undef HW_UNALIGNED_SUPPORT
   #define HW_UNALIGNED_SUPPORT 1
  #endif
 #endif
-#define ENABLE_PREFETCH     1
+
+#ifndef ENABLE_PREFETCH
+ #define ENABLE_PREFETCH 1
+#endif
+
+#ifndef ENABLE_PREFETCH_CHECK
+ #define ENABLE_PREFETCH_CHECK 0
+#endif
+
 #if ENABLE_PREFETCH
- #define PREFETCH(addr)  __builtin_prefetch (addr, 0, 0)
-#else
+ #if ENABLE_PREFETCH_CHECK
+#include <assert.h>
+static char *limit;
+#define PREFETCH(addr)				\
+  do {						\
+    assert ((char *)(addr) < limit);		\
+    __builtin_prefetch ((addr), 0, 1);		\
+  } while (0)
+#else /* ENABLE_PREFETCH_CHECK */
+  #define PREFETCH(addr)  __builtin_prefetch (addr, 0, 1)
+ #endif /* ENABLE_PREFETCH_CHECK */
+#else /* ENABLE_PREFETCH */
  #define PREFETCH(addr)
-#endif
+#endif /* ENABLE_PREFETCH */
 
 #include <string.h>
 
@@ -95,17 +127,18 @@ typedef struct
 {
   reg_t B0:8, B1:8, B2:8, B3:8, B4:8, B5:8, B6:8, B7:8;
 } bits_t;
-#else
+#else /* __mips64 */
 typedef unsigned long reg_t;
 typedef struct
 {
   reg_t B0:8, B1:8, B2:8, B3:8;
 } bits_t;
-#endif
+#endif /* __mips64 */
 
-#define CACHE_LINES_PER_BLOCK ((BLOCK * sizeof (reg_t) > CACHE_LINE) ?	\
-			       (BLOCK * sizeof (reg_t) / CACHE_LINE)	\
-			       : 1)
+#define CACHE_LINES_PER_BLOCK						\
+  ((BLOCK_SIZE * sizeof (reg_t) > CACHE_LINE)				\
+   ? (BLOCK_SIZE * sizeof (reg_t) / CACHE_LINE)				\
+   : 1)
 
 typedef union
 {
@@ -116,7 +149,7 @@ typedef union
 #define DO_BYTE(a, i)   \
   a[i] = bw.b.B##i;     \
   len--;                \
-  if(!len) return ret;  \
+  if (!len) return ret;  \
 
 /* This code is called when aligning a pointer, there are remaining bytes
    after doing word compares, or architecture does not have some form
@@ -144,7 +177,7 @@ do_bytes_remaining (void *a, const void *b, unsigned long len, void *ret)
 {
   unsigned char *x = (unsigned char *) a;
   bitfields_t bw;
-  if(len > 0)
+  if (len > 0)
     {
       bw.v = *(reg_t *)b;
       DO_BYTE(x, 0);
@@ -155,7 +188,7 @@ do_bytes_remaining (void *a, const void *b, unsigned long len, void *ret)
       DO_BYTE(x, 4);
       DO_BYTE(x, 5);
       DO_BYTE(x, 6);
-#endif
+#endif /* __mips64 */
     }
   return ret;
 }
@@ -166,7 +199,7 @@ do_words_remaining (reg_t *a, const reg_t *b, unsigned long words,
 {
   /* Use a set-back so that load/stores have incremented addresses in
      order to promote bonding.  */
-  int off = (BLOCK - words);
+  int off = (BLOCK_SIZE - words);
   a -= off;
   b -= off;
   switch (off)
@@ -178,7 +211,7 @@ do_words_remaining (reg_t *a, const reg_t *b, unsigned long words,
       case 5: a[5] = b[5];
       case 6: a[6] = b[6];
       case 7: a[7] = b[7];
-#if BLOCK==16
+#if BLOCK_SIZE==16
       case 8: a[8] = b[8];
       case 9: a[9] = b[9];
       case 10: a[10] = b[10];
@@ -187,9 +220,9 @@ do_words_remaining (reg_t *a, const reg_t *b, unsigned long words,
       case 13: a[13] = b[13];
       case 14: a[14] = b[14];
       case 15: a[15] = b[15];
-#endif
+#endif /* BLOCK_SIZE==16 */
     }
-  return do_bytes_remaining (a + BLOCK, b + BLOCK, bytes, ret);
+  return do_bytes_remaining (a + BLOCK_SIZE, b + BLOCK_SIZE, bytes, ret);
 }
 
 #if !HW_UNALIGNED_SUPPORT
@@ -206,7 +239,7 @@ do_uwords_remaining (struct ulw *a, const reg_t *b, unsigned long words,
 {
   /* Use a set-back so that load/stores have incremented addresses in
      order to promote bonding.  */
-  int off = (BLOCK - words);
+  int off = (BLOCK_SIZE - words);
   a -= off;
   b -= off;
   switch (off)
@@ -218,7 +251,7 @@ do_uwords_remaining (struct ulw *a, const reg_t *b, unsigned long words,
       case 5: a[5].uli = b[5];
       case 6: a[6].uli = b[6];
       case 7: a[7].uli = b[7];
-#if BLOCK==16
+#if BLOCK_SIZE==16
       case 8: a[8].uli = b[8];
       case 9: a[9].uli = b[9];
       case 10: a[10].uli = b[10];
@@ -227,9 +260,9 @@ do_uwords_remaining (struct ulw *a, const reg_t *b, unsigned long words,
       case 13: a[13].uli = b[13];
       case 14: a[14].uli = b[14];
       case 15: a[15].uli = b[15];
-#endif
+#endif /* BLOCK_SIZE==16 */
     }
-  return do_bytes_remaining (a + BLOCK, b + BLOCK, bytes, ret);
+  return do_bytes_remaining (a + BLOCK_SIZE, b + BLOCK_SIZE, bytes, ret);
 }
 
 /* The first pointer is not aligned while second pointer is.  */
@@ -238,13 +271,19 @@ unaligned_words (struct ulw *a, const reg_t * b,
 		 unsigned long words, unsigned long bytes, void *ret)
 {
   unsigned long i, words_by_block, words_by_1;
-  words_by_1 = words % BLOCK;
-  words_by_block = words / BLOCK;
+  words_by_1 = words % BLOCK_SIZE;
+  words_by_block = words / BLOCK_SIZE;
+
   for (; words_by_block > 0; words_by_block--)
     {
-      if (words_by_block >= PREF_AHEAD - CACHE_LINES_PER_BLOCK)
+      /* This condition is deliberately conservative.  One could theoretically
+	 pre-fetch another time around in some cases without crossing the page
+	 boundary at the limit, but checking for the right conditions here is
+	 too expensive to be worth it.  */
+      if (words_by_block > PREF_AHEAD)
 	for (i = 0; i < CACHE_LINES_PER_BLOCK; i++)
-	  PREFETCH (b + (BLOCK / CACHE_LINES_PER_BLOCK) * (PREF_AHEAD + i));
+	  PREFETCH (b + ((BLOCK_SIZE / CACHE_LINES_PER_BLOCK)
+			 * (PREF_AHEAD + i)));
 
       reg_t y0 = b[0], y1 = b[1], y2 = b[2], y3 = b[3];
       reg_t y4 = b[4], y5 = b[5], y6 = b[6], y7 = b[7];
@@ -256,7 +295,7 @@ unaligned_words (struct ulw *a, const reg_t * b,
       a[5].uli = y5;
       a[6].uli = y6;
       a[7].uli = y7;
-#if BLOCK==16
+#if BLOCK_SIZE==16
       y0 = b[8], y1 = b[9], y2 = b[10], y3 = b[11];
       y4 = b[12], y5 = b[13], y6 = b[14], y7 = b[15];
       a[8].uli = y0;
@@ -267,16 +306,16 @@ unaligned_words (struct ulw *a, const reg_t * b,
       a[13].uli = y5;
       a[14].uli = y6;
       a[15].uli = y7;
-#endif
-      a += BLOCK;
-      b += BLOCK;
+#endif /* BLOCK_SIZE==16 */
+      a += BLOCK_SIZE;
+      b += BLOCK_SIZE;
   }
 
   /* Mop up any remaining bytes.  */
   return do_uwords_remaining (a, b, words_by_1, bytes, ret);
 }
 
-#else
+#else /* !UNALIGNED_INSTR_SUPPORT */
 
 /* No HW support or unaligned lw/ld/ualw/uald instructions.  */
 static void *
@@ -294,7 +333,7 @@ unaligned_words (reg_t * a, const reg_t * b,
       x[1] = bw.b.B1;
       x[2] = bw.b.B2;
       x[3] = bw.b.B3;
-#if __mips64
+#ifdef __mips64
       x[4] = bw.b.B4;
       x[5] = bw.b.B5;
       x[6] = bw.b.B6;
@@ -316,13 +355,15 @@ aligned_words (reg_t * a, const reg_t * b,
 	       unsigned long words, unsigned long bytes, void *ret)
 {
   unsigned long i, words_by_block, words_by_1;
-  words_by_1 = words % BLOCK;
-  words_by_block = words / BLOCK;
+  words_by_1 = words % BLOCK_SIZE;
+  words_by_block = words / BLOCK_SIZE;
+
   for (; words_by_block > 0; words_by_block--)
     {
-      if(words_by_block >= PREF_AHEAD - CACHE_LINES_PER_BLOCK)
+      if (words_by_block > PREF_AHEAD)
 	for (i = 0; i < CACHE_LINES_PER_BLOCK; i++)
-	  PREFETCH (b + ((BLOCK / CACHE_LINES_PER_BLOCK) * (PREF_AHEAD + i)));
+	  PREFETCH (b + ((BLOCK_SIZE / CACHE_LINES_PER_BLOCK)
+			 * (PREF_AHEAD + i)));
 
       reg_t x0 = b[0], x1 = b[1], x2 = b[2], x3 = b[3];
       reg_t x4 = b[4], x5 = b[5], x6 = b[6], x7 = b[7];
@@ -334,7 +375,7 @@ aligned_words (reg_t * a, const reg_t * b,
       a[5] = x5;
       a[6] = x6;
       a[7] = x7;
-#if BLOCK==16
+#if BLOCK_SIZE==16
       x0 = b[8], x1 = b[9], x2 = b[10], x3 = b[11];
       x4 = b[12], x5 = b[13], x6 = b[14], x7 = b[15];
       a[8] = x0;
@@ -345,9 +386,9 @@ aligned_words (reg_t * a, const reg_t * b,
       a[13] = x5;
       a[14] = x6;
       a[15] = x7;
-#endif
-      a += BLOCK;
-      b += BLOCK;
+#endif /* BLOCK_SIZE==16 */
+      a += BLOCK_SIZE;
+      b += BLOCK_SIZE;
     }
 
   /* mop up any remaining bytes.  */
@@ -359,13 +400,16 @@ memcpy (void *a, const void *b, size_t len) __overloadable
 {
   unsigned long bytes, words, i;
   void *ret = a;
+#if ENABLE_PREFETCH_CHECK
+  limit = (char *)b + len;
+#endif /* ENABLE_PREFETCH_CHECK */
   /* shouldn't hit that often.  */
   if (len <= 8)
     return do_bytes (a, b, len, a);
 
   /* Start pre-fetches ahead of time.  */
-  if (len > CACHE_LINE * (PREF_AHEAD - 1))
-    for (i = 1; i < PREF_AHEAD - 1; i++)
+  if (len > CACHE_LINE * PREF_AHEAD)
+    for (i = 1; i < PREF_AHEAD; i++)
       PREFETCH ((char *)b + CACHE_LINE * i);
   else
     for (i = 1; i < len / CACHE_LINE; i++)
@@ -396,10 +440,10 @@ memcpy (void *a, const void *b, size_t len) __overloadable
 #if HW_UNALIGNED_SUPPORT
   /* treat possible unaligned first pointer as aligned.  */
   return aligned_words (a, b, words, bytes, ret);
-#else
+#else /* !HW_UNALIGNED_SUPPORT */
   if (((unsigned long) a) % sizeof (reg_t) == 0)
     return aligned_words (a, b, words, bytes, ret);
   /* need to use unaligned instructions on first pointer.  */
   return unaligned_words (a, b, words, bytes, ret);
-#endif
+#endif /* HW_UNALIGNED_SUPPORT */
 }
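
A quick sanity check of the look-ahead arithmetic (not part of the
patch; the asserts below merely instantiate the PREF_AHEAD formula
with the default LATENCY_CYCLES of 63 and BLOCK_CYCLES values from the
tune table above):

/* PREF_AHEAD = ceil (LATENCY_CYCLES / BLOCK_CYCLES)  */
_Static_assert (63 / 11 + ((63 % 11) == 0 ? 0 : 1) == 6, "generic");
_Static_assert (63 / 15 + ((63 % 15) == 0 ? 0 : 1) == 5, "I6400/P6600");
_Static_assert (63 / 20 + ((63 % 20) == 0 ? 0 : 1) == 4, "nanoMIPS");

As I read the core loop, the guard change is the heart of the fix: the
old condition (words_by_block >= PREF_AHEAD - CACHE_LINES_PER_BLOCK)
could still issue a prefetch whose target lies past the end of the
source region during the final iterations, whereas the new
(words_by_block > PREF_AHEAD) stops prefetching once PREF_AHEAD or
fewer blocks remain, trading a little look-ahead near the end for
never touching the next page, as the new comment explains.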
-- 
2.25.1