This is the mail archive of the newlib@sourceware.org mailing list for the newlib project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

Re: [patch] Fix and reenable memcpy/memset for m68k


On Apr 30, 2007, at 3:39 PM, Joel Sherrill wrote:

Kazu,

Just to finish this one completely off. I want to let you know that I pinged Eric Norum
to review this because he was responsible for the m68k specific memcpy RTEMS uses.
Where possible we prefer to use newlib code instead of overriding it with our own
version. We are inclined to switch to your implementation but I would
appreciate you reviewing to the attached memcpy.c to see if Eric had any
optimization ideas that aren't also in yours.


FYI parts of the RTEMS build structure can distinguish CPU models at a finer
granularity than multilib allows. The attached memcpy.c has some optimizations
to account for the CPU32+.
Thanks.

The optimization in question is to put the CPU32+ (and 68010 if there are still any of those in the wild) into 'loop mode'. This is a special case of a single two-byte instruction followed by a dbra. This allows the CPU to repeatedly execute the two instructions without any instruction fetch cycles. Thus, the assembly code from Kazu Hirata actually incurs a performance penalty by unrolling the loops.
Whether or not this optimization is worthwhile adding is left as an argument for others to decide. Are CPU32+ architecture machines even available any more?



--joel


Kazu Hirata wrote:
Hi,

Attached is a patch to fix and reenable memcpy/memset as suggested by
Eric.  (Thanks Eric for pointing our the bug!)

Tested on fido-elf. OK to apply?

Kazu Hirata

2007-04-30 Kazu Hirata <kazu@codesourcery.com>

	* libc/machine/m68k/Makefile.am (lib_a_SOURCES): Add memcpy.S
	and memset.S.
	* libc/machine/m68k/Makefile.in: Regenerate.
	* libc/machine/m68k/memcpy.S: Use sub.l followed by dbra.
	* libc/machine/m68k/memset.S: Likewise.

Index: newlib/libc/machine/m68k/Makefile.am
===================================================================
RCS file: /cvs/src/src/newlib/libc/machine/m68k/Makefile.am,v
retrieving revision 1.5
diff -u -d -p -r1.5 Makefile.am
--- newlib/libc/machine/m68k/Makefile.am 27 Apr 2007 22:10:47 -0000 1.5
+++ newlib/libc/machine/m68k/Makefile.am 30 Apr 2007 19:59:36 -0000
@@ -8,7 +8,7 @@ AM_CCASFLAGS = $(INCLUDES)
noinst_LIBRARIES = lib.a
-lib_a_SOURCES = setjmp.S strcpy.c strlen.c
+lib_a_SOURCES = setjmp.S strcpy.c strlen.c memcpy.S memset.S
lib_a_CCASFLAGS=$(AM_CCASFLAGS)
lib_a_CFLAGS=$(AM_CFLAGS)
Index: newlib/libc/machine/m68k/Makefile.in
===================================================================
RCS file: /cvs/src/src/newlib/libc/machine/m68k/Makefile.in,v
retrieving revision 1.13
diff -u -d -p -r1.13 Makefile.in
--- newlib/libc/machine/m68k/Makefile.in 27 Apr 2007 22:10:47 -0000 1.13
+++ newlib/libc/machine/m68k/Makefile.in 30 Apr 2007 19:59:36 -0000
@@ -56,7 +56,8 @@ ARFLAGS = cru
lib_a_AR = $(AR) $(ARFLAGS)
lib_a_LIBADD =
am_lib_a_OBJECTS = lib_a-setjmp.$(OBJEXT) lib_a-strcpy.$(OBJEXT) \
- lib_a-strlen.$(OBJEXT)
+ lib_a-strlen.$(OBJEXT) lib_a-memcpy.$(OBJEXT) \
+ lib_a-memset.$(OBJEXT)
lib_a_OBJECTS = $(am_lib_a_OBJECTS)
DEFAULT_INCLUDES = -I. -I$(srcdir)
depcomp =
@@ -181,7 +182,7 @@ AUTOMAKE_OPTIONS = cygnus
INCLUDES = $(NEWLIB_CFLAGS) $(CROSS_CFLAGS) $(TARGET_CFLAGS)
AM_CCASFLAGS = $(INCLUDES)
noinst_LIBRARIES = lib.a
-lib_a_SOURCES = setjmp.S strcpy.c strlen.c
+lib_a_SOURCES = setjmp.S strcpy.c strlen.c memcpy.S memset.S
lib_a_CCASFLAGS = $(AM_CCASFLAGS)
lib_a_CFLAGS = $(AM_CFLAGS)
ACLOCAL_AMFLAGS = -I ../../..
@@ -249,6 +250,18 @@ lib_a-setjmp.o: setjmp.S
lib_a-setjmp.obj: setjmp.S
$(CCAS) $(lib_a_CCASFLAGS) $(CCASFLAGS) -c -o lib_a-setjmp.obj `if test -f 'setjmp.S'; then $(CYGPATH_W) 'setjmp.S'; else $ (CYGPATH_W) '$(srcdir)/setjmp.S'; fi`
+lib_a-memcpy.o: memcpy.S
+ $(CCAS) $(lib_a_CCASFLAGS) $(CCASFLAGS) -c -o lib_a-memcpy.o `test -f 'memcpy.S' || echo '$(srcdir)/'`memcpy.S
+
+lib_a-memcpy.obj: memcpy.S
+ $(CCAS) $(lib_a_CCASFLAGS) $(CCASFLAGS) -c -o lib_a-memcpy.obj `if test -f 'memcpy.S'; then $(CYGPATH_W) 'memcpy.S'; else $ (CYGPATH_W) '$(srcdir)/memcpy.S'; fi`
+
+lib_a-memset.o: memset.S
+ $(CCAS) $(lib_a_CCASFLAGS) $(CCASFLAGS) -c -o lib_a-memset.o `test -f 'memset.S' || echo '$(srcdir)/'`memset.S
+
+lib_a-memset.obj: memset.S
+ $(CCAS) $(lib_a_CCASFLAGS) $(CCASFLAGS) -c -o lib_a-memset.obj `if test -f 'memset.S'; then $(CYGPATH_W) 'memset.S'; else $ (CYGPATH_W) '$(srcdir)/memset.S'; fi`
+
.c.o:
$(COMPILE) -c $<
Index: newlib/libc/machine/m68k/memcpy.S
===================================================================
RCS file: /cvs/src/src/newlib/libc/machine/m68k/memcpy.S,v
retrieving revision 1.1
diff -u -d -p -r1.1 memcpy.S
--- newlib/libc/machine/m68k/memcpy.S 27 Apr 2007 16:05:35 -0000 1.1
+++ newlib/libc/machine/m68k/memcpy.S 30 Apr 2007 19:59:36 -0000
@@ -73,10 +73,11 @@ memcpy:
.Lcopy:
#if !defined (__mcoldfire__)
dbra %d0,1b
+ sub.l #0x10000,%d0
#else
subq.l #1,%d0
- bpl 1b
#endif
+ bpl 1b
bra .Lresidue
1:
Index: newlib/libc/machine/m68k/memset.S
===================================================================
RCS file: /cvs/src/src/newlib/libc/machine/m68k/memset.S,v
retrieving revision 1.1
diff -u -d -p -r1.1 memset.S
--- newlib/libc/machine/m68k/memset.S 27 Apr 2007 16:05:35 -0000 1.1
+++ newlib/libc/machine/m68k/memset.S 30 Apr 2007 19:59:36 -0000
@@ -76,10 +76,11 @@ memset:
.Llset:
#if !defined (__mcoldfire__)
dbra %d2,1b | loop until done
+ sub.l #0x10000,%d2
#else
subq.l #1,%d2
- bpl 1b
#endif
+ bpl 1b
and.l #3,%d1 | residue byte transfers, fixed
move.l (%sp)+,%d2 | restore d2
bra .Lbset



/* * C library memcpy routine * * This routine has code to optimize performance on the CPU32+ * and another version for other 68k machines. * * It could be optimized more for machines with MOVE16 instructions. * * The routine is placed in this source directory to ensure that it * is picked up by all applications. * * W. Eric Norum * Saskatchewan Accelerator Laboratory * University of Saskatchewan * Saskatoon, Saskatchewan, CANADA * eric@skatter.usask.ca */

#include <string.h>
#include <rtems/score/m68k.h>

#if defined(__mcpu32__)
#define COPYSETUP(n)	n--
#define COPY(to,from,n,size) \
	asm volatile ("1:\n" \
	     "\tmove." size " (%0)+,(%1)+\n" \
	     "\tdbf %2,1b\n"  \
	     "\tsub.l #0x10000,%2\n" \
	     "\tbpl.b 1b\n" : \
		"=a" (from), "=a" (to), "=d" (n) :\
		 "0" (from),  "1" (to), "2" (n) : \
		 "cc", "memory")
#else
#define COPYSETUP(n)
#define COPY(to,from,n,size) \
	asm volatile ("1:\n" \
	     "\tmove." size " (%0)+,(%1)+\n" \
	     "\tsubq.l #1,%2\n\tbne.b 1b\n" : \
		"=a" (from), "=a" (to), "=d" (n) :\
		 "0" (from),  "1" (to), "2" (n) : \
		 "cc", "memory")
#endif

/* gcc doesn't know that cpu32+ is better than cpu32 :( */
#if defined(__mcpu32p__)
#undef M68k_HAS_MISALIGNED
#define M68k_HAS_MISALIGNED 1
#endif

void *
memcpy(void *s1, const void *s2, size_t n)
{
	char *p1 = s1;
	const char *p2 = s2;

	if (n) {
		if (n < 16) {
			COPYSETUP (n);
			COPY (p1, p2, n, "b");
		}
		else {
			int nbyte;
			int nl;
			nbyte = (int)p1 & 0x3;
			if (nbyte) {
				nbyte = 4 - nbyte;
				n -= nbyte;
				COPYSETUP (nbyte);
				COPY (p1, p2, nbyte, "b");
			}
#if (M68K_HAS_MISALIGNED == 0)
			/*
			 * Take care of machines that can't
			 * handle misaligned references.
			 */
			if ((int)p2 & 0x1) {
				COPYSETUP (n);
				COPY (p1, p2, n, "b");
				return s1;
			}
#endif
			nl = (unsigned int)n >> 2;
			COPYSETUP (nl);
			COPY (p1, p2, nl, "l");
			nbyte = (int)n & 0x3;
			if (nbyte) {
				COPYSETUP (nbyte);
				COPY (p1, p2, nbyte, "b");
			}
		}
	}
	return s1;
}

-- Eric Norum <norume@aps.anl.gov> Advanced Photon Source Argonne National Laboratory (630) 252-4793



Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]