[patch] Fix and reenable memcpy/memset for m68k
Eric Norum
norume@aps.anl.gov
Mon Apr 30 21:29:00 GMT 2007
On Apr 30, 2007, at 3:39 PM, Joel Sherrill wrote:
> Kazu,
>
> Just to finish this one completely off. I want to let you know
> that I pinged Eric Norum
> to review this because he was responsible for the m68k specific
> memcpy RTEMS uses.
> Where possible we prefer to use newlib code instead of overriding
> it with our own
> version. We are inclined to switch to your implementation but I
> would
> appreciate you reviewing to the attached memcpy.c to see if Eric
> had any
> optimization ideas that aren't also in yours.
>
> FYI parts of the RTEMS build structure can distinguish CPU models
> at a finer
> granularity than multilib allows. The attached memcpy.c has some
> optimizations
> to account for the CPU32+.
> Thanks.
The optimization in question is to put the CPU32+ (and 68010 if there
are still any of those in the wild) into 'loop mode'. This is a
special case of a single two-byte instruction followed by a dbra.
This allows the CPU to repeatedly execute the two instructions
without any instruction fetch cycles. Thus, the assembly code from
Kazu Hirata actually incurs a performance penalty by unrolling the
loops.
Whether or not this optimization is worthwhile adding is left as an
argument for others to decide. Are CPU32+ architecture machines
even available any more?
>
> --joel
>
> Kazu Hirata wrote:
>> Hi,
>>
>> Attached is a patch to fix and reenable memcpy/memset as suggested by
>> Eric. (Thanks Eric for pointing our the bug!)
>>
>> Tested on fido-elf. OK to apply?
>>
>> Kazu Hirata
>>
>> 2007-04-30 Kazu Hirata <kazu@codesourcery.com>
>>
>> * libc/machine/m68k/Makefile.am (lib_a_SOURCES): Add memcpy.S
>> and memset.S.
>> * libc/machine/m68k/Makefile.in: Regenerate.
>> * libc/machine/m68k/memcpy.S: Use sub.l followed by dbra.
>> * libc/machine/m68k/memset.S: Likewise.
>>
>> Index: newlib/libc/machine/m68k/Makefile.am
>> ===================================================================
>> RCS file: /cvs/src/src/newlib/libc/machine/m68k/Makefile.am,v
>> retrieving revision 1.5
>> diff -u -d -p -r1.5 Makefile.am
>> --- newlib/libc/machine/m68k/Makefile.am 27 Apr 2007 22:10:47
>> -0000 1.5
>> +++ newlib/libc/machine/m68k/Makefile.am 30 Apr 2007 19:59:36 -0000
>> @@ -8,7 +8,7 @@ AM_CCASFLAGS = $(INCLUDES)
>> noinst_LIBRARIES = lib.a
>> -lib_a_SOURCES = setjmp.S strcpy.c strlen.c
>> +lib_a_SOURCES = setjmp.S strcpy.c strlen.c memcpy.S memset.S
>> lib_a_CCASFLAGS=$(AM_CCASFLAGS)
>> lib_a_CFLAGS=$(AM_CFLAGS)
>> Index: newlib/libc/machine/m68k/Makefile.in
>> ===================================================================
>> RCS file: /cvs/src/src/newlib/libc/machine/m68k/Makefile.in,v
>> retrieving revision 1.13
>> diff -u -d -p -r1.13 Makefile.in
>> --- newlib/libc/machine/m68k/Makefile.in 27 Apr 2007 22:10:47
>> -0000 1.13
>> +++ newlib/libc/machine/m68k/Makefile.in 30 Apr 2007 19:59:36 -0000
>> @@ -56,7 +56,8 @@ ARFLAGS = cru
>> lib_a_AR = $(AR) $(ARFLAGS)
>> lib_a_LIBADD =
>> am_lib_a_OBJECTS = lib_a-setjmp.$(OBJEXT) lib_a-strcpy.$(OBJEXT) \
>> - lib_a-strlen.$(OBJEXT)
>> + lib_a-strlen.$(OBJEXT) lib_a-memcpy.$(OBJEXT) \
>> + lib_a-memset.$(OBJEXT)
>> lib_a_OBJECTS = $(am_lib_a_OBJECTS)
>> DEFAULT_INCLUDES = -I. -I$(srcdir)
>> depcomp =
>> @@ -181,7 +182,7 @@ AUTOMAKE_OPTIONS = cygnus
>> INCLUDES = $(NEWLIB_CFLAGS) $(CROSS_CFLAGS) $(TARGET_CFLAGS)
>> AM_CCASFLAGS = $(INCLUDES)
>> noinst_LIBRARIES = lib.a
>> -lib_a_SOURCES = setjmp.S strcpy.c strlen.c
>> +lib_a_SOURCES = setjmp.S strcpy.c strlen.c memcpy.S memset.S
>> lib_a_CCASFLAGS = $(AM_CCASFLAGS)
>> lib_a_CFLAGS = $(AM_CFLAGS)
>> ACLOCAL_AMFLAGS = -I ../../..
>> @@ -249,6 +250,18 @@ lib_a-setjmp.o: setjmp.S
>> lib_a-setjmp.obj: setjmp.S
>> $(CCAS) $(lib_a_CCASFLAGS) $(CCASFLAGS) -c -o lib_a-setjmp.obj
>> `if test -f 'setjmp.S'; then $(CYGPATH_W) 'setjmp.S'; else $
>> (CYGPATH_W) '$(srcdir)/setjmp.S'; fi`
>> +lib_a-memcpy.o: memcpy.S
>> + $(CCAS) $(lib_a_CCASFLAGS) $(CCASFLAGS) -c -o lib_a-memcpy.o
>> `test -f 'memcpy.S' || echo '$(srcdir)/'`memcpy.S
>> +
>> +lib_a-memcpy.obj: memcpy.S
>> + $(CCAS) $(lib_a_CCASFLAGS) $(CCASFLAGS) -c -o lib_a-memcpy.obj
>> `if test -f 'memcpy.S'; then $(CYGPATH_W) 'memcpy.S'; else $
>> (CYGPATH_W) '$(srcdir)/memcpy.S'; fi`
>> +
>> +lib_a-memset.o: memset.S
>> + $(CCAS) $(lib_a_CCASFLAGS) $(CCASFLAGS) -c -o lib_a-memset.o
>> `test -f 'memset.S' || echo '$(srcdir)/'`memset.S
>> +
>> +lib_a-memset.obj: memset.S
>> + $(CCAS) $(lib_a_CCASFLAGS) $(CCASFLAGS) -c -o lib_a-memset.obj
>> `if test -f 'memset.S'; then $(CYGPATH_W) 'memset.S'; else $
>> (CYGPATH_W) '$(srcdir)/memset.S'; fi`
>> +
>> .c.o:
>> $(COMPILE) -c $<
>> Index: newlib/libc/machine/m68k/memcpy.S
>> ===================================================================
>> RCS file: /cvs/src/src/newlib/libc/machine/m68k/memcpy.S,v
>> retrieving revision 1.1
>> diff -u -d -p -r1.1 memcpy.S
>> --- newlib/libc/machine/m68k/memcpy.S 27 Apr 2007 16:05:35 -0000 1.1
>> +++ newlib/libc/machine/m68k/memcpy.S 30 Apr 2007 19:59:36 -0000
>> @@ -73,10 +73,11 @@ memcpy:
>> .Lcopy:
>> #if !defined (__mcoldfire__)
>> dbra %d0,1b
>> + sub.l #0x10000,%d0
>> #else
>> subq.l #1,%d0
>> - bpl 1b
>> #endif
>> + bpl 1b
>> bra .Lresidue
>> 1:
>> Index: newlib/libc/machine/m68k/memset.S
>> ===================================================================
>> RCS file: /cvs/src/src/newlib/libc/machine/m68k/memset.S,v
>> retrieving revision 1.1
>> diff -u -d -p -r1.1 memset.S
>> --- newlib/libc/machine/m68k/memset.S 27 Apr 2007 16:05:35 -0000 1.1
>> +++ newlib/libc/machine/m68k/memset.S 30 Apr 2007 19:59:36 -0000
>> @@ -76,10 +76,11 @@ memset:
>> .Llset:
>> #if !defined (__mcoldfire__)
>> dbra %d2,1b | loop until done
>> + sub.l #0x10000,%d2
>> #else
>> subq.l #1,%d2
>> - bpl 1b
>> #endif
>> + bpl 1b
>> and.l #3,%d1 | residue byte transfers, fixed
>> move.l (%sp)+,%d2 | restore d2
>> bra .Lbset
>>
>
> /*
> * C library memcpy routine
> *
> * This routine has code to optimize performance on the CPU32+
> * and another version for other 68k machines.
> *
> * It could be optimized more for machines with MOVE16 instructions.
> *
> * The routine is placed in this source directory to ensure that it
> * is picked up by all applications.
> *
> * W. Eric Norum
> * Saskatchewan Accelerator Laboratory
> * University of Saskatchewan
> * Saskatoon, Saskatchewan, CANADA
> * eric@skatter.usask.ca
> */
>
> #include <string.h>
> #include <rtems/score/m68k.h>
>
> #if defined(__mcpu32__)
> #define COPYSETUP(n) n--
> #define COPY(to,from,n,size) \
> asm volatile ("1:\n" \
> "\tmove." size " (%0)+,(%1)+\n" \
> "\tdbf %2,1b\n" \
> "\tsub.l #0x10000,%2\n" \
> "\tbpl.b 1b\n" : \
> "=a" (from), "=a" (to), "=d" (n) :\
> "0" (from), "1" (to), "2" (n) : \
> "cc", "memory")
> #else
> #define COPYSETUP(n)
> #define COPY(to,from,n,size) \
> asm volatile ("1:\n" \
> "\tmove." size " (%0)+,(%1)+\n" \
> "\tsubq.l #1,%2\n\tbne.b 1b\n" : \
> "=a" (from), "=a" (to), "=d" (n) :\
> "0" (from), "1" (to), "2" (n) : \
> "cc", "memory")
> #endif
>
> /* gcc doesn't know that cpu32+ is better than cpu32 :( */
> #if defined(__mcpu32p__)
> #undef M68k_HAS_MISALIGNED
> #define M68k_HAS_MISALIGNED 1
> #endif
>
> void *
> memcpy(void *s1, const void *s2, size_t n)
> {
> char *p1 = s1;
> const char *p2 = s2;
>
> if (n) {
> if (n < 16) {
> COPYSETUP (n);
> COPY (p1, p2, n, "b");
> }
> else {
> int nbyte;
> int nl;
> nbyte = (int)p1 & 0x3;
> if (nbyte) {
> nbyte = 4 - nbyte;
> n -= nbyte;
> COPYSETUP (nbyte);
> COPY (p1, p2, nbyte, "b");
> }
> #if (M68K_HAS_MISALIGNED == 0)
> /*
> * Take care of machines that can't
> * handle misaligned references.
> */
> if ((int)p2 & 0x1) {
> COPYSETUP (n);
> COPY (p1, p2, n, "b");
> return s1;
> }
> #endif
> nl = (unsigned int)n >> 2;
> COPYSETUP (nl);
> COPY (p1, p2, nl, "l");
> nbyte = (int)n & 0x3;
> if (nbyte) {
> COPYSETUP (nbyte);
> COPY (p1, p2, nbyte, "b");
> }
> }
> }
> return s1;
> }
--
Eric Norum <norume@aps.anl.gov>
Advanced Photon Source
Argonne National Laboratory
(630) 252-4793
More information about the Newlib
mailing list