This is the mail archive of the
libc-alpha@sources.redhat.com
mailing list for the glibc project.
[PATCH] memset.S for PowerPC
- From: Steven Munroe <sjmunroe at vnet dot ibm dot com>
- To: libc-alpha at sources dot redhat dot com
- Cc: geoffk at geoffk dot org
- Date: Wed, 21 Aug 2002 16:16:36 -0500
- Subject: [PATCH] memset.S for PowerPC
Here is the revised memset patch for PowerPC (32-bit). If this meets
your approval I'll start revising my PowerPC64 patches into this form.
To test this patch I devised a simple performance test that took parts of
./string/tester.c and added my own in-cache and cache-rollover performance
tests and timers.
The before test uses libc.so from the SuSE SLES 7.0 distribution, which
appears to use ./sysdeps/generic/memset.c, presumably so that it will work
correctly on all PowerPC platforms. The results are:
start alignment test
unaligned memset 796.875 MB per sec
4kb buffer @0x40029000
page aligned bzero 1448.373 MB per sec
page aligned memset 1469.154 MB per sec
16mb buffer @0x41029000
page aligned bzero 278.564 MB per sec
page aligned memset 279.552 MB per sec
The after (this patch) test results are:
start alignment test
unaligned memset 796.875 MB per sec
4kb buffer @0x40002000
page aligned bzero 2221.258 MB per sec
page aligned memset 1523.810 MB per sec
16mb buffer @0x41002000
page aligned bzero 646.873 MB per sec
page aligned memset 353.469 MB per sec
Both tests were run on an IBM 7044-170, POWER3(630+) 400MHz, with 512MB RAM.
This is a 64-bit system running the 64-bit 2.4.19-rc3 Linux kernel.
2002-08-20 Steven Munroe <sjmunroe@us.ibm.com>
* sysdeps/powerpc/elf/libc-start.c : Scan Aux Vector for
AT_DCACHEBSIZE and copy value to __cache_line_size.
* sysdeps/unix/sysv/linux/powerpc/dl-sysdep.c : Scan Aux Vector for
AT_DCACHEBSIZE and copy value to __cache_line_size.
* sysdeps/powerpc/memset.S : Define __cache_line_size and use its
value to select the correct stride for dcbz.
>>>>>>>>
diff -rc2P glibc-2.2.5/sysdeps/powerpc/elf/libc-start.c glibc-2.2.5-memset/sysdeps/powerpc/elf/libc-start.c
*** glibc-2.2.5/sysdeps/powerpc/elf/libc-start.c Thu Jul 5 23:56:01 2001
--- glibc-2.2.5-memset/sysdeps/powerpc/elf/libc-start.c Wed Aug 21 13:31:27 2002
***************
*** 27,30 ****
--- 27,34 ----
extern int _dl_starting_up;
weak_extern (_dl_starting_up)
+
+ extern int __cache_line_size;
+ weak_extern (__cache_line_size)
+
extern int __libc_multiple_libcs;
extern void *__libc_stack_end;
***************
*** 38,41 ****
--- 42,66 ----
};
+ /* Scan the Aux Vector for the "Data Cache Block Size" entry. If found
+ verify that the static extern __cache_line_size is defined by checking
+ for not NULL. If it is defined then assign the cache block size
+ value to __cache_line_size. */
+ static inline void
+ __aux_init_cache (ElfW(auxv_t) *av)
+ {
+ for (; av->a_type != AT_NULL; ++av)
+ switch (av->a_type)
+ {
+ case AT_DCACHEBSIZE:
+ {
+ int *cls = & __cache_line_size;
+ if (cls != NULL)
+ *cls = av->a_un.a_val;
+ }
+ break;
+ }
+ }
+
+
int
/* GKM FIXME: GCC: this should get __BP_ prefix by virtue of the
***************
*** 43,47 ****
BP_SYM (__libc_start_main) (int argc, char *__unbounded *__unbounded ubp_av,
char *__unbounded *__unbounded ubp_ev,
! void *__unbounded auxvec, void (*rtld_fini) (void),
struct startup_info *__unbounded stinfo,
char *__unbounded *__unbounded stack_on_entry)
--- 68,72 ----
BP_SYM (__libc_start_main) (int argc, char *__unbounded *__unbounded ubp_av,
char *__unbounded *__unbounded ubp_ev,
! ElfW(auxv_t) *__unbounded auxvec, void (*rtld_fini) (void),
struct startup_info *__unbounded stinfo,
char *__unbounded *__unbounded stack_on_entry)
***************
*** 67,70 ****
--- 92,96 ----
if (*stack_on_entry != NULL)
{
+ char *__unbounded *__unbounded temp;
/* ...in which case, we have argc as the top thing on the
stack, followed by argv (NULL-terminated), envp (likewise),
***************
*** 73,80 ****
ubp_av = stack_on_entry + 1;
ubp_ev = ubp_av + argc + 1;
! auxvec = ubp_ev;
! while (*(char *__unbounded *__unbounded) auxvec != NULL)
! ++auxvec;
! ++auxvec;
#ifndef SHARED
_dl_aux_init ((ElfW(auxv_t) *) auxvec);
--- 99,108 ----
ubp_av = stack_on_entry + 1;
ubp_ev = ubp_av + argc + 1;
! temp = ubp_ev;
! while (*temp != NULL)
! ++temp;
! auxvec = (ElfW(auxv_t) *)++temp;
!
!
#ifndef SHARED
_dl_aux_init ((ElfW(auxv_t) *) auxvec);
***************
*** 84,87 ****
--- 112,118 ----
INIT_ARGV_and_ENVIRON;
+
+ /* Initialize the __cache_line_size variable from the aux vector. */
+ __aux_init_cache((ElfW(auxv_t) *) auxvec);
/* Store something that has some relationship to the end of the
diff -rc2P glibc-2.2.5/sysdeps/powerpc/memset.S glibc-2.2.5-memset/sysdeps/powerpc/memset.S
*** glibc-2.2.5/sysdeps/powerpc/memset.S Thu Jul 5 23:56:01 2001
--- glibc-2.2.5-memset/sysdeps/powerpc/memset.S Wed Aug 21 12:19:04 2002
***************
*** 22,31 ****
#include <bp-asm.h>
/* __ptr_t [r3] memset (__ptr_t s [r3], int c [r4], size_t n [r5]));
Returns 's'.
! The memset is done in three sizes: byte (8 bits), word (32 bits),
! cache line (256 bits). There is a special case for setting cache lines
! to 0, to take advantage of the dcbz instruction. */
EALIGN (BP_SYM (memset), 5, 1)
--- 22,45 ----
#include <bp-asm.h>
+ /* Define a global static that can hold the cache line size. The
+ assumption is that startup code will access the "aux vector" to
+ obtain the value set by the kernel and store it into this
+ variable. */
+
+ .globl __cache_line_size
+ .section ".data","aw"
+ .align 2
+ .type __cache_line_size,@object
+ .size __cache_line_size,4
+ __cache_line_size:
+ .long 0
+ .section ".text"
/* __ptr_t [r3] memset (__ptr_t s [r3], int c [r4], size_t n [r5]));
Returns 's'.
! The memset is done in four sizes: byte (8 bits), word (32 bits),
! 32-byte blocks (256 bits) and __cache_line_size (128, 256, 1024 bits).
! There is a special case for setting whole cache lines to 0, which
! takes advantage of the dcbz instruction. */
EALIGN (BP_SYM (memset), 5, 1)
***************
*** 51,54 ****
--- 65,72 ----
#define rNEG32 r9 /* constant -32 for clearing with dcbz */
+ #define rGOT r9 /* Address of the Global Offset Table. */
+ #define rCLS r8 /* Cache line size obtained from static. */
+ #define rCLM r9 /* Cache line size mask to check for cache alignment. */
+
#if __BOUNDED_POINTERS__
cmplwi cr1, rRTN, 0
***************
*** 106,110 ****
clrrwi. rALIGN, rLEN, 5
mtcrf 0x01, rLEN /* 40th instruction from .align */
! beq cr1, L(zloopstart) /* special case for clearing memory using dcbz */
srwi rTMP, rALIGN, 5
mtctr rTMP
--- 124,138 ----
clrrwi. rALIGN, rLEN, 5
mtcrf 0x01, rLEN /* 40th instruction from .align */
!
! /* Check if we can use the special case for clearing memory using dcbz.
! This requires that we know the correct cache line size for this
! processor. Getting the __cache_line_size may require establishing GOT
! addressability, so branch out of line to set this up. */
! beq cr1, L(checklinesize)
!
! /* Store blocks of 32-bytes (256-bits) starting on a 32-byte boundary.
! Can't assume that rCHR is zero or that the cache line size is either
! 32-bytes or even known. */
! L(nondcbz):
srwi rTMP, rALIGN, 5
mtctr rTMP
***************
*** 115,119 ****
bdz L(cloopdone) /* 48th instruction from .align */
! L(c3): dcbz rNEG64, rMEMP
stw rCHR, -4(rMEMP)
stw rCHR, -8(rMEMP)
--- 143,149 ----
bdz L(cloopdone) /* 48th instruction from .align */
! /* We can't use dcbz here as we don't know the cache line size. We can
! use "data cache block touch for store", which is safe. */
! L(c3): dcbtst rNEG64, rMEMP
stw rCHR, -4(rMEMP)
stw rCHR, -8(rMEMP)
***************
*** 143,147 ****
.align 5
nop
! /* Clear lines of memory in 128-byte chunks. */
L(zloopstart):
clrlwi rLEN, rLEN, 27
--- 173,180 ----
.align 5
nop
! /* Clear cache lines of memory in 128-byte chunks.
! This code is optimized for processors with 32-byte cache lines.
! It is further optimized for the 601 processor, which requires
! some care in how the code is aligned in the i-cache. */
L(zloopstart):
clrlwi rLEN, rLEN, 27
***************
*** 227,229 ****
--- 260,338 ----
stw rCHR, -8(rMEMP)
blr
+
+ L(checklinesize):
+ #ifdef SHARED
+ mflr rTMP
+ /* If the remaining length is less than 32 bytes then don't bother getting
+ the cache line size. */
+ beq L(medium)
+ /* Establishes GOT addressability so we can load __cache_line_size
+ from static. This value was set from the aux vector during startup. */
+ bl _GLOBAL_OFFSET_TABLE_@local-4
+ mflr rGOT
+ lwz rGOT,__cache_line_size@got(rGOT)
+ lwz rCLS,0(rGOT)
+ mtlr rTMP
+ #else
+ /* Load __cache_line_size from static. This value was set from the
+ aux vector during startup. */
+ lis rCLS,__cache_line_size@ha
+ /* If the remaining length is less than 32 bytes then don't bother getting
+ the cache line size. */
+ beq L(medium)
+ lwz rCLS,__cache_line_size@l(rCLS)
+ #endif
+
+ /* If the cache line size was not set then go to L(nondcbz), which is
+ safe for any cache line size. */
+ cmplwi cr1,rCLS,0
+ beq cr1,L(nondcbz)
+
+ /* If the cache line size is 32 bytes then go to L(zloopstart),
+ which is coded specifically for 32-byte lines (and 601). */
+ cmplwi cr1,rCLS,32
+ beq cr1,L(zloopstart)
+
+ /* Now we know the cache line size and it is not 32-bytes. However
+ we may not yet be aligned to the cache line and may have a partial
+ line to fill. Touch it 1st to fetch the cache line. */
+ dcbtst 0,rMEMP
+
+ addi rCLM,rCLS,-1
+ L(getCacheAligned):
+ cmplwi cr1,rLEN,32
+ and. rTMP,rCLM,rMEMP
+ blt cr1,L(handletail32)
+ beq L(cacheAligned)
+ /* We are not aligned to start of a cache line yet. Store 32-byte
+ of data and test again. */
+ addi rMEMP,rMEMP,32
+ addi rLEN,rLEN,-32
+ stw rCHR,-32(rMEMP)
+ stw rCHR,-28(rMEMP)
+ stw rCHR,-24(rMEMP)
+ stw rCHR,-20(rMEMP)
+ stw rCHR,-16(rMEMP)
+ stw rCHR,-12(rMEMP)
+ stw rCHR,-8(rMEMP)
+ stw rCHR,-4(rMEMP)
+ b L(getCacheAligned)
+
+ /* Now we are aligned to the cache line and can use dcbz. */
+ L(cacheAligned):
+ cmplw cr1,rLEN,rCLS
+ blt cr1,L(handletail32)
+ dcbz 0,rMEMP
+ subf rLEN,rCLS,rLEN
+ add rMEMP,rMEMP,rCLS
+ b L(cacheAligned)
+
+ /* We are here because: the cache line size was set, it was not
+ 32-bytes, and the remainder (rLEN) is now less than the actual cache
+ line size. Set up the preconditions for L(nondcbz) and go there to
+ store the remaining bytes. */
+ L(handletail32):
+ clrrwi. rALIGN, rLEN, 5
+ b L(nondcbz)
+
END (BP_SYM (memset))
diff -rc2P glibc-2.2.5/sysdeps/unix/sysv/linux/powerpc/dl-sysdep.c glibc-2.2.5-memset/sysdeps/unix/sysv/linux/powerpc/dl-sysdep.c
*** glibc-2.2.5/sysdeps/unix/sysv/linux/powerpc/dl-sysdep.c Thu Jul 5 23:56:19 2001
--- glibc-2.2.5-memset/sysdeps/unix/sysv/linux/powerpc/dl-sysdep.c Wed Aug 21 13:02:54 2002
***************
*** 21,24 ****
--- 21,50 ----
#include "config.h"
#include "kernel-features.h"
+ #include <ldsodefs.h>
+
+ extern int __cache_line_size;
+ weak_extern (__cache_line_size)
+
+ #define DL_PLATFORM_INIT __aux_init_cache(_dl_auxv)
+
+ /* Scan the Aux Vector for the "Data Cache Block Size" entry. If found
+ verify that the static extern __cache_line_size is defined by checking
+ for not NULL. If it is defined then assign the cache block size
+ value to __cache_line_size. */
+ static inline void
+ __aux_init_cache (ElfW(auxv_t) *av)
+ {
+ for (; av->a_type != AT_NULL; ++av)
+ switch (av->a_type)
+ {
+ case AT_DCACHEBSIZE:
+ {
+ int *cls = & __cache_line_size;
+ if (cls != NULL)
+ *cls = av->a_un.a_val;
+ }
+ break;
+ }
+ }
#ifndef __ASSUME_STD_AUXV