This is the mail archive of the
libc-alpha@sources.redhat.com
mailing list for the glibc project.
[PATCH] memset.S and PowerPC
- From: Steven Munroe <sjmunroe at vnet dot ibm dot com>
- To: libc-alpha at sources dot redhat dot com
- Cc: aj at suse dot de, drepper at redhat dot com, geoffk at redhat dot com
- Date: Tue, 9 Jul 2002 17:03:32 -0500
- Subject: [PATCH] memset.S and PowerPC
On Tue, 2002-06-18 at 12:34, Ulrich Drepper wrote:
> I have not the slightest interest in adding more hacks. The code works
> fine for the platforms so far supported. ...
Actually the current code (memset.S) fails on any PowerPC system with a cache-line size other than 32 bytes. Since all 64-bit PowerPC hardware implementations have 128-byte cache lines, this has been an issue for some time.
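For readers not familiar with the failure mode: the zero path in memset.S uses the dcbz instruction, and dcbz zeroes the entire hardware cache line containing its target address, whatever the line size is on the running processor. A rough C model (illustrative only, not glibc code):

#include <stdint.h>
#include <string.h>

/* Model of dcbz: zero the whole cache line containing p.  */
static void
model_dcbz (char *p, unsigned long line_size)
{
  char *line = (char *) ((uintptr_t) p & ~(uintptr_t) (line_size - 1));
  memset (line, 0, line_size);
}

A loop written for 32-byte lines advances 32 bytes per dcbz; run on a processor with 128-byte lines, each dcbz clears 128 bytes, so the loop stores well past the caller's buffer.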
> ... If the final solution
> requires using the auxiliary vector values and therefore a memory load,
> so be it. But no going back to the generic code and no disabling the
> benefits for supported implementations.
OK, the attached patch adds support to capture AT_DCACHEBSIZE from the aux vector during libc (and ld.so) init and store it in the static "int __cache_line_size". __cache_line_size is used by sysdeps/powerpc/memset.S to determine the actual cache-line size. The original implementation (hardcoded for 32-byte cache lines) is left intact (logic and cache alignment) and is used when the cache-line size matches. Otherwise, cache-line-size-independent code is used. If the __cache_line_size value is 0 (for example, with an old kernel that doesn't supply AT_DCACHEBSIZE), a slower but safe sequence is used.
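For anyone wanting to check what their kernel reports: current glibc (2.16 and later) exposes the aux vector to user code through getauxval, so the value the patch captures at startup can be printed directly. This is just for illustration and is not part of the patch:

#include <stdio.h>
#include <sys/auxv.h>

int
main (void)
{
  /* AT_DCACHEBSIZE is the D-cache block size reported by the kernel;
     getauxval returns 0 if the kernel did not supply the entry.  */
  unsigned long dcbs = getauxval (AT_DCACHEBSIZE);
  printf ("AT_DCACHEBSIZE = %lu\n", dcbs);
  return 0;
}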
2002-07-08 Steven Munroe <sjmunroe@us.ibm.com>
* sysdeps/powerpc/elf/libc-start.c: Scan aux vector for
AT_DCACHEBSIZE and copy value to __cache_line_size.
* sysdeps/unix/sysv/linux/powerpc/dl-sysdep.c: Scan aux vector for
AT_DCACHEBSIZE and copy value to __cache_line_size.
* sysdeps/powerpc/memset.S: Define __cache_line_size and use its
value to select the correct stride for dcbz.
diff -rc2P -x manual glibc-2.2.5/sysdeps/powerpc/elf/libc-start.c glibc-225memset/sysdeps/powerpc/elf/libc-start.c
*** glibc-2.2.5/sysdeps/powerpc/elf/libc-start.c Thu Jul 5 23:56:01 2001
--- glibc-225memset/sysdeps/powerpc/elf/libc-start.c Tue Jul 9 08:40:18 2002
***************
*** 27,30 ****
--- 27,34 ----
extern int _dl_starting_up;
weak_extern (_dl_starting_up)
+
+ extern int __cache_line_size;
+ weak_extern (__cache_line_size)
+
extern int __libc_multiple_libcs;
extern void *__libc_stack_end;
***************
*** 38,41 ****
--- 42,63 ----
};
+
+ static inline void
+ __aux_init_cache (ElfW(auxv_t) *av)
+ {
+ for (; av->a_type != AT_NULL; ++av)
+ switch (av->a_type)
+ {
+ case AT_DCACHEBSIZE:
+ {
+ int *cls = & __cache_line_size;
+ if (cls != NULL)
+ *cls = av->a_un.a_val;
+ }
+ break;
+ }
+ }
+
+
int
/* GKM FIXME: GCC: this should get __BP_ prefix by virtue of the
***************
*** 75,80 ****
auxvec = ubp_ev;
while (*(char *__unbounded *__unbounded) auxvec != NULL)
! ++auxvec;
! ++auxvec;
#ifndef SHARED
_dl_aux_init ((ElfW(auxv_t) *) auxvec);
--- 97,103 ----
auxvec = ubp_ev;
while (*(char *__unbounded *__unbounded) auxvec != NULL)
! ++(char**)auxvec;
! ++(char**)auxvec;
!
#ifndef SHARED
_dl_aux_init ((ElfW(auxv_t) *) auxvec);
***************
*** 83,87 ****
}
! INIT_ARGV_and_ENVIRON;
/* Store something that has some relationship to the end of the
--- 106,112 ----
}
! INIT_ARGV_and_ENVIRON;
! /* Set up cache line size etc. from the aux vector. */
! __aux_init_cache((ElfW(auxv_t) *) auxvec);
/* Store something that has some relationship to the end of the
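A note on the pointer dance in __aux_init_cache above: __cache_line_size is declared weak_extern, so if no object in the link defines it the symbol's address is NULL, and the store must be guarded. A sketch of the same pattern (the name my_weak_var is hypothetical, not from the patch):

/* A weak reference: &my_weak_var is NULL when nothing defines it.  */
extern int my_weak_var __attribute__ ((weak));

static void
set_if_defined (int value)
{
  int *p = &my_weak_var;	/* NULL when the symbol is undefined */
  if (p != NULL)
    *p = value;
}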
diff -rc2P -x manual glibc-2.2.5/sysdeps/powerpc/memset.S glibc-225memset/sysdeps/powerpc/memset.S
*** glibc-2.2.5/sysdeps/powerpc/memset.S Thu Jul 5 23:56:01 2001
--- glibc-225memset/sysdeps/powerpc/memset.S Thu Jun 27 14:26:13 2002
***************
*** 22,31 ****
#include <bp-asm.h>
/* __ptr_t [r3] memset (__ptr_t s [r3], int c [r4], size_t n [r5]));
Returns 's'.
! The memset is done in three sizes: byte (8 bits), word (32 bits),
! cache line (256 bits). There is a special case for setting cache lines
! to 0, to take advantage of the dcbz instruction. */
EALIGN (BP_SYM (memset), 5, 1)
--- 22,44 ----
#include <bp-asm.h>
+ /* Define a global static that can hold the cache line size.  The
+    assumption is that startup code will access the aux vector and
+    stuff the cache line value into this variable.  */
+
+ .globl __cache_line_size
+ .section ".sdata","aw"
+ .align 2
+ .type __cache_line_size,@object
+ .size __cache_line_size,4
+ __cache_line_size:
+ .long 0
+ .section ".text"
/* __ptr_t [r3] memset (__ptr_t s [r3], int c [r4], size_t n [r5]));
Returns 's'.
! The memset is done in four sizes: byte (8 bits), word (32 bits),
! 32-byte blocks (256 bits) and cache line (128, 256, 1024 bits).
! There is a special case for setting cache lines to 0, to take
! advantage of the dcbz instruction. */
EALIGN (BP_SYM (memset), 5, 1)
***************
*** 51,54 ****
--- 64,71 ----
#define rNEG32 r9 /* constant -32 for clearing with dcbz */
+ #define rGOT r9 /* address of the Global Offset Table */
+ #define rCLS r8 /* cache line size obtained from static */
+ #define rCLM r9 /* cache line mask to check for cache alignment */
+
#if __BOUNDED_POINTERS__
cmplwi cr1, rRTN, 0
***************
*** 106,110 ****
clrrwi. rALIGN, rLEN, 5
mtcrf 0x01, rLEN /* 40th instruction from .align */
! beq cr1, L(zloopstart) /* special case for clearing memory using dcbz */
srwi rTMP, rALIGN, 5
mtctr rTMP
--- 123,137 ----
clrrwi. rALIGN, rLEN, 5
mtcrf 0x01, rLEN /* 40th instruction from .align */
!
! /* Check if we can use the special case for clearing memory using dcbz.
! This requires that we know the correct cache line size for this
! processor.  Getting __cache_line_size requires establishing GOT
! addressability, so branch out of line to set this up. */
! beq cr1, L(checklinesize)
!
! /* Store blocks of 32 bytes (256 bits) starting on a 32-byte boundary.
! Can't assume that rCHR is zero or that the cache line size is
! either 32 bytes or even known. */
! L(nondcbz):
srwi rTMP, rALIGN, 5
mtctr rTMP
***************
*** 115,119 ****
bdz L(cloopdone) /* 48th instruction from .align */
! L(c3): dcbz rNEG64, rMEMP
stw rCHR, -4(rMEMP)
stw rCHR, -8(rMEMP)
--- 142,148 ----
bdz L(cloopdone) /* 48th instruction from .align */
! /* Can't use dcbz here since we do not know the cache line size.  Use
! data cache block touch, which is safe. */
! L(c3): dcbt rNEG64, rMEMP
stw rCHR, -4(rMEMP)
stw rCHR, -8(rMEMP)
***************
*** 143,147 ****
.align 5
nop
! /* Clear lines of memory in 128-byte chunks. */
L(zloopstart):
clrlwi rLEN, rLEN, 27
--- 172,181 ----
.align 5
nop
! /* Clear lines of memory in 128-byte chunks.
! This code is optimized for processors with 32-byte
! cache lines.  It is further optimized for the 601
! processor, so preserving alignment in the i-cache
! is important.
! */
L(zloopstart):
clrlwi rLEN, rLEN, 27
***************
*** 227,229 ****
--- 261,326 ----
stw rCHR, -8(rMEMP)
blr
+
+ L(checklinesize):
+ mflr rTMP
+ /* If the remaining length is less than 32 bytes, don't bother
+ getting the cache line size. */
+ beq L(medium)
+
+ /* Establish GOT addressability so we can load __cache_line_size
+ from static storage.  This value was set from the aux vector during startup. */
+ bl _GLOBAL_OFFSET_TABLE_@local-4
+ mflr rGOT
+ lwz rGOT,__cache_line_size@got(rGOT)
+ lwz rCLS,0(rGOT)
+ mtlr rTMP
+
+ /* If the cache line size was not set, just go to L(nondcbz), which
+ is safe for any cache line size. */
+ cmplwi cr1,rCLS,0
+ beq cr1,L(nondcbz)
+
+ /* If the cache line size is 32 bytes, just go to L(zloopstart),
+ which is coded specifically for 32-byte lines (and the 601). */
+ cmplwi cr1,rCLS,32
+ beq cr1,L(zloopstart)
+
+ /* Now we know the cache line size, and it is not 32 bytes, but
+ we may not yet be aligned to the cache line.  We may have a
+ partial line to fill, so touch it first. */
+ dcbt 0,rMEMP
+ addi rCLM,rCLS,-1
+ L(getCacheAligned):
+ cmplwi cr1,rLEN,32
+ and. rTMP,rCLM,rMEMP
+ blt cr1,L(handletail32)
+ beq L(cacheAligned)
+ addi rMEMP,rMEMP,32
+ addi rLEN,rLEN,-32
+ stw rCHR,-32(rMEMP)
+ stw rCHR,-28(rMEMP)
+ stw rCHR,-24(rMEMP)
+ stw rCHR,-20(rMEMP)
+ stw rCHR,-16(rMEMP)
+ stw rCHR,-12(rMEMP)
+ stw rCHR,-8(rMEMP)
+ stw rCHR,-4(rMEMP)
+ b L(getCacheAligned)
+
+ /* Now we are aligned to the cache line and can use dcbz. */
+ L(cacheAligned):
+ cmplw cr1,rLEN,rCLS
+ blt cr1,L(handletail32)
+ dcbz 0,rMEMP
+ subf rLEN,rCLS,rLEN
+ add rMEMP,rMEMP,rCLS
+ b L(cacheAligned)
+
+ /* We are here because the cache line size was set and was not 32 bytes,
+ and the remainder (rLEN) is less than the actual cache line size.
+ So set up the preconditions for L(nondcbz) and go there. */
+ L(handletail32):
+ clrrwi. rALIGN, rLEN, 5
+ b L(nondcbz)
+
END (BP_SYM (memset))
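In C terms, the control flow this adds to memset's zero path is roughly the following. This is a model for illustration only; the label names mirror the assembly above, model_dcbz stands in for the dcbz instruction, and the 32-byte store paths are modeled with memset:

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* dcbz clears the whole cache line containing p.  */
static void
model_dcbz (char *p, size_t line_size)
{
  memset ((char *) ((uintptr_t) p & ~(uintptr_t) (line_size - 1)),
	  0, line_size);
}

/* L(nondcbz): plain 32-byte stores, safe for any line size.  */
static void
nondcbz (char *p, size_t len)
{
  memset (p, 0, len);
}

static void
zero_dispatch (char *p, size_t len, size_t cls /* __cache_line_size */)
{
  if (cls == 0)		/* kernel gave no AT_DCACHEBSIZE */
    {
      nondcbz (p, len);
      return;
    }
  if (cls == 32)	/* L(zloopstart): 32-byte/601-tuned dcbz loop */
    {
      nondcbz (p, len);	/* modeled here as plain stores */
      return;
    }
  /* L(getCacheAligned): 32-byte stores until p reaches a cache-line
     boundary or fewer than 32 bytes remain.  */
  while (len >= 32 && ((uintptr_t) p & (cls - 1)) != 0)
    {
      memset (p, 0, 32);
      p += 32;
      len -= 32;
    }
  /* L(cacheAligned): one dcbz per full cache line.  */
  while (len >= cls)
    {
      model_dcbz (p, cls);
      p += cls;
      len -= cls;
    }
  /* L(handletail32): sub-line remainder via the safe path.  */
  nondcbz (p, len);
}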
diff -rc2P -x manual glibc-2.2.5/sysdeps/unix/sysv/linux/powerpc/dl-sysdep.c glibc-225memset/sysdeps/unix/sysv/linux/powerpc/dl-sysdep.c
*** glibc-2.2.5/sysdeps/unix/sysv/linux/powerpc/dl-sysdep.c Thu Jul 5 23:56:19 2001
--- glibc-225memset/sysdeps/unix/sysv/linux/powerpc/dl-sysdep.c Tue Jul 9 08:37:30 2002
***************
*** 21,24 ****
--- 21,47 ----
#include "config.h"
#include "kernel-features.h"
+ #include <ldsodefs.h>
+
+ extern int __cache_line_size;
+ weak_extern (__cache_line_size)
+
+ #define DL_PLATFORM_INIT __aux_init_cache(_dl_auxv)
+
+ /* Set __cache_line_size to the D-cache line size from the aux vector. */
+ static inline void
+ __aux_init_cache (ElfW(auxv_t) *av)
+ {
+ for (; av->a_type != AT_NULL; ++av)
+ switch (av->a_type)
+ {
+ case AT_DCACHEBSIZE:
+ {
+ int *cls = & __cache_line_size;
+ if (cls != NULL)
+ *cls = av->a_un.a_val;
+ }
+ break;
+ }
+ }
#ifndef __ASSUME_STD_AUXV