Index: sysdeps/x86_64/bzero.S =================================================================== RCS file: /cvs/glibc/libc/sysdeps/x86_64/bzero.S,v retrieving revision 1.1 diff -p -u -w -p -u -w -r1.1 bzero.S --- sysdeps/x86_64/bzero.S 31 Aug 2002 17:30:07 -0000 1.1 +++ sysdeps/x86_64/bzero.S 17 Aug 2007 20:06:14 -0000 @@ -1,3 +1,6 @@ -#define memset __bzero -#include -weak_alias (__bzero, bzero) +#define USE_AS_BZERO +#define memset bzero + +#include "memset.S" + +weak_alias (bzero, __bzero) Index: sysdeps/x86_64/cacheinfo.c =================================================================== RCS file: /cvs/glibc/libc/sysdeps/x86_64/cacheinfo.c,v retrieving revision 1.3.2.1 diff -p -u -w -p -u -w -r1.3.2.1 cacheinfo.c --- sysdeps/x86_64/cacheinfo.c 12 Jul 2007 13:29:44 -0000 1.3.2.1 +++ sysdeps/x86_64/cacheinfo.c 17 Aug 2007 20:06:14 -0000 @@ -1,4 +1,6 @@ -/* x86_64 cache info. +/* + x86_64 cache info. + Copyright (C) 2003, 2004, 2006, 2007 Free Software Foundation, Inc. This file is part of the GNU C Library. @@ -187,7 +189,6 @@ intel_check_word (int name, unsigned int return 0; } - static long int __attribute__ ((noinline)) handle_intel (int name, unsigned int maxidx) { @@ -245,7 +246,6 @@ handle_intel (int name, unsigned int max return 0; } - static long int __attribute__ ((noinline)) handle_amd (int name) { @@ -257,7 +257,7 @@ handle_amd (int name) : "=a" (eax), "=b" (ebx), "=c" (ecx), "=d" (edx) : "0" (0x80000000)); - if (name >= _SC_LEVEL3_CACHE_SIZE) + if (name > _SC_LEVEL3_CACHE_LINESIZE) /* L3 size, associativity and line size are all decoded below; nothing beyond L3 is. */ return 0; unsigned int fn = 0x80000005 + (name >= _SC_LEVEL2_CACHE_SIZE); @@ -286,6 +286,7 @@ handle_amd (int name) return ecx & 0xff; case _SC_LEVEL1_DCACHE_LINESIZE: return ecx & 0xff; + case _SC_LEVEL2_CACHE_SIZE: return (ecx & 0xf000) == 0 ? 
0 : (ecx >> 6) & 0x3fffc00; case _SC_LEVEL2_CACHE_ASSOC: @@ -301,6 +302,16 @@ handle_amd (int name) return 8; case 8: return 16; + case 10: + return 32; + case 11: + return 48; + case 12: + return 64; + case 13: + return 96; + case 14: + return 128; case 0xf: return (ecx << 6) & 0x3fffc00; default: @@ -308,13 +319,46 @@ handle_amd (int name) } case _SC_LEVEL2_CACHE_LINESIZE: return (ecx & 0xf000) == 0 ? 0 : ecx & 0xff; + + case _SC_LEVEL3_CACHE_SIZE: + return (edx & 0xf000) == 0 ? 0 : (edx & 0x3ffc0000) << 1; + case _SC_LEVEL3_CACHE_ASSOC: + edx >>= 12; + switch (edx & 0xf) + { + case 0: + case 1: + case 2: + case 4: + return edx & 0xf; + case 6: + return 8; + case 8: + return 16; + case 10: + return 32; + case 11: + return 48; + case 12: + return 64; + case 13: + return 96; + case 14: + return 128; + case 0xf: + return (long int) (edx & 0xfffc0) << 13; /* edx was shifted right by 12 above, so the size field (512 KiB units) now sits in bits 19:6; widen before shifting to avoid 32-bit overflow. */ + default: + return 0; + } + case _SC_LEVEL3_CACHE_LINESIZE: + return (edx & 0xf000) == 0 ? 0 : edx & 0xff; + default: assert (! "cannot happen"); } return -1; } - /* Get the value of the system variable NAME. */ long int attribute_hidden @@ -343,19 +387,19 @@ __cache_sysconf (int name) return 0; } - -/* Half the core cache size for use in memory and string routines, typically - L1 size. */ -long int __x86_64_core_cache_size_half attribute_hidden = 32 * 1024 / 2; -/* Shared cache size for use in memory and string routines, typically - L2 or L3 size. */ -long int __x86_64_shared_cache_size_half attribute_hidden = 1024 * 1024 / 2; +/* Core data cache size for use in memory and string routines, typically L1 size. */ +long int __attribute__ ((visibility ("hidden"))) _x86_64_data_cache_size = 32 * 1024; +long int __attribute__ ((visibility ("hidden"))) _x86_64_data_cache_size_half = 32 * 1024 / 2; +/* Core unified cache size for use in memory and string routines, typically L2 size. 
*/ +long int __attribute__ ((visibility ("hidden"))) _x86_64_core_cache_size = 512 * 1024; +long int __attribute__ ((visibility ("hidden"))) _x86_64_core_cache_size_half = 512 * 1024 / 2; +/* Shared cache size for use in memory and string routines, typically L2 or L3 size. */ +long int __attribute__ ((visibility ("hidden"))) _x86_64_shared_cache_size = 2 * 1024 * 1024; +long int __attribute__ ((visibility ("hidden"))) _x86_64_shared_cache_size_half = 2 * 1024 * 1024 / 2; /* PREFETCHW support flag for use in memory and string routines. */ -int __x86_64_prefetchw attribute_hidden; - +int __attribute__ ((visibility ("hidden"))) _x86_64_prefetchw; -static void -__attribute__((constructor)) +static void __attribute__((constructor)) init_cacheinfo (void) { /* Find out what brand of processor. */ @@ -363,10 +407,8 @@ init_cacheinfo (void) unsigned int ebx; unsigned int ecx; unsigned int edx; - int max_cpuid; - int max_cpuid_ex; - long int core = -1; - long int shared = -1; + int max_cpuid, max_cpuid_ex; + long int data = -1, core = -1, shared = -1; unsigned int level; unsigned int threads = 0; @@ -377,21 +419,21 @@ init_cacheinfo (void) /* This spells out "GenuineIntel". */ if (ebx == 0x756e6547 && ecx == 0x6c65746e && edx == 0x49656e69) { - core = handle_intel (_SC_LEVEL1_DCACHE_SIZE, max_cpuid); + data = handle_intel (_SC_LEVEL1_DCACHE_SIZE, max_cpuid); /* Pass max_cpuid, the same value the retained max_cpuid >= 4 test uses. */ + core = handle_intel (_SC_LEVEL2_CACHE_SIZE, max_cpuid); + shared = handle_intel (_SC_LEVEL3_CACHE_SIZE, max_cpuid); - /* Try L3 first. */ + /* Assume L3 exists. */ level = 3; - shared = handle_intel (_SC_LEVEL3_CACHE_SIZE, max_cpuid); - if (shared <= 0) { - /* Try L2 otherwise. */ + /* Use L2 if no L3. */ level = 2; - shared = handle_intel (_SC_LEVEL2_CACHE_SIZE, max_cpuid); + shared = core; + core = 0; } - /* Figure out the number of logical threads that share the - highest cache level. */ + /* Figure out the number of logical threads that share the highest cache level. 
*/ if (max_cpuid >= 4) { int i = 0; @@ -417,21 +459,57 @@ init_cacheinfo (void) threads = (ebx >> 16) & 0xff; } - /* Cap usage of highest cache level to the number of supported - threads. */ - if (shared > 0 && threads > 0) + /* Cap usage of highest cache level to the number of supported threads. */ + if (threads > 0 && shared > 0) shared /= threads; } /* This spells out "AuthenticAMD". */ else if (ebx == 0x68747541 && ecx == 0x444d4163 && edx == 0x69746e65) { - core = handle_amd (_SC_LEVEL1_DCACHE_SIZE); - shared = handle_amd (_SC_LEVEL2_CACHE_SIZE); + data = handle_amd (_SC_LEVEL1_DCACHE_SIZE); + core = handle_amd (_SC_LEVEL2_CACHE_SIZE); + shared = handle_amd (_SC_LEVEL3_CACHE_SIZE); + /* Get maximum extended function. */ asm volatile ("cpuid" : "=a" (max_cpuid_ex), "=b" (ebx), "=c" (ecx), "=d" (edx) : "0" (0x80000000)); + if (shared <= 0) + shared = 0; + else + { + /* Figure out the number of logical threads that share L3. */ + if (max_cpuid_ex >= 0x80000008) + { + /* Get width of APIC ID. EAX goes to a scratch variable so max_cpuid_ex stays valid for the 0x80000001 test below. */ + asm volatile ("cpuid" + : "=a" (eax), "=b" (ebx), "=c" (ecx), "=d" (edx) + : "0" (0x80000008)); + + threads = 1 << ((ecx >> 12) & 0x0f); + } + + if (threads == 0) + { + /* If APIC ID width is not available, use logical processor count. */ + asm volatile ("cpuid" + : "=a" (eax), "=b" (ebx), "=c" (ecx), "=d" (edx) + : "0" (0x00000001)); + + /* HTT */ + if (edx & (1 << 28)) + threads = (ebx >> 16) & 0xff; + } + + /* Cap usage of highest cache level to the number of supported threads. */ + if (threads > 0) + shared /= threads; + + /* Account for exclusive L2 and L3 caches. */ + shared += core; + } + if (max_cpuid_ex >= 0x80000001) { asm volatile ("cpuid" @@ -439,13 +517,26 @@ init_cacheinfo (void) : "0" (0x80000001)); /* PREFETCHW || 3DNow! */ if ((ecx & 0x100) || (edx & 0x80000000)) - __x86_64_prefetchw = -1; + _x86_64_prefetchw = -1; + } } + + /* Override default values for each cache size discovered. 
*/ + if (data >= 0) + { + _x86_64_data_cache_size = data; + _x86_64_data_cache_size_half = data / 2; } - if (core > 0) - __x86_64_core_cache_size_half = core / 2; + if (core >= 0) + { + _x86_64_core_cache_size = core; + _x86_64_core_cache_size_half = core / 2; + } - if (shared > 0) - __x86_64_shared_cache_size_half = shared / 2; + if (shared >= 0) + { + _x86_64_shared_cache_size = shared; + _x86_64_shared_cache_size_half = shared / 2; + } } Index: sysdeps/x86_64/memcpy.S =================================================================== RCS file: /cvs/glibc/libc/sysdeps/x86_64/memcpy.S,v retrieving revision 1.5.6.1 diff -p -u -w -p -u -w -r1.5.6.1 memcpy.S --- sysdeps/x86_64/memcpy.S 12 Jul 2007 13:29:44 -0000 1.5.6.1 +++ sysdeps/x86_64/memcpy.S 17 Aug 2007 20:06:15 -0000 @@ -39,7 +39,7 @@ .text -#if defined PIC && !defined NOT_IN_libc +#if !defined USE_AS_MEMPCPY && defined PIC && !defined NOT_IN_libc ENTRY (__memcpy_chk) cmpq %rdx, %rcx @@ -146,7 +146,7 @@ L(1after): /* Align to the natural word size. */ L(aligntry): - movl %esi, %ecx /* align by destination */ + movl %esi, %ecx /* align by source */ andl $7, %ecx jz L(alignafter) /* already aligned */ @@ -172,7 +172,7 @@ L(alignloop): /* 1-byte alignment loo L(alignafter): -/* Loop to handle mid-sized blocks. */ +/* Handle mid-sized blocks. */ L(32try): /* up to 1KB */ cmpq $1024, %rdx @@ -245,11 +245,11 @@ L(32after): larger blocks are excluded when building for RTLD. */ -/* Handle large blocks smaller than 1/2 L1. */ +/* Handle blocks smaller than 1/2 L1. 
*/ L(fasttry): /* first 1/2 L1 */ #ifndef NOT_IN_libc /* only up to this algorithm outside of libc.so */ - movq __x86_64_core_cache_size_half (%rip), %r11 + movq _x86_64_data_cache_size_half (%rip), %r11 cmpq %rdx, %r11 /* calculate the smaller of */ cmovaq %rdx, %r11 /* remaining bytes and 1/2 L1 */ #endif @@ -289,16 +289,19 @@ L(fastskip): #endif retq /* exit */ -#ifndef NOT_IN_libc /* none of the algorithms below for RTLD */ - .p2align 4 L(fastafter): -/* Handle large blocks smaller than 1/2 L2. */ +#ifndef NOT_IN_libc /* none of the algorithms below for RTLD */ + +/* Handle blocks smaller than 1/2 L2 or 1/2 L3. */ -L(pretry): /* first 1/2 L2 */ - movq __x86_64_shared_cache_size_half (%rip), %r8 +L(pretry): /* first 1/2 L2 or 1/2 L3*/ + movq _x86_64_core_cache_size_half (%rip), %r8 + movq _x86_64_shared_cache_size_half (%rip), %rcx + cmpq %rcx, %r8 /* calculate the greater of */ + cmovbq %rcx, %r8 /* 1/2 L2 and 1/2 L3 */ cmpq %rdx, %r8 /* calculate the lesser of */ cmovaq %rdx, %r8 /* remaining bytes and 1/2 L2 */ @@ -317,7 +320,7 @@ L(pre): /* 64-byte with prefetching movq %rbx, SAVE3 (%rsp) cfi_rel_offset (%rbx, SAVE3) - cmpl $0, __x86_64_prefetchw (%rip) + cmpl $0, _x86_64_prefetchw (%rip) jz L(preloop) /* check if PREFETCHW OK */ .p2align 4 @@ -477,7 +480,7 @@ L(preskip): L(preafter): -/* Loop to handle huge blocks. */ +/* Handle huge blocks. */ L(NTtry): Index: sysdeps/x86_64/memset.S =================================================================== RCS file: /cvs/glibc/libc/sysdeps/x86_64/memset.S,v retrieving revision 1.5 diff -p -u -w -p -u -w -r1.5 memset.S --- sysdeps/x86_64/memset.S 31 Mar 2005 10:00:13 -0000 1.5 +++ sysdeps/x86_64/memset.S 17 Aug 2007 20:06:15 -0000 @@ -1,8 +1,10 @@ -/* memset/bzero -- set memory area to CH/0 - Optimized version for x86-64. - Copyright (C) 2002, 2003, 2004, 2005 Free Software Foundation, Inc. +/* + Optimized memset for x86-64. + + Copyright (C) 2007 Free Software Foundation, Inc. 
+ Contributed by Evandro Menezes , 2007. + This file is part of the GNU C Library. - Contributed by Andreas Jaeger . The GNU C Library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public @@ -17,130 +19,312 @@ You should have received a copy of the GNU Lesser General Public License along with the GNU C Library; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA - 02111-1307 USA. */ + 02111-1307 USA. +*/ #include #include "asm-syntax.h" -#include "bp-sym.h" -#include "bp-asm.h" - -/* BEWARE: `#ifdef memset' means that memset is redefined as `bzero' */ -#define BZERO_P (defined memset) - -/* This is somehow experimental and could made dependend on the cache - size. */ -#define LARGE $120000 .text -#if !BZERO_P && defined PIC && !defined NOT_IN_libc + +#if !defined USE_AS_BZERO && defined PIC && !defined NOT_IN_libc ENTRY (__memset_chk) + cmpq %rdx, %rcx jb HIDDEN_JUMPTARGET (__chk_fail) + END (__memset_chk) #endif -ENTRY (memset) -#if BZERO_P - mov %rsi,%rdx /* Adjust parameter. */ - xorl %esi,%esi /* Fill with 0s. */ -#endif - cmp $0x7,%rdx /* Check for small length. */ - mov %rdi,%rcx /* Save ptr as return value. */ - jbe 7f -#if BZERO_P - mov %rsi,%r8 /* Just copy 0. */ + .text + +ENTRY (memset) /* memset (void *, int, size_t); built as bzero: (void *, size_t) */ + +#ifdef USE_AS_BZERO + movq %rsi, %rdx /* bzero: the length arrives in %rsi; move it to %rdx, where the code below expects the byte count */ + xorl %esi, %esi /* bzero: the fill pattern is zero */ #else - /* Populate 8 bit data to full 64-bit. */ - movabs $0x0101010101010101,%r8 - movzbl %sil,%eax - imul %rax,%r8 -#endif - test $0x7,%edi /* Check for alignment. */ - je 2f - - .p2align 4 -1: /* Align ptr to 8 byte. */ - mov %sil,(%rcx) - dec %rdx - inc %rcx - test $0x7,%ecx - jne 1b - -2: /* Check for really large regions. */ - mov %rdx,%rax - shr $0x6,%rax - je 4f - cmp LARGE, %rdx - jae 11f - - .p2align 4 -3: /* Copy 64 bytes. 
*/ - mov %r8,(%rcx) - mov %r8,0x8(%rcx) - mov %r8,0x10(%rcx) - mov %r8,0x18(%rcx) - mov %r8,0x20(%rcx) - mov %r8,0x28(%rcx) - mov %r8,0x30(%rcx) - mov %r8,0x38(%rcx) - add $0x40,%rcx - dec %rax - jne 3b - -4: /* Copy final bytes. */ - and $0x3f,%edx - mov %rdx,%rax - shr $0x3,%rax - je 6f - -5: /* First in chunks of 8 bytes. */ - mov %r8,(%rcx) - add $0x8,%rcx - dec %rax - jne 5b -6: - and $0x7,%edx -7: - test %rdx,%rdx - je 9f -8: /* And finally as bytes (up to 7). */ - mov %sil,(%rcx) - inc %rcx - dec %rdx - jne 8b -9: -#if BZERO_P - nop + movq $0x0101010101010101, %rcx /* memset proper */ + movzbq %sil, %rsi + imulq %rcx, %rsi /* replicate 8 times */ +#endif + +/* Handle tiny blocks. */ + +L(try1): + cmpq $64, %rdx + movq %rdi, %rax /* return memory block address (even for bzero ()) */ + jae L(1after) + +L(1): /* 1-byte */ + testb $1, %dl + jz L(1a) + + movb %sil, (%rdi) + incq %rdi + +L(1a): + testb $2, %dl + jz L(1b) + + movw %si, (%rdi) + addq $2, %rdi + +L(1b): + testb $4, %dl + jz L(1c) + + movl %esi, (%rdi) + addq $4, %rdi + +L(1c): + testb $8, %dl + jz L(1d) + + movq %rsi, (%rdi) + addq $8, %rdi + +L(1d): + testb $16, %dl + jz L(1e) + + movq %rsi, (%rdi) + movq %rsi, 8 (%rdi) + addq $16, %rdi + +L(1e): + testb $32, %dl + jz L(exit) + + movq %rsi, (%rdi) + movq %rsi, 8 (%rdi) + movq %rsi, 16 (%rdi) + movq %rsi, 24 (%rdi) + +L(exit): + rep + ret + + .p2align 4 + +L(1after): + +/* Handle small blocks. 
*/ + +L(32try): + cmpq $512, %rdx + ja L(32after) + +L(32): /* 32-byte */ + movl %edx, %ecx + shrl $5, %ecx + jz L(32skip) + + .p2align 4 + +L(32loop): + decl %ecx + + movq %rsi, (%rdi) + movq %rsi, 8 (%rdi) + movq %rsi, 16 (%rdi) + movq %rsi, 24 (%rdi) + + leaq 32 (%rdi), %rdi + + jz L(32skip) + + decl %ecx + + movq %rsi, (%rdi) + movq %rsi, 8 (%rdi) + movq %rsi, 16 (%rdi) + movq %rsi, 24 (%rdi) + + leaq 32 (%rdi), %rdi + + jnz L(32loop) + + .p2align 4 + +L(32skip): + andl $31, %edx + jnz L(1) + + rep + ret + + .p2align 4 + +L(32after): + +/* Align to natural word alignment. */ + +L(aligntry): + movl %edi, %ecx /* align by destination */ + + andl $7, %ecx /* skip if already aligned */ + jz L(alignafter) + +L(align): /* align */ + leaq -8 (%rcx, %rdx), %rdx + subl $8, %ecx + + .p2align 4 + +L(alignloop): + incl %ecx + + movb %sil, (%rdi) + leaq 1 (%rdi), %rdi + + jnz L(alignloop) + + .p2align 4 + +L(alignafter): + +/* + In order to minimize code-size in RTLD, algorithms specific for + larger blocks are excluded when building for RTLD. +*/ + +/* Handle large blocks up to L2 or L3 size. */ + +L(fasttry): +#ifndef NOT_IN_libc + cmpq $2048, %rdx + jb L(64) + + movq _x86_64_core_cache_size (%rip), %r8 + movq _x86_64_shared_cache_size (%rip), %rcx + cmpq %rcx, %r8 /* calculate the greater of */ + cmovbq %rcx, %r8 /* L2 and L3 */ + cmpq %rdx, %r8 /* calculate the lesser of */ + cmovaq %rdx, %r8 /* remaining bytes and L2 or L3 */ +#endif + +L(fast): /* microcode */ +#ifndef NOT_IN_libc + movq %r8, %rcx + andq $-8, %r8 #else - /* Load result (only if used as memset). */ - mov %rdi,%rax /* start address of destination is result */ + movq %rdx, %rcx +#endif + shrq $3, %rcx + jz L(fastskip) + + xchgq %rax, %rsi + + rep + stosq + + xchgq %rax, %rsi + +L(fastskip): +#ifndef NOT_IN_libc + subq %r8, %rdx + ja L(64after) #endif - retq + + andl $7, %edx + jnz L(1) + + rep + ret .p2align 4 -11: /* Copy 64 bytes without polluting the cache. 
*/ - /* We could use movntdq %xmm0,(%rcx) here to further - speed up for large cases but let's not use XMM registers. */ - movnti %r8,(%rcx) - movnti %r8,0x8(%rcx) - movnti %r8,0x10(%rcx) - movnti %r8,0x18(%rcx) - movnti %r8,0x20(%rcx) - movnti %r8,0x28(%rcx) - movnti %r8,0x30(%rcx) - movnti %r8,0x38(%rcx) - add $0x40,%rcx - dec %rax - jne 11b - jmp 4b + +L(fastafter): + +#ifndef NOT_IN_libc /* none of the algorithms below for RTLD */ + +/* Handle mid-size blocks. */ + +L(64try): + +L(64): /* 64-byte */ + movq %rdx, %rcx + shrq $6, %rcx + jz L(64skip) + + .p2align 4 + +L(64loop): + decq %rcx + + movq %rsi, (%rdi) + movq %rsi, 8 (%rdi) + movq %rsi, 16 (%rdi) + movq %rsi, 24 (%rdi) + movq %rsi, 32 (%rdi) + movq %rsi, 40 (%rdi) + movq %rsi, 48 (%rdi) + movq %rsi, 56 (%rdi) + + leaq 64 (%rdi), %rdi + + jnz L(64loop) + +L(64skip): + andl $63, %edx + jnz L(32) + + rep + ret + + .p2align 4 + +L(64after): + +/* Handle huge blocks. */ + +L(NTtry): + +L(NT): /* 128-byte */ + movq %rdx, %rcx + shrq $7, %rcx + jz L(NTskip) + + .p2align 4 + +L(NTloop): + decq %rcx + + movntiq %rsi, (%rdi) + movntiq %rsi, 8 (%rdi) + movntiq %rsi, 16 (%rdi) + movntiq %rsi, 24 (%rdi) + movntiq %rsi, 32 (%rdi) + movntiq %rsi, 40 (%rdi) + movntiq %rsi, 48 (%rdi) + movntiq %rsi, 56 (%rdi) + movntiq %rsi, 64 (%rdi) + movntiq %rsi, 72 (%rdi) + movntiq %rsi, 80 (%rdi) + movntiq %rsi, 88 (%rdi) + movntiq %rsi, 96 (%rdi) + movntiq %rsi, 104 (%rdi) + movntiq %rsi, 112 (%rdi) + movntiq %rsi, 120 (%rdi) + + leaq 128 (%rdi), %rdi + + jnz L(NTloop) + + sfence + +L(NTskip): + andl $127, %edx + jnz L(32) + + rep + ret + +#endif /* !NOT_IN_libc */ END (memset) -#if !BZERO_P -libc_hidden_builtin_def (memset) -#endif -#if !BZERO_P && defined PIC && !defined NOT_IN_libc -strong_alias (__memset_chk, __memset_zero_constant_len_parameter) - .section .gnu.warning.__memset_zero_constant_len_parameter - .string "memset used with constant zero length parameter; this could be due to transposed parameters" +#ifndef 
USE_AS_BZERO +libc_hidden_builtin_def (memset) #endif