Index: sysdeps/x86_64/bzero.S
===================================================================
RCS file: /cvs/glibc/libc/sysdeps/x86_64/bzero.S,v
retrieving revision 1.1
diff -p -u -w -p -u -w -r1.1 bzero.S
--- sysdeps/x86_64/bzero.S	31 Aug 2002 17:30:07 -0000	1.1
+++ sysdeps/x86_64/bzero.S	17 Aug 2007 20:06:14 -0000
@@ -1,3 +1,6 @@
-#define memset __bzero
-#include <sysdeps/x86_64/memset.S>
-weak_alias (__bzero, bzero)
+#define USE_AS_BZERO
+#define memset __bzero
+/* Strong symbol stays in the reserved namespace; bzero is only a weak alias.  */
+#include "memset.S"
+
+weak_alias (__bzero, bzero)
Index: sysdeps/x86_64/cacheinfo.c
===================================================================
RCS file: /cvs/glibc/libc/sysdeps/x86_64/cacheinfo.c,v
retrieving revision 1.3.2.1
diff -p -u -w -p -u -w -r1.3.2.1 cacheinfo.c
--- sysdeps/x86_64/cacheinfo.c	12 Jul 2007 13:29:44 -0000	1.3.2.1
+++ sysdeps/x86_64/cacheinfo.c	17 Aug 2007 20:06:14 -0000
@@ -1,4 +1,6 @@
-/* x86_64 cache info.
+/*
+   x86_64 cache info.
+
    Copyright (C) 2003, 2004, 2006, 2007 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
@@ -187,7 +189,6 @@ intel_check_word (int name, unsigned int
   return 0;
 }
 
-
 static long int __attribute__ ((noinline))
 handle_intel (int name, unsigned int maxidx)
 {
@@ -245,7 +246,6 @@ handle_intel (int name, unsigned int max
   return 0;
 }
 
-
 static long int __attribute__ ((noinline))
 handle_amd (int name)
 {
@@ -257,7 +257,7 @@ handle_amd (int name)
 		: "=a" (eax), "=b" (ebx), "=c" (ecx), "=d" (edx)
 		: "0" (0x80000000));
 
-  if (name >= _SC_LEVEL3_CACHE_SIZE)
+  if (name > _SC_LEVEL3_CACHE_LINESIZE)	/* L3 size/assoc/linesize are decoded below.  */
     return 0;
 
   unsigned int fn = 0x80000005 + (name >= _SC_LEVEL2_CACHE_SIZE);
@@ -286,6 +286,7 @@ handle_amd (int name)
       return ecx & 0xff;
     case _SC_LEVEL1_DCACHE_LINESIZE:
       return ecx & 0xff;
+
     case _SC_LEVEL2_CACHE_SIZE:
       return (ecx & 0xf000) == 0 ? 0 : (ecx >> 6) & 0x3fffc00;
     case _SC_LEVEL2_CACHE_ASSOC:
@@ -301,6 +302,16 @@ handle_amd (int name)
 	  return 8;
 	case 8:
 	  return 16;
+	case 10:
+	  return 32;
+	case 11:
+	  return 48;
+	case 12:
+	  return 64;
+	case 13:
+	  return 96;
+	case 14:
+	  return 128;
 	case 0xf:
 	  return (ecx << 6) & 0x3fffc00;
 	default:
@@ -308,13 +319,46 @@ handle_amd (int name)
         }
     case _SC_LEVEL2_CACHE_LINESIZE:
       return (ecx & 0xf000) == 0 ? 0 : ecx & 0xff;
+
+    case _SC_LEVEL3_CACHE_SIZE:
+      return (edx & 0xf000) == 0 ? 0 : (edx & 0x3ffc0000) << 1;
+    case _SC_LEVEL3_CACHE_ASSOC:
+      /* Decode edx[15:12] without shifting edx, so case 0xf still sees the size bits.  */
+      switch ((edx >> 12) & 0xf)
+        {
+        case 0:
+        case 1:
+        case 2:
+        case 4:
+	  return (edx >> 12) & 0xf;	/* code equals the number of ways */
+	case 6:
+	  return 8;
+	case 8:
+	  return 16;
+	case 10:
+	  return 32;
+	case 11:
+	  return 48;
+	case 12:
+	  return 64;
+	case 13:
+	  return 96;
+	case 14:
+	  return 128;
+	case 0xf:
+	  return (edx & 0x3ffc0000) << 1;	/* same value as the L3 size case */
+	default:
+	  return 0;
+        }
+    case _SC_LEVEL3_CACHE_LINESIZE:
+      return (edx & 0xf000) == 0 ? 0 : edx & 0xff;
+
     default:
       assert (! "cannot happen");
     }
   return -1;
 }
 
-
 /* Get the value of the system variable NAME.  */
 long int
 attribute_hidden
@@ -343,19 +387,19 @@ __cache_sysconf (int name)
   return 0;
 }
 
-
-/* Half the core cache size for use in memory and string routines, typically
-   L1 size. */
-long int __x86_64_core_cache_size_half attribute_hidden = 32 * 1024 / 2;
-/* Shared cache size for use in memory and string routines, typically
-   L2 or L3 size. */
-long int __x86_64_shared_cache_size_half attribute_hidden = 1024 * 1024 / 2;
+/* Default data cache size (typically L1) for the memory and string routines; init_cacheinfo overrides it. */
+long int __attribute__ ((visibility ("hidden"))) _x86_64_data_cache_size      = 32 * 1024;
+long int __attribute__ ((visibility ("hidden"))) _x86_64_data_cache_size_half = 32 * 1024 / 2;
+/* Default core unified cache size (typically L2) for the memory and string routines; init_cacheinfo overrides it. */
+long int __attribute__ ((visibility ("hidden"))) _x86_64_core_cache_size      = 512 * 1024;
+long int __attribute__ ((visibility ("hidden"))) _x86_64_core_cache_size_half = 512 * 1024 / 2;
+/* Default shared cache size (typically L2 or L3) for the memory and string routines; init_cacheinfo overrides it. */
+long int __attribute__ ((visibility ("hidden"))) _x86_64_shared_cache_size      = 2 * 1024 * 1024;
+long int __attribute__ ((visibility ("hidden"))) _x86_64_shared_cache_size_half = 2 * 1024 * 1024 / 2;
 /* PREFETCHW support flag for use in memory and string routines. */
-int __x86_64_prefetchw attribute_hidden;
-
+int __attribute__ ((visibility ("hidden"))) _x86_64_prefetchw;
 
-static void
-__attribute__((constructor))
+static void __attribute__((constructor))
 init_cacheinfo (void)
 {
   /* Find out what brand of processor.  */
@@ -363,10 +407,8 @@ init_cacheinfo (void)
   unsigned int ebx;
   unsigned int ecx;
   unsigned int edx;
-  int max_cpuid;
-  int max_cpuid_ex;
-  long int core = -1;
-  long int shared = -1;
+  int max_cpuid, max_cpuid_ex;
+  long int data = -1, core = -1, shared = -1;
   unsigned int level;
   unsigned int threads = 0;
 
@@ -377,21 +419,21 @@ init_cacheinfo (void)
   /* This spells out "GenuineIntel".  */
   if (ebx == 0x756e6547 && ecx == 0x6c65746e && edx == 0x49656e69)
     {
-      core = handle_intel (_SC_LEVEL1_DCACHE_SIZE, max_cpuid);
+      data   = handle_intel (_SC_LEVEL1_DCACHE_SIZE, max_cpuid);
+      core   = handle_intel (_SC_LEVEL2_CACHE_SIZE, max_cpuid);
+      shared = handle_intel (_SC_LEVEL3_CACHE_SIZE, max_cpuid);
 
-      /* Try L3 first. */
+      /* Start by treating L3 as the shared level. */
       level  = 3;
-      shared = handle_intel (_SC_LEVEL3_CACHE_SIZE, max_cpuid);
-
       if (shared <= 0)
         {
-	  /* Try L2 otherwise. */
+	  /* No L3 reported: L2 becomes the shared level and core is set to 0. */
           level  = 2;
-          shared = handle_intel (_SC_LEVEL2_CACHE_SIZE, max_cpuid);
+          shared = core;
+          core   = 0;
 	}
 
-      /* Figure out the number of logical threads that share the
-	 highest cache level. */
+      /* Figure out the number of logical threads that share the highest cache level. */
       if (max_cpuid >= 4)
         {
 	  int i = 0;
@@ -417,21 +459,57 @@ init_cacheinfo (void)
 	  threads = (ebx >> 16) & 0xff;
 	}
 
-      /* Cap usage of highest cache level to the number of supported
-	 threads. */
-      if (shared > 0 && threads > 0)
+      /* Cap usage of highest cache level to the number of supported threads. */
+      if (threads > 0 && shared > 0)
         shared /= threads;
     }
   /* This spells out "AuthenticAMD".  */
   else if (ebx == 0x68747541 && ecx == 0x444d4163 && edx == 0x69746e65)
     {
-      core   = handle_amd (_SC_LEVEL1_DCACHE_SIZE);
-      shared = handle_amd (_SC_LEVEL2_CACHE_SIZE);
+      data   = handle_amd (_SC_LEVEL1_DCACHE_SIZE);
+      core   = handle_amd (_SC_LEVEL2_CACHE_SIZE);
+      shared = handle_amd (_SC_LEVEL3_CACHE_SIZE);
 
+      /* Get maximum extended function. */
       asm volatile ("cpuid"
 		    : "=a" (max_cpuid_ex), "=b" (ebx), "=c" (ecx), "=d" (edx)
 		    : "0" (0x80000000));
 
+      if (shared <= 0)
+	shared = 0;
+      else
+	{
+	  /* Figure out the number of logical threads that share L3. */
+	  if (max_cpuid_ex >= 0x80000008)
+	    {
+	      /* Get width of APIC ID; EAX goes to scratch so max_cpuid_ex survives. */
+	      asm volatile ("cpuid"
+			    : "=a" (eax), "=b" (ebx), "=c" (ecx), "=d" (edx)
+			    : "0" (0x80000008));
+
+	      threads = 1 << ((ecx >> 12) & 0x0f);
+	    }
+
+	  if (threads == 0)
+	    {
+	      /* If APIC ID width is not available, use logical processor count. */
+	      asm volatile ("cpuid"
+			    : "=a" (eax), "=b" (ebx), "=c" (ecx), "=d" (edx)
+			    : "0" (0x00000001));
+
+	      /* HTT */
+	      if (edx & (1 << 28))
+		threads = (ebx >> 16) & 0xff;
+	    }
+
+	  /* Cap usage of highest cache level to the number of supported threads. */
+	  if (threads > 0)
+	    shared /= threads;
+
+	  /* Account for exclusive L2 and L3 caches. */
+	  shared += core;
+	}
+
       if (max_cpuid_ex >= 0x80000001)
 	{
 	  asm volatile ("cpuid"
@@ -439,13 +517,26 @@ init_cacheinfo (void)
 			: "0" (0x80000001));
 	  /*  PREFETCHW     || 3DNow! */
 	  if ((ecx & 0x100) || (edx & 0x80000000))
-	    __x86_64_prefetchw = -1;
+	    _x86_64_prefetchw = -1;
+	}
 	}
+
+  /* Override the defaults with every probed size; 0 is stored too, only a negative value keeps the default. */
+  if (data >= 0)
+    {
+      _x86_64_data_cache_size = data;
+      _x86_64_data_cache_size_half = data / 2;
     }
 
-  if (core > 0)
-    __x86_64_core_cache_size_half = core / 2;
+  if (core >= 0)
+    {
+      _x86_64_core_cache_size = core;
+      _x86_64_core_cache_size_half = core / 2;
+    }
 
-  if (shared > 0)
-    __x86_64_shared_cache_size_half = shared / 2;
+  if (shared >= 0)
+    {
+      _x86_64_shared_cache_size = shared;
+      _x86_64_shared_cache_size_half = shared / 2;
+    }
 }
Index: sysdeps/x86_64/memcpy.S
===================================================================
RCS file: /cvs/glibc/libc/sysdeps/x86_64/memcpy.S,v
retrieving revision 1.5.6.1
diff -p -u -w -p -u -w -r1.5.6.1 memcpy.S
--- sysdeps/x86_64/memcpy.S	12 Jul 2007 13:29:44 -0000	1.5.6.1
+++ sysdeps/x86_64/memcpy.S	17 Aug 2007 20:06:15 -0000
@@ -39,7 +39,7 @@
 
         .text
 
-#if defined PIC && !defined NOT_IN_libc
+#if !defined USE_AS_MEMPCPY && defined PIC && !defined NOT_IN_libc
 ENTRY (__memcpy_chk)
 
 	cmpq	%rdx, %rcx
@@ -146,7 +146,7 @@ L(1after):
 /* Align to the natural word size. */
 
 L(aligntry):
-	movl	%esi, %ecx      	/* align by destination */
+	movl	%esi, %ecx      	/* align by source */
 
 	andl	$7, %ecx
 	jz	L(alignafter)  		/* already aligned */
@@ -172,7 +172,7 @@ L(alignloop):				/* 1-byte alignment loo
 
 L(alignafter):
 
-/* Loop to handle mid-sized blocks. */
+/* Handle mid-sized blocks. */
 
 L(32try):				/* up to 1KB */
 	cmpq	$1024, %rdx
@@ -245,11 +245,11 @@ L(32after):
 	larger blocks are excluded when building for RTLD.
 */
 
-/* Handle large blocks smaller than 1/2 L1. */
+/* Handle blocks smaller than 1/2 L1. */
 
 L(fasttry):				/* first 1/2 L1 */
 #ifndef NOT_IN_libc			/* only up to this algorithm outside of libc.so */
-	movq	__x86_64_core_cache_size_half (%rip), %r11
+	movq	_x86_64_data_cache_size_half (%rip), %r11
 	cmpq	%rdx, %r11		/* calculate the smaller of */
 	cmovaq	%rdx, %r11		/* remaining bytes and 1/2 L1 */
 #endif
@@ -289,16 +289,19 @@ L(fastskip):
 #endif
 	retq				/* exit */
 
-#ifndef NOT_IN_libc			/* none of the algorithms below for RTLD */
-
 	.p2align 4
 
 L(fastafter):
 
-/* Handle large blocks smaller than 1/2 L2. */
+#ifndef NOT_IN_libc			/* none of the algorithms below for RTLD */
+
+/* Handle blocks up to the larger of 1/2 L2 and 1/2 L3. */
 
-L(pretry):				/* first 1/2 L2 */
-	movq	__x86_64_shared_cache_size_half (%rip), %r8
+L(pretry):				/* first 1/2 L2 or 1/2 L3 */
+	movq	_x86_64_core_cache_size_half (%rip), %r8
+	movq	_x86_64_shared_cache_size_half (%rip), %rcx
+	cmpq	%rcx, %r8		/* calculate the greater of */
+	cmovbq	%rcx, %r8		/* 1/2 L2 and 1/2 L3 */
 	cmpq	%rdx, %r8		/* calculate the lesser of */
 	cmovaq	%rdx, %r8		/* remaining bytes and 1/2 L2 */
 
@@ -317,7 +320,7 @@ L(pre):					/* 64-byte with prefetching 
 	movq	%rbx, SAVE3 (%rsp)
 	cfi_rel_offset (%rbx, SAVE3)
 
-	cmpl	$0, __x86_64_prefetchw (%rip)
+	cmpl	$0, _x86_64_prefetchw (%rip)
 	jz	L(preloop)		/* check if PREFETCHW OK */
 
 	.p2align 4
@@ -477,7 +480,7 @@ L(preskip):
 
 L(preafter):
 
-/* Loop to handle huge blocks. */
+/* Handle huge blocks. */
 
 L(NTtry):
 
Index: sysdeps/x86_64/memset.S
===================================================================
RCS file: /cvs/glibc/libc/sysdeps/x86_64/memset.S,v
retrieving revision 1.5
diff -p -u -w -p -u -w -r1.5 memset.S
--- sysdeps/x86_64/memset.S	31 Mar 2005 10:00:13 -0000	1.5
+++ sysdeps/x86_64/memset.S	17 Aug 2007 20:06:15 -0000
@@ -1,8 +1,10 @@
-/* memset/bzero -- set memory area to CH/0
-   Optimized version for x86-64.
-   Copyright (C) 2002, 2003, 2004, 2005 Free Software Foundation, Inc.
+/*
+   Optimized memset for x86-64.
+
+   Copyright (C) 2007 Free Software Foundation, Inc.
+   Contributed by Evandro Menezes <evandro.menezes@amd.com>, 2007.
+
    This file is part of the GNU C Library.
-   Contributed by Andreas Jaeger <aj@suse.de>.
 
    The GNU C Library is free software; you can redistribute it and/or
    modify it under the terms of the GNU Lesser General Public
@@ -17,130 +19,312 @@
    You should have received a copy of the GNU Lesser General Public
    License along with the GNU C Library; if not, write to the Free
    Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
-   02111-1307 USA.  */
+   02111-1307 USA.
+*/
 
 #include <sysdep.h>
 #include "asm-syntax.h"
-#include "bp-sym.h"
-#include "bp-asm.h"
-
-/* BEWARE: `#ifdef memset' means that memset is redefined as `bzero' */
-#define BZERO_P (defined memset)
-
-/* This is somehow experimental and could made dependend on the cache
-   size.  */
-#define LARGE $120000
 
         .text
-#if !BZERO_P && defined PIC && !defined NOT_IN_libc
+
+#if !defined USE_AS_BZERO && defined PIC && !defined NOT_IN_libc
 ENTRY (__memset_chk)
+
 	cmpq	%rdx, %rcx
 	jb	HIDDEN_JUMPTARGET (__chk_fail)
+
 END (__memset_chk)
 #endif
-ENTRY (memset)
-#if BZERO_P
-	mov	%rsi,%rdx	/* Adjust parameter.  */
-	xorl	%esi,%esi	/* Fill with 0s.  */
-#endif
-	cmp	$0x7,%rdx	/* Check for small length.  */
-	mov	%rdi,%rcx	/* Save ptr as return value.  */
-	jbe	7f
 
-#if BZERO_P
-	mov	%rsi,%r8	/* Just copy 0.  */
+        .text
+
+ENTRY (memset)                		/* memset (void *, int, size_t); bzero (void *, size_t) */
+
+#ifdef USE_AS_BZERO
+        movq	%rsi, %rdx		/* bzero: move the length from %rsi to %rdx */
+        xorl	%esi, %esi		/* fill pattern is zero */
 #else
-	/* Populate 8 bit data to full 64-bit.  */
-	movabs	$0x0101010101010101,%r8
-	movzbl	%sil,%eax
-	imul	%rax,%r8
-#endif
-	test	$0x7,%edi	/* Check for alignment.  */
-	je	2f
-
-	.p2align 4
-1:	/* Align ptr to 8 byte.  */
-	mov	%sil,(%rcx)
-	dec	%rdx
-	inc	%rcx
-	test	$0x7,%ecx
-	jne	1b
-
-2:	/* Check for really large regions.  */
-	mov	%rdx,%rax
-	shr	$0x6,%rax
-	je	4f
-	cmp	LARGE, %rdx
-	jae	11f
-
-	.p2align 4
-3:	/* Copy 64 bytes.  */
-	mov	%r8,(%rcx)
-	mov	%r8,0x8(%rcx)
-	mov	%r8,0x10(%rcx)
-	mov	%r8,0x18(%rcx)
-	mov	%r8,0x20(%rcx)
-	mov	%r8,0x28(%rcx)
-	mov	%r8,0x30(%rcx)
-	mov	%r8,0x38(%rcx)
-	add	$0x40,%rcx
-	dec	%rax
-	jne	3b
-
-4:	/* Copy final bytes.  */
-	and	$0x3f,%edx
-	mov	%rdx,%rax
-	shr	$0x3,%rax
-	je	6f
-
-5:	/* First in chunks of 8 bytes.  */
-	mov	%r8,(%rcx)
-	add	$0x8,%rcx
-	dec	%rax
-	jne	5b
-6:
-	and	$0x7,%edx
-7:
-	test	%rdx,%rdx
-	je	9f
-8:	/* And finally as bytes (up to 7).  */
-	mov	%sil,(%rcx)
-	inc	%rcx
-	dec	%rdx
-	jne	8b
-9:
-#if BZERO_P
-	nop
+	movq	$0x0101010101010101, %rcx /* memset proper */
+        movzbq	%sil, %rsi		/* keep only the low byte of the fill value */
+        imulq	%rcx, %rsi		/* replicate 8 times */
+#endif
+
+/* Handle tiny blocks (fewer than 64 bytes): one store group per set bit of the length. */
+
+L(try1):
+        cmpq	$64, %rdx
+        movq	%rdi, %rax		/* return memory block address (even for bzero ()) */
+        jae	L(1after)
+
+L(1):                                	/* 1-byte */
+        testb	$1, %dl
+        jz      L(1a)
+
+        movb	%sil, (%rdi)
+        incq	%rdi
+
+L(1a):					/* 2-byte */
+        testb	$2, %dl
+        jz      L(1b)
+
+        movw	%si, (%rdi)
+        addq	$2, %rdi
+
+L(1b):					/* 4-byte */
+        testb	$4, %dl
+        jz      L(1c)
+
+        movl	%esi, (%rdi)
+	addq	$4, %rdi
+
+L(1c):					/* 8-byte */
+        testb	$8, %dl
+        jz      L(1d)
+
+        movq	%rsi, (%rdi)
+	addq	$8, %rdi
+
+L(1d):					/* 16-byte */
+        testb	$16, %dl
+        jz      L(1e)
+
+        movq	%rsi,   (%rdi)
+        movq	%rsi, 8 (%rdi)
+	addq	$16, %rdi
+
+L(1e):					/* 32-byte; last group, %rdi is not advanced */
+        testb	$32, %dl
+        jz      L(exit)
+
+        movq	%rsi,    (%rdi)
+        movq	%rsi,  8 (%rdi)
+        movq	%rsi, 16 (%rdi)
+        movq	%rsi, 24 (%rdi)
+
+L(exit):
+        rep
+        ret
+
+        .p2align 4
+
+L(1after):
+
+/* Handle small blocks (64 to 512 bytes) in 32-byte steps. */
+
+L(32try):
+        cmpq	$512, %rdx
+        ja	L(32after)
+
+L(32):                               	/* 32-byte */
+        movl	%edx, %ecx
+        shrl	$5, %ecx		/* %ecx = number of 32-byte chunks */
+        jz      L(32skip)
+
+        .p2align 4
+
+L(32loop):				/* two chunks per iteration */
+        decl	%ecx			/* sets ZF for the jz below; the stores and leaq leave flags alone */
+
+        movq	%rsi,    (%rdi)
+        movq	%rsi,  8 (%rdi)
+        movq	%rsi, 16 (%rdi)
+        movq	%rsi, 24 (%rdi)
+
+        leaq	32 (%rdi), %rdi
+
+        jz      L(32skip)
+
+        decl	%ecx
+
+        movq	%rsi,    (%rdi)
+        movq	%rsi,  8 (%rdi)
+        movq	%rsi, 16 (%rdi)
+        movq	%rsi, 24 (%rdi)
+
+        leaq	32 (%rdi), %rdi
+
+        jnz     L(32loop)
+
+        .p2align 4
+
+L(32skip):
+        andl	$31, %edx		/* leftover bytes go through the tiny path */
+        jnz     L(1)
+
+        rep
+        ret
+
+        .p2align 4
+
+L(32after):
+
+/* Align the destination to 8 bytes with single-byte stores. */
+
+L(aligntry):
+        movl	%edi, %ecx              /* align by destination */
+
+        andl	$7, %ecx                /* skip if already aligned */
+        jz      L(alignafter)
+
+L(align):				/* align */
+        leaq	-8 (%rcx, %rdx), %rdx	/* length -= 8 - misalignment */
+        subl	$8, %ecx		/* %ecx = -(bytes to store), counts up to 0 */
+
+        .p2align 4
+
+L(alignloop):
+        incl	%ecx
+
+        movb	%sil, (%rdi)
+        leaq	1 (%rdi), %rdi		/* leaq keeps the flags from incl for the jnz */
+
+        jnz     L(alignloop)
+
+        .p2align 4
+
+L(alignafter):
+
+/*
+   In order to minimize code-size in RTLD, algorithms specific for
+   larger blocks are excluded when building for RTLD.
+*/
+
+/* Handle large blocks up to L2 or L3 size with rep stosq. */
+
+L(fasttry):
+#ifndef NOT_IN_libc
+	cmpq	$2048, %rdx
+	jb	L(64)			/* below 2048 bytes use the 64-byte store loop */
+
+	movq	_x86_64_core_cache_size (%rip), %r8
+	movq	_x86_64_shared_cache_size (%rip), %rcx
+	cmpq	%rcx, %r8		/* calculate the greater of */
+	cmovbq	%rcx, %r8		/* L2 and L3 */
+        cmpq	%rdx, %r8		/* calculate the lesser of */
+        cmovaq	%rdx, %r8		/* remaining bytes and L2 or L3 */
+#endif
+
+L(fast):				/* microcode */
+#ifndef NOT_IN_libc
+	movq	%r8, %rcx
+	andq	$-8, %r8		/* %r8 = bytes rep stosq writes (multiple of 8) */
 #else
-	/* Load result (only if used as memset).  */
-	mov	%rdi,%rax	/* start address of destination is result */
+	movq	%rdx, %rcx
+#endif
+	shrq	$3, %rcx		/* %rcx = number of 8-byte stores */
+	jz	L(fastskip)
+
+	xchgq	%rax, %rsi		/* stosq stores %rax: swap the fill pattern in */
+
+	rep
+	stosq
+
+	xchgq	%rax, %rsi		/* swap the return value back into %rax */
+
+L(fastskip):
+#ifndef NOT_IN_libc
+	subq	%r8, %rdx		/* bytes still to be written */
+	ja	L(64after)		/* any remainder continues at L(64after) */
 #endif
-	retq
+
+	andl	$7, %edx
+	jnz	L(1)
+
+	rep
+	ret
 
 	.p2align 4
-11:	/* Copy 64 bytes without polluting the cache.  */
-	/* We could use	movntdq    %xmm0,(%rcx) here to further
-	   speed up for large cases but let's not use XMM registers.  */
-	movnti	%r8,(%rcx)
-	movnti  %r8,0x8(%rcx)
-	movnti  %r8,0x10(%rcx)
-	movnti  %r8,0x18(%rcx)
-	movnti  %r8,0x20(%rcx)
-	movnti  %r8,0x28(%rcx)
-	movnti  %r8,0x30(%rcx)
-	movnti  %r8,0x38(%rcx)
-	add	$0x40,%rcx
-	dec	%rax
-	jne	11b
-	jmp	4b
+
+L(fastafter):
+
+#ifndef NOT_IN_libc			/* none of the algorithms below for RTLD */
+
+/* Handle mid-size blocks (reached from L(fasttry) for lengths below 2048) in 64-byte steps. */
+
+L(64try):
+
+L(64):                               	/* 64-byte */
+        movq	%rdx, %rcx
+        shrq	$6, %rcx		/* %rcx = number of 64-byte chunks */
+	jz	L(64skip)
+
+        .p2align 4
+
+L(64loop):
+        decq	%rcx			/* sets ZF for the jnz below */
+
+        movq	%rsi,    (%rdi)
+        movq	%rsi,  8 (%rdi)
+        movq	%rsi, 16 (%rdi)
+        movq	%rsi, 24 (%rdi)
+        movq	%rsi, 32 (%rdi)
+        movq	%rsi, 40 (%rdi)
+        movq	%rsi, 48 (%rdi)
+        movq	%rsi, 56 (%rdi)
+
+        leaq	64 (%rdi), %rdi
+
+        jnz     L(64loop)
+
+L(64skip):
+	andl	$63, %edx		/* leftover bytes go through the 32-byte path */
+	jnz     L(32)
+
+        rep
+        ret
+
+        .p2align 4
+
+L(64after):
+
+/* Handle huge blocks: what rep stosq left over is written with movnti in 128-byte steps. */
+
+L(NTtry):
+
+L(NT):                               	/* 128-byte */
+        movq	%rdx, %rcx
+        shrq	$7, %rcx		/* %rcx = number of 128-byte chunks */
+        jz      L(NTskip)
+
+        .p2align 4
+
+L(NTloop):
+        decq	%rcx			/* sets ZF for the jnz below */
+
+        movntiq	%rsi,     (%rdi)
+        movntiq	%rsi,   8 (%rdi)
+        movntiq	%rsi,  16 (%rdi)
+        movntiq	%rsi,  24 (%rdi)
+        movntiq	%rsi,  32 (%rdi)
+        movntiq	%rsi,  40 (%rdi)
+        movntiq	%rsi,  48 (%rdi)
+        movntiq	%rsi,  56 (%rdi)
+        movntiq	%rsi,  64 (%rdi)
+        movntiq	%rsi,  72 (%rdi)
+        movntiq	%rsi,  80 (%rdi)
+        movntiq	%rsi,  88 (%rdi)
+        movntiq	%rsi,  96 (%rdi)
+        movntiq	%rsi, 104 (%rdi)
+        movntiq	%rsi, 112 (%rdi)
+        movntiq	%rsi, 120 (%rdi)
+
+        leaq	128 (%rdi), %rdi
+
+        jnz     L(NTloop)
+
+        sfence				/* executed once, after the last movnti of the loop */
+
+L(NTskip):
+        andl	$127, %edx		/* leftover bytes go through the 32-byte path */
+        jnz     L(32)
+
+        rep
+        ret
+
+#endif /* !NOT_IN_libc */
 
 END (memset)
-#if !BZERO_P
-libc_hidden_builtin_def (memset)
-#endif
 
-#if !BZERO_P && defined PIC && !defined NOT_IN_libc
-strong_alias (__memset_chk, __memset_zero_constant_len_parameter)
-	.section .gnu.warning.__memset_zero_constant_len_parameter
-	.string "memset used with constant zero length parameter; this could be due to transposed parameters"
+#ifndef USE_AS_BZERO
+libc_hidden_builtin_def (memset)
 #endif