This is the mail archive of the libc-alpha@sourceware.org mailing list for the glibc project.


Re: PATCH: Optimized memset for x86-64


This is a rewrite of the non-SSE code path in the x86-64 memset to improve
performance on AMD processors. It uses the "rep stos" instruction for block
sizes between 8KB and 64KB, which improves performance on AMD Barcelona by
roughly 7% for most block sizes in that range. Another improvement applies to
blocks larger than the largest shared cache size: the sub-block that fits
within the largest shared cache size is set with "rep stos" and the remaining
sub-block is set with "movnti". This improves performance by up to 50% for
blocks in that range. The patch includes H.J.'s rewrite and the changes made
by Ulrich and Jakub since then.
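
For reference, here is a minimal C model of that size-tier selection. The
store_* helpers, the threshold macros and the shared_cache_size variable are
illustrative stand-ins for the assembly sequences and the
__x86_64_shared_cache_size tunable; this is only a sketch of the dispatch
logic, not the actual implementation.

/* sketch: C model of the size-tier selection in the new non-SSE path.
   The thresholds mirror the patch (8KB, 64KB, largest shared cache size);
   the store_* helpers are hypothetical stand-ins for the "mov", "rep stos"
   and "movnti" sequences, so all three just do plain byte stores here.  */
#include <stddef.h>
#include <stdio.h>

#define STOS_LOWER_BOUNDARY  8192u     /* 8KB  */
#define STOS_UPPER_BOUNDARY  65536u    /* 64KB */

static size_t shared_cache_size = 1024 * 1024;  /* stands in for __x86_64_shared_cache_size */

static void store_integer (unsigned char *p, int c, size_t n)       /* 8-byte movs */
{ for (size_t i = 0; i < n; i++) p[i] = (unsigned char) c; }
static void store_rep_stos (unsigned char *p, int c, size_t n)      /* rep stosq   */
{ for (size_t i = 0; i < n; i++) p[i] = (unsigned char) c; }
static void store_non_temporal (unsigned char *p, int c, size_t n)  /* movnti      */
{ for (size_t i = 0; i < n; i++) p[i] = (unsigned char) c; }

static void *memset_model (void *dst, int c, size_t n)
{
  unsigned char *p = dst;

  if (n > shared_cache_size)
    {
      /* Larger than the largest shared cache: fill the part that fits in
         the cache with "rep stos", the rest with non-temporal stores.  */
      store_rep_stos (p, c, shared_cache_size);
      store_non_temporal (p + shared_cache_size, c, n - shared_cache_size);
    }
  else if (n >= STOS_LOWER_BOUNDARY && n <= STOS_UPPER_BOUNDARY)
    /* 8KB..64KB: "rep stos" is the win on Barcelona.  */
    store_rep_stos (p, c, n);
  else
    /* Everything else stays on the plain integer-store path.  */
    store_integer (p, c, n);

  return dst;
}

int main (void)
{
  static unsigned char buf[2u * 1024 * 1024];
  memset_model (buf, 0xab, sizeof buf);
  printf ("%#x %#x\n", (unsigned) buf[0], (unsigned) buf[sizeof buf - 1]);
  return 0;
}

The real code additionally aligns the destination to 16 bytes first and
dispatches the final sub-128-byte tail through a jump table, as in the patch
below.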

We are accepting the code changes made by H.J. with respect to the alignment
boundary and the prologue for blocks larger than 144 bytes. We are also
accepting the code changes made by H.J. for blocks of 144 bytes or less; this
code is also reused as the epilogue for blocks larger than 144 bytes. These
code changes result in performance losses on an AMD Barcelona machine in
synthetic tests such as the glibc tests and some home-grown tests, measured
relative to the submission made by AMD at
http://sources.redhat.com/ml/libc-alpha/2007-08/msg00054.html. In spite of the
losses seen in synthetic tests, there are also some performance gains, and
overall the changes have been observed to be neutral on the workloads we have
measured. Hence we are accepting H.J.'s changes.
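
To illustrate the epilogue reuse mentioned above: in the patch, the
short-block code adds the remaining length to the destination pointer and
takes one computed jump through the L(setPxQx) table into a run of backwards
stores. A rough C analogue (hypothetical helper; loops stand in for the
single indirect jump) looks like this:

/* sketch: C analogue of the short-block / epilogue stores.  'end' points one
   past the last byte to set and 'rem' is the remaining length (<= 144 in the
   real code); the assembly replaces both loops with one indirect jump through
   L(setPxQx), so no loop branches are executed.  */
#include <stddef.h>
#include <stdint.h>
#include <string.h>

static void set_tail (unsigned char *end, uint64_t pattern, size_t rem)
{
  /* Whole 8-byte stores, working from end-rem towards the end...  */
  while (rem >= 8)
    {
      memcpy (end - rem, &pattern, 8);  /* models "mov %rdx,-off(%rdi)" */
      rem -= 8;
    }
  /* ...then the final 1..7 bytes (4/2/1-byte stores in the assembly).  */
  while (rem > 0)
    {
      end[-(ptrdiff_t) rem] = (unsigned char) pattern;
      rem--;
    }
}

int main (void)
{
  unsigned char buf[144];
  set_tail (buf + sizeof buf, 0xababababababababULL, sizeof buf);
  return (buf[0] == 0xab && buf[143] == 0xab) ? 0 : 1;
}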

It would be great if Ulrich could review this patch. All feedback is welcome.

Harsha
-------

2008-02-26  Harsha Jagasia  <harsha.jagasia@amd.com>

	* sysdeps/x86_64/cacheinfo.c (NOT_USED_RIGHT_NOW): Remove ifdef guards.

	* sysdeps/x86_64/memset.S: Rewrite the non-SSE code path as tuned for
	AMD Barcelona.  Make the default fall-through branch of the
	__x86_64_preferred_memory_instruction check the integer code path.

2007-10-15  H.J. Lu  <hongjiu.lu@intel.com>

	* sysdeps/x86_64/cacheinfo.c
	(__x86_64_preferred_memory_instruction): New.
	(init_cacheinfo): Initialize __x86_64_preferred_memory_instruction.

	* sysdeps/x86_64/memset.S: Rewrite.

Index: sysdeps/x86_64/cacheinfo.c
===================================================================
RCS file: /cvs/glibc/libc/sysdeps/x86_64/cacheinfo.c,v
retrieving revision 1.10
diff -d -u -p -r1.10 cacheinfo.c
--- sysdeps/x86_64/cacheinfo.c	23 Dec 2007 19:32:28 -0000	1.10
+++ sysdeps/x86_64/cacheinfo.c	26 Feb 2008 18:03:19 -0000
@@ -405,13 +405,10 @@ long int __x86_64_data_cache_size_half a
 /* Shared cache size for use in memory and string routines, typically
    L2 or L3 size.  */
 long int __x86_64_shared_cache_size_half attribute_hidden = 1024 * 1024 / 2;
-#ifdef NOT_USED_RIGHT_NOW
 long int __x86_64_shared_cache_size attribute_hidden = 1024 * 1024;
-#endif
 /* PREFETCHW support flag for use in memory and string routines.  */
 int __x86_64_prefetchw attribute_hidden;
 
-#ifdef NOT_USED_RIGHT_NOW
 /* Instructions preferred for memory and string routines.
 
   0: Regular instructions
@@ -421,7 +418,6 @@ int __x86_64_prefetchw attribute_hidden;
 
   */
 int __x86_64_preferred_memory_instruction attribute_hidden;
-#endif
 
 
 static void
@@ -464,14 +460,12 @@ init_cacheinfo (void)
 		    : "=a" (eax), "=b" (ebx), "=c" (ecx), "=d" (edx)
 		    : "0" (1));
 
-#ifdef NOT_USED_RIGHT_NOW
       /* Intel prefers SSSE3 instructions for memory/string rountines
 	 if they are avaiable.  */
       if ((ecx & 0x200))
 	__x86_64_preferred_memory_instruction = 3;
       else
 	__x86_64_preferred_memory_instruction = 2;
-#endif
 
       /* Figure out the number of logical threads that share the
 	 highest cache level.  */
@@ -577,8 +571,6 @@ init_cacheinfo (void)
   if (shared > 0)
     {
       __x86_64_shared_cache_size_half = shared / 2;
-#ifdef NOT_USED_RIGHT_NOW
       __x86_64_shared_cache_size = shared;
-#endif
     }
 }
Index: sysdeps/x86_64/memset.S
===================================================================
RCS file: /cvs/glibc/libc/sysdeps/x86_64/memset.S,v
retrieving revision 1.12
diff -d -u -p -r1.12 memset.S
--- sysdeps/x86_64/memset.S	8 Nov 2007 01:06:30 -0000	1.12
+++ sysdeps/x86_64/memset.S	26 Feb 2008 18:03:19 -0000
@@ -2,7 +2,6 @@
    Optimized version for x86-64.
    Copyright (C) 2002, 2003, 2004, 2005, 2007 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
-   Contributed by Andreas Jaeger <aj@suse.de>.
 
    The GNU C Library is free software; you can redistribute it and/or
    modify it under the terms of the GNU Lesser General Public
@@ -20,16 +19,12 @@
    02111-1307 USA.  */
 
 #include <sysdep.h>
-#include "asm-syntax.h"
-#include "bp-sym.h"
-#include "bp-asm.h"
 
-/* This is somehow experimental and could made dependend on the cache
-   size.  */
-#define LARGE $120000
+#define __STOS_LOWER_BOUNDARY	$8192
+#define __STOS_UPPER_BOUNDARY	$65536
 
         .text
-#ifndef NOT_IN_libc
+#if !defined NOT_IN_libc
 ENTRY(__bzero)
 	mov	%rsi,%rdx	/* Adjust parameter.  */
 	xorl	%esi,%esi	/* Fill with 0s.  */
@@ -46,89 +41,1291 @@ END (__memset_chk)
 #endif
 ENTRY (memset)
 L(memset_entry):
-	cmp	$0x7,%rdx	/* Check for small length.  */
-	mov	%rdi,%rcx	/* Save ptr as return value.  */
-	jbe	7f
+	cmp    $0x1,%rdx
+	mov    %rdi,%rax	/* memset returns the dest address.  */
+	jne    L(ck2)
+	mov    %sil,(%rdi)
+	retq   $0x0
+L(ck2):
+	mov    $0x101010101010101,%r9
+	mov    %rdx,%r8
+	movzbq %sil,%rdx
+	imul   %r9,%rdx
+L(now_dw_aligned):
+	cmp    $0x90,%r8
+	jg     L(ck_mem_ops_method)
+L(now_dw_aligned_small):
+	lea    L(setPxQx)(%rip),%r11
+	add    %r8,%rdi
+#ifndef PIC
+	jmpq   *(%r11,%r8,8)
+#else
+	movslq (%r11,%r8,4),%rcx
+	lea    (%rcx,%r11,1),%r11
+	jmpq   *%r11
+#endif
 
-	/* Populate 8 bit data to full 64-bit.  */
-	movabs	$0x0101010101010101,%r8
-	movzbl	%sil,%eax
-	imul	%rax,%r8
-	test	$0x7,%edi	/* Check for alignment.  */
-	je	2f
+L(Got0):
+	retq   $0x0
 
-	.p2align 4
-1:	/* Align ptr to 8 byte.  */
-	mov	%sil,(%rcx)
-	dec	%rdx
-	inc	%rcx
-	test	$0x7,%ecx
-	jne	1b
+	.pushsection .rodata
+	.balign     16
+#ifndef PIC
+L(setPxQx):
+	.quad       L(Got0), L(P1Q0), L(P2Q0), L(P3Q0)
+	.quad       L(P4Q0), L(P5Q0), L(P6Q0), L(P7Q0)
+	.quad       L(P0Q1), L(P1Q1), L(P2Q1), L(P3Q1)
+	.quad       L(P4Q1), L(P5Q1), L(P6Q1), L(P7Q1)
+	.quad       L(P0Q2), L(P1Q2), L(P2Q2), L(P3Q2)
+	.quad       L(P4Q2), L(P5Q2), L(P6Q2), L(P7Q2)
+	.quad       L(P0Q3), L(P1Q3), L(P2Q3), L(P3Q3)
+	.quad       L(P4Q3), L(P5Q3), L(P6Q3), L(P7Q3)
+	.quad       L(P0Q4), L(P1Q4), L(P2Q4), L(P3Q4)
+	.quad       L(P4Q4), L(P5Q4), L(P6Q4), L(P7Q4)
+	.quad       L(P0Q5), L(P1Q5), L(P2Q5), L(P3Q5)
+	.quad       L(P4Q5), L(P5Q5), L(P6Q5), L(P7Q5)
+	.quad       L(P0Q6), L(P1Q6), L(P2Q6), L(P3Q6)
+	.quad       L(P4Q6), L(P5Q6), L(P6Q6), L(P7Q6)
+	.quad       L(P0Q7), L(P1Q7), L(P2Q7), L(P3Q7)
+	.quad       L(P4Q7), L(P5Q7), L(P6Q7), L(P7Q7)
+	.quad       L(P0Q8), L(P1Q8), L(P2Q8), L(P3Q8)
+	.quad       L(P4Q8), L(P5Q8), L(P6Q8), L(P7Q8)
+	.quad       L(P0Q9), L(P1Q9), L(P2Q9), L(P3Q9)
+	.quad       L(P4Q9), L(P5Q9), L(P6Q9), L(P7Q9)
+	.quad       L(P0QA), L(P1QA), L(P2QA), L(P3QA)
+	.quad       L(P4QA), L(P5QA), L(P6QA), L(P7QA)
+	.quad       L(P0QB), L(P1QB), L(P2QB), L(P3QB)
+	.quad       L(P4QB), L(P5QB), L(P6QB), L(P7QB)
+	.quad       L(P0QC), L(P1QC), L(P2QC), L(P3QC)
+	.quad       L(P4QC), L(P5QC), L(P6QC), L(P7QC)
+	.quad       L(P0QD), L(P1QD), L(P2QD), L(P3QD)
+	.quad       L(P4QD), L(P5QD), L(P6QD), L(P7QD)
+	.quad       L(P0QE), L(P1QE), L(P2QE), L(P3QE)
+	.quad       L(P4QE), L(P5QE), L(P6QE), L(P7QE)
+	.quad       L(P0QF), L(P1QF), L(P2QF), L(P3QF)
+	.quad       L(P4QF), L(P5QF), L(P6QF), L(P7QF)
+	.quad       L(P0QG), L(P1QG), L(P2QG), L(P3QG)
+	.quad       L(P4QG), L(P5QG), L(P6QG), L(P7QG)
+	.quad       L(P0QH), L(P1QH), L(P2QH), L(P3QH)
+	.quad       L(P4QH), L(P5QH), L(P6QH), L(P7QH)
+	.quad       L(P0QI)
+# ifdef USE_EXTRA_TABLE
+	.quad       L(P1QI), L(P2QI), L(P3QI), L(P4QI)
+	.quad       L(P5QI), L(P6QI), L(P7QI)
+# endif
+#else
+L(setPxQx):
+	.int       L(Got0)-L(setPxQx)
+	.int       L(P1Q0)-L(setPxQx)
+	.int       L(P2Q0)-L(setPxQx)
+	.int       L(P3Q0)-L(setPxQx)
+	.int       L(P4Q0)-L(setPxQx)
+	.int       L(P5Q0)-L(setPxQx)
+	.int       L(P6Q0)-L(setPxQx)
+	.int       L(P7Q0)-L(setPxQx)
 
-2:	/* Check for really large regions.  */
-	mov	%rdx,%rax
-	shr	$0x6,%rax
-	je	4f
-	cmp	LARGE, %rdx
-	jae	11f
+	.int       L(P0Q1)-L(setPxQx)
+	.int       L(P1Q1)-L(setPxQx)
+	.int       L(P2Q1)-L(setPxQx)
+	.int       L(P3Q1)-L(setPxQx)
+	.int       L(P4Q1)-L(setPxQx)
+	.int       L(P5Q1)-L(setPxQx)
+	.int       L(P6Q1)-L(setPxQx)
+	.int       L(P7Q1)-L(setPxQx)
 
-	.p2align 4
-3:	/* Copy 64 bytes.  */
-	mov	%r8,(%rcx)
-	mov	%r8,0x8(%rcx)
-	mov	%r8,0x10(%rcx)
-	mov	%r8,0x18(%rcx)
-	mov	%r8,0x20(%rcx)
-	mov	%r8,0x28(%rcx)
-	mov	%r8,0x30(%rcx)
-	mov	%r8,0x38(%rcx)
-	add	$0x40,%rcx
-	dec	%rax
-	jne	3b
+	.int       L(P0Q2)-L(setPxQx)
+	.int       L(P1Q2)-L(setPxQx)
+	.int       L(P2Q2)-L(setPxQx)
+	.int       L(P3Q2)-L(setPxQx)
+	.int       L(P4Q2)-L(setPxQx)
+	.int       L(P5Q2)-L(setPxQx)
+	.int       L(P6Q2)-L(setPxQx)
+	.int       L(P7Q2)-L(setPxQx)
 
-4:	/* Copy final bytes.  */
-	and	$0x3f,%edx
-	mov	%rdx,%rax
-	shr	$0x3,%rax
-	je	6f
+	.int       L(P0Q3)-L(setPxQx)
+	.int       L(P1Q3)-L(setPxQx)
+	.int       L(P2Q3)-L(setPxQx)
+	.int       L(P3Q3)-L(setPxQx)
+	.int       L(P4Q3)-L(setPxQx)
+	.int       L(P5Q3)-L(setPxQx)
+	.int       L(P6Q3)-L(setPxQx)
+	.int       L(P7Q3)-L(setPxQx)
 
-5:	/* First in chunks of 8 bytes.  */
-	mov	%r8,(%rcx)
-	add	$0x8,%rcx
-	dec	%rax
-	jne	5b
-6:
-	and	$0x7,%edx
-7:
-	test	%rdx,%rdx
-	je	9f
-8:	/* And finally as bytes (up to 7).  */
-	mov	%sil,(%rcx)
-	inc	%rcx
-	dec	%rdx
-	jne	8b
-9:
-	/* Load result (only if used as memset).  */
-	mov	%rdi,%rax	/* start address of destination is result */
-	retq
+	.int       L(P0Q4)-L(setPxQx)
+	.int       L(P1Q4)-L(setPxQx)
+	.int       L(P2Q4)-L(setPxQx)
+	.int       L(P3Q4)-L(setPxQx)
+	.int       L(P4Q4)-L(setPxQx)
+	.int       L(P5Q4)-L(setPxQx)
+	.int       L(P6Q4)-L(setPxQx)
+	.int       L(P7Q4)-L(setPxQx)
+
+	.int       L(P0Q5)-L(setPxQx)
+	.int       L(P1Q5)-L(setPxQx)
+	.int       L(P2Q5)-L(setPxQx)
+	.int       L(P3Q5)-L(setPxQx)
+	.int       L(P4Q5)-L(setPxQx)
+	.int       L(P5Q5)-L(setPxQx)
+	.int       L(P6Q5)-L(setPxQx)
+	.int       L(P7Q5)-L(setPxQx)
+
+	.int       L(P0Q6)-L(setPxQx)
+	.int       L(P1Q6)-L(setPxQx)
+	.int       L(P2Q6)-L(setPxQx)
+	.int       L(P3Q6)-L(setPxQx)
+	.int       L(P4Q6)-L(setPxQx)
+	.int       L(P5Q6)-L(setPxQx)
+	.int       L(P6Q6)-L(setPxQx)
+	.int       L(P7Q6)-L(setPxQx)
+
+	.int       L(P0Q7)-L(setPxQx)
+	.int       L(P1Q7)-L(setPxQx)
+	.int       L(P2Q7)-L(setPxQx)
+	.int       L(P3Q7)-L(setPxQx)
+	.int       L(P4Q7)-L(setPxQx)
+	.int       L(P5Q7)-L(setPxQx)
+	.int       L(P6Q7)-L(setPxQx)
+	.int       L(P7Q7)-L(setPxQx)
+
+	.int       L(P0Q8)-L(setPxQx)
+	.int       L(P1Q8)-L(setPxQx)
+	.int       L(P2Q8)-L(setPxQx)
+	.int       L(P3Q8)-L(setPxQx)
+	.int       L(P4Q8)-L(setPxQx)
+	.int       L(P5Q8)-L(setPxQx)
+	.int       L(P6Q8)-L(setPxQx)
+	.int       L(P7Q8)-L(setPxQx)
+
+	.int       L(P0Q9)-L(setPxQx)
+	.int       L(P1Q9)-L(setPxQx)
+	.int       L(P2Q9)-L(setPxQx)
+	.int       L(P3Q9)-L(setPxQx)
+	.int       L(P4Q9)-L(setPxQx)
+	.int       L(P5Q9)-L(setPxQx)
+	.int       L(P6Q9)-L(setPxQx)
+	.int       L(P7Q9)-L(setPxQx)
+
+	.int       L(P0QA)-L(setPxQx)
+	.int       L(P1QA)-L(setPxQx)
+	.int       L(P2QA)-L(setPxQx)
+	.int       L(P3QA)-L(setPxQx)
+	.int       L(P4QA)-L(setPxQx)
+	.int       L(P5QA)-L(setPxQx)
+	.int       L(P6QA)-L(setPxQx)
+	.int       L(P7QA)-L(setPxQx)
+
+	.int       L(P0QB)-L(setPxQx)
+	.int       L(P1QB)-L(setPxQx)
+	.int       L(P2QB)-L(setPxQx)
+	.int       L(P3QB)-L(setPxQx)
+	.int       L(P4QB)-L(setPxQx)
+	.int       L(P5QB)-L(setPxQx)
+	.int       L(P6QB)-L(setPxQx)
+	.int       L(P7QB)-L(setPxQx)
+
+	.int       L(P0QC)-L(setPxQx)
+	.int       L(P1QC)-L(setPxQx)
+	.int       L(P2QC)-L(setPxQx)
+	.int       L(P3QC)-L(setPxQx)
+	.int       L(P4QC)-L(setPxQx)
+	.int       L(P5QC)-L(setPxQx)
+	.int       L(P6QC)-L(setPxQx)
+	.int       L(P7QC)-L(setPxQx)
+
+	.int       L(P0QD)-L(setPxQx)
+	.int       L(P1QD)-L(setPxQx)
+	.int       L(P2QD)-L(setPxQx)
+	.int       L(P3QD)-L(setPxQx)
+	.int       L(P4QD)-L(setPxQx)
+	.int       L(P5QD)-L(setPxQx)
+	.int       L(P6QD)-L(setPxQx)
+	.int       L(P7QD)-L(setPxQx)
+
+	.int       L(P0QE)-L(setPxQx)
+	.int       L(P1QE)-L(setPxQx)
+	.int       L(P2QE)-L(setPxQx)
+	.int       L(P3QE)-L(setPxQx)
+	.int       L(P4QE)-L(setPxQx)
+	.int       L(P5QE)-L(setPxQx)
+	.int       L(P6QE)-L(setPxQx)
+	.int       L(P7QE)-L(setPxQx)
+
+	.int       L(P0QF)-L(setPxQx)
+	.int       L(P1QF)-L(setPxQx)
+	.int       L(P2QF)-L(setPxQx)
+	.int       L(P3QF)-L(setPxQx)
+	.int       L(P4QF)-L(setPxQx)
+	.int       L(P5QF)-L(setPxQx)
+	.int       L(P6QF)-L(setPxQx)
+	.int       L(P7QF)-L(setPxQx)
+
+	.int       L(P0QG)-L(setPxQx)
+	.int       L(P1QG)-L(setPxQx)
+	.int       L(P2QG)-L(setPxQx)
+	.int       L(P3QG)-L(setPxQx)
+	.int       L(P4QG)-L(setPxQx)
+	.int       L(P5QG)-L(setPxQx)
+	.int       L(P6QG)-L(setPxQx)
+	.int       L(P7QG)-L(setPxQx)
+
+	.int       L(P0QH)-L(setPxQx)
+	.int       L(P1QH)-L(setPxQx)
+	.int       L(P2QH)-L(setPxQx)
+	.int       L(P3QH)-L(setPxQx)
+	.int       L(P4QH)-L(setPxQx)
+	.int       L(P5QH)-L(setPxQx)
+	.int       L(P6QH)-L(setPxQx)
+	.int       L(P7QH)-L(setPxQx)
+
+	.int       L(P0QI)-L(setPxQx)
+# ifdef USE_EXTRA_TABLE
+	.int       L(P1QI)-L(setPxQx)
+	.int       L(P2QI)-L(setPxQx)
+	.int       L(P3QI)-L(setPxQx)
+	.int       L(P4QI)-L(setPxQx)
+	.int       L(P5QI)-L(setPxQx)
+	.int       L(P6QI)-L(setPxQx)
+	.int       L(P7QI)-L(setPxQx)
+# endif
+#endif
+	.popsection
+
+	.balign     16
+#ifdef USE_EXTRA_TABLE
+L(P1QI): mov    %rdx,-0x91(%rdi)
+#endif
+L(P1QH): mov    %rdx,-0x89(%rdi)
+L(P1QG): mov    %rdx,-0x81(%rdi)
+#		   .balign     16
+L(P1QF): mov    %rdx,-0x79(%rdi)
+L(P1QE): mov    %rdx,-0x71(%rdi)
+L(P1QD): mov    %rdx,-0x69(%rdi)
+L(P1QC): mov    %rdx,-0x61(%rdi)
+L(P1QB): mov    %rdx,-0x59(%rdi)
+L(P1QA): mov    %rdx,-0x51(%rdi)
+L(P1Q9): mov    %rdx,-0x49(%rdi)
+L(P1Q8): mov    %rdx,-0x41(%rdi)
+L(P1Q7): mov    %rdx,-0x39(%rdi)
+L(P1Q6): mov    %rdx,-0x31(%rdi)
+L(P1Q5): mov    %rdx,-0x29(%rdi)
+L(P1Q4): mov    %rdx,-0x21(%rdi)
+L(P1Q3): mov    %rdx,-0x19(%rdi)
+L(P1Q2): mov    %rdx,-0x11(%rdi)
+L(P1Q1): mov    %rdx,-0x9(%rdi)
+L(P1Q0): mov    %dl,-0x1(%rdi)
+		retq   $0x0
+
+	.balign     16
+L(P0QI): mov    %rdx,-0x90(%rdi)
+L(P0QH): mov    %rdx,-0x88(%rdi)
+#		   .balign     16
+L(P0QG): mov    %rdx,-0x80(%rdi)
+L(P0QF): mov    %rdx,-0x78(%rdi)
+L(P0QE): mov    %rdx,-0x70(%rdi)
+L(P0QD): mov    %rdx,-0x68(%rdi)
+L(P0QC): mov    %rdx,-0x60(%rdi)
+L(P0QB): mov    %rdx,-0x58(%rdi)
+L(P0QA): mov    %rdx,-0x50(%rdi)
+L(P0Q9): mov    %rdx,-0x48(%rdi)
+L(P0Q8): mov    %rdx,-0x40(%rdi)
+L(P0Q7): mov    %rdx,-0x38(%rdi)
+L(P0Q6): mov    %rdx,-0x30(%rdi)
+L(P0Q5): mov    %rdx,-0x28(%rdi)
+L(P0Q4): mov    %rdx,-0x20(%rdi)
+L(P0Q3): mov    %rdx,-0x18(%rdi)
+L(P0Q2): mov    %rdx,-0x10(%rdi)
+L(P0Q1): mov    %rdx,-0x8(%rdi)
+L(P0Q0): retq   $0x0
+
+
+	.balign     16
+#ifdef USE_EXTRA_TABLE
+L(P2QI): mov    %rdx,-0x92(%rdi)
+#endif
+L(P2QH): mov    %rdx,-0x8a(%rdi)
+L(P2QG): mov    %rdx,-0x82(%rdi)
+#		   .balign     16
+L(P2QF): mov    %rdx,-0x7a(%rdi)
+L(P2QE): mov    %rdx,-0x72(%rdi)
+L(P2QD): mov    %rdx,-0x6a(%rdi)
+L(P2QC): mov    %rdx,-0x62(%rdi)
+L(P2QB): mov    %rdx,-0x5a(%rdi)
+L(P2QA): mov    %rdx,-0x52(%rdi)
+L(P2Q9): mov    %rdx,-0x4a(%rdi)
+L(P2Q8): mov    %rdx,-0x42(%rdi)
+L(P2Q7): mov    %rdx,-0x3a(%rdi)
+L(P2Q6): mov    %rdx,-0x32(%rdi)
+L(P2Q5): mov    %rdx,-0x2a(%rdi)
+L(P2Q4): mov    %rdx,-0x22(%rdi)
+L(P2Q3): mov    %rdx,-0x1a(%rdi)
+L(P2Q2): mov    %rdx,-0x12(%rdi)
+L(P2Q1): mov    %rdx,-0xa(%rdi)
+L(P2Q0): mov    %dx,-0x2(%rdi)
+		retq   $0x0
+
+	.balign     16
+#ifdef USE_EXTRA_TABLE
+L(P3QI): mov    %rdx,-0x93(%rdi)
+#endif
+L(P3QH): mov    %rdx,-0x8b(%rdi)
+L(P3QG): mov    %rdx,-0x83(%rdi)
+#		   .balign     16
+L(P3QF): mov    %rdx,-0x7b(%rdi)
+L(P3QE): mov    %rdx,-0x73(%rdi)
+L(P3QD): mov    %rdx,-0x6b(%rdi)
+L(P3QC): mov    %rdx,-0x63(%rdi)
+L(P3QB): mov    %rdx,-0x5b(%rdi)
+L(P3QA): mov    %rdx,-0x53(%rdi)
+L(P3Q9): mov    %rdx,-0x4b(%rdi)
+L(P3Q8): mov    %rdx,-0x43(%rdi)
+L(P3Q7): mov    %rdx,-0x3b(%rdi)
+L(P3Q6): mov    %rdx,-0x33(%rdi)
+L(P3Q5): mov    %rdx,-0x2b(%rdi)
+L(P3Q4): mov    %rdx,-0x23(%rdi)
+L(P3Q3): mov    %rdx,-0x1b(%rdi)
+L(P3Q2): mov    %rdx,-0x13(%rdi)
+L(P3Q1): mov    %rdx,-0xb(%rdi)
+L(P3Q0): mov    %dx,-0x3(%rdi)
+		mov    %dl,-0x1(%rdi)
+		retq   $0x0
+
+	.balign     16
+#ifdef USE_EXTRA_TABLE
+L(P4QI): mov    %rdx,-0x94(%rdi)
+#endif
+L(P4QH): mov    %rdx,-0x8c(%rdi)
+L(P4QG): mov    %rdx,-0x84(%rdi)
+#		   .balign     16
+L(P4QF): mov    %rdx,-0x7c(%rdi)
+L(P4QE): mov    %rdx,-0x74(%rdi)
+L(P4QD): mov    %rdx,-0x6c(%rdi)
+L(P4QC): mov    %rdx,-0x64(%rdi)
+L(P4QB): mov    %rdx,-0x5c(%rdi)
+L(P4QA): mov    %rdx,-0x54(%rdi)
+L(P4Q9): mov    %rdx,-0x4c(%rdi)
+L(P4Q8): mov    %rdx,-0x44(%rdi)
+L(P4Q7): mov    %rdx,-0x3c(%rdi)
+L(P4Q6): mov    %rdx,-0x34(%rdi)
+L(P4Q5): mov    %rdx,-0x2c(%rdi)
+L(P4Q4): mov    %rdx,-0x24(%rdi)
+L(P4Q3): mov    %rdx,-0x1c(%rdi)
+L(P4Q2): mov    %rdx,-0x14(%rdi)
+L(P4Q1): mov    %rdx,-0xc(%rdi)
+L(P4Q0): mov    %edx,-0x4(%rdi)
+		retq   $0x0
+
+	.balign     16
+#if defined(USE_EXTRA_TABLE)
+L(P5QI): mov    %rdx,-0x95(%rdi)
+#endif
+L(P5QH): mov    %rdx,-0x8d(%rdi)
+L(P5QG): mov    %rdx,-0x85(%rdi)
+#		   .balign     16
+L(P5QF): mov    %rdx,-0x7d(%rdi)
+L(P5QE): mov    %rdx,-0x75(%rdi)
+L(P5QD): mov    %rdx,-0x6d(%rdi)
+L(P5QC): mov    %rdx,-0x65(%rdi)
+L(P5QB): mov    %rdx,-0x5d(%rdi)
+L(P5QA): mov    %rdx,-0x55(%rdi)
+L(P5Q9): mov    %rdx,-0x4d(%rdi)
+L(P5Q8): mov    %rdx,-0x45(%rdi)
+L(P5Q7): mov    %rdx,-0x3d(%rdi)
+L(P5Q6): mov    %rdx,-0x35(%rdi)
+L(P5Q5): mov    %rdx,-0x2d(%rdi)
+L(P5Q4): mov    %rdx,-0x25(%rdi)
+L(P5Q3): mov    %rdx,-0x1d(%rdi)
+L(P5Q2): mov    %rdx,-0x15(%rdi)
+L(P5Q1): mov    %rdx,-0xd(%rdi)
+L(P5Q0): mov    %edx,-0x5(%rdi)
+		mov    %dl,-0x1(%rdi)
+		retq   $0x0
+
+	.balign     16
+#ifdef USE_EXTRA_TABLE
+L(P6QI): mov    %rdx,-0x96(%rdi)
+#endif
+L(P6QH): mov    %rdx,-0x8e(%rdi)
+L(P6QG): mov    %rdx,-0x86(%rdi)
+#		   .balign     16
+L(P6QF): mov    %rdx,-0x7e(%rdi)
+L(P6QE): mov    %rdx,-0x76(%rdi)
+L(P6QD): mov    %rdx,-0x6e(%rdi)
+L(P6QC): mov    %rdx,-0x66(%rdi)
+L(P6QB): mov    %rdx,-0x5e(%rdi)
+L(P6QA): mov    %rdx,-0x56(%rdi)
+L(P6Q9): mov    %rdx,-0x4e(%rdi)
+L(P6Q8): mov    %rdx,-0x46(%rdi)
+L(P6Q7): mov    %rdx,-0x3e(%rdi)
+L(P6Q6): mov    %rdx,-0x36(%rdi)
+L(P6Q5): mov    %rdx,-0x2e(%rdi)
+L(P6Q4): mov    %rdx,-0x26(%rdi)
+L(P6Q3): mov    %rdx,-0x1e(%rdi)
+L(P6Q2): mov    %rdx,-0x16(%rdi)
+L(P6Q1): mov    %rdx,-0xe(%rdi)
+L(P6Q0): mov    %edx,-0x6(%rdi)
+		mov    %dx,-0x2(%rdi)
+		retq   $0x0
+
+	.balign     16
+#ifdef USE_EXTRA_TABLE
+L(P7QI): mov    %rdx,-0x97(%rdi)
+#endif
+L(P7QH): mov    %rdx,-0x8f(%rdi)
+L(P7QG): mov    %rdx,-0x87(%rdi)
+#		   .balign     16
+L(P7QF): mov    %rdx,-0x7f(%rdi)
+L(P7QE): mov    %rdx,-0x77(%rdi)
+L(P7QD): mov    %rdx,-0x6f(%rdi)
+L(P7QC): mov    %rdx,-0x67(%rdi)
+L(P7QB): mov    %rdx,-0x5f(%rdi)
+L(P7QA): mov    %rdx,-0x57(%rdi)
+L(P7Q9): mov    %rdx,-0x4f(%rdi)
+L(P7Q8): mov    %rdx,-0x47(%rdi)
+L(P7Q7): mov    %rdx,-0x3f(%rdi)
+L(P7Q6): mov    %rdx,-0x37(%rdi)
+L(P7Q5): mov    %rdx,-0x2f(%rdi)
+L(P7Q4): mov    %rdx,-0x27(%rdi)
+L(P7Q3): mov    %rdx,-0x1f(%rdi)
+L(P7Q2): mov    %rdx,-0x17(%rdi)
+L(P7Q1): mov    %rdx,-0xf(%rdi)
+L(P7Q0): mov    %edx,-0x7(%rdi)
+		mov    %dx,-0x3(%rdi)
+		mov    %dl,-0x1(%rdi)
+		retq   $0x0
+
+	.balign     16
+L(ck_mem_ops_method):
+
+# align to 16 byte boundary first
+	#test $0xf,%rdi
+	#jz L(aligned_now)
+	 lea    L(AliPxQx)(%rip),%r11
+	 mov    $0x10,%r10
+	 mov    %rdi,%r9
+	 and    $0xf,%r9
+	 sub    %r9,%r10
+	 and    $0xf,%r10
+	 add    %r10,%rdi
+	 sub    %r10,%r8
+#ifndef PIC
+	jmpq   *(%r11,%r10,8)
+#else
+	movslq (%r11,%r10,4),%rcx
+	lea    (%rcx,%r11,1),%r11
+	jmpq   *%r11
+#endif
+
+	.pushsection .rodata
+	.balign     16
+#ifndef PIC
+L(AliPxQx):
+	.quad       L(aligned_now), L(A1Q0), L(A2Q0), L(A3Q0)
+	.quad	    L(A4Q0), L(A5Q0), L(A6Q0), L(A7Q0)
+	.quad       L(A0Q1), L(A1Q1), L(A2Q1), L(A3Q1)
+	.quad       L(A4Q1), L(A5Q1), L(A6Q1), L(A7Q1)
+#else
+L(AliPxQx):
+	.int       L(aligned_now)-L(AliPxQx)
+	.int       L(A1Q0)-L(AliPxQx)
+	.int       L(A2Q0)-L(AliPxQx)
+	.int       L(A3Q0)-L(AliPxQx)
+	.int       L(A4Q0)-L(AliPxQx)
+	.int       L(A5Q0)-L(AliPxQx)
+	.int       L(A6Q0)-L(AliPxQx)
+	.int       L(A7Q0)-L(AliPxQx)
+
+	.int       L(A0Q1)-L(AliPxQx)
+	.int       L(A1Q1)-L(AliPxQx)
+	.int       L(A2Q1)-L(AliPxQx)
+	.int       L(A3Q1)-L(AliPxQx)
+	.int       L(A4Q1)-L(AliPxQx)
+	.int       L(A5Q1)-L(AliPxQx)
+	.int       L(A6Q1)-L(AliPxQx)
+	.int       L(A7Q1)-L(AliPxQx)
+#endif
+	.popsection
+
+	.balign     16
+L(A5Q1):    mov    %dl,-0xd(%rdi)
+L(A4Q1):    mov    %edx,-0xc(%rdi)
+L(A0Q1):    mov    %rdx,-0x8(%rdi)
+L(A0Q0):    jmp     L(aligned_now)
+
+	.balign     16
+L(A1Q1):   mov    %dl,-0x9(%rdi)
+	mov    %rdx,-0x8(%rdi)
+	jmp    L(aligned_now)
+
+	.balign     16
+L(A1Q0):   mov    %dl,-0x1(%rdi)
+	jmp    L(aligned_now)
+
+	.balign     16
+L(A3Q1):    mov    %dl,-0xb(%rdi)
+L(A2Q1):    mov    %dx,-0xa(%rdi)
+	mov    %rdx,-0x8(%rdi)
+	jmp    L(aligned_now)
+
+	.balign     16
+L(A3Q0):    mov    %dl,-0x3(%rdi)
+L(A2Q0):    mov    %dx,-0x2(%rdi)
+	jmp    L(aligned_now)
+
+	.balign     16
+L(A5Q0):    mov    %dl,-0x5(%rdi)
+L(A4Q0):    mov    %edx,-0x4(%rdi)
+	jmp    L(aligned_now)
+
+	.balign     16
+L(A7Q1):    mov    %dl,-0xf(%rdi)
+L(A6Q1):    mov    %dx,-0xe(%rdi)
+	mov    %edx,-0xc(%rdi)
+	mov    %rdx,-0x8(%rdi)
+	jmp    L(aligned_now)
+
+	.balign     16
+L(A7Q0):    mov    %dl,-0x7(%rdi)
+L(A6Q0):    mov    %dx,-0x6(%rdi)
+	mov    %edx,-0x4(%rdi)
+	jmp    L(aligned_now)
+
+	.balign     16
+L(aligned_now):
+
+	 cmpl   $0x1,__x86_64_preferred_memory_instruction(%rip)
+	 jg     L(SSE_pre)
+
+L(8byte_move_try):
+	cmpq	__STOS_LOWER_BOUNDARY,%r8
+	jae	L(8byte_stos_try)
+
+	.balign     16
+L(8byte_move):
+	movq	%r8,%rcx
+	shrq	$7,%rcx
+	jz	L(8byte_move_skip)
 
 	.p2align 4
-11:	/* Copy 64 bytes without polluting the cache.  */
-	/* We could use	movntdq    %xmm0,(%rcx) here to further
-	   speed up for large cases but let's not use XMM registers.  */
-	movnti	%r8,(%rcx)
-	movnti  %r8,0x8(%rcx)
-	movnti  %r8,0x10(%rcx)
-	movnti  %r8,0x18(%rcx)
-	movnti  %r8,0x20(%rcx)
-	movnti  %r8,0x28(%rcx)
-	movnti  %r8,0x30(%rcx)
-	movnti  %r8,0x38(%rcx)
-	add	$0x40,%rcx
-	dec	%rax
-	jne	11b
+
+L(8byte_move_loop):
+	decq	%rcx
+
+	movq	%rdx,    (%rdi)
+	movq	%rdx,  8 (%rdi)
+	movq	%rdx, 16 (%rdi)
+	movq	%rdx, 24 (%rdi)
+	movq	%rdx, 32 (%rdi)
+	movq	%rdx, 40 (%rdi)
+	movq	%rdx, 48 (%rdi)
+	movq	%rdx, 56 (%rdi)
+	movq	%rdx, 64 (%rdi)
+	movq	%rdx, 72 (%rdi)
+	movq	%rdx, 80 (%rdi)
+	movq	%rdx, 88 (%rdi)
+	movq	%rdx, 96 (%rdi)
+	movq	%rdx, 104 (%rdi)
+	movq	%rdx, 112 (%rdi)
+	movq	%rdx, 120 (%rdi)
+
+	leaq	128 (%rdi),%rdi
+
+	jnz     L(8byte_move_loop)
+
+L(8byte_move_skip):
+	andl	$127,%r8d
+	lea    	(%rdi,%r8,1),%rdi
+	lea    	L(setPxQx)(%rip),%r11
+
+#ifndef PIC
+	jmpq   	*(%r11,%r8,8) # old scheme remained for nonPIC
+#else
+	movslq	(%r11,%r8,4),%rcx
+	lea    	(%rcx,%r11,1),%r11
+	jmpq   	*%r11
+#endif
+
+	.balign     16
+L(8byte_stos_try):
+	mov    __x86_64_shared_cache_size(%rip),%r9d // ck largest cache size
+	cmpq	%r8,%r9		// calculate the lesser of remaining
+	cmovaq	%r8,%r9		// bytes and largest cache size
+	jbe	L(8byte_stos)
+
+L(8byte_move_reuse_try):
+	cmp	__STOS_UPPER_BOUNDARY,%r8
+	jae	L(8byte_move)
+
+	.balign     16
+L(8byte_stos):
+	movq	%r9,%rcx
+	andq	$-8,%r9
+
+	shrq	$3,%rcx
+	jz	L(8byte_stos_skip)
+
+	xchgq	%rax,%rdx
+
+	rep
+	stosq
+
+	xchgq	%rax,%rdx
+
+L(8byte_stos_skip):
+	subq	%r9,%r8
+	ja	L(8byte_nt_move)
+
+	andl	$7,%r8d
+	lea    	(%rdi,%r8,1),%rdi
+	lea    	L(setPxQx)(%rip),%r11
+#ifndef PIC
+	jmpq   	*(%r11,%r8,8) # old scheme remained for nonPIC
+#else
+	movslq	(%r11,%r8,4),%rcx
+	lea	(%rcx,%r11,1),%r11
+	jmpq   	*%r11
+#endif
+
+	.balign     16
+L(8byte_nt_move):
+	movq	%r8,%rcx
+	shrq	$7,%rcx
+	jz      L(8byte_nt_move_skip)
+
+	.balign     16
+L(8byte_nt_move_loop):
+	decq	%rcx
+
+	movntiq	%rdx,     (%rdi)
+	movntiq	%rdx,   8 (%rdi)
+	movntiq	%rdx,  16 (%rdi)
+	movntiq	%rdx,  24 (%rdi)
+	movntiq	%rdx,  32 (%rdi)
+	movntiq	%rdx,  40 (%rdi)
+	movntiq	%rdx,  48 (%rdi)
+	movntiq	%rdx,  56 (%rdi)
+	movntiq	%rdx,  64 (%rdi)
+	movntiq	%rdx,  72 (%rdi)
+	movntiq	%rdx,  80 (%rdi)
+	movntiq	%rdx,  88 (%rdi)
+	movntiq	%rdx,  96 (%rdi)
+	movntiq	%rdx, 104 (%rdi)
+	movntiq	%rdx, 112 (%rdi)
+	movntiq	%rdx, 120 (%rdi)
+
+	leaq	128 (%rdi),%rdi
+
+	jnz     L(8byte_nt_move_loop)
+
 	sfence
-	jmp	4b
+
+L(8byte_nt_move_skip):
+	andl	$127,%r8d
+
+	lea    	(%rdi,%r8,1),%rdi
+	lea    	L(setPxQx)(%rip),%r11
+#ifndef PIC
+	jmpq   	*(%r11,%r8,8) # old scheme remained for nonPIC
+#else
+	movslq	(%r11,%r8,4),%rcx
+	lea    	(%rcx,%r11,1),%r11
+	jmpq   	*%r11
+#endif
+
+L(SSE_pre):
+	 # fill RegXMM0 with the pattern
+	 movd   %rdx,%xmm0
+	 punpcklqdq %xmm0,%xmm0
+
+	 lea    L(SSExDx)(%rip),%r9        # for later after the alignment
+	 cmp    $0xb0,%r8 # 176
+	 jge    L(byte32sse2_pre)
+
+	 add    %r8,%rdi
+#ifndef PIC
+	 jmpq   *(%r9,%r8,8)
+#else
+	 movslq    (%r9,%r8,4),%rcx
+	 lea    (%rcx,%r9,1),%r9
+	 jmpq   *%r9
+#endif
+
+L(SSE0QB):  movdqa %xmm0,-0xb0(%rdi)
+L(SSE0QA):  movdqa %xmm0,-0xa0(%rdi)
+L(SSE0Q9):  movdqa %xmm0,-0x90(%rdi)
+L(SSE0Q8):  movdqa %xmm0,-0x80(%rdi)
+L(SSE0Q7):  movdqa %xmm0,-0x70(%rdi)
+L(SSE0Q6):  movdqa %xmm0,-0x60(%rdi)
+L(SSE0Q5):  movdqa %xmm0,-0x50(%rdi)
+L(SSE0Q4):  movdqa %xmm0,-0x40(%rdi)
+L(SSE0Q3):  movdqa %xmm0,-0x30(%rdi)
+L(SSE0Q2):  movdqa %xmm0,-0x20(%rdi)
+L(SSE0Q1):  movdqa %xmm0,-0x10(%rdi)
+L(SSE0Q0):  retq   $0x0
+
+L(SSE1QB):  movdqa %xmm0,-0xb1(%rdi)
+L(SSE1QA):  movdqa %xmm0,-0xa1(%rdi)
+L(SSE1Q9):  movdqa %xmm0,-0x91(%rdi)
+L(SSE1Q8):  movdqa %xmm0,-0x81(%rdi)
+L(SSE1Q7):  movdqa %xmm0,-0x71(%rdi)
+L(SSE1Q6):  movdqa %xmm0,-0x61(%rdi)
+L(SSE1Q5):  movdqa %xmm0,-0x51(%rdi)
+L(SSE1Q4):  movdqa %xmm0,-0x41(%rdi)
+L(SSE1Q3):  movdqa %xmm0,-0x31(%rdi)
+L(SSE1Q2):  movdqa %xmm0,-0x21(%rdi)
+L(SSE1Q1):  movdqa %xmm0,-0x11(%rdi)
+L(SSE1Q0):  mov    %dl,-0x1(%rdi)
+	retq   $0x0
+
+L(SSE2QB):  movdqa %xmm0,-0xb2(%rdi)
+L(SSE2QA):  movdqa %xmm0,-0xa2(%rdi)
+L(SSE2Q9):  movdqa %xmm0,-0x92(%rdi)
+L(SSE2Q8):  movdqa %xmm0,-0x82(%rdi)
+L(SSE2Q7):  movdqa %xmm0,-0x72(%rdi)
+L(SSE2Q6):  movdqa %xmm0,-0x62(%rdi)
+L(SSE2Q5):  movdqa %xmm0,-0x52(%rdi)
+L(SSE2Q4):  movdqa %xmm0,-0x42(%rdi)
+L(SSE2Q3):  movdqa %xmm0,-0x32(%rdi)
+L(SSE2Q2):  movdqa %xmm0,-0x22(%rdi)
+L(SSE2Q1):  movdqa %xmm0,-0x12(%rdi)
+L(SSE2Q0):  mov    %dx,-0x2(%rdi)
+	retq   $0x0
+
+L(SSE3QB):  movdqa %xmm0,-0xb3(%rdi)
+L(SSE3QA):  movdqa %xmm0,-0xa3(%rdi)
+L(SSE3Q9):  movdqa %xmm0,-0x93(%rdi)
+L(SSE3Q8):  movdqa %xmm0,-0x83(%rdi)
+L(SSE3Q7):  movdqa %xmm0,-0x73(%rdi)
+L(SSE3Q6):  movdqa %xmm0,-0x63(%rdi)
+L(SSE3Q5):  movdqa %xmm0,-0x53(%rdi)
+L(SSE3Q4):  movdqa %xmm0,-0x43(%rdi)
+L(SSE3Q3):  movdqa %xmm0,-0x33(%rdi)
+L(SSE3Q2):  movdqa %xmm0,-0x23(%rdi)
+L(SSE3Q1):  movdqa %xmm0,-0x13(%rdi)
+L(SSE3Q0):  mov    %dx,-0x3(%rdi)
+	mov    %dl,-0x1(%rdi)
+	retq   $0x0
+
+L(SSE4QB):  movdqa %xmm0,-0xb4(%rdi)
+L(SSE4QA):  movdqa %xmm0,-0xa4(%rdi)
+L(SSE4Q9):  movdqa %xmm0,-0x94(%rdi)
+L(SSE4Q8):  movdqa %xmm0,-0x84(%rdi)
+L(SSE4Q7):  movdqa %xmm0,-0x74(%rdi)
+L(SSE4Q6):  movdqa %xmm0,-0x64(%rdi)
+L(SSE4Q5):  movdqa %xmm0,-0x54(%rdi)
+L(SSE4Q4):  movdqa %xmm0,-0x44(%rdi)
+L(SSE4Q3):  movdqa %xmm0,-0x34(%rdi)
+L(SSE4Q2):  movdqa %xmm0,-0x24(%rdi)
+L(SSE4Q1):  movdqa %xmm0,-0x14(%rdi)
+L(SSE4Q0):  mov    %edx,-0x4(%rdi)
+	retq   $0x0
+
+L(SSE5QB):  movdqa %xmm0,-0xb5(%rdi)
+L(SSE5QA):  movdqa %xmm0,-0xa5(%rdi)
+L(SSE5Q9):  movdqa %xmm0,-0x95(%rdi)
+L(SSE5Q8):  movdqa %xmm0,-0x85(%rdi)
+L(SSE5Q7):  movdqa %xmm0,-0x75(%rdi)
+L(SSE5Q6):  movdqa %xmm0,-0x65(%rdi)
+L(SSE5Q5):  movdqa %xmm0,-0x55(%rdi)
+L(SSE5Q4):  movdqa %xmm0,-0x45(%rdi)
+L(SSE5Q3):  movdqa %xmm0,-0x35(%rdi)
+L(SSE5Q2):  movdqa %xmm0,-0x25(%rdi)
+L(SSE5Q1):  movdqa %xmm0,-0x15(%rdi)
+L(SSE5Q0):  mov    %edx,-0x5(%rdi)
+	mov    %dl,-0x1(%rdi)
+	retq   $0x0
+
+
+L(SSE6QB):  movdqa %xmm0,-0xb6(%rdi)
+L(SSE6QA):  movdqa %xmm0,-0xa6(%rdi)
+L(SSE6Q9):  movdqa %xmm0,-0x96(%rdi)
+L(SSE6Q8):  movdqa %xmm0,-0x86(%rdi)
+L(SSE6Q7):  movdqa %xmm0,-0x76(%rdi)
+L(SSE6Q6):  movdqa %xmm0,-0x66(%rdi)
+L(SSE6Q5):  movdqa %xmm0,-0x56(%rdi)
+L(SSE6Q4):  movdqa %xmm0,-0x46(%rdi)
+L(SSE6Q3):  movdqa %xmm0,-0x36(%rdi)
+L(SSE6Q2):  movdqa %xmm0,-0x26(%rdi)
+L(SSE6Q1):  movdqa %xmm0,-0x16(%rdi)
+L(SSE6Q0):  mov    %edx,-0x6(%rdi)
+	mov    %dx,-0x2(%rdi)
+	retq   $0x0
+
+L(SSE7QB):  movdqa %xmm0,-0xb7(%rdi)
+L(SSE7QA):  movdqa %xmm0,-0xa7(%rdi)
+L(SSE7Q9):  movdqa %xmm0,-0x97(%rdi)
+L(SSE7Q8):  movdqa %xmm0,-0x87(%rdi)
+L(SSE7Q7):  movdqa %xmm0,-0x77(%rdi)
+L(SSE7Q6):  movdqa %xmm0,-0x67(%rdi)
+L(SSE7Q5):  movdqa %xmm0,-0x57(%rdi)
+L(SSE7Q4):  movdqa %xmm0,-0x47(%rdi)
+L(SSE7Q3):  movdqa %xmm0,-0x37(%rdi)
+L(SSE7Q2):  movdqa %xmm0,-0x27(%rdi)
+L(SSE7Q1):  movdqa %xmm0,-0x17(%rdi)
+L(SSE7Q0):  mov    %edx,-0x7(%rdi)
+	mov    %dx,-0x3(%rdi)
+	mov    %dl,-0x1(%rdi)
+	retq   $0x0
+
+L(SSE8QB):  movdqa %xmm0,-0xb8(%rdi)
+L(SSE8QA):  movdqa %xmm0,-0xa8(%rdi)
+L(SSE8Q9):  movdqa %xmm0,-0x98(%rdi)
+L(SSE8Q8):  movdqa %xmm0,-0x88(%rdi)
+L(SSE8Q7):  movdqa %xmm0,-0x78(%rdi)
+L(SSE8Q6):  movdqa %xmm0,-0x68(%rdi)
+L(SSE8Q5):  movdqa %xmm0,-0x58(%rdi)
+L(SSE8Q4):  movdqa %xmm0,-0x48(%rdi)
+L(SSE8Q3):  movdqa %xmm0,-0x38(%rdi)
+L(SSE8Q2):  movdqa %xmm0,-0x28(%rdi)
+L(SSE8Q1):  movdqa %xmm0,-0x18(%rdi)
+L(SSE8Q0):  mov    %rdx,-0x8(%rdi)
+	retq   $0x0
+
+L(SSE9QB):  movdqa %xmm0,-0xb9(%rdi)
+L(SSE9QA):  movdqa %xmm0,-0xa9(%rdi)
+L(SSE9Q9):  movdqa %xmm0,-0x99(%rdi)
+L(SSE9Q8):  movdqa %xmm0,-0x89(%rdi)
+L(SSE9Q7):  movdqa %xmm0,-0x79(%rdi)
+L(SSE9Q6):  movdqa %xmm0,-0x69(%rdi)
+L(SSE9Q5):  movdqa %xmm0,-0x59(%rdi)
+L(SSE9Q4):  movdqa %xmm0,-0x49(%rdi)
+L(SSE9Q3):  movdqa %xmm0,-0x39(%rdi)
+L(SSE9Q2):  movdqa %xmm0,-0x29(%rdi)
+L(SSE9Q1):  movdqa %xmm0,-0x19(%rdi)
+L(SSE9Q0):  mov    %rdx,-0x9(%rdi)
+	mov    %dl,-0x1(%rdi)
+	retq   $0x0
+
+L(SSE10QB): movdqa %xmm0,-0xba(%rdi)
+L(SSE10QA): movdqa %xmm0,-0xaa(%rdi)
+L(SSE10Q9): movdqa %xmm0,-0x9a(%rdi)
+L(SSE10Q8): movdqa %xmm0,-0x8a(%rdi)
+L(SSE10Q7): movdqa %xmm0,-0x7a(%rdi)
+L(SSE10Q6): movdqa %xmm0,-0x6a(%rdi)
+L(SSE10Q5): movdqa %xmm0,-0x5a(%rdi)
+L(SSE10Q4): movdqa %xmm0,-0x4a(%rdi)
+L(SSE10Q3): movdqa %xmm0,-0x3a(%rdi)
+L(SSE10Q2): movdqa %xmm0,-0x2a(%rdi)
+L(SSE10Q1): movdqa %xmm0,-0x1a(%rdi)
+L(SSE10Q0): mov    %rdx,-0xa(%rdi)
+	mov    %dx,-0x2(%rdi)
+	retq   $0x0
+
+L(SSE11QB): movdqa %xmm0,-0xbb(%rdi)
+L(SSE11QA): movdqa %xmm0,-0xab(%rdi)
+L(SSE11Q9): movdqa %xmm0,-0x9b(%rdi)
+L(SSE11Q8): movdqa %xmm0,-0x8b(%rdi)
+L(SSE11Q7): movdqa %xmm0,-0x7b(%rdi)
+L(SSE11Q6): movdqa %xmm0,-0x6b(%rdi)
+L(SSE11Q5): movdqa %xmm0,-0x5b(%rdi)
+L(SSE11Q4): movdqa %xmm0,-0x4b(%rdi)
+L(SSE11Q3): movdqa %xmm0,-0x3b(%rdi)
+L(SSE11Q2): movdqa %xmm0,-0x2b(%rdi)
+L(SSE11Q1): movdqa %xmm0,-0x1b(%rdi)
+L(SSE11Q0): mov    %rdx,-0xb(%rdi)
+	mov    %dx,-0x3(%rdi)
+	mov    %dl,-0x1(%rdi)
+	retq   $0x0
+
+L(SSE12QB): movdqa %xmm0,-0xbc(%rdi)
+L(SSE12QA): movdqa %xmm0,-0xac(%rdi)
+L(SSE12Q9): movdqa %xmm0,-0x9c(%rdi)
+L(SSE12Q8): movdqa %xmm0,-0x8c(%rdi)
+L(SSE12Q7): movdqa %xmm0,-0x7c(%rdi)
+L(SSE12Q6): movdqa %xmm0,-0x6c(%rdi)
+L(SSE12Q5): movdqa %xmm0,-0x5c(%rdi)
+L(SSE12Q4): movdqa %xmm0,-0x4c(%rdi)
+L(SSE12Q3): movdqa %xmm0,-0x3c(%rdi)
+L(SSE12Q2): movdqa %xmm0,-0x2c(%rdi)
+L(SSE12Q1): movdqa %xmm0,-0x1c(%rdi)
+L(SSE12Q0): mov    %rdx,-0xc(%rdi)
+	mov    %edx,-0x4(%rdi)
+	retq   $0x0
+
+L(SSE13QB): movdqa %xmm0,-0xbd(%rdi)
+L(SSE13QA): movdqa %xmm0,-0xad(%rdi)
+L(SSE13Q9): movdqa %xmm0,-0x9d(%rdi)
+L(SSE13Q8): movdqa %xmm0,-0x8d(%rdi)
+L(SSE13Q7): movdqa %xmm0,-0x7d(%rdi)
+L(SSE13Q6): movdqa %xmm0,-0x6d(%rdi)
+L(SSE13Q5): movdqa %xmm0,-0x5d(%rdi)
+L(SSE13Q4): movdqa %xmm0,-0x4d(%rdi)
+L(SSE13Q3): movdqa %xmm0,-0x3d(%rdi)
+L(SSE13Q2): movdqa %xmm0,-0x2d(%rdi)
+L(SSE13Q1): movdqa %xmm0,-0x1d(%rdi)
+L(SSE13Q0): mov    %rdx,-0xd(%rdi)
+	mov    %edx,-0x5(%rdi)
+	mov    %dl,-0x1(%rdi)
+	retq   $0x0
+
+L(SSE14QB): movdqa %xmm0,-0xbe(%rdi)
+L(SSE14QA): movdqa %xmm0,-0xae(%rdi)
+L(SSE14Q9): movdqa %xmm0,-0x9e(%rdi)
+L(SSE14Q8): movdqa %xmm0,-0x8e(%rdi)
+L(SSE14Q7): movdqa %xmm0,-0x7e(%rdi)
+L(SSE14Q6): movdqa %xmm0,-0x6e(%rdi)
+L(SSE14Q5): movdqa %xmm0,-0x5e(%rdi)
+L(SSE14Q4): movdqa %xmm0,-0x4e(%rdi)
+L(SSE14Q3): movdqa %xmm0,-0x3e(%rdi)
+L(SSE14Q2): movdqa %xmm0,-0x2e(%rdi)
+L(SSE14Q1): movdqa %xmm0,-0x1e(%rdi)
+L(SSE14Q0): mov    %rdx,-0xe(%rdi)
+	mov    %edx,-0x6(%rdi)
+	mov    %dx,-0x2(%rdi)
+	retq   $0x0
+
+L(SSE15QB): movdqa %xmm0,-0xbf(%rdi)
+L(SSE15QA): movdqa %xmm0,-0xaf(%rdi)
+L(SSE15Q9): movdqa %xmm0,-0x9f(%rdi)
+L(SSE15Q8): movdqa %xmm0,-0x8f(%rdi)
+L(SSE15Q7): movdqa %xmm0,-0x7f(%rdi)
+L(SSE15Q6): movdqa %xmm0,-0x6f(%rdi)
+L(SSE15Q5): movdqa %xmm0,-0x5f(%rdi)
+L(SSE15Q4): movdqa %xmm0,-0x4f(%rdi)
+L(SSE15Q3): movdqa %xmm0,-0x3f(%rdi)
+L(SSE15Q2): movdqa %xmm0,-0x2f(%rdi)
+L(SSE15Q1): movdqa %xmm0,-0x1f(%rdi)
+L(SSE15Q0): mov    %rdx,-0xf(%rdi)
+	mov    %edx,-0x7(%rdi)
+	mov    %dx,-0x3(%rdi)
+	mov    %dl,-0x1(%rdi)
+	retq   $0x0
+
+	.balign     16
+L(byte32sse2_pre):
+
+	mov    __x86_64_shared_cache_size(%rip),%r9d  # The largest cache size
+	cmp    %r9,%r8
+	jg     L(sse2_nt_move_pre)
+	#jmp    L(byte32sse2)
+	.balign     16
+L(byte32sse2):
+	lea    -0x80(%r8),%r8 # 128
+	cmp    $0x80,%r8   # 128
+	movdqa %xmm0,(%rdi)
+	movdqa %xmm0,0x10(%rdi)
+	movdqa %xmm0,0x20(%rdi)
+	movdqa %xmm0,0x30(%rdi)
+	movdqa %xmm0,0x40(%rdi)
+	movdqa %xmm0,0x50(%rdi)
+	movdqa %xmm0,0x60(%rdi)
+	movdqa %xmm0,0x70(%rdi)
+
+	lea    0x80(%rdi),%rdi
+	jge    L(byte32sse2)
+	lea    L(SSExDx)(%rip),%r11
+	add    %r8,%rdi
+#ifndef PIC
+	jmpq   *(%r11,%r8,8)
+#else
+	movslq    (%r11,%r8,4),%rcx
+	lea   (%rcx,%r11,1),%r11
+	jmpq   *%r11
+#endif
+
+	.balign     16
+L(sse2_nt_move_pre):
+	cmp    $0x0,%r9
+	je     L(byte32sse2)
+	jmp    L(sse2_nt_move)
+
+	.balign     16
+L(sse2_nt_move):
+	lea    -0x80(%r8),%r8
+	cmp    $0x80,%r8
+
+	movntdq %xmm0,(%rdi)
+	movntdq %xmm0,0x10(%rdi)
+	movntdq %xmm0,0x20(%rdi)
+	movntdq %xmm0,0x30(%rdi)
+	movntdq %xmm0,0x40(%rdi)
+	movntdq %xmm0,0x50(%rdi)
+	movntdq %xmm0,0x60(%rdi)
+	movntdq %xmm0,0x70(%rdi)
+
+	lea    0x80(%rdi),%rdi
+	jge    L(sse2_nt_move)
+	lea    L(SSExDx)(%rip),%r11
+	sfence
+	add    %r8,%rdi
+#ifndef PIC
+	jmpq   *(%r11,%r8,8)
+#else
+	movslq (%r11,%r8,4),%rcx
+	lea   (%rcx,%r11,1),%r11
+	jmpq   *%r11
+#endif
+
+	.pushsection .rodata
+	.balign     16
+#ifndef PIC
+L(SSExDx):
+	.quad       L(SSE0Q0), L(SSE1Q0), L(SSE2Q0), L(SSE3Q0)
+	.quad       L(SSE4Q0), L(SSE5Q0), L(SSE6Q0), L(SSE7Q0)
+	.quad       L(SSE8Q0), L(SSE9Q0), L(SSE10Q0), L(SSE11Q0)
+	.quad       L(SSE12Q0), L(SSE13Q0), L(SSE14Q0), L(SSE15Q0)
+	.quad       L(SSE0Q1), L(SSE1Q1), L(SSE2Q1), L(SSE3Q1)
+	.quad       L(SSE4Q1), L(SSE5Q1), L(SSE6Q1), L(SSE7Q1)
+	.quad       L(SSE8Q1), L(SSE9Q1), L(SSE10Q1), L(SSE11Q1)
+	.quad       L(SSE12Q1), L(SSE13Q1), L(SSE14Q1), L(SSE15Q1)
+	.quad       L(SSE0Q2), L(SSE1Q2), L(SSE2Q2), L(SSE3Q2)
+	.quad       L(SSE4Q2), L(SSE5Q2), L(SSE6Q2), L(SSE7Q2)
+	.quad       L(SSE8Q2), L(SSE9Q2), L(SSE10Q2), L(SSE11Q2)
+	.quad       L(SSE12Q2), L(SSE13Q2), L(SSE14Q2), L(SSE15Q2)
+	.quad       L(SSE0Q3), L(SSE1Q3), L(SSE2Q3), L(SSE3Q3)
+	.quad       L(SSE4Q3), L(SSE5Q3), L(SSE6Q3), L(SSE7Q3)
+	.quad       L(SSE8Q3), L(SSE9Q3), L(SSE10Q3), L(SSE11Q3)
+	.quad       L(SSE12Q3), L(SSE13Q3), L(SSE14Q3), L(SSE15Q3)
+	.quad       L(SSE0Q4), L(SSE1Q4), L(SSE2Q4), L(SSE3Q4)
+	.quad       L(SSE4Q4), L(SSE5Q4), L(SSE6Q4), L(SSE7Q4)
+	.quad       L(SSE8Q4), L(SSE9Q4), L(SSE10Q4), L(SSE11Q4)
+	.quad       L(SSE12Q4), L(SSE13Q4), L(SSE14Q4), L(SSE15Q4)
+	.quad       L(SSE0Q5), L(SSE1Q5), L(SSE2Q5), L(SSE3Q5)
+	.quad       L(SSE4Q5), L(SSE5Q5), L(SSE6Q5), L(SSE7Q5)
+	.quad       L(SSE8Q5), L(SSE9Q5), L(SSE10Q5), L(SSE11Q5)
+	.quad       L(SSE12Q5), L(SSE13Q5), L(SSE14Q5), L(SSE15Q5)
+	.quad       L(SSE0Q6), L(SSE1Q6), L(SSE2Q6), L(SSE3Q6)
+	.quad       L(SSE4Q6), L(SSE5Q6), L(SSE6Q6), L(SSE7Q6)
+	.quad       L(SSE8Q6), L(SSE9Q6), L(SSE10Q6), L(SSE11Q6)
+	.quad       L(SSE12Q6), L(SSE13Q6), L(SSE14Q6), L(SSE15Q6)
+	.quad       L(SSE0Q7), L(SSE1Q7), L(SSE2Q7), L(SSE3Q7)
+	.quad       L(SSE4Q7), L(SSE5Q7), L(SSE6Q7), L(SSE7Q7)
+	.quad       L(SSE8Q7), L(SSE9Q7), L(SSE10Q7), L(SSE11Q7)
+	.quad       L(SSE12Q7), L(SSE13Q7), L(SSE14Q7), L(SSE15Q7)
+	.quad       L(SSE0Q8), L(SSE1Q8), L(SSE2Q8), L(SSE3Q8)
+	.quad       L(SSE4Q8), L(SSE5Q8), L(SSE6Q8), L(SSE7Q8)
+	.quad       L(SSE8Q8), L(SSE9Q8), L(SSE10Q8), L(SSE11Q8)
+	.quad       L(SSE12Q8), L(SSE13Q8), L(SSE14Q8), L(SSE15Q8)
+	.quad       L(SSE0Q9), L(SSE1Q9), L(SSE2Q9), L(SSE3Q9)
+	.quad       L(SSE4Q9), L(SSE5Q9), L(SSE6Q9), L(SSE7Q9)
+	.quad       L(SSE8Q9), L(SSE9Q9), L(SSE10Q9), L(SSE11Q9)
+	.quad       L(SSE12Q9), L(SSE13Q9), L(SSE14Q9), L(SSE15Q9)
+	.quad       L(SSE0QA), L(SSE1QA), L(SSE2QA), L(SSE3QA)
+	.quad       L(SSE4QA), L(SSE5QA), L(SSE6QA), L(SSE7QA)
+	.quad       L(SSE8QA), L(SSE9QA), L(SSE10QA), L(SSE11QA)
+	.quad       L(SSE12QA), L(SSE13QA), L(SSE14QA), L(SSE15QA)
+	.quad       L(SSE0QB), L(SSE1QB), L(SSE2QB), L(SSE3QB)
+	.quad       L(SSE4QB), L(SSE5QB), L(SSE6QB), L(SSE7QB)
+	.quad       L(SSE8QB), L(SSE9QB), L(SSE10QB), L(SSE11QB)
+	.quad       L(SSE12QB), L(SSE13QB), L(SSE14QB), L(SSE15QB)
+#else
+L(SSExDx):
+	.int       L(SSE0Q0) -L(SSExDx)
+	.int       L(SSE1Q0) -L(SSExDx)
+	.int       L(SSE2Q0) -L(SSExDx)
+	.int       L(SSE3Q0) -L(SSExDx)
+	.int       L(SSE4Q0) -L(SSExDx)
+	.int       L(SSE5Q0) -L(SSExDx)
+	.int       L(SSE6Q0) -L(SSExDx)
+	.int       L(SSE7Q0) -L(SSExDx)
+
+	.int       L(SSE8Q0) -L(SSExDx)
+	.int       L(SSE9Q0) -L(SSExDx)
+	.int       L(SSE10Q0)-L(SSExDx)
+	.int       L(SSE11Q0)-L(SSExDx)
+	.int       L(SSE12Q0)-L(SSExDx)
+	.int       L(SSE13Q0)-L(SSExDx)
+	.int       L(SSE14Q0)-L(SSExDx)
+	.int       L(SSE15Q0)-L(SSExDx)
+
+	.int       L(SSE0Q1) -L(SSExDx)
+	.int       L(SSE1Q1) -L(SSExDx)
+	.int       L(SSE2Q1) -L(SSExDx)
+	.int       L(SSE3Q1) -L(SSExDx)
+	.int       L(SSE4Q1) -L(SSExDx)
+	.int       L(SSE5Q1) -L(SSExDx)
+	.int       L(SSE6Q1) -L(SSExDx)
+	.int       L(SSE7Q1) -L(SSExDx)
+
+	.int       L(SSE8Q1) -L(SSExDx)
+	.int       L(SSE9Q1) -L(SSExDx)
+	.int       L(SSE10Q1)-L(SSExDx)
+	.int       L(SSE11Q1)-L(SSExDx)
+	.int       L(SSE12Q1)-L(SSExDx)
+	.int       L(SSE13Q1)-L(SSExDx)
+	.int       L(SSE14Q1)-L(SSExDx)
+	.int       L(SSE15Q1)-L(SSExDx)
+
+	.int       L(SSE0Q2) -L(SSExDx)
+	.int       L(SSE1Q2) -L(SSExDx)
+	.int       L(SSE2Q2) -L(SSExDx)
+	.int       L(SSE3Q2) -L(SSExDx)
+	.int       L(SSE4Q2) -L(SSExDx)
+	.int       L(SSE5Q2) -L(SSExDx)
+	.int       L(SSE6Q2) -L(SSExDx)
+	.int       L(SSE7Q2) -L(SSExDx)
+
+	.int       L(SSE8Q2) -L(SSExDx)
+	.int       L(SSE9Q2) -L(SSExDx)
+	.int       L(SSE10Q2)-L(SSExDx)
+	.int       L(SSE11Q2)-L(SSExDx)
+	.int       L(SSE12Q2)-L(SSExDx)
+	.int       L(SSE13Q2)-L(SSExDx)
+	.int       L(SSE14Q2)-L(SSExDx)
+	.int       L(SSE15Q2)-L(SSExDx)
+
+	.int       L(SSE0Q3) -L(SSExDx)
+	.int       L(SSE1Q3) -L(SSExDx)
+	.int       L(SSE2Q3) -L(SSExDx)
+	.int       L(SSE3Q3) -L(SSExDx)
+	.int       L(SSE4Q3) -L(SSExDx)
+	.int       L(SSE5Q3) -L(SSExDx)
+	.int       L(SSE6Q3) -L(SSExDx)
+	.int       L(SSE7Q3) -L(SSExDx)
+
+	.int       L(SSE8Q3) -L(SSExDx)
+	.int       L(SSE9Q3) -L(SSExDx)
+	.int       L(SSE10Q3)-L(SSExDx)
+	.int       L(SSE11Q3)-L(SSExDx)
+	.int       L(SSE12Q3)-L(SSExDx)
+	.int       L(SSE13Q3)-L(SSExDx)
+	.int       L(SSE14Q3)-L(SSExDx)
+	.int       L(SSE15Q3)-L(SSExDx)
+
+	.int       L(SSE0Q4) -L(SSExDx)
+	.int       L(SSE1Q4) -L(SSExDx)
+	.int       L(SSE2Q4) -L(SSExDx)
+	.int       L(SSE3Q4) -L(SSExDx)
+	.int       L(SSE4Q4) -L(SSExDx)
+	.int       L(SSE5Q4) -L(SSExDx)
+	.int       L(SSE6Q4) -L(SSExDx)
+	.int       L(SSE7Q4) -L(SSExDx)
+
+	.int       L(SSE8Q4) -L(SSExDx)
+	.int       L(SSE9Q4) -L(SSExDx)
+	.int       L(SSE10Q4)-L(SSExDx)
+	.int       L(SSE11Q4)-L(SSExDx)
+	.int       L(SSE12Q4)-L(SSExDx)
+	.int       L(SSE13Q4)-L(SSExDx)
+	.int       L(SSE14Q4)-L(SSExDx)
+	.int       L(SSE15Q4)-L(SSExDx)
+
+	.int       L(SSE0Q5) -L(SSExDx)
+	.int       L(SSE1Q5) -L(SSExDx)
+	.int       L(SSE2Q5) -L(SSExDx)
+	.int       L(SSE3Q5) -L(SSExDx)
+	.int       L(SSE4Q5) -L(SSExDx)
+	.int       L(SSE5Q5) -L(SSExDx)
+	.int       L(SSE6Q5) -L(SSExDx)
+	.int       L(SSE7Q5) -L(SSExDx)
+
+	.int       L(SSE8Q5) -L(SSExDx)
+	.int       L(SSE9Q5) -L(SSExDx)
+	.int       L(SSE10Q5)-L(SSExDx)
+	.int       L(SSE11Q5)-L(SSExDx)
+	.int       L(SSE12Q5)-L(SSExDx)
+	.int       L(SSE13Q5)-L(SSExDx)
+	.int       L(SSE14Q5)-L(SSExDx)
+	.int       L(SSE15Q5)-L(SSExDx)
+
+	.int       L(SSE0Q6) -L(SSExDx)
+	.int       L(SSE1Q6) -L(SSExDx)
+	.int       L(SSE2Q6) -L(SSExDx)
+	.int       L(SSE3Q6) -L(SSExDx)
+	.int       L(SSE4Q6) -L(SSExDx)
+	.int       L(SSE5Q6) -L(SSExDx)
+	.int       L(SSE6Q6) -L(SSExDx)
+	.int       L(SSE7Q6) -L(SSExDx)
+
+	.int       L(SSE8Q6) -L(SSExDx)
+	.int       L(SSE9Q6) -L(SSExDx)
+	.int       L(SSE10Q6)-L(SSExDx)
+	.int       L(SSE11Q6)-L(SSExDx)
+	.int       L(SSE12Q6)-L(SSExDx)
+	.int       L(SSE13Q6)-L(SSExDx)
+	.int       L(SSE14Q6)-L(SSExDx)
+	.int       L(SSE15Q6)-L(SSExDx)
+
+	.int       L(SSE0Q7) -L(SSExDx)
+	.int       L(SSE1Q7) -L(SSExDx)
+	.int       L(SSE2Q7) -L(SSExDx)
+	.int       L(SSE3Q7) -L(SSExDx)
+	.int       L(SSE4Q7) -L(SSExDx)
+	.int       L(SSE5Q7) -L(SSExDx)
+	.int       L(SSE6Q7) -L(SSExDx)
+	.int       L(SSE7Q7) -L(SSExDx)
+
+	.int       L(SSE8Q7) -L(SSExDx)
+	.int       L(SSE9Q7) -L(SSExDx)
+	.int       L(SSE10Q7)-L(SSExDx)
+	.int       L(SSE11Q7)-L(SSExDx)
+	.int       L(SSE12Q7)-L(SSExDx)
+	.int       L(SSE13Q7)-L(SSExDx)
+	.int       L(SSE14Q7)-L(SSExDx)
+	.int       L(SSE15Q7)-L(SSExDx)
+
+	.int       L(SSE0Q8) -L(SSExDx)
+	.int       L(SSE1Q8) -L(SSExDx)
+	.int       L(SSE2Q8) -L(SSExDx)
+	.int       L(SSE3Q8) -L(SSExDx)
+	.int       L(SSE4Q8) -L(SSExDx)
+	.int       L(SSE5Q8) -L(SSExDx)
+	.int       L(SSE6Q8) -L(SSExDx)
+	.int       L(SSE7Q8) -L(SSExDx)
+
+	.int       L(SSE8Q8) -L(SSExDx)
+	.int       L(SSE9Q8) -L(SSExDx)
+	.int       L(SSE10Q8)-L(SSExDx)
+	.int       L(SSE11Q8)-L(SSExDx)
+	.int       L(SSE12Q8)-L(SSExDx)
+	.int       L(SSE13Q8)-L(SSExDx)
+	.int       L(SSE14Q8)-L(SSExDx)
+	.int       L(SSE15Q8)-L(SSExDx)
+
+	.int       L(SSE0Q9) -L(SSExDx)
+	.int       L(SSE1Q9) -L(SSExDx)
+	.int       L(SSE2Q9) -L(SSExDx)
+	.int       L(SSE3Q9) -L(SSExDx)
+	.int       L(SSE4Q9) -L(SSExDx)
+	.int       L(SSE5Q9) -L(SSExDx)
+	.int       L(SSE6Q9) -L(SSExDx)
+	.int       L(SSE7Q9) -L(SSExDx)
+
+	.int       L(SSE8Q9) -L(SSExDx)
+	.int       L(SSE9Q9) -L(SSExDx)
+	.int       L(SSE10Q9)-L(SSExDx)
+	.int       L(SSE11Q9)-L(SSExDx)
+	.int       L(SSE12Q9)-L(SSExDx)
+	.int       L(SSE13Q9)-L(SSExDx)
+	.int       L(SSE14Q9)-L(SSExDx)
+	.int       L(SSE15Q9)-L(SSExDx)
+
+	.int       L(SSE0QA) -L(SSExDx)
+	.int       L(SSE1QA) -L(SSExDx)
+	.int       L(SSE2QA) -L(SSExDx)
+	.int       L(SSE3QA) -L(SSExDx)
+	.int       L(SSE4QA) -L(SSExDx)
+	.int       L(SSE5QA) -L(SSExDx)
+	.int       L(SSE6QA) -L(SSExDx)
+	.int       L(SSE7QA) -L(SSExDx)
+
+	.int       L(SSE8QA) -L(SSExDx)
+	.int       L(SSE9QA) -L(SSExDx)
+	.int       L(SSE10QA)-L(SSExDx)
+	.int       L(SSE11QA)-L(SSExDx)
+	.int       L(SSE12QA)-L(SSExDx)
+	.int       L(SSE13QA)-L(SSExDx)
+	.int       L(SSE14QA)-L(SSExDx)
+	.int       L(SSE15QA)-L(SSExDx)
+
+	.int       L(SSE0QB) -L(SSExDx)
+	.int       L(SSE1QB) -L(SSExDx)
+	.int       L(SSE2QB) -L(SSExDx)
+	.int       L(SSE3QB) -L(SSExDx)
+	.int       L(SSE4QB) -L(SSExDx)
+	.int       L(SSE5QB) -L(SSExDx)
+	.int       L(SSE6QB) -L(SSExDx)
+	.int       L(SSE7QB) -L(SSExDx)
+
+	.int       L(SSE8QB) -L(SSExDx)
+	.int       L(SSE9QB) -L(SSExDx)
+	.int       L(SSE10QB)-L(SSExDx)
+	.int       L(SSE11QB)-L(SSExDx)
+	.int       L(SSE12QB)-L(SSExDx)
+	.int       L(SSE13QB)-L(SSExDx)
+	.int       L(SSE14QB)-L(SSExDx)
+	.int       L(SSE15QB)-L(SSExDx)
+#endif
+	.popsection
 
 END (memset)
 libc_hidden_builtin_def (memset)

