This is the mail archive of the
libc-alpha@sourceware.org
mailing list for the glibc project.
Re: PATCH: Optimized memset for x86-64
- From: Harsha Jagasia <harsha dot jagasia at amd dot com>
- To: libc-alpha at sourceware dot org, drepper at redhat dot com, hjl at lucon dot org
- Cc: Harsha Jagasia <harsha dot jagasia at amd dot com>
- Date: 28 Feb 2008 17:17:58 -0600
- Subject: Re: PATCH: Optimized memset for x86-64
This is a rewrite of the non-SSE code path in x86-64 memset to improve
performance on AMD processors. It uses the "rep stos" instruction for blocks
between 8KB and 64KB, which improves performance on AMD Barcelona by roughly
7% for most block sizes in that range. Another improvement applies to blocks
larger than the largest shared cache size: the sub-block up to the largest
shared cache size is set with "rep stos" and the remaining sub-block is set
with "movnti". This improves performance by up to 50% for blocks in that
range. The patch includes H.J.'s rewrite and the changes made by Ulrich and
Jakub since then.
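For reference, the block-size dispatch just described can be summarized by
the following C sketch. It is illustrative only: the helper names are made
up here, and the real implementation is the assembly in the memset.S hunk
below (where the 64KB-to-cache-size range falls back to plain 8-byte
stores).

#include <stddef.h>
#include <stdint.h>

#define STOS_LOWER_BOUNDARY  8192   /* 8KB  */
#define STOS_UPPER_BOUNDARY  65536  /* 64KB */

/* Set by init_cacheinfo in cacheinfo.c (see the first hunk below).  */
extern long __x86_64_shared_cache_size;

/* Hypothetical helpers standing in for the assembly loops in the patch.  */
extern void fill_with_8byte_moves (void *dst, uint64_t pattern, size_t n);
extern void fill_with_rep_stos (void *dst, uint64_t pattern, size_t n);
extern void fill_with_movnti (void *dst, uint64_t pattern, size_t n);

void
choose_fill_strategy (void *dst, uint64_t pattern, size_t n)
{
  if (n < STOS_LOWER_BOUNDARY)
    /* Small blocks: plain 8-byte integer stores.  */
    fill_with_8byte_moves (dst, pattern, n);
  else if (n < STOS_UPPER_BOUNDARY)
    /* 8KB..64KB: "rep stos" is the win on Barcelona.  */
    fill_with_rep_stos (dst, pattern, n);
  else if (n <= (size_t) __x86_64_shared_cache_size)
    /* 64KB up to the largest shared cache: 8-byte stores again.  */
    fill_with_8byte_moves (dst, pattern, n);
  else
    {
      /* Larger than the largest shared cache: a cache-sized sub-block
         with "rep stos", the rest with non-temporal "movnti" stores.  */
      size_t head = (size_t) __x86_64_shared_cache_size;
      fill_with_rep_stos (dst, pattern, head);
      fill_with_movnti ((char *) dst + head, pattern, n - head);
    }
}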
We are accepting the code changes made by H.J. with respect to the alignment
boundary and the prologue for blocks larger than 144 bytes. We are also
accepting the code changes made by H.J. for blocks of 144 bytes or less; that
code is reused as the epilogue for blocks larger than 144 bytes (sketched in C
below). These code changes result in performance losses on an AMD Barcelona
machine in synthetic tests such as the glibc tests and some home-grown tests,
relative to the submission made by AMD at
http://sources.redhat.com/ml/libc-alpha/2007-08/msg00054.html. In spite of the
losses seen in synthetic tests, there are also some performance gains, and
overall the changes have been observed to be neutral on the workloads we have
measured. Hence we are accepting H.J.'s changes.
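As context for the 144-byte (0x90) threshold mentioned above: the shared
small-block/epilogue code stores the fill pattern backwards from the end of
the region, and the patch picks the right straight-line store sequence with a
single indirect jump through the L(setPxQx) table rather than looping. A
rough C equivalent of what one chain of table entries does (illustrative
only, not part of the patch):

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Set a tail of 0..144 bytes ending at END with the 8-byte replicated
   PATTERN: full 8-byte stores from the front of the tail, then at most a
   4-, 2- and 1-byte store at the very end.  */
static void
set_tail (unsigned char *end, uint64_t pattern, size_t n)
{
  size_t left = n;

  /* Quadword stores, addressed back from the end of the region.  */
  while (left >= 8)
    {
      memcpy (end - left, &pattern, 8);
      left -= 8;
    }

  /* At most 7 bytes remain.  */
  if (left >= 4)
    {
      memcpy (end - left, &pattern, 4);
      left -= 4;
    }
  if (left >= 2)
    {
      memcpy (end - left, &pattern, 2);
      left -= 2;
    }
  if (left == 1)
    *(end - 1) = (unsigned char) pattern;
}

The same table entries serve both the short-length path and the epilogues
after the 8-byte, "rep stos" and "movnti" bulk loops, which is why the
length-based indirect jump appears several times in the assembly below.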
It would be great if Ulrich could review. All feedback is welcome.
Harsha
-------
2008-02-26  Harsha Jagasia  <harsha.jagasia@amd.com>
* sysdeps/x86_64/cacheinfo.c (NOT_USED_RIGHT_NOW): Remove ifdef guards.
* sysdeps/x86_64/memset.S: Rewrite the non-SSE code path, tuned for the
AMD Barcelona machine. Make the default fall-through branch of the
__x86_64_preferred_memory_instruction check the integer code path.
2007-10-15  H.J. Lu  <hongjiu.lu@intel.com>
* sysdeps/x86_64/cacheinfo.c
(__x86_64_preferred_memory_instruction): New.
(init_cacheinfo): Initialize __x86_64_preferred_memory_instruction.
* sysdeps/x86_64/memset.S: Rewrite.
Index: sysdeps/x86_64/cacheinfo.c
===================================================================
RCS file: /cvs/glibc/libc/sysdeps/x86_64/cacheinfo.c,v
retrieving revision 1.10
diff -d -u -p -r1.10 cacheinfo.c
--- sysdeps/x86_64/cacheinfo.c 23 Dec 2007 19:32:28 -0000 1.10
+++ sysdeps/x86_64/cacheinfo.c 26 Feb 2008 18:03:19 -0000
@@ -405,13 +405,10 @@ long int __x86_64_data_cache_size_half a
/* Shared cache size for use in memory and string routines, typically
L2 or L3 size. */
long int __x86_64_shared_cache_size_half attribute_hidden = 1024 * 1024 / 2;
-#ifdef NOT_USED_RIGHT_NOW
long int __x86_64_shared_cache_size attribute_hidden = 1024 * 1024;
-#endif
/* PREFETCHW support flag for use in memory and string routines. */
int __x86_64_prefetchw attribute_hidden;
-#ifdef NOT_USED_RIGHT_NOW
/* Instructions preferred for memory and string routines.
0: Regular instructions
@@ -421,7 +418,6 @@ int __x86_64_prefetchw attribute_hidden;
*/
int __x86_64_preferred_memory_instruction attribute_hidden;
-#endif
static void
@@ -464,14 +460,12 @@ init_cacheinfo (void)
: "=a" (eax), "=b" (ebx), "=c" (ecx), "=d" (edx)
: "0" (1));
-#ifdef NOT_USED_RIGHT_NOW
/* Intel prefers SSSE3 instructions for memory/string rountines
if they are avaiable. */
if ((ecx & 0x200))
__x86_64_preferred_memory_instruction = 3;
else
__x86_64_preferred_memory_instruction = 2;
-#endif
/* Figure out the number of logical threads that share the
highest cache level. */
@@ -577,8 +571,6 @@ init_cacheinfo (void)
if (shared > 0)
{
__x86_64_shared_cache_size_half = shared / 2;
-#ifdef NOT_USED_RIGHT_NOW
__x86_64_shared_cache_size = shared;
-#endif
}
}
Index: sysdeps/x86_64/memset.S
===================================================================
RCS file: /cvs/glibc/libc/sysdeps/x86_64/memset.S,v
retrieving revision 1.12
diff -d -u -p -r1.12 memset.S
--- sysdeps/x86_64/memset.S 8 Nov 2007 01:06:30 -0000 1.12
+++ sysdeps/x86_64/memset.S 26 Feb 2008 18:03:19 -0000
@@ -2,7 +2,6 @@
Optimized version for x86-64.
Copyright (C) 2002, 2003, 2004, 2005, 2007 Free Software Foundation, Inc.
This file is part of the GNU C Library.
- Contributed by Andreas Jaeger <aj@suse.de>.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
@@ -20,16 +19,12 @@
02111-1307 USA. */
#include <sysdep.h>
-#include "asm-syntax.h"
-#include "bp-sym.h"
-#include "bp-asm.h"
-/* This is somehow experimental and could made dependend on the cache
- size. */
-#define LARGE $120000
+#define __STOS_LOWER_BOUNDARY $8192
+#define __STOS_UPPER_BOUNDARY $65536
.text
-#ifndef NOT_IN_libc
+#if !defined NOT_IN_libc
ENTRY(__bzero)
mov %rsi,%rdx /* Adjust parameter. */
xorl %esi,%esi /* Fill with 0s. */
@@ -46,89 +41,1291 @@ END (__memset_chk)
#endif
ENTRY (memset)
L(memset_entry):
- cmp $0x7,%rdx /* Check for small length. */
- mov %rdi,%rcx /* Save ptr as return value. */
- jbe 7f
+ cmp $0x1,%rdx
+ mov %rdi,%rax /* memset returns the dest address. */
+ jne L(ck2)
+ mov %sil,(%rdi)
+ retq $0x0
+L(ck2):
+ mov $0x101010101010101,%r9
+ mov %rdx,%r8
+ movzbq %sil,%rdx
+ imul %r9,%rdx
+L(now_dw_aligned):
+ cmp $0x90,%r8
+ jg L(ck_mem_ops_method)
+L(now_dw_aligned_small):
+ lea L(setPxQx)(%rip),%r11
+ add %r8,%rdi
+#ifndef PIC
+ jmpq *(%r11,%r8,8)
+#else
+ movslq (%r11,%r8,4),%rcx
+ lea (%rcx,%r11,1),%r11
+ jmpq *%r11
+#endif
- /* Populate 8 bit data to full 64-bit. */
- movabs $0x0101010101010101,%r8
- movzbl %sil,%eax
- imul %rax,%r8
- test $0x7,%edi /* Check for alignment. */
- je 2f
+L(Got0):
+ retq $0x0
- .p2align 4
-1: /* Align ptr to 8 byte. */
- mov %sil,(%rcx)
- dec %rdx
- inc %rcx
- test $0x7,%ecx
- jne 1b
+ .pushsection .rodata
+ .balign 16
+#ifndef PIC
+L(setPxQx):
+ .quad L(Got0), L(P1Q0), L(P2Q0), L(P3Q0)
+ .quad L(P4Q0), L(P5Q0), L(P6Q0), L(P7Q0)
+ .quad L(P0Q1), L(P1Q1), L(P2Q1), L(P3Q1)
+ .quad L(P4Q1), L(P5Q1), L(P6Q1), L(P7Q1)
+ .quad L(P0Q2), L(P1Q2), L(P2Q2), L(P3Q2)
+ .quad L(P4Q2), L(P5Q2), L(P6Q2), L(P7Q2)
+ .quad L(P0Q3), L(P1Q3), L(P2Q3), L(P3Q3)
+ .quad L(P4Q3), L(P5Q3), L(P6Q3), L(P7Q3)
+ .quad L(P0Q4), L(P1Q4), L(P2Q4), L(P3Q4)
+ .quad L(P4Q4), L(P5Q4), L(P6Q4), L(P7Q4)
+ .quad L(P0Q5), L(P1Q5), L(P2Q5), L(P3Q5)
+ .quad L(P4Q5), L(P5Q5), L(P6Q5), L(P7Q5)
+ .quad L(P0Q6), L(P1Q6), L(P2Q6), L(P3Q6)
+ .quad L(P4Q6), L(P5Q6), L(P6Q6), L(P7Q6)
+ .quad L(P0Q7), L(P1Q7), L(P2Q7), L(P3Q7)
+ .quad L(P4Q7), L(P5Q7), L(P6Q7), L(P7Q7)
+ .quad L(P0Q8), L(P1Q8), L(P2Q8), L(P3Q8)
+ .quad L(P4Q8), L(P5Q8), L(P6Q8), L(P7Q8)
+ .quad L(P0Q9), L(P1Q9), L(P2Q9), L(P3Q9)
+ .quad L(P4Q9), L(P5Q9), L(P6Q9), L(P7Q9)
+ .quad L(P0QA), L(P1QA), L(P2QA), L(P3QA)
+ .quad L(P4QA), L(P5QA), L(P6QA), L(P7QA)
+ .quad L(P0QB), L(P1QB), L(P2QB), L(P3QB)
+ .quad L(P4QB), L(P5QB), L(P6QB), L(P7QB)
+ .quad L(P0QC), L(P1QC), L(P2QC), L(P3QC)
+ .quad L(P4QC), L(P5QC), L(P6QC), L(P7QC)
+ .quad L(P0QD), L(P1QD), L(P2QD), L(P3QD)
+ .quad L(P4QD), L(P5QD), L(P6QD), L(P7QD)
+ .quad L(P0QE), L(P1QE), L(P2QE), L(P3QE)
+ .quad L(P4QE), L(P5QE), L(P6QE), L(P7QE)
+ .quad L(P0QF), L(P1QF), L(P2QF), L(P3QF)
+ .quad L(P4QF), L(P5QF), L(P6QF), L(P7QF)
+ .quad L(P0QG), L(P1QG), L(P2QG), L(P3QG)
+ .quad L(P4QG), L(P5QG), L(P6QG), L(P7QG)
+ .quad L(P0QH), L(P1QH), L(P2QH), L(P3QH)
+ .quad L(P4QH), L(P5QH), L(P6QH), L(P7QH)
+ .quad L(P0QI)
+# ifdef USE_EXTRA_TABLE
+ .quad L(P1QI), L(P2QI), L(P3QI), L(P4QI)
+ .quad L(P5QI), L(P6QI), L(P7QI)
+# endif
+#else
+L(setPxQx):
+ .int L(Got0)-L(setPxQx)
+ .int L(P1Q0)-L(setPxQx)
+ .int L(P2Q0)-L(setPxQx)
+ .int L(P3Q0)-L(setPxQx)
+ .int L(P4Q0)-L(setPxQx)
+ .int L(P5Q0)-L(setPxQx)
+ .int L(P6Q0)-L(setPxQx)
+ .int L(P7Q0)-L(setPxQx)
-2: /* Check for really large regions. */
- mov %rdx,%rax
- shr $0x6,%rax
- je 4f
- cmp LARGE, %rdx
- jae 11f
+ .int L(P0Q1)-L(setPxQx)
+ .int L(P1Q1)-L(setPxQx)
+ .int L(P2Q1)-L(setPxQx)
+ .int L(P3Q1)-L(setPxQx)
+ .int L(P4Q1)-L(setPxQx)
+ .int L(P5Q1)-L(setPxQx)
+ .int L(P6Q1)-L(setPxQx)
+ .int L(P7Q1)-L(setPxQx)
- .p2align 4
-3: /* Copy 64 bytes. */
- mov %r8,(%rcx)
- mov %r8,0x8(%rcx)
- mov %r8,0x10(%rcx)
- mov %r8,0x18(%rcx)
- mov %r8,0x20(%rcx)
- mov %r8,0x28(%rcx)
- mov %r8,0x30(%rcx)
- mov %r8,0x38(%rcx)
- add $0x40,%rcx
- dec %rax
- jne 3b
+ .int L(P0Q2)-L(setPxQx)
+ .int L(P1Q2)-L(setPxQx)
+ .int L(P2Q2)-L(setPxQx)
+ .int L(P3Q2)-L(setPxQx)
+ .int L(P4Q2)-L(setPxQx)
+ .int L(P5Q2)-L(setPxQx)
+ .int L(P6Q2)-L(setPxQx)
+ .int L(P7Q2)-L(setPxQx)
-4: /* Copy final bytes. */
- and $0x3f,%edx
- mov %rdx,%rax
- shr $0x3,%rax
- je 6f
+ .int L(P0Q3)-L(setPxQx)
+ .int L(P1Q3)-L(setPxQx)
+ .int L(P2Q3)-L(setPxQx)
+ .int L(P3Q3)-L(setPxQx)
+ .int L(P4Q3)-L(setPxQx)
+ .int L(P5Q3)-L(setPxQx)
+ .int L(P6Q3)-L(setPxQx)
+ .int L(P7Q3)-L(setPxQx)
-5: /* First in chunks of 8 bytes. */
- mov %r8,(%rcx)
- add $0x8,%rcx
- dec %rax
- jne 5b
-6:
- and $0x7,%edx
-7:
- test %rdx,%rdx
- je 9f
-8: /* And finally as bytes (up to 7). */
- mov %sil,(%rcx)
- inc %rcx
- dec %rdx
- jne 8b
-9:
- /* Load result (only if used as memset). */
- mov %rdi,%rax /* start address of destination is result */
- retq
+ .int L(P0Q4)-L(setPxQx)
+ .int L(P1Q4)-L(setPxQx)
+ .int L(P2Q4)-L(setPxQx)
+ .int L(P3Q4)-L(setPxQx)
+ .int L(P4Q4)-L(setPxQx)
+ .int L(P5Q4)-L(setPxQx)
+ .int L(P6Q4)-L(setPxQx)
+ .int L(P7Q4)-L(setPxQx)
+
+ .int L(P0Q5)-L(setPxQx)
+ .int L(P1Q5)-L(setPxQx)
+ .int L(P2Q5)-L(setPxQx)
+ .int L(P3Q5)-L(setPxQx)
+ .int L(P4Q5)-L(setPxQx)
+ .int L(P5Q5)-L(setPxQx)
+ .int L(P6Q5)-L(setPxQx)
+ .int L(P7Q5)-L(setPxQx)
+
+ .int L(P0Q6)-L(setPxQx)
+ .int L(P1Q6)-L(setPxQx)
+ .int L(P2Q6)-L(setPxQx)
+ .int L(P3Q6)-L(setPxQx)
+ .int L(P4Q6)-L(setPxQx)
+ .int L(P5Q6)-L(setPxQx)
+ .int L(P6Q6)-L(setPxQx)
+ .int L(P7Q6)-L(setPxQx)
+
+ .int L(P0Q7)-L(setPxQx)
+ .int L(P1Q7)-L(setPxQx)
+ .int L(P2Q7)-L(setPxQx)
+ .int L(P3Q7)-L(setPxQx)
+ .int L(P4Q7)-L(setPxQx)
+ .int L(P5Q7)-L(setPxQx)
+ .int L(P6Q7)-L(setPxQx)
+ .int L(P7Q7)-L(setPxQx)
+
+ .int L(P0Q8)-L(setPxQx)
+ .int L(P1Q8)-L(setPxQx)
+ .int L(P2Q8)-L(setPxQx)
+ .int L(P3Q8)-L(setPxQx)
+ .int L(P4Q8)-L(setPxQx)
+ .int L(P5Q8)-L(setPxQx)
+ .int L(P6Q8)-L(setPxQx)
+ .int L(P7Q8)-L(setPxQx)
+
+ .int L(P0Q9)-L(setPxQx)
+ .int L(P1Q9)-L(setPxQx)
+ .int L(P2Q9)-L(setPxQx)
+ .int L(P3Q9)-L(setPxQx)
+ .int L(P4Q9)-L(setPxQx)
+ .int L(P5Q9)-L(setPxQx)
+ .int L(P6Q9)-L(setPxQx)
+ .int L(P7Q9)-L(setPxQx)
+
+ .int L(P0QA)-L(setPxQx)
+ .int L(P1QA)-L(setPxQx)
+ .int L(P2QA)-L(setPxQx)
+ .int L(P3QA)-L(setPxQx)
+ .int L(P4QA)-L(setPxQx)
+ .int L(P5QA)-L(setPxQx)
+ .int L(P6QA)-L(setPxQx)
+ .int L(P7QA)-L(setPxQx)
+
+ .int L(P0QB)-L(setPxQx)
+ .int L(P1QB)-L(setPxQx)
+ .int L(P2QB)-L(setPxQx)
+ .int L(P3QB)-L(setPxQx)
+ .int L(P4QB)-L(setPxQx)
+ .int L(P5QB)-L(setPxQx)
+ .int L(P6QB)-L(setPxQx)
+ .int L(P7QB)-L(setPxQx)
+
+ .int L(P0QC)-L(setPxQx)
+ .int L(P1QC)-L(setPxQx)
+ .int L(P2QC)-L(setPxQx)
+ .int L(P3QC)-L(setPxQx)
+ .int L(P4QC)-L(setPxQx)
+ .int L(P5QC)-L(setPxQx)
+ .int L(P6QC)-L(setPxQx)
+ .int L(P7QC)-L(setPxQx)
+
+ .int L(P0QD)-L(setPxQx)
+ .int L(P1QD)-L(setPxQx)
+ .int L(P2QD)-L(setPxQx)
+ .int L(P3QD)-L(setPxQx)
+ .int L(P4QD)-L(setPxQx)
+ .int L(P5QD)-L(setPxQx)
+ .int L(P6QD)-L(setPxQx)
+ .int L(P7QD)-L(setPxQx)
+
+ .int L(P0QE)-L(setPxQx)
+ .int L(P1QE)-L(setPxQx)
+ .int L(P2QE)-L(setPxQx)
+ .int L(P3QE)-L(setPxQx)
+ .int L(P4QE)-L(setPxQx)
+ .int L(P5QE)-L(setPxQx)
+ .int L(P6QE)-L(setPxQx)
+ .int L(P7QE)-L(setPxQx)
+
+ .int L(P0QF)-L(setPxQx)
+ .int L(P1QF)-L(setPxQx)
+ .int L(P2QF)-L(setPxQx)
+ .int L(P3QF)-L(setPxQx)
+ .int L(P4QF)-L(setPxQx)
+ .int L(P5QF)-L(setPxQx)
+ .int L(P6QF)-L(setPxQx)
+ .int L(P7QF)-L(setPxQx)
+
+ .int L(P0QG)-L(setPxQx)
+ .int L(P1QG)-L(setPxQx)
+ .int L(P2QG)-L(setPxQx)
+ .int L(P3QG)-L(setPxQx)
+ .int L(P4QG)-L(setPxQx)
+ .int L(P5QG)-L(setPxQx)
+ .int L(P6QG)-L(setPxQx)
+ .int L(P7QG)-L(setPxQx)
+
+ .int L(P0QH)-L(setPxQx)
+ .int L(P1QH)-L(setPxQx)
+ .int L(P2QH)-L(setPxQx)
+ .int L(P3QH)-L(setPxQx)
+ .int L(P4QH)-L(setPxQx)
+ .int L(P5QH)-L(setPxQx)
+ .int L(P6QH)-L(setPxQx)
+ .int L(P7QH)-L(setPxQx)
+
+ .int L(P0QI)-L(setPxQx)
+# ifdef USE_EXTRA_TABLE
+ .int L(P1QI)-L(setPxQx)
+ .int L(P2QI)-L(setPxQx)
+ .int L(P3QI)-L(setPxQx)
+ .int L(P4QI)-L(setPxQx)
+ .int L(P5QI)-L(setPxQx)
+ .int L(P6QI)-L(setPxQx)
+ .int L(P7QI)-L(setPxQx)
+# endif
+#endif
+ .popsection
+
+ .balign 16
+#ifdef USE_EXTRA_TABLE
+L(P1QI): mov %rdx,-0x91(%rdi)
+#endif
+L(P1QH): mov %rdx,-0x89(%rdi)
+L(P1QG): mov %rdx,-0x81(%rdi)
+# .balign 16
+L(P1QF): mov %rdx,-0x79(%rdi)
+L(P1QE): mov %rdx,-0x71(%rdi)
+L(P1QD): mov %rdx,-0x69(%rdi)
+L(P1QC): mov %rdx,-0x61(%rdi)
+L(P1QB): mov %rdx,-0x59(%rdi)
+L(P1QA): mov %rdx,-0x51(%rdi)
+L(P1Q9): mov %rdx,-0x49(%rdi)
+L(P1Q8): mov %rdx,-0x41(%rdi)
+L(P1Q7): mov %rdx,-0x39(%rdi)
+L(P1Q6): mov %rdx,-0x31(%rdi)
+L(P1Q5): mov %rdx,-0x29(%rdi)
+L(P1Q4): mov %rdx,-0x21(%rdi)
+L(P1Q3): mov %rdx,-0x19(%rdi)
+L(P1Q2): mov %rdx,-0x11(%rdi)
+L(P1Q1): mov %rdx,-0x9(%rdi)
+L(P1Q0): mov %dl,-0x1(%rdi)
+ retq $0x0
+
+ .balign 16
+L(P0QI): mov %rdx,-0x90(%rdi)
+L(P0QH): mov %rdx,-0x88(%rdi)
+# .balign 16
+L(P0QG): mov %rdx,-0x80(%rdi)
+L(P0QF): mov %rdx,-0x78(%rdi)
+L(P0QE): mov %rdx,-0x70(%rdi)
+L(P0QD): mov %rdx,-0x68(%rdi)
+L(P0QC): mov %rdx,-0x60(%rdi)
+L(P0QB): mov %rdx,-0x58(%rdi)
+L(P0QA): mov %rdx,-0x50(%rdi)
+L(P0Q9): mov %rdx,-0x48(%rdi)
+L(P0Q8): mov %rdx,-0x40(%rdi)
+L(P0Q7): mov %rdx,-0x38(%rdi)
+L(P0Q6): mov %rdx,-0x30(%rdi)
+L(P0Q5): mov %rdx,-0x28(%rdi)
+L(P0Q4): mov %rdx,-0x20(%rdi)
+L(P0Q3): mov %rdx,-0x18(%rdi)
+L(P0Q2): mov %rdx,-0x10(%rdi)
+L(P0Q1): mov %rdx,-0x8(%rdi)
+L(P0Q0): retq $0x0
+
+
+ .balign 16
+#ifdef USE_EXTRA_TABLE
+L(P2QI): mov %rdx,-0x92(%rdi)
+#endif
+L(P2QH): mov %rdx,-0x8a(%rdi)
+L(P2QG): mov %rdx,-0x82(%rdi)
+# .balign 16
+L(P2QF): mov %rdx,-0x7a(%rdi)
+L(P2QE): mov %rdx,-0x72(%rdi)
+L(P2QD): mov %rdx,-0x6a(%rdi)
+L(P2QC): mov %rdx,-0x62(%rdi)
+L(P2QB): mov %rdx,-0x5a(%rdi)
+L(P2QA): mov %rdx,-0x52(%rdi)
+L(P2Q9): mov %rdx,-0x4a(%rdi)
+L(P2Q8): mov %rdx,-0x42(%rdi)
+L(P2Q7): mov %rdx,-0x3a(%rdi)
+L(P2Q6): mov %rdx,-0x32(%rdi)
+L(P2Q5): mov %rdx,-0x2a(%rdi)
+L(P2Q4): mov %rdx,-0x22(%rdi)
+L(P2Q3): mov %rdx,-0x1a(%rdi)
+L(P2Q2): mov %rdx,-0x12(%rdi)
+L(P2Q1): mov %rdx,-0xa(%rdi)
+L(P2Q0): mov %dx,-0x2(%rdi)
+ retq $0x0
+
+ .balign 16
+#ifdef USE_EXTRA_TABLE
+L(P3QI): mov %rdx,-0x93(%rdi)
+#endif
+L(P3QH): mov %rdx,-0x8b(%rdi)
+L(P3QG): mov %rdx,-0x83(%rdi)
+# .balign 16
+L(P3QF): mov %rdx,-0x7b(%rdi)
+L(P3QE): mov %rdx,-0x73(%rdi)
+L(P3QD): mov %rdx,-0x6b(%rdi)
+L(P3QC): mov %rdx,-0x63(%rdi)
+L(P3QB): mov %rdx,-0x5b(%rdi)
+L(P3QA): mov %rdx,-0x53(%rdi)
+L(P3Q9): mov %rdx,-0x4b(%rdi)
+L(P3Q8): mov %rdx,-0x43(%rdi)
+L(P3Q7): mov %rdx,-0x3b(%rdi)
+L(P3Q6): mov %rdx,-0x33(%rdi)
+L(P3Q5): mov %rdx,-0x2b(%rdi)
+L(P3Q4): mov %rdx,-0x23(%rdi)
+L(P3Q3): mov %rdx,-0x1b(%rdi)
+L(P3Q2): mov %rdx,-0x13(%rdi)
+L(P3Q1): mov %rdx,-0xb(%rdi)
+L(P3Q0): mov %dx,-0x3(%rdi)
+ mov %dl,-0x1(%rdi)
+ retq $0x0
+
+ .balign 16
+#ifdef USE_EXTRA_TABLE
+L(P4QI): mov %rdx,-0x94(%rdi)
+#endif
+L(P4QH): mov %rdx,-0x8c(%rdi)
+L(P4QG): mov %rdx,-0x84(%rdi)
+# .balign 16
+L(P4QF): mov %rdx,-0x7c(%rdi)
+L(P4QE): mov %rdx,-0x74(%rdi)
+L(P4QD): mov %rdx,-0x6c(%rdi)
+L(P4QC): mov %rdx,-0x64(%rdi)
+L(P4QB): mov %rdx,-0x5c(%rdi)
+L(P4QA): mov %rdx,-0x54(%rdi)
+L(P4Q9): mov %rdx,-0x4c(%rdi)
+L(P4Q8): mov %rdx,-0x44(%rdi)
+L(P4Q7): mov %rdx,-0x3c(%rdi)
+L(P4Q6): mov %rdx,-0x34(%rdi)
+L(P4Q5): mov %rdx,-0x2c(%rdi)
+L(P4Q4): mov %rdx,-0x24(%rdi)
+L(P4Q3): mov %rdx,-0x1c(%rdi)
+L(P4Q2): mov %rdx,-0x14(%rdi)
+L(P4Q1): mov %rdx,-0xc(%rdi)
+L(P4Q0): mov %edx,-0x4(%rdi)
+ retq $0x0
+
+ .balign 16
+#if defined(USE_EXTRA_TABLE)
+L(P5QI): mov %rdx,-0x95(%rdi)
+#endif
+L(P5QH): mov %rdx,-0x8d(%rdi)
+L(P5QG): mov %rdx,-0x85(%rdi)
+# .balign 16
+L(P5QF): mov %rdx,-0x7d(%rdi)
+L(P5QE): mov %rdx,-0x75(%rdi)
+L(P5QD): mov %rdx,-0x6d(%rdi)
+L(P5QC): mov %rdx,-0x65(%rdi)
+L(P5QB): mov %rdx,-0x5d(%rdi)
+L(P5QA): mov %rdx,-0x55(%rdi)
+L(P5Q9): mov %rdx,-0x4d(%rdi)
+L(P5Q8): mov %rdx,-0x45(%rdi)
+L(P5Q7): mov %rdx,-0x3d(%rdi)
+L(P5Q6): mov %rdx,-0x35(%rdi)
+L(P5Q5): mov %rdx,-0x2d(%rdi)
+L(P5Q4): mov %rdx,-0x25(%rdi)
+L(P5Q3): mov %rdx,-0x1d(%rdi)
+L(P5Q2): mov %rdx,-0x15(%rdi)
+L(P5Q1): mov %rdx,-0xd(%rdi)
+L(P5Q0): mov %edx,-0x5(%rdi)
+ mov %dl,-0x1(%rdi)
+ retq $0x0
+
+ .balign 16
+#ifdef USE_EXTRA_TABLE
+L(P6QI): mov %rdx,-0x96(%rdi)
+#endif
+L(P6QH): mov %rdx,-0x8e(%rdi)
+L(P6QG): mov %rdx,-0x86(%rdi)
+# .balign 16
+L(P6QF): mov %rdx,-0x7e(%rdi)
+L(P6QE): mov %rdx,-0x76(%rdi)
+L(P6QD): mov %rdx,-0x6e(%rdi)
+L(P6QC): mov %rdx,-0x66(%rdi)
+L(P6QB): mov %rdx,-0x5e(%rdi)
+L(P6QA): mov %rdx,-0x56(%rdi)
+L(P6Q9): mov %rdx,-0x4e(%rdi)
+L(P6Q8): mov %rdx,-0x46(%rdi)
+L(P6Q7): mov %rdx,-0x3e(%rdi)
+L(P6Q6): mov %rdx,-0x36(%rdi)
+L(P6Q5): mov %rdx,-0x2e(%rdi)
+L(P6Q4): mov %rdx,-0x26(%rdi)
+L(P6Q3): mov %rdx,-0x1e(%rdi)
+L(P6Q2): mov %rdx,-0x16(%rdi)
+L(P6Q1): mov %rdx,-0xe(%rdi)
+L(P6Q0): mov %edx,-0x6(%rdi)
+ mov %dx,-0x2(%rdi)
+ retq $0x0
+
+ .balign 16
+#ifdef USE_EXTRA_TABLE
+L(P7QI): mov %rdx,-0x97(%rdi)
+#endif
+L(P7QH): mov %rdx,-0x8f(%rdi)
+L(P7QG): mov %rdx,-0x87(%rdi)
+# .balign 16
+L(P7QF): mov %rdx,-0x7f(%rdi)
+L(P7QE): mov %rdx,-0x77(%rdi)
+L(P7QD): mov %rdx,-0x6f(%rdi)
+L(P7QC): mov %rdx,-0x67(%rdi)
+L(P7QB): mov %rdx,-0x5f(%rdi)
+L(P7QA): mov %rdx,-0x57(%rdi)
+L(P7Q9): mov %rdx,-0x4f(%rdi)
+L(P7Q8): mov %rdx,-0x47(%rdi)
+L(P7Q7): mov %rdx,-0x3f(%rdi)
+L(P7Q6): mov %rdx,-0x37(%rdi)
+L(P7Q5): mov %rdx,-0x2f(%rdi)
+L(P7Q4): mov %rdx,-0x27(%rdi)
+L(P7Q3): mov %rdx,-0x1f(%rdi)
+L(P7Q2): mov %rdx,-0x17(%rdi)
+L(P7Q1): mov %rdx,-0xf(%rdi)
+L(P7Q0): mov %edx,-0x7(%rdi)
+ mov %dx,-0x3(%rdi)
+ mov %dl,-0x1(%rdi)
+ retq $0x0
+
+ .balign 16
+L(ck_mem_ops_method):
+
+# align to 16 byte boundary first
+ #test $0xf,%rdi
+ #jz L(aligned_now)
+ lea L(AliPxQx)(%rip),%r11
+ mov $0x10,%r10
+ mov %rdi,%r9
+ and $0xf,%r9
+ sub %r9,%r10
+ and $0xf,%r10
+ add %r10,%rdi
+ sub %r10,%r8
+#ifndef PIC
+ jmpq *(%r11,%r10,8)
+#else
+ movslq (%r11,%r10,4),%rcx
+ lea (%rcx,%r11,1),%r11
+ jmpq *%r11
+#endif
+
+ .pushsection .rodata
+ .balign 16
+#ifndef PIC
+L(AliPxQx):
+ .quad L(aligned_now), L(A1Q0), L(A2Q0), L(A3Q0)
+ .quad L(A4Q0), L(A5Q0), L(A6Q0), L(A7Q0)
+ .quad L(A0Q1), L(A1Q1), L(A2Q1), L(A3Q1)
+ .quad L(A4Q1), L(A5Q1), L(A6Q1), L(A7Q1)
+#else
+L(AliPxQx):
+ .int L(aligned_now)-L(AliPxQx)
+ .int L(A1Q0)-L(AliPxQx)
+ .int L(A2Q0)-L(AliPxQx)
+ .int L(A3Q0)-L(AliPxQx)
+ .int L(A4Q0)-L(AliPxQx)
+ .int L(A5Q0)-L(AliPxQx)
+ .int L(A6Q0)-L(AliPxQx)
+ .int L(A7Q0)-L(AliPxQx)
+
+ .int L(A0Q1)-L(AliPxQx)
+ .int L(A1Q1)-L(AliPxQx)
+ .int L(A2Q1)-L(AliPxQx)
+ .int L(A3Q1)-L(AliPxQx)
+ .int L(A4Q1)-L(AliPxQx)
+ .int L(A5Q1)-L(AliPxQx)
+ .int L(A6Q1)-L(AliPxQx)
+ .int L(A7Q1)-L(AliPxQx)
+#endif
+ .popsection
+
+ .balign 16
+L(A5Q1): mov %dl,-0xd(%rdi)
+L(A4Q1): mov %edx,-0xc(%rdi)
+L(A0Q1): mov %rdx,-0x8(%rdi)
+L(A0Q0): jmp L(aligned_now)
+
+ .balign 16
+L(A1Q1): mov %dl,-0x9(%rdi)
+ mov %rdx,-0x8(%rdi)
+ jmp L(aligned_now)
+
+ .balign 16
+L(A1Q0): mov %dl,-0x1(%rdi)
+ jmp L(aligned_now)
+
+ .balign 16
+L(A3Q1): mov %dl,-0xb(%rdi)
+L(A2Q1): mov %dx,-0xa(%rdi)
+ mov %rdx,-0x8(%rdi)
+ jmp L(aligned_now)
+
+ .balign 16
+L(A3Q0): mov %dl,-0x3(%rdi)
+L(A2Q0): mov %dx,-0x2(%rdi)
+ jmp L(aligned_now)
+
+ .balign 16
+L(A5Q0): mov %dl,-0x5(%rdi)
+L(A4Q0): mov %edx,-0x4(%rdi)
+ jmp L(aligned_now)
+
+ .balign 16
+L(A7Q1): mov %dl,-0xf(%rdi)
+L(A6Q1): mov %dx,-0xe(%rdi)
+ mov %edx,-0xc(%rdi)
+ mov %rdx,-0x8(%rdi)
+ jmp L(aligned_now)
+
+ .balign 16
+L(A7Q0): mov %dl,-0x7(%rdi)
+L(A6Q0): mov %dx,-0x6(%rdi)
+ mov %edx,-0x4(%rdi)
+ jmp L(aligned_now)
+
+ .balign 16
+L(aligned_now):
+
+ cmpl $0x1,__x86_64_preferred_memory_instruction(%rip)
+ jg L(SSE_pre)
+
+L(8byte_move_try):
+ cmpq __STOS_LOWER_BOUNDARY,%r8
+ jae L(8byte_stos_try)
+
+ .balign 16
+L(8byte_move):
+ movq %r8,%rcx
+ shrq $7,%rcx
+ jz L(8byte_move_skip)
.p2align 4
-11: /* Copy 64 bytes without polluting the cache. */
- /* We could use movntdq %xmm0,(%rcx) here to further
- speed up for large cases but let's not use XMM registers. */
- movnti %r8,(%rcx)
- movnti %r8,0x8(%rcx)
- movnti %r8,0x10(%rcx)
- movnti %r8,0x18(%rcx)
- movnti %r8,0x20(%rcx)
- movnti %r8,0x28(%rcx)
- movnti %r8,0x30(%rcx)
- movnti %r8,0x38(%rcx)
- add $0x40,%rcx
- dec %rax
- jne 11b
+
+L(8byte_move_loop):
+ decq %rcx
+
+ movq %rdx, (%rdi)
+ movq %rdx, 8 (%rdi)
+ movq %rdx, 16 (%rdi)
+ movq %rdx, 24 (%rdi)
+ movq %rdx, 32 (%rdi)
+ movq %rdx, 40 (%rdi)
+ movq %rdx, 48 (%rdi)
+ movq %rdx, 56 (%rdi)
+ movq %rdx, 64 (%rdi)
+ movq %rdx, 72 (%rdi)
+ movq %rdx, 80 (%rdi)
+ movq %rdx, 88 (%rdi)
+ movq %rdx, 96 (%rdi)
+ movq %rdx, 104 (%rdi)
+ movq %rdx, 112 (%rdi)
+ movq %rdx, 120 (%rdi)
+
+ leaq 128 (%rdi),%rdi
+
+ jnz L(8byte_move_loop)
+
+L(8byte_move_skip):
+ andl $127,%r8d
+ lea (%rdi,%r8,1),%rdi
+ lea L(setPxQx)(%rip),%r11
+
+#ifndef PIC
+ jmpq *(%r11,%r8,8) # old scheme remained for nonPIC
+#else
+ movslq (%r11,%r8,4),%rcx
+ lea (%rcx,%r11,1),%r11
+ jmpq *%r11
+#endif
+
+ .balign 16
+L(8byte_stos_try):
+ mov __x86_64_shared_cache_size(%rip),%r9d // ck largest cache size
+ cmpq %r8,%r9 // calculate the lesser of remaining
+ cmovaq %r8,%r9 // bytes and largest cache size
+ jbe L(8byte_stos)
+
+L(8byte_move_reuse_try):
+ cmp __STOS_UPPER_BOUNDARY,%r8
+ jae L(8byte_move)
+
+ .balign 16
+L(8byte_stos):
+ movq %r9,%rcx
+ andq $-8,%r9
+
+ shrq $3,%rcx
+ jz L(8byte_stos_skip)
+
+ xchgq %rax,%rdx
+
+ rep
+ stosq
+
+ xchgq %rax,%rdx
+
+L(8byte_stos_skip):
+ subq %r9,%r8
+ ja L(8byte_nt_move)
+
+ andl $7,%r8d
+ lea (%rdi,%r8,1),%rdi
+ lea L(setPxQx)(%rip),%r11
+#ifndef PIC
+ jmpq *(%r11,%r8,8) # old scheme remained for nonPIC
+#else
+ movslq (%r11,%r8,4),%rcx
+ lea (%rcx,%r11,1),%r11
+ jmpq *%r11
+#endif
+
+ .balign 16
+L(8byte_nt_move):
+ movq %r8,%rcx
+ shrq $7,%rcx
+ jz L(8byte_nt_move_skip)
+
+ .balign 16
+L(8byte_nt_move_loop):
+ decq %rcx
+
+ movntiq %rdx, (%rdi)
+ movntiq %rdx, 8 (%rdi)
+ movntiq %rdx, 16 (%rdi)
+ movntiq %rdx, 24 (%rdi)
+ movntiq %rdx, 32 (%rdi)
+ movntiq %rdx, 40 (%rdi)
+ movntiq %rdx, 48 (%rdi)
+ movntiq %rdx, 56 (%rdi)
+ movntiq %rdx, 64 (%rdi)
+ movntiq %rdx, 72 (%rdi)
+ movntiq %rdx, 80 (%rdi)
+ movntiq %rdx, 88 (%rdi)
+ movntiq %rdx, 96 (%rdi)
+ movntiq %rdx, 104 (%rdi)
+ movntiq %rdx, 112 (%rdi)
+ movntiq %rdx, 120 (%rdi)
+
+ leaq 128 (%rdi),%rdi
+
+ jnz L(8byte_nt_move_loop)
+
sfence
- jmp 4b
+
+L(8byte_nt_move_skip):
+ andl $127,%r8d
+
+ lea (%rdi,%r8,1),%rdi
+ lea L(setPxQx)(%rip),%r11
+#ifndef PIC
+ jmpq *(%r11,%r8,8) # old scheme remained for nonPIC
+#else
+ movslq (%r11,%r8,4),%rcx
+ lea (%rcx,%r11,1),%r11
+ jmpq *%r11
+#endif
+
+L(SSE_pre):
+ # fill RegXMM0 with the pattern
+ movd %rdx,%xmm0
+ punpcklqdq %xmm0,%xmm0
+
+ lea L(SSExDx)(%rip),%r9 # for later after the alignment
+ cmp $0xb0,%r8 # 176
+ jge L(byte32sse2_pre)
+
+ add %r8,%rdi
+#ifndef PIC
+ jmpq *(%r9,%r8,8)
+#else
+ movslq (%r9,%r8,4),%rcx
+ lea (%rcx,%r9,1),%r9
+ jmpq *%r9
+#endif
+
+L(SSE0QB): movdqa %xmm0,-0xb0(%rdi)
+L(SSE0QA): movdqa %xmm0,-0xa0(%rdi)
+L(SSE0Q9): movdqa %xmm0,-0x90(%rdi)
+L(SSE0Q8): movdqa %xmm0,-0x80(%rdi)
+L(SSE0Q7): movdqa %xmm0,-0x70(%rdi)
+L(SSE0Q6): movdqa %xmm0,-0x60(%rdi)
+L(SSE0Q5): movdqa %xmm0,-0x50(%rdi)
+L(SSE0Q4): movdqa %xmm0,-0x40(%rdi)
+L(SSE0Q3): movdqa %xmm0,-0x30(%rdi)
+L(SSE0Q2): movdqa %xmm0,-0x20(%rdi)
+L(SSE0Q1): movdqa %xmm0,-0x10(%rdi)
+L(SSE0Q0): retq $0x0
+
+L(SSE1QB): movdqa %xmm0,-0xb1(%rdi)
+L(SSE1QA): movdqa %xmm0,-0xa1(%rdi)
+L(SSE1Q9): movdqa %xmm0,-0x91(%rdi)
+L(SSE1Q8): movdqa %xmm0,-0x81(%rdi)
+L(SSE1Q7): movdqa %xmm0,-0x71(%rdi)
+L(SSE1Q6): movdqa %xmm0,-0x61(%rdi)
+L(SSE1Q5): movdqa %xmm0,-0x51(%rdi)
+L(SSE1Q4): movdqa %xmm0,-0x41(%rdi)
+L(SSE1Q3): movdqa %xmm0,-0x31(%rdi)
+L(SSE1Q2): movdqa %xmm0,-0x21(%rdi)
+L(SSE1Q1): movdqa %xmm0,-0x11(%rdi)
+L(SSE1Q0): mov %dl,-0x1(%rdi)
+ retq $0x0
+
+L(SSE2QB): movdqa %xmm0,-0xb2(%rdi)
+L(SSE2QA): movdqa %xmm0,-0xa2(%rdi)
+L(SSE2Q9): movdqa %xmm0,-0x92(%rdi)
+L(SSE2Q8): movdqa %xmm0,-0x82(%rdi)
+L(SSE2Q7): movdqa %xmm0,-0x72(%rdi)
+L(SSE2Q6): movdqa %xmm0,-0x62(%rdi)
+L(SSE2Q5): movdqa %xmm0,-0x52(%rdi)
+L(SSE2Q4): movdqa %xmm0,-0x42(%rdi)
+L(SSE2Q3): movdqa %xmm0,-0x32(%rdi)
+L(SSE2Q2): movdqa %xmm0,-0x22(%rdi)
+L(SSE2Q1): movdqa %xmm0,-0x12(%rdi)
+L(SSE2Q0): mov %dx,-0x2(%rdi)
+ retq $0x0
+
+L(SSE3QB): movdqa %xmm0,-0xb3(%rdi)
+L(SSE3QA): movdqa %xmm0,-0xa3(%rdi)
+L(SSE3Q9): movdqa %xmm0,-0x93(%rdi)
+L(SSE3Q8): movdqa %xmm0,-0x83(%rdi)
+L(SSE3Q7): movdqa %xmm0,-0x73(%rdi)
+L(SSE3Q6): movdqa %xmm0,-0x63(%rdi)
+L(SSE3Q5): movdqa %xmm0,-0x53(%rdi)
+L(SSE3Q4): movdqa %xmm0,-0x43(%rdi)
+L(SSE3Q3): movdqa %xmm0,-0x33(%rdi)
+L(SSE3Q2): movdqa %xmm0,-0x23(%rdi)
+L(SSE3Q1): movdqa %xmm0,-0x13(%rdi)
+L(SSE3Q0): mov %dx,-0x3(%rdi)
+ mov %dl,-0x1(%rdi)
+ retq $0x0
+
+L(SSE4QB): movdqa %xmm0,-0xb4(%rdi)
+L(SSE4QA): movdqa %xmm0,-0xa4(%rdi)
+L(SSE4Q9): movdqa %xmm0,-0x94(%rdi)
+L(SSE4Q8): movdqa %xmm0,-0x84(%rdi)
+L(SSE4Q7): movdqa %xmm0,-0x74(%rdi)
+L(SSE4Q6): movdqa %xmm0,-0x64(%rdi)
+L(SSE4Q5): movdqa %xmm0,-0x54(%rdi)
+L(SSE4Q4): movdqa %xmm0,-0x44(%rdi)
+L(SSE4Q3): movdqa %xmm0,-0x34(%rdi)
+L(SSE4Q2): movdqa %xmm0,-0x24(%rdi)
+L(SSE4Q1): movdqa %xmm0,-0x14(%rdi)
+L(SSE4Q0): mov %edx,-0x4(%rdi)
+ retq $0x0
+
+L(SSE5QB): movdqa %xmm0,-0xb5(%rdi)
+L(SSE5QA): movdqa %xmm0,-0xa5(%rdi)
+L(SSE5Q9): movdqa %xmm0,-0x95(%rdi)
+L(SSE5Q8): movdqa %xmm0,-0x85(%rdi)
+L(SSE5Q7): movdqa %xmm0,-0x75(%rdi)
+L(SSE5Q6): movdqa %xmm0,-0x65(%rdi)
+L(SSE5Q5): movdqa %xmm0,-0x55(%rdi)
+L(SSE5Q4): movdqa %xmm0,-0x45(%rdi)
+L(SSE5Q3): movdqa %xmm0,-0x35(%rdi)
+L(SSE5Q2): movdqa %xmm0,-0x25(%rdi)
+L(SSE5Q1): movdqa %xmm0,-0x15(%rdi)
+L(SSE5Q0): mov %edx,-0x5(%rdi)
+ mov %dl,-0x1(%rdi)
+ retq $0x0
+
+
+L(SSE6QB): movdqa %xmm0,-0xb6(%rdi)
+L(SSE6QA): movdqa %xmm0,-0xa6(%rdi)
+L(SSE6Q9): movdqa %xmm0,-0x96(%rdi)
+L(SSE6Q8): movdqa %xmm0,-0x86(%rdi)
+L(SSE6Q7): movdqa %xmm0,-0x76(%rdi)
+L(SSE6Q6): movdqa %xmm0,-0x66(%rdi)
+L(SSE6Q5): movdqa %xmm0,-0x56(%rdi)
+L(SSE6Q4): movdqa %xmm0,-0x46(%rdi)
+L(SSE6Q3): movdqa %xmm0,-0x36(%rdi)
+L(SSE6Q2): movdqa %xmm0,-0x26(%rdi)
+L(SSE6Q1): movdqa %xmm0,-0x16(%rdi)
+L(SSE6Q0): mov %edx,-0x6(%rdi)
+ mov %dx,-0x2(%rdi)
+ retq $0x0
+
+L(SSE7QB): movdqa %xmm0,-0xb7(%rdi)
+L(SSE7QA): movdqa %xmm0,-0xa7(%rdi)
+L(SSE7Q9): movdqa %xmm0,-0x97(%rdi)
+L(SSE7Q8): movdqa %xmm0,-0x87(%rdi)
+L(SSE7Q7): movdqa %xmm0,-0x77(%rdi)
+L(SSE7Q6): movdqa %xmm0,-0x67(%rdi)
+L(SSE7Q5): movdqa %xmm0,-0x57(%rdi)
+L(SSE7Q4): movdqa %xmm0,-0x47(%rdi)
+L(SSE7Q3): movdqa %xmm0,-0x37(%rdi)
+L(SSE7Q2): movdqa %xmm0,-0x27(%rdi)
+L(SSE7Q1): movdqa %xmm0,-0x17(%rdi)
+L(SSE7Q0): mov %edx,-0x7(%rdi)
+ mov %dx,-0x3(%rdi)
+ mov %dl,-0x1(%rdi)
+ retq $0x0
+
+L(SSE8QB): movdqa %xmm0,-0xb8(%rdi)
+L(SSE8QA): movdqa %xmm0,-0xa8(%rdi)
+L(SSE8Q9): movdqa %xmm0,-0x98(%rdi)
+L(SSE8Q8): movdqa %xmm0,-0x88(%rdi)
+L(SSE8Q7): movdqa %xmm0,-0x78(%rdi)
+L(SSE8Q6): movdqa %xmm0,-0x68(%rdi)
+L(SSE8Q5): movdqa %xmm0,-0x58(%rdi)
+L(SSE8Q4): movdqa %xmm0,-0x48(%rdi)
+L(SSE8Q3): movdqa %xmm0,-0x38(%rdi)
+L(SSE8Q2): movdqa %xmm0,-0x28(%rdi)
+L(SSE8Q1): movdqa %xmm0,-0x18(%rdi)
+L(SSE8Q0): mov %rdx,-0x8(%rdi)
+ retq $0x0
+
+L(SSE9QB): movdqa %xmm0,-0xb9(%rdi)
+L(SSE9QA): movdqa %xmm0,-0xa9(%rdi)
+L(SSE9Q9): movdqa %xmm0,-0x99(%rdi)
+L(SSE9Q8): movdqa %xmm0,-0x89(%rdi)
+L(SSE9Q7): movdqa %xmm0,-0x79(%rdi)
+L(SSE9Q6): movdqa %xmm0,-0x69(%rdi)
+L(SSE9Q5): movdqa %xmm0,-0x59(%rdi)
+L(SSE9Q4): movdqa %xmm0,-0x49(%rdi)
+L(SSE9Q3): movdqa %xmm0,-0x39(%rdi)
+L(SSE9Q2): movdqa %xmm0,-0x29(%rdi)
+L(SSE9Q1): movdqa %xmm0,-0x19(%rdi)
+L(SSE9Q0): mov %rdx,-0x9(%rdi)
+ mov %dl,-0x1(%rdi)
+ retq $0x0
+
+L(SSE10QB): movdqa %xmm0,-0xba(%rdi)
+L(SSE10QA): movdqa %xmm0,-0xaa(%rdi)
+L(SSE10Q9): movdqa %xmm0,-0x9a(%rdi)
+L(SSE10Q8): movdqa %xmm0,-0x8a(%rdi)
+L(SSE10Q7): movdqa %xmm0,-0x7a(%rdi)
+L(SSE10Q6): movdqa %xmm0,-0x6a(%rdi)
+L(SSE10Q5): movdqa %xmm0,-0x5a(%rdi)
+L(SSE10Q4): movdqa %xmm0,-0x4a(%rdi)
+L(SSE10Q3): movdqa %xmm0,-0x3a(%rdi)
+L(SSE10Q2): movdqa %xmm0,-0x2a(%rdi)
+L(SSE10Q1): movdqa %xmm0,-0x1a(%rdi)
+L(SSE10Q0): mov %rdx,-0xa(%rdi)
+ mov %dx,-0x2(%rdi)
+ retq $0x0
+
+L(SSE11QB): movdqa %xmm0,-0xbb(%rdi)
+L(SSE11QA): movdqa %xmm0,-0xab(%rdi)
+L(SSE11Q9): movdqa %xmm0,-0x9b(%rdi)
+L(SSE11Q8): movdqa %xmm0,-0x8b(%rdi)
+L(SSE11Q7): movdqa %xmm0,-0x7b(%rdi)
+L(SSE11Q6): movdqa %xmm0,-0x6b(%rdi)
+L(SSE11Q5): movdqa %xmm0,-0x5b(%rdi)
+L(SSE11Q4): movdqa %xmm0,-0x4b(%rdi)
+L(SSE11Q3): movdqa %xmm0,-0x3b(%rdi)
+L(SSE11Q2): movdqa %xmm0,-0x2b(%rdi)
+L(SSE11Q1): movdqa %xmm0,-0x1b(%rdi)
+L(SSE11Q0): mov %rdx,-0xb(%rdi)
+ mov %dx,-0x3(%rdi)
+ mov %dl,-0x1(%rdi)
+ retq $0x0
+
+L(SSE12QB): movdqa %xmm0,-0xbc(%rdi)
+L(SSE12QA): movdqa %xmm0,-0xac(%rdi)
+L(SSE12Q9): movdqa %xmm0,-0x9c(%rdi)
+L(SSE12Q8): movdqa %xmm0,-0x8c(%rdi)
+L(SSE12Q7): movdqa %xmm0,-0x7c(%rdi)
+L(SSE12Q6): movdqa %xmm0,-0x6c(%rdi)
+L(SSE12Q5): movdqa %xmm0,-0x5c(%rdi)
+L(SSE12Q4): movdqa %xmm0,-0x4c(%rdi)
+L(SSE12Q3): movdqa %xmm0,-0x3c(%rdi)
+L(SSE12Q2): movdqa %xmm0,-0x2c(%rdi)
+L(SSE12Q1): movdqa %xmm0,-0x1c(%rdi)
+L(SSE12Q0): mov %rdx,-0xc(%rdi)
+ mov %edx,-0x4(%rdi)
+ retq $0x0
+
+L(SSE13QB): movdqa %xmm0,-0xbd(%rdi)
+L(SSE13QA): movdqa %xmm0,-0xad(%rdi)
+L(SSE13Q9): movdqa %xmm0,-0x9d(%rdi)
+L(SSE13Q8): movdqa %xmm0,-0x8d(%rdi)
+L(SSE13Q7): movdqa %xmm0,-0x7d(%rdi)
+L(SSE13Q6): movdqa %xmm0,-0x6d(%rdi)
+L(SSE13Q5): movdqa %xmm0,-0x5d(%rdi)
+L(SSE13Q4): movdqa %xmm0,-0x4d(%rdi)
+L(SSE13Q3): movdqa %xmm0,-0x3d(%rdi)
+L(SSE13Q2): movdqa %xmm0,-0x2d(%rdi)
+L(SSE13Q1): movdqa %xmm0,-0x1d(%rdi)
+L(SSE13Q0): mov %rdx,-0xd(%rdi)
+ mov %edx,-0x5(%rdi)
+ mov %dl,-0x1(%rdi)
+ retq $0x0
+
+L(SSE14QB): movdqa %xmm0,-0xbe(%rdi)
+L(SSE14QA): movdqa %xmm0,-0xae(%rdi)
+L(SSE14Q9): movdqa %xmm0,-0x9e(%rdi)
+L(SSE14Q8): movdqa %xmm0,-0x8e(%rdi)
+L(SSE14Q7): movdqa %xmm0,-0x7e(%rdi)
+L(SSE14Q6): movdqa %xmm0,-0x6e(%rdi)
+L(SSE14Q5): movdqa %xmm0,-0x5e(%rdi)
+L(SSE14Q4): movdqa %xmm0,-0x4e(%rdi)
+L(SSE14Q3): movdqa %xmm0,-0x3e(%rdi)
+L(SSE14Q2): movdqa %xmm0,-0x2e(%rdi)
+L(SSE14Q1): movdqa %xmm0,-0x1e(%rdi)
+L(SSE14Q0): mov %rdx,-0xe(%rdi)
+ mov %edx,-0x6(%rdi)
+ mov %dx,-0x2(%rdi)
+ retq $0x0
+
+L(SSE15QB): movdqa %xmm0,-0xbf(%rdi)
+L(SSE15QA): movdqa %xmm0,-0xaf(%rdi)
+L(SSE15Q9): movdqa %xmm0,-0x9f(%rdi)
+L(SSE15Q8): movdqa %xmm0,-0x8f(%rdi)
+L(SSE15Q7): movdqa %xmm0,-0x7f(%rdi)
+L(SSE15Q6): movdqa %xmm0,-0x6f(%rdi)
+L(SSE15Q5): movdqa %xmm0,-0x5f(%rdi)
+L(SSE15Q4): movdqa %xmm0,-0x4f(%rdi)
+L(SSE15Q3): movdqa %xmm0,-0x3f(%rdi)
+L(SSE15Q2): movdqa %xmm0,-0x2f(%rdi)
+L(SSE15Q1): movdqa %xmm0,-0x1f(%rdi)
+L(SSE15Q0): mov %rdx,-0xf(%rdi)
+ mov %edx,-0x7(%rdi)
+ mov %dx,-0x3(%rdi)
+ mov %dl,-0x1(%rdi)
+ retq $0x0
+
+ .balign 16
+L(byte32sse2_pre):
+
+ mov __x86_64_shared_cache_size(%rip),%r9d # The largest cache size
+ cmp %r9,%r8
+ jg L(sse2_nt_move_pre)
+ #jmp L(byte32sse2)
+ .balign 16
+L(byte32sse2):
+ lea -0x80(%r8),%r8 # 128
+ cmp $0x80,%r8 # 128
+ movdqa %xmm0,(%rdi)
+ movdqa %xmm0,0x10(%rdi)
+ movdqa %xmm0,0x20(%rdi)
+ movdqa %xmm0,0x30(%rdi)
+ movdqa %xmm0,0x40(%rdi)
+ movdqa %xmm0,0x50(%rdi)
+ movdqa %xmm0,0x60(%rdi)
+ movdqa %xmm0,0x70(%rdi)
+
+ lea 0x80(%rdi),%rdi
+ jge L(byte32sse2)
+ lea L(SSExDx)(%rip),%r11
+ add %r8,%rdi
+#ifndef PIC
+ jmpq *(%r11,%r8,8)
+#else
+ movslq (%r11,%r8,4),%rcx
+ lea (%rcx,%r11,1),%r11
+ jmpq *%r11
+#endif
+
+ .balign 16
+L(sse2_nt_move_pre):
+ cmp $0x0,%r9
+ je L(byte32sse2)
+ jmp L(sse2_nt_move)
+
+ .balign 16
+L(sse2_nt_move):
+ lea -0x80(%r8),%r8
+ cmp $0x80,%r8
+
+ movntdq %xmm0,(%rdi)
+ movntdq %xmm0,0x10(%rdi)
+ movntdq %xmm0,0x20(%rdi)
+ movntdq %xmm0,0x30(%rdi)
+ movntdq %xmm0,0x40(%rdi)
+ movntdq %xmm0,0x50(%rdi)
+ movntdq %xmm0,0x60(%rdi)
+ movntdq %xmm0,0x70(%rdi)
+
+ lea 0x80(%rdi),%rdi
+ jge L(sse2_nt_move)
+ lea L(SSExDx)(%rip),%r11
+ sfence
+ add %r8,%rdi
+#ifndef PIC
+ jmpq *(%r11,%r8,8)
+#else
+ movslq (%r11,%r8,4),%rcx
+ lea (%rcx,%r11,1),%r11
+ jmpq *%r11
+#endif
+
+ .pushsection .rodata
+ .balign 16
+#ifndef PIC
+L(SSExDx):
+ .quad L(SSE0Q0), L(SSE1Q0), L(SSE2Q0), L(SSE3Q0)
+ .quad L(SSE4Q0), L(SSE5Q0), L(SSE6Q0), L(SSE7Q0)
+ .quad L(SSE8Q0), L(SSE9Q0), L(SSE10Q0), L(SSE11Q0)
+ .quad L(SSE12Q0), L(SSE13Q0), L(SSE14Q0), L(SSE15Q0)
+ .quad L(SSE0Q1), L(SSE1Q1), L(SSE2Q1), L(SSE3Q1)
+ .quad L(SSE4Q1), L(SSE5Q1), L(SSE6Q1), L(SSE7Q1)
+ .quad L(SSE8Q1), L(SSE9Q1), L(SSE10Q1), L(SSE11Q1)
+ .quad L(SSE12Q1), L(SSE13Q1), L(SSE14Q1), L(SSE15Q1)
+ .quad L(SSE0Q2), L(SSE1Q2), L(SSE2Q2), L(SSE3Q2)
+ .quad L(SSE4Q2), L(SSE5Q2), L(SSE6Q2), L(SSE7Q2)
+ .quad L(SSE8Q2), L(SSE9Q2), L(SSE10Q2), L(SSE11Q2)
+ .quad L(SSE12Q2), L(SSE13Q2), L(SSE14Q2), L(SSE15Q2)
+ .quad L(SSE0Q3), L(SSE1Q3), L(SSE2Q3), L(SSE3Q3)
+ .quad L(SSE4Q3), L(SSE5Q3), L(SSE6Q3), L(SSE7Q3)
+ .quad L(SSE8Q3), L(SSE9Q3), L(SSE10Q3), L(SSE11Q3)
+ .quad L(SSE12Q3), L(SSE13Q3), L(SSE14Q3), L(SSE15Q3)
+ .quad L(SSE0Q4), L(SSE1Q4), L(SSE2Q4), L(SSE3Q4)
+ .quad L(SSE4Q4), L(SSE5Q4), L(SSE6Q4), L(SSE7Q4)
+ .quad L(SSE8Q4), L(SSE9Q4), L(SSE10Q4), L(SSE11Q4)
+ .quad L(SSE12Q4), L(SSE13Q4), L(SSE14Q4), L(SSE15Q4)
+ .quad L(SSE0Q5), L(SSE1Q5), L(SSE2Q5), L(SSE3Q5)
+ .quad L(SSE4Q5), L(SSE5Q5), L(SSE6Q5), L(SSE7Q5)
+ .quad L(SSE8Q5), L(SSE9Q5), L(SSE10Q5), L(SSE11Q5)
+ .quad L(SSE12Q5), L(SSE13Q5), L(SSE14Q5), L(SSE15Q5)
+ .quad L(SSE0Q6), L(SSE1Q6), L(SSE2Q6), L(SSE3Q6)
+ .quad L(SSE4Q6), L(SSE5Q6), L(SSE6Q6), L(SSE7Q6)
+ .quad L(SSE8Q6), L(SSE9Q6), L(SSE10Q6), L(SSE11Q6)
+ .quad L(SSE12Q6), L(SSE13Q6), L(SSE14Q6), L(SSE15Q6)
+ .quad L(SSE0Q7), L(SSE1Q7), L(SSE2Q7), L(SSE3Q7)
+ .quad L(SSE4Q7), L(SSE5Q7), L(SSE6Q7), L(SSE7Q7)
+ .quad L(SSE8Q7), L(SSE9Q7), L(SSE10Q7), L(SSE11Q7)
+ .quad L(SSE12Q7), L(SSE13Q7), L(SSE14Q7), L(SSE15Q7)
+ .quad L(SSE0Q8), L(SSE1Q8), L(SSE2Q8), L(SSE3Q8)
+ .quad L(SSE4Q8), L(SSE5Q8), L(SSE6Q8), L(SSE7Q8)
+ .quad L(SSE8Q8), L(SSE9Q8), L(SSE10Q8), L(SSE11Q8)
+ .quad L(SSE12Q8), L(SSE13Q8), L(SSE14Q8), L(SSE15Q8)
+ .quad L(SSE0Q9), L(SSE1Q9), L(SSE2Q9), L(SSE3Q9)
+ .quad L(SSE4Q9), L(SSE5Q9), L(SSE6Q9), L(SSE7Q9)
+ .quad L(SSE8Q9), L(SSE9Q9), L(SSE10Q9), L(SSE11Q9)
+ .quad L(SSE12Q9), L(SSE13Q9), L(SSE14Q9), L(SSE15Q9)
+ .quad L(SSE0QA), L(SSE1QA), L(SSE2QA), L(SSE3QA)
+ .quad L(SSE4QA), L(SSE5QA), L(SSE6QA), L(SSE7QA)
+ .quad L(SSE8QA), L(SSE9QA), L(SSE10QA), L(SSE11QA)
+ .quad L(SSE12QA), L(SSE13QA), L(SSE14QA), L(SSE15QA)
+ .quad L(SSE0QB), L(SSE1QB), L(SSE2QB), L(SSE3QB)
+ .quad L(SSE4QB), L(SSE5QB), L(SSE6QB), L(SSE7QB)
+ .quad L(SSE8QB), L(SSE9QB), L(SSE10QB), L(SSE11QB)
+ .quad L(SSE12QB), L(SSE13QB), L(SSE14QB), L(SSE15QB)
+#else
+L(SSExDx):
+ .int L(SSE0Q0) -L(SSExDx)
+ .int L(SSE1Q0) -L(SSExDx)
+ .int L(SSE2Q0) -L(SSExDx)
+ .int L(SSE3Q0) -L(SSExDx)
+ .int L(SSE4Q0) -L(SSExDx)
+ .int L(SSE5Q0) -L(SSExDx)
+ .int L(SSE6Q0) -L(SSExDx)
+ .int L(SSE7Q0) -L(SSExDx)
+
+ .int L(SSE8Q0) -L(SSExDx)
+ .int L(SSE9Q0) -L(SSExDx)
+ .int L(SSE10Q0)-L(SSExDx)
+ .int L(SSE11Q0)-L(SSExDx)
+ .int L(SSE12Q0)-L(SSExDx)
+ .int L(SSE13Q0)-L(SSExDx)
+ .int L(SSE14Q0)-L(SSExDx)
+ .int L(SSE15Q0)-L(SSExDx)
+
+ .int L(SSE0Q1) -L(SSExDx)
+ .int L(SSE1Q1) -L(SSExDx)
+ .int L(SSE2Q1) -L(SSExDx)
+ .int L(SSE3Q1) -L(SSExDx)
+ .int L(SSE4Q1) -L(SSExDx)
+ .int L(SSE5Q1) -L(SSExDx)
+ .int L(SSE6Q1) -L(SSExDx)
+ .int L(SSE7Q1) -L(SSExDx)
+
+ .int L(SSE8Q1) -L(SSExDx)
+ .int L(SSE9Q1) -L(SSExDx)
+ .int L(SSE10Q1)-L(SSExDx)
+ .int L(SSE11Q1)-L(SSExDx)
+ .int L(SSE12Q1)-L(SSExDx)
+ .int L(SSE13Q1)-L(SSExDx)
+ .int L(SSE14Q1)-L(SSExDx)
+ .int L(SSE15Q1)-L(SSExDx)
+
+ .int L(SSE0Q2) -L(SSExDx)
+ .int L(SSE1Q2) -L(SSExDx)
+ .int L(SSE2Q2) -L(SSExDx)
+ .int L(SSE3Q2) -L(SSExDx)
+ .int L(SSE4Q2) -L(SSExDx)
+ .int L(SSE5Q2) -L(SSExDx)
+ .int L(SSE6Q2) -L(SSExDx)
+ .int L(SSE7Q2) -L(SSExDx)
+
+ .int L(SSE8Q2) -L(SSExDx)
+ .int L(SSE9Q2) -L(SSExDx)
+ .int L(SSE10Q2)-L(SSExDx)
+ .int L(SSE11Q2)-L(SSExDx)
+ .int L(SSE12Q2)-L(SSExDx)
+ .int L(SSE13Q2)-L(SSExDx)
+ .int L(SSE14Q2)-L(SSExDx)
+ .int L(SSE15Q2)-L(SSExDx)
+
+ .int L(SSE0Q3) -L(SSExDx)
+ .int L(SSE1Q3) -L(SSExDx)
+ .int L(SSE2Q3) -L(SSExDx)
+ .int L(SSE3Q3) -L(SSExDx)
+ .int L(SSE4Q3) -L(SSExDx)
+ .int L(SSE5Q3) -L(SSExDx)
+ .int L(SSE6Q3) -L(SSExDx)
+ .int L(SSE7Q3) -L(SSExDx)
+
+ .int L(SSE8Q3) -L(SSExDx)
+ .int L(SSE9Q3) -L(SSExDx)
+ .int L(SSE10Q3)-L(SSExDx)
+ .int L(SSE11Q3)-L(SSExDx)
+ .int L(SSE12Q3)-L(SSExDx)
+ .int L(SSE13Q3)-L(SSExDx)
+ .int L(SSE14Q3)-L(SSExDx)
+ .int L(SSE15Q3)-L(SSExDx)
+
+ .int L(SSE0Q4) -L(SSExDx)
+ .int L(SSE1Q4) -L(SSExDx)
+ .int L(SSE2Q4) -L(SSExDx)
+ .int L(SSE3Q4) -L(SSExDx)
+ .int L(SSE4Q4) -L(SSExDx)
+ .int L(SSE5Q4) -L(SSExDx)
+ .int L(SSE6Q4) -L(SSExDx)
+ .int L(SSE7Q4) -L(SSExDx)
+
+ .int L(SSE8Q4) -L(SSExDx)
+ .int L(SSE9Q4) -L(SSExDx)
+ .int L(SSE10Q4)-L(SSExDx)
+ .int L(SSE11Q4)-L(SSExDx)
+ .int L(SSE12Q4)-L(SSExDx)
+ .int L(SSE13Q4)-L(SSExDx)
+ .int L(SSE14Q4)-L(SSExDx)
+ .int L(SSE15Q4)-L(SSExDx)
+
+ .int L(SSE0Q5) -L(SSExDx)
+ .int L(SSE1Q5) -L(SSExDx)
+ .int L(SSE2Q5) -L(SSExDx)
+ .int L(SSE3Q5) -L(SSExDx)
+ .int L(SSE4Q5) -L(SSExDx)
+ .int L(SSE5Q5) -L(SSExDx)
+ .int L(SSE6Q5) -L(SSExDx)
+ .int L(SSE7Q5) -L(SSExDx)
+
+ .int L(SSE8Q5) -L(SSExDx)
+ .int L(SSE9Q5) -L(SSExDx)
+ .int L(SSE10Q5)-L(SSExDx)
+ .int L(SSE11Q5)-L(SSExDx)
+ .int L(SSE12Q5)-L(SSExDx)
+ .int L(SSE13Q5)-L(SSExDx)
+ .int L(SSE14Q5)-L(SSExDx)
+ .int L(SSE15Q5)-L(SSExDx)
+
+ .int L(SSE0Q6) -L(SSExDx)
+ .int L(SSE1Q6) -L(SSExDx)
+ .int L(SSE2Q6) -L(SSExDx)
+ .int L(SSE3Q6) -L(SSExDx)
+ .int L(SSE4Q6) -L(SSExDx)
+ .int L(SSE5Q6) -L(SSExDx)
+ .int L(SSE6Q6) -L(SSExDx)
+ .int L(SSE7Q6) -L(SSExDx)
+
+ .int L(SSE8Q6) -L(SSExDx)
+ .int L(SSE9Q6) -L(SSExDx)
+ .int L(SSE10Q6)-L(SSExDx)
+ .int L(SSE11Q6)-L(SSExDx)
+ .int L(SSE12Q6)-L(SSExDx)
+ .int L(SSE13Q6)-L(SSExDx)
+ .int L(SSE14Q6)-L(SSExDx)
+ .int L(SSE15Q6)-L(SSExDx)
+
+ .int L(SSE0Q7) -L(SSExDx)
+ .int L(SSE1Q7) -L(SSExDx)
+ .int L(SSE2Q7) -L(SSExDx)
+ .int L(SSE3Q7) -L(SSExDx)
+ .int L(SSE4Q7) -L(SSExDx)
+ .int L(SSE5Q7) -L(SSExDx)
+ .int L(SSE6Q7) -L(SSExDx)
+ .int L(SSE7Q7) -L(SSExDx)
+
+ .int L(SSE8Q7) -L(SSExDx)
+ .int L(SSE9Q7) -L(SSExDx)
+ .int L(SSE10Q7)-L(SSExDx)
+ .int L(SSE11Q7)-L(SSExDx)
+ .int L(SSE12Q7)-L(SSExDx)
+ .int L(SSE13Q7)-L(SSExDx)
+ .int L(SSE14Q7)-L(SSExDx)
+ .int L(SSE15Q7)-L(SSExDx)
+
+ .int L(SSE0Q8) -L(SSExDx)
+ .int L(SSE1Q8) -L(SSExDx)
+ .int L(SSE2Q8) -L(SSExDx)
+ .int L(SSE3Q8) -L(SSExDx)
+ .int L(SSE4Q8) -L(SSExDx)
+ .int L(SSE5Q8) -L(SSExDx)
+ .int L(SSE6Q8) -L(SSExDx)
+ .int L(SSE7Q8) -L(SSExDx)
+
+ .int L(SSE8Q8) -L(SSExDx)
+ .int L(SSE9Q8) -L(SSExDx)
+ .int L(SSE10Q8)-L(SSExDx)
+ .int L(SSE11Q8)-L(SSExDx)
+ .int L(SSE12Q8)-L(SSExDx)
+ .int L(SSE13Q8)-L(SSExDx)
+ .int L(SSE14Q8)-L(SSExDx)
+ .int L(SSE15Q8)-L(SSExDx)
+
+ .int L(SSE0Q9) -L(SSExDx)
+ .int L(SSE1Q9) -L(SSExDx)
+ .int L(SSE2Q9) -L(SSExDx)
+ .int L(SSE3Q9) -L(SSExDx)
+ .int L(SSE4Q9) -L(SSExDx)
+ .int L(SSE5Q9) -L(SSExDx)
+ .int L(SSE6Q9) -L(SSExDx)
+ .int L(SSE7Q9) -L(SSExDx)
+
+ .int L(SSE8Q9) -L(SSExDx)
+ .int L(SSE9Q9) -L(SSExDx)
+ .int L(SSE10Q9)-L(SSExDx)
+ .int L(SSE11Q9)-L(SSExDx)
+ .int L(SSE12Q9)-L(SSExDx)
+ .int L(SSE13Q9)-L(SSExDx)
+ .int L(SSE14Q9)-L(SSExDx)
+ .int L(SSE15Q9)-L(SSExDx)
+
+ .int L(SSE0QA) -L(SSExDx)
+ .int L(SSE1QA) -L(SSExDx)
+ .int L(SSE2QA) -L(SSExDx)
+ .int L(SSE3QA) -L(SSExDx)
+ .int L(SSE4QA) -L(SSExDx)
+ .int L(SSE5QA) -L(SSExDx)
+ .int L(SSE6QA) -L(SSExDx)
+ .int L(SSE7QA) -L(SSExDx)
+
+ .int L(SSE8QA) -L(SSExDx)
+ .int L(SSE9QA) -L(SSExDx)
+ .int L(SSE10QA)-L(SSExDx)
+ .int L(SSE11QA)-L(SSExDx)
+ .int L(SSE12QA)-L(SSExDx)
+ .int L(SSE13QA)-L(SSExDx)
+ .int L(SSE14QA)-L(SSExDx)
+ .int L(SSE15QA)-L(SSExDx)
+
+ .int L(SSE0QB) -L(SSExDx)
+ .int L(SSE1QB) -L(SSExDx)
+ .int L(SSE2QB) -L(SSExDx)
+ .int L(SSE3QB) -L(SSExDx)
+ .int L(SSE4QB) -L(SSExDx)
+ .int L(SSE5QB) -L(SSExDx)
+ .int L(SSE6QB) -L(SSExDx)
+ .int L(SSE7QB) -L(SSExDx)
+
+ .int L(SSE8QB) -L(SSExDx)
+ .int L(SSE9QB) -L(SSExDx)
+ .int L(SSE10QB)-L(SSExDx)
+ .int L(SSE11QB)-L(SSExDx)
+ .int L(SSE12QB)-L(SSExDx)
+ .int L(SSE13QB)-L(SSExDx)
+ .int L(SSE14QB)-L(SSExDx)
+ .int L(SSE15QB)-L(SSExDx)
+#endif
+ .popsection
END (memset)
libc_hidden_builtin_def (memset)