[PATCH RFC V2] Improve 64bit memcpy/memmove for Corei7 with unaligned AVX instructions
- From: ling dot ma dot program at gmail dot com
- To: libc-alpha at sourceware dot org
- Cc: liubov dot dmitrieva at gmail dot com, neleai at seznam dot cz, Ma Ling <ling dot ml at alibaba-inc dot com>
- Date: Thu, 11 Jul 2013 08:51:36 -0400
- Subject: [PATCH RFC V2] Improve 64bit memcpy/memmove for Corei7 with unaligned AVX instructions
From: Ma Ling <ling.ml@alibaba-inc.com>
We avoid branch instructions and force the destination to be aligned by using
unaligned AVX instructions. We modified gcc.403 so that it measures only the
memcpy function; the gcc.403 benchmarks indicate this version improves
performance by 4% to 16% across the different cases.
Best Regards
Ling
---
In this version we did clean-up work; thanks to Liubov.
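For reviewers who want the idea in one place before reading the assembly, here
is a rough C sketch of the strategy the 256-bytes-and-larger path follows (the
function name and the 32-byte tail are illustrative only and not part of the
patch; the real code keeps a 128-byte unaligned tail in xmm8-xmm15): the
unaligned head and tail are loaded up front, the destination is rounded up to
a 32-byte boundary, and the body is copied with aligned 32-byte stores, so the
main loop needs no edge-case branches.

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Illustrative sketch only, assuming n >= 256 and non-overlapping
   buffers; it is not part of the patch.  */
void *
sketch_copy_large (void *dst, const void *src, size_t n)
{
  unsigned char *d = dst;
  const unsigned char *s = src;
  unsigned char head[32], tail[32];

  /* Save the unaligned head and tail first; they are stored last,
     so the aligned loop needs no edge handling.  */
  memcpy (head, s, 32);
  memcpy (tail, s + n - 32, 32);

  /* Round the destination up to the next 32-byte boundary and copy
     the aligned body.  The assembly does this with unaligned vmovups
     loads and aligned vmovaps stores, 128 bytes per iteration.  */
  size_t skew = 32 - ((uintptr_t) d & 31);
  size_t body = (n - skew) & ~(size_t) 31;
  memcpy (d + skew, s + skew, body);

  memcpy (d, head, 32);           /* unaligned head store */
  memcpy (d + n - 32, tail, 32);  /* unaligned tail store */
  return dst;
}

In the assembly below the body copies use vmovups loads paired with vmovaps
stores, switching to rep movsb or non-temporal vmovntdq stores once the size
crosses the shared-cache-size threshold.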
sysdeps/x86_64/multiarch/Makefile | 5 +-
sysdeps/x86_64/multiarch/ifunc-defines.sym | 2 +
sysdeps/x86_64/multiarch/ifunc-impl-list.c | 11 +
sysdeps/x86_64/multiarch/memcpy-avx-unaligned.S | 409 +++++++++++++++++++++++
sysdeps/x86_64/multiarch/memmove-avx-unaligned.S | 4 +
sysdeps/x86_64/multiarch/mempcpy-avx-unaligned.S | 4 +
6 files changed, 433 insertions(+), 2 deletions(-)
create mode 100644 sysdeps/x86_64/multiarch/memcpy-avx-unaligned.S
create mode 100644 sysdeps/x86_64/multiarch/memmove-avx-unaligned.S
create mode 100644 sysdeps/x86_64/multiarch/mempcpy-avx-unaligned.S
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index dd6c27d..f92cf18 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -8,8 +8,9 @@ ifeq ($(subdir),string)
sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 strncmp-ssse3 \
strend-sse4 memcmp-sse4 memcpy-ssse3 mempcpy-ssse3 \
- memmove-ssse3 memcpy-ssse3-back mempcpy-ssse3-back \
- memmove-ssse3-back strcasestr-nonascii strcasecmp_l-ssse3 \
+ memmove-ssse3 memcpy-ssse3-back mempcpy-ssse3-back memmove-ssse3-back \
+ memcpy-avx-unaligned mempcpy-avx-unaligned memmove-avx-unaligned \
+ strcasestr-nonascii strcasecmp_l-ssse3 \
strncase_l-ssse3 strlen-sse4 strlen-sse2-no-bsf memset-x86-64 \
strcpy-ssse3 strncpy-ssse3 stpcpy-ssse3 stpncpy-ssse3 \
strcpy-sse2-unaligned strncpy-sse2-unaligned \
diff --git a/sysdeps/x86_64/multiarch/ifunc-defines.sym b/sysdeps/x86_64/multiarch/ifunc-defines.sym
index eb1538a..448b8c4 100644
--- a/sysdeps/x86_64/multiarch/ifunc-defines.sym
+++ b/sysdeps/x86_64/multiarch/ifunc-defines.sym
@@ -17,4 +17,6 @@ FEATURE_OFFSET offsetof (struct cpu_features, feature)
FEATURE_SIZE sizeof (unsigned int)
COMMON_CPUID_INDEX_1
+COMMON_CPUID_INDEX_7
FEATURE_INDEX_1
+FEATURE_INDEX_7
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index 332a60d..5639702 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -50,6 +50,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
__memmove_chk_ssse3_back)
IFUNC_IMPL_ADD (array, i, __memmove_chk, HAS_SSSE3,
__memmove_chk_ssse3)
+ IFUNC_IMPL_ADD (array, i, __memmove_chk, HAS_AVX,
+ __memmove_chk_avx_unaligned)
IFUNC_IMPL_ADD (array, i, __memmove_chk, 1,
__memmove_chk_sse2))
@@ -59,6 +61,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
__memmove_ssse3_back)
IFUNC_IMPL_ADD (array, i, memmove, HAS_SSSE3,
__memmove_ssse3)
+ IFUNC_IMPL_ADD (array, i, memmove, HAS_AVX,
+ __memmove_avx_unaligned)
IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_sse2))
/* Support sysdeps/x86_64/multiarch/memset_chk.S. */
@@ -235,6 +239,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
__memcpy_chk_ssse3_back)
IFUNC_IMPL_ADD (array, i, __memcpy_chk, HAS_SSSE3,
__memcpy_chk_ssse3)
+ IFUNC_IMPL_ADD (array, i, __memcpy_chk, HAS_AVX,
+ __memcpy_chk_avx_unaligned)
IFUNC_IMPL_ADD (array, i, __memcpy_chk, 1,
__memcpy_chk_sse2))
@@ -243,6 +249,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, memcpy, HAS_SSSE3,
__memcpy_ssse3_back)
IFUNC_IMPL_ADD (array, i, memcpy, HAS_SSSE3, __memcpy_ssse3)
+ IFUNC_IMPL_ADD (array, i, memcpy, HAS_AVX, __memcpy_avx_unaligned)
IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_sse2))
/* Support sysdeps/x86_64/multiarch/mempcpy_chk.S. */
@@ -251,6 +258,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
__mempcpy_chk_ssse3_back)
IFUNC_IMPL_ADD (array, i, __mempcpy_chk, HAS_SSSE3,
__mempcpy_chk_ssse3)
+ IFUNC_IMPL_ADD (array, i, __mempcpy_chk, HAS_AVX,
+ __mempcpy_chk_avx_unaligned)
IFUNC_IMPL_ADD (array, i, __mempcpy_chk, 1,
__mempcpy_chk_sse2))
@@ -260,6 +269,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
__mempcpy_ssse3_back)
IFUNC_IMPL_ADD (array, i, mempcpy, HAS_SSSE3,
__mempcpy_ssse3)
+ IFUNC_IMPL_ADD (array, i, mempcpy, HAS_AVX,
+ __mempcpy_avx_unaligned)
IFUNC_IMPL_ADD (array, i, mempcpy, 1, __mempcpy_sse2))
/* Support sysdeps/x86_64/multiarch/strlen.S. */
diff --git a/sysdeps/x86_64/multiarch/memcpy-avx-unaligned.S b/sysdeps/x86_64/multiarch/memcpy-avx-unaligned.S
new file mode 100644
index 0000000..68901f6
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memcpy-avx-unaligned.S
@@ -0,0 +1,409 @@
+/* memcpy with AVX
+ Copyright (C) 2013 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+#if !defined NOT_IN_libc \
+ && (defined SHARED \
+ || defined USE_AS_MEMMOVE \
+ || !defined USE_MULTIARCH)
+
+#include "asm-syntax.h"
+#ifndef ALIGN
+# define ALIGN(n) .p2align n
+#endif
+#ifndef MEMCPY
+# define MEMCPY __memcpy_avx_unaligned
+# define MEMCPY_CHK __memcpy_chk_avx_unaligned
+#endif
+
+ .section .text.avx,"ax",@progbits
+#if !defined USE_AS_BCOPY
+ENTRY (MEMCPY_CHK)
+ cmpq %rdx, %rcx
+ jb HIDDEN_JUMPTARGET (__chk_fail)
+END (MEMCPY_CHK)
+#endif
+
+ENTRY (MEMCPY)
+ vzeroupper
+ mov %rdi, %rax
+
+#ifdef USE_AS_MEMPCPY
+ add %rdx, %rax
+#endif
+
+ lea (%rsi, %rdx), %r8
+ lea (%rdi, %rdx), %r9
+ cmp $256, %rdx
+ ja L(256bytesormore)
+ cmp $128, %edx
+ jb L(less_128bytes)
+ vmovups (%rsi), %xmm0
+ vmovups 0x10(%rsi), %xmm1
+ vmovups 0x20(%rsi), %xmm2
+ vmovups 0x30(%rsi), %xmm3
+ vmovups 0x40(%rsi), %xmm4
+ vmovups 0x50(%rsi), %xmm5
+ vmovups 0x60(%rsi), %xmm6
+ vmovups 0x70(%rsi), %xmm7
+ vmovups -0x80(%r8), %xmm8
+ vmovups -0x70(%r8), %xmm9
+ vmovups -0x60(%r8), %xmm10
+ vmovups -0x50(%r8), %xmm11
+ vmovups -0x40(%r8), %xmm12
+ vmovups -0x30(%r8), %xmm13
+ vmovups -0x20(%r8), %xmm14
+ vmovups -0x10(%r8), %xmm15
+ vmovups %xmm0, (%rdi)
+ vmovups %xmm1, 0x10(%rdi)
+ vmovups %xmm2, 0x20(%rdi)
+ vmovups %xmm3, 0x30(%rdi)
+ vmovups %xmm4, 0x40(%rdi)
+ vmovups %xmm5, 0x50(%rdi)
+ vmovups %xmm6, 0x60(%rdi)
+ vmovups %xmm7, 0x70(%rdi)
+ vmovups %xmm8, -0x80(%r9)
+ vmovups %xmm9, -0x70(%r9)
+ vmovups %xmm10, -0x60(%r9)
+ vmovups %xmm11, -0x50(%r9)
+ vmovups %xmm12, -0x40(%r9)
+ vmovups %xmm13, -0x30(%r9)
+ vmovups %xmm14, -0x20(%r9)
+ vmovups %xmm15, -0x10(%r9)
+ ret
+ ALIGN(4)
+L(less_128bytes):
+ cmp $64, %edx
+ jb L(less_64bytes)
+ vmovups (%rsi), %xmm0
+ vmovups 0x10(%rsi), %xmm1
+ vmovups 0x20(%rsi), %xmm2
+ vmovups 0x30(%rsi), %xmm3
+ vmovups -0x40(%r8), %xmm4
+ vmovups -0x30(%r8), %xmm5
+ vmovups -0x20(%r8), %xmm6
+ vmovups -0x10(%r8), %xmm7
+ vmovups %xmm0, (%rdi)
+ vmovups %xmm1, 0x10(%rdi)
+ vmovups %xmm2, 0x20(%rdi)
+ vmovups %xmm3, 0x30(%rdi)
+ vmovups %xmm4, -0x40(%r9)
+ vmovups %xmm5, -0x30(%r9)
+ vmovups %xmm6, -0x20(%r9)
+ vmovups %xmm7, -0x10(%r9)
+ ret
+ ALIGN(4)
+L(less_64bytes):
+ cmp $32, %edx
+ jb L(less_32bytes)
+ vmovups (%rsi), %xmm0
+ vmovups 0x10(%rsi), %xmm1
+ vmovups -0x20(%r8), %xmm6
+ vmovups -0x10(%r8), %xmm7
+ vmovups %xmm0, (%rdi)
+ vmovups %xmm1, 0x10(%rdi)
+ vmovups %xmm6, -0x20(%r9)
+ vmovups %xmm7, -0x10(%r9)
+ ret
+ ALIGN(4)
+L(less_32bytes):
+ cmp $16, %edx
+ jb L(less_16bytes)
+ vmovups (%rsi), %xmm0
+ vmovups -0x10(%r8), %xmm7
+ vmovups %xmm0, (%rdi)
+ vmovups %xmm7, -0x10(%r9)
+ ret
+ ALIGN(4)
+L(less_16bytes):
+ cmp $8, %edx
+ jb L(less_8bytes)
+ movq (%rsi), %rcx
+ movq -0x08(%r8), %r10
+ movq %rcx, (%rdi)
+ movq %r10, -0x08(%r9)
+ ret
+ ALIGN(4)
+L(less_8bytes):
+ cmp $4, %edx
+ jb L(less_4bytes)
+ mov (%rsi), %ecx
+ mov -0x04(%r8), %edx
+ mov %ecx, (%rdi)
+ mov %edx, -0x04(%r9)
+ ret
+ ALIGN(4)
+L(less_4bytes):
+ cmp $2, %edx
+ jb L(less_2bytes)
+ mov (%rsi), %cx
+ mov -0x02(%r8), %dx
+ mov %cx, (%rdi)
+ mov %dx, -0x02(%r9)
+ ret
+ ALIGN(4)
+L(less_2bytes):
+ cmp $1, %rdx
+ jb L(less_0bytes)
+ mov (%rsi), %cl
+ mov %cl, (%rdi)
+L(less_0bytes):
+ ret
+
+ ALIGN(4)
+L(256bytesormore):
+
+#ifdef USE_AS_MEMMOVE
+ cmp %rsi, %rdi
+ jae L(copy_backward)
+#endif
+ cmp $2048, %rdx
+ jae L(gobble_data_movsb)
+
+ vmovups -0x80(%r8), %xmm8
+ vmovups -0x70(%r8), %xmm9
+ vmovups -0x60(%r8), %xmm10
+ vmovups -0x50(%r8), %xmm11
+ vmovups -0x40(%r8), %xmm12
+ vmovups -0x30(%r8), %xmm13
+ vmovups -0x20(%r8), %xmm14
+ vmovups -0x10(%r8), %xmm15
+ vmovups (%rsi), %ymm4
+ mov %rdi, %r10
+ and $-32, %rdi
+ add $32, %rdi
+ mov %rdi, %r11
+ sub %r10, %r11
+ sub %r11, %rdx
+ add %r11, %rsi
+ sub $0x80, %rdx
+L(gobble_128_loop):
+ vmovups (%rsi), %ymm0
+ vmovups 0x20(%rsi), %ymm1
+ vmovups 0x40(%rsi), %ymm2
+ vmovups 0x60(%rsi), %ymm3
+ lea 0x80(%rsi), %rsi
+ vmovaps %ymm0, (%rdi)
+ vmovaps %ymm1, 0x20(%rdi)
+ vmovaps %ymm2, 0x40(%rdi)
+ vmovaps %ymm3, 0x60(%rdi)
+ lea 0x80(%rdi), %rdi
+ sub $0x80, %rdx
+ jae L(gobble_128_loop)
+ vmovups %ymm4, (%r10)
+ vzeroupper
+ vmovups %xmm8, -0x80(%r9)
+ vmovups %xmm9, -0x70(%r9)
+ vmovups %xmm10, -0x60(%r9)
+ vmovups %xmm11, -0x50(%r9)
+ vmovups %xmm12, -0x40(%r9)
+ vmovups %xmm13, -0x30(%r9)
+ vmovups %xmm14, -0x20(%r9)
+ vmovups %xmm15, -0x10(%r9)
+ ret
+
+L(gobble_data_movsb):
+
+#ifdef SHARED_CACHE_SIZE_HALF
+ mov $SHARED_CACHE_SIZE_HALF, %rcx
+#else
+ mov __x86_64_shared_cache_size_half(%rip), %rcx
+#endif
+ shl $3, %rcx
+
+#ifdef USE_AS_MEMMOVE
+ mov %rsi, %r10
+ sub %rdi, %r10
+ cmp %rdx, %r10
+ jae L(memmove_use_memcpy_fwd)
+ cmp %rcx, %r10
+ jae L(memmove_use_memcpy_fwd)
+ jmp L(gobble_mem_fwd_llc_start)
+L(memmove_use_memcpy_fwd):
+#endif
+
+ cmp %rcx, %rdx
+ ja L(gobble_big_data_fwd)
+
+#ifdef USE_AS_MEMMOVE
+L(gobble_mem_fwd_llc_start):
+#endif
+ mov %rdx, %rcx
+ rep movsb
+ ret
+
+L(gobble_big_data_fwd):
+ vmovups (%rsi), %ymm4
+ vmovups -0x80(%r8), %xmm5
+ vmovups -0x70(%r8), %xmm6
+ vmovups -0x60(%r8), %xmm7
+ vmovups -0x50(%r8), %xmm8
+ vmovups -0x40(%r8), %xmm9
+ vmovups -0x30(%r8), %xmm10
+ vmovups -0x20(%r8), %xmm11
+ vmovups -0x10(%r8), %xmm12
+ mov %rdi, %r8
+ and $-32, %rdi
+ add $32, %rdi
+ mov %rdi, %r10
+ sub %r8, %r10
+ sub %r10, %rdx
+ add %r10, %rsi
+ sub $0x80, %rdx
+L(gobble_mem_fwd_loop):
+ prefetcht0 0x1c0(%rsi)
+ prefetcht0 0x280(%rsi)
+ vmovups (%rsi), %xmm0
+ vmovups 0x10(%rsi), %xmm1
+ vmovups 0x20(%rsi), %xmm2
+ vmovups 0x30(%rsi), %xmm3
+ vmovntdq %xmm0, (%rdi)
+ vmovntdq %xmm1, 0x10(%rdi)
+ vmovntdq %xmm2, 0x20(%rdi)
+ vmovntdq %xmm3, 0x30(%rdi)
+ vmovups 0x40(%rsi), %xmm0
+ vmovups 0x50(%rsi), %xmm1
+ vmovups 0x60(%rsi), %xmm2
+ vmovups 0x70(%rsi), %xmm3
+ lea 0x80(%rsi), %rsi
+ vmovntdq %xmm0, 0x40(%rdi)
+ vmovntdq %xmm1, 0x50(%rdi)
+ vmovntdq %xmm2, 0x60(%rdi)
+ vmovntdq %xmm3, 0x70(%rdi)
+ lea 0x80(%rdi), %rdi
+ sub $0x80, %rdx
+ jae L(gobble_mem_fwd_loop)
+ sfence
+ vmovups %ymm4, (%r8)
+ vzeroupper
+ vmovups %xmm5, -0x80(%r9)
+ vmovups %xmm6, -0x70(%r9)
+ vmovups %xmm7, -0x60(%r9)
+ vmovups %xmm8, -0x50(%r9)
+ vmovups %xmm9, -0x40(%r9)
+ vmovups %xmm10, -0x30(%r9)
+ vmovups %xmm11, -0x20(%r9)
+ vmovups %xmm12, -0x10(%r9)
+ ret
+
+ ALIGN (4)
+L(copy_backward):
+#ifdef SHARED_CACHE_SIZE_HALF
+ mov $SHARED_CACHE_SIZE_HALF, %rcx
+#else
+ mov __x86_64_shared_cache_size_half(%rip), %rcx
+#endif
+ shl $3, %rcx
+ vmovups (%rsi), %xmm8
+ vmovups 0x10(%rsi), %xmm9
+ vmovups 0x20(%rsi), %xmm10
+ vmovups 0x30(%rsi), %xmm11
+ vmovups 0x40(%rsi), %xmm12
+ vmovups 0x50(%rsi), %xmm13
+ vmovups 0x60(%rsi), %xmm14
+ vmovups 0x70(%rsi), %xmm15
+ mov %rdi, %r9
+ add %rdx, %rsi
+ add %rdx, %rdi
+ vmovups -0x20(%rsi), %ymm4
+ lea -0x20(%rdi), %r10
+ mov %rdi, %r11
+ and $0x1f, %r11
+ xor %r11, %rdi
+ sub %r11, %rsi
+ sub %r11, %rdx
+#ifdef USE_AS_MEMMOVE
+ mov %rdi, %r11
+ sub %rsi, %r11
+ cmp %rdx, %r11
+ jae L(memmove_use_memcpy_bwd)
+ cmp %rcx, %r11
+ jae L(memmove_use_memcpy_bwd)
+ jmp L(gobble_mem_bwd_llc_start)
+#endif
+L(memmove_use_memcpy_bwd):
+ cmp %rcx, %rdx
+ ja L(gobble_big_data_bwd)
+L(gobble_mem_bwd_llc_start):
+ sub $0x80, %rdx
+L(gobble_mem_bwd_llc):
+ vmovups -0x20(%rsi), %ymm0
+ vmovups -0x40(%rsi), %ymm1
+ vmovups -0x60(%rsi), %ymm2
+ vmovups -0x80(%rsi), %ymm3
+ lea -0x80(%rsi), %rsi
+ vmovaps %ymm0, -0x20(%rdi)
+ vmovaps %ymm1, -0x40(%rdi)
+ vmovaps %ymm2, -0x60(%rdi)
+ vmovaps %ymm3, -0x80(%rdi)
+ lea -0x80(%rdi), %rdi
+ sub $0x80, %rdx
+ jae L(gobble_mem_bwd_llc)
+ vmovups %ymm4, (%r10)
+ vzeroupper
+ vmovups %xmm8, (%r9)
+ vmovups %xmm9, 0x10(%r9)
+ vmovups %xmm10, 0x20(%r9)
+ vmovups %xmm11, 0x30(%r9)
+ vmovups %xmm12, 0x40(%r9)
+ vmovups %xmm13, 0x50(%r9)
+ vmovups %xmm14, 0x60(%r9)
+ vmovups %xmm15, 0x70(%r9)
+ ret
+
+L(gobble_big_data_bwd):
+ sub $0x80, %rdx
+L(gobble_mem_bwd_loop):
+ prefetcht0 -0x1c0(%rsi)
+ prefetcht0 -0x280(%rsi)
+ vmovups -0x10(%rsi), %xmm0
+ vmovups -0x20(%rsi), %xmm1
+ vmovups -0x30(%rsi), %xmm2
+ vmovups -0x40(%rsi), %xmm3
+ vmovntdq %xmm0, -0x10(%rdi)
+ vmovntdq %xmm1, -0x20(%rdi)
+ vmovntdq %xmm2, -0x30(%rdi)
+ vmovntdq %xmm3, -0x40(%rdi)
+ vmovups -0x50(%rsi), %xmm0
+ vmovups -0x60(%rsi), %xmm1
+ vmovups -0x70(%rsi), %xmm2
+ vmovups -0x80(%rsi), %xmm3
+ lea -0x80(%rsi), %rsi
+ vmovntdq %xmm0, -0x50(%rdi)
+ vmovntdq %xmm1, -0x60(%rdi)
+ vmovntdq %xmm2, -0x70(%rdi)
+ vmovntdq %xmm3, -0x80(%rdi)
+ lea -0x80(%rdi), %rdi
+ sub $0x80, %rdx
+ jae L(gobble_mem_bwd_loop)
+ sfence
+ vmovups %ymm4, (%r10)
+ vzeroupper
+ vmovups %xmm8, (%r9)
+ vmovups %xmm9, 0x10(%r9)
+ vmovups %xmm10, 0x20(%r9)
+ vmovups %xmm11, 0x30(%r9)
+ vmovups %xmm12, 0x40(%r9)
+ vmovups %xmm13, 0x50(%r9)
+ vmovups %xmm14, 0x60(%r9)
+ vmovups %xmm15, 0x70(%r9)
+ ret
+END (MEMCPY)
+#endif
diff --git a/sysdeps/x86_64/multiarch/memmove-avx-unaligned.S b/sysdeps/x86_64/multiarch/memmove-avx-unaligned.S
new file mode 100644
index 0000000..352a2c3
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memmove-avx-unaligned.S
@@ -0,0 +1,4 @@
+#define USE_AS_MEMMOVE
+#define MEMCPY __memmove_avx_unaligned
+#define MEMCPY_CHK __memmove_chk_avx_unaligned
+#include "memcpy-avx-unaligned.S"
diff --git a/sysdeps/x86_64/multiarch/mempcpy-avx-unaligned.S b/sysdeps/x86_64/multiarch/mempcpy-avx-unaligned.S
new file mode 100644
index 0000000..b31394e
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/mempcpy-avx-unaligned.S
@@ -0,0 +1,4 @@
+#define USE_AS_MEMPCPY
+#define MEMCPY __mempcpy_avx_unaligned
+#define MEMCPY_CHK __mempcpy_chk_avx_unaligned
+#include "memcpy-avx-unaligned.S"
--
1.8.1.4