diff --git a/sysdeps/i386/i686/multiarch/Makefile b/sysdeps/i386/i686/multiarch/Makefile
index 55778cb..b224088 100644
--- a/sysdeps/i386/i686/multiarch/Makefile
+++ b/sysdeps/i386/i686/multiarch/Makefile
@@ -11,16 +11,16 @@ sysdep_routines += bzero-sse2 memset-sse2 memcpy-ssse3 mempcpy-ssse3 \
 		   memmove-ssse3-rep bcopy-ssse3 bcopy-ssse3-rep \
 		   memset-sse2-rep bzero-sse2-rep strcmp-ssse3 \
 		   strcmp-sse4 strncmp-c strncmp-ssse3 strncmp-sse4 \
-		   memcmp-ssse3 memcmp-sse4 varshift \
+		   memcmp-ssse3 memcmp-sse4 varshift memcpy-sse2-unaligned \
 		   strlen-sse2 strlen-sse2-bsf strncpy-c strcpy-ssse3 \
 		   strncpy-ssse3 stpcpy-ssse3 stpncpy-ssse3 strcpy-sse2 \
 		   strncpy-sse2 stpcpy-sse2 stpncpy-sse2 strcat-ssse3 \
 		   strcat-sse2 strncat-ssse3 strncat-sse2 strncat-c \
 		   strchr-sse2 strrchr-sse2 strchr-sse2-bsf strrchr-sse2-bsf \
-		   memchr-sse2 memchr-sse2-bsf \
+		   memchr-sse2 memchr-sse2-bsf mempcpy-sse2-unaligned \
 		   memrchr-sse2 memrchr-sse2-bsf memrchr-c \
 		   rawmemchr-sse2 rawmemchr-sse2-bsf \
-		   strnlen-sse2 strnlen-c \
+		   strnlen-sse2 strnlen-c memmove-sse2-unaligned \
 		   strcasecmp_l-c strcasecmp-c strcasecmp_l-ssse3 \
 		   strncase_l-c strncase-c strncase_l-ssse3 \
 		   strcasecmp_l-sse4 strncase_l-sse4
diff --git a/sysdeps/i386/i686/multiarch/ifunc-impl-list.c b/sysdeps/i386/i686/multiarch/ifunc-impl-list.c
index e475776..8a6c4a8 100644
--- a/sysdeps/i386/i686/multiarch/ifunc-impl-list.c
+++ b/sysdeps/i386/i686/multiarch/ifunc-impl-list.c
@@ -23,7 +23,7 @@
 #include "init-arch.h"
 
 /* Maximum number of IFUNC implementations.  */
-#define MAX_IFUNC	3
+#define MAX_IFUNC	4
 
 /* Fill ARRAY of MAX elements with IFUNC implementations for function
    NAME and return the number of valid entries.  */
@@ -69,6 +69,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			      __memmove_chk_ssse3_rep)
 	      IFUNC_IMPL_ADD (array, i, __memmove_chk, HAS_SSSE3,
 			      __memmove_chk_ssse3)
+	      IFUNC_IMPL_ADD (array, i, __memmove_chk, HAS_SSSE3,
+			      __memmove_chk_sse2_unaligned)
 	      IFUNC_IMPL_ADD (array, i, __memmove_chk, 1,
 			      __memmove_chk_ia32))
 
@@ -78,6 +80,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			      __memmove_ssse3_rep)
 	      IFUNC_IMPL_ADD (array, i, memmove, HAS_SSSE3,
 			      __memmove_ssse3)
+	      IFUNC_IMPL_ADD (array, i, memmove, 1,
+			      __memmove_sse2_unaligned)
 	      IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_ia32))
 
   /* Support sysdeps/i386/i686/multiarch/memrchr.S.  */
@@ -269,6 +273,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 	      IFUNC_IMPL_ADD (array, i, __memcpy_chk, HAS_SSSE3,
 			      __memcpy_chk_ssse3)
 	      IFUNC_IMPL_ADD (array, i, __memcpy_chk, 1,
+			      __memcpy_chk_sse2_unaligned)
+	      IFUNC_IMPL_ADD (array, i, __memcpy_chk, 1,
 			      __memcpy_chk_ia32))
 
   /* Support sysdeps/i386/i686/multiarch/memcpy.S.  */
@@ -276,6 +282,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 	      IFUNC_IMPL_ADD (array, i, memcpy, HAS_SSSE3,
 			      __memcpy_ssse3_rep)
 	      IFUNC_IMPL_ADD (array, i, memcpy, HAS_SSSE3, __memcpy_ssse3)
+	      IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_sse2_unaligned)
 	      IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_ia32))
 
   /* Support sysdeps/i386/i686/multiarch/mempcpy_chk.S.  */
@@ -284,6 +291,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			      __mempcpy_chk_ssse3_rep)
 	      IFUNC_IMPL_ADD (array, i, __mempcpy_chk, HAS_SSSE3,
 			      __mempcpy_chk_ssse3)
+	      IFUNC_IMPL_ADD (array, i, __mempcpy_chk, HAS_SSSE3,
+			      __mempcpy_chk_sse2_unaligned)
 	      IFUNC_IMPL_ADD (array, i, __mempcpy_chk, 1,
 			      __mempcpy_chk_ia32))
 
@@ -293,6 +302,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			      __mempcpy_ssse3_rep)
 	      IFUNC_IMPL_ADD (array, i, mempcpy, HAS_SSSE3,
 			      __mempcpy_ssse3)
+	      IFUNC_IMPL_ADD (array, i, mempcpy, 1, __mempcpy_sse2_unaligned)
 	      IFUNC_IMPL_ADD (array, i, mempcpy, 1, __mempcpy_ia32))
 
   /* Support sysdeps/i386/i686/multiarch/strlen.S.  */
diff --git a/sysdeps/i386/i686/multiarch/memcpy-sse2-unaligned.S b/sysdeps/i386/i686/multiarch/memcpy-sse2-unaligned.S
new file mode 100644
index 0000000..c1c9410
--- /dev/null
+++ b/sysdeps/i386/i686/multiarch/memcpy-sse2-unaligned.S
@@ -0,0 +1,752 @@
+/* memcpy optimized with sse2 unaligned memory access instructions. 
+   Copyright (C) 2010-2014 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#if !defined NOT_IN_libc \
+    && (defined SHARED \
+	|| defined USE_AS_MEMMOVE \
+	|| !defined USE_MULTIARCH)
+
+# include <sysdep.h>
+# include "asm-syntax.h"
+
+# ifndef MEMCPY
+#  define MEMCPY	__memcpy_sse2_unaligned
+#  define MEMCPY_CHK	__memcpy_chk_sse2_unaligned
+# endif
+
+# define DEST		PARMS
+# define SRC		DEST+4
+# define LEN		SRC+4
+
+# define CFI_PUSH(REG)		\
+  cfi_adjust_cfa_offset (4);		\
+  cfi_rel_offset (REG, 0)
+
+# define CFI_POP(REG)		\
+  cfi_adjust_cfa_offset (-4);		\
+  cfi_restore (REG)
+
+# define PUSH(REG)	pushl REG; CFI_PUSH (REG)
+# define POP(REG)	popl REG; CFI_POP (REG)
+
+# define PARMS		8		/* Preserve EBX.  */
+# define ENTRANCE	PUSH (%ebx);
+# define RETURN_END	POP (%ebx); ret
+# define RETURN	RETURN_END; CFI_PUSH (%ebx)
+
+	.section .text.sse2,"ax",@progbits
+# if !defined USE_AS_BCOPY
+ENTRY (MEMCPY_CHK)
+	movl	12(%esp), %eax
+	cmpl	%eax, 16(%esp)
+	jb	HIDDEN_JUMPTARGET (__chk_fail)
+END (MEMCPY_CHK)
+# endif
+
+ENTRY (MEMCPY)
+	ENTRANCE
+	movl	LEN(%esp), %ecx
+	movl	SRC(%esp), %eax
+	movl	DEST(%esp), %edx
+
+	cmp	%edx, %eax
+	je	L(return)
+
+# ifdef USE_AS_MEMMOVE
+	jg	L(check_forward)
+
+	add	%ecx, %eax
+	cmp	%edx, %eax
+	movl	SRC(%esp), %eax
+	jle	L(forward)
+
+L(mm_len_0_or_more_backward):
+/* Now do checks for lengths. We do [0..16], [16..32], [32..64], [64..128]
+	separately.  */
+	cmp	$16, %ecx
+	jbe	L(mm_len_0_16_bytes_backward)
+
+	cmpl	$32, %ecx
+	jg	L(mm_len_32_or_more_backward)
+
+/* Copy [0..32] and return.  */
+	movdqu	(%eax), %xmm0
+	movdqu	-16(%eax, %ecx), %xmm1
+	movdqu	%xmm0, (%edx)
+	movdqu	%xmm1, -16(%edx, %ecx)
+	jmp	L(return)
+
+L(mm_len_32_or_more_backward):
+	cmpl	$64, %ecx
+	jg	L(mm_len_64_or_more_backward)
+
+/* Copy [0..64] and return.  */
+	movdqu	(%eax), %xmm0
+	movdqu	16(%eax), %xmm1
+	movdqu	-16(%eax, %ecx), %xmm2
+	movdqu	-32(%eax, %ecx), %xmm3
+	movdqu	%xmm0, (%edx)
+	movdqu	%xmm1, 16(%edx)
+	movdqu	%xmm2, -16(%edx, %ecx)
+	movdqu	%xmm3, -32(%edx, %ecx)
+	jmp	L(return)
+
+L(mm_len_64_or_more_backward):
+	cmpl	$128, %ecx
+	jg	L(mm_len_128_or_more_backward)
+
+/* Copy [0..128] and return.  */
+	movdqu	(%eax), %xmm0
+	movdqu	16(%eax), %xmm1
+	movdqu	32(%eax), %xmm2
+	movdqu	48(%eax), %xmm3
+	movdqu	-64(%eax, %ecx), %xmm4
+	movdqu	-48(%eax, %ecx), %xmm5
+	movdqu	-32(%eax, %ecx), %xmm6
+	movdqu	-16(%eax, %ecx), %xmm7
+	movdqu	%xmm0, (%edx)
+	movdqu	%xmm1, 16(%edx)
+	movdqu	%xmm2, 32(%edx)
+	movdqu	%xmm3, 48(%edx)
+	movdqu	%xmm4, -64(%edx, %ecx)
+	movdqu	%xmm5, -48(%edx, %ecx)
+	movdqu	%xmm6, -32(%edx, %ecx)
+	movdqu	%xmm7, -16(%edx, %ecx)
+	jmp	L(return)
+
+L(mm_len_128_or_more_backward):
+	PUSH (%esi)
+	PUSH (%edi)
+
+/* Aligning the address of destination. We need to save
+	16 bits from the source in order not to overwrite them.  */
+	movdqu	-16(%eax, %ecx), %xmm0
+	movdqu	-32(%eax, %ecx), %xmm1
+	movdqu	-48(%eax, %ecx), %xmm2
+	movdqu	-64(%eax, %ecx), %xmm3
+
+	leal	(%edx, %ecx), %edi
+	andl	$-64, %edi
+
+	movl	%eax, %esi
+	subl	%edx, %esi
+
+	movdqu	-16(%edi, %esi), %xmm4
+	movdqu	-32(%edi, %esi), %xmm5
+	movdqu	-48(%edi, %esi), %xmm6
+	movdqu	-64(%edi, %esi), %xmm7
+
+	movdqu	%xmm0, -16(%edx, %ecx)
+	movdqu	%xmm1, -32(%edx, %ecx)
+	movdqu	%xmm2, -48(%edx, %ecx)
+	movdqu	%xmm3, -64(%edx, %ecx)
+	movdqa	%xmm4, -16(%edi)
+	movdqa	%xmm5, -32(%edi)
+	movdqa	%xmm6, -48(%edi)
+	movdqa	%xmm7, -64(%edi)
+	leal	-64(%edi), %edi
+
+	leal	64(%edx), %ebx
+	andl	$-64, %ebx
+
+	cmp	%edi, %ebx
+	jae	L(mm_main_loop_backward_end)
+
+# ifdef SHARED_CACHE_SIZE_HALF
+	cmp	$SHARED_CACHE_SIZE_HALF, %ecx
+# else
+#  ifdef SHARED
+	PUSH(%ebx)
+	SETUP_PIC_REG(bx)
+	add	$_GLOBAL_OFFSET_TABLE_, %ebx
+	cmp	__x86_shared_cache_size_half@GOTOFF(%ebx), %ecx
+	POP(%ebx)
+#  else
+	cmp	__x86_shared_cache_size_half, %ecx
+#  endif
+# endif
+	jae	L(mm_large_page_loop_backward)
+
+	.p2align 4
+L(mm_main_loop_backward):
+
+	prefetcht0 -128(%edi, %esi)
+
+	movdqu	-64(%edi, %esi), %xmm0
+	movdqu	-48(%edi, %esi), %xmm1
+	movdqu	-32(%edi, %esi), %xmm2
+	movdqu	-16(%edi, %esi), %xmm3
+	movdqa	%xmm0, -64(%edi)
+	movdqa	%xmm1, -48(%edi)
+	movdqa	%xmm2, -32(%edi)
+	movdqa	%xmm3, -16(%edi)
+	leal	-64(%edi), %edi
+	cmp	%edi, %ebx
+	jb	L(mm_main_loop_backward)
+L(mm_main_loop_backward_end):
+	POP (%edi)
+	POP (%esi)
+	jmp	L(mm_recalc_len)
+
+/* Copy [0..16] and return.  */
+L(mm_len_0_16_bytes_backward):
+	testb	$24, %cl
+	jnz	L(mm_len_9_16_bytes_backward)
+	testb	$4, %cl
+	.p2align 4,,5
+	jnz	L(mm_len_5_8_bytes_backward)
+	testl	%ecx, %ecx
+	.p2align 4,,2
+	je	L(return)
+	testb	$2, %cl
+	.p2align 4,,1
+	jne	L(mm_len_3_4_bytes_backward)
+	movzbl	-1(%eax,%ecx), %ebx
+	movzbl	(%eax), %eax
+	movb	%bl, -1(%edx,%ecx)
+	movb	%al, (%edx)
+	jmp	L(return)
+
+L(mm_len_3_4_bytes_backward):
+	movzwl	-2(%eax,%ecx), %ebx
+	movzwl	(%eax), %eax
+	movw	%bx, -2(%edx,%ecx)
+	movw	%ax, (%edx)
+	jmp	L(return)
+
+L(mm_len_9_16_bytes_backward):
+	PUSH (%esi)
+	movl	-4(%eax,%ecx), %ebx
+	movl	-8(%eax,%ecx), %esi
+	movl	%ebx, -4(%edx,%ecx)
+	movl	%esi, -8(%edx,%ecx)
+	subl	$8, %ecx
+	POP (%esi)
+	jmp	L(mm_len_0_16_bytes_backward)
+
+L(mm_len_5_8_bytes_backward):
+	movl	(%eax), %ebx
+	movl	-4(%eax,%ecx), %eax
+	movl	%ebx, (%edx)
+	movl	%eax, -4(%edx,%ecx)
+	jmp	L(return)
+
+/* Big length copy backward part.  */
+	.p2align 4
+L(mm_large_page_loop_backward):
+	movdqu	-64(%edi, %esi), %xmm0
+	movdqu	-48(%edi, %esi), %xmm1
+	movdqu	-32(%edi, %esi), %xmm2
+	movdqu	-16(%edi, %esi), %xmm3
+	movntdq	%xmm0, -64(%edi)
+	movntdq	%xmm1, -48(%edi)
+	movntdq	%xmm2, -32(%edi)
+	movntdq	%xmm3, -16(%edi)
+	leal	-64(%edi), %edi
+	cmp	%edi, %ebx
+	jb	L(mm_large_page_loop_backward)
+	sfence
+	POP (%edi)
+	POP (%esi)
+
+L(mm_recalc_len):
+/* Compute in %ecx how many bytes are left to copy after
+	the main loop stops.  */
+	movl	%ebx, %ecx
+	subl	%edx, %ecx
+	jmp	L(mm_len_0_or_more_backward)
+
+L(check_forward):
+	add	%edx, %ecx
+	cmp	%eax, %ecx
+	movl	LEN(%esp), %ecx	
+	jle	L(forward)
+
+/* Now do checks for lengths. We do [0..16], [0..32], [0..64], [0..128]
+	separately.  */
+	cmp	$16, %ecx
+	jbe	L(mm_len_0_16_bytes_forward)
+
+	cmpl	$32, %ecx
+	ja	L(mm_len_32_or_more_forward)
+
+/* Copy [0..32] and return.  */
+	movdqu	(%eax), %xmm0
+	movdqu	-16(%eax, %ecx), %xmm1
+	movdqu	%xmm0, (%edx)
+	movdqu	%xmm1, -16(%edx, %ecx)
+	jmp	L(return)
+
+L(mm_len_32_or_more_forward):
+	cmpl	$64, %ecx
+	ja	L(mm_len_64_or_more_forward)
+
+/* Copy [0..64] and return.  */
+	movdqu	(%eax), %xmm0
+	movdqu	16(%eax), %xmm1
+	movdqu	-16(%eax, %ecx), %xmm2
+	movdqu	-32(%eax, %ecx), %xmm3
+	movdqu	%xmm0, (%edx)
+	movdqu	%xmm1, 16(%edx)
+	movdqu	%xmm2, -16(%edx, %ecx)
+	movdqu	%xmm3, -32(%edx, %ecx)
+	jmp	L(return)
+
+L(mm_len_64_or_more_forward):
+	cmpl	$128, %ecx
+	ja	L(mm_len_128_or_more_forward)
+
+/* Copy [0..128] and return.  */
+	movdqu	(%eax), %xmm0
+	movdqu	16(%eax), %xmm1
+	movdqu	32(%eax), %xmm2
+	movdqu	48(%eax), %xmm3
+	movdqu	-64(%eax, %ecx), %xmm4
+	movdqu	-48(%eax, %ecx), %xmm5
+	movdqu	-32(%eax, %ecx), %xmm6
+	movdqu	-16(%eax, %ecx), %xmm7
+	movdqu	%xmm0, (%edx)
+	movdqu	%xmm1, 16(%edx)
+	movdqu	%xmm2, 32(%edx)
+	movdqu	%xmm3, 48(%edx)
+	movdqu	%xmm4, -64(%edx, %ecx)
+	movdqu	%xmm5, -48(%edx, %ecx)
+	movdqu	%xmm6, -32(%edx, %ecx)
+	movdqu	%xmm7, -16(%edx, %ecx)
+	jmp	L(return)
+
+L(mm_len_128_or_more_forward):
+	PUSH (%esi)
+	PUSH (%edi)
+
+/* Aligning the address of destination.  */
+	movdqu	(%eax), %xmm0
+	movdqu	16(%eax), %xmm1
+	movdqu	32(%eax), %xmm2
+	movdqu	48(%eax), %xmm3
+
+	leal	64(%edx), %edi
+	andl	$-64, %edi
+	subl	%edx, %eax
+
+	movdqu	(%eax, %edi), %xmm4
+	movdqu	16(%eax, %edi), %xmm5
+	movdqu	32(%eax, %edi), %xmm6
+	movdqu	48(%eax, %edi), %xmm7
+
+	movdqu	%xmm0, (%edx)
+	movdqu	%xmm1, 16(%edx)
+	movdqu	%xmm2, 32(%edx)
+	movdqu	%xmm3, 48(%edx)
+	movdqa	%xmm4, (%edi)
+	movaps	%xmm5, 16(%edi)
+	movaps	%xmm6, 32(%edi)
+	movaps	%xmm7, 48(%edi)
+	addl	$64, %edi
+
+	leal	(%edx, %ecx), %ebx
+	andl	$-64, %ebx
+	cmp	%edi, %ebx
+	jbe	L(mm_copy_remaining_forward)
+
+# ifdef SHARED_CACHE_SIZE_HALF
+	cmp	$SHARED_CACHE_SIZE_HALF, %ecx
+# else
+#  ifdef SHARED
+	PUSH(%ebx)
+	SETUP_PIC_REG(bx)
+	add	$_GLOBAL_OFFSET_TABLE_, %ebx
+	cmp	__x86_shared_cache_size_half@GOTOFF(%ebx), %ecx
+	POP(%ebx)
+#  else
+	cmp	__x86_shared_cache_size_half, %ecx
+#  endif
+# endif
+	jae	L(mm_large_page_loop_forward)
+
+	.p2align 4
+L(mm_main_loop_forward):
+
+	prefetcht0 128(%eax, %edi)
+
+	movdqu	(%eax, %edi), %xmm0
+	movdqu	16(%eax, %edi), %xmm1
+	movdqu	32(%eax, %edi), %xmm2
+	movdqu	48(%eax, %edi), %xmm3
+	movdqa	%xmm0, (%edi)
+	movaps	%xmm1, 16(%edi)
+	movaps	%xmm2, 32(%edi)
+	movaps	%xmm3, 48(%edi)
+	leal	64(%edi), %edi
+	cmp	%edi, %ebx
+	ja	L(mm_main_loop_forward)
+
+L(mm_copy_remaining_forward):
+	addl	%edx, %ecx
+	subl	%edi, %ecx
+/* We copied all up till %edi position in the dst.
+	In %ecx now is how many bytes are left to copy.
+	Now we need to advance %esi. */
+	leal	(%edi, %eax), %esi
+
+L(mm_remaining_0_64_bytes_forward):
+	cmp	$32, %ecx
+	ja	L(mm_remaining_33_64_bytes_forward)
+	cmp	$16, %ecx
+	ja	L(mm_remaining_17_32_bytes_forward)
+	testl	%ecx, %ecx
+	.p2align 4,,2
+	je	L(mm_return_pop_all)
+
+	cmpb	$8, %cl
+	ja	L(mm_remaining_9_16_bytes_forward)
+	cmpb	$4, %cl
+	.p2align 4,,5
+	ja	L(mm_remaining_5_8_bytes_forward)
+	cmpb	$2, %cl
+	.p2align 4,,1
+	ja	L(mm_remaining_3_4_bytes_forward)
+	movzbl	-1(%esi,%ecx), %eax
+	movzbl	(%esi), %ebx
+	movb	%al, -1(%edi,%ecx)
+	movb	%bl, (%edi)
+	jmp	L(mm_return_pop_all)
+
+L(mm_remaining_33_64_bytes_forward):
+	movdqu	(%esi), %xmm0
+	movdqu	16(%esi), %xmm1
+	movdqu	-32(%esi, %ecx), %xmm2
+	movdqu	-16(%esi, %ecx), %xmm3
+	movdqu	%xmm0, (%edi)
+	movdqu	%xmm1, 16(%edi)
+	movdqu	%xmm2, -32(%edi, %ecx)
+	movdqu	%xmm3, -16(%edi, %ecx)
+	jmp	L(mm_return_pop_all)
+
+L(mm_remaining_17_32_bytes_forward):
+	movdqu	(%esi), %xmm0
+	movdqu	-16(%esi, %ecx), %xmm1
+	movdqu	%xmm0, (%edi)
+	movdqu	%xmm1, -16(%edi, %ecx)
+	jmp	L(mm_return_pop_all)
+
+L(mm_remaining_9_16_bytes_forward):
+	movq	(%esi), %xmm0
+	movq	-8(%esi, %ecx), %xmm1
+	movq	%xmm0, (%edi)
+	movq	%xmm1, -8(%edi, %ecx)
+	jmp	L(mm_return_pop_all)
+
+L(mm_remaining_5_8_bytes_forward):
+	movl	(%esi), %eax
+	movl	-4(%esi,%ecx), %ebx
+	movl	%eax, (%edi)
+	movl	%ebx, -4(%edi,%ecx)
+	jmp	L(mm_return_pop_all)
+
+L(mm_remaining_3_4_bytes_forward):
+	movzwl	-2(%esi,%ecx), %eax
+	movzwl	(%esi), %ebx
+	movw	%ax, -2(%edi,%ecx)
+	movw	%bx, (%edi)
+	jmp	L(mm_return_pop_all)
+
+L(mm_len_0_16_bytes_forward):
+	testb	$24, %cl
+	jne	L(mm_len_9_16_bytes_forward)
+	testb	$4, %cl
+	.p2align 4,,5
+	jne	L(mm_len_5_8_bytes_forward)
+	testl	%ecx, %ecx
+	.p2align 4,,2
+	je	L(return)
+	testb	$2, %cl
+	.p2align 4,,1
+	jne	L(mm_len_2_4_bytes_forward)
+	movzbl	-1(%eax,%ecx), %ebx
+	movzbl	(%eax), %eax
+	movb	%bl, -1(%edx,%ecx)
+	movb	%al, (%edx)
+	jmp	L(return)
+
+L(mm_len_2_4_bytes_forward):
+	movzwl	-2(%eax,%ecx), %ebx
+	movzwl	(%eax), %eax
+	movw	%bx, -2(%edx,%ecx)
+	movw	%ax, (%edx)
+	jmp	L(return)
+
+L(mm_len_5_8_bytes_forward):
+	movl	(%eax), %ebx
+	movl	-4(%eax,%ecx), %eax
+	movl	%ebx, (%edx)
+	movl	%eax, -4(%edx,%ecx)
+	jmp	L(return)
+
+L(mm_len_9_16_bytes_forward):
+	movq	(%eax), %xmm0
+	movq	-8(%eax, %ecx), %xmm1
+	movq	%xmm0, (%edx)
+	movq	%xmm1, -8(%edx, %ecx)
+	jmp	L(return)
+
+L(mm_return_pop_all):
+	movl	%edx, %eax
+	POP (%edi)
+	POP (%esi)
+	RETURN
+
+/* Big length copy forward part.  */
+
+	.p2align 4
+L(mm_large_page_loop_forward):
+	movdqu	(%eax, %edi), %xmm0
+	movdqu	16(%eax, %edi), %xmm1
+	movdqu	32(%eax, %edi), %xmm2
+	movdqu	48(%eax, %edi), %xmm3
+	movntdq	%xmm0, (%edi)
+	movntdq	%xmm1, 16(%edi)
+	movntdq	%xmm2, 32(%edi)
+	movntdq	%xmm3, 48(%edi)
+	leal	64(%edi), %edi
+	cmp	%edi, %ebx
+	ja	L(mm_large_page_loop_forward)
+	sfence
+	jmp	L(mm_copy_remaining_forward)
+# endif
+
+L(forward):
+	cmp	$16, %ecx
+	jbe	L(len_0_16_bytes)
+
+# ifdef SHARED_CACHE_SIZE_HALF
+	cmp	$SHARED_CACHE_SIZE_HALF, %ecx
+# else
+#  ifdef SHARED
+	SETUP_PIC_REG(bx)
+	add	$_GLOBAL_OFFSET_TABLE_, %ebx
+	cmp	__x86_shared_cache_size_half@GOTOFF(%ebx), %ecx
+#  else
+	cmp	__x86_shared_cache_size_half, %ecx
+#  endif
+# endif
+
+	jae     L(large_page)
+
+	movdqu	(%eax), %xmm0
+	movdqu	-16(%eax, %ecx), %xmm1
+	cmpl    $32, %ecx
+	movdqu	%xmm0, (%edx)
+	movdqu	%xmm1, -16(%edx, %ecx)
+	jbe	L(return)
+
+	movdqu	16(%eax), %xmm0
+	movdqu	-32(%eax, %ecx), %xmm1
+	cmpl    $64, %ecx
+	movdqu	%xmm0, 16(%edx)
+	movdqu	%xmm1, -32(%edx, %ecx)
+	jbe	L(return)
+
+	movdqu	32(%eax), %xmm0
+	movdqu	48(%eax), %xmm1
+	movdqu	-48(%eax, %ecx), %xmm2
+	movdqu	-64(%eax, %ecx), %xmm3
+	cmpl    $128, %ecx
+	movdqu	%xmm0, 32(%edx)
+	movdqu	%xmm1, 48(%edx)
+	movdqu	%xmm2, -48(%edx, %ecx)
+	movdqu	%xmm3, -64(%edx, %ecx)
+	jbe	L(return)
+
+/* Now the main loop: we align the address of the destination.  */
+	leal	64(%edx), %ebx
+	andl	$-64, %ebx
+
+	addl	%edx, %ecx
+	andl	$-64, %ecx
+
+	subl	%edx, %eax
+
+/* We should stop two iterations before the termination
+	(in order not to misprefetch).  */
+	subl	$64, %ecx
+	cmpl	%ebx, %ecx
+	je	L(main_loop_just_one_iteration)
+
+	subl	$64, %ecx
+	cmpl	%ebx, %ecx
+	je	L(main_loop_last_two_iterations)
+
+
+	.p2align 4
+L(main_loop_cache):
+
+	prefetcht0 128(%ebx, %eax)
+
+	movdqu	(%ebx, %eax), %xmm0
+	movdqu	16(%ebx, %eax), %xmm1
+	movdqu	32(%ebx, %eax), %xmm2
+	movdqu	48(%ebx, %eax), %xmm3
+	movdqa	%xmm0, (%ebx)
+	movdqa	%xmm1, 16(%ebx)
+	movdqa	%xmm2, 32(%ebx)
+	movdqa	%xmm3, 48(%ebx)
+	lea	64(%ebx), %ebx
+	cmpl	%ebx, %ecx
+	jne	L(main_loop_cache)
+
+L(main_loop_last_two_iterations):
+	movdqu	(%ebx, %eax), %xmm0
+	movdqu	16(%ebx, %eax), %xmm1
+	movdqu	32(%ebx, %eax), %xmm2
+	movdqu	48(%ebx, %eax), %xmm3
+	movdqu	64(%ebx, %eax), %xmm4
+	movdqu	80(%ebx, %eax), %xmm5
+	movdqu	96(%ebx, %eax), %xmm6
+	movdqu	112(%ebx, %eax), %xmm7
+	movdqa	%xmm0, (%ebx)
+	movdqa	%xmm1, 16(%ebx)
+	movdqa	%xmm2, 32(%ebx)
+	movdqa	%xmm3, 48(%ebx)
+	movdqa	%xmm4, 64(%ebx)
+	movdqa	%xmm5, 80(%ebx)
+	movdqa	%xmm6, 96(%ebx)
+	movdqa	%xmm7, 112(%ebx)
+	jmp	L(return)
+
+L(main_loop_just_one_iteration):
+	movdqu	(%ebx, %eax), %xmm0
+	movdqu	16(%ebx, %eax), %xmm1
+	movdqu	32(%ebx, %eax), %xmm2
+	movdqu	48(%ebx, %eax), %xmm3
+	movdqa	%xmm0, (%ebx)
+	movdqa	%xmm1, 16(%ebx)
+	movdqa	%xmm2, 32(%ebx)
+	movdqa	%xmm3, 48(%ebx)
+	jmp	L(return)
+
+L(large_page):
+	movdqu	(%eax), %xmm0
+	movdqu	16(%eax), %xmm1
+	movdqu	32(%eax), %xmm2
+	movdqu	48(%eax), %xmm3
+	movdqu	-64(%eax, %ecx), %xmm4
+	movdqu	-48(%eax, %ecx), %xmm5
+	movdqu	-32(%eax, %ecx), %xmm6
+	movdqu	-16(%eax, %ecx), %xmm7
+	movdqu	%xmm0, (%edx)
+	movdqu	%xmm1, 16(%edx)
+	movdqu	%xmm2, 32(%edx)
+	movdqu	%xmm3, 48(%edx)
+	movdqu	%xmm4, -64(%edx, %ecx)
+	movdqu	%xmm5, -48(%edx, %ecx)
+	movdqu	%xmm6, -32(%edx, %ecx)
+	movdqu	%xmm7, -16(%edx, %ecx)
+
+	movdqu	64(%eax), %xmm0
+	movdqu	80(%eax), %xmm1
+	movdqu	96(%eax), %xmm2
+	movdqu	112(%eax), %xmm3
+	movdqu	-128(%eax, %ecx), %xmm4
+	movdqu	-112(%eax, %ecx), %xmm5
+	movdqu	-96(%eax, %ecx), %xmm6
+	movdqu	-80(%eax, %ecx), %xmm7
+	movdqu	%xmm0, 64(%edx)
+	movdqu	%xmm1, 80(%edx)
+	movdqu	%xmm2, 96(%edx)
+	movdqu	%xmm3, 112(%edx)
+	movdqu	%xmm4, -128(%edx, %ecx)
+	movdqu	%xmm5, -112(%edx, %ecx)
+	movdqu	%xmm6, -96(%edx, %ecx)
+	movdqu	%xmm7, -80(%edx, %ecx)
+
+/* Now the main loop with non temporal stores. We align
+	the address of the destination.  */
+	leal	128(%edx), %ebx
+	andl	$-128, %ebx
+
+	addl	%edx, %ecx
+	andl	$-128, %ecx
+
+	subl	%edx, %eax
+
+	.p2align 4
+L(main_loop_large_page):
+	movdqu	(%ebx, %eax), %xmm0
+	movdqu	16(%ebx, %eax), %xmm1
+	movdqu	32(%ebx, %eax), %xmm2
+	movdqu	48(%ebx, %eax), %xmm3
+	movdqu	64(%ebx, %eax), %xmm4
+	movdqu	80(%ebx, %eax), %xmm5
+	movdqu	96(%ebx, %eax), %xmm6
+	movdqu	112(%ebx, %eax), %xmm7
+	movntdq	%xmm0, (%ebx)
+	movntdq	%xmm1, 16(%ebx)
+	movntdq	%xmm2, 32(%ebx)
+	movntdq	%xmm3, 48(%ebx)
+	movntdq	%xmm4, 64(%ebx)
+	movntdq	%xmm5, 80(%ebx)
+	movntdq	%xmm6, 96(%ebx)
+	movntdq	%xmm7, 112(%ebx)
+	lea	128(%ebx), %ebx
+	cmpl	%ebx, %ecx
+	jne	L(main_loop_large_page)
+	sfence
+	jmp	L(return)
+
+L(len_0_16_bytes):
+	testb	$24, %cl
+	jne	L(len_9_16_bytes)
+	testb	$4, %cl
+	.p2align 4,,5
+	jne	L(len_5_8_bytes)
+	testl	%ecx, %ecx
+	.p2align 4,,2
+	je	L(return)
+	movzbl	(%eax), %ebx
+	testb	$2, %cl
+	movb	%bl, (%edx)
+	je	L(return)
+	movzwl	-2(%eax,%ecx), %ebx
+	movw	%bx, -2(%edx,%ecx)
+	jmp	L(return)
+
+L(len_9_16_bytes):
+	movq	(%eax), %xmm0
+	movq	-8(%eax, %ecx), %xmm1
+	movq	%xmm0, (%edx)
+	movq	%xmm1, -8(%edx, %ecx)
+	jmp	L(return)
+
+L(len_5_8_bytes):
+	movl	(%eax), %ebx
+	movl	%ebx, (%edx)
+	movl	-4(%eax,%ecx), %ebx
+	movl	%ebx, -4(%edx,%ecx)
+
+L(return):
+	movl	%edx, %eax
+# ifdef USE_AS_MEMPCPY
+	movl	LEN(%esp), %ecx
+	add	%ecx, %eax
+# endif
+	RETURN
+
+END (MEMCPY)
+
+#endif
diff --git a/sysdeps/i386/i686/multiarch/memcpy.S b/sysdeps/i386/i686/multiarch/memcpy.S
index f583482..766d7a4 100644
--- a/sysdeps/i386/i686/multiarch/memcpy.S
+++ b/sysdeps/i386/i686/multiarch/memcpy.S
@@ -35,7 +35,10 @@ ENTRY(memcpy)
 	cmpl	$0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx)
 	jne	1f
 	call	__init_cpu_features
-1:	leal	__memcpy_ia32@GOTOFF(%ebx), %eax
+1:	leal	__memcpy_sse2_unaligned@GOTOFF(%ebx), %eax
+	testl	$bit_Fast_Unaligned_Load, CPUID_OFFSET+index_Fast_Unaligned_Load+__cpu_features@GOTOFF(%ebx)
+	jnz	2f
+	leal	__memcpy_ia32@GOTOFF(%ebx), %eax
 	testl	$bit_SSSE3, CPUID_OFFSET+index_SSSE3+__cpu_features@GOTOFF(%ebx)
 	jz	2f
 	leal	__memcpy_ssse3@GOTOFF(%ebx), %eax
diff --git a/sysdeps/i386/i686/multiarch/memcpy_chk.S b/sysdeps/i386/i686/multiarch/memcpy_chk.S
index bf93b68..277f805 100644
--- a/sysdeps/i386/i686/multiarch/memcpy_chk.S
+++ b/sysdeps/i386/i686/multiarch/memcpy_chk.S
@@ -36,7 +36,10 @@ ENTRY(__memcpy_chk)
 	cmpl	$0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx)
 	jne	1f
 	call	__init_cpu_features
-1:	leal	__memcpy_chk_ia32@GOTOFF(%ebx), %eax
+1:	leal	__memcpy_chk_sse2_unaligned@GOTOFF(%ebx), %eax
+	testl	$bit_Fast_Unaligned_Load, CPUID_OFFSET+index_Fast_Unaligned_Load+__cpu_features@GOTOFF(%ebx)
+	jnz	2f
+	leal	__memcpy_chk_ia32@GOTOFF(%ebx), %eax
 	testl	$bit_SSSE3, CPUID_OFFSET+index_SSSE3+__cpu_features@GOTOFF(%ebx)
 	jz	2f
 	leal	__memcpy_chk_ssse3@GOTOFF(%ebx), %eax
diff --git a/sysdeps/i386/i686/multiarch/memmove-sse2-unaligned.S b/sysdeps/i386/i686/multiarch/memmove-sse2-unaligned.S
new file mode 100644
index 0000000..3873594
--- /dev/null
+++ b/sysdeps/i386/i686/multiarch/memmove-sse2-unaligned.S
@@ -0,0 +1,4 @@
+#define USE_AS_MEMMOVE
+#define MEMCPY		__memmove_sse2_unaligned
+#define MEMCPY_CHK	__memmove_chk_sse2_unaligned
+#include "memcpy-sse2-unaligned.S"
diff --git a/sysdeps/i386/i686/multiarch/memmove.S b/sysdeps/i386/i686/multiarch/memmove.S
index 90cf167..215b1f7 100644
--- a/sysdeps/i386/i686/multiarch/memmove.S
+++ b/sysdeps/i386/i686/multiarch/memmove.S
@@ -34,7 +34,10 @@ ENTRY(memmove)
 	cmpl	$0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx)
 	jne	1f
 	call	__init_cpu_features
-1:	leal	__memmove_ia32@GOTOFF(%ebx), %eax
+1:	leal	__memmove_sse2_unaligned@GOTOFF(%ebx), %eax
+	testl	$bit_Fast_Unaligned_Load, CPUID_OFFSET+index_Fast_Unaligned_Load+__cpu_features@GOTOFF(%ebx)
+	jnz	2f
+	leal	__memmove_ia32@GOTOFF(%ebx), %eax
 	testl	$bit_SSSE3, CPUID_OFFSET+index_SSSE3+__cpu_features@GOTOFF(%ebx)
 	jz	2f
 	leal	__memmove_ssse3@GOTOFF(%ebx), %eax
@@ -62,7 +65,10 @@ ENTRY(memmove)
 	cmpl	$0, KIND_OFFSET+__cpu_features
 	jne	1f
 	call	__init_cpu_features
-1:	leal	__memmove_ia32, %eax
+1:	leal	__memmove_sse2_unaligned, %eax
+	testl	$bit_Fast_Unaligned_Load, CPUID_OFFSET+index_Fast_Unaligned_Load+__cpu_features
+	jnz	2f
+	leal	__memmove_ia32, %eax
 	testl	$bit_SSSE3, CPUID_OFFSET+index_SSSE3+__cpu_features
 	jz	2f
 	leal	__memmove_ssse3, %eax
diff --git a/sysdeps/i386/i686/multiarch/memmove_chk.S b/sysdeps/i386/i686/multiarch/memmove_chk.S
index 182aeb3..284d706 100644
--- a/sysdeps/i386/i686/multiarch/memmove_chk.S
+++ b/sysdeps/i386/i686/multiarch/memmove_chk.S
@@ -34,7 +34,10 @@ ENTRY(__memmove_chk)
 	cmpl	$0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx)
 	jne	1f
 	call	__init_cpu_features
-1:	leal	__memmove_chk_ia32@GOTOFF(%ebx), %eax
+1:	leal	__memmove_chk_sse2_unaligned@GOTOFF(%ebx), %eax
+	testl	$bit_Fast_Unaligned_Load, CPUID_OFFSET+index_Fast_Unaligned_Load+__cpu_features@GOTOFF(%ebx)
+	jnz	2f
+	leal	__memmove_chk_ia32@GOTOFF(%ebx), %eax
 	testl	$bit_SSSE3, CPUID_OFFSET+index_SSSE3+__cpu_features@GOTOFF(%ebx)
 	jz	2f
 	leal	__memmove_chk_ssse3@GOTOFF(%ebx), %eax
@@ -53,7 +56,10 @@ ENTRY(__memmove_chk)
 	cmpl	$0, KIND_OFFSET+__cpu_features
 	jne	1f
 	call	__init_cpu_features
-1:	leal	__memmove_chk_ia32, %eax
+1:	leal	__memmove_chk_sse2_unaligned, %eax
+	testl	$bit_Fast_Unaligned_Load, CPUID_OFFSET+index_Fast_Unaligned_Load+__cpu_features
+	jnz	2f
+	leal	__memmove_chk_ia32, %eax
 	testl	$bit_SSSE3, CPUID_OFFSET+index_SSSE3+__cpu_features
 	jz	2f
 	leal	__memmove_chk_ssse3, %eax
@@ -63,6 +69,18 @@ ENTRY(__memmove_chk)
 2:	ret
 END(__memmove_chk)
 
+	.type __memmove_chk_sse2_unaligned, @function
+	.p2align 4;
+__memmove_chk_sse2_unaligned:
+	cfi_startproc
+	CALL_MCOUNT
+	movl	12(%esp), %eax
+	cmpl	%eax, 16(%esp)
+	jb	__chk_fail
+	jmp	__memmove_sse2_unaligned
+	cfi_endproc
+	.size __memmove_chk_sse2_unaligned, .-__memmove_chk_sse2_unaligned
+
 	.type __memmove_chk_ssse3, @function
 	.p2align 4;
 __memmove_chk_ssse3:
diff --git a/sysdeps/i386/i686/multiarch/mempcpy-sse2-unaligned.S b/sysdeps/i386/i686/multiarch/mempcpy-sse2-unaligned.S
new file mode 100644
index 0000000..a1cea50
--- /dev/null
+++ b/sysdeps/i386/i686/multiarch/mempcpy-sse2-unaligned.S
@@ -0,0 +1,4 @@
+#define USE_AS_MEMPCPY
+#define MEMCPY		__mempcpy_sse2_unaligned
+#define MEMCPY_CHK	__mempcpy_chk_sse2_unaligned
+#include "memcpy-sse2-unaligned.S"
diff --git a/sysdeps/i386/i686/multiarch/mempcpy.S b/sysdeps/i386/i686/multiarch/mempcpy.S
index 56b50bb..c1e5156 100644
--- a/sysdeps/i386/i686/multiarch/mempcpy.S
+++ b/sysdeps/i386/i686/multiarch/mempcpy.S
@@ -35,7 +35,10 @@ ENTRY(__mempcpy)
 	cmpl	$0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx)
 	jne	1f
 	call	__init_cpu_features
-1:	leal	__mempcpy_ia32@GOTOFF(%ebx), %eax
+1:	leal	__mempcpy_sse2_unaligned@GOTOFF(%ebx), %eax
+	testl	$bit_Fast_Unaligned_Load, CPUID_OFFSET+index_Fast_Unaligned_Load+__cpu_features@GOTOFF(%ebx)
+	jnz	2f
+	leal	__mempcpy_ia32@GOTOFF(%ebx), %eax
 	testl	$bit_SSSE3, CPUID_OFFSET+index_SSSE3+__cpu_features@GOTOFF(%ebx)
 	jz	2f
 	leal	__mempcpy_ssse3@GOTOFF(%ebx), %eax
diff --git a/sysdeps/i386/i686/multiarch/mempcpy_chk.S b/sysdeps/i386/i686/multiarch/mempcpy_chk.S
index a76341c..f1436ea 100644
--- a/sysdeps/i386/i686/multiarch/mempcpy_chk.S
+++ b/sysdeps/i386/i686/multiarch/mempcpy_chk.S
@@ -36,7 +36,10 @@ ENTRY(__mempcpy_chk)
 	cmpl	$0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx)
 	jne	1f
 	call	__init_cpu_features
-1:	leal	__mempcpy_chk_ia32@GOTOFF(%ebx), %eax
+1:	leal	__mempcpy_chk_sse2_unaligned@GOTOFF(%ebx), %eax
+	testl	$bit_Fast_Unaligned_Load, CPUID_OFFSET+index_Fast_Unaligned_Load+__cpu_features@GOTOFF(%ebx)
+	jnz	2f
+	leal	__mempcpy_chk_ia32@GOTOFF(%ebx), %eax
 	testl	$bit_SSSE3, CPUID_OFFSET+index_SSSE3+__cpu_features@GOTOFF(%ebx)
 	jz	2f
 	leal	__mempcpy_chk_ssse3@GOTOFF(%ebx), %eax