x86_64: Optimize large size copy in memmove-ssse3

author MayShao-oc <MayShao-oc@zhaoxin.com>

Sat, 29 Jun 2024 03:58:27 +0000 (11:58 +0800)

committer H.J. Lu <hjl.tools@gmail.com>

Sun, 30 Jun 2024 13:26:43 +0000 (06:26 -0700)
author MayShao-oc <MayShao-oc@zhaoxin.com>
Sat, 29 Jun 2024 03:58:27 +0000 (11:58 +0800)
committer H.J. Lu <hjl.tools@gmail.com>
Sun, 30 Jun 2024 13:26:43 +0000 (06:26 -0700)
diff --git a/sysdeps/x86_64/multiarch/memmove-ssse3.S b/sysdeps/x86_64/multiarch/memmove-ssse3.S

index 048d015712f8ee0bb2b9fdd5eb57fb98c17d3279..01008fd981b7b6f12945ea4224261c144dc5e5ce 100644 (file)
--- a/sysdeps/x86_64/multiarch/memmove-ssse3.S
+++ b/sysdeps/x86_64/multiarch/memmove-ssse3.S
@@ -151,13 +151,10 @@ L(more_2x_vec):
            loop.  */
         movups  %xmm0, (%rdi)
  
-# ifdef SHARED_CACHE_SIZE_HALF
-       cmp     $SHARED_CACHE_SIZE_HALF, %RDX_LP
-# else
-       cmp     __x86_shared_cache_size_half(%rip), %rdx
-# endif
+       cmp     __x86_shared_non_temporal_threshold(%rip), %rdx
         ja      L(large_memcpy)
  
+L(loop_fwd):
         leaq    -64(%rdi, %rdx), %r8
         andq    $-16, %rdi
         movl    $48, %edx
@@ -199,6 +196,13 @@ L(large_memcpy):
         movups  -64(%r9, %rdx), %xmm10
         movups  -80(%r9, %rdx), %xmm11
  
+       /* Check if src and dst overlap.  If they do, use cacheable
+          writes to potentially gain positive interference between
+          the loads during the memmove.  */
+       subq    %rdi, %r9
+       cmpq    %rdx, %r9
+       jb      L(loop_fwd)
+
         sall    $5, %ecx
         leal    (%rcx, %rcx, 2), %r8d
         leaq    -96(%rdi, %rdx), %rcx
author	MayShao-oc <MayShao-oc@zhaoxin.com>
	Sat, 29 Jun 2024 03:58:27 +0000 (11:58 +0800)
committer	H.J. Lu <hjl.tools@gmail.com>
	Sun, 30 Jun 2024 13:26:43 +0000 (06:26 -0700)