This is the mail archive of the
glibc-cvs@sourceware.org
mailing list for the glibc project.
GNU C Library master sources branch hjl/x86/optimize created. glibc-2.25-301-ge6afc30
- From: hjl at sourceware dot org
- To: glibc-cvs at sourceware dot org
- Date: 10 May 2017 17:22:19 -0000
- Subject: GNU C Library master sources branch hjl/x86/optimize created. glibc-2.25-301-ge6afc30
This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "GNU C Library master sources".
The branch, hjl/x86/optimize has been created
at e6afc30fdf51f7dbe7c6464213acd47ba3d1acd4 (commit)
- Log -----------------------------------------------------------------
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=e6afc30fdf51f7dbe7c6464213acd47ba3d1acd4
commit e6afc30fdf51f7dbe7c6464213acd47ba3d1acd4
Author: H.J. Lu <hjl.tools@gmail.com>
Date: Wed May 10 10:21:08 2017 -0700
x86-64: Restore memcpy-sse2-unaligned.S from glibc 2.19
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index 2a30538..5ed4e74 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -23,7 +23,7 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 \
memmove-avx512-unaligned-erms \
memset-avx2-unaligned-erms \
memset-avx512-unaligned-erms \
- strlen-sse4
+ strlen-sse4 memcpy-sse2-unaligned
CFLAGS-varshift.c += -msse4
CFLAGS-strcspn-c.c += -msse4
CFLAGS-strpbrk-c.c += -msse4
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index 1604678..653716e 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -353,6 +353,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_sse2_unaligned)
IFUNC_IMPL_ADD (array, i, memcpy, 1,
__memcpy_sse2_unaligned_erms)
+ IFUNC_IMPL_ADD (array, i, memcpy, 1,
+ __memcpy_sse2_unaligned_2_19)
IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_erms))
/* Support sysdeps/x86_64/multiarch/mempcpy_chk.S. */
diff --git a/sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S b/sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S
new file mode 100644
index 0000000..1d05c2c
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S
@@ -0,0 +1,171 @@
+/* memcpy with unaliged loads
+ Copyright (C) 2013-2014 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+#include "asm-syntax.h"
+
+
+ENTRY(__memcpy_sse2_unaligned_2_19)
+ movq %rsi, %rax
+ leaq (%rdx,%rdx), %rcx
+ subq %rdi, %rax
+ subq %rdx, %rax
+ cmpq %rcx, %rax
+ jb L(overlapping)
+ cmpq $16, %rdx
+ jbe L(less_16)
+ movdqu (%rsi), %xmm8
+ cmpq $32, %rdx
+ movdqu %xmm8, (%rdi)
+ movdqu -16(%rsi,%rdx), %xmm8
+ movdqu %xmm8, -16(%rdi,%rdx)
+ ja .L31
+L(return):
+ movq %rdi, %rax
+ ret
+ .p2align 4,,10
+ .p2align 4
+.L31:
+ movdqu 16(%rsi), %xmm8
+ cmpq $64, %rdx
+ movdqu %xmm8, 16(%rdi)
+ movdqu -32(%rsi,%rdx), %xmm8
+ movdqu %xmm8, -32(%rdi,%rdx)
+ jbe L(return)
+ movdqu 32(%rsi), %xmm8
+ cmpq $128, %rdx
+ movdqu %xmm8, 32(%rdi)
+ movdqu -48(%rsi,%rdx), %xmm8
+ movdqu %xmm8, -48(%rdi,%rdx)
+ movdqu 48(%rsi), %xmm8
+ movdqu %xmm8, 48(%rdi)
+ movdqu -64(%rsi,%rdx), %xmm8
+ movdqu %xmm8, -64(%rdi,%rdx)
+ jbe L(return)
+ leaq 64(%rdi), %rcx
+ addq %rdi, %rdx
+ andq $-64, %rdx
+ andq $-64, %rcx
+ movq %rcx, %rax
+ subq %rdi, %rax
+ addq %rax, %rsi
+ cmpq %rdx, %rcx
+ je L(return)
+ movq %rsi, %r10
+ subq %rcx, %r10
+ leaq 16(%r10), %r9
+ leaq 32(%r10), %r8
+ leaq 48(%r10), %rax
+ .p2align 4,,10
+ .p2align 4
+L(loop):
+ movdqu (%rcx,%r10), %xmm8
+ movdqa %xmm8, (%rcx)
+ movdqu (%rcx,%r9), %xmm8
+ movdqa %xmm8, 16(%rcx)
+ movdqu (%rcx,%r8), %xmm8
+ movdqa %xmm8, 32(%rcx)
+ movdqu (%rcx,%rax), %xmm8
+ movdqa %xmm8, 48(%rcx)
+ addq $64, %rcx
+ cmpq %rcx, %rdx
+ jne L(loop)
+ jmp L(return)
+L(overlapping):
+ cmpq %rsi, %rdi
+ jae .L3
+ testq %rdx, %rdx
+ .p2align 4,,5
+ je L(return)
+ movq %rdx, %r9
+ leaq 16(%rsi), %rcx
+ leaq 16(%rdi), %r8
+ shrq $4, %r9
+ movq %r9, %rax
+ salq $4, %rax
+ cmpq %rcx, %rdi
+ setae %cl
+ cmpq %r8, %rsi
+ setae %r8b
+ orl %r8d, %ecx
+ cmpq $15, %rdx
+ seta %r8b
+ testb %r8b, %cl
+ je .L16
+ testq %rax, %rax
+ je .L16
+ xorl %ecx, %ecx
+ xorl %r8d, %r8d
+.L7:
+ movdqu (%rsi,%rcx), %xmm8
+ addq $1, %r8
+ movdqu %xmm8, (%rdi,%rcx)
+ addq $16, %rcx
+ cmpq %r8, %r9
+ ja .L7
+ cmpq %rax, %rdx
+ je L(return)
+.L21:
+ movzbl (%rsi,%rax), %ecx
+ movb %cl, (%rdi,%rax)
+ addq $1, %rax
+ cmpq %rax, %rdx
+ ja .L21
+ jmp L(return)
+L(less_16):
+ testb $24, %dl
+ jne L(between_9_16)
+ testb $4, %dl
+ .p2align 4,,5
+ jne L(between_5_8)
+ testq %rdx, %rdx
+ .p2align 4,,2
+ je L(return)
+ movzbl (%rsi), %eax
+ testb $2, %dl
+ movb %al, (%rdi)
+ je L(return)
+ movzwl -2(%rsi,%rdx), %eax
+ movw %ax, -2(%rdi,%rdx)
+ jmp L(return)
+.L3:
+ leaq -1(%rdx), %rax
+ .p2align 4,,10
+ .p2align 4
+.L11:
+ movzbl (%rsi,%rax), %edx
+ movb %dl, (%rdi,%rax)
+ subq $1, %rax
+ jmp .L11
+L(between_9_16):
+ movq (%rsi), %rax
+ movq %rax, (%rdi)
+ movq -8(%rsi,%rdx), %rax
+ movq %rax, -8(%rdi,%rdx)
+ jmp L(return)
+.L16:
+ xorl %eax, %eax
+ jmp .L21
+L(between_5_8):
+ movl (%rsi), %eax
+ movl %eax, (%rdi)
+ movl -4(%rsi,%rdx), %eax
+ movl %eax, -4(%rdi,%rdx)
+ jmp L(return)
+END(__memcpy_sse2_unaligned_2_19)
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=97ea71fbb1250d0f3b7b098d8ffc069fcbf0a56a
commit 97ea71fbb1250d0f3b7b098d8ffc069fcbf0a56a
Author: H.J. Lu <hjl.tools@gmail.com>
Date: Mon May 1 08:32:22 2017 -0700
x86-64: Restore the old SSE4 strlen
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index 3736f54..2a30538 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -22,7 +22,8 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 \
memmove-avx-unaligned-erms \
memmove-avx512-unaligned-erms \
memset-avx2-unaligned-erms \
- memset-avx512-unaligned-erms
+ memset-avx512-unaligned-erms \
+ strlen-sse4
CFLAGS-varshift.c += -msse4
CFLAGS-strcspn-c.c += -msse4
CFLAGS-strpbrk-c.c += -msse4
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index 06d9a9d..1604678 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -410,6 +410,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
__mempcpy_sse2_unaligned_erms)
IFUNC_IMPL_ADD (array, i, mempcpy, 1, __mempcpy_erms))
+ /* Support sysdeps/x86_64/multiarch/strlen.S. */
+ IFUNC_IMPL (i, name, strlen,
+ IFUNC_IMPL_ADD (array, i, strlen, HAS_CPU_FEATURE (SSE4_2),
+ __strlen_sse42)
+ IFUNC_IMPL_ADD (array, i, strlen, 1, strlen))
+
/* Support sysdeps/x86_64/multiarch/strncmp.S. */
IFUNC_IMPL (i, name, strncmp,
IFUNC_IMPL_ADD (array, i, strncmp, HAS_CPU_FEATURE (SSE4_2),
diff --git a/sysdeps/x86_64/multiarch/strlen-sse4.S b/sysdeps/x86_64/multiarch/strlen-sse4.S
new file mode 100644
index 0000000..8d685df
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strlen-sse4.S
@@ -0,0 +1,84 @@
+/* strlen with SSE4
+ Copyright (C) 2009-2013 Free Software Foundation, Inc.
+ Contributed by Ulrich Drepper <drepper@redhat.com>.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#if defined SHARED && !defined NOT_IN_libc
+
+#include <sysdep.h>
+
+ .section .text.sse4.2,"ax",@progbits
+ENTRY (__strlen_sse42)
+ pxor %xmm1, %xmm1
+ movl %edi, %ecx
+ movq %rdi, %r8
+ andq $~15, %rdi
+ xor %edi, %ecx
+ pcmpeqb (%rdi), %xmm1
+ pmovmskb %xmm1, %edx
+ shrl %cl, %edx
+ shll %cl, %edx
+ andl %edx, %edx
+ jnz L(less16bytes)
+ pxor %xmm1, %xmm1
+
+ .p2align 4
+L(more64bytes_loop):
+ pcmpistri $0x08, 16(%rdi), %xmm1
+ jz L(more32bytes)
+
+ pcmpistri $0x08, 32(%rdi), %xmm1
+ jz L(more48bytes)
+
+ pcmpistri $0x08, 48(%rdi), %xmm1
+ jz L(more64bytes)
+
+ add $64, %rdi
+ pcmpistri $0x08, (%rdi), %xmm1
+ jnz L(more64bytes_loop)
+ leaq (%rdi,%rcx), %rax
+ subq %r8, %rax
+ ret
+
+ .p2align 4
+L(more32bytes):
+ leaq 16(%rdi,%rcx, 1), %rax
+ subq %r8, %rax
+ ret
+
+ .p2align 4
+L(more48bytes):
+ leaq 32(%rdi,%rcx, 1), %rax
+ subq %r8, %rax
+ ret
+
+ .p2align 4
+L(more64bytes):
+ leaq 48(%rdi,%rcx, 1), %rax
+ subq %r8, %rax
+ ret
+
+ .p2align 4
+L(less16bytes):
+ subq %r8, %rdi
+ bsfl %edx, %eax
+ addq %rdi, %rax
+ ret
+
+END (__strlen_sse42)
+
+#endif
-----------------------------------------------------------------------
hooks/post-receive
--
GNU C Library master sources