[PATCH] Enable AVX2 optimized memset only if -mavx2 works
Marko Myllynen
myllynen@redhat.com
Tue Jul 1 16:11:00 GMT 2014
Hi,
On 2014-07-01 18:56, H.J. Lu wrote:
> On Tue, Jul 1, 2014 at 2:03 AM, Marko Myllynen <myllynen@redhat.com> wrote:
>> On 2014-04-04 10:34, ling.ma.program@gmail.com wrote:
>>> From: Ling Ma <ling.ml@alibaba-inc.com>
>>>
>>> In this patch we reduce branch mispredictions by avoiding branch
>>> instructions and by forcing the destination to be aligned with AVX
>>> instructions (a short C sketch of the idea follows the quoted code below).
>>>
>>> ---
>>> In this version we removed the prefetch instructions and appended vmovd.
>>>
>>> ChangeLog | 9 ++
>>> sysdeps/x86_64/multiarch/Makefile | 4 +-
>>> sysdeps/x86_64/multiarch/memset-avx2.S | 192 +++++++++++++++++++++++++++++++++
>>> sysdeps/x86_64/multiarch/memset.S | 59 ++++++++++
>>> sysdeps/x86_64/multiarch/memset_chk.S | 44 ++++++++
>>> 5 files changed, 307 insertions(+), 1 deletion(-)
>>> create mode 100644 sysdeps/x86_64/multiarch/memset-avx2.S
>>> create mode 100644 sysdeps/x86_64/multiarch/memset.S
>>> create mode 100644 sysdeps/x86_64/multiarch/memset_chk.S
>>>
>>> diff --git a/sysdeps/x86_64/multiarch/memset-avx2.S b/sysdeps/x86_64/multiarch/memset-avx2.S
>>> new file mode 100644
>>> index 0000000..08e8ee8
>>> --- /dev/null
>>> +++ b/sysdeps/x86_64/multiarch/memset-avx2.S
>>> @@ -0,0 +1,192 @@
>>> +/* memset with AVX2
>>> + Copyright (C) 2014 Free Software Foundation, Inc.
>>> + Contributed by Alibaba Group.
>>> + This file is part of the GNU C Library.
>>> +
>>> + The GNU C Library is free software; you can redistribute it and/or
>>> + modify it under the terms of the GNU Lesser General Public
>>> + License as published by the Free Software Foundation; either
>>> + version 2.1 of the License, or (at your option) any later version.
>>> +
>>> + The GNU C Library is distributed in the hope that it will be useful,
>>> + but WITHOUT ANY WARRANTY; without even the implied warranty of
>>> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
>>> + Lesser General Public License for more details.
>>> +
>>> + You should have received a copy of the GNU Lesser General Public
>>> + License along with the GNU C Library; if not, see
>>> + <http://www.gnu.org/licenses/>. */
>>> +
>>> +#include <sysdep.h>
>>> +
>>> +#if !defined NOT_IN_libc
>>> +
>>> +#include "asm-syntax.h"
>>> +#ifndef ALIGN
>>> +# define ALIGN(n) .p2align n
>>> +#endif
>>> +#ifndef MEMSET
>>> +# define MEMSET __memset_avx2
>>> +# define MEMSET_CHK __memset_chk_avx2
>>> +#endif
>>> +
>>> + .section .text.avx2,"ax",@progbits
>>> +#if defined PIC
>>> +ENTRY (MEMSET_CHK)
>>> + cmpq %rdx, %rcx
>>> + jb HIDDEN_JUMPTARGET (__chk_fail)
>>> +END (MEMSET_CHK)
>>> +#endif
>>> +
>>> +ENTRY (MEMSET)
>>> + vpxor %xmm0, %xmm0, %xmm0
>>> + vmovd %esi, %xmm1
>>> + lea (%rdi, %rdx), %r8
>>> + vpshufb %xmm0, %xmm1, %xmm0
>>> + mov %rdi, %rax
>>> + cmp $256, %rdx
>>> + jae L(256bytesormore)
>>> + vmovd %xmm0, %rcx
>>> + cmp $128, %rdx
>>> + jb L(less_128bytes)
>>> + vmovups %xmm0, (%rdi)
>>> + vmovups %xmm0, 0x10(%rdi)
>>> + vmovups %xmm0, 0x20(%rdi)
>>> + vmovups %xmm0, 0x30(%rdi)
>>> + vmovups %xmm0, 0x40(%rdi)
>>> + vmovups %xmm0, 0x50(%rdi)
>>> + vmovups %xmm0, 0x60(%rdi)
>>> + vmovups %xmm0, 0x70(%rdi)
>>> + vmovups %xmm0, -0x80(%r8)
>>> + vmovups %xmm0, -0x70(%r8)
>>> + vmovups %xmm0, -0x60(%r8)
>>> + vmovups %xmm0, -0x50(%r8)
>>> + vmovups %xmm0, -0x40(%r8)
>>> + vmovups %xmm0, -0x30(%r8)
>>> + vmovups %xmm0, -0x20(%r8)
>>> + vmovups %xmm0, -0x10(%r8)
>>> + ret
>>> + ALIGN(4)
>>> +L(less_128bytes):
>>> + cmp $64, %edx
>>> + jb L(less_64bytes)
>>> + vmovups %xmm0, (%rdi)
>>> + vmovups %xmm0, 0x10(%rdi)
>>> + vmovups %xmm0, 0x20(%rdi)
>>> + vmovups %xmm0, 0x30(%rdi)
>>> + vmovups %xmm0, -0x40(%r8)
>>> + vmovups %xmm0, -0x30(%r8)
>>> + vmovups %xmm0, -0x20(%r8)
>>> + vmovups %xmm0, -0x10(%r8)
>>> + ret
>>> + ALIGN(4)
>>> +L(less_64bytes):
>>> + cmp $32, %edx
>>> + jb L(less_32bytes)
>>> + vmovups %xmm0, (%rdi)
>>> + vmovups %xmm0, 0x10(%rdi)
>>> + vmovups %xmm0, -0x20(%r8)
>>> + vmovups %xmm0, -0x10(%r8)
>>> + ret
>>> + ALIGN(4)
>>> +L(less_32bytes):
>>> + cmp $16, %edx
>>> + jb L(less_16bytes)
>>> + vmovups %xmm0, (%rdi)
>>> + vmovups %xmm0, -0x10(%r8)
>>> + ret
>>> + ALIGN(4)
>>> +L(less_16bytes):
>>> + cmp $8, %edx
>>> + jb L(less_8bytes)
>>> + mov %rcx, (%rdi)
>>> + mov %rcx, -0x08(%r8)
>>> + ret
>>> + ALIGN(4)
>>> +L(less_8bytes):
>>> + cmp $4, %edx
>>> + jb L(less_4bytes)
>>> + mov %ecx, (%rdi)
>>> +	mov	%ecx, -0x04(%r8)
>>> +	ret
>>> +	ALIGN(4)
>>> +L(less_4bytes):
>>> + cmp $2, %edx
>>> + jb L(less_2bytes)
>>> + mov %cx, (%rdi)
>>> + mov %cx, -0x02(%r8)
>>> + ret
>>> + ALIGN(4)
>>> +L(less_2bytes):
>>> + cmp $1, %edx
>>> + jb L(less_1bytes)
>>> + mov %cl, (%rdi)
>>> +L(less_1bytes):
>>> + ret
>>> +
>>> + ALIGN(4)
>>> +L(256bytesormore):
>>> + vinserti128 $1, %xmm0, %ymm0, %ymm0
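
The small-size path quoted above classifies the length once and then
covers the buffer with a pair of possibly overlapping stores anchored at
the start and at the end, instead of branching per remaining byte.
Roughly, in C (an illustrative sketch only; the name memset_small and the
n < 16 cutoff are not part of the patch):

/* Small-size memset without a byte loop: one size-class test, then two
   possibly overlapping stores, one from each end of the buffer.  */
#include <stdint.h>
#include <string.h>

static void
memset_small (unsigned char *dst, int c, size_t n)   /* assumes n < 16 */
{
  uint64_t v = 0x0101010101010101ULL * (unsigned char) c;

  if (n >= 8)
    {
      memcpy (dst, &v, 8);           /* 8-byte store at the start */
      memcpy (dst + n - 8, &v, 8);   /* 8-byte store at the end, may overlap */
    }
  else if (n >= 4)
    {
      uint32_t w = (uint32_t) v;
      memcpy (dst, &w, 4);
      memcpy (dst + n - 4, &w, 4);
    }
  else if (n >= 2)
    {
      uint16_t h = (uint16_t) v;
      memcpy (dst, &h, 2);
      memcpy (dst + n - 2, &h, 2);
    }
  else if (n == 1)
    *dst = (unsigned char) c;
}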
>>
>> this breaks build on RHEL 6 x86_64:
>>
>> ../sysdeps/x86_64/multiarch/memset-avx2.S: Assembler messages:
>> ../sysdeps/x86_64/multiarch/memset-avx2.S:132: Error: no such
>> instruction: `vinserti128 $1,%xmm0,%ymm0,%ymm0'
>>
>> Cheers,
>>
>
> This patch enables the AVX2-optimized memset only if -mavx2 works. Tested
> with GCC 4.6 and 4.8 on Fedora 20/x86-64. OK to install?
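
For reference, the gist of such a change is a configure-time probe: try
building a trivial AVX2 program with -mavx2 and only compile and select
the AVX2 memset when that succeeds. A minimal sketch of that kind of
probe (the file name and the HAVE_AVX2_SUPPORT guard mentioned after it
are assumptions for illustration, not quotes from the committed patch):

/* avx2-probe.c: the sort of file a configure check could compile with
   "$CC -mavx2 -c avx2-probe.c"; if the compiler or assembler rejects
   it, as the RHEL 6 toolchain above does, the AVX2 routines are
   skipped.  */
#include <immintrin.h>

__m256i
probe (__m256i a, __m256i b)
{
  return _mm256_add_epi8 (a, b);   /* 256-bit integer add requires AVX2 */
}

The AVX2 assembly sources and the ifunc selection can then be wrapped in
a preprocessor guard along the lines of #if HAVE_AVX2_SUPPORT ... #endif,
so builds with an older GCC or binutils keep using the existing SSE2
memset.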
Thanks, this fixed the issue also on RHEL 6 x86_64.
Cheers,
--
Marko Myllynen