[PATCH] Enable AVX2 optimized memset only if -mavx2 works

Marko Myllynen myllynen@redhat.com
Tue Jul 1 16:11:00 GMT 2014


Hi,

On 2014-07-01 18:56, H.J. Lu wrote:
> On Tue, Jul 1, 2014 at 2:03 AM, Marko Myllynen <myllynen@redhat.com> wrote:
>> On 2014-04-04 10:34, ling.ma.program@gmail.com wrote:
>>> From: Ling Ma <ling.ml@alibaba-inc.com>
>>>
>>> In this patch we manage to reduce branch mispredictions by
>>> avoiding branch instructions and forcing the destination to be aligned
>>> using AVX instructions.
>>>
>>> ---
>>>  In this version we removed prefetch and appended vmovd.
>>>
>>>  ChangeLog                              |   9 ++
>>>  sysdeps/x86_64/multiarch/Makefile      |   4 +-
>>>  sysdeps/x86_64/multiarch/memset-avx2.S | 192 +++++++++++++++++++++++++++++++++
>>>  sysdeps/x86_64/multiarch/memset.S      |  59 ++++++++++
>>>  sysdeps/x86_64/multiarch/memset_chk.S  |  44 ++++++++
>>>  5 files changed, 307 insertions(+), 1 deletion(-)
>>>  create mode 100644 sysdeps/x86_64/multiarch/memset-avx2.S
>>>  create mode 100644 sysdeps/x86_64/multiarch/memset.S
>>>  create mode 100644 sysdeps/x86_64/multiarch/memset_chk.S
>>>
>>> diff --git a/sysdeps/x86_64/multiarch/memset-avx2.S b/sysdeps/x86_64/multiarch/memset-avx2.S
>>> new file mode 100644
>>> index 0000000..08e8ee8
>>> --- /dev/null
>>> +++ b/sysdeps/x86_64/multiarch/memset-avx2.S
>>> @@ -0,0 +1,192 @@
>>> +/* memset with AVX2
>>> +   Copyright (C) 2014 Free Software Foundation, Inc.
>>> +   Contributed by Alibaba Group.
>>> +   This file is part of the GNU C Library.
>>> +
>>> +   The GNU C Library is free software; you can redistribute it and/or
>>> +   modify it under the terms of the GNU Lesser General Public
>>> +   License as published by the Free Software Foundation; either
>>> +   version 2.1 of the License, or (at your option) any later version.
>>> +
>>> +   The GNU C Library is distributed in the hope that it will be useful,
>>> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
>>> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
>>> +   Lesser General Public License for more details.
>>> +
>>> +   You should have received a copy of the GNU Lesser General Public
>>> +   License along with the GNU C Library; if not, see
>>> +   <http://www.gnu.org/licenses/>.  */
>>> +
>>> +#include <sysdep.h>
>>> +
>>> +#if !defined NOT_IN_libc
>>> +
>>> +#include "asm-syntax.h"
>>> +#ifndef ALIGN
>>> +# define ALIGN(n)    .p2align n
>>> +#endif
>>> +#ifndef MEMSET
>>> +# define MEMSET      __memset_avx2
>>> +# define MEMSET_CHK  __memset_chk_avx2
>>> +#endif
>>> +
>>> +     .section .text.avx2,"ax",@progbits
>>> +#if defined PIC
>>> +ENTRY (MEMSET_CHK)
>>> +     cmpq    %rdx, %rcx
>>> +     jb      HIDDEN_JUMPTARGET (__chk_fail)
>>> +END (MEMSET_CHK)
>>> +#endif
>>> +
>>> +ENTRY (MEMSET)
>>> +     vpxor   %xmm0, %xmm0, %xmm0
>>> +     vmovd %esi, %xmm1
>>> +     lea     (%rdi, %rdx), %r8
>>> +     vpshufb %xmm0, %xmm1, %xmm0
>>> +     mov     %rdi, %rax
>>> +     cmp     $256, %rdx
>>> +     jae     L(256bytesormore)
>>> +     vmovd %xmm0, %rcx
>>> +     cmp     $128, %rdx
>>> +     jb      L(less_128bytes)
>>> +     vmovups %xmm0, (%rdi)
>>> +     vmovups %xmm0, 0x10(%rdi)
>>> +     vmovups %xmm0, 0x20(%rdi)
>>> +     vmovups %xmm0, 0x30(%rdi)
>>> +     vmovups %xmm0, 0x40(%rdi)
>>> +     vmovups %xmm0, 0x50(%rdi)
>>> +     vmovups %xmm0, 0x60(%rdi)
>>> +     vmovups %xmm0, 0x70(%rdi)
>>> +     vmovups %xmm0, -0x80(%r8)
>>> +     vmovups %xmm0, -0x70(%r8)
>>> +     vmovups %xmm0, -0x60(%r8)
>>> +     vmovups %xmm0, -0x50(%r8)
>>> +     vmovups %xmm0, -0x40(%r8)
>>> +     vmovups %xmm0, -0x30(%r8)
>>> +     vmovups %xmm0, -0x20(%r8)
>>> +     vmovups %xmm0, -0x10(%r8)
>>> +     ret
>>> +     ALIGN(4)
>>> +L(less_128bytes):
>>> +     cmp     $64, %edx
>>> +     jb      L(less_64bytes)
>>> +     vmovups %xmm0, (%rdi)
>>> +     vmovups %xmm0, 0x10(%rdi)
>>> +     vmovups %xmm0, 0x20(%rdi)
>>> +     vmovups %xmm0, 0x30(%rdi)
>>> +     vmovups %xmm0, -0x40(%r8)
>>> +     vmovups %xmm0, -0x30(%r8)
>>> +     vmovups %xmm0, -0x20(%r8)
>>> +     vmovups %xmm0, -0x10(%r8)
>>> +     ret
>>> +     ALIGN(4)
>>> +L(less_64bytes):
>>> +     cmp     $32, %edx
>>> +     jb      L(less_32bytes)
>>> +     vmovups %xmm0, (%rdi)
>>> +     vmovups %xmm0, 0x10(%rdi)
>>> +     vmovups %xmm0, -0x20(%r8)
>>> +     vmovups %xmm0, -0x10(%r8)
>>> +     ret
>>> +     ALIGN(4)
>>> +L(less_32bytes):
>>> +     cmp     $16, %edx
>>> +     jb      L(less_16bytes)
>>> +     vmovups %xmm0, (%rdi)
>>> +     vmovups %xmm0, -0x10(%r8)
>>> +     ret
>>> +     ALIGN(4)
>>> +L(less_16bytes):
>>> +     cmp     $8, %edx
>>> +     jb      L(less_8bytes)
>>> +     mov %rcx, (%rdi)
>>> +     mov %rcx, -0x08(%r8)
>>> +     ret
>>> +     ALIGN(4)
>>> +L(less_8bytes):
>>> +     cmp     $4, %edx
>>> +     jb      L(less_4bytes)
>>> +     mov %ecx, (%rdi)
>>> +     mov %ecx, -0x04(%r8)
>>> +     ALIGN(4)
>>> +L(less_4bytes):
>>> +     cmp     $2, %edx
>>> +     jb      L(less_2bytes)
>>> +     mov     %cx, (%rdi)
>>> +     mov     %cx, -0x02(%r8)
>>> +     ret
>>> +     ALIGN(4)
>>> +L(less_2bytes):
>>> +     cmp     $1, %edx
>>> +     jb      L(less_1bytes)
>>> +     mov     %cl, (%rdi)
>>> +L(less_1bytes):
>>> +     ret
>>> +
>>> +     ALIGN(4)
>>> +L(256bytesormore):
>>> +     vinserti128 $1, %xmm0, %ymm0, %ymm0
>>
>> this breaks build on RHEL 6 x86_64:
>>
>> ../sysdeps/x86_64/multiarch/memset-avx2.S:
>> ../sysdeps/x86_64/multiarch/memset-avx2.S: Assembler messages:
>> Assembler messages:
>> ../sysdeps/x86_64/multiarch/memset-avx2.S:132:
>> ../sysdeps/x86_64/multiarch/memset-avx2.S:132: Error: Error: no such
>> instruction: `vinserti128 $1,%xmm0,%ymm0,%ymm0'no such instruction:
>> `vinserti128 $1,%xmm0,%ymm0,%ymm0'
>>
>> Cheers,
>>
> 
> This patch enables AVX2 optimized memset only if -mavx2 works.  Tested
> with GCC 4.6 and 4.8 on Fedora 20/x86-64.  OK to install?

thanks, this fixed the issue also on RHEL 6 x86_64.

Cheers,

-- 
Marko Myllynen



More information about the Libc-alpha mailing list