sourceware.org Git - glibc.git/commitdiff
Fixed several libmvec bugs found during testing on KNL hardware.
author Andrew Senkevich <andrew.senkevich@intel.com>
Fri, 24 Jul 2015 11:47:23 +0000 (14:47 +0300)
committer Andrew Senkevich <andrew.senkevich@intel.com>
Fri, 24 Jul 2015 11:47:23 +0000 (14:47 +0300)
AVX512 IFUNC implementations, implementations of wrappers to
AVX2 versions and KNL expf implementation fixed.

    * sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core.S: Fixed AVX512 IFUNC.
    * sysdeps/x86_64/fpu/multiarch/svml_d_exp8_core.S: Likewise.
    * sysdeps/x86_64/fpu/multiarch/svml_d_log8_core.S: Likewise.
    * sysdeps/x86_64/fpu/multiarch/svml_d_pow8_core.S: Likewise.
    * sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core.S: Likewise.
    * sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core.S: Likewise.
    * sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core.S: Likewise.
    * sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core.S: Likewise.
    * sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core.S: Likewise.
    * sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core.S: Likewise.
    * sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core.S: Likewise.
    * sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core.S: Likewise.
    * sysdeps/x86_64/fpu/svml_d_wrapper_impl.h: Fixed wrappers to AVX2.
    * sysdeps/x86_64/fpu/svml_s_wrapper_impl.h: Likewise.
    * sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S: Fixed KNL
    implementation.

16 files changed:
ChangeLog
sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core.S
sysdeps/x86_64/fpu/multiarch/svml_d_exp8_core.S
sysdeps/x86_64/fpu/multiarch/svml_d_log8_core.S
sysdeps/x86_64/fpu/multiarch/svml_d_pow8_core.S
sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core.S
sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core.S
sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core.S
sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core.S
sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S
sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core.S
sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core.S
sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core.S
sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core.S
sysdeps/x86_64/fpu/svml_d_wrapper_impl.h
sysdeps/x86_64/fpu/svml_s_wrapper_impl.h

index 6f6016db438e9cd5187b6487c5c0fc3b8693d2cd..3e22413e8ab15f94323ecb03454b181e79fd9d0d 100644 (file)
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,22 @@
+2015-07-24  Andrew Senkevich  <andrew.senkevich@intel.com>
+
+       * sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core.S: Fixed AVX512 IFUNC.
+       * sysdeps/x86_64/fpu/multiarch/svml_d_exp8_core.S: Likewise.
+       * sysdeps/x86_64/fpu/multiarch/svml_d_log8_core.S: Likewise.
+       * sysdeps/x86_64/fpu/multiarch/svml_d_pow8_core.S: Likewise.
+       * sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core.S: Likewise.
+       * sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core.S: Likewise.
+       * sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core.S: Likewise.
+       * sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core.S: Likewise.
+       * sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core.S: Likewise.
+       * sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core.S: Likewise.
+       * sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core.S: Likewise.
+       * sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core.S: Likewise.
+       * sysdeps/x86_64/fpu/svml_d_wrapper_impl.h: Fixed wrappers to AVX2.
+       * sysdeps/x86_64/fpu/svml_s_wrapper_impl.h: Likewise.
+       * sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S: Fixed KNL
+       implementation.
+
 2015-07-24  Szabolcs Nagy  <szabolcs.nagy@arm.com>
 
        [BZ #17711]
index ba3b66f69fbc09914b994cf4286973e117b9e3df..d0f4f27f46a78c3591a05dadd0a9a65433a7ab9d 100644 (file)
 ENTRY (_ZGVeN8v_cos)
         .type   _ZGVeN8v_cos, @gnu_indirect_function
         cmpl    $0, KIND_OFFSET+__cpu_features(%rip)
-        jne     1
+        jne     1f
         call    __init_cpu_features
 1:      leaq    _ZGVeN8v_cos_skx(%rip), %rax
         testl   $bit_AVX512DQ_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512DQ_Usable(%rip)
-        jnz     3
-2:      leaq    _ZGVeN8v_cos_knl(%rip), %rax
+        jnz     2f
+        leaq    _ZGVeN8v_cos_knl(%rip), %rax
         testl   $bit_AVX512F_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512F_Usable(%rip)
-        jnz     3
+        jnz     2f
         leaq    _ZGVeN8v_cos_avx2_wrapper(%rip), %rax
-3:      ret
+2:      ret
 END (_ZGVeN8v_cos)
 
 #define _ZGVeN8v_cos _ZGVeN8v_cos_avx2_wrapper
index 8f837fbfb90c607ff323863ec435f888e0fdee16..7b7c07d92602add121f58eb6802ffc75e31db70e 100644 (file)
 ENTRY (_ZGVeN8v_exp)
         .type   _ZGVeN8v_exp, @gnu_indirect_function
         cmpl    $0, KIND_OFFSET+__cpu_features(%rip)
-        jne     1
+        jne     1f
         call    __init_cpu_features
 1:      leaq    _ZGVeN8v_exp_skx(%rip), %rax
         testl   $bit_AVX512DQ_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512DQ_Usable(%rip)
-        jnz     3
-2:      leaq    _ZGVeN8v_exp_knl(%rip), %rax
+        jnz     2f
+        leaq    _ZGVeN8v_exp_knl(%rip), %rax
         testl   $bit_AVX512F_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512F_Usable(%rip)
-        jnz     3
+        jnz     2f
         leaq    _ZGVeN8v_exp_avx2_wrapper(%rip), %rax
-3:      ret
+2:      ret
 END (_ZGVeN8v_exp)
 
 #define _ZGVeN8v_exp _ZGVeN8v_exp_avx2_wrapper
index 2f9e9d8892e8d4e89b7ddd67527b5177da812c9d..76375fdae0e263ac58c6bdc5892c62be9ddd1b59 100644 (file)
 ENTRY (_ZGVeN8v_log)
         .type   _ZGVeN8v_log, @gnu_indirect_function
         cmpl    $0, KIND_OFFSET+__cpu_features(%rip)
-        jne     1
+        jne     1f
         call    __init_cpu_features
 1:      leaq    _ZGVeN8v_log_skx(%rip), %rax
         testl   $bit_AVX512DQ_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512DQ_Usable(%rip)
-        jnz     3
-2:      leaq    _ZGVeN8v_log_knl(%rip), %rax
+        jnz     2f
+        leaq    _ZGVeN8v_log_knl(%rip), %rax
         testl   $bit_AVX512F_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512F_Usable(%rip)
-        jnz     3
+        jnz     2f
         leaq    _ZGVeN8v_log_avx2_wrapper(%rip), %rax
-3:      ret
+2:      ret
 END (_ZGVeN8v_log)
 
 #define _ZGVeN8v_log _ZGVeN8v_log_avx2_wrapper
index 3b11511e513cfe7f45f5985064bf5ccfd06e68ca..c1e5e76f92a1f4631aef3d63cabc43cb2117f2a8 100644 (file)
 ENTRY (_ZGVeN8vv_pow)
         .type   _ZGVeN8vv_pow, @gnu_indirect_function
         cmpl    $0, KIND_OFFSET+__cpu_features(%rip)
-        jne     1
+        jne     1f
         call    __init_cpu_features
 1:      leaq    _ZGVeN8vv_pow_skx(%rip), %rax
         testl   $bit_AVX512DQ_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512DQ_Usable(%rip)
-        jnz     3
-2:      leaq    _ZGVeN8vv_pow_knl(%rip), %rax
+        jnz     2f
+        leaq    _ZGVeN8vv_pow_knl(%rip), %rax
         testl   $bit_AVX512F_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512F_Usable(%rip)
-        jnz     3
+        jnz     2f
         leaq    _ZGVeN8vv_pow_avx2_wrapper(%rip), %rax
-3:      ret
+2:      ret
 END (_ZGVeN8vv_pow)
 
 #define _ZGVeN8vv_pow _ZGVeN8vv_pow_avx2_wrapper
index ba631020f33b64d484dcc26dd9f9b787e69c4037..131f2f47c5884c49cea5e9fd5c037829471262a1 100644 (file)
 ENTRY (_ZGVeN8v_sin)
         .type   _ZGVeN8v_sin, @gnu_indirect_function
         cmpl    $0, KIND_OFFSET+__cpu_features(%rip)
-        jne     1
+        jne     1f
         call    __init_cpu_features
 1:      leaq    _ZGVeN8v_sin_skx(%rip), %rax
         testl   $bit_AVX512DQ_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512DQ_Usable(%rip)
-        jnz     3
-2:      leaq    _ZGVeN8v_sin_knl(%rip), %rax
+        jnz     2f
+        leaq    _ZGVeN8v_sin_knl(%rip), %rax
         testl   $bit_AVX512F_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512F_Usable(%rip)
-        jnz     3
+        jnz     2f
         leaq    _ZGVeN8v_sin_avx2_wrapper(%rip), %rax
-3:      ret
+2:      ret
 END (_ZGVeN8v_sin)
 
 #define _ZGVeN8v_sin _ZGVeN8v_sin_avx2_wrapper
index 7228ba549a593b98acd76d1c0035266469c8a156..e33109099ec3b5339a7a9b4b18888d152a91878f 100644 (file)
 ENTRY (_ZGVeN8vvv_sincos)
         .type   _ZGVeN8vvv_sincos, @gnu_indirect_function
         cmpl    $0, KIND_OFFSET+__cpu_features(%rip)
-        jne     1
+        jne     1f
         call    __init_cpu_features
 1:      leaq    _ZGVeN8vvv_sincos_skx(%rip), %rax
         testl   $bit_AVX512DQ_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512DQ_Usable(%rip)
-        jnz     3
-2:      leaq    _ZGVeN8vvv_sincos_knl(%rip), %rax
+        jnz     2f
+        leaq    _ZGVeN8vvv_sincos_knl(%rip), %rax
         testl   $bit_AVX512F_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512F_Usable(%rip)
-        jnz     3
+        jnz     2f
         leaq    _ZGVeN8vvv_sincos_avx2_wrapper(%rip), %rax
-3:      ret
+2:      ret
 END (_ZGVeN8vvv_sincos)
 
 #define _ZGVeN8vvv_sincos _ZGVeN8vvv_sincos_avx2_wrapper
index 91564de22aa5014c0aff3b433bed5646dce3adba..0654d3c19b94432e8b44a6e3b69e7b58a1681145 100644 (file)
 ENTRY (_ZGVeN16v_cosf)
         .type   _ZGVeN16v_cosf, @gnu_indirect_function
         cmpl    $0, KIND_OFFSET+__cpu_features(%rip)
-        jne     1
+        jne     1f
         call    __init_cpu_features
 1:      leaq    _ZGVeN16v_cosf_skx(%rip), %rax
         testl   $bit_AVX512DQ_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512DQ_Usable(%rip)
-        jnz     3
-2:      leaq    _ZGVeN16v_cosf_knl(%rip), %rax
+        jnz     2f
+        leaq    _ZGVeN16v_cosf_knl(%rip), %rax
         testl   $bit_AVX512F_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512F_Usable(%rip)
-        jnz     3
+        jnz     2f
         leaq    _ZGVeN16v_cosf_avx2_wrapper(%rip), %rax
-3:      ret
+2:      ret
 END (_ZGVeN16v_cosf)
 
 #define _ZGVeN16v_cosf _ZGVeN16v_cosf_avx2_wrapper
index 3b3489d05ac5ebbb422a596106c0f43ce7aabcd1..62858eb39ec6d7effc6e783bb7b71dab91ebc850 100644 (file)
 ENTRY (_ZGVeN16v_expf)
         .type   _ZGVeN16v_expf, @gnu_indirect_function
         cmpl    $0, KIND_OFFSET+__cpu_features(%rip)
-        jne     1
+        jne     1f
         call    __init_cpu_features
 1:      leaq    _ZGVeN16v_expf_skx(%rip), %rax
         testl   $bit_AVX512DQ_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512DQ_Usable(%rip)
-        jnz     3
-2:      leaq    _ZGVeN16v_expf_knl(%rip), %rax
+        jnz     2f
+        leaq    _ZGVeN16v_expf_knl(%rip), %rax
         testl   $bit_AVX512F_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512F_Usable(%rip)
-        jnz     3
+        jnz     2f
         leaq    _ZGVeN16v_expf_avx2_wrapper(%rip), %rax
-3:      ret
+2:      ret
 END (_ZGVeN16v_expf)
 
 #define _ZGVeN16v_expf _ZGVeN16v_expf_avx2_wrapper
index cb807e075782a49eb7c16ef87c60d948c6f37b5d..ec69055351dba11970f0654e4dcb0ed05e134b01 100644 (file)
@@ -46,6 +46,7 @@ WRAPPER_IMPL_AVX512 _ZGVdN8v_expf
      The table lookup is skipped if k = 0.
      For low accuracy approximation, exp(r) ~ 1 or 1+r.  */
 
+        pushq     %rbp
         cfi_adjust_cfa_offset (8)
         cfi_rel_offset (%rbp, 0)
         movq      %rsp, %rbp
index 8756750c86897e01a4176a03fcaa3e3337554c1a..68c57e43867bbaf4a19adf7b145b6b4fcb2e51b9 100644 (file)
 ENTRY (_ZGVeN16v_logf)
         .type   _ZGVeN16v_logf, @gnu_indirect_function
         cmpl    $0, KIND_OFFSET+__cpu_features(%rip)
-        jne     1
+        jne     1f
         call    __init_cpu_features
 1:      leaq    _ZGVeN16v_logf_skx(%rip), %rax
         testl   $bit_AVX512DQ_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512DQ_Usable(%rip)
-        jnz     3
-2:      leaq    _ZGVeN16v_logf_knl(%rip), %rax
+        jnz     2f
+        leaq    _ZGVeN16v_logf_knl(%rip), %rax
         testl   $bit_AVX512F_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512F_Usable(%rip)
-        jnz     3
+        jnz     2f
         leaq    _ZGVeN16v_logf_avx2_wrapper(%rip), %rax
-3:      ret
+2:      ret
 END (_ZGVeN16v_logf)
 
 #define _ZGVeN16v_logf _ZGVeN16v_logf_avx2_wrapper
index a4ba4fbc048e639193e455272eaf81985217d7fc..3aa9f952cea5bee6428c3bbb9717bd6335b1cf87 100644 (file)
 ENTRY (_ZGVeN16vv_powf)
         .type   _ZGVeN16vv_powf, @gnu_indirect_function
         cmpl    $0, KIND_OFFSET+__cpu_features(%rip)
-        jne     1
+        jne     1f
         call    __init_cpu_features
 1:      leaq    _ZGVeN16vv_powf_skx(%rip), %rax
         testl   $bit_AVX512DQ_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512DQ_Usable(%rip)
-        jnz     3
-2:      leaq    _ZGVeN16vv_powf_knl(%rip), %rax
+        jnz     2f
+        leaq    _ZGVeN16vv_powf_knl(%rip), %rax
         testl   $bit_AVX512F_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512F_Usable(%rip)
-        jnz     3
+        jnz     2f
         leaq    _ZGVeN16vv_powf_avx2_wrapper(%rip), %rax
-3:      ret
+2:      ret
 END (_ZGVeN16vv_powf)
 
 #define _ZGVeN16vv_powf _ZGVeN16vv_powf_avx2_wrapper
index 0a1753eab7fb1a51e106461d3312537c3c9f84a2..bdcabab6e2d2042be3c3e78c0e5f5e4ae91b5e32 100644 (file)
 ENTRY (_ZGVeN16vvv_sincosf)
         .type   _ZGVeN16vvv_sincosf, @gnu_indirect_function
         cmpl    $0, KIND_OFFSET+__cpu_features(%rip)
-        jne     1
+        jne     1f
         call    __init_cpu_features
 1:      leaq    _ZGVeN16vvv_sincosf_skx(%rip), %rax
         testl   $bit_AVX512DQ_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512DQ_Usable(%rip)
-        jnz     3
-2:      leaq    _ZGVeN16vvv_sincosf_knl(%rip), %rax
+        jnz     2f
+        leaq    _ZGVeN16vvv_sincosf_knl(%rip), %rax
         testl   $bit_AVX512F_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512F_Usable(%rip)
-        jnz     3
+        jnz     2f
         leaq    _ZGVeN16vvv_sincosf_avx2_wrapper(%rip), %rax
-3:      ret
+2:      ret
 END (_ZGVeN16vvv_sincosf)
 
 #define _ZGVeN16vvv_sincosf _ZGVeN16vvv_sincosf_avx2_wrapper
index 7ed637b8e66673b34ca861ab6c776c053c7ca884..3ec78a0b5e739dd68c44c40d45d1e7aaaca54c6f 100644 (file)
 ENTRY (_ZGVeN16v_sinf)
         .type   _ZGVeN16v_sinf, @gnu_indirect_function
         cmpl    $0, KIND_OFFSET+__cpu_features(%rip)
-        jne     1
+        jne     1f
         call    __init_cpu_features
 1:      leaq    _ZGVeN16v_sinf_skx(%rip), %rax
         testl   $bit_AVX512DQ_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512DQ_Usable(%rip)
-        jnz     3
-2:      leaq    _ZGVeN16v_sinf_knl(%rip), %rax
+        jnz     2f
+        leaq    _ZGVeN16v_sinf_knl(%rip), %rax
         testl   $bit_AVX512F_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512F_Usable(%rip)
-        jnz     3
+        jnz     2f
         leaq    _ZGVeN16v_sinf_avx2_wrapper(%rip), %rax
-3:      ret
+2:      ret
 END (_ZGVeN16v_sinf)
 
 #define _ZGVeN16v_sinf _ZGVeN16v_sinf_avx2_wrapper
index bd93b8edfa0f4fffba917fdc2c9426a022e158d9..5c0ff897c00be8f7e5b1639582a8b89635b349cb 100644 (file)
 
 /* AVX512 ISA version as wrapper to AVX2 ISA version.  */
 .macro WRAPPER_IMPL_AVX512 callee
-        pushq  %rbp
+        pushq     %rbp
         cfi_adjust_cfa_offset (8)
         cfi_rel_offset (%rbp, 0)
-        movq   %rsp, %rbp
+        movq      %rsp, %rbp
         cfi_def_cfa_register (%rbp)
-        andq   $-64, %rsp
-        subq   $64, %rsp
-/* Below is encoding for vmovaps %zmm0, (%rsp).  */
-        .byte  0x62
-        .byte  0xf1
-        .byte  0x7c
-        .byte  0x48
-        .byte  0x29
-        .byte  0x04
-        .byte  0x24
-/* Below is encoding for vmovapd (%rsp), %ymm0.  */
-        .byte  0xc5
-        .byte  0xfd
-        .byte  0x28
-        .byte  0x04
-        .byte  0x24
-        call   HIDDEN_JUMPTARGET(\callee)
-/* Below is encoding for vmovapd 32(%rsp), %ymm0.  */
-        .byte  0xc5
-        .byte  0xfd
-        .byte  0x28
-        .byte  0x44
-        .byte  0x24
-        .byte  0x20
-        call   HIDDEN_JUMPTARGET(\callee)
-        movq   %rbp, %rsp
+        andq      $-64, %rsp
+        subq      $128, %rsp
+/* Below is encoding for vmovups %zmm0, (%rsp).  */
+        .byte   0x62
+        .byte   0xf1
+        .byte   0x7c
+        .byte   0x48
+        .byte   0x11
+        .byte   0x04
+        .byte   0x24
+        vmovupd   (%rsp), %ymm0
+        call      HIDDEN_JUMPTARGET(\callee)
+        vmovupd   %ymm0, 64(%rsp)
+        vmovupd   32(%rsp), %ymm0
+        call      HIDDEN_JUMPTARGET(\callee)
+        vmovupd   %ymm0, 96(%rsp)
+/* Below is encoding for vmovups 64(%rsp), %zmm0.  */
+        .byte   0x62
+        .byte   0xf1
+        .byte   0x7c
+        .byte   0x48
+        .byte   0x10
+        .byte   0x44
+        .byte   0x24
+        .byte   0x01
+        movq      %rbp, %rsp
         cfi_def_cfa_register (%rsp)
-        popq   %rbp
+        popq      %rbp
         cfi_adjust_cfa_offset (-8)
         cfi_restore (%rbp)
         ret
 
 /* 2 argument AVX512 ISA version as wrapper to AVX2 ISA version.  */
 .macro WRAPPER_IMPL_AVX512_ff callee
-        pushq  %rbp
+        pushq     %rbp
         cfi_adjust_cfa_offset (8)
         cfi_rel_offset (%rbp, 0)
-        movq   %rsp, %rbp
+        movq      %rsp, %rbp
         cfi_def_cfa_register (%rbp)
-        andq   $-64, %rsp
-        subq   $128, %rsp
-/* Below is encoding for vmovaps %zmm0, (%rsp).  */
-        .byte  0x62
-        .byte  0xf1
-        .byte  0x7c
-        .byte  0x48
-        .byte  0x29
-        .byte  0x04
-        .byte  0x24
-/* Below is encoding for vmovaps %zmm1, 64(%rsp).  */
-        .byte  0x62
-        .byte  0xf1
-        .byte  0x7c
-        .byte  0x48
-        .byte  0x29
-        .byte  0x4c
-        .byte  0x24
-/* Below is encoding for vmovapd (%rsp), %ymm0.  */
-        .byte  0xc5
-        .byte  0xfd
-        .byte  0x28
-        .byte  0x04
-        .byte  0x24
-/* Below is encoding for vmovapd 64(%rsp), %ymm1.  */
-        .byte  0xc5
-        .byte  0xfd
-        .byte  0x28
-        .byte  0x4c
-        .byte  0x24
-        .byte  0x40
-        call   HIDDEN_JUMPTARGET(\callee)
-/* Below is encoding for vmovapd 32(%rsp), %ymm0.  */
-        .byte  0xc5
-        .byte  0xfd
-        .byte  0x28
-        .byte  0x44
-        .byte  0x24
-        .byte  0x20
-/* Below is encoding for vmovapd 96(%rsp), %ymm1.  */
-        .byte  0xc5
-        .byte  0xfd
-        .byte  0x28
-        .byte  0x4c
-        .byte  0x24
-        .byte  0x60
-        call   HIDDEN_JUMPTARGET(\callee)
-        movq   %rbp, %rsp
+        andq      $-64, %rsp
+        subq      $192, %rsp
+/* Below is encoding for vmovups %zmm0, (%rsp).  */
+        .byte   0x62
+        .byte   0xf1
+        .byte   0x7c
+        .byte   0x48
+        .byte   0x11
+        .byte   0x04
+        .byte   0x24
+/* Below is encoding for vmovups %zmm1, 64(%rsp).  */
+        .byte   0x62
+        .byte   0xf1
+        .byte   0x7c
+        .byte   0x48
+        .byte   0x11
+        .byte   0x4c
+        .byte   0x24
+        .byte   0x01
+        vmovupd   (%rsp), %ymm0
+        vmovupd   64(%rsp), %ymm1
+        call      HIDDEN_JUMPTARGET(\callee)
+        vmovupd   %ymm0, 128(%rsp)
+        vmovupd   32(%rsp), %ymm0
+        vmovupd   96(%rsp), %ymm1
+        call      HIDDEN_JUMPTARGET(\callee)
+        vmovupd   %ymm0, 160(%rsp)
+/* Below is encoding for vmovups 128(%rsp), %zmm0.  */
+        .byte   0x62
+        .byte   0xf1
+        .byte   0x7c
+        .byte   0x48
+        .byte   0x10
+        .byte   0x44
+        .byte   0x24
+        .byte   0x02
+        movq      %rbp, %rsp
         cfi_def_cfa_register (%rsp)
-        popq   %rbp
+        popq      %rbp
         cfi_adjust_cfa_offset (-8)
         cfi_restore (%rbp)
         ret
         cfi_rel_offset (%r13, 0)
         subq      $176, %rsp
         movq      %rsi, %r13
-/* Below is encoding for vmovaps %zmm0, (%rsp).  */
+/* Below is encoding for vmovups %zmm0, (%rsp).  */
         .byte  0x62
         .byte  0xf1
         .byte  0x7c
         .byte  0x48
-        .byte  0x29
+        .byte  0x11
         .byte  0x04
         .byte  0x24
         movq    %rdi, %r12
-/* Below is encoding for vmovapd (%rsp), %ymm0.  */
-        .byte  0xc5
-        .byte  0xfd
-        .byte  0x28
-        .byte  0x04
-        .byte  0x24
+        vmovupd (%rsp), %ymm0
         call      HIDDEN_JUMPTARGET(\callee)
-/* Below is encoding for vmovapd 32(%rsp), %ymm0.  */
-        .byte  0xc5
-        .byte  0xfd
-        .byte  0x28
-        .byte  0x44
-        .byte  0x24
-        .byte  0x20
+        vmovupd   32(%rsp), %ymm0
         lea       64(%rsp), %rdi
         lea       96(%rsp), %rsi
         call      HIDDEN_JUMPTARGET(\callee)
-/* Below is encoding for vmovapd 64(%rsp), %ymm0.  */
-        .byte  0xc5
-        .byte  0xfd
-        .byte  0x28
-        .byte  0x44
-        .byte  0x24
-        .byte  0x40
-/* Below is encoding for vmovapd   96(%rsp), %ymm1.  */
-        .byte  0xc5
-        .byte  0xfd
-        .byte  0x28
-        .byte  0x4c
-        .byte  0x24
-        .byte  0x60
-/* Below is encoding for vmovapd   %ymm0, 32(%r12).  */
-        .byte  0xc4
-        .byte  0xc1
-        .byte  0x7d
-        .byte  0x29
-        .byte  0x44
-        .byte  0x24
-        .byte  0x20
-/* Below is encoding for vmovapd   %ymm1, 32(%r13).  */
-        .byte  0xc4
-        .byte  0xc1
-        .byte  0x7d
-        .byte  0x29
-        .byte  0x4d
-        .byte  0x20
+        vmovupd   64(%rsp), %ymm0
+        vmovupd   96(%rsp), %ymm1
+        vmovupd   %ymm0, 32(%r12)
+        vmovupd   %ymm1, 32(%r13)
+        vzeroupper
         addq      $176, %rsp
         popq      %r13
         cfi_adjust_cfa_offset (-8)
index 66bb081c9de77fd5ac40e1365a2306626c7766bd..d255d195ee8190a7d5a2b5302b89367e51f10107 100644 (file)
 
 /* AVX512 ISA version as wrapper to AVX2 ISA version.  */
 .macro WRAPPER_IMPL_AVX512 callee
-        pushq  %rbp
+        pushq     %rbp
         cfi_adjust_cfa_offset (8)
         cfi_rel_offset (%rbp, 0)
-        movq   %rsp, %rbp
+        movq      %rsp, %rbp
         cfi_def_cfa_register (%rbp)
-        andq   $-64, %rsp
-        subq   $64, %rsp
-/* Below is encoding for vmovaps %zmm0, (%rsp).  */
-        .byte  0x62
-        .byte  0xf1
-        .byte  0x7c
-        .byte  0x48
-        .byte  0x29
-        .byte  0x04
-        .byte  0x24
-        vmovaps (%rsp), %ymm0
-        call   HIDDEN_JUMPTARGET(\callee)
-        vmovaps 32(%rsp), %ymm0
-        call   HIDDEN_JUMPTARGET(\callee)
-        movq   %rbp, %rsp
+        andq      $-64, %rsp
+        subq      $128, %rsp
+/* Below is encoding for vmovups %zmm0, (%rsp).  */
+        .byte   0x62
+        .byte   0xf1
+        .byte   0x7c
+        .byte   0x48
+        .byte   0x11
+        .byte   0x04
+        .byte   0x24
+        vmovupd   (%rsp), %ymm0
+        call      HIDDEN_JUMPTARGET(\callee)
+        vmovupd   %ymm0, 64(%rsp)
+        vmovupd   32(%rsp), %ymm0
+        call      HIDDEN_JUMPTARGET(\callee)
+        vmovupd   %ymm0, 96(%rsp)
+/* Below is encoding for vmovups 64(%rsp), %zmm0.  */
+        .byte   0x62
+        .byte   0xf1
+        .byte   0x7c
+        .byte   0x48
+        .byte   0x10
+        .byte   0x44
+        .byte   0x24
+        .byte   0x01
+        movq      %rbp, %rsp
         cfi_def_cfa_register (%rsp)
-        popq   %rbp
+        popq      %rbp
         cfi_adjust_cfa_offset (-8)
         cfi_restore (%rbp)
         ret
         movq      %rsp, %rbp
         cfi_def_cfa_register (%rbp)
         andq      $-64, %rsp
-        subq      $128, %rsp
-/* Below is encoding for vmovaps %zmm0, (%rsp).  */
-        .byte  0x62
-        .byte  0xf1
-        .byte  0x7c
-        .byte  0x48
-        .byte  0x29
-        .byte  0x04
-        .byte  0x24
-/* Below is encoding for vmovaps %zmm1, 64(%rsp).  */
-        .byte  0x62
-        .byte  0xf1
-        .byte  0x7c
-        .byte  0x48
-        .byte  0x29
-        .byte  0x4c
-        .byte  0x24
-        vmovaps (%rsp), %ymm0
-        vmovaps 64(%rsp), %ymm1
+        subq      $192, %rsp
+/* Below is encoding for vmovups %zmm0, (%rsp).  */
+        .byte   0x62
+        .byte   0xf1
+        .byte   0x7c
+        .byte   0x48
+        .byte   0x11
+        .byte   0x04
+        .byte   0x24
+/* Below is encoding for vmovups %zmm1, 64(%rsp).  */
+        .byte   0x62
+        .byte   0xf1
+        .byte   0x7c
+        .byte   0x48
+        .byte   0x11
+        .byte   0x4c
+        .byte   0x24
+        .byte   0x01
+        vmovups   (%rsp), %ymm0
+        vmovups   64(%rsp), %ymm1
         call      HIDDEN_JUMPTARGET(\callee)
-        vmovaps 32(%rsp), %ymm0
-        vmovaps 96(%rsp), %ymm1
+        vmovups   %ymm0, 128(%rsp)
+        vmovups   32(%rsp), %ymm0
+        vmovups   96(%rsp), %ymm1
         call      HIDDEN_JUMPTARGET(\callee)
+        vmovups   %ymm0, 160(%rsp)
+/* Below is encoding for vmovups 128(%rsp), %zmm0.  */
+        .byte   0x62
+        .byte   0xf1
+        .byte   0x7c
+        .byte   0x48
+        .byte   0x10
+        .byte   0x44
+        .byte   0x24
+        .byte   0x02
         movq      %rbp, %rsp
         cfi_def_cfa_register (%rsp)
         popq      %rbp
This page took 0.138014 seconds and 5 git commands to generate.