x86-64: Add AVX optimized string/memory functions for RTM
author     H.J. Lu <hjl.tools@gmail.com>
           Fri, 5 Mar 2021 15:26:42 +0000 (07:26 -0800)
committer  H.J. Lu <hjl.tools@gmail.com>
           Mon, 29 Mar 2021 14:40:17 +0000 (07:40 -0700)
Since VZEROUPPER triggers RTM abort while VZEROALL won't, select
AVX-optimized string/memory functions with

xtest
jz 1f
vzeroall
ret
1:
vzeroupper
ret

at function exit on processors that have usable RTM but lack usable
256-bit EVEX instructions, so that VZEROUPPER is never executed inside
a transactionally executing RTM region.
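The sequence above is what the new ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
macro expands to; it is added to sysdeps/x86_64/sysdep.h (listed among the
changed files, hunk not shown here), and each *-rtm.S wrapper routes its
returns through it.  A sketch of the macro, reconstructed from the sequence
in this message (the exact upstream spelling may differ in label layout):

#define ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST \
	xtest;		/* ZF is cleared inside an RTM transaction.  */ \
	jz	1f;	/* ZF set: not transactional, VZEROUPPER is safe.  */ \
	vzeroall;	/* Transactional: VZEROALL does not abort.  */ \
	ret; \
1: \
	vzeroupper; \
	ret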

52 files changed:
sysdeps/x86_64/multiarch/Makefile
sysdeps/x86_64/multiarch/ifunc-avx2.h
sysdeps/x86_64/multiarch/ifunc-impl-list.c
sysdeps/x86_64/multiarch/ifunc-memcmp.h
sysdeps/x86_64/multiarch/ifunc-memmove.h
sysdeps/x86_64/multiarch/ifunc-memset.h
sysdeps/x86_64/multiarch/ifunc-strcpy.h
sysdeps/x86_64/multiarch/ifunc-wmemset.h
sysdeps/x86_64/multiarch/memchr-avx2-rtm.S [new file with mode: 0644]
sysdeps/x86_64/multiarch/memchr-avx2.S
sysdeps/x86_64/multiarch/memcmp-avx2-movbe-rtm.S [new file with mode: 0644]
sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S
sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms-rtm.S [new file with mode: 0644]
sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
sysdeps/x86_64/multiarch/memrchr-avx2-rtm.S [new file with mode: 0644]
sysdeps/x86_64/multiarch/memrchr-avx2.S
sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S [new file with mode: 0644]
sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
sysdeps/x86_64/multiarch/rawmemchr-avx2-rtm.S [new file with mode: 0644]
sysdeps/x86_64/multiarch/stpcpy-avx2-rtm.S [new file with mode: 0644]
sysdeps/x86_64/multiarch/stpncpy-avx2-rtm.S [new file with mode: 0644]
sysdeps/x86_64/multiarch/strcat-avx2-rtm.S [new file with mode: 0644]
sysdeps/x86_64/multiarch/strcat-avx2.S
sysdeps/x86_64/multiarch/strchr-avx2-rtm.S [new file with mode: 0644]
sysdeps/x86_64/multiarch/strchr-avx2.S
sysdeps/x86_64/multiarch/strchr.c
sysdeps/x86_64/multiarch/strchrnul-avx2-rtm.S [new file with mode: 0644]
sysdeps/x86_64/multiarch/strcmp-avx2-rtm.S [new file with mode: 0644]
sysdeps/x86_64/multiarch/strcmp-avx2.S
sysdeps/x86_64/multiarch/strcmp.c
sysdeps/x86_64/multiarch/strcpy-avx2-rtm.S [new file with mode: 0644]
sysdeps/x86_64/multiarch/strcpy-avx2.S
sysdeps/x86_64/multiarch/strlen-avx2-rtm.S [new file with mode: 0644]
sysdeps/x86_64/multiarch/strlen-avx2.S
sysdeps/x86_64/multiarch/strncat-avx2-rtm.S [new file with mode: 0644]
sysdeps/x86_64/multiarch/strncmp-avx2-rtm.S [new file with mode: 0644]
sysdeps/x86_64/multiarch/strncmp.c
sysdeps/x86_64/multiarch/strncpy-avx2-rtm.S [new file with mode: 0644]
sysdeps/x86_64/multiarch/strnlen-avx2-rtm.S [new file with mode: 0644]
sysdeps/x86_64/multiarch/strrchr-avx2-rtm.S [new file with mode: 0644]
sysdeps/x86_64/multiarch/strrchr-avx2.S
sysdeps/x86_64/multiarch/wcschr-avx2-rtm.S [new file with mode: 0644]
sysdeps/x86_64/multiarch/wcscmp-avx2-rtm.S [new file with mode: 0644]
sysdeps/x86_64/multiarch/wcslen-avx2-rtm.S [new file with mode: 0644]
sysdeps/x86_64/multiarch/wcsncmp-avx2-rtm.S [new file with mode: 0644]
sysdeps/x86_64/multiarch/wcsnlen-avx2-rtm.S [new file with mode: 0644]
sysdeps/x86_64/multiarch/wcsnlen.c
sysdeps/x86_64/multiarch/wcsrchr-avx2-rtm.S [new file with mode: 0644]
sysdeps/x86_64/multiarch/wmemchr-avx2-rtm.S [new file with mode: 0644]
sysdeps/x86_64/multiarch/wmemcmp-avx2-movbe-rtm.S [new file with mode: 0644]
sysdeps/x86_64/sysdep.h

diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index 9d79b138e9fa3e7e4917e80f8e6113f40eeb9b26..491c7698dc1a936eef3f2cde134ff010b14da433 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -40,6 +40,25 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c \
                   memset-sse2-unaligned-erms \
                   memset-avx2-unaligned-erms \
                   memset-avx512-unaligned-erms \
+                  memchr-avx2-rtm \
+                  memcmp-avx2-movbe-rtm \
+                  memmove-avx-unaligned-erms-rtm \
+                  memrchr-avx2-rtm \
+                  memset-avx2-unaligned-erms-rtm \
+                  rawmemchr-avx2-rtm \
+                  strchr-avx2-rtm \
+                  strcmp-avx2-rtm \
+                  strchrnul-avx2-rtm \
+                  stpcpy-avx2-rtm \
+                  stpncpy-avx2-rtm \
+                  strcat-avx2-rtm \
+                  strcpy-avx2-rtm \
+                  strlen-avx2-rtm \
+                  strncat-avx2-rtm \
+                  strncmp-avx2-rtm \
+                  strncpy-avx2-rtm \
+                  strnlen-avx2-rtm \
+                  strrchr-avx2-rtm \
                   memchr-evex \
                   memcmp-evex-movbe \
                   memmove-evex-unaligned-erms \
@@ -76,6 +95,14 @@ sysdep_routines += wmemcmp-sse4 wmemcmp-ssse3 wmemcmp-c \
                   wcsrchr-sse2 wcsrchr-avx2 \
                   wcsnlen-sse4_1 wcsnlen-c \
                   wcslen-sse2 wcslen-avx2 wcsnlen-avx2 \
+                  wcschr-avx2-rtm \
+                  wcscmp-avx2-rtm \
+                  wcslen-avx2-rtm \
+                  wcsncmp-avx2-rtm \
+                  wcsnlen-avx2-rtm \
+                  wcsrchr-avx2-rtm \
+                  wmemchr-avx2-rtm \
+                  wmemcmp-avx2-movbe-rtm \
                   wcschr-evex \
                   wcscmp-evex \
                   wcslen-evex \
diff --git a/sysdeps/x86_64/multiarch/ifunc-avx2.h b/sysdeps/x86_64/multiarch/ifunc-avx2.h
index 634c3c3c919ad128a57f58fbd87728070fe82567..e3ec62ca5e2b6787fa53ac4bf555c5b6b01a0c5b 100644
--- a/sysdeps/x86_64/multiarch/ifunc-avx2.h
+++ b/sysdeps/x86_64/multiarch/ifunc-avx2.h
@@ -21,6 +21,7 @@
 
 extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
 
 static inline void *
@@ -36,6 +37,9 @@ IFUNC_SELECTOR (void)
          && CPU_FEATURE_USABLE_P (cpu_features, BMI2))
        return OPTIMIZE (evex);
 
+      if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
+       return OPTIMIZE (avx2_rtm);
+
       if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
        return OPTIMIZE (avx2);
     }
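With this hunk, the shared AVX2 selector prefers EVEX, then the RTM-safe
AVX2 variant, then plain AVX2, then the SSE2 fallback.  For context, the
resulting selector reads roughly as follows (a sketch: the outer
AVX2/AVX_Fast_Unaligned_Load guard and the SSE2 fallback are assumed from
the surrounding file and are not part of this hunk):

static inline void *
IFUNC_SELECTOR (void)
{
  const struct cpu_features *cpu_features = __get_cpu_features ();

  /* Assumed outer guard; only the RTM check below is new in this commit.  */
  if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)
      && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
    {
      if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
	  && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
	  && CPU_FEATURE_USABLE_P (cpu_features, BMI2))
	return OPTIMIZE (evex);

      /* New: on RTM-capable processors without usable 256-bit EVEX,
	 pick the variant whose exit avoids a bare VZEROUPPER.  */
      if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
	return OPTIMIZE (avx2_rtm);

      if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
	return OPTIMIZE (avx2);
    }

  return OPTIMIZE (sse2);
}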
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index 96344a71e4461ac26837f5cf303d54fc98282dfb..024913065be053f97447f065a8a463e700dea963 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -43,6 +43,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
              IFUNC_IMPL_ADD (array, i, memchr,
                              CPU_FEATURE_USABLE (AVX2),
                              __memchr_avx2)
+             IFUNC_IMPL_ADD (array, i, memchr,
+                             (CPU_FEATURE_USABLE (AVX2)
+                              && CPU_FEATURE_USABLE (RTM)),
+                             __memchr_avx2_rtm)
              IFUNC_IMPL_ADD (array, i, memchr,
                              (CPU_FEATURE_USABLE (AVX512VL)
                               && CPU_FEATURE_USABLE (AVX512BW)
@@ -56,6 +60,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
                              (CPU_FEATURE_USABLE (AVX2)
                               && CPU_FEATURE_USABLE (MOVBE)),
                              __memcmp_avx2_movbe)
+             IFUNC_IMPL_ADD (array, i, memcmp,
+                             (CPU_FEATURE_USABLE (AVX2)
+                              && CPU_FEATURE_USABLE (MOVBE)
+                              && CPU_FEATURE_USABLE (RTM)),
+                             __memcmp_avx2_movbe_rtm)
              IFUNC_IMPL_ADD (array, i, memcmp,
                              (CPU_FEATURE_USABLE (AVX512VL)
                               && CPU_FEATURE_USABLE (AVX512BW)
@@ -85,6 +94,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
              IFUNC_IMPL_ADD (array, i, __memmove_chk,
                              CPU_FEATURE_USABLE (AVX),
                              __memmove_chk_avx_unaligned_erms)
+             IFUNC_IMPL_ADD (array, i, __memmove_chk,
+                             (CPU_FEATURE_USABLE (AVX)
+                              && CPU_FEATURE_USABLE (RTM)),
+                             __memmove_chk_avx_unaligned_rtm)
+             IFUNC_IMPL_ADD (array, i, __memmove_chk,
+                             (CPU_FEATURE_USABLE (AVX)
+                              && CPU_FEATURE_USABLE (RTM)),
+                             __memmove_chk_avx_unaligned_erms_rtm)
              IFUNC_IMPL_ADD (array, i, __memmove_chk,
                              CPU_FEATURE_USABLE (AVX512VL),
                              __memmove_chk_evex_unaligned)
@@ -113,6 +130,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
              IFUNC_IMPL_ADD (array, i, memmove,
                              CPU_FEATURE_USABLE (AVX),
                              __memmove_avx_unaligned_erms)
+             IFUNC_IMPL_ADD (array, i, memmove,
+                             (CPU_FEATURE_USABLE (AVX)
+                              && CPU_FEATURE_USABLE (RTM)),
+                             __memmove_avx_unaligned_rtm)
+             IFUNC_IMPL_ADD (array, i, memmove,
+                             (CPU_FEATURE_USABLE (AVX)
+                              && CPU_FEATURE_USABLE (RTM)),
+                             __memmove_avx_unaligned_erms_rtm)
              IFUNC_IMPL_ADD (array, i, memmove,
                              CPU_FEATURE_USABLE (AVX512VL),
                              __memmove_evex_unaligned)
@@ -143,6 +168,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
              IFUNC_IMPL_ADD (array, i, memrchr,
                              CPU_FEATURE_USABLE (AVX2),
                              __memrchr_avx2)
+             IFUNC_IMPL_ADD (array, i, memrchr,
+                             (CPU_FEATURE_USABLE (AVX2)
+                              && CPU_FEATURE_USABLE (RTM)),
+                             __memrchr_avx2_rtm)
              IFUNC_IMPL_ADD (array, i, memrchr,
                              (CPU_FEATURE_USABLE (AVX512VL)
                               && CPU_FEATURE_USABLE (AVX512BW)),
@@ -165,6 +194,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
              IFUNC_IMPL_ADD (array, i, __memset_chk,
                              CPU_FEATURE_USABLE (AVX2),
                              __memset_chk_avx2_unaligned_erms)
+             IFUNC_IMPL_ADD (array, i, __memset_chk,
+                             (CPU_FEATURE_USABLE (AVX2)
+                              && CPU_FEATURE_USABLE (RTM)),
+                             __memset_chk_avx2_unaligned_rtm)
+             IFUNC_IMPL_ADD (array, i, __memset_chk,
+                             (CPU_FEATURE_USABLE (AVX2)
+                              && CPU_FEATURE_USABLE (RTM)),
+                             __memset_chk_avx2_unaligned_erms_rtm)
              IFUNC_IMPL_ADD (array, i, __memset_chk,
                              (CPU_FEATURE_USABLE (AVX512VL)
                               && CPU_FEATURE_USABLE (AVX512BW)),
@@ -198,6 +235,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
              IFUNC_IMPL_ADD (array, i, memset,
                              CPU_FEATURE_USABLE (AVX2),
                              __memset_avx2_unaligned_erms)
+             IFUNC_IMPL_ADD (array, i, memset,
+                             (CPU_FEATURE_USABLE (AVX2)
+                              && CPU_FEATURE_USABLE (RTM)),
+                             __memset_avx2_unaligned_rtm)
+             IFUNC_IMPL_ADD (array, i, memset,
+                             (CPU_FEATURE_USABLE (AVX2)
+                              && CPU_FEATURE_USABLE (RTM)),
+                             __memset_avx2_unaligned_erms_rtm)
              IFUNC_IMPL_ADD (array, i, memset,
                              (CPU_FEATURE_USABLE (AVX512VL)
                               && CPU_FEATURE_USABLE (AVX512BW)),
@@ -222,6 +267,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
              IFUNC_IMPL_ADD (array, i, rawmemchr,
                              CPU_FEATURE_USABLE (AVX2),
                              __rawmemchr_avx2)
+             IFUNC_IMPL_ADD (array, i, rawmemchr,
+                             (CPU_FEATURE_USABLE (AVX2)
+                              && CPU_FEATURE_USABLE (RTM)),
+                             __rawmemchr_avx2_rtm)
              IFUNC_IMPL_ADD (array, i, rawmemchr,
                              (CPU_FEATURE_USABLE (AVX512VL)
                               && CPU_FEATURE_USABLE (AVX512BW)
@@ -234,6 +283,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
              IFUNC_IMPL_ADD (array, i, strlen,
                              CPU_FEATURE_USABLE (AVX2),
                              __strlen_avx2)
+             IFUNC_IMPL_ADD (array, i, strlen,
+                             (CPU_FEATURE_USABLE (AVX2)
+                              && CPU_FEATURE_USABLE (RTM)),
+                             __strlen_avx2_rtm)
              IFUNC_IMPL_ADD (array, i, strlen,
                              (CPU_FEATURE_USABLE (AVX512VL)
                               && CPU_FEATURE_USABLE (AVX512BW)),
@@ -245,6 +298,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
              IFUNC_IMPL_ADD (array, i, strnlen,
                              CPU_FEATURE_USABLE (AVX2),
                              __strnlen_avx2)
+             IFUNC_IMPL_ADD (array, i, strnlen,
+                             (CPU_FEATURE_USABLE (AVX2)
+                              && CPU_FEATURE_USABLE (RTM)),
+                             __strnlen_avx2_rtm)
              IFUNC_IMPL_ADD (array, i, strnlen,
                              (CPU_FEATURE_USABLE (AVX512VL)
                               && CPU_FEATURE_USABLE (AVX512BW)),
@@ -257,6 +314,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
                              __stpncpy_ssse3)
              IFUNC_IMPL_ADD (array, i, stpncpy, CPU_FEATURE_USABLE (AVX2),
                              __stpncpy_avx2)
+             IFUNC_IMPL_ADD (array, i, stpncpy,
+                             (CPU_FEATURE_USABLE (AVX2)
+                              && CPU_FEATURE_USABLE (RTM)),
+                             __stpncpy_avx2_rtm)
              IFUNC_IMPL_ADD (array, i, stpncpy,
                              (CPU_FEATURE_USABLE (AVX512VL)
                               && CPU_FEATURE_USABLE (AVX512BW)),
@@ -271,6 +332,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
                              __stpcpy_ssse3)
              IFUNC_IMPL_ADD (array, i, stpcpy, CPU_FEATURE_USABLE (AVX2),
                              __stpcpy_avx2)
+             IFUNC_IMPL_ADD (array, i, stpcpy,
+                             (CPU_FEATURE_USABLE (AVX2)
+                              && CPU_FEATURE_USABLE (RTM)),
+                             __stpcpy_avx2_rtm)
              IFUNC_IMPL_ADD (array, i, stpcpy,
                              (CPU_FEATURE_USABLE (AVX512VL)
                               && CPU_FEATURE_USABLE (AVX512BW)),
@@ -309,6 +374,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
   IFUNC_IMPL (i, name, strcat,
              IFUNC_IMPL_ADD (array, i, strcat, CPU_FEATURE_USABLE (AVX2),
                              __strcat_avx2)
+             IFUNC_IMPL_ADD (array, i, strcat,
+                             (CPU_FEATURE_USABLE (AVX2)
+                              && CPU_FEATURE_USABLE (RTM)),
+                             __strcat_avx2_rtm)
              IFUNC_IMPL_ADD (array, i, strcat,
                              (CPU_FEATURE_USABLE (AVX512VL)
                               && CPU_FEATURE_USABLE (AVX512BW)),
@@ -323,6 +392,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
              IFUNC_IMPL_ADD (array, i, strchr,
                              CPU_FEATURE_USABLE (AVX2),
                              __strchr_avx2)
+             IFUNC_IMPL_ADD (array, i, strchr,
+                             (CPU_FEATURE_USABLE (AVX2)
+                              && CPU_FEATURE_USABLE (RTM)),
+                             __strchr_avx2_rtm)
              IFUNC_IMPL_ADD (array, i, strchr,
                              (CPU_FEATURE_USABLE (AVX512VL)
                               && CPU_FEATURE_USABLE (AVX512BW)
@@ -336,6 +409,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
              IFUNC_IMPL_ADD (array, i, strchrnul,
                              CPU_FEATURE_USABLE (AVX2),
                              __strchrnul_avx2)
+             IFUNC_IMPL_ADD (array, i, strchrnul,
+                             (CPU_FEATURE_USABLE (AVX2)
+                              && CPU_FEATURE_USABLE (RTM)),
+                             __strchrnul_avx2_rtm)
              IFUNC_IMPL_ADD (array, i, strchrnul,
                              (CPU_FEATURE_USABLE (AVX512VL)
                               && CPU_FEATURE_USABLE (AVX512BW)
@@ -348,6 +425,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
              IFUNC_IMPL_ADD (array, i, strrchr,
                              CPU_FEATURE_USABLE (AVX2),
                              __strrchr_avx2)
+             IFUNC_IMPL_ADD (array, i, strrchr,
+                             (CPU_FEATURE_USABLE (AVX2)
+                              && CPU_FEATURE_USABLE (RTM)),
+                             __strrchr_avx2_rtm)
              IFUNC_IMPL_ADD (array, i, strrchr,
                              (CPU_FEATURE_USABLE (AVX512VL)
                               && CPU_FEATURE_USABLE (AVX512BW)),
@@ -359,6 +440,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
              IFUNC_IMPL_ADD (array, i, strcmp,
                              CPU_FEATURE_USABLE (AVX2),
                              __strcmp_avx2)
+             IFUNC_IMPL_ADD (array, i, strcmp,
+                             (CPU_FEATURE_USABLE (AVX2)
+                              && CPU_FEATURE_USABLE (RTM)),
+                             __strcmp_avx2_rtm)
              IFUNC_IMPL_ADD (array, i, strcmp,
                              (CPU_FEATURE_USABLE (AVX512VL)
                               && CPU_FEATURE_USABLE (AVX512BW)
@@ -375,6 +460,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
   IFUNC_IMPL (i, name, strcpy,
              IFUNC_IMPL_ADD (array, i, strcpy, CPU_FEATURE_USABLE (AVX2),
                              __strcpy_avx2)
+             IFUNC_IMPL_ADD (array, i, strcpy,
+                             (CPU_FEATURE_USABLE (AVX2)
+                              && CPU_FEATURE_USABLE (RTM)),
+                             __strcpy_avx2_rtm)
              IFUNC_IMPL_ADD (array, i, strcpy,
                              (CPU_FEATURE_USABLE (AVX512VL)
                               && CPU_FEATURE_USABLE (AVX512BW)),
@@ -422,6 +511,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
   IFUNC_IMPL (i, name, strncat,
              IFUNC_IMPL_ADD (array, i, strncat, CPU_FEATURE_USABLE (AVX2),
                              __strncat_avx2)
+             IFUNC_IMPL_ADD (array, i, strncat,
+                             (CPU_FEATURE_USABLE (AVX2)
+                              && CPU_FEATURE_USABLE (RTM)),
+                             __strncat_avx2_rtm)
              IFUNC_IMPL_ADD (array, i, strncat,
                              (CPU_FEATURE_USABLE (AVX512VL)
                               && CPU_FEATURE_USABLE (AVX512BW)),
@@ -436,6 +529,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
   IFUNC_IMPL (i, name, strncpy,
              IFUNC_IMPL_ADD (array, i, strncpy, CPU_FEATURE_USABLE (AVX2),
                              __strncpy_avx2)
+             IFUNC_IMPL_ADD (array, i, strncpy,
+                             (CPU_FEATURE_USABLE (AVX2)
+                              && CPU_FEATURE_USABLE (RTM)),
+                             __strncpy_avx2_rtm)
              IFUNC_IMPL_ADD (array, i, strncpy,
                              (CPU_FEATURE_USABLE (AVX512VL)
                               && CPU_FEATURE_USABLE (AVX512BW)),
@@ -469,6 +566,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
              IFUNC_IMPL_ADD (array, i, wcschr,
                              CPU_FEATURE_USABLE (AVX2),
                              __wcschr_avx2)
+             IFUNC_IMPL_ADD (array, i, wcschr,
+                             (CPU_FEATURE_USABLE (AVX2)
+                              && CPU_FEATURE_USABLE (RTM)),
+                             __wcschr_avx2_rtm)
              IFUNC_IMPL_ADD (array, i, wcschr,
                              (CPU_FEATURE_USABLE (AVX512VL)
                               && CPU_FEATURE_USABLE (AVX512BW)
@@ -481,6 +582,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
              IFUNC_IMPL_ADD (array, i, wcsrchr,
                              CPU_FEATURE_USABLE (AVX2),
                              __wcsrchr_avx2)
+             IFUNC_IMPL_ADD (array, i, wcsrchr,
+                             (CPU_FEATURE_USABLE (AVX2)
+                              && CPU_FEATURE_USABLE (RTM)),
+                             __wcsrchr_avx2_rtm)
              IFUNC_IMPL_ADD (array, i, wcsrchr,
                              (CPU_FEATURE_USABLE (AVX512VL)
                               && CPU_FEATURE_USABLE (AVX512BW)
@@ -493,6 +598,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
              IFUNC_IMPL_ADD (array, i, wcscmp,
                              CPU_FEATURE_USABLE (AVX2),
                              __wcscmp_avx2)
+             IFUNC_IMPL_ADD (array, i, wcscmp,
+                             (CPU_FEATURE_USABLE (AVX2)
+                              && CPU_FEATURE_USABLE (RTM)),
+                             __wcscmp_avx2_rtm)
              IFUNC_IMPL_ADD (array, i, wcscmp,
                              (CPU_FEATURE_USABLE (AVX512VL)
                               && CPU_FEATURE_USABLE (AVX512BW)
@@ -505,6 +614,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
              IFUNC_IMPL_ADD (array, i, wcsncmp,
                              CPU_FEATURE_USABLE (AVX2),
                              __wcsncmp_avx2)
+             IFUNC_IMPL_ADD (array, i, wcsncmp,
+                             (CPU_FEATURE_USABLE (AVX2)
+                              && CPU_FEATURE_USABLE (RTM)),
+                             __wcsncmp_avx2_rtm)
              IFUNC_IMPL_ADD (array, i, wcsncmp,
                              (CPU_FEATURE_USABLE (AVX512VL)
                               && CPU_FEATURE_USABLE (AVX512BW)
@@ -523,6 +636,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
              IFUNC_IMPL_ADD (array, i, wcslen,
                              CPU_FEATURE_USABLE (AVX2),
                              __wcslen_avx2)
+             IFUNC_IMPL_ADD (array, i, wcslen,
+                             (CPU_FEATURE_USABLE (AVX2)
+                              && CPU_FEATURE_USABLE (RTM)),
+                             __wcslen_avx2_rtm)
              IFUNC_IMPL_ADD (array, i, wcslen,
                              (CPU_FEATURE_USABLE (AVX512VL)
                               && CPU_FEATURE_USABLE (AVX512BW)
@@ -535,6 +652,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
              IFUNC_IMPL_ADD (array, i, wcsnlen,
                              CPU_FEATURE_USABLE (AVX2),
                              __wcsnlen_avx2)
+             IFUNC_IMPL_ADD (array, i, wcsnlen,
+                             (CPU_FEATURE_USABLE (AVX2)
+                              && CPU_FEATURE_USABLE (RTM)),
+                             __wcsnlen_avx2_rtm)
              IFUNC_IMPL_ADD (array, i, wcsnlen,
                              (CPU_FEATURE_USABLE (AVX512VL)
                               && CPU_FEATURE_USABLE (AVX512BW)
@@ -550,6 +671,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
              IFUNC_IMPL_ADD (array, i, wmemchr,
                              CPU_FEATURE_USABLE (AVX2),
                              __wmemchr_avx2)
+             IFUNC_IMPL_ADD (array, i, wmemchr,
+                             (CPU_FEATURE_USABLE (AVX2)
+                              && CPU_FEATURE_USABLE (RTM)),
+                             __wmemchr_avx2_rtm)
              IFUNC_IMPL_ADD (array, i, wmemchr,
                              (CPU_FEATURE_USABLE (AVX512VL)
                               && CPU_FEATURE_USABLE (AVX512BW)
@@ -563,6 +688,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
                              (CPU_FEATURE_USABLE (AVX2)
                               && CPU_FEATURE_USABLE (MOVBE)),
                              __wmemcmp_avx2_movbe)
+             IFUNC_IMPL_ADD (array, i, wmemcmp,
+                             (CPU_FEATURE_USABLE (AVX2)
+                              && CPU_FEATURE_USABLE (MOVBE)
+                              && CPU_FEATURE_USABLE (RTM)),
+                             __wmemcmp_avx2_movbe_rtm)
              IFUNC_IMPL_ADD (array, i, wmemcmp,
                              (CPU_FEATURE_USABLE (AVX512VL)
                               && CPU_FEATURE_USABLE (AVX512BW)
@@ -581,6 +711,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
              IFUNC_IMPL_ADD (array, i, wmemset,
                              CPU_FEATURE_USABLE (AVX2),
                              __wmemset_avx2_unaligned)
+             IFUNC_IMPL_ADD (array, i, wmemset,
+                             (CPU_FEATURE_USABLE (AVX2)
+                              && CPU_FEATURE_USABLE (RTM)),
+                             __wmemset_avx2_unaligned_rtm)
              IFUNC_IMPL_ADD (array, i, wmemset,
                              CPU_FEATURE_USABLE (AVX512VL),
                              __wmemset_evex_unaligned)
@@ -606,6 +740,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
              IFUNC_IMPL_ADD (array, i, __memcpy_chk,
                              CPU_FEATURE_USABLE (AVX),
                              __memcpy_chk_avx_unaligned_erms)
+             IFUNC_IMPL_ADD (array, i, __memcpy_chk,
+                             (CPU_FEATURE_USABLE (AVX)
+                              && CPU_FEATURE_USABLE (RTM)),
+                             __memcpy_chk_avx_unaligned_rtm)
+             IFUNC_IMPL_ADD (array, i, __memcpy_chk,
+                             (CPU_FEATURE_USABLE (AVX)
+                              && CPU_FEATURE_USABLE (RTM)),
+                             __memcpy_chk_avx_unaligned_erms_rtm)
              IFUNC_IMPL_ADD (array, i, __memcpy_chk,
                              CPU_FEATURE_USABLE (AVX512VL),
                              __memcpy_chk_evex_unaligned)
@@ -634,6 +776,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
              IFUNC_IMPL_ADD (array, i, memcpy,
                              CPU_FEATURE_USABLE (AVX),
                              __memcpy_avx_unaligned_erms)
+             IFUNC_IMPL_ADD (array, i, memcpy,
+                             (CPU_FEATURE_USABLE (AVX)
+                              && CPU_FEATURE_USABLE (RTM)),
+                             __memcpy_avx_unaligned_rtm)
+             IFUNC_IMPL_ADD (array, i, memcpy,
+                             (CPU_FEATURE_USABLE (AVX)
+                              && CPU_FEATURE_USABLE (RTM)),
+                             __memcpy_avx_unaligned_erms_rtm)
              IFUNC_IMPL_ADD (array, i, memcpy,
                              CPU_FEATURE_USABLE (AVX512VL),
                              __memcpy_evex_unaligned)
@@ -676,6 +826,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
              IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
                              CPU_FEATURE_USABLE (AVX),
                              __mempcpy_chk_avx_unaligned_erms)
+             IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
+                             (CPU_FEATURE_USABLE (AVX)
+                              && CPU_FEATURE_USABLE (RTM)),
+                             __mempcpy_chk_avx_unaligned_rtm)
+             IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
+                             (CPU_FEATURE_USABLE (AVX)
+                              && CPU_FEATURE_USABLE (RTM)),
+                             __mempcpy_chk_avx_unaligned_erms_rtm)
              IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
                              CPU_FEATURE_USABLE (AVX512VL),
                              __mempcpy_chk_evex_unaligned)
@@ -713,6 +871,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
              IFUNC_IMPL_ADD (array, i, mempcpy,
                              CPU_FEATURE_USABLE (AVX),
                              __mempcpy_avx_unaligned_erms)
+             IFUNC_IMPL_ADD (array, i, mempcpy,
+                             (CPU_FEATURE_USABLE (AVX)
+                              && CPU_FEATURE_USABLE (RTM)),
+                             __mempcpy_avx_unaligned_rtm)
+             IFUNC_IMPL_ADD (array, i, mempcpy,
+                             (CPU_FEATURE_USABLE (AVX)
+                              && CPU_FEATURE_USABLE (RTM)),
+                             __mempcpy_avx_unaligned_erms_rtm)
              IFUNC_IMPL_ADD (array, i, mempcpy,
                              CPU_FEATURE_USABLE (AVX512VL),
                              __mempcpy_evex_unaligned)
@@ -734,6 +900,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
              IFUNC_IMPL_ADD (array, i, strncmp,
                              CPU_FEATURE_USABLE (AVX2),
                              __strncmp_avx2)
+             IFUNC_IMPL_ADD (array, i, strncmp,
+                             (CPU_FEATURE_USABLE (AVX2)
+                              && CPU_FEATURE_USABLE (RTM)),
+                             __strncmp_avx2_rtm)
              IFUNC_IMPL_ADD (array, i, strncmp,
                              (CPU_FEATURE_USABLE (AVX512VL)
                               && CPU_FEATURE_USABLE (AVX512BW)),
diff --git a/sysdeps/x86_64/multiarch/ifunc-memcmp.h b/sysdeps/x86_64/multiarch/ifunc-memcmp.h
index 5ac41a19b876b0d917501dfb2def5a74f7faf3bc..8bee1aff75d6e6f20e2aa30b69709df77ccea54e 100644
--- a/sysdeps/x86_64/multiarch/ifunc-memcmp.h
+++ b/sysdeps/x86_64/multiarch/ifunc-memcmp.h
@@ -23,6 +23,7 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (sse4_1) attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_movbe) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_movbe_rtm) attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_movbe) attribute_hidden;
 
 static inline void *
@@ -38,6 +39,9 @@ IFUNC_SELECTOR (void)
          && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
        return OPTIMIZE (evex_movbe);
 
+      if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
+       return OPTIMIZE (avx2_movbe_rtm);
+
       if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
        return OPTIMIZE (avx2_movbe);
     }
diff --git a/sysdeps/x86_64/multiarch/ifunc-memmove.h b/sysdeps/x86_64/multiarch/ifunc-memmove.h
index 517b332bfc2b069358bf0d91188c18b8b36c930e..4eba926eca4679a166e768defe02a3d3fffd61fb 100644
--- a/sysdeps/x86_64/multiarch/ifunc-memmove.h
+++ b/sysdeps/x86_64/multiarch/ifunc-memmove.h
@@ -29,6 +29,10 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3_back) attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (avx_unaligned) attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (avx_unaligned_erms)
   attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (avx_unaligned_rtm)
+  attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (avx_unaligned_erms_rtm)
+  attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_unaligned)
   attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_unaligned_erms)
@@ -71,6 +75,14 @@ IFUNC_SELECTOR (void)
          return OPTIMIZE (evex_unaligned);
        }
 
+      if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
+       {
+         if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
+           return OPTIMIZE (avx_unaligned_erms_rtm);
+
+         return OPTIMIZE (avx_unaligned_rtm);
+       }
+
       if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
        {
          if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
diff --git a/sysdeps/x86_64/multiarch/ifunc-memset.h b/sysdeps/x86_64/multiarch/ifunc-memset.h
index 02468182636322c533dcd2c836caa51f1bd970c3..43655fb68473c0ea1f6a015a020b8e93c79f3751 100644
--- a/sysdeps/x86_64/multiarch/ifunc-memset.h
+++ b/sysdeps/x86_64/multiarch/ifunc-memset.h
@@ -27,6 +27,10 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned_erms)
 extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_unaligned) attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_unaligned_erms)
   attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_unaligned_rtm)
+  attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_unaligned_erms_rtm)
+  attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_unaligned)
   attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_unaligned_erms)
@@ -69,6 +73,14 @@ IFUNC_SELECTOR (void)
          return OPTIMIZE (evex_unaligned);
        }
 
+      if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
+       {
+         if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
+           return OPTIMIZE (avx2_unaligned_erms_rtm);
+
+         return OPTIMIZE (avx2_unaligned_rtm);
+       }
+
       if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
        {
          if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
diff --git a/sysdeps/x86_64/multiarch/ifunc-strcpy.h b/sysdeps/x86_64/multiarch/ifunc-strcpy.h
index f31f436adfe24af108eb0546e5e6b617fbb8a284..39568f480f4b6ba2409d48c91b03f8055a2e874f 100644
--- a/sysdeps/x86_64/multiarch/ifunc-strcpy.h
+++ b/sysdeps/x86_64/multiarch/ifunc-strcpy.h
@@ -25,6 +25,7 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned)
   attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
 
 static inline void *
@@ -39,6 +40,9 @@ IFUNC_SELECTOR (void)
          && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
        return OPTIMIZE (evex);
 
+      if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
+       return OPTIMIZE (avx2_rtm);
+
       if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
        return OPTIMIZE (avx2);
     }
diff --git a/sysdeps/x86_64/multiarch/ifunc-wmemset.h b/sysdeps/x86_64/multiarch/ifunc-wmemset.h
index 7e947c56b4d0201390634087f66a73f1e0a714fa..8d952eff99a1b3cc56a64dd36f52214404a07fa7 100644
--- a/sysdeps/x86_64/multiarch/ifunc-wmemset.h
+++ b/sysdeps/x86_64/multiarch/ifunc-wmemset.h
@@ -20,6 +20,8 @@
 
 extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned) attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_unaligned) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_unaligned_rtm)
+  attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_unaligned) attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (avx512_unaligned) attribute_hidden;
 
@@ -39,6 +41,9 @@ IFUNC_SELECTOR (void)
       if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL))
        return OPTIMIZE (evex_unaligned);
 
+      if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
+       return OPTIMIZE (avx2_unaligned_rtm);
+
       if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
        return OPTIMIZE (avx2_unaligned);
     }
diff --git a/sysdeps/x86_64/multiarch/memchr-avx2-rtm.S b/sysdeps/x86_64/multiarch/memchr-avx2-rtm.S
new file mode 100644
index 0000000..87b076c
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memchr-avx2-rtm.S
@@ -0,0 +1,12 @@
+#ifndef MEMCHR
+# define MEMCHR __memchr_avx2_rtm
+#endif
+
+#define ZERO_UPPER_VEC_REGISTERS_RETURN \
+  ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
+
+#define VZEROUPPER_RETURN jmp   L(return_vzeroupper)
+
+#define SECTION(p) p##.avx.rtm
+
+#include "memchr-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/memchr-avx2.S b/sysdeps/x86_64/multiarch/memchr-avx2.S
index 77a95231681df3e3bb3b5e1877af46fc7d6e0b4e..1fcb1c350f0e6e61beec814ce24486199b9b51ff 100644
--- a/sysdeps/x86_64/multiarch/memchr-avx2.S
+++ b/sysdeps/x86_64/multiarch/memchr-avx2.S
 #  define VZEROUPPER   vzeroupper
 # endif
 
+# ifndef SECTION
+#  define SECTION(p)   p##.avx
+# endif
+
 # define VEC_SIZE 32
 
-       .section .text.avx,"ax",@progbits
+       .section SECTION(.text),"ax",@progbits
 ENTRY (MEMCHR)
 # ifndef USE_AS_RAWMEMCHR
        /* Check for zero length.  */
@@ -107,8 +111,8 @@ L(cros_page_boundary):
 # endif
        addq    %rdi, %rax
        addq    %rcx, %rax
-       VZEROUPPER
-       ret
+L(return_vzeroupper):
+       ZERO_UPPER_VEC_REGISTERS_RETURN
 
        .p2align 4
 L(aligned_more):
@@ -224,8 +228,7 @@ L(last_4x_vec_or_less):
 
        jnz     L(first_vec_x3_check)
        xorl    %eax, %eax
-       VZEROUPPER
-       ret
+       VZEROUPPER_RETURN
 
        .p2align 4
 L(last_2x_vec):
@@ -243,8 +246,7 @@ L(last_2x_vec):
        testl   %eax, %eax
        jnz     L(first_vec_x1_check)
        xorl    %eax, %eax
-       VZEROUPPER
-       ret
+       VZEROUPPER_RETURN
 
        .p2align 4
 L(first_vec_x0_check):
@@ -253,8 +255,7 @@ L(first_vec_x0_check):
        cmpq    %rax, %rdx
        jbe     L(zero)
        addq    %rdi, %rax
-       VZEROUPPER
-       ret
+       VZEROUPPER_RETURN
 
        .p2align 4
 L(first_vec_x1_check):
@@ -264,8 +265,7 @@ L(first_vec_x1_check):
        jbe     L(zero)
        addq    $VEC_SIZE, %rax
        addq    %rdi, %rax
-       VZEROUPPER
-       ret
+       VZEROUPPER_RETURN
 
        .p2align 4
 L(first_vec_x2_check):
@@ -275,8 +275,7 @@ L(first_vec_x2_check):
        jbe     L(zero)
        addq    $(VEC_SIZE * 2), %rax
        addq    %rdi, %rax
-       VZEROUPPER
-       ret
+       VZEROUPPER_RETURN
 
        .p2align 4
 L(first_vec_x3_check):
@@ -286,12 +285,14 @@ L(first_vec_x3_check):
        jbe     L(zero)
        addq    $(VEC_SIZE * 3), %rax
        addq    %rdi, %rax
-       VZEROUPPER
-       ret
+       VZEROUPPER_RETURN
 
        .p2align 4
 L(zero):
-       VZEROUPPER
+       xorl    %eax, %eax
+       jmp     L(return_vzeroupper)
+
+       .p2align 4
 L(null):
        xorl    %eax, %eax
        ret
@@ -301,24 +302,21 @@ L(null):
 L(first_vec_x0):
        tzcntl  %eax, %eax
        addq    %rdi, %rax
-       VZEROUPPER
-       ret
+       VZEROUPPER_RETURN
 
        .p2align 4
 L(first_vec_x1):
        tzcntl  %eax, %eax
        addq    $VEC_SIZE, %rax
        addq    %rdi, %rax
-       VZEROUPPER
-       ret
+       VZEROUPPER_RETURN
 
        .p2align 4
 L(first_vec_x2):
        tzcntl  %eax, %eax
        addq    $(VEC_SIZE * 2), %rax
        addq    %rdi, %rax
-       VZEROUPPER
-       ret
+       VZEROUPPER_RETURN
 
        .p2align 4
 L(4x_vec_end):
@@ -337,8 +335,7 @@ L(first_vec_x3):
        tzcntl  %eax, %eax
        addq    $(VEC_SIZE * 3), %rax
        addq    %rdi, %rax
-       VZEROUPPER
-       ret
+       VZEROUPPER_RETURN
 
 END (MEMCHR)
 #endif
diff --git a/sysdeps/x86_64/multiarch/memcmp-avx2-movbe-rtm.S b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe-rtm.S
new file mode 100644
index 0000000..cf4eff5
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe-rtm.S
@@ -0,0 +1,12 @@
+#ifndef MEMCMP
+# define MEMCMP __memcmp_avx2_movbe_rtm
+#endif
+
+#define ZERO_UPPER_VEC_REGISTERS_RETURN \
+  ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
+
+#define VZEROUPPER_RETURN jmp   L(return_vzeroupper)
+
+#define SECTION(p) p##.avx.rtm
+
+#include "memcmp-avx2-movbe.S"
diff --git a/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S
index cf9c9b8c1fcc84cae94f69f60b473c8e8ef22099..ad0fa962a1e28022d726fd3e2a68d213dba3a2c8 100644
--- a/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S
+++ b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S
 #  define VZEROUPPER   vzeroupper
 # endif
 
+# ifndef SECTION
+#  define SECTION(p)   p##.avx
+# endif
+
 # define VEC_SIZE 32
 # define VEC_MASK ((1 << VEC_SIZE) - 1)
 
@@ -55,7 +59,7 @@
            memcmp has to use UNSIGNED comparison for elements.
 */
 
-       .section .text.avx,"ax",@progbits
+       .section SECTION(.text),"ax",@progbits
 ENTRY (MEMCMP)
 # ifdef USE_AS_WMEMCMP
        shl     $2, %RDX_LP
@@ -123,8 +127,8 @@ ENTRY (MEMCMP)
        vptest  %ymm0, %ymm5
        jnc     L(4x_vec_end)
        xorl    %eax, %eax
-       VZEROUPPER
-       ret
+L(return_vzeroupper):
+       ZERO_UPPER_VEC_REGISTERS_RETURN
 
        .p2align 4
 L(last_2x_vec):
@@ -144,8 +148,7 @@ L(last_vec):
        vpmovmskb %ymm2, %eax
        subl    $VEC_MASK, %eax
        jnz     L(first_vec)
-       VZEROUPPER
-       ret
+       VZEROUPPER_RETURN
 
        .p2align 4
 L(first_vec):
@@ -164,8 +167,7 @@ L(wmemcmp_return):
        movzbl  (%rsi, %rcx), %edx
        sub     %edx, %eax
 # endif
-       VZEROUPPER
-       ret
+       VZEROUPPER_RETURN
 
 # ifdef USE_AS_WMEMCMP
        .p2align 4
@@ -367,8 +369,7 @@ L(last_4x_vec):
        vpmovmskb %ymm2, %eax
        subl    $VEC_MASK, %eax
        jnz     L(first_vec)
-       VZEROUPPER
-       ret
+       VZEROUPPER_RETURN
 
        .p2align 4
 L(4x_vec_end):
@@ -394,8 +395,7 @@ L(4x_vec_end):
        movzbl  (VEC_SIZE * 3)(%rsi, %rcx), %edx
        sub     %edx, %eax
 # endif
-       VZEROUPPER
-       ret
+       VZEROUPPER_RETURN
 
        .p2align 4
 L(first_vec_x1):
@@ -410,8 +410,7 @@ L(first_vec_x1):
        movzbl  VEC_SIZE(%rsi, %rcx), %edx
        sub     %edx, %eax
 # endif
-       VZEROUPPER
-       ret
+       VZEROUPPER_RETURN
 
        .p2align 4
 L(first_vec_x2):
@@ -426,7 +425,6 @@ L(first_vec_x2):
        movzbl  (VEC_SIZE * 2)(%rsi, %rcx), %edx
        sub     %edx, %eax
 # endif
-       VZEROUPPER
-       ret
+       VZEROUPPER_RETURN
 END (MEMCMP)
 #endif
diff --git a/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms-rtm.S b/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms-rtm.S
new file mode 100644
index 0000000..1ec1962
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms-rtm.S
@@ -0,0 +1,17 @@
+#if IS_IN (libc)
+# define VEC_SIZE      32
+# define VEC(i)                ymm##i
+# define VMOVNT                vmovntdq
+# define VMOVU         vmovdqu
+# define VMOVA         vmovdqa
+
+# define ZERO_UPPER_VEC_REGISTERS_RETURN \
+  ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
+
+# define VZEROUPPER_RETURN jmp  L(return)
+
+# define SECTION(p)            p##.avx.rtm
+# define MEMMOVE_SYMBOL(p,s)   p##_avx_##s##_rtm
+
+# include "memmove-vec-unaligned-erms.S"
+#endif
diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
index d713d7d6792897be397aaff8914e780373fbc224..897a3d976275b7bf3cd7b76d3ffaef2bd43ece59 100644
--- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
@@ -150,11 +150,12 @@ L(last_2x_vec):
        VMOVU   -VEC_SIZE(%rsi,%rdx), %VEC(1)
        VMOVU   %VEC(0), (%rdi)
        VMOVU   %VEC(1), -VEC_SIZE(%rdi,%rdx)
-       VZEROUPPER
 #if !defined USE_MULTIARCH || !IS_IN (libc)
 L(nop):
-#endif
        ret
+#else
+       VZEROUPPER_RETURN
+#endif
 #if defined USE_MULTIARCH && IS_IN (libc)
 END (MEMMOVE_SYMBOL (__memmove, unaligned))
 
@@ -247,8 +248,11 @@ L(last_2x_vec):
        VMOVU   %VEC(0), (%rdi)
        VMOVU   %VEC(1), -VEC_SIZE(%rdi,%rdx)
 L(return):
-       VZEROUPPER
+#if VEC_SIZE > 16
+       ZERO_UPPER_VEC_REGISTERS_RETURN
+#else
        ret
+#endif
 
 L(movsb):
        cmp     __x86_rep_movsb_stop_threshold(%rip), %RDX_LP
@@ -313,8 +317,7 @@ L(between_32_63):
        VMOVU   -32(%rsi,%rdx), %YMM1
        VMOVU   %YMM0, (%rdi)
        VMOVU   %YMM1, -32(%rdi,%rdx)
-       VZEROUPPER
-       ret
+       VZEROUPPER_RETURN
 #endif
 #if VEC_SIZE > 16
        /* From 16 to 31.  No branch when size == 16.  */
@@ -323,7 +326,7 @@ L(between_16_31):
        VMOVU   -16(%rsi,%rdx), %XMM1
        VMOVU   %XMM0, (%rdi)
        VMOVU   %XMM1, -16(%rdi,%rdx)
-       ret
+       VZEROUPPER_RETURN
 #endif
 L(between_8_15):
        /* From 8 to 15.  No branch when size == 8.  */
@@ -376,8 +379,7 @@ L(more_2x_vec):
        VMOVU   %VEC(5), -(VEC_SIZE * 2)(%rdi,%rdx)
        VMOVU   %VEC(6), -(VEC_SIZE * 3)(%rdi,%rdx)
        VMOVU   %VEC(7), -(VEC_SIZE * 4)(%rdi,%rdx)
-       VZEROUPPER
-       ret
+       VZEROUPPER_RETURN
 L(last_4x_vec):
        /* Copy from 2 * VEC to 4 * VEC. */
        VMOVU   (%rsi), %VEC(0)
@@ -388,8 +390,7 @@ L(last_4x_vec):
        VMOVU   %VEC(1), VEC_SIZE(%rdi)
        VMOVU   %VEC(2), -VEC_SIZE(%rdi,%rdx)
        VMOVU   %VEC(3), -(VEC_SIZE * 2)(%rdi,%rdx)
-       VZEROUPPER
-       ret
+       VZEROUPPER_RETURN
 
 L(more_8x_vec):
        cmpq    %rsi, %rdi
@@ -445,8 +446,7 @@ L(loop_4x_vec_forward):
        VMOVU   %VEC(8), -(VEC_SIZE * 3)(%rcx)
        /* Store the first VEC.  */
        VMOVU   %VEC(4), (%r11)
-       VZEROUPPER
-       ret
+       VZEROUPPER_RETURN
 
 L(more_8x_vec_backward):
        /* Load the first 4 * VEC and last VEC to support overlapping
@@ -497,8 +497,7 @@ L(loop_4x_vec_backward):
        VMOVU   %VEC(7), (VEC_SIZE * 3)(%rdi)
        /* Store the last VEC.  */
        VMOVU   %VEC(8), (%r11)
-       VZEROUPPER
-       ret
+       VZEROUPPER_RETURN
 
 #if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
 L(large_forward):
@@ -533,8 +532,7 @@ L(loop_large_forward):
        VMOVU   %VEC(8), -(VEC_SIZE * 3)(%rcx)
        /* Store the first VEC.  */
        VMOVU   %VEC(4), (%r11)
-       VZEROUPPER
-       ret
+       VZEROUPPER_RETURN
 
 L(large_backward):
        /* Don't use non-temporal store if there is overlap between
@@ -568,8 +566,7 @@ L(loop_large_backward):
        VMOVU   %VEC(7), (VEC_SIZE * 3)(%rdi)
        /* Store the last VEC.  */
        VMOVU   %VEC(8), (%r11)
-       VZEROUPPER
-       ret
+       VZEROUPPER_RETURN
 #endif
 END (MEMMOVE_SYMBOL (__memmove, unaligned_erms))
 
diff --git a/sysdeps/x86_64/multiarch/memrchr-avx2-rtm.S b/sysdeps/x86_64/multiarch/memrchr-avx2-rtm.S
new file mode 100644
index 0000000..cea2d2a
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memrchr-avx2-rtm.S
@@ -0,0 +1,12 @@
+#ifndef MEMRCHR
+# define MEMRCHR __memrchr_avx2_rtm
+#endif
+
+#define ZERO_UPPER_VEC_REGISTERS_RETURN \
+  ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
+
+#define VZEROUPPER_RETURN jmp   L(return_vzeroupper)
+
+#define SECTION(p) p##.avx.rtm
+
+#include "memrchr-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/memrchr-avx2.S b/sysdeps/x86_64/multiarch/memrchr-avx2.S
index eddede45be9fc7cc045f7eaf69f345261310691b..ac7370cb06e9a0fdd36b4f6d0cdd147429f3b351 100644
--- a/sysdeps/x86_64/multiarch/memrchr-avx2.S
+++ b/sysdeps/x86_64/multiarch/memrchr-avx2.S
 
 # include <sysdep.h>
 
+# ifndef MEMRCHR
+#  define MEMRCHR      __memrchr_avx2
+# endif
+
 # ifndef VZEROUPPER
 #  define VZEROUPPER   vzeroupper
 # endif
 
+# ifndef SECTION
+#  define SECTION(p)   p##.avx
+# endif
+
 # define VEC_SIZE 32
 
-       .section .text.avx,"ax",@progbits
-ENTRY (__memrchr_avx2)
+       .section SECTION(.text),"ax",@progbits
+ENTRY (MEMRCHR)
        /* Broadcast CHAR to YMM0.  */
        vmovd   %esi, %xmm0
        vpbroadcastb %xmm0, %ymm0
@@ -134,8 +142,8 @@ L(loop_4x_vec):
        vpmovmskb %ymm1, %eax
        bsrl    %eax, %eax
        addq    %rdi, %rax
-       VZEROUPPER
-       ret
+L(return_vzeroupper):
+       ZERO_UPPER_VEC_REGISTERS_RETURN
 
        .p2align 4
 L(last_4x_vec_or_less):
@@ -169,8 +177,7 @@ L(last_4x_vec_or_less):
        addq    %rax, %rdx
        jl      L(zero)
        addq    %rdi, %rax
-       VZEROUPPER
-       ret
+       VZEROUPPER_RETURN
 
        .p2align 4
 L(last_2x_vec):
@@ -191,31 +198,27 @@ L(last_2x_vec):
        jl      L(zero)
        addl    $(VEC_SIZE * 2), %eax
        addq    %rdi, %rax
-       VZEROUPPER
-       ret
+       VZEROUPPER_RETURN
 
        .p2align 4
 L(last_vec_x0):
        bsrl    %eax, %eax
        addq    %rdi, %rax
-       VZEROUPPER
-       ret
+       VZEROUPPER_RETURN
 
        .p2align 4
 L(last_vec_x1):
        bsrl    %eax, %eax
        addl    $VEC_SIZE, %eax
        addq    %rdi, %rax
-       VZEROUPPER
-       ret
+       VZEROUPPER_RETURN
 
        .p2align 4
 L(last_vec_x2):
        bsrl    %eax, %eax
        addl    $(VEC_SIZE * 2), %eax
        addq    %rdi, %rax
-       VZEROUPPER
-       ret
+       VZEROUPPER_RETURN
 
        .p2align 4
 L(last_vec_x3):
@@ -232,8 +235,7 @@ L(last_vec_x1_check):
        jl      L(zero)
        addl    $VEC_SIZE, %eax
        addq    %rdi, %rax
-       VZEROUPPER
-       ret
+       VZEROUPPER_RETURN
 
        .p2align 4
 L(last_vec_x3_check):
@@ -243,12 +245,14 @@ L(last_vec_x3_check):
        jl      L(zero)
        addl    $(VEC_SIZE * 3), %eax
        addq    %rdi, %rax
-       VZEROUPPER
-       ret
+       VZEROUPPER_RETURN
 
        .p2align 4
 L(zero):
-       VZEROUPPER
+       xorl    %eax, %eax
+       VZEROUPPER_RETURN
+
+       .p2align 4
 L(null):
        xorl    %eax, %eax
        ret
@@ -273,8 +277,7 @@ L(last_vec_or_less_aligned):
 
        bsrl    %eax, %eax
        addq    %rdi, %rax
-       VZEROUPPER
-       ret
+       VZEROUPPER_RETURN
 
        .p2align 4
 L(last_vec_or_less):
@@ -315,8 +318,7 @@ L(last_vec_or_less):
        bsrl    %eax, %eax
        addq    %rdi, %rax
        addq    %r8, %rax
-       VZEROUPPER
-       ret
+       VZEROUPPER_RETURN
 
        .p2align 4
 L(last_vec_2x_aligned):
@@ -353,7 +355,6 @@ L(last_vec_2x_aligned):
        bsrl    %eax, %eax
        addq    %rdi, %rax
        addq    %r8, %rax
-       VZEROUPPER
-       ret
-END (__memrchr_avx2)
+       VZEROUPPER_RETURN
+END (MEMRCHR)
 #endif
diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S
new file mode 100644
index 0000000..8ac3e47
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S
@@ -0,0 +1,10 @@
+#define ZERO_UPPER_VEC_REGISTERS_RETURN \
+  ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
+
+#define VZEROUPPER_RETURN jmp   L(return)
+
+#define SECTION(p) p##.avx.rtm
+#define MEMSET_SYMBOL(p,s)     p##_avx2_##s##_rtm
+#define WMEMSET_SYMBOL(p,s)    p##_avx2_##s##_rtm
+
+#include "memset-avx2-unaligned-erms.S"
diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
index 7ab3d89849bedeb51c39ae90dc182441a90f5ca9..ae0860f36a47d5941456e5138b11544ad9540dbf 100644
--- a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
   movq r, %rax; \
   vpbroadcastd %xmm0, %ymm0
 
-# define SECTION(p)            p##.avx
-# define MEMSET_SYMBOL(p,s)    p##_avx2_##s
-# define WMEMSET_SYMBOL(p,s)   p##_avx2_##s
+# ifndef SECTION
+#  define SECTION(p)           p##.avx
+# endif
+# ifndef MEMSET_SYMBOL
+#  define MEMSET_SYMBOL(p,s)   p##_avx2_##s
+# endif
+# ifndef WMEMSET_SYMBOL
+#  define WMEMSET_SYMBOL(p,s)  p##_avx2_##s
+# endif
 
 # include "memset-vec-unaligned-erms.S"
 #endif
diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
index 358ee4be12b4d606f85f6d94f688a8a2374a31dc..584747f1a1664005419671ed0e7caf390b77d92c 100644
--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
 #ifndef VZEROUPPER
 # if VEC_SIZE > 16
 #  define VZEROUPPER                   vzeroupper
+#  define VZEROUPPER_SHORT_RETURN      vzeroupper; ret
 # else
 #  define VZEROUPPER
 # endif
 #endif
 
 #ifndef VZEROUPPER_SHORT_RETURN
-# if VEC_SIZE > 16
-#  define VZEROUPPER_SHORT_RETURN      vzeroupper
-# else
-#  define VZEROUPPER_SHORT_RETURN      rep
-# endif
+# define VZEROUPPER_SHORT_RETURN       rep; ret
 #endif
 
 #ifndef MOVQ
@@ -117,8 +114,7 @@ L(entry_from_bzero):
        /* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.  */
        VMOVU   %VEC(0), -VEC_SIZE(%rdi,%rdx)
        VMOVU   %VEC(0), (%rdi)
-       VZEROUPPER
-       ret
+       VZEROUPPER_RETURN
 #if defined USE_MULTIARCH && IS_IN (libc)
 END (MEMSET_SYMBOL (__memset, unaligned))
 
@@ -141,14 +137,12 @@ ENTRY (__memset_erms)
 ENTRY (MEMSET_SYMBOL (__memset, erms))
 # endif
 L(stosb):
-       /* Issue vzeroupper before rep stosb.  */
-       VZEROUPPER
        mov     %RDX_LP, %RCX_LP
        movzbl  %sil, %eax
        mov     %RDI_LP, %RDX_LP
        rep stosb
        mov     %RDX_LP, %RAX_LP
-       ret
+       VZEROUPPER_RETURN
 # if VEC_SIZE == 16
 END (__memset_erms)
 # else
@@ -175,8 +169,7 @@ ENTRY (MEMSET_SYMBOL (__memset, unaligned_erms))
        /* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.  */
        VMOVU   %VEC(0), -VEC_SIZE(%rdi,%rdx)
        VMOVU   %VEC(0), (%rdi)
-       VZEROUPPER
-       ret
+       VZEROUPPER_RETURN
 
 L(stosb_more_2x_vec):
        cmp     __x86_rep_stosb_threshold(%rip), %RDX_LP
@@ -190,8 +183,11 @@ L(more_2x_vec):
        VMOVU   %VEC(0), -VEC_SIZE(%rdi,%rdx)
        VMOVU   %VEC(0), -(VEC_SIZE * 2)(%rdi,%rdx)
 L(return):
-       VZEROUPPER
+#if VEC_SIZE > 16
+       ZERO_UPPER_VEC_REGISTERS_RETURN
+#else
        ret
+#endif
 
 L(loop_start):
        leaq    (VEC_SIZE * 4)(%rdi), %rcx
@@ -217,7 +213,6 @@ L(loop):
        cmpq    %rcx, %rdx
        jne     L(loop)
        VZEROUPPER_SHORT_RETURN
-       ret
 L(less_vec):
        /* Less than 1 VEC.  */
 # if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
@@ -241,40 +236,34 @@ L(less_vec):
        jb      1f
        movb    %cl, (%rdi)
 1:
-       VZEROUPPER
-       ret
+       VZEROUPPER_RETURN
 # if VEC_SIZE > 32
        /* From 32 to 63.  No branch when size == 32.  */
 L(between_32_63):
        VMOVU   %YMM0, -32(%rdi,%rdx)
        VMOVU   %YMM0, (%rdi)
-       VZEROUPPER
-       ret
+       VZEROUPPER_RETURN
 # endif
 # if VEC_SIZE > 16
        /* From 16 to 31.  No branch when size == 16.  */
 L(between_16_31):
        VMOVU   %XMM0, -16(%rdi,%rdx)
        VMOVU   %XMM0, (%rdi)
-       VZEROUPPER
-       ret
+       VZEROUPPER_RETURN
 # endif
        /* From 8 to 15.  No branch when size == 8.  */
 L(between_8_15):
        movq    %rcx, -8(%rdi,%rdx)
        movq    %rcx, (%rdi)
-       VZEROUPPER
-       ret
+       VZEROUPPER_RETURN
 L(between_4_7):
        /* From 4 to 7.  No branch when size == 4.  */
        movl    %ecx, -4(%rdi,%rdx)
        movl    %ecx, (%rdi)
-       VZEROUPPER
-       ret
+       VZEROUPPER_RETURN
 L(between_2_3):
        /* From 2 to 3.  No branch when size == 2.  */
        movw    %cx, -2(%rdi,%rdx)
        movw    %cx, (%rdi)
-       VZEROUPPER
-       ret
+       VZEROUPPER_RETURN
 END (MEMSET_SYMBOL (__memset, unaligned_erms))
diff --git a/sysdeps/x86_64/multiarch/rawmemchr-avx2-rtm.S b/sysdeps/x86_64/multiarch/rawmemchr-avx2-rtm.S
new file mode 100644
index 0000000..acc5f6e
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/rawmemchr-avx2-rtm.S
@@ -0,0 +1,4 @@
+#define MEMCHR __rawmemchr_avx2_rtm
+#define USE_AS_RAWMEMCHR 1
+
+#include "memchr-avx2-rtm.S"
diff --git a/sysdeps/x86_64/multiarch/stpcpy-avx2-rtm.S b/sysdeps/x86_64/multiarch/stpcpy-avx2-rtm.S
new file mode 100644
index 0000000..2b9c07a
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/stpcpy-avx2-rtm.S
@@ -0,0 +1,3 @@
+#define USE_AS_STPCPY
+#define STRCPY __stpcpy_avx2_rtm
+#include "strcpy-avx2-rtm.S"
diff --git a/sysdeps/x86_64/multiarch/stpncpy-avx2-rtm.S b/sysdeps/x86_64/multiarch/stpncpy-avx2-rtm.S
new file mode 100644
index 0000000..60a2ccf
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/stpncpy-avx2-rtm.S
@@ -0,0 +1,4 @@
+#define USE_AS_STPCPY
+#define USE_AS_STRNCPY
+#define STRCPY __stpncpy_avx2_rtm
+#include "strcpy-avx2-rtm.S"
diff --git a/sysdeps/x86_64/multiarch/strcat-avx2-rtm.S b/sysdeps/x86_64/multiarch/strcat-avx2-rtm.S
new file mode 100644
index 0000000..637fb55
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strcat-avx2-rtm.S
@@ -0,0 +1,12 @@
+#ifndef STRCAT
+# define STRCAT __strcat_avx2_rtm
+#endif
+
+#define ZERO_UPPER_VEC_REGISTERS_RETURN \
+  ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
+
+#define VZEROUPPER_RETURN jmp   L(return_vzeroupper)
+
+#define SECTION(p) p##.avx.rtm
+
+#include "strcat-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/strcat-avx2.S b/sysdeps/x86_64/multiarch/strcat-avx2.S
index 41de8b2b6421f4f7d6c4d3eb78728f906650777f..4356fa733009963b0db814bee4be42ba38c82a68 100644
--- a/sysdeps/x86_64/multiarch/strcat-avx2.S
+++ b/sysdeps/x86_64/multiarch/strcat-avx2.S
 /* Number of bytes in a vector register */
 # define VEC_SIZE      32
 
-       .section .text.avx,"ax",@progbits
+# ifndef SECTION
+#  define SECTION(p)   p##.avx
+# endif
+
+       .section SECTION(.text),"ax",@progbits
 ENTRY (STRCAT)
        mov     %rdi, %r9
 # ifdef USE_AS_STRNCAT
diff --git a/sysdeps/x86_64/multiarch/strchr-avx2-rtm.S b/sysdeps/x86_64/multiarch/strchr-avx2-rtm.S
new file mode 100644
index 0000000..81f20d1
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strchr-avx2-rtm.S
@@ -0,0 +1,12 @@
+#ifndef STRCHR
+# define STRCHR __strchr_avx2_rtm
+#endif
+
+#define ZERO_UPPER_VEC_REGISTERS_RETURN \
+  ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
+
+#define VZEROUPPER_RETURN jmp   L(return_vzeroupper)
+
+#define SECTION(p) p##.avx.rtm
+
+#include "strchr-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/strchr-avx2.S b/sysdeps/x86_64/multiarch/strchr-avx2.S
index 476c20c2899ded40c1f3a9f96489bb0fa9415d0b..25bec38b5dd9e1924de8bfbb907851d282c34428 100644
--- a/sysdeps/x86_64/multiarch/strchr-avx2.S
+++ b/sysdeps/x86_64/multiarch/strchr-avx2.S
 #  define VZEROUPPER   vzeroupper
 # endif
 
+# ifndef SECTION
+#  define SECTION(p)   p##.avx
+# endif
+
 # define VEC_SIZE 32
 # define PAGE_SIZE 4096
 
-       .section .text.avx,"ax",@progbits
+       .section SECTION(.text),"ax",@progbits
 ENTRY (STRCHR)
        movl    %edi, %ecx
 # ifndef USE_AS_STRCHRNUL
@@ -76,8 +80,8 @@ ENTRY (STRCHR)
        cmp (%rax), %CHAR_REG
        cmovne  %rdx, %rax
 # endif
-       VZEROUPPER
-       ret
+L(return_vzeroupper):
+       ZERO_UPPER_VEC_REGISTERS_RETURN
 
        .p2align 4
 L(more_vecs):
@@ -126,8 +130,7 @@ L(aligned_more):
        cmp (%rax), %CHAR_REG
        cmovne  %rdx, %rax
 # endif
-       VZEROUPPER
-       ret
+       VZEROUPPER_RETURN
 
        .p2align 4
 L(first_vec_x0):
@@ -138,8 +141,7 @@ L(first_vec_x0):
        cmp (%rax), %CHAR_REG
        cmovne  %rdx, %rax
 # endif
-       VZEROUPPER
-       ret
+       VZEROUPPER_RETURN
 
        .p2align 4
 L(first_vec_x1):
@@ -149,8 +151,7 @@ L(first_vec_x1):
        cmp (%rax), %CHAR_REG
        cmovne  %rdx, %rax
 # endif
-       VZEROUPPER
-       ret
+       VZEROUPPER_RETURN
 
        .p2align 4
 L(first_vec_x2):
@@ -161,8 +162,7 @@ L(first_vec_x2):
        cmp (%rax), %CHAR_REG
        cmovne  %rdx, %rax
 # endif
-       VZEROUPPER
-       ret
+       VZEROUPPER_RETURN
 
 L(prep_loop_4x):
        /* Align data to 4 * VEC_SIZE.  */
@@ -221,8 +221,7 @@ L(loop_4x_vec):
        cmp (%rax), %CHAR_REG
        cmovne  %rdx, %rax
 # endif
-       VZEROUPPER
-       ret
+       VZEROUPPER_RETURN
 
        /* Cold case for crossing page with first load.  */
        .p2align 4
@@ -246,8 +245,7 @@ L(cross_page_boundary):
        cmp (%rax), %CHAR_REG
        cmovne  %rdx, %rax
 # endif
-       VZEROUPPER
-       ret
+       VZEROUPPER_RETURN
 
 END (STRCHR)
 # endif
diff --git a/sysdeps/x86_64/multiarch/strchr.c b/sysdeps/x86_64/multiarch/strchr.c
index 2c0a3e78fad12b190528fbf75ea2e1a93d6abdd2..691770f335b70b5e5528dc0df4fe3b8559fca64d 100644
--- a/sysdeps/x86_64/multiarch/strchr.c
+++ b/sysdeps/x86_64/multiarch/strchr.c
@@ -29,6 +29,7 @@
 extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_no_bsf) attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
 
 static inline void *
@@ -44,6 +45,9 @@ IFUNC_SELECTOR (void)
          && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
        return OPTIMIZE (evex);
 
+      if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
+       return OPTIMIZE (avx2_rtm);
+
       if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
        return OPTIMIZE (avx2);
     }
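
The RTM branch slots in after the EVEX check and before the plain AVX2 check, giving this priority order (a condensed sketch; the leading AVX2 test and the SSE2 fallbacks sit outside the hunk):

    /* 1. evex      256-bit EVEX usable (AVX512BW per the hunk, plus the
                    condition above it) - EVEX code needs no VZEROUPPER
       2. avx2_rtm  RTM usable - returns through the xtest-guarded epilogue
       3. avx2      unless Prefer_No_VZEROUPPER
       4. sse2 / sse2_no_bsf otherwise  */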
diff --git a/sysdeps/x86_64/multiarch/strchrnul-avx2-rtm.S b/sysdeps/x86_64/multiarch/strchrnul-avx2-rtm.S
new file mode 100644 (file)
index 0000000..cdcf818
--- /dev/null
@@ -0,0 +1,3 @@
+#define STRCHR __strchrnul_avx2_rtm
+#define USE_AS_STRCHRNUL 1
+#include "strchr-avx2-rtm.S"
diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2-rtm.S b/sysdeps/x86_64/multiarch/strcmp-avx2-rtm.S
new file mode 100644 (file)
index 0000000..aecd30d
--- /dev/null
@@ -0,0 +1,12 @@
+#ifndef STRCMP
+# define STRCMP __strcmp_avx2_rtm
+#endif
+
+#define ZERO_UPPER_VEC_REGISTERS_RETURN \
+  ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
+
+#define VZEROUPPER_RETURN jmp   L(return_vzeroupper)
+
+#define SECTION(p) p##.avx.rtm
+
+#include "strcmp-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2.S b/sysdeps/x86_64/multiarch/strcmp-avx2.S
index 53cb7a66960d2d2f9e9f9027cbd3b109221122e6..40333010a65650f94eb19736cd1df6fa95c7f398 100644 (file)
 #  define VZEROUPPER   vzeroupper
 # endif
 
+# ifndef SECTION
+#  define SECTION(p)   p##.avx
+# endif
+
 /* Warning!
            wcscmp/wcsncmp have to use SIGNED comparison for elements.
            strcmp/strncmp have to use UNSIGNED comparison for elements.
@@ -75,7 +79,7 @@
    the maximum offset is reached before a difference is found, zero is
    returned.  */
 
-       .section .text.avx,"ax",@progbits
+       .section SECTION(.text),"ax",@progbits
 ENTRY (STRCMP)
 # ifdef USE_AS_STRNCMP
        /* Check for simple cases (0 or 1) in offset.  */
@@ -127,8 +131,8 @@ L(return):
        movzbl  (%rsi, %rdx), %edx
        subl    %edx, %eax
 # endif
-       VZEROUPPER
-       ret
+L(return_vzeroupper):
+       ZERO_UPPER_VEC_REGISTERS_RETURN
 
        .p2align 4
 L(return_vec_size):
@@ -161,8 +165,7 @@ L(return_vec_size):
        subl    %edx, %eax
 #  endif
 # endif
-       VZEROUPPER
-       ret
+       VZEROUPPER_RETURN
 
        .p2align 4
 L(return_2_vec_size):
@@ -195,8 +198,7 @@ L(return_2_vec_size):
        subl    %edx, %eax
 #  endif
 # endif
-       VZEROUPPER
-       ret
+       VZEROUPPER_RETURN
 
        .p2align 4
 L(return_3_vec_size):
@@ -229,8 +231,7 @@ L(return_3_vec_size):
        subl    %edx, %eax
 #  endif
 # endif
-       VZEROUPPER
-       ret
+       VZEROUPPER_RETURN
 
        .p2align 4
 L(next_3_vectors):
@@ -356,8 +357,7 @@ L(back_to_loop):
        subl    %edx, %eax
 #  endif
 # endif
-       VZEROUPPER
-       ret
+       VZEROUPPER_RETURN
 
        .p2align 4
 L(test_vec):
@@ -400,8 +400,7 @@ L(test_vec):
        subl    %edx, %eax
 #  endif
 # endif
-       VZEROUPPER
-       ret
+       VZEROUPPER_RETURN
 
        .p2align 4
 L(test_2_vec):
@@ -444,8 +443,7 @@ L(test_2_vec):
        subl    %edx, %eax
 #  endif
 # endif
-       VZEROUPPER
-       ret
+       VZEROUPPER_RETURN
 
        .p2align 4
 L(test_3_vec):
@@ -486,8 +484,7 @@ L(test_3_vec):
        subl    %edx, %eax
 #  endif
 # endif
-       VZEROUPPER
-       ret
+       VZEROUPPER_RETURN
 
        .p2align 4
 L(loop_cross_page):
@@ -556,8 +553,7 @@ L(loop_cross_page):
        subl    %edx, %eax
 #  endif
 # endif
-       VZEROUPPER
-       ret
+       VZEROUPPER_RETURN
 
        .p2align 4
 L(loop_cross_page_2_vec):
@@ -631,8 +627,7 @@ L(loop_cross_page_2_vec):
        subl    %edx, %eax
 #  endif
 # endif
-       VZEROUPPER
-       ret
+       VZEROUPPER_RETURN
 
 # ifdef USE_AS_STRNCMP
 L(string_nbyte_offset_check):
@@ -674,8 +669,7 @@ L(cross_page_loop):
 # ifndef USE_AS_WCSCMP
 L(different):
 # endif
-       VZEROUPPER
-       ret
+       VZEROUPPER_RETURN
 
 # ifdef USE_AS_WCSCMP
        .p2align 4
@@ -685,16 +679,14 @@ L(different):
        setl    %al
        negl    %eax
        orl     $1, %eax
-       VZEROUPPER
-       ret
+       VZEROUPPER_RETURN
 # endif
 
 # ifdef USE_AS_STRNCMP
        .p2align 4
 L(zero):
        xorl    %eax, %eax
-       VZEROUPPER
-       ret
+       VZEROUPPER_RETURN
 
        .p2align 4
 L(char0):
@@ -708,8 +700,7 @@ L(char0):
        movzbl  (%rdi), %eax
        subl    %ecx, %eax
 #  endif
-       VZEROUPPER
-       ret
+       VZEROUPPER_RETURN
 # endif
 
        .p2align 4
@@ -734,8 +725,7 @@ L(last_vector):
        movzbl  (%rsi, %rdx), %edx
        subl    %edx, %eax
 # endif
-       VZEROUPPER
-       ret
+       VZEROUPPER_RETURN
 
        /* Comparing on page boundary region requires special treatment:
           It must be done one vector at a time, starting with the wider
@@ -856,7 +846,6 @@ L(cross_page_4bytes):
        testl   %eax, %eax
        jne     L(cross_page_loop)
        subl    %ecx, %eax
-       VZEROUPPER
-       ret
+       VZEROUPPER_RETURN
 END (STRCMP)
 #endif
diff --git a/sysdeps/x86_64/multiarch/strcmp.c b/sysdeps/x86_64/multiarch/strcmp.c
index 1df75690d0e186f57a58b0598836f314a833131b..62b7abeeee646ab472b71b98020d9e1909dd2044 100644 (file)
@@ -30,6 +30,7 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned) attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
 
 static inline void *
@@ -46,6 +47,9 @@ IFUNC_SELECTOR (void)
          && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_AVX2_STRCMP))
        return OPTIMIZE (evex);
 
+      if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
+       return OPTIMIZE (avx2_rtm);
+
       if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
        return OPTIMIZE (avx2);
     }
diff --git a/sysdeps/x86_64/multiarch/strcpy-avx2-rtm.S b/sysdeps/x86_64/multiarch/strcpy-avx2-rtm.S
new file mode 100644 (file)
index 0000000..c2c581e
--- /dev/null
@@ -0,0 +1,12 @@
+#ifndef STRCPY
+# define STRCPY __strcpy_avx2_rtm
+#endif
+
+#define ZERO_UPPER_VEC_REGISTERS_RETURN \
+  ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
+
+#define VZEROUPPER_RETURN jmp   L(return_vzeroupper)
+
+#define SECTION(p) p##.avx.rtm
+
+#include "strcpy-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/strcpy-avx2.S b/sysdeps/x86_64/multiarch/strcpy-avx2.S
index b7629eaf15e989eb730fbd1e5ed6da31c28023df..5b6506d58f8dfa95eabfb1156b4218b44ec84f71 100644 (file)
 #  define VZEROUPPER   vzeroupper
 # endif
 
+# ifndef SECTION
+#  define SECTION(p)   p##.avx
+# endif
+
 /* zero register */
 #define xmmZ   xmm0
 #define ymmZ   ymm0
@@ -46,7 +50,7 @@
 
 # ifndef USE_AS_STRCAT
 
-       .section .text.avx,"ax",@progbits
+       .section SECTION(.text),"ax",@progbits
 ENTRY (STRCPY)
 #  ifdef USE_AS_STRNCPY
        mov     %RDX_LP, %R8_LP
@@ -369,8 +373,8 @@ L(CopyVecSizeExit):
        lea     1(%rdi), %rdi
        jnz     L(StrncpyFillTailWithZero)
 # endif
-       VZEROUPPER
-       ret
+L(return_vzeroupper):
+       ZERO_UPPER_VEC_REGISTERS_RETURN
 
        .p2align 4
 L(CopyTwoVecSize1):
@@ -553,8 +557,7 @@ L(Exit1):
        lea     2(%rdi), %rdi
        jnz     L(StrncpyFillTailWithZero)
 # endif
-       VZEROUPPER
-       ret
+       VZEROUPPER_RETURN
 
        .p2align 4
 L(Exit2):
@@ -569,8 +572,7 @@ L(Exit2):
        lea     3(%rdi), %rdi
        jnz     L(StrncpyFillTailWithZero)
 # endif
-       VZEROUPPER
-       ret
+       VZEROUPPER_RETURN
 
        .p2align 4
 L(Exit3):
@@ -584,8 +586,7 @@ L(Exit3):
        lea     4(%rdi), %rdi
        jnz     L(StrncpyFillTailWithZero)
 # endif
-       VZEROUPPER
-       ret
+       VZEROUPPER_RETURN
 
        .p2align 4
 L(Exit4_7):
@@ -602,8 +603,7 @@ L(Exit4_7):
        lea     1(%rdi, %rdx), %rdi
        jnz     L(StrncpyFillTailWithZero)
 # endif
-       VZEROUPPER
-       ret
+       VZEROUPPER_RETURN
 
        .p2align 4
 L(Exit8_15):
@@ -620,8 +620,7 @@ L(Exit8_15):
        lea     1(%rdi, %rdx), %rdi
        jnz     L(StrncpyFillTailWithZero)
 # endif
-       VZEROUPPER
-       ret
+       VZEROUPPER_RETURN
 
        .p2align 4
 L(Exit16_31):
@@ -638,8 +637,7 @@ L(Exit16_31):
        lea 1(%rdi, %rdx), %rdi
        jnz L(StrncpyFillTailWithZero)
 # endif
-       VZEROUPPER
-       ret
+       VZEROUPPER_RETURN
 
        .p2align 4
 L(Exit32_63):
@@ -656,8 +654,7 @@ L(Exit32_63):
        lea     1(%rdi, %rdx), %rdi
        jnz     L(StrncpyFillTailWithZero)
 # endif
-       VZEROUPPER
-       ret
+       VZEROUPPER_RETURN
 
 # ifdef USE_AS_STRNCPY
 
@@ -671,8 +668,7 @@ L(StrncpyExit1):
 #  ifdef USE_AS_STRCAT
        movb    $0, 1(%rdi)
 #  endif
-       VZEROUPPER
-       ret
+       VZEROUPPER_RETURN
 
        .p2align 4
 L(StrncpyExit2):
@@ -684,8 +680,7 @@ L(StrncpyExit2):
 #  ifdef USE_AS_STRCAT
        movb    $0, 2(%rdi)
 #  endif
-       VZEROUPPER
-       ret
+       VZEROUPPER_RETURN
 
        .p2align 4
 L(StrncpyExit3_4):
@@ -699,8 +694,7 @@ L(StrncpyExit3_4):
 #  ifdef USE_AS_STRCAT
        movb    $0, (%rdi, %r8)
 #  endif
-       VZEROUPPER
-       ret
+       VZEROUPPER_RETURN
 
        .p2align 4
 L(StrncpyExit5_8):
@@ -714,8 +708,7 @@ L(StrncpyExit5_8):
 #  ifdef USE_AS_STRCAT
        movb    $0, (%rdi, %r8)
 #  endif
-       VZEROUPPER
-       ret
+       VZEROUPPER_RETURN
 
        .p2align 4
 L(StrncpyExit9_16):
@@ -729,8 +722,7 @@ L(StrncpyExit9_16):
 #  ifdef USE_AS_STRCAT
        movb    $0, (%rdi, %r8)
 #  endif
-       VZEROUPPER
-       ret
+       VZEROUPPER_RETURN
 
        .p2align 4
 L(StrncpyExit17_32):
@@ -744,8 +736,7 @@ L(StrncpyExit17_32):
 #  ifdef USE_AS_STRCAT
        movb    $0, (%rdi, %r8)
 #  endif
-       VZEROUPPER
-       ret
+       VZEROUPPER_RETURN
 
        .p2align 4
 L(StrncpyExit33_64):
@@ -760,8 +751,7 @@ L(StrncpyExit33_64):
 #  ifdef USE_AS_STRCAT
        movb    $0, (%rdi, %r8)
 #  endif
-       VZEROUPPER
-       ret
+       VZEROUPPER_RETURN
 
        .p2align 4
 L(StrncpyExit65):
@@ -778,50 +768,43 @@ L(StrncpyExit65):
 #  ifdef USE_AS_STRCAT
        movb    $0, 65(%rdi)
 #  endif
-       VZEROUPPER
-       ret
+       VZEROUPPER_RETURN
 
 #  ifndef USE_AS_STRCAT
 
        .p2align 4
 L(Fill1):
        mov     %dl, (%rdi)
-       VZEROUPPER
-       ret
+       VZEROUPPER_RETURN
 
        .p2align 4
 L(Fill2):
        mov     %dx, (%rdi)
-       VZEROUPPER
-       ret
+       VZEROUPPER_RETURN
 
        .p2align 4
 L(Fill3_4):
        mov     %dx, (%rdi)
        mov     %dx, -2(%rdi, %r8)
-       VZEROUPPER
-       ret
+       VZEROUPPER_RETURN
 
        .p2align 4
 L(Fill5_8):
        mov     %edx, (%rdi)
        mov     %edx, -4(%rdi, %r8)
-       VZEROUPPER
-       ret
+       VZEROUPPER_RETURN
 
        .p2align 4
 L(Fill9_16):
        mov     %rdx, (%rdi)
        mov     %rdx, -8(%rdi, %r8)
-       VZEROUPPER
-       ret
+       VZEROUPPER_RETURN
 
        .p2align 4
 L(Fill17_32):
        vmovdqu %xmmZ, (%rdi)
        vmovdqu %xmmZ, -16(%rdi, %r8)
-       VZEROUPPER
-       ret
+       VZEROUPPER_RETURN
 
        .p2align 4
 L(CopyVecSizeUnalignedVec2):
@@ -898,8 +881,7 @@ L(Fill):
        cmp     $1, %r8d
        ja      L(Fill2)
        je      L(Fill1)
-       VZEROUPPER
-       ret
+       VZEROUPPER_RETURN
 
 /* end of ifndef USE_AS_STRCAT */
 #  endif
@@ -929,8 +911,7 @@ L(UnalignedFourVecSizeLeaveCase3):
 #  ifdef USE_AS_STRCAT
        movb    $0, (VEC_SIZE * 4)(%rdi)
 #  endif
-       VZEROUPPER
-       ret
+       VZEROUPPER_RETURN
 
        .p2align 4
 L(UnalignedFourVecSizeLeaveCase2):
@@ -1001,16 +982,14 @@ L(StrncpyExit):
 #  ifdef USE_AS_STRCAT
        movb    $0, (%rdi)
 #  endif
-       VZEROUPPER
-       ret
+       VZEROUPPER_RETURN
 
        .p2align 4
 L(ExitZero):
 #  ifndef USE_AS_STRCAT
        mov     %rdi, %rax
 #  endif
-       VZEROUPPER
-       ret
+       VZEROUPPER_RETURN
 
 # endif
 
diff --git a/sysdeps/x86_64/multiarch/strlen-avx2-rtm.S b/sysdeps/x86_64/multiarch/strlen-avx2-rtm.S
new file mode 100644 (file)
index 0000000..75b4b76
--- /dev/null
@@ -0,0 +1,12 @@
+#ifndef STRLEN
+# define STRLEN __strlen_avx2_rtm
+#endif
+
+#define ZERO_UPPER_VEC_REGISTERS_RETURN \
+  ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
+
+#define VZEROUPPER_RETURN jmp   L(return_vzeroupper)
+
+#define SECTION(p) p##.avx.rtm
+
+#include "strlen-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/strlen-avx2.S b/sysdeps/x86_64/multiarch/strlen-avx2.S
index caa615970c304d76216708363d7c3174905476fd..1caae9e6bc0eec0ea2263e24cac9fd3b4f0d10b3 100644 (file)
 #  define VZEROUPPER   vzeroupper
 # endif
 
+# ifndef SECTION
+#  define SECTION(p)   p##.avx
+# endif
+
 # define VEC_SIZE 32
 
-       .section .text.avx,"ax",@progbits
+       .section SECTION(.text),"ax",@progbits
 ENTRY (STRLEN)
 # ifdef USE_AS_STRNLEN
        /* Check for zero length.  */
@@ -111,8 +115,8 @@ L(cros_page_boundary):
 # ifdef USE_AS_WCSLEN
        shrq    $2, %rax
 # endif
-       VZEROUPPER
-       ret
+L(return_vzeroupper):
+       ZERO_UPPER_VEC_REGISTERS_RETURN
 
        .p2align 4
 L(aligned_more):
@@ -231,8 +235,7 @@ L(last_4x_vec_or_less):
 #  ifdef USE_AS_WCSLEN
        shrq    $2, %rax
 #  endif
-       VZEROUPPER
-       ret
+       VZEROUPPER_RETURN
 
        .p2align 4
 L(last_2x_vec):
@@ -253,8 +256,7 @@ L(last_2x_vec):
 #  ifdef USE_AS_WCSLEN
        shrq    $2, %rax
 #  endif
-       VZEROUPPER
-       ret
+       VZEROUPPER_RETURN
 
        .p2align 4
 L(first_vec_x0_check):
@@ -267,8 +269,7 @@ L(first_vec_x0_check):
 #  ifdef USE_AS_WCSLEN
        shrq    $2, %rax
 #  endif
-       VZEROUPPER
-       ret
+       VZEROUPPER_RETURN
 
        .p2align 4
 L(first_vec_x1_check):
@@ -282,8 +283,7 @@ L(first_vec_x1_check):
 #  ifdef USE_AS_WCSLEN
        shrq    $2, %rax
 #  endif
-       VZEROUPPER
-       ret
+       VZEROUPPER_RETURN
 
        .p2align 4
 L(first_vec_x2_check):
@@ -297,8 +297,7 @@ L(first_vec_x2_check):
 #  ifdef USE_AS_WCSLEN
        shrq    $2, %rax
 #  endif
-       VZEROUPPER
-       ret
+       VZEROUPPER_RETURN
 
        .p2align 4
 L(first_vec_x3_check):
@@ -312,8 +311,7 @@ L(first_vec_x3_check):
 #  ifdef USE_AS_WCSLEN
        shrq    $2, %rax
 #  endif
-       VZEROUPPER
-       ret
+       VZEROUPPER_RETURN
 
        .p2align 4
 L(max):
@@ -321,8 +319,7 @@ L(max):
 #  ifdef USE_AS_WCSLEN
        shrq    $2, %rax
 #  endif
-       VZEROUPPER
-       ret
+       VZEROUPPER_RETURN
 
        .p2align 4
 L(zero):
@@ -338,8 +335,7 @@ L(first_vec_x0):
 # ifdef USE_AS_WCSLEN
        shrq    $2, %rax
 # endif
-       VZEROUPPER
-       ret
+       VZEROUPPER_RETURN
 
        .p2align 4
 L(first_vec_x1):
@@ -350,8 +346,7 @@ L(first_vec_x1):
 # ifdef USE_AS_WCSLEN
        shrq    $2, %rax
 # endif
-       VZEROUPPER
-       ret
+       VZEROUPPER_RETURN
 
        .p2align 4
 L(first_vec_x2):
@@ -362,8 +357,7 @@ L(first_vec_x2):
 # ifdef USE_AS_WCSLEN
        shrq    $2, %rax
 # endif
-       VZEROUPPER
-       ret
+       VZEROUPPER_RETURN
 
        .p2align 4
 L(4x_vec_end):
@@ -389,8 +383,7 @@ L(first_vec_x3):
 # ifdef USE_AS_WCSLEN
        shrq    $2, %rax
 # endif
-       VZEROUPPER
-       ret
+       VZEROUPPER_RETURN
 
 END (STRLEN)
 #endif
diff --git a/sysdeps/x86_64/multiarch/strncat-avx2-rtm.S b/sysdeps/x86_64/multiarch/strncat-avx2-rtm.S
new file mode 100644 (file)
index 0000000..0dcea18
--- /dev/null
@@ -0,0 +1,3 @@
+#define USE_AS_STRNCAT
+#define STRCAT __strncat_avx2_rtm
+#include "strcat-avx2-rtm.S"
diff --git a/sysdeps/x86_64/multiarch/strncmp-avx2-rtm.S b/sysdeps/x86_64/multiarch/strncmp-avx2-rtm.S
new file mode 100644 (file)
index 0000000..37d1224
--- /dev/null
@@ -0,0 +1,3 @@
+#define STRCMP __strncmp_avx2_rtm
+#define USE_AS_STRNCMP 1
+#include "strcmp-avx2-rtm.S"
diff --git a/sysdeps/x86_64/multiarch/strncmp.c b/sysdeps/x86_64/multiarch/strncmp.c
index 617c4e10658f35ce07a2c4ab422b6763a41a4e7b..60ba0fe356b31779d5299b67707877bdec92d20c 100644 (file)
@@ -30,6 +30,7 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (sse42) attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
 
 static inline void *
@@ -46,6 +47,9 @@ IFUNC_SELECTOR (void)
          && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_AVX2_STRCMP))
        return OPTIMIZE (evex);
 
+      if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
+       return OPTIMIZE (avx2_rtm);
+
       if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
        return OPTIMIZE (avx2);
     }
diff --git a/sysdeps/x86_64/multiarch/strncpy-avx2-rtm.S b/sysdeps/x86_64/multiarch/strncpy-avx2-rtm.S
new file mode 100644 (file)
index 0000000..79e7083
--- /dev/null
@@ -0,0 +1,3 @@
+#define USE_AS_STRNCPY
+#define STRCPY __strncpy_avx2_rtm
+#include "strcpy-avx2-rtm.S"
diff --git a/sysdeps/x86_64/multiarch/strnlen-avx2-rtm.S b/sysdeps/x86_64/multiarch/strnlen-avx2-rtm.S
new file mode 100644 (file)
index 0000000..04f1626
--- /dev/null
@@ -0,0 +1,4 @@
+#define STRLEN __strnlen_avx2_rtm
+#define USE_AS_STRNLEN 1
+
+#include "strlen-avx2-rtm.S"
diff --git a/sysdeps/x86_64/multiarch/strrchr-avx2-rtm.S b/sysdeps/x86_64/multiarch/strrchr-avx2-rtm.S
new file mode 100644 (file)
index 0000000..5def14e
--- /dev/null
@@ -0,0 +1,12 @@
+#ifndef STRRCHR
+# define STRRCHR __strrchr_avx2_rtm
+#endif
+
+#define ZERO_UPPER_VEC_REGISTERS_RETURN \
+  ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
+
+#define VZEROUPPER_RETURN jmp   L(return_vzeroupper)
+
+#define SECTION(p) p##.avx.rtm
+
+#include "strrchr-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/strrchr-avx2.S b/sysdeps/x86_64/multiarch/strrchr-avx2.S
index 53ea445305b889cd66e7121be0d6cf9ef032ec75..0deba97114d3b83d8b1d6be267ed3cc9b21f384d 100644 (file)
 #  define VZEROUPPER   vzeroupper
 # endif
 
+# ifndef SECTION
+#  define SECTION(p)   p##.avx
+# endif
+
 # define VEC_SIZE      32
 
-       .section .text.avx,"ax",@progbits
+       .section SECTION(.text),"ax",@progbits
 ENTRY (STRRCHR)
        movd    %esi, %xmm4
        movl    %edi, %ecx
@@ -166,8 +170,8 @@ L(return_value):
 # endif
        bsrl    %eax, %eax
        leaq    -VEC_SIZE(%rdi, %rax), %rax
-       VZEROUPPER
-       ret
+L(return_vzeroupper):
+       ZERO_UPPER_VEC_REGISTERS_RETURN
 
        .p2align 4
 L(match):
@@ -198,8 +202,7 @@ L(find_nul):
        jz      L(return_value)
        bsrl    %eax, %eax
        leaq    -VEC_SIZE(%rdi, %rax), %rax
-       VZEROUPPER
-       ret
+       VZEROUPPER_RETURN
 
        .p2align 4
 L(char_and_nul):
@@ -222,14 +225,12 @@ L(char_and_nul_in_first_vec):
        jz      L(return_null)
        bsrl    %eax, %eax
        leaq    -VEC_SIZE(%rdi, %rax), %rax
-       VZEROUPPER
-       ret
+       VZEROUPPER_RETURN
 
        .p2align 4
 L(return_null):
        xorl    %eax, %eax
-       VZEROUPPER
-       ret
+       VZEROUPPER_RETURN
 
 END (STRRCHR)
 #endif
diff --git a/sysdeps/x86_64/multiarch/wcschr-avx2-rtm.S b/sysdeps/x86_64/multiarch/wcschr-avx2-rtm.S
new file mode 100644 (file)
index 0000000..d49dbbf
--- /dev/null
@@ -0,0 +1,3 @@
+#define STRCHR __wcschr_avx2_rtm
+#define USE_AS_WCSCHR 1
+#include "strchr-avx2-rtm.S"
diff --git a/sysdeps/x86_64/multiarch/wcscmp-avx2-rtm.S b/sysdeps/x86_64/multiarch/wcscmp-avx2-rtm.S
new file mode 100644 (file)
index 0000000..d6ca2b8
--- /dev/null
@@ -0,0 +1,4 @@
+#define STRCMP __wcscmp_avx2_rtm
+#define USE_AS_WCSCMP 1
+
+#include "strcmp-avx2-rtm.S"
diff --git a/sysdeps/x86_64/multiarch/wcslen-avx2-rtm.S b/sysdeps/x86_64/multiarch/wcslen-avx2-rtm.S
new file mode 100644 (file)
index 0000000..35658d7
--- /dev/null
@@ -0,0 +1,4 @@
+#define STRLEN __wcslen_avx2_rtm
+#define USE_AS_WCSLEN 1
+
+#include "strlen-avx2-rtm.S"
diff --git a/sysdeps/x86_64/multiarch/wcsncmp-avx2-rtm.S b/sysdeps/x86_64/multiarch/wcsncmp-avx2-rtm.S
new file mode 100644 (file)
index 0000000..4e88c70
--- /dev/null
@@ -0,0 +1,5 @@
+#define STRCMP __wcsncmp_avx2_rtm
+#define USE_AS_STRNCMP 1
+#define USE_AS_WCSCMP 1
+
+#include "strcmp-avx2-rtm.S"
diff --git a/sysdeps/x86_64/multiarch/wcsnlen-avx2-rtm.S b/sysdeps/x86_64/multiarch/wcsnlen-avx2-rtm.S
new file mode 100644 (file)
index 0000000..7437ebe
--- /dev/null
@@ -0,0 +1,5 @@
+#define STRLEN __wcsnlen_avx2_rtm
+#define USE_AS_WCSLEN 1
+#define USE_AS_STRNLEN 1
+
+#include "strlen-avx2-rtm.S"
diff --git a/sysdeps/x86_64/multiarch/wcsnlen.c b/sysdeps/x86_64/multiarch/wcsnlen.c
index 19bc6fd938cbc9fb003b4b872765d6e23e3c7e90..4983f1b2226f2ad75eb3f53cc59e478df52d10ab 100644 (file)
@@ -29,6 +29,7 @@
 extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (sse4_1) attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
 
 static inline void *
@@ -44,6 +45,9 @@ IFUNC_SELECTOR (void)
          && CPU_FEATURE_USABLE_P (cpu_features, BMI2))
        return OPTIMIZE (evex);
 
+      if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
+       return OPTIMIZE (avx2_rtm);
+
       if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
        return OPTIMIZE (avx2);
     }
diff --git a/sysdeps/x86_64/multiarch/wcsrchr-avx2-rtm.S b/sysdeps/x86_64/multiarch/wcsrchr-avx2-rtm.S
new file mode 100644 (file)
index 0000000..9bf7608
--- /dev/null
@@ -0,0 +1,3 @@
+#define STRRCHR __wcsrchr_avx2_rtm
+#define USE_AS_WCSRCHR 1
+#include "strrchr-avx2-rtm.S"
diff --git a/sysdeps/x86_64/multiarch/wmemchr-avx2-rtm.S b/sysdeps/x86_64/multiarch/wmemchr-avx2-rtm.S
new file mode 100644 (file)
index 0000000..58ed21d
--- /dev/null
@@ -0,0 +1,4 @@
+#define MEMCHR __wmemchr_avx2_rtm
+#define USE_AS_WMEMCHR 1
+
+#include "memchr-avx2-rtm.S"
diff --git a/sysdeps/x86_64/multiarch/wmemcmp-avx2-movbe-rtm.S b/sysdeps/x86_64/multiarch/wmemcmp-avx2-movbe-rtm.S
new file mode 100644 (file)
index 0000000..31104d1
--- /dev/null
@@ -0,0 +1,4 @@
+#define MEMCMP __wmemcmp_avx2_movbe_rtm
+#define USE_AS_WMEMCMP 1
+
+#include "memcmp-avx2-movbe-rtm.S"
diff --git a/sysdeps/x86_64/sysdep.h b/sysdeps/x86_64/sysdep.h
index d07b8f0aaf2fe486021652cc405340d2f8c4d3f0..7bebdeb21095eda04ebaa4253d51e8e5180f7a3c 100644 (file)
@@ -95,6 +95,28 @@ lose:                                                                              \
 #define R14_LP r14
 #define R15_LP r15
 
+/* Zero upper vector registers and return with xtest.  NB: Use VZEROALL
+   to avoid the RTM abort triggered by VZEROUPPER inside a transactionally
+   executing RTM region.  */
+#define ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST \
+       xtest;                                                  \
+       jz      1f;                                             \
+       vzeroall;                                               \
+       ret;                                                    \
+1:                                                             \
+       vzeroupper;                                             \
+       ret
+
+/* Zero upper vector registers and return.  */
+#ifndef ZERO_UPPER_VEC_REGISTERS_RETURN
+# define ZERO_UPPER_VEC_REGISTERS_RETURN \
+       VZEROUPPER;                                             \
+       ret
+#endif
+
+#ifndef VZEROUPPER_RETURN
+# define VZEROUPPER_RETURN     VZEROUPPER; ret
+#endif
+
 #else  /* __ASSEMBLER__ */
 
 /* Long and pointer size in bytes.  */
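
XTEST sets ZF when the processor is not executing transactionally and clears it inside an RTM/HLE region, so the jz above takes the ordinary vzeroupper path on a normal call and falls through to vzeroall inside a transaction, where vzeroupper would force an abort. A hypothetical caller-side illustration (not part of the patch; assumes an RTM-capable CPU, compilation with -mrtm, and that memcpy's IFUNC has already been resolved by an earlier call):

    #include <immintrin.h>
    #include <string.h>

    /* Returns 1 if the copy committed inside an RTM transaction.  */
    static int
    copy_in_txn (char *dst, const char *src, size_t n)
    {
      if (_xbegin () == _XBEGIN_STARTED)
        {
          memcpy (dst, src, n);   /* the RTM variant exits via vzeroall */
          _xend ();
          return 1;
        }
      return 0;                   /* transaction aborted or RTM refused */
    }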