Bug 19586 - IFUNC selection logic requires improvement for string and memory routines for x86_64 arch.
Summary: IFUNC selection logic requires improvement for string and memory routines for...
Status: WAITING
Alias: None
Product: glibc
Classification: Unclassified
Component: string (show other bugs)
Version: 2.24
: P2 normal
Target Milestone: ---
Assignee: Not yet assigned to anyone
URL:
Keywords:
Depends on: 18880
Blocks:
  Show dependency treegraph
 
Reported: 2016-02-09 06:25 UTC by Amit Pawar
Modified: 2016-02-18 11:07 UTC (History)
1 user (show)

See Also:
Host:
Target:
Build:
Last reconfirmed:
fweimer: security-


Attachments

Note You need to log in before you can comment on or make changes to this bug.
Description Amit Pawar 2016-02-09 06:25:45 UTC
Problem:
1) For AMD Excavator core SSSE3-Fast_Copy_Backward (not set currently but required) performance is good but AVX_Fast_Unaligned_Load version is selected for MEMMOVE routine. 

------ Current IFUNC for Memmove -----------------------------
            (HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load)
            ? __memmove_avx_unaligned
            : (HAS_CPU_FEATURE (SSSE3)
               ? (HAS_ARCH_FEATURE (Fast_Copy_Backward)
                  ? __memmove_ssse3_back : __memmove_ssse3)
               : __memmove_sse2)));
--------------------------------------------------------------

Though this selection order can be updated to select the right one, in the future this order may again need to be updated for newer CPUs when the performance of other ISAs improves, and the same issue arises again. To resolve this in a better manner, the current framework needs to be updated to define a selection order which is unique for each CPU.

2) Performance of the SSSE3-Fast_Copy_Backward based implementation is good on the Excavator core for the MEMCPY routine, but it cannot be selected. The selection order given in the MEMCPY IFUNC function does not check for this implementation.

------------ Current IFUNC for Memcpy -----------------------
ENTRY(__new_memcpy)
        .type   __new_memcpy, @gnu_indirect_function
        LOAD_RTLD_GLOBAL_RO_RDX
#ifdef HAVE_AVX512_ASM_SUPPORT
        HAS_ARCH_FEATURE (AVX512F_Usable)
        jz      1f
        HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER)
        jz      1f
        leaq    __memcpy_avx512_no_vzeroupper(%rip), %rax
        ret
#endif
1:      leaq    __memcpy_avx_unaligned(%rip), %rax
        HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load)
        jz 2f
        ret
2:      leaq    __memcpy_sse2(%rip), %rax
        HAS_ARCH_FEATURE (Slow_BSF)
        jnz     3f
        leaq    __memcpy_sse2_unaligned(%rip), %rax
        ret
3:      HAS_CPU_FEATURE (SSSE3)
        jz 4f
        leaq    __memcpy_ssse3(%rip), %rax
4:      ret
END(__new_memcpy)
---------------------------------------------------------------

For Excavator CPU following selection order is preferred in case of MEMCPY routine.
1. SSSE3-Fast_Copy_Backward based
2. AVX_Unaligned based
3. SSE2_Unaligned or SSSE3

The following limitations are observed in the current framework while trying to improve the selection order logic in the IFUNC functions for string and memory routines.
(Used GLIBC trunk commit id:0e069029a8bb132876d315242054a312ae106852 for this test experiment)

1. Having a single selection order may not be a good option for all CPUs.
2. A single selection order does not help in selecting the fastest version among older and newer generation CPUs.
3. For newer CPUs the selection order may need to be updated due to ISA improvements. This new order may lead to performance degradation on older generation CPUs and requires a lot of testing activity to verify.

The following approach addresses these issues by improving the current framework to define a selection order for each CPU, based on family and model values, for all functions. This logic can be further extended to FPU multi-versioning functions.

1) Define a structure array and fill each row in the array with details of the multi-versioning functions, along with their respective CPU or ARCH flags defined in glibc, for every string and memory routine. Each string and memory routine is given an ID, and these IDs are used to refer back into this array. Each ISA is also given an ID for defining the order.

------------- EXAMPLE --------------

/* CPU selection order structure.*/
typedef enum
{
  OTHER_TYPE,
  CPU_TYPE,                    // Used to map for single CPU flag type
  ARCH_TYPE,                 // Used to map for single Arch flag type
  MULTI_ARCH_TYPE,  // Used to map for multiple Arch flags type (Ex AVX512F_Usable and Prefer_No_VZEROUPPER).
}feature_type;

typedef struct
{
  unsigned int n_implementation;
  struct implementation_details
  {
    feature_type feature_flag;            // Used to identify whether CPU and Arch based implementation.
    unsigned int bit_flag;                        // Used bit to verify.
    void *funcptrs;                                    // Function mapped to this flag and returned when supported.
  }impl_list[TOTAL_IMPLEMENTATION];           // Total no of implementation found for this functions.
}cpu_selection;


cpu_selection string_implementations[TOTAL_STRING_ROUTINES] =  {
  /* Memset */
  [index_MEMSET]=
    {
      .n_implementation= 2,
      [index_MEMSET_SSE2] =                            /* SSE2 specific details and it is 0th index*/
      [index_MEMSET_AVX2_Unaligned] =  /* AVX2 specific details and at 1st index */
    },
  /* MEMCMP */
  [index_MEMCMP]=
    {
      .n_implementation= 3,
      [index_MEMCMP_SSE2] =                    /* SSE2 specific details*/
      [index_MEMCMP_SSSE3] =                 /* SSSE3 specific details*/
      [index_MEMCMP_SSE4_1] =              /* SSE4_1 specific details*/
    },
  ...,
  /* Continue to define for other routines */
  ...
};

-------------------------------------

2) Define two functions, one for Intel and one for AMD. These functions will define the required order for every string and memory routine, for every CPU, based on family and model values, in a local array. This local array is two-dimensional; using string index IDs, the selected order is defined as ISA indexes (two types of indexes are defined: one for string and memory functions, and a second one for ISA specifics) for each function. If an unknown CPU is detected, then fall back to the current order defined in glibc. The following example shows how this can be achieved in the amd_cpus function.

------------------------- CODE snippet ----------------------- 

#define index_MEMSET 0
/* Following two macros are used to define the order in local array for memset function*/
#define index_MEMSET_SSE2        0 
#define index_MEMSET_AVX2      1

#define index_MEMCMP                                       1
/* Following three macros are used to define the order in local array for memcmp function*/
#define index_MEMCMP_SSE2                           0
#define index_MEMCMP_SSE2_Unaligned   1
#define index_MEMCMP_SSSE3                        2
...
...
...
   
#define FIRST_CHOICE          0
#define SECOND_CHOICE    1
#define THIRD_CHOICE         2

static void
amd_cpus(const struct cpu_features *cpu_features)
{

  // Contains array of no used for indexing in main string_implementation array.
  unsigned int s_order[TOTAL_STRING_ROUTINES][TOTAL_IMPLEMENTATION];
  unsigned int i,j;

 // Initialize local array.
  for (i=0;i < TOTAL_STRING_ROUTINES ; i++)
    for (j=0;j<TOTAL_IMPLEMENTATION;j++)
      s_order[i][j] = 0;

 
  /* Define an order here; this is the fallback order if the CPU is not registered in the following switch case */

  // Incase of memset selection order, AVX2_Usable is first choice and then followed by SSE2 as second choice = { AVX2_Usable, SSE2};
  s_order [index_MEMSET][FIRST_CHOICE]  = MEMSET_I(AVX2_Usable);
  s_order [index_MEMSET][SECOND_CHOICE] = MEMSET_I(SSE2);

  // Incase of memcmp selection order, SSE4_1 as first choice, SSSE3 as second choice and SSE2 as third choice.
  s_order [index_MEMCMP][FIRST_CHOICE]  = index_MEMCMP_SSE4_1 ;
  s_order [index_MEMCMP][SECOND_CHOICE] = index_MEMCMP_SSSE3;
  s_order [index_MEMCMP][THIRD_CHOICE]  = index_MEMCMP_SSE2;

 // If the CPU is registered in the following switch case then it may update the order as per the requirements for each function, but if it is not, then the above order becomes the fallback order.
  switch (cpu_features->family)
    {
        case 21:

          /* Excavator */
         // Here order for memset and memcmp is updated as per required.
          if (cpu_features->model >= 0x60 && cpu_features->model <= 0x7f)
            {
              /* SSE2 is first choice for memset*/
              s_order [index_MEMSET][FIRST_CHOICE]  = MEMSET_I(SSE2);
              s_order [index_MEMSET][SECOND_CHOICE] = MEMSET_I(AVX2_Usable);

              /* MEMCMP order is also updated: SSSE3 becomes the first choice */
             s_order [index_MEMCMP][FIRST_CHOICE]  = index_MEMCMP_SSSE3 ;
             s_order [index_MEMCMP][SECOND_CHOICE] = index_MEMCMP_SSE4_1;
             s_order [index_MEMCMP][THIRD_CHOICE]  = index_MEMCMP_SSE2;
            }

          break;

        default:
          ;
    }

  // Now following function will verify the given order by verifying CPU and ARCH flags.
  verify_update_global_array (s_order);

} 
---------------------------------------------

3) verify_update_global_array will verify the order given by the input array and update the global array, which is used to return the function address when called from an IFUNC. This function will verify every CPU and ARCH flag in the order given by the input argument. The first supported one in the order will be selected, and its index will be updated in the global array "string_functions".

unsigned int string_functions[TOTAL_STRING_ROUTINES];

------------------ Implementation -------------------------
// Function to verify whether passed input CPU or ARCH flag is supported or not.
static unsigned int
test_bit (unsigned int bit_flag, feature_type feature_flag)
{
  unsigned int flag= 0;
  switch (feature_flag)
  {
    case CPU_TYPE:

        switch (bit_flag)
          {

#define TEST_CPU_FLAG(x)          \
  case bit_##x:                   \
                                  \
    if (HAS_CPU_FEATURE(x))       \
       flag=1;                    \
                                  \
    break;

            TEST_CPU_FLAG(CX8)
            TEST_CPU_FLAG(CMOV)
            TEST_CPU_FLAG(SSE2)
            TEST_CPU_FLAG(SSSE3)
            TEST_CPU_FLAG(SSE4_1)
            TEST_CPU_FLAG(SSE4_2)
            TEST_CPU_FLAG(AVX)
            TEST_CPU_FLAG(AVX2)

              default:
                flag=0;
          }


      break;

    case ARCH_TYPE:
    case MULTI_ARCH_TYPE:
#define TEST_ARCH_FLAG(x)         \
  case bit_##x:                   \
                                  \
    if (HAS_ARCH_FEATURE(x))      \
       flag=1;                    \
                                  \
    break;

      switch (bit_flag)
        {

          TEST_ARCH_FLAG(Fast_Rep_String)
          TEST_ARCH_FLAG(Fast_Copy_Backward)
          TEST_ARCH_FLAG(Slow_BSF)
          TEST_ARCH_FLAG(Fast_Unaligned_Load)
          TEST_ARCH_FLAG(Prefer_PMINUB_for_stringop)
          TEST_ARCH_FLAG(AVX_Usable)
          TEST_ARCH_FLAG(FMA_Usable)
          TEST_ARCH_FLAG(FMA4_Usable)
          TEST_ARCH_FLAG(Slow_SSE4_2)
          TEST_ARCH_FLAG(AVX2_Usable)
          TEST_ARCH_FLAG(AVX_Fast_Unaligned_Load)
          TEST_ARCH_FLAG(AVX512F_Usable)
          TEST_ARCH_FLAG(AVX512DQ_Usable)
          TEST_ARCH_FLAG(I586)
          TEST_ARCH_FLAG(I686)
          TEST_ARCH_FLAG(Prefer_MAP_32BIT_EXEC)
          TEST_ARCH_FLAG(Prefer_No_VZEROUPPER)
          default:
            flag = 0;
        }

      break;

    default:
      flag = 0;
  }

  return flag;
}

static void
verify_update_global_array(unsigned int array[][TOTAL_IMPLEMENTATION])
{
  unsigned int i,j,k;

  for ( i=0 ; i < TOTAL_STRING_ROUTINES; i++)
    {
      k = 0;

      for (j=0; j < string_implementations[i].n_implementation ; j++)
        {
          feature_type feature_flag;
          unsigned int bit_flag;

          k = array[i][j];
          bit_flag = string_implementations[i].impl_list[k].bit_flag;
          feature_flag = string_implementations[i].impl_list[k].feature_flag;

          if ((feature_flag == CPU_TYPE || feature_flag == ARCH_TYPE)
               && test_bit (bit_flag, feature_flag))
            {
              string_functions[i] = k;
              break;
            }

          if (feature_flag == MULTI_ARCH_TYPE)
            {
              // Handle it
              string_functions[i] = k;
              break;
            }
        }
    }
}
----------------------------------------------------------

4) Every string and memory IFUNC function will call the "__get_string_func_address" function by passing its ID.

--------- Code snippet ----------------


void *
__get_string_func_address (unsigned int string_routine_index)
{
  unsigned int index=-1;

#ifndef SHARED
  static int initialized = 1;
  if (initialized)
  {
    register_string_functions();
    initialized = 0;
  }
#endif

  if (string_routine_index < TOTAL_STRING_ROUTINES)
  {
    // The value present here is the index of the implementation which is supported and retrieve below and returned.
    index = string_functions[string_routine_index];
  }
  else
    index = 0;

  return string_implementations[string_routine_index].impl_list[index].funcptrs;
}

-------------------------------------------

5) The memset IFUNC calls __get_string_func_address, passing its ID, and the value returned by this function will be returned to its parent.

#if IS_IN (libc)
ENTRY(memset)
        .type   memset, @gnu_indirect_function
        pushq   %rbp
        movq    %rsp, %rbp
        subq    $32, %rsp
        mov     $index_MEMSET, %edi
        callq  __get_string_func_address
        leave
        ret
END(memset)
#endif
Comment 1 Amit Pawar 2016-02-09 06:28:14 UTC
This requires further improvement when new version is implemented using combination of CPU and ARCH flags.
Comment 2 Amit Pawar 2016-02-09 06:54:37 UTC
RFC patch submitted.
https://sourceware.org/ml/libc-alpha/2016-02/msg00072.html
Comment 3 H.J. Lu 2016-02-09 13:28:50 UTC
Please fix PR 18880 first.  Your scheme is too complicated for my
taste.  We have bit_Prefer_PMINUB_for_stringop to select a version
which may not be selected otherwise.  Can you do something similar?
Comment 4 Amit Pawar 2016-02-12 08:00:21 UTC
What I thought was, my approach is allowing easy way to configure the ISA selection instead of defining new flag bit for each versions of string and memory routines against each CPU. With such approach PR 18880 is easier to fix.

Does it make sense? Please suggest if you are thinking of other approaches.
Comment 5 H.J. Lu 2016-02-18 11:07:05 UTC
(In reply to Amit Pawar from comment #4)
> What I thought was, my approach is allowing easy way to configure the ISA
> selection instead of defining new flag bit for each versions of string and
> memory routines against each CPU. With such approach PR 18880 is easier to
> fix.
> 
> Does it make sense? Please suggest if you are thinking of other approaches.

There are 2 separate issues.  Please fix PR 18880 first.