[PATCH v4 1/3] Reduce CAS in low level locks [BZ #28537]

Noah Goldstein goldstein.w.n@gmail.com
Wed Nov 10 01:56:10 GMT 2021


On Tue, Nov 9, 2021 at 6:21 PM H.J. Lu via Libc-alpha
<libc-alpha@sourceware.org> wrote:
>
> CAS instruction is expensive.  From the x86 CPU's point of view, getting
> a cache line for writing is more expensive than reading.  See Appendix
> A.2 Spinlock in:
>
> https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/xeon-lock-scaling-analysis-paper.pdf
>
> The full compare and swap will grab the cache line exclusive and cause
> excessive cache line bouncing.
>
> 1. Change low level locks to do an atomic load and skip CAS if compare
> may fail to reduce cache line bouncing on contended locks.
> 2. In __lll_lock, replace atomic_compare_and_exchange_bool_acq with
> atomic_compare_and_exchange_val_acq and pass down the result to
> __lll_lock_wait and __lll_lock_wait_private to avoid the redundant load
> there.
> 3. Drop __glibc_unlikely in __lll_trylock and lll_cond_trylock since we
> don't know if it's actually rare; in the contended case it is clearly not
> rare.

Think especially since you have a manual check for old vs new, the atomic
CAS failure should still be wrapped in a `glibc_unlikely`.

> ---
>  nptl/lowlevellock.c         | 12 ++++++------
>  sysdeps/nptl/lowlevellock.h | 29 ++++++++++++++++++++---------
>  2 files changed, 26 insertions(+), 15 deletions(-)
>
> diff --git a/nptl/lowlevellock.c b/nptl/lowlevellock.c
> index 8c740529c4..d1965c01ca 100644
> --- a/nptl/lowlevellock.c
> +++ b/nptl/lowlevellock.c
> @@ -22,30 +22,30 @@
>  #include <stap-probe.h>
>
>  void
> -__lll_lock_wait_private (int *futex)
> +__lll_lock_wait_private (int *futex, int futex_value)
>  {
> -  if (atomic_load_relaxed (futex) == 2)
> +  if (futex_value == 2)
>      goto futex;
>
>    while (atomic_exchange_acquire (futex, 2) != 0)
>      {
>      futex:
> -      LIBC_PROBE (lll_lock_wait_private, 1, futex);
> +      LIBC_PROBE (lll_lock_wait_private, 2, futex, futex_value);
>        futex_wait ((unsigned int *) futex, 2, LLL_PRIVATE); /* Wait if *futex == 2.  */
>      }
>  }
>  libc_hidden_def (__lll_lock_wait_private)
>
>  void
> -__lll_lock_wait (int *futex, int private)
> +__lll_lock_wait (int *futex, int futex_value, int private)
>  {
> -  if (atomic_load_relaxed (futex) == 2)
> +  if (futex_value == 2)
>      goto futex;
>
>    while (atomic_exchange_acquire (futex, 2) != 0)
>      {
>      futex:
> -      LIBC_PROBE (lll_lock_wait, 1, futex);
> +      LIBC_PROBE (lll_lock_wait, 2, futex, futex_value);
>        futex_wait ((unsigned int *) futex, 2, private); /* Wait if *futex == 2.  */
>      }
>  }
> diff --git a/sysdeps/nptl/lowlevellock.h b/sysdeps/nptl/lowlevellock.h
> index 4d95114ed3..4235d13de9 100644
> --- a/sysdeps/nptl/lowlevellock.h
> +++ b/sysdeps/nptl/lowlevellock.h
> @@ -66,7 +66,11 @@
>     0.  Otherwise leave lock unchanged and return non-zero to indicate that the
>     lock was not acquired.  */
>  #define __lll_trylock(lock)    \
> -  __glibc_unlikely (atomic_compare_and_exchange_bool_acq ((lock), 1, 0))
> +  (__extension__ ({                                                    \
> +    __typeof (*(lock)) __lock_value = atomic_load_relaxed (lock);      \
> +    (__lock_value != 0                                                 \
> +     || atomic_compare_and_exchange_bool_acq ((lock), 1, 0));          \
> +  }))
>  #define lll_trylock(lock)      \
>     __lll_trylock (&(lock))
>
> @@ -74,11 +78,15 @@
>     return 0.  Otherwise leave lock unchanged and return non-zero to indicate
>     that the lock was not acquired.  */
>  #define lll_cond_trylock(lock) \
> -  __glibc_unlikely (atomic_compare_and_exchange_bool_acq (&(lock), 2, 0))
> +  (__extension__ ({                                                    \
> +    __typeof (lock) __lock_value = atomic_load_relaxed (&(lock));      \
> +    (__lock_value != 0                                                 \
> +     || atomic_compare_and_exchange_bool_acq (&(lock), 2, 0));         \
> +  }))
>
> -extern void __lll_lock_wait_private (int *futex);
> +extern void __lll_lock_wait_private (int *futex, int futex_value);
>  libc_hidden_proto (__lll_lock_wait_private)
> -extern void __lll_lock_wait (int *futex, int private);
> +extern void __lll_lock_wait (int *futex, int futex_value, int private);
>  libc_hidden_proto (__lll_lock_wait)
>
>  /* This is an expression rather than a statement even though its value is
> @@ -95,13 +103,15 @@ libc_hidden_proto (__lll_lock_wait)
>    ((void)                                                               \
>     ({                                                                   \
>       int *__futex = (futex);                                            \
> -     if (__glibc_unlikely                                               \
> -         (atomic_compare_and_exchange_bool_acq (__futex, 1, 0)))        \
> +     int __futex_value = atomic_load_relaxed (futex);                   \
> +     if (__futex_value != 0                                             \
> +         || ((__futex_value = atomic_compare_and_exchange_val_acq       \
> +              (__futex, 1, 0) != 0)))                                   \
>         {                                                                \
>           if (__builtin_constant_p (private) && (private) == LLL_PRIVATE) \
> -           __lll_lock_wait_private (__futex);                           \
> +           __lll_lock_wait_private (futex, __futex_value);              \
>           else                                                           \
> -           __lll_lock_wait (__futex, private);                          \
> +           __lll_lock_wait (futex, __futex_value, private);             \
>         }                                                                \
>     }))
>  #define lll_lock(futex, private)       \
> @@ -120,7 +130,8 @@ libc_hidden_proto (__lll_lock_wait)
>     ({                                                                   \
>       int *__futex = (futex);                                            \
>       if (__glibc_unlikely (atomic_exchange_acq (__futex, 2) != 0))      \
> -       __lll_lock_wait (__futex, private);                              \
> +       __lll_lock_wait (__futex, atomic_load_relaxed (__futex),         \
> +                       private); \
>     }))
>  #define lll_cond_lock(futex, private) __lll_cond_lock (&(futex), private)
>
> --
> 2.33.1
>


More information about the Libc-alpha mailing list