[PATCH v4 1/3] Reduce CAS in low level locks [BZ #28537]
Noah Goldstein
goldstein.w.n@gmail.com
Wed Nov 10 01:56:10 GMT 2021
On Tue, Nov 9, 2021 at 6:21 PM H.J. Lu via Libc-alpha
<libc-alpha@sourceware.org> wrote:
>
> CAS instruction is expensive. From the x86 CPU's point of view, getting
> a cache line for writing is more expensive than reading. See Appendix
> A.2 Spinlock in:
>
> https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/xeon-lock-scaling-analysis-paper.pdf
>
> The full compare and swap will grab the cache line exclusive and cause
> excessive cache line bouncing.
>
> 1. Change low level locks to do an atomic load and skip CAS if compare
> may fail to reduce cache line bouncing on contended locks.
> 2. In __lll_lock, replace atomic_compare_and_exchange_bool_acq with
> atomic_compare_and_exchange_val_acq and pass down the result to
> __lll_lock_wait and __lll_lock_wait_private to avoid the redundant load
> there.
> 3. Drop __glibc_unlikely in __lll_trylock and lll_cond_trylock since we
> don't know if it's actually rare; in the contended case it is clearly not
> rare.
Think especially since you have a manual check for old vs new, the atomic
CAS failure should still be wrapped in a `glibc_unlikely`.
> ---
> nptl/lowlevellock.c | 12 ++++++------
> sysdeps/nptl/lowlevellock.h | 29 ++++++++++++++++++++---------
> 2 files changed, 26 insertions(+), 15 deletions(-)
>
> diff --git a/nptl/lowlevellock.c b/nptl/lowlevellock.c
> index 8c740529c4..d1965c01ca 100644
> --- a/nptl/lowlevellock.c
> +++ b/nptl/lowlevellock.c
> @@ -22,30 +22,30 @@
> #include <stap-probe.h>
>
> void
> -__lll_lock_wait_private (int *futex)
> +__lll_lock_wait_private (int *futex, int futex_value)
> {
> - if (atomic_load_relaxed (futex) == 2)
> + if (futex_value == 2)
> goto futex;
>
> while (atomic_exchange_acquire (futex, 2) != 0)
> {
> futex:
> - LIBC_PROBE (lll_lock_wait_private, 1, futex);
> + LIBC_PROBE (lll_lock_wait_private, 2, futex, futex_value);
> futex_wait ((unsigned int *) futex, 2, LLL_PRIVATE); /* Wait if *futex == 2. */
> }
> }
> libc_hidden_def (__lll_lock_wait_private)
>
> void
> -__lll_lock_wait (int *futex, int private)
> +__lll_lock_wait (int *futex, int futex_value, int private)
> {
> - if (atomic_load_relaxed (futex) == 2)
> + if (futex_value == 2)
> goto futex;
>
> while (atomic_exchange_acquire (futex, 2) != 0)
> {
> futex:
> - LIBC_PROBE (lll_lock_wait, 1, futex);
> + LIBC_PROBE (lll_lock_wait, 2, futex, futex_value);
> futex_wait ((unsigned int *) futex, 2, private); /* Wait if *futex == 2. */
> }
> }
> diff --git a/sysdeps/nptl/lowlevellock.h b/sysdeps/nptl/lowlevellock.h
> index 4d95114ed3..4235d13de9 100644
> --- a/sysdeps/nptl/lowlevellock.h
> +++ b/sysdeps/nptl/lowlevellock.h
> @@ -66,7 +66,11 @@
> 0. Otherwise leave lock unchanged and return non-zero to indicate that the
> lock was not acquired. */
> #define __lll_trylock(lock) \
> - __glibc_unlikely (atomic_compare_and_exchange_bool_acq ((lock), 1, 0))
> + (__extension__ ({ \
> + __typeof (*(lock)) __lock_value = atomic_load_relaxed (lock); \
> + (__lock_value != 0 \
> + || atomic_compare_and_exchange_bool_acq ((lock), 1, 0)); \
> + }))
> #define lll_trylock(lock) \
> __lll_trylock (&(lock))
>
> @@ -74,11 +78,15 @@
> return 0. Otherwise leave lock unchanged and return non-zero to indicate
> that the lock was not acquired. */
> #define lll_cond_trylock(lock) \
> - __glibc_unlikely (atomic_compare_and_exchange_bool_acq (&(lock), 2, 0))
> + (__extension__ ({ \
> + __typeof (lock) __lock_value = atomic_load_relaxed (&(lock)); \
> + (__lock_value != 0 \
> + || atomic_compare_and_exchange_bool_acq (&(lock), 2, 0)); \
> + }))
>
> -extern void __lll_lock_wait_private (int *futex);
> +extern void __lll_lock_wait_private (int *futex, int futex_value);
> libc_hidden_proto (__lll_lock_wait_private)
> -extern void __lll_lock_wait (int *futex, int private);
> +extern void __lll_lock_wait (int *futex, int futex_value, int private);
> libc_hidden_proto (__lll_lock_wait)
>
> /* This is an expression rather than a statement even though its value is
> @@ -95,13 +103,15 @@ libc_hidden_proto (__lll_lock_wait)
> ((void) \
> ({ \
> int *__futex = (futex); \
> - if (__glibc_unlikely \
> - (atomic_compare_and_exchange_bool_acq (__futex, 1, 0))) \
> + int __futex_value = atomic_load_relaxed (futex); \
> + if (__futex_value != 0 \
> + || ((__futex_value = atomic_compare_and_exchange_val_acq \
> + (__futex, 1, 0) != 0))) \
> { \
> if (__builtin_constant_p (private) && (private) == LLL_PRIVATE) \
> - __lll_lock_wait_private (__futex); \
> + __lll_lock_wait_private (futex, __futex_value); \
> else \
> - __lll_lock_wait (__futex, private); \
> + __lll_lock_wait (futex, __futex_value, private); \
> } \
> }))
> #define lll_lock(futex, private) \
> @@ -120,7 +130,8 @@ libc_hidden_proto (__lll_lock_wait)
> ({ \
> int *__futex = (futex); \
> if (__glibc_unlikely (atomic_exchange_acq (__futex, 2) != 0)) \
> - __lll_lock_wait (__futex, private); \
> + __lll_lock_wait (__futex, atomic_load_relaxed (__futex), \
> + private); \
> }))
> #define lll_cond_lock(futex, private) __lll_cond_lock (&(futex), private)
>
> --
> 2.33.1
>
More information about the Libc-alpha
mailing list