This is the mail archive of the libc-alpha@sourceware.org mailing list for the glibc project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

Re: [PATCH 2/4] Small optimization for lowlevellock


Ping.

On 22/02/2019 16:27, Adhemerval Zanella wrote:
> This patch optimizes both __lll_lock_wait_private and __lll_lock_wait
> by issuing only one lll_futex_wait.  Since it is defined as an inlined
> syscall, and inlined syscalls are defined using inline assembly, the
> compiler usually cannot see that both calls are equal and optimize
> accordingly.
> 
> On aarch64 the resulting binary is changed from:
> 
> 0000000000000060 <__lll_lock_wait>:
>   60:   2a0103e5        mov     w5, w1
>   64:   b9400001        ldr     w1, [x0]
>   68:   aa0003e4        mov     x4, x0
>   6c:   7100083f        cmp     w1, #0x2
>   70:   540000e1        b.ne    8c <__lll_lock_wait+0x2c>  // b.any
>   74:   521900a1        eor     w1, w5, #0x80
>   78:   d2800042        mov     x2, #0x2                        // #2
>   7c:   93407c21        sxtw    x1, w1
>   80:   d2800003        mov     x3, #0x0                        // #0
>   84:   d2800c48        mov     x8, #0x62                       // #98
>   88:   d4000001        svc     #0x0
>   8c:   521900a5        eor     w5, w5, #0x80
>   90:   52800046        mov     w6, #0x2                        // #2
>   94:   93407ca5        sxtw    x5, w5
>   98:   14000008        b       b8 <__lll_lock_wait+0x58>
>   9c:   d503201f        nop
>   a0:   aa0403e0        mov     x0, x4
>   a4:   aa0503e1        mov     x1, x5
>   a8:   d2800042        mov     x2, #0x2                        // #2
>   ac:   d2800003        mov     x3, #0x0                        // #0
>   b0:   d2800c48        mov     x8, #0x62                       // #98
>   b4:   d4000001        svc     #0x0
>   b8:   885ffc80        ldaxr   w0, [x4]
>   bc:   88017c86        stxr    w1, w6, [x4]
>   c0:   35ffffc1        cbnz    w1, b8 <__lll_lock_wait+0x58>
>   c4:   35fffee0        cbnz    w0, a0 <__lll_lock_wait+0x40>
>   c8:   d65f03c0        ret
> 
> To:
> 
> 0000000000000048 <__lll_lock_wait>:
>   48:   aa0003e4        mov     x4, x0
>   4c:   2a0103e5        mov     w5, w1
>   50:   b9400000        ldr     w0, [x0]
>   54:   7100081f        cmp     w0, #0x2
>   58:   540000c0        b.eq    70 <__lll_lock_wait+0x28>  // b.none
>   5c:   52800041        mov     w1, #0x2                        // #2
>   60:   885ffc80        ldaxr   w0, [x4]
>   64:   88027c81        stxr    w2, w1, [x4]
>   68:   35ffffc2        cbnz    w2, 60 <__lll_lock_wait+0x18>
>   6c:   34000120        cbz     w0, 90 <__lll_lock_wait+0x48>
>   70:   521900a1        eor     w1, w5, #0x80
>   74:   aa0403e0        mov     x0, x4
>   78:   93407c21        sxtw    x1, w1
>   7c:   d2800042        mov     x2, #0x2                        // #2
>   80:   d2800003        mov     x3, #0x0                        // #0
>   84:   d2800c48        mov     x8, #0x62                       // #98
>   88:   d4000001        svc     #0x0
>   8c:   17fffff4        b       5c <__lll_lock_wait+0x14>
>   90:   d65f03c0        ret
> 
> I see similar changes on powerpc and other architectures.  It also aligns
> with the x86_64 implementation by adding the systemtap probes.
> 
> Checked on aarch64-linux-gnu.
> 
> 	* nptl/lowlevellock.c (__lll_lock_wait, __lll_lock_wait_private):
> 	Optimize futex call and add systemtap probe.
> ---
>  nptl/lowlevellock.c | 31 +++++++++++++++++++------------
>  1 file changed, 19 insertions(+), 12 deletions(-)
> 
> diff --git a/nptl/lowlevellock.c b/nptl/lowlevellock.c
> index 5eaa3807ea..47548ff121 100644
> --- a/nptl/lowlevellock.c
> +++ b/nptl/lowlevellock.c
> @@ -17,20 +17,23 @@
>     License along with the GNU C Library; if not, see
>     <http://www.gnu.org/licenses/>.  */
>  
> -#include <errno.h>
>  #include <sysdep.h>
>  #include <lowlevellock.h>
> -#include <sys/time.h>
>  #include <atomic.h>
> +#include <stap-probe.h>
>  
>  void
>  __lll_lock_wait_private (int *futex)
>  {
> -  if (*futex == 2)
> -    lll_futex_wait (futex, 2, LLL_PRIVATE); /* Wait if *futex == 2.  */
> -
> -  while (atomic_exchange_acq (futex, 2) != 0)
> -    lll_futex_wait (futex, 2, LLL_PRIVATE); /* Wait if *futex == 2.  */
> +  if (atomic_load_relaxed (futex) == 2)
> +    goto futex;
> +
> +  while (atomic_exchange_acquire (futex, 2) != 0)
> +    {
> +    futex:
> +      LIBC_PROBE (lll_lock_wait_private, 1, futex);
> +      lll_futex_wait (futex, 2, LLL_PRIVATE); /* Wait if *futex == 2.  */
> +    }
>  }
>  
>  
> @@ -39,10 +42,14 @@ __lll_lock_wait_private (int *futex)
>  void
>  __lll_lock_wait (int *futex, int private)
>  {
> -  if (*futex == 2)
> -    lll_futex_wait (futex, 2, private); /* Wait if *futex == 2.  */
> -
> -  while (atomic_exchange_acq (futex, 2) != 0)
> -    lll_futex_wait (futex, 2, private); /* Wait if *futex == 2.  */
> +  if (atomic_load_relaxed (futex) == 2)
> +    goto futex;
> +
> +  while (atomic_exchange_acquire (futex, 2) != 0)
> +    {
> +    futex:
> +      LIBC_PROBE (lll_lock_wait, 1, futex);
> +      lll_futex_wait (futex, 2, private); /* Wait if *futex == 2.  */
> +    }
>  }
>  #endif
> 


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]