Re: [PATCH 2/4] Small optimization for lowlevellock
- From: Carlos O'Donell <codonell@redhat.com>
- To: Adhemerval Zanella <adhemerval.zanella@linaro.org>, libc-alpha@sourceware.org
- Date: Thu, 9 May 2019 14:33:28 -0400
- Subject: Re: [PATCH 2/4] Small optimization for lowlevellock
- References: <20190222192703.18177-1-adhemerval.zanella@linaro.org> <20190222192703.18177-2-adhemerval.zanella@linaro.org> <d928bb39-fabc-89cd-0e22-165cf5df1916@linaro.org> <8ce7b84b-f7b9-df00-2de0-d2e2b3d782b7@linaro.org>
On 5/9/19 2:25 PM, Adhemerval Zanella wrote:
> Ping (x2). The third and fourth parts are already approved.
This looks good to me.
Reviewed-by: Carlos O'Donell <carlos@redhat.com>
> On 09/04/2019 09:11, Adhemerval Zanella wrote:
>> Ping.
>>
>> On 22/02/2019 16:27, Adhemerval Zanella wrote:
>>> This patch optimizes both __lll_lock_wait_private and __lll_lock_wait
>>> by issuing only one lll_futex_wait call per function. Since lll_futex_wait
>>> is defined as an inline syscall, and inline syscalls are implemented with
>>> inline assembly, the compiler usually cannot see that the two calls are
>>> identical and optimize accordingly.
Right.
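To spell out why the compiler keeps both call sites, here is a minimal
sketch (not the glibc code; fake_futex_wait is a hypothetical stand-in for
lll_futex_wait). Each asm statement is a black box to the optimizer, so
two textually identical calls are both emitted:

/* Hypothetical stand-in for lll_futex_wait: the inline-asm body is opaque
   to the compiler, so identical calls cannot be merged.  */
static inline long
fake_futex_wait (int *futexp, int val)
{
  long ret;
  __asm__ volatile ("" : "=r" (ret) : "r" (futexp), "r" (val) : "memory");
  return ret;
}

void
lock_wait_old_shape (int *futex)
{
  if (*futex == 2)
    fake_futex_wait (futex, 2);   /* first call site */

  while (__atomic_exchange_n (futex, 2, __ATOMIC_ACQUIRE) != 0)
    fake_futex_wait (futex, 2);   /* second, identical call site */
}

Both call sites survive into the generated code, which matches the "before"
disassembly below.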
>>>
>>> On aarch64 the resulting binary changes from:
>>>
>>> 0000000000000060 <__lll_lock_wait>:
>>> 60: 2a0103e5 mov w5, w1
>>> 64: b9400001 ldr w1, [x0]
>>> 68: aa0003e4 mov x4, x0
>>> 6c: 7100083f cmp w1, #0x2
>>> 70: 540000e1 b.ne 8c <__lll_lock_wait+0x2c> // b.any
>>> 74: 521900a1 eor w1, w5, #0x80
>>> 78: d2800042 mov x2, #0x2 // #2
>>> 7c: 93407c21 sxtw x1, w1
>>> 80: d2800003 mov x3, #0x0 // #0
>>> 84: d2800c48 mov x8, #0x62 // #98
>>> 88: d4000001 svc #0x0
>>> 8c: 521900a5 eor w5, w5, #0x80
>>> 90: 52800046 mov w6, #0x2 // #2
>>> 94: 93407ca5 sxtw x5, w5
>>> 98: 14000008 b b8 <__lll_lock_wait+0x58>
>>> 9c: d503201f nop
>>> a0: aa0403e0 mov x0, x4
>>> a4: aa0503e1 mov x1, x5
>>> a8: d2800042 mov x2, #0x2 // #2
>>> ac: d2800003 mov x3, #0x0 // #0
>>> b0: d2800c48 mov x8, #0x62 // #98
>>> b4: d4000001 svc #0x0
>>> b8: 885ffc80 ldaxr w0, [x4]
>>> bc: 88017c86 stxr w1, w6, [x4]
>>> c0: 35ffffc1 cbnz w1, b8 <__lll_lock_wait+0x58>
>>> c4: 35fffee0 cbnz w0, a0 <__lll_lock_wait+0x40>
>>> c8: d65f03c0 ret
>>>
>>> To:
>>>
>>> 0000000000000048 <__lll_lock_wait>:
>>> 48: aa0003e4 mov x4, x0
>>> 4c: 2a0103e5 mov w5, w1
>>> 50: b9400000 ldr w0, [x0]
>>> 54: 7100081f cmp w0, #0x2
>>> 58: 540000c0 b.eq 70 <__lll_lock_wait+0x28> // b.none
>>> 5c: 52800041 mov w1, #0x2 // #2
>>> 60: 885ffc80 ldaxr w0, [x4]
>>> 64: 88027c81 stxr w2, w1, [x4]
>>> 68: 35ffffc2 cbnz w2, 60 <__lll_lock_wait+0x18>
>>> 6c: 34000120 cbz w0, 90 <__lll_lock_wait+0x48>
>>> 70: 521900a1 eor w1, w5, #0x80
>>> 74: aa0403e0 mov x0, x4
>>> 78: 93407c21 sxtw x1, w1
>>> 7c: d2800042 mov x2, #0x2 // #2
>>> 80: d2800003 mov x3, #0x0 // #0
>>> 84: d2800c48 mov x8, #0x62 // #98
>>> 88: d4000001 svc #0x0
>>> 8c: 17fffff4 b 5c <__lll_lock_wait+0x14>
>>> 90: d65f03c0 ret
>>>
Nice!
>>> I see similar changes on powerpc and other architectures. It also aligns
>>> with the x86_64 implementation by adding the systemtap probes.
>>>
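For what it's worth, once the tree is rebuilt the new markers can be
sanity-checked by dumping the stapsdt notes from the resulting DSO, e.g.
something like "readelf -n nptl/libpthread.so" in the build directory
(path from memory) and looking for lll_lock_wait and lll_lock_wait_private.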
>>> Checked on aarch64-linux-gnu.
>>>
>>> * nptl/lowlevellock.c (__lll_lock_wait, __lll_lock_wait_private):
>>> Optimize futex call and add systemtap probe.
>>> ---
>>> nptl/lowlevellock.c | 31 +++++++++++++++++++------------
>>> 1 file changed, 19 insertions(+), 12 deletions(-)
>>>
>>> diff --git a/nptl/lowlevellock.c b/nptl/lowlevellock.c
>>> index 5eaa3807ea..47548ff121 100644
>>> --- a/nptl/lowlevellock.c
>>> +++ b/nptl/lowlevellock.c
>>> @@ -17,20 +17,23 @@
>>> License along with the GNU C Library; if not, see
>>> <http://www.gnu.org/licenses/>. */
>>>
>>> -#include <errno.h>
>>> #include <sysdep.h>
>>> #include <lowlevellock.h>
>>> -#include <sys/time.h>
>>> #include <atomic.h>
>>> +#include <stap-probe.h>
OK.
>>>
>>> void
>>> __lll_lock_wait_private (int *futex)
>>> {
>>> - if (*futex == 2)
>>> - lll_futex_wait (futex, 2, LLL_PRIVATE); /* Wait if *futex == 2. */
>>> -
>>> - while (atomic_exchange_acq (futex, 2) != 0)
>>> - lll_futex_wait (futex, 2, LLL_PRIVATE); /* Wait if *futex == 2. */
OK.
>>> + if (atomic_load_relaxed (futex) == 2)
>>> + goto futex;
>>> +
>>> + while (atomic_exchange_acquire (futex, 2) != 0)
>>> + {
>>> + futex:
>>> + LIBC_PROBE (lll_lock_wait_private, 1, futex);
>>> + lll_futex_wait (futex, 2, LLL_PRIVATE); /* Wait if *futex == 2. */
OK.
>>> + }
>>> }
>>>
>>>
>>> @@ -39,10 +42,14 @@ __lll_lock_wait_private (int *futex)
>>> void
>>> __lll_lock_wait (int *futex, int private)
>>> {
>>> - if (*futex == 2)
>>> - lll_futex_wait (futex, 2, private); /* Wait if *futex == 2. */
>>> -
>>> - while (atomic_exchange_acq (futex, 2) != 0)
>>> - lll_futex_wait (futex, 2, private); /* Wait if *futex == 2. */
>>> + if (atomic_load_relaxed (futex) == 2)
>>> + goto futex;
OK.
>>> +
>>> + while (atomic_exchange_acquire (futex, 2) != 0)
>>> + {
>>> + futex:
>>> + LIBC_PROBE (lll_lock_wait, 1, futex);
>>> + lll_futex_wait (futex, 2, private); /* Wait if *futex == 2. */
OK.
>>> + }
>>> }
>>> #endif
>>>
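To summarize the new control flow outside of diff context, a standalone
sketch (names are hypothetical; fake_futex_wait is the same stand-in used
earlier in this reply): the fast path jumps into the loop body, so only a
single wait call site is emitted, and the exchange re-checks the lock word
after every wake-up:

long fake_futex_wait (int *futexp, int val);  /* hypothetical stand-in */

void
lock_wait_new_shape (int *futex)
{
  /* Fast path: the lock word is already marked contended (2), so go
     straight to the wait.  */
  if (__atomic_load_n (futex, __ATOMIC_RELAXED) == 2)
    goto wait;

  /* Mark the lock contended; stop once the previous value was 0 (free).  */
  while (__atomic_exchange_n (futex, 2, __ATOMIC_ACQUIRE) != 0)
    {
    wait:
      fake_futex_wait (futex, 2);   /* single wait call site */
    }
}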
--
Cheers,
Carlos.