This is the mail archive of the libc-alpha@sourceware.org mailing list for the glibc project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

Re: [PATCH 2/4] Small optimization for lowlevellock


Ping (x2). The third and forth part are already approved.

On 09/04/2019 09:11, Adhemerval Zanella wrote:
> Ping.
> 
> On 22/02/2019 16:27, Adhemerval Zanella wrote:
>> This patch optimizes both __lll_lock_wait_private and __lll_lock_wait
>> by issuing only one lll_futex_wait.  Since it is defined as an inlined
>> syscall and inlined syscalls are defined using inlined assembly the
>> compiler usually can not see both calls are equal and optimize
>> accordingly.
>>
>> On aarch64 the resulting binary is change from:
>>
>> 0000000000000060 <__lll_lock_wait>:
>>   60:   2a0103e5        mov     w5, w1
>>   64:   b9400001        ldr     w1, [x0]
>>   68:   aa0003e4        mov     x4, x0
>>   6c:   7100083f        cmp     w1, #0x2
>>   70:   540000e1        b.ne    8c <__lll_lock_wait+0x2c>  // b.any
>>   74:   521900a1        eor     w1, w5, #0x80
>>   78:   d2800042        mov     x2, #0x2                        // #2
>>   7c:   93407c21        sxtw    x1, w1
>>   80:   d2800003        mov     x3, #0x0                        // #0
>>   84:   d2800c48        mov     x8, #0x62                       // #98
>>   88:   d4000001        svc     #0x0
>>   8c:   521900a5        eor     w5, w5, #0x80
>>   90:   52800046        mov     w6, #0x2                        // #2
>>   94:   93407ca5        sxtw    x5, w5
>>   98:   14000008        b       b8 <__lll_lock_wait+0x58>
>>   9c:   d503201f        nop
>>   a0:   aa0403e0        mov     x0, x4
>>   a4:   aa0503e1        mov     x1, x5
>>   a8:   d2800042        mov     x2, #0x2                        // #2
>>   ac:   d2800003        mov     x3, #0x0                        // #0
>>   b0:   d2800c48        mov     x8, #0x62                       // #98
>>   b4:   d4000001        svc     #0x0
>>   b8:   885ffc80        ldaxr   w0, [x4]
>>   bc:   88017c86        stxr    w1, w6, [x4]
>>   c0:   35ffffc1        cbnz    w1, b8 <__lll_lock_wait+0x58>
>>   c4:   35fffee0        cbnz    w0, a0 <__lll_lock_wait+0x40>
>>   c8:   d65f03c0        ret
>>
>> To:
>>
>> 0000000000000048 <__lll_lock_wait>:
>>   48:   aa0003e4        mov     x4, x0
>>   4c:   2a0103e5        mov     w5, w1
>>   50:   b9400000        ldr     w0, [x0]
>>   54:   7100081f        cmp     w0, #0x2
>>   58:   540000c0        b.eq    70 <__lll_lock_wait+0x28>  // b.none
>>   5c:   52800041        mov     w1, #0x2                        // #2
>>   60:   885ffc80        ldaxr   w0, [x4]
>>   64:   88027c81        stxr    w2, w1, [x4]
>>   68:   35ffffc2        cbnz    w2, 60 <__lll_lock_wait+0x18>
>>   6c:   34000120        cbz     w0, 90 <__lll_lock_wait+0x48>
>>   70:   521900a1        eor     w1, w5, #0x80
>>   74:   aa0403e0        mov     x0, x4
>>   78:   93407c21        sxtw    x1, w1
>>   7c:   d2800042        mov     x2, #0x2                        // #2
>>   80:   d2800003        mov     x3, #0x0                        // #0
>>   84:   d2800c48        mov     x8, #0x62                       // #98
>>   88:   d4000001        svc     #0x0
>>   8c:   17fffff4        b       5c <__lll_lock_wait+0x14>
>>   90:   d65f03c0        ret
>>
>> I see similar changes on powerpc and other architectures.  It also aligns
>> with x86_64 implementation by adding the systemtap probes.
>>
>> Checker on aarch64-linux-gnu.
>>
>> 	* nptl/lowlevellock.c (__lll_lock_wait, __lll_lock_wait_private):
>> 	Optimize futex call and add systemtap probe.
>> ---
>>  nptl/lowlevellock.c | 31 +++++++++++++++++++------------
>>  1 file changed, 19 insertions(+), 12 deletions(-)
>>
>> diff --git a/nptl/lowlevellock.c b/nptl/lowlevellock.c
>> index 5eaa3807ea..47548ff121 100644
>> --- a/nptl/lowlevellock.c
>> +++ b/nptl/lowlevellock.c
>> @@ -17,20 +17,23 @@
>>     License along with the GNU C Library; if not, see
>>     <http://www.gnu.org/licenses/>.  */
>>  
>> -#include <errno.h>
>>  #include <sysdep.h>
>>  #include <lowlevellock.h>
>> -#include <sys/time.h>
>>  #include <atomic.h>
>> +#include <stap-probe.h>
>>  
>>  void
>>  __lll_lock_wait_private (int *futex)
>>  {
>> -  if (*futex == 2)
>> -    lll_futex_wait (futex, 2, LLL_PRIVATE); /* Wait if *futex == 2.  */
>> -
>> -  while (atomic_exchange_acq (futex, 2) != 0)
>> -    lll_futex_wait (futex, 2, LLL_PRIVATE); /* Wait if *futex == 2.  */
>> +  if (atomic_load_relaxed (futex) == 2)
>> +    goto futex;
>> +
>> +  while (atomic_exchange_acquire (futex, 2) != 0)
>> +    {
>> +    futex:
>> +      LIBC_PROBE (lll_lock_wait_private, 1, futex);
>> +      lll_futex_wait (futex, 2, LLL_PRIVATE); /* Wait if *futex == 2.  */
>> +    }
>>  }
>>  
>>  
>> @@ -39,10 +42,14 @@ __lll_lock_wait_private (int *futex)
>>  void
>>  __lll_lock_wait (int *futex, int private)
>>  {
>> -  if (*futex == 2)
>> -    lll_futex_wait (futex, 2, private); /* Wait if *futex == 2.  */
>> -
>> -  while (atomic_exchange_acq (futex, 2) != 0)
>> -    lll_futex_wait (futex, 2, private); /* Wait if *futex == 2.  */
>> +  if (atomic_load_relaxed (futex) == 2)
>> +    goto futex;
>> +
>> +  while (atomic_exchange_acquire (futex, 2) != 0)
>> +    {
>> +    futex:
>> +      LIBC_PROBE (lll_lock_wait, 1, futex);
>> +      lll_futex_wait (futex, 2, private); /* Wait if *futex == 2.  */
>> +    }
>>  }
>>  #endif
>>


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]