This is the mail archive of the
libc-alpha@sourceware.org
mailing list for the glibc project.
Re: [PATCH 2/4] Small optimization for lowlevellock
- From: Adhemerval Zanella <adhemerval dot zanella at linaro dot org>
- To: libc-alpha at sourceware dot org
- Date: Thu, 9 May 2019 15:25:45 -0300
- Subject: Re: [PATCH 2/4] Small optimization for lowlevellock
- References: <20190222192703.18177-1-adhemerval.zanella@linaro.org> <20190222192703.18177-2-adhemerval.zanella@linaro.org> <d928bb39-fabc-89cd-0e22-165cf5df1916@linaro.org>
Ping (x2). The third and forth part are already approved.
On 09/04/2019 09:11, Adhemerval Zanella wrote:
> Ping.
>
> On 22/02/2019 16:27, Adhemerval Zanella wrote:
>> This patch optimizes both __lll_lock_wait_private and __lll_lock_wait
>> by issuing only one lll_futex_wait. Since it is defined as an inlined
>> syscall and inlined syscalls are defined using inlined assembly the
>> compiler usually can not see both calls are equal and optimize
>> accordingly.
>>
>> On aarch64 the resulting binary is change from:
>>
>> 0000000000000060 <__lll_lock_wait>:
>> 60: 2a0103e5 mov w5, w1
>> 64: b9400001 ldr w1, [x0]
>> 68: aa0003e4 mov x4, x0
>> 6c: 7100083f cmp w1, #0x2
>> 70: 540000e1 b.ne 8c <__lll_lock_wait+0x2c> // b.any
>> 74: 521900a1 eor w1, w5, #0x80
>> 78: d2800042 mov x2, #0x2 // #2
>> 7c: 93407c21 sxtw x1, w1
>> 80: d2800003 mov x3, #0x0 // #0
>> 84: d2800c48 mov x8, #0x62 // #98
>> 88: d4000001 svc #0x0
>> 8c: 521900a5 eor w5, w5, #0x80
>> 90: 52800046 mov w6, #0x2 // #2
>> 94: 93407ca5 sxtw x5, w5
>> 98: 14000008 b b8 <__lll_lock_wait+0x58>
>> 9c: d503201f nop
>> a0: aa0403e0 mov x0, x4
>> a4: aa0503e1 mov x1, x5
>> a8: d2800042 mov x2, #0x2 // #2
>> ac: d2800003 mov x3, #0x0 // #0
>> b0: d2800c48 mov x8, #0x62 // #98
>> b4: d4000001 svc #0x0
>> b8: 885ffc80 ldaxr w0, [x4]
>> bc: 88017c86 stxr w1, w6, [x4]
>> c0: 35ffffc1 cbnz w1, b8 <__lll_lock_wait+0x58>
>> c4: 35fffee0 cbnz w0, a0 <__lll_lock_wait+0x40>
>> c8: d65f03c0 ret
>>
>> To:
>>
>> 0000000000000048 <__lll_lock_wait>:
>> 48: aa0003e4 mov x4, x0
>> 4c: 2a0103e5 mov w5, w1
>> 50: b9400000 ldr w0, [x0]
>> 54: 7100081f cmp w0, #0x2
>> 58: 540000c0 b.eq 70 <__lll_lock_wait+0x28> // b.none
>> 5c: 52800041 mov w1, #0x2 // #2
>> 60: 885ffc80 ldaxr w0, [x4]
>> 64: 88027c81 stxr w2, w1, [x4]
>> 68: 35ffffc2 cbnz w2, 60 <__lll_lock_wait+0x18>
>> 6c: 34000120 cbz w0, 90 <__lll_lock_wait+0x48>
>> 70: 521900a1 eor w1, w5, #0x80
>> 74: aa0403e0 mov x0, x4
>> 78: 93407c21 sxtw x1, w1
>> 7c: d2800042 mov x2, #0x2 // #2
>> 80: d2800003 mov x3, #0x0 // #0
>> 84: d2800c48 mov x8, #0x62 // #98
>> 88: d4000001 svc #0x0
>> 8c: 17fffff4 b 5c <__lll_lock_wait+0x14>
>> 90: d65f03c0 ret
>>
>> I see similar changes on powerpc and other architectures. It also aligns
>> with x86_64 implementation by adding the systemtap probes.
>>
>> Checker on aarch64-linux-gnu.
>>
>> * nptl/lowlevellock.c (__lll_lock_wait, __lll_lock_wait_private):
>> Optimize futex call and add systemtap probe.
>> ---
>> nptl/lowlevellock.c | 31 +++++++++++++++++++------------
>> 1 file changed, 19 insertions(+), 12 deletions(-)
>>
>> diff --git a/nptl/lowlevellock.c b/nptl/lowlevellock.c
>> index 5eaa3807ea..47548ff121 100644
>> --- a/nptl/lowlevellock.c
>> +++ b/nptl/lowlevellock.c
>> @@ -17,20 +17,23 @@
>> License along with the GNU C Library; if not, see
>> <http://www.gnu.org/licenses/>. */
>>
>> -#include <errno.h>
>> #include <sysdep.h>
>> #include <lowlevellock.h>
>> -#include <sys/time.h>
>> #include <atomic.h>
>> +#include <stap-probe.h>
>>
>> void
>> __lll_lock_wait_private (int *futex)
>> {
>> - if (*futex == 2)
>> - lll_futex_wait (futex, 2, LLL_PRIVATE); /* Wait if *futex == 2. */
>> -
>> - while (atomic_exchange_acq (futex, 2) != 0)
>> - lll_futex_wait (futex, 2, LLL_PRIVATE); /* Wait if *futex == 2. */
>> + if (atomic_load_relaxed (futex) == 2)
>> + goto futex;
>> +
>> + while (atomic_exchange_acquire (futex, 2) != 0)
>> + {
>> + futex:
>> + LIBC_PROBE (lll_lock_wait_private, 1, futex);
>> + lll_futex_wait (futex, 2, LLL_PRIVATE); /* Wait if *futex == 2. */
>> + }
>> }
>>
>>
>> @@ -39,10 +42,14 @@ __lll_lock_wait_private (int *futex)
>> void
>> __lll_lock_wait (int *futex, int private)
>> {
>> - if (*futex == 2)
>> - lll_futex_wait (futex, 2, private); /* Wait if *futex == 2. */
>> -
>> - while (atomic_exchange_acq (futex, 2) != 0)
>> - lll_futex_wait (futex, 2, private); /* Wait if *futex == 2. */
>> + if (atomic_load_relaxed (futex) == 2)
>> + goto futex;
>> +
>> + while (atomic_exchange_acquire (futex, 2) != 0)
>> + {
>> + futex:
>> + LIBC_PROBE (lll_lock_wait, 1, futex);
>> + lll_futex_wait (futex, 2, private); /* Wait if *futex == 2. */
>> + }
>> }
>> #endif
>>