This is the mail archive of the libc-alpha@sourceware.org mailing list for the glibc project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

Re: Performance of global access versus thread local


On 25 September 2013 22:38, Roland McGrath <roland@hack.frob.com> wrote:

Hi Roland,

> If you're going to use a DSO like that, you should use LD_BIND_NOW=1 to
> keep startup overhead out of your measured loops.  There is no real need to
> use a DSO though.  I'm guessing you did so just to make sure the tested
> accesses were the PIC flavors.  You can just compile the main program with
> -fPIC for that.

Thanks, I updated the test as attached. Built with:

#  gcc -fPIC -O2 main.c -o main -lrt

> For the global case, it would be a hidden global within libc itself.
> So you need the  __attribute__ ((visibility ("hidden"))) variant to
> be representative of what the accesses inside libc would do.

I changed this too.

> It's probably better to write the two accesses by hand in assembly--or at
> least show us the disassembly of what you compiled--to be sure they are
> really representative of what the special-case assembly access in libc
> would do.

The numbers I get now are more as expected, although I am still not
sure why the x86_64 TLS version is faster:

x86_64:

TLS ticks per 1000 loops: 0.0000075703 Global ticks per 1000 loops: 0.0000087879

0000000000400790 <tls_access>:
  400790:       48 c7 c0 fc ff ff ff    mov    $0xfffffffffffffffc,%rax
  400797:       64 8b 00                mov    %fs:(%rax),%eax
  40079a:       c3                      retq

00000000004007b0 <global_access>:
  4007b0:       8b 05 8a 08 20 00       mov    0x20088a(%rip),%eax        # 601040 <__TMC_END__>
  4007b6:       c3                      retq

arm:

TLS ticks per 1000 loops: 0.0000043694 Global ticks per 1000 loops: 0.0000035409

000085ec <tls_access>:
    85ec:       4b03            ldr     r3, [pc, #12]   ; (85fc <tls_access+0x10>)
    85ee:       ee1d 2f70       mrc     15, 0, r2, cr13, cr0, {3}
    85f2:       447b            add     r3, pc
    85f4:       681b            ldr     r3, [r3, #0]
    85f6:       58d0            ldr     r0, [r2, r3]
    85f8:       4770            bx      lr
    85fa:       bf00            nop
    85fc:       00008a2a        .word   0x00008a2a

00008614 <global_access>:
    8614:       4b01            ldr     r3, [pc, #4]    ; (861c <global_access+0x8>)
    8616:       447b            add     r3, pc
    8618:       6818            ldr     r0, [r3, #0]
    861a:       4770            bx      lr
    861c:       00008a1a        .word   0x00008a1a


-- 
Will Newton
Toolchain Working Group, Linaro
#include <stdio.h>
#include <time.h>

#define LOOPS 10000000

/* Thread-local variable read by tls_access() and written by set_tls().
   The tls_model attribute requests the "initial-exec" TLS access model.  */
static __thread int tlsvar __attribute__((tls_model ("initial-exec")));
/* Ordinary global variable read by global_access(), declared with hidden
   symbol visibility.  */
int globalvar __attribute__ ((visibility ("hidden")));
/* Disabled variant of the declaration above without the visibility
   attribute.  */
//int globalvar;

/* Read the thread-local variable tlsvar and return its current value.  */
int tls_access(void)
{
  const int current = tlsvar;

  return current;
}

/* Store V into the thread-local variable tlsvar.  */
void set_tls(int v)
{
  int *slot = &tlsvar;

  *slot = v;
}

/* Read the global variable globalvar and return its current value.  */
int global_access(void)
{
  const int current = globalvar;

  return current;
}

/* Read CLOCK_PROCESS_CPUTIME_ID into *TS.  Returns 0 on success; on
   failure prints a diagnostic with perror and returns -1, so the caller
   never computes with an unset timespec.  */
static int read_cpu_clock(struct timespec *ts)
{
  if (clock_gettime(CLOCK_PROCESS_CPUTIME_ID, ts) != 0)
    {
      perror("clock_gettime");
      return -1;
    }
  return 0;
}

/* Return the interval from *START to *END in seconds.  */
static double elapsed_seconds(const struct timespec *start,
                              const struct timespec *end)
{
  return (end->tv_sec - start->tv_sec) + (end->tv_nsec - start->tv_nsec) * 1e-9;
}

/* Time LOOPS calls of tls_access() and LOOPS calls of global_access() and
   print the elapsed time per 1000 calls for each.  Returns 0 on success
   and 1 if the clock could not be read.  */
int main(void)
{
  struct timespec start, end;
  unsigned int i, loops = LOOPS;
  double tls_elapsed, global_elapsed;

  /* NOTE(review): the return values of the timed calls are discarded, so
     the measurement is only meaningful if the compiler keeps the calls in
     the loops -- confirm against the disassembly of the build.  */
  if (read_cpu_clock(&start) != 0)
    return 1;
  for (i = 0; i < LOOPS; i++)
    {
      tls_access();
    }
  if (read_cpu_clock(&end) != 0)
    return 1;

  tls_elapsed = elapsed_seconds(&start, &end);

  if (read_cpu_clock(&start) != 0)
    return 1;
  for (i = 0; i < LOOPS; i++)
    {
      global_access();
    }
  if (read_cpu_clock(&end) != 0)
    return 1;

  global_elapsed = elapsed_seconds(&start, &end);

  /* The elapsed values are in seconds, so the figures printed here are
     seconds per 1000 iterations even though the label says "ticks".  */
  printf("TLS ticks per 1000 loops: %.10f Global ticks per 1000 loops: %.10f\n",
         (tls_elapsed / loops) * 1000, (global_elapsed / loops) * 1000);

  return 0;
}

Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]