GNU C Library master sources branch hjl/numa-spinlock/master created. glibc-2.28.9000-369-g1d18415

hjl@sourceware.org hjl@sourceware.org
Thu Dec 6 21:39:00 GMT 2018


This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "GNU C Library master sources".

The branch, hjl/numa-spinlock/master has been created
        at  1d184158efaf326d440742aefe5f1dc9aca17e8e (commit)

- Log -----------------------------------------------------------------
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=1d184158efaf326d440742aefe5f1dc9aca17e8e

commit 1d184158efaf326d440742aefe5f1dc9aca17e8e
Author: Kemi Wang <kemi.wang@intel.com>
Date:   Thu Dec 6 14:06:00 2018 +0800

    add hashwork for spinlock
    
    Signed-off-by: Kemi Wang <kemi.wang@intel.com>

diff --git a/sysdeps/x86/nptl/Makefile b/sysdeps/x86/nptl/Makefile
index ed9c3d9..497cf2c 100644
--- a/sysdeps/x86/nptl/Makefile
+++ b/sysdeps/x86/nptl/Makefile
@@ -16,7 +16,7 @@
 # <http://www.gnu.org/licenses/>.
 
 ifeq ($(subdir),numa-spinlock)
-libnuma_spinlock-tests += tst-spinlock-overhead tst-numa-spinlock-overhead
+libnuma_spinlock-tests += tst-spinlock-overhead tst-numa-spinlock-overhead tst-spinlock-overhead-hashwork tst-numa-spinlock-overhead-hashwork
 
 ifeq (yes,$(build-shared))
 $(addprefix $(objpfx),$(libnuma_spinlock-tests)): $(shared-thread-library)
diff --git a/sysdeps/x86/nptl/tst-numa-spinlock-overhead-hashwork.c b/sysdeps/x86/nptl/tst-numa-spinlock-overhead-hashwork.c
new file mode 100644
index 0000000..60d31b2
--- /dev/null
+++ b/sysdeps/x86/nptl/tst-numa-spinlock-overhead-hashwork.c
@@ -0,0 +1,358 @@
+#ifndef _GNU_SOURCE
+# define _GNU_SOURCE
+#endif
+#include <unistd.h>
+#include <stdio.h>
+#include <pthread.h>
+#include <sched.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdint.h>
+#include <sys/time.h>
+#include <sys/param.h>
+#include <errno.h>
+#include <numa-spinlock/numa-spinlock.h>
+
+#define memory_barrier() __asm ("" ::: "memory")
+#define pause() __asm  ("rep ; nop" ::: "memory")
+
+#define CACHELINE_SIZE	64
+#define CACHE_ALIGNED	__attribute__((aligned(CACHELINE_SIZE)))
+
+struct count
+{
+  unsigned long long total;
+} __attribute__((aligned(128)));
+
+struct count *gcount;
+
+/* The time consumed by one update is about 200 TSCs.  */
+static int delay_time_unlocked = 400;
+
+struct numa_spinlock *lock;
+
+#define HASHBUF 20
+
+struct SHA1_CTX
+{
+    uint32_t state[5];
+    uint32_t count[2];
+    uint8_t  buffer[64];
+};
+
+typedef struct SHA1_CTX SHA1_CTX;
+
+struct ops
+{
+  void *(*test) (void *arg);
+} *ops;
+
+void *work_thread (void *arg);
+
+void test_threads (int numthreads);
+
+#define iterations (10000 * 5)
+
+static volatile int start_thread;
+
+/* Spin, issuing PAUSE between polls, until at least N ticks of the
+   counter read by RDTSCP have passed since entry.  */
+static void
+delay_tsc (unsigned n)
+{
+  unsigned int tsc_aux;
+  const unsigned long long begin = __builtin_ia32_rdtscp (&tsc_aux);
+
+  for (;;)
+    {
+      unsigned long long elapsed = __builtin_ia32_rdtscp (&tsc_aux) - begin;
+
+      if (elapsed >= n)
+	break;
+      pause ();
+    }
+}
+
+static void
+wait_a_bit (int delay_time)
+{
+  if (delay_time > 0)
+    delay_tsc (delay_time);
+}
+
+/* Hashwork */
+/* SHA-1 in C, By Steve Reid <sreid@sea-to-sky.net> 100% Public Domain.  */
+
+#define rol rotl32
+static inline __attribute__((always_inline)) uint32_t rotl32 ( uint32_t x, int8_t r )
+{
+	  return (x << r) | (x >> (32 - r));
+}
+/* blk0() and blk() perform the initial expand. */
+/* I got the idea of expanding during the round function from SSLeay */
+/* FIXME: can we do this in an endian-proof way? */
+
+#ifdef WORDS_BIGENDIAN
+#define blk0(i) block->l[i]
+#else
+#define blk0(i) (block->l[i] = (rol(block->l[i],24)&0xFF00FF00) | (rol(block->l[i],8)&0x00FF00FF))
+#endif
+#define blk(i) (block->l[i&15] = rol(block->l[(i+13)&15]^block->l[(i+8)&15] ^ block->l[(i+2)&15]^block->l[i&15],1))
+
+/* (R0+R1), R2, R3, R4 are the different operations used in SHA1 */
+#define R0(v,w,x,y,z,i) z+=((w&(x^y))^y)+blk0(i)+0x5A827999+rol(v,5);w=rol(w,30);
+#define R1(v,w,x,y,z,i) z+=((w&(x^y))^y)+blk(i)+0x5A827999+rol(v,5);w=rol(w,30);
+#define R2(v,w,x,y,z,i) z+=(w^x^y)+blk(i)+0x6ED9EBA1+rol(v,5);w=rol(w,30);
+#define R3(v,w,x,y,z,i) z+=(((w|x)&y)|(w&x))+blk(i)+0x8F1BBCDC+rol(v,5);w=rol(w,30);
+#define R4(v,w,x,y,z,i) z+=(w^x^y)+blk(i)+0xCA62C1D6+rol(v,5);w=rol(w,30);
+
+/* Hash a single 512-bit block. This is the core of the algorithm. */
+static void SHA1_Transform(uint32_t state[5], const uint8_t buffer[64])
+{
+    uint32_t a, b, c, d, e;
+    typedef union {
+        uint8_t c[64];
+        uint32_t l[16];
+    } CHAR64LONG16;
+    CHAR64LONG16* block;
+
+    block = (CHAR64LONG16*)buffer;
+
+    /* Copy context->state[] to working vars */
+    a = state[0];
+    b = state[1];
+    c = state[2];
+    d = state[3];
+    e = state[4];
+
+    /* 4 rounds of 20 operations each. Loop unrolled. */
+    R0(a,b,c,d,e, 0); R0(e,a,b,c,d, 1); R0(d,e,a,b,c, 2); R0(c,d,e,a,b, 3);
+    R0(b,c,d,e,a, 4); R0(a,b,c,d,e, 5); R0(e,a,b,c,d, 6); R0(d,e,a,b,c, 7);
+    R0(c,d,e,a,b, 8); R0(b,c,d,e,a, 9); R0(a,b,c,d,e,10); R0(e,a,b,c,d,11);
+    R0(d,e,a,b,c,12); R0(c,d,e,a,b,13); R0(b,c,d,e,a,14); R0(a,b,c,d,e,15);
+    R1(e,a,b,c,d,16); R1(d,e,a,b,c,17); R1(c,d,e,a,b,18); R1(b,c,d,e,a,19);
+    R2(a,b,c,d,e,20); R2(e,a,b,c,d,21); R2(d,e,a,b,c,22); R2(c,d,e,a,b,23);
+    R2(b,c,d,e,a,24); R2(a,b,c,d,e,25); R2(e,a,b,c,d,26); R2(d,e,a,b,c,27);
+    R2(c,d,e,a,b,28); R2(b,c,d,e,a,29); R2(a,b,c,d,e,30); R2(e,a,b,c,d,31);
+    R2(d,e,a,b,c,32); R2(c,d,e,a,b,33); R2(b,c,d,e,a,34); R2(a,b,c,d,e,35);
+    R2(e,a,b,c,d,36); R2(d,e,a,b,c,37); R2(c,d,e,a,b,38); R2(b,c,d,e,a,39);
+    R3(a,b,c,d,e,40); R3(e,a,b,c,d,41); R3(d,e,a,b,c,42); R3(c,d,e,a,b,43);
+    R3(b,c,d,e,a,44); R3(a,b,c,d,e,45); R3(e,a,b,c,d,46); R3(d,e,a,b,c,47);
+    R3(c,d,e,a,b,48); R3(b,c,d,e,a,49); R3(a,b,c,d,e,50); R3(e,a,b,c,d,51);
+    R3(d,e,a,b,c,52); R3(c,d,e,a,b,53); R3(b,c,d,e,a,54); R3(a,b,c,d,e,55);
+    R3(e,a,b,c,d,56); R3(d,e,a,b,c,57); R3(c,d,e,a,b,58); R3(b,c,d,e,a,59);
+    R4(a,b,c,d,e,60); R4(e,a,b,c,d,61); R4(d,e,a,b,c,62); R4(c,d,e,a,b,63);
+    R4(b,c,d,e,a,64); R4(a,b,c,d,e,65); R4(e,a,b,c,d,66); R4(d,e,a,b,c,67);
+    R4(c,d,e,a,b,68); R4(b,c,d,e,a,69); R4(a,b,c,d,e,70); R4(e,a,b,c,d,71);
+    R4(d,e,a,b,c,72); R4(c,d,e,a,b,73); R4(b,c,d,e,a,74); R4(a,b,c,d,e,75);
+    R4(e,a,b,c,d,76); R4(d,e,a,b,c,77); R4(c,d,e,a,b,78); R4(b,c,d,e,a,79);
+
+    /* Add the working vars back into context.state[] */
+    state[0] += a;
+    state[1] += b;
+    state[2] += c;
+    state[3] += d;
+    state[4] += e;
+
+    /* Wipe variables */
+    a = b = c = d = e = 0;
+}
+
+/* Load the SHA-1 initialization constants into CONTEXT and clear both
+   count words.  context->buffer is left as it was.  */
+static void SHA1_Init(SHA1_CTX* context)
+{
+    static const uint32_t initial_state[5] = {
+        0x67452301, 0xEFCDAB89, 0x98BADCFE, 0x10325476, 0xC3D2E1F0
+    };
+
+    memcpy(context->state, initial_state, sizeof initial_state);
+    context->count[0] = 0;
+    context->count[1] = 0;
+}
+
+/* Feed LEN bytes at DATA into CONTEXT, hashing each completed block.  */
+void SHA1_Update(SHA1_CTX* context, const uint8_t* data, const size_t len)
+{
+    size_t i = 0;
+    size_t j = (context->count[0] >> 3) & 63;
+    /* Truncate first so the carry test compares 32-bit quantities.  */
+    const uint32_t low_bits = (uint32_t) (len << 3);
+
+    if ((context->count[0] += low_bits) < low_bits) context->count[1]++;
+    context->count[1] += (len >> 29);
+
+    if ((j + len) > 63)
+    {
+        memcpy(&context->buffer[j], data, (i = 64-j));
+        SHA1_Transform(context->state, context->buffer);
+        /* SHA1_Transform rewrites its block in place: never pass DATA.  */
+        for ( ; i + 63 < len; i += 64)
+        {
+            memcpy(context->buffer, data + i, 64);
+            SHA1_Transform(context->state, context->buffer);
+        }
+        j = 0;
+    }
+    memcpy(&context->buffer[j], &data[i], len - i);
+}
+
+struct work_todo_argument
+{
+  SHA1_CTX *v1;
+  char *v2;
+  int v3;
+};
+
+static void *
+work_todo (void *v)
+{
+  struct work_todo_argument *p = v;
+  SHA1_Update (p->v1, (uint8_t *)p->v2, p->v3);
+  return NULL;
+}
+
+void *
+work_thread (void *arg)
+{
+  long i;
+  unsigned long pid = (unsigned long) arg;
+  unsigned long long start, end;
+  struct work_todo_argument work_todo_arg;
+  struct numa_spinlock_info lock_info;
+  
+  if (init_numa_spinlock_info (lock, &lock_info))
+    {
+      printf ("init_numa_spinlock_info failure: %m\n");
+      exit (1);
+    }
+  char buf[HASHBUF];
+  SHA1_CTX ctx;
+
+  memset (buf, 0x1, HASHBUF);
+  SHA1_Init (&ctx);
+
+  work_todo_arg.v1 = &ctx;
+  work_todo_arg.v2 = buf;
+  work_todo_arg.v3 = HASHBUF;
+  lock_info.argument = &work_todo_arg;
+  lock_info.workload = work_todo;
+
+  while (!start_thread)
+    pause ();
+  unsigned int aux;
+  start = __builtin_ia32_rdtscp (&aux);
+  for (i = 0; i < iterations; i++)
+    {
+      apply_numa_spinlock (lock, &lock_info);
+      wait_a_bit (delay_time_unlocked);
+    }
+  end = __builtin_ia32_rdtscp (&aux);
+  gcount[pid].total = end - start;
+
+  return NULL;
+}
+
+/* Run NUMTHREADS pinned workers (worker I on CPU I) and join them.  */
+void
+test_threads (int numthreads)
+{
+  pthread_t thr[numthreads];
+  /* Re-arm the start gate; it is still 1 after a previous round.  */
+  start_thread = 0;
+  lock = init_numa_spinlock ();
+  memory_barrier ();
+  for (int i = 0; i < numthreads; i++)
+    {
+      pthread_attr_t attr;
+      cpu_set_t set;
+      pthread_attr_init (&attr);
+      CPU_ZERO (&set);
+      (void)CPU_SET (i, &set);
+      pthread_attr_setaffinity_np (&attr, sizeof(cpu_set_t), &set);
+      pthread_create (&thr[i], &attr, ops->test, (void *)(long)i);
+      pthread_attr_destroy (&attr);
+    }
+
+  memory_barrier ();
+  start_thread = 1;
+  memory_barrier ();
+  sched_yield ();
+  for (int i = 0; i < numthreads; i++)
+    pthread_join (thr[i], NULL);
+}
+
+struct ops hashwork_ops =
+{
+  .test = work_thread,
+};
+
+struct ops *ops;
+
+/* Run one round with NUMTHREADS workers and return the sum of the
+   totals found in gcount[0 .. NUMTHREADS-1] afterwards.  */
+static struct count
+total_cost (int numthreads)
+{
+  struct count sum = { 0 };
+
+  memset (gcount, 0, sizeof(gcount[0]) * numthreads);
+
+  test_threads (numthreads);
+
+  for (int t = 0; t < numthreads; t++)
+    sum.total += gcount[t].total;
+
+  return sum;
+}
+
+/* Benchmark driver.  Measures the cost of one thread, then prints the
+   summed time and the per-thread overhead relative to that baseline
+   for growing thread counts.  Returns 1 without measuring when fewer
+   than 8 processors are online.  */
+int
+main (void)
+{
+  int numthreads = sysconf (_SC_NPROCESSORS_ONLN);
+  if (numthreads < 8)
+    return 1;
+
+  ops = &hashwork_ops;
+
+  int err_ret = posix_memalign ((void **)&gcount, 4096,
+				sizeof(gcount[0]) * numthreads);
+  if (err_ret)
+    {
+      printf ("posix_memalign failure: %s\n", strerror (err_ret));
+      exit (err_ret);
+    }
+
+  struct count cost, cost1;
+  double overhead;
+  int i, next;
+  cost1 = total_cost (1);
+  /* Thread counts double from 2 while the double stays below NUMTHREADS,
+     then step by 16; the last round always uses exactly NUMTHREADS.  */
+  for (i = 2;; i = next)
+    {
+      cost = total_cost (i);
+      /* Mean per-thread time as a multiple of the one-thread time.  */
+      overhead = cost.total;
+      overhead /= i;
+      overhead /= cost1.total;
+      /* cost.total is unsigned long long, hence %llu.  */
+      printf ("Number of threads: %4d, Total time %14llu, Overhead: %.2f\n",
+	      i, cost.total, overhead);
+      if (i == numthreads)
+	break;
+      if ((i * 2) < numthreads)
+	next = i * 2;
+      else
+	next = i + 16;
+      if (next > numthreads)
+	next = numthreads;
+    }
+
+  free (gcount);
+  return 0;
+}
diff --git a/sysdeps/x86/nptl/tst-spinlock-overhead-hashwork.c b/sysdeps/x86/nptl/tst-spinlock-overhead-hashwork.c
new file mode 100644
index 0000000..a99b440
--- /dev/null
+++ b/sysdeps/x86/nptl/tst-spinlock-overhead-hashwork.c
@@ -0,0 +1,346 @@
+#ifndef _GNU_SOURCE
+# define _GNU_SOURCE
+#endif
+#include <unistd.h>
+#include <stdio.h>
+#include <pthread.h>
+#include <sched.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdint.h>
+#include <sys/time.h>
+#include <sys/param.h>
+#include <errno.h>
+
+#define memory_barrier() __asm ("" ::: "memory")
+#define pause() __asm  ("rep ; nop" ::: "memory")
+
+#define CACHELINE_SIZE	64
+#define CACHE_ALIGNED	__attribute__((aligned(CACHELINE_SIZE)))
+
+struct count
+{
+  unsigned long long total;
+} __attribute__((aligned(128)));
+
+struct count *gcount;
+
+/* The time consumed by one update is about 200 TSCs.  */
+static int delay_time_unlocked = 400;
+
+struct
+{
+  pthread_spinlock_t testlock;
+  char pad[64];
+} test CACHE_ALIGNED;
+
+#define lock() pthread_spin_lock(&test.testlock)
+#define unlock() pthread_spin_unlock(&test.testlock)
+
+static void
+__attribute__((constructor))
+init_spin (void)
+{
+  pthread_spin_init (&test.testlock, 0);
+}
+
+#define HASHBUF 20
+
+struct SHA1_CTX
+{
+    uint32_t state[5];
+    uint32_t count[2];
+    uint8_t  buffer[64];
+};
+
+typedef struct SHA1_CTX SHA1_CTX;
+
+struct ops
+{
+  void *(*test) (void *arg);
+} *ops;
+
+void *work_thread (void *arg);
+
+void test_threads (int numthreads);
+
+#define iterations (10000 * 5)
+
+static volatile int start_thread;
+
+/* Spin, issuing PAUSE between polls, until at least N ticks of the
+   counter read by RDTSCP have passed since entry.  */
+static void
+delay_tsc (unsigned n)
+{
+  unsigned int tsc_aux;
+  const unsigned long long begin = __builtin_ia32_rdtscp (&tsc_aux);
+
+  for (;;)
+    {
+      unsigned long long elapsed = __builtin_ia32_rdtscp (&tsc_aux) - begin;
+
+      if (elapsed >= n)
+	break;
+      pause ();
+    }
+}
+
+static void
+wait_a_bit (int delay_time)
+{
+  if (delay_time > 0)
+    delay_tsc (delay_time);
+}
+
+/* Hashwork */
+/* SHA-1 in C, By Steve Reid <sreid@sea-to-sky.net> 100% Public Domain.  */
+
+#define rol rotl32
+static inline __attribute__((always_inline)) uint32_t rotl32 ( uint32_t x, int8_t r )
+{
+	  return (x << r) | (x >> (32 - r));
+}
+/* blk0() and blk() perform the initial expand. */
+/* I got the idea of expanding during the round function from SSLeay */
+/* FIXME: can we do this in an endian-proof way? */
+
+#ifdef WORDS_BIGENDIAN
+#define blk0(i) block->l[i]
+#else
+#define blk0(i) (block->l[i] = (rol(block->l[i],24)&0xFF00FF00) | (rol(block->l[i],8)&0x00FF00FF))
+#endif
+#define blk(i) (block->l[i&15] = rol(block->l[(i+13)&15]^block->l[(i+8)&15] ^ block->l[(i+2)&15]^block->l[i&15],1))
+
+/* (R0+R1), R2, R3, R4 are the different operations used in SHA1 */
+#define R0(v,w,x,y,z,i) z+=((w&(x^y))^y)+blk0(i)+0x5A827999+rol(v,5);w=rol(w,30);
+#define R1(v,w,x,y,z,i) z+=((w&(x^y))^y)+blk(i)+0x5A827999+rol(v,5);w=rol(w,30);
+#define R2(v,w,x,y,z,i) z+=(w^x^y)+blk(i)+0x6ED9EBA1+rol(v,5);w=rol(w,30);
+#define R3(v,w,x,y,z,i) z+=(((w|x)&y)|(w&x))+blk(i)+0x8F1BBCDC+rol(v,5);w=rol(w,30);
+#define R4(v,w,x,y,z,i) z+=(w^x^y)+blk(i)+0xCA62C1D6+rol(v,5);w=rol(w,30);
+
+/* Hash a single 512-bit block. This is the core of the algorithm. */
+static void SHA1_Transform(uint32_t state[5], const uint8_t buffer[64])
+{
+    uint32_t a, b, c, d, e;
+    typedef union {
+        uint8_t c[64];
+        uint32_t l[16];
+    } CHAR64LONG16;
+    CHAR64LONG16* block;
+
+    block = (CHAR64LONG16*)buffer;
+
+    /* Copy context->state[] to working vars */
+    a = state[0];
+    b = state[1];
+    c = state[2];
+    d = state[3];
+    e = state[4];
+
+    /* 4 rounds of 20 operations each. Loop unrolled. */
+    R0(a,b,c,d,e, 0); R0(e,a,b,c,d, 1); R0(d,e,a,b,c, 2); R0(c,d,e,a,b, 3);
+    R0(b,c,d,e,a, 4); R0(a,b,c,d,e, 5); R0(e,a,b,c,d, 6); R0(d,e,a,b,c, 7);
+    R0(c,d,e,a,b, 8); R0(b,c,d,e,a, 9); R0(a,b,c,d,e,10); R0(e,a,b,c,d,11);
+    R0(d,e,a,b,c,12); R0(c,d,e,a,b,13); R0(b,c,d,e,a,14); R0(a,b,c,d,e,15);
+    R1(e,a,b,c,d,16); R1(d,e,a,b,c,17); R1(c,d,e,a,b,18); R1(b,c,d,e,a,19);
+    R2(a,b,c,d,e,20); R2(e,a,b,c,d,21); R2(d,e,a,b,c,22); R2(c,d,e,a,b,23);
+    R2(b,c,d,e,a,24); R2(a,b,c,d,e,25); R2(e,a,b,c,d,26); R2(d,e,a,b,c,27);
+    R2(c,d,e,a,b,28); R2(b,c,d,e,a,29); R2(a,b,c,d,e,30); R2(e,a,b,c,d,31);
+    R2(d,e,a,b,c,32); R2(c,d,e,a,b,33); R2(b,c,d,e,a,34); R2(a,b,c,d,e,35);
+    R2(e,a,b,c,d,36); R2(d,e,a,b,c,37); R2(c,d,e,a,b,38); R2(b,c,d,e,a,39);
+    R3(a,b,c,d,e,40); R3(e,a,b,c,d,41); R3(d,e,a,b,c,42); R3(c,d,e,a,b,43);
+    R3(b,c,d,e,a,44); R3(a,b,c,d,e,45); R3(e,a,b,c,d,46); R3(d,e,a,b,c,47);
+    R3(c,d,e,a,b,48); R3(b,c,d,e,a,49); R3(a,b,c,d,e,50); R3(e,a,b,c,d,51);
+    R3(d,e,a,b,c,52); R3(c,d,e,a,b,53); R3(b,c,d,e,a,54); R3(a,b,c,d,e,55);
+    R3(e,a,b,c,d,56); R3(d,e,a,b,c,57); R3(c,d,e,a,b,58); R3(b,c,d,e,a,59);
+    R4(a,b,c,d,e,60); R4(e,a,b,c,d,61); R4(d,e,a,b,c,62); R4(c,d,e,a,b,63);
+    R4(b,c,d,e,a,64); R4(a,b,c,d,e,65); R4(e,a,b,c,d,66); R4(d,e,a,b,c,67);
+    R4(c,d,e,a,b,68); R4(b,c,d,e,a,69); R4(a,b,c,d,e,70); R4(e,a,b,c,d,71);
+    R4(d,e,a,b,c,72); R4(c,d,e,a,b,73); R4(b,c,d,e,a,74); R4(a,b,c,d,e,75);
+    R4(e,a,b,c,d,76); R4(d,e,a,b,c,77); R4(c,d,e,a,b,78); R4(b,c,d,e,a,79);
+
+    /* Add the working vars back into context.state[] */
+    state[0] += a;
+    state[1] += b;
+    state[2] += c;
+    state[3] += d;
+    state[4] += e;
+
+    /* Wipe variables */
+    a = b = c = d = e = 0;
+}
+
+/* Load the SHA-1 initialization constants into CONTEXT and clear both
+   count words.  context->buffer is left as it was.  */
+static void SHA1_Init(SHA1_CTX* context)
+{
+    static const uint32_t initial_state[5] = {
+        0x67452301, 0xEFCDAB89, 0x98BADCFE, 0x10325476, 0xC3D2E1F0
+    };
+
+    memcpy(context->state, initial_state, sizeof initial_state);
+    context->count[0] = 0;
+    context->count[1] = 0;
+}
+
+/* Feed LEN bytes at DATA into CONTEXT, hashing each completed block.  */
+void SHA1_Update(SHA1_CTX* context, const uint8_t* data, const size_t len)
+{
+    size_t i = 0;
+    size_t j = (context->count[0] >> 3) & 63;
+    /* Truncate first so the carry test compares 32-bit quantities.  */
+    const uint32_t low_bits = (uint32_t) (len << 3);
+
+    if ((context->count[0] += low_bits) < low_bits) context->count[1]++;
+    context->count[1] += (len >> 29);
+
+    if ((j + len) > 63)
+    {
+        memcpy(&context->buffer[j], data, (i = 64-j));
+        SHA1_Transform(context->state, context->buffer);
+        /* SHA1_Transform rewrites its block in place: never pass DATA.  */
+        for ( ; i + 63 < len; i += 64)
+        {
+            memcpy(context->buffer, data + i, 64);
+            SHA1_Transform(context->state, context->buffer);
+        }
+        j = 0;
+    }
+    memcpy(&context->buffer[j], &data[i], len - i);
+}
+
+void *
+work_thread (void *arg)
+{
+  long i;
+  unsigned long pid = (unsigned long) arg;
+  unsigned long long start, end;
+  char buf[HASHBUF];
+  SHA1_CTX ctx;
+
+  memset (buf, 0x1, HASHBUF);
+  SHA1_Init (&ctx);
+
+  while (!start_thread)
+    pause ();
+
+  unsigned int aux;
+  start = __builtin_ia32_rdtscp (&aux);
+  for (i = 0; i < iterations; i++)
+    {
+      lock();
+      SHA1_Update (&ctx, (uint8_t *)buf, HASHBUF);
+      unlock ();
+      wait_a_bit (delay_time_unlocked);
+    }
+  end = __builtin_ia32_rdtscp (&aux);
+  gcount[pid].total = end - start;
+
+  return NULL;
+}
+
+/* Run NUMTHREADS pinned workers (worker I on CPU I) and join them.  */
+void
+test_threads (int numthreads)
+{
+  pthread_t thr[numthreads];
+  /* Close the start gate before any worker of this round is created.  */
+  start_thread = 0;
+  memory_barrier ();
+
+  for (int i = 0; i < numthreads; i++)
+    {
+      pthread_attr_t attr;
+      cpu_set_t set;
+      pthread_attr_init (&attr);
+      CPU_ZERO (&set);
+      (void)CPU_SET (i, &set);
+      pthread_attr_setaffinity_np (&attr, sizeof(cpu_set_t), &set);
+      pthread_create (&thr[i], &attr, ops->test, (void *)(long)i);
+      pthread_attr_destroy (&attr);
+    }
+
+  memory_barrier ();
+  start_thread = 1;
+  memory_barrier ();
+  sched_yield ();
+
+  for (int i = 0; i < numthreads; i++)
+    pthread_join (thr[i], NULL);
+}
+
+struct ops hashwork_ops =
+{
+  .test = work_thread,
+};
+
+struct ops *ops;
+
+/* Run one round with NUMTHREADS workers and return the sum of the
+   totals found in gcount[0 .. NUMTHREADS-1] afterwards.  */
+static struct count
+total_cost (int numthreads)
+{
+  struct count sum = { 0 };
+
+  memset (gcount, 0, sizeof(gcount[0]) * numthreads);
+
+  test_threads (numthreads);
+
+  for (int t = 0; t < numthreads; t++)
+    sum.total += gcount[t].total;
+
+  return sum;
+}
+
+/* Benchmark driver.  Measures the cost of one thread, then prints the
+   summed time and the per-thread overhead relative to that baseline
+   for growing thread counts.  Returns 1 without measuring when fewer
+   than 8 processors are online.  */
+int
+main (void)
+{
+  int numthreads = sysconf (_SC_NPROCESSORS_ONLN);
+  if (numthreads < 8)
+    return 1;
+
+  ops = &hashwork_ops;
+
+  int err_ret = posix_memalign ((void **)&gcount, 4096,
+				sizeof(gcount[0]) * numthreads);
+  if (err_ret)
+    {
+      printf ("posix_memalign failure: %s\n", strerror (err_ret));
+      exit (err_ret);
+    }
+
+  struct count cost, cost1;
+  double overhead;
+  int i, next;
+  cost1 = total_cost (1);
+  /* Thread counts double from 2 while the double stays below NUMTHREADS,
+     then step by 16; the last round always uses exactly NUMTHREADS.  */
+  for (i = 2;; i = next)
+    {
+      cost = total_cost (i);
+      /* Mean per-thread time as a multiple of the one-thread time.  */
+      overhead = cost.total;
+      overhead /= i;
+      overhead /= cost1.total;
+      /* cost.total is unsigned long long, hence %llu.  */
+      printf ("Number of threads: %4d, Total time %14llu, Overhead: %.2f\n",
+	      i, cost.total, overhead);
+      if (i == numthreads)
+	break;
+      if ((i * 2) < numthreads)
+	next = i * 2;
+      else
+	next = i + 16;
+      if (next > numthreads)
+	next = numthreads;
+    }
+
+  free (gcount);
+  return 0;
+}

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=b4e40bd14ae993b4a39a7056e6eda5ae00d2e9be

commit b4e40bd14ae993b4a39a7056e6eda5ae00d2e9be
Author: Kemi Wang <kemi.wang@intel.com>
Date:   Thu Dec 6 13:41:59 2018 +0800

    code cleanup for global cmpxchg workload
    
    Signed-off-by: Kemi Wang <kemi.wang@intel.com>

diff --git a/sysdeps/x86/nptl/tst-numa-spinlock-overhead.c b/sysdeps/x86/nptl/tst-numa-spinlock-overhead.c
index 084cc13..fc37e3c 100644
--- a/sysdeps/x86/nptl/tst-numa-spinlock-overhead.c
+++ b/sysdeps/x86/nptl/tst-numa-spinlock-overhead.c
@@ -19,7 +19,6 @@
 #define CACHELINE_SIZE	64
 #define CACHE_ALIGNED	__attribute__((aligned(CACHELINE_SIZE)))
 
-#define constant_time 5
 unsigned long g_val CACHE_ALIGNED;
 unsigned long g_val2 CACHE_ALIGNED;
 unsigned long g_val3 CACHE_ALIGNED;
@@ -27,8 +26,6 @@ unsigned long cmplock CACHE_ALIGNED;
 struct count
 {
   unsigned long long total;
-  unsigned long long spinlock;
-  unsigned long long wall;
 } __attribute__((aligned(128)));
 
 struct count *gcount;
@@ -41,23 +38,15 @@ struct numa_spinlock *lock;
 struct ops
 {
   void *(*test) (void *arg);
-  void (*print_thread) (void *res, int);
 } *ops;
 
-struct stats_result
-{
-  unsigned long num;
-};
-
 void *work_thread (void *arg);
 
-void test_threads (int numthreads, unsigned long time);
+void test_threads (int numthreads);
 
 #define iterations (10000 * 5)
 
 static volatile int start_thread;
-static volatile int stop_flag;
-
 
 /* Delay some fixed time */
 static void
@@ -109,27 +98,16 @@ work_thread (void *arg)
 {
   long i;
   unsigned long pid = (unsigned long) arg;
-  struct stats_result *res;
   unsigned long long start, end;
   struct work_todo_argument work_todo_arg;
   struct numa_spinlock_info lock_info;
 
-  int err_ret = posix_memalign ((void **)&res, CACHELINE_SIZE,
-				roundup (sizeof (*res), CACHELINE_SIZE));
-  if (err_ret)
-    {
-      printf ("posix_memalign failure: %s\n", strerror (err_ret));
-      exit (err_ret);
-    }
-
   if (init_numa_spinlock_info (lock, &lock_info))
     {
       printf ("init_numa_spinlock_info failure: %m\n");
       exit (1);
     }
 
-  long num = 0;
-
   work_todo_arg.v1 = &g_val;
   work_todo_arg.v2 = &g_val2;
   work_todo_arg.v3 = &g_val3;
@@ -146,13 +124,11 @@ work_thread (void *arg)
     {
       apply_numa_spinlock (lock, &lock_info);
       wait_a_bit (delay_time_unlocked);
-      num++;
     }
   end = __builtin_ia32_rdtscp (&aux);
   gcount[pid].total = end - start;
-  res->num = num;
 
-  return res;
+  return NULL;
 }
 
 
@@ -166,16 +142,12 @@ init_global_data(void)
 }
 
 void
-test_threads (int numthreads, unsigned long time)
+test_threads (int numthreads)
 {
-  start_thread = 0;
-  stop_flag = 0;
-
   lock = init_numa_spinlock ();
   memory_barrier ();
 
   pthread_t thr[numthreads];
-  void *res[numthreads];
   int i;
 
   init_global_data ();
@@ -195,26 +167,11 @@ test_threads (int numthreads, unsigned long time)
   memory_barrier ();
   sched_yield ();
 
-  if (time)
-    {
-      struct timespec ts =
-	{
-	  ts.tv_sec = time,
-	  ts.tv_nsec = 0
-	};
-      clock_nanosleep (CLOCK_MONOTONIC, 0, &ts, NULL);
-      memory_barrier ();
-      stop_flag = 1;
-    }
-
-  for (i = 0; i < numthreads; i++) {
-    pthread_join (thr[i], (void *)&res[i]);
-    free (res[i]);
-  }
-
+  for (i = 0; i < numthreads; i++)
+    pthread_join (thr[i], NULL);
 }
 
-struct ops hashwork_ops =
+struct ops cmpxchg_ops =
 {
   .test = work_thread,
 };
@@ -226,25 +183,15 @@ total_cost (int numthreads)
 {
   int i;
   unsigned long long total = 0;
-  unsigned long long spinlock = 0;
 
   memset (gcount, 0, sizeof(gcount[0]) * numthreads);
 
-  unsigned long long start, end, diff;
-  unsigned int aux;
-
-  start = __builtin_ia32_rdtscp (&aux);
-  test_threads (numthreads, constant_time);
-  end = __builtin_ia32_rdtscp (&aux);
-  diff = end - start;
+  test_threads (numthreads);
 
   for (i = 0; i < numthreads; i++)
-    {
       total += gcount[i].total;
-      spinlock += gcount[i].spinlock;
-    }
 
-  struct count cost = { total, spinlock, diff };
+  struct count cost = { total };
   return cost;
 }
 
@@ -255,7 +202,7 @@ main (void)
   if (numthreads < 8)
     return 1;
 
-  ops = &hashwork_ops;
+  ops = &cmpxchg_ops;
 
   int err_ret = posix_memalign ((void **)&gcount, 4096,
 				sizeof(gcount[0]) * numthreads);
diff --git a/sysdeps/x86/nptl/tst-spinlock-overhead.c b/sysdeps/x86/nptl/tst-spinlock-overhead.c
index edbf9d3..2f03eef 100644
--- a/sysdeps/x86/nptl/tst-spinlock-overhead.c
+++ b/sysdeps/x86/nptl/tst-spinlock-overhead.c
@@ -18,7 +18,6 @@
 #define CACHELINE_SIZE	64
 #define CACHE_ALIGNED	__attribute__((aligned(CACHELINE_SIZE)))
 
-#define constant_time 5
 unsigned long g_val CACHE_ALIGNED;
 unsigned long g_val2 CACHE_ALIGNED;
 unsigned long g_val3 CACHE_ALIGNED;
@@ -26,8 +25,6 @@ unsigned long cmplock CACHE_ALIGNED;
 struct count
 {
   unsigned long long total;
-  unsigned long long spinlock;
-  unsigned long long wall;
 } __attribute__((aligned(128)));
 
 struct count *gcount;
@@ -54,23 +51,15 @@ init_spin (void)
 struct ops
 {
   void *(*test) (void *arg);
-  void (*print_thread) (void *res, int);
 } *ops;
 
-struct stats_result
-{
-  unsigned long num;
-};
-
 void *work_thread (void *arg);
 
-void test_threads (int numthreads, unsigned long time);
+void test_threads (int numthreads);
 
 #define iterations (10000 * 5)
 
 static volatile int start_thread;
-static volatile int stop_flag;
-
 
 /* Delay some fixed time */
 static void
@@ -112,16 +101,7 @@ work_thread (void *arg)
 {
   long i;
   unsigned long pid = (unsigned long) arg;
-  struct stats_result *res;
   unsigned long long start, end;
-  int err_ret = posix_memalign ((void **)&res, CACHELINE_SIZE,
-				roundup (sizeof (*res), CACHELINE_SIZE));
-  if (err_ret)
-    {
-      printf ("posix_memalign failure: %s\n", strerror (err_ret));
-      exit (err_ret);
-    }
-  long num = 0;
 
   while (!start_thread)
     pause ();
@@ -134,13 +114,11 @@ work_thread (void *arg)
       work_todo ();
       unlock ();
       wait_a_bit (delay_time_unlocked);
-      num++;
     }
   end = __builtin_ia32_rdtscp (&aux);
   gcount[pid].total = end - start;
-  res->num = num;
 
-  return res;
+  return NULL;
 }
 
 
@@ -154,15 +132,13 @@ init_global_data(void)
 }
 
 void
-test_threads (int numthreads, unsigned long time)
+test_threads (int numthreads)
 {
   start_thread = 0;
-  stop_flag = 0;
 
   memory_barrier ();
 
   pthread_t thr[numthreads];
-  void *res[numthreads];
   int i;
 
   init_global_data ();
@@ -182,26 +158,11 @@ test_threads (int numthreads, unsigned long time)
   memory_barrier ();
   sched_yield ();
 
-  if (time)
-    {
-      struct timespec ts =
-	{
-	  ts.tv_sec = time,
-	  ts.tv_nsec = 0
-	};
-      clock_nanosleep (CLOCK_MONOTONIC, 0, &ts, NULL);
-      memory_barrier ();
-      stop_flag = 1;
-    }
-
-  for (i = 0; i < numthreads; i++) {
-    pthread_join (thr[i], (void *)&res[i]);
-    free (res[i]);
-  }
-
+  for (i = 0; i < numthreads; i++)
+    pthread_join (thr[i], NULL);
 }
 
-struct ops hashwork_ops =
+struct ops cmpxchg_ops =
 {
   .test = work_thread,
 };
@@ -213,25 +174,15 @@ total_cost (int numthreads)
 {
   int i;
   unsigned long long total = 0;
-  unsigned long long spinlock = 0;
 
   memset (gcount, 0, sizeof(gcount[0]) * numthreads);
 
-  unsigned long long start, end, diff;
-  unsigned int aux;
-
-  start = __builtin_ia32_rdtscp (&aux);
-  test_threads (numthreads, constant_time);
-  end = __builtin_ia32_rdtscp (&aux);
-  diff = end - start;
+  test_threads (numthreads);
 
   for (i = 0; i < numthreads; i++)
-    {
       total += gcount[i].total;
-      spinlock += gcount[i].spinlock;
-    }
 
-  struct count cost = { total, spinlock, diff };
+  struct count cost = {total};
   return cost;
 }
 
@@ -242,7 +193,7 @@ main (void)
   if (numthreads < 8)
     return 1;
 
-  ops = &hashwork_ops;
+  ops = &cmpxchg_ops;
 
   int err_ret = posix_memalign ((void **)&gcount, 4096,
 				sizeof(gcount[0]) * numthreads);

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=14906da7f44d7445632975a8f03e1fa984a0d9b9

commit 14906da7f44d7445632975a8f03e1fa984a0d9b9
Author: H.J. Lu <hjl.tools@gmail.com>
Date:   Wed Dec 5 13:00:04 2018 -0800

    Add tst-spinlock-overhead.c and tst-numa-spinlock-overhead.c
    
    1. tst-spinlock-overhead
    
    On CFL:
    
    Number of threads:    2, Total time      111692114, Overhead: 1.35
    Number of threads:    4, Total time      437096222, Overhead: 2.64
    Number of threads:    8, Total time     1566989478, Overhead: 4.73
    Number of threads:   12, Total time     3735663036, Overhead: 7.51
    
    On BDX:
    
    Number of threads:    2, Total time      161928910, Overhead: 2.68
    Number of threads:    4, Total time      632319301, Overhead: 5.22
    Number of threads:    8, Total time     2607285614, Overhead: 10.77
    Number of threads:   16, Total time    12960115272, Overhead: 26.77
    Number of threads:   32, Total time    69437649115, Overhead: 71.70
    Number of threads:   48, Total time   131934124605, Overhead: 90.82
    Number of threads:   56, Total time   171965209489, Overhead: 101.47
    
    on SKX:
    
    Number of threads:    2, Total time      210957738, Overhead: 2.83
    Number of threads:    4, Total time      700807292, Overhead: 4.71
    Number of threads:    8, Total time     2730139430, Overhead: 9.17
    Number of threads:   16, Total time    12286303676, Overhead: 20.63
    Number of threads:   32, Total time    71034964166, Overhead: 59.63
    Number of threads:   64, Total time   374564752366, Overhead: 157.23
    Number of threads:   80, Total time   564485071330, Overhead: 189.56
    Number of threads:   96, Total time   830032415132, Overhead: 232.27
    Number of threads:  112, Total time  1118021304438, Overhead: 268.17
    
    2. tst-numa-spinlock-overhead
    
    On CFL:
    
    Number of threads:    2, Total time      143721672, Overhead: 1.52
    Number of threads:    4, Total time      404509056, Overhead: 2.14
    Number of threads:    8, Total time      947638656, Overhead: 2.51
    Number of threads:   12, Total time     1594749872, Overhead: 2.81
    
    On BDX:
    
    Number of threads:    2, Total time      168167714, Overhead: 2.42
    Number of threads:    4, Total time      482525975, Overhead: 3.47
    Number of threads:    8, Total time     1397432259, Overhead: 5.03
    Number of threads:   16, Total time     4078305832, Overhead: 7.34
    Number of threads:   32, Total time    12059269166, Overhead: 10.85
    Number of threads:   48, Total time    23553679167, Overhead: 14.13
    Number of threads:   56, Total time    29682472170, Overhead: 15.26
    
    On SKX:
    
    Number of threads:    2, Total time      207803788, Overhead: 2.13
    Number of threads:    4, Total time      687834998, Overhead: 3.53
    Number of threads:    8, Total time     1643241490, Overhead: 4.22
    Number of threads:   16, Total time     4999757806, Overhead: 6.42
    Number of threads:   32, Total time    15625584082, Overhead: 10.03
    Number of threads:   64, Total time    48207310808, Overhead: 15.47
    Number of threads:   80, Total time    73146275124, Overhead: 18.78
    Number of threads:   96, Total time   101618181262, Overhead: 21.74
    Number of threads:  112, Total time   138927813380, Overhead: 25.47

diff --git a/sysdeps/x86/nptl/Makefile b/sysdeps/x86/nptl/Makefile
new file mode 100644
index 0000000..ed9c3d9
--- /dev/null
+++ b/sysdeps/x86/nptl/Makefile
@@ -0,0 +1,26 @@
+# Copyright (C) 2018 Free Software Foundation, Inc.
+# This file is part of the GNU C Library.
+
+# The GNU C Library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2.1 of the License, or (at your option) any later version.
+
+# The GNU C Library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# Lesser General Public License for more details.
+
+# You should have received a copy of the GNU Lesser General Public
+# License along with the GNU C Library; if not, see
+# <http://www.gnu.org/licenses/>.
+
+ifeq ($(subdir),numa-spinlock)
+libnuma_spinlock-tests += tst-spinlock-overhead tst-numa-spinlock-overhead
+
+ifeq (yes,$(build-shared))
+$(addprefix $(objpfx),$(libnuma_spinlock-tests)): $(shared-thread-library)
+else
+$(addprefix $(objpfx),$(libnuma_spinlock-tests)): $(static-thread-library)
+endif
+endif
diff --git a/sysdeps/x86/nptl/tst-numa-spinlock-overhead.c b/sysdeps/x86/nptl/tst-numa-spinlock-overhead.c
new file mode 100644
index 0000000..084cc13
--- /dev/null
+++ b/sysdeps/x86/nptl/tst-numa-spinlock-overhead.c
@@ -0,0 +1,300 @@
+#ifndef _GNU_SOURCE
+# define _GNU_SOURCE
+#endif
+#include <unistd.h>
+#include <stdio.h>
+#include <pthread.h>
+#include <sched.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdint.h>
+#include <sys/time.h>
+#include <sys/param.h>
+#include <errno.h>
+#include <numa-spinlock/numa-spinlock.h>
+
+#define memory_barrier() __asm ("" ::: "memory")
+#define pause() __asm  ("rep ; nop" ::: "memory")
+
+#define CACHELINE_SIZE	64
+#define CACHE_ALIGNED	__attribute__((aligned(CACHELINE_SIZE)))
+
+#define constant_time 5
+unsigned long g_val CACHE_ALIGNED;
+unsigned long g_val2 CACHE_ALIGNED;
+unsigned long g_val3 CACHE_ALIGNED;
+unsigned long cmplock CACHE_ALIGNED;
+struct count
+{
+  unsigned long long total;
+  unsigned long long spinlock;
+  unsigned long long wall;
+} __attribute__((aligned(128)));
+
+struct count *gcount;
+
+/* The time consumed by one update is about 200 TSCs.  */
+static int delay_time_unlocked = 400;
+
+struct numa_spinlock *lock;
+
+struct ops
+{
+  void *(*test) (void *arg);
+  void (*print_thread) (void *res, int);
+} *ops;
+
+struct stats_result
+{
+  unsigned long num;
+};
+
+void *work_thread (void *arg);
+
+void test_threads (int numthreads, unsigned long time);
+
+#define iterations (10000 * 5)
+
+static volatile int start_thread;
+static volatile int stop_flag;
+
+
+/* Busy-wait until at least N time-stamp-counter ticks have elapsed
+   since entry, executing the pause hint between RDTSCP samples.  */
+static void
+delay_tsc (unsigned n)
+{
+  unsigned int tsc_aux;
+  const unsigned long long begin = __builtin_ia32_rdtscp (&tsc_aux);
+
+  /* The counter is re-sampled on every pass; the loop ends on the
+     first sample that is N or more ticks past BEGIN.  */
+  while (__builtin_ia32_rdtscp (&tsc_aux) - begin < n)
+    pause ();
+}
+
+/* Spin for DELAY_TIME TSC ticks.  A zero or negative DELAY_TIME
+   returns immediately without touching the counter.  */
+static void
+wait_a_bit (int delay_time)
+{
+  if (delay_time <= 0)
+    return;
+
+  delay_tsc (delay_time);
+}
+
+struct work_todo_argument
+{
+  unsigned long *v1;
+  unsigned long *v2;
+  unsigned long *v3;
+  unsigned long *v4;
+};
+
+/* Workload executed through the NUMA spinlock.  V points to a
+   struct work_todo_argument.  The counters behind v1 and v2 are
+   incremented, a 0 -> 1 compare-and-swap is attempted on the word
+   behind v4, and the value that CAS observed is added to the counter
+   behind v3.  Always returns (void *) 2.  */
+static void *
+work_todo (void *v)
+{
+  struct work_todo_argument *arg = v;
+
+  *arg->v1 += 1;
+  *arg->v2 += 1;
+
+  unsigned long seen = __sync_val_compare_and_swap (arg->v4, 0, 1);
+  *arg->v3 += seen;
+
+  return (void *) 2;
+}
+
+/* Thread body for the NUMA-spinlock benchmark.  ARG carries the thread
+   index, which selects this thread's slot in gcount.  The thread runs
+   `iterations' rounds of apply_numa_spinlock followed by an unlocked
+   delay, stores the elapsed TSC ticks in gcount[index].total and
+   returns a heap-allocated struct stats_result holding the iteration
+   count (test_threads frees it).  Exits the process if the result
+   buffer cannot be allocated or the lock info cannot be set up.  */
+void *
+work_thread (void *arg)
+{
+  const unsigned long pid = (unsigned long) arg;
+  struct stats_result *res;
+  struct numa_spinlock_info lock_info;
+
+  int err_ret = posix_memalign ((void **)&res, CACHELINE_SIZE,
+				roundup (sizeof (*res), CACHELINE_SIZE));
+  if (err_ret != 0)
+    {
+      printf ("posix_memalign failure: %s\n", strerror (err_ret));
+      exit (err_ret);
+    }
+
+  if (init_numa_spinlock_info (lock, &lock_info) != 0)
+    {
+      printf ("init_numa_spinlock_info failure: %m\n");
+      exit (1);
+    }
+
+  /* Every thread hands the same shared globals to the workload.  */
+  struct work_todo_argument work_todo_arg =
+    {
+      .v1 = &g_val,
+      .v2 = &g_val2,
+      .v3 = &g_val3,
+      .v4 = &cmplock,
+    };
+  lock_info.argument = &work_todo_arg;
+  lock_info.workload = work_todo;
+
+  /* Hold here until test_threads releases all workers.  */
+  while (start_thread == 0)
+    pause ();
+
+  unsigned int aux;
+  long done = 0;
+  const unsigned long long begin = __builtin_ia32_rdtscp (&aux);
+  while (done < iterations)
+    {
+      apply_numa_spinlock (lock, &lock_info);
+      wait_a_bit (delay_time_unlocked);
+      done++;
+    }
+  const unsigned long long finish = __builtin_ia32_rdtscp (&aux);
+
+  gcount[pid].total = finish - begin;
+  res->num = done;
+
+  return res;
+}
+
+
+/* Reset the shared counters and the compare-and-swap target word
+   before a benchmark round.  */
+void
+init_global_data(void)
+{
+  g_val = g_val2 = g_val3 = 0;
+  cmplock = 0;
+}
+
+/* Run one benchmark round with NUMTHREADS workers.
+
+   A fresh NUMA spinlock is allocated, the shared counters are reset,
+   and one thread per index I is created with its affinity set to
+   CPU I.  The workers are released together through start_thread.
+   If TIME is non-zero this thread then sleeps TIME seconds and sets
+   stop_flag (which work_thread in this file does not read) before
+   joining the workers.  Each worker's result buffer is freed after
+   its join and the lock is freed once all workers are done.
+
+   Exits the process if the lock cannot be allocated or a thread
+   cannot be created.  */
+void
+test_threads (int numthreads, unsigned long time)
+{
+  start_thread = 0;
+  stop_flag = 0;
+
+  lock = init_numa_spinlock ();
+  if (lock == NULL)
+    {
+      printf ("init_numa_spinlock failure\n");
+      exit (1);
+    }
+  memory_barrier ();
+
+  pthread_t thr[numthreads];
+  void *res[numthreads];
+  int i;
+
+  init_global_data ();
+  for (i = 0; i < numthreads; i++)
+    {
+      pthread_attr_t attr;
+      pthread_attr_init (&attr);
+      cpu_set_t set;
+      CPU_ZERO (&set);
+      (void)CPU_SET (i, &set);
+      pthread_attr_setaffinity_np (&attr, sizeof(cpu_set_t), &set);
+      int err_ret = pthread_create (&thr[i], &attr, ops->test,
+				    (void *)(long)i);
+      pthread_attr_destroy (&attr);
+      if (err_ret != 0)
+	{
+	  /* thr[i] is not a valid thread ID on failure and must not
+	     reach pthread_join below.  */
+	  printf ("pthread_create failure: %s\n", strerror (err_ret));
+	  exit (err_ret);
+	}
+    }
+
+  memory_barrier ();
+  start_thread = 1;
+  memory_barrier ();
+  sched_yield ();
+
+  if (time)
+    {
+      /* Designated initializers; the object must not be assigned to
+	 from inside its own initializer list.  */
+      struct timespec ts =
+	{
+	  .tv_sec = time,
+	  .tv_nsec = 0
+	};
+      clock_nanosleep (CLOCK_MONOTONIC, 0, &ts, NULL);
+      memory_barrier ();
+      stop_flag = 1;
+    }
+
+  for (i = 0; i < numthreads; i++)
+    {
+      pthread_join (thr[i], (void *)&res[i]);
+      free (res[i]);
+    }
+
+  /* Every user of this round's lock has been joined.  The lock is
+     allocated with posix_memalign by init_numa_spinlock and the API
+     offers no destroy function, so release it with free.  */
+  free (lock);
+  lock = NULL;
+}
+
+struct ops hashwork_ops =
+{
+  .test = work_thread,
+};
+
+struct ops *ops;
+
+/* Run one round with NUMTHREADS workers and return the aggregated
+   cost: the sums of the per-thread `total' and `spinlock' tick counts
+   from gcount, plus the TSC ticks spent inside test_threads as a
+   whole, stored in `wall'.  */
+static struct count
+total_cost (int numthreads)
+{
+  unsigned int aux;
+
+  /* Clear the slots the workers are about to fill in.  */
+  memset (gcount, 0, sizeof(gcount[0]) * numthreads);
+
+  const unsigned long long begin = __builtin_ia32_rdtscp (&aux);
+  test_threads (numthreads, constant_time);
+  const unsigned long long wall = __builtin_ia32_rdtscp (&aux) - begin;
+
+  struct count cost = { .total = 0, .spinlock = 0, .wall = wall };
+  for (int t = 0; t < numthreads; t++)
+    {
+      cost.total += gcount[t].total;
+      cost.spinlock += gcount[t].spinlock;
+    }
+
+  return cost;
+}
+
+/* Measure a round with NUMTHREADS workers and print its summed
+   per-thread tick count together with the overhead, i.e. the
+   per-thread cost divided by the single-thread baseline BASE.  */
+static void
+report_overhead (int numthreads, const struct count *base)
+{
+  struct count cost = total_cost (numthreads);
+  double overhead = cost.total;
+  overhead /= numthreads;
+  overhead /= base->total;
+  /* cost.total is unsigned long long, hence %llu.  */
+  printf ("Number of threads: %4d, Total time %14llu, Overhead: %.2f\n",
+	  numthreads, cost.total, overhead);
+}
+
+/* Benchmark driver.  Returns 1 without measuring when fewer than 8
+   CPUs are online.  Otherwise it takes a single-thread baseline and
+   reports rounds for 2, 4, 8, ... threads, doubling while the doubled
+   count stays below the number of online CPUs and stepping by 16
+   afterwards, with a final round at exactly the number of online CPUs
+   if the sequence did not land on it.  */
+int
+main (void)
+{
+  int numthreads = sysconf (_SC_NPROCESSORS_ONLN);
+  if (numthreads < 8)
+    return 1;
+
+  ops = &hashwork_ops;
+
+  int err_ret = posix_memalign ((void **)&gcount, 4096,
+				sizeof(gcount[0]) * numthreads);
+  if (err_ret)
+    {
+      printf ("posix_memalign failure: %s\n", strerror (err_ret));
+      exit (err_ret);
+    }
+
+  struct count cost1 = total_cost (1);
+  int i, last;
+  for (last = i = 2; i <= numthreads;)
+    {
+      last = i;
+      report_overhead (i, &cost1);
+      if ((i * 2) < numthreads)
+	i = i * 2;
+      else
+	i = i + 16;
+    }
+
+  if (last != numthreads)
+    report_overhead (numthreads, &cost1);
+
+  free (gcount);
+  return 0;
+}
diff --git a/sysdeps/x86/nptl/tst-spinlock-overhead.c b/sysdeps/x86/nptl/tst-spinlock-overhead.c
new file mode 100644
index 0000000..edbf9d3
--- /dev/null
+++ b/sysdeps/x86/nptl/tst-spinlock-overhead.c
@@ -0,0 +1,287 @@
+#ifndef _GNU_SOURCE
+# define _GNU_SOURCE
+#endif
+#include <unistd.h>
+#include <stdio.h>
+#include <pthread.h>
+#include <sched.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdint.h>
+#include <sys/time.h>
+#include <sys/param.h>
+#include <errno.h>
+
+#define memory_barrier() __asm ("" ::: "memory")
+#define pause() __asm  ("rep ; nop" ::: "memory")
+
+#define CACHELINE_SIZE	64
+#define CACHE_ALIGNED	__attribute__((aligned(CACHELINE_SIZE)))
+
+#define constant_time 5
+unsigned long g_val CACHE_ALIGNED;
+unsigned long g_val2 CACHE_ALIGNED;
+unsigned long g_val3 CACHE_ALIGNED;
+unsigned long cmplock CACHE_ALIGNED;
+struct count
+{
+  unsigned long long total;
+  unsigned long long spinlock;
+  unsigned long long wall;
+} __attribute__((aligned(128)));
+
+struct count *gcount;
+
+/* The time consumed by one update is about 200 TSCs.  */
+static int delay_time_unlocked = 400;
+
+struct
+{
+  pthread_spinlock_t testlock;
+  char pad[64];
+} test CACHE_ALIGNED;
+
+#define lock() pthread_spin_lock(&test.testlock)
+#define unlock() pthread_spin_unlock(&test.testlock)
+
+/* Constructor: initialize the benchmark's pthread spinlock.  The
+   constructor attribute asks for this to run before main.  The
+   pshared argument is passed as a literal 0 -- presumably
+   PTHREAD_PROCESS_PRIVATE; confirm against <pthread.h>.  */
+static void
+__attribute__((constructor))
+init_spin (void)
+{
+  pthread_spin_init (&test.testlock, 0);
+}
+
+struct ops
+{
+  void *(*test) (void *arg);
+  void (*print_thread) (void *res, int);
+} *ops;
+
+struct stats_result
+{
+  unsigned long num;
+};
+
+void *work_thread (void *arg);
+
+void test_threads (int numthreads, unsigned long time);
+
+#define iterations (10000 * 5)
+
+static volatile int start_thread;
+static volatile int stop_flag;
+
+
+/* Busy-wait until at least N time-stamp-counter ticks have elapsed
+   since entry, executing the pause hint between RDTSCP samples.  */
+static void
+delay_tsc (unsigned n)
+{
+  unsigned int tsc_aux;
+  const unsigned long long begin = __builtin_ia32_rdtscp (&tsc_aux);
+
+  /* The counter is re-sampled on every pass; the loop ends on the
+     first sample that is N or more ticks past BEGIN.  */
+  while (__builtin_ia32_rdtscp (&tsc_aux) - begin < n)
+    pause ();
+}
+
+/* Spin for DELAY_TIME TSC ticks.  A zero or negative DELAY_TIME
+   returns immediately without touching the counter.  */
+static void
+wait_a_bit (int delay_time)
+{
+  if (delay_time <= 0)
+    return;
+
+  delay_tsc (delay_time);
+}
+
+/* Critical-section workload; work_thread calls it between lock () and
+   unlock ().  Increments g_val and g_val2, attempts a 0 -> 1
+   compare-and-swap on cmplock, then advances g_val3 by one plus the
+   value the CAS observed.  */
+static void
+work_todo (void)
+{
+  g_val += 1;
+  g_val2 += 1;
+
+  unsigned long seen = __sync_val_compare_and_swap (&cmplock, 0, 1);
+  g_val3 = g_val3 + 1 + seen;
+}
+
+/* Thread body for the pthread-spinlock benchmark.  ARG carries the
+   thread index, which selects this thread's slot in gcount.  The
+   thread runs `iterations' rounds of lock / work_todo / unlock
+   followed by an unlocked delay, stores the elapsed TSC ticks in
+   gcount[index].total and returns a heap-allocated struct stats_result
+   holding the iteration count (test_threads frees it).  Exits the
+   process if the result buffer cannot be allocated.  */
+void *
+work_thread (void *arg)
+{
+  const unsigned long pid = (unsigned long) arg;
+  struct stats_result *res;
+
+  int err_ret = posix_memalign ((void **)&res, CACHELINE_SIZE,
+				roundup (sizeof (*res), CACHELINE_SIZE));
+  if (err_ret != 0)
+    {
+      printf ("posix_memalign failure: %s\n", strerror (err_ret));
+      exit (err_ret);
+    }
+
+  /* Hold here until test_threads releases all workers.  */
+  while (start_thread == 0)
+    pause ();
+
+  unsigned int aux;
+  long done = 0;
+  const unsigned long long begin = __builtin_ia32_rdtscp (&aux);
+  while (done < iterations)
+    {
+      lock();
+      work_todo ();
+      unlock ();
+      wait_a_bit (delay_time_unlocked);
+      done++;
+    }
+  const unsigned long long finish = __builtin_ia32_rdtscp (&aux);
+
+  gcount[pid].total = finish - begin;
+  res->num = done;
+
+  return res;
+}
+
+
+/* Reset the shared counters and the compare-and-swap target word
+   before a benchmark round.  */
+void
+init_global_data(void)
+{
+  g_val = g_val2 = g_val3 = 0;
+  cmplock = 0;
+}
+
+/* Run one benchmark round with NUMTHREADS workers.
+
+   The shared counters are reset and one thread per index I is created
+   with its affinity set to CPU I.  The workers are released together
+   through start_thread.  If TIME is non-zero this thread then sleeps
+   TIME seconds and sets stop_flag (which work_thread in this file does
+   not read) before joining the workers.  Each worker's result buffer
+   is freed after its join.
+
+   Exits the process if a thread cannot be created.  */
+void
+test_threads (int numthreads, unsigned long time)
+{
+  start_thread = 0;
+  stop_flag = 0;
+
+  memory_barrier ();
+
+  pthread_t thr[numthreads];
+  void *res[numthreads];
+  int i;
+
+  init_global_data ();
+  for (i = 0; i < numthreads; i++)
+    {
+      pthread_attr_t attr;
+      pthread_attr_init (&attr);
+      cpu_set_t set;
+      CPU_ZERO (&set);
+      (void)CPU_SET (i, &set);
+      pthread_attr_setaffinity_np (&attr, sizeof(cpu_set_t), &set);
+      int err_ret = pthread_create (&thr[i], &attr, ops->test,
+				    (void *)(long)i);
+      pthread_attr_destroy (&attr);
+      if (err_ret != 0)
+	{
+	  /* thr[i] is not a valid thread ID on failure and must not
+	     reach pthread_join below.  */
+	  printf ("pthread_create failure: %s\n", strerror (err_ret));
+	  exit (err_ret);
+	}
+    }
+
+  memory_barrier ();
+  start_thread = 1;
+  memory_barrier ();
+  sched_yield ();
+
+  if (time)
+    {
+      /* Designated initializers; the object must not be assigned to
+	 from inside its own initializer list.  */
+      struct timespec ts =
+	{
+	  .tv_sec = time,
+	  .tv_nsec = 0
+	};
+      clock_nanosleep (CLOCK_MONOTONIC, 0, &ts, NULL);
+      memory_barrier ();
+      stop_flag = 1;
+    }
+
+  for (i = 0; i < numthreads; i++)
+    {
+      pthread_join (thr[i], (void *)&res[i]);
+      free (res[i]);
+    }
+}
+
+struct ops hashwork_ops =
+{
+  .test = work_thread,
+};
+
+struct ops *ops;
+
+/* Run one round with NUMTHREADS workers and return the aggregated
+   cost: the sums of the per-thread `total' and `spinlock' tick counts
+   from gcount, plus the TSC ticks spent inside test_threads as a
+   whole, stored in `wall'.  */
+static struct count
+total_cost (int numthreads)
+{
+  unsigned int aux;
+
+  /* Clear the slots the workers are about to fill in.  */
+  memset (gcount, 0, sizeof(gcount[0]) * numthreads);
+
+  const unsigned long long begin = __builtin_ia32_rdtscp (&aux);
+  test_threads (numthreads, constant_time);
+  const unsigned long long wall = __builtin_ia32_rdtscp (&aux) - begin;
+
+  struct count cost = { .total = 0, .spinlock = 0, .wall = wall };
+  for (int t = 0; t < numthreads; t++)
+    {
+      cost.total += gcount[t].total;
+      cost.spinlock += gcount[t].spinlock;
+    }
+
+  return cost;
+}
+
+/* Measure a round with NUMTHREADS workers and print its summed
+   per-thread tick count together with the overhead, i.e. the
+   per-thread cost divided by the single-thread baseline BASE.  */
+static void
+report_overhead (int numthreads, const struct count *base)
+{
+  struct count cost = total_cost (numthreads);
+  double overhead = cost.total;
+  overhead /= numthreads;
+  overhead /= base->total;
+  /* cost.total is unsigned long long, hence %llu.  */
+  printf ("Number of threads: %4d, Total time %14llu, Overhead: %.2f\n",
+	  numthreads, cost.total, overhead);
+}
+
+/* Benchmark driver.  Returns 1 without measuring when fewer than 8
+   CPUs are online.  Otherwise it takes a single-thread baseline and
+   reports rounds for 2, 4, 8, ... threads, doubling while the doubled
+   count stays below the number of online CPUs and stepping by 16
+   afterwards, with a final round at exactly the number of online CPUs
+   if the sequence did not land on it.  */
+int
+main (void)
+{
+  int numthreads = sysconf (_SC_NPROCESSORS_ONLN);
+  if (numthreads < 8)
+    return 1;
+
+  ops = &hashwork_ops;
+
+  int err_ret = posix_memalign ((void **)&gcount, 4096,
+				sizeof(gcount[0]) * numthreads);
+  if (err_ret)
+    {
+      printf ("posix_memalign failure: %s\n", strerror (err_ret));
+      exit (err_ret);
+    }
+
+  struct count cost1 = total_cost (1);
+  int i, last;
+  for (last = i = 2; i <= numthreads;)
+    {
+      last = i;
+      report_overhead (i, &cost1);
+      if ((i * 2) < numthreads)
+	i = i * 2;
+      else
+	i = i + 16;
+    }
+
+  if (last != numthreads)
+    report_overhead (numthreads, &cost1);
+
+  free (gcount);
+  return 0;
+}

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=ad1b9827c6fcf6e21e992846e5c09efa80432921

commit ad1b9827c6fcf6e21e992846e5c09efa80432921
Author: H.J. Lu <hjl.tools@gmail.com>
Date:   Mon Nov 26 21:31:51 2018 +0800

    Integrate NUMA spinlock with glibc build

diff --git a/Makeconfig b/Makeconfig
index fbcf69e..190b375 100644
--- a/Makeconfig
+++ b/Makeconfig
@@ -566,7 +566,8 @@ link-libc-printers-tests = $(link-libc-rpath) \
 			   $(link-libc-tests-after-rpath-link)
 
 # This is how to find at build-time things that will be installed there.
-rpath-dirs = math elf dlfcn nss nis rt resolv mathvec support
+rpath-dirs = math elf dlfcn nss nis rt resolv mathvec support \
+	     numa-spinlock
 rpath-link = \
 $(common-objdir):$(subst $(empty) ,:,$(patsubst ../$(subdir),.,$(rpath-dirs:%=$(common-objpfx)%)))
 else  # build-static
@@ -1250,6 +1251,12 @@ else
 libsupport = $(common-objpfx)support/libsupport.a
 endif
 
+ifeq ($(build-shared),yes)
+libnuma-spinlock = $(common-objpfx)numa-spinlock/libnuma-spinlock.so$(libnuma-spinlock.so-version)
+else
+libnuma-spinlock = $(common-objpfx)numa-spinlock/libnuma-spinlock.a
+endif
+
 # These are the subdirectories containing the library source.  The order
 # is more or less arbitrary.  The sorting step will take care of the
 # dependencies.
@@ -1258,7 +1265,7 @@ all-subdirs = csu assert ctype locale intl catgets math setjmp signal	    \
 	      grp pwd posix io termios resource misc socket sysvipc gmon    \
 	      gnulib iconv iconvdata wctype manual shadow gshadow po argp   \
 	      localedata timezone rt conform debug mathvec support	    \
-	      dlfcn elf
+	      dlfcn elf numa-spinlock
 
 ifeq ($(build-crypt),yes)
 all-subdirs += crypt
diff --git a/configure b/configure
index 535e2f6..b48e5f4 100755
--- a/configure
+++ b/configure
@@ -6779,6 +6779,9 @@ fi
 config_vars="$config_vars
 build-mathvec = $build_mathvec"
 
+config_vars="$config_vars
+build-libnuma_spinlock = $build_libnuma_spinlock"
+
 
 
 
diff --git a/configure.ac b/configure.ac
index 6cc10ed..c09b3c6 100644
--- a/configure.ac
+++ b/configure.ac
@@ -1791,6 +1791,8 @@ if test x"$build_mathvec" = xnotset; then
 fi
 LIBC_CONFIG_VAR([build-mathvec], [$build_mathvec])
 
+LIBC_CONFIG_VAR([build-libnuma_spinlock], [$build_libnuma_spinlock])
+
 AC_SUBST(libc_extra_cflags)
 AC_SUBST(libc_extra_cppflags)
 
diff --git a/numa-spinlock/Makefile b/numa-spinlock/Makefile
new file mode 100644
index 0000000..43d4b08
--- /dev/null
+++ b/numa-spinlock/Makefile
@@ -0,0 +1,49 @@
+# Copyright (C) 2018 Free Software Foundation, Inc.
+# This file is part of the GNU C Library.
+
+# The GNU C Library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2.1 of the License, or (at your option) any later version.
+
+# The GNU C Library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# Lesser General Public License for more details.
+
+# You should have received a copy of the GNU Lesser General Public
+# License along with the GNU C Library; if not, see
+# <http://www.gnu.org/licenses/>.
+
+# Makefile for the numa spinlock library.
+
+subdir		:= numa-spinlock
+
+include ../Makeconfig
+
+ifeq ($(build-libnuma_spinlock),yes)
+extra-libs	:= libnuma_spinlock
+extra-libs-others = $(extra-libs)
+
+numa-spinlock-routines = numa-spinlock getmaxnumanode
+
+libnuma_spinlock-sysdep_routines += $(numa-spinlock-routines)
+libnuma_spinlock-tests +=
+tests += $(libnuma_spinlock-tests)
+
+ifeq ($(build-shared),yes)
+check-abi: $(objpfx)check-abi-libnuma_spinlock.out
+tests-special += $(objpfx)check-abi-libnuma_spinlock.out
+update-abi: update-abi-libnuma_spinlock
+update-all-abi: update-all-abi-libnuma_spinlock
+endif
+endif
+
+include ../Rules
+
+ifeq ($(build-shared),yes)
+libnuma_spinlock = $(objpfx)libnuma_spinlock.so
+else
+libnuma_spinlock = $(objpfx)libnuma_spinlock.a
+endif
+$(addprefix $(objpfx),$(libnuma_spinlock-tests)): $(libnuma_spinlock)
diff --git a/numa-spinlock/Versions b/numa-spinlock/Versions
new file mode 100644
index 0000000..92b656d
--- /dev/null
+++ b/numa-spinlock/Versions
@@ -0,0 +1,7 @@
+libnuma_spinlock {
+  GLIBC_2.29 {
+    apply_numa_spinlock;
+    init_numa_spinlock;
+    init_numa_spinlock_info;
+  }
+}
diff --git a/numa-spinlock/libnuma_spinlock.abilist b/numa-spinlock/libnuma_spinlock.abilist
new file mode 100644
index 0000000..e1f702e
--- /dev/null
+++ b/numa-spinlock/libnuma_spinlock.abilist
@@ -0,0 +1,3 @@
+GLIBC_2.29 apply_numa_spinlock F
+GLIBC_2.29 init_numa_spinlock F
+GLIBC_2.29 init_numa_spinlock_info F
diff --git a/shlib-versions b/shlib-versions
index b9cb99d..92576bf 100644
--- a/shlib-versions
+++ b/shlib-versions
@@ -75,3 +75,6 @@ libgcc_s=1
 
 # The vector math library
 libmvec=1
+
+# The numa spinlock library
+libnuma_spinlock=1
diff --git a/sysdeps/unix/sysv/linux/configure b/sysdeps/unix/sysv/linux/configure
index cace758..5ffeb1e 100644
--- a/sysdeps/unix/sysv/linux/configure
+++ b/sysdeps/unix/sysv/linux/configure
@@ -130,6 +130,9 @@ fi
 # One Linux we use ldconfig.
 use_ldconfig=yes
 
+# On Linux we build libnuma_spinlock.
+build_libnuma_spinlock=yes
+
 if test $host = $build; then
   # If $prefix/include/{net,scsi} are symlinks, make install will
   # clobber what they're linked to (probably a kernel tree).
diff --git a/sysdeps/unix/sysv/linux/configure.ac b/sysdeps/unix/sysv/linux/configure.ac
index 13abda0..5711859 100644
--- a/sysdeps/unix/sysv/linux/configure.ac
+++ b/sysdeps/unix/sysv/linux/configure.ac
@@ -89,6 +89,9 @@ fi
 # One Linux we use ldconfig.
 use_ldconfig=yes
 
+# On Linux we build libnuma_spinlock.
+build_libnuma_spinlock=yes
+
 if test $host = $build; then
   # If $prefix/include/{net,scsi} are symlinks, make install will
   # clobber what they're linked to (probably a kernel tree).

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=2d3e8fe426c072bf1bc84668b556c80d9c4bb4bb

commit 2d3e8fe426c072bf1bc84668b556c80d9c4bb4bb
Author: ling.ma <ling.ml@antfin.com>
Date:   Mon Nov 26 21:31:51 2018 +0800

    NUMA spinlock
    
    Wire latency (RC delay) dominates modern computer performance.  A
    serialized region protected by a conventional spinlock causes serious
    cache line ping-pong on multi-socket NUMA platforms.
    
    However, if the serialized region is sent to one core and executed there
    only when contention happens, it can save much time and power, because
    all shared data are located in the private cache of one core.  We call
    it NUMA spinlock.
    
    Currently multiple CPU sockets give us better performance per watt,
    but they also involve more complex synchronization requirements.
    For example, in a critical-section scenario, the lock cache line
    will ping-pong among CPU sockets, and competition for the lock
    among more cores also brings more overhead.  In this version
    we introduce a distributed synchronization mechanism, which
    reduces these issues a lot.  Assuming there are 2 nodes:
    
    1.
    
      If(the thread is from node_0)
        Lock_from_node_0
    2.
    
      If (the thread is from node_1)
        Lock_from_node_1
    
    3.
    
      Lock_Global
    
    4.
    
      Enter critical section
    
    5.
    
      If(the thread is from node_0)
        UnLock_from_node_0
    
    6.
    
      if (the thread is from node_1)
        UnLock_from_node_1
    
    7.  The threads from the same node complete the critical section one by
    one, until there are no waiting threads in that node.  During the
    process we also accelerate data and lock movement within the same node.
    
    8. UnLock_Global:  We allow threads from other nodes to enter critical
      section
    
    Steps 1 and 2 help us mitigate Global Lock pressure, and only one thread
    gets the Global Lock in steps 3 & 4.
    
    Steps 5 and 6 help us reduce Global Lock and shared data movement, because
    the lock and shared data stay in the same node.  NUMA spinlock is very
    good at step 7, which meanwhile also balances the workload of the Lock
    Owner in the original version.

diff --git a/numa-spinlock/getmaxnumanode.c b/numa-spinlock/getmaxnumanode.c
new file mode 100644
index 0000000..44a96fc
--- /dev/null
+++ b/numa-spinlock/getmaxnumanode.c
@@ -0,0 +1,145 @@
+/* Get the maximum NUMA node number, Linux version.
+   Copyright (C) 2018 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <assert.h>
+#include <ctype.h>
+#include <time.h>
+#include <string.h>
+#include <atomic.h>
+#include <not-cancel.h>
+
+#include "getmaxnumanode.h"
+
+/* Return a pointer to the start of the next line of FD's contents, or
+   NULL if no data is left or a read fails.
+
+   BUFFER .. BUFFER_END is the caller's scratch buffer.  *CP is the
+   current read position and *RE the end of the valid data; both are
+   updated.  On return *CP points just past the line's terminating
+   position.  The returned line is not NUL-terminated.  */
+static char *
+next_line (int fd, char *const buffer, char **cp, char **re,
+	   char *const buffer_end)
+{
+  char *res = *cp;
+  char *nl = memchr (*cp, '\n', *re - *cp);
+  if (nl == NULL)
+    {
+      /* No complete line in the unread data.  */
+      if (*cp != buffer)
+	{
+	  if (*re == buffer_end)
+	    {
+	      /* The valid data runs up to the end of the buffer (this
+		 is also the state when both *CP and *RE start out at
+		 BUFFER_END): move the unread tail to the front and
+		 read more behind it.  */
+	      memmove (buffer, *cp, *re - *cp);
+	      *re = buffer + (*re - *cp);
+	      *cp = buffer;
+
+	      ssize_t n = __read_nocancel (fd, *re, buffer_end - *re);
+	      if (n < 0)
+		return NULL;
+
+	      *re += n;
+
+	      nl = memchr (*cp, '\n', *re - *cp);
+	      while (nl == NULL && *re == buffer_end)
+		{
+		  /* Truncate too long lines.  */
+		  /* Keep the first three quarters of the buffer and
+		     reuse the last quarter for further reads until a
+		     newline shows up or the input runs short.  */
+		  *re = buffer + 3 * (buffer_end - buffer) / 4;
+		  n = __read_nocancel (fd, *re, buffer_end - *re);
+		  if (n < 0)
+		    return NULL;
+
+		  nl = memchr (*re, '\n', n);
+		  /* Put a newline at the truncation point; this
+		     overwrites the first byte just read.  */
+		  **re = '\n';
+		  *re += n;
+		}
+	    }
+	  else
+	    nl = memchr (*cp, '\n', *re - *cp);
+
+	  res = *cp;
+	}
+
+      /* Still no newline: treat the last valid byte as the end of the
+	 line.  */
+      if (nl == NULL)
+	nl = *re - 1;
+    }
+
+  *cp = nl + 1;
+  assert (*cp <= *re);
+
+  /* A line starting at the end of the valid data means no data.  */
+  return res == *re ? NULL : res;
+}
+
+/* Return one more than the highest NUMA node number listed in
+   /sys/devices/system/node/online, i.e. the number of slots needed to
+   index per-node data by node number.
+
+   The file holds a list such as "0", "0-3" or "0-1,3".  Returns 1 if
+   the file cannot be opened or yields no line, and 0 if an entry
+   cannot be parsed.  A non-zero result is cached and reused for as
+   long as time () keeps reporting the same second.
+
+   NOTE(review): the cache is a pair of plain statics ordered only by
+   the read/write barriers below -- confirm that this is adequate for
+   concurrent callers.  */
+unsigned int
+__get_max_numa_node (void)
+{
+  static unsigned int cached_result;
+  static time_t timestamp;
+
+  time_t now = time (NULL);
+  time_t prev = timestamp;
+  atomic_read_barrier ();
+  if (now == prev && cached_result)
+    return cached_result;
+
+  const size_t buffer_size = 1024;
+  char buffer[buffer_size];
+  char *buffer_end = buffer + buffer_size;
+  /* Start with an empty buffer so that next_line reads from FD.  */
+  char *cp = buffer_end;
+  char *re = buffer_end;
+
+  const int flags = O_RDONLY | O_CLOEXEC;
+  int fd = __open_nocancel ("/sys/devices/system/node/online", flags);
+  char *l;
+  unsigned int result = 1;
+  if (fd != -1)
+    {
+      l = next_line (fd, buffer, &cp, &re, buffer_end);
+      if (l != NULL)
+	do
+	  {
+	    char *endp;
+	    unsigned long int n = strtoul (l, &endp, 10);
+	    if (l == endp)
+	      {
+		result = 0;
+		break;
+	      }
+
+	    /* An entry is either a single number N or a range N-M.  */
+	    unsigned long int m = n;
+	    if (*endp == '-')
+	      {
+		l = endp + 1;
+		m = strtoul (l, &endp, 10);
+		if (l == endp)
+		  {
+		    result = 0;
+		    break;
+		  }
+	      }
+
+	    if (m >= result)
+	      result = m + 1;
+
+	    l = endp;
+	    /* Entries are separated by commas ("0-1,3").  Skip them
+	       together with whitespace so that the next strtoul starts
+	       on a digit instead of failing and zeroing the result.
+	       The cast keeps the isspace argument in range for
+	       <ctype.h>.  */
+	    while (l < re && (*l == ',' || isspace ((unsigned char) *l)))
+	      ++l;
+	  }
+	while (l < re);
+
+      __close_nocancel_nostatus (fd);
+    }
+
+  cached_result = result;
+  atomic_write_barrier ();
+  timestamp = now;
+
+  return result;
+}
diff --git a/numa-spinlock/getmaxnumanode.h b/numa-spinlock/getmaxnumanode.h
new file mode 100644
index 0000000..7187f5e
--- /dev/null
+++ b/numa-spinlock/getmaxnumanode.h
@@ -0,0 +1,20 @@
+/* Get the maximum NUMA node number, Linux version.
+   Copyright (C) 2018 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+extern unsigned int __get_max_numa_node (void)
+  __attribute__ ((visibility ("hidden")));
diff --git a/numa-spinlock/numa-spinlock.c b/numa-spinlock/numa-spinlock.c
new file mode 100644
index 0000000..9f05bd9
--- /dev/null
+++ b/numa-spinlock/numa-spinlock.c
@@ -0,0 +1,110 @@
+#include <string.h>
+#include <stdlib.h>
+#include <sched.h>
+#include <errno.h>
+#include <atomic.h>
+#include "numa-spinlock.h"
+#include "getmaxnumanode.h"
+
+/* Run OLD's workload and then, on behalf of the per-node list whose
+   head is *CPU, run the workload of every request that queued behind
+   OLD on that list.  OLD is the request that found the list empty
+   (see apply_numa_spinlock).  Returns after the list head has been
+   switched from OLD back to NULL and OLD's pending flag has been
+   cleared.
+
+   NOTE(review): requests are published with atomic_exchange_acquire,
+   which has no release semantics; on weakly ordered machines the
+   initial plain stores to a request may need stronger ordering --
+   confirm.  */
+static inline void
+run_numa_spinlock (struct numa_spinlock_info *old, void **cpu)
+{
+  struct numa_spinlock_info *next, *current;
+
+  old->result = old->workload (old->argument);
+
+retry:
+  /* If nobody queued behind OLD, empty the list and finish.  */
+  current = __sync_val_compare_and_swap (cpu, old, NULL);
+
+  if (current == old)
+    {
+      atomic_store_release (&current->pending, 0);
+      return;
+    }
+
+  /* Detach the waiters: take the most recently queued request and put
+     OLD back as the head so that later arrivals queue behind it.  */
+  current = atomic_exchange_acquire (cpu, old);
+
+repeat:
+  /* Reaching OLD means this batch is done; look for new arrivals.  */
+  if (old == current)
+    goto retry;
+
+  /* A waiter sets its link with a release store after swapping itself
+     in; pair it with an acquire load before using the request.  */
+  while (!(next = atomic_load_acquire (&current->next)))
+    atomic_spin_nop ();
+
+  current->result = current->workload (current->argument);
+  /* Release store, as for the other hand-offs: the result must be
+     visible before the waiter sees pending drop to zero.  CURRENT is
+     not touched afterwards because its owner may return and reuse
+     it.  */
+  atomic_store_release (&current->pending, 0);
+  current = next;
+  goto repeat;
+}
+
+/* Execute CURRENT->workload (CURRENT->argument) serialized with all
+   other requests applied to QUEUE.  The workload's return value is in
+   CURRENT->result when this function returns.  CURRENT must have been
+   set up with init_numa_spinlock_info and have its workload and
+   argument fields filled in.
+
+   A request that finds another request already queued on its node's
+   list links itself behind it and waits; some other thread runs its
+   workload.  A request that finds the node list empty joins the global
+   queue through QUEUE->owner, waits for its turn if there is a
+   predecessor, runs its own workload and those queued on its node
+   (run_numa_spinlock), and finally hands the global queue over to its
+   successor, if any.  */
+void
+apply_numa_spinlock (struct numa_spinlock *queue,
+		     struct numa_spinlock_info *current)
+{
+  struct numa_spinlock_info *old;
+  void **core;
+
+  current->next = NULL;
+  current->pending = 1;
+  core = &queue->cpu[current->node].core;
+  old = atomic_exchange_acquire (core , current);
+  if (old)
+    {
+      /* The node list was not empty: link behind the previous head and
+	 wait until our workload has been run for us.  The acquire load
+	 pairs with the release store that clears pending, so the
+	 result is visible to the caller.  */
+      atomic_store_release (&current->next, old);
+      while (atomic_load_acquire (&current->pending))
+	atomic_spin_nop ();
+      return;
+    }
+
+  /* First request on this node: join the global queue.  */
+  old = atomic_exchange_acquire (&queue->owner.core, current);
+  if (old)
+    {
+      /* Link behind the predecessor and wait for its hand-off.  The
+	 acquire load pairs with the release store at the end of this
+	 function as executed by the predecessor.  */
+      atomic_store_release (&old->next, current);
+      while (atomic_load_acquire (&current->pending))
+	atomic_spin_nop ();
+    }
+
+  run_numa_spinlock (current, core);
+  old = current;
+
+  /* Leave the global queue.  If we are still its tail it becomes
+     empty.  */
+  current = __sync_val_compare_and_swap (&queue->owner.core, old, NULL);
+  if (current == old)
+    return;
+
+  /* A successor swapped itself in; wait for it to link itself and
+     pass ownership on.  */
+  while (!(current = atomic_load_relaxed (&old->next)))
+    atomic_spin_nop ();
+
+  atomic_store_release (&current->pending, 0);
+}
+
+/* Prepare INFO for use with QUEUE: zero it and record the NUMA node
+   that getcpu reports for the calling thread.  Returns 0 on success.
+   If getcpu fails its return value is passed on; if the node number is
+   not below QUEUE->node_count, returns -1 with errno set to EINVAL.  */
+int
+init_numa_spinlock_info (struct numa_spinlock *queue,
+			 struct numa_spinlock_info *info)
+{
+  unsigned int cur_node;
+
+  memset (info, 0, sizeof (*info));
+
+  int rc = getcpu (NULL, &cur_node);
+  if (rc != 0)
+    return rc;
+
+  if (cur_node < queue->node_count)
+    {
+      info->node = cur_node;
+      return 0;
+    }
+
+  errno = EINVAL;
+  return -1;
+}
+
+/* Allocate a zero-filled, 64-byte-aligned NUMA spinlock with one
+   per-node slot for each node number below __get_max_numa_node ().
+   Returns NULL if the allocation fails.  */
+struct numa_spinlock *
+init_numa_spinlock (void)
+{
+  const unsigned int nodes = __get_max_numa_node ();
+  const size_t bytes = (sizeof (struct numa_spinlock)
+			+ nodes * sizeof (struct numa_spinlock_node));
+  void *mem;
+
+  if (posix_memalign (&mem, 64, bytes) != 0)
+    return NULL;
+
+  struct numa_spinlock *queue = memset (mem, 0, bytes);
+  queue->node_count = nodes;
+  return queue;
+}
diff --git a/numa-spinlock/numa-spinlock.h b/numa-spinlock/numa-spinlock.h
new file mode 100644
index 0000000..79d259f
--- /dev/null
+++ b/numa-spinlock/numa-spinlock.h
@@ -0,0 +1,42 @@
+/* NUMA spinlock.  */
+
+#ifndef _NUMA_SPINLOCK_H
+#define _NUMA_SPINLOCK_H
+
+#include <features.h>
+
+__BEGIN_DECLS
+
+/* One list head, padded and aligned to 64 bytes.  numa-spinlock.c
+   stores pointers to struct numa_spinlock_info in `core'; NULL means
+   the list is empty.  */
+struct numa_spinlock_node
+{
+  void *core;
+  char __pad[64 - sizeof (void *)];
+} __attribute__((aligned (64)));
+
+/* A NUMA spinlock, allocated by init_numa_spinlock.  `owner' is the
+   head of the global queue, `node_count' is the number of elements in
+   `cpu', and `cpu' holds one list head per NUMA node, indexed by node
+   number.  */
+struct numa_spinlock
+{
+  struct numa_spinlock_node owner;
+  unsigned int node_count;
+  struct numa_spinlock_node cpu[];
+};
+
+/* One request passed to apply_numa_spinlock.  init_numa_spinlock_info
+   zeroes the structure and fills in `node'; the caller then sets
+   `workload' and `argument'.  */
+struct numa_spinlock_info
+{
+  /* Link to another queued request; written by apply_numa_spinlock.  */
+  struct numa_spinlock_info *next;
+  /* Function to run while holding the lock, and its argument.  */
+  void *(*workload) (void *);
+  void *argument;
+  /* Return value of the workload, stored by whichever thread ran it.  */
+  void *result;
+  /* NUMA node index into the lock's per-node array.  */
+  unsigned int node;
+  /* Set to 1 by apply_numa_spinlock and cleared to 0 to release a
+     waiting request.  */
+  int pending;
+  /* Pads the structure to 64 bytes when pointers are 8 bytes wide.  */
+  char __pad[64 - (4 * sizeof (void *) + 2 * sizeof (int))];
+} __attribute__((aligned (64)));
+
+extern struct numa_spinlock *init_numa_spinlock (void);
+extern int init_numa_spinlock_info (struct numa_spinlock *,
+				    struct numa_spinlock_info *);
+extern void apply_numa_spinlock (struct numa_spinlock *,
+				 struct numa_spinlock_info *);
+
+__END_DECLS
+
+#endif /* numa-spinlock.h */

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=edf0e8943a164e21a854568c2b99458d6fbdc192

commit edf0e8943a164e21a854568c2b99458d6fbdc192
Author: H.J. Lu <hjl.tools@gmail.com>
Date:   Wed Dec 5 08:36:46 2018 -0800

    Add getcpu
    
    Add
    
      #include <sched.h>
    
      int getcpu (unsigned int *cpu, unsigned int *node);
    
    to return currently used CPU and NUMA node.
    
    Tested on x86-64, x32 and i686 as well as with build-many-glibcs.py.
    
    	* NEWS: Mention getcpu.
    	* include/sched.h (__getcpu): New libc_hidden_proto.
    	* manual/resource.texi: Document getcpu.
    	* sysdeps/unix/sysv/linux/Makefile (sysdep_routines): Add getcpu.
    	* sysdeps/unix/sysv/linux/Versions (GLIBC_2.29): Add getcpu.
    	* sysdeps/unix/sysv/linux/aarch64/libc.abilist: Add getcpu.
    	* sysdeps/unix/sysv/linux/alpha/libc.abilist: Likewise.
    	* sysdeps/unix/sysv/linux/arm/libc.abilist: Likewise.
    	* sysdeps/unix/sysv/linux/hppa/libc.abilist: Likewise.
    	* sysdeps/unix/sysv/linux/i386/libc.abilist: Likewise.
    	* sysdeps/unix/sysv/linux/ia64/libc.abilist: Likewise.
    	* sysdeps/unix/sysv/linux/m68k/coldfire/libc.abilist: Likewise.
    	* sysdeps/unix/sysv/linux/m68k/m680x0/libc.abilist: Likewise.
    	* sysdeps/unix/sysv/linux/microblaze/libc.abilist: Likewise.
    	* sysdeps/unix/sysv/linux/mips/mips32/fpu/libc.abilist: Likewise.
    	* sysdeps/unix/sysv/linux/mips/mips32/nofpu/libc.abilist:
    	Likewise.
    	* sysdeps/unix/sysv/linux/mips/mips64/n32/libc.abilist: Likewise.
    	* sysdeps/unix/sysv/linux/mips/mips64/n64/libc.abilist: Likewise.
    	* sysdeps/unix/sysv/linux/nios2/libc.abilist: Likewise.
    	* sysdeps/unix/sysv/linux/powerpc/powerpc32/fpu/libc.abilist:
    	Likewise.
    	* sysdeps/unix/sysv/linux/powerpc/powerpc32/nofpu/libc.abilist:
    	Likewise.
    	* sysdeps/unix/sysv/linux/powerpc/powerpc64/libc.abilist:
    	Likewise.
    	* sysdeps/unix/sysv/linux/riscv/rv64/libc.abilist: Likewise.
    	* sysdeps/unix/sysv/linux/s390/s390-32/libc.abilist: Likewise.
    	* sysdeps/unix/sysv/linux/s390/s390-64/libc.abilist: Likewise.
    	* sysdeps/unix/sysv/linux/sh/libc.abilist: Likewise.
    	* sysdeps/unix/sysv/linux/sparc/sparc32/libc.abilist: Likewise.
    	* sysdeps/unix/sysv/linux/sparc/sparc64/libc.abilist: Likewise.
    	* sysdeps/unix/sysv/linux/x86_64/64/libc.abilist: Likewise.
    	* sysdeps/unix/sysv/linux/x86_64/x32/libc.abilist: Likewise.
    	* sysdeps/unix/sysv/linux/bits/sched.h (getcpu): New prototype.
    	* sysdeps/unix/sysv/linux/getcpu.c: New file.
    	* sysdeps/unix/sysv/linux/tst-skeleton-affinity.c (test_size):
    	Also check getcpu.

diff --git a/NEWS b/NEWS
index 8483dcf..3ade0e6 100644
--- a/NEWS
+++ b/NEWS
@@ -9,6 +9,9 @@ Version 2.29
 
 Major new features:
 
+* The getcpu wrapper function has been added, which returns the CPU and
+  NUMA node currently in use.  This function is Linux-specific.
+
 * A new convenience target has been added for distribution maintainers
   to build and install all locales as directories with files.  The new
   target is run by issuing the following command in your build tree:
diff --git a/include/sched.h b/include/sched.h
index b698f78..0843c26 100644
--- a/include/sched.h
+++ b/include/sched.h
@@ -26,5 +26,7 @@ libc_hidden_proto (__clone)
 extern int __clone2 (int (*__fn) (void *__arg), void *__child_stack_base,
 		     size_t __child_stack_size, int __flags, void *__arg, ...);
 libc_hidden_proto (__clone2)
+extern __typeof__ (getcpu) __getcpu;
+libc_hidden_proto (__getcpu)
 #endif
 #endif
diff --git a/manual/resource.texi b/manual/resource.texi
index 8bc2a80..8c4c92a 100644
--- a/manual/resource.texi
+++ b/manual/resource.texi
@@ -1429,6 +1429,27 @@ not leave a processor for the process or thread to run on.
 This function is a GNU extension and is declared in @file{sched.h}.
 @end deftypefun
 
+@deftypefun int getcpu (unsigned int *cpu, unsigned int *node)
+@standards{Linux, <sched.h>}
+@safety{@prelim{}@mtsafe{}@assafe{}@acsafe{}}
+The @code{getcpu} function identifies the processor and node on which
+the calling thread or process is currently running and writes them into
+the integers pointed to by the @var{cpu} and @var{node} arguments.  The
+processor is a unique nonnegative integer identifying a CPU.  The node
+is a unique nonnegative integer identifying a NUMA node.  When either
+@var{cpu} or @var{node} is @code{NULL}, nothing is written to the
+respective pointer.
+
+The return value is @code{0} on success and @code{-1} on failure.  The
+following @code{errno} error condition is defined for this function:
+
+@table @code
+@item ENOSYS
+The operating system does not support this function.
+@end table
+
+This function is Linux-specific and is declared in @file{sched.h}.
+@end deftypefun
 
 @node Memory Resources
 @section Querying memory available resources
diff --git a/sysdeps/unix/sysv/linux/Makefile b/sysdeps/unix/sysv/linux/Makefile
index 362cf3b..988855d 100644
--- a/sysdeps/unix/sysv/linux/Makefile
+++ b/sysdeps/unix/sysv/linux/Makefile
@@ -142,7 +142,7 @@ endif
 ifeq ($(subdir),posix)
 sysdep_headers += bits/initspin.h
 
-sysdep_routines += sched_getcpu oldglob
+sysdep_routines += sched_getcpu oldglob getcpu
 
 tests += tst-affinity tst-affinity-pid
 
diff --git a/sysdeps/unix/sysv/linux/Versions b/sysdeps/unix/sysv/linux/Versions
index 336c13b..f1e12d9 100644
--- a/sysdeps/unix/sysv/linux/Versions
+++ b/sysdeps/unix/sysv/linux/Versions
@@ -171,6 +171,9 @@ libc {
     mlock2;
     pkey_alloc; pkey_free; pkey_set; pkey_get; pkey_mprotect;
   }
+  GLIBC_2.29 {
+    getcpu;
+  }
   GLIBC_PRIVATE {
     # functions used in other libraries
     __syscall_rt_sigqueueinfo;
diff --git a/sysdeps/unix/sysv/linux/aarch64/libc.abilist b/sysdeps/unix/sysv/linux/aarch64/libc.abilist
index e66c741..c2ca32e 100644
--- a/sysdeps/unix/sysv/linux/aarch64/libc.abilist
+++ b/sysdeps/unix/sysv/linux/aarch64/libc.abilist
@@ -2138,4 +2138,5 @@ GLIBC_2.28 thrd_current F
 GLIBC_2.28 thrd_equal F
 GLIBC_2.28 thrd_sleep F
 GLIBC_2.28 thrd_yield F
+GLIBC_2.29 getcpu F
 GLIBC_2.29 posix_spawn_file_actions_addchdir_np F
diff --git a/sysdeps/unix/sysv/linux/alpha/libc.abilist b/sysdeps/unix/sysv/linux/alpha/libc.abilist
index 8df162f..a86ce44 100644
--- a/sysdeps/unix/sysv/linux/alpha/libc.abilist
+++ b/sysdeps/unix/sysv/linux/alpha/libc.abilist
@@ -2033,6 +2033,7 @@ GLIBC_2.28 thrd_current F
 GLIBC_2.28 thrd_equal F
 GLIBC_2.28 thrd_sleep F
 GLIBC_2.28 thrd_yield F
+GLIBC_2.29 getcpu F
 GLIBC_2.29 posix_spawn_file_actions_addchdir_np F
 GLIBC_2.3 __ctype_b_loc F
 GLIBC_2.3 __ctype_tolower_loc F
diff --git a/sysdeps/unix/sysv/linux/arm/libc.abilist b/sysdeps/unix/sysv/linux/arm/libc.abilist
index 43c804f..b10c07a 100644
--- a/sysdeps/unix/sysv/linux/arm/libc.abilist
+++ b/sysdeps/unix/sysv/linux/arm/libc.abilist
@@ -123,6 +123,7 @@ GLIBC_2.28 thrd_current F
 GLIBC_2.28 thrd_equal F
 GLIBC_2.28 thrd_sleep F
 GLIBC_2.28 thrd_yield F
+GLIBC_2.29 getcpu F
 GLIBC_2.29 posix_spawn_file_actions_addchdir_np F
 GLIBC_2.4 _Exit F
 GLIBC_2.4 _IO_2_1_stderr_ D 0xa0
diff --git a/sysdeps/unix/sysv/linux/bits/sched.h b/sysdeps/unix/sysv/linux/bits/sched.h
index 34f27a7..ea5d51a 100644
--- a/sysdeps/unix/sysv/linux/bits/sched.h
+++ b/sysdeps/unix/sysv/linux/bits/sched.h
@@ -86,6 +86,9 @@ extern int unshare (int __flags) __THROW;
 /* Get index of currently used CPU.  */
 extern int sched_getcpu (void) __THROW;
 
+/* Get currently used CPU and NUMA node.  */
+extern int getcpu (unsigned int *, unsigned int *) __THROW;
+
 /* Switch process to namespace of type NSTYPE indicated by FD.  */
 extern int setns (int __fd, int __nstype) __THROW;
 #endif
diff --git a/sysdeps/unix/sysv/linux/getcpu.c b/sysdeps/unix/sysv/linux/getcpu.c
new file mode 100644
index 0000000..59c6cc4
--- /dev/null
+++ b/sysdeps/unix/sysv/linux/getcpu.c
@@ -0,0 +1,38 @@
+/* Copyright (C) 2007-2018 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <errno.h>
+#include <sched.h>
+#include <sysdep.h>
+
+#ifdef HAVE_GETCPU_VSYSCALL
+# define HAVE_VSYSCALL
+#endif
+#include <sysdep-vdso.h>
+
+int
+__getcpu (unsigned int *cpu, unsigned int *node)
+{
+#ifdef __NR_getcpu
+  return INLINE_VSYSCALL (getcpu, 3, cpu, node, NULL);
+#else
+  __set_errno (ENOSYS);
+  return -1;
+#endif
+}
+weak_alias (__getcpu, getcpu)
+libc_hidden_def (__getcpu)
diff --git a/sysdeps/unix/sysv/linux/hppa/libc.abilist b/sysdeps/unix/sysv/linux/hppa/libc.abilist
index 88b01c2..3527d39 100644
--- a/sysdeps/unix/sysv/linux/hppa/libc.abilist
+++ b/sysdeps/unix/sysv/linux/hppa/libc.abilist
@@ -1880,6 +1880,7 @@ GLIBC_2.28 thrd_current F
 GLIBC_2.28 thrd_equal F
 GLIBC_2.28 thrd_sleep F
 GLIBC_2.28 thrd_yield F
+GLIBC_2.29 getcpu F
 GLIBC_2.29 posix_spawn_file_actions_addchdir_np F
 GLIBC_2.3 __ctype_b_loc F
 GLIBC_2.3 __ctype_tolower_loc F
diff --git a/sysdeps/unix/sysv/linux/i386/libc.abilist b/sysdeps/unix/sysv/linux/i386/libc.abilist
index 6d02f31..979aaae 100644
--- a/sysdeps/unix/sysv/linux/i386/libc.abilist
+++ b/sysdeps/unix/sysv/linux/i386/libc.abilist
@@ -2045,6 +2045,7 @@ GLIBC_2.28 thrd_current F
 GLIBC_2.28 thrd_equal F
 GLIBC_2.28 thrd_sleep F
 GLIBC_2.28 thrd_yield F
+GLIBC_2.29 getcpu F
 GLIBC_2.29 posix_spawn_file_actions_addchdir_np F
 GLIBC_2.3 __ctype_b_loc F
 GLIBC_2.3 __ctype_tolower_loc F
diff --git a/sysdeps/unix/sysv/linux/ia64/libc.abilist b/sysdeps/unix/sysv/linux/ia64/libc.abilist
index 4249712..a4a9023 100644
--- a/sysdeps/unix/sysv/linux/ia64/libc.abilist
+++ b/sysdeps/unix/sysv/linux/ia64/libc.abilist
@@ -1914,6 +1914,7 @@ GLIBC_2.28 thrd_current F
 GLIBC_2.28 thrd_equal F
 GLIBC_2.28 thrd_sleep F
 GLIBC_2.28 thrd_yield F
+GLIBC_2.29 getcpu F
 GLIBC_2.29 posix_spawn_file_actions_addchdir_np F
 GLIBC_2.3 __ctype_b_loc F
 GLIBC_2.3 __ctype_tolower_loc F
diff --git a/sysdeps/unix/sysv/linux/m68k/coldfire/libc.abilist b/sysdeps/unix/sysv/linux/m68k/coldfire/libc.abilist
index d47b808..9f86f26 100644
--- a/sysdeps/unix/sysv/linux/m68k/coldfire/libc.abilist
+++ b/sysdeps/unix/sysv/linux/m68k/coldfire/libc.abilist
@@ -124,6 +124,7 @@ GLIBC_2.28 thrd_current F
 GLIBC_2.28 thrd_equal F
 GLIBC_2.28 thrd_sleep F
 GLIBC_2.28 thrd_yield F
+GLIBC_2.29 getcpu F
 GLIBC_2.29 posix_spawn_file_actions_addchdir_np F
 GLIBC_2.4 _Exit F
 GLIBC_2.4 _IO_2_1_stderr_ D 0x98
diff --git a/sysdeps/unix/sysv/linux/m68k/m680x0/libc.abilist b/sysdeps/unix/sysv/linux/m68k/m680x0/libc.abilist
index d5e3830..558ba5f 100644
--- a/sysdeps/unix/sysv/linux/m68k/m680x0/libc.abilist
+++ b/sysdeps/unix/sysv/linux/m68k/m680x0/libc.abilist
@@ -1989,6 +1989,7 @@ GLIBC_2.28 thrd_current F
 GLIBC_2.28 thrd_equal F
 GLIBC_2.28 thrd_sleep F
 GLIBC_2.28 thrd_yield F
+GLIBC_2.29 getcpu F
 GLIBC_2.29 posix_spawn_file_actions_addchdir_np F
 GLIBC_2.3 __ctype_b_loc F
 GLIBC_2.3 __ctype_tolower_loc F
diff --git a/sysdeps/unix/sysv/linux/microblaze/libc.abilist b/sysdeps/unix/sysv/linux/microblaze/libc.abilist
index 8596b84..ca773e3 100644
--- a/sysdeps/unix/sysv/linux/microblaze/libc.abilist
+++ b/sysdeps/unix/sysv/linux/microblaze/libc.abilist
@@ -2130,4 +2130,5 @@ GLIBC_2.28 thrd_current F
 GLIBC_2.28 thrd_equal F
 GLIBC_2.28 thrd_sleep F
 GLIBC_2.28 thrd_yield F
+GLIBC_2.29 getcpu F
 GLIBC_2.29 posix_spawn_file_actions_addchdir_np F
diff --git a/sysdeps/unix/sysv/linux/mips/mips32/fpu/libc.abilist b/sysdeps/unix/sysv/linux/mips/mips32/fpu/libc.abilist
index 88e0f89..89c5c29 100644
--- a/sysdeps/unix/sysv/linux/mips/mips32/fpu/libc.abilist
+++ b/sysdeps/unix/sysv/linux/mips/mips32/fpu/libc.abilist
@@ -1967,6 +1967,7 @@ GLIBC_2.28 thrd_current F
 GLIBC_2.28 thrd_equal F
 GLIBC_2.28 thrd_sleep F
 GLIBC_2.28 thrd_yield F
+GLIBC_2.29 getcpu F
 GLIBC_2.29 posix_spawn_file_actions_addchdir_np F
 GLIBC_2.3 __ctype_b_loc F
 GLIBC_2.3 __ctype_tolower_loc F
diff --git a/sysdeps/unix/sysv/linux/mips/mips32/nofpu/libc.abilist b/sysdeps/unix/sysv/linux/mips/mips32/nofpu/libc.abilist
index aff7462..429624b 100644
--- a/sysdeps/unix/sysv/linux/mips/mips32/nofpu/libc.abilist
+++ b/sysdeps/unix/sysv/linux/mips/mips32/nofpu/libc.abilist
@@ -1965,6 +1965,7 @@ GLIBC_2.28 thrd_current F
 GLIBC_2.28 thrd_equal F
 GLIBC_2.28 thrd_sleep F
 GLIBC_2.28 thrd_yield F
+GLIBC_2.29 getcpu F
 GLIBC_2.29 posix_spawn_file_actions_addchdir_np F
 GLIBC_2.3 __ctype_b_loc F
 GLIBC_2.3 __ctype_tolower_loc F
diff --git a/sysdeps/unix/sysv/linux/mips/mips64/n32/libc.abilist b/sysdeps/unix/sysv/linux/mips/mips64/n32/libc.abilist
index 71d8244..fbe714e 100644
--- a/sysdeps/unix/sysv/linux/mips/mips64/n32/libc.abilist
+++ b/sysdeps/unix/sysv/linux/mips/mips64/n32/libc.abilist
@@ -1973,6 +1973,7 @@ GLIBC_2.28 thrd_current F
 GLIBC_2.28 thrd_equal F
 GLIBC_2.28 thrd_sleep F
 GLIBC_2.28 thrd_yield F
+GLIBC_2.29 getcpu F
 GLIBC_2.29 posix_spawn_file_actions_addchdir_np F
 GLIBC_2.3 __ctype_b_loc F
 GLIBC_2.3 __ctype_tolower_loc F
diff --git a/sysdeps/unix/sysv/linux/mips/mips64/n64/libc.abilist b/sysdeps/unix/sysv/linux/mips/mips64/n64/libc.abilist
index de6c53d..baa657f 100644
--- a/sysdeps/unix/sysv/linux/mips/mips64/n64/libc.abilist
+++ b/sysdeps/unix/sysv/linux/mips/mips64/n64/libc.abilist
@@ -1968,6 +1968,7 @@ GLIBC_2.28 thrd_current F
 GLIBC_2.28 thrd_equal F
 GLIBC_2.28 thrd_sleep F
 GLIBC_2.28 thrd_yield F
+GLIBC_2.29 getcpu F
 GLIBC_2.29 posix_spawn_file_actions_addchdir_np F
 GLIBC_2.3 __ctype_b_loc F
 GLIBC_2.3 __ctype_tolower_loc F
diff --git a/sysdeps/unix/sysv/linux/nios2/libc.abilist b/sysdeps/unix/sysv/linux/nios2/libc.abilist
index e724bab..a2f407b 100644
--- a/sysdeps/unix/sysv/linux/nios2/libc.abilist
+++ b/sysdeps/unix/sysv/linux/nios2/libc.abilist
@@ -2171,4 +2171,5 @@ GLIBC_2.28 thrd_current F
 GLIBC_2.28 thrd_equal F
 GLIBC_2.28 thrd_sleep F
 GLIBC_2.28 thrd_yield F
+GLIBC_2.29 getcpu F
 GLIBC_2.29 posix_spawn_file_actions_addchdir_np F
diff --git a/sysdeps/unix/sysv/linux/powerpc/powerpc32/fpu/libc.abilist b/sysdeps/unix/sysv/linux/powerpc/powerpc32/fpu/libc.abilist
index e9ecbcc..1fd4fca 100644
--- a/sysdeps/unix/sysv/linux/powerpc/powerpc32/fpu/libc.abilist
+++ b/sysdeps/unix/sysv/linux/powerpc/powerpc32/fpu/libc.abilist
@@ -1993,6 +1993,7 @@ GLIBC_2.28 thrd_current F
 GLIBC_2.28 thrd_equal F
 GLIBC_2.28 thrd_sleep F
 GLIBC_2.28 thrd_yield F
+GLIBC_2.29 getcpu F
 GLIBC_2.29 posix_spawn_file_actions_addchdir_np F
 GLIBC_2.3 __ctype_b_loc F
 GLIBC_2.3 __ctype_tolower_loc F
diff --git a/sysdeps/unix/sysv/linux/powerpc/powerpc32/nofpu/libc.abilist b/sysdeps/unix/sysv/linux/powerpc/powerpc32/nofpu/libc.abilist
index da83ea6..cd3dcc0 100644
--- a/sysdeps/unix/sysv/linux/powerpc/powerpc32/nofpu/libc.abilist
+++ b/sysdeps/unix/sysv/linux/powerpc/powerpc32/nofpu/libc.abilist
@@ -1997,6 +1997,7 @@ GLIBC_2.28 thrd_current F
 GLIBC_2.28 thrd_equal F
 GLIBC_2.28 thrd_sleep F
 GLIBC_2.28 thrd_yield F
+GLIBC_2.29 getcpu F
 GLIBC_2.29 posix_spawn_file_actions_addchdir_np F
 GLIBC_2.3 __ctype_b_loc F
 GLIBC_2.3 __ctype_tolower_loc F
diff --git a/sysdeps/unix/sysv/linux/powerpc/powerpc64/libc-le.abilist b/sysdeps/unix/sysv/linux/powerpc/powerpc64/libc-le.abilist
index 4535b40..54ddbb3 100644
--- a/sysdeps/unix/sysv/linux/powerpc/powerpc64/libc-le.abilist
+++ b/sysdeps/unix/sysv/linux/powerpc/powerpc64/libc-le.abilist
@@ -2228,4 +2228,5 @@ GLIBC_2.28 thrd_current F
 GLIBC_2.28 thrd_equal F
 GLIBC_2.28 thrd_sleep F
 GLIBC_2.28 thrd_yield F
+GLIBC_2.29 getcpu F
 GLIBC_2.29 posix_spawn_file_actions_addchdir_np F
diff --git a/sysdeps/unix/sysv/linux/powerpc/powerpc64/libc.abilist b/sysdeps/unix/sysv/linux/powerpc/powerpc64/libc.abilist
index 65725de..5b9e494 100644
--- a/sysdeps/unix/sysv/linux/powerpc/powerpc64/libc.abilist
+++ b/sysdeps/unix/sysv/linux/powerpc/powerpc64/libc.abilist
@@ -123,6 +123,7 @@ GLIBC_2.28 thrd_current F
 GLIBC_2.28 thrd_equal F
 GLIBC_2.28 thrd_sleep F
 GLIBC_2.28 thrd_yield F
+GLIBC_2.29 getcpu F
 GLIBC_2.29 posix_spawn_file_actions_addchdir_np F
 GLIBC_2.3 _Exit F
 GLIBC_2.3 _IO_2_1_stderr_ D 0xe0
diff --git a/sysdeps/unix/sysv/linux/riscv/rv64/libc.abilist b/sysdeps/unix/sysv/linux/riscv/rv64/libc.abilist
index bbb3c4a..36f5de9 100644
--- a/sysdeps/unix/sysv/linux/riscv/rv64/libc.abilist
+++ b/sysdeps/unix/sysv/linux/riscv/rv64/libc.abilist
@@ -2100,4 +2100,5 @@ GLIBC_2.28 thrd_current F
 GLIBC_2.28 thrd_equal F
 GLIBC_2.28 thrd_sleep F
 GLIBC_2.28 thrd_yield F
+GLIBC_2.29 getcpu F
 GLIBC_2.29 posix_spawn_file_actions_addchdir_np F
diff --git a/sysdeps/unix/sysv/linux/s390/s390-32/libc.abilist b/sysdeps/unix/sysv/linux/s390/s390-32/libc.abilist
index e85ac2a..b74e995 100644
--- a/sysdeps/unix/sysv/linux/s390/s390-32/libc.abilist
+++ b/sysdeps/unix/sysv/linux/s390/s390-32/libc.abilist
@@ -2002,6 +2002,7 @@ GLIBC_2.28 thrd_current F
 GLIBC_2.28 thrd_equal F
 GLIBC_2.28 thrd_sleep F
 GLIBC_2.28 thrd_yield F
+GLIBC_2.29 getcpu F
 GLIBC_2.29 posix_spawn_file_actions_addchdir_np F
 GLIBC_2.3 __ctype_b_loc F
 GLIBC_2.3 __ctype_tolower_loc F
diff --git a/sysdeps/unix/sysv/linux/s390/s390-64/libc.abilist b/sysdeps/unix/sysv/linux/s390/s390-64/libc.abilist
index d569310..964ada9 100644
--- a/sysdeps/unix/sysv/linux/s390/s390-64/libc.abilist
+++ b/sysdeps/unix/sysv/linux/s390/s390-64/libc.abilist
@@ -1908,6 +1908,7 @@ GLIBC_2.28 thrd_equal F
 GLIBC_2.28 thrd_sleep F
 GLIBC_2.28 thrd_yield F
 GLIBC_2.29 __fentry__ F
+GLIBC_2.29 getcpu F
 GLIBC_2.29 posix_spawn_file_actions_addchdir_np F
 GLIBC_2.3 __ctype_b_loc F
 GLIBC_2.3 __ctype_tolower_loc F
diff --git a/sysdeps/unix/sysv/linux/sh/libc.abilist b/sysdeps/unix/sysv/linux/sh/libc.abilist
index ff939a1..7c689af 100644
--- a/sysdeps/unix/sysv/linux/sh/libc.abilist
+++ b/sysdeps/unix/sysv/linux/sh/libc.abilist
@@ -1884,6 +1884,7 @@ GLIBC_2.28 thrd_current F
 GLIBC_2.28 thrd_equal F
 GLIBC_2.28 thrd_sleep F
 GLIBC_2.28 thrd_yield F
+GLIBC_2.29 getcpu F
 GLIBC_2.29 posix_spawn_file_actions_addchdir_np F
 GLIBC_2.3 __ctype_b_loc F
 GLIBC_2.3 __ctype_tolower_loc F
diff --git a/sysdeps/unix/sysv/linux/sparc/sparc32/libc.abilist b/sysdeps/unix/sysv/linux/sparc/sparc32/libc.abilist
index 64fa9e1..e3cc7c1 100644
--- a/sysdeps/unix/sysv/linux/sparc/sparc32/libc.abilist
+++ b/sysdeps/unix/sysv/linux/sparc/sparc32/libc.abilist
@@ -1996,6 +1996,7 @@ GLIBC_2.28 thrd_current F
 GLIBC_2.28 thrd_equal F
 GLIBC_2.28 thrd_sleep F
 GLIBC_2.28 thrd_yield F
+GLIBC_2.29 getcpu F
 GLIBC_2.29 posix_spawn_file_actions_addchdir_np F
 GLIBC_2.3 __ctype_b_loc F
 GLIBC_2.3 __ctype_tolower_loc F
diff --git a/sysdeps/unix/sysv/linux/sparc/sparc64/libc.abilist b/sysdeps/unix/sysv/linux/sparc/sparc64/libc.abilist
index db909d1..a7dae1e 100644
--- a/sysdeps/unix/sysv/linux/sparc/sparc64/libc.abilist
+++ b/sysdeps/unix/sysv/linux/sparc/sparc64/libc.abilist
@@ -1937,6 +1937,7 @@ GLIBC_2.28 thrd_current F
 GLIBC_2.28 thrd_equal F
 GLIBC_2.28 thrd_sleep F
 GLIBC_2.28 thrd_yield F
+GLIBC_2.29 getcpu F
 GLIBC_2.29 posix_spawn_file_actions_addchdir_np F
 GLIBC_2.3 __ctype_b_loc F
 GLIBC_2.3 __ctype_tolower_loc F
diff --git a/sysdeps/unix/sysv/linux/tst-skeleton-affinity.c b/sysdeps/unix/sysv/linux/tst-skeleton-affinity.c
index 695c1cc..fd1357b 100644
--- a/sysdeps/unix/sysv/linux/tst-skeleton-affinity.c
+++ b/sysdeps/unix/sysv/linux/tst-skeleton-affinity.c
@@ -165,6 +165,18 @@ test_size (const struct conf *conf, size_t size)
   for (int cpu = 0; cpu <= conf->last_cpu; ++cpu)
     {
       int active_cpu = sched_getcpu ();
+      unsigned int numa_cpu, numa_node;
+      if (getcpu (&numa_cpu, &numa_node))
+	{
+	  printf ("error: getcpu: %m\n");
+	  return false;
+	}
+      if ((unsigned int) active_cpu != numa_cpu)
+	{
+	  printf ("error: Unexpected CPU %d, expected %d\n",
+		  active_cpu, numa_cpu);
+	  return false;
+	}
       if (last_active_cpu >= 0 && last_active_cpu != active_cpu)
 	{
 	  printf ("error: Unexpected CPU %d, expected %d\n",
diff --git a/sysdeps/unix/sysv/linux/x86_64/64/libc.abilist b/sysdeps/unix/sysv/linux/x86_64/64/libc.abilist
index 3b175f1..87b6b6d 100644
--- a/sysdeps/unix/sysv/linux/x86_64/64/libc.abilist
+++ b/sysdeps/unix/sysv/linux/x86_64/64/libc.abilist
@@ -1895,6 +1895,7 @@ GLIBC_2.28 thrd_current F
 GLIBC_2.28 thrd_equal F
 GLIBC_2.28 thrd_sleep F
 GLIBC_2.28 thrd_yield F
+GLIBC_2.29 getcpu F
 GLIBC_2.29 posix_spawn_file_actions_addchdir_np F
 GLIBC_2.3 __ctype_b_loc F
 GLIBC_2.3 __ctype_tolower_loc F
diff --git a/sysdeps/unix/sysv/linux/x86_64/x32/libc.abilist b/sysdeps/unix/sysv/linux/x86_64/x32/libc.abilist
index 1b57710..39a0009 100644
--- a/sysdeps/unix/sysv/linux/x86_64/x32/libc.abilist
+++ b/sysdeps/unix/sysv/linux/x86_64/x32/libc.abilist
@@ -2146,4 +2146,5 @@ GLIBC_2.28 thrd_current F
 GLIBC_2.28 thrd_equal F
 GLIBC_2.28 thrd_sleep F
 GLIBC_2.28 thrd_yield F
+GLIBC_2.29 getcpu F
 GLIBC_2.29 posix_spawn_file_actions_addchdir_np F

-----------------------------------------------------------------------


hooks/post-receive
-- 
GNU C Library master sources



More information about the Glibc-cvs mailing list