[PATCH] NUMA spinlock [BZ #23962]
Rich Felker
dalias@libc.org
Thu Jan 3 20:43:00 GMT 2019
On Wed, Dec 26, 2018 at 10:50:19AM +0800, Ma Ling wrote:
> From: "ling.ma" <ling.ml@antfin.com>
>
> On multi-socket systems, memory is shared across the entire system.
> Data access to the local socket is much faster than the remote socket
> and data access to the local core is faster than sibling cores on the
> same socket. For serialized workloads with conventional spinlock,
> when there is high spinlock contention between threads, lock ping-pong
> among sockets becomes the bottleneck and threads spend the majority of
> their time in spinlock overhead.
>
> On multi-socket systems, the keys to our NUMA spinlock performance
> are to minimize cross-socket traffic as well as localize the serialized
> workload to one core for execution. The basic principles of NUMA
> spinlock mainly consist of the following approaches, which reduce
> data movement and accelerate the critical section, eventually giving us
> a significant performance improvement.
I question whether this belongs in glibc. It seems highly application-
and kernel-specific. Is there a reason you wouldn't prefer to
implement and maintain it in a library for use in the kind of
application that needs it?
Some specific review inline below:
> [...]
> diff --git a/sysdeps/unix/sysv/linux/numa_spinlock_alloc.c b/sysdeps/unix/sysv/linux/numa_spinlock_alloc.c
> new file mode 100644
> index 0000000..8ff4e1a
> --- /dev/null
> +++ b/sysdeps/unix/sysv/linux/numa_spinlock_alloc.c
> @@ -0,0 +1,304 @@
> +/* Initialization of NUMA spinlock.
> + Copyright (C) 2018 Free Software Foundation, Inc.
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library; if not, see
> + <http://www.gnu.org/licenses/>. */
> +
> +#include <assert.h>
> +#include <ctype.h>
> +#include <string.h>
> +#include <dirent.h>
> +#include <stdio.h>
> +#include <limits.h>
> +#ifdef _LIBC
> +# include <not-cancel.h>
> +#else
> +# include <stdlib.h>
> +# include <unistd.h>
> +# include <fcntl.h>
> +# define __open_nocancel open
> +# define __close_nocancel_nostatus close
> +# define __read_nocancel read
> +#endif
> +
> +#include "numa-spinlock-private.h"
> +
> +static char *
> +next_line (int fd, char *const buffer, char **cp, char **re,
> + char *const buffer_end)
> +{
> + char *res = *cp;
> + char *nl = memchr (*cp, '\n', *re - *cp);
> + if (nl == NULL)
> + {
> + if (*cp != buffer)
> + {
> + if (*re == buffer_end)
> + {
> + memmove (buffer, *cp, *re - *cp);
> + *re = buffer + (*re - *cp);
> + *cp = buffer;
> +
> + ssize_t n = __read_nocancel (fd, *re, buffer_end - *re);
> + if (n < 0)
> + return NULL;
> +
> + *re += n;
> +
> + nl = memchr (*cp, '\n', *re - *cp);
> + while (nl == NULL && *re == buffer_end)
> + {
> + /* Truncate too long lines. */
> + *re = buffer + 3 * (buffer_end - buffer) / 4;
> + n = __read_nocancel (fd, *re, buffer_end - *re);
> + if (n < 0)
> + return NULL;
> +
> + nl = memchr (*re, '\n', n);
> + **re = '\n';
> + *re += n;
> + }
> + }
> + else
> + nl = memchr (*cp, '\n', *re - *cp);
> +
> + res = *cp;
> + }
> +
> + if (nl == NULL)
> + nl = *re - 1;
> + }
> +
> + *cp = nl + 1;
> + assert (*cp <= *re);
> +
> + return res == *re ? NULL : res;
> +}
This looks like fragile duplication of stdio-like buffering logic
that's not at all specific to this file. Does glibc have a policy on
whether things needing this should use stdio or some other shared code
rather than open-coding it like this?
> [...]
> +/* Allocate a NUMA spinlock and return a pointer to it. Caller should
> + call numa_spinlock_free on the NUMA spinlock pointer to free the
> + memory when it is no longer needed. */
> +
> +struct numa_spinlock *
> +numa_spinlock_alloc (void)
> +{
> + const size_t buffer_size = 1024;
> + char buffer[buffer_size];
> + char *buffer_end = buffer + buffer_size;
> + char *cp = buffer_end;
> + char *re = buffer_end;
> +
> + const int flags = O_RDONLY | O_CLOEXEC;
> + int fd = __open_nocancel ("/sys/devices/system/node/online", flags);
> + char *l;
> + unsigned int max_node = 0;
> + unsigned int node_count = 0;
> + if (fd != -1)
> + {
> + l = next_line (fd, buffer, &cp, &re, buffer_end);
> + if (l != NULL)
> + do
> + {
> + char *endp;
> + unsigned long int n = strtoul (l, &endp, 10);
> + if (l == endp)
> + {
> + node_count = 1;
> + break;
> + }
> +
> + unsigned long int m = n;
> + if (*endp == '-')
> + {
> + l = endp + 1;
> + m = strtoul (l, &endp, 10);
> + if (l == endp)
> + {
> + node_count = 1;
> + break;
> + }
> + }
> +
> + node_count += m - n + 1;
> +
> + if (m >= max_node)
> + max_node = m;
> +
> + l = endp;
> + while (l < re && isspace (*l))
> + ++l;
> + }
> + while (l < re);
> +
> + __close_nocancel_nostatus (fd);
> + }
> +
> +  /* NB: Some NUMA nodes may not be available; if the number of NUMA
> +     nodes is 1, set the maximum NUMA node number to 0.  */
> + if (node_count == 1)
> + max_node = 0;
> +
> + unsigned int max_cpu = 0;
> + unsigned int *physical_package_id_p = NULL;
> +
> + if (max_node == 0)
> + {
> + /* There is at least 1 node. */
> + node_count = 1;
> +
> + /* If NUMA is disabled, use physical_package_id instead. */
> + struct dirent **cpu_list;
> + int nprocs = scandir ("/sys/devices/system/cpu", &cpu_list,
> + select_cpu, NULL);
> + if (nprocs > 0)
> + {
> + int i;
> + unsigned int *cpu_id_p = NULL;
> +
> + /* Find the maximum CPU number. */
> + if (posix_memalign ((void **) &cpu_id_p,
> + __alignof__ (void *),
> + nprocs * sizeof (unsigned int)) == 0)
Using posix_memalign to get memory with the alignment of
__alignof__(void*) makes no sense. All allocations via malloc are
suitably aligned for any standard type.
> + {
> + for (i = 0; i < nprocs; i++)
> + {
> + unsigned int cpu_id
> + = strtoul (cpu_list[i]->d_name + 3, NULL, 10);
> + cpu_id_p[i] = cpu_id;
> + if (cpu_id > max_cpu)
> + max_cpu = cpu_id;
> + }
> +
> + if (posix_memalign ((void **) &physical_package_id_p,
> + __alignof__ (void *),
> + ((max_cpu + 1)
> + * sizeof (unsigned int))) == 0)
Again.
> + {
> + memset (physical_package_id_p, 0,
> + ((max_cpu + 1) * sizeof (unsigned int)));
> +
> + max_node = UINT_MAX;
> +
> + /* Get physical_package_id. */
> + char path[(sizeof ("/sys/devices/system/cpu")
> + + 3 * sizeof (unsigned long int)
> + + sizeof ("/topology/physical_package_id"))];
> + for (i = 0; i < nprocs; i++)
> + {
> + struct dirent *d = cpu_list[i];
> + if (snprintf (path, sizeof (path),
> + "/sys/devices/system/cpu/%s/topology/physical_package_id",
> + d->d_name) > 0)
Are these sysfs pathnames documented as stable/permanent by the
kernel?
> + {
> + fd = __open_nocancel (path, flags);
> + if (fd != -1)
> + {
> + if (__read_nocancel (fd, buffer,
> + buffer_size) > 0)
> + {
> + char *endp;
> + unsigned long int package_id
> + = strtoul (buffer, &endp, 10);
> + if (package_id != ULONG_MAX
> + && *buffer != '\0'
> + && (*endp == '\0' || *endp == '\n'))
> + {
> + physical_package_id_p[cpu_id_p[i]]
> + = package_id;
> + if (max_node == UINT_MAX)
> + {
> + /* This is the first node. */
> + max_node = package_id;
> + }
> + else if (package_id != max_node)
> + {
> + /* NB: We only need to know if
> + NODE_COUNT > 1. */
> + node_count = 2;
> + if (package_id > max_node)
> + max_node = package_id;
> + }
> + }
> + }
> + __close_nocancel_nostatus (fd);
> + }
> + }
> +
> + free (d);
> + }
> + }
> +
> + free (cpu_id_p);
> + }
> + else
> + {
> + for (i = 0; i < nprocs; i++)
> + free (cpu_list[i]);
> + }
> +
> + free (cpu_list);
> + }
> + }
> +
> + if (physical_package_id_p != NULL && node_count == 1)
> + {
> + /* There is only one node. No need for physical_package_id_p. */
> + free (physical_package_id_p);
> + physical_package_id_p = NULL;
> + max_cpu = 0;
> + }
> +
> + /* Allocate an array of struct numa_spinlock_info pointers to hold info
> + for all NUMA nodes with NUMA node number from getcpu () as index. */
> + size_t size = (sizeof (struct numa_spinlock)
> + + ((max_node + 1)
> + * sizeof (struct numa_spinlock_info *)));
> + struct numa_spinlock *lock;
> + if (posix_memalign ((void **) &lock,
> + __alignof__ (struct numa_spinlock_info *), size))
Another gratuitous posix_memalign.
More information about the Libc-alpha
mailing list