[PATCH] nptl: Disable THP on thread stack if it incurs in large RSS usage

Adhemerval Zanella Netto adhemerval.zanella@linaro.org
Mon May 15 17:57:11 GMT 2023



On 03/05/23 09:42, Wilco Dijkstra wrote:
> Hi Adhemerval,
> 
>> +static __always_inline int
>> +advise_thp (void *mem, size_t size, char *guard)
>> +{
>> +  enum malloc_thp_mode_t thpmode = __malloc_thp_mode ();
>> +  if (thpmode != malloc_thp_mode_always)
>> +    return 0;
>> +
>> +  unsigned long int thpsize = __malloc_default_thp_pagesize ();
>> +  if (PTR_ALIGN_DOWN (mem, thpsize) != PTR_ALIGN_DOWN (guard, thpsize))
>> +    return 0;
>> +
>> +  return __madvise (mem, size, MADV_NOHUGEPAGE);
>> +}
> 
> This still doesn't make sense since if _STACK_GROWS_DOWN, mem == guard, so
> this will always execute the madvise. 

Yes, if THP is set to always this is exactly the idea of this patch since
afaiu the kernel might still back up the stack with large pages if the 
request a size is smaller than the default THP.  It is only an issue if
the guard page address is not aligned to THP default size, which will
potentially trigger issues Cupertino has brought (since we do not prior
hand which is the mapping flags used on page used to fulfill the allocation). 

> As I mentioned, I couldn't find evidence that
> the claimed scenario of a huge page allocated, written to and then split due to the
> mprotect exists.

I adapted Cupertino original test to allow specify both the thread stack
and guard size by command line.  Just:

$ gcc -std=gnu11 -O2 -g -I. -D_LARGEFILE64_SOURCE=1 -D_GNU_SOURCE   -c -o tststackalloc.o tststackalloc.c
$ echo "always" | sudo tee /sys/kernel/mm/transparent_hugepage/enabled
$ ./tststackalloc -m
[...]
[statm] RSS: 65964 pages (270188544 bytes = 257 MB)
[smaps] RSS: 270327808 bytes = 257 MB
[...]

With either the new tunable or this patch:

$ ld-linux-x86-64.so.2 --library-path . ./tststackalloc  -m
[...]
[statm] RSS: 531 pages (2174976 bytes = 2 MB)
[smaps] RSS: 3002368 bytes = 2 MB
[...]

> 
> So the real issue is that the current stack allocation code randomly (based on
> alignment from previous mmap calls) uses huge pages even for small stacks.

Keep in mind this heuristic is only enabled if THP is set to 'always', meaning
the kernel will try to back *all* the stack with large pages.  The issue is
when the *guard* page is within a large page.
-------------- next part --------------
// Compile & run:
//    gcc -Wall -g -o tststackalloc tststackalloc.c $< -lpthread
//    ./tststackalloc 1     # Attempt to use huge pages for stacks -> RSS bloat
//    ./tststackalloc 0     # Do not attempt to use huge pages -> No RSS bloat

#include <pthread.h>
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <unistd.h>
#include <limits.h>
#include <inttypes.h>
#include <sys/mman.h>
#include <fcntl.h>
#include <assert.h>
#include <ctype.h>

// Number of threads to create
#define NOOF_THREADS (128)

// Size of a small page (hard-coded)
#define SMALL_PAGE_SIZE (4*1024)
static size_t small_page_size = 0;

// Size of a huge page (hard-coded)
#define HUGE_PAGE_SIZE (2*1024*1024)
static size_t huge_page_size = 0;

// Total size of the thread stack, including the guard page(s)
#define STACK_SIZE_TOTAL (HUGE_PAGE_SIZE)

// Size of the guard page(s)
#define GUARD_SIZE (SMALL_PAGE_SIZE)

// When enabled (set to non-zero), tries to align thread stacks on
// huge page boundaries, making them eligible for huge pages
static int huge_page_align_stacks;

static volatile int exit_thread = 0;

unsigned long int
default_thp_pagesize (void)
{
  int fd = open (
    "/sys/kernel/mm/transparent_hugepage/hpage_pmd_size", O_RDONLY);
  if (fd == -1)
    return -1;

  char str[64];
  ssize_t s = read (fd, str, sizeof (str));
  close (fd);
  if (s < 0)
    return -1;

  unsigned long int r = 0;
  for (ssize_t i = 0; i < s; i++)
    {
      if (str[i] == '\n')
        break;
      r *= 10;
      r += str[i] - '0';
    }
  return r;
}

static void *
start (void *arg)
{
  while (!exit_thread)
    sleep (1);
  return NULL;
}

static char *
next_line (int fd, char *const buffer, char **cp, char **re,
           char *const buffer_end)
{
  char *res = *cp;
  char *nl = memchr (*cp, '\n', *re - *cp);
  if (nl == NULL)
    {
      if (*cp != buffer)
        {
          if (*re == buffer_end)
            {
              memmove (buffer, *cp, *re - *cp);
              *re = buffer + (*re - *cp);
              *cp = buffer;

              ssize_t n = read (fd, *re, buffer_end - *re);
              if (n < 0)
                return NULL;

              *re += n;

              nl = memchr (*cp, '\n', *re - *cp);
              while (nl == NULL && *re == buffer_end)
                {
                  /* Truncate too long lines.  */
                  *re = buffer + 3 * (buffer_end - buffer) / 4;
                  n = read (fd, *re, buffer_end - *re);
                  if (n < 0)
                    return NULL;

                  nl = memchr (*re, '\n', n);
                  **re = '\n';
                  *re += n;
                }
            }
          else
            nl = memchr (*cp, '\n', *re - *cp);

          res = *cp;
        }

      if (nl == NULL)
        nl = *re - 1;
    }

  *cp = nl + 1;
  assert (*cp <= *re);

  return res == *re ? NULL : res;
}

static void
read_proc_file (const char *fname, void (*closure)(const char *, size_t *),
						   size_t *arg)
{
  int fd = open (fname, O_RDONLY | O_CLOEXEC);
  assert (fd != -1);

  enum { buffer_size = 1024 };
  char buffer[buffer_size];
  char *buffer_end = buffer + buffer_size;
  char *cp = buffer_end;
  char *re = buffer_end;

  char *l;
  while ((l = next_line (fd, buffer, &cp, &re, buffer_end)) != NULL)
    closure (l, arg);

  close (fd);
}

static void
parse_statm_line (const char *line, size_t *ret)
{
  long int rss;
  assert (sscanf (line, "%*d %ld", &rss) == 1);
  *ret = rss;
}

static void
parse_statm (void)
{
  size_t rss = 0;
  read_proc_file ("/proc/self/statm", parse_statm_line, &rss);

  fprintf (stderr, "[statm] RSS: %zd pages (%zd bytes = %zd MB)\n", rss,
	   rss * small_page_size, rss * small_page_size / 1024 / 1024);
}

static void
parse_smaps_line (const char *line, size_t *total)
{
  static const char field[] = "Rss:";
  size_t fieldlen = strlen (field);
  if (strncmp (line, field, fieldlen) != 0)
    return;

  // skip spaces
  for (line += fieldlen; isspace (*line); line++);

  enum { numberlen = 32 };
  char number[numberlen];
  size_t i;
  for (i = 0; isdigit (line[i]) && i < numberlen - 1; i++)
    number[i] = line[i];
  number[i] = '\0';

  // Assume kB.
  long int value = strtol (number, NULL, 10);
  assert (value != LONG_MIN && value != LONG_MAX && errno != ERANGE);

  *total += value * 1024;
}

static void
parse_smaps (void)
{
  size_t rss = 0;
  read_proc_file ("/proc/self/smaps", parse_smaps_line, &rss);

  fprintf (stderr, "[smaps] RSS: %zd bytes = %zd MB\n", rss,
	   rss / (1024 * 1024));
}


static inline uintptr_t
align_down (uintptr_t value, uintptr_t alignment)
{
  return value & ~(alignment - 1);
}

// Do a series of small, single page mmap calls to attempt to set
// everything up so that the next mmap call (glibc allocating the
// stack) returns a 2MB aligned range. The kernel "expands" vmas from
// higher to lower addresses (subsequent calls return ranges starting
// at lower addresses), so this function keeps calling mmap until it a
// huge page aligned address is returned. The next range (the stack)
// will then end on that same address.
static void
align_next_on (uintptr_t alignment)
{
  uintptr_t p;
  do
    {
      p =
	(uintptr_t) mmap (NULL, small_page_size, PROT_NONE,
			  MAP_ANONYMOUS | MAP_PRIVATE | MAP_NORESERVE, -1, 0);
    }
  while (p != align_down (p, huge_page_size));
}

static size_t
parse_size_t (const char *value)
{
  char *endptr;
  errno = 0;
  size_t n = strtoull (value, &endptr, 10);
  if (errno == ERANGE)
    {
      fprintf (stderr, "error: invalid %s value\n", value);
      exit (EXIT_FAILURE);
    }
  return n;
}

int
main (int argc, char *argv[])
{
  int opt;

  size_t guard_size = GUARD_SIZE;
  size_t stack_size = STACK_SIZE_TOTAL;

  while ((opt = getopt (argc, argv, "g:s:m")) != -1)
    {
      switch (opt)
	{
	case 'g':
	  guard_size = parse_size_t (optarg);
	  break;
	case 's':
	  stack_size = parse_size_t (optarg);
	  break;
	case 'm':
	  huge_page_align_stacks = 1;
	  break;
	default:
	  fprintf (stderr, "Usage: %s [-s stacksize] [-g guardsize] [-m]\n",
		   argv[0]);
	  exit (EXIT_FAILURE);
	}
    }

  huge_page_size = default_thp_pagesize ();
  if (huge_page_size == 0)
    huge_page_size = HUGE_PAGE_SIZE;

  {
    long int sz = sysconf (_SC_PAGESIZE);
    if (sz == -1)
      small_page_size = SMALL_PAGE_SIZE;
    else
      small_page_size = sz;
  }

  pthread_t t[NOOF_THREADS];
  pthread_attr_t attr;
  int i;

  void *dummy = malloc (1024);
  free (dummy);

  fprintf (stderr, "Page size: %zd kB, %zd MB huge pages\n",
	   small_page_size / 1024, huge_page_size / (1024 * 1024));
  fprintf (stderr, "Stack size: %zd kB, guard size: %zd kB\n",
	   stack_size / 1024, guard_size / 1024);
  if (huge_page_align_stacks)
    {
      fprintf (stderr,
	       "Will attempt to align allocations to make stacks eligible for huge pages\n");
    }
  pid_t pid = getpid ();
  fprintf (stderr, "pid: %d (/proc/%d/smaps)\n", pid, pid);

  pthread_attr_init (&attr);
  pthread_attr_setstacksize (&attr, stack_size);
  pthread_attr_setguardsize (&attr, guard_size);

  fprintf (stderr, "Creating %d threads...\n", NOOF_THREADS);
  for (i = 0; i < NOOF_THREADS; i++)
    {
      if (huge_page_align_stacks)
	{
	  // align (next) allocation on huge page boundary
	  align_next_on (huge_page_size);
	}
      pthread_create (&t[i], &attr, start, NULL);
    }
  sleep (1);

  parse_statm ();
  parse_smaps ();

  fprintf (stderr, "Press enter to exit...\n");
  getchar ();

  exit_thread = 1;
  for (i = 0; i < NOOF_THREADS; i++)
    pthread_join (t[i], NULL);
  return 0;
}


More information about the Libc-alpha mailing list