nptl/allocatestack.c

   1 /* Copyright (C) 2002-2020 Free Software Foundation, Inc.
   2    This file is part of the GNU C Library.
   3    Contributed by Ulrich Drepper <drepper@redhat.com>, 2002.
   4
   5    The GNU C Library is free software; you can redistribute it and/or
   6    modify it under the terms of the GNU Lesser General Public
   7    License as published by the Free Software Foundation; either
   8    version 2.1 of the License, or (at your option) any later version.
   9
  10    The GNU C Library is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13    Lesser General Public License for more details.
  14
  15    You should have received a copy of the GNU Lesser General Public
  16    License along with the GNU C Library; if not, see
  17    <https://www.gnu.org/licenses/>.  */
  18
  19 #include <assert.h>
  20 #include <errno.h>
  21 #include <signal.h>
  22 #include <stdint.h>
  23 #include <string.h>
  24 #include <unistd.h>
  25 #include <sys/mman.h>
  26 #include <sys/param.h>
  27 #include <dl-sysdep.h>
  28 #include <dl-tls.h>
  29 #include <tls.h>
  30 #include <list.h>
  31 #include <lowlevellock.h>
  32 #include <futex-internal.h>
  33 #include <kernel-features.h>
  34 #include <stack-aliasing.h>
  35
  36
  37 #ifndef NEED_SEPARATE_REGISTER_STACK
  38
  39 /* Most architectures have exactly one stack pointer.  Some have more.
        STACK_VARIABLES declares the local variable(s) that describe the
        stack of the thread being created; here that is just its address.  */
  40 # define STACK_VARIABLES void *stackaddr = NULL
  41
  42 /* How to pass the values to the 'create_thread' function.  */
  43 # define STACK_VARIABLES_ARGS stackaddr
  44
  45 /* How to declare a function which receives these parameters.  */
  46 # define STACK_VARIABLES_PARMS void *stackaddr
  47
  48 /* How to declare the trailing output parameter(s) of allocate_stack.  */
  49 # define ALLOCATE_STACK_PARMS void **stack
  50
  51 /* This is how the function is called.  We do it this way to allow
  52    other variants of the function to have more parameters.  */
  53 # define ALLOCATE_STACK(attr, pd) allocate_stack (attr, pd, &stackaddr)
  54
  55 #else
  56
  57 /* We need two stacks.  The kernel will place them but we have to tell
  58    the kernel about the size of the reserved address space.  Hence the
        stack is described by an address and a size.  */
  59 # define STACK_VARIABLES void *stackaddr = NULL; size_t stacksize = 0
  60
  61 /* How to pass the values to the 'create_thread' function.  */
  62 # define STACK_VARIABLES_ARGS stackaddr, stacksize
  63
  64 /* How to declare a function which receives these parameters.  */
  65 # define STACK_VARIABLES_PARMS void *stackaddr, size_t stacksize
  66
  67 /* How to declare the trailing output parameter(s) of allocate_stack.  */
  68 # define ALLOCATE_STACK_PARMS void **stack, size_t *stacksize
  69
  70 /* This is how the function is called.  We do it this way to allow
  71    other variants of the function to have more parameters.  */
  72 # define ALLOCATE_STACK(attr, pd) \
  73   allocate_stack (attr, pd, &stackaddr, &stacksize)
  74
  75 #endif
  76
  77
  78 /* Default alignment of stack.  Used unless STACK_ALIGN has already been
        defined; allocate_stack asserts that TCB_ALIGNMENT is at least
        this large.  */
  79 #ifndef STACK_ALIGN
  80 # define STACK_ALIGN __alignof__ (long double)
  81 #endif
  82
  83 /* Default value for minimal stack size after allocating thread
  84    descriptor and guard.  allocate_stack returns EINVAL for a stack
        that cannot hold the static TLS block plus this amount.  */
  85 #ifndef MINIMAL_REST_STACK
  86 # define MINIMAL_REST_STACK     4096
  87 #endif
  88
  89
  90 /* Newer kernels have the MAP_STACK flag to indicate a mapping is used for
  91    a stack.  Use it when possible.  Where it is not defined it becomes 0
        so that it can still be OR-ed into the mmap flags without effect.  */
  92 #ifndef MAP_STACK
  93 # define MAP_STACK 0
  94 #endif
  95
  96 /* This yields the pointer that TLS support code calls the thread pointer.
        With TLS_TCB_AT_TP it is the thread descriptor PD itself; with
        TLS_DTV_AT_TP it lies TLS_PRE_TCB_SIZE bytes above PD.  */
  97 #if TLS_TCB_AT_TP
  98 # define TLS_TPADJ(pd) (pd)
  99 #elif TLS_DTV_AT_TP
 100 # define TLS_TPADJ(pd) ((struct pthread *)((char *) (pd) + TLS_PRE_TCB_SIZE))
 101 #endif
 102
 103 /* Cache handling for not-yet free stacks.  */
 104
 105 /* Maximum size of the cache in bytes.  queue_stack asks free_stacks to
        shrink the cache once stack_cache_actsize exceeds this value.  */
 106 static size_t stack_cache_maxsize = 40 * 1024 * 1024; /* 40 MiB by default.  */
     /* Current size of the cache: the sum of the stackblock_size values of
        the blocks on the stack_cache list.  */
 107 static size_t stack_cache_actsize;
 108
 109 /* Lock taken by the functions in this file around changes to the stack
        lists below and to stack_cache_actsize.  */
 110 static int stack_cache_lock = LLL_LOCK_INITIALIZER;
 111
 112 /* List of queued stack frames, i.e. blocks kept for reuse.  */
 113 static LIST_HEAD (stack_cache);
 114
 115 /* List of the stacks in use.  */
 116 static LIST_HEAD (stack_used);
 117
 118 /* We need to record what list operations we are going to do so that,
 119    in case of an asynchronous interruption due to a fork() call, we
 120    can correct for the work.  The value is zero when no operation is in
        progress.  Otherwise it is the address of the list element being
        handled, with the lowest bit set for an insertion (stack_list_add)
        and clear for a removal (stack_list_del).  */
 121 static uintptr_t in_flight_stack;
 122
 123 /* List of the threads with user provided stacks in use.  No need to
 124    initialize this, since it's done in __pthread_initialize_minimal.  */
 125 list_t __stack_user __attribute__ ((nocommon));
 126 hidden_data_def (__stack_user)
 127
 128
 129 /* Check whether the stack is still used or not.  Evaluates to nonzero
        when the 'tid' field of thread descriptor DESCR is zero or negative,
        which marks the stack as no longer in use.  */
 130 #define FREE_P(descr) ((descr)->tid <= 0)
 131
 132
     /* Unlink ELEM from the stack list it is on.  The address of ELEM is
        stored in in_flight_stack (lowest bit clear, meaning "removal")
        before the list is modified and reset to zero afterwards, so that
        __reclaim_stacks can complete an operation that was interrupted by
        fork.  */
 133 static void
 134 stack_list_del (list_t *elem)
 135 {
 136   in_flight_stack = (uintptr_t) elem;
 137
       /* The marker must be written before the list pointers change.  */
 138   atomic_write_barrier ();
 139
 140   list_del (elem);
 141
       /* The list update must be written before the marker is cleared.  */
 142   atomic_write_barrier ();
 143
 144   in_flight_stack = 0;
 145 }
 146
 147
     /* Insert ELEM into LIST with list_add.  The address of ELEM with its
        lowest bit set (meaning "insertion") is stored in in_flight_stack
        before the list is modified and reset to zero afterwards, so that
        __reclaim_stacks can complete an operation that was interrupted by
        fork.  */
 148 static void
 149 stack_list_add (list_t *elem, list_t *list)
 150 {
 151   in_flight_stack = (uintptr_t) elem | 1;
 152
       /* The marker must be written before the list pointers change.  */
 153   atomic_write_barrier ();
 154
 155   list_add (elem, list);
 156
       /* The list update must be written before the marker is cleared.  */
 157   atomic_write_barrier ();
 158
 159   in_flight_stack = 0;
 160 }
 161
 162
 163 /* We create a doubly linked list of all cache entries.  It is doubly
 164    linked because this allows removing entries from the end.  */
 165
 166
 167 /* Get a stack frame from the cache.  We have to match by size since
 168    some blocks might be too small or far too large.  */
 169 static struct pthread *
 170 get_cached_stack (size_t *sizep, void **memp)
 171 {
 172   size_t size = *sizep;
 173   struct pthread *result = NULL;
 174   list_t *entry;
 175
 176   lll_lock (stack_cache_lock, LLL_PRIVATE);
 177
 178   /* Search the cache for a matching entry.  We search for the
 179      smallest stack which has at least the required size.  Note that
 180      in normal situations the size of all allocated stacks is the
 181      same.  As the very least there are only a few different sizes.
 182      Therefore this loop will exit early most of the time with an
 183      exact match.  */
 184   list_for_each (entry, &stack_cache)
 185     {
 186       struct pthread *curr;
 187
 188       curr = list_entry (entry, struct pthread, list);
 189       if (FREE_P (curr) && curr->stackblock_size >= size)
 190         {
 191           if (curr->stackblock_size == size)
 192             {
 193               result = curr;
 194               break;
 195             }
 196
 197           if (result == NULL
 198               || result->stackblock_size > curr->stackblock_size)
 199             result = curr;
 200         }
 201     }
 202
 203   if (__builtin_expect (result == NULL, 0)
 204       /* Make sure the size difference is not too excessive.  In that
 205          case we do not use the block.  */
 206       || __builtin_expect (result->stackblock_size > 4 * size, 0))
 207     {
 208       /* Release the lock.  */
 209       lll_unlock (stack_cache_lock, LLL_PRIVATE);
 210
 211       return NULL;
 212     }
 213
 214   /* Don't allow setxid until cloned.  */
 215   result->setxid_futex = -1;
 216
 217   /* Dequeue the entry.  */
 218   stack_list_del (&result->list);
 219
 220   /* And add to the list of stacks in use.  */
 221   stack_list_add (&result->list, &stack_used);
 222
 223   /* And decrease the cache size.  */
 224   stack_cache_actsize -= result->stackblock_size;
 225
 226   /* Release the lock early.  */
 227   lll_unlock (stack_cache_lock, LLL_PRIVATE);
 228
 229   /* Report size and location of the stack to the caller.  */
 230   *sizep = result->stackblock_size;
 231   *memp = result->stackblock;
 232
 233   /* Cancellation handling is back to the default.  */
 234   result->cancelhandling = 0;
 235   result->cleanup = NULL;
 236
 237   /* No pending event.  */
 238   result->nextevent = NULL;
 239
 240   result->tls_state = (struct tls_internal_t) { 0 };
 241
 242   /* Clear the DTV.  */
 243   dtv_t *dtv = GET_DTV (TLS_TPADJ (result));
 244   for (size_t cnt = 0; cnt < dtv[-1].counter; ++cnt)
 245     free (dtv[1 + cnt].pointer.to_free);
 246   memset (dtv, '\0', (dtv[-1].counter + 1) * sizeof (dtv_t));
 247
 248   /* Re-initialize the TLS.  */
 249   _dl_allocate_tls_init (TLS_TPADJ (result));
 250
 251   return result;
 252 }
 253
 254
 255 /* Free stacks until cache size is lower than LIMIT.  */
 256 static void
 257 free_stacks (size_t limit)
 258 {
 259   /* We reduce the size of the cache.  Remove the last entries until
 260      the size is below the limit.  */
 261   list_t *entry;
 262   list_t *prev;
 263
 264   /* Search from the end of the list.  */
 265   list_for_each_prev_safe (entry, prev, &stack_cache)
 266     {
 267       struct pthread *curr;
 268
 269       curr = list_entry (entry, struct pthread, list);
 270       if (FREE_P (curr))
 271         {
 272           /* Unlink the block.  */
 273           stack_list_del (entry);
 274
 275           /* Account for the freed memory.  */
 276           stack_cache_actsize -= curr->stackblock_size;
 277
 278           /* Free the memory associated with the ELF TLS.  */
 279           _dl_deallocate_tls (TLS_TPADJ (curr), false);
 280
 281           /* Remove this block.  This should never fail.  If it does
 282              something is really wrong.  */
 283           if (__munmap (curr->stackblock, curr->stackblock_size) != 0)
 284             abort ();
 285
 286           /* Maybe we have freed enough.  */
 287           if (stack_cache_actsize <= limit)
 288             break;
 289         }
 290     }
 291 }
 292
 293 /* Free all the stacks on cleanup.  This asks free_stacks to shrink the
        stack cache down to a limit of zero bytes.  The cache lock is not
        taken here.  */
 294 void
 295 __nptl_stacks_freeres (void)
 296 {
 297   free_stacks (0);
 298 }
 299
 300 /* Add a stack frame which is not used anymore to the stack cache.  Must
 301    be called with the cache lock held.  STACK is the thread descriptor
        of the stack being queued.  */
 302 static inline void
 303 __attribute ((always_inline))
 304 queue_stack (struct pthread *stack)
 305 {
 306   /* We unconditionally add the stack to the list.  The memory may
 307      still be in use but it will not be reused until the kernel marks
 308      the stack as not used anymore.  */
 309   stack_list_add (&stack->list, &stack_cache);
 310
       /* Account for the new entry and trim the cache if it grew beyond
          the configured maximum.  */
 311   stack_cache_actsize += stack->stackblock_size;
 312   if (__glibc_unlikely (stack_cache_actsize > stack_cache_maxsize))
 313     free_stacks (stack_cache_maxsize);
 314 }
 315
 316
 317 static int
 318 change_stack_perm (struct pthread *pd
 319 #ifdef NEED_SEPARATE_REGISTER_STACK
 320                    , size_t pagemask
 321 #endif
 322                    )
 323 {
 324 #ifdef NEED_SEPARATE_REGISTER_STACK
 325   void *stack = (pd->stackblock
 326                  + (((((pd->stackblock_size - pd->guardsize) / 2)
 327                       & pagemask) + pd->guardsize) & pagemask));
 328   size_t len = pd->stackblock + pd->stackblock_size - stack;
 329 #elif _STACK_GROWS_DOWN
 330   void *stack = pd->stackblock + pd->guardsize;
 331   size_t len = pd->stackblock_size - pd->guardsize;
 332 #elif _STACK_GROWS_UP
 333   void *stack = pd->stackblock;
 334   size_t len = (uintptr_t) pd - pd->guardsize - (uintptr_t) pd->stackblock;
 335 #else
 336 # error "Define either _STACK_GROWS_DOWN or _STACK_GROWS_UP"
 337 #endif
 338   if (__mprotect (stack, len, PROT_READ | PROT_WRITE | PROT_EXEC) != 0)
 339     return errno;
 340
 341   return 0;
 342 }
 343
 344 /* Return the guard page position on allocated stack.  */
 345 static inline char *
 346 __attribute ((always_inline))
 347 guard_position (void *mem, size_t size, size_t guardsize, struct pthread *pd,
 348                 size_t pagesize_m1)
 349 {
 350 #ifdef NEED_SEPARATE_REGISTER_STACK
 351   return mem + (((size - guardsize) / 2) & ~pagesize_m1);
 352 #elif _STACK_GROWS_DOWN
 353   return mem;
 354 #elif _STACK_GROWS_UP
 355   return (char *) (((uintptr_t) pd - guardsize) & ~pagesize_m1);
 356 #endif
 357 }
 358
 359 /* Based on stack allocated with PROT_NONE, setup the required portions with
 360    'prot' flags based on the guard page position.  */
 361 static inline int
 362 setup_stack_prot (char *mem, size_t size, char *guard, size_t guardsize,
 363                   const int prot)
 364 {
 365   char *guardend = guard + guardsize;
 366 #if _STACK_GROWS_DOWN && !defined(NEED_SEPARATE_REGISTER_STACK)
 367   /* As defined at guard_position, for architectures with downward stack
 368      the guard page is always at start of the allocated area.  */
 369   if (__mprotect (guardend, size - guardsize, prot) != 0)
 370     return errno;
 371 #else
 372   size_t mprots1 = (uintptr_t) guard - (uintptr_t) mem;
 373   if (__mprotect (mem, mprots1, prot) != 0)
 374     return errno;
 375   size_t mprots2 = ((uintptr_t) mem + size) - (uintptr_t) guardend;
 376   if (__mprotect (guardend, mprots2, prot) != 0)
 377     return errno;
 378 #endif
 379   return 0;
 380 }
 381
 382 /* Mark the memory of the stack as usable to the kernel.  It frees everything
 383    except for the space used for the TCB itself.  */
 384 static __always_inline void
 385 advise_stack_range (void *mem, size_t size, uintptr_t pd, size_t guardsize)
 386 {
 387   uintptr_t sp = (uintptr_t) CURRENT_STACK_FRAME;
 388   size_t pagesize_m1 = __getpagesize () - 1;
 389 #if _STACK_GROWS_DOWN && !defined(NEED_SEPARATE_REGISTER_STACK)
 390   size_t freesize = (sp - (uintptr_t) mem) & ~pagesize_m1;
 391   assert (freesize < size);
 392   if (freesize > PTHREAD_STACK_MIN)
 393     __madvise (mem, freesize - PTHREAD_STACK_MIN, MADV_DONTNEED);
 394 #else
 395   /* Page aligned start of memory to free (higher than or equal
 396      to current sp plus the minimum stack size).  */
 397   uintptr_t freeblock = (sp + PTHREAD_STACK_MIN + pagesize_m1) & ~pagesize_m1;
 398   uintptr_t free_end = (pd - guardsize) & ~pagesize_m1;
 399   if (free_end > freeblock)
 400     {
 401       size_t freesize = free_end - freeblock;
 402       assert (freesize < size);
 403       __madvise ((void*) freeblock, freesize, MADV_DONTNEED);
 404     }
 405 #endif
 406 }
 407
 408 /* Returns a usable stack for a new thread either by allocating a
 409    new stack or reusing a cached stack of sufficient size.
 410    ATTR must be non-NULL and point to a valid pthread_attr.
 411    PDP must be non-NULL.  */
 412 static int
 413 allocate_stack (const struct pthread_attr *attr, struct pthread **pdp,
 414                 ALLOCATE_STACK_PARMS)
 415 {
 416   struct pthread *pd;
 417   size_t size;
 418   size_t pagesize_m1 = __getpagesize () - 1;
 419
 420   assert (powerof2 (pagesize_m1 + 1));
 421   assert (TCB_ALIGNMENT >= STACK_ALIGN);
 422
 423   /* Get the stack size from the attribute if it is set.  Otherwise we
 424      use the default we determined at start time.  */
 425   if (attr->stacksize != 0)
 426     size = attr->stacksize;
 427   else
 428     {
 429       lll_lock (__default_pthread_attr_lock, LLL_PRIVATE);
 430       size = __default_pthread_attr.internal.stacksize;
 431       lll_unlock (__default_pthread_attr_lock, LLL_PRIVATE);
 432     }
 433
 434   /* Get memory for the stack.  */
 435   if (__glibc_unlikely (attr->flags & ATTR_FLAG_STACKADDR))
 436     {
 437       uintptr_t adj;
 438       char *stackaddr = (char *) attr->stackaddr;
 439
 440       /* Assume the same layout as the _STACK_GROWS_DOWN case, with struct
 441          pthread at the top of the stack block.  Later we adjust the guard
 442          location and stack address to match the _STACK_GROWS_UP case.  */
 443       if (_STACK_GROWS_UP)
 444         stackaddr += attr->stacksize;
 445
 446       /* If the user also specified the size of the stack make sure it
 447          is large enough.  */
 448       if (attr->stacksize != 0
 449           && attr->stacksize < (__static_tls_size + MINIMAL_REST_STACK))
 450         return EINVAL;
 451
 452       /* Adjust stack size for alignment of the TLS block.  */
 453 #if TLS_TCB_AT_TP
 454       adj = ((uintptr_t) stackaddr - TLS_TCB_SIZE)
 455             & __static_tls_align_m1;
 456       assert (size > adj + TLS_TCB_SIZE);
 457 #elif TLS_DTV_AT_TP
 458       adj = ((uintptr_t) stackaddr - __static_tls_size)
 459             & __static_tls_align_m1;
 460       assert (size > adj);
 461 #endif
 462
 463       /* The user provided some memory.  Let's hope it matches the
 464          size...  We do not allocate guard pages if the user provided
 465          the stack.  It is the user's responsibility to do this if it
 466          is wanted.  */
 467 #if TLS_TCB_AT_TP
 468       pd = (struct pthread *) ((uintptr_t) stackaddr
 469                                - TLS_TCB_SIZE - adj);
 470 #elif TLS_DTV_AT_TP
 471       pd = (struct pthread *) (((uintptr_t) stackaddr
 472                                 - __static_tls_size - adj)
 473                                - TLS_PRE_TCB_SIZE);
 474 #endif
 475
 476       /* The user provided stack memory needs to be cleared.  */
 477       memset (pd, '\0', sizeof (struct pthread));
 478
 479       /* The first TSD block is included in the TCB.  */
 480       pd->specific[0] = pd->specific_1stblock;
 481
 482       /* Remember the stack-related values.  */
 483       pd->stackblock = (char *) stackaddr - size;
 484       pd->stackblock_size = size;
 485
 486       /* This is a user-provided stack.  It will not be queued in the
 487          stack cache nor will the memory (except the TLS memory) be freed.  */
 488       pd->user_stack = true;
 489
 490       /* This is at least the second thread.  */
 491       pd->header.multiple_threads = 1;
 492 #ifndef TLS_MULTIPLE_THREADS_IN_TCB
 493       __pthread_multiple_threads = *__libc_multiple_threads_ptr = 1;
 494 #endif
 495
 496 #ifdef NEED_DL_SYSINFO
 497       SETUP_THREAD_SYSINFO (pd);
 498 #endif
 499
 500       /* Don't allow setxid until cloned.  */
 501       pd->setxid_futex = -1;
 502
 503       /* Allocate the DTV for this thread.  */
 504       if (_dl_allocate_tls (TLS_TPADJ (pd)) == NULL)
 505         {
 506           /* Something went wrong.  */
 507           assert (errno == ENOMEM);
 508           return errno;
 509         }
 510
 511
 512       /* Prepare to modify global data.  */
 513       lll_lock (stack_cache_lock, LLL_PRIVATE);
 514
 515       /* And add to the list of stacks in use.  */
 516       list_add (&pd->list, &__stack_user);
 517
 518       lll_unlock (stack_cache_lock, LLL_PRIVATE);
 519     }
 520   else
 521     {
 522       /* Allocate some anonymous memory.  If possible use the cache.  */
 523       size_t guardsize;
 524       size_t reported_guardsize;
 525       size_t reqsize;
 526       void *mem;
 527       const int prot = (PROT_READ | PROT_WRITE
 528                         | ((GL(dl_stack_flags) & PF_X) ? PROT_EXEC : 0));
 529
 530       /* Adjust the stack size for alignment.  */
 531       size &= ~__static_tls_align_m1;
 532       assert (size != 0);
 533
 534       /* Make sure the size of the stack is enough for the guard and
 535          eventually the thread descriptor.  On some targets there is
 536          a minimum guard size requirement, ARCH_MIN_GUARD_SIZE, so
 537          internally enforce it (unless the guard was disabled), but
 538          report the original guard size for backward compatibility:
 539          before POSIX 2008 the guardsize was specified to be one page
 540          by default which is observable via pthread_attr_getguardsize
 541          and pthread_getattr_np.  */
 542       guardsize = (attr->guardsize + pagesize_m1) & ~pagesize_m1;
 543       reported_guardsize = guardsize;
 544       if (guardsize > 0 && guardsize < ARCH_MIN_GUARD_SIZE)
 545         guardsize = ARCH_MIN_GUARD_SIZE;
 546       if (guardsize < attr->guardsize || size + guardsize < guardsize)
 547         /* Arithmetic overflow.  */
 548         return EINVAL;
 549       size += guardsize;
 550       if (__builtin_expect (size < ((guardsize + __static_tls_size
 551                                      + MINIMAL_REST_STACK + pagesize_m1)
 552                                     & ~pagesize_m1),
 553                             0))
 554         /* The stack is too small (or the guard too large).  */
 555         return EINVAL;
 556
 557       /* Try to get a stack from the cache.  */
 558       reqsize = size;
 559       pd = get_cached_stack (&size, &mem);
 560       if (pd == NULL)
 561         {
 562           /* To avoid aliasing effects on a larger scale than pages we
 563              adjust the allocated stack size if necessary.  This way
 564              allocations directly following each other will not have
 565              aliasing problems.  */
 566 #if MULTI_PAGE_ALIASING != 0
 567           if ((size % MULTI_PAGE_ALIASING) == 0)
 568             size += pagesize_m1 + 1;
 569 #endif
 570
 571           /* If a guard page is required, avoid committing memory by first
 572              allocate with PROT_NONE and then reserve with required permission
 573              excluding the guard page.  */
 574           mem = __mmap (NULL, size, (guardsize == 0) ? prot : PROT_NONE,
 575                         MAP_PRIVATE | MAP_ANONYMOUS | MAP_STACK, -1, 0);
 576
 577           if (__glibc_unlikely (mem == MAP_FAILED))
 578             return errno;
 579
 580           /* SIZE is guaranteed to be greater than zero.
 581              So we can never get a null pointer back from mmap.  */
 582           assert (mem != NULL);
 583
 584           /* Place the thread descriptor at the end of the stack.  */
 585 #if TLS_TCB_AT_TP
 586           pd = (struct pthread *) ((((uintptr_t) mem + size)
 587                                     - TLS_TCB_SIZE)
 588                                    & ~__static_tls_align_m1);
 589 #elif TLS_DTV_AT_TP
 590           pd = (struct pthread *) ((((uintptr_t) mem + size
 591                                     - __static_tls_size)
 592                                     & ~__static_tls_align_m1)
 593                                    - TLS_PRE_TCB_SIZE);
 594 #endif
 595
 596           /* Now mprotect the required region excluding the guard area.  */
 597           if (__glibc_likely (guardsize > 0))
 598             {
 599               char *guard = guard_position (mem, size, guardsize, pd,
 600                                             pagesize_m1);
 601               if (setup_stack_prot (mem, size, guard, guardsize, prot) != 0)
 602                 {
 603                   __munmap (mem, size);
 604                   return errno;
 605                 }
 606             }
 607
 608           /* Remember the stack-related values.  */
 609           pd->stackblock = mem;
 610           pd->stackblock_size = size;
 611           /* Update guardsize for newly allocated guardsize to avoid
 612              an mprotect in guard resize below.  */
 613           pd->guardsize = guardsize;
 614
 615           /* We allocated the first block thread-specific data array.
 616              This address will not change for the lifetime of this
 617              descriptor.  */
 618           pd->specific[0] = pd->specific_1stblock;
 619
 620           /* This is at least the second thread.  */
 621           pd->header.multiple_threads = 1;
 622 #ifndef TLS_MULTIPLE_THREADS_IN_TCB
 623           __pthread_multiple_threads = *__libc_multiple_threads_ptr = 1;
 624 #endif
 625
 626 #ifdef NEED_DL_SYSINFO
 627           SETUP_THREAD_SYSINFO (pd);
 628 #endif
 629
 630           /* Don't allow setxid until cloned.  */
 631           pd->setxid_futex = -1;
 632
 633           /* Allocate the DTV for this thread.  */
 634           if (_dl_allocate_tls (TLS_TPADJ (pd)) == NULL)
 635             {
 636               /* Something went wrong.  */
 637               assert (errno == ENOMEM);
 638
 639               /* Free the stack memory we just allocated.  */
 640               (void) __munmap (mem, size);
 641
 642               return errno;
 643             }
 644
 645
 646           /* Prepare to modify global data.  */
 647           lll_lock (stack_cache_lock, LLL_PRIVATE);
 648
 649           /* And add to the list of stacks in use.  */
 650           stack_list_add (&pd->list, &stack_used);
 651
 652           lll_unlock (stack_cache_lock, LLL_PRIVATE);
 653
 654
 655           /* There might have been a race.  Another thread might have
 656              caused the stacks to get exec permission while this new
 657              stack was prepared.  Detect if this was possible and
 658              change the permission if necessary.  */
 659           if (__builtin_expect ((GL(dl_stack_flags) & PF_X) != 0
 660                                 && (prot & PROT_EXEC) == 0, 0))
 661             {
 662               int err = change_stack_perm (pd
 663 #ifdef NEED_SEPARATE_REGISTER_STACK
 664                                            , ~pagesize_m1
 665 #endif
 666                                            );
 667               if (err != 0)
 668                 {
 669                   /* Free the stack memory we just allocated.  */
 670                   (void) __munmap (mem, size);
 671
 672                   return err;
 673                 }
 674             }
 675
 676
 677           /* Note that all of the stack and the thread descriptor is
 678              zeroed.  This means we do not have to initialize fields
 679              with initial value zero.  This is specifically true for
 680              the 'tid' field which is always set back to zero once the
 681              stack is not used anymore and for the 'guardsize' field
 682              which will be read next.  */
 683         }
 684
 685       /* Create or resize the guard area if necessary.  */
 686       if (__glibc_unlikely (guardsize > pd->guardsize))
 687         {
 688           char *guard = guard_position (mem, size, guardsize, pd,
 689                                         pagesize_m1);
 690           if (__mprotect (guard, guardsize, PROT_NONE) != 0)
 691             {
 692             mprot_error:
 693               lll_lock (stack_cache_lock, LLL_PRIVATE);
 694
 695               /* Remove the thread from the list.  */
 696               stack_list_del (&pd->list);
 697
 698               lll_unlock (stack_cache_lock, LLL_PRIVATE);
 699
 700               /* Get rid of the TLS block we allocated.  */
 701               _dl_deallocate_tls (TLS_TPADJ (pd), false);
 702
 703               /* Free the stack memory regardless of whether the size
 704                  of the cache is over the limit or not.  If this piece
 705                  of memory caused problems we better do not use it
 706                  anymore.  Uh, and we ignore possible errors.  There
 707                  is nothing we could do.  */
 708               (void) __munmap (mem, size);
 709
 710               return errno;
 711             }
 712
 713           pd->guardsize = guardsize;
 714         }
 715       else if (__builtin_expect (pd->guardsize - guardsize > size - reqsize,
 716                                  0))
 717         {
 718           /* The old guard area is too large.  */
 719
 720 #ifdef NEED_SEPARATE_REGISTER_STACK
 721           char *guard = mem + (((size - guardsize) / 2) & ~pagesize_m1);
 722           char *oldguard = mem + (((size - pd->guardsize) / 2) & ~pagesize_m1);
 723
 724           if (oldguard < guard
 725               && __mprotect (oldguard, guard - oldguard, prot) != 0)
 726             goto mprot_error;
 727
 728           if (__mprotect (guard + guardsize,
 729                         oldguard + pd->guardsize - guard - guardsize,
 730                         prot) != 0)
 731             goto mprot_error;
 732 #elif _STACK_GROWS_DOWN
 733           if (__mprotect ((char *) mem + guardsize, pd->guardsize - guardsize,
 734                         prot) != 0)
 735             goto mprot_error;
 736 #elif _STACK_GROWS_UP
 737          char *new_guard = (char *)(((uintptr_t) pd - guardsize)
 738                                     & ~pagesize_m1);
 739          char *old_guard = (char *)(((uintptr_t) pd - pd->guardsize)
 740                                     & ~pagesize_m1);
 741          /* The guard size difference might be > 0, but once rounded
 742             to the nearest page the size difference might be zero.  */
 743          if (new_guard > old_guard
 744              && __mprotect (old_guard, new_guard - old_guard, prot) != 0)
 745             goto mprot_error;
 746 #endif
 747
 748           pd->guardsize = guardsize;
 749         }
 750       /* The pthread_getattr_np() calls need to get passed the size
 751          requested in the attribute, regardless of how large the
 752          actually used guardsize is.  */
 753       pd->reported_guardsize = reported_guardsize;
 754     }
 755
 756   /* Initialize the lock.  We have to do this unconditionally since the
 757      stillborn thread could be canceled while the lock is taken.  */
 758   pd->lock = LLL_LOCK_INITIALIZER;
 759
 760   /* The robust mutex lists also need to be initialized
 761      unconditionally because the cleanup for the previous stack owner
 762      might have happened in the kernel.  */
 763   pd->robust_head.futex_offset = (offsetof (pthread_mutex_t, __data.__lock)
 764                                   - offsetof (pthread_mutex_t,
 765                                               __data.__list.__next));
 766   pd->robust_head.list_op_pending = NULL;
 767 #if __PTHREAD_MUTEX_HAVE_PREV
 768   pd->robust_prev = &pd->robust_head;
 769 #endif
 770   pd->robust_head.list = &pd->robust_head;
 771
 772   /* We place the thread descriptor at the end of the stack.  */
 773   *pdp = pd;
 774
 775 #if _STACK_GROWS_DOWN
 776   void *stacktop;
 777
 778 # if TLS_TCB_AT_TP
 779   /* The stack begins before the TCB and the static TLS block.  */
 780   stacktop = ((char *) (pd + 1) - __static_tls_size);
 781 # elif TLS_DTV_AT_TP
 782   stacktop = (char *) (pd - 1);
 783 # endif
 784
 785 # ifdef NEED_SEPARATE_REGISTER_STACK
 786   *stack = pd->stackblock;
 787   *stacksize = stacktop - *stack;
 788 # else
 789   *stack = stacktop;
 790 # endif
 791 #else
 792   *stack = pd->stackblock;
 793 #endif
 794
 795   return 0;
 796 }
 797
 798
 799 void
 800 __deallocate_stack (struct pthread *pd)
 801 {
 802   lll_lock (stack_cache_lock, LLL_PRIVATE);
 803
 804   /* Remove the thread from the list of threads with user defined
 805      stacks.  */
 806   stack_list_del (&pd->list);
 807
 808   /* Not much to do.  Just free the mmap()ed memory.  Note that we do
 809      not reset the 'used' flag in the 'tid' field.  This is done by
 810      the kernel.  If no thread has been created yet this field is
 811      still zero.  */
 812   if (__glibc_likely (! pd->user_stack))
 813     (void) queue_stack (pd);
 814   else
 815     /* Free the memory associated with the ELF TLS.  */
 816     _dl_deallocate_tls (TLS_TPADJ (pd), false);
 817
 818   lll_unlock (stack_cache_lock, LLL_PRIVATE);
 819 }
 820
 821
 822 int
 823 __make_stacks_executable (void **stack_endp)
 824 {
 825   /* First the main thread's stack.  */
 826   int err = _dl_make_stack_executable (stack_endp);
 827   if (err != 0)
 828     return err;
 829
 830 #ifdef NEED_SEPARATE_REGISTER_STACK
 831   const size_t pagemask = ~(__getpagesize () - 1);
 832 #endif
 833
 834   lll_lock (stack_cache_lock, LLL_PRIVATE);
 835
 836   list_t *runp;
 837   list_for_each (runp, &stack_used)
 838     {
 839       err = change_stack_perm (list_entry (runp, struct pthread, list)
 840 #ifdef NEED_SEPARATE_REGISTER_STACK
 841                                , pagemask
 842 #endif
 843                                );
 844       if (err != 0)
 845         break;
 846     }
 847
 848   /* Also change the permission for the currently unused stacks.  This
 849      might be wasted time but better spend it here than adding a check
 850      in the fast path.  */
 851   if (err == 0)
 852     list_for_each (runp, &stack_cache)
 853       {
 854         err = change_stack_perm (list_entry (runp, struct pthread, list)
 855 #ifdef NEED_SEPARATE_REGISTER_STACK
 856                                  , pagemask
 857 #endif
 858                                  );
 859         if (err != 0)
 860           break;
 861       }
 862
 863   lll_unlock (stack_cache_lock, LLL_PRIVATE);
 864
 865   return err;
 866 }
 867
 868
 869 /* In case of a fork() call the memory allocation in the child will be
 870    the same but only one thread is running.  All stacks except that of
 871    the one running thread are not used anymore.  We have to recycle
 872    them.  */
 873 void
 874 __reclaim_stacks (void)
 875 {
 876   struct pthread *self = (struct pthread *) THREAD_SELF;
 877
 878   /* No locking necessary.  The caller is the only stack in use.  But
 879      we have to be aware that we might have interrupted a list
 880      operation.  */
 881
 882   if (in_flight_stack != 0)
 883     {
 884       bool add_p = in_flight_stack & 1;
 885       list_t *elem = (list_t *) (in_flight_stack & ~(uintptr_t) 1);
 886
 887       if (add_p)
 888         {
 889           /* We always add at the beginning of the list.  So in this case we
 890              only need to check the beginning of these lists to see if the
 891              pointers at the head of the list are inconsistent.  */
 892           list_t *l = NULL;
 893
 894           if (stack_used.next->prev != &stack_used)
 895             l = &stack_used;
 896           else if (stack_cache.next->prev != &stack_cache)
 897             l = &stack_cache;
 898
 899           if (l != NULL)
 900             {
 901               assert (l->next->prev == elem);
 902               elem->next = l->next;
 903               elem->prev = l;
 904               l->next = elem;
 905             }
 906         }
 907       else
 908         {
 909           /* We can simply always replay the delete operation.  */
 910           elem->next->prev = elem->prev;
 911           elem->prev->next = elem->next;
 912         }
 913     }
 914
 915   /* Mark all stacks except the still running one as free.  */
 916   list_t *runp;
 917   list_for_each (runp, &stack_used)
 918     {
 919       struct pthread *curp = list_entry (runp, struct pthread, list);
 920       if (curp != self)
 921         {
 922           /* This marks the stack as free.  */
 923           curp->tid = 0;
 924
 925           /* Account for the size of the stack.  */
 926           stack_cache_actsize += curp->stackblock_size;
 927
 928           if (curp->specific_used)
 929             {
 930               /* Clear the thread-specific data.  */
 931               memset (curp->specific_1stblock, '\0',
 932                       sizeof (curp->specific_1stblock));
 933
 934               curp->specific_used = false;
 935
 936               for (size_t cnt = 1; cnt < PTHREAD_KEY_1STLEVEL_SIZE; ++cnt)
 937                 if (curp->specific[cnt] != NULL)
 938                   {
 939                     memset (curp->specific[cnt], '\0',
 940                             sizeof (curp->specific_1stblock));
 941
 942                     /* We have allocated the block which we do not
 943                        free here so re-set the bit.  */
 944                     curp->specific_used = true;
 945                   }
 946             }
 947         }
 948     }
 949
 950   /* Add the stack of all running threads to the cache.  */
 951   list_splice (&stack_used, &stack_cache);
 952
 953   /* Remove the entry for the current thread to from the cache list
 954      and add it to the list of running threads.  Which of the two
 955      lists is decided by the user_stack flag.  */
 956   stack_list_del (&self->list);
 957
 958   /* Re-initialize the lists for all the threads.  */
 959   INIT_LIST_HEAD (&stack_used);
 960   INIT_LIST_HEAD (&__stack_user);
 961
 962   if (__glibc_unlikely (THREAD_GETMEM (self, user_stack)))
 963     list_add (&self->list, &__stack_user);
 964   else
 965     list_add (&self->list, &stack_used);
 966
 967   /* There is one thread running.  */
 968   __nptl_nthreads = 1;
 969
 970   in_flight_stack = 0;
 971
 972   /* Initialize locks.  */
 973   stack_cache_lock = LLL_LOCK_INITIALIZER;
 974   __default_pthread_attr_lock = LLL_LOCK_INITIALIZER;
 975 }
 976
 977
 978 static void
 979 setxid_mark_thread (struct xid_command *cmdp, struct pthread *t)
 980 {
 981   int ch;
 982
 983   /* Wait until this thread is cloned.  */
 984   if (t->setxid_futex == -1
 985       && ! atomic_compare_and_exchange_bool_acq (&t->setxid_futex, -2, -1))
 986     do
 987       futex_wait_simple (&t->setxid_futex, -2, FUTEX_PRIVATE);
 988     while (t->setxid_futex == -2);
 989
 990   /* Don't let the thread exit before the setxid handler runs.  */
 991   t->setxid_futex = 0;
 992
 993   do
 994     {
 995       ch = t->cancelhandling;
 996
 997       /* If the thread is exiting right now, ignore it.  */
 998       if ((ch & EXITING_BITMASK) != 0)
 999         {
1000           /* Release the futex if there is no other setxid in
1001              progress.  */
1002           if ((ch & SETXID_BITMASK) == 0)
1003             {
1004               t->setxid_futex = 1;
1005               futex_wake (&t->setxid_futex, 1, FUTEX_PRIVATE);
1006             }
1007           return;
1008         }
1009     }
1010   while (atomic_compare_and_exchange_bool_acq (&t->cancelhandling,
1011                                                ch | SETXID_BITMASK, ch));
1012 }
1013
1014
1015 static void
1016 setxid_unmark_thread (struct xid_command *cmdp, struct pthread *t)
1017 {
1018   int ch;
1019
1020   do
1021     {
1022       ch = t->cancelhandling;
1023       if ((ch & SETXID_BITMASK) == 0)
1024         return;
1025     }
1026   while (atomic_compare_and_exchange_bool_acq (&t->cancelhandling,
1027                                                ch & ~SETXID_BITMASK, ch));
1028
1029   /* Release the futex just in case.  */
1030   t->setxid_futex = 1;
1031   futex_wake (&t->setxid_futex, 1, FUTEX_PRIVATE);
1032 }
1033
1034
1035 static int
1036 setxid_signal_thread (struct xid_command *cmdp, struct pthread *t)
1037 {
1038   if ((t->cancelhandling & SETXID_BITMASK) == 0)
1039     return 0;
1040
1041   int val;
1042   pid_t pid = __getpid ();
1043   val = INTERNAL_SYSCALL_CALL (tgkill, pid, t->tid, SIGSETXID);
1044
1045   /* If this failed, it must have had not started yet or else exited.  */
1046   if (!INTERNAL_SYSCALL_ERROR_P (val))
1047     {
1048       atomic_increment (&cmdp->cntr);
1049       return 1;
1050     }
1051   else
1052     return 0;
1053 }
1054
1055 /* Check for consistency across set*id system call results.  The abort
1056    should not happen as long as all privileges changes happen through
1057    the glibc wrappers.  ERROR must be 0 (no error) or an errno
1058    code.  */
1059 void
1060 attribute_hidden
1061 __nptl_setxid_error (struct xid_command *cmdp, int error)
1062 {
1063   do
1064     {
1065       int olderror = cmdp->error;
1066       if (olderror == error)
1067         break;
1068       if (olderror != -1)
1069         {
1070           /* Mismatch between current and previous results.  Save the
1071              error value to memory so that is not clobbered by the
1072              abort function and preserved in coredumps.  */
1073           volatile int xid_err __attribute__((unused)) = error;
1074           abort ();
1075         }
1076     }
1077   while (atomic_compare_and_exchange_bool_acq (&cmdp->error, error, -1));
1078 }
1079
1080 int
1081 attribute_hidden
1082 __nptl_setxid (struct xid_command *cmdp)
1083 {
1084   int signalled;
1085   int result;
1086   lll_lock (stack_cache_lock, LLL_PRIVATE);
1087
1088   __xidcmd = cmdp;
1089   cmdp->cntr = 0;
1090   cmdp->error = -1;
1091
1092   struct pthread *self = THREAD_SELF;
1093
1094   /* Iterate over the list with system-allocated threads first.  */
1095   list_t *runp;
1096   list_for_each (runp, &stack_used)
1097     {
1098       struct pthread *t = list_entry (runp, struct pthread, list);
1099       if (t == self)
1100         continue;
1101
1102       setxid_mark_thread (cmdp, t);
1103     }
1104
1105   /* Now the list with threads using user-allocated stacks.  */
1106   list_for_each (runp, &__stack_user)
1107     {
1108       struct pthread *t = list_entry (runp, struct pthread, list);
1109       if (t == self)
1110         continue;
1111
1112       setxid_mark_thread (cmdp, t);
1113     }
1114
1115   /* Iterate until we don't succeed in signalling anyone.  That means
1116      we have gotten all running threads, and their children will be
1117      automatically correct once started.  */
1118   do
1119     {
1120       signalled = 0;
1121
1122       list_for_each (runp, &stack_used)
1123         {
1124           struct pthread *t = list_entry (runp, struct pthread, list);
1125           if (t == self)
1126             continue;
1127
1128           signalled += setxid_signal_thread (cmdp, t);
1129         }
1130
1131       list_for_each (runp, &__stack_user)
1132         {
1133           struct pthread *t = list_entry (runp, struct pthread, list);
1134           if (t == self)
1135             continue;
1136
1137           signalled += setxid_signal_thread (cmdp, t);
1138         }
1139
1140       int cur = cmdp->cntr;
1141       while (cur != 0)
1142         {
1143           futex_wait_simple ((unsigned int *) &cmdp->cntr, cur,
1144                              FUTEX_PRIVATE);
1145           cur = cmdp->cntr;
1146         }
1147     }
1148   while (signalled != 0);
1149
1150   /* Clean up flags, so that no thread blocks during exit waiting
1151      for a signal which will never come.  */
1152   list_for_each (runp, &stack_used)
1153     {
1154       struct pthread *t = list_entry (runp, struct pthread, list);
1155       if (t == self)
1156         continue;
1157
1158       setxid_unmark_thread (cmdp, t);
1159     }
1160
1161   list_for_each (runp, &__stack_user)
1162     {
1163       struct pthread *t = list_entry (runp, struct pthread, list);
1164       if (t == self)
1165         continue;
1166
1167       setxid_unmark_thread (cmdp, t);
1168     }
1169
1170   /* This must be last, otherwise the current thread might not have
1171      permissions to send SIGSETXID syscall to the other threads.  */
1172   result = INTERNAL_SYSCALL_NCS (cmdp->syscall_no, 3,
1173                                  cmdp->id[0], cmdp->id[1], cmdp->id[2]);
1174   int error = 0;
1175   if (__glibc_unlikely (INTERNAL_SYSCALL_ERROR_P (result)))
1176     {
1177       error = INTERNAL_SYSCALL_ERRNO (result);
1178       __set_errno (error);
1179       result = -1;
1180     }
1181   __nptl_setxid_error (cmdp, error);
1182
1183   lll_unlock (stack_cache_lock, LLL_PRIVATE);
1184   return result;
1185 }
1186
1187 static inline void __attribute__((always_inline))
1188 init_one_static_tls (struct pthread *curp, struct link_map *map)
1189 {
1190 # if TLS_TCB_AT_TP
1191   void *dest = (char *) curp - map->l_tls_offset;
1192 # elif TLS_DTV_AT_TP
1193   void *dest = (char *) curp + map->l_tls_offset + TLS_PRE_TCB_SIZE;
1194 # else
1195 #  error "Either TLS_TCB_AT_TP or TLS_DTV_AT_TP must be defined"
1196 # endif
1197
1198   /* Initialize the memory.  */
1199   memset (__mempcpy (dest, map->l_tls_initimage, map->l_tls_initimage_size),
1200           '\0', map->l_tls_blocksize - map->l_tls_initimage_size);
1201 }
1202
1203 void
1204 attribute_hidden
1205 __pthread_init_static_tls (struct link_map *map)
1206 {
1207   lll_lock (stack_cache_lock, LLL_PRIVATE);
1208
1209   /* Iterate over the list with system-allocated threads first.  */
1210   list_t *runp;
1211   list_for_each (runp, &stack_used)
1212     init_one_static_tls (list_entry (runp, struct pthread, list), map);
1213
1214   /* Now the list with threads using user-allocated stacks.  */
1215   list_for_each (runp, &__stack_user)
1216     init_one_static_tls (list_entry (runp, struct pthread, list), map);
1217
1218   lll_unlock (stack_cache_lock, LLL_PRIVATE);
1219 }
1220
1221
1222 void
1223 attribute_hidden
1224 __wait_lookup_done (void)
1225 {
1226   lll_lock (stack_cache_lock, LLL_PRIVATE);
1227
1228   struct pthread *self = THREAD_SELF;
1229
1230   /* Iterate over the list with system-allocated threads first.  */
1231   list_t *runp;
1232   list_for_each (runp, &stack_used)
1233     {
1234       struct pthread *t = list_entry (runp, struct pthread, list);
1235       if (t == self || t->header.gscope_flag == THREAD_GSCOPE_FLAG_UNUSED)
1236         continue;
1237
1238       int *const gscope_flagp = &t->header.gscope_flag;
1239
1240       /* We have to wait until this thread is done with the global
1241          scope.  First tell the thread that we are waiting and
1242          possibly have to be woken.  */
1243       if (atomic_compare_and_exchange_bool_acq (gscope_flagp,
1244                                                 THREAD_GSCOPE_FLAG_WAIT,
1245                                                 THREAD_GSCOPE_FLAG_USED))
1246         continue;
1247
1248       do
1249         futex_wait_simple ((unsigned int *) gscope_flagp,
1250                            THREAD_GSCOPE_FLAG_WAIT, FUTEX_PRIVATE);
1251       while (*gscope_flagp == THREAD_GSCOPE_FLAG_WAIT);
1252     }
1253
1254   /* Now the list with threads using user-allocated stacks.  */
1255   list_for_each (runp, &__stack_user)
1256     {
1257       struct pthread *t = list_entry (runp, struct pthread, list);
1258       if (t == self || t->header.gscope_flag == THREAD_GSCOPE_FLAG_UNUSED)
1259         continue;
1260
1261       int *const gscope_flagp = &t->header.gscope_flag;
1262
1263       /* We have to wait until this thread is done with the global
1264          scope.  First tell the thread that we are waiting and
1265          possibly have to be woken.  */
1266       if (atomic_compare_and_exchange_bool_acq (gscope_flagp,
1267                                                 THREAD_GSCOPE_FLAG_WAIT,
1268                                                 THREAD_GSCOPE_FLAG_USED))
1269         continue;
1270
1271       do
1272         futex_wait_simple ((unsigned int *) gscope_flagp,
1273                            THREAD_GSCOPE_FLAG_WAIT, FUTEX_PRIVATE);
1274       while (*gscope_flagp == THREAD_GSCOPE_FLAG_WAIT);
1275     }
1276
1277   lll_unlock (stack_cache_lock, LLL_PRIVATE);
1278 }