[patch v6] Allow for unpriviledged nested containers

Carlos O'Donell carlos@redhat.com
Mon Apr 4 14:17:55 GMT 2022


On 3/28/22 23:53, DJ Delorie wrote:
> 
> [changes since v5: whether or not to isolate the pid namespace is now a
> test-specific "pidns" option, support_needs_proc() takes a "why" message
> for future readers]
> 
> If the build itself is run in a container, we may not be able to
> fully set up a nested container for test-container testing.
> Notably is the mounting of /proc, since it's critical that it
> be mounted from within the same PID namespace as its users, and
> thus cannot be bind mounted from outside the container like other
> mounts.
> 
> This patch defaults to using the parent's PID namespace instead of
> creating a new one, as this is more likely to be allowed.
> 
> If the test needs an isolated PID namespace, it should add the "pidns"
> command to its init script.

LGTM.

Reviewed-by: Carlos O'Donell <carlos@redhat.com>
 
> diff --git a/elf/tst-pldd.c b/elf/tst-pldd.c
> index f31f9956faa..378262504a5 100644
> --- a/elf/tst-pldd.c
> +++ b/elf/tst-pldd.c
> @@ -85,6 +85,8 @@ in_str_list (const char *libname, const char *const strlist[])
>  static int
>  do_test (void)
>  {
> +  support_need_proc ("needs /proc/sys/kernel/yama/ptrace_scope and /proc/$child");
> +

OK. Nice use of a message to print.

>    /* Check if our subprocess can be debugged with ptrace.  */
>    {
>      int ptrace_scope = support_ptrace_scope ();
> diff --git a/nptl/tst-pthread-getattr.c b/nptl/tst-pthread-getattr.c
> index d2ebf308ae7..3f6f76ea83b 100644
> --- a/nptl/tst-pthread-getattr.c
> +++ b/nptl/tst-pthread-getattr.c
> @@ -28,6 +28,8 @@
>  #include <unistd.h>
>  #include <inttypes.h>
>  
> +#include <support/support.h>
> +
>  /* There is an obscure bug in the kernel due to which RLIMIT_STACK is sometimes
>     returned as unlimited when it is not, which may cause this test to fail.
>     There is also the other case where RLIMIT_STACK is intentionally set as
> @@ -153,6 +155,8 @@ check_stack_top (void)
>  static int
>  do_test (void)
>  {
> +  support_need_proc ("Reads /proc/self/maps to get stack size.");

OK.

> +
>    pagesize = sysconf (_SC_PAGESIZE);
>    return check_stack_top ();
>  }
> diff --git a/nss/tst-reload2.c b/nss/tst-reload2.c
> index fb3b94a1fab..7df0ca740b6 100644
> --- a/nss/tst-reload2.c
> +++ b/nss/tst-reload2.c
> @@ -95,6 +95,8 @@ do_test (void)
>    char buf1[PATH_MAX];
>    char buf2[PATH_MAX];
>  
> +  support_need_proc ("Our xmkdirp fails if we can't map our uid, which requires /proc.");
> +

OK.

>    sprintf (buf1, "/subdir%s", support_slibdir_prefix);
>    xmkdirp (buf1, 0777);
>  
> diff --git a/support/Makefile b/support/Makefile
> index 5ddcb8d1581..f036a813048 100644
> --- a/support/Makefile
> +++ b/support/Makefile
> @@ -64,6 +64,7 @@ libsupport-routines = \
>    support_format_netent \
>    support_isolate_in_subprocess \
>    support_mutex_pi_monotonic \
> +  support_need_proc \

OK.

>    support_path_support_time64 \
>    support_process_state \
>    support_ptrace \
> diff --git a/support/support.h b/support/support.h
> index 73b9fc48f01..d20051da4d4 100644
> --- a/support/support.h
> +++ b/support/support.h
> @@ -91,6 +91,11 @@ char *support_quote_string (const char *);
>     regular file open for writing, and initially empty.  */
>  int support_descriptor_supports_holes (int fd);
>  
> +/* Predicates that a test requires a working /proc filesystem.  This
> +   call will exit with UNSUPPORTED if /proc is not available, printing
> +   WHY_MSG as part of the diagnostic.  */
> +void support_need_proc (const char *why_msg);

OK.

> +
>  /* Error-checking wrapper functions which terminate the process on
>     error.  */
>  
> diff --git a/support/support_need_proc.c b/support/support_need_proc.c
> new file mode 100644
> index 00000000000..9b4eab7539b
> --- /dev/null
> +++ b/support/support_need_proc.c
> @@ -0,0 +1,35 @@
> +/* Indicate that a test requires a working /proc.
> +   Copyright (C) 2022 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#include <unistd.h>
> +#include <support/check.h>
> +#include <support/support.h>
> +
> +/* We test for /proc/self/maps since that's one of the files that one
> +   of our tests actually uses, but the general idea is if Linux's
> +   /proc/ (procfs) filesystem is mounted.  If not, the process exits
> +   with an UNSUPPORTED result code.  */
> +
> +void
> +support_need_proc (const char *why_msg)
> +{
> +#ifdef __linux__
> +  if (access ("/proc/self/maps", R_OK))
> +    FAIL_UNSUPPORTED ("/proc is not available, %s", why_msg);
> +#endif

OK. Only does anything on linux (fixed from v5).

> +}
> diff --git a/support/test-container.c b/support/test-container.c
> index 25e7f142193..c837c4d7580 100644
> --- a/support/test-container.c
> +++ b/support/test-container.c
> @@ -97,6 +97,7 @@ int verbose = 0;
>     * mytest.root/mytest.script has a list of "commands" to run:
>         syntax:
>           # comment
> +	 pidns <comment>

Perfect.

This is a great way to avoid bitrot of this code while still making it functional.

>           su
>           mv FILE FILE
>  	 cp FILE FILE
> @@ -122,6 +123,8 @@ int verbose = 0;
>  
>         details:
>           - '#': A comment.
> +	 - 'pidns': Require a separate PID namespace, prints comment if it can't
> +	    (default is a shared pid namespace)

OK.

>           - 'su': Enables running test as root in the container.
>           - 'mv': A minimal move files command.
>           - 'cp': A minimal copy files command.
> @@ -148,7 +151,7 @@ int verbose = 0;
>     * Simple, easy to review code (i.e. prefer simple naive code over
>       complex efficient code)
>  
> -   * The current implementation ist parallel-make-safe, but only in
> +   * The current implementation is parallel-make-safe, but only in
>       that it uses a lock to prevent parallel access to the testroot.  */
>  
>  

> @@ -227,11 +230,37 @@ concat (const char *str, ...)
>    return bufs[n];
>  }
>  
> +/* Like the above, but put spaces between words.  Caller frees.  */
> +static char *
> +concat_words (char **words, int num_words)
> +{
> +  int len = 0;
> +  int i;
> +  char *rv, *p;
> +
> +  for (i = 0; i < num_words; i ++)
> +    {
> +      len += strlen (words[i]);
> +      len ++;
> +    }
> +
> +  p = rv = (char *) xmalloc (len);
> +
> +  for (i = 0; i < num_words; i ++)
> +    {
> +      if (i > 0)
> +	p = stpcpy (p, " ");
> +      p = stpcpy (p, words[i]);
> +    }
> +
> +  return rv;
> +}
> +
>  /* Try to mount SRC onto DEST.  */
>  static void
>  trymount (const char *src, const char *dest)
>  {
> -  if (mount (src, dest, "", MS_BIND, NULL) < 0)
> +  if (mount (src, dest, "", MS_BIND | MS_REC, NULL) < 0)
>      FAIL_EXIT1 ("can't mount %s onto %s\n", src, dest);
>  }
>  
> @@ -726,6 +755,9 @@ main (int argc, char **argv)
>    gid_t original_gid;
>    /* If set, the test runs as root instead of the user running the testsuite.  */
>    int be_su = 0;
> +  int require_pidns = 0;
> +  const char *pidns_comment = NULL;
> +  int do_proc_mounts = 0;
>    int UMAP;
>    int GMAP;
>    /* Used for "%lld %lld 1" so need not be large.  */
> @@ -1011,6 +1043,12 @@ main (int argc, char **argv)
>  	      {
>  		be_su = 1;
>  	      }
> +	    else if (nt >= 1 && strcmp (the_words[0], "pidns") == 0)
> +	      {
> +		require_pidns = 1;
> +		if (nt > 1)
> +		  pidns_comment = concat_words (the_words + 1, nt - 1);

OK.

> +	      }
>  	    else if (nt == 3 && strcmp (the_words[0], "mkdirp") == 0)
>  	      {
>  		long int m;
> @@ -1068,7 +1106,8 @@ main (int argc, char **argv)
>  
>  #ifdef CLONE_NEWNS
>    /* The unshare here gives us our own spaces and capabilities.  */
> -  if (unshare (CLONE_NEWUSER | CLONE_NEWPID | CLONE_NEWNS) < 0)
> +  if (unshare (CLONE_NEWUSER | CLONE_NEWNS
> +	       | (require_pidns ? CLONE_NEWPID : 0)) < 0)
>      {
>        /* Older kernels may not support all the options, or security
>  	 policy may block this call.  */
> @@ -1079,6 +1118,11 @@ main (int argc, char **argv)
>  	    check_for_unshare_hints ();
>  	  FAIL_UNSUPPORTED ("unable to unshare user/fs: %s", strerror (saved_errno));
>  	}
> +      /* We're about to exit anyway, it's "safe" to call unshare again
> +	 just to see if the CLONE_NEWPID caused the error.  */
> +      else if (require_pidns && unshare (CLONE_NEWUSER | CLONE_NEWNS) >= 0)
> +	FAIL_EXIT1 ("unable to unshare pid ns: %s : %s", strerror (errno),
> +		    pidns_comment ? pidns_comment : "required by test");
>        else
>  	FAIL_EXIT1 ("unable to unshare user/fs: %s", strerror (errno));
>      }
> @@ -1094,6 +1138,15 @@ main (int argc, char **argv)
>    trymount (support_srcdir_root, new_srcdir_path);
>    trymount (support_objdir_root, new_objdir_path);
>  
> +  /* It may not be possible to mount /proc directly.  */
> +  if (! require_pidns)
> +  {
> +    char *new_proc = concat (new_root_path, "/proc", NULL);
> +    xmkdirp (new_proc, 0755);
> +    trymount ("/proc", new_proc);
> +    do_proc_mounts = 1;
> +  }
> +
>    xmkdirp (concat (new_root_path, "/dev", NULL), 0755);
>    devmount (new_root_path, "null");
>    devmount (new_root_path, "zero");
> @@ -1163,42 +1216,60 @@ main (int argc, char **argv)
>  
>    maybe_xmkdir ("/tmp", 0755);
>  
> -  /* Now that we're pid 1 (effectively "root") we can mount /proc  */
> -  maybe_xmkdir ("/proc", 0777);
> -  if (mount ("proc", "/proc", "proc", 0, NULL) < 0)
> -    FAIL_EXIT1 ("Unable to mount /proc: ");
> -
> -  /* We map our original UID to the same UID in the container so we
> -     can own our own files normally.  */
> -  UMAP = open ("/proc/self/uid_map", O_WRONLY);
> -  if (UMAP < 0)
> -    FAIL_EXIT1 ("can't write to /proc/self/uid_map\n");
> -
> -  sprintf (tmp, "%lld %lld 1\n",
> -	   (long long) (be_su ? 0 : original_uid), (long long) original_uid);
> -  write (UMAP, tmp, strlen (tmp));
> -  xclose (UMAP);
> -
> -  /* We must disable setgroups () before we can map our groups, else we
> -     get EPERM.  */
> -  GMAP = open ("/proc/self/setgroups", O_WRONLY);
> -  if (GMAP >= 0)
> +  if (require_pidns)
>      {
> -      /* We support kernels old enough to not have this.  */
> -      write (GMAP, "deny\n", 5);
> -      xclose (GMAP);
> +      /* Now that we're pid 1 (effectively "root") we can mount /proc  */
> +      maybe_xmkdir ("/proc", 0777);
> +      if (mount ("proc", "/proc", "proc", 0, NULL) != 0)
> +	{
> +	  /* This happens if we're trying to create a nested container,
> +	     like if the build is running under podman, and we lack
> +	     priviledges.
> +
> +	     Ideally we would WARN here, but that would just add noise to
> +	     *every* test-container test, and the ones that care should
> +	     have their own relevent diagnostics.
> +
> +	     FAIL_EXIT1 ("Unable to mount /proc: ");  */
> +	}
> +      else
> +	do_proc_mounts = 1;
>      }
>  
> -  /* We map our original GID to the same GID in the container so we
> -     can own our own files normally.  */
> -  GMAP = open ("/proc/self/gid_map", O_WRONLY);
> -  if (GMAP < 0)
> -    FAIL_EXIT1 ("can't write to /proc/self/gid_map\n");
> +  if (do_proc_mounts)
> +    {
> +      /* We map our original UID to the same UID in the container so we
> +	 can own our own files normally.  */
> +      UMAP = open ("/proc/self/uid_map", O_WRONLY);
> +      if (UMAP < 0)
> +	FAIL_EXIT1 ("can't write to /proc/self/uid_map\n");
> +
> +      sprintf (tmp, "%lld %lld 1\n",
> +	       (long long) (be_su ? 0 : original_uid), (long long) original_uid);
> +      write (UMAP, tmp, strlen (tmp));
> +      xclose (UMAP);
> +
> +      /* We must disable setgroups () before we can map our groups, else we
> +	 get EPERM.  */
> +      GMAP = open ("/proc/self/setgroups", O_WRONLY);
> +      if (GMAP >= 0)
> +	{
> +	  /* We support kernels old enough to not have this.  */
> +	  write (GMAP, "deny\n", 5);
> +	  xclose (GMAP);
> +	}
>  
> -  sprintf (tmp, "%lld %lld 1\n",
> -	   (long long) (be_su ? 0 : original_gid), (long long) original_gid);
> -  write (GMAP, tmp, strlen (tmp));
> -  xclose (GMAP);
> +      /* We map our original GID to the same GID in the container so we
> +	 can own our own files normally.  */
> +      GMAP = open ("/proc/self/gid_map", O_WRONLY);
> +      if (GMAP < 0)
> +	FAIL_EXIT1 ("can't write to /proc/self/gid_map\n");
> +
> +      sprintf (tmp, "%lld %lld 1\n",
> +	       (long long) (be_su ? 0 : original_gid), (long long) original_gid);
> +      write (GMAP, tmp, strlen (tmp));
> +      xclose (GMAP);
> +    }
>  
>    if (change_cwd)
>      {
> 


-- 
Cheers,
Carlos.



More information about the Libc-alpha mailing list