passt/isolation.c
David Gibson eed17a47fe Handle userns isolation and dropping root at the same time
passt/pasta can interact with user namespaces in a number of ways:
   1) With --netns-only we'll remain in our original user namespace
   2) With --userns or a PID option to pasta we'll join either the given
      user namespace or that of the PID
   3) When pasta spawns a shell or command we'll start a new user namespace
      for the command and then join it
   4) With passt we'll create a new user namespace when we sandbox()
      ourself

However (3) and (4) turn out to have essentially the same effect.  In both
cases we create one new user namespace.  The spawned command starts there,
and passt/pasta itself will live there from sandbox() onwards.

Because of this, we can simplify user namespace handling by moving the
userns handling earlier, to the same point we drop root in the original
namespace.  Extend the drop_user() function to isolate_user() which does
both.

After switching UID and GID in the original userns, isolate_user() will
either join or create the userns we require.  When we spawn a command with
pasta_start_ns()/pasta_setup_ns() we no longer need to create a userns,
because we've already made one.  sandbox() likewise no longer needs to
create (or join) a userns because we're already in the one we need.

We no longer need c->pasta_userns_fd, since the fd is only used locally
in isolate_user().  Likewise we can replace c->netns_only with a local
in conf(), since it's not used outside there.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
2022-09-13 05:31:51 +02:00

210 lines
4.7 KiB
C

// SPDX-License-Identifier: AGPL-3.0-or-later
/* PASST - Plug A Simple Socket Transport
* for qemu/UNIX domain socket mode
*
* PASTA - Pack A Subtle Tap Abstraction
* for network namespace/tap device mode
*
* isolation.c - Self isolation helpers
*
* Copyright Red Hat
* Author: Stefano Brivio <sbrivio@redhat.com>
* Author: David Gibson <david@gibson.dropbear.id.au>
*/
#include <errno.h>
#include <fcntl.h>
#include <grp.h>
#include <inttypes.h>
#include <limits.h>
#include <pwd.h>
#include <sched.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <unistd.h>
#include <sys/mount.h>
#include <sys/prctl.h>
#include <sys/socket.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <netinet/in.h>
#include <netinet/if_ether.h>
#include <linux/audit.h>
#include <linux/capability.h>
#include <linux/filter.h>
#include <linux/seccomp.h>
#include "util.h"
#include "seccomp.h"
#include "passt.h"
#include "isolation.h"
/**
 * drop_caps() - Drop all capabilities but CAP_NET_BIND_SERVICE from bounding set
 *
 * Capability numbers 0 to 63 are tried one by one.  The result of each prctl()
 * call is intentionally not checked, so numbers that can't be dropped are
 * simply skipped.
 */
void drop_caps(void)
{
	int cap = 0;

	while (cap < 64) {
		/* CAP_NET_BIND_SERVICE is the only capability we keep */
		if (cap != CAP_NET_BIND_SERVICE)
			prctl(PR_CAPBSET_DROP, cap, 0, 0, 0);

		cap++;
	}
}
/**
 * isolate_user() - Take on final UID/GID, then enter the userns we'll run in
 * @uid:	User ID to switch to (in original userns)
 * @gid:	Group ID to switch to (in original userns)
 * @use_userns:	If false, stay in the current userns after switching IDs
 * @userns:	Path of an existing userns to join; if empty, create a new one
 *
 * Exits with failure status if switching IDs fails, or if the user namespace
 * can't be opened, joined or created.
 */
void isolate_user(uid_t uid, gid_t gid, bool use_userns, const char *userns)
{
	char idmap[BUFSIZ];
	int nsfd;

	/* IDs are switched first, while still in the original namespace.
	 * Without CAP_SETGID, setgroups() fails with EPERM: that single error
	 * is tolerated.
	 */
	if (setgroups(0, NULL) && errno != EPERM) {
		err("Can't drop supplementary groups: %s", strerror(errno));
		exit(EXIT_FAILURE);
	}

	if (setgid(gid)) {
		err("Can't set GID to %u: %s", gid, strerror(errno));
		exit(EXIT_FAILURE);
	}

	if (setuid(uid)) {
		err("Can't set UID to %u: %s", uid, strerror(errno));
		exit(EXIT_FAILURE);
	}

	/* Caller doesn't want a userns change: we're done */
	if (!use_userns)
		return;

	if (!*userns) {
		/* No userns path given: create a namespace of our own... */
		if (unshare(CLONE_NEWUSER)) {
			err("Couldn't create user namespace: %s",
			    strerror(errno));
			exit(EXIT_FAILURE);
		}

		/* ...and map ID 0 inside it to the UID and GID set above.
		 * Writes happen in this order: uid_map, setgroups, gid_map.
		 */
		snprintf(idmap, sizeof(idmap), "0 %u 1", uid);
		FWRITE("/proc/self/uid_map", idmap,
		       "Cannot set uid_map in namespace");

		FWRITE("/proc/self/setgroups", "deny",
		       "Cannot write to setgroups in namespace");

		snprintf(idmap, sizeof(idmap), "0 %u 1", gid);
		FWRITE("/proc/self/gid_map", idmap,
		       "Cannot set gid_map in namespace");

		return;
	}

	/* A userns path was given: join that namespace */
	nsfd = open(userns, O_RDONLY | O_CLOEXEC);
	if (nsfd < 0) {
		err("Couldn't open user namespace %s: %s",
		    userns, strerror(errno));
		exit(EXIT_FAILURE);
	}

	if (setns(nsfd, CLONE_NEWUSER)) {
		err("Couldn't enter user namespace %s: %s",
		    userns, strerror(errno));
		exit(EXIT_FAILURE);
	}

	close(nsfd);
}
/**
 * sandbox_fail() - Report a failed sandboxing step, keeping its error code
 * @what:	Name of the failed call, used as prefix for perror()
 *
 * To be called right after the failing call: errno is saved before reporting,
 * because perror() itself is allowed to change it.
 *
 * Return: negative errno value left by the failed call
 */
static int sandbox_fail(const char *what)
{
	int rc = -errno;

	perror(what);

	return rc;
}

/**
 * sandbox() - Unshare IPC, mount, PID, UTS namespaces, "unmount" root
 * @c:		Execution context
 *
 * No user namespace is created or joined here (CLONE_NEWUSER is not part of
 * the unshare flags): any userns setup needs to happen before this is called,
 * see isolate_user().
 *
 * Return: negative error code on failure, zero on success
 */
int sandbox(struct ctx *c)
{
	int flags = CLONE_NEWIPC | CLONE_NEWNS | CLONE_NEWUTS;

	/* If we run in foreground, we have no chance to actually move to a new
	 * PID namespace. For passt, use CLONE_NEWPID anyway, in case somebody
	 * ever gets around seccomp profiles -- there's no harm in passing it.
	 */
	if (!c->foreground || c->mode == MODE_PASST)
		flags |= CLONE_NEWPID;

	if (unshare(flags))
		return sandbox_fail("unshare");

	/* Mark the whole mount tree, recursively, as unbindable */
	if (mount("", "/", "", MS_UNBINDABLE | MS_REC, NULL))
		return sandbox_fail("mount /");

	/* Read-only tmpfs with no room for content, to be used as new root */
	if (mount("", TMPDIR, "tmpfs",
		  MS_NODEV | MS_NOEXEC | MS_NOSUID | MS_RDONLY,
		  "nr_inodes=2,nr_blocks=0"))
		return sandbox_fail("mount tmpfs");

	if (chdir(TMPDIR))
		return sandbox_fail("chdir");

	/* Pivot into the tmpfs using the same directory as new root and as
	 * location for the old root, then lazily detach what's mounted there.
	 */
	if (syscall(SYS_pivot_root, ".", "."))
		return sandbox_fail("pivot_root");

	if (umount2(".", MNT_DETACH | UMOUNT_NOFOLLOW))
		return sandbox_fail("umount2");

	drop_caps();	/* Relative to the new user namespace this time. */

	return 0;
}
/**
 * seccomp() - Install the seccomp filter program matching the operating mode
 * @c:		Execution context
 *
 * Doesn't return on failure: the process exits with failure status instead.
 */
void seccomp(const struct ctx *c)
{
	/* Start from the pasta filter, switch to the passt one if needed */
	struct sock_fprog prog = {
		.len	= (unsigned short)ARRAY_SIZE(filter_pasta),
		.filter	= filter_pasta,
	};

	if (c->mode == MODE_PASST) {
		prog.len = (unsigned short)ARRAY_SIZE(filter_passt);
		prog.filter = filter_passt;
	}

	/* The filter is loaded only once PR_SET_NO_NEW_PRIVS succeeded */
	if (!prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) &&
	    !prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog))
		return;

	perror("prctl");
	exit(EXIT_FAILURE);
}