16f5586bb8
The context structure contains a batch of fields specific to IPv4 and to IPv6 connectivity. Split those out into a sub-structure. This allows the conf_ip4() and conf_ip6() functions, which take the entire context but touch very little of it, to be given more specific parameters, making it clearer what it affects without stepping through the code. Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
632 lines
14 KiB
C
632 lines
14 KiB
C
// SPDX-License-Identifier: AGPL-3.0-or-later
|
|
|
|
/* PASST - Plug A Simple Socket Transport
|
|
* for qemu/UNIX domain socket mode
|
|
*
|
|
* PASTA - Pack A Subtle Tap Abstraction
|
|
* for network namespace/tap device mode
|
|
*
|
|
* util.c - Convenience helpers
|
|
*
|
|
* Copyright (c) 2020-2021 Red Hat GmbH
|
|
* Author: Stefano Brivio <sbrivio@redhat.com>
|
|
*/
|
|
|
|
#include <sched.h>
|
|
#include <stdio.h>
|
|
#include <stdint.h>
|
|
#include <stddef.h>
|
|
#include <stdlib.h>
|
|
#include <unistd.h>
|
|
#include <arpa/inet.h>
|
|
#include <net/ethernet.h>
|
|
#include <net/if.h>
|
|
#include <netinet/tcp.h>
|
|
#include <netinet/udp.h>
|
|
#include <sys/epoll.h>
|
|
#include <sys/prctl.h>
|
|
#include <sys/types.h>
|
|
#include <sys/stat.h>
|
|
#include <fcntl.h>
|
|
#include <syslog.h>
|
|
#include <stdarg.h>
|
|
#include <string.h>
|
|
#include <time.h>
|
|
#include <errno.h>
|
|
#include <pwd.h>
|
|
#include <grp.h>
|
|
|
|
#include <linux/capability.h>
|
|
|
|
#include "util.h"
|
|
#include "passt.h"
|
|
#include "packet.h"
|
|
#include "lineread.h"
|
|
|
|
/* For __openlog() and __setlogmask() wrappers, and passt_vsyslog() */
|
|
static int log_mask;
|
|
static int log_sock = -1;
|
|
static char log_ident[BUFSIZ];
|
|
static int log_opt;
|
|
static time_t log_debug_start;
|
|
int log_trace;
|
|
|
|
#define logfn(name, level) \
|
|
void name(const char *format, ...) { \
|
|
struct timespec tp; \
|
|
va_list args; \
|
|
\
|
|
if (setlogmask(0) & LOG_MASK(LOG_DEBUG)) { \
|
|
clock_gettime(CLOCK_REALTIME, &tp); \
|
|
fprintf(stderr, "%li.%04li: ", \
|
|
tp.tv_sec - log_debug_start, \
|
|
tp.tv_nsec / (100L * 1000)); \
|
|
} else { \
|
|
va_start(args, format); \
|
|
passt_vsyslog(level, format, args); \
|
|
va_end(args); \
|
|
} \
|
|
\
|
|
if (setlogmask(0) & LOG_MASK(LOG_DEBUG) || \
|
|
setlogmask(0) == LOG_MASK(LOG_EMERG)) { \
|
|
va_start(args, format); \
|
|
vfprintf(stderr, format, args); \
|
|
va_end(args); \
|
|
if (format[strlen(format)] != '\n') \
|
|
fprintf(stderr, "\n"); \
|
|
} \
|
|
}
|
|
|
|
logfn(err, LOG_ERR)
|
|
logfn(warn, LOG_WARNING)
|
|
logfn(info, LOG_INFO)
|
|
logfn(debug, LOG_DEBUG)
|
|
|
|
void trace_init(int enable)
|
|
{
|
|
log_trace = enable;
|
|
}
|
|
|
|
/**
|
|
* __openlog() - Non-optional openlog() wrapper, to allow custom vsyslog()
|
|
* @ident: openlog() identity (program name)
|
|
* @option: openlog() options
|
|
* @facility: openlog() facility (LOG_DAEMON)
|
|
*/
|
|
void __openlog(const char *ident, int option, int facility)
|
|
{
|
|
struct timespec tp;
|
|
|
|
clock_gettime(CLOCK_REALTIME, &tp);
|
|
log_debug_start = tp.tv_sec;
|
|
|
|
if (log_sock < 0) {
|
|
struct sockaddr_un a = { .sun_family = AF_UNIX, };
|
|
|
|
log_sock = socket(AF_UNIX, SOCK_DGRAM | SOCK_CLOEXEC, 0);
|
|
if (log_sock < 0)
|
|
return;
|
|
|
|
strncpy(a.sun_path, _PATH_LOG, sizeof(a.sun_path));
|
|
if (connect(log_sock, (const struct sockaddr *)&a, sizeof(a))) {
|
|
close(log_sock);
|
|
log_sock = -1;
|
|
return;
|
|
}
|
|
}
|
|
|
|
log_mask |= facility;
|
|
strncpy(log_ident, ident, sizeof(log_ident) - 1);
|
|
log_opt = option;
|
|
|
|
openlog(ident, option, facility);
|
|
}
|
|
|
|
/**
|
|
* __setlogmask() - setlogmask() wrapper, to allow custom vsyslog()
|
|
* @mask: Same as setlogmask() mask
|
|
*/
|
|
void __setlogmask(int mask)
|
|
{
|
|
log_mask = mask;
|
|
setlogmask(mask);
|
|
}
|
|
|
|
/**
|
|
* passt_vsyslog() - vsyslog() implementation not using heap memory
|
|
* @pri: Facility and level map, same as priority for vsyslog()
|
|
* @format: Same as vsyslog() format
|
|
* @ap: Same as vsyslog() ap
|
|
*/
|
|
void passt_vsyslog(int pri, const char *format, va_list ap)
|
|
{
|
|
char buf[BUFSIZ];
|
|
int n;
|
|
|
|
if (!(LOG_MASK(LOG_PRI(pri)) & log_mask))
|
|
return;
|
|
|
|
/* Send without name and timestamp, the system logger should add them */
|
|
n = snprintf(buf, BUFSIZ, "<%i> ", pri);
|
|
|
|
n += vsnprintf(buf + n, BUFSIZ - n, format, ap);
|
|
|
|
if (format[strlen(format)] != '\n')
|
|
n += snprintf(buf + n, BUFSIZ - n, "\n");
|
|
|
|
if (log_opt & LOG_PERROR)
|
|
fprintf(stderr, "%s", buf + sizeof("<0>"));
|
|
|
|
if (send(log_sock, buf, n, 0) != n)
|
|
fprintf(stderr, "Failed to send %i bytes to syslog\n", n);
|
|
}
|
|
|
|
#define IPV6_NH_OPT(nh) \
|
|
((nh) == 0 || (nh) == 43 || (nh) == 44 || (nh) == 50 || \
|
|
(nh) == 51 || (nh) == 60 || (nh) == 135 || (nh) == 139 || \
|
|
(nh) == 140 || (nh) == 253 || (nh) == 254)
|
|
|
|
/**
|
|
* ipv6_l4hdr() - Find pointer to L4 header in IPv6 packet and extract protocol
|
|
* @p: Packet pool, packet number @index has IPv6 header at @offset
|
|
* @index: Index of packet in pool
|
|
* @offset: Pre-calculated IPv6 header offset
|
|
* @proto: Filled with L4 protocol number
|
|
* @dlen: Data length (payload excluding header extensions), set on return
|
|
*
|
|
* Return: pointer to L4 header, NULL if not found
|
|
*/
|
|
char *ipv6_l4hdr(const struct pool *p, int index, size_t offset, uint8_t *proto,
|
|
size_t *dlen)
|
|
{
|
|
struct ipv6_opt_hdr *o;
|
|
struct ipv6hdr *ip6h;
|
|
char *base;
|
|
int hdrlen;
|
|
uint8_t nh;
|
|
|
|
base = packet_get(p, index, 0, 0, NULL);
|
|
ip6h = packet_get(p, index, offset, sizeof(*ip6h), dlen);
|
|
if (!ip6h)
|
|
return NULL;
|
|
|
|
offset += sizeof(*ip6h);
|
|
|
|
nh = ip6h->nexthdr;
|
|
if (!IPV6_NH_OPT(nh))
|
|
goto found;
|
|
|
|
while ((o = packet_get_try(p, index, offset, sizeof(*o), dlen))) {
|
|
nh = o->nexthdr;
|
|
hdrlen = (o->hdrlen + 1) * 8;
|
|
|
|
if (IPV6_NH_OPT(nh))
|
|
offset += hdrlen;
|
|
else
|
|
goto found;
|
|
}
|
|
|
|
return NULL;
|
|
|
|
found:
|
|
if (nh == 59)
|
|
return NULL;
|
|
|
|
*proto = nh;
|
|
return base + offset;
|
|
}
|
|
|
|
/**
|
|
* sock_l4() - Create and bind socket for given L4, add to epoll list
|
|
* @c: Execution context
|
|
* @af: Address family, AF_INET or AF_INET6
|
|
* @proto: Protocol number
|
|
* @bind_addr: Address for binding, NULL for any
|
|
* @port: Port, host order
|
|
* @data: epoll reference portion for protocol handlers
|
|
*
|
|
* Return: newly created socket, -1 on error
|
|
*/
|
|
int sock_l4(const struct ctx *c, int af, uint8_t proto,
|
|
const void *bind_addr, uint16_t port, uint32_t data)
|
|
{
|
|
union epoll_ref ref = { .r.proto = proto, .r.p.data = data };
|
|
struct sockaddr_in addr4 = {
|
|
.sin_family = AF_INET,
|
|
.sin_port = htons(port),
|
|
{ 0 }, { 0 },
|
|
};
|
|
struct sockaddr_in6 addr6 = {
|
|
.sin6_family = AF_INET6,
|
|
.sin6_port = htons(port),
|
|
0, IN6ADDR_ANY_INIT, 0,
|
|
};
|
|
const struct sockaddr *sa;
|
|
struct epoll_event ev;
|
|
int fd, sl, y = 1;
|
|
|
|
if (proto != IPPROTO_TCP && proto != IPPROTO_UDP &&
|
|
proto != IPPROTO_ICMP && proto != IPPROTO_ICMPV6)
|
|
return -1; /* Not implemented. */
|
|
|
|
if (proto == IPPROTO_TCP)
|
|
fd = socket(af, SOCK_STREAM | SOCK_NONBLOCK, proto);
|
|
else
|
|
fd = socket(af, SOCK_DGRAM | SOCK_NONBLOCK, proto);
|
|
|
|
if (fd < 0) {
|
|
perror("L4 socket");
|
|
return -1;
|
|
}
|
|
|
|
if (fd > SOCKET_MAX) {
|
|
close(fd);
|
|
return -EIO;
|
|
}
|
|
|
|
ref.r.s = fd;
|
|
|
|
if (af == AF_INET) {
|
|
if (bind_addr)
|
|
addr4.sin_addr.s_addr = *(in_addr_t *)bind_addr;
|
|
else
|
|
addr4.sin_addr.s_addr = htonl(INADDR_ANY);
|
|
|
|
sa = (const struct sockaddr *)&addr4;
|
|
sl = sizeof(addr4);
|
|
} else {
|
|
if (bind_addr) {
|
|
addr6.sin6_addr = *(struct in6_addr *)bind_addr;
|
|
|
|
if (!memcmp(bind_addr, &c->ip6.addr_ll,
|
|
sizeof(c->ip6.addr_ll)))
|
|
addr6.sin6_scope_id = c->ifi6;
|
|
} else {
|
|
addr6.sin6_addr = in6addr_any;
|
|
}
|
|
|
|
sa = (const struct sockaddr *)&addr6;
|
|
sl = sizeof(addr6);
|
|
|
|
if (setsockopt(fd, IPPROTO_IPV6, IPV6_V6ONLY, &y, sizeof(y)))
|
|
debug("Failed to set IPV6_V6ONLY on socket %i", fd);
|
|
}
|
|
|
|
if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &y, sizeof(y)))
|
|
debug("Failed to set SO_REUSEADDR on socket %i", fd);
|
|
|
|
if (bind(fd, sa, sl) < 0) {
|
|
/* We'll fail to bind to low ports if we don't have enough
|
|
* capabilities, and we'll fail to bind on already bound ports,
|
|
* this is fine. This might also fail for ICMP because of a
|
|
* broken SELinux policy, see icmp_tap_handler().
|
|
*/
|
|
if (proto != IPPROTO_ICMP && proto != IPPROTO_ICMPV6) {
|
|
close(fd);
|
|
return -1;
|
|
}
|
|
}
|
|
|
|
if (proto == IPPROTO_TCP && listen(fd, 128) < 0) {
|
|
perror("TCP socket listen");
|
|
close(fd);
|
|
return -1;
|
|
}
|
|
|
|
ev.events = EPOLLIN;
|
|
ev.data.u64 = ref.u64;
|
|
if (epoll_ctl(c->epollfd, EPOLL_CTL_ADD, fd, &ev) == -1) {
|
|
perror("L4 epoll_ctl");
|
|
return -1;
|
|
}
|
|
|
|
return fd;
|
|
}
|
|
|
|
/**
|
|
* sock_probe_mem() - Check if setting high SO_SNDBUF and SO_RCVBUF is allowed
|
|
* @c: Execution context
|
|
*/
|
|
void sock_probe_mem(struct ctx *c)
|
|
{
|
|
int v = INT_MAX / 2, s;
|
|
socklen_t sl;
|
|
|
|
if ((s = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP)) < 0) {
|
|
c->low_wmem = c->low_rmem = 1;
|
|
return;
|
|
}
|
|
|
|
sl = sizeof(v);
|
|
if (setsockopt(s, SOL_SOCKET, SO_SNDBUF, &v, sizeof(v)) ||
|
|
getsockopt(s, SOL_SOCKET, SO_SNDBUF, &v, &sl) ||
|
|
(size_t)v < SNDBUF_BIG)
|
|
c->low_wmem = 1;
|
|
|
|
v = INT_MAX / 2;
|
|
if (setsockopt(s, SOL_SOCKET, SO_RCVBUF, &v, sizeof(v)) ||
|
|
getsockopt(s, SOL_SOCKET, SO_RCVBUF, &v, &sl) ||
|
|
(size_t)v < RCVBUF_BIG)
|
|
c->low_rmem = 1;
|
|
|
|
close(s);
|
|
}
|
|
|
|
|
|
/**
|
|
* timespec_diff_ms() - Report difference in milliseconds between two timestamps
|
|
* @a: Minuend timestamp
|
|
* @b: Subtrahend timestamp
|
|
*
|
|
* Return: difference in milliseconds
|
|
*/
|
|
int timespec_diff_ms(const struct timespec *a, const struct timespec *b)
|
|
{
|
|
if (a->tv_nsec < b->tv_nsec) {
|
|
return (b->tv_nsec - a->tv_nsec) / 1000000 +
|
|
(a->tv_sec - b->tv_sec - 1) * 1000;
|
|
}
|
|
|
|
return (a->tv_nsec - b->tv_nsec) / 1000000 +
|
|
(a->tv_sec - b->tv_sec) * 1000;
|
|
}
|
|
|
|
/**
|
|
* bitmap_set() - Set single bit in bitmap
|
|
* @map: Pointer to bitmap
|
|
* @bit: Bit number to set
|
|
*/
|
|
void bitmap_set(uint8_t *map, int bit)
|
|
{
|
|
unsigned long *word = (unsigned long *)map + BITMAP_WORD(bit);
|
|
|
|
*word |= BITMAP_BIT(bit);
|
|
}
|
|
|
|
/**
|
|
* bitmap_clear() - Clear single bit in bitmap
|
|
* @map: Pointer to bitmap
|
|
* @bit: Bit number to clear
|
|
*/
|
|
void bitmap_clear(uint8_t *map, int bit)
|
|
{
|
|
unsigned long *word = (unsigned long *)map + BITMAP_WORD(bit);
|
|
|
|
*word &= ~BITMAP_BIT(bit);
|
|
}
|
|
|
|
/**
|
|
* bitmap_isset() - Check for set bit in bitmap
|
|
* @map: Pointer to bitmap
|
|
* @bit: Bit number to check
|
|
*
|
|
* Return: one if given bit is set, zero if it's not
|
|
*/
|
|
int bitmap_isset(const uint8_t *map, int bit)
|
|
{
|
|
unsigned long *word = (unsigned long *)map + BITMAP_WORD(bit);
|
|
|
|
return !!(*word & BITMAP_BIT(bit));
|
|
}
|
|
|
|
/**
|
|
* procfs_scan_listen() - Set bits for listening TCP or UDP sockets from procfs
|
|
* @proto: IPPROTO_TCP or IPPROTO_UDP
|
|
* @ip_version: IP version, V4 or V6
|
|
* @ns: Use saved file descriptors for namespace if set
|
|
* @map: Bitmap where numbers of ports in listening state will be set
|
|
* @exclude: Bitmap of ports to exclude from setting (and clear)
|
|
*
|
|
* #syscalls:pasta lseek
|
|
* #syscalls:pasta ppc64le:_llseek ppc64:_llseek armv6l:_llseek armv7l:_llseek
|
|
*/
|
|
void procfs_scan_listen(struct ctx *c, uint8_t proto, int ip_version, int ns,
|
|
uint8_t *map, uint8_t *exclude)
|
|
{
|
|
char *path, *line;
|
|
struct lineread lr;
|
|
unsigned long port;
|
|
unsigned int state;
|
|
int *fd;
|
|
|
|
if (proto == IPPROTO_TCP) {
|
|
fd = &c->proc_net_tcp[ip_version][ns];
|
|
if (ip_version == V4)
|
|
path = "/proc/net/tcp";
|
|
else
|
|
path = "/proc/net/tcp6";
|
|
} else {
|
|
fd = &c->proc_net_udp[ip_version][ns];
|
|
if (ip_version == V4)
|
|
path = "/proc/net/udp";
|
|
else
|
|
path = "/proc/net/udp6";
|
|
}
|
|
|
|
if (*fd != -1)
|
|
lseek(*fd, 0, SEEK_SET);
|
|
else if ((*fd = open(path, O_RDONLY | O_CLOEXEC)) < 0)
|
|
return;
|
|
|
|
lineread_init(&lr, *fd);
|
|
lineread_get(&lr, &line); /* throw away header */
|
|
while (lineread_get(&lr, &line) > 0) {
|
|
/* NOLINTNEXTLINE(cert-err34-c): != 2 if conversion fails */
|
|
if (sscanf(line, "%*u: %*x:%lx %*x:%*x %x", &port, &state) != 2)
|
|
continue;
|
|
|
|
/* See enum in kernel's include/net/tcp_states.h */
|
|
if ((proto == IPPROTO_TCP && state != 0x0a) ||
|
|
(proto == IPPROTO_UDP && state != 0x07))
|
|
continue;
|
|
|
|
if (bitmap_isset(exclude, port))
|
|
bitmap_clear(map, port);
|
|
else
|
|
bitmap_set(map, port);
|
|
}
|
|
}
|
|
|
|
/**
|
|
* drop_caps() - Drop capabilities we might have except for CAP_NET_BIND_SERVICE
|
|
*/
|
|
void drop_caps(void)
|
|
{
|
|
int i;
|
|
|
|
for (i = 0; i < 64; i++) {
|
|
if (i == CAP_NET_BIND_SERVICE)
|
|
continue;
|
|
|
|
prctl(PR_CAPBSET_DROP, i, 0, 0, 0);
|
|
}
|
|
}
|
|
|
|
/**
|
|
* check_root() - Check if root in init ns, exit if we can't drop to user
|
|
*/
|
|
void check_root(struct ctx *c)
|
|
{
|
|
const char root_uid_map[] = " 0 0 4294967295";
|
|
struct passwd *pw;
|
|
char buf[BUFSIZ];
|
|
int fd;
|
|
|
|
if (getuid() && geteuid())
|
|
return;
|
|
|
|
if ((fd = open("/proc/self/uid_map", O_RDONLY | O_CLOEXEC)) < 0)
|
|
return;
|
|
|
|
if (read(fd, buf, BUFSIZ) != sizeof(root_uid_map) ||
|
|
strncmp(buf, root_uid_map, sizeof(root_uid_map) - 1)) {
|
|
close(fd);
|
|
return;
|
|
}
|
|
|
|
close(fd);
|
|
|
|
if (!c->uid) {
|
|
fprintf(stderr, "Don't run as root. Changing to nobody...\n");
|
|
#ifndef GLIBC_NO_STATIC_NSS
|
|
pw = getpwnam("nobody");
|
|
if (!pw) {
|
|
perror("getpwnam");
|
|
exit(EXIT_FAILURE);
|
|
}
|
|
|
|
c->uid = pw->pw_uid;
|
|
c->gid = pw->pw_gid;
|
|
#else
|
|
(void)pw;
|
|
|
|
/* Common value for 'nobody', not really specified */
|
|
c->uid = c->gid = 65534;
|
|
#endif
|
|
}
|
|
|
|
if (!setgid(c->gid) && !setuid(c->uid))
|
|
return;
|
|
|
|
fprintf(stderr, "Can't change user/group, exiting");
|
|
exit(EXIT_FAILURE);
|
|
}
|
|
|
|
/**
|
|
* ns_enter() - Enter configured user (unless already joined) and network ns
|
|
* @c: Execution context
|
|
*
|
|
* Return: 0, won't return on failure
|
|
*
|
|
* #syscalls:pasta setns
|
|
*/
|
|
int ns_enter(const struct ctx *c)
|
|
{
|
|
if (!c->netns_only &&
|
|
c->pasta_userns_fd != -1 &&
|
|
setns(c->pasta_userns_fd, CLONE_NEWUSER))
|
|
exit(EXIT_FAILURE);
|
|
|
|
if (setns(c->pasta_netns_fd, CLONE_NEWNET))
|
|
exit(EXIT_FAILURE);
|
|
|
|
return 0;
|
|
}
|
|
|
|
/**
|
|
* pid_file() - Write PID to file, if requested to do so, and close it
|
|
* @fd: Open PID file descriptor, closed on exit, -1 to skip writing it
|
|
* @pid: PID value to write
|
|
*/
|
|
void write_pidfile(int fd, pid_t pid)
|
|
{
|
|
char pid_buf[12];
|
|
int n;
|
|
|
|
if (fd == -1)
|
|
return;
|
|
|
|
n = snprintf(pid_buf, sizeof(pid_buf), "%i\n", pid);
|
|
|
|
if (write(fd, pid_buf, n) < 0) {
|
|
perror("PID file write");
|
|
exit(EXIT_FAILURE);
|
|
}
|
|
|
|
close(fd);
|
|
}
|
|
|
|
/**
|
|
* __daemon() - daemon()-like function writing PID file before parent exits
|
|
* @pidfile_fd: Open PID file descriptor
|
|
* @devnull_fd: Open file descriptor for /dev/null
|
|
*
|
|
* Return: child PID on success, won't return on failure
|
|
*/
|
|
int __daemon(int pidfile_fd, int devnull_fd)
|
|
{
|
|
pid_t pid = fork();
|
|
|
|
if (pid == -1) {
|
|
perror("fork");
|
|
exit(EXIT_FAILURE);
|
|
}
|
|
|
|
if (pid) {
|
|
write_pidfile(pidfile_fd, pid);
|
|
exit(EXIT_SUCCESS);
|
|
}
|
|
|
|
errno = 0;
|
|
|
|
setsid();
|
|
|
|
dup2(devnull_fd, STDIN_FILENO);
|
|
dup2(devnull_fd, STDOUT_FILENO);
|
|
dup2(devnull_fd, STDERR_FILENO);
|
|
close(devnull_fd);
|
|
|
|
if (errno)
|
|
exit(EXIT_FAILURE);
|
|
|
|
return 0;
|
|
}
|
|
|
|
/**
|
|
* fls() - Find last (most significant) bit set in word
|
|
* @x: Word
|
|
*
|
|
* Return: position of most significant bit set, starting from 0, -1 if none
|
|
*/
|
|
int fls(unsigned long x)
|
|
{
|
|
int y = 0;
|
|
|
|
if (!x)
|
|
return -1;
|
|
|
|
while (x >>= 1)
|
|
y++;
|
|
|
|
return y;
|
|
}
|