passt/netlink.c

// SPDX-License-Identifier: GPL-2.0-or-later

/* PASST - Plug A Simple Socket Transport
 *  for qemu/UNIX domain socket mode
 *
 * PASTA - Pack A Subtle Tap Abstraction
 *  for network namespace/tap device mode
 *
 * netlink.c - rtnetlink routines: interfaces, addresses, routes
 *
 * Copyright (c) 2020-2021 Red Hat GmbH
 * Author: Stefano Brivio <sbrivio@redhat.com>
 */

#include <sched.h>
#include <string.h>
#include <stddef.h>
#include <errno.h>
#include <sys/types.h>
#include <limits.h>
#include <unistd.h>
#include <signal.h>
#include <stdlib.h>
#include <stdbool.h>
#include <stdint.h>
#include <arpa/inet.h>
#include <netinet/in.h>
#include <netinet/if_ether.h>

#include <linux/netlink.h>
#include <linux/rtnetlink.h>

#include "util.h"
#include "passt.h"
#include "log.h"
#include "netlink.h"

#define NLBUFSIZ	(8192 * sizeof(struct nlmsghdr)) /* See netlink(7) */

/* Socket in init, in target namespace, sequence (just needs to be monotonic) */
static int nl_sock	= -1;
static int nl_sock_ns	= -1;
static int nl_seq;

/**
 * nl_sock_init_do() - Set up netlink sockets in init or target namespace
 * @arg:	Execution context, if running from namespace, NULL otherwise
 *
 * Return: 0
 */
static int nl_sock_init_do(void *arg)
{
	struct sockaddr_nl addr = { .nl_family = AF_NETLINK, };
	int *s = arg ? &nl_sock_ns : &nl_sock;
#ifdef NETLINK_GET_STRICT_CHK
	int y = 1;
#endif

	if (arg)
		ns_enter((struct ctx *)arg);

	*s = socket(AF_NETLINK, SOCK_RAW | SOCK_CLOEXEC, NETLINK_ROUTE);
	if (*s < 0 || bind(*s, (struct sockaddr *)&addr, sizeof(addr))) {
		*s = -1;
		return 0;
	}

#ifdef NETLINK_GET_STRICT_CHK
	if (setsockopt(*s, SOL_NETLINK, NETLINK_GET_STRICT_CHK, &y, sizeof(y)))
		debug("netlink: cannot set NETLINK_GET_STRICT_CHK on %i", *s);
#endif
	return 0;
}

/**
 * nl_sock_init() - Call nl_sock_init_do(), won't return on failure
 * @c:		Execution context
 * @ns:		Get socket in namespace, not in init
 */
void nl_sock_init(const struct ctx *c, bool ns)
{
	if (ns) {
		NS_CALL(nl_sock_init_do, c);
		if (nl_sock_ns == -1)
			goto fail;
	} else {
		nl_sock_init_do(NULL);
	}

	if (nl_sock == -1)
		goto fail;

	return;

fail:
	die("Failed to get netlink socket");
}

/**
 * nl_req() - Send netlink request and read response
 * @ns:		Use netlink socket in namespace
 * @buf:	Buffer for response (at least NLBUFSIZ long)
 * @req:	Request with netlink header
 * @len:	Request length
 *
 * Return: received length on success, negative error code on failure
 */
static int nl_req(int ns, char *buf, const void *req, ssize_t len)
{
	int s = ns ? nl_sock_ns : nl_sock, done = 0;
	char flush[NLBUFSIZ];
	ssize_t n;

	while (!done && (n = recv(s, flush, sizeof(flush), MSG_DONTWAIT)) > 0) {
		struct nlmsghdr *nh = (struct nlmsghdr *)flush;
		size_t nm = n;

		for ( ; NLMSG_OK(nh, nm); nh = NLMSG_NEXT(nh, nm)) {
			if (nh->nlmsg_type == NLMSG_DONE ||
			    nh->nlmsg_type == NLMSG_ERROR) {
				done = 1;
				break;
			}
		}
	}

	if ((send(s, req, len, 0) < len) ||
	    (len = recv(s, buf, NLBUFSIZ, 0)) < 0)
		return -errno;

	return len;
}

/**
 * nl_get_ext_if() - Get interface index supporting IP version being probed
 * @af:	Address family (AF_INET or AF_INET6) to look for connectivity
 *      for.
 *
 * Return: interface index, 0 if not found
 */
unsigned int nl_get_ext_if(sa_family_t af)
{
	struct { struct nlmsghdr nlh; struct rtmsg rtm; } req = {
		.nlh.nlmsg_type	 = RTM_GETROUTE,
		.nlh.nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP,
		.nlh.nlmsg_len	 = NLMSG_LENGTH(sizeof(struct rtmsg)),
		.nlh.nlmsg_seq	 = nl_seq++,

		.rtm.rtm_table	 = RT_TABLE_MAIN,
		.rtm.rtm_scope	 = RT_SCOPE_UNIVERSE,
		.rtm.rtm_type	 = RTN_UNICAST,
		.rtm.rtm_family	 = af,
	};
	struct nlmsghdr *nh;
	struct rtattr *rta;
	char buf[NLBUFSIZ];
	ssize_t n;
	size_t na;

	if ((n = nl_req(0, buf, &req, sizeof(req))) < 0)
		return 0;

	nh = (struct nlmsghdr *)buf;

	for ( ; NLMSG_OK(nh, n); nh = NLMSG_NEXT(nh, n)) {
		struct rtmsg *rtm = (struct rtmsg *)NLMSG_DATA(nh);

		if (rtm->rtm_dst_len || rtm->rtm_family != af)
			continue;

		for (rta = RTM_RTA(rtm), na = RTM_PAYLOAD(nh); RTA_OK(rta, na);
		     rta = RTA_NEXT(rta, na)) {
			unsigned int ifi;

			if (rta->rta_type != RTA_OIF)
				continue;

			ifi = *(unsigned int *)RTA_DATA(rta);

			return ifi;
		}
	}

	return 0;
}

/**
 * nl_route() - Get/set/copy routes for given interface and address family
 * @op:		Requested operation
 * @ifi:	Interface index in outer network namespace
 * @ifi_ns:	Interface index in target namespace for NL_SET, NL_DUP
 * @af:		Address family
 * @gw:		Default gateway to fill on NL_GET, to set on NL_SET
 */
void nl_route(enum nl_op op, unsigned int ifi, unsigned int ifi_ns,
	      sa_family_t af, void *gw)
{
	struct req_t {
		struct nlmsghdr nlh;
		struct rtmsg rtm;
		struct rtattr rta;
		unsigned int ifi;
		union {
			struct {
				struct rtattr rta_dst;
				struct in6_addr d;
				struct rtattr rta_gw;
				struct in6_addr a;
			} r6;
			struct {
				struct rtattr rta_dst;
				uint32_t d;
				struct rtattr rta_gw;
				uint32_t a;
			} r4;
		} set;
	} req = {
		.nlh.nlmsg_type	  = op == NL_SET ? RTM_NEWROUTE : RTM_GETROUTE,
		.nlh.nlmsg_flags  = NLM_F_REQUEST,
		.nlh.nlmsg_seq	  = nl_seq++,

		.rtm.rtm_family	  = af,
		.rtm.rtm_table	  = RT_TABLE_MAIN,
		.rtm.rtm_scope	  = RT_SCOPE_UNIVERSE,
		.rtm.rtm_type	  = RTN_UNICAST,

		.rta.rta_type	  = RTA_OIF,
		.rta.rta_len	  = RTA_LENGTH(sizeof(unsigned int)),
		.ifi		  = op == NL_SET ? ifi_ns : ifi,
	};
	unsigned dup_routes = 0;
	ssize_t n, nlmsgs_size;
	struct nlmsghdr *nh;
	struct rtattr *rta;
	char buf[NLBUFSIZ];
	struct rtmsg *rtm;
	size_t na;

	if (op == NL_SET) {
		if (af == AF_INET6) {
			size_t rta_len = RTA_LENGTH(sizeof(req.set.r6.d));

			req.nlh.nlmsg_len = offsetof(struct req_t, set.r6)
				+ sizeof(req.set.r6);

			req.set.r6.rta_dst.rta_type = RTA_DST;
			req.set.r6.rta_dst.rta_len = rta_len;

			memcpy(&req.set.r6.a, gw, sizeof(req.set.r6.a));
			req.set.r6.rta_gw.rta_type = RTA_GATEWAY;
			req.set.r6.rta_gw.rta_len = rta_len;
		} else {
			size_t rta_len = RTA_LENGTH(sizeof(req.set.r4.d));

			req.nlh.nlmsg_len = offsetof(struct req_t, set.r4)
				+ sizeof(req.set.r4);

			req.set.r4.rta_dst.rta_type = RTA_DST;
			req.set.r4.rta_dst.rta_len = rta_len;

			req.set.r4.a = *(uint32_t *)gw;
			req.set.r4.rta_gw.rta_type = RTA_GATEWAY;
			req.set.r4.rta_gw.rta_len = rta_len;
		}

		req.rtm.rtm_protocol = RTPROT_BOOT;
		req.nlh.nlmsg_flags |= NLM_F_ACK | NLM_F_EXCL | NLM_F_CREATE;
	} else {
		req.nlh.nlmsg_len = offsetof(struct req_t, set.r6);
		req.nlh.nlmsg_flags |= NLM_F_DUMP;
	}

	if ((n = nl_req(op == NL_SET, buf, &req, req.nlh.nlmsg_len)) < 0)
		return;

	if (op == NL_SET)
		return;

	nh = (struct nlmsghdr *)buf;
	nlmsgs_size = n;

	for ( ; NLMSG_OK(nh, n); nh = NLMSG_NEXT(nh, n)) {
		if (nh->nlmsg_type != RTM_NEWROUTE)
			goto next;

		if (op == NL_DUP) {
			nh->nlmsg_seq = nl_seq++;
			nh->nlmsg_pid = 0;
			nh->nlmsg_flags &= ~NLM_F_DUMP_FILTERED;
			nh->nlmsg_flags |= NLM_F_REQUEST | NLM_F_ACK |
					   NLM_F_CREATE;
			dup_routes++;
		}

		rtm = (struct rtmsg *)NLMSG_DATA(nh);
		if (op == NL_GET && rtm->rtm_dst_len)
			continue;

		for (rta = RTM_RTA(rtm), na = RTM_PAYLOAD(nh); RTA_OK(rta, na);
		     rta = RTA_NEXT(rta, na)) {
			if (op == NL_GET) {
				if (rta->rta_type != RTA_GATEWAY)
					continue;

				memcpy(gw, RTA_DATA(rta), RTA_PAYLOAD(rta));
				return;
			}

			if (op == NL_DUP && rta->rta_type == RTA_OIF)
				*(unsigned int *)RTA_DATA(rta) = ifi_ns;
		}

next:
		if (nh->nlmsg_type == NLMSG_DONE)
			break;
	}

	if (op == NL_DUP) {
		char resp[NLBUFSIZ];
		unsigned i;

		nh = (struct nlmsghdr *)buf;
		/* Routes might have dependencies between each other, and the
		 * kernel processes RTM_NEWROUTE messages sequentially. For n
		 * valid routes, we might need to send up to n requests to get
		 * all of them inserted. Routes that have been already inserted
		 * won't cause the whole request to fail, so we can simply
		 * repeat the whole request. This approach avoids the need to
		 * calculate dependencies: let the kernel do that.
		 */
		for (i = 0; i < dup_routes; i++)
			nl_req(1, resp, nh, nlmsgs_size);
	}
}

/**
 * nl_addr_get() - Get IP address for given interface and address family
 * @ifi:	Interface index in outer network namespace
 * @af:		Address family
 * @addr:	Global address to fill
 * @prefix_len:	Mask or prefix length, to fill (for IPv4)
 * @addr_l:	Link-scoped address to fill (for IPv6)
 */
void nl_addr_get(unsigned int ifi, sa_family_t af, void *addr,
		 int *prefix_len, void *addr_l)
{
	struct req_t {
		struct nlmsghdr nlh;
		struct ifaddrmsg ifa;
	} req = {
		.nlh.nlmsg_type    = RTM_GETADDR,
		.nlh.nlmsg_flags   = NLM_F_REQUEST | NLM_F_ACK | NLM_F_DUMP,
		.nlh.nlmsg_len     = sizeof(req),
		.nlh.nlmsg_seq     = nl_seq++,

		.ifa.ifa_family    = af,
		.ifa.ifa_index     = ifi,
	};
	struct nlmsghdr *nh;
	char buf[NLBUFSIZ];
	ssize_t n;

	if ((n = nl_req(0, buf, &req, req.nlh.nlmsg_len)) < 0)
		return;

	for (nh = (struct nlmsghdr *)buf;
	     NLMSG_OK(nh, n) && nh->nlmsg_type != NLMSG_DONE;
	     nh = NLMSG_NEXT(nh, n)) {
		struct ifaddrmsg *ifa = (struct ifaddrmsg *)NLMSG_DATA(nh);
		struct rtattr *rta;
		size_t na;

		if (nh->nlmsg_type != RTM_NEWADDR)
			continue;

		if (ifa->ifa_index != ifi)
			continue;

		for (rta = IFA_RTA(ifa), na = RTM_PAYLOAD(nh); RTA_OK(rta, na);
		     rta = RTA_NEXT(rta, na)) {
			if (rta->rta_type != IFA_ADDRESS)
				continue;

			if (af == AF_INET) {
				memcpy(addr, RTA_DATA(rta), RTA_PAYLOAD(rta));
				*prefix_len = ifa->ifa_prefixlen;
			} else if (af == AF_INET6 && addr &&
				   ifa->ifa_scope == RT_SCOPE_UNIVERSE) {
				memcpy(addr, RTA_DATA(rta), RTA_PAYLOAD(rta));
			}

			if (addr_l &&
			    af == AF_INET6 && ifa->ifa_scope == RT_SCOPE_LINK)
				memcpy(addr_l, RTA_DATA(rta), RTA_PAYLOAD(rta));
		}
	}
}

/**
 * nl_add_set() - Set IP addresses for given interface and address family
 * @ifi:	Interface index
 * @af:		Address family
 * @addr:	Global address to set
 * @prefix_len:	Mask or prefix length to set
 */
void nl_addr_set(unsigned int ifi, sa_family_t af, void *addr, int prefix_len)
{
	struct req_t {
		struct nlmsghdr nlh;
		struct ifaddrmsg ifa;
		union {
			struct {
				struct rtattr rta_l;
				uint32_t l;
				struct rtattr rta_a;
				uint32_t a;
			} a4;
			struct {
				struct rtattr rta_l;
				struct in6_addr l;
				struct rtattr rta_a;
				struct in6_addr a;
			} a6;
		} set;
	} req = {
		.nlh.nlmsg_type    = RTM_NEWADDR,
		.nlh.nlmsg_flags   = NLM_F_REQUEST | NLM_F_ACK |
				     NLM_F_CREATE | NLM_F_EXCL,
		.nlh.nlmsg_len     = NLMSG_LENGTH(sizeof(struct ifaddrmsg)),
		.nlh.nlmsg_seq     = nl_seq++,

		.ifa.ifa_family    = af,
		.ifa.ifa_index     = ifi,
		.ifa.ifa_prefixlen = prefix_len,
		.ifa.ifa_scope	   = RT_SCOPE_UNIVERSE,
	};
	char buf[NLBUFSIZ];

	if (af == AF_INET6) {
		size_t rta_len = RTA_LENGTH(sizeof(req.set.a6.l));

		/* By default, strictly speaking, it's duplicated */
		req.ifa.ifa_flags = IFA_F_NODAD;

		req.nlh.nlmsg_len = offsetof(struct req_t, set.a6)
			+ sizeof(req.set.a6);

		memcpy(&req.set.a6.l, addr, sizeof(req.set.a6.l));
		req.set.a6.rta_l.rta_len = rta_len;
		req.set.a4.rta_l.rta_type = IFA_LOCAL;
		memcpy(&req.set.a6.a, addr, sizeof(req.set.a6.a));
		req.set.a6.rta_a.rta_len = rta_len;
		req.set.a6.rta_a.rta_type = IFA_ADDRESS;
	} else {
		size_t rta_len = RTA_LENGTH(sizeof(req.set.a4.l));

		req.nlh.nlmsg_len = offsetof(struct req_t, set.a4)
			+ sizeof(req.set.a4);

		req.set.a4.l = req.set.a4.a = *(uint32_t *)addr;
		req.set.a4.rta_l.rta_len = rta_len;
		req.set.a4.rta_l.rta_type = IFA_LOCAL;
		req.set.a4.rta_a.rta_len = rta_len;
		req.set.a4.rta_a.rta_type = IFA_ADDRESS;
	}

	nl_req(1, buf, &req, req.nlh.nlmsg_len);
}

/**
 * nl_addr_dup() - Copy IP addresses for given interface and address family
 * @ifi:	Interface index in outer network namespace
 * @ifi_ns:	Interface index in target namespace
 * @af:		Address family
 */
void nl_addr_dup(unsigned int ifi, unsigned int ifi_ns, sa_family_t af)
{
	struct req_t {
		struct nlmsghdr nlh;
		struct ifaddrmsg ifa;
	} req = {
		.nlh.nlmsg_type    = RTM_GETADDR,
		.nlh.nlmsg_flags   = NLM_F_REQUEST | NLM_F_DUMP,
		.nlh.nlmsg_len     = sizeof(req),
		.nlh.nlmsg_seq     = nl_seq++,

		.ifa.ifa_family    = af,
		.ifa.ifa_index     = ifi,
		.ifa.ifa_prefixlen = 0,
	};
	char buf[NLBUFSIZ], resp[NLBUFSIZ];
	ssize_t n, nlmsgs_size;
	struct nlmsghdr *nh;

	if ((n = nl_req(0, buf, &req, sizeof(req))) < 0)
		return;

	nlmsgs_size = n;

	for (nh = (struct nlmsghdr *)buf;
	     NLMSG_OK(nh, n) && nh->nlmsg_type != NLMSG_DONE;
	     nh = NLMSG_NEXT(nh, n)) {
		struct ifaddrmsg *ifa;
		struct rtattr *rta;
		size_t na;

		if (nh->nlmsg_type != RTM_NEWADDR)
			continue;

		nh->nlmsg_seq = nl_seq++;
		nh->nlmsg_pid = 0;
		nh->nlmsg_flags &= ~NLM_F_DUMP_FILTERED;
		nh->nlmsg_flags |= NLM_F_REQUEST | NLM_F_ACK | NLM_F_CREATE;

		ifa = (struct ifaddrmsg *)NLMSG_DATA(nh);

		if (ifa->ifa_scope == RT_SCOPE_LINK || ifa->ifa_index != ifi) {
			ifa->ifa_family = AF_UNSPEC;
			continue;
		}

		ifa->ifa_index = ifi_ns;

		for (rta = IFA_RTA(ifa), na = RTM_PAYLOAD(nh); RTA_OK(rta, na);
		     rta = RTA_NEXT(rta, na)) {
			if (rta->rta_type == IFA_LABEL)
				rta->rta_type = IFA_UNSPEC;
		}
	}

	nl_req(1, resp, buf, nlmsgs_size);
}

/**
 * nl_link_get_mac() - Get link MAC address
 * @ns:		Use netlink socket in namespace
 * @ifi:	Interface index
 * @mac:	Fill with current MAC address
 */
void nl_link_get_mac(int ns, unsigned int ifi, void *mac)
{
	struct req_t {
		struct nlmsghdr nlh;
		struct ifinfomsg ifm;
	} req = {
		.nlh.nlmsg_type	  = RTM_GETLINK,
		.nlh.nlmsg_len	  = sizeof(req),
		.nlh.nlmsg_flags  = NLM_F_REQUEST | NLM_F_ACK,
		.nlh.nlmsg_seq	  = nl_seq++,
		.ifm.ifi_family	  = AF_UNSPEC,
		.ifm.ifi_index	  = ifi,
	};
	struct nlmsghdr *nh;
	char buf[NLBUFSIZ];
	ssize_t n;

	n = nl_req(ns, buf, &req, sizeof(req));
	if (n < 0)
		return;

	for (nh = (struct nlmsghdr *)buf;
	     NLMSG_OK(nh, n) && nh->nlmsg_type != NLMSG_DONE;
	     nh = NLMSG_NEXT(nh, n)) {
		struct ifinfomsg *ifm = (struct ifinfomsg *)NLMSG_DATA(nh);
		struct rtattr *rta;
		size_t na;

		if (nh->nlmsg_type != RTM_NEWLINK)
			continue;

		for (rta = IFLA_RTA(ifm), na = RTM_PAYLOAD(nh);
		     RTA_OK(rta, na);
		     rta = RTA_NEXT(rta, na)) {
			if (rta->rta_type != IFLA_ADDRESS)
				continue;

			memcpy(mac, RTA_DATA(rta), ETH_ALEN);
			break;
		}
	}
}

/**
 * nl_link_set_mac() - Set link MAC address
 * @ns:		Use netlink socket in namespace
 * @ifi:	Interface index
 * @mac:	MAC address to set
 */
void nl_link_set_mac(int ns, unsigned int ifi, void *mac)
{
	struct req_t {
		struct nlmsghdr nlh;
		struct ifinfomsg ifm;
		struct rtattr rta;
		unsigned char mac[ETH_ALEN];
	} req = {
		.nlh.nlmsg_type	  = RTM_NEWLINK,
		.nlh.nlmsg_len	  = sizeof(req),
		.nlh.nlmsg_flags  = NLM_F_REQUEST | NLM_F_ACK,
		.nlh.nlmsg_seq	  = nl_seq++,
		.ifm.ifi_family	  = AF_UNSPEC,
		.ifm.ifi_index	  = ifi,
		.rta.rta_type	  = IFLA_ADDRESS,
		.rta.rta_len	  = RTA_LENGTH(ETH_ALEN),
	};
	char buf[NLBUFSIZ];

	memcpy(req.mac, mac, ETH_ALEN);

	nl_req(ns, buf, &req, sizeof(req));
}

/**
 * nl_link_up() - Bring link up
 * @ns:		Use netlink socket in namespace
 * @ifi:	Interface index
 * @mtu:	If non-zero, set interface MTU
 */
void nl_link_up(int ns, unsigned int ifi, int mtu)
{
	struct req_t {
		struct nlmsghdr nlh;
		struct ifinfomsg ifm;
		struct rtattr rta;
		unsigned int mtu;
	} req = {
		.nlh.nlmsg_type   = RTM_NEWLINK,
		.nlh.nlmsg_len    = sizeof(req),
		.nlh.nlmsg_flags  = NLM_F_REQUEST | NLM_F_ACK,
		.nlh.nlmsg_seq	  = nl_seq++,
		.ifm.ifi_family	  = AF_UNSPEC,
		.ifm.ifi_index	  = ifi,
		.ifm.ifi_flags	  = IFF_UP,
		.ifm.ifi_change	  = IFF_UP,
		.rta.rta_type	  = IFLA_MTU,
		.rta.rta_len	  = RTA_LENGTH(sizeof(unsigned int)),
		.mtu		  = mtu,
	};
	char buf[NLBUFSIZ];

	if (!mtu)
		/* Shorten request to drop MTU attribute */
		req.nlh.nlmsg_len = offsetof(struct req_t, rta);

	nl_req(ns, buf, &req, req.nlh.nlmsg_len);
}