passt/udp.c
Stefano Brivio e07f539ae0 udp, passt: Introduce socket packet buffer, avoid getsockname() for UDP
This is in preparation for scatter-gather IO on the UDP receive path:
save a getsockname() syscall by setting a flag if we get the numbering
of all bound sockets in a strict sequence (expected, in practice) and
repurpose the tap buffer to be also a socket receive buffer, passing
it down to protocol handlers.

Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
2021-04-30 14:52:18 +02:00

511 lines
12 KiB
C

// SPDX-License-Identifier: AGPL-3.0-or-later
/* PASST - Plug A Simple Socket Transport
*
* udp.c - UDP L2-L4 translation routines
*
* Copyright (c) 2020-2021 Red Hat GmbH
* Author: Stefano Brivio <sbrivio@redhat.com>
*
*/
/**
* DOC: Theory of Operation
*
*
* For UDP, a reduced version of port-based connection tracking is implemented
* with two purposes:
* - binding ephemeral ports when they're used as source port by the guest, so
* that replies on those ports can be forwarded back to the guest, with a
* fixed 180s timeout for this binding
* - packets received from the local host get their source changed to a local
* address (gateway address) so that they can be forwarded to the guest, and
* packets sent as replies by the guest need their destination address to
* be changed back to the address of the local host. This is dynamic to allow
* connections from the gateway as well, and uses the same fixed 180s timeout
*
* Sockets for ephemeral and non-ephemeral ports are created at
* initialisation time, one set for IPv4 and one for IPv6. Non-ephemeral ports
* are bound at initialisation time, ephemeral ports are bound dynamically.
*
* Packets are forwarded back and forth, by prepending and stripping UDP headers
* in the obvious way, with no port translation.
*
*/
#define _GNU_SOURCE
#include <stdio.h>
#include <errno.h>
#include <limits.h>
#include <net/ethernet.h>
#include <net/if.h>
#include <netinet/in.h>
#include <stdint.h>
#include <stddef.h>
#include <string.h>
#include <sys/epoll.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <sys/uio.h>
#include <unistd.h>
#include <linux/ip.h>
#include <linux/ipv6.h>
#include <linux/udp.h>
#include <time.h>
#include "passt.h"
#include "tap.h"
#include "util.h"
#define UDP_CONN_TIMEOUT	180 /* s, timeout for ephemeral or local bind */

/**
 * struct udp_port - Per-port state, one entry per port and IP version
 * @s:			File descriptor of the socket created for this port
 * @ts_ephemeral:	Seconds timestamp of the last packet sent by the guest
 *			with this ephemeral source port, 0 if not bound
 * @ts_local:		Seconds timestamp of the last packet that needed a
 *			local (loopback <-> gateway) address replacement,
 *			0 if no replacement is active
 */
struct udp_port {
	int s;
	time_t ts_ephemeral;
	time_t ts_local;
};

/* Tables are indexed by port number in host order. Valid port numbers span
 * 0..USHRT_MAX inclusive, so USHRT_MAX + 1 entries are needed: with only
 * USHRT_MAX entries, port 65535 indexes past the end of the table.
 */
static struct udp_port up4[USHRT_MAX + 1];
static struct udp_port up6[USHRT_MAX + 1];

/* Bitmaps, activity monitoring needed for port: one bit per port, bit
 * (p % 8) of byte (p / 8). USHRT_MAX / 8 would truncate to 8191 bytes and
 * leave ports 65528..65535 without storage.
 */
static uint8_t udp4_act[(USHRT_MAX + 1) / 8];
static uint8_t udp6_act[(USHRT_MAX + 1) / 8];
/**
 * udp_act_set() - Mark a port as needing timed-event processing
 * @af:		Address family: AF_INET selects the IPv4 bitmap, any other
 *		value selects the IPv6 one
 * @p:		Port number, host order
 */
static void udp_act_set(int af, int p)
{
	uint8_t *map = (af == AF_INET) ? udp4_act : udp6_act;

	/* One bit per port: byte p / 8, bit p % 8 */
	map[p / 8] |= 1 << (p % 8);
}
/**
 * udp_act_clear() - Stop timed-event processing for a port
 * @af:		Address family: AF_INET selects the IPv4 bitmap, any other
 *		value selects the IPv6 one
 * @p:		Port number, host order
 */
static void udp_act_clear(int af, int p)
{
	uint8_t *map = (af == AF_INET) ? udp4_act : udp6_act;

	/* One bit per port: byte p / 8, bit p % 8 */
	map[p / 8] &= ~(1 << (p % 8));
}
/**
 * udp_sock_handler_local() - Replace address if local, update timestamp
 * @c:		Execution context, provides the gateway addresses
 * @af:		Address family: AF_INET, anything else is handled as IPv6
 * @sa:		Socket address as struct sockaddr_in or sockaddr_in6, its
 *		address part is overwritten with the gateway address
 * @now:	Current timestamp
 *
 * The port found in @sa also gets its local timestamp refreshed and is
 * flagged in the activity bitmap, so that the timer can expire it later.
 */
static void udp_sock_handler_local(struct ctx *c, int af, void *sa,
				   struct timespec *now)
{
	in_port_t port;

	if (af == AF_INET) {
		struct sockaddr_in *sa4 = sa;

		port = ntohs(sa4->sin_port);
		sa4->sin_addr.s_addr = c->gw4;
		up4[port].ts_local = now->tv_sec;
		udp_act_set(AF_INET, port);
		return;
	}

	struct sockaddr_in6 *sa6 = sa;

	port = ntohs(sa6->sin6_port);
	memcpy(&sa6->sin6_addr, &c->gw6, sizeof(c->gw6));
	up6[port].ts_local = now->tv_sec;
	udp_act_set(AF_INET6, port);
}
/**
 * udp_sock_name() - Get address family and port for bound UDP socket
 * @c:		Execution context
 * @s:		File descriptor number for socket
 * @port:	Local port, set on return, network order
 *
 * If descriptors were allocated in strict sequence (fd_in_seq set), family
 * and port are derived from the distance between @s and the lowest UDP
 * descriptor, without any system call. Otherwise, getsockname() is used.
 *
 * Return: address family, AF_INET or AF_INET6, negative error code on failure
 */
static int udp_sock_name(struct ctx *c, int s, in_port_t *port)
{
	struct sockaddr_storage ss;
	socklen_t len = sizeof(ss);

	if (c->udp.fd_in_seq) {
		if (c->v4 && c->v6) {
			/* Two sockets per port, IPv4 first, then IPv6 */
			*port = htons((s - c->udp.fd_min) / 2);
			return ((s - c->udp.fd_min) % 2) ? AF_INET6 : AF_INET;
		}

		/* Single IP version: one socket per port */
		*port = htons(s - c->udp.fd_min);
		return c->v4 ? AF_INET : AF_INET6;
	}

	if (getsockname(s, (struct sockaddr *)&ss, &len))
		return -errno;

	switch (ss.ss_family) {
	case AF_INET:
		*port = ((struct sockaddr_in *)&ss)->sin_port;
		return AF_INET;
	case AF_INET6:
		*port = ((struct sockaddr_in6 *)&ss)->sin6_port;
		return AF_INET6;
	default:
		return -ENOTSUP;
	}
}
/**
 * udp_sock_handler() - Handle new data from socket
 * @c:		Execution context
 * @s:		File descriptor number for socket
 * @events:	epoll events bitmap
 * @pkt_buf:	Buffer to receive packets, currently unused
 * @now:	Current timestamp
 *
 * One datagram is read from @s, a UDP header is prepended to it, and the
 * result is handed to tap_ip_send(). IPv4 sources are passed on as
 * IPv4-mapped IPv6 addresses.
 */
void udp_sock_handler(struct ctx *c, int s, uint32_t events, char *pkt_buf,
		      struct timespec *now)
{
	/* ::ffff:0.0.0.0, the last four bytes are filled in for IPv4 sources */
	struct in6_addr a6 = { .s6_addr = { [10] = 0xff, [11] = 0xff } };
	struct sockaddr_storage from;
	socklen_t from_len = sizeof(from);
	char buf[USHRT_MAX];
	struct udphdr *uh = (struct udphdr *)buf;
	size_t len;
	ssize_t n;
	int af;

	(void)pkt_buf;

	if (events == EPOLLERR)
		return;

	/* Leave room for the UDP header in front of the payload */
	n = recvfrom(s, buf + sizeof(*uh), sizeof(buf) - sizeof(*uh),
		     MSG_DONTWAIT, (struct sockaddr *)&from, &from_len);
	if (n < 0)
		return;

	/* The local port of the socket is the destination port for the guest */
	af = udp_sock_name(c, s, &uh->dest);
	if (af != AF_INET && af != AF_INET6)
		return;

	len = n + sizeof(*uh);
	uh->len = htons(len);

	if (af == AF_INET) {
		struct sockaddr_in *from4 = (struct sockaddr_in *)&from;
		uint32_t host_addr = ntohl(from4->sin_addr.s_addr);

		if (host_addr == INADDR_LOOPBACK || host_addr == INADDR_ANY)
			udp_sock_handler_local(c, AF_INET, from4, now);

		memcpy(&a6.s6_addr[12], &from4->sin_addr,
		       sizeof(from4->sin_addr));
		uh->source = from4->sin_port;

		tap_ip_send(c, &a6, IPPROTO_UDP, buf, len);
	} else {
		struct sockaddr_in6 *from6 = (struct sockaddr_in6 *)&from;

		if (IN6_IS_ADDR_LOOPBACK(&from6->sin6_addr))
			udp_sock_handler_local(c, AF_INET6, from6, now);

		uh->source = from6->sin6_port;

		tap_ip_send(c, &from6->sin6_addr, IPPROTO_UDP, buf, len);
	}
}
/**
* udp_tap_handler_ephemeral() - Bind ephemeral source port, update timestamp
* @af: Address family, AF_INET or AF_INET6
* @src: Source port, host order
* @now: Current timestamp
*/
static void udp_tap_handler_ephemeral(int af, in_port_t src,
struct timespec *now)
{
struct sockaddr *addr = NULL;
struct sockaddr_in6 s_in6 = {
.sin6_family = AF_INET6,
.sin6_port = htons(src),
.sin6_addr = IN6ADDR_ANY_INIT,
};
struct sockaddr_in s_in = {
.sin_family = AF_INET,
.sin_port = htons(src),
.sin_addr = { .s_addr = INADDR_ANY },
};
socklen_t sl;
int s;
if (af == AF_INET) {
if (!up4[src].ts_ephemeral) {
s = up4[src].s;
addr = (struct sockaddr *)&s_in;
sl = sizeof(s_in);
}
} else {
if (!up6[src].ts_ephemeral) {
s = up6[src].s;
addr = (struct sockaddr *)&s_in6;
sl = sizeof(s_in6);
}
}
if (addr) {
if (bind(s, addr, sl))
return;
udp_act_set(af, src);
}
if (af == AF_INET)
up4[src].ts_ephemeral = now->tv_sec;
else
up6[src].ts_ephemeral = now->tv_sec;
}
/**
* udp_tap_handler_local() - Set address to local if needed, update timestamp
* @af: Address family, AF_INET or AF_INET6
* @dst: Destination port, host order
* @sa: Socket address as struct sockaddr_in or sockaddr_in6 to modify
* @now: Current timestamp
*/
static void udp_tap_handler_local(int af, in_port_t dst, void *sa,
struct timespec *now)
{
if (af == AF_INET) {
if (up4[dst].ts_local) {
struct sockaddr_in *s_in = (struct sockaddr_in *)sa;
s_in->sin_addr.s_addr = htonl(INADDR_LOOPBACK);
up4[dst].ts_local = now->tv_sec;
}
} else {
if (up6[dst].ts_local) {
struct sockaddr_in6 *s_in6 = (struct sockaddr_in6 *)sa;
s_in6->sin6_addr = in6addr_loopback;
up6[dst].ts_local = now->tv_sec;
}
}
}
/**
 * udp_tap_handler() - Handle packets from tap
 * @c:		Execution context
 * @af:		Address family, AF_INET or AF_INET6
 * @addr:	Destination address, struct in_addr or struct in6_addr
 *		depending on @af
 * @msg:	Input messages
 * @count:	Message count
 * @now:	Current timestamp
 *
 * Payloads are sent with a single sendmmsg() call on the socket associated
 * with the source port. At most UIO_MAXIOV messages are handled per call, and
 * the batch stops at the first message too short to hold a UDP header: the
 * caller sees a smaller return value and is expected to pass the remaining
 * messages again.
 *
 * Return: count of consumed packets
 */
int udp_tap_handler(struct ctx *c, int af, void *addr,
		    struct tap_msg *msg, int count, struct timespec *now)
{
	/* The caller already checks that all the messages have the same source
	 * and destination, so we can just take those from the first message.
	 */
	struct udphdr *uh = (struct udphdr *)msg[0].l4h;
	struct mmsghdr mm[UIO_MAXIOV] = { 0 };
	struct iovec m[UIO_MAXIOV];
	struct sockaddr_in6 s_in6;
	struct sockaddr_in s_in;
	struct sockaddr *sa;
	in_port_t src, dst;
	socklen_t sl;
	int i, s;

	if (msg[0].l4_len < sizeof(*uh))
		return 1;

	src = ntohs(uh->source);
	dst = ntohs(uh->dest);

	if (af == AF_INET) {
		s_in = (struct sockaddr_in) {
			.sin_family = AF_INET,
			.sin_port = uh->dest,
			.sin_addr = *(struct in_addr *)addr,
		};
		sa = (struct sockaddr *)&s_in;
		sl = sizeof(s_in);

		/* No socket for this source port: drop the whole batch */
		if (!(s = up4[src].s))
			return count;

		/* Packets for the gateway might actually be for the host */
		if (s_in.sin_addr.s_addr == c->gw4)
			udp_tap_handler_local(AF_INET, dst, &s_in, now);
	} else if (af == AF_INET6) {
		s_in6 = (struct sockaddr_in6) {
			.sin6_family = AF_INET6,
			.sin6_port = uh->dest,
			.sin6_addr = *(struct in6_addr *)addr,
		};
		sa = (struct sockaddr *)&s_in6;
		sl = sizeof(s_in6);

		/* No socket for this source port: drop the whole batch */
		if (!(s = up6[src].s))
			return count;

		/* Packets for the gateway might actually be for the host */
		if (!memcmp(addr, &c->gw6, sizeof(c->gw6)))
			udp_tap_handler_local(AF_INET6, dst, &s_in6, now);
	} else {
		return count;
	}

	/* mm[] and m[] have room for UIO_MAXIOV entries only */
	if (count > UIO_MAXIOV)
		count = UIO_MAXIOV;

	for (i = 0; i < count; i++) {
		/* A shorter message would make the payload length underflow:
		 * stop here, it will be skipped once it's the first in a batch.
		 */
		if (msg[i].l4_len < sizeof(*uh))
			break;

		m[i].iov_base = (char *)((struct udphdr *)msg[i].l4h + 1);
		m[i].iov_len = msg[i].l4_len - sizeof(*uh);

		/* All messages share one destination, possibly rewritten above */
		mm[i].msg_hdr.msg_name = sa;
		mm[i].msg_hdr.msg_namelen = sl;

		mm[i].msg_hdr.msg_iov = m + i;
		mm[i].msg_hdr.msg_iovlen = 1;
	}
	count = i;

	if (PORT_IS_EPHEMERAL(src))
		udp_tap_handler_ephemeral(af, src, now);

	count = sendmmsg(s, mm, count, MSG_DONTWAIT | MSG_NOSIGNAL);
	if (count < 0)
		return 1;

	return count;
}
/**
 * udp_sock_init() - Create and bind listening sockets for inbound packets
 * @c:		Execution context
 *
 * For each port from 0 to USHRT_MAX - 1, one socket per enabled IP version is
 * requested from sock_l4() and stored in the port tables. The fd_in_seq flag
 * stays set only as long as every descriptor returned is exactly the previous
 * one plus one.
 *
 * Return: 0 on success, -1 on failure
 */
int udp_sock_init(struct ctx *c)
{
	int last = -1, fd;
	in_port_t port;

	c->udp.fd_min = INT_MAX;
	c->udp.fd_max = 0;
	c->udp.fd_in_seq = 1;

	for (port = 0; port < USHRT_MAX; port++) {
		if (c->v4) {
			fd = sock_l4(c, AF_INET, IPPROTO_UDP, port);
			if (fd < 0)
				return -1;

			/* Track the last descriptor until the sequence breaks */
			if (!c->udp.fd_in_seq || last == -1 || fd == last + 1)
				last = fd;
			else
				c->udp.fd_in_seq = 0;

			up4[port].s = fd;
		}

		if (c->v6) {
			fd = sock_l4(c, AF_INET6, IPPROTO_UDP, port);
			if (fd < 0)
				return -1;

			/* Track the last descriptor until the sequence breaks */
			if (!c->udp.fd_in_seq || last == -1 || fd == last + 1)
				last = fd;
			else
				c->udp.fd_in_seq = 0;

			up6[port].s = fd;
		}
	}

	return 0;
}
/**
* udp_timer_one() - Handler for timed events on one port
* @af: Address family, AF_INET or AF_INET6
* @p: Port number, host order
* @ts: Timestamp from caller
*/
static void udp_timer_one(struct ctx *c, int af, in_port_t p,
struct timespec *ts)
{
int s = -1;
if (af == AF_INET) {
if (ts->tv_sec - up4[p].ts_ephemeral > UDP_CONN_TIMEOUT)
up4[p].ts_ephemeral = 0;
if (ts->tv_sec - up4[p].ts_local > UDP_CONN_TIMEOUT)
up4[p].ts_local = 0;
if (!up4[p].ts_ephemeral && !up4[p].ts_local) {
udp_act_clear(AF_INET, p);
s = up4[p].s;
}
} else {
if (ts->tv_sec - up6[p].ts_ephemeral > UDP_CONN_TIMEOUT)
up6[p].ts_ephemeral = 0;
if (ts->tv_sec - up6[p].ts_local > UDP_CONN_TIMEOUT)
up6[p].ts_local = 0;
if (!up6[p].ts_ephemeral && !up6[p].ts_local) {
udp_act_clear(AF_INET6, p);
s = up6[p].s;
}
}
if (s != -1) {
epoll_ctl(c->epollfd, EPOLL_CTL_DEL, s, NULL);
close(s);
if (sock_l4(c, af, IPPROTO_UDP, p) != s)
c->udp.fd_in_seq = 0;
}
}
/**
 * udp_timer_scan() - Run timed events for all ports set in one bitmap
 * @c:		Execution context
 * @af:		Address family passed on to udp_timer_one()
 * @map:	Activity bitmap, bit (p % 8) of byte (p / 8) stands for port p
 * @size:	Size of @map, in bytes
 * @ts:		Timestamp from caller
 *
 * The bitmap is read one byte at a time, with the same layout used when bits
 * are set and cleared: this has no alignment requirement, doesn't depend on
 * host byte order, and covers the whole bitmap even if its size is not a
 * multiple of the size of a long.
 */
static void udp_timer_scan(struct ctx *c, int af, const uint8_t *map,
			   size_t size, struct timespec *ts)
{
	size_t i;

	for (i = 0; i < size; i++) {
		/* Work on a copy: the handler may clear bits in the bitmap */
		uint8_t bits = map[i];
		unsigned int b;

		for (b = 0; bits; b++, bits >>= 1) {
			if (bits & 1)
				udp_timer_one(c, af, i * 8 + b, ts);
		}
	}
}

/**
 * udp_timer() - Scan activity bitmap for ports with associated timed events
 * @c:		Execution context
 * @ts:		Timestamp from caller
 */
void udp_timer(struct ctx *c, struct timespec *ts)
{
	udp_timer_scan(c, AF_INET, udp4_act, sizeof(udp4_act), ts);
	udp_timer_scan(c, AF_INET6, udp6_act, sizeof(udp6_act), ts);
}