udp: Find or create flows for datagrams from tap interface

Currently we create flows for datagrams from socket interfaces, and use
them to direct "spliced" (socket to socket) datagrams.  We don't yet
match datagrams from the tap interface to existing flows, nor create new
flows for them.  Add that functionality, matching datagrams from tap to
existing flows when they exist, or creating new ones.

As with spliced flows, when creating a new flow from tap to socket, we
create a new connected socket to receive reply datagrams attached to that
flow specifically. We extend udp_reply_sock_handler() to handle reply
packets bound for tap rather than another socket.

For non-obvious reasons (perhaps increased stack usage?), this caused
a failure for me when running under valgrind, because valgrind invoked
rt_sigreturn which is not in our seccomp filter.  Since we already
allow rt_sigaction and others in the valgrind target, it seems
reasonable to add rt_sigreturn as well.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
This commit is contained in:
David Gibson 2024-07-18 15:26:49 +10:00 committed by Stefano Brivio
parent 8126f7a660
commit b7ad19347f
3 changed files with 103 additions and 120 deletions

View file

@ -128,7 +128,7 @@ qrap: $(QRAP_SRCS) passt.h
$(CC) $(FLAGS) $(CFLAGS) $(CPPFLAGS) $(QRAP_SRCS) -o qrap $(LDFLAGS) $(CC) $(FLAGS) $(CFLAGS) $(CPPFLAGS) $(QRAP_SRCS) -o qrap $(LDFLAGS)
valgrind: EXTRA_SYSCALLS += rt_sigprocmask rt_sigtimedwait rt_sigaction \ valgrind: EXTRA_SYSCALLS += rt_sigprocmask rt_sigtimedwait rt_sigaction \
getpid gettid kill clock_gettime mmap \ rt_sigreturn getpid gettid kill clock_gettime mmap \
munmap open unlink gettimeofday futex munmap open unlink gettimeofday futex
valgrind: FLAGS += -g -DVALGRIND valgrind: FLAGS += -g -DVALGRIND
valgrind: all valgrind: all

217
udp.c
View file

@ -116,6 +116,7 @@
#include <sys/uio.h> #include <sys/uio.h>
#include <time.h> #include <time.h>
#include <fcntl.h> #include <fcntl.h>
#include <arpa/inet.h>
#include <linux/errqueue.h> #include <linux/errqueue.h>
#include "checksum.h" #include "checksum.h"
@ -389,6 +390,8 @@ static void udp_flow_close(const struct ctx *c, struct udp_flow *uflow)
uflow->s[TGTSIDE] = -1; uflow->s[TGTSIDE] = -1;
} }
flow_hash_remove(c, FLOW_SIDX(uflow, INISIDE)); flow_hash_remove(c, FLOW_SIDX(uflow, INISIDE));
if (!pif_is_socket(uflow->f.pif[TGTSIDE]))
flow_hash_remove(c, FLOW_SIDX(uflow, TGTSIDE));
} }
/** /**
@ -483,6 +486,13 @@ static flow_sidx_t udp_flow_new(const struct ctx *c, union flow *flow,
} }
flow_hash_insert(c, FLOW_SIDX(uflow, INISIDE)); flow_hash_insert(c, FLOW_SIDX(uflow, INISIDE));
/* If the target side is a socket, it will be a reply socket that knows
* its own flowside. But if it's tap, then we need to look it up by
* hash.
*/
if (!pif_is_socket(tgtpif))
flow_hash_insert(c, FLOW_SIDX(uflow, TGTSIDE));
FLOW_ACTIVATE(uflow); FLOW_ACTIVATE(uflow);
return FLOW_SIDX(uflow, TGTSIDE); return FLOW_SIDX(uflow, TGTSIDE);
@ -907,10 +917,12 @@ void udp_reply_sock_handler(const struct ctx *c, union epoll_ref ref,
{ {
const struct flowside *fromside = flowside_at_sidx(ref.flowside); const struct flowside *fromside = flowside_at_sidx(ref.flowside);
flow_sidx_t tosidx = flow_sidx_opposite(ref.flowside); flow_sidx_t tosidx = flow_sidx_opposite(ref.flowside);
const struct flowside *toside = flowside_at_sidx(tosidx);
struct udp_flow *uflow = udp_at_sidx(ref.flowside); struct udp_flow *uflow = udp_at_sidx(ref.flowside);
int from_s = uflow->s[ref.flowside.sidei]; int from_s = uflow->s[ref.flowside.sidei];
bool v6 = !inany_v4(&fromside->eaddr); bool v6 = !inany_v4(&fromside->eaddr);
struct mmsghdr *mmh_recv = v6 ? udp6_mh_recv : udp4_mh_recv; struct mmsghdr *mmh_recv = v6 ? udp6_mh_recv : udp4_mh_recv;
uint8_t topif = pif_at_sidx(tosidx);
int n, i; int n, i;
ASSERT(!c->no_udp && uflow); ASSERT(!c->no_udp && uflow);
@ -921,10 +933,64 @@ void udp_reply_sock_handler(const struct ctx *c, union epoll_ref ref,
flow_trace(uflow, "Received %d datagrams on reply socket", n); flow_trace(uflow, "Received %d datagrams on reply socket", n);
uflow->ts = now->tv_sec; uflow->ts = now->tv_sec;
for (i = 0; i < n; i++) for (i = 0; i < n; i++) {
udp_splice_prepare(mmh_recv, i); if (pif_is_socket(topif))
udp_splice_prepare(mmh_recv, i);
else
udp_tap_prepare(c, mmh_recv, i, toside->eport, v6, now);
}
udp_splice_send(c, 0, n, tosidx); if (pif_is_socket(topif))
udp_splice_send(c, 0, n, tosidx);
else
tap_send_frames(c, &udp_l2_iov[0][0], UDP_NUM_IOVS, n);
}
/**
* udp_flow_from_tap() - Find or create UDP flow for tap packets
* @c: Execution context
* @pif: pif on which the packet is arriving (must be PIF_TAP, see ASSERT)
* @af: Address family, AF_INET or AF_INET6
* @saddr: Source address on guest side
* @daddr: Destination address on guest side
* @srcport: Source port on guest side
* @dstport: Destination port on guest side
* @now: Timestamp; its tv_sec is stored as the flow's ts when an existing
*       flow matches, and it is passed on to udp_flow_new() otherwise
*
* Return: sidx for the destination side of the flow for this packet, or
* FLOW_SIDX_NONE if we couldn't find or create a flow.
*/
static flow_sidx_t udp_flow_from_tap(const struct ctx *c,
uint8_t pif, sa_family_t af,
const void *saddr, const void *daddr,
in_port_t srcport, in_port_t dstport,
const struct timespec *now)
{
struct udp_flow *uflow;
union flow *flow;
flow_sidx_t sidx;
/* Only the tap interface is accepted as the arriving pif here */
ASSERT(pif == PIF_TAP);
/* Look for an existing UDP flow side matching this datagram's addresses
 * and ports on the arriving pif.
 */
sidx = flow_lookup_af(c, IPPROTO_UDP, pif, af, saddr, daddr,
srcport, dstport);
if ((uflow = udp_at_sidx(sidx))) {
/* Existing flow: update its timestamp, and return the side opposite
 * to the one the datagram matched, i.e. where it should be sent.
 */
uflow->ts = now->tv_sec;
return flow_sidx_opposite(sidx);
}
/* No matching flow: try to allocate a new flow table entry */
if (!(flow = flow_alloc())) {
/* Sized for the longest textual address of either family */
char sstr[INET6_ADDRSTRLEN], dstr[INET6_ADDRSTRLEN];
debug("Couldn't allocate flow for UDP datagram from %s %s:%hu -> %s:%hu",
pif_name(pif),
inet_ntop(af, saddr, sstr, sizeof(sstr)), srcport,
inet_ntop(af, daddr, dstr, sizeof(dstr)), dstport);
return FLOW_SIDX_NONE;
}
/* Record the initiating (tap) side of the new flow from the datagram's
 * addresses and ports.
 */
flow_initiate_af(flow, PIF_TAP, af, saddr, srcport, daddr, dstport);
/* NOTE(review): the -1 argument presumably means "no existing socket
 * for this flow" - confirm against udp_flow_new().
 */
return udp_flow_new(c, flow, -1, now);
} }
/** /**
@ -942,23 +1008,22 @@ void udp_reply_sock_handler(const struct ctx *c, union epoll_ref ref,
* *
* #syscalls sendmmsg * #syscalls sendmmsg
*/ */
int udp_tap_handler(struct ctx *c, uint8_t pif, int udp_tap_handler(const struct ctx *c, uint8_t pif,
sa_family_t af, const void *saddr, const void *daddr, sa_family_t af, const void *saddr, const void *daddr,
const struct pool *p, int idx, const struct timespec *now) const struct pool *p, int idx, const struct timespec *now)
{ {
const struct flowside *toside;
struct mmsghdr mm[UIO_MAXIOV]; struct mmsghdr mm[UIO_MAXIOV];
union sockaddr_inany to_sa;
struct iovec m[UIO_MAXIOV]; struct iovec m[UIO_MAXIOV];
struct sockaddr_in6 s_in6;
struct sockaddr_in s_in;
const struct udphdr *uh; const struct udphdr *uh;
struct sockaddr *sa; struct udp_flow *uflow;
int i, s, count = 0; int i, s, count = 0;
flow_sidx_t tosidx;
in_port_t src, dst; in_port_t src, dst;
uint8_t topif;
socklen_t sl; socklen_t sl;
(void)saddr;
(void)pif;
ASSERT(!c->no_udp); ASSERT(!c->no_udp);
uh = packet_get(p, idx, 0, sizeof(*uh), NULL); uh = packet_get(p, idx, 0, sizeof(*uh), NULL);
@ -969,117 +1034,35 @@ int udp_tap_handler(struct ctx *c, uint8_t pif,
* and destination, so we can just take those from the first message. * and destination, so we can just take those from the first message.
*/ */
src = ntohs(uh->source); src = ntohs(uh->source);
src += c->udp.fwd_in.rdelta[src];
dst = ntohs(uh->dest); dst = ntohs(uh->dest);
if (af == AF_INET) { tosidx = udp_flow_from_tap(c, pif, af, saddr, daddr, src, dst, now);
s_in = (struct sockaddr_in) { if (!(uflow = udp_at_sidx(tosidx))) {
.sin_family = AF_INET, char sstr[INET6_ADDRSTRLEN], dstr[INET6_ADDRSTRLEN];
.sin_port = uh->dest,
.sin_addr = *(struct in_addr *)daddr,
};
sa = (struct sockaddr *)&s_in; debug("Dropping datagram with no flow %s %s:%hu -> %s:%hu",
sl = sizeof(s_in); pif_name(pif),
inet_ntop(af, saddr, sstr, sizeof(sstr)), src,
if (IN4_ARE_ADDR_EQUAL(&s_in.sin_addr, &c->ip4.dns_match) && inet_ntop(af, daddr, dstr, sizeof(dstr)), dst);
ntohs(s_in.sin_port) == 53) { return 1;
s_in.sin_addr = c->ip4.dns_host;
udp_tap_map[V4][src].ts = now->tv_sec;
udp_tap_map[V4][src].flags |= PORT_DNS_FWD;
bitmap_set(udp_act[V4][UDP_ACT_TAP], src);
} else if (IN4_ARE_ADDR_EQUAL(&s_in.sin_addr, &c->ip4.gw) &&
!c->no_map_gw) {
if (!(udp_tap_map[V4][dst].flags & PORT_LOCAL) ||
(udp_tap_map[V4][dst].flags & PORT_LOOPBACK))
s_in.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
else
s_in.sin_addr = c->ip4.addr_seen;
}
debug("UDP from tap src=%hu dst=%hu, s=%d",
src, dst, udp_tap_map[V4][src].sock);
if ((s = udp_tap_map[V4][src].sock) < 0) {
struct in_addr bind_addr = IN4ADDR_ANY_INIT;
union udp_epoll_ref uref = {
.port = src,
.pif = PIF_HOST,
};
const char *bind_if = NULL;
if (!IN4_IS_ADDR_LOOPBACK(&s_in.sin_addr))
bind_if = c->ip4.ifname_out;
if (!IN4_IS_ADDR_LOOPBACK(&s_in.sin_addr))
bind_addr = c->ip4.addr_out;
s = sock_l4(c, AF_INET, EPOLL_TYPE_UDP, &bind_addr,
bind_if, src, uref.u32);
if (s < 0)
return p->count - idx;
udp_tap_map[V4][src].sock = s;
bitmap_set(udp_act[V4][UDP_ACT_TAP], src);
}
udp_tap_map[V4][src].ts = now->tv_sec;
} else {
s_in6 = (struct sockaddr_in6) {
.sin6_family = AF_INET6,
.sin6_port = uh->dest,
.sin6_addr = *(struct in6_addr *)daddr,
};
const struct in6_addr *bind_addr = &in6addr_any;
sa = (struct sockaddr *)&s_in6;
sl = sizeof(s_in6);
if (IN6_ARE_ADDR_EQUAL(daddr, &c->ip6.dns_match) &&
ntohs(s_in6.sin6_port) == 53) {
s_in6.sin6_addr = c->ip6.dns_host;
udp_tap_map[V6][src].ts = now->tv_sec;
udp_tap_map[V6][src].flags |= PORT_DNS_FWD;
bitmap_set(udp_act[V6][UDP_ACT_TAP], src);
} else if (IN6_ARE_ADDR_EQUAL(daddr, &c->ip6.gw) &&
!c->no_map_gw) {
if (!(udp_tap_map[V6][dst].flags & PORT_LOCAL) ||
(udp_tap_map[V6][dst].flags & PORT_LOOPBACK))
s_in6.sin6_addr = in6addr_loopback;
else if (udp_tap_map[V6][dst].flags & PORT_GUA)
s_in6.sin6_addr = c->ip6.addr;
else
s_in6.sin6_addr = c->ip6.addr_seen;
} else if (IN6_IS_ADDR_LINKLOCAL(&s_in6.sin6_addr)) {
bind_addr = &c->ip6.addr_ll;
}
if ((s = udp_tap_map[V6][src].sock) < 0) {
union udp_epoll_ref uref = {
.v6 = 1,
.port = src,
.pif = PIF_HOST,
};
const char *bind_if = NULL;
if (!IN6_IS_ADDR_LOOPBACK(&s_in6.sin6_addr))
bind_if = c->ip6.ifname_out;
if (!IN6_IS_ADDR_LOOPBACK(&s_in6.sin6_addr) &&
!IN6_IS_ADDR_LINKLOCAL(&s_in6.sin6_addr))
bind_addr = &c->ip6.addr_out;
s = sock_l4(c, AF_INET6, EPOLL_TYPE_UDP, bind_addr,
bind_if, src, uref.u32);
if (s < 0)
return p->count - idx;
udp_tap_map[V6][src].sock = s;
bitmap_set(udp_act[V6][UDP_ACT_TAP], src);
}
udp_tap_map[V6][src].ts = now->tv_sec;
} }
topif = pif_at_sidx(tosidx);
if (topif != PIF_HOST) {
flow_sidx_t fromsidx = flow_sidx_opposite(tosidx);
uint8_t frompif = pif_at_sidx(fromsidx);
flow_err(uflow, "No support for forwarding UDP from %s to %s",
pif_name(frompif), pif_name(topif));
return 1;
}
toside = flowside_at_sidx(tosidx);
s = udp_at_sidx(tosidx)->s[tosidx.sidei];
ASSERT(s >= 0);
pif_sockaddr(c, &to_sa, &sl, topif, &toside->eaddr, toside->eport);
for (i = 0; i < (int)p->count - idx; i++) { for (i = 0; i < (int)p->count - idx; i++) {
struct udphdr *uh_send; struct udphdr *uh_send;
size_t len; size_t len;
@ -1088,7 +1071,7 @@ int udp_tap_handler(struct ctx *c, uint8_t pif,
if (!uh_send) if (!uh_send)
return p->count - idx; return p->count - idx;
mm[i].msg_hdr.msg_name = sa; mm[i].msg_hdr.msg_name = &to_sa;
mm[i].msg_hdr.msg_namelen = sl; mm[i].msg_hdr.msg_namelen = sl;
if (len) { if (len) {

4
udp.h
View file

@ -13,8 +13,8 @@ void udp_buf_sock_handler(const struct ctx *c, union epoll_ref ref,
uint32_t events, const struct timespec *now); uint32_t events, const struct timespec *now);
void udp_reply_sock_handler(const struct ctx *c, union epoll_ref ref, void udp_reply_sock_handler(const struct ctx *c, union epoll_ref ref,
uint32_t events, const struct timespec *now); uint32_t events, const struct timespec *now);
int udp_tap_handler(struct ctx *c, uint8_t pif, sa_family_t af, int udp_tap_handler(const struct ctx *c, uint8_t pif,
const void *saddr, const void *daddr, sa_family_t af, const void *saddr, const void *daddr,
const struct pool *p, int idx, const struct timespec *now); const struct pool *p, int idx, const struct timespec *now);
int udp_sock_init(const struct ctx *c, int ns, sa_family_t af, int udp_sock_init(const struct ctx *c, int ns, sa_family_t af,
const void *addr, const char *ifname, in_port_t port); const void *addr, const char *ifname, in_port_t port);