udp: Direct datagrams from host to guest via flow table
This replaces the last piece of existing UDP port tracking with the common flow table. Specifically use the flow table to direct datagrams from host sockets to the guest tap interface. Since this now requires a flow for every datagram, we add some logging if we encounter any datagrams for which we can't find or create a flow. Signed-off-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
This commit is contained in:
parent
b7ad19347f
commit
898f797174
1 changed files with 51 additions and 134 deletions
185
udp.c
185
udp.c
|
@ -73,26 +73,6 @@
|
||||||
*
|
*
|
||||||
* Note that a spliced flow will have *both* a duplicated listening socket and a
|
* Note that a spliced flow will have *both* a duplicated listening socket and a
|
||||||
* reply socket (see above).
|
* reply socket (see above).
|
||||||
*
|
|
||||||
* Port tracking
|
|
||||||
* =============
|
|
||||||
*
|
|
||||||
* For UDP, a reduced version of port-based connection tracking is implemented
|
|
||||||
* with two purposes:
|
|
||||||
* - binding ephemeral ports when they're used as source port by the guest, so
|
|
||||||
* that replies on those ports can be forwarded back to the guest, with a
|
|
||||||
* fixed timeout for this binding
|
|
||||||
* - packets received from the local host get their source changed to a local
|
|
||||||
* address (gateway address) so that they can be forwarded to the guest, and
|
|
||||||
* packets sent as replies by the guest need their destination address to
|
|
||||||
* be changed back to the address of the local host. This is dynamic to allow
|
|
||||||
* connections from the gateway as well, and uses the same fixed 180s timeout
|
|
||||||
*
|
|
||||||
* Sockets for bound ports are created at initialisation time, one set for IPv4
|
|
||||||
* and one for IPv6.
|
|
||||||
*
|
|
||||||
* Packets are forwarded back and forth, by prepending and stripping UDP headers
|
|
||||||
* in the obvious way, with no port translation.
|
|
||||||
*/
|
*/
|
||||||
|
|
||||||
#include <sched.h>
|
#include <sched.h>
|
||||||
|
@ -526,7 +506,6 @@ static flow_sidx_t udp_flow_from_sock(const struct ctx *c, union epoll_ref ref,
|
||||||
|
|
||||||
ASSERT(ref.type == EPOLL_TYPE_UDP);
|
ASSERT(ref.type == EPOLL_TYPE_UDP);
|
||||||
|
|
||||||
/* FIXME: Match reply packets to their flow as well */
|
|
||||||
if (!ref.udp.orig)
|
if (!ref.udp.orig)
|
||||||
return FLOW_SIDX_NONE;
|
return FLOW_SIDX_NONE;
|
||||||
|
|
||||||
|
@ -586,160 +565,87 @@ static void udp_splice_send(const struct ctx *c, size_t start, size_t n,
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* udp_update_hdr4() - Update headers for one IPv4 datagram
|
* udp_update_hdr4() - Update headers for one IPv4 datagram
|
||||||
* @c: Execution context
|
|
||||||
* @ip4h: Pre-filled IPv4 header (except for tot_len and saddr)
|
* @ip4h: Pre-filled IPv4 header (except for tot_len and saddr)
|
||||||
* @s_in: Source socket address, filled in by recvmmsg()
|
|
||||||
* @bp: Pointer to udp_payload_t to update
|
* @bp: Pointer to udp_payload_t to update
|
||||||
* @dstport: Destination port number
|
* @toside: Flowside for destination side
|
||||||
* @dlen: Length of UDP payload
|
* @dlen: Length of UDP payload
|
||||||
* @now: Current timestamp
|
|
||||||
*
|
*
|
||||||
* Return: size of IPv4 payload (UDP header + data)
|
* Return: size of IPv4 payload (UDP header + data)
|
||||||
*/
|
*/
|
||||||
static size_t udp_update_hdr4(const struct ctx *c,
|
static size_t udp_update_hdr4(struct iphdr *ip4h, struct udp_payload_t *bp,
|
||||||
struct iphdr *ip4h, const struct sockaddr_in *s_in,
|
const struct flowside *toside, size_t dlen)
|
||||||
struct udp_payload_t *bp,
|
|
||||||
in_port_t dstport, size_t dlen,
|
|
||||||
const struct timespec *now)
|
|
||||||
{
|
{
|
||||||
const struct in_addr dst = c->ip4.addr_seen;
|
const struct in_addr *src = inany_v4(&toside->faddr);
|
||||||
in_port_t srcport = ntohs(s_in->sin_port);
|
const struct in_addr *dst = inany_v4(&toside->eaddr);
|
||||||
size_t l4len = dlen + sizeof(bp->uh);
|
size_t l4len = dlen + sizeof(bp->uh);
|
||||||
size_t l3len = l4len + sizeof(*ip4h);
|
size_t l3len = l4len + sizeof(*ip4h);
|
||||||
struct in_addr src = s_in->sin_addr;
|
|
||||||
|
|
||||||
if (!IN4_IS_ADDR_UNSPECIFIED(&c->ip4.dns_match) &&
|
ASSERT(src && dst);
|
||||||
IN4_ARE_ADDR_EQUAL(&src, &c->ip4.dns_host) && srcport == 53 &&
|
|
||||||
(udp_tap_map[V4][dstport].flags & PORT_DNS_FWD)) {
|
|
||||||
src = c->ip4.dns_match;
|
|
||||||
} else if (IN4_IS_ADDR_LOOPBACK(&src) ||
|
|
||||||
IN4_ARE_ADDR_EQUAL(&src, &c->ip4.addr_seen)) {
|
|
||||||
udp_tap_map[V4][srcport].ts = now->tv_sec;
|
|
||||||
udp_tap_map[V4][srcport].flags |= PORT_LOCAL;
|
|
||||||
|
|
||||||
if (IN4_IS_ADDR_LOOPBACK(&src))
|
|
||||||
udp_tap_map[V4][srcport].flags |= PORT_LOOPBACK;
|
|
||||||
else
|
|
||||||
udp_tap_map[V4][srcport].flags &= ~PORT_LOOPBACK;
|
|
||||||
|
|
||||||
bitmap_set(udp_act[V4][UDP_ACT_TAP], srcport);
|
|
||||||
|
|
||||||
src = c->ip4.gw;
|
|
||||||
}
|
|
||||||
|
|
||||||
ip4h->tot_len = htons(l3len);
|
ip4h->tot_len = htons(l3len);
|
||||||
ip4h->daddr = dst.s_addr;
|
ip4h->daddr = dst->s_addr;
|
||||||
ip4h->saddr = src.s_addr;
|
ip4h->saddr = src->s_addr;
|
||||||
ip4h->check = csum_ip4_header(l3len, IPPROTO_UDP, src, dst);
|
ip4h->check = csum_ip4_header(l3len, IPPROTO_UDP, *src, *dst);
|
||||||
|
|
||||||
bp->uh.source = s_in->sin_port;
|
bp->uh.source = htons(toside->fport);
|
||||||
bp->uh.dest = htons(dstport);
|
bp->uh.dest = htons(toside->eport);
|
||||||
bp->uh.len = htons(l4len);
|
bp->uh.len = htons(l4len);
|
||||||
csum_udp4(&bp->uh, src, dst, bp->data, dlen);
|
csum_udp4(&bp->uh, *src, *dst, bp->data, dlen);
|
||||||
|
|
||||||
return l4len;
|
return l4len;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* udp_update_hdr6() - Update headers for one IPv6 datagram
|
* udp_update_hdr6() - Update headers for one IPv6 datagram
|
||||||
* @c: Execution context
|
|
||||||
* @ip6h: Pre-filled IPv6 header (except for payload_len and addresses)
|
* @ip6h: Pre-filled IPv6 header (except for payload_len and addresses)
|
||||||
* @s_in: Source socket address, filled in by recvmmsg()
|
|
||||||
* @bp: Pointer to udp_payload_t to update
|
* @bp: Pointer to udp_payload_t to update
|
||||||
* @dstport: Destination port number
|
* @toside: Flowside for destination side
|
||||||
* @dlen: Length of UDP payload
|
* @dlen: Length of UDP payload
|
||||||
* @now: Current timestamp
|
|
||||||
*
|
*
|
||||||
* Return: size of IPv6 payload (UDP header + data)
|
* Return: size of IPv6 payload (UDP header + data)
|
||||||
*/
|
*/
|
||||||
static size_t udp_update_hdr6(const struct ctx *c,
|
static size_t udp_update_hdr6(struct ipv6hdr *ip6h, struct udp_payload_t *bp,
|
||||||
struct ipv6hdr *ip6h, struct sockaddr_in6 *s_in6,
|
const struct flowside *toside, size_t dlen)
|
||||||
struct udp_payload_t *bp,
|
|
||||||
in_port_t dstport, size_t dlen,
|
|
||||||
const struct timespec *now)
|
|
||||||
{
|
{
|
||||||
const struct in6_addr *src = &s_in6->sin6_addr;
|
|
||||||
const struct in6_addr *dst = &c->ip6.addr_seen;
|
|
||||||
in_port_t srcport = ntohs(s_in6->sin6_port);
|
|
||||||
uint16_t l4len = dlen + sizeof(bp->uh);
|
uint16_t l4len = dlen + sizeof(bp->uh);
|
||||||
|
|
||||||
if (IN6_IS_ADDR_LINKLOCAL(src)) {
|
|
||||||
dst = &c->ip6.addr_ll_seen;
|
|
||||||
} else if (!IN6_IS_ADDR_UNSPECIFIED(&c->ip6.dns_match) &&
|
|
||||||
IN6_ARE_ADDR_EQUAL(src, &c->ip6.dns_host) &&
|
|
||||||
srcport == 53 &&
|
|
||||||
(udp_tap_map[V4][dstport].flags & PORT_DNS_FWD)) {
|
|
||||||
src = &c->ip6.dns_match;
|
|
||||||
} else if (IN6_IS_ADDR_LOOPBACK(src) ||
|
|
||||||
IN6_ARE_ADDR_EQUAL(src, &c->ip6.addr_seen) ||
|
|
||||||
IN6_ARE_ADDR_EQUAL(src, &c->ip6.addr)) {
|
|
||||||
udp_tap_map[V6][srcport].ts = now->tv_sec;
|
|
||||||
udp_tap_map[V6][srcport].flags |= PORT_LOCAL;
|
|
||||||
|
|
||||||
if (IN6_IS_ADDR_LOOPBACK(src))
|
|
||||||
udp_tap_map[V6][srcport].flags |= PORT_LOOPBACK;
|
|
||||||
else
|
|
||||||
udp_tap_map[V6][srcport].flags &= ~PORT_LOOPBACK;
|
|
||||||
|
|
||||||
if (IN6_ARE_ADDR_EQUAL(src, &c->ip6.addr))
|
|
||||||
udp_tap_map[V6][srcport].flags |= PORT_GUA;
|
|
||||||
else
|
|
||||||
udp_tap_map[V6][srcport].flags &= ~PORT_GUA;
|
|
||||||
|
|
||||||
bitmap_set(udp_act[V6][UDP_ACT_TAP], srcport);
|
|
||||||
|
|
||||||
dst = &c->ip6.addr_ll_seen;
|
|
||||||
|
|
||||||
if (IN6_IS_ADDR_LINKLOCAL(&c->ip6.gw))
|
|
||||||
src = &c->ip6.gw;
|
|
||||||
else
|
|
||||||
src = &c->ip6.addr_ll;
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
ip6h->payload_len = htons(l4len);
|
ip6h->payload_len = htons(l4len);
|
||||||
ip6h->daddr = *dst;
|
ip6h->daddr = toside->eaddr.a6;
|
||||||
ip6h->saddr = *src;
|
ip6h->saddr = toside->faddr.a6;
|
||||||
ip6h->version = 6;
|
ip6h->version = 6;
|
||||||
ip6h->nexthdr = IPPROTO_UDP;
|
ip6h->nexthdr = IPPROTO_UDP;
|
||||||
ip6h->hop_limit = 255;
|
ip6h->hop_limit = 255;
|
||||||
|
|
||||||
bp->uh.source = s_in6->sin6_port;
|
bp->uh.source = htons(toside->fport);
|
||||||
bp->uh.dest = htons(dstport);
|
bp->uh.dest = htons(toside->eport);
|
||||||
bp->uh.len = ip6h->payload_len;
|
bp->uh.len = ip6h->payload_len;
|
||||||
csum_udp6(&bp->uh, src, dst, bp->data, dlen);
|
csum_udp6(&bp->uh, &toside->faddr.a6, &toside->eaddr.a6, bp->data, dlen);
|
||||||
|
|
||||||
return l4len;
|
return l4len;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* udp_tap_prepare() - Convert one datagram into a tap frame
|
* udp_tap_prepare() - Convert one datagram into a tap frame
|
||||||
* @c: Execution context
|
|
||||||
* @mmh: Receiving mmsghdr array
|
* @mmh: Receiving mmsghdr array
|
||||||
* @idx: Index of the datagram to prepare
|
* @idx: Index of the datagram to prepare
|
||||||
* @dstport: Destination port
|
* @toside: Flowside for destination side
|
||||||
* @v6: Prepare for IPv6?
|
|
||||||
* @now: Current timestamp
|
|
||||||
*/
|
*/
|
||||||
static void udp_tap_prepare(const struct ctx *c, const struct mmsghdr *mmh,
|
static void udp_tap_prepare(const struct mmsghdr *mmh, unsigned idx,
|
||||||
unsigned idx, in_port_t dstport, bool v6,
|
const struct flowside *toside)
|
||||||
const struct timespec *now)
|
|
||||||
{
|
{
|
||||||
struct iovec (*tap_iov)[UDP_NUM_IOVS] = &udp_l2_iov[idx];
|
struct iovec (*tap_iov)[UDP_NUM_IOVS] = &udp_l2_iov[idx];
|
||||||
struct udp_payload_t *bp = &udp_payload[idx];
|
struct udp_payload_t *bp = &udp_payload[idx];
|
||||||
struct udp_meta_t *bm = &udp_meta[idx];
|
struct udp_meta_t *bm = &udp_meta[idx];
|
||||||
size_t l4len;
|
size_t l4len;
|
||||||
|
|
||||||
if (v6) {
|
if (!inany_v4(&toside->eaddr) || !inany_v4(&toside->faddr)) {
|
||||||
l4len = udp_update_hdr6(c, &bm->ip6h, &bm->s_in.sa6, bp,
|
l4len = udp_update_hdr6(&bm->ip6h, bp, toside, mmh[idx].msg_len);
|
||||||
dstport, mmh[idx].msg_len, now);
|
|
||||||
tap_hdr_update(&bm->taph, l4len + sizeof(bm->ip6h) +
|
tap_hdr_update(&bm->taph, l4len + sizeof(bm->ip6h) +
|
||||||
sizeof(udp6_eth_hdr));
|
sizeof(udp6_eth_hdr));
|
||||||
(*tap_iov)[UDP_IOV_ETH] = IOV_OF_LVALUE(udp6_eth_hdr);
|
(*tap_iov)[UDP_IOV_ETH] = IOV_OF_LVALUE(udp6_eth_hdr);
|
||||||
(*tap_iov)[UDP_IOV_IP] = IOV_OF_LVALUE(bm->ip6h);
|
(*tap_iov)[UDP_IOV_IP] = IOV_OF_LVALUE(bm->ip6h);
|
||||||
} else {
|
} else {
|
||||||
l4len = udp_update_hdr4(c, &bm->ip4h, &bm->s_in.sa4, bp,
|
l4len = udp_update_hdr4(&bm->ip4h, bp, toside, mmh[idx].msg_len);
|
||||||
dstport, mmh[idx].msg_len, now);
|
|
||||||
tap_hdr_update(&bm->taph, l4len + sizeof(bm->ip4h) +
|
tap_hdr_update(&bm->taph, l4len + sizeof(bm->ip4h) +
|
||||||
sizeof(udp4_eth_hdr));
|
sizeof(udp4_eth_hdr));
|
||||||
(*tap_iov)[UDP_IOV_ETH] = IOV_OF_LVALUE(udp4_eth_hdr);
|
(*tap_iov)[UDP_IOV_ETH] = IOV_OF_LVALUE(udp4_eth_hdr);
|
||||||
|
@ -855,17 +761,11 @@ void udp_buf_sock_handler(const struct ctx *c, union epoll_ref ref, uint32_t eve
|
||||||
const struct timespec *now)
|
const struct timespec *now)
|
||||||
{
|
{
|
||||||
struct mmsghdr *mmh_recv = ref.udp.v6 ? udp6_mh_recv : udp4_mh_recv;
|
struct mmsghdr *mmh_recv = ref.udp.v6 ? udp6_mh_recv : udp4_mh_recv;
|
||||||
in_port_t dstport = ref.udp.port;
|
|
||||||
int n, i;
|
int n, i;
|
||||||
|
|
||||||
if ((n = udp_sock_recv(c, ref.fd, events, mmh_recv)) <= 0)
|
if ((n = udp_sock_recv(c, ref.fd, events, mmh_recv)) <= 0)
|
||||||
return;
|
return;
|
||||||
|
|
||||||
if (ref.udp.pif == PIF_SPLICE)
|
|
||||||
dstport += c->udp.fwd_out.f.delta[dstport];
|
|
||||||
else if (ref.udp.pif == PIF_HOST)
|
|
||||||
dstport += c->udp.fwd_in.f.delta[dstport];
|
|
||||||
|
|
||||||
/* We divide datagrams into batches based on how we need to send them,
|
/* We divide datagrams into batches based on how we need to send them,
|
||||||
* determined by udp_meta[i].tosidx. To avoid either two passes through
|
* determined by udp_meta[i].tosidx. To avoid either two passes through
|
||||||
* the array, or recalculating tosidx for a single entry, we have to
|
* the array, or recalculating tosidx for a single entry, we have to
|
||||||
|
@ -880,9 +780,9 @@ void udp_buf_sock_handler(const struct ctx *c, union epoll_ref ref, uint32_t eve
|
||||||
do {
|
do {
|
||||||
if (pif_is_socket(batchpif)) {
|
if (pif_is_socket(batchpif)) {
|
||||||
udp_splice_prepare(mmh_recv, i);
|
udp_splice_prepare(mmh_recv, i);
|
||||||
} else {
|
} else if (batchpif == PIF_TAP) {
|
||||||
udp_tap_prepare(c, mmh_recv, i, dstport,
|
udp_tap_prepare(mmh_recv, i,
|
||||||
ref.udp.v6, now);
|
flowside_at_sidx(batchsidx));
|
||||||
}
|
}
|
||||||
|
|
||||||
if (++i >= n)
|
if (++i >= n)
|
||||||
|
@ -896,9 +796,20 @@ void udp_buf_sock_handler(const struct ctx *c, union epoll_ref ref, uint32_t eve
|
||||||
if (pif_is_socket(batchpif)) {
|
if (pif_is_socket(batchpif)) {
|
||||||
udp_splice_send(c, batchstart, i - batchstart,
|
udp_splice_send(c, batchstart, i - batchstart,
|
||||||
batchsidx);
|
batchsidx);
|
||||||
} else {
|
} else if (batchpif == PIF_TAP) {
|
||||||
tap_send_frames(c, &udp_l2_iov[batchstart][0],
|
tap_send_frames(c, &udp_l2_iov[batchstart][0],
|
||||||
UDP_NUM_IOVS, i - batchstart);
|
UDP_NUM_IOVS, i - batchstart);
|
||||||
|
} else if (flow_sidx_valid(batchsidx)) {
|
||||||
|
flow_sidx_t fromsidx = flow_sidx_opposite(batchsidx);
|
||||||
|
struct udp_flow *uflow = udp_at_sidx(batchsidx);
|
||||||
|
|
||||||
|
flow_err(uflow,
|
||||||
|
"No support for forwarding UDP from %s to %s",
|
||||||
|
pif_name(pif_at_sidx(fromsidx)),
|
||||||
|
pif_name(batchpif));
|
||||||
|
} else {
|
||||||
|
debug("Discarding %d datagrams without flow",
|
||||||
|
i - batchstart);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -936,14 +847,20 @@ void udp_reply_sock_handler(const struct ctx *c, union epoll_ref ref,
|
||||||
for (i = 0; i < n; i++) {
|
for (i = 0; i < n; i++) {
|
||||||
if (pif_is_socket(topif))
|
if (pif_is_socket(topif))
|
||||||
udp_splice_prepare(mmh_recv, i);
|
udp_splice_prepare(mmh_recv, i);
|
||||||
else
|
else if (topif == PIF_TAP)
|
||||||
udp_tap_prepare(c, mmh_recv, i, toside->eport, v6, now);
|
udp_tap_prepare(mmh_recv, i, toside);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (pif_is_socket(topif))
|
if (pif_is_socket(topif)) {
|
||||||
udp_splice_send(c, 0, n, tosidx);
|
udp_splice_send(c, 0, n, tosidx);
|
||||||
else
|
} else if (topif == PIF_TAP) {
|
||||||
tap_send_frames(c, &udp_l2_iov[0][0], UDP_NUM_IOVS, n);
|
tap_send_frames(c, &udp_l2_iov[0][0], UDP_NUM_IOVS, n);
|
||||||
|
} else {
|
||||||
|
uint8_t frompif = pif_at_sidx(ref.flowside);
|
||||||
|
|
||||||
|
flow_err(uflow, "No support for forwarding UDP from %s to %s",
|
||||||
|
pif_name(frompif), pif_name(topif));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
Loading…
Reference in a new issue