tap: Completely de-serialise input message batches

Until now, messages would be passed to protocol handlers in a single
batch only if they happened to be dequeued in a row. Packets
interleaved between different connections would result in multiple
calls to the same protocol handler for a single connection.

Instead, keep track of incoming packet descriptors, arrange them in
sequences, and call protocol handlers only once we have completely
sorted input messages into batches.

Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
This commit is contained in:
Stefano Brivio 2021-09-26 23:38:22 +02:00
parent dfc4513190
commit dd581730e5
7 changed files with 444 additions and 275 deletions

20
icmp.c
View file

@ -141,23 +141,26 @@ void icmp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events,
* Return: count of consumed packets (always 1, even if malformed) * Return: count of consumed packets (always 1, even if malformed)
*/ */
int icmp_tap_handler(struct ctx *c, int af, void *addr, int icmp_tap_handler(struct ctx *c, int af, void *addr,
struct tap_msg *msg, int count, struct timespec *now) struct tap_l4_msg *msg, int count, struct timespec *now)
{ {
(void)count; (void)count;
if (af == AF_INET) { if (af == AF_INET) {
struct icmphdr *ih = (struct icmphdr *)msg[0].l4h;
union icmp_epoll_ref iref = { .v6 = 0 }; union icmp_epoll_ref iref = { .v6 = 0 };
struct sockaddr_in sa = { struct sockaddr_in sa = {
.sin_family = AF_INET, .sin_family = AF_INET,
.sin_addr = { .s_addr = INADDR_ANY }, .sin_addr = { .s_addr = INADDR_ANY },
.sin_port = ih->un.echo.id,
}; };
struct icmphdr *ih;
int id, s; int id, s;
ih = (struct icmphdr *)(pkt_buf + msg[0].pkt_buf_offset);
if (msg[0].l4_len < sizeof(*ih) || ih->type != ICMP_ECHO) if (msg[0].l4_len < sizeof(*ih) || ih->type != ICMP_ECHO)
return 1; return 1;
sa.sin_port = ih->un.echo.id;
iref.id = id = ntohs(ih->un.echo.id); iref.id = id = ntohs(ih->un.echo.id);
if ((s = icmp_id_map[V4][id].sock) <= 0) { if ((s = icmp_id_map[V4][id].sock) <= 0) {
@ -171,22 +174,25 @@ int icmp_tap_handler(struct ctx *c, int af, void *addr,
bitmap_set(icmp_act[V4], id); bitmap_set(icmp_act[V4], id);
sa.sin_addr = *(struct in_addr *)addr; sa.sin_addr = *(struct in_addr *)addr;
sendto(s, msg[0].l4h, msg[0].l4_len, MSG_NOSIGNAL, sendto(s, ih, msg[0].l4_len, MSG_NOSIGNAL,
(struct sockaddr *)&sa, sizeof(sa)); (struct sockaddr *)&sa, sizeof(sa));
} else if (af == AF_INET6) { } else if (af == AF_INET6) {
struct icmp6hdr *ih = (struct icmp6hdr *)msg[0].l4h;
union icmp_epoll_ref iref = { .v6 = 1 }; union icmp_epoll_ref iref = { .v6 = 1 };
struct sockaddr_in6 sa = { struct sockaddr_in6 sa = {
.sin6_family = AF_INET6, .sin6_family = AF_INET6,
.sin6_addr = IN6ADDR_ANY_INIT, .sin6_addr = IN6ADDR_ANY_INIT,
.sin6_port = ih->icmp6_identifier,
}; };
struct icmp6hdr *ih;
int id, s; int id, s;
ih = (struct icmp6hdr *)(pkt_buf + msg[0].pkt_buf_offset);
if (msg[0].l4_len < sizeof(*ih) || if (msg[0].l4_len < sizeof(*ih) ||
(ih->icmp6_type != 128 && ih->icmp6_type != 129)) (ih->icmp6_type != 128 && ih->icmp6_type != 129))
return 1; return 1;
sa.sin6_port = ih->icmp6_identifier;
iref.id = id = ntohs(ih->icmp6_identifier); iref.id = id = ntohs(ih->icmp6_identifier);
if ((s = icmp_id_map[V6][id].sock) <= 0) { if ((s = icmp_id_map[V6][id].sock) <= 0) {
s = sock_l4(c, AF_INET6, IPPROTO_ICMPV6, id, 0, s = sock_l4(c, AF_INET6, IPPROTO_ICMPV6, id, 0,
@ -200,7 +206,7 @@ int icmp_tap_handler(struct ctx *c, int af, void *addr,
bitmap_set(icmp_act[V6], id); bitmap_set(icmp_act[V6], id);
sa.sin6_addr = *(struct in6_addr *)addr; sa.sin6_addr = *(struct in6_addr *)addr;
sendto(s, msg[0].l4h, msg[0].l4_len, MSG_NOSIGNAL, sendto(s, ih, msg[0].l4_len, MSG_NOSIGNAL,
(struct sockaddr *)&sa, sizeof(sa)); (struct sockaddr *)&sa, sizeof(sa));
} }

2
icmp.h
View file

@ -8,7 +8,7 @@ struct ctx;
void icmp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events, void icmp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events,
struct timespec *now); struct timespec *now);
int icmp_tap_handler(struct ctx *c, int af, void *addr, int icmp_tap_handler(struct ctx *c, int af, void *addr,
struct tap_msg *msg, int count, struct timespec *now); struct tap_l4_msg *msg, int count, struct timespec *now);
void icmp_timer(struct ctx *c, struct timespec *ts); void icmp_timer(struct ctx *c, struct timespec *ts);
/** /**

18
passt.h
View file

@ -3,15 +3,21 @@
/** /**
* struct tap_msg - Generic message descriptor for arrays of messages * struct tap_msg - Generic message descriptor for arrays of messages
* @start: Pointer to message start * @pkt_buf_offset: Offset from @pkt_buf
* @l4_start: Pointer to L4 header * @len: Message length, with L2 headers
* @len: Message length, with L2 headers
* @l4_len: Message length, with L4 headers
*/ */
struct tap_msg { struct tap_msg {
char *start; uint32_t pkt_buf_offset;
char *l4h;
uint16_t len; uint16_t len;
};
/**
* struct tap_l4_msg - Layer-4 message descriptor for protocol handlers
* @pkt_buf_offset: Offset of message from @pkt_buf
* @l4_len: Length of Layer-4 payload, host order
*/
struct tap_l4_msg {
uint32_t pkt_buf_offset;
uint16_t l4_len; uint16_t l4_len;
}; };

636
tap.c
View file

@ -50,7 +50,9 @@
#include "dhcpv6.h" #include "dhcpv6.h"
#include "pcap.h" #include "pcap.h"
static struct tap_msg tap_msgs[TAP_MSGS]; /* IPv4 (plus ARP) and IPv6 message batches from tap/guest to IP handlers */
static struct tap_msg seq4[TAP_MSGS];
static struct tap_msg seq6[TAP_MSGS];
/** /**
* tap_send() - Send frame, with qemu socket header if needed * tap_send() - Send frame, with qemu socket header if needed
@ -198,257 +200,410 @@ void tap_ip_send(struct ctx *c, struct in6_addr *src, uint8_t proto,
} }
} }
/**
* struct tap_l4_seq4 - Message sequence for one protocol handler call, IPv4
* @msgs: Count of messages currently stored in @msg
* @protocol: L4 protocol number, as found in the IPv4 header
* @source: Source port, network order (copied from the L4 header)
* @dest: Destination port, network order (copied from the L4 header)
* @saddr: Source address, network order (copied from the IPv4 header)
* @daddr: Destination address, network order (copied from the IPv4 header)
* @msg: Array of messages that can be handled in a single call
*
* All messages in one sequence share @protocol, ports and addresses, so that
* they can be passed to the protocol handler as one batch.
*/
static struct tap_l4_seq4 {
uint16_t msgs;
uint8_t protocol;
uint16_t source;
uint16_t dest;
uint32_t saddr;
uint32_t daddr;
struct tap_l4_msg msg[UIO_MAXIOV];
} l4_seq4[UIO_MAXIOV /* Arbitrary: TAP_MSGS in theory, so limit in users */];
/**
* struct tap_l4_seq6 - Message sequence for one protocol handler call, IPv6
* @msgs: Count of messages currently stored in @msg
* @protocol: L4 protocol number, as resolved from the IPv6 header chain
* @source: Source port, network order (copied from the L4 header)
* @dest: Destination port, network order (copied from the L4 header)
* @saddr: Source address (copied from the IPv6 header)
* @daddr: Destination address (copied from the IPv6 header)
* @msg: Array of messages that can be handled in a single call
*
* All messages in one sequence share @protocol, ports and addresses, so that
* they can be passed to the protocol handler as one batch.
*/
static struct tap_l4_seq6 {
uint16_t msgs;
uint8_t protocol;
uint16_t source;
uint16_t dest;
struct in6_addr saddr;
struct in6_addr daddr;
struct tap_l4_msg msg[UIO_MAXIOV];
} l4_seq6[UIO_MAXIOV /* Arbitrary: TAP_MSGS in theory, so limit in users */];
/**
 * tap_packet_debug() - Print debug message for packet(s) from guest/tap
 * @iph:	IPv4 header, can be NULL
 * @ip6h:	IPv6 header, can be NULL
 * @seq4:	Pointer to @struct tap_l4_seq4, can be NULL
 * @proto6:	L4 protocol number for IPv6, ignored if @iph or @seq4 is given
 * @seq6:	Pointer to @struct tap_l4_seq6, can be NULL
 * @count:	Count of packets in this sequence
 *
 * The message is treated as IPv4 if @iph or @seq4 is given, as IPv6 otherwise.
 * Addresses and, for IPv4, the protocol number are taken from the header if
 * present, from the sequence descriptor otherwise. Ports are only printed for
 * TCP and UDP, and only if the sequence descriptor of the matching address
 * family is available, as headers alone don't carry them here.
 *
 * If no header and no sequence descriptor is passed at all, nothing is logged.
 */
static void tap_packet_debug(struct iphdr *iph, struct ipv6hdr *ip6h,
			     struct tap_l4_seq4 *seq4, uint8_t proto6,
			     struct tap_l4_seq6 *seq6, int count)
{
	/* Large enough for both address families, only one is used per call */
	char src[INET6_ADDRSTRLEN], dst[INET6_ADDRSTRLEN];
	int v4 = iph || seq4;
	int has_ports;
	uint8_t proto;

	if (v4) {
		inet_ntop(AF_INET, iph ? &iph->saddr : &seq4->saddr,
			  src, sizeof(src));
		inet_ntop(AF_INET, iph ? &iph->daddr : &seq4->daddr,
			  dst, sizeof(dst));
		proto = iph ? iph->protocol : seq4->protocol;
	} else if (ip6h || seq6) {
		inet_ntop(AF_INET6, ip6h ? &ip6h->saddr : &seq6->saddr,
			  src, sizeof(src));
		inet_ntop(AF_INET6, ip6h ? &ip6h->daddr : &seq6->daddr,
			  dst, sizeof(dst));
		proto = proto6;
	} else {
		/* Nothing to describe: don't dereference NULL descriptors */
		return;
	}

	/* Ports live in the sequence descriptor of the same address family:
	 * without it, fall back to the address-only format instead of reading
	 * through a NULL pointer or printing a buffer that was never filled.
	 */
	has_ports = (proto == IPPROTO_TCP || proto == IPPROTO_UDP) &&
		    (v4 ? !!seq4 : !!seq6);

	if (has_ports) {
		debug("protocol %i from tap: %s:%i -> %s:%i (%i packet%s)",
		      proto, src,
		      ntohs(v4 ? seq4->source : seq6->source),
		      dst,
		      ntohs(v4 ? seq4->dest : seq6->dest),
		      count, count == 1 ? "" : "s");
	} else {
		debug("protocol %i from tap: %s -> %s (%i packet%s)",
		      proto, src, dst,
		      count, count == 1 ? "" : "s");
	}
}
/** /**
* tap4_handler() - IPv4 and ARP packet handler for tap file descriptor * tap4_handler() - IPv4 and ARP packet handler for tap file descriptor
* @c: Execution context * @c: Execution context
* @msg: Array of messages with the same L3 protocol * @msg: Array of messages with IPv4 or ARP protocol
* @count: Count of messages with the same L3 protocol * @count: Count of messages
* @now: Current timestamp * @now: Current timestamp
* @first: First call for an IPv4 packet in this batch
* *
* Return: count of packets consumed by handlers * Return: count of packets consumed by handlers
*/ */
static int tap4_handler(struct ctx *c, struct tap_msg *msg, size_t count, static int tap4_handler(struct ctx *c, struct tap_msg *msg, size_t count,
struct timespec *now, int first) struct timespec *now)
{ {
char buf_s[INET_ADDRSTRLEN] __attribute((__unused__)); unsigned int i, j, seq_count;
char buf_d[INET_ADDRSTRLEN] __attribute((__unused__)); struct tap_l4_msg *l4_msg;
struct ethhdr *eh = (struct ethhdr *)msg[0].start; struct tap_l4_seq4 *seq;
struct iphdr *iph, *prev_iph = NULL; size_t len, l4_len;
struct udphdr *uh, *prev_uh = NULL; struct ethhdr *eh;
size_t len = msg[0].len; struct iphdr *iph;
unsigned int i; struct udphdr *uh;
char *l4h; char *l4h;
if (!c->v4) if (!c->v4)
return count; return count;
if (len < sizeof(*eh) + sizeof(*iph)) i = 0;
return 1; resume:
for (seq_count = 0, seq = NULL; i < count; i++) {
if (arp(c, eh, len) || dhcp(c, eh, len)) eh = (struct ethhdr *)(pkt_buf + msg[i].pkt_buf_offset);
return 1;
for (i = 0; i < count; i++) {
len = msg[i].len; len = msg[i].len;
if (len < sizeof(*eh))
continue;
if (ntohs(eh->h_proto) == ETH_P_ARP && arp(c, eh, len))
continue;
if (len < sizeof(*eh) + sizeof(*iph)) if (len < sizeof(*eh) + sizeof(*iph))
return 1; continue;
eh = (struct ethhdr *)msg[i].start;
iph = (struct iphdr *)(eh + 1); iph = (struct iphdr *)(eh + 1);
l4h = (char *)iph + iph->ihl * 4; if ((iph->ihl * 4) + sizeof(*eh) > len)
continue;
if (iph->ihl * 4 < sizeof(*iph))
continue;
if (first && c->addr4_seen != iph->saddr) { if (iph->saddr && c->addr4_seen != iph->saddr) {
c->addr4_seen = iph->saddr; c->addr4_seen = iph->saddr;
proto_update_l2_buf(NULL, NULL, &c->addr4_seen); proto_update_l2_buf(NULL, NULL, &c->addr4_seen);
} }
msg[i].l4h = l4h; l4h = (char *)iph + iph->ihl * 4;
msg[i].l4_len = len - ((intptr_t)l4h - (intptr_t)eh); l4_len = len - ((intptr_t)l4h - (intptr_t)eh);
if (iph->protocol != IPPROTO_TCP && if (iph->protocol == IPPROTO_ICMP) {
iph->protocol != IPPROTO_UDP) struct tap_l4_msg icmp_msg = { l4h - pkt_buf,
break; l4_len };
if (len < sizeof(*uh)) if (l4_len < sizeof(struct icmphdr))
break; continue;
uh = (struct udphdr *)l4h; tap_packet_debug(iph, NULL, NULL, 0, NULL, 1);
if (!c->no_icmp) {
if (!i) { icmp_tap_handler(c, AF_INET, &iph->daddr,
prev_iph = iph; &icmp_msg, 1, now);
prev_uh = uh; }
continue; continue;
} }
if (iph->tos != prev_iph->tos || if (l4_len < sizeof(*uh))
iph->frag_off != prev_iph->frag_off || continue;
iph->protocol != prev_iph->protocol ||
iph->saddr != prev_iph->saddr ||
iph->daddr != prev_iph->daddr ||
uh->source != prev_uh->source ||
uh->dest != prev_uh->dest)
break;
prev_iph = iph; uh = (struct udphdr *)l4h;
prev_uh = uh;
if (iph->protocol == IPPROTO_UDP && dhcp(c, eh, len))
continue;
if (iph->protocol != IPPROTO_TCP &&
iph->protocol != IPPROTO_UDP) {
tap_packet_debug(iph, NULL, NULL, 0, NULL, 1);
continue;
}
#define L4_MATCH(iph, uh, seq) \
(seq->protocol == iph->protocol && \
seq->source == uh->source && seq->dest == uh->dest && \
seq->saddr == iph->saddr && seq->daddr == iph->daddr)
#define L4_SET(iph, uh, seq) \
do { \
seq->protocol = iph->protocol; \
seq->source = uh->source; \
seq->dest = uh->dest; \
seq->saddr = iph->saddr; \
seq->daddr = iph->daddr; \
} while (0)
if (seq && L4_MATCH(iph, uh, seq) && seq->msgs < UIO_MAXIOV)
goto append;
for (seq = l4_seq4 + seq_count - 1; seq >= l4_seq4; seq--) {
if (L4_MATCH(iph, uh, seq)) {
if (seq->msgs >= UIO_MAXIOV)
seq = l4_seq4 - 1;
break;
}
}
if (seq < l4_seq4) {
seq = l4_seq4 + seq_count++;
L4_SET(iph, uh, seq);
seq->msgs = 0;
}
#undef L4_MATCH
#undef L4_SET
append:
l4_msg = &seq->msg[seq->msgs++];
l4_msg->pkt_buf_offset = l4h - pkt_buf;
l4_msg->l4_len = l4_len;
if (seq_count == UIO_MAXIOV)
break; /* Resume after flushing if i < count */
} }
eh = (struct ethhdr *)msg[0].start; for (j = 0, seq = l4_seq4; j < seq_count; j++, seq++) {
iph = (struct iphdr *)(eh + 1); int n = seq->msgs;
if (iph->protocol == IPPROTO_TCP || iph->protocol == IPPROTO_UDP || l4_msg = seq->msg;
iph->protocol == IPPROTO_SCTP) {
uh = (struct udphdr *)msg[0].l4h;
if (msg[0].len < sizeof(*uh)) tap_packet_debug(NULL, NULL, seq, 0, NULL, n);
return 1;
debug("%s (%i) from tap: %s:%i -> %s:%i (%i packet%s)", if (seq->protocol == IPPROTO_TCP) {
IP_PROTO_STR(iph->protocol), iph->protocol, if (c->no_tcp)
inet_ntop(AF_INET, &iph->saddr, buf_s, sizeof(buf_s)), continue;
ntohs(uh->source), while ((n -= tcp_tap_handler(c, AF_INET, &seq->daddr,
inet_ntop(AF_INET, &iph->daddr, buf_d, sizeof(buf_d)), l4_msg, n, now)));
ntohs(uh->dest), } else if (seq->protocol == IPPROTO_UDP) {
i, i > 1 ? "s" : ""); if (c->no_udp)
} else if (iph->protocol == IPPROTO_ICMP) { continue;
debug("icmp from tap: %s -> %s", while ((n -= udp_tap_handler(c, AF_INET, &seq->daddr,
inet_ntop(AF_INET, &iph->saddr, buf_s, sizeof(buf_s)), l4_msg, n, now)));
inet_ntop(AF_INET, &iph->daddr, buf_d, sizeof(buf_d))); }
} }
if (iph->protocol == IPPROTO_TCP) { if (i < count)
if (c->no_tcp) goto resume;
return i;
return tcp_tap_handler(c, AF_INET, &iph->daddr, msg, i, now);
}
if (iph->protocol == IPPROTO_UDP) { return count;
if (c->no_udp)
return i;
return udp_tap_handler(c, AF_INET, &iph->daddr, msg, i, now);
}
if (iph->protocol == IPPROTO_ICMP) {
if (c->no_icmp)
return 1;
icmp_tap_handler(c, AF_INET, &iph->daddr, msg, 1, now);
}
return 1;
} }
/** /**
* tap6_handler() - IPv6 packet handler for tap file descriptor * tap6_handler() - IPv6 packet handler for tap file descriptor
* @c: Execution context * @c: Execution context
* @msg: Array of messages with the same L3 protocol * @msg: Array of messages with IPv6 protocol
* @count: Count of messages with the same L3 protocol * @count: Count of messages
* @now: Current timestamp * @now: Current timestamp
* @first: First call for an IPv6 packet in this batch
* *
* Return: count of packets consumed by handlers * Return: count of packets consumed by handlers
*/ */
static int tap6_handler(struct ctx *c, struct tap_msg *msg, size_t count, static int tap6_handler(struct ctx *c, struct tap_msg *msg, size_t count,
struct timespec *now, int first) struct timespec *now)
{ {
char buf_s[INET6_ADDRSTRLEN], buf_d[INET6_ADDRSTRLEN]; unsigned int i, j, seq_count = 0;
struct ethhdr *eh = (struct ethhdr *)msg[0].start; struct tap_l4_msg *l4_msg;
struct udphdr *uh, *prev_uh = NULL; struct tap_l4_seq6 *seq;
uint8_t proto = 0, prev_proto = 0;
size_t len = msg[0].len;
struct ipv6hdr *ip6h; struct ipv6hdr *ip6h;
unsigned int i; size_t len, l4_len;
struct ethhdr *eh;
struct udphdr *uh;
uint8_t proto;
char *l4h; char *l4h;
if (!c->v6) if (!c->v6)
return count; return count;
if (len < sizeof(*eh) + sizeof(*ip6h)) i = 0;
return 1; resume:
for (seq_count = 0, seq = NULL; i < count; i++) {
if (ndp(c, eh, len) || dhcpv6(c, eh, len)) eh = (struct ethhdr *)(pkt_buf + msg[i].pkt_buf_offset);
return 1;
for (i = 0; i < count; i++) {
struct ipv6hdr *p_ip6h;
len = msg[i].len; len = msg[i].len;
if (len < sizeof(*eh))
continue;
if (len < sizeof(*eh) + sizeof(*ip6h)) if (len < sizeof(*eh) + sizeof(*ip6h))
return 1; return 1;
eh = (struct ethhdr *)msg[i].start;
ip6h = (struct ipv6hdr *)(eh + 1); ip6h = (struct ipv6hdr *)(eh + 1);
l4h = ipv6_l4hdr(ip6h, &proto);
msg[i].l4h = l4h; if (IN6_IS_ADDR_LINKLOCAL(&ip6h->saddr)) {
msg[i].l4_len = len - ((intptr_t)l4h - (intptr_t)eh); c->addr6_ll_seen = ip6h->saddr;
if (first) { if (IN6_IS_ADDR_UNSPECIFIED(&c->addr6_seen)) {
if (IN6_IS_ADDR_LINKLOCAL(&ip6h->saddr)) {
c->addr6_ll_seen = ip6h->saddr;
if (IN6_IS_ADDR_UNSPECIFIED(&c->addr6_seen)) {
c->addr6_seen = ip6h->saddr;
}
} else {
c->addr6_seen = ip6h->saddr; c->addr6_seen = ip6h->saddr;
} }
} else {
c->addr6_seen = ip6h->saddr;
} }
ip6h->saddr = c->addr6; if (ntohs(ip6h->payload_len) >
len - sizeof(*eh) - sizeof(*ip6h))
continue;
if (proto != IPPROTO_TCP && proto != IPPROTO_UDP) if (!(l4h = ipv6_l4hdr(ip6h, &proto)))
break; continue;
if (len < sizeof(*uh)) l4_len = len - ((intptr_t)l4h - (intptr_t)eh);
break;
uh = (struct udphdr *)l4h; if (proto == IPPROTO_ICMPV6) {
struct tap_l4_msg icmpv6_msg = { l4h - pkt_buf,
l4_len };
if (!i) { if (l4_len < sizeof(struct icmp6hdr))
p_ip6h = ip6h; continue;
prev_proto = proto;
prev_uh = uh; if (ndp(c, eh, len))
continue;
tap_packet_debug(NULL, ip6h, NULL, proto, NULL, 1);
if (!c->no_icmp) {
icmp_tap_handler(c, AF_INET6, &ip6h->daddr,
&icmpv6_msg, 1, now);
}
continue; continue;
} }
if (proto != prev_proto || if (l4_len < sizeof(*uh))
memcmp(&ip6h->saddr, &p_ip6h->saddr, sizeof(ip6h->saddr)) || continue;
memcmp(&ip6h->daddr, &p_ip6h->daddr, sizeof(ip6h->daddr)) ||
uh->source != prev_uh->source ||
uh->dest != prev_uh->dest)
break;
p_ip6h = ip6h; uh = (struct udphdr *)l4h;
prev_proto = proto;
prev_uh = uh; if (proto == IPPROTO_UDP && dhcpv6(c, eh, len))
continue;
ip6h->saddr = c->addr6;
if (proto != IPPROTO_TCP && proto != IPPROTO_UDP) {
tap_packet_debug(NULL, ip6h, NULL, proto, NULL, 1);
continue;
}
#define L4_MATCH(ip6h, proto, uh, seq) \
(seq->protocol == proto && \
seq->source == uh->source && seq->dest == uh->dest && \
!memcmp(&seq->saddr, &ip6h->saddr, sizeof(seq->saddr)) && \
!memcmp(&seq->daddr, &ip6h->daddr, sizeof(seq->daddr)))
#define L4_SET(ip6h, proto, uh, seq) \
do { \
seq->protocol = proto; \
seq->source = uh->source; \
seq->dest = uh->dest; \
seq->saddr = ip6h->saddr; \
seq->daddr = ip6h->daddr; \
} while (0)
if (seq && L4_MATCH(ip6h, proto, uh, seq) &&
seq->msgs < UIO_MAXIOV)
goto append;
for (seq = l4_seq6 + seq_count - 1; seq >= l4_seq6; seq--) {
if (L4_MATCH(ip6h, proto, uh, seq)) {
if (seq->msgs >= UIO_MAXIOV)
seq = l4_seq6 - 1;
break;
}
}
if (seq < l4_seq6) {
seq = l4_seq6 + seq_count++;
L4_SET(ip6h, proto, uh, seq);
seq->msgs = 0;
}
#undef L4_MATCH
#undef L4_SET
append:
l4_msg = &seq->msg[seq->msgs++];
l4_msg->pkt_buf_offset = l4h - pkt_buf;
l4_msg->l4_len = l4_len;
if (seq_count == UIO_MAXIOV)
break; /* Resume after flushing if i < count */
} }
if (prev_proto) for (j = 0, seq = l4_seq6; j < seq_count; j++, seq++) {
proto = prev_proto; int n = seq->msgs;
eh = (struct ethhdr *)msg[0].start; l4_msg = seq->msg;
ip6h = (struct ipv6hdr *)(eh + 1);
if (proto == IPPROTO_ICMPV6) { tap_packet_debug(NULL, NULL, NULL, seq->protocol, seq, n);
debug("icmpv6 from tap: %s ->\n\t%s",
inet_ntop(AF_INET6, &ip6h->saddr, buf_s, sizeof(buf_s)),
inet_ntop(AF_INET6, &ip6h->daddr, buf_d, sizeof(buf_d)));
} else if (proto == IPPROTO_TCP || proto == IPPROTO_UDP ||
proto == IPPROTO_SCTP) {
uh = (struct udphdr *)msg[0].l4h;
if (msg[0].len < sizeof(*uh)) if (seq->protocol == IPPROTO_TCP) {
return 1; if (c->no_tcp)
continue;
debug("%s (%i) from tap: [%s]:%i\n\t-> [%s]:%i (%i packet%s)", while ((n -= tcp_tap_handler(c, AF_INET6, &seq->daddr,
IP_PROTO_STR(proto), proto, l4_msg, n, now)));
inet_ntop(AF_INET6, &ip6h->saddr, buf_s, sizeof(buf_s)), } else if (seq->protocol == IPPROTO_UDP) {
ntohs(uh->source), if (c->no_udp)
inet_ntop(AF_INET6, &ip6h->daddr, buf_d, sizeof(buf_d)), continue;
ntohs(uh->dest), while ((n -= udp_tap_handler(c, AF_INET6, &seq->daddr,
i, i > 1 ? "s" : ""); l4_msg, n, now)));
}
} }
if (proto == IPPROTO_TCP) { if (i < count)
if (c->no_tcp) goto resume;
return i;
return tcp_tap_handler(c, AF_INET6, &ip6h->daddr, msg, i, now);
}
if (proto == IPPROTO_UDP) { return count;
if (c->no_udp)
return i;
return udp_tap_handler(c, AF_INET6, &ip6h->daddr, msg, i, now);
}
if (proto == IPPROTO_ICMPV6) {
if (c->no_icmp)
return 1;
icmp_tap_handler(c, AF_INET6, &ip6h->daddr, msg, 1, now);
}
return 1;
} }
/** /**
@ -460,10 +615,14 @@ static int tap6_handler(struct ctx *c, struct tap_msg *msg, size_t count,
*/ */
static int tap_handler_passt(struct ctx *c, struct timespec *now) static int tap_handler_passt(struct ctx *c, struct timespec *now)
{ {
int msg_count = 0, same, i = 0, first_v4 = 1, first_v6 = 1; int seq4_i, seq6_i;
struct ethhdr *eh; struct ethhdr *eh;
char *p = pkt_buf;
ssize_t n, rem; ssize_t n, rem;
char *p;
redo:
p = pkt_buf;
seq4_i = seq6_i = rem = 0;
n = recv(c->fd_tap, p, TAP_BUF_FILL, MSG_DONTWAIT); n = recv(c->fd_tap, p, TAP_BUF_FILL, MSG_DONTWAIT);
if (n < 0) { if (n < 0) {
@ -479,30 +638,27 @@ static int tap_handler_passt(struct ctx *c, struct timespec *now)
while (n > (ssize_t)sizeof(uint32_t)) { while (n > (ssize_t)sizeof(uint32_t)) {
ssize_t len = ntohl(*(uint32_t *)p); ssize_t len = ntohl(*(uint32_t *)p);
if (len < (ssize_t)sizeof(*eh) || len > ETH_MAX_MTU)
return 0;
p += sizeof(uint32_t); p += sizeof(uint32_t);
n -= sizeof(uint32_t); n -= sizeof(uint32_t);
/* At most one packet might not fit in a single read */ /* At most one packet might not fit in a single read, and this
* needs to be blocking.
*/
if (len > n) { if (len > n) {
rem = recv(c->fd_tap, p + n, len - n, MSG_DONTWAIT); rem = recv(c->fd_tap, p + n, len - n, 0);
if ((n += rem) != len) if ((n += rem) != len)
return 0; return 0;
} }
/* Complete the partial read above before discarding a malformed
* frame, otherwise the stream will be inconsistent.
*/
if (len < (ssize_t)sizeof(*eh) || len > ETH_MAX_MTU)
goto next;
pcap(p, len); pcap(p, len);
tap_msgs[msg_count].start = p; eh = (struct ethhdr *)p;
tap_msgs[msg_count++].len = len;
n -= len;
p += len;
}
while (i < msg_count) {
eh = (struct ethhdr *)tap_msgs[i].start;
if (memcmp(c->mac_guest, eh->h_source, ETH_ALEN)) { if (memcmp(c->mac_guest, eh->h_source, ETH_ALEN)) {
memcpy(c->mac_guest, eh->h_source, ETH_ALEN); memcpy(c->mac_guest, eh->h_source, ETH_ALEN);
@ -511,52 +667,33 @@ static int tap_handler_passt(struct ctx *c, struct timespec *now)
switch (ntohs(eh->h_proto)) { switch (ntohs(eh->h_proto)) {
case ETH_P_ARP: case ETH_P_ARP:
if (c->v4)
tap4_handler(c, tap_msgs + i, 1, now, 1);
i++;
break;
case ETH_P_IP: case ETH_P_IP:
for (same = 1; i + same < msg_count && seq4[seq4_i].pkt_buf_offset = p - pkt_buf;
same < UIO_MAXIOV; same++) { seq4[seq4_i++].len = len;
struct tap_msg *next = &tap_msgs[i + same];
eh = (struct ethhdr *)next->start;
if (ntohs(eh->h_proto) != ETH_P_IP)
break;
}
if (!c->v4) {
i += same;
break;
}
i += tap4_handler(c, tap_msgs + i, same, now, first_v4);
first_v4 = 0;
break; break;
case ETH_P_IPV6: case ETH_P_IPV6:
for (same = 1; i + same < msg_count && seq6[seq6_i].pkt_buf_offset = p - pkt_buf;
same < UIO_MAXIOV; same++) { seq6[seq6_i++].len = len;
struct tap_msg *next = &tap_msgs[i + same];
eh = (struct ethhdr *)next->start;
if (ntohs(eh->h_proto) != ETH_P_IPV6)
break;
}
if (!c->v6) {
i += same;
break;
}
i += tap6_handler(c, tap_msgs + i, same, now, first_v6);
first_v6 = 0;
break; break;
default: default:
i++;
break; break;
} }
next:
p += len;
n -= len;
} }
if (seq4_i)
tap4_handler(c, seq4, seq4_i, now);
if (seq6_i)
tap6_handler(c, seq6, seq6_i, now);
/* We can't use EPOLLET otherwise. */
if (rem)
goto redo;
return 0; return 0;
} }
@ -569,14 +706,19 @@ static int tap_handler_passt(struct ctx *c, struct timespec *now)
*/ */
static int tap_handler_pasta(struct ctx *c, struct timespec *now) static int tap_handler_pasta(struct ctx *c, struct timespec *now)
{ {
struct tap_msg msg = { .start = pkt_buf }; ssize_t n = 0, len;
ssize_t n; int err, seq4_i = 0, seq6_i = 0;
while ((n = read(c->fd_tap, pkt_buf, TAP_BUF_BYTES)) > 0) { restart:
struct ethhdr *eh = (struct ethhdr *)pkt_buf; while ((len = read(c->fd_tap, pkt_buf + n, TAP_BUF_BYTES - n)) > 0) {
msg.len = n; struct ethhdr *eh = (struct ethhdr *)(pkt_buf + n);
pcap(msg.start, msg.len); if (len < (ssize_t)sizeof(*eh) || len > ETH_MAX_MTU) {
n += len;
continue;
}
pcap(pkt_buf + n, len);
if (memcmp(c->mac_guest, eh->h_source, ETH_ALEN)) { if (memcmp(c->mac_guest, eh->h_source, ETH_ALEN)) {
memcpy(c->mac_guest, eh->h_source, ETH_ALEN); memcpy(c->mac_guest, eh->h_source, ETH_ALEN);
@ -585,21 +727,33 @@ static int tap_handler_pasta(struct ctx *c, struct timespec *now)
switch (ntohs(eh->h_proto)) { switch (ntohs(eh->h_proto)) {
case ETH_P_ARP: case ETH_P_ARP:
if (c->v4)
tap4_handler(c, &msg, 1, now, 1);
break;
case ETH_P_IP: case ETH_P_IP:
if (c->v4) seq4[seq4_i].pkt_buf_offset = n;
tap4_handler(c, &msg, 1, now, 1); seq4[seq4_i++].len = len;
break; break;
case ETH_P_IPV6: case ETH_P_IPV6:
if (c->v6) seq6[seq6_i].pkt_buf_offset = n;
tap6_handler(c, &msg, 1, now, 1); seq6[seq6_i++].len = len;
break;
default:
break; break;
} }
n += len;
} }
if (!n || errno == EINTR || errno == EAGAIN || errno == EWOULDBLOCK) if (len < 0 && errno == EINTR)
goto restart;
err = errno;
if (seq4_i)
tap4_handler(c, seq4, seq4_i, now);
if (seq6_i)
tap6_handler(c, seq6, seq6_i, now);
if (len > 0 || err == EAGAIN)
return 0; return 0;
epoll_ctl(c->epollfd, EPOLL_CTL_DEL, c->fd_tap, NULL); epoll_ctl(c->epollfd, EPOLL_CTL_DEL, c->fd_tap, NULL);
@ -753,12 +907,14 @@ void tap_sock_init(struct ctx *c)
close(c->fd_tap); close(c->fd_tap);
} }
if (c->mode == MODE_PASST) if (c->mode == MODE_PASST) {
tap_sock_init_unix(c); tap_sock_init_unix(c);
else ev.events = EPOLLIN | EPOLLET | EPOLLRDHUP;
} else {
tap_sock_init_tun(c); tap_sock_init_tun(c);
ev.events = EPOLLIN | EPOLLRDHUP;
}
ev.events = EPOLLIN | EPOLLRDHUP;
ev.data.fd = c->fd_tap; ev.data.fd = c->fd_tap;
epoll_ctl(c->epollfd, EPOLL_CTL_ADD, c->fd_tap, &ev); epoll_ctl(c->epollfd, EPOLL_CTL_ADD, c->fd_tap, &ev);
} }

26
tcp.c
View file

@ -333,6 +333,7 @@
#include <sys/random.h> #include <sys/random.h>
#include <sys/socket.h> #include <sys/socket.h>
#include <sys/types.h> #include <sys/types.h>
#include <sys/uio.h>
#include <unistd.h> #include <unistd.h>
#include <linux/ip.h> #include <linux/ip.h>
#include <linux/ipv6.h> #include <linux/ipv6.h>
@ -645,7 +646,7 @@ static struct mmsghdr tcp_l2_mh_tap [TCP_TAP_FRAMES] = {
}; };
/* sendmsg() to socket */ /* sendmsg() to socket */
static struct iovec tcp_tap_iov [TAP_MSGS]; static struct iovec tcp_tap_iov [UIO_MAXIOV];
/* Bitmap, activity monitoring needed for connection via tap */ /* Bitmap, activity monitoring needed for connection via tap */
static uint8_t tcp_act[MAX_TAP_CONNS / 8] = { 0 }; static uint8_t tcp_act[MAX_TAP_CONNS / 8] = { 0 };
@ -1968,7 +1969,7 @@ out_restore_iov:
* @now: Current timestamp * @now: Current timestamp
*/ */
static void tcp_data_from_tap(struct ctx *c, struct tcp_tap_conn *conn, static void tcp_data_from_tap(struct ctx *c, struct tcp_tap_conn *conn,
struct tap_msg *msg, int count, struct tap_l4_msg *msg, int count,
struct timespec *now) struct timespec *now)
{ {
int i, iov_i, ack = 0, fin = 0, retr = 0, keep = -1; int i, iov_i, ack = 0, fin = 0, retr = 0, keep = -1;
@ -1979,10 +1980,13 @@ static void tcp_data_from_tap(struct ctx *c, struct tcp_tap_conn *conn,
ssize_t len; ssize_t len;
for (i = 0, iov_i = 0; i < count; i++) { for (i = 0, iov_i = 0; i < count; i++) {
struct tcphdr *th = (struct tcphdr *)msg[i].l4h;
uint32_t seq, seq_offset, ack_seq; uint32_t seq, seq_offset, ack_seq;
size_t len = msg[i].l4_len, off; struct tcphdr *th;
char *data; char *data;
size_t off;
th = (struct tcphdr *)(pkt_buf + msg[i].pkt_buf_offset);
len = msg[i].l4_len;
if (len < sizeof(*th)) { if (len < sizeof(*th)) {
tcp_rst(c, conn); tcp_rst(c, conn);
@ -2152,19 +2156,11 @@ out:
* Return: count of consumed packets * Return: count of consumed packets
*/ */
int tcp_tap_handler(struct ctx *c, int af, void *addr, int tcp_tap_handler(struct ctx *c, int af, void *addr,
struct tap_msg *msg, int count, struct timespec *now) struct tap_l4_msg *msg, int count, struct timespec *now)
{ {
struct tcphdr *th = (struct tcphdr *)msg[0].l4h; struct tcphdr *th = (struct tcphdr *)(pkt_buf + msg[0].pkt_buf_offset);
size_t len = msg[0].l4_len, off; uint16_t len = msg[0].l4_len;
struct tcp_tap_conn *conn; struct tcp_tap_conn *conn;
int ws;
if (len < sizeof(*th))
return 1;
off = th->doff * 4;
if (off < sizeof(*th) || off > len)
return 1;
conn = tcp_hash_lookup(c, af, addr, htons(th->source), htons(th->dest)); conn = tcp_hash_lookup(c, af, addr, htons(th->source), htons(th->dest));
if (!conn) { if (!conn) {

13
udp.c
View file

@ -879,12 +879,12 @@ void udp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events,
* Return: count of consumed packets * Return: count of consumed packets
*/ */
int udp_tap_handler(struct ctx *c, int af, void *addr, int udp_tap_handler(struct ctx *c, int af, void *addr,
struct tap_msg *msg, int count, struct timespec *now) struct tap_l4_msg *msg, int count, struct timespec *now)
{ {
/* The caller already checks that all the messages have the same source /* The caller already checks that all the messages have the same source
* and destination, so we can just take those from the first message. * and destination, so we can just take those from the first message.
*/ */
struct udphdr *uh = (struct udphdr *)msg[0].l4h; struct udphdr *uh = (struct udphdr *)(pkt_buf + msg[0].pkt_buf_offset);
struct mmsghdr mm[UIO_MAXIOV] = { 0 }; struct mmsghdr mm[UIO_MAXIOV] = { 0 };
struct iovec m[UIO_MAXIOV]; struct iovec m[UIO_MAXIOV];
struct sockaddr_in6 s_in6; struct sockaddr_in6 s_in6;
@ -972,7 +972,10 @@ int udp_tap_handler(struct ctx *c, int af, void *addr,
} }
for (i = 0; i < count; i++) { for (i = 0; i < count; i++) {
m[i].iov_base = (char *)((struct udphdr *)msg[i].l4h + 1); struct udphdr *uh;
uh = (struct udphdr *)(msg[i].pkt_buf_offset + pkt_buf);
m[i].iov_base = (char *)(uh + 1);
m[i].iov_len = msg[i].l4_len - sizeof(*uh); m[i].iov_len = msg[i].l4_len - sizeof(*uh);
mm[i].msg_hdr.msg_name = sa; mm[i].msg_hdr.msg_name = sa;
@ -1084,12 +1087,14 @@ static void udp_splice_iov_init(void)
* *
* Return: 0 on success, -1 on failure * Return: 0 on success, -1 on failure
*/ */
int udp_sock_init(struct ctx *c) int udp_sock_init(struct ctx *c, struct timespec *now)
{ {
union udp_epoll_ref uref = { .bound = 1 }; union udp_epoll_ref uref = { .bound = 1 };
in_port_t dst; in_port_t dst;
int s; int s;
(void)now;
for (dst = 0; dst < USHRT_MAX; dst++) { for (dst = 0; dst < USHRT_MAX; dst++) {
if (!bitmap_isset(c->udp.port_to_tap, dst)) if (!bitmap_isset(c->udp.port_to_tap, dst))
continue; continue;

4
udp.h
View file

@ -6,8 +6,8 @@
void udp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events, void udp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events,
struct timespec *now); struct timespec *now);
int udp_tap_handler(struct ctx *c, int af, void *addr, int udp_tap_handler(struct ctx *c, int af, void *addr,
struct tap_msg *msg, int count, struct timespec *now); struct tap_l4_msg *msg, int count, struct timespec *now);
int udp_sock_init(struct ctx *c); int udp_sock_init(struct ctx *c, struct timespec *now);
void udp_timer(struct ctx *c, struct timespec *ts); void udp_timer(struct ctx *c, struct timespec *ts);
void udp_update_l2_buf(unsigned char *eth_d, unsigned char *eth_s, void udp_update_l2_buf(unsigned char *eth_d, unsigned char *eth_s,
uint32_t *ip_da); uint32_t *ip_da);