passt: Assorted fixes from "fresh eyes" review

A bunch of fixes not worth single commits at this stage, notably:

- make buffer, length parameter ordering consistent in ARP, DHCP,
  NDP handlers

- strict checking of buffer, message and option length in DHCP
  handler (a malicious client could have easily crashed it)

- set up forwarding for IPv4 and IPv6, and masquerading with nft for
  IPv4, from demo script

- get rid of separate slow and fast timers, we don't save any
  overhead that way

- stricter checking of buffer lengths as passed to tap handlers

- proper dequeuing from qemu socket back-end: I accidentally trashed
  messages that were bundled up together in a single tap read
  operation -- the length header tells us what's the size of the next
  frame, but there's no apparent limit to the number of messages we
  get with one single receive

- rework some bits of the TCP state machine, now passive and active
  connection closes appear to be robust -- introduce a new
  FIN_WAIT_1_SOCK_FIN state indicating a FIN_WAIT_1 with a FIN flag
  from socket

- streamline TCP option parsing routine

- track TCP state changes to stderr (this is temporary, proper
  debugging and syslogging support pending)

- observe that multiplying a number by four might very well change
  its value, and this happens to be the case for the data offset
  from the TCP header as we check if it's the same as the total
  length to find out if it's a duplicated ACK segment

- recent estimates suggest that the duration of a millisecond is
  closer to a million nanoseconds than a thousand of them, this
  trend is now reflected into the timespec_diff_ms() convenience
  routine

Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
This commit is contained in:
Stefano Brivio 2021-02-21 11:33:38 +01:00
parent 105b916361
commit 8bca388e8a
12 changed files with 462 additions and 458 deletions

2
arp.c
View file

@ -49,7 +49,7 @@ struct arpmsg {
*
* Return: 0 if it's not an ARP message, 1 if handled, -1 on failure
*/
int arp(struct ctx *c, unsigned len, struct ethhdr *eh)
int arp(struct ctx *c, struct ethhdr *eh, size_t len)
{
struct arphdr *ah = (struct arphdr *)(eh + 1);
struct arpmsg *am = (struct arpmsg *)(ah + 1);

2
arp.h
View file

@ -1 +1 @@
int arp(struct ctx *c, unsigned len, struct ethhdr *eh);
int arp(struct ctx *c, struct ethhdr *eh, size_t len);

29
dhcp.c
View file

@ -163,22 +163,39 @@ static int fill(struct msg *m)
*
* Return: 0 if it's not a DHCP message, 1 if handled, -1 on failure
*/
int dhcp(struct ctx *c, unsigned len, struct ethhdr *eh)
int dhcp(struct ctx *c, struct ethhdr *eh, size_t len)
{
struct iphdr *iph = (struct iphdr *)(eh + 1);
struct udphdr *uh = (struct udphdr *)((char *)iph + iph->ihl * 4);
struct msg *m = (struct msg *)(uh + 1);
unsigned int i, mlen = len - sizeof(*eh) - sizeof(*iph);
size_t mlen, olen;
struct udphdr *uh;
unsigned int i;
struct msg *m;
if (len < sizeof(*eh) + sizeof(*iph))
return 0;
if (len < sizeof(*eh) + iph->ihl * 4 + sizeof(*uh))
return 0;
uh = (struct udphdr *)((char *)iph + iph->ihl * 4);
m = (struct msg *)(uh + 1);
if (uh->dest != htons(67))
return 0;
if (mlen != ntohs(uh->len) || mlen < offsetof(struct msg, o) ||
mlen = len - sizeof(*eh) - iph->ihl * 4 - sizeof(*uh);
if (mlen != ntohs(uh->len) - sizeof(*uh) ||
mlen < offsetof(struct msg, o) ||
m->op != BOOTREQUEST)
return -1;
for (i = 0; i < mlen - offsetof(struct msg, o); i += m->o[i + 1] + 2)
olen = mlen - offsetof(struct msg, o);
for (i = 0; i + 2 < olen; i += m->o[i + 1] + 2) {
if (m->o[i + 1] + i + 2 >= olen)
return -1;
memcpy(&opts[m->o[i]].c, &m->o[i + 2], m->o[i + 1]);
}
if (opts[53].c[0] == DHCPDISCOVER) {
fprintf(stderr, "DHCP: offer to discover");

2
dhcp.h
View file

@ -1 +1 @@
int dhcp(struct ctx *c, unsigned len, struct ethhdr *eh);
int dhcp(struct ctx *c, struct ethhdr *eh, size_t len);

View file

@ -48,10 +48,17 @@ ip netns add passt
ip link add veth_passt up netns passt type veth peer name veth_passt
ip link set dev veth_passt up
ip -n passt addr add 192.0.2.2/24 dev veth_passt
ip addr add 192.0.2.1/24 dev veth_passt
ip -n passt route add default via 192.0.2.1
sysctl -w net.ipv4.ip_forward=1
nft delete table passt_nat 2>/dev/null || :
nft add table passt_nat
nft 'add chain passt_nat postrouting { type nat hook postrouting priority -100 ; }'
nft add rule passt_nat postrouting ip saddr 192.0.2.2 masquerade
ipv6_addr="$(ipv6_devaddr "$(ipv6_dev)")"
ipv6_passt="$(ipv6_mangle "${ipv6_addr}")"
ndp_setup "${ipv6_passt}"
@ -59,11 +66,15 @@ ip -n passt addr add "${ipv6_passt}/$(ipv6_mask "${ipv6_addr}")" dev veth_passt
ip addr add "${ipv6_addr}" dev veth_passt
passt_ll="$(ipv6_ll_addr "veth_passt")"
main_ll="$(get_token "link/ether" $(ip -o li sh veth_passt))"
ip -n passt neigh add "${passt_ll%%/*}" dev veth_passt lladdr "${main_ll}"
ip neigh add "${passt_ll%%/*}" dev veth_passt lladdr "${main_ll}"
ip -n passt route add default via "${passt_ll%%/*}" dev veth_passt
sysctl -w net.ipv6.conf.all.forwarding=1
ethtool -K veth_passt tx off
ip netns exec passt ethtool -K veth_passt tx off
ulimit -n 300000
ip netns exec passt ./passt

2
ndp.c
View file

@ -40,7 +40,7 @@
*
* Return: 0 if not handled here, 1 if handled, -1 on failure
*/
int ndp(struct ctx *c, unsigned len, struct ethhdr *eh)
int ndp(struct ctx *c, struct ethhdr *eh, size_t len)
{
struct ethhdr *ehr;
struct ipv6hdr *ip6h = (struct ipv6hdr *)(eh + 1), *ip6hr;

2
ndp.h
View file

@ -1 +1 @@
int ndp(struct ctx *c, unsigned len, struct ethhdr *eh);
int ndp(struct ctx *c, struct ethhdr *eh, size_t len);

148
passt.c
View file

@ -51,9 +51,7 @@
#define EPOLL_EVENTS 10
#define EPOLL_TIMEOUT 100 /* ms, for protocol periodic handlers */
#define PERIODIC_HANDLER_FAST 100
#define PERIODIC_HANDLER_SLOW 1000
#define TIMER_INTERVAL 20 /* ms, for protocol periodic handlers */
/**
* sock_unix() - Create and bind AF_UNIX socket, add to epoll list
@ -294,7 +292,7 @@ static void get_dns(struct ctx *c)
}
/**
* tap4_handler() - IPv4 packet handler for tap file descriptor
* tap4_handler() - IPv4 and ARP packet handler for tap file descriptor
* @c: Execution context
* @len: Total L2 packet length
* @in: Packet buffer, L2 headers
@ -303,12 +301,18 @@ static void tap4_handler(struct ctx *c, char *in, size_t len)
{
struct ethhdr *eh = (struct ethhdr *)in;
struct iphdr *iph = (struct iphdr *)(eh + 1);
char *l4h = (char *)iph + iph->ihl * 4;
char buf_s[BUFSIZ], buf_d[BUFSIZ];
char *l4h;
if (arp(c, len, eh) || dhcp(c, len, eh))
if (arp(c, eh, len) || dhcp(c, eh, len))
return;
if (len < sizeof(*eh) + sizeof(*iph))
return;
l4h = (char *)iph + iph->ihl * 4;
len -= (intptr_t)l4h - (intptr_t)eh;
if (iph->protocol == IPPROTO_ICMP) {
fprintf(stderr, "icmp from tap: %s -> %s\n",
inet_ntop(AF_INET, &iph->saddr, buf_s, sizeof(buf_s)),
@ -316,6 +320,9 @@ static void tap4_handler(struct ctx *c, char *in, size_t len)
} else {
struct tcphdr *th = (struct tcphdr *)l4h;
if (len < sizeof(*th) && len < sizeof(struct udphdr))
return;
fprintf(stderr, "%s from tap: %s:%i -> %s:%i\n",
getprotobynumber(iph->protocol)->p_name,
inet_ntop(AF_INET, &iph->saddr, buf_s, sizeof(buf_s)),
@ -324,8 +331,6 @@ static void tap4_handler(struct ctx *c, char *in, size_t len)
ntohs(th->dest));
}
len -= (intptr_t)l4h - (intptr_t)eh;
if (iph->protocol == IPPROTO_TCP)
tcp_tap_handler(c, AF_INET, &iph->daddr, l4h, len);
else if (iph->protocol == IPPROTO_UDP)
@ -346,33 +351,21 @@ static void tap6_handler(struct ctx *c, char *in, size_t len)
uint8_t proto;
char *l4h;
if (ndp(c, len, eh))
if (len < sizeof(*eh) + sizeof(*ip6h))
return;
if (ndp(c, eh, len))
return;
l4h = ipv6_l4hdr(ip6h, &proto);
/* TODO: Assign MAC address to guest so that, together with prefix
* assigned via NDP, address matches the one on the host. Then drop
* address change and checksum recomputation.
* assigned via NDP, address matches the one from the host.
*/
c->addr6_guest = ip6h->saddr;
ip6h->saddr = c->addr6;
if (proto == IPPROTO_TCP) {
struct tcphdr *th = (struct tcphdr *)(ip6h + 1);
th->check = 0;
th->check = csum_ip4(ip6h, len + sizeof(*ip6h));
} else if (proto == IPPROTO_UDP) {
struct udphdr *uh = (struct udphdr *)(ip6h + 1);
uh->check = 0;
uh->check = csum_ip4(ip6h, len + sizeof(*ip6h));
} else if (proto == IPPROTO_ICMPV6) {
struct icmp6hdr *ih = (struct icmp6hdr *)(ip6h + 1);
ih->icmp6_cksum = 0;
ih->icmp6_cksum = csum_ip4(ip6h, len + sizeof(*ip6h));
}
len -= (intptr_t)l4h - (intptr_t)eh;
if (proto == IPPROTO_ICMPV6) {
fprintf(stderr, "icmpv6 from tap: %s ->\n\t%s\n",
@ -382,6 +375,9 @@ static void tap6_handler(struct ctx *c, char *in, size_t len)
} else {
struct tcphdr *th = (struct tcphdr *)l4h;
if (len < sizeof(*th) && len < sizeof(struct udphdr))
return;
fprintf(stderr, "%s from tap: [%s]:%i\n"
"\t-> [%s]:%i\n",
getprotobynumber(proto)->p_name,
@ -391,8 +387,6 @@ static void tap6_handler(struct ctx *c, char *in, size_t len)
ntohs(th->dest));
}
len -= (intptr_t)l4h - (intptr_t)eh;
if (proto == IPPROTO_TCP)
tcp_tap_handler(c, AF_INET6, &ip6h->daddr, l4h, len);
else if (proto == IPPROTO_UDP)
@ -400,19 +394,46 @@ static void tap6_handler(struct ctx *c, char *in, size_t len)
}
/**
* tap_handler() - IPv4/IPv6/ARP packet handler for tap file descriptor
* tap_handler() - Packet handler for tap file descriptor
* @c: Execution context
* @len: Total L2 packet length
* @in: Packet buffer, L2 headers
*
* Return: -ECONNRESET if tap connection was lost, 0 otherwise
*/
static void tap_handler(struct ctx *c, char *in, size_t len)
static int tap_handler(struct ctx *c)
{
struct ethhdr *eh = (struct ethhdr *)in;
char buf[ETH_MAX_MTU];
struct ethhdr *eh;
uint32_t vnet_len;
ssize_t n;
if (eh->h_proto == ntohs(ETH_P_IP) || eh->h_proto == ntohs(ETH_P_ARP))
tap4_handler(c, in, len);
else if (eh->h_proto == ntohs(ETH_P_IPV6))
tap6_handler(c, in, len);
eh = (struct ethhdr *)buf;
while ((n = recv(c->fd_unix, &vnet_len, 4, MSG_DONTWAIT)) == 4) {
n = recv(c->fd_unix, buf, ntohl(vnet_len), MSG_DONTWAIT);
if (n < (ssize_t)sizeof(*eh))
break;
switch (ntohs(eh->h_proto)) {
case ETH_P_IP:
case ETH_P_ARP:
tap4_handler(c, buf, n);
break;
case ETH_P_IPV6:
tap6_handler(c, buf, n);
break;
default:
break;
}
}
if (n >= 0 || errno == EINTR || errno == EAGAIN || errno == EWOULDBLOCK)
return 0;
epoll_ctl(c->epollfd, EPOLL_CTL_DEL, c->fd_unix, NULL);
close(c->fd_unix);
return -ECONNRESET;
}
/**
@ -429,29 +450,30 @@ static void sock_handler(struct ctx *c, int fd, uint32_t events)
sl = sizeof(so);
if (getsockopt(fd, SOL_SOCKET, SO_TYPE, &so, &sl) ||
so == SOCK_STREAM)
so == SOCK_STREAM) {
fprintf(stderr, "TCP: packet from socket %i\n", fd);
tcp_sock_handler(c, fd, events);
else if (so == SOCK_DGRAM)
}
else if (so == SOCK_DGRAM) {
udp_sock_handler(c, fd, events);
fprintf(stderr, "UDP: packet from socket %i\n", fd);
}
}
/**
* periodic_handler() - Run periodic tasks for L4 protocol handlers
* timer_handler() - Run periodic tasks for L4 protocol handlers
* @c: Execution context
* @last: Timestamp of last run, updated on return
*/
static void periodic_handler(struct ctx *c, struct timespec *last)
static void timer_handler(struct ctx *c, struct timespec *last)
{
struct timespec tmp;
int elapsed_ms;
clock_gettime(CLOCK_MONOTONIC, &tmp);
elapsed_ms = timespec_diff_ms(&tmp, last);
if (timespec_diff_ms(&tmp, last) < TIMER_INTERVAL)
return;
if (elapsed_ms >= PERIODIC_HANDLER_FAST)
tcp_periodic_fast(c);
if (elapsed_ms >= PERIODIC_HANDLER_SLOW)
tcp_periodic_slow(c);
tcp_timer(c, &tmp);
*last = tmp;
}
@ -481,10 +503,8 @@ int main(int argc, char **argv)
struct epoll_event events[EPOLL_EVENTS];
struct epoll_event ev = { 0 };
struct timespec last_time;
char buf[ETH_MAX_MTU];
struct ctx c = { 0 };
int nfds, i, len;
int fd_unix;
int nfds, i, fd_unix;
if (argc != 1)
usage(argv[0]);
@ -537,14 +557,14 @@ listen:
"./qrap 5 kvm ... -net socket,fd=5 -net nic,model=virtio\n\n");
c.fd_unix = accept(fd_unix, NULL, NULL);
ev.events = EPOLLIN | EPOLLRDHUP | EPOLLERR | EPOLLHUP;
ev.events = EPOLLIN | EPOLLET | EPOLLRDHUP | EPOLLERR | EPOLLHUP;
ev.data.fd = c.fd_unix;
epoll_ctl(c.epollfd, EPOLL_CTL_ADD, c.fd_unix, &ev);
clock_gettime(CLOCK_MONOTONIC, &last_time);
loop:
nfds = epoll_wait(c.epollfd, events, EPOLL_EVENTS, EPOLL_TIMEOUT);
nfds = epoll_wait(c.epollfd, events, EPOLL_EVENTS, TIMER_INTERVAL);
if (nfds == -1 && errno != EINTR) {
perror("epoll_wait");
exit(EXIT_FAILURE);
@ -552,36 +572,16 @@ loop:
for (i = 0; i < nfds; i++) {
if (events[i].data.fd == c.fd_unix) {
len = recv(events[i].data.fd, buf, sizeof(buf),
MSG_DONTWAIT);
if (len <= 0) {
epoll_ctl(c.epollfd, EPOLL_CTL_DEL, c.fd_unix,
&ev);
close(c.fd_unix);
if (tap_handler(&c))
goto listen;
}
if (len == 0 || (len < 0 && errno == EINTR))
continue;
if (len < 0) {
if (errno == EAGAIN || errno == EWOULDBLOCK)
break;
goto out;
}
tap_handler(&c, buf + 4, ntohl(*(uint32_t *)buf));
} else {
sock_handler(&c, events[i].data.fd, events[i].events);
}
}
periodic_handler(&c, &last_time);
clock_gettime(CLOCK_MONOTONIC, &last_time);
timer_handler(&c, &last_time);
goto loop;
out:
return 0;
}

674
tcp.c

File diff suppressed because it is too large Load diff

3
tcp.h
View file

@ -1,5 +1,4 @@
void tcp_sock_handler(struct ctx *c, int s, uint32_t events);
void tcp_tap_handler(struct ctx *c, int af, void *addr, char *in, size_t len);
int tcp_sock_init(struct ctx *c);
void tcp_periodic_fast(struct ctx *c);
void tcp_periodic_slow(struct ctx *c);
void tcp_timer(struct ctx *c, struct timespec *ts);

29
udp.c
View file

@ -124,8 +124,6 @@ void udp_tap_handler(struct ctx *c, int af, void *addr, char *in, size_t len)
if (!(s = udp4_sock_port[ntohs(uh->source)]))
return;
fprintf(stderr, "udp from tap: using socket %i\n", s);
sa.sin_addr = *(struct in_addr *)addr;
sendto(s, in + sizeof(*uh), len - sizeof(*uh), MSG_DONTWAIT,
@ -140,15 +138,14 @@ void udp_tap_handler(struct ctx *c, int af, void *addr, char *in, size_t len)
if (!(s = udp6_sock_port[ntohs(uh->source)]))
return;
fprintf(stderr, "udp from tap: using socket %i\n", s);
sendto(s, in + sizeof(*uh), len - sizeof(*uh), MSG_DONTWAIT,
sendto(s, in + sizeof(*uh), len - sizeof(*uh),
MSG_DONTWAIT | MSG_NOSIGNAL,
(struct sockaddr *)&sa, sizeof(sa));
}
}
/**
* udp_sock_init() - Create and bind listening sockets for inbound connections
* udp_sock_init() - Create and bind listening sockets for inbound packets
* @c: Execution context
*
* Return: 0 on success, -1 on failure
@ -159,15 +156,19 @@ int udp_sock_init(struct ctx *c)
int s;
for (port = 0; port < USHRT_MAX; port++) {
if (c->v4 &&
(s = sock_l4_add(c, 4, IPPROTO_UDP, htons(port))) < 0)
return -1;
udp4_sock_port[port] = s;
if (c->v4) {
if ((s = sock_l4_add(c, 4, IPPROTO_UDP, port)) < 0)
return -1;
if (c->v6 &&
(s = sock_l4_add(c, 6, IPPROTO_UDP, htons(port))) < 0)
return -1;
udp6_sock_port[port] = s;
udp4_sock_port[port] = s;
}
if (c->v6) {
if ((s = sock_l4_add(c, 6, IPPROTO_UDP, port)) < 0)
return -1;
udp6_sock_port[port] = s;
}
}
return 0;

14
util.c
View file

@ -139,7 +139,7 @@ char *ipv6_l4hdr(struct ipv6hdr *ip6h, uint8_t *proto)
* sock_l4_add() - Create and bind socket for given L4, add to epoll list
* @c: Execution context
* @v: IP protocol, 4 or 6
* @proto: Protocol number, network order
* @proto: Protocol number, host order
* @port: Port, network order
*
* Return: newly created socket, -1 on error
@ -148,17 +148,17 @@ int sock_l4_add(struct ctx *c, int v, uint16_t proto, uint16_t port)
{
struct sockaddr_in addr4 = {
.sin_family = AF_INET,
.sin_port = port,
.sin_port = htons(port),
.sin_addr = { .s_addr = INADDR_ANY },
};
struct sockaddr_in6 addr6 = {
.sin6_family = AF_INET6,
.sin6_port = port,
.sin6_port = htons(port),
.sin6_addr = IN6ADDR_ANY_INIT,
};
struct epoll_event ev = { 0 };
const struct sockaddr *sa;
int fd, sl;
int fd, sl, one = 1;
if (proto != IPPROTO_TCP && proto != IPPROTO_UDP)
return -1; /* Not implemented. */
@ -176,6 +176,8 @@ int sock_l4_add(struct ctx *c, int v, uint16_t proto, uint16_t port)
} else {
sa = (const struct sockaddr *)&addr6;
sl = sizeof(addr6);
setsockopt(fd, IPPROTO_IPV6, IPV6_V6ONLY, &one, sizeof(one));
}
if (bind(fd, sa, sl) < 0) {
@ -213,10 +215,10 @@ int sock_l4_add(struct ctx *c, int v, uint16_t proto, uint16_t port)
int timespec_diff_ms(struct timespec *a, struct timespec *b)
{
if (a->tv_nsec < b->tv_nsec) {
return (b->tv_nsec - a->tv_nsec) / 1000 +
return (b->tv_nsec - a->tv_nsec) / 1000000 +
(a->tv_sec - b->tv_sec - 1) * 1000;
}
return (a->tv_nsec - b->tv_nsec) / 1000 +
return (a->tv_nsec - b->tv_nsec) / 1000000 +
(a->tv_sec - b->tv_sec) * 1000;
}