From 8bca388e8a771d069b2a2d4ac47589112f6f0af3 Mon Sep 17 00:00:00 2001 From: Stefano Brivio Date: Sun, 21 Feb 2021 11:33:38 +0100 Subject: [PATCH] passt: Assorted fixes from "fresh eyes" review A bunch of fixes not worth single commits at this stage, notably: - make buffer, length parameter ordering consistent in ARP, DHCP, NDP handlers - strict checking of buffer, message and option length in DHCP handler (a malicious client could have easily crashed it) - set up forwarding for IPv4 and IPv6, and masquerading with nft for IPv4, from demo script - get rid of separate slow and fast timers, we don't save any overhead that way - stricter checking of buffer lengths as passed to tap handlers - proper dequeuing from qemu socket back-end: I accidentally trashed messages that were bundled up together in a single tap read operation -- the length header tells us what's the size of the next frame, but there's no apparent limit to the number of messages we get with one single receive - rework some bits of the TCP state machine, now passive and active connection closes appear to be robust -- introduce a new FIN_WAIT_1_SOCK_FIN state indicating a FIN_WAIT_1 with a FIN flag from socket - streamline TCP option parsing routine - track TCP state changes to stderr (this is temporary, proper debugging and syslogging support pending) - observe that multiplying a number by four might very well change its value, and this happens to be the case for the data offset from the TCP header as we check if it's the same as the total length to find out if it's a duplicated ACK segment - recent estimates suggest that the duration of a millisecond is closer to a million nanoseconds than a thousand of them, this trend is now reflected into the timespec_diff_ms() convenience routine Signed-off-by: Stefano Brivio --- arp.c | 2 +- arp.h | 2 +- dhcp.c | 29 ++- dhcp.h | 2 +- doc/demo.sh | 13 +- ndp.c | 2 +- ndp.h | 2 +- passt.c | 148 ++++++------ tcp.c | 674 
+++++++++++++++++++++++++--------------------------- tcp.h | 3 +- udp.c | 29 +-- util.c | 14 +- 12 files changed, 462 insertions(+), 458 deletions(-) diff --git a/arp.c b/arp.c index 21b6417..e9ccd5e 100644 --- a/arp.c +++ b/arp.c @@ -49,7 +49,7 @@ struct arpmsg { * * Return: 0 if it's not an ARP message, 1 if handled, -1 on failure */ -int arp(struct ctx *c, unsigned len, struct ethhdr *eh) +int arp(struct ctx *c, struct ethhdr *eh, size_t len) { struct arphdr *ah = (struct arphdr *)(eh + 1); struct arpmsg *am = (struct arpmsg *)(ah + 1); diff --git a/arp.h b/arp.h index ef3bd19..70188b3 100644 --- a/arp.h +++ b/arp.h @@ -1 +1 @@ -int arp(struct ctx *c, unsigned len, struct ethhdr *eh); +int arp(struct ctx *c, struct ethhdr *eh, size_t len); diff --git a/dhcp.c b/dhcp.c index 3af4ace..d4a5261 100644 --- a/dhcp.c +++ b/dhcp.c @@ -163,22 +163,39 @@ static int fill(struct msg *m) * * Return: 0 if it's not a DHCP message, 1 if handled, -1 on failure */ -int dhcp(struct ctx *c, unsigned len, struct ethhdr *eh) +int dhcp(struct ctx *c, struct ethhdr *eh, size_t len) { struct iphdr *iph = (struct iphdr *)(eh + 1); - struct udphdr *uh = (struct udphdr *)((char *)iph + iph->ihl * 4); - struct msg *m = (struct msg *)(uh + 1); - unsigned int i, mlen = len - sizeof(*eh) - sizeof(*iph); + size_t mlen, olen; + struct udphdr *uh; + unsigned int i; + struct msg *m; + + if (len < sizeof(*eh) + sizeof(*iph)) + return 0; + + if (len < sizeof(*eh) + iph->ihl * 4 + sizeof(*uh)) + return 0; + + uh = (struct udphdr *)((char *)iph + iph->ihl * 4); + m = (struct msg *)(uh + 1); if (uh->dest != htons(67)) return 0; - if (mlen != ntohs(uh->len) || mlen < offsetof(struct msg, o) || + mlen = len - sizeof(*eh) - iph->ihl * 4 - sizeof(*uh); + if (mlen != ntohs(uh->len) - sizeof(*uh) || + mlen < offsetof(struct msg, o) || m->op != BOOTREQUEST) return -1; - for (i = 0; i < mlen - offsetof(struct msg, o); i += m->o[i + 1] + 2) + olen = mlen - offsetof(struct msg, o); + for (i = 0; i + 2 < olen; i 
+= m->o[i + 1] + 2) { + if (m->o[i + 1] + i + 2 >= olen) + return -1; + memcpy(&opts[m->o[i]].c, &m->o[i + 2], m->o[i + 1]); + } if (opts[53].c[0] == DHCPDISCOVER) { fprintf(stderr, "DHCP: offer to discover"); diff --git a/dhcp.h b/dhcp.h index a519ee5..c4fbfe5 100644 --- a/dhcp.h +++ b/dhcp.h @@ -1 +1 @@ -int dhcp(struct ctx *c, unsigned len, struct ethhdr *eh); +int dhcp(struct ctx *c, struct ethhdr *eh, size_t len); diff --git a/doc/demo.sh b/doc/demo.sh index 3d20491..3735130 100755 --- a/doc/demo.sh +++ b/doc/demo.sh @@ -48,10 +48,17 @@ ip netns add passt ip link add veth_passt up netns passt type veth peer name veth_passt ip link set dev veth_passt up + ip -n passt addr add 192.0.2.2/24 dev veth_passt ip addr add 192.0.2.1/24 dev veth_passt ip -n passt route add default via 192.0.2.1 +sysctl -w net.ipv4.ip_forward=1 +nft delete table passt_nat 2>/dev/null || : +nft add table passt_nat +nft 'add chain passt_nat postrouting { type nat hook postrouting priority -100 ; }' +nft add rule passt_nat postrouting ip saddr 192.0.2.2 masquerade + ipv6_addr="$(ipv6_devaddr "$(ipv6_dev)")" ipv6_passt="$(ipv6_mangle "${ipv6_addr}")" ndp_setup "${ipv6_passt}" @@ -59,11 +66,15 @@ ip -n passt addr add "${ipv6_passt}/$(ipv6_mask "${ipv6_addr}")" dev veth_passt ip addr add "${ipv6_addr}" dev veth_passt passt_ll="$(ipv6_ll_addr "veth_passt")" main_ll="$(get_token "link/ether" $(ip -o li sh veth_passt))" -ip -n passt neigh add "${passt_ll%%/*}" dev veth_passt lladdr "${main_ll}" +ip neigh add "${passt_ll%%/*}" dev veth_passt lladdr "${main_ll}" ip -n passt route add default via "${passt_ll%%/*}" dev veth_passt +sysctl -w net.ipv6.conf.all.forwarding=1 + + ethtool -K veth_passt tx off ip netns exec passt ethtool -K veth_passt tx off ulimit -n 300000 + ip netns exec passt ./passt diff --git a/ndp.c b/ndp.c index 9e38cec..952621d 100644 --- a/ndp.c +++ b/ndp.c @@ -40,7 +40,7 @@ * * Return: 0 if not handled here, 1 if handled, -1 on failure */ -int ndp(struct ctx *c, unsigned len, 
struct ethhdr *eh) +int ndp(struct ctx *c, struct ethhdr *eh, size_t len) { struct ethhdr *ehr; struct ipv6hdr *ip6h = (struct ipv6hdr *)(eh + 1), *ip6hr; diff --git a/ndp.h b/ndp.h index 2c59713..b831c4d 100644 --- a/ndp.h +++ b/ndp.h @@ -1 +1 @@ -int ndp(struct ctx *c, unsigned len, struct ethhdr *eh); +int ndp(struct ctx *c, struct ethhdr *eh, size_t len); diff --git a/passt.c b/passt.c index 4ef6e72..622ff38 100644 --- a/passt.c +++ b/passt.c @@ -51,9 +51,7 @@ #define EPOLL_EVENTS 10 -#define EPOLL_TIMEOUT 100 /* ms, for protocol periodic handlers */ -#define PERIODIC_HANDLER_FAST 100 -#define PERIODIC_HANDLER_SLOW 1000 +#define TIMER_INTERVAL 20 /* ms, for protocol periodic handlers */ /** * sock_unix() - Create and bind AF_UNIX socket, add to epoll list @@ -294,7 +292,7 @@ static void get_dns(struct ctx *c) } /** - * tap4_handler() - IPv4 packet handler for tap file descriptor + * tap4_handler() - IPv4 and ARP packet handler for tap file descriptor * @c: Execution context * @len: Total L2 packet length * @in: Packet buffer, L2 headers @@ -303,12 +301,18 @@ static void tap4_handler(struct ctx *c, char *in, size_t len) { struct ethhdr *eh = (struct ethhdr *)in; struct iphdr *iph = (struct iphdr *)(eh + 1); - char *l4h = (char *)iph + iph->ihl * 4; char buf_s[BUFSIZ], buf_d[BUFSIZ]; + char *l4h; - if (arp(c, len, eh) || dhcp(c, len, eh)) + if (arp(c, eh, len) || dhcp(c, eh, len)) return; + if (len < sizeof(*eh) + sizeof(*iph)) + return; + + l4h = (char *)iph + iph->ihl * 4; + len -= (intptr_t)l4h - (intptr_t)eh; + if (iph->protocol == IPPROTO_ICMP) { fprintf(stderr, "icmp from tap: %s -> %s\n", inet_ntop(AF_INET, &iph->saddr, buf_s, sizeof(buf_s)), @@ -316,6 +320,9 @@ static void tap4_handler(struct ctx *c, char *in, size_t len) } else { struct tcphdr *th = (struct tcphdr *)l4h; + if (len < sizeof(*th) && len < sizeof(struct udphdr)) + return; + fprintf(stderr, "%s from tap: %s:%i -> %s:%i\n", getprotobynumber(iph->protocol)->p_name, inet_ntop(AF_INET, 
&iph->saddr, buf_s, sizeof(buf_s)), @@ -324,8 +331,6 @@ static void tap4_handler(struct ctx *c, char *in, size_t len) ntohs(th->dest)); } - len -= (intptr_t)l4h - (intptr_t)eh; - if (iph->protocol == IPPROTO_TCP) tcp_tap_handler(c, AF_INET, &iph->daddr, l4h, len); else if (iph->protocol == IPPROTO_UDP) @@ -346,33 +351,21 @@ static void tap6_handler(struct ctx *c, char *in, size_t len) uint8_t proto; char *l4h; - if (ndp(c, len, eh)) + if (len < sizeof(*eh) + sizeof(*ip6h)) + return; + + if (ndp(c, eh, len)) return; l4h = ipv6_l4hdr(ip6h, &proto); /* TODO: Assign MAC address to guest so that, together with prefix - * assigned via NDP, address matches the one on the host. Then drop - * address change and checksum recomputation. + * assigned via NDP, address matches the one from the host. */ c->addr6_guest = ip6h->saddr; ip6h->saddr = c->addr6; - if (proto == IPPROTO_TCP) { - struct tcphdr *th = (struct tcphdr *)(ip6h + 1); - th->check = 0; - th->check = csum_ip4(ip6h, len + sizeof(*ip6h)); - } else if (proto == IPPROTO_UDP) { - struct udphdr *uh = (struct udphdr *)(ip6h + 1); - - uh->check = 0; - uh->check = csum_ip4(ip6h, len + sizeof(*ip6h)); - } else if (proto == IPPROTO_ICMPV6) { - struct icmp6hdr *ih = (struct icmp6hdr *)(ip6h + 1); - - ih->icmp6_cksum = 0; - ih->icmp6_cksum = csum_ip4(ip6h, len + sizeof(*ip6h)); - } + len -= (intptr_t)l4h - (intptr_t)eh; if (proto == IPPROTO_ICMPV6) { fprintf(stderr, "icmpv6 from tap: %s ->\n\t%s\n", @@ -382,6 +375,9 @@ static void tap6_handler(struct ctx *c, char *in, size_t len) } else { struct tcphdr *th = (struct tcphdr *)l4h; + if (len < sizeof(*th) && len < sizeof(struct udphdr)) + return; + fprintf(stderr, "%s from tap: [%s]:%i\n" "\t-> [%s]:%i\n", getprotobynumber(proto)->p_name, @@ -391,8 +387,6 @@ static void tap6_handler(struct ctx *c, char *in, size_t len) ntohs(th->dest)); } - len -= (intptr_t)l4h - (intptr_t)eh; - if (proto == IPPROTO_TCP) tcp_tap_handler(c, AF_INET6, &ip6h->daddr, l4h, len); else if (proto == 
IPPROTO_UDP) @@ -400,19 +394,46 @@ static void tap6_handler(struct ctx *c, char *in, size_t len) } /** - * tap_handler() - IPv4/IPv6/ARP packet handler for tap file descriptor + * tap_handler() - Packet handler for tap file descriptor * @c: Execution context - * @len: Total L2 packet length - * @in: Packet buffer, L2 headers + * + * Return: -ECONNRESET if tap connection was lost, 0 otherwise */ -static void tap_handler(struct ctx *c, char *in, size_t len) +static int tap_handler(struct ctx *c) { - struct ethhdr *eh = (struct ethhdr *)in; + char buf[ETH_MAX_MTU]; + struct ethhdr *eh; + uint32_t vnet_len; + ssize_t n; - if (eh->h_proto == ntohs(ETH_P_IP) || eh->h_proto == ntohs(ETH_P_ARP)) - tap4_handler(c, in, len); - else if (eh->h_proto == ntohs(ETH_P_IPV6)) - tap6_handler(c, in, len); + eh = (struct ethhdr *)buf; + + while ((n = recv(c->fd_unix, &vnet_len, 4, MSG_DONTWAIT)) == 4) { + n = recv(c->fd_unix, buf, ntohl(vnet_len), MSG_DONTWAIT); + + if (n < (ssize_t)sizeof(*eh)) + break; + + switch (ntohs(eh->h_proto)) { + case ETH_P_IP: + case ETH_P_ARP: + tap4_handler(c, buf, n); + break; + case ETH_P_IPV6: + tap6_handler(c, buf, n); + break; + default: + break; + } + } + + if (n >= 0 || errno == EINTR || errno == EAGAIN || errno == EWOULDBLOCK) + return 0; + + epoll_ctl(c->epollfd, EPOLL_CTL_DEL, c->fd_unix, NULL); + close(c->fd_unix); + + return -ECONNRESET; } /** @@ -429,29 +450,30 @@ static void sock_handler(struct ctx *c, int fd, uint32_t events) sl = sizeof(so); if (getsockopt(fd, SOL_SOCKET, SO_TYPE, &so, &sl) || - so == SOCK_STREAM) + so == SOCK_STREAM) { + fprintf(stderr, "TCP: packet from socket %i\n", fd); tcp_sock_handler(c, fd, events); - else if (so == SOCK_DGRAM) + } + else if (so == SOCK_DGRAM) { udp_sock_handler(c, fd, events); + fprintf(stderr, "UDP: packet from socket %i\n", fd); + } } /** - * periodic_handler() - Run periodic tasks for L4 protocol handlers + * timer_handler() - Run periodic tasks for L4 protocol handlers * @c: Execution context * 
@last: Timestamp of last run, updated on return */ -static void periodic_handler(struct ctx *c, struct timespec *last) +static void timer_handler(struct ctx *c, struct timespec *last) { struct timespec tmp; - int elapsed_ms; clock_gettime(CLOCK_MONOTONIC, &tmp); - elapsed_ms = timespec_diff_ms(&tmp, last); + if (timespec_diff_ms(&tmp, last) < TIMER_INTERVAL) + return; - if (elapsed_ms >= PERIODIC_HANDLER_FAST) - tcp_periodic_fast(c); - if (elapsed_ms >= PERIODIC_HANDLER_SLOW) - tcp_periodic_slow(c); + tcp_timer(c, &tmp); *last = tmp; } @@ -481,10 +503,8 @@ int main(int argc, char **argv) struct epoll_event events[EPOLL_EVENTS]; struct epoll_event ev = { 0 }; struct timespec last_time; - char buf[ETH_MAX_MTU]; struct ctx c = { 0 }; - int nfds, i, len; - int fd_unix; + int nfds, i, fd_unix; if (argc != 1) usage(argv[0]); @@ -537,14 +557,14 @@ listen: "./qrap 5 kvm ... -net socket,fd=5 -net nic,model=virtio\n\n"); c.fd_unix = accept(fd_unix, NULL, NULL); - ev.events = EPOLLIN | EPOLLRDHUP | EPOLLERR | EPOLLHUP; + ev.events = EPOLLIN | EPOLLET | EPOLLRDHUP | EPOLLERR | EPOLLHUP; ev.data.fd = c.fd_unix; epoll_ctl(c.epollfd, EPOLL_CTL_ADD, c.fd_unix, &ev); clock_gettime(CLOCK_MONOTONIC, &last_time); loop: - nfds = epoll_wait(c.epollfd, events, EPOLL_EVENTS, EPOLL_TIMEOUT); + nfds = epoll_wait(c.epollfd, events, EPOLL_EVENTS, TIMER_INTERVAL); if (nfds == -1 && errno != EINTR) { perror("epoll_wait"); exit(EXIT_FAILURE); @@ -552,36 +572,16 @@ loop: for (i = 0; i < nfds; i++) { if (events[i].data.fd == c.fd_unix) { - len = recv(events[i].data.fd, buf, sizeof(buf), - MSG_DONTWAIT); - - if (len <= 0) { - epoll_ctl(c.epollfd, EPOLL_CTL_DEL, c.fd_unix, - &ev); - close(c.fd_unix); + if (tap_handler(&c)) goto listen; - } - - if (len == 0 || (len < 0 && errno == EINTR)) - continue; - - if (len < 0) { - if (errno == EAGAIN || errno == EWOULDBLOCK) - break; - goto out; - } - - tap_handler(&c, buf + 4, ntohl(*(uint32_t *)buf)); } else { sock_handler(&c, events[i].data.fd, 
events[i].events); } } - periodic_handler(&c, &last_time); - clock_gettime(CLOCK_MONOTONIC, &last_time); + timer_handler(&c, &last_time); goto loop; -out: return 0; } diff --git a/tcp.c b/tcp.c index 46b739d..f1de9cf 100644 --- a/tcp.c +++ b/tcp.c @@ -130,7 +130,7 @@ * * These states apply to connected sockets only, listening sockets are always * open after initialisation, in LISTEN state. A single state is maintained for - * both sides of the connection, and most states are omitted as they are already + * both sides of the connection, and some states are omitted as they are already * handled by host kernel and guest. * * - CLOSED no connection @@ -144,31 +144,32 @@ * * - SOCK_SYN_SENT new connected socket, SYN sent to tap * - SYN,ACK from tap ACK to tap > ESTABLISHED - * - SYN,ACK timeout RST to tap, close socket > CLOSED * - socket error RST to tap, close socket > CLOSED + * - SYN,ACK timeout RST to tap, close socket > CLOSED * - RST from tap close socket > CLOSED * * - TAP_SYN_RCVD connect() completed, SYN,ACK sent to tap + * - FIN from tap write shutdown > FIN_WAIT_1 * - ACK from tap > ESTABLISHED - * - ACK timeout RST to tap, close socket > CLOSED * - socket error RST to tap, close socket > CLOSED + * - ACK timeout RST to tap, close socket > CLOSED * - RST from tap close socket > CLOSED * * - ESTABLISHED connection established, ready for data - * - zero-sized socket read FIN to tap > ESTABLISHED_SOCK_FIN - * - data timeout FIN to tap > ESTABLISHED_SOCK_FIN + * - FIN from tap write shutdown > FIN_WAIT_1 + * - zero-sized socket read read shutdown, FIN to tap > ESTABLISHED_SOCK_FIN * - socket error RST to tap, close socket > CLOSED - * - FIN from tap FIN,ACK to tap, close socket > FIN_WAIT_1 + * - data timeout FIN to tap > ESTABLISHED_SOCK_FIN * - RST from tap close socket > CLOSED * - * - ESTABLISHED_SOCK_FIN socket wants to close connection, data allowed + * - ESTABLISHED_SOCK_FIN socket closing connection, FIN sent to tap * - ACK from tap > CLOSE_WAIT * - ACK 
timeout RST to tap, close socket > CLOSED * - RST from tap close socket > CLOSED * - * - CLOSE_WAIT socket wants to close connection, seen by tap + * - CLOSE_WAIT socket closing connection, ACK from tap + * - FIN from tap write shutdown > LAST_ACK * - socket error RST to tap, close socket > CLOSED - * - FIN from tap ACK to tap, close socket > LAST_ACK * - FIN timeout RST to tap, close socket > CLOSED * - RST from tap close socket > CLOSED * @@ -176,12 +177,19 @@ * - anything from socket close socket > CLOSED * - socket error RST to tap, close socket > CLOSED * - ACK timeout RST to tap, close socket > CLOSED + * - RST from tap close socket > CLOSED * - * - FIN_WAIT_1 tap wants to close connection, _FIN,ACK sent_ + * - FIN_WAIT_1 tap closing connection, FIN sent to socket + * - zero-sized socket read FIN,ACK to tap, shutdown > FIN_WAIT_1_SOCK_FIN + * - socket error RST to tap, close socket > CLOSED + * - ACK timeout RST to tap, close socket > CLOSED + * - RST from tap close socket > CLOSED + * + * - FIN_WAIT_1_SOCK_FIN tap closing connection, FIN received from socket * - ACK from tap close socket > CLOSED * - socket error RST to tap, close socket > CLOSED * - ACK timeout RST to tap, close socket > CLOSED - * + * - RST from tap close socket > CLOSED * * Connection setup * ---------------- @@ -198,34 +206,33 @@ * Aging and timeout * ----------------- * - * Two bitmaps of TCP_MAX_CONNS bits indicate which connections need scheduled - * actions: - * - @tcp_act_fast is used to send ACK segments to the tap once TCP_INFO reports - * an increased number of acknowledged bytes sent on a socket, and examined - * every 20ms (one tenth of current TCP_DELACK_MAX on Linux): for each marked - * connection, a TCP_INFO query is performed and ACK segments are sent right - * away as needed - * - @tcp_act_slow is used for state and retransmission timeouts, and examined - * every 2s: for each marked connection with an expired @timeout timestamp - * specific actions are taken depending on 
the connection state: - * - SOCK_SYN_SENT: after a 2MSL (240s) timeout waiting for a SYN,ACK segment - * from tap expires, connection is reset (RST to tap, socket closed) - * - TAP_SYN_RCVD: after a 2MSL (240s) timeout waiting for an ACK segment from - * tap expires, connection is reset (RST to tap, socket closed) - * - ESTABLISHED: after a timeout of 1s (TODO: implement requirements from - * RFC 6298) waiting for an ACK segment from tap expires, data from socket - * queue is retransmitted starting from the last ACK sequence - * - ESTABLISHED: after a two hours (current TCP_KEEPALIVE_TIME on Linux) - * timeout waiting for any activity expires, connection is reset (RST to - * tap, socket closed) - * - ESTABLISHED_SOCK_FIN: after a 2MSL (240s) timeout waiting for an ACK - * segment from tap expires, connection is reset (RST to tap, socket closed) - * - CLOSE_WAIT: after a 2MSL (240s) timeout waiting for a FIN segment from - * tap expires, connection is reset (RST to tap, socket closed) - * - LAST_ACK: after a 2MSL (240s) timeout waiting for an ACK segment from - * socket expires, connection is reset (RST to tap, socket closed) - * - FIN_WAIT_1: after a 2MSL (240s) timeout waiting for an ACK segment from - * tap expires, connection is reset (RST to tap, socket closed) + * A bitmap of TCP_MAX_CONNS bits indicates the connections subject to timed + * events based on states: + * - SOCK_SYN_SENT: after a 2MSL (240s) timeout waiting for a SYN,ACK segment + * from tap expires, connection is reset (RST to tap, socket closed) + * - TAP_SYN_RCVD: after a 2MSL (240s) timeout waiting for an ACK segment from + * tap expires, connection is reset (RST to tap, socket closed) + * - TAP_SYN_SENT: connect() is pending, timeout is handled implicitly by + * connect() timeout, connection will be reset in case + * - ESTABLISHED, ESTABLISHED_SOCK_FIN: if an ACK segment to tap is pending, + * bytes acknowledged by socket endpoint are checked every 50ms (one quarter + * of current 
TCP_DELACK_MAX on Linux) + * - ESTABLISHED, ESTABLISHED_SOCK_FIN: after a timeout of 3s (TODO: implement + * requirements from RFC 6298) waiting for an ACK segment from tap expires, + * data from socket queue is retransmitted starting from the last ACK sequence + * - ESTABLISHED, ESTABLISHED_SOCK_FIN: after a two hours (current + * TCP_KEEPALIVE_TIME on Linux) timeout waiting for any activity expires, + * connection is reset (RST to tap, socket closed) + * - ESTABLISHED_SOCK_FIN: after a 2MSL (240s) timeout waiting for an ACK + * segment from tap expires, connection is reset (RST to tap, socket closed) + * - CLOSE_WAIT: after a 2MSL (240s) timeout waiting for a FIN segment from tap + * expires, connection is reset (RST to tap, socket closed) + * - FIN_WAIT_1: after a 2MSL (240s) timeout waiting for an ACK segment from + * socket expires, connection is reset (RST to tap, socket closed) + * - FIN_WAIT_1_SOCK_FIN: after a 2MSL (240s) timeout waiting for an ACK segment + * from tap expires, connection is reset (RST to tap, socket closed) + * - LAST_ACK: after a 2MSL (240s) timeout waiting for an ACK segment from + * socket expires, connection is reset (RST to tap, socket closed) * * * Data flows (from ESTABLISHED, ESTABLISHED_SOCK_FIN states) @@ -253,6 +260,7 @@ * - on read error, send RST to tap, close socket * - on zero read, send FIN to tap, enter ESTABLISHED_SOCK_FIN * - on ACK from tap: + * - set @ts_ack_tap * - check if it's the second duplicated ACK * - consume buffer by difference between new ack_seq and @seq_ack_from_tap * - update @seq_ack_from_tap from ack_seq in header @@ -263,11 +271,12 @@ * - periodically: * - if @seq_ack_from_tap < @seq_to_tap and the retransmission timer * (TODO: implement requirements from RFC 6298, currently 3s fixed) from - * @last_ts_to_tap elapsed, reset @seq_to_tap to @seq_ack_from_tap, and + * @ts_sock elapsed, reset @seq_to_tap to @seq_ack_from_tap, and * resend data with the steps listed above * * - from tap to socket: * - on 
packet from tap: + * - set @ts_tap * - set TCP_WINDOW_CLAMP from TCP header from tap * - check seq from header against @seq_from_tap, if data is missing, send * two ACKs with number @seq_ack_to_tap, discard packet @@ -277,15 +286,11 @@ * set @tcpi_acked_last to tcpi_bytes_acked, set @seq_ack_to_tap * to (tcpi_bytes_acked + @seq_init_from_tap) % 2^32 and * send ACK to tap - * - set @last_ts_sock - * - on @seq_ack_to_tap < @seq_from_tap, mark socket for later ACK in bitmap * - periodically: - * - if socket is marked in bitmap, query socket for TCP_INFO, on - * tcpi_bytes_acked > @tcpi_acked_last, + * - query socket for TCP_INFO, on tcpi_bytes_acked > @tcpi_acked_last, * set @tcpi_acked_last to tcpi_bytes_acked, set @seq_ack_to_tap * to (tcpi_bytes_acked + @seq_init_from_tap) % 2^32 and * send ACK to tap - * - on @seq_ack_to_tap == @seq_from_tap, unmark socket from bitmap */ #define _GNU_SOURCE @@ -321,22 +326,17 @@ #define SYN_TIMEOUT 240000 /* ms */ #define ACK_TIMEOUT 3000 +#define ACK_INTERVAL 50 #define ACT_TIMEOUT 7200000 #define FIN_TIMEOUT 240000 #define LAST_ACK_TIMEOUT 240000 -#define SOCK_ACK_INTERVAL 20 /* We need to include for tcpi_bytes_acked, instead of * , but that doesn't include a definition for SOL_TCP */ #define SOL_TCP IPPROTO_TCP -static char tcp_in_buf[MAX_WINDOW]; - -static uint8_t tcp_act_fast[MAX_CONNS / 8] = { 0 }; -static uint8_t tcp_act_slow[MAX_CONNS / 8] = { 0 }; - enum tcp_state { CLOSED = 0, TAP_SYN_SENT, @@ -347,6 +347,13 @@ enum tcp_state { CLOSE_WAIT, LAST_ACK, FIN_WAIT_1, + FIN_WAIT_1_SOCK_FIN, +}; + +static char *tcp_state_str[FIN_WAIT_1_SOCK_FIN + 1] = { + "CLOSED", "TAP_SYN_SENT", "SOCK_SYN_SENT", "TAP_SYN_RCVD", + "ESTABLISHED", "ESTABLISHED_SOCK_FIN", "CLOSE_WAIT", "LAST_ACK", + "FIN_WAIT_1", "FIN_WAIT_1_SOCK_FIN", }; #define FIN (1 << 0) @@ -357,7 +364,9 @@ enum tcp_state { #define OPT_EOL 0 #define OPT_NOP 1 #define OPT_MSS 2 +#define OPT_MSS_LEN 4 #define OPT_WS 3 +#define OPT_WS_LEN 3 #define OPT_SACKP 4 #define OPT_SACK 
5 #define OPT_TS 8 @@ -381,8 +390,9 @@ enum tcp_state { * @ws_allowed: Window scaling allowed * @ws: Window scaling factor * @tap_window: Last window size received from tap, scaled - * @last_ts_sock: Last activity timestamp from socket for timeout purposes - * @last_ts_tap: Last activity timestamp from tap for timeout purposes + * @ts_sock: Last activity timestamp from socket for timeout purposes + * @ts_tap: Last activity timestamp from tap for timeout purposes + * @ts_ack_tap: Last ACK segment timestamp from tap for timeout purposes * @mss_guest: Maximum segment size advertised by guest */ struct tcp_conn { @@ -410,106 +420,101 @@ struct tcp_conn { int ws; int tap_window; - struct timespec last_ts_sock; - struct timespec last_ts_tap; + struct timespec ts_sock; + struct timespec ts_tap; + struct timespec ts_ack_tap; int mss_guest; }; +static char sock_buf[MAX_WINDOW]; +static uint8_t tcp_act[MAX_CONNS / 8] = { 0 }; static struct tcp_conn tc[MAX_CONNS]; static int tcp_send_to_tap(struct ctx *c, int s, int flags, char *in, int len); /** - * tcp_act_fast_set() - Set socket in bitmap for "fast" timeout events + * tcp_act_set() - Set socket in bitmap for timed events * @s: Socket file descriptor number */ -static void tcp_act_fast_set(int s) +static void tcp_act_set(int s) { - tcp_act_fast[s / 8] |= 1 << (s % 8); + tcp_act[s / 8] |= 1 << (s % 8); } /** - * tcp_act_fast_clear() - Clear socket from bitmap for "fast" timeout events + * tcp_act_clear() - Clear socket from bitmap for timed events * @s: Socket file descriptor number */ -static void tcp_act_fast_clear(int s) +static void tcp_act_clear(int s) { - tcp_act_fast[s / 8] &= ~(1 << (s % 8)); + tcp_act[s / 8] &= ~(1 << (s % 8)); } /** - * tcp_act_slow_set() - Set socket in bitmap for "slow" timeout events + * tcp_set_state() - Set given TCP state for socket, report change to stderr * @s: Socket file descriptor number + * @state: New TCP state to be set */ -static void tcp_act_slow_set(int s) +static void 
tcp_set_state(int s, enum tcp_state state) { - tcp_act_slow[s / 8] |= 1 << (s % 8); -} - -/** - * tcp_act_slow_clear() - Clear socket from bitmap for "slow" timeout events - * @s: Socket file descriptor number - */ -static void tcp_act_slow_clear(int s) -{ - tcp_act_slow[s / 8] &= ~(1 << (s % 8)); + fprintf(stderr, "TCP: socket %i: %s -> %s\n", s, + tcp_state_str[tc[s].s], tcp_state_str[state]); + tc[s].s = state; } /** * tcp_opt_get() - Get option, and value if any, from TCP header * @th: Pointer to TCP header * @len: Length of buffer, including TCP header - * @type: Option type to look for - * @optlen: Optional, filled with option length if passed - * @value: Optional, set to start of option value if passed + * @__type: Option type to look for + * @__optlen: Optional, filled with option length if passed + * @__value: Optional, set to start of option value if passed * * Return: Option value, meaningful for up to 4 bytes, -1 if not found */ -static int tcp_opt_get(struct tcphdr *th, unsigned int len, uint8_t type, - uint8_t *optlen, void *value) +static int tcp_opt_get(struct tcphdr *th, size_t len, uint8_t __type, + uint8_t *__optlen, char **__value) { - uint8_t *p, __type, __optlen; + uint8_t type, optlen; + char *p; + + if (len > th->doff * 4) + len = th->doff * 4; len -= sizeof(*th); - p = (uint8_t *)(th + 1); + p = (char *)(th + 1); - if (len > th->doff * 4 - sizeof(*th)) - len = th->doff * 4 - sizeof(*th); - - while (len >= 2) { + for (; len >= 2; p += optlen, len -= optlen) { switch (*p) { case OPT_EOL: return -1; case OPT_NOP: - p++; - len--; + optlen = 1; break; default: - __type = *(p++); - __optlen = *(p++); + type = *(p++); + optlen = *(p++) - 2; len -= 2; - if (type == __type) { - if (optlen) - *optlen = __optlen; - if (value) - value = p; + if (type != __type) + break; - if (__optlen - 2 == 0) - return 0; - - if (__optlen - 2 == 1) - return *p; - - if (__optlen - 2 == 2) - return ntohs(*(uint16_t *)p); + if (__optlen) + *__optlen = optlen; + if 
(__value) + *__value = p; + switch (optlen) { + case 0: + return 0; + case 1: + return *p; + case 2: + return ntohs(*(uint16_t *)p); + default: return ntohl(*(uint32_t *)p); } - - p += __optlen - 2; - len -= __optlen - 2; } } @@ -524,9 +529,9 @@ static int tcp_opt_get(struct tcphdr *th, unsigned int len, uint8_t type, static void tcp_close_and_epoll_del(struct ctx *c, int s) { epoll_ctl(c->epollfd, EPOLL_CTL_DEL, s, NULL); + tcp_set_state(s, CLOSED); close(s); - tcp_act_fast_clear(s); - tcp_act_slow_clear(s); + tcp_act_clear(s); } /** @@ -541,7 +546,7 @@ static void tcp_rst(struct ctx *c, int s) tcp_send_to_tap(c, s, RST, NULL, 0); tcp_close_and_epoll_del(c, s); - tc[s].s = CLOSED; + tcp_set_state(s, CLOSED); } /** @@ -549,76 +554,70 @@ static void tcp_rst(struct ctx *c, int s) * @c: Execution context * @s: File descriptor number for socket * @flags: TCP flags to set - * @in: Input buffer, L4 header - * @len: Buffer length, at L4 + * @in: Payload buffer + * @len: Payload length * - * Return: -1 on error with connection reset, 0 otherwise + * Return: negative error code on connection reset, 0 otherwise */ static int tcp_send_to_tap(struct ctx *c, int s, int flags, char *in, int len) { char buf[USHRT_MAX] = { 0 }, *data; struct tcp_info info = { 0 }; socklen_t sl = sizeof(info); - int ws = 0, have_info = 1; struct tcphdr *th; + int ws = 0, err; - if (getsockopt(s, SOL_TCP, TCP_INFO, &info, &sl)) { - if (!(flags & RST)) { - tcp_rst(c, s); - return -1; - } - - have_info = 0; + if ((err = getsockopt(s, SOL_TCP, TCP_INFO, &info, &sl)) && + !(flags & RST)) { + tcp_rst(c, s); + return err; } th = (struct tcphdr *)buf; data = (char *)(th + 1); + th->doff = sizeof(*th) / 4; - if (flags & SYN && have_info) { - if (tc[s].ws_allowed) - ws = info.tcpi_snd_wscale; - + if ((flags & SYN) && !err) { /* Options: MSS, NOP and window scale if allowed (4-8 bytes) */ - *data++ = 2; - *data++ = 4; + *data++ = OPT_MSS; + *data++ = OPT_MSS_LEN; *(uint16_t *)data = htons(info.tcpi_snd_mss); 
- data += 2; + data += OPT_MSS_LEN - 2; + th->doff += OPT_MSS_LEN / 4; - if (ws) { - *data++ = 1; + if (tc[s].ws_allowed && (ws = info.tcpi_snd_wscale)) { + *data++ = OPT_NOP; - *data++ = 3; - *data++ = 3; - *data++ = ws; + *data++ = OPT_WS; + *data++ = OPT_WS_LEN; + *data = ws; + *data += OPT_WS_LEN - 2; - th->doff = (20 + 8) / 4; - } else { - th->doff = (20 + 4) / 4; + th->doff += (1 + OPT_WS_LEN) / 4; } + /* RFC 793, 3.1: "[...] and the first data octet is ISN+1." */ th->seq = htonl(tc[s].seq_to_tap++); } else { - th->doff = 20 / 4; - th->seq = htonl(tc[s].seq_to_tap); tc[s].seq_to_tap += len; } - if ((info.tcpi_bytes_acked > tc[s].tcpi_acked_last || (flags & ACK) || - len) && - have_info) { + if (!err && ((info.tcpi_bytes_acked > tc[s].tcpi_acked_last) || + (flags & ACK) || len)) { uint64_t ack_seq; th->ack = 1; - /* info.tcpi_bytes_acked already includes one byte for SYN, but - * not for incoming connections. - */ - ack_seq = info.tcpi_bytes_acked + tc[s].seq_init_from_tap; - if (!info.tcpi_bytes_acked) - ack_seq++; - ack_seq &= (uint32_t)~0U; - tc[s].seq_ack_to_tap = ack_seq; + ack_seq = info.tcpi_bytes_acked + tc[s].seq_init_from_tap; + + tc[s].seq_ack_to_tap = ack_seq & (uint32_t)~0U; + + if (tc[s].s == LAST_ACK) { + tc[s].seq_ack_to_tap = tc[s].seq_from_tap + 1; + th->seq = htonl(ntohl(th->seq) + 1); + } + th->ack_seq = htonl(tc[s].seq_ack_to_tap); tc[s].tcpi_acked_last = info.tcpi_bytes_acked; @@ -636,7 +635,7 @@ static int tcp_send_to_tap(struct ctx *c, int s, int flags, char *in, int len) th->source = tc[s].sock_port; th->dest = tc[s].tap_port; - if (have_info) + if (!err) th->window = htons(info.tcpi_snd_wnd >> info.tcpi_snd_wscale); else th->window = WINDOW_DEFAULT; @@ -656,23 +655,18 @@ static int tcp_send_to_tap(struct ctx *c, int s, int flags, char *in, int len) * @s: File descriptor number for socket * @th: TCP header, from tap * @len: Buffer length, at L4 + * @init: Set if this is the very first segment from tap */ -static void 
tcp_clamp_window(int s, struct tcphdr *th, int len) +static void tcp_clamp_window(int s, struct tcphdr *th, int len, int init) { - int ws; + if (init) { + tc[s].ws = tcp_opt_get(th, len, OPT_WS, NULL, NULL); + tc[s].ws_allowed = tc[s].ws >= 0 && tc[s].ws <= MAX_WS; + tc[s].ws *= tc[s].ws_allowed; - if (!tc[s].tap_window) { - ws = tcp_opt_get(th, len, OPT_WS, NULL, NULL); - if (ws >= 0 && ws <= MAX_WS) { - tc[s].ws_allowed = 1; - tc[s].ws = ws; - } else { - tc[s].ws_allowed = 0; - tc[s].ws = 0; - } - - /* First value is not scaled. Also, don't clamp yet, to avoid - * getting a zero scale just because we set a small window now. + /* RFC 7323, 2.2: first value is not scaled. Also, don't clamp + * yet, to avoid getting a zero scale just because we set a + * small window now. */ tc[s].tap_window = ntohs(th->window); } else { @@ -718,25 +712,31 @@ static void tcp_conn_from_tap(struct ctx *c, int af, void *addr, sl = sizeof(tc[s].mss_guest); setsockopt(s, SOL_TCP, TCP_MAXSEG, &tc[s].mss_guest, sl); - tcp_clamp_window(s, th, len); + tcp_clamp_window(s, th, len, 1); if (af == AF_INET) { - sa = (const struct sockaddr *)&addr4; + sa = (struct sockaddr *)&addr4; sl = sizeof(addr4); - memset(&tc[s].a.a4.zero, 0, sizeof(tc[s].a.a4.zero)); - memset(&tc[s].a.a4.one, 0xff, sizeof(tc[s].a.a4.one)); - memcpy(&tc[s].a.a4.a, addr, sizeof(tc[s].a.a4.a)); + memset(&tc[s].a.a4.zero, 0, sizeof(tc[s].a.a4.zero)); + memset(&tc[s].a.a4.one, 0xff, sizeof(tc[s].a.a4.one)); + memcpy(&tc[s].a.a4.a, addr, sizeof(tc[s].a.a4.a)); } else { - sa = (const struct sockaddr *)&addr6; + sa = (struct sockaddr *)&addr6; sl = sizeof(addr6); - memcpy(&tc[s].a.a6, addr, sizeof(tc[s].a.a6)); + memcpy(&tc[s].a.a6, addr, sizeof(tc[s].a.a6)); } tc[s].sock_port = th->dest; tc[s].tap_port = th->source; + clock_gettime(CLOCK_MONOTONIC, &tc[s].ts_sock); + clock_gettime(CLOCK_MONOTONIC, &tc[s].ts_tap); + clock_gettime(CLOCK_MONOTONIC, &tc[s].ts_ack_tap); + + tcp_act_set(s); + ev.events = EPOLLIN | EPOLLET | EPOLLRDHUP | 
EPOLLERR | EPOLLHUP; ev.data.fd = s; @@ -745,7 +745,8 @@ static void tcp_conn_from_tap(struct ctx *c, int af, void *addr, tc[s].seq_ack_to_tap = tc[s].seq_from_tap; /* TODO: RFC 6528 with SipHash, worth it? */ - tc[s].seq_ack_from_tap = tc[s].seq_to_tap = 0; + tc[s].seq_to_tap = 0; + tc[s].seq_ack_from_tap = tc[s].seq_to_tap; if (connect(s, sa, sl)) { if (errno != EINPROGRESS) { @@ -754,17 +755,15 @@ static void tcp_conn_from_tap(struct ctx *c, int af, void *addr, } ev.events |= EPOLLOUT; - tc[s].s = TAP_SYN_SENT; + tcp_set_state(s, TAP_SYN_SENT); } else { if (tcp_send_to_tap(c, s, SYN | ACK, NULL, 0)) return; - tc[s].s = TAP_SYN_RCVD; + tcp_set_state(s, TAP_SYN_RCVD); } epoll_ctl(c->epollfd, EPOLL_CTL_ADD, s, &ev); - - return; } /** @@ -773,7 +772,7 @@ static void tcp_conn_from_tap(struct ctx *c, int af, void *addr, * @tap_port: tap-facing port * @sock_port: Socket-facing port * - * Return: file descriptor number for socket, if found, -1 otherwise + * Return: file descriptor number for socket, if found, -ENOENT otherwise */ static int tcp_sock_lookup(int af, void *addr, in_port_t tap_port, in_port_t sock_port) @@ -797,7 +796,7 @@ static int tcp_sock_lookup(int af, void *addr, return i; } - return -1; + return -ENOENT; } /** @@ -808,10 +807,8 @@ static int tcp_sock_lookup(int af, void *addr, static void tcp_conn_from_sock(struct ctx *c, int fd) { struct sockaddr_storage sa_r, sa_l; - socklen_t sa_len = sizeof(sa_r); + socklen_t sa_len = sizeof(sa_l); struct epoll_event ev = { 0 }; - struct sockaddr_in6 *sa6; - struct sockaddr_in *sa4; int s; if (getsockname(fd, (struct sockaddr *)&sa_l, &sa_len)) @@ -822,41 +819,41 @@ static void tcp_conn_from_sock(struct ctx *c, int fd) return; if (sa_l.ss_family == AF_INET) { - sa4 = (struct sockaddr_in *)&sa_r; + struct sockaddr_in *sa4 = (struct sockaddr_in *)&sa_r; memset(&tc[s].a.a4.zero, 0, sizeof(tc[s].a.a4.zero)); memset(&tc[s].a.a4.one, 0xff, sizeof(tc[s].a.a4.one)); memcpy(&tc[s].a.a4.a, &sa4->sin_addr, 
sizeof(tc[s].a.a4.a)); tc[s].sock_port = sa4->sin_port; - - sa4 = (struct sockaddr_in *)&sa_l; - tc[s].tap_port = sa4->sin_port; - + tc[s].tap_port = ((struct sockaddr_in *)&sa_l)->sin_port; } else if (sa_l.ss_family == AF_INET6) { - sa6 = (struct sockaddr_in6 *)&sa_r; + struct sockaddr_in6 *sa6 = (struct sockaddr_in6 *)&sa_r; memcpy(&tc[s].a.a6, &sa6->sin6_addr, sizeof(tc[s].a.a6)); tc[s].sock_port = sa6->sin6_port; - - sa6 = (struct sockaddr_in6 *)&sa_l; - tc[s].tap_port = sa6->sin6_port; + tc[s].tap_port = ((struct sockaddr_in6 *)&sa_l)->sin6_port; } /* TODO: RFC 6528 with SipHash, worth it? */ tc[s].seq_to_tap = 0; + tc[s].seq_ack_from_tap = tc[s].seq_to_tap + 1; + tc[s].tap_window = WINDOW_DEFAULT; tc[s].ws_allowed = 1; - clock_gettime(CLOCK_MONOTONIC, &tc[s].last_ts_sock); - clock_gettime(CLOCK_MONOTONIC, &tc[s].last_ts_tap); + clock_gettime(CLOCK_MONOTONIC, &tc[s].ts_sock); + clock_gettime(CLOCK_MONOTONIC, &tc[s].ts_tap); + clock_gettime(CLOCK_MONOTONIC, &tc[s].ts_ack_tap); + + tcp_act_set(s); ev.events = EPOLLIN | EPOLLET | EPOLLRDHUP | EPOLLERR | EPOLLHUP; ev.data.fd = s; epoll_ctl(c->epollfd, EPOLL_CTL_ADD, s, &ev); - tc[s].s = SOCK_SYN_SENT; + tcp_set_state(s, SOCK_SYN_SENT); tcp_send_to_tap(c, s, SYN, NULL, 0); } @@ -864,14 +861,13 @@ static void tcp_conn_from_sock(struct ctx *c, int fd) * tcp_send_to_sock() - Send buffer to socket, update timestamp and sequence * @c: Execution context * @s: File descriptor number for socket - * @seq: Previous TCP sequence, host order * @data: Data buffer * @len: Length at L4 * @extra_flags: Additional flags for send(), if any * - * Return: -1 on socket error with connection reset, 0 otherwise + * Return: negative on socket error with connection reset, 0 otherwise */ -static int tcp_send_to_sock(struct ctx *c, int s, int seq, char *data, int len, +static int tcp_send_to_sock(struct ctx *c, int s, char *data, int len, int extra_flags) { int err = send(s, data, len, MSG_DONTWAIT | MSG_NOSIGNAL | extra_flags); @@ -884,28 
+880,28 @@ static int tcp_send_to_sock(struct ctx *c, int s, int seq, char *data, int len, return 0; } + err = errno; tcp_rst(c, s); - return -1; + return -err; } - clock_gettime(CLOCK_MONOTONIC, &tc[s].last_ts_sock); - tc[s].seq_from_tap = seq + len; + tc[s].seq_from_tap += len; return 0; } /** - * tcp_check_dupack() - Check if given ACK number is duplicated, update counter + * tcp_is_dupack() - Check if given ACK number is duplicated, update counter * @s: File descriptor number for socket * @ack_seq: ACK sequence, host order * - * Return: 1 on two duplicated ACKs observed, with counter reset, 0 otherwise + * Return: -EAGAIN on duplicated ACKs observed, with counter reset, 0 otherwise */ -static int tcp_check_dupack(int s, uint32_t ack_seq) +static int tcp_is_dupack(int s, uint32_t ack_seq) { if (ack_seq == tc[s].seq_ack_from_tap && ++tc[s].dup_acks == 2) { tc[s].dup_acks = 0; - return 1; + return -EAGAIN; } return 0; @@ -916,7 +912,7 @@ static int tcp_check_dupack(int s, uint32_t ack_seq) * @s: File descriptor number for socket * @ack_seq: ACK sequence, host order * - * Return: -1 on invalid sequence, 0 otherwise + * Return: negative on invalid sequence, 0 otherwise */ static int tcp_sock_consume(int s, uint32_t ack_seq) { @@ -926,7 +922,7 @@ static int tcp_sock_consume(int s, uint32_t ack_seq) to_ack = ack_seq - tc[s].seq_ack_from_tap; if (to_ack < 0) - return -1; + return -EIO; recv(s, NULL, to_ack, MSG_DONTWAIT | MSG_TRUNC); tc[s].seq_ack_from_tap = ack_seq; @@ -939,27 +935,29 @@ static int tcp_sock_consume(int s, uint32_t ack_seq) * @c: Execution context * @s: File descriptor number for socket * - * Return: non-zero on socket error or pending data, 0 otherwise + * Return: negative on connection reset, 1 on pending data, 0 otherwise */ static int tcp_data_from_sock(struct ctx *c, int s) { - int len, offset, left, send; + int len, err, offset, left, send; /* Don't dequeue until acknowledged by guest */ - len = recv(s, tcp_in_buf, sizeof(tcp_in_buf), 
MSG_DONTWAIT | MSG_PEEK); + len = recv(s, sock_buf, sizeof(sock_buf), MSG_DONTWAIT | MSG_PEEK); if (len < 0) { - if (errno != EAGAIN && errno != EWOULDBLOCK) + if (errno != EAGAIN && errno != EWOULDBLOCK) { tcp_rst(c, s); - return 1; + return -errno; + } + return 0; } if (len == 0) { if (tc[s].s >= ESTABLISHED_SOCK_FIN) return 0; - tc[s].s = ESTABLISHED_SOCK_FIN; - if (tcp_send_to_tap(c, s, FIN | ACK, NULL, 0)) - return 0; + tcp_set_state(s, ESTABLISHED_SOCK_FIN); + if ((err = tcp_send_to_tap(c, s, FIN | ACK, NULL, 0))) + return err; left = 0; goto out; @@ -973,16 +971,15 @@ static int tcp_data_from_sock(struct ctx *c, int s) else send = tc[s].mss_guest; - if (tcp_send_to_tap(c, s, 0, tcp_in_buf + offset, send)) - return 0; + if ((err = tcp_send_to_tap(c, s, 0, sock_buf + offset, send))) + return err; offset += send; left -= send; } out: - clock_gettime(CLOCK_MONOTONIC, &tc[s].last_ts_tap); - tcp_act_slow_set(s); + clock_gettime(CLOCK_MONOTONIC, &tc[s].ts_sock); return !!left; } @@ -997,7 +994,7 @@ out: void tcp_tap_handler(struct ctx *c, int af, void *addr, char *in, size_t len) { struct tcphdr *th = (struct tcphdr *)in; - size_t off; + size_t off, skip = 0; int s, ws; if (len < sizeof(*th)) @@ -1007,9 +1004,7 @@ void tcp_tap_handler(struct ctx *c, int af, void *addr, char *in, size_t len) if (off < sizeof(*th) || off > len) return; - s = tcp_sock_lookup(af, addr, th->source, th->dest); - - if (s < 0) { + if ((s = tcp_sock_lookup(af, addr, th->source, th->dest)) < 0) { if (th->syn) tcp_conn_from_tap(c, af, addr, th, len); return; @@ -1020,15 +1015,19 @@ void tcp_tap_handler(struct ctx *c, int af, void *addr, char *in, size_t len) return; } - tcp_clamp_window(s, th, len); + tcp_clamp_window(s, th, len, th->syn && th->ack); - if (th->ack) - clock_gettime(CLOCK_MONOTONIC, &tc[s].last_ts_tap); + clock_gettime(CLOCK_MONOTONIC, &tc[s].ts_tap); + + if (ntohl(th->seq) < tc[s].seq_from_tap) + skip = tc[s].seq_from_tap - ntohl(th->seq); switch (tc[s].s) { case 
SOCK_SYN_SENT: - if (!th->syn || !th->ack) + if (!th->syn || !th->ack) { + tcp_rst(c, s); return; + } tc[s].mss_guest = tcp_opt_get(th, len, OPT_MSS, NULL, NULL); if (tc[s].mss_guest < 0) @@ -1045,19 +1044,20 @@ void tcp_tap_handler(struct ctx *c, int af, void *addr, char *in, size_t len) return; } - tc[s].seq_from_tap = tc[s].seq_init_from_tap = ntohl(th->seq); + /* info.tcpi_bytes_acked already includes one byte for SYN, but + * not for incoming connections. + */ + tc[s].seq_init_from_tap = ntohl(th->seq) + 1; + tc[s].seq_from_tap = tc[s].seq_init_from_tap; tc[s].seq_ack_to_tap = tc[s].seq_from_tap; - tc[s].s = ESTABLISHED; + tcp_set_state(s, ESTABLISHED); tcp_send_to_tap(c, s, ACK, NULL, 0); break; - case TAP_SYN_SENT: - break; case TAP_SYN_RCVD: if (th->fin) { shutdown(s, SHUT_WR); - tc[s].s = FIN_WAIT_1; - + tcp_set_state(s, FIN_WAIT_1); break; } @@ -1066,83 +1066,81 @@ void tcp_tap_handler(struct ctx *c, int af, void *addr, char *in, size_t len) return; } - tc[s].seq_ack_from_tap = ntohl(th->ack_seq); - - tc[s].s = ESTABLISHED; + tcp_set_state(s, ESTABLISHED); break; case ESTABLISHED: + case ESTABLISHED_SOCK_FIN: + clock_gettime(CLOCK_MONOTONIC, &tc[s].ts_ack_tap); + + if (ntohl(th->seq) > tc[s].seq_from_tap) { + tc[s].seq_from_tap = tc[s].seq_ack_to_tap; + tcp_send_to_tap(c, s, ACK, NULL, 0); + break; + } + if (th->ack) { int retrans = 0; - if (len == th->doff) - retrans = tcp_check_dupack(s, th->ack_seq); + if (len == off) + retrans = tcp_is_dupack(s, ntohl(th->ack_seq)); if (tcp_sock_consume(s, ntohl(th->ack_seq))) { tcp_rst(c, s); return; } - if (retrans) { + tc[s].seq_ack_from_tap = ntohl(th->ack_seq); + + if (retrans) tc[s].seq_to_tap = tc[s].seq_ack_from_tap; - tcp_data_from_sock(c, s); + + if (tc[s].s == ESTABLISHED_SOCK_FIN) { + if (!tcp_data_from_sock(c, s)) + tcp_set_state(s, CLOSE_WAIT); } } - if (tcp_send_to_sock(c, s, ntohl(th->seq), in + off, len - off, + if (skip < len - off && + tcp_send_to_sock(c, s, in + off + skip, len - off - skip, 
th->psh ? 0 : MSG_MORE)) break; if (th->fin) { shutdown(s, SHUT_WR); - tc[s].s = FIN_WAIT_1; + if (tc[s].s == ESTABLISHED) + tcp_set_state(s, FIN_WAIT_1); + else + tcp_set_state(s, LAST_ACK); } break; - case ESTABLISHED_SOCK_FIN: - if (tcp_send_to_sock(c, s, ntohl(th->seq), in + off, len - off, - th->psh ? 0 : MSG_MORE) < 0) - break; - - if (th->ack) { - shutdown(s, SHUT_RD); - if (!tcp_data_from_sock(c, s)) - tc[s].s = CLOSE_WAIT; - - if (tcp_sock_consume(s, ntohl(th->ack_seq))) { - tcp_rst(c, s); - return; - } - } - - break; - case CLOSE_WAIT: if (tcp_sock_consume(s, ntohl(th->ack_seq))) { tcp_rst(c, s); return; } + if (skip < len - off && + tcp_send_to_sock(c, s, in + off + skip, len - off - skip, + th->psh ? 0 : MSG_MORE)) + break; + if (th->fin) { shutdown(s, SHUT_WR); - tc[s].s = LAST_ACK; + tcp_set_state(s, LAST_ACK); } break; + case FIN_WAIT_1_SOCK_FIN: + if (th->ack) + tcp_close_and_epoll_del(c, s); + break; case FIN_WAIT_1: + case TAP_SYN_SENT: case LAST_ACK: case CLOSED: /* ;) */ break; } - - if (tc[s].seq_to_tap > tc[s].seq_ack_from_tap) - tcp_act_slow_set(s); - else - tcp_act_slow_clear(s); - - if (tc[s].seq_from_tap > tc[s].seq_ack_to_tap) - tcp_act_fast_set(s); - else - tcp_act_fast_clear(s); } /** @@ -1162,14 +1160,15 @@ static void tcp_connect_finish(struct ctx *c, int s) return; } - if (tcp_send_to_tap(c, s, SYN | ACK, NULL, 0) < 0) + if (tcp_send_to_tap(c, s, SYN | ACK, NULL, 0)) return; + /* Drop EPOLLOUT, only used to wait for connect() to complete */ ev.events = EPOLLIN | EPOLLET | EPOLLRDHUP | EPOLLERR | EPOLLHUP; ev.data.fd = s; epoll_ctl(c->epollfd, EPOLL_CTL_MOD, s, &ev); - tc[s].s = TAP_SYN_RCVD; + tcp_set_state(s, TAP_SYN_RCVD); } /** @@ -1184,6 +1183,7 @@ void tcp_sock_handler(struct ctx *c, int s, uint32_t events) int so; if (tc[s].s == LAST_ACK) { + tcp_send_to_tap(c, s, ACK, NULL, 0); tcp_close_and_epoll_del(c, s); return; } @@ -1210,21 +1210,21 @@ void tcp_sock_handler(struct ctx *c, int s, uint32_t events) tcp_data_from_sock(c, s); 
if (events & EPOLLRDHUP || events & EPOLLHUP) { - if (tc[s].s == ESTABLISHED) - tc[s].s = ESTABLISHED_SOCK_FIN; - - tcp_send_to_tap(c, s, FIN | ACK, NULL, 0); - - if (tc[s].s == FIN_WAIT_1) { + if (tc[s].s == ESTABLISHED) { + tcp_set_state(s, ESTABLISHED_SOCK_FIN); shutdown(s, SHUT_RD); + tcp_data_from_sock(c, s); + tcp_send_to_tap(c, s, FIN | ACK, NULL, 0); + } else if (tc[s].s == FIN_WAIT_1) { + tcp_set_state(s, FIN_WAIT_1_SOCK_FIN); + shutdown(s, SHUT_RD); + tcp_data_from_sock(c, s); + tcp_send_to_tap(c, s, FIN | ACK, NULL, 0); - if (tcp_sock_consume(s, ntohl(tc[s].seq_ack_from_tap))) { + if (tcp_sock_consume(s, tc[s].seq_ack_from_tap)) { tcp_rst(c, s); return; } - - tcp_close_and_epoll_del(c, s); - tc[s].s = CLOSED; } } } @@ -1240,9 +1240,9 @@ int tcp_sock_init(struct ctx *c) in_port_t port; for (port = 0; port < (1 << 15) + (1 << 14); port++) { - if (c->v4 && sock_l4_add(c, 4, IPPROTO_TCP, htons(port)) < 0) + if (c->v4 && sock_l4_add(c, 4, IPPROTO_TCP, port) < 0) return -1; - if (c->v6 && sock_l4_add(c, 6, IPPROTO_TCP, htons(port)) < 0) + if (c->v6 && sock_l4_add(c, 6, IPPROTO_TCP, port) < 0) return -1; } @@ -1250,118 +1250,92 @@ int tcp_sock_init(struct ctx *c) } /** - * tcp_periodic_fast_one() - Handler for "fast" timeout events on one socket - * @c: Execution context - * @s: File descriptor number for socket - * @ts: Timestamp from caller - * - * Return: 0 if socket needs to be monitored further, non-zero otherwise - */ -int tcp_periodic_fast_one(struct ctx *c, int s, struct timespec *ts) -{ - if (timespec_diff_ms(ts, &tc[s].last_ts_sock) < SOCK_ACK_INTERVAL) - return 0; - - tc[s].last_ts_sock = *ts; - - tcp_send_to_tap(c, s, 0, NULL, 0); - - return tc[s].seq_from_tap == tc[s].seq_ack_to_tap; -} - -/** - * tcp_periodic_fast() - Handle sockets in "fast" event bitmap, clear as needed - * @c: Execution context - */ -void tcp_periodic_fast(struct ctx *c) -{ - long *word = (long *)tcp_act_fast, tmp; - struct timespec now; - unsigned int i; - int n, s; - - 
clock_gettime(CLOCK_MONOTONIC, &now); - - for (i = 0; i < sizeof(tcp_act_fast) / sizeof(long); i++, word++) { - tmp = *word; - while ((n = ffsl(tmp))) { - tmp &= ~(1UL << (n - 1)); - - s = i * sizeof(long) * 8 + n - 1; - - if (tcp_periodic_fast_one(c, s, &now)) - *word &= ~(1UL << (n - 1)); - } - } -} - -/** - * tcp_periodic_fast_one() - Handler for "slow" timeout events on one socket + * tcp_timer_one() - Handler for timed events on one socket * @c: Execution context * @s: File descriptor number for socket * @ts: Timestamp from caller */ -void tcp_periodic_slow_one(struct ctx *c, int s, struct timespec *ts) +static void tcp_timer_one(struct ctx *c, int s, struct timespec *ts) { + int ack_tap_ms = timespec_diff_ms(ts, &tc[s].ts_ack_tap); + int sock_ms = timespec_diff_ms(ts, &tc[s].ts_sock); + int tap_ms = timespec_diff_ms(ts, &tc[s].ts_tap); + switch (tc[s].s) { case SOCK_SYN_SENT: - case TAP_SYN_SENT: case TAP_SYN_RCVD: - if (timespec_diff_ms(ts, &tc[s].last_ts_tap) > SYN_TIMEOUT) + if (ack_tap_ms > SYN_TIMEOUT) tcp_rst(c, s); + break; case ESTABLISHED_SOCK_FIN: - if (timespec_diff_ms(ts, &tc[s].last_ts_tap) > FIN_TIMEOUT) { + if (ack_tap_ms > FIN_TIMEOUT) { tcp_rst(c, s); break; } /* Falls through */ case ESTABLISHED: - if (tc[s].seq_ack_from_tap < tc[s].seq_to_tap && - timespec_diff_ms(ts, &tc[s].last_ts_tap) > ACK_TIMEOUT) { - tc[s].seq_to_tap = tc[s].seq_ack_from_tap; - tcp_data_from_sock(c, s); + if (tap_ms > ACT_TIMEOUT && sock_ms > ACT_TIMEOUT) + tcp_rst(c, s); + + if (tc[s].seq_to_tap == tc[s].seq_ack_from_tap && + tc[s].seq_from_tap == tc[s].seq_ack_to_tap) { + tc[s].ts_sock = *ts; + break; } - if (timespec_diff_ms(ts, &tc[s].last_ts_tap) > ACT_TIMEOUT && - timespec_diff_ms(ts, &tc[s].last_ts_sock) > ACT_TIMEOUT) - tcp_rst(c, s); + if (sock_ms > ACK_INTERVAL) { + if (tc[s].seq_from_tap > tc[s].seq_ack_to_tap) + tcp_send_to_tap(c, s, 0, NULL, 0); + } + + if (ack_tap_ms > ACK_TIMEOUT) { + if (tc[s].seq_ack_from_tap < tc[s].seq_to_tap) { + tc[s].seq_to_tap =
tc[s].seq_ack_from_tap; + tc[s].ts_ack_tap = *ts; + tcp_data_from_sock(c, s); + } + } + + if (tc[s].seq_from_tap == tc[s].seq_ack_to_tap) + tc[s].ts_sock = *ts; break; case CLOSE_WAIT: case FIN_WAIT_1: - if (timespec_diff_ms(ts, &tc[s].last_ts_tap) > FIN_TIMEOUT) + if (sock_ms > FIN_TIMEOUT) + tcp_rst(c, s); + break; + case FIN_WAIT_1_SOCK_FIN: + if (ack_tap_ms > FIN_TIMEOUT) tcp_rst(c, s); break; case LAST_ACK: - if (timespec_diff_ms(ts, &tc[s].last_ts_sock) > - LAST_ACK_TIMEOUT) + if (sock_ms > LAST_ACK_TIMEOUT) tcp_rst(c, s); break; + case TAP_SYN_SENT: case CLOSED: break; } } /** - * tcp_periodic_slow() - Handle sockets in "slow" event bitmap + * tcp_timer() - Scan activity bitmap for sockets waiting for timed events * @c: Execution context + * @ts: Timestamp from caller */ -void tcp_periodic_slow(struct ctx *c) +void tcp_timer(struct ctx *c, struct timespec *ts) { - long *word = (long *)tcp_act_slow, tmp; - struct timespec now; + long *word = (long *)tcp_act, tmp; unsigned int i; int n; - clock_gettime(CLOCK_MONOTONIC, &now); - - for (i = 0; i < sizeof(tcp_act_slow) / sizeof(long); i++, word++) { + for (i = 0; i < sizeof(tcp_act) / sizeof(long); i++, word++) { tmp = *word; while ((n = ffsl(tmp))) { tmp &= ~(1UL << (n - 1)); - tcp_periodic_slow_one(c, i * sizeof(long) * 8 + n - 1, - &now); + tcp_timer_one(c, i * sizeof(long) * 8 + n - 1, ts); } } } diff --git a/tcp.h b/tcp.h index 1f16790..9fa8244 100644 --- a/tcp.h +++ b/tcp.h @@ -1,5 +1,4 @@ void tcp_sock_handler(struct ctx *c, int s, uint32_t events); void tcp_tap_handler(struct ctx *c, int af, void *addr, char *in, size_t len); int tcp_sock_init(struct ctx *c); -void tcp_periodic_fast(struct ctx *c); -void tcp_periodic_slow(struct ctx *c); +void tcp_timer(struct ctx *c, struct timespec *ts); diff --git a/udp.c b/udp.c index 74ce843..4acd48e 100644 --- a/udp.c +++ b/udp.c @@ -124,8 +124,6 @@ void udp_tap_handler(struct ctx *c, int af, void *addr, char *in, size_t len) if (!(s = 
udp4_sock_port[ntohs(uh->source)])) return; - fprintf(stderr, "udp from tap: using socket %i\n", s); - sa.sin_addr = *(struct in_addr *)addr; sendto(s, in + sizeof(*uh), len - sizeof(*uh), MSG_DONTWAIT, @@ -140,15 +138,14 @@ void udp_tap_handler(struct ctx *c, int af, void *addr, char *in, size_t len) if (!(s = udp6_sock_port[ntohs(uh->source)])) return; - fprintf(stderr, "udp from tap: using socket %i\n", s); - - sendto(s, in + sizeof(*uh), len - sizeof(*uh), MSG_DONTWAIT, + sendto(s, in + sizeof(*uh), len - sizeof(*uh), + MSG_DONTWAIT | MSG_NOSIGNAL, (struct sockaddr *)&sa, sizeof(sa)); } } /** - * udp_sock_init() - Create and bind listening sockets for inbound connections + * udp_sock_init() - Create and bind listening sockets for inbound packets * @c: Execution context * * Return: 0 on success, -1 on failure @@ -159,15 +156,19 @@ int udp_sock_init(struct ctx *c) int s; for (port = 0; port < USHRT_MAX; port++) { - if (c->v4 && - (s = sock_l4_add(c, 4, IPPROTO_UDP, htons(port))) < 0) - return -1; - udp4_sock_port[port] = s; + if (c->v4) { + if ((s = sock_l4_add(c, 4, IPPROTO_UDP, port)) < 0) + return -1; - if (c->v6 && - (s = sock_l4_add(c, 6, IPPROTO_UDP, htons(port))) < 0) - return -1; - udp6_sock_port[port] = s; + udp4_sock_port[port] = s; + } + + if (c->v6) { + if ((s = sock_l4_add(c, 6, IPPROTO_UDP, port)) < 0) + return -1; + + udp6_sock_port[port] = s; + } } return 0; diff --git a/util.c b/util.c index 324f800..e8ee57f 100644 --- a/util.c +++ b/util.c @@ -139,7 +139,7 @@ char *ipv6_l4hdr(struct ipv6hdr *ip6h, uint8_t *proto) * sock_l4_add() - Create and bind socket for given L4, add to epoll list * @c: Execution context * @v: IP protocol, 4 or 6 - * @proto: Protocol number, network order - * @port: Port, network order + * @proto: Protocol number, host order + * @port: Port, host order * * Return: newly created socket, -1 on error @@ -148,17 +148,17 @@ int sock_l4_add(struct ctx *c, int v, uint16_t proto, uint16_t port) { struct sockaddr_in addr4 = { .sin_family = AF_INET, - .sin_port =
port, + .sin_port = htons(port), .sin_addr = { .s_addr = INADDR_ANY }, }; struct sockaddr_in6 addr6 = { .sin6_family = AF_INET6, - .sin6_port = port, + .sin6_port = htons(port), .sin6_addr = IN6ADDR_ANY_INIT, }; struct epoll_event ev = { 0 }; const struct sockaddr *sa; - int fd, sl; + int fd, sl, one = 1; if (proto != IPPROTO_TCP && proto != IPPROTO_UDP) return -1; /* Not implemented. */ @@ -176,6 +176,8 @@ int sock_l4_add(struct ctx *c, int v, uint16_t proto, uint16_t port) } else { sa = (const struct sockaddr *)&addr6; sl = sizeof(addr6); + + setsockopt(fd, IPPROTO_IPV6, IPV6_V6ONLY, &one, sizeof(one)); } if (bind(fd, sa, sl) < 0) { @@ -213,10 +215,10 @@ int sock_l4_add(struct ctx *c, int v, uint16_t proto, uint16_t port) int timespec_diff_ms(struct timespec *a, struct timespec *b) { if (a->tv_nsec < b->tv_nsec) { - return (b->tv_nsec - a->tv_nsec) / 1000 + + return (a->tv_nsec + 1000000000 - b->tv_nsec) / 1000000 + (a->tv_sec - b->tv_sec - 1) * 1000; } - return (a->tv_nsec - b->tv_nsec) / 1000 + + return (a->tv_nsec - b->tv_nsec) / 1000000 + (a->tv_sec - b->tv_sec) * 1000; }