passt: Spare some syscalls, add some optimisations from profiling

Avoid a bunch of syscalls on forwarding paths by:

- storing minimum and maximum file descriptor numbers for each
  protocol, falling back to an SO_PROTOCOL query only when ranges overlap

- allocating a larger receive buffer -- this can result in more
  coalesced packets than sendmmsg() can take (UIO_MAXIOV, i.e. 1024),
  so make sure we don't exceed that within a single call to protocol
  tap handlers

- nesting the handling loop in tap_handler() in the receive loop,
  so that we have better chances of filling our receive buffer in
  fewer calls

- skipping the recvfrom() in the UDP handler on EPOLLERR -- there's
  nothing to be done in that case

and while at it:

- restore the 20ms timer interval for periodic (TCP) events, which I
  accidentally changed to 100ms in an earlier commit

- attempt using SO_ZEROCOPY for UDP -- if it's not available,
  sendmmsg() will succeed anyway

- fix the handling of the status code from sendmmsg(): if it fails,
  we'll try to discard the first message, hence return 1 from the
  UDP handler

Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
This commit is contained in:
Stefano Brivio 2021-04-23 22:22:37 +02:00
parent 962bc97cf1
commit 38b50dba47
10 changed files with 139 additions and 49 deletions

3
icmp.c
View file

@ -135,6 +135,9 @@ int icmp_tap_handler(struct ctx *c, int af, void *addr,
*/ */
int icmp_sock_init(struct ctx *c) int icmp_sock_init(struct ctx *c)
{ {
c->icmp.fd_min = INT_MAX;
c->icmp.fd_max = 0;
if (c->v4 && (c->icmp.s4 = sock_l4_add(c, 4, IPPROTO_ICMP, 0)) < 0) if (c->v4 && (c->icmp.s4 = sock_l4_add(c, 4, IPPROTO_ICMP, 0)) < 0)
return -1; return -1;

4
icmp.h
View file

@ -12,10 +12,14 @@ int icmp_sock_init(struct ctx *c);
* struct icmp_ctx - Execution context for ICMP routines * struct icmp_ctx - Execution context for ICMP routines
* @s4: ICMP socket number * @s4: ICMP socket number
* @s6: ICMPv6 socket number * @s6: ICMPv6 socket number
* @fd_min: Lowest file descriptor number for ICMP/ICMPv6 ever used
* @fd_max: Highest file descriptor number for ICMP/ICMPv6 ever used
*/ */
struct icmp_ctx { struct icmp_ctx {
int s4; int s4;
int s6; int s6;
int fd_min;
int fd_max;
}; };
#endif /* ICMP_H */ #endif /* ICMP_H */

75
passt.c
View file

@ -57,9 +57,11 @@
#define EPOLL_EVENTS 10 #define EPOLL_EVENTS 10
#define TAP_NMSG 32 /* maximum messages to buffer from tap */ #define TAP_BUF_BYTES (ETH_MAX_MTU * 8)
#define TAP_BUF_FILL (TAP_BUF_BYTES - ETH_MAX_MTU - sizeof(uint32_t))
#define TAP_MSGS (TAP_BUF_BYTES / sizeof(struct ethhdr) + 1)
#define TIMER_INTERVAL 100 /* ms, for protocol periodic handlers */ #define TIMER_INTERVAL 20 /* ms, for protocol periodic handlers */
/** /**
* sock_unix() - Create and bind AF_UNIX socket, add to epoll list * sock_unix() - Create and bind AF_UNIX socket, add to epoll list
@ -515,7 +517,7 @@ static int tap6_handler(struct ctx *c, struct tap_msg *msg, size_t count)
return 1; return 1;
} }
static char tap_buf[ETH_MAX_MTU * TAP_NMSG]; static char tap_buf[TAP_BUF_BYTES];
/** /**
* tap_handler() - Packet handler for tap file descriptor * tap_handler() - Packet handler for tap file descriptor
@ -525,32 +527,30 @@ static char tap_buf[ETH_MAX_MTU * TAP_NMSG];
*/ */
static int tap_handler(struct ctx *c) static int tap_handler(struct ctx *c)
{ {
int msg_count = 0, same, rcv = 0, i = 0; struct tap_msg msg[TAP_MSGS];
struct tap_msg msg[UIO_MAXIOV]; int msg_count, same, i;
ssize_t n, rem, fill;
struct ethhdr *eh; struct ethhdr *eh;
char *p = tap_buf; char *p = tap_buf;
ssize_t n, rem;
fill = ETH_MAX_MTU * (TAP_NMSG - 1); while ((n = recv(c->fd_unix, p, TAP_BUF_FILL, MSG_DONTWAIT)) > 0) {
msg_count = 0;
while ((n = recv(c->fd_unix, p, fill, MSG_DONTWAIT)) > 0) { while (n > (ssize_t)sizeof(uint32_t)) {
fill -= n;
while (n > 0) {
ssize_t len = ntohl(*(uint32_t *)p); ssize_t len = ntohl(*(uint32_t *)p);
p += sizeof(uint32_t); p += sizeof(uint32_t);
n -= sizeof(uint32_t); n -= sizeof(uint32_t);
if (len < (ssize_t)sizeof(*eh)) if (len < (ssize_t)sizeof(*eh))
break; return 0;
/* At most one packet might not fit in a single read */ /* At most one packet might not fit in a single read */
if (len > n) { if (len > n) {
rem = recv(c->fd_unix, p + n, fill, rem = recv(c->fd_unix, p + n, len - n,
MSG_DONTWAIT); MSG_DONTWAIT);
rcv = errno; if ((n += rem) != len)
if (rem <= 0 || rem + n != len) return 0;
break;
} }
msg[msg_count].start = p; msg[msg_count].start = p;
@ -559,10 +559,8 @@ static int tap_handler(struct ctx *c)
n -= len; n -= len;
p += len; p += len;
} }
}
rcv = errno;
i = 0;
while (i < msg_count) { while (i < msg_count) {
eh = (struct ethhdr *)msg[i].start; eh = (struct ethhdr *)msg[i].start;
switch (ntohs(eh->h_proto)) { switch (ntohs(eh->h_proto)) {
@ -571,19 +569,27 @@ static int tap_handler(struct ctx *c)
i++; i++;
break; break;
case ETH_P_IP: case ETH_P_IP:
for (same = 1; i + same < msg_count; same++) { for (same = 1; i + same < msg_count &&
eh = (struct ethhdr *)msg[i + same].start; same < UIO_MAXIOV; same++) {
struct tap_msg *next = &msg[i + same];
eh = (struct ethhdr *)next->start;
if (ntohs(eh->h_proto) != ETH_P_IP) if (ntohs(eh->h_proto) != ETH_P_IP)
break; break;
} }
i += tap4_handler(c, msg + i, same); i += tap4_handler(c, msg + i, same);
break; break;
case ETH_P_IPV6: case ETH_P_IPV6:
for (same = 1; i + same < msg_count; same++) { for (same = 1; i + same < msg_count &&
eh = (struct ethhdr *)msg[i + same].start; same < UIO_MAXIOV; same++) {
struct tap_msg *next = &msg[i + same];
eh = (struct ethhdr *)next->start;
if (ntohs(eh->h_proto) != ETH_P_IPV6) if (ntohs(eh->h_proto) != ETH_P_IPV6)
break; break;
} }
i += tap6_handler(c, msg + i, same); i += tap6_handler(c, msg + i, same);
break; break;
default: default:
@ -592,7 +598,10 @@ static int tap_handler(struct ctx *c)
} }
} }
if (n >= 0 || rcv == EINTR || rcv == EAGAIN || rcv == EWOULDBLOCK) p = tap_buf;
}
if (n >= 0 || errno == EINTR || errno == EAGAIN || errno == EWOULDBLOCK)
return 0; return 0;
epoll_ctl(c->epollfd, EPOLL_CTL_DEL, c->fd_unix, NULL); epoll_ctl(c->epollfd, EPOLL_CTL_DEL, c->fd_unix, NULL);
@ -614,8 +623,21 @@ static void sock_handler(struct ctx *c, int fd, uint32_t events)
sl = sizeof(so); sl = sizeof(so);
if (getsockopt(fd, SOL_SOCKET, SO_PROTOCOL, &so, &sl)) #define IN(x, proto) (x >= c->proto.fd_min && x <= c->proto.fd_max)
if (IN(fd, udp) && !IN(fd, icmp) && !IN(fd, tcp))
so = IPPROTO_UDP;
else if (IN(fd, tcp) && !IN(fd, icmp) && !IN(fd, udp))
so = IPPROTO_TCP;
else if (IN(fd, icmp) && !IN(fd, udp) && !IN(fd, tcp))
so = IPPROTO_ICMP; /* Fits ICMPv6 below, too */
else if (getsockopt(fd, SOL_SOCKET, SO_PROTOCOL, &so, &sl)) {
epoll_ctl(c->epollfd, EPOLL_CTL_DEL, fd, NULL);
close(fd);
return; return;
}
#undef IN
debug("%s: packet from socket %i", getprotobynumber(so)->p_name, fd); debug("%s: packet from socket %i", getprotobynumber(so)->p_name, fd);
@ -771,7 +793,10 @@ loop:
for (i = 0; i < nfds; i++) { for (i = 0; i < nfds; i++) {
if (events[i].data.fd == c.fd_unix) { if (events[i].data.fd == c.fd_unix) {
if (tap_handler(&c)) if (events[i].events & EPOLLRDHUP ||
events[i].events & EPOLLHUP ||
events[i].events & EPOLLERR ||
tap_handler(&c))
goto listen; goto listen;
} else { } else {
sock_handler(&c, events[i].data.fd, events[i].events); sock_handler(&c, events[i].data.fd, events[i].events);

View file

@ -16,6 +16,7 @@ struct tap_msg {
#include "icmp.h" #include "icmp.h"
#include "tcp.h" #include "tcp.h"
#include "udp.h"
/** /**
* struct ctx - Execution context * struct ctx - Execution context
@ -56,4 +57,5 @@ struct ctx {
struct icmp_ctx icmp; struct icmp_ctx icmp;
struct tcp_ctx tcp; struct tcp_ctx tcp;
struct tcp_ctx udp;
}; };

4
tap.c
View file

@ -37,9 +37,9 @@
int tap_send(int fd, void *data, size_t len, int flags) int tap_send(int fd, void *data, size_t len, int flags)
{ {
uint32_t vnet_len = htonl(len); uint32_t vnet_len = htonl(len);
send(fd, &vnet_len, 4, 0); send(fd, &vnet_len, 4, MSG_DONTWAIT | MSG_NOSIGNAL);
return send(fd, data, len, flags); return send(fd, data, len, flags | MSG_DONTWAIT | MSG_NOSIGNAL);
} }
/** /**

8
tcp.c
View file

@ -1003,6 +1003,11 @@ static void tcp_conn_from_sock(struct ctx *c, int fd)
if (s == -1) if (s == -1)
return; return;
if (s < c->tcp.fd_min)
c->tcp.fd_min = s;
if (s > c->tcp.fd_max)
c->tcp.fd_max = s;
if (sa_l.ss_family == AF_INET) { if (sa_l.ss_family == AF_INET) {
struct sockaddr_in *sa4 = (struct sockaddr_in *)&sa_r; struct sockaddr_in *sa4 = (struct sockaddr_in *)&sa_r;
@ -1445,6 +1450,9 @@ int tcp_sock_init(struct ctx *c)
{ {
in_port_t port; in_port_t port;
c->tcp.fd_min = INT_MAX;
c->tcp.fd_max = 0;
for (port = 0; port < (1 << 15) + (1 << 14); port++) { for (port = 0; port < (1 << 15) + (1 << 14); port++) {
if (c->v4 && sock_l4_add(c, 4, IPPROTO_TCP, port) < 0) if (c->v4 && sock_l4_add(c, 4, IPPROTO_TCP, port) < 0)
return -1; return -1;

4
tcp.h
View file

@ -12,9 +12,13 @@ void tcp_timer(struct ctx *c, struct timespec *ts);
/** /**
* struct tcp_ctx - Execution context for TCP routines * struct tcp_ctx - Execution context for TCP routines
* @hash_secret: 128-bit secret for hash functions, ISN and hash table * @hash_secret: 128-bit secret for hash functions, ISN and hash table
* @fd_min: Lowest file descriptor number for TCP ever used
* @fd_max: Highest file descriptor number for TCP ever used
*/ */
struct tcp_ctx { struct tcp_ctx {
uint64_t hash_secret[2]; uint64_t hash_secret[2];
int fd_min;
int fd_max;
}; };
#endif /* TCP_H */ #endif /* TCP_H */

20
udp.c
View file

@ -68,7 +68,8 @@ void udp_sock_handler(struct ctx *c, int s, uint32_t events)
struct udphdr *uh; struct udphdr *uh;
ssize_t n; ssize_t n;
(void)events; if (events == EPOLLERR)
return;
n = recvfrom(s, buf + sizeof(*uh), sizeof(buf) - sizeof(*uh), n = recvfrom(s, buf + sizeof(*uh), sizeof(buf) - sizeof(*uh),
MSG_DONTWAIT, (struct sockaddr *)&sr, &slen); MSG_DONTWAIT, (struct sockaddr *)&sr, &slen);
@ -179,7 +180,11 @@ int udp_tap_handler(struct ctx *c, int af, void *addr,
return count; return count;
} }
return sendmmsg(s, mm, count, MSG_DONTWAIT | MSG_NOSIGNAL); count = sendmmsg(s, mm, count, MSG_DONTWAIT | MSG_NOSIGNAL | MSG_ZEROCOPY);
if (count < 0)
return 1;
return count;
} }
/** /**
@ -191,13 +196,19 @@ int udp_tap_handler(struct ctx *c, int af, void *addr,
int udp_sock_init(struct ctx *c) int udp_sock_init(struct ctx *c)
{ {
in_port_t port; in_port_t port;
int s; int s, one = 1;
c->udp.fd_min = INT_MAX;
c->udp.fd_max = 0;
for (port = 0; port < USHRT_MAX; port++) { for (port = 0; port < USHRT_MAX; port++) {
if (c->v4) { if (c->v4) {
if ((s = sock_l4_add(c, 4, IPPROTO_UDP, port)) < 0) if ((s = sock_l4_add(c, 4, IPPROTO_UDP, port)) < 0)
return -1; return -1;
setsockopt(s, SOL_SOCKET, SO_ZEROCOPY,
&one, sizeof(one));
udp4_sock_port[port] = s; udp4_sock_port[port] = s;
} }
@ -205,6 +216,9 @@ int udp_sock_init(struct ctx *c)
if ((s = sock_l4_add(c, 6, IPPROTO_UDP, port)) < 0) if ((s = sock_l4_add(c, 6, IPPROTO_UDP, port)) < 0)
return -1; return -1;
setsockopt(s, SOL_SOCKET, SO_ZEROCOPY,
&one, sizeof(one));
udp6_sock_port[port] = s; udp6_sock_port[port] = s;
} }
} }

15
udp.h
View file

@ -1,4 +1,19 @@
#ifndef UDP_H
#define UDP_H
void udp_sock_handler(struct ctx *c, int s, uint32_t events); void udp_sock_handler(struct ctx *c, int s, uint32_t events);
int udp_tap_handler(struct ctx *c, int af, void *addr, int udp_tap_handler(struct ctx *c, int af, void *addr,
struct tap_msg *msg, int count); struct tap_msg *msg, int count);
int udp_sock_init(struct ctx *c); int udp_sock_init(struct ctx *c);
/**
* struct udp_ctx - Execution context for UDP routines
* @fd_min: Lowest file descriptor number for UDP ever used
* @fd_max: Highest file descriptor number for UDP ever used
*
* sock_handler() checks a socket's descriptor against this range (and the
* ICMP and TCP ones) to pick the protocol, and only queries SO_PROTOCOL
* when the descriptor does not fall in exactly one protocol's range.
*/
struct udp_ctx {
int fd_min;
int fd_max;
};
#endif /* UDP_H */

15
util.c
View file

@ -189,6 +189,21 @@ int sock_l4_add(struct ctx *c, int v, uint16_t proto, uint16_t port)
return -1; return -1;
} }
#define CHECK_SET_MIN_MAX(ipproto, proto_ctx, fd) \
if (proto == (ipproto)) { \
if (fd < c->proto_ctx.fd_min) \
c->proto_ctx.fd_min = (fd); \
if (fd > c->proto_ctx.fd_max) \
c->proto_ctx.fd_max = (fd); \
}
CHECK_SET_MIN_MAX(IPPROTO_ICMP, icmp, fd);
CHECK_SET_MIN_MAX(IPPROTO_ICMPV6, icmp, fd);
CHECK_SET_MIN_MAX(IPPROTO_TCP, tcp, fd);
CHECK_SET_MIN_MAX(IPPROTO_UDP, udp, fd);
#undef CHECK_SET_MIN_MAX
if (proto == IPPROTO_ICMP || proto == IPPROTO_ICMPV6) if (proto == IPPROTO_ICMP || proto == IPPROTO_ICMPV6)
goto epoll_add; goto epoll_add;