udp, passt: Introduce socket packet buffer, avoid getsockname() for UDP

This is in preparation for scatter-gather I/O on the UDP receive path.
Save a getsockname() syscall by setting a flag when all bound sockets
are numbered in a strict sequence (expected, in practice), so that the
address family and port can be derived from the file descriptor number
alone. Also repurpose the tap buffer to serve as a socket receive
buffer as well, passing it down to protocol handlers.

Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
This commit is contained in:
Stefano Brivio 2021-04-30 14:52:18 +02:00
parent 605af213c5
commit e07f539ae0
10 changed files with 103 additions and 34 deletions

4
icmp.c
View file

@ -38,9 +38,10 @@
* @c: Execution context * @c: Execution context
* @s: File descriptor number for socket * @s: File descriptor number for socket
* @events: epoll events bitmap * @events: epoll events bitmap
* @pkt_buf: Buffer to receive packets, currently unused
* @now: Current timestamp, unused * @now: Current timestamp, unused
*/ */
void icmp_sock_handler(struct ctx *c, int s, uint32_t events, void icmp_sock_handler(struct ctx *c, int s, uint32_t events, char *pkt_buf,
struct timespec *now) struct timespec *now)
{ {
struct in6_addr a6 = { .s6_addr = { 0, 0, 0, 0, struct in6_addr a6 = { .s6_addr = { 0, 0, 0, 0,
@ -53,6 +54,7 @@ void icmp_sock_handler(struct ctx *c, int s, uint32_t events,
ssize_t n; ssize_t n;
(void)events; (void)events;
(void)pkt_buf;
(void)now; (void)now;
n = recvfrom(s, buf, sizeof(buf), MSG_DONTWAIT, n = recvfrom(s, buf, sizeof(buf), MSG_DONTWAIT,

2
icmp.h
View file

@ -3,7 +3,7 @@
struct ctx; struct ctx;
void icmp_sock_handler(struct ctx *c, int s, uint32_t events, void icmp_sock_handler(struct ctx *c, int s, uint32_t events, char *pkt_buf,
struct timespec *now); struct timespec *now);
int icmp_tap_handler(struct ctx *c, int af, void *addr, int icmp_tap_handler(struct ctx *c, int af, void *addr,
struct tap_msg *msg, int count, struct timespec *now); struct tap_msg *msg, int count, struct timespec *now);

15
passt.c
View file

@ -60,6 +60,9 @@
#define TAP_BUF_FILL (TAP_BUF_BYTES - ETH_MAX_MTU - sizeof(uint32_t)) #define TAP_BUF_FILL (TAP_BUF_BYTES - ETH_MAX_MTU - sizeof(uint32_t))
#define TAP_MSGS (TAP_BUF_BYTES / sizeof(struct ethhdr) + 1) #define TAP_MSGS (TAP_BUF_BYTES / sizeof(struct ethhdr) + 1)
#define PKT_BUF_BYTES MAX(TAP_BUF_BYTES, SOCK_BUF_BYTES)
static char pkt_buf [PKT_BUF_BYTES];
#define TIMER_INTERVAL MIN(TCP_TIMER_INTERVAL, UDP_TIMER_INTERVAL) #define TIMER_INTERVAL MIN(TCP_TIMER_INTERVAL, UDP_TIMER_INTERVAL)
/** /**
@ -530,8 +533,6 @@ static int tap6_handler(struct ctx *c, struct tap_msg *msg, size_t count,
return 1; return 1;
} }
static char tap_buf[TAP_BUF_BYTES];
/** /**
* tap_handler() - Packet handler for tap file descriptor * tap_handler() - Packet handler for tap file descriptor
* @c: Execution context * @c: Execution context
@ -544,7 +545,7 @@ static int tap_handler(struct ctx *c, struct timespec *now)
struct tap_msg msg[TAP_MSGS]; struct tap_msg msg[TAP_MSGS];
int msg_count, same, i; int msg_count, same, i;
struct ethhdr *eh; struct ethhdr *eh;
char *p = tap_buf; char *p = pkt_buf;
ssize_t n, rem; ssize_t n, rem;
while ((n = recv(c->fd_unix, p, TAP_BUF_FILL, MSG_DONTWAIT)) > 0) { while ((n = recv(c->fd_unix, p, TAP_BUF_FILL, MSG_DONTWAIT)) > 0) {
@ -615,7 +616,7 @@ static int tap_handler(struct ctx *c, struct timespec *now)
} }
} }
p = tap_buf; p = pkt_buf;
} }
if (n >= 0 || errno == EINTR || errno == EAGAIN || errno == EWOULDBLOCK) if (n >= 0 || errno == EINTR || errno == EAGAIN || errno == EWOULDBLOCK)
@ -660,11 +661,11 @@ static void sock_handler(struct ctx *c, int s, uint32_t events,
debug("%s: packet from socket %i", getprotobynumber(proto)->p_name, s); debug("%s: packet from socket %i", getprotobynumber(proto)->p_name, s);
if (proto == IPPROTO_ICMP || proto == IPPROTO_ICMPV6) if (proto == IPPROTO_ICMP || proto == IPPROTO_ICMPV6)
icmp_sock_handler(c, s, events, now); icmp_sock_handler(c, s, events, pkt_buf, now);
else if (proto == IPPROTO_TCP) else if (proto == IPPROTO_TCP)
tcp_sock_handler(c, s, events, now); tcp_sock_handler( c, s, events, pkt_buf, now);
else if (proto == IPPROTO_UDP) else if (proto == IPPROTO_UDP)
udp_sock_handler(c, s, events, now); udp_sock_handler( c, s, events, pkt_buf, now);
} }
/** /**

View file

@ -14,6 +14,8 @@ struct tap_msg {
size_t l4_len; size_t l4_len;
}; };
#define SOCK_BUF_BYTES (ETH_MAX_MTU * 4)
#include "icmp.h" #include "icmp.h"
#include "tcp.h" #include "tcp.h"
#include "udp.h" #include "udp.h"
@ -62,5 +64,5 @@ struct ctx {
struct icmp_ctx icmp; struct icmp_ctx icmp;
struct tcp_ctx tcp; struct tcp_ctx tcp;
struct tcp_ctx udp; struct udp_ctx udp;
}; };

5
tcp.c
View file

@ -1402,14 +1402,17 @@ static void tcp_connect_finish(struct ctx *c, int s)
* @c: Execution context * @c: Execution context
* @s: File descriptor number for socket * @s: File descriptor number for socket
* @events: epoll events bitmap * @events: epoll events bitmap
* @pkt_buf: Buffer to receive packets, currently unused
* @now: Current timestamp * @now: Current timestamp
*/ */
void tcp_sock_handler(struct ctx *c, int s, uint32_t events, void tcp_sock_handler(struct ctx *c, int s, uint32_t events, char *pkt_buf,
struct timespec *now) struct timespec *now)
{ {
socklen_t sl; socklen_t sl;
int accept; int accept;
(void)pkt_buf;
if (tc[s].s == LAST_ACK) { if (tc[s].s == LAST_ACK) {
tcp_send_to_tap(c, s, ACK, NULL, 0); tcp_send_to_tap(c, s, ACK, NULL, 0);
tcp_close_and_epoll_del(c, s); tcp_close_and_epoll_del(c, s);

2
tcp.h
View file

@ -5,7 +5,7 @@
struct ctx; struct ctx;
void tcp_sock_handler(struct ctx *c, int s, uint32_t events, void tcp_sock_handler(struct ctx *c, int s, uint32_t events, char *pkt_buf,
struct timespec *now); struct timespec *now);
int tcp_tap_handler(struct ctx *c, int af, void *addr, int tcp_tap_handler(struct ctx *c, int af, void *addr,
struct tap_msg *msg, int count, struct timespec *now); struct tap_msg *msg, int count, struct timespec *now);

76
udp.c
View file

@ -124,25 +124,69 @@ static void udp_sock_handler_local(struct ctx *c, int af, void *sa,
} }
} }
/**
 * udp_sock_name() - Resolve address family and local port of a UDP socket
 * @c:		Execution context
 * @s:		File descriptor number for socket
 * @port:	Local port, network order, set on successful return only
 *
 * If c->udp.fd_in_seq is set, both values are derived from the distance
 * between @s and c->udp.fd_min, without any system call. Otherwise, they
 * are queried with getsockname().
 *
 * Return: address family, AF_INET or AF_INET6, negative error code on failure
 */
static int udp_sock_name(struct ctx *c, int s, in_port_t *port)
{
	struct sockaddr_storage name;
	socklen_t name_len = sizeof(name);
	int offset;

	if (c->udp.fd_in_seq) {
		offset = s - c->udp.fd_min;

		if (c->v4 && c->v6) {
			/* Two sockets per port: even offsets map to IPv4,
			 * odd offsets to IPv6.
			 */
			*port = htons(offset / 2);
			return (offset % 2) ? AF_INET6 : AF_INET;
		}

		/* Single address family enabled: one socket per port */
		*port = htons(offset);
		return c->v4 ? AF_INET : AF_INET6;
	}

	/* Numbering is not sequential, ask the kernel */
	if (getsockname(s, (struct sockaddr *)&name, &name_len))
		return -errno;

	switch (name.ss_family) {
	case AF_INET:
		*port = ((struct sockaddr_in *)&name)->sin_port;
		return AF_INET;
	case AF_INET6:
		*port = ((struct sockaddr_in6 *)&name)->sin6_port;
		return AF_INET6;
	default:
		return -ENOTSUP;
	}
}
/** /**
* udp_sock_handler() - Handle new data from socket * udp_sock_handler() - Handle new data from socket
* @c: Execution context * @c: Execution context
* @s: File descriptor number for socket * @s: File descriptor number for socket
* @events: epoll events bitmap * @events: epoll events bitmap
* @pkt_buf: Buffer to receive packets, currently unused
* @now: Current timestamp * @now: Current timestamp
*/ */
void udp_sock_handler(struct ctx *c, int s, uint32_t events, void udp_sock_handler(struct ctx *c, int s, uint32_t events, char *pkt_buf,
struct timespec *now) struct timespec *now)
{ {
struct in6_addr a6 = { .s6_addr = { 0, 0, 0, 0, struct in6_addr a6 = { .s6_addr = { 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0xff, 0xff, 0, 0, 0xff, 0xff,
0, 0, 0, 0 } }; 0, 0, 0, 0 } };
struct sockaddr_storage sr, sl; struct sockaddr_storage sr;
socklen_t slen = sizeof(sr); socklen_t slen = sizeof(sr);
char buf[USHRT_MAX]; char buf[USHRT_MAX];
struct udphdr *uh; struct udphdr *uh;
ssize_t n; ssize_t n;
int af;
(void)pkt_buf;
if (events == EPOLLERR) if (events == EPOLLERR)
return; return;
@ -153,13 +197,10 @@ void udp_sock_handler(struct ctx *c, int s, uint32_t events,
return; return;
uh = (struct udphdr *)buf; uh = (struct udphdr *)buf;
af = udp_sock_name(c, s, &uh->dest);
if (getsockname(s, (struct sockaddr *)&sl, &slen)) if (af == AF_INET) {
return;
if (sl.ss_family == AF_INET) {
struct sockaddr_in *sr4 = (struct sockaddr_in *)&sr; struct sockaddr_in *sr4 = (struct sockaddr_in *)&sr;
struct sockaddr_in *sl4 = (struct sockaddr_in *)&sl;
if (ntohl(sr4->sin_addr.s_addr) == INADDR_LOOPBACK || if (ntohl(sr4->sin_addr.s_addr) == INADDR_LOOPBACK ||
ntohl(sr4->sin_addr.s_addr) == INADDR_ANY) ntohl(sr4->sin_addr.s_addr) == INADDR_ANY)
@ -167,19 +208,16 @@ void udp_sock_handler(struct ctx *c, int s, uint32_t events,
memcpy(&a6.s6_addr[12], &sr4->sin_addr, sizeof(sr4->sin_addr)); memcpy(&a6.s6_addr[12], &sr4->sin_addr, sizeof(sr4->sin_addr));
uh->source = sr4->sin_port; uh->source = sr4->sin_port;
uh->dest = sl4->sin_port;
uh->len = htons(n + sizeof(*uh)); uh->len = htons(n + sizeof(*uh));
tap_ip_send(c, &a6, IPPROTO_UDP, buf, n + sizeof(*uh)); tap_ip_send(c, &a6, IPPROTO_UDP, buf, n + sizeof(*uh));
} else if (sl.ss_family == AF_INET6) { } else if (af == AF_INET6) {
struct sockaddr_in6 *sr6 = (struct sockaddr_in6 *)&sr; struct sockaddr_in6 *sr6 = (struct sockaddr_in6 *)&sr;
struct sockaddr_in6 *sl6 = (struct sockaddr_in6 *)&sl;
if (IN6_IS_ADDR_LOOPBACK(&sr6->sin6_addr)) if (IN6_IS_ADDR_LOOPBACK(&sr6->sin6_addr))
udp_sock_handler_local(c, AF_INET6, sr6, now); udp_sock_handler_local(c, AF_INET6, sr6, now);
uh->source = sr6->sin6_port; uh->source = sr6->sin6_port;
uh->dest = sl6->sin6_port;
uh->len = htons(n + sizeof(*uh)); uh->len = htons(n + sizeof(*uh));
tap_ip_send(c, &sr6->sin6_addr, IPPROTO_UDP, tap_ip_send(c, &sr6->sin6_addr, IPPROTO_UDP,
@ -363,17 +401,23 @@ int udp_tap_handler(struct ctx *c, int af, void *addr,
*/ */
int udp_sock_init(struct ctx *c) int udp_sock_init(struct ctx *c)
{ {
int s, prev = -1;
in_port_t port; in_port_t port;
int s;
c->udp.fd_min = INT_MAX; c->udp.fd_min = INT_MAX;
c->udp.fd_max = 0; c->udp.fd_max = 0;
c->udp.fd_in_seq = 1;
for (port = 0; port < USHRT_MAX; port++) { for (port = 0; port < USHRT_MAX; port++) {
if (c->v4) { if (c->v4) {
if ((s = sock_l4(c, AF_INET, IPPROTO_UDP, port)) < 0) if ((s = sock_l4(c, AF_INET, IPPROTO_UDP, port)) < 0)
return -1; return -1;
if (c->udp.fd_in_seq && prev != -1 && s != prev + 1)
c->udp.fd_in_seq = 0;
else
prev = s;
up4[port].s = s; up4[port].s = s;
} }
@ -381,6 +425,11 @@ int udp_sock_init(struct ctx *c)
if ((s = sock_l4(c, AF_INET6, IPPROTO_UDP, port)) < 0) if ((s = sock_l4(c, AF_INET6, IPPROTO_UDP, port)) < 0)
return -1; return -1;
if (c->udp.fd_in_seq && prev != -1 && s != prev + 1)
c->udp.fd_in_seq = 0;
else
prev = s;
up6[port].s = s; up6[port].s = s;
} }
} }
@ -424,7 +473,8 @@ static void udp_timer_one(struct ctx *c, int af, in_port_t p,
if (s != -1) { if (s != -1) {
epoll_ctl(c->epollfd, EPOLL_CTL_DEL, s, NULL); epoll_ctl(c->epollfd, EPOLL_CTL_DEL, s, NULL);
close(s); close(s);
sock_l4(c, af, IPPROTO_UDP, p); if (sock_l4(c, af, IPPROTO_UDP, p) != s)
c->udp.fd_in_seq = 0;
} }
} }

4
udp.h
View file

@ -3,7 +3,7 @@
#define UDP_TIMER_INTERVAL 1000 /* ms */ #define UDP_TIMER_INTERVAL 1000 /* ms */
void udp_sock_handler(struct ctx *c, int s, uint32_t events, void udp_sock_handler(struct ctx *c, int s, uint32_t events, char *pkt_buf,
struct timespec *now); struct timespec *now);
int udp_tap_handler(struct ctx *c, int af, void *addr, int udp_tap_handler(struct ctx *c, int af, void *addr,
struct tap_msg *msg, int count, struct timespec *now); struct tap_msg *msg, int count, struct timespec *now);
@ -14,11 +14,13 @@ void udp_timer(struct ctx *c, struct timespec *ts);
* struct udp_ctx - Execution context for UDP * struct udp_ctx - Execution context for UDP
* @fd_min: Lowest file descriptor number for UDP ever used * @fd_min: Lowest file descriptor number for UDP ever used
* @fd_max: Highest file descriptor number for UDP ever used * @fd_max: Highest file descriptor number for UDP ever used
* @fd_in_seq: 1 if all socket numbers are in sequence, 0 otherwise
* @timer_run: Timestamp of most recent timer run * @timer_run: Timestamp of most recent timer run
*/ */
struct udp_ctx { struct udp_ctx {
int fd_min; int fd_min;
int fd_max; int fd_max;
int fd_in_seq;
struct timespec timer_run; struct timespec timer_run;
}; };

20
util.c
View file

@ -189,11 +189,6 @@ int sock_l4(struct ctx *c, int af, uint16_t proto, uint16_t port)
return -1; return -1;
} }
CHECK_SET_MIN_MAX_PROTO_FD(proto, IPPROTO_ICMP, icmp, fd);
CHECK_SET_MIN_MAX_PROTO_FD(proto, IPPROTO_ICMPV6, icmp, fd);
CHECK_SET_MIN_MAX_PROTO_FD(proto, IPPROTO_TCP, tcp, fd);
CHECK_SET_MIN_MAX_PROTO_FD(proto, IPPROTO_UDP, udp, fd);
if (proto == IPPROTO_ICMP || proto == IPPROTO_ICMPV6) if (proto == IPPROTO_ICMP || proto == IPPROTO_ICMPV6)
goto epoll_add; goto epoll_add;
@ -207,17 +202,30 @@ int sock_l4(struct ctx *c, int af, uint16_t proto, uint16_t port)
setsockopt(fd, IPPROTO_IPV6, IPV6_V6ONLY, &one, sizeof(one)); setsockopt(fd, IPPROTO_IPV6, IPV6_V6ONLY, &one, sizeof(one));
} }
CHECK_SET_MIN_MAX_PROTO_FD(proto, IPPROTO_ICMP, icmp, fd);
CHECK_SET_MIN_MAX_PROTO_FD(proto, IPPROTO_ICMPV6, icmp, fd);
CHECK_SET_MIN_MAX_PROTO_FD(proto, IPPROTO_TCP, tcp, fd);
CHECK_SET_MIN_MAX_PROTO_FD(proto, IPPROTO_UDP, udp, fd);
if (proto == IPPROTO_UDP && PORT_IS_EPHEMERAL(port)) if (proto == IPPROTO_UDP && PORT_IS_EPHEMERAL(port))
goto epoll_add; goto epoll_add;
if (bind(fd, sa, sl) < 0) { if (bind(fd, sa, sl) < 0) {
/* We'll fail to bind to low ports if we don't have enough /* We'll fail to bind to low ports if we don't have enough
* capabilities, and we'll fail to bind on already bound ports, * capabilities, and we'll fail to bind on already bound ports,
* this is fine. * this is fine. If this isn't the socket with the lowest number
* for a given protocol, leave it open, to avoid unnecessary
* holes in the numbering.
*/ */
if ((proto == IPPROTO_TCP && fd == c->tcp.fd_min) ||
(proto == IPPROTO_UDP && fd == c->udp.fd_min) ||
((proto == IPPROTO_ICMP || proto == IPPROTO_ICMPV6) &&
fd == c->icmp.fd_min)) {
close(fd); close(fd);
return 0; return 0;
} }
return fd;
}
if (proto == IPPROTO_TCP && listen(fd, 128) < 0) { if (proto == IPPROTO_TCP && listen(fd, 128) < 0) {
perror("TCP socket listen"); perror("TCP socket listen");

1
util.h
View file

@ -23,6 +23,7 @@ void debug(const char *format, ...);
} while (0) } while (0)
#define MIN(x, y) (((x) < (y)) ? (x) : (y)) #define MIN(x, y) (((x) < (y)) ? (x) : (y))
#define MAX(x, y) (((x) > (y)) ? (x) : (y))
#define IN_INTERVAL(a, b, x) ((x) >= (a) && (x) <= (b)) #define IN_INTERVAL(a, b, x) ((x) >= (a) && (x) <= (b))