From 418feb37ece9ad584ec8b167861bb21a2cc3c067 Mon Sep 17 00:00:00 2001 From: Stefano Brivio <sbrivio@redhat.com> Date: Mon, 26 Aug 2024 20:41:31 +0200 Subject: [PATCH 001/382] test: Look for possible sshd-session paths (if it's there at all) in mbuto's profile Some distributions already have OpenSSH 9.8, which introduces split sshd/sshd-session binaries, and there we need to copy the binary from the host, which can be /usr/libexec/openssh/sshd-session (Fedora Rawhide), /usr/lib/ssh/sshd-session (Arch Linux), /usr/lib/openssh/sshd-session (Debian), and possibly other paths. Add at least those three, and, if we don't find sshd-session, assume we don't need it: it could very well be an older version of OpenSSH, as reported by David for Fedora 40, or perhaps another daemon (would Dropbear even work? I'm not sure). Reported-by: David Gibson <david@gibson.dropbear.id.au> Fixes: d6817b3930be ("test/passt.mbuto: Install sshd-session OpenSSH's split process") Signed-off-by: Stefano Brivio <sbrivio@redhat.com> Reviewed-by: David Gibson <david@gibson.dropbear.id.au> Tested-by: David Gibson <david@gibson.dropbear.id.au> --- test/passt.mbuto | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/test/passt.mbuto b/test/passt.mbuto index 61865e8..138d365 100755 --- a/test/passt.mbuto +++ b/test/passt.mbuto @@ -13,8 +13,15 @@ PROGS="${PROGS:-ash,dash,bash ip mount ls insmod mkdir ln cat chmod lsmod modprobe find grep mknod mv rm umount jq iperf3 dhclient hostname sed tr chown sipcalc cut socat dd strace ping tail killall sleep sysctl - nproc tcp_rr tcp_crr udp_rr which tee seq bc sshd ssh-keygen cmp - /usr/lib/openssh/sshd-session}" + nproc tcp_rr tcp_crr udp_rr which tee seq bc sshd ssh-keygen cmp}" + +# OpenSSH 9.8 introduced split binaries, with sshd being the daemon, and +# sshd-session the per-session program. We need the latter as well, and the path +# depends on the distribution. It doesn't exist on older versions. 
+for bin in /usr/lib/openssh/sshd-session /usr/lib/ssh/sshd-session \ + /usr/libexec/openssh/sshd-session; do + command -v "${bin}" >/dev/null && PROGS="${PROGS} ${bin}" +done KMODS="${KMODS:- virtio_net virtio_pci vmw_vsock_virtio_transport}" From 620e19a1b48a80abddc657b4c17f5e4920f300ec Mon Sep 17 00:00:00 2001 From: David Gibson <david@gibson.dropbear.id.au> Date: Tue, 27 Aug 2024 16:04:44 +1000 Subject: [PATCH 002/382] udp: Merge udp[46]_mh_recv arrays We've already gotten rid of most of the IPv4/IPv6 specific data structures in udp.c by merging them with each other. One significant one remains: udp[46]_mh_recv. This was a bit awkward to remove because of a subtle interaction. We initialise the msg_namelen fields to represent the total size we have for a socket address, but when we receive into the arrays those are modified to the actual length of the sockaddr we received. That meant that naively merging the arrays meant that if we received IPv4 datagrams, then IPv6 datagrams, the addresses for the latter would be truncated. In this patch address that by resetting the received msg_namelen as soon as we've found a flow for the datagram. Finding the flow is the only thing that might use the actual sockaddr length, although we in fact don't need it for the time being. This also removes the last use of the 'v6' field from udp_listen_epoll_ref, so remove that as well. 
Signed-off-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- udp.c | 54 +++++++++++++++++------------------------------------- udp.h | 2 -- 2 files changed, 17 insertions(+), 39 deletions(-) diff --git a/udp.c b/udp.c index 8a93aad..01b03df 100644 --- a/udp.c +++ b/udp.c @@ -178,8 +178,7 @@ enum udp_iov_idx { /* IOVs and msghdr arrays for receiving datagrams from sockets */ static struct iovec udp_iov_recv [UDP_MAX_FRAMES]; -static struct mmsghdr udp4_mh_recv [UDP_MAX_FRAMES]; -static struct mmsghdr udp6_mh_recv [UDP_MAX_FRAMES]; +static struct mmsghdr udp_mh_recv [UDP_MAX_FRAMES]; /* IOVs and msghdr arrays for sending "spliced" datagrams to sockets */ static union sockaddr_inany udp_splice_to; @@ -222,6 +221,7 @@ void udp_update_l2_buf(const unsigned char *eth_d, const unsigned char *eth_s) static void udp_iov_init_one(const struct ctx *c, size_t i) { struct udp_payload_t *payload = &udp_payload[i]; + struct msghdr *mh = &udp_mh_recv[i].msg_hdr; struct udp_meta_t *meta = &udp_meta[i]; struct iovec *siov = &udp_iov_recv[i]; struct iovec *tiov = udp_l2_iov[i]; @@ -236,27 +236,10 @@ static void udp_iov_init_one(const struct ctx *c, size_t i) tiov[UDP_IOV_TAP] = tap_hdr_iov(c, &meta->taph); tiov[UDP_IOV_PAYLOAD].iov_base = payload; - /* It's useful to have separate msghdr arrays for receiving. Otherwise, - * an IPv4 recv() will alter msg_namelen, so we'd have to reset it every - * time or risk truncating the address on future IPv6 recv()s. 
- */ - if (c->ifi4) { - struct msghdr *mh = &udp4_mh_recv[i].msg_hdr; - - mh->msg_name = &meta->s_in; - mh->msg_namelen = sizeof(struct sockaddr_in); - mh->msg_iov = siov; - mh->msg_iovlen = 1; - } - - if (c->ifi6) { - struct msghdr *mh = &udp6_mh_recv[i].msg_hdr; - - mh->msg_name = &meta->s_in; - mh->msg_namelen = sizeof(struct sockaddr_in6); - mh->msg_iov = siov; - mh->msg_iovlen = 1; - } + mh->msg_name = &meta->s_in; + mh->msg_namelen = sizeof(meta->s_in); + mh->msg_iov = siov; + mh->msg_iovlen = 1; } /** @@ -506,10 +489,10 @@ static int udp_sock_recv(const struct ctx *c, int s, uint32_t events, void udp_listen_sock_handler(const struct ctx *c, union epoll_ref ref, uint32_t events, const struct timespec *now) { - struct mmsghdr *mmh_recv = ref.udp.v6 ? udp6_mh_recv : udp4_mh_recv; + const socklen_t sasize = sizeof(udp_meta[0].s_in); int n, i; - if ((n = udp_sock_recv(c, ref.fd, events, mmh_recv)) <= 0) + if ((n = udp_sock_recv(c, ref.fd, events, udp_mh_recv)) <= 0) return; /* We divide datagrams into batches based on how we need to send them, @@ -518,6 +501,7 @@ void udp_listen_sock_handler(const struct ctx *c, union epoll_ref ref, * populate it one entry *ahead* of the loop counter. 
*/ udp_meta[0].tosidx = udp_flow_from_sock(c, ref, &udp_meta[0].s_in, now); + udp_mh_recv[0].msg_hdr.msg_namelen = sasize; for (i = 0; i < n; ) { flow_sidx_t batchsidx = udp_meta[i].tosidx; uint8_t batchpif = pif_at_sidx(batchsidx); @@ -525,9 +509,9 @@ void udp_listen_sock_handler(const struct ctx *c, union epoll_ref ref, do { if (pif_is_socket(batchpif)) { - udp_splice_prepare(mmh_recv, i); + udp_splice_prepare(udp_mh_recv, i); } else if (batchpif == PIF_TAP) { - udp_tap_prepare(mmh_recv, i, + udp_tap_prepare(udp_mh_recv, i, flowside_at_sidx(batchsidx)); } @@ -537,6 +521,7 @@ void udp_listen_sock_handler(const struct ctx *c, union epoll_ref ref, udp_meta[i].tosidx = udp_flow_from_sock(c, ref, &udp_meta[i].s_in, now); + udp_mh_recv[i].msg_hdr.msg_namelen = sasize; } while (flow_sidx_eq(udp_meta[i].tosidx, batchsidx)); if (pif_is_socket(batchpif)) { @@ -572,19 +557,16 @@ void udp_listen_sock_handler(const struct ctx *c, union epoll_ref ref, void udp_reply_sock_handler(const struct ctx *c, union epoll_ref ref, uint32_t events, const struct timespec *now) { - const struct flowside *fromside = flowside_at_sidx(ref.flowside); flow_sidx_t tosidx = flow_sidx_opposite(ref.flowside); const struct flowside *toside = flowside_at_sidx(tosidx); struct udp_flow *uflow = udp_at_sidx(ref.flowside); int from_s = uflow->s[ref.flowside.sidei]; - bool v6 = !inany_v4(&fromside->eaddr); - struct mmsghdr *mmh_recv = v6 ? 
udp6_mh_recv : udp4_mh_recv; uint8_t topif = pif_at_sidx(tosidx); int n, i; ASSERT(!c->no_udp && uflow); - if ((n = udp_sock_recv(c, from_s, events, mmh_recv)) <= 0) + if ((n = udp_sock_recv(c, from_s, events, udp_mh_recv)) <= 0) return; flow_trace(uflow, "Received %d datagrams on reply socket", n); @@ -592,9 +574,11 @@ void udp_reply_sock_handler(const struct ctx *c, union epoll_ref ref, for (i = 0; i < n; i++) { if (pif_is_socket(topif)) - udp_splice_prepare(mmh_recv, i); + udp_splice_prepare(udp_mh_recv, i); else if (topif == PIF_TAP) - udp_tap_prepare(mmh_recv, i, toside); + udp_tap_prepare(udp_mh_recv, i, toside); + /* Restore sockaddr length clobbered by recvmsg() */ + udp_mh_recv[i].msg_hdr.msg_namelen = sizeof(udp_meta[i].s_in); } if (pif_is_socket(topif)) { @@ -740,8 +724,6 @@ int udp_sock_init(const struct ctx *c, int ns, sa_family_t af, uref.pif = PIF_HOST; if ((af == AF_INET || af == AF_UNSPEC) && c->ifi4) { - uref.v6 = 0; - if (!ns) { r4 = s = sock_l4(c, AF_INET, EPOLL_TYPE_UDP_LISTEN, addr, ifname, port, uref.u32); @@ -756,8 +738,6 @@ int udp_sock_init(const struct ctx *c, int ns, sa_family_t af, } if ((af == AF_INET6 || af == AF_UNSPEC) && c->ifi6) { - uref.v6 = 1; - if (!ns) { r6 = s = sock_l4(c, AF_INET6, EPOLL_TYPE_UDP_LISTEN, addr, ifname, port, uref.u32); diff --git a/udp.h b/udp.h index fb42e1c..a8e76bf 100644 --- a/udp.h +++ b/udp.h @@ -26,14 +26,12 @@ void udp_update_l2_buf(const unsigned char *eth_d, const unsigned char *eth_s); * union udp_listen_epoll_ref - epoll reference for "listening" UDP sockets * @port: Source port for connected sockets, bound port otherwise * @pif: pif for this socket - * @v6: Set for IPv6 sockets or connections * @u32: Opaque u32 value of reference */ union udp_listen_epoll_ref { struct { in_port_t port; uint8_t pif; - bool v6:1; }; uint32_t u32; }; From c78b194001ec211401144e3e89071bc2f54f121d Mon Sep 17 00:00:00 2001 From: David Gibson <david@gibson.dropbear.id.au> Date: Tue, 27 Aug 2024 16:04:45 +1000 Subject: 
[PATCH 003/382] udp: Remove unnnecessary local from udp_sock_init() The 's' variable is always redundant with either 'r4' or 'r6', so remove it. Signed-off-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- udp.c | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/udp.c b/udp.c index 01b03df..41a6247 100644 --- a/udp.c +++ b/udp.c @@ -714,7 +714,7 @@ int udp_sock_init(const struct ctx *c, int ns, sa_family_t af, const void *addr, const char *ifname, in_port_t port) { union udp_listen_epoll_ref uref = { .port = port }; - int s, r4 = FD_REF_MAX + 1, r6 = FD_REF_MAX + 1; + int r4 = FD_REF_MAX + 1, r6 = FD_REF_MAX + 1; ASSERT(!c->no_udp); @@ -725,29 +725,29 @@ int udp_sock_init(const struct ctx *c, int ns, sa_family_t af, if ((af == AF_INET || af == AF_UNSPEC) && c->ifi4) { if (!ns) { - r4 = s = sock_l4(c, AF_INET, EPOLL_TYPE_UDP_LISTEN, - addr, ifname, port, uref.u32); + r4 = sock_l4(c, AF_INET, EPOLL_TYPE_UDP_LISTEN, + addr, ifname, port, uref.u32); - udp_splice_init[V4][port] = s < 0 ? -1 : s; + udp_splice_init[V4][port] = r4 < 0 ? -1 : r4; } else { - r4 = s = sock_l4(c, AF_INET, EPOLL_TYPE_UDP_LISTEN, - &in4addr_loopback, - ifname, port, uref.u32); - udp_splice_ns[V4][port] = s < 0 ? -1 : s; + r4 = sock_l4(c, AF_INET, EPOLL_TYPE_UDP_LISTEN, + &in4addr_loopback, + ifname, port, uref.u32); + udp_splice_ns[V4][port] = r4 < 0 ? -1 : r4; } } if ((af == AF_INET6 || af == AF_UNSPEC) && c->ifi6) { if (!ns) { - r6 = s = sock_l4(c, AF_INET6, EPOLL_TYPE_UDP_LISTEN, - addr, ifname, port, uref.u32); + r6 = sock_l4(c, AF_INET6, EPOLL_TYPE_UDP_LISTEN, + addr, ifname, port, uref.u32); - udp_splice_init[V6][port] = s < 0 ? -1 : s; + udp_splice_init[V6][port] = r6 < 0 ? -1 : r6; } else { - r6 = s = sock_l4(c, AF_INET6, EPOLL_TYPE_UDP_LISTEN, - &in6addr_loopback, - ifname, port, uref.u32); - udp_splice_ns[V6][port] = s < 0 ? 
-1 : s; + r6 = sock_l4(c, AF_INET6, EPOLL_TYPE_UDP_LISTEN, + &in6addr_loopback, + ifname, port, uref.u32); + udp_splice_ns[V6][port] = r6 < 0 ? -1 : r6; } } From e0be6bc2f4762ba8c090aef0f8b85a47a4243356 Mon Sep 17 00:00:00 2001 From: David Gibson <david@gibson.dropbear.id.au> Date: Tue, 27 Aug 2024 16:04:46 +1000 Subject: [PATCH 004/382] udp: Use dual stack sockets for port forwarding when possible Platforms like Linux allow IPv6 sockets to listen for IPv4 connections as well as native IPv6 connections. By doing this we halve the number of listening sockets we need (assuming passt/pasta is listening on the same ports for IPv4 and IPv6). When forwarding many ports (e.g. -u all) this can significantly reduce the amount of kernel memory that passt consumes. We've used such dual stack sockets for TCP since 8e914238b "tcp: Use dual stack sockets for port forwarding when possible". Add similar support for UDP "listening" sockets. Since UDP sockets don't use as much kernel memory as TCP sockets this isn't as big a saving, but it's still significant. When forwarding all TCP and UDP ports for both IPv4 & IPv6 (-t all -u all), this reduces kernel memory usage from ~522 MiB to ~380MiB (kernel version 6.10.6 on Fedora 40, x86_64). Signed-off-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- udp.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/udp.c b/udp.c index 41a6247..bd9051e 100644 --- a/udp.c +++ b/udp.c @@ -723,6 +723,25 @@ int udp_sock_init(const struct ctx *c, int ns, sa_family_t af, else uref.pif = PIF_HOST; + if (af == AF_UNSPEC && c->ifi4 && c->ifi6) { + int s; + + /* Attempt to get a dual stack socket */ + if (!ns) { + s = sock_l4(c, AF_UNSPEC, EPOLL_TYPE_UDP_LISTEN, + addr, ifname, port, uref.u32); + udp_splice_init[V4][port] = s < 0 ? -1 : s; + udp_splice_init[V6][port] = s < 0 ? 
-1 : s; + } else { + s = sock_l4(c, AF_UNSPEC, EPOLL_TYPE_UDP_LISTEN, + &in4addr_loopback, ifname, port, uref.u32); + udp_splice_ns[V4][port] = s < 0 ? -1 : s; + udp_splice_ns[V6][port] = s < 0 ? -1 : s; + } + if (IN_INTERVAL(0, FD_REF_MAX, s)) + return 0; + } + if ((af == AF_INET || af == AF_UNSPEC) && c->ifi4) { if (!ns) { r4 = sock_l4(c, AF_INET, EPOLL_TYPE_UDP_LISTEN, From 712ca3235329b049bf9a4e481ba38a4c64768e8b Mon Sep 17 00:00:00 2001 From: Stefano Brivio <sbrivio@redhat.com> Date: Tue, 27 Aug 2024 08:23:41 +0200 Subject: [PATCH 005/382] seccomp.sh: Try to account for terminal width while formatting list of system calls Avoid excess lines on wide terminals, but make sure we don't fail if we can't fetch the number of columns for any reason, as it's not a fundamental feature and we don't want to break anything with it. Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- seccomp.sh | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/seccomp.sh b/seccomp.sh index 052e1c8..38aa826 100755 --- a/seccomp.sh +++ b/seccomp.sh @@ -242,7 +242,10 @@ for __p in ${__profiles}; do __calls="$(sed -n 's/[\t ]*\*[\t ]*#syscalls\(:'"${__p}"'\|\)[\t ]\{1,\}\(.*\)/\2/p' ${IN})" __calls="${__calls} ${EXTRA_SYSCALLS:-}" __calls="$(filter ${__calls})" - echo "seccomp profile ${__p} allows: ${__calls}" | tr '\n' ' ' | fmt -t + + cols="$(stty -a | sed -n 's/.*columns \([0-9]*\).*/\1/p' || :)" 2>/dev/null + case $cols in [0-9]*) col_args="-w ${cols}";; *) col_args="";; esac + echo "seccomp profile ${__p} allows: ${__calls}" | tr '\n' ' ' | fmt -t ${col_args} # Pad here to keep gen_profile() "simple" __count=0 From 1daf6f4615226a2cdd9523a80d70736af4a9f3c0 Mon Sep 17 00:00:00 2001 From: David Gibson <david@gibson.dropbear.id.au> Date: Thu, 29 Aug 2024 19:58:45 +1000 Subject: [PATCH 006/382] conf, fwd: Make ephemeral port logic more flexible "Ephemeral" ports are those which the kernel may allocate as local port numbers for outgoing connections or datagrams. 
Because of that, they're generally not good choices for listening servers to bind to. Thefore when using -t all, -u all or exclude-only ranges, we map only non-ephemeral ports. Our logic for this is a bit rigid though: we assume the ephemeral ports are always a fixed range at the top of the port number space. We also assume PORT_EPHEMERAL_MIN is a multiple of 8, or we won't set the forward bitmap correctly. Make the logic in conf.c more flexible, using a helper moved into fwd.[ch], although we don't change which ports we consider ephemeral (yet). The new handling is undoubtedly more computationally expensive, but since it's a once-off operation at start off, I don't think it really matters. Signed-off-by: David Gibson <david@gibson.dropbear.id.au> Reviewed-by: Laurent Vivier <lvivier@redhat.com> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- conf.c | 12 ++++++++---- fwd.c | 17 +++++++++++++++++ fwd.h | 2 ++ util.h | 3 --- 4 files changed, 27 insertions(+), 7 deletions(-) diff --git a/conf.c b/conf.c index e29b6a9..6b3dafd 100644 --- a/conf.c +++ b/conf.c @@ -156,9 +156,12 @@ static void conf_ports(const struct ctx *c, char optname, const char *optarg, die("'all' port forwarding is only allowed for passt"); fwd->mode = FWD_ALL; - memset(fwd->map, 0xff, PORT_EPHEMERAL_MIN / 8); - for (i = 0; i < PORT_EPHEMERAL_MIN; i++) { + for (i = 0; i < NUM_PORTS; i++) { + if (fwd_port_is_ephemeral(i)) + continue; + + bitmap_set(fwd->map, i); if (optname == 't') { ret = tcp_sock_init(c, AF_UNSPEC, NULL, NULL, i); @@ -259,8 +262,9 @@ static void conf_ports(const struct ctx *c, char optname, const char *optarg, } while ((p = next_chunk(p, ','))); if (exclude_only) { - for (i = 0; i < PORT_EPHEMERAL_MIN; i++) { - if (bitmap_isset(exclude, i)) + for (i = 0; i < NUM_PORTS; i++) { + if (fwd_port_is_ephemeral(i) || + bitmap_isset(exclude, i)) continue; bitmap_set(fwd->map, i); diff --git a/fwd.c b/fwd.c index 2a0452f..8fa312a 100644 --- a/fwd.c +++ b/fwd.c @@ -27,6 +27,23 @@ 
#include "lineread.h" #include "flow_table.h" +/* Empheral port range: values from RFC 6335 */ +static const in_port_t fwd_ephemeral_min = (1 << 15) + (1 << 14); +static const in_port_t fwd_ephemeral_max = NUM_PORTS - 1; + +/** + * fwd_port_is_ephemeral() - Is port number ephemeral? + * @port: Port number + * + * Return: true if @port is ephemeral, that is may be allocated by the kernel as + * a local port for outgoing connections or datagrams, but should not be + * used for binding services to. + */ +bool fwd_port_is_ephemeral(in_port_t port) +{ + return (port >= fwd_ephemeral_min) && (port <= fwd_ephemeral_max); +} + /* See enum in kernel's include/net/tcp_states.h */ #define UDP_LISTEN 0x07 #define TCP_LISTEN 0x0a diff --git a/fwd.h b/fwd.h index b4aa8d5..99dd66c 100644 --- a/fwd.h +++ b/fwd.h @@ -12,6 +12,8 @@ struct flowside; /* Number of ports for both TCP and UDP */ #define NUM_PORTS (1U << 16) +bool fwd_port_is_ephemeral(in_port_t port); + enum fwd_ports_mode { FWD_UNSET = 0, FWD_SPEC = 1, diff --git a/util.h b/util.h index 1463c92..c7a59d5 100644 --- a/util.h +++ b/util.h @@ -95,9 +95,6 @@ #define FD_PROTO(x, proto) \ (IN_INTERVAL(c->proto.fd_min, c->proto.fd_max, (x))) -#define PORT_EPHEMERAL_MIN ((1 << 15) + (1 << 14)) /* RFC 6335 */ -#define PORT_IS_EPHEMERAL(port) ((port) >= PORT_EPHEMERAL_MIN) - #define MAC_ZERO ((uint8_t [ETH_ALEN]){ 0 }) #define MAC_IS_ZERO(addr) (!memcmp((addr), MAC_ZERO, ETH_ALEN)) From 4a41dc58d67e910c3a1f505a6a20988c4555e735 Mon Sep 17 00:00:00 2001 From: David Gibson <david@gibson.dropbear.id.au> Date: Thu, 29 Aug 2024 19:58:46 +1000 Subject: [PATCH 007/382] conf, fwd: Don't attempt to forward port 0 When using -t all, -u all or exclude-only ranges, we'll attempt to forward all non-ephemeral port numbers, including port 0. However, this won't work as intended: bind() treats a zero port not as literal port 0, but as "pick a port for me". Because of the special meaning of port 0, we mostly outright exclude it in our handling. 
Do the same for setting up forwards, not attempting to forward for port 0. Signed-off-by: David Gibson <david@gibson.dropbear.id.au> Reviewed-by: Laurent Vivier <lvivier@redhat.com> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- conf.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/conf.c b/conf.c index 6b3dafd..3eb117f 100644 --- a/conf.c +++ b/conf.c @@ -157,7 +157,10 @@ static void conf_ports(const struct ctx *c, char optname, const char *optarg, fwd->mode = FWD_ALL; - for (i = 0; i < NUM_PORTS; i++) { + /* Skip port 0. It has special meaning for many socket APIs, so + * trying to bind it is not really safe. + */ + for (i = 1; i < NUM_PORTS; i++) { if (fwd_port_is_ephemeral(i)) continue; @@ -262,7 +265,10 @@ static void conf_ports(const struct ctx *c, char optname, const char *optarg, } while ((p = next_chunk(p, ','))); if (exclude_only) { - for (i = 0; i < NUM_PORTS; i++) { + /* Skip port 0. It has special meaning for many socket APIs, so + * trying to bind it is not really safe. + */ + for (i = 1; i < NUM_PORTS; i++) { if (fwd_port_is_ephemeral(i) || bitmap_isset(exclude, i)) continue; From eedc81b6ef552736c4d1d7354837e296af081b57 Mon Sep 17 00:00:00 2001 From: David Gibson <david@gibson.dropbear.id.au> Date: Thu, 29 Aug 2024 19:58:47 +1000 Subject: [PATCH 008/382] fwd, conf: Probe host's ephemeral ports When we forward "all" ports (-t all or -u all), or use an exclude-only range, we don't actually forward *all* ports - that wouln't leave local ports to use for outgoing connections. Rather we forward all non-ephemeral ports - those that won't be used for outgoing connections or datagrams. Currently we assume the range of ephemeral ports is that recommended by RFC 6335, 49152-65535. However, that's not the range used by default on Linux, 32768-60999 but configurable with the net.ipv4.ip_local_port_range sysctl. 
We can't really know what range the guest will consider ephemeral, but if it differs too much from the host it's likely to cause problems we can't avoid anyway. So, using the host's ephemeral range is a better guess than using the RFC 6335 range. Therefore, add logic to probe the host's ephemeral range, falling back to the RFC 6335 range if that fails. This has the bonus advantage of reducing the number of ports bound by -t all -u all on most Linux machines thereby reducing kernel memory usage. Specifically this reduces kernel memory usage with -t all -u all from ~380MiB to ~289MiB. Signed-off-by: David Gibson <david@gibson.dropbear.id.au> Reviewed-by: Laurent Vivier <lvivier@redhat.com> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- conf.c | 1 + fwd.c | 61 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++-- fwd.h | 1 + 3 files changed, 61 insertions(+), 2 deletions(-) diff --git a/conf.c b/conf.c index 3eb117f..b275886 100644 --- a/conf.c +++ b/conf.c @@ -1721,6 +1721,7 @@ void conf(struct ctx *c, int argc, char **argv) /* Inbound port options & DNS can be parsed now (after IPv4/IPv6 * settings) */ + fwd_probe_ephemeral(); udp_portmap_clear(); optind = 0; do { diff --git a/fwd.c b/fwd.c index 8fa312a..a505098 100644 --- a/fwd.c +++ b/fwd.c @@ -28,8 +28,65 @@ #include "flow_table.h" /* Empheral port range: values from RFC 6335 */ -static const in_port_t fwd_ephemeral_min = (1 << 15) + (1 << 14); -static const in_port_t fwd_ephemeral_max = NUM_PORTS - 1; +static in_port_t fwd_ephemeral_min = (1 << 15) + (1 << 14); +static in_port_t fwd_ephemeral_max = NUM_PORTS - 1; + +#define PORT_RANGE_SYSCTL "/proc/sys/net/ipv4/ip_local_port_range" + +/** fwd_probe_ephemeral() - Determine what ports this host considers ephemeral + * + * Work out what ports the host thinks are emphemeral and record it for later + * use by fwd_port_is_ephemeral(). If we're unable to probe, assume the range + * recommended by RFC 6335. 
+ */ +void fwd_probe_ephemeral(void) +{ + char *line, *tab, *end; + struct lineread lr; + long min, max; + ssize_t len; + int fd; + + fd = open(PORT_RANGE_SYSCTL, O_RDONLY | O_CLOEXEC); + if (fd < 0) { + warn_perror("Unable to open %s", PORT_RANGE_SYSCTL); + return; + } + + lineread_init(&lr, fd); + len = lineread_get(&lr, &line); + close(fd); + + if (len < 0) + goto parse_err; + + tab = strchr(line, '\t'); + if (!tab) + goto parse_err; + *tab = '\0'; + + errno = 0; + min = strtol(line, &end, 10); + if (*end || errno) + goto parse_err; + + errno = 0; + max = strtol(tab + 1, &end, 10); + if (*end || errno) + goto parse_err; + + if (min < 0 || min >= NUM_PORTS || + max < 0 || max >= NUM_PORTS) + goto parse_err; + + fwd_ephemeral_min = min; + fwd_ephemeral_max = max; + + return; + +parse_err: + warn("Unable to parse %s", PORT_RANGE_SYSCTL); +} /** * fwd_port_is_ephemeral() - Is port number ephemeral? diff --git a/fwd.h b/fwd.h index 99dd66c..3562f3c 100644 --- a/fwd.h +++ b/fwd.h @@ -12,6 +12,7 @@ struct flowside; /* Number of ports for both TCP and UDP */ #define NUM_PORTS (1U << 16) +void fwd_probe_ephemeral(void); bool fwd_port_is_ephemeral(in_port_t port); enum fwd_ports_mode { From 38363964fc96008761195984c989b036227e0e5c Mon Sep 17 00:00:00 2001 From: Michal Privoznik <mprivozn@redhat.com> Date: Thu, 29 Aug 2024 16:16:03 +0200 Subject: [PATCH 009/382] Makefile: Enable _FORTIFY_SOURCE iff needed On some systems source fortification is enabled whenever code optimization is enabled (e.g. with -O2). Since code fortification is explicitly enabled too (with possibly different value than the system wants, there are three levels [1]), distros are required to patch our Makefile, e.g. [2]. Detect whether fortification is not already enabled and enable it explicitly only if really needed. 
1: https://www.gnu.org/software/libc/manual/html_node/Source-Fortification.html 2: https://github.com/gentoo/gentoo/commit/edfeb8763ac56112c59248c62c9cda13e5d01c97 Signed-off-by: Michal Privoznik <mprivozn@redhat.com> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- Makefile | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 01fada4..74a9513 100644 --- a/Makefile +++ b/Makefile @@ -33,9 +33,16 @@ AUDIT_ARCH := $(shell echo $(AUDIT_ARCH) | sed 's/MIPS64EL/MIPSEL64/') AUDIT_ARCH := $(shell echo $(AUDIT_ARCH) | sed 's/HPPA/PARISC/') AUDIT_ARCH := $(shell echo $(AUDIT_ARCH) | sed 's/SH4/SH/') +# On some systems enabling optimization also enables source fortification, +# automagically. Do not override it. +FORTIFY_FLAG := +ifeq ($(shell $(CC) -O2 -dM -E - < /dev/null 2>&1 | grep ' _FORTIFY_SOURCE ' > /dev/null; echo $$?),1) +FORTIFY_FLAG := -D_FORTIFY_SOURCE=2 +endif + FLAGS := -Wall -Wextra -Wno-format-zero-length FLAGS += -pedantic -std=c11 -D_XOPEN_SOURCE=700 -D_GNU_SOURCE -FLAGS += -D_FORTIFY_SOURCE=2 -O2 -pie -fPIE +FLAGS += $(FORTIFY_FLAG) -O2 -pie -fPIE FLAGS += -DPAGE_SIZE=$(shell getconf PAGE_SIZE) FLAGS += -DNETNS_RUN_DIR=\"/run/netns\" FLAGS += -DPASST_AUDIT_ARCH=AUDIT_ARCH_$(AUDIT_ARCH) From 0ea60e5a7741658ad7056a0a6674e00e72d2d288 Mon Sep 17 00:00:00 2001 From: Stefano Brivio <sbrivio@redhat.com> Date: Tue, 3 Sep 2024 23:45:53 +0200 Subject: [PATCH 010/382] log: Don't prefix log file messages with time and severity if they're continuations In fecb1b65b1ac ("log: Don't prefix message with timestamp on --debug if it's a continuation"), I fixed this for --debug on standard error, but not for log files: if messages are continuations, they shouldn't be prefixed by timestamp and severity. 
Otherwise, we'll print stuff like this: 0.0028: ERROR: Receive error on guest connection, reset0.0028: ERROR: : Bad file descriptor Signed-off-by: Stefano Brivio <sbrivio@redhat.com> Reviewed-by: David Gibson <david@gibson.dropbear.id.au> --- log.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/log.c b/log.c index 433b552..a61468e 100644 --- a/log.c +++ b/log.c @@ -224,19 +224,23 @@ static int logfile_rotate(int fd, const struct timespec *now) /** * logfile_write() - Write entry to log file, trigger rotation if full * @newline: Append newline at the end of the message, if missing + * @cont: Continuation of a previous message, on the same line * @pri: Facility and level map, same as priority for vsyslog() * @now: Timestamp * @format: Same as vsyslog() format * @ap: Same as vsyslog() ap */ -static void logfile_write(bool newline, int pri, const struct timespec *now, +static void logfile_write(bool newline, bool cont, int pri, + const struct timespec *now, const char *format, va_list ap) { char buf[BUFSIZ]; - int n; + int n = 0; - n = logtime_fmt(buf, BUFSIZ, now); - n += snprintf(buf + n, BUFSIZ - n, ": %s", logfile_prefix[pri]); + if (!cont) { + n += logtime_fmt(buf, BUFSIZ, now); + n += snprintf(buf + n, BUFSIZ - n, ": %s", logfile_prefix[pri]); + } n += vsnprintf(buf + n, BUFSIZ - n, format, ap); @@ -278,7 +282,7 @@ void vlogmsg(bool newline, bool cont, int pri, const char *format, va_list ap) va_copy(ap2, ap); /* Don't clobber ap, we need it again */ if (log_file != -1) - logfile_write(newline, pri, now, format, ap2); + logfile_write(newline, cont, pri, now, format, ap2); else if (!(log_mask & LOG_MASK(LOG_DEBUG))) passt_vsyslog(newline, pri, format, ap2); From 7ad9f9bd2bbda8d705e0c6faf5acf2792fce063c Mon Sep 17 00:00:00 2001 From: David Gibson <david@gibson.dropbear.id.au> Date: Fri, 6 Sep 2024 15:17:05 +1000 Subject: [PATCH 011/382] flow: Fix incorrect hash probe in flowside_lookup() Our flow hash table uses linear probing in 
which we step backwards through clusters of adjacent hash entries when we have near collisions. Usually that's implemented by flow_hash_probe(). However, due to some details we need a second implementation in flowside_lookup(). An embarrassing oversight in rebasing from earlier versions has mean that version is incorrect, trying to step forward through clusters rather than backward. In situations with the right sorts of has near-collisions this can lead to us not associating an ACK from the tap device with the right flow, leaving it in a not-quite-established state. If the remote peer does a shutdown() at the right time, this can lead to a storm of EPOLLRDHUP events causing high CPU load. Fixes: acca4235c46f ("flow, tcp: Generalise TCP hash table to general flow hash table") Link: https://bugs.passt.top/show_bug.cgi?id=94 Suggested-by: Stefano Brivio <sbrivio@redhat.com> Signed-off-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- flow.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/flow.c b/flow.c index 02631eb..a00e01d 100644 --- a/flow.c +++ b/flow.c @@ -697,7 +697,7 @@ static flow_sidx_t flowside_lookup(const struct ctx *c, uint8_t proto, !(FLOW_PROTO(&flow->f) == proto && flow->f.pif[sidx.sidei] == pif && flowside_eq(&flow->f.side[sidx.sidei], side))) - b = (b + 1) % FLOW_HASH_SIZE; + b = mod_sub(b, 1, FLOW_HASH_SIZE); return flow_hashtab[b]; } From 1166401c2f2b97961bdc285b336eed912b4f8bb1 Mon Sep 17 00:00:00 2001 From: David Gibson <david@gibson.dropbear.id.au> Date: Fri, 6 Sep 2024 15:17:06 +1000 Subject: [PATCH 012/382] udp: Allow UDP flows to be prematurely closed Unlike TCP, UDP has no in-band signalling for the end of a flow. So the only way we remove flows is on a timer if they have no activity for 180s. However, we've started to investigate some error conditions in which we want to prematurely abort / abandon a UDP flow. 
We can call udp_flow_close(), which will make the flow inert (sockets closed, no epoll events, can't be looked up in hash). However it will still wait 3 minutes to clear away the stale entry. Clean this up by adding an explicit 'closed' flag which will cause a flow to be more promptly cleaned up. We also publish udp_flow_close() so it can be called from other places to abort UDP flows(). Signed-off-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- flow.c | 3 ++- udp_flow.c | 18 +++++++++++++++++- udp_flow.h | 4 ++++ 3 files changed, 23 insertions(+), 2 deletions(-) diff --git a/flow.c b/flow.c index a00e01d..f2de041 100644 --- a/flow.c +++ b/flow.c @@ -832,7 +832,8 @@ void flow_defer_handler(const struct ctx *c, const struct timespec *now) closed = icmp_ping_timer(c, &flow->ping, now); break; case FLOW_UDP: - if (timer) + closed = udp_flow_defer(&flow->udp); + if (!closed && timer) closed = udp_flow_timer(c, &flow->udp, now); break; default: diff --git a/udp_flow.c b/udp_flow.c index 1ff59a9..b81be2c 100644 --- a/udp_flow.c +++ b/udp_flow.c @@ -39,8 +39,11 @@ struct udp_flow *udp_at_sidx(flow_sidx_t sidx) * @c: Execution context * @uflow: UDP flow */ -static void udp_flow_close(const struct ctx *c, struct udp_flow *uflow) +void udp_flow_close(const struct ctx *c, struct udp_flow *uflow) { + if (uflow->closed) + return; /* Nothing to do */ + if (uflow->s[INISIDE] >= 0) { /* The listening socket needs to stay in epoll */ close(uflow->s[INISIDE]); @@ -56,6 +59,8 @@ static void udp_flow_close(const struct ctx *c, struct udp_flow *uflow) flow_hash_remove(c, FLOW_SIDX(uflow, INISIDE)); if (!pif_is_socket(uflow->f.pif[TGTSIDE])) flow_hash_remove(c, FLOW_SIDX(uflow, TGTSIDE)); + + uflow->closed = true; } /** @@ -256,6 +261,17 @@ flow_sidx_t udp_flow_from_tap(const struct ctx *c, return udp_flow_new(c, flow, -1, now); } +/** + * udp_flow_defer() - Deferred per-flow handling (clean up aborted flows) + * @uflow: Flow to 
handle + * + * Return: true if the connection is ready to free, false otherwise + */ +bool udp_flow_defer(const struct udp_flow *uflow) +{ + return uflow->closed; +} + /** * udp_flow_timer() - Handler for timed events related to a given flow * @c: Execution context diff --git a/udp_flow.h b/udp_flow.h index 12ddf03..9a1b059 100644 --- a/udp_flow.h +++ b/udp_flow.h @@ -10,6 +10,7 @@ /** * struct udp - Descriptor for a flow of UDP packets * @f: Generic flow information + * @closed: Flow is already closed * @ts: Activity timestamp * @s: Socket fd (or -1) for each side of the flow */ @@ -17,6 +18,7 @@ struct udp_flow { /* Must be first element */ struct flow_common f; + bool closed :1; time_t ts; int s[SIDES]; }; @@ -30,6 +32,8 @@ flow_sidx_t udp_flow_from_tap(const struct ctx *c, const void *saddr, const void *daddr, in_port_t srcport, in_port_t dstport, const struct timespec *now); +void udp_flow_close(const struct ctx *c, struct udp_flow *uflow); +bool udp_flow_defer(const struct udp_flow *uflow); bool udp_flow_timer(const struct ctx *c, struct udp_flow *uflow, const struct timespec *now); From 88bfa3801e187ac33ca9de552612bc30a1708c72 Mon Sep 17 00:00:00 2001 From: David Gibson <david@gibson.dropbear.id.au> Date: Fri, 6 Sep 2024 15:17:07 +1000 Subject: [PATCH 013/382] flow: Helpers to log details of a flow The details of a flow - endpoints, interfaces etc. - can be pretty important for debugging. We log this on flow state transitions, but it can also be useful to log this when we report specific conditions. Add some helper functions and macros to make it easy to do that. 
Signed-off-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- flow.c | 48 +++++++++++++++++++++++++++++++----------------- flow.h | 7 +++++++ 2 files changed, 38 insertions(+), 17 deletions(-) diff --git a/flow.c b/flow.c index f2de041..1ea112b 100644 --- a/flow.c +++ b/flow.c @@ -283,28 +283,23 @@ void flow_log_(const struct flow_common *f, int pri, const char *fmt, ...) "Flow %u (%s): %s", flow_idx(f), type_or_state, msg); } -/** - * flow_set_state() - Change flow's state - * @f: Flow changing state - * @state: New state +/** flow_log_details_() - Log the details of a flow + * @f: flow to log + * @pri: Log priority + * @state: State to log details according to + * + * Logs the details of the flow: endpoints, interfaces, type etc. */ -static void flow_set_state(struct flow_common *f, enum flow_state state) +void flow_log_details_(const struct flow_common *f, int pri, + enum flow_state state) { char estr0[INANY_ADDRSTRLEN], fstr0[INANY_ADDRSTRLEN]; char estr1[INANY_ADDRSTRLEN], fstr1[INANY_ADDRSTRLEN]; const struct flowside *ini = &f->side[INISIDE]; const struct flowside *tgt = &f->side[TGTSIDE]; - uint8_t oldstate = f->state; - ASSERT(state < FLOW_NUM_STATES); - ASSERT(oldstate < FLOW_NUM_STATES); - - f->state = state; - flow_log_(f, LOG_DEBUG, "%s -> %s", flow_state_str[oldstate], - FLOW_STATE(f)); - - if (MAX(state, oldstate) >= FLOW_STATE_TGT) - flow_log_(f, LOG_DEBUG, + if (state >= FLOW_STATE_TGT) + flow_log_(f, pri, "%s [%s]:%hu -> [%s]:%hu => %s [%s]:%hu -> [%s]:%hu", pif_name(f->pif[INISIDE]), inany_ntop(&ini->eaddr, estr0, sizeof(estr0)), @@ -316,8 +311,8 @@ static void flow_set_state(struct flow_common *f, enum flow_state state) tgt->oport, inany_ntop(&tgt->eaddr, estr1, sizeof(estr1)), tgt->eport); - else if (MAX(state, oldstate) >= FLOW_STATE_INI) - flow_log_(f, LOG_DEBUG, "%s [%s]:%hu -> [%s]:%hu => ?", + else if (state >= FLOW_STATE_INI) + flow_log_(f, pri, "%s [%s]:%hu -> [%s]:%hu => ?", 
pif_name(f->pif[INISIDE]), inany_ntop(&ini->eaddr, estr0, sizeof(estr0)), ini->eport, @@ -325,6 +320,25 @@ static void flow_set_state(struct flow_common *f, enum flow_state state) ini->oport); } +/** + * flow_set_state() - Change flow's state + * @f: Flow changing state + * @state: New state + */ +static void flow_set_state(struct flow_common *f, enum flow_state state) +{ + uint8_t oldstate = f->state; + + ASSERT(state < FLOW_NUM_STATES); + ASSERT(oldstate < FLOW_NUM_STATES); + + f->state = state; + flow_log_(f, LOG_DEBUG, "%s -> %s", flow_state_str[oldstate], + FLOW_STATE(f)); + + flow_log_details_(f, LOG_DEBUG, MAX(state, oldstate)); +} + /** * flow_initiate_() - Move flow to INI, setting pif[INISIDE] * @flow: Flow to change state diff --git a/flow.h b/flow.h index d167b65..24ba3ef 100644 --- a/flow.h +++ b/flow.h @@ -264,4 +264,11 @@ void flow_log_(const struct flow_common *f, int pri, const char *fmt, ...) flow_dbg((f), __VA_ARGS__); \ } while (0) +void flow_log_details_(const struct flow_common *f, int pri, + enum flow_state state); +#define flow_log_details(f_, pri) \ + flow_log_details_(&((f_)->f), (pri), (f_)->f.state) +#define flow_dbg_details(f_) flow_log_details((f_), LOG_DEBUG) +#define flow_err_details(f_) flow_log_details((f_), LOG_ERR) + #endif /* FLOW_H */ From bd092ca421be8908aadbeb2ecdfb9fede0f67c07 Mon Sep 17 00:00:00 2001 From: David Gibson <david@gibson.dropbear.id.au> Date: Fri, 6 Sep 2024 15:17:08 +1000 Subject: [PATCH 014/382] udp: Split socket error handling out from udp_sock_recv() Currently udp_sock_recv() both attempts to clear socket errors and read a batch of datagrams for forwarding. That made sense initially, since both listening and reply sockets need to do this. However, we have certain error cases which will add additional complexity to the error processing. Furthermore, if we ever wanted to more thoroughly handle errors received here - e.g. 
by synthesising ICMP messages on the tap device - it will likely require different handling for the listening and reply socket cases. So, split handling of error events into its own udp_sock_errs() function. While we're there, allow it to report "unrecoverable errors". We don't have any of these so far, but some cases we're working on might require it. Signed-off-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- udp.c | 46 ++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 40 insertions(+), 6 deletions(-) diff --git a/udp.c b/udp.c index bd9051e..45142cd 100644 --- a/udp.c +++ b/udp.c @@ -436,6 +436,30 @@ static bool udp_sock_recverr(int s) return true; } +/** + * udp_sock_errs() - Process errors on a socket + * @c: Execution context + * @s: Socket to receive from + * @events: epoll events bitmap + * + * Return: Number of errors handled, or < 0 if we have an unrecoverable error + */ +static int udp_sock_errs(const struct ctx *c, int s, uint32_t events) +{ + unsigned n_err = 0; + + ASSERT(!c->no_udp); + + if (!(events & EPOLLERR)) + return 0; /* Nothing to do */ + + /* Empty the error queue */ + while (udp_sock_recverr(s)) + n_err++; + + return n_err; +} + /** * udp_sock_recv() - Receive datagrams from a socket * @c: Execution context @@ -443,6 +467,8 @@ static bool udp_sock_recverr(int s) * @events: epoll events bitmap * @mmh mmsghdr array to receive into * + * Return: Number of datagrams received + * * #syscalls recvmmsg arm:recvmmsg_time64 i686:recvmmsg_time64 */ static int udp_sock_recv(const struct ctx *c, int s, uint32_t events, @@ -459,12 +485,6 @@ static int udp_sock_recv(const struct ctx *c, int s, uint32_t events, ASSERT(!c->no_udp); - /* Clear any errors first */ - if (events & EPOLLERR) { - while (udp_sock_recverr(s)) - ; - } - if (!(events & EPOLLIN)) return 0; @@ -492,6 +512,13 @@ void udp_listen_sock_handler(const struct ctx *c, union epoll_ref ref, const socklen_t sasize = 
sizeof(udp_meta[0].s_in); int n, i; + if (udp_sock_errs(c, ref.fd, events) < 0) { + err("UDP: Unrecoverable error on listening socket:" + " (%s port %hu)", pif_name(ref.udp.pif), ref.udp.port); + /* FIXME: what now? close/re-open socket? */ + return; + } + if ((n = udp_sock_recv(c, ref.fd, events, udp_mh_recv)) <= 0) return; @@ -566,6 +593,13 @@ void udp_reply_sock_handler(const struct ctx *c, union epoll_ref ref, ASSERT(!c->no_udp && uflow); + if (udp_sock_errs(c, from_s, events) < 0) { + flow_err(uflow, "Unrecoverable error on reply socket"); + flow_err_details(uflow); + udp_flow_close(c, uflow); + return; + } + if ((n = udp_sock_recv(c, from_s, events, udp_mh_recv)) <= 0) return; From bd99f02a64f46cae44ef13c3cb934b8baa9c1a2c Mon Sep 17 00:00:00 2001 From: David Gibson <david@gibson.dropbear.id.au> Date: Fri, 6 Sep 2024 15:17:09 +1000 Subject: [PATCH 015/382] udp: Treat errors getting errors as unrecoverable We can get network errors, usually transient, reported via the socket error queue. However, at least theoretically, we could get errors trying to read the queue itself. Since we have no idea how to clear an error condition in that case, treat it as unrecoverable. 
Signed-off-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- udp.c | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git a/udp.c b/udp.c index 45142cd..85a14de 100644 --- a/udp.c +++ b/udp.c @@ -387,11 +387,12 @@ static void udp_tap_prepare(const struct mmsghdr *mmh, unsigned idx, * udp_sock_recverr() - Receive and clear an error from a socket * @s: Socket to receive from * - * Return: true if errors received and processed, false if no more errors + * Return: 1 if error received and processed, 0 if no more errors in queue, < 0 + * if there was an error reading the queue * * #syscalls recvmsg */ -static bool udp_sock_recverr(int s) +static int udp_sock_recverr(int s) { const struct sock_extended_err *ee; const struct cmsghdr *hdr; @@ -408,14 +409,16 @@ static bool udp_sock_recverr(int s) rc = recvmsg(s, &mh, MSG_ERRQUEUE); if (rc < 0) { - if (errno != EAGAIN && errno != EWOULDBLOCK) - err_perror("Failed to read error queue"); - return false; + if (errno == EAGAIN || errno == EWOULDBLOCK) + return 0; + + err_perror("UDP: Failed to read error queue"); + return -1; } if (!(mh.msg_flags & MSG_ERRQUEUE)) { err("Missing MSG_ERRQUEUE flag reading error queue"); - return false; + return -1; } hdr = CMSG_FIRSTHDR(&mh); @@ -424,7 +427,7 @@ static bool udp_sock_recverr(int s) (hdr->cmsg_level == IPPROTO_IPV6 && hdr->cmsg_type == IPV6_RECVERR))) { err("Unexpected cmsg reading error queue"); - return false; + return -1; } ee = (const struct sock_extended_err *)CMSG_DATA(hdr); @@ -433,7 +436,7 @@ static bool udp_sock_recverr(int s) debug("%s error on UDP socket %i: %s", str_ee_origin(ee), s, strerror(ee->ee_errno)); - return true; + return 1; } /** @@ -447,6 +450,7 @@ static bool udp_sock_recverr(int s) static int udp_sock_errs(const struct ctx *c, int s, uint32_t events) { unsigned n_err = 0; + int rc; ASSERT(!c->no_udp); @@ -454,8 +458,11 @@ static int udp_sock_errs(const struct ctx *c, 
int s, uint32_t events) return 0; /* Nothing to do */ /* Empty the error queue */ - while (udp_sock_recverr(s)) - n_err++; + while ((rc = udp_sock_recverr(s)) > 0) + n_err += rc; + + if (rc < 0) + return -1; /* error reading error, unrecoverable */ return n_err; } From aff5a49b0e75dd08428a88c05d98f39885556c8b Mon Sep 17 00:00:00 2001 From: David Gibson <david@gibson.dropbear.id.au> Date: Fri, 6 Sep 2024 15:17:10 +1000 Subject: [PATCH 016/382] udp: Handle more error conditions in udp_sock_errs() udp_sock_errs() reads out everything in the socket error queue. However we've seen some cases[0] where an EPOLLERR event is active, but there isn't anything in the queue. One possibility is that the error is reported instead by the SO_ERROR sockopt. Check for that case and report it as best we can. If we still get an EPOLLERR without visible error, we have no way to clear the error state, so treat it as an unrecoverable error. [0] https://github.com/containers/podman/issues/23686#issuecomment-2324945010 Link: https://bugs.passt.top/show_bug.cgi?id=95 Signed-off-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- udp.c | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/udp.c b/udp.c index 85a14de..ae91027 100644 --- a/udp.c +++ b/udp.c @@ -450,7 +450,8 @@ static int udp_sock_recverr(int s) static int udp_sock_errs(const struct ctx *c, int s, uint32_t events) { unsigned n_err = 0; - int rc; + socklen_t errlen; + int rc, err; ASSERT(!c->no_udp); @@ -464,6 +465,24 @@ static int udp_sock_errs(const struct ctx *c, int s, uint32_t events) if (rc < 0) return -1; /* error reading error, unrecoverable */ + errlen = sizeof(err); + if (getsockopt(s, SOL_SOCKET, SO_ERROR, &err, &errlen) < 0 || + errlen != sizeof(err)) { + err_perror("Error reading SO_ERROR"); + return -1; /* error reading error, unrecoverable */ + } + + if (err) { + debug("Unqueued error on UDP socket %i: %s", s, strerror(err)); + 
n_err++; + } + + if (!n_err) { + /* EPOLLERR, but no errors to clear !? */ + err("EPOLLERR event without reported errors on socket %i", s); + return -1; /* no way to clear, unrecoverable */ + } + return n_err; } From afedc2412e8576d95ef49e684601bde2f12d7974 Mon Sep 17 00:00:00 2001 From: Stefano Brivio <sbrivio@redhat.com> Date: Fri, 6 Sep 2024 10:33:55 +0200 Subject: [PATCH 017/382] tcp: Use EPOLLET for any state of not established connections Currently, for not established connections, we monitor sockets with edge-triggered events (EPOLLET) if we are in the TAP_SYN_RCVD state (outbound connection being established) but not in the TAP_SYN_ACK_SENT case of it (socket is connected, and we sent SYN,ACK to the container/guest). While debugging https://bugs.passt.top/show_bug.cgi?id=94, I spotted another possibility for a short EPOLLRDHUP storm (10 seconds), which doesn't seem to happen in actual use cases, but I could reproduce it: start a connection from a container, while dropping (using netfilter) ACK segments coming out of the container itself. On the server side, outside the container, accept the connection and shutdown the writing side of it immediately. At this point, we're in the TAP_SYN_ACK_SENT case (not just a mere TAP_SYN_RCVD state), we get EPOLLRDHUP from the socket, but we don't have any reasonable way to handle it other than waiting for the tap side to complete the three-way handshake. So we'll just keep getting this EPOLLRDHUP until the SYN_TIMEOUT kicks in. Always enable EPOLLET when EPOLLRDHUP is the only epoll event we subscribe to: in this case, getting multiple EPOLLRDHUP reports is totally useless. In the only remaining non-established state, SOCK_ACCEPTED, for inbound connections, we're anyway discarding EPOLLRDHUP events until we established the connection, because we don't know what to do with them until we get an answer from the tap side, so it's safe to enable EPOLLET also in that case. 
Link: https://bugs.passt.top/show_bug.cgi?id=94 Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- tcp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tcp.c b/tcp.c index 77c62f0..f9fe1b9 100644 --- a/tcp.c +++ b/tcp.c @@ -440,7 +440,7 @@ static uint32_t tcp_conn_epoll_events(uint8_t events, uint8_t conn_flags) if (events == TAP_SYN_RCVD) return EPOLLOUT | EPOLLET | EPOLLRDHUP; - return EPOLLRDHUP; + return EPOLLET | EPOLLRDHUP; } /** From 748ef4cd6e7d7307b4c91cbe59ad040ef535dbdc Mon Sep 17 00:00:00 2001 From: David Gibson <david@gibson.dropbear.id.au> Date: Thu, 5 Sep 2024 21:22:04 +1000 Subject: [PATCH 018/382] cppcheck: Work around some cppcheck 2.15.0 redundantInitialization warnings cppcheck-2.15.0 has apparently broadened when it throws a warning about redundant initialization to include some cases where we have an initializer for some fields, but then set other fields in the function body. This is arguably a false positive: although we are technically overwriting the zero-initialization the compiler supplies for fields not explicitly initialized, this sort of construct makes sense when there are some fields we know at the top of the function where the initializer is, but others that require more complex calculation. That said, in the two places this shows up, it's pretty easy to work around. The results are arguably slightly clearer than what we had, since they move the parts of the initialization closer together. So do that rather than having ugly suppressions or dealing with the tedious process of reporting a cppcheck false positive. 
Signed-off-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- pasta.c | 3 ++- udp.c | 10 ++++------ 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/pasta.c b/pasta.c index 1900693..307fb4a 100644 --- a/pasta.c +++ b/pasta.c @@ -427,12 +427,12 @@ static int pasta_netns_quit_timer(void) */ void pasta_netns_quit_init(const struct ctx *c) { - union epoll_ref ref = { .type = EPOLL_TYPE_NSQUIT_INOTIFY }; struct epoll_event ev = { .events = EPOLLIN }; int flags = O_NONBLOCK | O_CLOEXEC; struct statfs s = { 0 }; bool try_inotify = true; int fd = -1, dir_fd; + union epoll_ref ref; if (c->mode != MODE_PASTA || c->no_netns_quit || !*c->netns_base) return; @@ -463,6 +463,7 @@ void pasta_netns_quit_init(const struct ctx *c) ref.type = EPOLL_TYPE_NSQUIT_TIMER; } else { close(dir_fd); + ref.type = EPOLL_TYPE_NSQUIT_INOTIFY; } if (fd > FD_REF_MAX) diff --git a/udp.c b/udp.c index ae91027..2ba00c9 100644 --- a/udp.c +++ b/udp.c @@ -773,16 +773,14 @@ int udp_tap_handler(const struct ctx *c, uint8_t pif, int udp_sock_init(const struct ctx *c, int ns, sa_family_t af, const void *addr, const char *ifname, in_port_t port) { - union udp_listen_epoll_ref uref = { .port = port }; + union udp_listen_epoll_ref uref = { + .pif = ns ? PIF_SPLICE : PIF_HOST, + .port = port, + }; int r4 = FD_REF_MAX + 1, r6 = FD_REF_MAX + 1; ASSERT(!c->no_udp); - if (ns) - uref.pif = PIF_SPLICE; - else - uref.pif = PIF_HOST; - if (af == AF_UNSPEC && c->ifi4 && c->ifi6) { int s; From 63513e54f3208566ecb746d204ebeaafdd2c79c1 Mon Sep 17 00:00:00 2001 From: Stefano Brivio <sbrivio@redhat.com> Date: Fri, 6 Sep 2024 12:43:45 +0200 Subject: [PATCH 019/382] util: Fix order of operands and carry of one second in timespec_diff_us() If the nanoseconds of the minuend timestamp are less than the nanoseconds of the subtrahend timestamp, we need to carry one second in the subtraction. 
I subtracted this second from the minuend, but didn't actually carry it in the subtraction of nanoseconds, and logged timestamps would jump back whenever we switched to the first branch of timespec_diff_us() from the second one. Most likely, the reason why I didn't carry the second is that I instinctively thought that swapping the operands would have the same effect. But it doesn't, in general: that only happens with arithmetic in modulo powers of 2. Undo the swap as well. Signed-off-by: Stefano Brivio <sbrivio@redhat.com> Reviewed-by: David Gibson <david@gibson.dropbear.id.au> --- util.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/util.c b/util.c index 6e64279..eede4e5 100644 --- a/util.c +++ b/util.c @@ -249,7 +249,7 @@ void sock_probe_mem(struct ctx *c) int64_t timespec_diff_us(const struct timespec *a, const struct timespec *b) { if (a->tv_nsec < b->tv_nsec) { - return (b->tv_nsec - a->tv_nsec) / 1000 + + return (a->tv_nsec + 1000000000 - b->tv_nsec) / 1000 + (a->tv_sec - b->tv_sec - 1) * 1000000; } From 49fc4e0414610c6eadc6693fee4d5077d2e8097e Mon Sep 17 00:00:00 2001 From: David Gibson <david@gibson.dropbear.id.au> Date: Fri, 6 Sep 2024 21:49:36 +1000 Subject: [PATCH 020/382] tap: Split out handling of EPOLLIN events Currently, tap_handler_pas{st,ta}() check for EPOLLRDHUP, EPOLLHUP and EPOLLERR events, then assume anything left is EPOLLIN. We have some future cases that may want to also handle EPOLLOUT, so in preparation explicitly handle EPOLLIN, moving the logic to a subfunction. 
Signed-off-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- tap.c | 50 ++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 36 insertions(+), 14 deletions(-) diff --git a/tap.c b/tap.c index 852d837..c8abc06 100644 --- a/tap.c +++ b/tap.c @@ -982,24 +982,17 @@ static void tap_sock_reset(struct ctx *c) } /** - * tap_handler_passt() - Packet handler for AF_UNIX file descriptor + * tap_passt_input() - Handler for new data on the socket to qemu * @c: Execution context - * @events: epoll events * @now: Current timestamp */ -void tap_handler_passt(struct ctx *c, uint32_t events, - const struct timespec *now) +static void tap_passt_input(struct ctx *c, const struct timespec *now) { static const char *partial_frame; static ssize_t partial_len = 0; ssize_t n; char *p; - if (events & (EPOLLRDHUP | EPOLLHUP | EPOLLERR)) { - tap_sock_reset(c); - return; - } - tap_flush_pools(); if (partial_len) { @@ -1052,20 +1045,33 @@ void tap_handler_passt(struct ctx *c, uint32_t events, } /** - * tap_handler_pasta() - Packet handler for /dev/net/tun file descriptor + * tap_handler_passt() - Event handler for AF_UNIX file descriptor * @c: Execution context * @events: epoll events * @now: Current timestamp */ -void tap_handler_pasta(struct ctx *c, uint32_t events, +void tap_handler_passt(struct ctx *c, uint32_t events, const struct timespec *now) +{ + if (events & (EPOLLRDHUP | EPOLLHUP | EPOLLERR)) { + tap_sock_reset(c); + return; + } + + if (events & EPOLLIN) + tap_passt_input(c, now); +} + +/** + * tap_pasta_input() - Handler for new data on the socket to hypervisor + * @c: Execution context + * @now: Current timestamp + */ +static void tap_pasta_input(struct ctx *c, const struct timespec *now) { ssize_t n, len; int ret; - if (events & (EPOLLRDHUP | EPOLLHUP | EPOLLERR)) - die("Disconnect event on /dev/net/tun device, exiting"); - redo: n = 0; @@ -1102,6 +1108,22 @@ restart: die("Error on tap device, exiting"); } +/** + * 
tap_handler_pasta() - Packet handler for /dev/net/tun file descriptor + * @c: Execution context + * @events: epoll events + * @now: Current timestamp + */ +void tap_handler_pasta(struct ctx *c, uint32_t events, + const struct timespec *now) +{ + if (events & (EPOLLRDHUP | EPOLLHUP | EPOLLERR)) + die("Disconnect event on /dev/net/tun device, exiting"); + + if (events & EPOLLIN) + tap_pasta_input(c, now); +} + /** * tap_sock_unix_open() - Create and bind AF_UNIX socket * @sock_path: Socket path. If empty, set on return (UNIX_SOCK_PATH as prefix) From 11e29054fe91ceaf59d2a500e09c4da262c7b23e Mon Sep 17 00:00:00 2001 From: David Gibson <david@gibson.dropbear.id.au> Date: Fri, 6 Sep 2024 21:49:37 +1000 Subject: [PATCH 021/382] tap: Improve handling of EINTR in tap_passt_input() When tap_passt_input() gets an error from recv() it (correctly) does not print any error message for EINTR, EAGAIN or EWOULDBLOCK. However in all three cases it returns from the function. That makes sense for EAGAIN and EWOULDBLOCK, since we then want to wait for the next EPOLLIN event before trying again. For EINTR, however, it makes more sense to retry immediately - as it stands we're likely to get a renewed EPOLLIN event immediately in that case, since we're using level triggered signalling. So, handle EINTR separately by immediately retrying until we succeed or get a different type of error. 
Signed-off-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- tap.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/tap.c b/tap.c index c8abc06..8977b3f 100644 --- a/tap.c +++ b/tap.c @@ -1003,10 +1003,13 @@ static void tap_passt_input(struct ctx *c, const struct timespec *now) memmove(pkt_buf, partial_frame, partial_len); } - n = recv(c->fd_tap, pkt_buf + partial_len, TAP_BUF_BYTES - partial_len, - MSG_DONTWAIT); + do { + n = recv(c->fd_tap, pkt_buf + partial_len, + TAP_BUF_BYTES - partial_len, MSG_DONTWAIT); + } while ((n < 0) && errno == EINTR); + if (n < 0) { - if (errno != EINTR && errno != EAGAIN && errno != EWOULDBLOCK) { + if (errno != EAGAIN && errno != EWOULDBLOCK) { err_perror("Receive error on guest connection, reset"); tap_sock_reset(c); } From d2a1dc744b10d3e5253149a2520db9967f9f20d5 Mon Sep 17 00:00:00 2001 From: David Gibson <david@gibson.dropbear.id.au> Date: Fri, 6 Sep 2024 21:49:38 +1000 Subject: [PATCH 022/382] tap: Restructure in tap_pasta_input() tap_pasta_input() has a rather confusing structure, using two gotos. Remove these by restructuring the function to have the main loop condition based on filling our buffer space, with errors or running out of data treated as the exception, rather than the other way around. This allows us to handle the EINTR which triggered the 'restart' goto with a continue. The outer 'redo' was triggered if we completely filled our buffer, to flush it and do another pass. This one is unnecessary since we don't (yet) use EPOLLET on the tap device: if there's still more data we'll get another event and re-enter the loop. 
Along the way handle a couple of extra edge cases: - Check for EWOULDBLOCK as well as EAGAIN for the benefit of any future ports where those might not have the same value - Detect EOF on the tap device and exit in that case Signed-off-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- tap.c | 45 +++++++++++++++++++-------------------------- 1 file changed, 19 insertions(+), 26 deletions(-) diff --git a/tap.c b/tap.c index 8977b3f..145587f 100644 --- a/tap.c +++ b/tap.c @@ -1073,42 +1073,35 @@ void tap_handler_passt(struct ctx *c, uint32_t events, static void tap_pasta_input(struct ctx *c, const struct timespec *now) { ssize_t n, len; - int ret; - -redo: - n = 0; tap_flush_pools(); -restart: - while ((len = read(c->fd_tap, pkt_buf + n, TAP_BUF_BYTES - n)) > 0) { - if (len < (ssize_t)sizeof(struct ethhdr) || - len > (ssize_t)ETH_MAX_MTU) { - n += len; - continue; + for (n = 0; n < (ssize_t)TAP_BUF_BYTES; n += len) { + len = read(c->fd_tap, pkt_buf + n, TAP_BUF_BYTES - n); + + if (len == 0) { + die("EOF on tap device, exiting"); + } else if (len < 0) { + if (errno == EINTR) { + len = 0; + continue; + } + + if (errno == EAGAIN && errno == EWOULDBLOCK) + break; /* all done for now */ + + die("Error on tap device, exiting"); } + /* Ignore frames of bad length */ + if (len < (ssize_t)sizeof(struct ethhdr) || + len > (ssize_t)ETH_MAX_MTU) + continue; tap_add_packet(c, len, pkt_buf + n); - - if ((n += len) == TAP_BUF_BYTES) - break; } - if (len < 0 && errno == EINTR) - goto restart; - - ret = errno; - tap_handler(c, now); - - if (len > 0 || ret == EAGAIN) - return; - - if (n == TAP_BUF_BYTES) - goto redo; - - die("Error on tap device, exiting"); } /** From a33ecafbd921a681ef65b66624625a1beac43c50 Mon Sep 17 00:00:00 2001 From: David Gibson <david@gibson.dropbear.id.au> Date: Fri, 6 Sep 2024 21:49:39 +1000 Subject: [PATCH 023/382] tap: Don't risk truncating frames on full buffer in tap_pasta_input() tap_pasta_input() keeps 
reading frames from the tap device until the buffer is full. However, this has an ugly edge case, when we get close to buffer full, we will provide just the remaining space as a read() buffer. If this is shorter than the next frame to read, the tap device will truncate the frame and discard the remainder. Adjust the code to make sure we always have room for a maximum size frame. Signed-off-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- tap.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tap.c b/tap.c index 145587f..41af6a6 100644 --- a/tap.c +++ b/tap.c @@ -1076,8 +1076,8 @@ static void tap_pasta_input(struct ctx *c, const struct timespec *now) tap_flush_pools(); - for (n = 0; n < (ssize_t)TAP_BUF_BYTES; n += len) { - len = read(c->fd_tap, pkt_buf + n, TAP_BUF_BYTES - n); + for (n = 0; n <= (ssize_t)TAP_BUF_BYTES - ETH_MAX_MTU; n += len) { + len = read(c->fd_tap, pkt_buf + n, ETH_MAX_MTU); if (len == 0) { die("EOF on tap device, exiting"); From 116bc8266d97d3a3679f9f1c5dc306c834562b48 Mon Sep 17 00:00:00 2001 From: Stefano Brivio <sbrivio@redhat.com> Date: Fri, 6 Sep 2024 15:19:20 +0200 Subject: [PATCH 024/382] selinux: Allow read access to /proc/sys/net/ipv4/ip_local_port_range Since commit eedc81b6ef55 ("fwd, conf: Probe host's ephemeral ports"), we might need to read from /proc/sys/net/ipv4/ip_local_port_range in both passt and pasta. While pasta was already allowed to open and write /proc/sys/net entries, read access was missing in SELinux's type enforcement: add that. In passt, instead, this is the first time we need to access an entry there: add everything we need. 
Fixes: eedc81b6ef55 ("fwd, conf: Probe host's ephemeral ports") Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- contrib/selinux/passt.te | 3 +++ contrib/selinux/pasta.te | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/contrib/selinux/passt.te b/contrib/selinux/passt.te index bbb0917..80bf780 100644 --- a/contrib/selinux/passt.te +++ b/contrib/selinux/passt.te @@ -50,6 +50,7 @@ require { type passwd_file_t; class netlink_route_socket { bind create nlmsg_read }; + type sysctl_net_t; class capability { sys_tty_config setuid setgid }; class cap_userns { setpcap sys_admin sys_ptrace }; @@ -104,6 +105,8 @@ allow passt_t net_conf_t:lnk_file read; allow passt_t tmp_t:sock_file { create unlink write }; allow passt_t self:netlink_route_socket { bind create nlmsg_read read write setopt }; kernel_search_network_sysctl(passt_t) +allow passt_t sysctl_net_t:dir search; +allow passt_t sysctl_net_t:file { open read }; corenet_tcp_bind_all_nodes(passt_t) corenet_udp_bind_all_nodes(passt_t) diff --git a/contrib/selinux/pasta.te b/contrib/selinux/pasta.te index 4e36c3f..310383c 100644 --- a/contrib/selinux/pasta.te +++ b/contrib/selinux/pasta.te @@ -196,7 +196,7 @@ allow pasta_t ifconfig_var_run_t:dir { read search watch }; allow pasta_t self:tun_socket create; allow pasta_t tun_tap_device_t:chr_file { ioctl open read write }; allow pasta_t sysctl_net_t:dir search; -allow pasta_t sysctl_net_t:file { open write }; +allow pasta_t sysctl_net_t:file { open read write }; allow pasta_t kernel_t:system module_request; allow pasta_t nsfs_t:file read; From 6b38f0723949f8b4b2787ee55d4330249a1a4a3e Mon Sep 17 00:00:00 2001 From: Stefano Brivio <sbrivio@redhat.com> Date: Fri, 6 Sep 2024 15:24:26 +0200 Subject: [PATCH 025/382] apparmor: Allow read access to /proc/sys/net/ipv4/ip_local_port_range ...for both passt and pasta: use passt's abstraction for this. 
Fixes: eedc81b6ef55 ("fwd, conf: Probe host's ephemeral ports") Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- contrib/apparmor/abstractions/passt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/contrib/apparmor/abstractions/passt b/contrib/apparmor/abstractions/passt index d245115..43fd63f 100644 --- a/contrib/apparmor/abstractions/passt +++ b/contrib/apparmor/abstractions/passt @@ -34,6 +34,8 @@ owner @{PROC}/@{pid}/uid_map r, # conf_ugid() + @{PROC}/sys/net/ipv4/ip_local_port_range r, # fwd_probe_ephemeral() + network netlink raw, # nl_sock_init_do(), netlink.c network inet stream, # tcp.c From 1f414ed8f0b3101363c1373e338802186eb29b7c Mon Sep 17 00:00:00 2001 From: David Gibson <david@gibson.dropbear.id.au> Date: Thu, 12 Sep 2024 16:59:39 +1000 Subject: [PATCH 026/382] tcp: Remove redundant initialisation of iov[TCP_IOV_ETH].iov_base This initialisation for IPv4 flags buffers is redundant with the very next line which sets both iov_base and iov_len. Signed-off-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- tcp_buf.c | 1 - 1 file changed, 1 deletion(-) diff --git a/tcp_buf.c b/tcp_buf.c index c31e9f3..2e044b2 100644 --- a/tcp_buf.c +++ b/tcp_buf.c @@ -168,7 +168,6 @@ void tcp_sock4_iov_init(const struct ctx *c) iov = tcp4_l2_flags_iov[i]; iov[TCP_IOV_TAP] = tap_hdr_iov(c, &tcp4_flags_tap_hdr[i]); - iov[TCP_IOV_ETH].iov_base = &tcp4_eth_src; iov[TCP_IOV_ETH] = IOV_OF_LVALUE(tcp4_eth_src); iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp4_flags_ip[i]); iov[TCP_IOV_PAYLOAD].iov_base = &tcp4_flags[i]; From 5ff5d55291d2223c65f889b8eee446b8ed2c551c Mon Sep 17 00:00:00 2001 From: David Gibson <david@gibson.dropbear.id.au> Date: Thu, 12 Sep 2024 16:59:40 +1000 Subject: [PATCH 027/382] tcp: Avoid overlapping memcpy() in DUP_ACK handling When handling the DUP_ACK flag, we copy all the buffers making up the ack frame. 
However, all our frames share the same buffer for the Ethernet header (tcp4_eth_src or tcp6_eth_src), so copying the TCP_IOV_ETH will result in a (perfectly) overlapping memcpy(). This seems to have been harmless so far, but overlapping ranges to memcpy() is undefined behaviour, so we really should avoid it. Signed-off-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- tcp_buf.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/tcp_buf.c b/tcp_buf.c index 2e044b2..1a39846 100644 --- a/tcp_buf.c +++ b/tcp_buf.c @@ -332,9 +332,13 @@ int tcp_buf_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags) else dup_iov = tcp6_l2_flags_iov[tcp6_flags_used++]; - for (i = 0; i < TCP_NUM_IOVS; i++) - memcpy(dup_iov[i].iov_base, iov[i].iov_base, - iov[i].iov_len); + for (i = 0; i < TCP_NUM_IOVS; i++) { + /* All frames share the same ethernet header buffer */ + if (i != TCP_IOV_ETH) { + memcpy(dup_iov[i].iov_base, iov[i].iov_base, + iov[i].iov_len); + } + } dup_iov[TCP_IOV_PAYLOAD].iov_len = iov[TCP_IOV_PAYLOAD].iov_len; } From 7d8804beb8ecbd07b51dbbeaf14289d37f4f8107 Mon Sep 17 00:00:00 2001 From: David Gibson <david@gibson.dropbear.id.au> Date: Wed, 18 Sep 2024 11:53:04 +1000 Subject: [PATCH 028/382] tcp: Make some extra functions private tcp_send_flag() and tcp_probe_peek_offset_cap() are not used outside tcp.c, and have no prototype in a header. Make them static. 
Signed-off-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- tcp.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tcp.c b/tcp.c index f9fe1b9..14b48a8 100644 --- a/tcp.c +++ b/tcp.c @@ -1235,7 +1235,7 @@ int tcp_prepare_flags(struct ctx *c, struct tcp_tap_conn *conn, * * Return: negative error code on connection reset, 0 otherwise */ -int tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags) +static int tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags) { return tcp_buf_send_flag(c, conn, flags); } @@ -2477,7 +2477,7 @@ static void tcp_sock_refill_init(const struct ctx *c) * * Return: true if supported, false otherwise */ -bool tcp_probe_peek_offset_cap(sa_family_t af) +static bool tcp_probe_peek_offset_cap(sa_family_t af) { bool ret = false; int s, optv = 0; From 4aff6f93923327cb875ceacf12ef0ffc2e613174 Mon Sep 17 00:00:00 2001 From: David Gibson <david@gibson.dropbear.id.au> Date: Wed, 18 Sep 2024 11:53:05 +1000 Subject: [PATCH 029/382] tcp: Clean up tcpi_snd_wnd probing When available, we want to retrieve our socket peer's advertised window and forward that to the guest. That information has been available from the kernel via the TCP_INFO getsockopt() since kernel commit 8f7baad7f035. Currently our probing for this is a bit odd. The HAS_SND_WND define determines if our headers include the tcp_snd_wnd field, but that doesn't necessarily mean the running kernel supports it. Currently we start by assuming it's _not_ available, but mark it as available if we ever see a non-zero value in the field. This is a bit hit and miss in two ways: * Zero is perfectly possible window the peer could report, so we can get false negatives * We're reading TCP_INFO into a local variable, which might not be zero initialised, so if the kernel _doesn't_ write it it could have non-zero garbage, giving us false positives. 
We can use a more direct way of probing for this: getsockopt() reports the length of the information retrieved. So, check whether that's long enough to include the field. This lets us probe the availability of the field once and for all during initialisation. That in turn allows ctx to become a const pointer to tcp_prepare_flags() which cascades through many other functions. We also move the flag for the probe result from the ctx structure to a global, to match peek_offset_cap. Signed-off-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- tcp.c | 93 ++++++++++++++++++++++++++++++++++++-------------- tcp.h | 13 +++---- tcp_buf.c | 10 +++--- tcp_buf.h | 6 ++-- tcp_internal.h | 4 +-- 5 files changed, 82 insertions(+), 44 deletions(-) diff --git a/tcp.c b/tcp.c index 14b48a8..cba3f3b 100644 --- a/tcp.c +++ b/tcp.c @@ -308,11 +308,6 @@ /* MSS rounding: see SET_MSS() */ #define MSS_DEFAULT 536 #define WINDOW_DEFAULT 14600 /* RFC 6928 */ -#ifdef HAS_SND_WND -# define KERNEL_REPORTS_SND_WND(c) ((c)->tcp.kernel_snd_wnd) -#else -# define KERNEL_REPORTS_SND_WND(c) (0 && (c)) -#endif #define ACK_INTERVAL 10 /* ms */ #define SYN_TIMEOUT 10 /* s */ @@ -370,6 +365,14 @@ char tcp_buf_discard [MAX_WINDOW]; /* Does the kernel support TCP_PEEK_OFF?
*/ bool peek_offset_cap; +#ifdef HAS_SND_WND +/* Does the kernel report sending window in TCP_INFO (kernel commit + * 8f7baad7f035) + */ +bool snd_wnd_cap; +#else +#define snd_wnd_cap (false) +#endif /* sendmsg() to socket */ static struct iovec tcp_iov [UIO_MAXIOV]; @@ -1052,7 +1055,7 @@ int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_tap_conn *conn, } #endif /* !HAS_BYTES_ACKED */ - if (!KERNEL_REPORTS_SND_WND(c)) { + if (!snd_wnd_cap) { tcp_get_sndbuf(conn); new_wnd_to_tap = MIN(SNDBUF_GET(conn), MAX_WINDOW); conn->wnd_to_tap = MIN(new_wnd_to_tap >> conn->ws_to_tap, @@ -1136,7 +1139,7 @@ static void tcp_update_seqack_from_tap(const struct ctx *c, * 0 if there is no flag to send * 1 otherwise */ -int tcp_prepare_flags(struct ctx *c, struct tcp_tap_conn *conn, +int tcp_prepare_flags(const struct ctx *c, struct tcp_tap_conn *conn, int flags, struct tcphdr *th, char *data, size_t *optlen) { @@ -1153,11 +1156,6 @@ int tcp_prepare_flags(struct ctx *c, struct tcp_tap_conn *conn, return -ECONNRESET; } -#ifdef HAS_SND_WND - if (!c->tcp.kernel_snd_wnd && tinfo.tcpi_snd_wnd) - c->tcp.kernel_snd_wnd = 1; -#endif - if (!(conn->flags & LOCAL)) tcp_rtt_dst_check(conn, &tinfo); @@ -1235,7 +1233,8 @@ int tcp_prepare_flags(struct ctx *c, struct tcp_tap_conn *conn, * * Return: negative error code on connection reset, 0 otherwise */ -static int tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags) +static int tcp_send_flag(const struct ctx *c, struct tcp_tap_conn *conn, + int flags) { return tcp_buf_send_flag(c, conn, flags); } @@ -1245,7 +1244,7 @@ static int tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags) * @c: Execution context * @conn: Connection pointer */ -void tcp_rst_do(struct ctx *c, struct tcp_tap_conn *conn) +void tcp_rst_do(const struct ctx *c, struct tcp_tap_conn *conn) { if (conn->events == CLOSED) return; @@ -1463,7 +1462,7 @@ static void tcp_bind_outbound(const struct ctx *c, * @optlen: Bytes in options: caller MUST ensure 
available length * @now: Current timestamp */ -static void tcp_conn_from_tap(struct ctx *c, sa_family_t af, +static void tcp_conn_from_tap(const struct ctx *c, sa_family_t af, const void *saddr, const void *daddr, const struct tcphdr *th, const char *opts, size_t optlen, const struct timespec *now) @@ -1628,7 +1627,7 @@ static int tcp_sock_consume(const struct tcp_tap_conn *conn, uint32_t ack_seq) * * #syscalls recvmsg */ -static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn) +static int tcp_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn) { return tcp_buf_data_from_sock(c, conn); } @@ -1644,8 +1643,8 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn) * * Return: count of consumed packets */ -static int tcp_data_from_tap(struct ctx *c, struct tcp_tap_conn *conn, - const struct pool *p, int idx) +static int tcp_data_from_tap(const struct ctx *c, struct tcp_tap_conn *conn, + const struct pool *p, int idx) { int i, iov_i, ack = 0, fin = 0, retr = 0, keep = -1, partial_send = 0; uint16_t max_ack_seq_wnd = conn->wnd_from_tap; @@ -1842,7 +1841,8 @@ out: * @opts: Pointer to start of options * @optlen: Bytes in options: caller MUST ensure available length */ -static void tcp_conn_from_sock_finish(struct ctx *c, struct tcp_tap_conn *conn, +static void tcp_conn_from_sock_finish(const struct ctx *c, + struct tcp_tap_conn *conn, const struct tcphdr *th, const char *opts, size_t optlen) { @@ -1885,7 +1885,7 @@ static void tcp_conn_from_sock_finish(struct ctx *c, struct tcp_tap_conn *conn, * * Return: count of consumed packets */ -int tcp_tap_handler(struct ctx *c, uint8_t pif, sa_family_t af, +int tcp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af, const void *saddr, const void *daddr, const struct pool *p, int idx, const struct timespec *now) { @@ -2023,7 +2023,7 @@ reset: * @c: Execution context * @conn: Connection pointer */ -static void tcp_connect_finish(struct ctx *c, struct tcp_tap_conn *conn) +static 
void tcp_connect_finish(const struct ctx *c, struct tcp_tap_conn *conn) { socklen_t sl; int so; @@ -2049,8 +2049,8 @@ static void tcp_connect_finish(struct ctx *c, struct tcp_tap_conn *conn) * @sa: Peer socket address (from accept()) * @now: Current timestamp */ -static void tcp_tap_conn_from_sock(struct ctx *c, union flow *flow, int s, - const struct timespec *now) +static void tcp_tap_conn_from_sock(const struct ctx *c, union flow *flow, + int s, const struct timespec *now) { struct tcp_tap_conn *conn = FLOW_SET_TYPE(flow, FLOW_TCP, tcp); uint64_t hash; @@ -2081,7 +2081,7 @@ static void tcp_tap_conn_from_sock(struct ctx *c, union flow *flow, int s, * @ref: epoll reference of listening socket * @now: Current timestamp */ -void tcp_listen_handler(struct ctx *c, union epoll_ref ref, +void tcp_listen_handler(const struct ctx *c, union epoll_ref ref, const struct timespec *now) { const struct flowside *ini; @@ -2146,7 +2146,7 @@ cancel: * * #syscalls timerfd_gettime arm:timerfd_gettime64 i686:timerfd_gettime64 */ -void tcp_timer_handler(struct ctx *c, union epoll_ref ref) +void tcp_timer_handler(const struct ctx *c, union epoll_ref ref) { struct itimerspec check_armed = { { 0 }, { 0 } }; struct tcp_tap_conn *conn = &FLOW(ref.flow)->tcp; @@ -2210,7 +2210,8 @@ void tcp_timer_handler(struct ctx *c, union epoll_ref ref) * @ref: epoll reference * @events: epoll events bitmap */ -void tcp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events) +void tcp_sock_handler(const struct ctx *c, union epoll_ref ref, + uint32_t events) { struct tcp_tap_conn *conn = conn_at_sidx(ref.flowside); @@ -2494,6 +2495,40 @@ static bool tcp_probe_peek_offset_cap(sa_family_t af) return ret; } +#ifdef HAS_SND_WND +/** + * tcp_probe_snd_wnd_cap() - Check if TCP_INFO reports tcpi_snd_wnd + * + * Return: true if supported, false otherwise + */ +static bool tcp_probe_snd_wnd_cap(void) +{ + struct tcp_info tinfo; + socklen_t sl = sizeof(tinfo); + int s; + + s = socket(AF_INET, SOCK_STREAM | 
SOCK_CLOEXEC, IPPROTO_TCP); + if (s < 0) { + warn_perror("Temporary TCP socket creation failed"); + return false; + } + + if (getsockopt(s, SOL_TCP, TCP_INFO, &tinfo, &sl)) { + warn_perror("Failed to get TCP_INFO on temporary socket"); + close(s); + return false; + } + + close(s); + + if (sl < (offsetof(struct tcp_info, tcpi_snd_wnd) + + sizeof(tinfo.tcpi_snd_wnd))) + return false; + + return true; +} +#endif /* HAS_SND_WND */ + /** * tcp_init() - Get initial sequence, hash secret, initialise per-socket data * @c: Execution context @@ -2527,6 +2562,12 @@ int tcp_init(struct ctx *c) (!c->ifi6 || tcp_probe_peek_offset_cap(AF_INET6)); debug("SO_PEEK_OFF%ssupported", peek_offset_cap ? " " : " not "); +#ifdef HAS_SND_WND + snd_wnd_cap = tcp_probe_snd_wnd_cap(); +#endif + debug("TCP_INFO tcpi_snd_wnd field%ssupported", + snd_wnd_cap ? " " : " not "); + return 0; } diff --git a/tcp.h b/tcp.h index e9ff019..5585924 100644 --- a/tcp.h +++ b/tcp.h @@ -10,11 +10,12 @@ struct ctx; -void tcp_timer_handler(struct ctx *c, union epoll_ref ref); -void tcp_listen_handler(struct ctx *c, union epoll_ref ref, +void tcp_timer_handler(const struct ctx *c, union epoll_ref ref); +void tcp_listen_handler(const struct ctx *c, union epoll_ref ref, const struct timespec *now); -void tcp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events); -int tcp_tap_handler(struct ctx *c, uint8_t pif, sa_family_t af, +void tcp_sock_handler(const struct ctx *c, union epoll_ref ref, + uint32_t events); +int tcp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af, const void *saddr, const void *daddr, const struct pool *p, int idx, const struct timespec *now); int tcp_sock_init(const struct ctx *c, sa_family_t af, const void *addr, @@ -58,16 +59,12 @@ union tcp_listen_epoll_ref { * @fwd_in: Port forwarding configuration for inbound packets * @fwd_out: Port forwarding configuration for outbound packets * @timer_run: Timestamp of most recent timer run - * @kernel_snd_wnd: Kernel reports 
sending window (with commit 8f7baad7f035) * @pipe_size: Size of pipes for spliced connections */ struct tcp_ctx { struct fwd_ports fwd_in; struct fwd_ports fwd_out; struct timespec timer_run; -#ifdef HAS_SND_WND - int kernel_snd_wnd; -#endif size_t pipe_size; }; diff --git a/tcp_buf.c b/tcp_buf.c index 1a39846..c886c92 100644 --- a/tcp_buf.c +++ b/tcp_buf.c @@ -239,7 +239,7 @@ void tcp_flags_flush(const struct ctx *c) * @frames: Two-dimensional array containing queued frames with sub-iovs * @num_frames: Number of entries in the two arrays to be compared */ -static void tcp_revert_seq(struct ctx *c, struct tcp_tap_conn **conns, +static void tcp_revert_seq(const struct ctx *c, struct tcp_tap_conn **conns, struct iovec (*frames)[TCP_NUM_IOVS], int num_frames) { int i; @@ -264,7 +264,7 @@ static void tcp_revert_seq(struct ctx *c, struct tcp_tap_conn **conns, * tcp_payload_flush() - Send out buffers for segments with data * @c: Execution context */ -void tcp_payload_flush(struct ctx *c) +void tcp_payload_flush(const struct ctx *c) { size_t m; @@ -293,7 +293,7 @@ void tcp_payload_flush(struct ctx *c) * * Return: negative error code on connection reset, 0 otherwise */ -int tcp_buf_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags) +int tcp_buf_send_flag(const struct ctx *c, struct tcp_tap_conn *conn, int flags) { struct tcp_flags_t *payload; struct iovec *iov; @@ -361,7 +361,7 @@ int tcp_buf_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags) * @no_csum: Don't compute IPv4 checksum, use the one from previous buffer * @seq: Sequence number to be sent */ -static void tcp_data_to_tap(struct ctx *c, struct tcp_tap_conn *conn, +static void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn, ssize_t dlen, int no_csum, uint32_t seq) { struct iovec *iov; @@ -405,7 +405,7 @@ static void tcp_data_to_tap(struct ctx *c, struct tcp_tap_conn *conn, * * #syscalls recvmsg */ -int tcp_buf_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn) +int 
tcp_buf_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn) { uint32_t wnd_scaled = conn->wnd_from_tap << conn->ws_from_tap; int fill_bufs, send_bufs = 0, last_len, iov_rem = 0; diff --git a/tcp_buf.h b/tcp_buf.h index 3db4c56..8d4b615 100644 --- a/tcp_buf.h +++ b/tcp_buf.h @@ -9,8 +9,8 @@ void tcp_sock4_iov_init(const struct ctx *c); void tcp_sock6_iov_init(const struct ctx *c); void tcp_flags_flush(const struct ctx *c); -void tcp_payload_flush(struct ctx *c); -int tcp_buf_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn); -int tcp_buf_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags); +void tcp_payload_flush(const struct ctx *c); +int tcp_buf_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn); +int tcp_buf_send_flag(const struct ctx *c, struct tcp_tap_conn *conn, int flags); #endif /*TCP_BUF_H */ diff --git a/tcp_internal.h b/tcp_internal.h index aa8bb64..bd634be 100644 --- a/tcp_internal.h +++ b/tcp_internal.h @@ -82,7 +82,7 @@ void conn_event_do(const struct ctx *c, struct tcp_tap_conn *conn, conn_event_do(c, conn, event); \ } while (0) -void tcp_rst_do(struct ctx *c, struct tcp_tap_conn *conn); +void tcp_rst_do(const struct ctx *c, struct tcp_tap_conn *conn); #define tcp_rst(c, conn) \ do { \ flow_dbg((conn), "TCP reset at %s:%i", __func__, __LINE__); \ @@ -94,7 +94,7 @@ size_t tcp_l2_buf_fill_headers(const struct tcp_tap_conn *conn, const uint16_t *check, uint32_t seq); int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_tap_conn *conn, int force_seq, struct tcp_info *tinfo); -int tcp_prepare_flags(struct ctx *c, struct tcp_tap_conn *conn, int flags, +int tcp_prepare_flags(const struct ctx *c, struct tcp_tap_conn *conn, int flags, struct tcphdr *th, char *data, size_t *optlen); #endif /* TCP_INTERNAL_H */ From 265b2099c7715a3432eef00acd1faea7cbc1eb25 Mon Sep 17 00:00:00 2001 From: David Gibson <david@gibson.dropbear.id.au> Date: Wed, 18 Sep 2024 11:53:06 +1000 Subject: [PATCH 030/382] tcp: Simplify ifdef logic 
in tcp_update_seqack_wnd() This function has a block conditional on !snd_wnd_cap shortly before a block under #ifdef HAS_SND_WND (when HAS_SND_WND is not defined, snd_wnd_cap is statically false). Therefore, simplify this down to a single conditional with an else branch. While we're there, fix some improperly indented closing braces. Signed-off-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- tcp.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/tcp.c b/tcp.c index cba3f3b..92ac164 100644 --- a/tcp.c +++ b/tcp.c @@ -1066,14 +1066,13 @@ int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_tap_conn *conn, if (!tinfo) { if (prev_wnd_to_tap > WINDOW_DEFAULT) { goto out; -} + } tinfo = &tinfo_new; if (getsockopt(s, SOL_TCP, TCP_INFO, tinfo, &sl)) { goto out; -} + } } -#ifdef HAS_SND_WND if ((conn->flags & LOCAL) || tcp_rtt_dst_low(conn)) { new_wnd_to_tap = tinfo->tcpi_snd_wnd; } else { @@ -1081,7 +1080,6 @@ int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_tap_conn *conn, new_wnd_to_tap = MIN((int)tinfo->tcpi_snd_wnd, SNDBUF_GET(conn)); } -#endif new_wnd_to_tap = MIN(new_wnd_to_tap, MAX_WINDOW); if (!(conn->events & ESTABLISHED)) From bb41901c719f9ba422b538f773025dad5c398823 Mon Sep 17 00:00:00 2001 From: David Gibson <david@gibson.dropbear.id.au> Date: Wed, 18 Sep 2024 11:53:07 +1000 Subject: [PATCH 031/382] tcp: Make tcp_update_seqack_wnd()s force_seq parameter explicitly boolean This parameter is already treated as a boolean internally. Make it a 'bool' type for clarity.
Signed-off-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- tcp.c | 6 +++--- tcp_buf.c | 2 +- tcp_internal.h | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/tcp.c b/tcp.c index 92ac164..787df63 100644 --- a/tcp.c +++ b/tcp.c @@ -1020,7 +1020,7 @@ size_t tcp_l2_buf_fill_headers(const struct tcp_tap_conn *conn, * Return: 1 if sequence or window were updated, 0 otherwise */ int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_tap_conn *conn, - int force_seq, struct tcp_info *tinfo) + bool force_seq, struct tcp_info *tinfo) { uint32_t prev_wnd_to_tap = conn->wnd_to_tap << conn->ws_to_tap; uint32_t prev_ack_to_tap = conn->seq_ack_to_tap; @@ -1157,7 +1157,7 @@ int tcp_prepare_flags(const struct ctx *c, struct tcp_tap_conn *conn, if (!(conn->flags & LOCAL)) tcp_rtt_dst_check(conn, &tinfo); - if (!tcp_update_seqack_wnd(c, conn, flags, &tinfo) && !flags) + if (!tcp_update_seqack_wnd(c, conn, !!flags, &tinfo) && !flags) return 0; *optlen = 0; @@ -2240,7 +2240,7 @@ void tcp_sock_handler(const struct ctx *c, union epoll_ref ref, tcp_data_from_sock(c, conn); if (events & EPOLLOUT) - tcp_update_seqack_wnd(c, conn, 0, NULL); + tcp_update_seqack_wnd(c, conn, false, NULL); return; } diff --git a/tcp_buf.c b/tcp_buf.c index c886c92..83f91a3 100644 --- a/tcp_buf.c +++ b/tcp_buf.c @@ -511,7 +511,7 @@ int tcp_buf_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn) last_len = sendlen - (send_bufs - 1) * mss; /* Likely, some new data was acked too. 
*/ - tcp_update_seqack_wnd(c, conn, 0, NULL); + tcp_update_seqack_wnd(c, conn, false, NULL); /* Finally, queue to tap */ dlen = mss; diff --git a/tcp_internal.h b/tcp_internal.h index bd634be..a450d85 100644 --- a/tcp_internal.h +++ b/tcp_internal.h @@ -93,7 +93,7 @@ size_t tcp_l2_buf_fill_headers(const struct tcp_tap_conn *conn, struct iovec *iov, size_t dlen, const uint16_t *check, uint32_t seq); int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_tap_conn *conn, - int force_seq, struct tcp_info *tinfo); + bool force_seq, struct tcp_info *tinfo); int tcp_prepare_flags(const struct ctx *c, struct tcp_tap_conn *conn, int flags, struct tcphdr *th, char *data, size_t *optlen); From bfc294b90dc46d132a56dc0a2ae118f2bea5a266 Mon Sep 17 00:00:00 2001 From: David Gibson <david@gibson.dropbear.id.au> Date: Wed, 18 Sep 2024 20:44:05 +1000 Subject: [PATCH 032/382] util: Add helper to write() all of a buffer write(2) might not write all the data it is given. Add a write_all_buf() helper to keep calling it until all the given data is written, or we get an error. Currently we use write_remainder() to do this operation in pcap_frame(). That's a little awkward since it requires constructing an iovec, and future changes we want to make to write_remainder() will be easier in terms of this single buffer helper. 
Signed-off-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- pcap.c | 3 +-- util.c | 25 +++++++++++++++++++++++++ util.h | 1 + 3 files changed, 27 insertions(+), 2 deletions(-) diff --git a/pcap.c b/pcap.c index 46cc4b0..e6b5ced 100644 --- a/pcap.c +++ b/pcap.c @@ -86,9 +86,8 @@ static void pcap_frame(const struct iovec *iov, size_t iovcnt, .caplen = l2len, .len = l2len }; - struct iovec hiov = { &h, sizeof(h) }; - if (write_remainder(pcap_fd, &hiov, 1, 0) < 0 || + if (write_all_buf(pcap_fd, &h, sizeof(h)) < 0 || write_remainder(pcap_fd, iov, iovcnt, offset) < 0) debug_perror("Cannot log packet, length %zu", l2len); } diff --git a/util.c b/util.c index eede4e5..7db7c2e 100644 --- a/util.c +++ b/util.c @@ -582,6 +582,31 @@ int do_clone(int (*fn)(void *), char *stack_area, size_t stack_size, int flags, #endif } +/* write_all_buf() - write all of a buffer to an fd + * @fd: File descriptor + * @buf: Pointer to base of buffer + * @len: Length of buffer + * + * Return: 0 on success, -1 on error (with errno set) + * + * #syscalls write + */ +int write_all_buf(int fd, const void *buf, size_t len) +{ + const char *p = buf; + size_t left = len; + + while (left) { + ssize_t rc = write(fd, p, left); + + if (rc < 0) + return -1; + p += rc; + left -= rc; + } + return 0; +} + /* write_remainder() - write the tail of an IO vector to an fd * @fd: File descriptor * @iov: IO vector diff --git a/util.h b/util.h index c7a59d5..5e67f1f 100644 --- a/util.h +++ b/util.h @@ -200,6 +200,7 @@ void pidfile_write(int fd, pid_t pid); int __daemon(int pidfile_fd, int devnull_fd); int fls(unsigned long x); int write_file(const char *path, const char *buf); +int write_all_buf(int fd, const void *buf, size_t len); int write_remainder(int fd, const struct iovec *iov, size_t iovcnt, size_t skip); void close_open_files(int argc, char **argv); From d836d9e345865245bab28100a6065d6fa7b6a00c Mon Sep 17 00:00:00 2001 From: David Gibson 
<david@gibson.dropbear.id.au> Date: Wed, 18 Sep 2024 20:44:06 +1000 Subject: [PATCH 033/382] util: Remove possible quadratic behaviour from write_remainder() write_remainder() steps through the buffers in an IO vector writing out everything past a certain byte offset. However, on each iteration it rescans the buffer from the beginning to find out where we're up to. With an unfortunate set of write sizes this could lead to quadratic behaviour. In an even less likely set of circumstances (total vector length > maximum size_t) the 'skip' variable could overflow. This is one factor in a longstanding Coverity error we've seen (although I still can't figure out the remainder of its complaint). Rework write_remainder() to always work out our new position in the vector relative to our old/current position, rather than starting from the beginning each time. As a bonus this seems to fix the Coverity error. Signed-off-by: David Gibson <david@gibson.dropbear.id.au> Reviewed-by: Markus Armbruster <armbru@redhat.com> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- util.c | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git a/util.c b/util.c index 7db7c2e..87309c5 100644 --- a/util.c +++ b/util.c @@ -597,10 +597,15 @@ int write_all_buf(int fd, const void *buf, size_t len) size_t left = len; while (left) { - ssize_t rc = write(fd, p, left); + ssize_t rc; + + do + rc = write(fd, p, left); + while ((rc < 0) && errno == EINTR); if (rc < 0) return -1; + p += rc; left -= rc; } @@ -615,28 +620,30 @@ int write_all_buf(int fd, const void *buf, size_t len) * * Return: 0 on success, -1 on error (with errno set) * - * #syscalls write writev + * #syscalls writev */ int write_remainder(int fd, const struct iovec *iov, size_t iovcnt, size_t skip) { - size_t offset, i; + size_t i = 0, offset; - while ((i = iov_skip_bytes(iov, iovcnt, skip, &offset)) < iovcnt) { + while ((i += iov_skip_bytes(iov + i, iovcnt - i, skip, &offset)) < iovcnt) { ssize_t 
rc; if (offset) { - rc = write(fd, (char *)iov[i].iov_base + offset, - iov[i].iov_len - offset); - } else { - rc = writev(fd, &iov[i], iovcnt - i); + /* Write the remainder of the partially written buffer */ + if (write_all_buf(fd, (char *)iov[i].iov_base + offset, + iov[i].iov_len - offset) < 0) + return -1; + i++; } + /* Write as much of the remaining whole buffers as we can */ + rc = writev(fd, &iov[i], iovcnt - i); if (rc < 0) return -1; - skip += rc; + skip = rc; } - return 0; } From 4fe5f4e813b553f4877ffa2b485d941bb9f85ca2 Mon Sep 17 00:00:00 2001 From: Laurent Vivier <lvivier@redhat.com> Date: Wed, 18 Sep 2024 15:13:27 +0200 Subject: [PATCH 034/382] udp: Allow checksum to be disabled We can need not to set the UDP checksum. Add a parameter to udp_update_hdr4() and udp_update_hdr6() to disable it. Signed-off-by: Laurent Vivier <lvivier@redhat.com> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- udp.c | 58 ++++++++++++++++++++++++++++++++++++++++------------------ 1 file changed, 40 insertions(+), 18 deletions(-) diff --git a/udp.c b/udp.c index 2ba00c9..7b28313 100644 --- a/udp.c +++ b/udp.c @@ -294,15 +294,17 @@ static void udp_splice_send(const struct ctx *c, size_t start, size_t n, /** * udp_update_hdr4() - Update headers for one IPv4 datagram - * @ip4h: Pre-filled IPv4 header (except for tot_len and saddr) - * @bp: Pointer to udp_payload_t to update - * @toside: Flowside for destination side - * @dlen: Length of UDP payload + * @ip4h: Pre-filled IPv4 header (except for tot_len and saddr) + * @bp: Pointer to udp_payload_t to update + * @toside: Flowside for destination side + * @dlen: Length of UDP payload + * @no_udp_csum: Do not set UDP checksum * * Return: size of IPv4 payload (UDP header + data) */ static size_t udp_update_hdr4(struct iphdr *ip4h, struct udp_payload_t *bp, - const struct flowside *toside, size_t dlen) + const struct flowside *toside, size_t dlen, + bool no_udp_csum) { const struct in_addr *src = inany_v4(&toside->oaddr); const 
struct in_addr *dst = inany_v4(&toside->eaddr); @@ -319,22 +321,28 @@ static size_t udp_update_hdr4(struct iphdr *ip4h, struct udp_payload_t *bp, bp->uh.source = htons(toside->oport); bp->uh.dest = htons(toside->eport); bp->uh.len = htons(l4len); - csum_udp4(&bp->uh, *src, *dst, bp->data, dlen); + if (no_udp_csum) + bp->uh.check = 0; + else + csum_udp4(&bp->uh, *src, *dst, bp->data, dlen); return l4len; } /** * udp_update_hdr6() - Update headers for one IPv6 datagram - * @ip6h: Pre-filled IPv6 header (except for payload_len and addresses) - * @bp: Pointer to udp_payload_t to update - * @toside: Flowside for destination side - * @dlen: Length of UDP payload + * @ip6h: Pre-filled IPv6 header (except for payload_len and + * addresses) + * @bp: Pointer to udp_payload_t to update + * @toside: Flowside for destination side + * @dlen: Length of UDP payload + * @no_udp_csum: Do not set UDP checksum * * Return: size of IPv6 payload (UDP header + data) */ static size_t udp_update_hdr6(struct ipv6hdr *ip6h, struct udp_payload_t *bp, - const struct flowside *toside, size_t dlen) + const struct flowside *toside, size_t dlen, + bool no_udp_csum) { uint16_t l4len = dlen + sizeof(bp->uh); @@ -348,7 +356,16 @@ static size_t udp_update_hdr6(struct ipv6hdr *ip6h, struct udp_payload_t *bp, bp->uh.source = htons(toside->oport); bp->uh.dest = htons(toside->eport); bp->uh.len = ip6h->payload_len; - csum_udp6(&bp->uh, &toside->oaddr.a6, &toside->eaddr.a6, bp->data, dlen); + if (no_udp_csum) { + /* 0 is an invalid checksum for UDP IPv6 and dropped by + * the kernel stack, even if the checksum is disabled by virtio + * flags. We need to put any non-zero value here. 
+ */ + bp->uh.check = 0xffff; + } else { + csum_udp6(&bp->uh, &toside->oaddr.a6, &toside->eaddr.a6, + bp->data, dlen); + } return l4len; } @@ -358,9 +375,11 @@ static size_t udp_update_hdr6(struct ipv6hdr *ip6h, struct udp_payload_t *bp, * @mmh: Receiving mmsghdr array * @idx: Index of the datagram to prepare * @toside: Flowside for destination side + * @no_udp_csum: Do not set UDP checksum */ -static void udp_tap_prepare(const struct mmsghdr *mmh, unsigned idx, - const struct flowside *toside) +static void udp_tap_prepare(const struct mmsghdr *mmh, + unsigned idx, const struct flowside *toside, + bool no_udp_csum) { struct iovec (*tap_iov)[UDP_NUM_IOVS] = &udp_l2_iov[idx]; struct udp_payload_t *bp = &udp_payload[idx]; @@ -368,13 +387,15 @@ static void udp_tap_prepare(const struct mmsghdr *mmh, unsigned idx, size_t l4len; if (!inany_v4(&toside->eaddr) || !inany_v4(&toside->oaddr)) { - l4len = udp_update_hdr6(&bm->ip6h, bp, toside, mmh[idx].msg_len); + l4len = udp_update_hdr6(&bm->ip6h, bp, toside, + mmh[idx].msg_len, no_udp_csum); tap_hdr_update(&bm->taph, l4len + sizeof(bm->ip6h) + sizeof(udp6_eth_hdr)); (*tap_iov)[UDP_IOV_ETH] = IOV_OF_LVALUE(udp6_eth_hdr); (*tap_iov)[UDP_IOV_IP] = IOV_OF_LVALUE(bm->ip6h); } else { - l4len = udp_update_hdr4(&bm->ip4h, bp, toside, mmh[idx].msg_len); + l4len = udp_update_hdr4(&bm->ip4h, bp, toside, + mmh[idx].msg_len, no_udp_csum); tap_hdr_update(&bm->taph, l4len + sizeof(bm->ip4h) + sizeof(udp4_eth_hdr)); (*tap_iov)[UDP_IOV_ETH] = IOV_OF_LVALUE(udp4_eth_hdr); @@ -565,7 +586,8 @@ void udp_listen_sock_handler(const struct ctx *c, union epoll_ref ref, udp_splice_prepare(udp_mh_recv, i); } else if (batchpif == PIF_TAP) { udp_tap_prepare(udp_mh_recv, i, - flowside_at_sidx(batchsidx)); + flowside_at_sidx(batchsidx), + false); } if (++i >= n) @@ -636,7 +658,7 @@ void udp_reply_sock_handler(const struct ctx *c, union epoll_ref ref, if (pif_is_socket(topif)) udp_splice_prepare(udp_mh_recv, i); else if (topif == PIF_TAP) - 
udp_tap_prepare(udp_mh_recv, i, toside); + udp_tap_prepare(udp_mh_recv, i, toside, false); /* Restore sockaddr length clobbered by recvmsg() */ udp_mh_recv[i].msg_hdr.msg_namelen = sizeof(udp_meta[i].s_in); } From 8f8c4d27eb2e023fd80986d8fdf8a68b37e3877e Mon Sep 17 00:00:00 2001 From: Laurent Vivier <lvivier@redhat.com> Date: Wed, 18 Sep 2024 15:13:28 +0200 Subject: [PATCH 035/382] tcp: Allow checksum to be disabled We can need not to set TCP checksum. Add a parameter to tcp_fill_headers4() and tcp_fill_headers6() to disable it. Signed-off-by: Laurent Vivier <lvivier@redhat.com> Reviewed-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- tcp.c | 52 ++++++++++++++++++++++++++++++-------------------- tcp_buf.c | 8 +++++--- tcp_internal.h | 3 ++- 3 files changed, 38 insertions(+), 25 deletions(-) diff --git a/tcp.c b/tcp.c index 787df63..1962fcc 100644 --- a/tcp.c +++ b/tcp.c @@ -899,13 +899,14 @@ static void tcp_fill_header(struct tcphdr *th, /** * tcp_fill_headers4() - Fill 802.3, IPv4, TCP headers in pre-cooked buffers - * @conn: Connection pointer - * @taph: tap backend specific header - * @iph: Pointer to IPv4 header - * @th: Pointer to TCP header - * @dlen: TCP payload length - * @check: Checksum, if already known - * @seq: Sequence number for this segment + * @conn: Connection pointer + * @taph: tap backend specific header + * @iph: Pointer to IPv4 header + * @th: Pointer to TCP header + * @dlen: TCP payload length + * @check: Checksum, if already known + * @seq: Sequence number for this segment + * @no_tcp_csum: Do not set TCP checksum * * Return: The IPv4 payload length, host order */ @@ -913,7 +914,7 @@ static size_t tcp_fill_headers4(const struct tcp_tap_conn *conn, struct tap_hdr *taph, struct iphdr *iph, struct tcphdr *th, size_t dlen, const uint16_t *check, - uint32_t seq) + uint32_t seq, bool no_tcp_csum) { const struct flowside *tapside = TAPFLOW(conn); const struct in_addr *src4 = 
inany_v4(&tapside->oaddr); @@ -932,7 +933,10 @@ static size_t tcp_fill_headers4(const struct tcp_tap_conn *conn, tcp_fill_header(th, conn, seq); - tcp_update_check_tcp4(iph, th); + if (no_tcp_csum) + th->check = 0; + else + tcp_update_check_tcp4(iph, th); tap_hdr_update(taph, l3len + sizeof(struct ethhdr)); @@ -941,20 +945,21 @@ static size_t tcp_fill_headers4(const struct tcp_tap_conn *conn, /** * tcp_fill_headers6() - Fill 802.3, IPv6, TCP headers in pre-cooked buffers - * @conn: Connection pointer - * @taph: tap backend specific header - * @ip6h: Pointer to IPv6 header - * @th: Pointer to TCP header - * @dlen: TCP payload length - * @check: Checksum, if already known - * @seq: Sequence number for this segment + * @conn: Connection pointer + * @taph: tap backend specific header + * @ip6h: Pointer to IPv6 header + * @th: Pointer to TCP header + * @dlen: TCP payload length + * @check: Checksum, if already known + * @seq: Sequence number for this segment + * @no_tcp_csum: Do not set TCP checksum * * Return: The IPv6 payload length, host order */ static size_t tcp_fill_headers6(const struct tcp_tap_conn *conn, struct tap_hdr *taph, struct ipv6hdr *ip6h, struct tcphdr *th, - size_t dlen, uint32_t seq) + size_t dlen, uint32_t seq, bool no_tcp_csum) { const struct flowside *tapside = TAPFLOW(conn); size_t l4len = dlen + sizeof(*th); @@ -973,7 +978,10 @@ static size_t tcp_fill_headers6(const struct tcp_tap_conn *conn, tcp_fill_header(th, conn, seq); - tcp_update_check_tcp6(ip6h, th); + if (no_tcp_csum) + th->check = 0; + else + tcp_update_check_tcp6(ip6h, th); tap_hdr_update(taph, l4len + sizeof(*ip6h) + sizeof(struct ethhdr)); @@ -987,12 +995,14 @@ static size_t tcp_fill_headers6(const struct tcp_tap_conn *conn, * @dlen: TCP payload length * @check: Checksum, if already known * @seq: Sequence number for this segment + * @no_tcp_csum: Do not set TCP checksum * * Return: IP payload length, host order */ size_t tcp_l2_buf_fill_headers(const struct tcp_tap_conn *conn, 
struct iovec *iov, size_t dlen, - const uint16_t *check, uint32_t seq) + const uint16_t *check, uint32_t seq, + bool no_tcp_csum) { const struct flowside *tapside = TAPFLOW(conn); const struct in_addr *a4 = inany_v4(&tapside->oaddr); @@ -1001,13 +1011,13 @@ size_t tcp_l2_buf_fill_headers(const struct tcp_tap_conn *conn, return tcp_fill_headers4(conn, iov[TCP_IOV_TAP].iov_base, iov[TCP_IOV_IP].iov_base, iov[TCP_IOV_PAYLOAD].iov_base, dlen, - check, seq); + check, seq, no_tcp_csum); } return tcp_fill_headers6(conn, iov[TCP_IOV_TAP].iov_base, iov[TCP_IOV_IP].iov_base, iov[TCP_IOV_PAYLOAD].iov_base, dlen, - seq); + seq, no_tcp_csum); } /** diff --git a/tcp_buf.c b/tcp_buf.c index 83f91a3..ffbff5e 100644 --- a/tcp_buf.c +++ b/tcp_buf.c @@ -320,7 +320,7 @@ int tcp_buf_send_flag(const struct ctx *c, struct tcp_tap_conn *conn, int flags) return ret; } - l4len = tcp_l2_buf_fill_headers(conn, iov, optlen, NULL, seq); + l4len = tcp_l2_buf_fill_headers(conn, iov, optlen, NULL, seq, false); iov[TCP_IOV_PAYLOAD].iov_len = l4len; if (flags & DUP_ACK) { @@ -381,7 +381,8 @@ static void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn, tcp4_frame_conns[tcp4_payload_used] = conn; iov = tcp4_l2_iov[tcp4_payload_used++]; - l4len = tcp_l2_buf_fill_headers(conn, iov, dlen, check, seq); + l4len = tcp_l2_buf_fill_headers(conn, iov, dlen, check, seq, + false); iov[TCP_IOV_PAYLOAD].iov_len = l4len; if (tcp4_payload_used > TCP_FRAMES_MEM - 1) tcp_payload_flush(c); @@ -389,7 +390,8 @@ static void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn, tcp6_frame_conns[tcp6_payload_used] = conn; iov = tcp6_l2_iov[tcp6_payload_used++]; - l4len = tcp_l2_buf_fill_headers(conn, iov, dlen, NULL, seq); + l4len = tcp_l2_buf_fill_headers(conn, iov, dlen, NULL, seq, + false); iov[TCP_IOV_PAYLOAD].iov_len = l4len; if (tcp6_payload_used > TCP_FRAMES_MEM - 1) tcp_payload_flush(c); diff --git a/tcp_internal.h b/tcp_internal.h index a450d85..de06db1 100644 --- a/tcp_internal.h +++ 
b/tcp_internal.h @@ -91,7 +91,8 @@ void tcp_rst_do(const struct ctx *c, struct tcp_tap_conn *conn); size_t tcp_l2_buf_fill_headers(const struct tcp_tap_conn *conn, struct iovec *iov, size_t dlen, - const uint16_t *check, uint32_t seq); + const uint16_t *check, uint32_t seq, + bool no_tcp_csum); int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_tap_conn *conn, bool force_seq, struct tcp_info *tinfo); int tcp_prepare_flags(const struct ctx *c, struct tcp_tap_conn *conn, int flags, From 204e77cd11b2df720c9acd35d562e1ed868304b4 Mon Sep 17 00:00:00 2001 From: David Gibson <david@gibson.dropbear.id.au> Date: Fri, 20 Sep 2024 14:12:41 +1000 Subject: [PATCH 036/382] udp: Don't attempt to get dual-stack sockets in nonsensical cases To save some kernel memory we try to use "dual stack" sockets (that listen to both IPv4 and IPv6 traffic) when possible. However udp_sock_init() attempts to do this in some cases that can't work. Specifically we can only do this when listening on any address. That's never true for the ns (splicing) case, because we always listen on loopback. For the !ns case and AF_UNSPEC case, addr should always be NULL, but add an assert to verify. This is harmless: if addr is non-NULL, sock_l4() will just fail and we'll fall back to the other path. But, it's messy and makes some upcoming changes harder, so avoid attempting this in cases we know can't work. 
Signed-off-by: David Gibson <david@gibson.dropbear.id.au> --- udp.c | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) diff --git a/udp.c b/udp.c index 7b28313..8cea80c 100644 --- a/udp.c +++ b/udp.c @@ -803,21 +803,16 @@ int udp_sock_init(const struct ctx *c, int ns, sa_family_t af, ASSERT(!c->no_udp); - if (af == AF_UNSPEC && c->ifi4 && c->ifi6) { + if (af == AF_UNSPEC && c->ifi4 && c->ifi6 && !ns) { int s; + ASSERT(!addr); + /* Attempt to get a dual stack socket */ - if (!ns) { - s = sock_l4(c, AF_UNSPEC, EPOLL_TYPE_UDP_LISTEN, - addr, ifname, port, uref.u32); - udp_splice_init[V4][port] = s < 0 ? -1 : s; - udp_splice_init[V6][port] = s < 0 ? -1 : s; - } else { - s = sock_l4(c, AF_UNSPEC, EPOLL_TYPE_UDP_LISTEN, - &in4addr_loopback, ifname, port, uref.u32); - udp_splice_ns[V4][port] = s < 0 ? -1 : s; - udp_splice_ns[V6][port] = s < 0 ? -1 : s; - } + s = sock_l4(c, AF_UNSPEC, EPOLL_TYPE_UDP_LISTEN, + NULL, ifname, port, uref.u32); + udp_splice_init[V4][port] = s < 0 ? -1 : s; + udp_splice_init[V6][port] = s < 0 ? -1 : s; if (IN_INTERVAL(0, FD_REF_MAX, s)) return 0; } From b8d4fac6a2e77a93d9b0d291cd1ca803a29f890e Mon Sep 17 00:00:00 2001 From: David Gibson <david@gibson.dropbear.id.au> Date: Fri, 20 Sep 2024 14:12:42 +1000 Subject: [PATCH 037/382] util, pif: Replace sock_l4() with pif_sock_l4() The sock_l4() function is very convenient for creating sockets bound to a given address, but its interface has some problems. Most importantly, the address and port alone aren't enough in some cases. For link-local addresses (at least) we also need the pif in order to properly construct a socket address. This case doesn't yet arise, but it might cause us trouble in future. Additionally, sock_l4() can take AF_UNSPEC with the special meaning that it should attempt to create a "dual stack" socket which will respond to both IPv4 and IPv6 traffic. This only makes sense if there is no specific address given.
We verify this at runtime, but it would be nicer if we could enforce it structurally. For sockets associated specifically with a single flow we already replaced sock_l4() with flowside_sock_l4() which avoids those problems. Now, replace all the remaining users with a new pif_sock_l4() which also takes an explicit pif. The new function takes the address as an inany *, with NULL indicating the dual stack case. This does add some complexity in some of the callers, however future planned cleanups should make this go away again. Signed-off-by: David Gibson <david@gibson.dropbear.id.au> --- pif.c | 42 ++++++++++++++++++++++++++++++++++++++++++ pif.h | 3 +++ tcp.c | 22 +++++++++++++++++----- udp.c | 34 ++++++++++++++++++++++------------ util.c | 52 ---------------------------------------------------- util.h | 3 --- 6 files changed, 84 insertions(+), 72 deletions(-) diff --git a/pif.c b/pif.c index a099e31..592fafa 100644 --- a/pif.c +++ b/pif.c @@ -59,3 +59,45 @@ void pif_sockaddr(const struct ctx *c, union sockaddr_inany *sa, socklen_t *sl, *sl = sizeof(sa->sa6); } } + +/** pif_sock_l4() - Open a socket bound to an address on a specified interface + * @c: Execution context + * @type: Socket epoll type + * @pif: Interface for this socket + * @addr: Address to bind to, or NULL for dual-stack any + * @ifname: Interface for binding, NULL for any + * @port: Port number to bind to (host byte order) + * @data: epoll reference portion for protocol handlers + * + * NOTE: For namespace pifs, this must be called having already entered the + * relevant namespace. 
+ * + * Return: newly created socket, negative error code on failure + */ +int pif_sock_l4(const struct ctx *c, enum epoll_type type, uint8_t pif, + const union inany_addr *addr, const char *ifname, + in_port_t port, uint32_t data) +{ + union sockaddr_inany sa = { + .sa6.sin6_family = AF_INET6, + .sa6.sin6_addr = in6addr_any, + .sa6.sin6_port = htons(port), + }; + socklen_t sl; + + ASSERT(pif_is_socket(pif)); + + if (pif == PIF_SPLICE) { + /* Sanity checks */ + ASSERT(!ifname); + ASSERT(addr && inany_is_loopback(addr)); + } + + if (!addr) + return sock_l4_sa(c, type, &sa, sizeof(sa.sa6), + ifname, false, data); + + pif_sockaddr(c, &sa, &sl, pif, addr, port); + return sock_l4_sa(c, type, &sa, sl, + ifname, sa.sa_family == AF_INET6, data); +} diff --git a/pif.h b/pif.h index 8777bb5..f029282 100644 --- a/pif.h +++ b/pif.h @@ -59,5 +59,8 @@ static inline bool pif_is_socket(uint8_t pif) void pif_sockaddr(const struct ctx *c, union sockaddr_inany *sa, socklen_t *sl, uint8_t pif, const union inany_addr *addr, in_port_t port); +int pif_sock_l4(const struct ctx *c, enum epoll_type type, uint8_t pif, + const union inany_addr *addr, const char *ifname, + in_port_t port, uint32_t data); #endif /* PIF_H */ diff --git a/tcp.c b/tcp.c index 1962fcc..49e0cfe 100644 --- a/tcp.c +++ b/tcp.c @@ -2291,7 +2291,19 @@ static int tcp_sock_init_af(const struct ctx *c, sa_family_t af, in_port_t port, }; int s; - s = sock_l4(c, af, EPOLL_TYPE_TCP_LISTEN, addr, ifname, port, tref.u32); + if (af == AF_UNSPEC) { + ASSERT(!addr); + s = pif_sock_l4(c, EPOLL_TYPE_TCP_LISTEN, PIF_HOST, NULL, + ifname, port, tref.u32); + } else { + union inany_addr aany = af == AF_INET ? 
inany_any4 : inany_any6; + + if (addr) + inany_from_af(&aany, af, addr); + + s = pif_sock_l4(c, EPOLL_TYPE_TCP_LISTEN, PIF_HOST, &aany, + ifname, port, tref.u32); + } if (c->tcp.fwd_in.mode == FWD_AUTO) { if (af == AF_INET || af == AF_UNSPEC) @@ -2357,8 +2369,8 @@ static void tcp_ns_sock_init4(const struct ctx *c, in_port_t port) ASSERT(c->mode == MODE_PASTA); - s = sock_l4(c, AF_INET, EPOLL_TYPE_TCP_LISTEN, &in4addr_loopback, - NULL, port, tref.u32); + s = pif_sock_l4(c, EPOLL_TYPE_TCP_LISTEN, PIF_SPLICE, &inany_loopback4, + NULL, port, tref.u32); if (s >= 0) tcp_sock_set_bufsize(c, s); else @@ -2383,8 +2395,8 @@ static void tcp_ns_sock_init6(const struct ctx *c, in_port_t port) ASSERT(c->mode == MODE_PASTA); - s = sock_l4(c, AF_INET6, EPOLL_TYPE_TCP_LISTEN, &in6addr_loopback, - NULL, port, tref.u32); + s = pif_sock_l4(c, EPOLL_TYPE_TCP_LISTEN, PIF_SPLICE, &inany_loopback6, + NULL, port, tref.u32); if (s >= 0) tcp_sock_set_bufsize(c, s); else diff --git a/udp.c b/udp.c index 8cea80c..b3d4a64 100644 --- a/udp.c +++ b/udp.c @@ -809,8 +809,8 @@ int udp_sock_init(const struct ctx *c, int ns, sa_family_t af, ASSERT(!addr); /* Attempt to get a dual stack socket */ - s = sock_l4(c, AF_UNSPEC, EPOLL_TYPE_UDP_LISTEN, - NULL, ifname, port, uref.u32); + s = pif_sock_l4(c, EPOLL_TYPE_UDP_LISTEN, PIF_HOST, + NULL, ifname, port, uref.u32); udp_splice_init[V4][port] = s < 0 ? -1 : s; udp_splice_init[V6][port] = s < 0 ? -1 : s; if (IN_INTERVAL(0, FD_REF_MAX, s)) @@ -819,28 +819,38 @@ int udp_sock_init(const struct ctx *c, int ns, sa_family_t af, if ((af == AF_INET || af == AF_UNSPEC) && c->ifi4) { if (!ns) { - r4 = sock_l4(c, AF_INET, EPOLL_TYPE_UDP_LISTEN, - addr, ifname, port, uref.u32); + union inany_addr aany = inany_any4; + + if (addr) + inany_from_af(&aany, AF_INET, addr); + + r4 = pif_sock_l4(c, EPOLL_TYPE_UDP_LISTEN, PIF_HOST, + &aany, ifname, port, uref.u32); udp_splice_init[V4][port] = r4 < 0 ? 
-1 : r4; } else { - r4 = sock_l4(c, AF_INET, EPOLL_TYPE_UDP_LISTEN, - &in4addr_loopback, - ifname, port, uref.u32); + r4 = pif_sock_l4(c, EPOLL_TYPE_UDP_LISTEN, PIF_SPLICE, + &inany_loopback4, ifname, + port, uref.u32); udp_splice_ns[V4][port] = r4 < 0 ? -1 : r4; } } if ((af == AF_INET6 || af == AF_UNSPEC) && c->ifi6) { if (!ns) { - r6 = sock_l4(c, AF_INET6, EPOLL_TYPE_UDP_LISTEN, - addr, ifname, port, uref.u32); + union inany_addr aany = inany_any6; + + if (addr) + inany_from_af(&aany, AF_INET6, addr); + + r6 = pif_sock_l4(c, EPOLL_TYPE_UDP_LISTEN, PIF_HOST, + &aany, ifname, port, uref.u32); udp_splice_init[V6][port] = r6 < 0 ? -1 : r6; } else { - r6 = sock_l4(c, AF_INET6, EPOLL_TYPE_UDP_LISTEN, - &in6addr_loopback, - ifname, port, uref.u32); + r6 = pif_sock_l4(c, EPOLL_TYPE_UDP_LISTEN, PIF_SPLICE, + &inany_loopback6, ifname, + port, uref.u32); udp_splice_ns[V6][port] = r6 < 0 ? -1 : r6; } } diff --git a/util.c b/util.c index 87309c5..ebd93ed 100644 --- a/util.c +++ b/util.c @@ -157,58 +157,6 @@ int sock_l4_sa(const struct ctx *c, enum epoll_type type, return fd; } -/** - * sock_l4() - Create and bind socket for given L4, add to epoll list - * @c: Execution context - * @af: Address family, AF_INET or AF_INET6 - * @type: epoll type - * @bind_addr: Address for binding, NULL for any - * @ifname: Interface for binding, NULL for any - * @port: Port, host order - * @data: epoll reference portion for protocol handlers - * - * Return: newly created socket, negative error code on failure - */ -int sock_l4(const struct ctx *c, sa_family_t af, enum epoll_type type, - const void *bind_addr, const char *ifname, uint16_t port, - uint32_t data) -{ - switch (af) { - case AF_INET: { - struct sockaddr_in addr4 = { - .sin_family = AF_INET, - .sin_port = htons(port), - { 0 }, { 0 }, - }; - if (bind_addr) - addr4.sin_addr = *(struct in_addr *)bind_addr; - return sock_l4_sa(c, type, &addr4, sizeof(addr4), ifname, - false, data); - } - - case AF_UNSPEC: - if (!DUAL_STACK_SOCKETS || 
bind_addr) - return -EINVAL; - /* fallthrough */ - case AF_INET6: { - struct sockaddr_in6 addr6 = { - .sin6_family = AF_INET6, - .sin6_port = htons(port), - 0, IN6ADDR_ANY_INIT, 0, - }; - if (bind_addr) { - addr6.sin6_addr = *(struct in6_addr *)bind_addr; - - if (IN6_IS_ADDR_LINKLOCAL(bind_addr)) - addr6.sin6_scope_id = c->ifi6; - } - return sock_l4_sa(c, type, &addr6, sizeof(addr6), ifname, - af == AF_INET6, data); - } - default: - return -EINVAL; - } -} /** * sock_probe_mem() - Check if setting high SO_SNDBUF and SO_RCVBUF is allowed diff --git a/util.h b/util.h index 5e67f1f..2c1e08e 100644 --- a/util.h +++ b/util.h @@ -181,9 +181,6 @@ int close_range(unsigned int first, unsigned int last, int flags) { int sock_l4_sa(const struct ctx *c, enum epoll_type type, const void *sa, socklen_t sl, const char *ifname, bool v6only, uint32_t data); -int sock_l4(const struct ctx *c, sa_family_t af, enum epoll_type type, - const void *bind_addr, const char *ifname, uint16_t port, - uint32_t data); void sock_probe_mem(struct ctx *c); long timespec_diff_ms(const struct timespec *a, const struct timespec *b); int64_t timespec_diff_us(const struct timespec *a, const struct timespec *b); From cbde4192eeef7a5640aea6dd84d5eac02841ef5c Mon Sep 17 00:00:00 2001 From: David Gibson <david@gibson.dropbear.id.au> Date: Fri, 20 Sep 2024 14:12:43 +1000 Subject: [PATCH 038/382] tcp, udp: Make {tcp,udp}_sock_init() take an inany address tcp_sock_init() and udp_sock_init() take an address to bind to as an address family and void * pair. Use an inany instead. Formerly AF_UNSPEC was used to indicate that we want to listen on both 0.0.0.0 and ::, now use a NULL inany to indicate that. 
Signed-off-by: David Gibson <david@gibson.dropbear.id.au> --- conf.c | 28 ++++++++++++++-------------- tcp.c | 47 ++++++++++++++++++----------------------------- tcp.h | 2 +- udp.c | 31 ++++++++++--------------------- udp.h | 4 ++-- 5 files changed, 45 insertions(+), 67 deletions(-) diff --git a/conf.c b/conf.c index b275886..9f1cd83 100644 --- a/conf.c +++ b/conf.c @@ -116,11 +116,10 @@ static int parse_port_range(const char *s, char **endptr, static void conf_ports(const struct ctx *c, char optname, const char *optarg, struct fwd_ports *fwd) { - char addr_buf[sizeof(struct in6_addr)] = { 0 }, *addr = addr_buf; + union inany_addr addr_buf = inany_any6, *addr = &addr_buf; char buf[BUFSIZ], *spec, *ifname = NULL, *p; bool exclude_only = true, bound_one = false; uint8_t exclude[PORT_BITMAP_SIZE] = { 0 }; - sa_family_t af = AF_UNSPEC; unsigned i; int ret; @@ -166,15 +165,13 @@ static void conf_ports(const struct ctx *c, char optname, const char *optarg, bitmap_set(fwd->map, i); if (optname == 't') { - ret = tcp_sock_init(c, AF_UNSPEC, NULL, NULL, - i); + ret = tcp_sock_init(c, NULL, NULL, i); if (ret == -ENFILE || ret == -EMFILE) goto enfile; if (!ret) bound_one = true; } else if (optname == 'u') { - ret = udp_sock_init(c, 0, AF_UNSPEC, NULL, NULL, - i); + ret = udp_sock_init(c, 0, NULL, NULL, i); if (ret == -ENFILE || ret == -EMFILE) goto enfile; if (!ret) @@ -218,6 +215,9 @@ static void conf_ports(const struct ctx *c, char optname, const char *optarg, if (ifname == buf + 1) { /* Interface without address */ addr = NULL; } else { + struct in6_addr a6; + struct in_addr a4; + p = buf; /* Allow square brackets for IPv4 too for convenience */ @@ -226,10 +226,10 @@ static void conf_ports(const struct ctx *c, char optname, const char *optarg, p++; } - if (inet_pton(AF_INET, p, addr)) - af = AF_INET; - else if (inet_pton(AF_INET6, p, addr)) - af = AF_INET6; + if (inet_pton(AF_INET, p, &a4)) + inany_from_af(addr, AF_INET, &a4); + else if (inet_pton(AF_INET6, p, &a6)) + 
inany_from_af(addr, AF_INET6, &a6); else goto bad; } @@ -276,13 +276,13 @@ static void conf_ports(const struct ctx *c, char optname, const char *optarg, bitmap_set(fwd->map, i); if (optname == 't') { - ret = tcp_sock_init(c, af, addr, ifname, i); + ret = tcp_sock_init(c, addr, ifname, i); if (ret == -ENFILE || ret == -EMFILE) goto enfile; if (!ret) bound_one = true; } else if (optname == 'u') { - ret = udp_sock_init(c, 0, af, addr, ifname, i); + ret = udp_sock_init(c, 0, addr, ifname, i); if (ret == -ENFILE || ret == -EMFILE) goto enfile; if (!ret) @@ -338,9 +338,9 @@ static void conf_ports(const struct ctx *c, char optname, const char *optarg, ret = 0; if (optname == 't') - ret = tcp_sock_init(c, af, addr, ifname, i); + ret = tcp_sock_init(c, addr, ifname, i); else if (optname == 'u') - ret = udp_sock_init(c, 0, af, addr, ifname, i); + ret = udp_sock_init(c, 0, addr, ifname, i); if (ret) goto bind_fail; } diff --git a/tcp.c b/tcp.c index 49e0cfe..6ca3700 100644 --- a/tcp.c +++ b/tcp.c @@ -2273,17 +2273,16 @@ void tcp_sock_handler(const struct ctx *c, union epoll_ref ref, } /** - * tcp_sock_init_af() - Initialise listening socket for a given af and port + * tcp_sock_init_one() - Initialise listening socket for address and port * @c: Execution context - * @af: Address family to listen on - * @port: Port, host order - * @addr: Pointer to address for binding, NULL if not configured + * @addr: Pointer to address for binding, NULL for dual stack any * @ifname: Name of interface to bind to, NULL if not configured + * @port: Port, host order * * Return: fd for the new listening socket, negative error code on failure */ -static int tcp_sock_init_af(const struct ctx *c, sa_family_t af, in_port_t port, - const void *addr, const char *ifname) +static int tcp_sock_init_one(const struct ctx *c, const union inany_addr *addr, + const char *ifname, in_port_t port) { union tcp_listen_epoll_ref tref = { .port = port, @@ -2291,24 +2290,13 @@ static int tcp_sock_init_af(const struct 
ctx *c, sa_family_t af, in_port_t port, }; int s; - if (af == AF_UNSPEC) { - ASSERT(!addr); - s = pif_sock_l4(c, EPOLL_TYPE_TCP_LISTEN, PIF_HOST, NULL, + s = pif_sock_l4(c, EPOLL_TYPE_TCP_LISTEN, PIF_HOST, addr, ifname, port, tref.u32); - } else { - union inany_addr aany = af == AF_INET ? inany_any4 : inany_any6; - - if (addr) - inany_from_af(&aany, af, addr); - - s = pif_sock_l4(c, EPOLL_TYPE_TCP_LISTEN, PIF_HOST, &aany, - ifname, port, tref.u32); - } if (c->tcp.fwd_in.mode == FWD_AUTO) { - if (af == AF_INET || af == AF_UNSPEC) + if (!addr || inany_v4(addr)) tcp_sock_init_ext[port][V4] = s < 0 ? -1 : s; - if (af == AF_INET6 || af == AF_UNSPEC) + if (!addr || !inany_v4(addr)) tcp_sock_init_ext[port][V6] = s < 0 ? -1 : s; } @@ -2322,31 +2310,32 @@ static int tcp_sock_init_af(const struct ctx *c, sa_family_t af, in_port_t port, /** * tcp_sock_init() - Create listening sockets for a given host ("inbound") port * @c: Execution context - * @af: Address family to select a specific IP version, or AF_UNSPEC * @addr: Pointer to address for binding, NULL if not configured * @ifname: Name of interface to bind to, NULL if not configured * @port: Port, host order * * Return: 0 on (partial) success, negative error code on (complete) failure */ -int tcp_sock_init(const struct ctx *c, sa_family_t af, const void *addr, +int tcp_sock_init(const struct ctx *c, const union inany_addr *addr, const char *ifname, in_port_t port) { int r4 = FD_REF_MAX + 1, r6 = FD_REF_MAX + 1; ASSERT(!c->no_tcp); - if (af == AF_UNSPEC && c->ifi4 && c->ifi6) + if (!addr && c->ifi4 && c->ifi6) /* Attempt to get a dual stack socket */ - if (tcp_sock_init_af(c, AF_UNSPEC, port, addr, ifname) >= 0) + if (tcp_sock_init_one(c, NULL, ifname, port) >= 0) return 0; /* Otherwise create a socket per IP version */ - if ((af == AF_INET || af == AF_UNSPEC) && c->ifi4) - r4 = tcp_sock_init_af(c, AF_INET, port, addr, ifname); + if ((!addr || inany_v4(addr)) && c->ifi4) + r4 = tcp_sock_init_one(c, addr ? 
addr : &inany_any4, + ifname, port); - if ((af == AF_INET6 || af == AF_UNSPEC) && c->ifi6) - r6 = tcp_sock_init_af(c, AF_INET6, port, addr, ifname); + if ((!addr || !inany_v4(addr)) && c->ifi6) + r6 = tcp_sock_init_one(c, addr ? addr : &inany_any6, + ifname, port); if (IN_INTERVAL(0, FD_REF_MAX, r4) || IN_INTERVAL(0, FD_REF_MAX, r6)) return 0; @@ -2629,7 +2618,7 @@ static void tcp_port_rebind(struct ctx *c, bool outbound) if (outbound) tcp_ns_sock_init(c, port); else - tcp_sock_init(c, AF_UNSPEC, NULL, NULL, port); + tcp_sock_init(c, NULL, NULL, port); } } } diff --git a/tcp.h b/tcp.h index 5585924..cf30744 100644 --- a/tcp.h +++ b/tcp.h @@ -18,7 +18,7 @@ void tcp_sock_handler(const struct ctx *c, union epoll_ref ref, int tcp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af, const void *saddr, const void *daddr, const struct pool *p, int idx, const struct timespec *now); -int tcp_sock_init(const struct ctx *c, sa_family_t af, const void *addr, +int tcp_sock_init(const struct ctx *c, const union inany_addr *addr, const char *ifname, in_port_t port); int tcp_init(struct ctx *c); void tcp_timer(struct ctx *c, const struct timespec *now); diff --git a/udp.c b/udp.c index b3d4a64..08faaec 100644 --- a/udp.c +++ b/udp.c @@ -785,15 +785,14 @@ int udp_tap_handler(const struct ctx *c, uint8_t pif, * udp_sock_init() - Initialise listening sockets for a given port * @c: Execution context * @ns: In pasta mode, if set, bind with loopback address in namespace - * @af: Address family to select a specific IP version, or AF_UNSPEC * @addr: Pointer to address for binding, NULL if not configured * @ifname: Name of interface to bind to, NULL if not configured * @port: Port, host order * * Return: 0 on (partial) success, negative error code on (complete) failure */ -int udp_sock_init(const struct ctx *c, int ns, sa_family_t af, - const void *addr, const char *ifname, in_port_t port) +int udp_sock_init(const struct ctx *c, int ns, const union inany_addr *addr, + const char 
*ifname, in_port_t port) { union udp_listen_epoll_ref uref = { .pif = ns ? PIF_SPLICE : PIF_HOST, @@ -803,11 +802,9 @@ int udp_sock_init(const struct ctx *c, int ns, sa_family_t af, ASSERT(!c->no_udp); - if (af == AF_UNSPEC && c->ifi4 && c->ifi6 && !ns) { + if (!addr && c->ifi4 && c->ifi6 && !ns) { int s; - ASSERT(!addr); - /* Attempt to get a dual stack socket */ s = pif_sock_l4(c, EPOLL_TYPE_UDP_LISTEN, PIF_HOST, NULL, ifname, port, uref.u32); @@ -817,15 +814,11 @@ int udp_sock_init(const struct ctx *c, int ns, sa_family_t af, return 0; } - if ((af == AF_INET || af == AF_UNSPEC) && c->ifi4) { + if ((!addr || inany_v4(addr)) && c->ifi4) { if (!ns) { - union inany_addr aany = inany_any4; - - if (addr) - inany_from_af(&aany, AF_INET, addr); - r4 = pif_sock_l4(c, EPOLL_TYPE_UDP_LISTEN, PIF_HOST, - &aany, ifname, port, uref.u32); + addr ? addr : &inany_any4, ifname, + port, uref.u32); udp_splice_init[V4][port] = r4 < 0 ? -1 : r4; } else { @@ -836,15 +829,11 @@ int udp_sock_init(const struct ctx *c, int ns, sa_family_t af, } } - if ((af == AF_INET6 || af == AF_UNSPEC) && c->ifi6) { + if ((!addr || !inany_v4(addr)) && c->ifi6) { if (!ns) { - union inany_addr aany = inany_any6; - - if (addr) - inany_from_af(&aany, AF_INET6, addr); - r6 = pif_sock_l4(c, EPOLL_TYPE_UDP_LISTEN, PIF_HOST, - &aany, ifname, port, uref.u32); + addr ? addr : &inany_any6, ifname, + port, uref.u32); udp_splice_init[V6][port] = r6 < 0 ? 
-1 : r6; } else { @@ -918,7 +907,7 @@ static void udp_port_rebind(struct ctx *c, bool outbound) if ((c->ifi4 && socks[V4][port] == -1) || (c->ifi6 && socks[V6][port] == -1)) - udp_sock_init(c, outbound, AF_UNSPEC, NULL, NULL, port); + udp_sock_init(c, outbound, NULL, NULL, port); } } diff --git a/udp.h b/udp.h index a8e76bf..de2df6d 100644 --- a/udp.h +++ b/udp.h @@ -16,8 +16,8 @@ void udp_reply_sock_handler(const struct ctx *c, union epoll_ref ref, int udp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af, const void *saddr, const void *daddr, const struct pool *p, int idx, const struct timespec *now); -int udp_sock_init(const struct ctx *c, int ns, sa_family_t af, - const void *addr, const char *ifname, in_port_t port); +int udp_sock_init(const struct ctx *c, int ns, const union inany_addr *addr, + const char *ifname, in_port_t port); int udp_init(struct ctx *c); void udp_timer(struct ctx *c, const struct timespec *now); void udp_update_l2_buf(const unsigned char *eth_d, const unsigned char *eth_s); From b55013b1a7e7dd7e4e90455703d272b9ffc28b64 Mon Sep 17 00:00:00 2001 From: David Gibson <david@gibson.dropbear.id.au> Date: Fri, 20 Sep 2024 14:12:44 +1000 Subject: [PATCH 039/382] inany: Add inany_pton() helper We already have an inany_ntop() function to format inany addresses into text. Add inany_pton() to parse them from text, and use it in conf_ports(). 
Signed-off-by: David Gibson <david@gibson.dropbear.id.au> --- conf.c | 9 +-------- inany.c | 20 ++++++++++++++++++++ inany.h | 1 + 3 files changed, 22 insertions(+), 8 deletions(-) diff --git a/conf.c b/conf.c index 9f1cd83..6e62510 100644 --- a/conf.c +++ b/conf.c @@ -215,9 +215,6 @@ static void conf_ports(const struct ctx *c, char optname, const char *optarg, if (ifname == buf + 1) { /* Interface without address */ addr = NULL; } else { - struct in6_addr a6; - struct in_addr a4; - p = buf; /* Allow square brackets for IPv4 too for convenience */ @@ -226,11 +223,7 @@ static void conf_ports(const struct ctx *c, char optname, const char *optarg, p++; } - if (inet_pton(AF_INET, p, &a4)) - inany_from_af(addr, AF_INET, &a4); - else if (inet_pton(AF_INET6, p, &a6)) - inany_from_af(addr, AF_INET6, &a6); - else + if (!inany_pton(p, addr)) goto bad; } } else { diff --git a/inany.c b/inany.c index 5e391dc..f5483bf 100644 --- a/inany.c +++ b/inany.c @@ -36,3 +36,23 @@ const char *inany_ntop(const union inany_addr *src, char *dst, socklen_t size) return inet_ntop(AF_INET6, &src->a6, dst, size); } + +/** inany_pton - Parse an IPv[46] address from text format + * @src: IPv[46] address + * @dst: output buffer, filled with parsed address + * + * Return: On success, 1, if no parseable address is found, 0 + */ +int inany_pton(const char *src, union inany_addr *dst) +{ + if (inet_pton(AF_INET, src, &dst->v4mapped.a4)) { + memset(&dst->v4mapped.zero, 0, sizeof(dst->v4mapped.zero)); + memset(&dst->v4mapped.one, 0xff, sizeof(dst->v4mapped.one)); + return 1; + } + + if (inet_pton(AF_INET6, src, &dst->a6)) + return 1; + + return 0; +} diff --git a/inany.h b/inany.h index d2893ce..6a12c29 100644 --- a/inany.h +++ b/inany.h @@ -270,5 +270,6 @@ static inline void inany_siphash_feed(struct siphash_state *state, #define INANY_ADDRSTRLEN MAX(INET_ADDRSTRLEN, INET6_ADDRSTRLEN) const char *inany_ntop(const union inany_addr *src, char *dst, socklen_t size); +int inany_pton(const char *src, union 
inany_addr *dst); #endif /* INANY_H */ From def8acdcd846582df5939446be0d73d50971ab18 Mon Sep 17 00:00:00 2001 From: Stefano Brivio <sbrivio@redhat.com> Date: Fri, 27 Sep 2024 18:43:16 +0200 Subject: [PATCH 040/382] test: Kernel binary can now be passed via the KERNEL environment variable This is quite useful at least for myself as I'm usually running tests using a guest kernel that's not the same as the one on the host. Signed-off-by: Stefano Brivio <sbrivio@redhat.com> Reviewed-by: David Gibson <david@gibson.dropbear.id.au> --- test/lib/setup | 8 ++++---- test/lib/term | 2 +- test/run | 3 +++ 3 files changed, 8 insertions(+), 5 deletions(-) diff --git a/test/lib/setup b/test/lib/setup index d764138..5338393 100755 --- a/test/lib/setup +++ b/test/lib/setup @@ -58,7 +58,7 @@ setup_passt() { context_run_bg qemu 'qemu-system-'"${QEMU_ARCH}" \ ' -machine accel=kvm' \ ' -m '${VMEM}' -cpu host -smp '${VCPUS} \ - ' -kernel ' "/boot/vmlinuz-$(uname -r)" \ + ' -kernel '"${KERNEL}" \ ' -initrd '${INITRAMFS}' -nographic -serial stdio' \ ' -nodefaults' \ ' -append "console=ttyS0 mitigations=off apparmor=0" ' \ @@ -159,7 +159,7 @@ setup_passt_in_ns() { ' -machine accel=kvm' \ ' -M accel=kvm:tcg' \ ' -m '${VMEM}' -cpu host -smp '${VCPUS} \ - ' -kernel ' "/boot/vmlinuz-$(uname -r)" \ + ' -kernel '"${KERNEL}" \ ' -initrd '${INITRAMFS}' -nographic -serial stdio' \ ' -nodefaults' \ ' -append "console=ttyS0 mitigations=off apparmor=0" ' \ @@ -230,7 +230,7 @@ setup_two_guests() { context_run_bg qemu_1 'qemu-system-'"${QEMU_ARCH}" \ ' -M accel=kvm:tcg' \ ' -m '${VMEM}' -cpu host -smp '${VCPUS} \ - ' -kernel ' "/boot/vmlinuz-$(uname -r)" \ + ' -kernel '"${KERNEL}" \ ' -initrd '${INITRAMFS}' -nographic -serial stdio' \ ' -nodefaults' \ ' -append "console=ttyS0 mitigations=off apparmor=0" ' \ @@ -243,7 +243,7 @@ setup_two_guests() { context_run_bg qemu_2 'qemu-system-'"${QEMU_ARCH}" \ ' -M accel=kvm:tcg' \ ' -m '${VMEM}' -cpu host -smp '${VCPUS} \ - ' -kernel ' "/boot/vmlinuz-$(uname
-r)" \ + ' -kernel '"${KERNEL}" \ ' -initrd '${INITRAMFS}' -nographic -serial stdio' \ ' -nodefaults' \ ' -append "console=ttyS0 mitigations=off apparmor=0" ' \ diff --git a/test/lib/term b/test/lib/term index 3834092..0fa0936 100755 --- a/test/lib/term +++ b/test/lib/term @@ -664,7 +664,7 @@ pause_continue() { # run_term() - Start tmux session, running entry point, with recording if needed run_term() { - TMUX="tmux new-session -s passt_test -eSTATEBASE=$STATEBASE -ePCAP=$PCAP -eDEBUG=$DEBUG" + TMUX="tmux new-session -s passt_test -eSTATEBASE=$STATEBASE -ePCAP=$PCAP -eDEBUG=$DEBUG -eKERNEL=$KERNEL" if [ ${CI} -eq 1 ]; then printf '\e[8;50;240t' diff --git a/test/run b/test/run index cd6d707..547a729 100755 --- a/test/run +++ b/test/run @@ -38,6 +38,9 @@ TRACE=${TRACE:-0} # If set, tell passt and pasta to take packet captures PCAP=${PCAP:-0} +# Custom kernel to boot guests with, if given +KERNEL=${KERNEL:-"/boot/vmlinuz-$(uname -r)"} + COMMIT="$(git log --oneline --no-decorate -1)" . lib/util From 72e7d3024b037afe2cb00c772eea0807286633bd Mon Sep 17 00:00:00 2001 From: Laurent Vivier <lvivier@redhat.com> Date: Thu, 3 Oct 2024 16:51:04 +0200 Subject: [PATCH 041/382] tcp: Use tcp_payload_t rather than tcphdr As tcp_update_check_tcp4() and tcp_update_check_tcp6() compute the checksum using the TCP header and the TCP payload, it is clearer to use a pointer to tcp_payload_t that includes tcphdr and payload rather than a pointer to tcphdr (and guessing TCP header is followed by the payload). Move tcp_payload_t and tcp_flags_t to tcp_internal.h. (They will be used also by vhost-user). 
Signed-off-by: Laurent Vivier <lvivier@redhat.com> Reviewed-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- tcp.c | 42 ++++++++++++++++++++++-------------------- tcp_buf.c | 29 ----------------------------- tcp_internal.h | 29 +++++++++++++++++++++++++++++ 3 files changed, 51 insertions(+), 49 deletions(-) diff --git a/tcp.c b/tcp.c index 6ca3700..0590153 100644 --- a/tcp.c +++ b/tcp.c @@ -757,32 +757,34 @@ static void tcp_sock_set_bufsize(const struct ctx *c, int s) /** * tcp_update_check_tcp4() - Update TCP checksum from stored one * @iph: IPv4 header - * @th: TCP header followed by TCP payload + * @bp: TCP header followed by TCP payload */ -static void tcp_update_check_tcp4(const struct iphdr *iph, struct tcphdr *th) +static void tcp_update_check_tcp4(const struct iphdr *iph, + struct tcp_payload_t *bp) { uint16_t l4len = ntohs(iph->tot_len) - sizeof(struct iphdr); struct in_addr saddr = { .s_addr = iph->saddr }; struct in_addr daddr = { .s_addr = iph->daddr }; uint32_t sum = proto_ipv4_header_psum(l4len, IPPROTO_TCP, saddr, daddr); - th->check = 0; - th->check = csum(th, l4len, sum); + bp->th.check = 0; + bp->th.check = csum(bp, l4len, sum); } /** * tcp_update_check_tcp6() - Calculate TCP checksum for IPv6 * @ip6h: IPv6 header - * @th: TCP header followed by TCP payload + * @bp: TCP header followed by TCP payload */ -static void tcp_update_check_tcp6(struct ipv6hdr *ip6h, struct tcphdr *th) +static void tcp_update_check_tcp6(const struct ipv6hdr *ip6h, + struct tcp_payload_t *bp) { uint16_t l4len = ntohs(ip6h->payload_len); uint32_t sum = proto_ipv6_header_psum(l4len, IPPROTO_TCP, &ip6h->saddr, &ip6h->daddr); - th->check = 0; - th->check = csum(th, l4len, sum); + bp->th.check = 0; + bp->th.check = csum(bp, l4len, sum); } /** @@ -902,7 +904,7 @@ static void tcp_fill_header(struct tcphdr *th, * @conn: Connection pointer * @taph: tap backend specific header * @iph: Pointer to IPv4 header - * @th: Pointer to TCP 
header + * @bp: Pointer to TCP header followed by TCP payload * @dlen: TCP payload length * @check: Checksum, if already known * @seq: Sequence number for this segment @@ -912,14 +914,14 @@ static void tcp_fill_header(struct tcphdr *th, */ static size_t tcp_fill_headers4(const struct tcp_tap_conn *conn, struct tap_hdr *taph, - struct iphdr *iph, struct tcphdr *th, + struct iphdr *iph, struct tcp_payload_t *bp, size_t dlen, const uint16_t *check, uint32_t seq, bool no_tcp_csum) { const struct flowside *tapside = TAPFLOW(conn); const struct in_addr *src4 = inany_v4(&tapside->oaddr); const struct in_addr *dst4 = inany_v4(&tapside->eaddr); - size_t l4len = dlen + sizeof(*th); + size_t l4len = dlen + sizeof(bp->th); size_t l3len = l4len + sizeof(*iph); ASSERT(src4 && dst4); @@ -931,12 +933,12 @@ static size_t tcp_fill_headers4(const struct tcp_tap_conn *conn, iph->check = check ? *check : csum_ip4_header(l3len, IPPROTO_TCP, *src4, *dst4); - tcp_fill_header(th, conn, seq); + tcp_fill_header(&bp->th, conn, seq); if (no_tcp_csum) - th->check = 0; + bp->th.check = 0; else - tcp_update_check_tcp4(iph, th); + tcp_update_check_tcp4(iph, bp); tap_hdr_update(taph, l3len + sizeof(struct ethhdr)); @@ -948,7 +950,7 @@ static size_t tcp_fill_headers4(const struct tcp_tap_conn *conn, * @conn: Connection pointer * @taph: tap backend specific header * @ip6h: Pointer to IPv6 header - * @th: Pointer to TCP header + * @bp: Pointer to TCP header followed by TCP payload * @dlen: TCP payload length * @check: Checksum, if already known * @seq: Sequence number for this segment @@ -958,11 +960,11 @@ static size_t tcp_fill_headers4(const struct tcp_tap_conn *conn, */ static size_t tcp_fill_headers6(const struct tcp_tap_conn *conn, struct tap_hdr *taph, - struct ipv6hdr *ip6h, struct tcphdr *th, + struct ipv6hdr *ip6h, struct tcp_payload_t *bp, size_t dlen, uint32_t seq, bool no_tcp_csum) { const struct flowside *tapside = TAPFLOW(conn); - size_t l4len = dlen + sizeof(*th); + size_t l4len = dlen 
+ sizeof(bp->th); ip6h->payload_len = htons(l4len); ip6h->saddr = tapside->oaddr.a6; @@ -976,12 +978,12 @@ static size_t tcp_fill_headers6(const struct tcp_tap_conn *conn, ip6h->flow_lbl[1] = (conn->sock >> 8) & 0xff; ip6h->flow_lbl[2] = (conn->sock >> 0) & 0xff; - tcp_fill_header(th, conn, seq); + tcp_fill_header(&bp->th, conn, seq); if (no_tcp_csum) - th->check = 0; + bp->th.check = 0; else - tcp_update_check_tcp6(ip6h, th); + tcp_update_check_tcp6(ip6h, bp); tap_hdr_update(taph, l4len + sizeof(*ip6h) + sizeof(struct ethhdr)); diff --git a/tcp_buf.c b/tcp_buf.c index ffbff5e..238827b 100644 --- a/tcp_buf.c +++ b/tcp_buf.c @@ -38,35 +38,6 @@ (c->mode == MODE_PASTA ? 1 : TCP_FRAMES_MEM) /* Static buffers */ -/** - * struct tcp_payload_t - TCP header and data to send segments with payload - * @th: TCP header - * @data: TCP data - */ -struct tcp_payload_t { - struct tcphdr th; - uint8_t data[IP_MAX_MTU - sizeof(struct tcphdr)]; -#ifdef __AVX2__ -} __attribute__ ((packed, aligned(32))); /* For AVX2 checksum routines */ -#else -} __attribute__ ((packed, aligned(__alignof__(unsigned int)))); -#endif - -/** - * struct tcp_flags_t - TCP header and data to send zero-length - * segments (flags) - * @th: TCP header - * @opts TCP options - */ -struct tcp_flags_t { - struct tcphdr th; - char opts[OPT_MSS_LEN + OPT_WS_LEN + 1]; -#ifdef __AVX2__ -} __attribute__ ((packed, aligned(32))); -#else -} __attribute__ ((packed, aligned(__alignof__(unsigned int)))); -#endif - /* Ethernet header for IPv4 frames */ static struct ethhdr tcp4_eth_src; diff --git a/tcp_internal.h b/tcp_internal.h index de06db1..2f74ffe 100644 --- a/tcp_internal.h +++ b/tcp_internal.h @@ -63,6 +63,35 @@ enum tcp_iov_parts { TCP_NUM_IOVS }; +/** + * struct tcp_payload_t - TCP header and data to send segments with payload + * @th: TCP header + * @data: TCP data + */ +struct tcp_payload_t { + struct tcphdr th; + uint8_t data[IP_MAX_MTU - sizeof(struct tcphdr)]; +#ifdef __AVX2__ +} __attribute__ ((packed, 
aligned(32))); /* For AVX2 checksum routines */ +#else +} __attribute__ ((packed, aligned(__alignof__(unsigned int)))); +#endif + +/** + * struct tcp_flags_t - TCP header and data to send zero-length + * segments (flags) + * @th: TCP header + * @opts TCP options + */ +struct tcp_flags_t { + struct tcphdr th; + char opts[OPT_MSS_LEN + OPT_WS_LEN + 1]; +#ifdef __AVX2__ +} __attribute__ ((packed, aligned(32))); +#else +} __attribute__ ((packed, aligned(__alignof__(unsigned int)))); +#endif + extern char tcp_buf_discard [MAX_WINDOW]; void conn_flag_do(const struct ctx *c, struct tcp_tap_conn *conn, From fd8334b25dfa0cf4a93bb7fad6728f3bd0e31c6d Mon Sep 17 00:00:00 2001 From: Laurent Vivier <lvivier@redhat.com> Date: Thu, 3 Oct 2024 16:51:05 +0200 Subject: [PATCH 042/382] pcap: Add an offset argument in pcap_iov() The offset is passed directly to pcap_frame() and allows any headers that are not part of the frame to capture to be skipped. Signed-off-by: Laurent Vivier <lvivier@redhat.com> Reviewed-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- pcap.c | 5 +++-- pcap.h | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/pcap.c b/pcap.c index e6b5ced..6ee6cdf 100644 --- a/pcap.c +++ b/pcap.c @@ -138,9 +138,10 @@ void pcap_multiple(const struct iovec *iov, size_t frame_parts, unsigned int n, * @iov: Pointer to the array of struct iovec describing the I/O vector * containing packet data to write, including L2 header * @iovcnt: Number of buffers (@iov entries) + * @offset: Offset of the L2 frame within the full data length */ /* cppcheck-suppress unusedFunction */ -void pcap_iov(const struct iovec *iov, size_t iovcnt) +void pcap_iov(const struct iovec *iov, size_t iovcnt, size_t offset) { struct timespec now; @@ -148,7 +149,7 @@ void pcap_iov(const struct iovec *iov, size_t iovcnt) return; clock_gettime(CLOCK_REALTIME, &now); - pcap_frame(iov, iovcnt, 0, &now); + pcap_frame(iov, iovcnt, offset, &now); } /** 
diff --git a/pcap.h b/pcap.h index 5339237..9795f2e 100644 --- a/pcap.h +++ b/pcap.h @@ -9,7 +9,7 @@ void pcap(const char *pkt, size_t l2len); void pcap_multiple(const struct iovec *iov, size_t frame_parts, unsigned int n, size_t offset); -void pcap_iov(const struct iovec *iov, size_t iovcnt); +void pcap_iov(const struct iovec *iov, size_t iovcnt, size_t offset); void pcap_init(struct ctx *c); #endif /* PCAP_H */ From e6548c643796f036de83163e395f0efd56da4790 Mon Sep 17 00:00:00 2001 From: Laurent Vivier <lvivier@redhat.com> Date: Thu, 3 Oct 2024 16:51:06 +0200 Subject: [PATCH 043/382] checksum: Add an offset argument in csum_iov() The offset allows any headers that are not part of the data to checksum to be skipped. Signed-off-by: Laurent Vivier <lvivier@redhat.com> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- checksum.c | 16 ++++++++++++++-- checksum.h | 3 ++- 2 files changed, 16 insertions(+), 3 deletions(-) diff --git a/checksum.c b/checksum.c index 006614f..05d002a 100644 --- a/checksum.c +++ b/checksum.c @@ -59,6 +59,7 @@ #include "util.h" #include "ip.h" #include "checksum.h" +#include "iov.h" /* Checksums are optional for UDP over IPv4, so we usually just set * them to 0. 
Change this to 1 to calculate real UDP over IPv4 @@ -497,16 +498,27 @@ uint16_t csum(const void *buf, size_t len, uint32_t init) * * @iov Pointer to the array of IO vectors * @n Length of the array + * @offset: Offset of the data to checksum within the full data length * @init Initial 32-bit checksum, 0 for no pre-computed checksum * * Return: 16-bit folded, complemented checksum */ /* cppcheck-suppress unusedFunction */ -uint16_t csum_iov(const struct iovec *iov, size_t n, uint32_t init) +uint16_t csum_iov(const struct iovec *iov, size_t n, size_t offset, + uint32_t init) { unsigned int i; + size_t first; - for (i = 0; i < n; i++) + i = iov_skip_bytes(iov, n, offset, &first); + if (i >= n) + return (uint16_t)~csum_fold(init); + + init = csum_unfolded((char *)iov[i].iov_base + first, + iov[i].iov_len - first, init); + i++; + + for (; i < n; i++) init = csum_unfolded(iov[i].iov_base, iov[i].iov_len, init); return (uint16_t)~csum_fold(init); diff --git a/checksum.h b/checksum.h index c5964ac..49f7472 100644 --- a/checksum.h +++ b/checksum.h @@ -32,6 +32,7 @@ void csum_icmp6(struct icmp6hdr *icmp6hr, const void *payload, size_t dlen); uint32_t csum_unfolded(const void *buf, size_t len, uint32_t init); uint16_t csum(const void *buf, size_t len, uint32_t init); -uint16_t csum_iov(const struct iovec *iov, size_t n, uint32_t init); +uint16_t csum_iov(const struct iovec *iov, size_t n, size_t offset, + uint32_t init); #endif /* CHECKSUM_H */ From 3d484aa370902873bd42a434fa856b9ee3eac228 Mon Sep 17 00:00:00 2001 From: Laurent Vivier <lvivier@redhat.com> Date: Thu, 3 Oct 2024 16:51:07 +0200 Subject: [PATCH 044/382] tcp: Update TCP checksum using an iovec array TCP header and payload are supposed to be in the same buffer, and tcp_update_check_tcp4()/tcp_update_check_tcp6() compute the checksum from the base address of the header using the length of the IP payload. In the future (for vhost-user) we need to dispatch the TCP header and the TCP payload through several buffers. 
To be able to manage that, we provide an iovec array that points to the data of the TCP frame. We provide also an offset to be able to provide an array that contains the TCP frame embedded in an lower level frame, and this offset points to the TCP header inside the iovec array. Signed-off-by: Laurent Vivier <lvivier@redhat.com> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- checksum.c | 1 - tcp.c | 118 +++++++++++++++++++++++++++++++++++++++++++++-------- 2 files changed, 100 insertions(+), 19 deletions(-) diff --git a/checksum.c b/checksum.c index 05d002a..cf85019 100644 --- a/checksum.c +++ b/checksum.c @@ -503,7 +503,6 @@ uint16_t csum(const void *buf, size_t len, uint32_t init) * * Return: 16-bit folded, complemented checksum */ -/* cppcheck-suppress unusedFunction */ uint16_t csum_iov(const struct iovec *iov, size_t n, size_t offset, uint32_t init) { diff --git a/tcp.c b/tcp.c index 0590153..9617b7a 100644 --- a/tcp.c +++ b/tcp.c @@ -755,36 +755,106 @@ static void tcp_sock_set_bufsize(const struct ctx *c, int s) } /** - * tcp_update_check_tcp4() - Update TCP checksum from stored one + * tcp_update_check_tcp4() - Calculate TCP checksum for IPv4 * @iph: IPv4 header - * @bp: TCP header followed by TCP payload + * @iov: Pointer to the array of IO vectors + * @iov_cnt: Length of the array + * @l4offset: IPv4 payload offset in the iovec array */ static void tcp_update_check_tcp4(const struct iphdr *iph, - struct tcp_payload_t *bp) + const struct iovec *iov, int iov_cnt, + size_t l4offset) { uint16_t l4len = ntohs(iph->tot_len) - sizeof(struct iphdr); struct in_addr saddr = { .s_addr = iph->saddr }; struct in_addr daddr = { .s_addr = iph->daddr }; - uint32_t sum = proto_ipv4_header_psum(l4len, IPPROTO_TCP, saddr, daddr); + size_t check_ofs; + __sum16 *check; + int check_idx; + uint32_t sum; + char *ptr; - bp->th.check = 0; - bp->th.check = csum(bp, l4len, sum); + sum = proto_ipv4_header_psum(l4len, IPPROTO_TCP, saddr, daddr); + + check_idx = 
iov_skip_bytes(iov, iov_cnt, + l4offset + offsetof(struct tcphdr, check), + &check_ofs); + + if (check_idx >= iov_cnt) { + err("TCP4 buffer is too small, iov size %zd, check offset %zd", + iov_size(iov, iov_cnt), + l4offset + offsetof(struct tcphdr, check)); + return; + } + + if (check_ofs + sizeof(*check) > iov[check_idx].iov_len) { + err("TCP4 checksum field memory is not contiguous " + "check_ofs %zd check_idx %d iov_len %zd", + check_ofs, check_idx, iov[check_idx].iov_len); + return; + } + + ptr = (char *)iov[check_idx].iov_base + check_ofs; + if ((uintptr_t)ptr & (__alignof__(*check) - 1)) { + err("TCP4 checksum field is not correctly aligned in memory"); + return; + } + + check = (__sum16 *)ptr; + + *check = 0; + *check = csum_iov(iov, iov_cnt, l4offset, sum); } /** * tcp_update_check_tcp6() - Calculate TCP checksum for IPv6 * @ip6h: IPv6 header - * @bp: TCP header followed by TCP payload + * @iov: Pointer to the array of IO vectors + * @iov_cnt: Length of the array + * @l4offset: IPv6 payload offset in the iovec array */ static void tcp_update_check_tcp6(const struct ipv6hdr *ip6h, - struct tcp_payload_t *bp) + const struct iovec *iov, int iov_cnt, + size_t l4offset) { uint16_t l4len = ntohs(ip6h->payload_len); - uint32_t sum = proto_ipv6_header_psum(l4len, IPPROTO_TCP, - &ip6h->saddr, &ip6h->daddr); + size_t check_ofs; + __sum16 *check; + int check_idx; + uint32_t sum; + char *ptr; - bp->th.check = 0; - bp->th.check = csum(bp, l4len, sum); + sum = proto_ipv6_header_psum(l4len, IPPROTO_TCP, &ip6h->saddr, + &ip6h->daddr); + + check_idx = iov_skip_bytes(iov, iov_cnt, + l4offset + offsetof(struct tcphdr, check), + &check_ofs); + + if (check_idx >= iov_cnt) { + err("TCP6 buffer is too small, iov size %zd, check offset %zd", + iov_size(iov, iov_cnt), + l4offset + offsetof(struct tcphdr, check)); + return; + } + + if (check_ofs + sizeof(*check) > iov[check_idx].iov_len) { + err("TCP6 checksum field memory is not contiguous " + "check_ofs %zd check_idx %d iov_len 
%zd", + check_ofs, check_idx, iov[check_idx].iov_len); + return; + } + + ptr = (char *)iov[check_idx].iov_base + check_ofs; + if ((uintptr_t)ptr & (__alignof__(*check) - 1)) { + err("TCP6 checksum field is not correctly aligned in memory"); + return; + } + + check = (__sum16 *)ptr; + + *check = 0; + *check = csum_iov(iov, iov_cnt, l4offset, sum); } /** @@ -935,10 +1005,16 @@ static size_t tcp_fill_headers4(const struct tcp_tap_conn *conn, tcp_fill_header(&bp->th, conn, seq); - if (no_tcp_csum) + if (no_tcp_csum) { bp->th.check = 0; - else - tcp_update_check_tcp4(iph, bp); + } else { + const struct iovec iov = { + .iov_base = bp, + .iov_len = ntohs(iph->tot_len) - sizeof(struct iphdr), + }; + + tcp_update_check_tcp4(iph, &iov, 1, 0); + } tap_hdr_update(taph, l3len + sizeof(struct ethhdr)); @@ -980,10 +1056,16 @@ static size_t tcp_fill_headers6(const struct tcp_tap_conn *conn, tcp_fill_header(&bp->th, conn, seq); - if (no_tcp_csum) + if (no_tcp_csum) { bp->th.check = 0; - else - tcp_update_check_tcp6(ip6h, bp); + } else { + const struct iovec iov = { + .iov_base = bp, + .iov_len = ntohs(ip6h->payload_len) + }; + + tcp_update_check_tcp6(ip6h, &iov, 1, 0); + } tap_hdr_update(taph, l4len + sizeof(*ip6h) + sizeof(struct ethhdr)); From 151dbe0d3d3690978a0a5cf3b8fa9808bd708668 Mon Sep 17 00:00:00 2001 From: Laurent Vivier <lvivier@redhat.com> Date: Thu, 3 Oct 2024 16:51:08 +0200 Subject: [PATCH 045/382] udp: Update UDP checksum using an iovec array As for tcp_update_check_tcp4()/tcp_update_check_tcp6(), change csum_udp4() and csum_udp6() to use an iovec array. 
Signed-off-by: Laurent Vivier <lvivier@redhat.com> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- checksum.c | 29 ++++++++++++++++++----------- checksum.h | 4 ++-- tap.c | 14 +++++++++++--- tap.h | 2 +- udp.c | 17 +++++++++++++---- 5 files changed, 45 insertions(+), 21 deletions(-) diff --git a/checksum.c b/checksum.c index cf85019..c673993 100644 --- a/checksum.c +++ b/checksum.c @@ -166,22 +166,24 @@ uint32_t proto_ipv4_header_psum(uint16_t l4len, uint8_t protocol, * @udp4hr: UDP header, initialised apart from checksum * @saddr: IPv4 source address * @daddr: IPv4 destination address - * @payload: UDP packet payload - * @dlen: Length of @payload (not including UDP header) + * @iov: Pointer to the array of IO vectors + * @iov_cnt: Length of the array + * @offset: UDP payload offset in the iovec array */ void csum_udp4(struct udphdr *udp4hr, struct in_addr saddr, struct in_addr daddr, - const void *payload, size_t dlen) + const struct iovec *iov, int iov_cnt, size_t offset) { /* UDP checksums are optional, so don't bother */ udp4hr->check = 0; if (UDP4_REAL_CHECKSUMS) { - uint16_t l4len = dlen + sizeof(struct udphdr); + uint16_t l4len = iov_size(iov, iov_cnt) - offset + + sizeof(struct udphdr); uint32_t psum = proto_ipv4_header_psum(l4len, IPPROTO_UDP, saddr, daddr); psum = csum_unfolded(udp4hr, sizeof(struct udphdr), psum); - udp4hr->check = csum(payload, dlen, psum); + udp4hr->check = csum_iov(iov, iov_cnt, offset, psum); } } @@ -227,19 +229,24 @@ uint32_t proto_ipv6_header_psum(uint16_t payload_len, uint8_t protocol, /** * csum_udp6() - Calculate and set checksum for a UDP over IPv6 packet * @udp6hr: UDP header, initialised apart from checksum - * @payload: UDP packet payload - * @dlen: Length of @payload (not including UDP header) + * @saddr: Source address + * @daddr: Destination address + * @iov: Pointer to the array of IO vectors + * @iov_cnt: Length of the array + * @offset: UDP payload offset in the iovec array */ void csum_udp6(struct udphdr 
*udp6hr, const struct in6_addr *saddr, const struct in6_addr *daddr, - const void *payload, size_t dlen) + const struct iovec *iov, int iov_cnt, size_t offset) { - uint32_t psum = proto_ipv6_header_psum(dlen + sizeof(struct udphdr), - IPPROTO_UDP, saddr, daddr); + uint16_t l4len = iov_size(iov, iov_cnt) - offset + + sizeof(struct udphdr); + uint32_t psum = proto_ipv6_header_psum(l4len, IPPROTO_UDP, + saddr, daddr); udp6hr->check = 0; psum = csum_unfolded(udp6hr, sizeof(struct udphdr), psum); - udp6hr->check = csum(payload, dlen, psum); + udp6hr->check = csum_iov(iov, iov_cnt, offset, psum); } /** diff --git a/checksum.h b/checksum.h index 49f7472..31ba322 100644 --- a/checksum.h +++ b/checksum.h @@ -19,14 +19,14 @@ uint32_t proto_ipv4_header_psum(uint16_t l4len, uint8_t protocol, struct in_addr saddr, struct in_addr daddr); void csum_udp4(struct udphdr *udp4hr, struct in_addr saddr, struct in_addr daddr, - const void *payload, size_t dlen); + const struct iovec *iov, int iov_cnt, size_t offset); void csum_icmp4(struct icmphdr *icmp4hr, const void *payload, size_t dlen); uint32_t proto_ipv6_header_psum(uint16_t payload_len, uint8_t protocol, const struct in6_addr *saddr, const struct in6_addr *daddr); void csum_udp6(struct udphdr *udp6hr, const struct in6_addr *saddr, const struct in6_addr *daddr, - const void *payload, size_t dlen); + const struct iovec *iov, int iov_cnt, size_t offset); void csum_icmp6(struct icmp6hdr *icmp6hr, const struct in6_addr *saddr, const struct in6_addr *daddr, const void *payload, size_t dlen); diff --git a/tap.c b/tap.c index 41af6a6..c53a39b 100644 --- a/tap.c +++ b/tap.c @@ -172,11 +172,15 @@ void tap_udp4_send(const struct ctx *c, struct in_addr src, in_port_t sport, struct iphdr *ip4h = tap_push_l2h(c, buf, ETH_P_IP); struct udphdr *uh = tap_push_ip4h(ip4h, src, dst, l4len, IPPROTO_UDP); char *data = (char *)(uh + 1); + const struct iovec iov = { + .iov_base = (void *)in, + .iov_len = dlen + }; uh->source = htons(sport); uh->dest = 
htons(dport); uh->len = htons(l4len); - csum_udp4(uh, src, dst, in, dlen); + csum_udp4(uh, src, dst, &iov, 1, 0); memcpy(data, in, dlen); tap_send_single(c, buf, dlen + (data - buf)); @@ -247,7 +251,7 @@ static void *tap_push_ip6h(struct ipv6hdr *ip6h, void tap_udp6_send(const struct ctx *c, const struct in6_addr *src, in_port_t sport, const struct in6_addr *dst, in_port_t dport, - uint32_t flow, const void *in, size_t dlen) + uint32_t flow, void *in, size_t dlen) { size_t l4len = dlen + sizeof(struct udphdr); char buf[USHRT_MAX]; @@ -255,11 +259,15 @@ void tap_udp6_send(const struct ctx *c, struct udphdr *uh = tap_push_ip6h(ip6h, src, dst, l4len, IPPROTO_UDP, flow); char *data = (char *)(uh + 1); + const struct iovec iov = { + .iov_base = in, + .iov_len = dlen + }; uh->source = htons(sport); uh->dest = htons(dport); uh->len = htons(l4len); - csum_udp6(uh, src, dst, in, dlen); + csum_udp6(uh, src, dst, &iov, 1, 0); memcpy(data, in, dlen); tap_send_single(c, buf, dlen + (data - buf)); diff --git a/tap.h b/tap.h index ec9e2ac..85f1e84 100644 --- a/tap.h +++ b/tap.h @@ -53,7 +53,7 @@ const struct in6_addr *tap_ip6_daddr(const struct ctx *c, void tap_udp6_send(const struct ctx *c, const struct in6_addr *src, in_port_t sport, const struct in6_addr *dst, in_port_t dport, - uint32_t flow, const void *in, size_t dlen); + uint32_t flow, void *in, size_t dlen); void tap_icmp6_send(const struct ctx *c, const struct in6_addr *src, const struct in6_addr *dst, const void *in, size_t l4len); diff --git a/udp.c b/udp.c index 08faaec..100610f 100644 --- a/udp.c +++ b/udp.c @@ -321,10 +321,15 @@ static size_t udp_update_hdr4(struct iphdr *ip4h, struct udp_payload_t *bp, bp->uh.source = htons(toside->oport); bp->uh.dest = htons(toside->eport); bp->uh.len = htons(l4len); - if (no_udp_csum) + if (no_udp_csum) { bp->uh.check = 0; - else - csum_udp4(&bp->uh, *src, *dst, bp->data, dlen); + } else { + const struct iovec iov = { + .iov_base = bp->data, + .iov_len = dlen + }; + 
csum_udp4(&bp->uh, *src, *dst, &iov, 1, 0); + } return l4len; } @@ -363,8 +368,12 @@ static size_t udp_update_hdr6(struct ipv6hdr *ip6h, struct udp_payload_t *bp, */ bp->uh.check = 0xffff; } else { + const struct iovec iov = { + .iov_base = bp->data, + .iov_len = dlen + }; csum_udp6(&bp->uh, &toside->oaddr.a6, &toside->eaddr.a6, - bp->data, dlen); + &iov, 1, 0); } return l4len; From 9d66df9a9a45b9305a2daff8a3c09a28f2c78d83 Mon Sep 17 00:00:00 2001 From: David Gibson <david@gibson.dropbear.id.au> Date: Thu, 3 Oct 2024 14:48:32 +1000 Subject: [PATCH 046/382] conf: Add command line switch to enable IP_FREEBIND socket option In a couple of recent reports, we've seen that it can be useful for pasta to forward ports from addresses which are not currently configured on the host, but might be in future. That can be done with the sysctl net.ipv4.ip_nonlocal_bind, but that does require CAP_NET_ADMIN to set in the first place. We can allow the same thing on a per-socket basis with the IP_FREEBIND (or IPV6_FREEBIND) socket option. Add a --freebind command line argument to enable this socket option on all listening sockets. 
Link: https://bugs.passt.top/show_bug.cgi?id=101 Signed-off-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- conf.c | 2 ++ passt.1 | 10 ++++++++++ passt.h | 2 ++ util.c | 16 ++++++++++++++++ 4 files changed, 30 insertions(+) diff --git a/conf.c b/conf.c index 6e62510..e360fb9 100644 --- a/conf.c +++ b/conf.c @@ -836,6 +836,7 @@ static void usage(const char *name, FILE *f, int status) " --no-ndp Disable NDP responses\n" " --no-dhcpv6 Disable DHCPv6 server\n" " --no-ra Disable router advertisements\n" + " --freebind Bind to any address for forwarding\n" " --no-map-gw Don't map gateway address to host\n" " -4, --ipv4-only Enable IPv4 operation only\n" " -6, --ipv6-only Enable IPv6 operation only\n"); @@ -1255,6 +1256,7 @@ void conf(struct ctx *c, int argc, char **argv) {"no-dhcpv6", no_argument, &c->no_dhcpv6, 1 }, {"no-ndp", no_argument, &c->no_ndp, 1 }, {"no-ra", no_argument, &c->no_ra, 1 }, + {"freebind", no_argument, &c->freebind, 1 }, {"no-map-gw", no_argument, &no_map_gw, 1 }, {"ipv4-only", no_argument, NULL, '4' }, {"ipv6-only", no_argument, NULL, '6' }, diff --git a/passt.1 b/passt.1 index 79d134d..5ac2962 100644 --- a/passt.1 +++ b/passt.1 @@ -327,6 +327,16 @@ namespace will be silently dropped. Disable Router Advertisements. Router Solicitations coming from guest or target namespace will be ignored. +.TP +.BR \-\-freebind +Allow any binding address to be specified for \fB-t\fR and \fB-u\fR +options. Usually binding addresses must be addresses currently +configured on the host. With \fB\-\-freebind\fR, the +\fBIP_FREEBIND\fR or \fBIPV6_FREEBIND\fR socket option is enabled +allowing any address to be used. This is typically used to bind +addresses which might be configured on the host in future, at which +point the forwarding will immediately start operating. + .TP .BR \-\-map-host-loopback " " \fIaddr Translate \fIaddr\fR to refer to the host. 
Packets from the guest to diff --git a/passt.h b/passt.h index 031c9b6..4908ed9 100644 --- a/passt.h +++ b/passt.h @@ -225,6 +225,7 @@ struct ip6_ctx { * @no_dhcpv6: Disable DHCPv6 server * @no_ndp: Disable NDP handler altogether * @no_ra: Disable router advertisements + * @freebind: Allow binding of non-local addresses for forwarding * @low_wmem: Low probed net.core.wmem_max * @low_rmem: Low probed net.core.rmem_max */ @@ -284,6 +285,7 @@ struct ctx { int no_dhcpv6; int no_ndp; int no_ra; + int freebind; int low_wmem; int low_rmem; diff --git a/util.c b/util.c index ebd93ed..eba7d52 100644 --- a/util.c +++ b/util.c @@ -52,6 +52,7 @@ int sock_l4_sa(const struct ctx *c, enum epoll_type type, { sa_family_t af = ((const struct sockaddr *)sa)->sa_family; union epoll_ref ref = { .type = type, .data = data }; + bool freebind = false; struct epoll_event ev; int fd, y = 1, ret; uint8_t proto; @@ -61,8 +62,11 @@ int sock_l4_sa(const struct ctx *c, enum epoll_type type, case EPOLL_TYPE_TCP_LISTEN: proto = IPPROTO_TCP; socktype = SOCK_STREAM | SOCK_NONBLOCK; + freebind = c->freebind; break; case EPOLL_TYPE_UDP_LISTEN: + freebind = c->freebind; + /* fallthrough */ case EPOLL_TYPE_UDP_REPLY: proto = IPPROTO_UDP; socktype = SOCK_DGRAM | SOCK_NONBLOCK; @@ -127,6 +131,18 @@ int sock_l4_sa(const struct ctx *c, enum epoll_type type, } } + if (freebind) { + int level = af == AF_INET ? IPPROTO_IP : IPPROTO_IPV6; + int opt = af == AF_INET ? IP_FREEBIND : IPV6_FREEBIND; + + if (setsockopt(fd, level, opt, &y, sizeof(y))) { + err_perror("Failed to set %s on socket %i", + af == AF_INET ? 
"IP_FREEBIND" + : "IPV6_FREEBIND", + fd); + } + } + if (bind(fd, sa, sl) < 0) { /* We'll fail to bind to low ports if we don't have enough * capabilities, and we'll fail to bind on already bound ports, From ff63ac922a4017de8a5d384b1c0be36433436ed8 Mon Sep 17 00:00:00 2001 From: David Gibson <david@gibson.dropbear.id.au> Date: Thu, 3 Oct 2024 15:14:02 +1000 Subject: [PATCH 047/382] conf: Add --dns-host option to configure host side nameserver When redirecting DNS queries with the --dns-forward option, passt/pasta needs a host side nameserver to redirect the queries to. This is controlled by the c->ip[46].dns_host variables. This is set to the first first nameserver listed in the host's /etc/resolv.conf, and there isn't currently a way to override it from the command line. Prior to 0b25cac9 ("conf: Treat --dns addresses as guest visible addresses") it was possible to alter this with the -D/--dns option. However, doing so was confusing and had some nonsensical edge cases because -D generally takes guest side addresses, rather than host side addresses. Add a new --dns-host option to restore this functionality in a more sensible way. 
Link: https://bugs.passt.top/show_bug.cgi?id=102 Signed-off-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- conf.c | 16 ++++++++++++++++ passt.1 | 17 +++++++++++++---- 2 files changed, 29 insertions(+), 4 deletions(-) diff --git a/conf.c b/conf.c index e360fb9..c631019 100644 --- a/conf.c +++ b/conf.c @@ -829,6 +829,9 @@ static void usage(const char *name, FILE *f, int status) " --dns-forward ADDR Forward DNS queries sent to ADDR\n" " can be specified zero to two times (for IPv4 and IPv6)\n" " default: don't forward DNS queries\n" + " --dns-host ADDR Host nameserver to direct queries to\n" + " can be specified zero to two times (for IPv4 and IPv6)\n" + " default: first nameserver from host's /etc/resolv.conf\n" " --no-tcp Disable TCP protocol handler\n" " --no-udp Disable UDP protocol handler\n" " --no-icmp Disable ICMP/ICMPv6 protocol handler\n" @@ -1286,6 +1289,7 @@ void conf(struct ctx *c, int argc, char **argv) {"netns-only", no_argument, NULL, 20 }, {"map-host-loopback", required_argument, NULL, 21 }, {"map-guest-addr", required_argument, NULL, 22 }, + {"dns-host", required_argument, NULL, 24 }, { 0 }, }; const char *logname = (c->mode == MODE_PASTA) ? "pasta" : "passt"; @@ -1463,6 +1467,18 @@ void conf(struct ctx *c, int argc, char **argv) conf_nat(optarg, &c->ip4.map_guest_addr, &c->ip6.map_guest_addr, NULL); break; + case 24: + if (inet_pton(AF_INET6, optarg, &c->ip6.dns_host) && + !IN6_IS_ADDR_UNSPECIFIED(&c->ip6.dns_host)) + break; + + if (inet_pton(AF_INET, optarg, &c->ip4.dns_host) && + !IN4_IS_ADDR_UNSPECIFIED(&c->ip4.dns_host) && + !IN4_IS_ADDR_BROADCAST(&c->ip4.dns_host)) + break; + + die("Invalid host nameserver address: %s", optarg); + break; case 'd': c->debug = 1; c->quiet = 0; diff --git a/passt.1 b/passt.1 index 5ac2962..ef33267 100644 --- a/passt.1 +++ b/passt.1 @@ -249,10 +249,19 @@ the host. 
.TP .BR \-\-dns-forward " " \fIaddr Map \fIaddr\fR (IPv4 or IPv6) as seen from guest or namespace to the -first configured DNS resolver (with corresponding IP version). Maps -only UDP and TCP traffic to port 53 or port 853. Replies are -translated back with a reverse mapping. This option can be specified -zero to two times (once for IPv4, once for IPv6). +nameserver (with corresponding IP version) specified by the +\fB\-\-dns-host\fR option. Maps only UDP and TCP traffic to port 53 or +port 853. Replies are translated back with a reverse mapping. This +option can be specified zero to two times (once for IPv4, once for +IPv6). + +.TP +.BR \-\-dns-host " " \fIaddr +Configure the host nameserver which guest or namespace queries to the +\fB\-\-dns-forward\fR address will be redirected to. This option can +be specified zero to two times (once for IPv4, once for IPv6). +By default, the first nameserver from the host's +\fI/etc/resolv.conf\fR. .TP .BR \-S ", " \-\-search " " \fIlist From b40880c157ea12ccfc93266cc08252be1aaedaa9 Mon Sep 17 00:00:00 2001 From: Stefano Brivio <sbrivio@redhat.com> Date: Tue, 8 Oct 2024 22:40:58 +0200 Subject: [PATCH 048/382] test/lib/term: Always use printf for messages with escape sequences ...instead of echo: otherwise, bash won't handle escape sequences we use to colour messages (and 'echo -e' is not specified by POSIX). 
Signed-off-by: Stefano Brivio <sbrivio@redhat.com> Reviewed-by: David Gibson <david@gibson.dropbear.id.au> --- test/lib/term | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/test/lib/term b/test/lib/term index 0fa0936..fcbed16 100755 --- a/test/lib/term +++ b/test/lib/term @@ -31,8 +31,8 @@ PR_DELAY_INIT=100 # ms # $@: Message to print info() { tmux select-pane -t ${PANE_INFO} - echo "${@}" >> $STATEBASE/log_pipe - echo "${@}" >> "${LOGFILE}" + printf "${@}\n" >> $STATEBASE/log_pipe + printf "${@}\n" >> "${LOGFILE}" } # info_n() - Highlight, print message to pane and to log file without newline @@ -47,13 +47,13 @@ info_n() { # $@: Message to print info_nolog() { tmux select-pane -t ${PANE_INFO} - echo "${@}" >> $STATEBASE/log_pipe + printf "${@}\n" >> $STATEBASE/log_pipe } # info_nolog() - Print message to log file # $@: Message to print log() { - echo "${@}" >> "${LOGFILE}" + printf "${@}\n" >> "${LOGFILE}" } # info_nolog_n() - Send message to pane without highlighting it, without newline From 7612cb80fe80c089b25245e12a5e934f772480f8 Mon Sep 17 00:00:00 2001 From: Stefano Brivio <sbrivio@redhat.com> Date: Fri, 4 Oct 2024 18:50:43 +0200 Subject: [PATCH 049/382] test: Pass TRACE from run_term() into ./run from_term Just like we do for PCAP, DEBUG and KERNEL. Otherwise, running tests with TRACE=1 will not actually enable tracing output. 
Signed-off-by: Stefano Brivio <sbrivio@redhat.com> Reviewed-by: David Gibson <david@gibson.dropbear.id.au> --- test/lib/term | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/lib/term b/test/lib/term index fcbed16..ed690de 100755 --- a/test/lib/term +++ b/test/lib/term @@ -664,7 +664,7 @@ pause_continue() { # run_term() - Start tmux session, running entry point, with recording if needed run_term() { - TMUX="tmux new-session -s passt_test -eSTATEBASE=$STATEBASE -ePCAP=$PCAP -eDEBUG=$DEBUG -eKERNEL=$KERNEL" + TMUX="tmux new-session -s passt_test -eSTATEBASE=$STATEBASE -ePCAP=$PCAP -eDEBUG=$DEBUG -eTRACE=$TRACE -eKERNEL=$KERNEL" if [ ${CI} -eq 1 ]; then printf '\e[8;50;240t' From 2d7f734c45c64e9d5ddc408a1e13de7d9942bf42 Mon Sep 17 00:00:00 2001 From: Stefano Brivio <sbrivio@redhat.com> Date: Tue, 15 Oct 2024 00:17:24 +0200 Subject: [PATCH 050/382] tcp: Send "empty" handshake ACK before first data segment Starting from commit 9178a9e3462d ("tcp: Always send an ACK segment once the handshake is completed"), we always send an ACK segment, without any payload, to complete the three-way handshake while establishing a connection started from a socket. We queue that segment after checking if we already have data to send to the tap, which means that its sequence number is higher than any segment with data we're sending in the same iteration, if any data is available on the socket. However, in tcp_defer_handler(), we first flush "flags" buffers, that is, we send out segments without any data first, and then segments with data, which means that our "empty" ACK is sent before the ACK segment with data (if any), which has a lower sequence number. This appears to be harmless as the guest or container will generally reorder segments, but it looks rather weird and we can't exclude it's actually causing problems. Queue the empty ACK first, so that it gets a lower sequence number, before checking for any data from the socket. 
Reported-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> Reviewed-by: David Gibson <david@gibson.dropbear.id.au> --- tcp.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tcp.c b/tcp.c index 9617b7a..b2155ab 100644 --- a/tcp.c +++ b/tcp.c @@ -1957,11 +1957,12 @@ static void tcp_conn_from_sock_finish(const struct ctx *c, return; } + tcp_send_flag(c, conn, ACK); + /* The client might have sent data already, which we didn't * dequeue waiting for SYN,ACK from tap -- check now. */ tcp_data_from_sock(c, conn); - tcp_send_flag(c, conn, ACK); } /** From f9d677bff6af48b50f3655224e8b0eb8820d3e89 Mon Sep 17 00:00:00 2001 From: David Gibson <david@gibson.dropbear.id.au> Date: Fri, 18 Oct 2024 12:35:50 +1100 Subject: [PATCH 051/382] arp: Fix a handful of small warts This fixes a number of harmless but slightly ugly warts in the ARP resolution code: * Use in4addr_any to represent 0.0.0.0 rather than hand constructing an example. * When comparing am->sip against 0.0.0.0 use sizeof(am->sip) instead of sizeof(am->tip) (same value, but makes more logical sense) * Described the guest's assigned address as such, rather than as "our address" - that's not usually what we mean by "our address" these days * Remove "we might have the same IP address" comment which I can't make sense of in context (possibly it's relating to the statement below, which already has its own comment?) Signed-off-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- arp.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/arp.c b/arp.c index 53334da..fc482bb 100644 --- a/arp.c +++ b/arp.c @@ -59,14 +59,12 @@ int arp(const struct ctx *c, const struct pool *p) ah->ar_op != htons(ARPOP_REQUEST)) return 1; - /* Discard announcements (but not 0.0.0.0 "probes"): we might have the - * same IP address, hide that. 
- */ - if (memcmp(am->sip, (unsigned char[4]){ 0 }, sizeof(am->tip)) && + /* Discard announcements, but not 0.0.0.0 "probes" */ + if (memcmp(am->sip, &in4addr_any, sizeof(am->sip)) && !memcmp(am->sip, am->tip, sizeof(am->sip))) return 1; - /* Don't resolve our own address, either. */ + /* Don't resolve the guest's assigned address, either. */ if (!memcmp(am->tip, &c->ip4.addr, sizeof(am->tip))) return 1; From 75b9c0feb0b54b040a8c49f160cfc2defe28c045 Mon Sep 17 00:00:00 2001 From: David Gibson <david@gibson.dropbear.id.au> Date: Fri, 18 Oct 2024 12:35:51 +1100 Subject: [PATCH 052/382] test: Explicitly wait for DAD to complete on SLAAC addresses Getting a SLAAC address takes a little while because the kernel must complete Duplicate Address Detection (DAD) before marking the address as ready. In several places we have an explicit 'sleep 2' to wait for that to complete. Fixed length delays are never a great idea, although this one is pretty solid. Still, it would be better to explicitly wait for DAD to complete in case of long delays (which might happen on slow emulated hosts, or with heavy load), and to speed the tests up if DAD completes quicker. Replace the fixed sleeps with a loop waiting for DAD to complete. We do this by looping waiting for all tentative addresses to disappear. 
Signed-off-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- test/passt/ndp | 4 +++- test/pasta/ndp | 3 ++- test/two_guests/basic | 3 ++- 3 files changed, 7 insertions(+), 3 deletions(-) diff --git a/test/passt/ndp b/test/passt/ndp index 6bf8af3..f54b8ce 100644 --- a/test/passt/ndp +++ b/test/passt/ndp @@ -16,7 +16,9 @@ htools ip jq sipcalc grep cut test Interface name gout IFNAME ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname' -guest ip link set dev __IFNAME__ up && sleep 2 +guest ip link set dev __IFNAME__ up +# Wait for DAD to complete +guest while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done hout HOST_IFNAME6 ip -j -6 route show|jq -rM '[.[] | select(.dst == "default").dev] | .[0]' check [ -n "__IFNAME__" ] diff --git a/test/pasta/ndp b/test/pasta/ndp index d45ff7b..c59627f 100644 --- a/test/pasta/ndp +++ b/test/pasta/ndp @@ -18,7 +18,8 @@ test Interface name nsout IFNAME ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname' check [ -n "__IFNAME__" ] ns ip link set dev __IFNAME__ up -sleep 2 +# Wait for DAD to complete +ns while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done test SLAAC: prefix nsout ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__IFNAME__").addr_info[] | select(.scope == "global" and .prefixlen == 64).local] | .[0]' diff --git a/test/two_guests/basic b/test/two_guests/basic index 4d49e85..ac50ff8 100644 --- a/test/two_guests/basic +++ b/test/two_guests/basic @@ -36,7 +36,8 @@ check [ "__ADDR2__" = "__HOST_ADDR__" ] test DHCPv6: addresses # Link is up now, wait for DAD to complete -sleep 2 +guest1 while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done +guest2 while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done guest1 /sbin/dhclient -6 __IFNAME1__ guest2 /sbin/dhclient -6 __IFNAME2__ g1out ADDR1_6 ip -j -6 addr show|jq -rM '[.[] | 
select(.ifname == "__IFNAME1__").addr_info[] | select(.prefixlen == 128).local] | .[0]' From 53176ca91d176ea15d8abf3b1429e43bc93e516c Mon Sep 17 00:00:00 2001 From: David Gibson <david@gibson.dropbear.id.au> Date: Fri, 18 Oct 2024 12:35:52 +1100 Subject: [PATCH 053/382] test: Wait for DAD on DHCPv6 addresses After running dhclient -6 we expect the DHCPv6 assigned address to be immediately usable. That's true with the Fedora dhclient-script (and the upstream ISC DHCP one), however it's not true with the Debian dhclient-script. The Debian script can complete with the address still in "tentative" state, and the address won't be usable until Duplicate Address Detection (DAD) completes. That's arguably a bug in Debian (see link below), but for the time being we need to work around it anyway. We usually get away with this, because by the time we do anything where the address matters, DAD has completed. However, it's not robust, so we should explicitly wait for DAD to complete when we get an DHCPv6 address. 
Link: https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=1085231 Signed-off-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- test/passt/dhcp | 2 ++ test/passt_in_ns/dhcp | 2 ++ test/pasta/dhcp | 2 ++ test/perf/passt_tcp | 2 ++ test/two_guests/basic | 3 +++ 5 files changed, 11 insertions(+) diff --git a/test/passt/dhcp b/test/passt/dhcp index e05a4bb..9925ab9 100644 --- a/test/passt/dhcp +++ b/test/passt/dhcp @@ -49,6 +49,8 @@ check [ "__SEARCH__" = "__HOST_SEARCH__" ] test DHCPv6: address guest /sbin/dhclient -6 __IFNAME__ +# Wait for DAD to complete +guest while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done gout ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__IFNAME__").addr_info[] | select(.prefixlen == 128).local] | .[0]' hout HOST_ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__HOST_IFNAME6__").addr_info[] | select(.scope == "global" and .deprecated != true).local] | .[0]' check [ "__ADDR6__" = "__HOST_ADDR6__" ] diff --git a/test/passt_in_ns/dhcp b/test/passt_in_ns/dhcp index 0ceed7c..a38a690 100644 --- a/test/passt_in_ns/dhcp +++ b/test/passt_in_ns/dhcp @@ -52,6 +52,8 @@ check [ "__SEARCH__" = "__HOST_SEARCH__" ] test DHCPv6: address guest /sbin/dhclient -6 __IFNAME__ +# Wait for DAD to complete +guest while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done gout ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__IFNAME__").addr_info[] | select(.prefixlen == 128).local] | .[0]' hout HOST_ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__HOST_IFNAME6__").addr_info[] | select(.scope == "global" and .deprecated != true).local] | .[0]' check [ "__ADDR6__" = "__HOST_ADDR6__" ] diff --git a/test/pasta/dhcp b/test/pasta/dhcp index 41556b8..d4f3ad5 100644 --- a/test/pasta/dhcp +++ b/test/pasta/dhcp @@ -35,6 +35,8 @@ check [ __MTU__ = 65520 ] test DHCPv6: address ns /sbin/dhclient -6 --no-pid __IFNAME__ +# Wait for DAD to 
complete +ns while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done hout HOST_IFNAME6 ip -j -6 route show|jq -rM '[.[] | select(.dst == "default").dev] | .[0]' nsout ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__IFNAME__").addr_info[] | select(.prefixlen == 128).local] | .[0]' hout HOST_ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__HOST_IFNAME6__").addr_info[] | select(.scope == "global" and .deprecated != true).local] | .[0]' diff --git a/test/perf/passt_tcp b/test/perf/passt_tcp index 089d953..5978c49 100644 --- a/test/perf/passt_tcp +++ b/test/perf/passt_tcp @@ -116,6 +116,8 @@ iperf3k ns # Reducing MTU below 1280 deconfigures IPv6, get our address back guest dhclient -6 -x guest dhclient -6 __IFNAME__ +# Wait for DAD to complete +guest while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done tl TCP RR latency over IPv4: guest to host lat - diff --git a/test/two_guests/basic b/test/two_guests/basic index ac50ff8..9ba5efe 100644 --- a/test/two_guests/basic +++ b/test/two_guests/basic @@ -40,6 +40,9 @@ guest1 while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; guest2 while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done guest1 /sbin/dhclient -6 __IFNAME1__ guest2 /sbin/dhclient -6 __IFNAME2__ +# Wait for DAD to complete on the DHCP address +guest1 while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done +guest2 while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done g1out ADDR1_6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__IFNAME1__").addr_info[] | select(.prefixlen == 128).local] | .[0]' g2out ADDR2_6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__IFNAME2__").addr_info[] | select(.prefixlen == 128).local] | .[0]' hout HOST_ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__HOST_IFNAME6__").addr_info[] | select(.scope == "global" and .deprecated != true).local] | .[0]' 
From ef8a5161d0d83193cadc965f6a8951fe92659996 Mon Sep 17 00:00:00 2001 From: David Gibson <david@gibson.dropbear.id.au> Date: Fri, 18 Oct 2024 12:35:53 +1100 Subject: [PATCH 054/382] passt.1: Mark --stderr as deprecated more prominently The description of this option says that it's deprecated, but unlike --no-copy-addrs and --no-copy-routes it doesn't have a clear label. Add one to make it easier to spot. Signed-off-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- passt.1 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/passt.1 b/passt.1 index ef33267..c573788 100644 --- a/passt.1 +++ b/passt.1 @@ -95,7 +95,7 @@ detached PID namespace after starting, because the PID itself cannot change. Default is to fork into background. .TP -.BR \-e ", " \-\-stderr +.BR \-e ", " \-\-stderr " " (DEPRECATED) This option has no effect, and is maintained for compatibility purposes only. Note that this configuration option is \fBdeprecated\fR and will be removed in a From 1fa421192c7f11f071d11a7aba1bb1f5cdf4a604 Mon Sep 17 00:00:00 2001 From: David Gibson <david@gibson.dropbear.id.au> Date: Fri, 18 Oct 2024 12:35:54 +1100 Subject: [PATCH 055/382] passt.1: Clarify and update "Handling of local addresses" section This section didn't mention the effect of the --map-host-loopback option which now alters this behaviour. Update it accordingly. It used "local addresses" to mean specifically 127.0.0.0/8 and ::1. However, "local" could also refer to link-local addresses or to addresses of any scope which happen to be configured on the host. Use "loopback address" to be more precise about this. 
Signed-off-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- passt.1 | 48 +++++++++++++++++++++++++----------------------- 1 file changed, 25 insertions(+), 23 deletions(-) diff --git a/passt.1 b/passt.1 index c573788..46100e2 100644 --- a/passt.1 +++ b/passt.1 @@ -882,38 +882,40 @@ root@localhost's password: .SH NOTES -.SS Handling of traffic with local destination and source addresses +.SS Handling of traffic with loopback destination and source addresses -Both \fBpasst\fR and \fBpasta\fR can bind on ports with a local address, -depending on the configuration. Local destination or source addresses need to be -changed before packets are delivered to the guest or target namespace: most -operating systems would drop packets received from non-loopback interfaces with -local addresses, and it would also be impossible for guest or target namespace -to route answers back. +Both \fBpasst\fR and \fBpasta\fR can bind on ports with a loopback +address (127.0.0.0/8 or ::1), depending on the configuration. Loopback +destination or source addresses need to be changed before packets are +delivered to the guest or target namespace: most operating systems +would drop packets received with loopback addresses on non-loopback +interfaces, and it would also be impossible for guest or target +namespace to route answers back. -For convenience, and somewhat arbitrarily, the source address on these packets -is translated to the address of the default IPv4 or IPv6 gateway (if any) -- -this is known to be an existing, valid address on the same subnet. +For convenience, the source address on these packets is translated to +the address specified by the \fB\-\-map-host-loopback\fR option. If +not specified this defaults, somewhat arbitrarily, to the address of +default IPv4 or IPv6 gateway (if any) -- this is known to be an +existing, valid address on the same subnet. 
If \fB\-\-no-map-gw\fR or +\fB\-\-map-host-loopback none\fR are specified this translation is +disabled and packets with loopback addresses are simply dropped. -Loopback destination addresses are instead translated to the observed external -address of the guest or target namespace. For IPv6 packets, if usage of a -link-local address by guest or namespace has ever been observed, and the -original destination address is also a link-local address, the observed -link-local address is used. Otherwise, the observed global address is used. For -both IPv4 and IPv6, if no addresses have been seen yet, the configured addresses -will be used instead. +Loopback destination addresses are translated to the observed external +address of the guest or target namespace. For IPv6, the observed +link-local address is used if the translated source address is +link-local, otherwise the observed global address is used. For both +IPv4 and IPv6, if no addresses have been seen yet, the configured +addresses will be used instead. For example, if \fBpasst\fR or \fBpasta\fR receive a connection from 127.0.0.1, with destination 127.0.0.10, and the default IPv4 gateway is 192.0.2.1, while the last observed source address from guest or namespace is 192.0.2.2, this will be translated to a connection from 192.0.2.1 to 192.0.2.2. -Similarly, for traffic coming from guest or namespace, packets with destination -address corresponding to the default gateway will have their destination address -translated to a loopback address, if and only if a packet, in the opposite -direction, with a loopback destination or source address, port-wise matching for -UDP, or connection-wise for TCP, has been recently forwarded to guest or -namespace. This behaviour can be disabled with \-\-no\-map\-gw. +Similarly, for traffic coming from guest or namespace, packets with +destination address corresponding to the \fB\-\-map-host-loopback\fR +address will have their destination address translated to a loopback +address. 
.SS Handling of local traffic in pasta From 58e6d685995f7b1068357a00e2618627d17fa8f5 Mon Sep 17 00:00:00 2001 From: David Gibson <david@gibson.dropbear.id.au> Date: Fri, 18 Oct 2024 12:35:55 +1100 Subject: [PATCH 056/382] test: Clarify test for spliced inbound transfers The tests in pasta/tcp and pasta/udp for inbound transfers have the server listening within the namespace explicitly bound to 127.0.0.1 or ::1. This only works because of the behaviour of inbound splice connections, which always appear with both source and destination addresses as loopback in the namespace. That's not an inherent property for "spliced" connections and arguably an undesirable one. Also update the test names to make it clearer that these tests are expecting to exercise the "splice" path. Interestingly this was already correct for the equivalent passt_in_ns/*, although we also update the test names for clarity there. Note that there are similar issues in some of the podman tests, addressed in https://github.com/containers/podman/pull/24064 Signed-off-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- test/passt_in_ns/tcp | 8 ++++---- test/passt_in_ns/udp | 4 ++-- test/pasta/tcp | 16 ++++++++-------- test/pasta/udp | 8 ++++---- 4 files changed, 18 insertions(+), 18 deletions(-) diff --git a/test/passt_in_ns/tcp b/test/passt_in_ns/tcp index aaf340e..319880b 100644 --- a/test/passt_in_ns/tcp +++ b/test/passt_in_ns/tcp @@ -32,7 +32,7 @@ host socat -u OPEN:__BASEPATH__/big.bin TCP4:127.0.0.1:10001 guestw guest cmp test_big.bin /root/big.bin -test TCP/IPv4: host to ns: big transfer +test TCP/IPv4: host to ns (spliced): big transfer nsb socat -u TCP4-LISTEN:10002 OPEN:__TEMP_NS_BIG__,create,trunc sleep 1 host socat -u OPEN:__BASEPATH__/big.bin TCP4:127.0.0.1:10002 @@ -90,7 +90,7 @@ host socat -u OPEN:__BASEPATH__/small.bin TCP4:127.0.0.1:10001 guestw guest cmp test_small.bin /root/small.bin -test TCP/IPv4: host to ns: small transfer +test 
TCP/IPv4: host to ns (spliced): small transfer nsb socat -u TCP4-LISTEN:10002 OPEN:__TEMP_NS_SMALL__,create,trunc sleep 1 host socat -u OPEN:__BASEPATH__/small.bin TCP4:127.0.0.1:10002 @@ -146,7 +146,7 @@ host socat -u OPEN:__BASEPATH__/big.bin TCP6:[::1]:10001 guestw guest cmp test_big.bin /root/big.bin -test TCP/IPv6: host to ns: big transfer +test TCP/IPv6: host to ns (spliced): big transfer nsb socat -u TCP6-LISTEN:10002 OPEN:__TEMP_NS_BIG__,create,trunc sleep 1 host socat -u OPEN:__BASEPATH__/big.bin TCP6:[::1]:10002 @@ -204,7 +204,7 @@ host socat -u OPEN:__BASEPATH__/small.bin TCP6:[::1]:10001 guestw guest cmp test_small.bin /root/small.bin -test TCP/IPv6: host to ns: small transfer +test TCP/IPv6: host to ns (spliced): small transfer nsb socat -u TCP6-LISTEN:10002 OPEN:__TEMP_NS_SMALL__,create,trunc sleep 1 host socat -u OPEN:__BASEPATH__/small.bin TCP6:[::1]:10002 diff --git a/test/passt_in_ns/udp b/test/passt_in_ns/udp index 3426ab9..791511c 100644 --- a/test/passt_in_ns/udp +++ b/test/passt_in_ns/udp @@ -30,7 +30,7 @@ host socat -u OPEN:__BASEPATH__/medium.bin UDP4:127.0.0.1:10001,shut-null guestw guest cmp test.bin /root/medium.bin -test UDP/IPv4: host to ns +test UDP/IPv4: host to ns (recvmmsg/sendmmsg) nsb socat -u UDP4-LISTEN:10002,null-eof OPEN:__TEMP_NS__,create,trunc sleep 1 host socat -u OPEN:__BASEPATH__/medium.bin UDP4:127.0.0.1:10002,shut-null @@ -88,7 +88,7 @@ host socat -u OPEN:__BASEPATH__/medium.bin UDP6:[::1]:10001,shut-null guestw guest cmp test.bin /root/medium.bin -test UDP/IPv6: host to ns +test UDP/IPv6: host to ns (recvmmsg/sendmmsg) nsb socat -u UDP6-LISTEN:10002,null-eof OPEN:__TEMP_NS__,create,trunc sleep 1 host socat -u OPEN:__BASEPATH__/medium.bin UDP6:[::1]:10002,shut-null diff --git a/test/pasta/tcp b/test/pasta/tcp index 6ab18c5..53b6f25 100644 --- a/test/pasta/tcp +++ b/test/pasta/tcp @@ -19,8 +19,8 @@ set TEMP_NS_BIG __STATEDIR__/test_ns_big.bin set TEMP_SMALL __STATEDIR__/test_small.bin set TEMP_NS_SMALL 
__STATEDIR__/test_ns_small.bin -test TCP/IPv4: host to ns: big transfer -nsb socat -u TCP4-LISTEN:10002,bind=127.0.0.1 OPEN:__TEMP_NS_BIG__,create,trunc +test TCP/IPv4: host to ns (spliced): big transfer +nsb socat -u TCP4-LISTEN:10002 OPEN:__TEMP_NS_BIG__,create,trunc host socat -u OPEN:__BASEPATH__/big.bin TCP4:127.0.0.1:10002 nsw check cmp __BASEPATH__/big.bin __TEMP_NS_BIG__ @@ -38,8 +38,8 @@ ns socat -u OPEN:__BASEPATH__/big.bin TCP4:__GW__:10003 hostw check cmp __BASEPATH__/big.bin __TEMP_BIG__ -test TCP/IPv4: host to ns: small transfer -nsb socat -u TCP4-LISTEN:10002,bind=127.0.0.1 OPEN:__TEMP_NS_SMALL__,create,trunc +test TCP/IPv4: host to ns (spliced): small transfer +nsb socat -u TCP4-LISTEN:10002 OPEN:__TEMP_NS_SMALL__,create,trunc host socat OPEN:__BASEPATH__/small.bin TCP4:127.0.0.1:10002 nsw check cmp __BASEPATH__/small.bin __TEMP_NS_SMALL__ @@ -57,8 +57,8 @@ ns socat -u OPEN:__BASEPATH__/small.bin TCP4:__GW__:10003 hostw check cmp __BASEPATH__/small.bin __TEMP_SMALL__ -test TCP/IPv6: host to ns: big transfer -nsb socat -u TCP6-LISTEN:10002,bind=[::1] OPEN:__TEMP_NS_BIG__,create,trunc +test TCP/IPv6: host to ns (spliced): big transfer +nsb socat -u TCP6-LISTEN:10002 OPEN:__TEMP_NS_BIG__,create,trunc host socat -u OPEN:__BASEPATH__/big.bin TCP6:[::1]:10002 nsw check cmp __BASEPATH__/big.bin __TEMP_NS_BIG__ @@ -77,8 +77,8 @@ ns socat -u OPEN:__BASEPATH__/big.bin TCP6:[__GW6__%__IFNAME__]:10003 hostw check cmp __BASEPATH__/big.bin __TEMP_BIG__ -test TCP/IPv6: host to ns: small transfer -nsb socat -u TCP6-LISTEN:10002,bind=[::1] OPEN:__TEMP_NS_SMALL__,create,trunc +test TCP/IPv6: host to ns (spliced): small transfer +nsb socat -u TCP6-LISTEN:10002 OPEN:__TEMP_NS_SMALL__,create,trunc host socat -u OPEN:__BASEPATH__/small.bin TCP6:[::1]:10002 nsw check cmp __BASEPATH__/small.bin __TEMP_NS_SMALL__ diff --git a/test/pasta/udp b/test/pasta/udp index 30e3a85..7734d02 100644 --- a/test/pasta/udp +++ b/test/pasta/udp @@ -17,8 +17,8 @@ htools dd socat ip jq set 
TEMP __STATEDIR__/test.bin set TEMP_NS __STATEDIR__/test_ns.bin -test UDP/IPv4: host to ns -nsb socat -u UDP4-LISTEN:10002,bind=127.0.0.1,null-eof OPEN:__TEMP_NS__,create,trunc +test UDP/IPv4: host to ns (recvmmsg/sendmmsg) +nsb socat -u UDP4-LISTEN:10002,null-eof OPEN:__TEMP_NS__,create,trunc host socat OPEN:__BASEPATH__/medium.bin UDP4:127.0.0.1:10002,shut-null nsw check cmp __BASEPATH__/medium.bin __TEMP_NS__ @@ -37,8 +37,8 @@ ns socat -u OPEN:__BASEPATH__/medium.bin UDP4:__GW__:10003,shut-null hostw check cmp __BASEPATH__/medium.bin __TEMP__ -test UDP/IPv6: host to ns -nsb socat -u UDP6-LISTEN:10002,bind=[::1],null-eof OPEN:__TEMP_NS__,create,trunc +test UDP/IPv6: host to ns (recvmmsg/sendmmsg) +nsb socat -u UDP6-LISTEN:10002,null-eof OPEN:__TEMP_NS__,create,trunc host socat -u OPEN:__BASEPATH__/medium.bin UDP6:[::1]:10002,shut-null nsw check cmp __BASEPATH__/medium.bin __TEMP_NS__ From b4dace8f462b346ae2135af1f8d681a99a849a5f Mon Sep 17 00:00:00 2001 From: David Gibson <david@gibson.dropbear.id.au> Date: Fri, 18 Oct 2024 12:35:56 +1100 Subject: [PATCH 057/382] fwd: Direct inbound spliced forwards to the guest's external address In pasta mode, where addressing permits we "splice" connections, forwarding directly from host socket to guest/container socket without any L2 or L3 processing. This gives us a very large performance improvement when it's possible. Since the traffic is from a local socket within the guest, it will go over the guest's 'lo' interface, and accordingly we set the guest side address to be the loopback address. However this has a surprising side effect: sometimes guests will run services that are only supposed to be used within the guest and are therefore bound to only 127.0.0.1 and/or ::1. pasta's forwarding exposes those services to the host, which isn't generally what we want. Correct this by instead forwarding inbound "splice" flows to the guest's external address. 
Link: https://github.com/containers/podman/issues/24045 Signed-off-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- conf.c | 9 +++++++++ fwd.c | 31 +++++++++++++++++++++++-------- passt.1 | 23 +++++++++++++++++++---- passt.h | 2 ++ 4 files changed, 53 insertions(+), 12 deletions(-) diff --git a/conf.c b/conf.c index c631019..b3b5342 100644 --- a/conf.c +++ b/conf.c @@ -912,6 +912,9 @@ pasta_opts: " -U, --udp-ns SPEC UDP port forwarding to init namespace\n" " SPEC is as described above\n" " default: auto\n" + " --host-lo-to-ns-lo DEPRECATED:\n" + " Translate host-loopback forwards to\n" + " namespace loopback\n" " --userns NSPATH Target user namespace to join\n" " --netns PATH|NAME Target network namespace to join\n" " --netns-only Don't join existing user namespace\n" @@ -1289,6 +1292,7 @@ void conf(struct ctx *c, int argc, char **argv) {"netns-only", no_argument, NULL, 20 }, {"map-host-loopback", required_argument, NULL, 21 }, {"map-guest-addr", required_argument, NULL, 22 }, + {"host-lo-to-ns-lo", no_argument, NULL, 23 }, {"dns-host", required_argument, NULL, 24 }, { 0 }, }; @@ -1467,6 +1471,11 @@ void conf(struct ctx *c, int argc, char **argv) conf_nat(optarg, &c->ip4.map_guest_addr, &c->ip6.map_guest_addr, NULL); break; + case 23: + if (c->mode != MODE_PASTA) + die("--host-lo-to-ns-lo is for pasta mode only"); + c->host_lo_to_ns_lo = 1; + break; case 24: if (inet_pton(AF_INET6, optarg, &c->ip6.dns_host) && !IN6_IS_ADDR_UNSPECIFIED(&c->ip6.dns_host)) diff --git a/fwd.c b/fwd.c index a505098..c71f5e1 100644 --- a/fwd.c +++ b/fwd.c @@ -447,20 +447,35 @@ uint8_t fwd_nat_from_host(const struct ctx *c, uint8_t proto, (proto == IPPROTO_TCP || proto == IPPROTO_UDP)) { /* spliceable */ - /* Preserve the specific loopback adddress used, but let the - * kernel pick a source port on the target side + /* The traffic will go over the guest's 'lo' interface, but by + * default use its external address, so we don't 
inadvertently + * expose services that listen only on the guest's loopback + * address. That can be overridden by --host-lo-to-ns-lo which + * will instead forward to the loopback address in the guest. + * + * In either case, let the kernel pick the source address to + * match. */ - tgt->oaddr = ini->eaddr; + if (inany_v4(&ini->eaddr)) { + if (c->host_lo_to_ns_lo) + tgt->eaddr = inany_loopback4; + else + tgt->eaddr = inany_from_v4(c->ip4.addr_seen); + tgt->oaddr = inany_any4; + } else { + if (c->host_lo_to_ns_lo) + tgt->eaddr = inany_loopback6; + else + tgt->eaddr.a6 = c->ip6.addr_seen; + tgt->oaddr = inany_any6; + } + + /* Let the kernel pick source port */ tgt->oport = 0; if (proto == IPPROTO_UDP) /* But for UDP preserve the source port */ tgt->oport = ini->eport; - if (inany_v4(&ini->eaddr)) - tgt->eaddr = inany_loopback4; - else - tgt->eaddr = inany_loopback6; - return PIF_SPLICE; } diff --git a/passt.1 b/passt.1 index 46100e2..f084978 100644 --- a/passt.1 +++ b/passt.1 @@ -605,6 +605,13 @@ Configure UDP port forwarding from target namespace to init namespace. Default is \fBauto\fR. +.TP +.BR \-\-host-lo-to-ns-lo " " (DEPRECATED) +If specified, connections forwarded with \fB\-t\fR and \fB\-u\fR from +the host's loopback address will appear on the loopback address in the +guest as well. Without this option such forwarded packets will appear +to come from the guest's public address. + .TP .BR \-\-userns " " \fIspec Target user namespace to join, as a path. If PID is given, without this option, @@ -893,8 +900,9 @@ interfaces, and it would also be impossible for guest or target namespace to route answers back. For convenience, the source address on these packets is translated to -the address specified by the \fB\-\-map-host-loopback\fR option. If -not specified this defaults, somewhat arbitrarily, to the address of +the address specified by the \fB\-\-map-host-loopback\fR option (with +some exceptions in pasta mode, see next section below). 
If not +specified this defaults, somewhat arbitrarily, to the address of default IPv4 or IPv6 gateway (if any) -- this is known to be an existing, valid address on the same subnet. If \fB\-\-no-map-gw\fR or \fB\-\-map-host-loopback none\fR are specified this translation is @@ -931,8 +939,15 @@ and the new socket using the \fBsplice\fR(2) system call, and for UDP, a pair of \fBrecvmmsg\fR(2) and \fBsendmmsg\fR(2) system calls deals with packet transfers. -This bypass only applies to local connections and traffic, because it's not -possible to bind sockets to foreign addresses. +Because it's not possible to bind sockets to foreign addresses, this +bypass only applies to local connections and traffic. It also means +that the address translation differs slightly from passt mode. +Connections from loopback to loopback on the host will appear to come +from the target namespace's public address within the guest, unless +\fB\-\-host-lo-to-ns-lo\fR is specified, in which case they will +appear to come from loopback in the namespace as well. The latter +behaviour used to be the default, but is usually undesirable, since it +can unintentionally expose namespace local services to the host. 
.SS Binding to low numbered ports (well-known or system ports, up to 1023) diff --git a/passt.h b/passt.h index 4908ed9..72c7f72 100644 --- a/passt.h +++ b/passt.h @@ -225,6 +225,7 @@ struct ip6_ctx { * @no_dhcpv6: Disable DHCPv6 server * @no_ndp: Disable NDP handler altogether * @no_ra: Disable router advertisements + * @host_lo_to_ns_lo: Map host loopback addresses to ns loopback addresses * @freebind: Allow binding of non-local addresses for forwarding * @low_wmem: Low probed net.core.wmem_max * @low_rmem: Low probed net.core.rmem_max @@ -285,6 +286,7 @@ struct ctx { int no_dhcpv6; int no_ndp; int no_ra; + int host_lo_to_ns_lo; int freebind; int low_wmem; From 9e5df350d63b0819f04b44bb57ea146274a6b42f Mon Sep 17 00:00:00 2001 From: David Gibson <david@gibson.dropbear.id.au> Date: Mon, 21 Oct 2024 18:40:29 +1100 Subject: [PATCH 058/382] tcp: Use structures to construct initial TCP options As a rule, we prefer constructing packets with matching C structures, rather than building them byte by byte. However, one case we still build byte by byte is the TCP options we include in SYN packets (in fact the only time we generate TCP options on the tap interface). Rework this to use a structure and initialisers which make it a bit clearer what's going on. 
Signed-off-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- tcp.c | 17 +++---------- tcp_buf.c | 2 +- tcp_internal.h | 68 ++++++++++++++++++++++++++++++++++++++++++++++---- 3 files changed, 67 insertions(+), 20 deletions(-) diff --git a/tcp.c b/tcp.c index b2155ab..0d22e07 100644 --- a/tcp.c +++ b/tcp.c @@ -1232,7 +1232,7 @@ static void tcp_update_seqack_from_tap(const struct ctx *c, * 1 otherwise */ int tcp_prepare_flags(const struct ctx *c, struct tcp_tap_conn *conn, - int flags, struct tcphdr *th, char *data, + int flags, struct tcphdr *th, struct tcp_syn_opts *opts, size_t *optlen) { struct tcp_info tinfo = { 0 }; @@ -1258,12 +1258,6 @@ int tcp_prepare_flags(const struct ctx *c, struct tcp_tap_conn *conn, if (flags & SYN) { int mss; - /* Options: MSS, NOP and window scale (8 bytes) */ - *optlen = OPT_MSS_LEN + 1 + OPT_WS_LEN; - - *data++ = OPT_MSS; - *data++ = OPT_MSS_LEN; - if (c->mtu == -1) { mss = tinfo.tcpi_snd_mss; } else { @@ -1279,16 +1273,11 @@ int tcp_prepare_flags(const struct ctx *c, struct tcp_tap_conn *conn, else if (mss > PAGE_SIZE) mss = ROUND_DOWN(mss, PAGE_SIZE); } - *(uint16_t *)data = htons(MIN(USHRT_MAX, mss)); - - data += OPT_MSS_LEN - 2; conn->ws_to_tap = MIN(MAX_WS, tinfo.tcpi_snd_wscale); - *data++ = OPT_NOP; - *data++ = OPT_WS; - *data++ = OPT_WS_LEN; - *data++ = conn->ws_to_tap; + *opts = TCP_SYN_OPTS(mss, conn->ws_to_tap); + *optlen = sizeof(*opts); } else if (!(flags & RST)) { flags |= ACK; } diff --git a/tcp_buf.c b/tcp_buf.c index 238827b..44df0e4 100644 --- a/tcp_buf.c +++ b/tcp_buf.c @@ -282,7 +282,7 @@ int tcp_buf_send_flag(const struct ctx *c, struct tcp_tap_conn *conn, int flags) seq = conn->seq_to_tap; ret = tcp_prepare_flags(c, conn, flags, &payload->th, - payload->opts, &optlen); + &payload->opts, &optlen); if (ret <= 0) { if (CONN_V4(conn)) tcp4_flags_used--; diff --git a/tcp_internal.h b/tcp_internal.h index 2f74ffe..1ab8ce2 100644 --- a/tcp_internal.h +++ 
b/tcp_internal.h @@ -33,9 +33,7 @@ #define OPT_EOL 0 #define OPT_NOP 1 #define OPT_MSS 2 -#define OPT_MSS_LEN 4 #define OPT_WS 3 -#define OPT_WS_LEN 3 #define OPT_SACKP 4 #define OPT_SACK 5 #define OPT_TS 8 @@ -77,6 +75,65 @@ struct tcp_payload_t { } __attribute__ ((packed, aligned(__alignof__(unsigned int)))); #endif +/** struct tcp_opt_nop - TCP NOP option + * @kind: Option kind (OPT_NOP = 1) + */ +struct tcp_opt_nop { + uint8_t kind; +} __attribute__ ((packed)); +#define TCP_OPT_NOP ((struct tcp_opt_nop){ .kind = OPT_NOP, }) + +/** struct tcp_opt_mss - TCP MSS option + * @kind: Option kind (OPT_MSS == 2) + * @len: Option length (4) + * @mss: Maximum Segment Size + */ +struct tcp_opt_mss { + uint8_t kind; + uint8_t len; + uint16_t mss; +} __attribute__ ((packed)); +#define TCP_OPT_MSS(mss_) \ + ((struct tcp_opt_mss) { \ + .kind = OPT_MSS, \ + .len = sizeof(struct tcp_opt_mss), \ + .mss = htons(mss_), \ + }) + +/** struct tcp_opt_ws - TCP Window Scaling option + * @kind: Option kind (OPT_WS == 3) + * @len: Option length (3) + * @shift: Window scaling shift + */ +struct tcp_opt_ws { + uint8_t kind; + uint8_t len; + uint8_t shift; +} __attribute__ ((packed)); +#define TCP_OPT_WS(shift_) \ + ((struct tcp_opt_ws) { \ + .kind = OPT_WS, \ + .len = sizeof(struct tcp_opt_ws), \ + .shift = (shift_), \ + }) + +/** struct tcp_syn_opts - TCP options we apply to SYN packets + * @mss: Maximum Segment Size (MSS) option + * @nop: NOP opt (for alignment) + * @ws: Window Scaling (WS) option + */ +struct tcp_syn_opts { + struct tcp_opt_mss mss; + struct tcp_opt_nop nop; + struct tcp_opt_ws ws; +} __attribute__ ((packed)); +#define TCP_SYN_OPTS(mss_, ws_) \ + ((struct tcp_syn_opts){ \ + .mss = TCP_OPT_MSS(mss_), \ + .nop = TCP_OPT_NOP, \ + .ws = TCP_OPT_WS(ws_), \ + }) + /** * struct tcp_flags_t - TCP header and data to send zero-length * segments (flags) @@ -85,7 +142,7 @@ struct tcp_payload_t { */ struct tcp_flags_t { struct tcphdr th; - char opts[OPT_MSS_LEN + OPT_WS_LEN + 1]; + 
struct tcp_syn_opts opts; #ifdef __AVX2__ } __attribute__ ((packed, aligned(32))); #else @@ -124,7 +181,8 @@ size_t tcp_l2_buf_fill_headers(const struct tcp_tap_conn *conn, bool no_tcp_csum); int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_tap_conn *conn, bool force_seq, struct tcp_info *tinfo); -int tcp_prepare_flags(const struct ctx *c, struct tcp_tap_conn *conn, int flags, - struct tcphdr *th, char *data, size_t *optlen); +int tcp_prepare_flags(const struct ctx *c, struct tcp_tap_conn *conn, + int flags, struct tcphdr *th, struct tcp_syn_opts *opts, + size_t *optlen); #endif /* TCP_INTERNAL_H */ From 149f457b23ed2cb196eed1b3f413b4a900f39547 Mon Sep 17 00:00:00 2001 From: Stefano Brivio <sbrivio@redhat.com> Date: Thu, 24 Oct 2024 09:12:11 +0200 Subject: [PATCH 059/382] tcp_splice: splice() all we have to the writing side, not what we just read In tcp_splice_sock_handler(), we try to calculate how much we can move from the pipe to the writing socket: if we just read some bytes, we'll use that amount, but if we haven't, we just try to empty the pipe. However, if we just read something, that doesn't mean that that's all the data we have on the pipe, as it's obvious from this sequence, where: pasta: epoll event on connected spliced TCP socket 54 (events: 0x00000001) Flow 0 (TCP connection (spliced)): 98304 from read-side call Flow 0 (TCP connection (spliced)): 33615 from write-side call (passed 98304) Flow 0 (TCP connection (spliced)): -1 from read-side call Flow 0 (TCP connection (spliced)): -1 from write-side call (passed 524288) Flow 0 (TCP connection (spliced)): event at tcp_splice_sock_handler:580 Flow 0 (TCP connection (spliced)): OUT_WAIT_0 we first pile up 98304 - 33615 = 64689 pending bytes, that we read but couldn't write, as the receiver buffer is full, and we set the corresponding OUT_WAIT flag. 
Then: pasta: epoll event on connected spliced TCP socket 54 (events: 0x00000001) Flow 0 (TCP connection (spliced)): 32768 from read-side call Flow 0 (TCP connection (spliced)): -1 from write-side call (passed 32768) Flow 0 (TCP connection (spliced)): event at tcp_splice_sock_handler:580 we splice() 32768 more bytes from our receiving side to the pipe. At some point: pasta: epoll event on connected spliced TCP socket 49 (events: 0x00000004) Flow 0 (TCP connection (spliced)): event at tcp_splice_sock_handler:489 Flow 0 (TCP connection (spliced)): ~OUT_WAIT_0 Flow 0 (TCP connection (spliced)): 1320 from read-side call Flow 0 (TCP connection (spliced)): 1320 from write-side call (passed 1320) the receiver is signalling to us that it's ready for more data (EPOLLOUT). We reset the OUT_WAIT flag, read 1320 more bytes from our receiving socket into the pipe, and that's what we write to the receiver, forgetting about the pending 97457 bytes we had, which the receiver might never get (not the same 97457 bytes: we'll actually send 1320 of those). This condition is rather hard to reproduce, and it was observed with Podman pulling container images via HTTPS. In the traces above, the client is side 0 (the initiating peer), and the server is sending the whole data. Instead of splicing from pipe to socket the amount of data we just read, we need to splice all the pending data we piled up until that point. We could do that using 'read' and 'written' counters, but there's actually no need, as the kernel also keeps track of how much data is available on the pipe. So, to make this simple and more robust, just give the whole pipe size as length to splice(). The kernel knows what to do with it. Later in the function, we used 'to_write' for an optimisation meant to reduce wakeups which retries right away to splice() in both directions if we couldn't write to the receiver the whole amount of pending data. Calculate a 'pending' value instead, only if we reach that point.
Now that we check for the actual amount of pending data in that optimisation, we need to make sure we don't compare a zero or negative 'written' value: if we met that, it means that the receiver signalled end-of-file, an error, or to try again later. In those three cases, the optimisation doesn't make any sense, so skip it. Reported-by: Ed Santiago <santiago@redhat.com> Reported-by: Paul Holzinger <pholzing@redhat.com> Analysed-by: Paul Holzinger <pholzing@redhat.com> Link: https://github.com/containers/podman/issues/24219 Signed-off-by: Stefano Brivio <sbrivio@redhat.com> Reviewed-by: David Gibson <david@gibson.dropbear.id.au> --- tcp_splice.c | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/tcp_splice.c b/tcp_splice.c index 9f5cc27..f112cfe 100644 --- a/tcp_splice.c +++ b/tcp_splice.c @@ -503,7 +503,7 @@ swap: lowat_act_flag = RCVLOWAT_ACT(fromsidei); while (1) { - ssize_t readlen, to_write = 0, written; + ssize_t readlen, written, pending; int more = 0; retry: @@ -518,14 +518,11 @@ retry: if (errno != EAGAIN) goto close; - - to_write = c->tcp.pipe_size; } else if (!readlen) { eof = 1; - to_write = c->tcp.pipe_size; } else { never_read = 0; - to_write += readlen; + if (readlen >= (long)c->tcp.pipe_size * 90 / 100) more = SPLICE_F_MORE; @@ -535,10 +532,10 @@ retry: eintr: written = splice(conn->pipe[fromsidei][0], NULL, - conn->s[!fromsidei], NULL, to_write, + conn->s[!fromsidei], NULL, c->tcp.pipe_size, SPLICE_F_MOVE | more | SPLICE_F_NONBLOCK); flow_trace(conn, "%zi from write-side call (passed %zi)", - written, to_write); + written, c->tcp.pipe_size); /* Most common case: skip updating counters. 
*/ if (readlen > 0 && readlen == written) { @@ -584,10 +581,9 @@ eintr: if (never_read && written == (long)(c->tcp.pipe_size)) goto retry; - if (!never_read && written < to_write) { - to_write -= written; + pending = conn->read[fromsidei] - conn->written[fromsidei]; + if (!never_read && written > 0 && written < pending) goto retry; - } if (eof) break; From 9e4615b40bfa7f1b692c3c3360d88a22c453b016 Mon Sep 17 00:00:00 2001 From: Stefano Brivio <sbrivio@redhat.com> Date: Thu, 24 Oct 2024 22:16:39 +0200 Subject: [PATCH 060/382] tcp_splice: fcntl(2) returns the size of the pipe, if F_SETPIPE_SZ succeeds Don't report bogus failures (with --trace) just because the return value is not zero. Link: https://github.com/containers/podman/issues/24219 Signed-off-by: Stefano Brivio <sbrivio@redhat.com> Reviewed-by: David Gibson <david@gibson.dropbear.id.au> --- tcp_splice.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tcp_splice.c b/tcp_splice.c index f112cfe..93f8bce 100644 --- a/tcp_splice.c +++ b/tcp_splice.c @@ -320,7 +320,7 @@ static int tcp_splice_connect_finish(const struct ctx *c, } if (fcntl(conn->pipe[sidei][0], F_SETPIPE_SZ, - c->tcp.pipe_size)) { + c->tcp.pipe_size) != (int)c->tcp.pipe_size) { flow_trace(conn, "cannot set %d->%d pipe size to %zu", sidei, !sidei, c->tcp.pipe_size); @@ -672,7 +672,7 @@ static void tcp_splice_pipe_refill(const struct ctx *c) continue; if (fcntl(splice_pipe_pool[i][0], F_SETPIPE_SZ, - c->tcp.pipe_size)) { + c->tcp.pipe_size) != (int)c->tcp.pipe_size) { trace("TCP (spliced): cannot set pool pipe size to %zu", c->tcp.pipe_size); } From 13f0291ede19fc6baea02e8327acec144bdf79e6 Mon Sep 17 00:00:00 2001 From: David Gibson <david@gibson.dropbear.id.au> Date: Thu, 24 Oct 2024 15:59:20 +1100 Subject: [PATCH 061/382] tcp: Remove compile-time dependency on struct tcp_info version In the Makefile we probe to create several defines based on the presence of particular fields in struct tcp_info. 
These defines are used for two purposes, neither of which they accomplish well: 1) Determining if the tcp_info fields are available at runtime. For this purpose the defines are Just Plain Wrong, since the runtime kernel may not be the same as the compile time kernel. We corrected this for tcp_snd_wnd, but not for tcpi_bytes_acked or tcpi_min_rtt 2) Allowing the source to compile against older kernel headers which don't have the fields in question. This works in theory, but it does mean we won't be able to use the fields, even if later run against a newer kernel. Furthermore, it's quite fragile: without much more thorough tests of builds in different environments than we're currently set up for, it's very easy to miss cases where we're accessing a field without protection from an #ifdef. For example we currently access tcpi_snd_wnd without #ifdefs in tcp_update_seqack_wnd(). Improve this with a different approach, borrowed from qemu (which has many instances of similar problems). Don't compile against linux/tcp.h, using netinet/tcp.h instead. Then for when we need an extension field, define a struct tcp_info_linux, copied from the kernel, with all the fields we're interested in. That may need updating from future kernel versions, but only when we want to use a new extension, so it shouldn't be frequent. This allows us to remove the HAS_SND_WND define entirely. We keep HAS_BYTES_ACKED and HAS_MIN_RTT now, since they're used for purpose (1), we'll fix that in a later patch.
Signed-off-by: David Gibson <david@gibson.dropbear.id.au> [sbrivio: Trivial grammar fixes in comments] Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- Makefile | 5 --- tcp.c | 30 ++++--------- tcp_info.h | 120 +++++++++++++++++++++++++++++++++++++++++++++++++ tcp_internal.h | 4 +- 4 files changed, 132 insertions(+), 27 deletions(-) create mode 100644 tcp_info.h diff --git a/Makefile b/Makefile index 74a9513..6faa501 100644 --- a/Makefile +++ b/Makefile @@ -67,11 +67,6 @@ PASST_HEADERS = arch.h arp.h checksum.h conf.h dhcp.h dhcpv6.h flow.h fwd.h \ udp.h udp_flow.h util.h HEADERS = $(PASST_HEADERS) seccomp.h -C := \#include <linux/tcp.h>\nstruct tcp_info x = { .tcpi_snd_wnd = 0 }; -ifeq ($(shell printf "$(C)" | $(CC) -S -xc - -o - >/dev/null 2>&1; echo $$?),0) - FLAGS += -DHAS_SND_WND -endif - C := \#include <linux/tcp.h>\nstruct tcp_info x = { .tcpi_bytes_acked = 0 }; ifeq ($(shell printf "$(C)" | $(CC) -S -xc - -o - >/dev/null 2>&1; echo $$?),0) FLAGS += -DHAS_BYTES_ACKED diff --git a/tcp.c b/tcp.c index 0d22e07..2a0b272 100644 --- a/tcp.c +++ b/tcp.c @@ -274,6 +274,7 @@ #include <net/if.h> #include <netinet/in.h> #include <netinet/ip.h> +#include <netinet/tcp.h> #include <stdint.h> #include <stdbool.h> #include <stddef.h> @@ -286,8 +287,6 @@ #include <time.h> #include <arpa/inet.h> -#include <linux/tcp.h> /* For struct tcp_info */ - #include "checksum.h" #include "util.h" #include "iov.h" @@ -303,6 +302,7 @@ #include "flow_table.h" #include "tcp_internal.h" +#include "tcp_info.h" #include "tcp_buf.h" /* MSS rounding: see SET_MSS() */ @@ -318,11 +318,6 @@ #define LOW_RTT_TABLE_SIZE 8 #define LOW_RTT_THRESHOLD 10 /* us */ -/* We need to include <linux/tcp.h> for tcpi_bytes_acked, instead of - * <netinet/tcp.h>, but that doesn't include a definition for SOL_TCP - */ -#define SOL_TCP IPPROTO_TCP - #define ACK_IF_NEEDED 0 /* See tcp_send_flag() */ #define CONN_IS_CLOSING(conn) \ @@ -365,14 +360,11 @@ char tcp_buf_discard [MAX_WINDOW]; /* Does the kernel support 
TCP_PEEK_OFF? */ bool peek_offset_cap; -#ifdef HAS_SND_WND + /* Does the kernel report sending window in TCP_INFO (kernel commit * 8f7baad7f035) */ bool snd_wnd_cap; -#else -#define snd_wnd_cap (false) -#endif /* sendmsg() to socket */ static struct iovec tcp_iov [UIO_MAXIOV]; @@ -678,7 +670,7 @@ static int tcp_rtt_dst_low(const struct tcp_tap_conn *conn) * @tinfo: Pointer to struct tcp_info for socket */ static void tcp_rtt_dst_check(const struct tcp_tap_conn *conn, - const struct tcp_info *tinfo) + const struct tcp_info_linux *tinfo) { #ifdef HAS_MIN_RTT const struct flowside *tapside = TAPFLOW(conn); @@ -1114,13 +1106,13 @@ size_t tcp_l2_buf_fill_headers(const struct tcp_tap_conn *conn, * Return: 1 if sequence or window were updated, 0 otherwise */ int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_tap_conn *conn, - bool force_seq, struct tcp_info *tinfo) + bool force_seq, struct tcp_info_linux *tinfo) { uint32_t prev_wnd_to_tap = conn->wnd_to_tap << conn->ws_to_tap; uint32_t prev_ack_to_tap = conn->seq_ack_to_tap; /* cppcheck-suppress [ctunullpointer, unmatchedSuppression] */ socklen_t sl = sizeof(*tinfo); - struct tcp_info tinfo_new; + struct tcp_info_linux tinfo_new; uint32_t new_wnd_to_tap = prev_wnd_to_tap; int s = conn->sock; @@ -1235,7 +1227,7 @@ int tcp_prepare_flags(const struct ctx *c, struct tcp_tap_conn *conn, int flags, struct tcphdr *th, struct tcp_syn_opts *opts, size_t *optlen) { - struct tcp_info tinfo = { 0 }; + struct tcp_info_linux tinfo = { 0 }; socklen_t sl = sizeof(tinfo); int s = conn->sock; @@ -2578,7 +2570,6 @@ static bool tcp_probe_peek_offset_cap(sa_family_t af) return ret; } -#ifdef HAS_SND_WND /** * tcp_probe_snd_wnd_cap() - Check if TCP_INFO reports tcpi_snd_wnd * @@ -2586,7 +2577,7 @@ static bool tcp_probe_peek_offset_cap(sa_family_t af) */ static bool tcp_probe_snd_wnd_cap(void) { - struct tcp_info tinfo; + struct tcp_info_linux tinfo; socklen_t sl = sizeof(tinfo); int s; @@ -2604,13 +2595,12 @@ static bool 
tcp_probe_snd_wnd_cap(void) close(s); - if (sl < (offsetof(struct tcp_info, tcpi_snd_wnd) + + if (sl < (offsetof(struct tcp_info_linux, tcpi_snd_wnd) + sizeof(tinfo.tcpi_snd_wnd))) return false; return true; } -#endif /* HAS_SND_WND */ /** * tcp_init() - Get initial sequence, hash secret, initialise per-socket data @@ -2645,9 +2635,7 @@ int tcp_init(struct ctx *c) (!c->ifi6 || tcp_probe_peek_offset_cap(AF_INET6)); debug("SO_PEEK_OFF%ssupported", peek_offset_cap ? " " : " not "); -#ifdef HAS_SND_WND snd_wnd_cap = tcp_probe_snd_wnd_cap(); -#endif debug("TCP_INFO tcpi_snd_wnd field%ssupported", snd_wnd_cap ? " " : " not "); diff --git a/tcp_info.h b/tcp_info.h new file mode 100644 index 0000000..06ccb16 --- /dev/null +++ b/tcp_info.h @@ -0,0 +1,120 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later + * Copyright Red Hat + * + * Largely derived from include/linux/tcp.h in the Linux kernel + */ + +#ifndef TCP_INFO_H +#define TCP_INFO_H + +/* struct tcp_info_linux - Information from Linux TCP_INFO getsockopt() + * + * Some fields returned by TCP_INFO have been there for ages and are shared with + * BSD. struct tcp_info from netinet/tcp.h has only those fields. There are + * also a many Linux specific extensions to the structure, which are only found + * in the linux/tcp.h version of struct tcp_info. + * + * We want to use some of those extension fields, when available. We can test + * for availability in the runtime kernel using the length returned from + * getsockopt(). However, we won't necessarily be compiled against the same + * kernel headers as we'll run with, so compiling directly against linux/tcp.h + * means wrapping every field access in an #ifdef whose #else does the same + * thing as when the field is missing at runtime. This rapidly gets messy. + * + * Instead we define here struct tcp_info_linux which includes all the Linux + * extensions that we want to use. This is taken from v6.11 of the kernel. 
+ */ +struct tcp_info_linux { + uint8_t tcpi_state; + uint8_t tcpi_ca_state; + uint8_t tcpi_retransmits; + uint8_t tcpi_probes; + uint8_t tcpi_backoff; + uint8_t tcpi_options; + uint8_t tcpi_snd_wscale : 4, tcpi_rcv_wscale : 4; + uint8_t tcpi_delivery_rate_app_limited:1, tcpi_fastopen_client_fail:2; + + uint32_t tcpi_rto; + uint32_t tcpi_ato; + uint32_t tcpi_snd_mss; + uint32_t tcpi_rcv_mss; + + uint32_t tcpi_unacked; + uint32_t tcpi_sacked; + uint32_t tcpi_lost; + uint32_t tcpi_retrans; + uint32_t tcpi_fackets; + + /* Times. */ + uint32_t tcpi_last_data_sent; + uint32_t tcpi_last_ack_sent; + uint32_t tcpi_last_data_recv; + uint32_t tcpi_last_ack_recv; + + /* Metrics. */ + uint32_t tcpi_pmtu; + uint32_t tcpi_rcv_ssthresh; + uint32_t tcpi_rtt; + uint32_t tcpi_rttvar; + uint32_t tcpi_snd_ssthresh; + uint32_t tcpi_snd_cwnd; + uint32_t tcpi_advmss; + uint32_t tcpi_reordering; + + uint32_t tcpi_rcv_rtt; + uint32_t tcpi_rcv_space; + + uint32_t tcpi_total_retrans; + + /* Linux extensions */ + uint64_t tcpi_pacing_rate; + uint64_t tcpi_max_pacing_rate; + uint64_t tcpi_bytes_acked; /* RFC4898 tcpEStatsAppHCThruOctetsAcked */ + uint64_t tcpi_bytes_received; /* RFC4898 tcpEStatsAppHCThruOctetsReceived */ + uint32_t tcpi_segs_out; /* RFC4898 tcpEStatsPerfSegsOut */ + uint32_t tcpi_segs_in; /* RFC4898 tcpEStatsPerfSegsIn */ + + uint32_t tcpi_notsent_bytes; + uint32_t tcpi_min_rtt; + uint32_t tcpi_data_segs_in; /* RFC4898 tcpEStatsDataSegsIn */ + uint32_t tcpi_data_segs_out; /* RFC4898 tcpEStatsDataSegsOut */ + + uint64_t tcpi_delivery_rate; + + uint64_t tcpi_busy_time; /* Time (usec) busy sending data */ + uint64_t tcpi_rwnd_limited; /* Time (usec) limited by receive window */ + uint64_t tcpi_sndbuf_limited; /* Time (usec) limited by send buffer */ + + uint32_t tcpi_delivered; + uint32_t tcpi_delivered_ce; + + uint64_t tcpi_bytes_sent; /* RFC4898 tcpEStatsPerfHCDataOctetsOut */ + uint64_t tcpi_bytes_retrans; /* RFC4898 tcpEStatsPerfOctetsRetrans */ + uint32_t tcpi_dsack_dups; 
/* RFC4898 tcpEStatsStackDSACKDups */ + uint32_t tcpi_reord_seen; /* reordering events seen */ + + uint32_t tcpi_rcv_ooopack; /* Out-of-order packets received */ + + uint32_t tcpi_snd_wnd; /* peer's advertised receive window after + * scaling (bytes) + */ + uint32_t tcpi_rcv_wnd; /* local advertised receive window after + * scaling (bytes) + */ + + uint32_t tcpi_rehash; /* PLB or timeout triggered rehash attempts */ + + uint16_t tcpi_total_rto; /* Total number of RTO timeouts, including + * SYN/SYN-ACK and recurring timeouts. + */ + uint16_t tcpi_total_rto_recoveries; /* Total number of RTO + * recoveries, including any + * unfinished recovery. + */ + uint32_t tcpi_total_rto_time; /* Total time spent in RTO recoveries + * in milliseconds, including any + * unfinished recovery. + */ +}; + +#endif /* TCP_INFO_H */ diff --git a/tcp_internal.h b/tcp_internal.h index 1ab8ce2..a5a47df 100644 --- a/tcp_internal.h +++ b/tcp_internal.h @@ -175,12 +175,14 @@ void tcp_rst_do(const struct ctx *c, struct tcp_tap_conn *conn); tcp_rst_do(c, conn); \ } while (0) +struct tcp_info_linux; + size_t tcp_l2_buf_fill_headers(const struct tcp_tap_conn *conn, struct iovec *iov, size_t dlen, const uint16_t *check, uint32_t seq, bool no_tcp_csum); int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_tap_conn *conn, - bool force_seq, struct tcp_info *tinfo); + bool force_seq, struct tcp_info_linux *tinfo); int tcp_prepare_flags(const struct ctx *c, struct tcp_tap_conn *conn, int flags, struct tcphdr *th, struct tcp_syn_opts *opts, size_t *optlen); From 81143813a6b3ec297c31d234bbdc6000ed8c7052 Mon Sep 17 00:00:00 2001 From: David Gibson <david@gibson.dropbear.id.au> Date: Thu, 24 Oct 2024 15:59:21 +1100 Subject: [PATCH 062/382] tcp: Generalise probing for tcpi_snd_wnd field In order to use the tcpi_snd_wnd field from the TCP_INFO getsockopt() we need the field to be supported in the runtime kernel (snd_wnd_cap). 
In fact we should check that for every tcp_info field we want to use, beyond the very old ones shared with BSD. Prepare to do that, by generalising the probing from setting a single bool to instead record the size of the returned TCP_INFO structure. We can then use that recorded value to check for the presence of any field we need. Signed-off-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- tcp.c | 34 +++++++++++++++++++--------------- 1 file changed, 19 insertions(+), 15 deletions(-) diff --git a/tcp.c b/tcp.c index 2a0b272..998e56d 100644 --- a/tcp.c +++ b/tcp.c @@ -361,10 +361,15 @@ char tcp_buf_discard [MAX_WINDOW]; /* Does the kernel support TCP_PEEK_OFF? */ bool peek_offset_cap; -/* Does the kernel report sending window in TCP_INFO (kernel commit - * 8f7baad7f035) - */ -bool snd_wnd_cap; +/* Size of data returned by TCP_INFO getsockopt() */ +socklen_t tcp_info_size; + +#define tcp_info_cap(f_) \ + ((offsetof(struct tcp_info_linux, tcpi_##f_) + \ + sizeof(((struct tcp_info_linux *)NULL)->tcpi_##f_)) <= tcp_info_size) + +/* Kernel reports sending window in TCP_INFO (kernel commit 8f7baad7f035) */ +#define snd_wnd_cap tcp_info_cap(snd_wnd) /* sendmsg() to socket */ static struct iovec tcp_iov [UIO_MAXIOV]; @@ -2571,11 +2576,11 @@ static bool tcp_probe_peek_offset_cap(sa_family_t af) } /** - * tcp_probe_snd_wnd_cap() - Check if TCP_INFO reports tcpi_snd_wnd + * tcp_probe_tcp_info() - Check what data TCP_INFO reports * - * Return: true if supported, false otherwise + * Return: Number of bytes returned by TCP_INFO getsockopt() */ -static bool tcp_probe_snd_wnd_cap(void) +static socklen_t tcp_probe_tcp_info(void) { struct tcp_info_linux tinfo; socklen_t sl = sizeof(tinfo); @@ -2595,11 +2600,7 @@ static bool tcp_probe_snd_wnd_cap(void) close(s); - if (sl < (offsetof(struct tcp_info_linux, tcpi_snd_wnd) + - sizeof(tinfo.tcpi_snd_wnd))) - return false; - - return true; + return sl; } /** @@ -2635,9 +2636,12 @@
int tcp_init(struct ctx *c) (!c->ifi6 || tcp_probe_peek_offset_cap(AF_INET6)); debug("SO_PEEK_OFF%ssupported", peek_offset_cap ? " " : " not "); - snd_wnd_cap = tcp_probe_snd_wnd_cap(); - debug("TCP_INFO tcpi_snd_wnd field%ssupported", - snd_wnd_cap ? " " : " not "); + tcp_info_size = tcp_probe_tcp_info(); + +#define dbg_tcpi(f_) debug("TCP_INFO tcpi_%s field%s supported", \ + STRINGIFY(f_), tcp_info_cap(f_) ? " " : " not ") + dbg_tcpi(snd_wnd); +#undef dbg_tcpi return 0; } From e7fcd0c3481f15395ea4060eadfac0b6a8f69b29 Mon Sep 17 00:00:00 2001 From: David Gibson <david@gibson.dropbear.id.au> Date: Thu, 24 Oct 2024 15:59:22 +1100 Subject: [PATCH 063/382] tcp: Use runtime tests for TCP_INFO fields In order to use particular fields from the TCP_INFO getsockopt() we need them to be in structure returned by the runtime kernel. We attempt to determine that with the HAS_BYTES_ACKED and HAS_MIN_RTT defines, probed in the Makefile. However, that's not correct, because the kernel headers we compile against may not be the same as the runtime kernel. We instead should check against the size of structure returned from the TCP_INFO getsockopt() as we already do for tcpi_snd_wnd. Switch from the compile time flags to a runtime test. 
Signed-off-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- Makefile | 10 ---------- tcp.c | 52 ++++++++++++++++++++++++++-------------------------- 2 files changed, 26 insertions(+), 36 deletions(-) diff --git a/Makefile b/Makefile index 6faa501..4c2d020 100644 --- a/Makefile +++ b/Makefile @@ -67,16 +67,6 @@ PASST_HEADERS = arch.h arp.h checksum.h conf.h dhcp.h dhcpv6.h flow.h fwd.h \ udp.h udp_flow.h util.h HEADERS = $(PASST_HEADERS) seccomp.h -C := \#include <linux/tcp.h>\nstruct tcp_info x = { .tcpi_bytes_acked = 0 }; -ifeq ($(shell printf "$(C)" | $(CC) -S -xc - -o - >/dev/null 2>&1; echo $$?),0) - FLAGS += -DHAS_BYTES_ACKED -endif - -C := \#include <linux/tcp.h>\nstruct tcp_info x = { .tcpi_min_rtt = 0 }; -ifeq ($(shell printf "$(C)" | $(CC) -S -xc - -o - >/dev/null 2>&1; echo $$?),0) - FLAGS += -DHAS_MIN_RTT -endif - C := \#include <sys/random.h>\nint main(){int a=getrandom(0, 0, 0);} ifeq ($(shell printf "$(C)" | $(CC) -S -xc - -o - >/dev/null 2>&1; echo $$?),0) FLAGS += -DHAS_GETRANDOM diff --git a/tcp.c b/tcp.c index 998e56d..0569dc6 100644 --- a/tcp.c +++ b/tcp.c @@ -370,6 +370,10 @@ socklen_t tcp_info_size; /* Kernel reports sending window in TCP_INFO (kernel commit 8f7baad7f035) */ #define snd_wnd_cap tcp_info_cap(snd_wnd) +/* Kernel reports bytes acked in TCP_INFO (kernel commit 0df48c26d84) */ +#define bytes_acked_cap tcp_info_cap(bytes_acked) +/* Kernel reports minimum RTT in TCP_INFO (kernel commit cd9b266095f4) */ +#define min_rtt_cap tcp_info_cap(min_rtt) /* sendmsg() to socket */ static struct iovec tcp_iov [UIO_MAXIOV]; @@ -677,11 +681,10 @@ static int tcp_rtt_dst_low(const struct tcp_tap_conn *conn) static void tcp_rtt_dst_check(const struct tcp_tap_conn *conn, const struct tcp_info_linux *tinfo) { -#ifdef HAS_MIN_RTT const struct flowside *tapside = TAPFLOW(conn); int i, hole = -1; - if (!tinfo->tcpi_min_rtt || + if (!min_rtt_cap || (int)tinfo->tcpi_min_rtt > LOW_RTT_THRESHOLD) return; @@ 
-702,10 +705,6 @@ static void tcp_rtt_dst_check(const struct tcp_tap_conn *conn, if (hole == LOW_RTT_TABLE_SIZE) hole = 0; inany_from_af(low_rtt_dst + hole, AF_INET6, &in6addr_any); -#else - (void)conn; - (void)tinfo; -#endif /* HAS_MIN_RTT */ } /** @@ -1121,30 +1120,29 @@ int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_tap_conn *conn, uint32_t new_wnd_to_tap = prev_wnd_to_tap; int s = conn->sock; -#ifndef HAS_BYTES_ACKED - (void)force_seq; - - conn->seq_ack_to_tap = conn->seq_from_tap; - if (SEQ_LT(conn->seq_ack_to_tap, prev_ack_to_tap)) - conn->seq_ack_to_tap = prev_ack_to_tap; -#else - if ((unsigned)SNDBUF_GET(conn) < SNDBUF_SMALL || tcp_rtt_dst_low(conn) - || CONN_IS_CLOSING(conn) || (conn->flags & LOCAL) || force_seq) { + if (!bytes_acked_cap) { conn->seq_ack_to_tap = conn->seq_from_tap; - } else if (conn->seq_ack_to_tap != conn->seq_from_tap) { - if (!tinfo) { - tinfo = &tinfo_new; - if (getsockopt(s, SOL_TCP, TCP_INFO, tinfo, &sl)) - return 0; - } - - conn->seq_ack_to_tap = tinfo->tcpi_bytes_acked + - conn->seq_init_from_tap; - if (SEQ_LT(conn->seq_ack_to_tap, prev_ack_to_tap)) conn->seq_ack_to_tap = prev_ack_to_tap; + } else { + if ((unsigned)SNDBUF_GET(conn) < SNDBUF_SMALL || + tcp_rtt_dst_low(conn) || CONN_IS_CLOSING(conn) || + (conn->flags & LOCAL) || force_seq) { + conn->seq_ack_to_tap = conn->seq_from_tap; + } else if (conn->seq_ack_to_tap != conn->seq_from_tap) { + if (!tinfo) { + tinfo = &tinfo_new; + if (getsockopt(s, SOL_TCP, TCP_INFO, tinfo, &sl)) + return 0; + } + + conn->seq_ack_to_tap = tinfo->tcpi_bytes_acked + + conn->seq_init_from_tap; + + if (SEQ_LT(conn->seq_ack_to_tap, prev_ack_to_tap)) + conn->seq_ack_to_tap = prev_ack_to_tap; + } } -#endif /* !HAS_BYTES_ACKED */ if (!snd_wnd_cap) { tcp_get_sndbuf(conn); @@ -2641,6 +2639,8 @@ int tcp_init(struct ctx *c) #define dbg_tcpi(f_) debug("TCP_INFO tcpi_%s field%s supported", \ STRINGIFY(f_), tcp_info_cap(f_) ? 
" " : " not ") dbg_tcpi(snd_wnd); + dbg_tcpi(bytes_acked); + dbg_tcpi(min_rtt); #undef dbg_tcpi return 0; From f43f7d5e89b51b44a03de5a1eb566e14604bb08d Mon Sep 17 00:00:00 2001 From: Laurent Vivier <lvivier@redhat.com> Date: Thu, 24 Oct 2024 10:50:58 +0200 Subject: [PATCH 064/382] tcp: cleanup tcp_buf_data_from_sock() Remove the err label as there is only one caller, and move code to the caller position. ret is not needed here anymore as it is always 0. Remove sendlen as we can use directly len. Signed-off-by: Laurent Vivier <lvivier@redhat.com> Reviewed-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- tcp_buf.c | 34 ++++++++++++++++------------------ 1 file changed, 16 insertions(+), 18 deletions(-) diff --git a/tcp_buf.c b/tcp_buf.c index 44df0e4..cb6742c 100644 --- a/tcp_buf.c +++ b/tcp_buf.c @@ -382,8 +382,8 @@ int tcp_buf_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn) { uint32_t wnd_scaled = conn->wnd_from_tap << conn->ws_from_tap; int fill_bufs, send_bufs = 0, last_len, iov_rem = 0; - int sendlen, len, dlen, v4 = CONN_V4(conn); - int s = conn->sock, i, ret = 0; + int len, dlen, v4 = CONN_V4(conn); + int s = conn->sock, i; struct msghdr mh_sock = { 0 }; uint16_t mss = MSS_GET(conn); uint32_t already_sent, seq; @@ -453,12 +453,19 @@ int tcp_buf_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn) len = recvmsg(s, &mh_sock, MSG_PEEK); while (len < 0 && errno == EINTR); - if (len < 0) - goto err; + if (len < 0) { + if (errno != EAGAIN && errno != EWOULDBLOCK) { + tcp_rst(c, conn); + return -errno; + } + + return 0; + } if (!len) { if ((conn->events & (SOCK_FIN_RCVD | TAP_FIN_SENT)) == SOCK_FIN_RCVD) { - if ((ret = tcp_buf_send_flag(c, conn, FIN | ACK))) { + int ret = tcp_buf_send_flag(c, conn, FIN | ACK); + if (ret) { tcp_rst(c, conn); return ret; } @@ -469,19 +476,18 @@ int tcp_buf_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn) return 0; } - sendlen = len; if
(!peek_offset_cap) - sendlen -= already_sent; + len -= already_sent; - if (sendlen <= 0) { + if (len <= 0) { conn_flag(c, conn, STALLED); return 0; } conn_flag(c, conn, ~STALLED); - send_bufs = DIV_ROUND_UP(sendlen, mss); - last_len = sendlen - (send_bufs - 1) * mss; + send_bufs = DIV_ROUND_UP(len, mss); + last_len = len - (send_bufs - 1) * mss; /* Likely, some new data was acked too. */ tcp_update_seqack_wnd(c, conn, false, NULL); @@ -502,12 +508,4 @@ int tcp_buf_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn) conn_flag(c, conn, ACK_FROM_TAP_DUE); return 0; - -err: - if (errno != EAGAIN && errno != EWOULDBLOCK) { - ret = -errno; - tcp_rst(c, conn); - } - - return ret; } From 5563d5f668450441e4f3cedc9d83283739b5e0ca Mon Sep 17 00:00:00 2001 From: Laurent Vivier <lvivier@redhat.com> Date: Fri, 25 Oct 2024 09:49:10 +0200 Subject: [PATCH 065/382] test: remove obsolete images Remove debian-9-nocloud-amd64-daily-20200210-166.qcow2 and openSUSE-Tumbleweed-JeOS.x86_64-kvm-and-xen.qcow2 as they cannot be downloaded anymore Signed-off-by: Laurent Vivier <lvivier@redhat.com> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- test/Makefile | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/test/Makefile b/test/Makefile index 35a3b55..5e49047 100644 --- a/test/Makefile +++ b/test/Makefile @@ -8,7 +8,6 @@ WGET = wget -c DEBIAN_IMGS = debian-8.11.0-openstack-amd64.qcow2 \ - debian-9-nocloud-amd64-daily-20200210-166.qcow2 \ debian-10-nocloud-amd64.qcow2 \ debian-10-generic-arm64.qcow2 \ debian-10-generic-ppc64el-20220911-1135.qcow2 \ @@ -42,8 +41,7 @@ OPENSUSE_IMGS = openSUSE-Leap-15.1-JeOS.x86_64-kvm-and-xen.qcow2 \ openSUSE-Leap-15.2-JeOS.x86_64-kvm-and-xen.qcow2 \ openSUSE-Leap-15.3-JeOS.x86_64-kvm-and-xen.qcow2 \ openSUSE-Tumbleweed-ARM-JeOS-efi.aarch64.raw.xz \ - openSUSE-Tumbleweed-ARM-JeOS-efi.armv7l.raw.xz \ - openSUSE-Tumbleweed-JeOS.x86_64-kvm-and-xen.qcow2 + openSUSE-Tumbleweed-ARM-JeOS-efi.armv7l.raw.xz UBUNTU_OLD_IMGS = 
trusty-server-cloudimg-amd64-disk1.img \ trusty-server-cloudimg-i386-disk1.img \ @@ -135,9 +133,6 @@ realclean: clean debian-8.11.0-openstack-%.qcow2: $(WGET) -O $@ https://cloud.debian.org/images/cloud/OpenStack/archive/8.11.0/debian-8.11.0-openstack-$*.qcow2 -debian-9-nocloud-%-daily-20200210-166.qcow2: - $(WGET) -O $@ https://cloud.debian.org/images/cloud/stretch/daily/20200210-166/debian-9-nocloud-$*-daily-20200210-166.qcow2 - debian-10-nocloud-%.qcow2: $(WGET) -O $@ https://cloud.debian.org/images/cloud/buster/latest/debian-10-nocloud-$*.qcow2 @@ -203,9 +198,6 @@ openSUSE-Tumbleweed-ARM-JeOS-efi.aarch64.raw.xz: openSUSE-Tumbleweed-ARM-JeOS-efi.armv7l.raw.xz: $(WGET) -O $@ http://download.opensuse.org/ports/armv7hl/tumbleweed/appliances/openSUSE-Tumbleweed-ARM-JeOS-efi.armv7l.raw.xz -openSUSE-Tumbleweed-JeOS.x86_64-kvm-and-xen.qcow2: - $(WGET) -O $@ https://download.opensuse.org/tumbleweed/appliances/openSUSE-Tumbleweed-JeOS.x86_64-kvm-and-xen.qcow2 - # Ubuntu downloads trusty-server-cloudimg-%-disk1.img: $(WGET) -O $@ https://cloud-images.ubuntu.com/trusty/current/trusty-server-cloudimg-$*-disk1.img From 2053c36dec4ce3e5bfddb52f5f2957165a692f1d Mon Sep 17 00:00:00 2001 From: Jon Maloy <jmaloy@redhat.com> Date: Mon, 28 Oct 2024 22:13:59 -0400 Subject: [PATCH 066/382] tcp: set ip and eth headers in l2 tap queues on the fly l2 tap queue entries are currently initialized at system start, and reused with preset headers through its whole life time. The only fields we need to update per message are things like payload size and checksums. If we want to reuse these entries between ipv4 and ipv6 messages we will need to set the pointer to the right header on the fly per message, since the header type may differ between entries in the same queue. The same needs to be done for the ethernet header. We do these changes here. 
Signed-off-by: Jon Maloy <jmaloy@redhat.com> Reviewed-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- tcp_buf.c | 50 ++++++++++++++++++++++++++++---------------------- 1 file changed, 28 insertions(+), 22 deletions(-) diff --git a/tcp_buf.c b/tcp_buf.c index cb6742c..e249c6b 100644 --- a/tcp_buf.c +++ b/tcp_buf.c @@ -130,8 +130,7 @@ void tcp_sock4_iov_init(const struct ctx *c) iov = tcp4_l2_iov[i]; iov[TCP_IOV_TAP] = tap_hdr_iov(c, &tcp4_payload_tap_hdr[i]); - iov[TCP_IOV_ETH] = IOV_OF_LVALUE(tcp4_eth_src); - iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp4_payload_ip[i]); + iov[TCP_IOV_ETH].iov_len = sizeof(struct ethhdr); iov[TCP_IOV_PAYLOAD].iov_base = &tcp4_payload[i]; } @@ -173,8 +172,7 @@ void tcp_sock6_iov_init(const struct ctx *c) iov = tcp6_l2_iov[i]; iov[TCP_IOV_TAP] = tap_hdr_iov(c, &tcp6_payload_tap_hdr[i]); - iov[TCP_IOV_ETH] = IOV_OF_LVALUE(tcp6_eth_src); - iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp6_payload_ip[i]); + iov[TCP_IOV_ETH].iov_len = sizeof(struct ethhdr); iov[TCP_IOV_PAYLOAD].iov_base = &tcp6_payload[i]; } @@ -273,11 +271,17 @@ int tcp_buf_send_flag(const struct ctx *c, struct tcp_tap_conn *conn, int flags) uint32_t seq; int ret; - if (CONN_V4(conn)) - iov = tcp4_l2_flags_iov[tcp4_flags_used++]; - else - iov = tcp6_l2_flags_iov[tcp6_flags_used++]; - + if (CONN_V4(conn)) { + iov = tcp4_l2_flags_iov[tcp4_flags_used]; + iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp4_flags_ip[tcp4_flags_used]); + iov[TCP_IOV_ETH].iov_base = &tcp4_eth_src; + tcp4_flags_used++; + } else { + iov = tcp6_l2_flags_iov[tcp6_flags_used]; + iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp6_flags_ip[tcp6_flags_used]); + iov[TCP_IOV_ETH].iov_base = &tcp6_eth_src; + tcp6_flags_used++; + } payload = iov[TCP_IOV_PAYLOAD].iov_base; seq = conn->seq_to_tap; @@ -296,21 +300,19 @@ int tcp_buf_send_flag(const struct ctx *c, struct tcp_tap_conn *conn, int flags) if (flags & DUP_ACK) { struct iovec *dup_iov; - int i; if (CONN_V4(conn)) dup_iov = 
tcp4_l2_flags_iov[tcp4_flags_used++]; else dup_iov = tcp6_l2_flags_iov[tcp6_flags_used++]; - for (i = 0; i < TCP_NUM_IOVS; i++) { - /* All frames share the same ethernet header buffer */ - if (i != TCP_IOV_ETH) { - memcpy(dup_iov[i].iov_base, iov[i].iov_base, - iov[i].iov_len); - } - } - dup_iov[TCP_IOV_PAYLOAD].iov_len = iov[TCP_IOV_PAYLOAD].iov_len; + memcpy(dup_iov[TCP_IOV_TAP].iov_base, iov[TCP_IOV_TAP].iov_base, + iov[TCP_IOV_TAP].iov_len); + dup_iov[TCP_IOV_ETH].iov_base = iov[TCP_IOV_ETH].iov_base; + dup_iov[TCP_IOV_IP] = iov[TCP_IOV_IP]; + memcpy(dup_iov[TCP_IOV_PAYLOAD].iov_base, + iov[TCP_IOV_PAYLOAD].iov_base, l4len); + dup_iov[TCP_IOV_PAYLOAD].iov_len = l4len; } if (CONN_V4(conn)) { @@ -350,8 +352,10 @@ static void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn, } tcp4_frame_conns[tcp4_payload_used] = conn; - - iov = tcp4_l2_iov[tcp4_payload_used++]; + iov = tcp4_l2_iov[tcp4_payload_used]; + iov[TCP_IOV_IP] = + IOV_OF_LVALUE(tcp4_payload_ip[tcp4_payload_used++]); + iov[TCP_IOV_ETH].iov_base = &tcp4_eth_src; l4len = tcp_l2_buf_fill_headers(conn, iov, dlen, check, seq, false); iov[TCP_IOV_PAYLOAD].iov_len = l4len; @@ -359,8 +363,10 @@ static void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn, tcp_payload_flush(c); } else if (CONN_V6(conn)) { tcp6_frame_conns[tcp6_payload_used] = conn; - - iov = tcp6_l2_iov[tcp6_payload_used++]; + iov = tcp6_l2_iov[tcp6_payload_used]; + iov[TCP_IOV_IP] = + IOV_OF_LVALUE(tcp6_payload_ip[tcp6_payload_used++]); + iov[TCP_IOV_ETH].iov_base = &tcp6_eth_src; l4len = tcp_l2_buf_fill_headers(conn, iov, dlen, NULL, seq, false); iov[TCP_IOV_PAYLOAD].iov_len = l4len; From ba38e67cf405c5fd4c0fc043af453fa23a55fb35 Mon Sep 17 00:00:00 2001 From: Jon Maloy <jmaloy@redhat.com> Date: Mon, 28 Oct 2024 22:14:00 -0400 Subject: [PATCH 067/382] tcp: unify l2 TCPv4 and TCPv6 queues and structures Following the preparations in the previous commit, we can now remove the payload and flag queues dedicated for TCPv6 
and TCPv4 and move all traffic into common queues handling both protocol types. Apart from reducing code and memory footprint, this change reduces a potential risk for TCPv4 traffic starving out TCPv6 traffic. Since we always flush out the TCPv4 frame queue before the TCPv6 queue, the latter will never be handled if the former fails to send all its frames. Tests with iperf3 shows no measurable change in performance after this change. Signed-off-by: Jon Maloy <jmaloy@redhat.com> Reviewed-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- tcp.c | 6 +- tcp_buf.c | 248 ++++++++++++++++-------------------------------------- tcp_buf.h | 3 +- 3 files changed, 76 insertions(+), 181 deletions(-) diff --git a/tcp.c b/tcp.c index 0569dc6..10ad06a 100644 --- a/tcp.c +++ b/tcp.c @@ -2611,11 +2611,7 @@ int tcp_init(struct ctx *c) { ASSERT(!c->no_tcp); - if (c->ifi4) - tcp_sock4_iov_init(c); - - if (c->ifi6) - tcp_sock6_iov_init(c); + tcp_sock_iov_init(c); memset(init_sock_pool4, 0xff, sizeof(init_sock_pool4)); memset(init_sock_pool6, 0xff, sizeof(init_sock_pool6)); diff --git a/tcp_buf.c b/tcp_buf.c index e249c6b..274e313 100644 --- a/tcp_buf.c +++ b/tcp_buf.c @@ -38,59 +38,44 @@ (c->mode == MODE_PASTA ? 
1 : TCP_FRAMES_MEM) /* Static buffers */ -/* Ethernet header for IPv4 frames */ + +/* Ethernet header for IPv4 and IPv6 frames */ static struct ethhdr tcp4_eth_src; +static struct ethhdr tcp6_eth_src; -static struct tap_hdr tcp4_payload_tap_hdr[TCP_FRAMES_MEM]; -/* IPv4 headers */ -static struct iphdr tcp4_payload_ip[TCP_FRAMES_MEM]; -/* TCP segments with payload for IPv4 frames */ -static struct tcp_payload_t tcp4_payload[TCP_FRAMES_MEM]; +static struct tap_hdr tcp_payload_tap_hdr[TCP_FRAMES_MEM]; -static_assert(MSS4 <= sizeof(tcp4_payload[0].data), "MSS4 is greater than 65516"); +/* IP headers for IPv4 and IPv6 */ +struct iphdr tcp4_payload_ip[TCP_FRAMES_MEM]; +struct ipv6hdr tcp6_payload_ip[TCP_FRAMES_MEM]; + +/* TCP segments with payload for IPv4 and IPv6 frames */ +static struct tcp_payload_t tcp_payload[TCP_FRAMES_MEM]; + +static_assert(MSS4 <= sizeof(tcp_payload[0].data), "MSS4 is greater than 65516"); +static_assert(MSS6 <= sizeof(tcp_payload[0].data), "MSS6 is greater than 65516"); /* References tracking the owner connection of frames in the tap outqueue */ -static struct tcp_tap_conn *tcp4_frame_conns[TCP_FRAMES_MEM]; -static unsigned int tcp4_payload_used; +static struct tcp_tap_conn *tcp_frame_conns[TCP_FRAMES_MEM]; +static unsigned int tcp_payload_used; -static struct tap_hdr tcp4_flags_tap_hdr[TCP_FRAMES_MEM]; +static struct tap_hdr tcp_flags_tap_hdr[TCP_FRAMES_MEM]; /* IPv4 headers for TCP segment without payload */ static struct iphdr tcp4_flags_ip[TCP_FRAMES_MEM]; /* TCP segments without payload for IPv4 frames */ -static struct tcp_flags_t tcp4_flags[TCP_FRAMES_MEM]; +static struct tcp_flags_t tcp_flags[TCP_FRAMES_MEM]; -static unsigned int tcp4_flags_used; +static unsigned int tcp_flags_used; -/* Ethernet header for IPv6 frames */ -static struct ethhdr tcp6_eth_src; - -static struct tap_hdr tcp6_payload_tap_hdr[TCP_FRAMES_MEM]; -/* IPv6 headers */ -static struct ipv6hdr tcp6_payload_ip[TCP_FRAMES_MEM]; -/* TCP headers and data for IPv6 frames */ 
-static struct tcp_payload_t tcp6_payload[TCP_FRAMES_MEM]; - -static_assert(MSS6 <= sizeof(tcp6_payload[0].data), "MSS6 is greater than 65516"); - -/* References tracking the owner connection of frames in the tap outqueue */ -static struct tcp_tap_conn *tcp6_frame_conns[TCP_FRAMES_MEM]; -static unsigned int tcp6_payload_used; - -static struct tap_hdr tcp6_flags_tap_hdr[TCP_FRAMES_MEM]; /* IPv6 headers for TCP segment without payload */ static struct ipv6hdr tcp6_flags_ip[TCP_FRAMES_MEM]; -/* TCP segment without payload for IPv6 frames */ -static struct tcp_flags_t tcp6_flags[TCP_FRAMES_MEM]; - -static unsigned int tcp6_flags_used; /* recvmsg()/sendmsg() data for tap */ static struct iovec iov_sock [TCP_FRAMES_MEM + 1]; -static struct iovec tcp4_l2_iov [TCP_FRAMES_MEM][TCP_NUM_IOVS]; -static struct iovec tcp6_l2_iov [TCP_FRAMES_MEM][TCP_NUM_IOVS]; -static struct iovec tcp4_l2_flags_iov [TCP_FRAMES_MEM][TCP_NUM_IOVS]; -static struct iovec tcp6_l2_flags_iov [TCP_FRAMES_MEM][TCP_NUM_IOVS]; +static struct iovec tcp_l2_iov[TCP_FRAMES_MEM][TCP_NUM_IOVS]; +static struct iovec tcp_l2_flags_iov[TCP_FRAMES_MEM][TCP_NUM_IOVS]; + /** * tcp_update_l2_buf() - Update Ethernet header buffers with addresses * @eth_d: Ethernet destination address, NULL if unchanged @@ -103,86 +88,46 @@ void tcp_update_l2_buf(const unsigned char *eth_d, const unsigned char *eth_s) } /** - * tcp_sock4_iov_init() - Initialise scatter-gather L2 buffers for IPv4 sockets + * tcp_sock_iov_init() - Initialise scatter-gather L2 buffers for IPv4 sockets * @c: Execution context */ -void tcp_sock4_iov_init(const struct ctx *c) -{ - struct iphdr iph = L2_BUF_IP4_INIT(IPPROTO_TCP); - struct iovec *iov; - int i; - - tcp4_eth_src.h_proto = htons_constant(ETH_P_IP); - - for (i = 0; i < ARRAY_SIZE(tcp4_payload); i++) { - tcp4_payload_ip[i] = iph; - tcp4_payload[i].th.doff = sizeof(struct tcphdr) / 4; - tcp4_payload[i].th.ack = 1; - } - - for (i = 0; i < ARRAY_SIZE(tcp4_flags); i++) { - tcp4_flags_ip[i] = iph; - 
tcp4_flags[i].th.doff = sizeof(struct tcphdr) / 4; - tcp4_flags[i].th.ack = 1; - } - - for (i = 0; i < TCP_FRAMES_MEM; i++) { - iov = tcp4_l2_iov[i]; - - iov[TCP_IOV_TAP] = tap_hdr_iov(c, &tcp4_payload_tap_hdr[i]); - iov[TCP_IOV_ETH].iov_len = sizeof(struct ethhdr); - iov[TCP_IOV_PAYLOAD].iov_base = &tcp4_payload[i]; - } - - for (i = 0; i < TCP_FRAMES_MEM; i++) { - iov = tcp4_l2_flags_iov[i]; - - iov[TCP_IOV_TAP] = tap_hdr_iov(c, &tcp4_flags_tap_hdr[i]); - iov[TCP_IOV_ETH] = IOV_OF_LVALUE(tcp4_eth_src); - iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp4_flags_ip[i]); - iov[TCP_IOV_PAYLOAD].iov_base = &tcp4_flags[i]; - } -} - -/** - * tcp_sock6_iov_init() - Initialise scatter-gather L2 buffers for IPv6 sockets - * @c: Execution context - */ -void tcp_sock6_iov_init(const struct ctx *c) +void tcp_sock_iov_init(const struct ctx *c) { struct ipv6hdr ip6 = L2_BUF_IP6_INIT(IPPROTO_TCP); - struct iovec *iov; + struct iphdr iph = L2_BUF_IP4_INIT(IPPROTO_TCP); int i; tcp6_eth_src.h_proto = htons_constant(ETH_P_IPV6); + tcp4_eth_src.h_proto = htons_constant(ETH_P_IP); - for (i = 0; i < ARRAY_SIZE(tcp6_payload); i++) { + for (i = 0; i < ARRAY_SIZE(tcp_payload); i++) { tcp6_payload_ip[i] = ip6; - tcp6_payload[i].th.doff = sizeof(struct tcphdr) / 4; - tcp6_payload[i].th.ack = 1; + tcp4_payload_ip[i] = iph; + tcp_payload[i].th.doff = sizeof(struct tcphdr) / 4; + tcp_payload[i].th.ack = 1; } - for (i = 0; i < ARRAY_SIZE(tcp6_flags); i++) { + for (i = 0; i < ARRAY_SIZE(tcp_flags); i++) { tcp6_flags_ip[i] = ip6; - tcp6_flags[i].th.doff = sizeof(struct tcphdr) / 4; - tcp6_flags[i].th .ack = 1; + tcp4_flags_ip[i] = iph; + tcp_flags[i].th.doff = sizeof(struct tcphdr) / 4; + tcp_flags[i].th.ack = 1; } for (i = 0; i < TCP_FRAMES_MEM; i++) { - iov = tcp6_l2_iov[i]; + struct iovec *iov = tcp_l2_iov[i]; - iov[TCP_IOV_TAP] = tap_hdr_iov(c, &tcp6_payload_tap_hdr[i]); + iov[TCP_IOV_TAP] = tap_hdr_iov(c, &tcp_payload_tap_hdr[i]); iov[TCP_IOV_ETH].iov_len = sizeof(struct ethhdr); - 
iov[TCP_IOV_PAYLOAD].iov_base = &tcp6_payload[i]; + iov[TCP_IOV_PAYLOAD].iov_base = &tcp_payload[i]; } for (i = 0; i < TCP_FRAMES_MEM; i++) { - iov = tcp6_l2_flags_iov[i]; + struct iovec *iov = tcp_l2_flags_iov[i]; - iov[TCP_IOV_TAP] = tap_hdr_iov(c, &tcp6_flags_tap_hdr[i]); - iov[TCP_IOV_ETH] = IOV_OF_LVALUE(tcp6_eth_src); - iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp6_flags_ip[i]); - iov[TCP_IOV_PAYLOAD].iov_base = &tcp6_flags[i]; + iov[TCP_IOV_TAP] = tap_hdr_iov(c, &tcp_flags_tap_hdr[i]); + iov[TCP_IOV_ETH].iov_len = sizeof(struct ethhdr); + iov[TCP_IOV_PAYLOAD].iov_base = &tcp_flags[i]; } } @@ -192,13 +137,9 @@ void tcp_sock6_iov_init(const struct ctx *c) */ void tcp_flags_flush(const struct ctx *c) { - tap_send_frames(c, &tcp6_l2_flags_iov[0][0], TCP_NUM_IOVS, - tcp6_flags_used); - tcp6_flags_used = 0; - - tap_send_frames(c, &tcp4_l2_flags_iov[0][0], TCP_NUM_IOVS, - tcp4_flags_used); - tcp4_flags_used = 0; + tap_send_frames(c, &tcp_l2_flags_iov[0][0], TCP_NUM_IOVS, + tcp_flags_used); + tcp_flags_used = 0; } /** @@ -237,21 +178,13 @@ void tcp_payload_flush(const struct ctx *c) { size_t m; - m = tap_send_frames(c, &tcp6_l2_iov[0][0], TCP_NUM_IOVS, - tcp6_payload_used); - if (m != tcp6_payload_used) { - tcp_revert_seq(c, &tcp6_frame_conns[m], &tcp6_l2_iov[m], - tcp6_payload_used - m); + m = tap_send_frames(c, &tcp_l2_iov[0][0], TCP_NUM_IOVS, + tcp_payload_used); + if (m != tcp_payload_used) { + tcp_revert_seq(c, &tcp_frame_conns[m], &tcp_l2_iov[m], + tcp_payload_used - m); } - tcp6_payload_used = 0; - - m = tap_send_frames(c, &tcp4_l2_iov[0][0], TCP_NUM_IOVS, - tcp4_payload_used); - if (m != tcp4_payload_used) { - tcp_revert_seq(c, &tcp4_frame_conns[m], &tcp4_l2_iov[m], - tcp4_payload_used - m); - } - tcp4_payload_used = 0; + tcp_payload_used = 0; } /** @@ -271,41 +204,30 @@ int tcp_buf_send_flag(const struct ctx *c, struct tcp_tap_conn *conn, int flags) uint32_t seq; int ret; + iov = tcp_l2_flags_iov[tcp_flags_used]; if (CONN_V4(conn)) { - iov = 
tcp4_l2_flags_iov[tcp4_flags_used]; - iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp4_flags_ip[tcp4_flags_used]); + iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp4_flags_ip[tcp_flags_used]); iov[TCP_IOV_ETH].iov_base = &tcp4_eth_src; - tcp4_flags_used++; } else { - iov = tcp6_l2_flags_iov[tcp6_flags_used]; - iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp6_flags_ip[tcp6_flags_used]); + iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp6_flags_ip[tcp_flags_used]); iov[TCP_IOV_ETH].iov_base = &tcp6_eth_src; - tcp6_flags_used++; } - payload = iov[TCP_IOV_PAYLOAD].iov_base; + payload = iov[TCP_IOV_PAYLOAD].iov_base; seq = conn->seq_to_tap; ret = tcp_prepare_flags(c, conn, flags, &payload->th, &payload->opts, &optlen); - if (ret <= 0) { - if (CONN_V4(conn)) - tcp4_flags_used--; - else - tcp6_flags_used--; + if (ret <= 0) return ret; - } + tcp_flags_used++; l4len = tcp_l2_buf_fill_headers(conn, iov, optlen, NULL, seq, false); iov[TCP_IOV_PAYLOAD].iov_len = l4len; if (flags & DUP_ACK) { struct iovec *dup_iov; - if (CONN_V4(conn)) - dup_iov = tcp4_l2_flags_iov[tcp4_flags_used++]; - else - dup_iov = tcp6_l2_flags_iov[tcp6_flags_used++]; - + dup_iov = tcp_l2_flags_iov[tcp_flags_used++]; memcpy(dup_iov[TCP_IOV_TAP].iov_base, iov[TCP_IOV_TAP].iov_base, iov[TCP_IOV_TAP].iov_len); dup_iov[TCP_IOV_ETH].iov_base = iov[TCP_IOV_ETH].iov_base; @@ -315,13 +237,8 @@ int tcp_buf_send_flag(const struct ctx *c, struct tcp_tap_conn *conn, int flags) dup_iov[TCP_IOV_PAYLOAD].iov_len = l4len; } - if (CONN_V4(conn)) { - if (tcp4_flags_used > TCP_FRAMES_MEM - 2) - tcp_flags_flush(c); - } else { - if (tcp6_flags_used > TCP_FRAMES_MEM - 2) - tcp_flags_flush(c); - } + if (tcp_flags_used > TCP_FRAMES_MEM - 2) + tcp_flags_flush(c); return 0; } @@ -337,42 +254,30 @@ int tcp_buf_send_flag(const struct ctx *c, struct tcp_tap_conn *conn, int flags) static void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn, ssize_t dlen, int no_csum, uint32_t seq) { + const uint16_t *check = NULL; struct iovec *iov; size_t l4len; conn->seq_to_tap = 
seq + dlen; - + tcp_frame_conns[tcp_payload_used] = conn; + iov = tcp_l2_iov[tcp_payload_used]; if (CONN_V4(conn)) { - struct iovec *iov_prev = tcp4_l2_iov[tcp4_payload_used - 1]; - const uint16_t *check = NULL; - if (no_csum) { + struct iovec *iov_prev = tcp_l2_iov[tcp_payload_used - 1]; struct iphdr *iph = iov_prev[TCP_IOV_IP].iov_base; + check = &iph->check; } - - tcp4_frame_conns[tcp4_payload_used] = conn; - iov = tcp4_l2_iov[tcp4_payload_used]; - iov[TCP_IOV_IP] = - IOV_OF_LVALUE(tcp4_payload_ip[tcp4_payload_used++]); + iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp4_payload_ip[tcp_payload_used]); iov[TCP_IOV_ETH].iov_base = &tcp4_eth_src; - l4len = tcp_l2_buf_fill_headers(conn, iov, dlen, check, seq, - false); - iov[TCP_IOV_PAYLOAD].iov_len = l4len; - if (tcp4_payload_used > TCP_FRAMES_MEM - 1) - tcp_payload_flush(c); } else if (CONN_V6(conn)) { - tcp6_frame_conns[tcp6_payload_used] = conn; - iov = tcp6_l2_iov[tcp6_payload_used]; - iov[TCP_IOV_IP] = - IOV_OF_LVALUE(tcp6_payload_ip[tcp6_payload_used++]); + iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp6_payload_ip[tcp_payload_used]); iov[TCP_IOV_ETH].iov_base = &tcp6_eth_src; - l4len = tcp_l2_buf_fill_headers(conn, iov, dlen, NULL, seq, - false); - iov[TCP_IOV_PAYLOAD].iov_len = l4len; - if (tcp6_payload_used > TCP_FRAMES_MEM - 1) - tcp_payload_flush(c); } + l4len = tcp_l2_buf_fill_headers(conn, iov, dlen, check, seq, false); + iov[TCP_IOV_PAYLOAD].iov_len = l4len; + if (++tcp_payload_used > TCP_FRAMES_MEM - 1) + tcp_payload_flush(c); } /** @@ -388,8 +293,7 @@ int tcp_buf_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn) { uint32_t wnd_scaled = conn->wnd_from_tap << conn->ws_from_tap; int fill_bufs, send_bufs = 0, last_len, iov_rem = 0; - int len, dlen, v4 = CONN_V4(conn); - int s = conn->sock, i; + int len, dlen, i, s = conn->sock; struct msghdr mh_sock = { 0 }; uint16_t mss = MSS_GET(conn); uint32_t already_sent, seq; @@ -436,19 +340,15 @@ int tcp_buf_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn) 
mh_sock.msg_iovlen = fill_bufs; } - if (( v4 && tcp4_payload_used + fill_bufs > TCP_FRAMES_MEM) || - (!v4 && tcp6_payload_used + fill_bufs > TCP_FRAMES_MEM)) { + if (tcp_payload_used + fill_bufs > TCP_FRAMES_MEM) { tcp_payload_flush(c); /* Silence Coverity CWE-125 false positive */ - tcp4_payload_used = tcp6_payload_used = 0; + tcp_payload_used = 0; } for (i = 0, iov = iov_sock + 1; i < fill_bufs; i++, iov++) { - if (v4) - iov->iov_base = &tcp4_payload[tcp4_payload_used + i].data; - else - iov->iov_base = &tcp6_payload[tcp6_payload_used + i].data; + iov->iov_base = &tcp_payload[tcp_payload_used + i].data; iov->iov_len = mss; } if (iov_rem) @@ -502,7 +402,7 @@ int tcp_buf_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn) dlen = mss; seq = conn->seq_to_tap; for (i = 0; i < send_bufs; i++) { - int no_csum = i && i != send_bufs - 1 && tcp4_payload_used; + int no_csum = i && i != send_bufs - 1 && tcp_payload_used; if (i == send_bufs - 1) dlen = last_len; diff --git a/tcp_buf.h b/tcp_buf.h index 8d4b615..49c04d4 100644 --- a/tcp_buf.h +++ b/tcp_buf.h @@ -6,8 +6,7 @@ #ifndef TCP_BUF_H #define TCP_BUF_H -void tcp_sock4_iov_init(const struct ctx *c); -void tcp_sock6_iov_init(const struct ctx *c); +void tcp_sock_iov_init(const struct ctx *c); void tcp_flags_flush(const struct ctx *c); void tcp_payload_flush(const struct ctx *c); int tcp_buf_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn); From 988a4d75f89473cbf76e09852a03f21658859710 Mon Sep 17 00:00:00 2001 From: Stefano Brivio <sbrivio@redhat.com> Date: Thu, 24 Oct 2024 23:10:09 +0200 Subject: [PATCH 068/382] Makefile: Exclude qrap.c from clang-tidy checks We'll deprecate qrap(1) soon, and warnings reported by clang-tidy as of LLVM versions 16 and later would need a bunch of changes there to be addressed, mostly around CERT C rule ERR33-C and checking return code from snprintf(). It makes no sense to fix warnings in qrap just for the sake of it, so officially declare the bitrotting season open. 
Signed-off-by: Stefano Brivio <sbrivio@redhat.com> Reviewed-by: David Gibson <david@gibson.dropbear.id.au> --- Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index 4c2d020..01f0cc1 100644 --- a/Makefile +++ b/Makefile @@ -256,7 +256,7 @@ docs: README.md # weird for cases like standalone constants, and causes other # awkwardness for a bunch of cases we use -clang-tidy: $(SRCS) $(HEADERS) +clang-tidy: $(filter-out qrap.c,$(SRCS)) $(HEADERS) clang-tidy -checks=*,-modernize-*,\ -clang-analyzer-valist.Uninitialized,\ -cppcoreguidelines-init-variables,\ @@ -283,7 +283,7 @@ clang-tidy: $(SRCS) $(HEADERS) -misc-include-cleaner,\ -cppcoreguidelines-macro-to-enum \ -config='{CheckOptions: [{key: bugprone-suspicious-string-compare.WarnOnImplicitComparison, value: "false"}]}' \ - --warnings-as-errors=* $(SRCS) -- $(filter-out -pie,$(FLAGS) $(CFLAGS) $(CPPFLAGS)) -DCLANG_TIDY_58992 + --warnings-as-errors=* $(filter-out qrap.c,$(SRCS)) -- $(filter-out -pie,$(FLAGS) $(CFLAGS) $(CPPFLAGS)) -DCLANG_TIDY_58992 SYSTEM_INCLUDES := /usr/include $(wildcard /usr/include/$(TARGET)) ifeq ($(shell $(CC) -v 2>&1 | grep -c "gcc version"),1) From 98efe7c2fdd82a2822e1be8e5c5c8caed846ae76 Mon Sep 17 00:00:00 2001 From: Stefano Brivio <sbrivio@redhat.com> Date: Thu, 24 Oct 2024 23:25:33 +0200 Subject: [PATCH 069/382] treewide: Comply with CERT C rule ERR33-C for snprintf() clang-tidy, starting from LLVM version 16, up to at least LLVM version 19, now checks that we detect and handle errors for snprintf() as requested by CERT C rule ERR33-C. 
These warnings were logged with LLVM version 19.1.2 (at least Debian and Fedora match): /home/sbrivio/passt/arch.c:43:3: error: the value returned by this function should not be disregarded; neglecting it may lead to errors [cert-err33-c,-warnings-as-errors] 43 | snprintf(new_path, PATH_MAX + sizeof(".avx2"), "%s.avx2", exe); | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ /home/sbrivio/passt/arch.c:43:3: note: cast the expression to void to silence this warning /home/sbrivio/passt/conf.c:577:4: error: the value returned by this function should not be disregarded; neglecting it may lead to errors [cert-err33-c,-warnings-as-errors] 577 | snprintf(netns, PATH_MAX, "/proc/%ld/ns/net", pidval); | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ /home/sbrivio/passt/conf.c:577:4: note: cast the expression to void to silence this warning /home/sbrivio/passt/conf.c:579:5: error: the value returned by this function should not be disregarded; neglecting it may lead to errors [cert-err33-c,-warnings-as-errors] 579 | snprintf(userns, PATH_MAX, "/proc/%ld/ns/user", | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 580 | pidval); | ~~~~~~~ /home/sbrivio/passt/conf.c:579:5: note: cast the expression to void to silence this warning /home/sbrivio/passt/pasta.c:105:2: error: the value returned by this function should not be disregarded; neglecting it may lead to errors [cert-err33-c,-warnings-as-errors] 105 | snprintf(ns, PATH_MAX, "/proc/%i/ns/net", pasta_child_pid); | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ /home/sbrivio/passt/pasta.c:105:2: note: cast the expression to void to silence this warning /home/sbrivio/passt/pasta.c:242:2: error: the value returned by this function should not be disregarded; neglecting it may lead to errors [cert-err33-c,-warnings-as-errors] 242 | snprintf(uidmap, BUFSIZ, "0 %u 1", uid); | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ /home/sbrivio/passt/pasta.c:242:2: note: cast the expression to void to silence this 
warning /home/sbrivio/passt/pasta.c:243:2: error: the value returned by this function should not be disregarded; neglecting it may lead to errors [cert-err33-c,-warnings-as-errors] 243 | snprintf(gidmap, BUFSIZ, "0 %u 1", gid); | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ /home/sbrivio/passt/pasta.c:243:2: note: cast the expression to void to silence this warning /home/sbrivio/passt/tap.c:1155:4: error: the value returned by this function should not be disregarded; neglecting it may lead to errors [cert-err33-c,-warnings-as-errors] 1155 | snprintf(path, UNIX_PATH_MAX - 1, UNIX_SOCK_PATH, i); | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ /home/sbrivio/passt/tap.c:1155:4: note: cast the expression to void to silence this warning Don't silence the warnings as they might actually have some merit. Add an snprintf_check() function, instead, checking that we're not truncating messages while printing to buffers, and terminate if the check fails. Signed-off-by: Stefano Brivio <sbrivio@redhat.com> Reviewed-by: David Gibson <david@gibson.dropbear.id.au> --- arch.c | 6 +++++- conf.c | 13 +++++++++---- pasta.c | 11 ++++++++--- tap.c | 5 +++-- util.c | 30 ++++++++++++++++++++++++++++++ util.h | 2 ++ 6 files changed, 57 insertions(+), 10 deletions(-) diff --git a/arch.c b/arch.c index 04bebfc..d1dfb73 100644 --- a/arch.c +++ b/arch.c @@ -19,6 +19,7 @@ #include <unistd.h> #include "log.h" +#include "util.h" /** * arch_avx2_exec() - Switch to AVX2 build if supported @@ -40,7 +41,10 @@ void arch_avx2_exec(char **argv) if (__builtin_cpu_supports("avx2")) { char new_path[PATH_MAX + sizeof(".avx2")]; - snprintf(new_path, PATH_MAX + sizeof(".avx2"), "%s.avx2", exe); + if (snprintf_check(new_path, PATH_MAX + sizeof(".avx2"), + "%s.avx2", exe)) + die_perror("Can't build AVX2 executable path"); + execve(new_path, argv, environ); warn_perror("Can't run AVX2 build, using non-AVX2 version"); } diff --git a/conf.c b/conf.c index b3b5342..fa5cec3 100644 --- a/conf.c +++ b/conf.c @@ 
-574,10 +574,15 @@ static void conf_pasta_ns(int *netns_only, char *userns, char *netns, if (pidval < 0 || pidval > INT_MAX) die("Invalid PID %s", argv[optind]); - snprintf(netns, PATH_MAX, "/proc/%ld/ns/net", pidval); - if (!*userns) - snprintf(userns, PATH_MAX, "/proc/%ld/ns/user", - pidval); + if (snprintf_check(netns, PATH_MAX, + "/proc/%ld/ns/net", pidval)) + die_perror("Can't build netns path"); + + if (!*userns) { + if (snprintf_check(userns, PATH_MAX, + "/proc/%ld/ns/user", pidval)) + die_perror("Can't build userns path"); + } } } diff --git a/pasta.c b/pasta.c index 307fb4a..a117704 100644 --- a/pasta.c +++ b/pasta.c @@ -102,7 +102,9 @@ static int pasta_wait_for_ns(void *arg) int flags = O_RDONLY | O_CLOEXEC; char ns[PATH_MAX]; - snprintf(ns, PATH_MAX, "/proc/%i/ns/net", pasta_child_pid); + if (snprintf_check(ns, PATH_MAX, "/proc/%i/ns/net", pasta_child_pid)) + die_perror("Can't build netns path"); + do { while ((c->pasta_netns_fd = open(ns, flags)) < 0) { if (errno != ENOENT) @@ -239,8 +241,11 @@ void pasta_start_ns(struct ctx *c, uid_t uid, gid_t gid, c->quiet = 1; /* Configure user and group mappings */ - snprintf(uidmap, BUFSIZ, "0 %u 1", uid); - snprintf(gidmap, BUFSIZ, "0 %u 1", gid); + if (snprintf_check(uidmap, BUFSIZ, "0 %u 1", uid)) + die_perror("Can't build uidmap"); + + if (snprintf_check(gidmap, BUFSIZ, "0 %u 1", gid)) + die_perror("Can't build gidmap"); if (write_file("/proc/self/uid_map", uidmap) || write_file("/proc/self/setgroups", "deny") || diff --git a/tap.c b/tap.c index c53a39b..cfb82e9 100644 --- a/tap.c +++ b/tap.c @@ -1151,8 +1151,9 @@ int tap_sock_unix_open(char *sock_path) if (*sock_path) memcpy(path, sock_path, UNIX_PATH_MAX); - else - snprintf(path, UNIX_PATH_MAX - 1, UNIX_SOCK_PATH, i); + else if (snprintf_check(path, UNIX_PATH_MAX - 1, + UNIX_SOCK_PATH, i)) + die_perror("Can't build UNIX domain socket path"); ex = socket(AF_UNIX, SOCK_STREAM | SOCK_NONBLOCK, 0); if (ex < 0) diff --git a/util.c b/util.c index eba7d52..21ce0a8 
100644 --- a/util.c +++ b/util.c @@ -749,3 +749,33 @@ void close_open_files(int argc, char **argv) if (rc) die_perror("Failed to close files leaked by parent"); } + +/** + * snprintf_check() - snprintf() wrapper, checking for truncation and errors + * @str: Output buffer + * @size: Maximum size to write to @str + * @format: Message + * + * Return: false on success, true on truncation or error, sets errno on failure + */ +bool snprintf_check(char *str, size_t size, const char *format, ...) +{ + va_list ap; + int rc; + + va_start(ap, format); + rc = vsnprintf(str, size, format, ap); + va_end(ap); + + if (rc < 0) { + errno = EIO; + return true; + } + + if ((size_t)rc >= size) { + errno = ENOBUFS; + return true; + } + + return false; +} diff --git a/util.h b/util.h index 2c1e08e..96f178c 100644 --- a/util.h +++ b/util.h @@ -11,6 +11,7 @@ #include <stdbool.h> #include <stddef.h> #include <stdint.h> +#include <stdio.h> #include <string.h> #include <signal.h> #include <arpa/inet.h> @@ -200,6 +201,7 @@ int write_file(const char *path, const char *buf); int write_all_buf(int fd, const void *buf, size_t len); int write_remainder(int fd, const struct iovec *iov, size_t iovcnt, size_t skip); void close_open_files(int argc, char **argv); +bool snprintf_check(char *str, size_t size, const char *format, ...); /** * af_name() - Return name of an address family From 744247856da10412a64ce0720f0e7359981748e1 Mon Sep 17 00:00:00 2001 From: Stefano Brivio <sbrivio@redhat.com> Date: Thu, 24 Oct 2024 23:44:43 +0200 Subject: [PATCH 070/382] treewide: Silence cert-err33-c clang-tidy warnings for fprintf() We use fprintf() to print to standard output or standard error streams. If something gets truncated or there's an output error, we don't really want to try and report that, and at the same time it's not abnormal behaviour upon which we should terminate, either. Just silence the warning with an ugly FPRINTF() variadic macro casting the fprintf() expressions to void. 
Signed-off-by: Stefano Brivio <sbrivio@redhat.com> Reviewed-by: David Gibson <david@gibson.dropbear.id.au> --- conf.c | 46 +++++++++++++++++++++++----------------------- log.c | 6 +++--- util.h | 3 +++ 3 files changed, 29 insertions(+), 26 deletions(-) diff --git a/conf.c b/conf.c index fa5cec3..4db7c64 100644 --- a/conf.c +++ b/conf.c @@ -733,19 +733,19 @@ static unsigned int conf_ip6(unsigned int ifi, struct ip6_ctx *ip6) static void usage(const char *name, FILE *f, int status) { if (strstr(name, "pasta")) { - fprintf(f, "Usage: %s [OPTION]... [COMMAND] [ARGS]...\n", name); - fprintf(f, " %s [OPTION]... PID\n", name); - fprintf(f, " %s [OPTION]... --netns [PATH|NAME]\n", name); - fprintf(f, + FPRINTF(f, "Usage: %s [OPTION]... [COMMAND] [ARGS]...\n", name); + FPRINTF(f, " %s [OPTION]... PID\n", name); + FPRINTF(f, " %s [OPTION]... --netns [PATH|NAME]\n", name); + FPRINTF(f, "\n" "Without PID or --netns, run the given command or a\n" "default shell in a new network and user namespace, and\n" "connect it via pasta.\n"); } else { - fprintf(f, "Usage: %s [OPTION]...\n", name); + FPRINTF(f, "Usage: %s [OPTION]...\n", name); } - fprintf(f, + FPRINTF(f, "\n" " -d, --debug Be verbose\n" " --trace Be extra verbose, implies --debug\n" @@ -762,17 +762,17 @@ static void usage(const char *name, FILE *f, int status) " --version Show version and exit\n"); if (strstr(name, "pasta")) { - fprintf(f, + FPRINTF(f, " -I, --ns-ifname NAME namespace interface name\n" " default: same interface name as external one\n"); } else { - fprintf(f, + FPRINTF(f, " -s, --socket PATH UNIX domain socket path\n" " default: probe free path starting from " UNIX_SOCK_PATH "\n", 1); } - fprintf(f, + FPRINTF(f, " -F, --fd FD Use FD as pre-opened connected socket\n" " -p, --pcap FILE Log tap-facing traffic to pcap file\n" " -P, --pid FILE Write own PID to the given file\n" @@ -803,28 +803,28 @@ static void usage(const char *name, FILE *f, int status) " can be specified multiple times\n" " a single, empty 
option disables DNS information\n"); if (strstr(name, "pasta")) - fprintf(f, " default: don't use any addresses\n"); + FPRINTF(f, " default: don't use any addresses\n"); else - fprintf(f, " default: use addresses from /etc/resolv.conf\n"); - fprintf(f, + FPRINTF(f, " default: use addresses from /etc/resolv.conf\n"); + FPRINTF(f, " -S, --search LIST Space-separated list, search domains\n" " a single, empty option disables the DNS search list\n"); if (strstr(name, "pasta")) - fprintf(f, " default: don't use any search list\n"); + FPRINTF(f, " default: don't use any search list\n"); else - fprintf(f, " default: use search list from /etc/resolv.conf\n"); + FPRINTF(f, " default: use search list from /etc/resolv.conf\n"); if (strstr(name, "pasta")) - fprintf(f, " --dhcp-dns \tPass DNS list via DHCP/DHCPv6/NDP\n"); + FPRINTF(f, " --dhcp-dns \tPass DNS list via DHCP/DHCPv6/NDP\n"); else - fprintf(f, " --no-dhcp-dns No DNS list in DHCP/DHCPv6/NDP\n"); + FPRINTF(f, " --no-dhcp-dns No DNS list in DHCP/DHCPv6/NDP\n"); if (strstr(name, "pasta")) - fprintf(f, " --dhcp-search Pass list via DHCP/DHCPv6/NDP\n"); + FPRINTF(f, " --dhcp-search Pass list via DHCP/DHCPv6/NDP\n"); else - fprintf(f, " --no-dhcp-search No list in DHCP/DHCPv6/NDP\n"); + FPRINTF(f, " --no-dhcp-search No list in DHCP/DHCPv6/NDP\n"); - fprintf(f, + FPRINTF(f, " --map-host-loopback ADDR Translate ADDR to refer to host\n" " can be specified zero to two times (for IPv4 and IPv6)\n" " default: gateway address\n" @@ -852,7 +852,7 @@ static void usage(const char *name, FILE *f, int status) if (strstr(name, "pasta")) goto pasta_opts; - fprintf(f, + FPRINTF(f, " -1, --one-off Quit after handling one single client\n" " -t, --tcp-ports SPEC TCP port forwarding to guest\n" " can be specified multiple times\n" @@ -883,7 +883,7 @@ static void usage(const char *name, FILE *f, int status) pasta_opts: - fprintf(f, + FPRINTF(f, " -t, --tcp-ports SPEC TCP port forwarding to namespace\n" " can be specified multiple times\n" " 
SPEC can be:\n" @@ -1421,9 +1421,9 @@ void conf(struct ctx *c, int argc, char **argv) break; case 14: - fprintf(stdout, + FPRINTF(stdout, c->mode == MODE_PASTA ? "pasta " : "passt "); - fprintf(stdout, VERSION_BLOB); + FPRINTF(stdout, VERSION_BLOB); exit(EXIT_SUCCESS); case 15: ret = snprintf(c->ip4.ifname_out, diff --git a/log.c b/log.c index a61468e..6932885 100644 --- a/log.c +++ b/log.c @@ -274,7 +274,7 @@ void vlogmsg(bool newline, bool cont, int pri, const char *format, va_list ap) char timestr[LOGTIME_STRLEN]; logtime_fmt(timestr, sizeof(timestr), now); - fprintf(stderr, "%s: ", timestr); + FPRINTF(stderr, "%s: ", timestr); } if ((log_mask & LOG_MASK(LOG_PRI(pri))) || !log_conf_parsed) { @@ -293,7 +293,7 @@ void vlogmsg(bool newline, bool cont, int pri, const char *format, va_list ap) (log_stderr && (log_mask & LOG_MASK(LOG_PRI(pri))))) { (void)vfprintf(stderr, format, ap); if (newline && format[strlen(format)] != '\n') - fprintf(stderr, "\n"); + FPRINTF(stderr, "\n"); } } @@ -399,7 +399,7 @@ void passt_vsyslog(bool newline, int pri, const char *format, va_list ap) n += snprintf(buf + n, BUFSIZ - n, "\n"); if (log_sock >= 0 && send(log_sock, buf, n, 0) != n && log_stderr) - fprintf(stderr, "Failed to send %i bytes to syslog\n", n); + FPRINTF(stderr, "Failed to send %i bytes to syslog\n", n); } /** diff --git a/util.h b/util.h index 96f178c..4f8b768 100644 --- a/util.h +++ b/util.h @@ -269,6 +269,9 @@ static inline bool mod_between(unsigned x, unsigned i, unsigned j, unsigned m) return mod_sub(x, i, m) < mod_sub(j, i, m); } +/* FPRINTF() intentionally silences cert-err33-c clang-tidy warnings */ +#define FPRINTF(f, ...) 
(void)fprintf(f, __VA_ARGS__) + /* * Workarounds for https://github.com/llvm/llvm-project/issues/58992 * From 134b4d58b409013d9f231aac1d4ba69f7835da7c Mon Sep 17 00:00:00 2001 From: Stefano Brivio <sbrivio@redhat.com> Date: Thu, 24 Oct 2024 23:52:19 +0200 Subject: [PATCH 071/382] Makefile: Disable readability-math-missing-parentheses clang-tidy check With clang-tidy and LLVM 19: /home/sbrivio/passt/conf.c:1218:29: error: '*' has higher precedence than '+'; add parentheses to explicitly specify the order of operations [readability-math-missing-parentheses,-warnings-as-errors] 1218 | const char *octet = str + 3 * i; | ^~~~~~ | ( ) /home/sbrivio/passt/ndp.c:285:18: error: '*' has higher precedence than '+'; add parentheses to explicitly specify the order of operations [readability-math-missing-parentheses,-warnings-as-errors] 285 | .len = 1 + 2 * n, | ^~~~~~ | ( ) /home/sbrivio/passt/ndp.c:329:23: error: '%' has higher precedence than '-'; add parentheses to explicitly specify the order of operations [readability-math-missing-parentheses,-warnings-as-errors] 329 | memset(ptr, 0, 8 - dns_s_len % 8); /* padding */ | ^~~~~~~~~~~~~~ | ( ) /home/sbrivio/passt/pcap.c:131:20: error: '*' has higher precedence than '+'; add parentheses to explicitly specify the order of operations [readability-math-missing-parentheses,-warnings-as-errors] 131 | pcap_frame(iov + i * frame_parts, frame_parts, offset, &now); | ^~~~~~~~~~~~~~~~ | ( ) /home/sbrivio/passt/util.c:216:10: error: '/' has higher precedence than '+'; add parentheses to explicitly specify the order of operations [readability-math-missing-parentheses,-warnings-as-errors] 216 | return (a->tv_nsec + 1000000000 - b->tv_nsec) / 1000 + | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | ( ) /home/sbrivio/passt/util.c:217:10: error: '*' has higher precedence than '+'; add parentheses to explicitly specify the order of operations [readability-math-missing-parentheses,-warnings-as-errors] 217 | (a->tv_sec - b->tv_sec - 1) * 1000000; 
| ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | ( ) /home/sbrivio/passt/util.c:220:9: error: '/' has higher precedence than '+'; add parentheses to explicitly specify the order of operations [readability-math-missing-parentheses,-warnings-as-errors] 220 | return (a->tv_nsec - b->tv_nsec) / 1000 + | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | ( ) /home/sbrivio/passt/util.c:221:9: error: '*' has higher precedence than '+'; add parentheses to explicitly specify the order of operations [readability-math-missing-parentheses,-warnings-as-errors] 221 | (a->tv_sec - b->tv_sec) * 1000000; | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | ( ) /home/sbrivio/passt/util.c:545:32: error: '/' has higher precedence than '+'; add parentheses to explicitly specify the order of operations [readability-math-missing-parentheses,-warnings-as-errors] 545 | return clone(fn, stack_area + stack_size / 2, flags, arg); | ^~~~~~~~~~~~~~~ | ( ) Just... no. Signed-off-by: Stefano Brivio <sbrivio@redhat.com> Reviewed-by: David Gibson <david@gibson.dropbear.id.au> --- Makefile | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 01f0cc1..c1c6e30 100644 --- a/Makefile +++ b/Makefile @@ -255,6 +255,12 @@ docs: README.md # makes sense when those defines form an enum-like set, but # weird for cases like standalone constants, and causes other # awkwardness for a bunch of cases we use +# +# - readability-math-missing-parentheses +# It's been a couple of centuries since multiplication has been granted +# precedence over addition in modern mathematical notation. Adding +# parentheses to reinforce that certainly won't improve readability. 
+ clang-tidy: $(filter-out qrap.c,$(SRCS)) $(HEADERS) clang-tidy -checks=*,-modernize-*,\ @@ -281,7 +287,8 @@ clang-tidy: $(filter-out qrap.c,$(SRCS)) $(HEADERS) -concurrency-mt-unsafe,\ -readability-identifier-length,\ -misc-include-cleaner,\ - -cppcoreguidelines-macro-to-enum \ + -cppcoreguidelines-macro-to-enum,\ + -readability-math-missing-parentheses \ -config='{CheckOptions: [{key: bugprone-suspicious-string-compare.WarnOnImplicitComparison, value: "false"}]}' \ --warnings-as-errors=* $(filter-out qrap.c,$(SRCS)) -- $(filter-out -pie,$(FLAGS) $(CFLAGS) $(CPPFLAGS)) -DCLANG_TIDY_58992 From 59fe34ee36368bb28c8298b1a1bfad5d0d9f47a3 Mon Sep 17 00:00:00 2001 From: Stefano Brivio <sbrivio@redhat.com> Date: Fri, 25 Oct 2024 00:10:36 +0200 Subject: [PATCH 072/382] treewide: Suppress clang-tidy warning if we already use O_CLOEXEC In pcap_init(), we should always open the packet capture file with O_CLOEXEC, even if we're not running in foreground: O_CLOEXEC means close-on-exec, not close-on-fork. In logfile_init() and pidfile_open(), the fact that we pass a third 'mode' argument to open() seems to confuse the android-cloexec-open checker in LLVM versions from 16 to 19 (at least). The checker is suggesting to add O_CLOEXEC to 'mode', and not in 'flags', where we already have it. Add a suppression for clang-tidy and a comment, and avoid repeating those three times by adding a new helper, output_file_open(). 
Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- conf.c | 6 +++++- log.c | 3 +-- pcap.c | 7 ++----- util.c | 27 +++++++++++---------------- util.h | 2 +- 5 files changed, 20 insertions(+), 25 deletions(-) diff --git a/conf.c b/conf.c index 4db7c64..14411b4 100644 --- a/conf.c +++ b/conf.c @@ -1194,7 +1194,11 @@ static void conf_open_files(struct ctx *c) if (c->mode != MODE_PASTA && c->fd_tap == -1) c->fd_tap_listen = tap_sock_unix_open(c->sock_path); - c->pidfile_fd = pidfile_open(c->pidfile); + if (*c->pidfile) { + c->pidfile_fd = output_file_open(c->pidfile, O_WRONLY); + if (c->pidfile_fd < 0) + die_perror("Couldn't open PID file %s", c->pidfile); + } } /** diff --git a/log.c b/log.c index 6932885..19f1d98 100644 --- a/log.c +++ b/log.c @@ -416,8 +416,7 @@ void logfile_init(const char *name, const char *path, size_t size) if (readlink("/proc/self/exe", exe, PATH_MAX - 1) < 0) die_perror("Failed to read own /proc/self/exe link"); - log_file = open(path, O_CREAT | O_TRUNC | O_APPEND | O_RDWR | O_CLOEXEC, - S_IRUSR | S_IWUSR); + log_file = output_file_open(path, O_APPEND | O_RDWR); if (log_file == -1) die_perror("Couldn't open log file %s", path); diff --git a/pcap.c b/pcap.c index 6ee6cdf..2e2ff93 100644 --- a/pcap.c +++ b/pcap.c @@ -158,18 +158,15 @@ void pcap_iov(const struct iovec *iov, size_t iovcnt, size_t offset) */ void pcap_init(struct ctx *c) { - int flags = O_WRONLY | O_CREAT | O_TRUNC; - if (pcap_fd != -1) return; if (!*c->pcap) return; - flags |= c->foreground ? 
O_CLOEXEC : 0; - pcap_fd = open(c->pcap, flags, S_IRUSR | S_IWUSR); + pcap_fd = output_file_open(c->pcap, O_WRONLY); if (pcap_fd == -1) { - perror("open"); + err_perror("Couldn't open pcap file %s", c->pcap); return; } diff --git a/util.c b/util.c index 21ce0a8..1d6d009 100644 --- a/util.c +++ b/util.c @@ -407,25 +407,20 @@ void pidfile_write(int fd, pid_t pid) } /** - * pidfile_open() - Open PID file if needed - * @path: Path for PID file, empty string if no PID file is requested + * output_file_open() - Open file for output, if needed + * @path: Path for output file + * @flags: Flags for open() other than O_CREAT, O_TRUNC, O_CLOEXEC * - * Return: descriptor for PID file, -1 if path is NULL, won't return on failure + * Return: file descriptor on success, -1 on failure with errno set by open() */ -int pidfile_open(const char *path) +int output_file_open(const char *path, int flags) { - int fd; - - if (!*path) - return -1; - - if ((fd = open(path, O_CREAT | O_TRUNC | O_WRONLY | O_CLOEXEC, - S_IRUSR | S_IWUSR)) < 0) { - perror("PID file open"); - exit(EXIT_FAILURE); - } - - return fd; + /* We use O_CLOEXEC here, but clang-tidy as of LLVM 16 to 19 looks for + * it in the 'mode' argument if we have one + */ + return open(path, O_CREAT | O_TRUNC | O_CLOEXEC | flags, + /* NOLINTNEXTLINE(android-cloexec-open) */ + S_IRUSR | S_IWUSR); } /** diff --git a/util.h b/util.h index 4f8b768..3fc64cf 100644 --- a/util.h +++ b/util.h @@ -193,7 +193,7 @@ char *line_read(char *buf, size_t len, int fd); void ns_enter(const struct ctx *c); bool ns_is_init(void); int open_in_ns(const struct ctx *c, const char *path, int flags); -int pidfile_open(const char *path); +int output_file_open(const char *path, int flags); void pidfile_write(int fd, pid_t pid); int __daemon(int pidfile_fd, int devnull_fd); int fls(unsigned long x); From 099ace64cedbf43922527dc7f132f0c0e65f308a Mon Sep 17 00:00:00 2001 From: Stefano Brivio <sbrivio@redhat.com> Date: Fri, 25 Oct 2024 00:29:50 +0200 Subject: [PATCH 
073/382] treewide: Address cert-err33-c clang-tidy warnings for clock and timer functions For clock_gettime(), we shouldn't ignore errors if they happen at initialisation phase, because something is seriously wrong and it's not helpful if we proceed as if nothing happened. As we're up and running, though, it's probably better to report the error and use a stale value than to terminate altogether. Make sure we use a zero value if we don't have a stale one somewhere. For timerfd_gettime() and timerfd_settime() failures, just report an error, there isn't much else we can do. Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- passt.c | 9 ++++++--- pcap.c | 17 +++++++++++------ tcp.c | 12 +++++++++--- 3 files changed, 26 insertions(+), 12 deletions(-) diff --git a/passt.c b/passt.c index ad6f0bc..eaf231d 100644 --- a/passt.c +++ b/passt.c @@ -207,7 +207,8 @@ int main(int argc, char **argv) struct timespec now; struct sigaction sa; - clock_gettime(CLOCK_MONOTONIC, &log_start); + if (clock_gettime(CLOCK_MONOTONIC, &log_start)) + die_perror("Failed to get CLOCK_MONOTONIC time"); arch_avx2_exec(argv); @@ -265,7 +266,8 @@ int main(int argc, char **argv) secret_init(&c); - clock_gettime(CLOCK_MONOTONIC, &now); + if (clock_gettime(CLOCK_MONOTONIC, &now)) + die_perror("Failed to get CLOCK_MONOTONIC time"); flow_init(); @@ -313,7 +315,8 @@ loop: if (nfds == -1 && errno != EINTR) die_perror("epoll_wait() failed in main loop"); - clock_gettime(CLOCK_MONOTONIC, &now); + if (clock_gettime(CLOCK_MONOTONIC, &now)) + err_perror("Failed to get CLOCK_MONOTONIC time"); for (i = 0; i < nfds; i++) { union epoll_ref ref = *((union epoll_ref *)&events[i].data.u64); diff --git a/pcap.c b/pcap.c index 2e2ff93..23205dd 100644 --- a/pcap.c +++ b/pcap.c @@ -100,12 +100,14 @@ static void pcap_frame(const struct iovec *iov, size_t iovcnt, void pcap(const char *pkt, size_t l2len) { struct iovec iov = { (char *)pkt, l2len }; - struct timespec now; + struct timespec now = { 0 }; if (pcap_fd == -1) 
return; - clock_gettime(CLOCK_REALTIME, &now); + if (clock_gettime(CLOCK_REALTIME, &now)) + err_perror("Failed to get CLOCK_REALTIME time"); + pcap_frame(&iov, 1, 0, &now); } @@ -119,13 +121,14 @@ void pcap(const char *pkt, size_t l2len) void pcap_multiple(const struct iovec *iov, size_t frame_parts, unsigned int n, size_t offset) { - struct timespec now; + struct timespec now = { 0 }; unsigned int i; if (pcap_fd == -1) return; - clock_gettime(CLOCK_REALTIME, &now); + if (clock_gettime(CLOCK_REALTIME, &now)) + err_perror("Failed to get CLOCK_REALTIME time"); for (i = 0; i < n; i++) pcap_frame(iov + i * frame_parts, frame_parts, offset, &now); @@ -143,12 +146,14 @@ void pcap_multiple(const struct iovec *iov, size_t frame_parts, unsigned int n, /* cppcheck-suppress unusedFunction */ void pcap_iov(const struct iovec *iov, size_t iovcnt, size_t offset) { - struct timespec now; + struct timespec now = { 0 }; if (pcap_fd == -1) return; - clock_gettime(CLOCK_REALTIME, &now); + if (clock_gettime(CLOCK_REALTIME, &now)) + err_perror("Failed to get CLOCK_REALTIME time"); + pcap_frame(iov, iovcnt, offset, &now); } diff --git a/tcp.c b/tcp.c index 10ad06a..4e0a17e 100644 --- a/tcp.c +++ b/tcp.c @@ -549,7 +549,8 @@ static void tcp_timer_ctl(const struct ctx *c, struct tcp_tap_conn *conn) (unsigned long long)it.it_value.tv_sec, (unsigned long long)it.it_value.tv_nsec / 1000 / 1000); - timerfd_settime(conn->timer, 0, &it, NULL); + if (timerfd_settime(conn->timer, 0, &it, NULL)) + flow_err(conn, "failed to set timer: %s", strerror(errno)); } /** @@ -2235,7 +2236,9 @@ void tcp_timer_handler(const struct ctx *c, union epoll_ref ref) * timer is currently armed, this event came from a previous setting, * and we just set the timer to a new point in the future: discard it. 
*/ - timerfd_gettime(conn->timer, &check_armed); + if (timerfd_gettime(conn->timer, &check_armed)) + flow_err(conn, "failed to read timer: %s", strerror(errno)); + if (check_armed.it_value.tv_sec || check_armed.it_value.tv_nsec) return; @@ -2273,7 +2276,10 @@ void tcp_timer_handler(const struct ctx *c, union epoll_ref ref) * case. This avoids having to preemptively reset the timer on * ~ACK_TO_TAP_DUE or ~ACK_FROM_TAP_DUE. */ - timerfd_settime(conn->timer, 0, &new, &old); + if (timerfd_settime(conn->timer, 0, &new, &old)) + flow_err(conn, "failed to set timer: %s", + strerror(errno)); + if (old.it_value.tv_sec == ACT_TIMEOUT) { flow_dbg(conn, "activity timeout"); tcp_rst(c, conn); From b1a607fba11b3325117b76ffb41cc6edff774abf Mon Sep 17 00:00:00 2001 From: Stefano Brivio <sbrivio@redhat.com> Date: Fri, 25 Oct 2024 00:48:10 +0200 Subject: [PATCH 074/382] udp: Take care of cert-int09-c clang-tidy warning for enum udp_iov_idx /home/sbrivio/passt/udp.c:171:1: error: inital values in enum 'udp_iov_idx' are not consistent, consider explicit initialization of all, none or only the first enumerator [cert-int09-c,readability-enum-initial-value,-warnings-as-errors] 171 | enum udp_iov_idx { | ^ 172 | UDP_IOV_TAP = 0, 173 | UDP_IOV_ETH = 1, 174 | UDP_IOV_IP = 2, 175 | UDP_IOV_PAYLOAD = 3, 176 | UDP_NUM_IOVS | | = 4 Don't initialise any value, so that it's obvious that constants map to unique values. 
Signed-off-by: Stefano Brivio <sbrivio@redhat.com> Reviewed-by: David Gibson <david@gibson.dropbear.id.au> --- udp.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/udp.c b/udp.c index 100610f..0c01067 100644 --- a/udp.c +++ b/udp.c @@ -169,11 +169,11 @@ udp_meta[UDP_MAX_FRAMES]; * @UDP_NUM_IOVS the number of entries in the iovec array */ enum udp_iov_idx { - UDP_IOV_TAP = 0, - UDP_IOV_ETH = 1, - UDP_IOV_IP = 2, - UDP_IOV_PAYLOAD = 3, - UDP_NUM_IOVS + UDP_IOV_TAP, + UDP_IOV_ETH, + UDP_IOV_IP, + UDP_IOV_PAYLOAD, + UDP_NUM_IOVS, }; /* IOVs and msghdr arrays for receiving datagrams from sockets */ From ee7d0b62a716201abc818eb0d1df4c6bb1051336 Mon Sep 17 00:00:00 2001 From: Stefano Brivio <sbrivio@redhat.com> Date: Fri, 25 Oct 2024 00:57:58 +0200 Subject: [PATCH 075/382] util: Don't use errno after a successful call in __daemon() I thought we could just set errno to 0, do a bunch of stuff, and check that errno didn't change to infer we succeeded. But clang-tidy, starting with LLVM 19, reports: /home/sbrivio/passt/util.c:465:6: error: An undefined value may be read from 'errno' [clang-analyzer-unix.Errno,-warnings-as-errors] 465 | if (errno) | ^ /usr/include/errno.h:38:16: note: expanded from macro 'errno' 38 | # define errno (*__errno_location ()) | ^~~~~~~~~~~~~~~~~~~~~~ /home/sbrivio/passt/util.c:446:6: note: Assuming the condition is false 446 | if (pid == -1) { | ^~~~~~~~~ /home/sbrivio/passt/util.c:446:2: note: Taking false branch 446 | if (pid == -1) { | ^ /home/sbrivio/passt/util.c:451:6: note: Assuming 'pid' is 0 451 | if (pid) { | ^~~ /home/sbrivio/passt/util.c:451:2: note: Taking false branch 451 | if (pid) { | ^ /home/sbrivio/passt/util.c:463:2: note: Assuming that 'close' is successful; 'errno' becomes undefined after the call 463 | close(devnull_fd); | ^~~~~~~~~~~~~~~~~ /home/sbrivio/passt/util.c:465:6: note: An undefined value may be read from 'errno' 465 | if (errno) | ^ /usr/include/errno.h:38:16: note: expanded from macro 
'errno' 38 | # define errno (*__errno_location ()) | ^~~~~~~~~~~~~~~~~~~~~~ And the LLVM documentation for the unix.Errno checker, 1.1.8.3 unix.Errno (C), mentions, at: https://clang.llvm.org/docs/analyzer/checkers.html#unix-errno that: The C and POSIX standards often do not define if a standard library function may change value of errno if the call does not fail. Therefore, errno should only be used if it is known from the return value of a function that the call has failed. which is, somewhat surprisingly, the case for close(). Instead of using errno, check the actual return values of the calls we issue here. Signed-off-by: Stefano Brivio <sbrivio@redhat.com> Reviewed-by: David Gibson <david@gibson.dropbear.id.au> --- util.c | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/util.c b/util.c index 1d6d009..dddef93 100644 --- a/util.c +++ b/util.c @@ -444,16 +444,11 @@ int __daemon(int pidfile_fd, int devnull_fd) exit(EXIT_SUCCESS); } - errno = 0; - - setsid(); - - dup2(devnull_fd, STDIN_FILENO); - dup2(devnull_fd, STDOUT_FILENO); - dup2(devnull_fd, STDERR_FILENO); - close(devnull_fd); - - if (errno) + if (setsid() < 0 || + dup2(devnull_fd, STDIN_FILENO) < 0 || + dup2(devnull_fd, STDOUT_FILENO) < 0 || + dup2(devnull_fd, STDERR_FILENO) < 0 || + close(devnull_fd)) exit(EXIT_FAILURE); return 0; From d165d36a0c88fe8665da012f35cb60ced991568b Mon Sep 17 00:00:00 2001 From: Stefano Brivio <sbrivio@redhat.com> Date: Wed, 30 Oct 2024 21:31:05 +0100 Subject: [PATCH 076/382] tcp: Fix build against musl, __sum16 comes from linux/types.h Use a plain uint16_t instead and avoid including one extra header: the 'bitwise' attribute of __sum16 is just used by sparse(1). 
Reported-by: omni <omni+alpine@hack.org> Fixes: 3d484aa37090 ("tcp: Update TCP checksum using an iovec array") Signed-off-by: Stefano Brivio <sbrivio@redhat.com> Reviewed-by: David Gibson <david@gibson.dropbear.id.au> --- tcp.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tcp.c b/tcp.c index 4e0a17e..56ceba6 100644 --- a/tcp.c +++ b/tcp.c @@ -766,7 +766,7 @@ static void tcp_update_check_tcp4(const struct iphdr *iph, struct in_addr saddr = { .s_addr = iph->saddr }; struct in_addr daddr = { .s_addr = iph->daddr }; size_t check_ofs; - __sum16 *check; + uint16_t *check; int check_idx; uint32_t sum; char *ptr; @@ -797,7 +797,7 @@ static void tcp_update_check_tcp4(const struct iphdr *iph, return; } - check = (__sum16 *)ptr; + check = (uint16_t *)ptr; *check = 0; *check = csum_iov(iov, iov_cnt, l4offset, sum); @@ -816,7 +816,7 @@ static void tcp_update_check_tcp6(const struct ipv6hdr *ip6h, { uint16_t l4len = ntohs(ip6h->payload_len); size_t check_ofs; - __sum16 *check; + uint16_t *check; int check_idx; uint32_t sum; char *ptr; @@ -848,7 +848,7 @@ static void tcp_update_check_tcp6(const struct ipv6hdr *ip6h, return; } - check = (__sum16 *)ptr; + check = (uint16_t *)ptr; *check = 0; *check = csum_iov(iov, iov_cnt, l4offset, sum); From 9afce0b45c396e43a5499f227cc21849812a435b Mon Sep 17 00:00:00 2001 From: Stefano Brivio <sbrivio@redhat.com> Date: Wed, 30 Oct 2024 21:36:18 +0100 Subject: [PATCH 077/382] tap: Explicitly cast TUNSETIFF to fix build warning with musl on ppc64le On ppc64le, TUNSETIFF happens to be 2147767498, which is bigger than INT_MAX (2^31 - 1), and musl declares the second argument of ioctl() as 'int', not 'unsigned long' like glibc does, probably because of how POSIX specifies the equivalent argument, int dcmd, in posix_devctl(), so gcc reports a warning: tap.c: In function 'tap_ns_tun': tap.c:1291:24: warning: overflow in conversion from 'long unsigned int' to 'int' changes value from '2147767498' to '-2147199798' 
[-Woverflow] 1291 | rc = ioctl(fd, TUNSETIFF, &ifr); | ^~~~~~~~~ We don't care about that overflow, so explicitly cast TUNSETIFF to int. Signed-off-by: Stefano Brivio <sbrivio@redhat.com> Reviewed-by: David Gibson <david@gibson.dropbear.id.au> --- tap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tap.c b/tap.c index cfb82e9..f638f2c 100644 --- a/tap.c +++ b/tap.c @@ -1288,7 +1288,7 @@ static int tap_ns_tun(void *arg) if (fd < 0) die_perror("Failed to open() /dev/net/tun"); - rc = ioctl(fd, TUNSETIFF, &ifr); + rc = ioctl(fd, (int)TUNSETIFF, &ifr); if (rc < 0) die_perror("TUNSETIFF ioctl on /dev/net/tun failed"); From 5e93bcd8bff7ea373d7befa1cf9761c6fff994b2 Mon Sep 17 00:00:00 2001 From: David Gibson <david@gibson.dropbear.id.au> Date: Tue, 5 Nov 2024 12:44:04 +1100 Subject: [PATCH 078/382] test: Adjust misplaced sleeps in two_guests code Most of our transfer tests using socat use 'sleep' waiting for the server side to be ready before starting the client. However in two_guests/basic the sleep is in the wrong place: rather than being between starting the server and starting the client, it's after waiting for the server to complete. This causes occasional hangs when the client runs before the server is ready - in that case the receiving guest sends an RST, which we don't (currently) propagate back to the sender. 
Signed-off-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- test/two_guests/basic | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/test/two_guests/basic b/test/two_guests/basic index 9ba5efe..e2338ff 100644 --- a/test/two_guests/basic +++ b/test/two_guests/basic @@ -52,33 +52,33 @@ check [ "__ADDR2_6__" = "__HOST_ADDR6__" ] test TCP/IPv4: guest 1 > guest 2 g1out GW1 ip -j -4 route show|jq -rM '.[] | select(.dst == "default").gateway' guest2b socat -u TCP4-LISTEN:10004 OPEN:msg,create,trunc +sleep 1 guest1 echo "Hello_from_guest_1" | socat -u STDIN TCP4:__GW1__:10004 guest2w -sleep 1 g2out MSG2 cat msg check [ "__MSG2__" = "Hello_from_guest_1" ] test TCP/IPv6: guest 2 > guest 1 g2out GW2_6 ip -j -6 route show|jq -rM '.[] | select(.dst == "default").gateway' guest1b socat -u TCP6-LISTEN:10001 OPEN:msg,create,trunc +sleep 1 guest2 echo "Hello_from_guest_2" | socat -u STDIN TCP6:[__GW2_6__%__IFNAME2__]:10001 guest1w -sleep 1 g1out MSG1 cat msg check [ "__MSG1__" = "Hello_from_guest_2" ] test UDP/IPv4: guest 1 > guest 2 guest2b socat -u TCP4-LISTEN:10004 OPEN:msg,create,trunc +sleep 1 guest1 echo "Hello_from_guest_1" | socat -u STDIN TCP4:__GW1__:10004 guest2w -sleep 1 g2out MSG2 cat msg check [ "__MSG2__" = "Hello_from_guest_1" ] test UDP/IPv6: guest 2 > guest 1 guest1b socat -u TCP6-LISTEN:10001 OPEN:msg,create,trunc +sleep 1 guest2 echo "Hello_from_guest_2" | socat -u STDIN TCP6:[__GW2_6__%__IFNAME2__]:10001 guest1w -sleep 1 g1out MSG1 cat msg check [ "__MSG1__" = "Hello_from_guest_2" ] From 8f1b6a0ca68ae1530ac193cc47cd17ae8cbfd45d Mon Sep 17 00:00:00 2001 From: David Gibson <david@gibson.dropbear.id.au> Date: Wed, 6 Nov 2024 10:25:17 +1100 Subject: [PATCH 079/382] clang: Add .clang-format file I've been experimenting with clangd, but its default format style is horrid. 
Since our style is basically that of the Linux kernel, copy the .clang-format from the kernel, minus reference to a bunch of kernel specific macros. Signed-off-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- .clang-format | 126 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 126 insertions(+) create mode 100644 .clang-format diff --git a/.clang-format b/.clang-format new file mode 100644 index 0000000..78f177a --- /dev/null +++ b/.clang-format @@ -0,0 +1,126 @@ +# SPDX-License-Identifier: GPL-2.0 +# +# clang-format configuration file. Intended for clang-format >= 11. +# +# For more information, see: +# +# Documentation/dev-tools/clang-format.rst +# https://clang.llvm.org/docs/ClangFormat.html +# https://clang.llvm.org/docs/ClangFormatStyleOptions.html +# +--- +AccessModifierOffset: -4 +AlignAfterOpenBracket: Align +AlignConsecutiveAssignments: false +AlignConsecutiveDeclarations: false +AlignEscapedNewlines: Left +AlignOperands: true +AlignTrailingComments: false +AllowAllParametersOfDeclarationOnNextLine: false +AllowShortBlocksOnASingleLine: false +AllowShortCaseLabelsOnASingleLine: false +AllowShortFunctionsOnASingleLine: None +AllowShortIfStatementsOnASingleLine: false +AllowShortLoopsOnASingleLine: false +AlwaysBreakAfterDefinitionReturnType: None +AlwaysBreakAfterReturnType: None +AlwaysBreakBeforeMultilineStrings: false +AlwaysBreakTemplateDeclarations: false +BinPackArguments: true +BinPackParameters: true +BraceWrapping: + AfterClass: false + AfterControlStatement: false + AfterEnum: false + AfterFunction: true + AfterNamespace: true + AfterObjCDeclaration: false + AfterStruct: false + AfterUnion: false + AfterExternBlock: false + BeforeCatch: false + BeforeElse: false + IndentBraces: false + SplitEmptyFunction: true + SplitEmptyRecord: true + SplitEmptyNamespace: true +BreakBeforeBinaryOperators: None +BreakBeforeBraces: Custom +BreakBeforeInheritanceComma: false 
+BreakBeforeTernaryOperators: false +BreakConstructorInitializersBeforeComma: false +BreakConstructorInitializers: BeforeComma +BreakAfterJavaFieldAnnotations: false +BreakStringLiterals: false +ColumnLimit: 80 +CommentPragmas: '^ IWYU pragma:' +CompactNamespaces: false +ConstructorInitializerAllOnOneLineOrOnePerLine: false +ConstructorInitializerIndentWidth: 8 +ContinuationIndentWidth: 8 +Cpp11BracedListStyle: false +DerivePointerAlignment: false +DisableFormat: false +ExperimentalAutoDetectBinPacking: false +FixNamespaceComments: false + +# Taken from: +# git grep -h '^#define [^[:space:]]*for_each[^[:space:]]*(' include/ tools/ \ +# | sed "s,^#define \([^[:space:]]*for_each[^[:space:]]*\)(.*$, - '\1'," \ +# | LC_ALL=C sort -u +ForEachMacros: + - 'for_each_nst' + +IncludeBlocks: Preserve +IncludeCategories: + - Regex: '.*' + Priority: 1 +IncludeIsMainRegex: '(Test)?$' +IndentCaseLabels: false +IndentGotoLabels: false +IndentPPDirectives: None +IndentWidth: 8 +IndentWrappedFunctionNames: false +JavaScriptQuotes: Leave +JavaScriptWrapImports: true +KeepEmptyLinesAtTheStartOfBlocks: false +MacroBlockBegin: '' +MacroBlockEnd: '' +MaxEmptyLinesToKeep: 1 +NamespaceIndentation: None +ObjCBinPackProtocolList: Auto +ObjCBlockIndentWidth: 8 +ObjCSpaceAfterProperty: true +ObjCSpaceBeforeProtocolList: true + +# Taken from git's rules +PenaltyBreakAssignment: 10 +PenaltyBreakBeforeFirstCallParameter: 30 +PenaltyBreakComment: 10 +PenaltyBreakFirstLessLess: 0 +PenaltyBreakString: 10 +PenaltyExcessCharacter: 100 +PenaltyReturnTypeOnItsOwnLine: 60 + +PointerAlignment: Right +ReflowComments: false +SortIncludes: false +SortUsingDeclarations: false +SpaceAfterCStyleCast: false +SpaceAfterTemplateKeyword: true +SpaceBeforeAssignmentOperators: true +SpaceBeforeCtorInitializerColon: true +SpaceBeforeInheritanceColon: true +SpaceBeforeParens: ControlStatementsExceptForEachMacros +SpaceBeforeRangeBasedForLoopColon: true +SpaceInEmptyParentheses: false +SpacesBeforeTrailingComments: 1 
+SpacesInAngles: false +SpacesInContainerLiterals: false +SpacesInCStyleCastParentheses: false +SpacesInParentheses: false +SpacesInSquareBrackets: false +Standard: Cpp03 +TabWidth: 8 +UseTab: Always +... From 8346216c9adf34920a6c0724d332c53557051557 Mon Sep 17 00:00:00 2001 From: David Gibson <david@gibson.dropbear.id.au> Date: Wed, 6 Nov 2024 10:25:18 +1100 Subject: [PATCH 080/382] Makefile: Simplify exclusion of qrap from static checks There are things in qrap.c that clang-tidy complains about that aren't worth fixing. So, we currently exclude it using $(filter-out). However, we already have a make variable which has just the passt sources, excluding qrap, so we can use that instead of the awkward filter-out expression. Currently, we still include qrap.c for cppcheck, but there's not much point doing so: it's, well, qrap, so we don't care that much about lints. Exclude it from cppcheck as well, for consistency. Signed-off-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- Makefile | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Makefile b/Makefile index c1c6e30..8e14309 100644 --- a/Makefile +++ b/Makefile @@ -262,7 +262,7 @@ docs: README.md # parentheses to reinforce that certainly won't improve readability. 
-clang-tidy: $(filter-out qrap.c,$(SRCS)) $(HEADERS) +clang-tidy: $(PASST_SRCS) $(HEADERS) clang-tidy -checks=*,-modernize-*,\ -clang-analyzer-valist.Uninitialized,\ -cppcoreguidelines-init-variables,\ @@ -290,14 +290,14 @@ clang-tidy: $(filter-out qrap.c,$(SRCS)) $(HEADERS) -cppcoreguidelines-macro-to-enum,\ -readability-math-missing-parentheses \ -config='{CheckOptions: [{key: bugprone-suspicious-string-compare.WarnOnImplicitComparison, value: "false"}]}' \ - --warnings-as-errors=* $(filter-out qrap.c,$(SRCS)) -- $(filter-out -pie,$(FLAGS) $(CFLAGS) $(CPPFLAGS)) -DCLANG_TIDY_58992 + --warnings-as-errors=* $(PASST_SRCS) -- $(filter-out -pie,$(FLAGS) $(CFLAGS) $(CPPFLAGS)) -DCLANG_TIDY_58992 SYSTEM_INCLUDES := /usr/include $(wildcard /usr/include/$(TARGET)) ifeq ($(shell $(CC) -v 2>&1 | grep -c "gcc version"),1) VER := $(shell $(CC) -dumpversion) SYSTEM_INCLUDES += /usr/lib/gcc/$(TARGET)/$(VER)/include endif -cppcheck: $(SRCS) $(HEADERS) +cppcheck: $(PASST_SRCS) $(HEADERS) if cppcheck --check-level=exhaustive /dev/null > /dev/null 2>&1; then \ CPPCHECK_EXHAUSTIVE="--check-level=exhaustive"; \ else \ @@ -313,4 +313,4 @@ cppcheck: $(SRCS) $(HEADERS) --inline-suppr \ --suppress=unusedStructMember \ $(filter -D%,$(FLAGS) $(CFLAGS) $(CPPFLAGS)) \ - $(SRCS) $(HEADERS) + $(PASST_SRCS) $(HEADERS) From b78e72da0b27e222592ff1f1578c69bad4756c65 Mon Sep 17 00:00:00 2001 From: David Gibson <david@gibson.dropbear.id.au> Date: Wed, 6 Nov 2024 10:25:19 +1100 Subject: [PATCH 081/382] clang: Move clang-tidy configuration from Makefile to .clang-tidy Currently we configure clang-tidy with a very long command line spelled out in the Makefile (mostly a big list of lints to disable). Move it from here into a .clang-tidy configuration file, so that the config is accessible if clang-tidy is invoked in other ways (e.g. via clangd) as well. As a bonus this also means that we can move the bulky comments about why we're suppressing various tests inline with the relevant config lines. 
Signed-off-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- .clang-tidy | 93 +++++++++++++++++++++++++++++++++++++++++++ Makefile | 111 +--------------------------------------------------- 2 files changed, 95 insertions(+), 109 deletions(-) create mode 100644 .clang-tidy diff --git a/.clang-tidy b/.clang-tidy new file mode 100644 index 0000000..9d346ec --- /dev/null +++ b/.clang-tidy @@ -0,0 +1,93 @@ +--- +Checks: + - "clang-diagnostic-*,clang-analyzer-*,*,-modernize-*" + + # TODO: enable once https://bugs.llvm.org/show_bug.cgi?id=41311 is fixed + - "-clang-analyzer-valist.Uninitialized" + + # Dubious value, would kill readability + - "-cppcoreguidelines-init-variables" + + # Dubious value over the compiler's built-in warning. Would + # increase verbosity. + - "-bugprone-assignment-in-if-condition" + + # Debatable whether these improve readability, right now it would look + # like a mess + - "-google-readability-braces-around-statements" + - "-hicpp-braces-around-statements" + - "-readability-braces-around-statements" + + # TODO: in most cases they are justified, but probably not everywhere + # + - "-readability-magic-numbers" + - "-cppcoreguidelines-avoid-magic-numbers" + + # TODO: this is Linux-only for the moment, nice to fix eventually + - "-llvmlibc-restrict-system-libc-headers" + + # Those are needed for syscalls, epoll_wait flags, etc. 
+ - "-hicpp-signed-bitwise" + + # Probably not doable to impement this without plain memcpy(), memset() + - "-clang-analyzer-security.insecureAPI.DeprecatedOrUnsafeBufferHandling" + + # TODO: not really important, but nice to fix eventually + - "-llvm-include-order" + + # Dubious value, would kill readability + - "-readability-isolate-declaration" + + # TODO: nice to fix eventually + - "-bugprone-narrowing-conversions" + - "-cppcoreguidelines-narrowing-conversions" + + # TODO: check, fix, and more in general constify wherever possible + - "-cppcoreguidelines-avoid-non-const-global-variables" + + # TODO: check paths where it might make sense to improve performance + - "-altera-unroll-loops" + - "-altera-id-dependent-backward-branch" + + # Not much can be done about them other than being careful + - "-bugprone-easily-swappable-parameters" + + # TODO: split reported functions + - "-readability-function-cognitive-complexity" + + # "Poor" alignment needed for structs reflecting message formats/headers + - "-altera-struct-pack-align" + + # TODO: check again if multithreading is implemented + - "-concurrency-mt-unsafe" + + # Complains about any identifier <3 characters, reasonable for + # globals, pointlessly verbose for locals and parameters. + - "-readability-identifier-length" + + # Wants to include headers which *directly* provide the things + # we use. That sounds nice, but means it will often want a OS + # specific header instead of a mostly standard one, such as + # <linux/limits.h> instead of <limits.h>. + - "-misc-include-cleaner" + + # Want to replace all #defines of integers with enums. Kind of + # makes sense when those defines form an enum-like set, but + # weird for cases like standalone constants, and causes other + # awkwardness for a bunch of cases we use + - "-cppcoreguidelines-macro-to-enum" + + # It's been a couple of centuries since multiplication has been granted + # precedence over addition in modern mathematical notation. 
Adding + # parentheses to reinforce that certainly won't improve readability. + - "-readability-math-missing-parentheses" +WarningsAsErrors: "*" +HeaderFileExtensions: + - h +ImplementationFileExtensions: + - c +HeaderFilterRegex: "" +FormatStyle: none +CheckOptions: + bugprone-suspicious-string-compare.WarnOnImplicitComparison: "false" +SystemHeaders: false diff --git a/Makefile b/Makefile index 8e14309..f1e9937 100644 --- a/Makefile +++ b/Makefile @@ -181,116 +181,9 @@ docs: README.md done < README.md; \ ) > README.plain.md -# Checkers currently disabled for clang-tidy: -# - llvmlibc-restrict-system-libc-headers -# TODO: this is Linux-only for the moment, nice to fix eventually -# -# - google-readability-braces-around-statements -# - hicpp-braces-around-statements -# - readability-braces-around-statements -# Debatable whether that improves readability, right now it would look -# like a mess -# -# - readability-magic-numbers -# - cppcoreguidelines-avoid-magic-numbers -# TODO: in most cases they are justified, but probably not everywhere -# -# - clang-analyzer-valist.Uninitialized -# TODO: enable once https://bugs.llvm.org/show_bug.cgi?id=41311 is fixed -# -# - clang-analyzer-security.insecureAPI.DeprecatedOrUnsafeBufferHandling -# Probably not doable to impement this without plain memcpy(), memset() -# -# - cppcoreguidelines-init-variables -# Dubious value, would kill readability -# -# - hicpp-signed-bitwise -# Those are needed for syscalls, epoll_wait flags, etc. 
-# -# - llvm-include-order -# TODO: not really important, but nice to fix eventually -# -# - readability-isolate-declaration -# Dubious value, would kill readability -# -# - bugprone-narrowing-conversions -# - cppcoreguidelines-narrowing-conversions -# TODO: nice to fix eventually -# -# - cppcoreguidelines-avoid-non-const-global-variables -# TODO: check, fix, and more in general constify wherever possible -# -# - altera-unroll-loops -# - altera-id-dependent-backward-branch -# TODO: check paths where it might make sense to improve performance -# -# - bugprone-easily-swappable-parameters -# Not much can be done about them other than being careful -# -# - readability-function-cognitive-complexity -# TODO: split reported functions -# -# - altera-struct-pack-align -# "Poor" alignment needed for structs reflecting message formats/headers -# -# - concurrency-mt-unsafe -# TODO: check again if multithreading is implemented -# -# - readability-identifier-length -# Complains about any identifier <3 characters, reasonable for -# globals, pointlessly verbose for locals and parameters. -# -# - bugprone-assignment-in-if-condition -# Dubious value over the compiler's built-in warning. Would -# increase verbosity. -# -# - misc-include-cleaner -# Wants to include headers which *directly* provide the things -# we use. That sounds nice, but means it will often want a OS -# specific header instead of a mostly standard one, such as -# <linux/limits.h> instead of <limits.h>. -# -# - cppcoreguidelines-macro-to-enum -# Want to replace all #defines of integers with enums. Kind of -# makes sense when those defines form an enum-like set, but -# weird for cases like standalone constants, and causes other -# awkwardness for a bunch of cases we use -# -# - readability-math-missing-parentheses -# It's been a couple of centuries since multiplication has been granted -# precedence over addition in modern mathematical notation. 
Adding -# parentheses to reinforce that certainly won't improve readability. - - clang-tidy: $(PASST_SRCS) $(HEADERS) - clang-tidy -checks=*,-modernize-*,\ - -clang-analyzer-valist.Uninitialized,\ - -cppcoreguidelines-init-variables,\ - -bugprone-assignment-in-if-condition,\ - -google-readability-braces-around-statements,\ - -hicpp-braces-around-statements,\ - -readability-braces-around-statements,\ - -readability-magic-numbers,\ - -llvmlibc-restrict-system-libc-headers,\ - -hicpp-signed-bitwise,\ - -clang-analyzer-security.insecureAPI.DeprecatedOrUnsafeBufferHandling,\ - -llvm-include-order,\ - -cppcoreguidelines-avoid-magic-numbers,\ - -readability-isolate-declaration,\ - -bugprone-narrowing-conversions,\ - -cppcoreguidelines-narrowing-conversions,\ - -cppcoreguidelines-avoid-non-const-global-variables,\ - -altera-unroll-loops,-altera-id-dependent-backward-branch,\ - -bugprone-easily-swappable-parameters,\ - -readability-function-cognitive-complexity,\ - -altera-struct-pack-align,\ - -concurrency-mt-unsafe,\ - -readability-identifier-length,\ - -misc-include-cleaner,\ - -cppcoreguidelines-macro-to-enum,\ - -readability-math-missing-parentheses \ - -config='{CheckOptions: [{key: bugprone-suspicious-string-compare.WarnOnImplicitComparison, value: "false"}]}' \ - --warnings-as-errors=* $(PASST_SRCS) -- $(filter-out -pie,$(FLAGS) $(CFLAGS) $(CPPFLAGS)) -DCLANG_TIDY_58992 + clang-tidy $(PASST_SRCS) -- $(filter-out -pie,$(FLAGS) $(CFLAGS) $(CPPFLAGS)) \ + -DCLANG_TIDY_58992 SYSTEM_INCLUDES := /usr/include $(wildcard /usr/include/$(TARGET)) ifeq ($(shell $(CC) -v 2>&1 | grep -c "gcc version"),1) From 30b4f8816774665321e6903b4f55a929b015d16d Mon Sep 17 00:00:00 2001 From: David Gibson <david@gibson.dropbear.id.au> Date: Wed, 6 Nov 2024 10:25:20 +1100 Subject: [PATCH 082/382] arch: Avoid explicit access to 'environ' We pass 'environ' to execve() in arch_avx2_exec(), so that we retain the environment in the current process.
But the declaration of 'environ' is a bit weird - it doesn't seem to be in a standard header, requiring a manual explicit declaration. But, we can avoid needing to reference it explicitly by using execv() instead of execve(). This removes a clang warning. Signed-off-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- arch.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch.c b/arch.c index d1dfb73..e1ee729 100644 --- a/arch.c +++ b/arch.c @@ -45,7 +45,7 @@ void arch_avx2_exec(char **argv) "%s.avx2", exe)) die_perror("Can't build AVX2 executable path"); - execve(new_path, argv, environ); + execv(new_path, argv); warn_perror("Can't run AVX2 build, using non-AVX2 version"); } } From f6b546c6e4f036bc569df05cf76eced3f68d6db8 Mon Sep 17 00:00:00 2001 From: David Gibson <david@gibson.dropbear.id.au> Date: Wed, 6 Nov 2024 10:25:21 +1100 Subject: [PATCH 083/382] flow: Correct type of flowside_at_sidx() Due to a copy-pasta error, this returns 'PIF_NONE' instead of NULL on the failure case. PIF_NONE expands to 0, which turns into NULL, but it's still confusing, so fix it. This removes a clang warning. 
Signed-off-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- flow_table.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/flow_table.h b/flow_table.h index a499e7b..f15db53 100644 --- a/flow_table.h +++ b/flow_table.h @@ -110,7 +110,7 @@ static inline const struct flowside *flowside_at_sidx(flow_sidx_t sidx) const union flow *flow = flow_at_sidx(sidx); if (!flow) - return PIF_NONE; + return NULL; return &flow->f.side[sidx.sidei]; } From c938d8a93e2561df1a4ac7897327456e97babb8c Mon Sep 17 00:00:00 2001 From: David Gibson <david@gibson.dropbear.id.au> Date: Wed, 6 Nov 2024 10:25:22 +1100 Subject: [PATCH 084/382] netlink: RTA_PAYLOAD() returns int, not size_t Since it's the size of a chunk of memory it would seem logical that RTA_PAYLOAD() returns size_t. However, it doesn't - it explicitly casts its result to an int. RTNH_OK(), which often takes the result of RTA_PAYLOAD() as a parameter compares it to an int, so using size_t can result in comparison of different-signed integer warnings from clang. 
Signed-off-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- netlink.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/netlink.c b/netlink.c index 0bdbabf..4aba2a3 100644 --- a/netlink.c +++ b/netlink.c @@ -353,7 +353,7 @@ unsigned int nl_get_ext_if(int s, sa_family_t af) */ bool nl_route_get_def_multipath(struct rtattr *rta, void *gw) { - size_t nh_len = RTA_PAYLOAD(rta); + int nh_len = RTA_PAYLOAD(rta); struct rtnexthop *rtnh; bool found = false; int hops = -1; @@ -582,7 +582,7 @@ int nl_route_dup(int s_src, unsigned int ifi_src, *(unsigned int *)RTA_DATA(rta) = ifi_dst; } else if (rta->rta_type == RTA_MULTIPATH) { - size_t nh_len = RTA_PAYLOAD(rta); + int nh_len = RTA_PAYLOAD(rta); struct rtnexthop *rtnh; for (rtnh = (struct rtnexthop *)RTA_DATA(rta); From 93bce404c19652b40f2104633286b6dac5f85b0e Mon Sep 17 00:00:00 2001 From: David Gibson <david@gibson.dropbear.id.au> Date: Wed, 6 Nov 2024 10:25:23 +1100 Subject: [PATCH 085/382] Makefile: Move NETNS_RUN_DIR definition to C code NETNS_RUN_DIR is set in the Makefile, then passed into the C code with -D. But NETNS_RUN_DIR is just a fixed string, it doesn't depend on any make probes or variables, so there's really no reason to handle it via the Makefile. Just move it to a plain #define in conf.c. 
Signed-off-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- Makefile | 1 - conf.c | 2 ++ 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/Makefile b/Makefile index f1e9937..41f24e8 100644 --- a/Makefile +++ b/Makefile @@ -44,7 +44,6 @@ FLAGS := -Wall -Wextra -Wno-format-zero-length FLAGS += -pedantic -std=c11 -D_XOPEN_SOURCE=700 -D_GNU_SOURCE FLAGS += $(FORTIFY_FLAG) -O2 -pie -fPIE FLAGS += -DPAGE_SIZE=$(shell getconf PAGE_SIZE) -FLAGS += -DNETNS_RUN_DIR=\"/run/netns\" FLAGS += -DPASST_AUDIT_ARCH=AUDIT_ARCH_$(AUDIT_ARCH) FLAGS += -DRLIMIT_STACK_VAL=$(RLIMIT_STACK_VAL) FLAGS += -DARCH=\"$(TARGET_ARCH)\" diff --git a/conf.c b/conf.c index 14411b4..86566db 100644 --- a/conf.c +++ b/conf.c @@ -46,6 +46,8 @@ #include "isolation.h" #include "log.h" +#define NETNS_RUN_DIR "/run/netns" + /** * next_chunk - Return the next piece of a string delimited by a character * @s: String to search From 7917159005d41d2f87213645e9460534beb1e14f Mon Sep 17 00:00:00 2001 From: David Gibson <david@gibson.dropbear.id.au> Date: Wed, 6 Nov 2024 10:25:24 +1100 Subject: [PATCH 086/382] seccomp: Simplify handling of AUDIT_ARCH Currently we construct the AUDIT_ARCH variable in the Makefile, then pass it into the C code with -D. The only place that uses it, though is the BPF filter generated by seccomp.sh. seccomp.sh already needs to do things differently depending on the arch, so it might as well just insert the expanded AUDIT_ARCH directly into the generated code, rather than using a #define. Arguably this is better, even, since it ensures more locally that the arch the BPF checks for matches the arch seccomp.sh built the filter for. 
Signed-off-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- Makefile | 9 --------- seccomp.sh | 14 ++++++++++++-- 2 files changed, 12 insertions(+), 11 deletions(-) diff --git a/Makefile b/Makefile index 41f24e8..c521d04 100644 --- a/Makefile +++ b/Makefile @@ -25,14 +25,6 @@ TARGET ?= $(shell $(CC) -dumpmachine) TARGET_ARCH := $(shell echo $(TARGET) | cut -f1 -d- | tr [A-Z] [a-z]) TARGET_ARCH := $(shell echo $(TARGET_ARCH) | sed 's/powerpc/ppc/') -AUDIT_ARCH := $(shell echo $(TARGET_ARCH) | tr [a-z] [A-Z] | sed 's/^ARM.*/ARM/') -AUDIT_ARCH := $(shell echo $(AUDIT_ARCH) | sed 's/I[456]86/I386/') -AUDIT_ARCH := $(shell echo $(AUDIT_ARCH) | sed 's/PPC64/PPC/') -AUDIT_ARCH := $(shell echo $(AUDIT_ARCH) | sed 's/PPCLE/PPC64LE/') -AUDIT_ARCH := $(shell echo $(AUDIT_ARCH) | sed 's/MIPS64EL/MIPSEL64/') -AUDIT_ARCH := $(shell echo $(AUDIT_ARCH) | sed 's/HPPA/PARISC/') -AUDIT_ARCH := $(shell echo $(AUDIT_ARCH) | sed 's/SH4/SH/') - # On some systems enabling optimization also enables source fortification, # automagically. Do not override it. 
FORTIFY_FLAG := @@ -44,7 +36,6 @@ FLAGS := -Wall -Wextra -Wno-format-zero-length FLAGS += -pedantic -std=c11 -D_XOPEN_SOURCE=700 -D_GNU_SOURCE FLAGS += $(FORTIFY_FLAG) -O2 -pie -fPIE FLAGS += -DPAGE_SIZE=$(shell getconf PAGE_SIZE) -FLAGS += -DPASST_AUDIT_ARCH=AUDIT_ARCH_$(AUDIT_ARCH) FLAGS += -DRLIMIT_STACK_VAL=$(RLIMIT_STACK_VAL) FLAGS += -DARCH=\"$(TARGET_ARCH)\" FLAGS += -DVERSION=\"$(VERSION)\" diff --git a/seccomp.sh b/seccomp.sh index 38aa826..6499c58 100755 --- a/seccomp.sh +++ b/seccomp.sh @@ -20,6 +20,15 @@ OUT="$(mktemp)" [ -z "${ARCH}" ] && ARCH="$(uname -m)" [ -z "${CC}" ] && CC="cc" +AUDIT_ARCH="AUDIT_ARCH_$(echo ${ARCH} | tr [a-z] [A-Z] \ + | sed 's/^ARM.*/ARM/' \ + | sed 's/I[456]86/I386/' \ + | sed 's/PPC64/PPC/' \ + | sed 's/PPCLE/PPC64LE/' \ + | sed 's/MIPS64EL/MIPSEL64/' \ + | sed 's/HPPA/PARISC/' \ + | sed 's/SH4/SH/')" + HEADER="/* This file was automatically generated by $(basename ${0}) */ #ifndef AUDIT_ARCH_PPC64LE @@ -32,7 +41,7 @@ struct sock_filter filter_@PROFILE@[] = { /* cppcheck-suppress [badBitmaskCheck, unmatchedSuppression] */ BPF_STMT(BPF_LD | BPF_W | BPF_ABS, (offsetof(struct seccomp_data, arch))), - BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, PASST_AUDIT_ARCH, 0, @KILL@), + BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, @AUDIT_ARCH@, 0, @KILL@), /* cppcheck-suppress [badBitmaskCheck, unmatchedSuppression] */ BPF_STMT(BPF_LD | BPF_W | BPF_ABS, (offsetof(struct seccomp_data, nr))), @@ -233,7 +242,8 @@ gen_profile() { sub ${__i} CALL "NR:${__nr}" "NAME:${__name}" "ALLOW:${__allow}" done - finish PRE "PROFILE:${__profile}" "KILL:$(( __statements + 1))" + finish PRE "PROFILE:${__profile}" "KILL:$(( __statements + 1))" \ + "AUDIT_ARCH:${AUDIT_ARCH}" } printf '%s\n' "${HEADER}" > "${OUT}" From 13fc6d511eb89b15a0941c63ae44f147572b1470 Mon Sep 17 00:00:00 2001 From: David Gibson <david@gibson.dropbear.id.au> Date: Wed, 6 Nov 2024 10:25:25 +1100 Subject: [PATCH 087/382] Makefile: Use -DARCH for qrap only We insert -DARCH for all compiles, based on 
TARGET_ARCH determined in the Makefile. However, this is only used in qrap.c, not anywhere else in passt or pasta. Only supply this -D when compiling qrap specifically. Signed-off-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- Makefile | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/Makefile b/Makefile index c521d04..2a8540a 100644 --- a/Makefile +++ b/Makefile @@ -37,7 +37,6 @@ FLAGS += -pedantic -std=c11 -D_XOPEN_SOURCE=700 -D_GNU_SOURCE FLAGS += $(FORTIFY_FLAG) -O2 -pie -fPIE FLAGS += -DPAGE_SIZE=$(shell getconf PAGE_SIZE) FLAGS += -DRLIMIT_STACK_VAL=$(RLIMIT_STACK_VAL) -FLAGS += -DARCH=\"$(TARGET_ARCH)\" FLAGS += -DVERSION=\"$(VERSION)\" FLAGS += -DDUAL_STACK_SOCKETS=$(DUAL_STACK_SOCKETS) @@ -107,7 +106,7 @@ pasta.avx2 pasta.1 pasta: pasta%: passt% ln -sf $< $@ qrap: $(QRAP_SRCS) passt.h - $(CC) $(FLAGS) $(CFLAGS) $(CPPFLAGS) $(QRAP_SRCS) -o qrap $(LDFLAGS) + $(CC) $(FLAGS) $(CFLAGS) $(CPPFLAGS) -DARCH=\"$(TARGET_ARCH)\" $(QRAP_SRCS) -o qrap $(LDFLAGS) valgrind: EXTRA_SYSCALLS += rt_sigprocmask rt_sigtimedwait rt_sigaction \ rt_sigreturn getpid gettid kill clock_gettime mmap \ From c560e2f65b625367d3baf0fcf06cf19996407659 Mon Sep 17 00:00:00 2001 From: David Gibson <david@gibson.dropbear.id.au> Date: Wed, 6 Nov 2024 10:25:26 +1100 Subject: [PATCH 088/382] Makefile: Don't attempt to auto-detect stack size We probe the available stack limit in the Makefile using rlimit, then use that to set the size of the stack when we clone() extra threads. But the rlimit at compile time need not be the same as the rlimit at runtime, so that's not particularly sensible. Ideally, we'd set the stack size based on an estimate of the actual maximum stack usage of all our clone()ed functions. We don't have that at the moment, but to keep things simple just set it to 1MiB - that's what the current probe will set things to on my default configuration Fedora 40, so it's likely to be fine in most cases. 
Signed-off-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- Makefile | 6 ------ util.h | 2 +- 2 files changed, 1 insertion(+), 7 deletions(-) diff --git a/Makefile b/Makefile index 2a8540a..56bf2e8 100644 --- a/Makefile +++ b/Makefile @@ -15,11 +15,6 @@ VERSION ?= $(shell git describe --tags HEAD 2>/dev/null || echo "unknown\ versio # the IPv6 socket API? (Linux does) DUAL_STACK_SOCKETS := 1 -RLIMIT_STACK_VAL := $(shell /bin/sh -c 'ulimit -s') -ifeq ($(RLIMIT_STACK_VAL),unlimited) -RLIMIT_STACK_VAL := 1024 -endif - TARGET ?= $(shell $(CC) -dumpmachine) # Get 'uname -m'-like architecture description for target TARGET_ARCH := $(shell echo $(TARGET) | cut -f1 -d- | tr [A-Z] [a-z]) @@ -36,7 +31,6 @@ FLAGS := -Wall -Wextra -Wno-format-zero-length FLAGS += -pedantic -std=c11 -D_XOPEN_SOURCE=700 -D_GNU_SOURCE FLAGS += $(FORTIFY_FLAG) -O2 -pie -fPIE FLAGS += -DPAGE_SIZE=$(shell getconf PAGE_SIZE) -FLAGS += -DRLIMIT_STACK_VAL=$(RLIMIT_STACK_VAL) FLAGS += -DVERSION=\"$(VERSION)\" FLAGS += -DDUAL_STACK_SOCKETS=$(DUAL_STACK_SOCKETS) diff --git a/util.h b/util.h index 3fc64cf..c341236 100644 --- a/util.h +++ b/util.h @@ -132,7 +132,7 @@ static inline uint32_t ntohl_unaligned(const void *p) return ntohl(val); } -#define NS_FN_STACK_SIZE (RLIMIT_STACK_VAL * 1024 / 8) +#define NS_FN_STACK_SIZE (1024 * 1024) /* 1MiB */ int do_clone(int (*fn)(void *), char *stack_area, size_t stack_size, int flags, void *arg); #define NS_CALL(fn, arg) \ From 1d7cff3779e4bff944ce17c86471a87141c352d2 Mon Sep 17 00:00:00 2001 From: David Gibson <david@gibson.dropbear.id.au> Date: Wed, 6 Nov 2024 10:25:27 +1100 Subject: [PATCH 089/382] clang: Add rudimentary clangd configuration clangd's default configuration seems to try to treat .h files as C++ not C. There are many more spurious warnings generated at present, but this removes some of the most egregious ones. 
Signed-off-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- .clangd | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 .clangd diff --git a/.clangd b/.clangd new file mode 100644 index 0000000..41bec92 --- /dev/null +++ b/.clangd @@ -0,0 +1,3 @@ +CompileFlags: + # Don't try to interpret our headers as C++' + Add: [-xc, -Wall] From 1e76a19895b5d8b2b5994263625fce35373041e7 Mon Sep 17 00:00:00 2001 From: David Gibson <david@gibson.dropbear.id.au> Date: Wed, 6 Nov 2024 10:25:28 +1100 Subject: [PATCH 090/382] util: Remove unused ffsl() function We supply a weak alias for ffsl() in case it's not defined in our libc. Except.. we don't have any users for it any more, so remove it. make cppcheck doesn't spot this at present for complicated reasons, but it might with tweaks to the options I'm experimenting with. Signed-off-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- util.h | 3 --- 1 file changed, 3 deletions(-) diff --git a/util.h b/util.h index c341236..2858b10 100644 --- a/util.h +++ b/util.h @@ -158,9 +158,6 @@ int do_clone(int (*fn)(void *), char *stack_area, size_t stack_size, int flags, struct ctx; -/* cppcheck-suppress funcArgNamesDifferent */ -__attribute__ ((weak)) int ffsl(long int i) { return __builtin_ffsl(i); } - #ifdef CLOSE_RANGE_UNSHARE /* Linux kernel >= 5.9 */ /* glibc < 2.34 and musl as of 1.2.5 need these */ #ifndef SYS_close_range From c5f4e4d146f6f57a66bd4d7792e8ccf9625d039c Mon Sep 17 00:00:00 2001 From: David Gibson <david@gibson.dropbear.id.au> Date: Wed, 6 Nov 2024 12:43:04 +1100 Subject: [PATCH 091/382] fwd: Squash different-signedness comparison warning On certain architectures we get a warning about comparison between different signedness integers in fwd_probe_ephemeral(). This is because NUM_PORTS evaluates to an unsigned integer. 
It's a fixed value, though, and we know it will fit in a signed long on anything reasonable, so add a cast to suppress the warning. Signed-off-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- fwd.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fwd.c b/fwd.c index c71f5e1..0b7f8b1 100644 --- a/fwd.c +++ b/fwd.c @@ -75,8 +75,8 @@ void fwd_probe_ephemeral(void) if (*end || errno) goto parse_err; - if (min < 0 || min >= NUM_PORTS || - max < 0 || max >= NUM_PORTS) + if (min < 0 || min >= (long)NUM_PORTS || + max < 0 || max >= (long)NUM_PORTS) goto parse_err; fwd_ephemeral_min = min; From 0d7b8201ed5788416d1b36fc3a554b61ad10c201 Mon Sep 17 00:00:00 2001 From: David Gibson <david@gibson.dropbear.id.au> Date: Wed, 6 Nov 2024 17:54:14 +1100 Subject: [PATCH 092/382] linux_dep: Generalise tcp_info.h to handling Linux extension compatibility tcp_info.h exists just to contain a modern enough version of struct tcp_info for our needs, removing compile time dependency on the version of kernel headers. There are several other cases where we can remove similar compile time dependencies on kernel version. Prepare for that by renaming tcp_info.h to linux_dep.h.
Signed-off-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- tcp_info.h => linux_dep.h | 10 ++++++---- tcp.c | 2 +- 2 files changed, 7 insertions(+), 5 deletions(-) rename tcp_info.h => linux_dep.h (97%) diff --git a/tcp_info.h b/linux_dep.h similarity index 97% rename from tcp_info.h rename to linux_dep.h index 06ccb16..8921623 100644 --- a/tcp_info.h +++ b/linux_dep.h @@ -1,13 +1,15 @@ /* SPDX-License-Identifier: GPL-2.0-or-later * Copyright Red Hat * - * Largely derived from include/linux/tcp.h in the Linux kernel + * Declarations for Linux specific dependencies */ -#ifndef TCP_INFO_H -#define TCP_INFO_H +#ifndef LINUX_DEP_H +#define LINUX_DEP_H /* struct tcp_info_linux - Information from Linux TCP_INFO getsockopt() + * + * Largely derived from include/linux/tcp.h in the Linux kernel * * Some fields returned by TCP_INFO have been there for ages and are shared with * BSD. struct tcp_info from netinet/tcp.h has only those fields. There are @@ -117,4 +119,4 @@ struct tcp_info_linux { */ }; -#endif /* TCP_INFO_H */ +#endif /* LINUX_DEP_H */ diff --git a/tcp.c b/tcp.c index 56ceba6..1bb122b 100644 --- a/tcp.c +++ b/tcp.c @@ -299,10 +299,10 @@ #include "log.h" #include "inany.h" #include "flow.h" +#include "linux_dep.h" #include "flow_table.h" #include "tcp_internal.h" -#include "tcp_info.h" #include "tcp_buf.h" /* MSS rounding: see SET_MSS() */ From d8e05a3fe0f2db444c51342888b37ed351b66f63 Mon Sep 17 00:00:00 2001 From: David Gibson <david@gibson.dropbear.id.au> Date: Wed, 6 Nov 2024 17:54:18 +1100 Subject: [PATCH 093/382] ndp: Use const pointer for ndp_ns packet We don't modify this structure at all. For some reason cppcheck doesn't catch this with our current options, but did when I was experimenting with some different options. 
Signed-off-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- ndp.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ndp.c b/ndp.c index a1ee834..faae408 100644 --- a/ndp.c +++ b/ndp.c @@ -234,8 +234,8 @@ int ndp(struct ctx *c, const struct icmp6hdr *ih, const struct in6_addr *saddr, return 1; if (ih->icmp6_type == NS) { - struct ndp_ns *ns = packet_get(p, 0, 0, sizeof(struct ndp_ns), - NULL); + const struct ndp_ns *ns = + packet_get(p, 0, 0, sizeof(struct ndp_ns), NULL); if (!ns) return -1; From 6f913b3af062a889f70758f8d3a458dcf0ac0cdd Mon Sep 17 00:00:00 2001 From: David Gibson <david@gibson.dropbear.id.au> Date: Wed, 6 Nov 2024 17:54:19 +1100 Subject: [PATCH 094/382] udp: Don't dereference uflow before NULL check in udp_reply_sock_handler() We have an ASSERT() verifying that we're able to look up the flow in udp_reply_sock_handler(). However, we dereference uflow before that in an initializer, rather defeating the point. Rearrange to avoid that. 
Signed-off-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- udp.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/udp.c b/udp.c index 0c01067..4be165f 100644 --- a/udp.c +++ b/udp.c @@ -644,12 +644,13 @@ void udp_reply_sock_handler(const struct ctx *c, union epoll_ref ref, flow_sidx_t tosidx = flow_sidx_opposite(ref.flowside); const struct flowside *toside = flowside_at_sidx(tosidx); struct udp_flow *uflow = udp_at_sidx(ref.flowside); - int from_s = uflow->s[ref.flowside.sidei]; uint8_t topif = pif_at_sidx(tosidx); - int n, i; + int n, i, from_s; ASSERT(!c->no_udp && uflow); + from_s = uflow->s[ref.flowside.sidei]; + if (udp_sock_errs(c, from_s, events) < 0) { flow_err(uflow, "Unrecoverable error on reply socket"); flow_err_details(uflow); From 867db07fcfc24d0918fa92f98e26fc8f9bf40253 Mon Sep 17 00:00:00 2001 From: David Gibson <david@gibson.dropbear.id.au> Date: Wed, 6 Nov 2024 17:54:20 +1100 Subject: [PATCH 095/382] util: Work around cppcheck bug 6936 While experimenting with cppcheck options, I hit several false positives caused by this bug: https://trac.cppcheck.net/ticket/13227 Signed-off-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- Makefile | 2 +- util.h | 10 ++++++++++ 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 56bf2e8..3c82f50 100644 --- a/Makefile +++ b/Makefile @@ -188,5 +188,5 @@ cppcheck: $(PASST_SRCS) $(HEADERS) $(SYSTEM_INCLUDES:%=--suppress=unmatchedSuppression:%/*) \ --inline-suppr \ --suppress=unusedStructMember \ - $(filter -D%,$(FLAGS) $(CFLAGS) $(CPPFLAGS)) \ + $(filter -D%,$(FLAGS) $(CFLAGS) $(CPPFLAGS)) -D CPPCHECK_6936 \ $(PASST_SRCS) $(HEADERS) diff --git a/util.h b/util.h index 2858b10..0bf396a 100644 --- a/util.h +++ b/util.h @@ -68,6 +68,15 @@ #define STRINGIFY(x) #x #define STR(x) STRINGIFY(x) +#ifdef CPPCHECK_6936 +/* Some cppcheck versions get confused by 
aborts inside a loop, causing + * it to give false positive uninitialised variable warnings later in + * the function, because it doesn't realise the non-initialising path + * already exited. See https://trac.cppcheck.net/ticket/13227 + */ +#define ASSERT(expr) \ + ((expr) ? (void)0 : abort()) +#else #define ASSERT(expr) \ do { \ if (!(expr)) { \ @@ -79,6 +88,7 @@ abort(); \ } \ } while (0) +#endif #ifdef P_tmpdir #define TMPDIR P_tmpdir From b456ee1b53171c46b6f25c1c43d9fc17f6116745 Mon Sep 17 00:00:00 2001 From: David Gibson <david@gibson.dropbear.id.au> Date: Wed, 6 Nov 2024 14:03:21 +1100 Subject: [PATCH 096/382] test: Rename propagating signal handler nstool in "exec" mode will propagate some signals (specifically SIGTERM) to the process in the namespace it executes. The signal handler which accomplishes this is called simply sig_handler(). However, it turns out we're going to need some other signal handlers, so rename this to the more specific sig_propagate(). Signed-off-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- test/nstool.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/nstool.c b/test/nstool.c index fc357d8..3f75edd 100644 --- a/test/nstool.c +++ b/test/nstool.c @@ -346,7 +346,7 @@ static int openns(const char *fmt, ...) } static pid_t sig_pid; -static void sig_handler(int signum) +static void sig_propagate(int signum) { int err; @@ -358,7 +358,7 @@ static void sig_handler(int signum) static void wait_for_child(pid_t pid) { struct sigaction sa = { - .sa_handler = sig_handler, + .sa_handler = sig_propagate, .sa_flags = SA_RESETHAND, }; int status, err; From 1699083f291ca8e639d0711eff59c61eecdf02c1 Mon Sep 17 00:00:00 2001 From: David Gibson <david@gibson.dropbear.id.au> Date: Wed, 6 Nov 2024 14:03:22 +1100 Subject: [PATCH 097/382] test: Make nstool hold robust against interruptions to control clients Currently nstool die()s on essentially any error. 
In most cases that's fine for our purposes. However, it's a problem when in "hold" mode and getting an IO error on an accept()ed socket. This could just indicate that the control client aborted prematurely, in which case we don't want to kill off the namespace we're holding. Adjust these to print an error, close() the control client socket and carry on. In addition, we need to explicitly ignore SIGPIPE in order not to be killed by an abruptly closed client connection. Signed-off-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- test/nstool.c | 36 ++++++++++++++++++++++++++---------- 1 file changed, 26 insertions(+), 10 deletions(-) diff --git a/test/nstool.c b/test/nstool.c index 3f75edd..7ab5d2a 100644 --- a/test/nstool.c +++ b/test/nstool.c @@ -31,10 +31,15 @@ #define ARRAY_SIZE(a) ((int)(sizeof(a) / sizeof((a)[0]))) -#define die(...) \ - do { \ - fprintf(stderr, __VA_ARGS__); \ - exit(1); \ +#define die(...) \ + do { \ + fprintf(stderr, "nstool: " __VA_ARGS__); \ + exit(1); \ + } while (0) + +#define err(...) 
\ + do { \ + fprintf(stderr, "nstool: " __VA_ARGS__); \ } while (0) struct ns_type { @@ -156,6 +161,9 @@ static int connect_ctl(const char *sockpath, bool wait, static void cmd_hold(int argc, char *argv[]) { + struct sigaction sa = { + .sa_handler = SIG_IGN, + }; int fd = socket(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, PF_UNIX); struct sockaddr_un addr; const char *sockpath = argv[1]; @@ -185,6 +193,10 @@ static void cmd_hold(int argc, char *argv[]) if (!getcwd(info.cwd, sizeof(info.cwd))) die("getcwd(): %s\n", strerror(errno)); + rc = sigaction(SIGPIPE, &sa, NULL); + if (rc) + die("sigaction(SIGPIPE): %s\n", strerror(errno)); + do { int afd = accept(fd, NULL, NULL); char buf; @@ -193,17 +205,21 @@ static void cmd_hold(int argc, char *argv[]) die("accept(): %s\n", strerror(errno)); rc = write(afd, &info, sizeof(info)); - if (rc < 0) - die("write(): %s\n", strerror(errno)); + if (rc < 0) { + err("holder write() to control socket: %s\n", + strerror(errno)); + } if ((size_t)rc < sizeof(info)) - die("short write() on control socket\n"); + err("holder short write() on control socket\n"); rc = read(afd, &buf, sizeof(buf)); - if (rc < 0) - die("read(): %s\n", strerror(errno)); + if (rc < 0) { + err("holder read() on control socket: %s\n", + strerror(errno)); + } close(afd); - } while (rc == 0); + } while (rc <= 0); unlink(sockpath); } From 910f4f91030141b7e2e65644dc9fe678cc57f640 Mon Sep 17 00:00:00 2001 From: David Gibson <david@gibson.dropbear.id.au> Date: Wed, 6 Nov 2024 12:44:14 +1100 Subject: [PATCH 098/382] test: Don't require 64-bit prefixes in perf tests When determining the namespace's IPv6 address in the perf test setup, we explicitly filter for addresses with a 64-bit prefix length. There's no real reason we need that - as long as it's a global address we can use it. 
I suspect this was copied without thinking from a similar example in the NDP tests, where the 64-bit prefix length _is_ meaningful (though it's not entirely clear if the handling is correct there either). Signed-off-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- test/perf/pasta_tcp | 2 +- test/perf/pasta_udp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/test/perf/pasta_tcp b/test/perf/pasta_tcp index d1ccf7d..88284b2 100644 --- a/test/perf/pasta_tcp +++ b/test/perf/pasta_tcp @@ -211,7 +211,7 @@ tr TCP throughput over IPv6: host to ns iperf3s ns 10002 nsout IFNAME ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname' -nsout ADDR6 ip -j -6 addr show|jq -rM '.[] | select(.ifname == "__IFNAME__").addr_info[] | select(.scope == "global" and .prefixlen == 64).local' +nsout ADDR6 ip -j -6 addr show|jq -rM '.[] | select(.ifname == "__IFNAME__").addr_info[] | select(.scope == "global").local' bw - bw - bw - diff --git a/test/perf/pasta_udp b/test/perf/pasta_udp index 544bf17..3d07091 100644 --- a/test/perf/pasta_udp +++ b/test/perf/pasta_udp @@ -196,7 +196,7 @@ tr UDP throughput over IPv6: host to ns iperf3s ns 10002 nsout IFNAME ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname' -nsout ADDR6 ip -j -6 addr show|jq -rM '.[] | select(.ifname == "__IFNAME__").addr_info[] | select(.scope == "global" and .prefixlen == 64).local' +nsout ADDR6 ip -j -6 addr show|jq -rM '.[] | select(.ifname == "__IFNAME__").addr_info[] | select(.scope == "global").local' iperf3 BW host __ADDR6__ 10002 __TIME__ __OPTS__ -b 8G -l 1472 bw __BW__ 0.3 0.5 iperf3 BW host __ADDR6__ 10002 __TIME__ __OPTS__ -b 12G -l 3972 From 9a0e544f05bf93609921f988b22f0680e143b4ad Mon Sep 17 00:00:00 2001 From: David Gibson <david@gibson.dropbear.id.au> Date: Wed, 6 Nov 2024 12:44:15 +1100 Subject: [PATCH 099/382] test: Improve test for NDP assigned prefix In the NDP tests we search explicitly for a guest 
address with prefix length 64. AFAICT this is an attempt to specifically find the SLAAC assigned address, rather than something assigned by other means. We can do that more explicitly by checking for .protocol == "kernel_ra", however. The SLAAC prefixes we assigned *will* always be 64-bit, that's hard-coded into our NDP implementation. RFC4862 doesn't really allow anything else since the interface identifiers for an Ethernet-like link are 64-bits. Let's actually verify that, rather than just assuming it, by extracting the prefix length assigned in the guest and checking it as well. Signed-off-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- test/passt/ndp | 4 ++-- test/pasta/ndp | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/test/passt/ndp b/test/passt/ndp index f54b8ce..56b385b 100644 --- a/test/passt/ndp +++ b/test/passt/ndp @@ -23,8 +23,8 @@ hout HOST_IFNAME6 ip -j -6 route show|jq -rM '[.[] | select(.dst == "default").d check [ -n "__IFNAME__" ] test SLAAC: prefix -gout ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__IFNAME__").addr_info[] | select(.scope == "global" and .prefixlen == 64).local] | .[0]' -gout PREFIX6 sipcalc __ADDR6__/64 | grep prefix | cut -d' ' -f4 +gout ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__IFNAME__").addr_info[] | select(.scope == "global" and .protocol == "kernel_ra") | .local + "/" + (.prefixlen | tostring)] | .[0]' +gout PREFIX6 sipcalc __ADDR6__ | grep prefix | cut -d' ' -f4 hout HOST_ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__HOST_IFNAME6__").addr_info[] | select(.scope == "global" and .deprecated != true).local] | .[0]' hout HOST_PREFIX6 sipcalc __HOST_ADDR6__/64 | grep prefix | cut -d' ' -f4 check [ "__PREFIX6__" = "__HOST_PREFIX6__" ] diff --git a/test/pasta/ndp b/test/pasta/ndp index c59627f..2442ab5 100644 --- a/test/pasta/ndp +++ b/test/pasta/ndp @@ -22,8 +22,8 @@ ns ip link set dev __IFNAME__ up ns
while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done test SLAAC: prefix -nsout ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__IFNAME__").addr_info[] | select(.scope == "global" and .prefixlen == 64).local] | .[0]' -nsout PREFIX6 sipcalc __ADDR6__/64 | grep prefix | cut -d' ' -f4 +nsout ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__IFNAME__").addr_info[] | select(.scope == "global" and .protocol == "kernel_ra") | .local + "/" + (.prefixlen | tostring)] | .[0]' +nsout PREFIX6 sipcalc __ADDR6__ | grep prefix | cut -d' ' -f4 hout HOST_ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__IFNAME__").addr_info[] | select(.scope == "global" and .deprecated != true).local] | .[0]' hout HOST_PREFIX6 sipcalc __HOST_ADDR6__/64 | grep prefix | cut -d' ' -f4 check [ "__PREFIX6__" = "__HOST_PREFIX6__" ] From 78da088f7babfa0431c6fb2704ef0709fe057770 Mon Sep 17 00:00:00 2001 From: Jon Maloy <jmaloy@redhat.com> Date: Tue, 5 Nov 2024 20:07:44 -0500 Subject: [PATCH 100/382] tcp: unify payload and flags l2 frames array In order to reduce static memory and code footprint, we merge the array for l2 flag frames into the one for payload frames. This change also ensures that no flag message will be sent out over the l2 media bypassing already queued payload messages.
Performance measurements with iperf3, where we force all traffic via the tap queue, show no significant difference: Dual traffic both directions simultaneously, with patch: ======================================================== host->ns: -------- [ ID] Interval Transfer Bitrate Retr [ 5] 0.00-100.00 sec 36.3 GBytes 3.12 Gbits/sec 4759 sender [ 5] 0.00-100.04 sec 36.3 GBytes 3.11 Gbits/sec receiver ns->host: --------- [ ID] Interval Transfer Bitrate [ 5] 0.00-100.00 sec 321 GBytes 27.6 Gbits/sec receiver Dual traffic both directions simultaneously, without patch: ============================================================ host->ns: -------- [ ID] Interval Transfer Bitrate Retr [ 5] 0.00-100.00 sec 35.0 GBytes 3.01 Gbits/sec 6001 sender [ 5] 0.00-100.04 sec 34.8 GBytes 2.99 Gbits/sec receiver ns->host -------- [ ID] Interval Transfer Bitrate [ 5] 0.00-100.00 sec 345 GBytes 29.6 Gbits/sec receiver Single connection, with patch: ============================== host->ns: --------- [ ID] Interval Transfer Bitrate Retr [ 5] 0.00-100.00 sec 138 GBytes 11.8 Gbits/sec 922 sender [ 5] 0.00-100.04 sec 138 GBytes 11.8 Gbits/sec receiver ns->host: ----------- [ ID] Interval Transfer Bitrate [ 5] 0.00-100.00 sec 430 GBytes 36.9 Gbits/sec receiver Single connection, without patch: ================================= host->ns: ------------ [ ID] Interval Transfer Bitrate Retr [ 5] 0.00-100.00 sec 139 GBytes 11.9 Gbits/sec 900 sender [ 5] 0.00-100.04 sec 139 GBytes 11.9 Gbits/sec receiver ns->host: --------- [ ID] Interval Transfer Bitrate [ 5] 0.00-100.00 sec 440 GBytes 37.8 Gbits/sec receiver Signed-off-by: Jon Maloy <jmaloy@redhat.com> Reviewed-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- tcp.c | 1 - tcp_buf.c | 70 ++++++++++++-------------------------------------- tcp_buf.h | 1 - tcp_internal.h | 15 ----------- 4 files changed, 17 insertions(+), 70 deletions(-) diff --git a/tcp.c b/tcp.c index 1bb122b..a3d48fa 100644 ---
a/tcp.c +++ b/tcp.c @@ -937,7 +937,6 @@ bool tcp_flow_defer(const struct tcp_tap_conn *conn) /* cppcheck-suppress [constParameterPointer, unmatchedSuppression] */ void tcp_defer_handler(struct ctx *c) { - tcp_flags_flush(c); tcp_payload_flush(c); } diff --git a/tcp_buf.c b/tcp_buf.c index 274e313..d29c1a9 100644 --- a/tcp_buf.c +++ b/tcp_buf.c @@ -20,7 +20,7 @@ #include <netinet/ip.h> -#include <linux/tcp.h> +#include <netinet/tcp.h> #include "util.h" #include "ip.h" @@ -59,22 +59,10 @@ static_assert(MSS6 <= sizeof(tcp_payload[0].data), "MSS6 is greater than 65516") static struct tcp_tap_conn *tcp_frame_conns[TCP_FRAMES_MEM]; static unsigned int tcp_payload_used; -static struct tap_hdr tcp_flags_tap_hdr[TCP_FRAMES_MEM]; -/* IPv4 headers for TCP segment without payload */ -static struct iphdr tcp4_flags_ip[TCP_FRAMES_MEM]; -/* TCP segments without payload for IPv4 frames */ -static struct tcp_flags_t tcp_flags[TCP_FRAMES_MEM]; - -static unsigned int tcp_flags_used; - -/* IPv6 headers for TCP segment without payload */ -static struct ipv6hdr tcp6_flags_ip[TCP_FRAMES_MEM]; - /* recvmsg()/sendmsg() data for tap */ static struct iovec iov_sock [TCP_FRAMES_MEM + 1]; static struct iovec tcp_l2_iov[TCP_FRAMES_MEM][TCP_NUM_IOVS]; -static struct iovec tcp_l2_flags_iov[TCP_FRAMES_MEM][TCP_NUM_IOVS]; /** * tcp_update_l2_buf() - Update Ethernet header buffers with addresses @@ -103,15 +91,6 @@ void tcp_sock_iov_init(const struct ctx *c) for (i = 0; i < ARRAY_SIZE(tcp_payload); i++) { tcp6_payload_ip[i] = ip6; tcp4_payload_ip[i] = iph; - tcp_payload[i].th.doff = sizeof(struct tcphdr) / 4; - tcp_payload[i].th.ack = 1; - } - - for (i = 0; i < ARRAY_SIZE(tcp_flags); i++) { - tcp6_flags_ip[i] = ip6; - tcp4_flags_ip[i] = iph; - tcp_flags[i].th.doff = sizeof(struct tcphdr) / 4; - tcp_flags[i].th.ack = 1; } for (i = 0; i < TCP_FRAMES_MEM; i++) { @@ -121,25 +100,6 @@ void tcp_sock_iov_init(const struct ctx *c) iov[TCP_IOV_ETH].iov_len = sizeof(struct ethhdr); 
iov[TCP_IOV_PAYLOAD].iov_base = &tcp_payload[i]; } - - for (i = 0; i < TCP_FRAMES_MEM; i++) { - struct iovec *iov = tcp_l2_flags_iov[i]; - - iov[TCP_IOV_TAP] = tap_hdr_iov(c, &tcp_flags_tap_hdr[i]); - iov[TCP_IOV_ETH].iov_len = sizeof(struct ethhdr); - iov[TCP_IOV_PAYLOAD].iov_base = &tcp_flags[i]; - } -} - -/** - * tcp_flags_flush() - Send out buffers for segments with no data (flags) - * @c: Execution context - */ -void tcp_flags_flush(const struct ctx *c) -{ - tap_send_frames(c, &tcp_l2_flags_iov[0][0], TCP_NUM_IOVS, - tcp_flags_used); - tcp_flags_used = 0; } /** @@ -171,7 +131,7 @@ static void tcp_revert_seq(const struct ctx *c, struct tcp_tap_conn **conns, } /** - * tcp_payload_flush() - Send out buffers for segments with data + * tcp_payload_flush() - Send out buffers for segments with data or flags * @c: Execution context */ void tcp_payload_flush(const struct ctx *c) @@ -197,37 +157,35 @@ void tcp_payload_flush(const struct ctx *c) */ int tcp_buf_send_flag(const struct ctx *c, struct tcp_tap_conn *conn, int flags) { - struct tcp_flags_t *payload; + struct tcp_payload_t *payload; struct iovec *iov; size_t optlen; size_t l4len; uint32_t seq; int ret; - iov = tcp_l2_flags_iov[tcp_flags_used]; + iov = tcp_l2_iov[tcp_payload_used]; if (CONN_V4(conn)) { - iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp4_flags_ip[tcp_flags_used]); + iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp4_payload_ip[tcp_payload_used]); iov[TCP_IOV_ETH].iov_base = &tcp4_eth_src; } else { - iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp6_flags_ip[tcp_flags_used]); + iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp6_payload_ip[tcp_payload_used]); iov[TCP_IOV_ETH].iov_base = &tcp6_eth_src; } payload = iov[TCP_IOV_PAYLOAD].iov_base; seq = conn->seq_to_tap; ret = tcp_prepare_flags(c, conn, flags, &payload->th, - &payload->opts, &optlen); + (struct tcp_syn_opts *)&payload->data, &optlen); if (ret <= 0) return ret; - tcp_flags_used++; + tcp_payload_used++; l4len = tcp_l2_buf_fill_headers(conn, iov, optlen, NULL, seq, false); 
iov[TCP_IOV_PAYLOAD].iov_len = l4len; - if (flags & DUP_ACK) { - struct iovec *dup_iov; + struct iovec *dup_iov = tcp_l2_iov[tcp_payload_used++]; - dup_iov = tcp_l2_flags_iov[tcp_flags_used++]; memcpy(dup_iov[TCP_IOV_TAP].iov_base, iov[TCP_IOV_TAP].iov_base, iov[TCP_IOV_TAP].iov_len); dup_iov[TCP_IOV_ETH].iov_base = iov[TCP_IOV_ETH].iov_base; @@ -237,8 +195,8 @@ int tcp_buf_send_flag(const struct ctx *c, struct tcp_tap_conn *conn, int flags) dup_iov[TCP_IOV_PAYLOAD].iov_len = l4len; } - if (tcp_flags_used > TCP_FRAMES_MEM - 2) - tcp_flags_flush(c); + if (tcp_payload_used > TCP_FRAMES_MEM - 2) + tcp_payload_flush(c); return 0; } @@ -254,6 +212,7 @@ int tcp_buf_send_flag(const struct ctx *c, struct tcp_tap_conn *conn, int flags) static void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn, ssize_t dlen, int no_csum, uint32_t seq) { + struct tcp_payload_t *payload; const uint16_t *check = NULL; struct iovec *iov; size_t l4len; @@ -274,6 +233,11 @@ static void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn, iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp6_payload_ip[tcp_payload_used]); iov[TCP_IOV_ETH].iov_base = &tcp6_eth_src; } + payload = iov[TCP_IOV_PAYLOAD].iov_base; + payload->th.th_off = sizeof(struct tcphdr) / 4; + payload->th.th_x2 = 0; + payload->th.th_flags = 0; + payload->th.ack = 1; l4len = tcp_l2_buf_fill_headers(conn, iov, dlen, check, seq, false); iov[TCP_IOV_PAYLOAD].iov_len = l4len; if (++tcp_payload_used > TCP_FRAMES_MEM - 1) diff --git a/tcp_buf.h b/tcp_buf.h index 49c04d4..54f5e53 100644 --- a/tcp_buf.h +++ b/tcp_buf.h @@ -7,7 +7,6 @@ #define TCP_BUF_H void tcp_sock_iov_init(const struct ctx *c); -void tcp_flags_flush(const struct ctx *c); void tcp_payload_flush(const struct ctx *c); int tcp_buf_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn); int tcp_buf_send_flag(const struct ctx *c, struct tcp_tap_conn *conn, int flags); diff --git a/tcp_internal.h b/tcp_internal.h index a5a47df..c846f60 100644 --- 
a/tcp_internal.h +++ b/tcp_internal.h @@ -134,21 +134,6 @@ struct tcp_syn_opts { .ws = TCP_OPT_WS(ws_), \ }) -/** - * struct tcp_flags_t - TCP header and data to send zero-length - * segments (flags) - * @th: TCP header - * @opts TCP options - */ -struct tcp_flags_t { - struct tcphdr th; - struct tcp_syn_opts opts; -#ifdef __AVX2__ -} __attribute__ ((packed, aligned(32))); -#else -} __attribute__ ((packed, aligned(__alignof__(unsigned int)))); -#endif - extern char tcp_buf_discard [MAX_WINDOW]; void conn_flag_do(const struct ctx *c, struct tcp_tap_conn *conn, From 5f5e814cfc27c14cd7f116c8fb59e17d5671cafe Mon Sep 17 00:00:00 2001 From: Stefano Brivio <sbrivio@redhat.com> Date: Thu, 7 Nov 2024 17:47:08 +0100 Subject: [PATCH 101/382] dhcpv6: Use for loop instead of goto to avoid false positive cppcheck warning cppcheck 2.16.0 reports: dhcpv6.c:334:14: style: The comparison 'ia_type == 3' is always true. [knownConditionTrueFalse] if (ia_type == OPT_IA_NA) { ^ dhcpv6.c:306:12: note: 'ia_type' is assigned value '3' here. ia_type = OPT_IA_NA; ^ dhcpv6.c:334:14: note: The comparison 'ia_type == 3' is always true. if (ia_type == OPT_IA_NA) { ^ this is not really the case as we set ia_type to OPT_IA_TA and then jump back. Anyway, there's no particular reason to use a goto here: add a trivial foreach() macro to go through elements of an array and use it instead. 
Signed-off-by: Stefano Brivio <sbrivio@redhat.com> Reviewed-by: David Gibson <david@gibson.dropbear.id.au> --- dhcpv6.c | 51 +++++++++++++++++++++++---------------------------- util.h | 3 +++ 2 files changed, 26 insertions(+), 28 deletions(-) diff --git a/dhcpv6.c b/dhcpv6.c index 14a5c7e..f2e7307 100644 --- a/dhcpv6.c +++ b/dhcpv6.c @@ -296,47 +296,42 @@ static struct opt_hdr *dhcpv6_opt(const struct pool *p, size_t *offset, static struct opt_hdr *dhcpv6_ia_notonlink(const struct pool *p, struct in6_addr *la) { + int ia_types[2] = { OPT_IA_NA, OPT_IA_TA }, *ia_type; + const struct opt_ia_addr *opt_addr; char buf[INET6_ADDRSTRLEN]; struct in6_addr req_addr; const struct opt_hdr *h; struct opt_hdr *ia; size_t offset; - int ia_type; - ia_type = OPT_IA_NA; -ia_ta: - offset = 0; - while ((ia = dhcpv6_opt(p, &offset, ia_type))) { - if (ntohs(ia->l) < OPT_VSIZE(ia_na)) - return NULL; - - offset += sizeof(struct opt_ia_na); - - while ((h = dhcpv6_opt(p, &offset, OPT_IAAADR))) { - const struct opt_ia_addr *opt_addr; - - if (ntohs(h->l) != OPT_VSIZE(ia_addr)) + foreach(ia_type, ia_types) { + offset = 0; + while ((ia = dhcpv6_opt(p, &offset, *ia_type))) { + if (ntohs(ia->l) < OPT_VSIZE(ia_na)) return NULL; - opt_addr = (const struct opt_ia_addr *)h; - req_addr = opt_addr->addr; - if (!IN6_ARE_ADDR_EQUAL(la, &req_addr)) { - info("DHCPv6: requested address %s not on link", - inet_ntop(AF_INET6, &req_addr, - buf, sizeof(buf))); - return ia; - } + offset += sizeof(struct opt_ia_na); - offset += sizeof(struct opt_ia_addr); + while ((h = dhcpv6_opt(p, &offset, OPT_IAAADR))) { + if (ntohs(h->l) != OPT_VSIZE(ia_addr)) + return NULL; + + opt_addr = (const struct opt_ia_addr *)h; + req_addr = opt_addr->addr; + if (!IN6_ARE_ADDR_EQUAL(la, &req_addr)) + goto err; + + offset += sizeof(struct opt_ia_addr); + } } } - if (ia_type == OPT_IA_NA) { - ia_type = OPT_IA_TA; - goto ia_ta; - } - return NULL; + +err: + info("DHCPv6: requested address %s not on link", + inet_ntop(AF_INET6, &req_addr, 
buf, sizeof(buf))); + return ia; } /** diff --git a/util.h b/util.h index 0bf396a..582ef57 100644 --- a/util.h +++ b/util.h @@ -102,6 +102,9 @@ #define ARRAY_SIZE(a) ((int)(sizeof(a) / sizeof((a)[0]))) +#define foreach(item, array) \ + for ((item) = (array); (item) - (array) < ARRAY_SIZE(array); (item)++) + #define IN_INTERVAL(a, b, x) ((x) >= (a) && (x) <= (b)) #define FD_PROTO(x, proto) \ (IN_INTERVAL(c->proto.fd_min, c->proto.fd_max, (x))) From 1feb90fe627959e4903e01ba83249fa33c4d472d Mon Sep 17 00:00:00 2001 From: Stefano Brivio <sbrivio@redhat.com> Date: Thu, 7 Nov 2024 18:08:46 +0100 Subject: [PATCH 102/382] dhcpv6: Turn some option headers pointers to const cppcheck 2.14.2 on Alpine reports: dhcpv6.c:431:32: style: Variable 'client_id' can be declared as pointer to const [constVariablePointer] struct opt_hdr *ia, *bad_ia, *client_id; ^ It's not only 'client_id': we can declare 'ia' as const pointer too. Signed-off-by: Stefano Brivio <sbrivio@redhat.com> Reviewed-by: David Gibson <david@gibson.dropbear.id.au> --- dhcpv6.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dhcpv6.c b/dhcpv6.c index f2e7307..0523bba 100644 --- a/dhcpv6.c +++ b/dhcpv6.c @@ -423,11 +423,11 @@ search: int dhcpv6(struct ctx *c, const struct pool *p, const struct in6_addr *saddr, const struct in6_addr *daddr) { - struct opt_hdr *ia, *bad_ia, *client_id; - const struct opt_hdr *server_id; + const struct opt_hdr *client_id, *server_id, *ia; const struct in6_addr *src; const struct msg_hdr *mh; const struct udphdr *uh; + struct opt_hdr *bad_ia; size_t mlen, n; uh = packet_get(p, 0, 0, sizeof(*uh), &mlen); From 87940f9aa72a342988e89b9509c2572e494d91a6 Mon Sep 17 00:00:00 2001 From: Stefano Brivio <sbrivio@redhat.com> Date: Thu, 7 Nov 2024 18:58:49 +0100 Subject: [PATCH 103/382] tap: Cast TAP_BUF_BYTES - ETH_MAX_MTU to ssize_t, not TAP_BUF_BYTES Given that we're comparing against 'n', which is signed, we cast TAP_BUF_BYTES to ssize_t so that the maximum buffer usage, 
calculated as the difference between TAP_BUF_BYTES and ETH_MAX_MTU, will also be signed. This doesn't necessarily happen on 32-bit architectures, though. On armhf and i686, clang-tidy 18.1.8 and 19.1.2 report: /home/pi/passt/tap.c:1087:16: error: comparison of integers of different signs: 'ssize_t' (aka 'int') and 'unsigned int' [clang-diagnostic-sign-compare,-warnings-as-errors] 1087 | for (n = 0; n <= (ssize_t)TAP_BUF_BYTES - ETH_MAX_MTU; n += len) { | ~ ^ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ cast the whole difference to ssize_t, as we know it's going to be positive anyway, instead of relying on that side effect. Signed-off-by: Stefano Brivio <sbrivio@redhat.com> Reviewed-by: David Gibson <david@gibson.dropbear.id.au> --- tap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tap.c b/tap.c index f638f2c..a3ba958 100644 --- a/tap.c +++ b/tap.c @@ -1084,7 +1084,7 @@ static void tap_pasta_input(struct ctx *c, const struct timespec *now) tap_flush_pools(); - for (n = 0; n <= (ssize_t)TAP_BUF_BYTES - ETH_MAX_MTU; n += len) { + for (n = 0; n <= (ssize_t)(TAP_BUF_BYTES - ETH_MAX_MTU); n += len) { len = read(c->fd_tap, pkt_buf + n, ETH_MAX_MTU); if (len == 0) { From d4f09c9b96c68a1c6b1387cd5674cd331a939f27 Mon Sep 17 00:00:00 2001 From: Stefano Brivio <sbrivio@redhat.com> Date: Thu, 7 Nov 2024 19:04:44 +0100 Subject: [PATCH 104/382] util: Define small and big thresholds for socket buffers as unsigned long long On 32-bit architectures, clang-tidy reports: /home/pi/passt/tcp.c:728:11: error: performing an implicit widening conversion to type 'uint64_t' (aka 'unsigned long long') of a multiplication performed in type 'unsigned long' [bugprone-implicit-widening-of-multiplication-result,-warnings-as-errors] 728 | if (v >= SNDBUF_BIG) | ^ /home/pi/passt/util.h:158:22: note: expanded from macro 'SNDBUF_BIG' 158 | #define SNDBUF_BIG (4UL * 1024 * 1024) | ^ /home/pi/passt/tcp.c:728:11: note: make conversion explicit to silence this warning 728 | if (v >= 
SNDBUF_BIG) | ^ /home/pi/passt/util.h:158:22: note: expanded from macro 'SNDBUF_BIG' 158 | #define SNDBUF_BIG (4UL * 1024 * 1024) | ^~~~~~~~~~~~~~~~~ /home/pi/passt/tcp.c:728:11: note: perform multiplication in a wider type 728 | if (v >= SNDBUF_BIG) | ^ /home/pi/passt/util.h:158:22: note: expanded from macro 'SNDBUF_BIG' 158 | #define SNDBUF_BIG (4UL * 1024 * 1024) | ^~~~~~~~~~ /home/pi/passt/tcp.c:730:15: error: performing an implicit widening conversion to type 'uint64_t' (aka 'unsigned long long') of a multiplication performed in type 'unsigned long' [bugprone-implicit-widening-of-multiplication-result,-warnings-as-errors] 730 | else if (v > SNDBUF_SMALL) | ^ /home/pi/passt/util.h:159:24: note: expanded from macro 'SNDBUF_SMALL' 159 | #define SNDBUF_SMALL (128UL * 1024) | ^ /home/pi/passt/tcp.c:730:15: note: make conversion explicit to silence this warning 730 | else if (v > SNDBUF_SMALL) | ^ /home/pi/passt/util.h:159:24: note: expanded from macro 'SNDBUF_SMALL' 159 | #define SNDBUF_SMALL (128UL * 1024) | ^~~~~~~~~~~~ /home/pi/passt/tcp.c:730:15: note: perform multiplication in a wider type 730 | else if (v > SNDBUF_SMALL) | ^ /home/pi/passt/util.h:159:24: note: expanded from macro 'SNDBUF_SMALL' 159 | #define SNDBUF_SMALL (128UL * 1024) | ^~~~~ /home/pi/passt/tcp.c:731:17: error: performing an implicit widening conversion to type 'uint64_t' (aka 'unsigned long long') of a multiplication performed in type 'unsigned long' [bugprone-implicit-widening-of-multiplication-result,-warnings-as-errors] 731 | v -= v * (v - SNDBUF_SMALL) / (SNDBUF_BIG - SNDBUF_SMALL) / 2; | ^ /home/pi/passt/util.h:159:24: note: expanded from macro 'SNDBUF_SMALL' 159 | #define SNDBUF_SMALL (128UL * 1024) | ^ /home/pi/passt/tcp.c:731:17: note: make conversion explicit to silence this warning 731 | v -= v * (v - SNDBUF_SMALL) / (SNDBUF_BIG - SNDBUF_SMALL) / 2; | ^ /home/pi/passt/util.h:159:24: note: expanded from macro 'SNDBUF_SMALL' 159 | #define SNDBUF_SMALL (128UL * 1024) | ^~~~~~~~~~~~ 
/home/pi/passt/tcp.c:731:17: note: perform multiplication in a wider type 731 | v -= v * (v - SNDBUF_SMALL) / (SNDBUF_BIG - SNDBUF_SMALL) / 2; | ^ /home/pi/passt/util.h:159:24: note: expanded from macro 'SNDBUF_SMALL' 159 | #define SNDBUF_SMALL (128UL * 1024) | ^~~~~ because, wherever we use those thresholds, we define the other term of comparison as uint64_t. Define the thresholds as unsigned long long as well, to make sure we match types. Signed-off-by: Stefano Brivio <sbrivio@redhat.com> Reviewed-by: David Gibson <david@gibson.dropbear.id.au> --- util.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/util.h b/util.h index 582ef57..963f57b 100644 --- a/util.h +++ b/util.h @@ -158,9 +158,9 @@ int do_clone(int (*fn)(void *), char *stack_area, size_t stack_size, int flags, (void *)(arg)); \ } while (0) -#define RCVBUF_BIG (2UL * 1024 * 1024) -#define SNDBUF_BIG (4UL * 1024 * 1024) -#define SNDBUF_SMALL (128UL * 1024) +#define RCVBUF_BIG (2ULL * 1024 * 1024) +#define SNDBUF_BIG (4ULL * 1024 * 1024) +#define SNDBUF_SMALL (128ULL * 1024) #include <net/if.h> #include <limits.h> From 71869e2912b9ede9532725e9ee5e7752b7137009 Mon Sep 17 00:00:00 2001 From: Stefano Brivio <sbrivio@redhat.com> Date: Thu, 7 Nov 2024 19:28:21 +0100 Subject: [PATCH 105/382] passt: Use NOLINT clang-tidy block instead of NOLINTNEXTLINE For some reason, this is only reported by clang-tidy 19.1.2 on Alpine: /home/sbrivio/passt/passt.c:314:53: error: conditional operator with identical true and false expressions [bugprone-branch-clone,-warnings-as-errors] 314 | nfds = epoll_wait(c.epollfd, events, EPOLL_EVENTS, TIMER_INTERVAL); | ^ We do have a suppression, but not on the line preceding it, because we also need a cppcheck suppression there. Use NOLINTBEGIN/NOLINTEND for the clang-tidy suppression. 
Signed-off-by: Stefano Brivio <sbrivio@redhat.com> Reviewed-by: David Gibson <david@gibson.dropbear.id.au> --- passt.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/passt.c b/passt.c index eaf231d..fac6101 100644 --- a/passt.c +++ b/passt.c @@ -309,9 +309,10 @@ int main(int argc, char **argv) timer_init(&c, &now); loop: - /* NOLINTNEXTLINE(bugprone-branch-clone): intervals can be the same */ + /* NOLINTBEGIN(bugprone-branch-clone): intervals can be the same */ /* cppcheck-suppress [duplicateValueTernary, unmatchedSuppression] */ nfds = epoll_wait(c.epollfd, events, EPOLL_EVENTS, TIMER_INTERVAL); + /* NOLINTEND(bugprone-branch-clone) */ if (nfds == -1 && errno != EINTR) die_perror("epoll_wait() failed in main loop"); From 58fa5508bde073a39c93a8f1296e363f1786c84c Mon Sep 17 00:00:00 2001 From: Stefano Brivio <sbrivio@redhat.com> Date: Thu, 7 Nov 2024 19:40:37 +0100 Subject: [PATCH 106/382] tap, tcp, util: Add some missing SOCK_CLOEXEC flags I have no idea why, but these are reported by clang-tidy (19.2.1) on Alpine (x86) only: /home/sbrivio/passt/tap.c:1139:38: error: 'socket' should use SOCK_CLOEXEC where possible [android-cloexec-socket,-warnings-as-errors] 1139 | int fd = socket(AF_UNIX, SOCK_STREAM, 0); | ^ | | SOCK_CLOEXEC /home/sbrivio/passt/tap.c:1158:51: error: 'socket' should use SOCK_CLOEXEC where possible [android-cloexec-socket,-warnings-as-errors] 1158 | ex = socket(AF_UNIX, SOCK_STREAM | SOCK_NONBLOCK, 0); | ^ | | SOCK_CLOEXEC /home/sbrivio/passt/tcp.c:1413:44: error: 'socket' should use SOCK_CLOEXEC where possible [android-cloexec-socket,-warnings-as-errors] 1413 | s = socket(af, SOCK_STREAM | SOCK_NONBLOCK, IPPROTO_TCP); | ^ | | SOCK_CLOEXEC /home/sbrivio/passt/util.c:188:38: error: 'socket' should use SOCK_CLOEXEC where possible [android-cloexec-socket,-warnings-as-errors] 188 | if ((s = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP)) < 0) { | ^ | | SOCK_CLOEXEC Signed-off-by: Stefano Brivio <sbrivio@redhat.com> Reviewed-by: David 
Gibson <david@gibson.dropbear.id.au> --- tap.c | 5 +++-- tcp.c | 2 +- util.c | 3 ++- 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/tap.c b/tap.c index a3ba958..14d9b3d 100644 --- a/tap.c +++ b/tap.c @@ -1136,7 +1136,7 @@ void tap_handler_pasta(struct ctx *c, uint32_t events, */ int tap_sock_unix_open(char *sock_path) { - int fd = socket(AF_UNIX, SOCK_STREAM, 0); + int fd = socket(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0); struct sockaddr_un addr = { .sun_family = AF_UNIX, }; @@ -1155,7 +1155,8 @@ int tap_sock_unix_open(char *sock_path) UNIX_SOCK_PATH, i)) die_perror("Can't build UNIX domain socket path"); - ex = socket(AF_UNIX, SOCK_STREAM | SOCK_NONBLOCK, 0); + ex = socket(AF_UNIX, SOCK_STREAM | SOCK_NONBLOCK | SOCK_CLOEXEC, + 0); if (ex < 0) die_perror("Failed to check for UNIX domain conflicts"); diff --git a/tcp.c b/tcp.c index a3d48fa..6a98dfa 100644 --- a/tcp.c +++ b/tcp.c @@ -1410,7 +1410,7 @@ static int tcp_conn_new_sock(const struct ctx *c, sa_family_t af) { int s; - s = socket(af, SOCK_STREAM | SOCK_NONBLOCK, IPPROTO_TCP); + s = socket(af, SOCK_STREAM | SOCK_NONBLOCK | SOCK_CLOEXEC, IPPROTO_TCP); if (s > FD_REF_MAX) { close(s); diff --git a/util.c b/util.c index dddef93..3448f30 100644 --- a/util.c +++ b/util.c @@ -183,7 +183,8 @@ void sock_probe_mem(struct ctx *c) int v = INT_MAX / 2, s; socklen_t sl; - if ((s = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP)) < 0) { + s = socket(AF_INET, SOCK_STREAM | SOCK_CLOEXEC, IPPROTO_TCP); + if (s < 0) { c->low_wmem = c->low_rmem = 1; return; } From b84cd05098275a7625223141d019f8af5a17323b Mon Sep 17 00:00:00 2001 From: David Gibson <david@gibson.dropbear.id.au> Date: Fri, 8 Nov 2024 13:53:27 +1100 Subject: [PATCH 107/382] log: Only check for FALLOC_FL_COLLAPSE_RANGE availability at runtime log.c has several #ifdefs on FALLOC_FL_COLLAPSE_RANGE that won't attempt to use it if not defined. 
But even if the value is defined at compile time, it might not be available in the runtime kernel, so we need to check for errors from a fallocate() call and fall back to other methods. Simplify this to only need the runtime check by using linux_dep.h to define FALLOC_FL_COLLAPSE_RANGE if it's not in the kernel headers. Signed-off-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- Makefile | 5 ----- linux_dep.h | 6 ++++++ log.c | 10 ++-------- 3 files changed, 8 insertions(+), 13 deletions(-) diff --git a/Makefile b/Makefile index 3c82f50..0ba85b4 100644 --- a/Makefile +++ b/Makefile @@ -59,11 +59,6 @@ ifeq ($(shell :|$(CC) -fstack-protector-strong -S -xc - -o - >/dev/null 2>&1; ec FLAGS += -fstack-protector-strong endif -C := \#define _GNU_SOURCE\n\#include <fcntl.h>\nint x = FALLOC_FL_COLLAPSE_RANGE; -ifeq ($(shell printf "$(C)" | $(CC) -S -xc - -o - >/dev/null 2>&1; echo $$?),0) - EXTRA_SYSCALLS += fallocate -endif - prefix ?= /usr/local exec_prefix ?= $(prefix) bindir ?= $(exec_prefix)/bin diff --git a/linux_dep.h b/linux_dep.h index 8921623..eae9c3c 100644 --- a/linux_dep.h +++ b/linux_dep.h @@ -119,4 +119,10 @@ struct tcp_info_linux { */ }; +#include <linux/falloc.h> + +#ifndef FALLOC_FL_COLLAPSE_RANGE +#define FALLOC_FL_COLLAPSE_RANGE 0x08 +#endif + #endif /* LINUX_DEP_H */ diff --git a/log.c b/log.c index 19f1d98..239c8ce 100644 --- a/log.c +++ b/log.c @@ -26,6 +26,7 @@ #include <stdarg.h> #include <sys/socket.h> +#include "linux_dep.h" #include "log.h" #include "util.h" #include "passt.h" @@ -92,7 +93,6 @@ const char *logfile_prefix[] = { " ", /* LOG_DEBUG */ }; -#ifdef FALLOC_FL_COLLAPSE_RANGE /** * logfile_rotate_fallocate() - Write header, set log_written after fallocate() * @fd: Log file descriptor @@ -126,7 +126,6 @@ static void logfile_rotate_fallocate(int fd, const struct timespec *now) log_written -= log_cut_size; } -#endif /* FALLOC_FL_COLLAPSE_RANGE */ /** * logfile_rotate_move() - Fallback: move 
recent entries toward start, then cut @@ -198,21 +197,17 @@ out: * * Return: 0 on success, negative error code on failure * - * #syscalls fcntl - * - * fallocate() passed as EXTRA_SYSCALL only if FALLOC_FL_COLLAPSE_RANGE is there + * #syscalls fcntl fallocate */ static int logfile_rotate(int fd, const struct timespec *now) { if (fcntl(fd, F_SETFL, O_RDWR /* Drop O_APPEND: explicit lseek() */)) return -errno; -#ifdef FALLOC_FL_COLLAPSE_RANGE /* Only for Linux >= 3.15, extent-based ext4 or XFS, glibc >= 2.18 */ if (!fallocate(fd, FALLOC_FL_COLLAPSE_RANGE, 0, log_cut_size)) logfile_rotate_fallocate(fd, now); else -#endif logfile_rotate_move(fd, now); if (fcntl(fd, F_SETFL, O_RDWR | O_APPEND)) @@ -432,4 +427,3 @@ void logfile_init(const char *name, const char *path, size_t size) /* For FALLOC_FL_COLLAPSE_RANGE: VFS block size can be up to one page */ log_cut_size = ROUND_UP(log_size * LOGFILE_CUT_RATIO / 100, PAGE_SIZE); } - From d64f25724399fbb4ba9d36eda7e17984a4c6c91c Mon Sep 17 00:00:00 2001 From: David Gibson <david@gibson.dropbear.id.au> Date: Fri, 8 Nov 2024 13:53:28 +1100 Subject: [PATCH 108/382] linux_dep: Move close_range() conditional handling to linux_dep.h util.h has some #ifdefs and weak definitions to handle compatibility with various kernel versions. Move this to linux_dep.h which handles several other similar cases. 
Signed-off-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- linux_dep.h | 20 ++++++++++++++++++++ util.c | 1 + util.h | 19 ------------------- 3 files changed, 21 insertions(+), 19 deletions(-) diff --git a/linux_dep.h b/linux_dep.h index eae9c3c..3a41e42 100644 --- a/linux_dep.h +++ b/linux_dep.h @@ -125,4 +125,24 @@ struct tcp_info_linux { #define FALLOC_FL_COLLAPSE_RANGE 0x08 #endif +#include <linux/close_range.h> + +#ifdef CLOSE_RANGE_UNSHARE /* Linux kernel >= 5.9 */ +/* glibc < 2.34 and musl as of 1.2.5 need these */ +#ifndef SYS_close_range +#define SYS_close_range 436 +#endif +__attribute__ ((weak)) +/* cppcheck-suppress funcArgNamesDifferent */ +int close_range(unsigned int first, unsigned int last, int flags) { + return syscall(SYS_close_range, first, last, flags); +} +#else +/* No reasonable fallback option */ +/* cppcheck-suppress funcArgNamesDifferent */ +int close_range(unsigned int first, unsigned int last, int flags) { + return 0; +} +#endif + #endif /* LINUX_DEP_H */ diff --git a/util.c b/util.c index 3448f30..913f34b 100644 --- a/util.c +++ b/util.c @@ -28,6 +28,7 @@ #include <linux/errqueue.h> #include <getopt.h> +#include "linux_dep.h" #include "util.h" #include "iov.h" #include "passt.h" diff --git a/util.h b/util.h index 963f57b..3616515 100644 --- a/util.h +++ b/util.h @@ -17,7 +17,6 @@ #include <arpa/inet.h> #include <unistd.h> #include <sys/syscall.h> -#include <linux/close_range.h> #include "log.h" @@ -171,24 +170,6 @@ int do_clone(int (*fn)(void *), char *stack_area, size_t stack_size, int flags, struct ctx; -#ifdef CLOSE_RANGE_UNSHARE /* Linux kernel >= 5.9 */ -/* glibc < 2.34 and musl as of 1.2.5 need these */ -#ifndef SYS_close_range -#define SYS_close_range 436 -#endif -__attribute__ ((weak)) -/* cppcheck-suppress funcArgNamesDifferent */ -int close_range(unsigned int first, unsigned int last, int flags) { - return syscall(SYS_close_range, first, last, flags); -} -#else -/* No 
reasonable fallback option */ -/* cppcheck-suppress funcArgNamesDifferent */ -int close_range(unsigned int first, unsigned int last, int flags) { - return 0; -} -#endif - int sock_l4_sa(const struct ctx *c, enum epoll_type type, const void *sa, socklen_t sl, const char *ifname, bool v6only, uint32_t data); From 14dd70e2b33941f1f7663969574278873c9e3d35 Mon Sep 17 00:00:00 2001 From: David Gibson <david@gibson.dropbear.id.au> Date: Fri, 8 Nov 2024 13:53:29 +1100 Subject: [PATCH 109/382] linux_dep: Fix CLOSE_RANGE_UNSHARE availability handling If CLOSE_RANGE_UNSHARE isn't defined, we define a fallback version of close_range() which is a (successful) no-op. This is broken in several ways: * It doesn't actually fix compile if using old kernel headers, because the caller of close_range() still directly uses CLOSE_RANGE_UNSHARE unprotected by ifdefs * Even if it did fix the compile, it means inconsistent behaviour between a compile time failure to find the value (we silently don't close files) and a runtime failure (we die with an error from close_range()) * Silently not closing the files we intend to close for security reasons is probably not a good idea in any case We don't want to simply error if close_range() or CLOSE_RANGE_UNSHARE isn't available, because that would require running on kernel >= 5.9. On the other hand there's not really any other way to flush all possible fds leaked by the parent (close() in a loop takes over a minute). So in this case print a warning and carry on. As bonus this fixes a cppcheck error I see with some different options I'm looking to apply in future. 
Signed-off-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- linux_dep.h | 12 ++++-------- util.c | 16 ++++++++++++++-- 2 files changed, 18 insertions(+), 10 deletions(-) diff --git a/linux_dep.h b/linux_dep.h index 3a41e42..240f50a 100644 --- a/linux_dep.h +++ b/linux_dep.h @@ -127,22 +127,18 @@ struct tcp_info_linux { #include <linux/close_range.h> -#ifdef CLOSE_RANGE_UNSHARE /* Linux kernel >= 5.9 */ /* glibc < 2.34 and musl as of 1.2.5 need these */ #ifndef SYS_close_range #define SYS_close_range 436 #endif +#ifndef CLOSE_RANGE_UNSHARE /* Linux kernel < 5.9 */ +#define CLOSE_RANGE_UNSHARE (1U << 1) +#endif + __attribute__ ((weak)) /* cppcheck-suppress funcArgNamesDifferent */ int close_range(unsigned int first, unsigned int last, int flags) { return syscall(SYS_close_range, first, last, flags); } -#else -/* No reasonable fallback option */ -/* cppcheck-suppress funcArgNamesDifferent */ -int close_range(unsigned int first, unsigned int last, int flags) { - return 0; -} -#endif #endif /* LINUX_DEP_H */ diff --git a/util.c b/util.c index 913f34b..126dedb 100644 --- a/util.c +++ b/util.c @@ -738,8 +738,20 @@ void close_open_files(int argc, char **argv) rc = close_range(fd + 1, ~0U, CLOSE_RANGE_UNSHARE); } - if (rc) - die_perror("Failed to close files leaked by parent"); + if (rc) { + if (errno == ENOSYS || errno == EINVAL) { + /* This probably means close_range() or the + * CLOSE_RANGE_UNSHARE flag is not supported by the + * kernel. Not much we can do here except carry on and + * hope for the best. 
+ */ + warn( +"Can't use close_range() to ensure no files leaked by parent"); + } else { + die_perror("Failed to close files leaked by parent"); + } + } + } /** From 0588163b1f981a3ef87a9a3fe155dc2f0e116e18 Mon Sep 17 00:00:00 2001 From: David Gibson <david@gibson.dropbear.id.au> Date: Fri, 8 Nov 2024 13:53:30 +1100 Subject: [PATCH 110/382] cppcheck: Don't check the system headers We pass -I options to cppcheck so that it will find the system headers. Then we need to pass a bunch more options to suppress the zillions of cppcheck errors found in those headers. It turns out, however, that it's not recommended to give the system headers to cppcheck anyway. Instead it has built-in knowledge of the ANSI libc and uses that as the basis of its checks. We do need to suppress missingIncludeSystem warnings instead though. Not bothering with the system headers makes the cppcheck runtime go from ~37s to ~14s on my machine, which is a pretty nice win. Signed-off-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- Makefile | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/Makefile b/Makefile index 0ba85b4..258d298 100644 --- a/Makefile +++ b/Makefile @@ -163,11 +163,6 @@ clang-tidy: $(PASST_SRCS) $(HEADERS) clang-tidy $(PASST_SRCS) -- $(filter-out -pie,$(FLAGS) $(CFLAGS) $(CPPFLAGS)) \ -DCLANG_TIDY_58992 -SYSTEM_INCLUDES := /usr/include $(wildcard /usr/include/$(TARGET)) -ifeq ($(shell $(CC) -v 2>&1 | grep -c "gcc version"),1) -VER := $(shell $(CC) -dumpversion) -SYSTEM_INCLUDES += /usr/lib/gcc/$(TARGET)/$(VER)/include -endif cppcheck: $(PASST_SRCS) $(HEADERS) if cppcheck --check-level=exhaustive /dev/null > /dev/null 2>&1; then \ CPPCHECK_EXHAUSTIVE="--check-level=exhaustive"; \ @@ -177,11 +172,8 @@ cppcheck: $(PASST_SRCS) $(HEADERS) cppcheck --std=c11 --error-exitcode=1 --enable=all --force \ --inconclusive --library=posix --quiet \ $${CPPCHECK_EXHAUSTIVE} \ - $(SYSTEM_INCLUDES:%=-I%) \ - 
$(SYSTEM_INCLUDES:%=--config-exclude=%) \ - $(SYSTEM_INCLUDES:%=--suppress=*:%/*) \ - $(SYSTEM_INCLUDES:%=--suppress=unmatchedSuppression:%/*) \ --inline-suppr \ + --suppress=missingIncludeSystem \ --suppress=unusedStructMember \ $(filter -D%,$(FLAGS) $(CFLAGS) $(CPPFLAGS)) -D CPPCHECK_6936 \ $(PASST_SRCS) $(HEADERS) From 71f228d04b5c68b1cf42d95e4e5bbb82af0a0e60 Mon Sep 17 00:00:00 2001 From: David Gibson <david@gibson.dropbear.id.au> Date: Thu, 14 Nov 2024 14:33:03 +1100 Subject: [PATCH 111/382] ndp: Remove redundant update to addr_seen ndp() updates addr_seen or addr_ll_seen based on the source address of the received packet. This is redundant since tap6_handler() has already updated addr_seen for any type of packet, not just NDP. Signed-off-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- ndp.c | 9 ++------- ndp.h | 4 ++-- 2 files changed, 4 insertions(+), 9 deletions(-) diff --git a/ndp.c b/ndp.c index faae408..ab80898 100644 --- a/ndp.c +++ b/ndp.c @@ -179,8 +179,8 @@ struct ndp_ns { * * Return: 0 if not handled here, 1 if handled, -1 on failure */ -int ndp(struct ctx *c, const struct icmp6hdr *ih, const struct in6_addr *saddr, - const struct pool *p) +int ndp(const struct ctx *c, const struct icmp6hdr *ih, + const struct in6_addr *saddr, const struct pool *p) { struct ndp_na na = { .ih = { @@ -336,11 +336,6 @@ dns_done: return 1; } - if (IN6_IS_ADDR_LINKLOCAL(saddr)) - c->ip6.addr_ll_seen = *saddr; - else - c->ip6.addr_seen = *saddr; - rsaddr = &c->ip6.our_tap_ll; if (ih->icmp6_type == NS) { diff --git a/ndp.h b/ndp.h index a786441..abe6d02 100644 --- a/ndp.h +++ b/ndp.h @@ -6,7 +6,7 @@ #ifndef NDP_H #define NDP_H -int ndp(struct ctx *c, const struct icmp6hdr *ih, const struct in6_addr *saddr, - const struct pool *p); +int ndp(const struct ctx *c, const struct icmp6hdr *ih, + const struct in6_addr *saddr, const struct pool *p); #endif /* NDP_H */ From 4e471670351a76b902e5376da4ee909f68485da2 Mon Sep 17 
00:00:00 2001 From: David Gibson <david@gibson.dropbear.id.au> Date: Thu, 14 Nov 2024 14:33:04 +1100 Subject: [PATCH 112/382] ndp: Add ndp_send() helper ndp() has a conditional on message type generating the reply message, then a tiny amount of common code, then another conditional to send the reply with slightly different parameters. We can make this a bit neater by making a helper function for sending the reply, and call it from each of the different message type paths. Signed-off-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- ndp.c | 32 ++++++++++++++++++-------------- 1 file changed, 18 insertions(+), 14 deletions(-) diff --git a/ndp.c b/ndp.c index ab80898..fa1b67a 100644 --- a/ndp.c +++ b/ndp.c @@ -170,6 +170,21 @@ struct ndp_ns { struct in6_addr target_addr; } __attribute__((packed)); +/** + * ndp_send() - Send an NDP message + * @c: Execution context + * @dst: IPv6 address to send the message to + * @buf: ICMPv6 header + message payload + * @l4len: Length of message, including ICMPv6 header + */ +static void ndp_send(const struct ctx *c, const struct in6_addr *dst, + const void *buf, size_t l4len) +{ + const struct in6_addr *src = &c->ip6.our_tap_ll; + + tap_icmp6_send(c, src, dst, buf, l4len); +} + /** * ndp() - Check for NDP solicitations, reply as needed * @c: Execution context @@ -223,9 +238,6 @@ int ndp(const struct ctx *c, const struct icmp6hdr *ih, }, }, }; - const struct in6_addr *rsaddr; /* src addr for reply */ - unsigned char *ptr = NULL; - size_t dlen; if (ih->icmp6_type < RS || ih->icmp6_type > NA) return 0; @@ -249,7 +261,9 @@ int ndp(const struct ctx *c, const struct icmp6hdr *ih, sizeof(na.target_addr)); memcpy(na.target_l2_addr.mac, c->our_tap_mac, ETH_ALEN); + ndp_send(c, saddr, &na, sizeof(struct ndp_na)); } else if (ih->icmp6_type == RS) { + unsigned char *ptr = NULL; size_t dns_s_len = 0; int i, n; @@ -332,18 +346,8 @@ int ndp(const struct ctx *c, const struct icmp6hdr *ih, 
dns_done: memcpy(&ra.source_ll.mac, c->our_tap_mac, ETH_ALEN); - } else { - return 1; - } - rsaddr = &c->ip6.our_tap_ll; - - if (ih->icmp6_type == NS) { - dlen = sizeof(struct ndp_na); - tap_icmp6_send(c, rsaddr, saddr, &na, dlen); - } else if (ih->icmp6_type == RS) { - dlen = ptr - (unsigned char *)&ra; - tap_icmp6_send(c, rsaddr, saddr, &ra, dlen); + ndp_send(c, saddr, &ra, ptr - (unsigned char *)&ra); } return 1; From cbc83e14df5ebbc656de8ec0e5c26a1a6efadf0e Mon Sep 17 00:00:00 2001 From: David Gibson <david@gibson.dropbear.id.au> Date: Thu, 14 Nov 2024 14:33:05 +1100 Subject: [PATCH 113/382] ndp: Split out helpers for sending specific NDP message types Currently the large ndp() function responds to all NDP messages we handle, both parsing the message as necessary and sending the response. Split out the code to construct and send specific message types into ndp_na() (to send NA messages) and ndp_ra() (to send RA messages). As well as breaking up an excessively large function, this is a first step to being able to send unsolicited NDP messages. While we're there, remove a slighty ugly goto. 
Signed-off-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- ndp.c | 132 +++++++++++++++++++++++++++++++++------------------------- 1 file changed, 76 insertions(+), 56 deletions(-) diff --git a/ndp.c b/ndp.c index fa1b67a..8f52471 100644 --- a/ndp.c +++ b/ndp.c @@ -186,16 +186,13 @@ static void ndp_send(const struct ctx *c, const struct in6_addr *dst, } /** - * ndp() - Check for NDP solicitations, reply as needed + * ndp_na() - Send an NDP Neighbour Advertisement (NA) message * @c: Execution context - * @ih: ICMPv6 header - * @saddr: Source IPv6 address - * @p: Packet pool - * - * Return: 0 if not handled here, 1 if handled, -1 on failure + * @dst: IPv6 address to send the NA to + * @addr: IPv6 address to advertise */ -int ndp(const struct ctx *c, const struct icmp6hdr *ih, - const struct in6_addr *saddr, const struct pool *p) +static void ndp_na(const struct ctx *c, const struct in6_addr *dst, + const void *addr) { struct ndp_na na = { .ih = { @@ -212,6 +209,20 @@ int ndp(const struct ctx *c, const struct icmp6hdr *ih, }, } }; + + memcpy(&na.target_addr, addr, sizeof(na.target_addr)); + memcpy(na.target_l2_addr.mac, c->our_tap_mac, ETH_ALEN); + + ndp_send(c, dst, &na, sizeof(na)); +} + +/** + * ndp_ra() - Send an NDP Router Advertisement (RA) message + * @c: Execution context + * @dst: IPv6 address to send the RA to + */ +static void ndp_ra(const struct ctx *c, const struct in6_addr *dst) +{ struct ndp_ra ra = { .ih = { .icmp6_type = RA, @@ -238,58 +249,28 @@ int ndp(const struct ctx *c, const struct icmp6hdr *ih, }, }, }; + unsigned char *ptr = NULL; - if (ih->icmp6_type < RS || ih->icmp6_type > NA) - return 0; + memcpy(&ra.prefix, &c->ip6.addr, sizeof(ra.prefix)); - if (c->no_ndp) - return 1; + ptr = &ra.var[0]; - if (ih->icmp6_type == NS) { - const struct ndp_ns *ns = - packet_get(p, 0, 0, sizeof(struct ndp_ns), NULL); + if (c->mtu != -1) { + struct opt_mtu *mtu = (struct opt_mtu *)ptr; + *mtu = (struct 
opt_mtu) { + .header = { + .type = OPT_MTU, + .len = 1, + }, + .value = htonl(c->mtu), + }; + ptr += sizeof(struct opt_mtu); + } - if (!ns) - return -1; - - if (IN6_IS_ADDR_UNSPECIFIED(saddr)) - return 1; - - info("NDP: received NS, sending NA"); - - memcpy(&na.target_addr, &ns->target_addr, - sizeof(na.target_addr)); - memcpy(na.target_l2_addr.mac, c->our_tap_mac, ETH_ALEN); - - ndp_send(c, saddr, &na, sizeof(struct ndp_na)); - } else if (ih->icmp6_type == RS) { - unsigned char *ptr = NULL; + if (!c->no_dhcp_dns) { size_t dns_s_len = 0; int i, n; - if (c->no_ra) - return 1; - - info("NDP: received RS, sending RA"); - memcpy(&ra.prefix, &c->ip6.addr, sizeof(ra.prefix)); - - ptr = &ra.var[0]; - - if (c->mtu != -1) { - struct opt_mtu *mtu = (struct opt_mtu *)ptr; - *mtu = (struct opt_mtu) { - .header = { - .type = OPT_MTU, - .len = 1, - }, - .value = htonl(c->mtu), - }; - ptr += sizeof(struct opt_mtu); - } - - if (c->no_dhcp_dns) - goto dns_done; - for (n = 0; !IN6_IS_ADDR_UNSPECIFIED(&c->ip6.dns[n]); n++); if (n) { struct opt_rdnss *rdnss = (struct opt_rdnss *)ptr; @@ -343,11 +324,50 @@ int ndp(const struct ctx *c, const struct icmp6hdr *ih, memset(ptr, 0, 8 - dns_s_len % 8); /* padding */ ptr += 8 - dns_s_len % 8; } + } -dns_done: - memcpy(&ra.source_ll.mac, c->our_tap_mac, ETH_ALEN); + memcpy(&ra.source_ll.mac, c->our_tap_mac, ETH_ALEN); - ndp_send(c, saddr, &ra, ptr - (unsigned char *)&ra); + ndp_send(c, dst, &ra, ptr - (unsigned char *)&ra); +} + +/** + * ndp() - Check for NDP solicitations, reply as needed + * @c: Execution context + * @ih: ICMPv6 header + * @saddr: Source IPv6 address + * @p: Packet pool + * + * Return: 0 if not handled here, 1 if handled, -1 on failure + */ +int ndp(const struct ctx *c, const struct icmp6hdr *ih, + const struct in6_addr *saddr, const struct pool *p) +{ + if (ih->icmp6_type < RS || ih->icmp6_type > NA) + return 0; + + if (c->no_ndp) + return 1; + + if (ih->icmp6_type == NS) { + const struct ndp_ns *ns; + + ns = packet_get(p, 
0, 0, sizeof(struct ndp_ns), NULL); + if (!ns) + return -1; + + if (IN6_IS_ADDR_UNSPECIFIED(saddr)) + return 1; + + info("NDP: received NS, sending NA"); + + ndp_na(c, saddr, &ns->target_addr); + } else if (ih->icmp6_type == RS) { + if (c->no_ra) + return 1; + + info("NDP: received RS, sending RA"); + ndp_ra(c, saddr); } return 1; From 36c070e6e320b97bb4761e29c934f5f269e06b35 Mon Sep 17 00:00:00 2001 From: David Gibson <david@gibson.dropbear.id.au> Date: Thu, 14 Nov 2024 14:33:06 +1100 Subject: [PATCH 114/382] ndp: Use struct assignment in preference to memcpy() for IPv6 addresses There are a number of places we can simply assign IPv6 addresses about, rather than the current mildly ugly memcpy(). Signed-off-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- ndp.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/ndp.c b/ndp.c index 8f52471..fd512ae 100644 --- a/ndp.c +++ b/ndp.c @@ -158,7 +158,7 @@ struct ndp_ra { unsigned char var[sizeof(struct opt_mtu) + sizeof(struct opt_rdnss) + sizeof(struct opt_dnssl)]; -} __attribute__((packed)); +} __attribute__((packed, aligned(__alignof__(struct in6_addr)))); /** * struct ndp_ns - NDP Neighbor Solicitation (NS) message @@ -168,7 +168,7 @@ struct ndp_ra { struct ndp_ns { struct icmp6hdr ih; struct in6_addr target_addr; -} __attribute__((packed)); +} __attribute__((packed, aligned(__alignof__(struct in6_addr)))); /** * ndp_send() - Send an NDP message @@ -192,7 +192,7 @@ static void ndp_send(const struct ctx *c, const struct in6_addr *dst, * @addr: IPv6 address to advertise */ static void ndp_na(const struct ctx *c, const struct in6_addr *dst, - const void *addr) + const struct in6_addr *addr) { struct ndp_na na = { .ih = { @@ -202,6 +202,7 @@ static void ndp_na(const struct ctx *c, const struct in6_addr *dst, .icmp6_solicited = 1, .icmp6_override = 1, }, + .target_addr = *addr, .target_l2_addr = { .header = { .type = OPT_TARGET_L2_ADDR, @@ 
-210,7 +211,6 @@ static void ndp_na(const struct ctx *c, const struct in6_addr *dst, } }; - memcpy(&na.target_addr, addr, sizeof(na.target_addr)); memcpy(na.target_l2_addr.mac, c->our_tap_mac, ETH_ALEN); ndp_send(c, dst, &na, sizeof(na)); @@ -242,6 +242,7 @@ static void ndp_ra(const struct ctx *c, const struct in6_addr *dst) .valid_lifetime = ~0U, .pref_lifetime = ~0U, }, + .prefix = c->ip6.addr, .source_ll = { .header = { .type = OPT_SRC_L2_ADDR, @@ -251,8 +252,6 @@ static void ndp_ra(const struct ctx *c, const struct in6_addr *dst) }; unsigned char *ptr = NULL; - memcpy(&ra.prefix, &c->ip6.addr, sizeof(ra.prefix)); - ptr = &ra.var[0]; if (c->mtu != -1) { @@ -282,8 +281,7 @@ static void ndp_ra(const struct ctx *c, const struct in6_addr *dst) .lifetime = ~0U, }; for (i = 0; i < n; i++) { - memcpy(&rdnss->dns[i], &c->ip6.dns[i], - sizeof(rdnss->dns[i])); + rdnss->dns[i] = c->ip6.dns[i]; } ptr += offsetof(struct opt_rdnss, dns) + i * sizeof(rdnss->dns[0]); From a60703e89991d23345ed929328001e19f5bc47e0 Mon Sep 17 00:00:00 2001 From: David Gibson <david@gibson.dropbear.id.au> Date: Thu, 14 Nov 2024 14:33:07 +1100 Subject: [PATCH 115/382] ndp: Make route lifetime a #define Currently we open-code the lifetime of the route we advertise via NDP to be 65535s (the maximum). Change it to a #define. 
Signed-off-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- ndp.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/ndp.c b/ndp.c index fd512ae..09df8d6 100644 --- a/ndp.c +++ b/ndp.c @@ -33,6 +33,8 @@ #include "tap.h" #include "log.h" +#define RT_LIFETIME 65535 + #define RS 133 #define RA 134 #define NS 135 @@ -229,7 +231,7 @@ static void ndp_ra(const struct ctx *c, const struct in6_addr *dst) .icmp6_code = 0, .icmp6_hop_limit = 255, /* RFC 8319 */ - .icmp6_rt_lifetime = htons_constant(65535), + .icmp6_rt_lifetime = htons_constant(RT_LIFETIME), .icmp6_addrconf_managed = 1, }, .prefix_info = { From 71d5deed5eed3949ee09c5f0a53b4de0b09b4afc Mon Sep 17 00:00:00 2001 From: David Gibson <david@gibson.dropbear.id.au> Date: Thu, 14 Nov 2024 14:33:08 +1100 Subject: [PATCH 116/382] util: Add general low-level random bytes helper Currently secret_init() open codes getting good quality random bytes from the OS, either via getrandom(2) or reading /dev/random. We're going to add at least one more place that needs random data in future, so make a general helper for getting random bytes. 
While we're there, fix a number of minor bugs: - getrandom() can theoretically return a "short read", so handle that case - getrandom() as well as read can return a transient EINTR - We would attempt to read data from /dev/random if we failed to open it (open() returns -1), but not if we opened it as fd 0 (unlikely, but ok) - More specific error reporting Signed-off-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- passt.c | 30 +----------------------------- util.c | 54 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ util.h | 2 ++ 3 files changed, 57 insertions(+), 29 deletions(-) diff --git a/passt.c b/passt.c index fac6101..73649de 100644 --- a/passt.c +++ b/passt.c @@ -36,9 +36,6 @@ #include <sys/prctl.h> #include <netinet/if_ether.h> #include <libgen.h> -#ifdef HAS_GETRANDOM -#include <sys/random.h> -#endif #include "util.h" #include "passt.h" @@ -118,32 +115,7 @@ static void post_handler(struct ctx *c, const struct timespec *now) */ static void secret_init(struct ctx *c) { -#ifndef HAS_GETRANDOM - int dev_random = open("/dev/random", O_RDONLY); - unsigned int random_read = 0; - - while (dev_random && random_read < sizeof(c->hash_secret)) { - int ret = read(dev_random, - (uint8_t *)&c->hash_secret + random_read, - sizeof(c->hash_secret) - random_read); - - if (ret == -1 && errno == EINTR) - continue; - - if (ret <= 0) - break; - - random_read += ret; - } - if (dev_random >= 0) - close(dev_random); - - if (random_read < sizeof(c->hash_secret)) -#else - if (getrandom(&c->hash_secret, sizeof(c->hash_secret), - GRND_RANDOM) < 0) -#endif /* !HAS_GETRANDOM */ - die_perror("Failed to get random bytes for hash table and TCP"); + raw_random(&c->hash_secret, sizeof(c->hash_secret)); } /** diff --git a/util.c b/util.c index 126dedb..55cae3f 100644 --- a/util.c +++ b/util.c @@ -34,6 +34,9 @@ #include "passt.h" #include "packet.h" #include "log.h" +#ifdef HAS_GETRANDOM +#include <sys/random.h> +#endif /** * 
sock_l4_sa() - Create and bind socket to socket address, add to epoll list @@ -783,3 +786,54 @@ bool snprintf_check(char *str, size_t size, const char *format, ...) return false; } + +#define DEV_RANDOM "/dev/random" + +/** + * raw_random() - Get high quality random bytes + * @buf: Buffer to fill with random bytes + * @buflen: Number of bytes of random data to put in @buf + * + * Assumes that the random data is essential, and will die() if unable to obtain + * it. + */ +void raw_random(void *buf, size_t buflen) +{ + size_t random_read = 0; +#ifndef HAS_GETRANDOM + int dev_random = open(DEV_RANDOM, O_RDONLY); + + if (dev_random < 0) + die_perror("Couldn't open %s", DEV_RANDOM); +#endif + + while (random_read < buflen) { + ssize_t ret; + +#ifdef HAS_GETRANDOM + ret = getrandom((char *)buf + random_read, + buflen - random_read, GRND_RANDOM); +#else + ret = read(dev_random, (char *)buf + random_read, + buflen - random_read); +#endif + + if (ret == -1 && errno == EINTR) + continue; + + if (ret < 0) + die_perror("Error on random data source"); + + if (ret == 0) + break; + + random_read += ret; + } + +#ifndef HAS_GETRANDOM + close(dev_random); +#endif + + if (random_read < buflen) + die("Unexpected EOF on random data source"); +} diff --git a/util.h b/util.h index 3616515..90428c4 100644 --- a/util.h +++ b/util.h @@ -263,6 +263,8 @@ static inline bool mod_between(unsigned x, unsigned i, unsigned j, unsigned m) /* FPRINTF() intentionally silences cert-err33-c clang-tidy warnings */ #define FPRINTF(f, ...) 
(void)fprintf(f, __VA_ARGS__) +void raw_random(void *buf, size_t buflen); + /* * Workarounds for https://github.com/llvm/llvm-project/issues/58992 * From b39760cc7d89e69c7fb12eccc3df3bd15e2d5665 Mon Sep 17 00:00:00 2001 From: David Gibson <david@gibson.dropbear.id.au> Date: Thu, 14 Nov 2024 14:33:09 +1100 Subject: [PATCH 117/382] passt: Seed libc's pseudo random number generator We have an upcoming case where we need pseudo-random numbers to scatter timings, but we don't need cryptographically strong random numbers. libc's built in random() is fine for this purpose, but we should seed it. Extend secret_init() - the only current user of random numbers - to do this as well as generating the SipHash secret. Using /dev/random for a PRNG seed is probably overkill, but it's simple and we only do it once, so we might as well. Signed-off-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- passt.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/passt.c b/passt.c index 73649de..83b26c5 100644 --- a/passt.c +++ b/passt.c @@ -110,12 +110,19 @@ static void post_handler(struct ctx *c, const struct timespec *now) } /** - * secret_init() - Create secret value for SipHash calculations + * random_init() - Initialise things based on random data * @c: Execution context */ -static void secret_init(struct ctx *c) +static void random_init(struct ctx *c) { + unsigned int seed; + + /* Create secret value for SipHash calculations */ raw_random(&c->hash_secret, sizeof(c->hash_secret)); + + /* Seed pseudo-RNG for things that need non-cryptographic random */ + raw_random(&seed, sizeof(seed)); + srandom(seed); } /** @@ -236,7 +243,7 @@ int main(int argc, char **argv) tap_sock_init(&c); - secret_init(&c); + random_init(&c); if (clock_gettime(CLOCK_MONOTONIC, &now)) die_perror("Failed to get CLOCK_MONOTONIC time"); From 6e1e44293ef991d8c946dd59fbbd65c54901b255 Mon Sep 17 00:00:00 2001 From: David Gibson 
<david@gibson.dropbear.id.au> Date: Thu, 14 Nov 2024 14:33:10 +1100 Subject: [PATCH 118/382] ndp: Send unsolicited Router Advertisements Currently, our NDP implementation only sends Router Advertisements (RA) when it receives a Router Solicitation (RS) from the guest. However, RFC 4861 requires that we periodically send unsolicited RAs. Linux as a guest also requires this: it will send an RS when a link first comes up, but the route it gets from this will have a finite lifetime (we set this to 65535s, the maximum allowed, around 18 hours). When that expires the guest will not send a new RS, but instead expects the route to have been renewed (if still valid) by an unsolicited RA. Implement sending unsolicited RAs on a partially randomised timer, as required by RFC 4861. The RFC also specifies that solicited RAs should also be delayed, or even omitted, if the next unsolicited RA is soon enough. For now we don't do that, always sending an immediate RA in response to an RS. We can get away with this because in our use cases we expect to just have passt itself and the guest on the link, rather than a large broadcast domain. 
Link: https://github.com/kubevirt/kubevirt/issues/13191 Signed-off-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- ip.h | 9 +++++++++ ndp.c | 54 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ ndp.h | 3 +++ passt.c | 3 +++ 4 files changed, 69 insertions(+) diff --git a/ip.h b/ip.h index b8d4a5b..0742612 100644 --- a/ip.h +++ b/ip.h @@ -92,4 +92,13 @@ struct ipv6_opt_hdr { char *ipv6_l4hdr(const struct pool *p, int idx, size_t offset, uint8_t *proto, size_t *dlen); + +/* IPv6 link-local all-nodes multicast address, ff02::1 */ +static const struct in6_addr in6addr_ll_all_nodes = { + .s6_addr = { + 0xff, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, + }, +}; + #endif /* IP_H */ diff --git a/ndp.c b/ndp.c index 09df8d6..7ee44b2 100644 --- a/ndp.c +++ b/ndp.c @@ -372,3 +372,57 @@ int ndp(const struct ctx *c, const struct icmp6hdr *ih, return 1; } + +/* Default interval between unsolicited RAs (seconds) */ +#define DEFAULT_MAX_RTR_ADV_INTERVAL 600 /* RFC 4861, 6.2.1 */ + +/* Minimum required interval between RAs (seconds) */ +#define MIN_DELAY_BETWEEN_RAS 3 /* RFC 4861, 10 */ + +static time_t next_ra; + +/** + * ndp_timer() - Send unsolicited NDP messages if necessary + * @c: Execution context + * @now: Current (monotonic) time + */ +void ndp_timer(const struct ctx *c, const struct timespec *now) +{ + time_t max_rtr_adv_interval = DEFAULT_MAX_RTR_ADV_INTERVAL; + time_t min_rtr_adv_interval, interval; + + if (c->no_ra || now->tv_sec < next_ra) + return; + + /* We must advertise before the route's lifetime expires */ + max_rtr_adv_interval = MIN(max_rtr_adv_interval, RT_LIFETIME - 1); + + /* But we must not go smaller than the minimum delay */ + max_rtr_adv_interval = MAX(max_rtr_adv_interval, MIN_DELAY_BETWEEN_RAS); + + /* RFC 4861, 6.2.1 */ + min_rtr_adv_interval = MAX(max_rtr_adv_interval / 3, + MIN_DELAY_BETWEEN_RAS); + + /* As required by RFC 4861, we randomise 
the interval between + * unsolicited RAs. This is to prevent multiple routers on a link + * getting synchronised (e.g. after booting a bunch of routers at once) + * and causing flurries of RAs at the same time. + * + * This random doesn't need to be cryptographically strong, so random(3) + * is fine. Other routers on the link also want to avoid + * synchronisation, and anything malicious has much easier ways to cause + * trouble. + * + * The modulus also makes this not strictly a uniform distribution, but, + * again, it's close enough for our purposes. + */ + interval = min_rtr_adv_interval + + random() % (max_rtr_adv_interval - min_rtr_adv_interval); + + info("NDP: sending unsolicited RA, next in %llds", (long long)interval); + + ndp_ra(c, &in6addr_ll_all_nodes); + + next_ra = now->tv_sec + interval; +} diff --git a/ndp.h b/ndp.h index abe6d02..41c2000 100644 --- a/ndp.h +++ b/ndp.h @@ -6,7 +6,10 @@ #ifndef NDP_H #define NDP_H +struct icmp6hdr; + int ndp(const struct ctx *c, const struct icmp6hdr *ih, const struct in6_addr *saddr, const struct pool *p); +void ndp_timer(const struct ctx *c, const struct timespec *now); #endif /* NDP_H */ diff --git a/passt.c b/passt.c index 83b26c5..a51a4e1 100644 --- a/passt.c +++ b/passt.c @@ -49,6 +49,7 @@ #include "arch.h" #include "log.h" #include "tcp_splice.h" +#include "ndp.h" #define EPOLL_EVENTS 8 @@ -107,6 +108,8 @@ static void post_handler(struct ctx *c, const struct timespec *now) flow_defer_handler(c, now); #undef CALL_PROTO_HANDLER + + ndp_timer(c, now); } /** From 5e2446667729d01ef8208d0e7e866cee09c8a3fb Mon Sep 17 00:00:00 2001 From: Stefano Brivio <sbrivio@redhat.com> Date: Thu, 14 Nov 2024 23:48:54 +0100 Subject: [PATCH 119/382] selinux: Use auth_read_passwd() interface for all our getpwnam() needs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If passt or pasta are started as root, we need to read the passwd file (be it /etc/passwd or whatever sssd provides) to find out 
UID and GID of 'nobody' so that we can switch to it. Instead of a bunch of allow rules for passwd_file_t and sssd macros, use the more convenient auth_read_passwd() interface which should cover our usage of getpwnam(). The existing rules weren't actually enough: # strace -e openat passt -f [...] Started as root, will change to nobody. openat(AT_FDCWD, "/etc/nsswitch.conf", O_RDONLY|O_CLOEXEC) = 4 openat(AT_FDCWD, "/etc/ld.so.cache", O_RDONLY|O_CLOEXEC) = 4 openat(AT_FDCWD, "/lib64/libnss_sss.so.2", O_RDONLY|O_CLOEXEC) = 4 openat(AT_FDCWD, "/var/lib/sss/mc/passwd", O_RDONLY|O_CLOEXEC) = -1 EACCES (Permission denied) openat(AT_FDCWD, "/var/lib/sss/mc/passwd", O_RDONLY|O_CLOEXEC) = -1 EACCES (Permission denied) openat(AT_FDCWD, "/etc/passwd", O_RDONLY|O_CLOEXEC) = 4 with corresponding SELinux warnings logged in audit.log. Reported-by: Minxi Hou <mhou@redhat.com> Analysed-by: Miloš Malik <mmalik@redhat.com> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- contrib/selinux/passt.te | 5 +---- contrib/selinux/pasta.te | 9 +-------- 2 files changed, 2 insertions(+), 12 deletions(-) diff --git a/contrib/selinux/passt.te b/contrib/selinux/passt.te index 80bf780..c6cea34 100644 --- a/contrib/selinux/passt.te +++ b/contrib/selinux/passt.te @@ -47,8 +47,6 @@ require { type port_t; type http_port_t; - type passwd_file_t; - class netlink_route_socket { bind create nlmsg_read }; type sysctl_net_t; @@ -96,8 +94,7 @@ allow passt_t self:capability { sys_tty_config setpcap net_bind_service setuid s allow passt_t self:cap_userns { setpcap sys_admin sys_ptrace }; allow passt_t self:user_namespace create; -allow passt_t passwd_file_t:file read_file_perms; -sssd_search_lib(passt_t) +auth_read_passwd(passt_t) allow passt_t proc_net_t:file read; allow passt_t net_conf_t:file { open read }; diff --git a/contrib/selinux/pasta.te b/contrib/selinux/pasta.te index 310383c..69be081 100644 --- a/contrib/selinux/pasta.te +++ b/contrib/selinux/pasta.te @@ -68,9 +68,6 @@ require { type 
system_dbusd_t; type systemd_hostnamed_t; type systemd_systemctl_exec_t; - type passwd_file_t; - type sssd_public_t; - type sssd_var_lib_t; class dbus send_msg; class system module_request; class system status; @@ -115,8 +112,7 @@ allow pasta_t self:capability { setpcap net_bind_service sys_tty_config dac_read allow pasta_t self:cap_userns { setpcap sys_admin sys_ptrace net_admin net_bind_service }; allow pasta_t self:user_namespace create; -allow pasta_t passwd_file_t:file read_file_perms; -sssd_search_lib(pasta_t) +auth_read_passwd(pasta_t) domain_auto_trans(pasta_t, bin_t, unconfined_t); domain_auto_trans(pasta_t, shell_exec_t, unconfined_t); @@ -178,12 +174,9 @@ allow pasta_t init_t:system status; allow pasta_t unconfined_t:dir search; allow pasta_t unconfined_t:file read; allow pasta_t unconfined_t:lnk_file read; -allow pasta_t passwd_file_t:file { getattr open read }; allow pasta_t self:process { setpgid setcap }; allow pasta_t shell_exec_t:file { execute execute_no_trans map }; -allow pasta_t sssd_var_lib_t:dir search; -allow pasta_t sssd_public_t:dir search; allow pasta_t hostname_exec_t:file { execute execute_no_trans getattr open read map }; allow pasta_t system_dbusd_t:unix_stream_socket connectto; allow pasta_t system_dbusd_t:dbus send_msg; From bf9492747df006a794f281d6c26ee38989b44d23 Mon Sep 17 00:00:00 2001 From: Stefano Brivio <sbrivio@redhat.com> Date: Fri, 15 Nov 2024 15:22:06 +0100 Subject: [PATCH 120/382] ndp: Don't send unsolicited router advertisement if we can't, yet ndp_timer() is called right away on the first epoll_wait() cycle, when the communication channel to the guest isn't ready yet: 1.0038: NDP: sending unsolicited RA, next in 264s 1.0038: tap: failed to send 1 frames of 1 check that it's up before sending it. This effectively delays the first gratuitous router advertisement, which is probably a good idea given that we expect the guest to send a router solicitation right away. 
Fixes: 6e1e44293ef9 ("ndp: Send unsolicited Router Advertisements") Signed-off-by: Stefano Brivio <sbrivio@redhat.com> Reviewed-by: David Gibson <david@gibson.dropbear.id.au> --- ndp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ndp.c b/ndp.c index 7ee44b2..1752d64 100644 --- a/ndp.c +++ b/ndp.c @@ -391,7 +391,7 @@ void ndp_timer(const struct ctx *c, const struct timespec *now) time_t max_rtr_adv_interval = DEFAULT_MAX_RTR_ADV_INTERVAL; time_t min_rtr_adv_interval, interval; - if (c->no_ra || now->tv_sec < next_ra) + if (c->fd_tap < 0 || c->no_ra || now->tv_sec < next_ra) return; /* We must advertise before the route's lifetime expires */ From 5ae21841acd7f55a4b57b99a5097ca99b84f07c4 Mon Sep 17 00:00:00 2001 From: David Gibson <david@gibson.dropbear.id.au> Date: Tue, 19 Nov 2024 12:21:56 +1100 Subject: [PATCH 121/382] ndp: Don't send unsolicited RAs if NDP is disabled We recently added support for sending unsolicited NDP Router Advertisement packets. While we (correctly) disable this if the --no-ra option is given we incorrectly still send them if --no-ndp is set. Fix the oversight. 
Fixes: 6e1e44293ef9 ("ndp: Send unsolicited Router Advertisements") Signed-off-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- passt.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/passt.c b/passt.c index a51a4e1..06e0a33 100644 --- a/passt.c +++ b/passt.c @@ -109,7 +109,8 @@ static void post_handler(struct ctx *c, const struct timespec *now) flow_defer_handler(c, now); #undef CALL_PROTO_HANDLER - ndp_timer(c, now); + if (!c->no_ndp) + ndp_timer(c, now); } /** From af464c4ffbb7a5341f8a7beedce8382d598dbaf7 Mon Sep 17 00:00:00 2001 From: Stefano Brivio <sbrivio@redhat.com> Date: Tue, 19 Nov 2024 20:53:43 +0100 Subject: [PATCH 122/382] tcp: Reset ACK_TO_TAP_DUE flag whenever an ACK isn't needed anymore We enter the timer handler with the ACK_TO_TAP_DUE flag, call tcp_prepare_flags() with ACK_IF_NEEDED, and realise that we acknowledged everything meanwhile, so we return early, but we also need to reset that flag to avoid unnecessarily scheduling the timer over and over again until more pending data appears. I'm not sure if this fixes any real issue, but I've spotted this in several logs reported by users, including one where we have some unexpected bursts of high CPU load during TCP transfers at low rates, from https://github.com/containers/podman/issues/23686. 
Link: https://github.com/containers/podman/discussions/24572 Link: https://github.com/containers/podman/issues/23686 Signed-off-by: Stefano Brivio <sbrivio@redhat.com> Reviewed-by: David Gibson <david@gibson.dropbear.id.au> --- tcp.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tcp.c b/tcp.c index 6a98dfa..f357920 100644 --- a/tcp.c +++ b/tcp.c @@ -1235,8 +1235,10 @@ int tcp_prepare_flags(const struct ctx *c, struct tcp_tap_conn *conn, int s = conn->sock; if (SEQ_GE(conn->seq_ack_to_tap, conn->seq_from_tap) && - !flags && conn->wnd_to_tap) + !flags && conn->wnd_to_tap) { + conn_flag(c, conn, ~ACK_TO_TAP_DUE); return 0; + } if (getsockopt(s, SOL_TCP, TCP_INFO, &tinfo, &sl)) { conn_event(c, conn, CLOSED); From 238c69f9af458e41dea5ad8c988dbf65b05b5172 Mon Sep 17 00:00:00 2001 From: Stefano Brivio <sbrivio@redhat.com> Date: Tue, 19 Nov 2024 20:53:44 +0100 Subject: [PATCH 123/382] tcp: Acknowledge keep-alive segments, ignore them for the rest RFC 9293, 3.8.4 says: Implementers MAY include "keep-alives" in their TCP implementations (MAY-5), although this practice is not universally accepted. Some TCP implementations, however, have included a keep-alive mechanism. To confirm that an idle connection is still active, these implementations send a probe segment designed to elicit a response from the TCP peer. Such a segment generally contains SEG.SEQ = SND.NXT-1 and may or may not contain one garbage octet of data. If keep-alives are included, the application MUST be able to turn them on or off for each TCP connection (MUST-24), and they MUST default to off (MUST-25). but currently, tcp_data_from_tap() is not aware of this and will schedule a fast re-transmit on the second keep-alive (because it's also a duplicate ACK), ignoring the fact that the sequence number was rewound to SND.NXT-1. ACK these keep-alive segments, reset the activity timeout, and ignore them for the rest. 
At some point, we could think of implementing an approximation of keep-alive segments on outbound sockets, for example by setting TCP_KEEPIDLE to 1, and a large TCP_KEEPINTVL, so that we send a single keep-alive segment at approximately the same time, and never reset the connection. That's beyond the scope of this fix, though. Reported-by: Tim Besard <tim.besard@gmail.com> Link: https://github.com/containers/podman/discussions/24572 Signed-off-by: Stefano Brivio <sbrivio@redhat.com> Reviewed-by: David Gibson <david@gibson.dropbear.id.au> --- tcp.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/tcp.c b/tcp.c index f357920..1eb85bb 100644 --- a/tcp.c +++ b/tcp.c @@ -1763,6 +1763,20 @@ static int tcp_data_from_tap(const struct ctx *c, struct tcp_tap_conn *conn, continue; seq = ntohl(th->seq); + if (SEQ_LT(seq, conn->seq_from_tap) && len <= 1) { + flow_trace(conn, + "keep-alive sequence: %u, previous: %u", + seq, conn->seq_from_tap); + + tcp_send_flag(c, conn, ACK); + tcp_timer_ctl(c, conn); + + if (p->count == 1) + return 1; + + continue; + } + ack_seq = ntohl(th->ack_seq); if (th->ack) { From b61be8468a804f5660cebcfdc10aa94b7ecac7a3 Mon Sep 17 00:00:00 2001 From: Stefano Brivio <sbrivio@redhat.com> Date: Mon, 25 Nov 2024 11:40:53 +0100 Subject: [PATCH 124/382] passt.1: Fix "default" note about --map-guest-addr It's not true that there's no mapping by default: there's no mapping in the --map-guest-addr sense, by default, but in that case the default --map-host-loopback behaviour prevails. While at it, fix a typo. 
Fixes: 57b7bd2a48a1 ("fwd, conf: Allow NAT of the guest's assigned address") Signed-off-by: Stefano Brivio <sbrivio@redhat.com> Reviewed-by: David Gibson <david@gibson.dropbear.id.au> --- passt.1 | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/passt.1 b/passt.1 index f084978..02a9bcc 100644 --- a/passt.1 +++ b/passt.1 @@ -373,14 +373,14 @@ Translate \fIaddr\fR in the guest to be equal to the guest's assigned address on the host. That is, packets from the guest to \fIaddr\fR will be redirected to the address assigned to the guest with \fB-a\fR, or by default the host's global address. This allows the guest to -access services availble on the host's global address, even though its +access services available on the host's global address, even though its own address shadows that of the host. If \fIaddr\fR is 'none', no address is mapped. Only one IPv4 and one IPv6 address can be translated, and if the option is specified multiple times, the last one for each address type takes effect. -Default is no mapping. +By default, mapping happens as described for the \-\-map-host-loopback option. 
.TP .BR \-4 ", " \-\-ipv4-only From 6819b2e1020411661dc0487ee3614f012d45b049 Mon Sep 17 00:00:00 2001 From: Stefano Brivio <sbrivio@redhat.com> Date: Mon, 25 Nov 2024 11:46:33 +0100 Subject: [PATCH 125/382] conf, passt.1: Update --mac-addr default in usage() and man page Fixes: 90e83d50a9bd ("Don't take "our" MAC address from the host") Signed-off-by: Stefano Brivio <sbrivio@redhat.com> Reviewed-by: David Gibson <david@gibson.dropbear.id.au> --- conf.c | 2 +- passt.1 | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/conf.c b/conf.c index 86566db..d342c8a 100644 --- a/conf.c +++ b/conf.c @@ -788,7 +788,7 @@ static void usage(const char *name, FILE *f, int status) " -n, --netmask MASK Assign IPv4 MASK, dot-decimal or bits\n" " default: netmask from matching address on the host\n" " -M, --mac-addr ADDR Use source MAC address ADDR\n" - " default: MAC address from interface with default route\n" + " default: 9a:55:9a:55:9a:55 (locally administered)\n" " -g, --gateway ADDR Pass IPv4 or IPv6 address as gateway\n" " default: gateway from interface with default route\n" " -i, --interface NAME Interface for addresses and routes\n" diff --git a/passt.1 b/passt.1 index 02a9bcc..059abd3 100644 --- a/passt.1 +++ b/passt.1 @@ -174,8 +174,7 @@ according to the CIDR block of the assigned address (RFC 4632). .BR \-M ", " \-\-mac-addr " " \fIaddr Use source MAC address \fIaddr\fR when communicating to the guest or to the target namespace. -Default is to use the MAC address of the interface with the first IPv4 default -route on the host. +Default is the locally administered MAC addresses 9a:55:9a:55:9a:55. 
.TP .BR \-g ", " \-\-gateway " " \fIaddr From 2bf8ffcf078c5933e6a31dbffbfb4dc31bfd7bc5 Mon Sep 17 00:00:00 2001 From: Stefano Brivio <sbrivio@redhat.com> Date: Mon, 25 Nov 2024 11:53:10 +0100 Subject: [PATCH 126/382] test/perf: Select a single IPv6 namespace address in pasta tests By dropping the filter on prefix length, commit 910f4f910301 ("test: Don't require 64-bit prefixes in perf tests") broke tests on setups where two global unicast IPv6 addresses are available, which is the typical case when the "host" is a VM running under passt with addresses from SLAAC and DHCPv6, because two addresses will be returned. Pick the first one instead. We don't really care about the prefix length, any of these addresses will work. Fixes: 910f4f910301 ("test: Don't require 64-bit prefixes in perf tests") Link: https://archives.passt.top/passt-dev/20241119214344.6b4a5b3a@elisabeth/ Signed-off-by: Stefano Brivio <sbrivio@redhat.com> Reviewed-by: David Gibson <david@gibson.dropbear.id.au> --- test/perf/pasta_tcp | 2 +- test/perf/pasta_udp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/test/perf/pasta_tcp b/test/perf/pasta_tcp index 88284b2..bc0de3c 100644 --- a/test/perf/pasta_tcp +++ b/test/perf/pasta_tcp @@ -211,7 +211,7 @@ tr TCP throughput over IPv6: host to ns iperf3s ns 10002 nsout IFNAME ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname' -nsout ADDR6 ip -j -6 addr show|jq -rM '.[] | select(.ifname == "__IFNAME__").addr_info[] | select(.scope == "global").local' +nsout ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__IFNAME__").addr_info[] | select(.scope == "global").local] | .[0]' bw - bw - bw - diff --git a/test/perf/pasta_udp b/test/perf/pasta_udp index 3d07091..ab2f3e8 100644 --- a/test/perf/pasta_udp +++ b/test/perf/pasta_udp @@ -196,7 +196,7 @@ tr UDP throughput over IPv6: host to ns iperf3s ns 10002 nsout IFNAME ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname' -nsout ADDR6 ip -j -6 addr 
show|jq -rM '.[] | select(.ifname == "__IFNAME__").addr_info[] | select(.scope == "global").local' +nsout ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__IFNAME__").addr_info[] | select(.scope == "global").local] | .[0]' iperf3 BW host __ADDR6__ 10002 __TIME__ __OPTS__ -b 8G -l 1472 bw __BW__ 0.3 0.5 iperf3 BW host __ADDR6__ 10002 __TIME__ __OPTS__ -b 12G -l 3972 From cda7f160f091515770a103765d50bac0f136faef Mon Sep 17 00:00:00 2001 From: Stefano Brivio <sbrivio@redhat.com> Date: Mon, 25 Nov 2024 08:50:39 +0100 Subject: [PATCH 127/382] ndp: Don't send first periodic router advertisement right after guest connects This is very visible with muvm, but it also happens with QEMU: we're sending the first unsolicited router advertisement milliseconds after the guest connects. That's usually pointless because, when the hypervisor connects, the guest is typically not ready yet to process anything of that sort: it's still booting. And if we happen to send it late enough (still milliseconds), with muvm, while the message is discarded, it sometimes (slightly) delays the response to the first solicited router advertisement, which is the one we need to have coming fast. Skip sending the unsolicited advertisement on the first timer run, just calculate the next delay. Keep it simple by observing that we're probably not trying to reach the 1970s with IPv6. 
Signed-off-by: Stefano Brivio <sbrivio@redhat.com> Reviewed-by: David Gibson <david@gibson.dropbear.id.au> --- ndp.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/ndp.c b/ndp.c index 1752d64..37bf7a3 100644 --- a/ndp.c +++ b/ndp.c @@ -420,9 +420,13 @@ void ndp_timer(const struct ctx *c, const struct timespec *now) interval = min_rtr_adv_interval + random() % (max_rtr_adv_interval - min_rtr_adv_interval); + if (!next_ra) + goto first; + info("NDP: sending unsolicited RA, next in %llds", (long long)interval); ndp_ra(c, &in6addr_ll_all_nodes); +first: next_ra = now->tv_sec + interval; } From c6e61064139ba94a763097144d1a84bd4fbafade Mon Sep 17 00:00:00 2001 From: David Gibson <david@gibson.dropbear.id.au> Date: Tue, 26 Nov 2024 14:27:27 +1100 Subject: [PATCH 128/382] test: Improve logic for waiting for SLAAC & DAD to complete in NDP tests Since 9a0e544f05bf the NDP tests attempt to explicitly wait for DAD to complete, rather than just having a hard coded sleep. However, the conditions we use are a bit sloppy and allow for a number of possible cases where it might not work correctly. Stefano seems to be hitting one of these (though I'm not sure which) with some later patches. - We wait for *lack* of a tentative address, so if the first check occurs before we have even a tentative address it will bypass the delay - It's not entirely clear if the permanent address will always appear as soon as the tentative address disappears - We weren't filtering on interface - We were doing the filtering with ip-address options rather than in jq. However in at least in some circumstances this seems to result in an empty .addr_info field, rather than omitting it entirely, which could cause us to get the wrong result So, instead, explicitly wait for the address we need to be present: an RA provided address on the external interface. 
While we're here we remove the requirement that it have global scope: the "kernel_ra" check is already sufficient to make sure this address comes from an NDP RA, not something else. If it's not the global scope address we expect, better to check it and fail, rather than keep waiting. Fixes: 9a0e544f05bf ("test: Improve test for NDP assigned prefix") Signed-off-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- test/passt/ndp | 6 +++--- test/pasta/ndp | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/test/passt/ndp b/test/passt/ndp index 56b385b..516cd6b 100644 --- a/test/passt/ndp +++ b/test/passt/ndp @@ -17,13 +17,13 @@ htools ip jq sipcalc grep cut test Interface name gout IFNAME ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname' guest ip link set dev __IFNAME__ up -# Wait for DAD to complete -guest while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done +# Wait for SLAAC & DAD to complete +guest while ! 
ip -j -6 addr show dev __IFNAME__ | jq -e '.[].addr_info.[] | select(.protocol == "kernel_ra")'; do sleep 0.1; done hout HOST_IFNAME6 ip -j -6 route show|jq -rM '[.[] | select(.dst == "default").dev] | .[0]' check [ -n "__IFNAME__" ] test SLAAC: prefix -gout ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__IFNAME__").addr_info[] | select(.scope == "global" and .protocol == "kernel_ra") | .local + "/" + (.prefixlen | tostring)] | .[0]' +gout ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__IFNAME__").addr_info[] | select(.protocol == "kernel_ra") | .local + "/" + (.prefixlen | tostring)] | .[0]' gout PREFIX6 sipcalc __ADDR6__ | grep prefix | cut -d' ' -f4 hout HOST_ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__HOST_IFNAME6__").addr_info[] | select(.scope == "global" and .deprecated != true).local] | .[0]' hout HOST_PREFIX6 sipcalc __HOST_ADDR6__/64 | grep prefix | cut -d' ' -f4 diff --git a/test/pasta/ndp b/test/pasta/ndp index 2442ab5..952c1ea 100644 --- a/test/pasta/ndp +++ b/test/pasta/ndp @@ -18,11 +18,11 @@ test Interface name nsout IFNAME ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname' check [ -n "__IFNAME__" ] ns ip link set dev __IFNAME__ up -# Wait for DAD to complete -ns while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done +# Wait for SLAAC & DAD to complete +ns while ! 
ip -j -6 addr show dev __IFNAME__ | jq -e '.[].addr_info.[] | select(.protocol == "kernel_ra")'; do sleep 0.1; done test SLAAC: prefix -nsout ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__IFNAME__").addr_info[] | select(.scope == "global" and .protocol == "kernel_ra") | .local + "/" + (.prefixlen | tostring)] | .[0]' +nsout ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__IFNAME__").addr_info[] | select(.protocol == "kernel_ra") | .local + "/" + (.prefixlen | tostring)] | .[0]' nsout PREFIX6 sipcalc __ADDR6__ | grep prefix | cut -d' ' -f4 hout HOST_ADDR6 ip -j -6 addr show|jq -rM ['.[] | select(.ifname == "__IFNAME__").addr_info[] | select(.scope == "global" and .deprecated != true).local] | .[0]' hout HOST_PREFIX6 sipcalc __HOST_ADDR6__/64 | grep prefix | cut -d' ' -f4 From 14b84a7f077ecb734bb0e724f70bafeaa6d35a61 Mon Sep 17 00:00:00 2001 From: Stefano Brivio <sbrivio@redhat.com> Date: Fri, 22 Nov 2024 07:57:43 +0100 Subject: [PATCH 129/382] treewide: Introduce 'local mode' for disconnected setups There are setups where no host interface is available or configured at all, intentionally or not, temporarily or not, but users expect (Podman) containers to run in any case as they did with slirp4netns, and we're now getting reports that we broke such setups at a rather alarming rate. To this end, if we don't find any usable host interface, instead of exiting: - for IPv4, use 169.254.2.1 as guest/container address and 169.254.2.2 as default gateway - for IPv6, don't assign any address (forcibly disable DHCPv6), and use the *first* link-local address we observe to represent the guest/container. Advertise fe80::1 as default gateway - use 'tap0' as default interface name for pasta Change ifi4 and ifi6 in struct ctx to int and accept a special -1 value meaning that no host interface was selected, but the IP family is enabled. The fact that the kernel uses unsigned int values for those is not an issue as 1. 
one can't create so many interfaces anyway and 2. we otherwise handle those values transparently. Fix a botched conditional in conf_print() to actually skip printing DHCPv6 information if DHCPv6 is disabled (and skip printing NDP information if NDP is disabled). Link: https://github.com/containers/podman/issues/24614 Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- conf.c | 97 ++++++++++++++++++++++++++++++++++++++++++++------------- passt.1 | 33 +++++++++++++++++--- passt.h | 8 ++--- pasta.c | 7 +++-- tap.c | 3 ++ 5 files changed, 116 insertions(+), 32 deletions(-) diff --git a/conf.c b/conf.c index d342c8a..c6bffc4 100644 --- a/conf.c +++ b/conf.c @@ -48,6 +48,20 @@ #define NETNS_RUN_DIR "/run/netns" +#define IP4_LL_GUEST_ADDR (struct in_addr){ htonl_constant(0xa9fe0201) } + /* 169.254.2.1, libslirp default: 10.0.2.1 */ + +#define IP4_LL_GUEST_GW (struct in_addr){ htonl_constant(0xa9fe0202) } + /* 169.254.2.2, libslirp default: 10.0.2.2 */ + +#define IP4_LL_PREFIX_LEN 16 + +#define IP6_LL_GUEST_GW (struct in6_addr) \ + {{{ 0xfe, 0x80, 0, 0, 0, 0, 0, 0, \ + 0, 0, 0, 0, 0, 0, 0, 0x01 }}} + +const char *pasta_default_ifn = "tap0"; + /** * next_chunk - Return the next piece of a string delimited by a character * @s: String to search @@ -631,7 +645,7 @@ static unsigned int conf_ip4(unsigned int ifi, struct ip4_ctx *ip4) ifi = nl_get_ext_if(nl_sock, AF_INET); if (!ifi) { - info("Couldn't pick external interface: disabling IPv4"); + debug("Failed to detect external interface for IPv4"); return 0; } @@ -639,8 +653,8 @@ static unsigned int conf_ip4(unsigned int ifi, struct ip4_ctx *ip4) int rc = nl_route_get_def(nl_sock, ifi, AF_INET, &ip4->guest_gw); if (rc < 0) { - err("Couldn't discover IPv4 gateway address: %s", - strerror(-rc)); + debug("Couldn't discover IPv4 gateway address: %s", + strerror(-rc)); return 0; } } @@ -649,8 +663,8 @@ static unsigned int conf_ip4(unsigned int ifi, struct ip4_ctx *ip4) int rc = nl_addr_get(nl_sock, ifi, AF_INET, &ip4->addr, 
&ip4->prefix_len, NULL); if (rc < 0) { - err("Couldn't discover IPv4 address: %s", - strerror(-rc)); + debug("Couldn't discover IPv4 address: %s", + strerror(-rc)); return 0; } } @@ -677,6 +691,19 @@ static unsigned int conf_ip4(unsigned int ifi, struct ip4_ctx *ip4) return ifi; } +/** + * conf_ip4_local() - Configure IPv4 addresses and attributes for local mode + * @ip4: IPv4 context (will be written) + */ +static void conf_ip4_local(struct ip4_ctx *ip4) +{ + ip4->addr_seen = ip4->addr = IP4_LL_GUEST_ADDR; + ip4->our_tap_addr = ip4->guest_gw = IP4_LL_GUEST_GW; + ip4->prefix_len = IP4_LL_PREFIX_LEN; + + ip4->no_copy_addrs = ip4->no_copy_routes = true; +} + /** * conf_ip6() - Verify or detect IPv6 support, get relevant addresses * @ifi: Host interface to attempt (0 to determine one) @@ -693,15 +720,15 @@ static unsigned int conf_ip6(unsigned int ifi, struct ip6_ctx *ip6) ifi = nl_get_ext_if(nl_sock, AF_INET6); if (!ifi) { - info("Couldn't pick external interface: disabling IPv6"); + debug("Failed to detect external interface for IPv6"); return 0; } if (IN6_IS_ADDR_UNSPECIFIED(&ip6->guest_gw)) { rc = nl_route_get_def(nl_sock, ifi, AF_INET6, &ip6->guest_gw); if (rc < 0) { - err("Couldn't discover IPv6 gateway address: %s", - strerror(-rc)); + debug("Couldn't discover IPv6 gateway address: %s", + strerror(-rc)); return 0; } } @@ -710,7 +737,7 @@ static unsigned int conf_ip6(unsigned int ifi, struct ip6_ctx *ip6) IN6_IS_ADDR_UNSPECIFIED(&ip6->addr) ? 
&ip6->addr : NULL, &prefix_len, &ip6->our_tap_ll); if (rc < 0) { - err("Couldn't discover IPv6 address: %s", strerror(-rc)); + debug("Couldn't discover IPv6 address: %s", strerror(-rc)); return 0; } @@ -726,6 +753,17 @@ static unsigned int conf_ip6(unsigned int ifi, struct ip6_ctx *ip6) return ifi; } +/** + * conf_ip6_local() - Configure IPv6 addresses and attributes for local mode + * @ip6: IPv6 context (will be written) + */ +static void conf_ip6_local(struct ip6_ctx *ip6) +{ + ip6->our_tap_ll = ip6->guest_gw = IP6_LL_GUEST_GW; + + ip6->no_copy_addrs = ip6->no_copy_routes = true; +} + /** * usage() - Print usage, exit with given status code * @name: Executable name @@ -948,12 +986,14 @@ static void conf_print(const struct ctx *c) char bufmac[ETH_ADDRSTRLEN], ifn[IFNAMSIZ]; int i; - info("Template interface: %s%s%s%s%s", - c->ifi4 ? if_indextoname(c->ifi4, ifn) : "", - c->ifi4 ? " (IPv4)" : "", - (c->ifi4 && c->ifi6) ? ", " : "", - c->ifi6 ? if_indextoname(c->ifi6, ifn) : "", - c->ifi6 ? " (IPv6)" : ""); + if (c->ifi4 > 0 || c->ifi6 > 0) { + info("Template interface: %s%s%s%s%s", + c->ifi4 > 0 ? if_indextoname(c->ifi4, ifn) : "", + c->ifi4 > 0 ? " (IPv4)" : "", + (c->ifi4 && c->ifi6) ? ", " : "", + c->ifi6 > 0 ? if_indextoname(c->ifi6, ifn) : "", + c->ifi6 > 0 ? 
" (IPv6)" : ""); + } if (*c->ip4.ifname_out || *c->ip6.ifname_out) { info("Outbound interface: %s%s%s%s%s", @@ -1024,9 +1064,9 @@ static void conf_print(const struct ctx *c) if (!c->no_ndp && !c->no_dhcpv6) info("NDP/DHCPv6:"); - else if (!c->no_ndp) - info("DHCPv6:"); else if (!c->no_dhcpv6) + info("DHCPv6:"); + else if (!c->no_ndp) info("NDP:"); else goto dns6; @@ -1733,10 +1773,23 @@ void conf(struct ctx *c, int argc, char **argv) c->ifi4 = conf_ip4(ifi4, &c->ip4); if (!v4_only) c->ifi6 = conf_ip6(ifi6, &c->ip6); - if ((!c->ifi4 && !c->ifi6) || - (*c->ip4.ifname_out && !c->ifi4) || + if ((*c->ip4.ifname_out && !c->ifi4) || (*c->ip6.ifname_out && !c->ifi6)) die("External interface not usable"); + if (!c->ifi4 && !c->ifi6) { + info("No external interface as template, switch to local mode"); + + conf_ip4_local(&c->ip4); + c->ifi4 = -1; + + conf_ip6_local(&c->ip6); + c->ifi6 = -1; + + if (!*c->pasta_ifn) { + strncpy(c->pasta_ifn, pasta_default_ifn, + sizeof(c->pasta_ifn) - 1); + } + } if (c->ifi4 && !no_map_gw && IN4_IS_ADDR_UNSPECIFIED(&c->ip4.map_host_loopback)) @@ -1840,6 +1893,8 @@ void conf(struct ctx *c, int argc, char **argv) if (!c->ifi6) { c->no_ndp = 1; c->no_dhcpv6 = 1; + } else if (IN6_IS_ADDR_UNSPECIFIED(&c->ip6.addr)) { + c->no_dhcpv6 = 1; } if (!c->mtu) @@ -1848,9 +1903,9 @@ void conf(struct ctx *c, int argc, char **argv) get_dns(c); if (!*c->pasta_ifn) { - if (c->ifi4) + if (c->ifi4 > 0) if_indextoname(c->ifi4, c->pasta_ifn); - else + else if (c->ifi6 > 0) if_indextoname(c->ifi6, c->pasta_ifn); } diff --git a/passt.1 b/passt.1 index 059abd3..15c8338 100644 --- a/passt.1 +++ b/passt.1 @@ -160,7 +160,9 @@ once for IPv6). By default, assigned IPv4 and IPv6 addresses are taken from the host interfaces with the first default route, if any, for the corresponding IP version. If no default routes are available and there is any interface with any route for a -given IP version, the first of these interfaces will be chosen instead. 
+given IP version, the first of these interfaces will be chosen instead. If no +such interface exists, the link-local address 169.254.2.1 is assigned for IPv4, +and no additional address will be assigned for IPv6. .TP .BR \-n ", " \-\-netmask " " \fImask @@ -187,7 +189,9 @@ first default route, if any, for the corresponding IP version. If the default route is a multipath one, the gateway is the first nexthop router returned by the kernel which has the highest weight in the set of paths. If no default routes are available and there is just one interface with any route, that -interface will be chosen instead. +interface will be chosen instead. If no such interface exists, the link-local +address 169.254.2.2 is used for IPv4, and the link-local address fe80::1 is used +for IPv6. Note: these addresses are also used as source address for packets directed to the guest or to the target namespace having a loopback or local source address, @@ -202,7 +206,9 @@ Default is to use the interfaces specified by \fB--outbound-if4\fR and If no interfaces are given, the interface with the first default routes for each IP version is selected. If no default routes are available and there is just one -interface with any route, that interface will be chosen instead. +interface with any route, that interface will be chosen instead. If no such +interface exists, host interfaces will be ignored for the purposes of assigning +addresses and routes, and link-local addresses will be used instead. .TP .BR \-o ", " \-\-outbound " " \fIaddr @@ -221,7 +227,8 @@ derive IPv4 addresses and routes. By default, the interface given by the default route is selected. If no default routes are available and there is just one interface with any route, that -interface will be chosen instead. +interface will be chosen instead. If no such interface exists, outbound sockets +will not be bound to any specific interface. .TP .BR \-\-outbound-if6 " " \fIname @@ -231,7 +238,8 @@ derive IPv6 addresses and routes. 
By default, the interface given by the default route is selected. If no default routes are available and there is just one interface with any route, that -interface will be chosen instead. +interface will be chosen instead. If no such interface exists, outbound sockets +will not be bound to any specific interface. .TP .BR \-D ", " \-\-dns " " \fIaddr @@ -503,6 +511,7 @@ Default is \fBnone\fR. .BR \-I ", " \-\-ns-ifname " " \fIname Name of tap interface to be created in target namespace. By default, the same interface name as the external, routable interface is used. +If no such interface exists, the name \fItap0\fR will be used instead. .TP .BR \-t ", " \-\-tcp-ports " " \fIspec @@ -1031,6 +1040,20 @@ If the sending window cannot be queried, it will always be announced as the current sending buffer size to guest or target namespace. This might affect throughput of TCP connections. +.SS Local mode for disconnected setups + +If \fBpasst\fR and \fBpasta\fR fail to find a host interface with a configured +address, other than loopback addresses, they will, obviously, not attempt to +source addresses or routes from the host. + +In this case, unless configured otherwise, they will assign the IPv4 link-local +address 169.254.2.1 to the guest or target namespace, and no IPv6 address. The +notion of the guest or target namespace IPv6 address is derived from the first +link-local address observed. + +Default gateways will be assigned as the link-local address 169.254.2.2 for +IPv4, and as the link-local address fe80::1 for IPv6. 
+ .SH LIMITATIONS Currently, IGMP/MLD proxying (RFC 4605) and support for SCTP (RFC 4960) are not diff --git a/passt.h b/passt.h index 72c7f72..799ee50 100644 --- a/passt.h +++ b/passt.h @@ -202,10 +202,10 @@ struct ip6_ctx { * @our_tap_mac: Pasta/passt's MAC on the tap link * @guest_mac: MAC address of guest or namespace, seen or configured * @hash_secret: 128-bit secret for siphash functions - * @ifi4: Index of template interface for IPv4, 0 if IPv4 disabled + * @ifi4: Template interface for IPv4, -1: none, 0: IPv4 disabled * @ip: IPv4 configuration * @dns_search: DNS search list - * @ifi6: Index of template interface for IPv6, 0 if IPv6 disabled + * @ifi6: Template interface for IPv6, -1: none, 0: IPv6 disabled * @ip6: IPv6 configuration * @pasta_ifn: Name of namespace interface for pasta * @pasta_ifi: Index of namespace interface for pasta @@ -258,12 +258,12 @@ struct ctx { unsigned char guest_mac[ETH_ALEN]; uint64_t hash_secret[2]; - unsigned int ifi4; + int ifi4; struct ip4_ctx ip4; struct fqdn dns_search[MAXDNSRCH]; - unsigned int ifi6; + int ifi6; struct ip6_ctx ip6; char pasta_ifn[IF_NAMESIZE]; diff --git a/pasta.c b/pasta.c index a117704..96dacc3 100644 --- a/pasta.c +++ b/pasta.c @@ -369,8 +369,11 @@ void pasta_ns_conf(struct ctx *c) 0, IFF_NOARP); if (c->ip6.no_copy_addrs) { - rc = nl_addr_set(nl_sock_ns, c->pasta_ifi, - AF_INET6, &c->ip6.addr, 64); + if (!IN6_IS_ADDR_UNSPECIFIED(&c->ip6.addr)) { + rc = nl_addr_set(nl_sock_ns, + c->pasta_ifi, AF_INET6, + &c->ip6.addr, 64); + } } else { rc = nl_addr_dup(nl_sock, c->ifi6, nl_sock_ns, c->pasta_ifi, diff --git a/tap.c b/tap.c index 14d9b3d..5347df4 100644 --- a/tap.c +++ b/tap.c @@ -803,6 +803,9 @@ resume: if (IN6_IS_ADDR_UNSPECIFIED(&c->ip6.addr_seen)) { c->ip6.addr_seen = *saddr; } + + if (IN6_IS_ADDR_UNSPECIFIED(&c->ip6.addr)) + c->ip6.addr = *saddr; } else if (!IN6_IS_ADDR_UNSPECIFIED(saddr)){ c->ip6.addr_seen = *saddr; } From d6e9e2486f092901207e6565f5eee3817cf4e11a Mon Sep 17 00:00:00 2001 From: 
Stefano Brivio <sbrivio@redhat.com> Date: Fri, 15 Nov 2024 18:13:17 +0100 Subject: [PATCH 130/382] dhcp: Use -1 as "missing option" length instead of 0 We want to add support for option 80 (Rapid Commit, RFC 4039), whose length is 0. Signed-off-by: Stefano Brivio <sbrivio@redhat.com> Reviewed-by: David Gibson <david@gibson.dropbear.id.au> --- dhcp.c | 30 ++++++++++++++++++++++-------- 1 file changed, 22 insertions(+), 8 deletions(-) diff --git a/dhcp.c b/dhcp.c index a06f143..387aee3 100644 --- a/dhcp.c +++ b/dhcp.c @@ -36,9 +36,9 @@ /** * struct opt - DHCP option * @sent: Convenience flag, set while filling replies - * @slen: Length of option defined for server + * @slen: Length of option defined for server, -1 if not going to be sent * @s: Option payload from server - * @clen: Length of option received from client + * @clen: Length of option received from client, -1 if not received * @c: Option payload from client */ struct opt { @@ -68,6 +68,11 @@ static struct opt opts[255]; */ void dhcp_init(void) { + int i; + + for (i = 0; i < ARRAY_SIZE(opts); i++) + opts[i].slen = -1; + opts[1] = (struct opt) { 0, 4, { 0 }, 0, { 0 }, }; /* Mask */ opts[3] = (struct opt) { 0, 4, { 0 }, 0, { 0 }, }; /* Router */ opts[51] = (struct opt) { 0, 4, { 0xff, @@ -154,17 +159,17 @@ static int fill(struct msg *m) * option 53 at the beginning of the list. * Put it there explicitly, unless requested via option 55. 
*/ - if (!memchr(opts[55].c, 53, opts[55].clen)) + if (opts[55].clen > 0 && !memchr(opts[55].c, 53, opts[55].clen)) fill_one(m, 53, &offset); for (i = 0; i < opts[55].clen; i++) { o = opts[55].c[i]; - if (opts[o].slen) + if (opts[o].slen != -1) fill_one(m, o, &offset); } for (o = 0; o < 255; o++) { - if (opts[o].slen && !opts[o].sent) + if (opts[o].slen != -1 && !opts[o].sent) fill_one(m, o, &offset); } @@ -264,6 +269,9 @@ static void opt_set_dns_search(const struct ctx *c, size_t max_len) ".\xc0"); } } + + if (!opts[119].slen) + opts[119].slen = -1; } /** @@ -313,6 +321,9 @@ int dhcp(const struct ctx *c, const struct pool *p) offset += offsetof(struct msg, o); + for (i = 0; i < ARRAY_SIZE(opts); i++) + opts[i].clen = -1; + while (opt_off + 2 < opt_len) { const uint8_t *olen, *val; uint8_t *type; @@ -331,11 +342,12 @@ int dhcp(const struct ctx *c, const struct pool *p) opt_off += *olen + 2; } - if (opts[53].c[0] == DHCPDISCOVER) { + if (opts[53].clen > 0 && opts[53].c[0] == DHCPDISCOVER) { info("DHCP: offer to discover"); opts[53].s[0] = DHCPOFFER; - } else if (opts[53].c[0] == DHCPREQUEST || !opts[53].clen) { - info("%s: ack to request", opts[53].clen ? "DHCP" : "BOOTP"); + } else if (opts[53].clen <= 0 || opts[53].c[0] == DHCPREQUEST) { + info("%s: ack to request", /* DHCP needs a valid message type */ + (opts[53].clen <= 0) ? "BOOTP" : "DHCP"); opts[53].s[0] = DHCPACK; } else { return -1; @@ -374,6 +386,8 @@ int dhcp(const struct ctx *c, const struct pool *p) ((struct in_addr *)opts[6].s)[i] = c->ip4.dns[i]; opts[6].slen += sizeof(uint32_t); } + if (!opts[6].slen) + opts[6].slen = -1; if (!c->no_dhcp_dns_search) opt_set_dns_search(c, sizeof(m->o)); From 9da2038485c9334d28df34d2ebd5ba04a3c7662d Mon Sep 17 00:00:00 2001 From: Stefano Brivio <sbrivio@redhat.com> Date: Fri, 15 Nov 2024 18:18:22 +0100 Subject: [PATCH 131/382] dhcp: Introduce support for Rapid Commit (option 80, RFC 4039) I'm trying to speed up and simplify IP address acquisition in muvm. 
Signed-off-by: Stefano Brivio <sbrivio@redhat.com> Reviewed-by: David Gibson <david@gibson.dropbear.id.au> --- dhcp.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/dhcp.c b/dhcp.c index 387aee3..a16cde8 100644 --- a/dhcp.c +++ b/dhcp.c @@ -342,9 +342,16 @@ int dhcp(const struct ctx *c, const struct pool *p) opt_off += *olen + 2; } + opts[80].slen = -1; if (opts[53].clen > 0 && opts[53].c[0] == DHCPDISCOVER) { - info("DHCP: offer to discover"); - opts[53].s[0] = DHCPOFFER; + if (opts[80].clen == -1) { + info("DHCP: offer to discover"); + opts[53].s[0] = DHCPOFFER; + } else { + info("DHCP: ack to discover (Rapid Commit)"); + opts[53].s[0] = DHCPACK; + opts[80].slen = 0; + } } else if (opts[53].clen <= 0 || opts[53].c[0] == DHCPREQUEST) { info("%s: ack to request", /* DHCP needs a valid message type */ (opts[53].clen <= 0) ? "BOOTP" : "DHCP"); From c0fbc7ef2ae2919bf6162b4149d341f448289836 Mon Sep 17 00:00:00 2001 From: Stefano Brivio <sbrivio@redhat.com> Date: Mon, 25 Nov 2024 00:52:57 +0100 Subject: [PATCH 132/382] dhcp: Honour broadcast flag (RFC 2131, 4.1) It's widely considered a legacy option nowadays, and I've haven't seen clients setting it since Windows 95, but it's convenient for a minimal DHCP client not using raw IP sockets such as what I'm playing with for muvm. 
Signed-off-by: Stefano Brivio <sbrivio@redhat.com> Reviewed-by: David Gibson <david@gibson.dropbear.id.au> --- dhcp.c | 12 ++++++++++-- ip.h | 3 +++ 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/dhcp.c b/dhcp.c index a16cde8..d8515aa 100644 --- a/dhcp.c +++ b/dhcp.c @@ -112,6 +112,8 @@ struct msg { uint32_t xid; uint16_t secs; uint16_t flags; +#define FLAG_BROADCAST htons_constant(0x8000) + uint32_t ciaddr; struct in_addr yiaddr; uint32_t siaddr; @@ -285,10 +287,10 @@ int dhcp(const struct ctx *c, const struct pool *p) { size_t mlen, dlen, offset = 0, opt_len, opt_off = 0; char macstr[ETH_ADDRSTRLEN]; + struct in_addr mask, dst; const struct ethhdr *eh; const struct iphdr *iph; const struct udphdr *uh; - struct in_addr mask; unsigned int i; struct msg *m; @@ -400,7 +402,13 @@ int dhcp(const struct ctx *c, const struct pool *p) opt_set_dns_search(c, sizeof(m->o)); dlen = offsetof(struct msg, o) + fill(m); - tap_udp4_send(c, c->ip4.our_tap_addr, 67, c->ip4.addr, 68, m, dlen); + + if (m->flags & FLAG_BROADCAST) + dst = in4addr_broadcast; + else + dst = c->ip4.addr; + + tap_udp4_send(c, c->ip4.our_tap_addr, 67, dst, 68, m, dlen); return 1; } diff --git a/ip.h b/ip.h index 0742612..1544dbf 100644 --- a/ip.h +++ b/ip.h @@ -101,4 +101,7 @@ static const struct in6_addr in6addr_ll_all_nodes = { }, }; +/* IPv4 Limited Broadcast (RFC 919, Section 7), 255.255.255.255 */ +static const struct in_addr in4addr_broadcast = { 0xffffffff }; + #endif /* IP_H */ From dd143e38901af1c0c48f483d93558ba0773667db Mon Sep 17 00:00:00 2001 From: Laurent Vivier <lvivier@redhat.com> Date: Fri, 22 Nov 2024 17:43:28 +0100 Subject: [PATCH 133/382] packet: replace struct desc by struct iovec To be able to manage buffers inside a shared memory provided by a VM via a vhost-user interface, we cannot rely on the fact that buffers are located in a pre-defined memory area and use a base address and a 32bit offset to address them. 
We need a 64bit address, so replace struct desc by struct iovec and update range checking. Signed-off-by: Laurent Vivier <lvivier@redhat.com> Reviewed-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- packet.c | 84 ++++++++++++++++++++++++++++++-------------------------- packet.h | 14 ++-------- 2 files changed, 47 insertions(+), 51 deletions(-) diff --git a/packet.c b/packet.c index ccfc846..3748996 100644 --- a/packet.c +++ b/packet.c @@ -22,6 +22,35 @@ #include "util.h" #include "log.h" +/** + * packet_check_range() - Check if a packet memory range is valid + * @p: Packet pool + * @offset: Offset of data range in packet descriptor + * @len: Length of desired data range + * @start: Start of the packet descriptor + * @func: For tracing: name of calling function + * @line: For tracing: caller line of function call + * + * Return: 0 if the range is valid, -1 otherwise + */ +static int packet_check_range(const struct pool *p, size_t offset, size_t len, + const char *start, const char *func, int line) +{ + if (start < p->buf) { + trace("packet start %p before buffer start %p, " + "%s:%i", (void *)start, (void *)p->buf, func, line); + return -1; + } + + if (start + len + offset > p->buf + p->buf_size) { + trace("packet offset plus length %lu from size %lu, " + "%s:%i", start - p->buf + len + offset, + p->buf_size, func, line); + return -1; + } + + return 0; +} /** * packet_add_do() - Add data as packet descriptor to given pool * @p: Existing pool @@ -41,34 +70,16 @@ void packet_add_do(struct pool *p, size_t len, const char *start, return; } - if (start < p->buf) { - trace("add packet start %p before buffer start %p, %s:%i", - (void *)start, (void *)p->buf, func, line); + if (packet_check_range(p, 0, len, start, func, line)) return; - } - - if (start + len > p->buf + p->buf_size) { - trace("add packet start %p, length: %zu, buffer end %p, %s:%i", - (void *)start, len, (void *)(p->buf + p->buf_size), - func, line); - 
return; - } if (len > UINT16_MAX) { trace("add packet length %zu, %s:%i", len, func, line); return; } -#if UINTPTR_MAX == UINT64_MAX - if ((uintptr_t)start - (uintptr_t)p->buf > UINT32_MAX) { - trace("add packet start %p, buffer start %p, %s:%i", - (void *)start, (void *)p->buf, func, line); - return; - } -#endif - - p->pkt[idx].offset = start - p->buf; - p->pkt[idx].len = len; + p->pkt[idx].iov_base = (void *)start; + p->pkt[idx].iov_len = len; p->count++; } @@ -96,36 +107,31 @@ void *packet_get_do(const struct pool *p, size_t idx, size_t offset, return NULL; } - if (len > UINT16_MAX || len + offset > UINT32_MAX) { + if (len > UINT16_MAX) { if (func) { - trace("packet data length %zu, offset %zu, %s:%i", - len, offset, func, line); + trace("packet data length %zu, %s:%i", + len, func, line); } return NULL; } - if (p->pkt[idx].offset + len + offset > p->buf_size) { + if (len + offset > p->pkt[idx].iov_len) { if (func) { - trace("packet offset plus length %zu from size %zu, " - "%s:%i", p->pkt[idx].offset + len + offset, - p->buf_size, func, line); - } - return NULL; - } - - if (len + offset > p->pkt[idx].len) { - if (func) { - trace("data length %zu, offset %zu from length %u, " - "%s:%i", len, offset, p->pkt[idx].len, + trace("data length %zu, offset %zu from length %zu, " + "%s:%i", len, offset, p->pkt[idx].iov_len, func, line); } return NULL; } - if (left) - *left = p->pkt[idx].len - offset - len; + if (packet_check_range(p, offset, len, p->pkt[idx].iov_base, + func, line)) + return NULL; - return p->buf + p->pkt[idx].offset + offset; + if (left) + *left = p->pkt[idx].iov_len - offset - len; + + return (char *)p->pkt[idx].iov_base + offset; } /** diff --git a/packet.h b/packet.h index a784b07..8377dcf 100644 --- a/packet.h +++ b/packet.h @@ -6,16 +6,6 @@ #ifndef PACKET_H #define PACKET_H -/** - * struct desc - Generic offset-based descriptor within buffer - * @offset: Offset of descriptor relative to buffer start, 32-bit limit - * @len: Length of descriptor, 
host order, 16-bit limit - */ -struct desc { - uint32_t offset; - uint16_t len; -}; - /** * struct pool - Generic pool of packets stored in a buffer * @buf: Buffer storing packet descriptors @@ -29,7 +19,7 @@ struct pool { size_t buf_size; size_t size; size_t count; - struct desc pkt[1]; + struct iovec pkt[1]; }; void packet_add_do(struct pool *p, size_t len, const char *start, @@ -54,7 +44,7 @@ struct _name ## _t { \ size_t buf_size; \ size_t size; \ size_t count; \ - struct desc pkt[_size]; \ + struct iovec pkt[_size]; \ } #define PACKET_POOL_INIT_NOCAST(_size, _buf, _buf_size) \ From 7d1cd4dbf50325b57eb25648f1f64168d7e4820b Mon Sep 17 00:00:00 2001 From: Laurent Vivier <lvivier@redhat.com> Date: Fri, 22 Nov 2024 17:43:29 +0100 Subject: [PATCH 134/382] vhost-user: introduce virtio API Add virtio.c and virtio.h that define the functions needed to manage virtqueues. Signed-off-by: Laurent Vivier <lvivier@redhat.com> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- Makefile | 4 +- util.h | 9 + virtio.c | 650 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ virtio.h | 183 ++++++++++++++++ 4 files changed, 844 insertions(+), 2 deletions(-) create mode 100644 virtio.c create mode 100644 virtio.h diff --git a/Makefile b/Makefile index 258d298..9b61a47 100644 --- a/Makefile +++ b/Makefile @@ -37,7 +37,7 @@ FLAGS += -DDUAL_STACK_SOCKETS=$(DUAL_STACK_SOCKETS) PASST_SRCS = arch.c arp.c checksum.c conf.c dhcp.c dhcpv6.c flow.c fwd.c \ icmp.c igmp.c inany.c iov.c ip.c isolation.c lineread.c log.c mld.c \ ndp.c netlink.c packet.c passt.c pasta.c pcap.c pif.c tap.c tcp.c \ - tcp_buf.c tcp_splice.c udp.c udp_flow.c util.c + tcp_buf.c tcp_splice.c udp.c udp_flow.c util.c virtio.c QRAP_SRCS = qrap.c SRCS = $(PASST_SRCS) $(QRAP_SRCS) @@ -47,7 +47,7 @@ PASST_HEADERS = arch.h arp.h checksum.h conf.h dhcp.h dhcpv6.h flow.h fwd.h \ flow_table.h icmp.h icmp_flow.h inany.h iov.h ip.h isolation.h \ lineread.h log.h ndp.h netlink.h packet.h passt.h pasta.h pcap.h pif.h \ 
siphash.h tap.h tcp.h tcp_buf.h tcp_conn.h tcp_internal.h tcp_splice.h \ - udp.h udp_flow.h util.h + udp.h udp_flow.h util.h virtio.h HEADERS = $(PASST_HEADERS) seccomp.h C := \#include <sys/random.h>\nint main(){int a=getrandom(0, 0, 0);} diff --git a/util.h b/util.h index 90428c4..41bbd60 100644 --- a/util.h +++ b/util.h @@ -144,7 +144,16 @@ static inline uint32_t ntohl_unaligned(const void *p) return ntohl(val); } +static inline void barrier(void) { __asm__ __volatile__("" ::: "memory"); } +#define smp_mb() do { barrier(); __atomic_thread_fence(__ATOMIC_SEQ_CST); } while (0) +#define smp_mb_release() do { barrier(); __atomic_thread_fence(__ATOMIC_RELEASE); } while (0) +#define smp_mb_acquire() do { barrier(); __atomic_thread_fence(__ATOMIC_ACQUIRE); } while (0) + +#define smp_wmb() smp_mb_release() +#define smp_rmb() smp_mb_acquire() + #define NS_FN_STACK_SIZE (1024 * 1024) /* 1MiB */ + int do_clone(int (*fn)(void *), char *stack_area, size_t stack_size, int flags, void *arg); #define NS_CALL(fn, arg) \ diff --git a/virtio.c b/virtio.c new file mode 100644 index 0000000..b23a68c --- /dev/null +++ b/virtio.c @@ -0,0 +1,650 @@ +// SPDX-License-Identifier: GPL-2.0-or-later AND BSD-3-Clause +/* + * virtio API, vring and virtqueue functions definition + * + * Copyright Red Hat + * Author: Laurent Vivier <lvivier@redhat.com> + */ + +/* Some parts copied from QEMU subprojects/libvhost-user/libvhost-user.c + * originally licensed under the following terms: + * + * -- + * + * Copyright IBM, Corp. 2007 + * Copyright (c) 2016 Red Hat, Inc. + * + * Authors: + * Anthony Liguori <aliguori@us.ibm.com> + * Marc-André Lureau <mlureau@redhat.com> + * Victor Kaplansky <victork@redhat.com> + * + * This work is licensed under the terms of the GNU GPL, version 2 or + * later. See the COPYING file in the top-level directory. + * + * Some parts copied from QEMU hw/virtio/virtio.c + * licensed under the following terms: + * + * Copyright IBM, Corp. 
2007 + * + * Authors: + * Anthony Liguori <aliguori@us.ibm.com> + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + * + * -- + * + * virtq_used_event() and virtq_avail_event() from + * https://docs.oasis-open.org/virtio/virtio/v1.2/csd01/virtio-v1.2-csd01.html#x1-712000A + * licensed under the following terms: + * + * -- + * + * This header is BSD licensed so anyone can use the definitions + * to implement compatible drivers/servers. + * + * Copyright 2007, 2009, IBM Corporation + * Copyright 2011, Red Hat, Inc + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of IBM nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ‘‘AS IS’’ AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL IBM OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <stddef.h> +#include <endian.h> +#include <string.h> +#include <errno.h> +#include <sys/eventfd.h> +#include <sys/socket.h> + +#include "util.h" +#include "virtio.h" + +#define VIRTQUEUE_MAX_SIZE 1024 + +/** + * vu_gpa_to_va() - Translate guest physical address to our virtual address. + * @dev: Vhost-user device + * @plen: Physical length to map (input), capped to region (output) + * @guest_addr: Guest physical address + * + * Return: virtual address in our address space of the guest physical address + */ +static void *vu_gpa_to_va(struct vu_dev *dev, uint64_t *plen, uint64_t guest_addr) +{ + unsigned int i; + + if (*plen == 0) + return NULL; + + /* Find matching memory region. 
*/ + for (i = 0; i < dev->nregions; i++) { + const struct vu_dev_region *r = &dev->regions[i]; + + if ((guest_addr >= r->gpa) && + (guest_addr < (r->gpa + r->size))) { + if ((guest_addr + *plen) > (r->gpa + r->size)) + *plen = r->gpa + r->size - guest_addr; + /* NOLINTNEXTLINE(performance-no-int-to-ptr) */ + return (void *)(guest_addr - r->gpa + r->mmap_addr + + r->mmap_offset); + } + } + + return NULL; +} + +/** + * vring_avail_flags() - Read the available ring flags + * @vq: Virtqueue + * + * Return: the available ring descriptor flags of the given virtqueue + */ +static inline uint16_t vring_avail_flags(const struct vu_virtq *vq) +{ + return le16toh(vq->vring.avail->flags); +} + +/** + * vring_avail_idx() - Read the available ring index + * @vq: Virtqueue + * + * Return: the available ring index of the given virtqueue + */ +static inline uint16_t vring_avail_idx(struct vu_virtq *vq) +{ + vq->shadow_avail_idx = le16toh(vq->vring.avail->idx); + + return vq->shadow_avail_idx; +} + +/** + * vring_avail_ring() - Read an available ring entry + * @vq: Virtqueue + * @i: Index of the entry to read + * + * Return: the ring entry content (head of the descriptor chain) + */ +static inline uint16_t vring_avail_ring(const struct vu_virtq *vq, int i) +{ + return le16toh(vq->vring.avail->ring[i]); +} + +/** + * virtq_used_event - Get location of used event indices + * (only with VIRTIO_F_EVENT_IDX) + * @vq Virtqueue + * + * Return: return the location of the used event index + */ +static inline uint16_t *virtq_used_event(const struct vu_virtq *vq) +{ + /* For backwards compat, used event index is at *end* of avail ring. 
*/ + return &vq->vring.avail->ring[vq->vring.num]; +} + +/** + * vring_get_used_event() - Get the used event from the available ring + * @vq Virtqueue + * + * Return: the used event (available only if VIRTIO_RING_F_EVENT_IDX is set) + * used_event is a performant alternative where the driver + * specifies how far the device can progress before a notification + * is required. + */ +static inline uint16_t vring_get_used_event(const struct vu_virtq *vq) +{ + return le16toh(*virtq_used_event(vq)); +} + +/** + * virtqueue_get_head() - Get the head of the descriptor chain for a given + * index + * @vq: Virtqueue + * @idx: Available ring entry index + * @head: Head of the descriptor chain + */ +static void virtqueue_get_head(const struct vu_virtq *vq, + unsigned int idx, unsigned int *head) +{ + /* Grab the next descriptor number they're advertising, and increment + * the index we've seen. + */ + *head = vring_avail_ring(vq, idx % vq->vring.num); + + /* If their number is silly, that's a fatal mistake. 
*/ + if (*head >= vq->vring.num) + die("vhost-user: Guest says index %u is available", *head); +} + +/** + * virtqueue_read_indirect_desc() - Copy virtio ring descriptors from guest + * memory + * @dev: Vhost-user device + * @desc: Destination address to copy the descriptors to + * @addr: Guest memory address to copy from + * @len: Length of memory to copy + * + * Return: -1 if there is an error, 0 otherwise + */ +static int virtqueue_read_indirect_desc(struct vu_dev *dev, struct vring_desc *desc, + uint64_t addr, size_t len) +{ + uint64_t read_len; + + if (len > (VIRTQUEUE_MAX_SIZE * sizeof(struct vring_desc))) + return -1; + + if (len == 0) + return -1; + + while (len) { + const struct vring_desc *orig_desc; + + read_len = len; + orig_desc = vu_gpa_to_va(dev, &read_len, addr); + if (!orig_desc) + return -1; + + memcpy(desc, orig_desc, read_len); + len -= read_len; + addr += read_len; + desc += read_len / sizeof(struct vring_desc); + } + + return 0; +} + +/** + * enum virtqueue_read_desc_state - State in the descriptor chain + * @VIRTQUEUE_READ_DESC_ERROR Found an invalid descriptor + * @VIRTQUEUE_READ_DESC_DONE No more descriptors in the chain + * @VIRTQUEUE_READ_DESC_MORE there are more descriptors in the chain + */ +enum virtqueue_read_desc_state { + VIRTQUEUE_READ_DESC_ERROR = -1, + VIRTQUEUE_READ_DESC_DONE = 0, /* end of chain */ + VIRTQUEUE_READ_DESC_MORE = 1, /* more buffers in chain */ +}; + +/** + * virtqueue_read_next_desc() - Read the the next descriptor in the chain + * @desc: Virtio ring descriptors + * @i: Index of the current descriptor + * @max: Maximum value of the descriptor index + * @next: Index of the next descriptor in the chain (output value) + * + * Return: current chain descriptor state (error, next, done) + */ +static int virtqueue_read_next_desc(const struct vring_desc *desc, + int i, unsigned int max, unsigned int *next) +{ + /* If this descriptor says it doesn't chain, we're done. 
*/ + if (!(le16toh(desc[i].flags) & VRING_DESC_F_NEXT)) + return VIRTQUEUE_READ_DESC_DONE; + + /* Check they're not leading us off end of descriptors. */ + *next = le16toh(desc[i].next); + /* Make sure compiler knows to grab that: we don't want it changing! */ + smp_wmb(); + + if (*next >= max) + return VIRTQUEUE_READ_DESC_ERROR; + + return VIRTQUEUE_READ_DESC_MORE; +} + +/** + * vu_queue_empty() - Check if virtqueue is empty + * @vq: Virtqueue + * + * Return: true if the virtqueue is empty, false otherwise + */ +bool vu_queue_empty(struct vu_virtq *vq) +{ + if (vq->shadow_avail_idx != vq->last_avail_idx) + return false; + + return vring_avail_idx(vq) == vq->last_avail_idx; +} + +/** + * vring_can_notify() - Check if a notification can be sent + * @dev: Vhost-user device + * @vq: Virtqueue + * + * Return: true if notification can be sent + */ +static bool vring_can_notify(const struct vu_dev *dev, struct vu_virtq *vq) +{ + uint16_t old, new; + bool v; + + /* We need to expose used array entries before checking used event. 
*/ + smp_mb(); + + /* Always notify when queue is empty (when feature acknowledge) */ + if (vu_has_feature(dev, VIRTIO_F_NOTIFY_ON_EMPTY) && + !vq->inuse && vu_queue_empty(vq)) + return true; + + if (!vu_has_feature(dev, VIRTIO_RING_F_EVENT_IDX)) + return !(vring_avail_flags(vq) & VRING_AVAIL_F_NO_INTERRUPT); + + v = vq->signalled_used_valid; + vq->signalled_used_valid = true; + old = vq->signalled_used; + new = vq->signalled_used = vq->used_idx; + return !v || vring_need_event(vring_get_used_event(vq), new, old); +} + +/** + * vu_queue_notify() - Send a notification to the given virtqueue + * @dev: Vhost-user device + * @vq: Virtqueue + */ +/* cppcheck-suppress unusedFunction */ +void vu_queue_notify(const struct vu_dev *dev, struct vu_virtq *vq) +{ + if (!vring_can_notify(dev, vq)) { + debug("vhost-user: virtqueue can skip notify..."); + return; + } + + if (eventfd_write(vq->call_fd, 1) < 0) + die_perror("Error writing vhost-user queue eventfd"); +} + +/* virtq_avail_event() - Get location of available event indices + * (only with VIRTIO_F_EVENT_IDX) + * @vq: Virtqueue + * + * Return: return the location of the available event index + */ +static inline uint16_t *virtq_avail_event(const struct vu_virtq *vq) +{ + /* For backwards compat, avail event index is at *end* of used ring. */ + return (uint16_t *)&vq->vring.used->ring[vq->vring.num]; +} + +/** + * vring_set_avail_event() - Set avail_event + * @vq: Virtqueue + * @val: Value to set to avail_event + * avail_event is used in the same way the used_event is in the + * avail_ring. + * avail_event is used to advise the driver that notifications + * are unnecessary until the driver writes entry with an index + * specified by avail_event into the available ring. 
+ */ +static inline void vring_set_avail_event(const struct vu_virtq *vq, + uint16_t val) +{ + uint16_t val_le = htole16(val); + + if (!vq->notification) + return; + + memcpy(virtq_avail_event(vq), &val_le, sizeof(val_le)); +} + +/** + * virtqueue_map_desc() - Translate descriptor ring physical address into our + * virtual address space + * @dev: Vhost-user device + * @p_num_sg: First iov entry to use (input), + * first iov entry not used (output) + * @iov: Iov array to use to store buffer virtual addresses + * @max_num_sg: Maximum number of iov entries + * @pa: Guest physical address of the buffer to map into our virtual + * address + * @sz: Size of the buffer + * + * Return: false on error, true otherwise + */ +static bool virtqueue_map_desc(struct vu_dev *dev, + unsigned int *p_num_sg, struct iovec *iov, + unsigned int max_num_sg, + uint64_t pa, size_t sz) +{ + unsigned int num_sg = *p_num_sg; + + ASSERT(num_sg < max_num_sg); + ASSERT(sz); + + while (sz) { + uint64_t len = sz; + + iov[num_sg].iov_base = vu_gpa_to_va(dev, &len, pa); + if (iov[num_sg].iov_base == NULL) + die("vhost-user: invalid address for buffers"); + iov[num_sg].iov_len = len; + num_sg++; + sz -= len; + pa += len; + } + + *p_num_sg = num_sg; + return true; +} + +/** + * vu_queue_map_desc - Map the virtqueue descriptor ring into our virtual + * address space + * @dev: Vhost-user device + * @vq: Virtqueue + * @idx: First descriptor ring entry to map + * @elem: Virtqueue element to store descriptor ring iov + * + * Return: -1 if there is an error, 0 otherwise + */ +static int vu_queue_map_desc(struct vu_dev *dev, struct vu_virtq *vq, unsigned int idx, + struct vu_virtq_element *elem) +{ + const struct vring_desc *desc = vq->vring.desc; + struct vring_desc desc_buf[VIRTQUEUE_MAX_SIZE]; + unsigned int out_num = 0, in_num = 0; + unsigned int max = vq->vring.num; + unsigned int i = idx; + uint64_t read_len; + int rc; + + if (le16toh(desc[i].flags) & VRING_DESC_F_INDIRECT) { + unsigned int desc_len; + 
uint64_t desc_addr; + + if (le32toh(desc[i].len) % sizeof(struct vring_desc)) + die("vhost-user: Invalid size for indirect buffer table"); + + /* loop over the indirect descriptor table */ + desc_addr = le64toh(desc[i].addr); + desc_len = le32toh(desc[i].len); + max = desc_len / sizeof(struct vring_desc); + read_len = desc_len; + desc = vu_gpa_to_va(dev, &read_len, desc_addr); + if (desc && read_len != desc_len) { + /* Failed to use zero copy */ + desc = NULL; + if (!virtqueue_read_indirect_desc(dev, desc_buf, desc_addr, desc_len)) + desc = desc_buf; + } + if (!desc) + die("vhost-user: Invalid indirect buffer table"); + i = 0; + } + + /* Collect all the descriptors */ + do { + if (le16toh(desc[i].flags) & VRING_DESC_F_WRITE) { + if (!virtqueue_map_desc(dev, &in_num, elem->in_sg, + elem->in_num, + le64toh(desc[i].addr), + le32toh(desc[i].len))) + return -1; + } else { + if (in_num) + die("Incorrect order for descriptors"); + if (!virtqueue_map_desc(dev, &out_num, elem->out_sg, + elem->out_num, + le64toh(desc[i].addr), + le32toh(desc[i].len))) { + return -1; + } + } + + /* If we've got too many, that implies a descriptor loop. 
*/ + if ((in_num + out_num) > max) + die("vhost-user: Loop in queue descriptor list"); + rc = virtqueue_read_next_desc(desc, i, max, &i); + } while (rc == VIRTQUEUE_READ_DESC_MORE); + + if (rc == VIRTQUEUE_READ_DESC_ERROR) + die("vhost-user: Failed to read descriptor list"); + + elem->index = idx; + elem->in_num = in_num; + elem->out_num = out_num; + + return 0; +} + +/** + * vu_queue_pop() - Pop an entry from the virtqueue + * @dev: Vhost-user device + * @vq: Virtqueue + * @elem: Virtqueue element to file with the entry information + * + * Return: -1 if there is an error, 0 otherwise + */ +/* cppcheck-suppress unusedFunction */ +int vu_queue_pop(struct vu_dev *dev, struct vu_virtq *vq, struct vu_virtq_element *elem) +{ + unsigned int head; + int ret; + + if (vu_queue_empty(vq)) + return -1; + + /* Needed after vu_queue_empty(), see comment in + * virtqueue_num_heads(). + */ + smp_rmb(); + + if (vq->inuse >= vq->vring.num) + die("vhost-user queue size exceeded"); + + virtqueue_get_head(vq, vq->last_avail_idx++, &head); + + if (vu_has_feature(dev, VIRTIO_RING_F_EVENT_IDX)) + vring_set_avail_event(vq, vq->last_avail_idx); + + ret = vu_queue_map_desc(dev, vq, head, elem); + + if (ret < 0) + return ret; + + vq->inuse++; + + return 0; +} + +/** + * vu_queue_detach_element() - Detach an element from the virqueue + * @vq: Virtqueue + */ +void vu_queue_detach_element(struct vu_virtq *vq) +{ + vq->inuse--; + /* unmap, when DMA support is added */ +} + +/** + * vu_queue_unpop() - Push back the previously popped element from the virqueue + * @vq: Virtqueue + */ +/* cppcheck-suppress unusedFunction */ +void vu_queue_unpop(struct vu_virtq *vq) +{ + vq->last_avail_idx--; + vu_queue_detach_element(vq); +} + +/** + * vu_queue_rewind() - Push back a given number of popped elements + * @vq: Virtqueue + * @num: Number of element to unpop + */ +/* cppcheck-suppress unusedFunction */ +bool vu_queue_rewind(struct vu_virtq *vq, unsigned int num) +{ + if (num > vq->inuse) + return false; 
+ + vq->last_avail_idx -= num; + vq->inuse -= num; + return true; +} + +/** + * vring_used_write() - Write an entry in the used ring + * @vq: Virtqueue + * @uelem: Entry to write + * @i: Index of the entry in the used ring + */ +static inline void vring_used_write(struct vu_virtq *vq, + const struct vring_used_elem *uelem, int i) +{ + struct vring_used *used = vq->vring.used; + + used->ring[i] = *uelem; +} + +/** + * vu_queue_fill_by_index() - Update information of a descriptor ring entry + * in the used ring + * @vq: Virtqueue + * @index: Descriptor ring index + * @len: Size of the element + * @idx: Used ring entry index + */ +void vu_queue_fill_by_index(struct vu_virtq *vq, unsigned int index, + unsigned int len, unsigned int idx) +{ + struct vring_used_elem uelem; + + idx = (idx + vq->used_idx) % vq->vring.num; + + uelem.id = htole32(index); + uelem.len = htole32(len); + vring_used_write(vq, &uelem, idx); +} + +/** + * vu_queue_fill() - Update information of a given element in the used ring + * @dev: Vhost-user device + * @vq: Virtqueue + * @elem: Element information to fill + * @len: Size of the element + * @idx: Used ring entry index + */ +/* cppcheck-suppress unusedFunction */ +void vu_queue_fill(struct vu_virtq *vq, const struct vu_virtq_element *elem, + unsigned int len, unsigned int idx) +{ + vu_queue_fill_by_index(vq, elem->index, len, idx); +} + +/** + * vring_used_idx_set() - Set the descriptor ring current index + * @vq: Virtqueue + * @val: Value to set in the index + */ +static inline void vring_used_idx_set(struct vu_virtq *vq, uint16_t val) +{ + vq->vring.used->idx = htole16(val); + + vq->used_idx = val; +} + +/** + * vu_queue_flush() - Flush the virtqueue + * @vq: Virtqueue + * @count: Number of entry to flush + */ +/* cppcheck-suppress unusedFunction */ +void vu_queue_flush(struct vu_virtq *vq, unsigned int count) +{ + uint16_t old, new; + + /* Make sure buffer is written before we update index. 
*/ + smp_wmb(); + + old = vq->used_idx; + new = old + count; + vring_used_idx_set(vq, new); + vq->inuse -= count; + if ((uint16_t)(new - vq->signalled_used) < (uint16_t)(new - old)) + vq->signalled_used_valid = false; +} diff --git a/virtio.h b/virtio.h new file mode 100644 index 0000000..94efeb0 --- /dev/null +++ b/virtio.h @@ -0,0 +1,183 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * virtio API, vring and virtqueue functions definition + * + * Copyright Red Hat + * Author: Laurent Vivier <lvivier@redhat.com> + */ + +#ifndef VIRTIO_H +#define VIRTIO_H + +#include <stdbool.h> +#include <linux/vhost_types.h> + +/* Maximum size of a virtqueue */ +#define VIRTQUEUE_MAX_SIZE 1024 + +/** + * struct vu_ring - Virtqueue rings + * @num: Size of the queue + * @desc: Descriptor ring + * @avail: Available ring + * @used: Used ring + * @log_guest_addr: Guest address for logging + * @flags: Vring flags + * VHOST_VRING_F_LOG is set if log address is valid + */ +struct vu_ring { + unsigned int num; + struct vring_desc *desc; + struct vring_avail *avail; + struct vring_used *used; + uint64_t log_guest_addr; + uint32_t flags; +}; + +/** + * struct vu_virtq - Virtqueue definition + * @vring: Virtqueue rings + * @last_avail_idx: Next head to pop + * @shadow_avail_idx: Last avail_idx read from VQ. + * @used_idx: Descriptor ring current index + * @signalled_used: Last used index value we have signalled on + * @signalled_used_valid: True if signalled_used if valid + * @notification: True if the queues notify (via event + * index or interrupt) + * @inuse: Number of entries in use + * @call_fd: The event file descriptor to signal when + * buffers are used. 
+ * @kick_fd: The event file descriptor for adding + * buffers to the vring + * @err_fd: The event file descriptor to signal when + * error occurs + * @enable: True if the virtqueue is enabled + * @started: True if the virtqueue is started + * @vra: QEMU address of our rings + */ +struct vu_virtq { + struct vu_ring vring; + uint16_t last_avail_idx; + uint16_t shadow_avail_idx; + uint16_t used_idx; + uint16_t signalled_used; + bool signalled_used_valid; + bool notification; + unsigned int inuse; + int call_fd; + int kick_fd; + int err_fd; + unsigned int enable; + bool started; + struct vhost_vring_addr vra; +}; + +/** + * struct vu_dev_region - guest shared memory region + * @gpa: Guest physical address of the region + * @size: Memory size in bytes + * @qva: QEMU virtual address + * @mmap_offset: Offset where the region starts in the mapped memory + * @mmap_addr: Address of the mapped memory + */ +struct vu_dev_region { + uint64_t gpa; + uint64_t size; + uint64_t qva; + uint64_t mmap_offset; + uint64_t mmap_addr; +}; + +#define VHOST_USER_MAX_QUEUES 2 + +/* + * Set a reasonable maximum number of ram slots, which will be supported by + * any architecture. 
+ */ +#define VHOST_USER_MAX_RAM_SLOTS 32 + +/** + * struct vu_dev - vhost-user device information + * @context: Execution context + * @nregions: Number of shared memory regions + * @regions: Guest shared memory regions + * @features: Vhost-user features + * @protocol_features: Vhost-user protocol features + */ +struct vu_dev { + uint32_t nregions; + struct vu_dev_region regions[VHOST_USER_MAX_RAM_SLOTS]; + struct vu_virtq vq[VHOST_USER_MAX_QUEUES]; + uint64_t features; + uint64_t protocol_features; +}; + +/** + * struct vu_virtq_element - virtqueue element + * @index: Descriptor ring index + * @out_num: Number of outgoing iovec buffers + * @in_num: Number of incoming iovec buffers + * @in_sg: Incoming iovec buffers + * @out_sg: Outgoing iovec buffers + */ +struct vu_virtq_element { + unsigned int index; + unsigned int out_num; + unsigned int in_num; + struct iovec *in_sg; + struct iovec *out_sg; +}; + +/** + * has_feature() - Check a feature bit in a features set + * @features: Features set + * @fb: Feature bit to check + * + * Return: True if the feature bit is set + */ +static inline bool has_feature(uint64_t features, unsigned int fbit) +{ + return !!(features & (1ULL << fbit)); +} + +/** + * vu_has_feature() - Check if a virtio-net feature is available + * @vdev: Vhost-user device + * @bit: Feature to check + * + * Return: True if the feature is available + */ +static inline bool vu_has_feature(const struct vu_dev *vdev, + unsigned int fbit) +{ + return has_feature(vdev->features, fbit); +} + +/** + * vu_has_protocol_feature() - Check if a vhost-user feature is available + * @vdev: Vhost-user device + * @bit: Feature to check + * + * Return: True if the feature is available + */ +/* cppcheck-suppress unusedFunction */ +static inline bool vu_has_protocol_feature(const struct vu_dev *vdev, + unsigned int fbit) +{ + return has_feature(vdev->protocol_features, fbit); +} + +bool vu_queue_empty(struct vu_virtq *vq); +void vu_queue_notify(const struct vu_dev *dev, 
struct vu_virtq *vq); +int vu_queue_pop(struct vu_dev *dev, struct vu_virtq *vq, + struct vu_virtq_element *elem); +void vu_queue_detach_element(struct vu_virtq *vq); +void vu_queue_unpop(struct vu_virtq *vq); +bool vu_queue_rewind(struct vu_virtq *vq, unsigned int num); +void vu_queue_fill_by_index(struct vu_virtq *vq, unsigned int index, + unsigned int len, unsigned int idx); +void vu_queue_fill(struct vu_virtq *vq, + const struct vu_virtq_element *elem, unsigned int len, + unsigned int idx); +void vu_queue_flush(struct vu_virtq *vq, unsigned int count); +#endif /* VIRTIO_H */ From 31117b27c6c905a6bf5fb2567f30fa2f9e0fb3cd Mon Sep 17 00:00:00 2001 From: Laurent Vivier <lvivier@redhat.com> Date: Fri, 22 Nov 2024 17:43:30 +0100 Subject: [PATCH 135/382] vhost-user: introduce vhost-user API Add vhost_user.c and vhost_user.h that define the functions needed to implement vhost-user backend. Signed-off-by: Laurent Vivier <lvivier@redhat.com> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- Makefile | 4 +- vhost_user.c | 970 +++++++++++++++++++++++++++++++++++++++++++++++++++ vhost_user.h | 208 +++++++++++ virtio.h | 1 + 4 files changed, 1181 insertions(+), 2 deletions(-) create mode 100644 vhost_user.c create mode 100644 vhost_user.h diff --git a/Makefile b/Makefile index 9b61a47..bcb084e 100644 --- a/Makefile +++ b/Makefile @@ -37,7 +37,7 @@ FLAGS += -DDUAL_STACK_SOCKETS=$(DUAL_STACK_SOCKETS) PASST_SRCS = arch.c arp.c checksum.c conf.c dhcp.c dhcpv6.c flow.c fwd.c \ icmp.c igmp.c inany.c iov.c ip.c isolation.c lineread.c log.c mld.c \ ndp.c netlink.c packet.c passt.c pasta.c pcap.c pif.c tap.c tcp.c \ - tcp_buf.c tcp_splice.c udp.c udp_flow.c util.c virtio.c + tcp_buf.c tcp_splice.c udp.c udp_flow.c util.c vhost_user.c virtio.c QRAP_SRCS = qrap.c SRCS = $(PASST_SRCS) $(QRAP_SRCS) @@ -47,7 +47,7 @@ PASST_HEADERS = arch.h arp.h checksum.h conf.h dhcp.h dhcpv6.h flow.h fwd.h \ flow_table.h icmp.h icmp_flow.h inany.h iov.h ip.h isolation.h \ lineread.h log.h ndp.h 
netlink.h packet.h passt.h pasta.h pcap.h pif.h \ siphash.h tap.h tcp.h tcp_buf.h tcp_conn.h tcp_internal.h tcp_splice.h \ - udp.h udp_flow.h util.h virtio.h + udp.h udp_flow.h util.h vhost_user.h virtio.h HEADERS = $(PASST_HEADERS) seccomp.h C := \#include <sys/random.h>\nint main(){int a=getrandom(0, 0, 0);} diff --git a/vhost_user.c b/vhost_user.c new file mode 100644 index 0000000..89627a2 --- /dev/null +++ b/vhost_user.c @@ -0,0 +1,970 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * vhost-user API, command management and virtio interface + * + * Copyright Red Hat + * Author: Laurent Vivier <lvivier@redhat.com> + * + * Some parts from QEMU subprojects/libvhost-user/libvhost-user.c + * licensed under the following terms: + * + * Copyright IBM, Corp. 2007 + * Copyright (c) 2016 Red Hat, Inc. + * + * Authors: + * Anthony Liguori <aliguori@us.ibm.com> + * Marc-André Lureau <mlureau@redhat.com> + * Victor Kaplansky <victork@redhat.com> + * + * This work is licensed under the terms of the GNU GPL, version 2 or + * later. See the COPYING file in the top-level directory. + */ + +#include <errno.h> +#include <fcntl.h> +#include <stdlib.h> +#include <stdio.h> +#include <stdint.h> +#include <stddef.h> +#include <string.h> +#include <assert.h> +#include <stdbool.h> +#include <inttypes.h> +#include <time.h> +#include <net/ethernet.h> +#include <netinet/in.h> +#include <sys/epoll.h> +#include <sys/eventfd.h> +#include <sys/mman.h> +#include <linux/vhost_types.h> +#include <linux/virtio_net.h> + +#include "util.h" +#include "passt.h" +#include "tap.h" +#include "vhost_user.h" +#include "pcap.h" + +/* vhost-user version we are compatible with */ +#define VHOST_USER_VERSION 1 + +/** + * vu_print_capabilities() - print vhost-user capabilities + * this is part of the vhost-user backend + * convention. 
+ */ +/* cppcheck-suppress unusedFunction */ +void vu_print_capabilities(void) +{ + info("{"); + info(" \"type\": \"net\""); + info("}"); + exit(EXIT_SUCCESS); +} + +/** + * vu_request_to_string() - convert a vhost-user request number to its name + * @req: request number + * + * Return: the name of request number + */ +static const char *vu_request_to_string(unsigned int req) +{ + if (req < VHOST_USER_MAX) { +#define REQ(req) [req] = #req + static const char * const vu_request_str[VHOST_USER_MAX] = { + REQ(VHOST_USER_NONE), + REQ(VHOST_USER_GET_FEATURES), + REQ(VHOST_USER_SET_FEATURES), + REQ(VHOST_USER_SET_OWNER), + REQ(VHOST_USER_RESET_OWNER), + REQ(VHOST_USER_SET_MEM_TABLE), + REQ(VHOST_USER_SET_LOG_BASE), + REQ(VHOST_USER_SET_LOG_FD), + REQ(VHOST_USER_SET_VRING_NUM), + REQ(VHOST_USER_SET_VRING_ADDR), + REQ(VHOST_USER_SET_VRING_BASE), + REQ(VHOST_USER_GET_VRING_BASE), + REQ(VHOST_USER_SET_VRING_KICK), + REQ(VHOST_USER_SET_VRING_CALL), + REQ(VHOST_USER_SET_VRING_ERR), + REQ(VHOST_USER_GET_PROTOCOL_FEATURES), + REQ(VHOST_USER_SET_PROTOCOL_FEATURES), + REQ(VHOST_USER_GET_QUEUE_NUM), + REQ(VHOST_USER_SET_VRING_ENABLE), + REQ(VHOST_USER_SEND_RARP), + REQ(VHOST_USER_NET_SET_MTU), + REQ(VHOST_USER_SET_BACKEND_REQ_FD), + REQ(VHOST_USER_IOTLB_MSG), + REQ(VHOST_USER_SET_VRING_ENDIAN), + REQ(VHOST_USER_GET_CONFIG), + REQ(VHOST_USER_SET_CONFIG), + REQ(VHOST_USER_POSTCOPY_ADVISE), + REQ(VHOST_USER_POSTCOPY_LISTEN), + REQ(VHOST_USER_POSTCOPY_END), + REQ(VHOST_USER_GET_INFLIGHT_FD), + REQ(VHOST_USER_SET_INFLIGHT_FD), + REQ(VHOST_USER_GPU_SET_SOCKET), + REQ(VHOST_USER_VRING_KICK), + REQ(VHOST_USER_GET_MAX_MEM_SLOTS), + REQ(VHOST_USER_ADD_MEM_REG), + REQ(VHOST_USER_REM_MEM_REG), + }; +#undef REQ + return vu_request_str[req]; + } + + return "unknown"; +} + +/** + * qva_to_va() - Translate front-end (QEMU) virtual address to our virtual + * address + * @dev: vhost-user device + * @qemu_addr: front-end userspace address + * + * Return: the memory address in our process virtual 
address space. + */ +static void *qva_to_va(struct vu_dev *dev, uint64_t qemu_addr) +{ + unsigned int i; + + /* Find matching memory region. */ + for (i = 0; i < dev->nregions; i++) { + const struct vu_dev_region *r = &dev->regions[i]; + + if ((qemu_addr >= r->qva) && (qemu_addr < (r->qva + r->size))) { + /* NOLINTNEXTLINE(performance-no-int-to-ptr) */ + return (void *)(qemu_addr - r->qva + r->mmap_addr + + r->mmap_offset); + } + } + + return NULL; +} + +/** + * vmsg_close_fds() - Close all file descriptors of a given message + * @vmsg: vhost-user message with the list of the file descriptors + */ +static void vmsg_close_fds(const struct vhost_user_msg *vmsg) +{ + int i; + + for (i = 0; i < vmsg->fd_num; i++) + close(vmsg->fds[i]); +} + +/** + * vu_remove_watch() - Remove a file descriptor from our passt epoll + * file descriptor + * @vdev: vhost-user device + * @fd: file descriptor to remove + */ +static void vu_remove_watch(const struct vu_dev *vdev, int fd) +{ + /* Placeholder to add passt related code */ + (void)vdev; + (void)fd; +} + +/** + * vmsg_set_reply_u64() - Set reply payload.u64 and clear request flags + * and fd_num + * @vmsg: vhost-user message + * @val: 64-bit value to reply + */ +static void vmsg_set_reply_u64(struct vhost_user_msg *vmsg, uint64_t val) +{ + vmsg->hdr.flags = 0; /* defaults will be set by vu_send_reply() */ + vmsg->hdr.size = sizeof(vmsg->payload.u64); + vmsg->payload.u64 = val; + vmsg->fd_num = 0; +} + +/** + * vu_message_read_default() - Read incoming vhost-user message from the + * front-end + * @conn_fd: vhost-user command socket + * @vmsg: vhost-user message + * + * Return: 0 if recvmsg() has been interrupted or if there's no data to read, + * 1 if a message has been received + */ +static int vu_message_read_default(int conn_fd, struct vhost_user_msg *vmsg) +{ + char control[CMSG_SPACE(VHOST_MEMORY_BASELINE_NREGIONS * + sizeof(int))] = { 0 }; + struct iovec iov = { + .iov_base = (char *)vmsg, + .iov_len = VHOST_USER_HDR_SIZE, + 
}; + struct msghdr msg = { + .msg_iov = &iov, + .msg_iovlen = 1, + .msg_control = control, + .msg_controllen = sizeof(control), + }; + ssize_t ret, sz_payload; + struct cmsghdr *cmsg; + + ret = recvmsg(conn_fd, &msg, MSG_DONTWAIT); + if (ret < 0) { + if (errno == EINTR || errno == EAGAIN || errno == EWOULDBLOCK) + return 0; + die_perror("vhost-user message receive (recvmsg)"); + } + + vmsg->fd_num = 0; + for (cmsg = CMSG_FIRSTHDR(&msg); cmsg != NULL; + cmsg = CMSG_NXTHDR(&msg, cmsg)) { + if (cmsg->cmsg_level == SOL_SOCKET && + cmsg->cmsg_type == SCM_RIGHTS) { + size_t fd_size; + + ASSERT(cmsg->cmsg_len >= CMSG_LEN(0)); + fd_size = cmsg->cmsg_len - CMSG_LEN(0); + ASSERT(fd_size <= sizeof(vmsg->fds)); + vmsg->fd_num = fd_size / sizeof(int); + memcpy(vmsg->fds, CMSG_DATA(cmsg), fd_size); + break; + } + } + + sz_payload = vmsg->hdr.size; + if ((size_t)sz_payload > sizeof(vmsg->payload)) { + die("vhost-user message request too big: %d," + " size: vmsg->size: %zd, " + "while sizeof(vmsg->payload) = %zu", + vmsg->hdr.request, sz_payload, sizeof(vmsg->payload)); + } + + if (sz_payload) { + do + ret = recv(conn_fd, &vmsg->payload, sz_payload, 0); + while (ret < 0 && errno == EINTR); + + if (ret < 0) + die_perror("vhost-user message receive"); + + if (ret == 0) + die("EOF on vhost-user message receive"); + + if (ret < sz_payload) + die("Short-read on vhost-user message receive"); + } + + return 1; +} + +/** + * vu_message_write() - Send a message to the front-end + * @conn_fd: vhost-user command socket + * @vmsg: vhost-user message + * + * #syscalls:vu sendmsg + */ +static void vu_message_write(int conn_fd, struct vhost_user_msg *vmsg) +{ + char control[CMSG_SPACE(VHOST_MEMORY_BASELINE_NREGIONS * sizeof(int))] = { 0 }; + struct iovec iov = { + .iov_base = (char *)vmsg, + .iov_len = VHOST_USER_HDR_SIZE + vmsg->hdr.size, + }; + struct msghdr msg = { + .msg_iov = &iov, + .msg_iovlen = 1, + .msg_control = control, + }; + int rc; + + ASSERT(vmsg->fd_num <= 
VHOST_MEMORY_BASELINE_NREGIONS); + if (vmsg->fd_num > 0) { + size_t fdsize = vmsg->fd_num * sizeof(int); + struct cmsghdr *cmsg; + + msg.msg_controllen = CMSG_SPACE(fdsize); + cmsg = CMSG_FIRSTHDR(&msg); + cmsg->cmsg_len = CMSG_LEN(fdsize); + cmsg->cmsg_level = SOL_SOCKET; + cmsg->cmsg_type = SCM_RIGHTS; + memcpy(CMSG_DATA(cmsg), vmsg->fds, fdsize); + } + + do + rc = sendmsg(conn_fd, &msg, 0); + while (rc < 0 && errno == EINTR); + + if (rc < 0) + die_perror("vhost-user message send"); + + if ((uint32_t)rc < VHOST_USER_HDR_SIZE + vmsg->hdr.size) + die("EOF on vhost-user message send"); +} + +/** + * vu_send_reply() - Update message flags and send it to front-end + * @conn_fd: vhost-user command socket + * @vmsg: vhost-user message + */ +static void vu_send_reply(int conn_fd, struct vhost_user_msg *msg) +{ + msg->hdr.flags &= ~VHOST_USER_VERSION_MASK; + msg->hdr.flags |= VHOST_USER_VERSION; + msg->hdr.flags |= VHOST_USER_REPLY_MASK; + + vu_message_write(conn_fd, msg); +} + +/** + * vu_get_features_exec() - Provide back-end features bitmask to front-end + * @vdev: vhost-user device + * @vmsg: vhost-user message + * + * Return: True as a reply is requested + */ +static bool vu_get_features_exec(struct vu_dev *vdev, + struct vhost_user_msg *msg) +{ + uint64_t features = + 1ULL << VIRTIO_F_VERSION_1 | + 1ULL << VIRTIO_NET_F_MRG_RXBUF | + 1ULL << VHOST_USER_F_PROTOCOL_FEATURES; + + (void)vdev; + + vmsg_set_reply_u64(msg, features); + + debug("Sending back to guest u64: 0x%016"PRIx64, msg->payload.u64); + + return true; +} + +/** + * vu_set_enable_all_rings() - Enable/disable all the virtqueues + * @vdev: vhost-user device + * @enable: New virtqueues state + */ +static void vu_set_enable_all_rings(struct vu_dev *vdev, bool enable) +{ + uint16_t i; + + for (i = 0; i < VHOST_USER_MAX_QUEUES; i++) + vdev->vq[i].enable = enable; +} + +/** + * vu_set_features_exec() - Enable features of the back-end + * @vdev: vhost-user device + * @vmsg: vhost-user message + * + * Return: 
False as no reply is requested + */ +static bool vu_set_features_exec(struct vu_dev *vdev, + struct vhost_user_msg *msg) +{ + debug("u64: 0x%016"PRIx64, msg->payload.u64); + + vdev->features = msg->payload.u64; + /* We only support devices conforming to VIRTIO 1.0 or + * later + */ + if (!vu_has_feature(vdev, VIRTIO_F_VERSION_1)) + die("virtio legacy devices aren't supported by passt"); + + if (!vu_has_feature(vdev, VHOST_USER_F_PROTOCOL_FEATURES)) + vu_set_enable_all_rings(vdev, true); + + return false; +} + +/** + * vu_set_owner_exec() - Session start flag, do nothing in our case + * @vdev: vhost-user device + * @vmsg: vhost-user message + * + * Return: False as no reply is requested + */ +static bool vu_set_owner_exec(struct vu_dev *vdev, + struct vhost_user_msg *msg) +{ + (void)vdev; + (void)msg; + + return false; +} + +/** + * map_ring() - Convert ring front-end (QEMU) addresses to our process + * virtual address space. + * @vdev: vhost-user device + * @vq: Virtqueue + * + * Return: True if ring cannot be mapped to our address space + */ +static bool map_ring(struct vu_dev *vdev, struct vu_virtq *vq) +{ + vq->vring.desc = qva_to_va(vdev, vq->vra.desc_user_addr); + vq->vring.used = qva_to_va(vdev, vq->vra.used_user_addr); + vq->vring.avail = qva_to_va(vdev, vq->vra.avail_user_addr); + + debug("Setting virtq addresses:"); + debug(" vring_desc at %p", (void *)vq->vring.desc); + debug(" vring_used at %p", (void *)vq->vring.used); + debug(" vring_avail at %p", (void *)vq->vring.avail); + + return !(vq->vring.desc && vq->vring.used && vq->vring.avail); +} + +/** + * vu_set_mem_table_exec() - Sets the memory map regions to be able to + * translate the vring addresses. 
+ * @vdev: vhost-user device + * @vmsg: vhost-user message + * + * Return: False as no reply is requested + * + * #syscalls:vu mmap munmap + */ +static bool vu_set_mem_table_exec(struct vu_dev *vdev, + struct vhost_user_msg *msg) +{ + struct vhost_user_memory m = msg->payload.memory, *memory = &m; + unsigned int i; + + for (i = 0; i < vdev->nregions; i++) { + const struct vu_dev_region *r = &vdev->regions[i]; + + if (r->mmap_addr) { + /* NOLINTNEXTLINE(performance-no-int-to-ptr) */ + munmap((void *)r->mmap_addr, r->size + r->mmap_offset); + } + } + vdev->nregions = memory->nregions; + + debug("vhost-user nregions: %u", memory->nregions); + for (i = 0; i < vdev->nregions; i++) { + struct vhost_user_memory_region *msg_region = &memory->regions[i]; + struct vu_dev_region *dev_region = &vdev->regions[i]; + void *mmap_addr; + + debug("vhost-user region %d", i); + debug(" guest_phys_addr: 0x%016"PRIx64, + msg_region->guest_phys_addr); + debug(" memory_size: 0x%016"PRIx64, + msg_region->memory_size); + debug(" userspace_addr 0x%016"PRIx64, + msg_region->userspace_addr); + debug(" mmap_offset 0x%016"PRIx64, + msg_region->mmap_offset); + + dev_region->gpa = msg_region->guest_phys_addr; + dev_region->size = msg_region->memory_size; + dev_region->qva = msg_region->userspace_addr; + dev_region->mmap_offset = msg_region->mmap_offset; + + /* We don't use offset argument of mmap() since the + * mapped address has to be page aligned. 
+ */ + mmap_addr = mmap(0, dev_region->size + dev_region->mmap_offset, + PROT_READ | PROT_WRITE, MAP_SHARED | + MAP_NORESERVE, msg->fds[i], 0); + + if (mmap_addr == MAP_FAILED) + die_perror("vhost-user region mmap error"); + + dev_region->mmap_addr = (uint64_t)(uintptr_t)mmap_addr; + debug(" mmap_addr: 0x%016"PRIx64, + dev_region->mmap_addr); + + close(msg->fds[i]); + } + + for (i = 0; i < VHOST_USER_MAX_QUEUES; i++) { + if (vdev->vq[i].vring.desc) { + if (map_ring(vdev, &vdev->vq[i])) + die("remapping queue %d during setmemtable", i); + } + } + + return false; +} + +/** + * vu_set_vring_num_exec() - Set the size of the queue (vring size) + * @vdev: vhost-user device + * @vmsg: vhost-user message + * + * Return: False as no reply is requested + */ +static bool vu_set_vring_num_exec(struct vu_dev *vdev, + struct vhost_user_msg *msg) +{ + unsigned int idx = msg->payload.state.index; + unsigned int num = msg->payload.state.num; + + debug("State.index: %u", idx); + debug("State.num: %u", num); + vdev->vq[idx].vring.num = num; + + return false; +} + +/** + * vu_set_vring_addr_exec() - Set the addresses of the vring + * @vdev: vhost-user device + * @vmsg: vhost-user message + * + * Return: False as no reply is requested + */ +static bool vu_set_vring_addr_exec(struct vu_dev *vdev, + struct vhost_user_msg *msg) +{ + /* We need to copy the payload to vhost_vring_addr structure + * to access index because address of msg->payload.addr + * can be unaligned as it is packed. 
+ */ + struct vhost_vring_addr addr = msg->payload.addr; + struct vu_virtq *vq = &vdev->vq[addr.index]; + + debug("vhost_vring_addr:"); + debug(" index: %d", addr.index); + debug(" flags: %d", addr.flags); + debug(" desc_user_addr: 0x%016" PRIx64, + (uint64_t)addr.desc_user_addr); + debug(" used_user_addr: 0x%016" PRIx64, + (uint64_t)addr.used_user_addr); + debug(" avail_user_addr: 0x%016" PRIx64, + (uint64_t)addr.avail_user_addr); + debug(" log_guest_addr: 0x%016" PRIx64, + (uint64_t)addr.log_guest_addr); + + vq->vra = msg->payload.addr; + vq->vring.flags = addr.flags; + vq->vring.log_guest_addr = addr.log_guest_addr; + + if (map_ring(vdev, vq)) + die("Invalid vring_addr message"); + + vq->used_idx = le16toh(vq->vring.used->idx); + + if (vq->last_avail_idx != vq->used_idx) { + debug("Last avail index != used index: %u != %u", + vq->last_avail_idx, vq->used_idx); + } + + return false; +} +/** + * vu_set_vring_base_exec() - Sets the next index to use for descriptors + * in this vring + * @vdev: vhost-user device + * @vmsg: vhost-user message + * + * Return: False as no reply is requested + */ +static bool vu_set_vring_base_exec(struct vu_dev *vdev, + struct vhost_user_msg *msg) +{ + unsigned int idx = msg->payload.state.index; + unsigned int num = msg->payload.state.num; + + debug("State.index: %u", idx); + debug("State.num: %u", num); + vdev->vq[idx].shadow_avail_idx = vdev->vq[idx].last_avail_idx = num; + + return false; +} + +/** + * vu_get_vring_base_exec() - Stops the vring and returns the current + * descriptor index or indices + * @vdev: vhost-user device + * @vmsg: vhost-user message + * + * Return: True as a reply is requested + */ +static bool vu_get_vring_base_exec(struct vu_dev *vdev, + struct vhost_user_msg *msg) +{ + unsigned int idx = msg->payload.state.index; + + debug("State.index: %u", idx); + msg->payload.state.num = vdev->vq[idx].last_avail_idx; + msg->hdr.size = sizeof(msg->payload.state); + + vdev->vq[idx].started = false; + + if 
(vdev->vq[idx].call_fd != -1) { + close(vdev->vq[idx].call_fd); + vdev->vq[idx].call_fd = -1; + } + if (vdev->vq[idx].kick_fd != -1) { + vu_remove_watch(vdev, vdev->vq[idx].kick_fd); + close(vdev->vq[idx].kick_fd); + vdev->vq[idx].kick_fd = -1; + } + + return true; +} + +/** + * vu_set_watch() - Add a file descriptor to the passt epoll file descriptor + * @vdev: vhost-user device + * @idx: queue index of the file descriptor to add + */ +static void vu_set_watch(const struct vu_dev *vdev, int idx) +{ + /* Placeholder to add passt related code */ + (void)vdev; + (void)idx; +} + +/** + * vu_check_queue_msg_file() - Check if a message is valid, + * close fds if NOFD bit is set + * @vmsg: vhost-user message + */ +static void vu_check_queue_msg_file(struct vhost_user_msg *msg) +{ + bool nofd = msg->payload.u64 & VHOST_USER_VRING_NOFD_MASK; + int idx = msg->payload.u64 & VHOST_USER_VRING_IDX_MASK; + + if (idx >= VHOST_USER_MAX_QUEUES) + die("Invalid vhost-user queue index: %u", idx); + + if (nofd) { + vmsg_close_fds(msg); + return; + } + + if (msg->fd_num != 1) + die("Invalid fds in vhost-user request: %d", msg->hdr.request); +} + +/** + * vu_set_vring_kick_exec() - Set the event file descriptor for adding buffers + * to the vring + * @vdev: vhost-user device + * @vmsg: vhost-user message + * + * Return: False as no reply is requested + */ +static bool vu_set_vring_kick_exec(struct vu_dev *vdev, + struct vhost_user_msg *msg) +{ + bool nofd = msg->payload.u64 & VHOST_USER_VRING_NOFD_MASK; + int idx = msg->payload.u64 & VHOST_USER_VRING_IDX_MASK; + + debug("u64: 0x%016"PRIx64, msg->payload.u64); + + vu_check_queue_msg_file(msg); + + if (vdev->vq[idx].kick_fd != -1) { + vu_remove_watch(vdev, vdev->vq[idx].kick_fd); + close(vdev->vq[idx].kick_fd); + vdev->vq[idx].kick_fd = -1; + } + + if (!nofd) + vdev->vq[idx].kick_fd = msg->fds[0]; + + debug("Got kick_fd: %d for vq: %d", vdev->vq[idx].kick_fd, idx); + + vdev->vq[idx].started = true; + + if (vdev->vq[idx].kick_fd != -1 && 
VHOST_USER_IS_QUEUE_TX(idx)) { + vu_set_watch(vdev, idx); + debug("Waiting for kicks on fd: %d for vq: %d", + vdev->vq[idx].kick_fd, idx); + } + + return false; +} + +/** + * vu_set_vring_call_exec() - Set the event file descriptor to signal when + * buffers are used + * @vdev: vhost-user device + * @vmsg: vhost-user message + * + * Return: False as no reply is requested + */ +static bool vu_set_vring_call_exec(struct vu_dev *vdev, + struct vhost_user_msg *msg) +{ + bool nofd = msg->payload.u64 & VHOST_USER_VRING_NOFD_MASK; + int idx = msg->payload.u64 & VHOST_USER_VRING_IDX_MASK; + + debug("u64: 0x%016"PRIx64, msg->payload.u64); + + vu_check_queue_msg_file(msg); + + if (vdev->vq[idx].call_fd != -1) { + close(vdev->vq[idx].call_fd); + vdev->vq[idx].call_fd = -1; + } + + if (!nofd) + vdev->vq[idx].call_fd = msg->fds[0]; + + /* in case of I/O hang after reconnecting */ + if (vdev->vq[idx].call_fd != -1) + eventfd_write(msg->fds[0], 1); + + debug("Got call_fd: %d for vq: %d", vdev->vq[idx].call_fd, idx); + + return false; +} + +/** + * vu_set_vring_err_exec() - Set the event file descriptor to signal when + * error occurs + * @vdev: vhost-user device + * @vmsg: vhost-user message + * + * Return: False as no reply is requested + */ +static bool vu_set_vring_err_exec(struct vu_dev *vdev, + struct vhost_user_msg *msg) +{ + bool nofd = msg->payload.u64 & VHOST_USER_VRING_NOFD_MASK; + int idx = msg->payload.u64 & VHOST_USER_VRING_IDX_MASK; + + debug("u64: 0x%016"PRIx64, msg->payload.u64); + + vu_check_queue_msg_file(msg); + + if (vdev->vq[idx].err_fd != -1) { + close(vdev->vq[idx].err_fd); + vdev->vq[idx].err_fd = -1; + } + + if (!nofd) + vdev->vq[idx].err_fd = msg->fds[0]; + + return false; +} + +/** + * vu_get_protocol_features_exec() - Provide the protocol (vhost-user) features + * to the front-end + * @vdev: vhost-user device + * @vmsg: vhost-user message + * + * Return: True as a reply is requested + */ +static bool vu_get_protocol_features_exec(struct vu_dev *vdev, + 
struct vhost_user_msg *msg) +{ + uint64_t features = 1ULL << VHOST_USER_PROTOCOL_F_REPLY_ACK; + + (void)vdev; + vmsg_set_reply_u64(msg, features); + + return true; +} + +/** + * vu_set_protocol_features_exec() - Enable protocol (vhost-user) features + * @vdev: vhost-user device + * @vmsg: vhost-user message + * + * Return: False as no reply is requested + */ +static bool vu_set_protocol_features_exec(struct vu_dev *vdev, + struct vhost_user_msg *msg) +{ + uint64_t features = msg->payload.u64; + + debug("u64: 0x%016"PRIx64, features); + + vdev->protocol_features = msg->payload.u64; + + return false; +} + +/** + * vu_get_queue_num_exec() - Tell how many queues we support + * @vdev: vhost-user device + * @vmsg: vhost-user message + * + * Return: True as a reply is requested + */ +static bool vu_get_queue_num_exec(struct vu_dev *vdev, + struct vhost_user_msg *msg) +{ + (void)vdev; + + vmsg_set_reply_u64(msg, VHOST_USER_MAX_QUEUES); + + return true; +} + +/** + * vu_set_vring_enable_exec() - Enable or disable corresponding vring + * @vdev: vhost-user device + * @vmsg: vhost-user message + * + * Return: False as no reply is requested + */ +static bool vu_set_vring_enable_exec(struct vu_dev *vdev, + struct vhost_user_msg *msg) +{ + unsigned int enable = msg->payload.state.num; + unsigned int idx = msg->payload.state.index; + + debug("State.index: %u", idx); + debug("State.enable: %u", enable); + + if (idx >= VHOST_USER_MAX_QUEUES) + die("Invalid vring_enable index: %u", idx); + + vdev->vq[idx].enable = enable; + return false; +} + +/** + * vu_init() - Initialize vhost-user device structure + * @c: execution context + * @vdev: vhost-user device + */ +/* cppcheck-suppress unusedFunction */ +void vu_init(struct ctx *c, struct vu_dev *vdev) +{ + int i; + + vdev->context = c; + for (i = 0; i < VHOST_USER_MAX_QUEUES; i++) { + vdev->vq[i] = (struct vu_virtq){ + .call_fd = -1, + .kick_fd = -1, + .err_fd = -1, + .notification = true, + }; + } +} + +/** + * vu_cleanup() - Reset 
vhost-user device + * @vdev: vhost-user device + */ +/* cppcheck-suppress unusedFunction */ +void vu_cleanup(struct vu_dev *vdev) +{ + unsigned int i; + + for (i = 0; i < VHOST_USER_MAX_QUEUES; i++) { + struct vu_virtq *vq = &vdev->vq[i]; + + vq->started = false; + vq->notification = true; + + if (vq->call_fd != -1) { + close(vq->call_fd); + vq->call_fd = -1; + } + if (vq->err_fd != -1) { + close(vq->err_fd); + vq->err_fd = -1; + } + if (vq->kick_fd != -1) { + vu_remove_watch(vdev, vq->kick_fd); + close(vq->kick_fd); + vq->kick_fd = -1; + } + + vq->vring.desc = 0; + vq->vring.used = 0; + vq->vring.avail = 0; + } + + for (i = 0; i < vdev->nregions; i++) { + const struct vu_dev_region *r = &vdev->regions[i]; + + if (r->mmap_addr) { + /* NOLINTNEXTLINE(performance-no-int-to-ptr) */ + munmap((void *)r->mmap_addr, r->size + r->mmap_offset); + } + } + vdev->nregions = 0; +} + +/** + * vu_sock_reset() - Reset connection socket + * @vdev: vhost-user device + */ +static void vu_sock_reset(struct vu_dev *vdev) +{ + /* Placeholder to add passt related code */ + (void)vdev; +} + +static bool (*vu_handle[VHOST_USER_MAX])(struct vu_dev *vdev, + struct vhost_user_msg *msg) = { + [VHOST_USER_GET_FEATURES] = vu_get_features_exec, + [VHOST_USER_SET_FEATURES] = vu_set_features_exec, + [VHOST_USER_GET_PROTOCOL_FEATURES] = vu_get_protocol_features_exec, + [VHOST_USER_SET_PROTOCOL_FEATURES] = vu_set_protocol_features_exec, + [VHOST_USER_GET_QUEUE_NUM] = vu_get_queue_num_exec, + [VHOST_USER_SET_OWNER] = vu_set_owner_exec, + [VHOST_USER_SET_MEM_TABLE] = vu_set_mem_table_exec, + [VHOST_USER_SET_VRING_NUM] = vu_set_vring_num_exec, + [VHOST_USER_SET_VRING_ADDR] = vu_set_vring_addr_exec, + [VHOST_USER_SET_VRING_BASE] = vu_set_vring_base_exec, + [VHOST_USER_GET_VRING_BASE] = vu_get_vring_base_exec, + [VHOST_USER_SET_VRING_KICK] = vu_set_vring_kick_exec, + [VHOST_USER_SET_VRING_CALL] = vu_set_vring_call_exec, + [VHOST_USER_SET_VRING_ERR] = vu_set_vring_err_exec, + [VHOST_USER_SET_VRING_ENABLE] 
= vu_set_vring_enable_exec, +}; + +/** + * vu_control_handler() - Handle control commands for vhost-user + * @vdev: vhost-user device + * @fd: vhost-user message socket + * @events: epoll events + */ +/* cppcheck-suppress unusedFunction */ +void vu_control_handler(struct vu_dev *vdev, int fd, uint32_t events) +{ + struct vhost_user_msg msg = { 0 }; + bool need_reply, reply_requested; + int ret; + + if (events & (EPOLLRDHUP | EPOLLHUP | EPOLLERR)) { + vu_sock_reset(vdev); + return; + } + + ret = vu_message_read_default(fd, &msg); + if (ret == 0) { + vu_sock_reset(vdev); + return; + } + debug("================ Vhost user message ================"); + debug("Request: %s (%d)", vu_request_to_string(msg.hdr.request), + msg.hdr.request); + debug("Flags: 0x%x", msg.hdr.flags); + debug("Size: %u", msg.hdr.size); + + need_reply = msg.hdr.flags & VHOST_USER_NEED_REPLY_MASK; + + if (msg.hdr.request >= 0 && msg.hdr.request < VHOST_USER_MAX && + vu_handle[msg.hdr.request]) + reply_requested = vu_handle[msg.hdr.request](vdev, &msg); + else + die("Unhandled request: %d", msg.hdr.request); + + /* cppcheck-suppress legacyUninitvar */ + if (!reply_requested && need_reply) { + msg.payload.u64 = 0; + msg.hdr.flags = 0; + msg.hdr.size = sizeof(msg.payload.u64); + msg.fd_num = 0; + reply_requested = true; + } + + if (reply_requested) + vu_send_reply(fd, &msg); +} diff --git a/vhost_user.h b/vhost_user.h new file mode 100644 index 0000000..5af349b --- /dev/null +++ b/vhost_user.h @@ -0,0 +1,208 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * vhost-user API, command management and virtio interface + * + * Copyright Red Hat + * Author: Laurent Vivier <lvivier@redhat.com> + */ + +/* some parts from subprojects/libvhost-user/libvhost-user.h */ + +#ifndef VHOST_USER_H +#define VHOST_USER_H + +#include "virtio.h" +#include "iov.h" + +#define VHOST_USER_F_PROTOCOL_FEATURES 30 + +#define VHOST_MEMORY_BASELINE_NREGIONS 8 + +/** + * enum vhost_user_protocol_feature - List of available 
vhost-user features + */ +enum vhost_user_protocol_feature { + VHOST_USER_PROTOCOL_F_MQ = 0, + VHOST_USER_PROTOCOL_F_LOG_SHMFD = 1, + VHOST_USER_PROTOCOL_F_RARP = 2, + VHOST_USER_PROTOCOL_F_REPLY_ACK = 3, + VHOST_USER_PROTOCOL_F_NET_MTU = 4, + VHOST_USER_PROTOCOL_F_BACKEND_REQ = 5, + VHOST_USER_PROTOCOL_F_CROSS_ENDIAN = 6, + VHOST_USER_PROTOCOL_F_CRYPTO_SESSION = 7, + VHOST_USER_PROTOCOL_F_PAGEFAULT = 8, + VHOST_USER_PROTOCOL_F_CONFIG = 9, + VHOST_USER_PROTOCOL_F_SLAVE_SEND_FD = 10, + VHOST_USER_PROTOCOL_F_HOST_NOTIFIER = 11, + VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD = 12, + VHOST_USER_PROTOCOL_F_INBAND_NOTIFICATIONS = 14, + VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS = 15, + + VHOST_USER_PROTOCOL_F_MAX +}; + +/** + * enum vhost_user_request - List of available vhost-user requests + */ +enum vhost_user_request { + VHOST_USER_NONE = 0, + VHOST_USER_GET_FEATURES = 1, + VHOST_USER_SET_FEATURES = 2, + VHOST_USER_SET_OWNER = 3, + VHOST_USER_RESET_OWNER = 4, + VHOST_USER_SET_MEM_TABLE = 5, + VHOST_USER_SET_LOG_BASE = 6, + VHOST_USER_SET_LOG_FD = 7, + VHOST_USER_SET_VRING_NUM = 8, + VHOST_USER_SET_VRING_ADDR = 9, + VHOST_USER_SET_VRING_BASE = 10, + VHOST_USER_GET_VRING_BASE = 11, + VHOST_USER_SET_VRING_KICK = 12, + VHOST_USER_SET_VRING_CALL = 13, + VHOST_USER_SET_VRING_ERR = 14, + VHOST_USER_GET_PROTOCOL_FEATURES = 15, + VHOST_USER_SET_PROTOCOL_FEATURES = 16, + VHOST_USER_GET_QUEUE_NUM = 17, + VHOST_USER_SET_VRING_ENABLE = 18, + VHOST_USER_SEND_RARP = 19, + VHOST_USER_NET_SET_MTU = 20, + VHOST_USER_SET_BACKEND_REQ_FD = 21, + VHOST_USER_IOTLB_MSG = 22, + VHOST_USER_SET_VRING_ENDIAN = 23, + VHOST_USER_GET_CONFIG = 24, + VHOST_USER_SET_CONFIG = 25, + VHOST_USER_CREATE_CRYPTO_SESSION = 26, + VHOST_USER_CLOSE_CRYPTO_SESSION = 27, + VHOST_USER_POSTCOPY_ADVISE = 28, + VHOST_USER_POSTCOPY_LISTEN = 29, + VHOST_USER_POSTCOPY_END = 30, + VHOST_USER_GET_INFLIGHT_FD = 31, + VHOST_USER_SET_INFLIGHT_FD = 32, + VHOST_USER_GPU_SET_SOCKET = 33, + VHOST_USER_VRING_KICK = 35, + 
VHOST_USER_GET_MAX_MEM_SLOTS = 36, + VHOST_USER_ADD_MEM_REG = 37, + VHOST_USER_REM_MEM_REG = 38, + VHOST_USER_MAX +}; + +/** + * struct vhost_user_header - vhost-user message header + * @request: Request type of the message + * @flags: Request flags + * @size: The following payload size + */ +struct vhost_user_header { + enum vhost_user_request request; + +#define VHOST_USER_VERSION_MASK 0x3 +#define VHOST_USER_REPLY_MASK (0x1 << 2) +#define VHOST_USER_NEED_REPLY_MASK (0x1 << 3) + uint32_t flags; + uint32_t size; +} __attribute__ ((__packed__)); + +/** + * struct vhost_user_memory_region - Front-end shared memory region information + * @guest_phys_addr: Guest physical address of the region + * @memory_size: Memory size + * @userspace_addr: front-end (QEMU) userspace address + * @mmap_offset: region offset in the shared memory area + */ +struct vhost_user_memory_region { + uint64_t guest_phys_addr; + uint64_t memory_size; + uint64_t userspace_addr; + uint64_t mmap_offset; +}; + +/** + * struct vhost_user_memory - List of all the shared memory regions + * @nregions: Number of memory regions + * @padding: Padding + * @regions: Memory regions list + */ +struct vhost_user_memory { + uint32_t nregions; + uint32_t padding; + struct vhost_user_memory_region regions[VHOST_MEMORY_BASELINE_NREGIONS]; +}; + +/** + * union vhost_user_payload - vhost-user message payload + * @u64: 64-bit payload + * @state: vring state payload + * @addr: vring addresses payload + * @memory: Memory regions information payload + */ +union vhost_user_payload { +#define VHOST_USER_VRING_IDX_MASK 0xff +#define VHOST_USER_VRING_NOFD_MASK (0x1 << 8) + uint64_t u64; + struct vhost_vring_state state; + struct vhost_vring_addr addr; + struct vhost_user_memory memory; +}; + +/** + * struct vhost_user_msg - vhost-user message + * @hdr: Message header + * @payload: Message payload + * @fds: File descriptors associated with the message + * in the ancillary data. 
+ * (shared memory or event file descriptors) + * @fd_num: Number of file descriptors + */ +struct vhost_user_msg { + struct vhost_user_header hdr; + union vhost_user_payload payload; + + int fds[VHOST_MEMORY_BASELINE_NREGIONS]; + int fd_num; +} __attribute__ ((__packed__)); +#define VHOST_USER_HDR_SIZE sizeof(struct vhost_user_header) + +/* index of the RX virtqueue */ +#define VHOST_USER_RX_QUEUE 0 +/* index of the TX virtqueue */ +#define VHOST_USER_TX_QUEUE 1 + +/* in case of multiqueue, the RX and TX queues are interleaved */ +#define VHOST_USER_IS_QUEUE_TX(n) (n % 2) +#define VHOST_USER_IS_QUEUE_RX(n) (!(n % 2)) + +/* Default virtio-net header for passt */ +#define VU_HEADER ((struct virtio_net_hdr){ \ + .flags = VIRTIO_NET_HDR_F_DATA_VALID, \ + .gso_type = VIRTIO_NET_HDR_GSO_NONE, \ +}) + +/** + * vu_queue_enabled - Return state of a virtqueue + * @vq: virtqueue to check + * + * Return: true if the virtqueue is enabled, false otherwise + */ +/* cppcheck-suppress unusedFunction */ +static inline bool vu_queue_enabled(const struct vu_virtq *vq) +{ + return vq->enable; +} + +/** + * vu_queue_started - Return state of a virtqueue + * @vq: virtqueue to check + * + * Return: true if the virtqueue is started, false otherwise + */ +/* cppcheck-suppress unusedFunction */ +static inline bool vu_queue_started(const struct vu_virtq *vq) +{ + return vq->started; +} + +void vu_print_capabilities(void); +void vu_init(struct ctx *c, struct vu_dev *vdev); +void vu_cleanup(struct vu_dev *vdev); +void vu_control_handler(struct vu_dev *vdev, int fd, uint32_t events); +#endif /* VHOST_USER_H */ diff --git a/virtio.h b/virtio.h index 94efeb0..6410d60 100644 --- a/virtio.h +++ b/virtio.h @@ -105,6 +105,7 @@ struct vu_dev_region { * @protocol_features: Vhost-user protocol features */ struct vu_dev { + struct ctx *context; uint32_t nregions; struct vu_dev_region regions[VHOST_USER_MAX_RAM_SLOTS]; struct vu_virtq vq[VHOST_USER_MAX_QUEUES]; From 5a8b33c667d4468e82c4d50e81da06c0e681761e
Mon Sep 17 00:00:00 2001 From: Laurent Vivier <lvivier@redhat.com> Date: Fri, 22 Nov 2024 17:43:31 +0100 Subject: [PATCH 136/382] udp: Prepare udp.c to be shared with vhost-user Export udp_payload_t, udp_update_hdr4(), udp_update_hdr6() and udp_sock_errs(). Rename udp_listen_sock_handler() to udp_buf_listen_sock_handler() and udp_reply_sock_handler to udp_buf_reply_sock_handler(). Signed-off-by: Laurent Vivier <lvivier@redhat.com> Reviewed-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- udp.c | 74 ++++++++++++++++++++++++++++++-------------------- udp_internal.h | 34 +++++++++++++++++++++++ 2 files changed, 79 insertions(+), 29 deletions(-) create mode 100644 udp_internal.h diff --git a/udp.c b/udp.c index 4be165f..9718ed8 100644 --- a/udp.c +++ b/udp.c @@ -109,8 +109,7 @@ #include "pcap.h" #include "log.h" #include "flow_table.h" - -#define UDP_MAX_FRAMES 32 /* max # of frames to receive at once */ +#include "udp_internal.h" /* "Spliced" sockets indexed by bound port (host order) */ static int udp_splice_ns [IP_VERSIONS][NUM_PORTS]; @@ -118,20 +117,8 @@ static int udp_splice_init[IP_VERSIONS][NUM_PORTS]; /* Static buffers */ -/** - * struct udp_payload_t - UDP header and data for inbound messages - * @uh: UDP header - * @data: UDP data - */ -static struct udp_payload_t { - struct udphdr uh; - char data[USHRT_MAX - sizeof(struct udphdr)]; -#ifdef __AVX2__ -} __attribute__ ((packed, aligned(32))) -#else -} __attribute__ ((packed, aligned(__alignof__(unsigned int)))) -#endif -udp_payload[UDP_MAX_FRAMES]; +/* UDP header and data for inbound messages */ +static struct udp_payload_t udp_payload[UDP_MAX_FRAMES]; /* Ethernet header for IPv4 frames */ static struct ethhdr udp4_eth_hdr; @@ -302,9 +289,9 @@ static void udp_splice_send(const struct ctx *c, size_t start, size_t n, * * Return: size of IPv4 payload (UDP header + data) */ -static size_t udp_update_hdr4(struct iphdr *ip4h, struct udp_payload_t *bp, - const 
struct flowside *toside, size_t dlen, - bool no_udp_csum) +size_t udp_update_hdr4(struct iphdr *ip4h, struct udp_payload_t *bp, + const struct flowside *toside, size_t dlen, + bool no_udp_csum) { const struct in_addr *src = inany_v4(&toside->oaddr); const struct in_addr *dst = inany_v4(&toside->eaddr); @@ -345,9 +332,9 @@ static size_t udp_update_hdr4(struct iphdr *ip4h, struct udp_payload_t *bp, * * Return: size of IPv6 payload (UDP header + data) */ -static size_t udp_update_hdr6(struct ipv6hdr *ip6h, struct udp_payload_t *bp, - const struct flowside *toside, size_t dlen, - bool no_udp_csum) +size_t udp_update_hdr6(struct ipv6hdr *ip6h, struct udp_payload_t *bp, + const struct flowside *toside, size_t dlen, + bool no_udp_csum) { uint16_t l4len = dlen + sizeof(bp->uh); @@ -477,7 +464,7 @@ static int udp_sock_recverr(int s) * * Return: Number of errors handled, or < 0 if we have an unrecoverable error */ -static int udp_sock_errs(const struct ctx *c, int s, uint32_t events) +int udp_sock_errs(const struct ctx *c, int s, uint32_t events) { unsigned n_err = 0; socklen_t errlen; @@ -554,7 +541,7 @@ static int udp_sock_recv(const struct ctx *c, int s, uint32_t events, } /** - * udp_listen_sock_handler() - Handle new data from socket + * udp_buf_listen_sock_handler() - Handle new data from socket * @c: Execution context * @ref: epoll reference * @events: epoll events bitmap @@ -562,8 +549,9 @@ static int udp_sock_recv(const struct ctx *c, int s, uint32_t events, * * #syscalls recvmmsg */ -void udp_listen_sock_handler(const struct ctx *c, union epoll_ref ref, - uint32_t events, const struct timespec *now) +static void udp_buf_listen_sock_handler(const struct ctx *c, + union epoll_ref ref, uint32_t events, + const struct timespec *now) { const socklen_t sasize = sizeof(udp_meta[0].s_in); int n, i; @@ -630,7 +618,21 @@ void udp_listen_sock_handler(const struct ctx *c, union epoll_ref ref, } /** - * udp_reply_sock_handler() - Handle new data from flow specific socket + * 
udp_listen_sock_handler() - Handle new data from socket + * @c: Execution context + * @ref: epoll reference + * @events: epoll events bitmap + * @now: Current timestamp + */ +void udp_listen_sock_handler(const struct ctx *c, + union epoll_ref ref, uint32_t events, + const struct timespec *now) +{ + udp_buf_listen_sock_handler(c, ref, events, now); +} + +/** + * udp_buf_reply_sock_handler() - Handle new data from flow specific socket * @c: Execution context * @ref: epoll reference * @events: epoll events bitmap @@ -638,8 +640,9 @@ void udp_listen_sock_handler(const struct ctx *c, union epoll_ref ref, * * #syscalls recvmmsg */ -void udp_reply_sock_handler(const struct ctx *c, union epoll_ref ref, - uint32_t events, const struct timespec *now) +static void udp_buf_reply_sock_handler(const struct ctx *c, union epoll_ref ref, + uint32_t events, + const struct timespec *now) { flow_sidx_t tosidx = flow_sidx_opposite(ref.flowside); const struct flowside *toside = flowside_at_sidx(tosidx); @@ -685,6 +688,19 @@ void udp_reply_sock_handler(const struct ctx *c, union epoll_ref ref, } } +/** + * udp_reply_sock_handler() - Handle new data from flow specific socket + * @c: Execution context + * @ref: epoll reference + * @events: epoll events bitmap + * @now: Current timestamp + */ +void udp_reply_sock_handler(const struct ctx *c, union epoll_ref ref, + uint32_t events, const struct timespec *now) +{ + udp_buf_reply_sock_handler(c, ref, events, now); +} + /** * udp_tap_handler() - Handle packets from tap * @c: Execution context diff --git a/udp_internal.h b/udp_internal.h new file mode 100644 index 0000000..cc80e30 --- /dev/null +++ b/udp_internal.h @@ -0,0 +1,34 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later + * Copyright (c) 2021 Red Hat GmbH + * Author: Stefano Brivio <sbrivio@redhat.com> + */ + +#ifndef UDP_INTERNAL_H +#define UDP_INTERNAL_H + +#include "tap.h" /* needed by udp_meta_t */ + +#define UDP_MAX_FRAMES 32 /* max # of frames to receive at once */ + +/** + * struct 
udp_payload_t - UDP header and data for inbound messages + * @uh: UDP header + * @data: UDP data + */ +struct udp_payload_t { + struct udphdr uh; + char data[USHRT_MAX - sizeof(struct udphdr)]; +#ifdef __AVX2__ +} __attribute__ ((packed, aligned(32))); +#else +} __attribute__ ((packed, aligned(__alignof__(unsigned int)))); +#endif + +size_t udp_update_hdr4(struct iphdr *ip4h, struct udp_payload_t *bp, + const struct flowside *toside, size_t dlen, + bool no_udp_csum); +size_t udp_update_hdr6(struct ipv6hdr *ip6h, struct udp_payload_t *bp, + const struct flowside *toside, size_t dlen, + bool no_udp_csum); +int udp_sock_errs(const struct ctx *c, int s, uint32_t events); +#endif /* UDP_INTERNAL_H */ From b7c292b758a165066b9042cfbac1a2e1d3d197c4 Mon Sep 17 00:00:00 2001 From: Laurent Vivier <lvivier@redhat.com> Date: Fri, 22 Nov 2024 17:43:32 +0100 Subject: [PATCH 137/382] tcp: Export headers functions Export tcp_fill_headers[4|6]() and tcp_update_check_tcp[4|6](). They'll be needed by vhost-user. 
Signed-off-by: Laurent Vivier <lvivier@redhat.com> Reviewed-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- tcp.c | 30 +++++++++++++++--------------- tcp_internal.h | 15 +++++++++++++++ 2 files changed, 30 insertions(+), 15 deletions(-) diff --git a/tcp.c b/tcp.c index 1eb85bb..e08ffd3 100644 --- a/tcp.c +++ b/tcp.c @@ -758,9 +758,9 @@ static void tcp_sock_set_bufsize(const struct ctx *c, int s) * @iov_cnt: Length of the array * @l4offset: IPv4 payload offset in the iovec array */ -static void tcp_update_check_tcp4(const struct iphdr *iph, - const struct iovec *iov, int iov_cnt, - size_t l4offset) +void tcp_update_check_tcp4(const struct iphdr *iph, + const struct iovec *iov, int iov_cnt, + size_t l4offset) { uint16_t l4len = ntohs(iph->tot_len) - sizeof(struct iphdr); struct in_addr saddr = { .s_addr = iph->saddr }; @@ -810,9 +810,9 @@ static void tcp_update_check_tcp4(const struct iphdr *iph, * @iov_cnt: Length of the array * @l4offset: IPv6 payload offset in the iovec array */ -static void tcp_update_check_tcp6(const struct ipv6hdr *ip6h, - const struct iovec *iov, int iov_cnt, - size_t l4offset) +void tcp_update_check_tcp6(const struct ipv6hdr *ip6h, + const struct iovec *iov, int iov_cnt, + size_t l4offset) { uint16_t l4len = ntohs(ip6h->payload_len); size_t check_ofs; @@ -978,11 +978,11 @@ static void tcp_fill_header(struct tcphdr *th, * * Return: The IPv4 payload length, host order */ -static size_t tcp_fill_headers4(const struct tcp_tap_conn *conn, - struct tap_hdr *taph, - struct iphdr *iph, struct tcp_payload_t *bp, - size_t dlen, const uint16_t *check, - uint32_t seq, bool no_tcp_csum) +size_t tcp_fill_headers4(const struct tcp_tap_conn *conn, + struct tap_hdr *taph, + struct iphdr *iph, struct tcp_payload_t *bp, + size_t dlen, const uint16_t *check, + uint32_t seq, bool no_tcp_csum) { const struct flowside *tapside = TAPFLOW(conn); const struct in_addr *src4 = inany_v4(&tapside->oaddr); @@ -1030,10 
+1030,10 @@ static size_t tcp_fill_headers4(const struct tcp_tap_conn *conn, * * Return: The IPv6 payload length, host order */ -static size_t tcp_fill_headers6(const struct tcp_tap_conn *conn, - struct tap_hdr *taph, - struct ipv6hdr *ip6h, struct tcp_payload_t *bp, - size_t dlen, uint32_t seq, bool no_tcp_csum) +size_t tcp_fill_headers6(const struct tcp_tap_conn *conn, + struct tap_hdr *taph, + struct ipv6hdr *ip6h, struct tcp_payload_t *bp, + size_t dlen, uint32_t seq, bool no_tcp_csum) { const struct flowside *tapside = TAPFLOW(conn); size_t l4len = dlen + sizeof(bp->th); diff --git a/tcp_internal.h b/tcp_internal.h index c846f60..8625eed 100644 --- a/tcp_internal.h +++ b/tcp_internal.h @@ -162,6 +162,21 @@ void tcp_rst_do(const struct ctx *c, struct tcp_tap_conn *conn); struct tcp_info_linux; +void tcp_update_check_tcp4(const struct iphdr *iph, + const struct iovec *iov, int iov_cnt, + size_t l4offset); +void tcp_update_check_tcp6(const struct ipv6hdr *ip6h, + const struct iovec *iov, int iov_cnt, + size_t l4offset); +size_t tcp_fill_headers4(const struct tcp_tap_conn *conn, + struct tap_hdr *taph, + struct iphdr *iph, struct tcp_payload_t *bp, + size_t dlen, const uint16_t *check, + uint32_t seq, bool no_tcp_csum); +size_t tcp_fill_headers6(const struct tcp_tap_conn *conn, + struct tap_hdr *taph, + struct ipv6hdr *ip6h, struct tcp_payload_t *bp, + size_t dlen, uint32_t seq, bool no_tcp_csum); size_t tcp_l2_buf_fill_headers(const struct tcp_tap_conn *conn, struct iovec *iov, size_t dlen, const uint16_t *check, uint32_t seq, From b2e62f7e85ac77a91daf5d77b7f32198ef0e59c2 Mon Sep 17 00:00:00 2001 From: Laurent Vivier <lvivier@redhat.com> Date: Fri, 22 Nov 2024 17:43:33 +0100 Subject: [PATCH 138/382] passt: rename tap_sock_init() to tap_backend_init() Extract pool storage initialization loop to tap_sock_update_pool(), extract QEMU hints to tap_backend_show_hints(). 
Signed-off-by: Laurent Vivier <lvivier@redhat.com> Reviewed-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- passt.c | 2 +- tap.c | 56 +++++++++++++++++++++++++++++++++++++++++--------------- tap.h | 2 +- 3 files changed, 43 insertions(+), 17 deletions(-) diff --git a/passt.c b/passt.c index 06e0a33..8a37407 100644 --- a/passt.c +++ b/passt.c @@ -245,7 +245,7 @@ int main(int argc, char **argv) pasta_netns_quit_init(&c); - tap_sock_init(&c); + tap_backend_init(&c); random_init(&c); diff --git a/tap.c b/tap.c index 5347df4..b489430 100644 --- a/tap.c +++ b/tap.c @@ -1193,11 +1193,31 @@ int tap_sock_unix_open(char *sock_path) return fd; } +/** + * tap_backend_show_hints() - Give help information to start QEMU + * @c: Execution context + */ +static void tap_backend_show_hints(struct ctx *c) +{ + switch (c->mode) { + case MODE_PASTA: + /* No hints */ + break; + case MODE_PASST: + info("\nYou can now start qemu (>= 7.2, with commit 13c6be96618c):"); + info(" kvm ... -device virtio-net-pci,netdev=s -netdev stream,id=s,server=off,addr.type=unix,addr.path=%s", + c->sock_path); + info("or qrap, for earlier qemu versions:"); + info(" ./qrap 5 kvm ... -net socket,fd=5 -net nic,model=virtio"); + break; + } +} + /** * tap_sock_unix_init() - Start listening for connections on AF_UNIX socket * @c: Execution context */ -static void tap_sock_unix_init(struct ctx *c) +static void tap_sock_unix_init(const struct ctx *c) { union epoll_ref ref = { .type = EPOLL_TYPE_TAP_LISTEN }; struct epoll_event ev = { 0 }; @@ -1208,12 +1228,6 @@ static void tap_sock_unix_init(struct ctx *c) ev.events = EPOLLIN | EPOLLET; ev.data.u64 = ref.u64; epoll_ctl(c->epollfd, EPOLL_CTL_ADD, c->fd_tap_listen, &ev); - - info("\nYou can now start qemu (>= 7.2, with commit 13c6be96618c):"); - info(" kvm ... 
-device virtio-net-pci,netdev=s -netdev stream,id=s,server=off,addr.type=unix,addr.path=%s", - c->sock_path); - info("or qrap, for earlier qemu versions:"); - info(" ./qrap 5 kvm ... -net socket,fd=5 -net nic,model=virtio"); } /** @@ -1326,21 +1340,31 @@ static void tap_sock_tun_init(struct ctx *c) } /** - * tap_sock_init() - Create and set up AF_UNIX socket or tuntap file descriptor - * @c: Execution context + * tap_sock_update_pool() - Set the buffer base and size for the pool of packets + * @base: Buffer base + * @size Buffer size */ -void tap_sock_init(struct ctx *c) +static void tap_sock_update_pool(void *base, size_t size) { - size_t sz = sizeof(pkt_buf); int i; - pool_tap4_storage = PACKET_INIT(pool_tap4, TAP_MSGS, pkt_buf, sz); - pool_tap6_storage = PACKET_INIT(pool_tap6, TAP_MSGS, pkt_buf, sz); + pool_tap4_storage = PACKET_INIT(pool_tap4, TAP_MSGS, base, size); + pool_tap6_storage = PACKET_INIT(pool_tap6, TAP_MSGS, base, size); for (i = 0; i < TAP_SEQS; i++) { - tap4_l4[i].p = PACKET_INIT(pool_l4, UIO_MAXIOV, pkt_buf, sz); - tap6_l4[i].p = PACKET_INIT(pool_l4, UIO_MAXIOV, pkt_buf, sz); + tap4_l4[i].p = PACKET_INIT(pool_l4, UIO_MAXIOV, base, size); + tap6_l4[i].p = PACKET_INIT(pool_l4, UIO_MAXIOV, base, size); } +} + +/** + * tap_backend_init() - Create and set up AF_UNIX socket or + * tuntap file descriptor + * @c: Execution context + */ +void tap_backend_init(struct ctx *c) +{ + tap_sock_update_pool(pkt_buf, sizeof(pkt_buf)); if (c->fd_tap != -1) { /* Passed as --fd */ struct epoll_event ev = { 0 }; @@ -1370,4 +1394,6 @@ void tap_sock_init(struct ctx *c) */ memset(&c->guest_mac, 0xff, sizeof(c->guest_mac)); } + + tap_backend_show_hints(c); } diff --git a/tap.h b/tap.h index 85f1e84..8728cc5 100644 --- a/tap.h +++ b/tap.h @@ -68,7 +68,7 @@ void tap_handler_pasta(struct ctx *c, uint32_t events, void tap_handler_passt(struct ctx *c, uint32_t events, const struct timespec *now); int tap_sock_unix_open(char *sock_path); -void tap_sock_init(struct ctx *c); 
+void tap_backend_init(struct ctx *c); void tap_flush_pools(void); void tap_handler(struct ctx *c, const struct timespec *now); void tap_add_packet(struct ctx *c, ssize_t l2len, char *p); From 28997fcb29b560fc0dcfd91bad5eece3ded5eb72 Mon Sep 17 00:00:00 2001 From: Laurent Vivier <lvivier@redhat.com> Date: Fri, 22 Nov 2024 17:43:34 +0100 Subject: [PATCH 139/382] vhost-user: add vhost-user add virtio and vhost-user functions to connect with QEMU. $ ./passt --vhost-user and # qemu-system-x86_64 ... -m 4G \ -object memory-backend-memfd,id=memfd0,share=on,size=4G \ -numa node,memdev=memfd0 \ -chardev socket,id=chr0,path=/tmp/passt_1.socket \ -netdev vhost-user,id=netdev0,chardev=chr0 \ -device virtio-net,mac=9a:2b:2c:2d:2e:2f,netdev=netdev0 \ ... Signed-off-by: Laurent Vivier <lvivier@redhat.com> Reviewed-by: David Gibson <david@gibson.dropbear.id.au> [sbrivio: as suggested by lvivier, include <netinet/if_ether.h> before including <linux/if_ether.h> as C libraries such as musl __UAPI_DEF_ETHHDR in <netinet/if_ether.h> if they already have a definition of struct ethhdr] Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- Makefile | 6 +- conf.c | 19 +- epoll_type.h | 4 + iov.c | 1 - isolation.c | 17 +- packet.c | 11 ++ packet.h | 8 +- passt.1 | 10 +- passt.c | 9 + passt.h | 7 + pcap.c | 1 - tap.c | 77 ++++++-- tap.h | 5 +- tcp.c | 7 + tcp_vu.c | 498 +++++++++++++++++++++++++++++++++++++++++++++++++++ tcp_vu.h | 12 ++ udp.c | 11 ++ udp_vu.c | 343 +++++++++++++++++++++++++++++++++++ udp_vu.h | 13 ++ vhost_user.c | 41 +++-- vhost_user.h | 4 +- virtio.c | 5 - vu_common.c | 283 +++++++++++++++++++++++++++++ vu_common.h | 60 +++++++ 24 files changed, 1399 insertions(+), 53 deletions(-) create mode 100644 tcp_vu.c create mode 100644 tcp_vu.h create mode 100644 udp_vu.c create mode 100644 udp_vu.h create mode 100644 vu_common.c create mode 100644 vu_common.h diff --git a/Makefile b/Makefile index bcb084e..faa5c23 100644 --- a/Makefile +++ b/Makefile @@ -37,7 +37,8 @@ FLAGS += 
-DDUAL_STACK_SOCKETS=$(DUAL_STACK_SOCKETS) PASST_SRCS = arch.c arp.c checksum.c conf.c dhcp.c dhcpv6.c flow.c fwd.c \ icmp.c igmp.c inany.c iov.c ip.c isolation.c lineread.c log.c mld.c \ ndp.c netlink.c packet.c passt.c pasta.c pcap.c pif.c tap.c tcp.c \ - tcp_buf.c tcp_splice.c udp.c udp_flow.c util.c vhost_user.c virtio.c + tcp_buf.c tcp_splice.c tcp_vu.c udp.c udp_flow.c udp_vu.c util.c \ + vhost_user.c virtio.c vu_common.c QRAP_SRCS = qrap.c SRCS = $(PASST_SRCS) $(QRAP_SRCS) @@ -47,7 +48,8 @@ PASST_HEADERS = arch.h arp.h checksum.h conf.h dhcp.h dhcpv6.h flow.h fwd.h \ flow_table.h icmp.h icmp_flow.h inany.h iov.h ip.h isolation.h \ lineread.h log.h ndp.h netlink.h packet.h passt.h pasta.h pcap.h pif.h \ siphash.h tap.h tcp.h tcp_buf.h tcp_conn.h tcp_internal.h tcp_splice.h \ - udp.h udp_flow.h util.h vhost_user.h virtio.h + tcp_vu.h udp.h udp_flow.h udp_internal.h udp_vu.h util.h vhost_user.h \ + virtio.h vu_common.h HEADERS = $(PASST_HEADERS) seccomp.h C := \#include <sys/random.h>\nint main(){int a=getrandom(0, 0, 0);} diff --git a/conf.c b/conf.c index c6bffc4..eaa7d99 100644 --- a/conf.c +++ b/conf.c @@ -45,6 +45,7 @@ #include "lineread.h" #include "isolation.h" #include "log.h" +#include "vhost_user.h" #define NETNS_RUN_DIR "/run/netns" @@ -807,9 +808,14 @@ static void usage(const char *name, FILE *f, int status) " default: same interface name as external one\n"); } else { FPRINTF(f, - " -s, --socket PATH UNIX domain socket path\n" + " -s, --socket, --socket-path PATH UNIX domain socket path\n" " default: probe free path starting from " UNIX_SOCK_PATH "\n", 1); + FPRINTF(f, + " --vhost-user Enable vhost-user mode\n" + " UNIX domain socket is provided by -s option\n" + " --print-capabilities print back-end capabilities in JSON format,\n" + " only meaningful for vhost-user mode\n"); } FPRINTF(f, @@ -1345,6 +1351,10 @@ void conf(struct ctx *c, int argc, char **argv) {"map-guest-addr", required_argument, NULL, 22 }, {"host-lo-to-ns-lo", no_argument, NULL, 23 
}, {"dns-host", required_argument, NULL, 24 }, + {"vhost-user", no_argument, NULL, 25 }, + /* vhost-user backend program convention */ + {"print-capabilities", no_argument, NULL, 26 }, + {"socket-path", required_argument, NULL, 's' }, { 0 }, }; const char *logname = (c->mode == MODE_PASTA) ? "pasta" : "passt"; @@ -1538,6 +1548,13 @@ void conf(struct ctx *c, int argc, char **argv) break; die("Invalid host nameserver address: %s", optarg); + case 25: + if (c->mode == MODE_PASTA) + die("--vhost-user is for passt mode only"); + c->mode = MODE_VU; + break; + case 26: + vu_print_capabilities(); break; case 'd': c->debug = 1; diff --git a/epoll_type.h b/epoll_type.h index 0ad1efa..f3ef415 100644 --- a/epoll_type.h +++ b/epoll_type.h @@ -36,6 +36,10 @@ enum epoll_type { EPOLL_TYPE_TAP_PASST, /* socket listening for qemu socket connections */ EPOLL_TYPE_TAP_LISTEN, + /* vhost-user command socket */ + EPOLL_TYPE_VHOST_CMD, + /* vhost-user kick event socket */ + EPOLL_TYPE_VHOST_KICK, EPOLL_NUM_TYPES, }; diff --git a/iov.c b/iov.c index 3f9e229..3741db2 100644 --- a/iov.c +++ b/iov.c @@ -68,7 +68,6 @@ size_t iov_skip_bytes(const struct iovec *iov, size_t n, * * Returns: The number of bytes successfully copied. 
*/ -/* cppcheck-suppress unusedFunction */ size_t iov_from_buf(const struct iovec *iov, size_t iov_cnt, size_t offset, const void *buf, size_t bytes) { diff --git a/isolation.c b/isolation.c index 45fba1e..c944fb3 100644 --- a/isolation.c +++ b/isolation.c @@ -379,12 +379,21 @@ void isolate_postfork(const struct ctx *c) prctl(PR_SET_DUMPABLE, 0); - if (c->mode == MODE_PASTA) { - prog.len = (unsigned short)ARRAY_SIZE(filter_pasta); - prog.filter = filter_pasta; - } else { + switch (c->mode) { + case MODE_PASST: prog.len = (unsigned short)ARRAY_SIZE(filter_passt); prog.filter = filter_passt; + break; + case MODE_PASTA: + prog.len = (unsigned short)ARRAY_SIZE(filter_pasta); + prog.filter = filter_pasta; + break; + case MODE_VU: + prog.len = (unsigned short)ARRAY_SIZE(filter_vu); + prog.filter = filter_vu; + break; + default: + ASSERT(0); } if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) || diff --git a/packet.c b/packet.c index 3748996..e5a78d0 100644 --- a/packet.c +++ b/packet.c @@ -36,6 +36,17 @@ static int packet_check_range(const struct pool *p, size_t offset, size_t len, const char *start, const char *func, int line) { + if (p->buf_size == 0) { + int ret; + + ret = vu_packet_check_range((void *)p->buf, offset, len, start); + + if (ret == -1) + trace("cannot find region, %s:%i", func, line); + + return ret; + } + if (start < p->buf) { trace("packet start %p before buffer start %p, " "%s:%i", (void *)start, (void *)p->buf, func, line); diff --git a/packet.h b/packet.h index 8377dcf..3f70e94 100644 --- a/packet.h +++ b/packet.h @@ -8,8 +8,10 @@ /** * struct pool - Generic pool of packets stored in a buffer - * @buf: Buffer storing packet descriptors - * @buf_size: Total size of buffer + * @buf: Buffer storing packet descriptors, + * a struct vu_dev_region array for passt vhost-user mode + * @buf_size: Total size of buffer, + * 0 for passt vhost-user mode * @size: Number of usable descriptors for the pool * @count: Number of used descriptors for the pool * @pkt: 
Descriptors: see macros below @@ -22,6 +24,8 @@ struct pool { struct iovec pkt[1]; }; +int vu_packet_check_range(void *buf, size_t offset, size_t len, + const char *start); void packet_add_do(struct pool *p, size_t len, const char *start, const char *func, int line); void *packet_get_do(const struct pool *p, const size_t idx, diff --git a/passt.1 b/passt.1 index 15c8338..b2896a2 100644 --- a/passt.1 +++ b/passt.1 @@ -404,12 +404,20 @@ interface address are configured on a given host interface. .SS \fBpasst\fR-only options .TP -.BR \-s ", " \-\-socket " " \fIpath +.BR \-s ", " \-\-socket-path ", " \-\-socket " " \fIpath Path for UNIX domain socket used by \fBqemu\fR(1) or \fBqrap\fR(1) to connect to \fBpasst\fR. Default is to probe a free socket, not accepting connections, starting from \fI/tmp/passt_1.socket\fR to \fI/tmp/passt_64.socket\fR. +.TP +.BR \-\-vhost-user +Enable vhost-user. The vhost-user command socket is provided by \fB--socket\fR. + +.TP +.BR \-\-print-capabilities +Print back-end capabilities in JSON format, only meaningful for vhost-user mode. + .TP .BR \-F ", " \-\-fd " " \fIFD Pass a pre-opened, connected socket to \fBpasst\fR. 
Usually the socket is opened diff --git a/passt.c b/passt.c index 8a37407..957f3d0 100644 --- a/passt.c +++ b/passt.c @@ -50,6 +50,7 @@ #include "log.h" #include "tcp_splice.h" #include "ndp.h" +#include "vu_common.h" #define EPOLL_EVENTS 8 @@ -72,6 +73,8 @@ char *epoll_type_str[] = { [EPOLL_TYPE_TAP_PASTA] = "/dev/net/tun device", [EPOLL_TYPE_TAP_PASST] = "connected qemu socket", [EPOLL_TYPE_TAP_LISTEN] = "listening qemu socket", + [EPOLL_TYPE_VHOST_CMD] = "vhost-user command socket", + [EPOLL_TYPE_VHOST_KICK] = "vhost-user kick socket", }; static_assert(ARRAY_SIZE(epoll_type_str) == EPOLL_NUM_TYPES, "epoll_type_str[] doesn't match enum epoll_type"); @@ -347,6 +350,12 @@ loop: case EPOLL_TYPE_PING: icmp_sock_handler(&c, ref); break; + case EPOLL_TYPE_VHOST_CMD: + vu_control_handler(c.vdev, c.fd_tap, eventmask); + break; + case EPOLL_TYPE_VHOST_KICK: + vu_kick_cb(c.vdev, ref, &now); + break; default: /* Can't happen */ ASSERT(0); diff --git a/passt.h b/passt.h index 799ee50..c038630 100644 --- a/passt.h +++ b/passt.h @@ -25,6 +25,7 @@ union epoll_ref; #include "fwd.h" #include "tcp.h" #include "udp.h" +#include "vhost_user.h" /* Default address for our end on the tap interface. Bit 0 of byte 0 must be 0 * (unicast) and bit 1 of byte 1 must be 1 (locally administered). 
Otherwise @@ -43,6 +44,7 @@ union epoll_ref; * @icmp: ICMP-specific reference part * @data: Data handled by protocol handlers * @nsdir_fd: netns dirfd for fallback timer checking if namespace is gone + * @queue: vhost-user queue index for this fd * @u64: Opaque reference for epoll_ctl() and epoll_wait() */ union epoll_ref { @@ -58,6 +60,7 @@ union epoll_ref { union udp_listen_epoll_ref udp; uint32_t data; int nsdir_fd; + int queue; }; }; uint64_t u64; @@ -94,6 +97,7 @@ struct fqdn { enum passt_modes { MODE_PASST, MODE_PASTA, + MODE_VU, }; /** @@ -229,6 +233,7 @@ struct ip6_ctx { * @freebind: Allow binding of non-local addresses for forwarding * @low_wmem: Low probed net.core.wmem_max * @low_rmem: Low probed net.core.rmem_max + * @vdev: vhost-user device */ struct ctx { enum passt_modes mode; @@ -291,6 +296,8 @@ struct ctx { int low_wmem; int low_rmem; + + struct vu_dev *vdev; }; void proto_update_l2_buf(const unsigned char *eth_d, diff --git a/pcap.c b/pcap.c index 23205dd..3d623cf 100644 --- a/pcap.c +++ b/pcap.c @@ -143,7 +143,6 @@ void pcap_multiple(const struct iovec *iov, size_t frame_parts, unsigned int n, * @iovcnt: Number of buffers (@iov entries) * @offset: Offset of the L2 frame within the full data length */ -/* cppcheck-suppress unusedFunction */ void pcap_iov(const struct iovec *iov, size_t iovcnt, size_t offset) { struct timespec now = { 0 }; diff --git a/tap.c b/tap.c index b489430..cde1719 100644 --- a/tap.c +++ b/tap.c @@ -58,6 +58,8 @@ #include "packet.h" #include "tap.h" #include "log.h" +#include "vhost_user.h" +#include "vu_common.h" /* IPv4 (plus ARP) and IPv6 message batches from tap/guest to IP handlers */ static PACKET_POOL_NOINIT(pool_tap4, TAP_MSGS, pkt_buf); @@ -78,16 +80,22 @@ void tap_send_single(const struct ctx *c, const void *data, size_t l2len) struct iovec iov[2]; size_t iovcnt = 0; - if (c->mode == MODE_PASST) { + switch (c->mode) { + case MODE_PASST: iov[iovcnt] = IOV_OF_LVALUE(vnet_len); iovcnt++; + /* fall through */ + case 
MODE_PASTA: + iov[iovcnt].iov_base = (void *)data; + iov[iovcnt].iov_len = l2len; + iovcnt++; + + tap_send_frames(c, iov, iovcnt, 1); + break; + case MODE_VU: + vu_send_single(c, data, l2len); + break; } - - iov[iovcnt].iov_base = (void *)data; - iov[iovcnt].iov_len = l2len; - iovcnt++; - - tap_send_frames(c, iov, iovcnt, 1); } /** @@ -414,10 +422,18 @@ size_t tap_send_frames(const struct ctx *c, const struct iovec *iov, if (!nframes) return 0; - if (c->mode == MODE_PASTA) + switch (c->mode) { + case MODE_PASTA: m = tap_send_frames_pasta(c, iov, bufs_per_frame, nframes); - else + break; + case MODE_PASST: m = tap_send_frames_passt(c, iov, bufs_per_frame, nframes); + break; + case MODE_VU: + /* fall through */ + default: + ASSERT(0); + } if (m < nframes) debug("tap: failed to send %zu frames of %zu", @@ -979,7 +995,7 @@ void tap_add_packet(struct ctx *c, ssize_t l2len, char *p) * tap_sock_reset() - Handle closing or failure of connect AF_UNIX socket * @c: Execution context */ -static void tap_sock_reset(struct ctx *c) +void tap_sock_reset(struct ctx *c) { info("Client connection closed%s", c->one_off ? ", exiting" : ""); @@ -990,6 +1006,8 @@ static void tap_sock_reset(struct ctx *c) epoll_ctl(c->epollfd, EPOLL_CTL_DEL, c->fd_tap, NULL); close(c->fd_tap); c->fd_tap = -1; + if (c->mode == MODE_VU) + vu_cleanup(c->vdev); } /** @@ -1210,6 +1228,11 @@ static void tap_backend_show_hints(struct ctx *c) info("or qrap, for earlier qemu versions:"); info(" ./qrap 5 kvm ... -net socket,fd=5 -net nic,model=virtio"); break; + case MODE_VU: + info("You can start qemu with:"); + info(" kvm ... 
-chardev socket,id=chr0,path=%s -netdev vhost-user,id=netdev0,chardev=chr0 -device virtio-net,netdev=netdev0 -object memory-backend-memfd,id=memfd0,share=on,size=$RAMSIZE -numa node,memdev=memfd0\n", + c->sock_path); + break; } } @@ -1237,8 +1260,8 @@ static void tap_sock_unix_init(const struct ctx *c) */ void tap_listen_handler(struct ctx *c, uint32_t events) { - union epoll_ref ref = { .type = EPOLL_TYPE_TAP_PASST }; struct epoll_event ev = { 0 }; + union epoll_ref ref = { 0 }; int v = INT_MAX / 2; struct ucred ucred; socklen_t len; @@ -1278,6 +1301,10 @@ void tap_listen_handler(struct ctx *c, uint32_t events) trace("tap: failed to set SO_SNDBUF to %i", v); ref.fd = c->fd_tap; + if (c->mode == MODE_VU) + ref.type = EPOLL_TYPE_VHOST_CMD; + else + ref.type = EPOLL_TYPE_TAP_PASST; ev.events = EPOLLIN | EPOLLRDHUP; ev.data.u64 = ref.u64; epoll_ctl(c->epollfd, EPOLL_CTL_ADD, c->fd_tap, &ev); @@ -1344,7 +1371,7 @@ static void tap_sock_tun_init(struct ctx *c) * @base: Buffer base * @size Buffer size */ -static void tap_sock_update_pool(void *base, size_t size) +void tap_sock_update_pool(void *base, size_t size) { int i; @@ -1364,7 +1391,10 @@ static void tap_sock_update_pool(void *base, size_t size) */ void tap_backend_init(struct ctx *c) { - tap_sock_update_pool(pkt_buf, sizeof(pkt_buf)); + if (c->mode == MODE_VU) + tap_sock_update_pool(NULL, 0); + else + tap_sock_update_pool(pkt_buf, sizeof(pkt_buf)); if (c->fd_tap != -1) { /* Passed as --fd */ struct epoll_event ev = { 0 }; @@ -1372,10 +1402,17 @@ void tap_backend_init(struct ctx *c) ASSERT(c->one_off); ref.fd = c->fd_tap; - if (c->mode == MODE_PASST) + switch (c->mode) { + case MODE_PASST: ref.type = EPOLL_TYPE_TAP_PASST; - else + break; + case MODE_PASTA: ref.type = EPOLL_TYPE_TAP_PASTA; + break; + case MODE_VU: + ref.type = EPOLL_TYPE_VHOST_CMD; + break; + } ev.events = EPOLLIN | EPOLLRDHUP; ev.data.u64 = ref.u64; @@ -1383,9 +1420,14 @@ void tap_backend_init(struct ctx *c) return; } - if (c->mode == MODE_PASTA) { 
+ switch (c->mode) { + case MODE_PASTA: tap_sock_tun_init(c); - } else { + break; + case MODE_VU: + vu_init(c); + /* fall through */ + case MODE_PASST: tap_sock_unix_init(c); /* In passt mode, we don't know the guest's MAC address until it @@ -1393,6 +1435,7 @@ void tap_backend_init(struct ctx *c) * first packets will reach it. */ memset(&c->guest_mac, 0xff, sizeof(c->guest_mac)); + break; } tap_backend_show_hints(c); diff --git a/tap.h b/tap.h index 8728cc5..dfbd8b9 100644 --- a/tap.h +++ b/tap.h @@ -40,7 +40,8 @@ static inline struct iovec tap_hdr_iov(const struct ctx *c, */ static inline void tap_hdr_update(struct tap_hdr *thdr, size_t l2len) { - thdr->vnet_len = htonl(l2len); + if (thdr) + thdr->vnet_len = htonl(l2len); } void tap_udp4_send(const struct ctx *c, struct in_addr src, in_port_t sport, @@ -68,6 +69,8 @@ void tap_handler_pasta(struct ctx *c, uint32_t events, void tap_handler_passt(struct ctx *c, uint32_t events, const struct timespec *now); int tap_sock_unix_open(char *sock_path); +void tap_sock_reset(struct ctx *c); +void tap_sock_update_pool(void *base, size_t size); void tap_backend_init(struct ctx *c); void tap_flush_pools(void); void tap_handler(struct ctx *c, const struct timespec *now); diff --git a/tcp.c b/tcp.c index e08ffd3..e197a1a 100644 --- a/tcp.c +++ b/tcp.c @@ -304,6 +304,7 @@ #include "flow_table.h" #include "tcp_internal.h" #include "tcp_buf.h" +#include "tcp_vu.h" /* MSS rounding: see SET_MSS() */ #define MSS_DEFAULT 536 @@ -1314,6 +1315,9 @@ int tcp_prepare_flags(const struct ctx *c, struct tcp_tap_conn *conn, static int tcp_send_flag(const struct ctx *c, struct tcp_tap_conn *conn, int flags) { + if (c->mode == MODE_VU) + return tcp_vu_send_flag(c, conn, flags); + return tcp_buf_send_flag(c, conn, flags); } @@ -1707,6 +1711,9 @@ static int tcp_sock_consume(const struct tcp_tap_conn *conn, uint32_t ack_seq) */ static int tcp_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn) { + if (c->mode == MODE_VU) + return 
tcp_vu_data_from_sock(c, conn); + return tcp_buf_data_from_sock(c, conn); } diff --git a/tcp_vu.c b/tcp_vu.c new file mode 100644 index 0000000..1bebb31 --- /dev/null +++ b/tcp_vu.c @@ -0,0 +1,498 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* tcp_vu.c - TCP L2 vhost-user management functions + * + * Copyright Red Hat + * Author: Laurent Vivier <lvivier@redhat.com> + */ + +#include <errno.h> +#include <stddef.h> +#include <stdint.h> + +#include <netinet/ip.h> +#include <netinet/tcp.h> + +#include <sys/socket.h> + +#include <netinet/if_ether.h> +#include <linux/virtio_net.h> + +#include "util.h" +#include "ip.h" +#include "passt.h" +#include "siphash.h" +#include "inany.h" +#include "vhost_user.h" +#include "tcp.h" +#include "pcap.h" +#include "flow.h" +#include "tcp_conn.h" +#include "flow_table.h" +#include "tcp_vu.h" +#include "tap.h" +#include "tcp_internal.h" +#include "checksum.h" +#include "vu_common.h" +#include <time.h> + +static struct iovec iov_vu[VIRTQUEUE_MAX_SIZE + 1]; +static struct vu_virtq_element elem[VIRTQUEUE_MAX_SIZE]; +static int head[VIRTQUEUE_MAX_SIZE + 1]; +static int head_cnt; + +/** + * tcp_vu_hdrlen() - return the size of the header in level 2 frame (TCP) + * @v6: Set for IPv6 packet + * + * Return: Return the size of the header + */ +static size_t tcp_vu_hdrlen(bool v6) +{ + size_t hdrlen; + + hdrlen = sizeof(struct virtio_net_hdr_mrg_rxbuf) + + sizeof(struct ethhdr) + sizeof(struct tcphdr); + + if (v6) + hdrlen += sizeof(struct ipv6hdr); + else + hdrlen += sizeof(struct iphdr); + + return hdrlen; +} + +/** + * tcp_vu_update_check() - Calculate TCP checksum + * @tapside: Address information for one side of the flow + * @iov: Pointer to the array of IO vectors + * @iov_cnt: Length of the array + */ +static void tcp_vu_update_check(const struct flowside *tapside, + struct iovec *iov, int iov_cnt) +{ + char *base = iov[0].iov_base; + + if (inany_v4(&tapside->oaddr)) { + const struct iphdr *iph = vu_ip(base); + + 
tcp_update_check_tcp4(iph, iov, iov_cnt, + (char *)vu_payloadv4(base) - base); + } else { + const struct ipv6hdr *ip6h = vu_ip(base); + + tcp_update_check_tcp6(ip6h, iov, iov_cnt, + (char *)vu_payloadv6(base) - base); + } +} + +/** + * tcp_vu_send_flag() - Send segment with flags to vhost-user (no payload) + * @c: Execution context + * @conn: Connection pointer + * @flags: TCP flags: if not set, send segment only if ACK is due + * + * Return: negative error code on connection reset, 0 otherwise + */ +int tcp_vu_send_flag(const struct ctx *c, struct tcp_tap_conn *conn, int flags) +{ + struct vu_dev *vdev = c->vdev; + struct vu_virtq *vq = &vdev->vq[VHOST_USER_RX_QUEUE]; + const struct flowside *tapside = TAPFLOW(conn); + size_t l2len, l4len, optlen, hdrlen; + struct vu_virtq_element flags_elem[2]; + struct tcp_payload_t *payload; + struct ipv6hdr *ip6h = NULL; + struct iovec flags_iov[2]; + struct iphdr *iph = NULL; + struct ethhdr *eh; + uint32_t seq; + int elem_cnt; + int nb_ack; + int ret; + + hdrlen = tcp_vu_hdrlen(CONN_V6(conn)); + + vu_set_element(&flags_elem[0], NULL, &flags_iov[0]); + + elem_cnt = vu_collect(vdev, vq, &flags_elem[0], 1, + hdrlen + sizeof(struct tcp_syn_opts), NULL); + if (elem_cnt != 1) + return -1; + + ASSERT(flags_elem[0].in_sg[0].iov_len >= + hdrlen + sizeof(struct tcp_syn_opts)); + + vu_set_vnethdr(vdev, flags_elem[0].in_sg[0].iov_base, 1); + + eh = vu_eth(flags_elem[0].in_sg[0].iov_base); + + memcpy(eh->h_dest, c->guest_mac, sizeof(eh->h_dest)); + memcpy(eh->h_source, c->our_tap_mac, sizeof(eh->h_source)); + + if (CONN_V4(conn)) { + eh->h_proto = htons(ETH_P_IP); + + iph = vu_ip(flags_elem[0].in_sg[0].iov_base); + *iph = (struct iphdr)L2_BUF_IP4_INIT(IPPROTO_TCP); + + payload = vu_payloadv4(flags_elem[0].in_sg[0].iov_base); + } else { + eh->h_proto = htons(ETH_P_IPV6); + + ip6h = vu_ip(flags_elem[0].in_sg[0].iov_base); + *ip6h = (struct ipv6hdr)L2_BUF_IP6_INIT(IPPROTO_TCP); + payload = vu_payloadv6(flags_elem[0].in_sg[0].iov_base); + } 
+ + memset(&payload->th, 0, sizeof(payload->th)); + payload->th.doff = offsetof(struct tcp_payload_t, data) / 4; + payload->th.ack = 1; + + seq = conn->seq_to_tap; + ret = tcp_prepare_flags(c, conn, flags, &payload->th, + (struct tcp_syn_opts *)payload->data, + &optlen); + if (ret <= 0) { + vu_queue_rewind(vq, 1); + return ret; + } + + if (CONN_V4(conn)) { + l4len = tcp_fill_headers4(conn, NULL, iph, payload, optlen, + NULL, seq, true); + l2len = sizeof(*iph); + } else { + l4len = tcp_fill_headers6(conn, NULL, ip6h, payload, optlen, + seq, true); + l2len = sizeof(*ip6h); + } + l2len += l4len + sizeof(struct ethhdr); + + flags_elem[0].in_sg[0].iov_len = l2len + + sizeof(struct virtio_net_hdr_mrg_rxbuf); + if (*c->pcap) { + tcp_vu_update_check(tapside, &flags_elem[0].in_sg[0], 1); + pcap_iov(&flags_elem[0].in_sg[0], 1, + sizeof(struct virtio_net_hdr_mrg_rxbuf)); + } + nb_ack = 1; + + if (flags & DUP_ACK) { + vu_set_element(&flags_elem[1], NULL, &flags_iov[1]); + + elem_cnt = vu_collect(vdev, vq, &flags_elem[1], 1, + flags_elem[0].in_sg[0].iov_len, NULL); + if (elem_cnt == 1 && + flags_elem[1].in_sg[0].iov_len >= + flags_elem[0].in_sg[0].iov_len) { + memcpy(flags_elem[1].in_sg[0].iov_base, + flags_elem[0].in_sg[0].iov_base, + flags_elem[0].in_sg[0].iov_len); + nb_ack++; + + if (*c->pcap) { + pcap_iov(&flags_elem[1].in_sg[0], 1, + sizeof(struct virtio_net_hdr_mrg_rxbuf)); + } + } + } + + vu_flush(vdev, vq, flags_elem, nb_ack); + + return 0; +} + +/** tcp_vu_sock_recv() - Receive datastream from socket into vhost-user buffers + * @c: Execution context + * @conn: Connection pointer + * @v6: Set for IPv6 connections + * @already_sent: Number of bytes already sent + * @fillsize: Maximum bytes to fill in guest-side receiving window + * @iov_cnt: number of iov (output) + * + * Return: Number of iov entries used to store the data or negative error code + */ +static ssize_t tcp_vu_sock_recv(const struct ctx *c, + const struct tcp_tap_conn *conn, bool v6, + uint32_t 
already_sent, size_t fillsize, + int *iov_cnt) +{ + struct vu_dev *vdev = c->vdev; + struct vu_virtq *vq = &vdev->vq[VHOST_USER_RX_QUEUE]; + struct msghdr mh_sock = { 0 }; + uint16_t mss = MSS_GET(conn); + int s = conn->sock; + ssize_t ret, len; + size_t hdrlen; + int elem_cnt; + int i; + + *iov_cnt = 0; + + hdrlen = tcp_vu_hdrlen(v6); + + vu_init_elem(elem, &iov_vu[1], VIRTQUEUE_MAX_SIZE); + + elem_cnt = 0; + head_cnt = 0; + while (fillsize > 0 && elem_cnt < VIRTQUEUE_MAX_SIZE) { + struct iovec *iov; + size_t frame_size, dlen; + int cnt; + + cnt = vu_collect(vdev, vq, &elem[elem_cnt], + VIRTQUEUE_MAX_SIZE - elem_cnt, + MIN(mss, fillsize) + hdrlen, &frame_size); + if (cnt == 0) + break; + + dlen = frame_size - hdrlen; + + /* reserve space for headers in iov */ + iov = &elem[elem_cnt].in_sg[0]; + ASSERT(iov->iov_len >= hdrlen); + iov->iov_base = (char *)iov->iov_base + hdrlen; + iov->iov_len -= hdrlen; + head[head_cnt++] = elem_cnt; + + fillsize -= dlen; + elem_cnt += cnt; + } + + if (peek_offset_cap) { + mh_sock.msg_iov = iov_vu + 1; + mh_sock.msg_iovlen = elem_cnt; + } else { + iov_vu[0].iov_base = tcp_buf_discard; + iov_vu[0].iov_len = already_sent; + + mh_sock.msg_iov = iov_vu; + mh_sock.msg_iovlen = elem_cnt + 1; + } + + do + ret = recvmsg(s, &mh_sock, MSG_PEEK); + while (ret < 0 && errno == EINTR); + + if (ret < 0) { + vu_queue_rewind(vq, elem_cnt); + return -errno; + } + + if (!peek_offset_cap) + ret -= already_sent; + + /* adjust iov number and length of the last iov */ + len = ret; + for (i = 0; len && i < elem_cnt; i++) { + struct iovec *iov = &elem[i].in_sg[0]; + + if (iov->iov_len > (size_t)len) + iov->iov_len = len; + + len -= iov->iov_len; + } + /* adjust head count */ + while (head_cnt > 0 && head[head_cnt - 1] > i) + head_cnt--; + /* mark end of array */ + head[head_cnt] = i; + *iov_cnt = i; + + /* release unused buffers */ + vu_queue_rewind(vq, elem_cnt - i); + + /* restore space for headers in iov */ + for (i = 0; i < head_cnt; i++) { + struct 
iovec *iov = &elem[head[i]].in_sg[0]; + + iov->iov_base = (char *)iov->iov_base - hdrlen; + iov->iov_len += hdrlen; + } + + return ret; +} + +/** + * tcp_vu_prepare() - Prepare the frame header + * @c: Execution context + * @conn: Connection pointer + * @first: Pointer to the array of IO vectors + * @dlen: Packet data length + * @check: Checksum, if already known + */ +static void tcp_vu_prepare(const struct ctx *c, + struct tcp_tap_conn *conn, char *base, + size_t dlen, const uint16_t **check) +{ + const struct flowside *toside = TAPFLOW(conn); + struct tcp_payload_t *payload; + struct ipv6hdr *ip6h = NULL; + struct iphdr *iph = NULL; + struct ethhdr *eh; + + /* we guess the first iovec provided by the guest can embed + * all the headers needed by L2 frame + */ + + eh = vu_eth(base); + + memcpy(eh->h_dest, c->guest_mac, sizeof(eh->h_dest)); + memcpy(eh->h_source, c->our_tap_mac, sizeof(eh->h_source)); + + /* initialize header */ + + if (inany_v4(&toside->eaddr) && inany_v4(&toside->oaddr)) { + eh->h_proto = htons(ETH_P_IP); + + iph = vu_ip(base); + *iph = (struct iphdr)L2_BUF_IP4_INIT(IPPROTO_TCP); + payload = vu_payloadv4(base); + } else { + eh->h_proto = htons(ETH_P_IPV6); + + ip6h = vu_ip(base); + *ip6h = (struct ipv6hdr)L2_BUF_IP6_INIT(IPPROTO_TCP); + + payload = vu_payloadv6(base); + } + + memset(&payload->th, 0, sizeof(payload->th)); + payload->th.doff = offsetof(struct tcp_payload_t, data) / 4; + payload->th.ack = 1; + + if (inany_v4(&toside->eaddr) && inany_v4(&toside->oaddr)) { + tcp_fill_headers4(conn, NULL, iph, payload, dlen, + *check, conn->seq_to_tap, true); + *check = &iph->check; + } else { + tcp_fill_headers6(conn, NULL, ip6h, payload, dlen, + conn->seq_to_tap, true); + } +} + +/** + * tcp_vu_data_from_sock() - Handle new data from socket, queue to vhost-user, + * in window + * @c: Execution context + * @conn: Connection pointer + * + * Return: Negative on connection reset, 0 otherwise + */ +int tcp_vu_data_from_sock(const struct ctx *c, struct 
tcp_tap_conn *conn) +{ + uint32_t wnd_scaled = conn->wnd_from_tap << conn->ws_from_tap; + struct vu_dev *vdev = c->vdev; + struct vu_virtq *vq = &vdev->vq[VHOST_USER_RX_QUEUE]; + const struct flowside *tapside = TAPFLOW(conn); + size_t fillsize, hdrlen; + int v6 = CONN_V6(conn); + uint32_t already_sent; + const uint16_t *check; + int i, iov_cnt; + ssize_t len; + + if (!vu_queue_enabled(vq) || !vu_queue_started(vq)) { + debug("Got packet, but RX virtqueue not usable yet"); + return 0; + } + + already_sent = conn->seq_to_tap - conn->seq_ack_from_tap; + + if (SEQ_LT(already_sent, 0)) { + /* RFC 761, section 2.1. */ + flow_trace(conn, "ACK sequence gap: ACK for %u, sent: %u", + conn->seq_ack_from_tap, conn->seq_to_tap); + conn->seq_to_tap = conn->seq_ack_from_tap; + already_sent = 0; + if (tcp_set_peek_offset(conn->sock, 0)) { + tcp_rst(c, conn); + return -1; + } + } + + if (!wnd_scaled || already_sent >= wnd_scaled) { + conn_flag(c, conn, STALLED); + conn_flag(c, conn, ACK_FROM_TAP_DUE); + return 0; + } + + /* Set up buffer descriptors we'll fill completely and partially. */ + + fillsize = wnd_scaled - already_sent; + + /* collect the buffers from vhost-user and fill them with the + * data from the socket + */ + len = tcp_vu_sock_recv(c, conn, v6, already_sent, fillsize, &iov_cnt); + if (len < 0) { + if (len != -EAGAIN && len != -EWOULDBLOCK) { + tcp_rst(c, conn); + return len; + } + return 0; + } + + if (!len) { + if (already_sent) { + conn_flag(c, conn, STALLED); + } else if ((conn->events & (SOCK_FIN_RCVD | TAP_FIN_SENT)) == + SOCK_FIN_RCVD) { + int ret = tcp_vu_send_flag(c, conn, FIN | ACK); + if (ret) { + tcp_rst(c, conn); + return ret; + } + + conn_event(c, conn, TAP_FIN_SENT); + } + + return 0; + } + + conn_flag(c, conn, ~STALLED); + + /* Likely, some new data was acked too. 
*/ + tcp_update_seqack_wnd(c, conn, false, NULL); + + /* initialize headers */ + /* iov_vu is an array of buffers and the buffer size can be + * smaller than the frame size we want to use but with + * num_buffer we can merge several virtio iov buffers in one packet + * we need only to set the packet headers in the first iov and + * num_buffer to the number of iov entries + */ + + hdrlen = tcp_vu_hdrlen(v6); + for (i = 0, check = NULL; i < head_cnt; i++) { + struct iovec *iov = &elem[head[i]].in_sg[0]; + int buf_cnt = head[i + 1] - head[i]; + int dlen = iov_size(iov, buf_cnt) - hdrlen; + + vu_set_vnethdr(vdev, iov->iov_base, buf_cnt); + + /* we compute IPv4 header checksum only for the + * first and the last, all other checksums are the + * same as the first one + */ + if (i + 1 == head_cnt) + check = NULL; + + tcp_vu_prepare(c, conn, iov->iov_base, dlen, &check); + + if (*c->pcap) { + tcp_vu_update_check(tapside, iov, buf_cnt); + pcap_iov(iov, buf_cnt, + sizeof(struct virtio_net_hdr_mrg_rxbuf)); + } + + conn->seq_to_tap += dlen; + } + + /* send packets */ + vu_flush(vdev, vq, elem, iov_cnt); + + conn_flag(c, conn, ACK_FROM_TAP_DUE); + + return 0; +} diff --git a/tcp_vu.h b/tcp_vu.h new file mode 100644 index 0000000..6ab6057 --- /dev/null +++ b/tcp_vu.h @@ -0,0 +1,12 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* Copyright Red Hat + * Author: Laurent Vivier <lvivier@redhat.com> + */ + +#ifndef TCP_VU_H +#define TCP_VU_H + +int tcp_vu_send_flag(const struct ctx *c, struct tcp_tap_conn *conn, int flags); +int tcp_vu_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn); + +#endif /*TCP_VU_H */ diff --git a/udp.c b/udp.c index 9718ed8..5b0093a 100644 --- a/udp.c +++ b/udp.c @@ -110,6 +110,7 @@ #include "log.h" #include "flow_table.h" #include "udp_internal.h" +#include "udp_vu.h" /* "Spliced" sockets indexed by bound port (host order) */ static int udp_splice_ns [IP_VERSIONS][NUM_PORTS]; @@ -628,6 +629,11 @@ void udp_listen_sock_handler(const struct ctx 
*c, union epoll_ref ref, uint32_t events, const struct timespec *now) { + if (c->mode == MODE_VU) { + udp_vu_listen_sock_handler(c, ref, events, now); + return; + } + udp_buf_listen_sock_handler(c, ref, events, now); } @@ -698,6 +704,11 @@ static void udp_buf_reply_sock_handler(const struct ctx *c, union epoll_ref ref, void udp_reply_sock_handler(const struct ctx *c, union epoll_ref ref, uint32_t events, const struct timespec *now) { + if (c->mode == MODE_VU) { + udp_vu_reply_sock_handler(c, ref, events, now); + return; + } + udp_buf_reply_sock_handler(c, ref, events, now); } diff --git a/udp_vu.c b/udp_vu.c new file mode 100644 index 0000000..c911022 --- /dev/null +++ b/udp_vu.c @@ -0,0 +1,343 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* udp_vu.c - UDP L2 vhost-user management functions + * + * Copyright Red Hat + * Author: Laurent Vivier <lvivier@redhat.com> + */ + +#include <unistd.h> +#include <assert.h> +#include <net/ethernet.h> +#include <net/if.h> +#include <netinet/in.h> +#include <netinet/ip.h> +#include <netinet/udp.h> +#include <stdint.h> +#include <stddef.h> +#include <sys/uio.h> +#include <linux/virtio_net.h> + +#include "checksum.h" +#include "util.h" +#include "ip.h" +#include "siphash.h" +#include "inany.h" +#include "passt.h" +#include "pcap.h" +#include "log.h" +#include "vhost_user.h" +#include "udp_internal.h" +#include "flow.h" +#include "flow_table.h" +#include "udp_flow.h" +#include "udp_vu.h" +#include "vu_common.h" + +static struct iovec iov_vu [VIRTQUEUE_MAX_SIZE]; +static struct vu_virtq_element elem [VIRTQUEUE_MAX_SIZE]; + +/** + * udp_vu_hdrlen() - return the size of the header in level 2 frame (UDP) + * @v6: Set for IPv6 packet + * + * Return: Return the size of the header + */ +static size_t udp_vu_hdrlen(bool v6) +{ + size_t hdrlen; + + hdrlen = sizeof(struct virtio_net_hdr_mrg_rxbuf) + + sizeof(struct ethhdr) + sizeof(struct udphdr); + + if (v6) + hdrlen += sizeof(struct ipv6hdr); + else + hdrlen += sizeof(struct iphdr); + 
+ return hdrlen; +} + +/** + * udp_vu_sock_info() - get socket information + * @s: Socket to get information from + * @s_in: Socket address (output) + * + * Return: 0 if socket address can be read, -1 otherwise + */ +static int udp_vu_sock_info(int s, union sockaddr_inany *s_in) +{ + struct msghdr msg = { + .msg_name = s_in, + .msg_namelen = sizeof(union sockaddr_inany), + }; + + return recvmsg(s, &msg, MSG_PEEK | MSG_DONTWAIT); +} + +/** + * udp_vu_sock_recv() - Receive datagrams from socket into vhost-user buffers + * @c: Execution context + * @s: Socket to receive from + * @events: epoll events bitmap + * @v6: Set for IPv6 connections + * @dlen: Size of received data (output) + * + * Return: Number of iov entries used to store the datagram + */ +static int udp_vu_sock_recv(const struct ctx *c, int s, uint32_t events, + bool v6, ssize_t *dlen) +{ + struct vu_dev *vdev = c->vdev; + struct vu_virtq *vq = &vdev->vq[VHOST_USER_RX_QUEUE]; + int iov_cnt, idx, iov_used; + struct msghdr msg = { 0 }; + size_t off, hdrlen; + + ASSERT(!c->no_udp); + + if (!(events & EPOLLIN)) + return 0; + + /* compute L2 header length */ + hdrlen = udp_vu_hdrlen(v6); + + vu_init_elem(elem, iov_vu, VIRTQUEUE_MAX_SIZE); + + iov_cnt = vu_collect(vdev, vq, elem, VIRTQUEUE_MAX_SIZE, + IP_MAX_MTU - sizeof(struct udphdr) + hdrlen, + NULL); + if (iov_cnt == 0) + return 0; + + /* reserve space for the headers */ + ASSERT(iov_vu[0].iov_len >= hdrlen); + iov_vu[0].iov_base = (char *)iov_vu[0].iov_base + hdrlen; + iov_vu[0].iov_len -= hdrlen; + + /* read data from the socket */ + msg.msg_iov = iov_vu; + msg.msg_iovlen = iov_cnt; + + *dlen = recvmsg(s, &msg, 0); + if (*dlen < 0) { + vu_queue_rewind(vq, iov_cnt); + return 0; + } + + /* restore the pointer to the headers address */ + iov_vu[0].iov_base = (char *)iov_vu[0].iov_base - hdrlen; + iov_vu[0].iov_len += hdrlen; + + /* count the numbers of buffer filled by recvmsg() */ + idx = iov_skip_bytes(iov_vu, iov_cnt, *dlen + hdrlen, &off); + + /* adjust 
last iov length */ + if (idx < iov_cnt) + iov_vu[idx].iov_len = off; + iov_used = idx + !!off; + + vu_set_vnethdr(vdev, iov_vu[0].iov_base, iov_used); + + /* release unused buffers */ + vu_queue_rewind(vq, iov_cnt - iov_used); + + return iov_used; +} + +/** + * udp_vu_prepare() - Prepare the packet header + * @c: Execution context + * @toside: Address information for one side of the flow + * @dlen: Packet data length + * + * Return: Layer-4 length + */ +static size_t udp_vu_prepare(const struct ctx *c, + const struct flowside *toside, ssize_t dlen) +{ + struct ethhdr *eh; + size_t l4len; + + /* ethernet header */ + eh = vu_eth(iov_vu[0].iov_base); + + memcpy(eh->h_dest, c->guest_mac, sizeof(eh->h_dest)); + memcpy(eh->h_source, c->our_tap_mac, sizeof(eh->h_source)); + + /* initialize header */ + if (inany_v4(&toside->eaddr) && inany_v4(&toside->oaddr)) { + struct iphdr *iph = vu_ip(iov_vu[0].iov_base); + struct udp_payload_t *bp = vu_payloadv4(iov_vu[0].iov_base); + + eh->h_proto = htons(ETH_P_IP); + + *iph = (struct iphdr)L2_BUF_IP4_INIT(IPPROTO_UDP); + + l4len = udp_update_hdr4(iph, bp, toside, dlen, true); + } else { + struct ipv6hdr *ip6h = vu_ip(iov_vu[0].iov_base); + struct udp_payload_t *bp = vu_payloadv6(iov_vu[0].iov_base); + + eh->h_proto = htons(ETH_P_IPV6); + + *ip6h = (struct ipv6hdr)L2_BUF_IP6_INIT(IPPROTO_UDP); + + l4len = udp_update_hdr6(ip6h, bp, toside, dlen, true); + } + + return l4len; +} + +/** + * udp_vu_csum() - Calculate and set checksum for a UDP packet + * @toside: Address information for one side of the flow + * @iov_used: Number of used iov_vu items + */ +static void udp_vu_csum(const struct flowside *toside, int iov_used) +{ + const struct in_addr *src4 = inany_v4(&toside->oaddr); + const struct in_addr *dst4 = inany_v4(&toside->eaddr); + char *base = iov_vu[0].iov_base; + struct udp_payload_t *bp; + + if (src4 && dst4) { + bp = vu_payloadv4(base); + csum_udp4(&bp->uh, *src4, *dst4, iov_vu, iov_used, + (char *)&bp->data - base); + } else 
{ + bp = vu_payloadv6(base); + csum_udp6(&bp->uh, &toside->oaddr.a6, &toside->eaddr.a6, + iov_vu, iov_used, (char *)&bp->data - base); + } +} + +/** + * udp_vu_listen_sock_handler() - Handle new data from socket + * @c: Execution context + * @ref: epoll reference + * @events: epoll events bitmap + * @now: Current timestamp + */ +void udp_vu_listen_sock_handler(const struct ctx *c, union epoll_ref ref, + uint32_t events, const struct timespec *now) +{ + struct vu_dev *vdev = c->vdev; + struct vu_virtq *vq = &vdev->vq[VHOST_USER_RX_QUEUE]; + int i; + + if (udp_sock_errs(c, ref.fd, events) < 0) { + err("UDP: Unrecoverable error on listening socket:" + " (%s port %hu)", pif_name(ref.udp.pif), ref.udp.port); + return; + } + + for (i = 0; i < UDP_MAX_FRAMES; i++) { + const struct flowside *toside; + union sockaddr_inany s_in; + flow_sidx_t sidx; + uint8_t pif; + ssize_t dlen; + int iov_used; + bool v6; + + if (udp_vu_sock_info(ref.fd, &s_in) < 0) + break; + + sidx = udp_flow_from_sock(c, ref, &s_in, now); + pif = pif_at_sidx(sidx); + + if (pif != PIF_TAP) { + if (flow_sidx_valid(sidx)) { + flow_sidx_t fromsidx = flow_sidx_opposite(sidx); + struct udp_flow *uflow = udp_at_sidx(sidx); + + flow_err(uflow, + "No support for forwarding UDP from %s to %s", + pif_name(pif_at_sidx(fromsidx)), + pif_name(pif)); + } else { + debug("Discarding 1 datagram without flow"); + } + + continue; + } + + toside = flowside_at_sidx(sidx); + + v6 = !(inany_v4(&toside->eaddr) && inany_v4(&toside->oaddr)); + + iov_used = udp_vu_sock_recv(c, ref.fd, events, v6, &dlen); + if (iov_used <= 0) + break; + + udp_vu_prepare(c, toside, dlen); + if (*c->pcap) { + udp_vu_csum(toside, iov_used); + pcap_iov(iov_vu, iov_used, + sizeof(struct virtio_net_hdr_mrg_rxbuf)); + } + vu_flush(vdev, vq, elem, iov_used); + } +} + +/** + * udp_vu_reply_sock_handler() - Handle new data from flow specific socket + * @c: Execution context + * @ref: epoll reference + * @events: epoll events bitmap + * @now: Current timestamp 
+ */ +void udp_vu_reply_sock_handler(const struct ctx *c, union epoll_ref ref, + uint32_t events, const struct timespec *now) +{ + flow_sidx_t tosidx = flow_sidx_opposite(ref.flowside); + const struct flowside *toside = flowside_at_sidx(tosidx); + struct udp_flow *uflow = udp_at_sidx(ref.flowside); + int from_s = uflow->s[ref.flowside.sidei]; + struct vu_dev *vdev = c->vdev; + struct vu_virtq *vq = &vdev->vq[VHOST_USER_RX_QUEUE]; + int i; + + ASSERT(!c->no_udp); + + if (udp_sock_errs(c, from_s, events) < 0) { + flow_err(uflow, "Unrecoverable error on reply socket"); + flow_err_details(uflow); + udp_flow_close(c, uflow); + return; + } + + for (i = 0; i < UDP_MAX_FRAMES; i++) { + uint8_t topif = pif_at_sidx(tosidx); + ssize_t dlen; + int iov_used; + bool v6; + + ASSERT(uflow); + + if (topif != PIF_TAP) { + uint8_t frompif = pif_at_sidx(ref.flowside); + + flow_err(uflow, + "No support for forwarding UDP from %s to %s", + pif_name(frompif), pif_name(topif)); + continue; + } + + v6 = !(inany_v4(&toside->eaddr) && inany_v4(&toside->oaddr)); + + iov_used = udp_vu_sock_recv(c, from_s, events, v6, &dlen); + if (iov_used <= 0) + break; + flow_trace(uflow, "Received 1 datagram on reply socket"); + uflow->ts = now->tv_sec; + + udp_vu_prepare(c, toside, dlen); + if (*c->pcap) { + udp_vu_csum(toside, iov_used); + pcap_iov(iov_vu, iov_used, + sizeof(struct virtio_net_hdr_mrg_rxbuf)); + } + vu_flush(vdev, vq, elem, iov_used); + } +} diff --git a/udp_vu.h b/udp_vu.h new file mode 100644 index 0000000..ba7018d --- /dev/null +++ b/udp_vu.h @@ -0,0 +1,13 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* Copyright Red Hat + * Author: Laurent Vivier <lvivier@redhat.com> + */ + +#ifndef UDP_VU_H +#define UDP_VU_H + +void udp_vu_listen_sock_handler(const struct ctx *c, union epoll_ref ref, + uint32_t events, const struct timespec *now); +void udp_vu_reply_sock_handler(const struct ctx *c, union epoll_ref ref, + uint32_t events, const struct timespec *now); +#endif /* UDP_VU_H */ diff 
--git a/vhost_user.c b/vhost_user.c index 89627a2..51c90db 100644 --- a/vhost_user.c +++ b/vhost_user.c @@ -48,12 +48,13 @@ /* vhost-user version we are compatible with */ #define VHOST_USER_VERSION 1 +static struct vu_dev vdev_storage; + /** * vu_print_capabilities() - print vhost-user capabilities * this is part of the vhost-user backend * convention. */ -/* cppcheck-suppress unusedFunction */ void vu_print_capabilities(void) { info("{"); @@ -163,9 +164,7 @@ static void vmsg_close_fds(const struct vhost_user_msg *vmsg) */ static void vu_remove_watch(const struct vu_dev *vdev, int fd) { - /* Placeholder to add passt related code */ - (void)vdev; - (void)fd; + epoll_ctl(vdev->context->epollfd, EPOLL_CTL_DEL, fd, NULL); } /** @@ -487,6 +486,14 @@ static bool vu_set_mem_table_exec(struct vu_dev *vdev, } } + /* As vu_packet_check_range() has no access to the number of + * memory regions, mark the end of the array with mmap_addr = 0 + */ + ASSERT(vdev->nregions < VHOST_USER_MAX_RAM_SLOTS - 1); + vdev->regions[vdev->nregions].mmap_addr = 0; + + tap_sock_update_pool(vdev->regions, 0); + return false; } @@ -615,9 +622,16 @@ static bool vu_get_vring_base_exec(struct vu_dev *vdev, */ static void vu_set_watch(const struct vu_dev *vdev, int idx) { - /* Placeholder to add passt related code */ - (void)vdev; - (void)idx; + union epoll_ref ref = { + .type = EPOLL_TYPE_VHOST_KICK, + .fd = vdev->vq[idx].kick_fd, + .queue = idx + }; + struct epoll_event ev = { 0 }; + + ev.data.u64 = ref.u64; + ev.events = EPOLLIN; + epoll_ctl(vdev->context->epollfd, EPOLL_CTL_ADD, ref.fd, &ev); } /** @@ -829,14 +843,14 @@ static bool vu_set_vring_enable_exec(struct vu_dev *vdev, * @c: execution context * @vdev: vhost-user device */ -/* cppcheck-suppress unusedFunction */ -void vu_init(struct ctx *c, struct vu_dev *vdev) +void vu_init(struct ctx *c) { int i; - vdev->context = c; + c->vdev = &vdev_storage; + c->vdev->context = c; for (i = 0; i < VHOST_USER_MAX_QUEUES; i++) { - vdev->vq[i] = (struct 
vu_virtq){ + c->vdev->vq[i] = (struct vu_virtq){ .call_fd = -1, .kick_fd = -1, .err_fd = -1, @@ -849,7 +863,6 @@ void vu_init(struct ctx *c, struct vu_dev *vdev) * vu_cleanup() - Reset vhost-user device * @vdev: vhost-user device */ -/* cppcheck-suppress unusedFunction */ void vu_cleanup(struct vu_dev *vdev) { unsigned int i; @@ -896,8 +909,7 @@ void vu_cleanup(struct vu_dev *vdev) */ static void vu_sock_reset(struct vu_dev *vdev) { - /* Placeholder to add passt related code */ - (void)vdev; + tap_sock_reset(vdev->context); } static bool (*vu_handle[VHOST_USER_MAX])(struct vu_dev *vdev, @@ -925,7 +937,6 @@ static bool (*vu_handle[VHOST_USER_MAX])(struct vu_dev *vdev, * @fd: vhost-user message socket * @events: epoll events */ -/* cppcheck-suppress unusedFunction */ void vu_control_handler(struct vu_dev *vdev, int fd, uint32_t events) { struct vhost_user_msg msg = { 0 }; diff --git a/vhost_user.h b/vhost_user.h index 5af349b..464ba21 100644 --- a/vhost_user.h +++ b/vhost_user.h @@ -183,7 +183,6 @@ struct vhost_user_msg { * * Return: true if the virqueue is enabled, false otherwise */ -/* cppcheck-suppress unusedFunction */ static inline bool vu_queue_enabled(const struct vu_virtq *vq) { return vq->enable; @@ -195,14 +194,13 @@ static inline bool vu_queue_enabled(const struct vu_virtq *vq) * * Return: true if the virqueue is started, false otherwise */ -/* cppcheck-suppress unusedFunction */ static inline bool vu_queue_started(const struct vu_virtq *vq) { return vq->started; } void vu_print_capabilities(void); -void vu_init(struct ctx *c, struct vu_dev *vdev); +void vu_init(struct ctx *c); void vu_cleanup(struct vu_dev *vdev); void vu_control_handler(struct vu_dev *vdev, int fd, uint32_t events); #endif /* VHOST_USER_H */ diff --git a/virtio.c b/virtio.c index b23a68c..6a97435 100644 --- a/virtio.c +++ b/virtio.c @@ -325,7 +325,6 @@ static bool vring_can_notify(const struct vu_dev *dev, struct vu_virtq *vq) * @dev: Vhost-user device * @vq: Virtqueue */ -/* 
cppcheck-suppress unusedFunction */ void vu_queue_notify(const struct vu_dev *dev, struct vu_virtq *vq) { if (!vring_can_notify(dev, vq)) { @@ -498,7 +497,6 @@ static int vu_queue_map_desc(struct vu_dev *dev, struct vu_virtq *vq, unsigned i * * Return: -1 if there is an error, 0 otherwise */ -/* cppcheck-suppress unusedFunction */ int vu_queue_pop(struct vu_dev *dev, struct vu_virtq *vq, struct vu_virtq_element *elem) { unsigned int head; @@ -556,7 +554,6 @@ void vu_queue_unpop(struct vu_virtq *vq) * @vq: Virtqueue * @num: Number of element to unpop */ -/* cppcheck-suppress unusedFunction */ bool vu_queue_rewind(struct vu_virtq *vq, unsigned int num) { if (num > vq->inuse) @@ -609,7 +606,6 @@ void vu_queue_fill_by_index(struct vu_virtq *vq, unsigned int index, * @len: Size of the element * @idx: Used ring entry index */ -/* cppcheck-suppress unusedFunction */ void vu_queue_fill(struct vu_virtq *vq, const struct vu_virtq_element *elem, unsigned int len, unsigned int idx) { @@ -633,7 +629,6 @@ static inline void vring_used_idx_set(struct vu_virtq *vq, uint16_t val) * @vq: Virtqueue * @count: Number of entry to flush */ -/* cppcheck-suppress unusedFunction */ void vu_queue_flush(struct vu_virtq *vq, unsigned int count) { uint16_t old, new; diff --git a/vu_common.c b/vu_common.c new file mode 100644 index 0000000..f2eb701 --- /dev/null +++ b/vu_common.c @@ -0,0 +1,283 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* Copyright Red Hat + * Author: Laurent Vivier <lvivier@redhat.com> + * + * common_vu.c - vhost-user common UDP and TCP functions + */ + +#include <unistd.h> +#include <sys/uio.h> +#include <sys/eventfd.h> +#include <netinet/if_ether.h> +#include <linux/virtio_net.h> + +#include "util.h" +#include "passt.h" +#include "tap.h" +#include "vhost_user.h" +#include "pcap.h" +#include "vu_common.h" + +/** + * vu_packet_check_range() - Check if a given memory zone is contained in + * a mapped guest memory region + * @buf: Array of the available memory regions + * 
@offset: Offset of data range in packet descriptor + * @len: Length of desired data range + * @start: Start of the packet descriptor + * + * Return: 0 if the zone is in a mapped memory region, -1 otherwise + */ +int vu_packet_check_range(void *buf, size_t offset, size_t len, + const char *start) +{ + struct vu_dev_region *dev_region; + + for (dev_region = buf; dev_region->mmap_addr; dev_region++) { + /* NOLINTNEXTLINE(performance-no-int-to-ptr) */ + char *m = (char *)dev_region->mmap_addr; + + if (m <= start && + start + offset + len <= m + dev_region->mmap_offset + + dev_region->size) + return 0; + } + + return -1; +} + +/** + * vu_init_elem() - initialize an array of virtqueue elements with 1 iov in each + * @elem: Array of virtqueue elements to initialize + * @iov: Array of iovec to assign to virtqueue element + * @elem_cnt: Number of virtqueue element + */ +void vu_init_elem(struct vu_virtq_element *elem, struct iovec *iov, int elem_cnt) +{ + int i; + + for (i = 0; i < elem_cnt; i++) + vu_set_element(&elem[i], NULL, &iov[i]); +} + +/** + * vu_collect() - collect virtio buffers from a given virtqueue + * @vdev: vhost-user device + * @vq: virtqueue to collect from + * @elem: Array of virtqueue element + * each element must be initialized with one iovec entry + * in the in_sg array. 
+ * @max_elem: Number of virtqueue elements in the array + * @size: Maximum size of the data in the frame + * @frame_size: The total size of the buffers (output) + * + * Return: number of elements used to contain the frame + */ +int vu_collect(struct vu_dev *vdev, struct vu_virtq *vq, + struct vu_virtq_element *elem, int max_elem, + size_t size, size_t *frame_size) +{ + size_t current_size = 0; + int elem_cnt = 0; + + while (current_size < size && elem_cnt < max_elem) { + struct iovec *iov; + int ret; + + ret = vu_queue_pop(vdev, vq, &elem[elem_cnt]); + if (ret < 0) + break; + + if (elem[elem_cnt].in_num < 1) { + warn("virtio-net receive queue contains no in buffers"); + vu_queue_detach_element(vq); + break; + } + + iov = &elem[elem_cnt].in_sg[0]; + + if (iov->iov_len > size - current_size) + iov->iov_len = size - current_size; + + current_size += iov->iov_len; + elem_cnt++; + + if (!vu_has_feature(vdev, VIRTIO_NET_F_MRG_RXBUF)) + break; + } + + if (frame_size) + *frame_size = current_size; + + return elem_cnt; +} + +/** + * vu_set_vnethdr() - set virtio-net headers + * @vdev: vhost-user device + * @vnethdr: Address of the header to set + * @num_buffers: Number of guest buffers of the frame + */ +void vu_set_vnethdr(const struct vu_dev *vdev, + struct virtio_net_hdr_mrg_rxbuf *vnethdr, + int num_buffers) +{ + vnethdr->hdr = VU_HEADER; + if (vu_has_feature(vdev, VIRTIO_NET_F_MRG_RXBUF)) + vnethdr->num_buffers = htole16(num_buffers); +} + +/** + * vu_flush() - flush all the collected buffers to the vhost-user interface + * @vdev: vhost-user device + * @vq: vhost-user virtqueue + * @elem: virtqueue elements array to send back to the virtqueue + * @elem_cnt: Length of the array + */ +void vu_flush(const struct vu_dev *vdev, struct vu_virtq *vq, + struct vu_virtq_element *elem, int elem_cnt) +{ + int i; + + for (i = 0; i < elem_cnt; i++) + vu_queue_fill(vq, &elem[i], elem[i].in_sg[0].iov_len, i); + + vu_queue_flush(vq, elem_cnt); + vu_queue_notify(vdev, vq); +} + +/** + 
* vu_handle_tx() - Receive data from the TX virtqueue + * @vdev: vhost-user device + * @index: index of the virtqueue + * @now: Current timestamp + */ +static void vu_handle_tx(struct vu_dev *vdev, int index, + const struct timespec *now) +{ + struct vu_virtq_element elem[VIRTQUEUE_MAX_SIZE]; + struct iovec out_sg[VIRTQUEUE_MAX_SIZE]; + struct vu_virtq *vq = &vdev->vq[index]; + int hdrlen = sizeof(struct virtio_net_hdr_mrg_rxbuf); + int out_sg_count; + int count; + + ASSERT(VHOST_USER_IS_QUEUE_TX(index)); + + tap_flush_pools(); + + count = 0; + out_sg_count = 0; + while (count < VIRTQUEUE_MAX_SIZE) { + int ret; + + vu_set_element(&elem[count], &out_sg[out_sg_count], NULL); + ret = vu_queue_pop(vdev, vq, &elem[count]); + if (ret < 0) + break; + out_sg_count += elem[count].out_num; + + if (elem[count].out_num < 1) { + warn("virtio-net transmit queue contains no out buffers"); + break; + } + ASSERT(elem[count].out_num == 1); + + tap_add_packet(vdev->context, + elem[count].out_sg[0].iov_len - hdrlen, + (char *)elem[count].out_sg[0].iov_base + hdrlen); + count++; + } + tap_handler(vdev->context, now); + + if (count) { + int i; + + for (i = 0; i < count; i++) + vu_queue_fill(vq, &elem[i], 0, i); + vu_queue_flush(vq, count); + vu_queue_notify(vdev, vq); + } +} + +/** + * vu_kick_cb() - Called on a kick event to start to receive data + * @vdev: vhost-user device + * @ref: epoll reference information + * @now: Current timestamp + */ +void vu_kick_cb(struct vu_dev *vdev, union epoll_ref ref, + const struct timespec *now) +{ + eventfd_t kick_data; + ssize_t rc; + + rc = eventfd_read(ref.fd, &kick_data); + if (rc == -1) + die_perror("vhost-user kick eventfd_read()"); + + debug("vhost-user: got kick_data: %016"PRIx64" idx: %d", + kick_data, ref.queue); + if (VHOST_USER_IS_QUEUE_TX(ref.queue)) + vu_handle_tx(vdev, ref.queue, now); +} + +/** + * vu_send_single() - Send a buffer to the front-end using the RX virtqueue + * @c: execution context + * @buf: address of the buffer + * 
@size: size of the buffer + * + * Return: number of bytes sent, -1 if there is an error + */ +int vu_send_single(const struct ctx *c, const void *buf, size_t size) +{ + struct vu_dev *vdev = c->vdev; + struct vu_virtq *vq = &vdev->vq[VHOST_USER_RX_QUEUE]; + struct vu_virtq_element elem[VIRTQUEUE_MAX_SIZE]; + struct iovec in_sg[VIRTQUEUE_MAX_SIZE]; + size_t total; + int elem_cnt; + int i; + + debug("vu_send_single size %zu", size); + + if (!vu_queue_enabled(vq) || !vu_queue_started(vq)) { + debug("Got packet, but RX virtqueue not usable yet"); + return -1; + } + + vu_init_elem(elem, in_sg, VIRTQUEUE_MAX_SIZE); + + size += sizeof(struct virtio_net_hdr_mrg_rxbuf); + elem_cnt = vu_collect(vdev, vq, elem, VIRTQUEUE_MAX_SIZE, size, &total); + if (total < size) { + debug("vu_send_single: no space to send the data " + "elem_cnt %d size %zd", elem_cnt, total); + goto err; + } + + vu_set_vnethdr(vdev, in_sg[0].iov_base, elem_cnt); + + total -= sizeof(struct virtio_net_hdr_mrg_rxbuf); + + /* copy data from the buffer to the iovec */ + iov_from_buf(in_sg, elem_cnt, sizeof(struct virtio_net_hdr_mrg_rxbuf), + buf, total); + + if (*c->pcap) { + pcap_iov(in_sg, elem_cnt, + sizeof(struct virtio_net_hdr_mrg_rxbuf)); + } + + vu_flush(vdev, vq, elem, elem_cnt); + + debug("vhost-user sent %zu", total); + + return total; +err: + for (i = 0; i < elem_cnt; i++) + vu_queue_detach_element(vq); + + return -1; +} diff --git a/vu_common.h b/vu_common.h new file mode 100644 index 0000000..901d972 --- /dev/null +++ b/vu_common.h @@ -0,0 +1,60 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later + * Copyright Red Hat + * Author: Laurent Vivier <lvivier@redhat.com> + * + * vhost-user common UDP and TCP functions + */ + +#ifndef VU_COMMON_H +#define VU_COMMON_H +#include <linux/virtio_net.h> + +static inline void *vu_eth(void *base) +{ + return ((char *)base + sizeof(struct virtio_net_hdr_mrg_rxbuf)); +} + +static inline void *vu_ip(void *base) +{ + return (struct ethhdr *)vu_eth(base) + 1; +} + 
+static inline void *vu_payloadv4(void *base) +{ + return (struct iphdr *)vu_ip(base) + 1; +} + +static inline void *vu_payloadv6(void *base) +{ + return (struct ipv6hdr *)vu_ip(base) + 1; +} + +/** + * vu_set_element() - Initialize a vu_virtq_element + * @elem: Element to initialize + * @out_sg: One out iovec entry to set in elem + * @in_sg: One in iovec entry to set in elem + */ +static inline void vu_set_element(struct vu_virtq_element *elem, + struct iovec *out_sg, struct iovec *in_sg) +{ + elem->out_num = !!out_sg; + elem->out_sg = out_sg; + elem->in_num = !!in_sg; + elem->in_sg = in_sg; +} + +void vu_init_elem(struct vu_virtq_element *elem, struct iovec *iov, + int elem_cnt); +int vu_collect(struct vu_dev *vdev, struct vu_virtq *vq, + struct vu_virtq_element *elem, int max_elem, size_t size, + size_t *frame_size); +void vu_set_vnethdr(const struct vu_dev *vdev, + struct virtio_net_hdr_mrg_rxbuf *vnethdr, + int num_buffers); +void vu_flush(const struct vu_dev *vdev, struct vu_virtq *vq, + struct vu_virtq_element *elem, int elem_cnt); +void vu_kick_cb(struct vu_dev *vdev, union epoll_ref ref, + const struct timespec *now); +int vu_send_single(const struct ctx *c, const void *buf, size_t size); +#endif /* VU_COMMON_H */ From 676bf5488ec4bd4312dbae4be1a1bb2ed02bd2ba Mon Sep 17 00:00:00 2001 From: Stefano Brivio <sbrivio@redhat.com> Date: Fri, 22 Nov 2024 17:43:35 +0100 Subject: [PATCH 140/382] test: Add tests for passt in vhost-user mode Run functional and performance tests for vhost-user mode as well. For functional tests, we add passt_vu and passt_vu_in_ns as symbolic links to their non-vhost-user counterparts, as no differences are intended but we want to distinguish them in test logs. For performance tests, instead, we add separate perf/passt_vu_tcp and perf/passt_vu_udp files, as we need longer test duration, as well as higher UDP sending bandwidths and larger TCP windows, to actually get the highest throughput vhost-user mode offers. 
For valgrind tests, vhost-user mode needs two extra system calls: statx and readlink. Add them as EXTRA_SYSCALLS for the valgrind target. Signed-off-by: Stefano Brivio <sbrivio@redhat.com> Signed-off-by: Laurent Vivier <lvivier@redhat.com> Reviewed-by: David Gibson <david@gibson.dropbear.id.au> --- Makefile | 3 +- test/lib/perf_report | 15 +++ test/lib/setup | 77 ++++++++++++--- test/lib/setup_ugly | 2 +- test/passt_vu | 1 + test/passt_vu_in_ns | 1 + test/perf/passt_vu_tcp | 211 +++++++++++++++++++++++++++++++++++++++++ test/perf/passt_vu_udp | 159 +++++++++++++++++++++++++++++++ test/run | 25 +++++ test/two_guests_vu | 1 + 10 files changed, 479 insertions(+), 16 deletions(-) create mode 120000 test/passt_vu create mode 120000 test/passt_vu_in_ns create mode 100644 test/perf/passt_vu_tcp create mode 100644 test/perf/passt_vu_udp create mode 120000 test/two_guests_vu diff --git a/Makefile b/Makefile index faa5c23..cb74480 100644 --- a/Makefile +++ b/Makefile @@ -101,7 +101,8 @@ qrap: $(QRAP_SRCS) passt.h valgrind: EXTRA_SYSCALLS += rt_sigprocmask rt_sigtimedwait rt_sigaction \ rt_sigreturn getpid gettid kill clock_gettime mmap \ - mmap2 munmap open unlink gettimeofday futex + mmap2 munmap open unlink gettimeofday futex statx \ + readlink valgrind: FLAGS += -g -DVALGRIND valgrind: all diff --git a/test/lib/perf_report b/test/lib/perf_report index d1ef50b..c4ec817 100755 --- a/test/lib/perf_report +++ b/test/lib/perf_report @@ -49,6 +49,21 @@ td:empty { visibility: hidden; } __passt_tcp_LINE__ __passt_udp_LINE__ </table> +</li><li><p>passt with vhost-user support</p> +<table class="passt" width="70%"> + <tr> + <th/> + <th id="perf_passt_vu_tcp" colspan="__passt_vu_tcp_cols__">TCP, __passt_vu_tcp_threads__ at __passt_vu_tcp_freq__ GHz</th> + <th id="perf_passt_vu_udp" colspan="__passt_vu_udp_cols__">UDP, __passt_vu_udp_threads__ at __passt_vu_udp_freq__ GHz</th> + </tr> + <tr> + <td align="right">MTU:</td> + __passt_vu_tcp_header__ + __passt_vu_udp_header__ + </tr> + 
__passt_vu_tcp_LINE__ __passt_vu_udp_LINE__ +</table> + <style type="text/CSS"> table.pasta_local td { border: 0px solid; padding: 6px; line-height: 1; } table.pasta_local td { text-align: right; } diff --git a/test/lib/setup b/test/lib/setup index 5338393..580825f 100755 --- a/test/lib/setup +++ b/test/lib/setup @@ -15,8 +15,7 @@ INITRAMFS="${BASEPATH}/mbuto.img" VCPUS="$( [ $(nproc) -ge 8 ] && echo 6 || echo $(( $(nproc) / 2 + 1 )) )" -__mem_kib="$(sed -n 's/MemTotal:[ ]*\([0-9]*\) kB/\1/p' /proc/meminfo)" -VMEM="$((${__mem_kib} / 1024 / 4))" +MEM_KIB="$(sed -n 's/MemTotal:[ ]*\([0-9]*\) kB/\1/p' /proc/meminfo)" QEMU_ARCH="$(uname -m)" [ "${QEMU_ARCH}" = "i686" ] && QEMU_ARCH=i386 @@ -46,6 +45,7 @@ setup_passt() { [ ${PCAP} -eq 1 ] && __opts="${__opts} -p ${LOGDIR}/passt.pcap" [ ${DEBUG} -eq 1 ] && __opts="${__opts} -d" [ ${TRACE} -eq 1 ] && __opts="${__opts} --trace" + [ ${VHOST_USER} -eq 1 ] && __opts="${__opts} --vhost-user" context_run passt "make clean" context_run passt "make valgrind" @@ -54,16 +54,29 @@ setup_passt() { # pidfile isn't created until passt is listening wait_for [ -f "${STATESETUP}/passt.pid" ] + __vmem="$((${MEM_KIB} / 1024 / 4))" + if [ ${VHOST_USER} -eq 1 ]; then + __vmem="$(((${__vmem} + 500) / 1000))G" + __qemu_netdev=" \ + -chardev socket,id=c,path=${STATESETUP}/passt.socket \ + -netdev vhost-user,id=v,chardev=c \ + -device virtio-net,netdev=v \ + -object memory-backend-memfd,id=m,share=on,size=${__vmem} \ + -numa node,memdev=m" + else + __qemu_netdev="-device virtio-net-pci,netdev=s \ + -netdev stream,id=s,server=off,addr.type=unix,addr.path=${STATESETUP}/passt.socket" + fi + GUEST_CID=94557 context_run_bg qemu 'qemu-system-'"${QEMU_ARCH}" \ ' -machine accel=kvm' \ - ' -m '${VMEM}' -cpu host -smp '${VCPUS} \ + ' -m '${__vmem}' -cpu host -smp '${VCPUS} \ ' -kernel '"${KERNEL}" \ ' -initrd '${INITRAMFS}' -nographic -serial stdio' \ ' -nodefaults' \ ' -append "console=ttyS0 mitigations=off apparmor=0" ' \ - ' -device 
virtio-net-pci,netdev=s0 ' \ - " -netdev stream,id=s0,server=off,addr.type=unix,addr.path=${STATESETUP}/passt.socket " \ + " ${__qemu_netdev}" \ " -pidfile ${STATESETUP}/qemu.pid" \ " -device vhost-vsock-pci,guest-cid=$GUEST_CID" @@ -142,6 +155,7 @@ setup_passt_in_ns() { [ ${PCAP} -eq 1 ] && __opts="${__opts} -p ${LOGDIR}/passt_in_pasta.pcap" [ ${DEBUG} -eq 1 ] && __opts="${__opts} -d" [ ${TRACE} -eq 1 ] && __opts="${__opts} --trace" + [ ${VHOST_USER} -eq 1 ] && __opts="${__opts} --vhost-user" if [ ${VALGRIND} -eq 1 ]; then context_run passt "make clean" @@ -154,17 +168,30 @@ setup_passt_in_ns() { fi wait_for [ -f "${STATESETUP}/passt.pid" ] + __vmem="$((${MEM_KIB} / 1024 / 4))" + if [ ${VHOST_USER} -eq 1 ]; then + __vmem="$(((${__vmem} + 500) / 1000))G" + __qemu_netdev=" \ + -chardev socket,id=c,path=${STATESETUP}/passt.socket \ + -netdev vhost-user,id=v,chardev=c \ + -device virtio-net,netdev=v \ + -object memory-backend-memfd,id=m,share=on,size=${__vmem} \ + -numa node,memdev=m" + else + __qemu_netdev="-device virtio-net-pci,netdev=s \ + -netdev stream,id=s,server=off,addr.type=unix,addr.path=${STATESETUP}/passt.socket" + fi + GUEST_CID=94557 context_run_bg qemu 'qemu-system-'"${QEMU_ARCH}" \ ' -machine accel=kvm' \ ' -M accel=kvm:tcg' \ - ' -m '${VMEM}' -cpu host -smp '${VCPUS} \ + ' -m '${__vmem}' -cpu host -smp '${VCPUS} \ ' -kernel '"${KERNEL}" \ ' -initrd '${INITRAMFS}' -nographic -serial stdio' \ ' -nodefaults' \ ' -append "console=ttyS0 mitigations=off apparmor=0" ' \ - ' -device virtio-net-pci,netdev=s0 ' \ - " -netdev stream,id=s0,server=off,addr.type=unix,addr.path=${STATESETUP}/passt.socket " \ + " ${__qemu_netdev}" \ " -pidfile ${STATESETUP}/qemu.pid" \ " -device vhost-vsock-pci,guest-cid=$GUEST_CID" @@ -214,6 +241,7 @@ setup_two_guests() { [ ${PCAP} -eq 1 ] && __opts="${__opts} -p ${LOGDIR}/passt_1.pcap" [ ${DEBUG} -eq 1 ] && __opts="${__opts} -d" [ ${TRACE} -eq 1 ] && __opts="${__opts} --trace" + [ ${VHOST_USER} -eq 1 ] && __opts="${__opts} 
--vhost-user" context_run_bg passt_1 "./passt -s ${STATESETUP}/passt_1.socket -P ${STATESETUP}/passt_1.pid -f ${__opts} -t 10001 -u 10001" wait_for [ -f "${STATESETUP}/passt_1.pid" ] @@ -222,33 +250,54 @@ setup_two_guests() { [ ${PCAP} -eq 1 ] && __opts="${__opts} -p ${LOGDIR}/passt_2.pcap" [ ${DEBUG} -eq 1 ] && __opts="${__opts} -d" [ ${TRACE} -eq 1 ] && __opts="${__opts} --trace" + [ ${VHOST_USER} -eq 1 ] && __opts="${__opts} --vhost-user" context_run_bg passt_2 "./passt -s ${STATESETUP}/passt_2.socket -P ${STATESETUP}/passt_2.pid -f ${__opts} -t 10004 -u 10004" wait_for [ -f "${STATESETUP}/passt_2.pid" ] + __vmem="$((${MEM_KIB} / 1024 / 4))" + if [ ${VHOST_USER} -eq 1 ]; then + __vmem="$(((${__vmem} + 500) / 1000))G" + __qemu_netdev1=" \ + -chardev socket,id=c,path=${STATESETUP}/passt_1.socket \ + -netdev vhost-user,id=v,chardev=c \ + -device virtio-net,netdev=v \ + -object memory-backend-memfd,id=m,share=on,size=${__vmem} \ + -numa node,memdev=m" + __qemu_netdev2=" \ + -chardev socket,id=c,path=${STATESETUP}/passt_2.socket \ + -netdev vhost-user,id=v,chardev=c \ + -device virtio-net,netdev=v \ + -object memory-backend-memfd,id=m,share=on,size=${__vmem} \ + -numa node,memdev=m" + else + __qemu_netdev1="-device virtio-net-pci,netdev=s \ + -netdev stream,id=s,server=off,addr.type=unix,addr.path=${STATESETUP}/passt_1.socket" + __qemu_netdev2="-device virtio-net-pci,netdev=s \ + -netdev stream,id=s,server=off,addr.type=unix,addr.path=${STATESETUP}/passt_2.socket" + fi + GUEST_1_CID=94557 context_run_bg qemu_1 'qemu-system-'"${QEMU_ARCH}" \ ' -M accel=kvm:tcg' \ - ' -m '${VMEM}' -cpu host -smp '${VCPUS} \ + ' -m '${__vmem}' -cpu host -smp '${VCPUS} \ ' -kernel '"${KERNEL}" \ ' -initrd '${INITRAMFS}' -nographic -serial stdio' \ ' -nodefaults' \ ' -append "console=ttyS0 mitigations=off apparmor=0" ' \ - ' -device virtio-net-pci,netdev=s0 ' \ - " -netdev stream,id=s0,server=off,addr.type=unix,addr.path=${STATESETUP}/passt_1.socket " \ + " ${__qemu_netdev1}" \ " -pidfile 
${STATESETUP}/qemu_1.pid" \ " -device vhost-vsock-pci,guest-cid=$GUEST_1_CID" GUEST_2_CID=94558 context_run_bg qemu_2 'qemu-system-'"${QEMU_ARCH}" \ ' -M accel=kvm:tcg' \ - ' -m '${VMEM}' -cpu host -smp '${VCPUS} \ + ' -m '${__vmem}' -cpu host -smp '${VCPUS} \ ' -kernel '"${KERNEL}" \ ' -initrd '${INITRAMFS}' -nographic -serial stdio' \ ' -nodefaults' \ ' -append "console=ttyS0 mitigations=off apparmor=0" ' \ - ' -device virtio-net-pci,netdev=s0 ' \ - " -netdev stream,id=s0,server=off,addr.type=unix,addr.path=${STATESETUP}/passt_2.socket " \ + " ${__qemu_netdev2}" \ " -pidfile ${STATESETUP}/qemu_2.pid" \ " -device vhost-vsock-pci,guest-cid=$GUEST_2_CID" diff --git a/test/lib/setup_ugly b/test/lib/setup_ugly index 4b2a077..2802cc3 100755 --- a/test/lib/setup_ugly +++ b/test/lib/setup_ugly @@ -33,7 +33,7 @@ setup_memory() { pane_or_context_run guest 'qemu-system-$(uname -m)' \ ' -machine accel=kvm' \ - ' -m '${VMEM}' -cpu host -smp '${VCPUS} \ + ' -m '$((${MEM_KIB} / 1024 / 4))' -cpu host -smp '${VCPUS} \ ' -kernel ' "/boot/vmlinuz-$(uname -r)" \ ' -initrd '${INITRAMFS_MEM}' -nographic -serial stdio' \ ' -nodefaults' \ diff --git a/test/passt_vu b/test/passt_vu new file mode 120000 index 0000000..22f1840 --- /dev/null +++ b/test/passt_vu @@ -0,0 +1 @@ +passt \ No newline at end of file diff --git a/test/passt_vu_in_ns b/test/passt_vu_in_ns new file mode 120000 index 0000000..3ff479e --- /dev/null +++ b/test/passt_vu_in_ns @@ -0,0 +1 @@ +passt_in_ns \ No newline at end of file diff --git a/test/perf/passt_vu_tcp b/test/perf/passt_vu_tcp new file mode 100644 index 0000000..b434008 --- /dev/null +++ b/test/perf/passt_vu_tcp @@ -0,0 +1,211 @@ +# SPDX-License-Identifier: GPL-2.0-or-later +# +# PASST - Plug A Simple Socket Transport +# for qemu/UNIX domain socket mode +# +# PASTA - Pack A Subtle Tap Abstraction +# for network namespace/tap device mode +# +# test/perf/passt_vu_tcp - Check TCP performance in passt vhost-user mode +# +# Copyright (c) 2021 Red Hat GmbH +# 
Author: Stefano Brivio <sbrivio@redhat.com> + +gtools /sbin/sysctl ip jq nproc seq sleep iperf3 tcp_rr tcp_crr # From neper +nstools /sbin/sysctl ip jq nproc seq sleep iperf3 tcp_rr tcp_crr +htools bc head sed seq + +set MAP_NS4 192.0.2.2 +set MAP_NS6 2001:db8:9a55::2 + +test passt: throughput and latency + +guest /sbin/sysctl -w net.core.rmem_max=536870912 +guest /sbin/sysctl -w net.core.wmem_max=536870912 +guest /sbin/sysctl -w net.core.rmem_default=33554432 +guest /sbin/sysctl -w net.core.wmem_default=33554432 +guest /sbin/sysctl -w net.ipv4.tcp_rmem="4096 131072 268435456" +guest /sbin/sysctl -w net.ipv4.tcp_wmem="4096 131072 268435456" +guest /sbin/sysctl -w net.ipv4.tcp_timestamps=0 + +ns /sbin/sysctl -w net.ipv4.tcp_rmem="4096 524288 134217728" +ns /sbin/sysctl -w net.ipv4.tcp_wmem="4096 524288 134217728" +ns /sbin/sysctl -w net.ipv4.tcp_timestamps=0 + +gout IFNAME ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname' + +hout FREQ_PROCFS (echo "scale=1"; sed -n 's/cpu MHz.*: \([0-9]*\)\..*$/(\1+10^2\/2)\/10^3/p' /proc/cpuinfo) | bc -l | head -n1 +hout FREQ_CPUFREQ (echo "scale=1"; printf '( %i + 10^5 / 2 ) / 10^6\n' $(cat /sys/devices/system/cpu/cpu0/cpufreq/scaling_max_freq) ) | bc -l +hout FREQ [ -n "__FREQ_CPUFREQ__" ] && echo __FREQ_CPUFREQ__ || echo __FREQ_PROCFS__ + +set THREADS 4 +set TIME 5 +set OMIT 0.1 +set OPTS -Z -P __THREADS__ -l 1M -O__OMIT__ -N + +info Throughput in Gbps, latency in µs, __THREADS__ threads at __FREQ__ GHz +report passt_vu tcp __THREADS__ __FREQ__ + +th MTU 256B 576B 1280B 1500B 9000B 65520B + + +tr TCP throughput over IPv6: guest to host +iperf3s ns 10002 + +bw - +bw - +guest ip link set dev __IFNAME__ mtu 1280 +iperf3 BW guest __MAP_NS6__ 10002 __TIME__ __OPTS__ -w 16M +bw __BW__ 1.2 1.5 +guest ip link set dev __IFNAME__ mtu 1500 +iperf3 BW guest __MAP_NS6__ 10002 __TIME__ __OPTS__ -w 32M +bw __BW__ 1.6 1.8 +guest ip link set dev __IFNAME__ mtu 9000 +iperf3 BW guest __MAP_NS6__ 10002 __TIME__ __OPTS__ -w 64M 
+bw __BW__ 4.0 5.0 +guest ip link set dev __IFNAME__ mtu 65520 +iperf3 BW guest __MAP_NS6__ 10002 __TIME__ __OPTS__ -w 64M +bw __BW__ 7.0 8.0 + +iperf3k ns + +tl TCP RR latency over IPv6: guest to host +lat - +lat - +lat - +lat - +lat - +nsb tcp_rr --nolog -6 +gout LAT tcp_rr --nolog -l1 -6 -c -H __MAP_NS6__ | sed -n 's/^throughput=\(.*\)/\1/p' +lat __LAT__ 200 150 + +tl TCP CRR latency over IPv6: guest to host +lat - +lat - +lat - +lat - +lat - +nsb tcp_crr --nolog -6 +gout LAT tcp_crr --nolog -l1 -6 -c -H __MAP_NS6__ | sed -n 's/^throughput=\(.*\)/\1/p' +lat __LAT__ 500 400 + +tr TCP throughput over IPv4: guest to host +iperf3s ns 10002 + +guest ip link set dev __IFNAME__ mtu 256 +iperf3 BW guest __MAP_NS4__ 10002 __TIME__ __OPTS__ -w 2M +bw __BW__ 0.2 0.3 +guest ip link set dev __IFNAME__ mtu 576 +iperf3 BW guest __MAP_NS4__ 10002 __TIME__ __OPTS__ -w 4M +bw __BW__ 0.5 0.8 +guest ip link set dev __IFNAME__ mtu 1280 +iperf3 BW guest __MAP_NS4__ 10002 __TIME__ __OPTS__ -w 8M +bw __BW__ 1.2 1.5 +guest ip link set dev __IFNAME__ mtu 1500 +iperf3 BW guest __MAP_NS4__ 10002 __TIME__ __OPTS__ -w 16M +bw __BW__ 1.6 1.8 +guest ip link set dev __IFNAME__ mtu 9000 +iperf3 BW guest __MAP_NS4__ 10002 __TIME__ __OPTS__ -w 64M +bw __BW__ 4.0 5.0 +guest ip link set dev __IFNAME__ mtu 65520 +iperf3 BW guest __MAP_NS4__ 10002 __TIME__ __OPTS__ -w 64M +bw __BW__ 7.0 8.0 + +iperf3k ns + +# Reducing MTU below 1280 deconfigures IPv6, get our address back +guest dhclient -6 -x +guest dhclient -6 __IFNAME__ + +tl TCP RR latency over IPv4: guest to host +lat - +lat - +lat - +lat - +lat - +nsb tcp_rr --nolog -4 +gout LAT tcp_rr --nolog -l1 -4 -c -H __MAP_NS4__ | sed -n 's/^throughput=\(.*\)/\1/p' +lat __LAT__ 200 150 + +tl TCP CRR latency over IPv4: guest to host +lat - +lat - +lat - +lat - +lat - +nsb tcp_crr --nolog -4 +gout LAT tcp_crr --nolog -l1 -4 -c -H __MAP_NS4__ | sed -n 's/^throughput=\(.*\)/\1/p' +lat __LAT__ 500 400 + +tr TCP throughput over IPv6: host to guest +iperf3s guest 
10001 + +bw - +bw - +bw - +bw - +bw - +iperf3 BW ns ::1 10001 __TIME__ __OPTS__ -w 32M +bw __BW__ 6.0 6.8 + +iperf3k guest + +tl TCP RR latency over IPv6: host to guest +lat - +lat - +lat - +lat - +lat - +guestb tcp_rr --nolog -P 10001 -C 10011 -6 +sleep 1 +nsout LAT tcp_rr --nolog -l1 -P 10001 -C 10011 -6 -c -H ::1 | sed -n 's/^throughput=\(.*\)/\1/p' +lat __LAT__ 200 150 + +tl TCP CRR latency over IPv6: host to guest +lat - +lat - +lat - +lat - +lat - +guestb tcp_crr --nolog -P 10001 -C 10011 -6 +sleep 1 +nsout LAT tcp_crr --nolog -l1 -P 10001 -C 10011 -6 -c -H ::1 | sed -n 's/^throughput=\(.*\)/\1/p' +lat __LAT__ 500 350 + + +tr TCP throughput over IPv4: host to guest +iperf3s guest 10001 + +bw - +bw - +bw - +bw - +bw - +iperf3 BW ns 127.0.0.1 10001 __TIME__ __OPTS__ -w 32M +bw __BW__ 6.0 6.8 + +iperf3k guest + +tl TCP RR latency over IPv4: host to guest +lat - +lat - +lat - +lat - +lat - +guestb tcp_rr --nolog -P 10001 -C 10011 -4 +sleep 1 +nsout LAT tcp_rr --nolog -l1 -P 10001 -C 10011 -4 -c -H 127.0.0.1 | sed -n 's/^throughput=\(.*\)/\1/p' +lat __LAT__ 200 150 + +tl TCP CRR latency over IPv6: host to guest +lat - +lat - +lat - +lat - +lat - +guestb tcp_crr --nolog -P 10001 -C 10011 -4 +sleep 1 +nsout LAT tcp_crr --nolog -l1 -P 10001 -C 10011 -4 -c -H 127.0.0.1 | sed -n 's/^throughput=\(.*\)/\1/p' +lat __LAT__ 500 300 + +te diff --git a/test/perf/passt_vu_udp b/test/perf/passt_vu_udp new file mode 100644 index 0000000..943ac11 --- /dev/null +++ b/test/perf/passt_vu_udp @@ -0,0 +1,159 @@ +# SPDX-License-Identifier: GPL-2.0-or-later +# +# PASST - Plug A Simple Socket Transport +# for qemu/UNIX domain socket mode +# +# PASTA - Pack A Subtle Tap Abstraction +# for network namespace/tap device mode +# +# test/perf/passt_vu_udp - Check UDP performance in passt vhost-user mode +# +# Copyright (c) 2021 Red Hat GmbH +# Author: Stefano Brivio <sbrivio@redhat.com> + +gtools /sbin/sysctl ip jq nproc sleep iperf3 udp_rr # From neper +nstools ip jq sleep iperf3 udp_rr 
+htools bc head sed + +set MAP_NS4 192.0.2.2 +set MAP_NS6 2001:db8:9a55::2 + +test passt: throughput and latency + +guest /sbin/sysctl -w net.core.rmem_max=16777216 +guest /sbin/sysctl -w net.core.wmem_max=16777216 +guest /sbin/sysctl -w net.core.rmem_default=16777216 +guest /sbin/sysctl -w net.core.wmem_default=16777216 + +hout FREQ_PROCFS (echo "scale=1"; sed -n 's/cpu MHz.*: \([0-9]*\)\..*$/(\1+10^2\/2)\/10^3/p' /proc/cpuinfo) | bc -l | head -n1 +hout FREQ_CPUFREQ (echo "scale=1"; printf '( %i + 10^5 / 2 ) / 10^6\n' $(cat /sys/devices/system/cpu/cpu0/cpufreq/scaling_max_freq) ) | bc -l +hout FREQ [ -n "__FREQ_CPUFREQ__" ] && echo __FREQ_CPUFREQ__ || echo __FREQ_PROCFS__ + +set THREADS 2 +set TIME 1 +set OPTS -u -P __THREADS__ --pacing-timer 1000 + +info Throughput in Gbps, latency in µs, __THREADS__ threads at __FREQ__ GHz + +report passt_vu udp __THREADS__ __FREQ__ + +th pktlen 256B 576B 1280B 1500B 9000B 65520B + +tr UDP throughput over IPv6: guest to host +iperf3s ns 10002 +# (datagram size) = (packet size) - 48: 40 bytes of IPv6 header, 8 of UDP header + +bw - +bw - +iperf3 BW guest __MAP_NS6__ 10002 __TIME__ __OPTS__ -b 3G -l 1232 +bw __BW__ 0.8 1.2 +iperf3 BW guest __MAP_NS6__ 10002 __TIME__ __OPTS__ -b 4G -l 1452 +bw __BW__ 1.0 1.5 +iperf3 BW guest __MAP_NS6__ 10002 __TIME__ __OPTS__ -b 10G -l 8952 +bw __BW__ 4.0 5.0 +iperf3 BW guest __MAP_NS6__ 10002 __TIME__ __OPTS__ -b 20G -l 64372 +bw __BW__ 4.0 5.0 + +iperf3k ns + +tl UDP RR latency over IPv6: guest to host +lat - +lat - +lat - +lat - +lat - +nsb udp_rr --nolog -6 +gout LAT udp_rr --nolog -6 -c -H __MAP_NS6__ | sed -n 's/^throughput=\(.*\)/\1/p' +lat __LAT__ 200 150 + + +tr UDP throughput over IPv4: guest to host +iperf3s ns 10002 +# (datagram size) = (packet size) - 28: 20 bytes of IPv4 header, 8 of UDP header + +iperf3 BW guest __MAP_NS4__ 10002 __TIME__ __OPTS__ -b 1G -l 228 +bw __BW__ 0.0 0.0 +iperf3 BW guest __MAP_NS4__ 10002 __TIME__ __OPTS__ -b 2G -l 548 +bw __BW__ 0.4 0.6 +iperf3 BW guest 
__MAP_NS4__ 10002 __TIME__ __OPTS__ -b 3G -l 1252 +bw __BW__ 0.8 1.2 +iperf3 BW guest __MAP_NS4__ 10002 __TIME__ __OPTS__ -b 4G -l 1472 +bw __BW__ 1.0 1.5 +iperf3 BW guest __MAP_NS4__ 10002 __TIME__ __OPTS__ -b 10G -l 8972 +bw __BW__ 4.0 5.0 +iperf3 BW guest __MAP_NS4__ 10002 __TIME__ __OPTS__ -b 20G -l 65492 +bw __BW__ 4.0 5.0 + +iperf3k ns + +tl UDP RR latency over IPv4: guest to host +lat - +lat - +lat - +lat - +lat - +nsb udp_rr --nolog -4 +gout LAT udp_rr --nolog -4 -c -H __MAP_NS4__ | sed -n 's/^throughput=\(.*\)/\1/p' +lat __LAT__ 200 150 + + +tr UDP throughput over IPv6: host to guest +iperf3s guest 10001 +# (datagram size) = (packet size) - 48: 40 bytes of IPv6 header, 8 of UDP header + +bw - +bw - +iperf3 BW ns ::1 10001 __TIME__ __OPTS__ -b 3G -l 1232 +bw __BW__ 0.8 1.2 +iperf3 BW ns ::1 10001 __TIME__ __OPTS__ -b 4G -l 1452 +bw __BW__ 1.0 1.5 +iperf3 BW ns ::1 10001 __TIME__ __OPTS__ -b 10G -l 8952 +bw __BW__ 3.0 4.0 +iperf3 BW ns ::1 10001 __TIME__ __OPTS__ -b 20G -l 64372 +bw __BW__ 3.0 4.0 + +iperf3k guest + +tl UDP RR latency over IPv6: host to guest +lat - +lat - +lat - +lat - +lat - +guestb udp_rr --nolog -P 10001 -C 10011 -6 +sleep 1 +nsout LAT udp_rr --nolog -P 10001 -C 10011 -6 -c -H ::1 | sed -n 's/^throughput=\(.*\)/\1/p' +lat __LAT__ 200 150 + + +tr UDP throughput over IPv4: host to guest +iperf3s guest 10001 +# (datagram size) = (packet size) - 28: 20 bytes of IPv4 header, 8 of UDP header + +iperf3 BW ns 127.0.0.1 10001 __TIME__ __OPTS__ -b 1G -l 228 +bw __BW__ 0.0 0.0 +iperf3 BW ns 127.0.0.1 10001 __TIME__ __OPTS__ -b 2G -l 548 +bw __BW__ 0.4 0.6 +iperf3 BW ns 127.0.0.1 10001 __TIME__ __OPTS__ -b 3G -l 1252 +bw __BW__ 0.8 1.2 +iperf3 BW ns 127.0.0.1 10001 __TIME__ __OPTS__ -b 4G -l 1472 +bw __BW__ 1.0 1.5 +iperf3 BW ns 127.0.0.1 10001 __TIME__ __OPTS__ -b 10G -l 8972 +bw __BW__ 3.0 4.0 +iperf3 BW ns 127.0.0.1 10001 __TIME__ __OPTS__ -b 20G -l 65492 +bw __BW__ 3.0 4.0 + +iperf3k guest + +tl UDP RR latency over IPv4: host to guest +lat - 
+lat - +lat - +lat - +lat - +guestb udp_rr --nolog -P 10001 -C 10011 -4 +sleep 1 +nsout LAT udp_rr --nolog -P 10001 -C 10011 -4 -c -H 127.0.0.1 | sed -n 's/^throughput=\(.*\)/\1/p' +lat __LAT__ 200 150 + +te diff --git a/test/run b/test/run index 547a729..f188d8e 100755 --- a/test/run +++ b/test/run @@ -93,6 +93,7 @@ run() { test memory/passt teardown memory + VHOST_USER=0 setup passt test passt/ndp test passt/dhcp @@ -115,7 +116,22 @@ run() { test two_guests/basic teardown two_guests + VHOST_USER=1 + setup passt_in_ns + test passt_vu/ndp + test passt_vu_in_ns/dhcp + test passt_vu_in_ns/icmp + test passt_vu_in_ns/tcp + test passt_vu_in_ns/udp + test passt_vu_in_ns/shutdown + teardown passt_in_ns + + setup two_guests + test two_guests_vu/basic + teardown two_guests + VALGRIND=0 + VHOST_USER=0 setup passt_in_ns test passt/ndp test passt_in_ns/dhcp @@ -126,6 +142,15 @@ run() { test passt_in_ns/shutdown teardown passt_in_ns + VHOST_USER=1 + setup passt_in_ns + test passt_vu/ndp + test passt_vu_in_ns/dhcp + test perf/passt_vu_tcp + test perf/passt_vu_udp + test passt_vu_in_ns/shutdown + teardown passt_in_ns + # TODO: Make those faster by at least pre-installing gcc and make on # non-x86 images, then re-enable. skip_distro() { diff --git a/test/two_guests_vu b/test/two_guests_vu new file mode 120000 index 0000000..a8648fc --- /dev/null +++ b/test/two_guests_vu @@ -0,0 +1 @@ +two_guests \ No newline at end of file From 7e131e920c04054b9d005dac718ac54e5169fa71 Mon Sep 17 00:00:00 2001 From: David Gibson <david@gibson.dropbear.id.au> Date: Fri, 22 Nov 2024 17:43:36 +0100 Subject: [PATCH 141/382] tcp: Move tcp_l2_buf_fill_headers() to tcp_buf.c This function only has callers in tcp_buf.c. More importantly, it's inherently tied to the "buf" path, because it uses internal knowledge of how we lay out the various headers across our locally allocated buffers. Therefore, move it to tcp_buf.c. Slightly reformat the prototypes while we're at it. 
Signed-off-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Laurent Vivier <lvivier@redhat.com> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- tcp.c | 57 +++++++------------------------------------------- tcp_buf.c | 39 ++++++++++++++++++++++++++++++---- tcp_internal.h | 22 ++++++++----------- tcp_vu.c | 22 ++++++++----------- 4 files changed, 61 insertions(+), 79 deletions(-) diff --git a/tcp.c b/tcp.c index e197a1a..61c12a5 100644 --- a/tcp.c +++ b/tcp.c @@ -976,14 +976,11 @@ static void tcp_fill_header(struct tcphdr *th, * @check: Checksum, if already known * @seq: Sequence number for this segment * @no_tcp_csum: Do not set TCP checksum - * - * Return: The IPv4 payload length, host order */ -size_t tcp_fill_headers4(const struct tcp_tap_conn *conn, - struct tap_hdr *taph, - struct iphdr *iph, struct tcp_payload_t *bp, - size_t dlen, const uint16_t *check, - uint32_t seq, bool no_tcp_csum) +void tcp_fill_headers4(const struct tcp_tap_conn *conn, + struct tap_hdr *taph, struct iphdr *iph, + struct tcp_payload_t *bp, size_t dlen, + const uint16_t *check, uint32_t seq, bool no_tcp_csum) { const struct flowside *tapside = TAPFLOW(conn); const struct in_addr *src4 = inany_v4(&tapside->oaddr); @@ -1014,8 +1011,6 @@ size_t tcp_fill_headers4(const struct tcp_tap_conn *conn, } tap_hdr_update(taph, l3len + sizeof(struct ethhdr)); - - return l4len; } /** @@ -1028,13 +1023,11 @@ size_t tcp_fill_headers4(const struct tcp_tap_conn *conn, * @check: Checksum, if already known * @seq: Sequence number for this segment * @no_tcp_csum: Do not set TCP checksum - * - * Return: The IPv6 payload length, host order */ -size_t tcp_fill_headers6(const struct tcp_tap_conn *conn, - struct tap_hdr *taph, - struct ipv6hdr *ip6h, struct tcp_payload_t *bp, - size_t dlen, uint32_t seq, bool no_tcp_csum) +void tcp_fill_headers6(const struct tcp_tap_conn *conn, + struct tap_hdr *taph, struct ipv6hdr *ip6h, + struct tcp_payload_t *bp, size_t dlen, + uint32_t seq, bool 
no_tcp_csum) { const struct flowside *tapside = TAPFLOW(conn); size_t l4len = dlen + sizeof(bp->th); @@ -1065,40 +1058,6 @@ size_t tcp_fill_headers6(const struct tcp_tap_conn *conn, } tap_hdr_update(taph, l4len + sizeof(*ip6h) + sizeof(struct ethhdr)); - - return l4len; -} - -/** - * tcp_l2_buf_fill_headers() - Fill 802.3, IP, TCP headers in pre-cooked buffers - * @conn: Connection pointer - * @iov: Pointer to an array of iovec of TCP pre-cooked buffers - * @dlen: TCP payload length - * @check: Checksum, if already known - * @seq: Sequence number for this segment - * @no_tcp_csum: Do not set TCP checksum - * - * Return: IP payload length, host order - */ -size_t tcp_l2_buf_fill_headers(const struct tcp_tap_conn *conn, - struct iovec *iov, size_t dlen, - const uint16_t *check, uint32_t seq, - bool no_tcp_csum) -{ - const struct flowside *tapside = TAPFLOW(conn); - const struct in_addr *a4 = inany_v4(&tapside->oaddr); - - if (a4) { - return tcp_fill_headers4(conn, iov[TCP_IOV_TAP].iov_base, - iov[TCP_IOV_IP].iov_base, - iov[TCP_IOV_PAYLOAD].iov_base, dlen, - check, seq, no_tcp_csum); - } - - return tcp_fill_headers6(conn, iov[TCP_IOV_TAP].iov_base, - iov[TCP_IOV_IP].iov_base, - iov[TCP_IOV_PAYLOAD].iov_base, dlen, - seq, no_tcp_csum); } /** diff --git a/tcp_buf.c b/tcp_buf.c index d29c1a9..0946cd5 100644 --- a/tcp_buf.c +++ b/tcp_buf.c @@ -147,6 +147,36 @@ void tcp_payload_flush(const struct ctx *c) tcp_payload_used = 0; } +/** + * tcp_buf_fill_headers() - Fill 802.3, IP, TCP headers in pre-cooked buffers + * @conn: Connection pointer + * @iov: Pointer to an array of iovec of TCP pre-cooked buffers + * @dlen: TCP payload length + * @check: Checksum, if already known + * @seq: Sequence number for this segment + * @no_tcp_csum: Do not set TCP checksum + */ +static void tcp_l2_buf_fill_headers(const struct tcp_tap_conn *conn, + struct iovec *iov, size_t dlen, + const uint16_t *check, uint32_t seq, + bool no_tcp_csum) +{ + const struct flowside *tapside = TAPFLOW(conn); 
+ const struct in_addr *a4 = inany_v4(&tapside->oaddr); + + if (a4) { + tcp_fill_headers4(conn, iov[TCP_IOV_TAP].iov_base, + iov[TCP_IOV_IP].iov_base, + iov[TCP_IOV_PAYLOAD].iov_base, dlen, + check, seq, no_tcp_csum); + } else { + tcp_fill_headers6(conn, iov[TCP_IOV_TAP].iov_base, + iov[TCP_IOV_IP].iov_base, + iov[TCP_IOV_PAYLOAD].iov_base, dlen, + seq, no_tcp_csum); + } +} + /** * tcp_buf_send_flag() - Send segment with flags to tap (no payload) * @c: Execution context @@ -181,8 +211,10 @@ int tcp_buf_send_flag(const struct ctx *c, struct tcp_tap_conn *conn, int flags) return ret; tcp_payload_used++; - l4len = tcp_l2_buf_fill_headers(conn, iov, optlen, NULL, seq, false); + l4len = optlen + sizeof(struct tcphdr); iov[TCP_IOV_PAYLOAD].iov_len = l4len; + tcp_l2_buf_fill_headers(conn, iov, optlen, NULL, seq, false); + if (flags & DUP_ACK) { struct iovec *dup_iov = tcp_l2_iov[tcp_payload_used++]; @@ -215,7 +247,6 @@ static void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn, struct tcp_payload_t *payload; const uint16_t *check = NULL; struct iovec *iov; - size_t l4len; conn->seq_to_tap = seq + dlen; tcp_frame_conns[tcp_payload_used] = conn; @@ -238,8 +269,8 @@ static void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn, payload->th.th_x2 = 0; payload->th.th_flags = 0; payload->th.ack = 1; - l4len = tcp_l2_buf_fill_headers(conn, iov, dlen, check, seq, false); - iov[TCP_IOV_PAYLOAD].iov_len = l4len; + iov[TCP_IOV_PAYLOAD].iov_len = dlen + sizeof(struct tcphdr); + tcp_l2_buf_fill_headers(conn, iov, dlen, check, seq, false); if (++tcp_payload_used > TCP_FRAMES_MEM - 1) tcp_payload_flush(c); } diff --git a/tcp_internal.h b/tcp_internal.h index 8625eed..d7b125f 100644 --- a/tcp_internal.h +++ b/tcp_internal.h @@ -168,19 +168,15 @@ void tcp_update_check_tcp4(const struct iphdr *iph, void tcp_update_check_tcp6(const struct ipv6hdr *ip6h, const struct iovec *iov, int iov_cnt, size_t l4offset); -size_t tcp_fill_headers4(const struct tcp_tap_conn 
*conn, - struct tap_hdr *taph, - struct iphdr *iph, struct tcp_payload_t *bp, - size_t dlen, const uint16_t *check, - uint32_t seq, bool no_tcp_csum); -size_t tcp_fill_headers6(const struct tcp_tap_conn *conn, - struct tap_hdr *taph, - struct ipv6hdr *ip6h, struct tcp_payload_t *bp, - size_t dlen, uint32_t seq, bool no_tcp_csum); -size_t tcp_l2_buf_fill_headers(const struct tcp_tap_conn *conn, - struct iovec *iov, size_t dlen, - const uint16_t *check, uint32_t seq, - bool no_tcp_csum); +void tcp_fill_headers4(const struct tcp_tap_conn *conn, + struct tap_hdr *taph, struct iphdr *iph, + struct tcp_payload_t *bp, size_t dlen, + const uint16_t *check, uint32_t seq, bool no_tcp_csum); +void tcp_fill_headers6(const struct tcp_tap_conn *conn, + struct tap_hdr *taph, struct ipv6hdr *ip6h, + struct tcp_payload_t *bp, size_t dlen, + uint32_t seq, bool no_tcp_csum); + int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_tap_conn *conn, bool force_seq, struct tcp_info_linux *tinfo); int tcp_prepare_flags(const struct ctx *c, struct tcp_tap_conn *conn, diff --git a/tcp_vu.c b/tcp_vu.c index 1bebb31..f27e175 100644 --- a/tcp_vu.c +++ b/tcp_vu.c @@ -98,7 +98,7 @@ int tcp_vu_send_flag(const struct ctx *c, struct tcp_tap_conn *conn, int flags) struct vu_dev *vdev = c->vdev; struct vu_virtq *vq = &vdev->vq[VHOST_USER_RX_QUEUE]; const struct flowside *tapside = TAPFLOW(conn); - size_t l2len, l4len, optlen, hdrlen; + size_t optlen, hdrlen; struct vu_virtq_element flags_elem[2]; struct tcp_payload_t *payload; struct ipv6hdr *ip6h = NULL; @@ -157,19 +157,15 @@ int tcp_vu_send_flag(const struct ctx *c, struct tcp_tap_conn *conn, int flags) return ret; } - if (CONN_V4(conn)) { - l4len = tcp_fill_headers4(conn, NULL, iph, payload, optlen, - NULL, seq, true); - l2len = sizeof(*iph); - } else { - l4len = tcp_fill_headers6(conn, NULL, ip6h, payload, optlen, - seq, true); - l2len = sizeof(*ip6h); - } - l2len += l4len + sizeof(struct ethhdr); + flags_elem[0].in_sg[0].iov_len = hdrlen + 
optlen; + + if (CONN_V4(conn)) { + tcp_fill_headers4(conn, NULL, iph, payload, optlen, NULL, seq, + true); + } else { + tcp_fill_headers6(conn, NULL, ip6h, payload, optlen, seq, true); + } - flags_elem[0].in_sg[0].iov_len = l2len + - sizeof(struct virtio_net_hdr_mrg_rxbuf); if (*c->pcap) { tcp_vu_update_check(tapside, &flags_elem[0].in_sg[0], 1); pcap_iov(&flags_elem[0].in_sg[0], 1, From 6fae899cbbb3ec5bc9a0d5c7dde16131b2f85f05 Mon Sep 17 00:00:00 2001 From: Laurent Vivier <lvivier@redhat.com> Date: Wed, 27 Nov 2024 11:25:21 +0100 Subject: [PATCH 142/382] virtio: check if avail ring is configured If the connection to the vhost-user front end is closed during transfers virtio rings are deconfigured and not available anymore, but we can try to access them to process queued data. This can trigger a SIGSEG as we try to access unavailable memory. To fix that check vq->vring.avail is sane before accessing the vring Signed-off-by: Laurent Vivier <lvivier@redhat.com> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- virtio.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/virtio.c b/virtio.c index 6a97435..0598ff4 100644 --- a/virtio.c +++ b/virtio.c @@ -284,6 +284,9 @@ static int virtqueue_read_next_desc(const struct vring_desc *desc, */ bool vu_queue_empty(struct vu_virtq *vq) { + if (!vq->vring.avail) + return true; + if (vq->shadow_avail_idx != vq->last_avail_idx) return false; @@ -327,6 +330,9 @@ static bool vring_can_notify(const struct vu_dev *dev, struct vu_virtq *vq) */ void vu_queue_notify(const struct vu_dev *dev, struct vu_virtq *vq) { + if (!vq->vring.avail) + return; + if (!vring_can_notify(dev, vq)) { debug("vhost-user: virtqueue can skip notify..."); return; @@ -502,6 +508,9 @@ int vu_queue_pop(struct vu_dev *dev, struct vu_virtq *vq, struct vu_virtq_elemen unsigned int head; int ret; + if (!vq->vring.avail) + return -1; + if (vu_queue_empty(vq)) return -1; @@ -591,6 +600,9 @@ void vu_queue_fill_by_index(struct vu_virtq *vq, unsigned 
int index, { struct vring_used_elem uelem; + if (!vq->vring.avail) + return; + idx = (idx + vq->used_idx) % vq->vring.num; uelem.id = htole32(index); @@ -633,6 +645,9 @@ void vu_queue_flush(struct vu_virtq *vq, unsigned int count) { uint16_t old, new; + if (!vq->vring.avail) + return; + /* Make sure buffer is written before we update index. */ smp_wmb(); From 00cc2303fd6ac4b72c19d1741dff72fd42c09a47 Mon Sep 17 00:00:00 2001 From: Laurent Vivier <lvivier@redhat.com> Date: Wed, 27 Nov 2024 12:15:51 +0100 Subject: [PATCH 143/382] Fix build on 32bit target MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix the following errors when built with CFLAGS="-m32 -U__AVX2__": packet.c:57:23: warning: format ‘%lu’ expects argument of type ‘long unsigned int’, but argument 5 has type ‘size_t’ {aka ‘unsigned int’} [-Wformat=] 57 | trace("packet offset plus length %lu from size %lu, " 58 | "%s:%i", start - p->buf + len + offset, | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | | | size_t {aka unsigned int} packet.c:57:23: warning: format ‘%lu’ expects argument of type ‘long unsigned int’, but argument 6 has type ‘size_t’ {aka ‘unsigned int’} [-Wformat=] 57 | trace("packet offset plus length %lu from size %lu, " | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 58 | "%s:%i", start - p->buf + len + offset, 59 | p->buf_size, func, line); | ~~~~~~~~~~~ | | | size_t {aka unsigned int} vhost_user.c:139:32: warning: cast to pointer from integer of different size [-Wint-to-pointer-cast] 139 | return (void *)(qemu_addr - r->qva + r->mmap_addr + | ^ vhost_user.c:439:32: warning: cast to pointer from integer of different size [-Wint-to-pointer-cast] 439 | munmap((void *)r->mmap_addr, r->size + r->mmap_offset); | ^ vhost_user.c:900:32: warning: cast to pointer from integer of different size [-Wint-to-pointer-cast] 900 | munmap((void *)r->mmap_addr, r->size + r->mmap_offset); | ^ virtio.c:111:32: warning: cast to pointer from integer of different size 
[-Wint-to-pointer-cast] 111 | return (void *)(guest_addr - r->gpa + r->mmap_addr + | ^ vu_common.c:37:27: warning: cast to pointer from integer of different size [-Wint-to-pointer-cast] 37 | char *m = (char *)dev_region->mmap_addr; | ^ Signed-off-by: Laurent Vivier <lvivier@redhat.com> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- packet.c | 2 +- vhost_user.c | 11 +++++++---- virtio.c | 5 +++-- vu_common.c | 2 +- 4 files changed, 12 insertions(+), 8 deletions(-) diff --git a/packet.c b/packet.c index e5a78d0..03a11e6 100644 --- a/packet.c +++ b/packet.c @@ -54,7 +54,7 @@ static int packet_check_range(const struct pool *p, size_t offset, size_t len, } if (start + len + offset > p->buf + p->buf_size) { - trace("packet offset plus length %lu from size %lu, " + trace("packet offset plus length %zu from size %zu, " "%s:%i", start - p->buf + len + offset, p->buf_size, func, line); return -1; diff --git a/vhost_user.c b/vhost_user.c index 51c90db..4b8558f 100644 --- a/vhost_user.c +++ b/vhost_user.c @@ -136,8 +136,9 @@ static void *qva_to_va(struct vu_dev *dev, uint64_t qemu_addr) if ((qemu_addr >= r->qva) && (qemu_addr < (r->qva + r->size))) { /* NOLINTNEXTLINE(performance-no-int-to-ptr) */ - return (void *)(qemu_addr - r->qva + r->mmap_addr + - r->mmap_offset); + return (void *)(uintptr_t)(qemu_addr - r->qva + + r->mmap_addr + + r->mmap_offset); } } @@ -436,7 +437,8 @@ static bool vu_set_mem_table_exec(struct vu_dev *vdev, if (r->mmap_addr) { /* NOLINTNEXTLINE(performance-no-int-to-ptr) */ - munmap((void *)r->mmap_addr, r->size + r->mmap_offset); + munmap((void *)(uintptr_t)r->mmap_addr, + r->size + r->mmap_offset); } } vdev->nregions = memory->nregions; @@ -897,7 +899,8 @@ void vu_cleanup(struct vu_dev *vdev) if (r->mmap_addr) { /* NOLINTNEXTLINE(performance-no-int-to-ptr) */ - munmap((void *)r->mmap_addr, r->size + r->mmap_offset); + munmap((void *)(uintptr_t)r->mmap_addr, + r->size + r->mmap_offset); } } vdev->nregions = 0; diff --git a/virtio.c b/virtio.c 
index 0598ff4..a76de5e 100644 --- a/virtio.c +++ b/virtio.c @@ -108,8 +108,9 @@ static void *vu_gpa_to_va(struct vu_dev *dev, uint64_t *plen, uint64_t guest_add if ((guest_addr + *plen) > (r->gpa + r->size)) *plen = r->gpa + r->size - guest_addr; /* NOLINTNEXTLINE(performance-no-int-to-ptr) */ - return (void *)(guest_addr - r->gpa + r->mmap_addr + - r->mmap_offset); + return (void *)(uintptr_t)(guest_addr - r->gpa + + r->mmap_addr + + r->mmap_offset); } } diff --git a/vu_common.c b/vu_common.c index f2eb701..299b5a3 100644 --- a/vu_common.c +++ b/vu_common.c @@ -35,7 +35,7 @@ int vu_packet_check_range(void *buf, size_t offset, size_t len, for (dev_region = buf; dev_region->mmap_addr; dev_region++) { /* NOLINTNEXTLINE(performance-no-int-to-ptr) */ - char *m = (char *)dev_region->mmap_addr; + char *m = (char *)(uintptr_t)dev_region->mmap_addr; if (m <= start && start + offset + len <= m + dev_region->mmap_offset + From 804a7ce94a14fbc4dee0a14b2c5f7a72ebb8bff6 Mon Sep 17 00:00:00 2001 From: Stefano Brivio <sbrivio@redhat.com> Date: Wed, 27 Nov 2024 15:37:01 +0100 Subject: [PATCH 144/382] tcp_vu: Change 'dlen' to ssize_t in tcp_vu_data_from_sock() ...to quickly suppress a false positive from Coverity, which assumes that iov_size is 0 and 'dlen' might overflow as a result (with hdrlen being 66). An ASSERT() in tcp_vu_sock_recv() already guarantees that iov_size(iov, buf_cnt) here is anyway greater than 'hdrlen'. 
Signed-off-by: Stefano Brivio <sbrivio@redhat.com> Reviewed-by: Laurent Vivier <lvivier@redhat.com> --- tcp_vu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tcp_vu.c b/tcp_vu.c index f27e175..bbae918 100644 --- a/tcp_vu.c +++ b/tcp_vu.c @@ -463,7 +463,7 @@ int tcp_vu_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn) for (i = 0, check = NULL; i < head_cnt; i++) { struct iovec *iov = &elem[head[i]].in_sg[0]; int buf_cnt = head[i + 1] - head[i]; - int dlen = iov_size(iov, buf_cnt) - hdrlen; + ssize_t dlen = iov_size(iov, buf_cnt) - hdrlen; vu_set_vnethdr(vdev, iov->iov_base, buf_cnt); From f9311031713ab8f18e9c872a42a8f6a9935954ec Mon Sep 17 00:00:00 2001 From: David Gibson <david@gibson.dropbear.id.au> Date: Wed, 27 Nov 2024 14:54:04 +1100 Subject: [PATCH 145/382] iov: iov tail helpers In the vhost-user code we have a number of places where we need to locate a particular header within the guest-supplied IO vector. We need to work out which buffer the header is in, and verify that it's contiguous and aligned as we need. At the moment this is open-coded, but introduce a helper to make this more straightforward. We add a new datatype 'struct iov_tail' representing an IO vector from which we've logically consumed some number of headers. The IOV_REMOVE_HEADER macro consumes a new header from the vector, returning a pointer and updating the iov_tail.
Signed-off-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- iov.c | 93 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ iov.h | 76 ++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 169 insertions(+) diff --git a/iov.c b/iov.c index 3741db2..4c6416c 100644 --- a/iov.c +++ b/iov.c @@ -155,3 +155,96 @@ size_t iov_size(const struct iovec *iov, size_t iov_cnt) return len; } + +/** + * iov_tail_prune() - Remove any unneeded buffers from an IOV tail + * @tail: IO vector tail (modified) + * + * If an IOV tail's offset is large enough, it may not include any bytes from + * the first (or first several) buffers in the underlying IO vector. Modify the + * tail's representation so it contains the same logical bytes, but only + * includes buffers that are actually needed. This will avoid stepping through + * unnecessary elements of the underlying IO vector on future operations. + * + * Return: true if the tail still contains any bytes, otherwise false + */ +bool iov_tail_prune(struct iov_tail *tail) +{ + size_t i; + + i = iov_skip_bytes(tail->iov, tail->cnt, tail->off, &tail->off); + tail->iov += i; + tail->cnt -= i; + + return !!tail->cnt; +} + +/** + * iov_tail_size - Calculate the total size of an IO vector tail + * @tail: IO vector tail + * + * Returns: The total size in bytes. + */ +/* cppcheck-suppress unusedFunction */ +size_t iov_tail_size(struct iov_tail *tail) +{ + iov_tail_prune(tail); + return iov_size(tail->iov, tail->cnt) - tail->off; +} + +/** + * iov_peek_header_() - Get pointer to a header from an IOV tail + * @tail: IOV tail to get header from + * @len: Length of header to get, in bytes + * @align: Required alignment of header, in bytes + * + * @tail may be pruned, but will represent the same bytes as before. + * + * Returns: Pointer to the first @len logical bytes of the tail, NULL if that + * overruns the IO vector, is not contiguous or doesn't have the + * requested alignment. 
+ */ +void *iov_peek_header_(struct iov_tail *tail, size_t len, size_t align) +{ + char *p; + + if (!iov_tail_prune(tail)) + return NULL; /* Nothing left */ + + if (tail->off + len < tail->off) + return NULL; /* Overflow */ + + if (tail->off + len > tail->iov[0].iov_len) + return NULL; /* Not contiguous */ + + p = (char *)tail->iov[0].iov_base + tail->off; + if ((uintptr_t)p % align) + return NULL; /* not aligned */ + + return p; +} + +/** + * iov_remove_header_() - Remove a header from an IOV tail + * @tail: IOV tail to remove header from (modified) + * @len: Length of header to remove, in bytes + * @align: Required alignment of header, in bytes + * + * On success, @tail is updated so that it longer includes the bytes of the + * returned header. + * + * Returns: Pointer to the first @len logical bytes of the tail, NULL if that + * overruns the IO vector, is not contiguous or doesn't have the + * requested alignment. + */ +/* cppcheck-suppress unusedFunction */ +void *iov_remove_header_(struct iov_tail *tail, size_t len, size_t align) +{ + char *p = iov_peek_header_(tail, len, align); + + if (!p) + return NULL; + + tail->off = tail->off + len; + return p; +} diff --git a/iov.h b/iov.h index a9e1722..9855bf0 100644 --- a/iov.h +++ b/iov.h @@ -28,4 +28,80 @@ size_t iov_from_buf(const struct iovec *iov, size_t iov_cnt, size_t iov_to_buf(const struct iovec *iov, size_t iov_cnt, size_t offset, void *buf, size_t bytes); size_t iov_size(const struct iovec *iov, size_t iov_cnt); + +/* + * DOC: Theory of Operation, struct iov_tail + * + * Sometimes a single logical network frame is split across multiple buffers, + * represented by an IO vector (struct iovec[]). We often want to process this + * one header / network layer at a time. So, it's useful to maintain a "tail" + * of the vector representing the parts we haven't yet extracted. 
+ * + * The headers we extract need not line up with buffer boundaries (though we do + * assume they're contiguous within a single buffer for now). So, we could + * represent that tail as another struct iovec[], but that would mean copying + * the whole array of struct iovecs, just so we can adjust the offset and length + * on the first one. + * + * So, instead represent the tail as pointer into an existing struct iovec[], + * with an explicit offset for where the "tail" starts within it. If we extract + * enough headers that some buffers of the original vector no longer contain + * part of the tail, we (lazily) advance our struct iovec * to the first buffer + * we still need, and adjust the vector length and offset to match. + */ + +/** + * struct iov_tail - An IO vector which may have some headers logically removed + * @iov: IO vector + * @cnt: Number of entries in @iov + * @off: Current offset in @iov + */ +struct iov_tail { + const struct iovec *iov; + size_t cnt, off; +}; + +/** + * IOV_TAIL() - Create a new IOV tail + * @iov_: IO vector to create tail from + * @cnt_: Length of the IO vector at @iov_ + * @off_: Byte offset in the IO vector where the tail begins + */ +#define IOV_TAIL(iov_, cnt_, off_) \ + (struct iov_tail){ .iov = (iov_), .cnt = (cnt_), .off = (off_) } + +bool iov_tail_prune(struct iov_tail *tail); +size_t iov_tail_size(struct iov_tail *tail); +void *iov_peek_header_(struct iov_tail *tail, size_t len, size_t align); +void *iov_remove_header_(struct iov_tail *tail, size_t len, size_t align); + +/** + * IOV_PEEK_HEADER() - Get typed pointer to a header from an IOV tail + * @tail_: IOV tail to get header from + * @type_: Data type of the header + * + * @tail_ may be pruned, but will represent the same bytes as before. + * + * Returns: Pointer of type (@type_ *) located at the start of @tail_, NULL if + * we can't get a contiguous and aligned pointer. 
+ */ +#define IOV_PEEK_HEADER(tail_, type_) \ + ((type_ *)(iov_peek_header_((tail_), \ + sizeof(type_), __alignof__(type_)))) + +/** + * IOV_REMOVE_HEADER() - Remove and return typed header from an IOV tail + * @tail_: IOV tail to remove header from (modified) + * @type_: Data type of the header to remove + * + * On success, @tail_ is updated so that it longer includes the bytes of the + * returned header. + * + * Returns: Pointer of type (@type_ *) located at the old start of @tail_, NULL + * if we can't get a contiguous and aligned pointer. + */ +#define IOV_REMOVE_HEADER(tail_, type_) \ + ((type_ *)(iov_remove_header_((tail_), \ + sizeof(type_), __alignof__(type_)))) + #endif /* IOVEC_H */ From 67151090bc349d9eec5a0b303d0cb3347b755251 Mon Sep 17 00:00:00 2001 From: David Gibson <david@gibson.dropbear.id.au> Date: Wed, 27 Nov 2024 14:54:05 +1100 Subject: [PATCH 146/382] iov, checksum: Replace csum_iov() with csum_iov_tail() We usually want to checksum only the tail part of a frame, excluding at least some headers. csum_iov() does that for a frame represented as an IO vector, not actually summing the entire IO vector. We now have struct iov_tail to explicitly represent this construct, so replace csum_iov() with csum_iov_tail() taking that representation rather than 3 parameters. We propagate the same change to csum_udp4() and csum_udp6() which take similar parameters. This slightly simplifies the code, and will allow some further simplifications as struct iov_tail is more widely used. 
Signed-off-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- checksum.c | 56 +++++++++++++++++++++--------------------------------- checksum.h | 8 ++++---- iov.c | 1 - tap.c | 6 ++++-- tcp.c | 6 ++++-- udp.c | 7 ++++--- udp_vu.c | 9 +++++---- 7 files changed, 43 insertions(+), 50 deletions(-) diff --git a/checksum.c b/checksum.c index c673993..1c4354d 100644 --- a/checksum.c +++ b/checksum.c @@ -166,24 +166,22 @@ uint32_t proto_ipv4_header_psum(uint16_t l4len, uint8_t protocol, * @udp4hr: UDP header, initialised apart from checksum * @saddr: IPv4 source address * @daddr: IPv4 destination address - * @iov: Pointer to the array of IO vectors - * @iov_cnt: Length of the array - * @offset: UDP payload offset in the iovec array + * @data: UDP payload (as IO vector tail) */ void csum_udp4(struct udphdr *udp4hr, struct in_addr saddr, struct in_addr daddr, - const struct iovec *iov, int iov_cnt, size_t offset) + struct iov_tail *data) { /* UDP checksums are optional, so don't bother */ udp4hr->check = 0; if (UDP4_REAL_CHECKSUMS) { - uint16_t l4len = iov_size(iov, iov_cnt) - offset + - sizeof(struct udphdr); + uint16_t l4len = iov_tail_size(data) + sizeof(struct udphdr); uint32_t psum = proto_ipv4_header_psum(l4len, IPPROTO_UDP, saddr, daddr); + psum = csum_unfolded(udp4hr, sizeof(struct udphdr), psum); - udp4hr->check = csum_iov(iov, iov_cnt, offset, psum); + udp4hr->check = csum_iov_tail(data, psum); } } @@ -231,22 +229,20 @@ uint32_t proto_ipv6_header_psum(uint16_t payload_len, uint8_t protocol, * @udp6hr: UDP header, initialised apart from checksum * @saddr: Source address * @daddr: Destination address - * @iov: Pointer to the array of IO vectors - * @iov_cnt: Length of the array - * @offset: UDP payload offset in the iovec array + * @data: UDP payload (as IO vector tail) */ void csum_udp6(struct udphdr *udp6hr, const struct in6_addr *saddr, const struct in6_addr *daddr, - const struct iovec *iov, int iov_cnt, size_t 
offset) + struct iov_tail *data) { - uint16_t l4len = iov_size(iov, iov_cnt) - offset + - sizeof(struct udphdr); + uint16_t l4len = iov_tail_size(data) + sizeof(struct udphdr); uint32_t psum = proto_ipv6_header_psum(l4len, IPPROTO_UDP, saddr, daddr); + udp6hr->check = 0; psum = csum_unfolded(udp6hr, sizeof(struct udphdr), psum); - udp6hr->check = csum_iov(iov, iov_cnt, offset, psum); + udp6hr->check = csum_iov_tail(data, psum); } /** @@ -501,31 +497,23 @@ uint16_t csum(const void *buf, size_t len, uint32_t init) } /** - * csum_iov() - Calculates the unfolded checksum over an array of IO vectors - * - * @iov Pointer to the array of IO vectors - * @n Length of the array - * @offset: Offset of the data to checksum within the full data length + * csum_iov_tail() - Calculate unfolded checksum for the tail of an IO vector + * @tail: IO vector tail to checksum * @init Initial 32-bit checksum, 0 for no pre-computed checksum * * Return: 16-bit folded, complemented checksum */ -uint16_t csum_iov(const struct iovec *iov, size_t n, size_t offset, - uint32_t init) +uint16_t csum_iov_tail(struct iov_tail *tail, uint32_t init) { - unsigned int i; - size_t first; - - i = iov_skip_bytes(iov, n, offset, &first); - if (i >= n) - return (uint16_t)~csum_fold(init); - - init = csum_unfolded((char *)iov[i].iov_base + first, - iov[i].iov_len - first, init); - i++; - - for (; i < n; i++) - init = csum_unfolded(iov[i].iov_base, iov[i].iov_len, init); + if (iov_tail_prune(tail)) { + size_t i; + init = csum_unfolded((char *)tail->iov[0].iov_base + tail->off, + tail->iov[0].iov_len - tail->off, init); + for (i = 1; i < tail->cnt; i++) { + const struct iovec *iov = &tail->iov[i]; + init = csum_unfolded(iov->iov_base, iov->iov_len, init); + } + } return (uint16_t)~csum_fold(init); } diff --git a/checksum.h b/checksum.h index 31ba322..e243c97 100644 --- a/checksum.h +++ b/checksum.h @@ -9,6 +9,7 @@ struct udphdr; struct icmphdr; struct icmp6hdr; +struct iov_tail; uint32_t sum_16b(const void *buf, 
size_t len); uint16_t csum_fold(uint32_t sum); @@ -19,20 +20,19 @@ uint32_t proto_ipv4_header_psum(uint16_t l4len, uint8_t protocol, struct in_addr saddr, struct in_addr daddr); void csum_udp4(struct udphdr *udp4hr, struct in_addr saddr, struct in_addr daddr, - const struct iovec *iov, int iov_cnt, size_t offset); + struct iov_tail *data); void csum_icmp4(struct icmphdr *icmp4hr, const void *payload, size_t dlen); uint32_t proto_ipv6_header_psum(uint16_t payload_len, uint8_t protocol, const struct in6_addr *saddr, const struct in6_addr *daddr); void csum_udp6(struct udphdr *udp6hr, const struct in6_addr *saddr, const struct in6_addr *daddr, - const struct iovec *iov, int iov_cnt, size_t offset); + struct iov_tail *data); void csum_icmp6(struct icmp6hdr *icmp6hr, const struct in6_addr *saddr, const struct in6_addr *daddr, const void *payload, size_t dlen); uint32_t csum_unfolded(const void *buf, size_t len, uint32_t init); uint16_t csum(const void *buf, size_t len, uint32_t init); -uint16_t csum_iov(const struct iovec *iov, size_t n, size_t offset, - uint32_t init); +uint16_t csum_iov_tail(struct iov_tail *tail, uint32_t init); #endif /* CHECKSUM_H */ diff --git a/iov.c b/iov.c index 4c6416c..2f7be15 100644 --- a/iov.c +++ b/iov.c @@ -185,7 +185,6 @@ bool iov_tail_prune(struct iov_tail *tail) * * Returns: The total size in bytes. 
*/ -/* cppcheck-suppress unusedFunction */ size_t iov_tail_size(struct iov_tail *tail) { iov_tail_prune(tail); diff --git a/tap.c b/tap.c index cde1719..c418064 100644 --- a/tap.c +++ b/tap.c @@ -184,11 +184,12 @@ void tap_udp4_send(const struct ctx *c, struct in_addr src, in_port_t sport, .iov_base = (void *)in, .iov_len = dlen }; + struct iov_tail payload = IOV_TAIL(&iov, 1, 0); uh->source = htons(sport); uh->dest = htons(dport); uh->len = htons(l4len); - csum_udp4(uh, src, dst, &iov, 1, 0); + csum_udp4(uh, src, dst, &payload); memcpy(data, in, dlen); tap_send_single(c, buf, dlen + (data - buf)); @@ -271,11 +272,12 @@ void tap_udp6_send(const struct ctx *c, .iov_base = in, .iov_len = dlen }; + struct iov_tail payload = IOV_TAIL(&iov, 1, 0); uh->source = htons(sport); uh->dest = htons(dport); uh->len = htons(l4len); - csum_udp6(uh, src, dst, &iov, 1, 0); + csum_udp6(uh, src, dst, &payload); memcpy(data, in, dlen); tap_send_single(c, buf, dlen + (data - buf)); diff --git a/tcp.c b/tcp.c index 61c12a5..f334ca5 100644 --- a/tcp.c +++ b/tcp.c @@ -764,6 +764,7 @@ void tcp_update_check_tcp4(const struct iphdr *iph, size_t l4offset) { uint16_t l4len = ntohs(iph->tot_len) - sizeof(struct iphdr); + struct iov_tail l4 = IOV_TAIL(iov, iov_cnt, l4offset); struct in_addr saddr = { .s_addr = iph->saddr }; struct in_addr daddr = { .s_addr = iph->daddr }; size_t check_ofs; @@ -801,7 +802,7 @@ void tcp_update_check_tcp4(const struct iphdr *iph, check = (uint16_t *)ptr; *check = 0; - *check = csum_iov(iov, iov_cnt, l4offset, sum); + *check = csum_iov_tail(&l4, sum); } /** @@ -815,6 +816,7 @@ void tcp_update_check_tcp6(const struct ipv6hdr *ip6h, const struct iovec *iov, int iov_cnt, size_t l4offset) { + struct iov_tail l4 = IOV_TAIL(iov, iov_cnt, l4offset); uint16_t l4len = ntohs(ip6h->payload_len); size_t check_ofs; uint16_t *check; @@ -852,7 +854,7 @@ void tcp_update_check_tcp6(const struct ipv6hdr *ip6h, check = (uint16_t *)ptr; *check = 0; - *check = csum_iov(iov, iov_cnt, 
l4offset, sum); + *check = csum_iov_tail(&l4, sum); } /** diff --git a/udp.c b/udp.c index 5b0093a..c89f031 100644 --- a/udp.c +++ b/udp.c @@ -316,7 +316,8 @@ size_t udp_update_hdr4(struct iphdr *ip4h, struct udp_payload_t *bp, .iov_base = bp->data, .iov_len = dlen }; - csum_udp4(&bp->uh, *src, *dst, &iov, 1, 0); + struct iov_tail data = IOV_TAIL(&iov, 1, 0); + csum_udp4(&bp->uh, *src, *dst, &data); } return l4len; @@ -360,8 +361,8 @@ size_t udp_update_hdr6(struct ipv6hdr *ip6h, struct udp_payload_t *bp, .iov_base = bp->data, .iov_len = dlen }; - csum_udp6(&bp->uh, &toside->oaddr.a6, &toside->eaddr.a6, - &iov, 1, 0); + struct iov_tail data = IOV_TAIL(&iov, 1, 0); + csum_udp6(&bp->uh, &toside->oaddr.a6, &toside->eaddr.a6, &data); } return l4len; diff --git a/udp_vu.c b/udp_vu.c index c911022..9c697f3 100644 --- a/udp_vu.c +++ b/udp_vu.c @@ -199,15 +199,16 @@ static void udp_vu_csum(const struct flowside *toside, int iov_used) const struct in_addr *dst4 = inany_v4(&toside->eaddr); char *base = iov_vu[0].iov_base; struct udp_payload_t *bp; + struct iov_tail data; if (src4 && dst4) { bp = vu_payloadv4(base); - csum_udp4(&bp->uh, *src4, *dst4, iov_vu, iov_used, - (char *)&bp->data - base); + data = IOV_TAIL(iov_vu, iov_used, (char *)&bp->data - base); + csum_udp4(&bp->uh, *src4, *dst4, &data); } else { bp = vu_payloadv6(base); - csum_udp6(&bp->uh, &toside->oaddr.a6, &toside->eaddr.a6, - iov_vu, iov_used, (char *)&bp->data - base); + data = IOV_TAIL(iov_vu, iov_used, (char *)&bp->data - base); + csum_udp6(&bp->uh, &toside->oaddr.a6, &toside->eaddr.a6, &data); } } From 2ee07697c4ab4f4efff6431aaa787f21bcc6f1d1 Mon Sep 17 00:00:00 2001 From: David Gibson <david@gibson.dropbear.id.au> Date: Wed, 27 Nov 2024 14:54:06 +1100 Subject: [PATCH 147/382] tcp: Pass TCP header and payload separately to tcp_update_check_tcp[46]() Currently these expects both the TCP header and payload in a single IOV, and goes to some trouble to locate the checksum field within it. 
In the current caller we've already know where the TCP header is, so we might as well just pass it in. This will need to work a bit differently for vhost-user, but that code already needs to locate the TCP header for other reasons, so again we can just pass it in. Signed-off-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- tcp.c | 106 ++++++++++--------------------------------------- tcp_internal.h | 10 ++--- tcp_vu.c | 12 ++++-- 3 files changed, 34 insertions(+), 94 deletions(-) diff --git a/tcp.c b/tcp.c index f334ca5..5c40e18 100644 --- a/tcp.c +++ b/tcp.c @@ -755,106 +755,42 @@ static void tcp_sock_set_bufsize(const struct ctx *c, int s) /** * tcp_update_check_tcp4() - Calculate TCP checksum for IPv4 * @iph: IPv4 header - * @iov: Pointer to the array of IO vectors - * @iov_cnt: Length of the array - * @l4offset: IPv4 payload offset in the iovec array + * @th: TCP header (updated) + * @payload: TCP payload */ -void tcp_update_check_tcp4(const struct iphdr *iph, - const struct iovec *iov, int iov_cnt, - size_t l4offset) +void tcp_update_check_tcp4(const struct iphdr *iph, struct tcphdr *th, + struct iov_tail *payload) { uint16_t l4len = ntohs(iph->tot_len) - sizeof(struct iphdr); - struct iov_tail l4 = IOV_TAIL(iov, iov_cnt, l4offset); struct in_addr saddr = { .s_addr = iph->saddr }; struct in_addr daddr = { .s_addr = iph->daddr }; - size_t check_ofs; - uint16_t *check; - int check_idx; uint32_t sum; - char *ptr; sum = proto_ipv4_header_psum(l4len, IPPROTO_TCP, saddr, daddr); - check_idx = iov_skip_bytes(iov, iov_cnt, - l4offset + offsetof(struct tcphdr, check), - &check_ofs); - - if (check_idx >= iov_cnt) { - err("TCP4 buffer is too small, iov size %zd, check offset %zd", - iov_size(iov, iov_cnt), - l4offset + offsetof(struct tcphdr, check)); - return; - } - - if (check_ofs + sizeof(*check) > iov[check_idx].iov_len) { - err("TCP4 checksum field memory is not contiguous " - "check_ofs %zd check_idx %d iov_len 
%zd", - check_ofs, check_idx, iov[check_idx].iov_len); - return; - } - - ptr = (char *)iov[check_idx].iov_base + check_ofs; - if ((uintptr_t)ptr & (__alignof__(*check) - 1)) { - err("TCP4 checksum field is not correctly aligned in memory"); - return; - } - - check = (uint16_t *)ptr; - - *check = 0; - *check = csum_iov_tail(&l4, sum); + th->check = 0; + sum = csum_unfolded(th, sizeof(*th), sum); + th->check = csum_iov_tail(payload, sum); } /** * tcp_update_check_tcp6() - Calculate TCP checksum for IPv6 * @ip6h: IPv6 header - * @iov: Pointer to the array of IO vectors - * @iov_cnt: Length of the array - * @l4offset: IPv6 payload offset in the iovec array + * @th: TCP header (updated) + * @payload: TCP payload */ -void tcp_update_check_tcp6(const struct ipv6hdr *ip6h, - const struct iovec *iov, int iov_cnt, - size_t l4offset) +void tcp_update_check_tcp6(const struct ipv6hdr *ip6h, struct tcphdr *th, + struct iov_tail *payload) { - struct iov_tail l4 = IOV_TAIL(iov, iov_cnt, l4offset); uint16_t l4len = ntohs(ip6h->payload_len); - size_t check_ofs; - uint16_t *check; - int check_idx; uint32_t sum; - char *ptr; sum = proto_ipv6_header_psum(l4len, IPPROTO_TCP, &ip6h->saddr, &ip6h->daddr); - check_idx = iov_skip_bytes(iov, iov_cnt, - l4offset + offsetof(struct tcphdr, check), - &check_ofs); - - if (check_idx >= iov_cnt) { - err("TCP6 buffer is too small, iov size %zd, check offset %zd", - iov_size(iov, iov_cnt), - l4offset + offsetof(struct tcphdr, check)); - return; - } - - if (check_ofs + sizeof(*check) > iov[check_idx].iov_len) { - err("TCP6 checksum field memory is not contiguous " - "check_ofs %zd check_idx %d iov_len %zd", - check_ofs, check_idx, iov[check_idx].iov_len); - return; - } - - ptr = (char *)iov[check_idx].iov_base + check_ofs; - if ((uintptr_t)ptr & (__alignof__(*check) - 1)) { - err("TCP6 checksum field is not correctly aligned in memory"); - return; - } - - check = (uint16_t *)ptr; - - *check = 0; - *check = csum_iov_tail(&l4, sum); + th->check = 0; + 
sum = csum_unfolded(th, sizeof(*th), sum); + th->check = csum_iov_tail(payload, sum); } /** @@ -1005,11 +941,12 @@ void tcp_fill_headers4(const struct tcp_tap_conn *conn, bp->th.check = 0; } else { const struct iovec iov = { - .iov_base = bp, - .iov_len = ntohs(iph->tot_len) - sizeof(struct iphdr), + .iov_base = bp->data, + .iov_len = dlen, }; + struct iov_tail payload = IOV_TAIL(&iov, 1, 0); - tcp_update_check_tcp4(iph, &iov, 1, 0); + tcp_update_check_tcp4(iph, &bp->th, &payload); } tap_hdr_update(taph, l3len + sizeof(struct ethhdr)); @@ -1052,11 +989,12 @@ void tcp_fill_headers6(const struct tcp_tap_conn *conn, bp->th.check = 0; } else { const struct iovec iov = { - .iov_base = bp, - .iov_len = ntohs(ip6h->payload_len) + .iov_base = bp->data, + .iov_len = dlen, }; + struct iov_tail payload = IOV_TAIL(&iov, 1, 0); - tcp_update_check_tcp6(ip6h, &iov, 1, 0); + tcp_update_check_tcp6(ip6h, &bp->th, &payload); } tap_hdr_update(taph, l4len + sizeof(*ip6h) + sizeof(struct ethhdr)); diff --git a/tcp_internal.h b/tcp_internal.h index d7b125f..744c5c0 100644 --- a/tcp_internal.h +++ b/tcp_internal.h @@ -162,12 +162,10 @@ void tcp_rst_do(const struct ctx *c, struct tcp_tap_conn *conn); struct tcp_info_linux; -void tcp_update_check_tcp4(const struct iphdr *iph, - const struct iovec *iov, int iov_cnt, - size_t l4offset); -void tcp_update_check_tcp6(const struct ipv6hdr *ip6h, - const struct iovec *iov, int iov_cnt, - size_t l4offset); +void tcp_update_check_tcp4(const struct iphdr *iph, struct tcphdr *th, + struct iov_tail *payload); +void tcp_update_check_tcp6(const struct ipv6hdr *ip6h, struct tcphdr *th, + struct iov_tail *payload); void tcp_fill_headers4(const struct tcp_tap_conn *conn, struct tap_hdr *taph, struct iphdr *iph, struct tcp_payload_t *bp, size_t dlen, diff --git a/tcp_vu.c b/tcp_vu.c index bbae918..134650e 100644 --- a/tcp_vu.c +++ b/tcp_vu.c @@ -73,15 +73,19 @@ static void tcp_vu_update_check(const struct flowside *tapside, char *base = iov[0].iov_base; if 
(inany_v4(&tapside->oaddr)) { + struct tcphdr *th = vu_payloadv4(base); const struct iphdr *iph = vu_ip(base); + struct iov_tail payload = IOV_TAIL(iov, iov_cnt, + (char *)(th + 1) - base); - tcp_update_check_tcp4(iph, iov, iov_cnt, - (char *)vu_payloadv4(base) - base); + tcp_update_check_tcp4(iph, th, &payload); } else { + struct tcphdr *th = vu_payloadv6(base); const struct ipv6hdr *ip6h = vu_ip(base); + struct iov_tail payload = IOV_TAIL(iov, iov_cnt, + (char *)(th + 1) - base); - tcp_update_check_tcp6(ip6h, iov, iov_cnt, - (char *)vu_payloadv6(base) - base); + tcp_update_check_tcp6(ip6h, th, &payload); } } From 08ea3cc581beed16afff3fa934f31cbdb82cbb95 Mon Sep 17 00:00:00 2001 From: David Gibson <david@gibson.dropbear.id.au> Date: Wed, 27 Nov 2024 14:54:07 +1100 Subject: [PATCH 148/382] tcp: Pass TCP header and payload separately to tcp_fill_headers[46]() At the moment these take separate pointers to the tap specific and IP headers, but expect the TCP header and payload as a single tcp_payload_t. As well as being slightly inconsistent, this involves some slightly iffy pointer shenanigans when called on the flags path with a tcp_flags_t instead of a tcp_payload_t. More importantly, it's inconvenient for the upcoming vhost-user case, where the TCP header and payload might not be contiguous. Furthermore, the payload itself might not be contiguous. So, pass the TCP header as its own pointer, and the TCP payload as an IO vector. 
Signed-off-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- iov.c | 1 - tcp.c | 50 +++++++++++++++------------------------ tcp_buf.c | 22 ++++++++---------- tcp_internal.h | 4 ++-- tcp_vu.c | 63 ++++++++++++++++++++++++++++---------------------- 5 files changed, 65 insertions(+), 75 deletions(-) diff --git a/iov.c b/iov.c index 2f7be15..3b12272 100644 --- a/iov.c +++ b/iov.c @@ -236,7 +236,6 @@ void *iov_peek_header_(struct iov_tail *tail, size_t len, size_t align) * overruns the IO vector, is not contiguous or doesn't have the * requested alignment. */ -/* cppcheck-suppress unusedFunction */ void *iov_remove_header_(struct iov_tail *tail, size_t len, size_t align) { char *p = iov_peek_header_(tail, len, align); diff --git a/tcp.c b/tcp.c index 5c40e18..2f900fc 100644 --- a/tcp.c +++ b/tcp.c @@ -909,21 +909,21 @@ static void tcp_fill_header(struct tcphdr *th, * @conn: Connection pointer * @taph: tap backend specific header * @iph: Pointer to IPv4 header - * @bp: Pointer to TCP header followed by TCP payload - * @dlen: TCP payload length + * @th: Pointer to TCP header + * @payload: TCP payload * @check: Checksum, if already known * @seq: Sequence number for this segment * @no_tcp_csum: Do not set TCP checksum */ void tcp_fill_headers4(const struct tcp_tap_conn *conn, struct tap_hdr *taph, struct iphdr *iph, - struct tcp_payload_t *bp, size_t dlen, + struct tcphdr *th, struct iov_tail *payload, const uint16_t *check, uint32_t seq, bool no_tcp_csum) { const struct flowside *tapside = TAPFLOW(conn); const struct in_addr *src4 = inany_v4(&tapside->oaddr); const struct in_addr *dst4 = inany_v4(&tapside->eaddr); - size_t l4len = dlen + sizeof(bp->th); + size_t l4len = iov_tail_size(payload) + sizeof(*th); size_t l3len = l4len + sizeof(*iph); ASSERT(src4 && dst4); @@ -935,19 +935,12 @@ void tcp_fill_headers4(const struct tcp_tap_conn *conn, iph->check = check ? 
*check : csum_ip4_header(l3len, IPPROTO_TCP, *src4, *dst4); - tcp_fill_header(&bp->th, conn, seq); + tcp_fill_header(th, conn, seq); - if (no_tcp_csum) { - bp->th.check = 0; - } else { - const struct iovec iov = { - .iov_base = bp->data, - .iov_len = dlen, - }; - struct iov_tail payload = IOV_TAIL(&iov, 1, 0); - - tcp_update_check_tcp4(iph, &bp->th, &payload); - } + if (no_tcp_csum) + th->check = 0; + else + tcp_update_check_tcp4(iph, th, payload); tap_hdr_update(taph, l3len + sizeof(struct ethhdr)); } @@ -957,19 +950,19 @@ void tcp_fill_headers4(const struct tcp_tap_conn *conn, * @conn: Connection pointer * @taph: tap backend specific header * @ip6h: Pointer to IPv6 header - * @bp: Pointer to TCP header followed by TCP payload - * @dlen: TCP payload length + * @th: Pointer to TCP header + * @payload: TCP payload * @check: Checksum, if already known * @seq: Sequence number for this segment * @no_tcp_csum: Do not set TCP checksum */ void tcp_fill_headers6(const struct tcp_tap_conn *conn, struct tap_hdr *taph, struct ipv6hdr *ip6h, - struct tcp_payload_t *bp, size_t dlen, + struct tcphdr *th, struct iov_tail *payload, uint32_t seq, bool no_tcp_csum) { + size_t l4len = iov_tail_size(payload) + sizeof(*th); const struct flowside *tapside = TAPFLOW(conn); - size_t l4len = dlen + sizeof(bp->th); ip6h->payload_len = htons(l4len); ip6h->saddr = tapside->oaddr.a6; @@ -983,19 +976,12 @@ void tcp_fill_headers6(const struct tcp_tap_conn *conn, ip6h->flow_lbl[1] = (conn->sock >> 8) & 0xff; ip6h->flow_lbl[2] = (conn->sock >> 0) & 0xff; - tcp_fill_header(&bp->th, conn, seq); + tcp_fill_header(th, conn, seq); - if (no_tcp_csum) { - bp->th.check = 0; - } else { - const struct iovec iov = { - .iov_base = bp->data, - .iov_len = dlen, - }; - struct iov_tail payload = IOV_TAIL(&iov, 1, 0); - - tcp_update_check_tcp6(ip6h, &bp->th, &payload); - } + if (no_tcp_csum) + th->check = 0; + else + tcp_update_check_tcp6(ip6h, th, payload); tap_hdr_update(taph, l4len + sizeof(*ip6h) + 
sizeof(struct ethhdr)); } diff --git a/tcp_buf.c b/tcp_buf.c index 0946cd5..830c23d 100644 --- a/tcp_buf.c +++ b/tcp_buf.c @@ -151,29 +151,27 @@ void tcp_payload_flush(const struct ctx *c) * tcp_buf_fill_headers() - Fill 802.3, IP, TCP headers in pre-cooked buffers * @conn: Connection pointer * @iov: Pointer to an array of iovec of TCP pre-cooked buffers - * @dlen: TCP payload length * @check: Checksum, if already known * @seq: Sequence number for this segment * @no_tcp_csum: Do not set TCP checksum */ static void tcp_l2_buf_fill_headers(const struct tcp_tap_conn *conn, - struct iovec *iov, size_t dlen, - const uint16_t *check, uint32_t seq, - bool no_tcp_csum) + struct iovec *iov, const uint16_t *check, + uint32_t seq, bool no_tcp_csum) { + struct iov_tail tail = IOV_TAIL(&iov[TCP_IOV_PAYLOAD], 1, 0); + struct tcphdr *th = IOV_REMOVE_HEADER(&tail, struct tcphdr); const struct flowside *tapside = TAPFLOW(conn); const struct in_addr *a4 = inany_v4(&tapside->oaddr); if (a4) { tcp_fill_headers4(conn, iov[TCP_IOV_TAP].iov_base, - iov[TCP_IOV_IP].iov_base, - iov[TCP_IOV_PAYLOAD].iov_base, dlen, - check, seq, no_tcp_csum); + iov[TCP_IOV_IP].iov_base, th, + &tail, check, seq, no_tcp_csum); } else { tcp_fill_headers6(conn, iov[TCP_IOV_TAP].iov_base, - iov[TCP_IOV_IP].iov_base, - iov[TCP_IOV_PAYLOAD].iov_base, dlen, - seq, no_tcp_csum); + iov[TCP_IOV_IP].iov_base, th, + &tail, seq, no_tcp_csum); } } @@ -213,7 +211,7 @@ int tcp_buf_send_flag(const struct ctx *c, struct tcp_tap_conn *conn, int flags) tcp_payload_used++; l4len = optlen + sizeof(struct tcphdr); iov[TCP_IOV_PAYLOAD].iov_len = l4len; - tcp_l2_buf_fill_headers(conn, iov, optlen, NULL, seq, false); + tcp_l2_buf_fill_headers(conn, iov, NULL, seq, false); if (flags & DUP_ACK) { struct iovec *dup_iov = tcp_l2_iov[tcp_payload_used++]; @@ -270,7 +268,7 @@ static void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn, payload->th.th_flags = 0; payload->th.ack = 1; iov[TCP_IOV_PAYLOAD].iov_len = dlen + 
sizeof(struct tcphdr); - tcp_l2_buf_fill_headers(conn, iov, dlen, check, seq, false); + tcp_l2_buf_fill_headers(conn, iov, check, seq, false); if (++tcp_payload_used > TCP_FRAMES_MEM - 1) tcp_payload_flush(c); } diff --git a/tcp_internal.h b/tcp_internal.h index 744c5c0..9732b5b 100644 --- a/tcp_internal.h +++ b/tcp_internal.h @@ -168,11 +168,11 @@ void tcp_update_check_tcp6(const struct ipv6hdr *ip6h, struct tcphdr *th, struct iov_tail *payload); void tcp_fill_headers4(const struct tcp_tap_conn *conn, struct tap_hdr *taph, struct iphdr *iph, - struct tcp_payload_t *bp, size_t dlen, + struct tcphdr *th, struct iov_tail *payload, const uint16_t *check, uint32_t seq, bool no_tcp_csum); void tcp_fill_headers6(const struct tcp_tap_conn *conn, struct tap_hdr *taph, struct ipv6hdr *ip6h, - struct tcp_payload_t *bp, size_t dlen, + struct tcphdr *th, struct iov_tail *payload, uint32_t seq, bool no_tcp_csum); int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_tap_conn *conn, diff --git a/tcp_vu.c b/tcp_vu.c index 134650e..470649e 100644 --- a/tcp_vu.c +++ b/tcp_vu.c @@ -104,10 +104,12 @@ int tcp_vu_send_flag(const struct ctx *c, struct tcp_tap_conn *conn, int flags) const struct flowside *tapside = TAPFLOW(conn); size_t optlen, hdrlen; struct vu_virtq_element flags_elem[2]; - struct tcp_payload_t *payload; struct ipv6hdr *ip6h = NULL; struct iovec flags_iov[2]; + struct tcp_syn_opts *opts; struct iphdr *iph = NULL; + struct iov_tail payload; + struct tcphdr *th; struct ethhdr *eh; uint32_t seq; int elem_cnt; @@ -139,35 +141,35 @@ int tcp_vu_send_flag(const struct ctx *c, struct tcp_tap_conn *conn, int flags) iph = vu_ip(flags_elem[0].in_sg[0].iov_base); *iph = (struct iphdr)L2_BUF_IP4_INIT(IPPROTO_TCP); - payload = vu_payloadv4(flags_elem[0].in_sg[0].iov_base); + th = vu_payloadv4(flags_elem[0].in_sg[0].iov_base); } else { eh->h_proto = htons(ETH_P_IPV6); ip6h = vu_ip(flags_elem[0].in_sg[0].iov_base); *ip6h = (struct ipv6hdr)L2_BUF_IP6_INIT(IPPROTO_TCP); - payload = 
vu_payloadv6(flags_elem[0].in_sg[0].iov_base); + th = vu_payloadv6(flags_elem[0].in_sg[0].iov_base); } - memset(&payload->th, 0, sizeof(payload->th)); - payload->th.doff = offsetof(struct tcp_payload_t, data) / 4; - payload->th.ack = 1; + memset(th, 0, sizeof(*th)); + th->doff = sizeof(*th) / 4; + th->ack = 1; seq = conn->seq_to_tap; - ret = tcp_prepare_flags(c, conn, flags, &payload->th, - (struct tcp_syn_opts *)payload->data, - &optlen); + opts = (struct tcp_syn_opts *)(th + 1); + ret = tcp_prepare_flags(c, conn, flags, th, opts, &optlen); if (ret <= 0) { vu_queue_rewind(vq, 1); return ret; } flags_elem[0].in_sg[0].iov_len = hdrlen + optlen; + payload = IOV_TAIL(flags_elem[0].in_sg, 1, hdrlen); if (CONN_V4(conn)) { - tcp_fill_headers4(conn, NULL, iph, payload, optlen, NULL, seq, - true); + tcp_fill_headers4(conn, NULL, iph, th, &payload, + NULL, seq, true); } else { - tcp_fill_headers6(conn, NULL, ip6h, payload, optlen, seq, true); + tcp_fill_headers6(conn, NULL, ip6h, th, &payload, seq, true); } if (*c->pcap) { @@ -317,23 +319,28 @@ static ssize_t tcp_vu_sock_recv(const struct ctx *c, * tcp_vu_prepare() - Prepare the frame header * @c: Execution context * @conn: Connection pointer - * @first: Pointer to the array of IO vectors - * @dlen: Packet data length + * @iov: Pointer to the array of IO vectors + * @iov_cnt: Number of entries in @iov * @check: Checksum, if already known */ -static void tcp_vu_prepare(const struct ctx *c, - struct tcp_tap_conn *conn, char *base, - size_t dlen, const uint16_t **check) +static void tcp_vu_prepare(const struct ctx *c, struct tcp_tap_conn *conn, + struct iovec *iov, size_t iov_cnt, + const uint16_t **check) { const struct flowside *toside = TAPFLOW(conn); - struct tcp_payload_t *payload; + bool v6 = !(inany_v4(&toside->eaddr) && inany_v4(&toside->oaddr)); + size_t hdrlen = tcp_vu_hdrlen(v6); + struct iov_tail payload = IOV_TAIL(iov, iov_cnt, hdrlen); + char *base = iov[0].iov_base; struct ipv6hdr *ip6h = NULL; struct iphdr *iph 
= NULL; + struct tcphdr *th; struct ethhdr *eh; /* we guess the first iovec provided by the guest can embed * all the headers needed by L2 frame */ + ASSERT(iov[0].iov_len >= hdrlen); eh = vu_eth(base); @@ -342,31 +349,31 @@ static void tcp_vu_prepare(const struct ctx *c, /* initialize header */ - if (inany_v4(&toside->eaddr) && inany_v4(&toside->oaddr)) { + if (!v6) { eh->h_proto = htons(ETH_P_IP); iph = vu_ip(base); *iph = (struct iphdr)L2_BUF_IP4_INIT(IPPROTO_TCP); - payload = vu_payloadv4(base); + th = vu_payloadv4(base); } else { eh->h_proto = htons(ETH_P_IPV6); ip6h = vu_ip(base); *ip6h = (struct ipv6hdr)L2_BUF_IP6_INIT(IPPROTO_TCP); - payload = vu_payloadv6(base); + th = vu_payloadv6(base); } - memset(&payload->th, 0, sizeof(payload->th)); - payload->th.doff = offsetof(struct tcp_payload_t, data) / 4; - payload->th.ack = 1; + memset(th, 0, sizeof(*th)); + th->doff = sizeof(*th) / 4; + th->ack = 1; - if (inany_v4(&toside->eaddr) && inany_v4(&toside->oaddr)) { - tcp_fill_headers4(conn, NULL, iph, payload, dlen, + if (!v6) { + tcp_fill_headers4(conn, NULL, iph, th, &payload, *check, conn->seq_to_tap, true); *check = &iph->check; } else { - tcp_fill_headers6(conn, NULL, ip6h, payload, dlen, + tcp_fill_headers6(conn, NULL, ip6h, th, &payload, conn->seq_to_tap, true); } } @@ -478,7 +485,7 @@ int tcp_vu_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn) if (i + 1 == head_cnt) check = NULL; - tcp_vu_prepare(c, conn, iov->iov_base, dlen, &check); + tcp_vu_prepare(c, conn, iov, buf_cnt, &check); if (*c->pcap) { tcp_vu_update_check(tapside, iov, buf_cnt); From 2abf5ab7f3734eae9377cfab4759ae83fabf3a7e Mon Sep 17 00:00:00 2001 From: David Gibson <david@gibson.dropbear.id.au> Date: Wed, 27 Nov 2024 14:54:08 +1100 Subject: [PATCH 149/382] tcp: Merge tcp_update_check_tcp[46]() The only reason we need separate functions for the IPv4 and IPv6 case is to calculate the checksum of the IP pseudo-header, which is different for the two cases. 
However, the caller already knows which path it's on and can access the values needed for the pseudo-header partial sum more easily than tcp_update_check_tcp[46]() can. So, merge these functions into a single tcp_update_csum() function that just takes the pseudo-header partial sum, calculated in the caller. Signed-off-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- tcp.c | 59 +++++++++++++++++--------------------------------- tcp_internal.h | 6 ++--- tcp_vu.c | 22 ++++++++++++------- 3 files changed, 36 insertions(+), 51 deletions(-) diff --git a/tcp.c b/tcp.c index 2f900fc..482e460 100644 --- a/tcp.c +++ b/tcp.c @@ -753,44 +753,16 @@ static void tcp_sock_set_bufsize(const struct ctx *c, int s) } /** - * tcp_update_check_tcp4() - Calculate TCP checksum for IPv4 - * @iph: IPv4 header + * tcp_update_csum() - Calculate TCP checksum + * @psum: Unfolded partial checksum of the IPv4 or IPv6 pseudo-header * @th: TCP header (updated) * @payload: TCP payload */ -void tcp_update_check_tcp4(const struct iphdr *iph, struct tcphdr *th, - struct iov_tail *payload) +void tcp_update_csum(uint32_t psum, struct tcphdr *th, struct iov_tail *payload) { - uint16_t l4len = ntohs(iph->tot_len) - sizeof(struct iphdr); - struct in_addr saddr = { .s_addr = iph->saddr }; - struct in_addr daddr = { .s_addr = iph->daddr }; - uint32_t sum; - - sum = proto_ipv4_header_psum(l4len, IPPROTO_TCP, saddr, daddr); - th->check = 0; - sum = csum_unfolded(th, sizeof(*th), sum); - th->check = csum_iov_tail(payload, sum); -} - -/** - * tcp_update_check_tcp6() - Calculate TCP checksum for IPv6 - * @ip6h: IPv6 header - * @th: TCP header (updated) - * @payload: TCP payload - */ -void tcp_update_check_tcp6(const struct ipv6hdr *ip6h, struct tcphdr *th, - struct iov_tail *payload) -{ - uint16_t l4len = ntohs(ip6h->payload_len); - uint32_t sum; - - sum = proto_ipv6_header_psum(l4len, IPPROTO_TCP, &ip6h->saddr, - &ip6h->daddr); - - th->check = 0; - sum = 
csum_unfolded(th, sizeof(*th), sum); - th->check = csum_iov_tail(payload, sum); + psum = csum_unfolded(th, sizeof(*th), psum); + th->check = csum_iov_tail(payload, psum); } /** @@ -937,10 +909,14 @@ void tcp_fill_headers4(const struct tcp_tap_conn *conn, tcp_fill_header(th, conn, seq); - if (no_tcp_csum) + if (no_tcp_csum) { th->check = 0; - else - tcp_update_check_tcp4(iph, th, payload); + } else { + uint32_t psum = proto_ipv4_header_psum(l4len, IPPROTO_TCP, + *src4, *dst4); + + tcp_update_csum(psum, th, payload); + } tap_hdr_update(taph, l3len + sizeof(struct ethhdr)); } @@ -978,10 +954,15 @@ void tcp_fill_headers6(const struct tcp_tap_conn *conn, tcp_fill_header(th, conn, seq); - if (no_tcp_csum) + if (no_tcp_csum) { th->check = 0; - else - tcp_update_check_tcp6(ip6h, th, payload); + } else { + uint32_t psum = proto_ipv6_header_psum(l4len, IPPROTO_TCP, + &ip6h->saddr, + &ip6h->daddr); + + tcp_update_csum(psum, th, payload); + } tap_hdr_update(taph, l4len + sizeof(*ip6h) + sizeof(struct ethhdr)); } diff --git a/tcp_internal.h b/tcp_internal.h index 9732b5b..cff06e0 100644 --- a/tcp_internal.h +++ b/tcp_internal.h @@ -162,10 +162,8 @@ void tcp_rst_do(const struct ctx *c, struct tcp_tap_conn *conn); struct tcp_info_linux; -void tcp_update_check_tcp4(const struct iphdr *iph, struct tcphdr *th, - struct iov_tail *payload); -void tcp_update_check_tcp6(const struct ipv6hdr *ip6h, struct tcphdr *th, - struct iov_tail *payload); +void tcp_update_csum(uint32_t psum, struct tcphdr *th, + struct iov_tail *payload); void tcp_fill_headers4(const struct tcp_tap_conn *conn, struct tap_hdr *taph, struct iphdr *iph, struct tcphdr *th, struct iov_tail *payload, diff --git a/tcp_vu.c b/tcp_vu.c index 470649e..a3d2e7d 100644 --- a/tcp_vu.c +++ b/tcp_vu.c @@ -71,22 +71,28 @@ static void tcp_vu_update_check(const struct flowside *tapside, struct iovec *iov, int iov_cnt) { char *base = iov[0].iov_base; + struct iov_tail payload; + struct tcphdr *th; + uint32_t psum; if 
(inany_v4(&tapside->oaddr)) { - struct tcphdr *th = vu_payloadv4(base); + const struct in_addr *src4 = inany_v4(&tapside->oaddr); + const struct in_addr *dst4 = inany_v4(&tapside->eaddr); const struct iphdr *iph = vu_ip(base); - struct iov_tail payload = IOV_TAIL(iov, iov_cnt, - (char *)(th + 1) - base); + size_t l4len = ntohs(iph->tot_len) - sizeof(*iph); - tcp_update_check_tcp4(iph, th, &payload); + th = vu_payloadv4(base); + psum = proto_ipv4_header_psum(l4len, IPPROTO_TCP, *src4, *dst4); } else { - struct tcphdr *th = vu_payloadv6(base); const struct ipv6hdr *ip6h = vu_ip(base); - struct iov_tail payload = IOV_TAIL(iov, iov_cnt, - (char *)(th + 1) - base); + size_t l4len = ntohs(ip6h->payload_len); - tcp_update_check_tcp6(ip6h, th, &payload); + th = vu_payloadv6(base); + psum = proto_ipv6_header_psum(l4len, IPPROTO_TCP, + &ip6h->saddr, &ip6h->daddr); } + payload = IOV_TAIL(iov, iov_cnt, (char *)(th + 1) - base); + tcp_update_csum(psum, th, &payload); } /** From a6348cad51398346b1ce1009be87a718b8f72bba Mon Sep 17 00:00:00 2001 From: David Gibson <david@gibson.dropbear.id.au> Date: Wed, 27 Nov 2024 14:54:09 +1100 Subject: [PATCH 150/382] tcp: Merge tcp_fill_headers[46]() with each other We have different versions of this function for IPv4 and IPv6, but the caller already requires some IP version specific code to get the right header pointers. Instead, have a common function that fills either an IPv4 or an IPv6 header based on which header pointer it is passed. This allows us to remove a small amount of code duplication and make a few slightly ugly conditionals. 
Signed-off-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- tcp.c | 124 ++++++++++++++++++++++--------------------------- tcp_buf.c | 19 ++++---- tcp_internal.h | 13 ++---- tcp_vu.c | 32 +++++-------- 4 files changed, 83 insertions(+), 105 deletions(-) diff --git a/tcp.c b/tcp.c index 482e460..1872ccb 100644 --- a/tcp.c +++ b/tcp.c @@ -877,96 +877,84 @@ static void tcp_fill_header(struct tcphdr *th, } /** - * tcp_fill_headers4() - Fill 802.3, IPv4, TCP headers in pre-cooked buffers + * tcp_fill_headers() - Fill 802.3, IP, TCP headers * @conn: Connection pointer * @taph: tap backend specific header - * @iph: Pointer to IPv4 header + * @ip4h: Pointer to IPv4 header, or NULL + * @ip6h: Pointer to IPv6 header, or NULL * @th: Pointer to TCP header * @payload: TCP payload - * @check: Checksum, if already known + * @ip4_check: IPv4 checksum, if already known * @seq: Sequence number for this segment * @no_tcp_csum: Do not set TCP checksum */ -void tcp_fill_headers4(const struct tcp_tap_conn *conn, - struct tap_hdr *taph, struct iphdr *iph, - struct tcphdr *th, struct iov_tail *payload, - const uint16_t *check, uint32_t seq, bool no_tcp_csum) +void tcp_fill_headers(const struct tcp_tap_conn *conn, + struct tap_hdr *taph, + struct iphdr *ip4h, struct ipv6hdr *ip6h, + struct tcphdr *th, struct iov_tail *payload, + const uint16_t *ip4_check, uint32_t seq, bool no_tcp_csum) { const struct flowside *tapside = TAPFLOW(conn); - const struct in_addr *src4 = inany_v4(&tapside->oaddr); - const struct in_addr *dst4 = inany_v4(&tapside->eaddr); size_t l4len = iov_tail_size(payload) + sizeof(*th); - size_t l3len = l4len + sizeof(*iph); + size_t l3len = l4len; + uint32_t psum = 0; - ASSERT(src4 && dst4); + if (ip4h) { + const struct in_addr *src4 = inany_v4(&tapside->oaddr); + const struct in_addr *dst4 = inany_v4(&tapside->eaddr); - iph->tot_len = htons(l3len); - iph->saddr = src4->s_addr; - iph->daddr = dst4->s_addr; + ASSERT(src4 && 
dst4); - iph->check = check ? *check : - csum_ip4_header(l3len, IPPROTO_TCP, *src4, *dst4); + l3len += + sizeof(*ip4h); + + ip4h->tot_len = htons(l3len); + ip4h->saddr = src4->s_addr; + ip4h->daddr = dst4->s_addr; + + if (ip4_check) + ip4h->check = *ip4_check; + else + ip4h->check = csum_ip4_header(l3len, IPPROTO_TCP, + *src4, *dst4); + + if (!no_tcp_csum) { + psum = proto_ipv4_header_psum(l4len, IPPROTO_TCP, + *src4, *dst4); + } + } + + if (ip6h) { + l3len += sizeof(*ip6h); + + ip6h->payload_len = htons(l4len); + ip6h->saddr = tapside->oaddr.a6; + ip6h->daddr = tapside->eaddr.a6; + + ip6h->hop_limit = 255; + ip6h->version = 6; + ip6h->nexthdr = IPPROTO_TCP; + + ip6h->flow_lbl[0] = (conn->sock >> 16) & 0xf; + ip6h->flow_lbl[1] = (conn->sock >> 8) & 0xff; + ip6h->flow_lbl[2] = (conn->sock >> 0) & 0xff; + + if (!no_tcp_csum) { + psum = proto_ipv6_header_psum(l4len, IPPROTO_TCP, + &ip6h->saddr, + &ip6h->daddr); + } + } tcp_fill_header(th, conn, seq); - if (no_tcp_csum) { + if (no_tcp_csum) th->check = 0; - } else { - uint32_t psum = proto_ipv4_header_psum(l4len, IPPROTO_TCP, - *src4, *dst4); - + else tcp_update_csum(psum, th, payload); - } tap_hdr_update(taph, l3len + sizeof(struct ethhdr)); } -/** - * tcp_fill_headers6() - Fill 802.3, IPv6, TCP headers in pre-cooked buffers - * @conn: Connection pointer - * @taph: tap backend specific header - * @ip6h: Pointer to IPv6 header - * @th: Pointer to TCP header - * @payload: TCP payload - * @check: Checksum, if already known - * @seq: Sequence number for this segment - * @no_tcp_csum: Do not set TCP checksum - */ -void tcp_fill_headers6(const struct tcp_tap_conn *conn, - struct tap_hdr *taph, struct ipv6hdr *ip6h, - struct tcphdr *th, struct iov_tail *payload, - uint32_t seq, bool no_tcp_csum) -{ - size_t l4len = iov_tail_size(payload) + sizeof(*th); - const struct flowside *tapside = TAPFLOW(conn); - - ip6h->payload_len = htons(l4len); - ip6h->saddr = tapside->oaddr.a6; - ip6h->daddr = tapside->eaddr.a6; - - 
ip6h->hop_limit = 255; - ip6h->version = 6; - ip6h->nexthdr = IPPROTO_TCP; - - ip6h->flow_lbl[0] = (conn->sock >> 16) & 0xf; - ip6h->flow_lbl[1] = (conn->sock >> 8) & 0xff; - ip6h->flow_lbl[2] = (conn->sock >> 0) & 0xff; - - tcp_fill_header(th, conn, seq); - - if (no_tcp_csum) { - th->check = 0; - } else { - uint32_t psum = proto_ipv6_header_psum(l4len, IPPROTO_TCP, - &ip6h->saddr, - &ip6h->daddr); - - tcp_update_csum(psum, th, payload); - } - - tap_hdr_update(taph, l4len + sizeof(*ip6h) + sizeof(struct ethhdr)); -} - /** * tcp_update_seqack_wnd() - Update ACK sequence and window to guest/tap * @c: Execution context diff --git a/tcp_buf.c b/tcp_buf.c index 830c23d..a975a55 100644 --- a/tcp_buf.c +++ b/tcp_buf.c @@ -161,18 +161,19 @@ static void tcp_l2_buf_fill_headers(const struct tcp_tap_conn *conn, { struct iov_tail tail = IOV_TAIL(&iov[TCP_IOV_PAYLOAD], 1, 0); struct tcphdr *th = IOV_REMOVE_HEADER(&tail, struct tcphdr); + struct tap_hdr *taph = iov[TCP_IOV_TAP].iov_base; const struct flowside *tapside = TAPFLOW(conn); const struct in_addr *a4 = inany_v4(&tapside->oaddr); + struct ipv6hdr *ip6h = NULL; + struct iphdr *ip4h = NULL; - if (a4) { - tcp_fill_headers4(conn, iov[TCP_IOV_TAP].iov_base, - iov[TCP_IOV_IP].iov_base, th, - &tail, check, seq, no_tcp_csum); - } else { - tcp_fill_headers6(conn, iov[TCP_IOV_TAP].iov_base, - iov[TCP_IOV_IP].iov_base, th, - &tail, seq, no_tcp_csum); - } + if (a4) + ip4h = iov[TCP_IOV_IP].iov_base; + else + ip6h = iov[TCP_IOV_IP].iov_base; + + tcp_fill_headers(conn, taph, ip4h, ip6h, th, &tail, + check, seq, no_tcp_csum); } /** diff --git a/tcp_internal.h b/tcp_internal.h index cff06e0..94e5780 100644 --- a/tcp_internal.h +++ b/tcp_internal.h @@ -164,14 +164,11 @@ struct tcp_info_linux; void tcp_update_csum(uint32_t psum, struct tcphdr *th, struct iov_tail *payload); -void tcp_fill_headers4(const struct tcp_tap_conn *conn, - struct tap_hdr *taph, struct iphdr *iph, - struct tcphdr *th, struct iov_tail *payload, - const uint16_t 
*check, uint32_t seq, bool no_tcp_csum); -void tcp_fill_headers6(const struct tcp_tap_conn *conn, - struct tap_hdr *taph, struct ipv6hdr *ip6h, - struct tcphdr *th, struct iov_tail *payload, - uint32_t seq, bool no_tcp_csum); +void tcp_fill_headers(const struct tcp_tap_conn *conn, + struct tap_hdr *taph, + struct iphdr *ip4h, struct ipv6hdr *ip6h, + struct tcphdr *th, struct iov_tail *payload, + const uint16_t *ip4_check, uint32_t seq, bool no_tcp_csum); int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_tap_conn *conn, bool force_seq, struct tcp_info_linux *tinfo); diff --git a/tcp_vu.c b/tcp_vu.c index a3d2e7d..db2c64d 100644 --- a/tcp_vu.c +++ b/tcp_vu.c @@ -111,9 +111,9 @@ int tcp_vu_send_flag(const struct ctx *c, struct tcp_tap_conn *conn, int flags) size_t optlen, hdrlen; struct vu_virtq_element flags_elem[2]; struct ipv6hdr *ip6h = NULL; + struct iphdr *ip4h = NULL; struct iovec flags_iov[2]; struct tcp_syn_opts *opts; - struct iphdr *iph = NULL; struct iov_tail payload; struct tcphdr *th; struct ethhdr *eh; @@ -144,8 +144,8 @@ int tcp_vu_send_flag(const struct ctx *c, struct tcp_tap_conn *conn, int flags) if (CONN_V4(conn)) { eh->h_proto = htons(ETH_P_IP); - iph = vu_ip(flags_elem[0].in_sg[0].iov_base); - *iph = (struct iphdr)L2_BUF_IP4_INIT(IPPROTO_TCP); + ip4h = vu_ip(flags_elem[0].in_sg[0].iov_base); + *ip4h = (struct iphdr)L2_BUF_IP4_INIT(IPPROTO_TCP); th = vu_payloadv4(flags_elem[0].in_sg[0].iov_base); } else { @@ -171,12 +171,8 @@ int tcp_vu_send_flag(const struct ctx *c, struct tcp_tap_conn *conn, int flags) flags_elem[0].in_sg[0].iov_len = hdrlen + optlen; payload = IOV_TAIL(flags_elem[0].in_sg, 1, hdrlen); - if (CONN_V4(conn)) { - tcp_fill_headers4(conn, NULL, iph, th, &payload, - NULL, seq, true); - } else { - tcp_fill_headers6(conn, NULL, ip6h, th, &payload, seq, true); - } + tcp_fill_headers(conn, NULL, ip4h, ip6h, th, &payload, + NULL, seq, true); if (*c->pcap) { tcp_vu_update_check(tapside, &flags_elem[0].in_sg[0], 1); @@ -339,7 +335,7 
@@ static void tcp_vu_prepare(const struct ctx *c, struct tcp_tap_conn *conn, struct iov_tail payload = IOV_TAIL(iov, iov_cnt, hdrlen); char *base = iov[0].iov_base; struct ipv6hdr *ip6h = NULL; - struct iphdr *iph = NULL; + struct iphdr *ip4h = NULL; struct tcphdr *th; struct ethhdr *eh; @@ -358,8 +354,8 @@ static void tcp_vu_prepare(const struct ctx *c, struct tcp_tap_conn *conn, if (!v6) { eh->h_proto = htons(ETH_P_IP); - iph = vu_ip(base); - *iph = (struct iphdr)L2_BUF_IP4_INIT(IPPROTO_TCP); + ip4h = vu_ip(base); + *ip4h = (struct iphdr)L2_BUF_IP4_INIT(IPPROTO_TCP); th = vu_payloadv4(base); } else { eh->h_proto = htons(ETH_P_IPV6); @@ -374,14 +370,10 @@ static void tcp_vu_prepare(const struct ctx *c, struct tcp_tap_conn *conn, th->doff = sizeof(*th) / 4; th->ack = 1; - if (!v6) { - tcp_fill_headers4(conn, NULL, iph, th, &payload, - *check, conn->seq_to_tap, true); - *check = &iph->check; - } else { - tcp_fill_headers6(conn, NULL, ip6h, th, &payload, - conn->seq_to_tap, true); - } + tcp_fill_headers(conn, NULL, ip4h, ip6h, th, &payload, + *check, conn->seq_to_tap, true); + if (ip4h) + *check = &ip4h->check; } /** From b6e79efa0b0c8ab6327f5184f81c5b3ab8af4ff8 Mon Sep 17 00:00:00 2001 From: David Gibson <david@gibson.dropbear.id.au> Date: Wed, 27 Nov 2024 14:54:10 +1100 Subject: [PATCH 151/382] tcp_vu: Remove unnecessary tcp_vu_update_check() function Because the vhost-user <-> virtio-net path ignores checksums, we usually don't calculate them when sending packets to the guest. So, we always pass no_tcp_csum=true to tcp_fill_headers(). We do want accurate checksums when capturing packets though, so the captures don't show bogus values. Currently we handle this by updating the checksum field immediately before writing the packet to the capture file, using tcp_vu_update_check(). This is unnecessary, though: in each case tcp_fill_headers() is called not very long before, so we can alter its no_tcp_csum parameter based on whether we're generating captures or not. 
Signed-off-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- tcp_vu.c | 59 +++++++++++--------------------------------------------- 1 file changed, 11 insertions(+), 48 deletions(-) diff --git a/tcp_vu.c b/tcp_vu.c index db2c64d..5d5c97d 100644 --- a/tcp_vu.c +++ b/tcp_vu.c @@ -61,40 +61,6 @@ static size_t tcp_vu_hdrlen(bool v6) return hdrlen; } -/** - * tcp_vu_update_check() - Calculate TCP checksum - * @tapside: Address information for one side of the flow - * @iov: Pointer to the array of IO vectors - * @iov_cnt: Length of the array - */ -static void tcp_vu_update_check(const struct flowside *tapside, - struct iovec *iov, int iov_cnt) -{ - char *base = iov[0].iov_base; - struct iov_tail payload; - struct tcphdr *th; - uint32_t psum; - - if (inany_v4(&tapside->oaddr)) { - const struct in_addr *src4 = inany_v4(&tapside->oaddr); - const struct in_addr *dst4 = inany_v4(&tapside->eaddr); - const struct iphdr *iph = vu_ip(base); - size_t l4len = ntohs(iph->tot_len) - sizeof(*iph); - - th = vu_payloadv4(base); - psum = proto_ipv4_header_psum(l4len, IPPROTO_TCP, *src4, *dst4); - } else { - const struct ipv6hdr *ip6h = vu_ip(base); - size_t l4len = ntohs(ip6h->payload_len); - - th = vu_payloadv6(base); - psum = proto_ipv6_header_psum(l4len, IPPROTO_TCP, - &ip6h->saddr, &ip6h->daddr); - } - payload = IOV_TAIL(iov, iov_cnt, (char *)(th + 1) - base); - tcp_update_csum(psum, th, &payload); -} - /** * tcp_vu_send_flag() - Send segment with flags to vhost-user (no payload) * @c: Execution context @@ -107,7 +73,6 @@ int tcp_vu_send_flag(const struct ctx *c, struct tcp_tap_conn *conn, int flags) { struct vu_dev *vdev = c->vdev; struct vu_virtq *vq = &vdev->vq[VHOST_USER_RX_QUEUE]; - const struct flowside *tapside = TAPFLOW(conn); size_t optlen, hdrlen; struct vu_virtq_element flags_elem[2]; struct ipv6hdr *ip6h = NULL; @@ -172,10 +137,9 @@ int tcp_vu_send_flag(const struct ctx *c, struct tcp_tap_conn *conn, int flags) payload = 
IOV_TAIL(flags_elem[0].in_sg, 1, hdrlen); tcp_fill_headers(conn, NULL, ip4h, ip6h, th, &payload, - NULL, seq, true); + NULL, seq, !*c->pcap); if (*c->pcap) { - tcp_vu_update_check(tapside, &flags_elem[0].in_sg[0], 1); pcap_iov(&flags_elem[0].in_sg[0], 1, sizeof(struct virtio_net_hdr_mrg_rxbuf)); } @@ -319,15 +283,16 @@ static ssize_t tcp_vu_sock_recv(const struct ctx *c, /** * tcp_vu_prepare() - Prepare the frame header - * @c: Execution context - * @conn: Connection pointer - * @iov: Pointer to the array of IO vectors - * @iov_cnt: Number of entries in @iov - * @check: Checksum, if already known + * @c: Execution context + * @conn: Connection pointer + * @iov: Pointer to the array of IO vectors + * @iov_cnt: Number of entries in @iov + * @check: Checksum, if already known + * @no_tcp_csum: Do not set TCP checksum */ static void tcp_vu_prepare(const struct ctx *c, struct tcp_tap_conn *conn, struct iovec *iov, size_t iov_cnt, - const uint16_t **check) + const uint16_t **check, bool no_tcp_csum) { const struct flowside *toside = TAPFLOW(conn); bool v6 = !(inany_v4(&toside->eaddr) && inany_v4(&toside->oaddr)); @@ -371,7 +336,7 @@ static void tcp_vu_prepare(const struct ctx *c, struct tcp_tap_conn *conn, th->ack = 1; tcp_fill_headers(conn, NULL, ip4h, ip6h, th, &payload, - *check, conn->seq_to_tap, true); + *check, conn->seq_to_tap, no_tcp_csum); if (ip4h) *check = &ip4h->check; } @@ -389,8 +354,7 @@ int tcp_vu_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn) uint32_t wnd_scaled = conn->wnd_from_tap << conn->ws_from_tap; struct vu_dev *vdev = c->vdev; struct vu_virtq *vq = &vdev->vq[VHOST_USER_RX_QUEUE]; - const struct flowside *tapside = TAPFLOW(conn); - size_t fillsize, hdrlen; + size_t hdrlen, fillsize; int v6 = CONN_V6(conn); uint32_t already_sent; const uint16_t *check; @@ -483,10 +447,9 @@ int tcp_vu_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn) if (i + 1 == head_cnt) check = NULL; - tcp_vu_prepare(c, conn, iov, buf_cnt, &check); + 
tcp_vu_prepare(c, conn, iov, buf_cnt, &check, !*c->pcap); if (*c->pcap) { - tcp_vu_update_check(tapside, iov, buf_cnt); pcap_iov(iov, buf_cnt, sizeof(struct virtio_net_hdr_mrg_rxbuf)); } From d9c0f8eefb0015a5a06c7259666c877fff6fbe92 Mon Sep 17 00:00:00 2001 From: Laurent Vivier <lvivier@redhat.com> Date: Wed, 27 Nov 2024 17:16:45 +0100 Subject: [PATCH 152/382] Makefile: Use make internal string functions TARGET_ARCH is computed from '$(CC) -dumpmachine' using external bash commands like echo, cut, tr and sed. This can be done using make internal string functions. Signed-off-by: Laurent Vivier <lvivier@redhat.com> Reviewed-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- Makefile | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index cb74480..1fce737 100644 --- a/Makefile +++ b/Makefile @@ -17,8 +17,9 @@ DUAL_STACK_SOCKETS := 1 TARGET ?= $(shell $(CC) -dumpmachine) # Get 'uname -m'-like architecture description for target -TARGET_ARCH := $(shell echo $(TARGET) | cut -f1 -d- | tr [A-Z] [a-z]) -TARGET_ARCH := $(shell echo $(TARGET_ARCH) | sed 's/powerpc/ppc/') +TARGET_ARCH := $(firstword $(subst -, ,$(TARGET))) +TARGET_ARCH := $(patsubst [:upper:],[:lower:],$(TARGET_ARCH)) +TARGET_ARCH := $(subst powerpc,ppc,$(TARGET_ARCH)) # On some systems enabling optimization also enables source fortification, # automagically. Do not override it. From 020c8b7127e38872e68bffb30ad388001e088552 Mon Sep 17 00:00:00 2001 From: Laurent Vivier <lvivier@redhat.com> Date: Thu, 28 Nov 2024 13:08:41 +0100 Subject: [PATCH 153/382] tcp_vu: Compute IPv4 header checksum if dlen changes In tcp_vu_data_from_sock() we compute IPv4 header checksum only for the first and the last packets, and re-use the first packet checksum for all the other packets as the content of the header doesn't change. 
It's more accurate to check the dlen value to know if the checksum should change as dlen is the only information that can change in the loop. Signed-off-by: Laurent Vivier <lvivier@redhat.com> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- tcp_vu.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/tcp_vu.c b/tcp_vu.c index 5d5c97d..10e17d3 100644 --- a/tcp_vu.c +++ b/tcp_vu.c @@ -354,12 +354,12 @@ int tcp_vu_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn) uint32_t wnd_scaled = conn->wnd_from_tap << conn->ws_from_tap; struct vu_dev *vdev = c->vdev; struct vu_virtq *vq = &vdev->vq[VHOST_USER_RX_QUEUE]; + ssize_t len, previous_dlen; size_t hdrlen, fillsize; int v6 = CONN_V6(conn); uint32_t already_sent; const uint16_t *check; int i, iov_cnt; - ssize_t len; if (!vu_queue_enabled(vq) || !vu_queue_started(vq)) { debug("Got packet, but RX virtqueue not usable yet"); @@ -433,19 +433,17 @@ int tcp_vu_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn) */ hdrlen = tcp_vu_hdrlen(v6); - for (i = 0, check = NULL; i < head_cnt; i++) { + for (i = 0, previous_dlen = -1, check = NULL; i < head_cnt; i++) { struct iovec *iov = &elem[head[i]].in_sg[0]; int buf_cnt = head[i + 1] - head[i]; ssize_t dlen = iov_size(iov, buf_cnt) - hdrlen; vu_set_vnethdr(vdev, iov->iov_base, buf_cnt); - /* we compute IPv4 header checksum only for the - * first and the last, all other checksums are the - * same as the first one - */ - if (i + 1 == head_cnt) + /* The IPv4 header checksum varies only with dlen */ + if (previous_dlen != dlen) check = NULL; + previous_dlen = dlen; tcp_vu_prepare(c, conn, iov, buf_cnt, &check, !*c->pcap); From 966fdc8749048d37a4ffe845388e1ec106eb278d Mon Sep 17 00:00:00 2001 From: Stefano Brivio <sbrivio@redhat.com> Date: Thu, 28 Nov 2024 15:06:44 +0100 Subject: [PATCH 154/382] perf/passt_vu_tcp: Make it shine Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- test/perf/passt_vu_tcp | 30 
+++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/test/perf/passt_vu_tcp b/test/perf/passt_vu_tcp index b434008..c4409b9 100644 --- a/test/perf/passt_vu_tcp +++ b/test/perf/passt_vu_tcp @@ -38,10 +38,10 @@ hout FREQ_PROCFS (echo "scale=1"; sed -n 's/cpu MHz.*: \([0-9]*\)\..*$/(\1+10^2\ hout FREQ_CPUFREQ (echo "scale=1"; printf '( %i + 10^5 / 2 ) / 10^6\n' $(cat /sys/devices/system/cpu/cpu0/cpufreq/scaling_max_freq) ) | bc -l hout FREQ [ -n "__FREQ_CPUFREQ__" ] && echo __FREQ_CPUFREQ__ || echo __FREQ_PROCFS__ -set THREADS 4 -set TIME 5 +set THREADS 6 +set TIME 2 set OMIT 0.1 -set OPTS -Z -P __THREADS__ -l 1M -O__OMIT__ -N +set OPTS -Z -P __THREADS__ -O__OMIT__ -N info Throughput in Gbps, latency in µs, __THREADS__ threads at __FREQ__ GHz report passt_vu tcp __THREADS__ __FREQ__ @@ -55,16 +55,16 @@ iperf3s ns 10002 bw - bw - guest ip link set dev __IFNAME__ mtu 1280 -iperf3 BW guest __MAP_NS6__ 10002 __TIME__ __OPTS__ -w 16M +iperf3 BW guest __MAP_NS6__ 10002 __TIME__ __OPTS__ -w 16M -l 1M bw __BW__ 1.2 1.5 guest ip link set dev __IFNAME__ mtu 1500 -iperf3 BW guest __MAP_NS6__ 10002 __TIME__ __OPTS__ -w 32M +iperf3 BW guest __MAP_NS6__ 10002 __TIME__ __OPTS__ -w 32M -l 1M bw __BW__ 1.6 1.8 guest ip link set dev __IFNAME__ mtu 9000 -iperf3 BW guest __MAP_NS6__ 10002 __TIME__ __OPTS__ -w 64M +iperf3 BW guest __MAP_NS6__ 10002 __TIME__ __OPTS__ -w 64M -l 1M bw __BW__ 4.0 5.0 guest ip link set dev __IFNAME__ mtu 65520 -iperf3 BW guest __MAP_NS6__ 10002 __TIME__ __OPTS__ -w 64M +iperf3 BW guest __MAP_NS6__ 10002 __TIME__ __OPTS__ -w 64M -l 1M bw __BW__ 7.0 8.0 iperf3k ns @@ -93,22 +93,22 @@ tr TCP throughput over IPv4: guest to host iperf3s ns 10002 guest ip link set dev __IFNAME__ mtu 256 -iperf3 BW guest __MAP_NS4__ 10002 __TIME__ __OPTS__ -w 2M +iperf3 BW guest __MAP_NS4__ 10002 __TIME__ __OPTS__ -w 2M -l 1M bw __BW__ 0.2 0.3 guest ip link set dev __IFNAME__ mtu 576 -iperf3 BW guest __MAP_NS4__ 10002 __TIME__ __OPTS__ -w 4M 
+iperf3 BW guest __MAP_NS4__ 10002 __TIME__ __OPTS__ -w 4M -l 1M bw __BW__ 0.5 0.8 guest ip link set dev __IFNAME__ mtu 1280 -iperf3 BW guest __MAP_NS4__ 10002 __TIME__ __OPTS__ -w 8M +iperf3 BW guest __MAP_NS4__ 10002 __TIME__ __OPTS__ -w 8M -l 1M bw __BW__ 1.2 1.5 guest ip link set dev __IFNAME__ mtu 1500 -iperf3 BW guest __MAP_NS4__ 10002 __TIME__ __OPTS__ -w 16M +iperf3 BW guest __MAP_NS4__ 10002 __TIME__ __OPTS__ -w 16M -l 1M bw __BW__ 1.6 1.8 guest ip link set dev __IFNAME__ mtu 9000 -iperf3 BW guest __MAP_NS4__ 10002 __TIME__ __OPTS__ -w 64M +iperf3 BW guest __MAP_NS4__ 10002 __TIME__ __OPTS__ -w 64M -l 1M bw __BW__ 4.0 5.0 guest ip link set dev __IFNAME__ mtu 65520 -iperf3 BW guest __MAP_NS4__ 10002 __TIME__ __OPTS__ -w 64M +iperf3 BW guest __MAP_NS4__ 10002 __TIME__ __OPTS__ -w 64M -l 1M bw __BW__ 7.0 8.0 iperf3k ns @@ -145,7 +145,7 @@ bw - bw - bw - bw - -iperf3 BW ns ::1 10001 __TIME__ __OPTS__ -w 32M +iperf3 BW ns ::1 10001 __TIME__ __OPTS__ -w 256M -l 16k bw __BW__ 6.0 6.8 iperf3k guest @@ -181,7 +181,7 @@ bw - bw - bw - bw - -iperf3 BW ns 127.0.0.1 10001 __TIME__ __OPTS__ -w 32M +iperf3 BW ns 127.0.0.1 10001 __TIME__ __OPTS__ -w 256M -l 16k bw __BW__ 6.0 6.8 iperf3k guest From 1db4f773e87fc77eae2c4965a6bb90fcb56a0ff3 Mon Sep 17 00:00:00 2001 From: David Gibson <david@gibson.dropbear.id.au> Date: Thu, 5 Dec 2024 15:26:01 +1100 Subject: [PATCH 155/382] udp: Improve detail of UDP endpoint sanity checking In udp_flow_new() we reject a flow if the endpoint isn't unicast, or it has a zero endpoint port. Those conditions aren't strictly illegal, but we can't safely handle them at present: * Multicast UDP endpoints are certainly possible, but our current flow tracking only makes sense for simple unicast flows - we'll need different handling if we want to handle multicast flows in future * It's not entirely clear if port 0 is RFC-ishly correct, but for socket interfaces port 0 sometimes has a special meaning such as "pick the port for me, kernel". 
That makes flows on port 0 unsafe to forward in the usual way. For the same reason we also can't safely handle port 0 as our port. In principle that's also true for our address, however in the case of flows initiated from a socket, we may not know our address since the socket could be bound to 0.0.0.0 or ::, so we can only verify that our address is unicast for flows initiated from the tap side. Refine the current check in udp_flow_new() to slightly more detailed checks in udp_flow_from_sock() and udp_flow_from_tap() to make what is and isn't handled clearer. This makes this checking more similar to what we do for TCP connections. Signed-off-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- udp_flow.c | 32 ++++++++++++++++++++++++-------- 1 file changed, 24 insertions(+), 8 deletions(-) diff --git a/udp_flow.c b/udp_flow.c index b81be2c..c8fdb5f 100644 --- a/udp_flow.c +++ b/udp_flow.c @@ -75,16 +75,10 @@ void udp_flow_close(const struct ctx *c, struct udp_flow *uflow) static flow_sidx_t udp_flow_new(const struct ctx *c, union flow *flow, int s_ini, const struct timespec *now) { - const struct flowside *ini = &flow->f.side[INISIDE]; struct udp_flow *uflow = NULL; const struct flowside *tgt; uint8_t tgtpif; - if (!inany_is_unicast(&ini->eaddr) || ini->eport == 0) { - flow_trace(flow, "Invalid endpoint to initiate UDP flow"); - goto cancel; - } - if (!(tgt = flow_target(c, flow, IPPROTO_UDP))) goto cancel; tgtpif = flow->f.pif[TGTSIDE]; @@ -189,6 +183,7 @@ flow_sidx_t udp_flow_from_sock(const struct ctx *c, union epoll_ref ref, const union sockaddr_inany *s_in, const struct timespec *now) { + const struct flowside *ini; struct udp_flow *uflow; union flow *flow; flow_sidx_t sidx; @@ -210,7 +205,19 @@ flow_sidx_t udp_flow_from_sock(const struct ctx *c, union epoll_ref ref, return FLOW_SIDX_NONE; } - flow_initiate_sa(flow, ref.udp.pif, s_in, ref.udp.port); + ini = flow_initiate_sa(flow, ref.udp.pif, s_in, 
ref.udp.port); + + if (!inany_is_unicast(&ini->eaddr) || + ini->eport == 0 || ini->oport == 0) { + /* In principle ini->oddr also must be unicast, but when we've + * been initiated from a socket bound to 0.0.0.0 or ::, we don't + * know our address, so we have to leave it unpopulated. + */ + flow_err(flow, "Invalid endpoint on UDP recvfrom()"); + flow_alloc_cancel(flow); + return FLOW_SIDX_NONE; + } + return udp_flow_new(c, flow, ref.fd, now); } @@ -233,6 +240,7 @@ flow_sidx_t udp_flow_from_tap(const struct ctx *c, in_port_t srcport, in_port_t dstport, const struct timespec *now) { + const struct flowside *ini; struct udp_flow *uflow; union flow *flow; flow_sidx_t sidx; @@ -256,7 +264,15 @@ flow_sidx_t udp_flow_from_tap(const struct ctx *c, return FLOW_SIDX_NONE; } - flow_initiate_af(flow, PIF_TAP, af, saddr, srcport, daddr, dstport); + ini = flow_initiate_af(flow, PIF_TAP, af, saddr, srcport, + daddr, dstport); + + if (!inany_is_unicast(&ini->eaddr) || ini->eport == 0 || + !inany_is_unicast(&ini->oaddr) || ini->oport == 0) { + flow_dbg(flow, "Invalid endpoint on UDP packet"); + flow_alloc_cancel(flow); + return FLOW_SIDX_NONE; + } return udp_flow_new(c, flow, -1, now); } From 190829705e315972a7c674d2fa55d322aa18d26e Mon Sep 17 00:00:00 2001 From: David Gibson <david@gibson.dropbear.id.au> Date: Thu, 5 Dec 2024 15:26:02 +1100 Subject: [PATCH 156/382] flow: Remove over-zealous sanity checks in flow_sidx_hash() In flow_sidx_hash() we verify that the flow we're hashing doesn't have an unspecified endpoint address, or zero for either port. The hash table only works if we're looking for exact matches of address and port, and this is attempting to catch any cases where we might have left address or port unpopulated or filled with a wildcard. 
This doesn't really work though, because there are cases where unspecified addresses or zero ports are correct: * We already use unspecified addresses for our address in cases where we don't know the specific local address for that side, and exclude the obvious extra check on side->oaddr for that reason. * Zero port numbers aren't strictly forbidden over the wire. We forbid them for TCP & UDP because they can't safely be handled on the socket side. However for ICMP a zero id, which goes in the port field is valid. * Possible future flow types (for example, for multicast protocols) might legitimately have an unspecified address. Although it makes them easier to miss, these sorts of sanity checks really have to be done at the protocol / flow type layer, and we already do so. Remove the checks in flow_sidx_hash() other than checking that the pif is specified. Reported-by: Stefan <steffhip@gmail.com> Link: https://bugs.passt.top/show_bug.cgi?id=105 Signed-off-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- flow.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/flow.c b/flow.c index 1ea112b..ee1221b 100644 --- a/flow.c +++ b/flow.c @@ -597,12 +597,7 @@ static uint64_t flow_sidx_hash(const struct ctx *c, flow_sidx_t sidx) const struct flowside *side = &f->side[sidx.sidei]; uint8_t pif = f->pif[sidx.sidei]; - /* For the hash table to work, entries must have complete endpoint - * information, and at least a forwarding port. 
- */ - ASSERT(pif != PIF_NONE && !inany_is_unspecified(&side->eaddr) && - side->eport != 0 && side->oport != 0); - + ASSERT(pif != PIF_NONE); return flow_hash(c, FLOW_PROTO(f), pif, side); } From 8996d183c5c50399d9dbae4d60d77d08f44ffb54 Mon Sep 17 00:00:00 2001 From: Laurent Vivier <lvivier@redhat.com> Date: Thu, 5 Dec 2024 08:37:18 +0100 Subject: [PATCH 157/382] udp_vu: update segment size In udp_vu_sock_recv(), collect a segment with a size defined to IP_MAX_MTU + ETH_HLEN + sizeof(struct virtio_net_hdr_mrg_rxbuf) The original version double counted the IP header: IP_MAX_MTU includes the IP header, and so did hdrlen. Signed-off-by: Laurent Vivier <lvivier@redhat.com> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- udp_vu.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/udp_vu.c b/udp_vu.c index 9c697f3..4123510 100644 --- a/udp_vu.c +++ b/udp_vu.c @@ -104,7 +104,8 @@ static int udp_vu_sock_recv(const struct ctx *c, int s, uint32_t events, vu_init_elem(elem, iov_vu, VIRTQUEUE_MAX_SIZE); iov_cnt = vu_collect(vdev, vq, elem, VIRTQUEUE_MAX_SIZE, - IP_MAX_MTU - sizeof(struct udphdr) + hdrlen, + IP_MAX_MTU + ETH_HLEN + + sizeof(struct virtio_net_hdr_mrg_rxbuf), NULL); if (iov_cnt == 0) return 0; From 2139ad33fc8ab48736d65f3d65dc882f0d612006 Mon Sep 17 00:00:00 2001 From: Laurent Vivier <lvivier@redhat.com> Date: Mon, 9 Dec 2024 17:54:49 +0100 Subject: [PATCH 158/382] tap: Use a common function to start a new connection Merge code from tap_backend_init(), tap_sock_tun_init() and tap_listen_handler() to set epoll_ref entry and to add it to epollfd. 
No functionality change Signed-off-by: Laurent Vivier <lvivier@redhat.com> Reviewed-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- tap.c | 66 +++++++++++++++++++++++++++-------------------------------- 1 file changed, 30 insertions(+), 36 deletions(-) diff --git a/tap.c b/tap.c index c418064..b2d3045 100644 --- a/tap.c +++ b/tap.c @@ -1255,6 +1255,33 @@ static void tap_sock_unix_init(const struct ctx *c) epoll_ctl(c->epollfd, EPOLL_CTL_ADD, c->fd_tap_listen, &ev); } +/** + * tap_start_connection() - start a new connection + * @c: Execution context + */ +static void tap_start_connection(const struct ctx *c) +{ + struct epoll_event ev = { 0 }; + union epoll_ref ref = { 0 }; + + ref.fd = c->fd_tap; + switch (c->mode) { + case MODE_PASST: + ref.type = EPOLL_TYPE_TAP_PASST; + break; + case MODE_PASTA: + ref.type = EPOLL_TYPE_TAP_PASTA; + break; + case MODE_VU: + ref.type = EPOLL_TYPE_VHOST_CMD; + break; + } + + ev.events = EPOLLIN | EPOLLRDHUP; + ev.data.u64 = ref.u64; + epoll_ctl(c->epollfd, EPOLL_CTL_ADD, c->fd_tap, &ev); +} + /** * tap_listen_handler() - Handle new connection on listening socket * @c: Execution context @@ -1262,8 +1289,6 @@ static void tap_sock_unix_init(const struct ctx *c) */ void tap_listen_handler(struct ctx *c, uint32_t events) { - struct epoll_event ev = { 0 }; - union epoll_ref ref = { 0 }; int v = INT_MAX / 2; struct ucred ucred; socklen_t len; @@ -1302,14 +1327,7 @@ void tap_listen_handler(struct ctx *c, uint32_t events) setsockopt(c->fd_tap, SOL_SOCKET, SO_SNDBUF, &v, sizeof(v))) trace("tap: failed to set SO_SNDBUF to %i", v); - ref.fd = c->fd_tap; - if (c->mode == MODE_VU) - ref.type = EPOLL_TYPE_VHOST_CMD; - else - ref.type = EPOLL_TYPE_TAP_PASST; - ev.events = EPOLLIN | EPOLLRDHUP; - ev.data.u64 = ref.u64; - epoll_ctl(c->epollfd, EPOLL_CTL_ADD, c->fd_tap, &ev); + tap_start_connection(c); } /** @@ -1353,19 +1371,13 @@ static int tap_ns_tun(void *arg) */ static void 
tap_sock_tun_init(struct ctx *c) { - union epoll_ref ref = { .type = EPOLL_TYPE_TAP_PASTA }; - struct epoll_event ev = { 0 }; - NS_CALL(tap_ns_tun, c); if (c->fd_tap == -1) die("Failed to set up tap device in namespace"); pasta_ns_conf(c); - ref.fd = c->fd_tap; - ev.events = EPOLLIN | EPOLLRDHUP; - ev.data.u64 = ref.u64; - epoll_ctl(c->epollfd, EPOLL_CTL_ADD, c->fd_tap, &ev); + tap_start_connection(c); } /** @@ -1399,26 +1411,8 @@ void tap_backend_init(struct ctx *c) tap_sock_update_pool(pkt_buf, sizeof(pkt_buf)); if (c->fd_tap != -1) { /* Passed as --fd */ - struct epoll_event ev = { 0 }; - union epoll_ref ref; - ASSERT(c->one_off); - ref.fd = c->fd_tap; - switch (c->mode) { - case MODE_PASST: - ref.type = EPOLL_TYPE_TAP_PASST; - break; - case MODE_PASTA: - ref.type = EPOLL_TYPE_TAP_PASTA; - break; - case MODE_VU: - ref.type = EPOLL_TYPE_VHOST_CMD; - break; - } - - ev.events = EPOLLIN | EPOLLRDHUP; - ev.data.u64 = ref.u64; - epoll_ctl(c->epollfd, EPOLL_CTL_ADD, c->fd_tap, &ev); + tap_start_connection(c); return; } From 947f5cdb93062fd4e56adbab9901bbbb8aa8b5cb Mon Sep 17 00:00:00 2001 From: Laurent Vivier <lvivier@redhat.com> Date: Mon, 9 Dec 2024 17:54:50 +0100 Subject: [PATCH 159/382] tap: Call vu_init() with --fd We need to initialize vhost-user structures with --fd too. 
Signed-off-by: Laurent Vivier <lvivier@redhat.com> Reviewed-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- tap.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tap.c b/tap.c index b2d3045..cd32a90 100644 --- a/tap.c +++ b/tap.c @@ -1405,10 +1405,12 @@ void tap_sock_update_pool(void *base, size_t size) */ void tap_backend_init(struct ctx *c) { - if (c->mode == MODE_VU) + if (c->mode == MODE_VU) { tap_sock_update_pool(NULL, 0); - else + vu_init(c); + } else { tap_sock_update_pool(pkt_buf, sizeof(pkt_buf)); + } if (c->fd_tap != -1) { /* Passed as --fd */ ASSERT(c->one_off); @@ -1421,8 +1423,6 @@ void tap_backend_init(struct ctx *c) tap_sock_tun_init(c); break; case MODE_VU: - vu_init(c); - /* fall through */ case MODE_PASST: tap_sock_unix_init(c); From e24f0262229a1f9c673dca3452ad103cbe06b866 Mon Sep 17 00:00:00 2001 From: Jon Maloy <jmaloy@redhat.com> Date: Tue, 10 Dec 2024 13:36:45 -0500 Subject: [PATCH 160/382] pasta: make it possible to disable socket splicing During testing it is sometimes useful to force traffic which would normally be forwarded by socket splicing through the tap interface. In this commit, we add a command switch enabling such functionality for inbound local traffic. For outbound local traffic this is much trickier, if even possible, so leave that for a later commit. 
Suggested-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Jon Maloy <jmaloy@redhat.com> Reviewed-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- conf.c | 7 ++++++- fwd.c | 2 +- passt.1 | 5 +++++ passt.h | 2 ++ 4 files changed, 14 insertions(+), 2 deletions(-) diff --git a/conf.c b/conf.c index eaa7d99..97d8beb 100644 --- a/conf.c +++ b/conf.c @@ -977,7 +977,8 @@ pasta_opts: " Don't copy all routes to namespace\n" " --no-copy-addrs DEPRECATED:\n" " Don't copy all addresses to namespace\n" - " --ns-mac-addr ADDR Set MAC address on tap interface\n"); + " --ns-mac-addr ADDR Set MAC address on tap interface\n" + " --no-splice Disable inbound socket splicing\n"); exit(status); } @@ -1319,6 +1320,7 @@ void conf(struct ctx *c, int argc, char **argv) {"no-dhcpv6", no_argument, &c->no_dhcpv6, 1 }, {"no-ndp", no_argument, &c->no_ndp, 1 }, {"no-ra", no_argument, &c->no_ra, 1 }, + {"no-splice", no_argument, &c->no_splice, 1 }, {"freebind", no_argument, &c->freebind, 1 }, {"no-map-gw", no_argument, &no_map_gw, 1 }, {"ipv4-only", no_argument, NULL, '4' }, @@ -1756,6 +1758,9 @@ void conf(struct ctx *c, int argc, char **argv) } } while (name != -1); + if (c->mode != MODE_PASTA) + c->no_splice = 1; + if (c->mode == MODE_PASTA && !c->pasta_conf_ns) { if (copy_routes_opt) die("--no-copy-routes needs --config-net"); diff --git a/fwd.c b/fwd.c index 0b7f8b1..2829cd2 100644 --- a/fwd.c +++ b/fwd.c @@ -443,7 +443,7 @@ uint8_t fwd_nat_from_host(const struct ctx *c, uint8_t proto, else if (proto == IPPROTO_UDP) tgt->eport += c->udp.fwd_in.delta[tgt->eport]; - if (c->mode == MODE_PASTA && inany_is_loopback(&ini->eaddr) && + if (!c->no_splice && inany_is_loopback(&ini->eaddr) && (proto == IPPROTO_TCP || proto == IPPROTO_UDP)) { /* spliceable */ diff --git a/passt.1 b/passt.1 index b2896a2..d9cd33e 100644 --- a/passt.1 +++ b/passt.1 @@ -695,6 +695,11 @@ Configure MAC address \fIaddr\fR on the tap interface in the namespace. 
Default is to let the tap driver build a pseudorandom hardware address. +.TP +.BR \-\-no-splice +Disable the bypass path for inbound, local traffic. See the section \fBHandling +of local traffic in pasta\fR in the \fBNOTES\fR for more details. + .SH EXAMPLES .SS \fBpasta diff --git a/passt.h b/passt.h index c038630..0dd4efa 100644 --- a/passt.h +++ b/passt.h @@ -229,6 +229,7 @@ struct ip6_ctx { * @no_dhcpv6: Disable DHCPv6 server * @no_ndp: Disable NDP handler altogether * @no_ra: Disable router advertisements + * @no_splice: Disable socket splicing for inbound traffic * @host_lo_to_ns_lo: Map host loopback addresses to ns loopback addresses * @freebind: Allow binding of non-local addresses for forwarding * @low_wmem: Low probed net.core.wmem_max @@ -291,6 +292,7 @@ struct ctx { int no_dhcpv6; int no_ndp; int no_ra; + int no_splice; int host_lo_to_ns_lo; int freebind; From 09478d55fe1a21f8c55902399df84d13867e71be Mon Sep 17 00:00:00 2001 From: Stefano Brivio <sbrivio@redhat.com> Date: Wed, 11 Dec 2024 00:13:39 +0100 Subject: [PATCH 161/382] treewide: Dodge dynamic memory allocation in strerror() from glibc > 2.40 With glibc commit 25a5eb4010df ("string: strerror, strsignal cannot use buffer after dlmopen (bug 32026)"), strerror() now needs, at least on x86, the getrandom() and brk() system calls, in order to fill in the locale-translated error message. But getrandom() and brk() are not allowed by our seccomp profiles. This became visible on Fedora Rawhide with the "podman login and logout" Podman tests, defined at test/e2e/login_logout_test.go in the Podman source tree, where pasta would terminate upon printing error descriptions (at least the ones related to the SO_ERROR queue for spliced connections). Avoid dynamic memory allocation by calling strerrordesc_np() instead, which is a GNU function returning a static, untranslated version of the error description. 
If it's not available, keep calling strerror(), which at that point should be simple enough as to be usable (at least, that's currently the case for musl). Reported-by: Paul Holzinger <pholzing@redhat.com> Link: https://github.com/containers/podman/issues/24804 Analysed-by: Paul Holzinger <pholzing@redhat.com> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> Reviewed-by: David Gibson <david@gibson.dropbear.id.au> Tested-by: Paul Holzinger <pholzing@redhat.com> --- conf.c | 10 +++++----- icmp.c | 4 ++-- log.c | 2 +- netlink.c | 2 +- pasta.c | 22 +++++++++++----------- tcp.c | 22 +++++++++++----------- tcp_splice.c | 16 ++++++++-------- udp.c | 4 ++-- udp_flow.c | 8 ++++---- util.c | 6 +++--- util.h | 32 ++++++++++++++++++++++++++++++++ 11 files changed, 80 insertions(+), 48 deletions(-) diff --git a/conf.c b/conf.c index 97d8beb..df2b016 100644 --- a/conf.c +++ b/conf.c @@ -365,7 +365,7 @@ mode_conflict: die("Port forwarding mode '%s' conflicts with previous mode", optarg); bind_fail: die("Failed to bind port %u (%s) for option '-%c %s', exiting", - i, strerror(-ret), optname, optarg); + i, strerror_(-ret), optname, optarg); bind_all_fail: die("Failed to bind any port for '-%c %s', exiting", optname, optarg); } @@ -655,7 +655,7 @@ static unsigned int conf_ip4(unsigned int ifi, struct ip4_ctx *ip4) &ip4->guest_gw); if (rc < 0) { debug("Couldn't discover IPv4 gateway address: %s", - strerror(-rc)); + strerror_(-rc)); return 0; } } @@ -665,7 +665,7 @@ static unsigned int conf_ip4(unsigned int ifi, struct ip4_ctx *ip4) &ip4->addr, &ip4->prefix_len, NULL); if (rc < 0) { debug("Couldn't discover IPv4 address: %s", - strerror(-rc)); + strerror_(-rc)); return 0; } } @@ -729,7 +729,7 @@ static unsigned int conf_ip6(unsigned int ifi, struct ip6_ctx *ip6) rc = nl_route_get_def(nl_sock, ifi, AF_INET6, &ip6->guest_gw); if (rc < 0) { debug("Couldn't discover IPv6 gateway address: %s", - strerror(-rc)); + strerror_(-rc)); return 0; } } @@ -738,7 +738,7 @@ static unsigned int 
conf_ip6(unsigned int ifi, struct ip6_ctx *ip6) IN6_IS_ADDR_UNSPECIFIED(&ip6->addr) ? &ip6->addr : NULL, &prefix_len, &ip6->our_tap_ll); if (rc < 0) { - debug("Couldn't discover IPv6 address: %s", strerror(-rc)); + debug("Couldn't discover IPv6 address: %s", strerror_(-rc)); return 0; } diff --git a/icmp.c b/icmp.c index f514dbc..143e93b 100644 --- a/icmp.c +++ b/icmp.c @@ -85,7 +85,7 @@ void icmp_sock_handler(const struct ctx *c, union epoll_ref ref) n = recvfrom(ref.fd, buf, sizeof(buf), 0, &sr.sa, &sl); if (n < 0) { - flow_err(pingf, "recvfrom() error: %s", strerror(errno)); + flow_err(pingf, "recvfrom() error: %s", strerror_(errno)); return; } @@ -301,7 +301,7 @@ int icmp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af, pif_sockaddr(c, &sa, &sl, PIF_HOST, &tgt->eaddr, 0); if (sendto(pingf->sock, pkt, l4len, MSG_NOSIGNAL, &sa.sa, sl) < 0) { flow_dbg(pingf, "failed to relay request to socket: %s", - strerror(errno)); + strerror_(errno)); } else { flow_dbg(pingf, "echo request to socket, ID: %"PRIu16", seq: %"PRIu16, diff --git a/log.c b/log.c index 239c8ce..95e4576 100644 --- a/log.c +++ b/log.c @@ -322,7 +322,7 @@ void logmsg_perror(int pri, const char *format, ...) 
vlogmsg(false, false, pri, format, ap); va_end(ap); - logmsg(true, true, pri, ": %s", strerror(errno_copy)); + logmsg(true, true, pri, ": %s", strerror_(errno_copy)); } /** diff --git a/netlink.c b/netlink.c index 4aba2a3..0407692 100644 --- a/netlink.c +++ b/netlink.c @@ -320,7 +320,7 @@ unsigned int nl_get_ext_if(int s, sa_family_t af) } if (status < 0) - warn("netlink: RTM_GETROUTE failed: %s", strerror(-status)); + warn("netlink: RTM_GETROUTE failed: %s", strerror_(-status)); if (defifi) { if (ndef > 1) { diff --git a/pasta.c b/pasta.c index 96dacc3..ff41c95 100644 --- a/pasta.c +++ b/pasta.c @@ -296,7 +296,7 @@ void pasta_ns_conf(struct ctx *c) rc = nl_link_set_flags(nl_sock_ns, 1 /* lo */, IFF_UP, IFF_UP); if (rc < 0) die("Couldn't bring up loopback interface in namespace: %s", - strerror(-rc)); + strerror_(-rc)); /* Get or set MAC in target namespace */ if (MAC_IS_ZERO(c->guest_mac)) @@ -305,7 +305,7 @@ void pasta_ns_conf(struct ctx *c) rc = nl_link_set_mac(nl_sock_ns, c->pasta_ifi, c->guest_mac); if (rc < 0) die("Couldn't set MAC address in namespace: %s", - strerror(-rc)); + strerror_(-rc)); if (c->pasta_conf_ns) { unsigned int flags = IFF_UP; @@ -332,7 +332,7 @@ void pasta_ns_conf(struct ctx *c) if (rc < 0) { die("Couldn't set IPv4 address(es) in namespace: %s", - strerror(-rc)); + strerror_(-rc)); } if (c->ip4.no_copy_routes) { @@ -346,7 +346,7 @@ void pasta_ns_conf(struct ctx *c) if (rc < 0) { die("Couldn't set IPv4 route(s) in guest: %s", - strerror(-rc)); + strerror_(-rc)); } } @@ -355,13 +355,13 @@ void pasta_ns_conf(struct ctx *c) &c->ip6.addr_ll_seen); if (rc < 0) { warn("Can't get LL address from namespace: %s", - strerror(-rc)); + strerror_(-rc)); } rc = nl_addr_set_ll_nodad(nl_sock_ns, c->pasta_ifi); if (rc < 0) { warn("Can't set nodad for LL in namespace: %s", - strerror(-rc)); + strerror_(-rc)); } /* We dodged DAD: re-enable neighbour solicitations */ @@ -382,7 +382,7 @@ void pasta_ns_conf(struct ctx *c) if (rc < 0) { die("Couldn't set IPv6 
address(es) in namespace: %s", - strerror(-rc)); + strerror_(-rc)); } if (c->ip6.no_copy_routes) { @@ -397,7 +397,7 @@ void pasta_ns_conf(struct ctx *c) if (rc < 0) { die("Couldn't set IPv6 route(s) in guest: %s", - strerror(-rc)); + strerror_(-rc)); } } } @@ -446,18 +446,18 @@ void pasta_netns_quit_init(const struct ctx *c) return; if ((dir_fd = open(c->netns_dir, O_CLOEXEC | O_RDONLY)) < 0) - die("netns dir open: %s, exiting", strerror(errno)); + die("netns dir open: %s, exiting", strerror_(errno)); if (fstatfs(dir_fd, &s) || s.f_type == DEVPTS_SUPER_MAGIC || s.f_type == PROC_SUPER_MAGIC || s.f_type == SYSFS_MAGIC) try_inotify = false; if (try_inotify && (fd = inotify_init1(flags)) < 0) - warn("inotify_init1(): %s, use a timer", strerror(errno)); + warn("inotify_init1(): %s, use a timer", strerror_(errno)); if (fd >= 0 && inotify_add_watch(fd, c->netns_dir, IN_DELETE) < 0) { warn("inotify_add_watch(): %s, use a timer", - strerror(errno)); + strerror_(errno)); close(fd); fd = -1; } diff --git a/tcp.c b/tcp.c index 1872ccb..ec433f7 100644 --- a/tcp.c +++ b/tcp.c @@ -516,7 +516,7 @@ static void tcp_timer_ctl(const struct ctx *c, struct tcp_tap_conn *conn) fd = timerfd_create(CLOCK_MONOTONIC, 0); if (fd == -1 || fd > FD_REF_MAX) { flow_dbg(conn, "failed to get timer: %s", - strerror(errno)); + strerror_(errno)); if (fd > -1) close(fd); conn->timer = -1; @@ -526,7 +526,7 @@ static void tcp_timer_ctl(const struct ctx *c, struct tcp_tap_conn *conn) if (epoll_ctl(c->epollfd, EPOLL_CTL_ADD, conn->timer, &ev)) { flow_dbg(conn, "failed to add timer: %s", - strerror(errno)); + strerror_(errno)); close(conn->timer); conn->timer = -1; return; @@ -551,7 +551,7 @@ static void tcp_timer_ctl(const struct ctx *c, struct tcp_tap_conn *conn) (unsigned long long)it.it_value.tv_nsec / 1000 / 1000); if (timerfd_settime(conn->timer, 0, &it, NULL)) - flow_err(conn, "failed to set timer: %s", strerror(errno)); + flow_err(conn, "failed to set timer: %s", strerror_(errno)); } /** @@ -1307,7 
+1307,7 @@ int tcp_conn_sock(const struct ctx *c, sa_family_t af) return s; err("TCP: Unable to open socket for new connection: %s", - strerror(-s)); + strerror_(-s)); return -1; } @@ -1360,7 +1360,7 @@ static void tcp_bind_outbound(const struct ctx *c, flow_dbg(conn, "Can't bind TCP outbound socket to %s:%hu: %s", inany_ntop(&tgt->oaddr, sstr, sizeof(sstr)), - tgt->oport, strerror(errno)); + tgt->oport, strerror_(errno)); } } @@ -1371,7 +1371,7 @@ static void tcp_bind_outbound(const struct ctx *c, strlen(c->ip4.ifname_out))) { flow_dbg(conn, "Can't bind IPv4 TCP socket to" " interface %s: %s", c->ip4.ifname_out, - strerror(errno)); + strerror_(errno)); } } } else if (bind_sa.sa_family == AF_INET6) { @@ -1381,7 +1381,7 @@ static void tcp_bind_outbound(const struct ctx *c, strlen(c->ip6.ifname_out))) { flow_dbg(conn, "Can't bind IPv6 TCP socket to" " interface %s: %s", c->ip6.ifname_out, - strerror(errno)); + strerror_(errno)); } } } @@ -2113,7 +2113,7 @@ void tcp_timer_handler(const struct ctx *c, union epoll_ref ref) * and we just set the timer to a new point in the future: discard it. 
*/ if (timerfd_gettime(conn->timer, &check_armed)) - flow_err(conn, "failed to read timer: %s", strerror(errno)); + flow_err(conn, "failed to read timer: %s", strerror_(errno)); if (check_armed.it_value.tv_sec || check_armed.it_value.tv_nsec) return; @@ -2154,7 +2154,7 @@ void tcp_timer_handler(const struct ctx *c, union epoll_ref ref) */ if (timerfd_settime(conn->timer, 0, &new, &old)) flow_err(conn, "failed to set timer: %s", - strerror(errno)); + strerror_(errno)); if (old.it_value.tv_sec == ACT_TIMEOUT) { flow_dbg(conn, "activity timeout"); @@ -2422,13 +2422,13 @@ static void tcp_sock_refill_init(const struct ctx *c) int rc = tcp_sock_refill_pool(c, init_sock_pool4, AF_INET); if (rc < 0) warn("TCP: Error refilling IPv4 host socket pool: %s", - strerror(-rc)); + strerror_(-rc)); } if (c->ifi6) { int rc = tcp_sock_refill_pool(c, init_sock_pool6, AF_INET6); if (rc < 0) warn("TCP: Error refilling IPv6 host socket pool: %s", - strerror(-rc)); + strerror_(-rc)); } } diff --git a/tcp_splice.c b/tcp_splice.c index 93f8bce..3a0f868 100644 --- a/tcp_splice.c +++ b/tcp_splice.c @@ -160,7 +160,7 @@ static int tcp_splice_epoll_ctl(const struct ctx *c, if (epoll_ctl(c->epollfd, m, conn->s[0], &ev[0]) || epoll_ctl(c->epollfd, m, conn->s[1], &ev[1])) { int ret = -errno; - flow_err(conn, "ERROR on epoll_ctl(): %s", strerror(errno)); + flow_err(conn, "ERROR on epoll_ctl(): %s", strerror_(errno)); return ret; } @@ -314,7 +314,7 @@ static int tcp_splice_connect_finish(const struct ctx *c, if (conn->pipe[sidei][0] < 0) { if (pipe2(conn->pipe[sidei], O_NONBLOCK | O_CLOEXEC)) { flow_err(conn, "cannot create %d->%d pipe: %s", - sidei, !sidei, strerror(errno)); + sidei, !sidei, strerror_(errno)); conn_flag(c, conn, CLOSING); return -EIO; } @@ -370,7 +370,7 @@ static int tcp_splice_connect(const struct ctx *c, struct tcp_splice_conn *conn) if (connect(conn->s[1], &sa.sa, sl)) { if (errno != EINPROGRESS) { flow_trace(conn, "Couldn't connect socket for splice: %s", - strerror(errno)); + 
strerror_(errno)); return -errno; } @@ -469,10 +469,10 @@ void tcp_splice_sock_handler(struct ctx *c, union epoll_ref ref, rc = getsockopt(ref.fd, SOL_SOCKET, SO_ERROR, &err, &sl); if (rc) flow_err(conn, "Error retrieving SO_ERROR: %s", - strerror(errno)); + strerror_(errno)); else flow_trace(conn, "Error event on socket: %s", - strerror(err)); + strerror_(err)); goto close; } @@ -551,7 +551,7 @@ eintr: &lowat, sizeof(lowat))) { flow_trace(conn, "Setting SO_RCVLOWAT %i: %s", - lowat, strerror(errno)); + lowat, strerror_(errno)); } else { conn_flag(c, conn, lowat_set_flag); conn_flag(c, conn, lowat_act_flag); @@ -696,13 +696,13 @@ static int tcp_sock_refill_ns(void *arg) int rc = tcp_sock_refill_pool(c, ns_sock_pool4, AF_INET); if (rc < 0) warn("TCP: Error refilling IPv4 ns socket pool: %s", - strerror(-rc)); + strerror_(-rc)); } if (c->ifi6) { int rc = tcp_sock_refill_pool(c, ns_sock_pool6, AF_INET6); if (rc < 0) warn("TCP: Error refilling IPv6 ns socket pool: %s", - strerror(-rc)); + strerror_(-rc)); } return 0; diff --git a/udp.c b/udp.c index c89f031..923cc38 100644 --- a/udp.c +++ b/udp.c @@ -453,7 +453,7 @@ static int udp_sock_recverr(int s) /* TODO: When possible propagate and otherwise handle errors */ debug("%s error on UDP socket %i: %s", - str_ee_origin(ee), s, strerror(ee->ee_errno)); + str_ee_origin(ee), s, strerror_(ee->ee_errno)); return 1; } @@ -492,7 +492,7 @@ int udp_sock_errs(const struct ctx *c, int s, uint32_t events) } if (err) { - debug("Unqueued error on UDP socket %i: %s", s, strerror(err)); + debug("Unqueued error on UDP socket %i: %s", s, strerror_(err)); n_err++; } diff --git a/udp_flow.c b/udp_flow.c index c8fdb5f..343caae 100644 --- a/udp_flow.c +++ b/udp_flow.c @@ -95,7 +95,7 @@ static flow_sidx_t udp_flow_new(const struct ctx *c, union flow *flow, if (uflow->s[INISIDE] < 0) { flow_err(uflow, "Couldn't duplicate listening socket: %s", - strerror(errno)); + strerror_(errno)); goto cancel; } } @@ -115,14 +115,14 @@ static flow_sidx_t 
udp_flow_new(const struct ctx *c, union flow *flow, if (uflow->s[TGTSIDE] < 0) { flow_dbg(uflow, "Couldn't open socket for spliced flow: %s", - strerror(errno)); + strerror_(errno)); goto cancel; } if (flowside_connect(c, uflow->s[TGTSIDE], tgtpif, tgt) < 0) { flow_dbg(uflow, "Couldn't connect flow socket: %s", - strerror(errno)); + strerror_(errno)); goto cancel; } @@ -144,7 +144,7 @@ static flow_sidx_t udp_flow_new(const struct ctx *c, union flow *flow, } else if (errno != EAGAIN) { flow_err(uflow, "Unexpected error discarding datagrams: %s", - strerror(errno)); + strerror_(errno)); } } diff --git a/util.c b/util.c index 55cae3f..11973c4 100644 --- a/util.c +++ b/util.c @@ -90,7 +90,7 @@ int sock_l4_sa(const struct ctx *c, enum epoll_type type, ret = -errno; if (fd < 0) { - warn("L4 socket: %s", strerror(-ret)); + warn("L4 socket: %s", strerror_(-ret)); return ret; } @@ -162,7 +162,7 @@ int sock_l4_sa(const struct ctx *c, enum epoll_type type, if (type == EPOLL_TYPE_TCP_LISTEN && listen(fd, 128) < 0) { ret = -errno; - warn("TCP socket listen: %s", strerror(-ret)); + warn("TCP socket listen: %s", strerror_(-ret)); close(fd); return ret; } @@ -171,7 +171,7 @@ int sock_l4_sa(const struct ctx *c, enum epoll_type type, ev.data.u64 = ref.u64; if (epoll_ctl(c->epollfd, EPOLL_CTL_ADD, fd, &ev) == -1) { ret = -errno; - warn("L4 epoll_ctl: %s", strerror(-ret)); + warn("L4 epoll_ctl: %s", strerror_(-ret)); return ret; } diff --git a/util.h b/util.h index 41bbd60..3fa1d12 100644 --- a/util.h +++ b/util.h @@ -274,6 +274,38 @@ static inline bool mod_between(unsigned x, unsigned i, unsigned j, unsigned m) void raw_random(void *buf, size_t buflen); +/* + * Starting from glibc 2.40.9000 and commit 25a5eb4010df ("string: strerror, + * strsignal cannot use buffer after dlmopen (bug 32026)"), strerror() needs + * getrandom(2) and brk(2) as it allocates memory for the locale-translated + * error description, but our seccomp profiles forbid both. 
+ * + * Use the strerror_() wrapper instead, calling into strerrordesc_np() to get + * a static untranslated string. It's a GNU implementation, but also defined by + * bionic. + * + * If strerrordesc_np() is not defined (e.g. musl), call strerror(). C libraries + * not defining strerrordesc_np() are expected to provide strerror() + * implementations that are simple enough for us to call. + */ +__attribute__ ((weak)) const char *strerrordesc_np(int errnum); + +/** + * strerror_() - strerror() wrapper calling strerrordesc_np() if available + * @errnum: Error code + * + * Return: error description string + */ +static inline const char *strerror_(int errnum) +{ + if (strerrordesc_np) + return strerrordesc_np(errnum); + + return strerror(errnum); +} + +#define strerror(x) @ "Don't call strerror() directly, use strerror_() instead" + /* * Workarounds for https://github.com/llvm/llvm-project/issues/58992 * From e5ba8adef71ec53e192373ed1267dc338719dda0 Mon Sep 17 00:00:00 2001 From: Stefano Brivio <sbrivio@redhat.com> Date: Thu, 12 Dec 2024 10:50:48 +0100 Subject: [PATCH 162/382] README: Mark vhost-user as supported Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 752e59f..54fed07 100644 --- a/README.md +++ b/README.md @@ -321,7 +321,7 @@ speeding up local connections, and usually requiring NAT. 
_pasta_: protocol * ✅ 4 to 50 times IPv4 TCP throughput of existing, conceptually similar solutions depending on MTU (UDP and IPv6 hard to compare) -* 🛠 [_vhost-user_ support](https://bugs.passt.top/show_bug.cgi?id=25) for +* ✅ [_vhost-user_ support](https://bugs.passt.top/show_bug.cgi?id=25) for maximum one copy on every data path and lower request-response latency * ⌚ [multithreading](https://bugs.passt.top/show_bug.cgi?id=13) * ⌚ [raw IP socket support](https://bugs.passt.top/show_bug.cgi?id=14) if From 2385b69a66807e32dca5ae17ab64686888e4c682 Mon Sep 17 00:00:00 2001 From: Stefano Brivio <sbrivio@redhat.com> Date: Thu, 19 Dec 2024 17:27:44 +0100 Subject: [PATCH 163/382] Makefile: Report error and stop if we can't set TARGET I don't think it's necessarily productive to check all the possible error conditions in the Makefile, but this one is annoying: issue 'make' without a C compiler, then install one, and build again. Then run passt and it will mysteriously terminate on epoll_wait(), because seccomp.h is good enough to build against, but the resulting seccomp filter doesn't allow any system call. Not really fun to debug. 
Signed-off-by: Stefano Brivio <sbrivio@redhat.com> Reviewed-by: David Gibson <david@gibson.dropbear.id.au> --- Makefile | 1 + 1 file changed, 1 insertion(+) diff --git a/Makefile b/Makefile index 1fce737..464eef1 100644 --- a/Makefile +++ b/Makefile @@ -16,6 +16,7 @@ VERSION ?= $(shell git describe --tags HEAD 2>/dev/null || echo "unknown\ versio DUAL_STACK_SOCKETS := 1 TARGET ?= $(shell $(CC) -dumpmachine) +$(if $(TARGET),,$(error Failed to get target architecture)) # Get 'uname -m'-like architecture description for target TARGET_ARCH := $(firstword $(subst -, ,$(TARGET))) TARGET_ARCH := $(patsubst [:upper:],[:lower:],$(TARGET_ARCH)) From 324233bd9b8baa3ec13a7425ea3ec7145e3ce645 Mon Sep 17 00:00:00 2001 From: Stefano Brivio <sbrivio@redhat.com> Date: Fri, 20 Dec 2024 12:40:29 +0100 Subject: [PATCH 164/382] udp_flow: Don't block multicast and broadcast messages It was reported that SSDP notifications sent from a container (with e.g. minidlna) stopped appearing on the network starting from commit 1db4f773e87f ("udp: Improve detail of UDP endpoint sanity checking"). As a minimal reproducer using minidlnad(8): $ mkdir /tmp/minidlna $ cat conf media_dir=/tmp/minidlna db_dir=/tmp/minidlna $ ./pasta -d --config-net -- sh -c '/usr/sbin/minidlnad -p 31337 -S -f conf -P /dev/null & (sleep 1; killall minidlnad)' [...] 1.0327: Flow 0 (NEW): FREE -> NEW 1.0327: Flow 0 (INI): NEW -> INI 1.0327: Flow 0 (INI): TAP [88.198.0.164]:54185 -> [239.255.255.250]:1900 => ? 1.0327: Flow 0 (INI): Invalid endpoint on UDP packet 1.0327: Flow 0 (FREE): INI -> FREE 1.0328: Flow 0 (FREE): TAP [88.198.0.164]:54185 -> [239.255.255.250]:1900 => ? 1.0328: Dropping datagram with no flow TAP 88.198.0.164:54185 -> 239.255.255.250:1900 This is an actual regression as there's no particular reason to block outbound multicast UDP packets. 
And even if we don't handle multicast groups in any particular way (https://bugs.passt.top/show_bug.cgi?id=2, "Add IGMP/MLD proxy"), there's no reason to block inbound multicast or broadcast packets either, should they ever be somehow delivered to passt or pasta. Let multicast and broadcast packets through, refusing only to establish flows with unspecified endpoint, as those would actually cause havoc in the flow table. IP-wise, SSDP notifications look like this (after this patch), inside and outside: $ pasta -p /tmp/minidlna.pcap --config-net -- sh -c '/usr/sbin/minidlnad -p 31337 -S -f minidlna.conf -P /dev/null & (sleep 1; killall minidlnad)' [...] $ tshark -a packets:1 -r /tmp/minidlna.pcap ssdp 2 0.074808 88.198.0.164 ? 239.255.255.250 SSDP 200 NOTIFY * HTTP/1.1 # tshark -i ens3 -a packets:1 multicast 2>/dev/null 1 0.000000000 88.198.0.164 ? 239.255.255.250 SSDP 200 NOTIFY * HTTP/1.1 Link: https://github.com/containers/podman/issues/24871 Fixes: 1db4f773e87f ("udp: Improve detail of UDP endpoint sanity checking") Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- udp_flow.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/udp_flow.c b/udp_flow.c index 343caae..9fd7d06 100644 --- a/udp_flow.c +++ b/udp_flow.c @@ -209,7 +209,7 @@ flow_sidx_t udp_flow_from_sock(const struct ctx *c, union epoll_ref ref, if (!inany_is_unicast(&ini->eaddr) || ini->eport == 0 || ini->oport == 0) { - /* In principle ini->oddr also must be unicast, but when we've + /* In principle ini->oddr also must be specified, but when we've * been initiated from a socket bound to 0.0.0.0 or ::, we don't * know our address, so we have to leave it unpopulated. 
*/ @@ -267,8 +267,8 @@ flow_sidx_t udp_flow_from_tap(const struct ctx *c, ini = flow_initiate_af(flow, PIF_TAP, af, saddr, srcport, daddr, dstport); - if (!inany_is_unicast(&ini->eaddr) || ini->eport == 0 || - !inany_is_unicast(&ini->oaddr) || ini->oport == 0) { + if (inany_is_unspecified(&ini->eaddr) || ini->eport == 0 || + inany_is_unspecified(&ini->oaddr) || ini->oport == 0) { flow_dbg(flow, "Invalid endpoint on UDP packet"); flow_alloc_cancel(flow); return FLOW_SIDX_NONE; From 898e853635a79e33917bb4646ff1fb5fc3a92997 Mon Sep 17 00:00:00 2001 From: Laurent Vivier <lvivier@redhat.com> Date: Thu, 19 Dec 2024 12:13:52 +0100 Subject: [PATCH 165/382] virtio: Use const pointer for vu_dev We don't modify the structure in some virtio functions. Signed-off-by: Laurent Vivier <lvivier@redhat.com> Reviewed-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- virtio.c | 14 +++++++++----- virtio.h | 2 +- vu_common.c | 2 +- vu_common.h | 2 +- 4 files changed, 12 insertions(+), 8 deletions(-) diff --git a/virtio.c b/virtio.c index a76de5e..625bac3 100644 --- a/virtio.c +++ b/virtio.c @@ -92,7 +92,8 @@ * * Return: virtual address in our address space of the guest physical address */ -static void *vu_gpa_to_va(struct vu_dev *dev, uint64_t *plen, uint64_t guest_addr) +static void *vu_gpa_to_va(const struct vu_dev *dev, uint64_t *plen, + uint64_t guest_addr) { unsigned int i; @@ -210,7 +211,8 @@ static void virtqueue_get_head(const struct vu_virtq *vq, * * Return: -1 if there is an error, 0 otherwise */ -static int virtqueue_read_indirect_desc(struct vu_dev *dev, struct vring_desc *desc, +static int virtqueue_read_indirect_desc(const struct vu_dev *dev, + struct vring_desc *desc, uint64_t addr, size_t len) { uint64_t read_len; @@ -390,7 +392,7 @@ static inline void vring_set_avail_event(const struct vu_virtq *vq, * * Return: false on error, true otherwise */ -static bool virtqueue_map_desc(struct vu_dev *dev, +static bool 
virtqueue_map_desc(const struct vu_dev *dev, unsigned int *p_num_sg, struct iovec *iov, unsigned int max_num_sg, uint64_t pa, size_t sz) @@ -426,7 +428,8 @@ static bool virtqueue_map_desc(struct vu_dev *dev, * * Return: -1 if there is an error, 0 otherwise */ -static int vu_queue_map_desc(struct vu_dev *dev, struct vu_virtq *vq, unsigned int idx, +static int vu_queue_map_desc(const struct vu_dev *dev, + struct vu_virtq *vq, unsigned int idx, struct vu_virtq_element *elem) { const struct vring_desc *desc = vq->vring.desc; @@ -504,7 +507,8 @@ static int vu_queue_map_desc(struct vu_dev *dev, struct vu_virtq *vq, unsigned i * * Return: -1 if there is an error, 0 otherwise */ -int vu_queue_pop(struct vu_dev *dev, struct vu_virtq *vq, struct vu_virtq_element *elem) +int vu_queue_pop(const struct vu_dev *dev, struct vu_virtq *vq, + struct vu_virtq_element *elem) { unsigned int head; int ret; diff --git a/virtio.h b/virtio.h index 6410d60..0af259d 100644 --- a/virtio.h +++ b/virtio.h @@ -170,7 +170,7 @@ static inline bool vu_has_protocol_feature(const struct vu_dev *vdev, bool vu_queue_empty(struct vu_virtq *vq); void vu_queue_notify(const struct vu_dev *dev, struct vu_virtq *vq); -int vu_queue_pop(struct vu_dev *dev, struct vu_virtq *vq, +int vu_queue_pop(const struct vu_dev *dev, struct vu_virtq *vq, struct vu_virtq_element *elem); void vu_queue_detach_element(struct vu_virtq *vq); void vu_queue_unpop(struct vu_virtq *vq); diff --git a/vu_common.c b/vu_common.c index 299b5a3..6d365be 100644 --- a/vu_common.c +++ b/vu_common.c @@ -73,7 +73,7 @@ void vu_init_elem(struct vu_virtq_element *elem, struct iovec *iov, int elem_cnt * * Return: number of elements used to contain the frame */ -int vu_collect(struct vu_dev *vdev, struct vu_virtq *vq, +int vu_collect(const struct vu_dev *vdev, struct vu_virtq *vq, struct vu_virtq_element *elem, int max_elem, size_t size, size_t *frame_size) { diff --git a/vu_common.h b/vu_common.h index 901d972..bd70faf 100644 --- a/vu_common.h +++ 
b/vu_common.h @@ -46,7 +46,7 @@ static inline void vu_set_element(struct vu_virtq_element *elem, void vu_init_elem(struct vu_virtq_element *elem, struct iovec *iov, int elem_cnt); -int vu_collect(struct vu_dev *vdev, struct vu_virtq *vq, +int vu_collect(const struct vu_dev *vdev, struct vu_virtq *vq, struct vu_virtq_element *elem, int max_elem, size_t size, size_t *frame_size); void vu_set_vnethdr(const struct vu_dev *vdev, From 3876fc780d01870040343cdab7da3f14f53272d5 Mon Sep 17 00:00:00 2001 From: Stefano Brivio <sbrivio@redhat.com> Date: Fri, 27 Dec 2024 11:40:19 +0100 Subject: [PATCH 166/382] seccomp: Unconditionally allow accept(2) even if accept4(2) is present On Alpine Linux 3.21, passt aborts right away as soon as QEMU connects to it. Most likely, this has always been the case with musl, because since musl commit dc01e2cbfb29 ("add fallback emulation for accept4 on old kernels"), accept4() without flags is implemented using accept(). However, I guess that nobody realised earlier because it's typically pasta(1) being used on musl-based distributions, and the only place where we call accept4() without flags is tap_listen_handler(). Add accept() to the list of allowed system calls regardless of the presence of accept4(). 
Reported-by: NN708 <nn708@outlook.com> Link: https://bugs.passt.top/show_bug.cgi?id=106 Signed-off-by: Stefano Brivio <sbrivio@redhat.com> Reviewed-by: David Gibson <david@gibson.dropbear.id.au> --- passt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/passt.c b/passt.c index 957f3d0..1a0c404 100644 --- a/passt.c +++ b/passt.c @@ -180,7 +180,7 @@ void exit_handler(int signal) * #syscalls socket getsockopt setsockopt s390x:socketcall i686:socketcall close * #syscalls bind connect recvfrom sendto shutdown * #syscalls arm:recv ppc64le:recv arm:send ppc64le:send - * #syscalls accept4|accept listen epoll_ctl epoll_wait|epoll_pwait epoll_pwait + * #syscalls accept4 accept listen epoll_ctl epoll_wait|epoll_pwait epoll_pwait * #syscalls clock_gettime arm:clock_gettime64 i686:clock_gettime64 */ int main(int argc, char **argv) From 725acd111ba340122f2bb0601e373534eb4b5ed8 Mon Sep 17 00:00:00 2001 From: Stefano Brivio <sbrivio@redhat.com> Date: Mon, 6 Jan 2025 10:10:29 +0100 Subject: [PATCH 167/382] tcp_splice: Set (again) TCP_NODELAY on both sides In commit 7ecf69329787 ("pasta, tcp: Don't set TCP_CORK on spliced sockets") I just assumed that we wouldn't benefit from disabling Nagle's algorithm once we drop TCP_CORK (and its 200ms fixed delay). 
It turns out that with some patterns, such as a PostgreSQL server in a container receiving parameterised, short queries, for which pasta sees several short inbound messages (Parse, Bind, Describe, Execute and Sync commands getting each one their own packet, 5 to 49 bytes TCP payload each), we'll read them usually in two batches, and send them in matching batches, for example: 9165.2467: pasta: epoll event on connected spliced TCP socket 117 (events: 0x00000001) 9165.2468: Flow 0 (TCP connection (spliced)): 76 from read-side call 9165.2468: Flow 0 (TCP connection (spliced)): 76 from write-side call (passed 524288) 9165.2469: pasta: epoll event on connected spliced TCP socket 117 (events: 0x00000001) 9165.2470: Flow 0 (TCP connection (spliced)): 15 from read-side call 9165.2470: Flow 0 (TCP connection (spliced)): 15 from write-side call (passed 524288) 9165.2944: pasta: epoll event on connected spliced TCP socket 118 (events: 0x00000001) and the kernel delivers the first one, waits for acknowledgement from the receiver, then delivers the second one. This adds very substantial and unnecessary delay. It's usually a fixed ~40ms between the two batches, which is clearly unacceptable for loopback connections. In this example, the delay is shown by the timestamp of the response from socket 118. The peer (server) doesn't actually take that long (less than a millisecond), but it takes that long for the kernel to deliver our request. To avoid batching and delays, disable Nagle's algorithm by setting TCP_NODELAY on both internal and external sockets: this way, we get one inbound packet for each original message, we transfer them right away, and the kernel delivers them to the process in the container as they are, without delay. We can do this safely as we don't care much about network utilisation when there's in fact pretty much no network (loopback connections). 
This is unfortunately not visible in the TCP request-response tests from the test suite because, with smaller messages (we use one byte), Nagle's algorithm doesn't even kick in. It's probably not trivial to implement a universal test covering this case. Fixes: 7ecf69329787 ("pasta, tcp: Don't set TCP_CORK on spliced sockets") Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- tcp_splice.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/tcp_splice.c b/tcp_splice.c index 3a0f868..3a000ff 100644 --- a/tcp_splice.c +++ b/tcp_splice.c @@ -348,6 +348,7 @@ static int tcp_splice_connect(const struct ctx *c, struct tcp_splice_conn *conn) uint8_t tgtpif = conn->f.pif[TGTSIDE]; union sockaddr_inany sa; socklen_t sl; + int one = 1; if (tgtpif == PIF_HOST) conn->s[1] = tcp_conn_sock(c, af); @@ -359,12 +360,21 @@ static int tcp_splice_connect(const struct ctx *c, struct tcp_splice_conn *conn) if (conn->s[1] < 0) return -1; - if (setsockopt(conn->s[1], SOL_TCP, TCP_QUICKACK, - &((int){ 1 }), sizeof(int))) { + if (setsockopt(conn->s[1], SOL_TCP, TCP_QUICKACK, &one, sizeof(one))) { flow_trace(conn, "failed to set TCP_QUICKACK on socket %i", conn->s[1]); } + if (setsockopt(conn->s[0], SOL_TCP, TCP_NODELAY, &one, sizeof(one))) { + flow_trace(conn, "failed to set TCP_NODELAY on socket %i", + conn->s[0]); + } + + if (setsockopt(conn->s[1], SOL_TCP, TCP_NODELAY, &one, sizeof(one))) { + flow_trace(conn, "failed to set TCP_NODELAY on socket %i", + conn->s[1]); + } + pif_sockaddr(c, &sa, &sl, tgtpif, &tgt->eaddr, tgt->eport); if (connect(conn->s[1], &sa.sa, sl)) { From 2c174f1fe8a5f1923b14cde703941d4daac39850 Mon Sep 17 00:00:00 2001 From: Laurent Vivier <lvivier@redhat.com> Date: Thu, 9 Jan 2025 14:06:48 +0100 Subject: [PATCH 168/382] checksum: fix checksum with odd base address csum_unfolded() must call csum_avx2() with a 32byte aligned base address. 
To be able to do that if the buffer is not correctly aligned, it splits the buffer in 2 parts, the second part is 32byte aligned and can be used with csum_avx2(), the first part is the remaining part, that is not 32byte aligned and we use sum_16b() to compute the checksum. A problem appears if the length of the first part is odd because the checksum is using 16bit words to do the checksum. If the length is odd, when the second part is computed, all words are shifted by 1 byte, meaning weight of upper and lower byte is swapped. For instance a 13 bytes buffer: bytes: aa AA bb BB cc CC dd DD ee EE ff FF gg 16bit words: AAaa BBbb CCcc DDdd EEee FFff 00gg If we don't split the sequence, the checksum is: AAaa + BBbb + CCcc + DDdd + EEee + FFff + 00gg If we split the sequence with an even length for the first part: (AAaa + BBbb) + (CCcc + DDdd + EEee + FFff + 00gg) But if the first part has an odd length: (AAaa + BBbb + 00cc) + (ddCC + eeDD + ffEE + ggFF) To avoid the problem, do not call csum_avx2() if the first part cannot have an even length, and compute the checksum of all the buffer using sum_16b(). This is slower but it can only happen if the buffer base address is odd, and this can only happen if the binary is built using '-Os', and that means we have chosen to prioritize size over speed. 
Reported-by: Mike Jones <mike@mjones.io> Link: https://bugs.passt.top/show_bug.cgi?id=108 Signed-off-by: Laurent Vivier <lvivier@redhat.com> Reviewed-by: David Gibson <david@gibson.dropbear.id.au> [sbrivio: Added comment explaining why we check for pad & 1] Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- checksum.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/checksum.c b/checksum.c index 1c4354d..b01e0fe 100644 --- a/checksum.c +++ b/checksum.c @@ -452,7 +452,8 @@ uint32_t csum_unfolded(const void *buf, size_t len, uint32_t init) intptr_t align = ROUND_UP((intptr_t)buf, sizeof(__m256i)); unsigned int pad = align - (intptr_t)buf; - if (len < pad) + /* Don't mix sum_16b() and csum_avx2() with odd padding lengths */ + if (pad & 1 || len < pad) pad = len; if (pad) From f04b483d1509b852951fe1421ef6f6740c9f9a08 Mon Sep 17 00:00:00 2001 From: Stefano Brivio <sbrivio@redhat.com> Date: Sat, 11 Jan 2025 00:46:51 +0100 Subject: [PATCH 169/382] test/pasta_podman: Run Podman tests on a single CPU thread Increasingly often, I'm getting occasional failures of the same type as https://github.com/containers/podman/issues/24147. I guess it mostly depends on the system load. It will be a while until I'll actually run tests on a kernel including my fix for it, kernel commit a502ea6fa94b ("udp: Deal with race between UDP socket address change and rehash"), so add a horrible workaround using taskset(1), for the moment. 
Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- test/pasta_podman/bats | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/pasta_podman/bats b/test/pasta_podman/bats index 6b1c575..2f07be8 100644 --- a/test/pasta_podman/bats +++ b/test/pasta_podman/bats @@ -23,4 +23,4 @@ check [ "__PASTA_BIN__" = "__WD__/pasta" ] test Podman system test with bats -host PODMAN="__PODMAN__" CONTAINERS_HELPER_BINARY_DIR="__WD__" bats test/podman/test/system/505-networking-pasta.bats +host PODMAN="__PODMAN__" CONTAINERS_HELPER_BINARY_DIR="__WD__" taskset -c 1 bats test/podman/test/system/505-networking-pasta.bats From 1b95bd6fa1148f3609bebf7b2bcd6d47376e61a6 Mon Sep 17 00:00:00 2001 From: Laurent Vivier <lvivier@redhat.com> Date: Wed, 15 Jan 2025 17:22:30 +0100 Subject: [PATCH 170/382] vhost_user: fix multibuffer from linux Under some conditions, linux can provide several buffers in the same element (multiple entries in the iovec array). I didn't identify what changed between the kernel guest that provides one buffer and the one that provides several (doesn't seem to be a kernel change or a configuration change). Fix the following assert: ASSERTION FAILED in virtqueue_map_desc (virtio.c:402): num_sg < max_num_sg What I can see is the buffer can be split in two iovecs: - vnet header - packet data This change manages this special case but the real fix will be to allow tap_add_packet() to manage iovec array. 
Signed-off-by: Laurent Vivier <lvivier@redhat.com> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- vu_common.c | 28 ++++++++++++++++++++++------ 1 file changed, 22 insertions(+), 6 deletions(-) diff --git a/vu_common.c b/vu_common.c index 6d365be..431fba6 100644 --- a/vu_common.c +++ b/vu_common.c @@ -18,6 +18,8 @@ #include "pcap.h" #include "vu_common.h" +#define VU_MAX_TX_BUFFER_NB 2 + /** * vu_packet_check_range() - Check if a given memory zone is contained in * a mapped guest memory region @@ -168,10 +170,15 @@ static void vu_handle_tx(struct vu_dev *vdev, int index, count = 0; out_sg_count = 0; - while (count < VIRTQUEUE_MAX_SIZE) { + while (count < VIRTQUEUE_MAX_SIZE && + out_sg_count + VU_MAX_TX_BUFFER_NB <= VIRTQUEUE_MAX_SIZE) { int ret; - vu_set_element(&elem[count], &out_sg[out_sg_count], NULL); + elem[count].out_num = VU_MAX_TX_BUFFER_NB; + elem[count].out_sg = &out_sg[out_sg_count]; + elem[count].in_num = 0; + elem[count].in_sg = NULL; + ret = vu_queue_pop(vdev, vq, &elem[count]); if (ret < 0) break; @@ -181,11 +188,20 @@ static void vu_handle_tx(struct vu_dev *vdev, int index, warn("virtio-net transmit queue contains no out buffers"); break; } - ASSERT(elem[count].out_num == 1); + if (elem[count].out_num == 1) { + tap_add_packet(vdev->context, + elem[count].out_sg[0].iov_len - hdrlen, + (char *)elem[count].out_sg[0].iov_base + + hdrlen); + } else { + /* vnet header can be in a separate iovec */ + ASSERT(elem[count].out_num == 2); + ASSERT(elem[count].out_sg[0].iov_len == (size_t)hdrlen); + tap_add_packet(vdev->context, + elem[count].out_sg[1].iov_len, + (char *)elem[count].out_sg[1].iov_base); + } - tap_add_packet(vdev->context, - elem[count].out_sg[0].iov_len - hdrlen, - (char *)elem[count].out_sg[0].iov_base + hdrlen); count++; } tap_handler(vdev->context, now); From 707f77b0a93160c8695b3cf5bfd7c24d9992b106 Mon Sep 17 00:00:00 2001 From: Stefano Brivio <sbrivio@redhat.com> Date: Thu, 16 Jan 2025 20:06:59 +0100 Subject: [PATCH 171/382] tcp: Fix 
ACK sequence getting out of sync on EPOLLOUT wake-up MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In the next patches, I'm extending the usage of STALLED to a few more cases. Doing so revealed this issue: if we set STALLED and, consequently, EPOLLOUT (which is wrong, fixed later) right after we set a connection to ESTABLISHED (which also happened by mistake while I was preparing another change), with the guest sending data together with the final ACK in the handshake, say: 41.3661: vhost-user: got kick_data: 0000000000000001 idx: 1 41.3662: Flow 2 (NEW): FREE -> NEW 41.3663: Flow 2 (INI): NEW -> INI 41.3663: Flow 2 (INI): TAP [2a01:4f8:222:904::2]:52536 -> [2001:db8:9a55::1]:10003 => ? 41.3665: Flow 2 (TGT): INI -> TGT 41.3666: Flow 2 (TGT): TAP [2a01:4f8:222:904::2]:52536 -> [2001:db8:9a55::1]:10003 => HOST [::]:0 -> [2001:db8:9a55::1]:10003 41.3667: Flow 2 (TCP connection): TGT -> TYPED 41.3667: Flow 2 (TCP connection): TAP [2a01:4f8:222:904::2]:52536 -> [2001:db8:9a55::1]:10003 => HOST [::]:0 -> [2001:db8:9a55::1]:10003 41.3669: Flow 2 (TCP connection): TAP_SYN_RCVD: CLOSED -> SYN_SENT 41.3670: Flow 2 (TCP connection): Side 0 hash table insert: bucket: 339814 41.3672: Flow 2 (TCP connection): TYPED -> ACTIVE 41.3673: Flow 2 (TCP connection): TAP [2a01:4f8:222:904::2]:52536 -> [2001:db8:9a55::1]:10003 => HOST [::]:0 -> [2001:db8:9a55::1]:10003 41.3674: Flow 2 (TCP connection): TAP_SYN_ACK_SENT: SYN_SENT -> SYN_RCVD 41.3675: Flow 2 (TCP connection): ACK_FROM_TAP_DUE 41.3675: Flow 2 (TCP connection): timer expires in 10.000s 41.3675: vhost-user: got kick_data: 0000000000000001 idx: 1 41.3676: Flow 2 (TCP connection): ACK_FROM_TAP_DUE dropped 41.3676: Flow 2 (TCP connection): ESTABLISHED: SYN_RCVD -> ESTABLISHED 41.3678: Flow 2 (TCP connection): STALLED 41.3678: vhost-user: got kick_data: 0000000000000002 idx: 1 41.3679: Flow 2 (TCP connection): ACK_TO_TAP_DUE 41.3680: Flow 2 (TCP connection): timer expires in 0.010s 
41.3680: Flow 2 (TCP connection): STALLED dropped we'll immediately get an EPOLLOUT event, call tcp_update_seqack_wnd(), but ignore window and ACK sequence update. At this point, we think we acknowledged all the data to the guest (but we didn't) and we'll happily proceed to clear the ACK_TO_TAP_DUE flag: 41.3780: Flow 2 (TCP connection): ACK_TO_TAP_DUE dropped 41.3780: Flow 2 (TCP connection): timer expires in 7200.000s 41.5754: vhost-user: got kick_data: 0000000000000001 idx: 1 41.9956: vhost-user: got kick_data: 0000000000000001 idx: 1 42.8275: vhost-user: got kick_data: 0000000000000001 idx: 1 while the guest starts retransmitting that data desperately, without ever getting an ACK segment from us: 1433 38.746353 2a01:4f8:222:904::2 → 2001:db8:9a55::1 94 TCP 54312 → 10003 [SYN] Seq=0 Win=65460 Len=0 MSS=65460 SACK_PERM TSval=1089126192 TSecr=0 WS=128 1434 38.747357 2001:db8:9a55::1 → 2a01:4f8:222:904::2 82 TCP 10003 → 54312 [SYN, ACK] Seq=0 Ack=1 Win=65535 Len=0 MSS=61440 WS=256 1435 38.747500 2a01:4f8:222:904::2 → 2001:db8:9a55::1 74 TCP 54312 → 10003 [ACK] Seq=1 Ack=1 Win=65536 Len=0 1436 38.747769 2a01:4f8:222:904::2 → 2001:db8:9a55::1 8266 TCP 54312 → 10003 [PSH, ACK] Seq=1 Ack=1 Win=65536 Len=8192 1437 38.747798 2a01:4f8:222:904::2 → 2001:db8:9a55::1 32841 TCP 54312 → 10003 [ACK] Seq=8193 Ack=1 Win=65536 Len=32767 1438 38.748049 2001:db8:9a55::1 → 2a01:4f8:222:904::2 74 TCP [TCP Window Update] 10003 → 54312 [ACK] Seq=1 Ack=1 Win=65280 Len=0 1439 38.954044 2a01:4f8:222:904::2 → 2001:db8:9a55::1 8266 TCP [TCP Retransmission] 54312 → 10003 [PSH, ACK] Seq=1 Ack=1 Win=65536 Len=8192 1440 39.370096 2a01:4f8:222:904::2 → 2001:db8:9a55::1 8266 TCP [TCP Retransmission] 54312 → 10003 [PSH, ACK] Seq=1 Ack=1 Win=65536 Len=8192 1441 40.202135 2a01:4f8:222:904::2 → 2001:db8:9a55::1 8266 TCP [TCP Retransmission] 54312 → 10003 [PSH, ACK] Seq=1 Ack=1 Win=65536 Len=8192 because seq_ack_to_tap is already set to the sequence after frame number 1437 in the example. 
For some reason, I could only reproduce this with vhost-user, IPv6, and passt running under valgrind while taking captures. Even under these conditions, it happens quite rarely. Forcibly send an ACK segment if we update the ACK sequence (or the advertised window). Fixes: e5eefe77435a ("tcp: Refactor to use events instead of states, split out spliced implementation") Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- tcp.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tcp.c b/tcp.c index ec433f7..72fca63 100644 --- a/tcp.c +++ b/tcp.c @@ -2200,8 +2200,10 @@ void tcp_sock_handler(const struct ctx *c, union epoll_ref ref, if (events & EPOLLIN) tcp_data_from_sock(c, conn); - if (events & EPOLLOUT) - tcp_update_seqack_wnd(c, conn, false, NULL); + if (events & EPOLLOUT) { + if (tcp_update_seqack_wnd(c, conn, false, NULL)) + tcp_send_flag(c, conn, ACK); + } return; } From 22cf08ba00890c83922c61f5d65803b7f4c1299a Mon Sep 17 00:00:00 2001 From: Stefano Brivio <sbrivio@redhat.com> Date: Thu, 16 Jan 2025 20:31:35 +0100 Subject: [PATCH 172/382] tcp: Don't subscribe to EPOLLOUT events on STALLED I inadvertently added that in an unrelated change, but it doesn't make sense: STALLED means we have pending socket data that we can't write to the guest, not the other way around. 
Fixes: bb708111833e ("treewide: Packet abstraction with mandatory boundary checks") Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- tcp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tcp.c b/tcp.c index 72fca63..ef33388 100644 --- a/tcp.c +++ b/tcp.c @@ -437,7 +437,7 @@ static uint32_t tcp_conn_epoll_events(uint8_t events, uint8_t conn_flags) return EPOLLET; if (conn_flags & STALLED) - return EPOLLIN | EPOLLOUT | EPOLLRDHUP | EPOLLET; + return EPOLLIN | EPOLLRDHUP | EPOLLET; return EPOLLIN | EPOLLRDHUP; } From b8f573cdc222905c06f39625c0567da265a2e36e Mon Sep 17 00:00:00 2001 From: Stefano Brivio <sbrivio@redhat.com> Date: Tue, 14 Jan 2025 23:03:49 +0100 Subject: [PATCH 173/382] tcp: Set EPOLLET when reading from a socket fails with EAGAIN Before SO_PEEK_OFF support was introduced by commit e63d281871ef ("tcp: leverage support of SO_PEEK_OFF socket option when available"), we would peek data from sockets using a "discard" buffer as first iovec element, so that, unless we had no pending data at all, we would always get a positive return code from recvmsg() (except for closing connections or errors). If we couldn't send more data to the guest, in the window, we would set the STALLED flag (causing the epoll descriptor to switch to edge-triggered mode), and return early from tcp_data_from_sock(). With SO_PEEK_OFF, we don't have a discard buffer, and if there's data on the socket, but nothing beyond our current peeking offset, we'll get EAGAIN instead of our current "discard" length. In that case, we return even earlier, and we don't set EPOLLET on the socket as a result. As reported by Asahi Lina, this causes event loops where the kernel is signalling socket readiness, because there's data we didn't dequeue yet (waiting for the guest to acknowledge it), but we won't actually peek anything new, and return early without setting EPOLLET.
This is the original report, mentioning the originally proposed fix: -- When there is unacknowledged data in the inbound socket buffer, passt leaves the socket in the epoll instance to accept new data from the server. Since there is already data in the socket buffer, an epoll without EPOLLET will repeatedly fire while no data is processed, busy-looping the CPU: epoll_pwait(3, [...], 8, 1000, NULL, 8) = 4 recvmsg(25, {msg_namelen=0}, MSG_PEEK) = -1 EAGAIN (Resource temporarily unavailable) recvmsg(169, {msg_namelen=0}, MSG_PEEK) = -1 EAGAIN (Resource temporarily unavailable) recvmsg(111, {msg_namelen=0}, MSG_PEEK) = -1 EAGAIN (Resource temporarily unavailable) recvmsg(180, {msg_namelen=0}, MSG_PEEK) = -1 EAGAIN (Resource temporarily unavailable) epoll_pwait(3, [...], 8, 1000, NULL, 8) = 4 recvmsg(25, {msg_namelen=0}, MSG_PEEK) = -1 EAGAIN (Resource temporarily unavailable) recvmsg(169, {msg_namelen=0}, MSG_PEEK) = -1 EAGAIN (Resource temporarily unavailable) recvmsg(111, {msg_namelen=0}, MSG_PEEK) = -1 EAGAIN (Resource temporarily unavailable) recvmsg(180, {msg_namelen=0}, MSG_PEEK) = -1 EAGAIN (Resource temporarily unavailable) Add in the missing EPOLLET flag for this case. This brings CPU usage down from around ~80% when downloading over TCP, to ~5% (use case: passt as network transport for muvm, downloading Steam games). -- we can't set EPOLLET unconditionally though, at least right now, because we don't monitor the guest tap for EPOLLOUT in case we fail to write on that side because we filled up that buffer (and not the window of a TCP connection). Instead, rely on the observation that, once a connection is established, we only get EAGAIN on recvmsg() if we are attempting to peek data from a socket with a non-zero peeking offset: we only peek when there's pending data on a socket, and in that case, if we peek without offset, we'll always see some data. 
And if we peek data with a non-zero offset and get EAGAIN, that means that we're either waiting for more data to arrive on the socket (which would cause further wake-ups, even with EPOLLET), or we're waiting for the guest to acknowledge some of it, which would anyway cause a wake-up. In that case, it's safe to set STALLED and, in turn, EPOLLET on the socket, which fixes the EPOLLIN event loop. While we're establishing a connection from the socket side, though, we'll call, once, tcp_{buf,vu}_data_from_sock() to see if we got any data while we were waiting for SYN, ACK from the guest. See the comment at the end of tcp_conn_from_sock_finish(). And if there's no data queued on the socket as we check, we'll also get EAGAIN, even if our peeking offset is zero. For this reason, we need to additionally check that 'already_sent' is not zero, meaning, explicitly, that our peeking offset is not zero. Reported-by: Asahi Lina <lina@asahilina.net> Fixes: e63d281871ef ("tcp: leverage support of SO_PEEK_OFF socket option when available") Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- tcp_buf.c | 3 +++ tcp_vu.c | 4 ++++ 2 files changed, 7 insertions(+) diff --git a/tcp_buf.c b/tcp_buf.c index a975a55..8c15101 100644 --- a/tcp_buf.c +++ b/tcp_buf.c @@ -359,6 +359,9 @@ int tcp_buf_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn) return -errno; } + if (already_sent) /* No new data and EAGAIN: set EPOLLET */ + conn_flag(c, conn, STALLED); + return 0; } diff --git a/tcp_vu.c b/tcp_vu.c index 10e17d3..8256f53 100644 --- a/tcp_vu.c +++ b/tcp_vu.c @@ -399,6 +399,10 @@ int tcp_vu_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn) tcp_rst(c, conn); return len; } + + if (already_sent) /* No new data and EAGAIN: set EPOLLET */ + conn_flag(c, conn, STALLED); + return 0; } From a8f4fc481ce3afbf48522a0af44d222d665b515e Mon Sep 17 00:00:00 2001 From: Stefano Brivio <sbrivio@redhat.com> Date: Thu, 16 Jan 2025 20:47:00 +0100 Subject: [PATCH 174/382] tcp: Mask 
EPOLLIN altogether if we're blocked waiting on an ACK from the guest There are pretty much two cases of the (misnomer) STALLED: in one case, we could send more data to the guest if it becomes available, and in another case, we can't, because we filled the window. If, in this second case, we keep EPOLLIN enabled, but never read from the socket, we get short but CPU-annoying storms of EPOLLIN events, upon which we reschedule the ACK timeout handler, never read from the socket, go back to epoll_wait(), and so on: timerfd_settime(76, 0, {it_interval={tv_sec=0, tv_nsec=0}, it_value={tv_sec=2, tv_nsec=0}}, NULL) = 0 epoll_wait(3, [{events=EPOLLIN, data={u32=10497, u64=38654716161}}], 8, 1000) = 1 timerfd_settime(76, 0, {it_interval={tv_sec=0, tv_nsec=0}, it_value={tv_sec=2, tv_nsec=0}}, NULL) = 0 epoll_wait(3, [{events=EPOLLIN, data={u32=10497, u64=38654716161}}], 8, 1000) = 1 timerfd_settime(76, 0, {it_interval={tv_sec=0, tv_nsec=0}, it_value={tv_sec=2, tv_nsec=0}}, NULL) = 0 epoll_wait(3, [{events=EPOLLIN, data={u32=10497, u64=38654716161}}], 8, 1000) = 1 also known as: 29.1517: Flow 2 (TCP connection): timer expires in 2.000s 29.1517: Flow 2 (TCP connection): timer expires in 2.000s 29.1517: Flow 2 (TCP connection): timer expires in 2.000s which, for some reason, becomes very visible with muvm and aria2c downloading from a server nearby in parallel chunks. That's because EPOLLIN isn't cleared if we don't read from the socket, and even with EPOLLET, epoll_wait() will repeatedly wake us up until we actually read something. In this case, we don't want to subscribe to EPOLLIN at all: all we're waiting for is an ACK segment from the guest. Differentiate this case with a new connection flag, ACK_FROM_TAP_BLOCKS, which doesn't just indicate that we're waiting for an ACK from the guest (ACK_FROM_TAP_DUE), but also that we're blocked waiting for it. If this flag is set before we set STALLED, EPOLLIN will be masked while we set EPOLLET because of STALLED. 
Whenever we clear STALLED, we also clear this flag. This is definitely not elegant, but it's a minimal fix. We can probably simplify this at a later point by having a category of connection flags directly corresponding to epoll flags, and dropping STALLED altogether, or, perhaps, always using EPOLLET (but we need a mechanism to re-check sockets for pending data if we can't temporarily write to the guest). I suspect that this might also be implied in https://github.com/containers/podman/issues/23686, hence the Link: tag. It doesn't necessarily mean I'm fixing it (I can't reproduce that). Link: https://github.com/containers/podman/issues/23686 Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- tcp.c | 8 ++++++-- tcp_buf.c | 2 ++ tcp_conn.h | 1 + tcp_vu.c | 2 ++ 4 files changed, 11 insertions(+), 2 deletions(-) diff --git a/tcp.c b/tcp.c index ef33388..3b3193a 100644 --- a/tcp.c +++ b/tcp.c @@ -345,7 +345,7 @@ static const char *tcp_state_str[] __attribute((__unused__)) = { static const char *tcp_flag_str[] __attribute((__unused__)) = { "STALLED", "LOCAL", "ACTIVE_CLOSE", "ACK_TO_TAP_DUE", - "ACK_FROM_TAP_DUE", + "ACK_FROM_TAP_DUE", "ACK_FROM_TAP_BLOCKS", }; /* Listening sockets, used for automatic port forwarding in pasta mode only */ @@ -436,8 +436,12 @@ static uint32_t tcp_conn_epoll_events(uint8_t events, uint8_t conn_flags) if (events & TAP_FIN_SENT) return EPOLLET; - if (conn_flags & STALLED) + if (conn_flags & STALLED) { + if (conn_flags & ACK_FROM_TAP_BLOCKS) + return EPOLLRDHUP | EPOLLET; + return EPOLLIN | EPOLLRDHUP | EPOLLET; + } return EPOLLIN | EPOLLRDHUP; } diff --git a/tcp_buf.c b/tcp_buf.c index 8c15101..cbefa42 100644 --- a/tcp_buf.c +++ b/tcp_buf.c @@ -309,6 +309,7 @@ int tcp_buf_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn) } if (!wnd_scaled || already_sent >= wnd_scaled) { + conn_flag(c, conn, ACK_FROM_TAP_BLOCKS); conn_flag(c, conn, STALLED); conn_flag(c, conn, ACK_FROM_TAP_DUE); return 0; @@ -387,6 +388,7 @@ int 
tcp_buf_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn) return 0; } + conn_flag(c, conn, ~ACK_FROM_TAP_BLOCKS); conn_flag(c, conn, ~STALLED); send_bufs = DIV_ROUND_UP(len, mss); diff --git a/tcp_conn.h b/tcp_conn.h index 6ae0511..d342680 100644 --- a/tcp_conn.h +++ b/tcp_conn.h @@ -77,6 +77,7 @@ struct tcp_tap_conn { #define ACTIVE_CLOSE BIT(2) #define ACK_TO_TAP_DUE BIT(3) #define ACK_FROM_TAP_DUE BIT(4) +#define ACK_FROM_TAP_BLOCKS BIT(5) #define SNDBUF_BITS 24 unsigned int sndbuf :SNDBUF_BITS; diff --git a/tcp_vu.c b/tcp_vu.c index 8256f53..a216bb1 100644 --- a/tcp_vu.c +++ b/tcp_vu.c @@ -381,6 +381,7 @@ int tcp_vu_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn) } if (!wnd_scaled || already_sent >= wnd_scaled) { + conn_flag(c, conn, ACK_FROM_TAP_BLOCKS); conn_flag(c, conn, STALLED); conn_flag(c, conn, ACK_FROM_TAP_DUE); return 0; @@ -423,6 +424,7 @@ int tcp_vu_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn) return 0; } + conn_flag(c, conn, ~ACK_FROM_TAP_BLOCKS); conn_flag(c, conn, ~STALLED); /* Likely, some new data was acked too. */ From 6016e04a3aae90cdd49fec391088b83a6d2170a6 Mon Sep 17 00:00:00 2001 From: Laurent Vivier <lvivier@redhat.com> Date: Thu, 19 Dec 2024 12:13:53 +0100 Subject: [PATCH 175/382] vhost-user: update protocol features and commands list vhost-user protocol specification has been updated with feature flags and commands we will need to implement migration. 
Signed-off-by: Laurent Vivier <lvivier@redhat.com> [sbrivio: Fix comment to union vhost_user_payload] Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- vhost_user.c | 5 +++++ vhost_user.h | 40 +++++++++++++++++++++++++++++++++++++++- 2 files changed, 44 insertions(+), 1 deletion(-) diff --git a/vhost_user.c b/vhost_user.c index 4b8558f..48226a8 100644 --- a/vhost_user.c +++ b/vhost_user.c @@ -110,6 +110,11 @@ static const char *vu_request_to_string(unsigned int req) REQ(VHOST_USER_GET_MAX_MEM_SLOTS), REQ(VHOST_USER_ADD_MEM_REG), REQ(VHOST_USER_REM_MEM_REG), + REQ(VHOST_USER_SET_STATUS), + REQ(VHOST_USER_GET_STATUS), + REQ(VHOST_USER_GET_SHARED_OBJECT), + REQ(VHOST_USER_SET_DEVICE_STATE_FD), + REQ(VHOST_USER_CHECK_DEVICE_STATE), }; #undef REQ return vu_request_str[req]; diff --git a/vhost_user.h b/vhost_user.h index 464ba21..c880893 100644 --- a/vhost_user.h +++ b/vhost_user.h @@ -37,6 +37,10 @@ enum vhost_user_protocol_feature { VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD = 12, VHOST_USER_PROTOCOL_F_INBAND_NOTIFICATIONS = 14, VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS = 15, + VHOST_USER_PROTOCOL_F_STATUS = 16, + /* Feature 17 reserved for VHOST_USER_PROTOCOL_F_XEN_MMAP. 
*/ + VHOST_USER_PROTOCOL_F_SHARED_OBJECT = 18, + VHOST_USER_PROTOCOL_F_DEVICE_STATE = 19, VHOST_USER_PROTOCOL_F_MAX }; @@ -83,6 +87,11 @@ enum vhost_user_request { VHOST_USER_GET_MAX_MEM_SLOTS = 36, VHOST_USER_ADD_MEM_REG = 37, VHOST_USER_REM_MEM_REG = 38, + VHOST_USER_SET_STATUS = 39, + VHOST_USER_GET_STATUS = 40, + VHOST_USER_GET_SHARED_OBJECT = 41, + VHOST_USER_SET_DEVICE_STATE_FD = 42, + VHOST_USER_CHECK_DEVICE_STATE = 43, VHOST_USER_MAX }; @@ -128,12 +137,39 @@ struct vhost_user_memory { struct vhost_user_memory_region regions[VHOST_MEMORY_BASELINE_NREGIONS]; }; +/** + * struct vhost_user_log - Address and size of the shared memory region used + * to log page update + * @mmap_size: Size of the shared memory region + * @mmap_offset: Offset of the shared memory region + */ +struct vhost_user_log { + uint64_t mmap_size; + uint64_t mmap_offset; +}; + +/** + * struct vhost_user_transfer_device_state - Set the direction and phase + * of the backend device state fd + * @direction: Device state transfer direction (save or load) + * @phase: Migration phase (only stopped is supported) + */ +struct vhost_user_transfer_device_state { + uint32_t direction; +#define VHOST_USER_TRANSFER_STATE_DIRECTION_SAVE 0 +#define VHOST_USER_TRANSFER_STATE_DIRECTION_LOAD 1 + uint32_t phase; +#define VHOST_USER_TRANSFER_STATE_PHASE_STOPPED 0 +}; + /** * union vhost_user_payload - vhost-user message payload * @u64: 64-bit payload * @state: vring state payload * @addr: vring addresses payload - * vhost_user_memory: Memory regions information payload + * @memory: Memory regions information payload + * @log: Memory logging payload + * @transfer_state: Device state payload */ union vhost_user_payload { #define VHOST_USER_VRING_IDX_MASK 0xff @@ -142,6 +178,8 @@ union vhost_user_payload { struct vhost_vring_state state; struct vhost_vring_addr addr; struct vhost_user_memory memory; + struct vhost_user_log log; + struct vhost_user_transfer_device_state transfer_state; }; /** From 
b04195c60ff34db89b6bc400ad582d0ff399757b Mon Sep 17 00:00:00 2001 From: Laurent Vivier <lvivier@redhat.com> Date: Thu, 19 Dec 2024 12:13:54 +0100 Subject: [PATCH 176/382] vhost-user: add VHOST_USER_SET_LOG_FD command VHOST_USER_SET_LOG_FD is an optional message with an eventfd in ancillary data, it may be used to inform the front-end that the log has been modified. Signed-off-by: Laurent Vivier <lvivier@redhat.com> [sbrivio: Fix comment to vu_set_log_fd_exec()] Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- vhost_user.c | 56 ++++++++++++++++++++++++++++++++++++++++++++++++++++ vhost_user.h | 1 + virtio.h | 2 ++ 3 files changed, 59 insertions(+) diff --git a/vhost_user.c b/vhost_user.c index 48226a8..3f34c91 100644 --- a/vhost_user.c +++ b/vhost_user.c @@ -504,6 +504,57 @@ static bool vu_set_mem_table_exec(struct vu_dev *vdev, return false; } +/** + * vu_close_log() - Close the logging file descriptor + * @vdev: vhost-user device + */ +static void vu_close_log(struct vu_dev *vdev) +{ + if (vdev->log_call_fd != -1) { + close(vdev->log_call_fd); + vdev->log_call_fd = -1; + } +} + +/** + * vu_log_kick() - Inform the front-end that the log has been modified + * @vdev: vhost-user device + */ +/* cppcheck-suppress unusedFunction */ +void vu_log_kick(const struct vu_dev *vdev) +{ + if (vdev->log_call_fd != -1) { + int rc; + + rc = eventfd_write(vdev->log_call_fd, 1); + if (rc == -1) + die_perror("vhost-user kick eventfd_write()"); + } +} + +/** + * vu_set_log_fd_exec() - Set the eventfd used to report logging update + * @vdev: vhost-user device + * @vmsg: vhost-user message + * + * Return: False as no reply is requested + */ +static bool vu_set_log_fd_exec(struct vu_dev *vdev, + struct vhost_user_msg *msg) +{ + if (msg->fd_num != 1) + die("Invalid log_fd message"); + + if (vdev->log_call_fd != -1) + close(vdev->log_call_fd); + + vdev->log_call_fd = msg->fds[0]; + + debug("Got log_call_fd: %d", vdev->log_call_fd); + + return false; +} + /** * vu_set_vring_num_exec() 
- Set the size of the queue (vring size) * @vdev: vhost-user device @@ -864,8 +915,10 @@ void vu_init(struct ctx *c) .notification = true, }; } + c->vdev->log_call_fd = -1; } + /** * vu_cleanup() - Reset vhost-user device * @vdev: vhost-user device @@ -909,6 +962,8 @@ void vu_cleanup(struct vu_dev *vdev) } } vdev->nregions = 0; + + vu_close_log(vdev); } /** @@ -929,6 +984,7 @@ static bool (*vu_handle[VHOST_USER_MAX])(struct vu_dev *vdev, [VHOST_USER_GET_QUEUE_NUM] = vu_get_queue_num_exec, [VHOST_USER_SET_OWNER] = vu_set_owner_exec, [VHOST_USER_SET_MEM_TABLE] = vu_set_mem_table_exec, + [VHOST_USER_SET_LOG_FD] = vu_set_log_fd_exec, [VHOST_USER_SET_VRING_NUM] = vu_set_vring_num_exec, [VHOST_USER_SET_VRING_ADDR] = vu_set_vring_addr_exec, [VHOST_USER_SET_VRING_BASE] = vu_set_vring_base_exec, diff --git a/vhost_user.h b/vhost_user.h index c880893..bf3eb50 100644 --- a/vhost_user.h +++ b/vhost_user.h @@ -240,5 +240,6 @@ static inline bool vu_queue_started(const struct vu_virtq *vq) void vu_print_capabilities(void); void vu_init(struct ctx *c); void vu_cleanup(struct vu_dev *vdev); +void vu_log_kick(const struct vu_dev *vdev); void vu_control_handler(struct vu_dev *vdev, int fd, uint32_t events); #endif /* VHOST_USER_H */ diff --git a/virtio.h b/virtio.h index 0af259d..3b0df34 100644 --- a/virtio.h +++ b/virtio.h @@ -103,6 +103,7 @@ struct vu_dev_region { * @regions: Guest shared memory regions * @features: Vhost-user features * @protocol_features: Vhost-user protocol features + * @log_call_fd: Eventfd to report logging update */ struct vu_dev { struct ctx *context; @@ -111,6 +112,7 @@ struct vu_dev { struct vu_virtq vq[VHOST_USER_MAX_QUEUES]; uint64_t features; uint64_t protocol_features; + int log_call_fd; }; /** From 538312af196308dea9a4ddb9442bed921c0dc915 Mon Sep 17 00:00:00 2001 From: Laurent Vivier <lvivier@redhat.com> Date: Thu, 19 Dec 2024 12:13:55 +0100 Subject: [PATCH 177/382] vhost-user: Pass vu_dev to more virtio functions vu_dev will be needed to log page 
update. Add the parameter to: vring_used_write() vu_queue_fill_by_index() vu_queue_fill() vring_used_idx_set() vu_queue_flush() The new parameter is unused for now. Signed-off-by: Laurent Vivier <lvivier@redhat.com> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- virtio.c | 32 ++++++++++++++++++++++---------- virtio.h | 10 ++++++---- vu_common.c | 8 ++++---- 3 files changed, 32 insertions(+), 18 deletions(-) diff --git a/virtio.c b/virtio.c index 625bac3..52d5a4d 100644 --- a/virtio.c +++ b/virtio.c @@ -580,28 +580,34 @@ bool vu_queue_rewind(struct vu_virtq *vq, unsigned int num) /** * vring_used_write() - Write an entry in the used ring + * @dev: Vhost-user device * @vq: Virtqueue * @uelem: Entry to write * @i: Index of the entry in the used ring */ -static inline void vring_used_write(struct vu_virtq *vq, +static inline void vring_used_write(const struct vu_dev *vdev, + struct vu_virtq *vq, const struct vring_used_elem *uelem, int i) { struct vring_used *used = vq->vring.used; used->ring[i] = *uelem; + (void)vdev; } + /** * vu_queue_fill_by_index() - Update information of a descriptor ring entry * in the used ring + * @dev: Vhost-user device * @vq: Virtqueue * @index: Descriptor ring index * @len: Size of the element * @idx: Used ring entry index */ -void vu_queue_fill_by_index(struct vu_virtq *vq, unsigned int index, - unsigned int len, unsigned int idx) +void vu_queue_fill_by_index(const struct vu_dev *vdev, struct vu_virtq *vq, + unsigned int index, unsigned int len, + unsigned int idx) { struct vring_used_elem uelem; @@ -612,7 +618,7 @@ void vu_queue_fill_by_index(struct vu_virtq *vq, unsigned int index, uelem.id = htole32(index); uelem.len = htole32(len); - vring_used_write(vq, &uelem, idx); + vring_used_write(vdev, vq, &uelem, idx); } /** @@ -623,30 +629,36 @@ void vu_queue_fill_by_index(struct vu_virtq *vq, unsigned int index, * @len: Size of the element * @idx: Used ring entry index */ -void vu_queue_fill(struct vu_virtq *vq, const struct 
vu_virtq_element *elem, - unsigned int len, unsigned int idx) +void vu_queue_fill(const struct vu_dev *vdev, struct vu_virtq *vq, + const struct vu_virtq_element *elem, unsigned int len, + unsigned int idx) { - vu_queue_fill_by_index(vq, elem->index, len, idx); + vu_queue_fill_by_index(vdev, vq, elem->index, len, idx); } /** * vring_used_idx_set() - Set the descriptor ring current index + * @dev: Vhost-user device * @vq: Virtqueue * @val: Value to set in the index */ -static inline void vring_used_idx_set(struct vu_virtq *vq, uint16_t val) +static inline void vring_used_idx_set(const struct vu_dev *vdev, + struct vu_virtq *vq, uint16_t val) { vq->vring.used->idx = htole16(val); + (void)vdev; vq->used_idx = val; } /** * vu_queue_flush() - Flush the virtqueue + * @dev: Vhost-user device * @vq: Virtqueue * @count: Number of entry to flush */ -void vu_queue_flush(struct vu_virtq *vq, unsigned int count) +void vu_queue_flush(const struct vu_dev *vdev, struct vu_virtq *vq, + unsigned int count) { uint16_t old, new; @@ -658,7 +670,7 @@ void vu_queue_flush(struct vu_virtq *vq, unsigned int count) old = vq->used_idx; new = old + count; - vring_used_idx_set(vq, new); + vring_used_idx_set(vdev, vq, new); vq->inuse -= count; if ((uint16_t)(new - vq->signalled_used) < (uint16_t)(new - old)) vq->signalled_used_valid = false; diff --git a/virtio.h b/virtio.h index 3b0df34..d95bb07 100644 --- a/virtio.h +++ b/virtio.h @@ -177,10 +177,12 @@ int vu_queue_pop(const struct vu_dev *dev, struct vu_virtq *vq, void vu_queue_detach_element(struct vu_virtq *vq); void vu_queue_unpop(struct vu_virtq *vq); bool vu_queue_rewind(struct vu_virtq *vq, unsigned int num); -void vu_queue_fill_by_index(struct vu_virtq *vq, unsigned int index, - unsigned int len, unsigned int idx); -void vu_queue_fill(struct vu_virtq *vq, +void vu_queue_fill_by_index(const struct vu_dev *vdev, struct vu_virtq *vq, + unsigned int index, unsigned int len, + unsigned int idx); +void vu_queue_fill(const struct vu_dev 
*vdev, struct vu_virtq *vq, const struct vu_virtq_element *elem, unsigned int len, unsigned int idx); -void vu_queue_flush(struct vu_virtq *vq, unsigned int count); +void vu_queue_flush(const struct vu_dev *vdev, struct vu_virtq *vq, + unsigned int count); #endif /* VIRTIO_H */ diff --git a/vu_common.c b/vu_common.c index 431fba6..0ba2351 100644 --- a/vu_common.c +++ b/vu_common.c @@ -142,9 +142,9 @@ void vu_flush(const struct vu_dev *vdev, struct vu_virtq *vq, int i; for (i = 0; i < elem_cnt; i++) - vu_queue_fill(vq, &elem[i], elem[i].in_sg[0].iov_len, i); + vu_queue_fill(vdev, vq, &elem[i], elem[i].in_sg[0].iov_len, i); - vu_queue_flush(vq, elem_cnt); + vu_queue_flush(vdev, vq, elem_cnt); vu_queue_notify(vdev, vq); } @@ -210,8 +210,8 @@ static void vu_handle_tx(struct vu_dev *vdev, int index, int i; for (i = 0; i < count; i++) - vu_queue_fill(vq, &elem[i], 0, i); - vu_queue_flush(vq, count); + vu_queue_fill(vdev, vq, &elem[i], 0, i); + vu_queue_flush(vdev, vq, count); vu_queue_notify(vdev, vq); } } From 3c1d91b8162607ec27b05502278a361cd73a54e2 Mon Sep 17 00:00:00 2001 From: Laurent Vivier <lvivier@redhat.com> Date: Thu, 19 Dec 2024 12:13:56 +0100 Subject: [PATCH 178/382] vhost-user: add VHOST_USER_SET_LOG_BASE command Sets logging shared memory space. When the back-end has VHOST_USER_PROTOCOL_F_LOG_SHMFD protocol feature, the log memory fd is provided in the ancillary data of VHOST_USER_SET_LOG_BASE message, the size and offset of shared memory area provided in the message. 
Signed-off-by: Laurent Vivier <lvivier@redhat.com> [sbrivio: Fix coding style in a bunch of places] Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- util.h | 3 ++ vhost_user.c | 86 +++++++++++++++++++++++++++++++++++++++++++++++++++- vhost_user.h | 3 ++ virtio.c | 74 ++++++++++++++++++++++++++++++++++++++++++-- virtio.h | 4 +++ 5 files changed, 167 insertions(+), 3 deletions(-) diff --git a/util.h b/util.h index 3fa1d12..d02333d 100644 --- a/util.h +++ b/util.h @@ -152,6 +152,9 @@ static inline void barrier(void) { __asm__ __volatile__("" ::: "memory"); } #define smp_wmb() smp_mb_release() #define smp_rmb() smp_mb_acquire() +#define qatomic_or(ptr, n) \ + ((void) __atomic_fetch_or(ptr, n, __ATOMIC_SEQ_CST)) + #define NS_FN_STACK_SIZE (1024 * 1024) /* 1MiB */ int do_clone(int (*fn)(void *), char *stack_area, size_t stack_size, int flags, diff --git a/vhost_user.c b/vhost_user.c index 3f34c91..66ded12 100644 --- a/vhost_user.c +++ b/vhost_user.c @@ -510,6 +510,12 @@ static bool vu_set_mem_table_exec(struct vu_dev *vdev, */ static void vu_close_log(struct vu_dev *vdev) { + if (vdev->log_table) { + if (munmap(vdev->log_table, vdev->log_size) != 0) + die_perror("close log munmap() error"); + vdev->log_table = NULL; + } + if (vdev->log_call_fd != -1) { close(vdev->log_call_fd); vdev->log_call_fd = -1; @@ -520,7 +526,6 @@ static void vu_close_log(struct vu_dev *vdev) * vu_log_kick() - Inform the front-end that the log has been modified * @vdev: vhost-user device */ -/* cppcheck-suppress unusedFunction */ void vu_log_kick(const struct vu_dev *vdev) { if (vdev->log_call_fd != -1) { @@ -532,6 +537,83 @@ void vu_log_kick(const struct vu_dev *vdev) } } +/** + * vu_log_page() - Update logging table + * @log_table: Base address of the logging table + * @page: Page number that has been updated + */ +/* NOLINTNEXTLINE(readability-non-const-parameter) */ +static void vu_log_page(uint8_t *log_table, uint64_t page) +{ + qatomic_or(&log_table[page / 8], 1 << (page % 8)); +} + 
+/** + * vu_log_write() - Log memory write + * @dev: vhost-user device + * @address: Memory address + * @length: Memory size + */ +void vu_log_write(const struct vu_dev *vdev, uint64_t address, uint64_t length) +{ + uint64_t page; + + if (!vdev->log_table || !length || + !vu_has_feature(vdev, VHOST_F_LOG_ALL)) + return; + + page = address / VHOST_LOG_PAGE; + while (page * VHOST_LOG_PAGE < address + length) { + vu_log_page(vdev->log_table, page); + page++; + } + vu_log_kick(vdev); +} + +/** + * vu_set_log_base_exec() - Set the memory log base + * @vdev: vhost-user device + * @vmsg: vhost-user message + * + * Return: False as no reply is requested + * + * #syscalls:vu mmap|mmap2 munmap + */ +static bool vu_set_log_base_exec(struct vu_dev *vdev, + struct vhost_user_msg *msg) +{ + uint64_t log_mmap_size, log_mmap_offset; + void *base; + int fd; + + if (msg->fd_num != 1 || msg->hdr.size != sizeof(msg->payload.log)) + die("vhost-user: Invalid log_base message"); + + fd = msg->fds[0]; + log_mmap_offset = msg->payload.log.mmap_offset; + log_mmap_size = msg->payload.log.mmap_size; + + debug("vhost-user log mmap_offset: %"PRId64, log_mmap_offset); + debug("vhost-user log mmap_size: %"PRId64, log_mmap_size); + + base = mmap(0, log_mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, + log_mmap_offset); + close(fd); + if (base == MAP_FAILED) + die("vhost-user log mmap error"); + + if (vdev->log_table) + munmap(vdev->log_table, vdev->log_size); + + vdev->log_table = base; + vdev->log_size = log_mmap_size; + + msg->hdr.size = sizeof(msg->payload.u64); + msg->fd_num = 0; + + return true; +} + /** * vu_set_log_fd_exec() - Set the eventfd used to report logging update * @vdev: vhost-user device @@ -915,6 +997,7 @@ void vu_init(struct ctx *c) .notification = true, }; } + c->vdev->log_table = NULL; c->vdev->log_call_fd = -1; } @@ -984,6 +1067,7 @@ static bool (*vu_handle[VHOST_USER_MAX])(struct vu_dev *vdev, [VHOST_USER_GET_QUEUE_NUM] = vu_get_queue_num_exec, [VHOST_USER_SET_OWNER] = 
vu_set_owner_exec, [VHOST_USER_SET_MEM_TABLE] = vu_set_mem_table_exec, + [VHOST_USER_SET_LOG_BASE] = vu_set_log_base_exec, [VHOST_USER_SET_LOG_FD] = vu_set_log_fd_exec, [VHOST_USER_SET_VRING_NUM] = vu_set_vring_num_exec, [VHOST_USER_SET_VRING_ADDR] = vu_set_vring_addr_exec, diff --git a/vhost_user.h b/vhost_user.h index bf3eb50..e769cb1 100644 --- a/vhost_user.h +++ b/vhost_user.h @@ -15,6 +15,7 @@ #include "iov.h" #define VHOST_USER_F_PROTOCOL_FEATURES 30 +#define VHOST_LOG_PAGE 4096 #define VHOST_MEMORY_BASELINE_NREGIONS 8 @@ -241,5 +242,7 @@ void vu_print_capabilities(void); void vu_init(struct ctx *c); void vu_cleanup(struct vu_dev *vdev); void vu_log_kick(const struct vu_dev *vdev); +void vu_log_write(const struct vu_dev *vdev, uint64_t address, + uint64_t length); void vu_control_handler(struct vu_dev *vdev, int fd, uint32_t events); #endif /* VHOST_USER_H */ diff --git a/virtio.c b/virtio.c index 52d5a4d..2b58e4d 100644 --- a/virtio.c +++ b/virtio.c @@ -81,6 +81,7 @@ #include "util.h" #include "virtio.h" +#include "vhost_user.h" #define VIRTQUEUE_MAX_SIZE 1024 @@ -592,7 +593,72 @@ static inline void vring_used_write(const struct vu_dev *vdev, struct vring_used *used = vq->vring.used; used->ring[i] = *uelem; - (void)vdev; + vu_log_write(vdev, vq->vring.log_guest_addr + + offsetof(struct vring_used, ring[i]), + sizeof(used->ring[i])); +} + +/** + * vu_log_queue_fill() - Log virtqueue memory update + * @dev: vhost-user device + * @vq: Virtqueue + * @index: Descriptor ring index + * @len: Size of the element + */ +static void vu_log_queue_fill(const struct vu_dev *vdev, struct vu_virtq *vq, + unsigned int index, unsigned int len) +{ + struct vring_desc desc_buf[VIRTQUEUE_MAX_SIZE]; + struct vring_desc *desc = vq->vring.desc; + unsigned int max, min; + unsigned num_bufs = 0; + uint64_t read_len; + + if (!vdev->log_table || !len || !vu_has_feature(vdev, VHOST_F_LOG_ALL)) + return; + + max = vq->vring.num; + + if (le16toh(desc[index].flags) & VRING_DESC_F_INDIRECT) 
{ + unsigned int desc_len; + uint64_t desc_addr; + + if (le32toh(desc[index].len) % sizeof(struct vring_desc)) + die("Invalid size for indirect buffer table"); + + /* loop over the indirect descriptor table */ + desc_addr = le64toh(desc[index].addr); + desc_len = le32toh(desc[index].len); + max = desc_len / sizeof(struct vring_desc); + read_len = desc_len; + desc = vu_gpa_to_va(vdev, &read_len, desc_addr); + if (desc && read_len != desc_len) { + /* Failed to use zero copy */ + desc = NULL; + if (!virtqueue_read_indirect_desc(vdev, desc_buf, + desc_addr, + desc_len)) + desc = desc_buf; + } + + if (!desc) + die("Invalid indirect buffer table"); + + index = 0; + } + + do { + if (++num_bufs > max) + die("Looped descriptor"); + + if (le16toh(desc[index].flags) & VRING_DESC_F_WRITE) { + min = MIN(le32toh(desc[index].len), len); + vu_log_write(vdev, le64toh(desc[index].addr), min); + len -= min; + } + } while (len > 0 && + (virtqueue_read_next_desc(desc, index, max, &index) == + VIRTQUEUE_READ_DESC_MORE)); } @@ -614,6 +680,8 @@ void vu_queue_fill_by_index(const struct vu_dev *vdev, struct vu_virtq *vq, if (!vq->vring.avail) return; + vu_log_queue_fill(vdev, vq, index, len); + idx = (idx + vq->used_idx) % vq->vring.num; uelem.id = htole32(index); @@ -646,7 +714,9 @@ static inline void vring_used_idx_set(const struct vu_dev *vdev, struct vu_virtq *vq, uint16_t val) { vq->vring.used->idx = htole16(val); - (void)vdev; + vu_log_write(vdev, vq->vring.log_guest_addr + + offsetof(struct vring_used, idx), + sizeof(vq->vring.used->idx)); vq->used_idx = val; } diff --git a/virtio.h b/virtio.h index d95bb07..f572341 100644 --- a/virtio.h +++ b/virtio.h @@ -104,6 +104,8 @@ struct vu_dev_region { * @features: Vhost-user features * @protocol_features: Vhost-user protocol features * @log_call_fd: Eventfd to report logging update + * @log_size: Size of the logging memory region + * @log_table: Base of the logging memory region */ struct vu_dev { struct ctx *context; @@ -113,6 +115,8 @@ 
struct vu_dev { uint64_t features; uint64_t protocol_features; int log_call_fd; + uint64_t log_size; + uint8_t *log_table; }; /** From 78c73e9395b13354272010d2f202c819689d48f8 Mon Sep 17 00:00:00 2001 From: Laurent Vivier <lvivier@redhat.com> Date: Thu, 19 Dec 2024 12:13:57 +0100 Subject: [PATCH 179/382] vhost-user: Report to front-end we support VHOST_USER_PROTOCOL_F_LOG_SHMFD This feature allows QEMU to be migrated. We also need to report VHOST_F_LOG_ALL. This protocol feature reports we can log the page update and implement VHOST_USER_SET_LOG_BASE and VHOST_USER_SET_LOG_FD. Signed-off-by: Laurent Vivier <lvivier@redhat.com> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- vhost_user.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/vhost_user.c b/vhost_user.c index 66ded12..747b7f6 100644 --- a/vhost_user.c +++ b/vhost_user.c @@ -334,6 +334,7 @@ static bool vu_get_features_exec(struct vu_dev *vdev, uint64_t features = 1ULL << VIRTIO_F_VERSION_1 | 1ULL << VIRTIO_NET_F_MRG_RXBUF | + 1ULL << VHOST_F_LOG_ALL | 1ULL << VHOST_USER_F_PROTOCOL_FEATURES; (void)vdev; @@ -911,7 +912,8 @@ static bool vu_set_vring_err_exec(struct vu_dev *vdev, static bool vu_get_protocol_features_exec(struct vu_dev *vdev, struct vhost_user_msg *msg) { - uint64_t features = 1ULL << VHOST_USER_PROTOCOL_F_REPLY_ACK; + uint64_t features = 1ULL << VHOST_USER_PROTOCOL_F_REPLY_ACK | + 1ULL << VHOST_USER_PROTOCOL_F_LOG_SHMFD; (void)vdev; vmsg_set_reply_u64(msg, features); From 878e16345461eb2745c761f6929fd6e9da0df447 Mon Sep 17 00:00:00 2001 From: Laurent Vivier <lvivier@redhat.com> Date: Thu, 19 Dec 2024 12:13:58 +0100 Subject: [PATCH 180/382] vhost-user: add VHOST_USER_CHECK_DEVICE_STATE command MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit After transferring the back-end’s internal state during migration, check whether the back-end was able to successfully process the state.
The value returned indicates success or error; 0 is success, any non-zero value is an error. Signed-off-by: Laurent Vivier <lvivier@redhat.com> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- vhost_user.c | 21 +++++++++++++++++++++ virtio.h | 18 ++++++++++-------- 2 files changed, 31 insertions(+), 8 deletions(-) diff --git a/vhost_user.c b/vhost_user.c index 747b7f6..2962709 100644 --- a/vhost_user.c +++ b/vhost_user.c @@ -980,6 +980,23 @@ static bool vu_set_vring_enable_exec(struct vu_dev *vdev, return false; } +/** + * vu_check_device_state_exec() -- Return device state migration result + * @vdev: vhost-user device + * @vmsg: vhost-user message + * + * Return: True as the reply contains the migration result + */ +static bool vu_check_device_state_exec(struct vu_dev *vdev, + struct vhost_user_msg *msg) +{ + (void)vdev; + + vmsg_set_reply_u64(msg, vdev->device_state_result); + + return true; +} + /** * vu_init() - Initialize vhost-user device structure * @c: execution context @@ -1001,6 +1018,7 @@ void vu_init(struct ctx *c) } c->vdev->log_table = NULL; c->vdev->log_call_fd = -1; + c->vdev->device_state_result = -1; } @@ -1049,6 +1067,8 @@ void vu_cleanup(struct vu_dev *vdev) vdev->nregions = 0; vu_close_log(vdev); + + vdev->device_state_result = -1; } /** @@ -1079,6 +1099,7 @@ static bool (*vu_handle[VHOST_USER_MAX])(struct vu_dev *vdev, [VHOST_USER_SET_VRING_CALL] = vu_set_vring_call_exec, [VHOST_USER_SET_VRING_ERR] = vu_set_vring_err_exec, [VHOST_USER_SET_VRING_ENABLE] = vu_set_vring_enable_exec, + [VHOST_USER_CHECK_DEVICE_STATE] = vu_check_device_state_exec, }; /** diff --git a/virtio.h b/virtio.h index f572341..512ec1b 100644 --- a/virtio.h +++ b/virtio.h @@ -98,14 +98,15 @@ struct vu_dev_region { /** * struct vu_dev - vhost-user device information - * @context: Execution context - * @nregions: Number of shared memory regions - * @regions: Guest shared memory regions - * @features: Vhost-user features - * @protocol_features: Vhost-user protocol 
features - * @log_call_fd: Eventfd to report logging update - * @log_size: Size of the logging memory region - * @log_table: Base of the logging memory region + * @context: Execution context + * @nregions: Number of shared memory regions + * @regions: Guest shared memory regions + * @features: Vhost-user features + * @protocol_features: Vhost-user protocol features + * @log_call_fd: Eventfd to report logging update + * @log_size: Size of the logging memory region + * @log_table: Base of the logging memory region + * @device_state_result: Device state migration result */ struct vu_dev { struct ctx *context; @@ -117,6 +118,7 @@ struct vu_dev { int log_call_fd; uint64_t log_size; uint8_t *log_table; + int device_state_result; }; /** From 31d70024beda1e49131d7b68dd7554bee16c79f3 Mon Sep 17 00:00:00 2001 From: Laurent Vivier <lvivier@redhat.com> Date: Thu, 19 Dec 2024 12:13:59 +0100 Subject: [PATCH 181/382] vhost-user: add VHOST_USER_SET_DEVICE_STATE_FD command Set the file descriptor to use to transfer the backend device state during migration. 
Signed-off-by: Laurent Vivier <lvivier@redhat.com> [sbrivio: Fixed nits and coding style here and there] Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- epoll_type.h | 2 ++ passt.c | 4 +++ vhost_user.c | 82 ++++++++++++++++++++++++++++++++++++++++++++++++++-- virtio.h | 2 ++ vu_common.c | 49 +++++++++++++++++++++++++++++++ vu_common.h | 1 + 6 files changed, 138 insertions(+), 2 deletions(-) diff --git a/epoll_type.h b/epoll_type.h index f3ef415..fd9eac3 100644 --- a/epoll_type.h +++ b/epoll_type.h @@ -40,6 +40,8 @@ enum epoll_type { EPOLL_TYPE_VHOST_CMD, /* vhost-user kick event socket */ EPOLL_TYPE_VHOST_KICK, + /* vhost-user migration socket */ + EPOLL_TYPE_VHOST_MIGRATION, EPOLL_NUM_TYPES, }; diff --git a/passt.c b/passt.c index 1a0c404..b1c8ab6 100644 --- a/passt.c +++ b/passt.c @@ -75,6 +75,7 @@ char *epoll_type_str[] = { [EPOLL_TYPE_TAP_LISTEN] = "listening qemu socket", [EPOLL_TYPE_VHOST_CMD] = "vhost-user command socket", [EPOLL_TYPE_VHOST_KICK] = "vhost-user kick socket", + [EPOLL_TYPE_VHOST_MIGRATION] = "vhost-user migration socket", }; static_assert(ARRAY_SIZE(epoll_type_str) == EPOLL_NUM_TYPES, "epoll_type_str[] doesn't match enum epoll_type"); @@ -356,6 +357,9 @@ loop: case EPOLL_TYPE_VHOST_KICK: vu_kick_cb(c.vdev, ref, &now); break; + case EPOLL_TYPE_VHOST_MIGRATION: + vu_migrate(c.vdev, eventmask); + break; default: /* Can't happen */ ASSERT(0); diff --git a/vhost_user.c b/vhost_user.c index 2962709..daff9ab 100644 --- a/vhost_user.c +++ b/vhost_user.c @@ -981,7 +981,78 @@ static bool vu_set_vring_enable_exec(struct vu_dev *vdev, } /** - * vu_check_device_state_exec() -- Return device state migration result + * vu_set_migration_watch() - Add the migration file descriptor to epoll + * @vdev: vhost-user device + * @fd: File descriptor to add + * @direction: Direction of the migration (save or load backend state) + */ +static void vu_set_migration_watch(const struct vu_dev *vdev, int fd, + uint32_t direction) +{ + union epoll_ref ref = { + .type 
= EPOLL_TYPE_VHOST_MIGRATION, + .fd = fd, + }; + struct epoll_event ev = { 0 }; + + ev.data.u64 = ref.u64; + switch (direction) { + case VHOST_USER_TRANSFER_STATE_DIRECTION_SAVE: + ev.events = EPOLLOUT; + break; + case VHOST_USER_TRANSFER_STATE_DIRECTION_LOAD: + ev.events = EPOLLIN; + break; + default: + ASSERT(0); + } + + epoll_ctl(vdev->context->epollfd, EPOLL_CTL_ADD, ref.fd, &ev); +} + +/** + * vu_set_device_state_fd_exec() - Set the device state migration channel + * @vdev: vhost-user device + * @vmsg: vhost-user message + * + * Return: True as the reply contains 0 to indicate success + * and set bit 8 as we don't provide our own fd. + */ +static bool vu_set_device_state_fd_exec(struct vu_dev *vdev, + struct vhost_user_msg *msg) +{ + unsigned int direction = msg->payload.transfer_state.direction; + unsigned int phase = msg->payload.transfer_state.phase; + + if (msg->fd_num != 1) + die("Invalid device_state_fd message"); + + if (phase != VHOST_USER_TRANSFER_STATE_PHASE_STOPPED) + die("Invalid device_state_fd phase: %d", phase); + + if (direction != VHOST_USER_TRANSFER_STATE_DIRECTION_SAVE && + direction != VHOST_USER_TRANSFER_STATE_DIRECTION_LOAD) + die("Invalid device_state_fd direction: %d", direction); + + if (vdev->device_state_fd != -1) { + vu_remove_watch(vdev, vdev->device_state_fd); + close(vdev->device_state_fd); + } + + vdev->device_state_fd = msg->fds[0]; + vdev->device_state_result = -1; + vu_set_migration_watch(vdev, vdev->device_state_fd, direction); + + debug("Got device_state_fd: %d", vdev->device_state_fd); + + /* We don't provide a new fd for the data transfer */ + vmsg_set_reply_u64(msg, VHOST_USER_VRING_NOFD_MASK); + + return true; +} + +/** + * vu_check_device_state_exec() - Return device state migration result * @vdev: vhost-user device * @vmsg: vhost-user message * @@ -1018,6 +1089,7 @@ void vu_init(struct ctx *c) } c->vdev->log_table = NULL; c->vdev->log_call_fd = -1; + c->vdev->device_state_fd = -1; c->vdev->device_state_result = -1; }
@@ -1068,7 +1140,12 @@ void vu_cleanup(struct vu_dev *vdev) vu_close_log(vdev); - vdev->device_state_result = -1; + if (vdev->device_state_fd != -1) { + vu_remove_watch(vdev, vdev->device_state_fd); + close(vdev->device_state_fd); + vdev->device_state_fd = -1; + vdev->device_state_result = -1; + } } /** @@ -1099,6 +1176,7 @@ static bool (*vu_handle[VHOST_USER_MAX])(struct vu_dev *vdev, [VHOST_USER_SET_VRING_CALL] = vu_set_vring_call_exec, [VHOST_USER_SET_VRING_ERR] = vu_set_vring_err_exec, [VHOST_USER_SET_VRING_ENABLE] = vu_set_vring_enable_exec, + [VHOST_USER_SET_DEVICE_STATE_FD] = vu_set_device_state_fd_exec, [VHOST_USER_CHECK_DEVICE_STATE] = vu_check_device_state_exec, }; diff --git a/virtio.h b/virtio.h index 512ec1b..7bef2d2 100644 --- a/virtio.h +++ b/virtio.h @@ -106,6 +106,7 @@ struct vu_dev_region { * @log_call_fd: Eventfd to report logging update * @log_size: Size of the logging memory region * @log_table: Base of the logging memory region + * @device_state_fd: Device state migration channel * @device_state_result: Device state migration result */ struct vu_dev { @@ -118,6 +119,7 @@ struct vu_dev { int log_call_fd; uint64_t log_size; uint8_t *log_table; + int device_state_fd; int device_state_result; }; diff --git a/vu_common.c b/vu_common.c index 0ba2351..87a0d94 100644 --- a/vu_common.c +++ b/vu_common.c @@ -297,3 +297,52 @@ err: return -1; } + +/** + * vu_migrate() - Send/receive passt internal state to/from QEMU + * @vdev: vhost-user device + * @events: epoll events + */ +void vu_migrate(struct vu_dev *vdev, uint32_t events) +{ + int ret; + + /* TODO: collect/set passt internal state + * and use vdev->device_state_fd to send/receive it + */ + debug("vu_migrate fd %d events %x", vdev->device_state_fd, events); + if (events & EPOLLOUT) { + debug("Saving backend state"); + + /* send some stuff */ + ret = write(vdev->device_state_fd, "PASST", 6); + /* value to be returned by VHOST_USER_CHECK_DEVICE_STATE */ + vdev->device_state_result = ret == -1 ?
-1 : 0; + /* Closing the file descriptor signals the end of transfer */ + epoll_ctl(vdev->context->epollfd, EPOLL_CTL_DEL, + vdev->device_state_fd, NULL); + close(vdev->device_state_fd); + vdev->device_state_fd = -1; + } else if (events & EPOLLIN) { + char buf[6]; + + debug("Loading backend state"); + /* read some stuff */ + ret = read(vdev->device_state_fd, buf, sizeof(buf)); + /* value to be returned by VHOST_USER_CHECK_DEVICE_STATE */ + if (ret != sizeof(buf)) { + vdev->device_state_result = -1; + } else { + ret = strncmp(buf, "PASST", sizeof(buf)); + vdev->device_state_result = ret == 0 ? 0 : -1; + } + } else if (events & EPOLLHUP) { + debug("Closing migration channel"); + + /* The end of file signals the end of the transfer. */ + epoll_ctl(vdev->context->epollfd, EPOLL_CTL_DEL, + vdev->device_state_fd, NULL); + close(vdev->device_state_fd); + vdev->device_state_fd = -1; + } +} diff --git a/vu_common.h b/vu_common.h index bd70faf..d56c021 100644 --- a/vu_common.h +++ b/vu_common.h @@ -57,4 +57,5 @@ void vu_flush(const struct vu_dev *vdev, struct vu_virtq *vq, void vu_kick_cb(struct vu_dev *vdev, union epoll_ref ref, const struct timespec *now); int vu_send_single(const struct ctx *c, const void *buf, size_t size); +void vu_migrate(struct vu_dev *vdev, uint32_t events); #endif /* VU_COMMON_H */ From 412ed4f09ff2e07545acdc5fe87a55a34aab4f92 Mon Sep 17 00:00:00 2001 From: Laurent Vivier <lvivier@redhat.com> Date: Thu, 19 Dec 2024 12:14:00 +0100 Subject: [PATCH 182/382] vhost-user: Report to front-end we support VHOST_USER_PROTOCOL_F_DEVICE_STATE Report to front-end that we support device state commands: VHOST_USER_CHECK_DEVICE_STATE VHOST_USER_SET_DEVICE_STATE_FD These features are needed to transfer backend state using frontend channel.
Signed-off-by: Laurent Vivier <lvivier@redhat.com> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- vhost_user.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vhost_user.c b/vhost_user.c index daff9ab..f12dec5 100644 --- a/vhost_user.c +++ b/vhost_user.c @@ -913,7 +913,8 @@ static bool vu_get_protocol_features_exec(struct vu_dev *vdev, struct vhost_user_msg *msg) { uint64_t features = 1ULL << VHOST_USER_PROTOCOL_F_REPLY_ACK | - 1ULL << VHOST_USER_PROTOCOL_F_LOG_SHMFD; + 1ULL << VHOST_USER_PROTOCOL_F_LOG_SHMFD | + 1ULL << VHOST_USER_PROTOCOL_F_DEVICE_STATE; (void)vdev; vmsg_set_reply_u64(msg, features); From c96a88d550fcda3f1972aee395fcfda19905d0a4 Mon Sep 17 00:00:00 2001 From: Laurent Vivier <lvivier@redhat.com> Date: Mon, 20 Jan 2025 14:15:22 +0100 Subject: [PATCH 183/382] vhost_user: remove ASSERT() on iovec number Replace ASSERT() on the number of iovec in the element and on the first entry length by a debug() message. Signed-off-by: Laurent Vivier <lvivier@redhat.com> [sbrivio: Fix typo in failure message] Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- vu_common.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/vu_common.c b/vu_common.c index 87a0d94..aa5ca7b 100644 --- a/vu_common.c +++ b/vu_common.c @@ -195,8 +195,12 @@ static void vu_handle_tx(struct vu_dev *vdev, int index, hdrlen); } else { /* vnet header can be in a separate iovec */ - ASSERT(elem[count].out_num == 2); - ASSERT(elem[count].out_sg[0].iov_len == (size_t)hdrlen); + if (elem[count].out_num != 2) + debug("virtio-net transmit queue contains more than one buffer ([%d]: %u)", + count, elem[count].out_num); + if (elem[count].out_sg[0].iov_len != (size_t)hdrlen) + debug("virtio-net transmit queue entry not aligned on hdrlen ([%d]: %d != %zu)", + count, hdrlen, elem[count].out_sg[0].iov_len); tap_add_packet(vdev->context, elem[count].out_sg[1].iov_len, (char *)elem[count].out_sg[1].iov_base); From 8757834d145a06b845aa0bb6bdfd4f93971b8d74 
Mon Sep 17 00:00:00 2001 From: Stefano Brivio <sbrivio@redhat.com> Date: Mon, 20 Jan 2025 16:49:30 +0100 Subject: [PATCH 184/382] tcp: Buffer sizes are *not* inherited on accept()/accept4() ...so it's pointless to set SO_RCVBUF and SO_SNDBUF on listening sockets. Call tcp_sock_set_bufsize() after accept4(), for inbound sockets. As we didn't have large buffer sizes set for inbound sockets for a long time (they are set explicitly only if the maximum size is big enough, more than the ~200 KiB default), I ran some more throughput tests for this one, and I see slightly better numbers (say, 17 gbps instead of 15 gbps guest to host without vhost-user). Fixes: 904b86ade7db ("tcp: Rework window handling, timers, add SO_RCVLOWAT and pools for sockets/pipes") Signed-off-by: Stefano Brivio <sbrivio@redhat.com> Reviewed-by: David Gibson <david@gibson.dropbear.id.au> --- tcp.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/tcp.c b/tcp.c index 3b3193a..a012b81 100644 --- a/tcp.c +++ b/tcp.c @@ -2057,6 +2057,8 @@ void tcp_listen_handler(const struct ctx *c, union epoll_ref ref, if (s < 0) goto cancel; + tcp_sock_set_bufsize(c, s); + /* FIXME: When listening port has a specific bound address, record that * as our address */ @@ -2260,7 +2262,6 @@ static int tcp_sock_init_one(const struct ctx *c, const union inany_addr *addr, if (s < 0) return s; - tcp_sock_set_bufsize(c, s); return s; } @@ -2317,9 +2318,7 @@ static void tcp_ns_sock_init4(const struct ctx *c, in_port_t port) s = pif_sock_l4(c, EPOLL_TYPE_TCP_LISTEN, PIF_SPLICE, &inany_loopback4, NULL, port, tref.u32); - if (s >= 0) - tcp_sock_set_bufsize(c, s); - else + if (s < 0) s = -1; if (c->tcp.fwd_out.mode == FWD_AUTO) @@ -2343,9 +2342,7 @@ static void tcp_ns_sock_init6(const struct ctx *c, in_port_t port) s = pif_sock_l4(c, EPOLL_TYPE_TCP_LISTEN, PIF_SPLICE, &inany_loopback6, NULL, port, tref.u32); - if (s >= 0) - tcp_sock_set_bufsize(c, s); - else + if (s < 0) s = -1; if
(c->tcp.fwd_out.mode == FWD_AUTO) From 54bb972cfb2637f64a9718023a2351f8f259abdb Mon Sep 17 00:00:00 2001 From: Stefano Brivio <sbrivio@redhat.com> Date: Fri, 17 Jan 2025 10:10:10 +0100 Subject: [PATCH 185/382] tcp: Disable Nagle's algorithm (set TCP_NODELAY) on all sockets Following up on 725acd111ba3 ("tcp_splice: Set (again) TCP_NODELAY on both sides"), David argues that, in general, we don't know what kind of TCP traffic we're dealing with, on any side or path. TCP segments might have been delivered to our socket with a PSH flag, but we don't have a way to know about it. Similarly, the guest might send us segments with PSH or URG set, but we don't know if we should generally TCP_CORK sockets and uncork on those flags, because that would assume they're running a Linux kernel (and a particular version of it) matching the kernel that delivers outbound packets for us. Given that we can't make any assumption and everything might very well be interactive traffic, disable Nagle's algorithm on all non-spliced sockets as well. 
After all, John Nagle himself is nowadays recommending that delayed ACKs should never be enabled together with his algorithm, but we don't have a practical way to ensure that our environment is free from delayed ACKs (TCP_QUICKACK is not really usable for this purpose): https://news.ycombinator.com/item?id=34180239 Suggested-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> Reviewed-by: David Gibson <david@gibson.dropbear.id.au> --- tcp.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/tcp.c b/tcp.c index a012b81..4d6a6b3 100644 --- a/tcp.c +++ b/tcp.c @@ -756,6 +756,19 @@ static void tcp_sock_set_bufsize(const struct ctx *c, int s) trace("TCP: failed to set SO_SNDBUF to %i", v); } +/** + * tcp_sock_set_nodelay() - Set TCP_NODELAY option (disable Nagle's algorithm) + * @s: Socket, can be -1 to avoid check in the caller + */ +static void tcp_sock_set_nodelay(int s) +{ + if (s == -1) + return; + + if (setsockopt(s, SOL_TCP, TCP_NODELAY, &((int){ 1 }), sizeof(int))) + debug("TCP: failed to set TCP_NODELAY on socket %i", s); +} + /** * tcp_update_csum() - Calculate TCP checksum * @psum: Unfolded partial checksum of the IPv4 or IPv6 pseudo-header @@ -1285,6 +1298,7 @@ static int tcp_conn_new_sock(const struct ctx *c, sa_family_t af) return -errno; tcp_sock_set_bufsize(c, s); + tcp_sock_set_nodelay(s); return s; } @@ -2058,6 +2072,7 @@ void tcp_listen_handler(const struct ctx *c, union epoll_ref ref, goto cancel; tcp_sock_set_bufsize(c, s); + tcp_sock_set_nodelay(s); /* FIXME: When listening port has a specific bound address, record that * as our address From db2c91ae86c7c0d1d068714db2342b9057506148 Mon Sep 17 00:00:00 2001 From: Stefano Brivio <sbrivio@redhat.com> Date: Mon, 20 Jan 2025 18:36:30 +0100 Subject: [PATCH 186/382] tcp: Set ACK flag on *all* RST segments, even for client in SYN-SENT state Somewhat curiously, RFC 9293, section 3.10.7.3, states: If the state is SYN-SENT, then [...] 
Second, check the RST bit: - If the RST bit is set, [...] o If the ACK was acceptable, then signal to the user "error: connection reset", drop the segment, enter CLOSED state, delete TCB, and return. Otherwise (no ACK), drop the segment and return. which matches verbatim RFC 793, pages 66-67, and is implemented as-is by tcp_rcv_synsent_state_process() in the Linux kernel, that is: /* No ACK in the segment */ if (th->rst) { /* rfc793: * "If the RST bit is set * * Otherwise (no ACK) drop the segment and return." */ goto discard_and_undo; } meaning that if a client is in SYN-SENT state, and we send a RST segment once we realise that we can't establish the outbound connection, the client will ignore our segment and will need to pointlessly wait until the connection times out instead of aborting it right away. The ACK flag on a RST, in this case, doesn't really seem to have any function, but we must set it nevertheless. The ACK sequence number is already correct because we always set it before calling tcp_prepare_flags(), whenever relevant. This leaves us with no cases where we should *not* set the ACK flag on non-SYN segments, so always set the ACK flag for RST segments. Note that non-SYN, non-RST segments were already covered by commit 4988e2b40631 ("tcp: Unconditionally force ACK for all !SYN, !RST packets"). 
Reported-by: Dirk Janssen <Dirk.Janssen@schiphol.nl> Reported-by: Roeland van de Pol <Roeland.van.de.Pol@schiphol.nl> Reported-by: Robert Floor <Robert.Floor@schiphol.nl> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- tcp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tcp.c b/tcp.c index 4d6a6b3..c89f323 100644 --- a/tcp.c +++ b/tcp.c @@ -1147,7 +1147,7 @@ int tcp_prepare_flags(const struct ctx *c, struct tcp_tap_conn *conn, *opts = TCP_SYN_OPTS(mss, conn->ws_to_tap); *optlen = sizeof(*opts); - } else if (!(flags & RST)) { + } else { flags |= ACK; } From ec5c4d936dafcbc5e07caeb594dfd771050da221 Mon Sep 17 00:00:00 2001 From: Stefano Brivio <sbrivio@redhat.com> Date: Tue, 21 Jan 2025 00:39:06 +0100 Subject: [PATCH 187/382] tcp: Set PSH flag for last incoming packets in a batch MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit So far we omitted setting PSH flags for inbound traffic altogether: as we ignore the nature of the data we're sending, we can't conclude that some data is more or less urgent. This works fine with Linux guests, as the Linux kernel doesn't do much with it, on input: it will generally deliver data to the application layer without delay. However, with Windows, things change: if we don't set the PSH flag on interactive inbound traffic, we can expect long delays before the data is delivered to the application. This is very visible with RDP, where packets we send on behalf of the RDP client are delivered with delays exceeding one second: $ tshark -r rdp.pcap -td -Y 'frame.number in { 33170 .. 
33173 }' --disable-protocol tls 33170 0.030296 93.235.154.248 → 88.198.0.164 54 TCP 49012 → 3389 [ACK] Seq=13820 Ack=285229 Win=387968 Len=0 33171 0.985412 88.198.0.164 → 93.235.154.248 105 TCP 3389 → 49012 [PSH, ACK] Seq=285229 Ack=13820 Win=63198 Len=51 33172 0.030373 93.235.154.248 → 88.198.0.164 54 TCP 49012 → 3389 [ACK] Seq=13820 Ack=285280 Win=387968 Len=0 33173 1.383776 88.198.0.164 → 93.235.154.248 424 TCP 3389 → 49012 [PSH, ACK] Seq=285280 Ack=13820 Win=63198 Len=370 in this example (packet capture taken by passt), frame #33172 is a mouse event sent by the RDP client, and frame #33173 is the first event (display reacting to click) sent back by the server. This appears as a 1.4 s delay before we get frame #33173. If we set PSH, instead: $ tshark -r rdp_psh.pcap -td -Y 'frame.number in { 314 .. 317 }' --disable-protocol tls 314 0.002503 93.235.154.248 → 88.198.0.164 170 TCP 51066 → 3389 [PSH, ACK] Seq=7779 Ack=74047 Win=31872 Len=116 315 0.000557 88.198.0.164 → 93.235.154.248 54 TCP 3389 → 51066 [ACK] Seq=79162 Ack=7895 Win=62872 Len=0 316 0.012752 93.235.154.248 → 88.198.0.164 170 TCP 51066 → 3389 [PSH, ACK] Seq=7895 Ack=79162 Win=31872 Len=116 317 0.011927 88.198.0.164 → 93.235.154.248 107 TCP 3389 → 51066 [PSH, ACK] Seq=79162 Ack=8011 Win=62756 Len=53 here, in frame #316, our mouse event is delivered without a delay and receives a response in approximately 12 ms. Set PSH on the last segment for any batch we dequeue from the socket, that is, set it whenever we know that we might not be sending data to the same port for a while. 
Reported-by: NN708 Link: https://bugs.passt.top/show_bug.cgi?id=107 Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- tcp_buf.c | 11 ++++++++--- tcp_vu.c | 7 +++++-- 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/tcp_buf.c b/tcp_buf.c index cbefa42..72d99c5 100644 --- a/tcp_buf.c +++ b/tcp_buf.c @@ -239,9 +239,10 @@ int tcp_buf_send_flag(const struct ctx *c, struct tcp_tap_conn *conn, int flags) * @dlen: TCP payload length * @no_csum: Don't compute IPv4 checksum, use the one from previous buffer * @seq: Sequence number to be sent + * @push: Set PSH flag, last segment in a batch */ static void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn, - ssize_t dlen, int no_csum, uint32_t seq) + ssize_t dlen, int no_csum, uint32_t seq, bool push) { struct tcp_payload_t *payload; const uint16_t *check = NULL; @@ -268,6 +269,7 @@ static void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn, payload->th.th_x2 = 0; payload->th.th_flags = 0; payload->th.ack = 1; + payload->th.psh = push; iov[TCP_IOV_PAYLOAD].iov_len = dlen + sizeof(struct tcphdr); tcp_l2_buf_fill_headers(conn, iov, check, seq, false); if (++tcp_payload_used > TCP_FRAMES_MEM - 1) @@ -402,11 +404,14 @@ int tcp_buf_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn) seq = conn->seq_to_tap; for (i = 0; i < send_bufs; i++) { int no_csum = i && i != send_bufs - 1 && tcp_payload_used; + bool push = false; - if (i == send_bufs - 1) + if (i == send_bufs - 1) { dlen = last_len; + push = true; + } - tcp_data_to_tap(c, conn, dlen, no_csum, seq); + tcp_data_to_tap(c, conn, dlen, no_csum, seq, push); seq += dlen; } diff --git a/tcp_vu.c b/tcp_vu.c index a216bb1..fad7065 100644 --- a/tcp_vu.c +++ b/tcp_vu.c @@ -289,10 +289,11 @@ static ssize_t tcp_vu_sock_recv(const struct ctx *c, * @iov_cnt: Number of entries in @iov * @check: Checksum, if already known * @no_tcp_csum: Do not set TCP checksum + * @push: Set PSH flag, last segment in a batch */ static void 
tcp_vu_prepare(const struct ctx *c, struct tcp_tap_conn *conn, struct iovec *iov, size_t iov_cnt, - const uint16_t **check, bool no_tcp_csum) + const uint16_t **check, bool no_tcp_csum, bool push) { const struct flowside *toside = TAPFLOW(conn); bool v6 = !(inany_v4(&toside->eaddr) && inany_v4(&toside->oaddr)); @@ -334,6 +335,7 @@ static void tcp_vu_prepare(const struct ctx *c, struct tcp_tap_conn *conn, memset(th, 0, sizeof(*th)); th->doff = sizeof(*th) / 4; th->ack = 1; + th->psh = push; tcp_fill_headers(conn, NULL, ip4h, ip6h, th, &payload, *check, conn->seq_to_tap, no_tcp_csum); @@ -443,6 +445,7 @@ int tcp_vu_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn) struct iovec *iov = &elem[head[i]].in_sg[0]; int buf_cnt = head[i + 1] - head[i]; ssize_t dlen = iov_size(iov, buf_cnt) - hdrlen; + bool push = i == head_cnt - 1; vu_set_vnethdr(vdev, iov->iov_base, buf_cnt); @@ -451,7 +454,7 @@ int tcp_vu_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn) check = NULL; previous_dlen = dlen; - tcp_vu_prepare(c, conn, iov, buf_cnt, &check, !*c->pcap); + tcp_vu_prepare(c, conn, iov, buf_cnt, &check, !*c->pcap, push); if (*c->pcap) { pcap_iov(iov, buf_cnt, From 4f2c8e79130ef3d6132e34c49746e397745f9d73 Mon Sep 17 00:00:00 2001 From: Laurent Vivier <lvivier@redhat.com> Date: Tue, 21 Jan 2025 14:16:02 +0100 Subject: [PATCH 188/382] vhost_user: Drop packet with unsupported iovec array If the iovec array cannot be managed, drop it rather than passing the second entry to tap_add_packet(). 
Signed-off-by: Laurent Vivier <lvivier@redhat.com> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- vu_common.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/vu_common.c b/vu_common.c index aa5ca7b..f43d8ac 100644 --- a/vu_common.c +++ b/vu_common.c @@ -195,15 +195,17 @@ static void vu_handle_tx(struct vu_dev *vdev, int index, hdrlen); } else { /* vnet header can be in a separate iovec */ - if (elem[count].out_num != 2) + if (elem[count].out_num != 2) { debug("virtio-net transmit queue contains more than one buffer ([%d]: %u)", count, elem[count].out_num); - if (elem[count].out_sg[0].iov_len != (size_t)hdrlen) + } else if (elem[count].out_sg[0].iov_len != (size_t)hdrlen) { debug("virtio-net transmit queue entry not aligned on hdrlen ([%d]: %d != %zu)", - count, hdrlen, elem[count].out_sg[0].iov_len); - tap_add_packet(vdev->context, - elem[count].out_sg[1].iov_len, - (char *)elem[count].out_sg[1].iov_base); + count, hdrlen, elem[count].out_sg[0].iov_len); + } else { + tap_add_packet(vdev->context, + elem[count].out_sg[1].iov_len, + (char *)elem[count].out_sg[1].iov_base); + } } count++; From d477a1fb03c5995d07e481b25dd94fc9e9bc02f2 Mon Sep 17 00:00:00 2001 From: Stefano Brivio <sbrivio@redhat.com> Date: Thu, 23 Jan 2025 08:55:49 +0100 Subject: [PATCH 189/382] netlink: Skip loopback interface while looking for a template There might be reasons to have routes on the loopback interface, for example Any-IP/AnyIP routes as implemented by Linux kernel commit ab79ad14a2d5 ("ipv6: Implement Any-IP support for IPv6."). If we use the loopback interface as a template, though, we'll pick 'lo' (typically) as interface name for our tap interface, but we'll already have an interface called 'lo' in the target namespace, and as we TUNSETIFF on it, we'll fail with EINVAL, because it's not a tap interface. Skip the loopback interface while looking for a template interface or, more accurately, skip the interface with index 1. 
Strictly speaking, we should fetch interface flags via RTM_GETLINK instead, and check for IFF_LOOPBACK, but interleaving that request while we're iterating over routes is unnecessarily complicated. Link: https://www.reddit.com/r/podman/comments/1i6pj7u/starting_pod_without_external_network/ Signed-off-by: Stefano Brivio <sbrivio@redhat.com> Reviewed-by: David Gibson <david@gibson.dropbear.id.au> --- netlink.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/netlink.c b/netlink.c index 0407692..37d8b5b 100644 --- a/netlink.c +++ b/netlink.c @@ -297,6 +297,10 @@ unsigned int nl_get_ext_if(int s, sa_family_t af) if (!thisifi) continue; /* No interface for this route */ + /* Skip 'lo': we should test IFF_LOOPBACK, but keep it simple */ + if (thisifi == 1) + continue; + /* Skip routes to link-local addresses */ if (af == AF_INET && dst && IN4_IS_PREFIX_LINKLOCAL(dst, rtm->rtm_dst_len)) From dd6a6854c73a09c4091c1776ee7f349d1e1f966c Mon Sep 17 00:00:00 2001 From: Laurent Vivier <lvivier@redhat.com> Date: Fri, 24 Jan 2025 20:07:41 +0100 Subject: [PATCH 190/382] vhost-user: Implement an empty VHOST_USER_SEND_RARP command Passt cannot manage and doesn't need to manage the broadcast of a fake RARP, but QEMU will report an error message if Passt doesn't implement it. Implement an empty SEND_RARP command to silence QEMU error message. 
Signed-off-by: Laurent Vivier <lvivier@redhat.com> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- vhost_user.c | 30 +++++++++++++++++++++++++++++- 1 file changed, 29 insertions(+), 1 deletion(-) diff --git a/vhost_user.c b/vhost_user.c index f12dec5..6bf0dda 100644 --- a/vhost_user.c +++ b/vhost_user.c @@ -914,7 +914,8 @@ static bool vu_get_protocol_features_exec(struct vu_dev *vdev, { uint64_t features = 1ULL << VHOST_USER_PROTOCOL_F_REPLY_ACK | 1ULL << VHOST_USER_PROTOCOL_F_LOG_SHMFD | - 1ULL << VHOST_USER_PROTOCOL_F_DEVICE_STATE; + 1ULL << VHOST_USER_PROTOCOL_F_DEVICE_STATE | + 1ULL << VHOST_USER_PROTOCOL_F_RARP; (void)vdev; vmsg_set_reply_u64(msg, features); @@ -981,6 +982,32 @@ static bool vu_set_vring_enable_exec(struct vu_dev *vdev, return false; } +/** + * vu_send_rarp_exec() - vhost-user specification says: "Broadcast a fake + * RARP to notify the migration is terminated", + * but passt doesn't need to update any ARP table, + * so do nothing to silence QEMU bogus error message + * @vdev: vhost-user device + * @vmsg: vhost-user message + * + * Return: False as no reply is requested + */ +static bool vu_send_rarp_exec(struct vu_dev *vdev, + struct vhost_user_msg *msg) +{ + char macstr[ETH_ADDRSTRLEN]; + + (void)vdev; + + /* ignore the command */ + + debug("Ignore command VHOST_USER_SEND_RARP for %s", + eth_ntop((unsigned char *)&msg->payload.u64, macstr, + sizeof(macstr))); + + return false; +} + /** * vu_set_migration_watch() - Add the migration file descriptor to epoll * @vdev: vhost-user device @@ -1177,6 +1204,7 @@ static bool (*vu_handle[VHOST_USER_MAX])(struct vu_dev *vdev, [VHOST_USER_SET_VRING_CALL] = vu_set_vring_call_exec, [VHOST_USER_SET_VRING_ERR] = vu_set_vring_err_exec, [VHOST_USER_SET_VRING_ENABLE] = vu_set_vring_enable_exec, + [VHOST_USER_SEND_RARP] = vu_send_rarp_exec, [VHOST_USER_SET_DEVICE_STATE_FD] = vu_set_device_state_fd_exec, [VHOST_USER_CHECK_DEVICE_STATE] = vu_check_device_state_exec, }; From 
10c4a9e1b383becd7366bda986f886675f7c4cb2 Mon Sep 17 00:00:00 2001 From: David Gibson <david@gibson.dropbear.id.au> Date: Thu, 30 Jan 2025 17:52:10 +1100 Subject: [PATCH 191/382] tcp: Always pass NULL event with EPOLL_CTL_DEL In tcp_epoll_ctl() we pass an event pointer with EPOLL_CTL_DEL, even though it will be ignored. It's possible this was a workaround for pre-2.6.9 kernels which required a non-NULL pointer here, but we rely on the kernel accepting NULL events for EPOLL_CTL_DEL in lots of other places. Use NULL instead for simplicity and consistency. Signed-off-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- tcp.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tcp.c b/tcp.c index c89f323..4eed82b 100644 --- a/tcp.c +++ b/tcp.c @@ -468,9 +468,9 @@ static int tcp_epoll_ctl(const struct ctx *c, struct tcp_tap_conn *conn) if (conn->events == CLOSED) { if (conn->in_epoll) - epoll_ctl(c->epollfd, EPOLL_CTL_DEL, conn->sock, &ev); + epoll_ctl(c->epollfd, EPOLL_CTL_DEL, conn->sock, NULL); if (conn->timer != -1) - epoll_ctl(c->epollfd, EPOLL_CTL_DEL, conn->timer, &ev); + epoll_ctl(c->epollfd, EPOLL_CTL_DEL, conn->timer, NULL); return 0; } From 0349cf637f64a5128846c79d9537849e1ed3e1cc Mon Sep 17 00:00:00 2001 From: David Gibson <david@gibson.dropbear.id.au> Date: Thu, 30 Jan 2025 17:52:11 +1100 Subject: [PATCH 192/382] util: Rename and make global vu_remove_watch() vu_remove_watch() is used in vhost_user.c to remove an fd from the global epoll set. There's nothing really vhost user specific about it though, so rename, move to util.c and use it in a bunch of places outside vhost_user.c where it makes things marginally more readable. 
Signed-off-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- icmp.c | 2 +- tap.c | 2 +- tcp.c | 4 ++-- tcp_splice.c | 4 ++-- udp_flow.c | 2 +- util.c | 10 ++++++++++ util.h | 1 + vhost_user.c | 21 +++++---------------- vu_common.c | 6 ++---- 9 files changed, 25 insertions(+), 27 deletions(-) diff --git a/icmp.c b/icmp.c index 143e93b..bcf498d 100644 --- a/icmp.c +++ b/icmp.c @@ -150,7 +150,7 @@ unexpected: static void icmp_ping_close(const struct ctx *c, const struct icmp_ping_flow *pingf) { - epoll_ctl(c->epollfd, EPOLL_CTL_DEL, pingf->sock, NULL); + epoll_del(c, pingf->sock); close(pingf->sock); flow_hash_remove(c, FLOW_SIDX(pingf, INISIDE)); } diff --git a/tap.c b/tap.c index cd32a90..772648f 100644 --- a/tap.c +++ b/tap.c @@ -1005,7 +1005,7 @@ void tap_sock_reset(struct ctx *c) exit(EXIT_SUCCESS); /* Close the connected socket, wait for a new connection */ - epoll_ctl(c->epollfd, EPOLL_CTL_DEL, c->fd_tap, NULL); + epoll_del(c, c->fd_tap); close(c->fd_tap); c->fd_tap = -1; if (c->mode == MODE_VU) diff --git a/tcp.c b/tcp.c index 4eed82b..7787381 100644 --- a/tcp.c +++ b/tcp.c @@ -468,9 +468,9 @@ static int tcp_epoll_ctl(const struct ctx *c, struct tcp_tap_conn *conn) if (conn->events == CLOSED) { if (conn->in_epoll) - epoll_ctl(c->epollfd, EPOLL_CTL_DEL, conn->sock, NULL); + epoll_del(c, conn->sock); if (conn->timer != -1) - epoll_ctl(c->epollfd, EPOLL_CTL_DEL, conn->timer, NULL); + epoll_del(c, conn->timer); return 0; } diff --git a/tcp_splice.c b/tcp_splice.c index 3a000ff..5db1d62 100644 --- a/tcp_splice.c +++ b/tcp_splice.c @@ -200,8 +200,8 @@ static void conn_flag_do(const struct ctx *c, struct tcp_splice_conn *conn, } if (flag == CLOSING) { - epoll_ctl(c->epollfd, EPOLL_CTL_DEL, conn->s[0], NULL); - epoll_ctl(c->epollfd, EPOLL_CTL_DEL, conn->s[1], NULL); + epoll_del(c, conn->s[0]); + epoll_del(c, conn->s[1]); } } diff --git a/udp_flow.c b/udp_flow.c index 9fd7d06..7fae81d 100644 --- a/udp_flow.c +++ 
b/udp_flow.c @@ -52,7 +52,7 @@ void udp_flow_close(const struct ctx *c, struct udp_flow *uflow) if (uflow->s[TGTSIDE] >= 0) { /* But the flow specific one needs to be removed */ - epoll_ctl(c->epollfd, EPOLL_CTL_DEL, uflow->s[TGTSIDE], NULL); + epoll_del(c, uflow->s[TGTSIDE]); close(uflow->s[TGTSIDE]); uflow->s[TGTSIDE] = -1; } diff --git a/util.c b/util.c index 11973c4..c7b09f0 100644 --- a/util.c +++ b/util.c @@ -837,3 +837,13 @@ void raw_random(void *buf, size_t buflen) if (random_read < buflen) die("Unexpected EOF on random data source"); } + +/** + * epoll_del() - Remove a file descriptor from our passt epoll + * @c: Execution context + * @fd: File descriptor to remove + */ +void epoll_del(const struct ctx *c, int fd) +{ + epoll_ctl(c->epollfd, EPOLL_CTL_DEL, fd, NULL); +} diff --git a/util.h b/util.h index d02333d..800a28b 100644 --- a/util.h +++ b/util.h @@ -276,6 +276,7 @@ static inline bool mod_between(unsigned x, unsigned i, unsigned j, unsigned m) #define FPRINTF(f, ...) (void)fprintf(f, __VA_ARGS__) void raw_random(void *buf, size_t buflen); +void epoll_del(const struct ctx *c, int fd); /* * Starting from glibc 2.40.9000 and commit 25a5eb4010df ("string: strerror, diff --git a/vhost_user.c b/vhost_user.c index 6bf0dda..bbbf504 100644 --- a/vhost_user.c +++ b/vhost_user.c @@ -162,17 +162,6 @@ static void vmsg_close_fds(const struct vhost_user_msg *vmsg) close(vmsg->fds[i]); } -/** - * vu_remove_watch() - Remove a file descriptor from our passt epoll - * file descriptor - * @vdev: vhost-user device - * @fd: file descriptor to remove - */ -static void vu_remove_watch(const struct vu_dev *vdev, int fd) -{ - epoll_ctl(vdev->context->epollfd, EPOLL_CTL_DEL, fd, NULL); -} - /** * vmsg_set_reply_u64() - Set reply payload.u64 and clear request flags * and fd_num @@ -748,7 +737,7 @@ static bool vu_get_vring_base_exec(struct vu_dev *vdev, vdev->vq[idx].call_fd = -1; } if (vdev->vq[idx].kick_fd != -1) { - vu_remove_watch(vdev, vdev->vq[idx].kick_fd); + 
epoll_del(vdev->context, vdev->vq[idx].kick_fd); close(vdev->vq[idx].kick_fd); vdev->vq[idx].kick_fd = -1; } @@ -816,7 +805,7 @@ static bool vu_set_vring_kick_exec(struct vu_dev *vdev, vu_check_queue_msg_file(msg); if (vdev->vq[idx].kick_fd != -1) { - vu_remove_watch(vdev, vdev->vq[idx].kick_fd); + epoll_del(vdev->context, vdev->vq[idx].kick_fd); close(vdev->vq[idx].kick_fd); vdev->vq[idx].kick_fd = -1; } @@ -1063,7 +1052,7 @@ static bool vu_set_device_state_fd_exec(struct vu_dev *vdev, die("Invalide device_state_fd direction: %d", direction); if (vdev->device_state_fd != -1) { - vu_remove_watch(vdev, vdev->device_state_fd); + epoll_del(vdev->context, vdev->device_state_fd); close(vdev->device_state_fd); } @@ -1145,7 +1134,7 @@ void vu_cleanup(struct vu_dev *vdev) vq->err_fd = -1; } if (vq->kick_fd != -1) { - vu_remove_watch(vdev, vq->kick_fd); + epoll_del(vdev->context, vq->kick_fd); close(vq->kick_fd); vq->kick_fd = -1; } @@ -1169,7 +1158,7 @@ void vu_cleanup(struct vu_dev *vdev) vu_close_log(vdev); if (vdev->device_state_fd != -1) { - vu_remove_watch(vdev, vdev->device_state_fd); + epoll_del(vdev->context, vdev->device_state_fd); close(vdev->device_state_fd); vdev->device_state_fd = -1; vdev->device_state_result = -1; diff --git a/vu_common.c b/vu_common.c index f43d8ac..2c12dca 100644 --- a/vu_common.c +++ b/vu_common.c @@ -325,8 +325,7 @@ void vu_migrate(struct vu_dev *vdev, uint32_t events) /* value to be returned by VHOST_USER_CHECK_DEVICE_STATE */ vdev->device_state_result = ret == -1 ? -1 : 0; /* Closing the file descriptor signals the end of transfer */ - epoll_ctl(vdev->context->epollfd, EPOLL_CTL_DEL, - vdev->device_state_fd, NULL); + epoll_del(vdev->context, vdev->device_state_fd); close(vdev->device_state_fd); vdev->device_state_fd = -1; } else if (events & EPOLLIN) { @@ -346,8 +345,7 @@ void vu_migrate(struct vu_dev *vdev, uint32_t events) debug("Closing migration channel"); /* The end of file signals the end of the transfer. 
*/ - epoll_ctl(vdev->context->epollfd, - vdev->device_state_fd, NULL); + epoll_del(vdev->context, vdev->device_state_fd); close(vdev->device_state_fd); vdev->device_state_fd = -1; } From dcd6d8191aa29f232593ad2819a197e135f8cac8 Mon Sep 17 00:00:00 2001 From: Stefano Brivio <sbrivio@redhat.com> Date: Fri, 31 Jan 2025 19:13:00 +0100 Subject: [PATCH 193/382] tcp: Add HOSTSIDE(x), HOSTFLOW(x) macros Those are symmetric to TAPSIDE(x)/TAPFLOW(x) and I'll use them in the next patch to extract 'oport' in order to re-bind sockets to the original socket-side local port. Signed-off-by: Stefano Brivio <sbrivio@redhat.com> Reviewed-by: David Gibson <david@gibson.dropbear.id.au> --- tcp_internal.h | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/tcp_internal.h b/tcp_internal.h index 94e5780..9cf31f5 100644 --- a/tcp_internal.h +++ b/tcp_internal.h @@ -38,9 +38,13 @@ #define OPT_SACK 5 #define OPT_TS 8 -#define TAPSIDE(conn_) ((conn_)->f.pif[1] == PIF_TAP) -#define TAPFLOW(conn_) (&((conn_)->f.side[TAPSIDE(conn_)])) -#define TAP_SIDX(conn_) (FLOW_SIDX((conn_), TAPSIDE(conn_))) +#define TAPSIDE(conn_) ((conn_)->f.pif[1] == PIF_TAP) +#define TAPFLOW(conn_) (&((conn_)->f.side[TAPSIDE(conn_)])) +#define TAP_SIDX(conn_) (FLOW_SIDX((conn_), TAPSIDE(conn_))) + +#define HOSTSIDE(conn_) ((conn_)->f.pif[1] == PIF_HOST) +#define HOSTFLOW(conn_) (&((conn_)->f.side[HOSTSIDE(conn_)])) +#define HOST_SIDX(conn_) (FLOW_SIDX((conn_), HOSTSIDE(conn_))) #define CONN_V4(conn) (!!inany_v4(&TAPFLOW(conn)->oaddr)) #define CONN_V6(conn) (!CONN_V4(conn)) From bf2860819d868c7d116923e9b5d798d410d38715 Mon Sep 17 00:00:00 2001 From: 7ppKb5bW <pONy4THS@protonmail.com> Date: Sun, 2 Feb 2025 19:21:21 +0000 Subject: [PATCH 194/382] pasta.te: fix demo.sh and remove one duplicate rule On Fedora 41, without "allow pasta_t unconfined_t:dir read" /usr/bin/pasta can't open /proc/[pid]/ns, which is required by pasta_netns_quit_init().
This patch also removes one duplicate rule "allow pasta_t nsfs_t:file read;", "allow pasta_t nsfs_t:file { open read };" at line 123 is enough. Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- contrib/selinux/pasta.te | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/contrib/selinux/pasta.te b/contrib/selinux/pasta.te index 69be081..d0ff0cc 100644 --- a/contrib/selinux/pasta.te +++ b/contrib/selinux/pasta.te @@ -171,7 +171,7 @@ allow pasta_t init_t:lnk_file read; allow pasta_t init_t:unix_stream_socket connectto; allow pasta_t init_t:dbus send_msg; allow pasta_t init_t:system status; -allow pasta_t unconfined_t:dir search; +allow pasta_t unconfined_t:dir { read search }; allow pasta_t unconfined_t:file read; allow pasta_t unconfined_t:lnk_file read; allow pasta_t self:process { setpgid setcap }; @@ -192,8 +192,6 @@ allow pasta_t sysctl_net_t:dir search; allow pasta_t sysctl_net_t:file { open read write }; allow pasta_t kernel_t:system module_request; -allow pasta_t nsfs_t:file read; - allow pasta_t proc_t:dir mounton; allow pasta_t proc_t:filesystem mount; allow pasta_t net_conf_t:lnk_file read; From 722d347c1932f630a53ba05ea0270a651ed601b2 Mon Sep 17 00:00:00 2001 From: Stefano Brivio <sbrivio@redhat.com> Date: Mon, 3 Feb 2025 08:19:16 +0100 Subject: [PATCH 195/382] tcp: Don't reset outbound connection on SYN retries Reported by somebody on IRC: if the server has considerable latency, it might happen that the client retries sending SYN segments for the same flow while we're still in a TAP_SYN_RCVD, non-ESTABLISHED state. In that case, we should go with the blanket assumption that we need to reset the connection on any unexpected segment: RFC 9293 explicitly mentions this case in Figure 8: Recovery from Old Duplicate SYN, section 3.5. It doesn't make sense for us to set a specific sequence number, socket-side, but we should definitely wait and see. Ignoring the duplicate SYN segment should also be compatible with section 3.10.7.3. 
SYN-SENT STATE, which mentions updating sequences socket-side (which we can't do anyway), but certainly not reset the connection. Signed-off-by: Stefano Brivio <sbrivio@redhat.com> Reviewed-by: David Gibson <david@gibson.dropbear.id.au> --- tcp.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tcp.c b/tcp.c index 7787381..51ad692 100644 --- a/tcp.c +++ b/tcp.c @@ -1920,6 +1920,9 @@ int tcp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af, /* Establishing connection from tap */ if (conn->events & TAP_SYN_RCVD) { + if (th->syn && !th->ack && !th->fin) + return 1; /* SYN retry: ignore and keep waiting */ + if (!(conn->events & TAP_SYN_ACK_SENT)) goto reset; From b75ad159e8a13a10ce1fb4b86503636420da126d Mon Sep 17 00:00:00 2001 From: Stefano Brivio <sbrivio@redhat.com> Date: Sun, 2 Feb 2025 20:49:58 +0100 Subject: [PATCH 196/382] vhost_user: On 32-bit ARM, mmap() is not available, mmap2() is used instead Link: https://buildd.debian.org/status/fetch.php?pkg=passt&arch=armel&ver=0.0%7Egit20250121.4f2c8e7-1&stamp=1737477467&raw=0 Link: https://buildd.debian.org/status/fetch.php?pkg=passt&arch=armhf&ver=0.0%7Egit20250121.4f2c8e7-1&stamp=1737477421&raw=0 Fixes: 31117b27c6c9 ("vhost-user: introduce vhost-user API") Signed-off-by: Stefano Brivio <sbrivio@redhat.com> Reviewed-by: David Gibson <david@gibson.dropbear.id.au> --- vhost_user.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vhost_user.c b/vhost_user.c index bbbf504..58baee2 100644 --- a/vhost_user.c +++ b/vhost_user.c @@ -419,7 +419,7 @@ static bool map_ring(struct vu_dev *vdev, struct vu_virtq *vq) * * Return: False as no reply is requested * - * #syscalls:vu mmap munmap + * #syscalls:vu mmap|mmap2 munmap */ static bool vu_set_mem_table_exec(struct vu_dev *vdev, struct vhost_user_msg *msg) From 71fa7362776bfa075d83383b600d2beeab923893 Mon Sep 17 00:00:00 2001 From: Stefano Brivio <sbrivio@redhat.com> Date: Sun, 2 Feb 2025 20:53:47 +0100 Subject: [PATCH 197/382] tcp_splice, 
udp_flow: fcntl64() support on PPC64 depends on glibc version I explicitly added fcntl64() to the list of allowed system calls for PPC64 a while ago, and now it turns out it's not available in recent Debian builds. The warning from seccomp.sh is harmless because we unconditionally try to enable fcntl() anyway, but take care of it anyway. Link: https://buildd.debian.org/status/fetch.php?pkg=passt&arch=ppc64&ver=0.0%7Egit20250121.4f2c8e7-1&stamp=1737477147&raw=0 Signed-off-by: Stefano Brivio <sbrivio@redhat.com> Reviewed-by: David Gibson <david@gibson.dropbear.id.au> --- tcp_splice.c | 2 +- udp_flow.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tcp_splice.c b/tcp_splice.c index 5db1d62..f048a82 100644 --- a/tcp_splice.c +++ b/tcp_splice.c @@ -28,7 +28,7 @@ * - FIN_SENT_0: FIN (write shutdown) sent to accepted socket * - FIN_SENT_1: FIN (write shutdown) sent to target socket * - * #syscalls:pasta pipe2|pipe fcntl arm:fcntl64 ppc64:fcntl64 i686:fcntl64 + * #syscalls:pasta pipe2|pipe fcntl arm:fcntl64 ppc64:fcntl64|fcntl i686:fcntl64 */ #include <sched.h> diff --git a/udp_flow.c b/udp_flow.c index 7fae81d..83c2568 100644 --- a/udp_flow.c +++ b/udp_flow.c @@ -174,7 +174,7 @@ cancel: * @s_in: Source socket address, filled in by recvmmsg() * @now: Timestamp * - * #syscalls fcntl arm:fcntl64 ppc64:fcntl64 i686:fcntl64 + * #syscalls fcntl arm:fcntl64 ppc64:fcntl64|fcntl i686:fcntl64 * * Return: sidx for the destination side of the flow for this packet, or * FLOW_SIDX_NONE if we couldn't find or create a flow. 
From e25a93032f8c09f1e0bfbc32e81431dd995f9605 Mon Sep 17 00:00:00 2001 From: Stefano Brivio <sbrivio@redhat.com> Date: Sun, 26 Jan 2025 09:05:03 +0100 Subject: [PATCH 198/382] util: Add read_remainder() and read_all_buf() These are symmetric to write_remainder() and write_all_buf() and almost a copy and paste of them, with the most notable differences being reversed reads/writes and a couple of better-safe-than-sorry asserts to keep Coverity happy. I'll use them in the next patch. At least for the moment, they're going to be used for vhost-user mode only, so I'm not unconditionally enabling readv() in the seccomp profile: the caller has to ensure it's there. [dgibson: make read_remainder() take const pointer to iovec] Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- util.c | 84 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ util.h | 2 ++ 2 files changed, 86 insertions(+) diff --git a/util.c b/util.c index c7b09f0..800c6b5 100644 --- a/util.c +++ b/util.c @@ -606,6 +606,90 @@ int write_remainder(int fd, const struct iovec *iov, size_t iovcnt, size_t skip) return 0; } +/** + * read_all_buf() - Fill a whole buffer from a file descriptor + * @fd: File descriptor + * @buf: Pointer to base of buffer + * @len: Length of buffer + * + * Return: 0 on success, -1 on error (with errno set) + * + * #syscalls read + */ +int read_all_buf(int fd, void *buf, size_t len) +{ + size_t left = len; + char *p = buf; + + while (left) { + ssize_t rc; + + ASSERT(left <= len); + + do + rc = read(fd, p, left); + while ((rc < 0) && errno == EINTR); + + if (rc < 0) + return -1; + + if (rc == 0) { + errno = ENODATA; + return -1; + } + + p += rc; + left -= rc; + } + return 0; +} + +/** + * read_remainder() - Read the tail of an IO vector from a file descriptor + * @fd: File descriptor + * @iov: IO vector + * @cnt: Number of entries in @iov + * @skip: Number of bytes of the vector to skip reading + * + * Return: 0 on success, -1 on error (with errno set) + * + * Note: 
mode-specific seccomp profiles need to enable readv() to use this. + */ +/* cppcheck-suppress unusedFunction */ +int read_remainder(int fd, const struct iovec *iov, size_t cnt, size_t skip) +{ + size_t i = 0, offset; + + while ((i += iov_skip_bytes(iov + i, cnt - i, skip, &offset)) < cnt) { + ssize_t rc; + + if (offset) { + ASSERT(offset < iov[i].iov_len); + /* Read the remainder of the partially read buffer */ + if (read_all_buf(fd, (char *)iov[i].iov_base + offset, + iov[i].iov_len - offset) < 0) + return -1; + i++; + } + + if (cnt == i) + break; + + /* Fill as many of the remaining buffers as we can */ + rc = readv(fd, &iov[i], cnt - i); + if (rc < 0) + return -1; + + if (rc == 0) { + errno = ENODATA; + return -1; + } + + skip = rc; + } + return 0; +} + /** sockaddr_ntop() - Convert a socket address to text format * @sa: Socket address * @dst: output buffer, minimum SOCKADDR_STRLEN bytes diff --git a/util.h b/util.h index 800a28b..23b165c 100644 --- a/util.h +++ b/util.h @@ -203,6 +203,8 @@ int fls(unsigned long x); int write_file(const char *path, const char *buf); int write_all_buf(int fd, const void *buf, size_t len); int write_remainder(int fd, const struct iovec *iov, size_t iovcnt, size_t skip); +int read_all_buf(int fd, void *buf, size_t len); +int read_remainder(int fd, const struct iovec *iov, size_t cnt, size_t skip); void close_open_files(int argc, char **argv); bool snprintf_check(char *str, size_t size, const char *format, ...); From e894d9ae8212c49dc44e52ad583954ed24e6905b Mon Sep 17 00:00:00 2001 From: Stefano Brivio <sbrivio@redhat.com> Date: Fri, 31 Jan 2025 11:41:51 +0100 Subject: [PATCH 199/382] vhost_user: Turn some vhost-user message reports to trace() Having every vhost-user message printed as part of debug output makes debugging anything else a bit complicated. 
Change per-packet debug() messages in vu_kick_cb() and vu_send_single() to trace() [dgibson: switch different messages to trace()] Signed-off-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- vhost_user.c | 4 ++-- vu_common.c | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/vhost_user.c b/vhost_user.c index 58baee2..9e38cfd 100644 --- a/vhost_user.c +++ b/vhost_user.c @@ -640,8 +640,8 @@ static bool vu_set_vring_num_exec(struct vu_dev *vdev, unsigned int idx = msg->payload.state.index; unsigned int num = msg->payload.state.num; - debug("State.index: %u", idx); - debug("State.num: %u", num); + trace("State.index: %u", idx); + trace("State.num: %u", num); vdev->vq[idx].vring.num = num; return false; diff --git a/vu_common.c b/vu_common.c index 2c12dca..ab04d31 100644 --- a/vu_common.c +++ b/vu_common.c @@ -238,7 +238,7 @@ void vu_kick_cb(struct vu_dev *vdev, union epoll_ref ref, if (rc == -1) die_perror("vhost-user kick eventfd_read()"); - debug("vhost-user: got kick_data: %016"PRIx64" idx: %d", + trace("vhost-user: got kick_data: %016"PRIx64" idx: %d", kick_data, ref.queue); if (VHOST_USER_IS_QUEUE_TX(ref.queue)) vu_handle_tx(vdev, ref.queue, now); @@ -262,7 +262,7 @@ int vu_send_single(const struct ctx *c, const void *buf, size_t size) int elem_cnt; int i; - debug("vu_send_single size %zu", size); + trace("vu_send_single size %zu", size); if (!vu_queue_enabled(vq) || !vu_queue_started(vq)) { debug("Got packet, but RX virtqueue not usable yet"); @@ -294,7 +294,7 @@ int vu_send_single(const struct ctx *c, const void *buf, size_t size) vu_flush(vdev, vq, elem, elem_cnt); - debug("vhost-user sent %zu", total); + trace("vhost-user sent %zu", total); return total; err: From 8c24301462c39027e6eb6f1ad56c1f6c83fb0c23 Mon Sep 17 00:00:00 2001 From: Stefano Brivio <sbrivio@redhat.com> Date: Tue, 28 Jan 2025 00:03:13 +0100 Subject: [PATCH 200/382] Introduce passt-repair A privileged helper to set/clear 
TCP_REPAIR on sockets on behalf of passt. Not used yet. Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- .gitignore | 1 + Makefile | 17 ++- contrib/apparmor/usr.bin.passt-repair | 29 +++++ contrib/fedora/passt.spec | 2 + contrib/selinux/passt-repair.fc | 11 ++ contrib/selinux/passt-repair.te | 58 ++++++++++ hooks/pre-push | 1 + passt-repair.1 | 70 ++++++++++++ passt-repair.c | 154 ++++++++++++++++++++++++++ seccomp.sh | 6 +- 10 files changed, 342 insertions(+), 7 deletions(-) create mode 100644 contrib/apparmor/usr.bin.passt-repair create mode 100644 contrib/selinux/passt-repair.fc create mode 100644 contrib/selinux/passt-repair.te create mode 100644 passt-repair.1 create mode 100644 passt-repair.c diff --git a/.gitignore b/.gitignore index d1c8be9..5824a71 100644 --- a/.gitignore +++ b/.gitignore @@ -3,6 +3,7 @@ /passt.avx2 /pasta /pasta.avx2 +/passt-repair /qrap /pasta.1 /seccomp.h diff --git a/Makefile b/Makefile index 464eef1..6ab8d24 100644 --- a/Makefile +++ b/Makefile @@ -42,9 +42,10 @@ PASST_SRCS = arch.c arp.c checksum.c conf.c dhcp.c dhcpv6.c flow.c fwd.c \ tcp_buf.c tcp_splice.c tcp_vu.c udp.c udp_flow.c udp_vu.c util.c \ vhost_user.c virtio.c vu_common.c QRAP_SRCS = qrap.c -SRCS = $(PASST_SRCS) $(QRAP_SRCS) +PASST_REPAIR_SRCS = passt-repair.c +SRCS = $(PASST_SRCS) $(QRAP_SRCS) $(PASST_REPAIR_SRCS) -MANPAGES = passt.1 pasta.1 qrap.1 +MANPAGES = passt.1 pasta.1 qrap.1 passt-repair.1 PASST_HEADERS = arch.h arp.h checksum.h conf.h dhcp.h dhcpv6.h flow.h fwd.h \ flow_table.h icmp.h icmp_flow.h inany.h iov.h ip.h isolation.h \ @@ -72,9 +73,9 @@ mandir ?= $(datarootdir)/man man1dir ?= $(mandir)/man1 ifeq ($(TARGET_ARCH),x86_64) -BIN := passt passt.avx2 pasta pasta.avx2 qrap +BIN := passt passt.avx2 pasta pasta.avx2 qrap passt-repair else -BIN := passt pasta qrap +BIN := passt pasta qrap passt-repair endif all: $(BIN) $(MANPAGES) docs @@ -83,7 +84,10 @@ static: FLAGS += -static -DGLIBC_NO_STATIC_NSS static: clean all seccomp.h: seccomp.sh $(PASST_SRCS) 
$(PASST_HEADERS) - @ EXTRA_SYSCALLS="$(EXTRA_SYSCALLS)" ARCH="$(TARGET_ARCH)" CC="$(CC)" ./seccomp.sh $(PASST_SRCS) $(PASST_HEADERS) + @ EXTRA_SYSCALLS="$(EXTRA_SYSCALLS)" ARCH="$(TARGET_ARCH)" CC="$(CC)" ./seccomp.sh seccomp.h $(PASST_SRCS) $(PASST_HEADERS) + +seccomp_repair.h: seccomp.sh $(PASST_REPAIR_SRCS) + @ ARCH="$(TARGET_ARCH)" CC="$(CC)" ./seccomp.sh seccomp_repair.h $(PASST_REPAIR_SRCS) passt: $(PASST_SRCS) $(HEADERS) $(CC) $(FLAGS) $(CFLAGS) $(CPPFLAGS) $(PASST_SRCS) -o passt $(LDFLAGS) @@ -101,6 +105,9 @@ pasta.avx2 pasta.1 pasta: pasta%: passt% qrap: $(QRAP_SRCS) passt.h $(CC) $(FLAGS) $(CFLAGS) $(CPPFLAGS) -DARCH=\"$(TARGET_ARCH)\" $(QRAP_SRCS) -o qrap $(LDFLAGS) +passt-repair: $(PASST_REPAIR_SRCS) seccomp_repair.h + $(CC) $(FLAGS) $(CFLAGS) $(CPPFLAGS) $(PASST_REPAIR_SRCS) -o passt-repair $(LDFLAGS) + valgrind: EXTRA_SYSCALLS += rt_sigprocmask rt_sigtimedwait rt_sigaction \ rt_sigreturn getpid gettid kill clock_gettime mmap \ mmap2 munmap open unlink gettimeofday futex statx \ diff --git a/contrib/apparmor/usr.bin.passt-repair b/contrib/apparmor/usr.bin.passt-repair new file mode 100644 index 0000000..901189d --- /dev/null +++ b/contrib/apparmor/usr.bin.passt-repair @@ -0,0 +1,29 @@ +# SPDX-License-Identifier: GPL-2.0-or-later +# +# PASST - Plug A Simple Socket Transport +# for qemu/UNIX domain socket mode +# +# PASTA - Pack A Subtle Tap Abstraction +# for network namespace/tap device mode +# +# contrib/apparmor/usr.bin.passt-repair - AppArmor profile for passt-repair(1) +# +# Copyright (c) 2025 Red Hat GmbH +# Author: Stefano Brivio <sbrivio@redhat.com> + +abi <abi/3.0>, + +#include <tunables/global> + +profile passt-repair /usr/bin/passt-repair { + #include <abstractions/base> + /** rw, # passt's ".repair" socket might be anywhere + unix (connect, receive, send) type=stream, + + capability dac_override, # connect to passt's socket as root + capability net_admin, # currently needed for TCP_REPAIR socket option + capability net_raw, # what TCP_REPAIR 
should require instead + + network unix stream, # connect and use UNIX domain socket + network inet stream, # use TCP sockets +} diff --git a/contrib/fedora/passt.spec b/contrib/fedora/passt.spec index 7950fb9..6a83f8b 100644 --- a/contrib/fedora/passt.spec +++ b/contrib/fedora/passt.spec @@ -108,9 +108,11 @@ fi %{_bindir}/passt %{_bindir}/pasta %{_bindir}/qrap +%{_bindir}/passt-repair %{_mandir}/man1/passt.1* %{_mandir}/man1/pasta.1* %{_mandir}/man1/qrap.1* +%{_mandir}/man1/passt-repair.1* %ifarch x86_64 %{_bindir}/passt.avx2 %{_mandir}/man1/passt.avx2.1* diff --git a/contrib/selinux/passt-repair.fc b/contrib/selinux/passt-repair.fc new file mode 100644 index 0000000..bcd526e --- /dev/null +++ b/contrib/selinux/passt-repair.fc @@ -0,0 +1,11 @@ +# SPDX-License-Identifier: GPL-2.0-or-later +# +# PASST - Plug A Simple Socket Transport +# for qemu/UNIX domain socket mode +# +# contrib/selinux/passt-repair.fc - SELinux: File Context for passt-repair +# +# Copyright (c) 2025 Red Hat GmbH +# Author: Stefano Brivio <sbrivio@redhat.com> + +/usr/bin/passt-repair system_u:object_r:passt_repair_exec_t:s0 diff --git a/contrib/selinux/passt-repair.te b/contrib/selinux/passt-repair.te new file mode 100644 index 0000000..e3ffbcd --- /dev/null +++ b/contrib/selinux/passt-repair.te @@ -0,0 +1,58 @@ +# SPDX-License-Identifier: GPL-2.0-or-later +# +# PASST - Plug A Simple Socket Transport +# for qemu/UNIX domain socket mode +# +# contrib/selinux/passt-repair.te - SELinux: Type Enforcement for passt-repair +# +# Copyright (c) 2025 Red Hat GmbH +# Author: Stefano Brivio <sbrivio@redhat.com> + +policy_module(passt-repair, 0.1) + +require { + type unconfined_t; + type passt_t; + role unconfined_r; + class process transition; + + class file { read execute execute_no_trans entrypoint open map }; + class capability { dac_override net_admin net_raw }; + class chr_file { append open getattr read write ioctl }; + + class unix_stream_socket { create connect sendto }; + class sock_file { read 
write }; + + class tcp_socket { read setopt write }; + + type console_device_t; + type user_devpts_t; + type user_tmp_t; +} + +type passt_repair_t; +domain_type(passt_repair_t); +type passt_repair_exec_t; +files_type(passt_repair_exec_t); + +role unconfined_r types passt_repair_t; + +allow passt_repair_t passt_repair_exec_t:file { read execute execute_no_trans entrypoint open map }; +type_transition unconfined_t passt_repair_exec_t:process passt_repair_t; +allow unconfined_t passt_repair_t:process transition; + +allow passt_repair_t self:capability { dac_override net_admin net_raw }; + +allow passt_repair_t console_device_t:chr_file { append open getattr read write ioctl }; +allow passt_repair_t user_devpts_t:chr_file { append open getattr read write ioctl }; + +allow passt_repair_t unconfined_t:unix_stream_socket { connectto read write }; +allow passt_repair_t passt_t:unix_stream_socket { connectto read write }; +allow passt_repair_t user_tmp_t:unix_stream_socket { connectto read write }; + +allow passt_repair_t unconfined_t:sock_file { read write }; +allow passt_repair_t passt_t:sock_file { read write }; +allow passt_repair_t user_tmp_t:sock_file { read write }; + +allow passt_repair_t unconfined_t:tcp_socket { read setopt write }; +allow passt_repair_t passt_t:tcp_socket { read setopt write }; diff --git a/hooks/pre-push b/hooks/pre-push index 33a2052..8dbfa5f 100755 --- a/hooks/pre-push +++ b/hooks/pre-push @@ -56,6 +56,7 @@ cd .. 
make pkgs scp passt passt.avx2 passt.1 qrap qrap.1 "${USER_HOST}:${BIN}" scp pasta pasta.avx2 pasta.1 "${USER_HOST}:${BIN}" +scp passt-repair passt-repair.1 "${USER_HOST}:${BIN}" ssh "${USER_HOST}" "rm -f ${BIN}/*.deb" ssh "${USER_HOST}" "rm -f ${BIN}/*.rpm" diff --git a/passt-repair.1 b/passt-repair.1 new file mode 100644 index 0000000..8d07c97 --- /dev/null +++ b/passt-repair.1 @@ -0,0 +1,70 @@ +.\" SPDX-License-Identifier: GPL-2.0-or-later +.\" Copyright (c) 2025 Red Hat GmbH +.\" Author: Stefano Brivio <sbrivio@redhat.com> +.TH passt-repair 1 + +.SH NAME +.B passt-repair +\- Helper setting TCP_REPAIR socket options for \fBpasst\fR(1) + +.SH SYNOPSIS +.B passt-repair +\fIPATH\fR + +.SH DESCRIPTION + +.B passt-repair +is a privileged helper setting and clearing repair mode on TCP sockets on behalf +of \fBpasst\fR(1), as instructed via single-byte commands over a UNIX domain +socket, specified by \fIPATH\fR. + +It can be used to migrate TCP connections between guests without granting +additional capabilities to \fBpasst\fR(1) itself: to migrate TCP connections, +\fBpasst\fR(1) leverages repair mode, which needs the \fBCAP_NET_ADMIN\fR +capability (see \fBcapabilities\fR(7)) to be set or cleared. + +.SH PROTOCOL + +\fBpasst-repair\fR(1) connects to \fBpasst\fR(1) using the socket specified via +\fI--repair-path\fR option in \fBpasst\fR(1) itself. By default, the name is the +same as the UNIX domain socket used for guest communication, suffixed by +\fI.repair\fR. + +The messages consist of one 8-bit signed integer that can be \fITCP_REPAIR_ON\fR +(1), \fITCP_REPAIR_OFF\fR (0), or \fITCP_REPAIR_OFF_NO_WP\fR (-1), as defined by +the Linux kernel user API, and one to SCM_MAX_FD (253) sockets as SCM_RIGHTS +(see \fBunix\fR(7)) ancillary message, sent by the server, \fBpasst\fR(1). + +The client, \fBpasst-repair\fR(1), replies with the same byte (and no ancillary +message) to indicate success, and closes the connection on failure. 
+ +The server closes the connection on error or completion. + +.SH NOTES + +\fBpasst-repair\fR(1) can be granted the \fBCAP_NET_ADMIN\fR capability +(preferred, as it limits privileges to the strictly necessary ones), or it can +be run as root. + +.SH AUTHOR + +Stefano Brivio <sbrivio@redhat.com>. + +.SH REPORTING BUGS + +Please report issues on the bug tracker at https://bugs.passt.top/, or +send a message to the passt-user@passt.top mailing list, see +https://lists.passt.top/. + +.SH COPYRIGHT + +Copyright (c) 2025 Red Hat GmbH. + +\fBpasst-repair\fR is free software: you can redistribute them and/or modify +them under the terms of the GNU General Public License as published by the Free +Software Foundation, either version 2 of the License, or (at your option) any +later version. + +.SH SEE ALSO + +\fBpasst\fR(1), \fBqemu\fR(1), \fBcapabilities\fR(7), \fBunix\fR(7). diff --git a/passt-repair.c b/passt-repair.c new file mode 100644 index 0000000..767a821 --- /dev/null +++ b/passt-repair.c @@ -0,0 +1,154 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +/* PASST - Plug A Simple Socket Transport + * for qemu/UNIX domain socket mode + * + * PASTA - Pack A Subtle Tap Abstraction + * for network namespace/tap device mode + * + * passt-repair.c - Privileged helper to set/clear TCP_REPAIR on sockets + * + * Copyright (c) 2025 Red Hat GmbH + * Author: Stefano Brivio <sbrivio@redhat.com> + * + * Connect to passt via UNIX domain socket, receive sockets via SCM_RIGHTS along + * with byte commands mapping to TCP_REPAIR values, and switch repair mode on or + * off. Reply by echoing the command. Exit on EOF. 
+ */ + +#include <sys/prctl.h> +#include <sys/types.h> +#include <sys/socket.h> +#include <sys/un.h> +#include <errno.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <limits.h> +#include <unistd.h> +#include <netdb.h> + +#include <netinet/tcp.h> + +#include <linux/audit.h> +#include <linux/capability.h> +#include <linux/filter.h> +#include <linux/seccomp.h> + +#include "seccomp_repair.h" + +#define SCM_MAX_FD 253 /* From Linux kernel (include/net/scm.h), not in UAPI */ + +/** + * main() - Entry point and whole program with loop + * @argc: Argument count, must be 2 + * @argv: Argument: path of UNIX domain socket to connect to + * + * Return: 0 on success (EOF), 1 on error, 2 on usage error + * + * #syscalls:repair connect setsockopt write exit_group + * #syscalls:repair socket s390x:socketcall i686:socketcall + * #syscalls:repair recvfrom recvmsg arm:recv ppc64le:recv + * #syscalls:repair sendto sendmsg arm:send ppc64le:send + */ +int main(int argc, char **argv) +{ + char buf[CMSG_SPACE(sizeof(int) * SCM_MAX_FD)] + __attribute__ ((aligned(__alignof__(struct cmsghdr)))); + struct sockaddr_un a = { AF_UNIX, "" }; + int fds[SCM_MAX_FD], s, ret, i, n; + struct sock_fprog prog; + int8_t cmd = INT8_MAX; + struct cmsghdr *cmsg; + struct msghdr msg; + struct iovec iov; + + prctl(PR_SET_DUMPABLE, 0); + + prog.len = (unsigned short)sizeof(filter_repair) / + sizeof(filter_repair[0]); + prog.filter = filter_repair; + if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) || + prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog)) { + fprintf(stderr, "Failed to apply seccomp filter"); + return 1; + } + + iov = (struct iovec){ &cmd, sizeof(cmd) }; + msg = (struct msghdr){ NULL, 0, &iov, 1, buf, sizeof(buf), 0 }; + cmsg = CMSG_FIRSTHDR(&msg); + + if (argc != 2) { + fprintf(stderr, "Usage: %s PATH\n", argv[0]); + return 2; + } + + ret = snprintf(a.sun_path, sizeof(a.sun_path), "%s", argv[1]); + if (ret <= 0 || ret >= (int)sizeof(a.sun_path)) { + fprintf(stderr, "Invalid 
socket path: %s\n", argv[1]); + return 2; + } + + if ((s = socket(AF_UNIX, SOCK_STREAM, 0)) < 0) { + perror("Failed to create AF_UNIX socket"); + return 1; + } + + if (connect(s, (struct sockaddr *)&a, sizeof(a))) { + fprintf(stderr, "Failed to connect to %s: %s\n", argv[1], + strerror(errno)); + return 1; + } + +loop: + ret = recvmsg(s, &msg, 0); + if (ret < 0) { + perror("Failed to receive message"); + return 1; + } + + if (!ret) /* Done */ + return 0; + + if (!cmsg || + cmsg->cmsg_len < CMSG_LEN(sizeof(int)) || + cmsg->cmsg_len > CMSG_LEN(sizeof(int) * SCM_MAX_FD) || + cmsg->cmsg_type != SCM_RIGHTS) { + fprintf(stderr, "No/bad ancillary data from peer\n"); + return 1; + } + + n = cmsg->cmsg_len / CMSG_LEN(sizeof(int)); + memcpy(fds, CMSG_DATA(cmsg), sizeof(int) * n); + + if (cmd != TCP_REPAIR_ON && cmd != TCP_REPAIR_OFF && + cmd != TCP_REPAIR_OFF_NO_WP) { + fprintf(stderr, "Unsupported command 0x%04x\n", cmd); + return 1; + } + + for (i = 0; i < n; i++) { + int o = cmd; + + if (setsockopt(fds[i], SOL_TCP, TCP_REPAIR, &o, sizeof(o))) { + fprintf(stderr, + "Setting TCP_REPAIR to %i on socket %i: %s", o, + fds[i], strerror(errno)); + return 1; + } + + /* Close _our_ copy */ + close(fds[i]); + + /* Confirm setting by echoing the command back */ + if (send(s, &cmd, sizeof(cmd), 0) < 0) { + fprintf(stderr, "Reply to command %i: %s\n", + o, strerror(errno)); + return 1; + } + } + + goto loop; + + return 0; +} diff --git a/seccomp.sh b/seccomp.sh index 6499c58..4c521ae 100755 --- a/seccomp.sh +++ b/seccomp.sh @@ -14,8 +14,10 @@ # Author: Stefano Brivio <sbrivio@redhat.com> TMP="$(mktemp)" -IN="$@" OUT="$(mktemp)" +OUT_FINAL="${1}" +shift +IN="$@" [ -z "${ARCH}" ] && ARCH="$(uname -m)" [ -z "${CC}" ] && CC="cc" @@ -268,4 +270,4 @@ for __p in ${__profiles}; do gen_profile "${__p}" ${__calls} done -mv "${OUT}" seccomp.h +mv "${OUT}" "${OUT_FINAL}" From 52e57f9c9a6d8ae4153ac592d01d868b31c10171 Mon Sep 17 00:00:00 2001 From: Stefano Brivio <sbrivio@redhat.com> Date: Fri, 31 
Jan 2025 18:27:07 +0100 Subject: [PATCH 201/382] tcp: Get socket port and address using getsockname() when connecting from guest For migration only: we need to store 'oport', our socket-side port, as we establish a connection from the guest, so that we can bind the same oport as source port in the migration target. Similar for 'oaddr': this is needed in case the migration target has additional network interfaces, and we need to make sure our socket is bound to the equivalent interface as it was on the source. Use getsockname() to fetch them. Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- flow.c | 4 ++-- flow_table.h | 4 ++-- tcp.c | 28 +++++++++++++++++++++++++++- 3 files changed, 31 insertions(+), 5 deletions(-) diff --git a/flow.c b/flow.c index ee1221b..a6fe6d1 100644 --- a/flow.c +++ b/flow.c @@ -414,8 +414,8 @@ const struct flowside *flow_initiate_sa(union flow *flow, uint8_t pif, * * Return: pointer to the target flowside information */ -const struct flowside *flow_target(const struct ctx *c, union flow *flow, - uint8_t proto) +struct flowside *flow_target(const struct ctx *c, union flow *flow, + uint8_t proto) { char estr[INANY_ADDRSTRLEN], fstr[INANY_ADDRSTRLEN]; struct flow_common *f = &flow->f; diff --git a/flow_table.h b/flow_table.h index f15db53..eeb6f41 100644 --- a/flow_table.h +++ b/flow_table.h @@ -168,8 +168,8 @@ const struct flowside *flow_target_af(union flow *flow, uint8_t pif, sa_family_t af, const void *saddr, in_port_t sport, const void *daddr, in_port_t dport); -const struct flowside *flow_target(const struct ctx *c, union flow *flow, - uint8_t proto); +struct flowside *flow_target(const struct ctx *c, union flow *flow, + uint8_t proto); union flow *flow_set_type(union flow *flow, enum flow_type type); #define FLOW_SET_TYPE(flow_, t_, var_) (&flow_set_type((flow_), (t_))->var_) diff --git a/tcp.c b/tcp.c index 51ad692..fac322c 100644 --- a/tcp.c +++ b/tcp.c @@ -1415,6 +1415,8 @@ static void tcp_bind_outbound(const struct ctx *c, * 
@opts: Pointer to start of options * @optlen: Bytes in options: caller MUST ensure available length * @now: Current timestamp + * + * #syscalls:vu getsockname */ static void tcp_conn_from_tap(const struct ctx *c, sa_family_t af, const void *saddr, const void *daddr, @@ -1423,9 +1425,10 @@ static void tcp_conn_from_tap(const struct ctx *c, sa_family_t af, { in_port_t srcport = ntohs(th->source); in_port_t dstport = ntohs(th->dest); - const struct flowside *ini, *tgt; + const struct flowside *ini; struct tcp_tap_conn *conn; union sockaddr_inany sa; + struct flowside *tgt; union flow *flow; int s = -1, mss; uint64_t hash; @@ -1530,6 +1533,29 @@ static void tcp_conn_from_tap(const struct ctx *c, sa_family_t af, } tcp_epoll_ctl(c, conn); + + if (c->mode == MODE_VU) { /* To rebind to same oport after migration */ + if (af == AF_INET) { + struct sockaddr_in s_in; + + sl = sizeof(s_in); + if (!getsockname(s, (struct sockaddr *)&s_in, &sl)) { + /* NOLINTNEXTLINE(clang-analyzer-core.CallAndMessage) */ + tgt->oport = ntohs(s_in.sin_port); + tgt->oaddr = inany_from_v4(s_in.sin_addr); + } + } else { + struct sockaddr_in6 s_in6; + + sl = sizeof(s_in6); + if (!getsockname(s, (struct sockaddr *)&s_in6, &sl)) { + /* NOLINTNEXTLINE(clang-analyzer-core.CallAndMessage) */ + tgt->oport = ntohs(s_in6.sin6_port); + tgt->oaddr.a6 = s_in6.sin6_addr; + } + } + } + FLOW_ACTIVATE(conn); return; From dcf014be8876d5417b0eddb8b07152c6b2035485 Mon Sep 17 00:00:00 2001 From: Stefano Brivio <sbrivio@redhat.com> Date: Sun, 2 Feb 2025 10:38:46 +0100 Subject: [PATCH 202/382] doc: Add mock of migration source and target These test programs show the migration of a TCP connection using the passt-repair helper. 
Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- doc/migration/.gitignore | 2 + doc/migration/Makefile | 20 ++++++++ doc/migration/README | 51 ++++++++++++++++++++ doc/migration/source.c | 92 +++++++++++++++++++++++++++++++++++ doc/migration/target.c | 102 +++++++++++++++++++++++++++++++++++++++ 5 files changed, 267 insertions(+) create mode 100644 doc/migration/.gitignore create mode 100644 doc/migration/Makefile create mode 100644 doc/migration/README create mode 100644 doc/migration/source.c create mode 100644 doc/migration/target.c diff --git a/doc/migration/.gitignore b/doc/migration/.gitignore new file mode 100644 index 0000000..59cb765 --- /dev/null +++ b/doc/migration/.gitignore @@ -0,0 +1,2 @@ +/source +/target diff --git a/doc/migration/Makefile b/doc/migration/Makefile new file mode 100644 index 0000000..04f6891 --- /dev/null +++ b/doc/migration/Makefile @@ -0,0 +1,20 @@ +# SPDX-License-Identifier: GPL-2.0-or-later +# +# PASST - Plug A Simple Socket Transport +# for qemu/UNIX domain socket mode +# +# PASTA - Pack A Subtle Tap Abstraction +# for network namespace/tap device mode +# +# Copyright (c) 2025 Red Hat GmbH +# Author: Stefano Brivio <sbrivio@redhat.com> + +TARGETS = source target +CFLAGS = -Wall -Wextra -pedantic + +all: $(TARGETS) + +$(TARGETS): %: %.c + +clean: + rm -f $(TARGETS) diff --git a/doc/migration/README b/doc/migration/README new file mode 100644 index 0000000..375603b --- /dev/null +++ b/doc/migration/README @@ -0,0 +1,51 @@ +<!--- +SPDX-License-Identifier: GPL-2.0-or-later +Copyright (c) 2025 Red Hat GmbH +Author: Stefano Brivio <sbrivio@redhat.com> +--> + +Migration +========= + +These test programs show a migration of a TCP connection from one process to +another using the TCP_REPAIR socket option. 
+ +The two processes are a mock of the matching implementation in passt(1), and run +unprivileged, so they rely on the passt-repair helper to connect to them and set +or clear TCP_REPAIR on the connection socket, transferred to the helper using +SCM_RIGHTS. + +The passt-repair helper needs to have the CAP_NET_ADMIN capability, or run as +root. + +Example of usage +---------------- + +* Start the test server + + $ nc -l 9999 + +* Start the source side of the TCP client (mock of the source instance of passt) + + $ ./source 127.0.0.1 9999 9998 /tmp/repair.sock + +* The client sends a test string, and waits for a connection from passt-repair + + # passt-repair /tmp/repair.sock + +* The socket is now in repair mode, and `source` dumps sequences, then exits + + sending sequence: 3244673313 + receiving sequence: 2250449386 + +* Continue the connection on the target side, restarting from those sequences + + $ ./target 127.0.0.1 9999 9998 /tmp/repair.sock 3244673313 2250449386 + +* The target side now waits for a connection from passt-repair + + # passt-repair /tmp/repair.sock + +* The target side asks passt-repair to switch the socket to repair mode, sets up + the TCP sequences, then asks passt-repair to clear repair mode, and sends a + test string to the server diff --git a/doc/migration/source.c b/doc/migration/source.c new file mode 100644 index 0000000..d44ebf1 --- /dev/null +++ b/doc/migration/source.c @@ -0,0 +1,92 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +/* PASST - Plug A Simple Socket Transport + * for qemu/UNIX domain socket mode + * + * PASTA - Pack A Subtle Tap Abstraction + * for network namespace/tap device mode + * + * doc/migration/source.c - Mock of TCP migration source, use with passt-repair + * + * Copyright (c) 2025 Red Hat GmbH + * Author: Stefano Brivio <sbrivio@redhat.com> + */ + +#include <arpa/inet.h> +#include <sys/types.h> +#include <sys/socket.h> +#include <sys/un.h> +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> 
+#include <limits.h> +#include <unistd.h> +#include <netdb.h> +#include <netinet/tcp.h> + +int main(int argc, char **argv) +{ + struct sockaddr_in a = { AF_INET, htons(atoi(argv[3])), { 0 }, { 0 } }; + struct addrinfo hints = { 0, AF_UNSPEC, SOCK_STREAM, 0, 0, + NULL, NULL, NULL }; + struct sockaddr_un a_helper = { AF_UNIX, { 0 } }; + int seq, s, s_helper; + int8_t cmd; + struct iovec iov = { &cmd, sizeof(cmd) }; + char buf[CMSG_SPACE(sizeof(int))]; + struct msghdr msg = { NULL, 0, &iov, 1, buf, sizeof(buf), 0 }; + struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg); + socklen_t seqlen = sizeof(int); + struct addrinfo *r; + + (void)argc; + + if (argc != 5) { + fprintf(stderr, "%s DST_ADDR DST_PORT SRC_PORT HELPER_PATH\n", + argv[0]); + return -1; + } + + strcpy(a_helper.sun_path, argv[4]); + getaddrinfo(argv[1], argv[2], &hints, &r); + + /* Connect socket to server and send some data */ + s = socket(r->ai_family, SOCK_STREAM, IPPROTO_TCP); + setsockopt(s, SOL_SOCKET, SO_REUSEADDR, &((int){ 1 }), sizeof(int)); + bind(s, (struct sockaddr *)&a, sizeof(a)); + connect(s, r->ai_addr, r->ai_addrlen); + send(s, "before migration\n", sizeof("before migration\n"), 0); + + /* Wait for helper */ + s_helper = socket(AF_UNIX, SOCK_STREAM, 0); + unlink(a_helper.sun_path); + bind(s_helper, (struct sockaddr *)&a_helper, sizeof(a_helper)); + listen(s_helper, 1); + s_helper = accept(s_helper, NULL, NULL); + + /* Set up message for helper, with socket */ + cmsg->cmsg_level = SOL_SOCKET; + cmsg->cmsg_type = SCM_RIGHTS; + cmsg->cmsg_len = CMSG_LEN(sizeof(int)); + memcpy(CMSG_DATA(cmsg), &s, sizeof(s)); + + /* Send command to helper: turn repair mode on, wait for reply */ + cmd = TCP_REPAIR_ON; + sendmsg(s_helper, &msg, 0); + recv(s_helper, &((int8_t){ 0 }), 1, 0); + + /* Terminate helper */ + close(s_helper); + + /* Get sending sequence */ + seq = TCP_SEND_QUEUE; + setsockopt(s, SOL_TCP, TCP_REPAIR_QUEUE, &seq, sizeof(seq)); + getsockopt(s, SOL_TCP, TCP_QUEUE_SEQ, &seq, &seqlen); + 
fprintf(stdout, "%u ", seq); + + /* Get receiving sequence */ + seq = TCP_RECV_QUEUE; + setsockopt(s, SOL_TCP, TCP_REPAIR_QUEUE, &seq, sizeof(seq)); + getsockopt(s, SOL_TCP, TCP_QUEUE_SEQ, &seq, &seqlen); + fprintf(stdout, "%u\n", seq); +} diff --git a/doc/migration/target.c b/doc/migration/target.c new file mode 100644 index 0000000..f7d3108 --- /dev/null +++ b/doc/migration/target.c @@ -0,0 +1,102 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +/* PASST - Plug A Simple Socket Transport + * for qemu/UNIX domain socket mode + * + * PASTA - Pack A Subtle Tap Abstraction + * for network namespace/tap device mode + * + * doc/migration/target.c - Mock of TCP migration target, use with passt-repair + * + * Copyright (c) 2025 Red Hat GmbH + * Author: Stefano Brivio <sbrivio@redhat.com> + */ + +#include <arpa/inet.h> +#include <sys/types.h> +#include <sys/socket.h> +#include <sys/un.h> +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> +#include <limits.h> +#include <unistd.h> +#include <netdb.h> +#include <netinet/tcp.h> + +int main(int argc, char **argv) +{ + struct sockaddr_in a = { AF_INET, htons(atoi(argv[3])), { 0 }, { 0 } }; + struct addrinfo hints = { 0, AF_UNSPEC, SOCK_STREAM, 0, 0, + NULL, NULL, NULL }; + struct sockaddr_un a_helper = { AF_UNIX, { 0 } }; + int s, s_helper, seq; + int8_t cmd; + struct iovec iov = { &cmd, sizeof(cmd) }; + char buf[CMSG_SPACE(sizeof(int))]; + struct msghdr msg = { NULL, 0, &iov, 1, buf, sizeof(buf), 0 }; + struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg); + struct addrinfo *r; + + (void)argc; + + strcpy(a_helper.sun_path, argv[4]); + getaddrinfo(argv[1], argv[2], &hints, &r); + + if (argc != 7) { + fprintf(stderr, + "%s DST_ADDR DST_PORT SRC_PORT HELPER_PATH SSEQ RSEQ\n", + argv[0]); + return -1; + } + + /* Prepare socket, bind to source port */ + s = socket(r->ai_family, SOCK_STREAM, IPPROTO_TCP); + setsockopt(s, SOL_SOCKET, SO_REUSEADDR, &((int){ 1 }), sizeof(int)); + bind(s, (struct sockaddr *)&a, sizeof(a)); + + /* 
Wait for helper */ + s_helper = socket(AF_UNIX, SOCK_STREAM, 0); + unlink(a_helper.sun_path); + bind(s_helper, (struct sockaddr *)&a_helper, sizeof(a_helper)); + listen(s_helper, 1); + s_helper = accept(s_helper, NULL, NULL); + + /* Set up message for helper, with socket */ + cmsg->cmsg_level = SOL_SOCKET; + cmsg->cmsg_type = SCM_RIGHTS; + cmsg->cmsg_len = CMSG_LEN(sizeof(int)); + memcpy(CMSG_DATA(cmsg), &s, sizeof(s)); + + /* Send command to helper: turn repair mode on, wait for reply */ + cmd = TCP_REPAIR_ON; + sendmsg(s_helper, &msg, 0); + recv(s_helper, &((int){ 0 }), 1, 0); + + /* Set sending sequence */ + seq = TCP_SEND_QUEUE; + setsockopt(s, SOL_TCP, TCP_REPAIR_QUEUE, &seq, sizeof(seq)); + seq = atoi(argv[5]); + setsockopt(s, SOL_TCP, TCP_QUEUE_SEQ, &seq, sizeof(seq)); + + /* Set receiving sequence */ + seq = TCP_RECV_QUEUE; + setsockopt(s, SOL_TCP, TCP_REPAIR_QUEUE, &seq, sizeof(seq)); + seq = atoi(argv[6]); + setsockopt(s, SOL_TCP, TCP_QUEUE_SEQ, &seq, sizeof(seq)); + + /* Connect setting kernel state only, without actual SYN / handshake */ + connect(s, r->ai_addr, r->ai_addrlen); + + /* Send command to helper: turn repair mode off, wait for reply */ + cmd = TCP_REPAIR_OFF; + sendmsg(s_helper, &msg, 0); + + recv(s_helper, &((int8_t){ 0 }), 1, 0); + + /* Terminate helper */ + close(s_helper); + + /* Send some more data */ + send(s, "after migration\n", sizeof("after migration\n"), 0); +} From b4a7b5d4a66db5f419cb5de87da3403cfba3847d Mon Sep 17 00:00:00 2001 From: David Gibson <david@gibson.dropbear.id.au> Date: Tue, 4 Feb 2025 16:42:13 +1100 Subject: [PATCH 203/382] migrate: Fix several errors with passt-repair The passt-repair helper is now merged, but alas it contains several small bugs: * close() is not in the seccomp profile, meaning it will immediately SIGSYS when you make a request of it * The generated header, seccomp_repair.h isn't listed in .gitignore or removed by "make clean" Fixes: 8c24301462c3 ("Introduce passt-repair") Signed-off-by: David 
Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- .gitignore | 1 + Makefile | 2 +- passt-repair.c | 2 +- 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/.gitignore b/.gitignore index 5824a71..3c16adc 100644 --- a/.gitignore +++ b/.gitignore @@ -7,5 +7,6 @@ /qrap /pasta.1 /seccomp.h +/seccomp_repair.h /c*.json README.plain.md diff --git a/Makefile b/Makefile index 6ab8d24..d3d4b78 100644 --- a/Makefile +++ b/Makefile @@ -117,7 +117,7 @@ valgrind: all .PHONY: clean clean: - $(RM) $(BIN) *~ *.o seccomp.h pasta.1 \ + $(RM) $(BIN) *~ *.o seccomp.h seccomp_repair.h pasta.1 \ passt.tar passt.tar.gz *.deb *.rpm \ passt.pid README.plain.md diff --git a/passt-repair.c b/passt-repair.c index 767a821..dd8578f 100644 --- a/passt-repair.c +++ b/passt-repair.c @@ -46,7 +46,7 @@ * * Return: 0 on success (EOF), 1 on error, 2 on usage error * - * #syscalls:repair connect setsockopt write exit_group + * #syscalls:repair connect setsockopt write close exit_group * #syscalls:repair socket s390x:socketcall i686:socketcall * #syscalls:repair recvfrom recvmsg arm:recv ppc64le:recv * #syscalls:repair sendto sendmsg arm:send ppc64le:send From 745c163e60b0e5da7bf6013645d79b4bdbf3e848 Mon Sep 17 00:00:00 2001 From: David Gibson <david@gibson.dropbear.id.au> Date: Tue, 4 Feb 2025 16:42:15 +1100 Subject: [PATCH 204/382] tcp: Simplify handling of getsockname() For migration we need to get the specific local address and port for connected sockets with getsockname(). We currently open code marshalling the results into the flow entry. However, we already have inany_from_sockaddr() which handles the fiddly parts of this, so use it. Also report failures, which may make debugging problems easier. 
Signed-off-by: David Gibson <david@gibson.dropbear.id.au> [sbrivio: Drop re-declarations of 'sa' and 'sl'] Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- tcp.c | 22 +++++----------------- 1 file changed, 5 insertions(+), 17 deletions(-) diff --git a/tcp.c b/tcp.c index fac322c..af6bd95 100644 --- a/tcp.c +++ b/tcp.c @@ -1535,24 +1535,12 @@ static void tcp_conn_from_tap(const struct ctx *c, sa_family_t af, tcp_epoll_ctl(c, conn); if (c->mode == MODE_VU) { /* To rebind to same oport after migration */ - if (af == AF_INET) { - struct sockaddr_in s_in; - - sl = sizeof(s_in); - if (!getsockname(s, (struct sockaddr *)&s_in, &sl)) { - /* NOLINTNEXTLINE(clang-analyzer-core.CallAndMessage) */ - tgt->oport = ntohs(s_in.sin_port); - tgt->oaddr = inany_from_v4(s_in.sin_addr); - } + sl = sizeof(sa); + if (!getsockname(s, &sa.sa, &sl)) { + inany_from_sockaddr(&tgt->oaddr, &tgt->oport, &sa); } else { - struct sockaddr_in6 s_in6; - - sl = sizeof(s_in6); - if (!getsockname(s, (struct sockaddr *)&s_in6, &sl)) { - /* NOLINTNEXTLINE(clang-analyzer-core.CallAndMessage) */ - tgt->oport = ntohs(s_in6.sin6_port); - tgt->oaddr.a6 = s_in6.sin6_addr; - } + err("Failed to get local address for socket: %s", + strerror_(errno)); } } From d0006fa784a7de881db187756770d2492c75df5d Mon Sep 17 00:00:00 2001 From: Paul Holzinger <pholzing@redhat.com> Date: Wed, 5 Feb 2025 14:00:41 +0100 Subject: [PATCH 205/382] treewide: use _exit() over exit() In the podman CI I noticed many seccomp denials in our logs even though tests passed: comm="pasta.avx2" exe="/usr/bin/pasta.avx2" sig=31 arch=c000003e syscall=202 compat=0 ip=0x7fb3d31f69db code=0x80000000 Which is futex being called and blocked by the pasta profile. 
After a few tries I managed to reproduce locally with this loop in ~20 min: while :; do podman run -d --network bridge quay.io/libpod/testimage:20241011 \ sleep 100 && \ sleep 10 && \ podman rm -fa -t0 done And using a pasta version with prctl(PR_SET_DUMPABLE, 1); set I got the following stack trace: Stack trace of thread 1: #0 0x00007fc95e6de91b __lll_lock_wait_private (libc.so.6 + 0x9491b) #1 0x00007fc95e68d6de __run_exit_handlers (libc.so.6 + 0x436de) #2 0x00007fc95e68d70e exit (libc.so.6 + 0x4370e) #3 0x000055f31b78c50b n/a (n/a + 0x0) #4 0x00007fc95e68d70e exit (libc.so.6 + 0x4370e) #5 0x000055f31b78d5a2 n/a (n/a + 0x0) Pasta got killed in exit(), it seems glibc is trying to use a lock when running exit handlers even though no exit handlers are defined. Given no exit handlers are needed we can call _exit() instead. This skips exit handlers and does not flush stdio streams compared to exit() which should be fine for the use here. Based on the input from Stefano I did not change the test/doc programs or qrap as they do not use seccomp filters. 
Signed-off-by: Paul Holzinger <pholzing@redhat.com> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- conf.c | 8 ++++---- log.h | 4 ++-- passt.c | 8 ++++---- pasta.c | 8 ++++---- tap.c | 2 +- util.c | 8 ++++---- vhost_user.c | 2 +- 7 files changed, 20 insertions(+), 20 deletions(-) diff --git a/conf.c b/conf.c index df2b016..6817377 100644 --- a/conf.c +++ b/conf.c @@ -769,7 +769,7 @@ static void conf_ip6_local(struct ip6_ctx *ip6) * usage() - Print usage, exit with given status code * @name: Executable name * @f: Stream to print usage info to - * @status: Status code for exit() + * @status: Status code for _exit() */ static void usage(const char *name, FILE *f, int status) { @@ -925,7 +925,7 @@ static void usage(const char *name, FILE *f, int status) " SPEC is as described for TCP above\n" " default: none\n"); - exit(status); + _exit(status); pasta_opts: @@ -980,7 +980,7 @@ pasta_opts: " --ns-mac-addr ADDR Set MAC address on tap interface\n" " --no-splice Disable inbound socket splicing\n"); - exit(status); + _exit(status); } /** @@ -1482,7 +1482,7 @@ void conf(struct ctx *c, int argc, char **argv) FPRINTF(stdout, c->mode == MODE_PASTA ? "pasta " : "passt "); FPRINTF(stdout, VERSION_BLOB); - exit(EXIT_SUCCESS); + _exit(EXIT_SUCCESS); case 15: ret = snprintf(c->ip4.ifname_out, sizeof(c->ip4.ifname_out), "%s", optarg); diff --git a/log.h b/log.h index a30b091..22c7b9a 100644 --- a/log.h +++ b/log.h @@ -32,13 +32,13 @@ void logmsg_perror(int pri, const char *format, ...) #define die(...) \ do { \ err(__VA_ARGS__); \ - exit(EXIT_FAILURE); \ + _exit(EXIT_FAILURE); \ } while (0) #define die_perror(...) 
\ do { \ err_perror(__VA_ARGS__); \ - exit(EXIT_FAILURE); \ + _exit(EXIT_FAILURE); \ } while (0) extern int log_trace; diff --git a/passt.c b/passt.c index b1c8ab6..53fdd38 100644 --- a/passt.c +++ b/passt.c @@ -167,7 +167,7 @@ void exit_handler(int signal) { (void)signal; - exit(EXIT_SUCCESS); + _exit(EXIT_SUCCESS); } /** @@ -210,7 +210,7 @@ int main(int argc, char **argv) sigaction(SIGQUIT, &sa, NULL); if (argc < 1) - exit(EXIT_FAILURE); + _exit(EXIT_FAILURE); strncpy(argv0, argv[0], PATH_MAX - 1); name = basename(argv0); @@ -226,7 +226,7 @@ int main(int argc, char **argv) } else if (strstr(name, "passt")) { c.mode = MODE_PASST; } else { - exit(EXIT_FAILURE); + _exit(EXIT_FAILURE); } madvise(pkt_buf, TAP_BUF_BYTES, MADV_HUGEPAGE); @@ -259,7 +259,7 @@ int main(int argc, char **argv) flow_init(); if ((!c.no_udp && udp_init(&c)) || (!c.no_tcp && tcp_init(&c))) - exit(EXIT_FAILURE); + _exit(EXIT_FAILURE); proto_update_l2_buf(c.guest_mac, c.our_tap_mac); diff --git a/pasta.c b/pasta.c index ff41c95..f15084d 100644 --- a/pasta.c +++ b/pasta.c @@ -73,12 +73,12 @@ void pasta_child_handler(int signal) !waitid(P_PID, pasta_child_pid, &infop, WEXITED | WNOHANG)) { if (infop.si_pid == pasta_child_pid) { if (infop.si_code == CLD_EXITED) - exit(infop.si_status); + _exit(infop.si_status); /* If killed by a signal, si_status is the number. * Follow common shell convention of returning it + 128. 
*/ - exit(infop.si_status + 128); + _exit(infop.si_status + 128); /* Nothing to do, detached PID namespace going away */ } @@ -499,7 +499,7 @@ void pasta_netns_quit_inotify_handler(struct ctx *c, int inotify_fd) return; info("Namespace %s is gone, exiting", c->netns_base); - exit(EXIT_SUCCESS); + _exit(EXIT_SUCCESS); } /** @@ -525,7 +525,7 @@ void pasta_netns_quit_timer_handler(struct ctx *c, union epoll_ref ref) return; info("Namespace %s is gone, exiting", c->netns_base); - exit(EXIT_SUCCESS); + _exit(EXIT_SUCCESS); } close(fd); diff --git a/tap.c b/tap.c index 772648f..8c92d23 100644 --- a/tap.c +++ b/tap.c @@ -1002,7 +1002,7 @@ void tap_sock_reset(struct ctx *c) info("Client connection closed%s", c->one_off ? ", exiting" : ""); if (c->one_off) - exit(EXIT_SUCCESS); + _exit(EXIT_SUCCESS); /* Close the connected socket, wait for a new connection */ epoll_del(c, c->fd_tap); diff --git a/util.c b/util.c index 800c6b5..4d51e04 100644 --- a/util.c +++ b/util.c @@ -405,7 +405,7 @@ void pidfile_write(int fd, pid_t pid) if (write(fd, pid_buf, n) < 0) { perror("PID file write"); - exit(EXIT_FAILURE); + _exit(EXIT_FAILURE); } close(fd); @@ -441,12 +441,12 @@ int __daemon(int pidfile_fd, int devnull_fd) if (pid == -1) { perror("fork"); - exit(EXIT_FAILURE); + _exit(EXIT_FAILURE); } if (pid) { pidfile_write(pidfile_fd, pid); - exit(EXIT_SUCCESS); + _exit(EXIT_SUCCESS); } if (setsid() < 0 || @@ -454,7 +454,7 @@ int __daemon(int pidfile_fd, int devnull_fd) dup2(devnull_fd, STDOUT_FILENO) < 0 || dup2(devnull_fd, STDERR_FILENO) < 0 || close(devnull_fd)) - exit(EXIT_FAILURE); + _exit(EXIT_FAILURE); return 0; } diff --git a/vhost_user.c b/vhost_user.c index 9e38cfd..159f0b3 100644 --- a/vhost_user.c +++ b/vhost_user.c @@ -60,7 +60,7 @@ void vu_print_capabilities(void) info("{"); info(" \"type\": \"net\""); info("}"); - exit(EXIT_SUCCESS); + _exit(EXIT_SUCCESS); } /** From a9d63f91a59a4c02cd77af41fa70d82e73f17576 Mon Sep 17 00:00:00 2001 From: Paul Holzinger <pholzing@redhat.com> 
Date: Wed, 5 Feb 2025 14:00:42 +0100 Subject: [PATCH 206/382] passt-repair: use _exit() over return When returning from main it does the same as calling exit() which is not good as glibc might try to call futex() which will be blocked by seccomp. See the previous commit "treewide: use _exit() over exit()" for a more detailed explanation. Signed-off-by: Paul Holzinger <pholzing@redhat.com> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- passt-repair.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/passt-repair.c b/passt-repair.c index dd8578f..6f79423 100644 --- a/passt-repair.c +++ b/passt-repair.c @@ -71,7 +71,7 @@ int main(int argc, char **argv) if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) || prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog)) { fprintf(stderr, "Failed to apply seccomp filter"); - return 1; + _exit(1); } iov = (struct iovec){ &cmd, sizeof(cmd) }; @@ -80,42 +80,42 @@ int main(int argc, char **argv) if (argc != 2) { fprintf(stderr, "Usage: %s PATH\n", argv[0]); - return 2; + _exit(2); } ret = snprintf(a.sun_path, sizeof(a.sun_path), "%s", argv[1]); if (ret <= 0 || ret >= (int)sizeof(a.sun_path)) { fprintf(stderr, "Invalid socket path: %s\n", argv[1]); - return 2; + _exit(2); } if ((s = socket(AF_UNIX, SOCK_STREAM, 0)) < 0) { perror("Failed to create AF_UNIX socket"); - return 1; + _exit(1); } if (connect(s, (struct sockaddr *)&a, sizeof(a))) { fprintf(stderr, "Failed to connect to %s: %s\n", argv[1], strerror(errno)); - return 1; + _exit(1); } loop: ret = recvmsg(s, &msg, 0); if (ret < 0) { perror("Failed to receive message"); - return 1; + _exit(1); } if (!ret) /* Done */ - return 0; + _exit(0); if (!cmsg || cmsg->cmsg_len < CMSG_LEN(sizeof(int)) || cmsg->cmsg_len > CMSG_LEN(sizeof(int) * SCM_MAX_FD) || cmsg->cmsg_type != SCM_RIGHTS) { fprintf(stderr, "No/bad ancillary data from peer\n"); - return 1; + _exit(1); } n = cmsg->cmsg_len / CMSG_LEN(sizeof(int)); @@ -124,7 +124,7 @@ loop: if (cmd !=
TCP_REPAIR_ON && cmd != TCP_REPAIR_OFF && cmd != TCP_REPAIR_OFF_NO_WP) { fprintf(stderr, "Unsupported command 0x%04x\n", cmd); - return 1; + _exit(1); } for (i = 0; i < n; i++) { @@ -134,7 +134,7 @@ loop: fprintf(stderr, "Setting TCP_REPAIR to %i on socket %i: %s", o, fds[i], strerror(errno)); - return 1; + _exit(1); } /* Close _our_ copy */ @@ -144,11 +144,11 @@ loop: if (send(s, &cmd, sizeof(cmd), 0) < 0) { fprintf(stderr, "Reply to command %i: %s\n", o, strerror(errno)); - return 1; + _exit(1); } } goto loop; - return 0; + _exit(0); } From 9215f68a0c2ad274b73862bc865fbdbb464e182a Mon Sep 17 00:00:00 2001 From: Stefano Brivio <sbrivio@redhat.com> Date: Wed, 5 Feb 2025 16:57:55 +0100 Subject: [PATCH 207/382] passt-repair: Build fixes for musl When building against musl headers: - sizeof() needs stddef.h, as it should be; - we can't initialise a struct msghdr by simply listing fields in order, as they contain explicit padding fields. Use field names instead. Signed-off-by: Stefano Brivio <sbrivio@redhat.com> Reviewed-by: David Gibson <david@gibson.dropbear.id.au> --- passt-repair.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/passt-repair.c b/passt-repair.c index 6f79423..3c3247b 100644 --- a/passt-repair.c +++ b/passt-repair.c @@ -21,6 +21,7 @@ #include <sys/socket.h> #include <sys/un.h> #include <errno.h> +#include <stddef.h> #include <stdio.h> #include <stdlib.h> #include <string.h> @@ -75,7 +76,11 @@ int main(int argc, char **argv) } iov = (struct iovec){ &cmd, sizeof(cmd) }; - msg = (struct msghdr){ NULL, 0, &iov, 1, buf, sizeof(buf), 0 }; + msg = (struct msghdr){ .msg_name = NULL, .msg_namelen = 0, + .msg_iov = &iov, .msg_iovlen = 1, + .msg_control = buf, + .msg_controllen = sizeof(buf), + .msg_flags = 0 }; cmsg = CMSG_FIRSTHDR(&msg); if (argc != 2) { From 593be3277429f0a2c06f6bebab4f20736c96abc8 Mon Sep 17 00:00:00 2001 From: Stefano Brivio <sbrivio@redhat.com> Date: Wed, 5 Feb 2025 17:02:27 +0100 Subject: [PATCH 208/382] 
passt-repair.1: Fix indication of TCP_REPAIR constants ...perhaps I should adopt the healthy habit of actually reading headers instead of using my mental copy. Signed-off-by: Stefano Brivio <sbrivio@redhat.com> Reviewed-by: David Gibson <david@gibson.dropbear.id.au> --- passt-repair.1 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/passt-repair.1 b/passt-repair.1 index 8d07c97..7c1b140 100644 --- a/passt-repair.1 +++ b/passt-repair.1 @@ -31,7 +31,7 @@ same as the UNIX domain socket used for guest communication, suffixed by \fI.repair\fR. The messages consist of one 8-bit signed integer that can be \fITCP_REPAIR_ON\fR -(1), \fITCP_REPAIR_OFF\fR (2), or \fITCP_REPAIR_OFF_WP\fR (-1), as defined by +(1), \fITCP_REPAIR_OFF\fR (0), or \fITCP_REPAIR_OFF_NO_WP\fR (-1), as defined by the Linux kernel user API, and one to SCM_MAX_FD (253) sockets as SCM_RIGHTS (see \fBunix\fR(7)) ancillary message, sent by the server, \fBpasst\fR(1). From f66769c2de82550ac1ee2548960c09a4b052341f Mon Sep 17 00:00:00 2001 From: Stefano Brivio <sbrivio@redhat.com> Date: Wed, 5 Feb 2025 17:21:59 +0100 Subject: [PATCH 209/382] apparmor: Workaround for unconfined libvirtd when triggered by unprivileged user If libvirtd is triggered by an unprivileged user, the virt-aa-helper mechanism doesn't work, because per-VM profiles can't be instantiated, and as a result libvirtd runs unconfined. This means passt can't start, because the passt subprofile from libvirt's profile is not loaded either. Example: $ virsh start alpine error: Failed to start domain 'alpine' error: internal error: Child process (passt --one-off --socket /run/user/1000/libvirt/qemu/run/passt/1-alpine-net0.socket --pid /run/user/1000/libvirt/qemu/run/passt/1-alpine-net0-passt.pid --tcp-ports 40922:2) unexpected fatal signal 11 Add an annoying workaround for the moment being. Much better than encouraging users to start guests as root, or to disable AppArmor altogether. 
Reported-by: Prafulla Giri <prafulla.giri@protonmail.com> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- contrib/apparmor/usr.bin.passt | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/contrib/apparmor/usr.bin.passt b/contrib/apparmor/usr.bin.passt index 9568189..62a4514 100644 --- a/contrib/apparmor/usr.bin.passt +++ b/contrib/apparmor/usr.bin.passt @@ -27,4 +27,25 @@ profile passt /usr/bin/passt{,.avx2} { owner @{HOME}/** w, # pcap(), pidfile_open(), # pidfile_write() + + # Workaround: libvirt's profile comes with a passt subprofile which includes, + # in turn, <abstractions/passt>, and adds libvirt-specific rules on top, to + # allow passt (when started by libvirtd) to write socket and PID files in the + # location requested by libvirtd itself, and to execute passt itself. + # + # However, when libvirt runs as unprivileged user, the mechanism based on + # virt-aa-helper, designed to build per-VM profiles as guests are started, + # doesn't work. The helper needs to create and load profiles on the fly, which + # can't be done by unprivileged users, of course. + # + # As a result, libvirtd runs unconfined if guests are started by unprivileged + # users, starting passt unconfined as well, which means that passt runs under + # its own stand-alone profile (this one), which implies in turn that execve() + # of /usr/bin/passt is not allowed, and socket and PID files can't be written. + # + # Duplicate libvirt-specific rules here as long as this is not solved in + # libvirt's profile itself. 
+ /usr/bin/passt r, + owner @{run}/user/[0-9]*/libvirt/qemu/run/passt/* rw, + owner @{run}/libvirt/qemu/passt/* rw, } From 0da87b393b63747526d162c728987f320b41771e Mon Sep 17 00:00:00 2001 From: David Gibson <david@gibson.dropbear.id.au> Date: Thu, 6 Feb 2025 16:49:42 +1100 Subject: [PATCH 210/382] debug: Add tcpdump to mbuto.img Signed-off-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- test/passt.mbuto | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/test/passt.mbuto b/test/passt.mbuto index 138d365..d4d57cb 100755 --- a/test/passt.mbuto +++ b/test/passt.mbuto @@ -13,7 +13,7 @@ PROGS="${PROGS:-ash,dash,bash ip mount ls insmod mkdir ln cat chmod lsmod modprobe find grep mknod mv rm umount jq iperf3 dhclient hostname sed tr chown sipcalc cut socat dd strace ping tail killall sleep sysctl - nproc tcp_rr tcp_crr udp_rr which tee seq bc sshd ssh-keygen cmp}" + nproc tcp_rr tcp_crr udp_rr which tee seq bc sshd ssh-keygen cmp tcpdump}" # OpenSSH 9.8 introduced split binaries, with sshd being the daemon, and # sshd-session the per-session program. We need the latter as well, and the path @@ -65,6 +65,7 @@ EOF # sshd via vsock cat > /etc/passwd << EOF root:x:0:0:root:/root:/bin/sh +tcpdump:x:72:72:tcpdump:/:/sbin/nologin sshd:x:100:100:Privilege-separated SSH:/var/empty/sshd:/sbin/nologin EOF cat > /etc/shadow << EOF From a5cca995dee9b4196d41c86034a4948d346266ca Mon Sep 17 00:00:00 2001 From: Stefano Brivio <sbrivio@redhat.com> Date: Thu, 6 Feb 2025 09:33:05 +0100 Subject: [PATCH 211/382] conf, passt.1: Un-deprecate --host-lo-to-ns-lo It was established behaviour, and it's now the third report about it: users ask how to achieve the same functionality, and we don't have a better answer yet. The idea behind declaring it deprecated to start with, I guess, was that we would eventually replace it by more flexible and generic configuration options, which is still planned. 
But there's nothing preventing us from aliasing this in the future to a particular configuration. So, stop scaring users off, and un-deprecate this. Link: https://archives.passt.top/passt-dev/20240925102009.62b9a0ce@elisabeth/ Link: https://github.com/rootless-containers/rootlesskit/pull/482#issuecomment-2591855705 Link: https://github.com/moby/moby/issues/48838 Link: https://github.com/containers/podman/discussions/25243 Signed-off-by: Stefano Brivio <sbrivio@redhat.com> Reviewed-by: David Gibson <david@gibson.dropbear.id.au> --- conf.c | 3 +-- passt.1 | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/conf.c b/conf.c index 6817377..f5d04db 100644 --- a/conf.c +++ b/conf.c @@ -963,8 +963,7 @@ pasta_opts: " -U, --udp-ns SPEC UDP port forwarding to init namespace\n" " SPEC is as described above\n" " default: auto\n" - " --host-lo-to-ns-lo DEPRECATED:\n" - " Translate host-loopback forwards to\n" + " --host-lo-to-ns-lo Translate host-loopback forwards to\n" " namespace loopback\n" " --userns NSPATH Target user namespace to join\n" " --netns PATH|NAME Target network namespace to join\n" diff --git a/passt.1 b/passt.1 index d9cd33e..2928af5 100644 --- a/passt.1 +++ b/passt.1 @@ -622,7 +622,7 @@ Configure UDP port forwarding from target namespace to init namespace. Default is \fBauto\fR. .TP -.BR \-\-host-lo-to-ns-lo " " (DEPRECATED) +.BR \-\-host-lo-to-ns-lo If specified, connections forwarded with \fB\-t\fR and \fB\-u\fR from the host's loopback address will appear on the loopback address in the guest as well. Without this option such forwarded packets will appear From a0b7f56b3a3c220b3d8065d7cfdd83a6e3919467 Mon Sep 17 00:00:00 2001 From: Stefano Brivio <sbrivio@redhat.com> Date: Fri, 7 Feb 2025 01:51:38 +0100 Subject: [PATCH 212/382] passt-repair: Don't use perror(), accept ECONNRESET as termination If we use glibc's perror(), we need to allow dup() and fcntl() in our seccomp profiles, which are a bit too much for this simple helper.
On top of that, we would probably need a wrapper to avoid allocation for translated messages. While at it: ECONNRESET is just a close() from passt, treat it like EOF. Signed-off-by: Stefano Brivio <sbrivio@redhat.com> Reviewed-by: David Gibson <david@gibson.dropbear.id.au> --- passt-repair.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/passt-repair.c b/passt-repair.c index 3c3247b..d137a18 100644 --- a/passt-repair.c +++ b/passt-repair.c @@ -95,7 +95,7 @@ int main(int argc, char **argv) } if ((s = socket(AF_UNIX, SOCK_STREAM, 0)) < 0) { - perror("Failed to create AF_UNIX socket"); + fprintf(stderr, "Failed to create AF_UNIX socket: %i\n", errno); _exit(1); } @@ -108,8 +108,12 @@ int main(int argc, char **argv) loop: ret = recvmsg(s, &msg, 0); if (ret < 0) { - perror("Failed to receive message"); - _exit(1); + if (errno == ECONNRESET) { + ret = 0; + } else { + fprintf(stderr, "Failed to read message: %i\n", errno); + _exit(1); + } } if (!ret) /* Done */ From 0f009ea598707c5978846387d716f4a612d07b36 Mon Sep 17 00:00:00 2001 From: Stefano Brivio <sbrivio@redhat.com> Date: Fri, 7 Feb 2025 01:55:08 +0100 Subject: [PATCH 213/382] passt-repair: Fix calculation of payload length from cmsg_len There's no inverse function for CMSG_LEN(), so we need to loop over SCM_MAX_FD (253) possible input values. The previous calculation is clearly wrong, as not every int takes CMSG_LEN(sizeof(int)) in cmsg data. 
Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- passt-repair.c | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/passt-repair.c b/passt-repair.c index d137a18..5ad5c9c 100644 --- a/passt-repair.c +++ b/passt-repair.c @@ -57,7 +57,7 @@ int main(int argc, char **argv) char buf[CMSG_SPACE(sizeof(int) * SCM_MAX_FD)] __attribute__ ((aligned(__alignof__(struct cmsghdr)))); struct sockaddr_un a = { AF_UNIX, "" }; - int fds[SCM_MAX_FD], s, ret, i, n; + int fds[SCM_MAX_FD], s, ret, i, n = 0; struct sock_fprog prog; int8_t cmd = INT8_MAX; struct cmsghdr *cmsg; @@ -127,7 +127,21 @@ loop: _exit(1); } - n = cmsg->cmsg_len / CMSG_LEN(sizeof(int)); + /* No inverse formula for CMSG_LEN(x), and building one with CMSG_LEN(0) + * works but there's no guarantee it does. Search the whole domain. + */ + for (i = 1; i < SCM_MAX_FD; i++) { + if (CMSG_LEN(sizeof(int) * i) == cmsg->cmsg_len) { + n = i; + break; + } + } + if (!n) { + fprintf(stderr, "Invalid ancillary data length %zu from peer\n", + cmsg->cmsg_len); + _exit(1); + } + memcpy(fds, CMSG_DATA(cmsg), sizeof(int) * n); if (cmd != TCP_REPAIR_ON && cmd != TCP_REPAIR_OFF && From b7b70ba24369891d79079d247f246c1e357948d2 Mon Sep 17 00:00:00 2001 From: Stefano Brivio <sbrivio@redhat.com> Date: Fri, 7 Feb 2025 01:58:00 +0100 Subject: [PATCH 214/382] passt-repair: Dodge "structurally unreachable code" warning from Coverity While main() conventionally returns int, and we need a return at the end of the function to avoid compiler warnings, turning that return into _exit() to avoid exit handlers triggers a Coverity warning. It's unreachable code anyway, so switch that single occurence back to a plain return. 
Signed-off-by: Stefano Brivio <sbrivio@redhat.com> Reviewed-by: David Gibson <david@gibson.dropbear.id.au> --- passt-repair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/passt-repair.c b/passt-repair.c index 5ad5c9c..322066a 100644 --- a/passt-repair.c +++ b/passt-repair.c @@ -173,5 +173,5 @@ loop: goto loop; - _exit(0); + return 0; } From fe8b6a7c42625ee1fc63186204d32458b1ba31b9 Mon Sep 17 00:00:00 2001 From: Enrique Llorente <ellorent@redhat.com> Date: Tue, 4 Feb 2025 10:43:37 +0100 Subject: [PATCH 215/382] dhcp: Don't re-use request message for reply The logic composing the DHCP reply message is reusing the request message to compose it, future long options like FQDN may exceed the request message limit making it go beyond the lower bound. This change creates a new reply message with a fixed options size of 308 and fills it in with proper fields from requests adding on top the generated options, this way the reply lower bound does not depend on the request. Signed-off-by: Enrique Llorente <ellorent@redhat.com> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- dhcp.c | 27 ++++++++++++++++++++------- 1 file changed, 20 insertions(+), 7 deletions(-) diff --git a/dhcp.c b/dhcp.c index d8515aa..2a23ed4 100644 --- a/dhcp.c +++ b/dhcp.c @@ -151,9 +151,6 @@ static int fill(struct msg *m) { int i, o, offset = 0; - m->op = BOOTREPLY; - m->secs = 0; - for (o = 0; o < 255; o++) opts[o].sent = 0; @@ -291,8 +288,9 @@ int dhcp(const struct ctx *c, const struct pool *p) const struct ethhdr *eh; const struct iphdr *iph; const struct udphdr *uh; + struct msg const *m; + struct msg reply; unsigned int i; - struct msg *m; eh = packet_get(p, 0, offset, sizeof(*eh), NULL); offset += sizeof(*eh); @@ -321,6 +319,22 @@ int dhcp(const struct ctx *c, const struct pool *p) m->op != BOOTREQUEST) return -1; + reply.op = BOOTREPLY; + reply.htype = m->htype; + reply.hlen = m->hlen; + reply.hops = 0; + reply.xid = m->xid; + reply.secs = 0; + reply.flags = m->flags; + 
reply.ciaddr = m->ciaddr; + reply.yiaddr = c->ip4.addr; + reply.siaddr = 0; + reply.giaddr = m->giaddr; + memcpy(&reply.chaddr, m->chaddr, sizeof(reply.chaddr)); + memset(&reply.sname, 0, sizeof(reply.sname)); + memset(&reply.file, 0, sizeof(reply.file)); + reply.magic = m->magic; + offset += offsetof(struct msg, o); for (i = 0; i < ARRAY_SIZE(opts); i++) @@ -364,7 +378,6 @@ int dhcp(const struct ctx *c, const struct pool *p) info(" from %s", eth_ntop(m->chaddr, macstr, sizeof(macstr))); - m->yiaddr = c->ip4.addr; mask.s_addr = htonl(0xffffffff << (32 - c->ip4.prefix_len)); memcpy(opts[1].s, &mask, sizeof(mask)); memcpy(opts[3].s, &c->ip4.guest_gw, sizeof(c->ip4.guest_gw)); @@ -401,14 +414,14 @@ int dhcp(const struct ctx *c, const struct pool *p) if (!c->no_dhcp_dns_search) opt_set_dns_search(c, sizeof(m->o)); - dlen = offsetof(struct msg, o) + fill(m); + dlen = offsetof(struct msg, o) + fill(&reply); if (m->flags & FLAG_BROADCAST) dst = in4addr_broadcast; else dst = c->ip4.addr; - tap_udp4_send(c, c->ip4.our_tap_addr, 67, dst, 68, m, dlen); + tap_udp4_send(c, c->ip4.our_tap_addr, 67, dst, 68, &reply, dlen); return 1; } From 864be475d9db58c93540eb883ecf656c3eff861f Mon Sep 17 00:00:00 2001 From: Stefano Brivio <sbrivio@redhat.com> Date: Fri, 7 Feb 2025 08:59:57 +0100 Subject: [PATCH 216/382] passt-repair: Send one confirmation *per command*, not *per socket* It looks like me, myself and I couldn't agree on the "simple" protocol between passt and passt-repair. The man page and passt say it's one confirmation per command, but the passt-repair implementation had one confirmation per socket instead. This caused all sort of mysterious issues with repair mode pseudo-randomly enabled, and leading to hours of fun (mostly not mine). Oops. Switch to one confirmation per command (of course). 
Signed-off-by: Stefano Brivio <sbrivio@redhat.com> Reviewed-by: David Gibson <david@gibson.dropbear.id.au> --- passt-repair.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/passt-repair.c b/passt-repair.c index 322066a..614cee0 100644 --- a/passt-repair.c +++ b/passt-repair.c @@ -63,6 +63,7 @@ int main(int argc, char **argv) struct cmsghdr *cmsg; struct msghdr msg; struct iovec iov; + int op; prctl(PR_SET_DUMPABLE, 0); @@ -150,25 +151,24 @@ loop: _exit(1); } - for (i = 0; i < n; i++) { - int o = cmd; + op = cmd; - if (setsockopt(fds[i], SOL_TCP, TCP_REPAIR, &o, sizeof(o))) { + for (i = 0; i < n; i++) { + if (setsockopt(fds[i], SOL_TCP, TCP_REPAIR, &op, sizeof(op))) { fprintf(stderr, - "Setting TCP_REPAIR to %i on socket %i: %s", o, + "Setting TCP_REPAIR to %i on socket %i: %s", op, fds[i], strerror(errno)); _exit(1); } /* Close _our_ copy */ close(fds[i]); + } - /* Confirm setting by echoing the command back */ - if (send(s, &cmd, sizeof(cmd), 0) < 0) { - fprintf(stderr, "Reply to command %i: %s\n", - o, strerror(errno)); - _exit(1); - } + /* Confirm setting by echoing the command back */ + if (send(s, &cmd, sizeof(cmd), 0) < 0) { + fprintf(stderr, "Reply to %i: %s\n", op, strerror(errno)); + _exit(1); } goto loop; From a3d142a6f64d89fffe26634e158dedd55fa31e7b Mon Sep 17 00:00:00 2001 From: Stefano Brivio <sbrivio@redhat.com> Date: Mon, 3 Feb 2025 09:22:10 +0100 Subject: [PATCH 217/382] conf: Don't map DNS traffic to host, if host gateway is a resolver This should be a relatively common case and I'm a bit surprised it's been broken since I added the "gateway mapping" functionality, but it doesn't happen with Podman, and not with systemd-resolved or similar local proxies, and also not with servers where typically the gateway is just a router and not a DNS resolver. That could be the reason why nobody noticed until now. 
By default, we'll map the address of the default gateway, in containers and guests, to represent "the host", so that we have a well-defined way to reach the host. Say: 0.0029: NAT to host 127.0.0.1: 192.168.100.1 But if the host gateway is also a DNS resolver: 0.0029: DNS: 0.0029: 192.168.100.1 then we'll send DNS queries directed to it to the host instead: 0.0372: Flow 0 (INI): TAP [192.168.100.157]:41892 -> [192.168.100.1]:53 => ? 0.0372: Flow 0 (TGT): INI -> TGT 0.0373: Flow 0 (TGT): TAP [192.168.100.157]:41892 -> [192.168.100.1]:53 => HOST [0.0.0.0]:41892 -> [127.0.0.1]:53 0.0373: Flow 0 (UDP flow): TGT -> TYPED 0.0373: Flow 0 (UDP flow): TAP [192.168.100.157]:41892 -> [192.168.100.1]:53 => HOST [0.0.0.0]:41892 -> [127.0.0.1]:53 0.0373: Flow 0 (UDP flow): Side 0 hash table insert: bucket: 31049 0.0374: Flow 0 (UDP flow): TYPED -> ACTIVE 0.0374: Flow 0 (UDP flow): TAP [192.168.100.157]:41892 -> [192.168.100.1]:53 => HOST [0.0.0.0]:41892 -> [127.0.0.1]:53 which doesn't quite work, of course: 0.0374: pasta: epoll event on UDP reply socket 95 (events: 0x00000008) 0.0374: ICMP error on UDP socket 95: Connection refused unless the host is a resolver itself... but then we wouldn't find the address of the gateway in its /etc/resolv.conf, presumably. Fix this by making an exception for DNS traffic: if the default gateway is a resolver, match on DNS traffic going to the default gateway, and explicitly forward it to the configured resolver. 
Reported-by: Prafulla Giri <prafulla.giri@protonmail.com> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- conf.c | 16 ++++++++++------ passt.1 | 14 ++++++++++---- 2 files changed, 20 insertions(+), 10 deletions(-) diff --git a/conf.c b/conf.c index f5d04db..142dc94 100644 --- a/conf.c +++ b/conf.c @@ -426,10 +426,12 @@ static void add_dns_resolv(struct ctx *c, const char *nameserver, if (IN4_IS_ADDR_UNSPECIFIED(&c->ip4.dns_host)) c->ip4.dns_host = ns4; - /* Guest or container can only access local addresses via - * redirect + /* Special handling if guest or container can only access local + * addresses via redirect, or if the host gateway is also a + * resolver and we shadow its address */ - if (IN4_IS_ADDR_LOOPBACK(&ns4)) { + if (IN4_IS_ADDR_LOOPBACK(&ns4) || + IN4_ARE_ADDR_EQUAL(&ns4, &c->ip4.map_host_loopback)) { if (IN4_IS_ADDR_UNSPECIFIED(&c->ip4.map_host_loopback)) return; @@ -445,10 +447,12 @@ static void add_dns_resolv(struct ctx *c, const char *nameserver, if (IN6_IS_ADDR_UNSPECIFIED(&c->ip6.dns_host)) c->ip6.dns_host = ns6; - /* Guest or container can only access local addresses via - * redirect + /* Special handling if guest or container can only access local + * addresses via redirect, or if the host gateway is also a + * resolver and we shadow its address */ - if (IN6_IS_ADDR_LOOPBACK(&ns6)) { + if (IN6_IS_ADDR_LOOPBACK(&ns6) || + IN6_ARE_ADDR_EQUAL(&ns6, &c->ip6.map_host_loopback)) { if (IN6_IS_ADDR_UNSPECIFIED(&c->ip6.map_host_loopback)) return; diff --git a/passt.1 b/passt.1 index 2928af5..29cc3ed 100644 --- a/passt.1 +++ b/passt.1 @@ -941,10 +941,16 @@ with destination 127.0.0.10, and the default IPv4 gateway is 192.0.2.1, while the last observed source address from guest or namespace is 192.0.2.2, this will be translated to a connection from 192.0.2.1 to 192.0.2.2. 
-Similarly, for traffic coming from guest or namespace, packets with -destination address corresponding to the \fB\-\-map-host-loopback\fR -address will have their destination address translated to a loopback -address. +Similarly, for traffic coming from guest or namespace, packets with destination +address corresponding to the \fB\-\-map-host-loopback\fR address will have their +destination address translated to a loopback address. + +As an exception, traffic identified as DNS, originally directed to the +\fB\-\-map-host-loopback\fR address, if this address matches a resolver address +on the host, is \fBnot\fR translated to loopback, but rather handled in the same +way as if specified as \-\-dns-forward address, if no such option was given. +In the common case where the host gateway also acts a resolver, this avoids that +the host mapping shadows the gateway/resolver itself. .SS Handling of local traffic in pasta From 31e8109a86eeebb473ffba8124a3f399cf0aeccf Mon Sep 17 00:00:00 2001 From: Enrique Llorente <ellorent@redhat.com> Date: Fri, 7 Feb 2025 12:36:55 +0100 Subject: [PATCH 218/382] dhcp, dhcpv6: Add hostname and client fqdn ops Both DHCPv4 and DHCPv6 have the capability to pass the hostname to clients, the DHCPv4 uses option 12 (hostname) while the DHCPv6 uses option 39 (client fqdn), for some virt deployments like kubevirt it is expected to have the VirtualMachine name as the guest hostname.
This change add the following arguments: - -H --hostname NAME to configure the hostname DHCPv4 option(12) - --fqdn NAME to configure client fqdn option for both DHCPv4(81) and DHCPv6(39) Signed-off-by: Enrique Llorente <ellorent@redhat.com> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- conf.c | 20 ++++++++-- dhcp.c | 61 +++++++++++++++++++++++++---- dhcpv6.c | 99 ++++++++++++++++++++++++++++++++++++++++-------- passt.1 | 10 +++++ passt.h | 5 +++ pasta.c | 17 +++++++-- test/lib/setup | 10 ++--- test/passt.mbuto | 6 ++- test/passt/dhcp | 15 +++++++- util.c | 24 ++++++++++++ util.h | 6 +++ 11 files changed, 235 insertions(+), 38 deletions(-) diff --git a/conf.c b/conf.c index 142dc94..d9de07c 100644 --- a/conf.c +++ b/conf.c @@ -858,7 +858,9 @@ static void usage(const char *name, FILE *f, int status) FPRINTF(f, " default: use addresses from /etc/resolv.conf\n"); FPRINTF(f, " -S, --search LIST Space-separated list, search domains\n" - " a single, empty option disables the DNS search list\n"); + " a single, empty option disables the DNS search list\n" + " -H, --hostname NAME Hostname to configure client with\n" + " --fqdn NAME FQDN to configure client with\n"); if (strstr(name, "pasta")) FPRINTF(f, " default: don't use any search list\n"); else @@ -1316,6 +1318,7 @@ void conf(struct ctx *c, int argc, char **argv) {"outbound", required_argument, NULL, 'o' }, {"dns", required_argument, NULL, 'D' }, {"search", required_argument, NULL, 'S' }, + {"hostname", required_argument, NULL, 'H' }, {"no-tcp", no_argument, &c->no_tcp, 1 }, {"no-udp", no_argument, &c->no_udp, 1 }, {"no-icmp", no_argument, &c->no_icmp, 1 }, @@ -1360,6 +1363,7 @@ void conf(struct ctx *c, int argc, char **argv) /* vhost-user backend program convention */ {"print-capabilities", no_argument, NULL, 26 }, {"socket-path", required_argument, NULL, 's' }, + {"fqdn", required_argument, NULL, 27 }, { 0 }, }; const char *logname = (c->mode == MODE_PASTA) ? 
"pasta" : "passt"; @@ -1382,9 +1386,9 @@ void conf(struct ctx *c, int argc, char **argv) if (c->mode == MODE_PASTA) { c->no_dhcp_dns = c->no_dhcp_dns_search = 1; fwd_default = FWD_AUTO; - optstring = "+dqfel:hF:I:p:P:m:a:n:M:g:i:o:D:S:46t:u:T:U:"; + optstring = "+dqfel:hF:I:p:P:m:a:n:M:g:i:o:D:S:H:46t:u:T:U:"; } else { - optstring = "+dqfel:hs:F:p:P:m:a:n:M:g:i:o:D:S:461t:u:"; + optstring = "+dqfel:hs:F:p:P:m:a:n:M:g:i:o:D:S:H:461t:u:"; } c->tcp.fwd_in.mode = c->tcp.fwd_out.mode = FWD_UNSET; @@ -1561,6 +1565,11 @@ void conf(struct ctx *c, int argc, char **argv) case 26: vu_print_capabilities(); break; + case 27: + if (snprintf_check(c->fqdn, PASST_MAXDNAME, + "%s", optarg)) + die("Invalid FQDN: %s", optarg); + break; case 'd': c->debug = 1; c->quiet = 0; @@ -1730,6 +1739,11 @@ void conf(struct ctx *c, int argc, char **argv) die("Cannot use DNS search domain %s", optarg); break; + case 'H': + if (snprintf_check(c->hostname, PASST_MAXDNAME, + "%s", optarg)) + die("Invalid hostname: %s", optarg); + break; case '4': v4_only = true; v6_only = false; diff --git a/dhcp.c b/dhcp.c index 2a23ed4..401cb5b 100644 --- a/dhcp.c +++ b/dhcp.c @@ -63,6 +63,11 @@ static struct opt opts[255]; #define OPT_MIN 60 /* RFC 951 */ +/* Total option size (excluding end option) is 576 (RFC 2131), minus + * offset of options (268), minus end option and its length (2). 
+ */ +#define OPT_MAX 306 + /** * dhcp_init() - Initialise DHCP options */ @@ -122,7 +127,7 @@ struct msg { uint8_t sname[64]; uint8_t file[128]; uint32_t magic; - uint8_t o[308]; + uint8_t o[OPT_MAX + 2 /* End option and its length */ ]; } __attribute__((__packed__)); /** @@ -130,15 +135,28 @@ struct msg { * @m: Message to fill * @o: Option number * @offset: Current offset within options field, updated on insertion + * + * Return: false if m has space to write the option, true otherwise */ -static void fill_one(struct msg *m, int o, int *offset) +static bool fill_one(struct msg *m, int o, int *offset) { + size_t slen = opts[o].slen; + + /* If we don't have space to write the option, then just skip */ + if (*offset + 1 /* length of option */ + slen > OPT_MAX) + return true; + m->o[*offset] = o; - m->o[*offset + 1] = opts[o].slen; - memcpy(&m->o[*offset + 2], opts[o].s, opts[o].slen); + m->o[*offset + 1] = slen; + + /* Move to option */ + *offset += 2; + + memcpy(&m->o[*offset], opts[o].s, slen); opts[o].sent = 1; - *offset += 2 + opts[o].slen; + *offset += slen; + return false; } /** @@ -159,17 +177,20 @@ static int fill(struct msg *m) * Put it there explicitly, unless requested via option 55. 
*/ if (opts[55].clen > 0 && !memchr(opts[55].c, 53, opts[55].clen)) - fill_one(m, 53, &offset); + if (fill_one(m, 53, &offset)) + debug("DHCP: skipping option 53"); for (i = 0; i < opts[55].clen; i++) { o = opts[55].c[i]; if (opts[o].slen != -1) - fill_one(m, o, &offset); + if (fill_one(m, o, &offset)) + debug("DHCP: skipping option %i", o); } for (o = 0; o < 255; o++) { if (opts[o].slen != -1 && !opts[o].sent) - fill_one(m, o, &offset); + if (fill_one(m, o, &offset)) + debug("DHCP: skipping option %i", o); } m->o[offset++] = 255; @@ -411,6 +432,30 @@ int dhcp(const struct ctx *c, const struct pool *p) if (!opts[6].slen) opts[6].slen = -1; + opt_len = strlen(c->hostname); + if (opt_len > 0) { + opts[12].slen = opt_len; + memcpy(opts[12].s, &c->hostname, opt_len); + } + + opt_len = strlen(c->fqdn); + if (opt_len > 0) { + opt_len += 3 /* flags */ + + 2; /* Length byte for first label, and terminator */ + + if (sizeof(opts[81].s) >= opt_len) { + opts[81].s[0] = 0x4; /* flags (E) */ + opts[81].s[1] = 0xff; /* RCODE1 */ + opts[81].s[2] = 0xff; /* RCODE2 */ + + encode_domain_name((char *)opts[81].s + 3, c->fqdn); + + opts[81].slen = opt_len; + } else { + debug("DHCP: client FQDN option doesn't fit, skipping"); + } + } + if (!c->no_dhcp_dns_search) opt_set_dns_search(c, sizeof(m->o)); diff --git a/dhcpv6.c b/dhcpv6.c index 0523bba..373a988 100644 --- a/dhcpv6.c +++ b/dhcpv6.c @@ -48,6 +48,7 @@ struct opt_hdr { # define STATUS_NOTONLINK htons_constant(4) # define OPT_DNS_SERVERS htons_constant(23) # define OPT_DNS_SEARCH htons_constant(24) +# define OPT_CLIENT_FQDN htons_constant(39) #define STR_NOTONLINK "Prefix not appropriate for link." 
uint16_t l; @@ -58,6 +59,9 @@ struct opt_hdr { sizeof(struct opt_hdr)) #define OPT_VSIZE(x) (sizeof(struct opt_##x) - \ sizeof(struct opt_hdr)) +#define OPT_MAX_SIZE IPV6_MIN_MTU - (sizeof(struct ipv6hdr) + \ + sizeof(struct udphdr) + \ + sizeof(struct msg_hdr)) /** * struct opt_client_id - DHCPv6 Client Identifier option @@ -163,6 +167,18 @@ struct opt_dns_search { char list[MAXDNSRCH * NS_MAXDNAME]; } __attribute__((packed)); +/** + * struct opt_client_fqdn - Client FQDN option (RFC 4704) + * @hdr: Option header + * @flags: Flags described by RFC 4704 + * @domain_name: Client FQDN + */ +struct opt_client_fqdn { + struct opt_hdr hdr; + uint8_t flags; + char domain_name[PASST_MAXDNAME]; +} __attribute__((packed)); + /** * struct msg_hdr - DHCPv6 client/server message header * @type: DHCP message type @@ -193,6 +209,7 @@ struct msg_hdr { * @client_id: Client Identifier, variable length * @dns_servers: DNS Recursive Name Server, here just for storage size * @dns_search: Domain Search List, here just for storage size + * @client_fqdn: Client FQDN, variable length */ static struct resp_t { struct msg_hdr hdr; @@ -203,6 +220,7 @@ static struct resp_t { struct opt_client_id client_id; struct opt_dns_servers dns_servers; struct opt_dns_search dns_search; + struct opt_client_fqdn client_fqdn; } __attribute__((__packed__)) resp = { { 0 }, SERVER_ID, @@ -228,6 +246,10 @@ static struct resp_t { { { OPT_DNS_SEARCH, 0, }, { 0 }, }, + + { { OPT_CLIENT_FQDN, 0, }, + 0, { 0 }, + }, }; static const struct opt_status_code sc_not_on_link = { @@ -346,7 +368,6 @@ static size_t dhcpv6_dns_fill(const struct ctx *c, char *buf, int offset) { struct opt_dns_servers *srv = NULL; struct opt_dns_search *srch = NULL; - char *p = NULL; int i; if (c->no_dhcp_dns) @@ -383,34 +404,81 @@ search: if (!name_len) continue; + name_len += 2; /* Length byte for first label, and terminator */ + if (name_len > + NS_MAXDNAME + 1 /* Length byte for first label */ || + name_len > 255) { + debug("DHCP: DNS 
search name '%s' too long, skipping", + c->dns_search[i].n); + continue; + } + if (!srch) { srch = (struct opt_dns_search *)(buf + offset); offset += sizeof(struct opt_hdr); srch->hdr.t = OPT_DNS_SEARCH; srch->hdr.l = 0; - p = srch->list; } - *p = '.'; - p = stpncpy(p + 1, c->dns_search[i].n, name_len); - p++; - srch->hdr.l += name_len + 2; - offset += name_len + 2; + encode_domain_name(buf + offset, c->dns_search[i].n); + + srch->hdr.l += name_len; + offset += name_len; + } - if (srch) { - for (i = 0; i < srch->hdr.l; i++) { - if (srch->list[i] == '.') { - srch->list[i] = strcspn(srch->list + i + 1, - "."); - } - } + if (srch) srch->hdr.l = htons(srch->hdr.l); - } return offset; } +/** + * dhcpv6_client_fqdn_fill() - Fill in client FQDN option + * @c: Execution context + * @buf: Response message buffer where options will be appended + * @offset: Offset in message buffer for new options + * + * Return: updated length of response message buffer. + */ +static size_t dhcpv6_client_fqdn_fill(const struct pool *p, const struct ctx *c, + char *buf, int offset) + +{ + struct opt_client_fqdn const *req_opt; + struct opt_client_fqdn *o; + size_t opt_len; + + opt_len = strlen(c->fqdn); + if (opt_len == 0) { + return offset; + } + + opt_len += 2; /* Length byte for first label, and terminator */ + if (opt_len > OPT_MAX_SIZE - (offset + + sizeof(struct opt_hdr) + + 1 /* flags */ )) { + debug("DHCPv6: client FQDN option doesn't fit, skipping"); + return offset; + } + + o = (struct opt_client_fqdn *)(buf + offset); + encode_domain_name(o->domain_name, c->fqdn); + req_opt = (struct opt_client_fqdn *)dhcpv6_opt(p, &(size_t){ 0 }, + OPT_CLIENT_FQDN); + if (req_opt && req_opt->flags & 0x01 /* S flag */) + o->flags = 0x02 /* O flag */; + else + o->flags = 0x00; + + opt_len++; + + o->hdr.t = OPT_CLIENT_FQDN; + o->hdr.l = htons(opt_len); + + return offset + sizeof(struct opt_hdr) + opt_len; +} + /** * dhcpv6() - Check if this is a DHCPv6 message, reply as needed * @c: Execution context 
@@ -544,6 +612,7 @@ int dhcpv6(struct ctx *c, const struct pool *p, n = offsetof(struct resp_t, client_id) + sizeof(struct opt_hdr) + ntohs(client_id->l); n = dhcpv6_dns_fill(c, (char *)&resp, n); + n = dhcpv6_client_fqdn_fill(p, c, (char *)&resp, n); resp.hdr.xid = mh->xid; diff --git a/passt.1 b/passt.1 index 29cc3ed..9d347d8 100644 --- a/passt.1 +++ b/passt.1 @@ -401,6 +401,16 @@ Enable IPv6-only operation. IPv4 traffic will be ignored. By default, IPv4 operation is enabled as long as at least an IPv4 route and an interface address are configured on a given host interface. +.TP +.BR \-H ", " \-\-hostname " " \fIname +Hostname to configure the client with. +Send \fIname\fR as DHCP option 12 (hostname). + +.TP +.BR \-\-fqdn " " \fIname +FQDN to configure the client with. +Send \fIname\fR as Client FQDN: DHCP option 81 and DHCPv6 option 39. + .SS \fBpasst\fR-only options .TP diff --git a/passt.h b/passt.h index 0dd4efa..f3151f0 100644 --- a/passt.h +++ b/passt.h @@ -209,6 +209,8 @@ struct ip6_ctx { * @ifi4: Template interface for IPv4, -1: none, 0: IPv4 disabled * @ip: IPv4 configuration * @dns_search: DNS search list + * @hostname: Guest hostname + * @fqdn: Guest FQDN * @ifi6: Template interface for IPv6, -1: none, 0: IPv6 disabled * @ip6: IPv6 configuration * @pasta_ifn: Name of namespace interface for pasta @@ -269,6 +271,9 @@ struct ctx { struct fqdn dns_search[MAXDNSRCH]; + char hostname[PASST_MAXDNAME]; + char fqdn[PASST_MAXDNAME]; + int ifi6; struct ip6_ctx ip6; diff --git a/pasta.c b/pasta.c index f15084d..585a51c 100644 --- a/pasta.c +++ b/pasta.c @@ -169,10 +169,12 @@ void pasta_open_ns(struct ctx *c, const char *netns) * struct pasta_spawn_cmd_arg - Argument for pasta_spawn_cmd() * @exe: Executable to run * @argv: Command and arguments to run + * @ctx: Context to read config from */ struct pasta_spawn_cmd_arg { const char *exe; char *const *argv; + struct ctx *c; }; /** @@ -186,6 +188,7 @@ static int pasta_spawn_cmd(void *arg) { char 
hostname[HOST_NAME_MAX + 1] = HOSTNAME_PREFIX; const struct pasta_spawn_cmd_arg *a; + size_t conf_hostname_len; sigset_t set; /* We run in a detached PID and mount namespace: mount /proc over */ @@ -195,9 +198,15 @@ static int pasta_spawn_cmd(void *arg) if (write_file("/proc/sys/net/ipv4/ping_group_range", "0 0")) warn("Cannot set ping_group_range, ICMP requests might fail"); - if (!gethostname(hostname + sizeof(HOSTNAME_PREFIX) - 1, - HOST_NAME_MAX + 1 - sizeof(HOSTNAME_PREFIX)) || - errno == ENAMETOOLONG) { + a = (const struct pasta_spawn_cmd_arg *)arg; + + conf_hostname_len = strlen(a->c->hostname); + if (conf_hostname_len > 0) { + if (sethostname(a->c->hostname, conf_hostname_len)) + warn("Unable to set configured hostname"); + } else if (!gethostname(hostname + sizeof(HOSTNAME_PREFIX) - 1, + HOST_NAME_MAX + 1 - sizeof(HOSTNAME_PREFIX)) || + errno == ENAMETOOLONG) { hostname[HOST_NAME_MAX] = '\0'; if (sethostname(hostname, strlen(hostname))) warn("Unable to set pasta-prefixed hostname"); @@ -208,7 +217,6 @@ static int pasta_spawn_cmd(void *arg) sigaddset(&set, SIGUSR1); sigwaitinfo(&set, NULL); - a = (const struct pasta_spawn_cmd_arg *)arg; execvp(a->exe, a->argv); die_perror("Failed to start command or shell"); @@ -230,6 +238,7 @@ void pasta_start_ns(struct ctx *c, uid_t uid, gid_t gid, struct pasta_spawn_cmd_arg arg = { .exe = argv[0], .argv = argv, + .c = c, }; char uidmap[BUFSIZ], gidmap[BUFSIZ]; char *sh_argv[] = { NULL, NULL }; diff --git a/test/lib/setup b/test/lib/setup index 580825f..ee67152 100755 --- a/test/lib/setup +++ b/test/lib/setup @@ -49,7 +49,7 @@ setup_passt() { context_run passt "make clean" context_run passt "make valgrind" - context_run_bg passt "valgrind --max-stackframe=$((4 * 1024 * 1024)) --trace-children=yes --vgdb=no --error-exitcode=1 --suppressions=test/valgrind.supp ./passt ${__opts} -s ${STATESETUP}/passt.socket -f -t 10001 -u 10001 -P ${STATESETUP}/passt.pid" + context_run_bg passt "valgrind --max-stackframe=$((4 * 1024 * 
1024)) --trace-children=yes --vgdb=no --error-exitcode=1 --suppressions=test/valgrind.supp ./passt ${__opts} -s ${STATESETUP}/passt.socket -f -t 10001 -u 10001 -H hostname1 --fqdn fqdn1.passt.test -P ${STATESETUP}/passt.pid" # pidfile isn't created until passt is listening wait_for [ -f "${STATESETUP}/passt.pid" ] @@ -160,11 +160,11 @@ setup_passt_in_ns() { if [ ${VALGRIND} -eq 1 ]; then context_run passt "make clean" context_run passt "make valgrind" - context_run_bg passt "valgrind --max-stackframe=$((4 * 1024 * 1024)) --trace-children=yes --vgdb=no --error-exitcode=1 --suppressions=test/valgrind.supp ./passt -f ${__opts} -s ${STATESETUP}/passt.socket -t 10001,10011,10021,10031 -u 10001,10011,10021,10031 -P ${STATESETUP}/passt.pid --map-host-loopback ${__map_ns4} --map-host-loopback ${__map_ns6}" + context_run_bg passt "valgrind --max-stackframe=$((4 * 1024 * 1024)) --trace-children=yes --vgdb=no --error-exitcode=1 --suppressions=test/valgrind.supp ./passt -f ${__opts} -s ${STATESETUP}/passt.socket -H hostname1 --fqdn fqdn1.passt.test -t 10001,10011,10021,10031 -u 10001,10011,10021,10031 -P ${STATESETUP}/passt.pid --map-host-loopback ${__map_ns4} --map-host-loopback ${__map_ns6}" else context_run passt "make clean" context_run passt "make" - context_run_bg passt "./passt -f ${__opts} -s ${STATESETUP}/passt.socket -t 10001,10011,10021,10031 -u 10001,10011,10021,10031 -P ${STATESETUP}/passt.pid --map-host-loopback ${__map_ns4} --map-host-loopback ${__map_ns6}" + context_run_bg passt "./passt -f ${__opts} -s ${STATESETUP}/passt.socket -H hostname1 --fqdn fqdn1.passt.test -t 10001,10011,10021,10031 -u 10001,10011,10021,10031 -P ${STATESETUP}/passt.pid --map-host-loopback ${__map_ns4} --map-host-loopback ${__map_ns6}" fi wait_for [ -f "${STATESETUP}/passt.pid" ] @@ -243,7 +243,7 @@ setup_two_guests() { [ ${TRACE} -eq 1 ] && __opts="${__opts} --trace" [ ${VHOST_USER} -eq 1 ] && __opts="${__opts} --vhost-user" - context_run_bg passt_1 "./passt -s 
${STATESETUP}/passt_1.socket -P ${STATESETUP}/passt_1.pid -f ${__opts} -t 10001 -u 10001" + context_run_bg passt_1 "./passt -s ${STATESETUP}/passt_1.socket -P ${STATESETUP}/passt_1.pid -f ${__opts} --fqdn fqdn1.passt.test -H hostname1 -t 10001 -u 10001" wait_for [ -f "${STATESETUP}/passt_1.pid" ] __opts= @@ -252,7 +252,7 @@ setup_two_guests() { [ ${TRACE} -eq 1 ] && __opts="${__opts} --trace" [ ${VHOST_USER} -eq 1 ] && __opts="${__opts} --vhost-user" - context_run_bg passt_2 "./passt -s ${STATESETUP}/passt_2.socket -P ${STATESETUP}/passt_2.pid -f ${__opts} -t 10004 -u 10004" + context_run_bg passt_2 "./passt -s ${STATESETUP}/passt_2.socket -P ${STATESETUP}/passt_2.pid -f ${__opts} --hostname hostname2 --fqdn fqdn2 -t 10004 -u 10004" wait_for [ -f "${STATESETUP}/passt_2.pid" ] __vmem="$((${MEM_KIB} / 1024 / 4))" diff --git a/test/passt.mbuto b/test/passt.mbuto index d4d57cb..e45a284 100755 --- a/test/passt.mbuto +++ b/test/passt.mbuto @@ -13,7 +13,7 @@ PROGS="${PROGS:-ash,dash,bash ip mount ls insmod mkdir ln cat chmod lsmod modprobe find grep mknod mv rm umount jq iperf3 dhclient hostname sed tr chown sipcalc cut socat dd strace ping tail killall sleep sysctl - nproc tcp_rr tcp_crr udp_rr which tee seq bc sshd ssh-keygen cmp tcpdump}" + nproc tcp_rr tcp_crr udp_rr which tee seq bc sshd ssh-keygen cmp tcpdump env}" # OpenSSH 9.8 introduced split binaries, with sshd being the daemon, and # sshd-session the per-session program. 
We need the latter as well, and the path @@ -41,6 +41,7 @@ FIXUP="${FIXUP}"' #!/bin/sh LOG=/var/log/dhclient-script.log echo \${reason} \${interface} >> \$LOG +env >> \$LOG set >> \$LOG [ -n "\${new_interface_mtu}" ] && ip link set dev \${interface} mtu \${new_interface_mtu} @@ -54,7 +55,8 @@ set >> \$LOG [ -n "\${new_ip6_address}" ] && ip addr add \${new_ip6_address}/\${new_ip6_prefixlen} dev \${interface} [ -n "\${new_dhcp6_name_servers}" ] && for d in \${new_dhcp6_name_servers}; do echo "nameserver \${d}%\${interface}" >> /etc/resolv.conf; done [ -n "\${new_dhcp6_domain_search}" ] && (printf "search"; for d in \${new_dhcp6_domain_search}; do printf " %s" "\${d}"; done; printf "\n") >> /etc/resolv.conf -[ -n "\${new_host_name}" ] && hostname "\${new_host_name}" +[ -n "\${new_host_name}" ] && echo "\${new_host_name}" > /tmp/new_host_name +[ -n "\${new_fqdn_fqdn}" ] && echo "\${new_fqdn_fqdn}" > /tmp/new_fqdn_fqdn exit 0 EOF chmod 755 /sbin/dhclient-script diff --git a/test/passt/dhcp b/test/passt/dhcp index 9925ab9..145f1ba 100644 --- a/test/passt/dhcp +++ b/test/passt/dhcp @@ -11,7 +11,7 @@ # Copyright (c) 2021 Red Hat GmbH # Author: Stefano Brivio <sbrivio@redhat.com> -gtools ip jq dhclient sed tr +gtools ip jq dhclient sed tr hostname htools ip jq sed tr head test Interface name @@ -47,7 +47,16 @@ gout SEARCH sed 's/\. / /g' /etc/resolv.conf | sed 's/\.$//g' | sed -n 's/^searc hout HOST_SEARCH sed 's/\. 
/ /g' /etc/resolv.conf | sed 's/\.$//g' | sed -n 's/^search \(.*\)/\1/p' | tr ' \n' ',' | sed 's/,$//;s/$/\n/' check [ "__SEARCH__" = "__HOST_SEARCH__" ] +test DHCP: Hostname +gout NEW_HOST_NAME cat /tmp/new_host_name +check [ "__NEW_HOST_NAME__" = "hostname1" ] + +test DHCP: Client FQDN +gout NEW_FQDN_FQDN cat /tmp/new_fqdn_fqdn +check [ "__NEW_FQDN_FQDN__" = "fqdn1.passt.test" ] + test DHCPv6: address +guest rm /tmp/new_fqdn_fqdn guest /sbin/dhclient -6 __IFNAME__ # Wait for DAD to complete guest while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done @@ -70,3 +79,7 @@ test DHCPv6: search list gout SEARCH6 sed 's/\. / /g' /etc/resolv.conf | sed 's/\.$//g' | sed -n 's/^search \(.*\)/\1/p' | tr ' \n' ',' | sed 's/,$//;s/$/\n/' hout HOST_SEARCH6 sed 's/\. / /g' /etc/resolv.conf | sed 's/\.$//g' | sed -n 's/^search \(.*\)/\1/p' | tr ' \n' ',' | sed 's/,$//;s/$/\n/' check [ "__SEARCH6__" = "__HOST_SEARCH6__" ] + +test DHCPv6: Hostname +gout NEW_FQDN_FQDN cat /tmp/new_fqdn_fqdn +check [ "__NEW_FQDN_FQDN__" = "fqdn1.passt.test" ] diff --git a/util.c b/util.c index 4d51e04..ba33866 100644 --- a/util.c +++ b/util.c @@ -930,4 +930,28 @@ void raw_random(void *buf, size_t buflen) void epoll_del(const struct ctx *c, int fd) { epoll_ctl(c->epollfd, EPOLL_CTL_DEL, fd, NULL); + +} + +/** + * encode_domain_name() - Encode domain name according to RFC 1035, section 3.1 + * @buf: Buffer to fill in with encoded domain name + * @domain_name: Input domain name string with terminator + * + * The buffer's 'buf' size has to be >= strlen(domain_name) + 2 + */ +void encode_domain_name(char *buf, const char *domain_name) +{ + size_t i; + char *p; + + buf[0] = strcspn(domain_name, "."); + p = buf + 1; + for (i = 0; domain_name[i]; i++) { + if (domain_name[i] == '.') + p[i] = strcspn(domain_name + i + 1, "."); + else + p[i] = domain_name[i]; + } + p[i] = 0L; } diff --git a/util.h b/util.h index 23b165c..9c92a37 100644 --- a/util.h +++ b/util.h @@ -40,6 +40,9 @@ #ifndef 
IP_MAX_MTU #define IP_MAX_MTU USHRT_MAX #endif +#ifndef IPV6_MIN_MTU +#define IPV6_MIN_MTU 1280 +#endif #ifndef MIN #define MIN(x, y) (((x) < (y)) ? (x) : (y)) @@ -352,4 +355,7 @@ static inline int wrap_accept4(int sockfd, struct sockaddr *addr, #define accept4(s, addr, addrlen, flags) \ wrap_accept4((s), (addr), (addrlen), (flags)) +#define PASST_MAXDNAME 254 /* 253 (RFC 1035) + 1 (the terminator) */ +void encode_domain_name(char *buf, const char *domain_name); + #endif /* UTIL_H */ From 472e2e930f6e17d9d8664d6cf44c47af1db58bb3 Mon Sep 17 00:00:00 2001 From: Stefano Brivio <sbrivio@redhat.com> Date: Tue, 11 Feb 2025 20:11:00 +0100 Subject: [PATCH 219/382] tcp: Don't discard window information on keep-alive segments It looks like a detail, but it's critical if we're dealing with somebody, such as near-future self, using TCP_REPAIR to migrate TCP connections in the guest or container. The last packet sent from the 'source' process/guest/container typically reports a small window, or zero, because the guest/container hadn't been draining it for a while. The next packet, appearing as the target sets TCP_REPAIR_OFF on the migrated socket, is a keep-alive (also called "window probe" in CRIU or TCP_REPAIR-related code), and it comes with an updated window value, reflecting the pre-migration "regular" value. If we ignore it, it might take a while/forever before we realise we can actually restart sending. 
Fixes: 238c69f9af45 ("tcp: Acknowledge keep-alive segments, ignore them for the rest") Signed-off-by: Stefano Brivio <sbrivio@redhat.com> Reviewed-by: David Gibson <david@gibson.dropbear.id.au> --- tcp.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tcp.c b/tcp.c index af6bd95..2addf4a 100644 --- a/tcp.c +++ b/tcp.c @@ -1664,8 +1664,10 @@ static int tcp_data_from_tap(const struct ctx *c, struct tcp_tap_conn *conn, tcp_send_flag(c, conn, ACK); tcp_timer_ctl(c, conn); - if (p->count == 1) + if (p->count == 1) { + tcp_tap_window_update(conn, ntohs(th->window)); return 1; + } continue; } From 90f91fe72673e36c8e071a1750e9c03deb20ab0f Mon Sep 17 00:00:00 2001 From: Stefano Brivio <sbrivio@redhat.com> Date: Tue, 11 Feb 2025 20:19:05 +0100 Subject: [PATCH 220/382] tcp: Implement conservative zero-window probe on ACK timeout This probably doesn't cover all the cases where we should send a zero-window probe, but it's rather unobtrusive and obvious, so start from here, also because I just observed this case (without the fix from the previous patch, to take into account window information from keep-alive segments). If we hit the ACK timeout, and try re-sending data from the socket, if the window is zero, we'll just fail again, go back to the timer, and so on, until we hit the maximum number of re-transmissions and reset the connection. Don't do that: forcibly try to send something by implementing the equivalent of a zero-window probe in this case. 
Signed-off-by: Stefano Brivio <sbrivio@redhat.com> Reviewed-by: David Gibson <david@gibson.dropbear.id.au> --- tcp.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tcp.c b/tcp.c index 2addf4a..b87478f 100644 --- a/tcp.c +++ b/tcp.c @@ -2175,6 +2175,8 @@ void tcp_timer_handler(const struct ctx *c, union epoll_ref ref) flow_dbg(conn, "ACK timeout, retry"); conn->retrans++; conn->seq_to_tap = conn->seq_ack_from_tap; + if (!conn->wnd_from_tap) + conn->wnd_from_tap = 1; /* Zero-window probe */ if (tcp_set_peek_offset(conn->sock, 0)) { tcp_rst(c, conn); } else { From def7de4690ddb40f7c3b29e6ca81d30e9409fb5d Mon Sep 17 00:00:00 2001 From: Laurent Vivier <lvivier@redhat.com> Date: Tue, 11 Feb 2025 20:43:32 +0100 Subject: [PATCH 221/382] tcp_vu: Fix off-by one in header count array adjustment head_cnt represents the number of frames we're going to forward to the guest in tcp_vu_sock_recv(), each of which could require multiple buffers ("elements"). We initialise it with as many frames as we can find space for in vu buffers, and we then need to adjust it down to the number of frames we actually (partially) filled. We adjust it down based on number of individual buffers used by the data from recvmsg(). At this point 'i' is *one greater than* that number of buffers, so we need to discard all (unused) frames with a buffer index >= i, instead of > i. 
Reported-by: David Gibson <david@gibson.dropbear.id.au> [david: Contributed actual commit message] Reviewed-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- tcp_vu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tcp_vu.c b/tcp_vu.c index fad7065..0622f17 100644 --- a/tcp_vu.c +++ b/tcp_vu.c @@ -261,7 +261,7 @@ static ssize_t tcp_vu_sock_recv(const struct ctx *c, len -= iov->iov_len; } /* adjust head count */ - while (head_cnt > 0 && head[head_cnt - 1] > i) + while (head_cnt > 0 && head[head_cnt - 1] >= i) head_cnt--; /* mark end of array */ head[head_cnt] = i; From 836fe215e049ee423750d3315a02742d8224eab2 Mon Sep 17 00:00:00 2001 From: Stefano Brivio <sbrivio@redhat.com> Date: Wed, 12 Feb 2025 01:07:33 +0100 Subject: [PATCH 222/382] passt-repair: Fix off-by-one in check for number of file descriptors Actually, 254 is too many, but 253 isn't. Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- passt-repair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/passt-repair.c b/passt-repair.c index 614cee0..1174ae3 100644 --- a/passt-repair.c +++ b/passt-repair.c @@ -131,7 +131,7 @@ loop: /* No inverse formula for CMSG_LEN(x), and building one with CMSG_LEN(0) * works but there's no guarantee it does. Search the whole domain. */ - for (i = 1; i < SCM_MAX_FD; i++) { + for (i = 1; i <= SCM_MAX_FD; i++) { if (CMSG_LEN(sizeof(int) * i) == cmsg->cmsg_len) { n = i; break; From 5911e08c0f53e46547e7eeb1dd824c8ab96e512e Mon Sep 17 00:00:00 2001 From: Stefano Brivio <sbrivio@redhat.com> Date: Wed, 12 Feb 2025 18:07:13 +1100 Subject: [PATCH 223/382] migrate: Skeleton of live migration logic Introduce facilities for guest migration on top of vhost-user infrastructure. Add migration facilities based on top of the current vhost-user infrastructure, moving vu_migrate() and related functions to migrate.c. 
Versioned migration stages define function pointers to be called on source or target, or data sections that need to be transferred. The migration header consists of a magic number, a version number for the encoding, and a "compat_version" which represents the oldest version which is compatible with the current one. We don't use it yet, but that allows for the future possibility of backwards compatible protocol extensions. Co-authored-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- Makefile | 12 +-- epoll_type.h | 2 - migrate.c | 214 +++++++++++++++++++++++++++++++++++++++++++++++++++ migrate.h | 51 ++++++++++++ passt.c | 8 +- passt.h | 8 ++ util.h | 29 +++++++ vhost_user.c | 60 +++------------ virtio.h | 4 - vu_common.c | 49 +----------- vu_common.h | 2 +- 11 files changed, 324 insertions(+), 115 deletions(-) create mode 100644 migrate.c create mode 100644 migrate.h diff --git a/Makefile b/Makefile index d3d4b78..be89b07 100644 --- a/Makefile +++ b/Makefile @@ -38,8 +38,8 @@ FLAGS += -DDUAL_STACK_SOCKETS=$(DUAL_STACK_SOCKETS) PASST_SRCS = arch.c arp.c checksum.c conf.c dhcp.c dhcpv6.c flow.c fwd.c \ icmp.c igmp.c inany.c iov.c ip.c isolation.c lineread.c log.c mld.c \ - ndp.c netlink.c packet.c passt.c pasta.c pcap.c pif.c tap.c tcp.c \ - tcp_buf.c tcp_splice.c tcp_vu.c udp.c udp_flow.c udp_vu.c util.c \ + ndp.c netlink.c migrate.c packet.c passt.c pasta.c pcap.c pif.c tap.c \ + tcp.c tcp_buf.c tcp_splice.c tcp_vu.c udp.c udp_flow.c udp_vu.c util.c \ vhost_user.c virtio.c vu_common.c QRAP_SRCS = qrap.c PASST_REPAIR_SRCS = passt-repair.c @@ -49,10 +49,10 @@ MANPAGES = passt.1 pasta.1 qrap.1 passt-repair.1 PASST_HEADERS = arch.h arp.h checksum.h conf.h dhcp.h dhcpv6.h flow.h fwd.h \ flow_table.h icmp.h icmp_flow.h inany.h iov.h ip.h isolation.h \ - lineread.h log.h ndp.h netlink.h packet.h passt.h pasta.h pcap.h pif.h \ - siphash.h tap.h tcp.h tcp_buf.h 
tcp_conn.h tcp_internal.h tcp_splice.h \ - tcp_vu.h udp.h udp_flow.h udp_internal.h udp_vu.h util.h vhost_user.h \ - virtio.h vu_common.h + lineread.h log.h migrate.h ndp.h netlink.h packet.h passt.h pasta.h \ + pcap.h pif.h siphash.h tap.h tcp.h tcp_buf.h tcp_conn.h tcp_internal.h \ + tcp_splice.h tcp_vu.h udp.h udp_flow.h udp_internal.h udp_vu.h util.h \ + vhost_user.h virtio.h vu_common.h HEADERS = $(PASST_HEADERS) seccomp.h C := \#include <sys/random.h>\nint main(){int a=getrandom(0, 0, 0);} diff --git a/epoll_type.h b/epoll_type.h index fd9eac3..f3ef415 100644 --- a/epoll_type.h +++ b/epoll_type.h @@ -40,8 +40,6 @@ enum epoll_type { EPOLL_TYPE_VHOST_CMD, /* vhost-user kick event socket */ EPOLL_TYPE_VHOST_KICK, - /* vhost-user migration socket */ - EPOLL_TYPE_VHOST_MIGRATION, EPOLL_NUM_TYPES, }; diff --git a/migrate.c b/migrate.c new file mode 100644 index 0000000..aeac872 --- /dev/null +++ b/migrate.c @@ -0,0 +1,214 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +/* PASST - Plug A Simple Socket Transport + * for qemu/UNIX domain socket mode + * + * PASTA - Pack A Subtle Tap Abstraction + * for network namespace/tap device mode + * + * migrate.c - Migration sections, layout, and routines + * + * Copyright (c) 2025 Red Hat GmbH + * Author: Stefano Brivio <sbrivio@redhat.com> + */ + +#include <errno.h> +#include <sys/uio.h> + +#include "util.h" +#include "ip.h" +#include "passt.h" +#include "inany.h" +#include "flow.h" +#include "flow_table.h" + +#include "migrate.h" + +/* Magic identifier for migration data */ +#define MIGRATE_MAGIC 0xB1BB1D1B0BB1D1B0 + +/* Stages for version 1 */ +static const struct migrate_stage stages_v1[] = { + { 0 }, +}; + +/* Supported encoding versions, from latest (most preferred) to oldest */ +static const struct migrate_version versions[] = { + { 1, stages_v1, }, + { 0 }, +}; + +/* Current encoding version */ +#define CURRENT_VERSION (&versions[0]) + +/** + * migrate_source() - Migration as source, send state to hypervisor + * 
@c: Execution context + * @fd: File descriptor for state transfer + * + * Return: 0 on success, positive error code on failure + */ +static int migrate_source(struct ctx *c, int fd) +{ + const struct migrate_version *v = CURRENT_VERSION; + const struct migrate_header header = { + .magic = htonll_constant(MIGRATE_MAGIC), + .version = htonl(v->id), + .compat_version = htonl(v->id), + }; + const struct migrate_stage *s; + int ret; + + if (write_all_buf(fd, &header, sizeof(header))) { + ret = errno; + err("Can't send migration header: %s, abort", strerror_(ret)); + return ret; + } + + for (s = v->s; s->name; s++) { + if (!s->source) + continue; + + debug("Source side migration stage: %s", s->name); + + if ((ret = s->source(c, s, fd))) { + err("Source migration stage: %s: %s, abort", s->name, + strerror_(ret)); + return ret; + } + } + + return 0; +} + +/** + * migrate_target_read_header() - Read header in target + * @fd: Descriptor for state transfer + * + * Return: version structure on success, NULL on failure with errno set + */ +static const struct migrate_version *migrate_target_read_header(int fd) +{ + const struct migrate_version *v; + struct migrate_header h; + uint32_t id, compat_id; + + if (read_all_buf(fd, &h, sizeof(h))) + return NULL; + + id = ntohl(h.version); + compat_id = ntohl(h.compat_version); + + debug("Source magic: 0x%016" PRIx64 ", version: %u, compat: %u", + ntohll(h.magic), id, compat_id); + + if (ntohll(h.magic) != MIGRATE_MAGIC || !id || !compat_id) { + err("Invalid incoming device state"); + errno = EINVAL; + return NULL; + } + + for (v = versions; v->id; v++) + if (v->id <= id && v->id >= compat_id) + return v; + + errno = ENOTSUP; + err("Unsupported device state version: %u", id); + return NULL; +} + +/** + * migrate_target() - Migration as target, receive state from hypervisor + * @c: Execution context + * @fd: File descriptor for state transfer + * + * Return: 0 on success, positive error code on failure + */ +static int 
migrate_target(struct ctx *c, int fd) +{ + const struct migrate_version *v; + const struct migrate_stage *s; + int ret; + + if (!(v = migrate_target_read_header(fd))) + return errno; + + for (s = v->s; s->name; s++) { + if (!s->target) + continue; + + debug("Target side migration stage: %s", s->name); + + if ((ret = s->target(c, s, fd))) { + err("Target migration stage: %s: %s, abort", s->name, + strerror_(ret)); + return ret; + } + } + + return 0; +} + +/** + * migrate_init() - Set up things necessary for migration + * @c: Execution context + */ +void migrate_init(struct ctx *c) +{ + c->device_state_result = -1; +} + +/** + * migrate_close() - Close migration channel + * @c: Execution context + */ +void migrate_close(struct ctx *c) +{ + if (c->device_state_fd != -1) { + debug("Closing migration channel, fd: %d", c->device_state_fd); + close(c->device_state_fd); + c->device_state_fd = -1; + c->device_state_result = -1; + } +} + +/** + * migrate_request() - Request a migration of device state + * @c: Execution context + * @fd: fd to transfer state + * @target: Are we the target of the migration? 
+ */ +void migrate_request(struct ctx *c, int fd, bool target) +{ + debug("Migration requested, fd: %d (was %d)", fd, c->device_state_fd); + + if (c->device_state_fd != -1) + migrate_close(c); + + c->device_state_fd = fd; + c->migrate_target = target; +} + +/** + * migrate_handler() - Send/receive passt internal state to/from hypervisor + * @c: Execution context + */ +void migrate_handler(struct ctx *c) +{ + int rc; + + if (c->device_state_fd < 0) + return; + + debug("Handling migration request from fd: %d, target: %d", + c->device_state_fd, c->migrate_target); + + if (c->migrate_target) + rc = migrate_target(c, c->device_state_fd); + else + rc = migrate_source(c, c->device_state_fd); + + migrate_close(c); + + c->device_state_result = rc; +} diff --git a/migrate.h b/migrate.h new file mode 100644 index 0000000..2c51cd9 --- /dev/null +++ b/migrate.h @@ -0,0 +1,51 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later + * Copyright (c) 2025 Red Hat GmbH + * Author: Stefano Brivio <sbrivio@redhat.com> + */ + +#ifndef MIGRATE_H +#define MIGRATE_H + +/** + * struct migrate_header - Migration header from source + * @magic: 0xB1BB1D1B0BB1D1B0, network order + * @version: Highest known, target aborts if too old, network order + * @compat_version: Lowest version compatible with @version, target aborts + * if too new, network order + */ +struct migrate_header { + uint64_t magic; + uint32_t version; + uint32_t compat_version; +} __attribute__((packed)); + +/** + * struct migrate_stage - Callbacks and parameters for one stage of migration + * @name: Stage name (for debugging) + * @source: Callback to implement this stage on the source + * @target: Callback to implement this stage on the target + */ +struct migrate_stage { + const char *name; + int (*source)(struct ctx *c, const struct migrate_stage *stage, int fd); + int (*target)(struct ctx *c, const struct migrate_stage *stage, int fd); + + /* Add here separate rollback callbacks if needed */ +}; + +/** + * struct migrate_version 
- Stages for a particular protocol version + * @id: Version number, host order + * @s: Ordered array of stages, NULL-terminated + */ +struct migrate_version { + uint32_t id; + const struct migrate_stage *s; +}; + +void migrate_init(struct ctx *c); +void migrate_close(struct ctx *c); +void migrate_request(struct ctx *c, int fd, bool target); +void migrate_handler(struct ctx *c); + +#endif /* MIGRATE_H */ diff --git a/passt.c b/passt.c index 53fdd38..935a69f 100644 --- a/passt.c +++ b/passt.c @@ -51,6 +51,7 @@ #include "tcp_splice.h" #include "ndp.h" #include "vu_common.h" +#include "migrate.h" #define EPOLL_EVENTS 8 @@ -75,7 +76,6 @@ char *epoll_type_str[] = { [EPOLL_TYPE_TAP_LISTEN] = "listening qemu socket", [EPOLL_TYPE_VHOST_CMD] = "vhost-user command socket", [EPOLL_TYPE_VHOST_KICK] = "vhost-user kick socket", - [EPOLL_TYPE_VHOST_MIGRATION] = "vhost-user migration socket", }; static_assert(ARRAY_SIZE(epoll_type_str) == EPOLL_NUM_TYPES, "epoll_type_str[] doesn't match enum epoll_type"); @@ -202,6 +202,7 @@ int main(int argc, char **argv) isolate_initial(argc, argv); c.pasta_netns_fd = c.fd_tap = c.pidfile_fd = -1; + c.device_state_fd = -1; sigemptyset(&sa.sa_mask); sa.sa_flags = 0; @@ -357,9 +358,6 @@ loop: case EPOLL_TYPE_VHOST_KICK: vu_kick_cb(c.vdev, ref, &now); break; - case EPOLL_TYPE_VHOST_MIGRATION: - vu_migrate(c.vdev, eventmask); - break; default: /* Can't happen */ ASSERT(0); @@ -368,5 +366,7 @@ loop: post_handler(&c, &now); + migrate_handler(&c); + goto loop; } diff --git a/passt.h b/passt.h index f3151f0..5fdea52 100644 --- a/passt.h +++ b/passt.h @@ -237,6 +237,9 @@ struct ip6_ctx { * @low_wmem: Low probed net.core.wmem_max * @low_rmem: Low probed net.core.rmem_max * @vdev: vhost-user device + * @device_state_fd: Device state migration channel + * @device_state_result: Device state migration result + * @migrate_target: Are we the target, on the next migration request? 
*/ struct ctx { enum passt_modes mode; @@ -305,6 +308,11 @@ struct ctx { int low_rmem; struct vu_dev *vdev; + + /* Migration */ + int device_state_fd; + int device_state_result; + bool migrate_target; }; void proto_update_l2_buf(const unsigned char *eth_d, diff --git a/util.h b/util.h index 9c92a37..7df7767 100644 --- a/util.h +++ b/util.h @@ -125,14 +125,43 @@ (((x) & 0x0000ff00) << 8) | (((x) & 0x000000ff) << 24)) #endif +#ifndef __bswap_constant_32 +#define __bswap_constant_32(x) \ + ((((x) & 0xff000000) >> 24) | (((x) & 0x00ff0000) >> 8) | \ + (((x) & 0x0000ff00) << 8) | (((x) & 0x000000ff) << 24)) +#endif + +#ifndef __bswap_constant_64 +#define __bswap_constant_64(x) \ + ((((x) & 0xff00000000000000ULL) >> 56) | \ + (((x) & 0x00ff000000000000ULL) >> 40) | \ + (((x) & 0x0000ff0000000000ULL) >> 24) | \ + (((x) & 0x000000ff00000000ULL) >> 8) | \ + (((x) & 0x00000000ff000000ULL) << 8) | \ + (((x) & 0x0000000000ff0000ULL) << 24) | \ + (((x) & 0x000000000000ff00ULL) << 40) | \ + (((x) & 0x00000000000000ffULL) << 56)) +#endif + #if __BYTE_ORDER == __BIG_ENDIAN #define htons_constant(x) (x) #define htonl_constant(x) (x) +#define htonll_constant(x) (x) +#define ntohs_constant(x) (x) +#define ntohl_constant(x) (x) +#define ntohll_constant(x) (x) #else #define htons_constant(x) (__bswap_constant_16(x)) #define htonl_constant(x) (__bswap_constant_32(x)) +#define htonll_constant(x) (__bswap_constant_64(x)) +#define ntohs_constant(x) (__bswap_constant_16(x)) +#define ntohl_constant(x) (__bswap_constant_32(x)) +#define ntohll_constant(x) (__bswap_constant_64(x)) #endif +#define ntohll(x) (be64toh((x))) +#define htonll(x) (htobe64((x))) + /** * ntohl_unaligned() - Read 32-bit BE value from a possibly unaligned address * @p: Pointer to the BE value in memory diff --git a/vhost_user.c b/vhost_user.c index 159f0b3..256c8ab 100644 --- a/vhost_user.c +++ b/vhost_user.c @@ -44,6 +44,7 @@ #include "tap.h" #include "vhost_user.h" #include "pcap.h" +#include "migrate.h" /* vhost-user 
version we are compatible with */ #define VHOST_USER_VERSION 1 @@ -997,36 +998,6 @@ static bool vu_send_rarp_exec(struct vu_dev *vdev, return false; } -/** - * vu_set_migration_watch() - Add the migration file descriptor to epoll - * @vdev: vhost-user device - * @fd: File descriptor to add - * @direction: Direction of the migration (save or load backend state) - */ -static void vu_set_migration_watch(const struct vu_dev *vdev, int fd, - uint32_t direction) -{ - union epoll_ref ref = { - .type = EPOLL_TYPE_VHOST_MIGRATION, - .fd = fd, - }; - struct epoll_event ev = { 0 }; - - ev.data.u64 = ref.u64; - switch (direction) { - case VHOST_USER_TRANSFER_STATE_DIRECTION_SAVE: - ev.events = EPOLLOUT; - break; - case VHOST_USER_TRANSFER_STATE_DIRECTION_LOAD: - ev.events = EPOLLIN; - break; - default: - ASSERT(0); - } - - epoll_ctl(vdev->context->epollfd, EPOLL_CTL_ADD, ref.fd, &ev); -} - /** * vu_set_device_state_fd_exec() - Set the device state migration channel * @vdev: vhost-user device @@ -1051,16 +1022,8 @@ static bool vu_set_device_state_fd_exec(struct vu_dev *vdev, direction != VHOST_USER_TRANSFER_STATE_DIRECTION_LOAD) die("Invalide device_state_fd direction: %d", direction); - if (vdev->device_state_fd != -1) { - epoll_del(vdev->context, vdev->device_state_fd); - close(vdev->device_state_fd); - } - - vdev->device_state_fd = msg->fds[0]; - vdev->device_state_result = -1; - vu_set_migration_watch(vdev, vdev->device_state_fd, direction); - - debug("Got device_state_fd: %d", vdev->device_state_fd); + migrate_request(vdev->context, msg->fds[0], + direction == VHOST_USER_TRANSFER_STATE_DIRECTION_LOAD); /* We don't provide a new fd for the data transfer */ vmsg_set_reply_u64(msg, VHOST_USER_VRING_NOFD_MASK); @@ -1075,12 +1038,11 @@ static bool vu_set_device_state_fd_exec(struct vu_dev *vdev, * * Return: True as the reply contains the migration result */ +/* cppcheck-suppress constParameterCallback */ static bool vu_check_device_state_exec(struct vu_dev *vdev, struct 
vhost_user_msg *msg) { - (void)vdev; - - vmsg_set_reply_u64(msg, vdev->device_state_result); + vmsg_set_reply_u64(msg, vdev->context->device_state_result); return true; } @@ -1106,8 +1068,8 @@ void vu_init(struct ctx *c) } c->vdev->log_table = NULL; c->vdev->log_call_fd = -1; - c->vdev->device_state_fd = -1; - c->vdev->device_state_result = -1; + + migrate_init(c); } @@ -1157,12 +1119,8 @@ void vu_cleanup(struct vu_dev *vdev) vu_close_log(vdev); - if (vdev->device_state_fd != -1) { - epoll_del(vdev->context, vdev->device_state_fd); - close(vdev->device_state_fd); - vdev->device_state_fd = -1; - vdev->device_state_result = -1; - } + /* If we lose the VU dev, we also lose our migration channel */ + migrate_close(vdev->context); } /** diff --git a/virtio.h b/virtio.h index 7bef2d2..0a59441 100644 --- a/virtio.h +++ b/virtio.h @@ -106,8 +106,6 @@ struct vu_dev_region { * @log_call_fd: Eventfd to report logging update * @log_size: Size of the logging memory region * @log_table: Base of the logging memory region - * @device_state_fd: Device state migration channel - * @device_state_result: Device state migration result */ struct vu_dev { struct ctx *context; @@ -119,8 +117,6 @@ struct vu_dev { int log_call_fd; uint64_t log_size; uint8_t *log_table; - int device_state_fd; - int device_state_result; }; /** diff --git a/vu_common.c b/vu_common.c index ab04d31..48826b1 100644 --- a/vu_common.c +++ b/vu_common.c @@ -5,6 +5,7 @@ * common_vu.c - vhost-user common UDP and TCP functions */ +#include <errno.h> #include <unistd.h> #include <sys/uio.h> #include <sys/eventfd.h> @@ -17,6 +18,7 @@ #include "vhost_user.h" #include "pcap.h" #include "vu_common.h" +#include "migrate.h" #define VU_MAX_TX_BUFFER_NB 2 @@ -303,50 +305,3 @@ err: return -1; } - -/** - * vu_migrate() - Send/receive passt insternal state to/from QEMU - * @vdev: vhost-user device - * @events: epoll events - */ -void vu_migrate(struct vu_dev *vdev, uint32_t events) -{ - int ret; - - /* TODO: collect/set passt 
internal state - * and use vdev->device_state_fd to send/receive it - */ - debug("vu_migrate fd %d events %x", vdev->device_state_fd, events); - if (events & EPOLLOUT) { - debug("Saving backend state"); - - /* send some stuff */ - ret = write(vdev->device_state_fd, "PASST", 6); - /* value to be returned by VHOST_USER_CHECK_DEVICE_STATE */ - vdev->device_state_result = ret == -1 ? -1 : 0; - /* Closing the file descriptor signals the end of transfer */ - epoll_del(vdev->context, vdev->device_state_fd); - close(vdev->device_state_fd); - vdev->device_state_fd = -1; - } else if (events & EPOLLIN) { - char buf[6]; - - debug("Loading backend state"); - /* read some stuff */ - ret = read(vdev->device_state_fd, buf, sizeof(buf)); - /* value to be returned by VHOST_USER_CHECK_DEVICE_STATE */ - if (ret != sizeof(buf)) { - vdev->device_state_result = -1; - } else { - ret = strncmp(buf, "PASST", sizeof(buf)); - vdev->device_state_result = ret == 0 ? 0 : -1; - } - } else if (events & EPOLLHUP) { - debug("Closing migration channel"); - - /* The end of file signals the end of the transfer. */ - epoll_del(vdev->context, vdev->device_state_fd); - close(vdev->device_state_fd); - vdev->device_state_fd = -1; - } -} diff --git a/vu_common.h b/vu_common.h index d56c021..f538f23 100644 --- a/vu_common.h +++ b/vu_common.h @@ -57,5 +57,5 @@ void vu_flush(const struct vu_dev *vdev, struct vu_virtq *vq, void vu_kick_cb(struct vu_dev *vdev, union epoll_ref ref, const struct timespec *now); int vu_send_single(const struct ctx *c, const void *buf, size_t size); -void vu_migrate(struct vu_dev *vdev, uint32_t events); + #endif /* VU_COMMON_H */ From 155cd0c41e549cea956b7f8506cda7803cf63419 Mon Sep 17 00:00:00 2001 From: David Gibson <david@gibson.dropbear.id.au> Date: Wed, 12 Feb 2025 18:07:14 +1100 Subject: [PATCH 224/382] migrate: Migrate guest observed addresses Most of the information in struct ctx doesn't need to be migrated. 
Either it's strictly back end information which is allowed to differ between the two ends, or it must already be configured identically on the two ends. There are a few exceptions though. In particular passt learns several addresses of the guest by observing what it sends out. If we lose this information across migration we might get away with it, but if there are active flows we might misdirect some packets before re-learning the guest address. Avoid this by migrating the guest's observed addresses. Signed-off-by: David Gibson <david@gibson.dropbear.id.au> [sbrivio: Coding style stuff, comments, etc.] Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- migrate.c | 73 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 73 insertions(+) diff --git a/migrate.c b/migrate.c index aeac872..72a6d40 100644 --- a/migrate.c +++ b/migrate.c @@ -27,8 +27,81 @@ /* Magic identifier for migration data */ #define MIGRATE_MAGIC 0xB1BB1D1B0BB1D1B0 +/** + * struct migrate_seen_addrs_v1 - Migratable guest addresses for v1 state stream + * @addr6: Observed guest IPv6 address + * @addr6_ll: Observed guest IPv6 link-local address + * @addr4: Observed guest IPv4 address + * @mac: Observed guest MAC address + */ +struct migrate_seen_addrs_v1 { + struct in6_addr addr6; + struct in6_addr addr6_ll; + struct in_addr addr4; + unsigned char mac[ETH_ALEN]; +} __attribute__((packed)); + +/** + * seen_addrs_source_v1() - Copy and send guest observed addresses from source + * @c: Execution context + * @stage: Migration stage, unused + * @fd: File descriptor for state transfer + * + * Return: 0 on success, positive error code on failure + */ +/* cppcheck-suppress [constParameterCallback, unmatchedSuppression] */ +static int seen_addrs_source_v1(struct ctx *c, + const struct migrate_stage *stage, int fd) +{ + struct migrate_seen_addrs_v1 addrs = { + .addr6 = c->ip6.addr_seen, + .addr6_ll = c->ip6.addr_ll_seen, + .addr4 = c->ip4.addr_seen, + }; + + (void)stage; + + 
memcpy(addrs.mac, c->guest_mac, sizeof(addrs.mac)); + + if (write_all_buf(fd, &addrs, sizeof(addrs))) + return errno; + + return 0; +} + +/** + * seen_addrs_target_v1() - Receive and use guest observed addresses on target + * @c: Execution context + * @stage: Migration stage, unused + * @fd: File descriptor for state transfer + * + * Return: 0 on success, positive error code on failure + */ +static int seen_addrs_target_v1(struct ctx *c, + const struct migrate_stage *stage, int fd) +{ + struct migrate_seen_addrs_v1 addrs; + + (void)stage; + + if (read_all_buf(fd, &addrs, sizeof(addrs))) + return errno; + + c->ip6.addr_seen = addrs.addr6; + c->ip6.addr_ll_seen = addrs.addr6_ll; + c->ip4.addr_seen = addrs.addr4; + memcpy(c->guest_mac, addrs.mac, sizeof(c->guest_mac)); + + return 0; +} + /* Stages for version 1 */ static const struct migrate_stage stages_v1[] = { + { + .name = "observed addresses", + .source = seen_addrs_source_v1, + .target = seen_addrs_target_v1, + }, { 0 }, }; From b899141ad52fb417fe608d9c8cfe66f9572207c7 Mon Sep 17 00:00:00 2001 From: Stefano Brivio <sbrivio@redhat.com> Date: Wed, 12 Feb 2025 18:07:15 +1100 Subject: [PATCH 225/382] Add interfaces and configuration bits for passt-repair In vhost-user mode, by default, create a second UNIX domain socket accepting connections from passt-repair, with the usual listener socket. When we need to set or clear TCP_REPAIR on sockets, we'll send them via SCM_RIGHTS to passt-repair, who sets the socket option values we ask for. To that end, introduce batched functions to request TCP_REPAIR settings on sockets, so that we don't have to send a single message for each socket, on migration. When needed, repair_flush() will send the message and check for the reply. 
Co-authored-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- Makefile | 12 +-- conf.c | 43 +++++++++- epoll_type.h | 4 + migrate.c | 5 +- passt.1 | 11 +++ passt.c | 9 +++ passt.h | 7 ++ repair.c | 219 +++++++++++++++++++++++++++++++++++++++++++++++++++ repair.h | 16 ++++ tap.c | 65 +-------------- util.c | 62 +++++++++++++++ util.h | 1 + 12 files changed, 381 insertions(+), 73 deletions(-) create mode 100644 repair.c create mode 100644 repair.h diff --git a/Makefile b/Makefile index be89b07..d4e1096 100644 --- a/Makefile +++ b/Makefile @@ -38,9 +38,9 @@ FLAGS += -DDUAL_STACK_SOCKETS=$(DUAL_STACK_SOCKETS) PASST_SRCS = arch.c arp.c checksum.c conf.c dhcp.c dhcpv6.c flow.c fwd.c \ icmp.c igmp.c inany.c iov.c ip.c isolation.c lineread.c log.c mld.c \ - ndp.c netlink.c migrate.c packet.c passt.c pasta.c pcap.c pif.c tap.c \ - tcp.c tcp_buf.c tcp_splice.c tcp_vu.c udp.c udp_flow.c udp_vu.c util.c \ - vhost_user.c virtio.c vu_common.c + ndp.c netlink.c migrate.c packet.c passt.c pasta.c pcap.c pif.c \ + repair.c tap.c tcp.c tcp_buf.c tcp_splice.c tcp_vu.c udp.c udp_flow.c \ + udp_vu.c util.c vhost_user.c virtio.c vu_common.c QRAP_SRCS = qrap.c PASST_REPAIR_SRCS = passt-repair.c SRCS = $(PASST_SRCS) $(QRAP_SRCS) $(PASST_REPAIR_SRCS) @@ -50,9 +50,9 @@ MANPAGES = passt.1 pasta.1 qrap.1 passt-repair.1 PASST_HEADERS = arch.h arp.h checksum.h conf.h dhcp.h dhcpv6.h flow.h fwd.h \ flow_table.h icmp.h icmp_flow.h inany.h iov.h ip.h isolation.h \ lineread.h log.h migrate.h ndp.h netlink.h packet.h passt.h pasta.h \ - pcap.h pif.h siphash.h tap.h tcp.h tcp_buf.h tcp_conn.h tcp_internal.h \ - tcp_splice.h tcp_vu.h udp.h udp_flow.h udp_internal.h udp_vu.h util.h \ - vhost_user.h virtio.h vu_common.h + pcap.h pif.h repair.h siphash.h tap.h tcp.h tcp_buf.h tcp_conn.h \ + tcp_internal.h tcp_splice.h tcp_vu.h udp.h udp_flow.h udp_internal.h \ + udp_vu.h util.h vhost_user.h 
virtio.h vu_common.h HEADERS = $(PASST_HEADERS) seccomp.h C := \#include <sys/random.h>\nint main(){int a=getrandom(0, 0, 0);} diff --git a/conf.c b/conf.c index d9de07c..18017f5 100644 --- a/conf.c +++ b/conf.c @@ -820,6 +820,9 @@ static void usage(const char *name, FILE *f, int status) " UNIX domain socket is provided by -s option\n" " --print-capabilities print back-end capabilities in JSON format,\n" " only meaningful for vhost-user mode\n"); + FPRINTF(f, + " --repair-path PATH path for passt-repair(1)\n" + " default: append '.repair' to UNIX domain path\n"); } FPRINTF(f, @@ -1245,8 +1248,25 @@ static void conf_nat(const char *arg, struct in_addr *addr4, */ static void conf_open_files(struct ctx *c) { - if (c->mode != MODE_PASTA && c->fd_tap == -1) - c->fd_tap_listen = tap_sock_unix_open(c->sock_path); + if (c->mode != MODE_PASTA && c->fd_tap == -1) { + c->fd_tap_listen = sock_unix(c->sock_path); + + if (c->mode == MODE_VU && strcmp(c->repair_path, "none")) { + if (!*c->repair_path && + snprintf_check(c->repair_path, + sizeof(c->repair_path), "%s.repair", + c->sock_path)) { + warn("passt-repair path %s not usable", + c->repair_path); + c->fd_repair_listen = -1; + } else { + c->fd_repair_listen = sock_unix(c->repair_path); + } + } else { + c->fd_repair_listen = -1; + } + c->fd_repair = -1; + } if (*c->pidfile) { c->pidfile_fd = output_file_open(c->pidfile, O_WRONLY); @@ -1360,10 +1380,12 @@ void conf(struct ctx *c, int argc, char **argv) {"host-lo-to-ns-lo", no_argument, NULL, 23 }, {"dns-host", required_argument, NULL, 24 }, {"vhost-user", no_argument, NULL, 25 }, + /* vhost-user backend program convention */ {"print-capabilities", no_argument, NULL, 26 }, {"socket-path", required_argument, NULL, 's' }, {"fqdn", required_argument, NULL, 27 }, + {"repair-path", required_argument, NULL, 28 }, { 0 }, }; const char *logname = (c->mode == MODE_PASTA) ? 
"pasta" : "passt"; @@ -1570,6 +1592,9 @@ void conf(struct ctx *c, int argc, char **argv) "%s", optarg)) die("Invalid FQDN: %s", optarg); break; + case 28: + /* Handle this once we checked --vhost-user */ + break; case 'd': c->debug = 1; c->quiet = 0; @@ -1841,8 +1866,8 @@ void conf(struct ctx *c, int argc, char **argv) if (c->ifi4 && IN4_IS_ADDR_UNSPECIFIED(&c->ip4.guest_gw)) c->no_dhcp = 1; - /* Inbound port options & DNS can be parsed now (after IPv4/IPv6 - * settings) + /* Inbound port options, DNS, and --repair-path can be parsed now, after + * IPv4/IPv6 settings and --vhost-user. */ fwd_probe_ephemeral(); udp_portmap_clear(); @@ -1888,6 +1913,16 @@ void conf(struct ctx *c, int argc, char **argv) } die("Cannot use DNS address %s", optarg); + } else if (name == 28) { + if (c->mode != MODE_VU && strcmp(optarg, "none")) + die("--repair-path is for vhost-user mode only"); + + if (snprintf_check(c->repair_path, + sizeof(c->repair_path), "%s", + optarg)) + die("Invalid passt-repair path: %s", optarg); + + break; } } while (name != -1); diff --git a/epoll_type.h b/epoll_type.h index f3ef415..7f2a121 100644 --- a/epoll_type.h +++ b/epoll_type.h @@ -40,6 +40,10 @@ enum epoll_type { EPOLL_TYPE_VHOST_CMD, /* vhost-user kick event socket */ EPOLL_TYPE_VHOST_KICK, + /* TCP_REPAIR helper listening socket */ + EPOLL_TYPE_REPAIR_LISTEN, + /* TCP_REPAIR helper socket */ + EPOLL_TYPE_REPAIR, EPOLL_NUM_TYPES, }; diff --git a/migrate.c b/migrate.c index 72a6d40..1c59016 100644 --- a/migrate.c +++ b/migrate.c @@ -23,6 +23,7 @@ #include "flow_table.h" #include "migrate.h" +#include "repair.h" /* Magic identifier for migration data */ #define MIGRATE_MAGIC 0xB1BB1D1B0BB1D1B0 @@ -232,7 +233,7 @@ void migrate_init(struct ctx *c) } /** - * migrate_close() - Close migration channel + * migrate_close() - Close migration channel and connection to passt-repair * @c: Execution context */ void migrate_close(struct ctx *c) @@ -243,6 +244,8 @@ void migrate_close(struct ctx *c) 
c->device_state_fd = -1; c->device_state_result = -1; } + + repair_close(c); } /** diff --git a/passt.1 b/passt.1 index 9d347d8..60066c2 100644 --- a/passt.1 +++ b/passt.1 @@ -428,6 +428,17 @@ Enable vhost-user. The vhost-user command socket is provided by \fB--socket\fR. .BR \-\-print-capabilities Print back-end capabilities in JSON format, only meaningful for vhost-user mode. +.TP +.BR \-\-repair-path " " \fIpath +Path for UNIX domain socket used by the \fBpasst-repair\fR(1) helper to connect +to \fBpasst\fR in order to set or clear the TCP_REPAIR option on sockets, during +migration. \fB--repair-path none\fR disables this interface (if you need to +specify a socket path called "none" you can prefix the path by \fI./\fR). + +Default, for \-\-vhost-user mode only, is to append \fI.repair\fR to the path +chosen for the hypervisor UNIX domain socket. No socket is created if not in +\-\-vhost-user mode. + .TP .BR \-F ", " \-\-fd " " \fIFD Pass a pre-opened, connected socket to \fBpasst\fR. 
Usually the socket is opened diff --git a/passt.c b/passt.c index 935a69f..6f9fb4d 100644 --- a/passt.c +++ b/passt.c @@ -52,6 +52,7 @@ #include "ndp.h" #include "vu_common.h" #include "migrate.h" +#include "repair.h" #define EPOLL_EVENTS 8 @@ -76,6 +77,8 @@ char *epoll_type_str[] = { [EPOLL_TYPE_TAP_LISTEN] = "listening qemu socket", [EPOLL_TYPE_VHOST_CMD] = "vhost-user command socket", [EPOLL_TYPE_VHOST_KICK] = "vhost-user kick socket", + [EPOLL_TYPE_REPAIR_LISTEN] = "TCP_REPAIR helper listening socket", + [EPOLL_TYPE_REPAIR] = "TCP_REPAIR helper socket", }; static_assert(ARRAY_SIZE(epoll_type_str) == EPOLL_NUM_TYPES, "epoll_type_str[] doesn't match enum epoll_type"); @@ -358,6 +361,12 @@ loop: case EPOLL_TYPE_VHOST_KICK: vu_kick_cb(c.vdev, ref, &now); break; + case EPOLL_TYPE_REPAIR_LISTEN: + repair_listen_handler(&c, eventmask); + break; + case EPOLL_TYPE_REPAIR: + repair_handler(&c, eventmask); + break; default: /* Can't happen */ ASSERT(0); diff --git a/passt.h b/passt.h index 5fdea52..1f0dab5 100644 --- a/passt.h +++ b/passt.h @@ -20,6 +20,7 @@ union epoll_ref; #include "siphash.h" #include "ip.h" #include "inany.h" +#include "migrate.h" #include "flow.h" #include "icmp.h" #include "fwd.h" @@ -193,6 +194,7 @@ struct ip6_ctx { * @foreground: Run in foreground, don't log to stderr by default * @nofile: Maximum number of open files (ulimit -n) * @sock_path: Path for UNIX domain socket + * @repair_path: TCP_REPAIR helper path, can be "none", empty for default * @pcap: Path for packet capture file * @pidfile: Path to PID file, empty string if not configured * @pidfile_fd: File descriptor for PID file, -1 if none @@ -203,6 +205,8 @@ struct ip6_ctx { * @epollfd: File descriptor for epoll instance * @fd_tap_listen: File descriptor for listening AF_UNIX socket, if any * @fd_tap: AF_UNIX socket, tuntap device, or pre-opened socket + * @fd_repair_listen: File descriptor for listening TCP_REPAIR socket, if any + * @fd_repair: Connected AF_UNIX socket for TCP_REPAIR 
helper * @our_tap_mac: Pasta/passt's MAC on the tap link * @guest_mac: MAC address of guest or namespace, seen or configured * @hash_secret: 128-bit secret for siphash functions @@ -249,6 +253,7 @@ struct ctx { int foreground; int nofile; char sock_path[UNIX_PATH_MAX]; + char repair_path[UNIX_PATH_MAX]; char pcap[PATH_MAX]; char pidfile[PATH_MAX]; @@ -265,6 +270,8 @@ struct ctx { int epollfd; int fd_tap_listen; int fd_tap; + int fd_repair_listen; + int fd_repair; unsigned char our_tap_mac[ETH_ALEN]; unsigned char guest_mac[ETH_ALEN]; uint64_t hash_secret[2]; diff --git a/repair.c b/repair.c new file mode 100644 index 0000000..d288617 --- /dev/null +++ b/repair.c @@ -0,0 +1,219 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +/* PASST - Plug A Simple Socket Transport + * for qemu/UNIX domain socket mode + * + * PASTA - Pack A Subtle Tap Abstraction + * for network namespace/tap device mode + * + * repair.c - Interface (server) for passt-repair, set/clear TCP_REPAIR + * + * Copyright (c) 2025 Red Hat GmbH + * Author: Stefano Brivio <sbrivio@redhat.com> + */ + +#include <errno.h> +#include <sys/uio.h> + +#include "util.h" +#include "ip.h" +#include "passt.h" +#include "inany.h" +#include "flow.h" +#include "flow_table.h" + +#include "repair.h" + +#define SCM_MAX_FD 253 /* From Linux kernel (include/net/scm.h), not in UAPI */ + +/* Pending file descriptors for next repair_flush() call, or command change */ +static int repair_fds[SCM_MAX_FD]; + +/* Pending command: flush pending file descriptors if it changes */ +static int8_t repair_cmd; + +/* Number of pending file descriptors set in @repair_fds */ +static int repair_nfds; + +/** + * repair_sock_init() - Start listening for connections on helper socket + * @c: Execution context + */ +void repair_sock_init(const struct ctx *c) +{ + union epoll_ref ref = { .type = EPOLL_TYPE_REPAIR_LISTEN }; + struct epoll_event ev = { 0 }; + + if (c->fd_repair_listen == -1) + return; + + if (listen(c->fd_repair_listen, 0)) { + 
err_perror("listen() on repair helper socket, won't migrate"); + return; + } + + ref.fd = c->fd_repair_listen; + ev.events = EPOLLIN | EPOLLHUP | EPOLLET; + ev.data.u64 = ref.u64; + if (epoll_ctl(c->epollfd, EPOLL_CTL_ADD, c->fd_repair_listen, &ev)) + err_perror("repair helper socket epoll_ctl(), won't migrate"); +} + +/** + * repair_listen_handler() - Handle events on TCP_REPAIR helper listening socket + * @c: Execution context + * @events: epoll events + */ +void repair_listen_handler(struct ctx *c, uint32_t events) +{ + union epoll_ref ref = { .type = EPOLL_TYPE_REPAIR }; + struct epoll_event ev = { 0 }; + struct ucred ucred; + socklen_t len; + + if (events != EPOLLIN) { + debug("Spurious event 0x%04x on TCP_REPAIR helper socket", + events); + return; + } + + len = sizeof(ucred); + + /* Another client is already connected: accept and close right away. */ + if (c->fd_repair != -1) { + int discard = accept4(c->fd_repair_listen, NULL, NULL, + SOCK_NONBLOCK); + + if (discard == -1) + return; + + if (!getsockopt(discard, SOL_SOCKET, SO_PEERCRED, &ucred, &len)) + info("Discarding TCP_REPAIR helper, PID %i", ucred.pid); + + close(discard); + return; + } + + if ((c->fd_repair = accept4(c->fd_repair_listen, NULL, NULL, 0)) < 0) { + debug_perror("accept4() on TCP_REPAIR helper listening socket"); + return; + } + + if (!getsockopt(c->fd_repair, SOL_SOCKET, SO_PEERCRED, &ucred, &len)) + info("Accepted TCP_REPAIR helper, PID %i", ucred.pid); + + ref.fd = c->fd_repair; + ev.events = EPOLLHUP | EPOLLET; + ev.data.u64 = ref.u64; + if (epoll_ctl(c->epollfd, EPOLL_CTL_ADD, c->fd_repair, &ev)) { + debug_perror("epoll_ctl() on TCP_REPAIR helper socket"); + close(c->fd_repair); + c->fd_repair = -1; + } +} + +/** + * repair_close() - Close connection to TCP_REPAIR helper + * @c: Execution context + */ +void repair_close(struct ctx *c) +{ + debug("Closing TCP_REPAIR helper socket"); + + epoll_ctl(c->epollfd, EPOLL_CTL_DEL, c->fd_repair, NULL); + close(c->fd_repair); + c->fd_repair = 
-1; +} + +/** + * repair_handler() - Handle EPOLLHUP and EPOLLERR on TCP_REPAIR helper socket + * @c: Execution context + * @events: epoll events + */ +void repair_handler(struct ctx *c, uint32_t events) +{ + (void)events; + + repair_close(c); +} + +/** + * repair_flush() - Flush current set of sockets to helper, with current command + * @c: Execution context + * + * Return: 0 on success, negative error code on failure + */ +int repair_flush(struct ctx *c) +{ + struct iovec iov = { &repair_cmd, sizeof(repair_cmd) }; + char buf[CMSG_SPACE(sizeof(int) * SCM_MAX_FD)] + __attribute__ ((aligned(__alignof__(struct cmsghdr)))); + struct cmsghdr *cmsg; + struct msghdr msg; + int8_t reply; + + if (!repair_nfds) + return 0; + + msg = (struct msghdr){ NULL, 0, &iov, 1, + buf, CMSG_SPACE(sizeof(int) * repair_nfds), 0 }; + cmsg = CMSG_FIRSTHDR(&msg); + + cmsg->cmsg_level = SOL_SOCKET; + cmsg->cmsg_type = SCM_RIGHTS; + cmsg->cmsg_len = CMSG_LEN(sizeof(int) * repair_nfds); + memcpy(CMSG_DATA(cmsg), repair_fds, sizeof(int) * repair_nfds); + + repair_nfds = 0; + + if (sendmsg(c->fd_repair, &msg, 0) < 0) { + int ret = -errno; + err_perror("Failed to send sockets to TCP_REPAIR helper"); + repair_close(c); + return ret; + } + + if (recv(c->fd_repair, &reply, sizeof(reply), 0) < 0) { + int ret = -errno; + err_perror("Failed to receive reply from TCP_REPAIR helper"); + repair_close(c); + return ret; + } + + if (reply != repair_cmd) { + err("Unexpected reply from TCP_REPAIR helper: %d", reply); + repair_close(c); + return -ENXIO; + } + + return 0; +} + +/** + * repair_set() - Add socket to TCP_REPAIR set with given command + * @c: Execution context + * @s: Socket to add + * @cmd: TCP_REPAIR_ON, TCP_REPAIR_OFF, or TCP_REPAIR_OFF_NO_WP + * + * Return: 0 on success, negative error code on failure + */ +/* cppcheck-suppress unusedFunction */ +int repair_set(struct ctx *c, int s, int cmd) +{ + int rc; + + if (repair_nfds && repair_cmd != cmd) { + if ((rc = repair_flush(c))) + return rc; + } + 
+ repair_cmd = cmd; + repair_fds[repair_nfds++] = s; + + if (repair_nfds >= SCM_MAX_FD) { + if ((rc = repair_flush(c))) + return rc; + } + + return 0; +} diff --git a/repair.h b/repair.h new file mode 100644 index 0000000..de279d6 --- /dev/null +++ b/repair.h @@ -0,0 +1,16 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later + * Copyright (c) 2025 Red Hat GmbH + * Author: Stefano Brivio <sbrivio@redhat.com> + */ + +#ifndef REPAIR_H +#define REPAIR_H + +void repair_sock_init(const struct ctx *c); +void repair_listen_handler(struct ctx *c, uint32_t events); +void repair_handler(struct ctx *c, uint32_t events); +void repair_close(struct ctx *c); +int repair_flush(struct ctx *c); +int repair_set(struct ctx *c, int s, int cmd); + +#endif /* REPAIR_H */ diff --git a/tap.c b/tap.c index 8c92d23..d0673e5 100644 --- a/tap.c +++ b/tap.c @@ -56,6 +56,7 @@ #include "netlink.h" #include "pasta.h" #include "packet.h" +#include "repair.h" #include "tap.h" #include "log.h" #include "vhost_user.h" @@ -1151,68 +1152,6 @@ void tap_handler_pasta(struct ctx *c, uint32_t events, tap_pasta_input(c, now); } -/** - * tap_sock_unix_open() - Create and bind AF_UNIX socket - * @sock_path: Socket path. 
If empty, set on return (UNIX_SOCK_PATH as prefix) - * - * Return: socket descriptor on success, won't return on failure - */ -int tap_sock_unix_open(char *sock_path) -{ - int fd = socket(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0); - struct sockaddr_un addr = { - .sun_family = AF_UNIX, - }; - int i; - - if (fd < 0) - die_perror("Failed to open UNIX domain socket"); - - for (i = 1; i < UNIX_SOCK_MAX; i++) { - char *path = addr.sun_path; - int ex, ret; - - if (*sock_path) - memcpy(path, sock_path, UNIX_PATH_MAX); - else if (snprintf_check(path, UNIX_PATH_MAX - 1, - UNIX_SOCK_PATH, i)) - die_perror("Can't build UNIX domain socket path"); - - ex = socket(AF_UNIX, SOCK_STREAM | SOCK_NONBLOCK | SOCK_CLOEXEC, - 0); - if (ex < 0) - die_perror("Failed to check for UNIX domain conflicts"); - - ret = connect(ex, (const struct sockaddr *)&addr, sizeof(addr)); - if (!ret || (errno != ENOENT && errno != ECONNREFUSED && - errno != EACCES)) { - if (*sock_path) - die("Socket path %s already in use", path); - - close(ex); - continue; - } - close(ex); - - unlink(path); - ret = bind(fd, (const struct sockaddr *)&addr, sizeof(addr)); - if (*sock_path && ret) - die_perror("Failed to bind UNIX domain socket"); - - if (!ret) - break; - } - - if (i == UNIX_SOCK_MAX) - die_perror("Failed to bind UNIX domain socket"); - - info("UNIX domain socket bound at %s", addr.sun_path); - if (!*sock_path) - memcpy(sock_path, addr.sun_path, UNIX_PATH_MAX); - - return fd; -} - /** * tap_backend_show_hints() - Give help information to start QEMU * @c: Execution context @@ -1423,6 +1362,8 @@ void tap_backend_init(struct ctx *c) tap_sock_tun_init(c); break; case MODE_VU: + repair_sock_init(c); + /* fall through */ case MODE_PASST: tap_sock_unix_init(c); diff --git a/util.c b/util.c index ba33866..656e86a 100644 --- a/util.c +++ b/util.c @@ -178,6 +178,68 @@ int sock_l4_sa(const struct ctx *c, enum epoll_type type, return fd; } +/** + * sock_unix() - Create and bind AF_UNIX socket + * @sock_path: Socket path. 
If empty, set on return (UNIX_SOCK_PATH as prefix) + * + * Return: socket descriptor on success, won't return on failure + */ +int sock_unix(char *sock_path) +{ + int fd = socket(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0); + struct sockaddr_un addr = { + .sun_family = AF_UNIX, + }; + int i; + + if (fd < 0) + die_perror("Failed to open UNIX domain socket"); + + for (i = 1; i < UNIX_SOCK_MAX; i++) { + char *path = addr.sun_path; + int ex, ret; + + if (*sock_path) + memcpy(path, sock_path, UNIX_PATH_MAX); + else if (snprintf_check(path, UNIX_PATH_MAX - 1, + UNIX_SOCK_PATH, i)) + die_perror("Can't build UNIX domain socket path"); + + ex = socket(AF_UNIX, SOCK_STREAM | SOCK_NONBLOCK | SOCK_CLOEXEC, + 0); + if (ex < 0) + die_perror("Failed to check for UNIX domain conflicts"); + + ret = connect(ex, (const struct sockaddr *)&addr, sizeof(addr)); + if (!ret || (errno != ENOENT && errno != ECONNREFUSED && + errno != EACCES)) { + if (*sock_path) + die("Socket path %s already in use", path); + + close(ex); + continue; + } + close(ex); + + unlink(path); + ret = bind(fd, (const struct sockaddr *)&addr, sizeof(addr)); + if (*sock_path && ret) + die_perror("Failed to bind UNIX domain socket"); + + if (!ret) + break; + } + + if (i == UNIX_SOCK_MAX) + die_perror("Failed to bind UNIX domain socket"); + + info("UNIX domain socket bound at %s", addr.sun_path); + if (!*sock_path) + memcpy(sock_path, addr.sun_path, UNIX_PATH_MAX); + + return fd; +} + /** * sock_probe_mem() - Check if setting high SO_SNDBUF and SO_RCVBUF is allowed * @c: Execution context diff --git a/util.h b/util.h index 7df7767..50e96d3 100644 --- a/util.h +++ b/util.h @@ -217,6 +217,7 @@ struct ctx; int sock_l4_sa(const struct ctx *c, enum epoll_type type, const void *sa, socklen_t sl, const char *ifname, bool v6only, uint32_t data); +int sock_unix(char *sock_path); void sock_probe_mem(struct ctx *c); long timespec_diff_ms(const struct timespec *a, const struct timespec *b); int64_t timespec_diff_us(const struct 
timespec *a, const struct timespec *b); From f3fe795ff58656c39a39dbfac47fe6769f5ce293 Mon Sep 17 00:00:00 2001 From: Stefano Brivio <sbrivio@redhat.com> Date: Wed, 12 Feb 2025 18:07:16 +1100 Subject: [PATCH 226/382] vhost_user: Make source quit after reporting migration state This will close all the sockets we currently have open in repair mode, and completes our migration tasks as source. If the hypervisor wants to have us back at this point, somebody needs to restart us. Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- vhost_user.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/vhost_user.c b/vhost_user.c index 256c8ab..7ab1377 100644 --- a/vhost_user.c +++ b/vhost_user.c @@ -1203,4 +1203,11 @@ void vu_control_handler(struct vu_dev *vdev, int fd, uint32_t events) if (reply_requested) vu_send_reply(fd, &msg); + + if (msg.hdr.request == VHOST_USER_CHECK_DEVICE_STATE && + vdev->context->device_state_result == 0 && + !vdev->context->migrate_target) { + info("Migration complete, exiting"); + _exit(EXIT_SUCCESS); + } } From 6f122f0171fe4bc235d572945e0bf963e81139ea Mon Sep 17 00:00:00 2001 From: Stefano Brivio <sbrivio@redhat.com> Date: Wed, 12 Feb 2025 18:07:17 +1100 Subject: [PATCH 227/382] tcp: Get bound address for connected inbound sockets too So that we can bind inbound sockets to specific addresses, like we already do for outbound sockets. While at it, change the error message in tcp_conn_from_tap() to match this one. 
Reviewed-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- flow.c | 6 +++--- flow_table.h | 6 +++--- tcp.c | 22 ++++++++++++++-------- 3 files changed, 20 insertions(+), 14 deletions(-) diff --git a/flow.c b/flow.c index a6fe6d1..3ac551b 100644 --- a/flow.c +++ b/flow.c @@ -390,9 +390,9 @@ const struct flowside *flow_initiate_af(union flow *flow, uint8_t pif, * * Return: pointer to the initiating flowside information */ -const struct flowside *flow_initiate_sa(union flow *flow, uint8_t pif, - const union sockaddr_inany *ssa, - in_port_t dport) +struct flowside *flow_initiate_sa(union flow *flow, uint8_t pif, + const union sockaddr_inany *ssa, + in_port_t dport) { struct flowside *ini = &flow->f.side[INISIDE]; diff --git a/flow_table.h b/flow_table.h index eeb6f41..9a2ff24 100644 --- a/flow_table.h +++ b/flow_table.h @@ -161,9 +161,9 @@ const struct flowside *flow_initiate_af(union flow *flow, uint8_t pif, sa_family_t af, const void *saddr, in_port_t sport, const void *daddr, in_port_t dport); -const struct flowside *flow_initiate_sa(union flow *flow, uint8_t pif, - const union sockaddr_inany *ssa, - in_port_t dport); +struct flowside *flow_initiate_sa(union flow *flow, uint8_t pif, + const union sockaddr_inany *ssa, + in_port_t dport); const struct flowside *flow_target_af(union flow *flow, uint8_t pif, sa_family_t af, const void *saddr, in_port_t sport, diff --git a/tcp.c b/tcp.c index b87478f..a1d6c53 100644 --- a/tcp.c +++ b/tcp.c @@ -1536,12 +1536,10 @@ static void tcp_conn_from_tap(const struct ctx *c, sa_family_t af, if (c->mode == MODE_VU) { /* To rebind to same oport after migration */ sl = sizeof(sa); - if (!getsockname(s, &sa.sa, &sl)) { + if (!getsockname(s, &sa.sa, &sl)) inany_from_sockaddr(&tgt->oaddr, &tgt->oport, &sa); - } else { - err("Failed to get local address for socket: %s", - strerror_(errno)); - } + else + err_perror("Can't get local address for socket %i", s); } FLOW_ACTIVATE(conn); @@ 
-2075,9 +2073,9 @@ static void tcp_tap_conn_from_sock(const struct ctx *c, union flow *flow, void tcp_listen_handler(const struct ctx *c, union epoll_ref ref, const struct timespec *now) { - const struct flowside *ini; union sockaddr_inany sa; socklen_t sl = sizeof(sa); + struct flowside *ini; union flow *flow; int s; @@ -2093,12 +2091,20 @@ void tcp_listen_handler(const struct ctx *c, union epoll_ref ref, tcp_sock_set_bufsize(c, s); tcp_sock_set_nodelay(s); - /* FIXME: When listening port has a specific bound address, record that - * as our address + /* FIXME: If useful: when the listening port has a specific bound + * address, record that as our address, as implemented for vhost-user + * mode only, below. */ ini = flow_initiate_sa(flow, ref.tcp_listen.pif, &sa, ref.tcp_listen.port); + if (c->mode == MODE_VU) { /* Rebind to same address after migration */ + if (!getsockname(s, &sa.sa, &sl)) + inany_from_sockaddr(&ini->oaddr, &ini->oport, &sa); + else + err_perror("Can't get local address for socket %i", s); + } + if (!inany_is_unicast(&ini->eaddr) || ini->eport == 0) { char sastr[SOCKADDR_STRLEN]; From a3011584563bb7d6cf46416e8e84873c2615ad63 Mon Sep 17 00:00:00 2001 From: David Gibson <david@gibson.dropbear.id.au> Date: Wed, 12 Feb 2025 18:07:19 +1100 Subject: [PATCH 228/382] rampstream: Add utility to test for corruption of data streams Signed-off-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- test/.gitignore | 1 + test/Makefile | 5 +- test/migrate/rampstream_in | 59 +++++++++++++++ test/migrate/rampstream_out | 55 ++++++++++++++ test/passt.mbuto | 5 +- test/rampstream-check.sh | 3 + test/rampstream.c | 143 ++++++++++++++++++++++++++++++++++++ 7 files changed, 267 insertions(+), 4 deletions(-) create mode 100644 test/migrate/rampstream_in create mode 100644 test/migrate/rampstream_out create mode 100755 test/rampstream-check.sh create mode 100644 test/rampstream.c diff --git a/test/.gitignore 
b/test/.gitignore index 6dd4790..3573444 100644 --- a/test/.gitignore +++ b/test/.gitignore @@ -8,5 +8,6 @@ QEMU_EFI.fd *.raw.xz *.bin nstool +rampstream guest-key guest-key.pub diff --git a/test/Makefile b/test/Makefile index 5e49047..bf63db8 100644 --- a/test/Makefile +++ b/test/Makefile @@ -52,7 +52,8 @@ UBUNTU_IMGS = $(UBUNTU_OLD_IMGS) $(UBUNTU_NEW_IMGS) DOWNLOAD_ASSETS = mbuto podman \ $(DEBIAN_IMGS) $(FEDORA_IMGS) $(OPENSUSE_IMGS) $(UBUNTU_IMGS) -TESTDATA_ASSETS = small.bin big.bin medium.bin +TESTDATA_ASSETS = small.bin big.bin medium.bin \ + rampstream LOCAL_ASSETS = mbuto.img mbuto.mem.img podman/bin/podman QEMU_EFI.fd \ $(DEBIAN_IMGS:%=prepared-%) $(FEDORA_IMGS:%=prepared-%) \ $(UBUNTU_NEW_IMGS:%=prepared-%) \ @@ -85,7 +86,7 @@ podman/bin/podman: pull-podman guest-key guest-key.pub: ssh-keygen -f guest-key -N '' -mbuto.img: passt.mbuto mbuto/mbuto guest-key.pub $(TESTDATA_ASSETS) +mbuto.img: passt.mbuto mbuto/mbuto guest-key.pub rampstream-check.sh $(TESTDATA_ASSETS) ./mbuto/mbuto -p ./$< -c lz4 -f $@ mbuto.mem.img: passt.mem.mbuto mbuto ../passt.avx2 diff --git a/test/migrate/rampstream_in b/test/migrate/rampstream_in new file mode 100644 index 0000000..46f4143 --- /dev/null +++ b/test/migrate/rampstream_in @@ -0,0 +1,59 @@ +# SPDX-License-Identifier: GPL-2.0-or-later +# +# PASST - Plug A Simple Socket Transport +# for qemu/UNIX domain socket mode +# +# PASTA - Pack A Subtle Tap Abstraction +# for network namespace/tap device mode +# +# test/migrate/basic - Check basic migration functionality +# +# Copyright (c) 2025 Red Hat GmbH +# Author: Stefano Brivio <sbrivio@redhat.com> + +g1tools ip jq dhclient socat cat +htools ip jq + +set MAP_HOST4 192.0.2.1 +set MAP_HOST6 2001:db8:9a55::1 +set MAP_NS4 192.0.2.2 +set MAP_NS6 2001:db8:9a55::2 +set RAMPS 6000000 + +test Interface name +g1out IFNAME1 ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname' +hout HOST_IFNAME ip -j -4 route show|jq -rM '[.[] | select(.dst == "default").dev] | .[0]' 
+hout HOST_IFNAME6 ip -j -6 route show|jq -rM '[.[] | select(.dst == "default").dev] | .[0]' +check [ -n "__IFNAME1__" ] + +test DHCP: address +guest1 ip link set dev __IFNAME1__ up +guest1 /sbin/dhclient -4 __IFNAME1__ +g1out ADDR1 ip -j -4 addr show|jq -rM '.[] | select(.ifname == "__IFNAME1__").addr_info[0].local' +hout HOST_ADDR ip -j -4 addr show|jq -rM '.[] | select(.ifname == "__HOST_IFNAME__").addr_info[0].local' +check [ "__ADDR1__" = "__HOST_ADDR__" ] + +test DHCPv6: address +# Link is up now, wait for DAD to complete +guest1 while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done +guest1 /sbin/dhclient -6 __IFNAME1__ +# Wait for DAD to complete on the DHCP address +guest1 while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done +g1out ADDR1_6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__IFNAME1__").addr_info[] | select(.prefixlen == 128).local] | .[0]' +hout HOST_ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__HOST_IFNAME6__").addr_info[] | select(.scope == "global" and .deprecated != true).local] | .[0]' +check [ "__ADDR1_6__" = "__HOST_ADDR6__" ] + +test TCP/IPv4: host > guest +g1out GW1 ip -j -4 route show|jq -rM '.[] | select(.dst == "default").gateway' +guest1b socat -u TCP4-LISTEN:10001 EXEC:"rampstream-check.sh __RAMPS__" +sleep 1 +hostb socat -u EXEC:"test/rampstream send __RAMPS__" TCP4:__ADDR1__:10001 + +sleep 1 + +#mon echo "migrate tcp:0:20005" | socat -u STDIN UNIX:__STATESETUP__/qemu_1_mon.sock + +hostw + +guest2 cat rampstream.err +guest2 [ $(cat rampstream.status) -eq 0 ] diff --git a/test/migrate/rampstream_out b/test/migrate/rampstream_out new file mode 100644 index 0000000..91b9c63 --- /dev/null +++ b/test/migrate/rampstream_out @@ -0,0 +1,55 @@ +# SPDX-License-Identifier: GPL-2.0-or-later +# +# PASST - Plug A Simple Socket Transport +# for qemu/UNIX domain socket mode +# +# PASTA - Pack A Subtle Tap Abstraction +# for network namespace/tap device mode +# +# 
test/migrate/basic - Check basic migration functionality +# +# Copyright (c) 2025 Red Hat GmbH +# Author: Stefano Brivio <sbrivio@redhat.com> + +g1tools ip jq dhclient socat cat +htools ip jq + +set MAP_HOST4 192.0.2.1 +set MAP_HOST6 2001:db8:9a55::1 +set MAP_NS4 192.0.2.2 +set MAP_NS6 2001:db8:9a55::2 +set RAMPS 6000000 + +test Interface name +g1out IFNAME1 ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname' +hout HOST_IFNAME ip -j -4 route show|jq -rM '[.[] | select(.dst == "default").dev] | .[0]' +hout HOST_IFNAME6 ip -j -6 route show|jq -rM '[.[] | select(.dst == "default").dev] | .[0]' +check [ -n "__IFNAME1__" ] + +test DHCP: address +guest1 ip link set dev __IFNAME1__ up +guest1 /sbin/dhclient -4 __IFNAME1__ +g1out ADDR1 ip -j -4 addr show|jq -rM '.[] | select(.ifname == "__IFNAME1__").addr_info[0].local' +hout HOST_ADDR ip -j -4 addr show|jq -rM '.[] | select(.ifname == "__HOST_IFNAME__").addr_info[0].local' +check [ "__ADDR1__" = "__HOST_ADDR__" ] + +test DHCPv6: address +# Link is up now, wait for DAD to complete +guest1 while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done +guest1 /sbin/dhclient -6 __IFNAME1__ +# Wait for DAD to complete on the DHCP address +guest1 while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done +g1out ADDR1_6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__IFNAME1__").addr_info[] | select(.prefixlen == 128).local] | .[0]' +hout HOST_ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__HOST_IFNAME6__").addr_info[] | select(.scope == "global" and .deprecated != true).local] | .[0]' +check [ "__ADDR1_6__" = "__HOST_ADDR6__" ] + +test TCP/IPv4: guest > host +g1out GW1 ip -j -4 route show|jq -rM '.[] | select(.dst == "default").gateway' +hostb socat -u TCP4-LISTEN:10006 EXEC:"test/rampstream check __RAMPS__" +sleep 1 +guest1b socat -u EXEC:"rampstream send __RAMPS__" TCP4:__MAP_HOST4__:10006 +sleep 1 + +mon echo "migrate tcp:0:20005" | socat -u STDIN 
UNIX:__STATESETUP__/qemu_1_mon.sock + +hostw diff --git a/test/passt.mbuto b/test/passt.mbuto index e45a284..5e00132 100755 --- a/test/passt.mbuto +++ b/test/passt.mbuto @@ -13,7 +13,8 @@ PROGS="${PROGS:-ash,dash,bash ip mount ls insmod mkdir ln cat chmod lsmod modprobe find grep mknod mv rm umount jq iperf3 dhclient hostname sed tr chown sipcalc cut socat dd strace ping tail killall sleep sysctl - nproc tcp_rr tcp_crr udp_rr which tee seq bc sshd ssh-keygen cmp tcpdump env}" + nproc tcp_rr tcp_crr udp_rr which tee seq bc sshd ssh-keygen cmp tcpdump + env}" # OpenSSH 9.8 introduced split binaries, with sshd being the daemon, and # sshd-session the per-session program. We need the latter as well, and the path @@ -31,7 +32,7 @@ LINKS="${LINKS:- DIRS="${DIRS} /tmp /usr/sbin /usr/share /var/log /var/lib /etc/ssh /run/sshd /root/.ssh" -COPIES="${COPIES} small.bin,/root/small.bin medium.bin,/root/medium.bin big.bin,/root/big.bin" +COPIES="${COPIES} small.bin,/root/small.bin medium.bin,/root/medium.bin big.bin,/root/big.bin rampstream,/bin/rampstream rampstream-check.sh,/bin/rampstream-check.sh" FIXUP="${FIXUP}"' mv /sbin/* /usr/sbin || : diff --git a/test/rampstream-check.sh b/test/rampstream-check.sh new file mode 100755 index 0000000..c27acdb --- /dev/null +++ b/test/rampstream-check.sh @@ -0,0 +1,3 @@ +#! /bin/sh + +(rampstream check "$@" 2>&1; echo $? > rampstream.status) | tee rampstream.err diff --git a/test/rampstream.c b/test/rampstream.c new file mode 100644 index 0000000..8d81296 --- /dev/null +++ b/test/rampstream.c @@ -0,0 +1,143 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +/* rampstream - Generate a check and stream of bytes in a ramp pattern + * + * Copyright Red Hat + * Author: David Gibson <david@gibson.dropbear.id.au> + */ + +#include <stdio.h> +#include <stdlib.h> +#include <stdint.h> +#include <sys/types.h> +#include <unistd.h> +#include <errno.h> +#include <string.h> + +/* Length of the repeating ramp. 
This is deliberately not a "round" number so + * that we're very likely to misalign with likely block or chunk sizes of the + * transport. That means we'll detect gaps in the stream, even if they occur + * neatly on block boundaries. Specifically this is the largest 8-bit prime. */ +#define RAMPLEN 251 + +#define INTERVAL 10000 + +#define ARRAY_SIZE(a) ((int)(sizeof(a) / sizeof((a)[0]))) + +#define die(...) \ + do { \ + fprintf(stderr, "rampstream: " __VA_ARGS__); \ + exit(1); \ + } while (0) + +static void usage(void) +{ + die("Usage:\n" + " rampstream send <number>\n" + " Generate a ramp pattern of bytes on stdout, repeated <number>\n" + " times\n" + " rampstream check <number>\n" + " Check a ramp pattern of bytes on stdin, repeated <number>\n" + " times\n"); +} + +static void ramp_send(unsigned long long num, const uint8_t *ramp) +{ + unsigned long long i; + + for (i = 0; i < num; i++) { + int off = 0; + ssize_t rc; + + if (i % INTERVAL == 0) + fprintf(stderr, "%llu...\r", i); + + while (off < RAMPLEN) { + rc = write(1, ramp + off, RAMPLEN - off); + if (rc < 0) { + if (errno == EINTR || + errno == EAGAIN || + errno == EWOULDBLOCK) + continue; + die("Error writing ramp: %s\n", + strerror(errno)); + } + if (rc == 0) + die("Zero length write\n"); + off += rc; + } + } +} + +static void ramp_check(unsigned long long num, const uint8_t *ramp) +{ + unsigned long long i; + + for (i = 0; i < num; i++) { + uint8_t buf[RAMPLEN]; + int off = 0; + ssize_t rc; + + if (i % INTERVAL == 0) + fprintf(stderr, "%llu...\r", i); + + while (off < RAMPLEN) { + rc = read(0, buf + off, RAMPLEN - off); + if (rc < 0) { + if (errno == EINTR || + errno == EAGAIN || + errno == EWOULDBLOCK) + continue; + die("Error reading ramp: %s\n", + strerror(errno)); + } + if (rc == 0) + die("Unexpected EOF, ramp %llu, byte %d\n", + i, off); + off += rc; + } + + if (memcmp(buf, ramp, sizeof(buf)) != 0) { + int j, k; + + for (j = 0; j < RAMPLEN; j++) + if (buf[j] != ramp[j]) + break; + for (k = j; k < 
RAMPLEN && k < j + 16; k++) + fprintf(stderr, + "Byte %d: expected 0x%02x, got 0x%02x\n", + k, ramp[k], buf[k]); + die("Data mismatch, ramp %llu, byte %d\n", i, j); + } + } +} + +int main(int argc, char *argv[]) +{ + const char *subcmd = argv[1]; + unsigned long long num; + uint8_t ramp[RAMPLEN]; + char *e; + int i; + + if (argc < 2) + usage(); + + errno = 0; + num = strtoull(argv[2], &e, 0); + if (*e || errno) + usage(); + + /* Initialize the ramp block */ + for (i = 0; i < RAMPLEN; i++) + ramp[i] = i; + + if (strcmp(subcmd, "send") == 0) + ramp_send(num, ramp); + else if (strcmp(subcmd, "check") == 0) + ramp_check(num, ramp); + else + usage(); + + exit(0); +} From 9a84df4c3f9608c5e814f24ee3306a6c64a73edd Mon Sep 17 00:00:00 2001 From: Stefano Brivio <sbrivio@redhat.com> Date: Thu, 13 Feb 2025 00:42:52 +0100 Subject: [PATCH 229/382] selinux: Add rules needed to run tests ...other than being convenient, they might be reasonably representative of typical stand-alone usage. Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- contrib/selinux/passt.te | 4 ++++ contrib/selinux/pasta.te | 14 ++++++++++++-- 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/contrib/selinux/passt.te b/contrib/selinux/passt.te index c6cea34..6e7a4cb 100644 --- a/contrib/selinux/passt.te +++ b/contrib/selinux/passt.te @@ -20,6 +20,7 @@ require { type fs_t; type tmp_t; type user_tmp_t; + type user_home_t; type tmpfs_t; type root_t; @@ -80,6 +81,9 @@ allow passt_t root_t:dir mounton; allow passt_t tmp_t:dir { add_name mounton remove_name write }; allow passt_t tmpfs_t:filesystem mount; allow passt_t fs_t:filesystem unmount; +allow passt_t user_home_t:dir search; +allow passt_t user_tmp_t:fifo_file append; +allow passt_t user_tmp_t:file map; manage_files_pattern(passt_t, user_tmp_t, user_tmp_t) files_pid_filetrans(passt_t, user_tmp_t, file) diff --git a/contrib/selinux/pasta.te b/contrib/selinux/pasta.te index d0ff0cc..89c8043 100644 --- a/contrib/selinux/pasta.te +++ 
b/contrib/selinux/pasta.te @@ -18,6 +18,7 @@ require { type bin_t; type user_home_t; type user_home_dir_t; + type user_tmp_t; type fs_t; type tmp_t; type tmpfs_t; @@ -56,8 +57,10 @@ require { attribute port_type; type port_t; type http_port_t; + type http_cache_port_t; type ssh_port_t; type reserved_port_t; + type unreserved_port_t; type dns_port_t; type dhcpc_port_t; type chronyd_port_t; @@ -122,8 +125,8 @@ domain_auto_trans(pasta_t, ping_exec_t, ping_t); allow pasta_t nsfs_t:file { open read }; -allow pasta_t user_home_t:dir getattr; -allow pasta_t user_home_t:file { open read getattr setattr }; +allow pasta_t user_home_t:dir { getattr search }; +allow pasta_t user_home_t:file { open read getattr setattr execute execute_no_trans map}; allow pasta_t user_home_dir_t:dir { search getattr open add_name read write }; allow pasta_t user_home_dir_t:file { create open read write }; allow pasta_t tmp_t:dir { add_name mounton remove_name write }; @@ -133,6 +136,11 @@ allow pasta_t root_t:dir mounton; manage_files_pattern(pasta_t, pasta_pid_t, pasta_pid_t) files_pid_filetrans(pasta_t, pasta_pid_t, file) +allow pasta_t user_tmp_t:dir { add_name remove_name search write }; +allow pasta_t user_tmp_t:fifo_file append; +allow pasta_t user_tmp_t:file { create open write }; +allow pasta_t user_tmp_t:sock_file { create unlink }; + allow pasta_t console_device_t:chr_file { open write getattr ioctl }; allow pasta_t user_devpts_t:chr_file { getattr read write ioctl }; logging_send_syslog_msg(pasta_t) @@ -160,6 +168,8 @@ allow pasta_t self:udp_socket create_stream_socket_perms; allow pasta_t reserved_port_t:udp_socket name_bind; allow pasta_t llmnr_port_t:tcp_socket name_bind; allow pasta_t llmnr_port_t:udp_socket name_bind; +allow pasta_t http_cache_port_t:tcp_socket { name_bind name_connect }; +allow pasta_t unreserved_port_t:udp_socket name_bind; corenet_udp_sendrecv_generic_node(pasta_t) corenet_udp_bind_generic_node(pasta_t) allow pasta_t node_t:icmp_socket { name_bind node_bind 
}; From 98d474c8950e9cc5715d5686614fb0f504377303 Mon Sep 17 00:00:00 2001 From: Stefano Brivio <sbrivio@redhat.com> Date: Thu, 13 Feb 2025 22:00:57 +0100 Subject: [PATCH 230/382] contrib/selinux: Enable mapping guest memory for libvirt guests This doesn't actually belong to passt's own policy: we should export an interface and libvirt's policy should use it, because passt's policy shouldn't be aware of svirt_image_t at all. However, libvirt doesn't maintain its own policy, which makes policy updates rather involved. Add this workaround to ensure --vhost-user is working in combination with libvirt, as it might take ages before we can get the proper rule in libvirt's policy. Reported-by: Laine Stump <laine@redhat.com> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- contrib/selinux/passt.te | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/contrib/selinux/passt.te b/contrib/selinux/passt.te index 6e7a4cb..fc1320d 100644 --- a/contrib/selinux/passt.te +++ b/contrib/selinux/passt.te @@ -24,6 +24,12 @@ require { type tmpfs_t; type root_t; + # Workaround: passt --vhost-user needs to map guest memory, but + # libvirt doesn't maintain its own policy, which makes updates + # particularly complicated. To avoid breakage in the short term, + # deal with it in passt's own policy. + type svirt_image_t; + class file { ioctl getattr setattr create read write unlink open relabelto execute execute_no_trans map }; class dir { search write add_name remove_name mounton }; class chr_file { append read write open getattr ioctl }; @@ -131,3 +137,9 @@ allow passt_t user_tmp_t:dir { add_name write }; allow passt_t user_tmp_t:file { create open }; allow passt_t user_tmp_t:sock_file { create read write unlink }; allow passt_t unconfined_t:unix_stream_socket { read write }; + +# Workaround: passt --vhost-user needs to map guest memory, but +# libvirt doesn't maintain its own policy, which makes updates +# particularly complicated. 
To avoid breakage in the short term, +# deal with it in passt's own policy. +allow passt_t svirt_image_t:file { read write map }; From 30f1e082c3c0cee0a985b3c32e2b05280c596343 Mon Sep 17 00:00:00 2001 From: Stefano Brivio <sbrivio@redhat.com> Date: Thu, 13 Feb 2025 16:24:55 +0100 Subject: [PATCH 231/382] tcp: Keep updating window and checking for socket data after FIN from guest Once we get a FIN segment from the container/guest, we enter something resembling CLOSE_WAIT (from the perspective of the peer), but that doesn't mean that we should stop processing window updates from the guest and checking for socket data if the guest acknowledges something. If we don't do that, we can very easily run into a situation where we send a burst of data to the tap, get a zero window update, along with a FIN segment, because the flow is meant to be unidirectional, and now the connection will be stuck forever, because we'll ignore updates. Reproducer, server: $ pasta --config-net -t 9999 -- sh -c 'echo DONE | socat TCP-LISTEN:9997,shut-down STDIO' and client: $ ./test/rampstream send 50000 | socat -u STDIN TCP:$LOCAL_ADDR:9997 2025/02/13 09:14:45 socat[2997126] E write(5, 0x55f5dbf47000, 8192): Broken pipe while at it, update the message string for the third passive close state (which we see in this case): it's CLOSE_WAIT, not LAST_ACK. 
Reviewed-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- tcp.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tcp.c b/tcp.c index a1d6c53..16d01f6 100644 --- a/tcp.c +++ b/tcp.c @@ -338,7 +338,7 @@ static const char *tcp_state_str[] __attribute((__unused__)) = { "SYN_RCVD", /* approximately maps to TAP_SYN_ACK_SENT */ /* Passive close: */ - "CLOSE_WAIT", "CLOSE_WAIT", "LAST_ACK", "LAST_ACK", "LAST_ACK", + "CLOSE_WAIT", "CLOSE_WAIT", "CLOSE_WAIT", "LAST_ACK", "LAST_ACK", /* Active close (+5): */ "CLOSING", "FIN_WAIT_1", "FIN_WAIT_1", "FIN_WAIT_2", "TIME_WAIT", }; @@ -1968,6 +1968,8 @@ int tcp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af, /* Established connections not accepting data from tap */ if (conn->events & TAP_FIN_RCVD) { tcp_update_seqack_from_tap(c, conn, ntohl(th->ack_seq)); + tcp_tap_window_update(conn, ntohs(th->window)); + tcp_data_from_sock(c, conn); if (conn->events & SOCK_FIN_RCVD && conn->seq_ack_from_tap == conn->seq_to_tap) From 71249ef3f9bcf1dbb2d6c13cdbc41ba88c794f06 Mon Sep 17 00:00:00 2001 From: Stefano Brivio <sbrivio@redhat.com> Date: Thu, 13 Feb 2025 20:54:04 +0100 Subject: [PATCH 232/382] tcp, tcp_splice: Don't set SO_SNDBUF and SO_RCVBUF to maximum values I added this a long long time ago because it dramatically improved throughput back then: with rmem_max and wmem_max >= 4 MiB, we would force send and receive buffer sizes for TCP sockets to the maximum allowed value. This effectively disables TCP auto-tuning, which would otherwise allow us to exceed those limits, as crazy as it might sound. But in any case, it made sense. Now that we have zero (internal) copies on every path, plus vhost-user support, it turns out that these settings are entirely obsolete. I get substantially the same throughput in every test we perform, even with very short durations (one second). 
The settings are not just useless: they actually cause us quite some trouble on guest state migration, because they lead to huge queues that need to be moved as well. Drop those settings. Reviewed-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- tcp.c | 41 +++++++++-------------------------------- tcp_conn.h | 4 ++-- tcp_splice.c | 6 +++--- 3 files changed, 14 insertions(+), 37 deletions(-) diff --git a/tcp.c b/tcp.c index 16d01f6..b978b30 100644 --- a/tcp.c +++ b/tcp.c @@ -738,24 +738,6 @@ static void tcp_get_sndbuf(struct tcp_tap_conn *conn) SNDBUF_SET(conn, MIN(INT_MAX, v)); } -/** - * tcp_sock_set_bufsize() - Set SO_RCVBUF and SO_SNDBUF to maximum values - * @s: Socket, can be -1 to avoid check in the caller - */ -static void tcp_sock_set_bufsize(const struct ctx *c, int s) -{ - int v = INT_MAX / 2; /* Kernel clamps and rounds, no need to check */ - - if (s == -1) - return; - - if (!c->low_rmem && setsockopt(s, SOL_SOCKET, SO_RCVBUF, &v, sizeof(v))) - trace("TCP: failed to set SO_RCVBUF to %i", v); - - if (!c->low_wmem && setsockopt(s, SOL_SOCKET, SO_SNDBUF, &v, sizeof(v))) - trace("TCP: failed to set SO_SNDBUF to %i", v); -} - /** * tcp_sock_set_nodelay() - Set TCP_NODELAY option (disable Nagle's algorithm) * @s: Socket, can be -1 to avoid check in the caller @@ -1278,12 +1260,11 @@ int tcp_conn_pool_sock(int pool[]) /** * tcp_conn_new_sock() - Open and prepare new socket for connection - * @c: Execution context * @af: Address family * * Return: socket number on success, negative code if socket creation failed */ -static int tcp_conn_new_sock(const struct ctx *c, sa_family_t af) +static int tcp_conn_new_sock(sa_family_t af) { int s; @@ -1297,7 +1278,6 @@ static int tcp_conn_new_sock(const struct ctx *c, sa_family_t af) if (s < 0) return -errno; - tcp_sock_set_bufsize(c, s); tcp_sock_set_nodelay(s); return s; @@ -1305,12 +1285,11 @@ static int tcp_conn_new_sock(const struct ctx *c, sa_family_t af) /** * 
tcp_conn_sock() - Obtain a connectable socket in the host/init namespace - * @c: Execution context * @af: Address family (AF_INET or AF_INET6) * * Return: Socket fd on success, -errno on failure */ -int tcp_conn_sock(const struct ctx *c, sa_family_t af) +int tcp_conn_sock(sa_family_t af) { int *pool = af == AF_INET6 ? init_sock_pool6 : init_sock_pool4; int s; @@ -1321,7 +1300,7 @@ int tcp_conn_sock(const struct ctx *c, sa_family_t af) /* If the pool is empty we just open a new one without refilling the * pool to keep latency down. */ - if ((s = tcp_conn_new_sock(c, af)) >= 0) + if ((s = tcp_conn_new_sock(af)) >= 0) return s; err("TCP: Unable to open socket for new connection: %s", @@ -1462,7 +1441,7 @@ static void tcp_conn_from_tap(const struct ctx *c, sa_family_t af, goto cancel; } - if ((s = tcp_conn_sock(c, af)) < 0) + if ((s = tcp_conn_sock(af)) < 0) goto cancel; pif_sockaddr(c, &sa, &sl, PIF_HOST, &tgt->eaddr, tgt->eport); @@ -1483,7 +1462,7 @@ static void tcp_conn_from_tap(const struct ctx *c, sa_family_t af, } else { /* Not a local, bound destination, inconclusive test */ close(s); - if ((s = tcp_conn_sock(c, af)) < 0) + if ((s = tcp_conn_sock(af)) < 0) goto cancel; } @@ -2090,7 +2069,6 @@ void tcp_listen_handler(const struct ctx *c, union epoll_ref ref, if (s < 0) goto cancel; - tcp_sock_set_bufsize(c, s); tcp_sock_set_nodelay(s); /* FIXME: If useful: when the listening port has a specific bound @@ -2434,13 +2412,12 @@ static int tcp_ns_socks_init(void *arg) /** * tcp_sock_refill_pool() - Refill one pool of pre-opened sockets - * @c: Execution context * @pool: Pool of sockets to refill * @af: Address family to use * * Return: 0 on success, negative error code if there was at least one error */ -int tcp_sock_refill_pool(const struct ctx *c, int pool[], sa_family_t af) +int tcp_sock_refill_pool(int pool[], sa_family_t af) { int i; @@ -2450,7 +2427,7 @@ int tcp_sock_refill_pool(const struct ctx *c, int pool[], sa_family_t af) if (pool[i] >= 0) continue; - if 
((fd = tcp_conn_new_sock(c, af)) < 0) + if ((fd = tcp_conn_new_sock(af)) < 0) return fd; pool[i] = fd; @@ -2466,13 +2443,13 @@ int tcp_sock_refill_pool(const struct ctx *c, int pool[], sa_family_t af) static void tcp_sock_refill_init(const struct ctx *c) { if (c->ifi4) { - int rc = tcp_sock_refill_pool(c, init_sock_pool4, AF_INET); + int rc = tcp_sock_refill_pool(init_sock_pool4, AF_INET); if (rc < 0) warn("TCP: Error refilling IPv4 host socket pool: %s", strerror_(-rc)); } if (c->ifi6) { - int rc = tcp_sock_refill_pool(c, init_sock_pool6, AF_INET6); + int rc = tcp_sock_refill_pool(init_sock_pool6, AF_INET6); if (rc < 0) warn("TCP: Error refilling IPv6 host socket pool: %s", strerror_(-rc)); diff --git a/tcp_conn.h b/tcp_conn.h index d342680..8c20805 100644 --- a/tcp_conn.h +++ b/tcp_conn.h @@ -143,8 +143,8 @@ bool tcp_flow_defer(const struct tcp_tap_conn *conn); bool tcp_splice_flow_defer(struct tcp_splice_conn *conn); void tcp_splice_timer(const struct ctx *c, struct tcp_splice_conn *conn); int tcp_conn_pool_sock(int pool[]); -int tcp_conn_sock(const struct ctx *c, sa_family_t af); -int tcp_sock_refill_pool(const struct ctx *c, int pool[], sa_family_t af); +int tcp_conn_sock(sa_family_t af); +int tcp_sock_refill_pool(int pool[], sa_family_t af); void tcp_splice_refill(const struct ctx *c); #endif /* TCP_CONN_H */ diff --git a/tcp_splice.c b/tcp_splice.c index f048a82..f1a9223 100644 --- a/tcp_splice.c +++ b/tcp_splice.c @@ -351,7 +351,7 @@ static int tcp_splice_connect(const struct ctx *c, struct tcp_splice_conn *conn) int one = 1; if (tgtpif == PIF_HOST) - conn->s[1] = tcp_conn_sock(c, af); + conn->s[1] = tcp_conn_sock(af); else if (tgtpif == PIF_SPLICE) conn->s[1] = tcp_conn_sock_ns(c, af); else @@ -703,13 +703,13 @@ static int tcp_sock_refill_ns(void *arg) ns_enter(c); if (c->ifi4) { - int rc = tcp_sock_refill_pool(c, ns_sock_pool4, AF_INET); + int rc = tcp_sock_refill_pool(ns_sock_pool4, AF_INET); if (rc < 0) warn("TCP: Error refilling IPv4 ns socket pool: 
%s", strerror_(-rc)); } if (c->ifi6) { - int rc = tcp_sock_refill_pool(c, ns_sock_pool6, AF_INET6); + int rc = tcp_sock_refill_pool(ns_sock_pool6, AF_INET6); if (rc < 0) warn("TCP: Error refilling IPv6 ns socket pool: %s", strerror_(-rc)); From 7c33b1208632a9581d0ee7aabd1e0584a5d1fb20 Mon Sep 17 00:00:00 2001 From: David Gibson <david@gibson.dropbear.id.au> Date: Sat, 15 Feb 2025 00:08:41 +1100 Subject: [PATCH 233/382] vhost_user: Clear ring address on GET_VRING_BASE GET_VRING_BASE stops the queue, clearing the call and kick fds. However, we don't clear vring.avail. That means that if vu_queue_notify() is called it won't realise the queue isn't ready and will die with an EBADFD. We get this during migration, because for some reason, qemu reconfigures the vhost-user device when a migration is triggered. There's a window between the GET_VRING_BASE and re-establishing the call fd where the notify function can be called, causing a crash. Signed-off-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- vhost_user.c | 1 + 1 file changed, 1 insertion(+) diff --git a/vhost_user.c b/vhost_user.c index 7ab1377..be1aa94 100644 --- a/vhost_user.c +++ b/vhost_user.c @@ -732,6 +732,7 @@ static bool vu_get_vring_base_exec(struct vu_dev *vdev, msg->hdr.size = sizeof(msg->payload.state); vdev->vq[idx].started = false; + vdev->vq[idx].vring.avail = 0; if (vdev->vq[idx].call_fd != -1) { close(vdev->vq[idx].call_fd); From 667caa09c6d46d937b3076254176eded262b3eca Mon Sep 17 00:00:00 2001 From: Stefano Brivio <sbrivio@redhat.com> Date: Sun, 16 Feb 2025 08:16:33 +0100 Subject: [PATCH 234/382] tcp_splice: Don't wake up on input data if we can't write it anywhere If we set the OUT_WAIT_* flag (waiting on EPOLLOUT) for a side of a given flow, it means that we're blocked, waiting for the receiver to actually receive data, with a full pipe. 
In that case, if we keep EPOLLIN set for the socket on the other side (our receiving side), we'll get into a loop such as: 41.0230: pasta: epoll event on connected spliced TCP socket 108 (events: 0x00000001) 41.0230: Flow 1 (TCP connection (spliced)): -1 from read-side call 41.0230: Flow 1 (TCP connection (spliced)): -1 from write-side call (passed 8192) 41.0230: Flow 1 (TCP connection (spliced)): event at tcp_splice_sock_handler:577 41.0230: pasta: epoll event on connected spliced TCP socket 108 (events: 0x00000001) 41.0230: Flow 1 (TCP connection (spliced)): -1 from read-side call 41.0230: Flow 1 (TCP connection (spliced)): -1 from write-side call (passed 8192) 41.0230: Flow 1 (TCP connection (spliced)): event at tcp_splice_sock_handler:577 leading to 100% CPU usage, of course. Drop EPOLLIN on our receiving side as long as we're waiting for output readiness on the other side. Link: https://github.com/containers/podman/issues/23686#issuecomment-2661036584 Link: https://www.reddit.com/r/podman/comments/1iph50j/pasta_high_cpu_on_podman_rootless_container/ Reviewed-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- tcp_splice.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/tcp_splice.c b/tcp_splice.c index f1a9223..8a39a6f 100644 --- a/tcp_splice.c +++ b/tcp_splice.c @@ -131,8 +131,12 @@ static void tcp_splice_conn_epoll_events(uint16_t events, ev[1].events = EPOLLOUT; } - flow_foreach_sidei(sidei) - ev[sidei].events |= (events & OUT_WAIT(sidei)) ? 
EPOLLOUT : 0; + flow_foreach_sidei(sidei) { + if (events & OUT_WAIT(sidei)) { + ev[sidei].events |= EPOLLOUT; + ev[!sidei].events &= ~EPOLLIN; + } + } } /** From 01b6a164d94f26be7ad500f71210bdb888f416aa Mon Sep 17 00:00:00 2001 From: Stefano Brivio <sbrivio@redhat.com> Date: Sun, 16 Feb 2025 08:31:13 +0100 Subject: [PATCH 235/382] tcp_splice: A typo three years ago and SO_RCVLOWAT is gone In commit e5eefe77435a ("tcp: Refactor to use events instead of states, split out spliced implementation"), this: if (!bitmap_isset(rcvlowat_set, conn - ts) && readlen > (long)c->tcp.pipe_size / 10) { (note the !) became: if (conn->flags & lowat_set_flag && readlen > (long)c->tcp.pipe_size / 10) { in the new tcp_splice_sock_handler(). We want to check, there, if we should set SO_RCVLOWAT, only if we haven't set it already. But, instead, we're checking if it's already set before we set it, so we'll never set it, of course. Fix the check and re-enable the functionality, which should give us improved CPU utilisation in non-interactive cases where we are not transferring at full pipe capacity. Fixes: e5eefe77435a ("tcp: Refactor to use events instead of states, split out spliced implementation") Reviewed-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- tcp_splice.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tcp_splice.c b/tcp_splice.c index 8a39a6f..5d845c9 100644 --- a/tcp_splice.c +++ b/tcp_splice.c @@ -556,7 +556,7 @@ eintr: if (readlen >= (long)c->tcp.pipe_size * 10 / 100) continue; - if (conn->flags & lowat_set_flag && + if (!(conn->flags & lowat_set_flag) && readlen > (long)c->tcp.pipe_size / 10) { int lowat = c->tcp.pipe_size / 4; From 3e903bbb1f386ebb892b1196d339d2d705bce8a2 Mon Sep 17 00:00:00 2001 From: Stefano Brivio <sbrivio@redhat.com> Date: Sat, 15 Feb 2025 06:13:13 +0100 Subject: [PATCH 236/382] repair, passt-repair: Build and warning fixes for musl Checked against musl 1.2.5. 
Signed-off-by: Stefano Brivio <sbrivio@redhat.com> Reviewed-by: David Gibson <david@gibson.dropbear.id.au> --- passt-repair.c | 4 +++- repair.c | 13 +++++++++---- 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/passt-repair.c b/passt-repair.c index 1174ae3..e0c366e 100644 --- a/passt-repair.c +++ b/passt-repair.c @@ -63,6 +63,7 @@ int main(int argc, char **argv) struct cmsghdr *cmsg; struct msghdr msg; struct iovec iov; + size_t cmsg_len; int op; prctl(PR_SET_DUMPABLE, 0); @@ -138,8 +139,9 @@ loop: } } if (!n) { + cmsg_len = cmsg->cmsg_len; /* socklen_t is 'unsigned' on musl */ fprintf(stderr, "Invalid ancillary data length %zu from peer\n", - cmsg->cmsg_len); + cmsg_len); _exit(1); } diff --git a/repair.c b/repair.c index d288617..dac28a6 100644 --- a/repair.c +++ b/repair.c @@ -13,6 +13,7 @@ */ #include <errno.h> +#include <sys/socket.h> #include <sys/uio.h> #include "util.h" @@ -145,9 +146,9 @@ void repair_handler(struct ctx *c, uint32_t events) */ int repair_flush(struct ctx *c) { - struct iovec iov = { &repair_cmd, sizeof(repair_cmd) }; char buf[CMSG_SPACE(sizeof(int) * SCM_MAX_FD)] - __attribute__ ((aligned(__alignof__(struct cmsghdr)))); + __attribute__ ((aligned(__alignof__(struct cmsghdr)))) = { 0 }; + struct iovec iov = { &repair_cmd, sizeof(repair_cmd) }; struct cmsghdr *cmsg; struct msghdr msg; int8_t reply; @@ -155,8 +156,12 @@ int repair_flush(struct ctx *c) if (!repair_nfds) return 0; - msg = (struct msghdr){ NULL, 0, &iov, 1, - buf, CMSG_SPACE(sizeof(int) * repair_nfds), 0 }; + msg = (struct msghdr){ .msg_name = NULL, .msg_namelen = 0, + .msg_iov = &iov, .msg_iovlen = 1, + .msg_control = buf, + .msg_controllen = CMSG_SPACE(sizeof(int) * + repair_nfds), + .msg_flags = 0 }; cmsg = CMSG_FIRSTHDR(&msg); cmsg->cmsg_level = SOL_SOCKET; From 89ecf2fd40adab549bdf25cdb68996f56d67b13e Mon Sep 17 00:00:00 2001 From: Stefano Brivio <sbrivio@redhat.com> Date: Thu, 13 Feb 2025 23:14:13 +1100 Subject: [PATCH 237/382] migrate: Migrate TCP flows This 
implements flow preparation on the source, transfer of data with a format roughly inspired by struct tcp_tap_conn, plus a specific structure for parameters that don't fit in the flow table, and flow insertion on the target, with all the appropriate window options, window scaling, MSS, etc. Contents of pending queues are transferred as well. The target side is rather convoluted because we first need to create sockets and switch them to repair mode, before we can apply options that are *not* stored in the flow table. This also means that, if we're testing this on the same machine, in the same namespace, we need to close the listening socket on the source before we can start moving data. Further, we need to connect() the socket on the target before we can restore data queues, but we can't do that (again, on the same machine) as long as the matching source socket is open, which implies an arbitrary limit on queue sizes we can transfer, because we can only dump pending queues on the source as long as the socket is open, of course. 
Co-authored-by: David Gibson <david@gibson.dropbear.id.au> Reviewed-by: David Gibson <david@gibson.dropbear.id.au> Tested-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- contrib/selinux/passt.te | 4 +- flow.c | 243 +++++++++++ flow.h | 8 + migrate.c | 10 + passt.c | 6 +- repair.c | 1 - tcp.c | 919 +++++++++++++++++++++++++++++++++++++++ tcp_conn.h | 103 +++++ 8 files changed, 1288 insertions(+), 6 deletions(-) diff --git a/contrib/selinux/passt.te b/contrib/selinux/passt.te index fc1320d..f595079 100644 --- a/contrib/selinux/passt.te +++ b/contrib/selinux/passt.te @@ -45,7 +45,7 @@ require { type net_conf_t; type proc_net_t; type node_t; - class tcp_socket { create accept listen name_bind name_connect }; + class tcp_socket { create accept listen name_bind name_connect getattr }; class udp_socket { create accept listen }; class icmp_socket { bind create name_bind node_bind setopt read write }; class sock_file { create unlink write }; @@ -129,7 +129,7 @@ corenet_udp_sendrecv_all_ports(passt_t) allow passt_t node_t:icmp_socket { name_bind node_bind }; allow passt_t port_t:icmp_socket name_bind; -allow passt_t self:tcp_socket { create getopt setopt connect bind listen accept shutdown read write }; +allow passt_t self:tcp_socket { create getopt setopt connect bind listen accept shutdown read write getattr }; allow passt_t self:udp_socket { create getopt setopt connect bind read write }; allow passt_t self:icmp_socket { bind create setopt read write }; diff --git a/flow.c b/flow.c index 3ac551b..cc881e8 100644 --- a/flow.c +++ b/flow.c @@ -19,6 +19,7 @@ #include "inany.h" #include "flow.h" #include "flow_table.h" +#include "repair.h" const char *flow_state_str[] = { [FLOW_STATE_FREE] = "FREE", @@ -52,6 +53,35 @@ const uint8_t flow_proto[] = { static_assert(ARRAY_SIZE(flow_proto) == FLOW_NUM_TYPES, "flow_proto[] doesn't match enum flow_type"); +#define foreach_flow(i, flow, bound) \ + for ((i) = 0, (flow) = 
&flowtab[(i)]; \ + (i) < (bound); \ + (i)++, (flow) = &flowtab[(i)]) \ + if ((flow)->f.state == FLOW_STATE_FREE) \ + (i) += (flow)->free.n - 1; \ + else + +#define foreach_active_flow(i, flow, bound) \ + foreach_flow((i), (flow), (bound)) \ + if ((flow)->f.state != FLOW_STATE_ACTIVE) \ + /* NOLINTNEXTLINE(bugprone-branch-clone) */ \ + continue; \ + else + +#define foreach_tcp_flow(i, flow, bound) \ + foreach_active_flow((i), (flow), (bound)) \ + if ((flow)->f.type != FLOW_TCP) \ + /* NOLINTNEXTLINE(bugprone-branch-clone) */ \ + continue; \ + else + +#define foreach_established_tcp_flow(i, flow, bound) \ + foreach_tcp_flow((i), (flow), (bound)) \ + if (!tcp_flow_is_established(&(flow)->tcp)) \ + /* NOLINTNEXTLINE(bugprone-branch-clone) */ \ + continue; \ + else + /* Global Flow Table */ /** @@ -874,6 +904,219 @@ void flow_defer_handler(const struct ctx *c, const struct timespec *now) *last_next = FLOW_MAX; } +/** + * flow_migrate_source_rollback() - Disable repair mode, return failure + * @c: Execution context + * @max_flow: Maximum index of affected flows + * @ret: Negative error code + * + * Return: @ret + */ +static int flow_migrate_source_rollback(struct ctx *c, unsigned max_flow, + int ret) +{ + union flow *flow; + unsigned i; + + debug("...roll back migration"); + + foreach_established_tcp_flow(i, flow, max_flow) + if (tcp_flow_repair_off(c, &flow->tcp)) + die("Failed to roll back TCP_REPAIR mode"); + + if (repair_flush(c)) + die("Failed to roll back TCP_REPAIR mode"); + + return ret; +} + +/** + * flow_migrate_repair_all() - Turn repair mode on or off for all flows + * @c: Execution context + * @enable: Switch repair mode on if set, off otherwise + * + * Return: 0 on success, negative error code on failure + */ +static int flow_migrate_repair_all(struct ctx *c, bool enable) +{ + union flow *flow; + unsigned i; + int rc; + + foreach_established_tcp_flow(i, flow, FLOW_MAX) { + if (enable) + rc = tcp_flow_repair_on(c, &flow->tcp); + else + rc = 
tcp_flow_repair_off(c, &flow->tcp); + + if (rc) { + debug("Can't %s repair mode: %s", + enable ? "enable" : "disable", strerror_(-rc)); + return flow_migrate_source_rollback(c, i, rc); + } + } + + if ((rc = repair_flush(c))) { + debug("Can't %s repair mode: %s", + enable ? "enable" : "disable", strerror_(-rc)); + return flow_migrate_source_rollback(c, i, rc); + } + + return 0; +} + +/** + * flow_migrate_source_pre() - Prepare flows for migration: enable repair mode + * @c: Execution context + * @stage: Migration stage information (unused) + * @fd: Migration file descriptor (unused) + * + * Return: 0 on success, positive error code on failure + */ +int flow_migrate_source_pre(struct ctx *c, const struct migrate_stage *stage, + int fd) +{ + int rc; + + (void)stage; + (void)fd; + + if ((rc = flow_migrate_repair_all(c, true))) + return -rc; + + return 0; +} + +/** + * flow_migrate_source() - Dump all the remaining information and send data + * @c: Execution context (unused) + * @stage: Migration stage information (unused) + * @fd: Migration file descriptor + * + * Return: 0 on success, positive error code on failure + */ +int flow_migrate_source(struct ctx *c, const struct migrate_stage *stage, + int fd) +{ + uint32_t count = 0; + bool first = true; + union flow *flow; + unsigned i; + int rc; + + (void)c; + (void)stage; + + foreach_established_tcp_flow(i, flow, FLOW_MAX) + count++; + + count = htonl(count); + if (write_all_buf(fd, &count, sizeof(count))) { + rc = errno; + err_perror("Can't send flow count (%u)", ntohl(count)); + return flow_migrate_source_rollback(c, FLOW_MAX, rc); + } + + debug("Sending %u flows", ntohl(count)); + + /* Dump and send information that can be stored in the flow table. + * + * Limited rollback options here: if we fail to transfer any data (that + * is, on the first flow), undo everything and resume. Otherwise, the + * stream might now be inconsistent, and we might have closed listening + * TCP sockets, so just terminate. 
+ */ + foreach_established_tcp_flow(i, flow, FLOW_MAX) { + rc = tcp_flow_migrate_source(fd, &flow->tcp); + if (rc) { + err("Can't send data, flow %u: %s", i, strerror_(-rc)); + if (!first) + die("Inconsistent migration state, exiting"); + + return flow_migrate_source_rollback(c, FLOW_MAX, -rc); + } + + first = false; + } + + /* And then "extended" data (including window data we saved previously): + * the target needs to set repair mode on sockets before it can set + * this stuff, but it needs sockets (and flows) for that. + * + * This also closes sockets so that the target can start connecting + * theirs: you can't sendmsg() to queues (using the socket) if the + * socket is not connected (EPIPE), not even in repair mode. And the + * target needs to restore queues now because we're sending the data. + * + * So, no rollback here, just try as hard as we can. Tolerate per-flow + * failures but not if the stream might be inconsistent (reported here + * as EIO). + */ + foreach_established_tcp_flow(i, flow, FLOW_MAX) { + rc = tcp_flow_migrate_source_ext(fd, i, &flow->tcp); + if (rc) { + err("Extended data for flow %u: %s", i, strerror_(-rc)); + + if (rc == -EIO) + die("Inconsistent migration state, exiting"); + } + } + + return 0; +} + +/** + * flow_migrate_target() - Receive flows and insert in flow table + * @c: Execution context + * @stage: Migration stage information (unused) + * @fd: Migration file descriptor + * + * Return: 0 on success, positive error code on failure + */ +int flow_migrate_target(struct ctx *c, const struct migrate_stage *stage, + int fd) +{ + uint32_t count; + unsigned i; + int rc; + + (void)stage; + + if (read_all_buf(fd, &count, sizeof(count))) + return errno; + + count = ntohl(count); + debug("Receiving %u flows", count); + + if ((rc = flow_migrate_repair_all(c, true))) + return -rc; + + repair_flush(c); + + /* TODO: flow header with type, instead? 
*/ + for (i = 0; i < count; i++) { + rc = tcp_flow_migrate_target(c, fd); + if (rc) { + debug("Migration data failure at flow %u: %s, abort", + i, strerror_(-rc)); + return -rc; + } + } + + repair_flush(c); + + for (i = 0; i < count; i++) { + rc = tcp_flow_migrate_target_ext(c, flowtab + i, fd); + if (rc) { + debug("Migration data failure at flow %u: %s, abort", + i, strerror_(-rc)); + return -rc; + } + } + + return 0; +} + /** * flow_init() - Initialise flow related data structures */ diff --git a/flow.h b/flow.h index 24ba3ef..675726e 100644 --- a/flow.h +++ b/flow.h @@ -249,6 +249,14 @@ union flow; void flow_init(void); void flow_defer_handler(const struct ctx *c, const struct timespec *now); +int flow_migrate_source_early(struct ctx *c, const struct migrate_stage *stage, + int fd); +int flow_migrate_source_pre(struct ctx *c, const struct migrate_stage *stage, + int fd); +int flow_migrate_source(struct ctx *c, const struct migrate_stage *stage, + int fd); +int flow_migrate_target(struct ctx *c, const struct migrate_stage *stage, + int fd); void flow_log_(const struct flow_common *f, int pri, const char *fmt, ...) 
__attribute__((format(printf, 3, 4))); diff --git a/migrate.c b/migrate.c index 1c59016..0fca77b 100644 --- a/migrate.c +++ b/migrate.c @@ -103,6 +103,16 @@ static const struct migrate_stage stages_v1[] = { .source = seen_addrs_source_v1, .target = seen_addrs_target_v1, }, + { + .name = "prepare flows", + .source = flow_migrate_source_pre, + .target = NULL, + }, + { + .name = "transfer flows", + .source = flow_migrate_source, + .target = flow_migrate_target, + }, { 0 }, }; diff --git a/passt.c b/passt.c index 6f9fb4d..68d1a28 100644 --- a/passt.c +++ b/passt.c @@ -223,9 +223,6 @@ int main(int argc, char **argv) if (sigaction(SIGCHLD, &sa, NULL)) die_perror("Couldn't install signal handlers"); - if (signal(SIGPIPE, SIG_IGN) == SIG_ERR) - die_perror("Couldn't set disposition for SIGPIPE"); - c.mode = MODE_PASTA; } else if (strstr(name, "passt")) { c.mode = MODE_PASST; @@ -233,6 +230,9 @@ int main(int argc, char **argv) _exit(EXIT_FAILURE); } + if (signal(SIGPIPE, SIG_IGN) == SIG_ERR) + die_perror("Couldn't set disposition for SIGPIPE"); + madvise(pkt_buf, TAP_BUF_BYTES, MADV_HUGEPAGE); c.epollfd = epoll_create1(EPOLL_CLOEXEC); diff --git a/repair.c b/repair.c index dac28a6..3ee089f 100644 --- a/repair.c +++ b/repair.c @@ -202,7 +202,6 @@ int repair_flush(struct ctx *c) * * Return: 0 on success, negative error code on failure */ -/* cppcheck-suppress unusedFunction */ int repair_set(struct ctx *c, int s, int cmd) { int rc; diff --git a/tcp.c b/tcp.c index b978b30..98e1c6a 100644 --- a/tcp.c +++ b/tcp.c @@ -280,6 +280,7 @@ #include <stddef.h> #include <string.h> #include <sys/epoll.h> +#include <sys/ioctl.h> #include <sys/socket.h> #include <sys/timerfd.h> #include <sys/types.h> @@ -287,6 +288,8 @@ #include <time.h> #include <arpa/inet.h> +#include <linux/sockios.h> + #include "checksum.h" #include "util.h" #include "iov.h" @@ -299,6 +302,7 @@ #include "log.h" #include "inany.h" #include "flow.h" +#include "repair.h" #include "linux_dep.h" #include "flow_table.h" @@ 
-306,6 +310,21 @@ #include "tcp_buf.h" #include "tcp_vu.h" +#ifndef __USE_MISC +/* From Linux UAPI, missing in netinet/tcp.h provided by musl */ +struct tcp_repair_opt { + __u32 opt_code; + __u32 opt_val; +}; + +enum { + TCP_NO_QUEUE, + TCP_RECV_QUEUE, + TCP_SEND_QUEUE, + TCP_QUEUES_NR, +}; +#endif + /* MSS rounding: see SET_MSS() */ #define MSS_DEFAULT 536 #define WINDOW_DEFAULT 14600 /* RFC 6928 */ @@ -326,6 +345,19 @@ ((conn)->events & (SOCK_FIN_RCVD | TAP_FIN_RCVD))) #define CONN_HAS(conn, set) (((conn)->events & (set)) == (set)) +/* Buffers to migrate pending data from send and receive queues. No, they don't + * use memory if we don't use them. And we're going away after this, so splurge. + */ +#define TCP_MIGRATE_SND_QUEUE_MAX (64 << 20) +#define TCP_MIGRATE_RCV_QUEUE_MAX (64 << 20) +uint8_t tcp_migrate_snd_queue [TCP_MIGRATE_SND_QUEUE_MAX]; +uint8_t tcp_migrate_rcv_queue [TCP_MIGRATE_RCV_QUEUE_MAX]; + +#define TCP_MIGRATE_RESTORE_CHUNK_MIN 1024 /* Try smaller when above this */ + +/* "Extended" data (not stored in the flow table) for TCP flow migration */ +static struct tcp_tap_transfer_ext migrate_ext[FLOW_MAX]; + static const char *tcp_event_str[] __attribute((__unused__)) = { "SOCK_ACCEPTED", "TAP_SYN_RCVD", "ESTABLISHED", "TAP_SYN_ACK_SENT", @@ -1468,6 +1500,7 @@ static void tcp_conn_from_tap(const struct ctx *c, sa_family_t af, conn->sock = s; conn->timer = -1; + conn->listening_sock = -1; conn_event(c, conn, TAP_SYN_RCVD); conn->wnd_to_tap = WINDOW_DEFAULT; @@ -1968,10 +2001,27 @@ int tcp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af, ack_due = 1; if ((conn->events & TAP_FIN_RCVD) && !(conn->events & SOCK_FIN_SENT)) { + socklen_t sl; + struct tcp_info tinfo; + shutdown(conn->sock, SHUT_WR); conn_event(c, conn, SOCK_FIN_SENT); tcp_send_flag(c, conn, ACK); ack_due = 0; + + /* If we received a FIN, but the socket is in TCP_ESTABLISHED + * state, it must be a migrated socket. 
The kernel saw the FIN + * on the source socket, but not on the target socket. + * + * Approximate the effect of that FIN: as we're sending a FIN + * out ourselves, the socket is now in a state equivalent to + * LAST_ACK. Now that we sent the FIN out, close it with a RST. + */ + sl = sizeof(tinfo); + getsockopt(conn->sock, SOL_TCP, TCP_INFO, &tinfo, &sl); + if (tinfo.tcpi_state == TCP_ESTABLISHED && + conn->events & SOCK_FIN_RCVD) + goto reset; } if (ack_due) @@ -2054,6 +2104,7 @@ static void tcp_tap_conn_from_sock(const struct ctx *c, union flow *flow, void tcp_listen_handler(const struct ctx *c, union epoll_ref ref, const struct timespec *now) { + struct tcp_tap_conn *conn; union sockaddr_inany sa; socklen_t sl = sizeof(sa); struct flowside *ini; @@ -2069,6 +2120,9 @@ void tcp_listen_handler(const struct ctx *c, union epoll_ref ref, if (s < 0) goto cancel; + conn = (struct tcp_tap_conn *)flow; + conn->listening_sock = ref.fd; + tcp_sock_set_nodelay(s); /* FIXME: If useful: when the listening port has a specific bound @@ -2634,3 +2688,868 @@ void tcp_timer(struct ctx *c, const struct timespec *now) if (c->mode == MODE_PASTA) tcp_splice_refill(c); } + +/** + * tcp_flow_is_established() - Was the connection established? 
Includes closing + * @conn: Pointer to the TCP connection structure + * + * Return: true if the connection was established, false otherwise + */ +bool tcp_flow_is_established(const struct tcp_tap_conn *conn) +{ + return conn->events & ESTABLISHED; +} + +/** + * tcp_flow_repair_on() - Enable repair mode for a single TCP flow + * @c: Execution context + * @conn: Pointer to the TCP connection structure + * + * Return: 0 on success, negative error code on failure + */ +int tcp_flow_repair_on(struct ctx *c, const struct tcp_tap_conn *conn) +{ + int rc = 0; + + if ((rc = repair_set(c, conn->sock, TCP_REPAIR_ON))) + err("Failed to set TCP_REPAIR"); + + return rc; +} + +/** + * tcp_flow_repair_off() - Clear repair mode for a single TCP flow + * @c: Execution context + * @conn: Pointer to the TCP connection structure + * + * Return: 0 on success, negative error code on failure + */ +int tcp_flow_repair_off(struct ctx *c, const struct tcp_tap_conn *conn) +{ + int rc = 0; + + if ((rc = repair_set(c, conn->sock, TCP_REPAIR_OFF))) + err("Failed to clear TCP_REPAIR"); + + return rc; +} + +/** + * tcp_flow_dump_tinfo() - Dump window scale, tcpi_state, tcpi_options + * @c: Execution context + * @t: Extended migration data + * + * Return: 0 on success, negative error code on failure + */ +static int tcp_flow_dump_tinfo(int s, struct tcp_tap_transfer_ext *t) +{ + struct tcp_info tinfo; + socklen_t sl; + + sl = sizeof(tinfo); + if (getsockopt(s, SOL_TCP, TCP_INFO, &tinfo, &sl)) { + int rc = -errno; + err_perror("Querying TCP_INFO, socket %i", s); + return rc; + } + + t->snd_ws = tinfo.tcpi_snd_wscale; + t->rcv_ws = tinfo.tcpi_rcv_wscale; + t->tcpi_state = tinfo.tcpi_state; + t->tcpi_options = tinfo.tcpi_options; + + return 0; +} + +/** + * tcp_flow_dump_mss() - Dump MSS clamp (not current MSS) via TCP_MAXSEG + * @c: Execution context + * @t: Extended migration data + * + * Return: 0 on success, negative error code on failure + */ +static int tcp_flow_dump_mss(int s, struct 
tcp_tap_transfer_ext *t) +{ + socklen_t sl = sizeof(t->mss); + + if (getsockopt(s, SOL_TCP, TCP_MAXSEG, &t->mss, &sl)) { + int rc = -errno; + err_perror("Getting MSS, socket %i", s); + return rc; + } + + return 0; +} + +/** + * tcp_flow_dump_wnd() - Dump current tcp_repair_window parameters + * @c: Execution context + * @t: Extended migration data + * + * Return: 0 on success, negative error code on failure + */ +static int tcp_flow_dump_wnd(int s, struct tcp_tap_transfer_ext *t) +{ + struct tcp_repair_window wnd; + socklen_t sl = sizeof(wnd); + + if (getsockopt(s, IPPROTO_TCP, TCP_REPAIR_WINDOW, &wnd, &sl)) { + int rc = -errno; + err_perror("Getting window repair data, socket %i", s); + return rc; + } + + t->snd_wl1 = wnd.snd_wl1; + t->snd_wnd = wnd.snd_wnd; + t->max_window = wnd.max_window; + t->rcv_wnd = wnd.rcv_wnd; + t->rcv_wup = wnd.rcv_wup; + + /* If we received a FIN, we also need to adjust window parameters. + * + * This must be called after tcp_flow_dump_tinfo(), for t->tcpi_state. 
+ */ + if (t->tcpi_state == TCP_CLOSE_WAIT || t->tcpi_state == TCP_LAST_ACK) { + t->rcv_wup--; + t->rcv_wnd++; + } + + return 0; +} + +/** + * tcp_flow_repair_wnd() - Restore window parameters from extended data + * @c: Execution context + * @t: Extended migration data + * + * Return: 0 on success, negative error code on failure + */ +static int tcp_flow_repair_wnd(int s, const struct tcp_tap_transfer_ext *t) +{ + struct tcp_repair_window wnd; + + wnd.snd_wl1 = t->snd_wl1; + wnd.snd_wnd = t->snd_wnd; + wnd.max_window = t->max_window; + wnd.rcv_wnd = t->rcv_wnd; + wnd.rcv_wup = t->rcv_wup; + + if (setsockopt(s, IPPROTO_TCP, TCP_REPAIR_WINDOW, &wnd, sizeof(wnd))) { + int rc = -errno; + err_perror("Setting window data, socket %i", s); + return rc; + } + + return 0; +} + +/** + * tcp_flow_select_queue() - Select queue (receive or send) for next operation + * @s: Socket + * @queue: TCP_RECV_QUEUE or TCP_SEND_QUEUE + * + * Return: 0 on success, negative error code on failure + */ +static int tcp_flow_select_queue(int s, int queue) +{ + if (setsockopt(s, SOL_TCP, TCP_REPAIR_QUEUE, &queue, sizeof(queue))) { + int rc = -errno; + err_perror("Selecting TCP_SEND_QUEUE, socket %i", s); + return rc; + } + + return 0; +} + +/** + * tcp_flow_dump_sndqueue() - Dump send queue, length of sent and not sent data + * @s: Socket + * @t: Extended migration data + * + * Return: 0 on success, negative error code on failure + * + * #syscalls:vu ioctl + */ +static int tcp_flow_dump_sndqueue(int s, struct tcp_tap_transfer_ext *t) +{ + ssize_t rc; + + if (ioctl(s, SIOCOUTQ, &t->sndq) < 0) { + rc = -errno; + err_perror("Getting send queue size, socket %i", s); + return rc; + } + + if (ioctl(s, SIOCOUTQNSD, &t->notsent) < 0) { + rc = -errno; + err_perror("Getting not sent count, socket %i", s); + return rc; + } + + /* If we sent a FIN, SIOCOUTQ and SIOCOUTQNSD are one greater than the + * actual pending queue length, because they are based on the sequence + * numbers, not directly on the buffer 
contents. + * + * This must be called after tcp_flow_dump_tinfo(), for t->tcpi_state. + */ + if (t->tcpi_state == TCP_FIN_WAIT1 || t->tcpi_state == TCP_FIN_WAIT2 || + t->tcpi_state == TCP_LAST_ACK || t->tcpi_state == TCP_CLOSING) { + if (t->sndq) + t->sndq--; + if (t->notsent) + t->notsent--; + } + + if (t->notsent > t->sndq) { + err("Invalid notsent count socket %i, send: %u, not sent: %u", + s, t->sndq, t->notsent); + return -EINVAL; + } + + if (t->sndq > TCP_MIGRATE_SND_QUEUE_MAX) { + err("Send queue too large to migrate socket %i: %u bytes", + s, t->sndq); + return -ENOBUFS; + } + + rc = recv(s, tcp_migrate_snd_queue, + MIN(t->sndq, TCP_MIGRATE_SND_QUEUE_MAX), MSG_PEEK); + if (rc < 0) { + if (errno == EAGAIN) { /* EAGAIN means empty */ + rc = 0; + } else { + rc = -errno; + err_perror("Can't read send queue, socket %i", s); + return rc; + } + } + + if ((uint32_t)rc < t->sndq) { + err("Short read migrating send queue"); + return -ENXIO; + } + + t->notsent = MIN(t->notsent, t->sndq); + + return 0; +} + +/** + * tcp_flow_repair_queue() - Restore contents of a given (pre-selected) queue + * @s: Socket + * @len: Length of data to be restored + * @buf: Buffer with content of pending data queue + * + * Return: 0 on success, negative error code on failure + */ +static int tcp_flow_repair_queue(int s, size_t len, uint8_t *buf) +{ + size_t chunk = len; + uint8_t *p = buf; + + while (len > 0) { + ssize_t rc = send(s, p, MIN(len, chunk), 0); + + if (rc < 0) { + if ((errno == ENOBUFS || errno == ENOMEM) && + chunk >= TCP_MIGRATE_RESTORE_CHUNK_MIN) { + chunk /= 2; + continue; + } + + rc = -errno; + err_perror("Can't write queue, socket %i", s); + return rc; + } + + len -= rc; + p += rc; + } + + return 0; +} + +/** + * tcp_flow_dump_seq() - Dump current sequence of pre-selected queue + * @s: Socket + * @v: Sequence value, set on return + * + * Return: 0 on success, negative error code on failure + */ +static int tcp_flow_dump_seq(int s, uint32_t *v) +{ + socklen_t sl = 
sizeof(*v); + + if (getsockopt(s, SOL_TCP, TCP_QUEUE_SEQ, v, &sl)) { + int rc = -errno; + err_perror("Dumping sequence, socket %i", s); + return rc; + } + + return 0; +} + +/** + * tcp_flow_repair_seq() - Restore sequence for pre-selected queue + * @s: Socket + * @v: Sequence value to be set + * + * Return: 0 on success, negative error code on failure + */ +static int tcp_flow_repair_seq(int s, const uint32_t *v) +{ + if (setsockopt(s, SOL_TCP, TCP_QUEUE_SEQ, v, sizeof(*v))) { + int rc = -errno; + err_perror("Setting sequence, socket %i", s); + return rc; + } + + return 0; +} + +/** + * tcp_flow_dump_rcvqueue() - Dump receive queue and its length, seal/block it + * @s: Socket + * @t: Extended migration data + * + * Return: 0 on success, negative error code on failure + * + * #syscalls:vu ioctl + */ +static int tcp_flow_dump_rcvqueue(int s, struct tcp_tap_transfer_ext *t) +{ + ssize_t rc; + + if (ioctl(s, SIOCINQ, &t->rcvq) < 0) { + rc = -errno; + err_perror("Get receive queue size, socket %i", s); + return rc; + } + + /* If we received a FIN, SIOCINQ is one greater than the actual number + * of bytes on the queue, because it's based on the sequence number + * rather than directly on the buffer contents. + * + * This must be called after tcp_flow_dump_tinfo(), for t->tcpi_state. 
+ */ + if (t->rcvq && + (t->tcpi_state == TCP_CLOSE_WAIT || t->tcpi_state == TCP_LAST_ACK)) + t->rcvq--; + + if (t->rcvq > TCP_MIGRATE_RCV_QUEUE_MAX) { + err("Receive queue too large to migrate socket %i: %u bytes", + s, t->rcvq); + return -ENOBUFS; + } + + rc = recv(s, tcp_migrate_rcv_queue, t->rcvq, MSG_PEEK); + if (rc < 0) { + if (errno == EAGAIN) { /* EAGAIN means empty */ + rc = 0; + } else { + rc = -errno; + err_perror("Can't read receive queue for socket %i", s); + return rc; + } + } + + if ((uint32_t)rc < t->rcvq) { + err("Short read migrating receive queue"); + return -ENXIO; + } + + return 0; +} + +/** + * tcp_flow_repair_opt() - Set repair "options" (MSS, scale, SACK, timestamps) + * @s: Socket + * @t: Extended migration data + * + * Return: 0 on success, negative error code on failure + */ +int tcp_flow_repair_opt(int s, const struct tcp_tap_transfer_ext *t) +{ + const struct tcp_repair_opt opts[] = { + { TCPOPT_WINDOW, t->snd_ws + (t->rcv_ws << 16) }, + { TCPOPT_MAXSEG, t->mss }, + { TCPOPT_SACK_PERMITTED, 0 }, + { TCPOPT_TIMESTAMP, 0 }, + }; + socklen_t sl; + + sl = sizeof(opts[0]) * (2 + + !!(t->tcpi_options & TCPI_OPT_SACK) + + !!(t->tcpi_options & TCPI_OPT_TIMESTAMPS)); + + if (setsockopt(s, SOL_TCP, TCP_REPAIR_OPTIONS, opts, sl)) { + int rc = -errno; + err_perror("Setting repair options, socket %i", s); + return rc; + } + + return 0; +} + +/** + * tcp_flow_migrate_source() - Send data (flow table) for flow, close listening + * @fd: Descriptor for state migration + * @conn: Pointer to the TCP connection structure + * + * Return: 0 on success, negative error code on failure + */ +int tcp_flow_migrate_source(int fd, struct tcp_tap_conn *conn) +{ + struct tcp_tap_transfer t = { + .retrans = conn->retrans, + .ws_from_tap = conn->ws_from_tap, + .ws_to_tap = conn->ws_to_tap, + .events = conn->events, + + .tap_mss = htonl(MSS_GET(conn)), + + .sndbuf = htonl(conn->sndbuf), + + .flags = conn->flags, + .seq_dup_ack_approx = conn->seq_dup_ack_approx, + + 
.wnd_from_tap = htons(conn->wnd_from_tap), + .wnd_to_tap = htons(conn->wnd_to_tap), + + .seq_to_tap = htonl(conn->seq_to_tap), + .seq_ack_from_tap = htonl(conn->seq_ack_from_tap), + .seq_from_tap = htonl(conn->seq_from_tap), + .seq_ack_to_tap = htonl(conn->seq_ack_to_tap), + .seq_init_from_tap = htonl(conn->seq_init_from_tap), + }; + + memcpy(&t.pif, conn->f.pif, sizeof(t.pif)); + memcpy(&t.side, conn->f.side, sizeof(t.side)); + + if (write_all_buf(fd, &t, sizeof(t))) { + int rc = -errno; + err_perror("Can't write migration data, socket %i", conn->sock); + return rc; + } + + if (conn->listening_sock != -1 && !fcntl(conn->listening_sock, F_GETFD)) + close(conn->listening_sock); + + return 0; +} + +/** + * tcp_flow_migrate_source_ext() - Dump queues, close sockets, send final data + * @fd: Descriptor for state migration + * @fidx: Flow index + * @conn: Pointer to the TCP connection structure + * + * Return: 0 on success, negative (not -EIO) on failure, -EIO on sending failure + */ +int tcp_flow_migrate_source_ext(int fd, int fidx, + const struct tcp_tap_conn *conn) +{ + uint32_t peek_offset = conn->seq_to_tap - conn->seq_ack_from_tap; + struct tcp_tap_transfer_ext *t = &migrate_ext[fidx]; + int s = conn->sock; + int rc; + + /* Disable SO_PEEK_OFF, it will make accessing the queues in repair mode + * weird. 
+ */ + if (tcp_set_peek_offset(s, -1)) { + rc = -errno; + goto fail; + } + + if ((rc = tcp_flow_dump_tinfo(s, t))) + goto fail; + + if ((rc = tcp_flow_dump_mss(s, t))) + goto fail; + + if ((rc = tcp_flow_dump_wnd(s, t))) + goto fail; + + if ((rc = tcp_flow_select_queue(s, TCP_SEND_QUEUE))) + goto fail; + + if ((rc = tcp_flow_dump_sndqueue(s, t))) + goto fail; + + if ((rc = tcp_flow_dump_seq(s, &t->seq_snd))) + goto fail; + + if ((rc = tcp_flow_select_queue(s, TCP_RECV_QUEUE))) + goto fail; + + if ((rc = tcp_flow_dump_rcvqueue(s, t))) + goto fail; + + if ((rc = tcp_flow_dump_seq(s, &t->seq_rcv))) + goto fail; + + close(s); + + /* Adjustments unrelated to FIN segments: sequence numbers we dumped are + * based on the end of the queues. + */ + t->seq_rcv -= t->rcvq; + t->seq_snd -= t->sndq; + + debug("Extended migration data, socket %i sequences send %u receive %u", + s, t->seq_snd, t->seq_rcv); + debug(" pending queues: send %u not sent %u receive %u", + t->sndq, t->notsent, t->rcvq); + debug(" window: snd_wl1 %u snd_wnd %u max %u rcv_wnd %u rcv_wup %u", + t->snd_wl1, t->snd_wnd, t->max_window, t->rcv_wnd, t->rcv_wup); + debug(" SO_PEEK_OFF %s offset=%"PRIu32, + peek_offset_cap ? 
"enabled" : "disabled", peek_offset); + + /* Endianness fix-ups */ + t->seq_snd = htonl(t->seq_snd); + t->seq_rcv = htonl(t->seq_rcv); + t->sndq = htonl(t->sndq); + t->notsent = htonl(t->notsent); + t->rcvq = htonl(t->rcvq); + + t->snd_wl1 = htonl(t->snd_wl1); + t->snd_wnd = htonl(t->snd_wnd); + t->max_window = htonl(t->max_window); + t->rcv_wnd = htonl(t->rcv_wnd); + t->rcv_wup = htonl(t->rcv_wup); + + if (write_all_buf(fd, t, sizeof(*t))) { + err_perror("Failed to write extended data, socket %i", s); + return -EIO; + } + + if (write_all_buf(fd, tcp_migrate_snd_queue, ntohl(t->sndq))) { + err_perror("Failed to write send queue data, socket %i", s); + return -EIO; + } + + if (write_all_buf(fd, tcp_migrate_rcv_queue, ntohl(t->rcvq))) { + err_perror("Failed to write receive queue data, socket %i", s); + return -EIO; + } + + return 0; + +fail: + /* For any type of failure dumping data, write an invalid extended data + * descriptor that allows us to keep the stream in sync, but tells the + * target to skip the flow. If we fail to transfer data, that's fatal: + * return -EIO in that case (and only in that case). + */ + t->tcpi_state = 0; /* Not defined: tell the target to skip this flow */ + + if (write_all_buf(fd, t, sizeof(*t))) { + err_perror("Failed to write extended data, socket %i", s); + return -EIO; + } + + if (rc == -EIO) /* but not a migration data transfer failure */ + return -ENODATA; + + return rc; +} + +/** + * tcp_flow_repair_socket() - Open and bind socket, request repair mode + * @c: Execution context + * @conn: Pointer to the TCP connection structure + * + * Return: 0 on success, negative error code on failure + */ +int tcp_flow_repair_socket(struct ctx *c, struct tcp_tap_conn *conn) +{ + sa_family_t af = CONN_V4(conn) ? 
AF_INET : AF_INET6; + const struct flowside *sockside = HOSTFLOW(conn); + union sockaddr_inany a; + socklen_t sl; + int s, rc; + + pif_sockaddr(c, &a, &sl, PIF_HOST, &sockside->oaddr, sockside->oport); + + if ((conn->sock = socket(af, SOCK_STREAM | SOCK_NONBLOCK | SOCK_CLOEXEC, + IPPROTO_TCP)) < 0) { + rc = -errno; + err_perror("Failed to create socket for migrated flow"); + return rc; + } + s = conn->sock; + + if (setsockopt(s, SOL_SOCKET, SO_REUSEADDR, &(int){ 1 }, sizeof(int))) + debug_perror("Setting SO_REUSEADDR on socket %i", s); + + tcp_sock_set_nodelay(s); + + if ((rc = bind(s, &a.sa, sizeof(a)))) { + err_perror("Failed to bind socket for migrated flow"); + goto err; + } + + if ((rc = tcp_flow_repair_on(c, conn))) + goto err; + + return 0; + +err: + close(s); + conn->sock = -1; + return rc; +} + +/** + * tcp_flow_repair_connect() - Connect socket in repair mode, then turn it off + * @c: Execution context + * @conn: Pointer to the TCP connection structure + * + * Return: 0 on success, negative error code on failure + */ +static int tcp_flow_repair_connect(const struct ctx *c, + struct tcp_tap_conn *conn) +{ + const struct flowside *tgt = HOSTFLOW(conn); + int rc; + + rc = flowside_connect(c, conn->sock, PIF_HOST, tgt); + if (rc) { + rc = -errno; + err_perror("Failed to connect migrated socket %i", conn->sock); + return rc; + } + + conn->in_epoll = 0; + conn->timer = -1; + conn->listening_sock = -1; + + return 0; +} + +/** + * tcp_flow_migrate_target() - Receive data (flow table part) for flow, insert + * @c: Execution context + * @fd: Descriptor for state migration + * + * Return: 0 on success, negative on fatal failure, but 0 on single flow failure + */ +int tcp_flow_migrate_target(struct ctx *c, int fd) +{ + struct tcp_tap_transfer t; + struct tcp_tap_conn *conn; + union flow *flow; + int rc; + + if (!(flow = flow_alloc())) { + err("Flow table full on migration target"); + return 0; + } + + if (read_all_buf(fd, &t, sizeof(t))) { + flow_alloc_cancel(flow); 
+ err_perror("Failed to receive migration data"); + return -errno; + } + + flow->f.state = FLOW_STATE_TGT; + memcpy(&flow->f.pif, &t.pif, sizeof(flow->f.pif)); + memcpy(&flow->f.side, &t.side, sizeof(flow->f.side)); + conn = FLOW_SET_TYPE(flow, FLOW_TCP, tcp); + + conn->retrans = t.retrans; + conn->ws_from_tap = t.ws_from_tap; + conn->ws_to_tap = t.ws_to_tap; + conn->events = t.events; + + conn->sndbuf = htonl(t.sndbuf); + + conn->flags = t.flags; + conn->seq_dup_ack_approx = t.seq_dup_ack_approx; + + MSS_SET(conn, ntohl(t.tap_mss)); + + conn->wnd_from_tap = ntohs(t.wnd_from_tap); + conn->wnd_to_tap = ntohs(t.wnd_to_tap); + + conn->seq_to_tap = ntohl(t.seq_to_tap); + conn->seq_ack_from_tap = ntohl(t.seq_ack_from_tap); + conn->seq_from_tap = ntohl(t.seq_from_tap); + conn->seq_ack_to_tap = ntohl(t.seq_ack_to_tap); + conn->seq_init_from_tap = ntohl(t.seq_init_from_tap); + + if ((rc = tcp_flow_repair_socket(c, conn))) { + flow_err(flow, "Can't set up socket: %s, drop", strerror_(rc)); + flow_alloc_cancel(flow); + return 0; + } + + flow_hash_insert(c, TAP_SIDX(conn)); + FLOW_ACTIVATE(conn); + + return 0; +} + +/** + * tcp_flow_migrate_target_ext() - Receive extended data for flow, set, connect + * @c: Execution context + * @flow: Existing flow for this connection data + * @fd: Descriptor for state migration + * + * Return: 0 on success, negative on fatal failure, but 0 on single flow failure + */ +int tcp_flow_migrate_target_ext(struct ctx *c, union flow *flow, int fd) +{ + struct tcp_tap_conn *conn = &flow->tcp; + uint32_t peek_offset = conn->seq_to_tap - conn->seq_ack_from_tap; + struct tcp_tap_transfer_ext t; + int s = conn->sock, rc; + + if (read_all_buf(fd, &t, sizeof(t))) { + rc = -errno; + err_perror("Failed to read extended data for socket %i", s); + return rc; + } + + if (!t.tcpi_state) { /* Source wants us to skip this flow */ + flow_err(flow, "Dropping as requested by source"); + goto fail; + } + + /* Endianness fix-ups */ + t.seq_snd = ntohl(t.seq_snd); + 
t.seq_rcv = ntohl(t.seq_rcv); + t.sndq = ntohl(t.sndq); + t.notsent = ntohl(t.notsent); + t.rcvq = ntohl(t.rcvq); + + t.snd_wl1 = ntohl(t.snd_wl1); + t.snd_wnd = ntohl(t.snd_wnd); + t.max_window = ntohl(t.max_window); + t.rcv_wnd = ntohl(t.rcv_wnd); + t.rcv_wup = ntohl(t.rcv_wup); + + debug("Extended migration data, socket %i sequences send %u receive %u", + s, t.seq_snd, t.seq_rcv); + debug(" pending queues: send %u not sent %u receive %u", + t.sndq, t.notsent, t.rcvq); + debug(" window: snd_wl1 %u snd_wnd %u max %u rcv_wnd %u rcv_wup %u", + t.snd_wl1, t.snd_wnd, t.max_window, t.rcv_wnd, t.rcv_wup); + debug(" SO_PEEK_OFF %s offset=%"PRIu32, + peek_offset_cap ? "enabled" : "disabled", peek_offset); + + if (t.sndq > TCP_MIGRATE_SND_QUEUE_MAX || t.notsent > t.sndq || + t.rcvq > TCP_MIGRATE_RCV_QUEUE_MAX) { + err("Bad queues socket %i, send: %u, not sent: %u, receive: %u", + s, t.sndq, t.notsent, t.rcvq); + return -EINVAL; + } + + if (read_all_buf(fd, tcp_migrate_snd_queue, t.sndq)) { + rc = -errno; + err_perror("Failed to read send queue data, socket %i", s); + return rc; + } + + if (read_all_buf(fd, tcp_migrate_rcv_queue, t.rcvq)) { + rc = -errno; + err_perror("Failed to read receive queue data, socket %i", s); + return rc; + } + + if (tcp_flow_select_queue(s, TCP_SEND_QUEUE)) + goto fail; + + if (tcp_flow_repair_seq(s, &t.seq_snd)) + goto fail; + + if (tcp_flow_select_queue(s, TCP_RECV_QUEUE)) + goto fail; + + if (tcp_flow_repair_seq(s, &t.seq_rcv)) + goto fail; + + if (tcp_flow_repair_connect(c, conn)) + goto fail; + + if (tcp_flow_repair_queue(s, t.rcvq, tcp_migrate_rcv_queue)) + goto fail; + + if (tcp_flow_select_queue(s, TCP_SEND_QUEUE)) + goto fail; + + if (tcp_flow_repair_queue(s, t.sndq - t.notsent, + tcp_migrate_snd_queue)) + goto fail; + + if (tcp_flow_repair_opt(s, &t)) + goto fail; + + /* If we sent a FIN and it was acknowledged (TCP_FIN_WAIT2), don't + * send it out, because we already sent it for sure. 
+ * + * Call shutdown(x, SHUT_WR) in repair mode, so that we move to + * FIN_WAIT_1 (tcp_shutdown()) without sending anything + * (goto in tcp_write_xmit()). + */ + if (t.tcpi_state == TCP_FIN_WAIT2) { + int v; + + v = TCP_SEND_QUEUE; + if (setsockopt(s, SOL_TCP, TCP_REPAIR_QUEUE, &v, sizeof(v))) + debug_perror("Selecting repair queue, socket %i", s); + else + shutdown(s, SHUT_WR); + } + + if (tcp_flow_repair_wnd(s, &t)) + goto fail; + + tcp_flow_repair_off(c, conn); + repair_flush(c); + + if (t.notsent) { + if (tcp_flow_repair_queue(s, t.notsent, + tcp_migrate_snd_queue + + (t.sndq - t.notsent))) { + /* This sometimes seems to fail for unclear reasons. + * Don't fail the whole migration, just reset the flow + * and carry on to the next one. + */ + goto fail; + } + } + + /* If we sent a FIN but it wasn't acknowledged yet (TCP_FIN_WAIT1), send + * it out, because we don't know if we already sent it. + * + * Call shutdown(x, SHUT_WR) *not* in repair mode, which moves us to + * TCP_FIN_WAIT1. 
+ */ + if (t.tcpi_state == TCP_FIN_WAIT1) + shutdown(s, SHUT_WR); + + if (tcp_set_peek_offset(conn->sock, peek_offset)) + goto fail; + + tcp_send_flag(c, conn, ACK); + tcp_data_from_sock(c, conn); + + if ((rc = tcp_epoll_ctl(c, conn))) { + debug("Failed to subscribe to epoll for migrated socket %i: %s", + conn->sock, strerror_(-rc)); + goto fail; + } + + return 0; + +fail: + tcp_flow_repair_off(c, conn); + repair_flush(c); + + conn->flags = 0; /* Not waiting for ACK, don't schedule timer */ + tcp_rst(c, conn); + + return 0; +} diff --git a/tcp_conn.h b/tcp_conn.h index 8c20805..42dff48 100644 --- a/tcp_conn.h +++ b/tcp_conn.h @@ -19,6 +19,7 @@ * @tap_mss: MSS advertised by tap/guest, rounded to 2 ^ TCP_MSS_BITS * @sock: Socket descriptor number * @events: Connection events, implying connection states + * @listening_sock: Listening socket this socket was accept()ed from, or -1 * @timer: timerfd descriptor for timeout events * @flags: Connection flags representing internal attributes * @sndbuf: Sending buffer in kernel, rounded to 2 ^ SNDBUF_BITS @@ -68,6 +69,7 @@ struct tcp_tap_conn { #define CONN_STATE_BITS /* Setting these clears other flags */ \ (SOCK_ACCEPTED | TAP_SYN_RCVD | ESTABLISHED) + int listening_sock; int timer :FD_REF_BITS; @@ -96,6 +98,93 @@ struct tcp_tap_conn { uint32_t seq_init_from_tap; }; +/** + * struct tcp_tap_transfer - Migrated TCP data, flow table part, network order + * @pif: Interfaces for each side of the flow + * @side: Addresses and ports for each side of the flow + * @retrans: Number of retransmissions occurred due to ACK_TIMEOUT + * @ws_from_tap: Window scaling factor advertised from tap/guest + * @ws_to_tap: Window scaling factor advertised to tap/guest + * @events: Connection events, implying connection states + * @tap_mss: MSS advertised by tap/guest, rounded to 2 ^ TCP_MSS_BITS + * @sndbuf: Sending buffer in kernel, rounded to 2 ^ SNDBUF_BITS + * @flags: Connection flags representing internal attributes + * @seq_dup_ack_approx: 
Last duplicate ACK number sent to tap + * @wnd_from_tap: Last window size from tap, unscaled (as received) + * @wnd_to_tap: Sending window advertised to tap, unscaled (as sent) + * @seq_to_tap: Next sequence for packets to tap + * @seq_ack_from_tap: Last ACK number received from tap + * @seq_from_tap: Next sequence for packets from tap (not actually sent) + * @seq_ack_to_tap: Last ACK number sent to tap + * @seq_init_from_tap: Initial sequence number from tap +*/ +struct tcp_tap_transfer { + uint8_t pif[SIDES]; + struct flowside side[SIDES]; + + uint8_t retrans; + uint8_t ws_from_tap; + uint8_t ws_to_tap; + uint8_t events; + + uint32_t tap_mss; + + uint32_t sndbuf; + + uint8_t flags; + uint8_t seq_dup_ack_approx; + + uint16_t wnd_from_tap; + uint16_t wnd_to_tap; + + uint32_t seq_to_tap; + uint32_t seq_ack_from_tap; + uint32_t seq_from_tap; + uint32_t seq_ack_to_tap; + uint32_t seq_init_from_tap; +} __attribute__((packed, aligned(__alignof__(uint32_t)))); + +/** + * struct tcp_tap_transfer_ext - Migrated TCP data, outside flow, network order + * @seq_snd: Socket-side send sequence + * @seq_rcv: Socket-side receive sequence + * @sndq: Length of pending send queue (unacknowledged / not sent) + * @notsent: Part of pending send queue that wasn't sent out yet + * @rcvq: Length of pending receive queue + * @mss: Socket-side MSS clamp + * @snd_wl1: Next sequence used in window probe (next sequence - 1) + * @snd_wnd: Socket-side sending window + * @max_window: Window clamp + * @rcv_wnd: Socket-side receive window + * @rcv_wup: rcv_nxt on last window update sent + * @snd_ws: Window scaling factor, send + * @rcv_ws: Window scaling factor, receive + * @tcpi_state: Connection state in TCP_INFO style (enum, tcp_states.h) + * @tcpi_options: TCPI_OPT_* constants (timestamps, selective ACK) + */ +struct tcp_tap_transfer_ext { + uint32_t seq_snd; + uint32_t seq_rcv; + + uint32_t sndq; + uint32_t notsent; + uint32_t rcvq; + + uint32_t mss; + + /* We can't just use struct 
tcp_repair_window: we need network order */ + uint32_t snd_wl1; + uint32_t snd_wnd; + uint32_t max_window; + uint32_t rcv_wnd; + uint32_t rcv_wup; + + uint8_t snd_ws; + uint8_t rcv_ws; + uint8_t tcpi_state; + uint8_t tcpi_options; +} __attribute__((packed, aligned(__alignof__(uint32_t)))); + /** * struct tcp_splice_conn - Descriptor for a spliced TCP connection * @f: Generic flow information @@ -140,6 +229,20 @@ extern int init_sock_pool4 [TCP_SOCK_POOL_SIZE]; extern int init_sock_pool6 [TCP_SOCK_POOL_SIZE]; bool tcp_flow_defer(const struct tcp_tap_conn *conn); + +int tcp_flow_repair_on(struct ctx *c, const struct tcp_tap_conn *conn); +int tcp_flow_repair_off(struct ctx *c, const struct tcp_tap_conn *conn); + +int tcp_flow_migrate_shrink_window(int fidx, const struct tcp_tap_conn *conn); +int tcp_flow_migrate_source(int fd, struct tcp_tap_conn *conn); +int tcp_flow_migrate_source_ext(int fd, int fidx, + const struct tcp_tap_conn *conn); + +int tcp_flow_migrate_target(struct ctx *c, int fd); +int tcp_flow_migrate_target_ext(struct ctx *c, union flow *flow, int fd); + +bool tcp_flow_is_established(const struct tcp_tap_conn *conn); + bool tcp_splice_flow_defer(struct tcp_splice_conn *conn); void tcp_splice_timer(const struct ctx *c, struct tcp_splice_conn *conn); int tcp_conn_pool_sock(int pool[]); From a1e48a02ff3550eb7875a7df6726086e9b3a1213 Mon Sep 17 00:00:00 2001 From: Stefano Brivio <sbrivio@redhat.com> Date: Thu, 13 Feb 2025 23:14:14 +1100 Subject: [PATCH 238/382] test: Add migration tests PCAP=1 ./run migrate/bidirectional gives an overview of how the whole thing is working. Add 12 tests in total, checking basic functionality with and without flows in both directions, with and without sockets in half-closed states (both inbound and outbound), migration behaviour under traffic flood, under traffic flood with > 253 flows, and strict checking of sequences under flood with ramp patterns in both directions. 
These tests need preparation and teardown for each case, as we need to restore the source guest in its own context and pane before we can test again. Eventually, we could consider alternating source and target so that we don't need to restart from scratch every time, but that's beyond the scope of this initial test implementation. Trick: './run migrate/*' runs all the tests with preparation and teardown steps. Co-authored-by: David Gibson <david@gibson.dropbear.id.au> Reviewed-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- test/lib/layout | 57 +++++++++++++- test/lib/setup | 138 ++++++++++++++++++++++++++++++++- test/lib/test | 48 ++++++++++++ test/migrate/basic | 59 ++++++++++++++ test/migrate/basic_fin | 62 +++++++++++++++ test/migrate/bidirectional | 64 +++++++++++++++ test/migrate/bidirectional_fin | 64 +++++++++++++++ test/migrate/iperf3_bidir6 | 58 ++++++++++++++ test/migrate/iperf3_in4 | 50 ++++++++++++ test/migrate/iperf3_in6 | 58 ++++++++++++++ test/migrate/iperf3_many_out6 | 60 ++++++++++++++ test/migrate/iperf3_out4 | 47 +++++++++++ test/migrate/iperf3_out6 | 58 ++++++++++++++ test/migrate/rampstream_in | 12 +-- test/migrate/rampstream_out | 8 +- test/run | 42 +++++++++- 16 files changed, 871 insertions(+), 14 deletions(-) create mode 100644 test/migrate/basic create mode 100644 test/migrate/basic_fin create mode 100644 test/migrate/bidirectional create mode 100644 test/migrate/bidirectional_fin create mode 100644 test/migrate/iperf3_bidir6 create mode 100644 test/migrate/iperf3_in4 create mode 100644 test/migrate/iperf3_in6 create mode 100644 test/migrate/iperf3_many_out6 create mode 100644 test/migrate/iperf3_out4 create mode 100644 test/migrate/iperf3_out6 diff --git a/test/lib/layout b/test/lib/layout index 4d03572..fddcdc4 100644 --- a/test/lib/layout +++ b/test/lib/layout @@ -135,7 +135,7 @@ layout_two_guests() { get_info_cols pane_watch_contexts ${PANE_GUEST_1} "guest #1 in namespace #1" 
qemu_1 guest_1 - pane_watch_contexts ${PANE_GUEST_2} "guest #2 in namespace #2" qemu_2 guest_2 + pane_watch_contexts ${PANE_GUEST_2} "guest #2 in namespace #1" qemu_2 guest_2 tmux send-keys -l -t ${PANE_INFO} 'while cat '"$STATEBASE/log_pipe"'; do :; done' tmux send-keys -t ${PANE_INFO} -N 100 C-m @@ -143,13 +143,66 @@ layout_two_guests() { pane_watch_contexts ${PANE_HOST} host host pane_watch_contexts ${PANE_PASST_1} "passt #1 in namespace #1" pasta_1 passt_1 - pane_watch_contexts ${PANE_PASST_2} "passt #2 in namespace #2" pasta_2 passt_2 + pane_watch_contexts ${PANE_PASST_2} "passt #2 in namespace #1" pasta_1 passt_2 info_layout "two guests, two passt instances, in namespaces" sleep 1 } +# layout_migrate() - Two guest panes, two passt panes, two passt-repair panes, +# plus host and log +layout_migrate() { + sleep 1 + + tmux kill-pane -a -t 0 + cmd_write 0 clear + + tmux split-window -v -t passt_test + tmux split-window -h -l '33%' + tmux split-window -h -t passt_test:1.1 + + tmux split-window -h -l '35%' -t passt_test:1.0 + tmux split-window -v -t passt_test:1.0 + + tmux split-window -v -t passt_test:1.4 + tmux split-window -v -t passt_test:1.6 + + tmux split-window -v -t passt_test:1.3 + + PANE_GUEST_1=0 + PANE_GUEST_2=1 + PANE_INFO=2 + PANE_MON=3 + PANE_HOST=4 + PANE_PASST_REPAIR_1=5 + PANE_PASST_1=6 + PANE_PASST_REPAIR_2=7 + PANE_PASST_2=8 + + get_info_cols + + pane_watch_contexts ${PANE_GUEST_1} "guest #1 in namespace #1" qemu_1 guest_1 + pane_watch_contexts ${PANE_GUEST_2} "guest #2 in namespace #2" qemu_2 guest_2 + + tmux send-keys -l -t ${PANE_INFO} 'while cat '"$STATEBASE/log_pipe"'; do :; done' + tmux send-keys -t ${PANE_INFO} -N 100 C-m + tmux select-pane -t ${PANE_INFO} -T "test log" + + pane_watch_contexts ${PANE_MON} "QEMU monitor" mon mon + + pane_watch_contexts ${PANE_HOST} host host + pane_watch_contexts ${PANE_PASST_REPAIR_1} "passt-repair #1 in namespace #1" repair_1 passt_repair_1 + pane_watch_contexts ${PANE_PASST_1} "passt #1 in namespace #1" 
pasta_1 passt_1 + + pane_watch_contexts ${PANE_PASST_REPAIR_2} "passt-repair #2 in namespace #2" repair_2 passt_repair_2 + pane_watch_contexts ${PANE_PASST_2} "passt #2 in namespace #2" pasta_2 passt_2 + + info_layout "two guests, two passt + passt-repair instances, in namespaces" + + sleep 1 +} + # layout_demo_pasta() - Four panes for pasta demo layout_demo_pasta() { sleep 1 diff --git a/test/lib/setup b/test/lib/setup index ee67152..575bc21 100755 --- a/test/lib/setup +++ b/test/lib/setup @@ -305,6 +305,117 @@ setup_two_guests() { context_setup_guest guest_2 ${GUEST_2_CID} } +# setup_migrate() - Set up two namespaces, run qemu, passt/passt-repair in both +setup_migrate() { + context_setup_host host + context_setup_host mon + context_setup_host pasta_1 + context_setup_host pasta_2 + + layout_migrate + + # Ports: + # + # guest #1 | guest #2 | ns #1 | host + # --------- |-----------|-----------|------------ + # 10001 as server | | to guest | to ns #1 + # 10002 | | as server | to ns #1 + # 10003 | | to init | as server + # 10004 | as server | to guest | to ns #1 + + __opts= + [ ${PCAP} -eq 1 ] && __opts="${__opts} -p ${LOGDIR}/pasta_1.pcap" + [ ${DEBUG} -eq 1 ] && __opts="${__opts} -d" + [ ${TRACE} -eq 1 ] && __opts="${__opts} --trace" + + __map_host4=192.0.2.1 + __map_host6=2001:db8:9a55::1 + __map_ns4=192.0.2.2 + __map_ns6=2001:db8:9a55::2 + + # Option 1: send stuff via spliced path in pasta + # context_run_bg pasta_1 "./pasta ${__opts} -P ${STATESETUP}/pasta_1.pid -t 10001,10002 -T 10003 -u 10001,10002 -U 10003 --config-net ${NSTOOL} hold ${STATESETUP}/ns1.hold" + # Option 2: send stuff via tap (--map-guest-addr) instead (useful to see capture of full migration) + context_run_bg pasta_1 "./pasta ${__opts} -P ${STATESETUP}/pasta_1.pid -t 10001,10002,10004 -T 10003 -u 10001,10002,10004 -U 10003 --map-guest-addr ${__map_host4} --map-guest-addr ${__map_host6} --config-net ${NSTOOL} hold ${STATESETUP}/ns1.hold" + context_setup_nstool passt_1 ${STATESETUP}/ns1.hold + 
context_setup_nstool passt_repair_1 ${STATESETUP}/ns1.hold + + context_setup_nstool passt_2 ${STATESETUP}/ns1.hold + context_setup_nstool passt_repair_2 ${STATESETUP}/ns1.hold + + context_setup_nstool qemu_1 ${STATESETUP}/ns1.hold + context_setup_nstool qemu_2 ${STATESETUP}/ns1.hold + + __ifname="$(context_run qemu_1 "ip -j link show | jq -rM '.[] | select(.link_type == \"ether\").ifname'")" + + sleep 1 + + __opts="--vhost-user" + [ ${PCAP} -eq 1 ] && __opts="${__opts} -p ${LOGDIR}/passt_1.pcap" + [ ${DEBUG} -eq 1 ] && __opts="${__opts} -d" + [ ${TRACE} -eq 1 ] && __opts="${__opts} --trace" + + context_run_bg passt_1 "./passt -s ${STATESETUP}/passt_1.socket -P ${STATESETUP}/passt_1.pid -f ${__opts} -t 10001 -u 10001" + wait_for [ -f "${STATESETUP}/passt_1.pid" ] + + context_run_bg passt_repair_1 "./passt-repair ${STATESETUP}/passt_1.socket.repair" + + __opts="--vhost-user" + [ ${PCAP} -eq 1 ] && __opts="${__opts} -p ${LOGDIR}/passt_2.pcap" + [ ${DEBUG} -eq 1 ] && __opts="${__opts} -d" + [ ${TRACE} -eq 1 ] && __opts="${__opts} --trace" + + context_run_bg passt_2 "./passt -s ${STATESETUP}/passt_2.socket -P ${STATESETUP}/passt_2.pid -f ${__opts} -t 10004 -u 10004" + wait_for [ -f "${STATESETUP}/passt_2.pid" ] + + context_run_bg passt_repair_2 "./passt-repair ${STATESETUP}/passt_2.socket.repair" + + __vmem="512M" # Keep migration fast + __qemu_netdev1=" \ + -chardev socket,id=c,path=${STATESETUP}/passt_1.socket \ + -netdev vhost-user,id=v,chardev=c \ + -device virtio-net,netdev=v \ + -object memory-backend-memfd,id=m,share=on,size=${__vmem} \ + -numa node,memdev=m" + __qemu_netdev2=" \ + -chardev socket,id=c,path=${STATESETUP}/passt_2.socket \ + -netdev vhost-user,id=v,chardev=c \ + -device virtio-net,netdev=v \ + -object memory-backend-memfd,id=m,share=on,size=${__vmem} \ + -numa node,memdev=m" + + GUEST_1_CID=94557 + context_run_bg qemu_1 'qemu-system-'"${QEMU_ARCH}" \ + ' -M accel=kvm:tcg' \ + ' -m '${__vmem}' -cpu host -smp '${VCPUS} \ + ' -kernel '"${KERNEL}" \ + 
' -initrd '${INITRAMFS}' -nographic -serial stdio' \ + ' -nodefaults' \ + ' -append "console=ttyS0 mitigations=off apparmor=0" ' \ + " ${__qemu_netdev1}" \ + " -pidfile ${STATESETUP}/qemu_1.pid" \ + " -device vhost-vsock-pci,guest-cid=$GUEST_1_CID" \ + " -monitor unix:${STATESETUP}/qemu_1_mon.sock,server,nowait" + + GUEST_2_CID=94558 + context_run_bg qemu_2 'qemu-system-'"${QEMU_ARCH}" \ + ' -M accel=kvm:tcg' \ + ' -m '${__vmem}' -cpu host -smp '${VCPUS} \ + ' -kernel '"${KERNEL}" \ + ' -initrd '${INITRAMFS}' -nographic -serial stdio' \ + ' -nodefaults' \ + ' -append "console=ttyS0 mitigations=off apparmor=0" ' \ + " ${__qemu_netdev2}" \ + " -pidfile ${STATESETUP}/qemu_2.pid" \ + " -device vhost-vsock-pci,guest-cid=$GUEST_2_CID" \ + " -monitor unix:${STATESETUP}/qemu_2_mon.sock,server,nowait" \ + " -incoming tcp:0:20005" + + context_setup_guest guest_1 ${GUEST_1_CID} + # Only available after migration: + ( context_setup_guest guest_2 ${GUEST_2_CID} & ) +} + # teardown_context_watch() - Remove contexts and stop panes watching them # $1: Pane number watching # $@: Context names @@ -375,7 +486,8 @@ teardown_two_guests() { context_wait pasta_1 context_wait pasta_2 - rm -f "${STATESETUP}/passt__[12].pid" "${STATESETUP}/pasta_[12].pid" + rm "${STATESETUP}/passt_1.pid" "${STATESETUP}/passt_2.pid" + rm "${STATESETUP}/pasta_1.pid" "${STATESETUP}/pasta_2.pid" teardown_context_watch ${PANE_HOST} host teardown_context_watch ${PANE_GUEST_1} qemu_1 guest_1 @@ -384,6 +496,30 @@ teardown_two_guests() { teardown_context_watch ${PANE_PASST_2} pasta_2 passt_2 } +# teardown_migrate() - Exit namespaces, kill qemu processes, passt and pasta +teardown_migrate() { + ${NSTOOL} exec ${STATESETUP}/ns1.hold -- kill $(cat "${STATESETUP}/qemu_1.pid") + ${NSTOOL} exec ${STATESETUP}/ns1.hold -- kill $(cat "${STATESETUP}/qemu_2.pid") + context_wait qemu_1 + context_wait qemu_2 + + ${NSTOOL} exec ${STATESETUP}/ns1.hold -- kill $(cat "${STATESETUP}/passt_2.pid") + context_wait passt_1 + context_wait 
passt_2 + ${NSTOOL} stop "${STATESETUP}/ns1.hold" + context_wait pasta_1 + + rm -f "${STATESETUP}/passt_1.pid" "${STATESETUP}/passt_2.pid" + rm -f "${STATESETUP}/pasta_1.pid" "${STATESETUP}/pasta_2.pid" + + teardown_context_watch ${PANE_HOST} host + + teardown_context_watch ${PANE_GUEST_1} qemu_1 guest_1 + teardown_context_watch ${PANE_GUEST_2} qemu_2 guest_2 + teardown_context_watch ${PANE_PASST_1} pasta_1 passt_1 + teardown_context_watch ${PANE_PASST_2} pasta_1 passt_2 +} + # teardown_demo_passt() - Exit namespace, kill qemu, passt and pasta teardown_demo_passt() { tmux send-keys -t ${PANE_GUEST} "C-c" diff --git a/test/lib/test b/test/lib/test index e6726be..758250a 100755 --- a/test/lib/test +++ b/test/lib/test @@ -68,6 +68,45 @@ test_iperf3() { TEST_ONE_subs="$(list_add_pair "${TEST_ONE_subs}" "__${__var}__" "${__bw}" )" } +# test_iperf3m() - Ugly helper for iperf3 directive, guest migration variant +# $1: Variable name: to put the measured bandwidth into +# $2: Initial source/client context +# $3: Second source/client context the guest is moving to +# $4: Destination name or address for client +# $5: Port number, ${i} is translated to process index +# $6: Run time, in seconds +# $7: Client options +test_iperf3m() { + __var="${1}"; shift + __cctx="${1}"; shift + __cctx2="${1}"; shift + __dest="${1}"; shift + __port="${1}"; shift + __time="${1}"; shift + + pane_or_context_run "${__cctx}" 'rm -f c.json' + + # A 1s wait for connection on what's basically a local link + # indicates something is pretty wrong + __timeout=1000 + pane_or_context_run_bg "${__cctx}" \ + 'iperf3 -J -c '${__dest}' -p '${__port} \ + ' --connect-timeout '${__timeout} \ + ' -t'${__time}' -i0 '"${@}"' > c.json' \ + + __jval=".end.sum_received.bits_per_second" + + sleep $((${__time} + 3)) + + pane_or_context_output "${__cctx2}" \ + 'cat c.json' + + __bw=$(pane_or_context_output "${__cctx2}" \ + 'cat c.json | jq -rMs "map('${__jval}') | add"') + + TEST_ONE_subs="$(list_add_pair 
"${TEST_ONE_subs}" "__${__var}__" "${__bw}" )" +} + test_one_line() { __line="${1}" @@ -177,6 +216,12 @@ test_one_line() { "guest2w") pane_or_context_wait guest_2 || TEST_ONE_nok=1 ;; + "mon") + pane_or_context_run mon "${__arg}" || TEST_ONE_nok=1 + ;; + "monb") + pane_or_context_run_bg mon "${__arg}" + ;; "ns") pane_or_context_run ns "${__arg}" || TEST_ONE_nok=1 ;; @@ -292,6 +337,9 @@ test_one_line() { "iperf3") test_iperf3 ${__arg} ;; + "iperf3m") + test_iperf3m ${__arg} + ;; "set") TEST_ONE_subs="$(list_add_pair "${TEST_ONE_subs}" "__${__arg%% *}__" "${__arg#* }")" ;; diff --git a/test/migrate/basic b/test/migrate/basic new file mode 100644 index 0000000..3f11f7d --- /dev/null +++ b/test/migrate/basic @@ -0,0 +1,59 @@ +# SPDX-License-Identifier: GPL-2.0-or-later +# +# PASST - Plug A Simple Socket Transport +# for qemu/UNIX domain socket mode +# +# PASTA - Pack A Subtle Tap Abstraction +# for network namespace/tap device mode +# +# test/migrate/basic - Check basic migration functionality +# +# Copyright (c) 2025 Red Hat GmbH +# Author: Stefano Brivio <sbrivio@redhat.com> + +g1tools ip jq dhclient socat cat +htools ip jq + +set MAP_HOST4 192.0.2.1 +set MAP_HOST6 2001:db8:9a55::1 +set MAP_NS4 192.0.2.2 +set MAP_NS6 2001:db8:9a55::2 + +test Interface name +g1out IFNAME1 ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname' +hout HOST_IFNAME ip -j -4 route show|jq -rM '[.[] | select(.dst == "default").dev] | .[0]' +hout HOST_IFNAME6 ip -j -6 route show|jq -rM '[.[] | select(.dst == "default").dev] | .[0]' +check [ -n "__IFNAME1__" ] + +test DHCP: address +guest1 ip link set dev __IFNAME1__ up +guest1 /sbin/dhclient -4 __IFNAME1__ +g1out ADDR1 ip -j -4 addr show|jq -rM '.[] | select(.ifname == "__IFNAME1__").addr_info[0].local' +hout HOST_ADDR ip -j -4 addr show|jq -rM '.[] | select(.ifname == "__HOST_IFNAME__").addr_info[0].local' +check [ "__ADDR1__" = "__HOST_ADDR__" ] + +test DHCPv6: address +# Link is up now, wait for DAD to complete +guest1 while 
ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done +guest1 /sbin/dhclient -6 __IFNAME1__ +# Wait for DAD to complete on the DHCP address +guest1 while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done +g1out ADDR1_6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__IFNAME1__").addr_info[] | select(.prefixlen == 128).local] | .[0]' +hout HOST_ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__HOST_IFNAME6__").addr_info[] | select(.scope == "global" and .deprecated != true).local] | .[0]' +check [ "__ADDR1_6__" = "__HOST_ADDR6__" ] + +test TCP/IPv4: guest1/guest2 > host +g1out GW1 ip -j -4 route show|jq -rM '.[] | select(.dst == "default").gateway' +hostb socat -u TCP4-LISTEN:10006 OPEN:__STATESETUP__/msg,create,trunc +sleep 1 +# Option 1: via spliced path in pasta, namespace to host +# guest1b { printf "Hello from guest 1"; sleep 10; printf " and from guest 2\n"; } | socat -u STDIN TCP4:__GW1__:10003 +# Option 2: via --map-guest-addr (tap) in pasta, namespace to host +guest1b { printf "Hello from guest 1"; sleep 3; printf " and from guest 2\n"; } | socat -u STDIN TCP4:__MAP_HOST4__:10006 +sleep 1 + +mon echo "migrate tcp:0:20005" | socat -u STDIN UNIX:__STATESETUP__/qemu_1_mon.sock + +hostw +hout MSG cat __STATESETUP__/msg +check [ "__MSG__" = "Hello from guest 1 and from guest 2" ] diff --git a/test/migrate/basic_fin b/test/migrate/basic_fin new file mode 100644 index 0000000..aa61ec5 --- /dev/null +++ b/test/migrate/basic_fin @@ -0,0 +1,62 @@ +# SPDX-License-Identifier: GPL-2.0-or-later +# +# PASST - Plug A Simple Socket Transport +# for qemu/UNIX domain socket mode +# +# PASTA - Pack A Subtle Tap Abstraction +# for network namespace/tap device mode +# +# test/migrate/basic_fin - Outbound traffic across migration, half-closed socket +# +# Copyright (c) 2025 Red Hat GmbH +# Author: Stefano Brivio <sbrivio@redhat.com> + +g1tools ip jq dhclient socat cat +htools ip jq + +set MAP_HOST4 192.0.2.1 +set 
MAP_HOST6 2001:db8:9a55::1 +set MAP_NS4 192.0.2.2 +set MAP_NS6 2001:db8:9a55::2 + +test Interface name +g1out IFNAME1 ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname' +hout HOST_IFNAME ip -j -4 route show|jq -rM '[.[] | select(.dst == "default").dev] | .[0]' +hout HOST_IFNAME6 ip -j -6 route show|jq -rM '[.[] | select(.dst == "default").dev] | .[0]' +check [ -n "__IFNAME1__" ] + +test DHCP: address +guest1 ip link set dev __IFNAME1__ up +guest1 /sbin/dhclient -4 __IFNAME1__ +g1out ADDR1 ip -j -4 addr show|jq -rM '.[] | select(.ifname == "__IFNAME1__").addr_info[0].local' +hout HOST_ADDR ip -j -4 addr show|jq -rM '.[] | select(.ifname == "__HOST_IFNAME__").addr_info[0].local' +check [ "__ADDR1__" = "__HOST_ADDR__" ] + +test DHCPv6: address +# Link is up now, wait for DAD to complete +guest1 while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done +guest1 /sbin/dhclient -6 __IFNAME1__ +# Wait for DAD to complete on the DHCP address +guest1 while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done +g1out ADDR1_6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__IFNAME1__").addr_info[] | select(.prefixlen == 128).local] | .[0]' +hout HOST_ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__HOST_IFNAME6__").addr_info[] | select(.scope == "global" and .deprecated != true).local] | .[0]' +check [ "__ADDR1_6__" = "__HOST_ADDR6__" ] + +test TCP/IPv4: guest1, half-close, guest2 > host +g1out GW1 ip -j -4 route show|jq -rM '.[] | select(.dst == "default").gateway' + +hostb echo FIN | socat TCP4-LISTEN:10006,shut-down STDIO,ignoreeof > __STATESETUP__/msg +#hostb socat -u TCP4-LISTEN:10006 OPEN:__STATESETUP__/msg,create,trunc + +#sleep 20 +# Option 1: via spliced path in pasta, namespace to host +# guest1b { printf "Hello from guest 1"; sleep 10; printf " and from guest 2\n"; } | socat -u STDIN TCP4:__GW1__:10003 +# Option 2: via --map-guest-addr (tap) in pasta, namespace to host +guest1b { printf 
"Hello from guest 1"; sleep 3; printf " and from guest 2\n"; } | socat -u STDIN TCP4:__MAP_HOST4__:10006 +sleep 1 + +mon echo "migrate tcp:0:20005" | socat -u STDIN UNIX:__STATESETUP__/qemu_1_mon.sock + +hostw +hout MSG cat __STATESETUP__/msg +check [ "__MSG__" = "Hello from guest 1 and from guest 2" ] diff --git a/test/migrate/bidirectional b/test/migrate/bidirectional new file mode 100644 index 0000000..4c04081 --- /dev/null +++ b/test/migrate/bidirectional @@ -0,0 +1,64 @@ +# SPDX-License-Identifier: GPL-2.0-or-later +# +# PASST - Plug A Simple Socket Transport +# for qemu/UNIX domain socket mode +# +# PASTA - Pack A Subtle Tap Abstraction +# for network namespace/tap device mode +# +# test/migrate/bidirectional - Check migration with messages in both directions +# +# Copyright (c) 2025 Red Hat GmbH +# Author: Stefano Brivio <sbrivio@redhat.com> + +g1tools ip jq dhclient socat cat +htools ip jq + +set MAP_HOST4 192.0.2.1 +set MAP_HOST6 2001:db8:9a55::1 +set MAP_NS4 192.0.2.2 +set MAP_NS6 2001:db8:9a55::2 + +test Interface name +g1out IFNAME1 ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname' +hout HOST_IFNAME ip -j -4 route show|jq -rM '[.[] | select(.dst == "default").dev] | .[0]' +hout HOST_IFNAME6 ip -j -6 route show|jq -rM '[.[] | select(.dst == "default").dev] | .[0]' +check [ -n "__IFNAME1__" ] + +test DHCP: address +guest1 ip link set dev __IFNAME1__ up +guest1 /sbin/dhclient -4 __IFNAME1__ +g1out ADDR1 ip -j -4 addr show|jq -rM '.[] | select(.ifname == "__IFNAME1__").addr_info[0].local' +hout HOST_ADDR ip -j -4 addr show|jq -rM '.[] | select(.ifname == "__HOST_IFNAME__").addr_info[0].local' +check [ "__ADDR1__" = "__HOST_ADDR__" ] + +test TCP/IPv4: guest1/guest2 > host, host > guest1/guest2 +g1out GW1 ip -j -4 route show|jq -rM '.[] | select(.dst == "default").gateway' + +hostb socat -u TCP4-LISTEN:10006 OPEN:__STATESETUP__/msg,create,trunc +guest1b socat -u TCP4-LISTEN:10001 OPEN:msg,create,trunc +sleep 1 + +guest1b socat -u 
UNIX-RECV:proxy.sock,null-eof TCP4:__MAP_HOST4__:10006 +hostb socat -u UNIX-RECV:__STATESETUP__/proxy.sock,null-eof TCP4:__ADDR1__:10001 +sleep 1 +guest1 printf "Hello from guest 1" | socat -u STDIN UNIX:proxy.sock +host printf "Dear guest 1," | socat -u STDIN UNIX:__STATESETUP__/proxy.sock +sleep 1 + +mon echo "migrate tcp:0:20005" | socat -u STDIN UNIX:__STATESETUP__/qemu_1_mon.sock + +sleep 2 +guest2 printf " and from guest 2" | socat -u STDIN UNIX:proxy.sock,shut-null +host printf " you are now guest 2" | socat -u STDIN UNIX:__STATESETUP__/proxy.sock,shut-null + +hostw +# FIXME: guest2w doesn't work here because shell jobs are (also) from guest #1, +# use sleep 1 for the moment +sleep 1 + +hout MSG cat __STATESETUP__/msg +check [ "__MSG__" = "Hello from guest 1 and from guest 2" ] + +g2out MSG cat msg +check [ "__MSG__" = "Dear guest 1, you are now guest 2" ] diff --git a/test/migrate/bidirectional_fin b/test/migrate/bidirectional_fin new file mode 100644 index 0000000..1c13527 --- /dev/null +++ b/test/migrate/bidirectional_fin @@ -0,0 +1,64 @@ +# SPDX-License-Identifier: GPL-2.0-or-later +# +# PASST - Plug A Simple Socket Transport +# for qemu/UNIX domain socket mode +# +# PASTA - Pack A Subtle Tap Abstraction +# for network namespace/tap device mode +# +# test/migrate/bidirectional_fin - Both directions, half-closed sockets +# +# Copyright (c) 2025 Red Hat GmbH +# Author: Stefano Brivio <sbrivio@redhat.com> + +g1tools ip jq dhclient socat cat +htools ip jq + +set MAP_HOST4 192.0.2.1 +set MAP_HOST6 2001:db8:9a55::1 +set MAP_NS4 192.0.2.2 +set MAP_NS6 2001:db8:9a55::2 + +test Interface name +g1out IFNAME1 ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname' +hout HOST_IFNAME ip -j -4 route show|jq -rM '[.[] | select(.dst == "default").dev] | .[0]' +hout HOST_IFNAME6 ip -j -6 route show|jq -rM '[.[] | select(.dst == "default").dev] | .[0]' +check [ -n "__IFNAME1__" ] + +test DHCP: address +guest1 ip link set dev __IFNAME1__ up +guest1 
/sbin/dhclient -4 __IFNAME1__ +g1out ADDR1 ip -j -4 addr show|jq -rM '.[] | select(.ifname == "__IFNAME1__").addr_info[0].local' +hout HOST_ADDR ip -j -4 addr show|jq -rM '.[] | select(.ifname == "__HOST_IFNAME__").addr_info[0].local' +check [ "__ADDR1__" = "__HOST_ADDR__" ] + +test TCP/IPv4: guest1/guest2 <- (half closed) -> host +g1out GW1 ip -j -4 route show|jq -rM '.[] | select(.dst == "default").gateway' + +hostb echo FIN | socat TCP4-LISTEN:10006,shut-down STDIO,ignoreeof > __STATESETUP__/msg +guest1b echo FIN | socat TCP4-LISTEN:10001,shut-down STDIO,ignoreeof > msg +sleep 1 + +guest1b socat -u UNIX-RECV:proxy.sock,null-eof TCP4:__MAP_HOST4__:10006 +hostb socat -u UNIX-RECV:__STATESETUP__/proxy.sock,null-eof TCP4:__ADDR1__:10001 +sleep 1 +guest1 printf "Hello from guest 1" | socat -u STDIN UNIX:proxy.sock +host printf "Dear guest 1," | socat -u STDIN UNIX:__STATESETUP__/proxy.sock +sleep 1 + +mon echo "migrate tcp:0:20005" | socat -u STDIN UNIX:__STATESETUP__/qemu_1_mon.sock + +sleep 2 +guest2 printf " and from guest 2" | socat -u STDIN UNIX:proxy.sock,shut-null +host printf " you are now guest 2" | socat -u STDIN UNIX:__STATESETUP__/proxy.sock,shut-null + +hostw +# FIXME: guest2w doesn't work here because shell jobs are (also) from guest #1, +# use sleep 1 for the moment +sleep 1 + +hout MSG cat __STATESETUP__/msg +check [ "__MSG__" = "Hello from guest 1 and from guest 2" ] + +g2out MSG cat msg +check [ "__MSG__" = "Dear guest 1, you are now guest 2" ] diff --git a/test/migrate/iperf3_bidir6 b/test/migrate/iperf3_bidir6 new file mode 100644 index 0000000..4bfefb5 --- /dev/null +++ b/test/migrate/iperf3_bidir6 @@ -0,0 +1,58 @@ +# SPDX-License-Identifier: GPL-2.0-or-later +# +# PASST - Plug A Simple Socket Transport +# for qemu/UNIX domain socket mode +# +# PASTA - Pack A Subtle Tap Abstraction +# for network namespace/tap device mode +# +# test/migrate/iperf3_bidir6 - Migration behaviour with many bidirectional flows +# +# Copyright (c) 2025 Red Hat GmbH +# 
Author: Stefano Brivio <sbrivio@redhat.com> + +g1tools ip jq dhclient socat cat +htools ip jq + +set MAP_HOST4 192.0.2.1 +set MAP_HOST6 2001:db8:9a55::1 +set MAP_NS4 192.0.2.2 +set MAP_NS6 2001:db8:9a55::2 + +set THREADS 128 +set TIME 3 +set OMIT 0.1 +set OPTS -Z -P __THREADS__ -O__OMIT__ -N --bidir + +test Interface name +g1out IFNAME1 ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname' +hout HOST_IFNAME ip -j -4 route show|jq -rM '[.[] | select(.dst == "default").dev] | .[0]' +hout HOST_IFNAME6 ip -j -6 route show|jq -rM '[.[] | select(.dst == "default").dev] | .[0]' +check [ -n "__IFNAME1__" ] + +test DHCP: address +guest1 ip link set dev __IFNAME1__ up +guest1 /sbin/dhclient -4 __IFNAME1__ +g1out ADDR1 ip -j -4 addr show|jq -rM '.[] | select(.ifname == "__IFNAME1__").addr_info[0].local' +hout HOST_ADDR ip -j -4 addr show|jq -rM '.[] | select(.ifname == "__HOST_IFNAME__").addr_info[0].local' +check [ "__ADDR1__" = "__HOST_ADDR__" ] + +test DHCPv6: address +# Link is up now, wait for DAD to complete +guest1 while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done +guest1 /sbin/dhclient -6 __IFNAME1__ +# Wait for DAD to complete on the DHCP address +guest1 while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done +g1out ADDR1_6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__IFNAME1__").addr_info[] | select(.prefixlen == 128).local] | .[0]' +hout HOST_ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__HOST_IFNAME6__").addr_info[] | select(.scope == "global" and .deprecated != true).local] | .[0]' +check [ "__ADDR1_6__" = "__HOST_ADDR6__" ] + +test TCP/IPv6 host <-> guest flood, many flows, during migration + +monb sleep 1; echo "migrate tcp:0:20005" | socat -u STDIN UNIX:__STATESETUP__/qemu_1_mon.sock + +iperf3s host 10006 +iperf3m BW guest_1 guest_2 __MAP_HOST6__ 10006 __TIME__ __OPTS__ +bw __BW__ 1 2 + +iperf3k host diff --git a/test/migrate/iperf3_in4 b/test/migrate/iperf3_in4 new 
file mode 100644 index 0000000..c5f3916 --- /dev/null +++ b/test/migrate/iperf3_in4 @@ -0,0 +1,50 @@ +# SPDX-License-Identifier: GPL-2.0-or-later +# +# PASST - Plug A Simple Socket Transport +# for qemu/UNIX domain socket mode +# +# PASTA - Pack A Subtle Tap Abstraction +# for network namespace/tap device mode +# +# test/migrate/iperf3_in4 - Migration behaviour under inbound IPv4 flood +# +# Copyright (c) 2025 Red Hat GmbH +# Author: Stefano Brivio <sbrivio@redhat.com> + +g1tools ip jq dhclient socat cat +htools ip jq + +set MAP_HOST4 192.0.2.1 +set MAP_HOST6 2001:db8:9a55::1 +set MAP_NS4 192.0.2.2 +set MAP_NS6 2001:db8:9a55::2 + +guest1 /sbin/sysctl -w net.core.rmem_max=33554432 +guest1 /sbin/sysctl -w net.core.wmem_max=33554432 + +set THREADS 1 +set TIME 4 +set OMIT 0.1 +set OPTS -Z -P __THREADS__ -O__OMIT__ -N -R + +test Interface name +g1out IFNAME1 ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname' +hout HOST_IFNAME ip -j -4 route show|jq -rM '[.[] | select(.dst == "default").dev] | .[0]' +check [ -n "__IFNAME1__" ] + +test DHCP: address +guest1 ip link set dev __IFNAME1__ up +guest1 /sbin/dhclient -4 __IFNAME1__ +g1out ADDR1 ip -j -4 addr show|jq -rM '.[] | select(.ifname == "__IFNAME1__").addr_info[0].local' +hout HOST_ADDR ip -j -4 addr show|jq -rM '.[] | select(.ifname == "__HOST_IFNAME__").addr_info[0].local' +check [ "__ADDR1__" = "__HOST_ADDR__" ] + +test TCP/IPv4 host to guest throughput during migration + +monb sleep 1; echo "migrate tcp:0:20005" | socat -u STDIN UNIX:__STATESETUP__/qemu_1_mon.sock + +iperf3s host 10006 +iperf3m BW guest_1 guest_2 __MAP_HOST4__ 10006 __TIME__ __OPTS__ +bw __BW__ 1 2 + +iperf3k host diff --git a/test/migrate/iperf3_in6 b/test/migrate/iperf3_in6 new file mode 100644 index 0000000..16cf504 --- /dev/null +++ b/test/migrate/iperf3_in6 @@ -0,0 +1,58 @@ +# SPDX-License-Identifier: GPL-2.0-or-later +# +# PASST - Plug A Simple Socket Transport +# for qemu/UNIX domain socket mode +# +# PASTA - Pack A Subtle 
Tap Abstraction +# for network namespace/tap device mode +# +# test/migrate/iperf3_in6 - Migration behaviour under inbound IPv6 flood +# +# Copyright (c) 2025 Red Hat GmbH +# Author: Stefano Brivio <sbrivio@redhat.com> + +g1tools ip jq dhclient socat cat +htools ip jq + +set MAP_HOST4 192.0.2.1 +set MAP_HOST6 2001:db8:9a55::1 +set MAP_NS4 192.0.2.2 +set MAP_NS6 2001:db8:9a55::2 + +set THREADS 4 +set TIME 3 +set OMIT 0.1 +set OPTS -Z -P __THREADS__ -O__OMIT__ -N -R + +test Interface name +g1out IFNAME1 ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname' +hout HOST_IFNAME ip -j -4 route show|jq -rM '[.[] | select(.dst == "default").dev] | .[0]' +hout HOST_IFNAME6 ip -j -6 route show|jq -rM '[.[] | select(.dst == "default").dev] | .[0]' +check [ -n "__IFNAME1__" ] + +test DHCP: address +guest1 ip link set dev __IFNAME1__ up +guest1 /sbin/dhclient -4 __IFNAME1__ +g1out ADDR1 ip -j -4 addr show|jq -rM '.[] | select(.ifname == "__IFNAME1__").addr_info[0].local' +hout HOST_ADDR ip -j -4 addr show|jq -rM '.[] | select(.ifname == "__HOST_IFNAME__").addr_info[0].local' +check [ "__ADDR1__" = "__HOST_ADDR__" ] + +test DHCPv6: address +# Link is up now, wait for DAD to complete +guest1 while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done +guest1 /sbin/dhclient -6 __IFNAME1__ +# Wait for DAD to complete on the DHCP address +guest1 while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done +g1out ADDR1_6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__IFNAME1__").addr_info[] | select(.prefixlen == 128).local] | .[0]' +hout HOST_ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__HOST_IFNAME6__").addr_info[] | select(.scope == "global" and .deprecated != true).local] | .[0]' +check [ "__ADDR1_6__" = "__HOST_ADDR6__" ] + +test TCP/IPv6 host to guest throughput during migration + +monb sleep 1; echo "migrate tcp:0:20005" | socat -u STDIN UNIX:__STATESETUP__/qemu_1_mon.sock + +iperf3s host 10006 +iperf3m 
BW guest_1 guest_2 __MAP_HOST6__ 10006 __TIME__ __OPTS__ +bw __BW__ 1 2 + +iperf3k host diff --git a/test/migrate/iperf3_many_out6 b/test/migrate/iperf3_many_out6 new file mode 100644 index 0000000..88133f2 --- /dev/null +++ b/test/migrate/iperf3_many_out6 @@ -0,0 +1,60 @@ +# SPDX-License-Identifier: GPL-2.0-or-later +# +# PASST - Plug A Simple Socket Transport +# for qemu/UNIX domain socket mode +# +# PASTA - Pack A Subtle Tap Abstraction +# for network namespace/tap device mode +# +# test/migrate/iperf3_many_out6 - Migration behaviour with many outbound flows +# +# Copyright (c) 2025 Red Hat GmbH +# Author: Stefano Brivio <sbrivio@redhat.com> + +g1tools ip jq dhclient socat cat +htools ip jq + +set MAP_HOST4 192.0.2.1 +set MAP_HOST6 2001:db8:9a55::1 +set MAP_NS4 192.0.2.2 +set MAP_NS6 2001:db8:9a55::2 + +set THREADS 16 +set TIME 3 +set OMIT 0.1 +set OPTS -Z -P __THREADS__ -O__OMIT__ -N -l 1M + +test Interface name +g1out IFNAME1 ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname' +hout HOST_IFNAME ip -j -4 route show|jq -rM '[.[] | select(.dst == "default").dev] | .[0]' +hout HOST_IFNAME6 ip -j -6 route show|jq -rM '[.[] | select(.dst == "default").dev] | .[0]' +check [ -n "__IFNAME1__" ] + +test DHCP: address +guest1 ip link set dev __IFNAME1__ up +guest1 /sbin/dhclient -4 __IFNAME1__ +g1out ADDR1 ip -j -4 addr show|jq -rM '.[] | select(.ifname == "__IFNAME1__").addr_info[0].local' +hout HOST_ADDR ip -j -4 addr show|jq -rM '.[] | select(.ifname == "__HOST_IFNAME__").addr_info[0].local' +check [ "__ADDR1__" = "__HOST_ADDR__" ] + +test DHCPv6: address +# Link is up now, wait for DAD to complete +guest1 while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done +guest1 /sbin/dhclient -6 __IFNAME1__ +# Wait for DAD to complete on the DHCP address +guest1 while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done +g1out ADDR1_6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__IFNAME1__").addr_info[] | 
select(.prefixlen == 128).local] | .[0]' +hout HOST_ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__HOST_IFNAME6__").addr_info[] | select(.scope == "global" and .deprecated != true).local] | .[0]' +check [ "__ADDR1_6__" = "__HOST_ADDR6__" ] + +test TCP/IPv6 guest to host flood, many flows, during migration + +# Note: this file tests outbound (guest to host) flows only, no -R option + +monb sleep 1; echo "migrate tcp:0:20005" | socat -u STDIN UNIX:__STATESETUP__/qemu_1_mon.sock + +iperf3s host 10006 +iperf3m BW guest_1 guest_2 __MAP_HOST6__ 10006 __TIME__ __OPTS__ +bw __BW__ 1 2 + +iperf3k host diff --git a/test/migrate/iperf3_out4 b/test/migrate/iperf3_out4 new file mode 100644 index 0000000..968057b --- /dev/null +++ b/test/migrate/iperf3_out4 @@ -0,0 +1,47 @@ +# SPDX-License-Identifier: GPL-2.0-or-later +# +# PASST - Plug A Simple Socket Transport +# for qemu/UNIX domain socket mode +# +# PASTA - Pack A Subtle Tap Abstraction +# for network namespace/tap device mode +# +# test/migrate/iperf3_out4 - Migration behaviour under outbound IPv4 flood +# +# Copyright (c) 2025 Red Hat GmbH +# Author: Stefano Brivio <sbrivio@redhat.com> + +g1tools ip jq dhclient socat cat +htools ip jq + +set MAP_HOST4 192.0.2.1 +set MAP_HOST6 2001:db8:9a55::1 +set MAP_NS4 192.0.2.2 +set MAP_NS6 2001:db8:9a55::2 + +set THREADS 6 +set TIME 2 +set OMIT 0.1 +set OPTS -P __THREADS__ -O__OMIT__ -Z -N -l 1M + +test Interface name +g1out IFNAME1 ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname' +hout HOST_IFNAME ip -j -4 route show|jq -rM '[.[] | select(.dst == "default").dev] | .[0]' +check [ -n "__IFNAME1__" ] + +test DHCP: address +guest1 ip link set dev __IFNAME1__ up +guest1 /sbin/dhclient -4 __IFNAME1__ +g1out ADDR1 ip -j -4 addr show|jq -rM '.[] | select(.ifname == "__IFNAME1__").addr_info[0].local' +hout HOST_ADDR ip -j -4 addr show|jq -rM '.[] | select(.ifname == "__HOST_IFNAME__").addr_info[0].local' +check [ "__ADDR1__" = "__HOST_ADDR__" ] + +test TCP/IPv4 guest to host
throughput during migration + +monb sleep 1; echo "migrate tcp:0:20005" | socat -u STDIN UNIX:__STATESETUP__/qemu_1_mon.sock + +iperf3s host 10006 +iperf3m BW guest_1 guest_2 __MAP_HOST4__ 10006 __TIME__ __OPTS__ +bw __BW__ 1 2 + +iperf3k host diff --git a/test/migrate/iperf3_out6 b/test/migrate/iperf3_out6 new file mode 100644 index 0000000..21fbfcd --- /dev/null +++ b/test/migrate/iperf3_out6 @@ -0,0 +1,58 @@ +# SPDX-License-Identifier: GPL-2.0-or-later +# +# PASST - Plug A Simple Socket Transport +# for qemu/UNIX domain socket mode +# +# PASTA - Pack A Subtle Tap Abstraction +# for network namespace/tap device mode +# +# test/migrate/iperf3_out6 - Migration behaviour under outbound IPv6 flood +# +# Copyright (c) 2025 Red Hat GmbH +# Author: Stefano Brivio <sbrivio@redhat.com> + +g1tools ip jq dhclient socat cat +htools ip jq + +set MAP_HOST4 192.0.2.1 +set MAP_HOST6 2001:db8:9a55::1 +set MAP_NS4 192.0.2.2 +set MAP_NS6 2001:db8:9a55::2 + +set THREADS 6 +set TIME 2 +set OMIT 0.1 +set OPTS -P __THREADS__ -O__OMIT__ -Z -N -l 1M + +test Interface name +g1out IFNAME1 ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname' +hout HOST_IFNAME ip -j -4 route show|jq -rM '[.[] | select(.dst == "default").dev] | .[0]' +hout HOST_IFNAME6 ip -j -6 route show|jq -rM '[.[] | select(.dst == "default").dev] | .[0]' +check [ -n "__IFNAME1__" ] + +test DHCP: address +guest1 ip link set dev __IFNAME1__ up +guest1 /sbin/dhclient -4 __IFNAME1__ +g1out ADDR1 ip -j -4 addr show|jq -rM '.[] | select(.ifname == "__IFNAME1__").addr_info[0].local' +hout HOST_ADDR ip -j -4 addr show|jq -rM '.[] | select(.ifname == "__HOST_IFNAME__").addr_info[0].local' +check [ "__ADDR1__" = "__HOST_ADDR__" ] + +test DHCPv6: address +# Link is up now, wait for DAD to complete +guest1 while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done +guest1 /sbin/dhclient -6 __IFNAME1__ +# Wait for DAD to complete on the DHCP address +guest1 while ip -j -6 addr show tentative | jq 
-e '.[].addr_info'; do sleep 0.1; done +g1out ADDR1_6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__IFNAME1__").addr_info[] | select(.prefixlen == 128).local] | .[0]' +hout HOST_ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__HOST_IFNAME6__").addr_info[] | select(.scope == "global" and .deprecated != true).local] | .[0]' +check [ "__ADDR1_6__" = "__HOST_ADDR6__" ] + +test TCP/IPv6 guest to host throughput during migration + +monb sleep 1; echo "migrate tcp:0:20005" | socat -u STDIN UNIX:__STATESETUP__/qemu_1_mon.sock + +iperf3s host 10006 +iperf3m BW guest_1 guest_2 __MAP_HOST6__ 10006 __TIME__ __OPTS__ +bw __BW__ 1 2 + +iperf3k host diff --git a/test/migrate/rampstream_in b/test/migrate/rampstream_in index 46f4143..df333ba 100644 --- a/test/migrate/rampstream_in +++ b/test/migrate/rampstream_in @@ -6,10 +6,10 @@ # PASTA - Pack A Subtle Tap Abstraction # for network namespace/tap device mode # -# test/migrate/basic - Check basic migration functionality +# test/migrate/rampstream_in - Check sequence correctness with inbound ramp # -# Copyright (c) 2025 Red Hat GmbH -# Author: Stefano Brivio <sbrivio@redhat.com> +# Copyright (c) 2025 Red Hat +# Author: David Gibson <david@gibson.dropbear.id.au> g1tools ip jq dhclient socat cat htools ip jq @@ -43,15 +43,15 @@ g1out ADDR1_6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__IFNAME1__") hout HOST_ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__HOST_IFNAME6__").addr_info[] | select(.scope == "global" and .deprecated != true).local] | .[0]' check [ "__ADDR1_6__" = "__HOST_ADDR6__" ] -test TCP/IPv4: host > guest +test TCP/IPv4: sequence check, ramps, inbound g1out GW1 ip -j -4 route show|jq -rM '.[] | select(.dst == "default").gateway' guest1b socat -u TCP4-LISTEN:10001 EXEC:"rampstream-check.sh __RAMPS__" sleep 1 hostb socat -u EXEC:"test/rampstream send __RAMPS__" TCP4:__ADDR1__:10001 -sleep 1 +sleep 1 -#mon echo "migrate tcp:0:20005" | socat -u STDIN 
UNIX:__STATESETUP__/qemu_1_mon.sock +monb echo "migrate tcp:0:20005" | socat -u STDIN UNIX:__STATESETUP__/qemu_1_mon.sock hostw diff --git a/test/migrate/rampstream_out b/test/migrate/rampstream_out index 91b9c63..8ed3229 100644 --- a/test/migrate/rampstream_out +++ b/test/migrate/rampstream_out @@ -6,10 +6,10 @@ # PASTA - Pack A Subtle Tap Abstraction # for network namespace/tap device mode # -# test/migrate/basic - Check basic migration functionality +# test/migrate/rampstream_out - Check sequence correctness with outbound ramp # -# Copyright (c) 2025 Red Hat GmbH -# Author: Stefano Brivio <sbrivio@redhat.com> +# Copyright (c) 2025 Red Hat +# Author: David Gibson <david@gibson.dropbear.id.au> g1tools ip jq dhclient socat cat htools ip jq @@ -43,7 +43,7 @@ g1out ADDR1_6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__IFNAME1__") hout HOST_ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__HOST_IFNAME6__").addr_info[] | select(.scope == "global" and .deprecated != true).local] | .[0]' check [ "__ADDR1_6__" = "__HOST_ADDR6__" ] -test TCP/IPv4: guest > host +test TCP/IPv4: sequence check, ramps, outbound g1out GW1 ip -j -4 route show|jq -rM '.[] | select(.dst == "default").gateway' hostb socat -u TCP4-LISTEN:10006 EXEC:"test/rampstream check __RAMPS__" sleep 1 diff --git a/test/run b/test/run index f188d8e..4e86f30 100755 --- a/test/run +++ b/test/run @@ -130,6 +130,43 @@ run() { test two_guests_vu/basic teardown two_guests + setup migrate + test migrate/basic + teardown migrate + setup migrate + test migrate/basic_fin + teardown migrate + setup migrate + test migrate/bidirectional + teardown migrate + setup migrate + test migrate/bidirectional_fin + teardown migrate + setup migrate + test migrate/iperf3_out4 + teardown migrate + setup migrate + test migrate/iperf3_out6 + teardown migrate + setup migrate + test migrate/iperf3_in4 + teardown migrate + setup migrate + test migrate/iperf3_in6 + teardown migrate + setup migrate + test 
migrate/iperf3_bidir6 + teardown migrate + setup migrate + test migrate/iperf3_many_out6 + teardown migrate + setup migrate + test migrate/rampstream_in + teardown migrate + setup migrate + test migrate/rampstream_out + teardown migrate + VALGRIND=0 VHOST_USER=0 setup passt_in_ns @@ -186,7 +223,10 @@ run_selected() { __setup= for __test; do - if [ "${__test%%/*}" != "${__setup}" ]; then + # HACK: the migrate tests need the setup repeated for + # each test + if [ "${__test%%/*}" != "${__setup}" -o \ + "${__test%%/*}" = "migrate" ]; then [ -n "${__setup}" ] && teardown "${__setup}" __setup="${__test%%/*}" setup "${__setup}" From bcc4908c2b4a20c581f2b03fed40da97b804106f Mon Sep 17 00:00:00 2001 From: Enrique Llorente <ellorent@redhat.com> Date: Mon, 17 Feb 2025 10:28:14 +0100 Subject: [PATCH 239/382] dhcp: Remove option 255 length byte Option 255 (end of options) does not need the length byte; removing it frees one extra byte for other dynamic options. Signed-off-by: Enrique Llorente <ellorent@redhat.com> Reviewed-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- dhcp.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/dhcp.c b/dhcp.c index 401cb5b..4a209f1 100644 --- a/dhcp.c +++ b/dhcp.c @@ -64,9 +64,9 @@ static struct opt opts[255]; #define OPT_MIN 60 /* RFC 951 */ /* Total option size (excluding end option) is 576 (RFC 2131), minus - * offset of options (268), minus end option and its length (2).
*/ -#define OPT_MAX 306 +#define OPT_MAX 307 /** * dhcp_init() - Initialise DHCP options @@ -127,7 +127,7 @@ struct msg { uint8_t sname[64]; uint8_t file[128]; uint32_t magic; - uint8_t o[OPT_MAX + 2 /* End option and its length */ ]; + uint8_t o[OPT_MAX + 1 /* End option */ ]; } __attribute__((__packed__)); /** @@ -194,7 +194,6 @@ static int fill(struct msg *m) } m->o[offset++] = 255; - m->o[offset++] = 0; if (offset < OPT_MIN) { memset(&m->o[offset], 0, OPT_MIN - offset); From 0a51060f7ac3e1e1a9d87ffdb037b9c367a2a4d9 Mon Sep 17 00:00:00 2001 From: David Gibson <david@gibson.dropbear.id.au> Date: Tue, 18 Feb 2025 13:07:17 +1100 Subject: [PATCH 240/382] packet: Use flexible array member in struct pool Currently we have a dummy pkt[1] array, which we alias with an array of a different size via various macros. However, we already require C11 which includes flexible array members, so we can do better. Signed-off-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- packet.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packet.h b/packet.h index 3f70e94..85ee550 100644 --- a/packet.h +++ b/packet.h @@ -21,7 +21,7 @@ struct pool { size_t buf_size; size_t size; size_t count; - struct iovec pkt[1]; + struct iovec pkt[]; }; int vu_packet_check_range(void *buf, size_t offset, size_t len, From 354bc0bab1cb6095592288674d375511443427fd Mon Sep 17 00:00:00 2001 From: David Gibson <david@gibson.dropbear.id.au> Date: Tue, 18 Feb 2025 13:07:18 +1100 Subject: [PATCH 241/382] packet: Don't pass start and offset separately to packet_check_range() Fundamentally what packet_check_range() does is to check whether a given memory range is within the allowed / expected memory set aside for packets from a particular pool. That range could represent a whole packet (from packet_add_do()) or part of a packet (from packet_get_do()), but it doesn't really matter which. 
However, we pass the start of the range as two parameters: @start which is the start of the packet, and @offset which is the offset within the packet of the range we're interested in. We never use these separately, only as (start + offset). Simplify the interface of packet_check_range() and vu_packet_check_range() to directly take the start of the relevant range. This will allow some additional future improvements. Signed-off-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- packet.c | 36 +++++++++++++++++++----------------- packet.h | 3 +-- vu_common.c | 11 ++++------- 3 files changed, 24 insertions(+), 26 deletions(-) diff --git a/packet.c b/packet.c index 03a11e6..0330b54 100644 --- a/packet.c +++ b/packet.c @@ -23,23 +23,22 @@ #include "log.h" /** - * packet_check_range() - Check if a packet memory range is valid + * packet_check_range() - Check if a memory range is valid for a pool * @p: Packet pool - * @offset: Offset of data range in packet descriptor + * @ptr: Start of desired data range * @len: Length of desired data range - * @start: Start of the packet descriptor * @func: For tracing: name of calling function * @line: For tracing: caller line of function call * * Return: 0 if the range is valid, -1 otherwise */ -static int packet_check_range(const struct pool *p, size_t offset, size_t len, - const char *start, const char *func, int line) +static int packet_check_range(const struct pool *p, const char *ptr, size_t len, + const char *func, int line) { if (p->buf_size == 0) { int ret; - ret = vu_packet_check_range((void *)p->buf, offset, len, start); + ret = vu_packet_check_range((void *)p->buf, ptr, len); if (ret == -1) trace("cannot find region, %s:%i", func, line); @@ -47,16 +46,16 @@ static int packet_check_range(const struct pool *p, size_t offset, size_t len, return ret; } - if (start < p->buf) { - trace("packet start %p before buffer start %p, " - "%s:%i", (void *)start, (void *)p->buf, func, line); 
+ if (ptr < p->buf) { + trace("packet range start %p before buffer start %p, %s:%i", + (void *)ptr, (void *)p->buf, func, line); return -1; } - if (start + len + offset > p->buf + p->buf_size) { - trace("packet offset plus length %zu from size %zu, " - "%s:%i", start - p->buf + len + offset, - p->buf_size, func, line); + if (ptr + len > p->buf + p->buf_size) { + trace("packet range end %p after buffer end %p, %s:%i", + (void *)(ptr + len), (void *)(p->buf + p->buf_size), + func, line); return -1; } @@ -81,7 +80,7 @@ void packet_add_do(struct pool *p, size_t len, const char *start, return; } - if (packet_check_range(p, 0, len, start, func, line)) + if (packet_check_range(p, start, len, func, line)) return; if (len > UINT16_MAX) { @@ -110,6 +109,8 @@ void packet_add_do(struct pool *p, size_t len, const char *start, void *packet_get_do(const struct pool *p, size_t idx, size_t offset, size_t len, size_t *left, const char *func, int line) { + char *ptr; + if (idx >= p->size || idx >= p->count) { if (func) { trace("packet %zu from pool size: %zu, count: %zu, " @@ -135,14 +136,15 @@ void *packet_get_do(const struct pool *p, size_t idx, size_t offset, return NULL; } - if (packet_check_range(p, offset, len, p->pkt[idx].iov_base, - func, line)) + ptr = (char *)p->pkt[idx].iov_base + offset; + + if (packet_check_range(p, ptr, len, func, line)) return NULL; if (left) *left = p->pkt[idx].iov_len - offset - len; - return (char *)p->pkt[idx].iov_base + offset; + return ptr; } /** diff --git a/packet.h b/packet.h index 85ee550..bdc07fe 100644 --- a/packet.h +++ b/packet.h @@ -24,8 +24,7 @@ struct pool { struct iovec pkt[]; }; -int vu_packet_check_range(void *buf, size_t offset, size_t len, - const char *start); +int vu_packet_check_range(void *buf, const char *ptr, size_t len); void packet_add_do(struct pool *p, size_t len, const char *start, const char *func, int line); void *packet_get_do(const struct pool *p, const size_t idx, diff --git a/vu_common.c b/vu_common.c index 
48826b1..686a09b 100644 --- a/vu_common.c +++ b/vu_common.c @@ -26,14 +26,12 @@ * vu_packet_check_range() - Check if a given memory zone is contained in * a mapped guest memory region * @buf: Array of the available memory regions - * @offset: Offset of data range in packet descriptor + * @ptr: Start of desired data range * @size: Length of desired data range - * @start: Start of the packet descriptor * * Return: 0 if the zone is in a mapped memory region, -1 otherwise */ -int vu_packet_check_range(void *buf, size_t offset, size_t len, - const char *start) +int vu_packet_check_range(void *buf, const char *ptr, size_t len) { struct vu_dev_region *dev_region; @@ -41,9 +39,8 @@ int vu_packet_check_range(void *buf, size_t offset, size_t len, /* NOLINTNEXTLINE(performance-no-int-to-ptr) */ char *m = (char *)(uintptr_t)dev_region->mmap_addr; - if (m <= start && - start + offset + len <= m + dev_region->mmap_offset + - dev_region->size) + if (m <= ptr && + ptr + len <= m + dev_region->mmap_offset + dev_region->size) return 0; } From 6b4065153c67e7578d448927e49f244deea70e4d Mon Sep 17 00:00:00 2001 From: David Gibson <david@gibson.dropbear.id.au> Date: Tue, 18 Feb 2025 13:07:19 +1100 Subject: [PATCH 242/382] tap: Remove unused ETH_HDR_INIT() macro The uses of this macro were removed in d4598e1d18ac ("udp: Use the same buffer for the L2 header for all frames"). 
Signed-off-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- tap.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/tap.h b/tap.h index dfbd8b9..a476a12 100644 --- a/tap.h +++ b/tap.h @@ -6,8 +6,6 @@ #ifndef TAP_H #define TAP_H -#define ETH_HDR_INIT(proto) { .h_proto = htons_constant(proto) } - /** * struct tap_hdr - tap backend specific headers * @vnet_len: Frame length (for qemu socket transport) From 5a07eb3cccf1abf0a44d6ab01819f8f605c87ef4 Mon Sep 17 00:00:00 2001 From: David Gibson <david@gibson.dropbear.id.au> Date: Tue, 18 Feb 2025 13:50:13 +1100 Subject: [PATCH 243/382] tcp_vu: head_cnt need not be global head_cnt is a global variable which tracks how many entries in head[] are currently used. The fact that it's global obscures the fact that the lifetime over which it has a meaningful value is quite short: a single call to tcp_vu_data_from_sock(). Make it a local to tcp_vu_data_from_sock() to make that lifetime clearer. We keep the head[] array global for now - although technically it has the same valid lifetime - because it's large enough we might not want to put it on the stack.
Signed-off-by: David Gibson <david@gibson.dropbear.id.au> Reviewed-by: Laurent Vivier <lvivier@redhat.com> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- tcp_vu.c | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/tcp_vu.c b/tcp_vu.c index 0622f17..6891ed1 100644 --- a/tcp_vu.c +++ b/tcp_vu.c @@ -38,7 +38,6 @@ static struct iovec iov_vu[VIRTQUEUE_MAX_SIZE + 1]; static struct vu_virtq_element elem[VIRTQUEUE_MAX_SIZE]; static int head[VIRTQUEUE_MAX_SIZE + 1]; -static int head_cnt; /** * tcp_vu_hdrlen() - return the size of the header in level 2 frame (TCP) @@ -183,7 +182,7 @@ int tcp_vu_send_flag(const struct ctx *c, struct tcp_tap_conn *conn, int flags) static ssize_t tcp_vu_sock_recv(const struct ctx *c, const struct tcp_tap_conn *conn, bool v6, uint32_t already_sent, size_t fillsize, - int *iov_cnt) + int *iov_cnt, int *head_cnt) { struct vu_dev *vdev = c->vdev; struct vu_virtq *vq = &vdev->vq[VHOST_USER_RX_QUEUE]; @@ -202,7 +201,7 @@ static ssize_t tcp_vu_sock_recv(const struct ctx *c, vu_init_elem(elem, &iov_vu[1], VIRTQUEUE_MAX_SIZE); elem_cnt = 0; - head_cnt = 0; + *head_cnt = 0; while (fillsize > 0 && elem_cnt < VIRTQUEUE_MAX_SIZE) { struct iovec *iov; size_t frame_size, dlen; @@ -221,7 +220,7 @@ static ssize_t tcp_vu_sock_recv(const struct ctx *c, ASSERT(iov->iov_len >= hdrlen); iov->iov_base = (char *)iov->iov_base + hdrlen; iov->iov_len -= hdrlen; - head[head_cnt++] = elem_cnt; + head[(*head_cnt)++] = elem_cnt; fillsize -= dlen; elem_cnt += cnt; @@ -261,17 +260,18 @@ static ssize_t tcp_vu_sock_recv(const struct ctx *c, len -= iov->iov_len; } /* adjust head count */ - while (head_cnt > 0 && head[head_cnt - 1] >= i) - head_cnt--; + while (*head_cnt > 0 && head[*head_cnt - 1] >= i) + (*head_cnt)--; + /* mark end of array */ - head[head_cnt] = i; + head[*head_cnt] = i; *iov_cnt = i; /* release unused buffers */ vu_queue_rewind(vq, elem_cnt - i); /* restore space for headers in iov */ - for (i = 0; i < head_cnt; 
i++) { + for (i = 0; i < *head_cnt; i++) { struct iovec *iov = &elem[head[i]].in_sg[0]; iov->iov_base = (char *)iov->iov_base - hdrlen; @@ -357,11 +357,11 @@ int tcp_vu_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn) struct vu_dev *vdev = c->vdev; struct vu_virtq *vq = &vdev->vq[VHOST_USER_RX_QUEUE]; ssize_t len, previous_dlen; + int i, iov_cnt, head_cnt; size_t hdrlen, fillsize; int v6 = CONN_V6(conn); uint32_t already_sent; const uint16_t *check; - int i, iov_cnt; if (!vu_queue_enabled(vq) || !vu_queue_started(vq)) { debug("Got packet, but RX virtqueue not usable yet"); @@ -396,7 +396,8 @@ int tcp_vu_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn) /* collect the buffers from vhost-user and fill them with the * data from the socket */ - len = tcp_vu_sock_recv(c, conn, v6, already_sent, fillsize, &iov_cnt); + len = tcp_vu_sock_recv(c, conn, v6, already_sent, fillsize, + &iov_cnt, &head_cnt); if (len < 0) { if (len != -EAGAIN && len != -EWOULDBLOCK) { tcp_rst(c, conn); From e56c8038fc23a349ff4a457c6b447f927ac1a56e Mon Sep 17 00:00:00 2001 From: David Gibson <david@gibson.dropbear.id.au> Date: Tue, 18 Feb 2025 19:59:21 +1100 Subject: [PATCH 244/382] tcp: More type safety for tcp_flow_migrate_target_ext() tcp_flow_migrate_target_ext() takes a raw union flow *, although it is TCP specific, and requires a FLOW_TYPE_TCP entry. Our usual convention is that such functions should take a struct tcp_tap_conn * instead. Convert it to do so. 
Signed-off-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- flow.c | 2 +- tcp.c | 7 +++---- tcp_conn.h | 2 +- 3 files changed, 5 insertions(+), 6 deletions(-) diff --git a/flow.c b/flow.c index cc881e8..abe95b2 100644 --- a/flow.c +++ b/flow.c @@ -1106,7 +1106,7 @@ int flow_migrate_target(struct ctx *c, const struct migrate_stage *stage, repair_flush(c); for (i = 0; i < count; i++) { - rc = tcp_flow_migrate_target_ext(c, flowtab + i, fd); + rc = tcp_flow_migrate_target_ext(c, &flowtab[i].tcp, fd); if (rc) { debug("Migration data failure at flow %u: %s, abort", i, strerror_(-rc)); diff --git a/tcp.c b/tcp.c index 98e1c6a..272e4cd 100644 --- a/tcp.c +++ b/tcp.c @@ -3394,14 +3394,13 @@ int tcp_flow_migrate_target(struct ctx *c, int fd) /** * tcp_flow_migrate_target_ext() - Receive extended data for flow, set, connect * @c: Execution context - * @flow: Existing flow for this connection data + * @conn: Connection entry to complete with extra data * @fd: Descriptor for state migration * * Return: 0 on success, negative on fatal failure, but 0 on single flow failure */ -int tcp_flow_migrate_target_ext(struct ctx *c, union flow *flow, int fd) +int tcp_flow_migrate_target_ext(struct ctx *c, struct tcp_tap_conn *conn, int fd) { - struct tcp_tap_conn *conn = &flow->tcp; uint32_t peek_offset = conn->seq_to_tap - conn->seq_ack_from_tap; struct tcp_tap_transfer_ext t; int s = conn->sock, rc; @@ -3413,7 +3412,7 @@ int tcp_flow_migrate_target_ext(struct ctx *c, union flow *flow, int fd) } if (!t.tcpi_state) { /* Source wants us to skip this flow */ - flow_err(flow, "Dropping as requested by source"); + flow_err(conn, "Dropping as requested by source"); goto fail; } diff --git a/tcp_conn.h b/tcp_conn.h index 42dff48..53887c0 100644 --- a/tcp_conn.h +++ b/tcp_conn.h @@ -239,7 +239,7 @@ int tcp_flow_migrate_source_ext(int fd, int fidx, const struct tcp_tap_conn *conn); int tcp_flow_migrate_target(struct ctx *c, int fd); -int 
tcp_flow_migrate_target_ext(struct ctx *c, union flow *flow, int fd); +int tcp_flow_migrate_target_ext(struct ctx *c, struct tcp_tap_conn *conn, int fd); bool tcp_flow_is_established(const struct tcp_tap_conn *conn); From 854bc7b1a3b4e5443ea071e49b3a68198dbb88b3 Mon Sep 17 00:00:00 2001 From: David Gibson <david@gibson.dropbear.id.au> Date: Tue, 18 Feb 2025 19:59:22 +1100 Subject: [PATCH 245/382] tcp: Remove spurious prototype for tcp_flow_migrate_shrink_window This function existed in drafts of the migration code, but not the final version. Get rid of the prototype. Signed-off-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- tcp_conn.h | 1 - 1 file changed, 1 deletion(-) diff --git a/tcp_conn.h b/tcp_conn.h index 53887c0..8a15b08 100644 --- a/tcp_conn.h +++ b/tcp_conn.h @@ -233,7 +233,6 @@ bool tcp_flow_defer(const struct tcp_tap_conn *conn); int tcp_flow_repair_on(struct ctx *c, const struct tcp_tap_conn *conn); int tcp_flow_repair_off(struct ctx *c, const struct tcp_tap_conn *conn); -int tcp_flow_migrate_shrink_window(int fidx, const struct tcp_tap_conn *conn); int tcp_flow_migrate_source(int fd, struct tcp_tap_conn *conn); int tcp_flow_migrate_source_ext(int fd, int fidx, const struct tcp_tap_conn *conn); From ba0823f8a0e60d4fc0cb21179aaf64940509156a Mon Sep 17 00:00:00 2001 From: David Gibson <david@gibson.dropbear.id.au> Date: Tue, 18 Feb 2025 19:59:23 +1100 Subject: [PATCH 246/382] tcp: Don't pass both flow pointer and flow index tcp_flow_migrate_source_ext() is passed both the index of the flow it operates on and the pointer to the connection structure. However, the former is trivially derived from the latter. Simplify the interface. 
Signed-off-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- flow.c | 2 +- tcp.c | 6 ++---- tcp_conn.h | 3 +-- 3 files changed, 4 insertions(+), 7 deletions(-) diff --git a/flow.c b/flow.c index abe95b2..cc393e0 100644 --- a/flow.c +++ b/flow.c @@ -1053,7 +1053,7 @@ int flow_migrate_source(struct ctx *c, const struct migrate_stage *stage, * as EIO). */ foreach_established_tcp_flow(i, flow, FLOW_MAX) { - rc = tcp_flow_migrate_source_ext(fd, i, &flow->tcp); + rc = tcp_flow_migrate_source_ext(fd, &flow->tcp); if (rc) { err("Extended data for flow %u: %s", i, strerror_(-rc)); diff --git a/tcp.c b/tcp.c index 272e4cd..21b6c6c 100644 --- a/tcp.c +++ b/tcp.c @@ -3141,16 +3141,14 @@ int tcp_flow_migrate_source(int fd, struct tcp_tap_conn *conn) /** * tcp_flow_migrate_source_ext() - Dump queues, close sockets, send final data * @fd: Descriptor for state migration - * @fidx: Flow index * @conn: Pointer to the TCP connection structure * * Return: 0 on success, negative (not -EIO) on failure, -EIO on sending failure */ -int tcp_flow_migrate_source_ext(int fd, int fidx, - const struct tcp_tap_conn *conn) +int tcp_flow_migrate_source_ext(int fd, const struct tcp_tap_conn *conn) { uint32_t peek_offset = conn->seq_to_tap - conn->seq_ack_from_tap; - struct tcp_tap_transfer_ext *t = &migrate_ext[fidx]; + struct tcp_tap_transfer_ext *t = &migrate_ext[FLOW_IDX(conn)]; int s = conn->sock; int rc; diff --git a/tcp_conn.h b/tcp_conn.h index 8a15b08..9126a36 100644 --- a/tcp_conn.h +++ b/tcp_conn.h @@ -234,8 +234,7 @@ int tcp_flow_repair_on(struct ctx *c, const struct tcp_tap_conn *conn); int tcp_flow_repair_off(struct ctx *c, const struct tcp_tap_conn *conn); int tcp_flow_migrate_source(int fd, struct tcp_tap_conn *conn); -int tcp_flow_migrate_source_ext(int fd, int fidx, - const struct tcp_tap_conn *conn); +int tcp_flow_migrate_source_ext(int fd, const struct tcp_tap_conn *conn); int tcp_flow_migrate_target(struct ctx *c, int fd); int 
tcp_flow_migrate_target_ext(struct ctx *c, struct tcp_tap_conn *conn, int fd); From adb46c11d0ea67824cf8c4ef2113ec0b2c563c0e Mon Sep 17 00:00:00 2001 From: David Gibson <david@gibson.dropbear.id.au> Date: Tue, 18 Feb 2025 19:59:24 +1100 Subject: [PATCH 247/382] flow: Add flow_perror() helper Our general logging helpers include a number of _perror() variants which, like perror(3) include the description of the current errno. We didn't have those for our flow specific logging helpers, though. Fill this gap with flow_perror() and flow_dbg_perror(), and use them where it's useful. Signed-off-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- flow.c | 12 +++++++----- flow.h | 18 ++++++++++++++---- icmp.c | 5 ++--- tcp.c | 33 +++++++++++++++------------------ tcp_splice.c | 9 ++++----- udp_flow.c | 19 +++++++------------ 6 files changed, 49 insertions(+), 47 deletions(-) diff --git a/flow.c b/flow.c index cc393e0..c68f6bb 100644 --- a/flow.c +++ b/flow.c @@ -289,11 +289,13 @@ int flowside_connect(const struct ctx *c, int s, /** flow_log_ - Log flow-related message * @f: flow the message is related to + * @newline: Append newline at the end of the message, if missing * @pri: Log priority * @fmt: Format string * @...: printf-arguments */ -void flow_log_(const struct flow_common *f, int pri, const char *fmt, ...) +void flow_log_(const struct flow_common *f, bool newline, int pri, + const char *fmt, ...) { const char *type_or_state; char msg[BUFSIZ]; @@ -309,7 +311,7 @@ void flow_log_(const struct flow_common *f, int pri, const char *fmt, ...) 
else type_or_state = FLOW_TYPE(f); - logmsg(true, false, pri, + logmsg(newline, false, pri, "Flow %u (%s): %s", flow_idx(f), type_or_state, msg); } @@ -329,7 +331,7 @@ void flow_log_details_(const struct flow_common *f, int pri, const struct flowside *tgt = &f->side[TGTSIDE]; if (state >= FLOW_STATE_TGT) - flow_log_(f, pri, + flow_log_(f, true, pri, "%s [%s]:%hu -> [%s]:%hu => %s [%s]:%hu -> [%s]:%hu", pif_name(f->pif[INISIDE]), inany_ntop(&ini->eaddr, estr0, sizeof(estr0)), @@ -342,7 +344,7 @@ void flow_log_details_(const struct flow_common *f, int pri, inany_ntop(&tgt->eaddr, estr1, sizeof(estr1)), tgt->eport); else if (state >= FLOW_STATE_INI) - flow_log_(f, pri, "%s [%s]:%hu -> [%s]:%hu => ?", + flow_log_(f, true, pri, "%s [%s]:%hu -> [%s]:%hu => ?", pif_name(f->pif[INISIDE]), inany_ntop(&ini->eaddr, estr0, sizeof(estr0)), ini->eport, @@ -363,7 +365,7 @@ static void flow_set_state(struct flow_common *f, enum flow_state state) ASSERT(oldstate < FLOW_NUM_STATES); f->state = state; - flow_log_(f, LOG_DEBUG, "%s -> %s", flow_state_str[oldstate], + flow_log_(f, true, LOG_DEBUG, "%s -> %s", flow_state_str[oldstate], FLOW_STATE(f)); flow_log_details_(f, LOG_DEBUG, MAX(state, oldstate)); diff --git a/flow.h b/flow.h index 675726e..dcf7645 100644 --- a/flow.h +++ b/flow.h @@ -258,11 +258,11 @@ int flow_migrate_source(struct ctx *c, const struct migrate_stage *stage, int flow_migrate_target(struct ctx *c, const struct migrate_stage *stage, int fd); -void flow_log_(const struct flow_common *f, int pri, const char *fmt, ...) - __attribute__((format(printf, 3, 4))); - -#define flow_log(f_, pri, ...) flow_log_(&(f_)->f, (pri), __VA_ARGS__) +void flow_log_(const struct flow_common *f, bool newline, int pri, + const char *fmt, ...) + __attribute__((format(printf, 4, 5))); +#define flow_log(f_, pri, ...) flow_log_(&(f_)->f, true, (pri), __VA_ARGS__) #define flow_dbg(f, ...) flow_log((f), LOG_DEBUG, __VA_ARGS__) #define flow_err(f, ...) 
flow_log((f), LOG_ERR, __VA_ARGS__) @@ -272,6 +272,16 @@ void flow_log_(const struct flow_common *f, int pri, const char *fmt, ...) flow_dbg((f), __VA_ARGS__); \ } while (0) +#define flow_log_perror_(f, pri, ...) \ + do { \ + int errno_ = errno; \ + flow_log_((f), false, (pri), __VA_ARGS__); \ + logmsg(true, true, (pri), ": %s", strerror_(errno_)); \ + } while (0) + +#define flow_dbg_perror(f_, ...) flow_log_perror_(&(f_)->f, LOG_DEBUG, __VA_ARGS__) +#define flow_perror(f_, ...) flow_log_perror_(&(f_)->f, LOG_ERR, __VA_ARGS__) + void flow_log_details_(const struct flow_common *f, int pri, enum flow_state state); #define flow_log_details(f_, pri) \ diff --git a/icmp.c b/icmp.c index bcf498d..7e2b342 100644 --- a/icmp.c +++ b/icmp.c @@ -85,7 +85,7 @@ void icmp_sock_handler(const struct ctx *c, union epoll_ref ref) n = recvfrom(ref.fd, buf, sizeof(buf), 0, &sr.sa, &sl); if (n < 0) { - flow_err(pingf, "recvfrom() error: %s", strerror_(errno)); + flow_perror(pingf, "recvfrom() error"); return; } @@ -300,8 +300,7 @@ int icmp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af, pif_sockaddr(c, &sa, &sl, PIF_HOST, &tgt->eaddr, 0); if (sendto(pingf->sock, pkt, l4len, MSG_NOSIGNAL, &sa.sa, sl) < 0) { - flow_dbg(pingf, "failed to relay request to socket: %s", - strerror_(errno)); + flow_dbg_perror(pingf, "failed to relay request to socket"); } else { flow_dbg(pingf, "echo request to socket, ID: %"PRIu16", seq: %"PRIu16, diff --git a/tcp.c b/tcp.c index 21b6c6c..f498f5b 100644 --- a/tcp.c +++ b/tcp.c @@ -551,8 +551,7 @@ static void tcp_timer_ctl(const struct ctx *c, struct tcp_tap_conn *conn) fd = timerfd_create(CLOCK_MONOTONIC, 0); if (fd == -1 || fd > FD_REF_MAX) { - flow_dbg(conn, "failed to get timer: %s", - strerror_(errno)); + flow_dbg_perror(conn, "failed to get timer"); if (fd > -1) close(fd); conn->timer = -1; @@ -561,8 +560,7 @@ static void tcp_timer_ctl(const struct ctx *c, struct tcp_tap_conn *conn) conn->timer = fd; if (epoll_ctl(c->epollfd, 
EPOLL_CTL_ADD, conn->timer, &ev)) { - flow_dbg(conn, "failed to add timer: %s", - strerror_(errno)); + flow_dbg_perror(conn, "failed to add timer"); close(conn->timer); conn->timer = -1; return; @@ -587,7 +585,7 @@ static void tcp_timer_ctl(const struct ctx *c, struct tcp_tap_conn *conn) (unsigned long long)it.it_value.tv_nsec / 1000 / 1000); if (timerfd_settime(conn->timer, 0, &it, NULL)) - flow_err(conn, "failed to set timer: %s", strerror_(errno)); + flow_perror(conn, "failed to set timer"); } /** @@ -1386,10 +1384,10 @@ static void tcp_bind_outbound(const struct ctx *c, if (bind(s, &bind_sa.sa, sl)) { char sstr[INANY_ADDRSTRLEN]; - flow_dbg(conn, - "Can't bind TCP outbound socket to %s:%hu: %s", - inany_ntop(&tgt->oaddr, sstr, sizeof(sstr)), - tgt->oport, strerror_(errno)); + flow_dbg_perror(conn, + "Can't bind TCP outbound socket to %s:%hu", + inany_ntop(&tgt->oaddr, sstr, sizeof(sstr)), + tgt->oport); } } @@ -1398,9 +1396,9 @@ static void tcp_bind_outbound(const struct ctx *c, if (setsockopt(s, SOL_SOCKET, SO_BINDTODEVICE, c->ip4.ifname_out, strlen(c->ip4.ifname_out))) { - flow_dbg(conn, "Can't bind IPv4 TCP socket to" - " interface %s: %s", c->ip4.ifname_out, - strerror_(errno)); + flow_dbg_perror(conn, + "Can't bind IPv4 TCP socket to interface %s", + c->ip4.ifname_out); } } } else if (bind_sa.sa_family == AF_INET6) { @@ -1408,9 +1406,9 @@ static void tcp_bind_outbound(const struct ctx *c, if (setsockopt(s, SOL_SOCKET, SO_BINDTODEVICE, c->ip6.ifname_out, strlen(c->ip6.ifname_out))) { - flow_dbg(conn, "Can't bind IPv6 TCP socket to" - " interface %s: %s", c->ip6.ifname_out, - strerror_(errno)); + flow_dbg_perror(conn, + "Can't bind IPv6 TCP socket to interface %s", + c->ip6.ifname_out); } } } @@ -2193,7 +2191,7 @@ void tcp_timer_handler(const struct ctx *c, union epoll_ref ref) * and we just set the timer to a new point in the future: discard it. 
*/ if (timerfd_gettime(conn->timer, &check_armed)) - flow_err(conn, "failed to read timer: %s", strerror_(errno)); + flow_perror(conn, "failed to read timer"); if (check_armed.it_value.tv_sec || check_armed.it_value.tv_nsec) return; @@ -2235,8 +2233,7 @@ void tcp_timer_handler(const struct ctx *c, union epoll_ref ref) * ~ACK_TO_TAP_DUE or ~ACK_FROM_TAP_DUE. */ if (timerfd_settime(conn->timer, 0, &new, &old)) - flow_err(conn, "failed to set timer: %s", - strerror_(errno)); + flow_perror(conn, "failed to set timer"); if (old.it_value.tv_sec == ACT_TIMEOUT) { flow_dbg(conn, "activity timeout"); diff --git a/tcp_splice.c b/tcp_splice.c index 5d845c9..0d10e3d 100644 --- a/tcp_splice.c +++ b/tcp_splice.c @@ -164,7 +164,7 @@ static int tcp_splice_epoll_ctl(const struct ctx *c, if (epoll_ctl(c->epollfd, m, conn->s[0], &ev[0]) || epoll_ctl(c->epollfd, m, conn->s[1], &ev[1])) { int ret = -errno; - flow_err(conn, "ERROR on epoll_ctl(): %s", strerror_(errno)); + flow_perror(conn, "ERROR on epoll_ctl()"); return ret; } @@ -317,8 +317,8 @@ static int tcp_splice_connect_finish(const struct ctx *c, if (conn->pipe[sidei][0] < 0) { if (pipe2(conn->pipe[sidei], O_NONBLOCK | O_CLOEXEC)) { - flow_err(conn, "cannot create %d->%d pipe: %s", - sidei, !sidei, strerror_(errno)); + flow_perror(conn, "cannot create %d->%d pipe", + sidei, !sidei); conn_flag(c, conn, CLOSING); return -EIO; } @@ -482,8 +482,7 @@ void tcp_splice_sock_handler(struct ctx *c, union epoll_ref ref, rc = getsockopt(ref.fd, SOL_SOCKET, SO_ERROR, &err, &sl); if (rc) - flow_err(conn, "Error retrieving SO_ERROR: %s", - strerror_(errno)); + flow_perror(conn, "Error retrieving SO_ERROR"); else flow_trace(conn, "Error event on socket: %s", strerror_(err)); diff --git a/udp_flow.c b/udp_flow.c index 83c2568..c6b8630 100644 --- a/udp_flow.c +++ b/udp_flow.c @@ -93,9 +93,8 @@ static flow_sidx_t udp_flow_new(const struct ctx *c, union flow *flow, */ uflow->s[INISIDE] = fcntl(s_ini, F_DUPFD_CLOEXEC, 0); if (uflow->s[INISIDE] < 0) 
{ - flow_err(uflow, - "Couldn't duplicate listening socket: %s", - strerror_(errno)); + flow_perror(uflow, + "Couldn't duplicate listening socket"); goto cancel; } } @@ -113,16 +112,13 @@ static flow_sidx_t udp_flow_new(const struct ctx *c, union flow *flow, uflow->s[TGTSIDE] = flowside_sock_l4(c, EPOLL_TYPE_UDP_REPLY, tgtpif, tgt, fref.data); if (uflow->s[TGTSIDE] < 0) { - flow_dbg(uflow, - "Couldn't open socket for spliced flow: %s", - strerror_(errno)); + flow_dbg_perror(uflow, + "Couldn't open socket for spliced flow"); goto cancel; } if (flowside_connect(c, uflow->s[TGTSIDE], tgtpif, tgt) < 0) { - flow_dbg(uflow, - "Couldn't connect flow socket: %s", - strerror_(errno)); + flow_dbg_perror(uflow, "Couldn't connect flow socket"); goto cancel; } @@ -142,9 +138,8 @@ static flow_sidx_t udp_flow_new(const struct ctx *c, union flow *flow, flow_trace(uflow, "Discarded %d spurious reply datagrams", rc); } else if (errno != EAGAIN) { - flow_err(uflow, - "Unexpected error discarding datagrams: %s", - strerror_(errno)); + flow_perror(uflow, + "Unexpected error discarding datagrams"); } } From 7ffca35fddf1568698199c931ba1877c1908b443 Mon Sep 17 00:00:00 2001 From: David Gibson <david@gibson.dropbear.id.au> Date: Wed, 19 Feb 2025 13:28:34 +1100 Subject: [PATCH 248/382] flow: Remove unneeded index from foreach_* macros The foreach macros are odd in that they take two loop counters: an integer index, and a pointer to the flow. We nearly always want the latter, not the former, and we can get the index from the pointer trivially when we need it. So, rearrange the macros not to need the integer index. 
Signed-off-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- flow.c | 44 +++++++++++++++++++++----------------------- 1 file changed, 21 insertions(+), 23 deletions(-) diff --git a/flow.c b/flow.c index c68f6bb..3fcdd9f 100644 --- a/flow.c +++ b/flow.c @@ -53,30 +53,28 @@ const uint8_t flow_proto[] = { static_assert(ARRAY_SIZE(flow_proto) == FLOW_NUM_TYPES, "flow_proto[] doesn't match enum flow_type"); -#define foreach_flow(i, flow, bound) \ - for ((i) = 0, (flow) = &flowtab[(i)]; \ - (i) < (bound); \ - (i)++, (flow) = &flowtab[(i)]) \ +#define foreach_flow(flow, bound) \ + for ((flow) = flowtab; FLOW_IDX(flow) < (bound); (flow)++) \ if ((flow)->f.state == FLOW_STATE_FREE) \ - (i) += (flow)->free.n - 1; \ + (flow) += (flow)->free.n - 1; \ else -#define foreach_active_flow(i, flow, bound) \ - foreach_flow((i), (flow), (bound)) \ +#define foreach_active_flow(flow, bound) \ + foreach_flow((flow), (bound)) \ if ((flow)->f.state != FLOW_STATE_ACTIVE) \ /* NOLINTNEXTLINE(bugprone-branch-clone) */ \ continue; \ else -#define foreach_tcp_flow(i, flow, bound) \ - foreach_active_flow((i), (flow), (bound)) \ +#define foreach_tcp_flow(flow, bound) \ + foreach_active_flow((flow), (bound)) \ if ((flow)->f.type != FLOW_TCP) \ /* NOLINTNEXTLINE(bugprone-branch-clone) */ \ continue; \ else -#define foreach_established_tcp_flow(i, flow, bound) \ - foreach_tcp_flow((i), (flow), (bound)) \ +#define foreach_established_tcp_flow(flow, bound) \ + foreach_tcp_flow((flow), (bound)) \ if (!tcp_flow_is_established(&(flow)->tcp)) \ /* NOLINTNEXTLINE(bugprone-branch-clone) */ \ continue; \ @@ -918,11 +916,10 @@ static int flow_migrate_source_rollback(struct ctx *c, unsigned max_flow, int ret) { union flow *flow; - unsigned i; debug("...roll back migration"); - foreach_established_tcp_flow(i, flow, max_flow) + foreach_established_tcp_flow(flow, max_flow) if (tcp_flow_repair_off(c, &flow->tcp)) die("Failed to roll back TCP_REPAIR mode"); @@ 
-942,10 +939,9 @@ static int flow_migrate_source_rollback(struct ctx *c, unsigned max_flow, static int flow_migrate_repair_all(struct ctx *c, bool enable) { union flow *flow; - unsigned i; int rc; - foreach_established_tcp_flow(i, flow, FLOW_MAX) { + foreach_established_tcp_flow(flow, FLOW_MAX) { if (enable) rc = tcp_flow_repair_on(c, &flow->tcp); else @@ -954,14 +950,15 @@ static int flow_migrate_repair_all(struct ctx *c, bool enable) if (rc) { debug("Can't %s repair mode: %s", enable ? "enable" : "disable", strerror_(-rc)); - return flow_migrate_source_rollback(c, i, rc); + return flow_migrate_source_rollback(c, FLOW_IDX(flow), + rc); } } if ((rc = repair_flush(c))) { debug("Can't %s repair mode: %s", enable ? "enable" : "disable", strerror_(-rc)); - return flow_migrate_source_rollback(c, i, rc); + return flow_migrate_source_rollback(c, FLOW_IDX(flow), rc); } return 0; @@ -1003,13 +1000,12 @@ int flow_migrate_source(struct ctx *c, const struct migrate_stage *stage, uint32_t count = 0; bool first = true; union flow *flow; - unsigned i; int rc; (void)c; (void)stage; - foreach_established_tcp_flow(i, flow, FLOW_MAX) + foreach_established_tcp_flow(flow, FLOW_MAX) count++; count = htonl(count); @@ -1028,10 +1024,11 @@ int flow_migrate_source(struct ctx *c, const struct migrate_stage *stage, * stream might now be inconsistent, and we might have closed listening * TCP sockets, so just terminate. */ - foreach_established_tcp_flow(i, flow, FLOW_MAX) { + foreach_established_tcp_flow(flow, FLOW_MAX) { rc = tcp_flow_migrate_source(fd, &flow->tcp); if (rc) { - err("Can't send data, flow %u: %s", i, strerror_(-rc)); + err("Can't send data, flow %u: %s", FLOW_IDX(flow), + strerror_(-rc)); if (!first) die("Inconsistent migration state, exiting"); @@ -1054,10 +1051,11 @@ int flow_migrate_source(struct ctx *c, const struct migrate_stage *stage, * failures but not if the stream might be inconsistent (reported here * as EIO). 
*/ - foreach_established_tcp_flow(i, flow, FLOW_MAX) { + foreach_established_tcp_flow(flow, FLOW_MAX) { rc = tcp_flow_migrate_source_ext(fd, &flow->tcp); if (rc) { - err("Extended data for flow %u: %s", i, strerror_(-rc)); + err("Extended data for flow %u: %s", FLOW_IDX(flow), + strerror_(-rc)); if (rc == -EIO) die("Inconsistent migration state, exiting"); From b79a22d3601b69cf58b1803c5ead7f4667c46827 Mon Sep 17 00:00:00 2001 From: David Gibson <david@gibson.dropbear.id.au> Date: Wed, 19 Feb 2025 13:28:35 +1100 Subject: [PATCH 249/382] flow: Remove unneeded bound parameter from flow traversal macros The foreach macros used to step through flows each take a 'bound' parameter to only scan part of the flow table. Only one place actually passes a bound different from FLOW_MAX. So we can simplify every other invocation by having that one case manually handle the bound. Signed-off-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- flow.c | 34 ++++++++++++++++++---------------- 1 file changed, 18 insertions(+), 16 deletions(-) diff --git a/flow.c b/flow.c index 3fcdd9f..602fea7 100644 --- a/flow.c +++ b/flow.c @@ -53,28 +53,28 @@ const uint8_t flow_proto[] = { static_assert(ARRAY_SIZE(flow_proto) == FLOW_NUM_TYPES, "flow_proto[] doesn't match enum flow_type"); -#define foreach_flow(flow, bound) \ - for ((flow) = flowtab; FLOW_IDX(flow) < (bound); (flow)++) \ +#define foreach_flow(flow) \ + for ((flow) = flowtab; FLOW_IDX(flow) < FLOW_MAX; (flow)++) \ if ((flow)->f.state == FLOW_STATE_FREE) \ (flow) += (flow)->free.n - 1; \ else -#define foreach_active_flow(flow, bound) \ - foreach_flow((flow), (bound)) \ +#define foreach_active_flow(flow) \ + foreach_flow((flow)) \ if ((flow)->f.state != FLOW_STATE_ACTIVE) \ /* NOLINTNEXTLINE(bugprone-branch-clone) */ \ continue; \ else -#define foreach_tcp_flow(flow, bound) \ - foreach_active_flow((flow), (bound)) \ +#define foreach_tcp_flow(flow) \ + foreach_active_flow((flow)) \ if 
((flow)->f.type != FLOW_TCP) \ /* NOLINTNEXTLINE(bugprone-branch-clone) */ \ continue; \ else -#define foreach_established_tcp_flow(flow, bound) \ - foreach_tcp_flow((flow), (bound)) \ +#define foreach_established_tcp_flow(flow) \ + foreach_tcp_flow((flow)) \ if (!tcp_flow_is_established(&(flow)->tcp)) \ /* NOLINTNEXTLINE(bugprone-branch-clone) */ \ continue; \ @@ -907,21 +907,23 @@ void flow_defer_handler(const struct ctx *c, const struct timespec *now) /** * flow_migrate_source_rollback() - Disable repair mode, return failure * @c: Execution context - * @max_flow: Maximum index of affected flows + * @bound: No need to roll back flow indices >= @bound * @ret: Negative error code * * Return: @ret */ -static int flow_migrate_source_rollback(struct ctx *c, unsigned max_flow, - int ret) +static int flow_migrate_source_rollback(struct ctx *c, unsigned bound, int ret) { union flow *flow; debug("...roll back migration"); - foreach_established_tcp_flow(flow, max_flow) + foreach_established_tcp_flow(flow) { + if (FLOW_IDX(flow) >= bound) + break; if (tcp_flow_repair_off(c, &flow->tcp)) die("Failed to roll back TCP_REPAIR mode"); + } if (repair_flush(c)) die("Failed to roll back TCP_REPAIR mode"); @@ -941,7 +943,7 @@ static int flow_migrate_repair_all(struct ctx *c, bool enable) union flow *flow; int rc; - foreach_established_tcp_flow(flow, FLOW_MAX) { + foreach_established_tcp_flow(flow) { if (enable) rc = tcp_flow_repair_on(c, &flow->tcp); else @@ -1005,7 +1007,7 @@ int flow_migrate_source(struct ctx *c, const struct migrate_stage *stage, (void)c; (void)stage; - foreach_established_tcp_flow(flow, FLOW_MAX) + foreach_established_tcp_flow(flow) count++; count = htonl(count); @@ -1024,7 +1026,7 @@ int flow_migrate_source(struct ctx *c, const struct migrate_stage *stage, * stream might now be inconsistent, and we might have closed listening * TCP sockets, so just terminate. 
*/ - foreach_established_tcp_flow(flow, FLOW_MAX) { + foreach_established_tcp_flow(flow) { rc = tcp_flow_migrate_source(fd, &flow->tcp); if (rc) { err("Can't send data, flow %u: %s", FLOW_IDX(flow), @@ -1051,7 +1053,7 @@ int flow_migrate_source(struct ctx *c, const struct migrate_stage *stage, * failures but not if the stream might be inconsistent (reported here * as EIO). */ - foreach_established_tcp_flow(flow, FLOW_MAX) { + foreach_established_tcp_flow(flow) { rc = tcp_flow_migrate_source_ext(fd, &flow->tcp); if (rc) { err("Extended data for flow %u: %s", FLOW_IDX(flow), From 65e317a8fca4eaf9efbfe642cc7e4322c56aa1f7 Mon Sep 17 00:00:00 2001 From: David Gibson <david@gibson.dropbear.id.au> Date: Wed, 19 Feb 2025 13:28:36 +1100 Subject: [PATCH 250/382] flow: Clean up and generalise flow traversal macros The migration code introduced a number of 'foreach' macros to traverse the flow table. These aren't inherently tied to migration, so polish up their naming, move them to flow_table.h and also use in flow_defer_handler() which is the other place we need to traverse the whole table. For now we keep foreach_established_tcp_flow() as is. 
Signed-off-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- flow.c | 36 ++++++++---------------------------- flow_table.h | 36 ++++++++++++++++++++++++++++++++++++ 2 files changed, 44 insertions(+), 28 deletions(-) diff --git a/flow.c b/flow.c index 602fea7..bb5dcc3 100644 --- a/flow.c +++ b/flow.c @@ -53,28 +53,8 @@ const uint8_t flow_proto[] = { static_assert(ARRAY_SIZE(flow_proto) == FLOW_NUM_TYPES, "flow_proto[] doesn't match enum flow_type"); -#define foreach_flow(flow) \ - for ((flow) = flowtab; FLOW_IDX(flow) < FLOW_MAX; (flow)++) \ - if ((flow)->f.state == FLOW_STATE_FREE) \ - (flow) += (flow)->free.n - 1; \ - else - -#define foreach_active_flow(flow) \ - foreach_flow((flow)) \ - if ((flow)->f.state != FLOW_STATE_ACTIVE) \ - /* NOLINTNEXTLINE(bugprone-branch-clone) */ \ - continue; \ - else - -#define foreach_tcp_flow(flow) \ - foreach_active_flow((flow)) \ - if ((flow)->f.type != FLOW_TCP) \ - /* NOLINTNEXTLINE(bugprone-branch-clone) */ \ - continue; \ - else - #define foreach_established_tcp_flow(flow) \ - foreach_tcp_flow((flow)) \ + flow_foreach_of_type((flow), FLOW_TCP) \ if (!tcp_flow_is_established(&(flow)->tcp)) \ /* NOLINTNEXTLINE(bugprone-branch-clone) */ \ continue; \ @@ -801,7 +781,7 @@ void flow_defer_handler(const struct ctx *c, const struct timespec *now) struct flow_free_cluster *free_head = NULL; unsigned *last_next = &flow_first_free; bool timer = false; - unsigned idx; + union flow *flow; if (timespec_diff_ms(now, &flow_timer_run) >= FLOW_TIMER_INTERVAL) { timer = true; @@ -810,8 +790,7 @@ void flow_defer_handler(const struct ctx *c, const struct timespec *now) ASSERT(!flow_new_entry); /* Incomplete flow at end of cycle */ - for (idx = 0; idx < FLOW_MAX; idx++) { - union flow *flow = &flowtab[idx]; + flow_foreach_slot(flow) { bool closed = false; switch (flow->f.state) { @@ -828,12 +807,12 @@ void flow_defer_handler(const struct ctx *c, const struct timespec *now) } else { /* New free 
cluster, add to chain */ free_head = &flow->free; - *last_next = idx; + *last_next = FLOW_IDX(flow); last_next = &free_head->next; } /* Skip remaining empty entries */ - idx += skip - 1; + flow += skip - 1; continue; } @@ -886,14 +865,15 @@ void flow_defer_handler(const struct ctx *c, const struct timespec *now) if (free_head) { /* Add slot to current free cluster */ - ASSERT(idx == FLOW_IDX(free_head) + free_head->n); + ASSERT(FLOW_IDX(flow) == + FLOW_IDX(free_head) + free_head->n); free_head->n++; flow->free.n = flow->free.next = 0; } else { /* Create new free cluster */ free_head = &flow->free; free_head->n = 1; - *last_next = idx; + *last_next = FLOW_IDX(flow); last_next = &free_head->next; } } else { diff --git a/flow_table.h b/flow_table.h index 9a2ff24..fd2c57b 100644 --- a/flow_table.h +++ b/flow_table.h @@ -50,6 +50,42 @@ extern union flow flowtab[]; #define flow_foreach_sidei(sidei_) \ for ((sidei_) = INISIDE; (sidei_) < SIDES; (sidei_)++) + +/** + * flow_foreach_slot() - Step through each flow table entry + * @flow: Takes values of pointer to each flow table entry + * + * Includes FREE slots. 
+ */ +#define flow_foreach_slot(flow) \ + for ((flow) = flowtab; FLOW_IDX(flow) < FLOW_MAX; (flow)++) + +/** + * flow_foreach() - Step through each active flow + * @flow: Takes values of pointer to each active flow + */ +#define flow_foreach(flow) \ + flow_foreach_slot((flow)) \ + if ((flow)->f.state == FLOW_STATE_FREE) \ + (flow) += (flow)->free.n - 1; \ + else if ((flow)->f.state != FLOW_STATE_ACTIVE) { \ + flow_err((flow), "Bad flow state during traversal"); \ + continue; \ + } else + +/** + * flow_foreach_of_type() - Step through each active flow of given type + * @flow: Takes values of pointer to each flow + * @type_: Type of flow to traverse + */ +#define flow_foreach_of_type(flow, type_) \ + flow_foreach((flow)) \ + if ((flow)->f.type != (type_)) \ + /* NOLINTNEXTLINE(bugprone-branch-clone) */ \ + continue; \ + else + + /** flow_idx() - Index of flow from common structure * @f: Common flow fields pointer * From 3dc7da68a2731f661d7251a5fc759daffe24ca70 Mon Sep 17 00:00:00 2001 From: David Gibson <david@gibson.dropbear.id.au> Date: Wed, 19 Feb 2025 14:14:27 +1100 Subject: [PATCH 251/382] conf: More thorough error checking when parsing --mtu option We're a bit sloppy with parsing MTU which can lead to some surprising, though fairly harmless, results: * Passing a non-number like '-m xyz' will not give an error and act like -m 0 * Junk after a number (e.g. '-m 1500pqr') will be ignored rather than giving an error * We parse the MTU as a long, then immediately assign to an int, so on some platforms certain ludicrously out of bounds values will be silently truncated, rather than giving an error Be a bit more thorough with the error checking to avoid that. 
Signed-off-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- conf.c | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/conf.c b/conf.c index 18017f5..335f37c 100644 --- a/conf.c +++ b/conf.c @@ -1652,20 +1652,29 @@ void conf(struct ctx *c, int argc, char **argv) die("Invalid PID file: %s", optarg); break; - case 'm': - errno = 0; - c->mtu = strtol(optarg, NULL, 0); + case 'm': { + unsigned long mtu; + char *e; - if (!c->mtu) { + errno = 0; + mtu = strtoul(optarg, &e, 0); + + if (errno || *e) + die("Invalid MTU: %s", optarg); + + if (!mtu) { c->mtu = -1; break; } - if (c->mtu < ETH_MIN_MTU || c->mtu > (int)ETH_MAX_MTU || - errno) - die("Invalid MTU: %s", optarg); + if (mtu < ETH_MIN_MTU || mtu > ETH_MAX_MTU) { + die("MTU %lu out of range (%u..%u)", mtu, + ETH_MIN_MTU, ETH_MAX_MTU); + } + c->mtu = mtu; break; + } case 'a': if (inet_pton(AF_INET6, optarg, &c->ip6.addr) && !IN6_IS_ADDR_UNSPECIFIED(&c->ip6.addr) && From 1cc5d4c9fe0a84d3d39fc07358996989ca1b5875 Mon Sep 17 00:00:00 2001 From: David Gibson <david@gibson.dropbear.id.au> Date: Wed, 19 Feb 2025 14:14:28 +1100 Subject: [PATCH 252/382] conf: Use 0 instead of -1 as "unassigned" mtu value On the command line -m 0 means "don't assign an MTU" (letting the guest use its default. However, internally we use (c->mtu == -1) to represent that state. We use (c->mtu == 0) to represent "the user didn't specify on the command line, so use the default" - but this is only used during conf(), never afterwards. This is unnecessarily confusing. We can instead just initialise c->mtu to its default (65520) before parsing options and use 0 on both the command line and internally to represent the "don't assign" special case. This ensures that c->mtu is always 0..65535, so we can store it in a uint16_t which is more natural. 
Signed-off-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- conf.c | 11 ++--------- dhcp.c | 2 +- ndp.c | 2 +- passt.h | 3 ++- pasta.c | 2 +- tcp.c | 2 +- 6 files changed, 8 insertions(+), 14 deletions(-) diff --git a/conf.c b/conf.c index 335f37c..c5ee07b 100644 --- a/conf.c +++ b/conf.c @@ -1413,6 +1413,7 @@ void conf(struct ctx *c, int argc, char **argv) optstring = "+dqfel:hs:F:p:P:m:a:n:M:g:i:o:D:S:H:461t:u:"; } + c->mtu = ROUND_DOWN(ETH_MAX_MTU - ETH_HLEN, sizeof(uint32_t)); c->tcp.fwd_in.mode = c->tcp.fwd_out.mode = FWD_UNSET; c->udp.fwd_in.mode = c->udp.fwd_out.mode = FWD_UNSET; memcpy(c->our_tap_mac, MAC_OUR_LAA, ETH_ALEN); @@ -1662,12 +1663,7 @@ void conf(struct ctx *c, int argc, char **argv) if (errno || *e) die("Invalid MTU: %s", optarg); - if (!mtu) { - c->mtu = -1; - break; - } - - if (mtu < ETH_MIN_MTU || mtu > ETH_MAX_MTU) { + if (mtu && (mtu < ETH_MIN_MTU || mtu > ETH_MAX_MTU)) { die("MTU %lu out of range (%u..%u)", mtu, ETH_MIN_MTU, ETH_MAX_MTU); } @@ -1980,9 +1976,6 @@ void conf(struct ctx *c, int argc, char **argv) c->no_dhcpv6 = 1; } - if (!c->mtu) - c->mtu = ROUND_DOWN(ETH_MAX_MTU - ETH_HLEN, sizeof(uint32_t)); - get_dns(c); if (!*c->pasta_ifn) { diff --git a/dhcp.c b/dhcp.c index 4a209f1..66a716e 100644 --- a/dhcp.c +++ b/dhcp.c @@ -417,7 +417,7 @@ int dhcp(const struct ctx *c, const struct pool *p) &c->ip4.guest_gw, sizeof(c->ip4.guest_gw)); } - if (c->mtu != -1) { + if (c->mtu) { opts[26].slen = 2; opts[26].s[0] = c->mtu / 256; opts[26].s[1] = c->mtu % 256; diff --git a/ndp.c b/ndp.c index 37bf7a3..ded2081 100644 --- a/ndp.c +++ b/ndp.c @@ -256,7 +256,7 @@ static void ndp_ra(const struct ctx *c, const struct in6_addr *dst) ptr = &ra.var[0]; - if (c->mtu != -1) { + if (c->mtu) { struct opt_mtu *mtu = (struct opt_mtu *)ptr; *mtu = (struct opt_mtu) { .header = { diff --git a/passt.h b/passt.h index 1f0dab5..28d1389 100644 --- a/passt.h +++ b/passt.h @@ -274,6 +274,8 @@ struct ctx { int 
fd_repair; unsigned char our_tap_mac[ETH_ALEN]; unsigned char guest_mac[ETH_ALEN]; + uint16_t mtu; + uint64_t hash_secret[2]; int ifi4; @@ -298,7 +300,6 @@ struct ctx { int no_icmp; struct icmp_ctx icmp; - int mtu; int no_dns; int no_dns_search; int no_dhcp_dns; diff --git a/pasta.c b/pasta.c index 585a51c..fa3e7de 100644 --- a/pasta.c +++ b/pasta.c @@ -319,7 +319,7 @@ void pasta_ns_conf(struct ctx *c) if (c->pasta_conf_ns) { unsigned int flags = IFF_UP; - if (c->mtu != -1) + if (c->mtu) nl_link_set_mtu(nl_sock_ns, c->pasta_ifi, c->mtu); if (c->ifi6) /* Avoid duplicate address detection on link up */ diff --git a/tcp.c b/tcp.c index f498f5b..e3c0a53 100644 --- a/tcp.c +++ b/tcp.c @@ -1139,7 +1139,7 @@ int tcp_prepare_flags(const struct ctx *c, struct tcp_tap_conn *conn, if (flags & SYN) { int mss; - if (c->mtu == -1) { + if (!c->mtu) { mss = tinfo.tcpi_snd_mss; } else { mss = c->mtu - sizeof(struct tcphdr); From 183bedf478e34079244fe4cfbb2c1a0f02a5a037 Mon Sep 17 00:00:00 2001 From: Stefano Brivio <sbrivio@redhat.com> Date: Tue, 18 Feb 2025 09:34:26 +0100 Subject: [PATCH 253/382] Makefile: Use mmap2() as alternative for mmap() in valgrind extra syscalls ...instead of unconditionally trying to enable both: mmap2() is the 32-bit ARM variant for mmap() (and perhaps for other architectures), but if mmap() is available, valgrind will use that one. This avoids seccomp.sh warning us about missing mmap2() if mmap() is present, and is consistent with what we do in vhost-user code. 
Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- Makefile | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Makefile b/Makefile index d4e1096..f2ac8e5 100644 --- a/Makefile +++ b/Makefile @@ -109,9 +109,9 @@ passt-repair: $(PASST_REPAIR_SRCS) seccomp_repair.h $(CC) $(FLAGS) $(CFLAGS) $(CPPFLAGS) $(PASST_REPAIR_SRCS) -o passt-repair $(LDFLAGS) valgrind: EXTRA_SYSCALLS += rt_sigprocmask rt_sigtimedwait rt_sigaction \ - rt_sigreturn getpid gettid kill clock_gettime mmap \ - mmap2 munmap open unlink gettimeofday futex statx \ - readlink + rt_sigreturn getpid gettid kill clock_gettime \ + mmap|mmap2 munmap open unlink gettimeofday futex \ + statx readlink valgrind: FLAGS += -g -DVALGRIND valgrind: all From 16553c82806e0a55508baf553cb79e902638c10f Mon Sep 17 00:00:00 2001 From: Stefano Brivio <sbrivio@redhat.com> Date: Tue, 18 Feb 2025 09:42:28 +0100 Subject: [PATCH 254/382] dhcp: Add option code byte in calculation for OPT_MAX boundary check Otherwise we'll limit messages to 577 bytes, instead of 576 bytes as intended: $ fqdn="thirtytwocharactersforeachlabel.thirtytwocharactersforeachlabel.thirtytwocharactersforeachlabel.thirtytwocharactersforeachlabel.thirtytwocharactersforeachlabel.thirtytwocharactersforeachlabel.thirtytwocharactersforeachlabel.then_make_it_251_with_this" $ hostname="__eighteen_bytes__" $ ./pasta --fqdn ${fqdn} -H ${hostname} -p dhcp.pcap -- /sbin/dhclient -4 Saving packet capture to dhcp.pcap $ tshark -r dhcp.pcap -V -Y 'dhcp.option.value == 5' | grep "Total Length" Total Length: 577 This was hidden by the issue fixed by commit bcc4908c2b4a ("dhcp Remove option 255 length byte") until now. 
Otherwise we build it, but we don't install it. Not an issue that warrants a release right away as it's anyway usable.
%prep %setup -q -n passt-%{git_hash} @@ -82,6 +82,7 @@ make -f %{_datadir}/selinux/devel/Makefile install -p -m 644 -D passt.pp %{buildroot}%{_datadir}/selinux/packages/%{selinuxtype}/passt.pp install -p -m 644 -D passt.if %{buildroot}%{_datadir}/selinux/devel/include/distributed/passt.if install -p -m 644 -D pasta.pp %{buildroot}%{_datadir}/selinux/packages/%{selinuxtype}/pasta.pp +install -p -m 644 -D passt-repair.pp %{buildroot}%{_datadir}/selinux/packages/%{selinuxtype}/passt-repair.pp popd %pre selinux @@ -90,11 +91,13 @@ popd %post selinux %selinux_modules_install -s %{selinuxtype} %{_datadir}/selinux/packages/%{selinuxtype}/passt.pp %selinux_modules_install -s %{selinuxtype} %{_datadir}/selinux/packages/%{selinuxtype}/pasta.pp +%selinux_modules_install -s %{selinuxtype} %{_datadir}/selinux/packages/%{selinuxtype}/passt-repair.pp %postun selinux if [ $1 -eq 0 ]; then %selinux_modules_uninstall -s %{selinuxtype} passt %selinux_modules_uninstall -s %{selinuxtype} pasta + %selinux_modules_uninstall -s %{selinuxtype} passt-repair fi %posttrans selinux @@ -124,6 +127,7 @@ fi %{_datadir}/selinux/packages/%{selinuxtype}/passt.pp %{_datadir}/selinux/devel/include/distributed/passt.if %{_datadir}/selinux/packages/%{selinuxtype}/pasta.pp +%{_datadir}/selinux/packages/%{selinuxtype}/passt-repair.pp %changelog {{{ passt_git_changelog }}} From ea69ca6a20ac7408a913fd5de383a5383d679678 Mon Sep 17 00:00:00 2001 From: Jon Maloy <jmaloy@redhat.com> Date: Wed, 19 Feb 2025 10:20:41 -0500 Subject: [PATCH 256/382] tap: always set the no_frag flag in IPv4 headers When studying the Linux source code and Wireshark dumps it seems like the no_frag flag in the IPv4 header is always set. Following discussions in the Internet on this subject indicates that modern routers never fragment packets, and that it isn't even supported in many cases. Adding to this that incoming messages forwarded on the tap interface never even pass through a router it seems safe to always set this flag. 
This makes the IPv4 headers of forwarded messages identical to those sent by the external sockets, something we must consider desirable. Signed-off-by: Jon Maloy <jmaloy@redhat.com> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- ip.h | 3 ++- tap.c | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/ip.h b/ip.h index 1544dbf..858cc89 100644 --- a/ip.h +++ b/ip.h @@ -36,13 +36,14 @@ .tos = 0, \ .tot_len = 0, \ .id = 0, \ - .frag_off = 0, \ + .frag_off = htons(IP_DF), \ .ttl = 0xff, \ .protocol = (proto), \ .saddr = 0, \ .daddr = 0, \ } #define L2_BUF_IP4_PSUM(proto) ((uint32_t)htons_constant(0x4500) + \ + (uint32_t)htons_constant(IP_DF) + \ (uint32_t)htons(0xff00 | (proto))) diff --git a/tap.c b/tap.c index d0673e5..44b0fc0 100644 --- a/tap.c +++ b/tap.c @@ -153,7 +153,7 @@ static void *tap_push_ip4h(struct iphdr *ip4h, struct in_addr src, ip4h->tos = 0; ip4h->tot_len = htons(l3len); ip4h->id = 0; - ip4h->frag_off = 0; + ip4h->frag_off = htons(IP_DF); ip4h->ttl = 255; ip4h->protocol = proto; ip4h->saddr = src.s_addr; From be86232f72dcfbd51a889206e80d587fbcaa1c5b Mon Sep 17 00:00:00 2001 From: Michal Privoznik <mprivozn@redhat.com> Date: Fri, 21 Feb 2025 12:53:13 +0100 Subject: [PATCH 257/382] seccomp.sh: Silence stty errors When printing list of allowed syscalls the width of terminal is obtained for nicer output (see commit below). The width is obtained by running 'stty'. While this works when building from a console, it doesn't work during rpmbuild/emerge/.. as stdout is usually not a console but a logfile and stdin is usually /dev/null or something. This results in stty reporting errors like this: stty: 'standard input': Inappropriate ioctl for device Redirect stty's stderr to /dev/null to silence it. 
Fixes: 712ca3235329 ("seccomp.sh: Try to account for terminal width while formatting list of system calls") Signed-off-by: Michal Privoznik <mprivozn@redhat.com> Reviewed-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- seccomp.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/seccomp.sh b/seccomp.sh index 4c521ae..a7bc417 100755 --- a/seccomp.sh +++ b/seccomp.sh @@ -255,7 +255,7 @@ for __p in ${__profiles}; do __calls="${__calls} ${EXTRA_SYSCALLS:-}" __calls="$(filter ${__calls})" - cols="$(stty -a | sed -n 's/.*columns \([0-9]*\).*/\1/p' || :)" 2>/dev/null + cols="$(stty -a 2>/dev/null | sed -n 's/.*columns \([0-9]*\).*/\1/p' || :)" 2>/dev/null case $cols in [0-9]*) col_args="-w ${cols}";; *) col_args="";; esac echo "seccomp profile ${__p} allows: ${__calls}" | tr '\n' ' ' | fmt -t ${col_args} From 87471731e6bb0b5df3a50277527caf3381b45ee4 Mon Sep 17 00:00:00 2001 From: Stefano Brivio <sbrivio@redhat.com> Date: Fri, 28 Feb 2025 01:14:01 +0100 Subject: [PATCH 258/382] selinux: Fixes/workarounds for passt and passt-repair, mostly for libvirt usage Here are a bunch of workarounds and a couple of fixes for libvirt usage which are rather hard to split into single logical patches as there appear to be some obscure dependencies between some of them: - passt-repair needs to have an exec_type typeattribute (otherwise the policy for lsmd(1) causes a violation on getattr on its executable) file, and that typeattribute just happened to be there for passt as a result of init_daemon_domain(), but passt-repair isn't a daemon, so we need an explicit corecmd_executable_file() - passt-repair needs a workaround, which I'll revisit once https://github.com/fedora-selinux/selinux-policy/issues/2579 is solved, for usage with libvirt: allow it to use qemu_var_run_t and virt_var_run_t sockets - add 'bpf' and 'dac_read_search' capabilities for passt-repair: they are needed (for whatever reason I didn't investigate) to 
+	# Workaround: passt-repair needs to access socket files
+# Workaround: passt-repair needs to access socket files
+allow passt_repair_t qemu_var_run_t:unix_stream_socket { connectto read write }; +allow passt_repair_t virt_var_run_t:unix_stream_socket { connectto read write }; + +allow passt_repair_t qemu_var_run_t:dir search; +allow passt_repair_t virt_var_run_t:dir search; + +allow passt_repair_t qemu_var_run_t:sock_file { read write }; +allow passt_repair_t virt_var_run_t:sock_file { read write }; diff --git a/contrib/selinux/passt.te b/contrib/selinux/passt.te index f595079..f8ea672 100644 --- a/contrib/selinux/passt.te +++ b/contrib/selinux/passt.te @@ -29,6 +29,9 @@ require { # particularly complicated. To avoid breakage in the short term, # deal with it in passt's own policy. type svirt_image_t; + type svirt_tmpfs_t; + type svirt_t; + type null_device_t; class file { ioctl getattr setattr create read write unlink open relabelto execute execute_no_trans map }; class dir { search write add_name remove_name mounton }; @@ -45,7 +48,7 @@ require { type net_conf_t; type proc_net_t; type node_t; - class tcp_socket { create accept listen name_bind name_connect getattr }; + class tcp_socket { create accept listen name_bind name_connect getattr ioctl }; class udp_socket { create accept listen }; class icmp_socket { bind create name_bind node_bind setopt read write }; class sock_file { create unlink write }; @@ -129,7 +132,7 @@ corenet_udp_sendrecv_all_ports(passt_t) allow passt_t node_t:icmp_socket { name_bind node_bind }; allow passt_t port_t:icmp_socket name_bind; -allow passt_t self:tcp_socket { create getopt setopt connect bind listen accept shutdown read write getattr }; +allow passt_t self:tcp_socket { create getopt setopt connect bind listen accept shutdown read write getattr ioctl }; allow passt_t self:udp_socket { create getopt setopt connect bind read write }; allow passt_t self:icmp_socket { bind create setopt read write }; @@ -143,3 +146,5 @@ allow passt_t unconfined_t:unix_stream_socket { read write }; # particularly complicated. 
To avoid breakage in the short term, # deal with it in passt's own policy. allow passt_t svirt_image_t:file { read write map }; +allow passt_t svirt_tmpfs_t:file { read write map }; +allow passt_t null_device_t:chr_file map; From 7b92f2e8525a94fb6f80d5e0bedba7eacc378714 Mon Sep 17 00:00:00 2001 From: David Gibson <david@gibson.dropbear.id.au> Date: Thu, 27 Feb 2025 16:55:13 +1100 Subject: [PATCH 259/382] migrate, flow: Trivially succeed if migrating with no flows We could get a migration request when we have no active flows; or at least none that we need or are able to migrate. In this case after sending or receiving the number of flows we continue to step through various lists. In the target case, this could include communication with passt-repair. If passt-repair wasn't started that could cause further errors, but of course they shouldn't matter if we have nothing to repair. Make it more obvious that there's nothing to do and avoid such errors by short-circuiting flow_migrate_{source,target}() if there are no migratable flows. Signed-off-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- flow.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/flow.c b/flow.c index bb5dcc3..6cf96c2 100644 --- a/flow.c +++ b/flow.c @@ -999,6 +999,9 @@ int flow_migrate_source(struct ctx *c, const struct migrate_stage *stage, debug("Sending %u flows", ntohl(count)); + if (!count) + return 0; + /* Dump and send information that can be stored in the flow table. 
* * Limited rollback options here: if we fail to transfer any data (that @@ -1070,6 +1073,9 @@ int flow_migrate_target(struct ctx *c, const struct migrate_stage *stage, count = ntohl(count); debug("Receiving %u flows", count); + if (!count) + return 0; + if ((rc = flow_migrate_repair_all(c, true))) return -rc; From 39f85bce1a3b9da3bd11458c521e589f674e587a Mon Sep 17 00:00:00 2001 From: David Gibson <david@gibson.dropbear.id.au> Date: Thu, 27 Feb 2025 16:55:14 +1100 Subject: [PATCH 260/382] migrate, flow: Don't attempt to migrate TCP flows without passt-repair Migrating TCP flows requires passt-repair in order to use TCP_REPAIR. If passt-repair is not started, our failure mode is pretty ugly though: we'll attempt the migration, hitting various problems when we can't enter repair mode. In some cases we may not roll back these changes properly, meaning we break network connections on the source. Our general approach is not to completely block migration if there are problems, but simply to break any flows we can't migrate. So, if we have no connection from passt-repair carry on with the migration, but don't attempt to migrate any TCP connections. 
There are two small bugs in error returns from tcp_flow_repair_socket(), which is supposed to return a negative errno code:

1) On bind() failures, we directly pass on the return code from bind(),
   which is just 0 or -1, instead of an error code.

2) In the caller, tcp_flow_migrate_target(), we call strerror_()
   directly on the negative error code, but strerror() requires a
   positive error code.

Correct both of these.
Signed-off-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- tcp.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tcp.c b/tcp.c index e3c0a53..8528ee3 100644 --- a/tcp.c +++ b/tcp.c @@ -3280,7 +3280,8 @@ int tcp_flow_repair_socket(struct ctx *c, struct tcp_tap_conn *conn) tcp_sock_set_nodelay(s); - if ((rc = bind(s, &a.sa, sizeof(a)))) { + if (bind(s, &a.sa, sizeof(a))) { + rc = -errno; err_perror("Failed to bind socket for migrated flow"); goto err; } @@ -3375,7 +3376,7 @@ int tcp_flow_migrate_target(struct ctx *c, int fd) conn->seq_init_from_tap = ntohl(t.seq_init_from_tap); if ((rc = tcp_flow_repair_socket(c, conn))) { - flow_err(flow, "Can't set up socket: %s, drop", strerror_(rc)); + flow_err(flow, "Can't set up socket: %s, drop", strerror_(-rc)); flow_alloc_cancel(flow); return 0; } From b2708218a6eec82fad98da52d7569d13cf35e05c Mon Sep 17 00:00:00 2001 From: David Gibson <david@gibson.dropbear.id.au> Date: Thu, 27 Feb 2025 16:55:16 +1100 Subject: [PATCH 262/382] tcp: Unconditionally move to CLOSED state on tcp_rst() tcp_rst() attempts to send an RST packet to the guest, and if that succeeds moves the flow to CLOSED state. However, even if the tcp_send_flag() fails the flow is still dead: we've usually closed the socket already, and something has already gone irretrievably wrong. So we should still mark the flow as CLOSED. That will cause it to be cleaned up, meaning any future packets from the guest for it won't match a flow, so should generate new RSTs (they don't at the moment, but that's a separate bug). 
But it doesn't quite work: the flow_alloc_cancel() means that the flows in the target's flow table are no longer a one-to-one match to the flows which the source is sending data for. This means that data for later flows will be mismatched to a different flow. Most likely that will cause some nasty error later, but even worse it might appear to succeed but lead to data corruption due to incorrectly restoring one of the flows.
To make what's going on clearer (and with less extraneous error messages), put several explicit tests for a missing socket later in the migration path to read the data associated with the flow but explicitly discard it. Signed-off-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- tcp.c | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/tcp.c b/tcp.c index d23b6d9..b3aa9a2 100644 --- a/tcp.c +++ b/tcp.c @@ -2708,6 +2708,9 @@ int tcp_flow_repair_on(struct ctx *c, const struct tcp_tap_conn *conn) { int rc = 0; + if (conn->sock < 0) + return 0; + if ((rc = repair_set(c, conn->sock, TCP_REPAIR_ON))) err("Failed to set TCP_REPAIR"); @@ -2725,6 +2728,9 @@ int tcp_flow_repair_off(struct ctx *c, const struct tcp_tap_conn *conn) { int rc = 0; + if (conn->sock < 0) + return 0; + if ((rc = repair_set(c, conn->sock, TCP_REPAIR_OFF))) err("Failed to clear TCP_REPAIR"); @@ -3377,7 +3383,8 @@ int tcp_flow_migrate_target(struct ctx *c, int fd) if ((rc = tcp_flow_repair_socket(c, conn))) { flow_err(flow, "Can't set up socket: %s, drop", strerror_(-rc)); - flow_alloc_cancel(flow); + /* Can't leave the flow in an incomplete state */ + FLOW_ACTIVATE(conn); return 0; } @@ -3453,6 +3460,10 @@ int tcp_flow_migrate_target_ext(struct ctx *c, struct tcp_tap_conn *conn, int fd return rc; } + if (conn->sock < 0) + /* We weren't able to create the socket, discard flow */ + goto fail; + if (tcp_flow_select_queue(s, TCP_SEND_QUEUE)) goto fail; @@ -3540,8 +3551,10 @@ int tcp_flow_migrate_target_ext(struct ctx *c, struct tcp_tap_conn *conn, int fd return 0; fail: - tcp_flow_repair_off(c, conn); - repair_flush(c); + if (conn->sock >= 0) { + tcp_flow_repair_off(c, conn); + repair_flush(c); + } conn->flags = 0; /* Not waiting for ACK, don't schedule timer */ tcp_rst(c, conn); From 008175636c789d36ef585a94eee4d62536cac7d6 Mon Sep 17 00:00:00 2001 From: David Gibson <david@gibson.dropbear.id.au> Date: Wed, 5 Mar 2025 
15:32:28 +1100 Subject: [PATCH 264/382] ip: Helpers to access IPv6 flow label The flow label is a 20-bit field in the IPv6 header. The length and alignment make it awkward to pass around as is. Obviously, it can be packed into a 32-bit integer though, and we do this in two places. We have some further upcoming places where we want to manipulate the flow label, so make some helpers for marshalling and unmarshalling it to an integer. Signed-off-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- ip.h | 25 +++++++++++++++++++++++++ tap.c | 4 +--- tcp.c | 4 +--- 3 files changed, 27 insertions(+), 6 deletions(-) diff --git a/ip.h b/ip.h index 858cc89..5edb7e7 100644 --- a/ip.h +++ b/ip.h @@ -91,6 +91,31 @@ struct ipv6_opt_hdr { */ } __attribute__((packed)); /* required for some archs */ +/** + * ip6_set_flow_lbl() - Set flow label in an IPv6 header + * @ip6h: Pointer to IPv6 header, updated + * @flow: Set @ip6h flow label to the low 20 bits of this integer + */ +static inline void ip6_set_flow_lbl(struct ipv6hdr *ip6h, uint32_t flow) +{ + ip6h->flow_lbl[0] = (flow >> 16) & 0xf; + ip6h->flow_lbl[1] = (flow >> 8) & 0xff; + ip6h->flow_lbl[2] = (flow >> 0) & 0xff; +} + +/** ip6_get_flow_lbl() - Get flow label from an IPv6 header + * @ip6h: Pointer to IPv6 header + * + * Return: flow label from @ip6h as an integer (<= 20 bits) + */ +/* cppcheck-suppress unusedFunction */ +static inline uint32_t ip6_get_flow_lbl(const struct ipv6hdr *ip6h) +{ + return (ip6h->flow_lbl[0] & 0xf) << 16 | + ip6h->flow_lbl[1] << 8 | + ip6h->flow_lbl[2]; +} + char *ipv6_l4hdr(const struct pool *p, int idx, size_t offset, uint8_t *proto, size_t *dlen); diff --git a/tap.c b/tap.c index 44b0fc0..3908262 100644 --- a/tap.c +++ b/tap.c @@ -241,9 +241,7 @@ static void *tap_push_ip6h(struct ipv6hdr *ip6h, ip6h->hop_limit = 255; ip6h->saddr = *src; ip6h->daddr = *dst; - ip6h->flow_lbl[0] = (flow >> 16) & 0xf; - ip6h->flow_lbl[1] = (flow >> 8) & 0xff; - 
We have some future cases where we want to consider the flow label in the L4 code, which is awkward if we could be given a single batch with multiple labels.

Add the flow label to tap6_l4_t and group by it as well as the other criteria. In future we could possibly use the flow label _instead_ of peeking into the L4 header for the ports, but we don't do so for now.

The guest should use the same flow label for all packets in a flow, but if it doesn't this change won't break anything, it just means we'll batch things a bit sub-optimally.
Signed-off-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- ip.h | 1 - tap.c | 4 ++++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/ip.h b/ip.h index 5edb7e7..c82431e 100644 --- a/ip.h +++ b/ip.h @@ -108,7 +108,6 @@ static inline void ip6_set_flow_lbl(struct ipv6hdr *ip6h, uint32_t flow) * * Return: flow label from @ip6h as an integer (<= 20 bits) */ -/* cppcheck-suppress unusedFunction */ static inline uint32_t ip6_get_flow_lbl(const struct ipv6hdr *ip6h) { return (ip6h->flow_lbl[0] & 0xf) << 16 | diff --git a/tap.c b/tap.c index 3908262..202abae 100644 --- a/tap.c +++ b/tap.c @@ -489,6 +489,7 @@ static struct tap4_l4_t { * struct l4_seq6_t - Message sequence for one protocol handler call, IPv6 * @msgs: Count of messages in sequence * @protocol: Protocol number + * @flow_lbl: IPv6 flow label * @source: Source port * @dest: Destination port * @saddr: Source address @@ -497,6 +498,7 @@ static struct tap4_l4_t { */ static struct tap6_l4_t { uint8_t protocol; + uint32_t flow_lbl :20; uint16_t source; uint16_t dest; @@ -870,6 +872,7 @@ resume: ((seq)->protocol == (proto) && \ (seq)->source == (uh)->source && \ (seq)->dest == (uh)->dest && \ + (seq)->flow_lbl == ip6_get_flow_lbl(ip6h) && \ IN6_ARE_ADDR_EQUAL(&(seq)->saddr, saddr) && \ IN6_ARE_ADDR_EQUAL(&(seq)->daddr, daddr)) @@ -878,6 +881,7 @@ resume: (seq)->protocol = (proto); \ (seq)->source = (uh)->source; \ (seq)->dest = (uh)->dest; \ + (seq)->flow_lbl = ip6_get_flow_lbl(ip6h); \ (seq)->saddr = *saddr; \ (seq)->daddr = *daddr; \ } while (0) From 672d786de1c1f2aca32caedbcf440f710c4aecb5 Mon Sep 17 00:00:00 2001 From: David Gibson <david@gibson.dropbear.id.au> Date: Wed, 5 Mar 2025 15:32:30 +1100 Subject: [PATCH 266/382] tcp: Send RST in response to guest packets that match no connection Currently, if a non-SYN TCP packet arrives which doesn't match any existing connection, we simply ignore it. 
However RFC 9293, section 3.10.7.1 says we should respond with an RST to a non-SYN, non-RST packet that's for a CLOSED (i.e. non-existent) connection. This can arise in practice with migration, in cases where some error means we have to discard a connection. We destroy the connection with tcp_rst() in that case, but because the guest is stopped, we may not be able to deliver the RST packet on the tap interface immediately. This change ensures an RST will be sent if the guest tries to use the connection again. A similar situation can arise if a passt/pasta instance is killed or crashes, but is then replaced with another attached to the same guest. This can leave the guest with stale connections that the new passt instance isn't aware of. It's better to send an RST so the guest knows quickly these are broken, rather than letting them linger until they time out. Signed-off-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- tap.c | 17 +++++++------- tap.h | 6 +++++ tcp.c | 74 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++- tcp.h | 2 +- 4 files changed, 88 insertions(+), 11 deletions(-) diff --git a/tap.c b/tap.c index 202abae..86d051e 100644 --- a/tap.c +++ b/tap.c @@ -122,7 +122,7 @@ const struct in6_addr *tap_ip6_daddr(const struct ctx *c, * * Return: pointer at which to write the packet's payload */ -static void *tap_push_l2h(const struct ctx *c, void *buf, uint16_t proto) +void *tap_push_l2h(const struct ctx *c, void *buf, uint16_t proto) { struct ethhdr *eh = (struct ethhdr *)buf; @@ -143,8 +143,8 @@ static void *tap_push_l2h(const struct ctx *c, void *buf, uint16_t proto) * * Return: pointer at which to write the packet's payload */ -static void *tap_push_ip4h(struct iphdr *ip4h, struct in_addr src, - struct in_addr dst, size_t l4len, uint8_t proto) +void *tap_push_ip4h(struct iphdr *ip4h, struct in_addr src, + struct in_addr dst, size_t l4len, uint8_t proto) { uint16_t l3len = l4len + sizeof(*ip4h); 
@@ -229,10 +229,9 @@ void tap_icmp4_send(const struct ctx *c, struct in_addr src, struct in_addr dst, * * Return: pointer at which to write the packet's payload */ -static void *tap_push_ip6h(struct ipv6hdr *ip6h, - const struct in6_addr *src, - const struct in6_addr *dst, - size_t l4len, uint8_t proto, uint32_t flow) +void *tap_push_ip6h(struct ipv6hdr *ip6h, + const struct in6_addr *src, const struct in6_addr *dst, + size_t l4len, uint8_t proto, uint32_t flow) { ip6h->payload_len = htons(l4len); ip6h->priority = 0; @@ -744,7 +743,7 @@ append: for (k = 0; k < p->count; ) k += tcp_tap_handler(c, PIF_TAP, AF_INET, &seq->saddr, &seq->daddr, - p, k, now); + 0, p, k, now); } else if (seq->protocol == IPPROTO_UDP) { if (c->no_udp) continue; @@ -927,7 +926,7 @@ append: for (k = 0; k < p->count; ) k += tcp_tap_handler(c, PIF_TAP, AF_INET6, &seq->saddr, &seq->daddr, - p, k, now); + seq->flow_lbl, p, k, now); } else if (seq->protocol == IPPROTO_UDP) { if (c->no_udp) continue; diff --git a/tap.h b/tap.h index a476a12..390ac12 100644 --- a/tap.h +++ b/tap.h @@ -42,6 +42,9 @@ static inline void tap_hdr_update(struct tap_hdr *thdr, size_t l2len) thdr->vnet_len = htonl(l2len); } +void *tap_push_l2h(const struct ctx *c, void *buf, uint16_t proto); +void *tap_push_ip4h(struct iphdr *ip4h, struct in_addr src, + struct in_addr dst, size_t l4len, uint8_t proto); void tap_udp4_send(const struct ctx *c, struct in_addr src, in_port_t sport, struct in_addr dst, in_port_t dport, const void *in, size_t dlen); @@ -49,6 +52,9 @@ void tap_icmp4_send(const struct ctx *c, struct in_addr src, struct in_addr dst, const void *in, size_t l4len); const struct in6_addr *tap_ip6_daddr(const struct ctx *c, const struct in6_addr *src); +void *tap_push_ip6h(struct ipv6hdr *ip6h, + const struct in6_addr *src, const struct in6_addr *dst, + size_t l4len, uint8_t proto, uint32_t flow); void tap_udp6_send(const struct ctx *c, const struct in6_addr *src, in_port_t sport, const struct in6_addr *dst, in_port_t 
dport, diff --git a/tcp.c b/tcp.c index 7459803..fb04e2e 100644 --- a/tcp.c +++ b/tcp.c @@ -1866,6 +1866,75 @@ static void tcp_conn_from_sock_finish(const struct ctx *c, tcp_data_from_sock(c, conn); } +/** + * tcp_rst_no_conn() - Send RST in response to a packet with no connection + * @c: Execution context + * @af: Address family, AF_INET or AF_INET6 + * @saddr: Source address of the packet we're responding to + * @daddr: Destination address of the packet we're responding to + * @flow_lbl: IPv6 flow label (ignored for IPv4) + * @th: TCP header of the packet we're responding to + * @l4len: Packet length, including TCP header + */ +static void tcp_rst_no_conn(const struct ctx *c, int af, + const void *saddr, const void *daddr, + uint32_t flow_lbl, + const struct tcphdr *th, size_t l4len) +{ + struct iov_tail payload = IOV_TAIL(NULL, 0, 0); + struct tcphdr *rsth; + char buf[USHRT_MAX]; + uint32_t psum = 0; + size_t rst_l2len; + + /* Don't respond to RSTs without a connection */ + if (th->rst) + return; + + if (af == AF_INET) { + struct iphdr *ip4h = tap_push_l2h(c, buf, ETH_P_IP); + const struct in_addr *rst_src = daddr; + const struct in_addr *rst_dst = saddr; + + rsth = tap_push_ip4h(ip4h, *rst_src, *rst_dst, + sizeof(*rsth), IPPROTO_TCP); + psum = proto_ipv4_header_psum(sizeof(*rsth), IPPROTO_TCP, + *rst_src, *rst_dst); + + } else { + struct ipv6hdr *ip6h = tap_push_l2h(c, buf, ETH_P_IPV6); + const struct in6_addr *rst_src = daddr; + const struct in6_addr *rst_dst = saddr; + + rsth = tap_push_ip6h(ip6h, rst_src, rst_dst, + sizeof(*rsth), IPPROTO_TCP, flow_lbl); + psum = proto_ipv6_header_psum(sizeof(*rsth), IPPROTO_TCP, + rst_src, rst_dst); + } + + memset(rsth, 0, sizeof(*rsth)); + + rsth->source = th->dest; + rsth->dest = th->source; + rsth->rst = 1; + rsth->doff = sizeof(*rsth) / 4UL; + + /* Sequence matching logic from RFC 9293 section 3.10.7.1 */ + if (th->ack) { + rsth->seq = th->ack_seq; + } else { + size_t dlen = l4len - th->doff * 4UL; + uint32_t ack = 
ntohl(th->seq) + dlen; + + rsth->ack_seq = htonl(ack); + rsth->ack = 1; + } + + tcp_update_csum(psum, rsth, &payload); + rst_l2len = ((char *)rsth - buf) + sizeof(*rsth); + tap_send_single(c, buf, rst_l2len); +} + /** * tcp_tap_handler() - Handle packets from tap and state transitions * @c: Execution context @@ -1873,6 +1942,7 @@ static void tcp_conn_from_sock_finish(const struct ctx *c, * @af: Address family, AF_INET or AF_INET6 * @saddr: Source address * @daddr: Destination address + * @flow_lbl: IPv6 flow label (ignored for IPv4) * @p: Pool of TCP packets, with TCP headers * @idx: Index of first packet in pool to process * @now: Current timestamp @@ -1880,7 +1950,7 @@ static void tcp_conn_from_sock_finish(const struct ctx *c, * Return: count of consumed packets */ int tcp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af, - const void *saddr, const void *daddr, + const void *saddr, const void *daddr, uint32_t flow_lbl, const struct pool *p, int idx, const struct timespec *now) { struct tcp_tap_conn *conn; @@ -1913,6 +1983,8 @@ int tcp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af, if (opts && th->syn && !th->ack) tcp_conn_from_tap(c, af, saddr, daddr, th, opts, optlen, now); + else + tcp_rst_no_conn(c, af, saddr, daddr, flow_lbl, th, len); return 1; } diff --git a/tcp.h b/tcp.h index cf30744..9142eca 100644 --- a/tcp.h +++ b/tcp.h @@ -16,7 +16,7 @@ void tcp_listen_handler(const struct ctx *c, union epoll_ref ref, void tcp_sock_handler(const struct ctx *c, union epoll_ref ref, uint32_t events); int tcp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af, - const void *saddr, const void *daddr, + const void *saddr, const void *daddr, uint32_t flow_lbl, const struct pool *p, int idx, const struct timespec *now); int tcp_sock_init(const struct ctx *c, const union inany_addr *addr, const char *ifname, in_port_t port); From 1924e25f0723c0a86c1e33812f8e1d8aa045a146 Mon Sep 17 00:00:00 2001 From: David Gibson 
<david@gibson.dropbear.id.au> Date: Wed, 5 Mar 2025 17:20:03 +1100 Subject: [PATCH 267/382] conf: Be more precise about minimum MTUs Currently we reject the -m option if given a value less than ETH_MIN_MTU (68). That define is derived from the kernel, but its name is misleading: it doesn't really have anything to do with Ethernet per se, but is rather the minimum payload any L2 link must be able to handle in order to carry IPv4. For IPv6, it's not sufficient: that requires an MTU of at least 1280. Newer kernels have better named constants IPV4_MIN_MTU and IPV6_MIN_MTU. Copy and use those constants instead, along with some more specific error messages. Signed-off-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- conf.c | 18 +++++++++++++++--- ip.h | 7 +++++++ util.h | 6 ------ 3 files changed, 22 insertions(+), 9 deletions(-) diff --git a/conf.c b/conf.c index c5ee07b..065e720 100644 --- a/conf.c +++ b/conf.c @@ -1663,9 +1663,9 @@ void conf(struct ctx *c, int argc, char **argv) if (errno || *e) die("Invalid MTU: %s", optarg); - if (mtu && (mtu < ETH_MIN_MTU || mtu > ETH_MAX_MTU)) { - die("MTU %lu out of range (%u..%u)", mtu, - ETH_MIN_MTU, ETH_MAX_MTU); + if (mtu > ETH_MAX_MTU) { + die("MTU %lu too large (max %u)", + mtu, ETH_MAX_MTU); } c->mtu = mtu; @@ -1842,9 +1842,21 @@ void conf(struct ctx *c, int argc, char **argv) c->ifi4 = conf_ip4(ifi4, &c->ip4); if (!v4_only) c->ifi6 = conf_ip6(ifi6, &c->ip6); + + if (c->ifi4 && c->mtu < IPV4_MIN_MTU) { + warn("MTU %"PRIu16" is too small for IPv4 (minimum %u)", + c->mtu, IPV4_MIN_MTU); + } + if (c->ifi6 && c->mtu < IPV6_MIN_MTU) { + warn("MTU %"PRIu16" is too small for IPv6 (minimum %u)", + c->mtu, IPV6_MIN_MTU); + } + if ((*c->ip4.ifname_out && !c->ifi4) || (*c->ip6.ifname_out && !c->ifi6)) die("External interface not usable"); + + if (!c->ifi4 && !c->ifi6) { info("No external interface as template, switch to local mode"); diff --git a/ip.h b/ip.h index c82431e..471c57e 
100644 --- a/ip.h +++ b/ip.h @@ -129,4 +129,11 @@ static const struct in6_addr in6addr_ll_all_nodes = { /* IPv4 Limited Broadcast (RFC 919, Section 7), 255.255.255.255 */ static const struct in_addr in4addr_broadcast = { 0xffffffff }; +#ifndef IPV4_MIN_MTU +#define IPV4_MIN_MTU 68 +#endif +#ifndef IPV6_MIN_MTU +#define IPV6_MIN_MTU 1280 +#endif + #endif /* IP_H */ diff --git a/util.h b/util.h index 50e96d3..0f70f4d 100644 --- a/util.h +++ b/util.h @@ -34,15 +34,9 @@ #ifndef ETH_MAX_MTU #define ETH_MAX_MTU USHRT_MAX #endif -#ifndef ETH_MIN_MTU -#define ETH_MIN_MTU 68 -#endif #ifndef IP_MAX_MTU #define IP_MAX_MTU USHRT_MAX #endif -#ifndef IPV6_MIN_MTU -#define IPV6_MIN_MTU 1280 -#endif #ifndef MIN #define MIN(x, y) (((x) < (y)) ? (x) : (y)) From 82a839be988ecfdb013b5823afc93211200a9f55 Mon Sep 17 00:00:00 2001 From: Jon Maloy <jmaloy@redhat.com> Date: Thu, 6 Mar 2025 13:00:03 -0500 Subject: [PATCH 268/382] tap: break out building of udp header from tap_udp4_send function We will need to build the UDP header at other locations than in function tap_udp4_send(), so we break that part out to a separate function. 
Reviewed-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Jon Maloy <jmaloy@redhat.com> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- tap.c | 34 +++++++++++++++++++++++++++------- tap.h | 5 +++++ 2 files changed, 32 insertions(+), 7 deletions(-) diff --git a/tap.c b/tap.c index 86d051e..6f7063e 100644 --- a/tap.c +++ b/tap.c @@ -163,7 +163,7 @@ void *tap_push_ip4h(struct iphdr *ip4h, struct in_addr src, } /** - * tap_udp4_send() - Send UDP over IPv4 packet + * tap_push_uh4() - Build UDPv4 header with checksum * @c: Execution context * @src: IPv4 source address * @sport: UDP source port @@ -171,16 +171,14 @@ void *tap_push_ip4h(struct iphdr *ip4h, struct in_addr src, * @dport: UDP destination port * @in: UDP payload contents (not including UDP header) * @dlen: UDP payload length (not including UDP header) + * + * Return: pointer at which to write the packet's payload */ -void tap_udp4_send(const struct ctx *c, struct in_addr src, in_port_t sport, +void *tap_push_uh4(struct udphdr *uh, struct in_addr src, in_port_t sport, struct in_addr dst, in_port_t dport, const void *in, size_t dlen) { size_t l4len = dlen + sizeof(struct udphdr); - char buf[USHRT_MAX]; - struct iphdr *ip4h = tap_push_l2h(c, buf, ETH_P_IP); - struct udphdr *uh = tap_push_ip4h(ip4h, src, dst, l4len, IPPROTO_UDP); - char *data = (char *)(uh + 1); const struct iovec iov = { .iov_base = (void *)in, .iov_len = dlen @@ -191,8 +189,30 @@ void tap_udp4_send(const struct ctx *c, struct in_addr src, in_port_t sport, uh->dest = htons(dport); uh->len = htons(l4len); csum_udp4(uh, src, dst, &payload); - memcpy(data, in, dlen); + return (char *)uh + sizeof(*uh); +} +/** + * tap_udp4_send() - Send UDP over IPv4 packet + * @c: Execution context + * @src: IPv4 source address + * @sport: UDP source port + * @dst: IPv4 destination address + * @dport: UDP destination port + * @in: UDP payload contents (not including UDP header) + * @dlen: UDP payload length (not including UDP header) + */ +void 
tap_udp4_send(const struct ctx *c, struct in_addr src, in_port_t sport, + struct in_addr dst, in_port_t dport, + const void *in, size_t dlen) +{ + size_t l4len = dlen + sizeof(struct udphdr); + char buf[USHRT_MAX]; + struct iphdr *ip4h = tap_push_l2h(c, buf, ETH_P_IP); + struct udphdr *uh = tap_push_ip4h(ip4h, src, dst, l4len, IPPROTO_UDP); + char *data = tap_push_uh4(uh, src, sport, dst, dport, in, dlen); + + memcpy(data, in, dlen); tap_send_single(c, buf, dlen + (data - buf)); } diff --git a/tap.h b/tap.h index 390ac12..a2cf9bc 100644 --- a/tap.h +++ b/tap.h @@ -6,6 +6,8 @@ #ifndef TAP_H #define TAP_H +struct udphdr; + /** * struct tap_hdr - tap backend specific headers * @vnet_len: Frame length (for qemu socket transport) @@ -45,6 +47,9 @@ static inline void tap_hdr_update(struct tap_hdr *thdr, size_t l2len) void *tap_push_l2h(const struct ctx *c, void *buf, uint16_t proto); void *tap_push_ip4h(struct iphdr *ip4h, struct in_addr src, struct in_addr dst, size_t l4len, uint8_t proto); +void *tap_push_uh4(struct udphdr *uh, struct in_addr src, in_port_t sport, + struct in_addr dst, in_port_t dport, + const void *in, size_t dlen); void tap_udp4_send(const struct ctx *c, struct in_addr src, in_port_t sport, struct in_addr dst, in_port_t dport, const void *in, size_t dlen); From 55431f0077b6a25c264bd2492680d7f99815cc5f Mon Sep 17 00:00:00 2001 From: Jon Maloy <jmaloy@redhat.com> Date: Thu, 6 Mar 2025 13:00:04 -0500 Subject: [PATCH 269/382] udp: create and send ICMPv4 to local peer when applicable When a local peer sends a UDP message to a non-existing port on an existing remote host, that host will return an ICMP message containing the error code ICMP_PORT_UNREACH, plus the header and the first eight bytes of the original message. If the sender socket has been connected, it uses this message to issue a "Connection Refused" event to the user. 
Until now, we have only read such events from the externally facing socket, but we don't forward them back to the local sender because we cannot read the ICMP message directly to user space. Because of this, the local peer will hang and wait for a response that never arrives. We now fix this for IPv4 by recreating and forwarding a correct ICMP message back to the internal sender. We synthesize the message based on the information in the extended error structure, plus the returned part of the original message body. Note that for the sake of completeness, we even produce ICMP messages for other error codes. We have noticed that at least ICMP_PROT_UNREACH is propagated as an error event back to the user. Reviewed-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Jon Maloy <jmaloy@redhat.com> [sbrivio: fix cppcheck warning: udp_send_conn_fail_icmp4() doesn't modify 'in', it can be declared as const] Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- tap.c | 2 +- tap.h | 2 ++ udp.c | 87 +++++++++++++++++++++++++++++++++++++++++++------- udp_internal.h | 2 +- udp_vu.c | 4 +-- 5 files changed, 81 insertions(+), 16 deletions(-) diff --git a/tap.c b/tap.c index 6f7063e..57d0795 100644 --- a/tap.c +++ b/tap.c @@ -159,7 +159,7 @@ void *tap_push_ip4h(struct iphdr *ip4h, struct in_addr src, ip4h->saddr = src.s_addr; ip4h->daddr = dst.s_addr; ip4h->check = csum_ip4_header(l3len, proto, src, dst); - return ip4h + 1; + return (char *)ip4h + sizeof(*ip4h); } /** diff --git a/tap.h b/tap.h index a2cf9bc..9ac17ce 100644 --- a/tap.h +++ b/tap.h @@ -50,6 +50,8 @@ void *tap_push_ip4h(struct iphdr *ip4h, struct in_addr src, void *tap_push_uh4(struct udphdr *uh, struct in_addr src, in_port_t sport, struct in_addr dst, in_port_t dport, const void *in, size_t dlen); +void *tap_push_ip4h(struct iphdr *ip4h, struct in_addr src, + struct in_addr dst, size_t l4len, uint8_t proto); void tap_udp4_send(const struct ctx *c, struct in_addr src, in_port_t sport, struct in_addr dst, 
in_port_t dport, const void *in, size_t dlen); diff --git a/udp.c b/udp.c index 923cc38..b72c3ce 100644 --- a/udp.c +++ b/udp.c @@ -87,6 +87,7 @@ #include <netinet/in.h> #include <netinet/ip.h> #include <netinet/udp.h> +#include <netinet/ip_icmp.h> #include <stdint.h> #include <stddef.h> #include <string.h> @@ -112,6 +113,9 @@ #include "udp_internal.h" #include "udp_vu.h" +/* Maximum UDP data to be returned in ICMP messages */ +#define ICMP4_MAX_DLEN 8 + /* "Spliced" sockets indexed by bound port (host order) */ static int udp_splice_ns [IP_VERSIONS][NUM_PORTS]; static int udp_splice_init[IP_VERSIONS][NUM_PORTS]; @@ -402,25 +406,76 @@ static void udp_tap_prepare(const struct mmsghdr *mmh, (*tap_iov)[UDP_IOV_PAYLOAD].iov_len = l4len; } +/** + * udp_send_conn_fail_icmp4() - Construct and send ICMPv4 to local peer + * @c: Execution context + * @ee: Extended error descriptor + * @toside: Destination side of flow + * @saddr: Address of ICMP generating node + * @in: First bytes (max 8) of original UDP message body + * @dlen: Length of the read part of original UDP message body + */ +static void udp_send_conn_fail_icmp4(const struct ctx *c, + const struct sock_extended_err *ee, + const struct flowside *toside, + struct in_addr saddr, + const void *in, size_t dlen) +{ + struct in_addr oaddr = toside->oaddr.v4mapped.a4; + struct in_addr eaddr = toside->eaddr.v4mapped.a4; + in_port_t eport = toside->eport; + in_port_t oport = toside->oport; + struct { + struct icmphdr icmp4h; + struct iphdr ip4h; + struct udphdr uh; + char data[ICMP4_MAX_DLEN]; + } __attribute__((packed, aligned(__alignof__(max_align_t)))) msg; + size_t msglen = sizeof(msg) - sizeof(msg.data) + dlen; + size_t l4len = dlen + sizeof(struct udphdr); + + ASSERT(dlen <= ICMP4_MAX_DLEN); + memset(&msg, 0, sizeof(msg)); + msg.icmp4h.type = ee->ee_type; + msg.icmp4h.code = ee->ee_code; + if (ee->ee_type == ICMP_DEST_UNREACH && ee->ee_code == ICMP_FRAG_NEEDED) + msg.icmp4h.un.frag.mtu = htons((uint16_t) ee->ee_info); 
+ + /* Reconstruct the original headers as returned in the ICMP message */ + tap_push_ip4h(&msg.ip4h, eaddr, oaddr, l4len, IPPROTO_UDP); + tap_push_uh4(&msg.uh, eaddr, eport, oaddr, oport, in, dlen); + memcpy(&msg.data, in, dlen); + + tap_icmp4_send(c, saddr, eaddr, &msg, msglen); +} + /** * udp_sock_recverr() - Receive and clear an error from a socket - * @s: Socket to receive from + * @c: Execution context + * @ref: epoll reference * * Return: 1 if error received and processed, 0 if no more errors in queue, < 0 * if there was an error reading the queue * * #syscalls recvmsg */ -static int udp_sock_recverr(int s) +static int udp_sock_recverr(const struct ctx *c, union epoll_ref ref) { const struct sock_extended_err *ee; const struct cmsghdr *hdr; + union sockaddr_inany saddr; char buf[CMSG_SPACE(sizeof(*ee))]; + char data[ICMP4_MAX_DLEN]; + int s = ref.fd; + struct iovec iov = { + .iov_base = data, + .iov_len = sizeof(data) + }; struct msghdr mh = { - .msg_name = NULL, - .msg_namelen = 0, - .msg_iov = NULL, - .msg_iovlen = 0, + .msg_name = &saddr, + .msg_namelen = sizeof(saddr), + .msg_iov = &iov, + .msg_iovlen = 1, .msg_control = buf, .msg_controllen = sizeof(buf), }; @@ -450,8 +505,15 @@ static int udp_sock_recverr(int s) } ee = (const struct sock_extended_err *)CMSG_DATA(hdr); + if (ref.type == EPOLL_TYPE_UDP_REPLY) { + flow_sidx_t sidx = flow_sidx_opposite(ref.flowside); + const struct flowside *toside = flowside_at_sidx(sidx); - /* TODO: When possible propagate and otherwise handle errors */ + udp_send_conn_fail_icmp4(c, ee, toside, saddr.sa4.sin_addr, + data, rc); + } else { + trace("Ignoring received IP_RECVERR cmsg on listener socket"); + } debug("%s error on UDP socket %i: %s", str_ee_origin(ee), s, strerror_(ee->ee_errno)); @@ -461,15 +523,16 @@ static int udp_sock_recverr(int s) /** * udp_sock_errs() - Process errors on a socket * @c: Execution context - * @s: Socket to receive from + * @ref: epoll reference * @events: epoll events bitmap * * Return: 
Number of errors handled, or < 0 if we have an unrecoverable error */ -int udp_sock_errs(const struct ctx *c, int s, uint32_t events) +int udp_sock_errs(const struct ctx *c, union epoll_ref ref, uint32_t events) { unsigned n_err = 0; socklen_t errlen; + int s = ref.fd; int rc, err; ASSERT(!c->no_udp); @@ -478,7 +541,7 @@ int udp_sock_errs(const struct ctx *c, int s, uint32_t events) return 0; /* Nothing to do */ /* Empty the error queue */ - while ((rc = udp_sock_recverr(s)) > 0) + while ((rc = udp_sock_recverr(c, ref)) > 0) n_err += rc; if (rc < 0) @@ -558,7 +621,7 @@ static void udp_buf_listen_sock_handler(const struct ctx *c, const socklen_t sasize = sizeof(udp_meta[0].s_in); int n, i; - if (udp_sock_errs(c, ref.fd, events) < 0) { + if (udp_sock_errs(c, ref, events) < 0) { err("UDP: Unrecoverable error on listening socket:" " (%s port %hu)", pif_name(ref.udp.pif), ref.udp.port); /* FIXME: what now? close/re-open socket? */ @@ -661,7 +724,7 @@ static void udp_buf_reply_sock_handler(const struct ctx *c, union epoll_ref ref, from_s = uflow->s[ref.flowside.sidei]; - if (udp_sock_errs(c, from_s, events) < 0) { + if (udp_sock_errs(c, ref, events) < 0) { flow_err(uflow, "Unrecoverable error on reply socket"); flow_err_details(uflow); udp_flow_close(c, uflow); diff --git a/udp_internal.h b/udp_internal.h index cc80e30..3b081f5 100644 --- a/udp_internal.h +++ b/udp_internal.h @@ -30,5 +30,5 @@ size_t udp_update_hdr4(struct iphdr *ip4h, struct udp_payload_t *bp, size_t udp_update_hdr6(struct ipv6hdr *ip6h, struct udp_payload_t *bp, const struct flowside *toside, size_t dlen, bool no_udp_csum); -int udp_sock_errs(const struct ctx *c, int s, uint32_t events); +int udp_sock_errs(const struct ctx *c, union epoll_ref ref, uint32_t events); #endif /* UDP_INTERNAL_H */ diff --git a/udp_vu.c b/udp_vu.c index 4123510..c26a223 100644 --- a/udp_vu.c +++ b/udp_vu.c @@ -227,7 +227,7 @@ void udp_vu_listen_sock_handler(const struct ctx *c, union epoll_ref ref, struct vu_virtq *vq = 
&vdev->vq[VHOST_USER_RX_QUEUE]; int i; - if (udp_sock_errs(c, ref.fd, events) < 0) { + if (udp_sock_errs(c, ref, events) < 0) { err("UDP: Unrecoverable error on listening socket:" " (%s port %hu)", pif_name(ref.udp.pif), ref.udp.port); return; @@ -302,7 +302,7 @@ void udp_vu_reply_sock_handler(const struct ctx *c, union epoll_ref ref, ASSERT(!c->no_udp); - if (udp_sock_errs(c, from_s, events) < 0) { + if (udp_sock_errs(c, ref, events) < 0) { flow_err(uflow, "Unrecoverable error on reply socket"); flow_err_details(uflow); udp_flow_close(c, uflow); From 87e6a464429372dfaa7212b61e5062dad87179dc Mon Sep 17 00:00:00 2001 From: Jon Maloy <jmaloy@redhat.com> Date: Thu, 6 Mar 2025 13:00:05 -0500 Subject: [PATCH 270/382] tap: break out building of udp header from tap_udp6_send function We will need to build the UDP header at other locations than in function tap_udp6_send(), so we break that part out to a separate function. Reviewed-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Jon Maloy <jmaloy@redhat.com> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- tap.c | 46 ++++++++++++++++++++++++++++++++++------------ tap.h | 4 ++++ 2 files changed, 38 insertions(+), 12 deletions(-) diff --git a/tap.c b/tap.c index 57d0795..7082620 100644 --- a/tap.c +++ b/tap.c @@ -265,7 +265,7 @@ void *tap_push_ip6h(struct ipv6hdr *ip6h, } /** - * tap_udp6_send() - Send UDP over IPv6 packet + * tap_push_uh6() - Build UDPv6 header with checksum * @c: Execution context * @src: IPv6 source address * @sport: UDP source port @@ -274,6 +274,38 @@ void *tap_push_ip6h(struct ipv6hdr *ip6h, * @flow: Flow label * @in: UDP payload contents (not including UDP header) * @dlen: UDP payload length (not including UDP header) + * + * Return: pointer at which to write the packet's payload + */ +void *tap_push_uh6(struct udphdr *uh, + const struct in6_addr *src, in_port_t sport, + const struct in6_addr *dst, in_port_t dport, + void *in, size_t dlen) +{ + size_t l4len = dlen + sizeof(struct 
udphdr); + const struct iovec iov = { + .iov_base = in, + .iov_len = dlen + }; + struct iov_tail payload = IOV_TAIL(&iov, 1, 0); + + uh->source = htons(sport); + uh->dest = htons(dport); + uh->len = htons(l4len); + csum_udp6(uh, src, dst, &payload); + return (char *)uh + sizeof(*uh); +} + +/** + * tap_udp6_send() - Send UDP over IPv6 packet + * @c: Execution context + * @src: IPv6 source address + * @sport: UDP source port + * @dst: IPv6 destination address + * @dport: UDP destination port + * @flow: Flow label + * @in: UDP payload contents (not including UDP header) + * @dlen: UDP payload length (not including UDP header) */ void tap_udp6_send(const struct ctx *c, const struct in6_addr *src, in_port_t sport, @@ -285,19 +317,9 @@ void tap_udp6_send(const struct ctx *c, struct ipv6hdr *ip6h = tap_push_l2h(c, buf, ETH_P_IPV6); struct udphdr *uh = tap_push_ip6h(ip6h, src, dst, l4len, IPPROTO_UDP, flow); - char *data = (char *)(uh + 1); - const struct iovec iov = { - .iov_base = in, - .iov_len = dlen - }; - struct iov_tail payload = IOV_TAIL(&iov, 1, 0); + char *data = tap_push_uh6(uh, src, sport, dst, dport, in, dlen); - uh->source = htons(sport); - uh->dest = htons(dport); - uh->len = htons(l4len); - csum_udp6(uh, src, dst, &payload); memcpy(data, in, dlen); - tap_send_single(c, buf, dlen + (data - buf)); } diff --git a/tap.h b/tap.h index 9ac17ce..b53a5b8 100644 --- a/tap.h +++ b/tap.h @@ -50,6 +50,10 @@ void *tap_push_ip4h(struct iphdr *ip4h, struct in_addr src, void *tap_push_uh4(struct udphdr *uh, struct in_addr src, in_port_t sport, struct in_addr dst, in_port_t dport, const void *in, size_t dlen); +void *tap_push_uh6(struct udphdr *uh, + const struct in6_addr *src, in_port_t sport, + const struct in6_addr *dst, in_port_t dport, + void *in, size_t dlen); void *tap_push_ip4h(struct iphdr *ip4h, struct in_addr src, struct in_addr dst, size_t l4len, uint8_t proto); void tap_udp4_send(const struct ctx *c, struct in_addr src, in_port_t sport, From 
68b04182e07da6a437479cb191e5468db382bc56 Mon Sep 17 00:00:00 2001 From: Jon Maloy <jmaloy@redhat.com> Date: Thu, 6 Mar 2025 13:00:06 -0500 Subject: [PATCH 271/382] udp: create and send ICMPv6 to local peer when applicable When a local peer sends a UDP message to a non-existing port on an existing remote host, that host will return an ICMPv6 message containing the error code ICMP6_DST_UNREACH_NOPORT, plus the IPv6 header, UDP header and the first 1232 bytes of the original message, if any. If the sender socket has been connected, it uses this message to issue a "Connection Refused" event to the user. Until now, we have only read such events from the externally facing socket, but we don't forward them back to the local sender because we cannot read the ICMP message directly to user space. Because of this, the local peer will hang and wait for a response that never arrives. We now fix this for IPv6 by recreating and forwarding a correct ICMP message back to the internal sender. We synthesize the message based on the information in the extended error structure, plus the returned part of the original message body. Note that for the sake of completeness, we even produce ICMP messages for other error types and codes. We have noticed that at least ICMP_PROT_UNREACH is propagated as an error event back to the user. 
Reviewed-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Jon Maloy <jmaloy@redhat.com> [sbrivio: fix cppcheck warning, udp_send_conn_fail_icmp6() doesn't modify saddr which can be declared as const] Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- tap.c | 2 +- tap.h | 4 ++++ udp.c | 63 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++--- 3 files changed, 65 insertions(+), 4 deletions(-) diff --git a/tap.c b/tap.c index 7082620..4541f51 100644 --- a/tap.c +++ b/tap.c @@ -261,7 +261,7 @@ void *tap_push_ip6h(struct ipv6hdr *ip6h, ip6h->saddr = *src; ip6h->daddr = *dst; ip6_set_flow_lbl(ip6h, flow); - return ip6h + 1; + return (char *)ip6h + sizeof(*ip6h); } /** diff --git a/tap.h b/tap.h index b53a5b8..a2c3b87 100644 --- a/tap.h +++ b/tap.h @@ -56,6 +56,10 @@ void *tap_push_uh6(struct udphdr *uh, void *in, size_t dlen); void *tap_push_ip4h(struct iphdr *ip4h, struct in_addr src, struct in_addr dst, size_t l4len, uint8_t proto); +void *tap_push_ip6h(struct ipv6hdr *ip6h, + const struct in6_addr *src, + const struct in6_addr *dst, + size_t l4len, uint8_t proto, uint32_t flow); void tap_udp4_send(const struct ctx *c, struct in_addr src, in_port_t sport, struct in_addr dst, in_port_t dport, const void *in, size_t dlen); diff --git a/udp.c b/udp.c index b72c3ce..80520cb 100644 --- a/udp.c +++ b/udp.c @@ -88,6 +88,7 @@ #include <netinet/ip.h> #include <netinet/udp.h> #include <netinet/ip_icmp.h> +#include <netinet/icmp6.h> #include <stdint.h> #include <stddef.h> #include <string.h> @@ -115,6 +116,9 @@ /* Maximum UDP data to be returned in ICMP messages */ #define ICMP4_MAX_DLEN 8 +#define ICMP6_MAX_DLEN (IPV6_MIN_MTU \ + - sizeof(struct udphdr) \ + - sizeof(struct ipv6hdr)) /* "Spliced" sockets indexed by bound port (host order) */ static int udp_splice_ns [IP_VERSIONS][NUM_PORTS]; @@ -449,6 +453,51 @@ static void udp_send_conn_fail_icmp4(const struct ctx *c, tap_icmp4_send(c, saddr, eaddr, &msg, msglen); } + +/** + * udp_send_conn_fail_icmp6() - 
Construct and send ICMPv6 to local peer + * @c: Execution context + * @ee: Extended error descriptor + * @toside: Destination side of flow + * @saddr: Address of ICMP generating node + * @in: First bytes (max 1232) of original UDP message body + * @dlen: Length of the read part of original UDP message body + * @flow: IPv6 flow identifier + */ +static void udp_send_conn_fail_icmp6(const struct ctx *c, + const struct sock_extended_err *ee, + const struct flowside *toside, + const struct in6_addr *saddr, + void *in, size_t dlen, uint32_t flow) +{ + const struct in6_addr *oaddr = &toside->oaddr.a6; + const struct in6_addr *eaddr = &toside->eaddr.a6; + in_port_t eport = toside->eport; + in_port_t oport = toside->oport; + struct { + struct icmp6_hdr icmp6h; + struct ipv6hdr ip6h; + struct udphdr uh; + char data[ICMP6_MAX_DLEN]; + } __attribute__((packed, aligned(__alignof__(max_align_t)))) msg; + size_t msglen = sizeof(msg) - sizeof(msg.data) + dlen; + size_t l4len = dlen + sizeof(struct udphdr); + + ASSERT(dlen <= ICMP6_MAX_DLEN); + memset(&msg, 0, sizeof(msg)); + msg.icmp6h.icmp6_type = ee->ee_type; + msg.icmp6h.icmp6_code = ee->ee_code; + if (ee->ee_type == ICMP6_PACKET_TOO_BIG) + msg.icmp6h.icmp6_dataun.icmp6_un_data32[0] = htonl(ee->ee_info); + + /* Reconstruct the original headers as returned in the ICMP message */ + tap_push_ip6h(&msg.ip6h, eaddr, oaddr, l4len, IPPROTO_UDP, flow); + tap_push_uh6(&msg.uh, eaddr, eport, oaddr, oport, in, dlen); + memcpy(&msg.data, in, dlen); + + tap_icmp6_send(c, saddr, eaddr, &msg, msglen); +} + /** * udp_sock_recverr() - Receive and clear an error from a socket * @c: Execution context @@ -465,7 +514,7 @@ static int udp_sock_recverr(const struct ctx *c, union epoll_ref ref) const struct cmsghdr *hdr; union sockaddr_inany saddr; char buf[CMSG_SPACE(sizeof(*ee))]; - char data[ICMP4_MAX_DLEN]; + char data[ICMP6_MAX_DLEN]; int s = ref.fd; struct iovec iov = { .iov_base = data, @@ -508,9 +557,17 @@ static int udp_sock_recverr(const 
struct ctx *c, union epoll_ref ref) if (ref.type == EPOLL_TYPE_UDP_REPLY) { flow_sidx_t sidx = flow_sidx_opposite(ref.flowside); const struct flowside *toside = flowside_at_sidx(sidx); + size_t dlen = rc; - udp_send_conn_fail_icmp4(c, ee, toside, saddr.sa4.sin_addr, - data, rc); + if (hdr->cmsg_level == IPPROTO_IP) { + dlen = MIN(dlen, ICMP4_MAX_DLEN); + udp_send_conn_fail_icmp4(c, ee, toside, saddr.sa4.sin_addr, + data, dlen); + } else if (hdr->cmsg_level == IPPROTO_IPV6) { + udp_send_conn_fail_icmp6(c, ee, toside, + &saddr.sa6.sin6_addr, + data, dlen, sidx.flowi); + } } else { trace("Ignoring received IP_RECVERR cmsg on listener socket"); } From 57d2db370b9c12aca84901d968c2c31db89ca462 Mon Sep 17 00:00:00 2001 From: David Gibson <dgibson@redhat.com> Date: Wed, 5 Mar 2025 17:15:03 +1100 Subject: [PATCH 272/382] treewide: Mark assorted functions static This marks static a number of functions which are only used in their .c file, have no prototypes in a .h and were never intended to be globally exposed. 
Signed-off-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- log.c | 2 +- netlink.c | 2 +- passt.c | 2 +- tcp.c | 6 +++--- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/log.c b/log.c index 95e4576..b6bce21 100644 --- a/log.c +++ b/log.c @@ -56,7 +56,7 @@ bool log_stderr = true; /* Not daemonised, no shell spawned */ * * Return: pointer to @now, or NULL if there was an error retrieving the time */ -const struct timespec *logtime(struct timespec *ts) +static const struct timespec *logtime(struct timespec *ts) { if (clock_gettime(CLOCK_MONOTONIC, ts)) return NULL; diff --git a/netlink.c b/netlink.c index 37d8b5b..a052504 100644 --- a/netlink.c +++ b/netlink.c @@ -355,7 +355,7 @@ unsigned int nl_get_ext_if(int s, sa_family_t af) * * Return: true if a gateway was found, false otherwise */ -bool nl_route_get_def_multipath(struct rtattr *rta, void *gw) +static bool nl_route_get_def_multipath(struct rtattr *rta, void *gw) { int nh_len = RTA_PAYLOAD(rta); struct rtnexthop *rtnh; diff --git a/passt.c b/passt.c index 68d1a28..868842b 100644 --- a/passt.c +++ b/passt.c @@ -166,7 +166,7 @@ void proto_update_l2_buf(const unsigned char *eth_d, const unsigned char *eth_s) * * #syscalls exit_group */ -void exit_handler(int signal) +static void exit_handler(int signal) { (void)signal; diff --git a/tcp.c b/tcp.c index fb04e2e..4c24367 100644 --- a/tcp.c +++ b/tcp.c @@ -2497,7 +2497,7 @@ static void tcp_ns_sock_init6(const struct ctx *c, in_port_t port) * @c: Execution context * @port: Port, host order */ -void tcp_ns_sock_init(const struct ctx *c, in_port_t port) +static void tcp_ns_sock_init(const struct ctx *c, in_port_t port) { ASSERT(!c->no_tcp); @@ -3141,7 +3141,7 @@ static int tcp_flow_dump_rcvqueue(int s, struct tcp_tap_transfer_ext *t) * * Return: 0 on success, negative error code on failure */ -int tcp_flow_repair_opt(int s, const struct tcp_tap_transfer_ext *t) +static int tcp_flow_repair_opt(int s, const 
struct tcp_tap_transfer_ext *t) { const struct tcp_repair_opt opts[] = { { TCPOPT_WINDOW, t->snd_ws + (t->rcv_ws << 16) }, @@ -3333,7 +3333,7 @@ fail: * * Return: 0 on success, negative error code on failure */ -int tcp_flow_repair_socket(struct ctx *c, struct tcp_tap_conn *conn) +static int tcp_flow_repair_socket(struct ctx *c, struct tcp_tap_conn *conn) { sa_family_t af = CONN_V4(conn) ? AF_INET : AF_INET6; const struct flowside *sockside = HOSTFLOW(conn); From e36c35c952ef0848383cba8ef71e13cf25dab2da Mon Sep 17 00:00:00 2001 From: David Gibson <david@gibson.dropbear.id.au> Date: Wed, 5 Mar 2025 17:15:04 +1100 Subject: [PATCH 273/382] log: Don't export passt_vsyslog() passt_vsyslog() is an exposed function in log.h. However it shouldn't be called from outside log.c: it writes specifically to the system log, and most code should call passt's logging helpers which might go to the syslog or to a log file. Make passt_vsyslog() local to log.c. This requires a code motion to avoid a forward declaration. 
Signed-off-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- log.c | 48 ++++++++++++++++++++++++------------------------ log.h | 1 - 2 files changed, 24 insertions(+), 25 deletions(-) diff --git a/log.c b/log.c index b6bce21..6eda4c4 100644 --- a/log.c +++ b/log.c @@ -249,6 +249,30 @@ static void logfile_write(bool newline, bool cont, int pri, log_written += n; } +/** + * passt_vsyslog() - vsyslog() implementation not using heap memory + * @newline: Append newline at the end of the message, if missing + * @pri: Facility and level map, same as priority for vsyslog() + * @format: Same as vsyslog() format + * @ap: Same as vsyslog() ap + */ +static void passt_vsyslog(bool newline, int pri, const char *format, va_list ap) +{ + char buf[BUFSIZ]; + int n; + + /* Send without timestamp, the system logger should add it */ + n = snprintf(buf, BUFSIZ, "<%i> %s: ", pri, log_ident); + + n += vsnprintf(buf + n, BUFSIZ - n, format, ap); + + if (newline && format[strlen(format)] != '\n') + n += snprintf(buf + n, BUFSIZ - n, "\n"); + + if (log_sock >= 0 && send(log_sock, buf, n, 0) != n && log_stderr) + FPRINTF(stderr, "Failed to send %i bytes to syslog\n", n); +} + /** * vlogmsg() - Print or send messages to log or output files as configured * @newline: Append newline at the end of the message, if missing @@ -373,30 +397,6 @@ void __setlogmask(int mask) setlogmask(mask); } -/** - * passt_vsyslog() - vsyslog() implementation not using heap memory - * @newline: Append newline at the end of the message, if missing - * @pri: Facility and level map, same as priority for vsyslog() - * @format: Same as vsyslog() format - * @ap: Same as vsyslog() ap - */ -void passt_vsyslog(bool newline, int pri, const char *format, va_list ap) -{ - char buf[BUFSIZ]; - int n; - - /* Send without timestamp, the system logger should add it */ - n = snprintf(buf, BUFSIZ, "<%i> %s: ", pri, log_ident); - - n += vsnprintf(buf + n, BUFSIZ - n, format, ap); - - 
if (newline && format[strlen(format)] != '\n') - n += snprintf(buf + n, BUFSIZ - n, "\n"); - - if (log_sock >= 0 && send(log_sock, buf, n, 0) != n && log_stderr) - FPRINTF(stderr, "Failed to send %i bytes to syslog\n", n); -} - /** * logfile_init() - Open log file and write header with PID, version, path * @name: Identifier for header: passt or pasta diff --git a/log.h b/log.h index 22c7b9a..08aa88c 100644 --- a/log.h +++ b/log.h @@ -55,7 +55,6 @@ void trace_init(int enable); void __openlog(const char *ident, int option, int facility); void logfile_init(const char *name, const char *path, size_t size); -void passt_vsyslog(bool newline, int pri, const char *format, va_list ap); void __setlogmask(int mask); #endif /* LOG_H */ From 12d5b36b2f17a1ddc9447b925dbec161b4da346a Mon Sep 17 00:00:00 2001 From: David Gibson <david@gibson.dropbear.id.au> Date: Wed, 5 Mar 2025 17:15:05 +1100 Subject: [PATCH 274/382] checksum: Don't export various functions Several of the exposed functions in checksum.h are no longer directly used. Remove them from the header, and make static. In particular sum_16b() should not be used outside: generally csum_unfolded() should be used which will automatically use either the AVX2 optimized version or sum_16b() as necessary. csum_fold() and csum() could have external uses, but they're not used right now. We can expose them again if we need to. 
Signed-off-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- checksum.c | 34 +++++++++++++++++----------------- checksum.h | 3 --- 2 files changed, 17 insertions(+), 20 deletions(-) diff --git a/checksum.c b/checksum.c index b01e0fe..0894eca 100644 --- a/checksum.c +++ b/checksum.c @@ -85,7 +85,7 @@ */ /* NOLINTNEXTLINE(clang-diagnostic-unknown-attributes) */ __attribute__((optimize("-fno-strict-aliasing"))) -uint32_t sum_16b(const void *buf, size_t len) +static uint32_t sum_16b(const void *buf, size_t len) { const uint16_t *p = buf; uint32_t sum = 0; @@ -107,7 +107,7 @@ uint32_t sum_16b(const void *buf, size_t len) * * Return: 16-bit folded sum */ -uint16_t csum_fold(uint32_t sum) +static uint16_t csum_fold(uint32_t sum) { while (sum >> 16) sum = (sum & 0xffff) + (sum >> 16); @@ -161,6 +161,21 @@ uint32_t proto_ipv4_header_psum(uint16_t l4len, uint8_t protocol, return psum; } +/** + * csum() - Compute TCP/IP-style checksum + * @buf: Input buffer + * @len: Input length + * @init: Initial 32-bit checksum, 0 for no pre-computed checksum + * + * Return: 16-bit folded, complemented checksum + */ +/* NOLINTNEXTLINE(clang-diagnostic-unknown-attributes) */ +__attribute__((optimize("-fno-strict-aliasing"))) /* See csum_16b() */ +static uint16_t csum(const void *buf, size_t len, uint32_t init) +{ + return (uint16_t)~csum_fold(csum_unfolded(buf, len, init)); +} + /** * csum_udp4() - Calculate and set checksum for a UDP over IPv4 packet * @udp4hr: UDP header, initialised apart from checksum @@ -482,21 +497,6 @@ uint32_t csum_unfolded(const void *buf, size_t len, uint32_t init) } #endif /* !__AVX2__ */ -/** - * csum() - Compute TCP/IP-style checksum - * @buf: Input buffer - * @len: Input length - * @init: Initial 32-bit checksum, 0 for no pre-computed checksum - * - * Return: 16-bit folded, complemented checksum - */ -/* NOLINTNEXTLINE(clang-diagnostic-unknown-attributes) */ -__attribute__((optimize("-fno-strict-aliasing"))) 
/* See csum_16b() */ -uint16_t csum(const void *buf, size_t len, uint32_t init) -{ - return (uint16_t)~csum_fold(csum_unfolded(buf, len, init)); -} - /** * csum_iov_tail() - Calculate unfolded checksum for the tail of an IO vector * @tail: IO vector tail to checksum diff --git a/checksum.h b/checksum.h index e243c97..683a09b 100644 --- a/checksum.h +++ b/checksum.h @@ -11,8 +11,6 @@ struct icmphdr; struct icmp6hdr; struct iov_tail; -uint32_t sum_16b(const void *buf, size_t len); -uint16_t csum_fold(uint32_t sum); uint16_t csum_unaligned(const void *buf, size_t len, uint32_t init); uint16_t csum_ip4_header(uint16_t l3len, uint8_t protocol, struct in_addr saddr, struct in_addr daddr); @@ -32,7 +30,6 @@ void csum_icmp6(struct icmp6hdr *icmp6hr, const struct in6_addr *saddr, const struct in6_addr *daddr, const void *payload, size_t dlen); uint32_t csum_unfolded(const void *buf, size_t len, uint32_t init); -uint16_t csum(const void *buf, size_t len, uint32_t init); uint16_t csum_iov_tail(struct iov_tail *tail, uint32_t init); #endif /* CHECKSUM_H */ From 27395e67c26a73e2e035360195b5928a07996dd5 Mon Sep 17 00:00:00 2001 From: David Gibson <david@gibson.dropbear.id.au> Date: Wed, 5 Mar 2025 17:15:06 +1100 Subject: [PATCH 275/382] tcp: Don't export tcp_update_csum() tcp_update_csum() is exposed in tcp_internal.h, but is only used in tcp.c. Remove the unneeded prototype and make it static. 
Signed-off-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- tcp.c | 3 ++- tcp_internal.h | 2 -- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/tcp.c b/tcp.c index 4c24367..32a08bd 100644 --- a/tcp.c +++ b/tcp.c @@ -787,7 +787,8 @@ static void tcp_sock_set_nodelay(int s) * @th: TCP header (updated) * @payload: TCP payload */ -void tcp_update_csum(uint32_t psum, struct tcphdr *th, struct iov_tail *payload) +static void tcp_update_csum(uint32_t psum, struct tcphdr *th, + struct iov_tail *payload) { th->check = 0; psum = csum_unfolded(th, sizeof(*th), psum); diff --git a/tcp_internal.h b/tcp_internal.h index 9cf31f5..6f5e054 100644 --- a/tcp_internal.h +++ b/tcp_internal.h @@ -166,8 +166,6 @@ void tcp_rst_do(const struct ctx *c, struct tcp_tap_conn *conn); struct tcp_info_linux; -void tcp_update_csum(uint32_t psum, struct tcphdr *th, - struct iov_tail *payload); void tcp_fill_headers(const struct tcp_tap_conn *conn, struct tap_hdr *taph, struct iphdr *ip4h, struct ipv6hdr *ip6h, From a83c806d1786fbe19bc6a3014f248e928e00651b Mon Sep 17 00:00:00 2001 From: David Gibson <david@gibson.dropbear.id.au> Date: Wed, 5 Mar 2025 17:15:07 +1100 Subject: [PATCH 276/382] vhost_user: Don't export several functions vhost-user added several functions which are exposed in headers, but not used outside the file where they're defined. I can't tell if these are really internal functions, or if they're logically supposed to be exported, but we don't happen to have anything using them yet. For the time being, just remove the exports. We can add them back if we need to. 
Signed-off-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- vhost_user.c | 2 +- vhost_user.h | 1 - virtio.c | 9 +++++---- virtio.h | 4 ---- 4 files changed, 6 insertions(+), 10 deletions(-) diff --git a/vhost_user.c b/vhost_user.c index be1aa94..105f77a 100644 --- a/vhost_user.c +++ b/vhost_user.c @@ -517,7 +517,7 @@ static void vu_close_log(struct vu_dev *vdev) * vu_log_kick() - Inform the front-end that the log has been modified * @vdev: vhost-user device */ -void vu_log_kick(const struct vu_dev *vdev) +static void vu_log_kick(const struct vu_dev *vdev) { if (vdev->log_call_fd != -1) { int rc; diff --git a/vhost_user.h b/vhost_user.h index e769cb1..1daacd1 100644 --- a/vhost_user.h +++ b/vhost_user.h @@ -241,7 +241,6 @@ static inline bool vu_queue_started(const struct vu_virtq *vq) void vu_print_capabilities(void); void vu_init(struct ctx *c); void vu_cleanup(struct vu_dev *vdev); -void vu_log_kick(const struct vu_dev *vdev); void vu_log_write(const struct vu_dev *vdev, uint64_t address, uint64_t length); void vu_control_handler(struct vu_dev *vdev, int fd, uint32_t events); diff --git a/virtio.c b/virtio.c index 2b58e4d..bc2b89a 100644 --- a/virtio.c +++ b/virtio.c @@ -286,7 +286,7 @@ static int virtqueue_read_next_desc(const struct vring_desc *desc, * * Return: true if the virtqueue is empty, false otherwise */ -bool vu_queue_empty(struct vu_virtq *vq) +static bool vu_queue_empty(struct vu_virtq *vq) { if (!vq->vring.avail) return true; @@ -671,9 +671,10 @@ static void vu_log_queue_fill(const struct vu_dev *vdev, struct vu_virtq *vq, * @len: Size of the element * @idx: Used ring entry index */ -void vu_queue_fill_by_index(const struct vu_dev *vdev, struct vu_virtq *vq, - unsigned int index, unsigned int len, - unsigned int idx) +static void vu_queue_fill_by_index(const struct vu_dev *vdev, + struct vu_virtq *vq, + unsigned int index, unsigned int len, + unsigned int idx) { struct vring_used_elem uelem; diff 
--git a/virtio.h b/virtio.h index 0a59441..7a370bd 100644 --- a/virtio.h +++ b/virtio.h @@ -174,16 +174,12 @@ static inline bool vu_has_protocol_feature(const struct vu_dev *vdev, return has_feature(vdev->protocol_features, fbit); } -bool vu_queue_empty(struct vu_virtq *vq); void vu_queue_notify(const struct vu_dev *dev, struct vu_virtq *vq); int vu_queue_pop(const struct vu_dev *dev, struct vu_virtq *vq, struct vu_virtq_element *elem); void vu_queue_detach_element(struct vu_virtq *vq); void vu_queue_unpop(struct vu_virtq *vq); bool vu_queue_rewind(struct vu_virtq *vq, unsigned int num); -void vu_queue_fill_by_index(const struct vu_dev *vdev, struct vu_virtq *vq, - unsigned int index, unsigned int len, - unsigned int idx); void vu_queue_fill(const struct vu_dev *vdev, struct vu_virtq *vq, const struct vu_virtq_element *elem, unsigned int len, unsigned int idx); From 2b58b22845a76baf24141155eb4d4a882f509e97 Mon Sep 17 00:00:00 2001 From: David Gibson <david@gibson.dropbear.id.au> Date: Wed, 5 Mar 2025 17:15:08 +1100 Subject: [PATCH 277/382] cppcheck: Add suppressions for "logically" exported functions We have some functions in our headers which are definitely there on purpose. However, they're not yet used outside the files in which they're defined. That causes sufficiently recent cppcheck versions (2.17) to complain they should be static. Suppress the errors for these "logically" exported functions. Signed-off-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- iov.c | 1 + log.c | 1 + 2 files changed, 2 insertions(+) diff --git a/iov.c b/iov.c index 3b12272..8c63b7e 100644 --- a/iov.c +++ b/iov.c @@ -203,6 +203,7 @@ size_t iov_tail_size(struct iov_tail *tail) * overruns the IO vector, is not contiguous or doesn't have the * requested alignment. 
*/ +/* cppcheck-suppress [staticFunction,unmatchedSuppression] */ void *iov_peek_header_(struct iov_tail *tail, size_t len, size_t align) { char *p; diff --git a/log.c b/log.c index 6eda4c4..d40d7ae 100644 --- a/log.c +++ b/log.c @@ -281,6 +281,7 @@ static void passt_vsyslog(bool newline, int pri, const char *format, va_list ap) * @format: Message * @ap: Variable argument list */ +/* cppcheck-suppress [staticFunction,unmatchedSuppression] */ void vlogmsg(bool newline, bool cont, int pri, const char *format, va_list ap) { bool debug_print = (log_mask & LOG_MASK(LOG_DEBUG)) && log_file == -1; From 04701702471ececee362669cc6b49ed9e20a1b6d Mon Sep 17 00:00:00 2001 From: Stefano Brivio <sbrivio@redhat.com> Date: Fri, 7 Mar 2025 23:27:03 +0100 Subject: [PATCH 278/382] passt-repair: Add directory watch It might not be feasible for users to start passt-repair after passt is started, on a migration target, but before the migration process starts. For instance, with libvirt, the guest domain (and, hence, passt) is started on the target as part of the migration process. At least for the moment being, there's no hook a libvirt user (including KubeVirt) can use to start passt-repair before the migration starts. Add a directory watch using inotify: if PATH is a directory, instead of connecting to it, we'll watch for a .repair socket file to appear in it, and then attempt to connect to that socket. 
Signed-off-by: Stefano Brivio <sbrivio@redhat.com> Reviewed-by: David Gibson <david@gibson.dropbear.id.au> --- contrib/selinux/passt-repair.te | 16 +++---- passt-repair.1 | 6 ++- passt-repair.c | 84 +++++++++++++++++++++++++++++---- 3 files changed, 89 insertions(+), 17 deletions(-) diff --git a/contrib/selinux/passt-repair.te b/contrib/selinux/passt-repair.te index f171be6..7157dfb 100644 --- a/contrib/selinux/passt-repair.te +++ b/contrib/selinux/passt-repair.te @@ -61,11 +61,11 @@ allow passt_repair_t unconfined_t:unix_stream_socket { connectto read write }; allow passt_repair_t passt_t:unix_stream_socket { connectto read write }; allow passt_repair_t user_tmp_t:unix_stream_socket { connectto read write }; -allow passt_repair_t user_tmp_t:dir search; +allow passt_repair_t user_tmp_t:dir { getattr read search watch }; -allow passt_repair_t unconfined_t:sock_file { read write }; -allow passt_repair_t passt_t:sock_file { read write }; -allow passt_repair_t user_tmp_t:sock_file { read write }; +allow passt_repair_t unconfined_t:sock_file { getattr read write }; +allow passt_repair_t passt_t:sock_file { getattr read write }; +allow passt_repair_t user_tmp_t:sock_file { getattr read write }; allow passt_repair_t unconfined_t:tcp_socket { read setopt write }; allow passt_repair_t passt_t:tcp_socket { read setopt write }; @@ -80,8 +80,8 @@ allow passt_repair_t passt_t:tcp_socket { read setopt write }; allow passt_repair_t qemu_var_run_t:unix_stream_socket { connectto read write }; allow passt_repair_t virt_var_run_t:unix_stream_socket { connectto read write }; -allow passt_repair_t qemu_var_run_t:dir search; -allow passt_repair_t virt_var_run_t:dir search; +allow passt_repair_t qemu_var_run_t:dir { getattr read search watch }; +allow passt_repair_t virt_var_run_t:dir { getattr read search watch }; -allow passt_repair_t qemu_var_run_t:sock_file { read write }; -allow passt_repair_t virt_var_run_t:sock_file { read write }; +allow passt_repair_t qemu_var_run_t:sock_file { 
getattr read write }; +allow passt_repair_t virt_var_run_t:sock_file { getattr read write }; diff --git a/passt-repair.1 b/passt-repair.1 index 7c1b140..e65aadd 100644 --- a/passt-repair.1 +++ b/passt-repair.1 @@ -16,13 +16,17 @@ .B passt-repair is a privileged helper setting and clearing repair mode on TCP sockets on behalf of \fBpasst\fR(1), as instructed via single-byte commands over a UNIX domain -socket, specified by \fIPATH\fR. +socket. It can be used to migrate TCP connections between guests without granting additional capabilities to \fBpasst\fR(1) itself: to migrate TCP connections, \fBpasst\fR(1) leverages repair mode, which needs the \fBCAP_NET_ADMIN\fR capability (see \fBcapabilities\fR(7)) to be set or cleared. +If \fIPATH\fR represents a UNIX domain socket, \fBpasst-repair\fR(1) attempts to +connect to it. If it is a directory, \fBpasst-repair\fR(1) waits until a file +ending with \fI.repair\fR appears in it, and then attempts to connect to it. + .SH PROTOCOL \fBpasst-repair\fR(1) connects to \fBpasst\fR(1) using the socket specified via diff --git a/passt-repair.c b/passt-repair.c index e0c366e..8bb3f00 100644 --- a/passt-repair.c +++ b/passt-repair.c @@ -16,11 +16,14 @@ * off. Reply by echoing the command. Exit on EOF. 
*/ +#include <sys/inotify.h> #include <sys/prctl.h> #include <sys/types.h> #include <sys/socket.h> +#include <sys/stat.h> #include <sys/un.h> #include <errno.h> +#include <stdbool.h> #include <stddef.h> #include <stdio.h> #include <stdlib.h> @@ -39,6 +42,8 @@ #include "seccomp_repair.h" #define SCM_MAX_FD 253 /* From Linux kernel (include/net/scm.h), not in UAPI */ +#define REPAIR_EXT ".repair" +#define REPAIR_EXT_LEN strlen(REPAIR_EXT) /** * main() - Entry point and whole program with loop @@ -51,6 +56,9 @@ * #syscalls:repair socket s390x:socketcall i686:socketcall * #syscalls:repair recvfrom recvmsg arm:recv ppc64le:recv * #syscalls:repair sendto sendmsg arm:send ppc64le:send + * #syscalls:repair stat|statx stat64|statx statx + * #syscalls:repair fstat|fstat64 newfstatat|fstatat64 + * #syscalls:repair inotify_init1 inotify_add_watch */ int main(int argc, char **argv) { @@ -58,12 +66,14 @@ int main(int argc, char **argv) __attribute__ ((aligned(__alignof__(struct cmsghdr)))); struct sockaddr_un a = { AF_UNIX, "" }; int fds[SCM_MAX_FD], s, ret, i, n = 0; + bool inotify_dir = false; struct sock_fprog prog; int8_t cmd = INT8_MAX; struct cmsghdr *cmsg; struct msghdr msg; struct iovec iov; size_t cmsg_len; + struct stat sb; int op; prctl(PR_SET_DUMPABLE, 0); @@ -90,19 +100,77 @@ int main(int argc, char **argv) _exit(2); } - ret = snprintf(a.sun_path, sizeof(a.sun_path), "%s", argv[1]); - if (ret <= 0 || ret >= (int)sizeof(a.sun_path)) { - fprintf(stderr, "Invalid socket path: %s\n", argv[1]); - _exit(2); - } - if ((s = socket(AF_UNIX, SOCK_STREAM, 0)) < 0) { fprintf(stderr, "Failed to create AF_UNIX socket: %i\n", errno); _exit(1); } - if (connect(s, (struct sockaddr *)&a, sizeof(a))) { - fprintf(stderr, "Failed to connect to %s: %s\n", argv[1], + if ((stat(argv[1], &sb))) { + fprintf(stderr, "Can't stat() %s: %i\n", argv[1], errno); + _exit(1); + } + + if ((sb.st_mode & S_IFMT) == S_IFDIR) { + char buf[sizeof(struct inotify_event) + NAME_MAX + 1]; + const struct 
inotify_event *ev; + char path[PATH_MAX + 1]; + ssize_t n; + int fd; + + ev = (struct inotify_event *)buf; + + if ((fd = inotify_init1(IN_CLOEXEC)) < 0) { + fprintf(stderr, "inotify_init1: %i\n", errno); + _exit(1); + } + + if (inotify_add_watch(fd, argv[1], IN_CREATE) < 0) { + fprintf(stderr, "inotify_add_watch: %i\n", errno); + _exit(1); + } + + do { + n = read(fd, buf, sizeof(buf)); + if (n < 0) { + fprintf(stderr, "inotify read: %i", errno); + _exit(1); + } + + if (n < (ssize_t)sizeof(*ev)) { + fprintf(stderr, "Short inotify read: %zi", n); + _exit(1); + } + } while (ev->len < REPAIR_EXT_LEN || + memcmp(ev->name + strlen(ev->name) - REPAIR_EXT_LEN, + REPAIR_EXT, REPAIR_EXT_LEN)); + + snprintf(path, sizeof(path), "%s/%s", argv[1], ev->name); + if ((stat(path, &sb))) { + fprintf(stderr, "Can't stat() %s: %i\n", path, errno); + _exit(1); + } + + ret = snprintf(a.sun_path, sizeof(a.sun_path), path); + inotify_dir = true; + } else { + ret = snprintf(a.sun_path, sizeof(a.sun_path), "%s", argv[1]); + } + + if (ret <= 0 || ret >= (int)sizeof(a.sun_path)) { + fprintf(stderr, "Invalid socket path"); + _exit(2); + } + + if ((sb.st_mode & S_IFMT) != S_IFSOCK) { + fprintf(stderr, "%s is not a socket\n", a.sun_path); + _exit(2); + } + + while (connect(s, (struct sockaddr *)&a, sizeof(a))) { + if (inotify_dir && errno == ECONNREFUSED) + continue; + + fprintf(stderr, "Failed to connect to %s: %s\n", a.sun_path, strerror(errno)); _exit(1); } From c8b520c0625b440d0dcd588af085d35cf46aae2c Mon Sep 17 00:00:00 2001 From: Stefano Brivio <sbrivio@redhat.com> Date: Thu, 6 Mar 2025 20:00:51 +0100 Subject: [PATCH 279/382] flow, repair: Wait for a short while for passt-repair to connect ...and time out after that. This will be needed because of an upcoming change to passt-repair enabling it to start before passt is started, on both source and target, by means of an inotify watch. 
Once the inotify watch triggers, passt-repair will connect right away, but we have no guarantees that the connection completes before we start the migration process, so wait for it (for a reasonable amount of time). Signed-off-by: Stefano Brivio <sbrivio@redhat.com> Reviewed-by: David Gibson <david@gibson.dropbear.id.au> --- flow.c | 20 ++++++++++++++++++++ repair.c | 32 ++++++++++++++++++++++++++++++++ repair.h | 1 + 3 files changed, 53 insertions(+) diff --git a/flow.c b/flow.c index 749c498..5e64b79 100644 --- a/flow.c +++ b/flow.c @@ -911,6 +911,21 @@ static int flow_migrate_source_rollback(struct ctx *c, unsigned bound, int ret) return ret; } +/** + * flow_migrate_need_repair() - Do we need to set repair mode for any flow? + * + * Return: true if repair mode is needed, false otherwise + */ +static bool flow_migrate_need_repair(void) +{ + union flow *flow; + + foreach_established_tcp_flow(flow) + return true; + + return false; +} + /** * flow_migrate_repair_all() - Turn repair mode on or off for all flows * @c: Execution context @@ -966,6 +981,9 @@ int flow_migrate_source_pre(struct ctx *c, const struct migrate_stage *stage, (void)stage; (void)fd; + if (flow_migrate_need_repair()) + repair_wait(c); + if ((rc = flow_migrate_repair_all(c, true))) return -rc; @@ -1083,6 +1101,8 @@ int flow_migrate_target(struct ctx *c, const struct migrate_stage *stage, if (!count) return 0; + repair_wait(c); + if ((rc = flow_migrate_repair_all(c, true))) return -rc; diff --git a/repair.c b/repair.c index 3ee089f..149fe51 100644 --- a/repair.c +++ b/repair.c @@ -27,6 +27,10 @@ #define SCM_MAX_FD 253 /* From Linux kernel (include/net/scm.h), not in UAPI */ +/* Wait for a while for TCP_REPAIR helper to connect if it's not there yet */ +#define REPAIR_ACCEPT_TIMEOUT_MS 10 +#define REPAIR_ACCEPT_TIMEOUT_US (REPAIR_ACCEPT_TIMEOUT_MS * 1000) + /* Pending file descriptors for next repair_flush() call, or command change */ static int repair_fds[SCM_MAX_FD]; @@ -138,6 +142,34 @@ void 
repair_handler(struct ctx *c, uint32_t events) repair_close(c); } +/** + * repair_wait() - Wait (with timeout) for TCP_REPAIR helper to connect + * @c: Execution context + */ +void repair_wait(struct ctx *c) +{ + struct timeval tv = { .tv_sec = 0, + .tv_usec = (long)(REPAIR_ACCEPT_TIMEOUT_US) }; + static_assert(REPAIR_ACCEPT_TIMEOUT_US < 1000 * 1000, + ".tv_usec is greater than 1000 * 1000"); + + if (c->fd_repair >= 0 || c->fd_repair_listen == -1) + return; + + if (setsockopt(c->fd_repair_listen, SOL_SOCKET, SO_RCVTIMEO, + &tv, sizeof(tv))) { + err_perror("Set timeout on TCP_REPAIR listening socket"); + return; + } + + repair_listen_handler(c, EPOLLIN); + + tv.tv_usec = 0; + if (setsockopt(c->fd_repair_listen, SOL_SOCKET, SO_RCVTIMEO, + &tv, sizeof(tv))) + err_perror("Clear timeout on TCP_REPAIR listening socket"); +} + /** * repair_flush() - Flush current set of sockets to helper, with current command * @c: Execution context diff --git a/repair.h b/repair.h index de279d6..1d37922 100644 --- a/repair.h +++ b/repair.h @@ -10,6 +10,7 @@ void repair_sock_init(const struct ctx *c); void repair_listen_handler(struct ctx *c, uint32_t events); void repair_handler(struct ctx *c, uint32_t events); void repair_close(struct ctx *c); +void repair_wait(struct ctx *c); int repair_flush(struct ctx *c); int repair_set(struct ctx *c, int s, int cmd); From bb00a0499fc9130e4b00a88928958b8b094ee2c9 Mon Sep 17 00:00:00 2001 From: David Gibson <david@gibson.dropbear.id.au> Date: Wed, 12 Mar 2025 13:18:31 +1100 Subject: [PATCH 280/382] conf: Use the same optstring for passt and pasta modes Currently we rely on detecting our mode first and use different sets of (single character) options for each. This means that if you use an option valid in only one mode in another you'll get the generic usage() message. 
We can give more helpful errors with little extra effort by combining all the options into a single value of the option string and giving bespoke messages if an option for the wrong mode is used; in fact we already did this for some single mode options like '-1'. Signed-off-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- conf.c | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/conf.c b/conf.c index 065e720..7f20bc8 100644 --- a/conf.c +++ b/conf.c @@ -1388,6 +1388,7 @@ void conf(struct ctx *c, int argc, char **argv) {"repair-path", required_argument, NULL, 28 }, { 0 }, }; + const char *optstring = "+dqfel:hs:F:I:p:P:m:a:n:M:g:i:o:D:S:H:461t:u:T:U:"; const char *logname = (c->mode == MODE_PASTA) ? "pasta" : "passt"; char userns[PATH_MAX] = { 0 }, netns[PATH_MAX] = { 0 }; bool copy_addrs_opt = false, copy_routes_opt = false; @@ -1397,7 +1398,6 @@ void conf(struct ctx *c, int argc, char **argv) struct fqdn *dnss = c->dns_search; unsigned int ifi4 = 0, ifi6 = 0; const char *logfile = NULL; - const char *optstring; size_t logsize = 0; char *runas = NULL; long fd_tap_opt; @@ -1408,9 +1408,6 @@ void conf(struct ctx *c, int argc, char **argv) if (c->mode == MODE_PASTA) { c->no_dhcp_dns = c->no_dhcp_dns_search = 1; fwd_default = FWD_AUTO; - optstring = "+dqfel:hF:I:p:P:m:a:n:M:g:i:o:D:S:H:46t:u:T:U:"; - } else { - optstring = "+dqfel:hs:F:p:P:m:a:n:M:g:i:o:D:S:H:461t:u:"; } c->mtu = ROUND_DOWN(ETH_MAX_MTU - ETH_HLEN, sizeof(uint32_t)); @@ -1614,6 +1611,9 @@ void conf(struct ctx *c, int argc, char **argv) c->foreground = 1; break; case 's': + if (c->mode == MODE_PASTA) + die("-s is for passt / vhost-user mode only"); + ret = snprintf(c->sock_path, sizeof(c->sock_path), "%s", optarg); if (ret <= 0 || ret >= (int)sizeof(c->sock_path)) @@ -1634,6 +1634,9 @@ void conf(struct ctx *c, int argc, char **argv) *c->sock_path = 0; break; case 'I': + if (c->mode != MODE_PASTA) + die("-I is for 
pasta mode only"); + ret = snprintf(c->pasta_ifn, IFNAMSIZ, "%s", optarg); if (ret <= 0 || ret >= IFNAMSIZ) @@ -1790,11 +1793,16 @@ void conf(struct ctx *c, int argc, char **argv) break; case 't': case 'u': - case 'T': - case 'U': case 'D': /* Handle these later, once addresses are configured */ break; + case 'T': + case 'U': + if (c->mode != MODE_PASTA) + die("-%c is for pasta mode only", name); + + /* Handle properly later, once addresses are configured */ + break; case 'h': usage(argv[0], stdout, EXIT_SUCCESS); break; From 4b17d042c7e4f6e5b5a770181e2ebd53ec8e73d4 Mon Sep 17 00:00:00 2001 From: David Gibson <david@gibson.dropbear.id.au> Date: Wed, 12 Mar 2025 13:18:32 +1100 Subject: [PATCH 281/382] conf: Move mode detection into helper function One of the first things we need to do is determine if we're in passt mode or pasta mode. Currently this is open-coded in main(), by examining argv[0]. We want to complexify this a bit in future to cover vhost-user mode as well. Prepare for this, by moving the mode detection into a new conf_mode() function. 
Signed-off-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- conf.c | 26 ++++++++++++++++++++++++++ conf.h | 1 + passt.c | 14 ++------------ 3 files changed, 29 insertions(+), 12 deletions(-) diff --git a/conf.c b/conf.c index 7f20bc8..2022ea1 100644 --- a/conf.c +++ b/conf.c @@ -991,6 +991,32 @@ pasta_opts: _exit(status); } +/** + * conf_mode() - Determine passt/pasta's operating mode from command line + * @argc: Argument count + * @argv: Command line arguments + * + * Return: mode to operate in, PASTA or PASST + */ +/* cppcheck-suppress constParameter */ +enum passt_modes conf_mode(int argc, char *argv[]) +{ + char argv0[PATH_MAX], *basearg0; + + if (argc < 1) + die("Cannot determine argv[0]"); + + strncpy(argv0, argv[0], PATH_MAX - 1); + basearg0 = basename(argv0); + if (strstr(basearg0, "pasta")) + return MODE_PASTA; + + if (strstr(basearg0, "passt")) + return MODE_PASST; + + die("Cannot determine mode, invoke as \"passt\" or \"pasta\""); +} + /** * conf_print() - Print fundamental configuration parameters * @c: Execution context diff --git a/conf.h b/conf.h index 9d2143d..b45ad74 100644 --- a/conf.h +++ b/conf.h @@ -6,6 +6,7 @@ #ifndef CONF_H #define CONF_H +enum passt_modes conf_mode(int argc, char *argv[]); void conf(struct ctx *c, int argc, char **argv); #endif /* CONF_H */ diff --git a/passt.c b/passt.c index 868842b..0bd2a29 100644 --- a/passt.c +++ b/passt.c @@ -191,7 +191,6 @@ int main(int argc, char **argv) { struct epoll_event events[EPOLL_EVENTS]; int nfds, i, devnull_fd = -1; - char argv0[PATH_MAX], *name; struct ctx c = { 0 }; struct rlimit limit; struct timespec now; @@ -213,21 +212,12 @@ int main(int argc, char **argv) sigaction(SIGTERM, &sa, NULL); sigaction(SIGQUIT, &sa, NULL); - if (argc < 1) - _exit(EXIT_FAILURE); + c.mode = conf_mode(argc, argv); - strncpy(argv0, argv[0], PATH_MAX - 1); - name = basename(argv0); - if (strstr(name, "pasta")) { + if (c.mode == MODE_PASTA) { sa.sa_handler = 
pasta_child_handler; if (sigaction(SIGCHLD, &sa, NULL)) die_perror("Couldn't install signal handlers"); - - c.mode = MODE_PASTA; - } else if (strstr(name, "passt")) { - c.mode = MODE_PASST; - } else { - _exit(EXIT_FAILURE); } if (signal(SIGPIPE, SIG_IGN) == SIG_ERR) From 74cd82adc87552c7ef6d255069a974b4ebeab4a1 Mon Sep 17 00:00:00 2001 From: David Gibson <david@gibson.dropbear.id.au> Date: Wed, 12 Mar 2025 13:18:33 +1100 Subject: [PATCH 282/382] conf: Detect vhost-user mode earlier We detect our operating mode in conf_mode(), unless we're using vhost-user mode, in which case we change it later when we parse the --vhost-user option. That means we need to delay parsing the --repair-path option (for vhost-user only) until still later. However, there are many other places in the main option parsing loop which also rely on mode. We get away with those, because they happen to be able to treat passt and vhost-user modes identically. This is potentially confusing, though. So, move setting of MODE_VU into conf_mode() so c->mode always has its final value from that point onwards. To match, we move the parsing of --repair-path back into the main option parsing loop. 
Signed-off-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- conf.c | 43 ++++++++++++++++++++++++++----------------- 1 file changed, 26 insertions(+), 17 deletions(-) diff --git a/conf.c b/conf.c index 2022ea1..b58e2a6 100644 --- a/conf.c +++ b/conf.c @@ -998,10 +998,23 @@ pasta_opts: * * Return: mode to operate in, PASTA or PASST */ -/* cppcheck-suppress constParameter */ enum passt_modes conf_mode(int argc, char *argv[]) { + int vhost_user = 0; + const struct option optvu[] = { + {"vhost-user", no_argument, &vhost_user, 1 }, + { 0 }, + }; char argv0[PATH_MAX], *basearg0; + int name; + + optind = 0; + do { + name = getopt_long(argc, argv, "-:", optvu, NULL); + } while (name != -1); + + if (vhost_user) + return MODE_VU; if (argc < 1) die("Cannot determine argv[0]"); @@ -1604,9 +1617,8 @@ void conf(struct ctx *c, int argc, char **argv) die("Invalid host nameserver address: %s", optarg); case 25: - if (c->mode == MODE_PASTA) - die("--vhost-user is for passt mode only"); - c->mode = MODE_VU; + /* Already handled in conf_mode() */ + ASSERT(c->mode == MODE_VU); break; case 26: vu_print_capabilities(); @@ -1617,7 +1629,14 @@ void conf(struct ctx *c, int argc, char **argv) die("Invalid FQDN: %s", optarg); break; case 28: - /* Handle this once we checked --vhost-user */ + if (c->mode != MODE_VU && strcmp(optarg, "none")) + die("--repair-path is for vhost-user mode only"); + + if (snprintf_check(c->repair_path, + sizeof(c->repair_path), "%s", + optarg)) + die("Invalid passt-repair path: %s", optarg); + break; case 'd': c->debug = 1; @@ -1917,8 +1936,8 @@ void conf(struct ctx *c, int argc, char **argv) if (c->ifi4 && IN4_IS_ADDR_UNSPECIFIED(&c->ip4.guest_gw)) c->no_dhcp = 1; - /* Inbound port options, DNS, and --repair-path can be parsed now, after - * IPv4/IPv6 settings and --vhost-user. 
+ /* Inbound port options and DNS can be parsed now, after IPv4/IPv6 + * settings */ fwd_probe_ephemeral(); udp_portmap_clear(); @@ -1964,16 +1983,6 @@ void conf(struct ctx *c, int argc, char **argv) } die("Cannot use DNS address %s", optarg); - } else if (name == 28) { - if (c->mode != MODE_VU && strcmp(optarg, "none")) - die("--repair-path is for vhost-user mode only"); - - if (snprintf_check(c->repair_path, - sizeof(c->repair_path), "%s", - optarg)) - die("Invalid passt-repair path: %s", optarg); - - break; - } } while (name != -1); From c43972ad67806fb403cdbc05179441917f2a776b Mon Sep 17 00:00:00 2001 From: David Gibson <david@gibson.dropbear.id.au> Date: Wed, 12 Mar 2025 13:18:34 +1100 Subject: [PATCH 283/382] packet: Give explicit name to maximum packet size We verify that every packet we store in a pool (and every partial packet we retrieve from it) has a length no longer than UINT16_MAX. This originated in the older packet pool implementation which stored packet lengths in a uint16_t. Now that packets are represented by a struct iovec with its size_t length, this check serves only as a sanity / security check that we don't have some wildly out of range length due to a bug elsewhere. We have many reasons to (slightly) increase this limit in future, so in preparation, give this quantity an explicit name - PACKET_MAX_LEN.
Signed-off-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- packet.c | 4 ++-- packet.h | 3 +++ 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/packet.c b/packet.c index 0330b54..bcac037 100644 --- a/packet.c +++ b/packet.c @@ -83,7 +83,7 @@ void packet_add_do(struct pool *p, size_t len, const char *start, if (packet_check_range(p, start, len, func, line)) return; - if (len > UINT16_MAX) { + if (len > PACKET_MAX_LEN) { trace("add packet length %zu, %s:%i", len, func, line); return; } @@ -119,7 +119,7 @@ void *packet_get_do(const struct pool *p, size_t idx, size_t offset, return NULL; } - if (len > UINT16_MAX) { + if (len > PACKET_MAX_LEN) { if (func) { trace("packet data length %zu, %s:%i", len, func, line); diff --git a/packet.h b/packet.h index bdc07fe..d099f02 100644 --- a/packet.h +++ b/packet.h @@ -6,6 +6,9 @@ #ifndef PACKET_H #define PACKET_H +/* Maximum size of a single packet stored in pool, including headers */ +#define PACKET_MAX_LEN UINT16_MAX + /** * struct pool - Generic pool of packets stored in a buffer * @buf: Buffer storing packet descriptors, From 1eda8de4384a93778a781257781c5b0967c8abfe Mon Sep 17 00:00:00 2001 From: David Gibson <david@gibson.dropbear.id.au> Date: Wed, 12 Mar 2025 13:18:35 +1100 Subject: [PATCH 284/382] packet: Remove redundant TAP_BUF_BYTES define Currently we define both TAP_BUF_BYTES and PKT_BUF_BYTES as essentially the same thing. They'll be different only if TAP_BUF_BYTES is negative, which makes no sense. So, remove TAP_BUF_BYTES and just use PKT_BUF_BYTES. In addition, most places we use this to just mean the size of the main packet buffer (pkt_buf) for which we can just directly use sizeof. 
Signed-off-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- passt.c | 2 +- passt.h | 5 ++--- tap.c | 4 ++-- 3 files changed, 5 insertions(+), 6 deletions(-) diff --git a/passt.c b/passt.c index 0bd2a29..cd06772 100644 --- a/passt.c +++ b/passt.c @@ -223,7 +223,7 @@ int main(int argc, char **argv) if (signal(SIGPIPE, SIG_IGN) == SIG_ERR) die_perror("Couldn't set disposition for SIGPIPE"); - madvise(pkt_buf, TAP_BUF_BYTES, MADV_HUGEPAGE); + madvise(pkt_buf, sizeof(pkt_buf), MADV_HUGEPAGE); c.epollfd = epoll_create1(EPOLL_CLOEXEC); if (c.epollfd == -1) diff --git a/passt.h b/passt.h index 28d1389..6b24805 100644 --- a/passt.h +++ b/passt.h @@ -69,12 +69,11 @@ union epoll_ref { static_assert(sizeof(union epoll_ref) <= sizeof(union epoll_data), "epoll_ref must have same size as epoll_data"); -#define TAP_BUF_BYTES \ +#define PKT_BUF_BYTES \ ROUND_DOWN(((ETH_MAX_MTU + sizeof(uint32_t)) * 128), PAGE_SIZE) #define TAP_MSGS \ - DIV_ROUND_UP(TAP_BUF_BYTES, ETH_ZLEN - 2 * ETH_ALEN + sizeof(uint32_t)) + DIV_ROUND_UP(PKT_BUF_BYTES, ETH_ZLEN - 2 * ETH_ALEN + sizeof(uint32_t)) -#define PKT_BUF_BYTES MAX(TAP_BUF_BYTES, 0) extern char pkt_buf [PKT_BUF_BYTES]; extern char *epoll_type_str[]; diff --git a/tap.c b/tap.c index 4541f51..fb306e7 100644 --- a/tap.c +++ b/tap.c @@ -1080,7 +1080,7 @@ static void tap_passt_input(struct ctx *c, const struct timespec *now) do { n = recv(c->fd_tap, pkt_buf + partial_len, - TAP_BUF_BYTES - partial_len, MSG_DONTWAIT); + sizeof(pkt_buf) - partial_len, MSG_DONTWAIT); } while ((n < 0) && errno == EINTR); if (n < 0) { @@ -1151,7 +1151,7 @@ static void tap_pasta_input(struct ctx *c, const struct timespec *now) tap_flush_pools(); - for (n = 0; n <= (ssize_t)(TAP_BUF_BYTES - ETH_MAX_MTU); n += len) { + for (n = 0; n <= (ssize_t)(sizeof(pkt_buf) - ETH_MAX_MTU); n += len) { len = read(c->fd_tap, pkt_buf + n, ETH_MAX_MTU); if (len == 0) { From c4bfa3339cea586172d4b0fcd613b5638498651e Mon Sep 17 00:00:00 
2001 From: David Gibson <david@gibson.dropbear.id.au> Date: Wed, 12 Mar 2025 13:18:36 +1100 Subject: [PATCH 285/382] tap: Use explicit defines for maximum length of L2 frame Currently in tap.c we (mostly) use ETH_MAX_MTU as the maximum length of an L2 frame. This define comes from the kernel, but it's badly named and used confusingly. First, it doesn't really have anything to do with Ethernet, which has no structural limit on frame lengths. It comes more from either a) IP which imposes a 64k datagram limit or b) from internal buffers used in various places in the kernel (and in passt). Worse, MTU generally means the maximum size of the IP (L3) datagram which may be transferred, _not_ counting the L2 headers. In the kernel ETH_MAX_MTU is sometimes used that way, but sometimes seems to be used as a maximum frame length, _including_ L2 headers. In tap.c we're mostly using it in the second way. Finally, each of our tap backends could have different limits on the frame size imposed by the mechanisms they're using. Start clearing up this confusion by replacing it in tap.c with new L2_MAX_LEN_* defines which specifically refer to the maximum L2 frame length for each backend. 
Signed-off-by: David Gibson <dgibson@redhat.com> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- tap.c | 23 +++++++++++++++++++---- tap.h | 25 +++++++++++++++++++++++++ 2 files changed, 44 insertions(+), 4 deletions(-) diff --git a/tap.c b/tap.c index fb306e7..ede547c 100644 --- a/tap.c +++ b/tap.c @@ -62,6 +62,19 @@ #include "vhost_user.h" #include "vu_common.h" +/* Maximum allowed frame lengths (including L2 header) */ + +/* Verify that an L2 frame length limit is large enough to contain the header, + * but small enough to fit in the packet pool + */ +#define CHECK_FRAME_LEN(len) \ + static_assert((len) >= ETH_HLEN && (len) <= PACKET_MAX_LEN, \ + #len " has bad value") + +CHECK_FRAME_LEN(L2_MAX_LEN_PASTA); +CHECK_FRAME_LEN(L2_MAX_LEN_PASST); +CHECK_FRAME_LEN(L2_MAX_LEN_VU); + /* IPv4 (plus ARP) and IPv6 message batches from tap/guest to IP handlers */ static PACKET_POOL_NOINIT(pool_tap4, TAP_MSGS, pkt_buf); static PACKET_POOL_NOINIT(pool_tap6, TAP_MSGS, pkt_buf); @@ -1097,7 +1110,7 @@ static void tap_passt_input(struct ctx *c, const struct timespec *now) while (n >= (ssize_t)sizeof(uint32_t)) { uint32_t l2len = ntohl_unaligned(p); - if (l2len < sizeof(struct ethhdr) || l2len > ETH_MAX_MTU) { + if (l2len < sizeof(struct ethhdr) || l2len > L2_MAX_LEN_PASST) { err("Bad frame size from guest, resetting connection"); tap_sock_reset(c); return; @@ -1151,8 +1164,10 @@ static void tap_pasta_input(struct ctx *c, const struct timespec *now) tap_flush_pools(); - for (n = 0; n <= (ssize_t)(sizeof(pkt_buf) - ETH_MAX_MTU); n += len) { - len = read(c->fd_tap, pkt_buf + n, ETH_MAX_MTU); + for (n = 0; + n <= (ssize_t)(sizeof(pkt_buf) - L2_MAX_LEN_PASTA); + n += len) { + len = read(c->fd_tap, pkt_buf + n, L2_MAX_LEN_PASTA); if (len == 0) { die("EOF on tap device, exiting"); @@ -1170,7 +1185,7 @@ static void tap_pasta_input(struct ctx *c, const struct timespec *now) /* Ignore frames of bad length */ if (len < (ssize_t)sizeof(struct ethhdr) || - len > (ssize_t)ETH_MAX_MTU) + 
len > (ssize_t)L2_MAX_LEN_PASTA) continue; tap_add_packet(c, len, pkt_buf + n); diff --git a/tap.h b/tap.h index a2c3b87..84e9fdb 100644 --- a/tap.h +++ b/tap.h @@ -6,6 +6,31 @@ #ifndef TAP_H #define TAP_H +/** L2_MAX_LEN_PASTA - Maximum frame length for pasta mode (with L2 header) + * + * The kernel tuntap device imposes a maximum frame size of 65535 including + * 'hard_header_len' (14 bytes for L2 Ethernet in the case of "tap" mode). + */ +#define L2_MAX_LEN_PASTA USHRT_MAX + +/** L2_MAX_LEN_PASST - Maximum frame length for passt mode (with L2 header) + * + * The only structural limit the QEMU socket protocol imposes on frames is + * (2^32-1) bytes, but that would be ludicrously long in practice. For now, + * limit it somewhat arbitrarily to 65535 bytes. FIXME: Work out an appropriate + * limit with more precision. + */ +#define L2_MAX_LEN_PASST USHRT_MAX + +/** L2_MAX_LEN_VU - Maximum frame length for vhost-user mode (with L2 header) + * + * vhost-user allows multiple buffers per frame, each of which can be quite + * large, so the inherent frame size limit is rather large. Much larger than is + * actually useful for IP. For now limit arbitrarily to 65535 bytes. FIXME: + * Work out an appropriate limit with more precision. + */ +#define L2_MAX_LEN_VU USHRT_MAX + struct udphdr; /** From b6945e055376be944867479dcd8deb77e47b1fa4 Mon Sep 17 00:00:00 2001 From: David Gibson <david@gibson.dropbear.id.au> Date: Wed, 12 Mar 2025 13:18:37 +1100 Subject: [PATCH 286/382] Simplify sizing of pkt_buf We define the size of pkt_buf as large enough to hold 128 maximum size packets. Well, approximately, since we round down to the page size. We don't have any specific reliance on how many packets can fit in the buffer, we just want it to be big enough to allow reasonable batching. The current definition relies on the confusingly named ETH_MAX_MTU and adds in sizeof(uint32_t) rather non-obviously for the pseudo-physical header used by the qemu socket (passt mode) protocol. 
Instead, just define it to be 8MiB, which is what that complex calculation works out to. Signed-off-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- passt.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/passt.h b/passt.h index 6b24805..8f45091 100644 --- a/passt.h +++ b/passt.h @@ -69,8 +69,8 @@ union epoll_ref { static_assert(sizeof(union epoll_ref) <= sizeof(union epoll_data), "epoll_ref must have same size as epoll_data"); -#define PKT_BUF_BYTES \ - ROUND_DOWN(((ETH_MAX_MTU + sizeof(uint32_t)) * 128), PAGE_SIZE) +/* Large enough for ~128 maximum size frames */ +#define PKT_BUF_BYTES (8UL << 20) #define TAP_MSGS \ DIV_ROUND_UP(PKT_BUF_BYTES, ETH_ZLEN - 2 * ETH_ALEN + sizeof(uint32_t)) From 9d1a6b3eba9e6e5c4db4bfa0e395edc45ca6c39d Mon Sep 17 00:00:00 2001 From: David Gibson <david@gibson.dropbear.id.au> Date: Wed, 12 Mar 2025 13:18:38 +1100 Subject: [PATCH 287/382] pcap: Correctly set snaplen based on tap backend type The pcap header includes a value indicating how much of each frame is captured. We always capture the entire frame, so we want to set this to the maximum possible frame size. Currently we do that by setting it to ETH_MAX_MTU, but that's a confusingly named constant which might not always be correct depending on the details of our tap backend. Instead add a tap_l2_max_len() function that explicitly returns the maximum frame size for the current mode and use that to set snaplen. While we're there, there's no particular need for the pcap header to be defined in a global; make it local to pcap_init() instead. 
Signed-off-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- pcap.c | 46 ++++++++++++++++++++++++---------------------- tap.c | 19 +++++++++++++++++++ tap.h | 1 + 3 files changed, 44 insertions(+), 22 deletions(-) diff --git a/pcap.c b/pcap.c index 3d623cf..e95aa6f 100644 --- a/pcap.c +++ b/pcap.c @@ -33,33 +33,12 @@ #include "log.h" #include "pcap.h" #include "iov.h" +#include "tap.h" #define PCAP_VERSION_MINOR 4 static int pcap_fd = -1; -/* See pcap.h from libpcap, or pcap-savefile(5) */ -static const struct { - uint32_t magic; -#define PCAP_MAGIC 0xa1b2c3d4 - - uint16_t major; -#define PCAP_VERSION_MAJOR 2 - - uint16_t minor; -#define PCAP_VERSION_MINOR 4 - - int32_t thiszone; - uint32_t sigfigs; - uint32_t snaplen; - - uint32_t linktype; -#define PCAP_LINKTYPE_ETHERNET 1 -} pcap_hdr = { - PCAP_MAGIC, PCAP_VERSION_MAJOR, PCAP_VERSION_MINOR, 0, 0, ETH_MAX_MTU, - PCAP_LINKTYPE_ETHERNET -}; - struct pcap_pkthdr { uint32_t tv_sec; uint32_t tv_usec; @@ -162,6 +141,29 @@ void pcap_iov(const struct iovec *iov, size_t iovcnt, size_t offset) */ void pcap_init(struct ctx *c) { + /* See pcap.h from libpcap, or pcap-savefile(5) */ +#define PCAP_MAGIC 0xa1b2c3d4 +#define PCAP_VERSION_MAJOR 2 +#define PCAP_VERSION_MINOR 4 +#define PCAP_LINKTYPE_ETHERNET 1 + const struct { + uint32_t magic; + uint16_t major; + uint16_t minor; + + int32_t thiszone; + uint32_t sigfigs; + uint32_t snaplen; + + uint32_t linktype; + } pcap_hdr = { + .magic = PCAP_MAGIC, + .major = PCAP_VERSION_MAJOR, + .minor = PCAP_VERSION_MINOR, + .snaplen = tap_l2_max_len(c), + .linktype = PCAP_LINKTYPE_ETHERNET + }; + if (pcap_fd != -1) return; diff --git a/tap.c b/tap.c index ede547c..182a115 100644 --- a/tap.c +++ b/tap.c @@ -82,6 +82,25 @@ static PACKET_POOL_NOINIT(pool_tap6, TAP_MSGS, pkt_buf); #define TAP_SEQS 128 /* Different L4 tuples in one batch */ #define FRAGMENT_MSG_RATE 10 /* # seconds between fragment warnings */ +/** + * tap_l2_max_len() - 
Maximum frame size (including L2 header) for current mode + * @c: Execution context + */ +unsigned long tap_l2_max_len(const struct ctx *c) +{ + /* NOLINTBEGIN(bugprone-branch-clone): values can be the same */ + switch (c->mode) { + case MODE_PASST: + return L2_MAX_LEN_PASST; + case MODE_PASTA: + return L2_MAX_LEN_PASTA; + case MODE_VU: + return L2_MAX_LEN_VU; + } + /* NOLINTEND(bugprone-branch-clone) */ + ASSERT(0); +} + /** * tap_send_single() - Send a single frame * @c: Execution context diff --git a/tap.h b/tap.h index 84e9fdb..dd39fd8 100644 --- a/tap.h +++ b/tap.h @@ -69,6 +69,7 @@ static inline void tap_hdr_update(struct tap_hdr *thdr, size_t l2len) thdr->vnet_len = htonl(l2len); } +unsigned long tap_l2_max_len(const struct ctx *c); void *tap_push_l2h(const struct ctx *c, void *buf, uint16_t proto); void *tap_push_ip4h(struct iphdr *ip4h, struct in_addr src, struct in_addr dst, size_t l4len, uint8_t proto); From 26df8a3608e7b006c00f44a9029bcadb6d5e4153 Mon Sep 17 00:00:00 2001 From: David Gibson <david@gibson.dropbear.id.au> Date: Wed, 12 Mar 2025 13:18:39 +1100 Subject: [PATCH 288/382] conf: Limit maximum MTU based on backend frame size The -m option controls the MTU, that is the maximum transmissible L3 datagram, not including L2 headers. We currently limit it to ETH_MAX_MTU which sounds like it makes sense. But ETH_MAX_MTU is confusing: it's not consistently used as to whether it means the maximum L3 datagram size or the maximum L2 frame size. Even within conf() we explicitly account for the L2 header size when computing the default --mtu value, but not when we compute the maximum --mtu value. Clean this up by reworking the maximum MTU computation to be the minimum of IP_MAX_MTU (65535) and the maximum sized IP datagram which can fit into our L2 frames when we account for the L2 header. The latter can vary depending on our tap backend, although it doesn't right now. 
Link: https://bugs.passt.top/show_bug.cgi?id=66 Signed-off-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- conf.c | 11 +++++++---- util.h | 3 --- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/conf.c b/conf.c index b58e2a6..c760f79 100644 --- a/conf.c +++ b/conf.c @@ -1434,6 +1434,7 @@ void conf(struct ctx *c, int argc, char **argv) enum fwd_ports_mode fwd_default = FWD_NONE; bool v4_only = false, v6_only = false; unsigned dns4_idx = 0, dns6_idx = 0; + unsigned long max_mtu = IP_MAX_MTU; struct fqdn *dnss = c->dns_search; unsigned int ifi4 = 0, ifi6 = 0; const char *logfile = NULL; @@ -1449,7 +1450,9 @@ void conf(struct ctx *c, int argc, char **argv) fwd_default = FWD_AUTO; } - c->mtu = ROUND_DOWN(ETH_MAX_MTU - ETH_HLEN, sizeof(uint32_t)); + if (tap_l2_max_len(c) - ETH_HLEN < max_mtu) + max_mtu = tap_l2_max_len(c) - ETH_HLEN; + c->mtu = ROUND_DOWN(max_mtu, sizeof(uint32_t)); c->tcp.fwd_in.mode = c->tcp.fwd_out.mode = FWD_UNSET; c->udp.fwd_in.mode = c->udp.fwd_out.mode = FWD_UNSET; memcpy(c->our_tap_mac, MAC_OUR_LAA, ETH_ALEN); @@ -1711,9 +1714,9 @@ void conf(struct ctx *c, int argc, char **argv) if (errno || *e) die("Invalid MTU: %s", optarg); - if (mtu > ETH_MAX_MTU) { - die("MTU %lu too large (max %u)", - mtu, ETH_MAX_MTU); + if (mtu > max_mtu) { + die("MTU %lu too large (max %lu)", + mtu, max_mtu); } c->mtu = mtu; diff --git a/util.h b/util.h index 0f70f4d..4d512fa 100644 --- a/util.h +++ b/util.h @@ -31,9 +31,6 @@ #ifndef SECCOMP_RET_KILL_PROCESS #define SECCOMP_RET_KILL_PROCESS SECCOMP_RET_KILL #endif -#ifndef ETH_MAX_MTU -#define ETH_MAX_MTU USHRT_MAX -#endif #ifndef IP_MAX_MTU #define IP_MAX_MTU USHRT_MAX #endif From 78f1f0fdfc1831f2ca3a65c2cee98c44ff3c30ab Mon Sep 17 00:00:00 2001 From: David Gibson <david@gibson.dropbear.id.au> Date: Wed, 12 Mar 2025 16:26:57 +1100 Subject: [PATCH 289/382] test/perf: Simplify iperf3 server lifetime management After we start the iperf3 server in the 
background, we have a sleep to make sure it's ready to receive connections. We can simplify this slightly by using the -D option to have iperf3 background itself rather than backgrounding it manually. That won't return until the server is ready to use. Signed-off-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- test/lib/test | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/test/lib/test b/test/lib/test index 758250a..7349674 100755 --- a/test/lib/test +++ b/test/lib/test @@ -20,10 +20,7 @@ test_iperf3s() { __sctx="${1}" __port="${2}" - pane_or_context_run_bg "${__sctx}" \ - 'iperf3 -s -p'${__port}' & echo $! > s.pid' \ - - sleep 1 # Wait for server to be ready + pane_or_context_run "${__sctx}" 'iperf3 -s -p'${__port}' -D -I s.pid' } # test_iperf3k() - Kill iperf3 server @@ -31,7 +28,7 @@ test_iperf3s() { test_iperf3k() { __sctx="${1}" - pane_or_context_run "${__sctx}" 'kill -INT $(cat s.pid); rm s.pid' + pane_or_context_run "${__sctx}" 'kill -INT $(cat s.pid)' sleep 1 # Wait for kernel to free up ports } From 96fe5548cb16fe2664ad121c2976048ccad6a1ab Mon Sep 17 00:00:00 2001 From: David Gibson <david@gibson.dropbear.id.au> Date: Wed, 12 Mar 2025 14:43:59 +1100 Subject: [PATCH 290/382] conf: Unify several paths in conf_ports() In conf_ports() we have three different paths which actually do the setup of an individual forwarded port: one for the "all" case, one for the exclusions only case and one for the range of ports with possible exclusions case. We can unify those cases using a new helper which handles a single range of ports, with a bitmap of exclusions. Although this is slightly longer (largely due to the new helpers function comment), it reduces duplicated logic. It will also make future improvements to the tracking of port forwards easier. 
The new conf_ports_range_except() function has a pretty prodigious parameter list, but I still think it's an overall improvement in conceptual complexity. Signed-off-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- conf.c | 173 ++++++++++++++++++++++++++++++--------------------------- 1 file changed, 90 insertions(+), 83 deletions(-) diff --git a/conf.c b/conf.c index c760f79..0e2e8dc 100644 --- a/conf.c +++ b/conf.c @@ -123,6 +123,75 @@ static int parse_port_range(const char *s, char **endptr, return 0; } +/** + * conf_ports_range_except() - Set up forwarding for a range of ports minus a + * bitmap of exclusions + * @c: Execution context + * @optname: Short option name, t, T, u, or U + * @optarg: Option argument (port specification) + * @fwd: Pointer to @fwd_ports to be updated + * @addr: Listening address + * @ifname: Listening interface + * @first: First port to forward + * @last: Last port to forward + * @exclude: Bitmap of ports to exclude + * @to: Port to translate @first to when forwarding + * @weak: Ignore errors, as long as at least one port is mapped + */ +static void conf_ports_range_except(const struct ctx *c, char optname, + const char *optarg, struct fwd_ports *fwd, + const union inany_addr *addr, + const char *ifname, + uint16_t first, uint16_t last, + const uint8_t *exclude, uint16_t to, + bool weak) +{ + bool bound_one = false; + unsigned i; + int ret; + + if (first == 0) { + die("Can't forward port 0 for option '-%c %s'", + optname, optarg); + } + + for (i = first; i <= last; i++) { + if (bitmap_isset(exclude, i)) + continue; + + if (bitmap_isset(fwd->map, i)) { + warn( +"Altering mapping of already mapped port number: %s", optarg); + } + + bitmap_set(fwd->map, i); + fwd->delta[i] = to - first; + + if (optname == 't') + ret = tcp_sock_init(c, addr, ifname, i); + else if (optname == 'u') + ret = udp_sock_init(c, 0, addr, ifname, i); + else + /* No way to check in advance for -T and -U */ + ret = 
0; + + if (ret == -ENFILE || ret == -EMFILE) { + die("Can't open enough sockets for port specifier: %s", + optarg); + } + + if (!ret) { + bound_one = true; + } else if (!weak) { + die("Failed to bind port %u (%s) for option '-%c %s'", + i, strerror_(-ret), optname, optarg); + } + } + + if (!bound_one) + die("Failed to bind any port for '-%c %s'", optname, optarg); +} + /** * conf_ports() - Parse port configuration options, initialise UDP/TCP sockets * @c: Execution context @@ -135,10 +204,9 @@ static void conf_ports(const struct ctx *c, char optname, const char *optarg, { union inany_addr addr_buf = inany_any6, *addr = &addr_buf; char buf[BUFSIZ], *spec, *ifname = NULL, *p; - bool exclude_only = true, bound_one = false; uint8_t exclude[PORT_BITMAP_SIZE] = { 0 }; + bool exclude_only = true; unsigned i; - int ret; if (!strcmp(optarg, "none")) { if (fwd->mode) @@ -173,32 +241,15 @@ static void conf_ports(const struct ctx *c, char optname, const char *optarg, fwd->mode = FWD_ALL; - /* Skip port 0. It has special meaning for many socket APIs, so - * trying to bind it is not really safe. - */ - for (i = 1; i < NUM_PORTS; i++) { + /* Exclude ephemeral ports */ + for (i = 0; i < NUM_PORTS; i++) if (fwd_port_is_ephemeral(i)) - continue; - - bitmap_set(fwd->map, i); - if (optname == 't') { - ret = tcp_sock_init(c, NULL, NULL, i); - if (ret == -ENFILE || ret == -EMFILE) - goto enfile; - if (!ret) - bound_one = true; - } else if (optname == 'u') { - ret = udp_sock_init(c, 0, NULL, NULL, i); - if (ret == -ENFILE || ret == -EMFILE) - goto enfile; - if (!ret) - bound_one = true; - } - } - - if (!bound_one) - goto bind_all_fail; + bitmap_set(exclude, i); + conf_ports_range_except(c, optname, optarg, fwd, + NULL, NULL, + 1, NUM_PORTS - 1, exclude, + 1, true); return; } @@ -275,37 +326,15 @@ static void conf_ports(const struct ctx *c, char optname, const char *optarg, } while ((p = next_chunk(p, ','))); if (exclude_only) { - /* Skip port 0. 
It has special meaning for many socket APIs, so - * trying to bind it is not really safe. - */ - for (i = 1; i < NUM_PORTS; i++) { - if (fwd_port_is_ephemeral(i) || - bitmap_isset(exclude, i)) - continue; - - bitmap_set(fwd->map, i); - - if (optname == 't') { - ret = tcp_sock_init(c, addr, ifname, i); - if (ret == -ENFILE || ret == -EMFILE) - goto enfile; - if (!ret) - bound_one = true; - } else if (optname == 'u') { - ret = udp_sock_init(c, 0, addr, ifname, i); - if (ret == -ENFILE || ret == -EMFILE) - goto enfile; - if (!ret) - bound_one = true; - } else { - /* No way to check in advance for -T and -U */ - bound_one = true; - } - } - - if (!bound_one) - goto bind_all_fail; + /* Exclude ephemeral ports */ + for (i = 0; i < NUM_PORTS; i++) + if (fwd_port_is_ephemeral(i)) + bitmap_set(exclude, i); + conf_ports_range_except(c, optname, optarg, fwd, + addr, ifname, + 1, NUM_PORTS - 1, exclude, + 1, true); return; } @@ -334,40 +363,18 @@ static void conf_ports(const struct ctx *c, char optname, const char *optarg, if ((*p != '\0') && (*p != ',')) /* Garbage after the ranges */ goto bad; - for (i = orig_range.first; i <= orig_range.last; i++) { - if (bitmap_isset(fwd->map, i)) - warn( -"Altering mapping of already mapped port number: %s", optarg); - - if (bitmap_isset(exclude, i)) - continue; - - bitmap_set(fwd->map, i); - - fwd->delta[i] = mapped_range.first - orig_range.first; - - ret = 0; - if (optname == 't') - ret = tcp_sock_init(c, addr, ifname, i); - else if (optname == 'u') - ret = udp_sock_init(c, 0, addr, ifname, i); - if (ret) - goto bind_fail; - } + conf_ports_range_except(c, optname, optarg, fwd, + addr, ifname, + orig_range.first, orig_range.last, + exclude, + mapped_range.first, false); } while ((p = next_chunk(p, ','))); return; -enfile: - die("Can't open enough sockets for port specifier: %s", optarg); bad: die("Invalid port specifier %s", optarg); mode_conflict: die("Port forwarding mode '%s' conflicts with previous mode", optarg); -bind_fail: - 
die("Failed to bind port %u (%s) for option '-%c %s', exiting", - i, strerror_(-ret), optname, optarg); -bind_all_fail: - die("Failed to bind any port for '-%c %s', exiting", optname, optarg); } /** From cb5b593563402680bee850245667f2e71b0d1bda Mon Sep 17 00:00:00 2001 From: David Gibson <david@gibson.dropbear.id.au> Date: Thu, 13 Mar 2025 13:56:17 +1100 Subject: [PATCH 291/382] tcp, flow: Better use flow specific logging helpers A number of places in the TCP code use general logging functions, instead of the flow specific ones. That includes a few older ones as well as many places in the new migration code. Thus they either don't identify which flow the problem happened on, or identify it in a non-standard way. Convert many of these to use the existing flow specific helpers. Signed-off-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- flow.c | 16 ++-- tcp.c | 252 +++++++++++++++++++++++++++---------------- tcp.h | 1 - tcp_buf.c | 4 +- tcp_internal.h | 1 + tcp_vu.c | 2 +- 6 files changed, 149 insertions(+), 127 deletions(-) diff --git a/flow.c b/flow.c index 5e64b79..8622242 100644 --- a/flow.c +++ b/flow.c @@ -1037,8 +1037,8 @@ int flow_migrate_source(struct ctx *c, const struct migrate_stage *stage, foreach_established_tcp_flow(flow) { rc = tcp_flow_migrate_source(fd, &flow->tcp); if (rc) { - err("Can't send data, flow %u: %s", FLOW_IDX(flow), - strerror_(-rc)); + flow_err(flow, "Can't send data: %s", + strerror_(-rc)); if (!first) die("Inconsistent migration state, exiting"); @@ -1064,8 +1064,8 @@ int flow_migrate_source(struct ctx *c, const struct migrate_stage *stage, foreach_established_tcp_flow(flow) { rc = tcp_flow_migrate_source_ext(fd, &flow->tcp); if (rc) { - err("Extended data for flow %u: %s", FLOW_IDX(flow), - strerror_(-rc)); + flow_err(flow, "Can't send extended data: %s", + strerror_(-rc)); if (rc == -EIO) die("Inconsistent migration state, exiting"); @@ -1112,8 +1112,8 @@ int
flow_migrate_target(struct ctx *c, const struct migrate_stage *stage, for (i = 0; i < count; i++) { rc = tcp_flow_migrate_target(c, fd); if (rc) { - debug("Migration data failure at flow %u: %s, abort", - i, strerror_(-rc)); + flow_dbg(FLOW(i), "Migration data failure, abort: %s", + strerror_(-rc)); return -rc; } } @@ -1123,8 +1123,8 @@ int flow_migrate_target(struct ctx *c, const struct migrate_stage *stage, for (i = 0; i < count; i++) { rc = tcp_flow_migrate_target_ext(c, &flowtab[i].tcp, fd); if (rc) { - debug("Migration data failure at flow %u: %s, abort", - i, strerror_(-rc)); + flow_dbg(FLOW(i), "Migration data failure, abort: %s", + strerror_(-rc)); return -rc; } } diff --git a/tcp.c b/tcp.c index 32a08bd..a4c840e 100644 --- a/tcp.c +++ b/tcp.c @@ -434,19 +434,20 @@ static struct tcp_tap_conn *conn_at_sidx(flow_sidx_t sidx) } /** - * tcp_set_peek_offset() - Set SO_PEEK_OFF offset on a socket if supported - * @s: Socket to update + * tcp_set_peek_offset() - Set SO_PEEK_OFF offset on connection if supported + * @conn: Pointer to the TCP connection structure * @offset: Offset in bytes * * Return: -1 when it fails, 0 otherwise. 
*/ -int tcp_set_peek_offset(int s, int offset) +int tcp_set_peek_offset(const struct tcp_tap_conn *conn, int offset) { if (!peek_offset_cap) return 0; - if (setsockopt(s, SOL_SOCKET, SO_PEEK_OFF, &offset, sizeof(offset))) { - err("Failed to set SO_PEEK_OFF to %i in socket %i", offset, s); + if (setsockopt(conn->sock, SOL_SOCKET, SO_PEEK_OFF, + &offset, sizeof(offset))) { + flow_perror(conn, "Failed to set SO_PEEK_OFF to %i", offset); return -1; } return 0; @@ -1757,7 +1758,7 @@ static int tcp_data_from_tap(const struct ctx *c, struct tcp_tap_conn *conn, "fast re-transmit, ACK: %u, previous sequence: %u", max_ack_seq, conn->seq_to_tap); conn->seq_to_tap = max_ack_seq; - if (tcp_set_peek_offset(conn->sock, 0)) { + if (tcp_set_peek_offset(conn, 0)) { tcp_rst(c, conn); return -1; } @@ -1854,7 +1855,7 @@ static void tcp_conn_from_sock_finish(const struct ctx *c, conn->seq_ack_to_tap = conn->seq_from_tap; conn_event(c, conn, ESTABLISHED); - if (tcp_set_peek_offset(conn->sock, 0)) { + if (tcp_set_peek_offset(conn, 0)) { tcp_rst(c, conn); return; } @@ -2022,7 +2023,7 @@ int tcp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af, goto reset; conn_event(c, conn, ESTABLISHED); - if (tcp_set_peek_offset(conn->sock, 0)) + if (tcp_set_peek_offset(conn, 0)) goto reset; if (th->fin) { @@ -2286,7 +2287,7 @@ void tcp_timer_handler(const struct ctx *c, union epoll_ref ref) conn->seq_to_tap = conn->seq_ack_from_tap; if (!conn->wnd_from_tap) conn->wnd_from_tap = 1; /* Zero-window probe */ - if (tcp_set_peek_offset(conn->sock, 0)) { + if (tcp_set_peek_offset(conn, 0)) { tcp_rst(c, conn); } else { tcp_data_from_sock(c, conn); @@ -2810,20 +2811,21 @@ int tcp_flow_repair_off(struct ctx *c, const struct tcp_tap_conn *conn) /** * tcp_flow_dump_tinfo() - Dump window scale, tcpi_state, tcpi_options - * @c: Execution context + * @conn: Pointer to the TCP connection structure * @t: Extended migration data * * Return: 0 on success, negative error code on failure */ -static int 
tcp_flow_dump_tinfo(int s, struct tcp_tap_transfer_ext *t) +static int tcp_flow_dump_tinfo(const struct tcp_tap_conn *conn, + struct tcp_tap_transfer_ext *t) { struct tcp_info tinfo; socklen_t sl; sl = sizeof(tinfo); - if (getsockopt(s, SOL_TCP, TCP_INFO, &tinfo, &sl)) { + if (getsockopt(conn->sock, SOL_TCP, TCP_INFO, &tinfo, &sl)) { int rc = -errno; - err_perror("Querying TCP_INFO, socket %i", s); + flow_perror(conn, "Querying TCP_INFO"); return rc; } @@ -2837,18 +2839,19 @@ static int tcp_flow_dump_tinfo(int s, struct tcp_tap_transfer_ext *t) /** * tcp_flow_dump_mss() - Dump MSS clamp (not current MSS) via TCP_MAXSEG - * @c: Execution context + * @conn: Pointer to the TCP connection structure * @t: Extended migration data * * Return: 0 on success, negative error code on failure */ -static int tcp_flow_dump_mss(int s, struct tcp_tap_transfer_ext *t) +static int tcp_flow_dump_mss(const struct tcp_tap_conn *conn, + struct tcp_tap_transfer_ext *t) { socklen_t sl = sizeof(t->mss); - if (getsockopt(s, SOL_TCP, TCP_MAXSEG, &t->mss, &sl)) { + if (getsockopt(conn->sock, SOL_TCP, TCP_MAXSEG, &t->mss, &sl)) { int rc = -errno; - err_perror("Getting MSS, socket %i", s); + flow_perror(conn, "Getting MSS"); return rc; } @@ -2857,19 +2860,20 @@ static int tcp_flow_dump_mss(int s, struct tcp_tap_transfer_ext *t) /** * tcp_flow_dump_wnd() - Dump current tcp_repair_window parameters - * @c: Execution context + * @conn: Pointer to the TCP connection structure * @t: Extended migration data * * Return: 0 on success, negative error code on failure */ -static int tcp_flow_dump_wnd(int s, struct tcp_tap_transfer_ext *t) +static int tcp_flow_dump_wnd(const struct tcp_tap_conn *conn, + struct tcp_tap_transfer_ext *t) { struct tcp_repair_window wnd; socklen_t sl = sizeof(wnd); - if (getsockopt(s, IPPROTO_TCP, TCP_REPAIR_WINDOW, &wnd, &sl)) { + if (getsockopt(conn->sock, IPPROTO_TCP, TCP_REPAIR_WINDOW, &wnd, &sl)) { int rc = -errno; - err_perror("Getting window repair data, socket %i", s); + 
flow_perror(conn, "Getting window repair data"); return rc; } @@ -2893,12 +2897,13 @@ static int tcp_flow_dump_wnd(int s, struct tcp_tap_transfer_ext *t) /** * tcp_flow_repair_wnd() - Restore window parameters from extended data - * @c: Execution context + * @conn: Pointer to the TCP connection structure * @t: Extended migration data * * Return: 0 on success, negative error code on failure */ -static int tcp_flow_repair_wnd(int s, const struct tcp_tap_transfer_ext *t) +static int tcp_flow_repair_wnd(const struct tcp_tap_conn *conn, + const struct tcp_tap_transfer_ext *t) { struct tcp_repair_window wnd; @@ -2908,9 +2913,10 @@ static int tcp_flow_repair_wnd(int s, const struct tcp_tap_transfer_ext *t) wnd.rcv_wnd = t->rcv_wnd; wnd.rcv_wup = t->rcv_wup; - if (setsockopt(s, IPPROTO_TCP, TCP_REPAIR_WINDOW, &wnd, sizeof(wnd))) { + if (setsockopt(conn->sock, IPPROTO_TCP, TCP_REPAIR_WINDOW, + &wnd, sizeof(wnd))) { int rc = -errno; - err_perror("Setting window data, socket %i", s); + flow_perror(conn, "Setting window data"); return rc; } @@ -2919,16 +2925,17 @@ static int tcp_flow_repair_wnd(int s, const struct tcp_tap_transfer_ext *t) /** * tcp_flow_select_queue() - Select queue (receive or send) for next operation - * @s: Socket + * @conn: Connection to select queue for * @queue: TCP_RECV_QUEUE or TCP_SEND_QUEUE * * Return: 0 on success, negative error code on failure */ -static int tcp_flow_select_queue(int s, int queue) +static int tcp_flow_select_queue(const struct tcp_tap_conn *conn, int queue) { - if (setsockopt(s, SOL_TCP, TCP_REPAIR_QUEUE, &queue, sizeof(queue))) { + if (setsockopt(conn->sock, SOL_TCP, TCP_REPAIR_QUEUE, + &queue, sizeof(queue))) { int rc = -errno; - err_perror("Selecting TCP_SEND_QUEUE, socket %i", s); + flow_perror(conn, "Selecting TCP_SEND_QUEUE"); return rc; } @@ -2937,26 +2944,28 @@ static int tcp_flow_select_queue(int s, int queue) /** * tcp_flow_dump_sndqueue() - Dump send queue, length of sent and not sent data - * @s: Socket + * @conn: 
Connection to dump queue for * @t: Extended migration data * * Return: 0 on success, negative error code on failure * * #syscalls:vu ioctl */ -static int tcp_flow_dump_sndqueue(int s, struct tcp_tap_transfer_ext *t) +static int tcp_flow_dump_sndqueue(const struct tcp_tap_conn *conn, + struct tcp_tap_transfer_ext *t) { + int s = conn->sock; ssize_t rc; if (ioctl(s, SIOCOUTQ, &t->sndq) < 0) { rc = -errno; - err_perror("Getting send queue size, socket %i", s); + flow_perror(conn, "Getting send queue size"); return rc; } if (ioctl(s, SIOCOUTQNSD, &t->notsent) < 0) { rc = -errno; - err_perror("Getting not sent count, socket %i", s); + flow_perror(conn, "Getting not sent count"); return rc; } @@ -2975,14 +2984,16 @@ static int tcp_flow_dump_sndqueue(int s, struct tcp_tap_transfer_ext *t) } if (t->notsent > t->sndq) { - err("Invalid notsent count socket %i, send: %u, not sent: %u", - s, t->sndq, t->notsent); + flow_err(conn, + "Invalid notsent count socket %i, send: %u, not sent: %u", + s, t->sndq, t->notsent); return -EINVAL; } if (t->sndq > TCP_MIGRATE_SND_QUEUE_MAX) { - err("Send queue too large to migrate socket %i: %u bytes", - s, t->sndq); + flow_err(conn, + "Send queue too large to migrate socket %i: %u bytes", + s, t->sndq); return -ENOBUFS; } @@ -2993,13 +3004,13 @@ static int tcp_flow_dump_sndqueue(int s, struct tcp_tap_transfer_ext *t) rc = 0; } else { rc = -errno; - err_perror("Can't read send queue, socket %i", s); + flow_perror(conn, "Can't read send queue"); return rc; } } if ((uint32_t)rc < t->sndq) { - err("Short read migrating send queue"); + flow_err(conn, "Short read migrating send queue"); return -ENXIO; } @@ -3010,19 +3021,20 @@ static int tcp_flow_dump_sndqueue(int s, struct tcp_tap_transfer_ext *t) /** * tcp_flow_repair_queue() - Restore contents of a given (pre-selected) queue - * @s: Socket + * @conn: Connection to repair queue for * @len: Length of data to be restored * @buf: Buffer with content of pending data queue * * Return: 0 on success, 
negative error code on failure */ -static int tcp_flow_repair_queue(int s, size_t len, uint8_t *buf) +static int tcp_flow_repair_queue(const struct tcp_tap_conn *conn, + size_t len, uint8_t *buf) { size_t chunk = len; uint8_t *p = buf; while (len > 0) { - ssize_t rc = send(s, p, MIN(len, chunk), 0); + ssize_t rc = send(conn->sock, p, MIN(len, chunk), 0); if (rc < 0) { if ((errno == ENOBUFS || errno == ENOMEM) && @@ -3032,7 +3044,7 @@ static int tcp_flow_repair_queue(int s, size_t len, uint8_t *buf) } rc = -errno; - err_perror("Can't write queue, socket %i", s); + flow_perror(conn, "Can't write queue"); return rc; } @@ -3045,18 +3057,18 @@ static int tcp_flow_repair_queue(int s, size_t len, uint8_t *buf) /** * tcp_flow_dump_seq() - Dump current sequence of pre-selected queue - * @s: Socket + * @conn: Pointer to the TCP connection structure * @v: Sequence value, set on return * * Return: 0 on success, negative error code on failure */ -static int tcp_flow_dump_seq(int s, uint32_t *v) +static int tcp_flow_dump_seq(const struct tcp_tap_conn *conn, uint32_t *v) { socklen_t sl = sizeof(*v); - if (getsockopt(s, SOL_TCP, TCP_QUEUE_SEQ, v, &sl)) { + if (getsockopt(conn->sock, SOL_TCP, TCP_QUEUE_SEQ, v, &sl)) { int rc = -errno; - err_perror("Dumping sequence, socket %i", s); + flow_perror(conn, "Dumping sequence"); return rc; } @@ -3065,16 +3077,17 @@ static int tcp_flow_dump_seq(int s, uint32_t *v) /** * tcp_flow_repair_seq() - Restore sequence for pre-selected queue - * @s: Socket + * @conn: Connection to repair sequences for * @v: Sequence value to be set * * Return: 0 on success, negative error code on failure */ -static int tcp_flow_repair_seq(int s, const uint32_t *v) +static int tcp_flow_repair_seq(const struct tcp_tap_conn *conn, + const uint32_t *v) { - if (setsockopt(s, SOL_TCP, TCP_QUEUE_SEQ, v, sizeof(*v))) { + if (setsockopt(conn->sock, SOL_TCP, TCP_QUEUE_SEQ, v, sizeof(*v))) { int rc = -errno; - err_perror("Setting sequence, socket %i", s); + flow_perror(conn, 
"Setting sequence"); return rc; } @@ -3083,15 +3096,17 @@ static int tcp_flow_repair_seq(int s, const uint32_t *v) /** * tcp_flow_dump_rcvqueue() - Dump receive queue and its length, seal/block it - * @s: Socket + * @conn: Pointer to the TCP connection structure * @t: Extended migration data * * Return: 0 on success, negative error code on failure * * #syscalls:vu ioctl */ -static int tcp_flow_dump_rcvqueue(int s, struct tcp_tap_transfer_ext *t) +static int tcp_flow_dump_rcvqueue(const struct tcp_tap_conn *conn, + struct tcp_tap_transfer_ext *t) { + int s = conn->sock; ssize_t rc; if (ioctl(s, SIOCINQ, &t->rcvq) < 0) { @@ -3111,8 +3126,9 @@ static int tcp_flow_dump_rcvqueue(int s, struct tcp_tap_transfer_ext *t) t->rcvq--; if (t->rcvq > TCP_MIGRATE_RCV_QUEUE_MAX) { - err("Receive queue too large to migrate socket %i: %u bytes", - s, t->rcvq); + flow_err(conn, + "Receive queue too large to migrate socket: %u bytes", + t->rcvq); return -ENOBUFS; } @@ -3122,13 +3138,13 @@ static int tcp_flow_dump_rcvqueue(int s, struct tcp_tap_transfer_ext *t) rc = 0; } else { rc = -errno; - err_perror("Can't read receive queue for socket %i", s); + flow_perror(conn, "Can't read receive queue"); return rc; } } if ((uint32_t)rc < t->rcvq) { - err("Short read migrating receive queue"); + flow_err(conn, "Short read migrating receive queue"); return -ENXIO; } @@ -3137,12 +3153,13 @@ static int tcp_flow_dump_rcvqueue(int s, struct tcp_tap_transfer_ext *t) /** * tcp_flow_repair_opt() - Set repair "options" (MSS, scale, SACK, timestamps) - * @s: Socket + * @conn: Pointer to the TCP connection structure * @t: Extended migration data * * Return: 0 on success, negative error code on failure */ -static int tcp_flow_repair_opt(int s, const struct tcp_tap_transfer_ext *t) +static int tcp_flow_repair_opt(const struct tcp_tap_conn *conn, + const struct tcp_tap_transfer_ext *t) { const struct tcp_repair_opt opts[] = { { TCPOPT_WINDOW, t->snd_ws + (t->rcv_ws << 16) }, @@ -3156,9 +3173,9 @@ static int 
tcp_flow_repair_opt(int s, const struct tcp_tap_transfer_ext *t) !!(t->tcpi_options & TCPI_OPT_SACK) + !!(t->tcpi_options & TCPI_OPT_TIMESTAMPS)); - if (setsockopt(s, SOL_TCP, TCP_REPAIR_OPTIONS, opts, sl)) { + if (setsockopt(conn->sock, SOL_TCP, TCP_REPAIR_OPTIONS, opts, sl)) { int rc = -errno; - err_perror("Setting repair options, socket %i", s); + flow_perror(conn, "Setting repair options"); return rc; } @@ -3229,36 +3246,36 @@ int tcp_flow_migrate_source_ext(int fd, const struct tcp_tap_conn *conn) /* Disable SO_PEEK_OFF, it will make accessing the queues in repair mode * weird. */ - if (tcp_set_peek_offset(s, -1)) { + if (tcp_set_peek_offset(conn, -1)) { rc = -errno; goto fail; } - if ((rc = tcp_flow_dump_tinfo(s, t))) + if ((rc = tcp_flow_dump_tinfo(conn, t))) goto fail; - if ((rc = tcp_flow_dump_mss(s, t))) + if ((rc = tcp_flow_dump_mss(conn, t))) goto fail; - if ((rc = tcp_flow_dump_wnd(s, t))) + if ((rc = tcp_flow_dump_wnd(conn, t))) goto fail; - if ((rc = tcp_flow_select_queue(s, TCP_SEND_QUEUE))) + if ((rc = tcp_flow_select_queue(conn, TCP_SEND_QUEUE))) goto fail; - if ((rc = tcp_flow_dump_sndqueue(s, t))) + if ((rc = tcp_flow_dump_sndqueue(conn, t))) goto fail; - if ((rc = tcp_flow_dump_seq(s, &t->seq_snd))) + if ((rc = tcp_flow_dump_seq(conn, &t->seq_snd))) goto fail; - if ((rc = tcp_flow_select_queue(s, TCP_RECV_QUEUE))) + if ((rc = tcp_flow_select_queue(conn, TCP_RECV_QUEUE))) goto fail; - if ((rc = tcp_flow_dump_rcvqueue(s, t))) + if ((rc = tcp_flow_dump_rcvqueue(conn, t))) goto fail; - if ((rc = tcp_flow_dump_seq(s, &t->seq_rcv))) + if ((rc = tcp_flow_dump_seq(conn, &t->seq_rcv))) goto fail; close(s); @@ -3269,14 +3286,14 @@ int tcp_flow_migrate_source_ext(int fd, const struct tcp_tap_conn *conn) t->seq_rcv -= t->rcvq; t->seq_snd -= t->sndq; - debug("Extended migration data, socket %i sequences send %u receive %u", - s, t->seq_snd, t->seq_rcv); - debug(" pending queues: send %u not sent %u receive %u", - t->sndq, t->notsent, t->rcvq); - debug(" 
window: snd_wl1 %u snd_wnd %u max %u rcv_wnd %u rcv_wup %u", - t->snd_wl1, t->snd_wnd, t->max_window, t->rcv_wnd, t->rcv_wup); - debug(" SO_PEEK_OFF %s offset=%"PRIu32, - peek_offset_cap ? "enabled" : "disabled", peek_offset); + flow_dbg(conn, "Extended migration data, socket %i sequences send %u receive %u", + s, t->seq_snd, t->seq_rcv); + flow_dbg(conn, " pending queues: send %u not sent %u receive %u", + t->sndq, t->notsent, t->rcvq); + flow_dbg(conn, " window: snd_wl1 %u snd_wnd %u max %u rcv_wnd %u rcv_wup %u", + t->snd_wl1, t->snd_wnd, t->max_window, t->rcv_wnd, t->rcv_wup); + flow_dbg(conn, " SO_PEEK_OFF %s offset=%"PRIu32, + peek_offset_cap ? "enabled" : "disabled", peek_offset); /* Endianness fix-ups */ t->seq_snd = htonl(t->seq_snd); @@ -3292,17 +3309,17 @@ int tcp_flow_migrate_source_ext(int fd, const struct tcp_tap_conn *conn) t->rcv_wup = htonl(t->rcv_wup); if (write_all_buf(fd, t, sizeof(*t))) { - err_perror("Failed to write extended data, socket %i", s); + flow_perror(conn, "Failed to write extended data"); return -EIO; } if (write_all_buf(fd, tcp_migrate_snd_queue, ntohl(t->sndq))) { - err_perror("Failed to write send queue data, socket %i", s); + flow_perror(conn, "Failed to write send queue data"); return -EIO; } if (write_all_buf(fd, tcp_migrate_rcv_queue, ntohl(t->rcvq))) { - err_perror("Failed to write receive queue data, socket %i", s); + flow_perror(conn, "Failed to write receive queue data"); return -EIO; } @@ -3317,7 +3334,7 @@ fail: t->tcpi_state = 0; /* Not defined: tell the target to skip this flow */ if (write_all_buf(fd, t, sizeof(*t))) { - err_perror("Failed to write extended data, socket %i", s); + flow_perror(conn, "Failed to write extended data"); return -EIO; } @@ -3347,19 +3364,20 @@ static int tcp_flow_repair_socket(struct ctx *c, struct tcp_tap_conn *conn) if ((conn->sock = socket(af, SOCK_STREAM | SOCK_NONBLOCK | SOCK_CLOEXEC, IPPROTO_TCP)) < 0) { rc = -errno; - err_perror("Failed to create socket for migrated flow"); + 
flow_perror(conn, "Failed to create socket for migrated flow"); return rc; } s = conn->sock; if (setsockopt(s, SOL_SOCKET, SO_REUSEADDR, &(int){ 1 }, sizeof(int))) - debug_perror("Setting SO_REUSEADDR on socket %i", s); + flow_dbg_perror(conn, "Failed to set SO_REUSEADDR on socket %i", + s); tcp_sock_set_nodelay(s); if (bind(s, &a.sa, sizeof(a))) { rc = -errno; - err_perror("Failed to bind socket for migrated flow"); + flow_perror(conn, "Failed to bind socket for migrated flow"); goto err; } @@ -3390,7 +3408,7 @@ static int tcp_flow_repair_connect(const struct ctx *c, rc = flowside_connect(c, conn->sock, PIF_HOST, tgt); if (rc) { rc = -errno; - err_perror("Failed to connect migrated socket %i", conn->sock); + flow_perror(conn, "Failed to connect migrated socket"); return rc; } @@ -3421,8 +3439,8 @@ int tcp_flow_migrate_target(struct ctx *c, int fd) } if (read_all_buf(fd, &t, sizeof(t))) { + flow_perror(flow, "Failed to receive migration data"); flow_alloc_cancel(flow); - err_perror("Failed to receive migration data"); return -errno; } @@ -3481,7 +3499,7 @@ int tcp_flow_migrate_target_ext(struct ctx *c, struct tcp_tap_conn *conn, int fd if (read_all_buf(fd, &t, sizeof(t))) { rc = -errno; - err_perror("Failed to read extended data for socket %i", s); + flow_perror(conn, "Failed to read extended data"); return rc; } @@ -3503,31 +3521,34 @@ int tcp_flow_migrate_target_ext(struct ctx *c, struct tcp_tap_conn *conn, int fd t.rcv_wnd = ntohl(t.rcv_wnd); t.rcv_wup = ntohl(t.rcv_wup); - debug("Extended migration data, socket %i sequences send %u receive %u", - s, t.seq_snd, t.seq_rcv); - debug(" pending queues: send %u not sent %u receive %u", - t.sndq, t.notsent, t.rcvq); - debug(" window: snd_wl1 %u snd_wnd %u max %u rcv_wnd %u rcv_wup %u", - t.snd_wl1, t.snd_wnd, t.max_window, t.rcv_wnd, t.rcv_wup); - debug(" SO_PEEK_OFF %s offset=%"PRIu32, - peek_offset_cap ? 
"enabled" : "disabled", peek_offset); + flow_dbg(conn, + "Extended migration data, socket %i sequences send %u receive %u", + s, t.seq_snd, t.seq_rcv); + flow_dbg(conn, " pending queues: send %u not sent %u receive %u", + t.sndq, t.notsent, t.rcvq); + flow_dbg(conn, + " window: snd_wl1 %u snd_wnd %u max %u rcv_wnd %u rcv_wup %u", + t.snd_wl1, t.snd_wnd, t.max_window, t.rcv_wnd, t.rcv_wup); + flow_dbg(conn, " SO_PEEK_OFF %s offset=%"PRIu32, + peek_offset_cap ? "enabled" : "disabled", peek_offset); if (t.sndq > TCP_MIGRATE_SND_QUEUE_MAX || t.notsent > t.sndq || t.rcvq > TCP_MIGRATE_RCV_QUEUE_MAX) { - err("Bad queues socket %i, send: %u, not sent: %u, receive: %u", - s, t.sndq, t.notsent, t.rcvq); + flow_err(conn, + "Bad queues socket %i, send: %u, not sent: %u, receive: %u", + s, t.sndq, t.notsent, t.rcvq); return -EINVAL; } if (read_all_buf(fd, tcp_migrate_snd_queue, t.sndq)) { rc = -errno; - err_perror("Failed to read send queue data, socket %i", s); + flow_perror(conn, "Failed to read send queue data"); return rc; } if (read_all_buf(fd, tcp_migrate_rcv_queue, t.rcvq)) { rc = -errno; - err_perror("Failed to read receive queue data, socket %i", s); + flow_perror(conn, "Failed to read receive queue data"); return rc; } @@ -3535,32 +3556,32 @@ int tcp_flow_migrate_target_ext(struct ctx *c, struct tcp_tap_conn *conn, int fd /* We weren't able to create the socket, discard flow */ goto fail; - if (tcp_flow_select_queue(s, TCP_SEND_QUEUE)) + if (tcp_flow_select_queue(conn, TCP_SEND_QUEUE)) goto fail; - if (tcp_flow_repair_seq(s, &t.seq_snd)) + if (tcp_flow_repair_seq(conn, &t.seq_snd)) goto fail; - if (tcp_flow_select_queue(s, TCP_RECV_QUEUE)) + if (tcp_flow_select_queue(conn, TCP_RECV_QUEUE)) goto fail; - if (tcp_flow_repair_seq(s, &t.seq_rcv)) + if (tcp_flow_repair_seq(conn, &t.seq_rcv)) goto fail; if (tcp_flow_repair_connect(c, conn)) goto fail; - if (tcp_flow_repair_queue(s, t.rcvq, tcp_migrate_rcv_queue)) + if (tcp_flow_repair_queue(conn, t.rcvq, 
tcp_migrate_rcv_queue)) goto fail; - if (tcp_flow_select_queue(s, TCP_SEND_QUEUE)) + if (tcp_flow_select_queue(conn, TCP_SEND_QUEUE)) goto fail; - if (tcp_flow_repair_queue(s, t.sndq - t.notsent, + if (tcp_flow_repair_queue(conn, t.sndq - t.notsent, tcp_migrate_snd_queue)) goto fail; - if (tcp_flow_repair_opt(s, &t)) + if (tcp_flow_repair_opt(conn, &t)) goto fail; /* If we sent a FIN sent and it was acknowledged (TCP_FIN_WAIT2), don't @@ -3575,19 +3596,19 @@ int tcp_flow_migrate_target_ext(struct ctx *c, struct tcp_tap_conn *conn, int fd v = TCP_SEND_QUEUE; if (setsockopt(s, SOL_TCP, TCP_REPAIR_QUEUE, &v, sizeof(v))) - debug_perror("Selecting repair queue, socket %i", s); + flow_perror(conn, "Selecting repair queue"); else shutdown(s, SHUT_WR); } - if (tcp_flow_repair_wnd(s, &t)) + if (tcp_flow_repair_wnd(conn, &t)) goto fail; tcp_flow_repair_off(c, conn); repair_flush(c); if (t.notsent) { - if (tcp_flow_repair_queue(s, t.notsent, + if (tcp_flow_repair_queue(conn, t.notsent, tcp_migrate_snd_queue + (t.sndq - t.notsent))) { /* This sometimes seems to fail for unclear reasons. 
@@ -3607,15 +3628,16 @@ int tcp_flow_migrate_target_ext(struct ctx *c, struct tcp_tap_conn *conn, int fd if (t.tcpi_state == TCP_FIN_WAIT1) shutdown(s, SHUT_WR); - if (tcp_set_peek_offset(conn->sock, peek_offset)) + if (tcp_set_peek_offset(conn, peek_offset)) goto fail; tcp_send_flag(c, conn, ACK); tcp_data_from_sock(c, conn); if ((rc = tcp_epoll_ctl(c, conn))) { - debug("Failed to subscribe to epoll for migrated socket %i: %s", - conn->sock, strerror_(-rc)); + flow_dbg(conn, + "Failed to subscribe to epoll for migrated socket: %s", + strerror_(-rc)); goto fail; } diff --git a/tcp.h b/tcp.h index 9142eca..234a803 100644 --- a/tcp.h +++ b/tcp.h @@ -25,7 +25,6 @@ void tcp_timer(struct ctx *c, const struct timespec *now); void tcp_defer_handler(struct ctx *c); void tcp_update_l2_buf(const unsigned char *eth_d, const unsigned char *eth_s); -int tcp_set_peek_offset(int s, int offset); extern bool peek_offset_cap; diff --git a/tcp_buf.c b/tcp_buf.c index 72d99c5..0530563 100644 --- a/tcp_buf.c +++ b/tcp_buf.c @@ -125,7 +125,7 @@ static void tcp_revert_seq(const struct ctx *c, struct tcp_tap_conn **conns, conn->seq_to_tap = seq; peek_offset = conn->seq_to_tap - conn->seq_ack_from_tap; - if (tcp_set_peek_offset(conn->sock, peek_offset)) + if (tcp_set_peek_offset(conn, peek_offset)) tcp_rst(c, conn); } } @@ -304,7 +304,7 @@ int tcp_buf_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn) conn->seq_ack_from_tap, conn->seq_to_tap); conn->seq_to_tap = conn->seq_ack_from_tap; already_sent = 0; - if (tcp_set_peek_offset(s, 0)) { + if (tcp_set_peek_offset(conn, 0)) { tcp_rst(c, conn); return -1; } diff --git a/tcp_internal.h b/tcp_internal.h index 6f5e054..36c6533 100644 --- a/tcp_internal.h +++ b/tcp_internal.h @@ -177,5 +177,6 @@ int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_tap_conn *conn, int tcp_prepare_flags(const struct ctx *c, struct tcp_tap_conn *conn, int flags, struct tcphdr *th, struct tcp_syn_opts *opts, size_t *optlen); +int 
tcp_set_peek_offset(const struct tcp_tap_conn *conn, int offset); #endif /* TCP_INTERNAL_H */ diff --git a/tcp_vu.c b/tcp_vu.c index 6891ed1..57587cc 100644 --- a/tcp_vu.c +++ b/tcp_vu.c @@ -376,7 +376,7 @@ int tcp_vu_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn) conn->seq_ack_from_tap, conn->seq_to_tap); conn->seq_to_tap = conn->seq_ack_from_tap; already_sent = 0; - if (tcp_set_peek_offset(conn->sock, 0)) { + if (tcp_set_peek_offset(conn, 0)) { tcp_rst(c, conn); return -1; } From 51f3c071a76bd20677e72b49007b822dca71e755 Mon Sep 17 00:00:00 2001 From: Stefano Brivio <sbrivio@redhat.com> Date: Tue, 18 Mar 2025 17:18:47 +0100 Subject: [PATCH 292/382] passt-repair: Fix build with -Werror=format-security Fixes: 04701702471e ("passt-repair: Add directory watch") Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- passt-repair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/passt-repair.c b/passt-repair.c index 8bb3f00..120f7aa 100644 --- a/passt-repair.c +++ b/passt-repair.c @@ -150,7 +150,7 @@ int main(int argc, char **argv) _exit(1); } - ret = snprintf(a.sun_path, sizeof(a.sun_path), path); + ret = snprintf(a.sun_path, sizeof(a.sun_path), "%s", path); inotify_dir = true; } else { ret = snprintf(a.sun_path, sizeof(a.sun_path), "%s", argv[1]); From 28772ee91a60b34786023496ea17c2c2f4e5f7f5 Mon Sep 17 00:00:00 2001 From: David Gibson <david@gibson.dropbear.id.au> Date: Wed, 19 Mar 2025 16:14:21 +1100 Subject: [PATCH 293/382] migrate, tcp: More careful marshalling of mss parameter during migration During migration we extract the limit on segment size using TCP_MAXSEG, and set it on the other side with TCP_REPAIR_OPTIONS. However, unlike most 32-bit values we transfer we transfer it in native endian, not network endian. This is not correct; add it to the list of endian fixups we make. In addition, while MAXSEG will be 32-bits in practice, and is given as such to TCP_REPAIR_OPTIONS, the TCP_MAXSEG sockopt treats it as an 'int'. 
It's not strictly safe to pass a uint32_t to a getsockopt() expecting an int, although we'll get away with it on most (maybe all) platforms. Correct this as well. Signed-off-by: David Gibson <david@gibson.dropbear.id.au> [sbrivio: Minor coding style fix] Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- tcp.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/tcp.c b/tcp.c index a4c840e..43ee76b 100644 --- a/tcp.c +++ b/tcp.c @@ -2848,13 +2848,16 @@ static int tcp_flow_dump_mss(const struct tcp_tap_conn *conn, struct tcp_tap_transfer_ext *t) { socklen_t sl = sizeof(t->mss); + int val; - if (getsockopt(conn->sock, SOL_TCP, TCP_MAXSEG, &t->mss, &sl)) { + if (getsockopt(conn->sock, SOL_TCP, TCP_MAXSEG, &val, &sl)) { int rc = -errno; flow_perror(conn, "Getting MSS"); return rc; } + t->mss = (uint32_t)val; + return 0; } @@ -3301,6 +3304,7 @@ int tcp_flow_migrate_source_ext(int fd, const struct tcp_tap_conn *conn) t->sndq = htonl(t->sndq); t->notsent = htonl(t->notsent); t->rcvq = htonl(t->rcvq); + t->mss = htonl(t->mss); t->snd_wl1 = htonl(t->snd_wl1); t->snd_wnd = htonl(t->snd_wnd); @@ -3514,6 +3518,7 @@ int tcp_flow_migrate_target_ext(struct ctx *c, struct tcp_tap_conn *conn, int fd t.sndq = ntohl(t.sndq); t.notsent = ntohl(t.notsent); t.rcvq = ntohl(t.rcvq); + t.mss = ntohl(t.mss); t.snd_wl1 = ntohl(t.snd_wl1); t.snd_wnd = ntohl(t.snd_wnd); From cfb3740568ab291d7be00e457658c45ce9367ed5 Mon Sep 17 00:00:00 2001 From: David Gibson <david@gibson.dropbear.id.au> Date: Wed, 19 Mar 2025 16:14:22 +1100 Subject: [PATCH 294/382] migrate, tcp: Migrate RFC 7323 timestamp Currently our migration of the state of TCP sockets omits the RFC 7323 timestamp. In some circumstances that can result in data sent from the target machine not being received, because it is discarded on the peer due to PAWS checking. Add code to dump and restore the timestamp across migration. 
Link: https://bugs.passt.top/show_bug.cgi?id=115 Signed-off-by: David Gibson <david@gibson.dropbear.id.au> [sbrivio: Minor style fixes] Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- tcp.c | 59 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ tcp_conn.h | 2 ++ 2 files changed, 61 insertions(+) diff --git a/tcp.c b/tcp.c index 43ee76b..68af43d 100644 --- a/tcp.c +++ b/tcp.c @@ -2861,6 +2861,57 @@ static int tcp_flow_dump_mss(const struct tcp_tap_conn *conn, return 0; } + +/** + * tcp_flow_dump_timestamp() - Dump RFC 7323 timestamp via TCP_TIMESTAMP + * @conn: Pointer to the TCP connection structure + * @t: Extended migration data (tcpi_options must be populated) + * + * Return: 0 on success, negative error code on failure + */ +static int tcp_flow_dump_timestamp(const struct tcp_tap_conn *conn, + struct tcp_tap_transfer_ext *t) +{ + int val = 0; + + if (t->tcpi_options & TCPI_OPT_TIMESTAMPS) { + socklen_t sl = sizeof(val); + + if (getsockopt(conn->sock, SOL_TCP, TCP_TIMESTAMP, &val, &sl)) { + int rc = -errno; + flow_perror(conn, "Getting RFC 7323 timestamp"); + return rc; + } + } + + t->timestamp = (uint32_t)val; + return 0; +} + +/** + * tcp_flow_repair_timestamp() - Restore RFC 7323 timestamp via TCP_TIMESTAMP + * @conn: Pointer to the TCP connection structure + * @t: Extended migration data + * + * Return: 0 on success, negative error code on failure + */ +static int tcp_flow_repair_timestamp(const struct tcp_tap_conn *conn, + const struct tcp_tap_transfer_ext *t) +{ + int val = (int)t->timestamp; + + if (t->tcpi_options & TCPI_OPT_TIMESTAMPS) { + if (setsockopt(conn->sock, SOL_TCP, TCP_TIMESTAMP, + &val, sizeof(val))) { + int rc = -errno; + flow_perror(conn, "Setting RFC 7323 timestamp"); + return rc; + } + } + + return 0; +} + /** * tcp_flow_dump_wnd() - Dump current tcp_repair_window parameters * @conn: Pointer to the TCP connection structure @@ -3260,6 +3311,9 @@ int tcp_flow_migrate_source_ext(int fd, const struct tcp_tap_conn *conn) if ((rc = 
tcp_flow_dump_mss(conn, t))) goto fail; + if ((rc = tcp_flow_dump_timestamp(conn, t))) + goto fail; + if ((rc = tcp_flow_dump_wnd(conn, t))) goto fail; @@ -3305,6 +3359,7 @@ int tcp_flow_migrate_source_ext(int fd, const struct tcp_tap_conn *conn) t->notsent = htonl(t->notsent); t->rcvq = htonl(t->rcvq); t->mss = htonl(t->mss); + t->timestamp = htonl(t->timestamp); t->snd_wl1 = htonl(t->snd_wl1); t->snd_wnd = htonl(t->snd_wnd); @@ -3519,6 +3574,7 @@ int tcp_flow_migrate_target_ext(struct ctx *c, struct tcp_tap_conn *conn, int fd t.notsent = ntohl(t.notsent); t.rcvq = ntohl(t.rcvq); t.mss = ntohl(t.mss); + t.timestamp = ntohl(t.timestamp); t.snd_wl1 = ntohl(t.snd_wl1); t.snd_wnd = ntohl(t.snd_wnd); @@ -3561,6 +3617,9 @@ int tcp_flow_migrate_target_ext(struct ctx *c, struct tcp_tap_conn *conn, int fd /* We weren't able to create the socket, discard flow */ goto fail; + if (tcp_flow_repair_timestamp(conn, &t)) + goto fail; + if (tcp_flow_select_queue(conn, TCP_SEND_QUEUE)) goto fail; diff --git a/tcp_conn.h b/tcp_conn.h index 9126a36..35d813d 100644 --- a/tcp_conn.h +++ b/tcp_conn.h @@ -152,6 +152,7 @@ struct tcp_tap_transfer { * @notsent: Part of pending send queue that wasn't sent out yet * @rcvq: Length of pending receive queue * @mss: Socket-side MSS clamp + * @timestamp: RFC 7323 timestamp * @snd_wl1: Next sequence used in window probe (next sequence - 1) * @snd_wnd: Socket-side sending window * @max_window: Window clamp @@ -171,6 +172,7 @@ struct tcp_tap_transfer_ext { uint32_t rcvq; uint32_t mss; + uint32_t timestamp; /* We can't just use struct tcp_repair_window: we need network order */ uint32_t snd_wl1; From c250ffc5c11385d9618b3a8165e676d68d5cbfa2 Mon Sep 17 00:00:00 2001 From: David Gibson <david@gibson.dropbear.id.au> Date: Wed, 19 Mar 2025 16:14:23 +1100 Subject: [PATCH 295/382] migrate: Bump migration version number v1 of the migration stream format, had some flaws: it didn't properly handle endianness of the MSS field, and it didn't transfer the RFC7323 
timestamp. We've now fixed those bugs, but it requires incompatible changes to the stream format. Because of the timestamps in particular, v1 is not really usable, so there is little point maintaining compatible support for it. However, v1 is in released packages, both upstream and downstream (RHEL at least). Just updating the stream format without bumping the version would lead to very cryptic errors if anyone did attempt to migrate between an old and new passt. So, bump the migration version to v2, so we'll get a clear error message if anyone attempts this. We don't attempt to maintain backwards compatibility with v1, however: we'll simply fail if given a v1 stream. Signed-off-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- migrate.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/migrate.c b/migrate.c index 0fca77b..48d63a0 100644 --- a/migrate.c +++ b/migrate.c @@ -96,8 +96,8 @@ static int seen_addrs_target_v1(struct ctx *c, return 0; } -/* Stages for version 1 */ -static const struct migrate_stage stages_v1[] = { +/* Stages for version 2 */ +static const struct migrate_stage stages_v2[] = { { .name = "observed addresses", .source = seen_addrs_source_v1, @@ -118,7 +118,11 @@ static const struct migrate_stage stages_v1[] = { /* Supported encoding versions, from latest (most preferred) to oldest */ static const struct migrate_version versions[] = { - { 1, stages_v1, }, + { 2, stages_v2, }, + /* v1 was released, but not widely used. It had bad endianness for the + * MSS and omitted timestamps, which meant it usually wouldn't work. + * Therefore we don't attempt to support compatibility with it. 
+ */ { 0 }, }; From ebdd46367ce1acba235013d97e362b8677b538d5 Mon Sep 17 00:00:00 2001 From: Stefano Brivio <sbrivio@redhat.com> Date: Wed, 19 Mar 2025 17:57:45 +0100 Subject: [PATCH 296/382] tcp: Flush socket before checking for more data in active close state MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Otherwise, if all the pending data is acknowledged: - tcp_update_seqack_from_tap() updates the current tap-side ACK sequence (conn->seq_ack_from_tap) - next, we compare the sequence we sent (conn->seq_to_tap) to the ACK sequence (conn->seq_ack_from_tap) in tcp_data_from_sock() to understand if there's more data we can send. If they match, we conclude that we haven't sent any of that data, and keep re-sending it. We need, instead, to flush the socket (drop acknowledged data) before calling tcp_update_seqack_from_tap(), so that once we update conn->seq_ack_from_tap, we can be sure that all data until there is gone from the socket. Link: https://bugs.passt.top/show_bug.cgi?id=114 Reported-by: Marek Marczykowski-Górecki <marmarek@invisiblethingslab.com> Fixes: 30f1e082c3c0 ("tcp: Keep updating window and checking for socket data after FIN from guest") Signed-off-by: Stefano Brivio <sbrivio@redhat.com> Reviewed-by: David Gibson <david@gibson.dropbear.id.au> --- tcp.c | 1 + 1 file changed, 1 insertion(+) diff --git a/tcp.c b/tcp.c index 68af43d..fa1d885 100644 --- a/tcp.c +++ b/tcp.c @@ -2049,6 +2049,7 @@ int tcp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af, /* Established connections not accepting data from tap */ if (conn->events & TAP_FIN_RCVD) { + tcp_sock_consume(conn, ntohl(th->ack_seq)); tcp_update_seqack_from_tap(c, conn, ntohl(th->ack_seq)); tcp_tap_window_update(conn, ntohs(th->window)); tcp_data_from_sock(c, conn); From 07c2d584b334b0c405a5702a4f2fad104d03940b Mon Sep 17 00:00:00 2001 From: Stefano Brivio <sbrivio@redhat.com> Date: Wed, 19 Mar 2025 20:43:47 +0100 Subject: [PATCH 297/382] conf: 
Include libgen.h for basename(), fix build against musl Fixes: 4b17d042c7e4 ("conf: Move mode detection into helper function") Signed-off-by: Stefano Brivio <sbrivio@redhat.com> Reviewed-by: David Gibson <david@gibson.dropbear.id.au> --- conf.c | 1 + 1 file changed, 1 insertion(+) diff --git a/conf.c b/conf.c index 0e2e8dc..b54c55d 100644 --- a/conf.c +++ b/conf.c @@ -16,6 +16,7 @@ #include <errno.h> #include <fcntl.h> #include <getopt.h> +#include <libgen.h> #include <string.h> #include <sched.h> #include <sys/types.h> From 32f6212551c5db3b7b3548e8483e5d73f07a35ac Mon Sep 17 00:00:00 2001 From: Stefano Brivio <sbrivio@redhat.com> Date: Wed, 19 Mar 2025 20:45:12 +0100 Subject: [PATCH 298/382] Makefile: Enable -Wformat-security It looks like an easy win to prevent a number of possible security flaws. Suggested-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> Reviewed-by: David Gibson <david@gibson.dropbear.id.au> --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index f2ac8e5..31cbac3 100644 --- a/Makefile +++ b/Makefile @@ -29,7 +29,7 @@ ifeq ($(shell $(CC) -O2 -dM -E - < /dev/null 2>&1 | grep ' _FORTIFY_SOURCE ' > / FORTIFY_FLAG := -D_FORTIFY_SOURCE=2 endif -FLAGS := -Wall -Wextra -Wno-format-zero-length +FLAGS := -Wall -Wextra -Wno-format-zero-length -Wformat-security FLAGS += -pedantic -std=c11 -D_XOPEN_SOURCE=700 -D_GNU_SOURCE FLAGS += $(FORTIFY_FLAG) -O2 -pie -fPIE FLAGS += -DPAGE_SIZE=$(shell getconf PAGE_SIZE) From 4592719a744bcb47db2ff5680be4b8f6362a97ce Mon Sep 17 00:00:00 2001 From: David Gibson <david@gibson.dropbear.id.au> Date: Mon, 17 Mar 2025 20:24:14 +1100 Subject: [PATCH 299/382] vu_common: Tighten vu_packet_check_range() This function verifies that the given packet is within the mmap()ed memory region of the vhost-user device. We can do better, however. 
The packet should be not only within the mmap()ed range, but
specifically in the subsection of that range set aside for shared
buffers, which starts at dev_region->mmap_offset within there.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 vu_common.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/vu_common.c b/vu_common.c
index 686a09b..9eea4f2 100644
--- a/vu_common.c
+++ b/vu_common.c
@@ -37,10 +37,10 @@ int vu_packet_check_range(void *buf, const char *ptr, size_t len)
 
 	for (dev_region = buf; dev_region->mmap_addr; dev_region++) {
 		/* NOLINTNEXTLINE(performance-no-int-to-ptr) */
-		char *m = (char *)(uintptr_t)dev_region->mmap_addr;
+		char *m = (char *)(uintptr_t)dev_region->mmap_addr +
+			  dev_region->mmap_offset;
 
-		if (m <= ptr &&
-		    ptr + len <= m + dev_region->mmap_offset + dev_region->size)
+		if (m <= ptr && ptr + len <= m + dev_region->size)
 			return 0;
 	}
 
From e43e00719d7701301e4bc4fb179dc7adff175409 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Mon, 17 Mar 2025 20:24:15 +1100
Subject: [PATCH 300/382] packet: More cautious checks to avoid pointer
 arithmetic UB

packet_check_range and vu_packet_check_range() verify that the packet or
section of packet we're interested in lies in the packet buffer pool we
expect it to. However, in doing so it doesn't avoid the possibility of
an integer overflow while performing pointer arithmetic, which is UB. In
fact, AFAICT it's UB even to use arbitrary pointer arithmetic to
construct a pointer outside of a known valid buffer.

To do this safely, we can't calculate the end of a memory region with
pointer addition when the length is untrusted. Instead we must work out
the offset of one memory region within another using pointer
subtraction, then do integer checks against the length of the outer
region. We then need to be careful about the order of checks so that
those integer checks can't themselves overflow.
Signed-off-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- packet.c | 12 +++++++++--- vu_common.c | 10 +++++++--- 2 files changed, 16 insertions(+), 6 deletions(-) diff --git a/packet.c b/packet.c index bcac037..d1a51a5 100644 --- a/packet.c +++ b/packet.c @@ -52,9 +52,15 @@ static int packet_check_range(const struct pool *p, const char *ptr, size_t len, return -1; } - if (ptr + len > p->buf + p->buf_size) { - trace("packet range end %p after buffer end %p, %s:%i", - (void *)(ptr + len), (void *)(p->buf + p->buf_size), + if (len > p->buf_size) { + trace("packet range length %zu larger than buffer %zu, %s:%i", + len, p->buf_size, func, line); + return -1; + } + + if ((size_t)(ptr - p->buf) > p->buf_size - len) { + trace("packet range %p, len %zu after buffer end %p, %s:%i", + (void *)ptr, len, (void *)(p->buf + p->buf_size), func, line); return -1; } diff --git a/vu_common.c b/vu_common.c index 9eea4f2..cefe5e2 100644 --- a/vu_common.c +++ b/vu_common.c @@ -36,11 +36,15 @@ int vu_packet_check_range(void *buf, const char *ptr, size_t len) struct vu_dev_region *dev_region; for (dev_region = buf; dev_region->mmap_addr; dev_region++) { - /* NOLINTNEXTLINE(performance-no-int-to-ptr) */ - char *m = (char *)(uintptr_t)dev_region->mmap_addr + + uintptr_t base_addr = dev_region->mmap_addr + dev_region->mmap_offset; + /* NOLINTNEXTLINE(performance-no-int-to-ptr) */ + const char *base = (const char *)base_addr; - if (m <= ptr && ptr + len <= m + dev_region->size) + ASSERT(base_addr >= dev_region->mmap_addr); + + if (len <= dev_region->size && base <= ptr && + (size_t)(ptr - base) <= dev_region->size - len) return 0; } From a41d6d125eca5ac8c54bed8157098be141557b03 Mon Sep 17 00:00:00 2001 From: David Gibson <david@gibson.dropbear.id.au> Date: Mon, 17 Mar 2025 20:24:16 +1100 Subject: [PATCH 301/382] tap: Make size of pool_tap[46] purely a tuning parameter Currently we attempt to size pool_tap[46] so they have room for the 
maximum possible number of packets that could fit in pkt_buf (TAP_MSGS).

However, the calculation isn't quite correct: TAP_MSGS is based on
ETH_ZLEN (60) as the minimum possible L2 frame size. But ETH_ZLEN is
based on physical constraints of Ethernet, which don't apply to our
virtual devices. It is possible to generate a legitimate frame smaller
than this, for example an empty payload UDP/IPv4 frame on the 'pasta'
backend is only 42 bytes long.

Furthermore, the same limit applies for vhost-user, which is not limited
by the size of pkt_buf like the other backends. In that case we don't
even have full control of the maximum buffer size, so we can't really
calculate how many packets could fit in there.

If we do exceed TAP_MSGS we'll drop packets, not just use more batches,
which is moderately bad. The fact that this needs to be sized just so
for correctness not merely for tuning is a fairly non-obvious coupling
between different parts of the code.

To make this more robust, alter the tap code so it doesn't rely on
everything fitting in a single batch of TAP_MSGS packets, instead
breaking into multiple batches as necessary.

This leaves TAP_MSGS as purely a tuning parameter, which we can freely
adjust based on performance measures.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 packet.c    | 13 ++++++++++++-
 packet.h    |  3 +++
 passt.h     |  2 --
 tap.c       | 19 ++++++++++++++++---
 tap.h       |  3 ++-
 vu_common.c |  5 +++--
 6 files changed, 36 insertions(+), 9 deletions(-)

diff --git a/packet.c b/packet.c
index d1a51a5..08076d5 100644
--- a/packet.c
+++ b/packet.c
@@ -67,6 +67,17 @@ static int packet_check_range(const struct pool *p, const char *ptr, size_t len,
 	return 0;
 }
 
+/**
+ * pool_full() - Is a packet pool full?
+ * @p: Pointer to packet pool + * + * Return: true if the pool is full, false if more packets can be added + */ +bool pool_full(const struct pool *p) +{ + return p->count >= p->size; +} + /** * packet_add_do() - Add data as packet descriptor to given pool * @p: Existing pool @@ -80,7 +91,7 @@ void packet_add_do(struct pool *p, size_t len, const char *start, { size_t idx = p->count; - if (idx >= p->size) { + if (pool_full(p)) { trace("add packet index %zu to pool with size %zu, %s:%i", idx, p->size, func, line); return; diff --git a/packet.h b/packet.h index d099f02..dd18461 100644 --- a/packet.h +++ b/packet.h @@ -6,6 +6,8 @@ #ifndef PACKET_H #define PACKET_H +#include <stdbool.h> + /* Maximum size of a single packet stored in pool, including headers */ #define PACKET_MAX_LEN UINT16_MAX @@ -33,6 +35,7 @@ void packet_add_do(struct pool *p, size_t len, const char *start, void *packet_get_do(const struct pool *p, const size_t idx, size_t offset, size_t len, size_t *left, const char *func, int line); +bool pool_full(const struct pool *p); void pool_flush(struct pool *p); #define packet_add(p, len, start) \ diff --git a/passt.h b/passt.h index 8f45091..8693794 100644 --- a/passt.h +++ b/passt.h @@ -71,8 +71,6 @@ static_assert(sizeof(union epoll_ref) <= sizeof(union epoll_data), /* Large enough for ~128 maximum size frames */ #define PKT_BUF_BYTES (8UL << 20) -#define TAP_MSGS \ - DIV_ROUND_UP(PKT_BUF_BYTES, ETH_ZLEN - 2 * ETH_ALEN + sizeof(uint32_t)) extern char pkt_buf [PKT_BUF_BYTES]; diff --git a/tap.c b/tap.c index 182a115..34e6774 100644 --- a/tap.c +++ b/tap.c @@ -75,6 +75,9 @@ CHECK_FRAME_LEN(L2_MAX_LEN_PASTA); CHECK_FRAME_LEN(L2_MAX_LEN_PASST); CHECK_FRAME_LEN(L2_MAX_LEN_VU); +#define TAP_MSGS \ + DIV_ROUND_UP(sizeof(pkt_buf), ETH_ZLEN - 2 * ETH_ALEN + sizeof(uint32_t)) + /* IPv4 (plus ARP) and IPv6 message batches from tap/guest to IP handlers */ static PACKET_POOL_NOINIT(pool_tap4, TAP_MSGS, pkt_buf); static PACKET_POOL_NOINIT(pool_tap6, TAP_MSGS, pkt_buf); 
@@ -1042,8 +1045,10 @@ void tap_handler(struct ctx *c, const struct timespec *now) * @c: Execution context * @l2len: Total L2 packet length * @p: Packet buffer + * @now: Current timestamp */ -void tap_add_packet(struct ctx *c, ssize_t l2len, char *p) +void tap_add_packet(struct ctx *c, ssize_t l2len, char *p, + const struct timespec *now) { const struct ethhdr *eh; @@ -1059,9 +1064,17 @@ void tap_add_packet(struct ctx *c, ssize_t l2len, char *p) switch (ntohs(eh->h_proto)) { case ETH_P_ARP: case ETH_P_IP: + if (pool_full(pool_tap4)) { + tap4_handler(c, pool_tap4, now); + pool_flush(pool_tap4); + } packet_add(pool_tap4, l2len, p); break; case ETH_P_IPV6: + if (pool_full(pool_tap6)) { + tap6_handler(c, pool_tap6, now); + pool_flush(pool_tap6); + } packet_add(pool_tap6, l2len, p); break; default: @@ -1142,7 +1155,7 @@ static void tap_passt_input(struct ctx *c, const struct timespec *now) p += sizeof(uint32_t); n -= sizeof(uint32_t); - tap_add_packet(c, l2len, p); + tap_add_packet(c, l2len, p, now); p += l2len; n -= l2len; @@ -1207,7 +1220,7 @@ static void tap_pasta_input(struct ctx *c, const struct timespec *now) len > (ssize_t)L2_MAX_LEN_PASTA) continue; - tap_add_packet(c, len, pkt_buf + n); + tap_add_packet(c, len, pkt_buf + n, now); } tap_handler(c, now); diff --git a/tap.h b/tap.h index dd39fd8..6fe3d15 100644 --- a/tap.h +++ b/tap.h @@ -119,6 +119,7 @@ void tap_sock_update_pool(void *base, size_t size); void tap_backend_init(struct ctx *c); void tap_flush_pools(void); void tap_handler(struct ctx *c, const struct timespec *now); -void tap_add_packet(struct ctx *c, ssize_t l2len, char *p); +void tap_add_packet(struct ctx *c, ssize_t l2len, char *p, + const struct timespec *now); #endif /* TAP_H */ diff --git a/vu_common.c b/vu_common.c index cefe5e2..5e6fd4a 100644 --- a/vu_common.c +++ b/vu_common.c @@ -195,7 +195,7 @@ static void vu_handle_tx(struct vu_dev *vdev, int index, tap_add_packet(vdev->context, elem[count].out_sg[0].iov_len - hdrlen, (char 
*)elem[count].out_sg[0].iov_base + - hdrlen); + hdrlen, now); } else { /* vnet header can be in a separate iovec */ if (elem[count].out_num != 2) { @@ -207,7 +207,8 @@ static void vu_handle_tx(struct vu_dev *vdev, int index, } else { tap_add_packet(vdev->context, elem[count].out_sg[1].iov_len, - (char *)elem[count].out_sg[1].iov_base); + (char *)elem[count].out_sg[1].iov_base, + now); } } From 9866d146e654975dd7f5fd3f1294d5fc4628cef3 Mon Sep 17 00:00:00 2001 From: David Gibson <david@gibson.dropbear.id.au> Date: Mon, 17 Mar 2025 20:24:17 +1100 Subject: [PATCH 302/382] tap: Clarify calculation of TAP_MSGS The rationale behind the calculation of TAP_MSGS isn't necessarily obvious. It's supposed to be the maximum number of packets that can fit in pkt_buf. However, the calculation is wrong in several ways: * It's based on ETH_ZLEN which isn't meaningful for virtual devices * It always includes the qemu socket header which isn't used for pasta * The size of pkt_buf isn't relevant for vhost-user We've already made sure this is just a tuning parameter, not a hard limit. Clarify what we're calculating here and why. Signed-off-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- tap.c | 28 ++++++++++++++++++++++------ 1 file changed, 22 insertions(+), 6 deletions(-) diff --git a/tap.c b/tap.c index 34e6774..3a6fcbe 100644 --- a/tap.c +++ b/tap.c @@ -75,12 +75,28 @@ CHECK_FRAME_LEN(L2_MAX_LEN_PASTA); CHECK_FRAME_LEN(L2_MAX_LEN_PASST); CHECK_FRAME_LEN(L2_MAX_LEN_VU); -#define TAP_MSGS \ - DIV_ROUND_UP(sizeof(pkt_buf), ETH_ZLEN - 2 * ETH_ALEN + sizeof(uint32_t)) +/* We try size the packet pools so that we can use a single batch for the entire + * packet buffer. This might be exceeded for vhost-user, though, which uses its + * own buffers rather than pkt_buf. + * + * This is just a tuning parameter, the code will work with slightly more + * overhead if it's incorrect. 
So, we estimate based on the minimum practical + * frame size - an empty UDP datagram - rather than the minimum theoretical + * frame size. + * + * FIXME: Profile to work out how big this actually needs to be to amortise + * per-batch syscall overheads + */ +#define TAP_MSGS_IP4 \ + DIV_ROUND_UP(sizeof(pkt_buf), \ + ETH_HLEN + sizeof(struct iphdr) + sizeof(struct udphdr)) +#define TAP_MSGS_IP6 \ + DIV_ROUND_UP(sizeof(pkt_buf), \ + ETH_HLEN + sizeof(struct ipv6hdr) + sizeof(struct udphdr)) /* IPv4 (plus ARP) and IPv6 message batches from tap/guest to IP handlers */ -static PACKET_POOL_NOINIT(pool_tap4, TAP_MSGS, pkt_buf); -static PACKET_POOL_NOINIT(pool_tap6, TAP_MSGS, pkt_buf); +static PACKET_POOL_NOINIT(pool_tap4, TAP_MSGS_IP4, pkt_buf); +static PACKET_POOL_NOINIT(pool_tap6, TAP_MSGS_IP6, pkt_buf); #define TAP_SEQS 128 /* Different L4 tuples in one batch */ #define FRAGMENT_MSG_RATE 10 /* # seconds between fragment warnings */ @@ -1418,8 +1434,8 @@ void tap_sock_update_pool(void *base, size_t size) { int i; - pool_tap4_storage = PACKET_INIT(pool_tap4, TAP_MSGS, base, size); - pool_tap6_storage = PACKET_INIT(pool_tap6, TAP_MSGS, base, size); + pool_tap4_storage = PACKET_INIT(pool_tap4, TAP_MSGS_IP4, base, size); + pool_tap6_storage = PACKET_INIT(pool_tap6, TAP_MSGS_IP6, base, size); for (i = 0; i < TAP_SEQS; i++) { tap4_l4[i].p = PACKET_INIT(pool_l4, UIO_MAXIOV, base, size); From c48331ca51399fe1779529511be395b576aaf0af Mon Sep 17 00:00:00 2001 From: David Gibson <david@gibson.dropbear.id.au> Date: Mon, 17 Mar 2025 20:24:18 +1100 Subject: [PATCH 303/382] packet: Correct type of PACKET_MAX_LEN PACKET_MAX_LEN is usually involved in calculations on size_t values - the type of the iov_len field in struct iovec. However, being defined bare as UINT16_MAX, the compiled is likely to assign it a shorter type. This can lead to unexpected promotions (or lack thereof). Add a cast to force the type to be what we expect. 
Fixes: c43972ad6 ("packet: Give explicit name to maximum packet size") Signed-off-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- packet.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packet.h b/packet.h index dd18461..9061dad 100644 --- a/packet.h +++ b/packet.h @@ -9,7 +9,7 @@ #include <stdbool.h> /* Maximum size of a single packet stored in pool, including headers */ -#define PACKET_MAX_LEN UINT16_MAX +#define PACKET_MAX_LEN ((size_t)UINT16_MAX) /** * struct pool - Generic pool of packets stored in a buffer From 37d9f374d9f0c47c092f80a5d85d4505ae4a9af7 Mon Sep 17 00:00:00 2001 From: David Gibson <david@gibson.dropbear.id.au> Date: Mon, 17 Mar 2025 20:24:19 +1100 Subject: [PATCH 304/382] packet: Avoid integer overflows in packet_get_do() In packet_get_do() both offset and len are essentially untrusted. We do some validation of len (check it's < PACKET_MAX_LEN), but that's not enough to ensure that (len + offset) doesn't overflow. Rearrange our calculation to make sure it's safe regardless of the given offset & len values. 
Signed-off-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- packet.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/packet.c b/packet.c index 08076d5..fdc4be7 100644 --- a/packet.c +++ b/packet.c @@ -144,7 +144,8 @@ void *packet_get_do(const struct pool *p, size_t idx, size_t offset, return NULL; } - if (len + offset > p->pkt[idx].iov_len) { + if (offset > p->pkt[idx].iov_len || + len > (p->pkt[idx].iov_len - offset)) { if (func) { trace("data length %zu, offset %zu from length %zu, " "%s:%i", len, offset, p->pkt[idx].iov_len, From 961aa6a0eb7fce956a34f8ccd883bfe12392d3d3 Mon Sep 17 00:00:00 2001 From: David Gibson <david@gibson.dropbear.id.au> Date: Mon, 17 Mar 2025 20:24:20 +1100 Subject: [PATCH 305/382] packet: Move checks against PACKET_MAX_LEN to packet_check_range() Both the callers of packet_check_range() separately verify that the given length does not exceed PACKET_MAX_LEN. Fold that check into packet_check_range() instead. 
Signed-off-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- packet.c | 19 ++++++------------- 1 file changed, 6 insertions(+), 13 deletions(-) diff --git a/packet.c b/packet.c index fdc4be7..7cbe95d 100644 --- a/packet.c +++ b/packet.c @@ -35,6 +35,12 @@ static int packet_check_range(const struct pool *p, const char *ptr, size_t len, const char *func, int line) { + if (len > PACKET_MAX_LEN) { + trace("packet range length %zu (max %zu), %s:%i", + len, PACKET_MAX_LEN, func, line); + return -1; + } + if (p->buf_size == 0) { int ret; @@ -100,11 +106,6 @@ void packet_add_do(struct pool *p, size_t len, const char *start, if (packet_check_range(p, start, len, func, line)) return; - if (len > PACKET_MAX_LEN) { - trace("add packet length %zu, %s:%i", len, func, line); - return; - } - p->pkt[idx].iov_base = (void *)start; p->pkt[idx].iov_len = len; @@ -136,14 +137,6 @@ void *packet_get_do(const struct pool *p, size_t idx, size_t offset, return NULL; } - if (len > PACKET_MAX_LEN) { - if (func) { - trace("packet data length %zu, %s:%i", - len, func, line); - } - return NULL; - } - if (offset > p->pkt[idx].iov_len || len > (p->pkt[idx].iov_len - offset)) { if (func) { From 38bcce997763f2e0c4bb6c0a3926674317796544 Mon Sep 17 00:00:00 2001 From: David Gibson <david@gibson.dropbear.id.au> Date: Mon, 17 Mar 2025 20:24:21 +1100 Subject: [PATCH 306/382] packet: Rework packet_get() versus packet_get_try() Most failures of packet_get() indicate a serious problem, and log messages accordingly. However, a few callers expect failures here, because they're probing for a certain range which might or might not be in a packet. They use packet_get_try() which passes a NULL func to packet_get_do() to suppress the logging which is unwanted in this case. However, this doesn't just suppress the log when packet_get_do() finds the requested region isn't in the packet. 
It suppresses logging for all other errors too, which do indicate serious problems, even for the callers of packet_get_try(). Worse it will pass the NULL func on to packet_check_range() which doesn't expect it, meaning we'll get unhelpful messages from there if there is a failure. Fix this by making packet_get_try_do() the primary function which doesn't log for the case of a range outside the packet. packet_get_do() becomes a trivial wrapper around that which logs a message if packet_get_try_do() returns NULL. Signed-off-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- packet.c | 51 +++++++++++++++++++++++++++++++++++---------------- packet.h | 8 +++++--- 2 files changed, 40 insertions(+), 19 deletions(-) diff --git a/packet.c b/packet.c index 7cbe95d..b3e8c79 100644 --- a/packet.c +++ b/packet.c @@ -89,7 +89,7 @@ bool pool_full(const struct pool *p) * @p: Existing pool * @len: Length of new descriptor * @start: Start of data - * @func: For tracing: name of calling function, NULL means no trace() + * @func: For tracing: name of calling function * @line: For tracing: caller line of function call */ void packet_add_do(struct pool *p, size_t len, const char *start, @@ -113,39 +113,31 @@ void packet_add_do(struct pool *p, size_t len, const char *start, } /** - * packet_get_do() - Get data range from packet descriptor from given pool + * packet_get_try_do() - Get data range from packet descriptor from given pool * @p: Packet pool * @idx: Index of packet descriptor in pool * @offset: Offset of data range in packet descriptor * @len: Length of desired data range * @left: Length of available data after range, set on return, can be NULL - * @func: For tracing: name of calling function, NULL means no trace() + * @func: For tracing: name of calling function * @line: For tracing: caller line of function call * * Return: pointer to start of data range, NULL on invalid range or descriptor */ -void *packet_get_do(const struct 
pool *p, size_t idx, size_t offset, - size_t len, size_t *left, const char *func, int line) +void *packet_get_try_do(const struct pool *p, size_t idx, size_t offset, + size_t len, size_t *left, const char *func, int line) { char *ptr; if (idx >= p->size || idx >= p->count) { - if (func) { - trace("packet %zu from pool size: %zu, count: %zu, " - "%s:%i", idx, p->size, p->count, func, line); - } + trace("packet %zu from pool size: %zu, count: %zu, %s:%i", + idx, p->size, p->count, func, line); return NULL; } if (offset > p->pkt[idx].iov_len || - len > (p->pkt[idx].iov_len - offset)) { - if (func) { - trace("data length %zu, offset %zu from length %zu, " - "%s:%i", len, offset, p->pkt[idx].iov_len, - func, line); - } + len > (p->pkt[idx].iov_len - offset)) return NULL; - } ptr = (char *)p->pkt[idx].iov_base + offset; @@ -158,6 +150,33 @@ void *packet_get_do(const struct pool *p, size_t idx, size_t offset, return ptr; } +/** + * packet_get_do() - Get data range from packet descriptor from given pool + * @p: Packet pool + * @idx: Index of packet descriptor in pool + * @offset: Offset of data range in packet descriptor + * @len: Length of desired data range + * @left: Length of available data after range, set on return, can be NULL + * @func: For tracing: name of calling function + * @line: For tracing: caller line of function call + * + * Return: as packet_get_try_do() but log a trace message when returning NULL + */ +void *packet_get_do(const struct pool *p, const size_t idx, + size_t offset, size_t len, size_t *left, + const char *func, int line) +{ + void *r = packet_get_try_do(p, idx, offset, len, left, func, line); + + if (!r) { + trace("missing packet data length %zu, offset %zu from " + "length %zu, %s:%i", + len, offset, p->pkt[idx].iov_len, func, line); + } + + return r; +} + /** * pool_flush() - Flush a packet pool * @p: Pointer to packet pool diff --git a/packet.h b/packet.h index 9061dad..c94780a 100644 --- a/packet.h +++ b/packet.h @@ -32,6 +32,9 @@ struct 
pool { int vu_packet_check_range(void *buf, const char *ptr, size_t len); void packet_add_do(struct pool *p, size_t len, const char *start, const char *func, int line); +void *packet_get_try_do(const struct pool *p, const size_t idx, + size_t offset, size_t len, size_t *left, + const char *func, int line); void *packet_get_do(const struct pool *p, const size_t idx, size_t offset, size_t len, size_t *left, const char *func, int line); @@ -41,12 +44,11 @@ void pool_flush(struct pool *p); #define packet_add(p, len, start) \ packet_add_do(p, len, start, __func__, __LINE__) +#define packet_get_try(p, idx, offset, len, left) \ + packet_get_try_do(p, idx, offset, len, left, __func__, __LINE__) #define packet_get(p, idx, offset, len, left) \ packet_get_do(p, idx, offset, len, left, __func__, __LINE__) -#define packet_get_try(p, idx, offset, len, left) \ - packet_get_do(p, idx, offset, len, left, NULL, 0) - #define PACKET_POOL_DECL(_name, _size, _buf) \ struct _name ## _t { \ char *buf; \ From 9153aca15bc1150e450dd56e79bc035cc2dbf27c Mon Sep 17 00:00:00 2001 From: David Gibson <david@gibson.dropbear.id.au> Date: Mon, 17 Mar 2025 20:24:22 +1100 Subject: [PATCH 307/382] util: Add abort_with_msg() and ASSERT_WITH_MSG() helpers We already have the ASSERT() macro which will abort() passt based on a condition. It always has a fixed error message based on its location and the asserted expression. We have some upcoming cases where we want to customise the message when hitting an assert. Add abort_with_msg() and ASSERT_WITH_MSG() helpers to allow this. 
Signed-off-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- util.c | 19 +++++++++++++++++++ util.h | 25 ++++++++++--------------- 2 files changed, 29 insertions(+), 15 deletions(-) diff --git a/util.c b/util.c index 656e86a..b9a3d43 100644 --- a/util.c +++ b/util.c @@ -1017,3 +1017,22 @@ void encode_domain_name(char *buf, const char *domain_name) } p[i] = 0L; } + +/** + * abort_with_msg() - Print error message and abort + * @fmt: Format string + * @...: Format parameters + */ +void abort_with_msg(const char *fmt, ...) +{ + va_list ap; + + va_start(ap, fmt); + vlogmsg(true, false, LOG_CRIT, fmt, ap); + va_end(ap); + + /* This may actually cause a SIGSYS instead of SIGABRT, due to seccomp, + * but that will still get the job done. + */ + abort(); +} diff --git a/util.h b/util.h index 4d512fa..b1e7e79 100644 --- a/util.h +++ b/util.h @@ -61,27 +61,22 @@ #define STRINGIFY(x) #x #define STR(x) STRINGIFY(x) -#ifdef CPPCHECK_6936 +void abort_with_msg(const char *fmt, ...) + __attribute__((format(printf, 1, 2), noreturn)); + /* Some cppcheck versions get confused by aborts inside a loop, causing * it to give false positive uninitialised variable warnings later in * the function, because it doesn't realise the non-initialising path * already exited. See https://trac.cppcheck.net/ticket/13227 + * + * Therefore, avoid using the usual do while wrapper we use to force the macro + * to act like a single statement requiring a ';'. */ -#define ASSERT(expr) \ - ((expr) ? (void)0 : abort()) -#else +#define ASSERT_WITH_MSG(expr, ...) \ + ((expr) ? 
(void)0 : abort_with_msg(__VA_ARGS__)) #define ASSERT(expr) \ - do { \ - if (!(expr)) { \ - err("ASSERTION FAILED in %s (%s:%d): %s", \ - __func__, __FILE__, __LINE__, STRINGIFY(expr)); \ - /* This may actually SIGSYS, due to seccomp, \ - * but that will still get the job done \ - */ \ - abort(); \ - } \ - } while (0) -#endif + ASSERT_WITH_MSG((expr), "ASSSERTION FAILED in %s (%s:%d): %s", \ + __func__, __FILE__, __LINE__, STRINGIFY(expr)) #ifdef P_tmpdir #define TMPDIR P_tmpdir From 0857515c943d439eade80710c16f15f146dfa9e8 Mon Sep 17 00:00:00 2001 From: David Gibson <david@gibson.dropbear.id.au> Date: Mon, 17 Mar 2025 20:24:23 +1100 Subject: [PATCH 308/382] packet: ASSERT on signs of pool corruption If packet_check_range() fails in packet_get_try_do() we just return NULL. But this check only takes places after we've already validated the given range against the packet it's in. That means that if packet_check_range() fails, the packet pool is already in a corrupted state (we should have made strictly stronger checks when the packet was added). Simply returning NULL and logging a trace() level message isn't really adequate for that situation; ASSERT instead. Similarly we check the given idx against both p->count and p->size. The latter should be redundant, because count should always be <= size. If that's not the case then, again, the pool is already in a corrupted state and we may have overwritten unknown memory. Assert for this case too. 
Signed-off-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- packet.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/packet.c b/packet.c index b3e8c79..be28f27 100644 --- a/packet.c +++ b/packet.c @@ -129,9 +129,13 @@ void *packet_get_try_do(const struct pool *p, size_t idx, size_t offset, { char *ptr; - if (idx >= p->size || idx >= p->count) { - trace("packet %zu from pool size: %zu, count: %zu, %s:%i", - idx, p->size, p->count, func, line); + ASSERT_WITH_MSG(p->count <= p->size, + "Corrupt pool count: %zu, size: %zu, %s:%i", + p->count, p->size, func, line); + + if (idx >= p->count) { + trace("packet %zu from pool count: %zu, %s:%i", + idx, p->count, func, line); return NULL; } @@ -141,8 +145,8 @@ void *packet_get_try_do(const struct pool *p, size_t idx, size_t offset, ptr = (char *)p->pkt[idx].iov_base + offset; - if (packet_check_range(p, ptr, len, func, line)) - return NULL; + ASSERT_WITH_MSG(!packet_check_range(p, ptr, len, func, line), + "Corrupt packet pool, %s:%i", func, line); if (left) *left = p->pkt[idx].iov_len - offset - len; From cf4d3f05c9263d1b0a88dbbcf9e48d34cac6708e Mon Sep 17 00:00:00 2001 From: David Gibson <david@gibson.dropbear.id.au> Date: Mon, 17 Mar 2025 20:24:24 +1100 Subject: [PATCH 309/382] packet: Upgrade severity of most packet errors All errors from packet_range_check(), packet_add() and packet_get() are trace level. However, these are for the most part actual error conditions. They're states that should not happen, in many cases indicating a bug in the caller or elswhere. We don't promote these to err() or ASSERT() level, for fear of a localised bug on very specific input crashing the entire program, or flooding the logs, but we can at least upgrade them to debug level. 
Signed-off-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- packet.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/packet.c b/packet.c index be28f27..72c6158 100644 --- a/packet.c +++ b/packet.c @@ -36,7 +36,7 @@ static int packet_check_range(const struct pool *p, const char *ptr, size_t len, const char *func, int line) { if (len > PACKET_MAX_LEN) { - trace("packet range length %zu (max %zu), %s:%i", + debug("packet range length %zu (max %zu), %s:%i", len, PACKET_MAX_LEN, func, line); return -1; } @@ -47,25 +47,25 @@ static int packet_check_range(const struct pool *p, const char *ptr, size_t len, ret = vu_packet_check_range((void *)p->buf, ptr, len); if (ret == -1) - trace("cannot find region, %s:%i", func, line); + debug("cannot find region, %s:%i", func, line); return ret; } if (ptr < p->buf) { - trace("packet range start %p before buffer start %p, %s:%i", + debug("packet range start %p before buffer start %p, %s:%i", (void *)ptr, (void *)p->buf, func, line); return -1; } if (len > p->buf_size) { - trace("packet range length %zu larger than buffer %zu, %s:%i", + debug("packet range length %zu larger than buffer %zu, %s:%i", len, p->buf_size, func, line); return -1; } if ((size_t)(ptr - p->buf) > p->buf_size - len) { - trace("packet range %p, len %zu after buffer end %p, %s:%i", + debug("packet range %p, len %zu after buffer end %p, %s:%i", (void *)ptr, len, (void *)(p->buf + p->buf_size), func, line); return -1; @@ -98,7 +98,7 @@ void packet_add_do(struct pool *p, size_t len, const char *start, size_t idx = p->count; if (pool_full(p)) { - trace("add packet index %zu to pool with size %zu, %s:%i", + debug("add packet index %zu to pool with size %zu, %s:%i", idx, p->size, func, line); return; } @@ -134,7 +134,7 @@ void *packet_get_try_do(const struct pool *p, size_t idx, size_t offset, p->count, p->size, func, line); if (idx >= p->count) { - trace("packet %zu from pool count: 
%zu, %s:%i", + debug("packet %zu from pool count: %zu, %s:%i", idx, p->count, func, line); return NULL; } From 89b203b851f32a532cc0406cf26a1d24950a207c Mon Sep 17 00:00:00 2001 From: David Gibson <david@gibson.dropbear.id.au> Date: Wed, 26 Mar 2025 14:44:01 +1100 Subject: [PATCH 310/382] udp: Common invocation of udp_sock_errs() for vhost-user and "buf" paths The vhost-user and non-vhost-user paths for both udp_listen_sock_handler() and udp_reply_sock_handler() are more or less completely separate. Both, however, start with essentially the same invocation of udp_sock_errs(), so that can be made common. Signed-off-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- udp.c | 37 ++++++++++++++++++++----------------- udp_internal.h | 2 +- udp_vu.c | 15 --------------- 3 files changed, 21 insertions(+), 33 deletions(-) diff --git a/udp.c b/udp.c index 80520cb..4a06b16 100644 --- a/udp.c +++ b/udp.c @@ -585,7 +585,8 @@ static int udp_sock_recverr(const struct ctx *c, union epoll_ref ref) * * Return: Number of errors handled, or < 0 if we have an unrecoverable error */ -int udp_sock_errs(const struct ctx *c, union epoll_ref ref, uint32_t events) +static int udp_sock_errs(const struct ctx *c, union epoll_ref ref, + uint32_t events) { unsigned n_err = 0; socklen_t errlen; @@ -678,13 +679,6 @@ static void udp_buf_listen_sock_handler(const struct ctx *c, const socklen_t sasize = sizeof(udp_meta[0].s_in); int n, i; - if (udp_sock_errs(c, ref, events) < 0) { - err("UDP: Unrecoverable error on listening socket:" - " (%s port %hu)", pif_name(ref.udp.pif), ref.udp.port); - /* FIXME: what now? close/re-open socket? 
*/ - return; - } - if ((n = udp_sock_recv(c, ref.fd, events, udp_mh_recv)) <= 0) return; @@ -750,6 +744,13 @@ void udp_listen_sock_handler(const struct ctx *c, union epoll_ref ref, uint32_t events, const struct timespec *now) { + if (udp_sock_errs(c, ref, events) < 0) { + err("UDP: Unrecoverable error on listening socket:" + " (%s port %hu)", pif_name(ref.udp.pif), ref.udp.port); + /* FIXME: what now? close/re-open socket? */ + return; + } + if (c->mode == MODE_VU) { udp_vu_listen_sock_handler(c, ref, events, now); return; @@ -777,17 +778,8 @@ static void udp_buf_reply_sock_handler(const struct ctx *c, union epoll_ref ref, uint8_t topif = pif_at_sidx(tosidx); int n, i, from_s; - ASSERT(!c->no_udp && uflow); - from_s = uflow->s[ref.flowside.sidei]; - if (udp_sock_errs(c, ref, events) < 0) { - flow_err(uflow, "Unrecoverable error on reply socket"); - flow_err_details(uflow); - udp_flow_close(c, uflow); - return; - } - if ((n = udp_sock_recv(c, from_s, events, udp_mh_recv)) <= 0) return; @@ -825,6 +817,17 @@ static void udp_buf_reply_sock_handler(const struct ctx *c, union epoll_ref ref, void udp_reply_sock_handler(const struct ctx *c, union epoll_ref ref, uint32_t events, const struct timespec *now) { + struct udp_flow *uflow = udp_at_sidx(ref.flowside); + + ASSERT(!c->no_udp && uflow); + + if (udp_sock_errs(c, ref, events) < 0) { + flow_err(uflow, "Unrecoverable error on reply socket"); + flow_err_details(uflow); + udp_flow_close(c, uflow); + return; + } + if (c->mode == MODE_VU) { udp_vu_reply_sock_handler(c, ref, events, now); return; diff --git a/udp_internal.h b/udp_internal.h index 3b081f5..02724e5 100644 --- a/udp_internal.h +++ b/udp_internal.h @@ -30,5 +30,5 @@ size_t udp_update_hdr4(struct iphdr *ip4h, struct udp_payload_t *bp, size_t udp_update_hdr6(struct ipv6hdr *ip6h, struct udp_payload_t *bp, const struct flowside *toside, size_t dlen, bool no_udp_csum); -int udp_sock_errs(const struct ctx *c, union epoll_ref ref, uint32_t events); + #endif /* 
UDP_INTERNAL_H */ diff --git a/udp_vu.c b/udp_vu.c index c26a223..84f52af 100644 --- a/udp_vu.c +++ b/udp_vu.c @@ -227,12 +227,6 @@ void udp_vu_listen_sock_handler(const struct ctx *c, union epoll_ref ref, struct vu_virtq *vq = &vdev->vq[VHOST_USER_RX_QUEUE]; int i; - if (udp_sock_errs(c, ref, events) < 0) { - err("UDP: Unrecoverable error on listening socket:" - " (%s port %hu)", pif_name(ref.udp.pif), ref.udp.port); - return; - } - for (i = 0; i < UDP_MAX_FRAMES; i++) { const struct flowside *toside; union sockaddr_inany s_in; @@ -300,15 +294,6 @@ void udp_vu_reply_sock_handler(const struct ctx *c, union epoll_ref ref, struct vu_virtq *vq = &vdev->vq[VHOST_USER_RX_QUEUE]; int i; - ASSERT(!c->no_udp); - - if (udp_sock_errs(c, ref, events) < 0) { - flow_err(uflow, "Unrecoverable error on reply socket"); - flow_err_details(uflow); - udp_flow_close(c, uflow); - return; - } - for (i = 0; i < UDP_MAX_FRAMES; i++) { uint8_t topif = pif_at_sidx(tosidx); ssize_t dlen; From 5a977c2f4ee8926673554b2b456e7791962b2ce2 Mon Sep 17 00:00:00 2001 From: David Gibson <david@gibson.dropbear.id.au> Date: Wed, 26 Mar 2025 14:44:02 +1100 Subject: [PATCH 311/382] udp: Simplify checking of epoll event bits udp_{listen,reply}_sock_handler() can accept both EPOLLERR and EPOLLIN events. However, unlike most epoll event handlers we don't check the event bits right there. EPOLLERR is checked within udp_sock_errs() which we call unconditionally. Checking EPOLLIN is still more buried: it is checked within both udp_sock_recv() and udp_vu_sock_recv(). We can simplify the logic and pass less extraneous parameters around by moving the checking of the event bits to the top level event handlers. This makes udp_{buf,vu}_{listen,reply}_sock_handler() no longer general event handlers, but specific to EPOLLIN events, meaning new data. So, rename those functions to udp_{buf,vu}_{listen,reply}_sock_data() to better reflect their function. 
Signed-off-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- udp.c | 78 ++++++++++++++++++++++++-------------------------------- udp_vu.c | 25 +++++++----------- udp_vu.h | 8 +++--- 3 files changed, 47 insertions(+), 64 deletions(-) diff --git a/udp.c b/udp.c index 4a06b16..26a91c9 100644 --- a/udp.c +++ b/udp.c @@ -581,12 +581,10 @@ static int udp_sock_recverr(const struct ctx *c, union epoll_ref ref) * udp_sock_errs() - Process errors on a socket * @c: Execution context * @ref: epoll reference - * @events: epoll events bitmap * * Return: Number of errors handled, or < 0 if we have an unrecoverable error */ -static int udp_sock_errs(const struct ctx *c, union epoll_ref ref, - uint32_t events) +static int udp_sock_errs(const struct ctx *c, union epoll_ref ref) { unsigned n_err = 0; socklen_t errlen; @@ -595,9 +593,6 @@ static int udp_sock_errs(const struct ctx *c, union epoll_ref ref, ASSERT(!c->no_udp); - if (!(events & EPOLLERR)) - return 0; /* Nothing to do */ - /* Empty the error queue */ while ((rc = udp_sock_recverr(c, ref)) > 0) n_err += rc; @@ -630,15 +625,13 @@ static int udp_sock_errs(const struct ctx *c, union epoll_ref ref, * udp_sock_recv() - Receive datagrams from a socket * @c: Execution context * @s: Socket to receive from - * @events: epoll events bitmap * @mmh mmsghdr array to receive into * * Return: Number of datagrams received * * #syscalls recvmmsg arm:recvmmsg_time64 i686:recvmmsg_time64 */ -static int udp_sock_recv(const struct ctx *c, int s, uint32_t events, - struct mmsghdr *mmh) +static int udp_sock_recv(const struct ctx *c, int s, struct mmsghdr *mmh) { /* For not entirely clear reasons (data locality?) pasta gets better * throughput if we receive tap datagrams one at a atime. 
For small @@ -651,9 +644,6 @@ static int udp_sock_recv(const struct ctx *c, int s, uint32_t events, ASSERT(!c->no_udp); - if (!(events & EPOLLIN)) - return 0; - n = recvmmsg(s, mmh, n, 0, NULL); if (n < 0) { err_perror("Error receiving datagrams"); @@ -664,22 +654,20 @@ static int udp_sock_recv(const struct ctx *c, int s, uint32_t events, } /** - * udp_buf_listen_sock_handler() - Handle new data from socket + * udp_buf_listen_sock_data() - Handle new data from socket * @c: Execution context * @ref: epoll reference - * @events: epoll events bitmap * @now: Current timestamp * * #syscalls recvmmsg */ -static void udp_buf_listen_sock_handler(const struct ctx *c, - union epoll_ref ref, uint32_t events, - const struct timespec *now) +static void udp_buf_listen_sock_data(const struct ctx *c, union epoll_ref ref, + const struct timespec *now) { const socklen_t sasize = sizeof(udp_meta[0].s_in); int n, i; - if ((n = udp_sock_recv(c, ref.fd, events, udp_mh_recv)) <= 0) + if ((n = udp_sock_recv(c, ref.fd, udp_mh_recv)) <= 0) return; /* We divide datagrams into batches based on how we need to send them, @@ -744,33 +732,33 @@ void udp_listen_sock_handler(const struct ctx *c, union epoll_ref ref, uint32_t events, const struct timespec *now) { - if (udp_sock_errs(c, ref, events) < 0) { - err("UDP: Unrecoverable error on listening socket:" - " (%s port %hu)", pif_name(ref.udp.pif), ref.udp.port); - /* FIXME: what now? close/re-open socket? */ - return; + if (events & EPOLLERR) { + if (udp_sock_errs(c, ref) < 0) { + err("UDP: Unrecoverable error on listening socket:" + " (%s port %hu)", pif_name(ref.udp.pif), ref.udp.port); + /* FIXME: what now? close/re-open socket? 
*/ + return; + } } - if (c->mode == MODE_VU) { - udp_vu_listen_sock_handler(c, ref, events, now); - return; + if (events & EPOLLIN) { + if (c->mode == MODE_VU) + udp_vu_listen_sock_data(c, ref, now); + else + udp_buf_listen_sock_data(c, ref, now); } - - udp_buf_listen_sock_handler(c, ref, events, now); } /** - * udp_buf_reply_sock_handler() - Handle new data from flow specific socket + * udp_buf_reply_sock_data() - Handle new data from flow specific socket * @c: Execution context * @ref: epoll reference - * @events: epoll events bitmap * @now: Current timestamp * * #syscalls recvmmsg */ -static void udp_buf_reply_sock_handler(const struct ctx *c, union epoll_ref ref, - uint32_t events, - const struct timespec *now) +static void udp_buf_reply_sock_data(const struct ctx *c, union epoll_ref ref, + const struct timespec *now) { flow_sidx_t tosidx = flow_sidx_opposite(ref.flowside); const struct flowside *toside = flowside_at_sidx(tosidx); @@ -780,7 +768,7 @@ static void udp_buf_reply_sock_handler(const struct ctx *c, union epoll_ref ref, from_s = uflow->s[ref.flowside.sidei]; - if ((n = udp_sock_recv(c, from_s, events, udp_mh_recv)) <= 0) + if ((n = udp_sock_recv(c, from_s, udp_mh_recv)) <= 0) return; flow_trace(uflow, "Received %d datagrams on reply socket", n); @@ -821,19 +809,21 @@ void udp_reply_sock_handler(const struct ctx *c, union epoll_ref ref, ASSERT(!c->no_udp && uflow); - if (udp_sock_errs(c, ref, events) < 0) { - flow_err(uflow, "Unrecoverable error on reply socket"); - flow_err_details(uflow); - udp_flow_close(c, uflow); - return; + if (events & EPOLLERR) { + if (udp_sock_errs(c, ref) < 0) { + flow_err(uflow, "Unrecoverable error on reply socket"); + flow_err_details(uflow); + udp_flow_close(c, uflow); + return; + } } - if (c->mode == MODE_VU) { - udp_vu_reply_sock_handler(c, ref, events, now); - return; + if (events & EPOLLIN) { + if (c->mode == MODE_VU) + udp_vu_reply_sock_data(c, ref, now); + else + udp_buf_reply_sock_data(c, ref, now); } - - 
udp_buf_reply_sock_handler(c, ref, events, now); } /** diff --git a/udp_vu.c b/udp_vu.c index 84f52af..698667f 100644 --- a/udp_vu.c +++ b/udp_vu.c @@ -78,14 +78,12 @@ static int udp_vu_sock_info(int s, union sockaddr_inany *s_in) * udp_vu_sock_recv() - Receive datagrams from socket into vhost-user buffers * @c: Execution context * @s: Socket to receive from - * @events: epoll events bitmap * @v6: Set for IPv6 connections * @dlen: Size of received data (output) * * Return: Number of iov entries used to store the datagram */ -static int udp_vu_sock_recv(const struct ctx *c, int s, uint32_t events, - bool v6, ssize_t *dlen) +static int udp_vu_sock_recv(const struct ctx *c, int s, bool v6, ssize_t *dlen) { struct vu_dev *vdev = c->vdev; struct vu_virtq *vq = &vdev->vq[VHOST_USER_RX_QUEUE]; @@ -95,9 +93,6 @@ static int udp_vu_sock_recv(const struct ctx *c, int s, uint32_t events, ASSERT(!c->no_udp); - if (!(events & EPOLLIN)) - return 0; - /* compute L2 header length */ hdrlen = udp_vu_hdrlen(v6); @@ -214,14 +209,13 @@ static void udp_vu_csum(const struct flowside *toside, int iov_used) } /** - * udp_vu_listen_sock_handler() - Handle new data from socket + * udp_vu_listen_sock_data() - Handle new data from socket * @c: Execution context * @ref: epoll reference - * @events: epoll events bitmap * @now: Current timestamp */ -void udp_vu_listen_sock_handler(const struct ctx *c, union epoll_ref ref, - uint32_t events, const struct timespec *now) +void udp_vu_listen_sock_data(const struct ctx *c, union epoll_ref ref, + const struct timespec *now) { struct vu_dev *vdev = c->vdev; struct vu_virtq *vq = &vdev->vq[VHOST_USER_RX_QUEUE]; @@ -262,7 +256,7 @@ void udp_vu_listen_sock_handler(const struct ctx *c, union epoll_ref ref, v6 = !(inany_v4(&toside->eaddr) && inany_v4(&toside->oaddr)); - iov_used = udp_vu_sock_recv(c, ref.fd, events, v6, &dlen); + iov_used = udp_vu_sock_recv(c, ref.fd, v6, &dlen); if (iov_used <= 0) break; @@ -277,14 +271,13 @@ void 
udp_vu_listen_sock_handler(const struct ctx *c, union epoll_ref ref, } /** - * udp_vu_reply_sock_handler() - Handle new data from flow specific socket + * udp_vu_reply_sock_data() - Handle new data from flow specific socket * @c: Execution context * @ref: epoll reference - * @events: epoll events bitmap * @now: Current timestamp */ -void udp_vu_reply_sock_handler(const struct ctx *c, union epoll_ref ref, - uint32_t events, const struct timespec *now) +void udp_vu_reply_sock_data(const struct ctx *c, union epoll_ref ref, + const struct timespec *now) { flow_sidx_t tosidx = flow_sidx_opposite(ref.flowside); const struct flowside *toside = flowside_at_sidx(tosidx); @@ -313,7 +306,7 @@ void udp_vu_reply_sock_handler(const struct ctx *c, union epoll_ref ref, v6 = !(inany_v4(&toside->eaddr) && inany_v4(&toside->oaddr)); - iov_used = udp_vu_sock_recv(c, from_s, events, v6, &dlen); + iov_used = udp_vu_sock_recv(c, from_s, v6, &dlen); if (iov_used <= 0) break; flow_trace(uflow, "Received 1 datagram on reply socket"); diff --git a/udp_vu.h b/udp_vu.h index ba7018d..4f2262d 100644 --- a/udp_vu.h +++ b/udp_vu.h @@ -6,8 +6,8 @@ #ifndef UDP_VU_H #define UDP_VU_H -void udp_vu_listen_sock_handler(const struct ctx *c, union epoll_ref ref, - uint32_t events, const struct timespec *now); -void udp_vu_reply_sock_handler(const struct ctx *c, union epoll_ref ref, - uint32_t events, const struct timespec *now); +void udp_vu_listen_sock_data(const struct ctx *c, union epoll_ref ref, + const struct timespec *now); +void udp_vu_reply_sock_data(const struct ctx *c, union epoll_ref ref, + const struct timespec *now); #endif /* UDP_VU_H */ From d924b7dfc40cfaf9ebc64fe052efd8b0c45c6478 Mon Sep 17 00:00:00 2001 From: David Gibson <david@gibson.dropbear.id.au> Date: Wed, 26 Mar 2025 14:44:03 +1100 Subject: [PATCH 312/382] udp_vu: Factor things out of udp_vu_reply_sock_data() loop At the start of every cycle of the loop in udp_vu_reply_sock_data() we: - ASSERT that uflow is not NULL - Check if the 
target pif is PIF_TAP - Initialize the v6 boolean However, all of these depend only on the flow, which doesn't change across the loop. This is probably a duplication from udp_vu_listen_sock_data(), where the flow can be different for each packet. For the reply socket case, however, factor that logic out of the loop. Signed-off-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- udp_vu.c | 28 +++++++++++++--------------- 1 file changed, 13 insertions(+), 15 deletions(-) diff --git a/udp_vu.c b/udp_vu.c index 698667f..6e1823a 100644 --- a/udp_vu.c +++ b/udp_vu.c @@ -281,30 +281,28 @@ void udp_vu_reply_sock_data(const struct ctx *c, union epoll_ref ref, { flow_sidx_t tosidx = flow_sidx_opposite(ref.flowside); const struct flowside *toside = flowside_at_sidx(tosidx); + bool v6 = !(inany_v4(&toside->eaddr) && inany_v4(&toside->oaddr)); struct udp_flow *uflow = udp_at_sidx(ref.flowside); int from_s = uflow->s[ref.flowside.sidei]; struct vu_dev *vdev = c->vdev; struct vu_virtq *vq = &vdev->vq[VHOST_USER_RX_QUEUE]; + uint8_t topif = pif_at_sidx(tosidx); int i; + ASSERT(uflow); + + if (topif != PIF_TAP) { + uint8_t frompif = pif_at_sidx(ref.flowside); + + flow_err(uflow, + "No support for forwarding UDP from %s to %s", + pif_name(frompif), pif_name(topif)); + return; + } + for (i = 0; i < UDP_MAX_FRAMES; i++) { - uint8_t topif = pif_at_sidx(tosidx); ssize_t dlen; int iov_used; - bool v6; - - ASSERT(uflow); - - if (topif != PIF_TAP) { - uint8_t frompif = pif_at_sidx(ref.flowside); - - flow_err(uflow, - "No support for forwarding UDP from %s to %s", - pif_name(frompif), pif_name(topif)); - continue; - } - - v6 = !(inany_v4(&toside->eaddr) && inany_v4(&toside->oaddr)); iov_used = udp_vu_sock_recv(c, from_s, v6, &dlen); if (iov_used <= 0) From 269cf6a12a5f89683daa8da9232cc2524d7a4ae2 Mon Sep 17 00:00:00 2001 From: David Gibson <david@gibson.dropbear.id.au> Date: Wed, 26 Mar 2025 14:44:04 +1100 Subject: [PATCH 313/382] udp: Share 
more logic between vu and non-vu reply socket paths Share some additional miscellaneous logic between the vhost-user and "buf" paths for data on udp reply sockets. The biggest piece is error handling of cases where we can't forward between the two pifs of the flow. We also make common some more simple logic locating the correct flow and its parameters. This adds some lines of code due to extra comment lines, but nonetheless reduces logic duplication. Signed-off-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- udp.c | 41 ++++++++++++++++++++++++++--------------- udp_vu.c | 26 +++++++++++--------------- udp_vu.h | 3 ++- 3 files changed, 39 insertions(+), 31 deletions(-) diff --git a/udp.c b/udp.c index 26a91c9..f417cea 100644 --- a/udp.c +++ b/udp.c @@ -752,24 +752,25 @@ void udp_listen_sock_handler(const struct ctx *c, /** * udp_buf_reply_sock_data() - Handle new data from flow specific socket * @c: Execution context - * @ref: epoll reference + * @s: Socket to read data from + * @tosidx: Flow & side to forward data from @s to * @now: Current timestamp * + * Return: true on success, false if can't forward from socket to flow's pif + * * #syscalls recvmmsg */ -static void udp_buf_reply_sock_data(const struct ctx *c, union epoll_ref ref, +static bool udp_buf_reply_sock_data(const struct ctx *c, + int s, flow_sidx_t tosidx, const struct timespec *now) { - flow_sidx_t tosidx = flow_sidx_opposite(ref.flowside); const struct flowside *toside = flowside_at_sidx(tosidx); - struct udp_flow *uflow = udp_at_sidx(ref.flowside); + struct udp_flow *uflow = udp_at_sidx(tosidx); uint8_t topif = pif_at_sidx(tosidx); - int n, i, from_s; + int n, i; - from_s = uflow->s[ref.flowside.sidei]; - - if ((n = udp_sock_recv(c, from_s, udp_mh_recv)) <= 0) - return; + if ((n = udp_sock_recv(c, s, udp_mh_recv)) <= 0) + return true; flow_trace(uflow, "Received %d datagrams on reply socket", n); uflow->ts = now->tv_sec; @@ -788,11 +789,10 @@ 
static void udp_buf_reply_sock_data(const struct ctx *c, union epoll_ref ref, } else if (topif == PIF_TAP) { tap_send_frames(c, &udp_l2_iov[0][0], UDP_NUM_IOVS, n); } else { - uint8_t frompif = pif_at_sidx(ref.flowside); - - flow_err(uflow, "No support for forwarding UDP from %s to %s", - pif_name(frompif), pif_name(topif)); + return false; } + + return true; } /** @@ -819,10 +819,21 @@ void udp_reply_sock_handler(const struct ctx *c, union epoll_ref ref, } if (events & EPOLLIN) { + flow_sidx_t tosidx = flow_sidx_opposite(ref.flowside); + int s = ref.fd; + bool ret; + if (c->mode == MODE_VU) - udp_vu_reply_sock_data(c, ref, now); + ret = udp_vu_reply_sock_data(c, s, tosidx, now); else - udp_buf_reply_sock_data(c, ref, now); + ret = udp_buf_reply_sock_data(c, s, tosidx, now); + + if (!ret) { + flow_err(uflow, + "No support for forwarding UDP from %s to %s", + pif_name(pif_at_sidx(ref.flowside)), + pif_name(pif_at_sidx(tosidx))); + } } } diff --git a/udp_vu.c b/udp_vu.c index 6e1823a..06bdeae 100644 --- a/udp_vu.c +++ b/udp_vu.c @@ -273,38 +273,32 @@ void udp_vu_listen_sock_data(const struct ctx *c, union epoll_ref ref, /** * udp_vu_reply_sock_data() - Handle new data from flow specific socket * @c: Execution context - * @ref: epoll reference + * @s: Socket to read data from + * @tosidx: Flow & side to forward data from @s to * @now: Current timestamp + * + * Return: true on success, false if can't forward from socket to flow's pif */ -void udp_vu_reply_sock_data(const struct ctx *c, union epoll_ref ref, +bool udp_vu_reply_sock_data(const struct ctx *c, int s, flow_sidx_t tosidx, const struct timespec *now) { - flow_sidx_t tosidx = flow_sidx_opposite(ref.flowside); const struct flowside *toside = flowside_at_sidx(tosidx); bool v6 = !(inany_v4(&toside->eaddr) && inany_v4(&toside->oaddr)); - struct udp_flow *uflow = udp_at_sidx(ref.flowside); - int from_s = uflow->s[ref.flowside.sidei]; + struct udp_flow *uflow = udp_at_sidx(tosidx); struct vu_dev *vdev = c->vdev; 
struct vu_virtq *vq = &vdev->vq[VHOST_USER_RX_QUEUE]; - uint8_t topif = pif_at_sidx(tosidx); int i; ASSERT(uflow); - if (topif != PIF_TAP) { - uint8_t frompif = pif_at_sidx(ref.flowside); - - flow_err(uflow, - "No support for forwarding UDP from %s to %s", - pif_name(frompif), pif_name(topif)); - return; - } + if (pif_at_sidx(tosidx) != PIF_TAP) + return false; for (i = 0; i < UDP_MAX_FRAMES; i++) { ssize_t dlen; int iov_used; - iov_used = udp_vu_sock_recv(c, from_s, v6, &dlen); + iov_used = udp_vu_sock_recv(c, s, v6, &dlen); if (iov_used <= 0) break; flow_trace(uflow, "Received 1 datagram on reply socket"); @@ -318,4 +312,6 @@ void udp_vu_reply_sock_data(const struct ctx *c, union epoll_ref ref, } vu_flush(vdev, vq, elem, iov_used); } + + return true; } diff --git a/udp_vu.h b/udp_vu.h index 4f2262d..2299b51 100644 --- a/udp_vu.h +++ b/udp_vu.h @@ -8,6 +8,7 @@ void udp_vu_listen_sock_data(const struct ctx *c, union epoll_ref ref, const struct timespec *now); -void udp_vu_reply_sock_data(const struct ctx *c, union epoll_ref ref, +bool udp_vu_reply_sock_data(const struct ctx *c, int s, flow_sidx_t tosidx, const struct timespec *now); + #endif /* UDP_VU_H */ From f67c488b81ca2a4d9f819b625fceab10b71fc3a5 Mon Sep 17 00:00:00 2001 From: David Gibson <david@gibson.dropbear.id.au> Date: Wed, 26 Mar 2025 14:44:05 +1100 Subject: [PATCH 314/382] udp: Better handling of failure to forward from reply socket In udp_reply_sock_handler() if we're unable to forward the datagrams we just print an error. Generally this means we have an unsupported pair of pifs in the flow table, though, and that hasn't changed. So, next time we get a matching packet we'll just get the same failure. In vhost-user mode we don't even dequeue the incoming packets which triggered this so we're likely to get the same failure immediately. Instead, close the flow, in the same way we do for an unrecoverable error.
Signed-off-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- udp.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/udp.c b/udp.c index f417cea..96e48dd 100644 --- a/udp.c +++ b/udp.c @@ -812,9 +812,7 @@ void udp_reply_sock_handler(const struct ctx *c, union epoll_ref ref, if (events & EPOLLERR) { if (udp_sock_errs(c, ref) < 0) { flow_err(uflow, "Unrecoverable error on reply socket"); - flow_err_details(uflow); - udp_flow_close(c, uflow); - return; + goto fail; } } @@ -829,12 +827,15 @@ void udp_reply_sock_handler(const struct ctx *c, union epoll_ref ref, ret = udp_buf_reply_sock_data(c, s, tosidx, now); if (!ret) { - flow_err(uflow, - "No support for forwarding UDP from %s to %s", - pif_name(pif_at_sidx(ref.flowside)), - pif_name(pif_at_sidx(tosidx))); + flow_err(uflow, "Unable to forward UDP"); + goto fail; } } + return; + +fail: + flow_err_details(uflow); + udp_flow_close(c, uflow); } /** From 37d78c9ef3944c1b060e3e8259b82fea3f8ec6bf Mon Sep 17 00:00:00 2001 From: David Gibson <david@gibson.dropbear.id.au> Date: Wed, 26 Mar 2025 14:44:06 +1100 Subject: [PATCH 315/382] udp: Always hash socket facing flowsides For UDP packets from the tap interface (like TCP) we use a hash table to look up which flow they belong to. Unlike TCP, we sometimes also create a hash table entry for the socket side of UDP flows. We need that when we receive a UDP packet from a "listening" socket which isn't specific to a single flow. At present we only do this for the initiating side of flows, which re-use the listening socket. For the target side we use a connected "reply" socket specific to the single flow. We have in mind changes that may introduce some edge cases where we could receive UDP packets on a non flow specific socket more often. To allow for those changes - and slightly simplifying things in the meantime - always put both sides of a UDP flow - tap or socket - in the hash table.
It's not that costly, and means we always have the option of falling back to a hash lookup. Signed-off-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- udp_flow.c | 41 ++++++++++++++++++++--------------------- 1 file changed, 20 insertions(+), 21 deletions(-) diff --git a/udp_flow.c b/udp_flow.c index c6b8630..7e80924 100644 --- a/udp_flow.c +++ b/udp_flow.c @@ -41,25 +41,23 @@ struct udp_flow *udp_at_sidx(flow_sidx_t sidx) */ void udp_flow_close(const struct ctx *c, struct udp_flow *uflow) { + unsigned sidei; + if (uflow->closed) return; /* Nothing to do */ - if (uflow->s[INISIDE] >= 0) { - /* The listening socket needs to stay in epoll */ - close(uflow->s[INISIDE]); - uflow->s[INISIDE] = -1; + flow_foreach_sidei(sidei) { + flow_hash_remove(c, FLOW_SIDX(uflow, sidei)); + if (uflow->s[sidei] >= 0) { + /* The listening socket needs to stay in epoll, but the + * flow specific one needs to be removed */ + if (sidei == TGTSIDE) + epoll_del(c, uflow->s[sidei]); + close(uflow->s[sidei]); + uflow->s[sidei] = -1; + } } - if (uflow->s[TGTSIDE] >= 0) { - /* But the flow specific one needs to be removed */ - epoll_del(c, uflow->s[TGTSIDE]); - close(uflow->s[TGTSIDE]); - uflow->s[TGTSIDE] = -1; - } - flow_hash_remove(c, FLOW_SIDX(uflow, INISIDE)); - if (!pif_is_socket(uflow->f.pif[TGTSIDE])) - flow_hash_remove(c, FLOW_SIDX(uflow, TGTSIDE)); - uflow->closed = true; } @@ -77,6 +75,7 @@ static flow_sidx_t udp_flow_new(const struct ctx *c, union flow *flow, { struct udp_flow *uflow = NULL; const struct flowside *tgt; + unsigned sidei; uint8_t tgtpif; if (!(tgt = flow_target(c, flow, IPPROTO_UDP))) @@ -143,14 +142,14 @@ static flow_sidx_t udp_flow_new(const struct ctx *c, union flow *flow, } } - flow_hash_insert(c, FLOW_SIDX(uflow, INISIDE)); - - /* If the target side is a socket, it will be a reply socket that knows - * its own flowside. But if it's tap, then we need to look it up by - * hash. 
+ /* Tap sides always need to be looked up by hash. Socket sides don't + * always, but sometimes do (receiving packets on a socket not specific + * to one flow). Unconditionally hash both sides so all our bases are + * covered */ - if (!pif_is_socket(tgtpif)) - flow_hash_insert(c, FLOW_SIDX(uflow, TGTSIDE)); + flow_foreach_sidei(sidei) + flow_hash_insert(c, FLOW_SIDX(uflow, sidei)); + FLOW_ACTIVATE(uflow); return FLOW_SIDX(uflow, TGTSIDE); From 77883fbdd17e836247f746d888dcad3f611a6a59 Mon Sep 17 00:00:00 2001 From: David Gibson <david@gibson.dropbear.id.au> Date: Wed, 26 Mar 2025 14:44:07 +1100 Subject: [PATCH 316/382] udp: Add helper function for creating connected UDP socket Currently udp_flow_new() open codes creating and connecting a socket to use for reply messages. We have in mind some more places to use this logic, plus it just makes for a rather large function. Split this handling out into a new udp_flow_sock() function. Signed-off-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- udp_flow.c | 104 +++++++++++++++++++++++++++++------------------------ 1 file changed, 58 insertions(+), 46 deletions(-) diff --git a/udp_flow.c b/udp_flow.c index 7e80924..bf4b896 100644 --- a/udp_flow.c +++ b/udp_flow.c @@ -61,6 +61,61 @@ void udp_flow_close(const struct ctx *c, struct udp_flow *uflow) uflow->closed = true; } +/** + * udp_flow_sock() - Create, bind and connect a flow specific UDP socket + * @c: Execution context + * @uflow: UDP flow to open socket for + * @sidei: Side of @uflow to open socket for + * + * Return: fd of new socket on success, -ve error code on failure + */ +static int udp_flow_sock(const struct ctx *c, + const struct udp_flow *uflow, unsigned sidei) +{ + const struct flowside *side = &uflow->f.side[sidei]; + struct mmsghdr discard[UIO_MAXIOV] = { 0 }; + uint8_t pif = uflow->f.pif[sidei]; + union { + flow_sidx_t sidx; + uint32_t data; + } fref = { .sidx = FLOW_SIDX(uflow, sidei) }; + int rc, s; + 
+ s = flowside_sock_l4(c, EPOLL_TYPE_UDP_REPLY, pif, side, fref.data); + if (s < 0) { + flow_dbg_perror(uflow, "Couldn't open flow specific socket"); + return s; + } + + if (flowside_connect(c, s, pif, side) < 0) { + rc = -errno; + flow_dbg_perror(uflow, "Couldn't connect flow socket"); + return rc; + } + + /* It's possible, if unlikely, that we could receive some unrelated + * packets in between the bind() and connect() of this socket. For now + * we just discard these. + * + * FIXME: Redirect these to an appropriate handler + */ + rc = recvmmsg(s, discard, ARRAY_SIZE(discard), MSG_DONTWAIT, NULL); + if (rc >= ARRAY_SIZE(discard)) { + flow_dbg(uflow, "Too many (%d) spurious reply datagrams", rc); + return -E2BIG; + } + + if (rc > 0) { + flow_trace(uflow, "Discarded %d spurious reply datagrams", rc); + } else if (errno != EAGAIN) { + rc = -errno; + flow_perror(uflow, "Unexpected error discarding datagrams"); + return rc; + } + + return s; +} + /** * udp_flow_new() - Common setup for a new UDP flow * @c: Execution context @@ -74,13 +129,10 @@ static flow_sidx_t udp_flow_new(const struct ctx *c, union flow *flow, int s_ini, const struct timespec *now) { struct udp_flow *uflow = NULL; - const struct flowside *tgt; unsigned sidei; - uint8_t tgtpif; - if (!(tgt = flow_target(c, flow, IPPROTO_UDP))) + if (!flow_target(c, flow, IPPROTO_UDP)) goto cancel; - tgtpif = flow->f.pif[TGTSIDE]; uflow = FLOW_SET_TYPE(flow, FLOW_UDP, udp); uflow->ts = now->tv_sec; @@ -98,49 +150,9 @@ static flow_sidx_t udp_flow_new(const struct ctx *c, union flow *flow, } } - if (pif_is_socket(tgtpif)) { - struct mmsghdr discard[UIO_MAXIOV] = { 0 }; - union { - flow_sidx_t sidx; - uint32_t data; - } fref = { - .sidx = FLOW_SIDX(flow, TGTSIDE), - }; - int rc; - - uflow->s[TGTSIDE] = flowside_sock_l4(c, EPOLL_TYPE_UDP_REPLY, - tgtpif, tgt, fref.data); - if (uflow->s[TGTSIDE] < 0) { - flow_dbg_perror(uflow, - "Couldn't open socket for spliced flow"); + if (pif_is_socket(flow->f.pif[TGTSIDE])) + if 
((uflow->s[TGTSIDE] = udp_flow_sock(c, uflow, TGTSIDE)) < 0) goto cancel; - } - - if (flowside_connect(c, uflow->s[TGTSIDE], tgtpif, tgt) < 0) { - flow_dbg_perror(uflow, "Couldn't connect flow socket"); - goto cancel; - } - - /* It's possible, if unlikely, that we could receive some - * unrelated packets in between the bind() and connect() of this - * socket. For now we just discard these. We could consider - * trying to redirect these to an appropriate handler, if we - * need to. - */ - rc = recvmmsg(uflow->s[TGTSIDE], discard, ARRAY_SIZE(discard), - MSG_DONTWAIT, NULL); - if (rc >= ARRAY_SIZE(discard)) { - flow_dbg(uflow, - "Too many (%d) spurious reply datagrams", rc); - goto cancel; - } else if (rc > 0) { - flow_trace(uflow, - "Discarded %d spurious reply datagrams", rc); - } else if (errno != EAGAIN) { - flow_perror(uflow, - "Unexpected error discarding datagrams"); - } - } /* Tap sides always need to be looked up by hash. Socket sides don't * always, but sometimes do (receiving packets on a socket not specific From 664c588be752bf590adb55bf1f613d4a36f02e7c Mon Sep 17 00:00:00 2001 From: Julian Wundrak <julian@wundrak.net> Date: Wed, 26 Mar 2025 20:14:31 +0000 Subject: [PATCH 317/382] build: normalize arm targets MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Linux distributions use different dumpmachine outputs for the ARM architecture. arm, armv6l, armv7l. For the syscall annotation, these variants are standardized to “arm”. 
Link: https://bugs.passt.top/show_bug.cgi?id=117 Signed-off-by: Julian Wundrak <julian@wundrak.net> [sbrivio: Fix typo: assign from TARGET_ARCH, not from TARGET] Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- Makefile | 1 + 1 file changed, 1 insertion(+) diff --git a/Makefile b/Makefile index 31cbac3..3328f83 100644 --- a/Makefile +++ b/Makefile @@ -20,6 +20,7 @@ $(if $(TARGET),,$(error Failed to get target architecture)) # Get 'uname -m'-like architecture description for target TARGET_ARCH := $(firstword $(subst -, ,$(TARGET))) TARGET_ARCH := $(patsubst [:upper:],[:lower:],$(TARGET_ARCH)) +TARGET_ARCH := $(patsubst arm%,arm,$(TARGET_ARCH)) TARGET_ARCH := $(subst powerpc,ppc,$(TARGET_ARCH)) # On some systems enabling optimization also enables source fortification, From 65cca54be84ffc5d2e18fcb8229dcc9d1f229479 Mon Sep 17 00:00:00 2001 From: Jon Maloy <jmaloy@redhat.com> Date: Wed, 26 Mar 2025 11:59:02 -0400 Subject: [PATCH 318/382] udp: correct source address for ICMP messages While developing traceroute forwarding tap-to-sock we found that struct msghdr.msg_name for the ICMPs in the opposite direction always contains the destination address of the original UDP message, and not, as one might expect, the one of the host which created the error message. Study of the kernel code reveals that this address instead is appended as extra data after the received struct sock_extended_err area. We now change the ICMP receive code accordingly. 
Fixes: 55431f0077b6 ("udp: create and send ICMPv4 to local peer when applicable") Fixes: 68b04182e07d ("udp: create and send ICMPv6 to local peer when applicable") Signed-off-by: Jon Maloy <jmaloy@redhat.com> Reviewed-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- udp.c | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/udp.c b/udp.c index 96e48dd..0c223b4 100644 --- a/udp.c +++ b/udp.c @@ -510,10 +510,13 @@ static void udp_send_conn_fail_icmp6(const struct ctx *c, */ static int udp_sock_recverr(const struct ctx *c, union epoll_ref ref) { - const struct sock_extended_err *ee; + struct errhdr { + struct sock_extended_err ee; + union sockaddr_inany saddr; + }; + const struct errhdr *eh; const struct cmsghdr *hdr; - union sockaddr_inany saddr; - char buf[CMSG_SPACE(sizeof(*ee))]; + char buf[CMSG_SPACE(sizeof(struct errhdr))]; char data[ICMP6_MAX_DLEN]; int s = ref.fd; struct iovec iov = { @@ -521,8 +524,6 @@ static int udp_sock_recverr(const struct ctx *c, union epoll_ref ref) .iov_len = sizeof(data) }; struct msghdr mh = { - .msg_name = &saddr, - .msg_namelen = sizeof(saddr), .msg_iov = &iov, .msg_iovlen = 1, .msg_control = buf, @@ -553,7 +554,7 @@ static int udp_sock_recverr(const struct ctx *c, union epoll_ref ref) return -1; } - ee = (const struct sock_extended_err *)CMSG_DATA(hdr); + eh = (const struct errhdr *)CMSG_DATA(hdr); if (ref.type == EPOLL_TYPE_UDP_REPLY) { flow_sidx_t sidx = flow_sidx_opposite(ref.flowside); const struct flowside *toside = flowside_at_sidx(sidx); @@ -561,18 +562,19 @@ static int udp_sock_recverr(const struct ctx *c, union epoll_ref ref) if (hdr->cmsg_level == IPPROTO_IP) { dlen = MIN(dlen, ICMP4_MAX_DLEN); - udp_send_conn_fail_icmp4(c, ee, toside, saddr.sa4.sin_addr, + udp_send_conn_fail_icmp4(c, &eh->ee, toside, + eh->saddr.sa4.sin_addr, data, dlen); } else if (hdr->cmsg_level == IPPROTO_IPV6) { - udp_send_conn_fail_icmp6(c, ee, toside, - 
&saddr.sa6.sin6_addr, + udp_send_conn_fail_icmp6(c, &eh->ee, toside, + &eh->saddr.sa6.sin6_addr, data, dlen, sidx.flowi); } } else { trace("Ignoring received IP_RECVERR cmsg on listener socket"); } debug("%s error on UDP socket %i: %s", - str_ee_origin(ee), s, strerror_(ee->ee_errno)); + str_ee_origin(&eh->ee), s, strerror_(eh->ee.ee_errno)); return 1; } From 42a854a52b6fa2bbd70cbc0c7657c8a49a9c3d2d Mon Sep 17 00:00:00 2001 From: Stefano Brivio <sbrivio@redhat.com> Date: Fri, 28 Mar 2025 11:39:58 +1100 Subject: [PATCH 319/382] pasta, passt-repair: Support multiple events per read() in inotify handlers The current code assumes that we'll get one event per read() on inotify descriptors, but that's not the case, not from documentation, and not from reports. Add loops in the two inotify handlers we have, in pasta-specific code and passt-repair, to go through all the events we receive. Link: https://bugs.passt.top/show_bug.cgi?id=119 [dwg: Remove unnecessary buffer expansion, use strnlen instead of strlen to make Coverity happier] Signed-off-by: David Gibson <david@gibson.dropbear.id.au> [sbrivio: Add additional check on ev->name and ev->len in passt-repair] Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- passt-repair.c | 32 +++++++++++++++++++++++++------- pasta.c | 20 +++++++++++++------- 2 files changed, 38 insertions(+), 14 deletions(-) diff --git a/passt-repair.c b/passt-repair.c index 120f7aa..86f0293 100644 --- a/passt-repair.c +++ b/passt-repair.c @@ -111,14 +111,14 @@ int main(int argc, char **argv) } if ((sb.st_mode & S_IFMT) == S_IFDIR) { - char buf[sizeof(struct inotify_event) + NAME_MAX + 1]; + char buf[sizeof(struct inotify_event) + NAME_MAX + 1] + __attribute__ ((aligned(__alignof__(struct inotify_event)))); const struct inotify_event *ev; char path[PATH_MAX + 1]; + bool found = false; ssize_t n; int fd; - ev = (struct inotify_event *)buf; - if ((fd = inotify_init1(IN_CLOEXEC)) < 0) { fprintf(stderr, "inotify_init1: %i\n", errno); _exit(1); @@ 
-130,6 +130,8 @@ int main(int argc, char **argv) } do { + char *p; + n = read(fd, buf, sizeof(buf)); if (n < 0) { fprintf(stderr, "inotify read: %i", errno); @@ -138,11 +140,27 @@ int main(int argc, char **argv) if (n < (ssize_t)sizeof(*ev)) { fprintf(stderr, "Short inotify read: %zi", n); - _exit(1); + continue; } - } while (ev->len < REPAIR_EXT_LEN || - memcmp(ev->name + strlen(ev->name) - REPAIR_EXT_LEN, - REPAIR_EXT, REPAIR_EXT_LEN)); + + for (p = buf; p < buf + n; p += sizeof(*ev) + ev->len) { + ev = (const struct inotify_event *)p; + + if (ev->len >= REPAIR_EXT_LEN && + !memcmp(ev->name + + strnlen(ev->name, ev->len) - + REPAIR_EXT_LEN, + REPAIR_EXT, REPAIR_EXT_LEN)) { + found = true; + break; + } + } + } while (!found); + + if (ev->len > NAME_MAX + 1 || ev->name[ev->len] != '\0') { + fprintf(stderr, "Invalid filename from inotify\n"); + _exit(1); + } snprintf(path, sizeof(path), "%s/%s", argv[1], ev->name); if ((stat(path, &sb))) { diff --git a/pasta.c b/pasta.c index fa3e7de..017fa32 100644 --- a/pasta.c +++ b/pasta.c @@ -498,17 +498,23 @@ void pasta_netns_quit_init(const struct ctx *c) */ void pasta_netns_quit_inotify_handler(struct ctx *c, int inotify_fd) { - char buf[sizeof(struct inotify_event) + NAME_MAX + 1]; - const struct inotify_event *in_ev = (struct inotify_event *)buf; + char buf[sizeof(struct inotify_event) + NAME_MAX + 1] + __attribute__ ((aligned(__alignof__(struct inotify_event)))); + const struct inotify_event *ev; + ssize_t n; + char *p; - if (read(inotify_fd, buf, sizeof(buf)) < (ssize_t)sizeof(*in_ev)) + if ((n = read(inotify_fd, buf, sizeof(buf))) < (ssize_t)sizeof(*ev)) return; - if (strncmp(in_ev->name, c->netns_base, sizeof(c->netns_base))) - return; + for (p = buf; p < buf + n; p += sizeof(*ev) + ev->len) { + ev = (const struct inotify_event *)p; - info("Namespace %s is gone, exiting", c->netns_base); - _exit(EXIT_SUCCESS); + if (!strncmp(ev->name, c->netns_base, sizeof(c->netns_base))) { + info("Namespace %s is gone, exiting", 
c->netns_base); + _exit(EXIT_SUCCESS); + } + } } /** From 025a3c2686b06be3fd09e29b2e3408d2c4ad6239 Mon Sep 17 00:00:00 2001 From: David Gibson <david@gibson.dropbear.id.au> Date: Fri, 28 Mar 2025 14:34:14 +1100 Subject: [PATCH 320/382] udp: Don't attempt to forward ICMP socket errors to other sockets Recently we added support for detecting ICMP triggered errors on UDP sockets and forwarding them to the tap interface. However, in udp_sock_recverr() where this is handled we don't know for certain that the tap interface is the other side of the UDP flow. It could be a spliced connection with another socket on the other side. To forward errors in that case, we'd need to force the other side's socket to issue an ICMP error. I'm not sure if there's a way to do that; probably not for an arbitrary ICMP but it might be possible for certain error conditions. Nonetheless what we do now - synthesise an ICMP on the tap interface - is certainly wrong. It's probably harmless; for a spliced connection it will have loopback addresses meaning we can expect the guest to discard it. But, correct this for now, by not attempting to propagate errors when the other side of the flow is a socket. Fixes: 55431f0077b6 ("udp: create and send ICMPv4 to local peer when applicable") Fixes: 68b04182e07d ("udp: create and send ICMPv6 to local peer when applicable") Signed-off-by: David Gibson <david@gibson.dropbear.id.au> Acked-by: Jon Maloy <jmaloy@redhat.com> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- udp.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/udp.c b/udp.c index 0c223b4..e410f55 100644 --- a/udp.c +++ b/udp.c @@ -560,7 +560,10 @@ static int udp_sock_recverr(const struct ctx *c, union epoll_ref ref) const struct flowside *toside = flowside_at_sidx(sidx); size_t dlen = rc; - if (hdr->cmsg_level == IPPROTO_IP) { + if (pif_is_socket(pif_at_sidx(sidx))) { + /* XXX Is there any way to propagate ICMPs from socket + * to socket? 
*/ + } else if (hdr->cmsg_level == IPPROTO_IP) { dlen = MIN(dlen, ICMP4_MAX_DLEN); udp_send_conn_fail_icmp4(c, &eh->ee, toside, eh->saddr.sa4.sin_addr, From 3de5af6e4145c6971be2597d7fb0386332d44a45 Mon Sep 17 00:00:00 2001 From: David Gibson <david@gibson.dropbear.id.au> Date: Fri, 28 Mar 2025 14:34:15 +1100 Subject: [PATCH 321/382] udp: Improve name of UDP related ICMP sending functions udp_send_conn_fail_icmp[46]() aren't actually specific to connections failing: they can propagate a variety of ICMP errors, which might or might not break a "connection". They are, however, specific to sending ICMP errors to the tap connection, not splice or host. Rename them to better reflect that. Signed-off-by: David Gibson <david@gibson.dropbear.id.au> Acked-by: Jon Maloy <jmaloy@redhat.com> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- udp.c | 35 +++++++++++++++++------------------ 1 file changed, 17 insertions(+), 18 deletions(-) diff --git a/udp.c b/udp.c index e410f55..39431d7 100644 --- a/udp.c +++ b/udp.c @@ -411,7 +411,7 @@ static void udp_tap_prepare(const struct mmsghdr *mmh, } /** - * udp_send_conn_fail_icmp4() - Construct and send ICMPv4 to local peer + * udp_send_tap_icmp4() - Construct and send ICMPv4 to local peer * @c: Execution context * @ee: Extended error descriptor * @toside: Destination side of flow @@ -419,11 +419,11 @@ static void udp_tap_prepare(const struct mmsghdr *mmh, * @in: First bytes (max 8) of original UDP message body * @dlen: Length of the read part of original UDP message body */ -static void udp_send_conn_fail_icmp4(const struct ctx *c, - const struct sock_extended_err *ee, - const struct flowside *toside, - struct in_addr saddr, - const void *in, size_t dlen) +static void udp_send_tap_icmp4(const struct ctx *c, + const struct sock_extended_err *ee, + const struct flowside *toside, + struct in_addr saddr, + const void *in, size_t dlen) { struct in_addr oaddr = toside->oaddr.v4mapped.a4; struct in_addr eaddr = 
toside->eaddr.v4mapped.a4; @@ -455,7 +455,7 @@ static void udp_send_conn_fail_icmp4(const struct ctx *c, /** - * udp_send_conn_fail_icmp6() - Construct and send ICMPv6 to local peer + * udp_send_tap_icmp6() - Construct and send ICMPv6 to local peer * @c: Execution context * @ee: Extended error descriptor * @toside: Destination side of flow @@ -464,11 +464,11 @@ static void udp_send_conn_fail_icmp4(const struct ctx *c, * @dlen: Length of the read part of original UDP message body * @flow: IPv6 flow identifier */ -static void udp_send_conn_fail_icmp6(const struct ctx *c, - const struct sock_extended_err *ee, - const struct flowside *toside, - const struct in6_addr *saddr, - void *in, size_t dlen, uint32_t flow) +static void udp_send_tap_icmp6(const struct ctx *c, + const struct sock_extended_err *ee, + const struct flowside *toside, + const struct in6_addr *saddr, + void *in, size_t dlen, uint32_t flow) { const struct in6_addr *oaddr = &toside->oaddr.a6; const struct in6_addr *eaddr = &toside->eaddr.a6; @@ -565,13 +565,12 @@ static int udp_sock_recverr(const struct ctx *c, union epoll_ref ref) * to socket? 
*/ } else if (hdr->cmsg_level == IPPROTO_IP) { dlen = MIN(dlen, ICMP4_MAX_DLEN); - udp_send_conn_fail_icmp4(c, &eh->ee, toside, - eh->saddr.sa4.sin_addr, - data, dlen); + udp_send_tap_icmp4(c, &eh->ee, toside, + eh->saddr.sa4.sin_addr, data, dlen); } else if (hdr->cmsg_level == IPPROTO_IPV6) { - udp_send_conn_fail_icmp6(c, &eh->ee, toside, - &eh->saddr.sa6.sin6_addr, - data, dlen, sidx.flowi); + udp_send_tap_icmp6(c, &eh->ee, toside, + &eh->saddr.sa6.sin6_addr, data, + dlen, sidx.flowi); } } else { trace("Ignoring received IP_RECVERR cmsg on listener socket"); From 2ed2d59def758b049f42e7c75bfb48957a73bd39 Mon Sep 17 00:00:00 2001 From: David Gibson <david@gibson.dropbear.id.au> Date: Wed, 2 Apr 2025 14:13:16 +1100 Subject: [PATCH 322/382] platform requirements: Fix clang-tidy warning Recent clang-tidy versions complain about enums defined with some but not all entries given explicit values. I'm not entirely convinced about whether that's a useful warning, but in any case we really don't need the explicit values in doc/platform-requirements/reuseaddr-priority.c, so remove them to make clang happy. 
Signed-off-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- doc/platform-requirements/reuseaddr-priority.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/doc/platform-requirements/reuseaddr-priority.c b/doc/platform-requirements/reuseaddr-priority.c index 701b6ff..af39a39 100644 --- a/doc/platform-requirements/reuseaddr-priority.c +++ b/doc/platform-requirements/reuseaddr-priority.c @@ -46,13 +46,13 @@ /* Different cases for receiving socket configuration */ enum sock_type { /* Socket is bound to 0.0.0.0:DSTPORT and not connected */ - SOCK_BOUND_ANY = 0, + SOCK_BOUND_ANY, /* Socket is bound to 127.0.0.1:DSTPORT and not connected */ - SOCK_BOUND_LO = 1, + SOCK_BOUND_LO, /* Socket is bound to 0.0.0.0:DSTPORT and connected to 127.0.0.1:SRCPORT */ - SOCK_CONNECTED = 2, + SOCK_CONNECTED, NUM_SOCK_TYPES, }; From 8e32881ef1d6d5867223a164052f8ff39d4ebb4e Mon Sep 17 00:00:00 2001 From: David Gibson <david@gibson.dropbear.id.au> Date: Wed, 2 Apr 2025 14:13:17 +1100 Subject: [PATCH 323/382] platform requirements: Add attributes to die() function Add both format string and ((noreturn)) attributes to the version of die() used in the test programs in doc/platform-requirements. As well as potentially catching problems in format strings, this means that the compiler and static checkers can properly reason about the fact that it will exit, preventing bogus warnings. Signed-off-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- doc/platform-requirements/common.h | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/platform-requirements/common.h b/doc/platform-requirements/common.h index 8844b1e..e85fc2b 100644 --- a/doc/platform-requirements/common.h +++ b/doc/platform-requirements/common.h @@ -15,6 +15,7 @@ #include <stdio.h> #include <stdlib.h> +__attribute__((format(printf, 1, 2), noreturn)) static inline void die(const char *fmt, ...) 
{ va_list ap; From 6bfc60b09522bd6f47660b835f0681977a28e1de Mon Sep 17 00:00:00 2001 From: David Gibson <david@gibson.dropbear.id.au> Date: Wed, 2 Apr 2025 14:13:18 +1100 Subject: [PATCH 324/382] platform requirements: Add test for address conflicts with TCP_REPAIR Simple test program to check the behaviour we need for bind() address conflicts between listening sockets and repair mode sockets. Signed-off-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- doc/platform-requirements/.gitignore | 1 + doc/platform-requirements/Makefile | 4 +- doc/platform-requirements/listen-vs-repair.c | 128 +++++++++++++++++++ 3 files changed, 131 insertions(+), 2 deletions(-) create mode 100644 doc/platform-requirements/listen-vs-repair.c diff --git a/doc/platform-requirements/.gitignore b/doc/platform-requirements/.gitignore index 3b5a10a..f6272cf 100644 --- a/doc/platform-requirements/.gitignore +++ b/doc/platform-requirements/.gitignore @@ -1,3 +1,4 @@ +/listen-vs-repair /reuseaddr-priority /recv-zero /udp-close-dup diff --git a/doc/platform-requirements/Makefile b/doc/platform-requirements/Makefile index 6a7d374..83930ef 100644 --- a/doc/platform-requirements/Makefile +++ b/doc/platform-requirements/Makefile @@ -3,8 +3,8 @@ # Copyright Red Hat # Author: David Gibson <david@gibson.dropbear.id.au> -TARGETS = reuseaddr-priority recv-zero udp-close-dup -SRCS = reuseaddr-priority.c recv-zero.c udp-close-dup.c +TARGETS = reuseaddr-priority recv-zero udp-close-dup listen-vs-repair +SRCS = reuseaddr-priority.c recv-zero.c udp-close-dup.c listen-vs-repair.c CFLAGS = -Wall all: cppcheck clang-tidy $(TARGETS:%=check-%) diff --git a/doc/platform-requirements/listen-vs-repair.c b/doc/platform-requirements/listen-vs-repair.c new file mode 100644 index 0000000..d31fe3f --- /dev/null +++ b/doc/platform-requirements/listen-vs-repair.c @@ -0,0 +1,128 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +/* listen-vs-repair.c + * + * Do listening 
sockets have address conflicts with sockets under repair + * ==================================================================== + * + * When we accept() an incoming connection the accept()ed socket will have the + * same local address as the listening socket. This can be a complication on + * migration. On the migration target we've already set up listening sockets + * according to the command line. However to restore connections that we're + * migrating in we need to bind the new sockets to the same address, which would + * be an address conflict on the face of it. This test program verifies that + * enabling repair mode before bind() correctly suppresses that conflict. + * + * Copyright Red Hat + * Author: David Gibson <david@gibson.dropbear.id.au> + */ + +/* NOLINTNEXTLINE(bugprone-reserved-identifier,cert-dcl37-c,cert-dcl51-cpp) */ +#define _GNU_SOURCE + +#include <arpa/inet.h> +#include <errno.h> +#include <linux/netlink.h> +#include <linux/rtnetlink.h> +#include <net/if.h> +#include <netinet/in.h> +#include <netinet/tcp.h> +#include <sched.h> +#include <stdbool.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> + +#include "common.h" + +#define PORT 13256U +#define CPORT 13257U + +/* 127.0.0.1:PORT */ +static const struct sockaddr_in addr = SOCKADDR_INIT(INADDR_LOOPBACK, PORT); + +/* 127.0.0.1:CPORT */ +static const struct sockaddr_in caddr = SOCKADDR_INIT(INADDR_LOOPBACK, CPORT); + +/* Put ourselves into a network sandbox */ +static void net_sandbox(void) +{ + /* NOLINTNEXTLINE(altera-struct-pack-align) */ + const struct req_t { + struct nlmsghdr nlh; + struct ifinfomsg ifm; + } __attribute__((packed)) req = { + .nlh.nlmsg_type = RTM_NEWLINK, + .nlh.nlmsg_flags = NLM_F_REQUEST, + .nlh.nlmsg_len = sizeof(req), + .nlh.nlmsg_seq = 1, + .ifm.ifi_family = AF_UNSPEC, + .ifm.ifi_index = 1, + .ifm.ifi_flags = IFF_UP, + .ifm.ifi_change = IFF_UP, + }; + int nl; + + if (unshare(CLONE_NEWUSER | CLONE_NEWNET)) + die("unshare(): %s\n", 
strerror(errno)); + + /* Bring up lo in the new netns */ + nl = socket(AF_NETLINK, SOCK_RAW | SOCK_CLOEXEC, NETLINK_ROUTE); + if (nl < 0) + die("Can't create netlink socket: %s\n", strerror(errno)); + + if (send(nl, &req, sizeof(req), 0) < 0) + die("Netlink send(): %s\n", strerror(errno)); + close(nl); +} + +static void check(void) +{ + int s1, s2, op; + + s1 = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP); + if (s1 < 0) + die("socket() 1: %s\n", strerror(errno)); + + if (bind(s1, (struct sockaddr *)&addr, sizeof(addr))) + die("bind() 1: %s\n", strerror(errno)); + + if (listen(s1, 0)) + die("listen(): %s\n", strerror(errno)); + + s2 = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP); + if (s2 < 0) + die("socket() 2: %s\n", strerror(errno)); + + op = TCP_REPAIR_ON; + if (setsockopt(s2, SOL_TCP, TCP_REPAIR, &op, sizeof(op))) + die("TCP_REPAIR: %s\n", strerror(errno)); + + if (bind(s2, (struct sockaddr *)&addr, sizeof(addr))) + die("bind() 2: %s\n", strerror(errno)); + + if (connect(s2, (struct sockaddr *)&caddr, sizeof(caddr))) + die("connect(): %s\n", strerror(errno)); + + op = TCP_REPAIR_OFF_NO_WP; + if (setsockopt(s2, SOL_TCP, TCP_REPAIR, &op, sizeof(op))) + die("TCP_REPAIR: %s\n", strerror(errno)); + + close(s1); + close(s2); +} + +int main(int argc, char *argv[]) +{ + (void)argc; + (void)argv; + + net_sandbox(); + + check(); + + printf("Repair mode appears to properly suppress conflicts with listening sockets\n"); + + exit(0); +} From dec3d73e1e8e007d05f9dce9a48aca7cb8532992 Mon Sep 17 00:00:00 2001 From: David Gibson <david@gibson.dropbear.id.au> Date: Wed, 2 Apr 2025 14:13:19 +1100 Subject: [PATCH 325/382] migrate, tcp: bind() migrated sockets in repair mode Currently on a migration target, we create then immediately bind() new sockets for the TCP connections we're reconstructing. Mostly, this works, since a socket() that is bound but hasn't had listen() or connect() called is essentially passive. However, this bind() is subject to the usual address conflict checking. 
In particular that means if we already have a listening socket on that port, we'll get an EADDRINUSE. This will happen for every connection we try to migrate that was initiated from outside to the guest, since we necessarily created a listening socket for that case. We set SO_REUSEADDR on the socket in an attempt to avoid this, but that's not sufficient; even with SO_REUSEADDR address conflicts are still prohibited for listening sockets. Of course once these incoming sockets are fully repaired and connect()ed they'll no longer conflict, but that doesn't help us if we fail at the bind(). We can avoid this by not calling bind() until we're already in repair mode which suppresses this transient conflict. Because of the batching of setting repair mode, to do that we need to move the bind to a step in tcp_flow_migrate_target_ext(). Signed-off-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- tcp.c | 38 +++++++++++++++++++++++++++----------- 1 file changed, 27 insertions(+), 11 deletions(-) diff --git a/tcp.c b/tcp.c index fa1d885..35626c9 100644 --- a/tcp.c +++ b/tcp.c @@ -3414,13 +3414,8 @@ fail: static int tcp_flow_repair_socket(struct ctx *c, struct tcp_tap_conn *conn) { sa_family_t af = CONN_V4(conn) ? 
AF_INET : AF_INET6; - const struct flowside *sockside = HOSTFLOW(conn); - union sockaddr_inany a; - socklen_t sl; int s, rc; - pif_sockaddr(c, &a, &sl, PIF_HOST, &sockside->oaddr, sockside->oport); - if ((conn->sock = socket(af, SOCK_STREAM | SOCK_NONBLOCK | SOCK_CLOEXEC, IPPROTO_TCP)) < 0) { rc = -errno; @@ -3435,12 +3430,6 @@ static int tcp_flow_repair_socket(struct ctx *c, struct tcp_tap_conn *conn) tcp_sock_set_nodelay(s); - if (bind(s, &a.sa, sizeof(a))) { - rc = -errno; - flow_perror(conn, "Failed to bind socket for migrated flow"); - goto err; - } - if ((rc = tcp_flow_repair_on(c, conn))) goto err; @@ -3452,6 +3441,30 @@ err: return rc; } +/** + * tcp_flow_repair_bind() - Bind socket in repair mode + * @c: Execution context + * @conn: Pointer to the TCP connection structure + * + * Return: 0 on success, negative error code on failure + */ +static int tcp_flow_repair_bind(const struct ctx *c, struct tcp_tap_conn *conn) +{ + const struct flowside *sockside = HOSTFLOW(conn); + union sockaddr_inany a; + socklen_t sl; + + pif_sockaddr(c, &a, &sl, PIF_HOST, &sockside->oaddr, sockside->oport); + + if (bind(conn->sock, &a.sa, sizeof(a))) { + int rc = -errno; + flow_perror(conn, "Failed to bind socket for migrated flow"); + return rc; + } + + return 0; +} + /** * tcp_flow_repair_connect() - Connect socket in repair mode, then turn it off * @c: Execution context @@ -3618,6 +3631,9 @@ int tcp_flow_migrate_target_ext(struct ctx *c, struct tcp_tap_conn *conn, int fd /* We weren't able to create the socket, discard flow */ goto fail; + if (tcp_flow_repair_bind(c, conn)) + goto fail; + if (tcp_flow_repair_timestamp(conn, &t)) goto fail; From 3d41e4d8389578e5d5f3cf2e47b9ff9cdd29ffd1 Mon Sep 17 00:00:00 2001 From: David Gibson <david@gibson.dropbear.id.au> Date: Wed, 2 Apr 2025 15:43:40 +1100 Subject: [PATCH 326/382] passt-repair: Correct off-by-one error verifying name passt-repair will generate an error if the name it gets from the kernel is too long or not NUL terminated. 
Downstream testing has reported occasionally seeing this error in practice. It turns out there is a trivial off-by-one error in the check: ev->len is the length of the name, including terminating \0 characters, so to check for a \0 at the end of the buffer we need to check ev->name[len - 1] not ev->name[len]. Fixes: 42a854a52b6f ("pasta, passt-repair: Support multiple events per read() in inotify handlers") Signed-off-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- passt-repair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/passt-repair.c b/passt-repair.c index 86f0293..440c77a 100644 --- a/passt-repair.c +++ b/passt-repair.c @@ -157,7 +157,7 @@ int main(int argc, char **argv) } } while (!found); - if (ev->len > NAME_MAX + 1 || ev->name[ev->len] != '\0') { + if (ev->len > NAME_MAX + 1 || ev->name[ev->len - 1] != '\0') { fprintf(stderr, "Invalid filename from inotify\n"); _exit(1); } From 8aa2d90c8d95d0fa1dad7027fdf92b48a1bbf3c6 Mon Sep 17 00:00:00 2001 From: David Gibson <david@gibson.dropbear.id.au> Date: Tue, 1 Apr 2025 19:57:08 +1100 Subject: [PATCH 327/382] udp: Remove redundant udp_at_sidx() call in udp_tap_handler() We already have a pointer to the UDP flow in variable uflow, we can just re-use it. 
Signed-off-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- udp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/udp.c b/udp.c index 39431d7..ac168db 100644 --- a/udp.c +++ b/udp.c @@ -907,7 +907,7 @@ int udp_tap_handler(const struct ctx *c, uint8_t pif, } toside = flowside_at_sidx(tosidx); - s = udp_at_sidx(tosidx)->s[tosidx.sidei]; + s = uflow->s[tosidx.sidei]; ASSERT(s >= 0); pif_sockaddr(c, &to_sa, &sl, topif, &toside->eaddr, toside->eport); From 76e554d9ec8dc80c1856621e17e45be811d198d0 Mon Sep 17 00:00:00 2001 From: David Gibson <david@gibson.dropbear.id.au> Date: Tue, 1 Apr 2025 19:57:09 +1100 Subject: [PATCH 328/382] udp: Simplify updates to UDP flow timestamp Since UDP has no built in knowledge of connections, the only way we know when we're done with a UDP flow is a timeout with no activity. To keep track of this struct udp_flow includes a timestamp to record the last time we saw traffic on the flow. For data from listening sockets and from tap, this is done implicitly via udp_flow_from_{sock,tap}() but for reply sockets it's done explicitly. However, that logic is duplicated between the vhost-user and "buf" paths. Make it common in udp_reply_sock_handler() instead. Technically this is a behavioural change: previously if we got an EPOLLIN event, but there wasn't actually any data we wouldn't update the timestamp, now we will. This should be harmless: if there's an EPOLLIN we expect there to be data, and even if there isn't the worst we can do is mildly delay the cleanup of a stale flow. 
Signed-off-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- udp.c | 15 ++++++--------- udp_vu.c | 9 +-------- udp_vu.h | 3 +-- 3 files changed, 8 insertions(+), 19 deletions(-) diff --git a/udp.c b/udp.c index ac168db..44b58d1 100644 --- a/udp.c +++ b/udp.c @@ -758,27 +758,21 @@ void udp_listen_sock_handler(const struct ctx *c, * @c: Execution context * @s: Socket to read data from * @tosidx: Flow & side to forward data from @s to - * @now: Current timestamp * * Return: true on success, false if can't forward from socket to flow's pif * * #syscalls recvmmsg */ static bool udp_buf_reply_sock_data(const struct ctx *c, - int s, flow_sidx_t tosidx, - const struct timespec *now) + int s, flow_sidx_t tosidx) { const struct flowside *toside = flowside_at_sidx(tosidx); - struct udp_flow *uflow = udp_at_sidx(tosidx); uint8_t topif = pif_at_sidx(tosidx); int n, i; if ((n = udp_sock_recv(c, s, udp_mh_recv)) <= 0) return true; - flow_trace(uflow, "Received %d datagrams on reply socket", n); - uflow->ts = now->tv_sec; - for (i = 0; i < n; i++) { if (pif_is_socket(topif)) udp_splice_prepare(udp_mh_recv, i); @@ -825,10 +819,13 @@ void udp_reply_sock_handler(const struct ctx *c, union epoll_ref ref, int s = ref.fd; bool ret; + flow_trace(uflow, "Received data on reply socket"); + uflow->ts = now->tv_sec; + if (c->mode == MODE_VU) - ret = udp_vu_reply_sock_data(c, s, tosidx, now); + ret = udp_vu_reply_sock_data(c, s, tosidx); else - ret = udp_buf_reply_sock_data(c, s, tosidx, now); + ret = udp_buf_reply_sock_data(c, s, tosidx); if (!ret) { flow_err(uflow, "Unable to forward UDP"); diff --git a/udp_vu.c b/udp_vu.c index 06bdeae..4153b6c 100644 --- a/udp_vu.c +++ b/udp_vu.c @@ -275,22 +275,17 @@ void udp_vu_listen_sock_data(const struct ctx *c, union epoll_ref ref, * @c: Execution context * @s: Socket to read data from * @tosidx: Flow & side to forward data from @s to - * @now: Current timestamp * * Return: true on success, false 
if can't forward from socket to flow's pif */ -bool udp_vu_reply_sock_data(const struct ctx *c, int s, flow_sidx_t tosidx, - const struct timespec *now) +bool udp_vu_reply_sock_data(const struct ctx *c, int s, flow_sidx_t tosidx) { const struct flowside *toside = flowside_at_sidx(tosidx); bool v6 = !(inany_v4(&toside->eaddr) && inany_v4(&toside->oaddr)); - struct udp_flow *uflow = udp_at_sidx(tosidx); struct vu_dev *vdev = c->vdev; struct vu_virtq *vq = &vdev->vq[VHOST_USER_RX_QUEUE]; int i; - ASSERT(uflow); - if (pif_at_sidx(tosidx) != PIF_TAP) return false; @@ -301,8 +296,6 @@ bool udp_vu_reply_sock_data(const struct ctx *c, int s, flow_sidx_t tosidx, iov_used = udp_vu_sock_recv(c, s, v6, &dlen); if (iov_used <= 0) break; - flow_trace(uflow, "Received 1 datagram on reply socket"); - uflow->ts = now->tv_sec; udp_vu_prepare(c, toside, dlen); if (*c->pcap) { diff --git a/udp_vu.h b/udp_vu.h index 2299b51..6d541a4 100644 --- a/udp_vu.h +++ b/udp_vu.h @@ -8,7 +8,6 @@ void udp_vu_listen_sock_data(const struct ctx *c, union epoll_ref ref, const struct timespec *now); -bool udp_vu_reply_sock_data(const struct ctx *c, int s, flow_sidx_t tosidx, - const struct timespec *now); +bool udp_vu_reply_sock_data(const struct ctx *c, int s, flow_sidx_t tosidx); #endif /* UDP_VU_H */ From 684870a766e7f024a5720464ad070e666cb4793e Mon Sep 17 00:00:00 2001 From: David Gibson <david@gibson.dropbear.id.au> Date: Tue, 1 Apr 2025 19:57:10 +1100 Subject: [PATCH 329/382] udp: Correct some seccomp filter annotations Both udp_buf_listen_sock_data() and udp_buf_reply_sock_data() have comments stating they use recvmmsg(). That's not correct, they only do so via udp_sock_recv() which lists recvmmsg() itself. In contrast udp_splice_send() and udp_tap_handler() both directly use sendmmsg(), but only the latter lists it. Add it to the former as well. 
Signed-off-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- udp.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/udp.c b/udp.c index 44b58d1..ab3e9d2 100644 --- a/udp.c +++ b/udp.c @@ -272,6 +272,8 @@ static void udp_splice_prepare(struct mmsghdr *mmh, unsigned idx) * @dst: Destination port for datagrams (target side) * @ref: epoll reference for origin socket * @now: Timestamp + * + * #syscalls sendmmsg */ static void udp_splice_send(const struct ctx *c, size_t start, size_t n, flow_sidx_t tosidx) @@ -662,8 +664,6 @@ static int udp_sock_recv(const struct ctx *c, int s, struct mmsghdr *mmh) * @c: Execution context * @ref: epoll reference * @now: Current timestamp - * - * #syscalls recvmmsg */ static void udp_buf_listen_sock_data(const struct ctx *c, union epoll_ref ref, const struct timespec *now) @@ -760,8 +760,6 @@ void udp_listen_sock_handler(const struct ctx *c, * @tosidx: Flow & side to forward data from @s to * * Return: true on success, false if can't forward from socket to flow's pif - * - * #syscalls recvmmsg */ static bool udp_buf_reply_sock_data(const struct ctx *c, int s, flow_sidx_t tosidx) From 06784d7fc6761528d587837b241d27c6d17c0842 Mon Sep 17 00:00:00 2001 From: Stefano Brivio <sbrivio@redhat.com> Date: Thu, 3 Apr 2025 19:01:02 +0200 Subject: [PATCH 330/382] passt-repair: Ensure that read buffer is NULL-terminated After 3d41e4d83895 ("passt-repair: Correct off-by-one error verifying name"), Coverity Scan isn't convinced anymore about the fact that the ev->name used in the snprintf() is NULL-terminated. It comes from a read() call, and read() of course doesn't terminate it, but we already check that the byte at ev->len - 1 is a NULL terminator, so this is actually a false positive. 
In any case, the logic ensuring that ev->name is NULL-terminated isn't necessarily obvious, and additionally checking that the last byte in the buffer we read is a NULL terminator is harmless, so do that explicitly, even if it's redundant. Signed-off-by: Stefano Brivio <sbrivio@redhat.com> Reviewed-by: David Gibson <david@gibson.dropbear.id.au> --- passt-repair.c | 1 + 1 file changed, 1 insertion(+) diff --git a/passt-repair.c b/passt-repair.c index 440c77a..256a8c9 100644 --- a/passt-repair.c +++ b/passt-repair.c @@ -137,6 +137,7 @@ int main(int argc, char **argv) fprintf(stderr, "inotify read: %i", errno); _exit(1); } + buf[n - 1] = '\0'; if (n < (ssize_t)sizeof(*ev)) { fprintf(stderr, "Short inotify read: %zi", n); From a7775e9550fa698e4af1322f6ef63924c24d1fab Mon Sep 17 00:00:00 2001 From: Jon Maloy <jmaloy@redhat.com> Date: Sat, 5 Apr 2025 15:21:26 -0400 Subject: [PATCH 331/382] udp: support traceroute in direction tap-socket Now that ICMP pass-through from socket-to-tap is in place, it is easy to support UDP based traceroute functionality in direction tap-to-socket. We fix that in this commit. 
Link: https://bugs.passt.top/show_bug.cgi?id=64 Signed-off-by: Jon Maloy <jmaloy@redhat.com> Reviewed-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- tap.c | 17 +++++++++++++---- udp.c | 22 +++++++++++++++++++++- udp.h | 3 ++- udp_flow.c | 1 + udp_flow.h | 4 +++- 5 files changed, 40 insertions(+), 7 deletions(-) diff --git a/tap.c b/tap.c index 3a6fcbe..d630f6d 100644 --- a/tap.c +++ b/tap.c @@ -559,6 +559,7 @@ PACKET_POOL_DECL(pool_l4, UIO_MAXIOV, pkt_buf); * struct l4_seq4_t - Message sequence for one protocol handler call, IPv4 * @msgs: Count of messages in sequence * @protocol: Protocol number + * @ttl: Time to live * @source: Source port * @dest: Destination port * @saddr: Source address @@ -567,6 +568,7 @@ PACKET_POOL_DECL(pool_l4, UIO_MAXIOV, pkt_buf); */ static struct tap4_l4_t { uint8_t protocol; + uint8_t ttl; uint16_t source; uint16_t dest; @@ -586,6 +588,7 @@ static struct tap4_l4_t { * @dest: Destination port * @saddr: Source address * @daddr: Destination address + * @hop_limit: Hop limit * @msg: Array of messages that can be handled in a single call */ static struct tap6_l4_t { @@ -598,6 +601,8 @@ static struct tap6_l4_t { struct in6_addr saddr; struct in6_addr daddr; + uint8_t hop_limit; + struct pool_l4_t p; } tap6_l4[TAP_SEQS /* Arbitrary: TAP_MSGS in theory, so limit in users */]; @@ -786,7 +791,8 @@ resume: #define L4_MATCH(iph, uh, seq) \ ((seq)->protocol == (iph)->protocol && \ (seq)->source == (uh)->source && (seq)->dest == (uh)->dest && \ - (seq)->saddr.s_addr == (iph)->saddr && (seq)->daddr.s_addr == (iph)->daddr) + (seq)->saddr.s_addr == (iph)->saddr && \ + (seq)->daddr.s_addr == (iph)->daddr && (seq)->ttl == (iph)->ttl) #define L4_SET(iph, uh, seq) \ do { \ @@ -795,6 +801,7 @@ resume: (seq)->dest = (uh)->dest; \ (seq)->saddr.s_addr = (iph)->saddr; \ (seq)->daddr.s_addr = (iph)->daddr; \ + (seq)->ttl = (iph)->ttl; \ } while (0) if (seq && L4_MATCH(iph, uh, seq) && seq->p.count < 
UIO_MAXIOV) @@ -843,7 +850,7 @@ append: for (k = 0; k < p->count; ) k += udp_tap_handler(c, PIF_TAP, AF_INET, &seq->saddr, &seq->daddr, - p, k, now); + seq->ttl, p, k, now); } } @@ -966,7 +973,8 @@ resume: (seq)->dest == (uh)->dest && \ (seq)->flow_lbl == ip6_get_flow_lbl(ip6h) && \ IN6_ARE_ADDR_EQUAL(&(seq)->saddr, saddr) && \ - IN6_ARE_ADDR_EQUAL(&(seq)->daddr, daddr)) + IN6_ARE_ADDR_EQUAL(&(seq)->daddr, daddr) && \ + (seq)->hop_limit == (ip6h)->hop_limit) #define L4_SET(ip6h, proto, uh, seq) \ do { \ @@ -976,6 +984,7 @@ resume: (seq)->flow_lbl = ip6_get_flow_lbl(ip6h); \ (seq)->saddr = *saddr; \ (seq)->daddr = *daddr; \ + (seq)->hop_limit = (ip6h)->hop_limit; \ } while (0) if (seq && L4_MATCH(ip6h, proto, uh, seq) && @@ -1026,7 +1035,7 @@ append: for (k = 0; k < p->count; ) k += udp_tap_handler(c, PIF_TAP, AF_INET6, &seq->saddr, &seq->daddr, - p, k, now); + seq->hop_limit, p, k, now); } } diff --git a/udp.c b/udp.c index ab3e9d2..5a251df 100644 --- a/udp.c +++ b/udp.c @@ -844,6 +844,7 @@ fail: * @af: Address family, AF_INET or AF_INET6 * @saddr: Source address * @daddr: Destination address + * @ttl: TTL or hop limit for packets to be sent in this call * @p: Pool of UDP packets, with UDP headers * @idx: Index of first packet to process * @now: Current timestamp @@ -854,7 +855,8 @@ fail: */ int udp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af, const void *saddr, const void *daddr, - const struct pool *p, int idx, const struct timespec *now) + uint8_t ttl, const struct pool *p, int idx, + const struct timespec *now) { const struct flowside *toside; struct mmsghdr mm[UIO_MAXIOV]; @@ -933,6 +935,24 @@ int udp_tap_handler(const struct ctx *c, uint8_t pif, mm[i].msg_hdr.msg_controllen = 0; mm[i].msg_hdr.msg_flags = 0; + if (ttl != uflow->ttl[tosidx.sidei]) { + uflow->ttl[tosidx.sidei] = ttl; + if (af == AF_INET) { + if (setsockopt(s, IPPROTO_IP, IP_TTL, + &ttl, sizeof(ttl)) < 0) + flow_perror(uflow, + "setsockopt IP_TTL"); + } else { + /* IPv6 hop_limit 
cannot be only 1 byte */ + int hop_limit = ttl; + + if (setsockopt(s, SOL_IPV6, IPV6_UNICAST_HOPS, + &hop_limit, sizeof(hop_limit)) < 0) + flow_perror(uflow, + "setsockopt IPV6_UNICAST_HOPS"); + } + } + count++; } diff --git a/udp.h b/udp.h index de2df6d..a811475 100644 --- a/udp.h +++ b/udp.h @@ -15,7 +15,8 @@ void udp_reply_sock_handler(const struct ctx *c, union epoll_ref ref, uint32_t events, const struct timespec *now); int udp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af, const void *saddr, const void *daddr, - const struct pool *p, int idx, const struct timespec *now); + uint8_t ttl, const struct pool *p, int idx, + const struct timespec *now); int udp_sock_init(const struct ctx *c, int ns, const union inany_addr *addr, const char *ifname, in_port_t port); int udp_init(struct ctx *c); diff --git a/udp_flow.c b/udp_flow.c index bf4b896..99ae490 100644 --- a/udp_flow.c +++ b/udp_flow.c @@ -137,6 +137,7 @@ static flow_sidx_t udp_flow_new(const struct ctx *c, union flow *flow, uflow = FLOW_SET_TYPE(flow, FLOW_UDP, udp); uflow->ts = now->tv_sec; uflow->s[INISIDE] = uflow->s[TGTSIDE] = -1; + uflow->ttl[INISIDE] = uflow->ttl[TGTSIDE] = 0; if (s_ini >= 0) { /* When using auto port-scanning the listening port could go diff --git a/udp_flow.h b/udp_flow.h index 9a1b059..520de62 100644 --- a/udp_flow.h +++ b/udp_flow.h @@ -8,11 +8,12 @@ #define UDP_FLOW_H /** - * struct udp - Descriptor for a flow of UDP packets + * struct udp_flow - Descriptor for a flow of UDP packets * @f: Generic flow information * @closed: Flow is already closed * @ts: Activity timestamp * @s: Socket fd (or -1) for each side of the flow + * @ttl: TTL or hop_limit for both sides */ struct udp_flow { /* Must be first element */ @@ -21,6 +22,7 @@ struct udp_flow { bool closed :1; time_t ts; int s[SIDES]; + uint8_t ttl[SIDES]; }; struct udp_flow *udp_at_sidx(flow_sidx_t sidx); From d74b5a7c107006b95df6a69e5f1e6b9a373c7f53 Mon Sep 17 00:00:00 2001 From: David Gibson 
<david@gibson.dropbear.id.au> Date: Fri, 4 Apr 2025 21:15:31 +1100 Subject: [PATCH 332/382] udp: Use connect()ed sockets for initiating side Currently we have an asymmetry in how we handle UDP sockets. For flows where the target side is a socket, we create a new connect()ed socket - the "reply socket" specifically for that flow used for sending and receiving datagrams on that flow and only that flow. For flows where the initiating side is a socket, we continue to use the "listening" socket (or rather, a dup() of it). This has some disadvantages: * We need a hash lookup for every datagram on the listening socket in order to work out what flow it belongs to * The dup() keeps the socket alive even if automatic forwarding removes the listening socket. However, the epoll data remains the same including containing the now stale original fd. This causes bug 103. * We can't (easily) set flow-specific options on an initiating side socket, because that could affect other flows as well Alter the code to use a connect()ed socket on the initiating side as well as the target side. There's no way to "clone and connect" the listening socket (a loose equivalent of accept() for UDP), so we have to create a new socket. We have to bind() this socket before we connect() it, which is allowed thanks to SO_REUSEADDR, but does leave a small window where it could receive datagrams not intended for this flow. For now we handle this by simply discarding any datagrams received between bind() and connect(), but I intend to improve this in a later patch. 
Link: https://bugs.passt.top/show_bug.cgi?id=103 Signed-off-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- epoll_type.h | 4 ++-- passt.c | 6 +++--- udp.c | 50 ++++++++++++++++++++++++++------------------------ udp.h | 4 ++-- udp_flow.c | 32 +++++++++----------------------- util.c | 2 +- 6 files changed, 43 insertions(+), 55 deletions(-) diff --git a/epoll_type.h b/epoll_type.h index 7f2a121..12ac59b 100644 --- a/epoll_type.h +++ b/epoll_type.h @@ -22,8 +22,8 @@ enum epoll_type { EPOLL_TYPE_TCP_TIMER, /* UDP "listening" sockets */ EPOLL_TYPE_UDP_LISTEN, - /* UDP socket for replies on a specific flow */ - EPOLL_TYPE_UDP_REPLY, + /* UDP socket for a specific flow */ + EPOLL_TYPE_UDP, /* ICMP/ICMPv6 ping sockets */ EPOLL_TYPE_PING, /* inotify fd watching for end of netns (pasta) */ diff --git a/passt.c b/passt.c index cd06772..388d10f 100644 --- a/passt.c +++ b/passt.c @@ -68,7 +68,7 @@ char *epoll_type_str[] = { [EPOLL_TYPE_TCP_LISTEN] = "listening TCP socket", [EPOLL_TYPE_TCP_TIMER] = "TCP timer", [EPOLL_TYPE_UDP_LISTEN] = "listening UDP socket", - [EPOLL_TYPE_UDP_REPLY] = "UDP reply socket", + [EPOLL_TYPE_UDP] = "UDP flow socket", [EPOLL_TYPE_PING] = "ICMP/ICMPv6 ping socket", [EPOLL_TYPE_NSQUIT_INOTIFY] = "namespace inotify watch", [EPOLL_TYPE_NSQUIT_TIMER] = "namespace timer watch", @@ -339,8 +339,8 @@ loop: case EPOLL_TYPE_UDP_LISTEN: udp_listen_sock_handler(&c, ref, eventmask, &now); break; - case EPOLL_TYPE_UDP_REPLY: - udp_reply_sock_handler(&c, ref, eventmask, &now); + case EPOLL_TYPE_UDP: + udp_sock_handler(&c, ref, eventmask, &now); break; case EPOLL_TYPE_PING: icmp_sock_handler(&c, ref); diff --git a/udp.c b/udp.c index 5a251df..1b3fffd 100644 --- a/udp.c +++ b/udp.c @@ -39,27 +39,30 @@ * could receive packets from multiple flows, so we use a hash table match to * find the specific flow for a datagram. 
* - * When a UDP flow is initiated from a listening socket we take a duplicate of - * the socket and store it in uflow->s[INISIDE]. This will last for the + * Flow sockets + * ============ + * + * When a UDP flow targets a socket, we create a "flow" socket in + * uflow->s[TGTSIDE] both to deliver datagrams to the target side and receive + * replies on the target side. This socket is both bound and connected and has + * EPOLL_TYPE_UDP. The connect() means it will only receive datagrams + * associated with this flow, so the epoll reference directly points to the flow + * and we don't need a hash lookup. + * + * When a flow is initiated from a listening socket, we create a "flow" socket + * with the same bound address as the listening socket, but also connect()ed to + * the flow's peer. This is stored in uflow->s[INISIDE] and will last for the * lifetime of the flow, even if the original listening socket is closed due to * port auto-probing. The duplicate is used to deliver replies back to the * originating side. * - * Reply sockets - * ============= - * - * When a UDP flow targets a socket, we create a "reply" socket in - * uflow->s[TGTSIDE] both to deliver datagrams to the target side and receive - * replies on the target side. This socket is both bound and connected and has - * EPOLL_TYPE_UDP_REPLY. The connect() means it will only receive datagrams - * associated with this flow, so the epoll reference directly points to the flow - * and we don't need a hash lookup. - * - * NOTE: it's possible that the reply socket could have a bound address - * overlapping with an unrelated listening socket. We assume datagrams for the - * flow will come to the reply socket in preference to a listening socket. The - * sample program doc/platform-requirements/reuseaddr-priority.c documents and - * tests that assumption. + * NOTE: A flow socket can have a bound address overlapping with a listening + * socket. 
That will happen naturally for flows initiated from a socket, but is + * also possible (though unlikely) for tap initiated flows, depending on the + * source port. We assume datagrams for the flow will come to a connect()ed + * socket in preference to a listening socket. The sample program + * doc/platform-requirements/reuseaddr-priority.c documents and tests that + * assumption. * * "Spliced" flows * =============== @@ -71,8 +74,7 @@ * actually used; it doesn't make sense for datagrams and instead a pair of * recvmmsg() and sendmmsg() is used to forward the datagrams. * - * Note that a spliced flow will have *both* a duplicated listening socket and a - * reply socket (see above). + * Note that a spliced flow will have two flow sockets (see above). */ #include <sched.h> @@ -557,7 +559,7 @@ static int udp_sock_recverr(const struct ctx *c, union epoll_ref ref) } eh = (const struct errhdr *)CMSG_DATA(hdr); - if (ref.type == EPOLL_TYPE_UDP_REPLY) { + if (ref.type == EPOLL_TYPE_UDP) { flow_sidx_t sidx = flow_sidx_opposite(ref.flowside); const struct flowside *toside = flowside_at_sidx(sidx); size_t dlen = rc; @@ -792,14 +794,14 @@ static bool udp_buf_reply_sock_data(const struct ctx *c, } /** - * udp_reply_sock_handler() - Handle new data from flow specific socket + * udp_sock_handler() - Handle new data from flow specific socket * @c: Execution context * @ref: epoll reference * @events: epoll events bitmap * @now: Current timestamp */ -void udp_reply_sock_handler(const struct ctx *c, union epoll_ref ref, - uint32_t events, const struct timespec *now) +void udp_sock_handler(const struct ctx *c, union epoll_ref ref, + uint32_t events, const struct timespec *now) { struct udp_flow *uflow = udp_at_sidx(ref.flowside); @@ -807,7 +809,7 @@ void udp_reply_sock_handler(const struct ctx *c, union epoll_ref ref, if (events & EPOLLERR) { if (udp_sock_errs(c, ref) < 0) { - flow_err(uflow, "Unrecoverable error on reply socket"); + flow_err(uflow, "Unrecoverable error on flow 
socket"); goto fail; } } diff --git a/udp.h b/udp.h index a811475..8f8531a 100644 --- a/udp.h +++ b/udp.h @@ -11,8 +11,8 @@ void udp_portmap_clear(void); void udp_listen_sock_handler(const struct ctx *c, union epoll_ref ref, uint32_t events, const struct timespec *now); -void udp_reply_sock_handler(const struct ctx *c, union epoll_ref ref, - uint32_t events, const struct timespec *now); +void udp_sock_handler(const struct ctx *c, union epoll_ref ref, + uint32_t events, const struct timespec *now); int udp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af, const void *saddr, const void *daddr, uint8_t ttl, const struct pool *p, int idx, diff --git a/udp_flow.c b/udp_flow.c index 99ae490..a2d417f 100644 --- a/udp_flow.c +++ b/udp_flow.c @@ -49,10 +49,7 @@ void udp_flow_close(const struct ctx *c, struct udp_flow *uflow) flow_foreach_sidei(sidei) { flow_hash_remove(c, FLOW_SIDX(uflow, sidei)); if (uflow->s[sidei] >= 0) { - /* The listening socket needs to stay in epoll, but the - * flow specific one needs to be removed */ - if (sidei == TGTSIDE) - epoll_del(c, uflow->s[sidei]); + epoll_del(c, uflow->s[sidei]); close(uflow->s[sidei]); uflow->s[sidei] = -1; } @@ -81,7 +78,7 @@ static int udp_flow_sock(const struct ctx *c, } fref = { .sidx = FLOW_SIDX(uflow, sidei) }; int rc, s; - s = flowside_sock_l4(c, EPOLL_TYPE_UDP_REPLY, pif, side, fref.data); + s = flowside_sock_l4(c, EPOLL_TYPE_UDP, pif, side, fref.data); if (s < 0) { flow_dbg_perror(uflow, "Couldn't open flow specific socket"); return s; @@ -120,13 +117,12 @@ static int udp_flow_sock(const struct ctx *c, * udp_flow_new() - Common setup for a new UDP flow * @c: Execution context * @flow: Initiated flow - * @s_ini: Initiating socket (or -1) * @now: Timestamp * * Return: UDP specific flow, if successful, NULL on failure */ static flow_sidx_t udp_flow_new(const struct ctx *c, union flow *flow, - int s_ini, const struct timespec *now) + const struct timespec *now) { struct udp_flow *uflow = NULL; unsigned 
sidei; @@ -139,22 +135,12 @@ static flow_sidx_t udp_flow_new(const struct ctx *c, union flow *flow, uflow->s[INISIDE] = uflow->s[TGTSIDE] = -1; uflow->ttl[INISIDE] = uflow->ttl[TGTSIDE] = 0; - if (s_ini >= 0) { - /* When using auto port-scanning the listening port could go - * away, so we need to duplicate the socket - */ - uflow->s[INISIDE] = fcntl(s_ini, F_DUPFD_CLOEXEC, 0); - if (uflow->s[INISIDE] < 0) { - flow_perror(uflow, - "Couldn't duplicate listening socket"); - goto cancel; - } + flow_foreach_sidei(sidei) { + if (pif_is_socket(uflow->f.pif[sidei])) + if ((uflow->s[sidei] = udp_flow_sock(c, uflow, sidei)) < 0) + goto cancel; } - if (pif_is_socket(flow->f.pif[TGTSIDE])) - if ((uflow->s[TGTSIDE] = udp_flow_sock(c, uflow, TGTSIDE)) < 0) - goto cancel; - /* Tap sides always need to be looked up by hash. Socket sides don't * always, but sometimes do (receiving packets on a socket not specific * to one flow). Unconditionally hash both sides so all our bases are @@ -225,7 +211,7 @@ flow_sidx_t udp_flow_from_sock(const struct ctx *c, union epoll_ref ref, return FLOW_SIDX_NONE; } - return udp_flow_new(c, flow, ref.fd, now); + return udp_flow_new(c, flow, now); } /** @@ -281,7 +267,7 @@ flow_sidx_t udp_flow_from_tap(const struct ctx *c, return FLOW_SIDX_NONE; } - return udp_flow_new(c, flow, -1, now); + return udp_flow_new(c, flow, now); } /** diff --git a/util.c b/util.c index b9a3d43..0f68cf5 100644 --- a/util.c +++ b/util.c @@ -71,7 +71,7 @@ int sock_l4_sa(const struct ctx *c, enum epoll_type type, case EPOLL_TYPE_UDP_LISTEN: freebind = c->freebind; /* fallthrough */ - case EPOLL_TYPE_UDP_REPLY: + case EPOLL_TYPE_UDP: proto = IPPROTO_UDP; socktype = SOCK_DGRAM | SOCK_NONBLOCK; break; From 1d7bbb101a0b1dcbc99c51cd65abb90a0144ac7b Mon Sep 17 00:00:00 2001 From: David Gibson <david@gibson.dropbear.id.au> Date: Fri, 4 Apr 2025 21:15:32 +1100 Subject: [PATCH 333/382] udp: Make udp_sock_recv() take max number of frames as a parameter Currently udp_sock_recv() decides 
the maximum number of frames it is willing to receive based on the mode. However, we have upcoming use cases where we will have different criteria for how many frames we want with information that's not naturally available here but is in the caller. So make the maximum number of frames a parameter. Signed-off-by: David Gibson <david@gibson.dropbear.id.au> [sbrivio: Fix typo in comment in udp_buf_reply_sock_data()] Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- udp.c | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/udp.c b/udp.c index 1b3fffd..53403bf 100644 --- a/udp.c +++ b/udp.c @@ -634,22 +634,14 @@ static int udp_sock_errs(const struct ctx *c, union epoll_ref ref) * @c: Execution context * @s: Socket to receive from * @mmh mmsghdr array to receive into + * @n: Maximum number of datagrams to receive * * Return: Number of datagrams received * * #syscalls recvmmsg arm:recvmmsg_time64 i686:recvmmsg_time64 */ -static int udp_sock_recv(const struct ctx *c, int s, struct mmsghdr *mmh) +static int udp_sock_recv(const struct ctx *c, int s, struct mmsghdr *mmh, int n) { - /* For not entirely clear reasons (data locality?) pasta gets better - * throughput if we receive tap datagrams one at a atime. For small - * splice datagrams throughput is slightly better if we do batch, but - * it's slightly worse for large splice datagrams. Since we don't know - * before we receive whether we'll use tap or splice, always go one at a - * time for pasta mode. - */ - int n = (c->mode == MODE_PASTA ? 1 : UDP_MAX_FRAMES); - ASSERT(!c->no_udp); n = recvmmsg(s, mmh, n, 0, NULL); @@ -671,9 +663,10 @@ static void udp_buf_listen_sock_data(const struct ctx *c, union epoll_ref ref, const struct timespec *now) { const socklen_t sasize = sizeof(udp_meta[0].s_in); - int n, i; + /* See udp_buf_sock_data() comment */ + int n = (c->mode == MODE_PASTA ? 
1 : UDP_MAX_FRAMES), i; - if ((n = udp_sock_recv(c, ref.fd, udp_mh_recv)) <= 0) + if ((n = udp_sock_recv(c, ref.fd, udp_mh_recv, n)) <= 0) return; /* We divide datagrams into batches based on how we need to send them, @@ -768,9 +761,15 @@ static bool udp_buf_reply_sock_data(const struct ctx *c, { const struct flowside *toside = flowside_at_sidx(tosidx); uint8_t topif = pif_at_sidx(tosidx); - int n, i; + /* For not entirely clear reasons (data locality?) pasta gets better + * throughput if we receive tap datagrams one at a time. For small + * splice datagrams throughput is slightly better if we do batch, but + * it's slightly worse for large splice datagrams. Since we don't know + * the size before we receive, always go one at a time for pasta mode. + */ + int n = (c->mode == MODE_PASTA ? 1 : UDP_MAX_FRAMES), i; - if ((n = udp_sock_recv(c, s, udp_mh_recv)) <= 0) + if ((n = udp_sock_recv(c, s, udp_mh_recv, n)) <= 0) return true; for (i = 0; i < n; i++) { From 84ab1305fabaf07b5badf433e55a458de5b86918 Mon Sep 17 00:00:00 2001 From: David Gibson <david@gibson.dropbear.id.au> Date: Fri, 4 Apr 2025 21:15:33 +1100 Subject: [PATCH 334/382] udp: Polish udp_vu_sock_info() and remove from vu specific code udp_vu_sock_info() uses MSG_PEEK to look ahead at the next datagram to be received and gets its source address. Currently we only use it in the vhost-user path, but there's nothing inherently vhost-user specific about it. We have upcoming uses for it elsewhere so rename and move to udp.c. While we're there, polish its error reporting a little. 
Signed-off-by: David Gibson <david@gibson.dropbear.id.au> [sbrivio: Drop excess newline before udp_sock_recv()] Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- udp.c | 24 ++++++++++++++++++++++++ udp_internal.h | 1 + udp_vu.c | 19 +------------------ 3 files changed, 26 insertions(+), 18 deletions(-) diff --git a/udp.c b/udp.c index 53403bf..1e241c8 100644 --- a/udp.c +++ b/udp.c @@ -629,6 +629,30 @@ static int udp_sock_errs(const struct ctx *c, union epoll_ref ref) return n_err; } +/** + * udp_peek_addr() - Get source address for next packet + * @s: Socket to get information from + * @src: Socket address (output) + * + * Return: 0 on success, -1 otherwise + */ +int udp_peek_addr(int s, union sockaddr_inany *src) +{ + struct msghdr msg = { + .msg_name = src, + .msg_namelen = sizeof(*src), + }; + int rc; + + rc = recvmsg(s, &msg, MSG_PEEK | MSG_DONTWAIT); + if (rc < 0) { + if (errno != EAGAIN && errno != EWOULDBLOCK) + warn_perror("Error peeking at socket address"); + return rc; + } + return 0; +} + /** * udp_sock_recv() - Receive datagrams from a socket * @c: Execution context diff --git a/udp_internal.h b/udp_internal.h index 02724e5..43a6109 100644 --- a/udp_internal.h +++ b/udp_internal.h @@ -30,5 +30,6 @@ size_t udp_update_hdr4(struct iphdr *ip4h, struct udp_payload_t *bp, size_t udp_update_hdr6(struct ipv6hdr *ip6h, struct udp_payload_t *bp, const struct flowside *toside, size_t dlen, bool no_udp_csum); +int udp_peek_addr(int s, union sockaddr_inany *src); #endif /* UDP_INTERNAL_H */ diff --git a/udp_vu.c b/udp_vu.c index 4153b6c..5faf1e1 100644 --- a/udp_vu.c +++ b/udp_vu.c @@ -57,23 +57,6 @@ static size_t udp_vu_hdrlen(bool v6) return hdrlen; } -/** - * udp_vu_sock_info() - get socket information - * @s: Socket to get information from - * @s_in: Socket address (output) - * - * Return: 0 if socket address can be read, -1 otherwise - */ -static int udp_vu_sock_info(int s, union sockaddr_inany *s_in) -{ - struct msghdr msg = { - .msg_name = s_in, - 
.msg_namelen = sizeof(union sockaddr_inany), - }; - - return recvmsg(s, &msg, MSG_PEEK | MSG_DONTWAIT); -} - /** * udp_vu_sock_recv() - Receive datagrams from socket into vhost-user buffers * @c: Execution context @@ -230,7 +213,7 @@ void udp_vu_listen_sock_data(const struct ctx *c, union epoll_ref ref, int iov_used; bool v6; - if (udp_vu_sock_info(ref.fd, &s_in) < 0) + if (udp_peek_addr(ref.fd, &s_in) < 0) break; sidx = udp_flow_from_sock(c, ref, &s_in, now); From 3a0881dfd02d758b0dc8ca6f5732bcb666b6d21e Mon Sep 17 00:00:00 2001 From: David Gibson <david@gibson.dropbear.id.au> Date: Fri, 4 Apr 2025 21:15:34 +1100 Subject: [PATCH 335/382] udp: Don't bother to batch datagrams from "listening" socket A "listening" UDP socket can receive datagrams from multiple flows. So, we currently have some quite subtle and complex code in udp_buf_listen_sock_data() to group contiguously received packets for the same flow into batches for forwarding. However, since we are now always using flow specific connect()ed sockets once a flow is established, handling of datagrams on listening sockets is essentially a slow path. Given that, it's not worth the complexity. Substantially simplify the code by using an approach more like vhost-user, and "peeking" at the address of the next datagram, one at a time to determine the correct flow before we actually receive the data. This removes all meaningful use of the s_in and tosidx fields in udp_meta_t, so they too can be removed, along with setting of msg_name and msg_namelen in the msghdr arrays which referenced them. 
Signed-off-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- udp.c | 75 +++++++++++++++-------------------------------------------- 1 file changed, 19 insertions(+), 56 deletions(-) diff --git a/udp.c b/udp.c index 1e241c8..4d32124 100644 --- a/udp.c +++ b/udp.c @@ -138,20 +138,15 @@ static struct ethhdr udp4_eth_hdr; static struct ethhdr udp6_eth_hdr; /** - * struct udp_meta_t - Pre-cooked headers and metadata for UDP packets + * struct udp_meta_t - Pre-cooked headers for UDP packets * @ip6h: Pre-filled IPv6 header (except for payload_len and addresses) * @ip4h: Pre-filled IPv4 header (except for tot_len and saddr) * @taph: Tap backend specific header - * @s_in: Source socket address, filled in by recvmmsg() - * @tosidx: sidx for the destination side of this datagram's flow */ static struct udp_meta_t { struct ipv6hdr ip6h; struct iphdr ip4h; struct tap_hdr taph; - - union sockaddr_inany s_in; - flow_sidx_t tosidx; } #ifdef __AVX2__ __attribute__ ((aligned(32))) @@ -234,8 +229,6 @@ static void udp_iov_init_one(const struct ctx *c, size_t i) tiov[UDP_IOV_TAP] = tap_hdr_iov(c, &meta->taph); tiov[UDP_IOV_PAYLOAD].iov_base = payload; - mh->msg_name = &meta->s_in; - mh->msg_namelen = sizeof(meta->s_in); mh->msg_iov = siov; mh->msg_iovlen = 1; } @@ -686,60 +679,32 @@ static int udp_sock_recv(const struct ctx *c, int s, struct mmsghdr *mmh, int n) static void udp_buf_listen_sock_data(const struct ctx *c, union epoll_ref ref, const struct timespec *now) { - const socklen_t sasize = sizeof(udp_meta[0].s_in); - /* See udp_buf_sock_data() comment */ - int n = (c->mode == MODE_PASTA ? 
1 : UDP_MAX_FRAMES), i; + union sockaddr_inany src; - if ((n = udp_sock_recv(c, ref.fd, udp_mh_recv, n)) <= 0) - return; + while (udp_peek_addr(ref.fd, &src) == 0) { + flow_sidx_t tosidx = udp_flow_from_sock(c, ref, &src, now); + uint8_t topif = pif_at_sidx(tosidx); - /* We divide datagrams into batches based on how we need to send them, - * determined by udp_meta[i].tosidx. To avoid either two passes through - * the array, or recalculating tosidx for a single entry, we have to - * populate it one entry *ahead* of the loop counter. - */ - udp_meta[0].tosidx = udp_flow_from_sock(c, ref, &udp_meta[0].s_in, now); - udp_mh_recv[0].msg_hdr.msg_namelen = sasize; - for (i = 0; i < n; ) { - flow_sidx_t batchsidx = udp_meta[i].tosidx; - uint8_t batchpif = pif_at_sidx(batchsidx); - int batchstart = i; + if (udp_sock_recv(c, ref.fd, udp_mh_recv, 1) <= 0) + break; - do { - if (pif_is_socket(batchpif)) { - udp_splice_prepare(udp_mh_recv, i); - } else if (batchpif == PIF_TAP) { - udp_tap_prepare(udp_mh_recv, i, - flowside_at_sidx(batchsidx), - false); - } - - if (++i >= n) - break; - - udp_meta[i].tosidx = udp_flow_from_sock(c, ref, - &udp_meta[i].s_in, - now); - udp_mh_recv[i].msg_hdr.msg_namelen = sasize; - } while (flow_sidx_eq(udp_meta[i].tosidx, batchsidx)); - - if (pif_is_socket(batchpif)) { - udp_splice_send(c, batchstart, i - batchstart, - batchsidx); - } else if (batchpif == PIF_TAP) { - tap_send_frames(c, &udp_l2_iov[batchstart][0], - UDP_NUM_IOVS, i - batchstart); - } else if (flow_sidx_valid(batchsidx)) { - flow_sidx_t fromsidx = flow_sidx_opposite(batchsidx); - struct udp_flow *uflow = udp_at_sidx(batchsidx); + if (pif_is_socket(topif)) { + udp_splice_prepare(udp_mh_recv, 0); + udp_splice_send(c, 0, 1, tosidx); + } else if (topif == PIF_TAP) { + udp_tap_prepare(udp_mh_recv, 0, flowside_at_sidx(tosidx), + false); + tap_send_frames(c, &udp_l2_iov[0][0], UDP_NUM_IOVS, 1); + } else if (flow_sidx_valid(tosidx)) { + flow_sidx_t fromsidx = flow_sidx_opposite(tosidx); + 
struct udp_flow *uflow = udp_at_sidx(tosidx); flow_err(uflow, "No support for forwarding UDP from %s to %s", pif_name(pif_at_sidx(fromsidx)), - pif_name(batchpif)); + pif_name(topif)); } else { - debug("Discarding %d datagrams without flow", - i - batchstart); + debug("Discarding datagram without flow"); } } } @@ -801,8 +766,6 @@ static bool udp_buf_reply_sock_data(const struct ctx *c, udp_splice_prepare(udp_mh_recv, i); else if (topif == PIF_TAP) udp_tap_prepare(udp_mh_recv, i, toside, false); - /* Restore sockaddr length clobbered by recvmsg() */ - udp_mh_recv[i].msg_hdr.msg_namelen = sizeof(udp_meta[i].s_in); } if (pif_is_socket(topif)) { From 5221e177e132b8b5001ec97f42975ad1251f7110 Mon Sep 17 00:00:00 2001 From: David Gibson <david@gibson.dropbear.id.au> Date: Fri, 4 Apr 2025 21:15:35 +1100 Subject: [PATCH 336/382] udp: Parameterize number of datagrams handled by udp_*_reply_sock_data() Both udp_buf_reply_sock_data() and udp_vu_reply_sock_data() internally decide what the maximum number of datagrams they will forward is. We have some upcoming reasons to allow the caller to decide that instead, so make the maximum number of datagrams a parameter for both of them. 
Signed-off-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- udp.c | 31 ++++++++++++++++++------------- udp_vu.c | 6 ++++-- udp_vu.h | 3 ++- 3 files changed, 24 insertions(+), 16 deletions(-) diff --git a/udp.c b/udp.c index 4d32124..0f09e67 100644 --- a/udp.c +++ b/udp.c @@ -741,22 +741,17 @@ void udp_listen_sock_handler(const struct ctx *c, * udp_buf_reply_sock_data() - Handle new data from flow specific socket * @c: Execution context * @s: Socket to read data from + * @n: Maximum number of datagrams to forward * @tosidx: Flow & side to forward data from @s to * * Return: true on success, false if can't forward from socket to flow's pif */ -static bool udp_buf_reply_sock_data(const struct ctx *c, - int s, flow_sidx_t tosidx) +static bool udp_buf_reply_sock_data(const struct ctx *c, int s, int n, + flow_sidx_t tosidx) { const struct flowside *toside = flowside_at_sidx(tosidx); uint8_t topif = pif_at_sidx(tosidx); - /* For not entirely clear reasons (data locality?) pasta gets better - * throughput if we receive tap datagrams one at a time. For small - * splice datagrams throughput is slightly better if we do batch, but - * it's slightly worse for large splice datagrams. Since we don't know - * the size before we receive, always go one at a time for pasta mode. - */ - int n = (c->mode == MODE_PASTA ? 1 : UDP_MAX_FRAMES), i; + int i; if ((n = udp_sock_recv(c, s, udp_mh_recv, n)) <= 0) return true; @@ -801,6 +796,14 @@ void udp_sock_handler(const struct ctx *c, union epoll_ref ref, } if (events & EPOLLIN) { + /* For not entirely clear reasons (data locality?) pasta gets + * better throughput if we receive tap datagrams one at a + * time. For small splice datagrams throughput is slightly + * better if we do batch, but it's slightly worse for large + * splice datagrams. Since we don't know the size before we + * receive, always go one at a time for pasta mode. + */ + size_t n = (c->mode == MODE_PASTA ? 
1 : UDP_MAX_FRAMES); flow_sidx_t tosidx = flow_sidx_opposite(ref.flowside); int s = ref.fd; bool ret; @@ -808,10 +811,12 @@ void udp_sock_handler(const struct ctx *c, union epoll_ref ref, flow_trace(uflow, "Received data on reply socket"); uflow->ts = now->tv_sec; - if (c->mode == MODE_VU) - ret = udp_vu_reply_sock_data(c, s, tosidx); - else - ret = udp_buf_reply_sock_data(c, s, tosidx); + if (c->mode == MODE_VU) { + ret = udp_vu_reply_sock_data(c, s, UDP_MAX_FRAMES, + tosidx); + } else { + ret = udp_buf_reply_sock_data(c, s, n, tosidx); + } if (!ret) { flow_err(uflow, "Unable to forward UDP"); diff --git a/udp_vu.c b/udp_vu.c index 5faf1e1..b2618b3 100644 --- a/udp_vu.c +++ b/udp_vu.c @@ -257,11 +257,13 @@ void udp_vu_listen_sock_data(const struct ctx *c, union epoll_ref ref, * udp_vu_reply_sock_data() - Handle new data from flow specific socket * @c: Execution context * @s: Socket to read data from + * @n: Maximum number of datagrams to forward * @tosidx: Flow & side to forward data from @s to * * Return: true on success, false if can't forward from socket to flow's pif */ -bool udp_vu_reply_sock_data(const struct ctx *c, int s, flow_sidx_t tosidx) +bool udp_vu_reply_sock_data(const struct ctx *c, int s, int n, + flow_sidx_t tosidx) { const struct flowside *toside = flowside_at_sidx(tosidx); bool v6 = !(inany_v4(&toside->eaddr) && inany_v4(&toside->oaddr)); @@ -272,7 +274,7 @@ bool udp_vu_reply_sock_data(const struct ctx *c, int s, flow_sidx_t tosidx) if (pif_at_sidx(tosidx) != PIF_TAP) return false; - for (i = 0; i < UDP_MAX_FRAMES; i++) { + for (i = 0; i < n; i++) { ssize_t dlen; int iov_used; diff --git a/udp_vu.h b/udp_vu.h index 6d541a4..c897c36 100644 --- a/udp_vu.h +++ b/udp_vu.h @@ -8,6 +8,7 @@ void udp_vu_listen_sock_data(const struct ctx *c, union epoll_ref ref, const struct timespec *now); -bool udp_vu_reply_sock_data(const struct ctx *c, int s, flow_sidx_t tosidx); +bool udp_vu_reply_sock_data(const struct ctx *c, int s, int n, + flow_sidx_t tosidx); 
#endif /* UDP_VU_H */ From 0304dd9c34a7dd29c3a8a2058626a971d4e71a8e Mon Sep 17 00:00:00 2001 From: David Gibson <david@gibson.dropbear.id.au> Date: Fri, 4 Apr 2025 21:15:36 +1100 Subject: [PATCH 337/382] udp: Split spliced forwarding path from udp_buf_reply_sock_data() udp_buf_reply_sock_data() can handle forwarding data either from socket to socket ("splicing") or from socket to tap. It has a test on each datagram for which case we're in, but that will be the same for everything in the batch. Split out the spliced path into a separate udp_sock_to_sock() function. This leaves udp_{buf,vu}_reply_sock_data() handling only forwards from socket to tap, so rename and simplify them accordingly. This makes the code slightly longer for now, but will allow future cleanups to shrink it back down again. Signed-off-by: David Gibson <david@gibson.dropbear.id.au> [sbrivio: Fix typos in comments to udp_sock_recv() and udp_vu_listen_sock_data()] Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- udp.c | 103 ++++++++++++++++++++++++++++++------------------------- udp_vu.c | 12 ++----- udp_vu.h | 3 +- 3 files changed, 60 insertions(+), 58 deletions(-) diff --git a/udp.c b/udp.c index 0f09e67..2745e5d 100644 --- a/udp.c +++ b/udp.c @@ -670,6 +670,49 @@ static int udp_sock_recv(const struct ctx *c, int s, struct mmsghdr *mmh, int n) return n; } +/** + * udp_sock_to_sock() - Forward datagrams from socket to socket + * @c: Execution context + * @from_s: Socket to receive datagrams from + * @n: Maximum number of datagrams to forward + * @tosidx: Flow & side to forward datagrams to + */ +static void udp_sock_to_sock(const struct ctx *c, int from_s, int n, + flow_sidx_t tosidx) +{ + int i; + + if ((n = udp_sock_recv(c, from_s, udp_mh_recv, n)) <= 0) + return; + + for (i = 0; i < n; i++) + udp_splice_prepare(udp_mh_recv, i); + + udp_splice_send(c, 0, n, tosidx); +} + +/** + * udp_buf_sock_to_tap() - Forward datagrams from socket to tap + * @c: Execution context + * @s: Socket to read 
data from + * @n: Maximum number of datagrams to forward + * @tosidx: Flow & side to forward data from @s to + */ +static void udp_buf_sock_to_tap(const struct ctx *c, int s, int n, + flow_sidx_t tosidx) +{ + const struct flowside *toside = flowside_at_sidx(tosidx); + int i; + + if ((n = udp_sock_recv(c, s, udp_mh_recv, n)) <= 0) + return; + + for (i = 0; i < n; i++) + udp_tap_prepare(udp_mh_recv, i, toside, false); + + tap_send_frames(c, &udp_l2_iov[0][0], UDP_NUM_IOVS, n); +} + /** * udp_buf_listen_sock_data() - Handle new data from socket * @c: Execution context @@ -737,43 +780,6 @@ void udp_listen_sock_handler(const struct ctx *c, } } -/** - * udp_buf_reply_sock_data() - Handle new data from flow specific socket - * @c: Execution context - * @s: Socket to read data from - * @n: Maximum number of datagrams to forward - * @tosidx: Flow & side to forward data from @s to - * - * Return: true on success, false if can't forward from socket to flow's pif - */ -static bool udp_buf_reply_sock_data(const struct ctx *c, int s, int n, - flow_sidx_t tosidx) -{ - const struct flowside *toside = flowside_at_sidx(tosidx); - uint8_t topif = pif_at_sidx(tosidx); - int i; - - if ((n = udp_sock_recv(c, s, udp_mh_recv, n)) <= 0) - return true; - - for (i = 0; i < n; i++) { - if (pif_is_socket(topif)) - udp_splice_prepare(udp_mh_recv, i); - else if (topif == PIF_TAP) - udp_tap_prepare(udp_mh_recv, i, toside, false); - } - - if (pif_is_socket(topif)) { - udp_splice_send(c, 0, n, tosidx); - } else if (topif == PIF_TAP) { - tap_send_frames(c, &udp_l2_iov[0][0], UDP_NUM_IOVS, n); - } else { - return false; - } - - return true; -} - /** * udp_sock_handler() - Handle new data from flow specific socket * @c: Execution context @@ -805,21 +811,26 @@ void udp_sock_handler(const struct ctx *c, union epoll_ref ref, */ size_t n = (c->mode == MODE_PASTA ? 
1 : UDP_MAX_FRAMES); flow_sidx_t tosidx = flow_sidx_opposite(ref.flowside); + uint8_t topif = pif_at_sidx(tosidx); int s = ref.fd; - bool ret; flow_trace(uflow, "Received data on reply socket"); uflow->ts = now->tv_sec; - if (c->mode == MODE_VU) { - ret = udp_vu_reply_sock_data(c, s, UDP_MAX_FRAMES, - tosidx); + if (pif_is_socket(topif)) { + udp_sock_to_sock(c, ref.fd, n, tosidx); + } else if (topif == PIF_TAP) { + if (c->mode == MODE_VU) { + udp_vu_sock_to_tap(c, s, UDP_MAX_FRAMES, + tosidx); + } else { + udp_buf_sock_to_tap(c, s, n, tosidx); + } } else { - ret = udp_buf_reply_sock_data(c, s, n, tosidx); - } - - if (!ret) { - flow_err(uflow, "Unable to forward UDP"); + flow_err(uflow, + "No support for forwarding UDP from %s to %s", + pif_name(pif_at_sidx(ref.flowside)), + pif_name(topif)); goto fail; } } diff --git a/udp_vu.c b/udp_vu.c index b2618b3..8e02093 100644 --- a/udp_vu.c +++ b/udp_vu.c @@ -254,16 +254,13 @@ void udp_vu_listen_sock_data(const struct ctx *c, union epoll_ref ref, } /** - * udp_vu_reply_sock_data() - Handle new data from flow specific socket + * udp_vu_sock_to_tap() - Forward datagrams from socket to tap * @c: Execution context * @s: Socket to read data from * @n: Maximum number of datagrams to forward * @tosidx: Flow & side to forward data from @s to - * - * Return: true on success, false if can't forward from socket to flow's pif */ -bool udp_vu_reply_sock_data(const struct ctx *c, int s, int n, - flow_sidx_t tosidx) +void udp_vu_sock_to_tap(const struct ctx *c, int s, int n, flow_sidx_t tosidx) { const struct flowside *toside = flowside_at_sidx(tosidx); bool v6 = !(inany_v4(&toside->eaddr) && inany_v4(&toside->oaddr)); @@ -271,9 +268,6 @@ bool udp_vu_reply_sock_data(const struct ctx *c, int s, int n, struct vu_virtq *vq = &vdev->vq[VHOST_USER_RX_QUEUE]; int i; - if (pif_at_sidx(tosidx) != PIF_TAP) - return false; - for (i = 0; i < n; i++) { ssize_t dlen; int iov_used; @@ -290,6 +284,4 @@ bool udp_vu_reply_sock_data(const struct ctx *c, 
int s, int n, } vu_flush(vdev, vq, elem, iov_used); } - - return true; } diff --git a/udp_vu.h b/udp_vu.h index c897c36..576b0e7 100644 --- a/udp_vu.h +++ b/udp_vu.h @@ -8,7 +8,6 @@ void udp_vu_listen_sock_data(const struct ctx *c, union epoll_ref ref, const struct timespec *now); -bool udp_vu_reply_sock_data(const struct ctx *c, int s, int n, - flow_sidx_t tosidx); +void udp_vu_sock_to_tap(const struct ctx *c, int s, int n, flow_sidx_t tosidx); #endif /* UDP_VU_H */ From fc6ee68ad3a8863cba534dfa4b88767114a6701e Mon Sep 17 00:00:00 2001 From: David Gibson <david@gibson.dropbear.id.au> Date: Fri, 4 Apr 2025 21:15:37 +1100 Subject: [PATCH 338/382] udp: Merge vhost-user and "buf" listening socket paths udp_buf_listen_sock_data() and udp_vu_listen_sock_data() now have effectively identical structure. The forwarding functions used for flow specific sockets (udp_buf_sock_to_tap(), udp_vu_sock_to_tap() and udp_sock_to_sock()) also now take a number of datagrams. This means we can re-use them for the listening socket path, just passing '1' so they handle a single datagram at a time. This allows us to merge both the vhost-user and flow specific paths into a single, simpler udp_listen_sock_data(). 
Signed-off-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- udp.c | 27 ++++++++-------------- udp_internal.h | 1 - udp_vu.c | 62 -------------------------------------------------- 3 files changed, 10 insertions(+), 80 deletions(-) diff --git a/udp.c b/udp.c index 2745e5d..b0a7bf7 100644 --- a/udp.c +++ b/udp.c @@ -629,7 +629,7 @@ static int udp_sock_errs(const struct ctx *c, union epoll_ref ref) * * Return: 0 on success, -1 otherwise */ -int udp_peek_addr(int s, union sockaddr_inany *src) +static int udp_peek_addr(int s, union sockaddr_inany *src) { struct msghdr msg = { .msg_name = src, @@ -714,12 +714,12 @@ static void udp_buf_sock_to_tap(const struct ctx *c, int s, int n, } /** - * udp_buf_listen_sock_data() - Handle new data from socket + * udp_listen_sock_data() - Handle new data from listening socket * @c: Execution context * @ref: epoll reference * @now: Current timestamp */ -static void udp_buf_listen_sock_data(const struct ctx *c, union epoll_ref ref, +static void udp_listen_sock_data(const struct ctx *c, union epoll_ref ref, const struct timespec *now) { union sockaddr_inany src; @@ -728,16 +728,13 @@ static void udp_buf_listen_sock_data(const struct ctx *c, union epoll_ref ref, flow_sidx_t tosidx = udp_flow_from_sock(c, ref, &src, now); uint8_t topif = pif_at_sidx(tosidx); - if (udp_sock_recv(c, ref.fd, udp_mh_recv, 1) <= 0) - break; - if (pif_is_socket(topif)) { - udp_splice_prepare(udp_mh_recv, 0); - udp_splice_send(c, 0, 1, tosidx); + udp_sock_to_sock(c, ref.fd, 1, tosidx); } else if (topif == PIF_TAP) { - udp_tap_prepare(udp_mh_recv, 0, flowside_at_sidx(tosidx), - false); - tap_send_frames(c, &udp_l2_iov[0][0], UDP_NUM_IOVS, 1); + if (c->mode == MODE_VU) + udp_vu_sock_to_tap(c, ref.fd, 1, tosidx); + else + udp_buf_sock_to_tap(c, ref.fd, 1, tosidx); } else if (flow_sidx_valid(tosidx)) { flow_sidx_t fromsidx = flow_sidx_opposite(tosidx); struct udp_flow *uflow = udp_at_sidx(tosidx); @@ -772,12 +769,8 
@@ void udp_listen_sock_handler(const struct ctx *c, } } - if (events & EPOLLIN) { - if (c->mode == MODE_VU) - udp_vu_listen_sock_data(c, ref, now); - else - udp_buf_listen_sock_data(c, ref, now); - } + if (events & EPOLLIN) + udp_listen_sock_data(c, ref, now); } /** diff --git a/udp_internal.h b/udp_internal.h index 43a6109..02724e5 100644 --- a/udp_internal.h +++ b/udp_internal.h @@ -30,6 +30,5 @@ size_t udp_update_hdr4(struct iphdr *ip4h, struct udp_payload_t *bp, size_t udp_update_hdr6(struct ipv6hdr *ip6h, struct udp_payload_t *bp, const struct flowside *toside, size_t dlen, bool no_udp_csum); -int udp_peek_addr(int s, union sockaddr_inany *src); #endif /* UDP_INTERNAL_H */ diff --git a/udp_vu.c b/udp_vu.c index 8e02093..1f89509 100644 --- a/udp_vu.c +++ b/udp_vu.c @@ -191,68 +191,6 @@ static void udp_vu_csum(const struct flowside *toside, int iov_used) } } -/** - * udp_vu_listen_sock_data() - Handle new data from socket - * @c: Execution context - * @ref: epoll reference - * @now: Current timestamp - */ -void udp_vu_listen_sock_data(const struct ctx *c, union epoll_ref ref, - const struct timespec *now) -{ - struct vu_dev *vdev = c->vdev; - struct vu_virtq *vq = &vdev->vq[VHOST_USER_RX_QUEUE]; - int i; - - for (i = 0; i < UDP_MAX_FRAMES; i++) { - const struct flowside *toside; - union sockaddr_inany s_in; - flow_sidx_t sidx; - uint8_t pif; - ssize_t dlen; - int iov_used; - bool v6; - - if (udp_peek_addr(ref.fd, &s_in) < 0) - break; - - sidx = udp_flow_from_sock(c, ref, &s_in, now); - pif = pif_at_sidx(sidx); - - if (pif != PIF_TAP) { - if (flow_sidx_valid(sidx)) { - flow_sidx_t fromsidx = flow_sidx_opposite(sidx); - struct udp_flow *uflow = udp_at_sidx(sidx); - - flow_err(uflow, - "No support for forwarding UDP from %s to %s", - pif_name(pif_at_sidx(fromsidx)), - pif_name(pif)); - } else { - debug("Discarding 1 datagram without flow"); - } - - continue; - } - - toside = flowside_at_sidx(sidx); - - v6 = !(inany_v4(&toside->eaddr) && inany_v4(&toside->oaddr)); 
- - iov_used = udp_vu_sock_recv(c, ref.fd, v6, &dlen); - if (iov_used <= 0) - break; - - udp_vu_prepare(c, toside, dlen); - if (*c->pcap) { - udp_vu_csum(toside, iov_used); - pcap_iov(iov_vu, iov_used, - sizeof(struct virtio_net_hdr_mrg_rxbuf)); - } - vu_flush(vdev, vq, elem, iov_used); - } -} - /** * udp_vu_sock_to_tap() - Forward datagrams from socket to tap * @c: Execution context From fd844a90bce0274d2488370ed7fadd850b6a0294 Mon Sep 17 00:00:00 2001 From: David Gibson <david@gibson.dropbear.id.au> Date: Fri, 4 Apr 2025 21:15:38 +1100 Subject: [PATCH 339/382] udp: Move UDP_MAX_FRAMES to udp.c Recent changes mean that this define is no longer used anywhere except in udp.c. Move it back into udp.c from udp_internal.h. Signed-off-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- udp.c | 2 ++ udp_internal.h | 2 -- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/udp.c b/udp.c index b0a7bf7..f74a992 100644 --- a/udp.c +++ b/udp.c @@ -116,6 +116,8 @@ #include "udp_internal.h" #include "udp_vu.h" +#define UDP_MAX_FRAMES 32 /* max # of frames to receive at once */ + /* Maximum UDP data to be returned in ICMP messages */ #define ICMP4_MAX_DLEN 8 #define ICMP6_MAX_DLEN (IPV6_MIN_MTU \ diff --git a/udp_internal.h b/udp_internal.h index 02724e5..f7d8426 100644 --- a/udp_internal.h +++ b/udp_internal.h @@ -8,8 +8,6 @@ #include "tap.h" /* needed by udp_meta_t */ -#define UDP_MAX_FRAMES 32 /* max # of frames to receive at once */ - /** * struct udp_payload_t - UDP header and data for inbound messages * @uh: UDP header From 159beefa36a09fc36cc9669fd536926d84c7c342 Mon Sep 17 00:00:00 2001 From: David Gibson <david@gibson.dropbear.id.au> Date: Fri, 4 Apr 2025 21:15:39 +1100 Subject: [PATCH 340/382] udp_flow: Take pif and port as explicit parameters to udp_flow_from_sock() Currently udp_flow_from_sock() is only used when receiving a datagram from a "listening" socket. 
It takes the listening socket's epoll reference to get the interface and port on which the datagram arrived. We have some upcoming cases where we want to use this in different contexts, so make it take the pif and port as direct parameters instead. Signed-off-by: David Gibson <david@gibson.dropbear.id.au> [sbrivio: Drop @ref from comment to udp_flow_from_sock()] Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- udp.c | 4 +++- udp_flow.c | 16 +++++++--------- udp_flow.h | 2 +- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/udp.c b/udp.c index f74a992..157697e 100644 --- a/udp.c +++ b/udp.c @@ -727,7 +727,9 @@ static void udp_listen_sock_data(const struct ctx *c, union epoll_ref ref, union sockaddr_inany src; while (udp_peek_addr(ref.fd, &src) == 0) { - flow_sidx_t tosidx = udp_flow_from_sock(c, ref, &src, now); + flow_sidx_t tosidx = udp_flow_from_sock(c, ref.udp.pif, + ref.udp.port, &src, + now); uint8_t topif = pif_at_sidx(tosidx); if (pif_is_socket(topif)) { diff --git a/udp_flow.c b/udp_flow.c index a2d417f..5afe6e5 100644 --- a/udp_flow.c +++ b/udp_flow.c @@ -161,9 +161,10 @@ cancel: } /** - * udp_flow_from_sock() - Find or create UDP flow for "listening" socket + * udp_flow_from_sock() - Find or create UDP flow for incoming datagram * @c: Execution context - * @ref: epoll reference of the receiving socket + * @pif: Interface the datagram is arriving from + * @port: Our (local) port number to which the datagram is arriving * @s_in: Source socket address, filled in by recvmmsg() * @now: Timestamp * @@ -172,7 +173,7 @@ cancel: * Return: sidx for the destination side of the flow for this packet, or * FLOW_SIDX_NONE if we couldn't find or create a flow. 
*/ -flow_sidx_t udp_flow_from_sock(const struct ctx *c, union epoll_ref ref, +flow_sidx_t udp_flow_from_sock(const struct ctx *c, uint8_t pif, in_port_t port, const union sockaddr_inany *s_in, const struct timespec *now) { @@ -181,9 +182,7 @@ flow_sidx_t udp_flow_from_sock(const struct ctx *c, union epoll_ref ref, union flow *flow; flow_sidx_t sidx; - ASSERT(ref.type == EPOLL_TYPE_UDP_LISTEN); - - sidx = flow_lookup_sa(c, IPPROTO_UDP, ref.udp.pif, s_in, ref.udp.port); + sidx = flow_lookup_sa(c, IPPROTO_UDP, pif, s_in, port); if ((uflow = udp_at_sidx(sidx))) { uflow->ts = now->tv_sec; return flow_sidx_opposite(sidx); @@ -193,12 +192,11 @@ flow_sidx_t udp_flow_from_sock(const struct ctx *c, union epoll_ref ref, char sastr[SOCKADDR_STRLEN]; debug("Couldn't allocate flow for UDP datagram from %s %s", - pif_name(ref.udp.pif), - sockaddr_ntop(s_in, sastr, sizeof(sastr))); + pif_name(pif), sockaddr_ntop(s_in, sastr, sizeof(sastr))); return FLOW_SIDX_NONE; } - ini = flow_initiate_sa(flow, ref.udp.pif, s_in, ref.udp.port); + ini = flow_initiate_sa(flow, pif, s_in, port); if (!inany_is_unicast(&ini->eaddr) || ini->eport == 0 || ini->oport == 0) { diff --git a/udp_flow.h b/udp_flow.h index 520de62..bbdeb2a 100644 --- a/udp_flow.h +++ b/udp_flow.h @@ -26,7 +26,7 @@ struct udp_flow { }; struct udp_flow *udp_at_sidx(flow_sidx_t sidx); -flow_sidx_t udp_flow_from_sock(const struct ctx *c, union epoll_ref ref, +flow_sidx_t udp_flow_from_sock(const struct ctx *c, uint8_t pif, in_port_t port, const union sockaddr_inany *s_in, const struct timespec *now); flow_sidx_t udp_flow_from_tap(const struct ctx *c, From bd6a41ee76bb9a0da2150d76dbabf9a3212d0fca Mon Sep 17 00:00:00 2001 From: David Gibson <david@gibson.dropbear.id.au> Date: Fri, 4 Apr 2025 21:15:40 +1100 Subject: [PATCH 341/382] udp: Rework udp_listen_sock_data() into udp_sock_fwd() udp_listen_sock_data() forwards datagrams from a "listening" socket until there are no more (for now). 
We have an upcoming use case where we want to do that for a socket that's not a "listening" socket, and uses a different epoll reference. So, adjust the function to take the pieces it needs from the reference as direct parameters and rename to udp_sock_fwd(). Signed-off-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- udp.c | 29 ++++++++++++++--------------- 1 file changed, 14 insertions(+), 15 deletions(-) diff --git a/udp.c b/udp.c index 157697e..20d8f0c 100644 --- a/udp.c +++ b/udp.c @@ -716,37 +716,36 @@ static void udp_buf_sock_to_tap(const struct ctx *c, int s, int n, } /** - * udp_listen_sock_data() - Handle new data from listening socket + * udp_sock_fwd() - Forward datagrams from a possibly unconnected socket * @c: Execution context - * @ref: epoll reference + * @s: Socket to forward from + * @frompif: Interface to which @s belongs + * @port: Our (local) port number of @s * @now: Current timestamp */ -static void udp_listen_sock_data(const struct ctx *c, union epoll_ref ref, - const struct timespec *now) +static void udp_sock_fwd(const struct ctx *c, int s, uint8_t frompif, + in_port_t port, const struct timespec *now) { union sockaddr_inany src; - while (udp_peek_addr(ref.fd, &src) == 0) { - flow_sidx_t tosidx = udp_flow_from_sock(c, ref.udp.pif, - ref.udp.port, &src, - now); + while (udp_peek_addr(s, &src) == 0) { + flow_sidx_t tosidx = udp_flow_from_sock(c, frompif, port, + &src, now); uint8_t topif = pif_at_sidx(tosidx); if (pif_is_socket(topif)) { - udp_sock_to_sock(c, ref.fd, 1, tosidx); + udp_sock_to_sock(c, s, 1, tosidx); } else if (topif == PIF_TAP) { if (c->mode == MODE_VU) - udp_vu_sock_to_tap(c, ref.fd, 1, tosidx); + udp_vu_sock_to_tap(c, s, 1, tosidx); else - udp_buf_sock_to_tap(c, ref.fd, 1, tosidx); + udp_buf_sock_to_tap(c, s, 1, tosidx); } else if (flow_sidx_valid(tosidx)) { - flow_sidx_t fromsidx = flow_sidx_opposite(tosidx); struct udp_flow *uflow = udp_at_sidx(tosidx); flow_err(uflow, 
"No support for forwarding UDP from %s to %s", - pif_name(pif_at_sidx(fromsidx)), - pif_name(topif)); + pif_name(frompif), pif_name(topif)); } else { debug("Discarding datagram without flow"); } @@ -774,7 +773,7 @@ void udp_listen_sock_handler(const struct ctx *c, } if (events & EPOLLIN) - udp_listen_sock_data(c, ref, now); + udp_sock_fwd(c, ref.fd, ref.udp.pif, ref.udp.port, now); } /** From 9eb540626047bece3f25f38e47ec3b2b0030f9f4 Mon Sep 17 00:00:00 2001 From: David Gibson <david@gibson.dropbear.id.au> Date: Fri, 4 Apr 2025 21:15:41 +1100 Subject: [PATCH 342/382] udp: Fold udp_splice_prepare and udp_splice_send into udp_sock_to_sock udp_splice() prepare and udp_splice_send() are both quite simple functions that now have only one caller: udp_sock_to_sock(). Fold them both into that caller. Signed-off-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- udp.c | 55 +++++++++++++++---------------------------------------- 1 file changed, 15 insertions(+), 40 deletions(-) diff --git a/udp.c b/udp.c index 20d8f0c..d9d2183 100644 --- a/udp.c +++ b/udp.c @@ -250,43 +250,6 @@ static void udp_iov_init(const struct ctx *c) udp_iov_init_one(c, i); } -/** - * udp_splice_prepare() - Prepare one datagram for splicing - * @mmh: Receiving mmsghdr array - * @idx: Index of the datagram to prepare - */ -static void udp_splice_prepare(struct mmsghdr *mmh, unsigned idx) -{ - udp_mh_splice[idx].msg_hdr.msg_iov->iov_len = mmh[idx].msg_len; -} - -/** - * udp_splice_send() - Send a batch of datagrams from socket to socket - * @c: Execution context - * @start: Index of batch's first datagram in udp[46]_l2_buf - * @n: Number of datagrams in batch - * @src: Source port for datagram (target side) - * @dst: Destination port for datagrams (target side) - * @ref: epoll reference for origin socket - * @now: Timestamp - * - * #syscalls sendmmsg - */ -static void udp_splice_send(const struct ctx *c, size_t start, size_t n, - flow_sidx_t tosidx) -{ - 
const struct flowside *toside = flowside_at_sidx(tosidx); - const struct udp_flow *uflow = udp_at_sidx(tosidx); - uint8_t topif = pif_at_sidx(tosidx); - int s = uflow->s[tosidx.sidei]; - socklen_t sl; - - pif_sockaddr(c, &udp_splice_to, &sl, topif, - &toside->eaddr, toside->eport); - - sendmmsg(s, udp_mh_splice + start, n, MSG_NOSIGNAL); -} - /** * udp_update_hdr4() - Update headers for one IPv4 datagram * @ip4h: Pre-filled IPv4 header (except for tot_len and saddr) @@ -678,19 +641,31 @@ static int udp_sock_recv(const struct ctx *c, int s, struct mmsghdr *mmh, int n) * @from_s: Socket to receive datagrams from * @n: Maximum number of datagrams to forward * @tosidx: Flow & side to forward datagrams to + * + * #syscalls sendmmsg */ static void udp_sock_to_sock(const struct ctx *c, int from_s, int n, flow_sidx_t tosidx) { + const struct flowside *toside = flowside_at_sidx(tosidx); + const struct udp_flow *uflow = udp_at_sidx(tosidx); + uint8_t topif = pif_at_sidx(tosidx); + int to_s = uflow->s[tosidx.sidei]; + socklen_t sl; int i; if ((n = udp_sock_recv(c, from_s, udp_mh_recv, n)) <= 0) return; - for (i = 0; i < n; i++) - udp_splice_prepare(udp_mh_recv, i); + for (i = 0; i < n; i++) { + udp_mh_splice[i].msg_hdr.msg_iov->iov_len + = udp_mh_recv[i].msg_len; + } - udp_splice_send(c, 0, n, tosidx); + pif_sockaddr(c, &udp_splice_to, &sl, topif, + &toside->eaddr, toside->eport); + + sendmmsg(to_s, udp_mh_splice, n, MSG_NOSIGNAL); } /** From 9725e79888374a4e4060a2d798f3407c0006cc8a Mon Sep 17 00:00:00 2001 From: David Gibson <david@gibson.dropbear.id.au> Date: Fri, 4 Apr 2025 21:15:42 +1100 Subject: [PATCH 343/382] udp_flow: Don't discard packets that arrive between bind() and connect() When we establish a new UDP flow we create connect()ed sockets that will only handle datagrams for this flow. However, there is a race between bind() and connect() where they might get some packets queued for a different flow. 
Currently we handle this by simply discarding any queued datagrams after the connect. UDP protocols should be able to handle such packet loss, but it's not ideal. We now have the tools we need to handle this better, by redirecting any datagrams received during that race to the appropriate flow. We need to use a deferred handler for this to avoid unexpectedly re-ordering datagrams in some edge cases. Signed-off-by: David Gibson <david@gibson.dropbear.id.au> [sbrivio: Update comment to udp_flow_defer()] Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- flow.c | 2 +- udp.c | 4 +-- udp_flow.c | 77 +++++++++++++++++++++++++++++++++++--------------- udp_flow.h | 6 +++- udp_internal.h | 2 ++ 5 files changed, 64 insertions(+), 27 deletions(-) diff --git a/flow.c b/flow.c index 8622242..29a83e1 100644 --- a/flow.c +++ b/flow.c @@ -850,7 +850,7 @@ void flow_defer_handler(const struct ctx *c, const struct timespec *now) closed = icmp_ping_timer(c, &flow->ping, now); break; case FLOW_UDP: - closed = udp_flow_defer(&flow->udp); + closed = udp_flow_defer(c, &flow->udp, now); if (!closed && timer) closed = udp_flow_timer(c, &flow->udp, now); break; diff --git a/udp.c b/udp.c index d9d2183..ed6edc1 100644 --- a/udp.c +++ b/udp.c @@ -698,8 +698,8 @@ static void udp_buf_sock_to_tap(const struct ctx *c, int s, int n, * @port: Our (local) port number of @s * @now: Current timestamp */ -static void udp_sock_fwd(const struct ctx *c, int s, uint8_t frompif, - in_port_t port, const struct timespec *now) +void udp_sock_fwd(const struct ctx *c, int s, uint8_t frompif, + in_port_t port, const struct timespec *now) { union sockaddr_inany src; diff --git a/udp_flow.c b/udp_flow.c index 5afe6e5..75f5a0b 100644 --- a/udp_flow.c +++ b/udp_flow.c @@ -9,10 +9,12 @@ #include <fcntl.h> #include <sys/uio.h> #include <unistd.h> +#include <netinet/udp.h> #include "util.h" #include "passt.h" #include "flow_table.h" +#include "udp_internal.h" #define UDP_CONN_TIMEOUT 180 /* s, timeout for ephemeral 
or local bind */ @@ -67,16 +69,15 @@ void udp_flow_close(const struct ctx *c, struct udp_flow *uflow) * Return: fd of new socket on success, -ve error code on failure */ static int udp_flow_sock(const struct ctx *c, - const struct udp_flow *uflow, unsigned sidei) + struct udp_flow *uflow, unsigned sidei) { const struct flowside *side = &uflow->f.side[sidei]; - struct mmsghdr discard[UIO_MAXIOV] = { 0 }; uint8_t pif = uflow->f.pif[sidei]; union { flow_sidx_t sidx; uint32_t data; } fref = { .sidx = FLOW_SIDX(uflow, sidei) }; - int rc, s; + int s; s = flowside_sock_l4(c, EPOLL_TYPE_UDP, pif, side, fref.data); if (s < 0) { @@ -85,30 +86,32 @@ static int udp_flow_sock(const struct ctx *c, } if (flowside_connect(c, s, pif, side) < 0) { - rc = -errno; + int rc = -errno; flow_dbg_perror(uflow, "Couldn't connect flow socket"); return rc; } - /* It's possible, if unlikely, that we could receive some unrelated - * packets in between the bind() and connect() of this socket. For now - * we just discard these. + /* It's possible, if unlikely, that we could receive some packets in + * between the bind() and connect() which may or may not be for this + * flow. Being UDP we could just discard them, but it's not ideal. * - * FIXME: Redirect these to an appropriate handler + * There's also a tricky case if a bunch of datagrams for a new flow + * arrive in rapid succession, the first going to the original listening + * socket and later ones going to this new socket. If we forwarded the + * datagrams from the new socket immediately here they would go before + * the datagram which established the flow. Again, not strictly wrong + * for UDP, but not ideal. + * + * So, we flag that the new socket is in a transient state where it + * might have datagrams for a different flow queued. Before the next + * epoll cycle, udp_flow_defer() will flush out any such datagrams, and + * thereafter everything on the new socket should be strictly for this + * flow. 
*/ - rc = recvmmsg(s, discard, ARRAY_SIZE(discard), MSG_DONTWAIT, NULL); - if (rc >= ARRAY_SIZE(discard)) { - flow_dbg(uflow, "Too many (%d) spurious reply datagrams", rc); - return -E2BIG; - } - - if (rc > 0) { - flow_trace(uflow, "Discarded %d spurious reply datagrams", rc); - } else if (errno != EAGAIN) { - rc = -errno; - flow_perror(uflow, "Unexpected error discarding datagrams"); - return rc; - } + if (sidei) + uflow->flush1 = true; + else + uflow->flush0 = true; return s; } @@ -269,13 +272,41 @@ flow_sidx_t udp_flow_from_tap(const struct ctx *c, } /** - * udp_flow_defer() - Deferred per-flow handling (clean up aborted flows) + * udp_flush_flow() - Flush datagrams that might not be for this flow + * @c: Execution context * @uflow: Flow to handle + * @sidei: Side of the flow to flush + * @now: Current timestamp + */ +static void udp_flush_flow(const struct ctx *c, + const struct udp_flow *uflow, unsigned sidei, + const struct timespec *now) +{ + /* We don't know exactly where the datagrams will come from, but we know + * they'll have an interface and oport matching this flow */ + udp_sock_fwd(c, uflow->s[sidei], uflow->f.pif[sidei], + uflow->f.side[sidei].oport, now); +} + +/** + * udp_flow_defer() - Deferred per-flow handling (clean up aborted flows) + * @c: Execution context + * @uflow: Flow to handle + * @now: Current timestamp * * Return: true if the connection is ready to free, false otherwise */ -bool udp_flow_defer(const struct udp_flow *uflow) +bool udp_flow_defer(const struct ctx *c, struct udp_flow *uflow, + const struct timespec *now) { + if (uflow->flush0) { + udp_flush_flow(c, uflow, INISIDE, now); + uflow->flush0 = false; + } + if (uflow->flush1) { + udp_flush_flow(c, uflow, TGTSIDE, now); + uflow->flush1 = false; + } return uflow->closed; } diff --git a/udp_flow.h b/udp_flow.h index bbdeb2a..90d3b29 100644 --- a/udp_flow.h +++ b/udp_flow.h @@ -11,6 +11,8 @@ * struct udp_flow - Descriptor for a flow of UDP packets * @f: Generic flow information * 
@closed: Flow is already closed + * @flush0: @s[0] may have datagrams queued for other flows + * @flush1: @s[1] may have datagrams queued for other flows * @ts: Activity timestamp * @s: Socket fd (or -1) for each side of the flow * @ttl: TTL or hop_limit for both sides @@ -20,6 +22,7 @@ struct udp_flow { struct flow_common f; bool closed :1; + bool flush0, flush1 :1; time_t ts; int s[SIDES]; uint8_t ttl[SIDES]; @@ -35,7 +38,8 @@ flow_sidx_t udp_flow_from_tap(const struct ctx *c, in_port_t srcport, in_port_t dstport, const struct timespec *now); void udp_flow_close(const struct ctx *c, struct udp_flow *uflow); -bool udp_flow_defer(const struct udp_flow *uflow); +bool udp_flow_defer(const struct ctx *c, struct udp_flow *uflow, + const struct timespec *now); bool udp_flow_timer(const struct ctx *c, struct udp_flow *uflow, const struct timespec *now); diff --git a/udp_internal.h b/udp_internal.h index f7d8426..96d11cf 100644 --- a/udp_internal.h +++ b/udp_internal.h @@ -28,5 +28,7 @@ size_t udp_update_hdr4(struct iphdr *ip4h, struct udp_payload_t *bp, size_t udp_update_hdr6(struct ipv6hdr *ip6h, struct udp_payload_t *bp, const struct flowside *toside, size_t dlen, bool no_udp_csum); +void udp_sock_fwd(const struct ctx *c, int s, uint8_t frompif, + in_port_t port, const struct timespec *now); #endif /* UDP_INTERNAL_H */ From 06ef64cdb72475fd02c72cdd607a31a86605e734 Mon Sep 17 00:00:00 2001 From: Stefano Brivio <sbrivio@redhat.com> Date: Tue, 8 Apr 2025 07:49:55 +0200 Subject: [PATCH 344/382] udp_flow: Save 8 bytes in struct udp_flow on 64-bit architectures Shuffle the fields just added by commits a7775e9550fa ("udp: support traceroute in direction tap-socket") and 9725e7988837 ("udp_flow: Don't discard packets that arrive between bind() and connect()"). 
On x86_64, as reported by pahole(1), before: struct udp_flow { struct flow_common f; /* 0 76 */ /* --- cacheline 1 boundary (64 bytes) was 12 bytes ago --- */ _Bool closed:1; /* 76: 0 1 */ /* XXX 7 bits hole, try to pack */ _Bool flush0; /* 77 1 */ _Bool flush1:1; /* 78: 0 1 */ /* XXX 7 bits hole, try to pack */ /* XXX 1 byte hole, try to pack */ time_t ts; /* 80 8 */ int s[2]; /* 88 8 */ uint8_t ttl[2]; /* 96 2 */ /* size: 104, cachelines: 2, members: 7 */ /* sum members: 95, holes: 1, sum holes: 1 */ /* sum bitfield members: 2 bits, bit holes: 2, sum bit holes: 14 bits */ /* padding: 6 */ /* last cacheline: 40 bytes */ }; and after: struct udp_flow { struct flow_common f; /* 0 76 */ /* --- cacheline 1 boundary (64 bytes) was 12 bytes ago --- */ uint8_t ttl[2]; /* 76 2 */ _Bool closed:1; /* 78: 0 1 */ _Bool flush0:1; /* 78: 1 1 */ _Bool flush1:1; /* 78: 2 1 */ /* XXX 5 bits hole, try to pack */ /* XXX 1 byte hole, try to pack */ time_t ts; /* 80 8 */ int s[2]; /* 88 8 */ /* size: 96, cachelines: 2, members: 7 */ /* sum members: 94, holes: 1, sum holes: 1 */ /* sum bitfield members: 3 bits, bit holes: 1, sum bit holes: 5 bits */ /* last cacheline: 32 bytes */ }; It doesn't matter much because anyway the typical storage for struct udp_flow is given by union flow: union flow { struct flow_common f; /* 0 76 */ struct flow_free_cluster free; /* 0 84 */ struct tcp_tap_conn tcp; /* 0 120 */ struct tcp_splice_conn tcp_splice; /* 0 120 */ struct icmp_ping_flow ping; /* 0 96 */ struct udp_flow udp; /* 0 96 */ }; but it still improves data locality somewhat, so let me fix this up now that commits are fresh. 
Signed-off-by: Stefano Brivio <sbrivio@redhat.com> Reviewed-by: David Gibson <david@gibson.dropbear.id.au> --- udp_flow.h | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/udp_flow.h b/udp_flow.h index 90d3b29..e289122 100644 --- a/udp_flow.h +++ b/udp_flow.h @@ -10,22 +10,25 @@ /** * struct udp_flow - Descriptor for a flow of UDP packets * @f: Generic flow information + * @ttl: TTL or hop_limit for both sides * @closed: Flow is already closed * @flush0: @s[0] may have datagrams queued for other flows * @flush1: @s[1] may have datagrams queued for other flows * @ts: Activity timestamp * @s: Socket fd (or -1) for each side of the flow - * @ttl: TTL or hop_limit for both sides */ struct udp_flow { /* Must be first element */ struct flow_common f; - bool closed :1; - bool flush0, flush1 :1; + uint8_t ttl[SIDES]; + + bool closed :1, + flush0 :1, + flush1 :1; + time_t ts; int s[SIDES]; - uint8_t ttl[SIDES]; }; struct udp_flow *udp_at_sidx(flow_sidx_t sidx); From ffbef85e975ba117ed1c20f733d989ac08ebf325 Mon Sep 17 00:00:00 2001 From: Stefano Brivio <sbrivio@redhat.com> Date: Tue, 8 Apr 2025 07:57:51 +0200 Subject: [PATCH 345/382] conf: Add missing return in conf_nat(), fix --map-guest-addr none As reported by somebody on IRC: $ pasta --map-guest-addr none Invalid address to remap to host: none that's because once we parsed "none", we try to parse it as an address as well. But we already handled it, so stop once we're done. 
Fixes: e813a4df7da2 ("conf: Allow address remapped to host to be configured") Signed-off-by: Stefano Brivio <sbrivio@redhat.com> Reviewed-by: David Gibson <david@gibson.dropbear.id.au> --- conf.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/conf.c b/conf.c index b54c55d..168646f 100644 --- a/conf.c +++ b/conf.c @@ -1272,6 +1272,8 @@ static void conf_nat(const char *arg, struct in_addr *addr4, *addr6 = in6addr_any; if (no_map_gw) *no_map_gw = 1; + + return; } if (inet_pton(AF_INET6, arg, addr6) && From d3f33f3b8ec4646dae3584b648cba142a73d3208 Mon Sep 17 00:00:00 2001 From: David Gibson <david@gibson.dropbear.id.au> Date: Wed, 9 Apr 2025 16:35:40 +1000 Subject: [PATCH 346/382] tcp_splice: Don't double count bytes read on EINTR In tcp_splice_sock_handler(), if we get an EINTR on our second splice() (pipe to output socket) we - as we should - go back and retry it. However, we do so *after* we've already updated our byte counters. That does no harm for the conn->written[] counter - since the second splice() returned an error it will be advanced by 0. However we also advance the conn->read[] counter, and then do so again when the splice() succeeds. This results in the counters being out of sync, and us thinking we have remaining data in the pipe when we don't, which can leave us in an infinite loop once the stream finishes. Fix this by moving the EINTR handling to directly next to the splice() call (which is what we usually do for EINTR). As a bonus this removes one mildly confusing goto. For symmetry, also rework the EINTR handling on the first splice() the same way, although that doesn't (as far as I can tell) have buggy side effects. 
Link: https://github.com/containers/podman/issues/23686#issuecomment-2779347687 Signed-off-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- tcp_splice.c | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/tcp_splice.c b/tcp_splice.c index 0d10e3d..7c3b56f 100644 --- a/tcp_splice.c +++ b/tcp_splice.c @@ -520,15 +520,14 @@ swap: int more = 0; retry: - readlen = splice(conn->s[fromsidei], NULL, - conn->pipe[fromsidei][1], NULL, - c->tcp.pipe_size, - SPLICE_F_MOVE | SPLICE_F_NONBLOCK); + do + readlen = splice(conn->s[fromsidei], NULL, + conn->pipe[fromsidei][1], NULL, + c->tcp.pipe_size, + SPLICE_F_MOVE | SPLICE_F_NONBLOCK); + while (readlen < 0 && errno == EINTR); flow_trace(conn, "%zi from read-side call", readlen); if (readlen < 0) { - if (errno == EINTR) - goto retry; - if (errno != EAGAIN) goto close; } else if (!readlen) { @@ -543,10 +542,13 @@ retry: conn_flag(c, conn, lowat_act_flag); } -eintr: - written = splice(conn->pipe[fromsidei][0], NULL, - conn->s[!fromsidei], NULL, c->tcp.pipe_size, - SPLICE_F_MOVE | more | SPLICE_F_NONBLOCK); + do + written = splice(conn->pipe[fromsidei][0], NULL, + conn->s[!fromsidei], NULL, + c->tcp.pipe_size, + SPLICE_F_MOVE | more | SPLICE_F_NONBLOCK); + while (written < 0 && errno == EINTR); + flow_trace(conn, "%zi from write-side call (passed %zi)", written, c->tcp.pipe_size); @@ -578,9 +580,6 @@ eintr: conn->written[fromsidei] += written > 0 ? written : 0; if (written < 0) { - if (errno == EINTR) - goto eintr; - if (errno != EAGAIN) goto close; From 6693fa115824d198b7cde46c272514be194500a9 Mon Sep 17 00:00:00 2001 From: David Gibson <david@gibson.dropbear.id.au> Date: Wed, 9 Apr 2025 16:35:41 +1000 Subject: [PATCH 347/382] tcp_splice: Don't clobber errno before checking for EAGAIN Like many places, tcp_splice_sock_handler() needs to handle EAGAIN specially, in this case for both of its splice() calls. 
Unfortunately it tests for EAGAIN some time after those calls. In between there has been at least a flow_trace() which could have clobbered errno. Move the test on errno closer to the relevant system calls to avoid this problem. Signed-off-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- tcp_splice.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/tcp_splice.c b/tcp_splice.c index 7c3b56f..60455d6 100644 --- a/tcp_splice.c +++ b/tcp_splice.c @@ -526,13 +526,15 @@ retry: c->tcp.pipe_size, SPLICE_F_MOVE | SPLICE_F_NONBLOCK); while (readlen < 0 && errno == EINTR); + + if (readlen < 0 && errno != EAGAIN) + goto close; + flow_trace(conn, "%zi from read-side call", readlen); - if (readlen < 0) { - if (errno != EAGAIN) - goto close; - } else if (!readlen) { + + if (!readlen) { eof = 1; - } else { + } else if (readlen > 0) { never_read = 0; if (readlen >= (long)c->tcp.pipe_size * 90 / 100) @@ -549,6 +551,9 @@ retry: SPLICE_F_MOVE | more | SPLICE_F_NONBLOCK); while (written < 0 && errno == EINTR); + if (written < 0 && errno != EAGAIN) + goto close; + flow_trace(conn, "%zi from write-side call (passed %zi)", written, c->tcp.pipe_size); @@ -580,9 +585,6 @@ retry: conn->written[fromsidei] += written > 0 ? written : 0; if (written < 0) { - if (errno != EAGAIN) - goto close; - if (conn->read[fromsidei] == conn->written[fromsidei]) break; From f4b0dd8b06850bacb2da57c8576e3377daa88572 Mon Sep 17 00:00:00 2001 From: David Gibson <david@gibson.dropbear.id.au> Date: Thu, 10 Apr 2025 17:16:38 +1000 Subject: [PATCH 348/382] udp: Use PKTINFO cmsgs to get destination address for received datagrams Currently we get the source address for received datagrams from recvmsg(), but we don't get the local destination address. Sometimes we implicitly know this because the receiving socket is bound to a specific address, but when listening on 0.0.0.0 or ::, we don't. 
We need this information to properly direct replies to flows which come in to a non-default local address. So, enable the IP_PKTINFO and IPV6_PKTINFO control messages to obtain this information in udp_peek_addr(). For now we log a trace messages but don't do anything more with the information. Signed-off-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- udp.c | 37 +++++++++++++++++++++++++++++++++++-- util.c | 8 ++++++-- 2 files changed, 41 insertions(+), 4 deletions(-) diff --git a/udp.c b/udp.c index ed6edc1..a71141a 100644 --- a/udp.c +++ b/udp.c @@ -587,18 +587,29 @@ static int udp_sock_errs(const struct ctx *c, union epoll_ref ref) return n_err; } +#define PKTINFO_SPACE \ + MAX(CMSG_SPACE(sizeof(struct in_pktinfo)), \ + CMSG_SPACE(sizeof(struct in6_pktinfo))) + /** * udp_peek_addr() - Get source address for next packet * @s: Socket to get information from * @src: Socket address (output) + * @dst: (Local) destination address (output) * * Return: 0 on success, -1 otherwise */ -static int udp_peek_addr(int s, union sockaddr_inany *src) +static int udp_peek_addr(int s, union sockaddr_inany *src, + union inany_addr *dst) { + char sastr[SOCKADDR_STRLEN], dstr[INANY_ADDRSTRLEN]; + const struct cmsghdr *hdr; + char cmsg[PKTINFO_SPACE]; struct msghdr msg = { .msg_name = src, .msg_namelen = sizeof(*src), + .msg_control = cmsg, + .msg_controllen = sizeof(cmsg), }; int rc; @@ -608,6 +619,27 @@ static int udp_peek_addr(int s, union sockaddr_inany *src) warn_perror("Error peeking at socket address"); return rc; } + + hdr = CMSG_FIRSTHDR(&msg); + if (hdr && hdr->cmsg_level == IPPROTO_IP && + hdr->cmsg_type == IP_PKTINFO) { + const struct in_pktinfo *info4 = (void *)CMSG_DATA(hdr); + + *dst = inany_from_v4(info4->ipi_addr); + } else if (hdr && hdr->cmsg_level == IPPROTO_IPV6 && + hdr->cmsg_type == IPV6_PKTINFO) { + const struct in6_pktinfo *info6 = (void *)CMSG_DATA(hdr); + + dst->a6 = info6->ipi6_addr; + } else { + 
debug("Unexpected cmsg on UDP datagram"); + *dst = inany_any6; + } + + trace("Peeked UDP datagram: %s -> %s", + sockaddr_ntop(src, sastr, sizeof(sastr)), + inany_ntop(dst, dstr, sizeof(dstr))); + return 0; } @@ -702,8 +734,9 @@ void udp_sock_fwd(const struct ctx *c, int s, uint8_t frompif, in_port_t port, const struct timespec *now) { union sockaddr_inany src; + union inany_addr dst; - while (udp_peek_addr(s, &src) == 0) { + while (udp_peek_addr(s, &src, &dst) == 0) { flow_sidx_t tosidx = udp_flow_from_sock(c, frompif, port, &src, now); uint8_t topif = pif_at_sidx(tosidx); diff --git a/util.c b/util.c index 0f68cf5..62a6003 100644 --- a/util.c +++ b/util.c @@ -109,11 +109,15 @@ int sock_l4_sa(const struct ctx *c, enum epoll_type type, debug("Failed to set SO_REUSEADDR on socket %i", fd); if (proto == IPPROTO_UDP) { + int pktinfo = af == AF_INET ? IP_PKTINFO : IPV6_RECVPKTINFO; + int recverr = af == AF_INET ? IP_RECVERR : IPV6_RECVERR; int level = af == AF_INET ? IPPROTO_IP : IPPROTO_IPV6; - int opt = af == AF_INET ? IP_RECVERR : IPV6_RECVERR; - if (setsockopt(fd, level, opt, &y, sizeof(y))) + if (setsockopt(fd, level, recverr, &y, sizeof(y))) die_perror("Failed to set RECVERR on socket %i", fd); + + if (setsockopt(fd, level, pktinfo, &y, sizeof(y))) + die_perror("Failed to set PKTINFO on socket %i", fd); } if (ifname && *ifname) { From 695c62396eb3f4627c1114ce444394e3ba34373a Mon Sep 17 00:00:00 2001 From: David Gibson <david@gibson.dropbear.id.au> Date: Thu, 10 Apr 2025 17:16:39 +1000 Subject: [PATCH 349/382] inany: Improve ASSERT message for bad socket family inany_from_sockaddr() can only handle sockaddrs of family AF_INET or AF_INET6 and asserts if given something else. I hit this assertion while debugging something else, and wanted to see what the bad sockaddr family was. Now that we have ASSERT_WITH_MSG() its easy to add this information. 
Signed-off-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- inany.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/inany.h b/inany.h index 6a12c29..1c247e1 100644 --- a/inany.h +++ b/inany.h @@ -252,7 +252,8 @@ static inline void inany_from_sockaddr(union inany_addr *aa, in_port_t *port, *port = ntohs(sa->sa4.sin_port); } else { /* Not valid to call with other address families */ - ASSERT(0); + ASSERT_WITH_MSG(0, "Unexpected sockaddr family: %u", + sa->sa_family); } } From 59cc89f4cc018988428637d97745cc4c919126cb Mon Sep 17 00:00:00 2001 From: David Gibson <david@gibson.dropbear.id.au> Date: Thu, 10 Apr 2025 17:16:40 +1000 Subject: [PATCH 350/382] udp, udp_flow: Track our specific address on socket interfaces So far for UDP flows (like TCP connections) we didn't record our address (oaddr) in the flow table entry for socket based pifs. That's because we didn't have that information when a flow was initiated by a datagram coming to a "listening" socket with 0.0.0.0 or :: address. Even when we did have the information, we didn't record it, to simplify address matching on lookups. This meant that in some circumstances we could send replies on a UDP flow from a different address than the originating request came to, which is surprising and breaks certain setups. We now have code in udp_peek_addr() which does determine our address for incoming UDP datagrams. We can use that information to properly populate oaddr in the flow table for flow initiated from a socket. In order to be able to consistently match datagrams to flows, we must *always* have a specific oaddr, not an unspecified address (that's how the flow hash table works). So, we also need to fill in oaddr correctly for flows we initiate *to* sockets. Our forwarding logic doesn't specify oaddr here, letting the kernel decide based on the routing table. 
In this case we need to call getsockname() after connect()ing the socket to find which local address the kernel picked. This adds getsockname() to our seccomp profile for all variants. Link: https://bugs.passt.top/show_bug.cgi?id=99 Signed-off-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- flow.c | 14 +++++++++++--- flow.h | 3 ++- flow_table.h | 1 + tcp.c | 2 +- udp.c | 4 ++-- udp_flow.c | 36 ++++++++++++++++++++++++++++++++---- udp_flow.h | 3 ++- util.h | 10 ++++++++++ 8 files changed, 61 insertions(+), 12 deletions(-) diff --git a/flow.c b/flow.c index 29a83e1..3c81cb4 100644 --- a/flow.c +++ b/flow.c @@ -396,18 +396,22 @@ const struct flowside *flow_initiate_af(union flow *flow, uint8_t pif, * @flow: Flow to change state * @pif: pif of the initiating side * @ssa: Source socket address + * @daddr: Destination address (may be NULL) * @dport: Destination port * * Return: pointer to the initiating flowside information */ struct flowside *flow_initiate_sa(union flow *flow, uint8_t pif, const union sockaddr_inany *ssa, + const union inany_addr *daddr, in_port_t dport) { struct flowside *ini = &flow->f.side[INISIDE]; inany_from_sockaddr(&ini->eaddr, &ini->eport, ssa); - if (inany_v4(&ini->eaddr)) + if (daddr) + ini->oaddr = *daddr; + else if (inany_v4(&ini->eaddr)) ini->oaddr = inany_any4; else ini->oaddr = inany_any6; @@ -751,19 +755,23 @@ flow_sidx_t flow_lookup_af(const struct ctx *c, * @proto: Protocol of the flow (IP L4 protocol number) * @pif: Interface of the flow * @esa: Socket address of the endpoint + * @oaddr: Our address (may be NULL) * @oport: Our port number * * Return: sidx of the matching flow & side, FLOW_SIDX_NONE if not found */ flow_sidx_t flow_lookup_sa(const struct ctx *c, uint8_t proto, uint8_t pif, - const void *esa, in_port_t oport) + const void *esa, + const union inany_addr *oaddr, in_port_t oport) { struct flowside side = { .oport = oport, }; inany_from_sockaddr(&side.eaddr, &side.eport, 
esa); - if (inany_v4(&side.eaddr)) + if (oaddr) + side.oaddr = *oaddr; + else if (inany_v4(&side.eaddr)) side.oaddr = inany_any4; else side.oaddr = inany_any6; diff --git a/flow.h b/flow.h index dcf7645..cac618a 100644 --- a/flow.h +++ b/flow.h @@ -243,7 +243,8 @@ flow_sidx_t flow_lookup_af(const struct ctx *c, const void *eaddr, const void *oaddr, in_port_t eport, in_port_t oport); flow_sidx_t flow_lookup_sa(const struct ctx *c, uint8_t proto, uint8_t pif, - const void *esa, in_port_t oport); + const void *esa, + const union inany_addr *oaddr, in_port_t oport); union flow; diff --git a/flow_table.h b/flow_table.h index fd2c57b..2d5c65c 100644 --- a/flow_table.h +++ b/flow_table.h @@ -199,6 +199,7 @@ const struct flowside *flow_initiate_af(union flow *flow, uint8_t pif, const void *daddr, in_port_t dport); struct flowside *flow_initiate_sa(union flow *flow, uint8_t pif, const union sockaddr_inany *ssa, + const union inany_addr *daddr, in_port_t dport); const struct flowside *flow_target_af(union flow *flow, uint8_t pif, sa_family_t af, diff --git a/tcp.c b/tcp.c index 35626c9..9c6bc52 100644 --- a/tcp.c +++ b/tcp.c @@ -2201,7 +2201,7 @@ void tcp_listen_handler(const struct ctx *c, union epoll_ref ref, * mode only, below. 
*/ ini = flow_initiate_sa(flow, ref.tcp_listen.pif, &sa, - ref.tcp_listen.port); + NULL, ref.tcp_listen.port); if (c->mode == MODE_VU) { /* Rebind to same address after migration */ if (!getsockname(s, &sa.sa, &sl)) diff --git a/udp.c b/udp.c index a71141a..40af7df 100644 --- a/udp.c +++ b/udp.c @@ -737,8 +737,8 @@ void udp_sock_fwd(const struct ctx *c, int s, uint8_t frompif, union inany_addr dst; while (udp_peek_addr(s, &src, &dst) == 0) { - flow_sidx_t tosidx = udp_flow_from_sock(c, frompif, port, - &src, now); + flow_sidx_t tosidx = udp_flow_from_sock(c, frompif, + &dst, port, &src, now); uint8_t topif = pif_at_sidx(tosidx); if (pif_is_socket(topif)) { diff --git a/udp_flow.c b/udp_flow.c index 75f5a0b..ef2cbb0 100644 --- a/udp_flow.c +++ b/udp_flow.c @@ -123,14 +123,17 @@ static int udp_flow_sock(const struct ctx *c, * @now: Timestamp * * Return: UDP specific flow, if successful, NULL on failure + * + * #syscalls getsockname */ static flow_sidx_t udp_flow_new(const struct ctx *c, union flow *flow, const struct timespec *now) { struct udp_flow *uflow = NULL; + const struct flowside *tgt; unsigned sidei; - if (!flow_target(c, flow, IPPROTO_UDP)) + if (!(tgt = flow_target(c, flow, IPPROTO_UDP))) goto cancel; uflow = FLOW_SET_TYPE(flow, FLOW_UDP, udp); @@ -144,6 +147,29 @@ static flow_sidx_t udp_flow_new(const struct ctx *c, union flow *flow, goto cancel; } + if (uflow->s[TGTSIDE] >= 0 && inany_is_unspecified(&tgt->oaddr)) { + /* When we target a socket, we connect() it, but might not + * always bind(), leaving the kernel to pick our address. In + * that case connect() will implicitly bind() the socket, but we + * need to determine its local address so that we can match + * reply packets back to the correct flow. 
Update the flow with + * the information from getsockname() */ + union sockaddr_inany sa; + socklen_t sl = sizeof(sa); + in_port_t port; + + if (getsockname(uflow->s[TGTSIDE], &sa.sa, &sl) < 0) { + flow_perror(uflow, "Unable to determine local address"); + goto cancel; + } + inany_from_sockaddr(&uflow->f.side[TGTSIDE].oaddr, + &port, &sa); + if (port != tgt->oport) { + flow_err(uflow, "Unexpected local port"); + goto cancel; + } + } + /* Tap sides always need to be looked up by hash. Socket sides don't * always, but sometimes do (receiving packets on a socket not specific * to one flow). Unconditionally hash both sides so all our bases are @@ -167,6 +193,7 @@ cancel: * udp_flow_from_sock() - Find or create UDP flow for incoming datagram * @c: Execution context * @pif: Interface the datagram is arriving from + * @dst: Our (local) address to which the datagram is arriving * @port: Our (local) port number to which the datagram is arriving * @s_in: Source socket address, filled in by recvmmsg() * @now: Timestamp @@ -176,7 +203,8 @@ cancel: * Return: sidx for the destination side of the flow for this packet, or * FLOW_SIDX_NONE if we couldn't find or create a flow. 
*/ -flow_sidx_t udp_flow_from_sock(const struct ctx *c, uint8_t pif, in_port_t port, +flow_sidx_t udp_flow_from_sock(const struct ctx *c, uint8_t pif, + const union inany_addr *dst, in_port_t port, const union sockaddr_inany *s_in, const struct timespec *now) { @@ -185,7 +213,7 @@ flow_sidx_t udp_flow_from_sock(const struct ctx *c, uint8_t pif, in_port_t port, union flow *flow; flow_sidx_t sidx; - sidx = flow_lookup_sa(c, IPPROTO_UDP, pif, s_in, port); + sidx = flow_lookup_sa(c, IPPROTO_UDP, pif, s_in, dst, port); if ((uflow = udp_at_sidx(sidx))) { uflow->ts = now->tv_sec; return flow_sidx_opposite(sidx); @@ -199,7 +227,7 @@ flow_sidx_t udp_flow_from_sock(const struct ctx *c, uint8_t pif, in_port_t port, return FLOW_SIDX_NONE; } - ini = flow_initiate_sa(flow, pif, s_in, port); + ini = flow_initiate_sa(flow, pif, s_in, dst, port); if (!inany_is_unicast(&ini->eaddr) || ini->eport == 0 || ini->oport == 0) { diff --git a/udp_flow.h b/udp_flow.h index e289122..4c528e9 100644 --- a/udp_flow.h +++ b/udp_flow.h @@ -32,7 +32,8 @@ struct udp_flow { }; struct udp_flow *udp_at_sidx(flow_sidx_t sidx); -flow_sidx_t udp_flow_from_sock(const struct ctx *c, uint8_t pif, in_port_t port, +flow_sidx_t udp_flow_from_sock(const struct ctx *c, uint8_t pif, + const union inany_addr *dst, in_port_t port, const union sockaddr_inany *s_in, const struct timespec *now); flow_sidx_t udp_flow_from_tap(const struct ctx *c, diff --git a/util.h b/util.h index b1e7e79..cc7d084 100644 --- a/util.h +++ b/util.h @@ -371,6 +371,16 @@ static inline int wrap_accept4(int sockfd, struct sockaddr *addr, #define accept4(s, addr, addrlen, flags) \ wrap_accept4((s), (addr), (addrlen), (flags)) +static inline int wrap_getsockname(int sockfd, struct sockaddr *addr, +/* cppcheck-suppress constParameterPointer */ + socklen_t *addrlen) +{ + sa_init(addr, addrlen); + return getsockname(sockfd, addr, addrlen); +} +#define getsockname(s, addr, addrlen) \ + wrap_getsockname((s), (addr), (addrlen)) + #define 
PASST_MAXDNAME 254 /* 253 (RFC 1035) + 1 (the terminator) */ void encode_domain_name(char *buf, const char *domain_name); From bbff3653d6412690eee1a079d584a7365d2ed886 Mon Sep 17 00:00:00 2001 From: Stefano Brivio <sbrivio@redhat.com> Date: Fri, 11 Apr 2025 09:58:31 +0200 Subject: [PATCH 351/382] conf: Split add_dns_resolv() into separate IPv4 and IPv6 versions Not really valuable by itself, but dropping one level of nested blocks makes the next change more convenient. No functional changes intended. Signed-off-by: Stefano Brivio <sbrivio@redhat.com> Reviewed-by: Paul Holzinger <pholzing@redhat.com> Reviewed-by: David Gibson <david@gibson.dropbear.id.au> --- conf.c | 101 ++++++++++++++++++++++++++++++++++----------------------- 1 file changed, 60 insertions(+), 41 deletions(-) diff --git a/conf.c b/conf.c index 168646f..18ed11c 100644 --- a/conf.c +++ b/conf.c @@ -414,6 +414,62 @@ static unsigned add_dns6(struct ctx *c, const struct in6_addr *addr, return 1; } +/** + * add_dns_resolv4() - Possibly add one IPv4 nameserver from host's resolv.conf + * @c: Execution context + * @ns: Nameserver address + * @idx: Pointer to index of current IPv4 resolver entry, set on return + */ +static void add_dns_resolv4(struct ctx *c, struct in_addr *ns, unsigned *idx) +{ + if (IN4_IS_ADDR_UNSPECIFIED(&c->ip4.dns_host)) + c->ip4.dns_host = *ns; + + /* Special handling if guest or container can only access local + * addresses via redirect, or if the host gateway is also a resolver and + * we shadow its address + */ + if (IN4_IS_ADDR_LOOPBACK(ns) || + IN4_ARE_ADDR_EQUAL(ns, &c->ip4.map_host_loopback)) { + if (IN4_IS_ADDR_UNSPECIFIED(&c->ip4.map_host_loopback)) + return; + + *ns = c->ip4.map_host_loopback; + if (IN4_IS_ADDR_UNSPECIFIED(&c->ip4.dns_match)) + c->ip4.dns_match = c->ip4.map_host_loopback; + } + + *idx += add_dns4(c, ns, *idx); +} + +/** + * add_dns_resolv6() - Possibly add one IPv6 nameserver from host's resolv.conf + * @c: Execution context + * @ns: Nameserver address + * 
@idx: Pointer to index of current IPv6 resolver entry, set on return + */ +static void add_dns_resolv6(struct ctx *c, struct in6_addr *ns, unsigned *idx) +{ + if (IN6_IS_ADDR_UNSPECIFIED(&c->ip6.dns_host)) + c->ip6.dns_host = *ns; + + /* Special handling if guest or container can only access local + * addresses via redirect, or if the host gateway is also a resolver and + * we shadow its address + */ + if (IN6_IS_ADDR_LOOPBACK(ns) || + IN6_ARE_ADDR_EQUAL(ns, &c->ip6.map_host_loopback)) { + if (IN6_IS_ADDR_UNSPECIFIED(&c->ip6.map_host_loopback)) + return; + + *ns = c->ip6.map_host_loopback; + if (IN6_IS_ADDR_UNSPECIFIED(&c->ip6.dns_match)) + c->ip6.dns_match = c->ip6.map_host_loopback; + } + + *idx += add_dns6(c, ns, *idx); +} + /** * add_dns_resolv() - Possibly add ns from host resolv.conf to configuration * @c: Execution context @@ -430,48 +486,11 @@ static void add_dns_resolv(struct ctx *c, const char *nameserver, struct in6_addr ns6; struct in_addr ns4; - if (idx4 && inet_pton(AF_INET, nameserver, &ns4)) { - if (IN4_IS_ADDR_UNSPECIFIED(&c->ip4.dns_host)) - c->ip4.dns_host = ns4; + if (idx4 && inet_pton(AF_INET, nameserver, &ns4)) + add_dns_resolv4(c, &ns4, idx4); - /* Special handling if guest or container can only access local - * addresses via redirect, or if the host gateway is also a - * resolver and we shadow its address - */ - if (IN4_IS_ADDR_LOOPBACK(&ns4) || - IN4_ARE_ADDR_EQUAL(&ns4, &c->ip4.map_host_loopback)) { - if (IN4_IS_ADDR_UNSPECIFIED(&c->ip4.map_host_loopback)) - return; - - ns4 = c->ip4.map_host_loopback; - if (IN4_IS_ADDR_UNSPECIFIED(&c->ip4.dns_match)) - c->ip4.dns_match = c->ip4.map_host_loopback; - } - - *idx4 += add_dns4(c, &ns4, *idx4); - } - - if (idx6 && inet_pton(AF_INET6, nameserver, &ns6)) { - if (IN6_IS_ADDR_UNSPECIFIED(&c->ip6.dns_host)) - c->ip6.dns_host = ns6; - - /* Special handling if guest or container can only access local - * addresses via redirect, or if the host gateway is also a - * resolver and we shadow its address - 
*/ - if (IN6_IS_ADDR_LOOPBACK(&ns6) || - IN6_ARE_ADDR_EQUAL(&ns6, &c->ip6.map_host_loopback)) { - if (IN6_IS_ADDR_UNSPECIFIED(&c->ip6.map_host_loopback)) - return; - - ns6 = c->ip6.map_host_loopback; - - if (IN6_IS_ADDR_UNSPECIFIED(&c->ip6.dns_match)) - c->ip6.dns_match = c->ip6.map_host_loopback; - } - - *idx6 += add_dns6(c, &ns6, *idx6); - } + if (idx6 && inet_pton(AF_INET6, nameserver, &ns6)) + add_dns_resolv6(c, &ns6, idx6); } /** From 50249086a967c54ff5b2521038cbe1d27303958c Mon Sep 17 00:00:00 2001 From: Stefano Brivio <sbrivio@redhat.com> Date: Fri, 11 Apr 2025 10:50:00 +0200 Subject: [PATCH 352/382] conf: Honour --dns-forward for local resolver even with --no-map-gw If the first resolver listed in the host's /etc/resolv.conf is a loopback address, and --no-map-gw is given, we automatically conclude that the resolver is not reachable, discard it, and, if it's the only nameserver listed in /etc/resolv.conf, we'll warn that we: Couldn't get any nameserver address However, this isn't true in a general case: the user might have passed --dns-forward, and in that case, while we won't map the address of the default gateway to the host, we're still supposed to map that particular address. Otherwise, in this common Podman usage: pasta --config-net --dns-forward 169.254.1.1 -t none -u none -T none -U none --no-map-gw --netns /run/user/1000/netns/netns-c02a8d8f-6ee3-902e-33c5-317e0f24e0af --map-guest-addr 169.254.1.2 and with a loopback address in /etc/resolv.conf, we'll unexpectedly refuse to forward DNS queries: # nslookup passt.top 169.254.1.1 ;; connection timed out; no servers could be reached To fix this, make an exception for --dns-forward: if &c->ip4.dns_match or &c->ip6.dns_match are set in add_dns_resolv4() / add_dns_resolv6(), use that address as guest-facing resolver. We already set 'dns_host' to the address we found in /etc/resolv.conf, that's correct in this case and it makes us forward queries as expected. 
I'm not changing the man page as the current description of --dns-forward is already consistent with the new behaviour: there's no described way in which --no-map-gw should affect it. Reported-by: Andrew Sayers <andrew-bugs.passt.top@pileofstuff.org> Link: https://bugs.passt.top/show_bug.cgi?id=111 Suggested-by: Paul Holzinger <pholzing@redhat.com> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> Reviewed-by: Paul Holzinger <pholzing@redhat.com> Reviewed-by: David Gibson <david@gibson.dropbear.id.au> --- conf.c | 30 ++++++++++++++++++++++-------- 1 file changed, 22 insertions(+), 8 deletions(-) diff --git a/conf.c b/conf.c index 18ed11c..f942851 100644 --- a/conf.c +++ b/conf.c @@ -431,12 +431,19 @@ static void add_dns_resolv4(struct ctx *c, struct in_addr *ns, unsigned *idx) */ if (IN4_IS_ADDR_LOOPBACK(ns) || IN4_ARE_ADDR_EQUAL(ns, &c->ip4.map_host_loopback)) { - if (IN4_IS_ADDR_UNSPECIFIED(&c->ip4.map_host_loopback)) - return; + if (IN4_IS_ADDR_UNSPECIFIED(&c->ip4.dns_match)) { + if (IN4_IS_ADDR_UNSPECIFIED(&c->ip4.map_host_loopback)) + return; /* Address unreachable */ - *ns = c->ip4.map_host_loopback; - if (IN4_IS_ADDR_UNSPECIFIED(&c->ip4.dns_match)) + *ns = c->ip4.map_host_loopback; c->ip4.dns_match = c->ip4.map_host_loopback; + } else { + /* No general host mapping, but requested for DNS + * (--dns-forward and --no-map-gw): advertise resolver + * address from --dns-forward, and map that to loopback + */ + *ns = c->ip4.dns_match; + } } *idx += add_dns4(c, ns, *idx); @@ -459,12 +466,19 @@ static void add_dns_resolv6(struct ctx *c, struct in6_addr *ns, unsigned *idx) */ if (IN6_IS_ADDR_LOOPBACK(ns) || IN6_ARE_ADDR_EQUAL(ns, &c->ip6.map_host_loopback)) { - if (IN6_IS_ADDR_UNSPECIFIED(&c->ip6.map_host_loopback)) - return; + if (IN6_IS_ADDR_UNSPECIFIED(&c->ip6.dns_match)) { + if (IN6_IS_ADDR_UNSPECIFIED(&c->ip6.map_host_loopback)) + return; /* Address unreachable */ - *ns = c->ip6.map_host_loopback; - if (IN6_IS_ADDR_UNSPECIFIED(&c->ip6.dns_match)) + *ns = 
c->ip6.map_host_loopback; c->ip6.dns_match = c->ip6.map_host_loopback; + } else { + /* No general host mapping, but requested for DNS + * (--dns-forward and --no-map-gw): advertise resolver + * address from --dns-forward, and map that to loopback + */ + *ns = c->ip6.dns_match; + } } *idx += add_dns6(c, ns, *idx); From baf049f8e06b7f0a73dfa7913297679a75aad381 Mon Sep 17 00:00:00 2001 From: David Gibson <david@gibson.dropbear.id.au> Date: Tue, 15 Apr 2025 17:16:18 +1000 Subject: [PATCH 353/382] udp: Fix breakage of UDP error handling by PKTINFO support We recently enabled the IP_PKTINFO / IPV6_RECVPKTINFO socket options on our UDP sockets. This lets us obtain and properly handle the specific local address used when we're "listening" with a socket on 0.0.0.0 or ::. However, the PKTINFO cmsgs this option generates appear on error queue messages as well as regular datagrams. udp_sock_recverr() doesn't expect this and so flags an unrecoverable error when it can't parse the control message. Correct this by adding space in udp_sock_recverr()s control buffer for the additional PKTINFO data, and scan through all cmsgs for the RECVERR, rather than only looking at the first one. 
Link: https://bugs.passt.top/show_bug.cgi?id=99 Fixes: f4b0dd8b0685 ("udp: Use PKTINFO cmsgs to get destination address for received datagrams") Reported-by: Stefano Brivio <sbrivio@redhat.com> Signed-off-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- udp.c | 30 +++++++++++++++++------------- 1 file changed, 17 insertions(+), 13 deletions(-) diff --git a/udp.c b/udp.c index 40af7df..f5fb98c 100644 --- a/udp.c +++ b/udp.c @@ -155,6 +155,10 @@ __attribute__ ((aligned(32))) #endif udp_meta[UDP_MAX_FRAMES]; +#define PKTINFO_SPACE \ + MAX(CMSG_SPACE(sizeof(struct in_pktinfo)), \ + CMSG_SPACE(sizeof(struct in6_pktinfo))) + /** * enum udp_iov_idx - Indices for the buffers making up a single UDP frame * @UDP_IOV_TAP tap specific header @@ -476,10 +480,10 @@ static int udp_sock_recverr(const struct ctx *c, union epoll_ref ref) struct sock_extended_err ee; union sockaddr_inany saddr; }; - const struct errhdr *eh; - const struct cmsghdr *hdr; - char buf[CMSG_SPACE(sizeof(struct errhdr))]; + char buf[PKTINFO_SPACE + CMSG_SPACE(sizeof(struct errhdr))]; char data[ICMP6_MAX_DLEN]; + const struct errhdr *eh; + struct cmsghdr *hdr; int s = ref.fd; struct iovec iov = { .iov_base = data, @@ -507,12 +511,16 @@ static int udp_sock_recverr(const struct ctx *c, union epoll_ref ref) return -1; } - hdr = CMSG_FIRSTHDR(&mh); - if (!((hdr->cmsg_level == IPPROTO_IP && - hdr->cmsg_type == IP_RECVERR) || - (hdr->cmsg_level == IPPROTO_IPV6 && - hdr->cmsg_type == IPV6_RECVERR))) { - err("Unexpected cmsg reading error queue"); + for (hdr = CMSG_FIRSTHDR(&mh); hdr; hdr = CMSG_NXTHDR(&mh, hdr)) { + if ((hdr->cmsg_level == IPPROTO_IP && + hdr->cmsg_type == IP_RECVERR) || + (hdr->cmsg_level == IPPROTO_IPV6 && + hdr->cmsg_type == IPV6_RECVERR)) + break; + } + + if (!hdr) { + err("Missing RECVERR cmsg in error queue"); return -1; } @@ -587,10 +595,6 @@ static int udp_sock_errs(const struct ctx *c, union epoll_ref ref) return n_err; } -#define 
PKTINFO_SPACE \ - MAX(CMSG_SPACE(sizeof(struct in_pktinfo)), \ - CMSG_SPACE(sizeof(struct in6_pktinfo))) - /** * udp_peek_addr() - Get source address for next packet * @s: Socket to get information from From 1bb8145c221a9124ca1671e64b27de173ff2d82d Mon Sep 17 00:00:00 2001 From: David Gibson <david@gibson.dropbear.id.au> Date: Tue, 15 Apr 2025 17:16:19 +1000 Subject: [PATCH 354/382] udp: Be quieter about errors on UDP receive If we get an error on UDP receive, either in udp_peek_addr() or udp_sock_recv(), we'll print an error message. However, this could be a perfectly routine UDP error triggered by an ICMP, which need not go to the error log. This doesn't usually happen, because before receiving we typically clear the error queue from udp_sock_errs(). However, it's possible an error could be flagged after udp_sock_errs() but before we receive. So it's better to handle this error "silently" (trace level only). We'll bail out of the receive, return to the epoll loop, and get an EPOLLERR where we'll handle and report the error properly. In particular there's one situation that can trigger this case much more easily. If we start a new outbound UDP flow to a local destination with nothing listening, we'll get a more or less immediate connection refused error. So, we'll get that error on the very first receive after the connect(). That will occur in udp_flow_defer() -> udp_flush_flow() -> udp_sock_fwd() -> udp_peek_addr() -> recvmsg(). This path doesn't call udp_sock_errs() first, so isn't (imperfectly) protected the way we are most of the time. 
Fixes: 84ab1305faba ("udp: Polish udp_vu_sock_info() and remove from vu specific code") Fixes: 69e5393c3722 ("udp: Move some more of sock_handler tasks into sub-functions") Signed-off-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- udp.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/udp.c b/udp.c index f5fb98c..154f99b 100644 --- a/udp.c +++ b/udp.c @@ -619,8 +619,8 @@ static int udp_peek_addr(int s, union sockaddr_inany *src, rc = recvmsg(s, &msg, MSG_PEEK | MSG_DONTWAIT); if (rc < 0) { - if (errno != EAGAIN && errno != EWOULDBLOCK) - warn_perror("Error peeking at socket address"); + trace("Error peeking at socket address: %s", strerror_(errno)); + /* Bail out and let the EPOLLERR handler deal with it */ return rc; } @@ -664,7 +664,8 @@ static int udp_sock_recv(const struct ctx *c, int s, struct mmsghdr *mmh, int n) n = recvmmsg(s, mmh, n, 0, NULL); if (n < 0) { - err_perror("Error receiving datagrams"); + trace("Error receiving datagrams: %s", strerror_(errno)); + /* Bail out and let the EPOLLERR handler deal with it */ return 0; } From 3f995586b35494b08631081fbf609ff932110849 Mon Sep 17 00:00:00 2001 From: David Gibson <david@gibson.dropbear.id.au> Date: Tue, 15 Apr 2025 17:16:20 +1000 Subject: [PATCH 355/382] udp: Pass socket & flow information direction to error handling functions udp_sock_recverr() and udp_sock_errs() take an epoll reference from which they obtain both the socket fd to receive errors from, and - for flow specific sockets - the flow and side the socket is associated with. We have some upcoming cases where we want to clear errors when we're not directly associated with receiving an epoll event, so it's not natural to have an epoll reference. Therefore, make these functions take the socket and flow from explicit parameters. 
Signed-off-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- udp.c | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/udp.c b/udp.c index 154f99b..c51ac95 100644 --- a/udp.c +++ b/udp.c @@ -467,14 +467,15 @@ static void udp_send_tap_icmp6(const struct ctx *c, /** * udp_sock_recverr() - Receive and clear an error from a socket * @c: Execution context - * @ref: epoll reference + * @s: Socket to receive errors from + * @sidx: Flow and side of @s, or FLOW_SIDX_NONE if unknown * * Return: 1 if error received and processed, 0 if no more errors in queue, < 0 * if there was an error reading the queue * * #syscalls recvmsg */ -static int udp_sock_recverr(const struct ctx *c, union epoll_ref ref) +static int udp_sock_recverr(const struct ctx *c, int s, flow_sidx_t sidx) { struct errhdr { struct sock_extended_err ee; @@ -484,7 +485,6 @@ static int udp_sock_recverr(const struct ctx *c, union epoll_ref ref) char data[ICMP6_MAX_DLEN]; const struct errhdr *eh; struct cmsghdr *hdr; - int s = ref.fd; struct iovec iov = { .iov_base = data, .iov_len = sizeof(data) @@ -525,12 +525,12 @@ static int udp_sock_recverr(const struct ctx *c, union epoll_ref ref) } eh = (const struct errhdr *)CMSG_DATA(hdr); - if (ref.type == EPOLL_TYPE_UDP) { - flow_sidx_t sidx = flow_sidx_opposite(ref.flowside); - const struct flowside *toside = flowside_at_sidx(sidx); + if (flow_sidx_valid(sidx)) { + flow_sidx_t tosidx = flow_sidx_opposite(sidx); + const struct flowside *toside = flowside_at_sidx(tosidx); size_t dlen = rc; - if (pif_is_socket(pif_at_sidx(sidx))) { + if (pif_is_socket(pif_at_sidx(tosidx))) { /* XXX Is there any way to propagate ICMPs from socket * to socket? 
*/ } else if (hdr->cmsg_level == IPPROTO_IP) { @@ -554,21 +554,21 @@ static int udp_sock_recverr(const struct ctx *c, union epoll_ref ref) /** * udp_sock_errs() - Process errors on a socket * @c: Execution context - * @ref: epoll reference + * @s: Socket to receive errors from + * @sidx: Flow and side of @s, or FLOW_SIDX_NONE if unknown * * Return: Number of errors handled, or < 0 if we have an unrecoverable error */ -static int udp_sock_errs(const struct ctx *c, union epoll_ref ref) +static int udp_sock_errs(const struct ctx *c, int s, flow_sidx_t sidx) { unsigned n_err = 0; socklen_t errlen; - int s = ref.fd; int rc, err; ASSERT(!c->no_udp); /* Empty the error queue */ - while ((rc = udp_sock_recverr(c, ref)) > 0) + while ((rc = udp_sock_recverr(c, s, sidx)) > 0) n_err += rc; if (rc < 0) @@ -777,7 +777,7 @@ void udp_listen_sock_handler(const struct ctx *c, const struct timespec *now) { if (events & EPOLLERR) { - if (udp_sock_errs(c, ref) < 0) { + if (udp_sock_errs(c, ref.fd, FLOW_SIDX_NONE) < 0) { err("UDP: Unrecoverable error on listening socket:" " (%s port %hu)", pif_name(ref.udp.pif), ref.udp.port); /* FIXME: what now? close/re-open socket? */ @@ -804,7 +804,7 @@ void udp_sock_handler(const struct ctx *c, union epoll_ref ref, ASSERT(!c->no_udp && uflow); if (events & EPOLLERR) { - if (udp_sock_errs(c, ref) < 0) { + if (udp_sock_errs(c, ref.fd, ref.flowside) < 0) { flow_err(uflow, "Unrecoverable error on flow socket"); goto fail; } From 04984578b00f7507a05544b7a5490b03ab2d5135 Mon Sep 17 00:00:00 2001 From: David Gibson <david@gibson.dropbear.id.au> Date: Tue, 15 Apr 2025 17:16:21 +1000 Subject: [PATCH 356/382] udp: Deal with errors as we go in udp_sock_fwd() When we get an epoll event on a listening socket, we first deal with any errors (udp_sock_errs()), then with any received packets (udp_sock_fwd()). 
However, it's theoretically possible that new errors could get flagged on the socket after we call udp_sock_errs(), in which case we could get errors returned in udp_sock_fwd() -> udp_peek_addr() -> recvmsg(). In fact, we do deal with this correctly, although the path is somewhat non-obvious. The recvmsg() error will cause us to bail out of udp_sock_fwd(), but the EPOLLERR event will now be flagged, so we'll come back here next epoll loop and call udp_sock_errs(). Except.. we call udp_sock_fwd() from udp_flush_flow() as well as from epoll events. This is to deal with any packets that arrived between bind() and connect(), and so might not be associated with the socket's intended flow. This expects udp_sock_fwd() to flush _all_ queued datagrams, so that anything received later must be for the correct flow. At the moment, udp_sock_fwd() might fail to flush all datagrams if errors occur. In particular this can happen in practice for locally reported errors which occur immediately after connect() (e.g. connecting to a local port with nothing listening). We can deal with the problem case, and also make the flow a little more natural for the common case by having udp_sock_fwd() call udp_sock_errs() to handle errors as they occur, rather than trying to deal with all errors in advance.
Signed-off-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- udp.c | 45 ++++++++++++++++++++++++++------------------- 1 file changed, 26 insertions(+), 19 deletions(-) diff --git a/udp.c b/udp.c index c51ac95..0bec499 100644 --- a/udp.c +++ b/udp.c @@ -601,7 +601,7 @@ static int udp_sock_errs(const struct ctx *c, int s, flow_sidx_t sidx) * @src: Socket address (output) * @dst: (Local) destination address (output) * - * Return: 0 on success, -1 otherwise + * Return: 0 if no more packets, 1 on success, -ve error code on error */ static int udp_peek_addr(int s, union sockaddr_inany *src, union inany_addr *dst) @@ -619,9 +619,9 @@ static int udp_peek_addr(int s, union sockaddr_inany *src, rc = recvmsg(s, &msg, MSG_PEEK | MSG_DONTWAIT); if (rc < 0) { - trace("Error peeking at socket address: %s", strerror_(errno)); - /* Bail out and let the EPOLLERR handler deal with it */ - return rc; + if (errno == EAGAIN || errno == EWOULDBLOCK) + return 0; + return -errno; } hdr = CMSG_FIRSTHDR(&msg); @@ -644,7 +644,7 @@ static int udp_peek_addr(int s, union sockaddr_inany *src, sockaddr_ntop(src, sastr, sizeof(sastr)), inany_ntop(dst, dstr, sizeof(dstr))); - return 0; + return 1; } /** @@ -740,11 +740,27 @@ void udp_sock_fwd(const struct ctx *c, int s, uint8_t frompif, { union sockaddr_inany src; union inany_addr dst; + int rc; - while (udp_peek_addr(s, &src, &dst) == 0) { - flow_sidx_t tosidx = udp_flow_from_sock(c, frompif, - &dst, port, &src, now); - uint8_t topif = pif_at_sidx(tosidx); + while ((rc = udp_peek_addr(s, &src, &dst)) != 0) { + flow_sidx_t tosidx; + uint8_t topif; + + if (rc < 0) { + trace("Error peeking at socket address: %s", + strerror_(-rc)); + /* Clear errors & carry on */ + if (udp_sock_errs(c, s, FLOW_SIDX_NONE) < 0) { + err( +"UDP: Unrecoverable error on listening socket: (%s port %hu)", + pif_name(frompif), port); + /* FIXME: what now? close/re-open socket? 
*/ + } + continue; + } + + tosidx = udp_flow_from_sock(c, frompif, &dst, port, &src, now); + topif = pif_at_sidx(tosidx); if (pif_is_socket(topif)) { udp_sock_to_sock(c, s, 1, tosidx); @@ -776,16 +792,7 @@ void udp_listen_sock_handler(const struct ctx *c, union epoll_ref ref, uint32_t events, const struct timespec *now) { - if (events & EPOLLERR) { - if (udp_sock_errs(c, ref.fd, FLOW_SIDX_NONE) < 0) { - err("UDP: Unrecoverable error on listening socket:" - " (%s port %hu)", pif_name(ref.udp.pif), ref.udp.port); - /* FIXME: what now? close/re-open socket? */ - return; - } - } - - if (events & EPOLLIN) + if (events & (EPOLLERR | EPOLLIN)) udp_sock_fwd(c, ref.fd, ref.udp.pif, ref.udp.port, now); } From f107a86cc05c83c5755861b00b85cdf0eb5c9534 Mon Sep 17 00:00:00 2001 From: David Gibson <david@gibson.dropbear.id.au> Date: Tue, 15 Apr 2025 17:16:22 +1000 Subject: [PATCH 357/382] udp: Add udp_pktinfo() helper Currently we open code parsing the control message for IP_PKTINFO in udp_peek_addr(). We have an upcoming case where we want to parse PKTINFO in another place, so split this out into a helper function. While we're there, make the parsing a bit more robust: scan all cmsgs to look for the one we want, rather than assuming there's only one. 
Signed-off-by: David Gibson <david@gibson.dropbear.id.au> [sbrivio: udp_pktinfo(): Fix typo in comment and change err() to debug()] Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- udp.c | 52 ++++++++++++++++++++++++++++++++++++---------------- 1 file changed, 36 insertions(+), 16 deletions(-) diff --git a/udp.c b/udp.c index 0bec499..97034f6 100644 --- a/udp.c +++ b/udp.c @@ -464,6 +464,41 @@ static void udp_send_tap_icmp6(const struct ctx *c, tap_icmp6_send(c, saddr, eaddr, &msg, msglen); } +/** + * udp_pktinfo() - Retrieve packet destination address from cmsg + * @msg: msghdr into which message has been received + * @dst: (Local) destination address of message in @mh (output) + * + * Return: 0 on success, -1 if the information was missing (@dst is set to + * inany_any6). + */ +static int udp_pktinfo(struct msghdr *msg, union inany_addr *dst) +{ + struct cmsghdr *hdr; + + for (hdr = CMSG_FIRSTHDR(msg); hdr; hdr = CMSG_NXTHDR(msg, hdr)) { + if (hdr->cmsg_level == IPPROTO_IP && + hdr->cmsg_type == IP_PKTINFO) { + const struct in_pktinfo *i4 = (void *)CMSG_DATA(hdr); + + *dst = inany_from_v4(i4->ipi_addr); + return 0; + } + + if (hdr->cmsg_level == IPPROTO_IPV6 && + hdr->cmsg_type == IPV6_PKTINFO) { + const struct in6_pktinfo *i6 = (void *)CMSG_DATA(hdr); + + dst->a6 = i6->ipi6_addr; + return 0; + } + } + + debug("Missing PKTINFO cmsg on datagram"); + *dst = inany_any6; + return -1; +} + /** * udp_sock_recverr() - Receive and clear an error from a socket * @c: Execution context @@ -607,7 +642,6 @@ static int udp_peek_addr(int s, union sockaddr_inany *src, union inany_addr *dst) { char sastr[SOCKADDR_STRLEN], dstr[INANY_ADDRSTRLEN]; - const struct cmsghdr *hdr; char cmsg[PKTINFO_SPACE]; struct msghdr msg = { .msg_name = src, @@ -624,21 +658,7 @@ static int udp_peek_addr(int s, union sockaddr_inany *src, return -errno; } - hdr = CMSG_FIRSTHDR(&msg); - if (hdr && hdr->cmsg_level == IPPROTO_IP && - hdr->cmsg_type == IP_PKTINFO) { - const struct in_pktinfo *info4 = 
(void *)CMSG_DATA(hdr); - - *dst = inany_from_v4(info4->ipi_addr); - } else if (hdr && hdr->cmsg_level == IPPROTO_IPV6 && - hdr->cmsg_type == IPV6_PKTINFO) { - const struct in6_pktinfo *info6 = (void *)CMSG_DATA(hdr); - - dst->a6 = info6->ipi6_addr; - } else { - debug("Unexpected cmsg on UDP datagram"); - *dst = inany_any6; - } + udp_pktinfo(&msg, dst); trace("Peeked UDP datagram: %s -> %s", sockaddr_ntop(src, sastr, sizeof(sastr)), From cfc0ee145a5cdd29b6e584171085dac6539b86c0 Mon Sep 17 00:00:00 2001 From: David Gibson <david@gibson.dropbear.id.au> Date: Tue, 15 Apr 2025 17:16:23 +1000 Subject: [PATCH 358/382] udp: Minor re-organisation of udp_sock_recverr() Usually we work with the "exit early" flow style, where we return early on "error" conditions in functions. We don't currently do this in udp_sock_recverr() for the case where we don't have a flow to associate the error with. Reorganise to use the "exit early" style, which will make some subsequent changes less awkward. Signed-off-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- udp.c | 44 +++++++++++++++++++++++++------------------- 1 file changed, 25 insertions(+), 19 deletions(-) diff --git a/udp.c b/udp.c index 97034f6..e8240fe 100644 --- a/udp.c +++ b/udp.c @@ -530,6 +530,9 @@ static int udp_sock_recverr(const struct ctx *c, int s, flow_sidx_t sidx) .msg_control = buf, .msg_controllen = sizeof(buf), }; + const struct flowside *toside; + flow_sidx_t tosidx; + size_t dlen; ssize_t rc; rc = recvmsg(s, &mh, MSG_ERRQUEUE); @@ -560,29 +563,32 @@ static int udp_sock_recverr(const struct ctx *c, int s, flow_sidx_t sidx) } eh = (const struct errhdr *)CMSG_DATA(hdr); - if (flow_sidx_valid(sidx)) { - flow_sidx_t tosidx = flow_sidx_opposite(sidx); - const struct flowside *toside = flowside_at_sidx(tosidx); - size_t dlen = rc; - if (pif_is_socket(pif_at_sidx(tosidx))) { - /* XXX Is there any way to propagate ICMPs from socket - * to socket? 
*/ - } else if (hdr->cmsg_level == IPPROTO_IP) { - dlen = MIN(dlen, ICMP4_MAX_DLEN); - udp_send_tap_icmp4(c, &eh->ee, toside, - eh->saddr.sa4.sin_addr, data, dlen); - } else if (hdr->cmsg_level == IPPROTO_IPV6) { - udp_send_tap_icmp6(c, &eh->ee, toside, - &eh->saddr.sa6.sin6_addr, data, - dlen, sidx.flowi); - } - } else { - trace("Ignoring received IP_RECVERR cmsg on listener socket"); - } debug("%s error on UDP socket %i: %s", str_ee_origin(&eh->ee), s, strerror_(eh->ee.ee_errno)); + if (!flow_sidx_valid(sidx)) { + trace("Ignoring received IP_RECVERR cmsg on listener socket"); + return 1; + } + + tosidx = flow_sidx_opposite(sidx); + toside = flowside_at_sidx(tosidx); + dlen = rc; + + if (pif_is_socket(pif_at_sidx(tosidx))) { + /* XXX Is there any way to propagate ICMPs from socket to + * socket? */ + } else if (hdr->cmsg_level == IPPROTO_IP) { + dlen = MIN(dlen, ICMP4_MAX_DLEN); + udp_send_tap_icmp4(c, &eh->ee, toside, + eh->saddr.sa4.sin_addr, data, dlen); + } else if (hdr->cmsg_level == IPPROTO_IPV6) { + udp_send_tap_icmp6(c, &eh->ee, toside, + &eh->saddr.sa6.sin6_addr, data, + dlen, sidx.flowi); + } + return 1; } From 2340bbf867e6c3c3b5ac67345b0e841ab49bbaa5 Mon Sep 17 00:00:00 2001 From: David Gibson <david@gibson.dropbear.id.au> Date: Tue, 15 Apr 2025 17:16:24 +1000 Subject: [PATCH 359/382] udp: Propagate errors on listening and brand new sockets udp_sock_recverr() processes errors on UDP sockets and attempts to propagate them as ICMP packets on the tap interface. To do this it currently requires the flow with which the error is associated as a parameter. If that's missing it will clear the error condition, but not propagate it. That means that we largely ignore errors on "listening" sockets. It also means we may discard some errors on flow specific sockets if they occur very shortly after the socket is created. 
In udp_flush_flow() we need to clear any datagrams received between bind() and connect() which might not be associated with the "final" flow for the socket. If we get errors before that point we'll ignore them in the same way because we don't know the flow they're associated with in advance. This can happen in practice if we have errors which occur almost immediately after connect(), such as ECONNREFUSED when we connect() to a local address where nothing is listening. Between the extended error message itself and the PKTINFO information we do actually have enough information to find the correct flow. So, rather than ignoring errors where we don't have a flow "hint", determine the flow the hard way in udp_sock_recverr(). Signed-off-by: David Gibson <david@gibson.dropbear.id.au> [sbrivio: Change warn() to debug() in udp_sock_recverr()] Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- udp.c | 41 ++++++++++++++++++++++++++++++++--------- 1 file changed, 32 insertions(+), 9 deletions(-) diff --git a/udp.c b/udp.c index e8240fe..57769d0 100644 --- a/udp.c +++ b/udp.c @@ -504,27 +504,34 @@ static int udp_pktinfo(struct msghdr *msg, union inany_addr *dst) * @c: Execution context * @s: Socket to receive errors from * @sidx: Flow and side of @s, or FLOW_SIDX_NONE if unknown + * @pif: Interface on which the error occurred + * (only used if @sidx == FLOW_SIDX_NONE) + * @port: Local port number of @s (only used if @sidx == FLOW_SIDX_NONE) * * Return: 1 if error received and processed, 0 if no more errors in queue, < 0 * if there was an error reading the queue * * #syscalls recvmsg */ -static int udp_sock_recverr(const struct ctx *c, int s, flow_sidx_t sidx) +static int udp_sock_recverr(const struct ctx *c, int s, flow_sidx_t sidx, + uint8_t pif, in_port_t port) { struct errhdr { struct sock_extended_err ee; union sockaddr_inany saddr; }; char buf[PKTINFO_SPACE + CMSG_SPACE(sizeof(struct errhdr))]; + const struct errhdr *eh = NULL; char data[ICMP6_MAX_DLEN]; - const struct 
errhdr *eh; struct cmsghdr *hdr; struct iovec iov = { .iov_base = data, .iov_len = sizeof(data) }; + union sockaddr_inany src; struct msghdr mh = { + .msg_name = &src, + .msg_namelen = sizeof(src), .msg_iov = &iov, .msg_iovlen = 1, .msg_control = buf, @@ -554,7 +561,7 @@ static int udp_sock_recverr(const struct ctx *c, int s, flow_sidx_t sidx) hdr->cmsg_type == IP_RECVERR) || (hdr->cmsg_level == IPPROTO_IPV6 && hdr->cmsg_type == IPV6_RECVERR)) - break; + break; } if (!hdr) { @@ -568,8 +575,19 @@ static int udp_sock_recverr(const struct ctx *c, int s, flow_sidx_t sidx) str_ee_origin(&eh->ee), s, strerror_(eh->ee.ee_errno)); if (!flow_sidx_valid(sidx)) { - trace("Ignoring received IP_RECVERR cmsg on listener socket"); - return 1; + /* No hint from the socket, determine flow from addresses */ + union inany_addr dst; + + if (udp_pktinfo(&mh, &dst) < 0) { + debug("Missing PKTINFO on UDP error"); + return 1; + } + + sidx = flow_lookup_sa(c, IPPROTO_UDP, pif, &src, &dst, port); + if (!flow_sidx_valid(sidx)) { + debug("Ignoring UDP error without flow"); + return 1; + } } tosidx = flow_sidx_opposite(sidx); @@ -597,10 +615,14 @@ static int udp_sock_recverr(const struct ctx *c, int s, flow_sidx_t sidx) * @c: Execution context * @s: Socket to receive errors from * @sidx: Flow and side of @s, or FLOW_SIDX_NONE if unknown + * @pif: Interface on which the error occurred + * (only used if @sidx == FLOW_SIDX_NONE) + * @port: Local port number of @s (only used if @sidx == FLOW_SIDX_NONE) * * Return: Number of errors handled, or < 0 if we have an unrecoverable error */ -static int udp_sock_errs(const struct ctx *c, int s, flow_sidx_t sidx) +static int udp_sock_errs(const struct ctx *c, int s, flow_sidx_t sidx, + uint8_t pif, in_port_t port) { unsigned n_err = 0; socklen_t errlen; @@ -609,7 +631,7 @@ static int udp_sock_errs(const struct ctx *c, int s, flow_sidx_t sidx) ASSERT(!c->no_udp); /* Empty the error queue */ - while ((rc = udp_sock_recverr(c, s, sidx)) > 0) + while ((rc = 
udp_sock_recverr(c, s, sidx, pif, port)) > 0) n_err += rc; if (rc < 0) @@ -776,7 +798,8 @@ void udp_sock_fwd(const struct ctx *c, int s, uint8_t frompif, trace("Error peeking at socket address: %s", strerror_(-rc)); /* Clear errors & carry on */ - if (udp_sock_errs(c, s, FLOW_SIDX_NONE) < 0) { + if (udp_sock_errs(c, s, FLOW_SIDX_NONE, + frompif, port) < 0) { err( "UDP: Unrecoverable error on listening socket: (%s port %hu)", pif_name(frompif), port); @@ -837,7 +860,7 @@ void udp_sock_handler(const struct ctx *c, union epoll_ref ref, ASSERT(!c->no_udp && uflow); if (events & EPOLLERR) { - if (udp_sock_errs(c, ref.fd, ref.flowside) < 0) { + if (udp_sock_errs(c, ref.fd, ref.flowside, PIF_NONE, 0) < 0) { flow_err(uflow, "Unrecoverable error on flow socket"); goto fail; } From 9128f6e8f47d94c761b5fd8c0d0b8308758cbdc5 Mon Sep 17 00:00:00 2001 From: David Gibson <david@gibson.dropbear.id.au> Date: Thu, 17 Apr 2025 11:55:40 +1000 Subject: [PATCH 360/382] fwd: Split out helpers for port-independent NAT Currently the functions fwd_nat_from_*() make some address translations based on both the IP address and protocol port numbers, and others based only on the address. We have some upcoming cases where it's useful to use the IP-address-only translations separately, so split them out into helper functions. 
Signed-off-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- fwd.c | 87 ++++++++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 62 insertions(+), 25 deletions(-) diff --git a/fwd.c b/fwd.c index 2829cd2..5c70e83 100644 --- a/fwd.c +++ b/fwd.c @@ -323,6 +323,30 @@ static bool fwd_guest_accessible(const struct ctx *c, return fwd_guest_accessible6(c, &addr->a6); } +/** + * nat_outbound() - Apply address translation for outbound (TAP to HOST) + * @c: Execution context + * @addr: Input address (as seen on TAP interface) + * @translated: Output address (as seen on HOST interface) + * + * Only handles translations that depend *only* on the address. Anything + * related to specific ports or flows is handled elsewhere. + */ +static void nat_outbound(const struct ctx *c, const union inany_addr *addr, + union inany_addr *translated) +{ + if (inany_equals4(addr, &c->ip4.map_host_loopback)) + *translated = inany_loopback4; + else if (inany_equals6(addr, &c->ip6.map_host_loopback)) + *translated = inany_loopback6; + else if (inany_equals4(addr, &c->ip4.map_guest_addr)) + *translated = inany_from_v4(c->ip4.addr); + else if (inany_equals6(addr, &c->ip6.map_guest_addr)) + translated->a6 = c->ip6.addr; + else + *translated = *addr; +} + /** * fwd_nat_from_tap() - Determine to forward a flow from the tap interface * @c: Execution context @@ -342,16 +366,8 @@ uint8_t fwd_nat_from_tap(const struct ctx *c, uint8_t proto, else if (is_dns_flow(proto, ini) && inany_equals6(&ini->oaddr, &c->ip6.dns_match)) tgt->eaddr.a6 = c->ip6.dns_host; - else if (inany_equals4(&ini->oaddr, &c->ip4.map_host_loopback)) - tgt->eaddr = inany_loopback4; - else if (inany_equals6(&ini->oaddr, &c->ip6.map_host_loopback)) - tgt->eaddr = inany_loopback6; - else if (inany_equals4(&ini->oaddr, &c->ip4.map_guest_addr)) - tgt->eaddr = inany_from_v4(c->ip4.addr); - else if (inany_equals6(&ini->oaddr, &c->ip6.map_guest_addr)) - tgt->eaddr.a6 = 
c->ip6.addr; else - tgt->eaddr = ini->oaddr; + nat_outbound(c, &ini->oaddr, &tgt->eaddr); tgt->eport = ini->oport; @@ -423,6 +439,42 @@ uint8_t fwd_nat_from_splice(const struct ctx *c, uint8_t proto, return PIF_HOST; } +/** + * nat_inbound() - Apply address translation for outbound (HOST to TAP) + * @c: Execution context + * @addr: Input address (as seen on HOST interface) + * @translated: Output address (as seen on TAP interface) + * + * Return: true on success, false if it couldn't translate the address + * + * Only handles translations that depend *only* on the address. Anything + * related to specific ports or flows is handled elsewhere. + */ +static bool nat_inbound(const struct ctx *c, const union inany_addr *addr, + union inany_addr *translated) +{ + if (!IN4_IS_ADDR_UNSPECIFIED(&c->ip4.map_host_loopback) && + inany_equals4(addr, &in4addr_loopback)) { + /* Specifically 127.0.0.1, not 127.0.0.0/8 */ + *translated = inany_from_v4(c->ip4.map_host_loopback); + } else if (!IN6_IS_ADDR_UNSPECIFIED(&c->ip6.map_host_loopback) && + inany_equals6(addr, &in6addr_loopback)) { + translated->a6 = c->ip6.map_host_loopback; + } else if (!IN4_IS_ADDR_UNSPECIFIED(&c->ip4.map_guest_addr) && + inany_equals4(addr, &c->ip4.addr)) { + *translated = inany_from_v4(c->ip4.map_guest_addr); + } else if (!IN6_IS_ADDR_UNSPECIFIED(&c->ip6.map_guest_addr) && + inany_equals6(addr, &c->ip6.addr)) { + translated->a6 = c->ip6.map_guest_addr; + } else if (fwd_guest_accessible(c, addr)) { + *translated = *addr; + } else { + return false; + } + + return true; +} + /** * fwd_nat_from_host() - Determine to forward a flow from the host interface * @c: Execution context @@ -479,20 +531,7 @@ uint8_t fwd_nat_from_host(const struct ctx *c, uint8_t proto, return PIF_SPLICE; } - if (!IN4_IS_ADDR_UNSPECIFIED(&c->ip4.map_host_loopback) && - inany_equals4(&ini->eaddr, &in4addr_loopback)) { - /* Specifically 127.0.0.1, not 127.0.0.0/8 */ - tgt->oaddr = inany_from_v4(c->ip4.map_host_loopback); - } else if 
(!IN6_IS_ADDR_UNSPECIFIED(&c->ip6.map_host_loopback) && - inany_equals6(&ini->eaddr, &in6addr_loopback)) { - tgt->oaddr.a6 = c->ip6.map_host_loopback; - } else if (!IN4_IS_ADDR_UNSPECIFIED(&c->ip4.map_guest_addr) && - inany_equals4(&ini->eaddr, &c->ip4.addr)) { - tgt->oaddr = inany_from_v4(c->ip4.map_guest_addr); - } else if (!IN6_IS_ADDR_UNSPECIFIED(&c->ip6.map_guest_addr) && - inany_equals6(&ini->eaddr, &c->ip6.addr)) { - tgt->oaddr.a6 = c->ip6.map_guest_addr; - } else if (!fwd_guest_accessible(c, &ini->eaddr)) { + if (!nat_inbound(c, &ini->eaddr, &tgt->oaddr)) { if (inany_v4(&ini->eaddr)) { if (IN4_IS_ADDR_UNSPECIFIED(&c->ip4.our_tap_addr)) /* No source address we can use */ @@ -501,8 +540,6 @@ uint8_t fwd_nat_from_host(const struct ctx *c, uint8_t proto, } else { tgt->oaddr.a6 = c->ip6.our_tap_ll; } - } else { - tgt->oaddr = ini->eaddr; } tgt->oport = ini->eport; From 4668e9137806b551f6ee44609064cc40243c2b6b Mon Sep 17 00:00:00 2001 From: David Gibson <david@gibson.dropbear.id.au> Date: Thu, 17 Apr 2025 11:55:41 +1000 Subject: [PATCH 361/382] treewide: Improve robustness against sockaddrs of unexpected family inany_from_sockaddr() expects a socket address of family AF_INET or AF_INET6 and ASSERT()s if it gets anything else. In many of the callers we can handle an unexpected family more gracefully, though, e.g. by failing a single flow rather than killing passt. Change inany_from_sockaddr() to return an error instead of ASSERT()ing, and handle those errors in the callers. Improve the reporting of any such errors while we're at it. With this greater robustness, allow inany_from_sockaddr() to take a void * rather than specifically a union sockaddr_inany *. 
Signed-off-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- flow.c | 16 ++++++++++++++-- inany.h | 30 ++++++++++++++++++------------ tcp.c | 10 ++++------ udp_flow.c | 6 +++--- 4 files changed, 39 insertions(+), 23 deletions(-) diff --git a/flow.c b/flow.c index 3c81cb4..447c021 100644 --- a/flow.c +++ b/flow.c @@ -408,7 +408,12 @@ struct flowside *flow_initiate_sa(union flow *flow, uint8_t pif, { struct flowside *ini = &flow->f.side[INISIDE]; - inany_from_sockaddr(&ini->eaddr, &ini->eport, ssa); + if (inany_from_sockaddr(&ini->eaddr, &ini->eport, ssa) < 0) { + char str[SOCKADDR_STRLEN]; + + ASSERT_WITH_MSG(0, "Bad socket address %s", + sockaddr_ntop(ssa, str, sizeof(str))); + } if (daddr) ini->oaddr = *daddr; else if (inany_v4(&ini->eaddr)) @@ -768,7 +773,14 @@ flow_sidx_t flow_lookup_sa(const struct ctx *c, uint8_t proto, uint8_t pif, .oport = oport, }; - inany_from_sockaddr(&side.eaddr, &side.eport, esa); + if (inany_from_sockaddr(&side.eaddr, &side.eport, esa) < 0) { + char str[SOCKADDR_STRLEN]; + + warn("Flow lookup on bad socket address %s", + sockaddr_ntop(esa, str, sizeof(str))); + return FLOW_SIDX_NONE; + } + if (oaddr) side.oaddr = *oaddr; else if (inany_v4(&side.eaddr)) diff --git a/inany.h b/inany.h index 1c247e1..7ca5cbd 100644 --- a/inany.h +++ b/inany.h @@ -237,24 +237,30 @@ static inline void inany_from_af(union inany_addr *aa, } /** inany_from_sockaddr - Extract IPv[46] address and port number from sockaddr - * @aa: Pointer to store IPv[46] address + * @dst: Pointer to store IPv[46] address (output) * @port: Pointer to store port number, host order - * @addr: AF_INET or AF_INET6 socket address + * @addr: Socket address + * + * Return: 0 on success, -1 on error (bad address family) */ -static inline void inany_from_sockaddr(union inany_addr *aa, in_port_t *port, - const union sockaddr_inany *sa) +static inline int inany_from_sockaddr(union inany_addr *dst, in_port_t *port, + const void *addr) { + 
const union sockaddr_inany *sa = (const union sockaddr_inany *)addr; + if (sa->sa_family == AF_INET6) { - inany_from_af(aa, AF_INET6, &sa->sa6.sin6_addr); + inany_from_af(dst, AF_INET6, &sa->sa6.sin6_addr); *port = ntohs(sa->sa6.sin6_port); - } else if (sa->sa_family == AF_INET) { - inany_from_af(aa, AF_INET, &sa->sa4.sin_addr); - *port = ntohs(sa->sa4.sin_port); - } else { - /* Not valid to call with other address families */ - ASSERT_WITH_MSG(0, "Unexpected sockaddr family: %u", - sa->sa_family); + return 0; } + + if (sa->sa_family == AF_INET) { + inany_from_af(dst, AF_INET, &sa->sa4.sin_addr); + *port = ntohs(sa->sa4.sin_port); + return 0; + } + + return -1; } /** inany_siphash_feed- Fold IPv[46] address into an in-progress siphash diff --git a/tcp.c b/tcp.c index 9c6bc52..0ac298a 100644 --- a/tcp.c +++ b/tcp.c @@ -1546,9 +1546,8 @@ static void tcp_conn_from_tap(const struct ctx *c, sa_family_t af, if (c->mode == MODE_VU) { /* To rebind to same oport after migration */ sl = sizeof(sa); - if (!getsockname(s, &sa.sa, &sl)) - inany_from_sockaddr(&tgt->oaddr, &tgt->oport, &sa); - else + if (getsockname(s, &sa.sa, &sl) || + inany_from_sockaddr(&tgt->oaddr, &tgt->oport, &sa) < 0) err_perror("Can't get local address for socket %i", s); } @@ -2204,9 +2203,8 @@ void tcp_listen_handler(const struct ctx *c, union epoll_ref ref, NULL, ref.tcp_listen.port); if (c->mode == MODE_VU) { /* Rebind to same address after migration */ - if (!getsockname(s, &sa.sa, &sl)) - inany_from_sockaddr(&ini->oaddr, &ini->oport, &sa); - else + if (getsockname(s, &sa.sa, &sl) || + inany_from_sockaddr(&ini->oaddr, &ini->oport, &sa) < 0) err_perror("Can't get local address for socket %i", s); } diff --git a/udp_flow.c b/udp_flow.c index ef2cbb0..fea1cf3 100644 --- a/udp_flow.c +++ b/udp_flow.c @@ -158,12 +158,12 @@ static flow_sidx_t udp_flow_new(const struct ctx *c, union flow *flow, socklen_t sl = sizeof(sa); in_port_t port; - if (getsockname(uflow->s[TGTSIDE], &sa.sa, &sl) < 0) { + if 
(getsockname(uflow->s[TGTSIDE], &sa.sa, &sl) < 0 || + inany_from_sockaddr(&uflow->f.side[TGTSIDE].oaddr, + &port, &sa) < 0) { flow_perror(uflow, "Unable to determine local address"); goto cancel; } - inany_from_sockaddr(&uflow->f.side[TGTSIDE].oaddr, - &port, &sa); if (port != tgt->oport) { flow_err(uflow, "Unexpected local port"); goto cancel; From 08e617ec2ba916d8250a41d3ac68183124a6ec3e Mon Sep 17 00:00:00 2001 From: David Gibson <david@gibson.dropbear.id.au> Date: Thu, 17 Apr 2025 11:55:42 +1000 Subject: [PATCH 362/382] udp: Rework offender address handling in udp_sock_recverr() Make a number of changes to udp_sock_recverr() to improve the robustness of how we handle addresses. * Get the "offender" address (source of the ICMP packet) using the SO_EE_OFFENDER() macro, reducing assumptions about structure layout. * Parse the offender sockaddr using inany_from_sockaddr() * Check explicitly that the source and destination pifs are what we expect. Previously we checked something that was probably equivalent in practice, but isn't strictly speaking what we require for the rest of the code. 
* Verify that for an ICMPv4 error we also have an IPv4 source/offender and destination/endpoint address * Verify that for an ICMPv6 error we have an IPv6 endpoint * Improve debug reporting of any failures Signed-off-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- udp.c | 69 +++++++++++++++++++++++++++++++++++++++++------------------ 1 file changed, 48 insertions(+), 21 deletions(-) diff --git a/udp.c b/udp.c index 57769d0..d09b3eb 100644 --- a/udp.c +++ b/udp.c @@ -159,6 +159,12 @@ udp_meta[UDP_MAX_FRAMES]; MAX(CMSG_SPACE(sizeof(struct in_pktinfo)), \ CMSG_SPACE(sizeof(struct in6_pktinfo))) +#define RECVERR_SPACE \ + MAX(CMSG_SPACE(sizeof(struct sock_extended_err) + \ + sizeof(struct sockaddr_in)), \ + CMSG_SPACE(sizeof(struct sock_extended_err) + \ + sizeof(struct sockaddr_in6))) + /** * enum udp_iov_idx - Indices for the buffers making up a single UDP frame * @UDP_IOV_TAP tap specific header @@ -516,12 +522,8 @@ static int udp_pktinfo(struct msghdr *msg, union inany_addr *dst) static int udp_sock_recverr(const struct ctx *c, int s, flow_sidx_t sidx, uint8_t pif, in_port_t port) { - struct errhdr { - struct sock_extended_err ee; - union sockaddr_inany saddr; - }; - char buf[PKTINFO_SPACE + CMSG_SPACE(sizeof(struct errhdr))]; - const struct errhdr *eh = NULL; + char buf[PKTINFO_SPACE + RECVERR_SPACE]; + const struct sock_extended_err *ee; char data[ICMP6_MAX_DLEN]; struct cmsghdr *hdr; struct iovec iov = { @@ -538,7 +540,13 @@ static int udp_sock_recverr(const struct ctx *c, int s, flow_sidx_t sidx, .msg_controllen = sizeof(buf), }; const struct flowside *toside; - flow_sidx_t tosidx; + char astr[INANY_ADDRSTRLEN]; + char sastr[SOCKADDR_STRLEN]; + union inany_addr offender; + const struct in_addr *o4; + in_port_t offender_port; + struct udp_flow *uflow; + uint8_t topif; size_t dlen; ssize_t rc; @@ -569,10 +577,10 @@ static int udp_sock_recverr(const struct ctx *c, int s, flow_sidx_t sidx, return -1; } - eh = 
(const struct errhdr *)CMSG_DATA(hdr); + ee = (const struct sock_extended_err *)CMSG_DATA(hdr); debug("%s error on UDP socket %i: %s", - str_ee_origin(&eh->ee), s, strerror_(eh->ee.ee_errno)); + str_ee_origin(ee), s, strerror_(ee->ee_errno)); if (!flow_sidx_valid(sidx)) { /* No hint from the socket, determine flow from addresses */ @@ -588,25 +596,44 @@ static int udp_sock_recverr(const struct ctx *c, int s, flow_sidx_t sidx, debug("Ignoring UDP error without flow"); return 1; } + } else { + pif = pif_at_sidx(sidx); } - tosidx = flow_sidx_opposite(sidx); - toside = flowside_at_sidx(tosidx); + uflow = udp_at_sidx(sidx); + ASSERT(uflow); + toside = &uflow->f.side[!sidx.sidei]; + topif = uflow->f.pif[!sidx.sidei]; dlen = rc; - if (pif_is_socket(pif_at_sidx(tosidx))) { - /* XXX Is there any way to propagate ICMPs from socket to - * socket? */ - } else if (hdr->cmsg_level == IPPROTO_IP) { + if (inany_from_sockaddr(&offender, &offender_port, + SO_EE_OFFENDER(ee)) < 0) + goto fail; + + if (pif != PIF_HOST || topif != PIF_TAP) + /* XXX Can we support any other cases? 
*/ + goto fail; + + if (hdr->cmsg_level == IPPROTO_IP && + (o4 = inany_v4(&offender)) && inany_v4(&toside->eaddr)) { dlen = MIN(dlen, ICMP4_MAX_DLEN); - udp_send_tap_icmp4(c, &eh->ee, toside, - eh->saddr.sa4.sin_addr, data, dlen); - } else if (hdr->cmsg_level == IPPROTO_IPV6) { - udp_send_tap_icmp6(c, &eh->ee, toside, - &eh->saddr.sa6.sin6_addr, data, - dlen, sidx.flowi); + udp_send_tap_icmp4(c, ee, toside, *o4, data, dlen); + return 1; } + if (hdr->cmsg_level == IPPROTO_IPV6 && !inany_v4(&toside->eaddr)) { + udp_send_tap_icmp6(c, ee, toside, &offender.a6, data, dlen, + FLOW_IDX(uflow)); + return 1; + } + +fail: + flow_dbg(uflow, "Can't propagate %s error from %s %s to %s %s", + str_ee_origin(ee), + pif_name(pif), + sockaddr_ntop(SO_EE_OFFENDER(ee), sastr, sizeof(sastr)), + pif_name(topif), + inany_ntop(&toside->eaddr, astr, sizeof(astr))); return 1; } From 436afc30447c6f0ce516f2b38c769833114bb5f8 Mon Sep 17 00:00:00 2001 From: David Gibson <david@gibson.dropbear.id.au> Date: Thu, 17 Apr 2025 11:55:43 +1000 Subject: [PATCH 363/382] udp: Translate offender addresses for ICMP messages We've recently added support for propagating ICMP errors related to a UDP flow from the host to the guest, by handling the extended UDP error on the socket and synthesizing a suitable ICMP on the tap interface. Currently we create that ICMP with a source address of the "offender" from the extended error information - the source of the ICMP error received on the host. However, we don't translate this address for cases where we NAT between host and guest. This means (amongst other things) that we won't get a "Connection refused" error as expected if we send data from the guest to the --map-host-loopback address. The error comes from 127.0.0.1 on the host, which doesn't make sense on the tap interface and will be discarded by the guest.
Because ICMP errors can be sent by an intermediate host, not just by the endpoints of the flow, we can't handle this translation purely with the information in the flow table entry. We need to explicitly translate this address by our NAT rules, which we can do with the nat_inbound() helper. Signed-off-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- fwd.c | 4 ++-- fwd.h | 3 +++ udp.c | 18 ++++++++++++++---- 3 files changed, 19 insertions(+), 6 deletions(-) diff --git a/fwd.c b/fwd.c index 5c70e83..b73c2c8 100644 --- a/fwd.c +++ b/fwd.c @@ -450,8 +450,8 @@ uint8_t fwd_nat_from_splice(const struct ctx *c, uint8_t proto, * Only handles translations that depend *only* on the address. Anything * related to specific ports or flows is handled elsewhere. */ -static bool nat_inbound(const struct ctx *c, const union inany_addr *addr, - union inany_addr *translated) +bool nat_inbound(const struct ctx *c, const union inany_addr *addr, + union inany_addr *translated) { if (!IN4_IS_ADDR_UNSPECIFIED(&c->ip4.map_host_loopback) && inany_equals4(addr, &in4addr_loopback)) { diff --git a/fwd.h b/fwd.h index 3562f3c..0458a3c 100644 --- a/fwd.h +++ b/fwd.h @@ -7,6 +7,7 @@ #ifndef FWD_H #define FWD_H +union inany_addr; struct flowside; /* Number of ports for both TCP and UDP */ @@ -47,6 +48,8 @@ void fwd_scan_ports_udp(struct fwd_ports *fwd, const struct fwd_ports *rev, const struct fwd_ports *tcp_rev); void fwd_scan_ports_init(struct ctx *c); +bool nat_inbound(const struct ctx *c, const union inany_addr *addr, + union inany_addr *translated); uint8_t fwd_nat_from_tap(const struct ctx *c, uint8_t proto, const struct flowside *ini, struct flowside *tgt); uint8_t fwd_nat_from_splice(const struct ctx *c, uint8_t proto, diff --git a/udp.c b/udp.c index d09b3eb..f5a5cd1 100644 --- a/udp.c +++ b/udp.c @@ -539,10 +539,10 @@ static int udp_sock_recverr(const struct ctx *c, int s, flow_sidx_t sidx, .msg_control = buf, .msg_controllen = 
sizeof(buf), }; - const struct flowside *toside; + const struct flowside *fromside, *toside; + union inany_addr offender, otap; char astr[INANY_ADDRSTRLEN]; char sastr[SOCKADDR_STRLEN]; - union inany_addr offender; const struct in_addr *o4; in_port_t offender_port; struct udp_flow *uflow; @@ -602,6 +602,7 @@ static int udp_sock_recverr(const struct ctx *c, int s, flow_sidx_t sidx, uflow = udp_at_sidx(sidx); ASSERT(uflow); + fromside = &uflow->f.side[sidx.sidei]; toside = &uflow->f.side[!sidx.sidei]; topif = uflow->f.pif[!sidx.sidei]; dlen = rc; @@ -614,15 +615,24 @@ static int udp_sock_recverr(const struct ctx *c, int s, flow_sidx_t sidx, /* XXX Can we support any other cases? */ goto fail; + /* If the offender *is* the endpoint, make sure our translation is + * consistent with the flow's translation. This matters if the flow + * endpoint has a port specific translation (like --dns-match). + */ + if (inany_equals(&offender, &fromside->eaddr)) + otap = toside->oaddr; + else if (!nat_inbound(c, &offender, &otap)) + goto fail; + if (hdr->cmsg_level == IPPROTO_IP && - (o4 = inany_v4(&offender)) && inany_v4(&toside->eaddr)) { + (o4 = inany_v4(&otap)) && inany_v4(&toside->eaddr)) { dlen = MIN(dlen, ICMP4_MAX_DLEN); udp_send_tap_icmp4(c, ee, toside, *o4, data, dlen); return 1; } if (hdr->cmsg_level == IPPROTO_IPV6 && !inany_v4(&toside->eaddr)) { - udp_send_tap_icmp6(c, ee, toside, &offender.a6, data, dlen, + udp_send_tap_icmp6(c, ee, toside, &otap.a6, data, dlen, FLOW_IDX(uflow)); return 1; } From aa1cc8922867b8f7c17742f8da3b9fcc6291bbeb Mon Sep 17 00:00:00 2001 From: Alyssa Ross <hi@alyssa.is> Date: Sat, 26 Apr 2025 10:44:25 +0200 Subject: [PATCH 364/382] conf: allow --fd 0 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit inetd-style socket passing traditionally starts a service with a connected socket on file descriptors 0 and 1. 
passt disallowing obtaining its socket from either of these descriptors made it difficult to use with super-servers providing this interface — in my case I wanted to use passt with s6-ipcserver[1]. Since (as far as I can tell) passt does not use standard input for anything else (unlike standard output), it should be safe to relax the restrictions on --fd to allow setting it to 0, enabling this use case. Link: https://skarnet.org/software/s6/s6-ipcserver.html [1] Signed-off-by: Alyssa Ross <hi@alyssa.is> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- conf.c | 3 ++- util.c | 4 +++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/conf.c b/conf.c index f942851..a6d7e22 100644 --- a/conf.c +++ b/conf.c @@ -1717,7 +1717,8 @@ void conf(struct ctx *c, int argc, char **argv) fd_tap_opt = strtol(optarg, NULL, 0); if (errno || - fd_tap_opt <= STDERR_FILENO || fd_tap_opt > INT_MAX) + (fd_tap_opt != STDIN_FILENO && fd_tap_opt <= STDERR_FILENO) || + fd_tap_opt > INT_MAX) die("Invalid --fd: %s", optarg); c->fd_tap = fd_tap_opt; diff --git a/util.c b/util.c index 62a6003..f5497d4 100644 --- a/util.c +++ b/util.c @@ -875,7 +875,9 @@ void close_open_files(int argc, char **argv) errno = 0; fd = strtol(optarg, NULL, 0); - if (errno || fd <= STDERR_FILENO || fd > INT_MAX) + if (errno || + (fd != STDIN_FILENO && fd <= STDERR_FILENO) || + fd > INT_MAX) die("Invalid --fd: %s", optarg); } } while (name != -1); From ea0a1240df671de221f469327899564ed74b5edd Mon Sep 17 00:00:00 2001 From: Stefano Brivio <sbrivio@redhat.com> Date: Wed, 30 Apr 2025 16:48:34 +0200 Subject: [PATCH 365/382] passt-repair: Hide bogus gcc warning from -Og MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When building with gcc 13 and -Og, we get: passt-repair.c: In function ‘main’: passt-repair.c:161:23: warning: ‘ev’ may be used uninitialized [-Wmaybe-uninitialized] 161 | if (ev->len > NAME_MAX + 1 || ev->name[ev->len - 1] != '\0') { | ~~^~~~~ but that 
can't actually happen, because we only exit the preceding while loop if 'found' is true, and that only happens, in turn, as we assign 'ev'. Get rid of the warning by (redundantly) initialising ev to NULL. Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- passt-repair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/passt-repair.c b/passt-repair.c index 256a8c9..ff1c44f 100644 --- a/passt-repair.c +++ b/passt-repair.c @@ -113,7 +113,7 @@ int main(int argc, char **argv) if ((sb.st_mode & S_IFMT) == S_IFDIR) { char buf[sizeof(struct inotify_event) + NAME_MAX + 1] __attribute__ ((aligned(__alignof__(struct inotify_event)))); - const struct inotify_event *ev; + const struct inotify_event *ev = NULL; char path[PATH_MAX + 1]; bool found = false; ssize_t n; From 6a96cd97a5fda26a8f12531a72f6a969e476ad9e Mon Sep 17 00:00:00 2001 From: Stefano Brivio <sbrivio@redhat.com> Date: Wed, 30 Apr 2025 16:59:13 +0200 Subject: [PATCH 366/382] util: Fix typo, ASSSERTION -> ASSERTION Fixes: 9153aca15bc1 ("util: Add abort_with_msg() and ASSERT_WITH_MSG() helpers") Signed-off-by: Stefano Brivio <sbrivio@redhat.com> Reviewed-by: David Gibson <david@gibson.dropbear.id.au> --- util.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/util.h b/util.h index cc7d084..5947337 100644 --- a/util.h +++ b/util.h @@ -75,7 +75,7 @@ void abort_with_msg(const char *fmt, ...) #define ASSERT_WITH_MSG(expr, ...) \ ((expr) ? 
(void)0 : abort_with_msg(__VA_ARGS__)) #define ASSERT(expr) \ - ASSERT_WITH_MSG((expr), "ASSSERTION FAILED in %s (%s:%d): %s", \ + ASSERT_WITH_MSG((expr), "ASSERTION FAILED in %s (%s:%d): %s", \ __func__, __FILE__, __LINE__, STRINGIFY(expr)) #ifdef P_tmpdir From 11be695f5c0a6a7d74e9628e9863e665f59d511f Mon Sep 17 00:00:00 2001 From: Laurent Vivier <lvivier@redhat.com> Date: Wed, 30 Apr 2025 18:05:25 +0200 Subject: [PATCH 367/382] flow: fix podman issue #25959 While running piHole using podman, traffic can trigger the following assert: ASSSERTION FAILED in flow_alloc (flow.c:521): flow->f.state == FLOW_STATE_FREE Backtrace shows that this happens in flow_defer_handler(): #4 0x00005610d6f5b481 flow_alloc (passt + 0xb481) #5 0x00005610d6f74f86 udp_flow_from_sock (passt + 0x24f86) #6 0x00005610d6f737c3 udp_sock_fwd (passt + 0x237c3) #7 0x00005610d6f74c07 udp_flush_flow (passt + 0x24c07) #8 0x00005610d6f752c2 udp_flow_defer (passt + 0x252c2) #9 0x00005610d6f5bce1 flow_defer_handler (passt + 0xbce1) We are trying to allocate a new flow inside the loop freeing them. Inside the loop free_head points to the first free flow entry in the current cluster. But if we allocate a new entry during the loop, free_head is not updated and can point now to the entry we have just allocated. We can fix the problem by splitting the loop in two parts: - first part where we can close some of them and allocate some new flow entries, - second part where we free the entries closed in the previous loop and we aggregate the free entries to merge the consecutive clusters.
Reported-by: Martin Rijntjes <bugs@air-global.nl> Link: https://github.com/containers/podman/issues/25959 Fixes: 9725e7988837 ("udp_flow: Don't discard packets that arrive between bind() and connect()") Signed-off-by: Laurent Vivier <lvivier@redhat.com> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- flow.c | 109 ++++++++++++++++++++++++++++++--------------------------- 1 file changed, 58 insertions(+), 51 deletions(-) diff --git a/flow.c b/flow.c index 447c021..c5718e3 100644 --- a/flow.c +++ b/flow.c @@ -800,6 +800,7 @@ void flow_defer_handler(const struct ctx *c, const struct timespec *now) { struct flow_free_cluster *free_head = NULL; unsigned *last_next = &flow_first_free; + bool to_free[FLOW_MAX] = { 0 }; bool timer = false; union flow *flow; @@ -810,9 +811,44 @@ void flow_defer_handler(const struct ctx *c, const struct timespec *now) ASSERT(!flow_new_entry); /* Incomplete flow at end of cycle */ - flow_foreach_slot(flow) { + /* Check which flows we might need to close first, but don't free them + * yet as it's not safe to do that in the middle of flow_foreach(). 
+ */ + flow_foreach(flow) { bool closed = false; + switch (flow->f.type) { + case FLOW_TYPE_NONE: + ASSERT(false); + break; + case FLOW_TCP: + closed = tcp_flow_defer(&flow->tcp); + break; + case FLOW_TCP_SPLICE: + closed = tcp_splice_flow_defer(&flow->tcp_splice); + if (!closed && timer) + tcp_splice_timer(c, &flow->tcp_splice); + break; + case FLOW_PING4: + case FLOW_PING6: + if (timer) + closed = icmp_ping_timer(c, &flow->ping, now); + break; + case FLOW_UDP: + closed = udp_flow_defer(c, &flow->udp, now); + if (!closed && timer) + closed = udp_flow_timer(c, &flow->udp, now); + break; + default: + /* Assume other flow types don't need any handling */ + ; + } + + to_free[FLOW_IDX(flow)] = closed; + } + + /* Second step: actually free the flows */ + flow_foreach_slot(flow) { switch (flow->f.state) { case FLOW_STATE_FREE: { unsigned skip = flow->free.n; @@ -845,59 +881,30 @@ void flow_defer_handler(const struct ctx *c, const struct timespec *now) break; case FLOW_STATE_ACTIVE: - /* Nothing to do */ - break; + if (to_free[FLOW_IDX(flow)]) { + flow_set_state(&flow->f, FLOW_STATE_FREE); + memset(flow, 0, sizeof(*flow)); - default: - ASSERT(false); - } - - switch (flow->f.type) { - case FLOW_TYPE_NONE: - ASSERT(false); - break; - case FLOW_TCP: - closed = tcp_flow_defer(&flow->tcp); - break; - case FLOW_TCP_SPLICE: - closed = tcp_splice_flow_defer(&flow->tcp_splice); - if (!closed && timer) - tcp_splice_timer(c, &flow->tcp_splice); - break; - case FLOW_PING4: - case FLOW_PING6: - if (timer) - closed = icmp_ping_timer(c, &flow->ping, now); - break; - case FLOW_UDP: - closed = udp_flow_defer(c, &flow->udp, now); - if (!closed && timer) - closed = udp_flow_timer(c, &flow->udp, now); - break; - default: - /* Assume other flow types don't need any handling */ - ; - } - - if (closed) { - flow_set_state(&flow->f, FLOW_STATE_FREE); - memset(flow, 0, sizeof(*flow)); - - if (free_head) { - /* Add slot to current free cluster */ - ASSERT(FLOW_IDX(flow) == - FLOW_IDX(free_head) + 
free_head->n); - free_head->n++; - flow->free.n = flow->free.next = 0; + if (free_head) { + /* Add slot to current free cluster */ + ASSERT(FLOW_IDX(flow) == + FLOW_IDX(free_head) + free_head->n); + free_head->n++; + flow->free.n = flow->free.next = 0; + } else { + /* Create new free cluster */ + free_head = &flow->free; + free_head->n = 1; + *last_next = FLOW_IDX(flow); + last_next = &free_head->next; + } } else { - /* Create new free cluster */ - free_head = &flow->free; - free_head->n = 1; - *last_next = FLOW_IDX(flow); - last_next = &free_head->next; + free_head = NULL; } - } else { - free_head = NULL; + break; + + default: + ASSERT(false); } } From 93394f4ef0966602b2ada8f72beaf75352add7b1 Mon Sep 17 00:00:00 2001 From: Janne Grunau <janne-psst@jannau.net> Date: Thu, 1 May 2025 11:54:07 +0200 Subject: [PATCH 368/382] selinux: Add getattr to class udp_socket Commit 59cc89f ("udp, udp_flow: Track our specific address on socket interfaces") added a getsockname() call in udp_flow_new(). This requires getattr. Fixes "Flow 0 (UDP flow): Unable to determine local address: Permission denied" errors in muvm/passt on Fedora Linux 42 with SELinux. 
The SELinux audit message is | type=AVC msg=audit(1746083799.606:235): avc: denied { getattr } for | pid=2961 comm="passt" laddr=127.0.0.1 lport=49221 | faddr=127.0.0.53 fport=53 | scontext=unconfined_u:unconfined_r:passt_t:s0-s0:c0.c1023 | tcontext=unconfined_u:unconfined_r:passt_t:s0-s0:c0.c1023 | tclass=udp_socket permissive=0 Fixes: 59cc89f4cc01 ("udp, udp_flow: Track our specific address on socket interfaces") Link: https://bugzilla.redhat.com/show_bug.cgi?id=2363238 Signed-off-by: Janne Grunau <janne-psst@jannau.net> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- contrib/selinux/passt.te | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/contrib/selinux/passt.te b/contrib/selinux/passt.te index f8ea672..eb9ce72 100644 --- a/contrib/selinux/passt.te +++ b/contrib/selinux/passt.te @@ -49,7 +49,7 @@ require { type proc_net_t; type node_t; class tcp_socket { create accept listen name_bind name_connect getattr ioctl }; - class udp_socket { create accept listen }; + class udp_socket { create accept listen getattr }; class icmp_socket { bind create name_bind node_bind setopt read write }; class sock_file { create unlink write }; @@ -133,7 +133,7 @@ allow passt_t node_t:icmp_socket { name_bind node_bind }; allow passt_t port_t:icmp_socket name_bind; allow passt_t self:tcp_socket { create getopt setopt connect bind listen accept shutdown read write getattr ioctl }; -allow passt_t self:udp_socket { create getopt setopt connect bind read write }; +allow passt_t self:udp_socket { create getopt setopt connect bind read write getattr }; allow passt_t self:icmp_socket { bind create setopt read write }; allow passt_t user_tmp_t:dir { add_name write }; From f0021f9e1d4f118f4167149b256346f3dfea9d2b Mon Sep 17 00:00:00 2001 From: Emanuel Valasiadis <emanuel@valasiadis.space> Date: Fri, 2 May 2025 15:31:39 +0200 Subject: [PATCH 369/382] fwd: fix doc typo Signed-off-by: Emanuel Valasiadis <emanuel@valasiadis.space> Reviewed-by: David Gibson 
<david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- fwd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fwd.c b/fwd.c index b73c2c8..49aabc3 100644 --- a/fwd.c +++ b/fwd.c @@ -440,7 +440,7 @@ uint8_t fwd_nat_from_splice(const struct ctx *c, uint8_t proto, } /** - * nat_inbound() - Apply address translation for outbound (HOST to TAP) + * nat_inbound() - Apply address translation for inbound (HOST to TAP) * @c: Execution context * @addr: Input address (as seen on HOST interface) * @translated: Output address (as seen on TAP interface) From 587980ca1e9d5645f6738f67ec3f15cc61a7efa3 Mon Sep 17 00:00:00 2001 From: Stefano Brivio <sbrivio@redhat.com> Date: Fri, 2 May 2025 21:56:30 +0200 Subject: [PATCH 370/382] udp: Actually discard datagrams we can't forward Given that udp_sock_fwd() now loops on udp_peek_addr() to get endpoint addresses for datagrams, if we can't forward one of these datagrams, we need to make sure we actually discard it. Otherwise, with MSG_PEEK, we won't dequeue it and will loop on it forever. For example, if we fail to create a socket for a new flow, because, say, the destination of an inbound packet is multicast, and we can't bind() to a multicast address, the loop will look like this: 18.0563: Flow 0 (NEW): FREE -> NEW 18.0563: Flow 0 (INI): NEW -> INI 18.0563: Flow 0 (INI): HOST [127.0.0.1]:42487 -> [127.0.0.1]:9997 => ?
18.0563: Flow 0 (TGT): INI -> TGT 18.0563: Flow 0 (TGT): HOST [127.0.0.1]:42487 -> [ff02::c]:9997 => SPLICE [0.0.0.0]:42487 -> [88.198.0.164]:9997 18.0563: Flow 0 (UDP flow): TGT -> TYPED 18.0564: Flow 0 (UDP flow): HOST [127.0.0.1]:42487 -> [ff02::c]:9997 => SPLICE [0.0.0.0]:42487 -> [88.198.0.164]:9997 18.0564: Flow 0 (UDP flow): Couldn't open flow specific socket: Invalid argument 18.0564: Flow 0 (FREE): TYPED -> FREE 18.0564: Flow 0 (FREE): HOST [127.0.0.1]:42487 -> [ff02::c]:9997 => SPLICE [0.0.0.0]:42487 -> [88.198.0.164]:9997 18.0564: Discarding datagram without flow 18.0564: Flow 0 (NEW): FREE -> NEW 18.0564: Flow 0 (INI): NEW -> INI 18.0564: Flow 0 (INI): HOST [127.0.0.1]:42487 -> [127.0.0.1]:9997 => ? 18.0564: Flow 0 (TGT): INI -> TGT 18.0564: Flow 0 (TGT): HOST [127.0.0.1]:42487 -> [ff02::c]:9997 => SPLICE [0.0.0.0]:42487 -> [88.198.0.164]:9997 18.0564: Flow 0 (UDP flow): TGT -> TYPED 18.0564: Flow 0 (UDP flow): HOST [127.0.0.1]:42487 -> [ff02::c]:9997 => SPLICE [0.0.0.0]:42487 -> [88.198.0.164]:9997 18.0564: Flow 0 (UDP flow): Couldn't open flow specific socket: Invalid argument 18.0564: Flow 0 (FREE): TYPED -> FREE 18.0564: Flow 0 (FREE): HOST [127.0.0.1]:42487 -> [ff02::c]:9997 => SPLICE [0.0.0.0]:42487 -> [88.198.0.164]:9997 18.0564: Discarding datagram without flow and seen from strace: epoll_wait(3, [{events=EPOLLIN, data=0x1076c00000705}], 8, 1000) = 1 recvmsg(7, {msg_name={sa_family=AF_INET6, sin6_port=htons(55899), sin6_flowinfo=htonl(0), inet_pton(AF_INET6, "fe80::26e8:53ff:fef3:13b6", &sin6_addr), sin6_scope_id=if_nametoindex("wlp4s0")}, msg_namelen=28, msg_iov=NULL, msg_iovlen=0, msg_control=[{cmsg_len=36, cmsg_level=SOL_IPV6, cmsg_type=0x32, cmsg_data="\xff\x02\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x0c\x03\x00\x00\x00"}], msg_controllen=40, msg_flags=MSG_TRUNC}, MSG_PEEK|MSG_DONTWAIT) = 0 socket(AF_INET6, SOCK_DGRAM|SOCK_NONBLOCK, IPPROTO_UDP) = 12 setsockopt(12, SOL_IPV6, IPV6_V6ONLY, [1], 4) = 0 setsockopt(12, SOL_SOCKET, 
SO_REUSEADDR, [1], 4) = 0 setsockopt(12, SOL_IPV6, IPV6_RECVERR, [1], 4) = 0 setsockopt(12, SOL_IPV6, IPV6_RECVPKTINFO, [1], 4) = 0 bind(12, {sa_family=AF_INET6, sin6_port=htons(1900), sin6_flowinfo=htonl(0), inet_pton(AF_INET6, "ff02::c", &sin6_addr), sin6_scope_id=0}, 28) = -1 EINVAL (Invalid argument) close(12) = 0 recvmsg(7, {msg_name={sa_family=AF_INET6, sin6_port=htons(55899), sin6_flowinfo=htonl(0), inet_pton(AF_INET6, "fe80::26e8:53ff:fef3:13b6", &sin6_addr), sin6_scope_id=if_nametoindex("wlp4s0")}, msg_namelen=28, msg_iov=NULL, msg_iovlen=0, msg_control=[{cmsg_len=36, cmsg_level=SOL_IPV6, cmsg_type=0x32, cmsg_data="\xff\x02\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x0c\x03\x00\x00\x00"}], msg_controllen=40, msg_flags=MSG_TRUNC}, MSG_PEEK|MSG_DONTWAIT) = 0 socket(AF_INET6, SOCK_DGRAM|SOCK_NONBLOCK, IPPROTO_UDP) = 12 setsockopt(12, SOL_IPV6, IPV6_V6ONLY, [1], 4) = 0 setsockopt(12, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 setsockopt(12, SOL_IPV6, IPV6_RECVERR, [1], 4) = 0 setsockopt(12, SOL_IPV6, IPV6_RECVPKTINFO, [1], 4) = 0 bind(12, {sa_family=AF_INET6, sin6_port=htons(1900), sin6_flowinfo=htonl(0), inet_pton(AF_INET6, "ff02::c", &sin6_addr), sin6_scope_id=0}, 28) = -1 EINVAL (Invalid argument) close(12) = 0 Signed-off-by: Stefano Brivio <sbrivio@redhat.com> Reviewed-by: David Gibson <david@gibson.dropbear.id.au> --- udp.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/udp.c b/udp.c index f5a5cd1..ca28b37 100644 --- a/udp.c +++ b/udp.c @@ -828,6 +828,7 @@ void udp_sock_fwd(const struct ctx *c, int s, uint8_t frompif, int rc; while ((rc = udp_peek_addr(s, &src, &dst)) != 0) { + bool discard = false; flow_sidx_t tosidx; uint8_t topif; @@ -861,8 +862,17 @@ void udp_sock_fwd(const struct ctx *c, int s, uint8_t frompif, flow_err(uflow, "No support for forwarding UDP from %s to %s", pif_name(frompif), pif_name(topif)); + discard = true; } else { debug("Discarding datagram without flow"); + discard = true; + } + + if (discard) { + struct 
msghdr msg = { 0 }; + + if (recvmsg(s, &msg, MSG_DONTWAIT) < 0) + debug_perror("Failed to discard datagram"); } } } From eea8a76caf85f4bae5f92b695d09b9ddea354b57 Mon Sep 17 00:00:00 2001 From: Laurent Vivier <lvivier@redhat.com> Date: Wed, 7 May 2025 14:36:34 +0200 Subject: [PATCH 371/382] flow: fix podman issue #26073 While running pasta, we trigger the following assert: ASSERTION FAILED in udp_at_sidx (udp_flow.c:35): flow->f.type == FLOW_UDP in udp_at_sidx() in the following path: 902 void udp_sock_handler(const struct ctx *c, union epoll_ref ref, 903 uint32_t events, const struct timespec *now) 904 { 905 struct udp_flow *uflow = udp_at_sidx(ref.flowside); The invalid sidx is coming from the epoll_ref provided by epoll_wait(). This assert follows this error: Couldn't connect flow socket: Permission denied It appears that an error happens in udp_flow_sock() and the recently created fd is not removed from the epoll_ctl() pool: 71 static int udp_flow_sock(const struct ctx *c, 72 struct udp_flow *uflow, unsigned sidei) 73 { ... 82 s = flowside_sock_l4(c, EPOLL_TYPE_UDP, pif, side, fref.data); 83 if (s < 0) { 84 flow_dbg_perror(uflow, "Couldn't open flow specific socket"); 85 return s; 86 } 87 88 if (flowside_connect(c, s, pif, side) < 0) { 89 int rc = -errno; 90 flow_dbg_perror(uflow, "Couldn't connect flow socket"); 91 return rc; 92 } ... flowside_sock_l4() calls sock_l4_sa() that adds 's' to the epoll_ctl() pool. So to cleanly manage the error of flowside_connect() we need to remove 's' from the epoll_ctl() pool using epoll_del(). 
Link: https://github.com/containers/podman/issues/26073 Signed-off-by: Laurent Vivier <lvivier@redhat.com> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- udp_flow.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/udp_flow.c b/udp_flow.c index fea1cf3..b3a13b7 100644 --- a/udp_flow.c +++ b/udp_flow.c @@ -87,6 +87,10 @@ static int udp_flow_sock(const struct ctx *c, if (flowside_connect(c, s, pif, side) < 0) { int rc = -errno; + + if (pif == PIF_HOST) + epoll_del(c, s); + flow_dbg_perror(uflow, "Couldn't connect flow socket"); return rc; } From 92d5d680134455f1a5b51fd8a3e9e64c99ac6d13 Mon Sep 17 00:00:00 2001 From: Laurent Vivier <lvivier@redhat.com> Date: Tue, 6 May 2025 16:13:25 +0200 Subject: [PATCH 372/382] flow: fix wrong macro name in comments The name of the macro for the maximum number of flows is FLOW_MAX, not MAX_FLOW. Signed-off-by: Laurent Vivier <lvivier@redhat.com> Reviewed-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- flow.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/flow.c b/flow.c index c5718e3..6a5c8aa 100644 --- a/flow.c +++ b/flow.c @@ -81,7 +81,7 @@ static_assert(ARRAY_SIZE(flow_proto) == FLOW_NUM_TYPES, * * Free cluster list * flow_first_free gives the index of the first (lowest index) free cluster. - * Each free cluster has the index of the next free cluster, or MAX_FLOW if + * Each free cluster has the index of the next free cluster, or FLOW_MAX if * it is the last free cluster. Together these form a linked list of free * clusters, in strictly increasing order of index. * From 8ec134109eb136432a29bdf5a14f8b1fd4e46208 Mon Sep 17 00:00:00 2001 From: Laurent Vivier <lvivier@redhat.com> Date: Mon, 12 May 2025 18:47:00 +0200 Subject: [PATCH 373/382] flow: close socket fd on error In eea8a76caf85 ("flow: fix podman issue #26073"), we unregister the fd from epoll_ctl() in case of error, but we also need to close it. 
As flowside_sock_l4() also calls sock_l4_sa() via flowside_sock_splice() we can do it unconditionally. Fixes: eea8a76caf85 ("flow: fix podman issue #26073") Signed-off-by: Laurent Vivier <lvivier@redhat.com> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- udp_flow.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/udp_flow.c b/udp_flow.c index b3a13b7..4c6b3c2 100644 --- a/udp_flow.c +++ b/udp_flow.c @@ -88,8 +88,8 @@ static int udp_flow_sock(const struct ctx *c, if (flowside_connect(c, s, pif, side) < 0) { int rc = -errno; - if (pif == PIF_HOST) - epoll_del(c, s); + epoll_del(c, s); + close(s); flow_dbg_perror(uflow, "Couldn't connect flow socket"); return rc; From 570e7b4454f2f879180ae3ca13dedd759aff5243 Mon Sep 17 00:00:00 2001 From: Laurent Vivier <lvivier@redhat.com> Date: Tue, 13 May 2025 11:40:59 +0200 Subject: [PATCH 374/382] dhcpv6: fix GCC error (unterminated-string-initialization) The string STR_NOTONLINK is intentionally not NUL-terminated. Ignore the GCC error using __attribute__((nonstring)). This error is reported by GCC 15.1.1 on Fedora 42. However, Clang 20.1.3 does not support __attribute__((nonstring)). Therefore, NOLINTNEXTLINE(clang-diagnostic-unknown-attributes) is also added to suppress Clang's unknown attribute warning. 
Signed-off-by: Laurent Vivier <lvivier@redhat.com> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- dhcpv6.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/dhcpv6.c b/dhcpv6.c index 373a988..ba16c66 100644 --- a/dhcpv6.c +++ b/dhcpv6.c @@ -144,7 +144,9 @@ struct opt_ia_addr { struct opt_status_code { struct opt_hdr hdr; uint16_t code; - char status_msg[sizeof(STR_NOTONLINK) - 1]; + /* "nonstring" is only supported since clang 23 */ + /* NOLINTNEXTLINE(clang-diagnostic-unknown-attributes) */ + __attribute__((nonstring)) char status_msg[sizeof(STR_NOTONLINK) - 1]; } __attribute__((packed)); /** From a6b9832e495be636bcccf25e0aebdeb564addf06 Mon Sep 17 00:00:00 2001 From: Laurent Vivier <lvivier@redhat.com> Date: Tue, 13 May 2025 11:41:00 +0200 Subject: [PATCH 375/382] virtio: Fix Clang warning (bugprone-sizeof-expression, cert-arr39-c) In `virtqueue_read_indirect_desc()`, the pointer arithmetic involving `desc` is intentional. We add the length in bytes (`read_len`) divided by the size of `struct vring_desc` to `desc`, which is an array of `struct vring_desc`. This correctly calculates the offset in terms of the number of `struct vring_desc` elements. Clang issues the following warning due to this explicit scaling: virtio.c:238:8: error: suspicious usage of 'sizeof(...)' in pointer arithmetic; this scaled value will be scaled again by the '+=' operator [bugprone-sizeof-expression,cert-arr39-c,-Werror] 238 | desc += read_len / sizeof(struct vring_desc); | ^ ~~~~~~~~~~~~~~~~~~~~~~~~~ virtio.c:238:8: note: '+=' in pointer arithmetic internally scales with 'sizeof(struct vring_desc)' == 16 This behavior is intended, so the warning can be considered a false positive in this context. The code correctly advances the pointer by the desired number of descriptor entries. 
Signed-off-by: Laurent Vivier <lvivier@redhat.com> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- virtio.c | 1 + 1 file changed, 1 insertion(+) diff --git a/virtio.c b/virtio.c index bc2b89a..f7db007 100644 --- a/virtio.c +++ b/virtio.c @@ -235,6 +235,7 @@ static int virtqueue_read_indirect_desc(const struct vu_dev *dev, memcpy(desc, orig_desc, read_len); len -= read_len; addr += read_len; + /* NOLINTNEXTLINE(bugprone-sizeof-expression,cert-arr39-c) */ desc += read_len / sizeof(struct vring_desc); } From 0f7bf10b0a5542690dc6c75e4b56a6030ca8a663 Mon Sep 17 00:00:00 2001 From: Laurent Vivier <lvivier@redhat.com> Date: Tue, 13 May 2025 11:41:01 +0200 Subject: [PATCH 376/382] ndp: Fix Clang analyzer warning (clang-analyzer-security.PointerSub) Addresses Clang warning: "Subtraction of two pointers that do not point into the same array is undefined behavior" for the line: `ndp_send(c, dst, &ra, ptr - (unsigned char *)&ra);` Here, `ptr` is `&ra.var[0]`. The subtraction calculates the offset of `var[0]` within the `struct ra_options ra`. Since `ptr` points inside `ra`, this pointer arithmetic is well-defined for calculating the size of the data to send, even if `ptr` and `&ra` are not strictly considered part of the same "array" by the analyzer. 
Signed-off-by: Laurent Vivier <lvivier@redhat.com> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- ndp.c | 1 + 1 file changed, 1 insertion(+) diff --git a/ndp.c b/ndp.c index ded2081..b664034 100644 --- a/ndp.c +++ b/ndp.c @@ -328,6 +328,7 @@ static void ndp_ra(const struct ctx *c, const struct in6_addr *dst) memcpy(&ra.source_ll.mac, c->our_tap_mac, ETH_ALEN); + /* NOLINTNEXTLINE(clang-analyzer-security.PointerSub) */ ndp_send(c, dst, &ra, ptr - (unsigned char *)&ra); } From 2d3d69c5c348d18112596bd3fdeed95689c613c8 Mon Sep 17 00:00:00 2001 From: Laurent Vivier <lvivier@redhat.com> Date: Tue, 13 May 2025 11:41:02 +0200 Subject: [PATCH 377/382] flow: Fix clang error (clang-analyzer-security.PointerSub) Fixes the following clang-analyzer warning: flow_table.h:96:25: note: Subtraction of two pointers that do not point into the same array is undefined behavior 96 | return (union flow *)f - flowtab; The `flow_idx()` function is called via `FLOW_IDX()` from `flow_foreach_slot()`, where `f` is set to `&flowtab[idx].f`. Therefore, `f` and `flowtab` do point to the same array. Signed-off-by: Laurent Vivier <lvivier@redhat.com> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- flow_table.h | 1 + 1 file changed, 1 insertion(+) diff --git a/flow_table.h b/flow_table.h index 2d5c65c..3f3f4b7 100644 --- a/flow_table.h +++ b/flow_table.h @@ -93,6 +93,7 @@ extern union flow flowtab[]; */ static inline unsigned flow_idx(const struct flow_common *f) { + /* NOLINTNEXTLINE(clang-analyzer-security.PointerSub) */ return (union flow *)f - flowtab; } From 4234ace84cdf989cbcdb96a8165221dc83a11c85 Mon Sep 17 00:00:00 2001 From: Laurent Vivier <lvivier@redhat.com> Date: Wed, 14 May 2025 15:45:09 +0200 Subject: [PATCH 378/382] test: Display count of skipped tests in status and summary This commit enhances test reporting by tracking and displaying the number of skipped tests. 
The skipped test count is now visible in the tmux status bar during execution and included in the final test summary log. This provides a more complete overview of test suite results. Signed-off-by: Laurent Vivier <lvivier@redhat.com> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- test/lib/term | 7 +++++-- test/run | 6 +++--- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/test/lib/term b/test/lib/term index ed690de..089364c 100755 --- a/test/lib/term +++ b/test/lib/term @@ -19,6 +19,7 @@ STATUS_FILE_INDEX=0 STATUS_COLS= STATUS_PASS=0 STATUS_FAIL=0 +STATUS_SKIPPED=0 PR_RED='\033[1;31m' PR_GREEN='\033[1;32m' @@ -439,19 +440,21 @@ info_layout() { # status_test_ok() - Update counter of passed tests, log and display message status_test_ok() { STATUS_PASS=$((STATUS_PASS + 1)) - tmux set status-right "PASS: ${STATUS_PASS} | FAIL: ${STATUS_FAIL} | #(TZ="UTC" date -Iseconds)" + tmux set status-right "PASS: ${STATUS_PASS} | FAIL: ${STATUS_FAIL} | SKIPPED: ${STATUS_SKIPPED} | #(TZ="UTC" date -Iseconds)" info_passed } # status_test_fail() - Update counter of failed tests, log and display message status_test_fail() { STATUS_FAIL=$((STATUS_FAIL + 1)) - tmux set status-right "PASS: ${STATUS_PASS} | FAIL: ${STATUS_FAIL} | #(TZ="UTC" date -Iseconds)" + tmux set status-right "PASS: ${STATUS_PASS} | FAIL: ${STATUS_FAIL} | SKIPPED: ${STATUS_SKIPPED} | #(TZ="UTC" date -Iseconds)" info_failed } # status_test_fail() - Update counter of failed tests, log and display message status_test_skip() { + STATUS_SKIPPED=$((STATUS_SKIPPED + 1)) + tmux set status-right "PASS: ${STATUS_PASS} | FAIL: ${STATUS_FAIL} | SKIPPED: ${STATUS_SKIPPED} | #(TZ="UTC" date -Iseconds)" info_skipped } diff --git a/test/run b/test/run index 4e86f30..f73c311 100755 --- a/test/run +++ b/test/run @@ -202,7 +202,7 @@ skip_distro() { perf_finish [ ${CI} -eq 1 ] && video_stop - log "PASS: ${STATUS_PASS}, FAIL: ${STATUS_FAIL}" + log "PASS: ${STATUS_PASS}, FAIL: ${STATUS_FAIL}, SKIPPED: 
${STATUS_SKIPPED}" pause_continue \ "Press any key to keep test session open" \ @@ -236,7 +236,7 @@ run_selected() { done teardown "${__setup}" - log "PASS: ${STATUS_PASS}, FAIL: ${STATUS_FAIL}" + log "PASS: ${STATUS_PASS}, FAIL: ${STATUS_FAIL}, SKIPPED: ${STATUS_SKIPPED}" pause_continue \ "Press any key to keep test session open" \ @@ -307,4 +307,4 @@ fi tail -n1 ${LOGFILE} echo "Log at ${LOGFILE}" -exit $(tail -n1 ${LOGFILE} | sed -n 's/.*FAIL: \(.*\)$/\1/p') +exit $(tail -n1 ${LOGFILE} | sed -n 's/.*FAIL: \(.*\),.*$/\1/p') From 2046976866dd1f983cb0417a1d3ee3f64190805d Mon Sep 17 00:00:00 2001 From: Laurent Vivier <lvivier@redhat.com> Date: Thu, 15 May 2025 11:41:51 +0200 Subject: [PATCH 379/382] codespell: Correct typos in comments and error message This commit addresses several spelling errors identified by the `codespell` tool. The corrections apply to: - Code comments in `fwd.c`, `ip.h`, `isolation.c`, and `log.c`. - An error message string in `vhost_user.c`. Specifically, the following misspellings were corrected: - "adddress" to "address" - "capabilites" to "capabilities" - "Musn't" to "Mustn't" - "calculatd" to "calculated" - "Invalide" to "Invalid" Signed-off-by: Laurent Vivier <lvivier@redhat.com> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- fwd.c | 2 +- ip.h | 2 +- isolation.c | 8 ++++---- log.c | 2 +- vhost_user.c | 2 +- 5 files changed, 8 insertions(+), 8 deletions(-) diff --git a/fwd.c b/fwd.c index 49aabc3..250cf56 100644 --- a/fwd.c +++ b/fwd.c @@ -418,7 +418,7 @@ uint8_t fwd_nat_from_splice(const struct ctx *c, uint8_t proto, else tgt->eaddr = inany_loopback6; - /* Preserve the specific loopback adddress used, but let the kernel pick + /* Preserve the specific loopback address used, but let the kernel pick * a source port on the target side */ tgt->oaddr = ini->eaddr; diff --git a/ip.h b/ip.h index 471c57e..24509d9 100644 --- a/ip.h +++ b/ip.h @@ -118,7 +118,7 @@ static inline uint32_t ip6_get_flow_lbl(const struct ipv6hdr *ip6h) char 
*ipv6_l4hdr(const struct pool *p, int idx, size_t offset, uint8_t *proto, size_t *dlen); -/* IPv6 link-local all-nodes multicast adddress, ff02::1 */ +/* IPv6 link-local all-nodes multicast address, ff02::1 */ static const struct in6_addr in6addr_ll_all_nodes = { .s6_addr = { 0xff, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, diff --git a/isolation.c b/isolation.c index c944fb3..bbcd23b 100644 --- a/isolation.c +++ b/isolation.c @@ -129,7 +129,7 @@ static void drop_caps_ep_except(uint64_t keep) * additional layer of protection. Executing this requires * CAP_SETPCAP, which we will have within our userns. * - * Note that dropping capabilites from the bounding set limits + * Note that dropping capabilities from the bounding set limits * exec()ed processes, but does not remove them from the effective or * permitted sets, so it doesn't reduce our own capabilities. */ @@ -174,8 +174,8 @@ static void clamp_caps(void) * Should: * - drop unneeded capabilities * - close all open files except for standard streams and the one from --fd - * Musn't: - * - remove filesytem access (we need to access files during setup) + * Mustn't: + * - remove filesystem access (we need to access files during setup) */ void isolate_initial(int argc, char **argv) { @@ -194,7 +194,7 @@ void isolate_initial(int argc, char **argv) * * It's debatable whether it's useful to drop caps when we * retain SETUID and SYS_ADMIN, but we might as well. We drop - * further capabilites in isolate_user() and + * further capabilities in isolate_user() and * isolate_prefork(). 
*/ keep = BIT(CAP_NET_BIND_SERVICE) | BIT(CAP_SETUID) | BIT(CAP_SETGID) | diff --git a/log.c b/log.c index d40d7ae..5d7d76f 100644 --- a/log.c +++ b/log.c @@ -402,7 +402,7 @@ void __setlogmask(int mask) * logfile_init() - Open log file and write header with PID, version, path * @name: Identifier for header: passt or pasta * @path: Path to log file - * @size: Maximum size of log file: log_cut_size is calculatd here + * @size: Maximum size of log file: log_cut_size is calculated here */ void logfile_init(const char *name, const char *path, size_t size) { diff --git a/vhost_user.c b/vhost_user.c index 105f77a..ca36763 100644 --- a/vhost_user.c +++ b/vhost_user.c @@ -1021,7 +1021,7 @@ static bool vu_set_device_state_fd_exec(struct vu_dev *vdev, if (direction != VHOST_USER_TRANSFER_STATE_DIRECTION_SAVE && direction != VHOST_USER_TRANSFER_STATE_DIRECTION_LOAD) - die("Invalide device_state_fd direction: %d", direction); + die("Invalid device_state_fd direction: %d", direction); migrate_request(vdev->context, msg->fds[0], direction == VHOST_USER_TRANSFER_STATE_DIRECTION_LOAD); From 2fd0944f21d6b9fce53c328acf1faaeb46b98528 Mon Sep 17 00:00:00 2001 From: Laurent Vivier <lvivier@redhat.com> Date: Fri, 16 May 2025 14:42:26 +0200 Subject: [PATCH 380/382] vhost_user: Correct and align function comment headers This commit cleans up function comment headers in vhost_user.c to ensure accuracy and consistency with the code. Changes include correcting parameter names in comments and signatures (e.g., standardizing on vmsg for vhost messages, fixing dev to vdev), updating function names in comment descriptions, and removing/rectifying erroneous parameter documentation. 
Signed-off-by: Laurent Vivier <lvivier@redhat.com> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- vhost_user.c | 221 +++++++++++++++++++++++++-------------------------- vhost_user.h | 2 +- 2 files changed, 111 insertions(+), 112 deletions(-) diff --git a/vhost_user.c b/vhost_user.c index ca36763..e8377bb 100644 --- a/vhost_user.c +++ b/vhost_user.c @@ -302,13 +302,13 @@ static void vu_message_write(int conn_fd, struct vhost_user_msg *vmsg) * @conn_fd: vhost-user command socket * @vmsg: vhost-user message */ -static void vu_send_reply(int conn_fd, struct vhost_user_msg *msg) +static void vu_send_reply(int conn_fd, struct vhost_user_msg *vmsg) { - msg->hdr.flags &= ~VHOST_USER_VERSION_MASK; - msg->hdr.flags |= VHOST_USER_VERSION; - msg->hdr.flags |= VHOST_USER_REPLY_MASK; + vmsg->hdr.flags &= ~VHOST_USER_VERSION_MASK; + vmsg->hdr.flags |= VHOST_USER_VERSION; + vmsg->hdr.flags |= VHOST_USER_REPLY_MASK; - vu_message_write(conn_fd, msg); + vu_message_write(conn_fd, vmsg); } /** @@ -319,7 +319,7 @@ static void vu_send_reply(int conn_fd, struct vhost_user_msg *msg) * Return: True as a reply is requested */ static bool vu_get_features_exec(struct vu_dev *vdev, - struct vhost_user_msg *msg) + struct vhost_user_msg *vmsg) { uint64_t features = 1ULL << VIRTIO_F_VERSION_1 | @@ -329,9 +329,9 @@ static bool vu_get_features_exec(struct vu_dev *vdev, (void)vdev; - vmsg_set_reply_u64(msg, features); + vmsg_set_reply_u64(vmsg, features); - debug("Sending back to guest u64: 0x%016"PRIx64, msg->payload.u64); + debug("Sending back to guest u64: 0x%016"PRIx64, vmsg->payload.u64); return true; } @@ -357,11 +357,11 @@ static void vu_set_enable_all_rings(struct vu_dev *vdev, bool enable) * Return: False as no reply is requested */ static bool vu_set_features_exec(struct vu_dev *vdev, - struct vhost_user_msg *msg) + struct vhost_user_msg *vmsg) { - debug("u64: 0x%016"PRIx64, msg->payload.u64); + debug("u64: 0x%016"PRIx64, vmsg->payload.u64); - vdev->features = msg->payload.u64; + 
vdev->features = vmsg->payload.u64; /* We only support devices conforming to VIRTIO 1.0 or * later */ @@ -382,10 +382,10 @@ static bool vu_set_features_exec(struct vu_dev *vdev, * Return: False as no reply is requested */ static bool vu_set_owner_exec(struct vu_dev *vdev, - struct vhost_user_msg *msg) + struct vhost_user_msg *vmsg) { (void)vdev; - (void)msg; + (void)vmsg; return false; } @@ -423,9 +423,9 @@ static bool map_ring(struct vu_dev *vdev, struct vu_virtq *vq) * #syscalls:vu mmap|mmap2 munmap */ static bool vu_set_mem_table_exec(struct vu_dev *vdev, - struct vhost_user_msg *msg) + struct vhost_user_msg *vmsg) { - struct vhost_user_memory m = msg->payload.memory, *memory = &m; + struct vhost_user_memory m = vmsg->payload.memory, *memory = &m; unsigned int i; for (i = 0; i < vdev->nregions; i++) { @@ -465,7 +465,7 @@ static bool vu_set_mem_table_exec(struct vu_dev *vdev, */ mmap_addr = mmap(0, dev_region->size + dev_region->mmap_offset, PROT_READ | PROT_WRITE, MAP_SHARED | - MAP_NORESERVE, msg->fds[i], 0); + MAP_NORESERVE, vmsg->fds[i], 0); if (mmap_addr == MAP_FAILED) die_perror("vhost-user region mmap error"); @@ -474,7 +474,7 @@ static bool vu_set_mem_table_exec(struct vu_dev *vdev, debug(" mmap_addr: 0x%016"PRIx64, dev_region->mmap_addr); - close(msg->fds[i]); + close(vmsg->fds[i]); } for (i = 0; i < VHOST_USER_MAX_QUEUES; i++) { @@ -541,7 +541,7 @@ static void vu_log_page(uint8_t *log_table, uint64_t page) /** * vu_log_write() - Log memory write - * @dev: vhost-user device + * @vdev: vhost-user device * @address: Memory address * @length: Memory size */ @@ -566,23 +566,23 @@ void vu_log_write(const struct vu_dev *vdev, uint64_t address, uint64_t length) * @vdev: vhost-user device * @vmsg: vhost-user message * - * Return: False as no reply is requested + * Return: True as a reply is requested * * #syscalls:vu mmap|mmap2 munmap */ static bool vu_set_log_base_exec(struct vu_dev *vdev, - struct vhost_user_msg *msg) + struct vhost_user_msg *vmsg) { uint64_t 
log_mmap_size, log_mmap_offset; void *base; int fd; - if (msg->fd_num != 1 || msg->hdr.size != sizeof(msg->payload.log)) + if (vmsg->fd_num != 1 || vmsg->hdr.size != sizeof(vmsg->payload.log)) die("vhost-user: Invalid log_base message"); - fd = msg->fds[0]; - log_mmap_offset = msg->payload.log.mmap_offset; - log_mmap_size = msg->payload.log.mmap_size; + fd = vmsg->fds[0]; + log_mmap_offset = vmsg->payload.log.mmap_offset; + log_mmap_size = vmsg->payload.log.mmap_size; debug("vhost-user log mmap_offset: %"PRId64, log_mmap_offset); debug("vhost-user log mmap_size: %"PRId64, log_mmap_size); @@ -599,8 +599,8 @@ static bool vu_set_log_base_exec(struct vu_dev *vdev, vdev->log_table = base; vdev->log_size = log_mmap_size; - msg->hdr.size = sizeof(msg->payload.u64); - msg->fd_num = 0; + vmsg->hdr.size = sizeof(vmsg->payload.u64); + vmsg->fd_num = 0; return true; } @@ -613,15 +613,15 @@ static bool vu_set_log_base_exec(struct vu_dev *vdev, * Return: False as no reply is requested */ static bool vu_set_log_fd_exec(struct vu_dev *vdev, - struct vhost_user_msg *msg) + struct vhost_user_msg *vmsg) { - if (msg->fd_num != 1) + if (vmsg->fd_num != 1) die("Invalid log_fd message"); if (vdev->log_call_fd != -1) close(vdev->log_call_fd); - vdev->log_call_fd = msg->fds[0]; + vdev->log_call_fd = vmsg->fds[0]; debug("Got log_call_fd: %d", vdev->log_call_fd); @@ -636,10 +636,10 @@ static bool vu_set_log_fd_exec(struct vu_dev *vdev, * Return: False as no reply is requested */ static bool vu_set_vring_num_exec(struct vu_dev *vdev, - struct vhost_user_msg *msg) + struct vhost_user_msg *vmsg) { - unsigned int idx = msg->payload.state.index; - unsigned int num = msg->payload.state.num; + unsigned int idx = vmsg->payload.state.index; + unsigned int num = vmsg->payload.state.num; trace("State.index: %u", idx); trace("State.num: %u", num); @@ -656,13 +656,13 @@ static bool vu_set_vring_num_exec(struct vu_dev *vdev, * Return: False as no reply is requested */ static bool 
vu_set_vring_addr_exec(struct vu_dev *vdev, - struct vhost_user_msg *msg) + struct vhost_user_msg *vmsg) { /* We need to copy the payload to vhost_vring_addr structure - * to access index because address of msg->payload.addr + * to access index because address of vmsg->payload.addr * can be unaligned as it is packed. */ - struct vhost_vring_addr addr = msg->payload.addr; + struct vhost_vring_addr addr = vmsg->payload.addr; struct vu_virtq *vq = &vdev->vq[addr.index]; debug("vhost_vring_addr:"); @@ -677,7 +677,7 @@ static bool vu_set_vring_addr_exec(struct vu_dev *vdev, debug(" log_guest_addr: 0x%016" PRIx64, (uint64_t)addr.log_guest_addr); - vq->vra = msg->payload.addr; + vq->vra = vmsg->payload.addr; vq->vring.flags = addr.flags; vq->vring.log_guest_addr = addr.log_guest_addr; @@ -702,10 +702,10 @@ static bool vu_set_vring_addr_exec(struct vu_dev *vdev, * Return: False as no reply is requested */ static bool vu_set_vring_base_exec(struct vu_dev *vdev, - struct vhost_user_msg *msg) + struct vhost_user_msg *vmsg) { - unsigned int idx = msg->payload.state.index; - unsigned int num = msg->payload.state.num; + unsigned int idx = vmsg->payload.state.index; + unsigned int num = vmsg->payload.state.num; debug("State.index: %u", idx); debug("State.num: %u", num); @@ -723,13 +723,13 @@ static bool vu_set_vring_base_exec(struct vu_dev *vdev, * Return: True as a reply is requested */ static bool vu_get_vring_base_exec(struct vu_dev *vdev, - struct vhost_user_msg *msg) + struct vhost_user_msg *vmsg) { - unsigned int idx = msg->payload.state.index; + unsigned int idx = vmsg->payload.state.index; debug("State.index: %u", idx); - msg->payload.state.num = vdev->vq[idx].last_avail_idx; - msg->hdr.size = sizeof(msg->payload.state); + vmsg->payload.state.num = vdev->vq[idx].last_avail_idx; + vmsg->hdr.size = sizeof(vmsg->payload.state); vdev->vq[idx].started = false; vdev->vq[idx].vring.avail = 0; @@ -771,21 +771,21 @@ static void vu_set_watch(const struct vu_dev *vdev, int idx) * 
close fds if NOFD bit is set * @vmsg: vhost-user message */ -static void vu_check_queue_msg_file(struct vhost_user_msg *msg) +static void vu_check_queue_msg_file(struct vhost_user_msg *vmsg) { - bool nofd = msg->payload.u64 & VHOST_USER_VRING_NOFD_MASK; - int idx = msg->payload.u64 & VHOST_USER_VRING_IDX_MASK; + bool nofd = vmsg->payload.u64 & VHOST_USER_VRING_NOFD_MASK; + int idx = vmsg->payload.u64 & VHOST_USER_VRING_IDX_MASK; if (idx >= VHOST_USER_MAX_QUEUES) die("Invalid vhost-user queue index: %u", idx); if (nofd) { - vmsg_close_fds(msg); + vmsg_close_fds(vmsg); return; } - if (msg->fd_num != 1) - die("Invalid fds in vhost-user request: %d", msg->hdr.request); + if (vmsg->fd_num != 1) + die("Invalid fds in vhost-user request: %d", vmsg->hdr.request); } /** @@ -797,14 +797,14 @@ static void vu_check_queue_msg_file(struct vhost_user_msg *msg) * Return: False as no reply is requested */ static bool vu_set_vring_kick_exec(struct vu_dev *vdev, - struct vhost_user_msg *msg) + struct vhost_user_msg *vmsg) { - bool nofd = msg->payload.u64 & VHOST_USER_VRING_NOFD_MASK; - int idx = msg->payload.u64 & VHOST_USER_VRING_IDX_MASK; + bool nofd = vmsg->payload.u64 & VHOST_USER_VRING_NOFD_MASK; + int idx = vmsg->payload.u64 & VHOST_USER_VRING_IDX_MASK; - debug("u64: 0x%016"PRIx64, msg->payload.u64); + debug("u64: 0x%016"PRIx64, vmsg->payload.u64); - vu_check_queue_msg_file(msg); + vu_check_queue_msg_file(vmsg); if (vdev->vq[idx].kick_fd != -1) { epoll_del(vdev->context, vdev->vq[idx].kick_fd); @@ -813,7 +813,7 @@ static bool vu_set_vring_kick_exec(struct vu_dev *vdev, } if (!nofd) - vdev->vq[idx].kick_fd = msg->fds[0]; + vdev->vq[idx].kick_fd = vmsg->fds[0]; debug("Got kick_fd: %d for vq: %d", vdev->vq[idx].kick_fd, idx); @@ -837,14 +837,14 @@ static bool vu_set_vring_kick_exec(struct vu_dev *vdev, * Return: False as no reply is requested */ static bool vu_set_vring_call_exec(struct vu_dev *vdev, - struct vhost_user_msg *msg) + struct vhost_user_msg *vmsg) { - bool nofd = 
msg->payload.u64 & VHOST_USER_VRING_NOFD_MASK; - int idx = msg->payload.u64 & VHOST_USER_VRING_IDX_MASK; + bool nofd = vmsg->payload.u64 & VHOST_USER_VRING_NOFD_MASK; + int idx = vmsg->payload.u64 & VHOST_USER_VRING_IDX_MASK; - debug("u64: 0x%016"PRIx64, msg->payload.u64); + debug("u64: 0x%016"PRIx64, vmsg->payload.u64); - vu_check_queue_msg_file(msg); + vu_check_queue_msg_file(vmsg); if (vdev->vq[idx].call_fd != -1) { close(vdev->vq[idx].call_fd); @@ -852,11 +852,11 @@ static bool vu_set_vring_call_exec(struct vu_dev *vdev, } if (!nofd) - vdev->vq[idx].call_fd = msg->fds[0]; + vdev->vq[idx].call_fd = vmsg->fds[0]; /* in case of I/O hang after reconnecting */ if (vdev->vq[idx].call_fd != -1) - eventfd_write(msg->fds[0], 1); + eventfd_write(vmsg->fds[0], 1); debug("Got call_fd: %d for vq: %d", vdev->vq[idx].call_fd, idx); @@ -872,14 +872,14 @@ static bool vu_set_vring_call_exec(struct vu_dev *vdev, * Return: False as no reply is requested */ static bool vu_set_vring_err_exec(struct vu_dev *vdev, - struct vhost_user_msg *msg) + struct vhost_user_msg *vmsg) { - bool nofd = msg->payload.u64 & VHOST_USER_VRING_NOFD_MASK; - int idx = msg->payload.u64 & VHOST_USER_VRING_IDX_MASK; + bool nofd = vmsg->payload.u64 & VHOST_USER_VRING_NOFD_MASK; + int idx = vmsg->payload.u64 & VHOST_USER_VRING_IDX_MASK; - debug("u64: 0x%016"PRIx64, msg->payload.u64); + debug("u64: 0x%016"PRIx64, vmsg->payload.u64); - vu_check_queue_msg_file(msg); + vu_check_queue_msg_file(vmsg); if (vdev->vq[idx].err_fd != -1) { close(vdev->vq[idx].err_fd); @@ -887,7 +887,7 @@ static bool vu_set_vring_err_exec(struct vu_dev *vdev, } if (!nofd) - vdev->vq[idx].err_fd = msg->fds[0]; + vdev->vq[idx].err_fd = vmsg->fds[0]; return false; } @@ -901,7 +901,7 @@ static bool vu_set_vring_err_exec(struct vu_dev *vdev, * Return: True as a reply is requested */ static bool vu_get_protocol_features_exec(struct vu_dev *vdev, - struct vhost_user_msg *msg) + struct vhost_user_msg *vmsg) { uint64_t features = 1ULL << 
VHOST_USER_PROTOCOL_F_REPLY_ACK | 1ULL << VHOST_USER_PROTOCOL_F_LOG_SHMFD | @@ -909,7 +909,7 @@ static bool vu_get_protocol_features_exec(struct vu_dev *vdev, 1ULL << VHOST_USER_PROTOCOL_F_RARP; (void)vdev; - vmsg_set_reply_u64(msg, features); + vmsg_set_reply_u64(vmsg, features); return true; } @@ -922,13 +922,13 @@ static bool vu_get_protocol_features_exec(struct vu_dev *vdev, * Return: False as no reply is requested */ static bool vu_set_protocol_features_exec(struct vu_dev *vdev, - struct vhost_user_msg *msg) + struct vhost_user_msg *vmsg) { - uint64_t features = msg->payload.u64; + uint64_t features = vmsg->payload.u64; debug("u64: 0x%016"PRIx64, features); - vdev->protocol_features = msg->payload.u64; + vdev->protocol_features = vmsg->payload.u64; return false; } @@ -941,11 +941,11 @@ static bool vu_set_protocol_features_exec(struct vu_dev *vdev, * Return: True as a reply is requested */ static bool vu_get_queue_num_exec(struct vu_dev *vdev, - struct vhost_user_msg *msg) + struct vhost_user_msg *vmsg) { (void)vdev; - vmsg_set_reply_u64(msg, VHOST_USER_MAX_QUEUES); + vmsg_set_reply_u64(vmsg, VHOST_USER_MAX_QUEUES); return true; } @@ -958,10 +958,10 @@ static bool vu_get_queue_num_exec(struct vu_dev *vdev, * Return: False as no reply is requested */ static bool vu_set_vring_enable_exec(struct vu_dev *vdev, - struct vhost_user_msg *msg) + struct vhost_user_msg *vmsg) { - unsigned int enable = msg->payload.state.num; - unsigned int idx = msg->payload.state.index; + unsigned int enable = vmsg->payload.state.num; + unsigned int idx = vmsg->payload.state.index; debug("State.index: %u", idx); debug("State.enable: %u", enable); @@ -974,17 +974,17 @@ static bool vu_set_vring_enable_exec(struct vu_dev *vdev, } /** - * vu_set_send_rarp_exec() - vhost-user specification says: "Broadcast a fake - * RARP to notify the migration is terminated", - * but passt doesn't need to update any ARP table, - * so do nothing to silence QEMU bogus error message + * vu_send_rarp_exec() - 
vhost-user specification says: "Broadcast a fake + * RARP to notify the migration is terminated", + * but passt doesn't need to update any ARP table, + * so do nothing to silence QEMU bogus error message * @vdev: vhost-user device * @vmsg: vhost-user message * * Return: False as no reply is requested */ static bool vu_send_rarp_exec(struct vu_dev *vdev, - struct vhost_user_msg *msg) + struct vhost_user_msg *vmsg) { char macstr[ETH_ADDRSTRLEN]; @@ -993,7 +993,7 @@ static bool vu_send_rarp_exec(struct vu_dev *vdev, /* ignore the command */ debug("Ignore command VHOST_USER_SEND_RARP for %s", - eth_ntop((unsigned char *)&msg->payload.u64, macstr, + eth_ntop((unsigned char *)&vmsg->payload.u64, macstr, sizeof(macstr))); return false; @@ -1008,12 +1008,12 @@ static bool vu_send_rarp_exec(struct vu_dev *vdev, * and set bit 8 as we don't provide our own fd. */ static bool vu_set_device_state_fd_exec(struct vu_dev *vdev, - struct vhost_user_msg *msg) + struct vhost_user_msg *vmsg) { - unsigned int direction = msg->payload.transfer_state.direction; - unsigned int phase = msg->payload.transfer_state.phase; + unsigned int direction = vmsg->payload.transfer_state.direction; + unsigned int phase = vmsg->payload.transfer_state.phase; - if (msg->fd_num != 1) + if (vmsg->fd_num != 1) die("Invalid device_state_fd message"); if (phase != VHOST_USER_TRANSFER_STATE_PHASE_STOPPED) @@ -1023,11 +1023,11 @@ static bool vu_set_device_state_fd_exec(struct vu_dev *vdev, direction != VHOST_USER_TRANSFER_STATE_DIRECTION_LOAD) die("Invalid device_state_fd direction: %d", direction); - migrate_request(vdev->context, msg->fds[0], + migrate_request(vdev->context, vmsg->fds[0], direction == VHOST_USER_TRANSFER_STATE_DIRECTION_LOAD); /* We don't provide a new fd for the data transfer */ - vmsg_set_reply_u64(msg, VHOST_USER_VRING_NOFD_MASK); + vmsg_set_reply_u64(vmsg, VHOST_USER_VRING_NOFD_MASK); return true; } @@ -1041,9 +1041,9 @@ static bool vu_set_device_state_fd_exec(struct vu_dev *vdev, */ /* 
cppcheck-suppress constParameterCallback */ static bool vu_check_device_state_exec(struct vu_dev *vdev, - struct vhost_user_msg *msg) + struct vhost_user_msg *vmsg) { - vmsg_set_reply_u64(msg, vdev->context->device_state_result); + vmsg_set_reply_u64(vmsg, vdev->context->device_state_result); return true; } @@ -1051,7 +1051,6 @@ static bool vu_check_device_state_exec(struct vu_dev *vdev, /** * vu_init() - Initialize vhost-user device structure * @c: execution context - * @vdev: vhost-user device */ void vu_init(struct ctx *c) { @@ -1134,7 +1133,7 @@ static void vu_sock_reset(struct vu_dev *vdev) } static bool (*vu_handle[VHOST_USER_MAX])(struct vu_dev *vdev, - struct vhost_user_msg *msg) = { + struct vhost_user_msg *vmsg) = { [VHOST_USER_GET_FEATURES] = vu_get_features_exec, [VHOST_USER_SET_FEATURES] = vu_set_features_exec, [VHOST_USER_GET_PROTOCOL_FEATURES] = vu_get_protocol_features_exec, @@ -1165,7 +1164,7 @@ static bool (*vu_handle[VHOST_USER_MAX])(struct vu_dev *vdev, */ void vu_control_handler(struct vu_dev *vdev, int fd, uint32_t events) { - struct vhost_user_msg msg = { 0 }; + struct vhost_user_msg vmsg = { 0 }; bool need_reply, reply_requested; int ret; @@ -1174,38 +1173,38 @@ void vu_control_handler(struct vu_dev *vdev, int fd, uint32_t events) return; } - ret = vu_message_read_default(fd, &msg); + ret = vu_message_read_default(fd, &vmsg); if (ret == 0) { vu_sock_reset(vdev); return; } debug("================ Vhost user message ================"); - debug("Request: %s (%d)", vu_request_to_string(msg.hdr.request), - msg.hdr.request); - debug("Flags: 0x%x", msg.hdr.flags); - debug("Size: %u", msg.hdr.size); + debug("Request: %s (%d)", vu_request_to_string(vmsg.hdr.request), + vmsg.hdr.request); + debug("Flags: 0x%x", vmsg.hdr.flags); + debug("Size: %u", vmsg.hdr.size); - need_reply = msg.hdr.flags & VHOST_USER_NEED_REPLY_MASK; + need_reply = vmsg.hdr.flags & VHOST_USER_NEED_REPLY_MASK; - if (msg.hdr.request >= 0 && msg.hdr.request < VHOST_USER_MAX && - 
vu_handle[msg.hdr.request]) - reply_requested = vu_handle[msg.hdr.request](vdev, &msg); + if (vmsg.hdr.request >= 0 && vmsg.hdr.request < VHOST_USER_MAX && + vu_handle[vmsg.hdr.request]) + reply_requested = vu_handle[vmsg.hdr.request](vdev, &vmsg); else - die("Unhandled request: %d", msg.hdr.request); + die("Unhandled request: %d", vmsg.hdr.request); /* cppcheck-suppress legacyUninitvar */ if (!reply_requested && need_reply) { - msg.payload.u64 = 0; - msg.hdr.flags = 0; - msg.hdr.size = sizeof(msg.payload.u64); - msg.fd_num = 0; + vmsg.payload.u64 = 0; + vmsg.hdr.flags = 0; + vmsg.hdr.size = sizeof(vmsg.payload.u64); + vmsg.fd_num = 0; reply_requested = true; } if (reply_requested) - vu_send_reply(fd, &msg); + vu_send_reply(fd, &vmsg); - if (msg.hdr.request == VHOST_USER_CHECK_DEVICE_STATE && + if (vmsg.hdr.request == VHOST_USER_CHECK_DEVICE_STATE && vdev->context->device_state_result == 0 && !vdev->context->migrate_target) { info("Migration complete, exiting"); diff --git a/vhost_user.h b/vhost_user.h index 1daacd1..f2ae2da 100644 --- a/vhost_user.h +++ b/vhost_user.h @@ -184,7 +184,7 @@ union vhost_user_payload { }; /** - * struct vhost_user_msg - vhost-use message + * struct vhost_user_msg - vhost-user message * @hdr: Message header * @payload: Message payload * @fds: File descriptors associated with the message From b915375a421d70065baa90444da49954ceacde38 Mon Sep 17 00:00:00 2001 From: Laurent Vivier <lvivier@redhat.com> Date: Fri, 16 May 2025 14:42:27 +0200 Subject: [PATCH 381/382] virtio: Correct and align comment headers Standardize and fix issues in `virtio.c` and `virtio.h` comment headers. Improvements include: - Added `()` to function names in comment summaries. - Added colons after parameter and enum member tags. - Changed `/*` to `/**` for `virtq_avail_event()` comment. - Fixed typos (e.g., "file"->"fill", "virqueue"->"virtqueue"). - Added missing `Return:` tag for `vu_queue_rewind()`. - Corrected parameter names in `virtio.h` comments to match code. 
Signed-off-by: Laurent Vivier <lvivier@redhat.com> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- virtio.c | 29 ++++++++++++++++------------- virtio.h | 4 ++-- 2 files changed, 18 insertions(+), 15 deletions(-) diff --git a/virtio.c b/virtio.c index f7db007..83906aa 100644 --- a/virtio.c +++ b/virtio.c @@ -156,9 +156,9 @@ static inline uint16_t vring_avail_ring(const struct vu_virtq *vq, int i) } /** - * virtq_used_event - Get location of used event indices + * virtq_used_event() - Get location of used event indices * (only with VIRTIO_F_EVENT_IDX) - * @vq Virtqueue + * @vq: Virtqueue * * Return: return the location of the used event index */ @@ -170,7 +170,7 @@ static inline uint16_t *virtq_used_event(const struct vu_virtq *vq) /** * vring_get_used_event() - Get the used event from the available ring - * @vq Virtqueue + * @vq: Virtqueue * * Return: the used event (available only if VIRTIO_RING_F_EVENT_IDX is set) * used_event is a performant alternative where the driver @@ -244,9 +244,9 @@ static int virtqueue_read_indirect_desc(const struct vu_dev *dev, /** * enum virtqueue_read_desc_state - State in the descriptor chain - * @VIRTQUEUE_READ_DESC_ERROR Found an invalid descriptor - * @VIRTQUEUE_READ_DESC_DONE No more descriptors in the chain - * @VIRTQUEUE_READ_DESC_MORE there are more descriptors in the chain + * @VIRTQUEUE_READ_DESC_ERROR: Found an invalid descriptor + * @VIRTQUEUE_READ_DESC_DONE: No more descriptors in the chain + * @VIRTQUEUE_READ_DESC_MORE: there are more descriptors in the chain */ enum virtqueue_read_desc_state { VIRTQUEUE_READ_DESC_ERROR = -1, @@ -347,8 +347,9 @@ void vu_queue_notify(const struct vu_dev *dev, struct vu_virtq *vq) die_perror("Error writing vhost-user queue eventfd"); } -/* virtq_avail_event() - Get location of available event indices - * (only with VIRTIO_F_EVENT_IDX) +/** + * virtq_avail_event() - Get location of available event indices + * (only with VIRTIO_F_EVENT_IDX) * @vq: Virtqueue * * Return: return the 
location of the available event index @@ -421,8 +422,8 @@ static bool virtqueue_map_desc(const struct vu_dev *dev, } /** - * vu_queue_map_desc - Map the virtqueue descriptor ring into our virtual - * address space + * vu_queue_map_desc() - Map the virtqueue descriptor ring into our virtual + * address space * @dev: Vhost-user device * @vq: Virtqueue * @idx: First descriptor ring entry to map @@ -505,7 +506,7 @@ static int vu_queue_map_desc(const struct vu_dev *dev, * vu_queue_pop() - Pop an entry from the virtqueue * @dev: Vhost-user device * @vq: Virtqueue - * @elem: Virtqueue element to file with the entry information + * @elem: Virtqueue element to fill with the entry information * * Return: -1 if there is an error, 0 otherwise */ @@ -545,7 +546,7 @@ int vu_queue_pop(const struct vu_dev *dev, struct vu_virtq *vq, } /** - * vu_queue_detach_element() - Detach an element from the virqueue + * vu_queue_detach_element() - Detach an element from the virtqueue * @vq: Virtqueue */ void vu_queue_detach_element(struct vu_virtq *vq) @@ -555,7 +556,7 @@ void vu_queue_detach_element(struct vu_virtq *vq) } /** - * vu_queue_unpop() - Push back the previously popped element from the virqueue + * vu_queue_unpop() - Push back the previously popped element from the virtqueue * @vq: Virtqueue */ /* cppcheck-suppress unusedFunction */ @@ -569,6 +570,8 @@ void vu_queue_unpop(struct vu_virtq *vq) * vu_queue_rewind() - Push back a given number of popped elements * @vq: Virtqueue * @num: Number of element to unpop + * + * Return: True on success, false if not */ bool vu_queue_rewind(struct vu_virtq *vq, unsigned int num) { diff --git a/virtio.h b/virtio.h index 7a370bd..d8beb88 100644 --- a/virtio.h +++ b/virtio.h @@ -150,7 +150,7 @@ static inline bool has_feature(uint64_t features, unsigned int fbit) /** * vu_has_feature() - Check if a virtio-net feature is available * @vdev: Vhost-user device - * @bit: Feature to check + * @fbit: Feature to check * * Return: True if the feature is 
available */ @@ -163,7 +163,7 @@ static inline bool vu_has_feature(const struct vu_dev *vdev, /** * vu_has_protocol_feature() - Check if a vhost-user feature is available * @vdev: Vhost-user device - * @bit: Feature to check + * @fbit: Feature to check * * Return: True if the feature is available */ From 3262c9b088288902f28b5d09f61220fae5376082 Mon Sep 17 00:00:00 2001 From: Laurent Vivier <lvivier@redhat.com> Date: Fri, 16 May 2025 14:42:28 +0200 Subject: [PATCH 382/382] iov: Standardize function comment headers Update function comment headers in iov.c to a consistent and standardized format. This change ensures: - Comment blocks for functions consistently start with /**. - Function names in the comment summary line include parentheses (). This improves overall comment clarity and uniformity within the file. Signed-off-by: Laurent Vivier <lvivier@redhat.com> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- iov.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/iov.c b/iov.c index 8c63b7e..91e87a7 100644 --- a/iov.c +++ b/iov.c @@ -26,7 +26,8 @@ #include "iov.h" -/* iov_skip_bytes() - Skip leading bytes of an IO vector +/** + * iov_skip_bytes() - Skip leading bytes of an IO vector * @iov: IO vector * @n: Number of entries in @iov * @skip: Number of leading bytes of @iov to skip @@ -56,8 +57,8 @@ size_t iov_skip_bytes(const struct iovec *iov, size_t n, } /** - * iov_from_buf - Copy data from a buffer to an I/O vector (struct iovec) - * efficiently. + * iov_from_buf() - Copy data from a buffer to an I/O vector (struct iovec) + * efficiently. * * @iov: Pointer to the array of struct iovec describing the * scatter/gather I/O vector. @@ -96,8 +97,8 @@ size_t iov_from_buf(const struct iovec *iov, size_t iov_cnt, } /** - * iov_to_buf - Copy data from a scatter/gather I/O vector (struct iovec) to - * a buffer efficiently. + * iov_to_buf() - Copy data from a scatter/gather I/O vector (struct iovec) to + * a buffer efficiently. 
* * @iov: Pointer to the array of struct iovec describing the scatter/gather * I/O vector. @@ -136,8 +137,8 @@ size_t iov_to_buf(const struct iovec *iov, size_t iov_cnt, } /** - * iov_size - Calculate the total size of a scatter/gather I/O vector - * (struct iovec). + * iov_size() - Calculate the total size of a scatter/gather I/O vector + * (struct iovec). * * @iov: Pointer to the array of struct iovec describing the * scatter/gather I/O vector.