From 418feb37ece9ad584ec8b167861bb21a2cc3c067 Mon Sep 17 00:00:00 2001
From: Stefano Brivio <sbrivio@redhat.com>
Date: Mon, 26 Aug 2024 20:41:31 +0200
Subject: [PATCH 001/382] test: Look for possible sshd-session paths (if it's
 there at all) in mbuto's profile

Some distributions already have OpenSSH 9.8, which introduces split
sshd/sshd-session binaries, and there we need to copy the binary from
the host, which can be /usr/libexec/openssh/sshd-session (Fedora
Rawhide), /usr/lib/ssh/sshd-session (Arch Linux),
/usr/lib/openssh/sshd-session (Debian), and possibly other paths.

Add at least those three, and, if we don't find sshd-session, assume
we don't need it: it could very well be an older version of OpenSSH,
as reported by David for Fedora 40, or perhaps another daemon (would
Dropbear even work? I'm not sure).

Reported-by: David Gibson <david@gibson.dropbear.id.au>
Fixes: d6817b3930be ("test/passt.mbuto: Install sshd-session OpenSSH's split process")
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
Tested-by: David Gibson <david@gibson.dropbear.id.au>
---
 test/passt.mbuto | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/test/passt.mbuto b/test/passt.mbuto
index 61865e8..138d365 100755
--- a/test/passt.mbuto
+++ b/test/passt.mbuto
@@ -13,8 +13,15 @@
 PROGS="${PROGS:-ash,dash,bash ip mount ls insmod mkdir ln cat chmod lsmod
        modprobe find grep mknod mv rm umount jq iperf3 dhclient hostname
        sed tr chown sipcalc cut socat dd strace ping tail killall sleep sysctl
-       nproc tcp_rr tcp_crr udp_rr which tee seq bc sshd ssh-keygen cmp
-       /usr/lib/openssh/sshd-session}"
+       nproc tcp_rr tcp_crr udp_rr which tee seq bc sshd ssh-keygen cmp}"
+
+# OpenSSH 9.8 introduced split binaries, with sshd being the daemon, and
+# sshd-session the per-session program. We need the latter as well, and the path
+# depends on the distribution. It doesn't exist on older versions.
+for bin in /usr/lib/openssh/sshd-session /usr/lib/ssh/sshd-session \
+	   /usr/libexec/openssh/sshd-session; do
+	command -v "${bin}" >/dev/null && PROGS="${PROGS} ${bin}"
+done
 
 KMODS="${KMODS:- virtio_net virtio_pci vmw_vsock_virtio_transport}"
 

From 620e19a1b48a80abddc657b4c17f5e4920f300ec Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Tue, 27 Aug 2024 16:04:44 +1000
Subject: [PATCH 002/382] udp: Merge udp[46]_mh_recv arrays

We've already gotten rid of most of the IPv4/IPv6 specific data structures
in udp.c by merging them with each other.  One significant one remains:
udp[46]_mh_recv.  This was a bit awkward to remove because of a subtle
interaction.  We initialise the msg_namelen fields to represent the total
size we have for a socket address, but when we receive into the arrays
those are modified to the actual length of the sockaddr we received.

That meant that naively merging the arrays would cause the addresses of
IPv6 datagrams received after IPv4 ones to be truncated.  Address that in
this patch by resetting the received
msg_namelen as soon as we've found a flow for the datagram.  Finding the
flow is the only thing that might use the actual sockaddr length, although
we in fact don't need it for the time being.

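To illustrate the interaction outside of passt itself, here is a minimal
standalone sketch (hypothetical names, not the udp.c code): recvmmsg()
overwrites msg_namelen with the length of the sockaddr actually received,
so a shared array has to restore the full size before a slot is reused:

#define _GNU_SOURCE
#include <sys/socket.h>
#include <sys/uio.h>

#define N_FRAMES 4

static struct sockaddr_storage sa[N_FRAMES];
static char buf[N_FRAMES][2048];
static struct iovec iov[N_FRAMES];
static struct mmsghdr mh[N_FRAMES];

static void mh_init(void)
{
	int i;

	for (i = 0; i < N_FRAMES; i++) {
		iov[i].iov_base			= buf[i];
		iov[i].iov_len			= sizeof(buf[i]);
		mh[i].msg_hdr.msg_name		= &sa[i];
		mh[i].msg_hdr.msg_namelen	= sizeof(sa[i]);
		mh[i].msg_hdr.msg_iov		= &iov[i];
		mh[i].msg_hdr.msg_iovlen	= 1;
	}
}

static int recv_batch(int s)
{
	int i, n = recvmmsg(s, mh, N_FRAMES, MSG_DONTWAIT, NULL);

	for (i = 0; i < n; i++) {
		/* ...look up the flow for sa[i] here: this is the only
		 * consumer of the actual sockaddr length...
		 */

		/* Restore the full size, so an IPv6 datagram received
		 * later into the same slot isn't truncated to IPv4 length
		 */
		mh[i].msg_hdr.msg_namelen = sizeof(sa[i]);
	}

	return n;
}
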
This also removes the last use of the 'v6' field from udp_listen_epoll_ref,
so remove that as well.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 udp.c | 54 +++++++++++++++++-------------------------------------
 udp.h |  2 --
 2 files changed, 17 insertions(+), 39 deletions(-)

diff --git a/udp.c b/udp.c
index 8a93aad..01b03df 100644
--- a/udp.c
+++ b/udp.c
@@ -178,8 +178,7 @@ enum udp_iov_idx {
 
 /* IOVs and msghdr arrays for receiving datagrams from sockets */
 static struct iovec	udp_iov_recv		[UDP_MAX_FRAMES];
-static struct mmsghdr	udp4_mh_recv		[UDP_MAX_FRAMES];
-static struct mmsghdr	udp6_mh_recv		[UDP_MAX_FRAMES];
+static struct mmsghdr	udp_mh_recv		[UDP_MAX_FRAMES];
 
 /* IOVs and msghdr arrays for sending "spliced" datagrams to sockets */
 static union sockaddr_inany udp_splice_to;
@@ -222,6 +221,7 @@ void udp_update_l2_buf(const unsigned char *eth_d, const unsigned char *eth_s)
 static void udp_iov_init_one(const struct ctx *c, size_t i)
 {
 	struct udp_payload_t *payload = &udp_payload[i];
+	struct msghdr *mh = &udp_mh_recv[i].msg_hdr;
 	struct udp_meta_t *meta = &udp_meta[i];
 	struct iovec *siov = &udp_iov_recv[i];
 	struct iovec *tiov = udp_l2_iov[i];
@@ -236,27 +236,10 @@ static void udp_iov_init_one(const struct ctx *c, size_t i)
 	tiov[UDP_IOV_TAP] = tap_hdr_iov(c, &meta->taph);
 	tiov[UDP_IOV_PAYLOAD].iov_base = payload;
 
-	/* It's useful to have separate msghdr arrays for receiving.  Otherwise,
-	 * an IPv4 recv() will alter msg_namelen, so we'd have to reset it every
-	 * time or risk truncating the address on future IPv6 recv()s.
-	 */
-	if (c->ifi4) {
-		struct msghdr *mh = &udp4_mh_recv[i].msg_hdr;
-
-		mh->msg_name	= &meta->s_in;
-		mh->msg_namelen	= sizeof(struct sockaddr_in);
-		mh->msg_iov	= siov;
-		mh->msg_iovlen	= 1;
-	}
-
-	if (c->ifi6) {
-		struct msghdr *mh = &udp6_mh_recv[i].msg_hdr;
-
-		mh->msg_name	= &meta->s_in;
-		mh->msg_namelen	= sizeof(struct sockaddr_in6);
-		mh->msg_iov	= siov;
-		mh->msg_iovlen	= 1;
-	}
+	mh->msg_name	= &meta->s_in;
+	mh->msg_namelen	= sizeof(meta->s_in);
+	mh->msg_iov	= siov;
+	mh->msg_iovlen	= 1;
 }
 
 /**
@@ -506,10 +489,10 @@ static int udp_sock_recv(const struct ctx *c, int s, uint32_t events,
 void udp_listen_sock_handler(const struct ctx *c, union epoll_ref ref,
 			     uint32_t events, const struct timespec *now)
 {
-	struct mmsghdr *mmh_recv = ref.udp.v6 ? udp6_mh_recv : udp4_mh_recv;
+	const socklen_t sasize = sizeof(udp_meta[0].s_in);
 	int n, i;
 
-	if ((n = udp_sock_recv(c, ref.fd, events, mmh_recv)) <= 0)
+	if ((n = udp_sock_recv(c, ref.fd, events, udp_mh_recv)) <= 0)
 		return;
 
 	/* We divide datagrams into batches based on how we need to send them,
@@ -518,6 +501,7 @@ void udp_listen_sock_handler(const struct ctx *c, union epoll_ref ref,
 	 * populate it one entry *ahead* of the loop counter.
 	 */
 	udp_meta[0].tosidx = udp_flow_from_sock(c, ref, &udp_meta[0].s_in, now);
+	udp_mh_recv[0].msg_hdr.msg_namelen = sasize;
 	for (i = 0; i < n; ) {
 		flow_sidx_t batchsidx = udp_meta[i].tosidx;
 		uint8_t batchpif = pif_at_sidx(batchsidx);
@@ -525,9 +509,9 @@ void udp_listen_sock_handler(const struct ctx *c, union epoll_ref ref,
 
 		do {
 			if (pif_is_socket(batchpif)) {
-				udp_splice_prepare(mmh_recv, i);
+				udp_splice_prepare(udp_mh_recv, i);
 			} else if (batchpif == PIF_TAP) {
-				udp_tap_prepare(mmh_recv, i,
+				udp_tap_prepare(udp_mh_recv, i,
 						flowside_at_sidx(batchsidx));
 			}
 
@@ -537,6 +521,7 @@ void udp_listen_sock_handler(const struct ctx *c, union epoll_ref ref,
 			udp_meta[i].tosidx = udp_flow_from_sock(c, ref,
 								&udp_meta[i].s_in,
 								now);
+			udp_mh_recv[i].msg_hdr.msg_namelen = sasize;
 		} while (flow_sidx_eq(udp_meta[i].tosidx, batchsidx));
 
 		if (pif_is_socket(batchpif)) {
@@ -572,19 +557,16 @@ void udp_listen_sock_handler(const struct ctx *c, union epoll_ref ref,
 void udp_reply_sock_handler(const struct ctx *c, union epoll_ref ref,
 			    uint32_t events, const struct timespec *now)
 {
-	const struct flowside *fromside = flowside_at_sidx(ref.flowside);
 	flow_sidx_t tosidx = flow_sidx_opposite(ref.flowside);
 	const struct flowside *toside = flowside_at_sidx(tosidx);
 	struct udp_flow *uflow = udp_at_sidx(ref.flowside);
 	int from_s = uflow->s[ref.flowside.sidei];
-	bool v6 = !inany_v4(&fromside->eaddr);
-	struct mmsghdr *mmh_recv = v6 ? udp6_mh_recv : udp4_mh_recv;
 	uint8_t topif = pif_at_sidx(tosidx);
 	int n, i;
 
 	ASSERT(!c->no_udp && uflow);
 
-	if ((n = udp_sock_recv(c, from_s, events, mmh_recv)) <= 0)
+	if ((n = udp_sock_recv(c, from_s, events, udp_mh_recv)) <= 0)
 		return;
 
 	flow_trace(uflow, "Received %d datagrams on reply socket", n);
@@ -592,9 +574,11 @@ void udp_reply_sock_handler(const struct ctx *c, union epoll_ref ref,
 
 	for (i = 0; i < n; i++) {
 		if (pif_is_socket(topif))
-			udp_splice_prepare(mmh_recv, i);
+			udp_splice_prepare(udp_mh_recv, i);
 		else if (topif == PIF_TAP)
-			udp_tap_prepare(mmh_recv, i, toside);
+			udp_tap_prepare(udp_mh_recv, i, toside);
+		/* Restore sockaddr length clobbered by recvmsg() */
+		udp_mh_recv[i].msg_hdr.msg_namelen = sizeof(udp_meta[i].s_in);
 	}
 
 	if (pif_is_socket(topif)) {
@@ -740,8 +724,6 @@ int udp_sock_init(const struct ctx *c, int ns, sa_family_t af,
 		uref.pif = PIF_HOST;
 
 	if ((af == AF_INET || af == AF_UNSPEC) && c->ifi4) {
-		uref.v6 = 0;
-
 		if (!ns) {
 			r4 = s = sock_l4(c, AF_INET, EPOLL_TYPE_UDP_LISTEN,
 					 addr, ifname, port, uref.u32);
@@ -756,8 +738,6 @@ int udp_sock_init(const struct ctx *c, int ns, sa_family_t af,
 	}
 
 	if ((af == AF_INET6 || af == AF_UNSPEC) && c->ifi6) {
-		uref.v6 = 1;
-
 		if (!ns) {
 			r6 = s = sock_l4(c, AF_INET6, EPOLL_TYPE_UDP_LISTEN,
 					 addr, ifname, port, uref.u32);
diff --git a/udp.h b/udp.h
index fb42e1c..a8e76bf 100644
--- a/udp.h
+++ b/udp.h
@@ -26,14 +26,12 @@ void udp_update_l2_buf(const unsigned char *eth_d, const unsigned char *eth_s);
  * union udp_listen_epoll_ref - epoll reference for "listening" UDP sockets
  * @port:		Source port for connected sockets, bound port otherwise
  * @pif:		pif for this socket
- * @v6:			Set for IPv6 sockets or connections
  * @u32:		Opaque u32 value of reference
  */
 union udp_listen_epoll_ref {
 	struct {
 		in_port_t	port;
 		uint8_t		pif;
-		bool		v6:1;
 	};
 	uint32_t u32;
 };

From c78b194001ec211401144e3e89071bc2f54f121d Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Tue, 27 Aug 2024 16:04:45 +1000
Subject: [PATCH 003/382] udp: Remove unnecessary local from udp_sock_init()

The 's' variable is always redundant with either 'r4' or 'r6', so remove
it.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 udp.c | 30 +++++++++++++++---------------
 1 file changed, 15 insertions(+), 15 deletions(-)

diff --git a/udp.c b/udp.c
index 01b03df..41a6247 100644
--- a/udp.c
+++ b/udp.c
@@ -714,7 +714,7 @@ int udp_sock_init(const struct ctx *c, int ns, sa_family_t af,
 		  const void *addr, const char *ifname, in_port_t port)
 {
 	union udp_listen_epoll_ref uref = { .port = port };
-	int s, r4 = FD_REF_MAX + 1, r6 = FD_REF_MAX + 1;
+	int r4 = FD_REF_MAX + 1, r6 = FD_REF_MAX + 1;
 
 	ASSERT(!c->no_udp);
 
@@ -725,29 +725,29 @@ int udp_sock_init(const struct ctx *c, int ns, sa_family_t af,
 
 	if ((af == AF_INET || af == AF_UNSPEC) && c->ifi4) {
 		if (!ns) {
-			r4 = s = sock_l4(c, AF_INET, EPOLL_TYPE_UDP_LISTEN,
-					 addr, ifname, port, uref.u32);
+			r4 = sock_l4(c, AF_INET, EPOLL_TYPE_UDP_LISTEN,
+				     addr, ifname, port, uref.u32);
 
-			udp_splice_init[V4][port] = s < 0 ? -1 : s;
+			udp_splice_init[V4][port] = r4 < 0 ? -1 : r4;
 		} else {
-			r4 = s = sock_l4(c, AF_INET, EPOLL_TYPE_UDP_LISTEN,
-					 &in4addr_loopback,
-					 ifname, port, uref.u32);
-			udp_splice_ns[V4][port] = s < 0 ? -1 : s;
+			r4  = sock_l4(c, AF_INET, EPOLL_TYPE_UDP_LISTEN,
+				      &in4addr_loopback,
+				      ifname, port, uref.u32);
+			udp_splice_ns[V4][port] = r4 < 0 ? -1 : r4;
 		}
 	}
 
 	if ((af == AF_INET6 || af == AF_UNSPEC) && c->ifi6) {
 		if (!ns) {
-			r6 = s = sock_l4(c, AF_INET6, EPOLL_TYPE_UDP_LISTEN,
-					 addr, ifname, port, uref.u32);
+			r6 = sock_l4(c, AF_INET6, EPOLL_TYPE_UDP_LISTEN,
+				     addr, ifname, port, uref.u32);
 
-			udp_splice_init[V6][port] = s < 0 ? -1 : s;
+			udp_splice_init[V6][port] = r6 < 0 ? -1 : r6;
 		} else {
-			r6 = s = sock_l4(c, AF_INET6, EPOLL_TYPE_UDP_LISTEN,
-					 &in6addr_loopback,
-					 ifname, port, uref.u32);
-			udp_splice_ns[V6][port] = s < 0 ? -1 : s;
+			r6 = sock_l4(c, AF_INET6, EPOLL_TYPE_UDP_LISTEN,
+				     &in6addr_loopback,
+				     ifname, port, uref.u32);
+			udp_splice_ns[V6][port] = r6 < 0 ? -1 : r6;
 		}
 	}
 

From e0be6bc2f4762ba8c090aef0f8b85a47a4243356 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Tue, 27 Aug 2024 16:04:46 +1000
Subject: [PATCH 004/382] udp: Use dual stack sockets for port forwarding when
 possible

Platforms like Linux allow IPv6 sockets to listen for IPv4 connections as
well as native IPv6 connections.  By doing this we halve the number of
listening sockets we need (assuming passt/pasta is listening on the same
ports for IPv4 and IPv6).  When forwarding many ports (e.g. -u all) this
can significantly reduce the amount of kernel memory that passt consumes.

We've used such dual stack sockets for TCP since 8e914238b "tcp: Use dual
stack sockets for port forwarding when possible".  Add similar support for
UDP "listening" sockets.  Since UDP sockets don't use as much kernel memory
as TCP sockets, this isn't as big a saving, but it's still significant.
When forwarding all TCP and UDP ports for both IPv4 & IPv6 (-t all -u all),
this reduces kernel memory usage from ~522 MiB to ~380 MiB (kernel version
6.10.6 on Fedora 40, x86_64).

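The underlying mechanism, shown as a minimal standalone sketch with plain
socket calls rather than passt's sock_l4() helper: an AF_INET6 socket with
IPV6_V6ONLY cleared also receives IPv4 datagrams, which appear as
IPv4-mapped IPv6 addresses:

#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <unistd.h>

/* Bind a single UDP socket receiving both IPv4 and IPv6 datagrams on
 * @port.  Return the socket fd, or -1 on failure.
 */
static int udp_dual_stack_sock(in_port_t port)
{
	struct sockaddr_in6 a = {
		.sin6_family	= AF_INET6,
		.sin6_addr	= IN6ADDR_ANY_INIT,
		.sin6_port	= htons(port),
	};
	int no = 0;
	int s = socket(AF_INET6, SOCK_DGRAM, IPPROTO_UDP);

	if (s < 0)
		return -1;

	/* 0 means "not v6-only": IPv4 peers show up as ::ffff:a.b.c.d */
	if (setsockopt(s, IPPROTO_IPV6, IPV6_V6ONLY, &no, sizeof(no)) ||
	    bind(s, (struct sockaddr *)&a, sizeof(a))) {
		close(s);
		return -1;
	}

	return s;
}
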
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 udp.c | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/udp.c b/udp.c
index 41a6247..bd9051e 100644
--- a/udp.c
+++ b/udp.c
@@ -723,6 +723,25 @@ int udp_sock_init(const struct ctx *c, int ns, sa_family_t af,
 	else
 		uref.pif = PIF_HOST;
 
+	if (af == AF_UNSPEC && c->ifi4 && c->ifi6) {
+		int s;
+
+		/* Attempt to get a dual stack socket */
+		if (!ns) {
+			s = sock_l4(c, AF_UNSPEC, EPOLL_TYPE_UDP_LISTEN,
+				    addr, ifname, port, uref.u32);
+			udp_splice_init[V4][port] = s < 0 ? -1 : s;
+			udp_splice_init[V6][port] = s < 0 ? -1 : s;
+		} else {
+			s = sock_l4(c, AF_UNSPEC, EPOLL_TYPE_UDP_LISTEN,
+				    &in4addr_loopback, ifname, port, uref.u32);
+			udp_splice_ns[V4][port] = s < 0 ? -1 : s;
+			udp_splice_ns[V6][port] = s < 0 ? -1 : s;
+		}
+		if (IN_INTERVAL(0, FD_REF_MAX, s))
+			return 0;
+	}
+
 	if ((af == AF_INET || af == AF_UNSPEC) && c->ifi4) {
 		if (!ns) {
 			r4 = sock_l4(c, AF_INET, EPOLL_TYPE_UDP_LISTEN,

From 712ca3235329b049bf9a4e481ba38a4c64768e8b Mon Sep 17 00:00:00 2001
From: Stefano Brivio <sbrivio@redhat.com>
Date: Tue, 27 Aug 2024 08:23:41 +0200
Subject: [PATCH 005/382] seccomp.sh: Try to account for terminal width while
 formatting list of system calls

Avoid excess lines on wide terminals, but make sure we don't fail if
we can't fetch the number of columns for any reason, as it's not a
fundamental feature and we don't want to break anything with it.

Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 seccomp.sh | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/seccomp.sh b/seccomp.sh
index 052e1c8..38aa826 100755
--- a/seccomp.sh
+++ b/seccomp.sh
@@ -242,7 +242,10 @@ for __p in ${__profiles}; do
 	__calls="$(sed -n 's/[\t ]*\*[\t ]*#syscalls\(:'"${__p}"'\|\)[\t ]\{1,\}\(.*\)/\2/p' ${IN})"
 	__calls="${__calls} ${EXTRA_SYSCALLS:-}"
 	__calls="$(filter ${__calls})"
-	echo "seccomp profile ${__p} allows: ${__calls}" | tr '\n' ' ' | fmt -t
+
+	cols="$(stty -a | sed -n 's/.*columns \([0-9]*\).*/\1/p' || :)" 2>/dev/null
+	case $cols in [0-9]*) col_args="-w ${cols}";; *) col_args="";; esac
+	echo "seccomp profile ${__p} allows: ${__calls}" | tr '\n' ' ' | fmt -t ${col_args}
 
 	# Pad here to keep gen_profile() "simple"
 	__count=0

From 1daf6f4615226a2cdd9523a80d70736af4a9f3c0 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Thu, 29 Aug 2024 19:58:45 +1000
Subject: [PATCH 006/382] conf, fwd: Make ephemeral port logic more flexible

"Ephemeral" ports are those which the kernel may allocate as local
port numbers for outgoing connections or datagrams.  Because of that,
they're generally not good choices for listening servers to bind to.

Therefore, when using -t all, -u all or exclude-only ranges, we map only
non-ephemeral ports.  Our logic for this is a bit rigid though: we
assume the ephemeral ports are always a fixed range at the top of the
port number space.  We also assume PORT_EPHEMERAL_MIN is a multiple of
8, or we won't set the forward bitmap correctly.

Make the logic in conf.c more flexible, using a helper moved into
fwd.[ch], although we don't change which ports we consider ephemeral
(yet).

The new handling is undoubtedly more computationally expensive, but
since it's a once-off operation at startup, I don't think it really
matters.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Reviewed-by: Laurent Vivier <lvivier@redhat.com>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 conf.c | 12 ++++++++----
 fwd.c  | 17 +++++++++++++++++
 fwd.h  |  2 ++
 util.h |  3 ---
 4 files changed, 27 insertions(+), 7 deletions(-)

diff --git a/conf.c b/conf.c
index e29b6a9..6b3dafd 100644
--- a/conf.c
+++ b/conf.c
@@ -156,9 +156,12 @@ static void conf_ports(const struct ctx *c, char optname, const char *optarg,
 			die("'all' port forwarding is only allowed for passt");
 
 		fwd->mode = FWD_ALL;
-		memset(fwd->map, 0xff, PORT_EPHEMERAL_MIN / 8);
 
-		for (i = 0; i < PORT_EPHEMERAL_MIN; i++) {
+		for (i = 0; i < NUM_PORTS; i++) {
+			if (fwd_port_is_ephemeral(i))
+				continue;
+
+			bitmap_set(fwd->map, i);
 			if (optname == 't') {
 				ret = tcp_sock_init(c, AF_UNSPEC, NULL, NULL,
 						    i);
@@ -259,8 +262,9 @@ static void conf_ports(const struct ctx *c, char optname, const char *optarg,
 	} while ((p = next_chunk(p, ',')));
 
 	if (exclude_only) {
-		for (i = 0; i < PORT_EPHEMERAL_MIN; i++) {
-			if (bitmap_isset(exclude, i))
+		for (i = 0; i < NUM_PORTS; i++) {
+			if (fwd_port_is_ephemeral(i) ||
+			    bitmap_isset(exclude, i))
 				continue;
 
 			bitmap_set(fwd->map, i);
diff --git a/fwd.c b/fwd.c
index 2a0452f..8fa312a 100644
--- a/fwd.c
+++ b/fwd.c
@@ -27,6 +27,23 @@
 #include "lineread.h"
 #include "flow_table.h"
 
+/* Empheral port range: values from RFC 6335 */
+static const in_port_t fwd_ephemeral_min = (1 << 15) + (1 << 14);
+static const in_port_t fwd_ephemeral_max = NUM_PORTS - 1;
+
+/**
+ * fwd_port_is_ephemeral() - Is port number ephemeral?
+ * @port:	Port number
+ *
+ * Return: true if @port is ephemeral, that is may be allocated by the kernel as
+ *         a local port for outgoing connections or datagrams, but should not be
+ *         used for binding services to.
+ */
+bool fwd_port_is_ephemeral(in_port_t port)
+{
+	return (port >= fwd_ephemeral_min) && (port <= fwd_ephemeral_max);
+}
+
 /* See enum in kernel's include/net/tcp_states.h */
 #define UDP_LISTEN	0x07
 #define TCP_LISTEN	0x0a
diff --git a/fwd.h b/fwd.h
index b4aa8d5..99dd66c 100644
--- a/fwd.h
+++ b/fwd.h
@@ -12,6 +12,8 @@ struct flowside;
 /* Number of ports for both TCP and UDP */
 #define	NUM_PORTS	(1U << 16)
 
+bool fwd_port_is_ephemeral(in_port_t port);
+
 enum fwd_ports_mode {
 	FWD_UNSET = 0,
 	FWD_SPEC = 1,
diff --git a/util.h b/util.h
index 1463c92..c7a59d5 100644
--- a/util.h
+++ b/util.h
@@ -95,9 +95,6 @@
 #define FD_PROTO(x, proto)						\
 	(IN_INTERVAL(c->proto.fd_min, c->proto.fd_max, (x)))
 
-#define PORT_EPHEMERAL_MIN	((1 << 15) + (1 << 14))		/* RFC 6335 */
-#define PORT_IS_EPHEMERAL(port) ((port) >= PORT_EPHEMERAL_MIN)
-
 #define MAC_ZERO		((uint8_t [ETH_ALEN]){ 0 })
 #define MAC_IS_ZERO(addr)	(!memcmp((addr), MAC_ZERO, ETH_ALEN))
 

From 4a41dc58d67e910c3a1f505a6a20988c4555e735 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Thu, 29 Aug 2024 19:58:46 +1000
Subject: [PATCH 007/382] conf, fwd: Don't attempt to forward port 0

When using -t all, -u all or exclude-only ranges, we'll attempt to forward
all non-ephemeral port numbers, including port 0.  However, this won't work
as intended: bind() treats a zero port not as literal port 0, but as
"pick a port for me".  Because of the special meaning of port 0, we mostly
outright exclude it in our handling.

Do the same when setting up forwards, and don't attempt to forward port 0.

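A small standalone sketch of that special meaning (hypothetical example,
IPv4 and TCP assumed): binding port 0 makes the kernel pick a free port,
which getsockname() then reports:

#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	struct sockaddr_in a = { .sin_family = AF_INET, .sin_port = 0 };
	socklen_t len = sizeof(a);
	int s = socket(AF_INET, SOCK_STREAM, 0);

	if (s < 0 || bind(s, (struct sockaddr *)&a, sizeof(a)) < 0)
		return 1;

	/* bind() with port 0 doesn't bind port 0: the kernel picks a free
	 * (usually ephemeral) port, which getsockname() reveals
	 */
	if (getsockname(s, (struct sockaddr *)&a, &len) < 0)
		return 1;

	printf("kernel chose port %hu\n", ntohs(a.sin_port));
	close(s);
	return 0;
}
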
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Reviewed-by: Laurent Vivier <lvivier@redhat.com>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 conf.c | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/conf.c b/conf.c
index 6b3dafd..3eb117f 100644
--- a/conf.c
+++ b/conf.c
@@ -157,7 +157,10 @@ static void conf_ports(const struct ctx *c, char optname, const char *optarg,
 
 		fwd->mode = FWD_ALL;
 
-		for (i = 0; i < NUM_PORTS; i++) {
+		/* Skip port 0.  It has special meaning for many socket APIs, so
+		 * trying to bind it is not really safe.
+		 */
+		for (i = 1; i < NUM_PORTS; i++) {
 			if (fwd_port_is_ephemeral(i))
 				continue;
 
@@ -262,7 +265,10 @@ static void conf_ports(const struct ctx *c, char optname, const char *optarg,
 	} while ((p = next_chunk(p, ',')));
 
 	if (exclude_only) {
-		for (i = 0; i < NUM_PORTS; i++) {
+		/* Skip port 0.  It has special meaning for many socket APIs, so
+		 * trying to bind it is not really safe.
+		 */
+		for (i = 1; i < NUM_PORTS; i++) {
 			if (fwd_port_is_ephemeral(i) ||
 			    bitmap_isset(exclude, i))
 				continue;

From eedc81b6ef552736c4d1d7354837e296af081b57 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Thu, 29 Aug 2024 19:58:47 +1000
Subject: [PATCH 008/382] fwd, conf: Probe host's ephemeral ports

When we forward "all" ports (-t all or -u all), or use an exclude-only
range, we don't actually forward *all* ports - that wouldn't leave local
ports to use for outgoing connections.  Rather we forward all non-ephemeral
ports - those that won't be used for outgoing connections or datagrams.

Currently we assume the range of ephemeral ports is that recommended by
RFC 6335, 49152-65535.  However, that's not the range used by default on
Linux, which is 32768-60999 and is configurable with the
net.ipv4.ip_local_port_range sysctl.

We can't really know what range the guest will consider ephemeral, but if
it differs too much from the host it's likely to cause problems we can't
avoid anyway.  So, using the host's ephemeral range is a better guess than
using the RFC 6335 range.

Therefore, add logic to probe the host's ephemeral range, falling back to
the RFC 6335 range if that fails.  This has the bonus advantage of
reducing the number of ports bound by -t all -u all on most Linux machines,
thereby reducing kernel memory usage.  Specifically, this reduces kernel
memory usage with -t all -u all from ~380 MiB to ~289 MiB.

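A simplified standalone version of such a probe, using stdio instead of
passt's lineread helpers (names here are illustrative, not the fwd.c code),
falling back to the RFC 6335 values if the sysctl can't be read or parsed:

#include <stdio.h>

#define NUM_PORTS	(1U << 16)

static unsigned ephemeral_min = (1 << 15) + (1 << 14);	/* RFC 6335: 49152 */
static unsigned ephemeral_max = NUM_PORTS - 1;		/*           65535 */

static void probe_ephemeral(void)
{
	FILE *f = fopen("/proc/sys/net/ipv4/ip_local_port_range", "r");
	unsigned long min, max;

	if (!f)
		return;		/* keep the RFC 6335 defaults */

	/* The file contains two numbers separated by whitespace (a tab) */
	if (fscanf(f, "%lu %lu", &min, &max) == 2 &&
	    min < NUM_PORTS && max < NUM_PORTS && min <= max) {
		ephemeral_min = min;
		ephemeral_max = max;
	}

	fclose(f);
}
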
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Reviewed-by: Laurent Vivier <lvivier@redhat.com>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 conf.c |  1 +
 fwd.c  | 61 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++--
 fwd.h  |  1 +
 3 files changed, 61 insertions(+), 2 deletions(-)

diff --git a/conf.c b/conf.c
index 3eb117f..b275886 100644
--- a/conf.c
+++ b/conf.c
@@ -1721,6 +1721,7 @@ void conf(struct ctx *c, int argc, char **argv)
 	/* Inbound port options & DNS can be parsed now (after IPv4/IPv6
 	 * settings)
 	 */
+	fwd_probe_ephemeral();
 	udp_portmap_clear();
 	optind = 0;
 	do {
diff --git a/fwd.c b/fwd.c
index 8fa312a..a505098 100644
--- a/fwd.c
+++ b/fwd.c
@@ -28,8 +28,65 @@
 #include "flow_table.h"
 
 /* Empheral port range: values from RFC 6335 */
-static const in_port_t fwd_ephemeral_min = (1 << 15) + (1 << 14);
-static const in_port_t fwd_ephemeral_max = NUM_PORTS - 1;
+static in_port_t fwd_ephemeral_min = (1 << 15) + (1 << 14);
+static in_port_t fwd_ephemeral_max = NUM_PORTS - 1;
+
+#define PORT_RANGE_SYSCTL	"/proc/sys/net/ipv4/ip_local_port_range"
+
+/** fwd_probe_ephemeral() - Determine what ports this host considers ephemeral
+ *
+ * Work out what ports the host thinks are emphemeral and record it for later
+ * use by fwd_port_is_ephemeral().  If we're unable to probe, assume the range
+ * recommended by RFC 6335.
+ */
+void fwd_probe_ephemeral(void)
+{
+	char *line, *tab, *end;
+	struct lineread lr;
+	long min, max;
+	ssize_t len;
+	int fd;
+
+	fd = open(PORT_RANGE_SYSCTL, O_RDONLY | O_CLOEXEC);
+	if (fd < 0) {
+		warn_perror("Unable to open %s", PORT_RANGE_SYSCTL);
+		return;
+	}
+
+	lineread_init(&lr, fd);
+	len = lineread_get(&lr, &line);
+	close(fd);
+
+	if (len < 0)
+		goto parse_err;
+
+	tab = strchr(line, '\t');
+	if (!tab)
+		goto parse_err;
+	*tab = '\0';
+
+	errno = 0;
+	min = strtol(line, &end, 10);
+	if (*end || errno)
+		goto parse_err;
+
+	errno = 0;
+	max = strtol(tab + 1, &end, 10);
+	if (*end || errno)
+		goto parse_err;
+
+	if (min < 0 || min >= NUM_PORTS ||
+	    max < 0 || max >= NUM_PORTS)
+		goto parse_err;
+
+	fwd_ephemeral_min = min;
+	fwd_ephemeral_max = max;
+
+	return;
+
+parse_err:
+	warn("Unable to parse %s", PORT_RANGE_SYSCTL);
+}
 
 /**
  * fwd_port_is_ephemeral() - Is port number ephemeral?
diff --git a/fwd.h b/fwd.h
index 99dd66c..3562f3c 100644
--- a/fwd.h
+++ b/fwd.h
@@ -12,6 +12,7 @@ struct flowside;
 /* Number of ports for both TCP and UDP */
 #define	NUM_PORTS	(1U << 16)
 
+void fwd_probe_ephemeral(void);
 bool fwd_port_is_ephemeral(in_port_t port);
 
 enum fwd_ports_mode {

From 38363964fc96008761195984c989b036227e0e5c Mon Sep 17 00:00:00 2001
From: Michal Privoznik <mprivozn@redhat.com>
Date: Thu, 29 Aug 2024 16:16:03 +0200
Subject: [PATCH 009/382] Makefile: Enable _FORTIFY_SOURCE iff needed

On some systems source fortification is enabled whenever code
optimization is enabled (e.g. with -O2). Since we also enable code
fortification explicitly (possibly with a different value than the one
the system wants; there are three levels [1]), distros are required
to patch our Makefile, e.g. [2].

Detect whether fortification is not already enabled and enable it
explicitly only if really needed.

1: https://www.gnu.org/software/libc/manual/html_node/Source-Fortification.html
2: https://github.com/gentoo/gentoo/commit/edfeb8763ac56112c59248c62c9cda13e5d01c97

Signed-off-by: Michal Privoznik <mprivozn@redhat.com>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 Makefile | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/Makefile b/Makefile
index 01fada4..74a9513 100644
--- a/Makefile
+++ b/Makefile
@@ -33,9 +33,16 @@ AUDIT_ARCH := $(shell echo $(AUDIT_ARCH) | sed 's/MIPS64EL/MIPSEL64/')
 AUDIT_ARCH := $(shell echo $(AUDIT_ARCH) | sed 's/HPPA/PARISC/')
 AUDIT_ARCH := $(shell echo $(AUDIT_ARCH) | sed 's/SH4/SH/')
 
+# On some systems enabling optimization also enables source fortification,
+# automagically. Do not override it.
+FORTIFY_FLAG :=
+ifeq ($(shell $(CC) -O2 -dM -E - < /dev/null 2>&1 | grep ' _FORTIFY_SOURCE ' > /dev/null; echo $$?),1)
+FORTIFY_FLAG := -D_FORTIFY_SOURCE=2
+endif
+
 FLAGS := -Wall -Wextra -Wno-format-zero-length
 FLAGS += -pedantic -std=c11 -D_XOPEN_SOURCE=700 -D_GNU_SOURCE
-FLAGS += -D_FORTIFY_SOURCE=2 -O2 -pie -fPIE
+FLAGS +=  $(FORTIFY_FLAG) -O2 -pie -fPIE
 FLAGS += -DPAGE_SIZE=$(shell getconf PAGE_SIZE)
 FLAGS += -DNETNS_RUN_DIR=\"/run/netns\"
 FLAGS += -DPASST_AUDIT_ARCH=AUDIT_ARCH_$(AUDIT_ARCH)

From 0ea60e5a7741658ad7056a0a6674e00e72d2d288 Mon Sep 17 00:00:00 2001
From: Stefano Brivio <sbrivio@redhat.com>
Date: Tue, 3 Sep 2024 23:45:53 +0200
Subject: [PATCH 010/382] log: Don't prefix log file messages with time and
 severity if they're continuations

In fecb1b65b1ac ("log: Don't prefix message with timestamp on --debug
if it's a continuation"), I fixed this for --debug on standard error,
but not for log files: if messages are continuations, they shouldn't
be prefixed by timestamp and severity.

Otherwise, we'll print stuff like this:

  0.0028: ERROR:   Receive error on guest connection, reset0.0028:  ERROR:   : Bad file descriptor

Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
---
 log.c | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/log.c b/log.c
index 433b552..a61468e 100644
--- a/log.c
+++ b/log.c
@@ -224,19 +224,23 @@ static int logfile_rotate(int fd, const struct timespec *now)
 /**
  * logfile_write() - Write entry to log file, trigger rotation if full
  * @newline:	Append newline at the end of the message, if missing
+ * @cont:	Continuation of a previous message, on the same line
  * @pri:	Facility and level map, same as priority for vsyslog()
  * @now:	Timestamp
  * @format:	Same as vsyslog() format
  * @ap:		Same as vsyslog() ap
  */
-static void logfile_write(bool newline, int pri, const struct timespec *now,
+static void logfile_write(bool newline, bool cont, int pri,
+			  const struct timespec *now,
 			  const char *format, va_list ap)
 {
 	char buf[BUFSIZ];
-	int n;
+	int n = 0;
 
-	n  = logtime_fmt(buf, BUFSIZ, now);
-	n += snprintf(buf + n, BUFSIZ - n, ": %s", logfile_prefix[pri]);
+	if (!cont) {
+		n += logtime_fmt(buf, BUFSIZ, now);
+		n += snprintf(buf + n, BUFSIZ - n, ": %s", logfile_prefix[pri]);
+	}
 
 	n += vsnprintf(buf + n, BUFSIZ - n, format, ap);
 
@@ -278,7 +282,7 @@ void vlogmsg(bool newline, bool cont, int pri, const char *format, va_list ap)
 
 		va_copy(ap2, ap); /* Don't clobber ap, we need it again */
 		if (log_file != -1)
-			logfile_write(newline, pri, now, format, ap2);
+			logfile_write(newline, cont, pri, now, format, ap2);
 		else if (!(log_mask & LOG_MASK(LOG_DEBUG)))
 			passt_vsyslog(newline, pri, format, ap2);
 

From 7ad9f9bd2bbda8d705e0c6faf5acf2792fce063c Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Fri, 6 Sep 2024 15:17:05 +1000
Subject: [PATCH 011/382] flow: Fix incorrect hash probe in flowside_lookup()

Our flow hash table uses linear probing in which we step backwards through
clusters of adjacent hash entries when we have near collisions.  Usually
that's implemented by flow_hash_probe().  However, due to some details we
need a second implementation in flowside_lookup().  An embarrassing
oversight in rebasing from earlier versions has mean that version is
incorrect, trying to step forward through clusters rather than backward.

In situations with the right sorts of has near-collisions this can lead to
us not associating an ACK from the tap device with the right flow, leaving
it in a not-quite-established state.  If the remote peer does a shutdown()
at the right time, this can lead to a storm of EPOLLRDHUP events causing
high CPU load.

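The arithmetic behind the fix, as a standalone sketch (the helper here
mirrors what the fix relies on; table layout and sentinel value are
illustrative assumptions): with clusters built by stepping backwards,
lookups must step backwards too, or they leave the cluster early:

#define TABLE_SIZE	64U
#define EMPTY		0U	/* assumption: 0 is never a valid key */

static unsigned table[TABLE_SIZE];

/* (x - y) modulo m, wrapping from slot 0 back to slot m - 1 */
static unsigned mod_sub(unsigned x, unsigned y, unsigned m)
{
	return (x + m - y) % m;
}

/* Backward linear probing: colliding entries were inserted by stepping
 * backwards from their hash bucket, so a lookup has to walk in the same
 * direction or it falls out of the cluster and misses entries.
 * Assumes the table is never completely full.
 */
static int lookup(unsigned hash, unsigned key)
{
	unsigned b = hash % TABLE_SIZE;

	while (table[b] != EMPTY && table[b] != key)
		b = mod_sub(b, 1, TABLE_SIZE);	/* not (b + 1) % size */

	return table[b] == key ? (int)b : -1;
}
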
Fixes: acca4235c46f ("flow, tcp: Generalise TCP hash table to general flow hash table")
Link: https://bugs.passt.top/show_bug.cgi?id=94
Suggested-by: Stefano Brivio <sbrivio@redhat.com>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 flow.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/flow.c b/flow.c
index 02631eb..a00e01d 100644
--- a/flow.c
+++ b/flow.c
@@ -697,7 +697,7 @@ static flow_sidx_t flowside_lookup(const struct ctx *c, uint8_t proto,
 	       !(FLOW_PROTO(&flow->f) == proto &&
 		 flow->f.pif[sidx.sidei] == pif &&
 		 flowside_eq(&flow->f.side[sidx.sidei], side)))
-		b = (b + 1) % FLOW_HASH_SIZE;
+		b = mod_sub(b, 1, FLOW_HASH_SIZE);
 
 	return flow_hashtab[b];
 }

From 1166401c2f2b97961bdc285b336eed912b4f8bb1 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Fri, 6 Sep 2024 15:17:06 +1000
Subject: [PATCH 012/382] udp: Allow UDP flows to be prematurely closed

Unlike TCP, UDP has no in-band signalling for the end of a flow.  So the
only way we remove flows is on a timer if they have no activity for 180s.

However, we've started to investigate some error conditions in which we
want to prematurely abort / abandon a UDP flow.  We can call
udp_flow_close(), which will make the flow inert (sockets closed, no epoll
events, can't be looked up in hash).  However it will still wait 3 minutes
to clear away the stale entry.

Clean this up by adding an explicit 'closed' flag which will cause a flow
to be more promptly cleaned up.  We also publish udp_flow_close() so it
can be called from other places to abort UDP flows.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 flow.c     |  3 ++-
 udp_flow.c | 18 +++++++++++++++++-
 udp_flow.h |  4 ++++
 3 files changed, 23 insertions(+), 2 deletions(-)

diff --git a/flow.c b/flow.c
index a00e01d..f2de041 100644
--- a/flow.c
+++ b/flow.c
@@ -832,7 +832,8 @@ void flow_defer_handler(const struct ctx *c, const struct timespec *now)
 				closed = icmp_ping_timer(c, &flow->ping, now);
 			break;
 		case FLOW_UDP:
-			if (timer)
+			closed = udp_flow_defer(&flow->udp);
+			if (!closed && timer)
 				closed = udp_flow_timer(c, &flow->udp, now);
 			break;
 		default:
diff --git a/udp_flow.c b/udp_flow.c
index 1ff59a9..b81be2c 100644
--- a/udp_flow.c
+++ b/udp_flow.c
@@ -39,8 +39,11 @@ struct udp_flow *udp_at_sidx(flow_sidx_t sidx)
  * @c:		Execution context
  * @uflow:	UDP flow
  */
-static void udp_flow_close(const struct ctx *c, struct udp_flow *uflow)
+void udp_flow_close(const struct ctx *c, struct udp_flow *uflow)
 {
+	if (uflow->closed)
+		return; /* Nothing to do */
+
 	if (uflow->s[INISIDE] >= 0) {
 		/* The listening socket needs to stay in epoll */
 		close(uflow->s[INISIDE]);
@@ -56,6 +59,8 @@ static void udp_flow_close(const struct ctx *c, struct udp_flow *uflow)
 	flow_hash_remove(c, FLOW_SIDX(uflow, INISIDE));
 	if (!pif_is_socket(uflow->f.pif[TGTSIDE]))
 		flow_hash_remove(c, FLOW_SIDX(uflow, TGTSIDE));
+
+	uflow->closed = true;
 }
 
 /**
@@ -256,6 +261,17 @@ flow_sidx_t udp_flow_from_tap(const struct ctx *c,
 	return udp_flow_new(c, flow, -1, now);
 }
 
+/**
+ * udp_flow_defer() - Deferred per-flow handling (clean up aborted flows)
+ * @uflow:	Flow to handle
+ *
+ * Return: true if the connection is ready to free, false otherwise
+ */
+bool udp_flow_defer(const struct udp_flow *uflow)
+{
+	return uflow->closed;
+}
+
 /**
  * udp_flow_timer() - Handler for timed events related to a given flow
  * @c:		Execution context
diff --git a/udp_flow.h b/udp_flow.h
index 12ddf03..9a1b059 100644
--- a/udp_flow.h
+++ b/udp_flow.h
@@ -10,6 +10,7 @@
 /**
  * struct udp - Descriptor for a flow of UDP packets
  * @f:		Generic flow information
+ * @closed:	Flow is already closed
  * @ts:		Activity timestamp
  * @s:		Socket fd (or -1) for each side of the flow
  */
@@ -17,6 +18,7 @@ struct udp_flow {
 	/* Must be first element */
 	struct flow_common f;
 
+	bool closed :1;
 	time_t ts;
 	int s[SIDES];
 };
@@ -30,6 +32,8 @@ flow_sidx_t udp_flow_from_tap(const struct ctx *c,
 			      const void *saddr, const void *daddr,
 			      in_port_t srcport, in_port_t dstport,
 			      const struct timespec *now);
+void udp_flow_close(const struct ctx *c, struct udp_flow *uflow);
+bool udp_flow_defer(const struct udp_flow *uflow);
 bool udp_flow_timer(const struct ctx *c, struct udp_flow *uflow,
 		    const struct timespec *now);
 

From 88bfa3801e187ac33ca9de552612bc30a1708c72 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Fri, 6 Sep 2024 15:17:07 +1000
Subject: [PATCH 013/382] flow: Helpers to log details of a flow

The details of a flow - endpoints, interfaces etc. - can be pretty
important for debugging.  We log this on flow state transitions, but it can
also be useful to log this when we report specific conditions.  Add some
helper functions and macros to make it easy to do that.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 flow.c | 48 +++++++++++++++++++++++++++++++-----------------
 flow.h |  7 +++++++
 2 files changed, 38 insertions(+), 17 deletions(-)

diff --git a/flow.c b/flow.c
index f2de041..1ea112b 100644
--- a/flow.c
+++ b/flow.c
@@ -283,28 +283,23 @@ void flow_log_(const struct flow_common *f, int pri, const char *fmt, ...)
 	       "Flow %u (%s): %s", flow_idx(f), type_or_state, msg);
 }
 
-/**
- * flow_set_state() - Change flow's state
- * @f:		Flow changing state
- * @state:	New state
+/** flow_log_details_() - Log the details of a flow
+ * @f:		flow to log
+ * @pri:	Log priority
+ * @state:	State to log details according to
+ *
+ * Logs the details of the flow: endpoints, interfaces, type etc.
  */
-static void flow_set_state(struct flow_common *f, enum flow_state state)
+void flow_log_details_(const struct flow_common *f, int pri,
+		       enum flow_state state)
 {
 	char estr0[INANY_ADDRSTRLEN], fstr0[INANY_ADDRSTRLEN];
 	char estr1[INANY_ADDRSTRLEN], fstr1[INANY_ADDRSTRLEN];
 	const struct flowside *ini = &f->side[INISIDE];
 	const struct flowside *tgt = &f->side[TGTSIDE];
-	uint8_t oldstate = f->state;
 
-	ASSERT(state < FLOW_NUM_STATES);
-	ASSERT(oldstate < FLOW_NUM_STATES);
-
-	f->state = state;
-	flow_log_(f, LOG_DEBUG, "%s -> %s", flow_state_str[oldstate],
-		  FLOW_STATE(f));
-
-	if (MAX(state, oldstate) >= FLOW_STATE_TGT)
-		flow_log_(f, LOG_DEBUG,
+	if (state >= FLOW_STATE_TGT)
+		flow_log_(f, pri,
 			  "%s [%s]:%hu -> [%s]:%hu => %s [%s]:%hu -> [%s]:%hu",
 			  pif_name(f->pif[INISIDE]),
 			  inany_ntop(&ini->eaddr, estr0, sizeof(estr0)),
@@ -316,8 +311,8 @@ static void flow_set_state(struct flow_common *f, enum flow_state state)
 			  tgt->oport,
 			  inany_ntop(&tgt->eaddr, estr1, sizeof(estr1)),
 			  tgt->eport);
-	else if (MAX(state, oldstate) >= FLOW_STATE_INI)
-		flow_log_(f, LOG_DEBUG, "%s [%s]:%hu -> [%s]:%hu => ?",
+	else if (state >= FLOW_STATE_INI)
+		flow_log_(f, pri, "%s [%s]:%hu -> [%s]:%hu => ?",
 			  pif_name(f->pif[INISIDE]),
 			  inany_ntop(&ini->eaddr, estr0, sizeof(estr0)),
 			  ini->eport,
@@ -325,6 +320,25 @@ static void flow_set_state(struct flow_common *f, enum flow_state state)
 			  ini->oport);
 }
 
+/**
+ * flow_set_state() - Change flow's state
+ * @f:		Flow changing state
+ * @state:	New state
+ */
+static void flow_set_state(struct flow_common *f, enum flow_state state)
+{
+	uint8_t oldstate = f->state;
+
+	ASSERT(state < FLOW_NUM_STATES);
+	ASSERT(oldstate < FLOW_NUM_STATES);
+
+	f->state = state;
+	flow_log_(f, LOG_DEBUG, "%s -> %s", flow_state_str[oldstate],
+		  FLOW_STATE(f));
+
+	flow_log_details_(f, LOG_DEBUG, MAX(state, oldstate));
+}
+
 /**
  * flow_initiate_() - Move flow to INI, setting pif[INISIDE]
  * @flow:	Flow to change state
diff --git a/flow.h b/flow.h
index d167b65..24ba3ef 100644
--- a/flow.h
+++ b/flow.h
@@ -264,4 +264,11 @@ void flow_log_(const struct flow_common *f, int pri, const char *fmt, ...)
 			flow_dbg((f), __VA_ARGS__);			\
 	} while (0)
 
+void flow_log_details_(const struct flow_common *f, int pri,
+		       enum flow_state state);
+#define flow_log_details(f_, pri) \
+	flow_log_details_(&((f_)->f), (pri), (f_)->f.state)
+#define flow_dbg_details(f_)	flow_log_details((f_), LOG_DEBUG)
+#define flow_err_details(f_)	flow_log_details((f_), LOG_ERR)
+
 #endif /* FLOW_H */

From bd092ca421be8908aadbeb2ecdfb9fede0f67c07 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Fri, 6 Sep 2024 15:17:08 +1000
Subject: [PATCH 014/382] udp: Split socket error handling out from
 udp_sock_recv()

Currently udp_sock_recv() both attempts to clear socket errors and read
a batch of datagrams for forwarding.  That made sense initially, since
both listening and reply sockets need to do this.  However, we have certain
error cases which will add additional complexity to the error processing.
Furthermore, if we ever wanted to more thoroughly handle errors received
here - e.g. by synthesising ICMP messages on the tap device - it will
likely require different handling for the listening and reply socket cases.

So, split handling of error events into its own udp_sock_errs() function.
While we're there, allow it to report "unrecoverable errors".  We don't
have any of these so far, but some cases we're working on might require it.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 udp.c | 46 ++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 40 insertions(+), 6 deletions(-)

diff --git a/udp.c b/udp.c
index bd9051e..45142cd 100644
--- a/udp.c
+++ b/udp.c
@@ -436,6 +436,30 @@ static bool udp_sock_recverr(int s)
 	return true;
 }
 
+/**
+ * udp_sock_errs() - Process errors on a socket
+ * @c:		Execution context
+ * @s:		Socket to receive from
+ * @events:	epoll events bitmap
+ *
+ * Return: Number of errors handled, or < 0 if we have an unrecoverable error
+ */
+static int udp_sock_errs(const struct ctx *c, int s, uint32_t events)
+{
+	unsigned n_err = 0;
+
+	ASSERT(!c->no_udp);
+
+	if (!(events & EPOLLERR))
+		return 0; /* Nothing to do */
+
+	/* Empty the error queue */
+	while (udp_sock_recverr(s))
+		n_err++;
+
+	return n_err;
+}
+
 /**
  * udp_sock_recv() - Receive datagrams from a socket
  * @c:		Execution context
@@ -443,6 +467,8 @@ static bool udp_sock_recverr(int s)
  * @events:	epoll events bitmap
  * @mmh		mmsghdr array to receive into
  *
+ * Return: Number of datagrams received
+ *
  * #syscalls recvmmsg arm:recvmmsg_time64 i686:recvmmsg_time64
  */
 static int udp_sock_recv(const struct ctx *c, int s, uint32_t events,
@@ -459,12 +485,6 @@ static int udp_sock_recv(const struct ctx *c, int s, uint32_t events,
 
 	ASSERT(!c->no_udp);
 
-	/* Clear any errors first */
-	if (events & EPOLLERR) {
-		while (udp_sock_recverr(s))
-			;
-	}
-
 	if (!(events & EPOLLIN))
 		return 0;
 
@@ -492,6 +512,13 @@ void udp_listen_sock_handler(const struct ctx *c, union epoll_ref ref,
 	const socklen_t sasize = sizeof(udp_meta[0].s_in);
 	int n, i;
 
+	if (udp_sock_errs(c, ref.fd, events) < 0) {
+		err("UDP: Unrecoverable error on listening socket:"
+		    " (%s port %hu)", pif_name(ref.udp.pif), ref.udp.port);
+		/* FIXME: what now?  close/re-open socket? */
+		return;
+	}
+
 	if ((n = udp_sock_recv(c, ref.fd, events, udp_mh_recv)) <= 0)
 		return;
 
@@ -566,6 +593,13 @@ void udp_reply_sock_handler(const struct ctx *c, union epoll_ref ref,
 
 	ASSERT(!c->no_udp && uflow);
 
+	if (udp_sock_errs(c, from_s, events) < 0) {
+		flow_err(uflow, "Unrecoverable error on reply socket");
+		flow_err_details(uflow);
+		udp_flow_close(c, uflow);
+		return;
+	}
+
 	if ((n = udp_sock_recv(c, from_s, events, udp_mh_recv)) <= 0)
 		return;
 

From bd99f02a64f46cae44ef13c3cb934b8baa9c1a2c Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Fri, 6 Sep 2024 15:17:09 +1000
Subject: [PATCH 015/382] udp: Treat errors getting errors as unrecoverable

We can get network errors, usually transient, reported via the socket error
queue.  However, at least theoretically, we could get errors trying to
read the queue itself.  Since we have no idea how to clear an error
condition in that case, treat it as unrecoverable.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 udp.c | 27 +++++++++++++++++----------
 1 file changed, 17 insertions(+), 10 deletions(-)

diff --git a/udp.c b/udp.c
index 45142cd..85a14de 100644
--- a/udp.c
+++ b/udp.c
@@ -387,11 +387,12 @@ static void udp_tap_prepare(const struct mmsghdr *mmh, unsigned idx,
  * udp_sock_recverr() - Receive and clear an error from a socket
  * @s:		Socket to receive from
  *
- * Return: true if errors received and processed, false if no more errors
+ * Return: 1 if error received and processed, 0 if no more errors in queue, < 0
+ *         if there was an error reading the queue
  *
  * #syscalls recvmsg
  */
-static bool udp_sock_recverr(int s)
+static int udp_sock_recverr(int s)
 {
 	const struct sock_extended_err *ee;
 	const struct cmsghdr *hdr;
@@ -408,14 +409,16 @@ static bool udp_sock_recverr(int s)
 
 	rc = recvmsg(s, &mh, MSG_ERRQUEUE);
 	if (rc < 0) {
-		if (errno != EAGAIN && errno != EWOULDBLOCK)
-			err_perror("Failed to read error queue");
-		return false;
+		if (errno == EAGAIN || errno == EWOULDBLOCK)
+			return 0;
+
+		err_perror("UDP: Failed to read error queue");
+		return -1;
 	}
 
 	if (!(mh.msg_flags & MSG_ERRQUEUE)) {
 		err("Missing MSG_ERRQUEUE flag reading error queue");
-		return false;
+		return -1;
 	}
 
 	hdr = CMSG_FIRSTHDR(&mh);
@@ -424,7 +427,7 @@ static bool udp_sock_recverr(int s)
 	      (hdr->cmsg_level == IPPROTO_IPV6 &&
 	       hdr->cmsg_type == IPV6_RECVERR))) {
 		err("Unexpected cmsg reading error queue");
-		return false;
+		return -1;
 	}
 
 	ee = (const struct sock_extended_err *)CMSG_DATA(hdr);
@@ -433,7 +436,7 @@ static bool udp_sock_recverr(int s)
 	debug("%s error on UDP socket %i: %s",
 	      str_ee_origin(ee), s, strerror(ee->ee_errno));
 
-	return true;
+	return 1;
 }
 
 /**
@@ -447,6 +450,7 @@ static bool udp_sock_recverr(int s)
 static int udp_sock_errs(const struct ctx *c, int s, uint32_t events)
 {
 	unsigned n_err = 0;
+	int rc;
 
 	ASSERT(!c->no_udp);
 
@@ -454,8 +458,11 @@ static int udp_sock_errs(const struct ctx *c, int s, uint32_t events)
 		return 0; /* Nothing to do */
 
 	/* Empty the error queue */
-	while (udp_sock_recverr(s))
-		n_err++;
+	while ((rc = udp_sock_recverr(s)) > 0)
+		n_err += rc;
+
+	if (rc < 0)
+		return -1; /* error reading error, unrecoverable */
 
 	return n_err;
 }

From aff5a49b0e75dd08428a88c05d98f39885556c8b Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Fri, 6 Sep 2024 15:17:10 +1000
Subject: [PATCH 016/382] udp: Handle more error conditions in udp_sock_errs()

udp_sock_errs() reads out everything in the socket error queue.  However
we've seen some cases[0] where an EPOLLERR event is active, but there isn't
anything in the queue.

One possibility is that the error is reported instead by the SO_ERROR
sockopt.  Check for that case and report it as best we can.  If we still
get an EPOLLERR without visible error, we have no way to clear the error
state, so treat it as an unrecoverable error.

[0] https://github.com/containers/podman/issues/23686#issuecomment-2324945010

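Both error sources, condensed into a standalone sketch (not the actual
udp.c functions): drain the error queue with recvmsg(MSG_ERRQUEUE), then
check SO_ERROR for anything reported out of band, and treat failures of
either step as unrecoverable:

#include <errno.h>
#include <string.h>
#include <stdio.h>
#include <sys/socket.h>
#include <sys/uio.h>

/* Return the number of errors cleared on @s, or -1 on a state we can't
 * clear (so the caller should give up on the socket).
 */
static int sock_clear_errors(int s)
{
	char data[256], control[256];
	struct iovec iov = { .iov_base = data, .iov_len = sizeof(data) };
	struct msghdr mh = {
		.msg_iov	= &iov,
		.msg_iovlen	= 1,
		.msg_control	= control,
		.msg_controllen	= sizeof(control),
	};
	socklen_t errlen = sizeof(int);
	int n = 0, err;

	/* Each recvmsg() with MSG_ERRQUEUE dequeues one error, if any */
	while (recvmsg(s, &mh, MSG_ERRQUEUE | MSG_DONTWAIT) >= 0)
		n++;
	if (errno != EAGAIN && errno != EWOULDBLOCK)
		return -1;	/* error reading the error queue itself */

	/* Some errors are reported via SO_ERROR only (reading clears it) */
	if (getsockopt(s, SOL_SOCKET, SO_ERROR, &err, &errlen))
		return -1;

	if (err) {
		fprintf(stderr, "socket %i: %s\n", s, strerror(err));
		n++;
	}

	return n;	/* 0 despite EPOLLERR would be unrecoverable, too */
}
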
Link: https://bugs.passt.top/show_bug.cgi?id=95
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 udp.c | 21 ++++++++++++++++++++-
 1 file changed, 20 insertions(+), 1 deletion(-)

diff --git a/udp.c b/udp.c
index 85a14de..ae91027 100644
--- a/udp.c
+++ b/udp.c
@@ -450,7 +450,8 @@ static int udp_sock_recverr(int s)
 static int udp_sock_errs(const struct ctx *c, int s, uint32_t events)
 {
 	unsigned n_err = 0;
-	int rc;
+	socklen_t errlen;
+	int rc, err;
 
 	ASSERT(!c->no_udp);
 
@@ -464,6 +465,24 @@ static int udp_sock_errs(const struct ctx *c, int s, uint32_t events)
 	if (rc < 0)
 		return -1; /* error reading error, unrecoverable */
 
+	errlen = sizeof(err);
+	if (getsockopt(s, SOL_SOCKET, SO_ERROR, &err, &errlen) < 0 ||
+	    errlen != sizeof(err)) {
+		err_perror("Error reading SO_ERROR");
+		return -1;  /* error reading error, unrecoverable */
+	}
+
+	if (err) {
+		debug("Unqueued error on UDP socket %i: %s", s, strerror(err));
+		n_err++;
+	}
+
+	if (!n_err) {
+		/* EPOLLERR, but no errors to clear !? */
+		err("EPOLLERR event without reported errors on socket %i", s);
+		return -1; /* no way to clear, unrecoverable */
+	}
+
 	return n_err;
 }
 

From afedc2412e8576d95ef49e684601bde2f12d7974 Mon Sep 17 00:00:00 2001
From: Stefano Brivio <sbrivio@redhat.com>
Date: Fri, 6 Sep 2024 10:33:55 +0200
Subject: [PATCH 017/382] tcp: Use EPOLLET for any state of not established
 connections

Currently, for not established connections, we monitor sockets with
edge-triggered events (EPOLLET) if we are in the TAP_SYN_RCVD state
(outbound connection being established) but not in the
TAP_SYN_ACK_SENT case of it (socket is connected, and we sent SYN,ACK
to the container/guest).

While debugging https://bugs.passt.top/show_bug.cgi?id=94, I spotted
another possibility for a short EPOLLRDHUP storm (10 seconds), which
doesn't seem to happen in actual use cases, but I could reproduce it:
start a connection from a container, while dropping (using netfilter)
ACK segments coming out of the container itself.

On the server side, outside the container, accept the connection and
shutdown the writing side of it immediately.

At this point, we're in the TAP_SYN_ACK_SENT case (not just a mere
TAP_SYN_RCVD state), we get EPOLLRDHUP from the socket, but we don't
have any reasonable way to handle it other than waiting for the tap
side to complete the three-way handshake. So we'll just keep getting
this EPOLLRDHUP until the SYN_TIMEOUT kicks in.

Always enable EPOLLET when EPOLLRDHUP is the only epoll event we
subscribe to: in this case, getting multiple EPOLLRDHUP reports is
totally useless.

In the only remaining non-established state, SOCK_ACCEPTED, for
inbound connections, we're anyway discarding EPOLLRDHUP events until
we've established the connection, because we don't know what to do with
them until we get an answer from the tap side, so it's safe to enable
EPOLLET also in that case.

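In epoll terms, the change amounts to something like the following sketch
(hypothetical helper, assumes the socket is already registered): with
EPOLLET, EPOLLRDHUP is reported once per state change rather than on every
epoll_wait() until it's handled:

#include <sys/epoll.h>

/* Register interest only in the peer closing its writing side.
 * Edge-triggered: one EPOLLRDHUP report when it happens, not a storm of
 * them while we wait for the tap side to finish the handshake.
 */
static int watch_rdhup(int epollfd, int s)
{
	struct epoll_event ev = {
		.events		= EPOLLRDHUP | EPOLLET,
		.data.fd	= s,
	};

	return epoll_ctl(epollfd, EPOLL_CTL_MOD, s, &ev);
}
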
Link: https://bugs.passt.top/show_bug.cgi?id=94
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 tcp.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tcp.c b/tcp.c
index 77c62f0..f9fe1b9 100644
--- a/tcp.c
+++ b/tcp.c
@@ -440,7 +440,7 @@ static uint32_t tcp_conn_epoll_events(uint8_t events, uint8_t conn_flags)
 	if (events == TAP_SYN_RCVD)
 		return EPOLLOUT | EPOLLET | EPOLLRDHUP;
 
-	return EPOLLRDHUP;
+	return EPOLLET | EPOLLRDHUP;
 }
 
 /**

From 748ef4cd6e7d7307b4c91cbe59ad040ef535dbdc Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Thu, 5 Sep 2024 21:22:04 +1000
Subject: [PATCH 018/382] cppcheck: Work around some cppcheck 2.15.0
 redundantInitialization warnings

cppcheck-2.15.0 has apparently broadened when it throws a warning about
redundant initialization to include some cases where we have an initializer
for some fields, but then set other fields in the function body.

This is arguably a false positive: although we are technically overwriting
the zero-initialization the compiler supplies for fields not explicitly
initialized, this sort of construct makes sense when there are some fields
we know at the top of the function where the initializer is, but others
that require more complex calculation.

That said, in the two places this shows up, it's pretty easy to work
around.  The results are arguably slightly clearer than what we had, since
they move the parts of the initialization closer together.

So do that rather than having ugly suppressions or dealing with the
tedious process of reporting a cppcheck false positive.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 pasta.c |  3 ++-
 udp.c   | 10 ++++------
 2 files changed, 6 insertions(+), 7 deletions(-)

diff --git a/pasta.c b/pasta.c
index 1900693..307fb4a 100644
--- a/pasta.c
+++ b/pasta.c
@@ -427,12 +427,12 @@ static int pasta_netns_quit_timer(void)
  */
 void pasta_netns_quit_init(const struct ctx *c)
 {
-	union epoll_ref ref = { .type = EPOLL_TYPE_NSQUIT_INOTIFY };
 	struct epoll_event ev = { .events = EPOLLIN };
 	int flags = O_NONBLOCK | O_CLOEXEC;
 	struct statfs s = { 0 };
 	bool try_inotify = true;
 	int fd = -1, dir_fd;
+	union epoll_ref ref;
 
 	if (c->mode != MODE_PASTA || c->no_netns_quit || !*c->netns_base)
 		return;
@@ -463,6 +463,7 @@ void pasta_netns_quit_init(const struct ctx *c)
 		ref.type = EPOLL_TYPE_NSQUIT_TIMER;
 	} else {
 		close(dir_fd);
+		ref.type = EPOLL_TYPE_NSQUIT_INOTIFY;
 	}
 
 	if (fd > FD_REF_MAX)
diff --git a/udp.c b/udp.c
index ae91027..2ba00c9 100644
--- a/udp.c
+++ b/udp.c
@@ -773,16 +773,14 @@ int udp_tap_handler(const struct ctx *c, uint8_t pif,
 int udp_sock_init(const struct ctx *c, int ns, sa_family_t af,
 		  const void *addr, const char *ifname, in_port_t port)
 {
-	union udp_listen_epoll_ref uref = { .port = port };
+	union udp_listen_epoll_ref uref = {
+		.pif = ns ? PIF_SPLICE : PIF_HOST,
+		.port = port,
+	};
 	int r4 = FD_REF_MAX + 1, r6 = FD_REF_MAX + 1;
 
 	ASSERT(!c->no_udp);
 
-	if (ns)
-		uref.pif = PIF_SPLICE;
-	else
-		uref.pif = PIF_HOST;
-
 	if (af == AF_UNSPEC && c->ifi4 && c->ifi6) {
 		int s;
 

From 63513e54f3208566ecb746d204ebeaafdd2c79c1 Mon Sep 17 00:00:00 2001
From: Stefano Brivio <sbrivio@redhat.com>
Date: Fri, 6 Sep 2024 12:43:45 +0200
Subject: [PATCH 019/382] util: Fix order of operands and carry of one second
 in timespec_diff_us()

If the nanoseconds of the minuend timestamp are less than the
nanoseconds of the subtrahend timestamp, we need to carry one second
in the subtraction.

I subtracted this second from the minuend, but didn't actually carry
it in the subtraction of nanoseconds, and logged timestamps would jump
back whenever we switched to the first branch of timespec_diff_us()
from the second one.

Most likely, the reason why I didn't carry the second is that I
instinctively thought that swapping the operands would have the same
effect. But it doesn't, in general: that only happens with arithmetic
in modulo powers of 2. Undo the swap as well.

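For reference, a sketch of the corrected function with a worked example;
the first branch matches the fix above, the second is assumed to be the
plain subtraction:

#include <stdint.h>
#include <time.h>

/* Difference a - b, in microseconds */
static int64_t timespec_diff_us(const struct timespec *a,
				const struct timespec *b)
{
	if (a->tv_nsec < b->tv_nsec) {
		/* Borrow one second: e.g. a = 5s + 100ns, b = 3s + 800ns:
		 * (100 + 1000000000 - 800) / 1000 + (5 - 3 - 1) * 1000000
		 * = 999999 + 1000000 = 1999999us, just under 2s
		 */
		return (a->tv_nsec + 1000000000 - b->tv_nsec) / 1000 +
		       (a->tv_sec - b->tv_sec - 1) * 1000000;
	}

	return (a->tv_nsec - b->tv_nsec) / 1000 +
	       (a->tv_sec - b->tv_sec) * 1000000;
}
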
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
---
 util.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/util.c b/util.c
index 6e64279..eede4e5 100644
--- a/util.c
+++ b/util.c
@@ -249,7 +249,7 @@ void sock_probe_mem(struct ctx *c)
 int64_t timespec_diff_us(const struct timespec *a, const struct timespec *b)
 {
 	if (a->tv_nsec < b->tv_nsec) {
-		return (b->tv_nsec - a->tv_nsec) / 1000 +
+		return (a->tv_nsec + 1000000000 - b->tv_nsec) / 1000 +
 		       (a->tv_sec - b->tv_sec - 1) * 1000000;
 	}
 

From 49fc4e0414610c6eadc6693fee4d5077d2e8097e Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Fri, 6 Sep 2024 21:49:36 +1000
Subject: [PATCH 020/382] tap: Split out handling of EPOLLIN events

Currently, tap_handler_pas{st,ta}() check for EPOLLRDHUP, EPOLLHUP and
EPOLLERR events, then assume anything left is EPOLLIN.  We have some future
cases that may want to also handle EPOLLOUT, so in preparation explicitly
handle EPOLLIN, moving the logic to a subfunction.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 tap.c | 50 ++++++++++++++++++++++++++++++++++++--------------
 1 file changed, 36 insertions(+), 14 deletions(-)

diff --git a/tap.c b/tap.c
index 852d837..c8abc06 100644
--- a/tap.c
+++ b/tap.c
@@ -982,24 +982,17 @@ static void tap_sock_reset(struct ctx *c)
 }
 
 /**
- * tap_handler_passt() - Packet handler for AF_UNIX file descriptor
+ * tap_passt_input() - Handler for new data on the socket to qemu
  * @c:		Execution context
- * @events:	epoll events
  * @now:	Current timestamp
  */
-void tap_handler_passt(struct ctx *c, uint32_t events,
-		       const struct timespec *now)
+static void tap_passt_input(struct ctx *c, const struct timespec *now)
 {
 	static const char *partial_frame;
 	static ssize_t partial_len = 0;
 	ssize_t n;
 	char *p;
 
-	if (events & (EPOLLRDHUP | EPOLLHUP | EPOLLERR)) {
-		tap_sock_reset(c);
-		return;
-	}
-
 	tap_flush_pools();
 
 	if (partial_len) {
@@ -1052,20 +1045,33 @@ void tap_handler_passt(struct ctx *c, uint32_t events,
 }
 
 /**
- * tap_handler_pasta() - Packet handler for /dev/net/tun file descriptor
+ * tap_handler_passt() - Event handler for AF_UNIX file descriptor
  * @c:		Execution context
  * @events:	epoll events
  * @now:	Current timestamp
  */
-void tap_handler_pasta(struct ctx *c, uint32_t events,
+void tap_handler_passt(struct ctx *c, uint32_t events,
 		       const struct timespec *now)
+{
+	if (events & (EPOLLRDHUP | EPOLLHUP | EPOLLERR)) {
+		tap_sock_reset(c);
+		return;
+	}
+
+	if (events & EPOLLIN)
+		tap_passt_input(c, now);
+}
+
+/**
+ * tap_pasta_input() - Handler for new data on the socket to hypervisor
+ * @c:		Execution context
+ * @now:	Current timestamp
+ */
+static void tap_pasta_input(struct ctx *c, const struct timespec *now)
 {
 	ssize_t n, len;
 	int ret;
 
-	if (events & (EPOLLRDHUP | EPOLLHUP | EPOLLERR))
-		die("Disconnect event on /dev/net/tun device, exiting");
-
 redo:
 	n = 0;
 
@@ -1102,6 +1108,22 @@ restart:
 	die("Error on tap device, exiting");
 }
 
+/**
+ * tap_handler_pasta() - Packet handler for /dev/net/tun file descriptor
+ * @c:		Execution context
+ * @events:	epoll events
+ * @now:	Current timestamp
+ */
+void tap_handler_pasta(struct ctx *c, uint32_t events,
+		       const struct timespec *now)
+{
+	if (events & (EPOLLRDHUP | EPOLLHUP | EPOLLERR))
+		die("Disconnect event on /dev/net/tun device, exiting");
+
+	if (events & EPOLLIN)
+		tap_pasta_input(c, now);
+}
+
 /**
  * tap_sock_unix_open() - Create and bind AF_UNIX socket
  * @sock_path:	Socket path. If empty, set on return (UNIX_SOCK_PATH as prefix)

From 11e29054fe91ceaf59d2a500e09c4da262c7b23e Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Fri, 6 Sep 2024 21:49:37 +1000
Subject: [PATCH 021/382] tap: Improve handling of EINTR in tap_passt_input()

When tap_passt_input() gets an error from recv() it (correctly) does not
print any error message for EINTR, EAGAIN or EWOULDBLOCK.  However in all
three cases it returns from the function.  That makes sense for EAGAIN and
EWOULDBLOCK, since we then want to wait for the next EPOLLIN event before
trying again.  For EINTR, however, it makes more sense to retry immediately
- as it stands we're likely to get a renewed EPOLLIN event immediately in
that case, since we're using level-triggered signalling.

So, handle EINTR separately by immediately retrying until we succeed or
get a different type of error.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 tap.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)
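
As a standalone sketch of the idiom this patch applies (illustrative only,
not passt code, and the function name is made up): retry on EINTR, but on
EAGAIN/EWOULDBLOCK return and let the next level-triggered EPOLLIN event
bring us back:

  #include <errno.h>
  #include <sys/socket.h>
  #include <sys/types.h>

  /* Retry recv() if a signal interrupted it; on EAGAIN/EWOULDBLOCK, give
   * up for now so the event loop calls us again on the next EPOLLIN.
   */
  ssize_t recv_retry_eintr(int fd, void *buf, size_t len)
  {
  	ssize_t n;

  	do
  		n = recv(fd, buf, len, MSG_DONTWAIT);
  	while (n < 0 && errno == EINTR);

  	return n;
  }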

diff --git a/tap.c b/tap.c
index c8abc06..8977b3f 100644
--- a/tap.c
+++ b/tap.c
@@ -1003,10 +1003,13 @@ static void tap_passt_input(struct ctx *c, const struct timespec *now)
 		memmove(pkt_buf, partial_frame, partial_len);
 	}
 
-	n = recv(c->fd_tap, pkt_buf + partial_len, TAP_BUF_BYTES - partial_len,
-		 MSG_DONTWAIT);
+	do {
+		n = recv(c->fd_tap, pkt_buf + partial_len,
+			 TAP_BUF_BYTES - partial_len, MSG_DONTWAIT);
+	} while ((n < 0) && errno == EINTR);
+
 	if (n < 0) {
-		if (errno != EINTR && errno != EAGAIN && errno != EWOULDBLOCK) {
+		if (errno != EAGAIN && errno != EWOULDBLOCK) {
 			err_perror("Receive error on guest connection, reset");
 			tap_sock_reset(c);
 		}

From d2a1dc744b10d3e5253149a2520db9967f9f20d5 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Fri, 6 Sep 2024 21:49:38 +1000
Subject: [PATCH 022/382] tap: Restructure in tap_pasta_input()

tap_pasta_input() has a rather confusing structure, using two gotos.
Remove these by restructuring the function to have the main loop condition
based on filling our buffer space, with errors or running out of data
treated as the exception, rather than the other way around.  This allows
us to handle the EINTR which triggered the 'restart' goto with a continue.

The outer 'redo' was triggered if we completely filled our buffer, to flush
it and do another pass.  This one is unnecessary since we don't (yet) use
EPOLLET on the tap device: if there's still more data we'll get another
event and re-enter the loop.

Along the way handle a couple of extra edge cases:
 - Check for EWOULDBLOCK as well as EAGAIN for the benefit of any future
   ports where those might not have the same value
 - Detect EOF on the tap device and exit in that case

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 tap.c | 45 +++++++++++++++++++--------------------------
 1 file changed, 19 insertions(+), 26 deletions(-)

diff --git a/tap.c b/tap.c
index 8977b3f..145587f 100644
--- a/tap.c
+++ b/tap.c
@@ -1073,42 +1073,35 @@ void tap_handler_passt(struct ctx *c, uint32_t events,
 static void tap_pasta_input(struct ctx *c, const struct timespec *now)
 {
 	ssize_t n, len;
-	int ret;
-
-redo:
-	n = 0;
 
 	tap_flush_pools();
-restart:
-	while ((len = read(c->fd_tap, pkt_buf + n, TAP_BUF_BYTES - n)) > 0) {
 
-		if (len < (ssize_t)sizeof(struct ethhdr) ||
-		    len > (ssize_t)ETH_MAX_MTU) {
-			n += len;
-			continue;
+	for (n = 0; n < (ssize_t)TAP_BUF_BYTES; n += len) {
+		len = read(c->fd_tap, pkt_buf + n, TAP_BUF_BYTES - n);
+
+		if (len == 0) {
+			die("EOF on tap device, exiting");
+		} else if (len < 0) {
+			if (errno == EINTR) {
+				len = 0;
+				continue;
+			}
+
+			if (errno == EAGAIN || errno == EWOULDBLOCK)
+				break; /* all done for now */
+
+			die("Error on tap device, exiting");
 		}
 
+		/* Ignore frames of bad length */
+		if (len < (ssize_t)sizeof(struct ethhdr) ||
+		    len > (ssize_t)ETH_MAX_MTU)
+			continue;
 
 		tap_add_packet(c, len, pkt_buf + n);
-
-		if ((n += len) == TAP_BUF_BYTES)
-			break;
 	}
 
-	if (len < 0 && errno == EINTR)
-		goto restart;
-
-	ret = errno;
-
 	tap_handler(c, now);
-
-	if (len > 0 || ret == EAGAIN)
-		return;
-
-	if (n == TAP_BUF_BYTES)
-		goto redo;
-
-	die("Error on tap device, exiting");
 }
 
 /**

From a33ecafbd921a681ef65b66624625a1beac43c50 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Fri, 6 Sep 2024 21:49:39 +1000
Subject: [PATCH 023/382] tap: Don't risk truncating frames on full buffer in
 tap_pasta_input()

tap_pasta_input() keeps reading frames from the tap device until the
buffer is full.  However, this has an ugly edge case, when we get close
to buffer full, we will provide just the remaining space as a read()
buffer.  If this is shorter than the next frame to read, the tap device
will truncate the frame and discard the remainder.

Adjust the code to make sure we always have room for a maximum size frame.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 tap.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)
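
Worked out, the new loop condition n <= TAP_BUF_BYTES - ETH_MAX_MTU is the
same as TAP_BUF_BYTES - n >= ETH_MAX_MTU: every read() is offered at least
ETH_MAX_MTU bytes of pkt_buf, so a maximum-size frame can no longer be
truncated against the end of the buffer, at the cost of leaving at most
ETH_MAX_MTU - 1 bytes of the buffer unused on each pass.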

diff --git a/tap.c b/tap.c
index 145587f..41af6a6 100644
--- a/tap.c
+++ b/tap.c
@@ -1076,8 +1076,8 @@ static void tap_pasta_input(struct ctx *c, const struct timespec *now)
 
 	tap_flush_pools();
 
-	for (n = 0; n < (ssize_t)TAP_BUF_BYTES; n += len) {
-		len = read(c->fd_tap, pkt_buf + n, TAP_BUF_BYTES - n);
+	for (n = 0; n <= (ssize_t)TAP_BUF_BYTES - ETH_MAX_MTU; n += len) {
+		len = read(c->fd_tap, pkt_buf + n, ETH_MAX_MTU);
 
 		if (len == 0) {
 			die("EOF on tap device, exiting");

From 116bc8266d97d3a3679f9f1c5dc306c834562b48 Mon Sep 17 00:00:00 2001
From: Stefano Brivio <sbrivio@redhat.com>
Date: Fri, 6 Sep 2024 15:19:20 +0200
Subject: [PATCH 024/382] selinux: Allow read access to
 /proc/sys/net/ipv4/ip_local_port_range

Since commit eedc81b6ef55 ("fwd, conf: Probe host's ephemeral ports"),
we might need to read from /proc/sys/net/ipv4/ip_local_port_range in
both passt and pasta.

While pasta was already allowed to open and write /proc/sys/net
entries, read access was missing in SELinux's type enforcement: add
that.

In passt, instead, this is the first time we need to access an entry
there: add everything we need.

Fixes: eedc81b6ef55 ("fwd, conf: Probe host's ephemeral ports")
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 contrib/selinux/passt.te | 3 +++
 contrib/selinux/pasta.te | 2 +-
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/contrib/selinux/passt.te b/contrib/selinux/passt.te
index bbb0917..80bf780 100644
--- a/contrib/selinux/passt.te
+++ b/contrib/selinux/passt.te
@@ -50,6 +50,7 @@ require {
 	type passwd_file_t;
 
 	class netlink_route_socket { bind create nlmsg_read };
+	type sysctl_net_t;
 
 	class capability { sys_tty_config setuid setgid };
 	class cap_userns { setpcap sys_admin sys_ptrace };
@@ -104,6 +105,8 @@ allow passt_t net_conf_t:lnk_file read;
 allow passt_t tmp_t:sock_file { create unlink write };
 allow passt_t self:netlink_route_socket { bind create nlmsg_read read write setopt };
 kernel_search_network_sysctl(passt_t)
+allow passt_t sysctl_net_t:dir search;
+allow passt_t sysctl_net_t:file { open read };
 
 corenet_tcp_bind_all_nodes(passt_t)
 corenet_udp_bind_all_nodes(passt_t)
diff --git a/contrib/selinux/pasta.te b/contrib/selinux/pasta.te
index 4e36c3f..310383c 100644
--- a/contrib/selinux/pasta.te
+++ b/contrib/selinux/pasta.te
@@ -196,7 +196,7 @@ allow pasta_t ifconfig_var_run_t:dir { read search watch };
 allow pasta_t self:tun_socket create;
 allow pasta_t tun_tap_device_t:chr_file { ioctl open read write };
 allow pasta_t sysctl_net_t:dir search;
-allow pasta_t sysctl_net_t:file { open write };
+allow pasta_t sysctl_net_t:file { open read write };
 allow pasta_t kernel_t:system module_request;
 
 allow pasta_t nsfs_t:file read;

From 6b38f0723949f8b4b2787ee55d4330249a1a4a3e Mon Sep 17 00:00:00 2001
From: Stefano Brivio <sbrivio@redhat.com>
Date: Fri, 6 Sep 2024 15:24:26 +0200
Subject: [PATCH 025/382] apparmor: Allow read access to
 /proc/sys/net/ipv4/ip_local_port_range

...for both passt and pasta: use passt's abstraction for this.

Fixes: eedc81b6ef55 ("fwd, conf: Probe host's ephemeral ports")
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 contrib/apparmor/abstractions/passt | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/contrib/apparmor/abstractions/passt b/contrib/apparmor/abstractions/passt
index d245115..43fd63f 100644
--- a/contrib/apparmor/abstractions/passt
+++ b/contrib/apparmor/abstractions/passt
@@ -34,6 +34,8 @@
 
   owner @{PROC}/@{pid}/uid_map		r,	# conf_ugid()
 
+  @{PROC}/sys/net/ipv4/ip_local_port_range r,	# fwd_probe_ephemeral()
+
   network netlink raw,				# nl_sock_init_do(), netlink.c
 
   network inet stream,				# tcp.c

From 1f414ed8f0b3101363c1373e338802186eb29b7c Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Thu, 12 Sep 2024 16:59:39 +1000
Subject: [PATCH 026/382] tcp: Remove redundant initialisation of
 iov[TCP_IOV_ETH].iov_base

This initialisation for IPv4 flags buffers is redundant with the very next
line which sets both iov_base and iov_len.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 tcp_buf.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tcp_buf.c b/tcp_buf.c
index c31e9f3..2e044b2 100644
--- a/tcp_buf.c
+++ b/tcp_buf.c
@@ -168,7 +168,6 @@ void tcp_sock4_iov_init(const struct ctx *c)
 		iov = tcp4_l2_flags_iov[i];
 
 		iov[TCP_IOV_TAP] = tap_hdr_iov(c, &tcp4_flags_tap_hdr[i]);
-		iov[TCP_IOV_ETH].iov_base = &tcp4_eth_src;
 		iov[TCP_IOV_ETH] = IOV_OF_LVALUE(tcp4_eth_src);
 		iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp4_flags_ip[i]);
 		iov[TCP_IOV_PAYLOAD].iov_base = &tcp4_flags[i];

From 5ff5d55291d2223c65f889b8eee446b8ed2c551c Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Thu, 12 Sep 2024 16:59:40 +1000
Subject: [PATCH 027/382] tcp: Avoid overlapping memcpy() in DUP_ACK handling

When handling the DUP_ACK flag, we copy all the buffers making up the ack
frame.  However, all our frames share the same buffer for the Ethernet
header (tcp4_eth_src or tcp6_eth_src), so copying the TCP_IOV_ETH will
result in a (perfectly) overlapping memcpy().  This seems to have been
harmless so far, but overlapping ranges to memcpy() is undefined behaviour,
so we really should avoid it.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 tcp_buf.c | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)
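
As a minimal illustration of the general rule (not passt code, the helper
name is made up): memcpy() with overlapping source and destination, even
when the two ranges are identical, is undefined behaviour, and memmove(),
or skipping the copy, is the defined alternative:

  #include <string.h>

  /* Copy len bytes, tolerating overlapping or identical ranges */
  void copy_region(void *dst, const void *src, size_t len)
  {
  	if (dst == src)
  		return;			/* nothing to do, and avoids UB */

  	memmove(dst, src, len);		/* defined for overlapping ranges */
  }

The patch takes the cheaper route: the source and destination are known to
be the same shared Ethernet header buffer, so it simply skips that iovec
entry.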

diff --git a/tcp_buf.c b/tcp_buf.c
index 2e044b2..1a39846 100644
--- a/tcp_buf.c
+++ b/tcp_buf.c
@@ -332,9 +332,13 @@ int tcp_buf_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags)
 		else
 			dup_iov = tcp6_l2_flags_iov[tcp6_flags_used++];
 
-		for (i = 0; i < TCP_NUM_IOVS; i++)
-			memcpy(dup_iov[i].iov_base, iov[i].iov_base,
-			       iov[i].iov_len);
+		for (i = 0; i < TCP_NUM_IOVS; i++) {
+			/* All frames share the same ethernet header buffer */
+			if (i != TCP_IOV_ETH) {
+				memcpy(dup_iov[i].iov_base, iov[i].iov_base,
+				       iov[i].iov_len);
+			}
+		}
 		dup_iov[TCP_IOV_PAYLOAD].iov_len = iov[TCP_IOV_PAYLOAD].iov_len;
 	}
 

From 7d8804beb8ecbd07b51dbbeaf14289d37f4f8107 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Wed, 18 Sep 2024 11:53:04 +1000
Subject: [PATCH 028/382] tcp: Make some extra functions private

tcp_send_flag() and tcp_probe_peek_offset_cap() are not used outside tcp.c,
and have no prototype in a header.  Make them static.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 tcp.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tcp.c b/tcp.c
index f9fe1b9..14b48a8 100644
--- a/tcp.c
+++ b/tcp.c
@@ -1235,7 +1235,7 @@ int tcp_prepare_flags(struct ctx *c, struct tcp_tap_conn *conn,
  *
  * Return: negative error code on connection reset, 0 otherwise
  */
-int tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags)
+static int tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags)
 {
 	return tcp_buf_send_flag(c, conn, flags);
 }
@@ -2477,7 +2477,7 @@ static void tcp_sock_refill_init(const struct ctx *c)
  *
  * Return: true if supported, false otherwise
  */
-bool tcp_probe_peek_offset_cap(sa_family_t af)
+static bool tcp_probe_peek_offset_cap(sa_family_t af)
 {
 	bool ret = false;
 	int s, optv = 0;

From 4aff6f93923327cb875ceacf12ef0ffc2e613174 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Wed, 18 Sep 2024 11:53:05 +1000
Subject: [PATCH 029/382] tcp: Clean up tcpi_snd_wnd probing

When available, we want to retrieve our socket peer's advertised window and
forward that to the guest.  That information has been available from the
kernel via the TCP_INFO getsockopt() since kernel commit 8f7baad7f035.

Currently our probing for this is a bit odd.  The HAS_SND_WND define
determines if our headers include the tcpi_snd_wnd field, but that doesn't
necessarily mean the running kernel supports it.  Currently we start by
assuming it's _not_ available, but mark it as available if we ever see
a non-zero value in the field.  This is a bit hit and miss in two ways:
 * Zero is a perfectly possible window for the peer to report, so we can
   get false negatives
 * We're reading TCP_INFO into a local variable, which might not be zero
   initialised, so if the kernel _doesn't_ write it, it could have non-zero
   garbage, giving us false positives.

We can use a more direct way of probing for this: getsockopt() reports the
length of the information retrieved.  So, check whether that's long enough
to include the field.  This lets us probe the availability of the field
once and for all during initialisation.  That in turn allows ctx to become
a const pointer to tcp_prepare_flags() which cascades through many other
functions.

We also move the flag for the probe result from the ctx structure to a
global, to match peek_offset_cap.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 tcp.c          | 93 ++++++++++++++++++++++++++++++++++++--------------
 tcp.h          | 13 +++----
 tcp_buf.c      | 10 +++---
 tcp_buf.h      |  6 ++--
 tcp_internal.h |  4 +--
 5 files changed, 82 insertions(+), 44 deletions(-)

diff --git a/tcp.c b/tcp.c
index 14b48a8..cba3f3b 100644
--- a/tcp.c
+++ b/tcp.c
@@ -308,11 +308,6 @@
 /* MSS rounding: see SET_MSS() */
 #define MSS_DEFAULT			536
 #define WINDOW_DEFAULT			14600		/* RFC 6928 */
-#ifdef HAS_SND_WND
-# define KERNEL_REPORTS_SND_WND(c)	((c)->tcp.kernel_snd_wnd)
-#else
-# define KERNEL_REPORTS_SND_WND(c)	(0 && (c))
-#endif
 
 #define ACK_INTERVAL			10		/* ms */
 #define SYN_TIMEOUT			10		/* s */
@@ -370,6 +365,14 @@ char		tcp_buf_discard		[MAX_WINDOW];
 
 /* Does the kernel support TCP_PEEK_OFF? */
 bool peek_offset_cap;
+#ifdef HAS_SND_WND
+/* Does the kernel report sending window in TCP_INFO (kernel commit
+ * 8f7baad7f035)
+ */
+bool snd_wnd_cap;
+#else
+#define snd_wnd_cap	(false)
+#endif
 
 /* sendmsg() to socket */
 static struct iovec	tcp_iov			[UIO_MAXIOV];
@@ -1052,7 +1055,7 @@ int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_tap_conn *conn,
 	}
 #endif /* !HAS_BYTES_ACKED */
 
-	if (!KERNEL_REPORTS_SND_WND(c)) {
+	if (!snd_wnd_cap) {
 		tcp_get_sndbuf(conn);
 		new_wnd_to_tap = MIN(SNDBUF_GET(conn), MAX_WINDOW);
 		conn->wnd_to_tap = MIN(new_wnd_to_tap >> conn->ws_to_tap,
@@ -1136,7 +1139,7 @@ static void tcp_update_seqack_from_tap(const struct ctx *c,
  *	     0 if there is no flag to send
  *	     1 otherwise
  */
-int tcp_prepare_flags(struct ctx *c, struct tcp_tap_conn *conn,
+int tcp_prepare_flags(const struct ctx *c, struct tcp_tap_conn *conn,
 		      int flags, struct tcphdr *th, char *data,
 		      size_t *optlen)
 {
@@ -1153,11 +1156,6 @@ int tcp_prepare_flags(struct ctx *c, struct tcp_tap_conn *conn,
 		return -ECONNRESET;
 	}
 
-#ifdef HAS_SND_WND
-	if (!c->tcp.kernel_snd_wnd && tinfo.tcpi_snd_wnd)
-		c->tcp.kernel_snd_wnd = 1;
-#endif
-
 	if (!(conn->flags & LOCAL))
 		tcp_rtt_dst_check(conn, &tinfo);
 
@@ -1235,7 +1233,8 @@ int tcp_prepare_flags(struct ctx *c, struct tcp_tap_conn *conn,
  *
  * Return: negative error code on connection reset, 0 otherwise
  */
-static int tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags)
+static int tcp_send_flag(const struct ctx *c, struct tcp_tap_conn *conn,
+			 int flags)
 {
 	return tcp_buf_send_flag(c, conn, flags);
 }
@@ -1245,7 +1244,7 @@ static int tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags)
  * @c:		Execution context
  * @conn:	Connection pointer
  */
-void tcp_rst_do(struct ctx *c, struct tcp_tap_conn *conn)
+void tcp_rst_do(const struct ctx *c, struct tcp_tap_conn *conn)
 {
 	if (conn->events == CLOSED)
 		return;
@@ -1463,7 +1462,7 @@ static void tcp_bind_outbound(const struct ctx *c,
  * @optlen:	Bytes in options: caller MUST ensure available length
  * @now:	Current timestamp
  */
-static void tcp_conn_from_tap(struct ctx *c, sa_family_t af,
+static void tcp_conn_from_tap(const struct ctx *c, sa_family_t af,
 			      const void *saddr, const void *daddr,
 			      const struct tcphdr *th, const char *opts,
 			      size_t optlen, const struct timespec *now)
@@ -1628,7 +1627,7 @@ static int tcp_sock_consume(const struct tcp_tap_conn *conn, uint32_t ack_seq)
  *
  * #syscalls recvmsg
  */
-static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn)
+static int tcp_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn)
 {
 	return tcp_buf_data_from_sock(c, conn);
 }
@@ -1644,8 +1643,8 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn)
  *
  * Return: count of consumed packets
  */
-static int tcp_data_from_tap(struct ctx *c, struct tcp_tap_conn *conn,
-			      const struct pool *p, int idx)
+static int tcp_data_from_tap(const struct ctx *c, struct tcp_tap_conn *conn,
+			     const struct pool *p, int idx)
 {
 	int i, iov_i, ack = 0, fin = 0, retr = 0, keep = -1, partial_send = 0;
 	uint16_t max_ack_seq_wnd = conn->wnd_from_tap;
@@ -1842,7 +1841,8 @@ out:
  * @opts:	Pointer to start of options
  * @optlen:	Bytes in options: caller MUST ensure available length
  */
-static void tcp_conn_from_sock_finish(struct ctx *c, struct tcp_tap_conn *conn,
+static void tcp_conn_from_sock_finish(const struct ctx *c,
+				      struct tcp_tap_conn *conn,
 				      const struct tcphdr *th,
 				      const char *opts, size_t optlen)
 {
@@ -1885,7 +1885,7 @@ static void tcp_conn_from_sock_finish(struct ctx *c, struct tcp_tap_conn *conn,
  *
  * Return: count of consumed packets
  */
-int tcp_tap_handler(struct ctx *c, uint8_t pif, sa_family_t af,
+int tcp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af,
 		    const void *saddr, const void *daddr,
 		    const struct pool *p, int idx, const struct timespec *now)
 {
@@ -2023,7 +2023,7 @@ reset:
  * @c:		Execution context
  * @conn:	Connection pointer
  */
-static void tcp_connect_finish(struct ctx *c, struct tcp_tap_conn *conn)
+static void tcp_connect_finish(const struct ctx *c, struct tcp_tap_conn *conn)
 {
 	socklen_t sl;
 	int so;
@@ -2049,8 +2049,8 @@ static void tcp_connect_finish(struct ctx *c, struct tcp_tap_conn *conn)
  * @sa:		Peer socket address (from accept())
  * @now:	Current timestamp
  */
-static void tcp_tap_conn_from_sock(struct ctx *c, union flow *flow, int s,
-				   const struct timespec *now)
+static void tcp_tap_conn_from_sock(const struct ctx *c, union flow *flow,
+				   int s, const struct timespec *now)
 {
 	struct tcp_tap_conn *conn = FLOW_SET_TYPE(flow, FLOW_TCP, tcp);
 	uint64_t hash;
@@ -2081,7 +2081,7 @@ static void tcp_tap_conn_from_sock(struct ctx *c, union flow *flow, int s,
  * @ref:	epoll reference of listening socket
  * @now:	Current timestamp
  */
-void tcp_listen_handler(struct ctx *c, union epoll_ref ref,
+void tcp_listen_handler(const struct ctx *c, union epoll_ref ref,
 			const struct timespec *now)
 {
 	const struct flowside *ini;
@@ -2146,7 +2146,7 @@ cancel:
  *
  * #syscalls timerfd_gettime arm:timerfd_gettime64 i686:timerfd_gettime64
  */
-void tcp_timer_handler(struct ctx *c, union epoll_ref ref)
+void tcp_timer_handler(const struct ctx *c, union epoll_ref ref)
 {
 	struct itimerspec check_armed = { { 0 }, { 0 } };
 	struct tcp_tap_conn *conn = &FLOW(ref.flow)->tcp;
@@ -2210,7 +2210,8 @@ void tcp_timer_handler(struct ctx *c, union epoll_ref ref)
  * @ref:	epoll reference
  * @events:	epoll events bitmap
  */
-void tcp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events)
+void tcp_sock_handler(const struct ctx *c, union epoll_ref ref,
+		      uint32_t events)
 {
 	struct tcp_tap_conn *conn = conn_at_sidx(ref.flowside);
 
@@ -2494,6 +2495,40 @@ static bool tcp_probe_peek_offset_cap(sa_family_t af)
 	return ret;
 }
 
+#ifdef HAS_SND_WND
+/**
+ * tcp_probe_snd_wnd_cap() - Check if TCP_INFO reports tcpi_snd_wnd
+ *
+ * Return: true if supported, false otherwise
+ */
+static bool tcp_probe_snd_wnd_cap(void)
+{
+	struct tcp_info tinfo;
+	socklen_t sl = sizeof(tinfo);
+	int s;
+
+	s = socket(AF_INET, SOCK_STREAM | SOCK_CLOEXEC, IPPROTO_TCP);
+	if (s < 0) {
+		warn_perror("Temporary TCP socket creation failed");
+		return false;
+	}
+
+	if (getsockopt(s, SOL_TCP, TCP_INFO, &tinfo, &sl)) {
+		warn_perror("Failed to get TCP_INFO on temporary socket");
+		close(s);
+		return false;
+	}
+
+	close(s);
+
+	if (sl < (offsetof(struct tcp_info, tcpi_snd_wnd) +
+		  sizeof(tinfo.tcpi_snd_wnd)))
+		return false;
+
+	return true;
+}
+#endif /* HAS_SND_WND */
+
 /**
  * tcp_init() - Get initial sequence, hash secret, initialise per-socket data
  * @c:		Execution context
@@ -2527,6 +2562,12 @@ int tcp_init(struct ctx *c)
 			  (!c->ifi6 || tcp_probe_peek_offset_cap(AF_INET6));
 	debug("SO_PEEK_OFF%ssupported", peek_offset_cap ? " " : " not ");
 
+#ifdef HAS_SND_WND
+	snd_wnd_cap = tcp_probe_snd_wnd_cap();
+#endif
+	debug("TCP_INFO tcpi_snd_wnd field%ssupported",
+	      snd_wnd_cap ? " " : " not ");
+
 	return 0;
 }
 
diff --git a/tcp.h b/tcp.h
index e9ff019..5585924 100644
--- a/tcp.h
+++ b/tcp.h
@@ -10,11 +10,12 @@
 
 struct ctx;
 
-void tcp_timer_handler(struct ctx *c, union epoll_ref ref);
-void tcp_listen_handler(struct ctx *c, union epoll_ref ref,
+void tcp_timer_handler(const struct ctx *c, union epoll_ref ref);
+void tcp_listen_handler(const struct ctx *c, union epoll_ref ref,
 			const struct timespec *now);
-void tcp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events);
-int tcp_tap_handler(struct ctx *c, uint8_t pif, sa_family_t af,
+void tcp_sock_handler(const struct ctx *c, union epoll_ref ref,
+		      uint32_t events);
+int tcp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af,
 		    const void *saddr, const void *daddr,
 		    const struct pool *p, int idx, const struct timespec *now);
 int tcp_sock_init(const struct ctx *c, sa_family_t af, const void *addr,
@@ -58,16 +59,12 @@ union tcp_listen_epoll_ref {
  * @fwd_in:		Port forwarding configuration for inbound packets
  * @fwd_out:		Port forwarding configuration for outbound packets
  * @timer_run:		Timestamp of most recent timer run
- * @kernel_snd_wnd:	Kernel reports sending window (with commit 8f7baad7f035)
  * @pipe_size:		Size of pipes for spliced connections
  */
 struct tcp_ctx {
 	struct fwd_ports fwd_in;
 	struct fwd_ports fwd_out;
 	struct timespec timer_run;
-#ifdef HAS_SND_WND
-	int kernel_snd_wnd;
-#endif
 	size_t pipe_size;
 };
 
diff --git a/tcp_buf.c b/tcp_buf.c
index 1a39846..c886c92 100644
--- a/tcp_buf.c
+++ b/tcp_buf.c
@@ -239,7 +239,7 @@ void tcp_flags_flush(const struct ctx *c)
  * @frames:	Two-dimensional array containing queued frames with sub-iovs
  * @num_frames:	Number of entries in the two arrays to be compared
  */
-static void tcp_revert_seq(struct ctx *c, struct tcp_tap_conn **conns,
+static void tcp_revert_seq(const struct ctx *c, struct tcp_tap_conn **conns,
 			   struct iovec (*frames)[TCP_NUM_IOVS], int num_frames)
 {
 	int i;
@@ -264,7 +264,7 @@ static void tcp_revert_seq(struct ctx *c, struct tcp_tap_conn **conns,
  * tcp_payload_flush() - Send out buffers for segments with data
  * @c:		Execution context
  */
-void tcp_payload_flush(struct ctx *c)
+void tcp_payload_flush(const struct ctx *c)
 {
 	size_t m;
 
@@ -293,7 +293,7 @@ void tcp_payload_flush(struct ctx *c)
  *
  * Return: negative error code on connection reset, 0 otherwise
  */
-int tcp_buf_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags)
+int tcp_buf_send_flag(const struct ctx *c, struct tcp_tap_conn *conn, int flags)
 {
 	struct tcp_flags_t *payload;
 	struct iovec *iov;
@@ -361,7 +361,7 @@ int tcp_buf_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags)
  * @no_csum:	Don't compute IPv4 checksum, use the one from previous buffer
  * @seq:	Sequence number to be sent
  */
-static void tcp_data_to_tap(struct ctx *c, struct tcp_tap_conn *conn,
+static void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn,
 			    ssize_t dlen, int no_csum, uint32_t seq)
 {
 	struct iovec *iov;
@@ -405,7 +405,7 @@ static void tcp_data_to_tap(struct ctx *c, struct tcp_tap_conn *conn,
  *
  * #syscalls recvmsg
  */
-int tcp_buf_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn)
+int tcp_buf_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn)
 {
 	uint32_t wnd_scaled = conn->wnd_from_tap << conn->ws_from_tap;
 	int fill_bufs, send_bufs = 0, last_len, iov_rem = 0;
diff --git a/tcp_buf.h b/tcp_buf.h
index 3db4c56..8d4b615 100644
--- a/tcp_buf.h
+++ b/tcp_buf.h
@@ -9,8 +9,8 @@
 void tcp_sock4_iov_init(const struct ctx *c);
 void tcp_sock6_iov_init(const struct ctx *c);
 void tcp_flags_flush(const struct ctx *c);
-void tcp_payload_flush(struct ctx *c);
-int tcp_buf_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn);
-int tcp_buf_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags);
+void tcp_payload_flush(const struct ctx *c);
+int tcp_buf_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn);
+int tcp_buf_send_flag(const struct ctx *c, struct tcp_tap_conn *conn, int flags);
 
 #endif  /*TCP_BUF_H */
diff --git a/tcp_internal.h b/tcp_internal.h
index aa8bb64..bd634be 100644
--- a/tcp_internal.h
+++ b/tcp_internal.h
@@ -82,7 +82,7 @@ void conn_event_do(const struct ctx *c, struct tcp_tap_conn *conn,
 		conn_event_do(c, conn, event);				\
 	} while (0)
 
-void tcp_rst_do(struct ctx *c, struct tcp_tap_conn *conn);
+void tcp_rst_do(const struct ctx *c, struct tcp_tap_conn *conn);
 #define tcp_rst(c, conn)						\
 	do {								\
 		flow_dbg((conn), "TCP reset at %s:%i", __func__, __LINE__); \
@@ -94,7 +94,7 @@ size_t tcp_l2_buf_fill_headers(const struct tcp_tap_conn *conn,
 			       const uint16_t *check, uint32_t seq);
 int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_tap_conn *conn,
 			  int force_seq, struct tcp_info *tinfo);
-int tcp_prepare_flags(struct ctx *c, struct tcp_tap_conn *conn, int flags,
+int tcp_prepare_flags(const struct ctx *c, struct tcp_tap_conn *conn, int flags,
 		      struct tcphdr *th, char *data, size_t *optlen);
 
 #endif /* TCP_INTERNAL_H */

From 265b2099c7715a3432eef00acd1faea7cbc1eb25 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Wed, 18 Sep 2024 11:53:06 +1000
Subject: [PATCH 030/382] tcp: Simplify ifdef logic in tcp_update_seqack_wnd()

This function has a block conditional on !snd_wnd_cap shortly before a
block guarded by #ifdef HAS_SND_WND (without HAS_SND_WND defined,
snd_wnd_cap is statically false).

Therefore, simplify this down to a single conditional with an else branch.
While we're there, fix some improperly indented closing braces.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 tcp.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/tcp.c b/tcp.c
index cba3f3b..92ac164 100644
--- a/tcp.c
+++ b/tcp.c
@@ -1066,14 +1066,13 @@ int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_tap_conn *conn,
 	if (!tinfo) {
 		if (prev_wnd_to_tap > WINDOW_DEFAULT) {
 			goto out;
-}
+		}
 		tinfo = &tinfo_new;
 		if (getsockopt(s, SOL_TCP, TCP_INFO, tinfo, &sl)) {
 			goto out;
-}
+		}
 	}
 
-#ifdef HAS_SND_WND
 	if ((conn->flags & LOCAL) || tcp_rtt_dst_low(conn)) {
 		new_wnd_to_tap = tinfo->tcpi_snd_wnd;
 	} else {
@@ -1081,7 +1080,6 @@ int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_tap_conn *conn,
 		new_wnd_to_tap = MIN((int)tinfo->tcpi_snd_wnd,
 				     SNDBUF_GET(conn));
 	}
-#endif
 
 	new_wnd_to_tap = MIN(new_wnd_to_tap, MAX_WINDOW);
 	if (!(conn->events & ESTABLISHED))

From bb41901c719f9ba422b538f773025dad5c398823 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Wed, 18 Sep 2024 11:53:07 +1000
Subject: [PATCH 031/382] tcp: Make tcp_update_seqack_wnd()s force_seq
 parameter explicitly boolean

This parameter is already treated as a boolean internally.  Make it a
'bool' type for clarity.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 tcp.c          | 6 +++---
 tcp_buf.c      | 2 +-
 tcp_internal.h | 2 +-
 3 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/tcp.c b/tcp.c
index 92ac164..787df63 100644
--- a/tcp.c
+++ b/tcp.c
@@ -1020,7 +1020,7 @@ size_t tcp_l2_buf_fill_headers(const struct tcp_tap_conn *conn,
  * Return: 1 if sequence or window were updated, 0 otherwise
  */
 int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_tap_conn *conn,
-			  int force_seq, struct tcp_info *tinfo)
+			  bool force_seq, struct tcp_info *tinfo)
 {
 	uint32_t prev_wnd_to_tap = conn->wnd_to_tap << conn->ws_to_tap;
 	uint32_t prev_ack_to_tap = conn->seq_ack_to_tap;
@@ -1157,7 +1157,7 @@ int tcp_prepare_flags(const struct ctx *c, struct tcp_tap_conn *conn,
 	if (!(conn->flags & LOCAL))
 		tcp_rtt_dst_check(conn, &tinfo);
 
-	if (!tcp_update_seqack_wnd(c, conn, flags, &tinfo) && !flags)
+	if (!tcp_update_seqack_wnd(c, conn, !!flags, &tinfo) && !flags)
 		return 0;
 
 	*optlen = 0;
@@ -2240,7 +2240,7 @@ void tcp_sock_handler(const struct ctx *c, union epoll_ref ref,
 			tcp_data_from_sock(c, conn);
 
 		if (events & EPOLLOUT)
-			tcp_update_seqack_wnd(c, conn, 0, NULL);
+			tcp_update_seqack_wnd(c, conn, false, NULL);
 
 		return;
 	}
diff --git a/tcp_buf.c b/tcp_buf.c
index c886c92..83f91a3 100644
--- a/tcp_buf.c
+++ b/tcp_buf.c
@@ -511,7 +511,7 @@ int tcp_buf_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn)
 	last_len = sendlen - (send_bufs - 1) * mss;
 
 	/* Likely, some new data was acked too. */
-	tcp_update_seqack_wnd(c, conn, 0, NULL);
+	tcp_update_seqack_wnd(c, conn, false, NULL);
 
 	/* Finally, queue to tap */
 	dlen = mss;
diff --git a/tcp_internal.h b/tcp_internal.h
index bd634be..a450d85 100644
--- a/tcp_internal.h
+++ b/tcp_internal.h
@@ -93,7 +93,7 @@ size_t tcp_l2_buf_fill_headers(const struct tcp_tap_conn *conn,
 			       struct iovec *iov, size_t dlen,
 			       const uint16_t *check, uint32_t seq);
 int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_tap_conn *conn,
-			  int force_seq, struct tcp_info *tinfo);
+			  bool force_seq, struct tcp_info *tinfo);
 int tcp_prepare_flags(const struct ctx *c, struct tcp_tap_conn *conn, int flags,
 		      struct tcphdr *th, char *data, size_t *optlen);
 

From bfc294b90dc46d132a56dc0a2ae118f2bea5a266 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Wed, 18 Sep 2024 20:44:05 +1000
Subject: [PATCH 032/382] util: Add helper to write() all of a buffer

write(2) might not write all the data it is given.  Add a write_all_buf()
helper to keep calling it until all the given data is written, or we get an
error.

Currently we use write_remainder() to do this operation in pcap_frame().
That's a little awkward since it requires constructing an iovec, and future
changes we want to make to write_remainder() will be easier in terms of
this single buffer helper.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 pcap.c |  3 +--
 util.c | 25 +++++++++++++++++++++++++
 util.h |  1 +
 3 files changed, 27 insertions(+), 2 deletions(-)
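
For reference, the pattern the new helper implements, as a standalone
sketch (simplified, and without the EINTR handling a later patch adds):

  #include <unistd.h>

  /* Keep calling write() until the whole buffer went out or a write
   * failed: write(2) may accept fewer bytes than requested, e.g. on a
   * pipe or socket whose buffer is nearly full.
   */
  int write_whole(int fd, const void *buf, size_t len)
  {
  	const char *p = buf;

  	while (len) {
  		ssize_t rc = write(fd, p, len);

  		if (rc < 0)
  			return -1;	/* errno is left for the caller */

  		p += rc;
  		len -= rc;
  	}

  	return 0;
  }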

diff --git a/pcap.c b/pcap.c
index 46cc4b0..e6b5ced 100644
--- a/pcap.c
+++ b/pcap.c
@@ -86,9 +86,8 @@ static void pcap_frame(const struct iovec *iov, size_t iovcnt,
 		.caplen = l2len,
 		.len = l2len
 	};
-	struct iovec hiov = { &h, sizeof(h) };
 
-	if (write_remainder(pcap_fd, &hiov, 1, 0) < 0 ||
+	if (write_all_buf(pcap_fd, &h, sizeof(h)) < 0 ||
 	    write_remainder(pcap_fd, iov, iovcnt, offset) < 0)
 		debug_perror("Cannot log packet, length %zu", l2len);
 }
diff --git a/util.c b/util.c
index eede4e5..7db7c2e 100644
--- a/util.c
+++ b/util.c
@@ -582,6 +582,31 @@ int do_clone(int (*fn)(void *), char *stack_area, size_t stack_size, int flags,
 #endif
 }
 
+/* write_all_buf() - write all of a buffer to an fd
+ * @fd:		File descriptor
+ * @buf:	Pointer to base of buffer
+ * @len:	Length of buffer
+ *
+ * Return: 0 on success, -1 on error (with errno set)
+ *
+ * #syscalls write
+ */
+int write_all_buf(int fd, const void *buf, size_t len)
+{
+	const char *p = buf;
+	size_t left = len;
+
+	while (left) {
+		ssize_t rc = write(fd, p, left);
+
+		if (rc < 0)
+			return -1;
+		p += rc;
+		left -= rc;
+	}
+	return 0;
+}
+
 /* write_remainder() - write the tail of an IO vector to an fd
  * @fd:		File descriptor
  * @iov:	IO vector
diff --git a/util.h b/util.h
index c7a59d5..5e67f1f 100644
--- a/util.h
+++ b/util.h
@@ -200,6 +200,7 @@ void pidfile_write(int fd, pid_t pid);
 int __daemon(int pidfile_fd, int devnull_fd);
 int fls(unsigned long x);
 int write_file(const char *path, const char *buf);
+int write_all_buf(int fd, const void *buf, size_t len);
 int write_remainder(int fd, const struct iovec *iov, size_t iovcnt, size_t skip);
 void close_open_files(int argc, char **argv);
 

From d836d9e345865245bab28100a6065d6fa7b6a00c Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Wed, 18 Sep 2024 20:44:06 +1000
Subject: [PATCH 033/382] util: Remove possible quadratic behaviour from
 write_remainder()

write_remainder() steps through the buffers in an IO vector writing out
everything past a certain byte offset.  However, on each iteration it
rescans the buffer from the beginning to find out where we're up to.  With
an unfortunate set of write sizes this could lead to quadratic behaviour.

In an even less likely set of circumstances (total vector length > maximum
size_t) the 'skip' variable could overflow.  This is one factor in a
longstanding Coverity error we've seen (although I still can't figure out
the remainder of its complaint).

Rework write_remainder() to always work out our new position in the vector
relative to our old/current position, rather than starting from the
beginning each time.  As a bonus this seems to fix the Coverity error.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Reviewed-by: Markus Armbruster <armbru@redhat.com>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 util.c | 27 +++++++++++++++++----------
 1 file changed, 17 insertions(+), 10 deletions(-)
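
As a rough worked example of the cost: with an N-entry vector where each
writev() call happens to consume about one buffer's worth of data,
re-deriving the position from the start of the vector skips over
0, 1, 2, ... N-1 already-written entries across the N iterations, roughly
N * (N - 1) / 2 steps in total, whereas advancing relative to the current
position costs only about N steps overall.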

diff --git a/util.c b/util.c
index 7db7c2e..87309c5 100644
--- a/util.c
+++ b/util.c
@@ -597,10 +597,15 @@ int write_all_buf(int fd, const void *buf, size_t len)
 	size_t left = len;
 
 	while (left) {
-		ssize_t rc = write(fd, p, left);
+		ssize_t rc;
+
+		do
+			rc = write(fd, p, left);
+		while ((rc < 0) && errno == EINTR);
 
 		if (rc < 0)
 			return -1;
+
 		p += rc;
 		left -= rc;
 	}
@@ -615,28 +620,30 @@ int write_all_buf(int fd, const void *buf, size_t len)
  *
  * Return: 0 on success, -1 on error (with errno set)
  *
- * #syscalls write writev
+ * #syscalls writev
  */
 int write_remainder(int fd, const struct iovec *iov, size_t iovcnt, size_t skip)
 {
-	size_t offset, i;
+	size_t i = 0, offset;
 
-	while ((i = iov_skip_bytes(iov, iovcnt, skip, &offset)) < iovcnt) {
+	while ((i += iov_skip_bytes(iov + i, iovcnt - i, skip, &offset)) < iovcnt) {
 		ssize_t rc;
 
 		if (offset) {
-			rc = write(fd, (char *)iov[i].iov_base + offset,
-				   iov[i].iov_len - offset);
-		} else {
-			rc = writev(fd, &iov[i], iovcnt - i);
+			/* Write the remainder of the partially written buffer */
+			if (write_all_buf(fd, (char *)iov[i].iov_base + offset,
+					  iov[i].iov_len - offset) < 0)
+				return -1;
+			i++;
 		}
 
+		/* Write as much of the remaining whole buffers as we can */
+		rc = writev(fd, &iov[i], iovcnt - i);
 		if (rc < 0)
 			return -1;
 
-		skip += rc;
+		skip = rc;
 	}
-
 	return 0;
 }
 

From 4fe5f4e813b553f4877ffa2b485d941bb9f85ca2 Mon Sep 17 00:00:00 2001
From: Laurent Vivier <lvivier@redhat.com>
Date: Wed, 18 Sep 2024 15:13:27 +0200
Subject: [PATCH 034/382] udp: Allow checksum to be disabled

In some cases we do not need to set the UDP checksum. Add a parameter
to udp_update_hdr4() and udp_update_hdr6() to disable it.

Signed-off-by: Laurent Vivier <lvivier@redhat.com>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 udp.c | 58 ++++++++++++++++++++++++++++++++++++++++------------------
 1 file changed, 40 insertions(+), 18 deletions(-)

diff --git a/udp.c b/udp.c
index 2ba00c9..7b28313 100644
--- a/udp.c
+++ b/udp.c
@@ -294,15 +294,17 @@ static void udp_splice_send(const struct ctx *c, size_t start, size_t n,
 
 /**
  * udp_update_hdr4() - Update headers for one IPv4 datagram
- * @ip4h:	Pre-filled IPv4 header (except for tot_len and saddr)
- * @bp:		Pointer to udp_payload_t to update
- * @toside:	Flowside for destination side
- * @dlen:	Length of UDP payload
+ * @ip4h:		Pre-filled IPv4 header (except for tot_len and saddr)
+ * @bp:			Pointer to udp_payload_t to update
+ * @toside:		Flowside for destination side
+ * @dlen:		Length of UDP payload
+ * @no_udp_csum:	Do not set UDP checksum
  *
  * Return: size of IPv4 payload (UDP header + data)
  */
 static size_t udp_update_hdr4(struct iphdr *ip4h, struct udp_payload_t *bp,
-			      const struct flowside *toside, size_t dlen)
+			      const struct flowside *toside, size_t dlen,
+			      bool no_udp_csum)
 {
 	const struct in_addr *src = inany_v4(&toside->oaddr);
 	const struct in_addr *dst = inany_v4(&toside->eaddr);
@@ -319,22 +321,28 @@ static size_t udp_update_hdr4(struct iphdr *ip4h, struct udp_payload_t *bp,
 	bp->uh.source = htons(toside->oport);
 	bp->uh.dest = htons(toside->eport);
 	bp->uh.len = htons(l4len);
-	csum_udp4(&bp->uh, *src, *dst, bp->data, dlen);
+	if (no_udp_csum)
+		bp->uh.check = 0;
+	else
+		csum_udp4(&bp->uh, *src, *dst, bp->data, dlen);
 
 	return l4len;
 }
 
 /**
  * udp_update_hdr6() - Update headers for one IPv6 datagram
- * @ip6h:	Pre-filled IPv6 header (except for payload_len and addresses)
- * @bp:		Pointer to udp_payload_t to update
- * @toside:	Flowside for destination side
- * @dlen:	Length of UDP payload
+ * @ip6h:		Pre-filled IPv6 header (except for payload_len and
+ * 			addresses)
+ * @bp:			Pointer to udp_payload_t to update
+ * @toside:		Flowside for destination side
+ * @dlen:		Length of UDP payload
+ * @no_udp_csum:	Do not set UDP checksum
  *
  * Return: size of IPv6 payload (UDP header + data)
  */
 static size_t udp_update_hdr6(struct ipv6hdr *ip6h, struct udp_payload_t *bp,
-			      const struct flowside *toside, size_t dlen)
+			      const struct flowside *toside, size_t dlen,
+			      bool no_udp_csum)
 {
 	uint16_t l4len = dlen + sizeof(bp->uh);
 
@@ -348,7 +356,16 @@ static size_t udp_update_hdr6(struct ipv6hdr *ip6h, struct udp_payload_t *bp,
 	bp->uh.source = htons(toside->oport);
 	bp->uh.dest = htons(toside->eport);
 	bp->uh.len = ip6h->payload_len;
-	csum_udp6(&bp->uh, &toside->oaddr.a6, &toside->eaddr.a6, bp->data, dlen);
+	if (no_udp_csum) {
+		/* 0 is an invalid checksum for UDP IPv6 and dropped by
+		 * the kernel stack, even if the checksum is disabled by virtio
+		 * flags. We need to put any non-zero value here.
+		 */
+		bp->uh.check = 0xffff;
+	} else {
+		csum_udp6(&bp->uh, &toside->oaddr.a6, &toside->eaddr.a6,
+			  bp->data, dlen);
+	}
 
 	return l4len;
 }
@@ -358,9 +375,11 @@ static size_t udp_update_hdr6(struct ipv6hdr *ip6h, struct udp_payload_t *bp,
  * @mmh:	Receiving mmsghdr array
  * @idx:	Index of the datagram to prepare
  * @toside:	Flowside for destination side
+ * @no_udp_csum: Do not set UDP checksum
  */
-static void udp_tap_prepare(const struct mmsghdr *mmh, unsigned idx,
-			    const struct flowside *toside)
+static void udp_tap_prepare(const struct mmsghdr *mmh,
+			    unsigned idx, const struct flowside *toside,
+			    bool no_udp_csum)
 {
 	struct iovec (*tap_iov)[UDP_NUM_IOVS] = &udp_l2_iov[idx];
 	struct udp_payload_t *bp = &udp_payload[idx];
@@ -368,13 +387,15 @@ static void udp_tap_prepare(const struct mmsghdr *mmh, unsigned idx,
 	size_t l4len;
 
 	if (!inany_v4(&toside->eaddr) || !inany_v4(&toside->oaddr)) {
-		l4len = udp_update_hdr6(&bm->ip6h, bp, toside, mmh[idx].msg_len);
+		l4len = udp_update_hdr6(&bm->ip6h, bp, toside,
+					mmh[idx].msg_len, no_udp_csum);
 		tap_hdr_update(&bm->taph, l4len + sizeof(bm->ip6h) +
 			       sizeof(udp6_eth_hdr));
 		(*tap_iov)[UDP_IOV_ETH] = IOV_OF_LVALUE(udp6_eth_hdr);
 		(*tap_iov)[UDP_IOV_IP] = IOV_OF_LVALUE(bm->ip6h);
 	} else {
-		l4len = udp_update_hdr4(&bm->ip4h, bp, toside, mmh[idx].msg_len);
+		l4len = udp_update_hdr4(&bm->ip4h, bp, toside,
+					mmh[idx].msg_len, no_udp_csum);
 		tap_hdr_update(&bm->taph, l4len + sizeof(bm->ip4h) +
 			       sizeof(udp4_eth_hdr));
 		(*tap_iov)[UDP_IOV_ETH] = IOV_OF_LVALUE(udp4_eth_hdr);
@@ -565,7 +586,8 @@ void udp_listen_sock_handler(const struct ctx *c, union epoll_ref ref,
 				udp_splice_prepare(udp_mh_recv, i);
 			} else if (batchpif == PIF_TAP) {
 				udp_tap_prepare(udp_mh_recv, i,
-						flowside_at_sidx(batchsidx));
+						flowside_at_sidx(batchsidx),
+						false);
 			}
 
 			if (++i >= n)
@@ -636,7 +658,7 @@ void udp_reply_sock_handler(const struct ctx *c, union epoll_ref ref,
 		if (pif_is_socket(topif))
 			udp_splice_prepare(udp_mh_recv, i);
 		else if (topif == PIF_TAP)
-			udp_tap_prepare(udp_mh_recv, i, toside);
+			udp_tap_prepare(udp_mh_recv, i, toside, false);
 		/* Restore sockaddr length clobbered by recvmsg() */
 		udp_mh_recv[i].msg_hdr.msg_namelen = sizeof(udp_meta[i].s_in);
 	}

From 8f8c4d27eb2e023fd80986d8fdf8a68b37e3877e Mon Sep 17 00:00:00 2001
From: Laurent Vivier <lvivier@redhat.com>
Date: Wed, 18 Sep 2024 15:13:28 +0200
Subject: [PATCH 035/382] tcp: Allow checksum to be disabled

In some cases we do not need to set the TCP checksum. Add a parameter
to tcp_fill_headers4() and tcp_fill_headers6() to disable it.

Signed-off-by: Laurent Vivier <lvivier@redhat.com>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 tcp.c          | 52 ++++++++++++++++++++++++++++++--------------------
 tcp_buf.c      |  8 +++++---
 tcp_internal.h |  3 ++-
 3 files changed, 38 insertions(+), 25 deletions(-)

diff --git a/tcp.c b/tcp.c
index 787df63..1962fcc 100644
--- a/tcp.c
+++ b/tcp.c
@@ -899,13 +899,14 @@ static void tcp_fill_header(struct tcphdr *th,
 
 /**
  * tcp_fill_headers4() - Fill 802.3, IPv4, TCP headers in pre-cooked buffers
- * @conn:	Connection pointer
- * @taph:	tap backend specific header
- * @iph:	Pointer to IPv4 header
- * @th:		Pointer to TCP header
- * @dlen:	TCP payload length
- * @check:	Checksum, if already known
- * @seq:	Sequence number for this segment
+ * @conn:		Connection pointer
+ * @taph:		tap backend specific header
+ * @iph:		Pointer to IPv4 header
+ * @th:			Pointer to TCP header
+ * @dlen:		TCP payload length
+ * @check:		Checksum, if already known
+ * @seq:		Sequence number for this segment
+ * @no_tcp_csum:	Do not set TCP checksum
  *
  * Return: The IPv4 payload length, host order
  */
@@ -913,7 +914,7 @@ static size_t tcp_fill_headers4(const struct tcp_tap_conn *conn,
 				struct tap_hdr *taph,
 				struct iphdr *iph, struct tcphdr *th,
 				size_t dlen, const uint16_t *check,
-				uint32_t seq)
+				uint32_t seq, bool no_tcp_csum)
 {
 	const struct flowside *tapside = TAPFLOW(conn);
 	const struct in_addr *src4 = inany_v4(&tapside->oaddr);
@@ -932,7 +933,10 @@ static size_t tcp_fill_headers4(const struct tcp_tap_conn *conn,
 
 	tcp_fill_header(th, conn, seq);
 
-	tcp_update_check_tcp4(iph, th);
+	if (no_tcp_csum)
+		th->check = 0;
+	else
+		tcp_update_check_tcp4(iph, th);
 
 	tap_hdr_update(taph, l3len + sizeof(struct ethhdr));
 
@@ -941,20 +945,21 @@ static size_t tcp_fill_headers4(const struct tcp_tap_conn *conn,
 
 /**
  * tcp_fill_headers6() - Fill 802.3, IPv6, TCP headers in pre-cooked buffers
- * @conn:	Connection pointer
- * @taph:	tap backend specific header
- * @ip6h:	Pointer to IPv6 header
- * @th:		Pointer to TCP header
- * @dlen:	TCP payload length
- * @check:	Checksum, if already known
- * @seq:	Sequence number for this segment
+ * @conn:		Connection pointer
+ * @taph:		tap backend specific header
+ * @ip6h:		Pointer to IPv6 header
+ * @th:			Pointer to TCP header
+ * @dlen:		TCP payload length
+ * @check:		Checksum, if already known
+ * @seq:		Sequence number for this segment
+ * @no_tcp_csum:	Do not set TCP checksum
  *
  * Return: The IPv6 payload length, host order
  */
 static size_t tcp_fill_headers6(const struct tcp_tap_conn *conn,
 				struct tap_hdr *taph,
 				struct ipv6hdr *ip6h, struct tcphdr *th,
-				size_t dlen, uint32_t seq)
+				size_t dlen, uint32_t seq, bool no_tcp_csum)
 {
 	const struct flowside *tapside = TAPFLOW(conn);
 	size_t l4len = dlen + sizeof(*th);
@@ -973,7 +978,10 @@ static size_t tcp_fill_headers6(const struct tcp_tap_conn *conn,
 
 	tcp_fill_header(th, conn, seq);
 
-	tcp_update_check_tcp6(ip6h, th);
+	if (no_tcp_csum)
+		th->check = 0;
+	else
+		tcp_update_check_tcp6(ip6h, th);
 
 	tap_hdr_update(taph, l4len + sizeof(*ip6h) + sizeof(struct ethhdr));
 
@@ -987,12 +995,14 @@ static size_t tcp_fill_headers6(const struct tcp_tap_conn *conn,
  * @dlen:	TCP payload length
  * @check:	Checksum, if already known
  * @seq:	Sequence number for this segment
+ * @no_tcp_csum: Do not set TCP checksum
  *
  * Return: IP payload length, host order
  */
 size_t tcp_l2_buf_fill_headers(const struct tcp_tap_conn *conn,
 			       struct iovec *iov, size_t dlen,
-			       const uint16_t *check, uint32_t seq)
+			       const uint16_t *check, uint32_t seq,
+			       bool no_tcp_csum)
 {
 	const struct flowside *tapside = TAPFLOW(conn);
 	const struct in_addr *a4 = inany_v4(&tapside->oaddr);
@@ -1001,13 +1011,13 @@ size_t tcp_l2_buf_fill_headers(const struct tcp_tap_conn *conn,
 		return tcp_fill_headers4(conn, iov[TCP_IOV_TAP].iov_base,
 					 iov[TCP_IOV_IP].iov_base,
 					 iov[TCP_IOV_PAYLOAD].iov_base, dlen,
-					 check, seq);
+					 check, seq, no_tcp_csum);
 	}
 
 	return tcp_fill_headers6(conn, iov[TCP_IOV_TAP].iov_base,
 				 iov[TCP_IOV_IP].iov_base,
 				 iov[TCP_IOV_PAYLOAD].iov_base, dlen,
-				 seq);
+				 seq, no_tcp_csum);
 }
 
 /**
diff --git a/tcp_buf.c b/tcp_buf.c
index 83f91a3..ffbff5e 100644
--- a/tcp_buf.c
+++ b/tcp_buf.c
@@ -320,7 +320,7 @@ int tcp_buf_send_flag(const struct ctx *c, struct tcp_tap_conn *conn, int flags)
 		return ret;
 	}
 
-	l4len = tcp_l2_buf_fill_headers(conn, iov, optlen, NULL, seq);
+	l4len = tcp_l2_buf_fill_headers(conn, iov, optlen, NULL, seq, false);
 	iov[TCP_IOV_PAYLOAD].iov_len = l4len;
 
 	if (flags & DUP_ACK) {
@@ -381,7 +381,8 @@ static void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn,
 		tcp4_frame_conns[tcp4_payload_used] = conn;
 
 		iov = tcp4_l2_iov[tcp4_payload_used++];
-		l4len = tcp_l2_buf_fill_headers(conn, iov, dlen, check, seq);
+		l4len = tcp_l2_buf_fill_headers(conn, iov, dlen, check, seq,
+						false);
 		iov[TCP_IOV_PAYLOAD].iov_len = l4len;
 		if (tcp4_payload_used > TCP_FRAMES_MEM - 1)
 			tcp_payload_flush(c);
@@ -389,7 +390,8 @@ static void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn,
 		tcp6_frame_conns[tcp6_payload_used] = conn;
 
 		iov = tcp6_l2_iov[tcp6_payload_used++];
-		l4len = tcp_l2_buf_fill_headers(conn, iov, dlen, NULL, seq);
+		l4len = tcp_l2_buf_fill_headers(conn, iov, dlen, NULL, seq,
+						false);
 		iov[TCP_IOV_PAYLOAD].iov_len = l4len;
 		if (tcp6_payload_used > TCP_FRAMES_MEM - 1)
 			tcp_payload_flush(c);
diff --git a/tcp_internal.h b/tcp_internal.h
index a450d85..de06db1 100644
--- a/tcp_internal.h
+++ b/tcp_internal.h
@@ -91,7 +91,8 @@ void tcp_rst_do(const struct ctx *c, struct tcp_tap_conn *conn);
 
 size_t tcp_l2_buf_fill_headers(const struct tcp_tap_conn *conn,
 			       struct iovec *iov, size_t dlen,
-			       const uint16_t *check, uint32_t seq);
+			       const uint16_t *check, uint32_t seq,
+			       bool no_tcp_csum);
 int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_tap_conn *conn,
 			  bool force_seq, struct tcp_info *tinfo);
 int tcp_prepare_flags(const struct ctx *c, struct tcp_tap_conn *conn, int flags,

From 204e77cd11b2df720c9acd35d562e1ed868304b4 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Fri, 20 Sep 2024 14:12:41 +1000
Subject: [PATCH 036/382] udp: Don't attempt to get dual-stack sockets in
 nonsensical cases

To save some kernel memory we try to use "dual stack" sockets (that listen
to both IPv4 and IPv6 traffic) when possible. However, udp_sock_init()
attempts to do this in some cases that can't work.  Specifically we can
only do this when listening on any address.  That's never true for the
ns (splicing) case, because we always listen on loopback.  For the !ns
case and AF_UNSPEC case, addr should always be NULL, but add an assert to
verify.

This is harmless: if addr is non-NULL, sock_l4() will just fail and we'll
fall back to the other path.  But, it's messy and makes some upcoming
changes harder, so avoid attempting this in cases we know can't work.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
---
 udp.c | 19 +++++++------------
 1 file changed, 7 insertions(+), 12 deletions(-)
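
For background, a "dual stack" socket here is an AF_INET6 socket bound to
the IPv6 wildcard address with IPV6_V6ONLY cleared, so that it also
receives IPv4 traffic (presented as v4-mapped addresses). A minimal
standalone sketch, separate from passt's sock_l4() path and with a made-up
name:

  #include <arpa/inet.h>
  #include <netinet/in.h>
  #include <sys/socket.h>
  #include <unistd.h>

  int dual_stack_udp_socket(in_port_t port)
  {
  	struct sockaddr_in6 a = { .sin6_family = AF_INET6,
  				  .sin6_addr   = IN6ADDR_ANY_INIT };
  	int no = 0, s;

  	s = socket(AF_INET6, SOCK_DGRAM | SOCK_CLOEXEC, IPPROTO_UDP);
  	if (s < 0)
  		return -1;

  	a.sin6_port = htons(port);

  	/* 0 means "not v6-only", i.e. accept IPv4 as well */
  	if (setsockopt(s, IPPROTO_IPV6, IPV6_V6ONLY, &no, sizeof(no)) ||
  	    bind(s, (struct sockaddr *)&a, sizeof(a))) {
  		close(s);
  		return -1;
  	}

  	return s;
  }

This is also why the trick only makes sense when listening on "any"
address: a v6 socket bound to loopback, or to any other specific IPv6
address, can't cover an IPv4 address at the same time.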

diff --git a/udp.c b/udp.c
index 7b28313..8cea80c 100644
--- a/udp.c
+++ b/udp.c
@@ -803,21 +803,16 @@ int udp_sock_init(const struct ctx *c, int ns, sa_family_t af,
 
 	ASSERT(!c->no_udp);
 
-	if (af == AF_UNSPEC && c->ifi4 && c->ifi6) {
+	if (af == AF_UNSPEC && c->ifi4 && c->ifi6 && !ns) {
 		int s;
 
+		ASSERT(!addr);
+
 		/* Attempt to get a dual stack socket */
-		if (!ns) {
-			s = sock_l4(c, AF_UNSPEC, EPOLL_TYPE_UDP_LISTEN,
-				    addr, ifname, port, uref.u32);
-			udp_splice_init[V4][port] = s < 0 ? -1 : s;
-			udp_splice_init[V6][port] = s < 0 ? -1 : s;
-		} else {
-			s = sock_l4(c, AF_UNSPEC, EPOLL_TYPE_UDP_LISTEN,
-				    &in4addr_loopback, ifname, port, uref.u32);
-			udp_splice_ns[V4][port] = s < 0 ? -1 : s;
-			udp_splice_ns[V6][port] = s < 0 ? -1 : s;
-		}
+		s = sock_l4(c, AF_UNSPEC, EPOLL_TYPE_UDP_LISTEN,
+			    NULL, ifname, port, uref.u32);
+		udp_splice_init[V4][port] = s < 0 ? -1 : s;
+		udp_splice_init[V6][port] = s < 0 ? -1 : s;
 		if (IN_INTERVAL(0, FD_REF_MAX, s))
 			return 0;
 	}

From b8d4fac6a2e77a93d9b0d291cd1ca803a29f890e Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Fri, 20 Sep 2024 14:12:42 +1000
Subject: [PATCH 037/382] util, pif: Replace sock_l4() with pif_sock_l4()

The sock_l4() function is very convenient for creating sockets bound to
a given address, but its interface has some problems.

Most importantly, the address and port alone aren't enough in some cases.
For link-local addresses (at least) we also need the pif in order to
properly construct a socket address.  This case doesn't yet arise, but
it might cause us trouble in future.

Additionally, sock_l4() can take AF_UNSPEC with the special meaning that it
should attempt to create a "dual stack" socket which will respond to both
IPv4 and IPv6 traffic.  This only makes sense if there is no specific
address given.  We verify this at runtime, but it would be nicer if we
could enforce it structurally.

For sockets associated specifically with a single flow we already replaced
sock_l4() with flowside_sock_l4() which avoids those problems.  Now,
replace all the remaining users with a new pif_sock_l4() which also takes
an explicit pif.

The new function takes the address as an inany *, with NULL indicating the
dual stack case.  This does add some complexity in some of the callers;
however, future planned cleanups should make this go away again.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
---
 pif.c  | 42 ++++++++++++++++++++++++++++++++++++++++++
 pif.h  |  3 +++
 tcp.c  | 22 +++++++++++++++++-----
 udp.c  | 34 ++++++++++++++++++++++------------
 util.c | 52 ----------------------------------------------------
 util.h |  3 ---
 6 files changed, 84 insertions(+), 72 deletions(-)
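
To illustrate the link-local point (a sketch only, not passt code; the
helper name is invented): a bare IPv6 link-local address isn't enough to
build a usable socket address, because the interface it lives on, which
the pif can identify, also has to go into sin6_scope_id:

  #include <net/if.h>
  #include <netinet/in.h>

  /* Fill in the scope for link-local addresses (fe80::/10) */
  void set_scope_if_linklocal(struct sockaddr_in6 *sa6, const char *ifname)
  {
  	if (IN6_IS_ADDR_LINKLOCAL(&sa6->sin6_addr))
  		sa6->sin6_scope_id = if_nametoindex(ifname);
  }

This mirrors what the removed sock_l4() did with c->ifi6, and it's the
kind of decision that needs to know which interface a socket belongs to,
not just its address and port.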

diff --git a/pif.c b/pif.c
index a099e31..592fafa 100644
--- a/pif.c
+++ b/pif.c
@@ -59,3 +59,45 @@ void pif_sockaddr(const struct ctx *c, union sockaddr_inany *sa, socklen_t *sl,
 		*sl = sizeof(sa->sa6);
 	}
 }
+
+/** pif_sock_l4() - Open a socket bound to an address on a specified interface
+ * @c:		Execution context
+ * @type:	Socket epoll type
+ * @pif:	Interface for this socket
+ * @addr:	Address to bind to, or NULL for dual-stack any
+ * @ifname:	Interface for binding, NULL for any
+ * @port:	Port number to bind to (host byte order)
+ * @data:	epoll reference portion for protocol handlers
+ *
+ * NOTE: For namespace pifs, this must be called having already entered the
+ * relevant namespace.
+ *
+ * Return: newly created socket, negative error code on failure
+ */
+int pif_sock_l4(const struct ctx *c, enum epoll_type type, uint8_t pif,
+		const union inany_addr *addr, const char *ifname,
+		in_port_t port, uint32_t data)
+{
+	union sockaddr_inany sa = {
+		.sa6.sin6_family = AF_INET6,
+		.sa6.sin6_addr = in6addr_any,
+		.sa6.sin6_port = htons(port),
+	};
+	socklen_t sl;
+
+	ASSERT(pif_is_socket(pif));
+
+	if (pif == PIF_SPLICE) {
+		/* Sanity checks */
+		ASSERT(!ifname);
+		ASSERT(addr && inany_is_loopback(addr));
+	}
+
+	if (!addr)
+		return sock_l4_sa(c, type, &sa, sizeof(sa.sa6),
+				  ifname, false, data);
+
+	pif_sockaddr(c, &sa, &sl, pif, addr, port);
+	return sock_l4_sa(c, type, &sa, sl,
+			  ifname, sa.sa_family == AF_INET6, data);
+}
diff --git a/pif.h b/pif.h
index 8777bb5..f029282 100644
--- a/pif.h
+++ b/pif.h
@@ -59,5 +59,8 @@ static inline bool pif_is_socket(uint8_t pif)
 
 void pif_sockaddr(const struct ctx *c, union sockaddr_inany *sa, socklen_t *sl,
 		  uint8_t pif, const union inany_addr *addr, in_port_t port);
+int pif_sock_l4(const struct ctx *c, enum epoll_type type, uint8_t pif,
+		const union inany_addr *addr, const char *ifname,
+		in_port_t port, uint32_t data);
 
 #endif /* PIF_H */
diff --git a/tcp.c b/tcp.c
index 1962fcc..49e0cfe 100644
--- a/tcp.c
+++ b/tcp.c
@@ -2291,7 +2291,19 @@ static int tcp_sock_init_af(const struct ctx *c, sa_family_t af, in_port_t port,
 	};
 	int s;
 
-	s = sock_l4(c, af, EPOLL_TYPE_TCP_LISTEN, addr, ifname, port, tref.u32);
+	if (af == AF_UNSPEC) {
+		ASSERT(!addr);
+		s = pif_sock_l4(c, EPOLL_TYPE_TCP_LISTEN, PIF_HOST, NULL,
+				ifname, port, tref.u32);
+	} else {
+		union inany_addr aany = af == AF_INET ? inany_any4 : inany_any6;
+
+		if (addr)
+			inany_from_af(&aany, af, addr);
+
+		s = pif_sock_l4(c, EPOLL_TYPE_TCP_LISTEN, PIF_HOST, &aany,
+				ifname, port, tref.u32);
+	}
 
 	if (c->tcp.fwd_in.mode == FWD_AUTO) {
 		if (af == AF_INET  || af == AF_UNSPEC)
@@ -2357,8 +2369,8 @@ static void tcp_ns_sock_init4(const struct ctx *c, in_port_t port)
 
 	ASSERT(c->mode == MODE_PASTA);
 
-	s = sock_l4(c, AF_INET, EPOLL_TYPE_TCP_LISTEN, &in4addr_loopback,
-		    NULL, port, tref.u32);
+	s = pif_sock_l4(c, EPOLL_TYPE_TCP_LISTEN, PIF_SPLICE, &inany_loopback4,
+			NULL, port, tref.u32);
 	if (s >= 0)
 		tcp_sock_set_bufsize(c, s);
 	else
@@ -2383,8 +2395,8 @@ static void tcp_ns_sock_init6(const struct ctx *c, in_port_t port)
 
 	ASSERT(c->mode == MODE_PASTA);
 
-	s = sock_l4(c, AF_INET6, EPOLL_TYPE_TCP_LISTEN, &in6addr_loopback,
-		    NULL, port, tref.u32);
+	s = pif_sock_l4(c, EPOLL_TYPE_TCP_LISTEN, PIF_SPLICE, &inany_loopback6,
+			NULL, port, tref.u32);
 	if (s >= 0)
 		tcp_sock_set_bufsize(c, s);
 	else
diff --git a/udp.c b/udp.c
index 8cea80c..b3d4a64 100644
--- a/udp.c
+++ b/udp.c
@@ -809,8 +809,8 @@ int udp_sock_init(const struct ctx *c, int ns, sa_family_t af,
 		ASSERT(!addr);
 
 		/* Attempt to get a dual stack socket */
-		s = sock_l4(c, AF_UNSPEC, EPOLL_TYPE_UDP_LISTEN,
-			    NULL, ifname, port, uref.u32);
+		s = pif_sock_l4(c, EPOLL_TYPE_UDP_LISTEN, PIF_HOST,
+				NULL, ifname, port, uref.u32);
 		udp_splice_init[V4][port] = s < 0 ? -1 : s;
 		udp_splice_init[V6][port] = s < 0 ? -1 : s;
 		if (IN_INTERVAL(0, FD_REF_MAX, s))
@@ -819,28 +819,38 @@ int udp_sock_init(const struct ctx *c, int ns, sa_family_t af,
 
 	if ((af == AF_INET || af == AF_UNSPEC) && c->ifi4) {
 		if (!ns) {
-			r4 = sock_l4(c, AF_INET, EPOLL_TYPE_UDP_LISTEN,
-				     addr, ifname, port, uref.u32);
+			union inany_addr aany = inany_any4;
+
+			if (addr)
+				inany_from_af(&aany, AF_INET, addr);
+
+			r4 = pif_sock_l4(c, EPOLL_TYPE_UDP_LISTEN, PIF_HOST,
+					 &aany, ifname, port, uref.u32);
 
 			udp_splice_init[V4][port] = r4 < 0 ? -1 : r4;
 		} else {
-			r4  = sock_l4(c, AF_INET, EPOLL_TYPE_UDP_LISTEN,
-				      &in4addr_loopback,
-				      ifname, port, uref.u32);
+			r4  = pif_sock_l4(c, EPOLL_TYPE_UDP_LISTEN, PIF_SPLICE,
+					  &inany_loopback4, ifname,
+					  port, uref.u32);
 			udp_splice_ns[V4][port] = r4 < 0 ? -1 : r4;
 		}
 	}
 
 	if ((af == AF_INET6 || af == AF_UNSPEC) && c->ifi6) {
 		if (!ns) {
-			r6 = sock_l4(c, AF_INET6, EPOLL_TYPE_UDP_LISTEN,
-				     addr, ifname, port, uref.u32);
+			union inany_addr aany = inany_any6;
+
+			if (addr)
+				inany_from_af(&aany, AF_INET6, addr);
+
+			r6 = pif_sock_l4(c, EPOLL_TYPE_UDP_LISTEN, PIF_HOST,
+					 &aany, ifname, port, uref.u32);
 
 			udp_splice_init[V6][port] = r6 < 0 ? -1 : r6;
 		} else {
-			r6 = sock_l4(c, AF_INET6, EPOLL_TYPE_UDP_LISTEN,
-				     &in6addr_loopback,
-				     ifname, port, uref.u32);
+			r6 = pif_sock_l4(c, EPOLL_TYPE_UDP_LISTEN, PIF_SPLICE,
+					 &inany_loopback6, ifname,
+					 port, uref.u32);
 			udp_splice_ns[V6][port] = r6 < 0 ? -1 : r6;
 		}
 	}
diff --git a/util.c b/util.c
index 87309c5..ebd93ed 100644
--- a/util.c
+++ b/util.c
@@ -157,58 +157,6 @@ int sock_l4_sa(const struct ctx *c, enum epoll_type type,
 
 	return fd;
 }
-/**
- * sock_l4() - Create and bind socket for given L4, add to epoll list
- * @c:		Execution context
- * @af:		Address family, AF_INET or AF_INET6
- * @type:	epoll type
- * @bind_addr:	Address for binding, NULL for any
- * @ifname:	Interface for binding, NULL for any
- * @port:	Port, host order
- * @data:	epoll reference portion for protocol handlers
- *
- * Return: newly created socket, negative error code on failure
- */
-int sock_l4(const struct ctx *c, sa_family_t af, enum epoll_type type,
-	    const void *bind_addr, const char *ifname, uint16_t port,
-	    uint32_t data)
-{
-	switch (af) {
-	case AF_INET: {
-		struct sockaddr_in addr4 = {
-			.sin_family = AF_INET,
-			.sin_port = htons(port),
-			{ 0 }, { 0 },
-		};
-		if (bind_addr)
-			addr4.sin_addr = *(struct in_addr *)bind_addr;
-		return sock_l4_sa(c, type, &addr4, sizeof(addr4), ifname,
-				  false, data);
-	}
-
-	case AF_UNSPEC:
-		if (!DUAL_STACK_SOCKETS || bind_addr)
-			 return -EINVAL;
-		/* fallthrough */
-	case AF_INET6: {
-		struct sockaddr_in6 addr6 = {
-			.sin6_family = AF_INET6,
-			.sin6_port = htons(port),
-			0, IN6ADDR_ANY_INIT, 0,
-		};
-		if (bind_addr) {
-			addr6.sin6_addr = *(struct in6_addr *)bind_addr;
-
-			if (IN6_IS_ADDR_LINKLOCAL(bind_addr))
-				addr6.sin6_scope_id = c->ifi6;
-		}
-		return sock_l4_sa(c, type, &addr6, sizeof(addr6), ifname,
-				  af == AF_INET6, data);
-	}
-	default:
-		return -EINVAL;
-	}
-}
 
 /**
  * sock_probe_mem() - Check if setting high SO_SNDBUF and SO_RCVBUF is allowed
diff --git a/util.h b/util.h
index 5e67f1f..2c1e08e 100644
--- a/util.h
+++ b/util.h
@@ -181,9 +181,6 @@ int close_range(unsigned int first, unsigned int last, int flags) {
 int sock_l4_sa(const struct ctx *c, enum epoll_type type,
 	       const void *sa, socklen_t sl,
 	       const char *ifname, bool v6only, uint32_t data);
-int sock_l4(const struct ctx *c, sa_family_t af, enum epoll_type type,
-	    const void *bind_addr, const char *ifname, uint16_t port,
-	    uint32_t data);
 void sock_probe_mem(struct ctx *c);
 long timespec_diff_ms(const struct timespec *a, const struct timespec *b);
 int64_t timespec_diff_us(const struct timespec *a, const struct timespec *b);

From cbde4192eeef7a5640aea6dd84d5eac02841ef5c Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Fri, 20 Sep 2024 14:12:43 +1000
Subject: [PATCH 038/382] tcp, udp: Make {tcp,udp}_sock_init() take an inany
 address

tcp_sock_init() and udp_sock_init() take an address to bind to as an
address family and void * pair.  Use an inany instead.  Formerly, AF_UNSPEC
was used to indicate that we want to listen on both 0.0.0.0 and ::; now a
NULL inany indicates that.
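
Roughly, the new calling convention looks like this (a sketch, not part
of this patch; 'c' and the port number are just placeholders):

    struct in_addr a4 = { .s_addr = htonl(INADDR_LOOPBACK) };
    union inany_addr addr;

    inany_from_af(&addr, AF_INET, &a4);
    tcp_sock_init(c, &addr, NULL, 8080);    /* IPv4 loopback only */
    tcp_sock_init(c, NULL, NULL, 8080);     /* dual-stack "any" */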

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
---
 conf.c | 28 ++++++++++++++--------------
 tcp.c  | 47 ++++++++++++++++++-----------------------------
 tcp.h  |  2 +-
 udp.c  | 31 ++++++++++---------------------
 udp.h  |  4 ++--
 5 files changed, 45 insertions(+), 67 deletions(-)

diff --git a/conf.c b/conf.c
index b275886..9f1cd83 100644
--- a/conf.c
+++ b/conf.c
@@ -116,11 +116,10 @@ static int parse_port_range(const char *s, char **endptr,
 static void conf_ports(const struct ctx *c, char optname, const char *optarg,
 		       struct fwd_ports *fwd)
 {
-	char addr_buf[sizeof(struct in6_addr)] = { 0 }, *addr = addr_buf;
+	union inany_addr addr_buf = inany_any6, *addr = &addr_buf;
 	char buf[BUFSIZ], *spec, *ifname = NULL, *p;
 	bool exclude_only = true, bound_one = false;
 	uint8_t exclude[PORT_BITMAP_SIZE] = { 0 };
-	sa_family_t af = AF_UNSPEC;
 	unsigned i;
 	int ret;
 
@@ -166,15 +165,13 @@ static void conf_ports(const struct ctx *c, char optname, const char *optarg,
 
 			bitmap_set(fwd->map, i);
 			if (optname == 't') {
-				ret = tcp_sock_init(c, AF_UNSPEC, NULL, NULL,
-						    i);
+				ret = tcp_sock_init(c, NULL, NULL, i);
 				if (ret == -ENFILE || ret == -EMFILE)
 					goto enfile;
 				if (!ret)
 					bound_one = true;
 			} else if (optname == 'u') {
-				ret = udp_sock_init(c, 0, AF_UNSPEC, NULL, NULL,
-						    i);
+				ret = udp_sock_init(c, 0, NULL, NULL, i);
 				if (ret == -ENFILE || ret == -EMFILE)
 					goto enfile;
 				if (!ret)
@@ -218,6 +215,9 @@ static void conf_ports(const struct ctx *c, char optname, const char *optarg,
 		if (ifname == buf + 1) {	/* Interface without address */
 			addr = NULL;
 		} else {
+			struct in6_addr a6;
+			struct in_addr a4;
+
 			p = buf;
 
 			/* Allow square brackets for IPv4 too for convenience */
@@ -226,10 +226,10 @@ static void conf_ports(const struct ctx *c, char optname, const char *optarg,
 				p++;
 			}
 
-			if (inet_pton(AF_INET, p, addr))
-				af = AF_INET;
-			else if (inet_pton(AF_INET6, p, addr))
-				af = AF_INET6;
+			if (inet_pton(AF_INET, p, &a4))
+				inany_from_af(addr, AF_INET, &a4);
+			else if (inet_pton(AF_INET6, p, &a6))
+				inany_from_af(addr, AF_INET6, &a6);
 			else
 				goto bad;
 		}
@@ -276,13 +276,13 @@ static void conf_ports(const struct ctx *c, char optname, const char *optarg,
 			bitmap_set(fwd->map, i);
 
 			if (optname == 't') {
-				ret = tcp_sock_init(c, af, addr, ifname, i);
+				ret = tcp_sock_init(c, addr, ifname, i);
 				if (ret == -ENFILE || ret == -EMFILE)
 					goto enfile;
 				if (!ret)
 					bound_one = true;
 			} else if (optname == 'u') {
-				ret = udp_sock_init(c, 0, af, addr, ifname, i);
+				ret = udp_sock_init(c, 0, addr, ifname, i);
 				if (ret == -ENFILE || ret == -EMFILE)
 					goto enfile;
 				if (!ret)
@@ -338,9 +338,9 @@ static void conf_ports(const struct ctx *c, char optname, const char *optarg,
 
 			ret = 0;
 			if (optname == 't')
-				ret = tcp_sock_init(c, af, addr, ifname, i);
+				ret = tcp_sock_init(c, addr, ifname, i);
 			else if (optname == 'u')
-				ret = udp_sock_init(c, 0, af, addr, ifname, i);
+				ret = udp_sock_init(c, 0, addr, ifname, i);
 			if (ret)
 				goto bind_fail;
 		}
diff --git a/tcp.c b/tcp.c
index 49e0cfe..6ca3700 100644
--- a/tcp.c
+++ b/tcp.c
@@ -2273,17 +2273,16 @@ void tcp_sock_handler(const struct ctx *c, union epoll_ref ref,
 }
 
 /**
- * tcp_sock_init_af() - Initialise listening socket for a given af and port
+ * tcp_sock_init_one() - Initialise listening socket for address and port
  * @c:		Execution context
- * @af:		Address family to listen on
- * @port:	Port, host order
- * @addr:	Pointer to address for binding, NULL if not configured
+ * @addr:	Pointer to address for binding, NULL for dual stack any
  * @ifname:	Name of interface to bind to, NULL if not configured
+ * @port:	Port, host order
  *
  * Return: fd for the new listening socket, negative error code on failure
  */
-static int tcp_sock_init_af(const struct ctx *c, sa_family_t af, in_port_t port,
-			    const void *addr, const char *ifname)
+static int tcp_sock_init_one(const struct ctx *c, const union inany_addr *addr,
+			     const char *ifname, in_port_t port)
 {
 	union tcp_listen_epoll_ref tref = {
 		.port = port,
@@ -2291,24 +2290,13 @@ static int tcp_sock_init_af(const struct ctx *c, sa_family_t af, in_port_t port,
 	};
 	int s;
 
-	if (af == AF_UNSPEC) {
-		ASSERT(!addr);
-		s = pif_sock_l4(c, EPOLL_TYPE_TCP_LISTEN, PIF_HOST, NULL,
+	s = pif_sock_l4(c, EPOLL_TYPE_TCP_LISTEN, PIF_HOST, addr,
 				ifname, port, tref.u32);
-	} else {
-		union inany_addr aany = af == AF_INET ? inany_any4 : inany_any6;
-
-		if (addr)
-			inany_from_af(&aany, af, addr);
-
-		s = pif_sock_l4(c, EPOLL_TYPE_TCP_LISTEN, PIF_HOST, &aany,
-				ifname, port, tref.u32);
-	}
 
 	if (c->tcp.fwd_in.mode == FWD_AUTO) {
-		if (af == AF_INET  || af == AF_UNSPEC)
+		if (!addr || inany_v4(addr))
 			tcp_sock_init_ext[port][V4] = s < 0 ? -1 : s;
-		if (af == AF_INET6 || af == AF_UNSPEC)
+		if (!addr || !inany_v4(addr))
 			tcp_sock_init_ext[port][V6] = s < 0 ? -1 : s;
 	}
 
@@ -2322,31 +2310,32 @@ static int tcp_sock_init_af(const struct ctx *c, sa_family_t af, in_port_t port,
 /**
  * tcp_sock_init() - Create listening sockets for a given host ("inbound") port
  * @c:		Execution context
- * @af:		Address family to select a specific IP version, or AF_UNSPEC
  * @addr:	Pointer to address for binding, NULL if not configured
  * @ifname:	Name of interface to bind to, NULL if not configured
  * @port:	Port, host order
  *
  * Return: 0 on (partial) success, negative error code on (complete) failure
  */
-int tcp_sock_init(const struct ctx *c, sa_family_t af, const void *addr,
+int tcp_sock_init(const struct ctx *c, const union inany_addr *addr,
 		  const char *ifname, in_port_t port)
 {
 	int r4 = FD_REF_MAX + 1, r6 = FD_REF_MAX + 1;
 
 	ASSERT(!c->no_tcp);
 
-	if (af == AF_UNSPEC && c->ifi4 && c->ifi6)
+	if (!addr && c->ifi4 && c->ifi6)
 		/* Attempt to get a dual stack socket */
-		if (tcp_sock_init_af(c, AF_UNSPEC, port, addr, ifname) >= 0)
+		if (tcp_sock_init_one(c, NULL, ifname, port) >= 0)
 			return 0;
 
 	/* Otherwise create a socket per IP version */
-	if ((af == AF_INET  || af == AF_UNSPEC) && c->ifi4)
-		r4 = tcp_sock_init_af(c, AF_INET, port, addr, ifname);
+	if ((!addr || inany_v4(addr)) && c->ifi4)
+		r4 = tcp_sock_init_one(c, addr ? addr : &inany_any4,
+				       ifname, port);
 
-	if ((af == AF_INET6 || af == AF_UNSPEC) && c->ifi6)
-		r6 = tcp_sock_init_af(c, AF_INET6, port, addr, ifname);
+	if ((!addr || !inany_v4(addr)) && c->ifi6)
+		r6 = tcp_sock_init_one(c, addr ? addr : &inany_any6,
+				       ifname, port);
 
 	if (IN_INTERVAL(0, FD_REF_MAX, r4) || IN_INTERVAL(0, FD_REF_MAX, r6))
 		return 0;
@@ -2629,7 +2618,7 @@ static void tcp_port_rebind(struct ctx *c, bool outbound)
 			if (outbound)
 				tcp_ns_sock_init(c, port);
 			else
-				tcp_sock_init(c, AF_UNSPEC, NULL, NULL, port);
+				tcp_sock_init(c, NULL, NULL, port);
 		}
 	}
 }
diff --git a/tcp.h b/tcp.h
index 5585924..cf30744 100644
--- a/tcp.h
+++ b/tcp.h
@@ -18,7 +18,7 @@ void tcp_sock_handler(const struct ctx *c, union epoll_ref ref,
 int tcp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af,
 		    const void *saddr, const void *daddr,
 		    const struct pool *p, int idx, const struct timespec *now);
-int tcp_sock_init(const struct ctx *c, sa_family_t af, const void *addr,
+int tcp_sock_init(const struct ctx *c, const union inany_addr *addr,
 		  const char *ifname, in_port_t port);
 int tcp_init(struct ctx *c);
 void tcp_timer(struct ctx *c, const struct timespec *now);
diff --git a/udp.c b/udp.c
index b3d4a64..08faaec 100644
--- a/udp.c
+++ b/udp.c
@@ -785,15 +785,14 @@ int udp_tap_handler(const struct ctx *c, uint8_t pif,
  * udp_sock_init() - Initialise listening sockets for a given port
  * @c:		Execution context
  * @ns:		In pasta mode, if set, bind with loopback address in namespace
- * @af:		Address family to select a specific IP version, or AF_UNSPEC
  * @addr:	Pointer to address for binding, NULL if not configured
  * @ifname:	Name of interface to bind to, NULL if not configured
  * @port:	Port, host order
  *
  * Return: 0 on (partial) success, negative error code on (complete) failure
  */
-int udp_sock_init(const struct ctx *c, int ns, sa_family_t af,
-		  const void *addr, const char *ifname, in_port_t port)
+int udp_sock_init(const struct ctx *c, int ns, const union inany_addr *addr,
+		  const char *ifname, in_port_t port)
 {
 	union udp_listen_epoll_ref uref = {
 		.pif = ns ? PIF_SPLICE : PIF_HOST,
@@ -803,11 +802,9 @@ int udp_sock_init(const struct ctx *c, int ns, sa_family_t af,
 
 	ASSERT(!c->no_udp);
 
-	if (af == AF_UNSPEC && c->ifi4 && c->ifi6 && !ns) {
+	if (!addr && c->ifi4 && c->ifi6 && !ns) {
 		int s;
 
-		ASSERT(!addr);
-
 		/* Attempt to get a dual stack socket */
 		s = pif_sock_l4(c, EPOLL_TYPE_UDP_LISTEN, PIF_HOST,
 				NULL, ifname, port, uref.u32);
@@ -817,15 +814,11 @@ int udp_sock_init(const struct ctx *c, int ns, sa_family_t af,
 			return 0;
 	}
 
-	if ((af == AF_INET || af == AF_UNSPEC) && c->ifi4) {
+	if ((!addr || inany_v4(addr)) && c->ifi4) {
 		if (!ns) {
-			union inany_addr aany = inany_any4;
-
-			if (addr)
-				inany_from_af(&aany, AF_INET, addr);
-
 			r4 = pif_sock_l4(c, EPOLL_TYPE_UDP_LISTEN, PIF_HOST,
-					 &aany, ifname, port, uref.u32);
+					 addr ? addr : &inany_any4, ifname,
+					 port, uref.u32);
 
 			udp_splice_init[V4][port] = r4 < 0 ? -1 : r4;
 		} else {
@@ -836,15 +829,11 @@ int udp_sock_init(const struct ctx *c, int ns, sa_family_t af,
 		}
 	}
 
-	if ((af == AF_INET6 || af == AF_UNSPEC) && c->ifi6) {
+	if ((!addr || !inany_v4(addr)) && c->ifi6) {
 		if (!ns) {
-			union inany_addr aany = inany_any6;
-
-			if (addr)
-				inany_from_af(&aany, AF_INET6, addr);
-
 			r6 = pif_sock_l4(c, EPOLL_TYPE_UDP_LISTEN, PIF_HOST,
-					 &aany, ifname, port, uref.u32);
+					 addr ? addr : &inany_any6, ifname,
+					 port, uref.u32);
 
 			udp_splice_init[V6][port] = r6 < 0 ? -1 : r6;
 		} else {
@@ -918,7 +907,7 @@ static void udp_port_rebind(struct ctx *c, bool outbound)
 
 		if ((c->ifi4 && socks[V4][port] == -1) ||
 		    (c->ifi6 && socks[V6][port] == -1))
-			udp_sock_init(c, outbound, AF_UNSPEC, NULL, NULL, port);
+			udp_sock_init(c, outbound, NULL, NULL, port);
 	}
 }
 
diff --git a/udp.h b/udp.h
index a8e76bf..de2df6d 100644
--- a/udp.h
+++ b/udp.h
@@ -16,8 +16,8 @@ void udp_reply_sock_handler(const struct ctx *c, union epoll_ref ref,
 int udp_tap_handler(const struct ctx *c, uint8_t pif,
 		    sa_family_t af, const void *saddr, const void *daddr,
 		    const struct pool *p, int idx, const struct timespec *now);
-int udp_sock_init(const struct ctx *c, int ns, sa_family_t af,
-		  const void *addr, const char *ifname, in_port_t port);
+int udp_sock_init(const struct ctx *c, int ns, const union inany_addr *addr,
+		  const char *ifname, in_port_t port);
 int udp_init(struct ctx *c);
 void udp_timer(struct ctx *c, const struct timespec *now);
 void udp_update_l2_buf(const unsigned char *eth_d, const unsigned char *eth_s);

From b55013b1a7e7dd7e4e90455703d272b9ffc28b64 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Fri, 20 Sep 2024 14:12:44 +1000
Subject: [PATCH 039/382] inany: Add inany_pton() helper

We already have an inany_ntop() function to format inany addresses into
text.  Add inany_pton() to parse them from text, and use it in
conf_ports().
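
A minimal usage sketch (not part of this patch; the error handling is
just an illustration):

    union inany_addr addr;

    if (!inany_pton(optarg, &addr))
        die("Invalid address: %s", optarg);

IPv4 input such as "192.0.2.1" ends up in IPv4-mapped IPv6 form, while
IPv6 input such as "2001:db8::1" is stored as is.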

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
---
 conf.c  |  9 +--------
 inany.c | 20 ++++++++++++++++++++
 inany.h |  1 +
 3 files changed, 22 insertions(+), 8 deletions(-)

diff --git a/conf.c b/conf.c
index 9f1cd83..6e62510 100644
--- a/conf.c
+++ b/conf.c
@@ -215,9 +215,6 @@ static void conf_ports(const struct ctx *c, char optname, const char *optarg,
 		if (ifname == buf + 1) {	/* Interface without address */
 			addr = NULL;
 		} else {
-			struct in6_addr a6;
-			struct in_addr a4;
-
 			p = buf;
 
 			/* Allow square brackets for IPv4 too for convenience */
@@ -226,11 +223,7 @@ static void conf_ports(const struct ctx *c, char optname, const char *optarg,
 				p++;
 			}
 
-			if (inet_pton(AF_INET, p, &a4))
-				inany_from_af(addr, AF_INET, &a4);
-			else if (inet_pton(AF_INET6, p, &a6))
-				inany_from_af(addr, AF_INET6, &a6);
-			else
+			if (!inany_pton(p, addr))
 				goto bad;
 		}
 	} else {
diff --git a/inany.c b/inany.c
index 5e391dc..f5483bf 100644
--- a/inany.c
+++ b/inany.c
@@ -36,3 +36,23 @@ const char *inany_ntop(const union inany_addr *src, char *dst, socklen_t size)
 
 	return inet_ntop(AF_INET6, &src->a6, dst, size);
 }
+
+/** inany_pton() - Parse an IPv[46] address from text format
+ * @src:	Text representation of an IPv[46] address
+ * @dst:	Output buffer, filled with the parsed address
+ *
+ * Return: 1 on success, 0 if no parseable address was found
+ */
+int inany_pton(const char *src, union inany_addr *dst)
+{
+	if (inet_pton(AF_INET, src, &dst->v4mapped.a4)) {
+		memset(&dst->v4mapped.zero, 0, sizeof(dst->v4mapped.zero));
+		memset(&dst->v4mapped.one, 0xff, sizeof(dst->v4mapped.one));
+		return 1;
+	}
+
+	if (inet_pton(AF_INET6, src, &dst->a6))
+		return 1;
+
+	return 0;
+}
diff --git a/inany.h b/inany.h
index d2893ce..6a12c29 100644
--- a/inany.h
+++ b/inany.h
@@ -270,5 +270,6 @@ static inline void inany_siphash_feed(struct siphash_state *state,
 #define INANY_ADDRSTRLEN	MAX(INET_ADDRSTRLEN, INET6_ADDRSTRLEN)
 
 const char *inany_ntop(const union inany_addr *src, char *dst, socklen_t size);
+int inany_pton(const char *src, union inany_addr *dst);
 
 #endif /* INANY_H */

From def8acdcd846582df5939446be0d73d50971ab18 Mon Sep 17 00:00:00 2001
From: Stefano Brivio <sbrivio@redhat.com>
Date: Fri, 27 Sep 2024 18:43:16 +0200
Subject: [PATCH 040/382] test: Kernel binary can now be passed via the KERNEL
 environment variable

This is quite useful, at least for me, as I'm usually running tests
using a guest kernel that's not the same as the one on the host.

Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
---
 test/lib/setup | 8 ++++----
 test/lib/term  | 2 +-
 test/run       | 3 +++
 3 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/test/lib/setup b/test/lib/setup
index d764138..5338393 100755
--- a/test/lib/setup
+++ b/test/lib/setup
@@ -58,7 +58,7 @@ setup_passt() {
 	context_run_bg qemu 'qemu-system-'"${QEMU_ARCH}"		   \
 		' -machine accel=kvm'                                      \
 		' -m '${VMEM}' -cpu host -smp '${VCPUS}                    \
-		' -kernel ' "/boot/vmlinuz-$(uname -r)"			   \
+		' -kernel '"${KERNEL}"					   \
 		' -initrd '${INITRAMFS}' -nographic -serial stdio'	   \
 		' -nodefaults'						   \
 		' -append "console=ttyS0 mitigations=off apparmor=0" '	   \
@@ -159,7 +159,7 @@ setup_passt_in_ns() {
 		' -machine accel=kvm'                                      \
 		' -M accel=kvm:tcg'                                        \
 		' -m '${VMEM}' -cpu host -smp '${VCPUS}                    \
-		' -kernel ' "/boot/vmlinuz-$(uname -r)"			   \
+		' -kernel '"${KERNEL}"					   \
 		' -initrd '${INITRAMFS}' -nographic -serial stdio'	   \
 		' -nodefaults'						   \
 		' -append "console=ttyS0 mitigations=off apparmor=0" '	   \
@@ -230,7 +230,7 @@ setup_two_guests() {
 	context_run_bg qemu_1 'qemu-system-'"${QEMU_ARCH}"		     \
 		' -M accel=kvm:tcg'                                          \
 		' -m '${VMEM}' -cpu host -smp '${VCPUS}                      \
-		' -kernel ' "/boot/vmlinuz-$(uname -r)"			     \
+		' -kernel '"${KERNEL}"					     \
 		' -initrd '${INITRAMFS}' -nographic -serial stdio'	     \
 		' -nodefaults'						     \
 		' -append "console=ttyS0 mitigations=off apparmor=0" '	     \
@@ -243,7 +243,7 @@ setup_two_guests() {
 	context_run_bg qemu_2 'qemu-system-'"${QEMU_ARCH}"		     \
 		' -M accel=kvm:tcg'                                          \
 		' -m '${VMEM}' -cpu host -smp '${VCPUS}                      \
-		' -kernel ' "/boot/vmlinuz-$(uname -r)"			     \
+		' -kernel '"${KERNEL}"					     \
 		' -initrd '${INITRAMFS}' -nographic -serial stdio'	     \
 		' -nodefaults'						     \
 		' -append "console=ttyS0 mitigations=off apparmor=0" '	     \
diff --git a/test/lib/term b/test/lib/term
index 3834092..0fa0936 100755
--- a/test/lib/term
+++ b/test/lib/term
@@ -664,7 +664,7 @@ pause_continue() {
 
 # run_term() - Start tmux session, running entry point, with recording if needed
 run_term() {
-	TMUX="tmux new-session -s passt_test -eSTATEBASE=$STATEBASE -ePCAP=$PCAP -eDEBUG=$DEBUG"
+	TMUX="tmux new-session -s passt_test -eSTATEBASE=$STATEBASE -ePCAP=$PCAP -eDEBUG=$DEBUG -eKERNEL=$KERNEL"
 
 	if [ ${CI} -eq 1 ]; then
 		printf '\e[8;50;240t'
diff --git a/test/run b/test/run
index cd6d707..547a729 100755
--- a/test/run
+++ b/test/run
@@ -38,6 +38,9 @@ TRACE=${TRACE:-0}
 # If set, tell passt and pasta to take packet captures
 PCAP=${PCAP:-0}
 
+# Custom kernel to boot guests with, if given
+KERNEL=${KERNEL:-"/boot/vmlinuz-$(uname -r)"}
+
 COMMIT="$(git log --oneline --no-decorate -1)"
 
 . lib/util

From 72e7d3024b037afe2cb00c772eea0807286633bd Mon Sep 17 00:00:00 2001
From: Laurent Vivier <lvivier@redhat.com>
Date: Thu, 3 Oct 2024 16:51:04 +0200
Subject: [PATCH 041/382] tcp: Use tcp_payload_t rather than tcphdr

As tcp_update_check_tcp4() and tcp_update_check_tcp6() compute the
checksum using the TCP header and the TCP payload, it is clearer
to use a pointer to tcp_payload_t that includes tcphdr and payload
rather than a pointer to tcphdr (and assuming that the TCP header is
followed by the payload).

Move tcp_payload_t and tcp_flags_t to tcp_internal.h.
(They will also be used by vhost-user.)

Signed-off-by: Laurent Vivier <lvivier@redhat.com>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 tcp.c          | 42 ++++++++++++++++++++++--------------------
 tcp_buf.c      | 29 -----------------------------
 tcp_internal.h | 29 +++++++++++++++++++++++++++++
 3 files changed, 51 insertions(+), 49 deletions(-)

diff --git a/tcp.c b/tcp.c
index 6ca3700..0590153 100644
--- a/tcp.c
+++ b/tcp.c
@@ -757,32 +757,34 @@ static void tcp_sock_set_bufsize(const struct ctx *c, int s)
 /**
  * tcp_update_check_tcp4() - Update TCP checksum from stored one
  * @iph:	IPv4 header
- * @th:		TCP header followed by TCP payload
+ * @bp:		TCP header followed by TCP payload
  */
-static void tcp_update_check_tcp4(const struct iphdr *iph, struct tcphdr *th)
+static void tcp_update_check_tcp4(const struct iphdr *iph,
+				  struct tcp_payload_t *bp)
 {
 	uint16_t l4len = ntohs(iph->tot_len) - sizeof(struct iphdr);
 	struct in_addr saddr = { .s_addr = iph->saddr };
 	struct in_addr daddr = { .s_addr = iph->daddr };
 	uint32_t sum = proto_ipv4_header_psum(l4len, IPPROTO_TCP, saddr, daddr);
 
-	th->check = 0;
-	th->check = csum(th, l4len, sum);
+	bp->th.check = 0;
+	bp->th.check = csum(bp, l4len, sum);
 }
 
 /**
  * tcp_update_check_tcp6() - Calculate TCP checksum for IPv6
  * @ip6h:	IPv6 header
- * @th:		TCP header followed by TCP payload
+ * @bp:		TCP header followed by TCP payload
  */
-static void tcp_update_check_tcp6(struct ipv6hdr *ip6h, struct tcphdr *th)
+static void tcp_update_check_tcp6(const struct ipv6hdr *ip6h,
+				  struct tcp_payload_t *bp)
 {
 	uint16_t l4len = ntohs(ip6h->payload_len);
 	uint32_t sum = proto_ipv6_header_psum(l4len, IPPROTO_TCP,
 					      &ip6h->saddr, &ip6h->daddr);
 
-	th->check = 0;
-	th->check = csum(th, l4len, sum);
+	bp->th.check = 0;
+	bp->th.check = csum(bp, l4len, sum);
 }
 
 /**
@@ -902,7 +904,7 @@ static void tcp_fill_header(struct tcphdr *th,
  * @conn:		Connection pointer
  * @taph:		tap backend specific header
  * @iph:		Pointer to IPv4 header
- * @th:			Pointer to TCP header
+ * @bp:			Pointer to TCP header followed by TCP payload
  * @dlen:		TCP payload length
  * @check:		Checksum, if already known
  * @seq:		Sequence number for this segment
@@ -912,14 +914,14 @@ static void tcp_fill_header(struct tcphdr *th,
  */
 static size_t tcp_fill_headers4(const struct tcp_tap_conn *conn,
 				struct tap_hdr *taph,
-				struct iphdr *iph, struct tcphdr *th,
+				struct iphdr *iph, struct tcp_payload_t *bp,
 				size_t dlen, const uint16_t *check,
 				uint32_t seq, bool no_tcp_csum)
 {
 	const struct flowside *tapside = TAPFLOW(conn);
 	const struct in_addr *src4 = inany_v4(&tapside->oaddr);
 	const struct in_addr *dst4 = inany_v4(&tapside->eaddr);
-	size_t l4len = dlen + sizeof(*th);
+	size_t l4len = dlen + sizeof(bp->th);
 	size_t l3len = l4len + sizeof(*iph);
 
 	ASSERT(src4 && dst4);
@@ -931,12 +933,12 @@ static size_t tcp_fill_headers4(const struct tcp_tap_conn *conn,
 	iph->check = check ? *check :
 			     csum_ip4_header(l3len, IPPROTO_TCP, *src4, *dst4);
 
-	tcp_fill_header(th, conn, seq);
+	tcp_fill_header(&bp->th, conn, seq);
 
 	if (no_tcp_csum)
-		th->check = 0;
+		bp->th.check = 0;
 	else
-		tcp_update_check_tcp4(iph, th);
+		tcp_update_check_tcp4(iph, bp);
 
 	tap_hdr_update(taph, l3len + sizeof(struct ethhdr));
 
@@ -948,7 +950,7 @@ static size_t tcp_fill_headers4(const struct tcp_tap_conn *conn,
  * @conn:		Connection pointer
  * @taph:		tap backend specific header
  * @ip6h:		Pointer to IPv6 header
- * @th:			Pointer to TCP header
+ * @bp:			Pointer to TCP header followed by TCP payload
  * @dlen:		TCP payload length
  * @check:		Checksum, if already known
  * @seq:		Sequence number for this segment
@@ -958,11 +960,11 @@ static size_t tcp_fill_headers4(const struct tcp_tap_conn *conn,
  */
 static size_t tcp_fill_headers6(const struct tcp_tap_conn *conn,
 				struct tap_hdr *taph,
-				struct ipv6hdr *ip6h, struct tcphdr *th,
+				struct ipv6hdr *ip6h, struct tcp_payload_t *bp,
 				size_t dlen, uint32_t seq, bool no_tcp_csum)
 {
 	const struct flowside *tapside = TAPFLOW(conn);
-	size_t l4len = dlen + sizeof(*th);
+	size_t l4len = dlen + sizeof(bp->th);
 
 	ip6h->payload_len = htons(l4len);
 	ip6h->saddr = tapside->oaddr.a6;
@@ -976,12 +978,12 @@ static size_t tcp_fill_headers6(const struct tcp_tap_conn *conn,
 	ip6h->flow_lbl[1] = (conn->sock >> 8) & 0xff;
 	ip6h->flow_lbl[2] = (conn->sock >> 0) & 0xff;
 
-	tcp_fill_header(th, conn, seq);
+	tcp_fill_header(&bp->th, conn, seq);
 
 	if (no_tcp_csum)
-		th->check = 0;
+		bp->th.check = 0;
 	else
-		tcp_update_check_tcp6(ip6h, th);
+		tcp_update_check_tcp6(ip6h, bp);
 
 	tap_hdr_update(taph, l4len + sizeof(*ip6h) + sizeof(struct ethhdr));
 
diff --git a/tcp_buf.c b/tcp_buf.c
index ffbff5e..238827b 100644
--- a/tcp_buf.c
+++ b/tcp_buf.c
@@ -38,35 +38,6 @@
 	(c->mode == MODE_PASTA ? 1 : TCP_FRAMES_MEM)
 
 /* Static buffers */
-/**
- * struct tcp_payload_t - TCP header and data to send segments with payload
- * @th:		TCP header
- * @data:	TCP data
- */
-struct tcp_payload_t {
-	struct tcphdr th;
-	uint8_t data[IP_MAX_MTU - sizeof(struct tcphdr)];
-#ifdef __AVX2__
-} __attribute__ ((packed, aligned(32)));    /* For AVX2 checksum routines */
-#else
-} __attribute__ ((packed, aligned(__alignof__(unsigned int))));
-#endif
-
-/**
- * struct tcp_flags_t - TCP header and data to send zero-length
- *                      segments (flags)
- * @th:		TCP header
- * @opts	TCP options
- */
-struct tcp_flags_t {
-	struct tcphdr th;
-	char opts[OPT_MSS_LEN + OPT_WS_LEN + 1];
-#ifdef __AVX2__
-} __attribute__ ((packed, aligned(32)));
-#else
-} __attribute__ ((packed, aligned(__alignof__(unsigned int))));
-#endif
-
 /* Ethernet header for IPv4 frames */
 static struct ethhdr		tcp4_eth_src;
 
diff --git a/tcp_internal.h b/tcp_internal.h
index de06db1..2f74ffe 100644
--- a/tcp_internal.h
+++ b/tcp_internal.h
@@ -63,6 +63,35 @@ enum tcp_iov_parts {
 	TCP_NUM_IOVS
 };
 
+/**
+ * struct tcp_payload_t - TCP header and data to send segments with payload
+ * @th:		TCP header
+ * @data:	TCP data
+ */
+struct tcp_payload_t {
+	struct tcphdr th;
+	uint8_t data[IP_MAX_MTU - sizeof(struct tcphdr)];
+#ifdef __AVX2__
+} __attribute__ ((packed, aligned(32)));    /* For AVX2 checksum routines */
+#else
+} __attribute__ ((packed, aligned(__alignof__(unsigned int))));
+#endif
+
+/**
+ * struct tcp_flags_t - TCP header and data to send zero-length
+ *                      segments (flags)
+ * @th:		TCP header
+ * @opts	TCP options
+ */
+struct tcp_flags_t {
+	struct tcphdr th;
+	char opts[OPT_MSS_LEN + OPT_WS_LEN + 1];
+#ifdef __AVX2__
+} __attribute__ ((packed, aligned(32)));
+#else
+} __attribute__ ((packed, aligned(__alignof__(unsigned int))));
+#endif
+
 extern char tcp_buf_discard [MAX_WINDOW];
 
 void conn_flag_do(const struct ctx *c, struct tcp_tap_conn *conn,

From fd8334b25dfa0cf4a93bb7fad6728f3bd0e31c6d Mon Sep 17 00:00:00 2001
From: Laurent Vivier <lvivier@redhat.com>
Date: Thu, 3 Oct 2024 16:51:05 +0200
Subject: [PATCH 042/382] pcap: Add an offset argument in pcap_iov()

The offset is passed directly to pcap_frame() and allows any headers
that are not part of the frame being captured to be skipped.
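
For instance (a hypothetical sketch, not taken from this patch), a
caller with a backend-specific header in front of the frame could skip
it like this:

    struct iovec iov[2] = {
        { .iov_base = &backend_hdr, .iov_len = sizeof(backend_hdr) },
        { .iov_base = frame,        .iov_len = frame_len },
    };

    /* Capture the L2 frame only, skipping the backend header */
    pcap_iov(iov, 2, sizeof(backend_hdr));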

Signed-off-by: Laurent Vivier <lvivier@redhat.com>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 pcap.c | 5 +++--
 pcap.h | 2 +-
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/pcap.c b/pcap.c
index e6b5ced..6ee6cdf 100644
--- a/pcap.c
+++ b/pcap.c
@@ -138,9 +138,10 @@ void pcap_multiple(const struct iovec *iov, size_t frame_parts, unsigned int n,
  * @iov:	Pointer to the array of struct iovec describing the I/O vector
  *		containing packet data to write, including L2 header
  * @iovcnt:	Number of buffers (@iov entries)
+ * @offset:	Offset of the L2 frame within the full data length
  */
 /* cppcheck-suppress unusedFunction */
-void pcap_iov(const struct iovec *iov, size_t iovcnt)
+void pcap_iov(const struct iovec *iov, size_t iovcnt, size_t offset)
 {
 	struct timespec now;
 
@@ -148,7 +149,7 @@ void pcap_iov(const struct iovec *iov, size_t iovcnt)
 		return;
 
 	clock_gettime(CLOCK_REALTIME, &now);
-	pcap_frame(iov, iovcnt, 0, &now);
+	pcap_frame(iov, iovcnt, offset, &now);
 }
 
 /**
diff --git a/pcap.h b/pcap.h
index 5339237..9795f2e 100644
--- a/pcap.h
+++ b/pcap.h
@@ -9,7 +9,7 @@
 void pcap(const char *pkt, size_t l2len);
 void pcap_multiple(const struct iovec *iov, size_t frame_parts, unsigned int n,
 		   size_t offset);
-void pcap_iov(const struct iovec *iov, size_t iovcnt);
+void pcap_iov(const struct iovec *iov, size_t iovcnt, size_t offset);
 void pcap_init(struct ctx *c);
 
 #endif /* PCAP_H */

From e6548c643796f036de83163e395f0efd56da4790 Mon Sep 17 00:00:00 2001
From: Laurent Vivier <lvivier@redhat.com>
Date: Thu, 3 Oct 2024 16:51:06 +0200
Subject: [PATCH 043/382] checksum: Add an offset argument in csum_iov()

The offset allows any headers that are not part of the data being
checksummed to be skipped.
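
As an illustration (hypothetical names, not from this patch), a header
placed in front of the data can now be excluded from the checksum:

    struct iovec iov[2] = {
        { .iov_base = &hdr,    .iov_len = sizeof(hdr) },  /* skipped */
        { .iov_base = payload, .iov_len = dlen },         /* checksummed */
    };

    /* init would typically be a pre-computed pseudo-header sum */
    uint16_t check = csum_iov(iov, 2, sizeof(hdr), init);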

Signed-off-by: Laurent Vivier <lvivier@redhat.com>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 checksum.c | 16 ++++++++++++++--
 checksum.h |  3 ++-
 2 files changed, 16 insertions(+), 3 deletions(-)

diff --git a/checksum.c b/checksum.c
index 006614f..05d002a 100644
--- a/checksum.c
+++ b/checksum.c
@@ -59,6 +59,7 @@
 #include "util.h"
 #include "ip.h"
 #include "checksum.h"
+#include "iov.h"
 
 /* Checksums are optional for UDP over IPv4, so we usually just set
  * them to 0.  Change this to 1 to calculate real UDP over IPv4
@@ -497,16 +498,27 @@ uint16_t csum(const void *buf, size_t len, uint32_t init)
  *
  * @iov		Pointer to the array of IO vectors
  * @n		Length of the array
+ * @offset:	Offset of the data to checksum within the full data length
  * @init	Initial 32-bit checksum, 0 for no pre-computed checksum
  *
  * Return: 16-bit folded, complemented checksum
  */
 /* cppcheck-suppress unusedFunction */
-uint16_t csum_iov(const struct iovec *iov, size_t n, uint32_t init)
+uint16_t csum_iov(const struct iovec *iov, size_t n, size_t offset,
+		  uint32_t init)
 {
 	unsigned int i;
+	size_t first;
 
-	for (i = 0; i < n; i++)
+	i = iov_skip_bytes(iov, n, offset, &first);
+	if (i >= n)
+		return (uint16_t)~csum_fold(init);
+
+	init = csum_unfolded((char *)iov[i].iov_base + first,
+			     iov[i].iov_len - first, init);
+	i++;
+
+	for (; i < n; i++)
 		init = csum_unfolded(iov[i].iov_base, iov[i].iov_len, init);
 
 	return (uint16_t)~csum_fold(init);
diff --git a/checksum.h b/checksum.h
index c5964ac..49f7472 100644
--- a/checksum.h
+++ b/checksum.h
@@ -32,6 +32,7 @@ void csum_icmp6(struct icmp6hdr *icmp6hr,
 		const void *payload, size_t dlen);
 uint32_t csum_unfolded(const void *buf, size_t len, uint32_t init);
 uint16_t csum(const void *buf, size_t len, uint32_t init);
-uint16_t csum_iov(const struct iovec *iov, size_t n, uint32_t init);
+uint16_t csum_iov(const struct iovec *iov, size_t n, size_t offset,
+		  uint32_t init);
 
 #endif /* CHECKSUM_H */

From 3d484aa370902873bd42a434fa856b9ee3eac228 Mon Sep 17 00:00:00 2001
From: Laurent Vivier <lvivier@redhat.com>
Date: Thu, 3 Oct 2024 16:51:07 +0200
Subject: [PATCH 044/382] tcp: Update TCP checksum using an iovec array

The TCP header and payload are expected to be in the same buffer,
and tcp_update_check_tcp4()/tcp_update_check_tcp6() compute
the checksum from the base address of the header, using the
length of the IP payload.

In the future (for vhost-user) we will need to dispatch the TCP header
and the TCP payload through several buffers. To manage that, we provide
an iovec array that points to the data of the TCP frame. We also provide
an offset, so that the array can contain the TCP frame embedded in a
lower-level frame, with the offset pointing to the TCP header inside the
iovec array.
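
A rough sketch of what this enables (hypothetical buffers, not part of
the patch): the TCP header and payload may now live in separate iovec
entries, with l4offset locating the TCP header within the array:

    struct iovec iov[3] = {
        { .iov_base = &lower_hdrs, .iov_len = sizeof(lower_hdrs) },
        { .iov_base = &th,         .iov_len = sizeof(th) },  /* struct tcphdr */
        { .iov_base = payload,     .iov_len = dlen },
    };

    tcp_update_check_tcp4(iph, iov, 3, sizeof(lower_hdrs));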

Signed-off-by: Laurent Vivier <lvivier@redhat.com>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 checksum.c |   1 -
 tcp.c      | 118 +++++++++++++++++++++++++++++++++++++++++++++--------
 2 files changed, 100 insertions(+), 19 deletions(-)

diff --git a/checksum.c b/checksum.c
index 05d002a..cf85019 100644
--- a/checksum.c
+++ b/checksum.c
@@ -503,7 +503,6 @@ uint16_t csum(const void *buf, size_t len, uint32_t init)
  *
  * Return: 16-bit folded, complemented checksum
  */
-/* cppcheck-suppress unusedFunction */
 uint16_t csum_iov(const struct iovec *iov, size_t n, size_t offset,
 		  uint32_t init)
 {
diff --git a/tcp.c b/tcp.c
index 0590153..9617b7a 100644
--- a/tcp.c
+++ b/tcp.c
@@ -755,36 +755,106 @@ static void tcp_sock_set_bufsize(const struct ctx *c, int s)
 }
 
 /**
- * tcp_update_check_tcp4() - Update TCP checksum from stored one
+ * tcp_update_check_tcp4() - Calculate TCP checksum for IPv4
  * @iph:	IPv4 header
- * @bp:		TCP header followed by TCP payload
+ * @iov:	Pointer to the array of IO vectors
+ * @iov_cnt:	Length of the array
+ * @l4offset:	IPv4 payload offset in the iovec array
  */
 static void tcp_update_check_tcp4(const struct iphdr *iph,
-				  struct tcp_payload_t *bp)
+				  const struct iovec *iov, int iov_cnt,
+				  size_t l4offset)
 {
 	uint16_t l4len = ntohs(iph->tot_len) - sizeof(struct iphdr);
 	struct in_addr saddr = { .s_addr = iph->saddr };
 	struct in_addr daddr = { .s_addr = iph->daddr };
-	uint32_t sum = proto_ipv4_header_psum(l4len, IPPROTO_TCP, saddr, daddr);
+	size_t check_ofs;
+	__sum16 *check;
+	int check_idx;
+	uint32_t sum;
+	char *ptr;
 
-	bp->th.check = 0;
-	bp->th.check = csum(bp, l4len, sum);
+	sum = proto_ipv4_header_psum(l4len, IPPROTO_TCP, saddr, daddr);
+
+	check_idx = iov_skip_bytes(iov, iov_cnt,
+				   l4offset + offsetof(struct tcphdr, check),
+				   &check_ofs);
+
+	if (check_idx >= iov_cnt) {
+		err("TCP4 buffer is too small, iov size %zd, check offset %zd",
+		    iov_size(iov, iov_cnt),
+		    l4offset + offsetof(struct tcphdr, check));
+		return;
+	}
+
+	if (check_ofs + sizeof(*check) > iov[check_idx].iov_len) {
+		err("TCP4 checksum field memory is not contiguous "
+		    "check_ofs %zd check_idx %d iov_len %zd",
+		    check_ofs, check_idx, iov[check_idx].iov_len);
+		return;
+	}
+
+	ptr = (char *)iov[check_idx].iov_base + check_ofs;
+	if ((uintptr_t)ptr & (__alignof__(*check) - 1)) {
+		err("TCP4 checksum field is not correctly aligned in memory");
+		return;
+	}
+
+	check = (__sum16 *)ptr;
+
+	*check = 0;
+	*check = csum_iov(iov, iov_cnt, l4offset, sum);
 }
 
 /**
  * tcp_update_check_tcp6() - Calculate TCP checksum for IPv6
  * @ip6h:	IPv6 header
- * @bp:		TCP header followed by TCP payload
+ * @iov:	Pointer to the array of IO vectors
+ * @iov_cnt:	Length of the array
+ * @l4offset:	IPv6 payload offset in the iovec array
  */
 static void tcp_update_check_tcp6(const struct ipv6hdr *ip6h,
-				  struct tcp_payload_t *bp)
+				  const struct iovec *iov, int iov_cnt,
+				  size_t l4offset)
 {
 	uint16_t l4len = ntohs(ip6h->payload_len);
-	uint32_t sum = proto_ipv6_header_psum(l4len, IPPROTO_TCP,
-					      &ip6h->saddr, &ip6h->daddr);
+	size_t check_ofs;
+	__sum16 *check;
+	int check_idx;
+	uint32_t sum;
+	char *ptr;
 
-	bp->th.check = 0;
-	bp->th.check = csum(bp, l4len, sum);
+	sum = proto_ipv6_header_psum(l4len, IPPROTO_TCP, &ip6h->saddr,
+				     &ip6h->daddr);
+
+	check_idx = iov_skip_bytes(iov, iov_cnt,
+				   l4offset + offsetof(struct tcphdr, check),
+				   &check_ofs);
+
+	if (check_idx >= iov_cnt) {
+		err("TCP6 buffer is too small, iov size %zd, check offset %zd",
+		    iov_size(iov, iov_cnt),
+		    l4offset + offsetof(struct tcphdr, check));
+		return;
+	}
+
+	if (check_ofs + sizeof(*check) > iov[check_idx].iov_len) {
+		err("TCP6 checksum field memory is not contiguous "
+		    "check_ofs %zd check_idx %d iov_len %zd",
+		    check_ofs, check_idx, iov[check_idx].iov_len);
+		return;
+	}
+
+	ptr = (char *)iov[check_idx].iov_base + check_ofs;
+	if ((uintptr_t)ptr & (__alignof__(*check) - 1)) {
+		err("TCP6 checksum field is not correctly aligned in memory");
+		return;
+	}
+
+	check = (__sum16 *)ptr;
+
+	*check = 0;
+	*check = csum_iov(iov, iov_cnt, l4offset, sum);
 }
 
 /**
@@ -935,10 +1005,16 @@ static size_t tcp_fill_headers4(const struct tcp_tap_conn *conn,
 
 	tcp_fill_header(&bp->th, conn, seq);
 
-	if (no_tcp_csum)
+	if (no_tcp_csum) {
 		bp->th.check = 0;
-	else
-		tcp_update_check_tcp4(iph, bp);
+	} else {
+		const struct iovec iov = {
+			.iov_base = bp,
+			.iov_len = ntohs(iph->tot_len) - sizeof(struct iphdr),
+		};
+
+		tcp_update_check_tcp4(iph, &iov, 1, 0);
+	}
 
 	tap_hdr_update(taph, l3len + sizeof(struct ethhdr));
 
@@ -980,10 +1056,16 @@ static size_t tcp_fill_headers6(const struct tcp_tap_conn *conn,
 
 	tcp_fill_header(&bp->th, conn, seq);
 
-	if (no_tcp_csum)
+	if (no_tcp_csum) {
 		bp->th.check = 0;
-	else
-		tcp_update_check_tcp6(ip6h, bp);
+	} else {
+		const struct iovec iov = {
+			.iov_base = bp,
+			.iov_len = ntohs(ip6h->payload_len)
+		};
+
+		tcp_update_check_tcp6(ip6h, &iov, 1, 0);
+	}
 
 	tap_hdr_update(taph, l4len + sizeof(*ip6h) + sizeof(struct ethhdr));
 

From 151dbe0d3d3690978a0a5cf3b8fa9808bd708668 Mon Sep 17 00:00:00 2001
From: Laurent Vivier <lvivier@redhat.com>
Date: Thu, 3 Oct 2024 16:51:08 +0200
Subject: [PATCH 045/382] udp: Update UDP checksum using an iovec array

As for tcp_update_check_tcp4()/tcp_update_check_tcp6(),
change csum_udp4() and csum_udp6() to use an iovec array.
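
For example (hypothetical buffers, not from this patch), a UDP payload
split across two buffers can now be checksummed in one go:

    struct iovec iov[2] = {
        { .iov_base = data1, .iov_len = len1 },
        { .iov_base = data2, .iov_len = len2 },
    };

    /* iov covers the payload only; the UDP header is summed separately */
    csum_udp4(&uh, saddr, daddr, iov, 2, 0);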

Signed-off-by: Laurent Vivier <lvivier@redhat.com>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 checksum.c | 29 ++++++++++++++++++-----------
 checksum.h |  4 ++--
 tap.c      | 14 +++++++++++---
 tap.h      |  2 +-
 udp.c      | 17 +++++++++++++----
 5 files changed, 45 insertions(+), 21 deletions(-)

diff --git a/checksum.c b/checksum.c
index cf85019..c673993 100644
--- a/checksum.c
+++ b/checksum.c
@@ -166,22 +166,24 @@ uint32_t proto_ipv4_header_psum(uint16_t l4len, uint8_t protocol,
  * @udp4hr:	UDP header, initialised apart from checksum
  * @saddr:	IPv4 source address
  * @daddr:	IPv4 destination address
- * @payload:	UDP packet payload
- * @dlen:	Length of @payload (not including UDP header)
+ * @iov:	Pointer to the array of IO vectors
+ * @iov_cnt:	Length of the array
+ * @offset:	UDP payload offset in the iovec array
  */
 void csum_udp4(struct udphdr *udp4hr,
 	       struct in_addr saddr, struct in_addr daddr,
-	       const void *payload, size_t dlen)
+	       const struct iovec *iov, int iov_cnt, size_t offset)
 {
 	/* UDP checksums are optional, so don't bother */
 	udp4hr->check = 0;
 
 	if (UDP4_REAL_CHECKSUMS) {
-		uint16_t l4len = dlen + sizeof(struct udphdr);
+		uint16_t l4len = iov_size(iov, iov_cnt) - offset +
+				 sizeof(struct udphdr);
 		uint32_t psum = proto_ipv4_header_psum(l4len, IPPROTO_UDP,
 						       saddr, daddr);
 		psum = csum_unfolded(udp4hr, sizeof(struct udphdr), psum);
-		udp4hr->check = csum(payload, dlen, psum);
+		udp4hr->check = csum_iov(iov, iov_cnt, offset, psum);
 	}
 }
 
@@ -227,19 +229,24 @@ uint32_t proto_ipv6_header_psum(uint16_t payload_len, uint8_t protocol,
 /**
  * csum_udp6() - Calculate and set checksum for a UDP over IPv6 packet
  * @udp6hr:	UDP header, initialised apart from checksum
- * @payload:	UDP packet payload
- * @dlen:	Length of @payload (not including UDP header)
+ * @saddr:	Source address
+ * @daddr:	Destination address
+ * @iov:	Pointer to the array of IO vectors
+ * @iov_cnt:	Length of the array
+ * @offset:	UDP payload offset in the iovec array
  */
 void csum_udp6(struct udphdr *udp6hr,
 	       const struct in6_addr *saddr, const struct in6_addr *daddr,
-	       const void *payload, size_t dlen)
+	       const struct iovec *iov, int iov_cnt, size_t offset)
 {
-	uint32_t psum = proto_ipv6_header_psum(dlen + sizeof(struct udphdr),
-					       IPPROTO_UDP, saddr, daddr);
+	uint16_t l4len = iov_size(iov, iov_cnt) - offset +
+			 sizeof(struct udphdr);
+	uint32_t psum = proto_ipv6_header_psum(l4len, IPPROTO_UDP,
+					       saddr, daddr);
 	udp6hr->check = 0;
 
 	psum = csum_unfolded(udp6hr, sizeof(struct udphdr), psum);
-	udp6hr->check = csum(payload, dlen, psum);
+	udp6hr->check = csum_iov(iov, iov_cnt, offset, psum);
 }
 
 /**
diff --git a/checksum.h b/checksum.h
index 49f7472..31ba322 100644
--- a/checksum.h
+++ b/checksum.h
@@ -19,14 +19,14 @@ uint32_t proto_ipv4_header_psum(uint16_t l4len, uint8_t protocol,
 				struct in_addr saddr, struct in_addr daddr);
 void csum_udp4(struct udphdr *udp4hr,
 	       struct in_addr saddr, struct in_addr daddr,
-	       const void *payload, size_t dlen);
+	       const struct iovec *iov, int iov_cnt, size_t offset);
 void csum_icmp4(struct icmphdr *icmp4hr, const void *payload, size_t dlen);
 uint32_t proto_ipv6_header_psum(uint16_t payload_len, uint8_t protocol,
 				const struct in6_addr *saddr,
 				const struct in6_addr *daddr);
 void csum_udp6(struct udphdr *udp6hr,
 	       const struct in6_addr *saddr, const struct in6_addr *daddr,
-	       const void *payload, size_t dlen);
+	       const struct iovec *iov, int iov_cnt, size_t offset);
 void csum_icmp6(struct icmp6hdr *icmp6hr,
 		const struct in6_addr *saddr, const struct in6_addr *daddr,
 		const void *payload, size_t dlen);
diff --git a/tap.c b/tap.c
index 41af6a6..c53a39b 100644
--- a/tap.c
+++ b/tap.c
@@ -172,11 +172,15 @@ void tap_udp4_send(const struct ctx *c, struct in_addr src, in_port_t sport,
 	struct iphdr *ip4h = tap_push_l2h(c, buf, ETH_P_IP);
 	struct udphdr *uh = tap_push_ip4h(ip4h, src, dst, l4len, IPPROTO_UDP);
 	char *data = (char *)(uh + 1);
+	const struct iovec iov = {
+		.iov_base = (void *)in,
+		.iov_len = dlen
+	};
 
 	uh->source = htons(sport);
 	uh->dest = htons(dport);
 	uh->len = htons(l4len);
-	csum_udp4(uh, src, dst, in, dlen);
+	csum_udp4(uh, src, dst, &iov, 1, 0);
 	memcpy(data, in, dlen);
 
 	tap_send_single(c, buf, dlen + (data - buf));
@@ -247,7 +251,7 @@ static void *tap_push_ip6h(struct ipv6hdr *ip6h,
 void tap_udp6_send(const struct ctx *c,
 		   const struct in6_addr *src, in_port_t sport,
 		   const struct in6_addr *dst, in_port_t dport,
-		   uint32_t flow, const void *in, size_t dlen)
+		   uint32_t flow, void *in, size_t dlen)
 {
 	size_t l4len = dlen + sizeof(struct udphdr);
 	char buf[USHRT_MAX];
@@ -255,11 +259,15 @@ void tap_udp6_send(const struct ctx *c,
 	struct udphdr *uh = tap_push_ip6h(ip6h, src, dst,
 					  l4len, IPPROTO_UDP, flow);
 	char *data = (char *)(uh + 1);
+	const struct iovec iov = {
+		.iov_base = in,
+		.iov_len = dlen
+	};
 
 	uh->source = htons(sport);
 	uh->dest = htons(dport);
 	uh->len = htons(l4len);
-	csum_udp6(uh, src, dst, in, dlen);
+	csum_udp6(uh, src, dst, &iov, 1, 0);
 	memcpy(data, in, dlen);
 
 	tap_send_single(c, buf, dlen + (data - buf));
diff --git a/tap.h b/tap.h
index ec9e2ac..85f1e84 100644
--- a/tap.h
+++ b/tap.h
@@ -53,7 +53,7 @@ const struct in6_addr *tap_ip6_daddr(const struct ctx *c,
 void tap_udp6_send(const struct ctx *c,
 		   const struct in6_addr *src, in_port_t sport,
 		   const struct in6_addr *dst, in_port_t dport,
-		   uint32_t flow, const void *in, size_t dlen);
+		   uint32_t flow, void *in, size_t dlen);
 void tap_icmp6_send(const struct ctx *c,
 		    const struct in6_addr *src, const struct in6_addr *dst,
 		    const void *in, size_t l4len);
diff --git a/udp.c b/udp.c
index 08faaec..100610f 100644
--- a/udp.c
+++ b/udp.c
@@ -321,10 +321,15 @@ static size_t udp_update_hdr4(struct iphdr *ip4h, struct udp_payload_t *bp,
 	bp->uh.source = htons(toside->oport);
 	bp->uh.dest = htons(toside->eport);
 	bp->uh.len = htons(l4len);
-	if (no_udp_csum)
+	if (no_udp_csum) {
 		bp->uh.check = 0;
-	else
-		csum_udp4(&bp->uh, *src, *dst, bp->data, dlen);
+	} else {
+		const struct iovec iov = {
+			.iov_base = bp->data,
+			.iov_len = dlen
+		};
+		csum_udp4(&bp->uh, *src, *dst, &iov, 1, 0);
+	}
 
 	return l4len;
 }
@@ -363,8 +368,12 @@ static size_t udp_update_hdr6(struct ipv6hdr *ip6h, struct udp_payload_t *bp,
 		 */
 		bp->uh.check = 0xffff;
 	} else {
+		const struct iovec iov = {
+			.iov_base = bp->data,
+			.iov_len = dlen
+		};
 		csum_udp6(&bp->uh, &toside->oaddr.a6, &toside->eaddr.a6,
-			  bp->data, dlen);
+			  &iov, 1, 0);
 	}
 
 	return l4len;

From 9d66df9a9a45b9305a2daff8a3c09a28f2c78d83 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Thu, 3 Oct 2024 14:48:32 +1000
Subject: [PATCH 046/382] conf: Add command line switch to enable IP_FREEBIND
 socket option

In a couple of recent reports, we've seen that it can be useful for pasta
to forward ports from addresses which are not currently configured on the
host, but might be in future.  That can be done with the sysctl
net.ipv4.ip_nonlocal_bind, but setting that requires CAP_NET_ADMIN in
the first place.  We can allow the same thing on a per-socket basis with
the IP_FREEBIND (or IPV6_FREEBIND) socket option.

Add a --freebind command line argument to enable this socket option on
all listening sockets.

Link: https://bugs.passt.top/show_bug.cgi?id=101
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 conf.c  |  2 ++
 passt.1 | 10 ++++++++++
 passt.h |  2 ++
 util.c  | 16 ++++++++++++++++
 4 files changed, 30 insertions(+)

diff --git a/conf.c b/conf.c
index 6e62510..e360fb9 100644
--- a/conf.c
+++ b/conf.c
@@ -836,6 +836,7 @@ static void usage(const char *name, FILE *f, int status)
 		"  --no-ndp		Disable NDP responses\n"
 		"  --no-dhcpv6		Disable DHCPv6 server\n"
 		"  --no-ra		Disable router advertisements\n"
+		"  --freebind		Bind to any address for forwarding\n"
 		"  --no-map-gw		Don't map gateway address to host\n"
 		"  -4, --ipv4-only	Enable IPv4 operation only\n"
 		"  -6, --ipv6-only	Enable IPv6 operation only\n");
@@ -1255,6 +1256,7 @@ void conf(struct ctx *c, int argc, char **argv)
 		{"no-dhcpv6",	no_argument,		&c->no_dhcpv6,	1 },
 		{"no-ndp",	no_argument,		&c->no_ndp,	1 },
 		{"no-ra",	no_argument,		&c->no_ra,	1 },
+		{"freebind",	no_argument,		&c->freebind,	1 },
 		{"no-map-gw",	no_argument,		&no_map_gw,	1 },
 		{"ipv4-only",	no_argument,		NULL,		'4' },
 		{"ipv6-only",	no_argument,		NULL,		'6' },
diff --git a/passt.1 b/passt.1
index 79d134d..5ac2962 100644
--- a/passt.1
+++ b/passt.1
@@ -327,6 +327,16 @@ namespace will be silently dropped.
 Disable Router Advertisements. Router Solicitations coming from guest or target
 namespace will be ignored.
 
+.TP
+.BR \-\-freebind
+Allow any binding address to be specified for \fB-t\fR and \fB-u\fR
+options.  Usually binding addresses must be addresses currently
+configured on the host.  With \fB\-\-freebind\fR, the
+\fBIP_FREEBIND\fR or \fBIPV6_FREEBIND\fR socket option is enabled,
+allowing any address to be used.  This is typically used to bind
+addresses which might be configured on the host in future, at which
+point the forwarding will immediately start operating.
+
 .TP
 .BR \-\-map-host-loopback " " \fIaddr
 Translate \fIaddr\fR to refer to the host. Packets from the guest to
diff --git a/passt.h b/passt.h
index 031c9b6..4908ed9 100644
--- a/passt.h
+++ b/passt.h
@@ -225,6 +225,7 @@ struct ip6_ctx {
  * @no_dhcpv6:		Disable DHCPv6 server
  * @no_ndp:		Disable NDP handler altogether
  * @no_ra:		Disable router advertisements
+ * @freebind:		Allow binding of non-local addresses for forwarding
  * @low_wmem:		Low probed net.core.wmem_max
  * @low_rmem:		Low probed net.core.rmem_max
  */
@@ -284,6 +285,7 @@ struct ctx {
 	int no_dhcpv6;
 	int no_ndp;
 	int no_ra;
+	int freebind;
 
 	int low_wmem;
 	int low_rmem;
diff --git a/util.c b/util.c
index ebd93ed..eba7d52 100644
--- a/util.c
+++ b/util.c
@@ -52,6 +52,7 @@ int sock_l4_sa(const struct ctx *c, enum epoll_type type,
 {
 	sa_family_t af = ((const struct sockaddr *)sa)->sa_family;
 	union epoll_ref ref = { .type = type, .data = data };
+	bool freebind = false;
 	struct epoll_event ev;
 	int fd, y = 1, ret;
 	uint8_t proto;
@@ -61,8 +62,11 @@ int sock_l4_sa(const struct ctx *c, enum epoll_type type,
 	case EPOLL_TYPE_TCP_LISTEN:
 		proto = IPPROTO_TCP;
 		socktype = SOCK_STREAM | SOCK_NONBLOCK;
+		freebind = c->freebind;
 		break;
 	case EPOLL_TYPE_UDP_LISTEN:
+		freebind = c->freebind;
+		/* fallthrough */
 	case EPOLL_TYPE_UDP_REPLY:
 		proto = IPPROTO_UDP;
 		socktype = SOCK_DGRAM | SOCK_NONBLOCK;
@@ -127,6 +131,18 @@ int sock_l4_sa(const struct ctx *c, enum epoll_type type,
 		}
 	}
 
+	if (freebind) {
+		int level = af == AF_INET ? IPPROTO_IP : IPPROTO_IPV6;
+		int opt = af == AF_INET ? IP_FREEBIND : IPV6_FREEBIND;
+
+		if (setsockopt(fd, level, opt, &y, sizeof(y))) {
+			err_perror("Failed to set %s on socket %i",
+				   af == AF_INET ? "IP_FREEBIND"
+				                 : "IPV6_FREEBIND",
+				   fd);
+		}
+	}
+
 	if (bind(fd, sa, sl) < 0) {
 		/* We'll fail to bind to low ports if we don't have enough
 		 * capabilities, and we'll fail to bind on already bound ports,

From ff63ac922a4017de8a5d384b1c0be36433436ed8 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Thu, 3 Oct 2024 15:14:02 +1000
Subject: [PATCH 047/382] conf: Add --dns-host option to configure host side
 nameserver

When redirecting DNS queries with the --dns-forward option, passt/pasta
needs a host side nameserver to redirect the queries to.  This is
controlled by the c->ip[46].dns_host variables.  These are set to the
first nameserver listed in the host's /etc/resolv.conf, and there isn't
currently a way to override them from the command line.

Prior to 0b25cac9 ("conf: Treat --dns addresses as guest visible
addresses") it was possible to alter this with the -D/--dns option.
However, doing so was confusing and had some nonsensical edge cases because
-D generally takes guest side addresses, rather than host side addresses.

Add a new --dns-host option to restore this functionality in a more
sensible way.

Link: https://bugs.passt.top/show_bug.cgi?id=102
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 conf.c  | 16 ++++++++++++++++
 passt.1 | 17 +++++++++++++----
 2 files changed, 29 insertions(+), 4 deletions(-)

diff --git a/conf.c b/conf.c
index e360fb9..c631019 100644
--- a/conf.c
+++ b/conf.c
@@ -829,6 +829,9 @@ static void usage(const char *name, FILE *f, int status)
 		"  --dns-forward ADDR	Forward DNS queries sent to ADDR\n"
 		"    can be specified zero to two times (for IPv4 and IPv6)\n"
 		"    default: don't forward DNS queries\n"
+		"  --dns-host ADDR	Host nameserver to direct queries to\n"
+		"    can be specified zero to two times (for IPv4 and IPv6)\n"
+		"    default: first nameserver from host's /etc/resolv.conf\n"
 		"  --no-tcp		Disable TCP protocol handler\n"
 		"  --no-udp		Disable UDP protocol handler\n"
 		"  --no-icmp		Disable ICMP/ICMPv6 protocol handler\n"
@@ -1286,6 +1289,7 @@ void conf(struct ctx *c, int argc, char **argv)
 		{"netns-only",	no_argument,		NULL,		20 },
 		{"map-host-loopback", required_argument, NULL,		21 },
 		{"map-guest-addr", required_argument,	NULL,		22 },
+		{"dns-host",	required_argument,	NULL,		24 },
 		{ 0 },
 	};
 	const char *logname = (c->mode == MODE_PASTA) ? "pasta" : "passt";
@@ -1463,6 +1467,18 @@ void conf(struct ctx *c, int argc, char **argv)
 			conf_nat(optarg, &c->ip4.map_guest_addr,
 				 &c->ip6.map_guest_addr, NULL);
 			break;
+		case 24:
+			if (inet_pton(AF_INET6, optarg, &c->ip6.dns_host) &&
+			    !IN6_IS_ADDR_UNSPECIFIED(&c->ip6.dns_host))
+				break;
+
+			if (inet_pton(AF_INET, optarg, &c->ip4.dns_host) &&
+			    !IN4_IS_ADDR_UNSPECIFIED(&c->ip4.dns_host)   &&
+			    !IN4_IS_ADDR_BROADCAST(&c->ip4.dns_host))
+				break;
+
+			die("Invalid host nameserver address: %s", optarg);
+			break;
 		case 'd':
 			c->debug = 1;
 			c->quiet = 0;
diff --git a/passt.1 b/passt.1
index 5ac2962..ef33267 100644
--- a/passt.1
+++ b/passt.1
@@ -249,10 +249,19 @@ the host.
 .TP
 .BR \-\-dns-forward " " \fIaddr
 Map \fIaddr\fR (IPv4 or IPv6) as seen from guest or namespace to the
-first configured DNS resolver (with corresponding IP version). Maps
-only UDP and TCP traffic to port 53 or port 853.  Replies are
-translated back with a reverse mapping.  This option can be specified
-zero to two times (once for IPv4, once for IPv6).
+nameserver (with corresponding IP version) specified by the
+\fB\-\-dns-host\fR option. Maps only UDP and TCP traffic to port 53 or
+port 853.  Replies are translated back with a reverse mapping.  This
+option can be specified zero to two times (once for IPv4, once for
+IPv6).
+
+.TP
+.BR \-\-dns-host " " \fIaddr
+Configure the host nameserver to which guest or namespace queries to
+the \fB\-\-dns-forward\fR address will be redirected. This option can
+be specified zero to two times (once for IPv4, once for IPv6).
+By default, the first nameserver from the host's
+\fI/etc/resolv.conf\fR is used.
 
 .TP
 .BR \-S ", " \-\-search " " \fIlist

From b40880c157ea12ccfc93266cc08252be1aaedaa9 Mon Sep 17 00:00:00 2001
From: Stefano Brivio <sbrivio@redhat.com>
Date: Tue, 8 Oct 2024 22:40:58 +0200
Subject: [PATCH 048/382] test/lib/term: Always use printf for messages with
 escape sequences

...instead of echo: otherwise, bash won't handle escape sequences we
use to colour messages (and 'echo -e' is not specified by POSIX).

Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
---
 test/lib/term | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/test/lib/term b/test/lib/term
index 0fa0936..fcbed16 100755
--- a/test/lib/term
+++ b/test/lib/term
@@ -31,8 +31,8 @@ PR_DELAY_INIT=100 # ms
 # $@:	Message to print
 info() {
 	tmux select-pane -t ${PANE_INFO}
-	echo "${@}" >> $STATEBASE/log_pipe
-	echo "${@}" >> "${LOGFILE}"
+	printf "${@}\n" >> $STATEBASE/log_pipe
+	printf "${@}\n" >> "${LOGFILE}"
 }
 
 # info_n() - Highlight, print message to pane and to log file without newline
@@ -47,13 +47,13 @@ info_n() {
 # $@:	Message to print
 info_nolog() {
 	tmux select-pane -t ${PANE_INFO}
-	echo "${@}" >> $STATEBASE/log_pipe
+	printf "${@}\n" >> $STATEBASE/log_pipe
 }
 
 # info_nolog() - Print message to log file
 # $@:	Message to print
 log() {
-	echo "${@}" >> "${LOGFILE}"
+	printf "${@}\n" >> "${LOGFILE}"
 }
 
 # info_nolog_n() - Send message to pane without highlighting it, without newline

From 7612cb80fe80c089b25245e12a5e934f772480f8 Mon Sep 17 00:00:00 2001
From: Stefano Brivio <sbrivio@redhat.com>
Date: Fri, 4 Oct 2024 18:50:43 +0200
Subject: [PATCH 049/382] test: Pass TRACE from run_term() into ./run from_term

Just like we do for PCAP, DEBUG and KERNEL. Otherwise, running tests
with TRACE=1 will not actually enable tracing output.

Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
---
 test/lib/term | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/lib/term b/test/lib/term
index fcbed16..ed690de 100755
--- a/test/lib/term
+++ b/test/lib/term
@@ -664,7 +664,7 @@ pause_continue() {
 
 # run_term() - Start tmux session, running entry point, with recording if needed
 run_term() {
-	TMUX="tmux new-session -s passt_test -eSTATEBASE=$STATEBASE -ePCAP=$PCAP -eDEBUG=$DEBUG -eKERNEL=$KERNEL"
+	TMUX="tmux new-session -s passt_test -eSTATEBASE=$STATEBASE -ePCAP=$PCAP -eDEBUG=$DEBUG -eTRACE=$TRACE -eKERNEL=$KERNEL"
 
 	if [ ${CI} -eq 1 ]; then
 		printf '\e[8;50;240t'

From 2d7f734c45c64e9d5ddc408a1e13de7d9942bf42 Mon Sep 17 00:00:00 2001
From: Stefano Brivio <sbrivio@redhat.com>
Date: Tue, 15 Oct 2024 00:17:24 +0200
Subject: [PATCH 050/382] tcp: Send "empty" handshake ACK before first data
 segment

Starting from commit 9178a9e3462d ("tcp: Always send an ACK segment
once the handshake is completed"), we always send an ACK segment,
without any payload, to complete the three-way handshake while
establishing a connection started from a socket.

We queue that segment after checking if we already have data to send
to the tap, which means that its sequence number is higher than any
segment with data we're sending in the same iteration, if any data is
available on the socket.

However, in tcp_defer_handler(), we first flush "flags" buffers, that
is, we send out segments without any data first, and then segments
with data, which means that our "empty" ACK is sent before the ACK
segment with data (if any), which has a lower sequence number.

This appears to be harmless, as the guest or container will generally
reorder segments, but it looks rather weird and we can't rule out that
it's actually causing problems.

Queue the empty ACK first, so that it gets a lower sequence number,
before checking for any data from the socket.

Reported-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
---
 tcp.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tcp.c b/tcp.c
index 9617b7a..b2155ab 100644
--- a/tcp.c
+++ b/tcp.c
@@ -1957,11 +1957,12 @@ static void tcp_conn_from_sock_finish(const struct ctx *c,
 		return;
 	}
 
+	tcp_send_flag(c, conn, ACK);
+
 	/* The client might have sent data already, which we didn't
 	 * dequeue waiting for SYN,ACK from tap -- check now.
 	 */
 	tcp_data_from_sock(c, conn);
-	tcp_send_flag(c, conn, ACK);
 }
 
 /**

From f9d677bff6af48b50f3655224e8b0eb8820d3e89 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Fri, 18 Oct 2024 12:35:50 +1100
Subject: [PATCH 051/382] arp: Fix a handful of small warts

This fixes a number of harmless but slightly ugly warts in the ARP
resolution code:
 * Use in4addr_any to represent 0.0.0.0 rather than hand constructing an
   example.
 * When comparing am->sip against 0.0.0.0 use sizeof(am->sip) instead of
   sizeof(am->tip) (same value, but makes more logical sense)
 * Described the guest's assigned address as such, rather than as "our
   address" - that's not usually what we mean by "our address" these days
 * Remove "we might have the same IP address" comment which I can't make
   sense of in context (possibly it's relating to the statement below,
   which already has its own comment?)

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 arp.c | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/arp.c b/arp.c
index 53334da..fc482bb 100644
--- a/arp.c
+++ b/arp.c
@@ -59,14 +59,12 @@ int arp(const struct ctx *c, const struct pool *p)
 	    ah->ar_op  != htons(ARPOP_REQUEST))
 		return 1;
 
-	/* Discard announcements (but not 0.0.0.0 "probes"): we might have the
-	 * same IP address, hide that.
-	 */
-	if (memcmp(am->sip, (unsigned char[4]){ 0 }, sizeof(am->tip)) &&
+	/* Discard announcements, but not 0.0.0.0 "probes" */
+	if (memcmp(am->sip, &in4addr_any, sizeof(am->sip)) &&
 	    !memcmp(am->sip, am->tip, sizeof(am->sip)))
 		return 1;
 
-	/* Don't resolve our own address, either. */
+	/* Don't resolve the guest's assigned address, either. */
 	if (!memcmp(am->tip, &c->ip4.addr, sizeof(am->tip)))
 		return 1;
 

From 75b9c0feb0b54b040a8c49f160cfc2defe28c045 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Fri, 18 Oct 2024 12:35:51 +1100
Subject: [PATCH 052/382] test: Explicitly wait for DAD to complete on SLAAC
 addresses

Getting a SLAAC address takes a little while because the kernel must
complete Duplicate Address Detection (DAD) before marking the address as
ready.  In several places we have an explicit 'sleep 2' to wait for that
to complete.

Fixed-length delays are never a great idea, although this one is pretty
solid.  Still, it would be better to explicitly wait for DAD to complete
in case of long delays (which might happen on slow emulated hosts, or with
heavy load), and to speed the tests up if DAD completes quicker.

Replace the fixed sleeps with a loop waiting for DAD to complete.  We do
this by looping until all tentative addresses have disappeared.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 test/passt/ndp        | 4 +++-
 test/pasta/ndp        | 3 ++-
 test/two_guests/basic | 3 ++-
 3 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/test/passt/ndp b/test/passt/ndp
index 6bf8af3..f54b8ce 100644
--- a/test/passt/ndp
+++ b/test/passt/ndp
@@ -16,7 +16,9 @@ htools	ip jq sipcalc grep cut
 
 test	Interface name
 gout	IFNAME ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname'
-guest	ip link set dev __IFNAME__ up && sleep 2
+guest	ip link set dev __IFNAME__ up
+# Wait for DAD to complete
+guest	while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done
 hout	HOST_IFNAME6 ip -j -6 route show|jq -rM '[.[] | select(.dst == "default").dev] | .[0]'
 check	[ -n "__IFNAME__" ]
 
diff --git a/test/pasta/ndp b/test/pasta/ndp
index d45ff7b..c59627f 100644
--- a/test/pasta/ndp
+++ b/test/pasta/ndp
@@ -18,7 +18,8 @@ test	Interface name
 nsout	IFNAME ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname'
 check	[ -n "__IFNAME__" ]
 ns	ip link set dev __IFNAME__ up
-sleep	2
+# Wait for DAD to complete
+ns	while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done
 
 test	SLAAC: prefix
 nsout	ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__IFNAME__").addr_info[] | select(.scope == "global" and .prefixlen == 64).local] | .[0]'
diff --git a/test/two_guests/basic b/test/two_guests/basic
index 4d49e85..ac50ff8 100644
--- a/test/two_guests/basic
+++ b/test/two_guests/basic
@@ -36,7 +36,8 @@ check	[ "__ADDR2__" = "__HOST_ADDR__" ]
 
 test	DHCPv6: addresses
 # Link is up now, wait for DAD to complete
-sleep	2
+guest1	while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done
+guest2	while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done
 guest1	/sbin/dhclient -6 __IFNAME1__
 guest2	/sbin/dhclient -6 __IFNAME2__
 g1out	ADDR1_6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__IFNAME1__").addr_info[] | select(.prefixlen == 128).local] | .[0]'

From 53176ca91d176ea15d8abf3b1429e43bc93e516c Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Fri, 18 Oct 2024 12:35:52 +1100
Subject: [PATCH 053/382] test: Wait for DAD on DHCPv6 addresses

After running dhclient -6 we expect the DHCPv6 assigned address to be
immediately usable.  That's true with the Fedora dhclient-script (and the
upstream ISC DHCP one), however it's not true with the Debian
dhclient-script.  The Debian script can complete with the address still
in "tentative" state, and the address won't be usable until Duplicate
Address Detection (DAD) completes.  That's arguably a bug in Debian (see
link below), but for the time being we need to work around it anyway.

We usually get away with this, because by the time we do anything where the
address matters, DAD has completed.  However, it's not robust, so we should
explicitly wait for DAD to complete when we get a DHCPv6 address.

Link: https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=1085231

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 test/passt/dhcp       | 2 ++
 test/passt_in_ns/dhcp | 2 ++
 test/pasta/dhcp       | 2 ++
 test/perf/passt_tcp   | 2 ++
 test/two_guests/basic | 3 +++
 5 files changed, 11 insertions(+)

diff --git a/test/passt/dhcp b/test/passt/dhcp
index e05a4bb..9925ab9 100644
--- a/test/passt/dhcp
+++ b/test/passt/dhcp
@@ -49,6 +49,8 @@ check	[ "__SEARCH__" = "__HOST_SEARCH__" ]
 
 test	DHCPv6: address
 guest	/sbin/dhclient -6 __IFNAME__
+# Wait for DAD to complete
+guest	while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done
 gout	ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__IFNAME__").addr_info[] | select(.prefixlen == 128).local] | .[0]'
 hout	HOST_ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__HOST_IFNAME6__").addr_info[] | select(.scope == "global" and .deprecated != true).local] | .[0]'
 check	[ "__ADDR6__" = "__HOST_ADDR6__" ]
diff --git a/test/passt_in_ns/dhcp b/test/passt_in_ns/dhcp
index 0ceed7c..a38a690 100644
--- a/test/passt_in_ns/dhcp
+++ b/test/passt_in_ns/dhcp
@@ -52,6 +52,8 @@ check	[ "__SEARCH__" = "__HOST_SEARCH__" ]
 
 test	DHCPv6: address
 guest	/sbin/dhclient -6 __IFNAME__
+# Wait for DAD to complete
+guest	while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done
 gout	ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__IFNAME__").addr_info[] | select(.prefixlen == 128).local] | .[0]'
 hout	HOST_ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__HOST_IFNAME6__").addr_info[] | select(.scope == "global" and .deprecated != true).local] | .[0]'
 check	[ "__ADDR6__" = "__HOST_ADDR6__" ]
diff --git a/test/pasta/dhcp b/test/pasta/dhcp
index 41556b8..d4f3ad5 100644
--- a/test/pasta/dhcp
+++ b/test/pasta/dhcp
@@ -35,6 +35,8 @@ check	[ __MTU__ = 65520 ]
 
 test	DHCPv6: address
 ns	/sbin/dhclient -6 --no-pid __IFNAME__
+# Wait for DAD to complete
+ns	while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done
 hout	HOST_IFNAME6 ip -j -6 route show|jq -rM '[.[] | select(.dst == "default").dev] | .[0]'
 nsout	ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__IFNAME__").addr_info[] | select(.prefixlen == 128).local] | .[0]'
 hout	HOST_ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__HOST_IFNAME6__").addr_info[] | select(.scope == "global" and .deprecated != true).local] | .[0]'
diff --git a/test/perf/passt_tcp b/test/perf/passt_tcp
index 089d953..5978c49 100644
--- a/test/perf/passt_tcp
+++ b/test/perf/passt_tcp
@@ -116,6 +116,8 @@ iperf3k	ns
 # Reducing MTU below 1280 deconfigures IPv6, get our address back
 guest	dhclient -6 -x
 guest	dhclient -6 __IFNAME__
+# Wait for DAD to complete
+guest	while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done
 
 tl	TCP RR latency over IPv4: guest to host
 lat	-
diff --git a/test/two_guests/basic b/test/two_guests/basic
index ac50ff8..9ba5efe 100644
--- a/test/two_guests/basic
+++ b/test/two_guests/basic
@@ -40,6 +40,9 @@ guest1	while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1;
 guest2	while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done
 guest1	/sbin/dhclient -6 __IFNAME1__
 guest2	/sbin/dhclient -6 __IFNAME2__
+# Wait for DAD to complete on the DHCP address
+guest1	while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done
+guest2	while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done
 g1out	ADDR1_6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__IFNAME1__").addr_info[] | select(.prefixlen == 128).local] | .[0]'
 g2out	ADDR2_6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__IFNAME2__").addr_info[] | select(.prefixlen == 128).local] | .[0]'
 hout	HOST_ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__HOST_IFNAME6__").addr_info[] | select(.scope == "global" and .deprecated != true).local] | .[0]'

From ef8a5161d0d83193cadc965f6a8951fe92659996 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Fri, 18 Oct 2024 12:35:53 +1100
Subject: [PATCH 054/382] passt.1: Mark --stderr as deprecated more prominently

The description of this option says that it's deprecated, but unlike
--no-copy-addrs and --no-copy-routes it doesn't have a clear label.  Add
one to make it easier to spot.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 passt.1 | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/passt.1 b/passt.1
index ef33267..c573788 100644
--- a/passt.1
+++ b/passt.1
@@ -95,7 +95,7 @@ detached PID namespace after starting, because the PID itself cannot change.
 Default is to fork into background.
 
 .TP
-.BR \-e ", " \-\-stderr
+.BR \-e ", " \-\-stderr " " (DEPRECATED)
 This option has no effect, and is maintained for compatibility purposes only.
 
 Note that this configuration option is \fBdeprecated\fR and will be removed in a

From 1fa421192c7f11f071d11a7aba1bb1f5cdf4a604 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Fri, 18 Oct 2024 12:35:54 +1100
Subject: [PATCH 055/382] passt.1: Clarify and update "Handling of local
 addresses" section

This section didn't mention the effect of the --map-host-loopback option
which now alters this behaviour.  Update it accordingly.

It used "local addresses" to mean specifically 127.0.0.0/8 and ::1.
However, "local" could also refer to link-local addresses or to addresses
of any scope which happen to be configured on the host.  Use "loopback
address" to be more precise about this.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 passt.1 | 48 +++++++++++++++++++++++++-----------------------
 1 file changed, 25 insertions(+), 23 deletions(-)

diff --git a/passt.1 b/passt.1
index c573788..46100e2 100644
--- a/passt.1
+++ b/passt.1
@@ -882,38 +882,40 @@ root@localhost's password:
 
 .SH NOTES
 
-.SS Handling of traffic with local destination and source addresses
+.SS Handling of traffic with loopback destination and source addresses
 
-Both \fBpasst\fR and \fBpasta\fR can bind on ports with a local address,
-depending on the configuration. Local destination or source addresses need to be
-changed before packets are delivered to the guest or target namespace: most
-operating systems would drop packets received from non-loopback interfaces with
-local addresses, and it would also be impossible for guest or target namespace
-to route answers back.
+Both \fBpasst\fR and \fBpasta\fR can bind on ports with a loopback
+address (127.0.0.0/8 or ::1), depending on the configuration. Loopback
+destination or source addresses need to be changed before packets are
+delivered to the guest or target namespace: most operating systems
+would drop packets received with loopback addresses on non-loopback
+interfaces, and it would also be impossible for guest or target
+namespace to route answers back.
 
-For convenience, and somewhat arbitrarily, the source address on these packets
-is translated to the address of the default IPv4 or IPv6 gateway (if any) --
-this is known to be an existing, valid address on the same subnet.
+For convenience, the source address on these packets is translated to
+the address specified by the \fB\-\-map-host-loopback\fR option.  If
+not specified this defaults, somewhat arbitrarily, to the address of
+default IPv4 or IPv6 gateway (if any) -- this is known to be an
+existing, valid address on the same subnet.  If \fB\-\-no-map-gw\fR or
+\fB\-\-map-host-loopback none\fR are specified this translation is
+disabled and packets with loopback addresses are simply dropped.
 
-Loopback destination addresses are instead translated to the observed external
-address of the guest or target namespace. For IPv6 packets, if usage of a
-link-local address by guest or namespace has ever been observed, and the
-original destination address is also a link-local address, the observed
-link-local address is used. Otherwise, the observed global address is used. For
-both IPv4 and IPv6, if no addresses have been seen yet, the configured addresses
-will be used instead.
+Loopback destination addresses are translated to the observed external
+address of the guest or target namespace. For IPv6, the observed
+link-local address is used if the translated source address is
+link-local, otherwise the observed global address is used. For both
+IPv4 and IPv6, if no addresses have been seen yet, the configured
+addresses will be used instead.
 
 For example, if \fBpasst\fR or \fBpasta\fR receive a connection from 127.0.0.1,
 with destination 127.0.0.10, and the default IPv4 gateway is 192.0.2.1, while
 the last observed source address from guest or namespace is 192.0.2.2, this will
 be translated to a connection from 192.0.2.1 to 192.0.2.2.
 
-Similarly, for traffic coming from guest or namespace, packets with destination
-address corresponding to the default gateway will have their destination address
-translated to a loopback address, if and only if a packet, in the opposite
-direction, with a loopback destination or source address, port-wise matching for
-UDP, or connection-wise for TCP, has been recently forwarded to guest or
-namespace. This behaviour can be disabled with \-\-no\-map\-gw.
+Similarly, for traffic coming from guest or namespace, packets with
+destination address corresponding to the \fB\-\-map-host-loopback\fR
+address will have their destination address translated to a loopback
+address.
 
 .SS Handling of local traffic in pasta
 

From 58e6d685995f7b1068357a00e2618627d17fa8f5 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Fri, 18 Oct 2024 12:35:55 +1100
Subject: [PATCH 056/382] test: Clarify test for spliced inbound transfers

The tests in pasta/tcp and pasta/udp for inbound transfers have the server
listening within the namespace explicitly bound to 127.0.0.1 or ::1.  This
only works because of the behaviour of inbound splice connections, which
always appear with both source and destination addresses as loopback in
the namespace.  That's not an inherent property for "spliced" connections
and arguably an undesirable one.  Also update the test names to make it
clearer that these tests are expecting to exercise the "splice" path.

Interestingly this was already correct for the equivalent passt_in_ns/*,
although we also update the test names for clarity there.

Note that there are similar issues in some of the podman tests, addressed
in https://github.com/containers/podman/pull/24064

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 test/passt_in_ns/tcp |  8 ++++----
 test/passt_in_ns/udp |  4 ++--
 test/pasta/tcp       | 16 ++++++++--------
 test/pasta/udp       |  8 ++++----
 4 files changed, 18 insertions(+), 18 deletions(-)

diff --git a/test/passt_in_ns/tcp b/test/passt_in_ns/tcp
index aaf340e..319880b 100644
--- a/test/passt_in_ns/tcp
+++ b/test/passt_in_ns/tcp
@@ -32,7 +32,7 @@ host	socat -u OPEN:__BASEPATH__/big.bin TCP4:127.0.0.1:10001
 guestw
 guest	cmp test_big.bin /root/big.bin
 
-test	TCP/IPv4: host to ns: big transfer
+test	TCP/IPv4: host to ns (spliced): big transfer
 nsb	socat -u TCP4-LISTEN:10002 OPEN:__TEMP_NS_BIG__,create,trunc
 sleep	1
 host	socat -u OPEN:__BASEPATH__/big.bin TCP4:127.0.0.1:10002
@@ -90,7 +90,7 @@ host	socat -u OPEN:__BASEPATH__/small.bin TCP4:127.0.0.1:10001
 guestw
 guest	cmp test_small.bin /root/small.bin
 
-test	TCP/IPv4: host to ns: small transfer
+test	TCP/IPv4: host to ns (spliced): small transfer
 nsb	socat -u TCP4-LISTEN:10002 OPEN:__TEMP_NS_SMALL__,create,trunc
 sleep	1
 host	socat -u OPEN:__BASEPATH__/small.bin TCP4:127.0.0.1:10002
@@ -146,7 +146,7 @@ host	socat -u OPEN:__BASEPATH__/big.bin TCP6:[::1]:10001
 guestw
 guest	cmp test_big.bin /root/big.bin
 
-test	TCP/IPv6: host to ns: big transfer
+test	TCP/IPv6: host to ns (spliced): big transfer
 nsb	socat -u TCP6-LISTEN:10002 OPEN:__TEMP_NS_BIG__,create,trunc
 sleep	1
 host	socat -u OPEN:__BASEPATH__/big.bin TCP6:[::1]:10002
@@ -204,7 +204,7 @@ host	socat -u OPEN:__BASEPATH__/small.bin TCP6:[::1]:10001
 guestw
 guest	cmp test_small.bin /root/small.bin
 
-test	TCP/IPv6: host to ns: small transfer
+test	TCP/IPv6: host to ns (spliced): small transfer
 nsb	socat -u TCP6-LISTEN:10002 OPEN:__TEMP_NS_SMALL__,create,trunc
 sleep	1
 host	socat -u OPEN:__BASEPATH__/small.bin TCP6:[::1]:10002
diff --git a/test/passt_in_ns/udp b/test/passt_in_ns/udp
index 3426ab9..791511c 100644
--- a/test/passt_in_ns/udp
+++ b/test/passt_in_ns/udp
@@ -30,7 +30,7 @@ host	socat -u OPEN:__BASEPATH__/medium.bin UDP4:127.0.0.1:10001,shut-null
 guestw
 guest	cmp test.bin /root/medium.bin
 
-test	UDP/IPv4: host to ns
+test	UDP/IPv4: host to ns (recvmmsg/sendmmsg)
 nsb	socat -u UDP4-LISTEN:10002,null-eof OPEN:__TEMP_NS__,create,trunc
 sleep	1
 host	socat -u OPEN:__BASEPATH__/medium.bin UDP4:127.0.0.1:10002,shut-null
@@ -88,7 +88,7 @@ host	socat -u OPEN:__BASEPATH__/medium.bin UDP6:[::1]:10001,shut-null
 guestw
 guest	cmp test.bin /root/medium.bin
 
-test	UDP/IPv6: host to ns
+test	UDP/IPv6: host to ns (recvmmsg/sendmmsg)
 nsb	socat -u UDP6-LISTEN:10002,null-eof OPEN:__TEMP_NS__,create,trunc
 sleep	1
 host	socat -u OPEN:__BASEPATH__/medium.bin UDP6:[::1]:10002,shut-null
diff --git a/test/pasta/tcp b/test/pasta/tcp
index 6ab18c5..53b6f25 100644
--- a/test/pasta/tcp
+++ b/test/pasta/tcp
@@ -19,8 +19,8 @@ set	TEMP_NS_BIG __STATEDIR__/test_ns_big.bin
 set	TEMP_SMALL __STATEDIR__/test_small.bin
 set	TEMP_NS_SMALL __STATEDIR__/test_ns_small.bin
 
-test	TCP/IPv4: host to ns: big transfer
-nsb	socat -u TCP4-LISTEN:10002,bind=127.0.0.1 OPEN:__TEMP_NS_BIG__,create,trunc
+test	TCP/IPv4: host to ns (spliced): big transfer
+nsb	socat -u TCP4-LISTEN:10002 OPEN:__TEMP_NS_BIG__,create,trunc
 host	socat -u OPEN:__BASEPATH__/big.bin TCP4:127.0.0.1:10002
 nsw
 check	cmp __BASEPATH__/big.bin __TEMP_NS_BIG__
@@ -38,8 +38,8 @@ ns	socat -u OPEN:__BASEPATH__/big.bin TCP4:__GW__:10003
 hostw
 check	cmp __BASEPATH__/big.bin __TEMP_BIG__
 
-test	TCP/IPv4: host to ns: small transfer
-nsb	socat -u TCP4-LISTEN:10002,bind=127.0.0.1 OPEN:__TEMP_NS_SMALL__,create,trunc
+test	TCP/IPv4: host to ns (spliced): small transfer
+nsb	socat -u TCP4-LISTEN:10002 OPEN:__TEMP_NS_SMALL__,create,trunc
 host	socat OPEN:__BASEPATH__/small.bin TCP4:127.0.0.1:10002
 nsw
 check	cmp __BASEPATH__/small.bin __TEMP_NS_SMALL__
@@ -57,8 +57,8 @@ ns	socat -u OPEN:__BASEPATH__/small.bin TCP4:__GW__:10003
 hostw
 check	cmp __BASEPATH__/small.bin __TEMP_SMALL__
 
-test	TCP/IPv6: host to ns: big transfer
-nsb	socat -u TCP6-LISTEN:10002,bind=[::1] OPEN:__TEMP_NS_BIG__,create,trunc
+test	TCP/IPv6: host to ns (spliced): big transfer
+nsb	socat -u TCP6-LISTEN:10002 OPEN:__TEMP_NS_BIG__,create,trunc
 host	socat -u OPEN:__BASEPATH__/big.bin TCP6:[::1]:10002
 nsw
 check	cmp __BASEPATH__/big.bin __TEMP_NS_BIG__
@@ -77,8 +77,8 @@ ns	socat -u OPEN:__BASEPATH__/big.bin TCP6:[__GW6__%__IFNAME__]:10003
 hostw
 check	cmp __BASEPATH__/big.bin __TEMP_BIG__
 
-test	TCP/IPv6: host to ns: small transfer
-nsb	socat -u TCP6-LISTEN:10002,bind=[::1] OPEN:__TEMP_NS_SMALL__,create,trunc
+test	TCP/IPv6: host to ns (spliced): small transfer
+nsb	socat -u TCP6-LISTEN:10002 OPEN:__TEMP_NS_SMALL__,create,trunc
 host	socat -u OPEN:__BASEPATH__/small.bin TCP6:[::1]:10002
 nsw
 check	cmp __BASEPATH__/small.bin __TEMP_NS_SMALL__
diff --git a/test/pasta/udp b/test/pasta/udp
index 30e3a85..7734d02 100644
--- a/test/pasta/udp
+++ b/test/pasta/udp
@@ -17,8 +17,8 @@ htools	dd socat ip jq
 set	TEMP __STATEDIR__/test.bin
 set	TEMP_NS __STATEDIR__/test_ns.bin
 
-test	UDP/IPv4: host to ns
-nsb	socat -u UDP4-LISTEN:10002,bind=127.0.0.1,null-eof OPEN:__TEMP_NS__,create,trunc
+test	UDP/IPv4: host to ns (recvmmsg/sendmmsg)
+nsb	socat -u UDP4-LISTEN:10002,null-eof OPEN:__TEMP_NS__,create,trunc
 host	socat OPEN:__BASEPATH__/medium.bin UDP4:127.0.0.1:10002,shut-null
 nsw
 check	cmp __BASEPATH__/medium.bin __TEMP_NS__
@@ -37,8 +37,8 @@ ns	socat -u OPEN:__BASEPATH__/medium.bin UDP4:__GW__:10003,shut-null
 hostw
 check	cmp __BASEPATH__/medium.bin __TEMP__
 
-test	UDP/IPv6: host to ns
-nsb	socat -u UDP6-LISTEN:10002,bind=[::1],null-eof OPEN:__TEMP_NS__,create,trunc
+test	UDP/IPv6: host to ns (recvmmsg/sendmmsg)
+nsb	socat -u UDP6-LISTEN:10002,null-eof OPEN:__TEMP_NS__,create,trunc
 host	socat -u OPEN:__BASEPATH__/medium.bin UDP6:[::1]:10002,shut-null
 nsw
 check	cmp __BASEPATH__/medium.bin __TEMP_NS__

From b4dace8f462b346ae2135af1f8d681a99a849a5f Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Fri, 18 Oct 2024 12:35:56 +1100
Subject: [PATCH 057/382] fwd: Direct inbound spliced forwards to the guest's
 external address

In pasta mode, where addressing permits we "splice" connections, forwarding
directly from host socket to guest/container socket without any L2 or L3
processing.  This gives us a very large performance improvement when it's
possible.

Since the traffic is from a local socket within the guest, it will go over
the guest's 'lo' interface, and accordingly we set the guest side address
to be the loopback address.  However this has a surprising side effect:
sometimes guests will run services that are only supposed to be used within
the guest and are therefore bound to only 127.0.0.1 and/or ::1.  pasta's
forwarding exposes those services to the host, which isn't generally what
we want.

Correct this by instead forwarding inbound "splice" flows to the guest's
external address.

Link: https://github.com/containers/podman/issues/24045
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 conf.c  |  9 +++++++++
 fwd.c   | 31 +++++++++++++++++++++++--------
 passt.1 | 23 +++++++++++++++++++----
 passt.h |  2 ++
 4 files changed, 53 insertions(+), 12 deletions(-)

diff --git a/conf.c b/conf.c
index c631019..b3b5342 100644
--- a/conf.c
+++ b/conf.c
@@ -912,6 +912,9 @@ pasta_opts:
 		"  -U, --udp-ns SPEC	UDP port forwarding to init namespace\n"
 		"    SPEC is as described above\n"
 		"    default: auto\n"
+		"  --host-lo-to-ns-lo	DEPRECATED:\n"
+		"			Translate host-loopback forwards to\n"
+		"			namespace loopback\n"
 		"  --userns NSPATH 	Target user namespace to join\n"
 		"  --netns PATH|NAME	Target network namespace to join\n"
 		"  --netns-only		Don't join existing user namespace\n"
@@ -1289,6 +1292,7 @@ void conf(struct ctx *c, int argc, char **argv)
 		{"netns-only",	no_argument,		NULL,		20 },
 		{"map-host-loopback", required_argument, NULL,		21 },
 		{"map-guest-addr", required_argument,	NULL,		22 },
+		{"host-lo-to-ns-lo", no_argument, 	NULL,		23 },
 		{"dns-host",	required_argument,	NULL,		24 },
 		{ 0 },
 	};
@@ -1467,6 +1471,11 @@ void conf(struct ctx *c, int argc, char **argv)
 			conf_nat(optarg, &c->ip4.map_guest_addr,
 				 &c->ip6.map_guest_addr, NULL);
 			break;
+		case 23:
+			if (c->mode != MODE_PASTA)
+				die("--host-lo-to-ns-lo is for pasta mode only");
+			c->host_lo_to_ns_lo = 1;
+			break;
 		case 24:
 			if (inet_pton(AF_INET6, optarg, &c->ip6.dns_host) &&
 			    !IN6_IS_ADDR_UNSPECIFIED(&c->ip6.dns_host))
diff --git a/fwd.c b/fwd.c
index a505098..c71f5e1 100644
--- a/fwd.c
+++ b/fwd.c
@@ -447,20 +447,35 @@ uint8_t fwd_nat_from_host(const struct ctx *c, uint8_t proto,
 	    (proto == IPPROTO_TCP || proto == IPPROTO_UDP)) {
 		/* spliceable */
 
-		/* Preserve the specific loopback adddress used, but let the
-		 * kernel pick a source port on the target side
+		/* The traffic will go over the guest's 'lo' interface, but by
+		 * default use its external address, so we don't inadvertently
+		 * expose services that listen only on the guest's loopback
+		 * address.  That can be overridden by --host-lo-to-ns-lo which
+		 * will instead forward to the loopback address in the guest.
+		 *
+		 * In either case, let the kernel pick the source address to
+		 * match.
 		 */
-		tgt->oaddr = ini->eaddr;
+		if (inany_v4(&ini->eaddr)) {
+			if (c->host_lo_to_ns_lo)
+				tgt->eaddr = inany_loopback4;
+			else
+				tgt->eaddr = inany_from_v4(c->ip4.addr_seen);
+			tgt->oaddr = inany_any4;
+		} else {
+			if (c->host_lo_to_ns_lo)
+				tgt->eaddr = inany_loopback6;
+			else
+				tgt->eaddr.a6 = c->ip6.addr_seen;
+			tgt->oaddr = inany_any6;
+		}
+
+		/* Let the kernel pick source port */
 		tgt->oport = 0;
 		if (proto == IPPROTO_UDP)
 			/* But for UDP preserve the source port */
 			tgt->oport = ini->eport;
 
-		if (inany_v4(&ini->eaddr))
-			tgt->eaddr = inany_loopback4;
-		else
-			tgt->eaddr = inany_loopback6;
-
 		return PIF_SPLICE;
 	}
 
diff --git a/passt.1 b/passt.1
index 46100e2..f084978 100644
--- a/passt.1
+++ b/passt.1
@@ -605,6 +605,13 @@ Configure UDP port forwarding from target namespace to init namespace.
 
 Default is \fBauto\fR.
 
+.TP
+.BR \-\-host-lo-to-ns-lo " " (DEPRECATED)
+If specified, connections forwarded with \fB\-t\fR and \fB\-u\fR from
+the host's loopback address will appear on the loopback address in the
+guest as well.  Without this option such forwarded packets will appear
+to come from the guest's public address.
+
 .TP
 .BR \-\-userns " " \fIspec
 Target user namespace to join, as a path. If PID is given, without this option,
@@ -893,8 +900,9 @@ interfaces, and it would also be impossible for guest or target
 namespace to route answers back.
 
 For convenience, the source address on these packets is translated to
-the address specified by the \fB\-\-map-host-loopback\fR option.  If
-not specified this defaults, somewhat arbitrarily, to the address of
+the address specified by the \fB\-\-map-host-loopback\fR option (with
+some exceptions in pasta mode, see next section below).  If not
+specified this defaults, somewhat arbitrarily, to the address of
 default IPv4 or IPv6 gateway (if any) -- this is known to be an
 existing, valid address on the same subnet.  If \fB\-\-no-map-gw\fR or
 \fB\-\-map-host-loopback none\fR are specified this translation is
@@ -931,8 +939,15 @@ and the new socket using the \fBsplice\fR(2) system call, and for UDP, a pair
 of \fBrecvmmsg\fR(2) and \fBsendmmsg\fR(2) system calls deals with packet
 transfers.
 
-This bypass only applies to local connections and traffic, because it's not
-possible to bind sockets to foreign addresses.
+Because it's not possible to bind sockets to foreign addresses, this
+bypass only applies to local connections and traffic.  It also means
+that the address translation differs slightly from passt mode.
+Connections from loopback to loopback on the host will appear to come
+from the target namespace's public address within the guest, unless
+\fB\-\-host-lo-to-ns-lo\fR is specified, in which case they will
+appear to come from loopback in the namespace as well.  The latter
+behaviour used to be the default, but is usually undesirable, since it
+can unintentionally expose namespace local services to the host.
 
 .SS Binding to low numbered ports (well-known or system ports, up to 1023)
 
diff --git a/passt.h b/passt.h
index 4908ed9..72c7f72 100644
--- a/passt.h
+++ b/passt.h
@@ -225,6 +225,7 @@ struct ip6_ctx {
  * @no_dhcpv6:		Disable DHCPv6 server
  * @no_ndp:		Disable NDP handler altogether
  * @no_ra:		Disable router advertisements
+ * @host_lo_to_ns_lo:	Map host loopback addresses to ns loopback addresses
  * @freebind:		Allow binding of non-local addresses for forwarding
  * @low_wmem:		Low probed net.core.wmem_max
  * @low_rmem:		Low probed net.core.rmem_max
@@ -285,6 +286,7 @@ struct ctx {
 	int no_dhcpv6;
 	int no_ndp;
 	int no_ra;
+	int host_lo_to_ns_lo;
 	int freebind;
 
 	int low_wmem;

From 9e5df350d63b0819f04b44bb57ea146274a6b42f Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Mon, 21 Oct 2024 18:40:29 +1100
Subject: [PATCH 058/382] tcp: Use structures to construct initial TCP options

As a rule, we prefer constructing packets with matching C structures,
rather than building them byte by byte.  However, one case we still build
byte by byte is the TCP options we include in SYN packets (in fact the only
time we generate TCP options on the tap interface).

Rework this to use a structure and initialisers which make it a bit
clearer what's going on.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 tcp.c          | 17 +++----------
 tcp_buf.c      |  2 +-
 tcp_internal.h | 68 ++++++++++++++++++++++++++++++++++++++++++++++----
 3 files changed, 67 insertions(+), 20 deletions(-)

diff --git a/tcp.c b/tcp.c
index b2155ab..0d22e07 100644
--- a/tcp.c
+++ b/tcp.c
@@ -1232,7 +1232,7 @@ static void tcp_update_seqack_from_tap(const struct ctx *c,
  *	     1 otherwise
  */
 int tcp_prepare_flags(const struct ctx *c, struct tcp_tap_conn *conn,
-		      int flags, struct tcphdr *th, char *data,
+		      int flags, struct tcphdr *th, struct tcp_syn_opts *opts,
 		      size_t *optlen)
 {
 	struct tcp_info tinfo = { 0 };
@@ -1258,12 +1258,6 @@ int tcp_prepare_flags(const struct ctx *c, struct tcp_tap_conn *conn,
 	if (flags & SYN) {
 		int mss;
 
-		/* Options: MSS, NOP and window scale (8 bytes) */
-		*optlen = OPT_MSS_LEN + 1 + OPT_WS_LEN;
-
-		*data++ = OPT_MSS;
-		*data++ = OPT_MSS_LEN;
-
 		if (c->mtu == -1) {
 			mss = tinfo.tcpi_snd_mss;
 		} else {
@@ -1279,16 +1273,11 @@ int tcp_prepare_flags(const struct ctx *c, struct tcp_tap_conn *conn,
 			else if (mss > PAGE_SIZE)
 				mss = ROUND_DOWN(mss, PAGE_SIZE);
 		}
-		*(uint16_t *)data = htons(MIN(USHRT_MAX, mss));
-
-		data += OPT_MSS_LEN - 2;
 
 		conn->ws_to_tap = MIN(MAX_WS, tinfo.tcpi_snd_wscale);
 
-		*data++ = OPT_NOP;
-		*data++ = OPT_WS;
-		*data++ = OPT_WS_LEN;
-		*data++ = conn->ws_to_tap;
+		*opts = TCP_SYN_OPTS(mss, conn->ws_to_tap);
+		*optlen = sizeof(*opts);
 	} else if (!(flags & RST)) {
 		flags |= ACK;
 	}
diff --git a/tcp_buf.c b/tcp_buf.c
index 238827b..44df0e4 100644
--- a/tcp_buf.c
+++ b/tcp_buf.c
@@ -282,7 +282,7 @@ int tcp_buf_send_flag(const struct ctx *c, struct tcp_tap_conn *conn, int flags)
 
 	seq = conn->seq_to_tap;
 	ret = tcp_prepare_flags(c, conn, flags, &payload->th,
-				payload->opts, &optlen);
+				&payload->opts, &optlen);
 	if (ret <= 0) {
 		if (CONN_V4(conn))
 			tcp4_flags_used--;
diff --git a/tcp_internal.h b/tcp_internal.h
index 2f74ffe..1ab8ce2 100644
--- a/tcp_internal.h
+++ b/tcp_internal.h
@@ -33,9 +33,7 @@
 #define OPT_EOL		0
 #define OPT_NOP		1
 #define OPT_MSS		2
-#define OPT_MSS_LEN	4
 #define OPT_WS		3
-#define OPT_WS_LEN	3
 #define OPT_SACKP	4
 #define OPT_SACK	5
 #define OPT_TS		8
@@ -77,6 +75,65 @@ struct tcp_payload_t {
 } __attribute__ ((packed, aligned(__alignof__(unsigned int))));
 #endif
 
+/** struct tcp_opt_nop - TCP NOP option
+ * @kind:	Option kind (OPT_NOP = 1)
+ */
+struct tcp_opt_nop {
+	uint8_t kind;
+} __attribute__ ((packed));
+#define TCP_OPT_NOP		((struct tcp_opt_nop){ .kind = OPT_NOP, })
+
+/** struct tcp_opt_mss - TCP MSS option
+ * @kind:	Option kind (OPT_MSS == 2)
+ * @len:	Option length (4)
+ * @mss:	Maximum Segment Size
+ */
+struct tcp_opt_mss {
+	uint8_t kind;
+	uint8_t len;
+	uint16_t mss;
+} __attribute__ ((packed));
+#define TCP_OPT_MSS(mss_)				\
+	((struct tcp_opt_mss) {				\
+		.kind = OPT_MSS,			\
+		.len = sizeof(struct tcp_opt_mss),	\
+		.mss = htons(mss_),			\
+	})
+
+/** struct tcp_opt_ws - TCP Window Scaling option
+ * @kind:	Option kind (OPT_WS == 3)
+ * @len:	Option length (3)
+ * @shift:	Window scaling shift
+ */
+struct tcp_opt_ws {
+	uint8_t kind;
+	uint8_t len;
+	uint8_t shift;
+} __attribute__ ((packed));
+#define TCP_OPT_WS(shift_)				\
+	((struct tcp_opt_ws) {				\
+		.kind = OPT_WS,				\
+		.len = sizeof(struct tcp_opt_ws),	\
+		.shift = (shift_),			\
+	})
+
+/** struct tcp_syn_opts - TCP options we apply to SYN packets
+ * @mss:	Maximum Segment Size (MSS) option
+ * @nop:	NOP opt (for alignment)
+ * @ws:		Window Scaling (WS) option
+ */
+struct tcp_syn_opts {
+	struct tcp_opt_mss mss;
+	struct tcp_opt_nop nop;
+	struct tcp_opt_ws ws;
+} __attribute__ ((packed));
+#define TCP_SYN_OPTS(mss_, ws_)				\
+	((struct tcp_syn_opts){				\
+		.mss = TCP_OPT_MSS(mss_),		\
+		.nop = TCP_OPT_NOP,			\
+		.ws = TCP_OPT_WS(ws_),			\
+	})
+
 /**
  * struct tcp_flags_t - TCP header and data to send zero-length
  *                      segments (flags)
@@ -85,7 +142,7 @@ struct tcp_payload_t {
  */
 struct tcp_flags_t {
 	struct tcphdr th;
-	char opts[OPT_MSS_LEN + OPT_WS_LEN + 1];
+	struct tcp_syn_opts opts;
 #ifdef __AVX2__
 } __attribute__ ((packed, aligned(32)));
 #else
@@ -124,7 +181,8 @@ size_t tcp_l2_buf_fill_headers(const struct tcp_tap_conn *conn,
 			       bool no_tcp_csum);
 int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_tap_conn *conn,
 			  bool force_seq, struct tcp_info *tinfo);
-int tcp_prepare_flags(const struct ctx *c, struct tcp_tap_conn *conn, int flags,
-		      struct tcphdr *th, char *data, size_t *optlen);
+int tcp_prepare_flags(const struct ctx *c, struct tcp_tap_conn *conn,
+		      int flags, struct tcphdr *th, struct tcp_syn_opts *opts,
+		      size_t *optlen);
 
 #endif /* TCP_INTERNAL_H */

From 149f457b23ed2cb196eed1b3f413b4a900f39547 Mon Sep 17 00:00:00 2001
From: Stefano Brivio <sbrivio@redhat.com>
Date: Thu, 24 Oct 2024 09:12:11 +0200
Subject: [PATCH 059/382] tcp_splice: splice() all we have to the writing side,
 not what we just read

In tcp_splice_sock_handler(), we try to calculate how much we can move
from the pipe to the writing socket: if we just read some bytes, we'll
use that amount, but if we haven't, we just try to empty the pipe.

However, if we just read something, that doesn't mean that that's all
the data we have on the pipe, as is obvious from this sequence, where:

  pasta: epoll event on connected spliced TCP socket 54 (events: 0x00000001)
  Flow 0 (TCP connection (spliced)): 98304 from read-side call
  Flow 0 (TCP connection (spliced)): 33615 from write-side call (passed 98304)
  Flow 0 (TCP connection (spliced)): -1 from read-side call
  Flow 0 (TCP connection (spliced)): -1 from write-side call (passed 524288)
  Flow 0 (TCP connection (spliced)): event at tcp_splice_sock_handler:580
  Flow 0 (TCP connection (spliced)): OUT_WAIT_0

we first pile up 98304 - 33615 = 64689 pending bytes, that we read but
couldn't write, as the receiver buffer is full, and we set the
corresponding OUT_WAIT flag. Then:

  pasta: epoll event on connected spliced TCP socket 54 (events: 0x00000001)
  Flow 0 (TCP connection (spliced)): 32768 from read-side call
  Flow 0 (TCP connection (spliced)): -1 from write-side call (passed 32768)
  Flow 0 (TCP connection (spliced)): event at tcp_splice_sock_handler:580

we splice() 32768 more bytes from our receiving side to the pipe. At
some point:

  pasta: epoll event on connected spliced TCP socket 49 (events: 0x00000004)
  Flow 0 (TCP connection (spliced)): event at tcp_splice_sock_handler:489
  Flow 0 (TCP connection (spliced)): ~OUT_WAIT_0
  Flow 0 (TCP connection (spliced)): 1320 from read-side call
  Flow 0 (TCP connection (spliced)): 1320 from write-side call (passed 1320)

the receiver is signalling to us that it's ready for more data
(EPOLLOUT). We reset the OUT_WAIT flag, read 1320 more bytes from
our receiving socket into the pipe, and that's what we write to the
receiver, forgetting about the pending 97457 bytes we had, which the
receiver might never get (not quite the same 97457 bytes: we'll actually
send 1320 of those).

This condition is rather hard to reproduce, and it was observed with
Podman pulling container images via HTTPS. In the traces above, the
client is side 0 (the initiating peer), and the server is sending the
whole data.

Instead of splicing from pipe to socket the amount of data we just
read, we need to splice all the pending data we piled up until that
point. We could do that using 'read' and 'written' counters, but
there's actually no need, as the kernel also keeps track of how much
data is available on the pipe.

So, to make this simple and more robust, just give the whole pipe size
as length to splice(). The kernel knows what to do with it.
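
As a minimal sketch of that call (descriptor and size names here are
illustrative, not the actual variables), splice() moves at most the
amount of data actually buffered in the pipe, so passing the full pipe
size as the length is always safe:

  ssize_t written = splice(pipe_rd, NULL, sock_wr, NULL, pipe_size,
                           SPLICE_F_MOVE | SPLICE_F_NONBLOCK);
  /* 'written' is what actually moved: possibly less than pipe_size,
   * or -1 with EAGAIN if nothing can be moved right now.
   */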

Later in the function, we used 'to_write' for an optimisation meant to
reduce wakeups: it retries splice() right away, in both directions, if
we couldn't write the whole amount of pending data to the receiver.
Calculate a 'pending' value instead, only if we reach that point.

Now that we check for the actual amount of pending data in that
optimisation, we need to make sure we don't compare a zero or negative
'written' value: if we met that, it means that the receiver signalled
end-of-file, an error, or to try again later. In those three cases,
the optimisation doesn't make any sense, so skip it.

Reported-by: Ed Santiago <santiago@redhat.com>
Reported-by: Paul Holzinger <pholzing@redhat.com>
Analysed-by: Paul Holzinger <pholzing@redhat.com>
Link: https://github.com/containers/podman/issues/24219
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
---
 tcp_splice.c | 16 ++++++----------
 1 file changed, 6 insertions(+), 10 deletions(-)

diff --git a/tcp_splice.c b/tcp_splice.c
index 9f5cc27..f112cfe 100644
--- a/tcp_splice.c
+++ b/tcp_splice.c
@@ -503,7 +503,7 @@ swap:
 	lowat_act_flag = RCVLOWAT_ACT(fromsidei);
 
 	while (1) {
-		ssize_t readlen, to_write = 0, written;
+		ssize_t readlen, written, pending;
 		int more = 0;
 
 retry:
@@ -518,14 +518,11 @@ retry:
 
 			if (errno != EAGAIN)
 				goto close;
-
-			to_write = c->tcp.pipe_size;
 		} else if (!readlen) {
 			eof = 1;
-			to_write = c->tcp.pipe_size;
 		} else {
 			never_read = 0;
-			to_write += readlen;
+
 			if (readlen >= (long)c->tcp.pipe_size * 90 / 100)
 				more = SPLICE_F_MORE;
 
@@ -535,10 +532,10 @@ retry:
 
 eintr:
 		written = splice(conn->pipe[fromsidei][0], NULL,
-				 conn->s[!fromsidei], NULL, to_write,
+				 conn->s[!fromsidei], NULL, c->tcp.pipe_size,
 				 SPLICE_F_MOVE | more | SPLICE_F_NONBLOCK);
 		flow_trace(conn, "%zi from write-side call (passed %zi)",
-			   written, to_write);
+			   written, c->tcp.pipe_size);
 
 		/* Most common case: skip updating counters. */
 		if (readlen > 0 && readlen == written) {
@@ -584,10 +581,9 @@ eintr:
 		if (never_read && written == (long)(c->tcp.pipe_size))
 			goto retry;
 
-		if (!never_read && written < to_write) {
-			to_write -= written;
+		pending = conn->read[fromsidei] - conn->written[fromsidei];
+		if (!never_read && written > 0 && written < pending)
 			goto retry;
-		}
 
 		if (eof)
 			break;

From 9e4615b40bfa7f1b692c3c3360d88a22c453b016 Mon Sep 17 00:00:00 2001
From: Stefano Brivio <sbrivio@redhat.com>
Date: Thu, 24 Oct 2024 22:16:39 +0200
Subject: [PATCH 060/382] tcp_splice: fcntl(2) returns the size of the pipe, if
 F_SETPIPE_SZ succeeds

Don't report bogus failures (with --trace) just because the return
value is not zero.
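
For reference, a small sketch of the semantics (file descriptor and
size names are made up): on success, fcntl(F_SETPIPE_SZ) returns the
pipe size actually set, which the kernel may round up, so only a
negative return indicates failure:

  int ret = fcntl(pipe_fd, F_SETPIPE_SZ, want_size);
  if (ret < 0)
          perror("F_SETPIPE_SZ");                    /* actual failure */
  else if ((size_t)ret != want_size)
          fprintf(stderr, "pipe size is %d\n", ret); /* not a failure */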

Link: https://github.com/containers/podman/issues/24219
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
---
 tcp_splice.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tcp_splice.c b/tcp_splice.c
index f112cfe..93f8bce 100644
--- a/tcp_splice.c
+++ b/tcp_splice.c
@@ -320,7 +320,7 @@ static int tcp_splice_connect_finish(const struct ctx *c,
 			}
 
 			if (fcntl(conn->pipe[sidei][0], F_SETPIPE_SZ,
-				  c->tcp.pipe_size)) {
+				  c->tcp.pipe_size) != (int)c->tcp.pipe_size) {
 				flow_trace(conn,
 					   "cannot set %d->%d pipe size to %zu",
 					   sidei, !sidei, c->tcp.pipe_size);
@@ -672,7 +672,7 @@ static void tcp_splice_pipe_refill(const struct ctx *c)
 			continue;
 
 		if (fcntl(splice_pipe_pool[i][0], F_SETPIPE_SZ,
-			  c->tcp.pipe_size)) {
+			  c->tcp.pipe_size) != (int)c->tcp.pipe_size) {
 			trace("TCP (spliced): cannot set pool pipe size to %zu",
 			      c->tcp.pipe_size);
 		}

From 13f0291ede19fc6baea02e8327acec144bdf79e6 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Thu, 24 Oct 2024 15:59:20 +1100
Subject: [PATCH 061/382] tcp: Remove compile-time dependency on struct
 tcp_info version

In the Makefile we probe to create several defines based on the presence
of particular fields in struct tcp_info.  These defines are used for two
purposes, neither of which they accomplish well:

1) Determining if the tcp_info fields are available at runtime.  For this
   purpose the defines are Just Plain Wrong, since the runtime kernel may
   not be the same as the compile-time kernel.  We corrected this for
   tcpi_snd_wnd, but not for tcpi_bytes_acked or tcpi_min_rtt.

2) Allowing the source to compile against older kernel headers which don't
   have the fields in question.  This works in theory, but it does mean
   we won't be able to use the fields, even if later run against a
   newer kernel.  Furthermore, it's quite fragile: without much more
   thorough tests of builds in different environments than we're currently
   set up for, it's very easy to miss cases where we're accessing a field
   without protection from an #ifdef.  For example we currently access
   tcpi_snd_wnd without #ifdefs in tcp_update_seqack_wnd().

Improve this with a different approach, borrowed from qemu (which has many
instances of similar problems).  Don't compile against linux/tcp.h, using
netinet/tcp.h instead.  Then for when we need an extension field, define
a struct tcp_info_linux, copied from the kernel, with all the fields we're
interested in.  That may need updating from future kernel versions, but
only when we want to use a new extension, so it shouldn't be frequent.
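
For illustration, a sketch of the runtime check this enables (assuming
a connected TCP socket 's', plus the usual <sys/socket.h>,
<netinet/in.h>, <netinet/tcp.h> and <stddef.h> headers): the length
returned by getsockopt() tells us whether the running kernel filled in
a given extension field:

  struct tcp_info_linux tinfo;
  socklen_t sl = sizeof(tinfo);

  if (!getsockopt(s, IPPROTO_TCP, TCP_INFO, &tinfo, &sl) &&
      sl >= offsetof(struct tcp_info_linux, tcpi_snd_wnd) +
            sizeof(tinfo.tcpi_snd_wnd)) {
          /* the runtime kernel provides tcpi_snd_wnd */
  }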

This allows us to remove the HAS_SND_WND define entirely.  We keep
HAS_BYTES_ACKED and HAS_MIN_RTT for now, since they're used for purpose (1);
we'll fix that in a later patch.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
[sbrivio: Trivial grammar fixes in comments]
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 Makefile       |   5 ---
 tcp.c          |  30 ++++---------
 tcp_info.h     | 120 +++++++++++++++++++++++++++++++++++++++++++++++++
 tcp_internal.h |   4 +-
 4 files changed, 132 insertions(+), 27 deletions(-)
 create mode 100644 tcp_info.h

diff --git a/Makefile b/Makefile
index 74a9513..6faa501 100644
--- a/Makefile
+++ b/Makefile
@@ -67,11 +67,6 @@ PASST_HEADERS = arch.h arp.h checksum.h conf.h dhcp.h dhcpv6.h flow.h fwd.h \
 	udp.h udp_flow.h util.h
 HEADERS = $(PASST_HEADERS) seccomp.h
 
-C := \#include <linux/tcp.h>\nstruct tcp_info x = { .tcpi_snd_wnd = 0 };
-ifeq ($(shell printf "$(C)" | $(CC) -S -xc - -o - >/dev/null 2>&1; echo $$?),0)
-	FLAGS += -DHAS_SND_WND
-endif
-
 C := \#include <linux/tcp.h>\nstruct tcp_info x = { .tcpi_bytes_acked = 0 };
 ifeq ($(shell printf "$(C)" | $(CC) -S -xc - -o - >/dev/null 2>&1; echo $$?),0)
 	FLAGS += -DHAS_BYTES_ACKED
diff --git a/tcp.c b/tcp.c
index 0d22e07..2a0b272 100644
--- a/tcp.c
+++ b/tcp.c
@@ -274,6 +274,7 @@
 #include <net/if.h>
 #include <netinet/in.h>
 #include <netinet/ip.h>
+#include <netinet/tcp.h>
 #include <stdint.h>
 #include <stdbool.h>
 #include <stddef.h>
@@ -286,8 +287,6 @@
 #include <time.h>
 #include <arpa/inet.h>
 
-#include <linux/tcp.h> /* For struct tcp_info */
-
 #include "checksum.h"
 #include "util.h"
 #include "iov.h"
@@ -303,6 +302,7 @@
 
 #include "flow_table.h"
 #include "tcp_internal.h"
+#include "tcp_info.h"
 #include "tcp_buf.h"
 
 /* MSS rounding: see SET_MSS() */
@@ -318,11 +318,6 @@
 #define LOW_RTT_TABLE_SIZE		8
 #define LOW_RTT_THRESHOLD		10 /* us */
 
-/* We need to include <linux/tcp.h> for tcpi_bytes_acked, instead of
- * <netinet/tcp.h>, but that doesn't include a definition for SOL_TCP
- */
-#define SOL_TCP				IPPROTO_TCP
-
 #define ACK_IF_NEEDED	0		/* See tcp_send_flag() */
 
 #define CONN_IS_CLOSING(conn)						\
@@ -365,14 +360,11 @@ char		tcp_buf_discard		[MAX_WINDOW];
 
 /* Does the kernel support TCP_PEEK_OFF? */
 bool peek_offset_cap;
-#ifdef HAS_SND_WND
+
 /* Does the kernel report sending window in TCP_INFO (kernel commit
  * 8f7baad7f035)
  */
 bool snd_wnd_cap;
-#else
-#define snd_wnd_cap	(false)
-#endif
 
 /* sendmsg() to socket */
 static struct iovec	tcp_iov			[UIO_MAXIOV];
@@ -678,7 +670,7 @@ static int tcp_rtt_dst_low(const struct tcp_tap_conn *conn)
  * @tinfo:	Pointer to struct tcp_info for socket
  */
 static void tcp_rtt_dst_check(const struct tcp_tap_conn *conn,
-			      const struct tcp_info *tinfo)
+			      const struct tcp_info_linux *tinfo)
 {
 #ifdef HAS_MIN_RTT
 	const struct flowside *tapside = TAPFLOW(conn);
@@ -1114,13 +1106,13 @@ size_t tcp_l2_buf_fill_headers(const struct tcp_tap_conn *conn,
  * Return: 1 if sequence or window were updated, 0 otherwise
  */
 int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_tap_conn *conn,
-			  bool force_seq, struct tcp_info *tinfo)
+			  bool force_seq, struct tcp_info_linux *tinfo)
 {
 	uint32_t prev_wnd_to_tap = conn->wnd_to_tap << conn->ws_to_tap;
 	uint32_t prev_ack_to_tap = conn->seq_ack_to_tap;
 	/* cppcheck-suppress [ctunullpointer, unmatchedSuppression] */
 	socklen_t sl = sizeof(*tinfo);
-	struct tcp_info tinfo_new;
+	struct tcp_info_linux tinfo_new;
 	uint32_t new_wnd_to_tap = prev_wnd_to_tap;
 	int s = conn->sock;
 
@@ -1235,7 +1227,7 @@ int tcp_prepare_flags(const struct ctx *c, struct tcp_tap_conn *conn,
 		      int flags, struct tcphdr *th, struct tcp_syn_opts *opts,
 		      size_t *optlen)
 {
-	struct tcp_info tinfo = { 0 };
+	struct tcp_info_linux tinfo = { 0 };
 	socklen_t sl = sizeof(tinfo);
 	int s = conn->sock;
 
@@ -2578,7 +2570,6 @@ static bool tcp_probe_peek_offset_cap(sa_family_t af)
 	return ret;
 }
 
-#ifdef HAS_SND_WND
 /**
  * tcp_probe_snd_wnd_cap() - Check if TCP_INFO reports tcpi_snd_wnd
  *
@@ -2586,7 +2577,7 @@ static bool tcp_probe_peek_offset_cap(sa_family_t af)
  */
 static bool tcp_probe_snd_wnd_cap(void)
 {
-	struct tcp_info tinfo;
+	struct tcp_info_linux tinfo;
 	socklen_t sl = sizeof(tinfo);
 	int s;
 
@@ -2604,13 +2595,12 @@ static bool tcp_probe_snd_wnd_cap(void)
 
 	close(s);
 
-	if (sl < (offsetof(struct tcp_info, tcpi_snd_wnd) +
+	if (sl < (offsetof(struct tcp_info_linux, tcpi_snd_wnd) +
 		  sizeof(tinfo.tcpi_snd_wnd)))
 		return false;
 
 	return true;
 }
-#endif /* HAS_SND_WND */
 
 /**
  * tcp_init() - Get initial sequence, hash secret, initialise per-socket data
@@ -2645,9 +2635,7 @@ int tcp_init(struct ctx *c)
 			  (!c->ifi6 || tcp_probe_peek_offset_cap(AF_INET6));
 	debug("SO_PEEK_OFF%ssupported", peek_offset_cap ? " " : " not ");
 
-#ifdef HAS_SND_WND
 	snd_wnd_cap = tcp_probe_snd_wnd_cap();
-#endif
 	debug("TCP_INFO tcpi_snd_wnd field%ssupported",
 	      snd_wnd_cap ? " " : " not ");
 
diff --git a/tcp_info.h b/tcp_info.h
new file mode 100644
index 0000000..06ccb16
--- /dev/null
+++ b/tcp_info.h
@@ -0,0 +1,120 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later
+ * Copyright Red Hat
+ *
+ * Largely derived from include/linux/tcp.h in the Linux kernel
+ */
+
+#ifndef TCP_INFO_H
+#define TCP_INFO_H
+
+/* struct tcp_info_linux - Information from Linux TCP_INFO getsockopt()
+ *
+ * Some fields returned by TCP_INFO have been there for ages and are shared with
+ * BSD.  struct tcp_info from netinet/tcp.h has only those fields.  There are
+ * also many Linux-specific extensions to the structure, which are only found
+ * in the linux/tcp.h version of struct tcp_info.
+ *
+ * We want to use some of those extension fields, when available.  We can test
+ * for availability in the runtime kernel using the length returned from
+ * getsockopt(). However, we won't necessarily be compiled against the same
+ * kernel headers as we'll run with, so compiling directly against linux/tcp.h
+ * means wrapping every field access in an #ifdef whose #else does the same
+ * thing as when the field is missing at runtime.  This rapidly gets messy.
+ *
+ * Instead we define here struct tcp_info_linux which includes all the Linux
+ * extensions that we want to use.  This is taken from v6.11 of the kernel.
+ */
+struct tcp_info_linux {
+	uint8_t		tcpi_state;
+	uint8_t		tcpi_ca_state;
+	uint8_t		tcpi_retransmits;
+	uint8_t		tcpi_probes;
+	uint8_t		tcpi_backoff;
+	uint8_t		tcpi_options;
+	uint8_t		tcpi_snd_wscale : 4, tcpi_rcv_wscale : 4;
+	uint8_t		tcpi_delivery_rate_app_limited:1, tcpi_fastopen_client_fail:2;
+
+	uint32_t	tcpi_rto;
+	uint32_t	tcpi_ato;
+	uint32_t	tcpi_snd_mss;
+	uint32_t	tcpi_rcv_mss;
+
+	uint32_t	tcpi_unacked;
+	uint32_t	tcpi_sacked;
+	uint32_t	tcpi_lost;
+	uint32_t	tcpi_retrans;
+	uint32_t	tcpi_fackets;
+
+	/* Times. */
+	uint32_t	tcpi_last_data_sent;
+	uint32_t	tcpi_last_ack_sent;
+	uint32_t	tcpi_last_data_recv;
+	uint32_t	tcpi_last_ack_recv;
+
+	/* Metrics. */
+	uint32_t	tcpi_pmtu;
+	uint32_t	tcpi_rcv_ssthresh;
+	uint32_t	tcpi_rtt;
+	uint32_t	tcpi_rttvar;
+	uint32_t	tcpi_snd_ssthresh;
+	uint32_t	tcpi_snd_cwnd;
+	uint32_t	tcpi_advmss;
+	uint32_t	tcpi_reordering;
+
+	uint32_t	tcpi_rcv_rtt;
+	uint32_t	tcpi_rcv_space;
+
+	uint32_t	tcpi_total_retrans;
+
+	/* Linux extensions */
+	uint64_t	tcpi_pacing_rate;
+	uint64_t	tcpi_max_pacing_rate;
+	uint64_t	tcpi_bytes_acked;    /* RFC4898 tcpEStatsAppHCThruOctetsAcked */
+	uint64_t	tcpi_bytes_received; /* RFC4898 tcpEStatsAppHCThruOctetsReceived */
+	uint32_t	tcpi_segs_out;	     /* RFC4898 tcpEStatsPerfSegsOut */
+	uint32_t	tcpi_segs_in;	     /* RFC4898 tcpEStatsPerfSegsIn */
+
+	uint32_t	tcpi_notsent_bytes;
+	uint32_t	tcpi_min_rtt;
+	uint32_t	tcpi_data_segs_in;	/* RFC4898 tcpEStatsDataSegsIn */
+	uint32_t	tcpi_data_segs_out;	/* RFC4898 tcpEStatsDataSegsOut */
+
+	uint64_t	tcpi_delivery_rate;
+
+	uint64_t	tcpi_busy_time;      /* Time (usec) busy sending data */
+	uint64_t	tcpi_rwnd_limited;   /* Time (usec) limited by receive window */
+	uint64_t	tcpi_sndbuf_limited; /* Time (usec) limited by send buffer */
+
+	uint32_t	tcpi_delivered;
+	uint32_t	tcpi_delivered_ce;
+
+	uint64_t	tcpi_bytes_sent;     /* RFC4898 tcpEStatsPerfHCDataOctetsOut */
+	uint64_t	tcpi_bytes_retrans;  /* RFC4898 tcpEStatsPerfOctetsRetrans */
+	uint32_t	tcpi_dsack_dups;     /* RFC4898 tcpEStatsStackDSACKDups */
+	uint32_t	tcpi_reord_seen;     /* reordering events seen */
+
+	uint32_t	tcpi_rcv_ooopack;    /* Out-of-order packets received */
+
+	uint32_t	tcpi_snd_wnd;	     /* peer's advertised receive window after
+					      * scaling (bytes)
+					      */
+	uint32_t	tcpi_rcv_wnd;	     /* local advertised receive window after
+					      * scaling (bytes)
+					      */
+
+	uint32_t 	tcpi_rehash;         /* PLB or timeout triggered rehash attempts */
+
+	uint16_t	tcpi_total_rto;	/* Total number of RTO timeouts, including
+					 * SYN/SYN-ACK and recurring timeouts.
+					 */
+	uint16_t	tcpi_total_rto_recoveries;	/* Total number of RTO
+							 * recoveries, including any
+							 * unfinished recovery.
+							 */
+	uint32_t	tcpi_total_rto_time;	/* Total time spent in RTO recoveries
+						 * in milliseconds, including any
+						 * unfinished recovery.
+						 */
+};
+
+#endif /* TCP_INFO_H */
diff --git a/tcp_internal.h b/tcp_internal.h
index 1ab8ce2..a5a47df 100644
--- a/tcp_internal.h
+++ b/tcp_internal.h
@@ -175,12 +175,14 @@ void tcp_rst_do(const struct ctx *c, struct tcp_tap_conn *conn);
 		tcp_rst_do(c, conn);					\
 	} while (0)
 
+struct tcp_info_linux;
+
 size_t tcp_l2_buf_fill_headers(const struct tcp_tap_conn *conn,
 			       struct iovec *iov, size_t dlen,
 			       const uint16_t *check, uint32_t seq,
 			       bool no_tcp_csum);
 int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_tap_conn *conn,
-			  bool force_seq, struct tcp_info *tinfo);
+			  bool force_seq, struct tcp_info_linux *tinfo);
 int tcp_prepare_flags(const struct ctx *c, struct tcp_tap_conn *conn,
 		      int flags, struct tcphdr *th, struct tcp_syn_opts *opts,
 		      size_t *optlen);

From 81143813a6b3ec297c31d234bbdc6000ed8c7052 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Thu, 24 Oct 2024 15:59:21 +1100
Subject: [PATCH 062/382] tcp: Generalise probing for tcpi_snd_wnd field

In order to use the tcpi_snd_wnd field from the TCP_INFO getsockopt() we
need the field to be supported in the runtime kernel (snd_wnd_cap).

In fact we should check that for every tcp_info field we want to use,
beyond the very old ones shared with BSD.  Prepare to do that, by
generalising the probing from setting a single bool to instead record the
size of the returned TCP_INFO structure.  We can then use that recorded
value to check for the presence of any field we need.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 tcp.c | 34 +++++++++++++++++++---------------
 1 file changed, 19 insertions(+), 15 deletions(-)

diff --git a/tcp.c b/tcp.c
index 2a0b272..998e56d 100644
--- a/tcp.c
+++ b/tcp.c
@@ -361,10 +361,15 @@ char		tcp_buf_discard		[MAX_WINDOW];
 /* Does the kernel support TCP_PEEK_OFF? */
 bool peek_offset_cap;
 
-/* Does the kernel report sending window in TCP_INFO (kernel commit
- * 8f7baad7f035)
- */
-bool snd_wnd_cap;
+/* Size of data returned by TCP_INFO getsockopt() */
+socklen_t tcp_info_size;
+
+#define tcp_info_cap(f_)						\
+	((offsetof(struct tcp_info_linux, tcpi_##f_) +			\
+	  sizeof(((struct tcp_info_linux *)NULL)->tcpi_##f_)) <= tcp_info_size)
+
+/* Kernel reports sending window in TCP_INFO (kernel commit 8f7baad7f035) */
+#define snd_wnd_cap	tcp_info_cap(snd_wnd)
 
 /* sendmsg() to socket */
 static struct iovec	tcp_iov			[UIO_MAXIOV];
@@ -2571,11 +2576,11 @@ static bool tcp_probe_peek_offset_cap(sa_family_t af)
 }
 
 /**
- * tcp_probe_snd_wnd_cap() - Check if TCP_INFO reports tcpi_snd_wnd
+ * tcp_probe_tcp_info() - Check what data TCP_INFO reports
  *
- * Return: true if supported, false otherwise
+ * Return: Number of bytes returned by TCP_INFO getsockopt()
  */
-static bool tcp_probe_snd_wnd_cap(void)
+static socklen_t tcp_probe_tcp_info(void)
 {
 	struct tcp_info_linux tinfo;
 	socklen_t sl = sizeof(tinfo);
@@ -2595,11 +2600,7 @@ static bool tcp_probe_snd_wnd_cap(void)
 
 	close(s);
 
-	if (sl < (offsetof(struct tcp_info_linux, tcpi_snd_wnd) +
-		  sizeof(tinfo.tcpi_snd_wnd)))
-		return false;
-
-	return true;
+	return sl;
 }
 
 /**
@@ -2635,9 +2636,12 @@ int tcp_init(struct ctx *c)
 			  (!c->ifi6 || tcp_probe_peek_offset_cap(AF_INET6));
 	debug("SO_PEEK_OFF%ssupported", peek_offset_cap ? " " : " not ");
 
-	snd_wnd_cap = tcp_probe_snd_wnd_cap();
-	debug("TCP_INFO tcpi_snd_wnd field%ssupported",
-	      snd_wnd_cap ? " " : " not ");
+	tcp_info_size = tcp_probe_tcp_info();
+
+#define dbg_tcpi(f_)	debug("TCP_INFO tcpi_%s field%s supported",	\
+			      STRINGIFY(f_), tcp_info_cap(f_) ? " " : " not ")
+	dbg_tcpi(snd_wnd);
+#undef dbg_tcpi
 
 	return 0;
 }

From e7fcd0c3481f15395ea4060eadfac0b6a8f69b29 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Thu, 24 Oct 2024 15:59:22 +1100
Subject: [PATCH 063/382] tcp: Use runtime tests for TCP_INFO fields

In order to use particular fields from the TCP_INFO getsockopt() we
need them to be in the structure returned by the runtime kernel.  We attempt
to determine that with the HAS_BYTES_ACKED and HAS_MIN_RTT defines, probed
in the Makefile.

However, that's not correct, because the kernel headers we compile against
may not be the same as the runtime kernel.  We should instead check against
the size of the structure returned by the TCP_INFO getsockopt(), as we already
do for tcpi_snd_wnd.  Switch from the compile-time flags to a runtime
test.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 Makefile | 10 ----------
 tcp.c    | 52 ++++++++++++++++++++++++++--------------------------
 2 files changed, 26 insertions(+), 36 deletions(-)

diff --git a/Makefile b/Makefile
index 6faa501..4c2d020 100644
--- a/Makefile
+++ b/Makefile
@@ -67,16 +67,6 @@ PASST_HEADERS = arch.h arp.h checksum.h conf.h dhcp.h dhcpv6.h flow.h fwd.h \
 	udp.h udp_flow.h util.h
 HEADERS = $(PASST_HEADERS) seccomp.h
 
-C := \#include <linux/tcp.h>\nstruct tcp_info x = { .tcpi_bytes_acked = 0 };
-ifeq ($(shell printf "$(C)" | $(CC) -S -xc - -o - >/dev/null 2>&1; echo $$?),0)
-	FLAGS += -DHAS_BYTES_ACKED
-endif
-
-C := \#include <linux/tcp.h>\nstruct tcp_info x = { .tcpi_min_rtt = 0 };
-ifeq ($(shell printf "$(C)" | $(CC) -S -xc - -o - >/dev/null 2>&1; echo $$?),0)
-	FLAGS += -DHAS_MIN_RTT
-endif
-
 C := \#include <sys/random.h>\nint main(){int a=getrandom(0, 0, 0);}
 ifeq ($(shell printf "$(C)" | $(CC) -S -xc - -o - >/dev/null 2>&1; echo $$?),0)
 	FLAGS += -DHAS_GETRANDOM
diff --git a/tcp.c b/tcp.c
index 998e56d..0569dc6 100644
--- a/tcp.c
+++ b/tcp.c
@@ -370,6 +370,10 @@ socklen_t tcp_info_size;
 
 /* Kernel reports sending window in TCP_INFO (kernel commit 8f7baad7f035) */
 #define snd_wnd_cap	tcp_info_cap(snd_wnd)
+/* Kernel reports bytes acked in TCP_INFO (kernel commit 0df48c26d84) */
+#define bytes_acked_cap	tcp_info_cap(bytes_acked)
+/* Kernel reports minimum RTT in TCP_INFO (kernel commit cd9b266095f4) */
+#define min_rtt_cap	tcp_info_cap(min_rtt)
 
 /* sendmsg() to socket */
 static struct iovec	tcp_iov			[UIO_MAXIOV];
@@ -677,11 +681,10 @@ static int tcp_rtt_dst_low(const struct tcp_tap_conn *conn)
 static void tcp_rtt_dst_check(const struct tcp_tap_conn *conn,
 			      const struct tcp_info_linux *tinfo)
 {
-#ifdef HAS_MIN_RTT
 	const struct flowside *tapside = TAPFLOW(conn);
 	int i, hole = -1;
 
-	if (!tinfo->tcpi_min_rtt ||
+	if (!min_rtt_cap ||
 	    (int)tinfo->tcpi_min_rtt > LOW_RTT_THRESHOLD)
 		return;
 
@@ -702,10 +705,6 @@ static void tcp_rtt_dst_check(const struct tcp_tap_conn *conn,
 	if (hole == LOW_RTT_TABLE_SIZE)
 		hole = 0;
 	inany_from_af(low_rtt_dst + hole, AF_INET6, &in6addr_any);
-#else
-	(void)conn;
-	(void)tinfo;
-#endif /* HAS_MIN_RTT */
 }
 
 /**
@@ -1121,30 +1120,29 @@ int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_tap_conn *conn,
 	uint32_t new_wnd_to_tap = prev_wnd_to_tap;
 	int s = conn->sock;
 
-#ifndef HAS_BYTES_ACKED
-	(void)force_seq;
-
-	conn->seq_ack_to_tap = conn->seq_from_tap;
-	if (SEQ_LT(conn->seq_ack_to_tap, prev_ack_to_tap))
-		conn->seq_ack_to_tap = prev_ack_to_tap;
-#else
-	if ((unsigned)SNDBUF_GET(conn) < SNDBUF_SMALL || tcp_rtt_dst_low(conn)
-	    || CONN_IS_CLOSING(conn) || (conn->flags & LOCAL) || force_seq) {
+	if (!bytes_acked_cap) {
 		conn->seq_ack_to_tap = conn->seq_from_tap;
-	} else if (conn->seq_ack_to_tap != conn->seq_from_tap) {
-		if (!tinfo) {
-			tinfo = &tinfo_new;
-			if (getsockopt(s, SOL_TCP, TCP_INFO, tinfo, &sl))
-				return 0;
-		}
-
-		conn->seq_ack_to_tap = tinfo->tcpi_bytes_acked +
-				       conn->seq_init_from_tap;
-
 		if (SEQ_LT(conn->seq_ack_to_tap, prev_ack_to_tap))
 			conn->seq_ack_to_tap = prev_ack_to_tap;
+	} else {
+		if ((unsigned)SNDBUF_GET(conn) < SNDBUF_SMALL ||
+		    tcp_rtt_dst_low(conn) || CONN_IS_CLOSING(conn) ||
+		    (conn->flags & LOCAL) || force_seq) {
+			conn->seq_ack_to_tap = conn->seq_from_tap;
+		} else if (conn->seq_ack_to_tap != conn->seq_from_tap) {
+			if (!tinfo) {
+				tinfo = &tinfo_new;
+				if (getsockopt(s, SOL_TCP, TCP_INFO, tinfo, &sl))
+					return 0;
+			}
+
+			conn->seq_ack_to_tap = tinfo->tcpi_bytes_acked +
+				conn->seq_init_from_tap;
+
+			if (SEQ_LT(conn->seq_ack_to_tap, prev_ack_to_tap))
+				conn->seq_ack_to_tap = prev_ack_to_tap;
+		}
 	}
-#endif /* !HAS_BYTES_ACKED */
 
 	if (!snd_wnd_cap) {
 		tcp_get_sndbuf(conn);
@@ -2641,6 +2639,8 @@ int tcp_init(struct ctx *c)
 #define dbg_tcpi(f_)	debug("TCP_INFO tcpi_%s field%s supported",	\
 			      STRINGIFY(f_), tcp_info_cap(f_) ? " " : " not ")
 	dbg_tcpi(snd_wnd);
+	dbg_tcpi(bytes_acked);
+	dbg_tcpi(min_rtt);
 #undef dbg_tcpi
 
 	return 0;

From f43f7d5e89b51b44a03de5a1eb566e14604bb08d Mon Sep 17 00:00:00 2001
From: Laurent Vivier <lvivier@redhat.com>
Date: Thu, 24 Oct 2024 10:50:58 +0200
Subject: [PATCH 064/382] tcp: cleanup tcp_buf_data_from_sock()

Remove the err label, as there is only one place jumping to it, and
move the code to that position. ret is not needed here anymore as it
is always 0.
Remove sendlen, as we can use len directly.

Signed-off-by: Laurent Vivier <lvivier@redhat.com>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 tcp_buf.c | 34 ++++++++++++++++------------------
 1 file changed, 16 insertions(+), 18 deletions(-)

diff --git a/tcp_buf.c b/tcp_buf.c
index 44df0e4..cb6742c 100644
--- a/tcp_buf.c
+++ b/tcp_buf.c
@@ -382,8 +382,8 @@ int tcp_buf_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn)
 {
 	uint32_t wnd_scaled = conn->wnd_from_tap << conn->ws_from_tap;
 	int fill_bufs, send_bufs = 0, last_len, iov_rem = 0;
-	int sendlen, len, dlen, v4 = CONN_V4(conn);
-	int s = conn->sock, i, ret = 0;
+	int len, dlen, v4 = CONN_V4(conn);
+	int s = conn->sock, i;
 	struct msghdr mh_sock = { 0 };
 	uint16_t mss = MSS_GET(conn);
 	uint32_t already_sent, seq;
@@ -453,12 +453,19 @@ int tcp_buf_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn)
 		len = recvmsg(s, &mh_sock, MSG_PEEK);
 	while (len < 0 && errno == EINTR);
 
-	if (len < 0)
-		goto err;
+	if (len < 0) {
+		if (errno != EAGAIN && errno != EWOULDBLOCK) {
+			tcp_rst(c, conn);
+			return -errno;
+		}
+
+		return 0;
+	}
 
 	if (!len) {
 		if ((conn->events & (SOCK_FIN_RCVD | TAP_FIN_SENT)) == SOCK_FIN_RCVD) {
-			if ((ret = tcp_buf_send_flag(c, conn, FIN | ACK))) {
+			int ret = tcp_buf_send_flag(c, conn, FIN | ACK);
+			if (ret) {
 				tcp_rst(c, conn);
 				return ret;
 			}
@@ -469,19 +476,18 @@ int tcp_buf_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn)
 		return 0;
 	}
 
-	sendlen = len;
 	if (!peek_offset_cap)
-		sendlen -= already_sent;
+		len -= already_sent;
 
-	if (sendlen <= 0) {
+	if (len <= 0) {
 		conn_flag(c, conn, STALLED);
 		return 0;
 	}
 
 	conn_flag(c, conn, ~STALLED);
 
-	send_bufs = DIV_ROUND_UP(sendlen, mss);
-	last_len = sendlen - (send_bufs - 1) * mss;
+	send_bufs = DIV_ROUND_UP(len, mss);
+	last_len = len - (send_bufs - 1) * mss;
 
 	/* Likely, some new data was acked too. */
 	tcp_update_seqack_wnd(c, conn, false, NULL);
@@ -502,12 +508,4 @@ int tcp_buf_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn)
 	conn_flag(c, conn, ACK_FROM_TAP_DUE);
 
 	return 0;
-
-err:
-	if (errno != EAGAIN && errno != EWOULDBLOCK) {
-		ret = -errno;
-		tcp_rst(c, conn);
-	}
-
-	return ret;
 }

From 5563d5f668450441e4f3cedc9d83283739b5e0ca Mon Sep 17 00:00:00 2001
From: Laurent Vivier <lvivier@redhat.com>
Date: Fri, 25 Oct 2024 09:49:10 +0200
Subject: [PATCH 065/382] test: remove obsolete images

Remove debian-9-nocloud-amd64-daily-20200210-166.qcow2 and
openSUSE-Tumbleweed-JeOS.x86_64-kvm-and-xen.qcow2 as they cannot be
downloaded anymore.

Signed-off-by: Laurent Vivier <lvivier@redhat.com>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 test/Makefile | 10 +---------
 1 file changed, 1 insertion(+), 9 deletions(-)

diff --git a/test/Makefile b/test/Makefile
index 35a3b55..5e49047 100644
--- a/test/Makefile
+++ b/test/Makefile
@@ -8,7 +8,6 @@
 WGET = wget -c
 
 DEBIAN_IMGS = debian-8.11.0-openstack-amd64.qcow2 \
-	debian-9-nocloud-amd64-daily-20200210-166.qcow2 \
 	debian-10-nocloud-amd64.qcow2 \
 	debian-10-generic-arm64.qcow2 \
 	debian-10-generic-ppc64el-20220911-1135.qcow2 \
@@ -42,8 +41,7 @@ OPENSUSE_IMGS = openSUSE-Leap-15.1-JeOS.x86_64-kvm-and-xen.qcow2 \
 	openSUSE-Leap-15.2-JeOS.x86_64-kvm-and-xen.qcow2 \
 	openSUSE-Leap-15.3-JeOS.x86_64-kvm-and-xen.qcow2 \
 	openSUSE-Tumbleweed-ARM-JeOS-efi.aarch64.raw.xz \
-	openSUSE-Tumbleweed-ARM-JeOS-efi.armv7l.raw.xz \
-	openSUSE-Tumbleweed-JeOS.x86_64-kvm-and-xen.qcow2
+	openSUSE-Tumbleweed-ARM-JeOS-efi.armv7l.raw.xz
 
 UBUNTU_OLD_IMGS = trusty-server-cloudimg-amd64-disk1.img \
 	trusty-server-cloudimg-i386-disk1.img \
@@ -135,9 +133,6 @@ realclean: clean
 debian-8.11.0-openstack-%.qcow2:
 	$(WGET) -O $@ https://cloud.debian.org/images/cloud/OpenStack/archive/8.11.0/debian-8.11.0-openstack-$*.qcow2
 
-debian-9-nocloud-%-daily-20200210-166.qcow2:
-	$(WGET) -O $@ https://cloud.debian.org/images/cloud/stretch/daily/20200210-166/debian-9-nocloud-$*-daily-20200210-166.qcow2
-
 debian-10-nocloud-%.qcow2:
 	$(WGET) -O $@ https://cloud.debian.org/images/cloud/buster/latest/debian-10-nocloud-$*.qcow2
 
@@ -203,9 +198,6 @@ openSUSE-Tumbleweed-ARM-JeOS-efi.aarch64.raw.xz:
 openSUSE-Tumbleweed-ARM-JeOS-efi.armv7l.raw.xz:
 	$(WGET) -O $@ http://download.opensuse.org/ports/armv7hl/tumbleweed/appliances/openSUSE-Tumbleweed-ARM-JeOS-efi.armv7l.raw.xz
 
-openSUSE-Tumbleweed-JeOS.x86_64-kvm-and-xen.qcow2:
-	$(WGET) -O $@ https://download.opensuse.org/tumbleweed/appliances/openSUSE-Tumbleweed-JeOS.x86_64-kvm-and-xen.qcow2
-
 # Ubuntu downloads
 trusty-server-cloudimg-%-disk1.img:
 	$(WGET) -O $@ https://cloud-images.ubuntu.com/trusty/current/trusty-server-cloudimg-$*-disk1.img

From 2053c36dec4ce3e5bfddb52f5f2957165a692f1d Mon Sep 17 00:00:00 2001
From: Jon Maloy <jmaloy@redhat.com>
Date: Mon, 28 Oct 2024 22:13:59 -0400
Subject: [PATCH 066/382] tcp: set ip and eth headers in l2 tap queues on the
 fly

l2 tap queue entries are currently initialized at system start, and
reused with preset headers throughout their whole lifetime. The only
fields we need to update per message are things like payload size
and checksums.

If we want to reuse these entries between IPv4 and IPv6 messages, we
will need to set the pointer to the right header on the fly per
message, since the header type may differ between entries in the same
queue.

The same needs to be done for the Ethernet header.

We make these changes here.
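
A minimal sketch of what this enables (illustrative, simplified names,
not the actual code from tcp_buf.c): each queued frame gets its
Ethernet and IP header pointers assigned when the frame is prepared,
based on the connection's IP version.

	#include <stdbool.h>
	#include <sys/uio.h>
	#include <linux/if_ether.h>
	#include <linux/ip.h>
	#include <linux/ipv6.h>

	enum { IOV_ETH, IOV_IP, IOV_PAYLOAD, IOV_NUM };

	static struct ethhdr  eth4_src, eth6_src;	/* preset source MACs */
	static struct iphdr   hdr4[128];		/* per-frame IPv4 headers */
	static struct ipv6hdr hdr6[128];		/* per-frame IPv6 headers */

	/* Point one frame's iovec entries at the right preset headers */
	static void frame_set_headers(struct iovec *iov, unsigned i, bool v4)
	{
		iov[IOV_ETH].iov_base = v4 ? (void *)&eth4_src : (void *)&eth6_src;
		iov[IOV_ETH].iov_len  = sizeof(struct ethhdr);

		if (v4) {
			iov[IOV_IP].iov_base = &hdr4[i];
			iov[IOV_IP].iov_len  = sizeof(hdr4[i]);
		} else {
			iov[IOV_IP].iov_base = &hdr6[i];
			iov[IOV_IP].iov_len  = sizeof(hdr6[i]);
		}
	}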

Signed-off-by: Jon Maloy <jmaloy@redhat.com>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 tcp_buf.c | 50 ++++++++++++++++++++++++++++----------------------
 1 file changed, 28 insertions(+), 22 deletions(-)

diff --git a/tcp_buf.c b/tcp_buf.c
index cb6742c..e249c6b 100644
--- a/tcp_buf.c
+++ b/tcp_buf.c
@@ -130,8 +130,7 @@ void tcp_sock4_iov_init(const struct ctx *c)
 		iov = tcp4_l2_iov[i];
 
 		iov[TCP_IOV_TAP] = tap_hdr_iov(c, &tcp4_payload_tap_hdr[i]);
-		iov[TCP_IOV_ETH] = IOV_OF_LVALUE(tcp4_eth_src);
-		iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp4_payload_ip[i]);
+		iov[TCP_IOV_ETH].iov_len = sizeof(struct ethhdr);
 		iov[TCP_IOV_PAYLOAD].iov_base = &tcp4_payload[i];
 	}
 
@@ -173,8 +172,7 @@ void tcp_sock6_iov_init(const struct ctx *c)
 		iov = tcp6_l2_iov[i];
 
 		iov[TCP_IOV_TAP] = tap_hdr_iov(c, &tcp6_payload_tap_hdr[i]);
-		iov[TCP_IOV_ETH] = IOV_OF_LVALUE(tcp6_eth_src);
-		iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp6_payload_ip[i]);
+		iov[TCP_IOV_ETH].iov_len = sizeof(struct ethhdr);
 		iov[TCP_IOV_PAYLOAD].iov_base = &tcp6_payload[i];
 	}
 
@@ -273,11 +271,17 @@ int tcp_buf_send_flag(const struct ctx *c, struct tcp_tap_conn *conn, int flags)
 	uint32_t seq;
 	int ret;
 
-	if (CONN_V4(conn))
-		iov = tcp4_l2_flags_iov[tcp4_flags_used++];
-	else
-		iov = tcp6_l2_flags_iov[tcp6_flags_used++];
-
+	if (CONN_V4(conn)) {
+		iov = tcp4_l2_flags_iov[tcp4_flags_used];
+		iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp4_flags_ip[tcp4_flags_used]);
+		iov[TCP_IOV_ETH].iov_base = &tcp4_eth_src;
+		tcp4_flags_used++;
+	} else {
+		iov = tcp6_l2_flags_iov[tcp6_flags_used];
+		iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp6_flags_ip[tcp6_flags_used]);
+		iov[TCP_IOV_ETH].iov_base = &tcp6_eth_src;
+		tcp6_flags_used++;
+	}
 	payload = iov[TCP_IOV_PAYLOAD].iov_base;
 
 	seq = conn->seq_to_tap;
@@ -296,21 +300,19 @@ int tcp_buf_send_flag(const struct ctx *c, struct tcp_tap_conn *conn, int flags)
 
 	if (flags & DUP_ACK) {
 		struct iovec *dup_iov;
-		int i;
 
 		if (CONN_V4(conn))
 			dup_iov = tcp4_l2_flags_iov[tcp4_flags_used++];
 		else
 			dup_iov = tcp6_l2_flags_iov[tcp6_flags_used++];
 
-		for (i = 0; i < TCP_NUM_IOVS; i++) {
-			/* All frames share the same ethernet header buffer */
-			if (i != TCP_IOV_ETH) {
-				memcpy(dup_iov[i].iov_base, iov[i].iov_base,
-				       iov[i].iov_len);
-			}
-		}
-		dup_iov[TCP_IOV_PAYLOAD].iov_len = iov[TCP_IOV_PAYLOAD].iov_len;
+		memcpy(dup_iov[TCP_IOV_TAP].iov_base, iov[TCP_IOV_TAP].iov_base,
+		       iov[TCP_IOV_TAP].iov_len);
+		dup_iov[TCP_IOV_ETH].iov_base = iov[TCP_IOV_ETH].iov_base;
+		dup_iov[TCP_IOV_IP] = iov[TCP_IOV_IP];
+		memcpy(dup_iov[TCP_IOV_PAYLOAD].iov_base,
+		       iov[TCP_IOV_PAYLOAD].iov_base, l4len);
+		dup_iov[TCP_IOV_PAYLOAD].iov_len = l4len;
 	}
 
 	if (CONN_V4(conn)) {
@@ -350,8 +352,10 @@ static void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn,
 		}
 
 		tcp4_frame_conns[tcp4_payload_used] = conn;
-
-		iov = tcp4_l2_iov[tcp4_payload_used++];
+		iov = tcp4_l2_iov[tcp4_payload_used];
+		iov[TCP_IOV_IP] =
+			IOV_OF_LVALUE(tcp4_payload_ip[tcp4_payload_used++]);
+		iov[TCP_IOV_ETH].iov_base = &tcp4_eth_src;
 		l4len = tcp_l2_buf_fill_headers(conn, iov, dlen, check, seq,
 						false);
 		iov[TCP_IOV_PAYLOAD].iov_len = l4len;
@@ -359,8 +363,10 @@ static void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn,
 			tcp_payload_flush(c);
 	} else if (CONN_V6(conn)) {
 		tcp6_frame_conns[tcp6_payload_used] = conn;
-
-		iov = tcp6_l2_iov[tcp6_payload_used++];
+		iov = tcp6_l2_iov[tcp6_payload_used];
+		iov[TCP_IOV_IP] =
+			IOV_OF_LVALUE(tcp6_payload_ip[tcp6_payload_used++]);
+		iov[TCP_IOV_ETH].iov_base = &tcp6_eth_src;
 		l4len = tcp_l2_buf_fill_headers(conn, iov, dlen, NULL, seq,
 						false);
 		iov[TCP_IOV_PAYLOAD].iov_len = l4len;

From ba38e67cf405c5fd4c0fc043af453fa23a55fb35 Mon Sep 17 00:00:00 2001
From: Jon Maloy <jmaloy@redhat.com>
Date: Mon, 28 Oct 2024 22:14:00 -0400
Subject: [PATCH 067/382] tcp: unify l2 TCPv4 and TCPv6 queues and structures

Following the preparations in the previous commit, we can now remove
the payload and flag queues dedicated to TCPv6 and TCPv4, and move all
traffic into common queues handling both protocol types.

Apart from reducing code and memory footprint, this change reduces
a potential risk of TCPv4 traffic starving out TCPv6 traffic.
Since we always flush out the TCPv4 frame queue before the TCPv6 queue,
the latter will never be handled if the former fails to send all its
frames.

Tests with iperf3 show no measurable difference in performance after
this change.

Signed-off-by: Jon Maloy <jmaloy@redhat.com>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 tcp.c     |   6 +-
 tcp_buf.c | 248 ++++++++++++++++--------------------------------------
 tcp_buf.h |   3 +-
 3 files changed, 76 insertions(+), 181 deletions(-)

diff --git a/tcp.c b/tcp.c
index 0569dc6..10ad06a 100644
--- a/tcp.c
+++ b/tcp.c
@@ -2611,11 +2611,7 @@ int tcp_init(struct ctx *c)
 {
 	ASSERT(!c->no_tcp);
 
-	if (c->ifi4)
-		tcp_sock4_iov_init(c);
-
-	if (c->ifi6)
-		tcp_sock6_iov_init(c);
+	tcp_sock_iov_init(c);
 
 	memset(init_sock_pool4,		0xff,	sizeof(init_sock_pool4));
 	memset(init_sock_pool6,		0xff,	sizeof(init_sock_pool6));
diff --git a/tcp_buf.c b/tcp_buf.c
index e249c6b..274e313 100644
--- a/tcp_buf.c
+++ b/tcp_buf.c
@@ -38,59 +38,44 @@
 	(c->mode == MODE_PASTA ? 1 : TCP_FRAMES_MEM)
 
 /* Static buffers */
-/* Ethernet header for IPv4 frames */
+
+/* Ethernet header for IPv4 and IPv6 frames */
 static struct ethhdr		tcp4_eth_src;
+static struct ethhdr		tcp6_eth_src;
 
-static struct tap_hdr		tcp4_payload_tap_hdr[TCP_FRAMES_MEM];
-/* IPv4 headers */
-static struct iphdr		tcp4_payload_ip[TCP_FRAMES_MEM];
-/* TCP segments with payload for IPv4 frames */
-static struct tcp_payload_t	tcp4_payload[TCP_FRAMES_MEM];
+static struct tap_hdr		tcp_payload_tap_hdr[TCP_FRAMES_MEM];
 
-static_assert(MSS4 <= sizeof(tcp4_payload[0].data), "MSS4 is greater than 65516");
+/* IP headers for IPv4 and IPv6 */
+struct iphdr		tcp4_payload_ip[TCP_FRAMES_MEM];
+struct ipv6hdr		tcp6_payload_ip[TCP_FRAMES_MEM];
+
+/* TCP segments with payload for IPv4 and IPv6 frames */
+static struct tcp_payload_t	tcp_payload[TCP_FRAMES_MEM];
+
+static_assert(MSS4 <= sizeof(tcp_payload[0].data), "MSS4 is greater than 65516");
+static_assert(MSS6 <= sizeof(tcp_payload[0].data), "MSS6 is greater than 65516");
 
 /* References tracking the owner connection of frames in the tap outqueue */
-static struct tcp_tap_conn *tcp4_frame_conns[TCP_FRAMES_MEM];
-static unsigned int tcp4_payload_used;
+static struct tcp_tap_conn *tcp_frame_conns[TCP_FRAMES_MEM];
+static unsigned int tcp_payload_used;
 
-static struct tap_hdr		tcp4_flags_tap_hdr[TCP_FRAMES_MEM];
+static struct tap_hdr		tcp_flags_tap_hdr[TCP_FRAMES_MEM];
 /* IPv4 headers for TCP segment without payload */
 static struct iphdr		tcp4_flags_ip[TCP_FRAMES_MEM];
 /* TCP segments without payload for IPv4 frames */
-static struct tcp_flags_t	tcp4_flags[TCP_FRAMES_MEM];
+static struct tcp_flags_t	tcp_flags[TCP_FRAMES_MEM];
 
-static unsigned int tcp4_flags_used;
+static unsigned int tcp_flags_used;
 
-/* Ethernet header for IPv6 frames */
-static struct ethhdr		tcp6_eth_src;
-
-static struct tap_hdr		tcp6_payload_tap_hdr[TCP_FRAMES_MEM];
-/* IPv6 headers */
-static struct ipv6hdr		tcp6_payload_ip[TCP_FRAMES_MEM];
-/* TCP headers and data for IPv6 frames */
-static struct tcp_payload_t	tcp6_payload[TCP_FRAMES_MEM];
-
-static_assert(MSS6 <= sizeof(tcp6_payload[0].data), "MSS6 is greater than 65516");
-
-/* References tracking the owner connection of frames in the tap outqueue */
-static struct tcp_tap_conn *tcp6_frame_conns[TCP_FRAMES_MEM];
-static unsigned int tcp6_payload_used;
-
-static struct tap_hdr		tcp6_flags_tap_hdr[TCP_FRAMES_MEM];
 /* IPv6 headers for TCP segment without payload */
 static struct ipv6hdr		tcp6_flags_ip[TCP_FRAMES_MEM];
-/* TCP segment without payload for IPv6 frames */
-static struct tcp_flags_t	tcp6_flags[TCP_FRAMES_MEM];
-
-static unsigned int tcp6_flags_used;
 
 /* recvmsg()/sendmsg() data for tap */
 static struct iovec	iov_sock		[TCP_FRAMES_MEM + 1];
 
-static struct iovec	tcp4_l2_iov		[TCP_FRAMES_MEM][TCP_NUM_IOVS];
-static struct iovec	tcp6_l2_iov		[TCP_FRAMES_MEM][TCP_NUM_IOVS];
-static struct iovec	tcp4_l2_flags_iov	[TCP_FRAMES_MEM][TCP_NUM_IOVS];
-static struct iovec	tcp6_l2_flags_iov	[TCP_FRAMES_MEM][TCP_NUM_IOVS];
+static struct iovec	tcp_l2_iov[TCP_FRAMES_MEM][TCP_NUM_IOVS];
+static struct iovec	tcp_l2_flags_iov[TCP_FRAMES_MEM][TCP_NUM_IOVS];
+
 /**
  * tcp_update_l2_buf() - Update Ethernet header buffers with addresses
  * @eth_d:	Ethernet destination address, NULL if unchanged
@@ -103,86 +88,46 @@ void tcp_update_l2_buf(const unsigned char *eth_d, const unsigned char *eth_s)
 }
 
 /**
- * tcp_sock4_iov_init() - Initialise scatter-gather L2 buffers for IPv4 sockets
+ * tcp_sock_iov_init() - Initialise scatter-gather L2 buffers for IPv4 sockets
  * @c:		Execution context
  */
-void tcp_sock4_iov_init(const struct ctx *c)
-{
-	struct iphdr iph = L2_BUF_IP4_INIT(IPPROTO_TCP);
-	struct iovec *iov;
-	int i;
-
-	tcp4_eth_src.h_proto = htons_constant(ETH_P_IP);
-
-	for (i = 0; i < ARRAY_SIZE(tcp4_payload); i++) {
-		tcp4_payload_ip[i] = iph;
-		tcp4_payload[i].th.doff = sizeof(struct tcphdr) / 4;
-		tcp4_payload[i].th.ack = 1;
-	}
-
-	for (i = 0; i < ARRAY_SIZE(tcp4_flags); i++) {
-		tcp4_flags_ip[i] = iph;
-		tcp4_flags[i].th.doff = sizeof(struct tcphdr) / 4;
-		tcp4_flags[i].th.ack = 1;
-	}
-
-	for (i = 0; i < TCP_FRAMES_MEM; i++) {
-		iov = tcp4_l2_iov[i];
-
-		iov[TCP_IOV_TAP] = tap_hdr_iov(c, &tcp4_payload_tap_hdr[i]);
-		iov[TCP_IOV_ETH].iov_len = sizeof(struct ethhdr);
-		iov[TCP_IOV_PAYLOAD].iov_base = &tcp4_payload[i];
-	}
-
-	for (i = 0; i < TCP_FRAMES_MEM; i++) {
-		iov = tcp4_l2_flags_iov[i];
-
-		iov[TCP_IOV_TAP] = tap_hdr_iov(c, &tcp4_flags_tap_hdr[i]);
-		iov[TCP_IOV_ETH] = IOV_OF_LVALUE(tcp4_eth_src);
-		iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp4_flags_ip[i]);
-		iov[TCP_IOV_PAYLOAD].iov_base = &tcp4_flags[i];
-	}
-}
-
-/**
- * tcp_sock6_iov_init() - Initialise scatter-gather L2 buffers for IPv6 sockets
- * @c:		Execution context
- */
-void tcp_sock6_iov_init(const struct ctx *c)
+void tcp_sock_iov_init(const struct ctx *c)
 {
 	struct ipv6hdr ip6 = L2_BUF_IP6_INIT(IPPROTO_TCP);
-	struct iovec *iov;
+	struct iphdr iph = L2_BUF_IP4_INIT(IPPROTO_TCP);
 	int i;
 
 	tcp6_eth_src.h_proto = htons_constant(ETH_P_IPV6);
+	tcp4_eth_src.h_proto = htons_constant(ETH_P_IP);
 
-	for (i = 0; i < ARRAY_SIZE(tcp6_payload); i++) {
+	for (i = 0; i < ARRAY_SIZE(tcp_payload); i++) {
 		tcp6_payload_ip[i] = ip6;
-		tcp6_payload[i].th.doff = sizeof(struct tcphdr) / 4;
-		tcp6_payload[i].th.ack = 1;
+		tcp4_payload_ip[i] = iph;
+		tcp_payload[i].th.doff = sizeof(struct tcphdr) / 4;
+		tcp_payload[i].th.ack = 1;
 	}
 
-	for (i = 0; i < ARRAY_SIZE(tcp6_flags); i++) {
+	for (i = 0; i < ARRAY_SIZE(tcp_flags); i++) {
 		tcp6_flags_ip[i] = ip6;
-		tcp6_flags[i].th.doff = sizeof(struct tcphdr) / 4;
-		tcp6_flags[i].th .ack = 1;
+		tcp4_flags_ip[i] = iph;
+		tcp_flags[i].th.doff = sizeof(struct tcphdr) / 4;
+		tcp_flags[i].th.ack = 1;
 	}
 
 	for (i = 0; i < TCP_FRAMES_MEM; i++) {
-		iov = tcp6_l2_iov[i];
+		struct iovec *iov = tcp_l2_iov[i];
 
-		iov[TCP_IOV_TAP] = tap_hdr_iov(c, &tcp6_payload_tap_hdr[i]);
+		iov[TCP_IOV_TAP] = tap_hdr_iov(c, &tcp_payload_tap_hdr[i]);
 		iov[TCP_IOV_ETH].iov_len = sizeof(struct ethhdr);
-		iov[TCP_IOV_PAYLOAD].iov_base = &tcp6_payload[i];
+		iov[TCP_IOV_PAYLOAD].iov_base = &tcp_payload[i];
 	}
 
 	for (i = 0; i < TCP_FRAMES_MEM; i++) {
-		iov = tcp6_l2_flags_iov[i];
+		struct iovec *iov = tcp_l2_flags_iov[i];
 
-		iov[TCP_IOV_TAP] = tap_hdr_iov(c, &tcp6_flags_tap_hdr[i]);
-		iov[TCP_IOV_ETH] = IOV_OF_LVALUE(tcp6_eth_src);
-		iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp6_flags_ip[i]);
-		iov[TCP_IOV_PAYLOAD].iov_base = &tcp6_flags[i];
+		iov[TCP_IOV_TAP] = tap_hdr_iov(c, &tcp_flags_tap_hdr[i]);
+		iov[TCP_IOV_ETH].iov_len = sizeof(struct ethhdr);
+		iov[TCP_IOV_PAYLOAD].iov_base = &tcp_flags[i];
 	}
 }
 
@@ -192,13 +137,9 @@ void tcp_sock6_iov_init(const struct ctx *c)
  */
 void tcp_flags_flush(const struct ctx *c)
 {
-	tap_send_frames(c, &tcp6_l2_flags_iov[0][0], TCP_NUM_IOVS,
-			tcp6_flags_used);
-	tcp6_flags_used = 0;
-
-	tap_send_frames(c, &tcp4_l2_flags_iov[0][0], TCP_NUM_IOVS,
-			tcp4_flags_used);
-	tcp4_flags_used = 0;
+	tap_send_frames(c, &tcp_l2_flags_iov[0][0], TCP_NUM_IOVS,
+			tcp_flags_used);
+	tcp_flags_used = 0;
 }
 
 /**
@@ -237,21 +178,13 @@ void tcp_payload_flush(const struct ctx *c)
 {
 	size_t m;
 
-	m = tap_send_frames(c, &tcp6_l2_iov[0][0], TCP_NUM_IOVS,
-			    tcp6_payload_used);
-	if (m != tcp6_payload_used) {
-		tcp_revert_seq(c, &tcp6_frame_conns[m], &tcp6_l2_iov[m],
-			       tcp6_payload_used - m);
+	m = tap_send_frames(c, &tcp_l2_iov[0][0], TCP_NUM_IOVS,
+			    tcp_payload_used);
+	if (m != tcp_payload_used) {
+		tcp_revert_seq(c, &tcp_frame_conns[m], &tcp_l2_iov[m],
+			       tcp_payload_used - m);
 	}
-	tcp6_payload_used = 0;
-
-	m = tap_send_frames(c, &tcp4_l2_iov[0][0], TCP_NUM_IOVS,
-			    tcp4_payload_used);
-	if (m != tcp4_payload_used) {
-		tcp_revert_seq(c, &tcp4_frame_conns[m], &tcp4_l2_iov[m],
-			       tcp4_payload_used - m);
-	}
-	tcp4_payload_used = 0;
+	tcp_payload_used = 0;
 }
 
 /**
@@ -271,41 +204,30 @@ int tcp_buf_send_flag(const struct ctx *c, struct tcp_tap_conn *conn, int flags)
 	uint32_t seq;
 	int ret;
 
+	iov = tcp_l2_flags_iov[tcp_flags_used];
 	if (CONN_V4(conn)) {
-		iov = tcp4_l2_flags_iov[tcp4_flags_used];
-		iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp4_flags_ip[tcp4_flags_used]);
+		iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp4_flags_ip[tcp_flags_used]);
 		iov[TCP_IOV_ETH].iov_base = &tcp4_eth_src;
-		tcp4_flags_used++;
 	} else {
-		iov = tcp6_l2_flags_iov[tcp6_flags_used];
-		iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp6_flags_ip[tcp6_flags_used]);
+		iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp6_flags_ip[tcp_flags_used]);
 		iov[TCP_IOV_ETH].iov_base = &tcp6_eth_src;
-		tcp6_flags_used++;
 	}
-	payload = iov[TCP_IOV_PAYLOAD].iov_base;
 
+	payload = iov[TCP_IOV_PAYLOAD].iov_base;
 	seq = conn->seq_to_tap;
 	ret = tcp_prepare_flags(c, conn, flags, &payload->th,
 				&payload->opts, &optlen);
-	if (ret <= 0) {
-		if (CONN_V4(conn))
-			tcp4_flags_used--;
-		else
-			tcp6_flags_used--;
+	if (ret <= 0)
 		return ret;
-	}
 
+	tcp_flags_used++;
 	l4len = tcp_l2_buf_fill_headers(conn, iov, optlen, NULL, seq, false);
 	iov[TCP_IOV_PAYLOAD].iov_len = l4len;
 
 	if (flags & DUP_ACK) {
 		struct iovec *dup_iov;
 
-		if (CONN_V4(conn))
-			dup_iov = tcp4_l2_flags_iov[tcp4_flags_used++];
-		else
-			dup_iov = tcp6_l2_flags_iov[tcp6_flags_used++];
-
+		dup_iov = tcp_l2_flags_iov[tcp_flags_used++];
 		memcpy(dup_iov[TCP_IOV_TAP].iov_base, iov[TCP_IOV_TAP].iov_base,
 		       iov[TCP_IOV_TAP].iov_len);
 		dup_iov[TCP_IOV_ETH].iov_base = iov[TCP_IOV_ETH].iov_base;
@@ -315,13 +237,8 @@ int tcp_buf_send_flag(const struct ctx *c, struct tcp_tap_conn *conn, int flags)
 		dup_iov[TCP_IOV_PAYLOAD].iov_len = l4len;
 	}
 
-	if (CONN_V4(conn)) {
-		if (tcp4_flags_used > TCP_FRAMES_MEM - 2)
-			tcp_flags_flush(c);
-	} else {
-		if (tcp6_flags_used > TCP_FRAMES_MEM - 2)
-			tcp_flags_flush(c);
-	}
+	if (tcp_flags_used > TCP_FRAMES_MEM - 2)
+		tcp_flags_flush(c);
 
 	return 0;
 }
@@ -337,42 +254,30 @@ int tcp_buf_send_flag(const struct ctx *c, struct tcp_tap_conn *conn, int flags)
 static void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn,
 			    ssize_t dlen, int no_csum, uint32_t seq)
 {
+	const uint16_t *check = NULL;
 	struct iovec *iov;
 	size_t l4len;
 
 	conn->seq_to_tap = seq + dlen;
-
+	tcp_frame_conns[tcp_payload_used] = conn;
+	iov = tcp_l2_iov[tcp_payload_used];
 	if (CONN_V4(conn)) {
-		struct iovec *iov_prev = tcp4_l2_iov[tcp4_payload_used - 1];
-		const uint16_t *check = NULL;
-
 		if (no_csum) {
+			struct iovec *iov_prev = tcp_l2_iov[tcp_payload_used - 1];
 			struct iphdr *iph = iov_prev[TCP_IOV_IP].iov_base;
+
 			check = &iph->check;
 		}
-
-		tcp4_frame_conns[tcp4_payload_used] = conn;
-		iov = tcp4_l2_iov[tcp4_payload_used];
-		iov[TCP_IOV_IP] =
-			IOV_OF_LVALUE(tcp4_payload_ip[tcp4_payload_used++]);
+		iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp4_payload_ip[tcp_payload_used]);
 		iov[TCP_IOV_ETH].iov_base = &tcp4_eth_src;
-		l4len = tcp_l2_buf_fill_headers(conn, iov, dlen, check, seq,
-						false);
-		iov[TCP_IOV_PAYLOAD].iov_len = l4len;
-		if (tcp4_payload_used > TCP_FRAMES_MEM - 1)
-			tcp_payload_flush(c);
 	} else if (CONN_V6(conn)) {
-		tcp6_frame_conns[tcp6_payload_used] = conn;
-		iov = tcp6_l2_iov[tcp6_payload_used];
-		iov[TCP_IOV_IP] =
-			IOV_OF_LVALUE(tcp6_payload_ip[tcp6_payload_used++]);
+		iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp6_payload_ip[tcp_payload_used]);
 		iov[TCP_IOV_ETH].iov_base = &tcp6_eth_src;
-		l4len = tcp_l2_buf_fill_headers(conn, iov, dlen, NULL, seq,
-						false);
-		iov[TCP_IOV_PAYLOAD].iov_len = l4len;
-		if (tcp6_payload_used > TCP_FRAMES_MEM - 1)
-			tcp_payload_flush(c);
 	}
+	l4len = tcp_l2_buf_fill_headers(conn, iov, dlen, check, seq, false);
+	iov[TCP_IOV_PAYLOAD].iov_len = l4len;
+	if (++tcp_payload_used > TCP_FRAMES_MEM - 1)
+		tcp_payload_flush(c);
 }
 
 /**
@@ -388,8 +293,7 @@ int tcp_buf_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn)
 {
 	uint32_t wnd_scaled = conn->wnd_from_tap << conn->ws_from_tap;
 	int fill_bufs, send_bufs = 0, last_len, iov_rem = 0;
-	int len, dlen, v4 = CONN_V4(conn);
-	int s = conn->sock, i;
+	int len, dlen, i, s = conn->sock;
 	struct msghdr mh_sock = { 0 };
 	uint16_t mss = MSS_GET(conn);
 	uint32_t already_sent, seq;
@@ -436,19 +340,15 @@ int tcp_buf_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn)
 		mh_sock.msg_iovlen = fill_bufs;
 	}
 
-	if (( v4 && tcp4_payload_used + fill_bufs > TCP_FRAMES_MEM) ||
-	    (!v4 && tcp6_payload_used + fill_bufs > TCP_FRAMES_MEM)) {
+	if (tcp_payload_used + fill_bufs > TCP_FRAMES_MEM) {
 		tcp_payload_flush(c);
 
 		/* Silence Coverity CWE-125 false positive */
-		tcp4_payload_used = tcp6_payload_used = 0;
+		tcp_payload_used = 0;
 	}
 
 	for (i = 0, iov = iov_sock + 1; i < fill_bufs; i++, iov++) {
-		if (v4)
-			iov->iov_base = &tcp4_payload[tcp4_payload_used + i].data;
-		else
-			iov->iov_base = &tcp6_payload[tcp6_payload_used + i].data;
+		iov->iov_base = &tcp_payload[tcp_payload_used + i].data;
 		iov->iov_len = mss;
 	}
 	if (iov_rem)
@@ -502,7 +402,7 @@ int tcp_buf_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn)
 	dlen = mss;
 	seq = conn->seq_to_tap;
 	for (i = 0; i < send_bufs; i++) {
-		int no_csum = i && i != send_bufs - 1 && tcp4_payload_used;
+		int no_csum = i && i != send_bufs - 1 && tcp_payload_used;
 
 		if (i == send_bufs - 1)
 			dlen = last_len;
diff --git a/tcp_buf.h b/tcp_buf.h
index 8d4b615..49c04d4 100644
--- a/tcp_buf.h
+++ b/tcp_buf.h
@@ -6,8 +6,7 @@
 #ifndef TCP_BUF_H
 #define TCP_BUF_H
 
-void tcp_sock4_iov_init(const struct ctx *c);
-void tcp_sock6_iov_init(const struct ctx *c);
+void tcp_sock_iov_init(const struct ctx *c);
 void tcp_flags_flush(const struct ctx *c);
 void tcp_payload_flush(const struct ctx *c);
 int tcp_buf_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn);

From 988a4d75f89473cbf76e09852a03f21658859710 Mon Sep 17 00:00:00 2001
From: Stefano Brivio <sbrivio@redhat.com>
Date: Thu, 24 Oct 2024 23:10:09 +0200
Subject: [PATCH 068/382] Makefile: Exclude qrap.c from clang-tidy checks

We'll deprecate qrap(1) soon, and addressing the warnings reported by
clang-tidy as of LLVM version 16 and later would need a bunch of
changes there, mostly around CERT C rule ERR33-C and checking the
return code from snprintf().

It makes no sense to fix warnings in qrap just for the sake of it, so
officially declare the bitrotting season open.

Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
---
 Makefile | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Makefile b/Makefile
index 4c2d020..01f0cc1 100644
--- a/Makefile
+++ b/Makefile
@@ -256,7 +256,7 @@ docs: README.md
 #	weird for cases like standalone constants, and causes other
 #	awkwardness for a bunch of cases we use
 
-clang-tidy: $(SRCS) $(HEADERS)
+clang-tidy: $(filter-out qrap.c,$(SRCS)) $(HEADERS)
 	clang-tidy -checks=*,-modernize-*,\
 	-clang-analyzer-valist.Uninitialized,\
 	-cppcoreguidelines-init-variables,\
@@ -283,7 +283,7 @@ clang-tidy: $(SRCS) $(HEADERS)
 	-misc-include-cleaner,\
 	-cppcoreguidelines-macro-to-enum \
 	-config='{CheckOptions: [{key: bugprone-suspicious-string-compare.WarnOnImplicitComparison, value: "false"}]}' \
-	--warnings-as-errors=* $(SRCS) -- $(filter-out -pie,$(FLAGS) $(CFLAGS) $(CPPFLAGS)) -DCLANG_TIDY_58992
+	--warnings-as-errors=* $(filter-out qrap.c,$(SRCS)) -- $(filter-out -pie,$(FLAGS) $(CFLAGS) $(CPPFLAGS)) -DCLANG_TIDY_58992
 
 SYSTEM_INCLUDES := /usr/include $(wildcard /usr/include/$(TARGET))
 ifeq ($(shell $(CC) -v 2>&1 | grep -c "gcc version"),1)

From 98efe7c2fdd82a2822e1be8e5c5c8caed846ae76 Mon Sep 17 00:00:00 2001
From: Stefano Brivio <sbrivio@redhat.com>
Date: Thu, 24 Oct 2024 23:25:33 +0200
Subject: [PATCH 069/382] treewide: Comply with CERT C rule ERR33-C for
 snprintf()

clang-tidy, starting from LLVM version 16, up to at least LLVM version
19, now checks that we detect and handle errors for snprintf() as
requested by CERT C rule ERR33-C. These warnings were logged with LLVM
version 19.1.2 (at least Debian and Fedora match):

/home/sbrivio/passt/arch.c:43:3: error: the value returned by this function should not be disregarded; neglecting it may lead to errors [cert-err33-c,-warnings-as-errors]
   43 |                 snprintf(new_path, PATH_MAX + sizeof(".avx2"), "%s.avx2", exe);
      |                 ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
/home/sbrivio/passt/arch.c:43:3: note: cast the expression to void to silence this warning
/home/sbrivio/passt/conf.c:577:4: error: the value returned by this function should not be disregarded; neglecting it may lead to errors [cert-err33-c,-warnings-as-errors]
  577 |                         snprintf(netns, PATH_MAX, "/proc/%ld/ns/net", pidval);
      |                         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
/home/sbrivio/passt/conf.c:577:4: note: cast the expression to void to silence this warning
/home/sbrivio/passt/conf.c:579:5: error: the value returned by this function should not be disregarded; neglecting it may lead to errors [cert-err33-c,-warnings-as-errors]
  579 |                                 snprintf(userns, PATH_MAX, "/proc/%ld/ns/user",
      |                                 ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  580 |                                          pidval);
      |                                          ~~~~~~~
/home/sbrivio/passt/conf.c:579:5: note: cast the expression to void to silence this warning
/home/sbrivio/passt/pasta.c:105:2: error: the value returned by this function should not be disregarded; neglecting it may lead to errors [cert-err33-c,-warnings-as-errors]
  105 |         snprintf(ns, PATH_MAX, "/proc/%i/ns/net", pasta_child_pid);
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
/home/sbrivio/passt/pasta.c:105:2: note: cast the expression to void to silence this warning
/home/sbrivio/passt/pasta.c:242:2: error: the value returned by this function should not be disregarded; neglecting it may lead to errors [cert-err33-c,-warnings-as-errors]
  242 |         snprintf(uidmap, BUFSIZ, "0 %u 1", uid);
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
/home/sbrivio/passt/pasta.c:242:2: note: cast the expression to void to silence this warning
/home/sbrivio/passt/pasta.c:243:2: error: the value returned by this function should not be disregarded; neglecting it may lead to errors [cert-err33-c,-warnings-as-errors]
  243 |         snprintf(gidmap, BUFSIZ, "0 %u 1", gid);
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
/home/sbrivio/passt/pasta.c:243:2: note: cast the expression to void to silence this warning
/home/sbrivio/passt/tap.c:1155:4: error: the value returned by this function should not be disregarded; neglecting it may lead to errors [cert-err33-c,-warnings-as-errors]
 1155 |                         snprintf(path, UNIX_PATH_MAX - 1, UNIX_SOCK_PATH, i);
      |                         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
/home/sbrivio/passt/tap.c:1155:4: note: cast the expression to void to silence this warning

Don't silence the warnings, as they might actually have some merit. Add
an snprintf_check() function instead, checking that we're not
truncating messages while printing to buffers, and terminate if the
check fails.

Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
---
 arch.c  |  6 +++++-
 conf.c  | 13 +++++++++----
 pasta.c | 11 ++++++++---
 tap.c   |  5 +++--
 util.c  | 30 ++++++++++++++++++++++++++++++
 util.h  |  2 ++
 6 files changed, 57 insertions(+), 10 deletions(-)

diff --git a/arch.c b/arch.c
index 04bebfc..d1dfb73 100644
--- a/arch.c
+++ b/arch.c
@@ -19,6 +19,7 @@
 #include <unistd.h>
 
 #include "log.h"
+#include "util.h"
 
 /**
  * arch_avx2_exec() - Switch to AVX2 build if supported
@@ -40,7 +41,10 @@ void arch_avx2_exec(char **argv)
 	if (__builtin_cpu_supports("avx2")) {
 		char new_path[PATH_MAX + sizeof(".avx2")];
 
-		snprintf(new_path, PATH_MAX + sizeof(".avx2"), "%s.avx2", exe);
+		if (snprintf_check(new_path, PATH_MAX + sizeof(".avx2"),
+				   "%s.avx2", exe))
+			die_perror("Can't build AVX2 executable path");
+
 		execve(new_path, argv, environ);
 		warn_perror("Can't run AVX2 build, using non-AVX2 version");
 	}
diff --git a/conf.c b/conf.c
index b3b5342..fa5cec3 100644
--- a/conf.c
+++ b/conf.c
@@ -574,10 +574,15 @@ static void conf_pasta_ns(int *netns_only, char *userns, char *netns,
 			if (pidval < 0 || pidval > INT_MAX)
 				die("Invalid PID %s", argv[optind]);
 
-			snprintf(netns, PATH_MAX, "/proc/%ld/ns/net", pidval);
-			if (!*userns)
-				snprintf(userns, PATH_MAX, "/proc/%ld/ns/user",
-					 pidval);
+			if (snprintf_check(netns, PATH_MAX,
+					   "/proc/%ld/ns/net", pidval))
+				die_perror("Can't build netns path");
+
+			if (!*userns) {
+				if (snprintf_check(userns, PATH_MAX,
+						   "/proc/%ld/ns/user", pidval))
+					die_perror("Can't build userns path");
+			}
 		}
 	}
 
diff --git a/pasta.c b/pasta.c
index 307fb4a..a117704 100644
--- a/pasta.c
+++ b/pasta.c
@@ -102,7 +102,9 @@ static int pasta_wait_for_ns(void *arg)
 	int flags = O_RDONLY | O_CLOEXEC;
 	char ns[PATH_MAX];
 
-	snprintf(ns, PATH_MAX, "/proc/%i/ns/net", pasta_child_pid);
+	if (snprintf_check(ns, PATH_MAX, "/proc/%i/ns/net", pasta_child_pid))
+		die_perror("Can't build netns path");
+
 	do {
 		while ((c->pasta_netns_fd = open(ns, flags)) < 0) {
 			if (errno != ENOENT)
@@ -239,8 +241,11 @@ void pasta_start_ns(struct ctx *c, uid_t uid, gid_t gid,
 		c->quiet = 1;
 
 	/* Configure user and group mappings */
-	snprintf(uidmap, BUFSIZ, "0 %u 1", uid);
-	snprintf(gidmap, BUFSIZ, "0 %u 1", gid);
+	if (snprintf_check(uidmap, BUFSIZ, "0 %u 1", uid))
+		die_perror("Can't build uidmap");
+
+	if (snprintf_check(gidmap, BUFSIZ, "0 %u 1", gid))
+		die_perror("Can't build gidmap");
 
 	if (write_file("/proc/self/uid_map", uidmap) ||
 	    write_file("/proc/self/setgroups", "deny") ||
diff --git a/tap.c b/tap.c
index c53a39b..cfb82e9 100644
--- a/tap.c
+++ b/tap.c
@@ -1151,8 +1151,9 @@ int tap_sock_unix_open(char *sock_path)
 
 		if (*sock_path)
 			memcpy(path, sock_path, UNIX_PATH_MAX);
-		else
-			snprintf(path, UNIX_PATH_MAX - 1, UNIX_SOCK_PATH, i);
+		else if (snprintf_check(path, UNIX_PATH_MAX - 1,
+					UNIX_SOCK_PATH, i))
+			die_perror("Can't build UNIX domain socket path");
 
 		ex = socket(AF_UNIX, SOCK_STREAM | SOCK_NONBLOCK, 0);
 		if (ex < 0)
diff --git a/util.c b/util.c
index eba7d52..21ce0a8 100644
--- a/util.c
+++ b/util.c
@@ -749,3 +749,33 @@ void close_open_files(int argc, char **argv)
 	if (rc)
 		die_perror("Failed to close files leaked by parent");
 }
+
+/**
+ * snprintf_check() - snprintf() wrapper, checking for truncation and errors
+ * @str:	Output buffer
+ * @size:	Maximum size to write to @str
+ * @format:	Message
+ *
+ * Return: false on success, true on truncation or error, sets errno on failure
+ */
+bool snprintf_check(char *str, size_t size, const char *format, ...)
+{
+	va_list ap;
+	int rc;
+
+	va_start(ap, format);
+	rc = vsnprintf(str, size, format, ap);
+	va_end(ap);
+
+	if (rc < 0) {
+		errno = EIO;
+		return true;
+	}
+
+	if ((size_t)rc >= size) {
+		errno = ENOBUFS;
+		return true;
+	}
+
+	return false;
+}
diff --git a/util.h b/util.h
index 2c1e08e..96f178c 100644
--- a/util.h
+++ b/util.h
@@ -11,6 +11,7 @@
 #include <stdbool.h>
 #include <stddef.h>
 #include <stdint.h>
+#include <stdio.h>
 #include <string.h>
 #include <signal.h>
 #include <arpa/inet.h>
@@ -200,6 +201,7 @@ int write_file(const char *path, const char *buf);
 int write_all_buf(int fd, const void *buf, size_t len);
 int write_remainder(int fd, const struct iovec *iov, size_t iovcnt, size_t skip);
 void close_open_files(int argc, char **argv);
+bool snprintf_check(char *str, size_t size, const char *format, ...);
 
 /**
  * af_name() - Return name of an address family

From 744247856da10412a64ce0720f0e7359981748e1 Mon Sep 17 00:00:00 2001
From: Stefano Brivio <sbrivio@redhat.com>
Date: Thu, 24 Oct 2024 23:44:43 +0200
Subject: [PATCH 070/382] treewide: Silence cert-err33-c clang-tidy warnings
 for fprintf()

We use fprintf() to print to standard output or standard error
streams. If something gets truncated or there's an output error, we
don't really want to try and report that, and at the same time it's
not abnormal behaviour upon which we should terminate, either.

Just silence the warning with an ugly FPRINTF() variadic macro casting
the fprintf() expressions to void.

Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
---
 conf.c | 46 +++++++++++++++++++++++-----------------------
 log.c  |  6 +++---
 util.h |  3 +++
 3 files changed, 29 insertions(+), 26 deletions(-)

diff --git a/conf.c b/conf.c
index fa5cec3..4db7c64 100644
--- a/conf.c
+++ b/conf.c
@@ -733,19 +733,19 @@ static unsigned int conf_ip6(unsigned int ifi, struct ip6_ctx *ip6)
 static void usage(const char *name, FILE *f, int status)
 {
 	if (strstr(name, "pasta")) {
-		fprintf(f, "Usage: %s [OPTION]... [COMMAND] [ARGS]...\n", name);
-		fprintf(f, "       %s [OPTION]... PID\n", name);
-		fprintf(f, "       %s [OPTION]... --netns [PATH|NAME]\n", name);
-		fprintf(f,
+		FPRINTF(f, "Usage: %s [OPTION]... [COMMAND] [ARGS]...\n", name);
+		FPRINTF(f, "       %s [OPTION]... PID\n", name);
+		FPRINTF(f, "       %s [OPTION]... --netns [PATH|NAME]\n", name);
+		FPRINTF(f,
 			"\n"
 			"Without PID or --netns, run the given command or a\n"
 			"default shell in a new network and user namespace, and\n"
 			"connect it via pasta.\n");
 	} else {
-		fprintf(f, "Usage: %s [OPTION]...\n", name);
+		FPRINTF(f, "Usage: %s [OPTION]...\n", name);
 	}
 
-	fprintf(f,
+	FPRINTF(f,
 		"\n"
 		"  -d, --debug		Be verbose\n"
 		"      --trace		Be extra verbose, implies --debug\n"
@@ -762,17 +762,17 @@ static void usage(const char *name, FILE *f, int status)
 		"  --version		Show version and exit\n");
 
 	if (strstr(name, "pasta")) {
-		fprintf(f,
+		FPRINTF(f,
 			"  -I, --ns-ifname NAME	namespace interface name\n"
 			"    default: same interface name as external one\n");
 	} else {
-		fprintf(f,
+		FPRINTF(f,
 			"  -s, --socket PATH	UNIX domain socket path\n"
 			"    default: probe free path starting from "
 			UNIX_SOCK_PATH "\n", 1);
 	}
 
-	fprintf(f,
+	FPRINTF(f,
 		"  -F, --fd FD		Use FD as pre-opened connected socket\n"
 		"  -p, --pcap FILE	Log tap-facing traffic to pcap file\n"
 		"  -P, --pid FILE	Write own PID to the given file\n"
@@ -803,28 +803,28 @@ static void usage(const char *name, FILE *f, int status)
 		"    can be specified multiple times\n"
 		"    a single, empty option disables DNS information\n");
 	if (strstr(name, "pasta"))
-		fprintf(f, "    default: don't use any addresses\n");
+		FPRINTF(f, "    default: don't use any addresses\n");
 	else
-		fprintf(f, "    default: use addresses from /etc/resolv.conf\n");
-	fprintf(f,
+		FPRINTF(f, "    default: use addresses from /etc/resolv.conf\n");
+	FPRINTF(f,
 		"  -S, --search LIST	Space-separated list, search domains\n"
 		"    a single, empty option disables the DNS search list\n");
 	if (strstr(name, "pasta"))
-		fprintf(f, "    default: don't use any search list\n");
+		FPRINTF(f, "    default: don't use any search list\n");
 	else
-		fprintf(f, "    default: use search list from /etc/resolv.conf\n");
+		FPRINTF(f, "    default: use search list from /etc/resolv.conf\n");
 
 	if (strstr(name, "pasta"))
-		fprintf(f, "  --dhcp-dns	\tPass DNS list via DHCP/DHCPv6/NDP\n");
+		FPRINTF(f, "  --dhcp-dns	\tPass DNS list via DHCP/DHCPv6/NDP\n");
 	else
-		fprintf(f, "  --no-dhcp-dns	No DNS list in DHCP/DHCPv6/NDP\n");
+		FPRINTF(f, "  --no-dhcp-dns	No DNS list in DHCP/DHCPv6/NDP\n");
 
 	if (strstr(name, "pasta"))
-		fprintf(f, "  --dhcp-search	Pass list via DHCP/DHCPv6/NDP\n");
+		FPRINTF(f, "  --dhcp-search	Pass list via DHCP/DHCPv6/NDP\n");
 	else
-		fprintf(f, "  --no-dhcp-search	No list in DHCP/DHCPv6/NDP\n");
+		FPRINTF(f, "  --no-dhcp-search	No list in DHCP/DHCPv6/NDP\n");
 
-	fprintf(f,
+	FPRINTF(f,
 		"  --map-host-loopback ADDR	Translate ADDR to refer to host\n"
 	        "    can be specified zero to two times (for IPv4 and IPv6)\n"
 		"    default: gateway address\n"
@@ -852,7 +852,7 @@ static void usage(const char *name, FILE *f, int status)
 	if (strstr(name, "pasta"))
 		goto pasta_opts;
 
-	fprintf(f,
+	FPRINTF(f,
 		"  -1, --one-off	Quit after handling one single client\n"
 		"  -t, --tcp-ports SPEC	TCP port forwarding to guest\n"
 		"    can be specified multiple times\n"
@@ -883,7 +883,7 @@ static void usage(const char *name, FILE *f, int status)
 
 pasta_opts:
 
-	fprintf(f,
+	FPRINTF(f,
 		"  -t, --tcp-ports SPEC	TCP port forwarding to namespace\n"
 		"    can be specified multiple times\n"
 		"    SPEC can be:\n"
@@ -1421,9 +1421,9 @@ void conf(struct ctx *c, int argc, char **argv)
 
 			break;
 		case 14:
-			fprintf(stdout,
+			FPRINTF(stdout,
 				c->mode == MODE_PASTA ? "pasta " : "passt ");
-			fprintf(stdout, VERSION_BLOB);
+			FPRINTF(stdout, VERSION_BLOB);
 			exit(EXIT_SUCCESS);
 		case 15:
 			ret = snprintf(c->ip4.ifname_out,
diff --git a/log.c b/log.c
index a61468e..6932885 100644
--- a/log.c
+++ b/log.c
@@ -274,7 +274,7 @@ void vlogmsg(bool newline, bool cont, int pri, const char *format, va_list ap)
 		char timestr[LOGTIME_STRLEN];
 
 		logtime_fmt(timestr, sizeof(timestr), now);
-		fprintf(stderr, "%s: ", timestr);
+		FPRINTF(stderr, "%s: ", timestr);
 	}
 
 	if ((log_mask & LOG_MASK(LOG_PRI(pri))) || !log_conf_parsed) {
@@ -293,7 +293,7 @@ void vlogmsg(bool newline, bool cont, int pri, const char *format, va_list ap)
 	    (log_stderr && (log_mask & LOG_MASK(LOG_PRI(pri))))) {
 		(void)vfprintf(stderr, format, ap);
 		if (newline && format[strlen(format)] != '\n')
-			fprintf(stderr, "\n");
+			FPRINTF(stderr, "\n");
 	}
 }
 
@@ -399,7 +399,7 @@ void passt_vsyslog(bool newline, int pri, const char *format, va_list ap)
 		n += snprintf(buf + n, BUFSIZ - n, "\n");
 
 	if (log_sock >= 0 && send(log_sock, buf, n, 0) != n && log_stderr)
-		fprintf(stderr, "Failed to send %i bytes to syslog\n", n);
+		FPRINTF(stderr, "Failed to send %i bytes to syslog\n", n);
 }
 
 /**
diff --git a/util.h b/util.h
index 96f178c..4f8b768 100644
--- a/util.h
+++ b/util.h
@@ -269,6 +269,9 @@ static inline bool mod_between(unsigned x, unsigned i, unsigned j, unsigned m)
 	return mod_sub(x, i, m) < mod_sub(j, i, m);
 }
 
+/* FPRINTF() intentionally silences cert-err33-c clang-tidy warnings */
+#define FPRINTF(f, ...)	(void)fprintf(f, __VA_ARGS__)
+
 /*
  * Workarounds for https://github.com/llvm/llvm-project/issues/58992
  *

From 134b4d58b409013d9f231aac1d4ba69f7835da7c Mon Sep 17 00:00:00 2001
From: Stefano Brivio <sbrivio@redhat.com>
Date: Thu, 24 Oct 2024 23:52:19 +0200
Subject: [PATCH 071/382] Makefile: Disable
 readability-math-missing-parentheses clang-tidy check

With clang-tidy and LLVM 19:

/home/sbrivio/passt/conf.c:1218:29: error: '*' has higher precedence than '+'; add parentheses to explicitly specify the order of operations [readability-math-missing-parentheses,-warnings-as-errors]
 1218 |                 const char *octet = str + 3 * i;
      |                                           ^~~~~~
      |                                           (    )
/home/sbrivio/passt/ndp.c:285:18: error: '*' has higher precedence than '+'; add parentheses to explicitly specify the order of operations [readability-math-missing-parentheses,-warnings-as-errors]
  285 |                                         .len            = 1 + 2 * n,
      |                                                               ^~~~~~
      |                                                               (    )
/home/sbrivio/passt/ndp.c:329:23: error: '%' has higher precedence than '-'; add parentheses to explicitly specify the order of operations [readability-math-missing-parentheses,-warnings-as-errors]
  329 |                         memset(ptr, 0, 8 - dns_s_len % 8);      /* padding */
      |                                            ^~~~~~~~~~~~~~
      |                                            (            )
/home/sbrivio/passt/pcap.c:131:20: error: '*' has higher precedence than '+'; add parentheses to explicitly specify the order of operations [readability-math-missing-parentheses,-warnings-as-errors]
  131 |                 pcap_frame(iov + i * frame_parts, frame_parts, offset, &now);
      |                                  ^~~~~~~~~~~~~~~~
      |                                  (              )
/home/sbrivio/passt/util.c:216:10: error: '/' has higher precedence than '+'; add parentheses to explicitly specify the order of operations [readability-math-missing-parentheses,-warnings-as-errors]
  216 |                 return (a->tv_nsec + 1000000000 - b->tv_nsec) / 1000 +
      |                        ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
      |                        (                                            )
/home/sbrivio/passt/util.c:217:10: error: '*' has higher precedence than '+'; add parentheses to explicitly specify the order of operations [readability-math-missing-parentheses,-warnings-as-errors]
  217 |                        (a->tv_sec - b->tv_sec - 1) * 1000000;
      |                        ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
      |                        (                                    )
/home/sbrivio/passt/util.c:220:9: error: '/' has higher precedence than '+'; add parentheses to explicitly specify the order of operations [readability-math-missing-parentheses,-warnings-as-errors]
  220 |         return (a->tv_nsec - b->tv_nsec) / 1000 +
      |                ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
      |                (                               )
/home/sbrivio/passt/util.c:221:9: error: '*' has higher precedence than '+'; add parentheses to explicitly specify the order of operations [readability-math-missing-parentheses,-warnings-as-errors]
  221 |                (a->tv_sec - b->tv_sec) * 1000000;
      |                ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
      |                (                                )
/home/sbrivio/passt/util.c:545:32: error: '/' has higher precedence than '+'; add parentheses to explicitly specify the order of operations [readability-math-missing-parentheses,-warnings-as-errors]
  545 |         return clone(fn, stack_area + stack_size / 2, flags, arg);
      |                                       ^~~~~~~~~~~~~~~
      |                                       (             )

Just... no.

Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
---
 Makefile | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/Makefile b/Makefile
index 01f0cc1..c1c6e30 100644
--- a/Makefile
+++ b/Makefile
@@ -255,6 +255,12 @@ docs: README.md
 #	makes sense when those defines form an enum-like set, but
 #	weird for cases like standalone constants, and causes other
 #	awkwardness for a bunch of cases we use
+#
+# - readability-math-missing-parentheses
+#	It's been a couple of centuries since multiplication has been granted
+#	precedence over addition in modern mathematical notation. Adding
+#	parentheses to reinforce that certainly won't improve readability.
+
 
 clang-tidy: $(filter-out qrap.c,$(SRCS)) $(HEADERS)
 	clang-tidy -checks=*,-modernize-*,\
@@ -281,7 +287,8 @@ clang-tidy: $(filter-out qrap.c,$(SRCS)) $(HEADERS)
 	-concurrency-mt-unsafe,\
 	-readability-identifier-length,\
 	-misc-include-cleaner,\
-	-cppcoreguidelines-macro-to-enum \
+	-cppcoreguidelines-macro-to-enum,\
+	-readability-math-missing-parentheses \
 	-config='{CheckOptions: [{key: bugprone-suspicious-string-compare.WarnOnImplicitComparison, value: "false"}]}' \
 	--warnings-as-errors=* $(filter-out qrap.c,$(SRCS)) -- $(filter-out -pie,$(FLAGS) $(CFLAGS) $(CPPFLAGS)) -DCLANG_TIDY_58992
 

From 59fe34ee36368bb28c8298b1a1bfad5d0d9f47a3 Mon Sep 17 00:00:00 2001
From: Stefano Brivio <sbrivio@redhat.com>
Date: Fri, 25 Oct 2024 00:10:36 +0200
Subject: [PATCH 072/382] treewide: Suppress clang-tidy warning if we already
 use O_CLOEXEC

In pcap_init(), we should always open the packet capture file with
O_CLOEXEC, even if we're not running in the foreground: O_CLOEXEC means
close-on-exec, not close-on-fork.

In logfile_init() and pidfile_open(), the fact that we pass a third
'mode' argument to open() seems to confuse the android-cloexec-open
checker in LLVM versions from 16 to 19 (at least).

The checker suggests adding O_CLOEXEC to 'mode', rather than to
'flags', where we already have it.

Add a suppression for clang-tidy and a comment, and avoid repeating
them three times by adding a new helper, output_file_open().

Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 conf.c |  6 +++++-
 log.c  |  3 +--
 pcap.c |  7 ++-----
 util.c | 27 +++++++++++----------------
 util.h |  2 +-
 5 files changed, 20 insertions(+), 25 deletions(-)

diff --git a/conf.c b/conf.c
index 4db7c64..14411b4 100644
--- a/conf.c
+++ b/conf.c
@@ -1194,7 +1194,11 @@ static void conf_open_files(struct ctx *c)
 	if (c->mode != MODE_PASTA && c->fd_tap == -1)
 		c->fd_tap_listen = tap_sock_unix_open(c->sock_path);
 
-	c->pidfile_fd = pidfile_open(c->pidfile);
+	if (*c->pidfile) {
+		c->pidfile_fd = output_file_open(c->pidfile, O_WRONLY);
+		if (c->pidfile_fd < 0)
+			die_perror("Couldn't open PID file %s", c->pidfile);
+	}
 }
 
 /**
diff --git a/log.c b/log.c
index 6932885..19f1d98 100644
--- a/log.c
+++ b/log.c
@@ -416,8 +416,7 @@ void logfile_init(const char *name, const char *path, size_t size)
 	if (readlink("/proc/self/exe", exe, PATH_MAX - 1) < 0)
 		die_perror("Failed to read own /proc/self/exe link");
 
-	log_file = open(path, O_CREAT | O_TRUNC | O_APPEND | O_RDWR | O_CLOEXEC,
-			S_IRUSR | S_IWUSR);
+	log_file = output_file_open(path, O_APPEND | O_RDWR);
 	if (log_file == -1)
 		die_perror("Couldn't open log file %s", path);
 
diff --git a/pcap.c b/pcap.c
index 6ee6cdf..2e2ff93 100644
--- a/pcap.c
+++ b/pcap.c
@@ -158,18 +158,15 @@ void pcap_iov(const struct iovec *iov, size_t iovcnt, size_t offset)
  */
 void pcap_init(struct ctx *c)
 {
-	int flags = O_WRONLY | O_CREAT | O_TRUNC;
-
 	if (pcap_fd != -1)
 		return;
 
 	if (!*c->pcap)
 		return;
 
-	flags |= c->foreground ? O_CLOEXEC : 0;
-	pcap_fd = open(c->pcap, flags, S_IRUSR | S_IWUSR);
+	pcap_fd = output_file_open(c->pcap, O_WRONLY);
 	if (pcap_fd == -1) {
-		perror("open");
+		err_perror("Couldn't open pcap file %s", c->pcap);
 		return;
 	}
 
diff --git a/util.c b/util.c
index 21ce0a8..1d6d009 100644
--- a/util.c
+++ b/util.c
@@ -407,25 +407,20 @@ void pidfile_write(int fd, pid_t pid)
 }
 
 /**
- * pidfile_open() - Open PID file if needed
- * @path:	Path for PID file, empty string if no PID file is requested
+ * output_file_open() - Open file for output, if needed
+ * @path:	Path for output file
+ * @flags:	Flags for open() other than O_CREAT, O_TRUNC, O_CLOEXEC
  *
- * Return: descriptor for PID file, -1 if path is NULL, won't return on failure
+ * Return: file descriptor on success, -1 on failure with errno set by open()
  */
-int pidfile_open(const char *path)
+int output_file_open(const char *path, int flags)
 {
-	int fd;
-
-	if (!*path)
-		return -1;
-
-	if ((fd = open(path, O_CREAT | O_TRUNC | O_WRONLY | O_CLOEXEC,
-			     S_IRUSR | S_IWUSR)) < 0) {
-		perror("PID file open");
-		exit(EXIT_FAILURE);
-	}
-
-	return fd;
+	/* We use O_CLOEXEC here, but clang-tidy as of LLVM 16 to 19 looks for
+	 * it in the 'mode' argument if we have one
+	 */
+	return open(path, O_CREAT | O_TRUNC | O_CLOEXEC | flags,
+		    /* NOLINTNEXTLINE(android-cloexec-open) */
+		    S_IRUSR | S_IWUSR);
 }
 
 /**
diff --git a/util.h b/util.h
index 4f8b768..3fc64cf 100644
--- a/util.h
+++ b/util.h
@@ -193,7 +193,7 @@ char *line_read(char *buf, size_t len, int fd);
 void ns_enter(const struct ctx *c);
 bool ns_is_init(void);
 int open_in_ns(const struct ctx *c, const char *path, int flags);
-int pidfile_open(const char *path);
+int output_file_open(const char *path, int flags);
 void pidfile_write(int fd, pid_t pid);
 int __daemon(int pidfile_fd, int devnull_fd);
 int fls(unsigned long x);

From 099ace64cedbf43922527dc7f132f0c0e65f308a Mon Sep 17 00:00:00 2001
From: Stefano Brivio <sbrivio@redhat.com>
Date: Fri, 25 Oct 2024 00:29:50 +0200
Subject: [PATCH 073/382] treewide: Address cert-err33-c clang-tidy warnings
 for clock and timer functions

For clock_gettime(), we shouldn't ignore errors if they happen during
the initialisation phase, because something is seriously wrong and it's
not helpful to proceed as if nothing happened.

Once we're up and running, though, it's probably better to report the
error and use a stale value than to terminate altogether. Make sure
we use a zero value if we don't have a stale one somewhere.

For timerfd_gettime() and timerfd_settime() failures, just report an
error: there isn't much else we can do.
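
A minimal sketch of the resulting pattern, mirroring the hunks below:

	struct timespec now = { 0 };	/* zero value if the call fails */

	/* during initialisation: fatal */
	if (clock_gettime(CLOCK_MONOTONIC, &now))
		die_perror("Failed to get CLOCK_MONOTONIC time");

	/* once up and running: report, carry on with a stale or zero value */
	if (clock_gettime(CLOCK_REALTIME, &now))
		err_perror("Failed to get CLOCK_REALTIME time");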

Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 passt.c |  9 ++++++---
 pcap.c  | 17 +++++++++++------
 tcp.c   | 12 +++++++++---
 3 files changed, 26 insertions(+), 12 deletions(-)

diff --git a/passt.c b/passt.c
index ad6f0bc..eaf231d 100644
--- a/passt.c
+++ b/passt.c
@@ -207,7 +207,8 @@ int main(int argc, char **argv)
 	struct timespec now;
 	struct sigaction sa;
 
-	clock_gettime(CLOCK_MONOTONIC, &log_start);
+	if (clock_gettime(CLOCK_MONOTONIC, &log_start))
+		die_perror("Failed to get CLOCK_MONOTONIC time");
 
 	arch_avx2_exec(argv);
 
@@ -265,7 +266,8 @@ int main(int argc, char **argv)
 
 	secret_init(&c);
 
-	clock_gettime(CLOCK_MONOTONIC, &now);
+	if (clock_gettime(CLOCK_MONOTONIC, &now))
+		die_perror("Failed to get CLOCK_MONOTONIC time");
 
 	flow_init();
 
@@ -313,7 +315,8 @@ loop:
 	if (nfds == -1 && errno != EINTR)
 		die_perror("epoll_wait() failed in main loop");
 
-	clock_gettime(CLOCK_MONOTONIC, &now);
+	if (clock_gettime(CLOCK_MONOTONIC, &now))
+		err_perror("Failed to get CLOCK_MONOTONIC time");
 
 	for (i = 0; i < nfds; i++) {
 		union epoll_ref ref = *((union epoll_ref *)&events[i].data.u64);
diff --git a/pcap.c b/pcap.c
index 2e2ff93..23205dd 100644
--- a/pcap.c
+++ b/pcap.c
@@ -100,12 +100,14 @@ static void pcap_frame(const struct iovec *iov, size_t iovcnt,
 void pcap(const char *pkt, size_t l2len)
 {
 	struct iovec iov = { (char *)pkt, l2len };
-	struct timespec now;
+	struct timespec now = { 0 };
 
 	if (pcap_fd == -1)
 		return;
 
-	clock_gettime(CLOCK_REALTIME, &now);
+	if (clock_gettime(CLOCK_REALTIME, &now))
+		err_perror("Failed to get CLOCK_REALTIME time");
+
 	pcap_frame(&iov, 1, 0, &now);
 }
 
@@ -119,13 +121,14 @@ void pcap(const char *pkt, size_t l2len)
 void pcap_multiple(const struct iovec *iov, size_t frame_parts, unsigned int n,
 		   size_t offset)
 {
-	struct timespec now;
+	struct timespec now = { 0 };
 	unsigned int i;
 
 	if (pcap_fd == -1)
 		return;
 
-	clock_gettime(CLOCK_REALTIME, &now);
+	if (clock_gettime(CLOCK_REALTIME, &now))
+		err_perror("Failed to get CLOCK_REALTIME time");
 
 	for (i = 0; i < n; i++)
 		pcap_frame(iov + i * frame_parts, frame_parts, offset, &now);
@@ -143,12 +146,14 @@ void pcap_multiple(const struct iovec *iov, size_t frame_parts, unsigned int n,
 /* cppcheck-suppress unusedFunction */
 void pcap_iov(const struct iovec *iov, size_t iovcnt, size_t offset)
 {
-	struct timespec now;
+	struct timespec now = { 0 };
 
 	if (pcap_fd == -1)
 		return;
 
-	clock_gettime(CLOCK_REALTIME, &now);
+	if (clock_gettime(CLOCK_REALTIME, &now))
+		err_perror("Failed to get CLOCK_REALTIME time");
+
 	pcap_frame(iov, iovcnt, offset, &now);
 }
 
diff --git a/tcp.c b/tcp.c
index 10ad06a..4e0a17e 100644
--- a/tcp.c
+++ b/tcp.c
@@ -549,7 +549,8 @@ static void tcp_timer_ctl(const struct ctx *c, struct tcp_tap_conn *conn)
 		 (unsigned long long)it.it_value.tv_sec,
 		 (unsigned long long)it.it_value.tv_nsec / 1000 / 1000);
 
-	timerfd_settime(conn->timer, 0, &it, NULL);
+	if (timerfd_settime(conn->timer, 0, &it, NULL))
+		flow_err(conn, "failed to set timer: %s", strerror(errno));
 }
 
 /**
@@ -2235,7 +2236,9 @@ void tcp_timer_handler(const struct ctx *c, union epoll_ref ref)
 	 * timer is currently armed, this event came from a previous setting,
 	 * and we just set the timer to a new point in the future: discard it.
 	 */
-	timerfd_gettime(conn->timer, &check_armed);
+	if (timerfd_gettime(conn->timer, &check_armed))
+		flow_err(conn, "failed to read timer: %s", strerror(errno));
+
 	if (check_armed.it_value.tv_sec || check_armed.it_value.tv_nsec)
 		return;
 
@@ -2273,7 +2276,10 @@ void tcp_timer_handler(const struct ctx *c, union epoll_ref ref)
 		 * case. This avoids having to preemptively reset the timer on
 		 * ~ACK_TO_TAP_DUE or ~ACK_FROM_TAP_DUE.
 		 */
-		timerfd_settime(conn->timer, 0, &new, &old);
+		if (timerfd_settime(conn->timer, 0, &new, &old))
+			flow_err(conn, "failed to set timer: %s",
+				 strerror(errno));
+
 		if (old.it_value.tv_sec == ACT_TIMEOUT) {
 			flow_dbg(conn, "activity timeout");
 			tcp_rst(c, conn);

From b1a607fba11b3325117b76ffb41cc6edff774abf Mon Sep 17 00:00:00 2001
From: Stefano Brivio <sbrivio@redhat.com>
Date: Fri, 25 Oct 2024 00:48:10 +0200
Subject: [PATCH 074/382] udp: Take care of cert-int09-c clang-tidy warning for
 enum udp_iov_idx

/home/sbrivio/passt/udp.c:171:1: error: inital values in enum 'udp_iov_idx' are not consistent, consider explicit initialization of all, none or only the first enumerator [cert-int09-c,readability-enum-initial-value,-warnings-as-errors]
  171 | enum udp_iov_idx {
      | ^
  172 |         UDP_IOV_TAP     = 0,
  173 |         UDP_IOV_ETH     = 1,
  174 |         UDP_IOV_IP      = 2,
  175 |         UDP_IOV_PAYLOAD = 3,
  176 |         UDP_NUM_IOVS
      |
      |                      = 4

Don't initialise any value, so that it's obvious that constants map to
unique values.

Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
---
 udp.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/udp.c b/udp.c
index 100610f..0c01067 100644
--- a/udp.c
+++ b/udp.c
@@ -169,11 +169,11 @@ udp_meta[UDP_MAX_FRAMES];
  * @UDP_NUM_IOVS        the number of entries in the iovec array
  */
 enum udp_iov_idx {
-	UDP_IOV_TAP	= 0,
-	UDP_IOV_ETH	= 1,
-	UDP_IOV_IP	= 2,
-	UDP_IOV_PAYLOAD	= 3,
-	UDP_NUM_IOVS
+	UDP_IOV_TAP,
+	UDP_IOV_ETH,
+	UDP_IOV_IP,
+	UDP_IOV_PAYLOAD,
+	UDP_NUM_IOVS,
 };
 
 /* IOVs and msghdr arrays for receiving datagrams from sockets */

From ee7d0b62a716201abc818eb0d1df4c6bb1051336 Mon Sep 17 00:00:00 2001
From: Stefano Brivio <sbrivio@redhat.com>
Date: Fri, 25 Oct 2024 00:57:58 +0200
Subject: [PATCH 075/382] util: Don't use errno after a successful call in
 __daemon()

I thought we could just set errno to 0, do a bunch of stuff, and check
that errno didn't change to infer we succeeded. But clang-tidy,
starting with LLVM 19, reports:

/home/sbrivio/passt/util.c:465:6: error: An undefined value may be read from 'errno' [clang-analyzer-unix.Errno,-warnings-as-errors]
  465 |         if (errno)
      |             ^
/usr/include/errno.h:38:16: note: expanded from macro 'errno'
   38 | # define errno (*__errno_location ())
      |                ^~~~~~~~~~~~~~~~~~~~~~
/home/sbrivio/passt/util.c:446:6: note: Assuming the condition is false
  446 |         if (pid == -1) {
      |             ^~~~~~~~~
/home/sbrivio/passt/util.c:446:2: note: Taking false branch
  446 |         if (pid == -1) {
      |         ^
/home/sbrivio/passt/util.c:451:6: note: Assuming 'pid' is 0
  451 |         if (pid) {
      |             ^~~
/home/sbrivio/passt/util.c:451:2: note: Taking false branch
  451 |         if (pid) {
      |         ^
/home/sbrivio/passt/util.c:463:2: note: Assuming that 'close' is successful; 'errno' becomes undefined after the call
  463 |         close(devnull_fd);
      |         ^~~~~~~~~~~~~~~~~
/home/sbrivio/passt/util.c:465:6: note: An undefined value may be read from 'errno'
  465 |         if (errno)
      |             ^
/usr/include/errno.h:38:16: note: expanded from macro 'errno'
   38 | # define errno (*__errno_location ())
      |                ^~~~~~~~~~~~~~~~~~~~~~

And the LLVM documentation for the unix.Errno checker, 1.1.8.3
unix.Errno (C), mentions, at:

  https://clang.llvm.org/docs/analyzer/checkers.html#unix-errno

that:

  The C and POSIX standards often do not define if a standard library
  function may change value of errno if the call does not fail.
  Therefore, errno should only be used if it is known from the return
  value of a function that the call has failed.

which is, somewhat surprisingly, the case for close().

Instead of using errno, check the actual return values of the calls
we issue here.
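
The pitfall and the fix, in short (illustration only):

	errno = 0;
	close(devnull_fd);	/* errno is unspecified after a successful close() */
	if (errno)		/* ...so this check may read an undefined value */
		exit(EXIT_FAILURE);

becomes:

	if (close(devnull_fd))	/* the return value tells us whether it failed */
		exit(EXIT_FAILURE);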

Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
---
 util.c | 15 +++++----------
 1 file changed, 5 insertions(+), 10 deletions(-)

diff --git a/util.c b/util.c
index 1d6d009..dddef93 100644
--- a/util.c
+++ b/util.c
@@ -444,16 +444,11 @@ int __daemon(int pidfile_fd, int devnull_fd)
 		exit(EXIT_SUCCESS);
 	}
 
-	errno = 0;
-
-	setsid();
-
-	dup2(devnull_fd, STDIN_FILENO);
-	dup2(devnull_fd, STDOUT_FILENO);
-	dup2(devnull_fd, STDERR_FILENO);
-	close(devnull_fd);
-
-	if (errno)
+	if (setsid()				< 0 ||
+	    dup2(devnull_fd, STDIN_FILENO)	< 0 ||
+	    dup2(devnull_fd, STDOUT_FILENO)	< 0 ||
+	    dup2(devnull_fd, STDERR_FILENO)	< 0 ||
+	    close(devnull_fd))
 		exit(EXIT_FAILURE);
 
 	return 0;

From d165d36a0c88fe8665da012f35cb60ced991568b Mon Sep 17 00:00:00 2001
From: Stefano Brivio <sbrivio@redhat.com>
Date: Wed, 30 Oct 2024 21:31:05 +0100
Subject: [PATCH 076/382] tcp: Fix build against musl, __sum16 comes from
 linux/types.h

Use a plain uint16_t instead and avoid including one extra header:
the 'bitwise' attribute of __sum16 is just used by sparse(1).
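
For reference, linux/types.h declares it as (roughly):

	typedef __u16 __bitwise __sum16;

and __bitwise expands to nothing unless building under sparse(1)
(__CHECKER__), so a plain uint16_t is equivalent for our purposes.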

Reported-by: omni <omni+alpine@hack.org>
Fixes: 3d484aa37090 ("tcp: Update TCP checksum using an iovec array")
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
---
 tcp.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tcp.c b/tcp.c
index 4e0a17e..56ceba6 100644
--- a/tcp.c
+++ b/tcp.c
@@ -766,7 +766,7 @@ static void tcp_update_check_tcp4(const struct iphdr *iph,
 	struct in_addr saddr = { .s_addr = iph->saddr };
 	struct in_addr daddr = { .s_addr = iph->daddr };
 	size_t check_ofs;
-	__sum16 *check;
+	uint16_t *check;
 	int check_idx;
 	uint32_t sum;
 	char *ptr;
@@ -797,7 +797,7 @@ static void tcp_update_check_tcp4(const struct iphdr *iph,
 		return;
 	}
 
-	check = (__sum16 *)ptr;
+	check = (uint16_t *)ptr;
 
 	*check = 0;
 	*check = csum_iov(iov, iov_cnt, l4offset, sum);
@@ -816,7 +816,7 @@ static void tcp_update_check_tcp6(const struct ipv6hdr *ip6h,
 {
 	uint16_t l4len = ntohs(ip6h->payload_len);
 	size_t check_ofs;
-	__sum16 *check;
+	uint16_t *check;
 	int check_idx;
 	uint32_t sum;
 	char *ptr;
@@ -848,7 +848,7 @@ static void tcp_update_check_tcp6(const struct ipv6hdr *ip6h,
 		return;
 	}
 
-	check = (__sum16 *)ptr;
+	check = (uint16_t *)ptr;
 
 	*check = 0;
 	*check = csum_iov(iov, iov_cnt, l4offset, sum);

From 9afce0b45c396e43a5499f227cc21849812a435b Mon Sep 17 00:00:00 2001
From: Stefano Brivio <sbrivio@redhat.com>
Date: Wed, 30 Oct 2024 21:36:18 +0100
Subject: [PATCH 077/382] tap: Explicitly cast TUNSETIFF to fix build warning
 with musl on ppc64le

On ppc64le, TUNSETIFF happens to be 2147767498, which is bigger than
INT_MAX (2^31 - 1), and musl declares the second argument of ioctl()
as 'int', not 'unsigned long' like glibc does, probably because of how
POSIX specifies the equivalent argument, int dcmd, in posix_devctl(),
so gcc reports a warning:

tap.c: In function 'tap_ns_tun':
tap.c:1291:24: warning: overflow in conversion from 'long unsigned int' to 'int' changes value from '2147767498' to '-2147199798' [-Woverflow]
 1291 |         rc = ioctl(fd, TUNSETIFF, &ifr);
      |                        ^~~~~~~~~

We don't care about that overflow, so explicitly cast TUNSETIFF to
int.

Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
---
 tap.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tap.c b/tap.c
index cfb82e9..f638f2c 100644
--- a/tap.c
+++ b/tap.c
@@ -1288,7 +1288,7 @@ static int tap_ns_tun(void *arg)
 	if (fd < 0)
 		die_perror("Failed to open() /dev/net/tun");
 
-	rc = ioctl(fd, TUNSETIFF, &ifr);
+	rc = ioctl(fd, (int)TUNSETIFF, &ifr);
 	if (rc < 0)
 		die_perror("TUNSETIFF ioctl on /dev/net/tun failed");
 

From 5e93bcd8bff7ea373d7befa1cf9761c6fff994b2 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Tue, 5 Nov 2024 12:44:04 +1100
Subject: [PATCH 078/382] test: Adjust misplaced sleeps in two_guests code

Most of our transfer tests using socat use 'sleep' to wait for the server
side to be ready before starting the client.  However, in two_guests/basic
the sleep is in the wrong place: rather than being between starting the
server and starting the client, it's after waiting for the server to
complete.  This causes occasional hangs when the client runs before the
server is ready - in that case the receiving guest sends an RST, which we
don't (currently) propagate back to the sender.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 test/two_guests/basic | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/test/two_guests/basic b/test/two_guests/basic
index 9ba5efe..e2338ff 100644
--- a/test/two_guests/basic
+++ b/test/two_guests/basic
@@ -52,33 +52,33 @@ check	[ "__ADDR2_6__" = "__HOST_ADDR6__" ]
 test	TCP/IPv4: guest 1 > guest 2
 g1out	GW1 ip -j -4 route show|jq -rM '.[] | select(.dst == "default").gateway'
 guest2b	socat -u TCP4-LISTEN:10004 OPEN:msg,create,trunc
+sleep	1
 guest1	echo "Hello_from_guest_1" | socat -u STDIN TCP4:__GW1__:10004
 guest2w
-sleep	1
 g2out	MSG2 cat msg
 check	[ "__MSG2__" = "Hello_from_guest_1" ]
 
 test	TCP/IPv6: guest 2 > guest 1
 g2out	GW2_6 ip -j -6 route show|jq -rM '.[] | select(.dst == "default").gateway'
 guest1b	socat -u TCP6-LISTEN:10001 OPEN:msg,create,trunc
+sleep	1
 guest2	echo "Hello_from_guest_2" | socat -u STDIN TCP6:[__GW2_6__%__IFNAME2__]:10001
 guest1w
-sleep	1
 g1out	MSG1 cat msg
 check	[ "__MSG1__" = "Hello_from_guest_2" ]
 
 test	UDP/IPv4: guest 1 > guest 2
 guest2b	socat -u TCP4-LISTEN:10004 OPEN:msg,create,trunc
+sleep	1
 guest1	echo "Hello_from_guest_1" | socat -u STDIN TCP4:__GW1__:10004
 guest2w
-sleep	1
 g2out	MSG2 cat msg
 check	[ "__MSG2__" = "Hello_from_guest_1" ]
 
 test	UDP/IPv6: guest 2 > guest 1
 guest1b	socat -u TCP6-LISTEN:10001 OPEN:msg,create,trunc
+sleep	1
 guest2	echo "Hello_from_guest_2" | socat -u STDIN TCP6:[__GW2_6__%__IFNAME2__]:10001
 guest1w
-sleep	1
 g1out	MSG1 cat msg
 check	[ "__MSG1__" = "Hello_from_guest_2" ]

From 8f1b6a0ca68ae1530ac193cc47cd17ae8cbfd45d Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Wed, 6 Nov 2024 10:25:17 +1100
Subject: [PATCH 079/382] clang: Add .clang-format file

I've been experimenting with clangd, but its default format style is
horrid.  Since our style is basically that of the Linux kernel, copy the
.clang-format from the kernel, minus references to a bunch of
kernel-specific macros.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 .clang-format | 126 ++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 126 insertions(+)
 create mode 100644 .clang-format

diff --git a/.clang-format b/.clang-format
new file mode 100644
index 0000000..78f177a
--- /dev/null
+++ b/.clang-format
@@ -0,0 +1,126 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+# clang-format configuration file. Intended for clang-format >= 11.
+#
+# For more information, see:
+#
+#   Documentation/dev-tools/clang-format.rst
+#   https://clang.llvm.org/docs/ClangFormat.html
+#   https://clang.llvm.org/docs/ClangFormatStyleOptions.html
+#
+---
+AccessModifierOffset: -4
+AlignAfterOpenBracket: Align
+AlignConsecutiveAssignments: false
+AlignConsecutiveDeclarations: false
+AlignEscapedNewlines: Left
+AlignOperands: true
+AlignTrailingComments: false
+AllowAllParametersOfDeclarationOnNextLine: false
+AllowShortBlocksOnASingleLine: false
+AllowShortCaseLabelsOnASingleLine: false
+AllowShortFunctionsOnASingleLine: None
+AllowShortIfStatementsOnASingleLine: false
+AllowShortLoopsOnASingleLine: false
+AlwaysBreakAfterDefinitionReturnType: None
+AlwaysBreakAfterReturnType: None
+AlwaysBreakBeforeMultilineStrings: false
+AlwaysBreakTemplateDeclarations: false
+BinPackArguments: true
+BinPackParameters: true
+BraceWrapping:
+  AfterClass: false
+  AfterControlStatement: false
+  AfterEnum: false
+  AfterFunction: true
+  AfterNamespace: true
+  AfterObjCDeclaration: false
+  AfterStruct: false
+  AfterUnion: false
+  AfterExternBlock: false
+  BeforeCatch: false
+  BeforeElse: false
+  IndentBraces: false
+  SplitEmptyFunction: true
+  SplitEmptyRecord: true
+  SplitEmptyNamespace: true
+BreakBeforeBinaryOperators: None
+BreakBeforeBraces: Custom
+BreakBeforeInheritanceComma: false
+BreakBeforeTernaryOperators: false
+BreakConstructorInitializersBeforeComma: false
+BreakConstructorInitializers: BeforeComma
+BreakAfterJavaFieldAnnotations: false
+BreakStringLiterals: false
+ColumnLimit: 80
+CommentPragmas: '^ IWYU pragma:'
+CompactNamespaces: false
+ConstructorInitializerAllOnOneLineOrOnePerLine: false
+ConstructorInitializerIndentWidth: 8
+ContinuationIndentWidth: 8
+Cpp11BracedListStyle: false
+DerivePointerAlignment: false
+DisableFormat: false
+ExperimentalAutoDetectBinPacking: false
+FixNamespaceComments: false
+
+# Taken from:
+#   git grep -h '^#define [^[:space:]]*for_each[^[:space:]]*(' include/ tools/ \
+#   | sed "s,^#define \([^[:space:]]*for_each[^[:space:]]*\)(.*$,  - '\1'," \
+#   | LC_ALL=C sort -u
+ForEachMacros:
+  - 'for_each_nst'
+
+IncludeBlocks: Preserve
+IncludeCategories:
+  - Regex: '.*'
+    Priority: 1
+IncludeIsMainRegex: '(Test)?$'
+IndentCaseLabels: false
+IndentGotoLabels: false
+IndentPPDirectives: None
+IndentWidth: 8
+IndentWrappedFunctionNames: false
+JavaScriptQuotes: Leave
+JavaScriptWrapImports: true
+KeepEmptyLinesAtTheStartOfBlocks: false
+MacroBlockBegin: ''
+MacroBlockEnd: ''
+MaxEmptyLinesToKeep: 1
+NamespaceIndentation: None
+ObjCBinPackProtocolList: Auto
+ObjCBlockIndentWidth: 8
+ObjCSpaceAfterProperty: true
+ObjCSpaceBeforeProtocolList: true
+
+# Taken from git's rules
+PenaltyBreakAssignment: 10
+PenaltyBreakBeforeFirstCallParameter: 30
+PenaltyBreakComment: 10
+PenaltyBreakFirstLessLess: 0
+PenaltyBreakString: 10
+PenaltyExcessCharacter: 100
+PenaltyReturnTypeOnItsOwnLine: 60
+
+PointerAlignment: Right
+ReflowComments: false
+SortIncludes: false
+SortUsingDeclarations: false
+SpaceAfterCStyleCast: false
+SpaceAfterTemplateKeyword: true
+SpaceBeforeAssignmentOperators: true
+SpaceBeforeCtorInitializerColon: true
+SpaceBeforeInheritanceColon: true
+SpaceBeforeParens: ControlStatementsExceptForEachMacros
+SpaceBeforeRangeBasedForLoopColon: true
+SpaceInEmptyParentheses: false
+SpacesBeforeTrailingComments: 1
+SpacesInAngles: false
+SpacesInContainerLiterals: false
+SpacesInCStyleCastParentheses: false
+SpacesInParentheses: false
+SpacesInSquareBrackets: false
+Standard: Cpp03
+TabWidth: 8
+UseTab: Always
+...

From 8346216c9adf34920a6c0724d332c53557051557 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Wed, 6 Nov 2024 10:25:18 +1100
Subject: [PATCH 080/382] Makefile: Simplify exclusion of qrap from static
 checks

There are things in qrap.c that clang-tidy complains about that aren't
worth fixing.  So, we currently exclude it using $(filter-out).  However,
we already have a make variable which has just the passt sources, excluding
qrap, so we can use that instead of the awkward filter-out expression.

Currently, we still include qrap.c for cppcheck, but there's not much
point doing so: it's, well, qrap, so we don't care that much about lints.
Exclude it from cppcheck as well, for consistency.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 Makefile | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/Makefile b/Makefile
index c1c6e30..8e14309 100644
--- a/Makefile
+++ b/Makefile
@@ -262,7 +262,7 @@ docs: README.md
 #	parentheses to reinforce that certainly won't improve readability.
 
 
-clang-tidy: $(filter-out qrap.c,$(SRCS)) $(HEADERS)
+clang-tidy: $(PASST_SRCS) $(HEADERS)
 	clang-tidy -checks=*,-modernize-*,\
 	-clang-analyzer-valist.Uninitialized,\
 	-cppcoreguidelines-init-variables,\
@@ -290,14 +290,14 @@ clang-tidy: $(filter-out qrap.c,$(SRCS)) $(HEADERS)
 	-cppcoreguidelines-macro-to-enum,\
 	-readability-math-missing-parentheses \
 	-config='{CheckOptions: [{key: bugprone-suspicious-string-compare.WarnOnImplicitComparison, value: "false"}]}' \
-	--warnings-as-errors=* $(filter-out qrap.c,$(SRCS)) -- $(filter-out -pie,$(FLAGS) $(CFLAGS) $(CPPFLAGS)) -DCLANG_TIDY_58992
+	--warnings-as-errors=* $(PASST_SRCS) -- $(filter-out -pie,$(FLAGS) $(CFLAGS) $(CPPFLAGS)) -DCLANG_TIDY_58992
 
 SYSTEM_INCLUDES := /usr/include $(wildcard /usr/include/$(TARGET))
 ifeq ($(shell $(CC) -v 2>&1 | grep -c "gcc version"),1)
 VER := $(shell $(CC) -dumpversion)
 SYSTEM_INCLUDES += /usr/lib/gcc/$(TARGET)/$(VER)/include
 endif
-cppcheck: $(SRCS) $(HEADERS)
+cppcheck: $(PASST_SRCS) $(HEADERS)
 	if cppcheck --check-level=exhaustive /dev/null > /dev/null 2>&1; then \
 		CPPCHECK_EXHAUSTIVE="--check-level=exhaustive";		\
 	else								\
@@ -313,4 +313,4 @@ cppcheck: $(SRCS) $(HEADERS)
 	--inline-suppr							\
 	--suppress=unusedStructMember					\
 	$(filter -D%,$(FLAGS) $(CFLAGS) $(CPPFLAGS))			\
-	$(SRCS) $(HEADERS)
+	$(PASST_SRCS) $(HEADERS)

From b78e72da0b27e222592ff1f1578c69bad4756c65 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Wed, 6 Nov 2024 10:25:19 +1100
Subject: [PATCH 081/382] clang: Move clang-tidy configuration from Makefile to
 .clang-tidy

Currently we configure clang-tidy with a very long command line spelled out
in the Makefile (mostly a big list of lints to disable).  Move it from here
into a .clang-tidy configuration file, so that the config is accessible if
clang-tidy is invoked in other ways (e.g. via clangd) as well.  As a bonus
this also means that we can move the bulky comments about why we're
suppressing various tests inline with the relevant config lines.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 .clang-tidy |  93 +++++++++++++++++++++++++++++++++++++++++++
 Makefile    | 111 +---------------------------------------------------
 2 files changed, 95 insertions(+), 109 deletions(-)
 create mode 100644 .clang-tidy

diff --git a/.clang-tidy b/.clang-tidy
new file mode 100644
index 0000000..9d346ec
--- /dev/null
+++ b/.clang-tidy
@@ -0,0 +1,93 @@
+---
+Checks:
+    - "clang-diagnostic-*,clang-analyzer-*,*,-modernize-*"
+
+    #	TODO: enable once https://bugs.llvm.org/show_bug.cgi?id=41311 is fixed
+    - "-clang-analyzer-valist.Uninitialized"
+
+    #	Dubious value, would kill readability
+    - "-cppcoreguidelines-init-variables"
+
+    #	Dubious value over the compiler's built-in warning.  Would
+    #	increase verbosity.
+    - "-bugprone-assignment-in-if-condition"
+
+    #	Debatable whether these improve readability, right now it would look
+    #	like a mess
+    - "-google-readability-braces-around-statements"
+    - "-hicpp-braces-around-statements"
+    - "-readability-braces-around-statements"
+
+    #	TODO: in most cases they are justified, but probably not everywhere
+    #
+    - "-readability-magic-numbers"
+    - "-cppcoreguidelines-avoid-magic-numbers"
+
+    #	TODO: this is Linux-only for the moment, nice to fix eventually
+    - "-llvmlibc-restrict-system-libc-headers"
+
+    #	Those are needed for syscalls, epoll_wait flags, etc.
+    - "-hicpp-signed-bitwise"
+
+    #	Probably not doable to impement this without plain memcpy(), memset()
+    - "-clang-analyzer-security.insecureAPI.DeprecatedOrUnsafeBufferHandling"
+
+    #	TODO: not really important, but nice to fix eventually
+    - "-llvm-include-order"
+
+    #	Dubious value, would kill readability
+    - "-readability-isolate-declaration"
+
+    #	TODO: nice to fix eventually
+    - "-bugprone-narrowing-conversions"
+    - "-cppcoreguidelines-narrowing-conversions"
+
+    #	TODO: check, fix, and more in general constify wherever possible
+    - "-cppcoreguidelines-avoid-non-const-global-variables"
+
+    #	TODO: check paths where it might make sense to improve performance
+    - "-altera-unroll-loops"
+    - "-altera-id-dependent-backward-branch"
+
+    #	Not much can be done about them other than being careful
+    - "-bugprone-easily-swappable-parameters"
+
+    #	TODO: split reported functions
+    - "-readability-function-cognitive-complexity"
+
+    #	"Poor" alignment needed for structs reflecting message formats/headers
+    - "-altera-struct-pack-align"
+
+    #	TODO: check again if multithreading is implemented
+    - "-concurrency-mt-unsafe"
+
+    #	Complains about any identifier <3 characters, reasonable for
+    #	globals, pointlessly verbose for locals and parameters.
+    - "-readability-identifier-length"
+
+    #	Wants to include headers which *directly* provide the things
+    #	we use.  That sounds nice, but means it will often want a OS
+    #	specific header instead of a mostly standard one, such as
+    #	<linux/limits.h> instead of <limits.h>.
+    - "-misc-include-cleaner"
+
+    #	Want to replace all #defines of integers with enums.  Kind of
+    #	makes sense when those defines form an enum-like set, but
+    #	weird for cases like standalone constants, and causes other
+    #	awkwardness for a bunch of cases we use
+    - "-cppcoreguidelines-macro-to-enum"
+
+    #	It's been a couple of centuries since multiplication has been granted
+    #	precedence over addition in modern mathematical notation. Adding
+    #	parentheses to reinforce that certainly won't improve readability.
+    - "-readability-math-missing-parentheses"
+WarningsAsErrors: "*"
+HeaderFileExtensions:
+    - h
+ImplementationFileExtensions:
+    - c
+HeaderFilterRegex: ""
+FormatStyle: none
+CheckOptions:
+    bugprone-suspicious-string-compare.WarnOnImplicitComparison: "false"
+SystemHeaders: false
diff --git a/Makefile b/Makefile
index 8e14309..f1e9937 100644
--- a/Makefile
+++ b/Makefile
@@ -181,116 +181,9 @@ docs: README.md
 		done < README.md;					\
 	) > README.plain.md
 
-# Checkers currently disabled for clang-tidy:
-# - llvmlibc-restrict-system-libc-headers
-#	TODO: this is Linux-only for the moment, nice to fix eventually
-#
-# - google-readability-braces-around-statements
-# - hicpp-braces-around-statements
-# - readability-braces-around-statements
-#	Debatable whether that improves readability, right now it would look
-#	like a mess
-#
-# - readability-magic-numbers
-# - cppcoreguidelines-avoid-magic-numbers
-#	TODO: in most cases they are justified, but probably not everywhere
-#
-# - clang-analyzer-valist.Uninitialized
-#	TODO: enable once https://bugs.llvm.org/show_bug.cgi?id=41311 is fixed
-#
-# - clang-analyzer-security.insecureAPI.DeprecatedOrUnsafeBufferHandling
-#	Probably not doable to impement this without plain memcpy(), memset()
-#
-# - cppcoreguidelines-init-variables
-#	Dubious value, would kill readability
-#
-# - hicpp-signed-bitwise
-#	Those are needed for syscalls, epoll_wait flags, etc.
-#
-# - llvm-include-order
-#	TODO: not really important, but nice to fix eventually
-#
-# - readability-isolate-declaration
-#	Dubious value, would kill readability
-#
-# - bugprone-narrowing-conversions
-# - cppcoreguidelines-narrowing-conversions
-#	TODO: nice to fix eventually
-#
-# - cppcoreguidelines-avoid-non-const-global-variables
-#	TODO: check, fix, and more in general constify wherever possible
-#
-# - altera-unroll-loops
-# - altera-id-dependent-backward-branch
-#	TODO: check paths where it might make sense to improve performance
-#
-# - bugprone-easily-swappable-parameters
-#	Not much can be done about them other than being careful
-#
-# - readability-function-cognitive-complexity
-#	TODO: split reported functions
-#
-# - altera-struct-pack-align
-#	"Poor" alignment needed for structs reflecting message formats/headers
-#
-# - concurrency-mt-unsafe
-#	TODO: check again if multithreading is implemented
-#
-# - readability-identifier-length
-#	Complains about any identifier <3 characters, reasonable for
-#	globals, pointlessly verbose for locals and parameters.
-#
-# - bugprone-assignment-in-if-condition
-#	Dubious value over the compiler's built-in warning.  Would
-#	increase verbosity.
-#
-# - misc-include-cleaner
-#	Wants to include headers which *directly* provide the things
-#	we use.  That sounds nice, but means it will often want a OS
-#	specific header instead of a mostly standard one, such as
-#	<linux/limits.h> instead of <limits.h>.
-#
-# - cppcoreguidelines-macro-to-enum
-#	Want to replace all #defines of integers with enums.  Kind of
-#	makes sense when those defines form an enum-like set, but
-#	weird for cases like standalone constants, and causes other
-#	awkwardness for a bunch of cases we use
-#
-# - readability-math-missing-parentheses
-#	It's been a couple of centuries since multiplication has been granted
-#	precedence over addition in modern mathematical notation. Adding
-#	parentheses to reinforce that certainly won't improve readability.
-
-
 clang-tidy: $(PASST_SRCS) $(HEADERS)
-	clang-tidy -checks=*,-modernize-*,\
-	-clang-analyzer-valist.Uninitialized,\
-	-cppcoreguidelines-init-variables,\
-	-bugprone-assignment-in-if-condition,\
-	-google-readability-braces-around-statements,\
-	-hicpp-braces-around-statements,\
-	-readability-braces-around-statements,\
-	-readability-magic-numbers,\
-	-llvmlibc-restrict-system-libc-headers,\
-	-hicpp-signed-bitwise,\
-	-clang-analyzer-security.insecureAPI.DeprecatedOrUnsafeBufferHandling,\
-	-llvm-include-order,\
-	-cppcoreguidelines-avoid-magic-numbers,\
-	-readability-isolate-declaration,\
-	-bugprone-narrowing-conversions,\
-	-cppcoreguidelines-narrowing-conversions,\
-	-cppcoreguidelines-avoid-non-const-global-variables,\
-	-altera-unroll-loops,-altera-id-dependent-backward-branch,\
-	-bugprone-easily-swappable-parameters,\
-	-readability-function-cognitive-complexity,\
-	-altera-struct-pack-align,\
-	-concurrency-mt-unsafe,\
-	-readability-identifier-length,\
-	-misc-include-cleaner,\
-	-cppcoreguidelines-macro-to-enum,\
-	-readability-math-missing-parentheses \
-	-config='{CheckOptions: [{key: bugprone-suspicious-string-compare.WarnOnImplicitComparison, value: "false"}]}' \
-	--warnings-as-errors=* $(PASST_SRCS) -- $(filter-out -pie,$(FLAGS) $(CFLAGS) $(CPPFLAGS)) -DCLANG_TIDY_58992
+	clang-tidy $(PASST_SRCS) -- $(filter-out -pie,$(FLAGS) $(CFLAGS) $(CPPFLAGS)) \
+	           -DCLANG_TIDY_58992
 
 SYSTEM_INCLUDES := /usr/include $(wildcard /usr/include/$(TARGET))
 ifeq ($(shell $(CC) -v 2>&1 | grep -c "gcc version"),1)

From 30b4f8816774665321e6903b4f55a929b015d16d Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Wed, 6 Nov 2024 10:25:20 +1100
Subject: [PATCH 082/382] arch: Avoid explicit access to 'environ'

We pass 'environ' to execve() in arch_avx2_exec(), so that we retain the
environment in the current process.  But the declaration of 'environ' is
a bit weird: it doesn't seem to be in a standard header, and requires a
manual explicit declaration.  We can avoid referencing it explicitly by
using execv() instead of execve().  This removes a clang warning.
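
In short (sketch, see the hunk below):

	extern char **environ;		/* manual declaration, no longer needed */

	execv(new_path, argv);		/* the caller's environment is inherited */

execv() leaves the calling process's environment in place, which is all
that passing 'environ' to execve() achieved here.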

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 arch.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch.c b/arch.c
index d1dfb73..e1ee729 100644
--- a/arch.c
+++ b/arch.c
@@ -45,7 +45,7 @@ void arch_avx2_exec(char **argv)
 				   "%s.avx2", exe))
 			die_perror("Can't build AVX2 executable path");
 
-		execve(new_path, argv, environ);
+		execv(new_path, argv);
 		warn_perror("Can't run AVX2 build, using non-AVX2 version");
 	}
 }

From f6b546c6e4f036bc569df05cf76eced3f68d6db8 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Wed, 6 Nov 2024 10:25:21 +1100
Subject: [PATCH 083/382] flow: Correct type of flowside_at_sidx()

Due to a copy-pasta error, this returns 'PIF_NONE' instead of NULL on the
failure case.  PIF_NONE expands to 0, which turns into NULL, but it's
still confusing, so fix it.  This removes a clang warning.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 flow_table.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/flow_table.h b/flow_table.h
index a499e7b..f15db53 100644
--- a/flow_table.h
+++ b/flow_table.h
@@ -110,7 +110,7 @@ static inline const struct flowside *flowside_at_sidx(flow_sidx_t sidx)
 	const union flow *flow = flow_at_sidx(sidx);
 
 	if (!flow)
-		return PIF_NONE;
+		return NULL;
 
 	return &flow->f.side[sidx.sidei];
 }

From c938d8a93e2561df1a4ac7897327456e97babb8c Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Wed, 6 Nov 2024 10:25:22 +1100
Subject: [PATCH 084/382] netlink: RTA_PAYLOAD() returns int, not size_t

Since it's the size of a chunk of memory, it would seem logical that
RTA_PAYLOAD() returns size_t.  However, it doesn't: it explicitly casts
its result to an int.  RTNH_OK(), which often takes the result of
RTA_PAYLOAD() as a parameter, compares it to an int, so using size_t can
trigger different-signedness comparison warnings from clang.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 netlink.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/netlink.c b/netlink.c
index 0bdbabf..4aba2a3 100644
--- a/netlink.c
+++ b/netlink.c
@@ -353,7 +353,7 @@ unsigned int nl_get_ext_if(int s, sa_family_t af)
  */
 bool nl_route_get_def_multipath(struct rtattr *rta, void *gw)
 {
-	size_t nh_len = RTA_PAYLOAD(rta);
+	int nh_len = RTA_PAYLOAD(rta);
 	struct rtnexthop *rtnh;
 	bool found = false;
 	int hops = -1;
@@ -582,7 +582,7 @@ int nl_route_dup(int s_src, unsigned int ifi_src,
 
 				*(unsigned int *)RTA_DATA(rta) = ifi_dst;
 			} else if (rta->rta_type == RTA_MULTIPATH) {
-				size_t nh_len = RTA_PAYLOAD(rta);
+				int nh_len = RTA_PAYLOAD(rta);
 				struct rtnexthop *rtnh;
 
 				for (rtnh = (struct rtnexthop *)RTA_DATA(rta);

From 93bce404c19652b40f2104633286b6dac5f85b0e Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Wed, 6 Nov 2024 10:25:23 +1100
Subject: [PATCH 085/382] Makefile: Move NETNS_RUN_DIR definition to C code

NETNS_RUN_DIR is set in the Makefile, then passed into the C code with
-D.  But NETNS_RUN_DIR is just a fixed string; it doesn't depend on any
make probes or variables, so there's really no reason to handle it via the
Makefile.  Just move it to a plain #define in conf.c.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 Makefile | 1 -
 conf.c   | 2 ++
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/Makefile b/Makefile
index f1e9937..41f24e8 100644
--- a/Makefile
+++ b/Makefile
@@ -44,7 +44,6 @@ FLAGS := -Wall -Wextra -Wno-format-zero-length
 FLAGS += -pedantic -std=c11 -D_XOPEN_SOURCE=700 -D_GNU_SOURCE
 FLAGS +=  $(FORTIFY_FLAG) -O2 -pie -fPIE
 FLAGS += -DPAGE_SIZE=$(shell getconf PAGE_SIZE)
-FLAGS += -DNETNS_RUN_DIR=\"/run/netns\"
 FLAGS += -DPASST_AUDIT_ARCH=AUDIT_ARCH_$(AUDIT_ARCH)
 FLAGS += -DRLIMIT_STACK_VAL=$(RLIMIT_STACK_VAL)
 FLAGS += -DARCH=\"$(TARGET_ARCH)\"
diff --git a/conf.c b/conf.c
index 14411b4..86566db 100644
--- a/conf.c
+++ b/conf.c
@@ -46,6 +46,8 @@
 #include "isolation.h"
 #include "log.h"
 
+#define NETNS_RUN_DIR	"/run/netns"
+
 /**
  * next_chunk - Return the next piece of a string delimited by a character
  * @s:		String to search

From 7917159005d41d2f87213645e9460534beb1e14f Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Wed, 6 Nov 2024 10:25:24 +1100
Subject: [PATCH 086/382] seccomp: Simplify handling of AUDIT_ARCH

Currently we construct the AUDIT_ARCH variable in the Makefile, then pass
it into the C code with -D.  The only place that uses it, though, is the
BPF filter generated by seccomp.sh.  seccomp.sh already needs to do things
differently depending on the arch, so it might as well just insert the
expanded AUDIT_ARCH directly into the generated code, rather than using
a #define.  Arguably this is better, even, since it ensures more locally
that the arch the BPF checks for matches the arch seccomp.sh built the
filter for.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 Makefile   |  9 ---------
 seccomp.sh | 14 ++++++++++++--
 2 files changed, 12 insertions(+), 11 deletions(-)

diff --git a/Makefile b/Makefile
index 41f24e8..c521d04 100644
--- a/Makefile
+++ b/Makefile
@@ -25,14 +25,6 @@ TARGET ?= $(shell $(CC) -dumpmachine)
 TARGET_ARCH := $(shell echo $(TARGET) | cut -f1 -d- | tr [A-Z] [a-z])
 TARGET_ARCH := $(shell echo $(TARGET_ARCH) | sed 's/powerpc/ppc/')
 
-AUDIT_ARCH := $(shell echo $(TARGET_ARCH) | tr [a-z] [A-Z] | sed 's/^ARM.*/ARM/')
-AUDIT_ARCH := $(shell echo $(AUDIT_ARCH) | sed 's/I[456]86/I386/')
-AUDIT_ARCH := $(shell echo $(AUDIT_ARCH) | sed 's/PPC64/PPC/')
-AUDIT_ARCH := $(shell echo $(AUDIT_ARCH) | sed 's/PPCLE/PPC64LE/')
-AUDIT_ARCH := $(shell echo $(AUDIT_ARCH) | sed 's/MIPS64EL/MIPSEL64/')
-AUDIT_ARCH := $(shell echo $(AUDIT_ARCH) | sed 's/HPPA/PARISC/')
-AUDIT_ARCH := $(shell echo $(AUDIT_ARCH) | sed 's/SH4/SH/')
-
 # On some systems enabling optimization also enables source fortification,
 # automagically. Do not override it.
 FORTIFY_FLAG :=
@@ -44,7 +36,6 @@ FLAGS := -Wall -Wextra -Wno-format-zero-length
 FLAGS += -pedantic -std=c11 -D_XOPEN_SOURCE=700 -D_GNU_SOURCE
 FLAGS +=  $(FORTIFY_FLAG) -O2 -pie -fPIE
 FLAGS += -DPAGE_SIZE=$(shell getconf PAGE_SIZE)
-FLAGS += -DPASST_AUDIT_ARCH=AUDIT_ARCH_$(AUDIT_ARCH)
 FLAGS += -DRLIMIT_STACK_VAL=$(RLIMIT_STACK_VAL)
 FLAGS += -DARCH=\"$(TARGET_ARCH)\"
 FLAGS += -DVERSION=\"$(VERSION)\"
diff --git a/seccomp.sh b/seccomp.sh
index 38aa826..6499c58 100755
--- a/seccomp.sh
+++ b/seccomp.sh
@@ -20,6 +20,15 @@ OUT="$(mktemp)"
 [ -z "${ARCH}" ] && ARCH="$(uname -m)"
 [ -z "${CC}" ] && CC="cc"
 
+AUDIT_ARCH="AUDIT_ARCH_$(echo ${ARCH} | tr [a-z] [A-Z]             \
+                                      | sed 's/^ARM.*/ARM/'        \
+                                      | sed 's/I[456]86/I386/'     \
+                                      | sed 's/PPC64/PPC/'         \
+                                      | sed 's/PPCLE/PPC64LE/'     \
+                                      | sed 's/MIPS64EL/MIPSEL64/' \
+                                      | sed 's/HPPA/PARISC/'       \
+                                      | sed 's/SH4/SH/')"
+
 HEADER="/* This file was automatically generated by $(basename ${0}) */
 
 #ifndef AUDIT_ARCH_PPC64LE
@@ -32,7 +41,7 @@ struct sock_filter filter_@PROFILE@[] = {
 	/* cppcheck-suppress [badBitmaskCheck, unmatchedSuppression] */
 	BPF_STMT(BPF_LD | BPF_W | BPF_ABS,
 		 (offsetof(struct seccomp_data, arch))),
-	BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, PASST_AUDIT_ARCH, 0, @KILL@),
+	BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, @AUDIT_ARCH@, 0, @KILL@),
 	/* cppcheck-suppress [badBitmaskCheck, unmatchedSuppression] */
 	BPF_STMT(BPF_LD | BPF_W | BPF_ABS,
 		 (offsetof(struct seccomp_data, nr))),
@@ -233,7 +242,8 @@ gen_profile() {
 		sub ${__i} CALL "NR:${__nr}" "NAME:${__name}" "ALLOW:${__allow}"
 	done
 
-	finish PRE "PROFILE:${__profile}" "KILL:$(( __statements + 1))"
+	finish PRE "PROFILE:${__profile}" "KILL:$(( __statements + 1))" \
+	       "AUDIT_ARCH:${AUDIT_ARCH}"
 }
 
 printf '%s\n' "${HEADER}" > "${OUT}"

From 13fc6d511eb89b15a0941c63ae44f147572b1470 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Wed, 6 Nov 2024 10:25:25 +1100
Subject: [PATCH 087/382] Makefile: Use -DARCH for qrap only

We insert -DARCH for all compiles, based on TARGET_ARCH determined in the
Makefile.  However, this is only used in qrap.c, not anywhere else in
passt or pasta.  Only supply this -D when compiling qrap specifically.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 Makefile | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/Makefile b/Makefile
index c521d04..2a8540a 100644
--- a/Makefile
+++ b/Makefile
@@ -37,7 +37,6 @@ FLAGS += -pedantic -std=c11 -D_XOPEN_SOURCE=700 -D_GNU_SOURCE
 FLAGS +=  $(FORTIFY_FLAG) -O2 -pie -fPIE
 FLAGS += -DPAGE_SIZE=$(shell getconf PAGE_SIZE)
 FLAGS += -DRLIMIT_STACK_VAL=$(RLIMIT_STACK_VAL)
-FLAGS += -DARCH=\"$(TARGET_ARCH)\"
 FLAGS += -DVERSION=\"$(VERSION)\"
 FLAGS += -DDUAL_STACK_SOCKETS=$(DUAL_STACK_SOCKETS)
 
@@ -107,7 +106,7 @@ pasta.avx2 pasta.1 pasta: pasta%: passt%
 	ln -sf $< $@
 
 qrap: $(QRAP_SRCS) passt.h
-	$(CC) $(FLAGS) $(CFLAGS) $(CPPFLAGS) $(QRAP_SRCS) -o qrap $(LDFLAGS)
+	$(CC) $(FLAGS) $(CFLAGS) $(CPPFLAGS) -DARCH=\"$(TARGET_ARCH)\" $(QRAP_SRCS) -o qrap $(LDFLAGS)
 
 valgrind: EXTRA_SYSCALLS += rt_sigprocmask rt_sigtimedwait rt_sigaction	\
 			    rt_sigreturn getpid gettid kill clock_gettime mmap \

From c560e2f65b625367d3baf0fcf06cf19996407659 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Wed, 6 Nov 2024 10:25:26 +1100
Subject: [PATCH 088/382] Makefile: Don't attempt to auto-detect stack size

We probe the available stack limit in the Makefile using rlimit, then use
that to set the size of the stack when we clone() extra threads.  But
the rlimit at compile time need not be the same as the rlimit at runtime,
so that's not particularly sensible.

Ideally, we'd set the stack size based on an estimate of the actual
maximum stack usage of all our clone()ed functions.  We don't have that
at the moment, but to keep things simple just set it to 1MiB - that's what
the current probe will set things to on my default configuration Fedora 40,
so it's likely to be fine in most cases.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 Makefile | 6 ------
 util.h   | 2 +-
 2 files changed, 1 insertion(+), 7 deletions(-)

diff --git a/Makefile b/Makefile
index 2a8540a..56bf2e8 100644
--- a/Makefile
+++ b/Makefile
@@ -15,11 +15,6 @@ VERSION ?= $(shell git describe --tags HEAD 2>/dev/null || echo "unknown\ versio
 # the IPv6 socket API? (Linux does)
 DUAL_STACK_SOCKETS := 1
 
-RLIMIT_STACK_VAL := $(shell /bin/sh -c 'ulimit -s')
-ifeq ($(RLIMIT_STACK_VAL),unlimited)
-RLIMIT_STACK_VAL := 1024
-endif
-
 TARGET ?= $(shell $(CC) -dumpmachine)
 # Get 'uname -m'-like architecture description for target
 TARGET_ARCH := $(shell echo $(TARGET) | cut -f1 -d- | tr [A-Z] [a-z])
@@ -36,7 +31,6 @@ FLAGS := -Wall -Wextra -Wno-format-zero-length
 FLAGS += -pedantic -std=c11 -D_XOPEN_SOURCE=700 -D_GNU_SOURCE
 FLAGS +=  $(FORTIFY_FLAG) -O2 -pie -fPIE
 FLAGS += -DPAGE_SIZE=$(shell getconf PAGE_SIZE)
-FLAGS += -DRLIMIT_STACK_VAL=$(RLIMIT_STACK_VAL)
 FLAGS += -DVERSION=\"$(VERSION)\"
 FLAGS += -DDUAL_STACK_SOCKETS=$(DUAL_STACK_SOCKETS)
 
diff --git a/util.h b/util.h
index 3fc64cf..c341236 100644
--- a/util.h
+++ b/util.h
@@ -132,7 +132,7 @@ static inline uint32_t ntohl_unaligned(const void *p)
 	return ntohl(val);
 }
 
-#define NS_FN_STACK_SIZE	(RLIMIT_STACK_VAL * 1024 / 8)
+#define NS_FN_STACK_SIZE	(1024 * 1024) /* 1MiB */
 int do_clone(int (*fn)(void *), char *stack_area, size_t stack_size, int flags,
 	     void *arg);
 #define NS_CALL(fn, arg)						\

From 1d7cff3779e4bff944ce17c86471a87141c352d2 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Wed, 6 Nov 2024 10:25:27 +1100
Subject: [PATCH 089/382] clang: Add rudimentary clangd configuration

clangd's default configuration seems to try to treat .h files as C++, not
C.  Many spurious warnings are still generated at present, but this
removes some of the most egregious ones.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 .clangd | 3 +++
 1 file changed, 3 insertions(+)
 create mode 100644 .clangd

diff --git a/.clangd b/.clangd
new file mode 100644
index 0000000..41bec92
--- /dev/null
+++ b/.clangd
@@ -0,0 +1,3 @@
+CompileFlags:
+    # Don't try to interpret our headers as C++'
+    Add: [-xc, -Wall]

From 1e76a19895b5d8b2b5994263625fce35373041e7 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Wed, 6 Nov 2024 10:25:28 +1100
Subject: [PATCH 090/382] util: Remove unused ffsl() function

We supply a weak alias for ffsl() in case it's not defined in our libc.
Except.. we don't have any users for it any more, so remove it.

make cppcheck doesn't spot this at present for complicated reasons, but it
might with tweaks to the options I'm experimenting with.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 util.h | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/util.h b/util.h
index c341236..2858b10 100644
--- a/util.h
+++ b/util.h
@@ -158,9 +158,6 @@ int do_clone(int (*fn)(void *), char *stack_area, size_t stack_size, int flags,
 
 struct ctx;
 
-/* cppcheck-suppress funcArgNamesDifferent */
-__attribute__ ((weak)) int ffsl(long int i) { return __builtin_ffsl(i); }
-
 #ifdef CLOSE_RANGE_UNSHARE	/* Linux kernel >= 5.9 */
 /* glibc < 2.34 and musl as of 1.2.5 need these */
 #ifndef SYS_close_range

From c5f4e4d146f6f57a66bd4d7792e8ccf9625d039c Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Wed, 6 Nov 2024 12:43:04 +1100
Subject: [PATCH 091/382] fwd: Squash different-signedness comparison warning

On certain architectures we get a warning about a comparison between
integers of different signedness in fwd_probe_ephemeral().  This is
because NUM_PORTS evaluates to an unsigned integer.  It's a fixed value,
though, and we know it will fit in a signed long on anything reasonable,
so add
a cast to suppress the warning.
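
Most likely this bites on 32-bit targets, where 'long' and 'unsigned int'
have the same width, so the usual arithmetic conversions turn the
comparison into an unsigned one.  The fix is simply (as in the hunk
below):

	if (min < 0 || min >= (long)NUM_PORTS ||
	    max < 0 || max >= (long)NUM_PORTS)
		goto parse_err;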

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 fwd.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/fwd.c b/fwd.c
index c71f5e1..0b7f8b1 100644
--- a/fwd.c
+++ b/fwd.c
@@ -75,8 +75,8 @@ void fwd_probe_ephemeral(void)
 	if (*end || errno)
 		goto parse_err;
 
-	if (min < 0 || min >= NUM_PORTS ||
-	    max < 0 || max >= NUM_PORTS)
+	if (min < 0 || min >= (long)NUM_PORTS ||
+	    max < 0 || max >= (long)NUM_PORTS)
 		goto parse_err;
 
 	fwd_ephemeral_min = min;

From 0d7b8201ed5788416d1b36fc3a554b61ad10c201 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Wed, 6 Nov 2024 17:54:14 +1100
Subject: [PATCH 092/382] linux_dep: Generalise tcp_info.h to handling Linux
 extension compatibility

tcp_info.h exists just to contain a modern enough version of struct
tcp_info for our needs, removing the compile-time dependency on the version
of kernel headers.  There are several other cases where we can remove similar
compile-time dependencies on the kernel version.  Prepare for that by renaming
tcp_info.h to linux_dep.h.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 tcp_info.h => linux_dep.h | 10 ++++++----
 tcp.c                     |  2 +-
 2 files changed, 7 insertions(+), 5 deletions(-)
 rename tcp_info.h => linux_dep.h (97%)

diff --git a/tcp_info.h b/linux_dep.h
similarity index 97%
rename from tcp_info.h
rename to linux_dep.h
index 06ccb16..8921623 100644
--- a/tcp_info.h
+++ b/linux_dep.h
@@ -1,13 +1,15 @@
 /* SPDX-License-Identifier: GPL-2.0-or-later
  * Copyright Red Hat
  *
- * Largely derived from include/linux/tcp.h in the Linux kernel
+ * Declarations for Linux specific dependencies
  */
 
-#ifndef TCP_INFO_H
-#define TCP_INFO_H
+#ifndef LINUX_DEP_H
+#define LINUX_DEP_H
 
 /* struct tcp_info_linux - Information from Linux TCP_INFO getsockopt()
+ *
+ * Largely derived from include/linux/tcp.h in the Linux kernel
  *
  * Some fields returned by TCP_INFO have been there for ages and are shared with
  * BSD.  struct tcp_info from netinet/tcp.h has only those fields.  There are
@@ -117,4 +119,4 @@ struct tcp_info_linux {
 						 */
 };
 
-#endif /* TCP_INFO_H */
+#endif /* LINUX_DEP_H */
diff --git a/tcp.c b/tcp.c
index 56ceba6..1bb122b 100644
--- a/tcp.c
+++ b/tcp.c
@@ -299,10 +299,10 @@
 #include "log.h"
 #include "inany.h"
 #include "flow.h"
+#include "linux_dep.h"
 
 #include "flow_table.h"
 #include "tcp_internal.h"
-#include "tcp_info.h"
 #include "tcp_buf.h"
 
 /* MSS rounding: see SET_MSS() */

From d8e05a3fe0f2db444c51342888b37ed351b66f63 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Wed, 6 Nov 2024 17:54:18 +1100
Subject: [PATCH 093/382] ndp: Use const pointer for ndp_ns packet

We don't modify this structure at all.  For some reason cppcheck doesn't
catch this with our current options, but did when I was experimenting with
some different options.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 ndp.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/ndp.c b/ndp.c
index a1ee834..faae408 100644
--- a/ndp.c
+++ b/ndp.c
@@ -234,8 +234,8 @@ int ndp(struct ctx *c, const struct icmp6hdr *ih, const struct in6_addr *saddr,
 		return 1;
 
 	if (ih->icmp6_type == NS) {
-		struct ndp_ns *ns = packet_get(p, 0, 0, sizeof(struct ndp_ns),
-					       NULL);
+		const struct ndp_ns *ns =
+			packet_get(p, 0, 0, sizeof(struct ndp_ns), NULL);
 
 		if (!ns)
 			return -1;

From 6f913b3af062a889f70758f8d3a458dcf0ac0cdd Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Wed, 6 Nov 2024 17:54:19 +1100
Subject: [PATCH 094/382] udp: Don't dereference uflow before NULL check in
 udp_reply_sock_handler()

We have an ASSERT() verifying that we're able to look up the flow in
udp_reply_sock_handler().  However, we dereference uflow before that in
an initializer, rather defeating the point.  Rearrange to avoid that.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 udp.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/udp.c b/udp.c
index 0c01067..4be165f 100644
--- a/udp.c
+++ b/udp.c
@@ -644,12 +644,13 @@ void udp_reply_sock_handler(const struct ctx *c, union epoll_ref ref,
 	flow_sidx_t tosidx = flow_sidx_opposite(ref.flowside);
 	const struct flowside *toside = flowside_at_sidx(tosidx);
 	struct udp_flow *uflow = udp_at_sidx(ref.flowside);
-	int from_s = uflow->s[ref.flowside.sidei];
 	uint8_t topif = pif_at_sidx(tosidx);
-	int n, i;
+	int n, i, from_s;
 
 	ASSERT(!c->no_udp && uflow);
 
+	from_s = uflow->s[ref.flowside.sidei];
+
 	if (udp_sock_errs(c, from_s, events) < 0) {
 		flow_err(uflow, "Unrecoverable error on reply socket");
 		flow_err_details(uflow);

From 867db07fcfc24d0918fa92f98e26fc8f9bf40253 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Wed, 6 Nov 2024 17:54:20 +1100
Subject: [PATCH 095/382] util: Work around cppcheck bug 6936

While experimenting with cppcheck options, I hit several false positives
caused by this bug: https://trac.cppcheck.net/ticket/13227

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 Makefile |  2 +-
 util.h   | 10 ++++++++++
 2 files changed, 11 insertions(+), 1 deletion(-)

diff --git a/Makefile b/Makefile
index 56bf2e8..3c82f50 100644
--- a/Makefile
+++ b/Makefile
@@ -188,5 +188,5 @@ cppcheck: $(PASST_SRCS) $(HEADERS)
 	$(SYSTEM_INCLUDES:%=--suppress=unmatchedSuppression:%/*)	\
 	--inline-suppr							\
 	--suppress=unusedStructMember					\
-	$(filter -D%,$(FLAGS) $(CFLAGS) $(CPPFLAGS))			\
+	$(filter -D%,$(FLAGS) $(CFLAGS) $(CPPFLAGS)) -D CPPCHECK_6936  \
 	$(PASST_SRCS) $(HEADERS)
diff --git a/util.h b/util.h
index 2858b10..0bf396a 100644
--- a/util.h
+++ b/util.h
@@ -68,6 +68,15 @@
 #define STRINGIFY(x)	#x
 #define STR(x)		STRINGIFY(x)
 
+#ifdef CPPCHECK_6936
+/* Some cppcheck versions get confused by aborts inside a loop, causing
+ * it to give false positive uninitialised variable warnings later in
+ * the function, because it doesn't realise the non-initialising path
+ * already exited.  See https://trac.cppcheck.net/ticket/13227
+ */
+#define ASSERT(expr)		\
+	((expr) ? (void)0 : abort())
+#else
 #define ASSERT(expr)							\
 	do {								\
 		if (!(expr)) {						\
@@ -79,6 +88,7 @@
 			abort();					\
 		}							\
 	} while (0)
+#endif
 
 #ifdef P_tmpdir
 #define TMPDIR		P_tmpdir

From b456ee1b53171c46b6f25c1c43d9fc17f6116745 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Wed, 6 Nov 2024 14:03:21 +1100
Subject: [PATCH 096/382] test: Rename propagating signal handler

nstool in "exec" mode will propagate some signals (specifically SIGTERM) to
the process in the namespace it executes.  The signal handler which
accomplishes this is called simply sig_handler().  However, it turns out
we're going to need some other signal handlers, so rename this to the more
specific sig_propagate().

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 test/nstool.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/test/nstool.c b/test/nstool.c
index fc357d8..3f75edd 100644
--- a/test/nstool.c
+++ b/test/nstool.c
@@ -346,7 +346,7 @@ static int openns(const char *fmt, ...)
 }
 
 static pid_t sig_pid;
-static void sig_handler(int signum)
+static void sig_propagate(int signum)
 {
 	int err;
 
@@ -358,7 +358,7 @@ static void sig_handler(int signum)
 static void wait_for_child(pid_t pid)
 {
 	struct sigaction sa = {
-		.sa_handler = sig_handler,
+		.sa_handler = sig_propagate,
 		.sa_flags = SA_RESETHAND,
 	};
 	int status, err;

From 1699083f291ca8e639d0711eff59c61eecdf02c1 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Wed, 6 Nov 2024 14:03:22 +1100
Subject: [PATCH 097/382] test: Make nstool hold robust against interruptions
 to control clients

Currently nstool die()s on essentially any error.  In most cases that's
fine for our purposes.  However, it's a problem when, in "hold" mode, we
get an IO error on an accept()ed socket.  This could just indicate that
the control client aborted prematurely, in which case we don't want to
kill off the namespace we're holding.

Adjust these to print an error, close() the control client socket and
carry on.  In addition, we need to explicitly ignore SIGPIPE in order not
to be killed by an abruptly closed client connection.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 test/nstool.c | 36 ++++++++++++++++++++++++++----------
 1 file changed, 26 insertions(+), 10 deletions(-)

diff --git a/test/nstool.c b/test/nstool.c
index 3f75edd..7ab5d2a 100644
--- a/test/nstool.c
+++ b/test/nstool.c
@@ -31,10 +31,15 @@
 
 #define	ARRAY_SIZE(a)	((int)(sizeof(a) / sizeof((a)[0])))
 
-#define die(...)				\
-	do {					\
-		fprintf(stderr, __VA_ARGS__);	\
-		exit(1);			\
+#define die(...)						\
+	do {							\
+		fprintf(stderr, "nstool: " __VA_ARGS__);	\
+		exit(1);					\
+	} while (0)
+
+#define err(...)						\
+	do {							\
+		fprintf(stderr, "nstool: " __VA_ARGS__);	\
 	} while (0)
 
 struct ns_type {
@@ -156,6 +161,9 @@ static int connect_ctl(const char *sockpath, bool wait,
 
 static void cmd_hold(int argc, char *argv[])
 {
+	struct sigaction sa = {
+		.sa_handler = SIG_IGN,
+	};
 	int fd = socket(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, PF_UNIX);
 	struct sockaddr_un addr;
 	const char *sockpath = argv[1];
@@ -185,6 +193,10 @@ static void cmd_hold(int argc, char *argv[])
 	if (!getcwd(info.cwd, sizeof(info.cwd)))
 		die("getcwd(): %s\n", strerror(errno));
 
+	rc = sigaction(SIGPIPE, &sa, NULL);
+	if (rc)
+		die("sigaction(SIGPIPE): %s\n", strerror(errno));
+
 	do {
 		int afd = accept(fd, NULL, NULL);
 		char buf;
@@ -193,17 +205,21 @@ static void cmd_hold(int argc, char *argv[])
 			die("accept(): %s\n", strerror(errno));
 
 		rc = write(afd, &info, sizeof(info));
-		if (rc < 0)
-			die("write(): %s\n", strerror(errno));
+		if (rc < 0) {
+			err("holder write() to control socket: %s\n",
+			    strerror(errno));
+		}
 		if ((size_t)rc < sizeof(info))
-			die("short write() on control socket\n");
+			err("holder short write() on control socket\n");
 
 		rc = read(afd, &buf, sizeof(buf));
-		if (rc < 0)
-			die("read(): %s\n", strerror(errno));
+		if (rc < 0) {
+			err("holder read() on control socket: %s\n",
+			    strerror(errno));
+		}
 
 		close(afd);
-	} while (rc == 0);
+	} while (rc <= 0);
 
 	unlink(sockpath);
 }

From 910f4f91030141b7e2e65644dc9fe678cc57f640 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Wed, 6 Nov 2024 12:44:14 +1100
Subject: [PATCH 098/382] test: Don't require 64-bit prefixes in perf tests

When determining the namespace's IPv6 address in the perf test setup, we
explicitly filter for addresses with a 64-bit prefix length.  There's no
real reason we need that - as long as it's a global address we can use it.
I suspect this was copied without thinking from a similar example in the
NDP tests, where the 64-bit prefix length _is_ meaningful (though it's not
entirely clear if the handling is correct there either).

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 test/perf/pasta_tcp | 2 +-
 test/perf/pasta_udp | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/test/perf/pasta_tcp b/test/perf/pasta_tcp
index d1ccf7d..88284b2 100644
--- a/test/perf/pasta_tcp
+++ b/test/perf/pasta_tcp
@@ -211,7 +211,7 @@ tr	TCP throughput over IPv6: host to ns
 iperf3s	ns 10002
 
 nsout	IFNAME ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname'
-nsout	ADDR6 ip -j -6 addr show|jq -rM '.[] | select(.ifname == "__IFNAME__").addr_info[] | select(.scope == "global" and .prefixlen == 64).local'
+nsout	ADDR6 ip -j -6 addr show|jq -rM '.[] | select(.ifname == "__IFNAME__").addr_info[] | select(.scope == "global").local'
 bw	-
 bw	-
 bw	-
diff --git a/test/perf/pasta_udp b/test/perf/pasta_udp
index 544bf17..3d07091 100644
--- a/test/perf/pasta_udp
+++ b/test/perf/pasta_udp
@@ -196,7 +196,7 @@ tr	UDP throughput over IPv6: host to ns
 iperf3s	ns 10002
 
 nsout	IFNAME ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname'
-nsout	ADDR6 ip -j -6 addr show|jq -rM '.[] | select(.ifname == "__IFNAME__").addr_info[] | select(.scope == "global" and .prefixlen == 64).local'
+nsout	ADDR6 ip -j -6 addr show|jq -rM '.[] | select(.ifname == "__IFNAME__").addr_info[] | select(.scope == "global").local'
 iperf3	BW host __ADDR6__ 10002 __TIME__ __OPTS__ -b 8G -l 1472
 bw	__BW__ 0.3 0.5
 iperf3	BW host __ADDR6__ 10002 __TIME__ __OPTS__ -b 12G -l 3972

From 9a0e544f05bf93609921f988b22f0680e143b4ad Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Wed, 6 Nov 2024 12:44:15 +1100
Subject: [PATCH 099/382] test: Improve test for NDP assigned prefix

In the NDP tests we search explicitly for a guest address with prefix
length 64.  AFAICT this is an attempt to specifically find the SLAAC
assigned address, rather than something assigned by other means.  We can do
that more explicitly by checking for .protocol == "kernel_ra", however.

The SLAAC prefixes we assign *will* always be 64-bit: that's hard-coded
into our NDP implementation.  RFC 4862 doesn't really allow anything else,
since the interface identifiers for an Ethernet-like link are 64 bits.

Let's actually verify that, rather than just assuming it, by extracting the
prefix length assigned in the guest and checking it as well.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 test/passt/ndp | 4 ++--
 test/pasta/ndp | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/test/passt/ndp b/test/passt/ndp
index f54b8ce..56b385b 100644
--- a/test/passt/ndp
+++ b/test/passt/ndp
@@ -23,8 +23,8 @@ hout	HOST_IFNAME6 ip -j -6 route show|jq -rM '[.[] | select(.dst == "default").d
 check	[ -n "__IFNAME__" ]
 
 test	SLAAC: prefix
-gout	ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__IFNAME__").addr_info[] | select(.scope == "global" and .prefixlen == 64).local] | .[0]'
-gout	PREFIX6 sipcalc __ADDR6__/64 | grep prefix | cut -d' ' -f4
+gout	ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__IFNAME__").addr_info[] | select(.scope == "global" and .protocol == "kernel_ra") | .local + "/" + (.prefixlen | tostring)] | .[0]'
+gout	PREFIX6 sipcalc __ADDR6__ | grep prefix | cut -d' ' -f4
 hout	HOST_ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__HOST_IFNAME6__").addr_info[] | select(.scope == "global" and .deprecated != true).local] | .[0]'
 hout	HOST_PREFIX6 sipcalc __HOST_ADDR6__/64 | grep prefix | cut -d' ' -f4
 check	[ "__PREFIX6__" = "__HOST_PREFIX6__" ]
diff --git a/test/pasta/ndp b/test/pasta/ndp
index c59627f..2442ab5 100644
--- a/test/pasta/ndp
+++ b/test/pasta/ndp
@@ -22,8 +22,8 @@ ns	ip link set dev __IFNAME__ up
 ns	while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done
 
 test	SLAAC: prefix
-nsout	ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__IFNAME__").addr_info[] | select(.scope == "global" and .prefixlen == 64).local] | .[0]'
-nsout	PREFIX6 sipcalc __ADDR6__/64 | grep prefix | cut -d' ' -f4
+nsout	ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__IFNAME__").addr_info[] | select(.scope == "global" and .protocol == "kernel_ra") | .local + "/" + (.prefixlen | tostring)] | .[0]'
+nsout	PREFIX6 sipcalc __ADDR6__ | grep prefix | cut -d' ' -f4
 hout	HOST_ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__IFNAME__").addr_info[] | select(.scope == "global" and .deprecated != true).local] | .[0]'
 hout	HOST_PREFIX6 sipcalc __HOST_ADDR6__/64 | grep prefix | cut -d' ' -f4
 check	[ "__PREFIX6__" = "__HOST_PREFIX6__" ]

From 78da088f7babfa0431c6fb2704ef0709fe057770 Mon Sep 17 00:00:00 2001
From: Jon Maloy <jmaloy@redhat.com>
Date: Tue, 5 Nov 2024 20:07:44 -0500
Subject: [PATCH 100/382] tcp: unify payload and flags l2 frames array

In order to reduce static memory and code footprint, we merge
the array for l2 flag frames into the one for payload frames.

This change also ensures that no flag message will be sent out
over the l2 media, bypassing already queued payload messages.

Performance measurements with iperf3, where we force all
traffic via the tap queue, show no significant difference:

Dual traffic both directions simultaneously, with patch:
========================================================
host->ns:
--------
[ ID] Interval           Transfer     Bitrate         Retr
[  5]   0.00-100.00 sec  36.3 GBytes  3.12 Gbits/sec  4759       sender
[  5]   0.00-100.04 sec  36.3 GBytes  3.11 Gbits/sec             receiver

ns->host:
---------
[ ID] Interval           Transfer     Bitrate
[  5]   0.00-100.00 sec   321 GBytes  27.6 Gbits/sec            receiver

Dual traffic both directions simultaneously, without patch:
============================================================
host->ns:
--------
[ ID] Interval           Transfer     Bitrate         Retr
[  5]   0.00-100.00 sec  35.0 GBytes  3.01 Gbits/sec  6001       sender
[  5]   0.00-100.04 sec  34.8 GBytes  2.99 Gbits/sec            receiver

ns->host
--------
[ ID] Interval           Transfer     Bitrate
[  5]   0.00-100.00 sec   345 GBytes  29.6 Gbits/sec            receiver

Single connection, with patch:
==============================
host->ns:
---------
[ ID] Interval           Transfer     Bitrate         Retr
[  5]   0.00-100.00 sec   138 GBytes  11.8 Gbits/sec  922       sender
[  5]   0.00-100.04 sec   138 GBytes  11.8 Gbits/sec            receiver

ns->host:
-----------
[ ID] Interval           Transfer     Bitrate
[  5]   0.00-100.00 sec   430 GBytes  36.9 Gbits/sec            receiver

Single connection, without patch:
=================================
host->ns:
------------
[ ID] Interval           Transfer     Bitrate         Retr
[  5]   0.00-100.00 sec   139 GBytes  11.9 Gbits/sec  900       sender
[  5]   0.00-100.04 sec   139 GBytes  11.9 Gbits/sec            receiver

ns->host:
---------
[ ID] Interval           Transfer     Bitrate
[  5]   0.00-100.00 sec   440 GBytes  37.8 Gbits/sec            receiver

Signed-off-by: Jon Maloy <jmaloy@redhat.com>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 tcp.c          |  1 -
 tcp_buf.c      | 70 ++++++++++++--------------------------------------
 tcp_buf.h      |  1 -
 tcp_internal.h | 15 -----------
 4 files changed, 17 insertions(+), 70 deletions(-)

diff --git a/tcp.c b/tcp.c
index 1bb122b..a3d48fa 100644
--- a/tcp.c
+++ b/tcp.c
@@ -937,7 +937,6 @@ bool tcp_flow_defer(const struct tcp_tap_conn *conn)
 /* cppcheck-suppress [constParameterPointer, unmatchedSuppression] */
 void tcp_defer_handler(struct ctx *c)
 {
-	tcp_flags_flush(c);
 	tcp_payload_flush(c);
 }
 
diff --git a/tcp_buf.c b/tcp_buf.c
index 274e313..d29c1a9 100644
--- a/tcp_buf.c
+++ b/tcp_buf.c
@@ -20,7 +20,7 @@
 
 #include <netinet/ip.h>
 
-#include <linux/tcp.h>
+#include <netinet/tcp.h>
 
 #include "util.h"
 #include "ip.h"
@@ -59,22 +59,10 @@ static_assert(MSS6 <= sizeof(tcp_payload[0].data), "MSS6 is greater than 65516")
 static struct tcp_tap_conn *tcp_frame_conns[TCP_FRAMES_MEM];
 static unsigned int tcp_payload_used;
 
-static struct tap_hdr		tcp_flags_tap_hdr[TCP_FRAMES_MEM];
-/* IPv4 headers for TCP segment without payload */
-static struct iphdr		tcp4_flags_ip[TCP_FRAMES_MEM];
-/* TCP segments without payload for IPv4 frames */
-static struct tcp_flags_t	tcp_flags[TCP_FRAMES_MEM];
-
-static unsigned int tcp_flags_used;
-
-/* IPv6 headers for TCP segment without payload */
-static struct ipv6hdr		tcp6_flags_ip[TCP_FRAMES_MEM];
-
 /* recvmsg()/sendmsg() data for tap */
 static struct iovec	iov_sock		[TCP_FRAMES_MEM + 1];
 
 static struct iovec	tcp_l2_iov[TCP_FRAMES_MEM][TCP_NUM_IOVS];
-static struct iovec	tcp_l2_flags_iov[TCP_FRAMES_MEM][TCP_NUM_IOVS];
 
 /**
  * tcp_update_l2_buf() - Update Ethernet header buffers with addresses
@@ -103,15 +91,6 @@ void tcp_sock_iov_init(const struct ctx *c)
 	for (i = 0; i < ARRAY_SIZE(tcp_payload); i++) {
 		tcp6_payload_ip[i] = ip6;
 		tcp4_payload_ip[i] = iph;
-		tcp_payload[i].th.doff = sizeof(struct tcphdr) / 4;
-		tcp_payload[i].th.ack = 1;
-	}
-
-	for (i = 0; i < ARRAY_SIZE(tcp_flags); i++) {
-		tcp6_flags_ip[i] = ip6;
-		tcp4_flags_ip[i] = iph;
-		tcp_flags[i].th.doff = sizeof(struct tcphdr) / 4;
-		tcp_flags[i].th.ack = 1;
 	}
 
 	for (i = 0; i < TCP_FRAMES_MEM; i++) {
@@ -121,25 +100,6 @@ void tcp_sock_iov_init(const struct ctx *c)
 		iov[TCP_IOV_ETH].iov_len = sizeof(struct ethhdr);
 		iov[TCP_IOV_PAYLOAD].iov_base = &tcp_payload[i];
 	}
-
-	for (i = 0; i < TCP_FRAMES_MEM; i++) {
-		struct iovec *iov = tcp_l2_flags_iov[i];
-
-		iov[TCP_IOV_TAP] = tap_hdr_iov(c, &tcp_flags_tap_hdr[i]);
-		iov[TCP_IOV_ETH].iov_len = sizeof(struct ethhdr);
-		iov[TCP_IOV_PAYLOAD].iov_base = &tcp_flags[i];
-	}
-}
-
-/**
- * tcp_flags_flush() - Send out buffers for segments with no data (flags)
- * @c:		Execution context
- */
-void tcp_flags_flush(const struct ctx *c)
-{
-	tap_send_frames(c, &tcp_l2_flags_iov[0][0], TCP_NUM_IOVS,
-			tcp_flags_used);
-	tcp_flags_used = 0;
 }
 
 /**
@@ -171,7 +131,7 @@ static void tcp_revert_seq(const struct ctx *c, struct tcp_tap_conn **conns,
 }
 
 /**
- * tcp_payload_flush() - Send out buffers for segments with data
+ * tcp_payload_flush() - Send out buffers for segments with data or flags
  * @c:		Execution context
  */
 void tcp_payload_flush(const struct ctx *c)
@@ -197,37 +157,35 @@ void tcp_payload_flush(const struct ctx *c)
  */
 int tcp_buf_send_flag(const struct ctx *c, struct tcp_tap_conn *conn, int flags)
 {
-	struct tcp_flags_t *payload;
+	struct tcp_payload_t *payload;
 	struct iovec *iov;
 	size_t optlen;
 	size_t l4len;
 	uint32_t seq;
 	int ret;
 
-	iov = tcp_l2_flags_iov[tcp_flags_used];
+	iov = tcp_l2_iov[tcp_payload_used];
 	if (CONN_V4(conn)) {
-		iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp4_flags_ip[tcp_flags_used]);
+		iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp4_payload_ip[tcp_payload_used]);
 		iov[TCP_IOV_ETH].iov_base = &tcp4_eth_src;
 	} else {
-		iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp6_flags_ip[tcp_flags_used]);
+		iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp6_payload_ip[tcp_payload_used]);
 		iov[TCP_IOV_ETH].iov_base = &tcp6_eth_src;
 	}
 
 	payload = iov[TCP_IOV_PAYLOAD].iov_base;
 	seq = conn->seq_to_tap;
 	ret = tcp_prepare_flags(c, conn, flags, &payload->th,
-				&payload->opts, &optlen);
+				(struct tcp_syn_opts *)&payload->data, &optlen);
 	if (ret <= 0)
 		return ret;
 
-	tcp_flags_used++;
+	tcp_payload_used++;
 	l4len = tcp_l2_buf_fill_headers(conn, iov, optlen, NULL, seq, false);
 	iov[TCP_IOV_PAYLOAD].iov_len = l4len;
-
 	if (flags & DUP_ACK) {
-		struct iovec *dup_iov;
+		struct iovec *dup_iov = tcp_l2_iov[tcp_payload_used++];
 
-		dup_iov = tcp_l2_flags_iov[tcp_flags_used++];
 		memcpy(dup_iov[TCP_IOV_TAP].iov_base, iov[TCP_IOV_TAP].iov_base,
 		       iov[TCP_IOV_TAP].iov_len);
 		dup_iov[TCP_IOV_ETH].iov_base = iov[TCP_IOV_ETH].iov_base;
@@ -237,8 +195,8 @@ int tcp_buf_send_flag(const struct ctx *c, struct tcp_tap_conn *conn, int flags)
 		dup_iov[TCP_IOV_PAYLOAD].iov_len = l4len;
 	}
 
-	if (tcp_flags_used > TCP_FRAMES_MEM - 2)
-		tcp_flags_flush(c);
+	if (tcp_payload_used > TCP_FRAMES_MEM - 2)
+		tcp_payload_flush(c);
 
 	return 0;
 }
@@ -254,6 +212,7 @@ int tcp_buf_send_flag(const struct ctx *c, struct tcp_tap_conn *conn, int flags)
 static void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn,
 			    ssize_t dlen, int no_csum, uint32_t seq)
 {
+	struct tcp_payload_t *payload;
 	const uint16_t *check = NULL;
 	struct iovec *iov;
 	size_t l4len;
@@ -274,6 +233,11 @@ static void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn,
 		iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp6_payload_ip[tcp_payload_used]);
 		iov[TCP_IOV_ETH].iov_base = &tcp6_eth_src;
 	}
+	payload = iov[TCP_IOV_PAYLOAD].iov_base;
+	payload->th.th_off = sizeof(struct tcphdr) / 4;
+	payload->th.th_x2 = 0;
+	payload->th.th_flags = 0;
+	payload->th.ack = 1;
 	l4len = tcp_l2_buf_fill_headers(conn, iov, dlen, check, seq, false);
 	iov[TCP_IOV_PAYLOAD].iov_len = l4len;
 	if (++tcp_payload_used > TCP_FRAMES_MEM - 1)
diff --git a/tcp_buf.h b/tcp_buf.h
index 49c04d4..54f5e53 100644
--- a/tcp_buf.h
+++ b/tcp_buf.h
@@ -7,7 +7,6 @@
 #define TCP_BUF_H
 
 void tcp_sock_iov_init(const struct ctx *c);
-void tcp_flags_flush(const struct ctx *c);
 void tcp_payload_flush(const struct ctx *c);
 int tcp_buf_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn);
 int tcp_buf_send_flag(const struct ctx *c, struct tcp_tap_conn *conn, int flags);
diff --git a/tcp_internal.h b/tcp_internal.h
index a5a47df..c846f60 100644
--- a/tcp_internal.h
+++ b/tcp_internal.h
@@ -134,21 +134,6 @@ struct tcp_syn_opts {
 		.ws = TCP_OPT_WS(ws_),			\
 	})
 
-/**
- * struct tcp_flags_t - TCP header and data to send zero-length
- *                      segments (flags)
- * @th:		TCP header
- * @opts	TCP options
- */
-struct tcp_flags_t {
-	struct tcphdr th;
-	struct tcp_syn_opts opts;
-#ifdef __AVX2__
-} __attribute__ ((packed, aligned(32)));
-#else
-} __attribute__ ((packed, aligned(__alignof__(unsigned int))));
-#endif
-
 extern char tcp_buf_discard [MAX_WINDOW];
 
 void conn_flag_do(const struct ctx *c, struct tcp_tap_conn *conn,

From 5f5e814cfc27c14cd7f116c8fb59e17d5671cafe Mon Sep 17 00:00:00 2001
From: Stefano Brivio <sbrivio@redhat.com>
Date: Thu, 7 Nov 2024 17:47:08 +0100
Subject: [PATCH 101/382] dhcpv6: Use for loop instead of goto to avoid false
 positive cppcheck warning

cppcheck 2.16.0 reports:

dhcpv6.c:334:14: style: The comparison 'ia_type == 3' is always true. [knownConditionTrueFalse]
 if (ia_type == OPT_IA_NA) {
             ^
dhcpv6.c:306:12: note: 'ia_type' is assigned value '3' here.
 ia_type = OPT_IA_NA;
           ^
dhcpv6.c:334:14: note: The comparison 'ia_type == 3' is always true.
 if (ia_type == OPT_IA_NA) {
             ^

This is not really the case, as we set ia_type to OPT_IA_TA and then
jump back.

Anyway, there's no particular reason to use a goto here: add a trivial
foreach() macro to go through elements of an array and use it instead.
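
For illustration, a small standalone sketch of how the new foreach()
helper is used (the two macros are copied from the patch, the rest is
made up and not dhcpv6.c code):

#include <stdio.h>

#define ARRAY_SIZE(a)	((int)(sizeof(a) / sizeof((a)[0])))
#define foreach(item, array)						\
	for ((item) = (array); (item) - (array) < ARRAY_SIZE(array); (item)++)

int main(void)
{
	/* 3 and 4 are the standard DHCPv6 codes for IA_NA and IA_TA */
	int ia_types[2] = { 3, 4 }, *ia_type;

	foreach(ia_type, ia_types)	/* visits 3, then 4, no goto needed */
		printf("option type %d\n", *ia_type);

	return 0;
}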

Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
---
 dhcpv6.c | 51 +++++++++++++++++++++++----------------------------
 util.h   |  3 +++
 2 files changed, 26 insertions(+), 28 deletions(-)

diff --git a/dhcpv6.c b/dhcpv6.c
index 14a5c7e..f2e7307 100644
--- a/dhcpv6.c
+++ b/dhcpv6.c
@@ -296,47 +296,42 @@ static struct opt_hdr *dhcpv6_opt(const struct pool *p, size_t *offset,
 static struct opt_hdr *dhcpv6_ia_notonlink(const struct pool *p,
 					   struct in6_addr *la)
 {
+	int ia_types[2] = { OPT_IA_NA, OPT_IA_TA }, *ia_type;
+	const struct opt_ia_addr *opt_addr;
 	char buf[INET6_ADDRSTRLEN];
 	struct in6_addr req_addr;
 	const struct opt_hdr *h;
 	struct opt_hdr *ia;
 	size_t offset;
-	int ia_type;
 
-	ia_type = OPT_IA_NA;
-ia_ta:
-	offset = 0;
-	while ((ia = dhcpv6_opt(p, &offset, ia_type))) {
-		if (ntohs(ia->l) < OPT_VSIZE(ia_na))
-			return NULL;
-
-		offset += sizeof(struct opt_ia_na);
-
-		while ((h = dhcpv6_opt(p, &offset, OPT_IAAADR))) {
-			const struct opt_ia_addr *opt_addr;
-
-			if (ntohs(h->l) != OPT_VSIZE(ia_addr))
+	foreach(ia_type, ia_types) {
+		offset = 0;
+		while ((ia = dhcpv6_opt(p, &offset, *ia_type))) {
+			if (ntohs(ia->l) < OPT_VSIZE(ia_na))
 				return NULL;
 
-			opt_addr = (const struct opt_ia_addr *)h;
-			req_addr = opt_addr->addr;
-			if (!IN6_ARE_ADDR_EQUAL(la, &req_addr)) {
-				info("DHCPv6: requested address %s not on link",
-				     inet_ntop(AF_INET6, &req_addr,
-					       buf, sizeof(buf)));
-				return ia;
-			}
+			offset += sizeof(struct opt_ia_na);
 
-			offset += sizeof(struct opt_ia_addr);
+			while ((h = dhcpv6_opt(p, &offset, OPT_IAAADR))) {
+				if (ntohs(h->l) != OPT_VSIZE(ia_addr))
+					return NULL;
+
+				opt_addr = (const struct opt_ia_addr *)h;
+				req_addr = opt_addr->addr;
+				if (!IN6_ARE_ADDR_EQUAL(la, &req_addr))
+					goto err;
+
+				offset += sizeof(struct opt_ia_addr);
+			}
 		}
 	}
 
-	if (ia_type == OPT_IA_NA) {
-		ia_type = OPT_IA_TA;
-		goto ia_ta;
-	}
-
 	return NULL;
+
+err:
+	info("DHCPv6: requested address %s not on link",
+	     inet_ntop(AF_INET6, &req_addr, buf, sizeof(buf)));
+	return ia;
 }
 
 /**
diff --git a/util.h b/util.h
index 0bf396a..582ef57 100644
--- a/util.h
+++ b/util.h
@@ -102,6 +102,9 @@
 
 #define ARRAY_SIZE(a)		((int)(sizeof(a) / sizeof((a)[0])))
 
+#define foreach(item, array)						\
+	for ((item) = (array); (item) - (array) < ARRAY_SIZE(array); (item)++)
+
 #define IN_INTERVAL(a, b, x)	((x) >= (a) && (x) <= (b))
 #define FD_PROTO(x, proto)						\
 	(IN_INTERVAL(c->proto.fd_min, c->proto.fd_max, (x)))

From 1feb90fe627959e4903e01ba83249fa33c4d472d Mon Sep 17 00:00:00 2001
From: Stefano Brivio <sbrivio@redhat.com>
Date: Thu, 7 Nov 2024 18:08:46 +0100
Subject: [PATCH 102/382] dhcpv6: Turn some option headers pointers to const

cppcheck 2.14.2 on Alpine reports:

dhcpv6.c:431:32: style: Variable 'client_id' can be declared as pointer to const [constVariablePointer]
 struct opt_hdr *ia, *bad_ia, *client_id;
                               ^

It's not only 'client_id': we can declare 'ia' as const pointer too.

Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
---
 dhcpv6.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/dhcpv6.c b/dhcpv6.c
index f2e7307..0523bba 100644
--- a/dhcpv6.c
+++ b/dhcpv6.c
@@ -423,11 +423,11 @@ search:
 int dhcpv6(struct ctx *c, const struct pool *p,
 	   const struct in6_addr *saddr, const struct in6_addr *daddr)
 {
-	struct opt_hdr *ia, *bad_ia, *client_id;
-	const struct opt_hdr *server_id;
+	const struct opt_hdr *client_id, *server_id, *ia;
 	const struct in6_addr *src;
 	const struct msg_hdr *mh;
 	const struct udphdr *uh;
+	struct opt_hdr *bad_ia;
 	size_t mlen, n;
 
 	uh = packet_get(p, 0, 0, sizeof(*uh), &mlen);

From 87940f9aa72a342988e89b9509c2572e494d91a6 Mon Sep 17 00:00:00 2001
From: Stefano Brivio <sbrivio@redhat.com>
Date: Thu, 7 Nov 2024 18:58:49 +0100
Subject: [PATCH 103/382] tap: Cast TAP_BUF_BYTES - ETH_MAX_MTU to ssize_t, not
 TAP_BUF_BYTES

Given that we're comparing against 'n', which is signed, we cast
TAP_BUF_BYTES to ssize_t so that the maximum buffer usage, calculated
as the difference between TAP_BUF_BYTES and ETH_MAX_MTU, will also be
signed.

This doesn't necessarily happen on 32-bit architectures, though. On
armhf and i686, clang-tidy 18.1.8 and 19.1.2 report:

/home/pi/passt/tap.c:1087:16: error: comparison of integers of different signs: 'ssize_t' (aka 'int') and 'unsigned int' [clang-diagnostic-sign-compare,-warnings-as-errors]
 1087 |         for (n = 0; n <= (ssize_t)TAP_BUF_BYTES - ETH_MAX_MTU; n += len) {
      |                     ~ ^  ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Cast the whole difference to ssize_t, as we know it's going to be
positive anyway, instead of relying on that side effect.
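
For illustration, a standalone sketch of why the placement of the cast
matters (macro names and values are made up; on 32-bit targets ssize_t
is plain int, so the same conversion rules apply to the real
expression):

#include <stdio.h>

#define BUF_BYTES	(128u * 1024)	/* assumed unsigned, as on the affected targets */
#define MAX_MTU		65535u

int main(void)
{
	int n = -1;

	/* int - unsigned int yields unsigned int, so n is converted too
	 * and -1 becomes a huge value: this comparison is false
	 */
	if (n <= (int)BUF_BYTES - MAX_MTU)
		printf("compared as unsigned\n");

	/* Casting the whole (positive) difference keeps it signed:
	 * -1 <= 65537 is true, as expected
	 */
	if (n <= (int)(BUF_BYTES - MAX_MTU))
		printf("compared as signed\n");

	return 0;
}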

Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
---
 tap.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tap.c b/tap.c
index f638f2c..a3ba958 100644
--- a/tap.c
+++ b/tap.c
@@ -1084,7 +1084,7 @@ static void tap_pasta_input(struct ctx *c, const struct timespec *now)
 
 	tap_flush_pools();
 
-	for (n = 0; n <= (ssize_t)TAP_BUF_BYTES - ETH_MAX_MTU; n += len) {
+	for (n = 0; n <= (ssize_t)(TAP_BUF_BYTES - ETH_MAX_MTU); n += len) {
 		len = read(c->fd_tap, pkt_buf + n, ETH_MAX_MTU);
 
 		if (len == 0) {

From d4f09c9b96c68a1c6b1387cd5674cd331a939f27 Mon Sep 17 00:00:00 2001
From: Stefano Brivio <sbrivio@redhat.com>
Date: Thu, 7 Nov 2024 19:04:44 +0100
Subject: [PATCH 104/382] util: Define small and big thresholds for socket
 buffers as unsigned long long

On 32-bit architectures, clang-tidy reports:

/home/pi/passt/tcp.c:728:11: error: performing an implicit widening conversion to type 'uint64_t' (aka 'unsigned long long') of a multiplication performed in type 'unsigned long' [bugprone-implicit-widening-of-multiplication-result,-warnings-as-errors]
  728 |         if (v >= SNDBUF_BIG)
      |                  ^
/home/pi/passt/util.h:158:22: note: expanded from macro 'SNDBUF_BIG'
  158 | #define SNDBUF_BIG              (4UL * 1024 * 1024)
      |                                  ^
/home/pi/passt/tcp.c:728:11: note: make conversion explicit to silence this warning
  728 |         if (v >= SNDBUF_BIG)
      |                  ^
/home/pi/passt/util.h:158:22: note: expanded from macro 'SNDBUF_BIG'
  158 | #define SNDBUF_BIG              (4UL * 1024 * 1024)
      |                                  ^~~~~~~~~~~~~~~~~
/home/pi/passt/tcp.c:728:11: note: perform multiplication in a wider type
  728 |         if (v >= SNDBUF_BIG)
      |                  ^
/home/pi/passt/util.h:158:22: note: expanded from macro 'SNDBUF_BIG'
  158 | #define SNDBUF_BIG              (4UL * 1024 * 1024)
      |                                  ^~~~~~~~~~
/home/pi/passt/tcp.c:730:15: error: performing an implicit widening conversion to type 'uint64_t' (aka 'unsigned long long') of a multiplication performed in type 'unsigned long' [bugprone-implicit-widening-of-multiplication-result,-warnings-as-errors]
  730 |         else if (v > SNDBUF_SMALL)
      |                      ^
/home/pi/passt/util.h:159:24: note: expanded from macro 'SNDBUF_SMALL'
  159 | #define SNDBUF_SMALL            (128UL * 1024)
      |                                  ^
/home/pi/passt/tcp.c:730:15: note: make conversion explicit to silence this warning
  730 |         else if (v > SNDBUF_SMALL)
      |                      ^
/home/pi/passt/util.h:159:24: note: expanded from macro 'SNDBUF_SMALL'
  159 | #define SNDBUF_SMALL            (128UL * 1024)
      |                                  ^~~~~~~~~~~~
/home/pi/passt/tcp.c:730:15: note: perform multiplication in a wider type
  730 |         else if (v > SNDBUF_SMALL)
      |                      ^
/home/pi/passt/util.h:159:24: note: expanded from macro 'SNDBUF_SMALL'
  159 | #define SNDBUF_SMALL            (128UL * 1024)
      |                                  ^~~~~
/home/pi/passt/tcp.c:731:17: error: performing an implicit widening conversion to type 'uint64_t' (aka 'unsigned long long') of a multiplication performed in type 'unsigned long' [bugprone-implicit-widening-of-multiplication-result,-warnings-as-errors]
  731 |                 v -= v * (v - SNDBUF_SMALL) / (SNDBUF_BIG - SNDBUF_SMALL) / 2;
      |                               ^
/home/pi/passt/util.h:159:24: note: expanded from macro 'SNDBUF_SMALL'
  159 | #define SNDBUF_SMALL            (128UL * 1024)
      |                                  ^
/home/pi/passt/tcp.c:731:17: note: make conversion explicit to silence this warning
  731 |                 v -= v * (v - SNDBUF_SMALL) / (SNDBUF_BIG - SNDBUF_SMALL) / 2;
      |                               ^
/home/pi/passt/util.h:159:24: note: expanded from macro 'SNDBUF_SMALL'
  159 | #define SNDBUF_SMALL            (128UL * 1024)
      |                                  ^~~~~~~~~~~~
/home/pi/passt/tcp.c:731:17: note: perform multiplication in a wider type
  731 |                 v -= v * (v - SNDBUF_SMALL) / (SNDBUF_BIG - SNDBUF_SMALL) / 2;
      |                               ^
/home/pi/passt/util.h:159:24: note: expanded from macro 'SNDBUF_SMALL'
  159 | #define SNDBUF_SMALL            (128UL * 1024)
      |                                  ^~~~~

This happens because, wherever we use those thresholds, we define the other term
of comparison as uint64_t. Define the thresholds as unsigned long long
as well, to make sure we match types.
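
For illustration, a standalone sketch of the pattern clang-tidy objects
to (macro names are made up; the values are small enough that nothing
overflows, so the change is purely about keeping operand types matched):

#include <stdint.h>
#include <stdio.h>

#define BIG_UL	(4UL * 1024 * 1024)	/* multiplied in unsigned long: 32 bits on ILP32 */
#define BIG_ULL	(4ULL * 1024 * 1024)	/* multiplied directly in 64 bits */

int main(void)
{
	uint64_t v = 8ULL * 1024 * 1024;

	/* Same result either way, but only the second comparison avoids
	 * widening a 32-bit product to uint64_t after the fact
	 */
	printf("%d %d\n", v >= BIG_UL, v >= BIG_ULL);

	return 0;
}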

Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
---
 util.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/util.h b/util.h
index 582ef57..963f57b 100644
--- a/util.h
+++ b/util.h
@@ -158,9 +158,9 @@ int do_clone(int (*fn)(void *), char *stack_area, size_t stack_size, int flags,
 			 (void *)(arg));				\
 	} while (0)
 
-#define RCVBUF_BIG		(2UL * 1024 * 1024)
-#define SNDBUF_BIG		(4UL * 1024 * 1024)
-#define SNDBUF_SMALL		(128UL * 1024)
+#define RCVBUF_BIG		(2ULL * 1024 * 1024)
+#define SNDBUF_BIG		(4ULL * 1024 * 1024)
+#define SNDBUF_SMALL		(128ULL * 1024)
 
 #include <net/if.h>
 #include <limits.h>

From 71869e2912b9ede9532725e9ee5e7752b7137009 Mon Sep 17 00:00:00 2001
From: Stefano Brivio <sbrivio@redhat.com>
Date: Thu, 7 Nov 2024 19:28:21 +0100
Subject: [PATCH 105/382] passt: Use NOLINT clang-tidy block instead of
 NOLINTNEXTLINE

For some reason, this is only reported by clang-tidy 19.1.2 on
Alpine:

/home/sbrivio/passt/passt.c:314:53: error: conditional operator with identical true and false expressions [bugprone-branch-clone,-warnings-as-errors]
  314 |         nfds = epoll_wait(c.epollfd, events, EPOLL_EVENTS, TIMER_INTERVAL);
      |                                                            ^

We do have a suppression, but not on the line preceding it, because
we also need a cppcheck suppression there. Use NOLINTBEGIN/NOLINTEND
for the clang-tidy suppression.

Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
---
 passt.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/passt.c b/passt.c
index eaf231d..fac6101 100644
--- a/passt.c
+++ b/passt.c
@@ -309,9 +309,10 @@ int main(int argc, char **argv)
 	timer_init(&c, &now);
 
 loop:
-	/* NOLINTNEXTLINE(bugprone-branch-clone): intervals can be the same */
+	/* NOLINTBEGIN(bugprone-branch-clone): intervals can be the same */
 	/* cppcheck-suppress [duplicateValueTernary, unmatchedSuppression] */
 	nfds = epoll_wait(c.epollfd, events, EPOLL_EVENTS, TIMER_INTERVAL);
+	/* NOLINTEND(bugprone-branch-clone) */
 	if (nfds == -1 && errno != EINTR)
 		die_perror("epoll_wait() failed in main loop");
 

From 58fa5508bde073a39c93a8f1296e363f1786c84c Mon Sep 17 00:00:00 2001
From: Stefano Brivio <sbrivio@redhat.com>
Date: Thu, 7 Nov 2024 19:40:37 +0100
Subject: [PATCH 106/382] tap, tcp, util: Add some missing SOCK_CLOEXEC flags

I have no idea why, but these are reported by clang-tidy (19.2.1) on
Alpine (x86) only:

/home/sbrivio/passt/tap.c:1139:38: error: 'socket' should use SOCK_CLOEXEC where possible [android-cloexec-socket,-warnings-as-errors]
 1139 |         int fd = socket(AF_UNIX, SOCK_STREAM, 0);
      |                                             ^
      |                                              | SOCK_CLOEXEC
/home/sbrivio/passt/tap.c:1158:51: error: 'socket' should use SOCK_CLOEXEC where possible [android-cloexec-socket,-warnings-as-errors]
 1158 |                 ex = socket(AF_UNIX, SOCK_STREAM | SOCK_NONBLOCK, 0);
      |                                                                 ^
      |                                                                  | SOCK_CLOEXEC
/home/sbrivio/passt/tcp.c:1413:44: error: 'socket' should use SOCK_CLOEXEC where possible [android-cloexec-socket,-warnings-as-errors]
 1413 |         s = socket(af, SOCK_STREAM | SOCK_NONBLOCK, IPPROTO_TCP);
      |                                                   ^
      |                                                    | SOCK_CLOEXEC
/home/sbrivio/passt/util.c:188:38: error: 'socket' should use SOCK_CLOEXEC where possible [android-cloexec-socket,-warnings-as-errors]
  188 |         if ((s = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP)) < 0) {
      |                                             ^
      |                                              | SOCK_CLOEXEC
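
For illustration, a minimal standalone example of what the checker asks
for (not passt code): SOCK_CLOEXEC sets the close-on-exec flag
atomically at creation time, so the descriptor can't leak into a child
that exec()s before a separate fcntl(F_SETFD) call could run.

#define _GNU_SOURCE
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
	int s = socket(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0);

	if (s < 0)
		return 1;

	close(s);
	return 0;
}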

Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
---
 tap.c  | 5 +++--
 tcp.c  | 2 +-
 util.c | 3 ++-
 3 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/tap.c b/tap.c
index a3ba958..14d9b3d 100644
--- a/tap.c
+++ b/tap.c
@@ -1136,7 +1136,7 @@ void tap_handler_pasta(struct ctx *c, uint32_t events,
  */
 int tap_sock_unix_open(char *sock_path)
 {
-	int fd = socket(AF_UNIX, SOCK_STREAM, 0);
+	int fd = socket(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0);
 	struct sockaddr_un addr = {
 		.sun_family = AF_UNIX,
 	};
@@ -1155,7 +1155,8 @@ int tap_sock_unix_open(char *sock_path)
 					UNIX_SOCK_PATH, i))
 			die_perror("Can't build UNIX domain socket path");
 
-		ex = socket(AF_UNIX, SOCK_STREAM | SOCK_NONBLOCK, 0);
+		ex = socket(AF_UNIX, SOCK_STREAM | SOCK_NONBLOCK | SOCK_CLOEXEC,
+			    0);
 		if (ex < 0)
 			die_perror("Failed to check for UNIX domain conflicts");
 
diff --git a/tcp.c b/tcp.c
index a3d48fa..6a98dfa 100644
--- a/tcp.c
+++ b/tcp.c
@@ -1410,7 +1410,7 @@ static int tcp_conn_new_sock(const struct ctx *c, sa_family_t af)
 {
 	int s;
 
-	s = socket(af, SOCK_STREAM | SOCK_NONBLOCK, IPPROTO_TCP);
+	s = socket(af, SOCK_STREAM | SOCK_NONBLOCK | SOCK_CLOEXEC, IPPROTO_TCP);
 
 	if (s > FD_REF_MAX) {
 		close(s);
diff --git a/util.c b/util.c
index dddef93..3448f30 100644
--- a/util.c
+++ b/util.c
@@ -183,7 +183,8 @@ void sock_probe_mem(struct ctx *c)
 	int v = INT_MAX / 2, s;
 	socklen_t sl;
 
-	if ((s = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP)) < 0) {
+	s = socket(AF_INET, SOCK_STREAM | SOCK_CLOEXEC, IPPROTO_TCP);
+	if (s < 0) {
 		c->low_wmem = c->low_rmem = 1;
 		return;
 	}

From b84cd05098275a7625223141d019f8af5a17323b Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Fri, 8 Nov 2024 13:53:27 +1100
Subject: [PATCH 107/382] log: Only check for FALLOC_FL_COLLAPSE_RANGE
 availability at runtime

log.c has several #ifdefs on FALLOC_FL_COLLAPSE_RANGE that won't attempt
to use it if not defined.  But even if the value is defined at compile
time, it might not be available in the runtime kernel, so we need to check
for errors from a fallocate() call and fall back to other methods.

Simplify this to only need the runtime check by using linux_dep.h to define
FALLOC_FL_COLLAPSE_RANGE if it's not in the kernel headers.
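
For illustration, a rough standalone sketch of the resulting pattern
(file path, cut size and the fallback are placeholders, this is not the
log.c code): define the flag if the headers are too old, try
fallocate() at runtime, and fall back if the kernel or filesystem
rejects it.

#define _GNU_SOURCE
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <linux/falloc.h>

#ifndef FALLOC_FL_COLLAPSE_RANGE	/* old kernel headers */
#define FALLOC_FL_COLLAPSE_RANGE	0x08
#endif

static void rotate(int fd, off_t cut_size)
{
	if (!fallocate(fd, FALLOC_FL_COLLAPSE_RANGE, 0, cut_size))
		return;		/* kernel and filesystem support it */

	/* EOPNOTSUPP, EINVAL, ...: fall back to moving data by hand */
	fprintf(stderr, "fallocate: %s, using fallback\n", strerror(errno));
}

int main(void)
{
	int fd = open("/tmp/rotate-demo.log", O_RDWR | O_CREAT, 0600);

	if (fd < 0)
		return 1;

	rotate(fd, 4096);
	close(fd);
	unlink("/tmp/rotate-demo.log");
	return 0;
}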

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 Makefile    |  5 -----
 linux_dep.h |  6 ++++++
 log.c       | 10 ++--------
 3 files changed, 8 insertions(+), 13 deletions(-)

diff --git a/Makefile b/Makefile
index 3c82f50..0ba85b4 100644
--- a/Makefile
+++ b/Makefile
@@ -59,11 +59,6 @@ ifeq ($(shell :|$(CC) -fstack-protector-strong -S -xc - -o - >/dev/null 2>&1; ec
 	FLAGS += -fstack-protector-strong
 endif
 
-C := \#define _GNU_SOURCE\n\#include <fcntl.h>\nint x = FALLOC_FL_COLLAPSE_RANGE;
-ifeq ($(shell printf "$(C)" | $(CC) -S -xc - -o - >/dev/null 2>&1; echo $$?),0)
-	EXTRA_SYSCALLS += fallocate
-endif
-
 prefix		?= /usr/local
 exec_prefix	?= $(prefix)
 bindir		?= $(exec_prefix)/bin
diff --git a/linux_dep.h b/linux_dep.h
index 8921623..eae9c3c 100644
--- a/linux_dep.h
+++ b/linux_dep.h
@@ -119,4 +119,10 @@ struct tcp_info_linux {
 						 */
 };
 
+#include <linux/falloc.h>
+
+#ifndef FALLOC_FL_COLLAPSE_RANGE
+#define FALLOC_FL_COLLAPSE_RANGE	0x08
+#endif
+
 #endif /* LINUX_DEP_H */
diff --git a/log.c b/log.c
index 19f1d98..239c8ce 100644
--- a/log.c
+++ b/log.c
@@ -26,6 +26,7 @@
 #include <stdarg.h>
 #include <sys/socket.h>
 
+#include "linux_dep.h"
 #include "log.h"
 #include "util.h"
 #include "passt.h"
@@ -92,7 +93,6 @@ const char *logfile_prefix[] = {
 	"         ",		/* LOG_DEBUG */
 };
 
-#ifdef FALLOC_FL_COLLAPSE_RANGE
 /**
  * logfile_rotate_fallocate() - Write header, set log_written after fallocate()
  * @fd:		Log file descriptor
@@ -126,7 +126,6 @@ static void logfile_rotate_fallocate(int fd, const struct timespec *now)
 
 	log_written -= log_cut_size;
 }
-#endif /* FALLOC_FL_COLLAPSE_RANGE */
 
 /**
  * logfile_rotate_move() - Fallback: move recent entries toward start, then cut
@@ -198,21 +197,17 @@ out:
  *
  * Return: 0 on success, negative error code on failure
  *
- * #syscalls fcntl
- *
- * fallocate() passed as EXTRA_SYSCALL only if FALLOC_FL_COLLAPSE_RANGE is there
+ * #syscalls fcntl fallocate
  */
 static int logfile_rotate(int fd, const struct timespec *now)
 {
 	if (fcntl(fd, F_SETFL, O_RDWR /* Drop O_APPEND: explicit lseek() */))
 		return -errno;
 
-#ifdef FALLOC_FL_COLLAPSE_RANGE
 	/* Only for Linux >= 3.15, extent-based ext4 or XFS, glibc >= 2.18 */
 	if (!fallocate(fd, FALLOC_FL_COLLAPSE_RANGE, 0, log_cut_size))
 		logfile_rotate_fallocate(fd, now);
 	else
-#endif
 		logfile_rotate_move(fd, now);
 
 	if (fcntl(fd, F_SETFL, O_RDWR | O_APPEND))
@@ -432,4 +427,3 @@ void logfile_init(const char *name, const char *path, size_t size)
 	/* For FALLOC_FL_COLLAPSE_RANGE: VFS block size can be up to one page */
 	log_cut_size = ROUND_UP(log_size * LOGFILE_CUT_RATIO / 100, PAGE_SIZE);
 }
-

From d64f25724399fbb4ba9d36eda7e17984a4c6c91c Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Fri, 8 Nov 2024 13:53:28 +1100
Subject: [PATCH 108/382] linux_dep: Move close_range() conditional handling to
 linux_dep.h

util.h has some #ifdefs and weak definitions to handle compatibility with
various kernel versions.  Move this to linux_dep.h which handles several
other similar cases.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 linux_dep.h | 20 ++++++++++++++++++++
 util.c      |  1 +
 util.h      | 19 -------------------
 3 files changed, 21 insertions(+), 19 deletions(-)

diff --git a/linux_dep.h b/linux_dep.h
index eae9c3c..3a41e42 100644
--- a/linux_dep.h
+++ b/linux_dep.h
@@ -125,4 +125,24 @@ struct tcp_info_linux {
 #define FALLOC_FL_COLLAPSE_RANGE	0x08
 #endif
 
+#include <linux/close_range.h>
+
+#ifdef CLOSE_RANGE_UNSHARE	/* Linux kernel >= 5.9 */
+/* glibc < 2.34 and musl as of 1.2.5 need these */
+#ifndef SYS_close_range
+#define SYS_close_range		436
+#endif
+__attribute__ ((weak))
+/* cppcheck-suppress funcArgNamesDifferent */
+int close_range(unsigned int first, unsigned int last, int flags) {
+	return syscall(SYS_close_range, first, last, flags);
+}
+#else
+/* No reasonable fallback option */
+/* cppcheck-suppress funcArgNamesDifferent */
+int close_range(unsigned int first, unsigned int last, int flags) {
+	return 0;
+}
+#endif
+
 #endif /* LINUX_DEP_H */
diff --git a/util.c b/util.c
index 3448f30..913f34b 100644
--- a/util.c
+++ b/util.c
@@ -28,6 +28,7 @@
 #include <linux/errqueue.h>
 #include <getopt.h>
 
+#include "linux_dep.h"
 #include "util.h"
 #include "iov.h"
 #include "passt.h"
diff --git a/util.h b/util.h
index 963f57b..3616515 100644
--- a/util.h
+++ b/util.h
@@ -17,7 +17,6 @@
 #include <arpa/inet.h>
 #include <unistd.h>
 #include <sys/syscall.h>
-#include <linux/close_range.h>
 
 #include "log.h"
 
@@ -171,24 +170,6 @@ int do_clone(int (*fn)(void *), char *stack_area, size_t stack_size, int flags,
 
 struct ctx;
 
-#ifdef CLOSE_RANGE_UNSHARE	/* Linux kernel >= 5.9 */
-/* glibc < 2.34 and musl as of 1.2.5 need these */
-#ifndef SYS_close_range
-#define SYS_close_range		436
-#endif
-__attribute__ ((weak))
-/* cppcheck-suppress funcArgNamesDifferent */
-int close_range(unsigned int first, unsigned int last, int flags) {
-	return syscall(SYS_close_range, first, last, flags);
-}
-#else
-/* No reasonable fallback option */
-/* cppcheck-suppress funcArgNamesDifferent */
-int close_range(unsigned int first, unsigned int last, int flags) {
-	return 0;
-}
-#endif
-
 int sock_l4_sa(const struct ctx *c, enum epoll_type type,
 	       const void *sa, socklen_t sl,
 	       const char *ifname, bool v6only, uint32_t data);

From 14dd70e2b33941f1f7663969574278873c9e3d35 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Fri, 8 Nov 2024 13:53:29 +1100
Subject: [PATCH 109/382] linux_dep: Fix CLOSE_RANGE_UNSHARE availability
 handling

If CLOSE_RANGE_UNSHARE isn't defined, we define a fallback version of
close_range() which is a (successful) no-op.  This is broken in several
ways:
 * It doesn't actually fix the compile if using old kernel headers, because
   the caller of close_range() still directly uses CLOSE_RANGE_UNSHARE
   unprotected by ifdefs
 * Even if it did fix the compile, it means inconsistent behaviour between
   a compile time failure to find the value (we silently don't close files)
   and a runtime failure (we die with an error from close_range())
 * Silently not closing the files we intend to close for security reasons
   is probably not a good idea in any case

We don't want to simply error if close_range() or CLOSE_RANGE_UNSHARE isn't
available, because that would require running on kernel >= 5.9.  On the
other hand there's not really any other way to flush all possible fds
leaked by the parent (close() in a loop takes over a minute).  So in this
case print a warning and carry on.

As a bonus, this fixes a cppcheck error I see with some different options I'm
looking to apply in future.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 linux_dep.h | 12 ++++--------
 util.c      | 16 ++++++++++++++--
 2 files changed, 18 insertions(+), 10 deletions(-)

diff --git a/linux_dep.h b/linux_dep.h
index 3a41e42..240f50a 100644
--- a/linux_dep.h
+++ b/linux_dep.h
@@ -127,22 +127,18 @@ struct tcp_info_linux {
 
 #include <linux/close_range.h>
 
-#ifdef CLOSE_RANGE_UNSHARE	/* Linux kernel >= 5.9 */
 /* glibc < 2.34 and musl as of 1.2.5 need these */
 #ifndef SYS_close_range
 #define SYS_close_range		436
 #endif
+#ifndef CLOSE_RANGE_UNSHARE	/* Linux kernel < 5.9 */
+#define CLOSE_RANGE_UNSHARE	(1U << 1)
+#endif
+
 __attribute__ ((weak))
 /* cppcheck-suppress funcArgNamesDifferent */
 int close_range(unsigned int first, unsigned int last, int flags) {
 	return syscall(SYS_close_range, first, last, flags);
 }
-#else
-/* No reasonable fallback option */
-/* cppcheck-suppress funcArgNamesDifferent */
-int close_range(unsigned int first, unsigned int last, int flags) {
-	return 0;
-}
-#endif
 
 #endif /* LINUX_DEP_H */
diff --git a/util.c b/util.c
index 913f34b..126dedb 100644
--- a/util.c
+++ b/util.c
@@ -738,8 +738,20 @@ void close_open_files(int argc, char **argv)
 			rc = close_range(fd + 1, ~0U, CLOSE_RANGE_UNSHARE);
 	}
 
-	if (rc)
-		die_perror("Failed to close files leaked by parent");
+	if (rc) {
+		if (errno == ENOSYS || errno == EINVAL) {
+			/* This probably means close_range() or the
+			 * CLOSE_RANGE_UNSHARE flag is not supported by the
+			 * kernel.  Not much we can do here except carry on and
+			 * hope for the best.
+			 */
+			warn(
+"Can't use close_range() to ensure no files leaked by parent");
+		} else {
+			die_perror("Failed to close files leaked by parent");
+		}
+	}
+
 }
 
 /**

From 0588163b1f981a3ef87a9a3fe155dc2f0e116e18 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Fri, 8 Nov 2024 13:53:30 +1100
Subject: [PATCH 110/382] cppcheck: Don't check the system headers

We pass -I options to cppcheck so that it will find the system headers.
Then we need to pass a bunch more options to suppress the zillions of
cppcheck errors found in those headers.

It turns out, however, that it's not recommended to give the system headers
to cppcheck anyway.  Instead it has built-in knowledge of the ANSI libc and
uses that as the basis of its checks.  We do need to suppress
missingIncludeSystem warnings instead though.

Not bothering with the system headers makes the cppcheck runtime go from
~37s to ~14s on my machine, which is a pretty nice win.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 Makefile | 10 +---------
 1 file changed, 1 insertion(+), 9 deletions(-)

diff --git a/Makefile b/Makefile
index 0ba85b4..258d298 100644
--- a/Makefile
+++ b/Makefile
@@ -163,11 +163,6 @@ clang-tidy: $(PASST_SRCS) $(HEADERS)
 	clang-tidy $(PASST_SRCS) -- $(filter-out -pie,$(FLAGS) $(CFLAGS) $(CPPFLAGS)) \
 	           -DCLANG_TIDY_58992
 
-SYSTEM_INCLUDES := /usr/include $(wildcard /usr/include/$(TARGET))
-ifeq ($(shell $(CC) -v 2>&1 | grep -c "gcc version"),1)
-VER := $(shell $(CC) -dumpversion)
-SYSTEM_INCLUDES += /usr/lib/gcc/$(TARGET)/$(VER)/include
-endif
 cppcheck: $(PASST_SRCS) $(HEADERS)
 	if cppcheck --check-level=exhaustive /dev/null > /dev/null 2>&1; then \
 		CPPCHECK_EXHAUSTIVE="--check-level=exhaustive";		\
@@ -177,11 +172,8 @@ cppcheck: $(PASST_SRCS) $(HEADERS)
 	cppcheck --std=c11 --error-exitcode=1 --enable=all --force	\
 	--inconclusive --library=posix --quiet				\
 	$${CPPCHECK_EXHAUSTIVE}						\
-	$(SYSTEM_INCLUDES:%=-I%)					\
-	$(SYSTEM_INCLUDES:%=--config-exclude=%)				\
-	$(SYSTEM_INCLUDES:%=--suppress=*:%/*)				\
-	$(SYSTEM_INCLUDES:%=--suppress=unmatchedSuppression:%/*)	\
 	--inline-suppr							\
+	--suppress=missingIncludeSystem \
 	--suppress=unusedStructMember					\
 	$(filter -D%,$(FLAGS) $(CFLAGS) $(CPPFLAGS)) -D CPPCHECK_6936  \
 	$(PASST_SRCS) $(HEADERS)

From 71f228d04b5c68b1cf42d95e4e5bbb82af0a0e60 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Thu, 14 Nov 2024 14:33:03 +1100
Subject: [PATCH 111/382] ndp: Remove redundant update to addr_seen

ndp() updates addr_seen or addr_ll_seen based on the source address of the
received packet.  This is redundant since tap6_handler() has already
updated addr_seen for any type of packet, not just NDP.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 ndp.c | 9 ++-------
 ndp.h | 4 ++--
 2 files changed, 4 insertions(+), 9 deletions(-)

diff --git a/ndp.c b/ndp.c
index faae408..ab80898 100644
--- a/ndp.c
+++ b/ndp.c
@@ -179,8 +179,8 @@ struct ndp_ns {
  *
  * Return: 0 if not handled here, 1 if handled, -1 on failure
  */
-int ndp(struct ctx *c, const struct icmp6hdr *ih, const struct in6_addr *saddr,
-	const struct pool *p)
+int ndp(const struct ctx *c, const struct icmp6hdr *ih,
+	const struct in6_addr *saddr, const struct pool *p)
 {
 	struct ndp_na na = {
 		.ih = {
@@ -336,11 +336,6 @@ dns_done:
 		return 1;
 	}
 
-	if (IN6_IS_ADDR_LINKLOCAL(saddr))
-		c->ip6.addr_ll_seen = *saddr;
-	else
-		c->ip6.addr_seen = *saddr;
-
 	rsaddr = &c->ip6.our_tap_ll;
 
 	if (ih->icmp6_type == NS) {
diff --git a/ndp.h b/ndp.h
index a786441..abe6d02 100644
--- a/ndp.h
+++ b/ndp.h
@@ -6,7 +6,7 @@
 #ifndef NDP_H
 #define NDP_H
 
-int ndp(struct ctx *c, const struct icmp6hdr *ih, const struct in6_addr *saddr,
-	const struct pool *p);
+int ndp(const struct ctx *c, const struct icmp6hdr *ih,
+	const struct in6_addr *saddr, const struct pool *p);
 
 #endif /* NDP_H */

From 4e471670351a76b902e5376da4ee909f68485da2 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Thu, 14 Nov 2024 14:33:04 +1100
Subject: [PATCH 112/382] ndp: Add ndp_send() helper

ndp() has a conditional on message type generating the reply message, then
a tiny amount of common code, then another conditional to send the reply
with slightly different parameters.  We can make this a bit neater by
making a helper function for sending the reply, and call it from each of
the different message type paths.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 ndp.c | 32 ++++++++++++++++++--------------
 1 file changed, 18 insertions(+), 14 deletions(-)

diff --git a/ndp.c b/ndp.c
index ab80898..fa1b67a 100644
--- a/ndp.c
+++ b/ndp.c
@@ -170,6 +170,21 @@ struct ndp_ns {
 	struct in6_addr target_addr;
 } __attribute__((packed));
 
+/**
+ * ndp_send() - Send an NDP message
+ * @c:		Execution context
+ * @dst:	IPv6 address to send the message to
+ * @buf:	ICMPv6 header + message payload
+ * @l4len:	Length of message, including ICMPv6 header
+ */
+static void ndp_send(const struct ctx *c, const struct in6_addr *dst,
+		     const void *buf, size_t l4len)
+{
+	const struct in6_addr *src = &c->ip6.our_tap_ll;
+
+	tap_icmp6_send(c, src, dst, buf, l4len);
+}
+
 /**
  * ndp() - Check for NDP solicitations, reply as needed
  * @c:		Execution context
@@ -223,9 +238,6 @@ int ndp(const struct ctx *c, const struct icmp6hdr *ih,
 			},
 		},
 	};
-	const struct in6_addr *rsaddr; /* src addr for reply */
-	unsigned char *ptr = NULL;
-	size_t dlen;
 
 	if (ih->icmp6_type < RS || ih->icmp6_type > NA)
 		return 0;
@@ -249,7 +261,9 @@ int ndp(const struct ctx *c, const struct icmp6hdr *ih,
 		       sizeof(na.target_addr));
 		memcpy(na.target_l2_addr.mac, c->our_tap_mac, ETH_ALEN);
 
+		ndp_send(c, saddr, &na, sizeof(struct ndp_na));
 	} else if (ih->icmp6_type == RS) {
+		unsigned char *ptr = NULL;
 		size_t dns_s_len = 0;
 		int i, n;
 
@@ -332,18 +346,8 @@ int ndp(const struct ctx *c, const struct icmp6hdr *ih,
 
 dns_done:
 		memcpy(&ra.source_ll.mac, c->our_tap_mac, ETH_ALEN);
-	} else {
-		return 1;
-	}
 
-	rsaddr = &c->ip6.our_tap_ll;
-
-	if (ih->icmp6_type == NS) {
-		dlen = sizeof(struct ndp_na);
-		tap_icmp6_send(c, rsaddr, saddr, &na, dlen);
-	} else if (ih->icmp6_type == RS) {
-		dlen = ptr - (unsigned char *)&ra;
-		tap_icmp6_send(c, rsaddr, saddr, &ra, dlen);
+		ndp_send(c, saddr, &ra, ptr - (unsigned char *)&ra);
 	}
 
 	return 1;

From cbc83e14df5ebbc656de8ec0e5c26a1a6efadf0e Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Thu, 14 Nov 2024 14:33:05 +1100
Subject: [PATCH 113/382] ndp: Split out helpers for sending specific NDP
 message types

Currently the large ndp() function responds to all NDP messages we handle,
both parsing the message as necessary and sending the response.  Split out
the code to construct and send specific message types into ndp_na() (to
send NA messages) and ndp_ra() (to send RA messages).

As well as breaking up an excessively large function, this is a first step
to being able to send unsolicited NDP messages.

While we're there, remove a slightly ugly goto.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 ndp.c | 132 +++++++++++++++++++++++++++++++++-------------------------
 1 file changed, 76 insertions(+), 56 deletions(-)

diff --git a/ndp.c b/ndp.c
index fa1b67a..8f52471 100644
--- a/ndp.c
+++ b/ndp.c
@@ -186,16 +186,13 @@ static void ndp_send(const struct ctx *c, const struct in6_addr *dst,
 }
 
 /**
- * ndp() - Check for NDP solicitations, reply as needed
+ * ndp_na() - Send an NDP Neighbour Advertisement (NA) message
  * @c:		Execution context
- * @ih:		ICMPv6 header
- * @saddr:	Source IPv6 address
- * @p:		Packet pool
- *
- * Return: 0 if not handled here, 1 if handled, -1 on failure
+ * @dst:	IPv6 address to send the NA to
+ * @addr:	IPv6 address to advertise
  */
-int ndp(const struct ctx *c, const struct icmp6hdr *ih,
-	const struct in6_addr *saddr, const struct pool *p)
+static void ndp_na(const struct ctx *c, const struct in6_addr *dst,
+		   const void *addr)
 {
 	struct ndp_na na = {
 		.ih = {
@@ -212,6 +209,20 @@ int ndp(const struct ctx *c, const struct icmp6hdr *ih,
 			},
 		}
 	};
+
+	memcpy(&na.target_addr, addr, sizeof(na.target_addr));
+	memcpy(na.target_l2_addr.mac, c->our_tap_mac, ETH_ALEN);
+
+	ndp_send(c, dst, &na, sizeof(na));
+}
+
+/**
+ * ndp_ra() - Send an NDP Router Advertisement (RA) message
+ * @c:		Execution context
+ * @dst:	IPv6 address to send the RA to
+ */
+static void ndp_ra(const struct ctx *c, const struct in6_addr *dst)
+{
 	struct ndp_ra ra = {
 		.ih = {
 			.icmp6_type		= RA,
@@ -238,58 +249,28 @@ int ndp(const struct ctx *c, const struct icmp6hdr *ih,
 			},
 		},
 	};
+	unsigned char *ptr = NULL;
 
-	if (ih->icmp6_type < RS || ih->icmp6_type > NA)
-		return 0;
+	memcpy(&ra.prefix, &c->ip6.addr, sizeof(ra.prefix));
 
-	if (c->no_ndp)
-		return 1;
+	ptr = &ra.var[0];
 
-	if (ih->icmp6_type == NS) {
-		const struct ndp_ns *ns =
-			packet_get(p, 0, 0, sizeof(struct ndp_ns), NULL);
+	if (c->mtu != -1) {
+		struct opt_mtu *mtu = (struct opt_mtu *)ptr;
+		*mtu = (struct opt_mtu) {
+			.header = {
+				.type		= OPT_MTU,
+				.len		= 1,
+			},
+			.value			= htonl(c->mtu),
+		};
+		ptr += sizeof(struct opt_mtu);
+	}
 
-		if (!ns)
-			return -1;
-
-		if (IN6_IS_ADDR_UNSPECIFIED(saddr))
-			return 1;
-
-		info("NDP: received NS, sending NA");
-
-		memcpy(&na.target_addr, &ns->target_addr,
-		       sizeof(na.target_addr));
-		memcpy(na.target_l2_addr.mac, c->our_tap_mac, ETH_ALEN);
-
-		ndp_send(c, saddr, &na, sizeof(struct ndp_na));
-	} else if (ih->icmp6_type == RS) {
-		unsigned char *ptr = NULL;
+	if (!c->no_dhcp_dns) {
 		size_t dns_s_len = 0;
 		int i, n;
 
-		if (c->no_ra)
-			return 1;
-
-		info("NDP: received RS, sending RA");
-		memcpy(&ra.prefix, &c->ip6.addr, sizeof(ra.prefix));
-
-		ptr = &ra.var[0];
-
-		if (c->mtu != -1) {
-			struct opt_mtu *mtu = (struct opt_mtu *)ptr;
-			*mtu = (struct opt_mtu) {
-				.header = {
-					.type		= OPT_MTU,
-					.len		= 1,
-				},
-				.value			= htonl(c->mtu),
-			};
-			ptr += sizeof(struct opt_mtu);
-		}
-
-		if (c->no_dhcp_dns)
-			goto dns_done;
-
 		for (n = 0; !IN6_IS_ADDR_UNSPECIFIED(&c->ip6.dns[n]); n++);
 		if (n) {
 			struct opt_rdnss *rdnss = (struct opt_rdnss *)ptr;
@@ -343,11 +324,50 @@ int ndp(const struct ctx *c, const struct icmp6hdr *ih,
 			memset(ptr, 0, 8 - dns_s_len % 8);	/* padding */
 			ptr += 8 - dns_s_len % 8;
 		}
+	}
 
-dns_done:
-		memcpy(&ra.source_ll.mac, c->our_tap_mac, ETH_ALEN);
+	memcpy(&ra.source_ll.mac, c->our_tap_mac, ETH_ALEN);
 
-		ndp_send(c, saddr, &ra, ptr - (unsigned char *)&ra);
+	ndp_send(c, dst, &ra, ptr - (unsigned char *)&ra);
+}
+
+/**
+ * ndp() - Check for NDP solicitations, reply as needed
+ * @c:		Execution context
+ * @ih:		ICMPv6 header
+ * @saddr:	Source IPv6 address
+ * @p:		Packet pool
+ *
+ * Return: 0 if not handled here, 1 if handled, -1 on failure
+ */
+int ndp(const struct ctx *c, const struct icmp6hdr *ih,
+	const struct in6_addr *saddr, const struct pool *p)
+{
+	if (ih->icmp6_type < RS || ih->icmp6_type > NA)
+		return 0;
+
+	if (c->no_ndp)
+		return 1;
+
+	if (ih->icmp6_type == NS) {
+		const struct ndp_ns *ns;
+
+		ns = packet_get(p, 0, 0, sizeof(struct ndp_ns), NULL);
+		if (!ns)
+			return -1;
+
+		if (IN6_IS_ADDR_UNSPECIFIED(saddr))
+			return 1;
+
+		info("NDP: received NS, sending NA");
+
+		ndp_na(c, saddr, &ns->target_addr);
+	} else if (ih->icmp6_type == RS) {
+		if (c->no_ra)
+			return 1;
+
+		info("NDP: received RS, sending RA");
+		ndp_ra(c, saddr);
 	}
 
 	return 1;

From 36c070e6e320b97bb4761e29c934f5f269e06b35 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Thu, 14 Nov 2024 14:33:06 +1100
Subject: [PATCH 114/382] ndp: Use struct assignment in preference to memcpy()
 for IPv6 addresses

There are a number of places where we can simply assign IPv6 addresses,
rather than copying them with the current mildly ugly memcpy().
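
For illustration, a minimal standalone example of the two styles (not
ndp.c code):

#define _GNU_SOURCE
#include <netinet/in.h>
#include <string.h>

int main(void)
{
	struct in6_addr a = IN6ADDR_LOOPBACK_INIT, b, c;

	memcpy(&b, &a, sizeof(b));	/* works, but obscures a plain copy */
	c = a;				/* struct assignment: same effect, clearer */

	return memcmp(&b, &c, sizeof(b)) != 0;
}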

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 ndp.c | 14 ++++++--------
 1 file changed, 6 insertions(+), 8 deletions(-)

diff --git a/ndp.c b/ndp.c
index 8f52471..fd512ae 100644
--- a/ndp.c
+++ b/ndp.c
@@ -158,7 +158,7 @@ struct ndp_ra {
 
 	unsigned char var[sizeof(struct opt_mtu) + sizeof(struct opt_rdnss) +
 			  sizeof(struct opt_dnssl)];
-} __attribute__((packed));
+} __attribute__((packed, aligned(__alignof__(struct in6_addr))));
 
 /**
  * struct ndp_ns - NDP Neighbor Solicitation (NS) message
@@ -168,7 +168,7 @@ struct ndp_ra {
 struct ndp_ns {
 	struct icmp6hdr ih;
 	struct in6_addr target_addr;
-} __attribute__((packed));
+} __attribute__((packed, aligned(__alignof__(struct in6_addr))));
 
 /**
  * ndp_send() - Send an NDP message
@@ -192,7 +192,7 @@ static void ndp_send(const struct ctx *c, const struct in6_addr *dst,
  * @addr:	IPv6 address to advertise
  */
 static void ndp_na(const struct ctx *c, const struct in6_addr *dst,
-		   const void *addr)
+	    const struct in6_addr *addr)
 {
 	struct ndp_na na = {
 		.ih = {
@@ -202,6 +202,7 @@ static void ndp_na(const struct ctx *c, const struct in6_addr *dst,
 			.icmp6_solicited	= 1,
 			.icmp6_override		= 1,
 		},
+		.target_addr = *addr,
 		.target_l2_addr = {
 			.header	= {
 				.type		= OPT_TARGET_L2_ADDR,
@@ -210,7 +211,6 @@ static void ndp_na(const struct ctx *c, const struct in6_addr *dst,
 		}
 	};
 
-	memcpy(&na.target_addr, addr, sizeof(na.target_addr));
 	memcpy(na.target_l2_addr.mac, c->our_tap_mac, ETH_ALEN);
 
 	ndp_send(c, dst, &na, sizeof(na));
@@ -242,6 +242,7 @@ static void ndp_ra(const struct ctx *c, const struct in6_addr *dst)
 			.valid_lifetime		= ~0U,
 			.pref_lifetime		= ~0U,
 		},
+		.prefix = c->ip6.addr,
 		.source_ll = {
 			.header = {
 				.type		= OPT_SRC_L2_ADDR,
@@ -251,8 +252,6 @@ static void ndp_ra(const struct ctx *c, const struct in6_addr *dst)
 	};
 	unsigned char *ptr = NULL;
 
-	memcpy(&ra.prefix, &c->ip6.addr, sizeof(ra.prefix));
-
 	ptr = &ra.var[0];
 
 	if (c->mtu != -1) {
@@ -282,8 +281,7 @@ static void ndp_ra(const struct ctx *c, const struct in6_addr *dst)
 				.lifetime		= ~0U,
 			};
 			for (i = 0; i < n; i++) {
-				memcpy(&rdnss->dns[i], &c->ip6.dns[i],
-				       sizeof(rdnss->dns[i]));
+				rdnss->dns[i] = c->ip6.dns[i];
 			}
 			ptr += offsetof(struct opt_rdnss, dns) +
 			       i * sizeof(rdnss->dns[0]);

From a60703e89991d23345ed929328001e19f5bc47e0 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Thu, 14 Nov 2024 14:33:07 +1100
Subject: [PATCH 115/382] ndp: Make route lifetime a #define

Currently we open-code the lifetime of the route we advertise via NDP to be
65535s (the maximum).  Change it to a #define.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 ndp.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/ndp.c b/ndp.c
index fd512ae..09df8d6 100644
--- a/ndp.c
+++ b/ndp.c
@@ -33,6 +33,8 @@
 #include "tap.h"
 #include "log.h"
 
+#define	RT_LIFETIME	65535
+
 #define RS	133
 #define RA	134
 #define NS	135
@@ -229,7 +231,7 @@ static void ndp_ra(const struct ctx *c, const struct in6_addr *dst)
 			.icmp6_code		= 0,
 			.icmp6_hop_limit	= 255,
 			/* RFC 8319 */
-			.icmp6_rt_lifetime	= htons_constant(65535),
+			.icmp6_rt_lifetime	= htons_constant(RT_LIFETIME),
 			.icmp6_addrconf_managed	= 1,
 		},
 		.prefix_info = {

From 71d5deed5eed3949ee09c5f0a53b4de0b09b4afc Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Thu, 14 Nov 2024 14:33:08 +1100
Subject: [PATCH 116/382] util: Add general low-level random bytes helper

Currently secret_init() open-codes getting good quality random bytes from
the OS, either via getrandom(2) or reading /dev/random.  We're going to
add at least one more place that needs random data in future, so make a
general helper for getting random bytes.  While we're there, fix a number
of minor bugs:
 - getrandom() can theoretically return a "short read", so handle that case
 - getrandom(), as well as read(), can return a transient EINTR
 - We would attempt to read data from /dev/random if we failed to open it
   (open() returns -1), but not if we opened it as fd 0 (unlikely, but ok)
 - More specific error reporting
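
As a minimal usage sketch of the new helper (the caller here is
hypothetical; the helper itself and its util.h declaration are in the
diff below):

  uint64_t token;

  /* Fills the buffer with random bytes, or dies trying */
  raw_random(&token, sizeof(token));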

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 passt.c | 30 +-----------------------------
 util.c  | 54 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 util.h  |  2 ++
 3 files changed, 57 insertions(+), 29 deletions(-)

diff --git a/passt.c b/passt.c
index fac6101..73649de 100644
--- a/passt.c
+++ b/passt.c
@@ -36,9 +36,6 @@
 #include <sys/prctl.h>
 #include <netinet/if_ether.h>
 #include <libgen.h>
-#ifdef HAS_GETRANDOM
-#include <sys/random.h>
-#endif
 
 #include "util.h"
 #include "passt.h"
@@ -118,32 +115,7 @@ static void post_handler(struct ctx *c, const struct timespec *now)
  */
 static void secret_init(struct ctx *c)
 {
-#ifndef HAS_GETRANDOM
-	int dev_random = open("/dev/random", O_RDONLY);
-	unsigned int random_read = 0;
-
-	while (dev_random && random_read < sizeof(c->hash_secret)) {
-		int ret = read(dev_random,
-			       (uint8_t *)&c->hash_secret + random_read,
-			       sizeof(c->hash_secret) - random_read);
-
-		if (ret == -1 && errno == EINTR)
-			continue;
-
-		if (ret <= 0)
-			break;
-
-		random_read += ret;
-	}
-	if (dev_random >= 0)
-		close(dev_random);
-
-	if (random_read < sizeof(c->hash_secret))
-#else
-	if (getrandom(&c->hash_secret, sizeof(c->hash_secret),
-		      GRND_RANDOM) < 0)
-#endif /* !HAS_GETRANDOM */
-		die_perror("Failed to get random bytes for hash table and TCP");
+	raw_random(&c->hash_secret, sizeof(c->hash_secret));
 }
 
 /**
diff --git a/util.c b/util.c
index 126dedb..55cae3f 100644
--- a/util.c
+++ b/util.c
@@ -34,6 +34,9 @@
 #include "passt.h"
 #include "packet.h"
 #include "log.h"
+#ifdef HAS_GETRANDOM
+#include <sys/random.h>
+#endif
 
 /**
  * sock_l4_sa() - Create and bind socket to socket address, add to epoll list
@@ -783,3 +786,54 @@ bool snprintf_check(char *str, size_t size, const char *format, ...)
 
 	return false;
 }
+
+#define DEV_RANDOM	"/dev/random"
+
+/**
+ * raw_random() - Get high quality random bytes
+ * @buf:	Buffer to fill with random bytes
+ * @buflen:	Number of bytes of random data to put in @buf
+ *
+ * Assumes that the random data is essential, and will die() if unable to obtain
+ * it.
+ */
+void raw_random(void *buf, size_t buflen)
+{
+	size_t random_read = 0;
+#ifndef HAS_GETRANDOM
+	int fd = open(DEV_RANDOM, O_RDONLY);
+
+	if (fd < 0)
+		die_perror("Couldn't open %s", DEV_RANDOM);
+#endif
+
+	while (random_read < buflen) {
+		ssize_t ret;
+
+#ifdef HAS_GETRANDOM
+		ret = getrandom((char *)buf + random_read,
+				buflen - random_read, GRND_RANDOM);
+#else
+		ret = read(fd, (char *)buf + random_read,
+			   buflen - random_read);
+#endif
+
+		if (ret == -1 && errno == EINTR)
+			continue;
+
+		if (ret < 0)
+			die_perror("Error on random data source");
+
+		if (ret == 0)
+			break;
+
+		random_read += ret;
+	}
+
+#ifndef HAS_GETRANDOM
+	close(fd);
+#endif
+
+	if (random_read < buflen)
+		die("Unexpected EOF on random data source");
+}
diff --git a/util.h b/util.h
index 3616515..90428c4 100644
--- a/util.h
+++ b/util.h
@@ -263,6 +263,8 @@ static inline bool mod_between(unsigned x, unsigned i, unsigned j, unsigned m)
 /* FPRINTF() intentionally silences cert-err33-c clang-tidy warnings */
 #define FPRINTF(f, ...)	(void)fprintf(f, __VA_ARGS__)
 
+void raw_random(void *buf, size_t buflen);
+
 /*
  * Workarounds for https://github.com/llvm/llvm-project/issues/58992
  *

From b39760cc7d89e69c7fb12eccc3df3bd15e2d5665 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Thu, 14 Nov 2024 14:33:09 +1100
Subject: [PATCH 117/382] passt: Seed libc's pseudo random number generator

We have an upcoming case where we need pseudo-random numbers to scatter
timings, but we don't need cryptographically strong random numbers.  libc's
built-in random() is fine for this purpose, but we should seed it.  Extend
secret_init() - the only current user of random numbers - to do this as
well as generating the SipHash secret.  Using /dev/random for a PRNG seed
is probably overkill, but it's simple and we only do it once, so we might
as well.
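
The kind of non-cryptographic use this enables, as a hypothetical
sketch (base_interval and jitter are made-up names; the actual user
arrives in a later patch):

  /* Scatter a periodic timer by up to 'jitter' random seconds */
  next_run = now->tv_sec + base_interval + random() % jitter;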

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 passt.c | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/passt.c b/passt.c
index 73649de..83b26c5 100644
--- a/passt.c
+++ b/passt.c
@@ -110,12 +110,19 @@ static void post_handler(struct ctx *c, const struct timespec *now)
 }
 
 /**
- * secret_init() - Create secret value for SipHash calculations
+ * random_init() - Initialise things based on random data
  * @c:		Execution context
  */
-static void secret_init(struct ctx *c)
+static void random_init(struct ctx *c)
 {
+	unsigned int seed;
+
+	/* Create secret value for SipHash calculations */
 	raw_random(&c->hash_secret, sizeof(c->hash_secret));
+
+	/* Seed pseudo-RNG for things that need non-cryptographic random */
+	raw_random(&seed, sizeof(seed));
+	srandom(seed);
 }
 
 /**
@@ -236,7 +243,7 @@ int main(int argc, char **argv)
 
 	tap_sock_init(&c);
 
-	secret_init(&c);
+	random_init(&c);
 
 	if (clock_gettime(CLOCK_MONOTONIC, &now))
 		die_perror("Failed to get CLOCK_MONOTONIC time");

From 6e1e44293ef991d8c946dd59fbbd65c54901b255 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Thu, 14 Nov 2024 14:33:10 +1100
Subject: [PATCH 118/382] ndp: Send unsolicited Router Advertisements

Currently, our NDP implementation only sends Router Advertisements (RA)
when it receives a Router Solicitation (RS) from the guest.  However,
RFC 4861 requires that we periodically send unsolicited RAs.

Linux as a guest also requires this: it will send an RS when a link first
comes up, but the route it gets from this will have a finite lifetime (we
set this to 65535s, the maximum allowed, around 18 hours).  When that
expires the guest will not send a new RS, but instead expects the route to
have been renewed (if still valid) by an unsolicited RA.

Implement sending unsolicited RAs on a partially randomised timer, as
required by RFC 4861.  The RFC also specifies that solicited RAs should
be delayed, or even omitted, if the next unsolicited RA is due soon
enough.  For now we don't do that, always sending an immediate RA in
response to an RS.  We can get away with this because in our use cases
we expect to just have passt itself and the guest on the link, rather than
a large broadcast domain.
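
As a quick check of the timer arithmetic below, using the defaults
added in this patch and RT_LIFETIME (65535) from an earlier one:

  max_rtr_adv_interval = MIN(600, 65535 - 1)  /* 600 */
  max_rtr_adv_interval = MAX(600, 3)          /* 600 */
  min_rtr_adv_interval = MAX(600 / 3, 3)      /* 200 */
  interval = 200 + random() % (600 - 200)     /* in [200s, 600s) */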

Link: https://github.com/kubevirt/kubevirt/issues/13191
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 ip.h    |  9 +++++++++
 ndp.c   | 54 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 ndp.h   |  3 +++
 passt.c |  3 +++
 4 files changed, 69 insertions(+)

diff --git a/ip.h b/ip.h
index b8d4a5b..0742612 100644
--- a/ip.h
+++ b/ip.h
@@ -92,4 +92,13 @@ struct ipv6_opt_hdr {
 
 char *ipv6_l4hdr(const struct pool *p, int idx, size_t offset, uint8_t *proto,
 		 size_t *dlen);
+
+/* IPv6 link-local all-nodes multicast adddress, ff02::1 */
+static const struct in6_addr in6addr_ll_all_nodes = {
+	.s6_addr = {
+		0xff, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+		0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01,
+	},
+};
+
 #endif /* IP_H */
diff --git a/ndp.c b/ndp.c
index 09df8d6..7ee44b2 100644
--- a/ndp.c
+++ b/ndp.c
@@ -372,3 +372,57 @@ int ndp(const struct ctx *c, const struct icmp6hdr *ih,
 
 	return 1;
 }
+
+/* Default interval between unsolicited RAs (seconds) */
+#define DEFAULT_MAX_RTR_ADV_INTERVAL	600	/* RFC 4861, 6.2.1 */
+
+/* Minimum required interval between RAs (seconds) */
+#define MIN_DELAY_BETWEEN_RAS		3	/* RFC 4861, 10 */
+
+static time_t next_ra;
+
+/**
+ * ndp_timer() - Send unsolicited NDP messages if necessary
+ * @c:		Execution context
+ * @now:	Current (monotonic) time
+ */
+void ndp_timer(const struct ctx *c, const struct timespec *now)
+{
+	time_t max_rtr_adv_interval = DEFAULT_MAX_RTR_ADV_INTERVAL;
+	time_t min_rtr_adv_interval, interval;
+
+	if (c->no_ra || now->tv_sec < next_ra)
+		return;
+
+	/* We must advertise before the route's lifetime expires */
+	max_rtr_adv_interval = MIN(max_rtr_adv_interval, RT_LIFETIME - 1);
+
+	/* But we must not go smaller than the minimum delay */
+	max_rtr_adv_interval = MAX(max_rtr_adv_interval, MIN_DELAY_BETWEEN_RAS);
+
+	/* RFC 4861, 6.2.1 */
+	min_rtr_adv_interval = MAX(max_rtr_adv_interval / 3,
+				   MIN_DELAY_BETWEEN_RAS);
+
+	/* As required by RFC 4861, we randomise the interval between
+	 * unsolicited RAs.  This is to prevent multiple routers on a link
+	 * getting synchronised (e.g. after booting a bunch of routers at once)
+	 * and causing flurries of RAs at the same time.
+	 *
+	 * This random doesn't need to be cryptographically strong, so random(3)
+	 * is fine.  Other routers on the link also want to avoid
+	 * synchronisation, and anything malicious has much easier ways to cause
+	 * trouble.
+	 *
+	 * The modulus also makes this not strictly a uniform distribution, but,
+	 * again, it's close enough for our purposes.
+	 */
+	interval = min_rtr_adv_interval +
+		random() % (max_rtr_adv_interval - min_rtr_adv_interval);
+
+	info("NDP: sending unsolicited RA, next in %llds", (long long)interval);
+
+	ndp_ra(c, &in6addr_ll_all_nodes);
+
+	next_ra = now->tv_sec + interval;
+}
diff --git a/ndp.h b/ndp.h
index abe6d02..41c2000 100644
--- a/ndp.h
+++ b/ndp.h
@@ -6,7 +6,10 @@
 #ifndef NDP_H
 #define NDP_H
 
+struct icmp6hdr;
+
 int ndp(const struct ctx *c, const struct icmp6hdr *ih,
 	const struct in6_addr *saddr, const struct pool *p);
+void ndp_timer(const struct ctx *c, const struct timespec *now);
 
 #endif /* NDP_H */
diff --git a/passt.c b/passt.c
index 83b26c5..a51a4e1 100644
--- a/passt.c
+++ b/passt.c
@@ -49,6 +49,7 @@
 #include "arch.h"
 #include "log.h"
 #include "tcp_splice.h"
+#include "ndp.h"
 
 #define EPOLL_EVENTS		8
 
@@ -107,6 +108,8 @@ static void post_handler(struct ctx *c, const struct timespec *now)
 
 	flow_defer_handler(c, now);
 #undef CALL_PROTO_HANDLER
+
+	ndp_timer(c, now);
 }
 
 /**

From 5e2446667729d01ef8208d0e7e866cee09c8a3fb Mon Sep 17 00:00:00 2001
From: Stefano Brivio <sbrivio@redhat.com>
Date: Thu, 14 Nov 2024 23:48:54 +0100
Subject: [PATCH 119/382] selinux: Use auth_read_passwd() interface for all our
 getpwnam() needs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

If passt or pasta are started as root, we need to read the passwd file
(be it /etc/passwd or whatever sssd provides) to find out UID and GID
of 'nobody' so that we can switch to it.

Instead of a bunch of allow rules for passwd_file_t and sssd macros,
use the more convenient auth_read_passwd() interface which should
cover our usage of getpwnam().

The existing rules weren't actually enough:

  # strace -e openat passt -f
  [...]
  Started as root, will change to nobody.
  openat(AT_FDCWD, "/etc/nsswitch.conf", O_RDONLY|O_CLOEXEC) = 4
  openat(AT_FDCWD, "/etc/ld.so.cache", O_RDONLY|O_CLOEXEC) = 4
  openat(AT_FDCWD, "/lib64/libnss_sss.so.2", O_RDONLY|O_CLOEXEC) = 4
  openat(AT_FDCWD, "/var/lib/sss/mc/passwd", O_RDONLY|O_CLOEXEC) = -1 EACCES (Permission denied)
  openat(AT_FDCWD, "/var/lib/sss/mc/passwd", O_RDONLY|O_CLOEXEC) = -1 EACCES (Permission denied)
  openat(AT_FDCWD, "/etc/passwd", O_RDONLY|O_CLOEXEC) = 4

with corresponding SELinux warnings logged in audit.log.

Reported-by: Minxi Hou <mhou@redhat.com>
Analysed-by: Miloš Malik <mmalik@redhat.com>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 contrib/selinux/passt.te | 5 +----
 contrib/selinux/pasta.te | 9 +--------
 2 files changed, 2 insertions(+), 12 deletions(-)

diff --git a/contrib/selinux/passt.te b/contrib/selinux/passt.te
index 80bf780..c6cea34 100644
--- a/contrib/selinux/passt.te
+++ b/contrib/selinux/passt.te
@@ -47,8 +47,6 @@ require {
 	type port_t;
 	type http_port_t;
 
-	type passwd_file_t;
-
 	class netlink_route_socket { bind create nlmsg_read };
 	type sysctl_net_t;
 
@@ -96,8 +94,7 @@ allow passt_t self:capability { sys_tty_config setpcap net_bind_service setuid s
 allow passt_t self:cap_userns { setpcap sys_admin sys_ptrace };
 allow passt_t self:user_namespace create;
 
-allow passt_t passwd_file_t:file read_file_perms;
-sssd_search_lib(passt_t)
+auth_read_passwd(passt_t)
 
 allow passt_t proc_net_t:file read;
 allow passt_t net_conf_t:file { open read };
diff --git a/contrib/selinux/pasta.te b/contrib/selinux/pasta.te
index 310383c..69be081 100644
--- a/contrib/selinux/pasta.te
+++ b/contrib/selinux/pasta.te
@@ -68,9 +68,6 @@ require {
 	type system_dbusd_t;
 	type systemd_hostnamed_t;
 	type systemd_systemctl_exec_t;
-	type passwd_file_t;
-	type sssd_public_t;
-	type sssd_var_lib_t;
 	class dbus send_msg;
 	class system module_request;
 	class system status;
@@ -115,8 +112,7 @@ allow pasta_t self:capability { setpcap net_bind_service sys_tty_config dac_read
 allow pasta_t self:cap_userns { setpcap sys_admin sys_ptrace net_admin net_bind_service };
 allow pasta_t self:user_namespace create;
 
-allow pasta_t passwd_file_t:file read_file_perms;
-sssd_search_lib(pasta_t)
+auth_read_passwd(pasta_t)
 
 domain_auto_trans(pasta_t, bin_t, unconfined_t);
 domain_auto_trans(pasta_t, shell_exec_t, unconfined_t);
@@ -178,12 +174,9 @@ allow pasta_t init_t:system status;
 allow pasta_t unconfined_t:dir search;
 allow pasta_t unconfined_t:file read;
 allow pasta_t unconfined_t:lnk_file read;
-allow pasta_t passwd_file_t:file { getattr open read };
 allow pasta_t self:process { setpgid setcap };
 allow pasta_t shell_exec_t:file { execute execute_no_trans map };
 
-allow pasta_t sssd_var_lib_t:dir search;
-allow pasta_t sssd_public_t:dir search;
 allow pasta_t hostname_exec_t:file { execute execute_no_trans getattr open read map };
 allow pasta_t system_dbusd_t:unix_stream_socket connectto;
 allow pasta_t system_dbusd_t:dbus send_msg;

From bf9492747df006a794f281d6c26ee38989b44d23 Mon Sep 17 00:00:00 2001
From: Stefano Brivio <sbrivio@redhat.com>
Date: Fri, 15 Nov 2024 15:22:06 +0100
Subject: [PATCH 120/382] ndp: Don't send unsolicited router advertisement if
 we can't, yet

ndp_timer() is called right away on the first epoll_wait() cycle,
when the communication channel to the guest isn't ready yet:

  1.0038: NDP: sending unsolicited RA, next in 264s
  1.0038: tap: failed to send 1 frames of 1

Check that it's up before sending it. This effectively delays the
first gratuitous router advertisement, which is probably a good idea
given that we expect the guest to send a router solicitation right
away.

Fixes: 6e1e44293ef9 ("ndp: Send unsolicited Router Advertisements")
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
---
 ndp.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ndp.c b/ndp.c
index 7ee44b2..1752d64 100644
--- a/ndp.c
+++ b/ndp.c
@@ -391,7 +391,7 @@ void ndp_timer(const struct ctx *c, const struct timespec *now)
 	time_t max_rtr_adv_interval = DEFAULT_MAX_RTR_ADV_INTERVAL;
 	time_t min_rtr_adv_interval, interval;
 
-	if (c->no_ra || now->tv_sec < next_ra)
+	if (c->fd_tap < 0 || c->no_ra || now->tv_sec < next_ra)
 		return;
 
 	/* We must advertise before the route's lifetime expires */

From 5ae21841acd7f55a4b57b99a5097ca99b84f07c4 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Tue, 19 Nov 2024 12:21:56 +1100
Subject: [PATCH 121/382] ndp: Don't send unsolicited RAs if NDP is disabled

We recently added support for sending unsolicited NDP Router Advertisement
packets.  While we (correctly) disable this if the --no-ra option is given,
we incorrectly still send them if --no-ndp is set.  Fix the oversight.

Fixes: 6e1e44293ef9 ("ndp: Send unsolicited Router Advertisements")
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 passt.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/passt.c b/passt.c
index a51a4e1..06e0a33 100644
--- a/passt.c
+++ b/passt.c
@@ -109,7 +109,8 @@ static void post_handler(struct ctx *c, const struct timespec *now)
 	flow_defer_handler(c, now);
 #undef CALL_PROTO_HANDLER
 
-	ndp_timer(c, now);
+	if (!c->no_ndp)
+		ndp_timer(c, now);
 }
 
 /**

From af464c4ffbb7a5341f8a7beedce8382d598dbaf7 Mon Sep 17 00:00:00 2001
From: Stefano Brivio <sbrivio@redhat.com>
Date: Tue, 19 Nov 2024 20:53:43 +0100
Subject: [PATCH 122/382] tcp: Reset ACK_TO_TAP_DUE flag whenever an ACK isn't
 needed anymore

We enter the timer handler with the ACK_TO_TAP_DUE flag, call
tcp_prepare_flags() with ACK_IF_NEEDED, and realise that we
acknowledged everything meanwhile, so we return early, but we also
need to reset that flag to avoid unnecessarily scheduling the timer
over and over again until more pending data appears.

I'm not sure if this fixes any real issue, but I've spotted this
in several logs reported by users, including one where we have some
unexpected bursts of high CPU load during TCP transfers at low rates,
from https://github.com/containers/podman/issues/23686.

Link: https://github.com/containers/podman/discussions/24572
Link: https://github.com/containers/podman/issues/23686
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
---
 tcp.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/tcp.c b/tcp.c
index 6a98dfa..f357920 100644
--- a/tcp.c
+++ b/tcp.c
@@ -1235,8 +1235,10 @@ int tcp_prepare_flags(const struct ctx *c, struct tcp_tap_conn *conn,
 	int s = conn->sock;
 
 	if (SEQ_GE(conn->seq_ack_to_tap, conn->seq_from_tap) &&
-	    !flags && conn->wnd_to_tap)
+	    !flags && conn->wnd_to_tap) {
+		conn_flag(c, conn, ~ACK_TO_TAP_DUE);
 		return 0;
+	}
 
 	if (getsockopt(s, SOL_TCP, TCP_INFO, &tinfo, &sl)) {
 		conn_event(c, conn, CLOSED);

From 238c69f9af458e41dea5ad8c988dbf65b05b5172 Mon Sep 17 00:00:00 2001
From: Stefano Brivio <sbrivio@redhat.com>
Date: Tue, 19 Nov 2024 20:53:44 +0100
Subject: [PATCH 123/382] tcp: Acknowledge keep-alive segments, ignore them for
 the rest

RFC 9293, 3.8.4 says:

   Implementers MAY include "keep-alives" in their TCP implementations
   (MAY-5), although this practice is not universally accepted.  Some
   TCP implementations, however, have included a keep-alive mechanism.
   To confirm that an idle connection is still active, these
   implementations send a probe segment designed to elicit a response
   from the TCP peer.  Such a segment generally contains SEG.SEQ =
   SND.NXT-1 and may or may not contain one garbage octet of data.  If
   keep-alives are included, the application MUST be able to turn them
   on or off for each TCP connection (MUST-24), and they MUST default to
   off (MUST-25).

but currently, tcp_data_from_tap() is not aware of this and will
schedule a fast re-transmit on the second keep-alive (because it's
also a duplicate ACK), ignoring the fact that the sequence number was
rewound to SND.NXT-1.

ACK these keep-alive segments, reset the activity timeout, and ignore
them for the rest.

At some point, we could think of implementing an approximation of
keep-alive segments on outbound sockets, for example by setting
TCP_KEEPIDLE to 1, and a large TCP_KEEPINTVL, so that we send a single
keep-alive segment at approximately the same time, and never reset the
connection. That's beyond the scope of this fix, though.
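
Purely as an illustration of that idea (not part of this fix, the
values are arbitrary, and it needs <sys/socket.h> and <netinet/tcp.h>),
the approximation could look roughly like this on the outbound socket,
with the kernel sending the probes for us:

  int on = 1, idle = 1 /* s */, intvl = 86400 /* s */;

  setsockopt(s, SOL_SOCKET, SO_KEEPALIVE,  &on,    sizeof(on));
  setsockopt(s, SOL_TCP,    TCP_KEEPIDLE,  &idle,  sizeof(idle));
  setsockopt(s, SOL_TCP,    TCP_KEEPINTVL, &intvl, sizeof(intvl));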

Reported-by: Tim Besard <tim.besard@gmail.com>
Link: https://github.com/containers/podman/discussions/24572
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
---
 tcp.c | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/tcp.c b/tcp.c
index f357920..1eb85bb 100644
--- a/tcp.c
+++ b/tcp.c
@@ -1763,6 +1763,20 @@ static int tcp_data_from_tap(const struct ctx *c, struct tcp_tap_conn *conn,
 			continue;
 
 		seq = ntohl(th->seq);
+		if (SEQ_LT(seq, conn->seq_from_tap) && len <= 1) {
+			flow_trace(conn,
+				   "keep-alive sequence: %u, previous: %u",
+				   seq, conn->seq_from_tap);
+
+			tcp_send_flag(c, conn, ACK);
+			tcp_timer_ctl(c, conn);
+
+			if (p->count == 1)
+				return 1;
+
+			continue;
+		}
+
 		ack_seq = ntohl(th->ack_seq);
 
 		if (th->ack) {

From b61be8468a804f5660cebcfdc10aa94b7ecac7a3 Mon Sep 17 00:00:00 2001
From: Stefano Brivio <sbrivio@redhat.com>
Date: Mon, 25 Nov 2024 11:40:53 +0100
Subject: [PATCH 124/382] passt.1: Fix "default" note about --map-guest-addr

It's not true that there's no mapping by default: there's no mapping
in the --map-guest-addr sense, by default, but in that case
the default --map-host-loopback behaviour prevails.

While at it, fix a typo.

Fixes: 57b7bd2a48a1 ("fwd, conf: Allow NAT of the guest's assigned address")
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
---
 passt.1 | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/passt.1 b/passt.1
index f084978..02a9bcc 100644
--- a/passt.1
+++ b/passt.1
@@ -373,14 +373,14 @@ Translate \fIaddr\fR in the guest to be equal to the guest's assigned
 address on the host.  That is, packets from the guest to \fIaddr\fR
 will be redirected to the address assigned to the guest with \fB-a\fR,
 or by default the host's global address.  This allows the guest to
-access services availble on the host's global address, even though its
+access services available on the host's global address, even though its
 own address shadows that of the host.
 
 If \fIaddr\fR is 'none', no address is mapped.  Only one IPv4 and one
 IPv6 address can be translated, and if the option is specified
 multiple times, the last one for each address type takes effect.
 
-Default is no mapping.
+By default, mapping happens as described for the \-\-map-host-loopback option.
 
 .TP
 .BR \-4 ", " \-\-ipv4-only

From 6819b2e1020411661dc0487ee3614f012d45b049 Mon Sep 17 00:00:00 2001
From: Stefano Brivio <sbrivio@redhat.com>
Date: Mon, 25 Nov 2024 11:46:33 +0100
Subject: [PATCH 125/382] conf, passt.1: Update --mac-addr default in usage()
 and man page

Fixes: 90e83d50a9bd ("Don't take "our" MAC address from the host")
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
---
 conf.c  | 2 +-
 passt.1 | 3 +--
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/conf.c b/conf.c
index 86566db..d342c8a 100644
--- a/conf.c
+++ b/conf.c
@@ -788,7 +788,7 @@ static void usage(const char *name, FILE *f, int status)
 		"  -n, --netmask MASK	Assign IPv4 MASK, dot-decimal or bits\n"
 		"    default: netmask from matching address on the host\n"
 		"  -M, --mac-addr ADDR	Use source MAC address ADDR\n"
-		"    default: MAC address from interface with default route\n"
+		"    default: 9a:55:9a:55:9a:55 (locally administered)\n"
 		"  -g, --gateway ADDR	Pass IPv4 or IPv6 address as gateway\n"
 		"    default: gateway from interface with default route\n"
 		"  -i, --interface NAME	Interface for addresses and routes\n"
diff --git a/passt.1 b/passt.1
index 02a9bcc..059abd3 100644
--- a/passt.1
+++ b/passt.1
@@ -174,8 +174,7 @@ according to the CIDR block of the assigned address (RFC 4632).
 .BR \-M ", " \-\-mac-addr " " \fIaddr
 Use source MAC address \fIaddr\fR when communicating to the guest or to the
 target namespace.
-Default is to use the MAC address of the interface with the first IPv4 default
-route on the host.
+Default is the locally administered MAC address 9a:55:9a:55:9a:55.
 
 .TP
 .BR \-g ", " \-\-gateway " " \fIaddr

From 2bf8ffcf078c5933e6a31dbffbfb4dc31bfd7bc5 Mon Sep 17 00:00:00 2001
From: Stefano Brivio <sbrivio@redhat.com>
Date: Mon, 25 Nov 2024 11:53:10 +0100
Subject: [PATCH 126/382] test/perf: Select a single IPv6 namespace address in
 pasta tests

By dropping the filter on prefix length, commit 910f4f910301
("test: Don't require 64-bit prefixes in perf tests") broke tests on
setups where two global unicast IPv6 addresses are available, which
is the typical case when the "host" is a VM running under passt with
addresses from SLAAC and DHCPv6, because two addresses will be
returned.

Pick the first one instead. We don't really care about the prefix
length, any of these addresses will work.

Fixes: 910f4f910301 ("test: Don't require 64-bit prefixes in perf tests")
Link: https://archives.passt.top/passt-dev/20241119214344.6b4a5b3a@elisabeth/
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
---
 test/perf/pasta_tcp | 2 +-
 test/perf/pasta_udp | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/test/perf/pasta_tcp b/test/perf/pasta_tcp
index 88284b2..bc0de3c 100644
--- a/test/perf/pasta_tcp
+++ b/test/perf/pasta_tcp
@@ -211,7 +211,7 @@ tr	TCP throughput over IPv6: host to ns
 iperf3s	ns 10002
 
 nsout	IFNAME ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname'
-nsout	ADDR6 ip -j -6 addr show|jq -rM '.[] | select(.ifname == "__IFNAME__").addr_info[] | select(.scope == "global").local'
+nsout	ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__IFNAME__").addr_info[] | select(.scope == "global").local] | .[0]'
 bw	-
 bw	-
 bw	-
diff --git a/test/perf/pasta_udp b/test/perf/pasta_udp
index 3d07091..ab2f3e8 100644
--- a/test/perf/pasta_udp
+++ b/test/perf/pasta_udp
@@ -196,7 +196,7 @@ tr	UDP throughput over IPv6: host to ns
 iperf3s	ns 10002
 
 nsout	IFNAME ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname'
-nsout	ADDR6 ip -j -6 addr show|jq -rM '.[] | select(.ifname == "__IFNAME__").addr_info[] | select(.scope == "global").local'
+nsout	ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__IFNAME__").addr_info[] | select(.scope == "global").local] | .[0]'
 iperf3	BW host __ADDR6__ 10002 __TIME__ __OPTS__ -b 8G -l 1472
 bw	__BW__ 0.3 0.5
 iperf3	BW host __ADDR6__ 10002 __TIME__ __OPTS__ -b 12G -l 3972

From cda7f160f091515770a103765d50bac0f136faef Mon Sep 17 00:00:00 2001
From: Stefano Brivio <sbrivio@redhat.com>
Date: Mon, 25 Nov 2024 08:50:39 +0100
Subject: [PATCH 127/382] ndp: Don't send first periodic router advertisement
 right after guest connects

This is very visible with muvm, but it also happens with QEMU: we're
sending the first unsolicited router advertisement milliseconds after
the guest connects.

That's usually pointless because, when the hypervisor connects, the
guest is typically not ready yet to process anything of that sort:
it's still booting. And if we happen to send it late enough (still
within milliseconds), with muvm, the message is discarded, but it
sometimes (slightly) delays the response to the guest's first router
solicitation, which is the one we need to arrive quickly.

Skip sending the unsolicited advertisement on the first timer run,
just calculate the next delay. Keep it simple by observing that we're
probably not trying to reach the 1970s with IPv6.

Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
---
 ndp.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/ndp.c b/ndp.c
index 1752d64..37bf7a3 100644
--- a/ndp.c
+++ b/ndp.c
@@ -420,9 +420,13 @@ void ndp_timer(const struct ctx *c, const struct timespec *now)
 	interval = min_rtr_adv_interval +
 		random() % (max_rtr_adv_interval - min_rtr_adv_interval);
 
+	if (!next_ra)
+		goto first;
+
 	info("NDP: sending unsolicited RA, next in %llds", (long long)interval);
 
 	ndp_ra(c, &in6addr_ll_all_nodes);
 
+first:
 	next_ra = now->tv_sec + interval;
 }

From c6e61064139ba94a763097144d1a84bd4fbafade Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Tue, 26 Nov 2024 14:27:27 +1100
Subject: [PATCH 128/382] test: Improve logic for waiting for SLAAC & DAD to
 complete in NDP tests

Since 9a0e544f05bf the NDP tests attempt to explicitly wait for DAD to
complete, rather than just having a hard coded sleep.  However, the
conditions we use are a bit sloppy and allow for a number of possible cases
where it might not work correctly.  Stefano seems to be hitting one of
these (though I'm not sure which) with some later patches.

 - We wait for *lack* of a tentative address, so if the first check occurs
   before we have even a tentative address it will bypass the delay
 - It's not entirely clear if the permanent address will always appear
   as soon as the tentative address disappears
 - We weren't filtering on interface
 - We were doing the filtering with ip-address options rather than in jq.
   However, in at least some circumstances this seems to result in an
   empty .addr_info field, rather than omitting it entirely, which could
   cause us to get the wrong result

So, instead, explicitly wait for the address we need to be present: an
RA provided address on the external interface.  While we're here we remove
the requirement that it have global scope: the "kernel_ra" check is already
sufficient to make sure this address comes from an NDP RA, not something
else.  If it's not the global scope address we expect, better to check it
and fail, rather than keep waiting.

Fixes: 9a0e544f05bf ("test: Improve test for NDP assigned prefix")
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 test/passt/ndp | 6 +++---
 test/pasta/ndp | 6 +++---
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/test/passt/ndp b/test/passt/ndp
index 56b385b..516cd6b 100644
--- a/test/passt/ndp
+++ b/test/passt/ndp
@@ -17,13 +17,13 @@ htools	ip jq sipcalc grep cut
 test	Interface name
 gout	IFNAME ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname'
 guest	ip link set dev __IFNAME__ up
-# Wait for DAD to complete
-guest	while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done
+# Wait for SLAAC & DAD to complete
+guest	while ! ip -j -6 addr show dev __IFNAME__ | jq -e '.[].addr_info.[] | select(.protocol == "kernel_ra")'; do sleep 0.1; done
 hout	HOST_IFNAME6 ip -j -6 route show|jq -rM '[.[] | select(.dst == "default").dev] | .[0]'
 check	[ -n "__IFNAME__" ]
 
 test	SLAAC: prefix
-gout	ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__IFNAME__").addr_info[] | select(.scope == "global" and .protocol == "kernel_ra") | .local + "/" + (.prefixlen | tostring)] | .[0]'
+gout	ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__IFNAME__").addr_info[] | select(.protocol == "kernel_ra") | .local + "/" + (.prefixlen | tostring)] | .[0]'
 gout	PREFIX6 sipcalc __ADDR6__ | grep prefix | cut -d' ' -f4
 hout	HOST_ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__HOST_IFNAME6__").addr_info[] | select(.scope == "global" and .deprecated != true).local] | .[0]'
 hout	HOST_PREFIX6 sipcalc __HOST_ADDR6__/64 | grep prefix | cut -d' ' -f4
diff --git a/test/pasta/ndp b/test/pasta/ndp
index 2442ab5..952c1ea 100644
--- a/test/pasta/ndp
+++ b/test/pasta/ndp
@@ -18,11 +18,11 @@ test	Interface name
 nsout	IFNAME ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname'
 check	[ -n "__IFNAME__" ]
 ns	ip link set dev __IFNAME__ up
-# Wait for DAD to complete
-ns	while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done
+# Wait for SLAAC & DAD to complete
+ns	while ! ip -j -6 addr show dev __IFNAME__ | jq -e '.[].addr_info.[] | select(.protocol == "kernel_ra")'; do sleep 0.1; done
 
 test	SLAAC: prefix
-nsout	ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__IFNAME__").addr_info[] | select(.scope == "global" and .protocol == "kernel_ra") | .local + "/" + (.prefixlen | tostring)] | .[0]'
+nsout	ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__IFNAME__").addr_info[] | select(.protocol == "kernel_ra") | .local + "/" + (.prefixlen | tostring)] | .[0]'
 nsout	PREFIX6 sipcalc __ADDR6__ | grep prefix | cut -d' ' -f4
 hout	HOST_ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__IFNAME__").addr_info[] | select(.scope == "global" and .deprecated != true).local] | .[0]'
 hout	HOST_PREFIX6 sipcalc __HOST_ADDR6__/64 | grep prefix | cut -d' ' -f4

From 14b84a7f077ecb734bb0e724f70bafeaa6d35a61 Mon Sep 17 00:00:00 2001
From: Stefano Brivio <sbrivio@redhat.com>
Date: Fri, 22 Nov 2024 07:57:43 +0100
Subject: [PATCH 129/382] treewide: Introduce 'local mode' for disconnected
 setups

There are setups where no host interface is available or configured
at all, intentionally or not, temporarily or not. Users still expect
(Podman) containers to run in those cases, as they did with
slirp4netns, and we're now getting reports that we broke such setups
at a rather alarming rate.

To this end, if we don't find any usable host interface, instead of
exiting:

- for IPv4, use 169.254.2.1 as guest/container address and 169.254.2.2
  as default gateway

- for IPv6, don't assign any address (forcibly disable DHCPv6), and
  use the *first* link-local address we observe to represent the
  guest/container. Advertise fe80::1 as default gateway

- use 'tap0' as default interface name for pasta

Change ifi4 and ifi6 in struct ctx to int and accept a special -1
value meaning that no host interface was selected, but the IP family
is enabled. The fact that the kernel uses unsigned int values for
those is not an issue as 1. one can't create so many interfaces
anyway and 2. we otherwise handle those values transparently.

Fix a botched conditional in conf_print() to actually skip printing
DHCPv6 information if DHCPv6 is disabled (and skip printing NDP
information if NDP is disabled).
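
As a cross-check of the link-local defaults listed above against the
constants added to conf.c below (octets in hex):

  169.254.2.1  ->  a9 fe 02 01  ->  IP4_LL_GUEST_ADDR (0xa9fe0201)
  169.254.2.2  ->  a9 fe 02 02  ->  IP4_LL_GUEST_GW   (0xa9fe0202)
  fe80::1      ->  fe 80, 13 zero octets, 01  ->  IP6_LL_GUEST_GW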

Link: https://github.com/containers/podman/issues/24614
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 conf.c  | 97 ++++++++++++++++++++++++++++++++++++++++++++-------------
 passt.1 | 33 +++++++++++++++++---
 passt.h |  8 ++---
 pasta.c |  7 +++--
 tap.c   |  3 ++
 5 files changed, 116 insertions(+), 32 deletions(-)

diff --git a/conf.c b/conf.c
index d342c8a..c6bffc4 100644
--- a/conf.c
+++ b/conf.c
@@ -48,6 +48,20 @@
 
 #define NETNS_RUN_DIR	"/run/netns"
 
+#define IP4_LL_GUEST_ADDR	(struct in_addr){ htonl_constant(0xa9fe0201) }
+				/* 169.254.2.1, libslirp default: 10.0.2.1 */
+
+#define IP4_LL_GUEST_GW		(struct in_addr){ htonl_constant(0xa9fe0202) }
+				/* 169.254.2.2, libslirp default: 10.0.2.2 */
+
+#define IP4_LL_PREFIX_LEN	16
+
+#define IP6_LL_GUEST_GW		(struct in6_addr)			\
+				{{{ 0xfe, 0x80, 0, 0, 0, 0, 0, 0,	\
+				       0, 0, 0, 0, 0, 0, 0, 0x01 }}}
+
+const char *pasta_default_ifn = "tap0";
+
 /**
  * next_chunk - Return the next piece of a string delimited by a character
  * @s:		String to search
@@ -631,7 +645,7 @@ static unsigned int conf_ip4(unsigned int ifi, struct ip4_ctx *ip4)
 		ifi = nl_get_ext_if(nl_sock, AF_INET);
 
 	if (!ifi) {
-		info("Couldn't pick external interface: disabling IPv4");
+		debug("Failed to detect external interface for IPv4");
 		return 0;
 	}
 
@@ -639,8 +653,8 @@ static unsigned int conf_ip4(unsigned int ifi, struct ip4_ctx *ip4)
 		int rc = nl_route_get_def(nl_sock, ifi, AF_INET,
 					  &ip4->guest_gw);
 		if (rc < 0) {
-			err("Couldn't discover IPv4 gateway address: %s",
-			    strerror(-rc));
+			debug("Couldn't discover IPv4 gateway address: %s",
+			      strerror(-rc));
 			return 0;
 		}
 	}
@@ -649,8 +663,8 @@ static unsigned int conf_ip4(unsigned int ifi, struct ip4_ctx *ip4)
 		int rc = nl_addr_get(nl_sock, ifi, AF_INET,
 				     &ip4->addr, &ip4->prefix_len, NULL);
 		if (rc < 0) {
-			err("Couldn't discover IPv4 address: %s",
-			    strerror(-rc));
+			debug("Couldn't discover IPv4 address: %s",
+			      strerror(-rc));
 			return 0;
 		}
 	}
@@ -677,6 +691,19 @@ static unsigned int conf_ip4(unsigned int ifi, struct ip4_ctx *ip4)
 	return ifi;
 }
 
+/**
+ * conf_ip4_local() - Configure IPv4 addresses and attributes for local mode
+ * @ip4:	IPv4 context (will be written)
+ */
+static void conf_ip4_local(struct ip4_ctx *ip4)
+{
+	ip4->addr_seen = ip4->addr = IP4_LL_GUEST_ADDR;
+	ip4->our_tap_addr = ip4->guest_gw = IP4_LL_GUEST_GW;
+	ip4->prefix_len = IP4_LL_PREFIX_LEN;
+
+	ip4->no_copy_addrs = ip4->no_copy_routes = true;
+}
+
 /**
  * conf_ip6() - Verify or detect IPv6 support, get relevant addresses
  * @ifi:	Host interface to attempt (0 to determine one)
@@ -693,15 +720,15 @@ static unsigned int conf_ip6(unsigned int ifi, struct ip6_ctx *ip6)
 		ifi = nl_get_ext_if(nl_sock, AF_INET6);
 
 	if (!ifi) {
-		info("Couldn't pick external interface: disabling IPv6");
+		debug("Failed to detect external interface for IPv6");
 		return 0;
 	}
 
 	if (IN6_IS_ADDR_UNSPECIFIED(&ip6->guest_gw)) {
 		rc = nl_route_get_def(nl_sock, ifi, AF_INET6, &ip6->guest_gw);
 		if (rc < 0) {
-			err("Couldn't discover IPv6 gateway address: %s",
-			    strerror(-rc));
+			debug("Couldn't discover IPv6 gateway address: %s",
+			      strerror(-rc));
 			return 0;
 		}
 	}
@@ -710,7 +737,7 @@ static unsigned int conf_ip6(unsigned int ifi, struct ip6_ctx *ip6)
 			 IN6_IS_ADDR_UNSPECIFIED(&ip6->addr) ? &ip6->addr : NULL,
 			 &prefix_len, &ip6->our_tap_ll);
 	if (rc < 0) {
-		err("Couldn't discover IPv6 address: %s", strerror(-rc));
+		debug("Couldn't discover IPv6 address: %s", strerror(-rc));
 		return 0;
 	}
 
@@ -726,6 +753,17 @@ static unsigned int conf_ip6(unsigned int ifi, struct ip6_ctx *ip6)
 	return ifi;
 }
 
+/**
+ * conf_ip6_local() - Configure IPv6 addresses and attributes for local mode
+ * @ip6:	IPv6 context (will be written)
+ */
+static void conf_ip6_local(struct ip6_ctx *ip6)
+{
+	ip6->our_tap_ll = ip6->guest_gw = IP6_LL_GUEST_GW;
+
+	ip6->no_copy_addrs = ip6->no_copy_routes = true;
+}
+
 /**
  * usage() - Print usage, exit with given status code
  * @name:	Executable name
@@ -948,12 +986,14 @@ static void conf_print(const struct ctx *c)
 	char bufmac[ETH_ADDRSTRLEN], ifn[IFNAMSIZ];
 	int i;
 
-	info("Template interface: %s%s%s%s%s",
-	     c->ifi4 ? if_indextoname(c->ifi4, ifn) : "",
-	     c->ifi4 ? " (IPv4)" : "",
-	     (c->ifi4 && c->ifi6) ? ", " : "",
-	     c->ifi6 ? if_indextoname(c->ifi6, ifn) : "",
-	     c->ifi6 ? " (IPv6)" : "");
+	if (c->ifi4 > 0 || c->ifi6 > 0) {
+		info("Template interface: %s%s%s%s%s",
+		     c->ifi4 > 0 ? if_indextoname(c->ifi4, ifn) : "",
+		     c->ifi4 > 0 ? " (IPv4)" : "",
+		     (c->ifi4 && c->ifi6) ? ", " : "",
+		     c->ifi6 > 0 ? if_indextoname(c->ifi6, ifn) : "",
+		     c->ifi6 > 0 ? " (IPv6)" : "");
+	}
 
 	if (*c->ip4.ifname_out || *c->ip6.ifname_out) {
 		info("Outbound interface: %s%s%s%s%s",
@@ -1024,9 +1064,9 @@ static void conf_print(const struct ctx *c)
 
 		if (!c->no_ndp && !c->no_dhcpv6)
 			info("NDP/DHCPv6:");
-		else if (!c->no_ndp)
-			info("DHCPv6:");
 		else if (!c->no_dhcpv6)
+			info("DHCPv6:");
+		else if (!c->no_ndp)
 			info("NDP:");
 		else
 			goto dns6;
@@ -1733,10 +1773,23 @@ void conf(struct ctx *c, int argc, char **argv)
 		c->ifi4 = conf_ip4(ifi4, &c->ip4);
 	if (!v4_only)
 		c->ifi6 = conf_ip6(ifi6, &c->ip6);
-	if ((!c->ifi4 && !c->ifi6) ||
-	    (*c->ip4.ifname_out && !c->ifi4) ||
+	if ((*c->ip4.ifname_out && !c->ifi4) ||
 	    (*c->ip6.ifname_out && !c->ifi6))
 		die("External interface not usable");
+	if (!c->ifi4 && !c->ifi6) {
+		info("No external interface as template, switch to local mode");
+
+		conf_ip4_local(&c->ip4);
+		c->ifi4 = -1;
+
+		conf_ip6_local(&c->ip6);
+		c->ifi6 = -1;
+
+		if (!*c->pasta_ifn) {
+			strncpy(c->pasta_ifn, pasta_default_ifn,
+				sizeof(c->pasta_ifn) - 1);
+		}
+	}
 
 	if (c->ifi4 && !no_map_gw &&
 	    IN4_IS_ADDR_UNSPECIFIED(&c->ip4.map_host_loopback))
@@ -1840,6 +1893,8 @@ void conf(struct ctx *c, int argc, char **argv)
 	if (!c->ifi6) {
 		c->no_ndp = 1;
 		c->no_dhcpv6 = 1;
+	} else if (IN6_IS_ADDR_UNSPECIFIED(&c->ip6.addr)) {
+		c->no_dhcpv6 = 1;
 	}
 
 	if (!c->mtu)
@@ -1848,9 +1903,9 @@ void conf(struct ctx *c, int argc, char **argv)
 	get_dns(c);
 
 	if (!*c->pasta_ifn) {
-		if (c->ifi4)
+		if (c->ifi4 > 0)
 			if_indextoname(c->ifi4, c->pasta_ifn);
-		else
+		else if (c->ifi6 > 0)
 			if_indextoname(c->ifi6, c->pasta_ifn);
 	}
 
diff --git a/passt.1 b/passt.1
index 059abd3..15c8338 100644
--- a/passt.1
+++ b/passt.1
@@ -160,7 +160,9 @@ once for IPv6).
 By default, assigned IPv4 and IPv6 addresses are taken from the host interfaces
 with the first default route, if any, for the corresponding IP version. If no
 default routes are available and there is any interface with any route for a
-given IP version, the first of these interfaces will be chosen instead.
+given IP version, the first of these interfaces will be chosen instead. If no
+such interface exists, the link-local address 169.254.2.1 is assigned for IPv4,
+and no additional address will be assigned for IPv6.
 
 .TP
 .BR \-n ", " \-\-netmask " " \fImask
@@ -187,7 +189,9 @@ first default route, if any, for the corresponding IP version. If the default
 route is a multipath one, the gateway is the first nexthop router returned by
 the kernel which has the highest weight in the set of paths. If no default
 routes are available and there is just one interface with any route, that
-interface will be chosen instead.
+interface will be chosen instead. If no such interface exists, the link-local
+address 169.254.2.2 is used for IPv4, and the link-local address fe80::1 is used
+for IPv6.
 
 Note: these addresses are also used as source address for packets directed to
 the guest or to the target namespace having a loopback or local source address,
@@ -202,7 +206,9 @@ Default is to use the interfaces specified by \fB--outbound-if4\fR and
 
 If no interfaces are given, the interface with the first default routes for each
 IP version is selected. If no default routes are available and there is just one
-interface with any route, that interface will be chosen instead.
+interface with any route, that interface will be chosen instead. If no such
+interface exists, host interfaces will be ignored for the purposes of assigning
+addresses and routes, and link-local addresses will be used instead.
 
 .TP
 .BR \-o ", " \-\-outbound " " \fIaddr
@@ -221,7 +227,8 @@ derive IPv4 addresses and routes.
 
 By default, the interface given by the default route is selected. If no default
 routes are available and there is just one interface with any route, that
-interface will be chosen instead.
+interface will be chosen instead. If no such interface exists, outbound sockets
+will not be bound to any specific interface.
 
 .TP
 .BR \-\-outbound-if6 " " \fIname
@@ -231,7 +238,8 @@ derive IPv6 addresses and routes.
 
 By default, the interface given by the default route is selected. If no default
 routes are available and there is just one interface with any route, that
-interface will be chosen instead.
+interface will be chosen instead. If no such interface exists, outbound sockets
+will not be bound to any specific interface.
 
 .TP
 .BR \-D ", " \-\-dns " " \fIaddr
@@ -503,6 +511,7 @@ Default is \fBnone\fR.
 .BR \-I ", " \-\-ns-ifname " " \fIname
 Name of tap interface to be created in target namespace.
 By default, the same interface name as the external, routable interface is used.
+If no such interface exists, the name \fItap0\fR will be used instead.
 
 .TP
 .BR \-t ", " \-\-tcp-ports " " \fIspec
@@ -1031,6 +1040,20 @@ If the sending window cannot be queried, it will always be announced as the
 current sending buffer size to guest or target namespace. This might affect
 throughput of TCP connections.
 
+.SS Local mode for disconnected setups
+
+If \fBpasst\fR and \fBpasta\fR fail to find a host interface with a configured
+address, other than loopback addresses, they will, obviously, not attempt to
+source addresses or routes from the host.
+
+In this case, unless configured otherwise, they will assign the IPv4 link-local
+address 169.254.2.1 to the guest or target namespace, and no IPv6 address. The
+notion of the guest or target namespace IPv6 address is derived from the first
+link-local address observed.
+
+Default gateways will be assigned as the link-local address 169.254.2.2 for
+IPv4, and as the link-local address fe80::1 for IPv6.
+
 .SH LIMITATIONS
 
 Currently, IGMP/MLD proxying (RFC 4605) and support for SCTP (RFC 4960) are not
diff --git a/passt.h b/passt.h
index 72c7f72..799ee50 100644
--- a/passt.h
+++ b/passt.h
@@ -202,10 +202,10 @@ struct ip6_ctx {
  * @our_tap_mac:	Pasta/passt's MAC on the tap link
  * @guest_mac:		MAC address of guest or namespace, seen or configured
  * @hash_secret:	128-bit secret for siphash functions
- * @ifi4:		Index of template interface for IPv4, 0 if IPv4 disabled
+ * @ifi4:		Template interface for IPv4, -1: none, 0: IPv4 disabled
  * @ip:			IPv4 configuration
  * @dns_search:		DNS search list
- * @ifi6:		Index of template interface for IPv6, 0 if IPv6 disabled
+ * @ifi6:		Template interface for IPv6, -1: none, 0: IPv6 disabled
  * @ip6:		IPv6 configuration
  * @pasta_ifn:		Name of namespace interface for pasta
  * @pasta_ifi:		Index of namespace interface for pasta
@@ -258,12 +258,12 @@ struct ctx {
 	unsigned char guest_mac[ETH_ALEN];
 	uint64_t hash_secret[2];
 
-	unsigned int ifi4;
+	int ifi4;
 	struct ip4_ctx ip4;
 
 	struct fqdn dns_search[MAXDNSRCH];
 
-	unsigned int ifi6;
+	int ifi6;
 	struct ip6_ctx ip6;
 
 	char pasta_ifn[IF_NAMESIZE];
diff --git a/pasta.c b/pasta.c
index a117704..96dacc3 100644
--- a/pasta.c
+++ b/pasta.c
@@ -369,8 +369,11 @@ void pasta_ns_conf(struct ctx *c)
 					  0, IFF_NOARP);
 
 			if (c->ip6.no_copy_addrs) {
-				rc = nl_addr_set(nl_sock_ns, c->pasta_ifi,
-						 AF_INET6, &c->ip6.addr, 64);
+				if (!IN6_IS_ADDR_UNSPECIFIED(&c->ip6.addr)) {
+					rc = nl_addr_set(nl_sock_ns,
+							 c->pasta_ifi, AF_INET6,
+							 &c->ip6.addr, 64);
+				}
 			} else {
 				rc = nl_addr_dup(nl_sock, c->ifi6,
 						 nl_sock_ns, c->pasta_ifi,
diff --git a/tap.c b/tap.c
index 14d9b3d..5347df4 100644
--- a/tap.c
+++ b/tap.c
@@ -803,6 +803,9 @@ resume:
 			if (IN6_IS_ADDR_UNSPECIFIED(&c->ip6.addr_seen)) {
 				c->ip6.addr_seen = *saddr;
 			}
+
+			if (IN6_IS_ADDR_UNSPECIFIED(&c->ip6.addr))
+				c->ip6.addr = *saddr;
 		} else if (!IN6_IS_ADDR_UNSPECIFIED(saddr)){
 			c->ip6.addr_seen = *saddr;
 		}

From d6e9e2486f092901207e6565f5eee3817cf4e11a Mon Sep 17 00:00:00 2001
From: Stefano Brivio <sbrivio@redhat.com>
Date: Fri, 15 Nov 2024 18:13:17 +0100
Subject: [PATCH 130/382] dhcp: Use -1 as "missing option" length instead of 0

We want to add support for option 80 (Rapid Commit, RFC 4039), whose
length is 0.
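
On the wire (RFC 2132), each option is a one-byte type and a one-byte
length followed by the payload, so a received Rapid Commit option is
just two bytes with length zero, and a zero length can no longer
double as "option not present". For example (bytes in hex):

  35 01 01    option 53 (DHCP Message Type), length 1: DHCPDISCOVER
  50 00       option 80 (Rapid Commit), length 0, no payload
  ff          option 255 (End)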

Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
---
 dhcp.c | 30 ++++++++++++++++++++++--------
 1 file changed, 22 insertions(+), 8 deletions(-)

diff --git a/dhcp.c b/dhcp.c
index a06f143..387aee3 100644
--- a/dhcp.c
+++ b/dhcp.c
@@ -36,9 +36,9 @@
 /**
  * struct opt - DHCP option
  * @sent:	Convenience flag, set while filling replies
- * @slen:	Length of option defined for server
+ * @slen:	Length of option defined for server, -1 if not going to be sent
  * @s:		Option payload from server
- * @clen:	Length of option received from client
+ * @clen:	Length of option received from client, -1 if not received
  * @c:		Option payload from client
  */
 struct opt {
@@ -68,6 +68,11 @@ static struct opt opts[255];
  */
 void dhcp_init(void)
 {
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(opts); i++)
+		opts[i].slen = -1;
+
 	opts[1]  = (struct opt) { 0, 4, {     0 }, 0, { 0 }, };	/* Mask */
 	opts[3]  = (struct opt) { 0, 4, {     0 }, 0, { 0 }, };	/* Router */
 	opts[51] = (struct opt) { 0, 4, {  0xff,
@@ -154,17 +159,17 @@ static int fill(struct msg *m)
 	 * option 53 at the beginning of the list.
 	 * Put it there explicitly, unless requested via option 55.
 	 */
-	if (!memchr(opts[55].c, 53, opts[55].clen))
+	if (opts[55].clen > 0 && !memchr(opts[55].c, 53, opts[55].clen))
 		fill_one(m, 53, &offset);
 
 	for (i = 0; i < opts[55].clen; i++) {
 		o = opts[55].c[i];
-		if (opts[o].slen)
+		if (opts[o].slen != -1)
 			fill_one(m, o, &offset);
 	}
 
 	for (o = 0; o < 255; o++) {
-		if (opts[o].slen && !opts[o].sent)
+		if (opts[o].slen != -1 && !opts[o].sent)
 			fill_one(m, o, &offset);
 	}
 
@@ -264,6 +269,9 @@ static void opt_set_dns_search(const struct ctx *c, size_t max_len)
 						 ".\xc0");
 		}
 	}
+
+	if (!opts[119].slen)
+		opts[119].slen = -1;
 }
 
 /**
@@ -313,6 +321,9 @@ int dhcp(const struct ctx *c, const struct pool *p)
 
 	offset += offsetof(struct msg, o);
 
+	for (i = 0; i < ARRAY_SIZE(opts); i++)
+		opts[i].clen = -1;
+
 	while (opt_off + 2 < opt_len) {
 		const uint8_t *olen, *val;
 		uint8_t *type;
@@ -331,11 +342,12 @@ int dhcp(const struct ctx *c, const struct pool *p)
 		opt_off += *olen + 2;
 	}
 
-	if (opts[53].c[0] == DHCPDISCOVER) {
+	if (opts[53].clen > 0 && opts[53].c[0] == DHCPDISCOVER) {
 		info("DHCP: offer to discover");
 		opts[53].s[0] = DHCPOFFER;
-	} else if (opts[53].c[0] == DHCPREQUEST || !opts[53].clen) {
-		info("%s: ack to request", opts[53].clen ? "DHCP" : "BOOTP");
+	} else if (opts[53].clen <= 0 || opts[53].c[0] == DHCPREQUEST) {
+		info("%s: ack to request", /* DHCP needs a valid message type */
+		     (opts[53].clen <= 0) ? "BOOTP" : "DHCP");
 		opts[53].s[0] = DHCPACK;
 	} else {
 		return -1;
@@ -374,6 +386,8 @@ int dhcp(const struct ctx *c, const struct pool *p)
 		((struct in_addr *)opts[6].s)[i] = c->ip4.dns[i];
 		opts[6].slen += sizeof(uint32_t);
 	}
+	if (!opts[6].slen)
+		opts[6].slen = -1;
 
 	if (!c->no_dhcp_dns_search)
 		opt_set_dns_search(c, sizeof(m->o));

From 9da2038485c9334d28df34d2ebd5ba04a3c7662d Mon Sep 17 00:00:00 2001
From: Stefano Brivio <sbrivio@redhat.com>
Date: Fri, 15 Nov 2024 18:18:22 +0100
Subject: [PATCH 131/382] dhcp: Introduce support for Rapid Commit (option 80,
 RFC 4039)

I'm trying to speed up and simplify IP address acquisition in muvm.
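
For context, Rapid Commit collapses the usual four-message exchange
into two, roughly:

  without Rapid Commit:  DISCOVER / OFFER / REQUEST / ACK
  with Rapid Commit:     DISCOVER (option 80) / ACK (option 80)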

Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
---
 dhcp.c | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/dhcp.c b/dhcp.c
index 387aee3..a16cde8 100644
--- a/dhcp.c
+++ b/dhcp.c
@@ -342,9 +342,16 @@ int dhcp(const struct ctx *c, const struct pool *p)
 		opt_off += *olen + 2;
 	}
 
+	opts[80].slen = -1;
 	if (opts[53].clen > 0 && opts[53].c[0] == DHCPDISCOVER) {
-		info("DHCP: offer to discover");
-		opts[53].s[0] = DHCPOFFER;
+		if (opts[80].clen == -1) {
+			info("DHCP: offer to discover");
+			opts[53].s[0] = DHCPOFFER;
+		} else {
+			info("DHCP: ack to discover (Rapid Commit)");
+			opts[53].s[0] = DHCPACK;
+			opts[80].slen = 0;
+		}
 	} else if (opts[53].clen <= 0 || opts[53].c[0] == DHCPREQUEST) {
 		info("%s: ack to request", /* DHCP needs a valid message type */
 		     (opts[53].clen <= 0) ? "BOOTP" : "DHCP");

From c0fbc7ef2ae2919bf6162b4149d341f448289836 Mon Sep 17 00:00:00 2001
From: Stefano Brivio <sbrivio@redhat.com>
Date: Mon, 25 Nov 2024 00:52:57 +0100
Subject: [PATCH 132/382] dhcp: Honour broadcast flag (RFC 2131, 4.1)

It's widely considered a legacy option nowadays, and I haven't seen
clients setting it since Windows 95, but it's convenient for a minimal
DHCP client not using raw IP sockets such as what I'm playing with for
muvm.
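
For illustration, a client asks for this by setting the top bit of the
BOOTP 'flags' field in its request, something like the following
(reusing the FLAG_BROADCAST constant added below; the client side
itself is hypothetical):

  /* RFC 2131, 4.1: please broadcast the reply to 255.255.255.255 */
  m->flags |= FLAG_BROADCAST;	/* htons_constant(0x8000) */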

Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
---
 dhcp.c | 12 ++++++++++--
 ip.h   |  3 +++
 2 files changed, 13 insertions(+), 2 deletions(-)

diff --git a/dhcp.c b/dhcp.c
index a16cde8..d8515aa 100644
--- a/dhcp.c
+++ b/dhcp.c
@@ -112,6 +112,8 @@ struct msg {
 	uint32_t xid;
 	uint16_t secs;
 	uint16_t flags;
+#define FLAG_BROADCAST	htons_constant(0x8000)
+
 	uint32_t ciaddr;
 	struct in_addr yiaddr;
 	uint32_t siaddr;
@@ -285,10 +287,10 @@ int dhcp(const struct ctx *c, const struct pool *p)
 {
 	size_t mlen, dlen, offset = 0, opt_len, opt_off = 0;
 	char macstr[ETH_ADDRSTRLEN];
+	struct in_addr mask, dst;
 	const struct ethhdr *eh;
 	const struct iphdr *iph;
 	const struct udphdr *uh;
-	struct in_addr mask;
 	unsigned int i;
 	struct msg *m;
 
@@ -400,7 +402,13 @@ int dhcp(const struct ctx *c, const struct pool *p)
 		opt_set_dns_search(c, sizeof(m->o));
 
 	dlen = offsetof(struct msg, o) + fill(m);
-	tap_udp4_send(c, c->ip4.our_tap_addr, 67, c->ip4.addr, 68, m, dlen);
+
+	if (m->flags & FLAG_BROADCAST)
+		dst = in4addr_broadcast;
+	else
+		dst = c->ip4.addr;
+
+	tap_udp4_send(c, c->ip4.our_tap_addr, 67, dst, 68, m, dlen);
 
 	return 1;
 }
diff --git a/ip.h b/ip.h
index 0742612..1544dbf 100644
--- a/ip.h
+++ b/ip.h
@@ -101,4 +101,7 @@ static const struct in6_addr in6addr_ll_all_nodes = {
 	},
 };
 
+/* IPv4 Limited Broadcast (RFC 919, Section 7), 255.255.255.255 */
+static const struct in_addr in4addr_broadcast = { 0xffffffff };
+
 #endif /* IP_H */

From dd143e38901af1c0c48f483d93558ba0773667db Mon Sep 17 00:00:00 2001
From: Laurent Vivier <lvivier@redhat.com>
Date: Fri, 22 Nov 2024 17:43:28 +0100
Subject: [PATCH 133/382] packet: replace struct desc by struct iovec

To be able to manage buffers inside a shared memory region provided
by a VM via a vhost-user interface, we cannot rely on the fact
that buffers are located in a pre-defined memory area and use
a base address and a 32-bit offset to address them.

We need a 64-bit address, so replace struct desc with struct iovec
and update range checking.
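
For reference, struct iovec (POSIX, <sys/uio.h>) carries a full
pointer plus a length, which is what makes it usable for buffers
living anywhere in guest-mapped memory:

  struct iovec {
          void   *iov_base;   /* starting address */
          size_t  iov_len;    /* number of bytes */
  };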

Signed-off-by: Laurent Vivier <lvivier@redhat.com>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 packet.c | 84 ++++++++++++++++++++++++++++++--------------------------
 packet.h | 14 ++--------
 2 files changed, 47 insertions(+), 51 deletions(-)

diff --git a/packet.c b/packet.c
index ccfc846..3748996 100644
--- a/packet.c
+++ b/packet.c
@@ -22,6 +22,35 @@
 #include "util.h"
 #include "log.h"
 
+/**
+ * packet_check_range() - Check if a packet memory range is valid
+ * @p:		Packet pool
+ * @offset:	Offset of data range in packet descriptor
+ * @len:	Length of desired data range
+ * @start:	Start of the packet descriptor
+ * @func:	For tracing: name of calling function
+ * @line:	For tracing: caller line of function call
+ *
+ * Return: 0 if the range is valid, -1 otherwise
+ */
+static int packet_check_range(const struct pool *p, size_t offset, size_t len,
+			      const char *start, const char *func, int line)
+{
+	if (start < p->buf) {
+		trace("packet start %p before buffer start %p, "
+		      "%s:%i", (void *)start, (void *)p->buf, func, line);
+		return -1;
+	}
+
+	if (start + len + offset > p->buf + p->buf_size) {
+		trace("packet offset plus length %lu from size %lu, "
+		      "%s:%i", start - p->buf + len + offset,
+		      p->buf_size, func, line);
+		return -1;
+	}
+
+	return 0;
+}
 /**
  * packet_add_do() - Add data as packet descriptor to given pool
  * @p:		Existing pool
@@ -41,34 +70,16 @@ void packet_add_do(struct pool *p, size_t len, const char *start,
 		return;
 	}
 
-	if (start < p->buf) {
-		trace("add packet start %p before buffer start %p, %s:%i",
-		      (void *)start, (void *)p->buf, func, line);
+	if (packet_check_range(p, 0, len, start, func, line))
 		return;
-	}
-
-	if (start + len > p->buf + p->buf_size) {
-		trace("add packet start %p, length: %zu, buffer end %p, %s:%i",
-		      (void *)start, len, (void *)(p->buf + p->buf_size),
-		      func, line);
-		return;
-	}
 
 	if (len > UINT16_MAX) {
 		trace("add packet length %zu, %s:%i", len, func, line);
 		return;
 	}
 
-#if UINTPTR_MAX == UINT64_MAX
-	if ((uintptr_t)start - (uintptr_t)p->buf > UINT32_MAX) {
-		trace("add packet start %p, buffer start %p, %s:%i",
-		      (void *)start, (void *)p->buf, func, line);
-		return;
-	}
-#endif
-
-	p->pkt[idx].offset = start - p->buf;
-	p->pkt[idx].len = len;
+	p->pkt[idx].iov_base = (void *)start;
+	p->pkt[idx].iov_len = len;
 
 	p->count++;
 }
@@ -96,36 +107,31 @@ void *packet_get_do(const struct pool *p, size_t idx, size_t offset,
 		return NULL;
 	}
 
-	if (len > UINT16_MAX || len + offset > UINT32_MAX) {
+	if (len > UINT16_MAX) {
 		if (func) {
-			trace("packet data length %zu, offset %zu, %s:%i",
-			      len, offset, func, line);
+			trace("packet data length %zu, %s:%i",
+			      len, func, line);
 		}
 		return NULL;
 	}
 
-	if (p->pkt[idx].offset + len + offset > p->buf_size) {
+	if (len + offset > p->pkt[idx].iov_len) {
 		if (func) {
-			trace("packet offset plus length %zu from size %zu, "
-			      "%s:%i", p->pkt[idx].offset + len + offset,
-			      p->buf_size, func, line);
-		}
-		return NULL;
-	}
-
-	if (len + offset > p->pkt[idx].len) {
-		if (func) {
-			trace("data length %zu, offset %zu from length %u, "
-			      "%s:%i", len, offset, p->pkt[idx].len,
+			trace("data length %zu, offset %zu from length %zu, "
+			      "%s:%i", len, offset, p->pkt[idx].iov_len,
 			      func, line);
 		}
 		return NULL;
 	}
 
-	if (left)
-		*left = p->pkt[idx].len - offset - len;
+	if (packet_check_range(p, offset, len, p->pkt[idx].iov_base,
+			       func, line))
+		return NULL;
 
-	return p->buf + p->pkt[idx].offset + offset;
+	if (left)
+		*left = p->pkt[idx].iov_len - offset - len;
+
+	return (char *)p->pkt[idx].iov_base + offset;
 }
 
 /**
diff --git a/packet.h b/packet.h
index a784b07..8377dcf 100644
--- a/packet.h
+++ b/packet.h
@@ -6,16 +6,6 @@
 #ifndef PACKET_H
 #define PACKET_H
 
-/**
- * struct desc - Generic offset-based descriptor within buffer
- * @offset:	Offset of descriptor relative to buffer start, 32-bit limit
- * @len:	Length of descriptor, host order, 16-bit limit
- */
-struct desc {
-	uint32_t offset;
-	uint16_t len;
-};
-
 /**
  * struct pool - Generic pool of packets stored in a buffer
  * @buf:	Buffer storing packet descriptors
@@ -29,7 +19,7 @@ struct pool {
 	size_t buf_size;
 	size_t size;
 	size_t count;
-	struct desc pkt[1];
+	struct iovec pkt[1];
 };
 
 void packet_add_do(struct pool *p, size_t len, const char *start,
@@ -54,7 +44,7 @@ struct _name ## _t {							\
 	size_t buf_size;						\
 	size_t size;							\
 	size_t count;							\
-	struct desc pkt[_size];						\
+	struct iovec pkt[_size];					\
 }
 
 #define PACKET_POOL_INIT_NOCAST(_size, _buf, _buf_size)			\

From 7d1cd4dbf50325b57eb25648f1f64168d7e4820b Mon Sep 17 00:00:00 2001
From: Laurent Vivier <lvivier@redhat.com>
Date: Fri, 22 Nov 2024 17:43:29 +0100
Subject: [PATCH 134/382] vhost-user: introduce virtio API

Add virtio.c and virtio.h that define the functions needed
to manage virtqueues.
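
For orientation, and not part of the patch itself, a back-end would
typically drain a guest-to-host queue with this API roughly as below.
drain_queue() and handle_frame() are hypothetical names; only the
vu_queue_*() calls and the types come from this patch:

	/* Sketch under the assumption of a TX-like queue (read-only
	 * descriptors, so no in_sg entries are needed).
	 */
	static void drain_queue(struct vu_dev *dev, struct vu_virtq *vq)
	{
		struct iovec out_sg[VIRTQUEUE_MAX_SIZE];
		struct vu_virtq_element elem;
		unsigned int count = 0;

		for (;;) {
			elem.out_num = VIRTQUEUE_MAX_SIZE;
			elem.out_sg = out_sg;
			elem.in_num = 0;
			elem.in_sg = NULL;

			if (vu_queue_pop(dev, vq, &elem))
				break;		/* queue empty, or error */

			/* handle_frame(elem.out_sg, elem.out_num); */

			/* Nothing written back to the guest: length 0 */
			vu_queue_fill(vq, &elem, 0, count++);
		}

		if (count) {
			vu_queue_flush(vq, count);  /* publish used entries */
			vu_queue_notify(dev, vq);   /* kick the guest if needed */
		}
	}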

Signed-off-by: Laurent Vivier <lvivier@redhat.com>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 Makefile |   4 +-
 util.h   |   9 +
 virtio.c | 650 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 virtio.h | 183 ++++++++++++++++
 4 files changed, 844 insertions(+), 2 deletions(-)
 create mode 100644 virtio.c
 create mode 100644 virtio.h

diff --git a/Makefile b/Makefile
index 258d298..9b61a47 100644
--- a/Makefile
+++ b/Makefile
@@ -37,7 +37,7 @@ FLAGS += -DDUAL_STACK_SOCKETS=$(DUAL_STACK_SOCKETS)
 PASST_SRCS = arch.c arp.c checksum.c conf.c dhcp.c dhcpv6.c flow.c fwd.c \
 	icmp.c igmp.c inany.c iov.c ip.c isolation.c lineread.c log.c mld.c \
 	ndp.c netlink.c packet.c passt.c pasta.c pcap.c pif.c tap.c tcp.c \
-	tcp_buf.c tcp_splice.c udp.c udp_flow.c util.c
+	tcp_buf.c tcp_splice.c udp.c udp_flow.c util.c virtio.c
 QRAP_SRCS = qrap.c
 SRCS = $(PASST_SRCS) $(QRAP_SRCS)
 
@@ -47,7 +47,7 @@ PASST_HEADERS = arch.h arp.h checksum.h conf.h dhcp.h dhcpv6.h flow.h fwd.h \
 	flow_table.h icmp.h icmp_flow.h inany.h iov.h ip.h isolation.h \
 	lineread.h log.h ndp.h netlink.h packet.h passt.h pasta.h pcap.h pif.h \
 	siphash.h tap.h tcp.h tcp_buf.h tcp_conn.h tcp_internal.h tcp_splice.h \
-	udp.h udp_flow.h util.h
+	udp.h udp_flow.h util.h virtio.h
 HEADERS = $(PASST_HEADERS) seccomp.h
 
 C := \#include <sys/random.h>\nint main(){int a=getrandom(0, 0, 0);}
diff --git a/util.h b/util.h
index 90428c4..41bbd60 100644
--- a/util.h
+++ b/util.h
@@ -144,7 +144,16 @@ static inline uint32_t ntohl_unaligned(const void *p)
 	return ntohl(val);
 }
 
+static inline void barrier(void) { __asm__ __volatile__("" ::: "memory"); }
+#define smp_mb()		do { barrier(); __atomic_thread_fence(__ATOMIC_SEQ_CST); } while (0)
+#define smp_mb_release()	do { barrier(); __atomic_thread_fence(__ATOMIC_RELEASE); } while (0)
+#define smp_mb_acquire()	do { barrier(); __atomic_thread_fence(__ATOMIC_ACQUIRE); } while (0)
+
+#define smp_wmb()	smp_mb_release()
+#define smp_rmb()	smp_mb_acquire()
+
 #define NS_FN_STACK_SIZE	(1024 * 1024) /* 1MiB */
+
 int do_clone(int (*fn)(void *), char *stack_area, size_t stack_size, int flags,
 	     void *arg);
 #define NS_CALL(fn, arg)						\
diff --git a/virtio.c b/virtio.c
new file mode 100644
index 0000000..b23a68c
--- /dev/null
+++ b/virtio.c
@@ -0,0 +1,650 @@
+// SPDX-License-Identifier: GPL-2.0-or-later AND BSD-3-Clause
+/*
+ * virtio API, vring and virtqueue functions definition
+ *
+ * Copyright Red Hat
+ * Author: Laurent Vivier <lvivier@redhat.com>
+ */
+
+/* Some parts copied from QEMU subprojects/libvhost-user/libvhost-user.c
+ * originally licensed under the following terms:
+ *
+ * --
+ *
+ * Copyright IBM, Corp. 2007
+ * Copyright (c) 2016 Red Hat, Inc.
+ *
+ * Authors:
+ *  Anthony Liguori <aliguori@us.ibm.com>
+ *  Marc-André Lureau <mlureau@redhat.com>
+ *  Victor Kaplansky <victork@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or
+ * later.  See the COPYING file in the top-level directory.
+ *
+ * Some parts copied from QEMU hw/virtio/virtio.c
+ * licensed under the following terms:
+ *
+ * Copyright IBM, Corp. 2007
+ *
+ * Authors:
+ *  Anthony Liguori   <aliguori@us.ibm.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ *
+ * --
+ *
+ * virtq_used_event() and virtq_avail_event() from
+ * https://docs.oasis-open.org/virtio/virtio/v1.2/csd01/virtio-v1.2-csd01.html#x1-712000A
+ * licensed under the following terms:
+ *
+ * --
+ *
+ * This header is BSD licensed so anyone can use the definitions
+ * to implement compatible drivers/servers.
+ *
+ * Copyright 2007, 2009, IBM Corporation
+ * Copyright 2011, Red Hat, Inc
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of IBM nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ‘‘AS IS’’ AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL IBM OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <stddef.h>
+#include <endian.h>
+#include <string.h>
+#include <errno.h>
+#include <sys/eventfd.h>
+#include <sys/socket.h>
+
+#include "util.h"
+#include "virtio.h"
+
+#define VIRTQUEUE_MAX_SIZE 1024
+
+/**
+ * vu_gpa_to_va() - Translate guest physical address to our virtual address.
+ * @dev:	Vhost-user device
+ * @plen:	Physical length to map (input), capped to region (output)
+ * @guest_addr:	Guest physical address
+ *
+ * Return: virtual address in our address space of the guest physical address
+ */
+static void *vu_gpa_to_va(struct vu_dev *dev, uint64_t *plen, uint64_t guest_addr)
+{
+	unsigned int i;
+
+	if (*plen == 0)
+		return NULL;
+
+	/* Find matching memory region. */
+	for (i = 0; i < dev->nregions; i++) {
+		const struct vu_dev_region *r = &dev->regions[i];
+
+		if ((guest_addr >= r->gpa) &&
+		    (guest_addr < (r->gpa + r->size))) {
+			if ((guest_addr + *plen) > (r->gpa + r->size))
+				*plen = r->gpa + r->size - guest_addr;
+			/* NOLINTNEXTLINE(performance-no-int-to-ptr) */
+			return (void *)(guest_addr - r->gpa + r->mmap_addr +
+						     r->mmap_offset);
+		}
+	}
+
+	return NULL;
+}
+
+/**
+ * vring_avail_flags() - Read the available ring flags
+ * @vq:		Virtqueue
+ *
+ * Return: the available ring descriptor flags of the given virtqueue
+ */
+static inline uint16_t vring_avail_flags(const struct vu_virtq *vq)
+{
+	return le16toh(vq->vring.avail->flags);
+}
+
+/**
+ * vring_avail_idx() - Read the available ring index
+ * @vq:		Virtqueue
+ *
+ * Return: the available ring index of the given virtqueue
+ */
+static inline uint16_t vring_avail_idx(struct vu_virtq *vq)
+{
+	vq->shadow_avail_idx = le16toh(vq->vring.avail->idx);
+
+	return vq->shadow_avail_idx;
+}
+
+/**
+ * vring_avail_ring() - Read an available ring entry
+ * @vq:		Virtqueue
+ * @i:		Index of the entry to read
+ *
+ * Return: the ring entry content (head of the descriptor chain)
+ */
+static inline uint16_t vring_avail_ring(const struct vu_virtq *vq, int i)
+{
+	return le16toh(vq->vring.avail->ring[i]);
+}
+
+/**
+ * virtq_used_event() - Get location of used event indices
+ *		      (only with VIRTIO_F_EVENT_IDX)
+ * @vq:		Virtqueue
+ *
+ * Return: the location of the used event index
+ */
+static inline uint16_t *virtq_used_event(const struct vu_virtq *vq)
+{
+        /* For backwards compat, used event index is at *end* of avail ring. */
+        return &vq->vring.avail->ring[vq->vring.num];
+}
+
+/**
+ * vring_get_used_event() - Get the used event from the available ring
+ * @vq:		Virtqueue
+ *
+ * Return: the used event (available only if VIRTIO_RING_F_EVENT_IDX is set)
+ *         used_event is a performant alternative where the driver
+ *         specifies how far the device can progress before a notification
+ *         is required.
+ */
+static inline uint16_t vring_get_used_event(const struct vu_virtq *vq)
+{
+	return le16toh(*virtq_used_event(vq));
+}
+
+/**
+ * virtqueue_get_head() - Get the head of the descriptor chain for a given
+ *                        index
+ * @vq:		Virtqueue
+ * @idx:	Available ring entry index
+ * @head:	Head of the descriptor chain
+ */
+static void virtqueue_get_head(const struct vu_virtq *vq,
+			       unsigned int idx, unsigned int *head)
+{
+	/* Grab the next descriptor number they're advertising, and increment
+	 * the index we've seen.
+	 */
+	*head = vring_avail_ring(vq, idx % vq->vring.num);
+
+	/* If their number is silly, that's a fatal mistake. */
+	if (*head >= vq->vring.num)
+		die("vhost-user: Guest says index %u is available", *head);
+}
+
+/**
+ * virtqueue_read_indirect_desc() - Copy virtio ring descriptors from guest
+ *                                  memory
+ * @dev:	Vhost-user device
+ * @desc:	Destination address to copy the descriptors to
+ * @addr:	Guest memory address to copy from
+ * @len:	Length of memory to copy
+ *
+ * Return: -1 if there is an error, 0 otherwise
+ */
+static int virtqueue_read_indirect_desc(struct vu_dev *dev, struct vring_desc *desc,
+					uint64_t addr, size_t len)
+{
+	uint64_t read_len;
+
+	if (len > (VIRTQUEUE_MAX_SIZE * sizeof(struct vring_desc)))
+		return -1;
+
+	if (len == 0)
+		return -1;
+
+	while (len) {
+		const struct vring_desc *orig_desc;
+
+		read_len = len;
+		orig_desc = vu_gpa_to_va(dev, &read_len, addr);
+		if (!orig_desc)
+			return -1;
+
+		memcpy(desc, orig_desc, read_len);
+		len -= read_len;
+		addr += read_len;
+		desc += read_len / sizeof(struct vring_desc);
+	}
+
+	return 0;
+}
+
+/**
+ * enum virtqueue_read_desc_state - State in the descriptor chain
+ * @VIRTQUEUE_READ_DESC_ERROR:	Found an invalid descriptor
+ * @VIRTQUEUE_READ_DESC_DONE:	No more descriptors in the chain
+ * @VIRTQUEUE_READ_DESC_MORE:	There are more descriptors in the chain
+ */
+enum virtqueue_read_desc_state {
+	VIRTQUEUE_READ_DESC_ERROR = -1,
+	VIRTQUEUE_READ_DESC_DONE = 0,   /* end of chain */
+	VIRTQUEUE_READ_DESC_MORE = 1,   /* more buffers in chain */
+};
+
+/**
+ * virtqueue_read_next_desc() - Read the next descriptor in the chain
+ * @desc:	Virtio ring descriptors
+ * @i:		Index of the current descriptor
+ * @max:	Maximum value of the descriptor index
+ * @next:	Index of the next descriptor in the chain (output value)
+ *
+ * Return: current chain descriptor state (error, next, done)
+ */
+static int virtqueue_read_next_desc(const struct vring_desc *desc,
+				    int i, unsigned int max, unsigned int *next)
+{
+	/* If this descriptor says it doesn't chain, we're done. */
+	if (!(le16toh(desc[i].flags) & VRING_DESC_F_NEXT))
+		return VIRTQUEUE_READ_DESC_DONE;
+
+	/* Check they're not leading us off end of descriptors. */
+	*next = le16toh(desc[i].next);
+	/* Make sure compiler knows to grab that: we don't want it changing! */
+	smp_wmb();
+
+	if (*next >= max)
+		return VIRTQUEUE_READ_DESC_ERROR;
+
+	return VIRTQUEUE_READ_DESC_MORE;
+}
+
+/**
+ * vu_queue_empty() - Check if virtqueue is empty
+ * @vq:		Virtqueue
+ *
+ * Return: true if the virtqueue is empty, false otherwise
+ */
+bool vu_queue_empty(struct vu_virtq *vq)
+{
+	if (vq->shadow_avail_idx != vq->last_avail_idx)
+		return false;
+
+	return vring_avail_idx(vq) == vq->last_avail_idx;
+}
+
+/**
+ * vring_can_notify() - Check if a notification can be sent
+ * @dev:	Vhost-user device
+ * @vq:		Virtqueue
+ *
+ * Return: true if notification can be sent
+ */
+static bool vring_can_notify(const struct vu_dev *dev, struct vu_virtq *vq)
+{
+	uint16_t old, new;
+	bool v;
+
+	/* We need to expose used array entries before checking used event. */
+	smp_mb();
+
+	/* Always notify when queue is empty (if the feature is acknowledged) */
+	if (vu_has_feature(dev, VIRTIO_F_NOTIFY_ON_EMPTY) &&
+	    !vq->inuse && vu_queue_empty(vq))
+		return true;
+
+	if (!vu_has_feature(dev, VIRTIO_RING_F_EVENT_IDX))
+		return !(vring_avail_flags(vq) & VRING_AVAIL_F_NO_INTERRUPT);
+
+	v = vq->signalled_used_valid;
+	vq->signalled_used_valid = true;
+	old = vq->signalled_used;
+	new = vq->signalled_used = vq->used_idx;
+	return !v || vring_need_event(vring_get_used_event(vq), new, old);
+}
+
+/**
+ * vu_queue_notify() - Send a notification to the given virtqueue
+ * @dev:	Vhost-user device
+ * @vq:		Virtqueue
+ */
+/* cppcheck-suppress unusedFunction */
+void vu_queue_notify(const struct vu_dev *dev, struct vu_virtq *vq)
+{
+	if (!vring_can_notify(dev, vq)) {
+		debug("vhost-user: virtqueue can skip notify...");
+		return;
+	}
+
+	if (eventfd_write(vq->call_fd, 1) < 0)
+		die_perror("Error writing vhost-user queue eventfd");
+}
+
+/* virtq_avail_event() -  Get location of available event indices
+ *			      (only with VIRTIO_F_EVENT_IDX)
+ * @vq:		Virtqueue
+ *
+ * Return: the location of the available event index
+ */
+static inline uint16_t *virtq_avail_event(const struct vu_virtq *vq)
+{
+        /* For backwards compat, avail event index is at *end* of used ring. */
+        return (uint16_t *)&vq->vring.used->ring[vq->vring.num];
+}
+
+/**
+ * vring_set_avail_event() - Set avail_event
+ * @vq:		Virtqueue
+ * @val:	Value to set to avail_event
+ *		avail_event is used in the same way the used_event is in the
+ *		avail_ring.
+ *		avail_event is used to advise the driver that notifications
+ *		are unnecessary until the driver writes entry with an index
+ *		specified by avail_event into the available ring.
+ */
+static inline void vring_set_avail_event(const struct vu_virtq *vq,
+					 uint16_t val)
+{
+	uint16_t val_le = htole16(val);
+
+	if (!vq->notification)
+		return;
+
+	memcpy(virtq_avail_event(vq), &val_le, sizeof(val_le));
+}
+
+/**
+ * virtqueue_map_desc() - Translate a descriptor buffer address into iovec
+ * 			  entries in our virtual address space
+ * @dev:	Vhost-user device
+ * @p_num_sg:	First iov entry to use (input),
+ *		first iov entry not used (output)
+ * @iov:	Iov array to use to store buffer virtual addresses
+ * @max_num_sg:	Maximum number of iov entries
+ * @pa:		Guest physical address of the buffer to map into our virtual
+ * 		address
+ * @sz:		Size of the buffer
+ *
+ * Return: false on error, true otherwise
+ */
+static bool virtqueue_map_desc(struct vu_dev *dev,
+			       unsigned int *p_num_sg, struct iovec *iov,
+			       unsigned int max_num_sg,
+			       uint64_t pa, size_t sz)
+{
+	unsigned int num_sg = *p_num_sg;
+
+	ASSERT(num_sg < max_num_sg);
+	ASSERT(sz);
+
+	while (sz) {
+		uint64_t len = sz;
+
+		iov[num_sg].iov_base = vu_gpa_to_va(dev, &len, pa);
+		if (iov[num_sg].iov_base == NULL)
+			die("vhost-user: invalid address for buffers");
+		iov[num_sg].iov_len = len;
+		num_sg++;
+		sz -= len;
+		pa += len;
+	}
+
+	*p_num_sg = num_sg;
+	return true;
+}
+
+/**
+ * vu_queue_map_desc() - Map the virtqueue descriptor ring into our virtual
+ * 		       address space
+ * @dev:	Vhost-user device
+ * @vq:		Virtqueue
+ * @idx:	First descriptor ring entry to map
+ * @elem:	Virtqueue element to store descriptor ring iov
+ *
+ * Return: -1 if there is an error, 0 otherwise
+ */
+static int vu_queue_map_desc(struct vu_dev *dev, struct vu_virtq *vq, unsigned int idx,
+			     struct vu_virtq_element *elem)
+{
+	const struct vring_desc *desc = vq->vring.desc;
+	struct vring_desc desc_buf[VIRTQUEUE_MAX_SIZE];
+	unsigned int out_num = 0, in_num = 0;
+	unsigned int max = vq->vring.num;
+	unsigned int i = idx;
+	uint64_t read_len;
+	int rc;
+
+	if (le16toh(desc[i].flags) & VRING_DESC_F_INDIRECT) {
+		unsigned int desc_len;
+		uint64_t desc_addr;
+
+		if (le32toh(desc[i].len) % sizeof(struct vring_desc))
+			die("vhost-user: Invalid size for indirect buffer table");
+
+		/* loop over the indirect descriptor table */
+		desc_addr = le64toh(desc[i].addr);
+		desc_len = le32toh(desc[i].len);
+		max = desc_len / sizeof(struct vring_desc);
+		read_len = desc_len;
+		desc = vu_gpa_to_va(dev, &read_len, desc_addr);
+		if (desc && read_len != desc_len) {
+			/* Failed to use zero copy */
+			desc = NULL;
+			if (!virtqueue_read_indirect_desc(dev, desc_buf, desc_addr, desc_len))
+				desc = desc_buf;
+		}
+		if (!desc)
+			die("vhost-user: Invalid indirect buffer table");
+		i = 0;
+	}
+
+	/* Collect all the descriptors */
+	do {
+		if (le16toh(desc[i].flags) & VRING_DESC_F_WRITE) {
+			if (!virtqueue_map_desc(dev, &in_num, elem->in_sg,
+						elem->in_num,
+						le64toh(desc[i].addr),
+						le32toh(desc[i].len)))
+				return -1;
+		} else {
+			if (in_num)
+				die("Incorrect order for descriptors");
+			if (!virtqueue_map_desc(dev, &out_num, elem->out_sg,
+						elem->out_num,
+						le64toh(desc[i].addr),
+						le32toh(desc[i].len))) {
+				return -1;
+			}
+		}
+
+		/* If we've got too many, that implies a descriptor loop. */
+		if ((in_num + out_num) > max)
+			die("vhost-user: Loop in queue descriptor list");
+		rc = virtqueue_read_next_desc(desc, i, max, &i);
+	} while (rc == VIRTQUEUE_READ_DESC_MORE);
+
+	if (rc == VIRTQUEUE_READ_DESC_ERROR)
+		die("vhost-user: Failed to read descriptor list");
+
+	elem->index = idx;
+	elem->in_num = in_num;
+	elem->out_num = out_num;
+
+	return 0;
+}
+
+/**
+ * vu_queue_pop() - Pop an entry from the virtqueue
+ * @dev:	Vhost-user device
+ * @vq:		Virtqueue
+ * @elem:	Virtqueue element to fill with the entry information
+ *
+ * Return: -1 if there is an error, 0 otherwise
+ */
+/* cppcheck-suppress unusedFunction */
+int vu_queue_pop(struct vu_dev *dev, struct vu_virtq *vq, struct vu_virtq_element *elem)
+{
+	unsigned int head;
+	int ret;
+
+	if (vu_queue_empty(vq))
+		return -1;
+
+	/* Needed after vu_queue_empty(), see comment in
+	 * virtqueue_num_heads().
+	 */
+	smp_rmb();
+
+	if (vq->inuse >= vq->vring.num)
+		die("vhost-user queue size exceeded");
+
+	virtqueue_get_head(vq, vq->last_avail_idx++, &head);
+
+	if (vu_has_feature(dev, VIRTIO_RING_F_EVENT_IDX))
+		vring_set_avail_event(vq, vq->last_avail_idx);
+
+	ret = vu_queue_map_desc(dev, vq, head, elem);
+
+	if (ret < 0)
+		return ret;
+
+	vq->inuse++;
+
+	return 0;
+}
+
+/**
+ * vu_queue_detach_element() - Detach an element from the virtqueue
+ * @vq:		Virtqueue
+ */
+void vu_queue_detach_element(struct vu_virtq *vq)
+{
+	vq->inuse--;
+	/* unmap, when DMA support is added */
+}
+
+/**
+ * vu_queue_unpop() - Push back the previously popped element from the virtqueue
+ * @vq:		Virtqueue
+ */
+/* cppcheck-suppress unusedFunction */
+void vu_queue_unpop(struct vu_virtq *vq)
+{
+	vq->last_avail_idx--;
+	vu_queue_detach_element(vq);
+}
+
+/**
+ * vu_queue_rewind() - Push back a given number of popped elements
+ * @vq:		Virtqueue
+ * @num:	Number of elements to unpop
+ */
+/* cppcheck-suppress unusedFunction */
+bool vu_queue_rewind(struct vu_virtq *vq, unsigned int num)
+{
+	if (num > vq->inuse)
+		return false;
+
+	vq->last_avail_idx -= num;
+	vq->inuse -= num;
+	return true;
+}
+
+/**
+ * vring_used_write() - Write an entry in the used ring
+ * @vq:		Virtqueue
+ * @uelem:	Entry to write
+ * @i:		Index of the entry in the used ring
+ */
+static inline void vring_used_write(struct vu_virtq *vq,
+				    const struct vring_used_elem *uelem, int i)
+{
+	struct vring_used *used = vq->vring.used;
+
+	used->ring[i] = *uelem;
+}
+
+/**
+ * vu_queue_fill_by_index() - Update information of a descriptor ring entry
+ *			      in the used ring
+ * @vq:		Virtqueue
+ * @index:	Descriptor ring index
+ * @len:	Size of the element
+ * @idx:	Used ring entry index
+ */
+void vu_queue_fill_by_index(struct vu_virtq *vq, unsigned int index,
+			    unsigned int len, unsigned int idx)
+{
+	struct vring_used_elem uelem;
+
+	idx = (idx + vq->used_idx) % vq->vring.num;
+
+	uelem.id = htole32(index);
+	uelem.len = htole32(len);
+	vring_used_write(vq, &uelem, idx);
+}
+
+/**
+ * vu_queue_fill() - Update information of a given element in the used ring
+ * @vq:		Virtqueue
+ * @elem:	Element information to fill
+ * @len:	Size of the element
+ * @idx:	Used ring entry index
+ */
+/* cppcheck-suppress unusedFunction */
+void vu_queue_fill(struct vu_virtq *vq, const struct vu_virtq_element *elem,
+		   unsigned int len, unsigned int idx)
+{
+	vu_queue_fill_by_index(vq, elem->index, len, idx);
+}
+
+/**
+ * vring_used_idx_set() - Set the descriptor ring current index
+ * @vq:		Virtqueue
+ * @val:	Value to set in the index
+ */
+static inline void vring_used_idx_set(struct vu_virtq *vq, uint16_t val)
+{
+	vq->vring.used->idx = htole16(val);
+
+	vq->used_idx = val;
+}
+
+/**
+ * vu_queue_flush() - Flush the virtqueue
+ * @vq:		Virtqueue
+ * @count:	Number of entries to flush
+ */
+/* cppcheck-suppress unusedFunction */
+void vu_queue_flush(struct vu_virtq *vq, unsigned int count)
+{
+	uint16_t old, new;
+
+	/* Make sure buffer is written before we update index. */
+	smp_wmb();
+
+	old = vq->used_idx;
+	new = old + count;
+	vring_used_idx_set(vq, new);
+	vq->inuse -= count;
+	if ((uint16_t)(new - vq->signalled_used) < (uint16_t)(new - old))
+		vq->signalled_used_valid = false;
+}
diff --git a/virtio.h b/virtio.h
new file mode 100644
index 0000000..94efeb0
--- /dev/null
+++ b/virtio.h
@@ -0,0 +1,183 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * virtio API, vring and virtqueue functions definition
+ *
+ * Copyright Red Hat
+ * Author: Laurent Vivier <lvivier@redhat.com>
+ */
+
+#ifndef VIRTIO_H
+#define VIRTIO_H
+
+#include <stdbool.h>
+#include <linux/vhost_types.h>
+
+/* Maximum size of a virtqueue */
+#define VIRTQUEUE_MAX_SIZE 1024
+
+/**
+ * struct vu_ring - Virtqueue rings
+ * @num:		Size of the queue
+ * @desc:		Descriptor ring
+ * @avail:		Available ring
+ * @used:		Used ring
+ * @log_guest_addr:	Guest address for logging
+ * @flags:		Vring flags
+ * 			VHOST_VRING_F_LOG is set if log address is valid
+ */
+struct vu_ring {
+	unsigned int num;
+	struct vring_desc *desc;
+	struct vring_avail *avail;
+	struct vring_used *used;
+	uint64_t log_guest_addr;
+	uint32_t flags;
+};
+
+/**
+ * struct vu_virtq - Virtqueue definition
+ * @vring:			Virtqueue rings
+ * @last_avail_idx:		Next head to pop
+ * @shadow_avail_idx:		Last avail_idx read from VQ.
+ * @used_idx:			Descriptor ring current index
+ * @signalled_used:		Last used index value we have signalled on
+ * @signalled_used_valid:	True if signalled_used is valid
+ * @notification:		True if the queues notify (via event
+ * 				index or interrupt)
+ * @inuse:			Number of entries in use
+ * @call_fd:			The event file descriptor to signal when
+ * 				buffers are used.
+ * @kick_fd:			The event file descriptor for adding
+ * 				buffers to the vring
+ * @err_fd:			The event file descriptor to signal when
+ * 				error occurs
+ * @enable:			True if the virtqueue is enabled
+ * @started:			True if the virtqueue is started
+ * @vra:			QEMU address of our rings
+ */
+struct vu_virtq {
+	struct vu_ring vring;
+	uint16_t last_avail_idx;
+	uint16_t shadow_avail_idx;
+	uint16_t used_idx;
+	uint16_t signalled_used;
+	bool signalled_used_valid;
+	bool notification;
+	unsigned int inuse;
+	int call_fd;
+	int kick_fd;
+	int err_fd;
+	unsigned int enable;
+	bool started;
+	struct vhost_vring_addr vra;
+};
+
+/**
+ * struct vu_dev_region - guest shared memory region
+ * @gpa:		Guest physical address of the region
+ * @size:		Memory size in bytes
+ * @qva:		QEMU virtual address
+ * @mmap_offset:	Offset where the region starts in the mapped memory
+ * @mmap_addr:		Address of the mapped memory
+ */
+struct vu_dev_region {
+	uint64_t gpa;
+	uint64_t size;
+	uint64_t qva;
+	uint64_t mmap_offset;
+	uint64_t mmap_addr;
+};
+
+#define VHOST_USER_MAX_QUEUES 2
+
+/*
+ * Set a reasonable maximum number of ram slots, which will be supported by
+ * any architecture.
+ */
+#define VHOST_USER_MAX_RAM_SLOTS 32
+
+/**
+ * struct vu_dev - vhost-user device information
+ * @context:		Execution context
+ * @nregions:		Number of shared memory regions
+ * @regions:		Guest shared memory regions
+ * @features:		Vhost-user features
+ * @protocol_features:	Vhost-user protocol features
+ */
+struct vu_dev {
+	uint32_t nregions;
+	struct vu_dev_region regions[VHOST_USER_MAX_RAM_SLOTS];
+	struct vu_virtq vq[VHOST_USER_MAX_QUEUES];
+	uint64_t features;
+	uint64_t protocol_features;
+};
+
+/**
+ * struct vu_virtq_element - virtqueue element
+ * @index:	Descriptor ring index
+ * @out_num:	Number of outgoing iovec buffers
+ * @in_num:	Number of incoming iovec buffers
+ * @in_sg:	Incoming iovec buffers
+ * @out_sg:	Outgoing iovec buffers
+ */
+struct vu_virtq_element {
+	unsigned int index;
+	unsigned int out_num;
+	unsigned int in_num;
+	struct iovec *in_sg;
+	struct iovec *out_sg;
+};
+
+/**
+ * has_feature() - Check a feature bit in a features set
+ * @features:	Features set
+ * @fbit:	Feature bit to check
+ *
+ * Return:	True if the feature bit is set
+ */
+static inline bool has_feature(uint64_t features, unsigned int fbit)
+{
+	return !!(features & (1ULL << fbit));
+}
+
+/**
+ * vu_has_feature() - Check if a virtio-net feature is available
+ * @vdev:	Vhost-user device
+ * @fbit:	Feature to check
+ *
+ * Return:	True if the feature is available
+ */
+static inline bool vu_has_feature(const struct vu_dev *vdev,
+				  unsigned int fbit)
+{
+	return has_feature(vdev->features, fbit);
+}
+
+/**
+ * vu_has_protocol_feature() - Check if a vhost-user feature is available
+ * @vdev:	Vhost-user device
+ * @fbit:	Feature to check
+ *
+ * Return:	True if the feature is available
+ */
+/* cppcheck-suppress unusedFunction */
+static inline bool vu_has_protocol_feature(const struct vu_dev *vdev,
+					   unsigned int fbit)
+{
+	return has_feature(vdev->protocol_features, fbit);
+}
+
+bool vu_queue_empty(struct vu_virtq *vq);
+void vu_queue_notify(const struct vu_dev *dev, struct vu_virtq *vq);
+int vu_queue_pop(struct vu_dev *dev, struct vu_virtq *vq,
+		 struct vu_virtq_element *elem);
+void vu_queue_detach_element(struct vu_virtq *vq);
+void vu_queue_unpop(struct vu_virtq *vq);
+bool vu_queue_rewind(struct vu_virtq *vq, unsigned int num);
+void vu_queue_fill_by_index(struct vu_virtq *vq, unsigned int index,
+			    unsigned int len, unsigned int idx);
+void vu_queue_fill(struct vu_virtq *vq,
+		   const struct vu_virtq_element *elem, unsigned int len,
+		   unsigned int idx);
+void vu_queue_flush(struct vu_virtq *vq, unsigned int count);
+#endif /* VIRTIO_H */

From 31117b27c6c905a6bf5fb2567f30fa2f9e0fb3cd Mon Sep 17 00:00:00 2001
From: Laurent Vivier <lvivier@redhat.com>
Date: Fri, 22 Nov 2024 17:43:30 +0100
Subject: [PATCH 135/382] vhost-user: introduce vhost-user API

Add vhost_user.c and vhost_user.h that define the functions needed
to implement a vhost-user backend.
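
As a rough sketch of the intended flow, not part of the patch: the
back-end initialises the device with vu_init(), feeds events from the
vhost-user control socket to vu_control_handler(), which reads one
message and dispatches it to the matching vu_*_exec() handler, and
finally tears everything down with vu_cleanup(). A hedged example,
where serve_vhost_user() and wait_one_event() are placeholders:

	#include <sys/epoll.h>

	static struct vu_dev vdev;

	static void serve_vhost_user(struct ctx *c, int conn_fd)
	{
		vu_init(c, &vdev);	/* queues start with fds set to -1 */

		for (;;) {
			/* e.g. epoll_wait() on conn_fd, returning its events */
			uint32_t events = wait_one_event(conn_fd);

			vu_control_handler(&vdev, conn_fd, events);

			if (events & (EPOLLRDHUP | EPOLLHUP | EPOLLERR))
				break;	/* the handler reset the socket */
		}

		vu_cleanup(&vdev);	/* unmap regions, close queue fds */
	}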

Signed-off-by: Laurent Vivier <lvivier@redhat.com>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 Makefile     |   4 +-
 vhost_user.c | 970 +++++++++++++++++++++++++++++++++++++++++++++++++++
 vhost_user.h | 208 +++++++++++
 virtio.h     |   1 +
 4 files changed, 1181 insertions(+), 2 deletions(-)
 create mode 100644 vhost_user.c
 create mode 100644 vhost_user.h

diff --git a/Makefile b/Makefile
index 9b61a47..bcb084e 100644
--- a/Makefile
+++ b/Makefile
@@ -37,7 +37,7 @@ FLAGS += -DDUAL_STACK_SOCKETS=$(DUAL_STACK_SOCKETS)
 PASST_SRCS = arch.c arp.c checksum.c conf.c dhcp.c dhcpv6.c flow.c fwd.c \
 	icmp.c igmp.c inany.c iov.c ip.c isolation.c lineread.c log.c mld.c \
 	ndp.c netlink.c packet.c passt.c pasta.c pcap.c pif.c tap.c tcp.c \
-	tcp_buf.c tcp_splice.c udp.c udp_flow.c util.c virtio.c
+	tcp_buf.c tcp_splice.c udp.c udp_flow.c util.c vhost_user.c virtio.c
 QRAP_SRCS = qrap.c
 SRCS = $(PASST_SRCS) $(QRAP_SRCS)
 
@@ -47,7 +47,7 @@ PASST_HEADERS = arch.h arp.h checksum.h conf.h dhcp.h dhcpv6.h flow.h fwd.h \
 	flow_table.h icmp.h icmp_flow.h inany.h iov.h ip.h isolation.h \
 	lineread.h log.h ndp.h netlink.h packet.h passt.h pasta.h pcap.h pif.h \
 	siphash.h tap.h tcp.h tcp_buf.h tcp_conn.h tcp_internal.h tcp_splice.h \
-	udp.h udp_flow.h util.h virtio.h
+	udp.h udp_flow.h util.h vhost_user.h virtio.h
 HEADERS = $(PASST_HEADERS) seccomp.h
 
 C := \#include <sys/random.h>\nint main(){int a=getrandom(0, 0, 0);}
diff --git a/vhost_user.c b/vhost_user.c
new file mode 100644
index 0000000..89627a2
--- /dev/null
+++ b/vhost_user.c
@@ -0,0 +1,970 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * vhost-user API, command management and virtio interface
+ *
+ * Copyright Red Hat
+ * Author: Laurent Vivier <lvivier@redhat.com>
+ *
+ * Some parts from QEMU subprojects/libvhost-user/libvhost-user.c
+ * licensed under the following terms:
+ *
+ * Copyright IBM, Corp. 2007
+ * Copyright (c) 2016 Red Hat, Inc.
+ *
+ * Authors:
+ *  Anthony Liguori <aliguori@us.ibm.com>
+ *  Marc-André Lureau <mlureau@redhat.com>
+ *  Victor Kaplansky <victork@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or
+ * later.  See the COPYING file in the top-level directory.
+ */
+
+#include <errno.h>
+#include <fcntl.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <stdint.h>
+#include <stddef.h>
+#include <string.h>
+#include <assert.h>
+#include <stdbool.h>
+#include <inttypes.h>
+#include <time.h>
+#include <net/ethernet.h>
+#include <netinet/in.h>
+#include <sys/epoll.h>
+#include <sys/eventfd.h>
+#include <sys/mman.h>
+#include <linux/vhost_types.h>
+#include <linux/virtio_net.h>
+
+#include "util.h"
+#include "passt.h"
+#include "tap.h"
+#include "vhost_user.h"
+#include "pcap.h"
+
+/* vhost-user version we are compatible with */
+#define VHOST_USER_VERSION 1
+
+/**
+ * vu_print_capabilities() - print vhost-user capabilities
+ * 			     this is part of the vhost-user backend
+ * 			     convention.
+ */
+/* cppcheck-suppress unusedFunction */
+void vu_print_capabilities(void)
+{
+	info("{");
+	info("  \"type\": \"net\"");
+	info("}");
+	exit(EXIT_SUCCESS);
+}
+
+/**
+ * vu_request_to_string() - convert a vhost-user request number to its name
+ * @req:	request number
+ *
+ * Return: the name of the request number
+ */
+static const char *vu_request_to_string(unsigned int req)
+{
+	if (req < VHOST_USER_MAX) {
+#define REQ(req) [req] = #req
+		static const char * const vu_request_str[VHOST_USER_MAX] = {
+			REQ(VHOST_USER_NONE),
+			REQ(VHOST_USER_GET_FEATURES),
+			REQ(VHOST_USER_SET_FEATURES),
+			REQ(VHOST_USER_SET_OWNER),
+			REQ(VHOST_USER_RESET_OWNER),
+			REQ(VHOST_USER_SET_MEM_TABLE),
+			REQ(VHOST_USER_SET_LOG_BASE),
+			REQ(VHOST_USER_SET_LOG_FD),
+			REQ(VHOST_USER_SET_VRING_NUM),
+			REQ(VHOST_USER_SET_VRING_ADDR),
+			REQ(VHOST_USER_SET_VRING_BASE),
+			REQ(VHOST_USER_GET_VRING_BASE),
+			REQ(VHOST_USER_SET_VRING_KICK),
+			REQ(VHOST_USER_SET_VRING_CALL),
+			REQ(VHOST_USER_SET_VRING_ERR),
+			REQ(VHOST_USER_GET_PROTOCOL_FEATURES),
+			REQ(VHOST_USER_SET_PROTOCOL_FEATURES),
+			REQ(VHOST_USER_GET_QUEUE_NUM),
+			REQ(VHOST_USER_SET_VRING_ENABLE),
+			REQ(VHOST_USER_SEND_RARP),
+			REQ(VHOST_USER_NET_SET_MTU),
+			REQ(VHOST_USER_SET_BACKEND_REQ_FD),
+			REQ(VHOST_USER_IOTLB_MSG),
+			REQ(VHOST_USER_SET_VRING_ENDIAN),
+			REQ(VHOST_USER_GET_CONFIG),
+			REQ(VHOST_USER_SET_CONFIG),
+			REQ(VHOST_USER_POSTCOPY_ADVISE),
+			REQ(VHOST_USER_POSTCOPY_LISTEN),
+			REQ(VHOST_USER_POSTCOPY_END),
+			REQ(VHOST_USER_GET_INFLIGHT_FD),
+			REQ(VHOST_USER_SET_INFLIGHT_FD),
+			REQ(VHOST_USER_GPU_SET_SOCKET),
+			REQ(VHOST_USER_VRING_KICK),
+			REQ(VHOST_USER_GET_MAX_MEM_SLOTS),
+			REQ(VHOST_USER_ADD_MEM_REG),
+			REQ(VHOST_USER_REM_MEM_REG),
+		};
+#undef REQ
+		return vu_request_str[req];
+	}
+
+	return "unknown";
+}
+
+/**
+ * qva_to_va() -  Translate front-end (QEMU) virtual address to our virtual
+ * 		  address
+ * @dev:		vhost-user device
+ * @qemu_addr:		front-end userspace address
+ *
+ * Return: the memory address in our process virtual address space.
+ */
+static void *qva_to_va(struct vu_dev *dev, uint64_t qemu_addr)
+{
+	unsigned int i;
+
+	/* Find matching memory region.  */
+	for (i = 0; i < dev->nregions; i++) {
+		const struct vu_dev_region *r = &dev->regions[i];
+
+		if ((qemu_addr >= r->qva) && (qemu_addr < (r->qva + r->size))) {
+			/* NOLINTNEXTLINE(performance-no-int-to-ptr) */
+			return (void *)(qemu_addr - r->qva + r->mmap_addr +
+					r->mmap_offset);
+		}
+	}
+
+	return NULL;
+}
+
+/**
+ * vmsg_close_fds() - Close all file descriptors of a given message
+ * @vmsg:	vhost-user message with the list of the file descriptors
+ */
+static void vmsg_close_fds(const struct vhost_user_msg *vmsg)
+{
+	int i;
+
+	for (i = 0; i < vmsg->fd_num; i++)
+		close(vmsg->fds[i]);
+}
+
+/**
+ * vu_remove_watch() - Remove a file descriptor from our passt epoll
+ * 		       file descriptor
+ * @vdev:	vhost-user device
+ * @fd:		file descriptor to remove
+ */
+static void vu_remove_watch(const struct vu_dev *vdev, int fd)
+{
+	/* Placeholder to add passt related code */
+	(void)vdev;
+	(void)fd;
+}
+
+/**
+ * vmsg_set_reply_u64() - Set reply payload.u64 and clear request flags
+ * 			  and fd_num
+ * @vmsg:	vhost-user message
+ * @val:	64-bit value to reply
+ */
+static void vmsg_set_reply_u64(struct vhost_user_msg *vmsg, uint64_t val)
+{
+	vmsg->hdr.flags = 0; /* defaults will be set by vu_send_reply() */
+	vmsg->hdr.size = sizeof(vmsg->payload.u64);
+	vmsg->payload.u64 = val;
+	vmsg->fd_num = 0;
+}
+
+/**
+ * vu_message_read_default() - Read incoming vhost-user message from the
+ * 			       front-end
+ * @conn_fd:	vhost-user command socket
+ * @vmsg:	vhost-user message
+ *
+ * Return:  0 if recvmsg() has been interrupted or if there's no data to read,
+ *          1 if a message has been received
+ */
+static int vu_message_read_default(int conn_fd, struct vhost_user_msg *vmsg)
+{
+	char control[CMSG_SPACE(VHOST_MEMORY_BASELINE_NREGIONS *
+		     sizeof(int))] = { 0 };
+	struct iovec iov = {
+		.iov_base = (char *)vmsg,
+		.iov_len = VHOST_USER_HDR_SIZE,
+	};
+	struct msghdr msg = {
+		.msg_iov = &iov,
+		.msg_iovlen = 1,
+		.msg_control = control,
+		.msg_controllen = sizeof(control),
+	};
+	ssize_t ret, sz_payload;
+	struct cmsghdr *cmsg;
+
+	ret = recvmsg(conn_fd, &msg, MSG_DONTWAIT);
+	if (ret < 0) {
+		if (errno == EINTR || errno == EAGAIN || errno == EWOULDBLOCK)
+			return 0;
+		die_perror("vhost-user message receive (recvmsg)");
+	}
+
+	vmsg->fd_num = 0;
+	for (cmsg = CMSG_FIRSTHDR(&msg); cmsg != NULL;
+	     cmsg = CMSG_NXTHDR(&msg, cmsg)) {
+		if (cmsg->cmsg_level == SOL_SOCKET &&
+		    cmsg->cmsg_type == SCM_RIGHTS) {
+			size_t fd_size;
+
+			ASSERT(cmsg->cmsg_len >= CMSG_LEN(0));
+			fd_size = cmsg->cmsg_len - CMSG_LEN(0);
+			ASSERT(fd_size <= sizeof(vmsg->fds));
+			vmsg->fd_num = fd_size / sizeof(int);
+			memcpy(vmsg->fds, CMSG_DATA(cmsg), fd_size);
+			break;
+		}
+	}
+
+	sz_payload = vmsg->hdr.size;
+	if ((size_t)sz_payload > sizeof(vmsg->payload)) {
+		die("vhost-user message request too big: %d,"
+			 " size: vmsg->size: %zd, "
+			 "while sizeof(vmsg->payload) = %zu",
+			 vmsg->hdr.request, sz_payload, sizeof(vmsg->payload));
+	}
+
+	if (sz_payload) {
+		do
+			ret = recv(conn_fd, &vmsg->payload, sz_payload, 0);
+		while (ret < 0 && errno == EINTR);
+
+		if (ret < 0)
+			die_perror("vhost-user message receive");
+
+		if (ret == 0)
+			die("EOF on vhost-user message receive");
+
+		if (ret < sz_payload)
+			die("Short-read on vhost-user message receive");
+	}
+
+	return 1;
+}
+
+/**
+ * vu_message_write() - Send a message to the front-end
+ * @conn_fd:	vhost-user command socket
+ * @vmsg:	vhost-user message
+ *
+ * #syscalls:vu sendmsg
+ */
+static void vu_message_write(int conn_fd, struct vhost_user_msg *vmsg)
+{
+	char control[CMSG_SPACE(VHOST_MEMORY_BASELINE_NREGIONS * sizeof(int))] = { 0 };
+	struct iovec iov = {
+		.iov_base = (char *)vmsg,
+		.iov_len = VHOST_USER_HDR_SIZE + vmsg->hdr.size,
+	};
+	struct msghdr msg = {
+		.msg_iov = &iov,
+		.msg_iovlen = 1,
+		.msg_control = control,
+	};
+	int rc;
+
+	ASSERT(vmsg->fd_num <= VHOST_MEMORY_BASELINE_NREGIONS);
+	if (vmsg->fd_num > 0) {
+		size_t fdsize = vmsg->fd_num * sizeof(int);
+		struct cmsghdr *cmsg;
+
+		msg.msg_controllen = CMSG_SPACE(fdsize);
+		cmsg = CMSG_FIRSTHDR(&msg);
+		cmsg->cmsg_len = CMSG_LEN(fdsize);
+		cmsg->cmsg_level = SOL_SOCKET;
+		cmsg->cmsg_type = SCM_RIGHTS;
+		memcpy(CMSG_DATA(cmsg), vmsg->fds, fdsize);
+	}
+
+	do
+		rc = sendmsg(conn_fd, &msg, 0);
+	while (rc < 0 && errno == EINTR);
+
+	if (rc < 0)
+		die_perror("vhost-user message send");
+
+	if ((uint32_t)rc < VHOST_USER_HDR_SIZE + vmsg->hdr.size)
+		die("EOF on vhost-user message send");
+}
+
+/**
+ * vu_send_reply() - Update message flags and send it to front-end
+ * @conn_fd:	vhost-user command socket
+ * @vmsg:	vhost-user message
+ */
+static void vu_send_reply(int conn_fd, struct vhost_user_msg *msg)
+{
+	msg->hdr.flags &= ~VHOST_USER_VERSION_MASK;
+	msg->hdr.flags |= VHOST_USER_VERSION;
+	msg->hdr.flags |= VHOST_USER_REPLY_MASK;
+
+	vu_message_write(conn_fd, msg);
+}
+
+/**
+ * vu_get_features_exec() - Provide back-end features bitmask to front-end
+ * @vdev:	vhost-user device
+ * @vmsg:	vhost-user message
+ *
+ * Return: True as a reply is requested
+ */
+static bool vu_get_features_exec(struct vu_dev *vdev,
+				 struct vhost_user_msg *msg)
+{
+	uint64_t features =
+		1ULL << VIRTIO_F_VERSION_1 |
+		1ULL << VIRTIO_NET_F_MRG_RXBUF |
+		1ULL << VHOST_USER_F_PROTOCOL_FEATURES;
+
+	(void)vdev;
+
+	vmsg_set_reply_u64(msg, features);
+
+	debug("Sending back to guest u64: 0x%016"PRIx64, msg->payload.u64);
+
+	return true;
+}
+
+/**
+ * vu_set_enable_all_rings() - Enable/disable all the virtqueues
+ * @vdev:	vhost-user device
+ * @enable:	New virtqueues state
+ */
+static void vu_set_enable_all_rings(struct vu_dev *vdev, bool enable)
+{
+	uint16_t i;
+
+	for (i = 0; i < VHOST_USER_MAX_QUEUES; i++)
+		vdev->vq[i].enable = enable;
+}
+
+/**
+ * vu_set_features_exec() - Enable features of the back-end
+ * @vdev:	vhost-user device
+ * @vmsg:	vhost-user message
+ *
+ * Return: False as no reply is requested
+ */
+static bool vu_set_features_exec(struct vu_dev *vdev,
+				 struct vhost_user_msg *msg)
+{
+	debug("u64: 0x%016"PRIx64, msg->payload.u64);
+
+	vdev->features = msg->payload.u64;
+	/* We only support devices conforming to VIRTIO 1.0 or
+	 * later
+	 */
+	if (!vu_has_feature(vdev, VIRTIO_F_VERSION_1))
+		die("virtio legacy devices aren't supported by passt");
+
+	if (!vu_has_feature(vdev, VHOST_USER_F_PROTOCOL_FEATURES))
+		vu_set_enable_all_rings(vdev, true);
+
+	return false;
+}
+
+/**
+ * vu_set_owner_exec() - Session start flag, do nothing in our case
+ * @vdev:	vhost-user device
+ * @vmsg:	vhost-user message
+ *
+ * Return: False as no reply is requested
+ */
+static bool vu_set_owner_exec(struct vu_dev *vdev,
+			      struct vhost_user_msg *msg)
+{
+	(void)vdev;
+	(void)msg;
+
+	return false;
+}
+
+/**
+ * map_ring() - Convert ring front-end (QEMU) addresses to our process
+ * 		virtual address space.
+ * @vdev:	vhost-user device
+ * @vq:		Virtqueue
+ *
+ * Return: True if ring cannot be mapped to our address space
+ */
+static bool map_ring(struct vu_dev *vdev, struct vu_virtq *vq)
+{
+	vq->vring.desc = qva_to_va(vdev, vq->vra.desc_user_addr);
+	vq->vring.used = qva_to_va(vdev, vq->vra.used_user_addr);
+	vq->vring.avail = qva_to_va(vdev, vq->vra.avail_user_addr);
+
+	debug("Setting virtq addresses:");
+	debug("    vring_desc  at %p", (void *)vq->vring.desc);
+	debug("    vring_used  at %p", (void *)vq->vring.used);
+	debug("    vring_avail at %p", (void *)vq->vring.avail);
+
+	return !(vq->vring.desc && vq->vring.used && vq->vring.avail);
+}
+
+/**
+ * vu_set_mem_table_exec() - Set the memory map regions, used to translate
+ * 			     the vring addresses
+ * @vdev:	vhost-user device
+ * @vmsg:	vhost-user message
+ *
+ * Return: False as no reply is requested
+ *
+ * #syscalls:vu mmap munmap
+ */
+static bool vu_set_mem_table_exec(struct vu_dev *vdev,
+				  struct vhost_user_msg *msg)
+{
+	struct vhost_user_memory m = msg->payload.memory, *memory = &m;
+	unsigned int i;
+
+	for (i = 0; i < vdev->nregions; i++) {
+		const struct vu_dev_region *r = &vdev->regions[i];
+
+		if (r->mmap_addr) {
+			/* NOLINTNEXTLINE(performance-no-int-to-ptr) */
+			munmap((void *)r->mmap_addr, r->size + r->mmap_offset);
+		}
+	}
+	vdev->nregions = memory->nregions;
+
+	debug("vhost-user nregions: %u", memory->nregions);
+	for (i = 0; i < vdev->nregions; i++) {
+		struct vhost_user_memory_region *msg_region = &memory->regions[i];
+		struct vu_dev_region *dev_region = &vdev->regions[i];
+		void *mmap_addr;
+
+		debug("vhost-user region %d", i);
+		debug("    guest_phys_addr: 0x%016"PRIx64,
+		      msg_region->guest_phys_addr);
+		debug("    memory_size:     0x%016"PRIx64,
+		      msg_region->memory_size);
+		debug("    userspace_addr   0x%016"PRIx64,
+		      msg_region->userspace_addr);
+		debug("    mmap_offset      0x%016"PRIx64,
+		      msg_region->mmap_offset);
+
+		dev_region->gpa = msg_region->guest_phys_addr;
+		dev_region->size = msg_region->memory_size;
+		dev_region->qva = msg_region->userspace_addr;
+		dev_region->mmap_offset = msg_region->mmap_offset;
+
+		/* We don't use offset argument of mmap() since the
+		 * mapped address has to be page aligned.
+		 */
+		mmap_addr = mmap(0, dev_region->size + dev_region->mmap_offset,
+				 PROT_READ | PROT_WRITE, MAP_SHARED |
+				 MAP_NORESERVE, msg->fds[i], 0);
+
+		if (mmap_addr == MAP_FAILED)
+			die_perror("vhost-user region mmap error");
+
+		dev_region->mmap_addr = (uint64_t)(uintptr_t)mmap_addr;
+		debug("    mmap_addr:       0x%016"PRIx64,
+		      dev_region->mmap_addr);
+
+		close(msg->fds[i]);
+	}
+
+	for (i = 0; i < VHOST_USER_MAX_QUEUES; i++) {
+		if (vdev->vq[i].vring.desc) {
+			if (map_ring(vdev, &vdev->vq[i]))
+				die("remapping queue %d during setmemtable", i);
+		}
+	}
+
+	return false;
+}
+
+/**
+ * vu_set_vring_num_exec() - Set the size of the queue (vring size)
+ * @vdev:	vhost-user device
+ * @vmsg:	vhost-user message
+ *
+ * Return: False as no reply is requested
+ */
+static bool vu_set_vring_num_exec(struct vu_dev *vdev,
+				  struct vhost_user_msg *msg)
+{
+	unsigned int idx = msg->payload.state.index;
+	unsigned int num = msg->payload.state.num;
+
+	debug("State.index: %u", idx);
+	debug("State.num:   %u", num);
+	vdev->vq[idx].vring.num = num;
+
+	return false;
+}
+
+/**
+ * vu_set_vring_addr_exec() - Set the addresses of the vring
+ * @vdev:	vhost-user device
+ * @vmsg:	vhost-user message
+ *
+ * Return: False as no reply is requested
+ */
+static bool vu_set_vring_addr_exec(struct vu_dev *vdev,
+				   struct vhost_user_msg *msg)
+{
+	/* We need to copy the payload to a vhost_vring_addr structure
+	 * to access its index field, because the address of
+	 * msg->payload.addr can be unaligned as it is packed.
+	 */
+	struct vhost_vring_addr addr = msg->payload.addr;
+	struct vu_virtq *vq = &vdev->vq[addr.index];
+
+	debug("vhost_vring_addr:");
+	debug("    index:  %d", addr.index);
+	debug("    flags:  %d", addr.flags);
+	debug("    desc_user_addr:   0x%016" PRIx64,
+	      (uint64_t)addr.desc_user_addr);
+	debug("    used_user_addr:   0x%016" PRIx64,
+	      (uint64_t)addr.used_user_addr);
+	debug("    avail_user_addr:  0x%016" PRIx64,
+	      (uint64_t)addr.avail_user_addr);
+	debug("    log_guest_addr:   0x%016" PRIx64,
+	      (uint64_t)addr.log_guest_addr);
+
+	vq->vra = msg->payload.addr;
+	vq->vring.flags = addr.flags;
+	vq->vring.log_guest_addr = addr.log_guest_addr;
+
+	if (map_ring(vdev, vq))
+		die("Invalid vring_addr message");
+
+	vq->used_idx = le16toh(vq->vring.used->idx);
+
+	if (vq->last_avail_idx != vq->used_idx) {
+		debug("Last avail index != used index: %u != %u",
+		      vq->last_avail_idx, vq->used_idx);
+	}
+
+	return false;
+}
+/**
+ * vu_set_vring_base_exec() - Sets the next index to use for descriptors
+ * 			      in this vring
+ * @vdev:	vhost-user device
+ * @vmsg:	vhost-user message
+ *
+ * Return: False as no reply is requested
+ */
+static bool vu_set_vring_base_exec(struct vu_dev *vdev,
+				   struct vhost_user_msg *msg)
+{
+	unsigned int idx = msg->payload.state.index;
+	unsigned int num = msg->payload.state.num;
+
+	debug("State.index: %u", idx);
+	debug("State.num:   %u", num);
+	vdev->vq[idx].shadow_avail_idx = vdev->vq[idx].last_avail_idx = num;
+
+	return false;
+}
+
+/**
+ * vu_get_vring_base_exec() - Stops the vring and returns the current
+ * 			      descriptor index or indices
+ * @vdev:	vhost-user device
+ * @vmsg:	vhost-user message
+ *
+ * Return: True as a reply is requested
+ */
+static bool vu_get_vring_base_exec(struct vu_dev *vdev,
+				   struct vhost_user_msg *msg)
+{
+	unsigned int idx = msg->payload.state.index;
+
+	debug("State.index: %u", idx);
+	msg->payload.state.num = vdev->vq[idx].last_avail_idx;
+	msg->hdr.size = sizeof(msg->payload.state);
+
+	vdev->vq[idx].started = false;
+
+	if (vdev->vq[idx].call_fd != -1) {
+		close(vdev->vq[idx].call_fd);
+		vdev->vq[idx].call_fd = -1;
+	}
+	if (vdev->vq[idx].kick_fd != -1) {
+		vu_remove_watch(vdev, vdev->vq[idx].kick_fd);
+		close(vdev->vq[idx].kick_fd);
+		vdev->vq[idx].kick_fd = -1;
+	}
+
+	return true;
+}
+
+/**
+ * vu_set_watch() - Add a file descriptor to the passt epoll file descriptor
+ * @vdev:	vhost-user device
+ * @idx:	queue index of the file descriptor to add
+ */
+static void vu_set_watch(const struct vu_dev *vdev, int idx)
+{
+	/* Placeholder to add passt related code */
+	(void)vdev;
+	(void)idx;
+}
+
+/**
+ * vu_check_queue_msg_file() - Check if a message is valid,
+ * 			       close fds if NOFD bit is set
+ * @vmsg:	vhost-user message
+ */
+static void vu_check_queue_msg_file(struct vhost_user_msg *msg)
+{
+	bool nofd = msg->payload.u64 & VHOST_USER_VRING_NOFD_MASK;
+	int idx = msg->payload.u64 & VHOST_USER_VRING_IDX_MASK;
+
+	if (idx >= VHOST_USER_MAX_QUEUES)
+		die("Invalid vhost-user queue index: %u", idx);
+
+	if (nofd) {
+		vmsg_close_fds(msg);
+		return;
+	}
+
+	if (msg->fd_num != 1)
+		die("Invalid fds in vhost-user request: %d", msg->hdr.request);
+}
+
+/**
+ * vu_set_vring_kick_exec() - Set the event file descriptor for adding buffers
+ * 			      to the vring
+ * @vdev:	vhost-user device
+ * @vmsg:	vhost-user message
+ *
+ * Return: False as no reply is requested
+ */
+static bool vu_set_vring_kick_exec(struct vu_dev *vdev,
+				   struct vhost_user_msg *msg)
+{
+	bool nofd = msg->payload.u64 & VHOST_USER_VRING_NOFD_MASK;
+	int idx = msg->payload.u64 & VHOST_USER_VRING_IDX_MASK;
+
+	debug("u64: 0x%016"PRIx64, msg->payload.u64);
+
+	vu_check_queue_msg_file(msg);
+
+	if (vdev->vq[idx].kick_fd != -1) {
+		vu_remove_watch(vdev, vdev->vq[idx].kick_fd);
+		close(vdev->vq[idx].kick_fd);
+		vdev->vq[idx].kick_fd = -1;
+	}
+
+	if (!nofd)
+		vdev->vq[idx].kick_fd = msg->fds[0];
+
+	debug("Got kick_fd: %d for vq: %d", vdev->vq[idx].kick_fd, idx);
+
+	vdev->vq[idx].started = true;
+
+	if (vdev->vq[idx].kick_fd != -1 && VHOST_USER_IS_QUEUE_TX(idx)) {
+		vu_set_watch(vdev, idx);
+		debug("Waiting for kicks on fd: %d for vq: %d",
+		      vdev->vq[idx].kick_fd, idx);
+	}
+
+	return false;
+}
+
+/**
+ * vu_set_vring_call_exec() - Set the event file descriptor to signal when
+ * 			      buffers are used
+ * @vdev:	vhost-user device
+ * @vmsg:	vhost-user message
+ *
+ * Return: False as no reply is requested
+ */
+static bool vu_set_vring_call_exec(struct vu_dev *vdev,
+				   struct vhost_user_msg *msg)
+{
+	bool nofd = msg->payload.u64 & VHOST_USER_VRING_NOFD_MASK;
+	int idx = msg->payload.u64 & VHOST_USER_VRING_IDX_MASK;
+
+	debug("u64: 0x%016"PRIx64, msg->payload.u64);
+
+	vu_check_queue_msg_file(msg);
+
+	if (vdev->vq[idx].call_fd != -1) {
+		close(vdev->vq[idx].call_fd);
+		vdev->vq[idx].call_fd = -1;
+	}
+
+	if (!nofd)
+		vdev->vq[idx].call_fd = msg->fds[0];
+
+	/* in case of I/O hang after reconnecting */
+	if (vdev->vq[idx].call_fd != -1)
+		eventfd_write(msg->fds[0], 1);
+
+	debug("Got call_fd: %d for vq: %d", vdev->vq[idx].call_fd, idx);
+
+	return false;
+}
+
+/**
+ * vu_set_vring_err_exec() - Set the event file descriptor to signal when
+ * 			     error occurs
+ * @vdev:	vhost-user device
+ * @vmsg:	vhost-user message
+ *
+ * Return: False as no reply is requested
+ */
+static bool vu_set_vring_err_exec(struct vu_dev *vdev,
+				  struct vhost_user_msg *msg)
+{
+	bool nofd = msg->payload.u64 & VHOST_USER_VRING_NOFD_MASK;
+	int idx = msg->payload.u64 & VHOST_USER_VRING_IDX_MASK;
+
+	debug("u64: 0x%016"PRIx64, msg->payload.u64);
+
+	vu_check_queue_msg_file(msg);
+
+	if (vdev->vq[idx].err_fd != -1) {
+		close(vdev->vq[idx].err_fd);
+		vdev->vq[idx].err_fd = -1;
+	}
+
+	if (!nofd)
+		vdev->vq[idx].err_fd = msg->fds[0];
+
+	return false;
+}
+
+/**
+ * vu_get_protocol_features_exec() - Provide the protocol (vhost-user) features
+ * 				     to the front-end
+ * @vdev:	vhost-user device
+ * @vmsg:	vhost-user message
+ *
+ * Return: True as a reply is requested
+ */
+static bool vu_get_protocol_features_exec(struct vu_dev *vdev,
+					  struct vhost_user_msg *msg)
+{
+	uint64_t features = 1ULL << VHOST_USER_PROTOCOL_F_REPLY_ACK;
+
+	(void)vdev;
+	vmsg_set_reply_u64(msg, features);
+
+	return true;
+}
+
+/**
+ * vu_set_protocol_features_exec() - Enable protocol (vhost-user) features
+ * @vdev:	vhost-user device
+ * @vmsg:	vhost-user message
+ *
+ * Return: False as no reply is requested
+ */
+static bool vu_set_protocol_features_exec(struct vu_dev *vdev,
+					  struct vhost_user_msg *msg)
+{
+	uint64_t features = msg->payload.u64;
+
+	debug("u64: 0x%016"PRIx64, features);
+
+	vdev->protocol_features = msg->payload.u64;
+
+	return false;
+}
+
+/**
+ * vu_get_queue_num_exec() - Tell how many queues we support
+ * @vdev:	vhost-user device
+ * @vmsg:	vhost-user message
+ *
+ * Return: True as a reply is requested
+ */
+static bool vu_get_queue_num_exec(struct vu_dev *vdev,
+				  struct vhost_user_msg *msg)
+{
+	(void)vdev;
+
+	vmsg_set_reply_u64(msg, VHOST_USER_MAX_QUEUES);
+
+	return true;
+}
+
+/**
+ * vu_set_vring_enable_exec() - Enable or disable corresponding vring
+ * @vdev:	vhost-user device
+ * @vmsg:	vhost-user message
+ *
+ * Return: False as no reply is requested
+ */
+static bool vu_set_vring_enable_exec(struct vu_dev *vdev,
+				     struct vhost_user_msg *msg)
+{
+	unsigned int enable = msg->payload.state.num;
+	unsigned int idx = msg->payload.state.index;
+
+	debug("State.index:  %u", idx);
+	debug("State.enable: %u", enable);
+
+	if (idx >= VHOST_USER_MAX_QUEUES)
+		die("Invalid vring_enable index: %u", idx);
+
+	vdev->vq[idx].enable = enable;
+	return false;
+}
+
+/**
+ * vu_init() - Initialize vhost-user device structure
+ * @c:		execution context
+ * @vdev:	vhost-user device
+ */
+/* cppcheck-suppress unusedFunction */
+void vu_init(struct ctx *c, struct vu_dev *vdev)
+{
+	int i;
+
+	vdev->context = c;
+	for (i = 0; i < VHOST_USER_MAX_QUEUES; i++) {
+		vdev->vq[i] = (struct vu_virtq){
+			.call_fd = -1,
+			.kick_fd = -1,
+			.err_fd = -1,
+			.notification = true,
+		};
+	}
+}
+
+/**
+ * vu_cleanup() - Reset vhost-user device
+ * @vdev:	vhost-user device
+ */
+/* cppcheck-suppress unusedFunction */
+void vu_cleanup(struct vu_dev *vdev)
+{
+	unsigned int i;
+
+	for (i = 0; i < VHOST_USER_MAX_QUEUES; i++) {
+		struct vu_virtq *vq = &vdev->vq[i];
+
+		vq->started = false;
+		vq->notification = true;
+
+		if (vq->call_fd != -1) {
+			close(vq->call_fd);
+			vq->call_fd = -1;
+		}
+		if (vq->err_fd != -1) {
+			close(vq->err_fd);
+			vq->err_fd = -1;
+		}
+		if (vq->kick_fd != -1) {
+			vu_remove_watch(vdev, vq->kick_fd);
+			close(vq->kick_fd);
+			vq->kick_fd = -1;
+		}
+
+		vq->vring.desc = 0;
+		vq->vring.used = 0;
+		vq->vring.avail = 0;
+	}
+
+	for (i = 0; i < vdev->nregions; i++) {
+		const struct vu_dev_region *r = &vdev->regions[i];
+
+		if (r->mmap_addr) {
+			/* NOLINTNEXTLINE(performance-no-int-to-ptr) */
+			munmap((void *)r->mmap_addr, r->size + r->mmap_offset);
+		}
+	}
+	vdev->nregions = 0;
+}
+
+/**
+ * vu_sock_reset() - Reset connection socket
+ * @vdev:	vhost-user device
+ */
+static void vu_sock_reset(struct vu_dev *vdev)
+{
+	/* Placeholder to add passt related code */
+	(void)vdev;
+}
+
+static bool (*vu_handle[VHOST_USER_MAX])(struct vu_dev *vdev,
+					struct vhost_user_msg *msg) = {
+	[VHOST_USER_GET_FEATURES]	   = vu_get_features_exec,
+	[VHOST_USER_SET_FEATURES]	   = vu_set_features_exec,
+	[VHOST_USER_GET_PROTOCOL_FEATURES] = vu_get_protocol_features_exec,
+	[VHOST_USER_SET_PROTOCOL_FEATURES] = vu_set_protocol_features_exec,
+	[VHOST_USER_GET_QUEUE_NUM]	   = vu_get_queue_num_exec,
+	[VHOST_USER_SET_OWNER]		   = vu_set_owner_exec,
+	[VHOST_USER_SET_MEM_TABLE]	   = vu_set_mem_table_exec,
+	[VHOST_USER_SET_VRING_NUM]	   = vu_set_vring_num_exec,
+	[VHOST_USER_SET_VRING_ADDR]	   = vu_set_vring_addr_exec,
+	[VHOST_USER_SET_VRING_BASE]	   = vu_set_vring_base_exec,
+	[VHOST_USER_GET_VRING_BASE]	   = vu_get_vring_base_exec,
+	[VHOST_USER_SET_VRING_KICK]	   = vu_set_vring_kick_exec,
+	[VHOST_USER_SET_VRING_CALL]	   = vu_set_vring_call_exec,
+	[VHOST_USER_SET_VRING_ERR]	   = vu_set_vring_err_exec,
+	[VHOST_USER_SET_VRING_ENABLE]	   = vu_set_vring_enable_exec,
+};
+
+/**
+ * vu_control_handler() - Handle control commands for vhost-user
+ * @vdev:	vhost-user device
+ * @fd:		vhost-user message socket
+ * @events:	epoll events
+ */
+/* cppcheck-suppress unusedFunction */
+void vu_control_handler(struct vu_dev *vdev, int fd, uint32_t events)
+{
+	struct vhost_user_msg msg = { 0 };
+	bool need_reply, reply_requested;
+	int ret;
+
+	if (events & (EPOLLRDHUP | EPOLLHUP | EPOLLERR)) {
+		vu_sock_reset(vdev);
+		return;
+	}
+
+	ret = vu_message_read_default(fd, &msg);
+	if (ret == 0) {
+		vu_sock_reset(vdev);
+		return;
+	}
+	debug("================ Vhost user message ================");
+	debug("Request: %s (%d)", vu_request_to_string(msg.hdr.request),
+		msg.hdr.request);
+	debug("Flags:   0x%x", msg.hdr.flags);
+	debug("Size:    %u", msg.hdr.size);
+
+	need_reply = msg.hdr.flags & VHOST_USER_NEED_REPLY_MASK;
+
+	if (msg.hdr.request >= 0 && msg.hdr.request < VHOST_USER_MAX &&
+	    vu_handle[msg.hdr.request])
+		reply_requested = vu_handle[msg.hdr.request](vdev, &msg);
+	else
+		die("Unhandled request: %d", msg.hdr.request);
+
+	/* cppcheck-suppress legacyUninitvar */
+	if (!reply_requested && need_reply) {
+		msg.payload.u64 = 0;
+		msg.hdr.flags = 0;
+		msg.hdr.size = sizeof(msg.payload.u64);
+		msg.fd_num = 0;
+		reply_requested = true;
+	}
+
+	if (reply_requested)
+		vu_send_reply(fd, &msg);
+}
diff --git a/vhost_user.h b/vhost_user.h
new file mode 100644
index 0000000..5af349b
--- /dev/null
+++ b/vhost_user.h
@@ -0,0 +1,208 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * vhost-user API, command management and virtio interface
+ *
+ * Copyright Red Hat
+ * Author: Laurent Vivier <lvivier@redhat.com>
+ */
+
+/* some parts from subprojects/libvhost-user/libvhost-user.h */
+
+#ifndef VHOST_USER_H
+#define VHOST_USER_H
+
+#include "virtio.h"
+#include "iov.h"
+
+#define VHOST_USER_F_PROTOCOL_FEATURES 30
+
+#define VHOST_MEMORY_BASELINE_NREGIONS 8
+
+/**
+ * enum vhost_user_protocol_feature - Available vhost-user protocol features
+ */
+enum vhost_user_protocol_feature {
+	VHOST_USER_PROTOCOL_F_MQ = 0,
+	VHOST_USER_PROTOCOL_F_LOG_SHMFD = 1,
+	VHOST_USER_PROTOCOL_F_RARP = 2,
+	VHOST_USER_PROTOCOL_F_REPLY_ACK = 3,
+	VHOST_USER_PROTOCOL_F_NET_MTU = 4,
+	VHOST_USER_PROTOCOL_F_BACKEND_REQ = 5,
+	VHOST_USER_PROTOCOL_F_CROSS_ENDIAN = 6,
+	VHOST_USER_PROTOCOL_F_CRYPTO_SESSION = 7,
+	VHOST_USER_PROTOCOL_F_PAGEFAULT = 8,
+	VHOST_USER_PROTOCOL_F_CONFIG = 9,
+	VHOST_USER_PROTOCOL_F_SLAVE_SEND_FD = 10,
+	VHOST_USER_PROTOCOL_F_HOST_NOTIFIER = 11,
+	VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD = 12,
+	VHOST_USER_PROTOCOL_F_INBAND_NOTIFICATIONS = 14,
+	VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS = 15,
+
+	VHOST_USER_PROTOCOL_F_MAX
+};
+
+/**
+ * enum vhost_user_request - List of available vhost-user requests
+ */
+enum vhost_user_request {
+	VHOST_USER_NONE = 0,
+	VHOST_USER_GET_FEATURES = 1,
+	VHOST_USER_SET_FEATURES = 2,
+	VHOST_USER_SET_OWNER = 3,
+	VHOST_USER_RESET_OWNER = 4,
+	VHOST_USER_SET_MEM_TABLE = 5,
+	VHOST_USER_SET_LOG_BASE = 6,
+	VHOST_USER_SET_LOG_FD = 7,
+	VHOST_USER_SET_VRING_NUM = 8,
+	VHOST_USER_SET_VRING_ADDR = 9,
+	VHOST_USER_SET_VRING_BASE = 10,
+	VHOST_USER_GET_VRING_BASE = 11,
+	VHOST_USER_SET_VRING_KICK = 12,
+	VHOST_USER_SET_VRING_CALL = 13,
+	VHOST_USER_SET_VRING_ERR = 14,
+	VHOST_USER_GET_PROTOCOL_FEATURES = 15,
+	VHOST_USER_SET_PROTOCOL_FEATURES = 16,
+	VHOST_USER_GET_QUEUE_NUM = 17,
+	VHOST_USER_SET_VRING_ENABLE = 18,
+	VHOST_USER_SEND_RARP = 19,
+	VHOST_USER_NET_SET_MTU = 20,
+	VHOST_USER_SET_BACKEND_REQ_FD = 21,
+	VHOST_USER_IOTLB_MSG = 22,
+	VHOST_USER_SET_VRING_ENDIAN = 23,
+	VHOST_USER_GET_CONFIG = 24,
+	VHOST_USER_SET_CONFIG = 25,
+	VHOST_USER_CREATE_CRYPTO_SESSION = 26,
+	VHOST_USER_CLOSE_CRYPTO_SESSION = 27,
+	VHOST_USER_POSTCOPY_ADVISE  = 28,
+	VHOST_USER_POSTCOPY_LISTEN  = 29,
+	VHOST_USER_POSTCOPY_END     = 30,
+	VHOST_USER_GET_INFLIGHT_FD = 31,
+	VHOST_USER_SET_INFLIGHT_FD = 32,
+	VHOST_USER_GPU_SET_SOCKET = 33,
+	VHOST_USER_VRING_KICK = 35,
+	VHOST_USER_GET_MAX_MEM_SLOTS = 36,
+	VHOST_USER_ADD_MEM_REG = 37,
+	VHOST_USER_REM_MEM_REG = 38,
+	VHOST_USER_MAX
+};
+
+/**
+ * struct vhost_user_header - vhost-user message header
+ * @request:	Request type of the message
+ * @flags:	Request flags
+ * @size:	Size of the following payload
+ */
+struct vhost_user_header {
+	enum vhost_user_request request;
+
+#define VHOST_USER_VERSION_MASK     0x3
+#define VHOST_USER_REPLY_MASK       (0x1 << 2)
+#define VHOST_USER_NEED_REPLY_MASK  (0x1 << 3)
+	uint32_t flags;
+	uint32_t size;
+} __attribute__ ((__packed__));
+
+/**
+ * struct vhost_user_memory_region - Front-end shared memory region information
+ * @guest_phys_addr:	Guest physical address of the region
+ * @memory_size:	Memory size
+ * @userspace_addr:	Front-end (QEMU) userspace address
+ * @mmap_offset:	Region offset in the shared memory area
+ */
+struct vhost_user_memory_region {
+	uint64_t guest_phys_addr;
+	uint64_t memory_size;
+	uint64_t userspace_addr;
+	uint64_t mmap_offset;
+};
+
+/**
+ * struct vhost_user_memory - List of all the shared memory regions
+ * @nregions:	Number of memory regions
+ * @padding:	Padding
+ * @regions:	Memory regions list
+ */
+struct vhost_user_memory {
+	uint32_t nregions;
+	uint32_t padding;
+	struct vhost_user_memory_region regions[VHOST_MEMORY_BASELINE_NREGIONS];
+};
+
+/**
+ * union vhost_user_payload - vhost-user message payload
+ * @u64:		64-bit payload
+ * @state:		vring state payload
+ * @addr:		vring addresses payload
+ * @memory:		Memory regions information payload
+ */
+union vhost_user_payload {
+#define VHOST_USER_VRING_IDX_MASK   0xff
+#define VHOST_USER_VRING_NOFD_MASK  (0x1 << 8)
+	uint64_t u64;
+	struct vhost_vring_state state;
+	struct vhost_vring_addr addr;
+	struct vhost_user_memory memory;
+};
+
+/**
+ * struct vhost_user_msg - vhost-user message
+ * @hdr:		Message header
+ * @payload:		Message payload
+ * @fds:		File descriptors associated with the message
+ * 			in the ancillary data (shared memory or
+ * 			event file descriptors)
+ * @fd_num:		Number of file descriptors
+ */
+struct vhost_user_msg {
+	struct vhost_user_header hdr;
+	union vhost_user_payload payload;
+
+	int fds[VHOST_MEMORY_BASELINE_NREGIONS];
+	int fd_num;
+} __attribute__ ((__packed__));
+#define VHOST_USER_HDR_SIZE sizeof(struct vhost_user_header)
+
+/* index of the RX virtqueue */
+#define VHOST_USER_RX_QUEUE 0
+/* index of the TX virtqueue */
+#define VHOST_USER_TX_QUEUE 1
+
+/* in case of multiqueue, the RX and TX queues are interleaved */
+#define VHOST_USER_IS_QUEUE_TX(n)	((n) % 2)
+#define VHOST_USER_IS_QUEUE_RX(n)	(!((n) % 2))
+
+/* Default virtio-net header for passt */
+#define VU_HEADER ((struct virtio_net_hdr){	\
+	.flags = VIRTIO_NET_HDR_F_DATA_VALID,	\
+	.gso_type = VIRTIO_NET_HDR_GSO_NONE,	\
+})
+
+/**
+ * vu_queue_enabled - Return state of a virtqueue
+ * @vq:		virtqueue to check
+ *
+ * Return: true if the virtqueue is enabled, false otherwise
+ */
+/* cppcheck-suppress unusedFunction */
+static inline bool vu_queue_enabled(const struct vu_virtq *vq)
+{
+	return vq->enable;
+}
+
+/**
+ * vu_queue_started - Return state of a virtqueue
+ * @vq:		virtqueue to check
+ *
+ * Return: true if the virtqueue is started, false otherwise
+ */
+/* cppcheck-suppress unusedFunction */
+static inline bool vu_queue_started(const struct vu_virtq *vq)
+{
+	return vq->started;
+}
+
+void vu_print_capabilities(void);
+void vu_init(struct ctx *c, struct vu_dev *vdev);
+void vu_cleanup(struct vu_dev *vdev);
+void vu_control_handler(struct vu_dev *vdev, int fd, uint32_t events);
+#endif /* VHOST_USER_H */
diff --git a/virtio.h b/virtio.h
index 94efeb0..6410d60 100644
--- a/virtio.h
+++ b/virtio.h
@@ -105,6 +105,7 @@ struct vu_dev_region {
  * @protocol_features:	Vhost-user protocol features
  */
 struct vu_dev {
+	struct ctx *context;
 	uint32_t nregions;
 	struct vu_dev_region regions[VHOST_USER_MAX_RAM_SLOTS];
 	struct vu_virtq vq[VHOST_USER_MAX_QUEUES];

From 5a8b33c667d4468e82c4d50e81da06c0e681761e Mon Sep 17 00:00:00 2001
From: Laurent Vivier <lvivier@redhat.com>
Date: Fri, 22 Nov 2024 17:43:31 +0100
Subject: [PATCH 136/382] udp: Prepare udp.c to be shared with vhost-user

Export udp_payload_t, udp_update_hdr4(), udp_update_hdr6() and
udp_sock_errs().

Rename udp_listen_sock_handler() to udp_buf_listen_sock_handler() and
udp_reply_sock_handler() to udp_buf_reply_sock_handler().

Signed-off-by: Laurent Vivier <lvivier@redhat.com>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 udp.c          | 74 ++++++++++++++++++++++++++++++--------------------
 udp_internal.h | 34 +++++++++++++++++++++++
 2 files changed, 79 insertions(+), 29 deletions(-)
 create mode 100644 udp_internal.h

diff --git a/udp.c b/udp.c
index 4be165f..9718ed8 100644
--- a/udp.c
+++ b/udp.c
@@ -109,8 +109,7 @@
 #include "pcap.h"
 #include "log.h"
 #include "flow_table.h"
-
-#define UDP_MAX_FRAMES		32  /* max # of frames to receive at once */
+#include "udp_internal.h"
 
 /* "Spliced" sockets indexed by bound port (host order) */
 static int udp_splice_ns  [IP_VERSIONS][NUM_PORTS];
@@ -118,20 +117,8 @@ static int udp_splice_init[IP_VERSIONS][NUM_PORTS];
 
 /* Static buffers */
 
-/**
- * struct udp_payload_t - UDP header and data for inbound messages
- * @uh:		UDP header
- * @data:	UDP data
- */
-static struct udp_payload_t {
-	struct udphdr uh;
-	char data[USHRT_MAX - sizeof(struct udphdr)];
-#ifdef __AVX2__
-} __attribute__ ((packed, aligned(32)))
-#else
-} __attribute__ ((packed, aligned(__alignof__(unsigned int))))
-#endif
-udp_payload[UDP_MAX_FRAMES];
+/* UDP header and data for inbound messages */
+static struct udp_payload_t udp_payload[UDP_MAX_FRAMES];
 
 /* Ethernet header for IPv4 frames */
 static struct ethhdr udp4_eth_hdr;
@@ -302,9 +289,9 @@ static void udp_splice_send(const struct ctx *c, size_t start, size_t n,
  *
  * Return: size of IPv4 payload (UDP header + data)
  */
-static size_t udp_update_hdr4(struct iphdr *ip4h, struct udp_payload_t *bp,
-			      const struct flowside *toside, size_t dlen,
-			      bool no_udp_csum)
+size_t udp_update_hdr4(struct iphdr *ip4h, struct udp_payload_t *bp,
+		       const struct flowside *toside, size_t dlen,
+		       bool no_udp_csum)
 {
 	const struct in_addr *src = inany_v4(&toside->oaddr);
 	const struct in_addr *dst = inany_v4(&toside->eaddr);
@@ -345,9 +332,9 @@ static size_t udp_update_hdr4(struct iphdr *ip4h, struct udp_payload_t *bp,
  *
  * Return: size of IPv6 payload (UDP header + data)
  */
-static size_t udp_update_hdr6(struct ipv6hdr *ip6h, struct udp_payload_t *bp,
-			      const struct flowside *toside, size_t dlen,
-			      bool no_udp_csum)
+size_t udp_update_hdr6(struct ipv6hdr *ip6h, struct udp_payload_t *bp,
+		       const struct flowside *toside, size_t dlen,
+		       bool no_udp_csum)
 {
 	uint16_t l4len = dlen + sizeof(bp->uh);
 
@@ -477,7 +464,7 @@ static int udp_sock_recverr(int s)
  *
  * Return: Number of errors handled, or < 0 if we have an unrecoverable error
  */
-static int udp_sock_errs(const struct ctx *c, int s, uint32_t events)
+int udp_sock_errs(const struct ctx *c, int s, uint32_t events)
 {
 	unsigned n_err = 0;
 	socklen_t errlen;
@@ -554,7 +541,7 @@ static int udp_sock_recv(const struct ctx *c, int s, uint32_t events,
 }
 
 /**
- * udp_listen_sock_handler() - Handle new data from socket
+ * udp_buf_listen_sock_handler() - Handle new data from socket
  * @c:		Execution context
  * @ref:	epoll reference
  * @events:	epoll events bitmap
@@ -562,8 +549,9 @@ static int udp_sock_recv(const struct ctx *c, int s, uint32_t events,
  *
  * #syscalls recvmmsg
  */
-void udp_listen_sock_handler(const struct ctx *c, union epoll_ref ref,
-			     uint32_t events, const struct timespec *now)
+static void udp_buf_listen_sock_handler(const struct ctx *c,
+					union epoll_ref ref, uint32_t events,
+					const struct timespec *now)
 {
 	const socklen_t sasize = sizeof(udp_meta[0].s_in);
 	int n, i;
@@ -630,7 +618,21 @@ void udp_listen_sock_handler(const struct ctx *c, union epoll_ref ref,
 }
 
 /**
- * udp_reply_sock_handler() - Handle new data from flow specific socket
+ * udp_listen_sock_handler() - Handle new data from socket
+ * @c:		Execution context
+ * @ref:	epoll reference
+ * @events:	epoll events bitmap
+ * @now:	Current timestamp
+ */
+void udp_listen_sock_handler(const struct ctx *c,
+			     union epoll_ref ref, uint32_t events,
+			     const struct timespec *now)
+{
+	udp_buf_listen_sock_handler(c, ref, events, now);
+}
+
+/**
+ * udp_buf_reply_sock_handler() - Handle new data from flow specific socket
  * @c:		Execution context
  * @ref:	epoll reference
  * @events:	epoll events bitmap
@@ -638,8 +640,9 @@ void udp_listen_sock_handler(const struct ctx *c, union epoll_ref ref,
  *
  * #syscalls recvmmsg
  */
-void udp_reply_sock_handler(const struct ctx *c, union epoll_ref ref,
-			    uint32_t events, const struct timespec *now)
+static void udp_buf_reply_sock_handler(const struct ctx *c, union epoll_ref ref,
+				       uint32_t events,
+				       const struct timespec *now)
 {
 	flow_sidx_t tosidx = flow_sidx_opposite(ref.flowside);
 	const struct flowside *toside = flowside_at_sidx(tosidx);
@@ -685,6 +688,19 @@ void udp_reply_sock_handler(const struct ctx *c, union epoll_ref ref,
 	}
 }
 
+/**
+ * udp_reply_sock_handler() - Handle new data from flow specific socket
+ * @c:		Execution context
+ * @ref:	epoll reference
+ * @events:	epoll events bitmap
+ * @now:	Current timestamp
+ */
+void udp_reply_sock_handler(const struct ctx *c, union epoll_ref ref,
+			    uint32_t events, const struct timespec *now)
+{
+	udp_buf_reply_sock_handler(c, ref, events, now);
+}
+
 /**
  * udp_tap_handler() - Handle packets from tap
  * @c:		Execution context
diff --git a/udp_internal.h b/udp_internal.h
new file mode 100644
index 0000000..cc80e30
--- /dev/null
+++ b/udp_internal.h
@@ -0,0 +1,34 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later
+ * Copyright (c) 2021 Red Hat GmbH
+ * Author: Stefano Brivio <sbrivio@redhat.com>
+ */
+
+#ifndef UDP_INTERNAL_H
+#define UDP_INTERNAL_H
+
+#include "tap.h" /* needed by udp_meta_t */
+
+#define UDP_MAX_FRAMES		32  /* max # of frames to receive at once */
+
+/**
+ * struct udp_payload_t - UDP header and data for inbound messages
+ * @uh:		UDP header
+ * @data:	UDP data
+ */
+struct udp_payload_t {
+	struct udphdr uh;
+	char data[USHRT_MAX - sizeof(struct udphdr)];
+#ifdef __AVX2__
+} __attribute__ ((packed, aligned(32)));
+#else
+} __attribute__ ((packed, aligned(__alignof__(unsigned int))));
+#endif
+
+size_t udp_update_hdr4(struct iphdr *ip4h, struct udp_payload_t *bp,
+		       const struct flowside *toside, size_t dlen,
+		       bool no_udp_csum);
+size_t udp_update_hdr6(struct ipv6hdr *ip6h, struct udp_payload_t *bp,
+		       const struct flowside *toside, size_t dlen,
+		       bool no_udp_csum);
+int udp_sock_errs(const struct ctx *c, int s, uint32_t events);
+#endif /* UDP_INTERNAL_H */

From b7c292b758a165066b9042cfbac1a2e1d3d197c4 Mon Sep 17 00:00:00 2001
From: Laurent Vivier <lvivier@redhat.com>
Date: Fri, 22 Nov 2024 17:43:32 +0100
Subject: [PATCH 137/382] tcp: Export headers functions

Export tcp_fill_headers[4|6]() and tcp_update_check_tcp[4|6]().

They'll be needed by vhost-user.

Signed-off-by: Laurent Vivier <lvivier@redhat.com>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 tcp.c          | 30 +++++++++++++++---------------
 tcp_internal.h | 15 +++++++++++++++
 2 files changed, 30 insertions(+), 15 deletions(-)

diff --git a/tcp.c b/tcp.c
index 1eb85bb..e08ffd3 100644
--- a/tcp.c
+++ b/tcp.c
@@ -758,9 +758,9 @@ static void tcp_sock_set_bufsize(const struct ctx *c, int s)
  * @iov_cnt:	Length of the array
  * @l4offset:	IPv4 payload offset in the iovec array
  */
-static void tcp_update_check_tcp4(const struct iphdr *iph,
-				  const struct iovec *iov, int iov_cnt,
-				  size_t l4offset)
+void tcp_update_check_tcp4(const struct iphdr *iph,
+			   const struct iovec *iov, int iov_cnt,
+			   size_t l4offset)
 {
 	uint16_t l4len = ntohs(iph->tot_len) - sizeof(struct iphdr);
 	struct in_addr saddr = { .s_addr = iph->saddr };
@@ -810,9 +810,9 @@ static void tcp_update_check_tcp4(const struct iphdr *iph,
  * @iov_cnt:	Length of the array
  * @l4offset:	IPv6 payload offset in the iovec array
  */
-static void tcp_update_check_tcp6(const struct ipv6hdr *ip6h,
-				  const struct iovec *iov, int iov_cnt,
-				  size_t l4offset)
+void tcp_update_check_tcp6(const struct ipv6hdr *ip6h,
+			   const struct iovec *iov, int iov_cnt,
+			   size_t l4offset)
 {
 	uint16_t l4len = ntohs(ip6h->payload_len);
 	size_t check_ofs;
@@ -978,11 +978,11 @@ static void tcp_fill_header(struct tcphdr *th,
  *
  * Return: The IPv4 payload length, host order
  */
-static size_t tcp_fill_headers4(const struct tcp_tap_conn *conn,
-				struct tap_hdr *taph,
-				struct iphdr *iph, struct tcp_payload_t *bp,
-				size_t dlen, const uint16_t *check,
-				uint32_t seq, bool no_tcp_csum)
+size_t tcp_fill_headers4(const struct tcp_tap_conn *conn,
+			 struct tap_hdr *taph,
+			 struct iphdr *iph, struct tcp_payload_t *bp,
+			 size_t dlen, const uint16_t *check,
+			 uint32_t seq, bool no_tcp_csum)
 {
 	const struct flowside *tapside = TAPFLOW(conn);
 	const struct in_addr *src4 = inany_v4(&tapside->oaddr);
@@ -1030,10 +1030,10 @@ static size_t tcp_fill_headers4(const struct tcp_tap_conn *conn,
  *
  * Return: The IPv6 payload length, host order
  */
-static size_t tcp_fill_headers6(const struct tcp_tap_conn *conn,
-				struct tap_hdr *taph,
-				struct ipv6hdr *ip6h, struct tcp_payload_t *bp,
-				size_t dlen, uint32_t seq, bool no_tcp_csum)
+size_t tcp_fill_headers6(const struct tcp_tap_conn *conn,
+			 struct tap_hdr *taph,
+			 struct ipv6hdr *ip6h, struct tcp_payload_t *bp,
+			 size_t dlen, uint32_t seq, bool no_tcp_csum)
 {
 	const struct flowside *tapside = TAPFLOW(conn);
 	size_t l4len = dlen + sizeof(bp->th);
diff --git a/tcp_internal.h b/tcp_internal.h
index c846f60..8625eed 100644
--- a/tcp_internal.h
+++ b/tcp_internal.h
@@ -162,6 +162,21 @@ void tcp_rst_do(const struct ctx *c, struct tcp_tap_conn *conn);
 
 struct tcp_info_linux;
 
+void tcp_update_check_tcp4(const struct iphdr *iph,
+			   const struct iovec *iov, int iov_cnt,
+			   size_t l4offset);
+void tcp_update_check_tcp6(const struct ipv6hdr *ip6h,
+			   const struct iovec *iov, int iov_cnt,
+			   size_t l4offset);
+size_t tcp_fill_headers4(const struct tcp_tap_conn *conn,
+			 struct tap_hdr *taph,
+			 struct iphdr *iph, struct tcp_payload_t *bp,
+			 size_t dlen, const uint16_t *check,
+			 uint32_t seq, bool no_tcp_csum);
+size_t tcp_fill_headers6(const struct tcp_tap_conn *conn,
+			 struct tap_hdr *taph,
+			 struct ipv6hdr *ip6h, struct tcp_payload_t *bp,
+			 size_t dlen, uint32_t seq, bool no_tcp_csum);
 size_t tcp_l2_buf_fill_headers(const struct tcp_tap_conn *conn,
 			       struct iovec *iov, size_t dlen,
 			       const uint16_t *check, uint32_t seq,

From b2e62f7e85ac77a91daf5d77b7f32198ef0e59c2 Mon Sep 17 00:00:00 2001
From: Laurent Vivier <lvivier@redhat.com>
Date: Fri, 22 Nov 2024 17:43:33 +0100
Subject: [PATCH 138/382] passt: rename tap_sock_init() to tap_backend_init()

Extract pool storage initialization loop to tap_sock_update_pool(),
extract QEMU hints to tap_backend_show_hints().

Signed-off-by: Laurent Vivier <lvivier@redhat.com>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 passt.c |  2 +-
 tap.c   | 56 +++++++++++++++++++++++++++++++++++++++++---------------
 tap.h   |  2 +-
 3 files changed, 43 insertions(+), 17 deletions(-)

diff --git a/passt.c b/passt.c
index 06e0a33..8a37407 100644
--- a/passt.c
+++ b/passt.c
@@ -245,7 +245,7 @@ int main(int argc, char **argv)
 
 	pasta_netns_quit_init(&c);
 
-	tap_sock_init(&c);
+	tap_backend_init(&c);
 
 	random_init(&c);
 
diff --git a/tap.c b/tap.c
index 5347df4..b489430 100644
--- a/tap.c
+++ b/tap.c
@@ -1193,11 +1193,31 @@ int tap_sock_unix_open(char *sock_path)
 	return fd;
 }
 
+/**
+ * tap_backend_show_hints() - Give help information to start QEMU
+ * @c:		Execution context
+ */
+static void tap_backend_show_hints(struct ctx *c)
+{
+	switch (c->mode) {
+	case MODE_PASTA:
+		/* No hints */
+		break;
+	case MODE_PASST:
+		info("\nYou can now start qemu (>= 7.2, with commit 13c6be96618c):");
+		info("    kvm ... -device virtio-net-pci,netdev=s -netdev stream,id=s,server=off,addr.type=unix,addr.path=%s",
+		     c->sock_path);
+		info("or qrap, for earlier qemu versions:");
+		info("    ./qrap 5 kvm ... -net socket,fd=5 -net nic,model=virtio");
+		break;
+	}
+}
+
 /**
  * tap_sock_unix_init() - Start listening for connections on AF_UNIX socket
  * @c:		Execution context
  */
-static void tap_sock_unix_init(struct ctx *c)
+static void tap_sock_unix_init(const struct ctx *c)
 {
 	union epoll_ref ref = { .type = EPOLL_TYPE_TAP_LISTEN };
 	struct epoll_event ev = { 0 };
@@ -1208,12 +1228,6 @@ static void tap_sock_unix_init(struct ctx *c)
 	ev.events = EPOLLIN | EPOLLET;
 	ev.data.u64 = ref.u64;
 	epoll_ctl(c->epollfd, EPOLL_CTL_ADD, c->fd_tap_listen, &ev);
-
-	info("\nYou can now start qemu (>= 7.2, with commit 13c6be96618c):");
-	info("    kvm ... -device virtio-net-pci,netdev=s -netdev stream,id=s,server=off,addr.type=unix,addr.path=%s",
-	     c->sock_path);
-	info("or qrap, for earlier qemu versions:");
-	info("    ./qrap 5 kvm ... -net socket,fd=5 -net nic,model=virtio");
 }
 
 /**
@@ -1326,21 +1340,31 @@ static void tap_sock_tun_init(struct ctx *c)
 }
 
 /**
- * tap_sock_init() - Create and set up AF_UNIX socket or tuntap file descriptor
- * @c:		Execution context
+ * tap_sock_update_pool() - Set the buffer base and size for the pool of packets
+ * @base:	Buffer base
+ * @size:	Buffer size
  */
-void tap_sock_init(struct ctx *c)
+static void tap_sock_update_pool(void *base, size_t size)
 {
-	size_t sz = sizeof(pkt_buf);
 	int i;
 
-	pool_tap4_storage = PACKET_INIT(pool_tap4, TAP_MSGS, pkt_buf, sz);
-	pool_tap6_storage = PACKET_INIT(pool_tap6, TAP_MSGS, pkt_buf, sz);
+	pool_tap4_storage = PACKET_INIT(pool_tap4, TAP_MSGS, base, size);
+	pool_tap6_storage = PACKET_INIT(pool_tap6, TAP_MSGS, base, size);
 
 	for (i = 0; i < TAP_SEQS; i++) {
-		tap4_l4[i].p = PACKET_INIT(pool_l4, UIO_MAXIOV, pkt_buf, sz);
-		tap6_l4[i].p = PACKET_INIT(pool_l4, UIO_MAXIOV, pkt_buf, sz);
+		tap4_l4[i].p = PACKET_INIT(pool_l4, UIO_MAXIOV, base, size);
+		tap6_l4[i].p = PACKET_INIT(pool_l4, UIO_MAXIOV, base, size);
 	}
+}
+
+/**
+ * tap_backend_init() - Create and set up AF_UNIX socket or
+ *			tuntap file descriptor
+ * @c:		Execution context
+ */
+void tap_backend_init(struct ctx *c)
+{
+	tap_sock_update_pool(pkt_buf, sizeof(pkt_buf));
 
 	if (c->fd_tap != -1) { /* Passed as --fd */
 		struct epoll_event ev = { 0 };
@@ -1370,4 +1394,6 @@ void tap_sock_init(struct ctx *c)
 		 */
 		memset(&c->guest_mac, 0xff, sizeof(c->guest_mac));
 	}
+
+	tap_backend_show_hints(c);
 }
diff --git a/tap.h b/tap.h
index 85f1e84..8728cc5 100644
--- a/tap.h
+++ b/tap.h
@@ -68,7 +68,7 @@ void tap_handler_pasta(struct ctx *c, uint32_t events,
 void tap_handler_passt(struct ctx *c, uint32_t events,
 		       const struct timespec *now);
 int tap_sock_unix_open(char *sock_path);
-void tap_sock_init(struct ctx *c);
+void tap_backend_init(struct ctx *c);
 void tap_flush_pools(void);
 void tap_handler(struct ctx *c, const struct timespec *now);
 void tap_add_packet(struct ctx *c, ssize_t l2len, char *p);

From 28997fcb29b560fc0dcfd91bad5eece3ded5eb72 Mon Sep 17 00:00:00 2001
From: Laurent Vivier <lvivier@redhat.com>
Date: Fri, 22 Nov 2024 17:43:34 +0100
Subject: [PATCH 139/382] vhost-user: add vhost-user

Add virtio and vhost-user functions to connect with QEMU.

  $ ./passt --vhost-user

and

  # qemu-system-x86_64 ... -m 4G \
        -object memory-backend-memfd,id=memfd0,share=on,size=4G \
        -numa node,memdev=memfd0 \
        -chardev socket,id=chr0,path=/tmp/passt_1.socket \
        -netdev vhost-user,id=netdev0,chardev=chr0 \
        -device virtio-net,mac=9a:2b:2c:2d:2e:2f,netdev=netdev0 \
        ...

Signed-off-by: Laurent Vivier <lvivier@redhat.com>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
[sbrivio: as suggested by lvivier, include <netinet/if_ether.h>
 before including <linux/if_ether.h>, as C libraries such as musl
 define __UAPI_DEF_ETHHDR in <netinet/if_ether.h> if they already have
 a definition of struct ethhdr]
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 Makefile     |   6 +-
 conf.c       |  19 +-
 epoll_type.h |   4 +
 iov.c        |   1 -
 isolation.c  |  17 +-
 packet.c     |  11 ++
 packet.h     |   8 +-
 passt.1      |  10 +-
 passt.c      |   9 +
 passt.h      |   7 +
 pcap.c       |   1 -
 tap.c        |  77 ++++++--
 tap.h        |   5 +-
 tcp.c        |   7 +
 tcp_vu.c     | 498 +++++++++++++++++++++++++++++++++++++++++++++++++++
 tcp_vu.h     |  12 ++
 udp.c        |  11 ++
 udp_vu.c     | 343 +++++++++++++++++++++++++++++++++++
 udp_vu.h     |  13 ++
 vhost_user.c |  41 +++--
 vhost_user.h |   4 +-
 virtio.c     |   5 -
 vu_common.c  | 283 +++++++++++++++++++++++++++++
 vu_common.h  |  60 +++++++
 24 files changed, 1399 insertions(+), 53 deletions(-)
 create mode 100644 tcp_vu.c
 create mode 100644 tcp_vu.h
 create mode 100644 udp_vu.c
 create mode 100644 udp_vu.h
 create mode 100644 vu_common.c
 create mode 100644 vu_common.h

diff --git a/Makefile b/Makefile
index bcb084e..faa5c23 100644
--- a/Makefile
+++ b/Makefile
@@ -37,7 +37,8 @@ FLAGS += -DDUAL_STACK_SOCKETS=$(DUAL_STACK_SOCKETS)
 PASST_SRCS = arch.c arp.c checksum.c conf.c dhcp.c dhcpv6.c flow.c fwd.c \
 	icmp.c igmp.c inany.c iov.c ip.c isolation.c lineread.c log.c mld.c \
 	ndp.c netlink.c packet.c passt.c pasta.c pcap.c pif.c tap.c tcp.c \
-	tcp_buf.c tcp_splice.c udp.c udp_flow.c util.c vhost_user.c virtio.c
+	tcp_buf.c tcp_splice.c tcp_vu.c udp.c udp_flow.c udp_vu.c util.c \
+	vhost_user.c virtio.c vu_common.c
 QRAP_SRCS = qrap.c
 SRCS = $(PASST_SRCS) $(QRAP_SRCS)
 
@@ -47,7 +48,8 @@ PASST_HEADERS = arch.h arp.h checksum.h conf.h dhcp.h dhcpv6.h flow.h fwd.h \
 	flow_table.h icmp.h icmp_flow.h inany.h iov.h ip.h isolation.h \
 	lineread.h log.h ndp.h netlink.h packet.h passt.h pasta.h pcap.h pif.h \
 	siphash.h tap.h tcp.h tcp_buf.h tcp_conn.h tcp_internal.h tcp_splice.h \
-	udp.h udp_flow.h util.h vhost_user.h virtio.h
+	tcp_vu.h udp.h udp_flow.h udp_internal.h udp_vu.h util.h vhost_user.h \
+	virtio.h vu_common.h
 HEADERS = $(PASST_HEADERS) seccomp.h
 
 C := \#include <sys/random.h>\nint main(){int a=getrandom(0, 0, 0);}
diff --git a/conf.c b/conf.c
index c6bffc4..eaa7d99 100644
--- a/conf.c
+++ b/conf.c
@@ -45,6 +45,7 @@
 #include "lineread.h"
 #include "isolation.h"
 #include "log.h"
+#include "vhost_user.h"
 
 #define NETNS_RUN_DIR	"/run/netns"
 
@@ -807,9 +808,14 @@ static void usage(const char *name, FILE *f, int status)
 			"    default: same interface name as external one\n");
 	} else {
 		FPRINTF(f,
-			"  -s, --socket PATH	UNIX domain socket path\n"
+			"  -s, --socket, --socket-path PATH	UNIX domain socket path\n"
 			"    default: probe free path starting from "
 			UNIX_SOCK_PATH "\n", 1);
+		FPRINTF(f,
+			"  --vhost-user		Enable vhost-user mode\n"
+			"    UNIX domain socket is provided by -s option\n"
+			"  --print-capabilities	print back-end capabilities in JSON format,\n"
+			"    only meaningful for vhost-user mode\n");
 	}
 
 	FPRINTF(f,
@@ -1345,6 +1351,10 @@ void conf(struct ctx *c, int argc, char **argv)
 		{"map-guest-addr", required_argument,	NULL,		22 },
 		{"host-lo-to-ns-lo", no_argument, 	NULL,		23 },
 		{"dns-host",	required_argument,	NULL,		24 },
+		{"vhost-user",	no_argument,		NULL,		25 },
+		/* vhost-user backend program convention */
+		{"print-capabilities", no_argument,	NULL,		26 },
+		{"socket-path",	required_argument,	NULL,		's' },
 		{ 0 },
 	};
 	const char *logname = (c->mode == MODE_PASTA) ? "pasta" : "passt";
@@ -1538,6 +1548,13 @@ void conf(struct ctx *c, int argc, char **argv)
 				break;
 
 			die("Invalid host nameserver address: %s", optarg);
+		case 25:
+			if (c->mode == MODE_PASTA)
+				die("--vhost-user is for passt mode only");
+			c->mode = MODE_VU;
+			break;
+		case 26:
+			vu_print_capabilities();
 			break;
 		case 'd':
 			c->debug = 1;
diff --git a/epoll_type.h b/epoll_type.h
index 0ad1efa..f3ef415 100644
--- a/epoll_type.h
+++ b/epoll_type.h
@@ -36,6 +36,10 @@ enum epoll_type {
 	EPOLL_TYPE_TAP_PASST,
 	/* socket listening for qemu socket connections */
 	EPOLL_TYPE_TAP_LISTEN,
+	/* vhost-user command socket */
+	EPOLL_TYPE_VHOST_CMD,
+	/* vhost-user kick event socket */
+	EPOLL_TYPE_VHOST_KICK,
 
 	EPOLL_NUM_TYPES,
 };
diff --git a/iov.c b/iov.c
index 3f9e229..3741db2 100644
--- a/iov.c
+++ b/iov.c
@@ -68,7 +68,6 @@ size_t iov_skip_bytes(const struct iovec *iov, size_t n,
  *
  * Returns:    The number of bytes successfully copied.
  */
-/* cppcheck-suppress unusedFunction */
 size_t iov_from_buf(const struct iovec *iov, size_t iov_cnt,
 		    size_t offset, const void *buf, size_t bytes)
 {
diff --git a/isolation.c b/isolation.c
index 45fba1e..c944fb3 100644
--- a/isolation.c
+++ b/isolation.c
@@ -379,12 +379,21 @@ void isolate_postfork(const struct ctx *c)
 
 	prctl(PR_SET_DUMPABLE, 0);
 
-	if (c->mode == MODE_PASTA) {
-		prog.len = (unsigned short)ARRAY_SIZE(filter_pasta);
-		prog.filter = filter_pasta;
-	} else {
+	switch (c->mode) {
+	case MODE_PASST:
 		prog.len = (unsigned short)ARRAY_SIZE(filter_passt);
 		prog.filter = filter_passt;
+		break;
+	case MODE_PASTA:
+		prog.len = (unsigned short)ARRAY_SIZE(filter_pasta);
+		prog.filter = filter_pasta;
+		break;
+	case MODE_VU:
+		prog.len = (unsigned short)ARRAY_SIZE(filter_vu);
+		prog.filter = filter_vu;
+		break;
+	default:
+		ASSERT(0);
 	}
 
 	if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) ||
diff --git a/packet.c b/packet.c
index 3748996..e5a78d0 100644
--- a/packet.c
+++ b/packet.c
@@ -36,6 +36,17 @@
 static int packet_check_range(const struct pool *p, size_t offset, size_t len,
 			      const char *start, const char *func, int line)
 {
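+	/* In vhost-user mode, p->buf_size is 0 and p->buf points to a
+	 * struct vu_dev_region array: let the vhost-user code check that
+	 * the range falls within a mapped region
+	 */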
+	if (p->buf_size == 0) {
+		int ret;
+
+		ret = vu_packet_check_range((void *)p->buf, offset, len, start);
+
+		if (ret == -1)
+			trace("cannot find region, %s:%i", func, line);
+
+		return ret;
+	}
+
 	if (start < p->buf) {
 		trace("packet start %p before buffer start %p, "
 		      "%s:%i", (void *)start, (void *)p->buf, func, line);
diff --git a/packet.h b/packet.h
index 8377dcf..3f70e94 100644
--- a/packet.h
+++ b/packet.h
@@ -8,8 +8,10 @@
 
 /**
  * struct pool - Generic pool of packets stored in a buffer
- * @buf:	Buffer storing packet descriptors
- * @buf_size:	Total size of buffer
+ * @buf:	Buffer storing packet descriptors,
+ * 		a struct vu_dev_region array for passt vhost-user mode
+ * @buf_size:	Total size of buffer,
+ * 		0 for passt vhost-user mode
  * @size:	Number of usable descriptors for the pool
  * @count:	Number of used descriptors for the pool
  * @pkt:	Descriptors: see macros below
@@ -22,6 +24,8 @@ struct pool {
 	struct iovec pkt[1];
 };
 
+int vu_packet_check_range(void *buf, size_t offset, size_t len,
+			  const char *start);
 void packet_add_do(struct pool *p, size_t len, const char *start,
 		   const char *func, int line);
 void *packet_get_do(const struct pool *p, const size_t idx,
diff --git a/passt.1 b/passt.1
index 15c8338..b2896a2 100644
--- a/passt.1
+++ b/passt.1
@@ -404,12 +404,20 @@ interface address are configured on a given host interface.
 .SS \fBpasst\fR-only options
 
 .TP
-.BR \-s ", " \-\-socket " " \fIpath
+.BR \-s ", " \-\-socket-path ", " \-\-socket " " \fIpath
 Path for UNIX domain socket used by \fBqemu\fR(1) or \fBqrap\fR(1) to connect to
 \fBpasst\fR.
 Default is to probe a free socket, not accepting connections, starting from
 \fI/tmp/passt_1.socket\fR to \fI/tmp/passt_64.socket\fR.
 
+.TP
+.BR \-\-vhost-user
+Enable vhost-user. The vhost-user command socket is provided by \fB--socket\fR.
+
+.TP
+.BR \-\-print-capabilities
+Print back-end capabilities in JSON format, only meaningful for vhost-user mode.
+
 .TP
 .BR \-F ", " \-\-fd " " \fIFD
 Pass a pre-opened, connected socket to \fBpasst\fR. Usually the socket is opened
diff --git a/passt.c b/passt.c
index 8a37407..957f3d0 100644
--- a/passt.c
+++ b/passt.c
@@ -50,6 +50,7 @@
 #include "log.h"
 #include "tcp_splice.h"
 #include "ndp.h"
+#include "vu_common.h"
 
 #define EPOLL_EVENTS		8
 
@@ -72,6 +73,8 @@ char *epoll_type_str[] = {
 	[EPOLL_TYPE_TAP_PASTA]		= "/dev/net/tun device",
 	[EPOLL_TYPE_TAP_PASST]		= "connected qemu socket",
 	[EPOLL_TYPE_TAP_LISTEN]		= "listening qemu socket",
+	[EPOLL_TYPE_VHOST_CMD]		= "vhost-user command socket",
+	[EPOLL_TYPE_VHOST_KICK]		= "vhost-user kick socket",
 };
 static_assert(ARRAY_SIZE(epoll_type_str) == EPOLL_NUM_TYPES,
 	      "epoll_type_str[] doesn't match enum epoll_type");
@@ -347,6 +350,12 @@ loop:
 		case EPOLL_TYPE_PING:
 			icmp_sock_handler(&c, ref);
 			break;
+		case EPOLL_TYPE_VHOST_CMD:
+			vu_control_handler(c.vdev, c.fd_tap, eventmask);
+			break;
+		case EPOLL_TYPE_VHOST_KICK:
+			vu_kick_cb(c.vdev, ref, &now);
+			break;
 		default:
 			/* Can't happen */
 			ASSERT(0);
diff --git a/passt.h b/passt.h
index 799ee50..c038630 100644
--- a/passt.h
+++ b/passt.h
@@ -25,6 +25,7 @@ union epoll_ref;
 #include "fwd.h"
 #include "tcp.h"
 #include "udp.h"
+#include "vhost_user.h"
 
 /* Default address for our end on the tap interface.  Bit 0 of byte 0 must be 0
  * (unicast) and bit 1 of byte 1 must be 1 (locally administered).  Otherwise
@@ -43,6 +44,7 @@ union epoll_ref;
  * @icmp:	ICMP-specific reference part
  * @data:	Data handled by protocol handlers
  * @nsdir_fd:	netns dirfd for fallback timer checking if namespace is gone
+ * @queue:	vhost-user queue index for this fd
  * @u64:	Opaque reference for epoll_ctl() and epoll_wait()
  */
 union epoll_ref {
@@ -58,6 +60,7 @@ union epoll_ref {
 			union udp_listen_epoll_ref udp;
 			uint32_t data;
 			int nsdir_fd;
+			int queue;
 		};
 	};
 	uint64_t u64;
@@ -94,6 +97,7 @@ struct fqdn {
 enum passt_modes {
 	MODE_PASST,
 	MODE_PASTA,
+	MODE_VU,
 };
 
 /**
@@ -229,6 +233,7 @@ struct ip6_ctx {
  * @freebind:		Allow binding of non-local addresses for forwarding
  * @low_wmem:		Low probed net.core.wmem_max
  * @low_rmem:		Low probed net.core.rmem_max
+ * @vdev:		vhost-user device
  */
 struct ctx {
 	enum passt_modes mode;
@@ -291,6 +296,8 @@ struct ctx {
 
 	int low_wmem;
 	int low_rmem;
+
+	struct vu_dev *vdev;
 };
 
 void proto_update_l2_buf(const unsigned char *eth_d,
diff --git a/pcap.c b/pcap.c
index 23205dd..3d623cf 100644
--- a/pcap.c
+++ b/pcap.c
@@ -143,7 +143,6 @@ void pcap_multiple(const struct iovec *iov, size_t frame_parts, unsigned int n,
  * @iovcnt:	Number of buffers (@iov entries)
  * @offset:	Offset of the L2 frame within the full data length
  */
-/* cppcheck-suppress unusedFunction */
 void pcap_iov(const struct iovec *iov, size_t iovcnt, size_t offset)
 {
 	struct timespec now = { 0 };
diff --git a/tap.c b/tap.c
index b489430..cde1719 100644
--- a/tap.c
+++ b/tap.c
@@ -58,6 +58,8 @@
 #include "packet.h"
 #include "tap.h"
 #include "log.h"
+#include "vhost_user.h"
+#include "vu_common.h"
 
 /* IPv4 (plus ARP) and IPv6 message batches from tap/guest to IP handlers */
 static PACKET_POOL_NOINIT(pool_tap4, TAP_MSGS, pkt_buf);
@@ -78,16 +80,22 @@ void tap_send_single(const struct ctx *c, const void *data, size_t l2len)
 	struct iovec iov[2];
 	size_t iovcnt = 0;
 
-	if (c->mode == MODE_PASST) {
+	switch (c->mode) {
+	case MODE_PASST:
 		iov[iovcnt] = IOV_OF_LVALUE(vnet_len);
 		iovcnt++;
+		/* fall through */
+	case MODE_PASTA:
+		iov[iovcnt].iov_base = (void *)data;
+		iov[iovcnt].iov_len = l2len;
+		iovcnt++;
+
+		tap_send_frames(c, iov, iovcnt, 1);
+		break;
+	case MODE_VU:
+		vu_send_single(c, data, l2len);
+		break;
 	}
-
-	iov[iovcnt].iov_base = (void *)data;
-	iov[iovcnt].iov_len = l2len;
-	iovcnt++;
-
-	tap_send_frames(c, iov, iovcnt, 1);
 }
 
 /**
@@ -414,10 +422,18 @@ size_t tap_send_frames(const struct ctx *c, const struct iovec *iov,
 	if (!nframes)
 		return 0;
 
-	if (c->mode == MODE_PASTA)
+	switch (c->mode) {
+	case MODE_PASTA:
 		m = tap_send_frames_pasta(c, iov, bufs_per_frame, nframes);
-	else
+		break;
+	case MODE_PASST:
 		m = tap_send_frames_passt(c, iov, bufs_per_frame, nframes);
+		break;
+	case MODE_VU:
+		/* fall through */
+	default:
+		ASSERT(0);
+	}
 
 	if (m < nframes)
 		debug("tap: failed to send %zu frames of %zu",
@@ -979,7 +995,7 @@ void tap_add_packet(struct ctx *c, ssize_t l2len, char *p)
  * tap_sock_reset() - Handle closing or failure of connect AF_UNIX socket
  * @c:		Execution context
  */
-static void tap_sock_reset(struct ctx *c)
+void tap_sock_reset(struct ctx *c)
 {
 	info("Client connection closed%s", c->one_off ? ", exiting" : "");
 
@@ -990,6 +1006,8 @@ static void tap_sock_reset(struct ctx *c)
 	epoll_ctl(c->epollfd, EPOLL_CTL_DEL, c->fd_tap, NULL);
 	close(c->fd_tap);
 	c->fd_tap = -1;
+	if (c->mode == MODE_VU)
+		vu_cleanup(c->vdev);
 }
 
 /**
@@ -1210,6 +1228,11 @@ static void tap_backend_show_hints(struct ctx *c)
 		info("or qrap, for earlier qemu versions:");
 		info("    ./qrap 5 kvm ... -net socket,fd=5 -net nic,model=virtio");
 		break;
+	case MODE_VU:
+		info("You can start qemu with:");
+		info("    kvm ... -chardev socket,id=chr0,path=%s -netdev vhost-user,id=netdev0,chardev=chr0 -device virtio-net,netdev=netdev0 -object memory-backend-memfd,id=memfd0,share=on,size=$RAMSIZE -numa node,memdev=memfd0\n",
+		     c->sock_path);
+		break;
 	}
 }
 
@@ -1237,8 +1260,8 @@ static void tap_sock_unix_init(const struct ctx *c)
  */
 void tap_listen_handler(struct ctx *c, uint32_t events)
 {
-	union epoll_ref ref = { .type = EPOLL_TYPE_TAP_PASST };
 	struct epoll_event ev = { 0 };
+	union epoll_ref ref = { 0 };
 	int v = INT_MAX / 2;
 	struct ucred ucred;
 	socklen_t len;
@@ -1278,6 +1301,10 @@ void tap_listen_handler(struct ctx *c, uint32_t events)
 		trace("tap: failed to set SO_SNDBUF to %i", v);
 
 	ref.fd = c->fd_tap;
+	if (c->mode == MODE_VU)
+		ref.type = EPOLL_TYPE_VHOST_CMD;
+	else
+		ref.type = EPOLL_TYPE_TAP_PASST;
 	ev.events = EPOLLIN | EPOLLRDHUP;
 	ev.data.u64 = ref.u64;
 	epoll_ctl(c->epollfd, EPOLL_CTL_ADD, c->fd_tap, &ev);
@@ -1344,7 +1371,7 @@ static void tap_sock_tun_init(struct ctx *c)
  * @base:	Buffer base
  * @size:	Buffer size
  */
-static void tap_sock_update_pool(void *base, size_t size)
+void tap_sock_update_pool(void *base, size_t size)
 {
 	int i;
 
@@ -1364,7 +1391,10 @@ static void tap_sock_update_pool(void *base, size_t size)
  */
 void tap_backend_init(struct ctx *c)
 {
-	tap_sock_update_pool(pkt_buf, sizeof(pkt_buf));
+	if (c->mode == MODE_VU)
+		tap_sock_update_pool(NULL, 0);
+	else
+		tap_sock_update_pool(pkt_buf, sizeof(pkt_buf));
 
 	if (c->fd_tap != -1) { /* Passed as --fd */
 		struct epoll_event ev = { 0 };
@@ -1372,10 +1402,17 @@ void tap_backend_init(struct ctx *c)
 
 		ASSERT(c->one_off);
 		ref.fd = c->fd_tap;
-		if (c->mode == MODE_PASST)
+		switch (c->mode) {
+		case MODE_PASST:
 			ref.type = EPOLL_TYPE_TAP_PASST;
-		else
+			break;
+		case MODE_PASTA:
 			ref.type = EPOLL_TYPE_TAP_PASTA;
+			break;
+		case MODE_VU:
+			ref.type = EPOLL_TYPE_VHOST_CMD;
+			break;
+		}
 
 		ev.events = EPOLLIN | EPOLLRDHUP;
 		ev.data.u64 = ref.u64;
@@ -1383,9 +1420,14 @@ void tap_backend_init(struct ctx *c)
 		return;
 	}
 
-	if (c->mode == MODE_PASTA) {
+	switch (c->mode) {
+	case MODE_PASTA:
 		tap_sock_tun_init(c);
-	} else {
+		break;
+	case MODE_VU:
+		vu_init(c);
+		/* fall through */
+	case MODE_PASST:
 		tap_sock_unix_init(c);
 
 		/* In passt mode, we don't know the guest's MAC address until it
@@ -1393,6 +1435,7 @@ void tap_backend_init(struct ctx *c)
 		 * first packets will reach it.
 		 */
 		memset(&c->guest_mac, 0xff, sizeof(c->guest_mac));
+		break;
 	}
 
 	tap_backend_show_hints(c);
diff --git a/tap.h b/tap.h
index 8728cc5..dfbd8b9 100644
--- a/tap.h
+++ b/tap.h
@@ -40,7 +40,8 @@ static inline struct iovec tap_hdr_iov(const struct ctx *c,
  */
 static inline void tap_hdr_update(struct tap_hdr *thdr, size_t l2len)
 {
-	thdr->vnet_len = htonl(l2len);
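+	/* thdr is NULL when the frame is built for vhost-user buffers,
+	 * which don't carry the qemu socket length header
+	 */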
+	if (thdr)
+		thdr->vnet_len = htonl(l2len);
 }
 
 void tap_udp4_send(const struct ctx *c, struct in_addr src, in_port_t sport,
@@ -68,6 +69,8 @@ void tap_handler_pasta(struct ctx *c, uint32_t events,
 void tap_handler_passt(struct ctx *c, uint32_t events,
 		       const struct timespec *now);
 int tap_sock_unix_open(char *sock_path);
+void tap_sock_reset(struct ctx *c);
+void tap_sock_update_pool(void *base, size_t size);
 void tap_backend_init(struct ctx *c);
 void tap_flush_pools(void);
 void tap_handler(struct ctx *c, const struct timespec *now);
diff --git a/tcp.c b/tcp.c
index e08ffd3..e197a1a 100644
--- a/tcp.c
+++ b/tcp.c
@@ -304,6 +304,7 @@
 #include "flow_table.h"
 #include "tcp_internal.h"
 #include "tcp_buf.h"
+#include "tcp_vu.h"
 
 /* MSS rounding: see SET_MSS() */
 #define MSS_DEFAULT			536
@@ -1314,6 +1315,9 @@ int tcp_prepare_flags(const struct ctx *c, struct tcp_tap_conn *conn,
 static int tcp_send_flag(const struct ctx *c, struct tcp_tap_conn *conn,
 			 int flags)
 {
+	if (c->mode == MODE_VU)
+		return tcp_vu_send_flag(c, conn, flags);
+
 	return tcp_buf_send_flag(c, conn, flags);
 }
 
@@ -1707,6 +1711,9 @@ static int tcp_sock_consume(const struct tcp_tap_conn *conn, uint32_t ack_seq)
  */
 static int tcp_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn)
 {
+	if (c->mode == MODE_VU)
+		return tcp_vu_data_from_sock(c, conn);
+
 	return tcp_buf_data_from_sock(c, conn);
 }
 
diff --git a/tcp_vu.c b/tcp_vu.c
new file mode 100644
index 0000000..1bebb31
--- /dev/null
+++ b/tcp_vu.c
@@ -0,0 +1,498 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/* tcp_vu.c - TCP L2 vhost-user management functions
+ *
+ * Copyright Red Hat
+ * Author: Laurent Vivier <lvivier@redhat.com>
+ */
+
+#include <errno.h>
+#include <stddef.h>
+#include <stdint.h>
+
+#include <netinet/ip.h>
+#include <netinet/tcp.h>
+
+#include <sys/socket.h>
+
+#include <netinet/if_ether.h>
+#include <linux/virtio_net.h>
+
+#include "util.h"
+#include "ip.h"
+#include "passt.h"
+#include "siphash.h"
+#include "inany.h"
+#include "vhost_user.h"
+#include "tcp.h"
+#include "pcap.h"
+#include "flow.h"
+#include "tcp_conn.h"
+#include "flow_table.h"
+#include "tcp_vu.h"
+#include "tap.h"
+#include "tcp_internal.h"
+#include "checksum.h"
+#include "vu_common.h"
+#include <time.h>
+
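+/* Buffers for one batch of frames sent to the guest:
+ * - iov_vu:	iovec array passed to recvmsg(); entry 0 is only used to
+ *		discard already-sent data when SO_PEEK_OFF isn't available
+ * - elem:	virtqueue elements (guest buffers) backing iov_vu[1..]
+ * - head:	index in elem[] at which each L2 frame starts
+ * - head_cnt:	number of frames in the current batch
+ */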
+static struct iovec iov_vu[VIRTQUEUE_MAX_SIZE + 1];
+static struct vu_virtq_element elem[VIRTQUEUE_MAX_SIZE];
+static int head[VIRTQUEUE_MAX_SIZE + 1];
+static int head_cnt;
+
+/**
+ * tcp_vu_hdrlen() - Return the size of the headers of an L2 TCP frame
+ * @v6:		Set for IPv6 packet
+ *
+ * Return: Size of the virtio-net, Ethernet, IP and TCP headers
+ */
+static size_t tcp_vu_hdrlen(bool v6)
+{
+	size_t hdrlen;
+
+	hdrlen = sizeof(struct virtio_net_hdr_mrg_rxbuf) +
+		 sizeof(struct ethhdr) + sizeof(struct tcphdr);
+
+	if (v6)
+		hdrlen += sizeof(struct ipv6hdr);
+	else
+		hdrlen += sizeof(struct iphdr);
+
+	return hdrlen;
+}
+
+/**
+ * tcp_vu_update_check() - Calculate TCP checksum
+ * @tapside:	Address information for one side of the flow
+ * @iov:	Pointer to the array of IO vectors
+ * @iov_cnt:	Length of the array
+ */
+static void tcp_vu_update_check(const struct flowside *tapside,
+			        struct iovec *iov, int iov_cnt)
+{
+	char *base = iov[0].iov_base;
+
+	if (inany_v4(&tapside->oaddr)) {
+		const struct iphdr *iph = vu_ip(base);
+
+		tcp_update_check_tcp4(iph, iov, iov_cnt,
+				      (char *)vu_payloadv4(base) - base);
+	} else {
+		const struct ipv6hdr *ip6h = vu_ip(base);
+
+		tcp_update_check_tcp6(ip6h, iov, iov_cnt,
+				      (char *)vu_payloadv6(base) - base);
+	}
+}
+
+/**
+ * tcp_vu_send_flag() - Send segment with flags to vhost-user (no payload)
+ * @c:		Execution context
+ * @conn:	Connection pointer
+ * @flags:	TCP flags: if not set, send segment only if ACK is due
+ *
+ * Return: negative error code on connection reset, 0 otherwise
+ */
+int tcp_vu_send_flag(const struct ctx *c, struct tcp_tap_conn *conn, int flags)
+{
+	struct vu_dev *vdev = c->vdev;
+	struct vu_virtq *vq = &vdev->vq[VHOST_USER_RX_QUEUE];
+	const struct flowside *tapside = TAPFLOW(conn);
+	size_t l2len, l4len, optlen, hdrlen;
+	struct vu_virtq_element flags_elem[2];
+	struct tcp_payload_t *payload;
+	struct ipv6hdr *ip6h = NULL;
+	struct iovec flags_iov[2];
+	struct iphdr *iph = NULL;
+	struct ethhdr *eh;
+	uint32_t seq;
+	int elem_cnt;
+	int nb_ack;
+	int ret;
+
+	hdrlen = tcp_vu_hdrlen(CONN_V6(conn));
+
+	vu_set_element(&flags_elem[0], NULL, &flags_iov[0]);
+
+	elem_cnt = vu_collect(vdev, vq, &flags_elem[0], 1,
+			      hdrlen + sizeof(struct tcp_syn_opts), NULL);
+	if (elem_cnt != 1)
+		return -1;
+
+	ASSERT(flags_elem[0].in_sg[0].iov_len >=
+	       hdrlen + sizeof(struct tcp_syn_opts));
+
+	vu_set_vnethdr(vdev, flags_elem[0].in_sg[0].iov_base, 1);
+
+	eh = vu_eth(flags_elem[0].in_sg[0].iov_base);
+
+	memcpy(eh->h_dest, c->guest_mac, sizeof(eh->h_dest));
+	memcpy(eh->h_source, c->our_tap_mac, sizeof(eh->h_source));
+
+	if (CONN_V4(conn)) {
+		eh->h_proto = htons(ETH_P_IP);
+
+		iph = vu_ip(flags_elem[0].in_sg[0].iov_base);
+		*iph = (struct iphdr)L2_BUF_IP4_INIT(IPPROTO_TCP);
+
+		payload = vu_payloadv4(flags_elem[0].in_sg[0].iov_base);
+	} else {
+		eh->h_proto = htons(ETH_P_IPV6);
+
+		ip6h = vu_ip(flags_elem[0].in_sg[0].iov_base);
+		*ip6h = (struct ipv6hdr)L2_BUF_IP6_INIT(IPPROTO_TCP);
+		payload = vu_payloadv6(flags_elem[0].in_sg[0].iov_base);
+	}
+
+	memset(&payload->th, 0, sizeof(payload->th));
+	payload->th.doff = offsetof(struct tcp_payload_t, data) / 4;
+	payload->th.ack = 1;
+
+	seq = conn->seq_to_tap;
+	ret = tcp_prepare_flags(c, conn, flags, &payload->th,
+				(struct tcp_syn_opts *)payload->data,
+				&optlen);
+	if (ret <= 0) {
+		vu_queue_rewind(vq, 1);
+		return ret;
+	}
+
+	if (CONN_V4(conn)) {
+		l4len = tcp_fill_headers4(conn, NULL, iph, payload, optlen,
+					  NULL, seq, true);
+		l2len = sizeof(*iph);
+	} else {
+		l4len = tcp_fill_headers6(conn, NULL, ip6h, payload, optlen,
+					  seq, true);
+		l2len = sizeof(*ip6h);
+	}
+	l2len += l4len + sizeof(struct ethhdr);
+
+	flags_elem[0].in_sg[0].iov_len = l2len +
+				   sizeof(struct virtio_net_hdr_mrg_rxbuf);
+	if (*c->pcap) {
+		tcp_vu_update_check(tapside, &flags_elem[0].in_sg[0], 1);
+		pcap_iov(&flags_elem[0].in_sg[0], 1,
+			 sizeof(struct virtio_net_hdr_mrg_rxbuf));
+	}
+	nb_ack = 1;
+
+	if (flags & DUP_ACK) {
+		vu_set_element(&flags_elem[1], NULL, &flags_iov[1]);
+
+		elem_cnt = vu_collect(vdev, vq, &flags_elem[1], 1,
+				      flags_elem[0].in_sg[0].iov_len, NULL);
+		if (elem_cnt == 1 &&
+		    flags_elem[1].in_sg[0].iov_len >=
+		    flags_elem[0].in_sg[0].iov_len) {
+			memcpy(flags_elem[1].in_sg[0].iov_base,
+			       flags_elem[0].in_sg[0].iov_base,
+			       flags_elem[0].in_sg[0].iov_len);
+			nb_ack++;
+
+			if (*c->pcap) {
+				pcap_iov(&flags_elem[1].in_sg[0], 1,
+					 sizeof(struct virtio_net_hdr_mrg_rxbuf));
+			}
+		}
+	}
+
+	vu_flush(vdev, vq, flags_elem, nb_ack);
+
+	return 0;
+}
+
+/**
+ * tcp_vu_sock_recv() - Receive data stream from socket into vhost-user buffers
+ * @c:			Execution context
+ * @conn:		Connection pointer
+ * @v6:			Set for IPv6 connections
+ * @already_sent:	Number of bytes already sent
+ * @fillsize:		Maximum bytes to fill in guest-side receiving window
+ * @iov_cnt:		Number of iov entries used to store the data (output)
+ *
+ * Return: Number of iov entries used to store the data or negative error code
+ */
+static ssize_t tcp_vu_sock_recv(const struct ctx *c,
+				const struct tcp_tap_conn *conn, bool v6,
+				uint32_t already_sent, size_t fillsize,
+				int *iov_cnt)
+{
+	struct vu_dev *vdev = c->vdev;
+	struct vu_virtq *vq = &vdev->vq[VHOST_USER_RX_QUEUE];
+	struct msghdr mh_sock = { 0 };
+	uint16_t mss = MSS_GET(conn);
+	int s = conn->sock;
+	ssize_t ret, len;
+	size_t hdrlen;
+	int elem_cnt;
+	int i;
+
+	*iov_cnt = 0;
+
+	hdrlen = tcp_vu_hdrlen(v6);
+
+	vu_init_elem(elem, &iov_vu[1], VIRTQUEUE_MAX_SIZE);
+
+	elem_cnt = 0;
+	head_cnt = 0;
+	while (fillsize > 0 && elem_cnt < VIRTQUEUE_MAX_SIZE) {
+		struct iovec *iov;
+		size_t frame_size, dlen;
+		int cnt;
+
+		cnt = vu_collect(vdev, vq, &elem[elem_cnt],
+				 VIRTQUEUE_MAX_SIZE - elem_cnt,
+				 MIN(mss, fillsize) + hdrlen, &frame_size);
+		if (cnt == 0)
+			break;
+
+		dlen = frame_size - hdrlen;
+
+		/* reserve space for headers in iov */
+		iov = &elem[elem_cnt].in_sg[0];
+		ASSERT(iov->iov_len >= hdrlen);
+		iov->iov_base = (char *)iov->iov_base + hdrlen;
+		iov->iov_len -= hdrlen;
+		head[head_cnt++] = elem_cnt;
+
+		fillsize -= dlen;
+		elem_cnt += cnt;
+	}
+
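+	/* With SO_PEEK_OFF (peek_offset_cap), the kernel already skips the
+	 * bytes we peeked earlier; without it, peek the already-sent bytes
+	 * into the discard buffer (iov_vu[0]) and subtract them afterwards
+	 */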
+	if (peek_offset_cap) {
+		mh_sock.msg_iov = iov_vu + 1;
+		mh_sock.msg_iovlen = elem_cnt;
+	} else {
+		iov_vu[0].iov_base = tcp_buf_discard;
+		iov_vu[0].iov_len = already_sent;
+
+		mh_sock.msg_iov = iov_vu;
+		mh_sock.msg_iovlen = elem_cnt + 1;
+	}
+
+	do
+		ret = recvmsg(s, &mh_sock, MSG_PEEK);
+	while (ret < 0 && errno == EINTR);
+
+	if (ret < 0) {
+		vu_queue_rewind(vq, elem_cnt);
+		return -errno;
+	}
+
+	if (!peek_offset_cap)
+		ret -= already_sent;
+
+	/* adjust the number of used iov entries and the length of the last one */
+	len = ret;
+	for (i = 0; len && i < elem_cnt; i++) {
+		struct iovec *iov = &elem[i].in_sg[0];
+
+		if (iov->iov_len > (size_t)len)
+			iov->iov_len = len;
+
+		len -= iov->iov_len;
+	}
+	/* adjust head count */
+	while (head_cnt > 0 && head[head_cnt - 1] > i)
+		head_cnt--;
+	/* mark end of array */
+	head[head_cnt] = i;
+	*iov_cnt = i;
+
+	/* release unused buffers */
+	vu_queue_rewind(vq, elem_cnt - i);
+
+	/* restore space for headers in iov */
+	for (i = 0; i < head_cnt; i++) {
+		struct iovec *iov = &elem[head[i]].in_sg[0];
+
+		iov->iov_base = (char *)iov->iov_base - hdrlen;
+		iov->iov_len += hdrlen;
+	}
+
+	return ret;
+}
+
+/**
+ * tcp_vu_prepare() - Prepare the frame header
+ * @c:		Execution context
+ * @conn:	Connection pointer
+ * @base:	Pointer to the frame buffer
+ * @dlen:	Packet data length
+ * @check:	Checksum, if already known
+ */
+static void tcp_vu_prepare(const struct ctx *c,
+			   struct tcp_tap_conn *conn, char *base,
+			   size_t dlen, const uint16_t **check)
+{
+	const struct flowside *toside = TAPFLOW(conn);
+	struct tcp_payload_t *payload;
+	struct ipv6hdr *ip6h = NULL;
+	struct iphdr *iph = NULL;
+	struct ethhdr *eh;
+
+	/* We assume the first iovec provided by the guest is large enough
+	 * to hold all the headers of the L2 frame
+	 */
+
+	eh = vu_eth(base);
+
+	memcpy(eh->h_dest, c->guest_mac, sizeof(eh->h_dest));
+	memcpy(eh->h_source, c->our_tap_mac, sizeof(eh->h_source));
+
+	/* initialize header */
+
+	if (inany_v4(&toside->eaddr) && inany_v4(&toside->oaddr)) {
+		eh->h_proto = htons(ETH_P_IP);
+
+		iph = vu_ip(base);
+		*iph = (struct iphdr)L2_BUF_IP4_INIT(IPPROTO_TCP);
+		payload = vu_payloadv4(base);
+	} else {
+		eh->h_proto = htons(ETH_P_IPV6);
+
+		ip6h = vu_ip(base);
+		*ip6h = (struct ipv6hdr)L2_BUF_IP6_INIT(IPPROTO_TCP);
+
+		payload = vu_payloadv6(base);
+	}
+
+	memset(&payload->th, 0, sizeof(payload->th));
+	payload->th.doff = offsetof(struct tcp_payload_t, data) / 4;
+	payload->th.ack = 1;
+
+	if (inany_v4(&toside->eaddr) && inany_v4(&toside->oaddr)) {
+		tcp_fill_headers4(conn, NULL, iph, payload, dlen,
+				  *check, conn->seq_to_tap, true);
+		*check = &iph->check;
+	} else {
+		tcp_fill_headers6(conn, NULL, ip6h, payload, dlen,
+				  conn->seq_to_tap, true);
+	}
+}
+
+/**
+ * tcp_vu_data_from_sock() - Handle new data from socket, queue to vhost-user,
+ *			     in window
+ * @c:		Execution context
+ * @conn:	Connection pointer
+ *
+ * Return: Negative on connection reset, 0 otherwise
+ */
+int tcp_vu_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn)
+{
+	uint32_t wnd_scaled = conn->wnd_from_tap << conn->ws_from_tap;
+	struct vu_dev *vdev = c->vdev;
+	struct vu_virtq *vq = &vdev->vq[VHOST_USER_RX_QUEUE];
+	const struct flowside *tapside = TAPFLOW(conn);
+	size_t fillsize, hdrlen;
+	int v6 = CONN_V6(conn);
+	uint32_t already_sent;
+	const uint16_t *check;
+	int i, iov_cnt;
+	ssize_t len;
+
+	if (!vu_queue_enabled(vq) || !vu_queue_started(vq)) {
+		debug("Got packet, but RX virtqueue not usable yet");
+		return 0;
+	}
+
+	already_sent = conn->seq_to_tap - conn->seq_ack_from_tap;
+
+	if (SEQ_LT(already_sent, 0)) {
+		/* RFC 761, section 2.1. */
+		flow_trace(conn, "ACK sequence gap: ACK for %u, sent: %u",
+			   conn->seq_ack_from_tap, conn->seq_to_tap);
+		conn->seq_to_tap = conn->seq_ack_from_tap;
+		already_sent = 0;
+		if (tcp_set_peek_offset(conn->sock, 0)) {
+			tcp_rst(c, conn);
+			return -1;
+		}
+	}
+
+	if (!wnd_scaled || already_sent >= wnd_scaled) {
+		conn_flag(c, conn, STALLED);
+		conn_flag(c, conn, ACK_FROM_TAP_DUE);
+		return 0;
+	}
+
+	/* Set up buffer descriptors we'll fill completely and partially. */
+
+	fillsize = wnd_scaled - already_sent;
+
+	/* collect the buffers from vhost-user and fill them with the
+	 * data from the socket
+	 */
+	len = tcp_vu_sock_recv(c, conn, v6, already_sent, fillsize, &iov_cnt);
+	if (len < 0) {
+		if (len != -EAGAIN && len != -EWOULDBLOCK) {
+			tcp_rst(c, conn);
+			return len;
+		}
+		return 0;
+	}
+
+	if (!len) {
+		if (already_sent) {
+			conn_flag(c, conn, STALLED);
+		} else if ((conn->events & (SOCK_FIN_RCVD | TAP_FIN_SENT)) ==
+			   SOCK_FIN_RCVD) {
+			int ret = tcp_vu_send_flag(c, conn, FIN | ACK);
+			if (ret) {
+				tcp_rst(c, conn);
+				return ret;
+			}
+
+			conn_event(c, conn, TAP_FIN_SENT);
+		}
+
+		return 0;
+	}
+
+	conn_flag(c, conn, ~STALLED);
+
+	/* Likely, some new data was acked too. */
+	tcp_update_seqack_wnd(c, conn, false, NULL);
+
+	/* Initialize headers.
+	 *
+	 * iov_vu is an array of buffers, and each buffer can be smaller than
+	 * the frame we want to send: thanks to the virtio-net num_buffers
+	 * field we can merge several buffers into a single packet, so we
+	 * only need to write the packet headers into the first buffer and
+	 * set num_buffers to the number of merged buffers.
+	 */
+
+	hdrlen = tcp_vu_hdrlen(v6);
+	for (i = 0, check = NULL; i < head_cnt; i++) {
+		struct iovec *iov = &elem[head[i]].in_sg[0];
+		int buf_cnt = head[i + 1] - head[i];
+		int dlen = iov_size(iov, buf_cnt) - hdrlen;
+
+		vu_set_vnethdr(vdev, iov->iov_base, buf_cnt);
+
+		/* Compute the IPv4 header checksum only for the first
+		 * and the last frame: all the others share the same
+		 * checksum as the first one
+		 */
+		if (i + 1 == head_cnt)
+			check = NULL;
+
+		tcp_vu_prepare(c, conn, iov->iov_base, dlen, &check);
+
+		if (*c->pcap) {
+			tcp_vu_update_check(tapside, iov, buf_cnt);
+			pcap_iov(iov, buf_cnt,
+				 sizeof(struct virtio_net_hdr_mrg_rxbuf));
+		}
+
+		conn->seq_to_tap += dlen;
+	}
+
+	/* send packets */
+	vu_flush(vdev, vq, elem, iov_cnt);
+
+	conn_flag(c, conn, ACK_FROM_TAP_DUE);
+
+	return 0;
+}
diff --git a/tcp_vu.h b/tcp_vu.h
new file mode 100644
index 0000000..6ab6057
--- /dev/null
+++ b/tcp_vu.h
@@ -0,0 +1,12 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/* Copyright Red Hat
+ * Author: Laurent Vivier <lvivier@redhat.com>
+ */
+
+#ifndef TCP_VU_H
+#define TCP_VU_H
+
+int tcp_vu_send_flag(const struct ctx *c, struct tcp_tap_conn *conn, int flags);
+int tcp_vu_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn);
+
+#endif /* TCP_VU_H */
diff --git a/udp.c b/udp.c
index 9718ed8..5b0093a 100644
--- a/udp.c
+++ b/udp.c
@@ -110,6 +110,7 @@
 #include "log.h"
 #include "flow_table.h"
 #include "udp_internal.h"
+#include "udp_vu.h"
 
 /* "Spliced" sockets indexed by bound port (host order) */
 static int udp_splice_ns  [IP_VERSIONS][NUM_PORTS];
@@ -628,6 +629,11 @@ void udp_listen_sock_handler(const struct ctx *c,
 			     union epoll_ref ref, uint32_t events,
 			     const struct timespec *now)
 {
+	if (c->mode == MODE_VU) {
+		udp_vu_listen_sock_handler(c, ref, events, now);
+		return;
+	}
+
 	udp_buf_listen_sock_handler(c, ref, events, now);
 }
 
@@ -698,6 +704,11 @@ static void udp_buf_reply_sock_handler(const struct ctx *c, union epoll_ref ref,
 void udp_reply_sock_handler(const struct ctx *c, union epoll_ref ref,
 			    uint32_t events, const struct timespec *now)
 {
+	if (c->mode == MODE_VU) {
+		udp_vu_reply_sock_handler(c, ref, events, now);
+		return;
+	}
+
 	udp_buf_reply_sock_handler(c, ref, events, now);
 }
 
diff --git a/udp_vu.c b/udp_vu.c
new file mode 100644
index 0000000..c911022
--- /dev/null
+++ b/udp_vu.c
@@ -0,0 +1,343 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/* udp_vu.c - UDP L2 vhost-user management functions
+ *
+ * Copyright Red Hat
+ * Author: Laurent Vivier <lvivier@redhat.com>
+ */
+
+#include <unistd.h>
+#include <assert.h>
+#include <net/ethernet.h>
+#include <net/if.h>
+#include <netinet/in.h>
+#include <netinet/ip.h>
+#include <netinet/udp.h>
+#include <stdint.h>
+#include <stddef.h>
+#include <sys/uio.h>
+#include <linux/virtio_net.h>
+
+#include "checksum.h"
+#include "util.h"
+#include "ip.h"
+#include "siphash.h"
+#include "inany.h"
+#include "passt.h"
+#include "pcap.h"
+#include "log.h"
+#include "vhost_user.h"
+#include "udp_internal.h"
+#include "flow.h"
+#include "flow_table.h"
+#include "udp_flow.h"
+#include "udp_vu.h"
+#include "vu_common.h"
+
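+/* Guest buffers collected from the RX virtqueue (elem) and the matching
+ * iovec array (iov_vu) passed to recvmsg()
+ */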
+static struct iovec iov_vu[VIRTQUEUE_MAX_SIZE];
+static struct vu_virtq_element elem[VIRTQUEUE_MAX_SIZE];
+
+/**
+ * udp_vu_hdrlen() - Return the size of the headers of an L2 UDP frame
+ * @v6:		Set for IPv6 packet
+ *
+ * Return: Size of the virtio-net, Ethernet, IP and UDP headers
+ */
+static size_t udp_vu_hdrlen(bool v6)
+{
+	size_t hdrlen;
+
+	hdrlen = sizeof(struct virtio_net_hdr_mrg_rxbuf) +
+		 sizeof(struct ethhdr) + sizeof(struct udphdr);
+
+	if (v6)
+		hdrlen += sizeof(struct ipv6hdr);
+	else
+		hdrlen += sizeof(struct iphdr);
+
+	return hdrlen;
+}
+
+/**
+ * udp_vu_sock_info() - get socket information
+ * @s:		Socket to get information from
+ * @s_in:	Socket address (output)
+ *
+ * Return: 0 if socket address can be read, -1 otherwise
+ */
+static int udp_vu_sock_info(int s, union sockaddr_inany *s_in)
+{
+	struct msghdr msg = {
+		.msg_name = s_in,
+		.msg_namelen = sizeof(union sockaddr_inany),
+	};
+
+	return recvmsg(s, &msg, MSG_PEEK | MSG_DONTWAIT);
+}
+
+/**
+ * udp_vu_sock_recv() - Receive datagrams from socket into vhost-user buffers
+ * @c:		Execution context
+ * @s:		Socket to receive from
+ * @events:	epoll events bitmap
+ * @v6:		Set for IPv6 connections
+ * @dlen:	Size of received data (output)
+ *
+ * Return: Number of iov entries used to store the datagram
+ */
+static int udp_vu_sock_recv(const struct ctx *c, int s, uint32_t events,
+			    bool v6, ssize_t *dlen)
+{
+	struct vu_dev *vdev = c->vdev;
+	struct vu_virtq *vq = &vdev->vq[VHOST_USER_RX_QUEUE];
+	int iov_cnt, idx, iov_used;
+	struct msghdr msg  = { 0 };
+	size_t off, hdrlen;
+
+	ASSERT(!c->no_udp);
+
+	if (!(events & EPOLLIN))
+		return 0;
+
+	/* compute L2 header length */
+	hdrlen = udp_vu_hdrlen(v6);
+
+	vu_init_elem(elem, iov_vu, VIRTQUEUE_MAX_SIZE);
+
+	iov_cnt = vu_collect(vdev, vq, elem, VIRTQUEUE_MAX_SIZE,
+			     IP_MAX_MTU - sizeof(struct udphdr) + hdrlen,
+			     NULL);
+	if (iov_cnt == 0)
+		return 0;
+
+	/* reserve space for the headers */
+	ASSERT(iov_vu[0].iov_len >= hdrlen);
+	iov_vu[0].iov_base = (char *)iov_vu[0].iov_base + hdrlen;
+	iov_vu[0].iov_len -= hdrlen;
+
+	/* read data from the socket */
+	msg.msg_iov = iov_vu;
+	msg.msg_iovlen = iov_cnt;
+
+	*dlen = recvmsg(s, &msg, 0);
+	if (*dlen < 0) {
+		vu_queue_rewind(vq, iov_cnt);
+		return 0;
+	}
+
+	/* restore the pointer to the start of the headers */
+	iov_vu[0].iov_base = (char *)iov_vu[0].iov_base - hdrlen;
+	iov_vu[0].iov_len += hdrlen;
+
+	/* count the number of buffers filled by recvmsg() */
+	idx = iov_skip_bytes(iov_vu, iov_cnt, *dlen + hdrlen, &off);
+
+	/* adjust last iov length */
+	if (idx < iov_cnt)
+		iov_vu[idx].iov_len = off;
+	iov_used = idx + !!off;
+
+	vu_set_vnethdr(vdev, iov_vu[0].iov_base, iov_used);
+
+	/* release unused buffers */
+	vu_queue_rewind(vq, iov_cnt - iov_used);
+
+	return iov_used;
+}
+
+/**
+ * udp_vu_prepare() - Prepare the packet header
+ * @c:		Execution context
+ * @toside:	Address information for one side of the flow
+ * @dlen:	Packet data length
+ *
+ * Return: Layer-4 length
+ */
+static size_t udp_vu_prepare(const struct ctx *c,
+			     const struct flowside *toside, ssize_t dlen)
+{
+	struct ethhdr *eh;
+	size_t l4len;
+
+	/* ethernet header */
+	eh = vu_eth(iov_vu[0].iov_base);
+
+	memcpy(eh->h_dest, c->guest_mac, sizeof(eh->h_dest));
+	memcpy(eh->h_source, c->our_tap_mac, sizeof(eh->h_source));
+
+	/* initialize header */
+	if (inany_v4(&toside->eaddr) && inany_v4(&toside->oaddr)) {
+		struct iphdr *iph = vu_ip(iov_vu[0].iov_base);
+		struct udp_payload_t *bp = vu_payloadv4(iov_vu[0].iov_base);
+
+		eh->h_proto = htons(ETH_P_IP);
+
+		*iph = (struct iphdr)L2_BUF_IP4_INIT(IPPROTO_UDP);
+
+		l4len = udp_update_hdr4(iph, bp, toside, dlen, true);
+	} else {
+		struct ipv6hdr *ip6h = vu_ip(iov_vu[0].iov_base);
+		struct udp_payload_t *bp = vu_payloadv6(iov_vu[0].iov_base);
+
+		eh->h_proto = htons(ETH_P_IPV6);
+
+		*ip6h = (struct ipv6hdr)L2_BUF_IP6_INIT(IPPROTO_UDP);
+
+		l4len = udp_update_hdr6(ip6h, bp, toside, dlen, true);
+	}
+
+	return l4len;
+}
+
+/**
+ * udp_vu_csum() - Calculate and set checksum for a UDP packet
+ * @toside:	Address information for one side of the flow
+ * @iov_used:	Number of used iov_vu items
+ */
+static void udp_vu_csum(const struct flowside *toside, int iov_used)
+{
+	const struct in_addr *src4 = inany_v4(&toside->oaddr);
+	const struct in_addr *dst4 = inany_v4(&toside->eaddr);
+	char *base = iov_vu[0].iov_base;
+	struct udp_payload_t *bp;
+
+	if (src4 && dst4) {
+		bp = vu_payloadv4(base);
+		csum_udp4(&bp->uh, *src4, *dst4, iov_vu, iov_used,
+			  (char *)&bp->data - base);
+	} else {
+		bp = vu_payloadv6(base);
+		csum_udp6(&bp->uh, &toside->oaddr.a6, &toside->eaddr.a6,
+			  iov_vu, iov_used, (char *)&bp->data - base);
+	}
+}
+
+/**
+ * udp_vu_listen_sock_handler() - Handle new data from socket
+ * @c:		Execution context
+ * @ref:	epoll reference
+ * @events:	epoll events bitmap
+ * @now:	Current timestamp
+ */
+void udp_vu_listen_sock_handler(const struct ctx *c, union epoll_ref ref,
+				uint32_t events, const struct timespec *now)
+{
+	struct vu_dev *vdev = c->vdev;
+	struct vu_virtq *vq = &vdev->vq[VHOST_USER_RX_QUEUE];
+	int i;
+
+	if (udp_sock_errs(c, ref.fd, events) < 0) {
+		err("UDP: Unrecoverable error on listening socket:"
+		    " (%s port %hu)", pif_name(ref.udp.pif), ref.udp.port);
+		return;
+	}
+
+	for (i = 0; i < UDP_MAX_FRAMES; i++) {
+		const struct flowside *toside;
+		union sockaddr_inany s_in;
+		flow_sidx_t sidx;
+		uint8_t pif;
+		ssize_t dlen;
+		int iov_used;
+		bool v6;
+
+		if (udp_vu_sock_info(ref.fd, &s_in) < 0)
+			break;
+
+		sidx = udp_flow_from_sock(c, ref, &s_in, now);
+		pif = pif_at_sidx(sidx);
+
+		if (pif != PIF_TAP) {
+			if (flow_sidx_valid(sidx)) {
+				flow_sidx_t fromsidx = flow_sidx_opposite(sidx);
+				struct udp_flow *uflow = udp_at_sidx(sidx);
+
+				flow_err(uflow,
+					"No support for forwarding UDP from %s to %s",
+					pif_name(pif_at_sidx(fromsidx)),
+					pif_name(pif));
+			} else {
+				debug("Discarding 1 datagram without flow");
+			}
+
+			continue;
+		}
+
+		toside = flowside_at_sidx(sidx);
+
+		v6 = !(inany_v4(&toside->eaddr) && inany_v4(&toside->oaddr));
+
+		iov_used = udp_vu_sock_recv(c, ref.fd, events, v6, &dlen);
+		if (iov_used <= 0)
+			break;
+
+		udp_vu_prepare(c, toside, dlen);
+		if (*c->pcap) {
+			udp_vu_csum(toside, iov_used);
+			pcap_iov(iov_vu, iov_used,
+				 sizeof(struct virtio_net_hdr_mrg_rxbuf));
+		}
+		vu_flush(vdev, vq, elem, iov_used);
+	}
+}
+
+/**
+ * udp_vu_reply_sock_handler() - Handle new data from flow-specific socket
+ * @c:		Execution context
+ * @ref:	epoll reference
+ * @events:	epoll events bitmap
+ * @now:	Current timestamp
+ */
+void udp_vu_reply_sock_handler(const struct ctx *c, union epoll_ref ref,
+			        uint32_t events, const struct timespec *now)
+{
+	flow_sidx_t tosidx = flow_sidx_opposite(ref.flowside);
+	const struct flowside *toside = flowside_at_sidx(tosidx);
+	struct udp_flow *uflow = udp_at_sidx(ref.flowside);
+	int from_s = uflow->s[ref.flowside.sidei];
+	struct vu_dev *vdev = c->vdev;
+	struct vu_virtq *vq = &vdev->vq[VHOST_USER_RX_QUEUE];
+	int i;
+
+	ASSERT(!c->no_udp);
+
+	if (udp_sock_errs(c, from_s, events) < 0) {
+		flow_err(uflow, "Unrecoverable error on reply socket");
+		flow_err_details(uflow);
+		udp_flow_close(c, uflow);
+		return;
+	}
+
+	for (i = 0; i < UDP_MAX_FRAMES; i++) {
+		uint8_t topif = pif_at_sidx(tosidx);
+		ssize_t dlen;
+		int iov_used;
+		bool v6;
+
+		ASSERT(uflow);
+
+		if (topif != PIF_TAP) {
+			uint8_t frompif = pif_at_sidx(ref.flowside);
+
+			flow_err(uflow,
+				 "No support for forwarding UDP from %s to %s",
+				 pif_name(frompif), pif_name(topif));
+			continue;
+		}
+
+		v6 = !(inany_v4(&toside->eaddr) && inany_v4(&toside->oaddr));
+
+		iov_used = udp_vu_sock_recv(c, from_s, events, v6, &dlen);
+		if (iov_used <= 0)
+			break;
+		flow_trace(uflow, "Received 1 datagram on reply socket");
+		uflow->ts = now->tv_sec;
+
+		udp_vu_prepare(c, toside, dlen);
+		if (*c->pcap) {
+			udp_vu_csum(toside, iov_used);
+			pcap_iov(iov_vu, iov_used,
+				 sizeof(struct virtio_net_hdr_mrg_rxbuf));
+		}
+		vu_flush(vdev, vq, elem, iov_used);
+	}
+}
diff --git a/udp_vu.h b/udp_vu.h
new file mode 100644
index 0000000..ba7018d
--- /dev/null
+++ b/udp_vu.h
@@ -0,0 +1,13 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/* Copyright Red Hat
+ * Author: Laurent Vivier <lvivier@redhat.com>
+ */
+
+#ifndef UDP_VU_H
+#define UDP_VU_H
+
+void udp_vu_listen_sock_handler(const struct ctx *c, union epoll_ref ref,
+				uint32_t events, const struct timespec *now);
+void udp_vu_reply_sock_handler(const struct ctx *c, union epoll_ref ref,
+			       uint32_t events, const struct timespec *now);
+#endif /* UDP_VU_H */
diff --git a/vhost_user.c b/vhost_user.c
index 89627a2..51c90db 100644
--- a/vhost_user.c
+++ b/vhost_user.c
@@ -48,12 +48,13 @@
 /* vhost-user version we are compatible with */
 #define VHOST_USER_VERSION 1
 
+static struct vu_dev vdev_storage;
+
 /**
  * vu_print_capabilities() - print vhost-user capabilities
  * 			     this is part of the vhost-user backend
  * 			     convention.
  */
-/* cppcheck-suppress unusedFunction */
 void vu_print_capabilities(void)
 {
 	info("{");
@@ -163,9 +164,7 @@ static void vmsg_close_fds(const struct vhost_user_msg *vmsg)
  */
 static void vu_remove_watch(const struct vu_dev *vdev, int fd)
 {
-	/* Placeholder to add passt related code */
-	(void)vdev;
-	(void)fd;
+	epoll_ctl(vdev->context->epollfd, EPOLL_CTL_DEL, fd, NULL);
 }
 
 /**
@@ -487,6 +486,14 @@ static bool vu_set_mem_table_exec(struct vu_dev *vdev,
 		}
 	}
 
+	/* As vu_packet_check_range() has no access to the number of
+	 * memory regions, mark the end of the array with mmap_addr = 0
+	 */
+	ASSERT(vdev->nregions < VHOST_USER_MAX_RAM_SLOTS - 1);
+	vdev->regions[vdev->nregions].mmap_addr = 0;
+
+	tap_sock_update_pool(vdev->regions, 0);
+
 	return false;
 }
 
@@ -615,9 +622,16 @@ static bool vu_get_vring_base_exec(struct vu_dev *vdev,
  */
 static void vu_set_watch(const struct vu_dev *vdev, int idx)
 {
-	/* Placeholder to add passt related code */
-	(void)vdev;
-	(void)idx;
+	union epoll_ref ref = {
+		.type = EPOLL_TYPE_VHOST_KICK,
+		.fd = vdev->vq[idx].kick_fd,
+		.queue = idx
+	 };
+	struct epoll_event ev = { 0 };
+
+	ev.data.u64 = ref.u64;
+	ev.events = EPOLLIN;
+	epoll_ctl(vdev->context->epollfd, EPOLL_CTL_ADD, ref.fd, &ev);
 }
 
 /**
@@ -829,14 +843,14 @@ static bool vu_set_vring_enable_exec(struct vu_dev *vdev,
  * @c:		execution context
  * @vdev:	vhost-user device
  */
-/* cppcheck-suppress unusedFunction */
-void vu_init(struct ctx *c, struct vu_dev *vdev)
+void vu_init(struct ctx *c)
 {
 	int i;
 
-	vdev->context = c;
+	c->vdev = &vdev_storage;
+	c->vdev->context = c;
 	for (i = 0; i < VHOST_USER_MAX_QUEUES; i++) {
-		vdev->vq[i] = (struct vu_virtq){
+		c->vdev->vq[i] = (struct vu_virtq){
 			.call_fd = -1,
 			.kick_fd = -1,
 			.err_fd = -1,
@@ -849,7 +863,6 @@ void vu_init(struct ctx *c, struct vu_dev *vdev)
  * vu_cleanup() - Reset vhost-user device
  * @vdev:	vhost-user device
  */
-/* cppcheck-suppress unusedFunction */
 void vu_cleanup(struct vu_dev *vdev)
 {
 	unsigned int i;
@@ -896,8 +909,7 @@ void vu_cleanup(struct vu_dev *vdev)
  */
 static void vu_sock_reset(struct vu_dev *vdev)
 {
-	/* Placeholder to add passt related code */
-	(void)vdev;
+	tap_sock_reset(vdev->context);
 }
 
 static bool (*vu_handle[VHOST_USER_MAX])(struct vu_dev *vdev,
@@ -925,7 +937,6 @@ static bool (*vu_handle[VHOST_USER_MAX])(struct vu_dev *vdev,
  * @fd:		vhost-user message socket
  * @events:	epoll events
  */
-/* cppcheck-suppress unusedFunction */
 void vu_control_handler(struct vu_dev *vdev, int fd, uint32_t events)
 {
 	struct vhost_user_msg msg = { 0 };
diff --git a/vhost_user.h b/vhost_user.h
index 5af349b..464ba21 100644
--- a/vhost_user.h
+++ b/vhost_user.h
@@ -183,7 +183,6 @@ struct vhost_user_msg {
  *
  * Return: true if the virqueue is enabled, false otherwise
  */
-/* cppcheck-suppress unusedFunction */
 static inline bool vu_queue_enabled(const struct vu_virtq *vq)
 {
 	return vq->enable;
@@ -195,14 +194,13 @@ static inline bool vu_queue_enabled(const struct vu_virtq *vq)
  *
  * Return: true if the virqueue is started, false otherwise
  */
-/* cppcheck-suppress unusedFunction */
 static inline bool vu_queue_started(const struct vu_virtq *vq)
 {
 	return vq->started;
 }
 
 void vu_print_capabilities(void);
-void vu_init(struct ctx *c, struct vu_dev *vdev);
+void vu_init(struct ctx *c);
 void vu_cleanup(struct vu_dev *vdev);
 void vu_control_handler(struct vu_dev *vdev, int fd, uint32_t events);
 #endif /* VHOST_USER_H */
diff --git a/virtio.c b/virtio.c
index b23a68c..6a97435 100644
--- a/virtio.c
+++ b/virtio.c
@@ -325,7 +325,6 @@ static bool vring_can_notify(const struct vu_dev *dev, struct vu_virtq *vq)
  * @dev:	Vhost-user device
  * @vq:		Virtqueue
  */
-/* cppcheck-suppress unusedFunction */
 void vu_queue_notify(const struct vu_dev *dev, struct vu_virtq *vq)
 {
 	if (!vring_can_notify(dev, vq)) {
@@ -498,7 +497,6 @@ static int vu_queue_map_desc(struct vu_dev *dev, struct vu_virtq *vq, unsigned i
  *
  * Return: -1 if there is an error, 0 otherwise
  */
-/* cppcheck-suppress unusedFunction */
 int vu_queue_pop(struct vu_dev *dev, struct vu_virtq *vq, struct vu_virtq_element *elem)
 {
 	unsigned int head;
@@ -556,7 +554,6 @@ void vu_queue_unpop(struct vu_virtq *vq)
  * @vq:		Virtqueue
  * @num:	Number of element to unpop
  */
-/* cppcheck-suppress unusedFunction */
 bool vu_queue_rewind(struct vu_virtq *vq, unsigned int num)
 {
 	if (num > vq->inuse)
@@ -609,7 +606,6 @@ void vu_queue_fill_by_index(struct vu_virtq *vq, unsigned int index,
  * @len:	Size of the element
  * @idx:	Used ring entry index
  */
-/* cppcheck-suppress unusedFunction */
 void vu_queue_fill(struct vu_virtq *vq, const struct vu_virtq_element *elem,
 		   unsigned int len, unsigned int idx)
 {
@@ -633,7 +629,6 @@ static inline void vring_used_idx_set(struct vu_virtq *vq, uint16_t val)
  * @vq:		Virtqueue
  * @count:	Number of entry to flush
  */
-/* cppcheck-suppress unusedFunction */
 void vu_queue_flush(struct vu_virtq *vq, unsigned int count)
 {
 	uint16_t old, new;
diff --git a/vu_common.c b/vu_common.c
new file mode 100644
index 0000000..f2eb701
--- /dev/null
+++ b/vu_common.c
@@ -0,0 +1,283 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/* Copyright Red Hat
+ * Author: Laurent Vivier <lvivier@redhat.com>
+ *
+ * vu_common.c - vhost-user common UDP and TCP functions
+ */
+
+#include <unistd.h>
+#include <sys/uio.h>
+#include <sys/eventfd.h>
+#include <netinet/if_ether.h>
+#include <linux/virtio_net.h>
+
+#include "util.h"
+#include "passt.h"
+#include "tap.h"
+#include "vhost_user.h"
+#include "pcap.h"
+#include "vu_common.h"
+
+/**
+ * vu_packet_check_range() - Check if a given memory zone is contained in
+ * 			     a mapped guest memory region
+ * @buf:	Array of the available memory regions
+ * @offset:	Offset of data range in packet descriptor
+ * @size:	Length of desired data range
+ * @start:	Start of the packet descriptor
+ *
+ * Return: 0 if the zone is in a mapped memory region, -1 otherwise
+ */
+int vu_packet_check_range(void *buf, size_t offset, size_t len,
+			  const char *start)
+{
+	struct vu_dev_region *dev_region;
+
+	for (dev_region = buf; dev_region->mmap_addr; dev_region++) {
+		/* NOLINTNEXTLINE(performance-no-int-to-ptr) */
+		char *m = (char *)dev_region->mmap_addr;
+
+		if (m <= start &&
+		    start + offset + len <= m + dev_region->mmap_offset +
+					       dev_region->size)
+			return 0;
+	}
+
+	return -1;
+}
+
+/**
+ * vu_init_elem() - initialize an array of virtqueue elements with 1 iov in each
+ * @elem:	Array of virtqueue elements to initialize
+ * @iov:	Array of iovec to assign to the virtqueue elements
+ * @elem_cnt:	Number of virtqueue elements
+ */
+void vu_init_elem(struct vu_virtq_element *elem, struct iovec *iov, int elem_cnt)
+{
+	int i;
+
+	for (i = 0; i < elem_cnt; i++)
+		vu_set_element(&elem[i], NULL, &iov[i]);
+}
+
+/**
+ * vu_collect() - collect virtio buffers from a given virtqueue
+ * @vdev:		vhost-user device
+ * @vq:			virtqueue to collect from
+ * @elem:		Array of virtqueue elements;
+ * 			each element must be initialized with one iovec entry
+ * 			in the in_sg array.
+ * @max_elem:		Number of virtqueue elements in the array
+ * @size:		Maximum size of the data in the frame
+ * @frame_size:		The total size of the buffers (output)
+ *
+ * Return: number of elements used to contain the frame
+ */
+int vu_collect(struct vu_dev *vdev, struct vu_virtq *vq,
+	       struct vu_virtq_element *elem, int max_elem,
+	       size_t size, size_t *frame_size)
+{
+	size_t current_size = 0;
+	int elem_cnt = 0;
+
+	while (current_size < size && elem_cnt < max_elem) {
+		struct iovec *iov;
+		int ret;
+
+		ret = vu_queue_pop(vdev, vq, &elem[elem_cnt]);
+		if (ret < 0)
+			break;
+
+		if (elem[elem_cnt].in_num < 1) {
+			warn("virtio-net receive queue contains no in buffers");
+			vu_queue_detach_element(vq);
+			break;
+		}
+
+		iov = &elem[elem_cnt].in_sg[0];
+
+		if (iov->iov_len > size - current_size)
+			iov->iov_len = size - current_size;
+
+		current_size += iov->iov_len;
+		elem_cnt++;
+
+		if (!vu_has_feature(vdev, VIRTIO_NET_F_MRG_RXBUF))
+			break;
+	}
+
+	if (frame_size)
+		*frame_size = current_size;
+
+	return elem_cnt;
+}
+
+/**
+ * vu_set_vnethdr() - set virtio-net headers
+ * @vdev:		vhost-user device
+ * @vnethdr:		Address of the header to set
+ * @num_buffers:	Number of guest buffers of the frame
+ */
+void vu_set_vnethdr(const struct vu_dev *vdev,
+		    struct virtio_net_hdr_mrg_rxbuf *vnethdr,
+		    int num_buffers)
+{
+	vnethdr->hdr = VU_HEADER;
+	if (vu_has_feature(vdev, VIRTIO_NET_F_MRG_RXBUF))
+		vnethdr->num_buffers = htole16(num_buffers);
+}
+
+/**
+ * vu_flush() - flush all the collected buffers to the vhost-user interface
+ * @vdev:	vhost-user device
+ * @vq:		vhost-user virtqueue
+ * @elem:	virtqueue elements array to send back to the virtqueue
+ * @elem_cnt:	Length of the array
+ */
+void vu_flush(const struct vu_dev *vdev, struct vu_virtq *vq,
+	      struct vu_virtq_element *elem, int elem_cnt)
+{
+	int i;
+
+	for (i = 0; i < elem_cnt; i++)
+		vu_queue_fill(vq, &elem[i], elem[i].in_sg[0].iov_len, i);
+
+	vu_queue_flush(vq, elem_cnt);
+	vu_queue_notify(vdev, vq);
+}
+
+/**
+ * vu_handle_tx() - Receive data from the TX virtqueue
+ * @vdev:	vhost-user device
+ * @index:	index of the virtqueue
+ * @now:	Current timestamp
+ */
+static void vu_handle_tx(struct vu_dev *vdev, int index,
+			 const struct timespec *now)
+{
+	struct vu_virtq_element elem[VIRTQUEUE_MAX_SIZE];
+	struct iovec out_sg[VIRTQUEUE_MAX_SIZE];
+	struct vu_virtq *vq = &vdev->vq[index];
+	int hdrlen = sizeof(struct virtio_net_hdr_mrg_rxbuf);
+	int out_sg_count;
+	int count;
+
+	ASSERT(VHOST_USER_IS_QUEUE_TX(index));
+
+	tap_flush_pools();
+
+	count = 0;
+	out_sg_count = 0;
+	while (count < VIRTQUEUE_MAX_SIZE) {
+		int ret;
+
+		vu_set_element(&elem[count], &out_sg[out_sg_count], NULL);
+		ret = vu_queue_pop(vdev, vq, &elem[count]);
+		if (ret < 0)
+			break;
+		out_sg_count += elem[count].out_num;
+
+		if (elem[count].out_num < 1) {
+			warn("virtio-net transmit queue contains no out buffers");
+			break;
+		}
+		ASSERT(elem[count].out_num == 1);
+
+		tap_add_packet(vdev->context,
+			       elem[count].out_sg[0].iov_len - hdrlen,
+			       (char *)elem[count].out_sg[0].iov_base + hdrlen);
+		count++;
+	}
+	tap_handler(vdev->context, now);
+
+	if (count) {
+		int i;
+
+		for (i = 0; i < count; i++)
+			vu_queue_fill(vq, &elem[i], 0, i);
+		vu_queue_flush(vq, count);
+		vu_queue_notify(vdev, vq);
+	}
+}
+
+/**
+ * vu_kick_cb() - Called on a kick event to start to receive data
+ * @vdev:	vhost-user device
+ * @ref:	epoll reference information
+ * @now:	Current timestamp
+ */
+void vu_kick_cb(struct vu_dev *vdev, union epoll_ref ref,
+		const struct timespec *now)
+{
+	eventfd_t kick_data;
+	ssize_t rc;
+
+	rc = eventfd_read(ref.fd, &kick_data);
+	if (rc == -1)
+		die_perror("vhost-user kick eventfd_read()");
+
+	debug("vhost-user: got kick_data: %016"PRIx64" idx: %d",
+	      kick_data, ref.queue);
+	if (VHOST_USER_IS_QUEUE_TX(ref.queue))
+		vu_handle_tx(vdev, ref.queue, now);
+}
+
+/**
+ * vu_send_single() - Send a buffer to the front-end using the RX virtqueue
+ * @c:		execution context
+ * @buf:	address of the buffer
+ * @size:	size of the buffer
+ *
+ * Return: number of bytes sent, -1 if there is an error
+ */
+int vu_send_single(const struct ctx *c, const void *buf, size_t size)
+{
+	struct vu_dev *vdev = c->vdev;
+	struct vu_virtq *vq = &vdev->vq[VHOST_USER_RX_QUEUE];
+	struct vu_virtq_element elem[VIRTQUEUE_MAX_SIZE];
+	struct iovec in_sg[VIRTQUEUE_MAX_SIZE];
+	size_t total;
+	int elem_cnt;
+	int i;
+
+	debug("vu_send_single size %zu", size);
+
+	if (!vu_queue_enabled(vq) || !vu_queue_started(vq)) {
+		debug("Got packet, but RX virtqueue not usable yet");
+		return -1;
+	}
+
+	vu_init_elem(elem, in_sg, VIRTQUEUE_MAX_SIZE);
+
+	size += sizeof(struct virtio_net_hdr_mrg_rxbuf);
+	elem_cnt = vu_collect(vdev, vq, elem, VIRTQUEUE_MAX_SIZE, size, &total);
+	if (total < size) {
+		debug("vu_send_single: no space to send the data "
+		      "elem_cnt %d size %zd", elem_cnt, total);
+		goto err;
+	}
+
+	vu_set_vnethdr(vdev, in_sg[0].iov_base, elem_cnt);
+
+	total -= sizeof(struct virtio_net_hdr_mrg_rxbuf);
+
+	/* copy data from the buffer to the iovec */
+	iov_from_buf(in_sg, elem_cnt, sizeof(struct virtio_net_hdr_mrg_rxbuf),
+		     buf, total);
+
+	if (*c->pcap) {
+		pcap_iov(in_sg, elem_cnt,
+			 sizeof(struct virtio_net_hdr_mrg_rxbuf));
+	}
+
+	vu_flush(vdev, vq, elem, elem_cnt);
+
+	debug("vhost-user sent %zu", total);
+
+	return total;
+err:
+	for (i = 0; i < elem_cnt; i++)
+		vu_queue_detach_element(vq);
+
+	return -1;
+}
diff --git a/vu_common.h b/vu_common.h
new file mode 100644
index 0000000..901d972
--- /dev/null
+++ b/vu_common.h
@@ -0,0 +1,60 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later
+ * Copyright Red Hat
+ * Author: Laurent Vivier <lvivier@redhat.com>
+ *
+ * vhost-user common UDP and TCP functions
+ */
+
+#ifndef VU_COMMON_H
+#define VU_COMMON_H
+#include <linux/virtio_net.h>
+
+static inline void *vu_eth(void *base)
+{
+	return ((char *)base + sizeof(struct virtio_net_hdr_mrg_rxbuf));
+}
+
+static inline void *vu_ip(void *base)
+{
+	return (struct ethhdr *)vu_eth(base) + 1;
+}
+
+static inline void *vu_payloadv4(void *base)
+{
+	return (struct iphdr *)vu_ip(base) + 1;
+}
+
+static inline void *vu_payloadv6(void *base)
+{
+	return (struct ipv6hdr *)vu_ip(base) + 1;
+}
+
+/**
+ * vu_set_element() - Initialize a vu_virtq_element
+ * @elem:	Element to initialize
+ * @out_sg:	One out iovec entry to set in elem
+ * @in_sg:	One in iovec entry to set in elem
+ */
+static inline void vu_set_element(struct vu_virtq_element *elem,
+				  struct iovec *out_sg, struct iovec *in_sg)
+{
+	elem->out_num = !!out_sg;
+	elem->out_sg = out_sg;
+	elem->in_num = !!in_sg;
+	elem->in_sg = in_sg;
+}
+
+void vu_init_elem(struct vu_virtq_element *elem, struct iovec *iov,
+		  int elem_cnt);
+int vu_collect(struct vu_dev *vdev, struct vu_virtq *vq,
+	       struct vu_virtq_element *elem, int max_elem, size_t size,
+	       size_t *frame_size);
+void vu_set_vnethdr(const struct vu_dev *vdev,
+		    struct virtio_net_hdr_mrg_rxbuf *vnethdr,
+                    int num_buffers);
+void vu_flush(const struct vu_dev *vdev, struct vu_virtq *vq,
+	      struct vu_virtq_element *elem, int elem_cnt);
+void vu_kick_cb(struct vu_dev *vdev, union epoll_ref ref,
+		const struct timespec *now);
+int vu_send_single(const struct ctx *c, const void *buf, size_t size);
+#endif /* VU_COMMON_H */

From 676bf5488ec4bd4312dbae4be1a1bb2ed02bd2ba Mon Sep 17 00:00:00 2001
From: Stefano Brivio <sbrivio@redhat.com>
Date: Fri, 22 Nov 2024 17:43:35 +0100
Subject: [PATCH 140/382] test: Add tests for passt in vhost-user mode

Run functional and performance tests for vhost-user mode as well. For
functional tests, we add passt_vu and passt_vu_in_ns as symbolic links
to their non-vhost-user counterparts, as no differences are intended
but we want to distinguish them in test logs.

For performance tests, instead, we add separate perf/passt_vu_tcp and
perf/passt_vu_udp files, as we need longer test duration, as well as
higher UDP sending bandwidths and larger TCP windows, to actually get
the highest throughput vhost-user mode offers.

For valgrind tests, vhost-user mode needs two extra system calls:
statx and readlink. Add them as EXTRA_SYSCALLS for the valgrind
target.

Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Signed-off-by: Laurent Vivier <lvivier@redhat.com>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
---
 Makefile               |   3 +-
 test/lib/perf_report   |  15 +++
 test/lib/setup         |  77 ++++++++++++---
 test/lib/setup_ugly    |   2 +-
 test/passt_vu          |   1 +
 test/passt_vu_in_ns    |   1 +
 test/perf/passt_vu_tcp | 211 +++++++++++++++++++++++++++++++++++++++++
 test/perf/passt_vu_udp | 159 +++++++++++++++++++++++++++++++
 test/run               |  25 +++++
 test/two_guests_vu     |   1 +
 10 files changed, 479 insertions(+), 16 deletions(-)
 create mode 120000 test/passt_vu
 create mode 120000 test/passt_vu_in_ns
 create mode 100644 test/perf/passt_vu_tcp
 create mode 100644 test/perf/passt_vu_udp
 create mode 120000 test/two_guests_vu

diff --git a/Makefile b/Makefile
index faa5c23..cb74480 100644
--- a/Makefile
+++ b/Makefile
@@ -101,7 +101,8 @@ qrap: $(QRAP_SRCS) passt.h
 
 valgrind: EXTRA_SYSCALLS += rt_sigprocmask rt_sigtimedwait rt_sigaction	\
 			    rt_sigreturn getpid gettid kill clock_gettime mmap \
-			    mmap2 munmap open unlink gettimeofday futex
+			    mmap2 munmap open unlink gettimeofday futex statx \
+			    readlink
 valgrind: FLAGS += -g -DVALGRIND
 valgrind: all
 
diff --git a/test/lib/perf_report b/test/lib/perf_report
index d1ef50b..c4ec817 100755
--- a/test/lib/perf_report
+++ b/test/lib/perf_report
@@ -49,6 +49,21 @@ td:empty { visibility: hidden; }
 	__passt_tcp_LINE__ __passt_udp_LINE__
 </table>
 
+</li><li><p>passt with vhost-user support</p>
+<table class="passt" width="70%">
+	<tr>
+		<th/>
+		<th id="perf_passt_vu_tcp" colspan="__passt_vu_tcp_cols__">TCP, __passt_vu_tcp_threads__ at __passt_vu_tcp_freq__ GHz</th>
+		<th id="perf_passt_vu_udp" colspan="__passt_vu_udp_cols__">UDP, __passt_vu_udp_threads__ at __passt_vu_udp_freq__ GHz</th>
+	</tr>
+	<tr>
+		<td align="right">MTU:</td>
+		__passt_vu_tcp_header__
+		__passt_vu_udp_header__
+	</tr>
+	__passt_vu_tcp_LINE__ __passt_vu_udp_LINE__
+</table>
+
 <style type="text/CSS">
 table.pasta_local td { border: 0px solid; padding: 6px; line-height: 1; }
 table.pasta_local td { text-align: right; }
diff --git a/test/lib/setup b/test/lib/setup
index 5338393..580825f 100755
--- a/test/lib/setup
+++ b/test/lib/setup
@@ -15,8 +15,7 @@
 
 INITRAMFS="${BASEPATH}/mbuto.img"
 VCPUS="$( [ $(nproc) -ge 8 ] && echo 6 || echo $(( $(nproc) / 2 + 1 )) )"
-__mem_kib="$(sed -n 's/MemTotal:[ ]*\([0-9]*\) kB/\1/p' /proc/meminfo)"
-VMEM="$((${__mem_kib} / 1024 / 4))"
+MEM_KIB="$(sed -n 's/MemTotal:[ ]*\([0-9]*\) kB/\1/p' /proc/meminfo)"
 QEMU_ARCH="$(uname -m)"
 [ "${QEMU_ARCH}" = "i686" ] && QEMU_ARCH=i386
 
@@ -46,6 +45,7 @@ setup_passt() {
 	[ ${PCAP} -eq 1 ] && __opts="${__opts} -p ${LOGDIR}/passt.pcap"
 	[ ${DEBUG} -eq 1 ] && __opts="${__opts} -d"
 	[ ${TRACE} -eq 1 ] && __opts="${__opts} --trace"
+	[ ${VHOST_USER} -eq 1 ] && __opts="${__opts} --vhost-user"
 
 	context_run passt "make clean"
 	context_run passt "make valgrind"
@@ -54,16 +54,29 @@ setup_passt() {
 	# pidfile isn't created until passt is listening
 	wait_for [ -f "${STATESETUP}/passt.pid" ]
 
+	__vmem="$((${MEM_KIB} / 1024 / 4))"
+	if [ ${VHOST_USER} -eq 1 ]; then
+		__vmem="$(((${__vmem} + 500) / 1000))G"
+		__qemu_netdev="						       \
+			-chardev socket,id=c,path=${STATESETUP}/passt.socket   \
+			-netdev vhost-user,id=v,chardev=c		       \
+			-device virtio-net,netdev=v			       \
+			-object memory-backend-memfd,id=m,share=on,size=${__vmem} \
+			-numa node,memdev=m"
+	else
+		__qemu_netdev="-device virtio-net-pci,netdev=s		       \
+			-netdev stream,id=s,server=off,addr.type=unix,addr.path=${STATESETUP}/passt.socket"
+	fi
+
 	GUEST_CID=94557
 	context_run_bg qemu 'qemu-system-'"${QEMU_ARCH}"		   \
 		' -machine accel=kvm'                                      \
-		' -m '${VMEM}' -cpu host -smp '${VCPUS}                    \
+		' -m '${__vmem}' -cpu host -smp '${VCPUS}		   \
 		' -kernel '"${KERNEL}"					   \
 		' -initrd '${INITRAMFS}' -nographic -serial stdio'	   \
 		' -nodefaults'						   \
 		' -append "console=ttyS0 mitigations=off apparmor=0" '	   \
-		' -device virtio-net-pci,netdev=s0 '			   \
-		" -netdev stream,id=s0,server=off,addr.type=unix,addr.path=${STATESETUP}/passt.socket " \
+		" ${__qemu_netdev}"					   \
 		" -pidfile ${STATESETUP}/qemu.pid"			   \
 		" -device vhost-vsock-pci,guest-cid=$GUEST_CID"
 
@@ -142,6 +155,7 @@ setup_passt_in_ns() {
 	[ ${PCAP} -eq 1 ] && __opts="${__opts} -p ${LOGDIR}/passt_in_pasta.pcap"
 	[ ${DEBUG} -eq 1 ] && __opts="${__opts} -d"
 	[ ${TRACE} -eq 1 ] && __opts="${__opts} --trace"
+	[ ${VHOST_USER} -eq 1 ] && __opts="${__opts} --vhost-user"
 
 	if [ ${VALGRIND} -eq 1 ]; then
 		context_run passt "make clean"
@@ -154,17 +168,30 @@ setup_passt_in_ns() {
 	fi
 	wait_for [ -f "${STATESETUP}/passt.pid" ]
 
+	__vmem="$((${MEM_KIB} / 1024 / 4))"
+	if [ ${VHOST_USER} -eq 1 ]; then
+		__vmem="$(((${__vmem} + 500) / 1000))G"
+		__qemu_netdev="						       \
+			-chardev socket,id=c,path=${STATESETUP}/passt.socket   \
+			-netdev vhost-user,id=v,chardev=c		       \
+			-device virtio-net,netdev=v			       \
+			-object memory-backend-memfd,id=m,share=on,size=${__vmem} \
+			-numa node,memdev=m"
+	else
+		__qemu_netdev="-device virtio-net-pci,netdev=s		       \
+			-netdev stream,id=s,server=off,addr.type=unix,addr.path=${STATESETUP}/passt.socket"
+	fi
+
 	GUEST_CID=94557
 	context_run_bg qemu 'qemu-system-'"${QEMU_ARCH}"		   \
 		' -machine accel=kvm'                                      \
 		' -M accel=kvm:tcg'                                        \
-		' -m '${VMEM}' -cpu host -smp '${VCPUS}                    \
+		' -m '${__vmem}' -cpu host -smp '${VCPUS}		   \
 		' -kernel '"${KERNEL}"					   \
 		' -initrd '${INITRAMFS}' -nographic -serial stdio'	   \
 		' -nodefaults'						   \
 		' -append "console=ttyS0 mitigations=off apparmor=0" '	   \
-		' -device virtio-net-pci,netdev=s0 '			   \
-		" -netdev stream,id=s0,server=off,addr.type=unix,addr.path=${STATESETUP}/passt.socket " \
+		" ${__qemu_netdev}"					   \
 		" -pidfile ${STATESETUP}/qemu.pid"			   \
 		" -device vhost-vsock-pci,guest-cid=$GUEST_CID"
 
@@ -214,6 +241,7 @@ setup_two_guests() {
 	[ ${PCAP} -eq 1 ] && __opts="${__opts} -p ${LOGDIR}/passt_1.pcap"
 	[ ${DEBUG} -eq 1 ] && __opts="${__opts} -d"
 	[ ${TRACE} -eq 1 ] && __opts="${__opts} --trace"
+	[ ${VHOST_USER} -eq 1 ] && __opts="${__opts} --vhost-user"
 
 	context_run_bg passt_1 "./passt -s ${STATESETUP}/passt_1.socket -P ${STATESETUP}/passt_1.pid -f ${__opts} -t 10001 -u 10001"
 	wait_for [ -f "${STATESETUP}/passt_1.pid" ]
@@ -222,33 +250,54 @@ setup_two_guests() {
 	[ ${PCAP} -eq 1 ] && __opts="${__opts} -p ${LOGDIR}/passt_2.pcap"
 	[ ${DEBUG} -eq 1 ] && __opts="${__opts} -d"
 	[ ${TRACE} -eq 1 ] && __opts="${__opts} --trace"
+	[ ${VHOST_USER} -eq 1 ] && __opts="${__opts} --vhost-user"
 
 	context_run_bg passt_2 "./passt -s ${STATESETUP}/passt_2.socket -P ${STATESETUP}/passt_2.pid -f ${__opts} -t 10004 -u 10004"
 	wait_for [ -f "${STATESETUP}/passt_2.pid" ]
 
+	__vmem="$((${MEM_KIB} / 1024 / 4))"
+	if [ ${VHOST_USER} -eq 1 ]; then
+		__vmem="$(((${__vmem} + 500) / 1000))G"
+		__qemu_netdev1="					       \
+			-chardev socket,id=c,path=${STATESETUP}/passt_1.socket \
+			-netdev vhost-user,id=v,chardev=c		       \
+			-device virtio-net,netdev=v			       \
+			-object memory-backend-memfd,id=m,share=on,size=${__vmem} \
+			-numa node,memdev=m"
+		__qemu_netdev2="					       \
+			-chardev socket,id=c,path=${STATESETUP}/passt_2.socket \
+			-netdev vhost-user,id=v,chardev=c		       \
+			-device virtio-net,netdev=v			       \
+			-object memory-backend-memfd,id=m,share=on,size=${__vmem} \
+			-numa node,memdev=m"
+	else
+		__qemu_netdev1="-device virtio-net-pci,netdev=s		       \
+			-netdev stream,id=s,server=off,addr.type=unix,addr.path=${STATESETUP}/passt_1.socket"
+		__qemu_netdev2="-device virtio-net-pci,netdev=s		       \
+			-netdev stream,id=s,server=off,addr.type=unix,addr.path=${STATESETUP}/passt_2.socket"
+	fi
+
 	GUEST_1_CID=94557
 	context_run_bg qemu_1 'qemu-system-'"${QEMU_ARCH}"		     \
 		' -M accel=kvm:tcg'                                          \
-		' -m '${VMEM}' -cpu host -smp '${VCPUS}                      \
+		' -m '${__vmem}' -cpu host -smp '${VCPUS}		     \
 		' -kernel '"${KERNEL}"					     \
 		' -initrd '${INITRAMFS}' -nographic -serial stdio'	     \
 		' -nodefaults'						     \
 		' -append "console=ttyS0 mitigations=off apparmor=0" '	     \
-		' -device virtio-net-pci,netdev=s0 '			     \
-		" -netdev stream,id=s0,server=off,addr.type=unix,addr.path=${STATESETUP}/passt_1.socket " \
+		" ${__qemu_netdev1}"					     \
 		" -pidfile ${STATESETUP}/qemu_1.pid"			     \
 		" -device vhost-vsock-pci,guest-cid=$GUEST_1_CID"
 
 	GUEST_2_CID=94558
 	context_run_bg qemu_2 'qemu-system-'"${QEMU_ARCH}"		     \
 		' -M accel=kvm:tcg'                                          \
-		' -m '${VMEM}' -cpu host -smp '${VCPUS}                      \
+		' -m '${__vmem}' -cpu host -smp '${VCPUS}		     \
 		' -kernel '"${KERNEL}"					     \
 		' -initrd '${INITRAMFS}' -nographic -serial stdio'	     \
 		' -nodefaults'						     \
 		' -append "console=ttyS0 mitigations=off apparmor=0" '	     \
-		' -device virtio-net-pci,netdev=s0 '			     \
-		" -netdev stream,id=s0,server=off,addr.type=unix,addr.path=${STATESETUP}/passt_2.socket " \
+		" ${__qemu_netdev2}"					     \
 		" -pidfile ${STATESETUP}/qemu_2.pid"			     \
 		" -device vhost-vsock-pci,guest-cid=$GUEST_2_CID"
 
diff --git a/test/lib/setup_ugly b/test/lib/setup_ugly
index 4b2a077..2802cc3 100755
--- a/test/lib/setup_ugly
+++ b/test/lib/setup_ugly
@@ -33,7 +33,7 @@ setup_memory() {
 
 	pane_or_context_run guest 'qemu-system-$(uname -m)'		   \
 		' -machine accel=kvm'                                      \
-		' -m '${VMEM}' -cpu host -smp '${VCPUS}                    \
+		' -m '$((${MEM_KIB} / 1024 / 4))' -cpu host -smp '${VCPUS}                    \
 		' -kernel ' "/boot/vmlinuz-$(uname -r)"			   \
 		' -initrd '${INITRAMFS_MEM}' -nographic -serial stdio'	   \
 		' -nodefaults'						   \
diff --git a/test/passt_vu b/test/passt_vu
new file mode 120000
index 0000000..22f1840
--- /dev/null
+++ b/test/passt_vu
@@ -0,0 +1 @@
+passt
\ No newline at end of file
diff --git a/test/passt_vu_in_ns b/test/passt_vu_in_ns
new file mode 120000
index 0000000..3ff479e
--- /dev/null
+++ b/test/passt_vu_in_ns
@@ -0,0 +1 @@
+passt_in_ns
\ No newline at end of file
diff --git a/test/perf/passt_vu_tcp b/test/perf/passt_vu_tcp
new file mode 100644
index 0000000..b434008
--- /dev/null
+++ b/test/perf/passt_vu_tcp
@@ -0,0 +1,211 @@
+# SPDX-License-Identifier: GPL-2.0-or-later
+#
+# PASST - Plug A Simple Socket Transport
+#  for qemu/UNIX domain socket mode
+#
+# PASTA - Pack A Subtle Tap Abstraction
+#  for network namespace/tap device mode
+#
+# test/perf/passt_vu_tcp - Check TCP performance in passt vhost-user mode
+#
+# Copyright (c) 2021 Red Hat GmbH
+# Author: Stefano Brivio <sbrivio@redhat.com>
+
+gtools	/sbin/sysctl ip jq nproc seq sleep iperf3 tcp_rr tcp_crr # From neper
+nstools	/sbin/sysctl ip jq nproc seq sleep iperf3 tcp_rr tcp_crr
+htools	bc head sed seq
+
+set	MAP_NS4 192.0.2.2
+set	MAP_NS6 2001:db8:9a55::2
+
+test	passt: throughput and latency
+
+guest	/sbin/sysctl -w net.core.rmem_max=536870912
+guest	/sbin/sysctl -w net.core.wmem_max=536870912
+guest	/sbin/sysctl -w net.core.rmem_default=33554432
+guest	/sbin/sysctl -w net.core.wmem_default=33554432
+guest	/sbin/sysctl -w net.ipv4.tcp_rmem="4096 131072 268435456"
+guest	/sbin/sysctl -w net.ipv4.tcp_wmem="4096 131072 268435456"
+guest	/sbin/sysctl -w net.ipv4.tcp_timestamps=0
+
+ns	/sbin/sysctl -w net.ipv4.tcp_rmem="4096 524288 134217728"
+ns	/sbin/sysctl -w net.ipv4.tcp_wmem="4096 524288 134217728"
+ns	/sbin/sysctl -w net.ipv4.tcp_timestamps=0
+
+gout	IFNAME ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname'
+
+hout	FREQ_PROCFS (echo "scale=1"; sed -n 's/cpu MHz.*: \([0-9]*\)\..*$/(\1+10^2\/2)\/10^3/p' /proc/cpuinfo) | bc -l | head -n1
+hout	FREQ_CPUFREQ (echo "scale=1"; printf '( %i + 10^5 / 2 ) / 10^6\n' $(cat /sys/devices/system/cpu/cpu0/cpufreq/scaling_max_freq) ) | bc -l
+hout	FREQ [ -n "__FREQ_CPUFREQ__" ] && echo __FREQ_CPUFREQ__ || echo __FREQ_PROCFS__
+
+set	THREADS 4
+set	TIME 5
+set	OMIT 0.1
+set	OPTS -Z -P __THREADS__ -l 1M -O__OMIT__ -N
+
+info	Throughput in Gbps, latency in µs, __THREADS__ threads at __FREQ__ GHz
+report	passt_vu tcp __THREADS__ __FREQ__
+
+th	MTU 256B 576B 1280B 1500B 9000B 65520B
+
+
+tr	TCP throughput over IPv6: guest to host
+iperf3s	ns 10002
+
+bw	-
+bw	-
+guest	ip link set dev __IFNAME__ mtu 1280
+iperf3	BW guest __MAP_NS6__ 10002 __TIME__ __OPTS__ -w 16M
+bw	__BW__ 1.2 1.5
+guest	ip link set dev __IFNAME__ mtu 1500
+iperf3	BW guest __MAP_NS6__ 10002 __TIME__ __OPTS__ -w 32M
+bw	__BW__ 1.6 1.8
+guest	ip link set dev __IFNAME__ mtu 9000
+iperf3	BW guest __MAP_NS6__ 10002 __TIME__ __OPTS__ -w 64M
+bw	__BW__ 4.0 5.0
+guest	ip link set dev __IFNAME__ mtu 65520
+iperf3	BW guest __MAP_NS6__ 10002 __TIME__ __OPTS__ -w 64M
+bw	__BW__ 7.0 8.0
+
+iperf3k	ns
+
+tl	TCP RR latency over IPv6: guest to host
+lat	-
+lat	-
+lat	-
+lat	-
+lat	-
+nsb	tcp_rr --nolog -6
+gout	LAT tcp_rr --nolog -l1 -6 -c -H __MAP_NS6__ | sed -n 's/^throughput=\(.*\)/\1/p'
+lat	__LAT__ 200 150
+
+tl	TCP CRR latency over IPv6: guest to host
+lat	-
+lat	-
+lat	-
+lat	-
+lat	-
+nsb	tcp_crr --nolog -6
+gout	LAT tcp_crr --nolog -l1 -6 -c -H __MAP_NS6__ | sed -n 's/^throughput=\(.*\)/\1/p'
+lat	__LAT__ 500 400
+
+tr	TCP throughput over IPv4: guest to host
+iperf3s	ns 10002
+
+guest	ip link set dev __IFNAME__ mtu 256
+iperf3	BW guest __MAP_NS4__ 10002 __TIME__ __OPTS__ -w 2M
+bw	__BW__ 0.2 0.3
+guest	ip link set dev __IFNAME__ mtu 576
+iperf3	BW guest __MAP_NS4__ 10002 __TIME__ __OPTS__ -w 4M
+bw	__BW__ 0.5 0.8
+guest	ip link set dev __IFNAME__ mtu 1280
+iperf3	BW guest __MAP_NS4__ 10002 __TIME__ __OPTS__ -w 8M
+bw	__BW__ 1.2 1.5
+guest	ip link set dev __IFNAME__ mtu 1500
+iperf3	BW guest __MAP_NS4__ 10002 __TIME__ __OPTS__ -w 16M
+bw	__BW__ 1.6 1.8
+guest	ip link set dev __IFNAME__ mtu 9000
+iperf3	BW guest __MAP_NS4__ 10002 __TIME__ __OPTS__ -w 64M
+bw	__BW__ 4.0 5.0
+guest	ip link set dev __IFNAME__ mtu 65520
+iperf3	BW guest __MAP_NS4__ 10002 __TIME__ __OPTS__ -w 64M
+bw	__BW__ 7.0 8.0
+
+iperf3k	ns
+
+# Reducing MTU below 1280 deconfigures IPv6, get our address back
+guest	dhclient -6 -x
+guest	dhclient -6 __IFNAME__
+
+tl	TCP RR latency over IPv4: guest to host
+lat	-
+lat	-
+lat	-
+lat	-
+lat	-
+nsb	tcp_rr --nolog -4
+gout	LAT tcp_rr --nolog -l1 -4 -c -H __MAP_NS4__ | sed -n 's/^throughput=\(.*\)/\1/p'
+lat	__LAT__ 200 150
+
+tl	TCP CRR latency over IPv4: guest to host
+lat	-
+lat	-
+lat	-
+lat	-
+lat	-
+nsb	tcp_crr --nolog -4
+gout	LAT tcp_crr --nolog -l1 -4 -c -H __MAP_NS4__ | sed -n 's/^throughput=\(.*\)/\1/p'
+lat	__LAT__ 500 400
+
+tr	TCP throughput over IPv6: host to guest
+iperf3s	guest 10001
+
+bw	-
+bw	-
+bw	-
+bw	-
+bw	-
+iperf3	BW ns ::1 10001 __TIME__ __OPTS__ -w 32M
+bw	__BW__ 6.0 6.8
+
+iperf3k	guest
+
+tl	TCP RR latency over IPv6: host to guest
+lat	-
+lat	-
+lat	-
+lat	-
+lat	-
+guestb	tcp_rr --nolog -P 10001 -C 10011 -6
+sleep	1
+nsout	LAT tcp_rr --nolog -l1 -P 10001 -C 10011 -6 -c -H ::1 | sed -n 's/^throughput=\(.*\)/\1/p'
+lat	__LAT__ 200 150
+
+tl	TCP CRR latency over IPv6: host to guest
+lat	-
+lat	-
+lat	-
+lat	-
+lat	-
+guestb	tcp_crr --nolog -P 10001 -C 10011 -6
+sleep	1
+nsout	LAT tcp_crr --nolog -l1 -P 10001 -C 10011 -6 -c -H ::1 | sed -n 's/^throughput=\(.*\)/\1/p'
+lat	__LAT__ 500 350
+
+
+tr	TCP throughput over IPv4: host to guest
+iperf3s	guest 10001
+
+bw	-
+bw	-
+bw	-
+bw	-
+bw	-
+iperf3	BW ns 127.0.0.1 10001 __TIME__ __OPTS__ -w 32M
+bw	__BW__ 6.0 6.8
+
+iperf3k	guest
+
+tl	TCP RR latency over IPv4: host to guest
+lat	-
+lat	-
+lat	-
+lat	-
+lat	-
+guestb	tcp_rr --nolog -P 10001 -C 10011 -4
+sleep	1
+nsout	LAT tcp_rr --nolog -l1 -P 10001 -C 10011 -4 -c -H 127.0.0.1 | sed -n 's/^throughput=\(.*\)/\1/p'
+lat	__LAT__ 200 150
+
+tl	TCP CRR latency over IPv4: host to guest
+lat	-
+lat	-
+lat	-
+lat	-
+lat	-
+guestb	tcp_crr --nolog -P 10001 -C 10011 -4
+sleep	1
+nsout	LAT tcp_crr --nolog -l1 -P 10001 -C 10011 -4 -c -H 127.0.0.1 | sed -n 's/^throughput=\(.*\)/\1/p'
+lat	__LAT__ 500 300
+
+te
diff --git a/test/perf/passt_vu_udp b/test/perf/passt_vu_udp
new file mode 100644
index 0000000..943ac11
--- /dev/null
+++ b/test/perf/passt_vu_udp
@@ -0,0 +1,159 @@
+# SPDX-License-Identifier: GPL-2.0-or-later
+#
+# PASST - Plug A Simple Socket Transport
+#  for qemu/UNIX domain socket mode
+#
+# PASTA - Pack A Subtle Tap Abstraction
+#  for network namespace/tap device mode
+#
+# test/perf/passt_vu_udp - Check UDP performance in passt vhost-user mode
+#
+# Copyright (c) 2021 Red Hat GmbH
+# Author: Stefano Brivio <sbrivio@redhat.com>
+
+gtools	/sbin/sysctl ip jq nproc sleep iperf3 udp_rr # From neper
+nstools	ip jq sleep iperf3 udp_rr
+htools	bc head sed
+
+set	MAP_NS4 192.0.2.2
+set	MAP_NS6 2001:db8:9a55::2
+
+test	passt: throughput and latency
+
+guest	/sbin/sysctl -w net.core.rmem_max=16777216
+guest	/sbin/sysctl -w net.core.wmem_max=16777216
+guest	/sbin/sysctl -w net.core.rmem_default=16777216
+guest	/sbin/sysctl -w net.core.wmem_default=16777216
+
+hout	FREQ_PROCFS (echo "scale=1"; sed -n 's/cpu MHz.*: \([0-9]*\)\..*$/(\1+10^2\/2)\/10^3/p' /proc/cpuinfo) | bc -l | head -n1
+hout	FREQ_CPUFREQ (echo "scale=1"; printf '( %i + 10^5 / 2 ) / 10^6\n' $(cat /sys/devices/system/cpu/cpu0/cpufreq/scaling_max_freq) ) | bc -l
+hout	FREQ [ -n "__FREQ_CPUFREQ__" ] && echo __FREQ_CPUFREQ__ || echo __FREQ_PROCFS__
+
+set	THREADS 2
+set	TIME 1
+set	OPTS -u -P __THREADS__ --pacing-timer 1000
+
+info	Throughput in Gbps, latency in µs, __THREADS__ threads at __FREQ__ GHz
+
+report	passt_vu udp __THREADS__ __FREQ__
+
+th	pktlen 256B 576B 1280B 1500B 9000B 65520B
+
+tr	UDP throughput over IPv6: guest to host
+iperf3s	ns 10002
+# (datagram size) = (packet size) - 48: 40 bytes of IPv6 header, 8 of UDP header
+
+bw	-
+bw	-
+iperf3	BW guest __MAP_NS6__ 10002 __TIME__ __OPTS__ -b 3G -l 1232
+bw	__BW__ 0.8 1.2
+iperf3	BW guest __MAP_NS6__ 10002 __TIME__ __OPTS__ -b 4G -l 1452
+bw	__BW__ 1.0 1.5
+iperf3	BW guest __MAP_NS6__ 10002 __TIME__ __OPTS__ -b 10G -l 8952
+bw	__BW__ 4.0 5.0
+iperf3	BW guest __MAP_NS6__ 10002 __TIME__ __OPTS__ -b 20G -l 64372
+bw	__BW__ 4.0 5.0
+
+iperf3k	ns
+
+tl	UDP RR latency over IPv6: guest to host
+lat	-
+lat	-
+lat	-
+lat	-
+lat	-
+nsb	udp_rr --nolog -6
+gout	LAT udp_rr --nolog -6 -c -H __MAP_NS6__ | sed -n 's/^throughput=\(.*\)/\1/p'
+lat	__LAT__ 200 150
+
+
+tr	UDP throughput over IPv4: guest to host
+iperf3s	ns 10002
+# (datagram size) = (packet size) - 28: 20 bytes of IPv4 header, 8 of UDP header
+
+iperf3	BW guest __MAP_NS4__ 10002 __TIME__ __OPTS__ -b 1G -l 228
+bw	__BW__ 0.0 0.0
+iperf3	BW guest __MAP_NS4__ 10002 __TIME__ __OPTS__ -b 2G -l 548
+bw	__BW__ 0.4 0.6
+iperf3	BW guest __MAP_NS4__ 10002 __TIME__ __OPTS__ -b 3G -l 1252
+bw	__BW__ 0.8 1.2
+iperf3	BW guest __MAP_NS4__ 10002 __TIME__ __OPTS__ -b 4G -l 1472
+bw	__BW__ 1.0 1.5
+iperf3	BW guest __MAP_NS4__ 10002 __TIME__ __OPTS__ -b 10G -l 8972
+bw	__BW__ 4.0 5.0
+iperf3	BW guest __MAP_NS4__ 10002 __TIME__ __OPTS__ -b 20G -l 65492
+bw	__BW__ 4.0 5.0
+
+iperf3k	ns
+
+tl	UDP RR latency over IPv4: guest to host
+lat	-
+lat	-
+lat	-
+lat	-
+lat	-
+nsb	udp_rr --nolog -4
+gout	LAT udp_rr --nolog -4 -c -H __MAP_NS4__ | sed -n 's/^throughput=\(.*\)/\1/p'
+lat	__LAT__ 200 150
+
+
+tr	UDP throughput over IPv6: host to guest
+iperf3s	guest 10001
+# (datagram size) = (packet size) - 48: 40 bytes of IPv6 header, 8 of UDP header
+
+bw	-
+bw	-
+iperf3	BW ns ::1 10001 __TIME__ __OPTS__ -b 3G -l 1232
+bw	__BW__ 0.8 1.2
+iperf3	BW ns ::1 10001 __TIME__ __OPTS__ -b 4G -l 1452
+bw	__BW__ 1.0 1.5
+iperf3	BW ns ::1 10001 __TIME__ __OPTS__ -b 10G -l 8952
+bw	__BW__ 3.0 4.0
+iperf3	BW ns ::1 10001 __TIME__ __OPTS__ -b 20G -l 64372
+bw	__BW__ 3.0 4.0
+
+iperf3k	guest
+
+tl	UDP RR latency over IPv6: host to guest
+lat	-
+lat	-
+lat	-
+lat	-
+lat	-
+guestb	udp_rr --nolog -P 10001 -C 10011 -6
+sleep	1
+nsout	LAT udp_rr --nolog -P 10001 -C 10011 -6 -c -H ::1 | sed -n 's/^throughput=\(.*\)/\1/p'
+lat	__LAT__ 200 150
+
+
+tr	UDP throughput over IPv4: host to guest
+iperf3s	guest 10001
+# (datagram size) = (packet size) - 28: 20 bytes of IPv4 header, 8 of UDP header
+
+iperf3	BW ns 127.0.0.1 10001 __TIME__ __OPTS__ -b 1G -l 228
+bw	__BW__ 0.0 0.0
+iperf3	BW ns 127.0.0.1 10001 __TIME__ __OPTS__ -b 2G -l 548
+bw	__BW__ 0.4 0.6
+iperf3	BW ns 127.0.0.1 10001 __TIME__ __OPTS__ -b 3G -l 1252
+bw	__BW__ 0.8 1.2
+iperf3	BW ns 127.0.0.1 10001 __TIME__ __OPTS__ -b 4G -l 1472
+bw	__BW__ 1.0 1.5
+iperf3	BW ns 127.0.0.1 10001 __TIME__ __OPTS__ -b 10G -l 8972
+bw	__BW__ 3.0 4.0
+iperf3	BW ns 127.0.0.1 10001 __TIME__ __OPTS__ -b 20G -l 65492
+bw	__BW__ 3.0 4.0
+
+iperf3k	guest
+
+tl	UDP RR latency over IPv4: host to guest
+lat	-
+lat	-
+lat	-
+lat	-
+lat	-
+guestb	udp_rr --nolog -P 10001 -C 10011 -4
+sleep	1
+nsout	LAT udp_rr --nolog -P 10001 -C 10011 -4 -c -H 127.0.0.1 | sed -n 's/^throughput=\(.*\)/\1/p'
+lat	__LAT__ 200 150
+
+te
diff --git a/test/run b/test/run
index 547a729..f188d8e 100755
--- a/test/run
+++ b/test/run
@@ -93,6 +93,7 @@ run() {
 	test memory/passt
 	teardown memory
 
+	VHOST_USER=0
 	setup passt
 	test passt/ndp
 	test passt/dhcp
@@ -115,7 +116,22 @@ run() {
 	test two_guests/basic
 	teardown two_guests
 
+	VHOST_USER=1
+	setup passt_in_ns
+	test passt_vu/ndp
+	test passt_vu_in_ns/dhcp
+	test passt_vu_in_ns/icmp
+	test passt_vu_in_ns/tcp
+	test passt_vu_in_ns/udp
+	test passt_vu_in_ns/shutdown
+	teardown passt_in_ns
+
+	setup two_guests
+	test two_guests_vu/basic
+	teardown two_guests
+
 	VALGRIND=0
+	VHOST_USER=0
 	setup passt_in_ns
 	test passt/ndp
 	test passt_in_ns/dhcp
@@ -126,6 +142,15 @@ run() {
 	test passt_in_ns/shutdown
 	teardown passt_in_ns
 
+	VHOST_USER=1
+	setup passt_in_ns
+	test passt_vu/ndp
+	test passt_vu_in_ns/dhcp
+	test perf/passt_vu_tcp
+	test perf/passt_vu_udp
+	test passt_vu_in_ns/shutdown
+	teardown passt_in_ns
+
 	# TODO: Make those faster by at least pre-installing gcc and make on
 	# non-x86 images, then re-enable.
 skip_distro() {
diff --git a/test/two_guests_vu b/test/two_guests_vu
new file mode 120000
index 0000000..a8648fc
--- /dev/null
+++ b/test/two_guests_vu
@@ -0,0 +1 @@
+two_guests
\ No newline at end of file

From 7e131e920c04054b9d005dac718ac54e5169fa71 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Fri, 22 Nov 2024 17:43:36 +0100
Subject: [PATCH 141/382] tcp: Move tcp_l2_buf_fill_headers() to tcp_buf.c

This function only has callers in tcp_buf.c.  More importantly, it's
inherently tied to the "buf" path, because it uses internal knowledge of
how we lay out the various headers across our locally allocated buffers.

Therefore, move it to tcp_buf.c.

Slightly reformat the prototypes while we're at it.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Laurent Vivier <lvivier@redhat.com>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 tcp.c          | 57 +++++++-------------------------------------------
 tcp_buf.c      | 39 ++++++++++++++++++++++++++++++----
 tcp_internal.h | 22 ++++++++-----------
 tcp_vu.c       | 22 ++++++++-----------
 4 files changed, 61 insertions(+), 79 deletions(-)
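
The net effect for callers is worth spelling out: since
tcp_fill_headers4()/tcp_fill_headers6() no longer return the layer-4
length, callers compute it themselves as payload length plus TCP header
size when sizing the payload iovec, as in the tcp_buf.c hunk below. A
standalone sketch of that arithmetic (illustrative values, not passt
code):

/* Standalone sketch, not passt code: the l4len previously returned by
 * tcp_fill_headers4()/6() is just the payload length plus the TCP
 * header size.
 */
#include <netinet/tcp.h>
#include <stdio.h>
#include <sys/uio.h>

int main(void)
{
	size_t dlen = 1200;		/* illustrative payload length */
	struct iovec payload = { 0 };

	payload.iov_len = dlen + sizeof(struct tcphdr);	/* l4len */
	printf("l4len: %zu\n", payload.iov_len);	/* 1220 */

	return 0;
}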

diff --git a/tcp.c b/tcp.c
index e197a1a..61c12a5 100644
--- a/tcp.c
+++ b/tcp.c
@@ -976,14 +976,11 @@ static void tcp_fill_header(struct tcphdr *th,
  * @check:		Checksum, if already known
  * @seq:		Sequence number for this segment
  * @no_tcp_csum:	Do not set TCP checksum
- *
- * Return: The IPv4 payload length, host order
  */
-size_t tcp_fill_headers4(const struct tcp_tap_conn *conn,
-			 struct tap_hdr *taph,
-			 struct iphdr *iph, struct tcp_payload_t *bp,
-			 size_t dlen, const uint16_t *check,
-			 uint32_t seq, bool no_tcp_csum)
+void tcp_fill_headers4(const struct tcp_tap_conn *conn,
+		       struct tap_hdr *taph, struct iphdr *iph,
+		       struct tcp_payload_t *bp, size_t dlen,
+		       const uint16_t *check, uint32_t seq, bool no_tcp_csum)
 {
 	const struct flowside *tapside = TAPFLOW(conn);
 	const struct in_addr *src4 = inany_v4(&tapside->oaddr);
@@ -1014,8 +1011,6 @@ size_t tcp_fill_headers4(const struct tcp_tap_conn *conn,
 	}
 
 	tap_hdr_update(taph, l3len + sizeof(struct ethhdr));
-
-	return l4len;
 }
 
 /**
@@ -1028,13 +1023,11 @@ size_t tcp_fill_headers4(const struct tcp_tap_conn *conn,
  * @check:		Checksum, if already known
  * @seq:		Sequence number for this segment
  * @no_tcp_csum:	Do not set TCP checksum
- *
- * Return: The IPv6 payload length, host order
  */
-size_t tcp_fill_headers6(const struct tcp_tap_conn *conn,
-			 struct tap_hdr *taph,
-			 struct ipv6hdr *ip6h, struct tcp_payload_t *bp,
-			 size_t dlen, uint32_t seq, bool no_tcp_csum)
+void tcp_fill_headers6(const struct tcp_tap_conn *conn,
+		       struct tap_hdr *taph, struct ipv6hdr *ip6h,
+		       struct tcp_payload_t *bp, size_t dlen,
+		       uint32_t seq, bool no_tcp_csum)
 {
 	const struct flowside *tapside = TAPFLOW(conn);
 	size_t l4len = dlen + sizeof(bp->th);
@@ -1065,40 +1058,6 @@ size_t tcp_fill_headers6(const struct tcp_tap_conn *conn,
 	}
 
 	tap_hdr_update(taph, l4len + sizeof(*ip6h) + sizeof(struct ethhdr));
-
-	return l4len;
-}
-
-/**
- * tcp_l2_buf_fill_headers() - Fill 802.3, IP, TCP headers in pre-cooked buffers
- * @conn:	Connection pointer
- * @iov:	Pointer to an array of iovec of TCP pre-cooked buffers
- * @dlen:	TCP payload length
- * @check:	Checksum, if already known
- * @seq:	Sequence number for this segment
- * @no_tcp_csum: Do not set TCP checksum
- *
- * Return: IP payload length, host order
- */
-size_t tcp_l2_buf_fill_headers(const struct tcp_tap_conn *conn,
-			       struct iovec *iov, size_t dlen,
-			       const uint16_t *check, uint32_t seq,
-			       bool no_tcp_csum)
-{
-	const struct flowside *tapside = TAPFLOW(conn);
-	const struct in_addr *a4 = inany_v4(&tapside->oaddr);
-
-	if (a4) {
-		return tcp_fill_headers4(conn, iov[TCP_IOV_TAP].iov_base,
-					 iov[TCP_IOV_IP].iov_base,
-					 iov[TCP_IOV_PAYLOAD].iov_base, dlen,
-					 check, seq, no_tcp_csum);
-	}
-
-	return tcp_fill_headers6(conn, iov[TCP_IOV_TAP].iov_base,
-				 iov[TCP_IOV_IP].iov_base,
-				 iov[TCP_IOV_PAYLOAD].iov_base, dlen,
-				 seq, no_tcp_csum);
 }
 
 /**
diff --git a/tcp_buf.c b/tcp_buf.c
index d29c1a9..0946cd5 100644
--- a/tcp_buf.c
+++ b/tcp_buf.c
@@ -147,6 +147,36 @@ void tcp_payload_flush(const struct ctx *c)
 	tcp_payload_used = 0;
 }
 
+/**
+ * tcp_l2_buf_fill_headers() - Fill 802.3, IP, TCP headers in pre-cooked buffers
+ * @conn:	Connection pointer
+ * @iov:	Pointer to an array of iovec of TCP pre-cooked buffers
+ * @dlen:	TCP payload length
+ * @check:	Checksum, if already known
+ * @seq:	Sequence number for this segment
+ * @no_tcp_csum: Do not set TCP checksum
+ */
+static void tcp_l2_buf_fill_headers(const struct tcp_tap_conn *conn,
+				    struct iovec *iov, size_t dlen,
+				    const uint16_t *check, uint32_t seq,
+				    bool no_tcp_csum)
+{
+	const struct flowside *tapside = TAPFLOW(conn);
+	const struct in_addr *a4 = inany_v4(&tapside->oaddr);
+
+	if (a4) {
+		tcp_fill_headers4(conn, iov[TCP_IOV_TAP].iov_base,
+				  iov[TCP_IOV_IP].iov_base,
+				  iov[TCP_IOV_PAYLOAD].iov_base, dlen,
+				  check, seq, no_tcp_csum);
+	} else {
+		tcp_fill_headers6(conn, iov[TCP_IOV_TAP].iov_base,
+				  iov[TCP_IOV_IP].iov_base,
+				  iov[TCP_IOV_PAYLOAD].iov_base, dlen,
+				  seq, no_tcp_csum);
+	}
+}
+
 /**
  * tcp_buf_send_flag() - Send segment with flags to tap (no payload)
  * @c:         Execution context
@@ -181,8 +211,10 @@ int tcp_buf_send_flag(const struct ctx *c, struct tcp_tap_conn *conn, int flags)
 		return ret;
 
 	tcp_payload_used++;
-	l4len = tcp_l2_buf_fill_headers(conn, iov, optlen, NULL, seq, false);
+	l4len = optlen + sizeof(struct tcphdr);
 	iov[TCP_IOV_PAYLOAD].iov_len = l4len;
+	tcp_l2_buf_fill_headers(conn, iov, optlen, NULL, seq, false);
+
 	if (flags & DUP_ACK) {
 		struct iovec *dup_iov = tcp_l2_iov[tcp_payload_used++];
 
@@ -215,7 +247,6 @@ static void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn,
 	struct tcp_payload_t *payload;
 	const uint16_t *check = NULL;
 	struct iovec *iov;
-	size_t l4len;
 
 	conn->seq_to_tap = seq + dlen;
 	tcp_frame_conns[tcp_payload_used] = conn;
@@ -238,8 +269,8 @@ static void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn,
 	payload->th.th_x2 = 0;
 	payload->th.th_flags = 0;
 	payload->th.ack = 1;
-	l4len = tcp_l2_buf_fill_headers(conn, iov, dlen, check, seq, false);
-	iov[TCP_IOV_PAYLOAD].iov_len = l4len;
+	iov[TCP_IOV_PAYLOAD].iov_len = dlen + sizeof(struct tcphdr);
+	tcp_l2_buf_fill_headers(conn, iov, dlen, check, seq, false);
 	if (++tcp_payload_used > TCP_FRAMES_MEM - 1)
 		tcp_payload_flush(c);
 }
diff --git a/tcp_internal.h b/tcp_internal.h
index 8625eed..d7b125f 100644
--- a/tcp_internal.h
+++ b/tcp_internal.h
@@ -168,19 +168,15 @@ void tcp_update_check_tcp4(const struct iphdr *iph,
 void tcp_update_check_tcp6(const struct ipv6hdr *ip6h,
 			   const struct iovec *iov, int iov_cnt,
 			   size_t l4offset);
-size_t tcp_fill_headers4(const struct tcp_tap_conn *conn,
-			 struct tap_hdr *taph,
-			 struct iphdr *iph, struct tcp_payload_t *bp,
-			 size_t dlen, const uint16_t *check,
-			 uint32_t seq, bool no_tcp_csum);
-size_t tcp_fill_headers6(const struct tcp_tap_conn *conn,
-			 struct tap_hdr *taph,
-			 struct ipv6hdr *ip6h, struct tcp_payload_t *bp,
-			 size_t dlen, uint32_t seq, bool no_tcp_csum);
-size_t tcp_l2_buf_fill_headers(const struct tcp_tap_conn *conn,
-			       struct iovec *iov, size_t dlen,
-			       const uint16_t *check, uint32_t seq,
-			       bool no_tcp_csum);
+void tcp_fill_headers4(const struct tcp_tap_conn *conn,
+		       struct tap_hdr *taph, struct iphdr *iph,
+		       struct tcp_payload_t *bp, size_t dlen,
+		       const uint16_t *check, uint32_t seq, bool no_tcp_csum);
+void tcp_fill_headers6(const struct tcp_tap_conn *conn,
+		       struct tap_hdr *taph, struct ipv6hdr *ip6h,
+		       struct tcp_payload_t *bp, size_t dlen,
+		       uint32_t seq, bool no_tcp_csum);
+
 int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_tap_conn *conn,
 			  bool force_seq, struct tcp_info_linux *tinfo);
 int tcp_prepare_flags(const struct ctx *c, struct tcp_tap_conn *conn,
diff --git a/tcp_vu.c b/tcp_vu.c
index 1bebb31..f27e175 100644
--- a/tcp_vu.c
+++ b/tcp_vu.c
@@ -98,7 +98,7 @@ int tcp_vu_send_flag(const struct ctx *c, struct tcp_tap_conn *conn, int flags)
 	struct vu_dev *vdev = c->vdev;
 	struct vu_virtq *vq = &vdev->vq[VHOST_USER_RX_QUEUE];
 	const struct flowside *tapside = TAPFLOW(conn);
-	size_t l2len, l4len, optlen, hdrlen;
+	size_t optlen, hdrlen;
 	struct vu_virtq_element flags_elem[2];
 	struct tcp_payload_t *payload;
 	struct ipv6hdr *ip6h = NULL;
@@ -157,19 +157,15 @@ int tcp_vu_send_flag(const struct ctx *c, struct tcp_tap_conn *conn, int flags)
 		return ret;
 	}
 
-	if (CONN_V4(conn)) {
-		l4len = tcp_fill_headers4(conn, NULL, iph, payload, optlen,
-					  NULL, seq, true);
-		l2len = sizeof(*iph);
-	} else {
-		l4len = tcp_fill_headers6(conn, NULL, ip6h, payload, optlen,
-					  seq, true);
-		l2len = sizeof(*ip6h);
-	}
-	l2len += l4len + sizeof(struct ethhdr);
+	flags_elem[0].in_sg[0].iov_len = hdrlen + optlen;
+
+	if (CONN_V4(conn)) {
+		tcp_fill_headers4(conn, NULL, iph, payload, optlen, NULL, seq,
+				  true);
+	} else {
+		tcp_fill_headers6(conn, NULL, ip6h, payload, optlen, seq, true);
+	}
 
-	flags_elem[0].in_sg[0].iov_len = l2len +
-				   sizeof(struct virtio_net_hdr_mrg_rxbuf);
 	if (*c->pcap) {
 		tcp_vu_update_check(tapside, &flags_elem[0].in_sg[0], 1);
 		pcap_iov(&flags_elem[0].in_sg[0], 1,

From 6fae899cbbb3ec5bc9a0d5c7dde16131b2f85f05 Mon Sep 17 00:00:00 2001
From: Laurent Vivier <lvivier@redhat.com>
Date: Wed, 27 Nov 2024 11:25:21 +0100
Subject: [PATCH 142/382] virtio: check if avail ring is configured

If the connection to the vhost-user front-end is closed during transfers,
the virtio rings are deconfigured and no longer available, but we may
still try to access them to process queued data. This can trigger a
SIGSEGV as we try to access memory that is no longer mapped.
To fix that, check that vq->vring.avail is valid before accessing the
vring.

Signed-off-by: Laurent Vivier <lvivier@redhat.com>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 virtio.c | 15 +++++++++++++++
 1 file changed, 15 insertions(+)
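
A standalone illustration of the pattern applied below (stub types, not
passt code): any accessor that would dereference the available ring
first checks that the ring is still mapped, and treats the queue as
empty, or simply returns, otherwise.

/* Standalone sketch with stub types, not passt code */
#include <stdbool.h>
#include <stdio.h>

struct avail_stub { unsigned short idx; };

struct vq_stub {
	struct avail_stub *avail;	/* NULL once rings are deconfigured */
	unsigned short shadow_avail_idx;
	unsigned short last_avail_idx;
};

static bool queue_empty(const struct vq_stub *vq)
{
	if (!vq->avail)		/* front-end gone: nothing to access */
		return true;

	return vq->shadow_avail_idx == vq->last_avail_idx;
}

int main(void)
{
	struct vq_stub vq = { .avail = NULL };

	printf("%d\n", queue_empty(&vq));	/* 1: safely treated as empty */

	return 0;
}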

diff --git a/virtio.c b/virtio.c
index 6a97435..0598ff4 100644
--- a/virtio.c
+++ b/virtio.c
@@ -284,6 +284,9 @@ static int virtqueue_read_next_desc(const struct vring_desc *desc,
  */
 bool vu_queue_empty(struct vu_virtq *vq)
 {
+	if (!vq->vring.avail)
+		return true;
+
 	if (vq->shadow_avail_idx != vq->last_avail_idx)
 		return false;
 
@@ -327,6 +330,9 @@ static bool vring_can_notify(const struct vu_dev *dev, struct vu_virtq *vq)
  */
 void vu_queue_notify(const struct vu_dev *dev, struct vu_virtq *vq)
 {
+	if (!vq->vring.avail)
+		return;
+
 	if (!vring_can_notify(dev, vq)) {
 		debug("vhost-user: virtqueue can skip notify...");
 		return;
@@ -502,6 +508,9 @@ int vu_queue_pop(struct vu_dev *dev, struct vu_virtq *vq, struct vu_virtq_elemen
 	unsigned int head;
 	int ret;
 
+	if (!vq->vring.avail)
+		return -1;
+
 	if (vu_queue_empty(vq))
 		return -1;
 
@@ -591,6 +600,9 @@ void vu_queue_fill_by_index(struct vu_virtq *vq, unsigned int index,
 {
 	struct vring_used_elem uelem;
 
+	if (!vq->vring.avail)
+		return;
+
 	idx = (idx + vq->used_idx) % vq->vring.num;
 
 	uelem.id = htole32(index);
@@ -633,6 +645,9 @@ void vu_queue_flush(struct vu_virtq *vq, unsigned int count)
 {
 	uint16_t old, new;
 
+	if (!vq->vring.avail)
+		return;
+
 	/* Make sure buffer is written before we update index. */
 	smp_wmb();
 

From 00cc2303fd6ac4b72c19d1741dff72fd42c09a47 Mon Sep 17 00:00:00 2001
From: Laurent Vivier <lvivier@redhat.com>
Date: Wed, 27 Nov 2024 12:15:51 +0100
Subject: [PATCH 143/382] Fix build on 32bit target
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Fix the following errors when built with CFLAGS="-m32 -U__AVX2__":

packet.c:57:23: warning: format ‘%lu’ expects argument of type ‘long unsigned int’, but argument 5 has type ‘size_t’ {aka ‘unsigned int’} [-Wformat=]
   57 |                 trace("packet offset plus length %lu from size %lu, "
   58 |                       "%s:%i", start - p->buf + len + offset,
      |                                ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
      |                                                     |
      |                                                     size_t {aka unsigned int}

packet.c:57:23: warning: format ‘%lu’ expects argument of type ‘long unsigned int’, but argument 6 has type ‘size_t’ {aka ‘unsigned int’} [-Wformat=]
   57 |                 trace("packet offset plus length %lu from size %lu, "
      |                       ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
   58 |                       "%s:%i", start - p->buf + len + offset,
   59 |                       p->buf_size, func, line);
      |                       ~~~~~~~~~~~
      |                        |
      |                        size_t {aka unsigned int}

vhost_user.c:139:32: warning: cast to pointer from integer of different size [-Wint-to-pointer-cast]
  139 |                         return (void *)(qemu_addr - r->qva + r->mmap_addr +
      |                                ^

vhost_user.c:439:32: warning: cast to pointer from integer of different size [-Wint-to-pointer-cast]
  439 |                         munmap((void *)r->mmap_addr, r->size + r->mmap_offset);
      |                                ^

vhost_user.c:900:32: warning: cast to pointer from integer of different size [-Wint-to-pointer-cast]
  900 |                         munmap((void *)r->mmap_addr, r->size + r->mmap_offset);
      |                                ^

virtio.c:111:32: warning: cast to pointer from integer of different size [-Wint-to-pointer-cast]
  111 |                         return (void *)(guest_addr - r->gpa + r->mmap_addr +
      |                                ^

vu_common.c:37:27: warning: cast to pointer from integer of different size [-Wint-to-pointer-cast]
   37 |                 char *m = (char *)dev_region->mmap_addr;
      |                           ^
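
The fixes come down to two idioms, shown here in an illustrative,
self-contained snippet (hypothetical values, not passt code): %zu for
size_t, whose width differs between 32-bit and 64-bit targets, and a cast
through uintptr_t when converting a 64-bit address field to a pointer:

	#include <stdint.h>
	#include <stdio.h>

	int main(void)
	{
		uint64_t mmap_addr = 0x1000;	/* hypothetical mapping address */
		size_t len = 42;
		void *p;

		printf("len %zu\n", len);	/* %zu instead of %lu */

		/* The intermediate uintptr_t makes the (possibly narrowing)
		 * conversion explicit and silences -Wint-to-pointer-cast.
		 */
		p = (void *)(uintptr_t)mmap_addr;

		return !p;
	}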

Signed-off-by: Laurent Vivier <lvivier@redhat.com>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 packet.c     |  2 +-
 vhost_user.c | 11 +++++++----
 virtio.c     |  5 +++--
 vu_common.c  |  2 +-
 4 files changed, 12 insertions(+), 8 deletions(-)

diff --git a/packet.c b/packet.c
index e5a78d0..03a11e6 100644
--- a/packet.c
+++ b/packet.c
@@ -54,7 +54,7 @@ static int packet_check_range(const struct pool *p, size_t offset, size_t len,
 	}
 
 	if (start + len + offset > p->buf + p->buf_size) {
-		trace("packet offset plus length %lu from size %lu, "
+		trace("packet offset plus length %zu from size %zu, "
 		      "%s:%i", start - p->buf + len + offset,
 		      p->buf_size, func, line);
 		return -1;
diff --git a/vhost_user.c b/vhost_user.c
index 51c90db..4b8558f 100644
--- a/vhost_user.c
+++ b/vhost_user.c
@@ -136,8 +136,9 @@ static void *qva_to_va(struct vu_dev *dev, uint64_t qemu_addr)
 
 		if ((qemu_addr >= r->qva) && (qemu_addr < (r->qva + r->size))) {
 			/* NOLINTNEXTLINE(performance-no-int-to-ptr) */
-			return (void *)(qemu_addr - r->qva + r->mmap_addr +
-					r->mmap_offset);
+			return (void *)(uintptr_t)(qemu_addr - r->qva +
+						   r->mmap_addr +
+						   r->mmap_offset);
 		}
 	}
 
@@ -436,7 +437,8 @@ static bool vu_set_mem_table_exec(struct vu_dev *vdev,
 
 		if (r->mmap_addr) {
 			/* NOLINTNEXTLINE(performance-no-int-to-ptr) */
-			munmap((void *)r->mmap_addr, r->size + r->mmap_offset);
+			munmap((void *)(uintptr_t)r->mmap_addr,
+			       r->size + r->mmap_offset);
 		}
 	}
 	vdev->nregions = memory->nregions;
@@ -897,7 +899,8 @@ void vu_cleanup(struct vu_dev *vdev)
 
 		if (r->mmap_addr) {
 			/* NOLINTNEXTLINE(performance-no-int-to-ptr) */
-			munmap((void *)r->mmap_addr, r->size + r->mmap_offset);
+			munmap((void *)(uintptr_t)r->mmap_addr,
+			       r->size + r->mmap_offset);
 		}
 	}
 	vdev->nregions = 0;
diff --git a/virtio.c b/virtio.c
index 0598ff4..a76de5e 100644
--- a/virtio.c
+++ b/virtio.c
@@ -108,8 +108,9 @@ static void *vu_gpa_to_va(struct vu_dev *dev, uint64_t *plen, uint64_t guest_add
 			if ((guest_addr + *plen) > (r->gpa + r->size))
 				*plen = r->gpa + r->size - guest_addr;
 			/* NOLINTNEXTLINE(performance-no-int-to-ptr) */
-			return (void *)(guest_addr - r->gpa + r->mmap_addr +
-						     r->mmap_offset);
+			return (void *)(uintptr_t)(guest_addr - r->gpa +
+						   r->mmap_addr +
+						   r->mmap_offset);
 		}
 	}
 
diff --git a/vu_common.c b/vu_common.c
index f2eb701..299b5a3 100644
--- a/vu_common.c
+++ b/vu_common.c
@@ -35,7 +35,7 @@ int vu_packet_check_range(void *buf, size_t offset, size_t len,
 
 	for (dev_region = buf; dev_region->mmap_addr; dev_region++) {
 		/* NOLINTNEXTLINE(performance-no-int-to-ptr) */
-		char *m = (char *)dev_region->mmap_addr;
+		char *m = (char *)(uintptr_t)dev_region->mmap_addr;
 
 		if (m <= start &&
 		    start + offset + len <= m + dev_region->mmap_offset +

From 804a7ce94a14fbc4dee0a14b2c5f7a72ebb8bff6 Mon Sep 17 00:00:00 2001
From: Stefano Brivio <sbrivio@redhat.com>
Date: Wed, 27 Nov 2024 15:37:01 +0100
Subject: [PATCH 144/382] tcp_vu: Change 'dlen' to ssize_t in
 tcp_vu_data_from_sock()

...to quickly suppress a false positive from Coverity, which assumes
that iov_size() is 0 and 'dlen' might overflow as a result (with hdrlen
being 66). An ASSERT() in tcp_vu_sock_recv() already guarantees that
iov_size(iov, buf_cnt) here is in any case greater than 'hdrlen'.
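
Purely as an illustration of the arithmetic involved (hypothetical numbers,
not code from this patch): with unsigned types the subtraction would wrap
instead of going negative, which is what the checker flags, even though the
ASSERT() rules the case out:

	#include <stdio.h>
	#include <sys/types.h>

	int main(void)
	{
		size_t sz = 10, hdrlen = 66;			/* hypothetical sizes */
		size_t u = sz - hdrlen;				/* wraps to a huge value */
		ssize_t d = (ssize_t)sz - (ssize_t)hdrlen;	/* -56, easy to range-check */

		printf("unsigned: %zu, signed: %zd\n", u, d);
		return 0;
	}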

Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Reviewed-by: Laurent Vivier <lvivier@redhat.com>
---
 tcp_vu.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tcp_vu.c b/tcp_vu.c
index f27e175..bbae918 100644
--- a/tcp_vu.c
+++ b/tcp_vu.c
@@ -463,7 +463,7 @@ int tcp_vu_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn)
 	for (i = 0, check = NULL; i < head_cnt; i++) {
 		struct iovec *iov = &elem[head[i]].in_sg[0];
 		int buf_cnt = head[i + 1] - head[i];
-		int dlen = iov_size(iov, buf_cnt) - hdrlen;
+		ssize_t dlen = iov_size(iov, buf_cnt) - hdrlen;
 
 		vu_set_vnethdr(vdev, iov->iov_base, buf_cnt);
 

From f9311031713ab8f18e9c872a42a8f6a9935954ec Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Wed, 27 Nov 2024 14:54:04 +1100
Subject: [PATCH 145/382] iov: iov tail helpers

In the vhost-user code we have a number of places where we need to locate
a particular header within the guest-supplied IO vector.  We need to work
out which buffer the header is in, and verify that it's contiguous and
aligned as we need.  At the moment this is open-coded, but introduce a
helper to make this more straightforward.

We add a new datatype 'struct iov_tail' representing an IO vector from
which we've logically consumed some number of headers.  The
IOV_REMOVE_HEADER macro consumes a header from the vector, returning a
pointer to it and updating the iov_tail; IOV_PEEK_HEADER returns the same
pointer without consuming the header.
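
A hypothetical usage sketch (demo_parse() and the Ethernet/IPv4 headers are
examples only, not part of this patch), showing how a caller peels headers
off a frame that may span several buffers:

	#include <linux/if_ether.h>
	#include <linux/ip.h>
	#include <sys/uio.h>

	#include "iov.h"

	static int demo_parse(const struct iovec *iov, size_t cnt)
	{
		struct iov_tail tail = IOV_TAIL(iov, cnt, 0);
		struct ethhdr *eh = IOV_REMOVE_HEADER(&tail, struct ethhdr);
		struct iphdr *iph = IOV_PEEK_HEADER(&tail, struct iphdr);

		if (!eh || !iph)
			return -1;	/* header not contiguous, or misaligned */

		/* tail still covers the IPv4 header and everything after it */
		return (int)iov_tail_size(&tail);
	}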

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 iov.c | 93 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 iov.h | 76 ++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 169 insertions(+)

diff --git a/iov.c b/iov.c
index 3741db2..4c6416c 100644
--- a/iov.c
+++ b/iov.c
@@ -155,3 +155,96 @@ size_t iov_size(const struct iovec *iov, size_t iov_cnt)
 
 	return len;
 }
+
+/**
+ * iov_tail_prune() - Remove any unneeded buffers from an IOV tail
+ * @tail:	IO vector tail (modified)
+ *
+ * If an IOV tail's offset is large enough, it may not include any bytes from
+ * the first (or first several) buffers in the underlying IO vector.  Modify the
+ * tail's representation so it contains the same logical bytes, but only
+ * includes buffers that are actually needed.  This will avoid stepping through
+ * unnecessary elements of the underlying IO vector on future operations.
+ *
+ * Return:	true if the tail still contains any bytes, otherwise false
+ */
+bool iov_tail_prune(struct iov_tail *tail)
+{
+	size_t i;
+
+	i = iov_skip_bytes(tail->iov, tail->cnt, tail->off, &tail->off);
+	tail->iov += i;
+	tail->cnt -= i;
+
+	return !!tail->cnt;
+}
+
+/**
+ * iov_tail_size() - Calculate the total size of an IO vector tail
+ * @tail:	IO vector tail
+ *
+ * Returns:    The total size in bytes.
+ */
+/* cppcheck-suppress unusedFunction */
+size_t iov_tail_size(struct iov_tail *tail)
+{
+	iov_tail_prune(tail);
+	return iov_size(tail->iov, tail->cnt) - tail->off;
+}
+
+/**
+ * iov_peek_header_() - Get pointer to a header from an IOV tail
+ * @tail:	IOV tail to get header from
+ * @len:	Length of header to get, in bytes
+ * @align:	Required alignment of header, in bytes
+ *
+ * @tail may be pruned, but will represent the same bytes as before.
+ *
+ * Returns: Pointer to the first @len logical bytes of the tail, NULL if that
+ *	    overruns the IO vector, is not contiguous or doesn't have the
+ *	    requested alignment.
+ */
+void *iov_peek_header_(struct iov_tail *tail, size_t len, size_t align)
+{
+	char *p;
+
+	if (!iov_tail_prune(tail))
+		return NULL; /* Nothing left */
+
+	if (tail->off + len < tail->off)
+		return NULL; /* Overflow */
+
+	if (tail->off + len > tail->iov[0].iov_len)
+		return NULL; /* Not contiguous */
+
+	p = (char *)tail->iov[0].iov_base + tail->off;
+	if ((uintptr_t)p % align)
+		return NULL; /* not aligned */
+
+	return p;
+}
+
+/**
+ * iov_remove_header_() - Remove a header from an IOV tail
+ * @tail:	IOV tail to remove header from (modified)
+ * @len:	Length of header to remove, in bytes
+ * @align:	Required alignment of header, in bytes
+ *
+ * On success, @tail is updated so that it no longer includes the bytes of the
+ * returned header.
+ *
+ * Returns: Pointer to the first @len logical bytes of the tail, NULL if that
+ *	    overruns the IO vector, is not contiguous or doesn't have the
+ *	    requested alignment.
+ */
+/* cppcheck-suppress unusedFunction */
+void *iov_remove_header_(struct iov_tail *tail, size_t len, size_t align)
+{
+	char *p = iov_peek_header_(tail, len, align);
+
+	if (!p)
+		return NULL;
+
+	tail->off = tail->off + len;
+	return p;
+}
diff --git a/iov.h b/iov.h
index a9e1722..9855bf0 100644
--- a/iov.h
+++ b/iov.h
@@ -28,4 +28,80 @@ size_t iov_from_buf(const struct iovec *iov, size_t iov_cnt,
 size_t iov_to_buf(const struct iovec *iov, size_t iov_cnt,
                   size_t offset, void *buf, size_t bytes);
 size_t iov_size(const struct iovec *iov, size_t iov_cnt);
+
+/*
+ * DOC: Theory of Operation, struct iov_tail
+ *
+ * Sometimes a single logical network frame is split across multiple buffers,
+ * represented by an IO vector (struct iovec[]).  We often want to process this
+ * one header / network layer at a time.  So, it's useful to maintain a "tail"
+ * of the vector representing the parts we haven't yet extracted.
+ *
+ * The headers we extract need not line up with buffer boundaries (though we do
+ * assume they're contiguous within a single buffer for now).  So, we could
+ * represent that tail as another struct iovec[], but that would mean copying
+ * the whole array of struct iovecs, just so we can adjust the offset and length
+ * on the first one.
+ *
+ * So, instead represent the tail as pointer into an existing struct iovec[],
+ * with an explicit offset for where the "tail" starts within it.  If we extract
+ * enough headers that some buffers of the original vector no longer contain
+ * part of the tail, we (lazily) advance our struct iovec * to the first buffer
+ * we still need, and adjust the vector length and offset to match.
+ */
+
+/**
+ * struct iov_tail - An IO vector which may have some headers logically removed
+ * @iov:	IO vector
+ * @cnt:	Number of entries in @iov
+ * @off:	Current offset in @iov
+ */
+struct iov_tail {
+	const struct iovec *iov;
+	size_t cnt, off;
+};
+
+/**
+ * IOV_TAIL() - Create a new IOV tail
+ * @iov_:	IO vector to create tail from
+ * @cnt_:	Length of the IO vector at @iov_
+ * @off_:	Byte offset in the IO vector where the tail begins
+ */
+#define IOV_TAIL(iov_, cnt_, off_) \
+	(struct iov_tail){ .iov = (iov_), .cnt = (cnt_), .off = (off_) }
+
+bool iov_tail_prune(struct iov_tail *tail);
+size_t iov_tail_size(struct iov_tail *tail);
+void *iov_peek_header_(struct iov_tail *tail, size_t len, size_t align);
+void *iov_remove_header_(struct iov_tail *tail, size_t len, size_t align);
+
+/**
+ * IOV_PEEK_HEADER() - Get typed pointer to a header from an IOV tail
+ * @tail_:	IOV tail to get header from
+ * @type_:	Data type of the header
+ *
+ * @tail_ may be pruned, but will represent the same bytes as before.
+ *
+ * Returns: Pointer of type (@type_ *) located at the start of @tail_, NULL if
+ *          we can't get a contiguous and aligned pointer.
+ */
+#define IOV_PEEK_HEADER(tail_, type_)					\
+	((type_ *)(iov_peek_header_((tail_),				\
+				    sizeof(type_), __alignof__(type_))))
+
+/**
+ * IOV_REMOVE_HEADER() - Remove and return typed header from an IOV tail
+ * @tail_:	IOV tail to remove header from (modified)
+ * @type_:	Data type of the header to remove
+ *
+ * On success, @tail_ is updated so that it no longer includes the bytes of the
+ * returned header.
+ *
+ * Returns: Pointer of type (@type_ *) located at the old start of @tail_, NULL
+ *          if we can't get a contiguous and aligned pointer.
+ */
+#define IOV_REMOVE_HEADER(tail_, type_)					\
+	((type_ *)(iov_remove_header_((tail_),				\
+				      sizeof(type_), __alignof__(type_))))
+
 #endif /* IOVEC_H */

From 67151090bc349d9eec5a0b303d0cb3347b755251 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Wed, 27 Nov 2024 14:54:05 +1100
Subject: [PATCH 146/382] iov, checksum: Replace csum_iov() with
 csum_iov_tail()

We usually want to checksum only the tail part of a frame, excluding at
least some headers.  csum_iov() does that for a frame represented as an
IO vector, not actually summing the entire IO vector.  We now have struct
iov_tail to explicitly represent this construct, so replace csum_iov()
with csum_iov_tail() taking that representation rather than 3 parameters.

We propagate the same change to csum_udp4() and csum_udp6(), which take
similar parameters.  This slightly simplifies the code, and will allow some
further simplifications as struct iov_tail is more widely used.
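
A hypothetical caller of the new interface (the demo_* wrapper is made up,
the signatures are the ones introduced here): the payload is wrapped in an
iov_tail instead of being passed as (iov, iov_cnt, offset):

	#include <netinet/in.h>
	#include <netinet/udp.h>
	#include <sys/uio.h>

	#include "checksum.h"
	#include "iov.h"

	static void demo_udp4_csum(struct udphdr *uh, struct in_addr saddr,
				   struct in_addr daddr, void *buf, size_t dlen)
	{
		struct iovec iov = { .iov_base = buf, .iov_len = dlen };
		struct iov_tail data = IOV_TAIL(&iov, 1, 0);

		csum_udp4(uh, saddr, daddr, &data);	/* sums UDP header plus payload */
	}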

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 checksum.c | 56 +++++++++++++++++++++---------------------------------
 checksum.h |  8 ++++----
 iov.c      |  1 -
 tap.c      |  6 ++++--
 tcp.c      |  6 ++++--
 udp.c      |  7 ++++---
 udp_vu.c   |  9 +++++----
 7 files changed, 43 insertions(+), 50 deletions(-)

diff --git a/checksum.c b/checksum.c
index c673993..1c4354d 100644
--- a/checksum.c
+++ b/checksum.c
@@ -166,24 +166,22 @@ uint32_t proto_ipv4_header_psum(uint16_t l4len, uint8_t protocol,
  * @udp4hr:	UDP header, initialised apart from checksum
  * @saddr:	IPv4 source address
  * @daddr:	IPv4 destination address
- * @iov:	Pointer to the array of IO vectors
- * @iov_cnt:	Length of the array
- * @offset:	UDP payload offset in the iovec array
+ * @data:	UDP payload (as IO vector tail)
  */
 void csum_udp4(struct udphdr *udp4hr,
 	       struct in_addr saddr, struct in_addr daddr,
-	       const struct iovec *iov, int iov_cnt, size_t offset)
+	       struct iov_tail *data)
 {
 	/* UDP checksums are optional, so don't bother */
 	udp4hr->check = 0;
 
 	if (UDP4_REAL_CHECKSUMS) {
-		uint16_t l4len = iov_size(iov, iov_cnt) - offset +
-				 sizeof(struct udphdr);
+		uint16_t l4len = iov_tail_size(data) + sizeof(struct udphdr);
 		uint32_t psum = proto_ipv4_header_psum(l4len, IPPROTO_UDP,
 						       saddr, daddr);
+
 		psum = csum_unfolded(udp4hr, sizeof(struct udphdr), psum);
-		udp4hr->check = csum_iov(iov, iov_cnt, offset, psum);
+		udp4hr->check = csum_iov_tail(data, psum);
 	}
 }
 
@@ -231,22 +229,20 @@ uint32_t proto_ipv6_header_psum(uint16_t payload_len, uint8_t protocol,
  * @udp6hr:	UDP header, initialised apart from checksum
  * @saddr:	Source address
  * @daddr:	Destination address
- * @iov:	Pointer to the array of IO vectors
- * @iov_cnt:	Length of the array
- * @offset:	UDP payload offset in the iovec array
+ * @data:	UDP payload (as IO vector tail)
  */
 void csum_udp6(struct udphdr *udp6hr,
 	       const struct in6_addr *saddr, const struct in6_addr *daddr,
-	       const struct iovec *iov, int iov_cnt, size_t offset)
+	       struct iov_tail *data)
 {
-	uint16_t l4len = iov_size(iov, iov_cnt) - offset +
-			 sizeof(struct udphdr);
+	uint16_t l4len = iov_tail_size(data) + sizeof(struct udphdr);
 	uint32_t psum = proto_ipv6_header_psum(l4len, IPPROTO_UDP,
 					       saddr, daddr);
+
 	udp6hr->check = 0;
 
 	psum = csum_unfolded(udp6hr, sizeof(struct udphdr), psum);
-	udp6hr->check = csum_iov(iov, iov_cnt, offset, psum);
+	udp6hr->check = csum_iov_tail(data, psum);
 }
 
 /**
@@ -501,31 +497,23 @@ uint16_t csum(const void *buf, size_t len, uint32_t init)
 }
 
 /**
- * csum_iov() - Calculates the unfolded checksum over an array of IO vectors
- *
- * @iov		Pointer to the array of IO vectors
- * @n		Length of the array
- * @offset:	Offset of the data to checksum within the full data length
+ * csum_iov_tail() - Calculate unfolded checksum for the tail of an IO vector
+ * @tail:	IO vector tail to checksum
  * @init	Initial 32-bit checksum, 0 for no pre-computed checksum
  *
  * Return: 16-bit folded, complemented checksum
  */
-uint16_t csum_iov(const struct iovec *iov, size_t n, size_t offset,
-		  uint32_t init)
+uint16_t csum_iov_tail(struct iov_tail *tail, uint32_t init)
 {
-	unsigned int i;
-	size_t first;
-
-	i = iov_skip_bytes(iov, n, offset, &first);
-	if (i >= n)
-		return (uint16_t)~csum_fold(init);
-
-	init = csum_unfolded((char *)iov[i].iov_base + first,
-			     iov[i].iov_len - first, init);
-	i++;
-
-	for (; i < n; i++)
-		init = csum_unfolded(iov[i].iov_base, iov[i].iov_len, init);
+	if (iov_tail_prune(tail)) {
+		size_t i;
 
+		init = csum_unfolded((char *)tail->iov[0].iov_base + tail->off,
+				     tail->iov[0].iov_len - tail->off, init);
+		for (i = 1; i < tail->cnt; i++) {
+			const struct iovec *iov = &tail->iov[i];
+			init = csum_unfolded(iov->iov_base, iov->iov_len, init);
+		}
+	}
 	return (uint16_t)~csum_fold(init);
 }
diff --git a/checksum.h b/checksum.h
index 31ba322..e243c97 100644
--- a/checksum.h
+++ b/checksum.h
@@ -9,6 +9,7 @@
 struct udphdr;
 struct icmphdr;
 struct icmp6hdr;
+struct iov_tail;
 
 uint32_t sum_16b(const void *buf, size_t len);
 uint16_t csum_fold(uint32_t sum);
@@ -19,20 +20,19 @@ uint32_t proto_ipv4_header_psum(uint16_t l4len, uint8_t protocol,
 				struct in_addr saddr, struct in_addr daddr);
 void csum_udp4(struct udphdr *udp4hr,
 	       struct in_addr saddr, struct in_addr daddr,
-	       const struct iovec *iov, int iov_cnt, size_t offset);
+	       struct iov_tail *data);
 void csum_icmp4(struct icmphdr *icmp4hr, const void *payload, size_t dlen);
 uint32_t proto_ipv6_header_psum(uint16_t payload_len, uint8_t protocol,
 				const struct in6_addr *saddr,
 				const struct in6_addr *daddr);
 void csum_udp6(struct udphdr *udp6hr,
 	       const struct in6_addr *saddr, const struct in6_addr *daddr,
-	       const struct iovec *iov, int iov_cnt, size_t offset);
+	       struct iov_tail *data);
 void csum_icmp6(struct icmp6hdr *icmp6hr,
 		const struct in6_addr *saddr, const struct in6_addr *daddr,
 		const void *payload, size_t dlen);
 uint32_t csum_unfolded(const void *buf, size_t len, uint32_t init);
 uint16_t csum(const void *buf, size_t len, uint32_t init);
-uint16_t csum_iov(const struct iovec *iov, size_t n, size_t offset,
-		  uint32_t init);
+uint16_t csum_iov_tail(struct iov_tail *tail, uint32_t init);
 
 #endif /* CHECKSUM_H */
diff --git a/iov.c b/iov.c
index 4c6416c..2f7be15 100644
--- a/iov.c
+++ b/iov.c
@@ -185,7 +185,6 @@ bool iov_tail_prune(struct iov_tail *tail)
  *
  * Returns:    The total size in bytes.
  */
-/* cppcheck-suppress unusedFunction */
 size_t iov_tail_size(struct iov_tail *tail)
 {
 	iov_tail_prune(tail);
diff --git a/tap.c b/tap.c
index cde1719..c418064 100644
--- a/tap.c
+++ b/tap.c
@@ -184,11 +184,12 @@ void tap_udp4_send(const struct ctx *c, struct in_addr src, in_port_t sport,
 		.iov_base = (void *)in,
 		.iov_len = dlen
 	};
+	struct iov_tail payload = IOV_TAIL(&iov, 1, 0);
 
 	uh->source = htons(sport);
 	uh->dest = htons(dport);
 	uh->len = htons(l4len);
-	csum_udp4(uh, src, dst, &iov, 1, 0);
+	csum_udp4(uh, src, dst, &payload);
 	memcpy(data, in, dlen);
 
 	tap_send_single(c, buf, dlen + (data - buf));
@@ -271,11 +272,12 @@ void tap_udp6_send(const struct ctx *c,
 		.iov_base = in,
 		.iov_len = dlen
 	};
+	struct iov_tail payload = IOV_TAIL(&iov, 1, 0);
 
 	uh->source = htons(sport);
 	uh->dest = htons(dport);
 	uh->len = htons(l4len);
-	csum_udp6(uh, src, dst, &iov, 1, 0);
+	csum_udp6(uh, src, dst, &payload);
 	memcpy(data, in, dlen);
 
 	tap_send_single(c, buf, dlen + (data - buf));
diff --git a/tcp.c b/tcp.c
index 61c12a5..f334ca5 100644
--- a/tcp.c
+++ b/tcp.c
@@ -764,6 +764,7 @@ void tcp_update_check_tcp4(const struct iphdr *iph,
 			   size_t l4offset)
 {
 	uint16_t l4len = ntohs(iph->tot_len) - sizeof(struct iphdr);
+	struct iov_tail l4 = IOV_TAIL(iov, iov_cnt, l4offset);
 	struct in_addr saddr = { .s_addr = iph->saddr };
 	struct in_addr daddr = { .s_addr = iph->daddr };
 	size_t check_ofs;
@@ -801,7 +802,7 @@ void tcp_update_check_tcp4(const struct iphdr *iph,
 	check = (uint16_t *)ptr;
 
 	*check = 0;
-	*check = csum_iov(iov, iov_cnt, l4offset, sum);
+	*check = csum_iov_tail(&l4, sum);
 }
 
 /**
@@ -815,6 +816,7 @@ void tcp_update_check_tcp6(const struct ipv6hdr *ip6h,
 			   const struct iovec *iov, int iov_cnt,
 			   size_t l4offset)
 {
+	struct iov_tail l4 = IOV_TAIL(iov, iov_cnt, l4offset);
 	uint16_t l4len = ntohs(ip6h->payload_len);
 	size_t check_ofs;
 	uint16_t *check;
@@ -852,7 +854,7 @@ void tcp_update_check_tcp6(const struct ipv6hdr *ip6h,
 	check = (uint16_t *)ptr;
 
 	*check = 0;
-	*check = csum_iov(iov, iov_cnt, l4offset, sum);
+	*check = csum_iov_tail(&l4, sum);
 }
 
 /**
diff --git a/udp.c b/udp.c
index 5b0093a..c89f031 100644
--- a/udp.c
+++ b/udp.c
@@ -316,7 +316,8 @@ size_t udp_update_hdr4(struct iphdr *ip4h, struct udp_payload_t *bp,
 			.iov_base = bp->data,
 			.iov_len = dlen
 		};
-		csum_udp4(&bp->uh, *src, *dst, &iov, 1, 0);
+		struct iov_tail data = IOV_TAIL(&iov, 1, 0);
+		csum_udp4(&bp->uh, *src, *dst, &data);
 	}
 
 	return l4len;
@@ -360,8 +361,8 @@ size_t udp_update_hdr6(struct ipv6hdr *ip6h, struct udp_payload_t *bp,
 			.iov_base = bp->data,
 			.iov_len = dlen
 		};
-		csum_udp6(&bp->uh, &toside->oaddr.a6, &toside->eaddr.a6,
-			  &iov, 1, 0);
+		struct iov_tail data = IOV_TAIL(&iov, 1, 0);
+		csum_udp6(&bp->uh, &toside->oaddr.a6, &toside->eaddr.a6, &data);
 	}
 
 	return l4len;
diff --git a/udp_vu.c b/udp_vu.c
index c911022..9c697f3 100644
--- a/udp_vu.c
+++ b/udp_vu.c
@@ -199,15 +199,16 @@ static void udp_vu_csum(const struct flowside *toside, int iov_used)
 	const struct in_addr *dst4 = inany_v4(&toside->eaddr);
 	char *base = iov_vu[0].iov_base;
 	struct udp_payload_t *bp;
+	struct iov_tail data;
 
 	if (src4 && dst4) {
 		bp = vu_payloadv4(base);
-		csum_udp4(&bp->uh, *src4, *dst4, iov_vu, iov_used,
-			  (char *)&bp->data - base);
+		data = IOV_TAIL(iov_vu, iov_used, (char *)&bp->data - base);
+		csum_udp4(&bp->uh, *src4, *dst4, &data);
 	} else {
 		bp = vu_payloadv6(base);
-		csum_udp6(&bp->uh, &toside->oaddr.a6, &toside->eaddr.a6,
-			  iov_vu, iov_used, (char *)&bp->data - base);
+		data = IOV_TAIL(iov_vu, iov_used, (char *)&bp->data - base);
+		csum_udp6(&bp->uh, &toside->oaddr.a6, &toside->eaddr.a6, &data);
 	}
 }
 

From 2ee07697c4ab4f4efff6431aaa787f21bcc6f1d1 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Wed, 27 Nov 2024 14:54:06 +1100
Subject: [PATCH 147/382] tcp: Pass TCP header and payload separately to
 tcp_update_check_tcp[46]()

Currently these expect both the TCP header and payload in a single IOV,
and go to some trouble to locate the checksum field within it.  In the
current caller we already know where the TCP header is, so we might as
well just pass it in.  This will need to work a bit differently for
vhost-user, but that code already needs to locate the TCP header for other
reasons, so again we can just pass it in.
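
For illustration, the IPv4 call now looks like the fragment below (taken
from the tcp.c hunk further down, so 'iph', 'bp' and 'dlen' are assumed
from that context, and this is not a self-contained example):

	const struct iovec iov = { .iov_base = bp->data, .iov_len = dlen };
	struct iov_tail payload = IOV_TAIL(&iov, 1, 0);

	tcp_update_check_tcp4(iph, &bp->th, &payload);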

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 tcp.c          | 106 ++++++++++---------------------------------------
 tcp_internal.h |  10 ++---
 tcp_vu.c       |  12 ++++--
 3 files changed, 34 insertions(+), 94 deletions(-)

diff --git a/tcp.c b/tcp.c
index f334ca5..5c40e18 100644
--- a/tcp.c
+++ b/tcp.c
@@ -755,106 +755,42 @@ static void tcp_sock_set_bufsize(const struct ctx *c, int s)
 /**
  * tcp_update_check_tcp4() - Calculate TCP checksum for IPv4
  * @iph:	IPv4 header
- * @iov:	Pointer to the array of IO vectors
- * @iov_cnt:	Length of the array
- * @l4offset:	IPv4 payload offset in the iovec array
+ * @th:		TCP header (updated)
+ * @payload:	TCP payload
  */
-void tcp_update_check_tcp4(const struct iphdr *iph,
-			   const struct iovec *iov, int iov_cnt,
-			   size_t l4offset)
+void tcp_update_check_tcp4(const struct iphdr *iph, struct tcphdr *th,
+			   struct iov_tail *payload)
 {
 	uint16_t l4len = ntohs(iph->tot_len) - sizeof(struct iphdr);
-	struct iov_tail l4 = IOV_TAIL(iov, iov_cnt, l4offset);
 	struct in_addr saddr = { .s_addr = iph->saddr };
 	struct in_addr daddr = { .s_addr = iph->daddr };
-	size_t check_ofs;
-	uint16_t *check;
-	int check_idx;
 	uint32_t sum;
-	char *ptr;
 
 	sum = proto_ipv4_header_psum(l4len, IPPROTO_TCP, saddr, daddr);
 
-	check_idx = iov_skip_bytes(iov, iov_cnt,
-				   l4offset + offsetof(struct tcphdr, check),
-				   &check_ofs);
-
-	if (check_idx >= iov_cnt) {
-		err("TCP4 buffer is too small, iov size %zd, check offset %zd",
-		    iov_size(iov, iov_cnt),
-		    l4offset + offsetof(struct tcphdr, check));
-		return;
-	}
-
-	if (check_ofs + sizeof(*check) > iov[check_idx].iov_len) {
-		err("TCP4 checksum field memory is not contiguous "
-		    "check_ofs %zd check_idx %d iov_len %zd",
-		    check_ofs, check_idx, iov[check_idx].iov_len);
-		return;
-	}
-
-	ptr = (char *)iov[check_idx].iov_base + check_ofs;
-	if ((uintptr_t)ptr & (__alignof__(*check) - 1)) {
-		err("TCP4 checksum field is not correctly aligned in memory");
-		return;
-	}
-
-	check = (uint16_t *)ptr;
-
-	*check = 0;
-	*check = csum_iov_tail(&l4, sum);
+	th->check = 0;
+	sum = csum_unfolded(th, sizeof(*th), sum);
+	th->check = csum_iov_tail(payload, sum);
 }
 
 /**
  * tcp_update_check_tcp6() - Calculate TCP checksum for IPv6
  * @ip6h:	IPv6 header
- * @iov:	Pointer to the array of IO vectors
- * @iov_cnt:	Length of the array
- * @l4offset:	IPv6 payload offset in the iovec array
+ * @th:		TCP header (updated)
+ * @payload:	TCP payload
  */
-void tcp_update_check_tcp6(const struct ipv6hdr *ip6h,
-			   const struct iovec *iov, int iov_cnt,
-			   size_t l4offset)
+void tcp_update_check_tcp6(const struct ipv6hdr *ip6h, struct tcphdr *th,
+			   struct iov_tail *payload)
 {
-	struct iov_tail l4 = IOV_TAIL(iov, iov_cnt, l4offset);
 	uint16_t l4len = ntohs(ip6h->payload_len);
-	size_t check_ofs;
-	uint16_t *check;
-	int check_idx;
 	uint32_t sum;
-	char *ptr;
 
 	sum = proto_ipv6_header_psum(l4len, IPPROTO_TCP, &ip6h->saddr,
 				     &ip6h->daddr);
 
-	check_idx = iov_skip_bytes(iov, iov_cnt,
-				   l4offset + offsetof(struct tcphdr, check),
-				   &check_ofs);
-
-	if (check_idx >= iov_cnt) {
-		err("TCP6 buffer is too small, iov size %zd, check offset %zd",
-		    iov_size(iov, iov_cnt),
-		    l4offset + offsetof(struct tcphdr, check));
-		return;
-	}
-
-	if (check_ofs + sizeof(*check) > iov[check_idx].iov_len) {
-		err("TCP6 checksum field memory is not contiguous "
-		    "check_ofs %zd check_idx %d iov_len %zd",
-		    check_ofs, check_idx, iov[check_idx].iov_len);
-		return;
-	}
-
-	ptr = (char *)iov[check_idx].iov_base + check_ofs;
-	if ((uintptr_t)ptr & (__alignof__(*check) - 1)) {
-		err("TCP6 checksum field is not correctly aligned in memory");
-		return;
-	}
-
-	check = (uint16_t *)ptr;
-
-	*check = 0;
-	*check = csum_iov_tail(&l4, sum);
+	th->check = 0;
+	sum = csum_unfolded(th, sizeof(*th), sum);
+	th->check = csum_iov_tail(payload, sum);
 }
 
 /**
@@ -1005,11 +941,12 @@ void tcp_fill_headers4(const struct tcp_tap_conn *conn,
 		bp->th.check = 0;
 	} else {
 		const struct iovec iov = {
-			.iov_base = bp,
-			.iov_len = ntohs(iph->tot_len) - sizeof(struct iphdr),
+			.iov_base = bp->data,
+			.iov_len = dlen,
 		};
+		struct iov_tail payload = IOV_TAIL(&iov, 1, 0);
 
-		tcp_update_check_tcp4(iph, &iov, 1, 0);
+		tcp_update_check_tcp4(iph, &bp->th, &payload);
 	}
 
 	tap_hdr_update(taph, l3len + sizeof(struct ethhdr));
@@ -1052,11 +989,12 @@ void tcp_fill_headers6(const struct tcp_tap_conn *conn,
 		bp->th.check = 0;
 	} else {
 		const struct iovec iov = {
-			.iov_base = bp,
-			.iov_len = ntohs(ip6h->payload_len)
+			.iov_base = bp->data,
+			.iov_len = dlen,
 		};
+		struct iov_tail payload = IOV_TAIL(&iov, 1, 0);
 
-		tcp_update_check_tcp6(ip6h, &iov, 1, 0);
+		tcp_update_check_tcp6(ip6h, &bp->th, &payload);
 	}
 
 	tap_hdr_update(taph, l4len + sizeof(*ip6h) + sizeof(struct ethhdr));
diff --git a/tcp_internal.h b/tcp_internal.h
index d7b125f..744c5c0 100644
--- a/tcp_internal.h
+++ b/tcp_internal.h
@@ -162,12 +162,10 @@ void tcp_rst_do(const struct ctx *c, struct tcp_tap_conn *conn);
 
 struct tcp_info_linux;
 
-void tcp_update_check_tcp4(const struct iphdr *iph,
-			   const struct iovec *iov, int iov_cnt,
-			   size_t l4offset);
-void tcp_update_check_tcp6(const struct ipv6hdr *ip6h,
-			   const struct iovec *iov, int iov_cnt,
-			   size_t l4offset);
+void tcp_update_check_tcp4(const struct iphdr *iph, struct tcphdr *th,
+			   struct iov_tail *payload);
+void tcp_update_check_tcp6(const struct ipv6hdr *ip6h, struct tcphdr *th,
+			   struct iov_tail *payload);
 void tcp_fill_headers4(const struct tcp_tap_conn *conn,
 		       struct tap_hdr *taph, struct iphdr *iph,
 		       struct tcp_payload_t *bp, size_t dlen,
diff --git a/tcp_vu.c b/tcp_vu.c
index bbae918..134650e 100644
--- a/tcp_vu.c
+++ b/tcp_vu.c
@@ -73,15 +73,19 @@ static void tcp_vu_update_check(const struct flowside *tapside,
 	char *base = iov[0].iov_base;
 
 	if (inany_v4(&tapside->oaddr)) {
+		struct tcphdr *th = vu_payloadv4(base);
 		const struct iphdr *iph = vu_ip(base);
+		struct iov_tail payload = IOV_TAIL(iov, iov_cnt,
+						   (char *)(th + 1) - base);
 
-		tcp_update_check_tcp4(iph, iov, iov_cnt,
-				      (char *)vu_payloadv4(base) - base);
+		tcp_update_check_tcp4(iph, th, &payload);
 	} else {
+		struct tcphdr *th = vu_payloadv6(base);
 		const struct ipv6hdr *ip6h = vu_ip(base);
+		struct iov_tail payload = IOV_TAIL(iov, iov_cnt,
+						   (char *)(th + 1) - base);
 
-		tcp_update_check_tcp6(ip6h, iov, iov_cnt,
-				      (char *)vu_payloadv6(base) - base);
+		tcp_update_check_tcp6(ip6h, th, &payload);
 	}
 }
 

From 08ea3cc581beed16afff3fa934f31cbdb82cbb95 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Wed, 27 Nov 2024 14:54:07 +1100
Subject: [PATCH 148/382] tcp: Pass TCP header and payload separately to
 tcp_fill_headers[46]()

At the moment these take separate pointers to the tap specific and IP
headers, but expect the TCP header and payload as a single tcp_payload_t.
As well as being slightly inconsistent, this involves some slightly iffy
pointer shenanigans when called on the flags path with a tcp_flags_t
instead of a tcp_payload_t.

More importantly, it's inconvenient for the upcoming vhost-user case, where
the TCP header and payload might not be contiguous.  Furthermore, the
payload itself might not be contiguous.

So, pass the TCP header as its own pointer, and the TCP payload as an IO
vector.
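
For illustration, a fragment mirroring the tcp_buf.c hunk below ('iov',
'conn', 'check', 'seq' and 'no_tcp_csum' are assumed from that context):
the TCP header is split off the payload buffer and both are passed
separately:

	struct iov_tail tail = IOV_TAIL(&iov[TCP_IOV_PAYLOAD], 1, 0);
	struct tcphdr *th = IOV_REMOVE_HEADER(&tail, struct tcphdr);

	tcp_fill_headers4(conn, iov[TCP_IOV_TAP].iov_base,
			  iov[TCP_IOV_IP].iov_base, th, &tail,
			  check, seq, no_tcp_csum);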

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 iov.c          |  1 -
 tcp.c          | 50 +++++++++++++++------------------------
 tcp_buf.c      | 22 ++++++++----------
 tcp_internal.h |  4 ++--
 tcp_vu.c       | 63 ++++++++++++++++++++++++++++----------------------
 5 files changed, 65 insertions(+), 75 deletions(-)

diff --git a/iov.c b/iov.c
index 2f7be15..3b12272 100644
--- a/iov.c
+++ b/iov.c
@@ -236,7 +236,6 @@ void *iov_peek_header_(struct iov_tail *tail, size_t len, size_t align)
  *	    overruns the IO vector, is not contiguous or doesn't have the
  *	    requested alignment.
  */
-/* cppcheck-suppress unusedFunction */
 void *iov_remove_header_(struct iov_tail *tail, size_t len, size_t align)
 {
 	char *p = iov_peek_header_(tail, len, align);
diff --git a/tcp.c b/tcp.c
index 5c40e18..2f900fc 100644
--- a/tcp.c
+++ b/tcp.c
@@ -909,21 +909,21 @@ static void tcp_fill_header(struct tcphdr *th,
  * @conn:		Connection pointer
  * @taph:		tap backend specific header
  * @iph:		Pointer to IPv4 header
- * @bp:			Pointer to TCP header followed by TCP payload
- * @dlen:		TCP payload length
+ * @th:			Pointer to TCP header
+ * @payload:		TCP payload
  * @check:		Checksum, if already known
  * @seq:		Sequence number for this segment
  * @no_tcp_csum:	Do not set TCP checksum
  */
 void tcp_fill_headers4(const struct tcp_tap_conn *conn,
 		       struct tap_hdr *taph, struct iphdr *iph,
-		       struct tcp_payload_t *bp, size_t dlen,
+		       struct tcphdr *th, struct iov_tail *payload,
 		       const uint16_t *check, uint32_t seq, bool no_tcp_csum)
 {
 	const struct flowside *tapside = TAPFLOW(conn);
 	const struct in_addr *src4 = inany_v4(&tapside->oaddr);
 	const struct in_addr *dst4 = inany_v4(&tapside->eaddr);
-	size_t l4len = dlen + sizeof(bp->th);
+	size_t l4len = iov_tail_size(payload) + sizeof(*th);
 	size_t l3len = l4len + sizeof(*iph);
 
 	ASSERT(src4 && dst4);
@@ -935,19 +935,12 @@ void tcp_fill_headers4(const struct tcp_tap_conn *conn,
 	iph->check = check ? *check :
 			     csum_ip4_header(l3len, IPPROTO_TCP, *src4, *dst4);
 
-	tcp_fill_header(&bp->th, conn, seq);
+	tcp_fill_header(th, conn, seq);
 
-	if (no_tcp_csum) {
-		bp->th.check = 0;
-	} else {
-		const struct iovec iov = {
-			.iov_base = bp->data,
-			.iov_len = dlen,
-		};
-		struct iov_tail payload = IOV_TAIL(&iov, 1, 0);
-
-		tcp_update_check_tcp4(iph, &bp->th, &payload);
-	}
+	if (no_tcp_csum)
+		th->check = 0;
+	else
+		tcp_update_check_tcp4(iph, th, payload);
 
 	tap_hdr_update(taph, l3len + sizeof(struct ethhdr));
 }
@@ -957,19 +950,19 @@ void tcp_fill_headers4(const struct tcp_tap_conn *conn,
  * @conn:		Connection pointer
  * @taph:		tap backend specific header
  * @ip6h:		Pointer to IPv6 header
- * @bp:			Pointer to TCP header followed by TCP payload
- * @dlen:		TCP payload length
+ * @th:			Pointer to TCP header
+ * @payload:		TCP payload
  * @check:		Checksum, if already known
  * @seq:		Sequence number for this segment
  * @no_tcp_csum:	Do not set TCP checksum
  */
 void tcp_fill_headers6(const struct tcp_tap_conn *conn,
 		       struct tap_hdr *taph, struct ipv6hdr *ip6h,
-		       struct tcp_payload_t *bp, size_t dlen,
+		       struct tcphdr *th, struct iov_tail *payload,
 		       uint32_t seq, bool no_tcp_csum)
 {
+	size_t l4len = iov_tail_size(payload) + sizeof(*th);
 	const struct flowside *tapside = TAPFLOW(conn);
-	size_t l4len = dlen + sizeof(bp->th);
 
 	ip6h->payload_len = htons(l4len);
 	ip6h->saddr = tapside->oaddr.a6;
@@ -983,19 +976,12 @@ void tcp_fill_headers6(const struct tcp_tap_conn *conn,
 	ip6h->flow_lbl[1] = (conn->sock >> 8) & 0xff;
 	ip6h->flow_lbl[2] = (conn->sock >> 0) & 0xff;
 
-	tcp_fill_header(&bp->th, conn, seq);
+	tcp_fill_header(th, conn, seq);
 
-	if (no_tcp_csum) {
-		bp->th.check = 0;
-	} else {
-		const struct iovec iov = {
-			.iov_base = bp->data,
-			.iov_len = dlen,
-		};
-		struct iov_tail payload = IOV_TAIL(&iov, 1, 0);
-
-		tcp_update_check_tcp6(ip6h, &bp->th, &payload);
-	}
+	if (no_tcp_csum)
+		th->check = 0;
+	else
+		tcp_update_check_tcp6(ip6h, th, payload);
 
 	tap_hdr_update(taph, l4len + sizeof(*ip6h) + sizeof(struct ethhdr));
 }
diff --git a/tcp_buf.c b/tcp_buf.c
index 0946cd5..830c23d 100644
--- a/tcp_buf.c
+++ b/tcp_buf.c
@@ -151,29 +151,27 @@ void tcp_payload_flush(const struct ctx *c)
  * tcp_buf_fill_headers() - Fill 802.3, IP, TCP headers in pre-cooked buffers
  * @conn:	Connection pointer
  * @iov:	Pointer to an array of iovec of TCP pre-cooked buffers
- * @dlen:	TCP payload length
  * @check:	Checksum, if already known
  * @seq:	Sequence number for this segment
  * @no_tcp_csum: Do not set TCP checksum
  */
 static void tcp_l2_buf_fill_headers(const struct tcp_tap_conn *conn,
-				    struct iovec *iov, size_t dlen,
-				    const uint16_t *check, uint32_t seq,
-				    bool no_tcp_csum)
+				    struct iovec *iov, const uint16_t *check,
+				    uint32_t seq, bool no_tcp_csum)
 {
+	struct iov_tail tail = IOV_TAIL(&iov[TCP_IOV_PAYLOAD], 1, 0);
+	struct tcphdr *th = IOV_REMOVE_HEADER(&tail, struct tcphdr);
 	const struct flowside *tapside = TAPFLOW(conn);
 	const struct in_addr *a4 = inany_v4(&tapside->oaddr);
 
 	if (a4) {
 		tcp_fill_headers4(conn, iov[TCP_IOV_TAP].iov_base,
-				  iov[TCP_IOV_IP].iov_base,
-				  iov[TCP_IOV_PAYLOAD].iov_base, dlen,
-				  check, seq, no_tcp_csum);
+				  iov[TCP_IOV_IP].iov_base, th,
+				  &tail,  check, seq, no_tcp_csum);
 	} else {
 		tcp_fill_headers6(conn, iov[TCP_IOV_TAP].iov_base,
-				  iov[TCP_IOV_IP].iov_base,
-				  iov[TCP_IOV_PAYLOAD].iov_base, dlen,
-				  seq, no_tcp_csum);
+				  iov[TCP_IOV_IP].iov_base, th,
+				  &tail, seq, no_tcp_csum);
 	}
 }
 
@@ -213,7 +211,7 @@ int tcp_buf_send_flag(const struct ctx *c, struct tcp_tap_conn *conn, int flags)
 	tcp_payload_used++;
 	l4len = optlen + sizeof(struct tcphdr);
 	iov[TCP_IOV_PAYLOAD].iov_len = l4len;
-	tcp_l2_buf_fill_headers(conn, iov, optlen, NULL, seq, false);
+	tcp_l2_buf_fill_headers(conn, iov, NULL, seq, false);
 
 	if (flags & DUP_ACK) {
 		struct iovec *dup_iov = tcp_l2_iov[tcp_payload_used++];
@@ -270,7 +268,7 @@ static void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn,
 	payload->th.th_flags = 0;
 	payload->th.ack = 1;
 	iov[TCP_IOV_PAYLOAD].iov_len = dlen + sizeof(struct tcphdr);
-	tcp_l2_buf_fill_headers(conn, iov, dlen, check, seq, false);
+	tcp_l2_buf_fill_headers(conn, iov, check, seq, false);
 	if (++tcp_payload_used > TCP_FRAMES_MEM - 1)
 		tcp_payload_flush(c);
 }
diff --git a/tcp_internal.h b/tcp_internal.h
index 744c5c0..9732b5b 100644
--- a/tcp_internal.h
+++ b/tcp_internal.h
@@ -168,11 +168,11 @@ void tcp_update_check_tcp6(const struct ipv6hdr *ip6h, struct tcphdr *th,
 			   struct iov_tail *payload);
 void tcp_fill_headers4(const struct tcp_tap_conn *conn,
 		       struct tap_hdr *taph, struct iphdr *iph,
-		       struct tcp_payload_t *bp, size_t dlen,
+		       struct tcphdr *th, struct iov_tail *payload,
 		       const uint16_t *check, uint32_t seq, bool no_tcp_csum);
 void tcp_fill_headers6(const struct tcp_tap_conn *conn,
 		       struct tap_hdr *taph, struct ipv6hdr *ip6h,
-		       struct tcp_payload_t *bp, size_t dlen,
+		       struct tcphdr *th, struct iov_tail *payload,
 		       uint32_t seq, bool no_tcp_csum);
 
 int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_tap_conn *conn,
diff --git a/tcp_vu.c b/tcp_vu.c
index 134650e..470649e 100644
--- a/tcp_vu.c
+++ b/tcp_vu.c
@@ -104,10 +104,12 @@ int tcp_vu_send_flag(const struct ctx *c, struct tcp_tap_conn *conn, int flags)
 	const struct flowside *tapside = TAPFLOW(conn);
 	size_t optlen, hdrlen;
 	struct vu_virtq_element flags_elem[2];
-	struct tcp_payload_t *payload;
 	struct ipv6hdr *ip6h = NULL;
 	struct iovec flags_iov[2];
+	struct tcp_syn_opts *opts;
 	struct iphdr *iph = NULL;
+	struct iov_tail payload;
+	struct tcphdr *th;
 	struct ethhdr *eh;
 	uint32_t seq;
 	int elem_cnt;
@@ -139,35 +141,35 @@ int tcp_vu_send_flag(const struct ctx *c, struct tcp_tap_conn *conn, int flags)
 		iph = vu_ip(flags_elem[0].in_sg[0].iov_base);
 		*iph = (struct iphdr)L2_BUF_IP4_INIT(IPPROTO_TCP);
 
-		payload = vu_payloadv4(flags_elem[0].in_sg[0].iov_base);
+		th = vu_payloadv4(flags_elem[0].in_sg[0].iov_base);
 	} else {
 		eh->h_proto = htons(ETH_P_IPV6);
 
 		ip6h = vu_ip(flags_elem[0].in_sg[0].iov_base);
 		*ip6h = (struct ipv6hdr)L2_BUF_IP6_INIT(IPPROTO_TCP);
-		payload = vu_payloadv6(flags_elem[0].in_sg[0].iov_base);
+		th = vu_payloadv6(flags_elem[0].in_sg[0].iov_base);
 	}
 
-	memset(&payload->th, 0, sizeof(payload->th));
-	payload->th.doff = offsetof(struct tcp_payload_t, data) / 4;
-	payload->th.ack = 1;
+	memset(th, 0, sizeof(*th));
+	th->doff = sizeof(*th) / 4;
+	th->ack = 1;
 
 	seq = conn->seq_to_tap;
-	ret = tcp_prepare_flags(c, conn, flags, &payload->th,
-				(struct tcp_syn_opts *)payload->data,
-				&optlen);
+	opts = (struct tcp_syn_opts *)(th + 1);
+	ret = tcp_prepare_flags(c, conn, flags, th, opts, &optlen);
 	if (ret <= 0) {
 		vu_queue_rewind(vq, 1);
 		return ret;
 	}
 
 	flags_elem[0].in_sg[0].iov_len = hdrlen + optlen;
+	payload = IOV_TAIL(flags_elem[0].in_sg, 1, hdrlen);
 
 	if (CONN_V4(conn)) {
-		tcp_fill_headers4(conn, NULL, iph, payload, optlen, NULL, seq,
-				  true);
+		tcp_fill_headers4(conn, NULL, iph, th, &payload,
+				  NULL, seq, true);
 	} else {
-		tcp_fill_headers6(conn, NULL, ip6h, payload, optlen, seq, true);
+		tcp_fill_headers6(conn, NULL, ip6h, th, &payload, seq, true);
 	}
 
 	if (*c->pcap) {
@@ -317,23 +319,28 @@ static ssize_t tcp_vu_sock_recv(const struct ctx *c,
  * tcp_vu_prepare() - Prepare the frame header
  * @c:		Execution context
  * @conn:	Connection pointer
- * @first:	Pointer to the array of IO vectors
- * @dlen:	Packet data length
+ * @iov:	Pointer to the array of IO vectors
+ * @iov_cnt:	Number of entries in @iov
  * @check:	Checksum, if already known
  */
-static void tcp_vu_prepare(const struct ctx *c,
-			   struct tcp_tap_conn *conn, char *base,
-			   size_t dlen, const uint16_t **check)
+static void tcp_vu_prepare(const struct ctx *c, struct tcp_tap_conn *conn,
+			   struct iovec *iov, size_t iov_cnt,
+			   const uint16_t **check)
 {
 	const struct flowside *toside = TAPFLOW(conn);
-	struct tcp_payload_t *payload;
+	bool v6 = !(inany_v4(&toside->eaddr) && inany_v4(&toside->oaddr));
+	size_t hdrlen = tcp_vu_hdrlen(v6);
+	struct iov_tail payload = IOV_TAIL(iov, iov_cnt, hdrlen);
+	char *base = iov[0].iov_base;
 	struct ipv6hdr *ip6h = NULL;
 	struct iphdr *iph = NULL;
+	struct tcphdr *th;
 	struct ethhdr *eh;
 
 	/* we guess the first iovec provided by the guest can embed
 	 * all the headers needed by L2 frame
 	 */
+	ASSERT(iov[0].iov_len >= hdrlen);
 
 	eh = vu_eth(base);
 
@@ -342,31 +349,31 @@ static void tcp_vu_prepare(const struct ctx *c,
 
 	/* initialize header */
 
-	if (inany_v4(&toside->eaddr) && inany_v4(&toside->oaddr)) {
+	if (!v6) {
 		eh->h_proto = htons(ETH_P_IP);
 
 		iph = vu_ip(base);
 		*iph = (struct iphdr)L2_BUF_IP4_INIT(IPPROTO_TCP);
-		payload = vu_payloadv4(base);
+		th = vu_payloadv4(base);
 	} else {
 		eh->h_proto = htons(ETH_P_IPV6);
 
 		ip6h = vu_ip(base);
 		*ip6h = (struct ipv6hdr)L2_BUF_IP6_INIT(IPPROTO_TCP);
 
-		payload = vu_payloadv6(base);
+		th = vu_payloadv6(base);
 	}
 
-	memset(&payload->th, 0, sizeof(payload->th));
-	payload->th.doff = offsetof(struct tcp_payload_t, data) / 4;
-	payload->th.ack = 1;
+	memset(th, 0, sizeof(*th));
+	th->doff = sizeof(*th) / 4;
+	th->ack = 1;
 
-	if (inany_v4(&toside->eaddr) && inany_v4(&toside->oaddr)) {
-		tcp_fill_headers4(conn, NULL, iph, payload, dlen,
+	if (!v6) {
+		tcp_fill_headers4(conn, NULL, iph, th, &payload,
 				  *check, conn->seq_to_tap, true);
 		*check = &iph->check;
 	} else {
-		tcp_fill_headers6(conn, NULL, ip6h, payload, dlen,
+		tcp_fill_headers6(conn, NULL, ip6h, th, &payload,
 				  conn->seq_to_tap, true);
 	}
 }
@@ -478,7 +485,7 @@ int tcp_vu_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn)
 		if (i + 1 == head_cnt)
 			check = NULL;
 
-		tcp_vu_prepare(c, conn, iov->iov_base, dlen, &check);
+		tcp_vu_prepare(c, conn, iov, buf_cnt, &check);
 
 		if (*c->pcap) {
 			tcp_vu_update_check(tapside, iov, buf_cnt);

From 2abf5ab7f3734eae9377cfab4759ae83fabf3a7e Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Wed, 27 Nov 2024 14:54:08 +1100
Subject: [PATCH 149/382] tcp: Merge tcp_update_check_tcp[46]()

The only reason we need separate functions for the IPv4 and IPv6 case is
to calculate the checksum of the IP pseudo-header, which is different for
the two cases.  However, the caller already knows which path it's on and
can access the values needed for the pseudo-header partial sum more easily
than tcp_update_check_tcp[46]() can.

So, merge these functions into a single tcp_update_csum() function that
just takes the pseudo-header partial sum, calculated in the caller.
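
For illustration, the IPv4 path now reads like the fragment below
(variables as in tcp_fill_headers4(), so this is not self-contained): the
caller computes the pseudo-header partial sum and the merged function does
the rest:

	uint32_t psum = proto_ipv4_header_psum(l4len, IPPROTO_TCP,
					       *src4, *dst4);

	tcp_update_csum(psum, th, payload);	/* zeroes and refills th->check */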

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 tcp.c          | 59 +++++++++++++++++---------------------------------
 tcp_internal.h |  6 ++---
 tcp_vu.c       | 22 ++++++++++++-------
 3 files changed, 36 insertions(+), 51 deletions(-)

diff --git a/tcp.c b/tcp.c
index 2f900fc..482e460 100644
--- a/tcp.c
+++ b/tcp.c
@@ -753,44 +753,16 @@ static void tcp_sock_set_bufsize(const struct ctx *c, int s)
 }
 
 /**
- * tcp_update_check_tcp4() - Calculate TCP checksum for IPv4
- * @iph:	IPv4 header
+ * tcp_update_csum() - Calculate TCP checksum
+ * @psum:	Unfolded partial checksum of the IPv4 or IPv6 pseudo-header
  * @th:		TCP header (updated)
  * @payload:	TCP payload
  */
-void tcp_update_check_tcp4(const struct iphdr *iph, struct tcphdr *th,
-			   struct iov_tail *payload)
+void tcp_update_csum(uint32_t psum, struct tcphdr *th, struct iov_tail *payload)
 {
-	uint16_t l4len = ntohs(iph->tot_len) - sizeof(struct iphdr);
-	struct in_addr saddr = { .s_addr = iph->saddr };
-	struct in_addr daddr = { .s_addr = iph->daddr };
-	uint32_t sum;
-
-	sum = proto_ipv4_header_psum(l4len, IPPROTO_TCP, saddr, daddr);
-
 	th->check = 0;
-	sum = csum_unfolded(th, sizeof(*th), sum);
-	th->check = csum_iov_tail(payload, sum);
-}
-
-/**
- * tcp_update_check_tcp6() - Calculate TCP checksum for IPv6
- * @ip6h:	IPv6 header
- * @th:		TCP header (updated)
- * @payload:	TCP payload
- */
-void tcp_update_check_tcp6(const struct ipv6hdr *ip6h, struct tcphdr *th,
-			   struct iov_tail *payload)
-{
-	uint16_t l4len = ntohs(ip6h->payload_len);
-	uint32_t sum;
-
-	sum = proto_ipv6_header_psum(l4len, IPPROTO_TCP, &ip6h->saddr,
-				     &ip6h->daddr);
-
-	th->check = 0;
-	sum = csum_unfolded(th, sizeof(*th), sum);
-	th->check = csum_iov_tail(payload, sum);
+	psum = csum_unfolded(th, sizeof(*th), psum);
+	th->check = csum_iov_tail(payload, psum);
 }
 
 /**
@@ -937,10 +909,14 @@ void tcp_fill_headers4(const struct tcp_tap_conn *conn,
 
 	tcp_fill_header(th, conn, seq);
 
-	if (no_tcp_csum)
+	if (no_tcp_csum) {
 		th->check = 0;
-	else
-		tcp_update_check_tcp4(iph, th, payload);
+	} else {
+		uint32_t psum = proto_ipv4_header_psum(l4len, IPPROTO_TCP,
+						       *src4, *dst4);
+
+		tcp_update_csum(psum, th, payload);
+	}
 
 	tap_hdr_update(taph, l3len + sizeof(struct ethhdr));
 }
@@ -978,10 +954,15 @@ void tcp_fill_headers6(const struct tcp_tap_conn *conn,
 
 	tcp_fill_header(th, conn, seq);
 
-	if (no_tcp_csum)
+	if (no_tcp_csum) {
 		th->check = 0;
-	else
-		tcp_update_check_tcp6(ip6h, th, payload);
+	} else {
+		uint32_t psum = proto_ipv6_header_psum(l4len, IPPROTO_TCP,
+						       &ip6h->saddr,
+						       &ip6h->daddr);
+
+		tcp_update_csum(psum, th, payload);
+	}
 
 	tap_hdr_update(taph, l4len + sizeof(*ip6h) + sizeof(struct ethhdr));
 }
diff --git a/tcp_internal.h b/tcp_internal.h
index 9732b5b..cff06e0 100644
--- a/tcp_internal.h
+++ b/tcp_internal.h
@@ -162,10 +162,8 @@ void tcp_rst_do(const struct ctx *c, struct tcp_tap_conn *conn);
 
 struct tcp_info_linux;
 
-void tcp_update_check_tcp4(const struct iphdr *iph, struct tcphdr *th,
-			   struct iov_tail *payload);
-void tcp_update_check_tcp6(const struct ipv6hdr *ip6h, struct tcphdr *th,
-			   struct iov_tail *payload);
+void tcp_update_csum(uint32_t psum, struct tcphdr *th,
+		     struct iov_tail *payload);
 void tcp_fill_headers4(const struct tcp_tap_conn *conn,
 		       struct tap_hdr *taph, struct iphdr *iph,
 		       struct tcphdr *th, struct iov_tail *payload,
diff --git a/tcp_vu.c b/tcp_vu.c
index 470649e..a3d2e7d 100644
--- a/tcp_vu.c
+++ b/tcp_vu.c
@@ -71,22 +71,28 @@ static void tcp_vu_update_check(const struct flowside *tapside,
 			        struct iovec *iov, int iov_cnt)
 {
 	char *base = iov[0].iov_base;
+	struct iov_tail payload;
+	struct tcphdr *th;
+	uint32_t psum;
 
 	if (inany_v4(&tapside->oaddr)) {
-		struct tcphdr *th = vu_payloadv4(base);
+		const struct in_addr *src4 = inany_v4(&tapside->oaddr);
+		const struct in_addr *dst4 = inany_v4(&tapside->eaddr);
 		const struct iphdr *iph = vu_ip(base);
-		struct iov_tail payload = IOV_TAIL(iov, iov_cnt,
-						   (char *)(th + 1) - base);
+		size_t l4len = ntohs(iph->tot_len) - sizeof(*iph);
 
-		tcp_update_check_tcp4(iph, th, &payload);
+		th = vu_payloadv4(base);
+		psum = proto_ipv4_header_psum(l4len, IPPROTO_TCP, *src4, *dst4);
 	} else {
-		struct tcphdr *th = vu_payloadv6(base);
 		const struct ipv6hdr *ip6h = vu_ip(base);
-		struct iov_tail payload = IOV_TAIL(iov, iov_cnt,
-						   (char *)(th + 1) - base);
+		size_t l4len = ntohs(ip6h->payload_len);
 
-		tcp_update_check_tcp6(ip6h, th, &payload);
+		th = vu_payloadv6(base);
+		psum = proto_ipv6_header_psum(l4len, IPPROTO_TCP,
+					      &ip6h->saddr, &ip6h->daddr);
 	}
+	payload = IOV_TAIL(iov, iov_cnt, (char *)(th + 1) - base);
+	tcp_update_csum(psum, th, &payload);
 }
 
 /**

From a6348cad51398346b1ce1009be87a718b8f72bba Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Wed, 27 Nov 2024 14:54:09 +1100
Subject: [PATCH 150/382] tcp: Merge tcp_fill_headers[46]() with each other

We have different versions of this function for IPv4 and IPv6, but the
caller already requires some IP version specific code to get the right
header pointers.  Instead, have a common function that fills either an
IPv4 or an IPv6 header based on which header pointer it is passed.  This
allows us to remove a small amount of code duplication, at the cost of a
few slightly ugly conditionals in the merged function.
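
For illustration, a fragment mirroring the tcp_buf.c caller below ('a4',
'iov', 'taph', 'th', 'tail' and the rest are assumed from that context):
exactly one of the two IP header pointers is set, and tcp_fill_headers()
fills whichever it is given:

	struct ipv6hdr *ip6h = NULL;
	struct iphdr *ip4h = NULL;

	if (a4)
		ip4h = iov[TCP_IOV_IP].iov_base;
	else
		ip6h = iov[TCP_IOV_IP].iov_base;

	tcp_fill_headers(conn, taph, ip4h, ip6h, th, &tail,
			 check, seq, no_tcp_csum);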

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 tcp.c          | 124 ++++++++++++++++++++++---------------------------
 tcp_buf.c      |  19 ++++----
 tcp_internal.h |  13 ++----
 tcp_vu.c       |  32 +++++--------
 4 files changed, 83 insertions(+), 105 deletions(-)

diff --git a/tcp.c b/tcp.c
index 482e460..1872ccb 100644
--- a/tcp.c
+++ b/tcp.c
@@ -877,96 +877,84 @@ static void tcp_fill_header(struct tcphdr *th,
 }
 
 /**
- * tcp_fill_headers4() - Fill 802.3, IPv4, TCP headers in pre-cooked buffers
+ * tcp_fill_headers() - Fill 802.3, IP, TCP headers
  * @conn:		Connection pointer
  * @taph:		tap backend specific header
- * @iph:		Pointer to IPv4 header
+ * @ip4h:		Pointer to IPv4 header, or NULL
+ * @ip6h:		Pointer to IPv6 header, or NULL
  * @th:			Pointer to TCP header
  * @payload:		TCP payload
- * @check:		Checksum, if already known
+ * @ip4_check:		IPv4 checksum, if already known
  * @seq:		Sequence number for this segment
  * @no_tcp_csum:	Do not set TCP checksum
  */
-void tcp_fill_headers4(const struct tcp_tap_conn *conn,
-		       struct tap_hdr *taph, struct iphdr *iph,
-		       struct tcphdr *th, struct iov_tail *payload,
-		       const uint16_t *check, uint32_t seq, bool no_tcp_csum)
+void tcp_fill_headers(const struct tcp_tap_conn *conn,
+		      struct tap_hdr *taph,
+		      struct iphdr *ip4h, struct ipv6hdr *ip6h,
+		      struct tcphdr *th, struct iov_tail *payload,
+		      const uint16_t *ip4_check, uint32_t seq, bool no_tcp_csum)
 {
 	const struct flowside *tapside = TAPFLOW(conn);
-	const struct in_addr *src4 = inany_v4(&tapside->oaddr);
-	const struct in_addr *dst4 = inany_v4(&tapside->eaddr);
 	size_t l4len = iov_tail_size(payload) + sizeof(*th);
-	size_t l3len = l4len + sizeof(*iph);
+	size_t l3len = l4len;
+	uint32_t psum = 0;
 
-	ASSERT(src4 && dst4);
+	if (ip4h) {
+		const struct in_addr *src4 = inany_v4(&tapside->oaddr);
+		const struct in_addr *dst4 = inany_v4(&tapside->eaddr);
 
-	iph->tot_len = htons(l3len);
-	iph->saddr = src4->s_addr;
-	iph->daddr = dst4->s_addr;
+		ASSERT(src4 && dst4);
 
-	iph->check = check ? *check :
-			     csum_ip4_header(l3len, IPPROTO_TCP, *src4, *dst4);
+		l3len += sizeof(*ip4h);
+
+		ip4h->tot_len = htons(l3len);
+		ip4h->saddr = src4->s_addr;
+		ip4h->daddr = dst4->s_addr;
+
+		if (ip4_check)
+			ip4h->check = *ip4_check;
+		else
+			ip4h->check = csum_ip4_header(l3len, IPPROTO_TCP,
+						      *src4, *dst4);
+
+		if (!no_tcp_csum) {
+			psum = proto_ipv4_header_psum(l4len, IPPROTO_TCP,
+						      *src4, *dst4);
+		}
+	}
+
+	if (ip6h) {
+		l3len += sizeof(*ip6h);
+
+		ip6h->payload_len = htons(l4len);
+		ip6h->saddr = tapside->oaddr.a6;
+		ip6h->daddr = tapside->eaddr.a6;
+
+		ip6h->hop_limit = 255;
+		ip6h->version = 6;
+		ip6h->nexthdr = IPPROTO_TCP;
+
+		ip6h->flow_lbl[0] = (conn->sock >> 16) & 0xf;
+		ip6h->flow_lbl[1] = (conn->sock >> 8) & 0xff;
+		ip6h->flow_lbl[2] = (conn->sock >> 0) & 0xff;
+
+		if (!no_tcp_csum) {
+			psum = proto_ipv6_header_psum(l4len, IPPROTO_TCP,
+						      &ip6h->saddr,
+						      &ip6h->daddr);
+		}
+	}
 
 	tcp_fill_header(th, conn, seq);
 
-	if (no_tcp_csum) {
+	if (no_tcp_csum)
 		th->check = 0;
-	} else {
-		uint32_t psum = proto_ipv4_header_psum(l4len, IPPROTO_TCP,
-						       *src4, *dst4);
-
+	else
 		tcp_update_csum(psum, th, payload);
-	}
 
 	tap_hdr_update(taph, l3len + sizeof(struct ethhdr));
 }
 
-/**
- * tcp_fill_headers6() - Fill 802.3, IPv6, TCP headers in pre-cooked buffers
- * @conn:		Connection pointer
- * @taph:		tap backend specific header
- * @ip6h:		Pointer to IPv6 header
- * @th:			Pointer to TCP header
- * @payload:		TCP payload
- * @check:		Checksum, if already known
- * @seq:		Sequence number for this segment
- * @no_tcp_csum:	Do not set TCP checksum
- */
-void tcp_fill_headers6(const struct tcp_tap_conn *conn,
-		       struct tap_hdr *taph, struct ipv6hdr *ip6h,
-		       struct tcphdr *th, struct iov_tail *payload,
-		       uint32_t seq, bool no_tcp_csum)
-{
-	size_t l4len = iov_tail_size(payload) + sizeof(*th);
-	const struct flowside *tapside = TAPFLOW(conn);
-
-	ip6h->payload_len = htons(l4len);
-	ip6h->saddr = tapside->oaddr.a6;
-	ip6h->daddr = tapside->eaddr.a6;
-
-	ip6h->hop_limit = 255;
-	ip6h->version = 6;
-	ip6h->nexthdr = IPPROTO_TCP;
-
-	ip6h->flow_lbl[0] = (conn->sock >> 16) & 0xf;
-	ip6h->flow_lbl[1] = (conn->sock >> 8) & 0xff;
-	ip6h->flow_lbl[2] = (conn->sock >> 0) & 0xff;
-
-	tcp_fill_header(th, conn, seq);
-
-	if (no_tcp_csum) {
-		th->check = 0;
-	} else {
-		uint32_t psum = proto_ipv6_header_psum(l4len, IPPROTO_TCP,
-						       &ip6h->saddr,
-						       &ip6h->daddr);
-
-		tcp_update_csum(psum, th, payload);
-	}
-
-	tap_hdr_update(taph, l4len + sizeof(*ip6h) + sizeof(struct ethhdr));
-}
-
 /**
  * tcp_update_seqack_wnd() - Update ACK sequence and window to guest/tap
  * @c:		Execution context
diff --git a/tcp_buf.c b/tcp_buf.c
index 830c23d..a975a55 100644
--- a/tcp_buf.c
+++ b/tcp_buf.c
@@ -161,18 +161,19 @@ static void tcp_l2_buf_fill_headers(const struct tcp_tap_conn *conn,
 {
 	struct iov_tail tail = IOV_TAIL(&iov[TCP_IOV_PAYLOAD], 1, 0);
 	struct tcphdr *th = IOV_REMOVE_HEADER(&tail, struct tcphdr);
+	struct tap_hdr *taph = iov[TCP_IOV_TAP].iov_base;
 	const struct flowside *tapside = TAPFLOW(conn);
 	const struct in_addr *a4 = inany_v4(&tapside->oaddr);
+	struct ipv6hdr *ip6h = NULL;
+	struct iphdr *ip4h = NULL;
 
-	if (a4) {
-		tcp_fill_headers4(conn, iov[TCP_IOV_TAP].iov_base,
-				  iov[TCP_IOV_IP].iov_base, th,
-				  &tail,  check, seq, no_tcp_csum);
-	} else {
-		tcp_fill_headers6(conn, iov[TCP_IOV_TAP].iov_base,
-				  iov[TCP_IOV_IP].iov_base, th,
-				  &tail, seq, no_tcp_csum);
-	}
+	if (a4)
+		ip4h = iov[TCP_IOV_IP].iov_base;
+	else
+		ip6h = iov[TCP_IOV_IP].iov_base;
+
+	tcp_fill_headers(conn, taph, ip4h, ip6h, th, &tail,
+			 check, seq, no_tcp_csum);
 }
 
 /**
diff --git a/tcp_internal.h b/tcp_internal.h
index cff06e0..94e5780 100644
--- a/tcp_internal.h
+++ b/tcp_internal.h
@@ -164,14 +164,11 @@ struct tcp_info_linux;
 
 void tcp_update_csum(uint32_t psum, struct tcphdr *th,
 		     struct iov_tail *payload);
-void tcp_fill_headers4(const struct tcp_tap_conn *conn,
-		       struct tap_hdr *taph, struct iphdr *iph,
-		       struct tcphdr *th, struct iov_tail *payload,
-		       const uint16_t *check, uint32_t seq, bool no_tcp_csum);
-void tcp_fill_headers6(const struct tcp_tap_conn *conn,
-		       struct tap_hdr *taph, struct ipv6hdr *ip6h,
-		       struct tcphdr *th, struct iov_tail *payload,
-		       uint32_t seq, bool no_tcp_csum);
+void tcp_fill_headers(const struct tcp_tap_conn *conn,
+		      struct tap_hdr *taph,
+		      struct iphdr *ip4h, struct ipv6hdr *ip6h,
+		      struct tcphdr *th, struct iov_tail *payload,
+		      const uint16_t *ip4_check, uint32_t seq, bool no_tcp_csum);
 
 int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_tap_conn *conn,
 			  bool force_seq, struct tcp_info_linux *tinfo);
diff --git a/tcp_vu.c b/tcp_vu.c
index a3d2e7d..db2c64d 100644
--- a/tcp_vu.c
+++ b/tcp_vu.c
@@ -111,9 +111,9 @@ int tcp_vu_send_flag(const struct ctx *c, struct tcp_tap_conn *conn, int flags)
 	size_t optlen, hdrlen;
 	struct vu_virtq_element flags_elem[2];
 	struct ipv6hdr *ip6h = NULL;
+	struct iphdr *ip4h = NULL;
 	struct iovec flags_iov[2];
 	struct tcp_syn_opts *opts;
-	struct iphdr *iph = NULL;
 	struct iov_tail payload;
 	struct tcphdr *th;
 	struct ethhdr *eh;
@@ -144,8 +144,8 @@ int tcp_vu_send_flag(const struct ctx *c, struct tcp_tap_conn *conn, int flags)
 	if (CONN_V4(conn)) {
 		eh->h_proto = htons(ETH_P_IP);
 
-		iph = vu_ip(flags_elem[0].in_sg[0].iov_base);
-		*iph = (struct iphdr)L2_BUF_IP4_INIT(IPPROTO_TCP);
+		ip4h = vu_ip(flags_elem[0].in_sg[0].iov_base);
+		*ip4h = (struct iphdr)L2_BUF_IP4_INIT(IPPROTO_TCP);
 
 		th = vu_payloadv4(flags_elem[0].in_sg[0].iov_base);
 	} else {
@@ -171,12 +171,8 @@ int tcp_vu_send_flag(const struct ctx *c, struct tcp_tap_conn *conn, int flags)
 	flags_elem[0].in_sg[0].iov_len = hdrlen + optlen;
 	payload = IOV_TAIL(flags_elem[0].in_sg, 1, hdrlen);
 
-	if (CONN_V4(conn)) {
-		tcp_fill_headers4(conn, NULL, iph, th, &payload,
-				  NULL, seq, true);
-	} else {
-		tcp_fill_headers6(conn, NULL, ip6h, th, &payload, seq, true);
-	}
+	tcp_fill_headers(conn, NULL, ip4h, ip6h, th, &payload,
+			 NULL, seq, true);
 
 	if (*c->pcap) {
 		tcp_vu_update_check(tapside, &flags_elem[0].in_sg[0], 1);
@@ -339,7 +335,7 @@ static void tcp_vu_prepare(const struct ctx *c, struct tcp_tap_conn *conn,
 	struct iov_tail payload = IOV_TAIL(iov, iov_cnt, hdrlen);
 	char *base = iov[0].iov_base;
 	struct ipv6hdr *ip6h = NULL;
-	struct iphdr *iph = NULL;
+	struct iphdr *ip4h = NULL;
 	struct tcphdr *th;
 	struct ethhdr *eh;
 
@@ -358,8 +354,8 @@ static void tcp_vu_prepare(const struct ctx *c, struct tcp_tap_conn *conn,
 	if (!v6) {
 		eh->h_proto = htons(ETH_P_IP);
 
-		iph = vu_ip(base);
-		*iph = (struct iphdr)L2_BUF_IP4_INIT(IPPROTO_TCP);
+		ip4h = vu_ip(base);
+		*ip4h = (struct iphdr)L2_BUF_IP4_INIT(IPPROTO_TCP);
 		th = vu_payloadv4(base);
 	} else {
 		eh->h_proto = htons(ETH_P_IPV6);
@@ -374,14 +370,10 @@ static void tcp_vu_prepare(const struct ctx *c, struct tcp_tap_conn *conn,
 	th->doff = sizeof(*th) / 4;
 	th->ack = 1;
 
-	if (!v6) {
-		tcp_fill_headers4(conn, NULL, iph, th, &payload,
-				  *check, conn->seq_to_tap, true);
-		*check = &iph->check;
-	} else {
-		tcp_fill_headers6(conn, NULL, ip6h, th, &payload,
-				  conn->seq_to_tap, true);
-	}
+	tcp_fill_headers(conn, NULL, ip4h, ip6h, th, &payload,
+			 *check, conn->seq_to_tap, true);
+	if (ip4h)
+		*check = &ip4h->check;
 }
 
 /**

From b6e79efa0b0c8ab6327f5184f81c5b3ab8af4ff8 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Wed, 27 Nov 2024 14:54:10 +1100
Subject: [PATCH 151/382] tcp_vu: Remove unnecessary tcp_vu_update_check()
 function

Because the vhost-user <-> virtio-net path ignores checksums, we usually
don't calculate them when sending packets to the guest.  So, we always
pass no_tcp_csum=true to tcp_fill_headers().  We do want accurate
checksums when capturing packets though, so the captures don't show bogus
values.

Currently we handle this by updating the checksum field immediately before
writing the packet to the capture file, using tcp_vu_update_check().  This
is unnecessary, though: in each case tcp_fill_headers() is called not very
long before, so we can alter its no_tcp_csum parameter based on whether
we're generating captures or not.
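
As a minimal standalone sketch of the idea, with toy stand-ins for
tcp_fill_headers() and pcap_iov() (nothing below is passt code):

  #include <stdbool.h>
  #include <stdio.h>

  /* Stand-in for tcp_fill_headers(): the parameter mirrors no_tcp_csum */
  static void fill_headers(bool no_csum)
  {
  	printf("headers filled, TCP checksum %s\n",
  	       no_csum ? "left at 0" : "set");
  }

  /* Stand-in for pcap_iov(): write the frame to the capture file */
  static void capture(void)
  {
  	printf("frame written to capture\n");
  }

  int main(void)
  {
  	const char *pcap_path = "";	/* empty string: no capture configured */
  	bool capturing = *pcap_path;	/* mirrors the *c->pcap check */

  	/* Compute the checksum only when a capture will actually be
  	 * written, instead of always skipping it and patching it up later.
  	 */
  	fill_headers(!capturing);
  	if (capturing)
  		capture();

  	return 0;
  }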

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 tcp_vu.c | 59 +++++++++++---------------------------------------------
 1 file changed, 11 insertions(+), 48 deletions(-)

diff --git a/tcp_vu.c b/tcp_vu.c
index db2c64d..5d5c97d 100644
--- a/tcp_vu.c
+++ b/tcp_vu.c
@@ -61,40 +61,6 @@ static size_t tcp_vu_hdrlen(bool v6)
 	return hdrlen;
 }
 
-/**
- * tcp_vu_update_check() - Calculate TCP checksum
- * @tapside:	Address information for one side of the flow
- * @iov:	Pointer to the array of IO vectors
- * @iov_cnt:	Length of the array
- */
-static void tcp_vu_update_check(const struct flowside *tapside,
-			        struct iovec *iov, int iov_cnt)
-{
-	char *base = iov[0].iov_base;
-	struct iov_tail payload;
-	struct tcphdr *th;
-	uint32_t psum;
-
-	if (inany_v4(&tapside->oaddr)) {
-		const struct in_addr *src4 = inany_v4(&tapside->oaddr);
-		const struct in_addr *dst4 = inany_v4(&tapside->eaddr);
-		const struct iphdr *iph = vu_ip(base);
-		size_t l4len = ntohs(iph->tot_len) - sizeof(*iph);
-
-		th = vu_payloadv4(base);
-		psum = proto_ipv4_header_psum(l4len, IPPROTO_TCP, *src4, *dst4);
-	} else {
-		const struct ipv6hdr *ip6h = vu_ip(base);
-		size_t l4len = ntohs(ip6h->payload_len);
-
-		th = vu_payloadv6(base);
-		psum = proto_ipv6_header_psum(l4len, IPPROTO_TCP,
-					      &ip6h->saddr, &ip6h->daddr);
-	}
-	payload = IOV_TAIL(iov, iov_cnt, (char *)(th + 1) - base);
-	tcp_update_csum(psum, th, &payload);
-}
-
 /**
  * tcp_vu_send_flag() - Send segment with flags to vhost-user (no payload)
  * @c:		Execution context
@@ -107,7 +73,6 @@ int tcp_vu_send_flag(const struct ctx *c, struct tcp_tap_conn *conn, int flags)
 {
 	struct vu_dev *vdev = c->vdev;
 	struct vu_virtq *vq = &vdev->vq[VHOST_USER_RX_QUEUE];
-	const struct flowside *tapside = TAPFLOW(conn);
 	size_t optlen, hdrlen;
 	struct vu_virtq_element flags_elem[2];
 	struct ipv6hdr *ip6h = NULL;
@@ -172,10 +137,9 @@ int tcp_vu_send_flag(const struct ctx *c, struct tcp_tap_conn *conn, int flags)
 	payload = IOV_TAIL(flags_elem[0].in_sg, 1, hdrlen);
 
 	tcp_fill_headers(conn, NULL, ip4h, ip6h, th, &payload,
-			 NULL, seq, true);
+			 NULL, seq, !*c->pcap);
 
 	if (*c->pcap) {
-		tcp_vu_update_check(tapside, &flags_elem[0].in_sg[0], 1);
 		pcap_iov(&flags_elem[0].in_sg[0], 1,
 			 sizeof(struct virtio_net_hdr_mrg_rxbuf));
 	}
@@ -319,15 +283,16 @@ static ssize_t tcp_vu_sock_recv(const struct ctx *c,
 
 /**
  * tcp_vu_prepare() - Prepare the frame header
- * @c:		Execution context
- * @conn:	Connection pointer
- * @iov:	Pointer to the array of IO vectors
- * @iov_cnt:	Number of entries in @iov
- * @check:	Checksum, if already known
+ * @c:			Execution context
+ * @conn:		Connection pointer
+ * @iov:		Pointer to the array of IO vectors
+ * @iov_cnt:		Number of entries in @iov
+ * @check:		Checksum, if already known
+ * @no_tcp_csum:	Do not set TCP checksum
  */
 static void tcp_vu_prepare(const struct ctx *c, struct tcp_tap_conn *conn,
 			   struct iovec *iov, size_t iov_cnt,
-			   const uint16_t **check)
+			   const uint16_t **check, bool no_tcp_csum)
 {
 	const struct flowside *toside = TAPFLOW(conn);
 	bool v6 = !(inany_v4(&toside->eaddr) && inany_v4(&toside->oaddr));
@@ -371,7 +336,7 @@ static void tcp_vu_prepare(const struct ctx *c, struct tcp_tap_conn *conn,
 	th->ack = 1;
 
 	tcp_fill_headers(conn, NULL, ip4h, ip6h, th, &payload,
-			 *check, conn->seq_to_tap, true);
+			 *check, conn->seq_to_tap, no_tcp_csum);
 	if (ip4h)
 		*check = &ip4h->check;
 }
@@ -389,8 +354,7 @@ int tcp_vu_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn)
 	uint32_t wnd_scaled = conn->wnd_from_tap << conn->ws_from_tap;
 	struct vu_dev *vdev = c->vdev;
 	struct vu_virtq *vq = &vdev->vq[VHOST_USER_RX_QUEUE];
-	const struct flowside *tapside = TAPFLOW(conn);
-	size_t fillsize, hdrlen;
+	size_t hdrlen, fillsize;
 	int v6 = CONN_V6(conn);
 	uint32_t already_sent;
 	const uint16_t *check;
@@ -483,10 +447,9 @@ int tcp_vu_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn)
 		if (i + 1 == head_cnt)
 			check = NULL;
 
-		tcp_vu_prepare(c, conn, iov, buf_cnt, &check);
+		tcp_vu_prepare(c, conn, iov, buf_cnt, &check, !*c->pcap);
 
 		if (*c->pcap) {
-			tcp_vu_update_check(tapside, iov, buf_cnt);
 			pcap_iov(iov, buf_cnt,
 				 sizeof(struct virtio_net_hdr_mrg_rxbuf));
 		}

From d9c0f8eefb0015a5a06c7259666c877fff6fbe92 Mon Sep 17 00:00:00 2001
From: Laurent Vivier <lvivier@redhat.com>
Date: Wed, 27 Nov 2024 17:16:45 +0100
Subject: [PATCH 152/382] Makefile: Use make internal string functions

TARGET_ARCH is computed from '$(CC) -dumpmachine' using external
shell commands such as echo, cut, tr and sed. This can be done with
make's internal string functions instead.

Signed-off-by: Laurent Vivier <lvivier@redhat.com>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 Makefile | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/Makefile b/Makefile
index cb74480..1fce737 100644
--- a/Makefile
+++ b/Makefile
@@ -17,8 +17,9 @@ DUAL_STACK_SOCKETS := 1
 
 TARGET ?= $(shell $(CC) -dumpmachine)
 # Get 'uname -m'-like architecture description for target
-TARGET_ARCH := $(shell echo $(TARGET) | cut -f1 -d- | tr [A-Z] [a-z])
-TARGET_ARCH := $(shell echo $(TARGET_ARCH) | sed 's/powerpc/ppc/')
+TARGET_ARCH := $(firstword $(subst -, ,$(TARGET)))
+TARGET_ARCH := $(patsubst [:upper:],[:lower:],$(TARGET_ARCH))
+TARGET_ARCH := $(subst powerpc,ppc,$(TARGET_ARCH))
 
 # On some systems enabling optimization also enables source fortification,
 # automagically. Do not override it.

From 020c8b7127e38872e68bffb30ad388001e088552 Mon Sep 17 00:00:00 2001
From: Laurent Vivier <lvivier@redhat.com>
Date: Thu, 28 Nov 2024 13:08:41 +0100
Subject: [PATCH 153/382] tcp_vu: Compute IPv4 header checksum if dlen changes

In tcp_vu_data_from_sock() we compute the IPv4 header checksum only
for the first and the last packets, and re-use the first packet's
checksum for all the other packets, as the content of the header
doesn't change.

It's more accurate to check the dlen value to know whether the checksum
should change, as dlen is the only input to the checksum that can change
in the loop.
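
A standalone sketch of the caching pattern (the checksum function below
is a placeholder, not passt's csum_ip4_header()):

  #include <stdint.h>
  #include <stdio.h>

  /* Placeholder: pretend the IPv4 header checksum depends only on dlen */
  static uint16_t csum_for_dlen(long dlen)
  {
  	return (uint16_t)(0xffff - (dlen & 0xffff));
  }

  int main(void)
  {
  	long dlens[] = { 61440, 61440, 61440, 2200 };
  	long previous_dlen = -1;
  	uint16_t check = 0;

  	for (unsigned i = 0; i < sizeof(dlens) / sizeof(dlens[0]); i++) {
  		/* The IPv4 header checksum varies only with dlen */
  		if (dlens[i] != previous_dlen)
  			check = csum_for_dlen(dlens[i]);
  		previous_dlen = dlens[i];

  		printf("segment %u: dlen %ld, check %#x\n",
  		       i, dlens[i], (unsigned)check);
  	}
  	return 0;
  }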

Signed-off-by: Laurent Vivier <lvivier@redhat.com>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 tcp_vu.c | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

diff --git a/tcp_vu.c b/tcp_vu.c
index 5d5c97d..10e17d3 100644
--- a/tcp_vu.c
+++ b/tcp_vu.c
@@ -354,12 +354,12 @@ int tcp_vu_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn)
 	uint32_t wnd_scaled = conn->wnd_from_tap << conn->ws_from_tap;
 	struct vu_dev *vdev = c->vdev;
 	struct vu_virtq *vq = &vdev->vq[VHOST_USER_RX_QUEUE];
+	ssize_t len, previous_dlen;
 	size_t hdrlen, fillsize;
 	int v6 = CONN_V6(conn);
 	uint32_t already_sent;
 	const uint16_t *check;
 	int i, iov_cnt;
-	ssize_t len;
 
 	if (!vu_queue_enabled(vq) || !vu_queue_started(vq)) {
 		debug("Got packet, but RX virtqueue not usable yet");
@@ -433,19 +433,17 @@ int tcp_vu_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn)
 	 */
 
 	hdrlen = tcp_vu_hdrlen(v6);
-	for (i = 0, check = NULL; i < head_cnt; i++) {
+	for (i = 0, previous_dlen = -1, check = NULL; i < head_cnt; i++) {
 		struct iovec *iov = &elem[head[i]].in_sg[0];
 		int buf_cnt = head[i + 1] - head[i];
 		ssize_t dlen = iov_size(iov, buf_cnt) - hdrlen;
 
 		vu_set_vnethdr(vdev, iov->iov_base, buf_cnt);
 
-		/* we compute IPv4 header checksum only for the
-		 * first and the last, all other checksums are the
-		 * same as the first one
-		 */
-		if (i + 1 == head_cnt)
+		/* The IPv4 header checksum varies only with dlen */
+		if (previous_dlen != dlen)
 			check = NULL;
+		previous_dlen = dlen;
 
 		tcp_vu_prepare(c, conn, iov, buf_cnt, &check, !*c->pcap);
 

From 966fdc8749048d37a4ffe845388e1ec106eb278d Mon Sep 17 00:00:00 2001
From: Stefano Brivio <sbrivio@redhat.com>
Date: Thu, 28 Nov 2024 15:06:44 +0100
Subject: [PATCH 154/382] perf/passt_vu_tcp: Make it shine

Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 test/perf/passt_vu_tcp | 30 +++++++++++++++---------------
 1 file changed, 15 insertions(+), 15 deletions(-)

diff --git a/test/perf/passt_vu_tcp b/test/perf/passt_vu_tcp
index b434008..c4409b9 100644
--- a/test/perf/passt_vu_tcp
+++ b/test/perf/passt_vu_tcp
@@ -38,10 +38,10 @@ hout	FREQ_PROCFS (echo "scale=1"; sed -n 's/cpu MHz.*: \([0-9]*\)\..*$/(\1+10^2\
 hout	FREQ_CPUFREQ (echo "scale=1"; printf '( %i + 10^5 / 2 ) / 10^6\n' $(cat /sys/devices/system/cpu/cpu0/cpufreq/scaling_max_freq) ) | bc -l
 hout	FREQ [ -n "__FREQ_CPUFREQ__" ] && echo __FREQ_CPUFREQ__ || echo __FREQ_PROCFS__
 
-set	THREADS 4
-set	TIME 5
+set	THREADS 6
+set	TIME 2
 set	OMIT 0.1
-set	OPTS -Z -P __THREADS__ -l 1M -O__OMIT__ -N
+set	OPTS -Z -P __THREADS__ -O__OMIT__ -N
 
 info	Throughput in Gbps, latency in µs, __THREADS__ threads at __FREQ__ GHz
 report	passt_vu tcp __THREADS__ __FREQ__
@@ -55,16 +55,16 @@ iperf3s	ns 10002
 bw	-
 bw	-
 guest	ip link set dev __IFNAME__ mtu 1280
-iperf3	BW guest __MAP_NS6__ 10002 __TIME__ __OPTS__ -w 16M
+iperf3	BW guest __MAP_NS6__ 10002 __TIME__ __OPTS__ -w 16M -l 1M
 bw	__BW__ 1.2 1.5
 guest	ip link set dev __IFNAME__ mtu 1500
-iperf3	BW guest __MAP_NS6__ 10002 __TIME__ __OPTS__ -w 32M
+iperf3	BW guest __MAP_NS6__ 10002 __TIME__ __OPTS__ -w 32M -l 1M
 bw	__BW__ 1.6 1.8
 guest	ip link set dev __IFNAME__ mtu 9000
-iperf3	BW guest __MAP_NS6__ 10002 __TIME__ __OPTS__ -w 64M
+iperf3	BW guest __MAP_NS6__ 10002 __TIME__ __OPTS__ -w 64M -l 1M
 bw	__BW__ 4.0 5.0
 guest	ip link set dev __IFNAME__ mtu 65520
-iperf3	BW guest __MAP_NS6__ 10002 __TIME__ __OPTS__ -w 64M
+iperf3	BW guest __MAP_NS6__ 10002 __TIME__ __OPTS__ -w 64M -l 1M
 bw	__BW__ 7.0 8.0
 
 iperf3k	ns
@@ -93,22 +93,22 @@ tr	TCP throughput over IPv4: guest to host
 iperf3s	ns 10002
 
 guest	ip link set dev __IFNAME__ mtu 256
-iperf3	BW guest __MAP_NS4__ 10002 __TIME__ __OPTS__ -w 2M
+iperf3	BW guest __MAP_NS4__ 10002 __TIME__ __OPTS__ -w 2M -l 1M
 bw	__BW__ 0.2 0.3
 guest	ip link set dev __IFNAME__ mtu 576
-iperf3	BW guest __MAP_NS4__ 10002 __TIME__ __OPTS__ -w 4M
+iperf3	BW guest __MAP_NS4__ 10002 __TIME__ __OPTS__ -w 4M -l 1M
 bw	__BW__ 0.5 0.8
 guest	ip link set dev __IFNAME__ mtu 1280
-iperf3	BW guest __MAP_NS4__ 10002 __TIME__ __OPTS__ -w 8M
+iperf3	BW guest __MAP_NS4__ 10002 __TIME__ __OPTS__ -w 8M -l 1M
 bw	__BW__ 1.2 1.5
 guest	ip link set dev __IFNAME__ mtu 1500
-iperf3	BW guest __MAP_NS4__ 10002 __TIME__ __OPTS__ -w 16M
+iperf3	BW guest __MAP_NS4__ 10002 __TIME__ __OPTS__ -w 16M -l 1M
 bw	__BW__ 1.6 1.8
 guest	ip link set dev __IFNAME__ mtu 9000
-iperf3	BW guest __MAP_NS4__ 10002 __TIME__ __OPTS__ -w 64M
+iperf3	BW guest __MAP_NS4__ 10002 __TIME__ __OPTS__ -w 64M -l 1M
 bw	__BW__ 4.0 5.0
 guest	ip link set dev __IFNAME__ mtu 65520
-iperf3	BW guest __MAP_NS4__ 10002 __TIME__ __OPTS__ -w 64M
+iperf3	BW guest __MAP_NS4__ 10002 __TIME__ __OPTS__ -w 64M -l 1M
 bw	__BW__ 7.0 8.0
 
 iperf3k	ns
@@ -145,7 +145,7 @@ bw	-
 bw	-
 bw	-
 bw	-
-iperf3	BW ns ::1 10001 __TIME__ __OPTS__ -w 32M
+iperf3	BW ns ::1 10001 __TIME__ __OPTS__ -w 256M -l 16k
 bw	__BW__ 6.0 6.8
 
 iperf3k	guest
@@ -181,7 +181,7 @@ bw	-
 bw	-
 bw	-
 bw	-
-iperf3	BW ns 127.0.0.1 10001 __TIME__ __OPTS__ -w 32M
+iperf3	BW ns 127.0.0.1 10001 __TIME__ __OPTS__ -w 256M -l 16k
 bw	__BW__ 6.0 6.8
 
 iperf3k	guest

From 1db4f773e87fc77eae2c4965a6bb90fcb56a0ff3 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Thu, 5 Dec 2024 15:26:01 +1100
Subject: [PATCH 155/382] udp: Improve detail of UDP endpoint sanity checking

In udp_flow_new() we reject a flow if the endpoint isn't unicast, or it has
a zero endpoint port.  Those conditions aren't strictly illegal, but we
can't safely handle them at present:
 * Multicast UDP endpoints are certainly possible, but our current flow
   tracking only makes sense for simple unicast flows - we'll need
   different handling if we want to handle multicast flows in future
 * It's not entirely clear if port 0 is RFC-ishly correct, but for socket
   interfaces port 0 sometimes has a special meaning such as "pick the port
   for me, kernel".  That makes flows on port 0 unsafe to forward in the
   usual way.

For the same reason we also can't safely handle port 0 as our port.  In
principle that's also true for our address, however in the case of flows
initiated from a socket, we may not know our address since the socket
could be bound to 0.0.0.0 or ::, so we can only verify that our address
is unicast for flows initiated from the tap side.

Refine the current check in udp_flow_new() to slightly more detailed checks
in udp_flow_from_sock() and udp_flow_from_tap() to make what is and isn't
handled clearer.  This makes this checking more similar to what we do for
TCP connections.
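
As a rough standalone illustration of the tap-side rule (the unicast
check below is a simplified, IPv4-only stand-in for passt's inany
helpers, not the real code):

  #include <stdbool.h>
  #include <stdint.h>
  #include <stdio.h>

  struct endpoint { uint32_t addr; uint16_t port; };	/* host byte order */

  static bool is_unicast(uint32_t addr)
  {
  	/* not 0.0.0.0, not 224.0.0.0/4, not 255.255.255.255 */
  	return addr != 0 && (addr >> 28) != 0xe && addr != UINT32_MAX;
  }

  /* Flows initiated from the tap side: both addresses and ports are known */
  static bool tap_flow_ok(struct endpoint ep, struct endpoint our)
  {
  	return is_unicast(ep.addr) && ep.port != 0 &&
  	       is_unicast(our.addr) && our.port != 0;
  }

  int main(void)
  {
  	struct endpoint ep  = { 0x0a000002, 53 };	/* 10.0.0.2:53 */
  	struct endpoint our = { 0x0a000001, 40000 };	/* 10.0.0.1:40000 */

  	printf("flow allowed: %d\n", tap_flow_ok(ep, our));

  	ep.port = 0;		/* port 0 can't be forwarded safely: reject */
  	printf("flow allowed: %d\n", tap_flow_ok(ep, our));
  	return 0;
  }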

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 udp_flow.c | 32 ++++++++++++++++++++++++--------
 1 file changed, 24 insertions(+), 8 deletions(-)

diff --git a/udp_flow.c b/udp_flow.c
index b81be2c..c8fdb5f 100644
--- a/udp_flow.c
+++ b/udp_flow.c
@@ -75,16 +75,10 @@ void udp_flow_close(const struct ctx *c, struct udp_flow *uflow)
 static flow_sidx_t udp_flow_new(const struct ctx *c, union flow *flow,
 				int s_ini, const struct timespec *now)
 {
-	const struct flowside *ini = &flow->f.side[INISIDE];
 	struct udp_flow *uflow = NULL;
 	const struct flowside *tgt;
 	uint8_t tgtpif;
 
-	if (!inany_is_unicast(&ini->eaddr) || ini->eport == 0) {
-		flow_trace(flow, "Invalid endpoint to initiate UDP flow");
-		goto cancel;
-	}
-
 	if (!(tgt = flow_target(c, flow, IPPROTO_UDP)))
 		goto cancel;
 	tgtpif = flow->f.pif[TGTSIDE];
@@ -189,6 +183,7 @@ flow_sidx_t udp_flow_from_sock(const struct ctx *c, union epoll_ref ref,
 			       const union sockaddr_inany *s_in,
 			       const struct timespec *now)
 {
+	const struct flowside *ini;
 	struct udp_flow *uflow;
 	union flow *flow;
 	flow_sidx_t sidx;
@@ -210,7 +205,19 @@ flow_sidx_t udp_flow_from_sock(const struct ctx *c, union epoll_ref ref,
 		return FLOW_SIDX_NONE;
 	}
 
-	flow_initiate_sa(flow, ref.udp.pif, s_in, ref.udp.port);
+	ini = flow_initiate_sa(flow, ref.udp.pif, s_in, ref.udp.port);
+
+	if (!inany_is_unicast(&ini->eaddr) ||
+	    ini->eport == 0 || ini->oport == 0) {
+		/* In principle ini->oddr also must be unicast, but when we've
+		 * been initiated from a socket bound to 0.0.0.0 or ::, we don't
+		 * know our address, so we have to leave it unpopulated.
+		 */
+		flow_err(flow, "Invalid endpoint on UDP recvfrom()");
+		flow_alloc_cancel(flow);
+		return FLOW_SIDX_NONE;
+	}
+
 	return udp_flow_new(c, flow, ref.fd, now);
 }
 
@@ -233,6 +240,7 @@ flow_sidx_t udp_flow_from_tap(const struct ctx *c,
 			      in_port_t srcport, in_port_t dstport,
 			      const struct timespec *now)
 {
+	const struct flowside *ini;
 	struct udp_flow *uflow;
 	union flow *flow;
 	flow_sidx_t sidx;
@@ -256,7 +264,15 @@ flow_sidx_t udp_flow_from_tap(const struct ctx *c,
 		return FLOW_SIDX_NONE;
 	}
 
-	flow_initiate_af(flow, PIF_TAP, af, saddr, srcport, daddr, dstport);
+	ini = flow_initiate_af(flow, PIF_TAP, af, saddr, srcport,
+			       daddr, dstport);
+
+	if (!inany_is_unicast(&ini->eaddr) || ini->eport == 0 ||
+	    !inany_is_unicast(&ini->oaddr) || ini->oport == 0) {
+		flow_dbg(flow, "Invalid endpoint on UDP packet");
+		flow_alloc_cancel(flow);
+		return FLOW_SIDX_NONE;
+	}
 
 	return udp_flow_new(c, flow, -1, now);
 }

From 190829705e315972a7c674d2fa55d322aa18d26e Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Thu, 5 Dec 2024 15:26:02 +1100
Subject: [PATCH 156/382] flow: Remove over-zealous sanity checks in
 flow_sidx_hash()

In flow_sidx_hash() we verify that the flow we're hashing doesn't have an
unspecified endpoint address, or zero for either port.  The hash table only
works if we're looking for exact matches of address and port, and this is
attempting to catch any cases where we might have left address or port
unpopulated or filled with a wildcard.

This doesn't really work though, because there are cases where unspecified
addresses or zero ports are correct:
 * We already use unspecified addresses for our address in cases where we
   don't know the specific local address for that side, and exclude the
   obvious extra check on side->oaddr for that reason.
 * Zero port numbers aren't strictly forbidden over the wire.  We forbid
   them for TCP & UDP because they can't safely be handled on the socket
   side.  However for ICMP a zero id, which goes in the port field is
   valid.
 * Possible future flow types (for example, for multicast protocols) might
   legitimately have an unspecified address.

Although it makes them easier to miss, these sorts of sanity checks really
have to be done at the protocol / flow type layer, and we already do so.
Remove the checks in flow_sidx_hash() other than checking that the pif
is specified.

Reported-by: Stefan <steffhip@gmail.com>
Link: https://bugs.passt.top/show_bug.cgi?id=105
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 flow.c | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/flow.c b/flow.c
index 1ea112b..ee1221b 100644
--- a/flow.c
+++ b/flow.c
@@ -597,12 +597,7 @@ static uint64_t flow_sidx_hash(const struct ctx *c, flow_sidx_t sidx)
 	const struct flowside *side = &f->side[sidx.sidei];
 	uint8_t pif = f->pif[sidx.sidei];
 
-	/* For the hash table to work, entries must have complete endpoint
-	 * information, and at least a forwarding port.
-	 */
-	ASSERT(pif != PIF_NONE && !inany_is_unspecified(&side->eaddr) &&
-	       side->eport != 0 && side->oport != 0);
-
+	ASSERT(pif != PIF_NONE);
 	return flow_hash(c, FLOW_PROTO(f), pif, side);
 }
 

From 8996d183c5c50399d9dbae4d60d77d08f44ffb54 Mon Sep 17 00:00:00 2001
From: Laurent Vivier <lvivier@redhat.com>
Date: Thu, 5 Dec 2024 08:37:18 +0100
Subject: [PATCH 157/382] udp_vu: update segment size

In udp_vu_sock_recv(), collect a segment with a size of
IP_MAX_MTU + ETH_HLEN + sizeof(struct virtio_net_hdr_mrg_rxbuf).

The original version double counted the IP header: IP_MAX_MTU includes
the IP header, and so did hdrlen.
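
Quick arithmetic check of that size, with assumed values (IP_MAX_MTU
65535, ETH_HLEN 14, and a 12-byte struct virtio_net_hdr_mrg_rxbuf):

  #include <stdio.h>

  int main(void)
  {
  	const int ip_max_mtu = 65535;	/* includes the IP header */
  	const int eth_hlen = 14;
  	const int vnet_hdr_len = 12;	/* assumed virtio_net_hdr_mrg_rxbuf */

  	printf("collect size: %d bytes\n",
  	       ip_max_mtu + eth_hlen + vnet_hdr_len);
  	return 0;
  }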

Signed-off-by: Laurent Vivier <lvivier@redhat.com>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 udp_vu.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/udp_vu.c b/udp_vu.c
index 9c697f3..4123510 100644
--- a/udp_vu.c
+++ b/udp_vu.c
@@ -104,7 +104,8 @@ static int udp_vu_sock_recv(const struct ctx *c, int s, uint32_t events,
 	vu_init_elem(elem, iov_vu, VIRTQUEUE_MAX_SIZE);
 
 	iov_cnt = vu_collect(vdev, vq, elem, VIRTQUEUE_MAX_SIZE,
-			     IP_MAX_MTU - sizeof(struct udphdr) + hdrlen,
+			     IP_MAX_MTU + ETH_HLEN +
+			     sizeof(struct virtio_net_hdr_mrg_rxbuf),
 			     NULL);
 	if (iov_cnt == 0)
 		return 0;

From 2139ad33fc8ab48736d65f3d65dc882f0d612006 Mon Sep 17 00:00:00 2001
From: Laurent Vivier <lvivier@redhat.com>
Date: Mon, 9 Dec 2024 17:54:49 +0100
Subject: [PATCH 158/382] tap: Use a common function to start a new connection

Merge the code from tap_backend_init(), tap_sock_tun_init() and
tap_listen_handler() that sets up the epoll_ref entry and adds it
to epollfd into a single helper.

No functional change.
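
The factored-out pattern, as a minimal standalone sketch (the union
layout and type values are illustrative, not passt's epoll_ref):

  #include <stdint.h>
  #include <stdio.h>
  #include <sys/epoll.h>
  #include <unistd.h>

  union ref {
  	struct {
  		uint32_t type;	/* illustrative: which handler owns this fd */
  		int32_t fd;
  	};
  	uint64_t u64;
  };

  /* Register one connection fd with epoll, packing the ref into data.u64 */
  static int start_connection(int epollfd, int fd, uint32_t type)
  {
  	union ref r = { .type = type, .fd = fd };
  	struct epoll_event ev = {
  		.events = EPOLLIN | EPOLLRDHUP,
  		.data.u64 = r.u64,
  	};

  	return epoll_ctl(epollfd, EPOLL_CTL_ADD, fd, &ev);
  }

  int main(void)
  {
  	int epollfd = epoll_create1(0);
  	int pfd[2];

  	if (epollfd < 0 || pipe(pfd))
  		return 1;

  	if (start_connection(epollfd, pfd[0], 1))
  		perror("epoll_ctl");
  	else
  		printf("fd %d registered\n", pfd[0]);

  	close(pfd[0]);
  	close(pfd[1]);
  	close(epollfd);
  	return 0;
  }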

Signed-off-by: Laurent Vivier <lvivier@redhat.com>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 tap.c | 66 +++++++++++++++++++++++++++--------------------------------
 1 file changed, 30 insertions(+), 36 deletions(-)

diff --git a/tap.c b/tap.c
index c418064..b2d3045 100644
--- a/tap.c
+++ b/tap.c
@@ -1255,6 +1255,33 @@ static void tap_sock_unix_init(const struct ctx *c)
 	epoll_ctl(c->epollfd, EPOLL_CTL_ADD, c->fd_tap_listen, &ev);
 }
 
+/**
+ * tap_start_connection() - start a new connection
+ * @c:		Execution context
+ */
+static void tap_start_connection(const struct ctx *c)
+{
+	struct epoll_event ev = { 0 };
+	union epoll_ref ref = { 0 };
+
+	ref.fd = c->fd_tap;
+	switch (c->mode) {
+	case MODE_PASST:
+		ref.type = EPOLL_TYPE_TAP_PASST;
+		break;
+	case MODE_PASTA:
+		ref.type = EPOLL_TYPE_TAP_PASTA;
+		break;
+	case MODE_VU:
+		ref.type = EPOLL_TYPE_VHOST_CMD;
+		break;
+	}
+
+	ev.events = EPOLLIN | EPOLLRDHUP;
+	ev.data.u64 = ref.u64;
+	epoll_ctl(c->epollfd, EPOLL_CTL_ADD, c->fd_tap, &ev);
+}
+
 /**
  * tap_listen_handler() - Handle new connection on listening socket
  * @c:		Execution context
@@ -1262,8 +1289,6 @@ static void tap_sock_unix_init(const struct ctx *c)
  */
 void tap_listen_handler(struct ctx *c, uint32_t events)
 {
-	struct epoll_event ev = { 0 };
-	union epoll_ref ref = { 0 };
 	int v = INT_MAX / 2;
 	struct ucred ucred;
 	socklen_t len;
@@ -1302,14 +1327,7 @@ void tap_listen_handler(struct ctx *c, uint32_t events)
 	    setsockopt(c->fd_tap, SOL_SOCKET, SO_SNDBUF, &v, sizeof(v)))
 		trace("tap: failed to set SO_SNDBUF to %i", v);
 
-	ref.fd = c->fd_tap;
-	if (c->mode == MODE_VU)
-		ref.type = EPOLL_TYPE_VHOST_CMD;
-	else
-		ref.type = EPOLL_TYPE_TAP_PASST;
-	ev.events = EPOLLIN | EPOLLRDHUP;
-	ev.data.u64 = ref.u64;
-	epoll_ctl(c->epollfd, EPOLL_CTL_ADD, c->fd_tap, &ev);
+	tap_start_connection(c);
 }
 
 /**
@@ -1353,19 +1371,13 @@ static int tap_ns_tun(void *arg)
  */
 static void tap_sock_tun_init(struct ctx *c)
 {
-	union epoll_ref ref = { .type = EPOLL_TYPE_TAP_PASTA };
-	struct epoll_event ev = { 0 };
-
 	NS_CALL(tap_ns_tun, c);
 	if (c->fd_tap == -1)
 		die("Failed to set up tap device in namespace");
 
 	pasta_ns_conf(c);
 
-	ref.fd = c->fd_tap;
-	ev.events = EPOLLIN | EPOLLRDHUP;
-	ev.data.u64 = ref.u64;
-	epoll_ctl(c->epollfd, EPOLL_CTL_ADD, c->fd_tap, &ev);
+	tap_start_connection(c);
 }
 
 /**
@@ -1399,26 +1411,8 @@ void tap_backend_init(struct ctx *c)
 		tap_sock_update_pool(pkt_buf, sizeof(pkt_buf));
 
 	if (c->fd_tap != -1) { /* Passed as --fd */
-		struct epoll_event ev = { 0 };
-		union epoll_ref ref;
-
 		ASSERT(c->one_off);
-		ref.fd = c->fd_tap;
-		switch (c->mode) {
-		case MODE_PASST:
-			ref.type = EPOLL_TYPE_TAP_PASST;
-			break;
-		case MODE_PASTA:
-			ref.type = EPOLL_TYPE_TAP_PASTA;
-			break;
-		case MODE_VU:
-			ref.type = EPOLL_TYPE_VHOST_CMD;
-			break;
-		}
-
-		ev.events = EPOLLIN | EPOLLRDHUP;
-		ev.data.u64 = ref.u64;
-		epoll_ctl(c->epollfd, EPOLL_CTL_ADD, c->fd_tap, &ev);
+		tap_start_connection(c);
 		return;
 	}
 

From 947f5cdb93062fd4e56adbab9901bbbb8aa8b5cb Mon Sep 17 00:00:00 2001
From: Laurent Vivier <lvivier@redhat.com>
Date: Mon, 9 Dec 2024 17:54:50 +0100
Subject: [PATCH 159/382] tap: Call vu_init() with --fd

We need to initialize vhost-user structures with --fd too.

Signed-off-by: Laurent Vivier <lvivier@redhat.com>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 tap.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tap.c b/tap.c
index b2d3045..cd32a90 100644
--- a/tap.c
+++ b/tap.c
@@ -1405,10 +1405,12 @@ void tap_sock_update_pool(void *base, size_t size)
  */
 void tap_backend_init(struct ctx *c)
 {
-	if (c->mode == MODE_VU)
+	if (c->mode == MODE_VU) {
 		tap_sock_update_pool(NULL, 0);
-	else
+		vu_init(c);
+	} else {
 		tap_sock_update_pool(pkt_buf, sizeof(pkt_buf));
+	}
 
 	if (c->fd_tap != -1) { /* Passed as --fd */
 		ASSERT(c->one_off);
@@ -1421,8 +1423,6 @@ void tap_backend_init(struct ctx *c)
 		tap_sock_tun_init(c);
 		break;
 	case MODE_VU:
-		vu_init(c);
-		/* fall through */
 	case MODE_PASST:
 		tap_sock_unix_init(c);
 

From e24f0262229a1f9c673dca3452ad103cbe06b866 Mon Sep 17 00:00:00 2001
From: Jon Maloy <jmaloy@redhat.com>
Date: Tue, 10 Dec 2024 13:36:45 -0500
Subject: [PATCH 160/382] pasta: make it possible to disable socket splicing

During testing it is sometimes useful to force traffic that would
normally be forwarded by socket splicing to go through the tap
interface instead.

In this commit, we add a command line switch enabling such functionality
for inbound local traffic.

For outbound local traffic this is much trickier, if even possible,
so leave that for a later commit.

Suggested-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Jon Maloy <jmaloy@redhat.com>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 conf.c  | 7 ++++++-
 fwd.c   | 2 +-
 passt.1 | 5 +++++
 passt.h | 2 ++
 4 files changed, 14 insertions(+), 2 deletions(-)

diff --git a/conf.c b/conf.c
index eaa7d99..97d8beb 100644
--- a/conf.c
+++ b/conf.c
@@ -977,7 +977,8 @@ pasta_opts:
 		"			Don't copy all routes to namespace\n"
 		"  --no-copy-addrs	DEPRECATED:\n"
 		"			Don't copy all addresses to namespace\n"
-		"  --ns-mac-addr ADDR	Set MAC address on tap interface\n");
+		"  --ns-mac-addr ADDR	Set MAC address on tap interface\n"
+		"  --no-splice		Disable inbound socket splicing\n");
 
 	exit(status);
 }
@@ -1319,6 +1320,7 @@ void conf(struct ctx *c, int argc, char **argv)
 		{"no-dhcpv6",	no_argument,		&c->no_dhcpv6,	1 },
 		{"no-ndp",	no_argument,		&c->no_ndp,	1 },
 		{"no-ra",	no_argument,		&c->no_ra,	1 },
+		{"no-splice",	no_argument,		&c->no_splice,	1 },
 		{"freebind",	no_argument,		&c->freebind,	1 },
 		{"no-map-gw",	no_argument,		&no_map_gw,	1 },
 		{"ipv4-only",	no_argument,		NULL,		'4' },
@@ -1756,6 +1758,9 @@ void conf(struct ctx *c, int argc, char **argv)
 		}
 	} while (name != -1);
 
+	if (c->mode != MODE_PASTA)
+		c->no_splice = 1;
+
 	if (c->mode == MODE_PASTA && !c->pasta_conf_ns) {
 		if (copy_routes_opt)
 			die("--no-copy-routes needs --config-net");
diff --git a/fwd.c b/fwd.c
index 0b7f8b1..2829cd2 100644
--- a/fwd.c
+++ b/fwd.c
@@ -443,7 +443,7 @@ uint8_t fwd_nat_from_host(const struct ctx *c, uint8_t proto,
 	else if (proto == IPPROTO_UDP)
 		tgt->eport += c->udp.fwd_in.delta[tgt->eport];
 
-	if (c->mode == MODE_PASTA && inany_is_loopback(&ini->eaddr) &&
+	if (!c->no_splice && inany_is_loopback(&ini->eaddr) &&
 	    (proto == IPPROTO_TCP || proto == IPPROTO_UDP)) {
 		/* spliceable */
 
diff --git a/passt.1 b/passt.1
index b2896a2..d9cd33e 100644
--- a/passt.1
+++ b/passt.1
@@ -695,6 +695,11 @@ Configure MAC address \fIaddr\fR on the tap interface in the namespace.
 
 Default is to let the tap driver build a pseudorandom hardware address.
 
+.TP
+.BR \-\-no-splice
+Disable the bypass path for inbound, local traffic. See the section \fBHandling
+of local traffic in pasta\fR in the \fBNOTES\fR for more details.
+
 .SH EXAMPLES
 
 .SS \fBpasta
diff --git a/passt.h b/passt.h
index c038630..0dd4efa 100644
--- a/passt.h
+++ b/passt.h
@@ -229,6 +229,7 @@ struct ip6_ctx {
  * @no_dhcpv6:		Disable DHCPv6 server
  * @no_ndp:		Disable NDP handler altogether
  * @no_ra:		Disable router advertisements
+ * @no_splice:		Disable socket splicing for inbound traffic
  * @host_lo_to_ns_lo:	Map host loopback addresses to ns loopback addresses
  * @freebind:		Allow binding of non-local addresses for forwarding
  * @low_wmem:		Low probed net.core.wmem_max
@@ -291,6 +292,7 @@ struct ctx {
 	int no_dhcpv6;
 	int no_ndp;
 	int no_ra;
+	int no_splice;
 	int host_lo_to_ns_lo;
 	int freebind;
 

From 09478d55fe1a21f8c55902399df84d13867e71be Mon Sep 17 00:00:00 2001
From: Stefano Brivio <sbrivio@redhat.com>
Date: Wed, 11 Dec 2024 00:13:39 +0100
Subject: [PATCH 161/382] treewide: Dodge dynamic memory allocation in
 strerror() from glibc > 2.40

With glibc commit 25a5eb4010df ("string: strerror, strsignal cannot
use buffer after dlmopen (bug 32026)"), strerror() now needs, at least
on x86, the getrandom() and brk() system calls, in order to fill in
the locale-translated error message. But getrandom() and brk() are not
allowed by our seccomp profiles.

This became visible on Fedora Rawhide with the "podman login and
logout" Podman tests, defined at test/e2e/login_logout_test.go in the
Podman source tree, where pasta would terminate upon printing error
descriptions (at least the ones related to the SO_ERROR queue for
spliced connections).

Avoid dynamic memory allocation by calling strerrordesc_np() instead,
which is a GNU function returning a static, untranslated version of
the error description. If it's not available, keep calling strerror(),
which at that point should be simple enough as to be usable (at least,
that's currently the case for musl).
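
A self-contained sketch of the fallback, mirroring the wrapper added to
util.h below (builds with gcc or clang against either glibc or musl):

  #include <errno.h>
  #include <stdio.h>
  #include <string.h>

  /* Weak declaration: the address is NULL if the C library lacks it */
  __attribute__ ((weak)) const char *strerrordesc_np(int errnum);

  static const char *strerror_(int errnum)
  {
  	if (strerrordesc_np)
  		return strerrordesc_np(errnum);	/* static, untranslated */

  	return strerror(errnum);	/* e.g. musl: simple enough to call */
  }

  int main(void)
  {
  	printf("%s\n", strerror_(ENOENT));
  	return 0;
  }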

Reported-by: Paul Holzinger <pholzing@redhat.com>
Link: https://github.com/containers/podman/issues/24804
Analysed-by: Paul Holzinger <pholzing@redhat.com>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
Tested-by: Paul Holzinger <pholzing@redhat.com>
---
 conf.c       | 10 +++++-----
 icmp.c       |  4 ++--
 log.c        |  2 +-
 netlink.c    |  2 +-
 pasta.c      | 22 +++++++++++-----------
 tcp.c        | 22 +++++++++++-----------
 tcp_splice.c | 16 ++++++++--------
 udp.c        |  4 ++--
 udp_flow.c   |  8 ++++----
 util.c       |  6 +++---
 util.h       | 32 ++++++++++++++++++++++++++++++++
 11 files changed, 80 insertions(+), 48 deletions(-)

diff --git a/conf.c b/conf.c
index 97d8beb..df2b016 100644
--- a/conf.c
+++ b/conf.c
@@ -365,7 +365,7 @@ mode_conflict:
 	die("Port forwarding mode '%s' conflicts with previous mode", optarg);
 bind_fail:
 	die("Failed to bind port %u (%s) for option '-%c %s', exiting",
-	    i, strerror(-ret), optname, optarg);
+	    i, strerror_(-ret), optname, optarg);
 bind_all_fail:
 	die("Failed to bind any port for '-%c %s', exiting", optname, optarg);
 }
@@ -655,7 +655,7 @@ static unsigned int conf_ip4(unsigned int ifi, struct ip4_ctx *ip4)
 					  &ip4->guest_gw);
 		if (rc < 0) {
 			debug("Couldn't discover IPv4 gateway address: %s",
-			      strerror(-rc));
+			      strerror_(-rc));
 			return 0;
 		}
 	}
@@ -665,7 +665,7 @@ static unsigned int conf_ip4(unsigned int ifi, struct ip4_ctx *ip4)
 				     &ip4->addr, &ip4->prefix_len, NULL);
 		if (rc < 0) {
 			debug("Couldn't discover IPv4 address: %s",
-			      strerror(-rc));
+			      strerror_(-rc));
 			return 0;
 		}
 	}
@@ -729,7 +729,7 @@ static unsigned int conf_ip6(unsigned int ifi, struct ip6_ctx *ip6)
 		rc = nl_route_get_def(nl_sock, ifi, AF_INET6, &ip6->guest_gw);
 		if (rc < 0) {
 			debug("Couldn't discover IPv6 gateway address: %s",
-			      strerror(-rc));
+			      strerror_(-rc));
 			return 0;
 		}
 	}
@@ -738,7 +738,7 @@ static unsigned int conf_ip6(unsigned int ifi, struct ip6_ctx *ip6)
 			 IN6_IS_ADDR_UNSPECIFIED(&ip6->addr) ? &ip6->addr : NULL,
 			 &prefix_len, &ip6->our_tap_ll);
 	if (rc < 0) {
-		debug("Couldn't discover IPv6 address: %s", strerror(-rc));
+		debug("Couldn't discover IPv6 address: %s", strerror_(-rc));
 		return 0;
 	}
 
diff --git a/icmp.c b/icmp.c
index f514dbc..143e93b 100644
--- a/icmp.c
+++ b/icmp.c
@@ -85,7 +85,7 @@ void icmp_sock_handler(const struct ctx *c, union epoll_ref ref)
 
 	n = recvfrom(ref.fd, buf, sizeof(buf), 0, &sr.sa, &sl);
 	if (n < 0) {
-		flow_err(pingf, "recvfrom() error: %s", strerror(errno));
+		flow_err(pingf, "recvfrom() error: %s", strerror_(errno));
 		return;
 	}
 
@@ -301,7 +301,7 @@ int icmp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af,
 	pif_sockaddr(c, &sa, &sl, PIF_HOST, &tgt->eaddr, 0);
 	if (sendto(pingf->sock, pkt, l4len, MSG_NOSIGNAL, &sa.sa, sl) < 0) {
 		flow_dbg(pingf, "failed to relay request to socket: %s",
-			 strerror(errno));
+			 strerror_(errno));
 	} else {
 		flow_dbg(pingf,
 			 "echo request to socket, ID: %"PRIu16", seq: %"PRIu16,
diff --git a/log.c b/log.c
index 239c8ce..95e4576 100644
--- a/log.c
+++ b/log.c
@@ -322,7 +322,7 @@ void logmsg_perror(int pri, const char *format, ...)
 	vlogmsg(false, false, pri, format, ap);
 	va_end(ap);
 
-	logmsg(true, true, pri, ": %s", strerror(errno_copy));
+	logmsg(true, true, pri, ": %s", strerror_(errno_copy));
 }
 
 /**
diff --git a/netlink.c b/netlink.c
index 4aba2a3..0407692 100644
--- a/netlink.c
+++ b/netlink.c
@@ -320,7 +320,7 @@ unsigned int nl_get_ext_if(int s, sa_family_t af)
 	}
 
 	if (status < 0)
-		warn("netlink: RTM_GETROUTE failed: %s", strerror(-status));
+		warn("netlink: RTM_GETROUTE failed: %s", strerror_(-status));
 
 	if (defifi) {
 		if (ndef > 1) {
diff --git a/pasta.c b/pasta.c
index 96dacc3..ff41c95 100644
--- a/pasta.c
+++ b/pasta.c
@@ -296,7 +296,7 @@ void pasta_ns_conf(struct ctx *c)
 	rc = nl_link_set_flags(nl_sock_ns, 1 /* lo */, IFF_UP, IFF_UP);
 	if (rc < 0)
 		die("Couldn't bring up loopback interface in namespace: %s",
-		    strerror(-rc));
+		    strerror_(-rc));
 
 	/* Get or set MAC in target namespace */
 	if (MAC_IS_ZERO(c->guest_mac))
@@ -305,7 +305,7 @@ void pasta_ns_conf(struct ctx *c)
 		rc = nl_link_set_mac(nl_sock_ns, c->pasta_ifi, c->guest_mac);
 	if (rc < 0)
 		die("Couldn't set MAC address in namespace: %s",
-		    strerror(-rc));
+		    strerror_(-rc));
 
 	if (c->pasta_conf_ns) {
 		unsigned int flags = IFF_UP;
@@ -332,7 +332,7 @@ void pasta_ns_conf(struct ctx *c)
 
 			if (rc < 0) {
 				die("Couldn't set IPv4 address(es) in namespace: %s",
-				    strerror(-rc));
+				    strerror_(-rc));
 			}
 
 			if (c->ip4.no_copy_routes) {
@@ -346,7 +346,7 @@ void pasta_ns_conf(struct ctx *c)
 
 			if (rc < 0) {
 				die("Couldn't set IPv4 route(s) in guest: %s",
-				    strerror(-rc));
+				    strerror_(-rc));
 			}
 		}
 
@@ -355,13 +355,13 @@ void pasta_ns_conf(struct ctx *c)
 					    &c->ip6.addr_ll_seen);
 			if (rc < 0) {
 				warn("Can't get LL address from namespace: %s",
-				    strerror(-rc));
+				    strerror_(-rc));
 			}
 
 			rc = nl_addr_set_ll_nodad(nl_sock_ns, c->pasta_ifi);
 			if (rc < 0) {
 				warn("Can't set nodad for LL in namespace: %s",
-				    strerror(-rc));
+				    strerror_(-rc));
 			}
 
 			/* We dodged DAD: re-enable neighbour solicitations */
@@ -382,7 +382,7 @@ void pasta_ns_conf(struct ctx *c)
 
 			if (rc < 0) {
 				die("Couldn't set IPv6 address(es) in namespace: %s",
-				    strerror(-rc));
+				    strerror_(-rc));
 			}
 
 			if (c->ip6.no_copy_routes) {
@@ -397,7 +397,7 @@ void pasta_ns_conf(struct ctx *c)
 
 			if (rc < 0) {
 				die("Couldn't set IPv6 route(s) in guest: %s",
-				    strerror(-rc));
+				    strerror_(-rc));
 			}
 		}
 	}
@@ -446,18 +446,18 @@ void pasta_netns_quit_init(const struct ctx *c)
 		return;
 
 	if ((dir_fd = open(c->netns_dir, O_CLOEXEC | O_RDONLY)) < 0)
-		die("netns dir open: %s, exiting", strerror(errno));
+		die("netns dir open: %s, exiting", strerror_(errno));
 
 	if (fstatfs(dir_fd, &s)          || s.f_type == DEVPTS_SUPER_MAGIC ||
 	    s.f_type == PROC_SUPER_MAGIC || s.f_type == SYSFS_MAGIC)
 		try_inotify = false;
 
 	if (try_inotify && (fd = inotify_init1(flags)) < 0)
-		warn("inotify_init1(): %s, use a timer", strerror(errno));
+		warn("inotify_init1(): %s, use a timer", strerror_(errno));
 
 	if (fd >= 0 && inotify_add_watch(fd, c->netns_dir, IN_DELETE) < 0) {
 		warn("inotify_add_watch(): %s, use a timer",
-		     strerror(errno));
+		     strerror_(errno));
 		close(fd);
 		fd = -1;
 	}
diff --git a/tcp.c b/tcp.c
index 1872ccb..ec433f7 100644
--- a/tcp.c
+++ b/tcp.c
@@ -516,7 +516,7 @@ static void tcp_timer_ctl(const struct ctx *c, struct tcp_tap_conn *conn)
 		fd = timerfd_create(CLOCK_MONOTONIC, 0);
 		if (fd == -1 || fd > FD_REF_MAX) {
 			flow_dbg(conn, "failed to get timer: %s",
-				 strerror(errno));
+				 strerror_(errno));
 			if (fd > -1)
 				close(fd);
 			conn->timer = -1;
@@ -526,7 +526,7 @@ static void tcp_timer_ctl(const struct ctx *c, struct tcp_tap_conn *conn)
 
 		if (epoll_ctl(c->epollfd, EPOLL_CTL_ADD, conn->timer, &ev)) {
 			flow_dbg(conn, "failed to add timer: %s",
-				 strerror(errno));
+				 strerror_(errno));
 			close(conn->timer);
 			conn->timer = -1;
 			return;
@@ -551,7 +551,7 @@ static void tcp_timer_ctl(const struct ctx *c, struct tcp_tap_conn *conn)
 		 (unsigned long long)it.it_value.tv_nsec / 1000 / 1000);
 
 	if (timerfd_settime(conn->timer, 0, &it, NULL))
-		flow_err(conn, "failed to set timer: %s", strerror(errno));
+		flow_err(conn, "failed to set timer: %s", strerror_(errno));
 }
 
 /**
@@ -1307,7 +1307,7 @@ int tcp_conn_sock(const struct ctx *c, sa_family_t af)
 		return s;
 
 	err("TCP: Unable to open socket for new connection: %s",
-	    strerror(-s));
+	    strerror_(-s));
 	return -1;
 }
 
@@ -1360,7 +1360,7 @@ static void tcp_bind_outbound(const struct ctx *c,
 			flow_dbg(conn,
 				 "Can't bind TCP outbound socket to %s:%hu: %s",
 				 inany_ntop(&tgt->oaddr, sstr, sizeof(sstr)),
-				 tgt->oport, strerror(errno));
+				 tgt->oport, strerror_(errno));
 		}
 	}
 
@@ -1371,7 +1371,7 @@ static void tcp_bind_outbound(const struct ctx *c,
 				       strlen(c->ip4.ifname_out))) {
 				flow_dbg(conn, "Can't bind IPv4 TCP socket to"
 					 " interface %s: %s", c->ip4.ifname_out,
-					 strerror(errno));
+					 strerror_(errno));
 			}
 		}
 	} else if (bind_sa.sa_family == AF_INET6) {
@@ -1381,7 +1381,7 @@ static void tcp_bind_outbound(const struct ctx *c,
 				       strlen(c->ip6.ifname_out))) {
 				flow_dbg(conn, "Can't bind IPv6 TCP socket to"
 					 " interface %s: %s", c->ip6.ifname_out,
-					 strerror(errno));
+					 strerror_(errno));
 			}
 		}
 	}
@@ -2113,7 +2113,7 @@ void tcp_timer_handler(const struct ctx *c, union epoll_ref ref)
 	 * and we just set the timer to a new point in the future: discard it.
 	 */
 	if (timerfd_gettime(conn->timer, &check_armed))
-		flow_err(conn, "failed to read timer: %s", strerror(errno));
+		flow_err(conn, "failed to read timer: %s", strerror_(errno));
 
 	if (check_armed.it_value.tv_sec || check_armed.it_value.tv_nsec)
 		return;
@@ -2154,7 +2154,7 @@ void tcp_timer_handler(const struct ctx *c, union epoll_ref ref)
 		 */
 		if (timerfd_settime(conn->timer, 0, &new, &old))
 			flow_err(conn, "failed to set timer: %s",
-				 strerror(errno));
+				 strerror_(errno));
 
 		if (old.it_value.tv_sec == ACT_TIMEOUT) {
 			flow_dbg(conn, "activity timeout");
@@ -2422,13 +2422,13 @@ static void tcp_sock_refill_init(const struct ctx *c)
 		int rc = tcp_sock_refill_pool(c, init_sock_pool4, AF_INET);
 		if (rc < 0)
 			warn("TCP: Error refilling IPv4 host socket pool: %s",
-			     strerror(-rc));
+			     strerror_(-rc));
 	}
 	if (c->ifi6) {
 		int rc = tcp_sock_refill_pool(c, init_sock_pool6, AF_INET6);
 		if (rc < 0)
 			warn("TCP: Error refilling IPv6 host socket pool: %s",
-			     strerror(-rc));
+			     strerror_(-rc));
 	}
 }
 
diff --git a/tcp_splice.c b/tcp_splice.c
index 93f8bce..3a0f868 100644
--- a/tcp_splice.c
+++ b/tcp_splice.c
@@ -160,7 +160,7 @@ static int tcp_splice_epoll_ctl(const struct ctx *c,
 	if (epoll_ctl(c->epollfd, m, conn->s[0], &ev[0]) ||
 	    epoll_ctl(c->epollfd, m, conn->s[1], &ev[1])) {
 		int ret = -errno;
-		flow_err(conn, "ERROR on epoll_ctl(): %s", strerror(errno));
+		flow_err(conn, "ERROR on epoll_ctl(): %s", strerror_(errno));
 		return ret;
 	}
 
@@ -314,7 +314,7 @@ static int tcp_splice_connect_finish(const struct ctx *c,
 		if (conn->pipe[sidei][0] < 0) {
 			if (pipe2(conn->pipe[sidei], O_NONBLOCK | O_CLOEXEC)) {
 				flow_err(conn, "cannot create %d->%d pipe: %s",
-					 sidei, !sidei, strerror(errno));
+					 sidei, !sidei, strerror_(errno));
 				conn_flag(c, conn, CLOSING);
 				return -EIO;
 			}
@@ -370,7 +370,7 @@ static int tcp_splice_connect(const struct ctx *c, struct tcp_splice_conn *conn)
 	if (connect(conn->s[1], &sa.sa, sl)) {
 		if (errno != EINPROGRESS) {
 			flow_trace(conn, "Couldn't connect socket for splice: %s",
-				   strerror(errno));
+				   strerror_(errno));
 			return -errno;
 		}
 
@@ -469,10 +469,10 @@ void tcp_splice_sock_handler(struct ctx *c, union epoll_ref ref,
 		rc = getsockopt(ref.fd, SOL_SOCKET, SO_ERROR, &err, &sl);
 		if (rc)
 			flow_err(conn, "Error retrieving SO_ERROR: %s",
-				 strerror(errno));
+				 strerror_(errno));
 		else
 			flow_trace(conn, "Error event on socket: %s",
-				   strerror(err));
+				   strerror_(err));
 
 		goto close;
 	}
@@ -551,7 +551,7 @@ eintr:
 					       &lowat, sizeof(lowat))) {
 					flow_trace(conn,
 						   "Setting SO_RCVLOWAT %i: %s",
-						   lowat, strerror(errno));
+						   lowat, strerror_(errno));
 				} else {
 					conn_flag(c, conn, lowat_set_flag);
 					conn_flag(c, conn, lowat_act_flag);
@@ -696,13 +696,13 @@ static int tcp_sock_refill_ns(void *arg)
 		int rc = tcp_sock_refill_pool(c, ns_sock_pool4, AF_INET);
 		if (rc < 0)
 			warn("TCP: Error refilling IPv4 ns socket pool: %s",
-			     strerror(-rc));
+			     strerror_(-rc));
 	}
 	if (c->ifi6) {
 		int rc = tcp_sock_refill_pool(c, ns_sock_pool6, AF_INET6);
 		if (rc < 0)
 			warn("TCP: Error refilling IPv6 ns socket pool: %s",
-			     strerror(-rc));
+			     strerror_(-rc));
 	}
 
 	return 0;
diff --git a/udp.c b/udp.c
index c89f031..923cc38 100644
--- a/udp.c
+++ b/udp.c
@@ -453,7 +453,7 @@ static int udp_sock_recverr(int s)
 
 	/* TODO: When possible propagate and otherwise handle errors */
 	debug("%s error on UDP socket %i: %s",
-	      str_ee_origin(ee), s, strerror(ee->ee_errno));
+	      str_ee_origin(ee), s, strerror_(ee->ee_errno));
 
 	return 1;
 }
@@ -492,7 +492,7 @@ int udp_sock_errs(const struct ctx *c, int s, uint32_t events)
 	}
 
 	if (err) {
-		debug("Unqueued error on UDP socket %i: %s", s, strerror(err));
+		debug("Unqueued error on UDP socket %i: %s", s, strerror_(err));
 		n_err++;
 	}
 
diff --git a/udp_flow.c b/udp_flow.c
index c8fdb5f..343caae 100644
--- a/udp_flow.c
+++ b/udp_flow.c
@@ -95,7 +95,7 @@ static flow_sidx_t udp_flow_new(const struct ctx *c, union flow *flow,
 		if (uflow->s[INISIDE] < 0) {
 			flow_err(uflow,
 				 "Couldn't duplicate listening socket: %s",
-				 strerror(errno));
+				 strerror_(errno));
 			goto cancel;
 		}
 	}
@@ -115,14 +115,14 @@ static flow_sidx_t udp_flow_new(const struct ctx *c, union flow *flow,
 		if (uflow->s[TGTSIDE] < 0) {
 			flow_dbg(uflow,
 				 "Couldn't open socket for spliced flow: %s",
-				 strerror(errno));
+				 strerror_(errno));
 			goto cancel;
 		}
 
 		if (flowside_connect(c, uflow->s[TGTSIDE], tgtpif, tgt) < 0) {
 			flow_dbg(uflow,
 				 "Couldn't connect flow socket: %s",
-				 strerror(errno));
+				 strerror_(errno));
 			goto cancel;
 		}
 
@@ -144,7 +144,7 @@ static flow_sidx_t udp_flow_new(const struct ctx *c, union flow *flow,
 		} else if (errno != EAGAIN) {
 			flow_err(uflow,
 				 "Unexpected error discarding datagrams: %s",
-				 strerror(errno));
+				 strerror_(errno));
 		}
 	}
 
diff --git a/util.c b/util.c
index 55cae3f..11973c4 100644
--- a/util.c
+++ b/util.c
@@ -90,7 +90,7 @@ int sock_l4_sa(const struct ctx *c, enum epoll_type type,
 
 	ret = -errno;
 	if (fd < 0) {
-		warn("L4 socket: %s", strerror(-ret));
+		warn("L4 socket: %s", strerror_(-ret));
 		return ret;
 	}
 
@@ -162,7 +162,7 @@ int sock_l4_sa(const struct ctx *c, enum epoll_type type,
 
 	if (type == EPOLL_TYPE_TCP_LISTEN && listen(fd, 128) < 0) {
 		ret = -errno;
-		warn("TCP socket listen: %s", strerror(-ret));
+		warn("TCP socket listen: %s", strerror_(-ret));
 		close(fd);
 		return ret;
 	}
@@ -171,7 +171,7 @@ int sock_l4_sa(const struct ctx *c, enum epoll_type type,
 	ev.data.u64 = ref.u64;
 	if (epoll_ctl(c->epollfd, EPOLL_CTL_ADD, fd, &ev) == -1) {
 		ret = -errno;
-		warn("L4 epoll_ctl: %s", strerror(-ret));
+		warn("L4 epoll_ctl: %s", strerror_(-ret));
 		return ret;
 	}
 
diff --git a/util.h b/util.h
index 41bbd60..3fa1d12 100644
--- a/util.h
+++ b/util.h
@@ -274,6 +274,38 @@ static inline bool mod_between(unsigned x, unsigned i, unsigned j, unsigned m)
 
 void raw_random(void *buf, size_t buflen);
 
+/*
+ * Starting from glibc 2.40.9000 and commit 25a5eb4010df ("string: strerror,
+ * strsignal cannot use buffer after dlmopen (bug 32026)"), strerror() needs
+ * getrandom(2) and brk(2) as it allocates memory for the locale-translated
+ * error description, but our seccomp profiles forbid both.
+ *
+ * Use the strerror_() wrapper instead, calling into strerrordesc_np() to get
+ * a static untranslated string. It's a GNU implementation, but also defined by
+ * bionic.
+ *
+ * If strerrordesc_np() is not defined (e.g. musl), call strerror(). C libraries
+ * not defining strerrordesc_np() are expected to provide strerror()
+ * implementations that are simple enough for us to call.
+ */
+__attribute__ ((weak)) const char *strerrordesc_np(int errnum);
+
+/**
+ * strerror_() - strerror() wrapper calling strerrordesc_np() if available
+ * @errnum:	Error code
+ *
+ * Return: error description string
+ */
+static inline const char *strerror_(int errnum)
+{
+	if (strerrordesc_np)
+		return strerrordesc_np(errnum);
+
+	return strerror(errnum);
+}
+
+#define strerror(x) @ "Don't call strerror() directly, use strerror_() instead"
+
 /*
  * Workarounds for https://github.com/llvm/llvm-project/issues/58992
  *

From e5ba8adef71ec53e192373ed1267dc338719dda0 Mon Sep 17 00:00:00 2001
From: Stefano Brivio <sbrivio@redhat.com>
Date: Thu, 12 Dec 2024 10:50:48 +0100
Subject: [PATCH 162/382] README: Mark vhost-user as supported

Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 752e59f..54fed07 100644
--- a/README.md
+++ b/README.md
@@ -321,7 +321,7 @@ speeding up local connections, and usually requiring NAT. _pasta_:
   protocol
 * ✅ 4 to 50 times IPv4 TCP throughput of existing, conceptually similar
   solutions depending on MTU (UDP and IPv6 hard to compare)
-* 🛠 [_vhost-user_ support](https://bugs.passt.top/show_bug.cgi?id=25) for
+* ✅ [_vhost-user_ support](https://bugs.passt.top/show_bug.cgi?id=25) for
   maximum one copy on every data path and lower request-response latency
 * ⌚ [multithreading](https://bugs.passt.top/show_bug.cgi?id=13)
 * ⌚ [raw IP socket support](https://bugs.passt.top/show_bug.cgi?id=14) if

From 2385b69a66807e32dca5ae17ab64686888e4c682 Mon Sep 17 00:00:00 2001
From: Stefano Brivio <sbrivio@redhat.com>
Date: Thu, 19 Dec 2024 17:27:44 +0100
Subject: [PATCH 163/382] Makefile: Report error and stop if we can't set
 TARGET

I don't think it's necessarily productive to check all the possible
error conditions in the Makefile, but this one is annoying: issue
'make' without a C compiler, then install one, and build again.

Then run passt and it will mysteriously terminate on epoll_wait(),
because seccomp.h is good enough to build against, but the resulting
seccomp filter doesn't allow any system call. Not really fun to debug.

Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
---
 Makefile | 1 +
 1 file changed, 1 insertion(+)

diff --git a/Makefile b/Makefile
index 1fce737..464eef1 100644
--- a/Makefile
+++ b/Makefile
@@ -16,6 +16,7 @@ VERSION ?= $(shell git describe --tags HEAD 2>/dev/null || echo "unknown\ versio
 DUAL_STACK_SOCKETS := 1
 
 TARGET ?= $(shell $(CC) -dumpmachine)
+$(if $(TARGET),,$(error Failed to get target architecture))
 # Get 'uname -m'-like architecture description for target
 TARGET_ARCH := $(firstword $(subst -, ,$(TARGET)))
 TARGET_ARCH := $(patsubst [:upper:],[:lower:],$(TARGET_ARCH))

From 324233bd9b8baa3ec13a7425ea3ec7145e3ce645 Mon Sep 17 00:00:00 2001
From: Stefano Brivio <sbrivio@redhat.com>
Date: Fri, 20 Dec 2024 12:40:29 +0100
Subject: [PATCH 164/382] udp_flow: Don't block multicast and broadcast
 messages

It was reported that SSDP notifications sent from a container (with
e.g. minidlna) stopped appearing on the network starting from commit
1db4f773e87f ("udp: Improve detail of UDP endpoint sanity checking").
As a minimal reproducer using minidlnad(8):

  $ mkdir /tmp/minidlna
  $ cat conf
  media_dir=/tmp/minidlna
  db_dir=/tmp/minidlna
  $ ./pasta -d --config-net -- sh -c '/usr/sbin/minidlnad -p 31337 -S -f conf -P /dev/null & (sleep 1; killall minidlnad)'

[...]

  1.0327: Flow 0 (NEW): FREE -> NEW
  1.0327: Flow 0 (INI): NEW -> INI
  1.0327: Flow 0 (INI): TAP [88.198.0.164]:54185 -> [239.255.255.250]:1900 => ?
  1.0327: Flow 0 (INI): Invalid endpoint on UDP packet
  1.0327: Flow 0 (FREE): INI -> FREE
  1.0328: Flow 0 (FREE): TAP [88.198.0.164]:54185 -> [239.255.255.250]:1900 => ?
  1.0328: Dropping datagram with no flow TAP 88.198.0.164:54185 -> 239.255.255.250:1900

This is an actual regression as there's no particular reason to block
outbound multicast UDP packets.

And even if we don't handle multicast groups in any particular way
(https://bugs.passt.top/show_bug.cgi?id=2, "Add IGMP/MLD proxy"),
there's no reason to block inbound multicast or broadcast packets
either, should they ever be somehow delivered to passt or pasta.

Let multicast and broadcast packets through, refusing only to
establish flows with unspecified endpoint, as those would actually
cause havoc in the flow table.

IP-wise, SSDP notifications look like this (after this patch), inside
and outside:

  $ pasta -p /tmp/minidlna.pcap --config-net -- sh -c '/usr/sbin/minidlnad -p 31337 -S -f minidlna.conf -P /dev/null & (sleep 1; killall minidlnad)'

[...]

  $ tshark -a packets:1 -r /tmp/minidlna.pcap ssdp
      2   0.074808 88.198.0.164 ? 239.255.255.250 SSDP 200 NOTIFY * HTTP/1.1

  # tshark -i ens3 -a packets:1 multicast 2>/dev/null
      1 0.000000000 88.198.0.164 ? 239.255.255.250 SSDP 200 NOTIFY * HTTP/1.1
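
A tiny standalone illustration of the relaxed check, IPv4 only (the
helper is a stand-in, not passt's inany_is_unspecified()):

  #include <arpa/inet.h>
  #include <stdbool.h>
  #include <stdio.h>

  static bool is_unspecified(struct in_addr a)
  {
  	return a.s_addr == htonl(INADDR_ANY);	/* 0.0.0.0 */
  }

  int main(void)
  {
  	const char *addrs[] = {
  		"239.255.255.250", "255.255.255.255", "0.0.0.0"
  	};

  	for (unsigned i = 0; i < sizeof(addrs) / sizeof(addrs[0]); i++) {
  		struct in_addr a;

  		inet_pton(AF_INET, addrs[i], &a);
  		printf("%-17s %s\n", addrs[i],
  		       is_unspecified(a) ? "no flow" : "flow allowed");
  	}
  	return 0;
  }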

Link: https://github.com/containers/podman/issues/24871
Fixes: 1db4f773e87f ("udp: Improve detail of UDP endpoint sanity checking")
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 udp_flow.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/udp_flow.c b/udp_flow.c
index 343caae..9fd7d06 100644
--- a/udp_flow.c
+++ b/udp_flow.c
@@ -209,7 +209,7 @@ flow_sidx_t udp_flow_from_sock(const struct ctx *c, union epoll_ref ref,
 
 	if (!inany_is_unicast(&ini->eaddr) ||
 	    ini->eport == 0 || ini->oport == 0) {
-		/* In principle ini->oddr also must be unicast, but when we've
+		/* In principle ini->oddr also must be specified, but when we've
 		 * been initiated from a socket bound to 0.0.0.0 or ::, we don't
 		 * know our address, so we have to leave it unpopulated.
 		 */
@@ -267,8 +267,8 @@ flow_sidx_t udp_flow_from_tap(const struct ctx *c,
 	ini = flow_initiate_af(flow, PIF_TAP, af, saddr, srcport,
 			       daddr, dstport);
 
-	if (!inany_is_unicast(&ini->eaddr) || ini->eport == 0 ||
-	    !inany_is_unicast(&ini->oaddr) || ini->oport == 0) {
+	if (inany_is_unspecified(&ini->eaddr) || ini->eport == 0 ||
+	    inany_is_unspecified(&ini->oaddr) || ini->oport == 0) {
 		flow_dbg(flow, "Invalid endpoint on UDP packet");
 		flow_alloc_cancel(flow);
 		return FLOW_SIDX_NONE;

From 898e853635a79e33917bb4646ff1fb5fc3a92997 Mon Sep 17 00:00:00 2001
From: Laurent Vivier <lvivier@redhat.com>
Date: Thu, 19 Dec 2024 12:13:52 +0100
Subject: [PATCH 165/382] virtio: Use const pointer for vu_dev

Some virtio functions don't modify the vu_dev structure they are given,
so take a const pointer to it.

Signed-off-by: Laurent Vivier <lvivier@redhat.com>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 virtio.c    | 14 +++++++++-----
 virtio.h    |  2 +-
 vu_common.c |  2 +-
 vu_common.h |  2 +-
 4 files changed, 12 insertions(+), 8 deletions(-)

diff --git a/virtio.c b/virtio.c
index a76de5e..625bac3 100644
--- a/virtio.c
+++ b/virtio.c
@@ -92,7 +92,8 @@
  *
  * Return: virtual address in our address space of the guest physical address
  */
-static void *vu_gpa_to_va(struct vu_dev *dev, uint64_t *plen, uint64_t guest_addr)
+static void *vu_gpa_to_va(const struct vu_dev *dev, uint64_t *plen,
+			  uint64_t guest_addr)
 {
 	unsigned int i;
 
@@ -210,7 +211,8 @@ static void virtqueue_get_head(const struct vu_virtq *vq,
  *
  * Return: -1 if there is an error, 0 otherwise
  */
-static int virtqueue_read_indirect_desc(struct vu_dev *dev, struct vring_desc *desc,
+static int virtqueue_read_indirect_desc(const struct vu_dev *dev,
+					struct vring_desc *desc,
 					uint64_t addr, size_t len)
 {
 	uint64_t read_len;
@@ -390,7 +392,7 @@ static inline void vring_set_avail_event(const struct vu_virtq *vq,
  *
  * Return: false on error, true otherwise
  */
-static bool virtqueue_map_desc(struct vu_dev *dev,
+static bool virtqueue_map_desc(const struct vu_dev *dev,
 			       unsigned int *p_num_sg, struct iovec *iov,
 			       unsigned int max_num_sg,
 			       uint64_t pa, size_t sz)
@@ -426,7 +428,8 @@ static bool virtqueue_map_desc(struct vu_dev *dev,
  *
  * Return: -1 if there is an error, 0 otherwise
  */
-static int vu_queue_map_desc(struct vu_dev *dev, struct vu_virtq *vq, unsigned int idx,
+static int vu_queue_map_desc(const struct vu_dev *dev,
+			     struct vu_virtq *vq, unsigned int idx,
 			     struct vu_virtq_element *elem)
 {
 	const struct vring_desc *desc = vq->vring.desc;
@@ -504,7 +507,8 @@ static int vu_queue_map_desc(struct vu_dev *dev, struct vu_virtq *vq, unsigned i
  *
  * Return: -1 if there is an error, 0 otherwise
  */
-int vu_queue_pop(struct vu_dev *dev, struct vu_virtq *vq, struct vu_virtq_element *elem)
+int vu_queue_pop(const struct vu_dev *dev, struct vu_virtq *vq,
+		 struct vu_virtq_element *elem)
 {
 	unsigned int head;
 	int ret;
diff --git a/virtio.h b/virtio.h
index 6410d60..0af259d 100644
--- a/virtio.h
+++ b/virtio.h
@@ -170,7 +170,7 @@ static inline bool vu_has_protocol_feature(const struct vu_dev *vdev,
 
 bool vu_queue_empty(struct vu_virtq *vq);
 void vu_queue_notify(const struct vu_dev *dev, struct vu_virtq *vq);
-int vu_queue_pop(struct vu_dev *dev, struct vu_virtq *vq,
+int vu_queue_pop(const struct vu_dev *dev, struct vu_virtq *vq,
 		 struct vu_virtq_element *elem);
 void vu_queue_detach_element(struct vu_virtq *vq);
 void vu_queue_unpop(struct vu_virtq *vq);
diff --git a/vu_common.c b/vu_common.c
index 299b5a3..6d365be 100644
--- a/vu_common.c
+++ b/vu_common.c
@@ -73,7 +73,7 @@ void vu_init_elem(struct vu_virtq_element *elem, struct iovec *iov, int elem_cnt
  *
  * Return: number of elements used to contain the frame
  */
-int vu_collect(struct vu_dev *vdev, struct vu_virtq *vq,
+int vu_collect(const struct vu_dev *vdev, struct vu_virtq *vq,
 	       struct vu_virtq_element *elem, int max_elem,
 	       size_t size, size_t *frame_size)
 {
diff --git a/vu_common.h b/vu_common.h
index 901d972..bd70faf 100644
--- a/vu_common.h
+++ b/vu_common.h
@@ -46,7 +46,7 @@ static inline void vu_set_element(struct vu_virtq_element *elem,
 
 void vu_init_elem(struct vu_virtq_element *elem, struct iovec *iov,
 		  int elem_cnt);
-int vu_collect(struct vu_dev *vdev, struct vu_virtq *vq,
+int vu_collect(const struct vu_dev *vdev, struct vu_virtq *vq,
 	       struct vu_virtq_element *elem, int max_elem, size_t size,
 	       size_t *frame_size);
 void vu_set_vnethdr(const struct vu_dev *vdev,

From 3876fc780d01870040343cdab7da3f14f53272d5 Mon Sep 17 00:00:00 2001
From: Stefano Brivio <sbrivio@redhat.com>
Date: Fri, 27 Dec 2024 11:40:19 +0100
Subject: [PATCH 166/382] seccomp: Unconditionally allow accept(2) even if
 accept4(2) is present

On Alpine Linux 3.21, passt aborts right away as soon as QEMU connects
to it.

Most likely, this has always been the case with musl, because since
musl commit dc01e2cbfb29 ("add fallback emulation for accept4 on old
kernels"), accept4() without flags is implemented using accept().

However, I guess that nobody realised earlier because it's typically
pasta(1) being used on musl-based distributions, and the only place
where we call accept4() without flags is tap_listen_handler().

Add accept() to the list of allowed system calls regardless of the
presence of accept4().
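
For illustration, a rough, compilable sketch of the fallback described
above (an assumption about musl's behaviour, not its actual source):
with no flags, the wrapper can issue the plain accept() system call,
which a filter allowing only accept4() would reject.

  #define _GNU_SOURCE
  #include <sys/socket.h>

  /* Hypothetical wrapper, for illustration only */
  static int xaccept4(int fd, struct sockaddr *addr, socklen_t *len,
                      int flags)
  {
          if (!flags)     /* no flags requested: accept() is equivalent */
                  return accept(fd, addr, len);

          return accept4(fd, addr, len, flags);
  }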

Reported-by: NN708 <nn708@outlook.com>
Link: https://bugs.passt.top/show_bug.cgi?id=106
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
---
 passt.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/passt.c b/passt.c
index 957f3d0..1a0c404 100644
--- a/passt.c
+++ b/passt.c
@@ -180,7 +180,7 @@ void exit_handler(int signal)
  * #syscalls socket getsockopt setsockopt s390x:socketcall i686:socketcall close
  * #syscalls bind connect recvfrom sendto shutdown
  * #syscalls arm:recv ppc64le:recv arm:send ppc64le:send
- * #syscalls accept4|accept listen epoll_ctl epoll_wait|epoll_pwait epoll_pwait
+ * #syscalls accept4 accept listen epoll_ctl epoll_wait|epoll_pwait epoll_pwait
  * #syscalls clock_gettime arm:clock_gettime64 i686:clock_gettime64
  */
 int main(int argc, char **argv)

From 725acd111ba340122f2bb0601e373534eb4b5ed8 Mon Sep 17 00:00:00 2001
From: Stefano Brivio <sbrivio@redhat.com>
Date: Mon, 6 Jan 2025 10:10:29 +0100
Subject: [PATCH 167/382] tcp_splice: Set (again) TCP_NODELAY on both sides

In commit 7ecf69329787 ("pasta, tcp: Don't set TCP_CORK on spliced
sockets") I just assumed that we wouldn't benefit from disabling
Nagle's algorithm once we drop TCP_CORK (and its 200ms fixed delay).

It turns out that with some patterns, such as a PostgreSQL server in
a container receiving parameterised, short queries, pasta sees several
short inbound messages (Parse, Bind, Describe, Execute and Sync
commands, each in its own packet, with 5 to 49 bytes of TCP payload
each), usually reads them in two batches, and sends them out in
matching batches, for example:

  9165.2467:          pasta: epoll event on connected spliced TCP socket 117 (events: 0x00000001)
  9165.2468:          Flow 0 (TCP connection (spliced)): 76 from read-side call
  9165.2468:          Flow 0 (TCP connection (spliced)): 76 from write-side call (passed 524288)
  9165.2469:          pasta: epoll event on connected spliced TCP socket 117 (events: 0x00000001)
  9165.2470:          Flow 0 (TCP connection (spliced)): 15 from read-side call
  9165.2470:          Flow 0 (TCP connection (spliced)): 15 from write-side call (passed 524288)
  9165.2944:          pasta: epoll event on connected spliced TCP socket 118 (events: 0x00000001)

and the kernel delivers the first one, waits for acknowledgement from
the receiver, then delivers the second one. This adds very substantial
and unnecessary delay. It's usually a fixed ~40ms between the two
batches, which is clearly unacceptable for loopback connections.

In this example, the delay is shown by the timestamp of the response
from socket 118. The peer (server) doesn't actually take that long
(less than a millisecond), but it takes that long for the kernel to
deliver our request.

To avoid batching and delays, disable Nagle's algorithm by setting
TCP_NODELAY on both internal and external sockets: this way, we get
one inbound packet for each original message, we transfer them right
away, and the kernel delivers them to the process in the container as
they are, without delay.

We can do this safely as we don't care much about network utilisation
when there's in fact pretty much no network (loopback connections).

This is unfortunately not visible in the TCP request-response tests
from the test suite because, with smaller messages (we use one byte),
Nagle's algorithm doesn't even kick in. It's probably not trivial to
implement a universal test covering this case.

Fixes: 7ecf69329787 ("pasta, tcp: Don't set TCP_CORK on spliced sockets")
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 tcp_splice.c | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/tcp_splice.c b/tcp_splice.c
index 3a0f868..3a000ff 100644
--- a/tcp_splice.c
+++ b/tcp_splice.c
@@ -348,6 +348,7 @@ static int tcp_splice_connect(const struct ctx *c, struct tcp_splice_conn *conn)
 	uint8_t tgtpif = conn->f.pif[TGTSIDE];
 	union sockaddr_inany sa;
 	socklen_t sl;
+	int one = 1;
 
 	if (tgtpif == PIF_HOST)
 		conn->s[1] = tcp_conn_sock(c, af);
@@ -359,12 +360,21 @@ static int tcp_splice_connect(const struct ctx *c, struct tcp_splice_conn *conn)
 	if (conn->s[1] < 0)
 		return -1;
 
-	if (setsockopt(conn->s[1], SOL_TCP, TCP_QUICKACK,
-		       &((int){ 1 }), sizeof(int))) {
+	if (setsockopt(conn->s[1], SOL_TCP, TCP_QUICKACK, &one, sizeof(one))) {
 		flow_trace(conn, "failed to set TCP_QUICKACK on socket %i",
 			   conn->s[1]);
 	}
 
+	if (setsockopt(conn->s[0], SOL_TCP, TCP_NODELAY, &one, sizeof(one))) {
+		flow_trace(conn, "failed to set TCP_NODELAY on socket %i",
+			   conn->s[0]);
+	}
+
+	if (setsockopt(conn->s[1], SOL_TCP, TCP_NODELAY, &one, sizeof(one))) {
+		flow_trace(conn, "failed to set TCP_NODELAY on socket %i",
+			   conn->s[1]);
+	}
+
 	pif_sockaddr(c, &sa, &sl, tgtpif, &tgt->eaddr, tgt->eport);
 
 	if (connect(conn->s[1], &sa.sa, sl)) {

From 2c174f1fe8a5f1923b14cde703941d4daac39850 Mon Sep 17 00:00:00 2001
From: Laurent Vivier <lvivier@redhat.com>
Date: Thu, 9 Jan 2025 14:06:48 +0100
Subject: [PATCH 168/382] checksum: fix checksum with odd base address

csum_unfolded() must call csum_avx2() with a 32-byte aligned base address.

To be able to do that when the buffer is not correctly aligned, it
splits the buffer in two parts: the second part is 32-byte aligned and
can be handled by csum_avx2(), while the first, unaligned part is
checksummed with sum_16b().

A problem appears if the length of the first part is odd, because the
checksum is computed over 16-bit words.

If the length is odd, when the second part is computed, all words are
shifted by one byte, meaning the weight of the upper and lower bytes
is swapped.

For instance, with a 13-byte buffer:

bytes:

aa AA bb BB cc CC dd DD ee EE ff FF gg

16bit words:

AAaa BBbb CCcc DDdd EEee FFff 00gg

If we don't split the sequence, the checksum is:

AAaa + BBbb + CCcc + DDdd + EEee + FFff + 00gg

If we split the sequence with an even length for the first part:

(AAaa + BBbb) + (CCcc + DDdd + EEee + FFff + 00gg)

But if the first part has an odd length:

(AAaa + BBbb + 00cc) + (ddCC + eeDD + ffEE + ggFF)

To avoid the problem, do not call csum_avx2() if the first part can't
have an even length, and compute the checksum of the whole buffer
using sum_16b().

This is slower, but it can only happen if the buffer base address is
odd, which in turn can only happen if the binary is built using '-Os',
and that means we have chosen to prioritize size over speed.
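
A minimal 16-bit accumulation loop (just a sketch, not the actual
sum_16b()) makes the byte-weight swap easy to reproduce: sum the whole
buffer, then sum the two parts split at an odd offset, and compare the
folded results.

  #include <stddef.h>
  #include <stdint.h>

  /* Sum a buffer as little-endian 16-bit words, zero-padding an odd
   * tail byte (the "00gg" word above). Starting one byte into the
   * buffer builds every word from the "wrong" halves: ddCC, eeDD, ...
   */
  static uint32_t sum_16bit(const uint8_t *buf, size_t len)
  {
          uint32_t sum = 0;
          size_t i;

          for (i = 0; i + 1 < len; i += 2)
                  sum += ((uint32_t)buf[i + 1] << 8) | buf[i];

          if (len & 1)
                  sum += buf[len - 1];

          return sum;
  }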

Reported-by: Mike Jones <mike@mjones.io>
Link: https://bugs.passt.top/show_bug.cgi?id=108
Signed-off-by: Laurent Vivier <lvivier@redhat.com>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
[sbrivio: Added comment explaining why we check for pad & 1]
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 checksum.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/checksum.c b/checksum.c
index 1c4354d..b01e0fe 100644
--- a/checksum.c
+++ b/checksum.c
@@ -452,7 +452,8 @@ uint32_t csum_unfolded(const void *buf, size_t len, uint32_t init)
 	intptr_t align = ROUND_UP((intptr_t)buf, sizeof(__m256i));
 	unsigned int pad = align - (intptr_t)buf;
 
-	if (len < pad)
+	/* Don't mix sum_16b() and csum_avx2() with odd padding lengths */
+	if (pad & 1 || len < pad)
 		pad = len;
 
 	if (pad)

From f04b483d1509b852951fe1421ef6f6740c9f9a08 Mon Sep 17 00:00:00 2001
From: Stefano Brivio <sbrivio@redhat.com>
Date: Sat, 11 Jan 2025 00:46:51 +0100
Subject: [PATCH 169/382] test/pasta_podman: Run Podman tests on a single CPU
 thread

Increasingly often, I'm getting occasional failures of the same type
as https://github.com/containers/podman/issues/24147. I guess it
mostly depends on the system load.

It will be a while until I actually run tests on a kernel including
my fix for it, kernel commit a502ea6fa94b ("udp: Deal with race
between UDP socket address change and rehash"), so add a horrible
workaround using taskset(1) for the moment.

Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 test/pasta_podman/bats | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/pasta_podman/bats b/test/pasta_podman/bats
index 6b1c575..2f07be8 100644
--- a/test/pasta_podman/bats
+++ b/test/pasta_podman/bats
@@ -23,4 +23,4 @@ check	[ "__PASTA_BIN__" = "__WD__/pasta" ]
 
 test	Podman system test with bats
 
-host	PODMAN="__PODMAN__" CONTAINERS_HELPER_BINARY_DIR="__WD__" bats test/podman/test/system/505-networking-pasta.bats
+host	PODMAN="__PODMAN__" CONTAINERS_HELPER_BINARY_DIR="__WD__" taskset -c 1 bats test/podman/test/system/505-networking-pasta.bats

From 1b95bd6fa1148f3609bebf7b2bcd6d47376e61a6 Mon Sep 17 00:00:00 2001
From: Laurent Vivier <lvivier@redhat.com>
Date: Wed, 15 Jan 2025 17:22:30 +0100
Subject: [PATCH 170/382] vhost_user: fix multibuffer from linux

Under some conditions, Linux can provide several buffers in the same
element (multiple entries in the iovec array).

I didn't identify what changed between the guest kernel that provides
one buffer and the one that provides several (it doesn't seem to be a
kernel or configuration change).

Fix the following assert:

ASSERTION FAILED in virtqueue_map_desc (virtio.c:402): num_sg < max_num_sg

What I can see is that the buffer can be split into two iovecs:
  - vnet header
  - packet data

This change handles this special case, but the real fix will be to
allow tap_add_packet() to take an iovec array.
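
As a sketch of the two layouts handled below (names are illustrative,
not passt's), picking the packet start and length for each case looks
like this:

  #include <stddef.h>
  #include <sys/uio.h>

  /* Hypothetical helper, for illustration only */
  static void pkt_from_sg(const struct iovec *sg, unsigned int out_num,
                          size_t hdrlen, char **data, size_t *len)
  {
          if (out_num == 1) {             /* [ vnet hdr | packet data ] */
                  *data = (char *)sg[0].iov_base + hdrlen;
                  *len = sg[0].iov_len - hdrlen;
          } else {                        /* [ vnet hdr ] [ packet data ] */
                  *data = (char *)sg[1].iov_base;
                  *len = sg[1].iov_len;
          }
  }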

Signed-off-by: Laurent Vivier <lvivier@redhat.com>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 vu_common.c | 28 ++++++++++++++++++++++------
 1 file changed, 22 insertions(+), 6 deletions(-)

diff --git a/vu_common.c b/vu_common.c
index 6d365be..431fba6 100644
--- a/vu_common.c
+++ b/vu_common.c
@@ -18,6 +18,8 @@
 #include "pcap.h"
 #include "vu_common.h"
 
+#define VU_MAX_TX_BUFFER_NB	2
+
 /**
  * vu_packet_check_range() - Check if a given memory zone is contained in
  * 			     a mapped guest memory region
@@ -168,10 +170,15 @@ static void vu_handle_tx(struct vu_dev *vdev, int index,
 
 	count = 0;
 	out_sg_count = 0;
-	while (count < VIRTQUEUE_MAX_SIZE) {
+	while (count < VIRTQUEUE_MAX_SIZE &&
+	       out_sg_count + VU_MAX_TX_BUFFER_NB <= VIRTQUEUE_MAX_SIZE) {
 		int ret;
 
-		vu_set_element(&elem[count], &out_sg[out_sg_count], NULL);
+		elem[count].out_num = VU_MAX_TX_BUFFER_NB;
+		elem[count].out_sg = &out_sg[out_sg_count];
+		elem[count].in_num = 0;
+		elem[count].in_sg = NULL;
+
 		ret = vu_queue_pop(vdev, vq, &elem[count]);
 		if (ret < 0)
 			break;
@@ -181,11 +188,20 @@ static void vu_handle_tx(struct vu_dev *vdev, int index,
 			warn("virtio-net transmit queue contains no out buffers");
 			break;
 		}
-		ASSERT(elem[count].out_num == 1);
+		if (elem[count].out_num == 1) {
+			tap_add_packet(vdev->context,
+				       elem[count].out_sg[0].iov_len - hdrlen,
+				       (char *)elem[count].out_sg[0].iov_base +
+				        hdrlen);
+		} else {
+			/* vnet header can be in a separate iovec */
+			ASSERT(elem[count].out_num == 2);
+			ASSERT(elem[count].out_sg[0].iov_len == (size_t)hdrlen);
+			tap_add_packet(vdev->context,
+				       elem[count].out_sg[1].iov_len,
+				       (char *)elem[count].out_sg[1].iov_base);
+		}
 
-		tap_add_packet(vdev->context,
-			       elem[count].out_sg[0].iov_len - hdrlen,
-			       (char *)elem[count].out_sg[0].iov_base + hdrlen);
 		count++;
 	}
 	tap_handler(vdev->context, now);

From 707f77b0a93160c8695b3cf5bfd7c24d9992b106 Mon Sep 17 00:00:00 2001
From: Stefano Brivio <sbrivio@redhat.com>
Date: Thu, 16 Jan 2025 20:06:59 +0100
Subject: [PATCH 171/382] tcp: Fix ACK sequence getting out of sync on EPOLLOUT
 wake-up
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

In the next patches, I'm extending the usage of STALLED to a few more
cases.

Doing so revealed this issue: if we set STALLED and, consequently,
EPOLLOUT (which is wrong, fixed later) right after we set a connection
to ESTABLISHED (which also happened by mistake while I was preparing
another change), with the guest sending data together with the final
ACK in the handshake, say:

  41.3661: vhost-user: got kick_data: 0000000000000001 idx: 1
  41.3662: Flow 2 (NEW): FREE -> NEW
  41.3663: Flow 2 (INI): NEW -> INI
  41.3663: Flow 2 (INI): TAP [2a01:4f8:222:904::2]:52536 -> [2001:db8:9a55::1]:10003 => ?
  41.3665: Flow 2 (TGT): INI -> TGT
  41.3666: Flow 2 (TGT): TAP [2a01:4f8:222:904::2]:52536 -> [2001:db8:9a55::1]:10003 => HOST [::]:0 -> [2001:db8:9a55::1]:10003
  41.3667: Flow 2 (TCP connection): TGT -> TYPED
  41.3667: Flow 2 (TCP connection): TAP [2a01:4f8:222:904::2]:52536 -> [2001:db8:9a55::1]:10003 => HOST [::]:0 -> [2001:db8:9a55::1]:10003
  41.3669: Flow 2 (TCP connection): TAP_SYN_RCVD: CLOSED -> SYN_SENT
  41.3670: Flow 2 (TCP connection): Side 0 hash table insert: bucket: 339814
  41.3672: Flow 2 (TCP connection): TYPED -> ACTIVE
  41.3673: Flow 2 (TCP connection): TAP [2a01:4f8:222:904::2]:52536 -> [2001:db8:9a55::1]:10003 => HOST [::]:0 -> [2001:db8:9a55::1]:10003
  41.3674: Flow 2 (TCP connection): TAP_SYN_ACK_SENT: SYN_SENT -> SYN_RCVD
  41.3675: Flow 2 (TCP connection): ACK_FROM_TAP_DUE
  41.3675: Flow 2 (TCP connection): timer expires in 10.000s
  41.3675: vhost-user: got kick_data: 0000000000000001 idx: 1
  41.3676: Flow 2 (TCP connection): ACK_FROM_TAP_DUE dropped
  41.3676: Flow 2 (TCP connection): ESTABLISHED: SYN_RCVD -> ESTABLISHED
  41.3678: Flow 2 (TCP connection): STALLED
  41.3678: vhost-user: got kick_data: 0000000000000002 idx: 1
  41.3679: Flow 2 (TCP connection): ACK_TO_TAP_DUE
  41.3680: Flow 2 (TCP connection): timer expires in 0.010s
  41.3680: Flow 2 (TCP connection): STALLED dropped

we'll immediately get an EPOLLOUT event, call tcp_update_seqack_wnd(),
but ignore the window and ACK sequence updates. At this point, we
think we acknowledged all the data to the guest (but we didn't), and
we'll happily proceed to clear the ACK_TO_TAP_DUE flag:

  41.3780: Flow 2 (TCP connection): ACK_TO_TAP_DUE dropped
  41.3780: Flow 2 (TCP connection): timer expires in 7200.000s
  41.5754: vhost-user: got kick_data: 0000000000000001 idx: 1
  41.9956: vhost-user: got kick_data: 0000000000000001 idx: 1
  42.8275: vhost-user: got kick_data: 0000000000000001 idx: 1

while the guest starts retransmitting that data desperately, without
ever getting an ACK segment from us:

   1433  38.746353 2a01:4f8:222:904::2 → 2001:db8:9a55::1 94 TCP 54312 → 10003 [SYN] Seq=0 Win=65460 Len=0 MSS=65460 SACK_PERM TSval=1089126192 TSecr=0 WS=128
   1434  38.747357 2001:db8:9a55::1 → 2a01:4f8:222:904::2 82 TCP 10003 → 54312 [SYN, ACK] Seq=0 Ack=1 Win=65535 Len=0 MSS=61440 WS=256
   1435  38.747500 2a01:4f8:222:904::2 → 2001:db8:9a55::1 74 TCP 54312 → 10003 [ACK] Seq=1 Ack=1 Win=65536 Len=0
   1436  38.747769 2a01:4f8:222:904::2 → 2001:db8:9a55::1 8266 TCP 54312 → 10003 [PSH, ACK] Seq=1 Ack=1 Win=65536 Len=8192
   1437  38.747798 2a01:4f8:222:904::2 → 2001:db8:9a55::1 32841 TCP 54312 → 10003 [ACK] Seq=8193 Ack=1 Win=65536 Len=32767
   1438  38.748049 2001:db8:9a55::1 → 2a01:4f8:222:904::2 74 TCP [TCP Window Update] 10003 → 54312 [ACK] Seq=1 Ack=1 Win=65280 Len=0
   1439  38.954044 2a01:4f8:222:904::2 → 2001:db8:9a55::1 8266 TCP [TCP Retransmission] 54312 → 10003 [PSH, ACK] Seq=1 Ack=1 Win=65536 Len=8192
   1440  39.370096 2a01:4f8:222:904::2 → 2001:db8:9a55::1 8266 TCP [TCP Retransmission] 54312 → 10003 [PSH, ACK] Seq=1 Ack=1 Win=65536 Len=8192
   1441  40.202135 2a01:4f8:222:904::2 → 2001:db8:9a55::1 8266 TCP [TCP Retransmission] 54312 → 10003 [PSH, ACK] Seq=1 Ack=1 Win=65536 Len=8192

because seq_ack_to_tap is already set to the sequence after frame
number 1437 in the example.

For some reason, I could only reproduce this with vhost-user, IPv6,
and passt running under valgrind while taking captures. Even under
these conditions, it happens quite rarely.

Forcibly send an ACK segment if we update the ACK sequence (or the
advertised window).

Fixes: e5eefe77435a ("tcp: Refactor to use events instead of states, split out spliced implementation")
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 tcp.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/tcp.c b/tcp.c
index ec433f7..72fca63 100644
--- a/tcp.c
+++ b/tcp.c
@@ -2200,8 +2200,10 @@ void tcp_sock_handler(const struct ctx *c, union epoll_ref ref,
 		if (events & EPOLLIN)
 			tcp_data_from_sock(c, conn);
 
-		if (events & EPOLLOUT)
-			tcp_update_seqack_wnd(c, conn, false, NULL);
+		if (events & EPOLLOUT) {
+			if (tcp_update_seqack_wnd(c, conn, false, NULL))
+				tcp_send_flag(c, conn, ACK);
+		}
 
 		return;
 	}

From 22cf08ba00890c83922c61f5d65803b7f4c1299a Mon Sep 17 00:00:00 2001
From: Stefano Brivio <sbrivio@redhat.com>
Date: Thu, 16 Jan 2025 20:31:35 +0100
Subject: [PATCH 172/382] tcp: Don't subscribe to EPOLLOUT events on STALLED

I inadvertently added that in an unrelated change, but it doesn't make
sense: STALLED means we have pending socket data that we can't write
to the guest, not the other way around.

Fixes: bb708111833e ("treewide: Packet abstraction with mandatory boundary checks")
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 tcp.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tcp.c b/tcp.c
index 72fca63..ef33388 100644
--- a/tcp.c
+++ b/tcp.c
@@ -437,7 +437,7 @@ static uint32_t tcp_conn_epoll_events(uint8_t events, uint8_t conn_flags)
 			return EPOLLET;
 
 		if (conn_flags & STALLED)
-			return EPOLLIN | EPOLLOUT | EPOLLRDHUP | EPOLLET;
+			return EPOLLIN | EPOLLRDHUP | EPOLLET;
 
 		return EPOLLIN | EPOLLRDHUP;
 	}

From b8f573cdc222905c06f39625c0567da265a2e36e Mon Sep 17 00:00:00 2001
From: Stefano Brivio <sbrivio@redhat.com>
Date: Tue, 14 Jan 2025 23:03:49 +0100
Subject: [PATCH 173/382] tcp: Set EPOLLET when reading from a socket
 fails with EAGAIN

Before SO_PEEK_OFF support was introduced by commit e63d281871ef
("tcp: leverage support of SO_PEEK_OFF socket option when available"),
we would peek data from sockets using a "discard" buffer as the first
iovec element, so that, unless we had no pending data at all, we would
always get a positive return code from recvmsg() (except for closing
connections or errors).

If we couldn't send more data to the guest, because we had filled the
window, we would set the STALLED flag (causing the epoll descriptor to
switch to edge-triggered mode) and return early from
tcp_data_from_sock().

With SO_PEEK_OFF, we don't have a discard buffer, and if there's data
on the socket, but nothing beyond our current peeking offset, we'll
get EAGAIN instead of our current "discard" length. In that case, we
return even earlier, and we don't set EPOLLET on the socket as a
result.

As reported by Asahi Lina, this causes event loops where the kernel is
signalling socket readiness, because there's data we didn't dequeue
yet (waiting for the guest to acknowledge it), but we won't actually
peek anything new, and return early without setting EPOLLET.

This is the original report, mentioning the originally proposed fix:

--
When there is unacknowledged data in the inbound socket buffer, passt
leaves the socket in the epoll instance to accept new data from the
server. Since there is already data in the socket buffer, an epoll
without EPOLLET will repeatedly fire while no data is processed,
busy-looping the CPU:

epoll_pwait(3, [...], 8, 1000, NULL, 8) = 4
recvmsg(25, {msg_namelen=0}, MSG_PEEK)  = -1 EAGAIN (Resource temporarily unavailable)
recvmsg(169, {msg_namelen=0}, MSG_PEEK) = -1 EAGAIN (Resource temporarily unavailable)
recvmsg(111, {msg_namelen=0}, MSG_PEEK) = -1 EAGAIN (Resource temporarily unavailable)
recvmsg(180, {msg_namelen=0}, MSG_PEEK) = -1 EAGAIN (Resource temporarily unavailable)
epoll_pwait(3, [...], 8, 1000, NULL, 8) = 4
recvmsg(25, {msg_namelen=0}, MSG_PEEK)  = -1 EAGAIN (Resource temporarily unavailable)
recvmsg(169, {msg_namelen=0}, MSG_PEEK) = -1 EAGAIN (Resource temporarily unavailable)
recvmsg(111, {msg_namelen=0}, MSG_PEEK) = -1 EAGAIN (Resource temporarily unavailable)
recvmsg(180, {msg_namelen=0}, MSG_PEEK) = -1 EAGAIN (Resource temporarily unavailable)

Add in the missing EPOLLET flag for this case. This brings CPU
usage down from around ~80% when downloading over TCP, to ~5% (use
case: passt as network transport for muvm, downloading Steam games).
--

We can't set EPOLLET unconditionally, though, at least right now,
because we don't monitor the guest tap for EPOLLOUT in case we fail to
write on that side after filling up that buffer (and not the window of
a TCP connection).

Instead, rely on the observation that, once a connection is
established, we only get EAGAIN on recvmsg() if we are attempting to
peek data from a socket with a non-zero peeking offset: we only peek
when there's pending data on a socket, and in that case, if we peek
without offset, we'll always see some data.

And if we peek data with a non-zero offset and get EAGAIN, that means
that we're either waiting for more data to arrive on the socket (which
would cause further wake-ups, even with EPOLLET), or we're waiting for
the guest to acknowledge some of it, which would anyway cause a
wake-up.

In that case, it's safe to set STALLED and, in turn, EPOLLET on the
socket, which fixes the EPOLLIN event loop.

While we're establishing a connection from the socket side, though,
we'll call, once, tcp_{buf,vu}_data_from_sock() to see if we got
any data while we were waiting for SYN, ACK from the guest. See the
comment at the end of tcp_conn_from_sock_finish().

And if there's no data queued on the socket as we check, we'll also
get EAGAIN, even if our peeking offset is zero. For this reason, we
need to additionally check that 'already_sent' is not zero, meaning,
explicitly, that our peeking offset is not zero.
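
For illustration, a minimal sketch of the peeking pattern involved
(not the passt code path, names assumed): once SO_PEEK_OFF points past
all queued data, a MSG_PEEK read returns EAGAIN even though EPOLLIN
keeps reporting the socket as readable.

  #include <errno.h>
  #include <sys/socket.h>
  #include <sys/types.h>

  /* Hypothetical helper, for illustration only */
  static ssize_t peek_new_data(int s, int already_sent,
                               void *buf, size_t len)
  {
          if (setsockopt(s, SOL_SOCKET, SO_PEEK_OFF,
                         &already_sent, sizeof(already_sent)))
                  return -errno;

          /* EAGAIN with already_sent != 0: nothing beyond the peeking
           * offset, we're waiting for an ACK or for more data
           */
          return recv(s, buf, len, MSG_PEEK | MSG_DONTWAIT);
  }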

Reported-by: Asahi Lina <lina@asahilina.net>
Fixes: e63d281871ef ("tcp: leverage support of SO_PEEK_OFF socket option when available")
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 tcp_buf.c | 3 +++
 tcp_vu.c  | 4 ++++
 2 files changed, 7 insertions(+)

diff --git a/tcp_buf.c b/tcp_buf.c
index a975a55..8c15101 100644
--- a/tcp_buf.c
+++ b/tcp_buf.c
@@ -359,6 +359,9 @@ int tcp_buf_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn)
 			return -errno;
 		}
 
+		if (already_sent) /* No new data and EAGAIN: set EPOLLET */
+			conn_flag(c, conn, STALLED);
+
 		return 0;
 	}
 
diff --git a/tcp_vu.c b/tcp_vu.c
index 10e17d3..8256f53 100644
--- a/tcp_vu.c
+++ b/tcp_vu.c
@@ -399,6 +399,10 @@ int tcp_vu_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn)
 			tcp_rst(c, conn);
 			return len;
 		}
+
+		if (already_sent) /* No new data and EAGAIN: set EPOLLET */
+			conn_flag(c, conn, STALLED);
+
 		return 0;
 	}
 

From a8f4fc481ce3afbf48522a0af44d222d665b515e Mon Sep 17 00:00:00 2001
From: Stefano Brivio <sbrivio@redhat.com>
Date: Thu, 16 Jan 2025 20:47:00 +0100
Subject: [PATCH 174/382] tcp: Mask EPOLLIN altogether if we're blocked waiting
 on an ACK from the guest

There are pretty much two cases of the (misnomer) STALLED: in one
case, we could send more data to the guest if it becomes available,
and in another case, we can't, because we filled the window.

If, in this second case, we keep EPOLLIN enabled, but never read from
the socket, we get short but CPU-annoying storms of EPOLLIN events,
upon which we reschedule the ACK timeout handler, never read from the
socket, go back to epoll_wait(), and so on:

  timerfd_settime(76, 0, {it_interval={tv_sec=0, tv_nsec=0}, it_value={tv_sec=2, tv_nsec=0}}, NULL) = 0
  epoll_wait(3, [{events=EPOLLIN, data={u32=10497, u64=38654716161}}], 8, 1000) = 1
  timerfd_settime(76, 0, {it_interval={tv_sec=0, tv_nsec=0}, it_value={tv_sec=2, tv_nsec=0}}, NULL) = 0
  epoll_wait(3, [{events=EPOLLIN, data={u32=10497, u64=38654716161}}], 8, 1000) = 1
  timerfd_settime(76, 0, {it_interval={tv_sec=0, tv_nsec=0}, it_value={tv_sec=2, tv_nsec=0}}, NULL) = 0
  epoll_wait(3, [{events=EPOLLIN, data={u32=10497, u64=38654716161}}], 8, 1000) = 1

also known as:

  29.1517: Flow 2 (TCP connection): timer expires in 2.000s
  29.1517: Flow 2 (TCP connection): timer expires in 2.000s
  29.1517: Flow 2 (TCP connection): timer expires in 2.000s

which, for some reason, becomes very visible with muvm and aria2c
downloading from a server nearby in parallel chunks.

That's because EPOLLIN isn't cleared if we don't read from the socket,
and even with EPOLLET, epoll_wait() will repeatedly wake us up until
we actually read something.

In this case, we don't want to subscribe to EPOLLIN at all: all we're
waiting for is an ACK segment from the guest. Differentiate this case
with a new connection flag, ACK_FROM_TAP_BLOCKS, which doesn't just
indicate that we're waiting for an ACK from the guest
(ACK_FROM_TAP_DUE), but also that we're blocked waiting for it.

If this flag is set before we set STALLED, EPOLLIN will be masked
while we set EPOLLET because of STALLED. Whenever we clear STALLED,
we also clear this flag.

This is definitely not elegant, but it's a minimal fix.

We can probably simplify this at a later point by having a category
of connection flags directly corresponding to epoll flags, and
dropping STALLED altogether, or, perhaps, always using EPOLLET (but we
need a mechanism to re-check sockets for pending data if we
temporarily can't write to the guest).

I suspect that this might also be implied in
https://github.com/containers/podman/issues/23686, hence the Link:
tag. It doesn't necessarily mean I'm fixing it (I can't reproduce
that).

Link: https://github.com/containers/podman/issues/23686
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 tcp.c      | 8 ++++++--
 tcp_buf.c  | 2 ++
 tcp_conn.h | 1 +
 tcp_vu.c   | 2 ++
 4 files changed, 11 insertions(+), 2 deletions(-)

diff --git a/tcp.c b/tcp.c
index ef33388..3b3193a 100644
--- a/tcp.c
+++ b/tcp.c
@@ -345,7 +345,7 @@ static const char *tcp_state_str[] __attribute((__unused__)) = {
 
 static const char *tcp_flag_str[] __attribute((__unused__)) = {
 	"STALLED", "LOCAL", "ACTIVE_CLOSE", "ACK_TO_TAP_DUE",
-	"ACK_FROM_TAP_DUE",
+	"ACK_FROM_TAP_DUE", "ACK_FROM_TAP_BLOCKS",
 };
 
 /* Listening sockets, used for automatic port forwarding in pasta mode only */
@@ -436,8 +436,12 @@ static uint32_t tcp_conn_epoll_events(uint8_t events, uint8_t conn_flags)
 		if (events & TAP_FIN_SENT)
 			return EPOLLET;
 
-		if (conn_flags & STALLED)
+		if (conn_flags & STALLED) {
+			if (conn_flags & ACK_FROM_TAP_BLOCKS)
+				return EPOLLRDHUP | EPOLLET;
+
 			return EPOLLIN | EPOLLRDHUP | EPOLLET;
+		}
 
 		return EPOLLIN | EPOLLRDHUP;
 	}
diff --git a/tcp_buf.c b/tcp_buf.c
index 8c15101..cbefa42 100644
--- a/tcp_buf.c
+++ b/tcp_buf.c
@@ -309,6 +309,7 @@ int tcp_buf_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn)
 	}
 
 	if (!wnd_scaled || already_sent >= wnd_scaled) {
+		conn_flag(c, conn, ACK_FROM_TAP_BLOCKS);
 		conn_flag(c, conn, STALLED);
 		conn_flag(c, conn, ACK_FROM_TAP_DUE);
 		return 0;
@@ -387,6 +388,7 @@ int tcp_buf_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn)
 		return 0;
 	}
 
+	conn_flag(c, conn, ~ACK_FROM_TAP_BLOCKS);
 	conn_flag(c, conn, ~STALLED);
 
 	send_bufs = DIV_ROUND_UP(len, mss);
diff --git a/tcp_conn.h b/tcp_conn.h
index 6ae0511..d342680 100644
--- a/tcp_conn.h
+++ b/tcp_conn.h
@@ -77,6 +77,7 @@ struct tcp_tap_conn {
 #define ACTIVE_CLOSE		BIT(2)
 #define ACK_TO_TAP_DUE		BIT(3)
 #define ACK_FROM_TAP_DUE	BIT(4)
+#define ACK_FROM_TAP_BLOCKS	BIT(5)
 
 #define SNDBUF_BITS		24
 	unsigned int	sndbuf		:SNDBUF_BITS;
diff --git a/tcp_vu.c b/tcp_vu.c
index 8256f53..a216bb1 100644
--- a/tcp_vu.c
+++ b/tcp_vu.c
@@ -381,6 +381,7 @@ int tcp_vu_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn)
 	}
 
 	if (!wnd_scaled || already_sent >= wnd_scaled) {
+		conn_flag(c, conn, ACK_FROM_TAP_BLOCKS);
 		conn_flag(c, conn, STALLED);
 		conn_flag(c, conn, ACK_FROM_TAP_DUE);
 		return 0;
@@ -423,6 +424,7 @@ int tcp_vu_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn)
 		return 0;
 	}
 
+	conn_flag(c, conn, ~ACK_FROM_TAP_BLOCKS);
 	conn_flag(c, conn, ~STALLED);
 
 	/* Likely, some new data was acked too. */

From 6016e04a3aae90cdd49fec391088b83a6d2170a6 Mon Sep 17 00:00:00 2001
From: Laurent Vivier <lvivier@redhat.com>
Date: Thu, 19 Dec 2024 12:13:53 +0100
Subject: [PATCH 175/382] vhost-user: update protocol features and commands
 list

The vhost-user protocol specification has been updated with the
feature flags and commands we will need to implement migration.

Signed-off-by: Laurent Vivier <lvivier@redhat.com>
[sbrivio: Fix comment to union vhost_user_payload]
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 vhost_user.c |  5 +++++
 vhost_user.h | 40 +++++++++++++++++++++++++++++++++++++++-
 2 files changed, 44 insertions(+), 1 deletion(-)

diff --git a/vhost_user.c b/vhost_user.c
index 4b8558f..48226a8 100644
--- a/vhost_user.c
+++ b/vhost_user.c
@@ -110,6 +110,11 @@ static const char *vu_request_to_string(unsigned int req)
 			REQ(VHOST_USER_GET_MAX_MEM_SLOTS),
 			REQ(VHOST_USER_ADD_MEM_REG),
 			REQ(VHOST_USER_REM_MEM_REG),
+			REQ(VHOST_USER_SET_STATUS),
+			REQ(VHOST_USER_GET_STATUS),
+			REQ(VHOST_USER_GET_SHARED_OBJECT),
+			REQ(VHOST_USER_SET_DEVICE_STATE_FD),
+			REQ(VHOST_USER_CHECK_DEVICE_STATE),
 		};
 #undef REQ
 		return vu_request_str[req];
diff --git a/vhost_user.h b/vhost_user.h
index 464ba21..c880893 100644
--- a/vhost_user.h
+++ b/vhost_user.h
@@ -37,6 +37,10 @@ enum vhost_user_protocol_feature {
 	VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD = 12,
 	VHOST_USER_PROTOCOL_F_INBAND_NOTIFICATIONS = 14,
 	VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS = 15,
+	VHOST_USER_PROTOCOL_F_STATUS = 16,
+	/* Feature 17 reserved for VHOST_USER_PROTOCOL_F_XEN_MMAP. */
+	VHOST_USER_PROTOCOL_F_SHARED_OBJECT = 18,
+	VHOST_USER_PROTOCOL_F_DEVICE_STATE = 19,
 
 	VHOST_USER_PROTOCOL_F_MAX
 };
@@ -83,6 +87,11 @@ enum vhost_user_request {
 	VHOST_USER_GET_MAX_MEM_SLOTS = 36,
 	VHOST_USER_ADD_MEM_REG = 37,
 	VHOST_USER_REM_MEM_REG = 38,
+	VHOST_USER_SET_STATUS = 39,
+	VHOST_USER_GET_STATUS = 40,
+	VHOST_USER_GET_SHARED_OBJECT = 41,
+	VHOST_USER_SET_DEVICE_STATE_FD = 42,
+	VHOST_USER_CHECK_DEVICE_STATE = 43,
 	VHOST_USER_MAX
 };
 
@@ -128,12 +137,39 @@ struct vhost_user_memory {
 	struct vhost_user_memory_region regions[VHOST_MEMORY_BASELINE_NREGIONS];
 };
 
+/**
+ * struct vhost_user_log - Address and size of the shared memory region used
+ *			   to log page update
+ * @mmap_size:		Size of the shared memory region
+ * @mmap_offset:	Offset of the shared memory region
+ */
+struct vhost_user_log {
+	uint64_t mmap_size;
+	uint64_t mmap_offset;
+};
+
+/**
+ * struct vhost_user_transfer_device_state - Set the direction and phase
+ *                                           of the backend device state fd
+ * @direction:		Device state transfer direction (save or load)
+ * @phase:		Migration phase (only stopped is supported)
+ */
+struct vhost_user_transfer_device_state {
+	uint32_t direction;
+#define VHOST_USER_TRANSFER_STATE_DIRECTION_SAVE 0
+#define VHOST_USER_TRANSFER_STATE_DIRECTION_LOAD 1
+	uint32_t phase;
+#define VHOST_USER_TRANSFER_STATE_PHASE_STOPPED 0
+};
+
 /**
  * union vhost_user_payload - vhost-user message payload
  * @u64:		64-bit payload
  * @state:		vring state payload
  * @addr:		vring addresses payload
- * vhost_user_memory:	Memory regions information payload
+ * @memory:		Memory regions information payload
+ * @log:		Memory logging payload
+ * @transfer_state:	Device state payload
  */
 union vhost_user_payload {
 #define VHOST_USER_VRING_IDX_MASK   0xff
@@ -142,6 +178,8 @@ union vhost_user_payload {
 	struct vhost_vring_state state;
 	struct vhost_vring_addr addr;
 	struct vhost_user_memory memory;
+	struct vhost_user_log log;
+	struct vhost_user_transfer_device_state transfer_state;
 };
 
 /**

From b04195c60ff34db89b6bc400ad582d0ff399757b Mon Sep 17 00:00:00 2001
From: Laurent Vivier <lvivier@redhat.com>
Date: Thu, 19 Dec 2024 12:13:54 +0100
Subject: [PATCH 176/382] vhost-user: add VHOST_USER_SET_LOG_FD command

VHOST_USER_SET_LOG_FD is an optional message with an eventfd in
ancillary data; it may be used to inform the front-end that the log
has been modified.
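
As an illustration (an assumption about the front-end side, not QEMU
code), the peer would typically consume this kick with eventfd_read()
before scanning the dirty log:

  #include <sys/eventfd.h>

  /* Hypothetical front-end helper, for illustration only */
  static int drain_log_kick(int log_call_fd)
  {
          eventfd_t v;

          /* A successful read means the dirty log was updated and
           * should be scanned for set bits.
           */
          return eventfd_read(log_call_fd, &v);
  }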

Signed-off-by: Laurent Vivier <lvivier@redhat.com>
[sbrivio: Fix comment to vu_set_log_fd_exec()]
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 vhost_user.c | 56 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 vhost_user.h |  1 +
 virtio.h     |  2 ++
 3 files changed, 59 insertions(+)

diff --git a/vhost_user.c b/vhost_user.c
index 48226a8..3f34c91 100644
--- a/vhost_user.c
+++ b/vhost_user.c
@@ -504,6 +504,57 @@ static bool vu_set_mem_table_exec(struct vu_dev *vdev,
 	return false;
 }
 
+/**
+ * vu_close_log() - Close the logging file descriptor
+ * @vdev:	vhost-user device
+ */
+static void vu_close_log(struct vu_dev *vdev)
+{
+	if (vdev->log_call_fd != -1) {
+		close(vdev->log_call_fd);
+		vdev->log_call_fd = -1;
+	}
+}
+
+/**
+ * vu_log_kick() - Inform the front-end that the log has been modified
+ * @vdev:	vhost-user device
+ */
+/* cppcheck-suppress unusedFunction */
+void vu_log_kick(const struct vu_dev *vdev)
+{
+	if (vdev->log_call_fd != -1) {
+		int rc;
+
+		rc = eventfd_write(vdev->log_call_fd, 1);
+		if (rc == -1)
+			die_perror("vhost-user kick eventfd_write()");
+	}
+}
+
+/**
+ * vu_set_log_fd_exec() - Set the eventfd used to report logging update
+ * @vdev:	vhost-user device
+ * @vmsg:	vhost-user message
+ *
+ * Return: False as no reply is requested
+ */
+static bool vu_set_log_fd_exec(struct vu_dev *vdev,
+			       struct vhost_user_msg *msg)
+{
+	if (msg->fd_num != 1)
+		die("Invalid log_fd message");
+
+	if (vdev->log_call_fd != -1)
+		close(vdev->log_call_fd);
+
+	vdev->log_call_fd = msg->fds[0];
+
+	debug("Got log_call_fd: %d", vdev->log_call_fd);
+
+	return false;
+}
+
 /**
  * vu_set_vring_num_exec() - Set the size of the queue (vring size)
  * @vdev:	vhost-user device
@@ -864,8 +915,10 @@ void vu_init(struct ctx *c)
 			.notification = true,
 		};
 	}
+	c->vdev->log_call_fd = -1;
 }
 
+
 /**
  * vu_cleanup() - Reset vhost-user device
  * @vdev:	vhost-user device
@@ -909,6 +962,8 @@ void vu_cleanup(struct vu_dev *vdev)
 		}
 	}
 	vdev->nregions = 0;
+
+	vu_close_log(vdev);
 }
 
 /**
@@ -929,6 +984,7 @@ static bool (*vu_handle[VHOST_USER_MAX])(struct vu_dev *vdev,
 	[VHOST_USER_GET_QUEUE_NUM]	   = vu_get_queue_num_exec,
 	[VHOST_USER_SET_OWNER]		   = vu_set_owner_exec,
 	[VHOST_USER_SET_MEM_TABLE]	   = vu_set_mem_table_exec,
+	[VHOST_USER_SET_LOG_FD]		   = vu_set_log_fd_exec,
 	[VHOST_USER_SET_VRING_NUM]	   = vu_set_vring_num_exec,
 	[VHOST_USER_SET_VRING_ADDR]	   = vu_set_vring_addr_exec,
 	[VHOST_USER_SET_VRING_BASE]	   = vu_set_vring_base_exec,
diff --git a/vhost_user.h b/vhost_user.h
index c880893..bf3eb50 100644
--- a/vhost_user.h
+++ b/vhost_user.h
@@ -240,5 +240,6 @@ static inline bool vu_queue_started(const struct vu_virtq *vq)
 void vu_print_capabilities(void);
 void vu_init(struct ctx *c);
 void vu_cleanup(struct vu_dev *vdev);
+void vu_log_kick(const struct vu_dev *vdev);
 void vu_control_handler(struct vu_dev *vdev, int fd, uint32_t events);
 #endif /* VHOST_USER_H */
diff --git a/virtio.h b/virtio.h
index 0af259d..3b0df34 100644
--- a/virtio.h
+++ b/virtio.h
@@ -103,6 +103,7 @@ struct vu_dev_region {
  * @regions:		Guest shared memory regions
  * @features:		Vhost-user features
  * @protocol_features:	Vhost-user protocol features
+ * @log_call_fd:	Eventfd to report logging update
  */
 struct vu_dev {
 	struct ctx *context;
@@ -111,6 +112,7 @@ struct vu_dev {
 	struct vu_virtq vq[VHOST_USER_MAX_QUEUES];
 	uint64_t features;
 	uint64_t protocol_features;
+	int log_call_fd;
 };
 
 /**

From 538312af196308dea9a4ddb9442bed921c0dc915 Mon Sep 17 00:00:00 2001
From: Laurent Vivier <lvivier@redhat.com>
Date: Thu, 19 Dec 2024 12:13:55 +0100
Subject: [PATCH 177/382] vhost-user: Pass vu_dev to more virtio functions

vu_dev will be needed to log page updates.

Add the parameter to:

  vring_used_write()
  vu_queue_fill_by_index()
  vu_queue_fill()
  vring_used_idx_set()
  vu_queue_flush()

The new parameter is unused for now.

Signed-off-by: Laurent Vivier <lvivier@redhat.com>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 virtio.c    | 32 ++++++++++++++++++++++----------
 virtio.h    | 10 ++++++----
 vu_common.c |  8 ++++----
 3 files changed, 32 insertions(+), 18 deletions(-)

diff --git a/virtio.c b/virtio.c
index 625bac3..52d5a4d 100644
--- a/virtio.c
+++ b/virtio.c
@@ -580,28 +580,34 @@ bool vu_queue_rewind(struct vu_virtq *vq, unsigned int num)
 
 /**
  * vring_used_write() - Write an entry in the used ring
+ * @dev:	Vhost-user device
  * @vq:		Virtqueue
  * @uelem:	Entry to write
  * @i:		Index of the entry in the used ring
  */
-static inline void vring_used_write(struct vu_virtq *vq,
+static inline void vring_used_write(const struct vu_dev *vdev,
+				    struct vu_virtq *vq,
 				    const struct vring_used_elem *uelem, int i)
 {
 	struct vring_used *used = vq->vring.used;
 
 	used->ring[i] = *uelem;
+	(void)vdev;
 }
 
+
 /**
  * vu_queue_fill_by_index() - Update information of a descriptor ring entry
  *			      in the used ring
+ * @dev:	Vhost-user device
  * @vq:		Virtqueue
  * @index:	Descriptor ring index
  * @len:	Size of the element
  * @idx:	Used ring entry index
  */
-void vu_queue_fill_by_index(struct vu_virtq *vq, unsigned int index,
-			    unsigned int len, unsigned int idx)
+void vu_queue_fill_by_index(const struct vu_dev *vdev, struct vu_virtq *vq,
+			    unsigned int index, unsigned int len,
+			    unsigned int idx)
 {
 	struct vring_used_elem uelem;
 
@@ -612,7 +618,7 @@ void vu_queue_fill_by_index(struct vu_virtq *vq, unsigned int index,
 
 	uelem.id = htole32(index);
 	uelem.len = htole32(len);
-	vring_used_write(vq, &uelem, idx);
+	vring_used_write(vdev, vq, &uelem, idx);
 }
 
 /**
@@ -623,30 +629,36 @@ void vu_queue_fill_by_index(struct vu_virtq *vq, unsigned int index,
  * @len:	Size of the element
  * @idx:	Used ring entry index
  */
-void vu_queue_fill(struct vu_virtq *vq, const struct vu_virtq_element *elem,
-		   unsigned int len, unsigned int idx)
+void vu_queue_fill(const struct vu_dev *vdev, struct vu_virtq *vq,
+		   const struct vu_virtq_element *elem, unsigned int len,
+		   unsigned int idx)
 {
-	vu_queue_fill_by_index(vq, elem->index, len, idx);
+	vu_queue_fill_by_index(vdev, vq, elem->index, len, idx);
 }
 
 /**
  * vring_used_idx_set() - Set the descriptor ring current index
+ * @dev:	Vhost-user device
  * @vq:		Virtqueue
  * @val:	Value to set in the index
  */
-static inline void vring_used_idx_set(struct vu_virtq *vq, uint16_t val)
+static inline void vring_used_idx_set(const struct vu_dev *vdev,
+				      struct vu_virtq *vq, uint16_t val)
 {
 	vq->vring.used->idx = htole16(val);
+	(void)vdev;
 
 	vq->used_idx = val;
 }
 
 /**
  * vu_queue_flush() - Flush the virtqueue
+ * @dev:	Vhost-user device
  * @vq:		Virtqueue
  * @count:	Number of entry to flush
  */
-void vu_queue_flush(struct vu_virtq *vq, unsigned int count)
+void vu_queue_flush(const struct vu_dev *vdev, struct vu_virtq *vq,
+		    unsigned int count)
 {
 	uint16_t old, new;
 
@@ -658,7 +670,7 @@ void vu_queue_flush(struct vu_virtq *vq, unsigned int count)
 
 	old = vq->used_idx;
 	new = old + count;
-	vring_used_idx_set(vq, new);
+	vring_used_idx_set(vdev, vq, new);
 	vq->inuse -= count;
 	if ((uint16_t)(new - vq->signalled_used) < (uint16_t)(new - old))
 		vq->signalled_used_valid = false;
diff --git a/virtio.h b/virtio.h
index 3b0df34..d95bb07 100644
--- a/virtio.h
+++ b/virtio.h
@@ -177,10 +177,12 @@ int vu_queue_pop(const struct vu_dev *dev, struct vu_virtq *vq,
 void vu_queue_detach_element(struct vu_virtq *vq);
 void vu_queue_unpop(struct vu_virtq *vq);
 bool vu_queue_rewind(struct vu_virtq *vq, unsigned int num);
-void vu_queue_fill_by_index(struct vu_virtq *vq, unsigned int index,
-			    unsigned int len, unsigned int idx);
-void vu_queue_fill(struct vu_virtq *vq,
+void vu_queue_fill_by_index(const struct vu_dev *vdev, struct vu_virtq *vq,
+			    unsigned int index, unsigned int len,
+			    unsigned int idx);
+void vu_queue_fill(const struct vu_dev *vdev, struct vu_virtq *vq,
 		   const struct vu_virtq_element *elem, unsigned int len,
 		   unsigned int idx);
-void vu_queue_flush(struct vu_virtq *vq, unsigned int count);
+void vu_queue_flush(const struct vu_dev *vdev, struct vu_virtq *vq,
+		    unsigned int count);
 #endif /* VIRTIO_H */
diff --git a/vu_common.c b/vu_common.c
index 431fba6..0ba2351 100644
--- a/vu_common.c
+++ b/vu_common.c
@@ -142,9 +142,9 @@ void vu_flush(const struct vu_dev *vdev, struct vu_virtq *vq,
 	int i;
 
 	for (i = 0; i < elem_cnt; i++)
-		vu_queue_fill(vq, &elem[i], elem[i].in_sg[0].iov_len, i);
+		vu_queue_fill(vdev, vq, &elem[i], elem[i].in_sg[0].iov_len, i);
 
-	vu_queue_flush(vq, elem_cnt);
+	vu_queue_flush(vdev, vq, elem_cnt);
 	vu_queue_notify(vdev, vq);
 }
 
@@ -210,8 +210,8 @@ static void vu_handle_tx(struct vu_dev *vdev, int index,
 		int i;
 
 		for (i = 0; i < count; i++)
-			vu_queue_fill(vq, &elem[i], 0, i);
-		vu_queue_flush(vq, count);
+			vu_queue_fill(vdev, vq, &elem[i], 0, i);
+		vu_queue_flush(vdev, vq, count);
 		vu_queue_notify(vdev, vq);
 	}
 }

From 3c1d91b8162607ec27b05502278a361cd73a54e2 Mon Sep 17 00:00:00 2001
From: Laurent Vivier <lvivier@redhat.com>
Date: Thu, 19 Dec 2024 12:13:56 +0100
Subject: [PATCH 178/382] vhost-user: add VHOST_USER_SET_LOG_BASE command

Set the logging shared memory space.

When the back-end has the VHOST_USER_PROTOCOL_F_LOG_SHMFD protocol
feature, the log memory fd is provided in the ancillary data of the
VHOST_USER_SET_LOG_BASE message, and the size and offset of the shared
memory area are given in the message payload.
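
The area mapped here is a plain dirty bitmap, one bit per
VHOST_LOG_PAGE (4 KiB) of guest memory. A sketch equivalent to the
vu_log_write()/vu_log_page() pair added below:

  #include <stdint.h>

  #define VHOST_LOG_PAGE 4096

  /* Mark [gpa, gpa + len) as dirty in the shared log bitmap */
  static void log_mark(uint8_t *log_table, uint64_t gpa, uint64_t len)
  {
          uint64_t page = gpa / VHOST_LOG_PAGE;

          while (page * VHOST_LOG_PAGE < gpa + len) {
                  __atomic_fetch_or(&log_table[page / 8],
                                    1 << (page % 8), __ATOMIC_SEQ_CST);
                  page++;
          }
  }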

Signed-off-by: Laurent Vivier <lvivier@redhat.com>
[sbrivio: Fix coding style in a bunch of places]
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 util.h       |  3 ++
 vhost_user.c | 86 +++++++++++++++++++++++++++++++++++++++++++++++++++-
 vhost_user.h |  3 ++
 virtio.c     | 74 ++++++++++++++++++++++++++++++++++++++++++--
 virtio.h     |  4 +++
 5 files changed, 167 insertions(+), 3 deletions(-)

diff --git a/util.h b/util.h
index 3fa1d12..d02333d 100644
--- a/util.h
+++ b/util.h
@@ -152,6 +152,9 @@ static inline void barrier(void) { __asm__ __volatile__("" ::: "memory"); }
 #define smp_wmb()	smp_mb_release()
 #define smp_rmb()	smp_mb_acquire()
 
+#define qatomic_or(ptr, n) \
+	((void) __atomic_fetch_or(ptr, n, __ATOMIC_SEQ_CST))
+
 #define NS_FN_STACK_SIZE	(1024 * 1024) /* 1MiB */
 
 int do_clone(int (*fn)(void *), char *stack_area, size_t stack_size, int flags,
diff --git a/vhost_user.c b/vhost_user.c
index 3f34c91..66ded12 100644
--- a/vhost_user.c
+++ b/vhost_user.c
@@ -510,6 +510,12 @@ static bool vu_set_mem_table_exec(struct vu_dev *vdev,
  */
 static void vu_close_log(struct vu_dev *vdev)
 {
+	if (vdev->log_table) {
+		if (munmap(vdev->log_table, vdev->log_size) != 0)
+			die_perror("close log munmap() error");
+		vdev->log_table = NULL;
+	}
+
 	if (vdev->log_call_fd != -1) {
 		close(vdev->log_call_fd);
 		vdev->log_call_fd = -1;
@@ -520,7 +526,6 @@ static void vu_close_log(struct vu_dev *vdev)
  * vu_log_kick() - Inform the front-end that the log has been modified
  * @vdev:	vhost-user device
  */
-/* cppcheck-suppress unusedFunction */
 void vu_log_kick(const struct vu_dev *vdev)
 {
 	if (vdev->log_call_fd != -1) {
@@ -532,6 +537,83 @@ void vu_log_kick(const struct vu_dev *vdev)
 	}
 }
 
+/**
+ * vu_log_page() - Update logging table
+ * @log_table:	Base address of the logging table
+ * @page:	Page number that has been updated
+ */
+/* NOLINTNEXTLINE(readability-non-const-parameter) */
+static void vu_log_page(uint8_t *log_table, uint64_t page)
+{
+	qatomic_or(&log_table[page / 8], 1 << (page % 8));
+}
+
+/**
+ * vu_log_write() - Log memory write
+ * @dev:	vhost-user device
+ * @address:	Memory address
+ * @length:	Memory size
+ */
+void vu_log_write(const struct vu_dev *vdev, uint64_t address, uint64_t length)
+{
+	uint64_t page;
+
+	if (!vdev->log_table || !length ||
+	    !vu_has_feature(vdev, VHOST_F_LOG_ALL))
+		return;
+
+	page = address / VHOST_LOG_PAGE;
+	while (page * VHOST_LOG_PAGE < address + length) {
+		vu_log_page(vdev->log_table, page);
+		page++;
+	}
+	vu_log_kick(vdev);
+}
+
+/**
+ * vu_set_log_base_exec() - Set the memory log base
+ * @vdev:	vhost-user device
+ * @vmsg:	vhost-user message
+ *
+ * Return: False as no reply is requested
+ *
+ * #syscalls:vu mmap|mmap2 munmap
+ */
+static bool vu_set_log_base_exec(struct vu_dev *vdev,
+				 struct vhost_user_msg *msg)
+{
+	uint64_t log_mmap_size, log_mmap_offset;
+	void *base;
+	int fd;
+
+	if (msg->fd_num != 1 || msg->hdr.size != sizeof(msg->payload.log))
+		die("vhost-user: Invalid log_base message");
+
+	fd = msg->fds[0];
+	log_mmap_offset = msg->payload.log.mmap_offset;
+	log_mmap_size = msg->payload.log.mmap_size;
+
+	debug("vhost-user log mmap_offset: %"PRId64, log_mmap_offset);
+	debug("vhost-user log mmap_size:   %"PRId64, log_mmap_size);
+
+	base = mmap(0, log_mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd,
+		    log_mmap_offset);
+	close(fd);
+	if (base == MAP_FAILED)
+		die("vhost-user log mmap error");
+
+	if (vdev->log_table)
+		munmap(vdev->log_table, vdev->log_size);
+
+	vdev->log_table = base;
+	vdev->log_size = log_mmap_size;
+
+	msg->hdr.size = sizeof(msg->payload.u64);
+	msg->fd_num = 0;
+
+	return true;
+}
+
 /**
  * vu_set_log_fd_exec() - Set the eventfd used to report logging update
  * @vdev:	vhost-user device
@@ -915,6 +997,7 @@ void vu_init(struct ctx *c)
 			.notification = true,
 		};
 	}
+	c->vdev->log_table = NULL;
 	c->vdev->log_call_fd = -1;
 }
 
@@ -984,6 +1067,7 @@ static bool (*vu_handle[VHOST_USER_MAX])(struct vu_dev *vdev,
 	[VHOST_USER_GET_QUEUE_NUM]	   = vu_get_queue_num_exec,
 	[VHOST_USER_SET_OWNER]		   = vu_set_owner_exec,
 	[VHOST_USER_SET_MEM_TABLE]	   = vu_set_mem_table_exec,
+	[VHOST_USER_SET_LOG_BASE]	   = vu_set_log_base_exec,
 	[VHOST_USER_SET_LOG_FD]		   = vu_set_log_fd_exec,
 	[VHOST_USER_SET_VRING_NUM]	   = vu_set_vring_num_exec,
 	[VHOST_USER_SET_VRING_ADDR]	   = vu_set_vring_addr_exec,
diff --git a/vhost_user.h b/vhost_user.h
index bf3eb50..e769cb1 100644
--- a/vhost_user.h
+++ b/vhost_user.h
@@ -15,6 +15,7 @@
 #include "iov.h"
 
 #define VHOST_USER_F_PROTOCOL_FEATURES 30
+#define VHOST_LOG_PAGE 4096
 
 #define VHOST_MEMORY_BASELINE_NREGIONS 8
 
@@ -241,5 +242,7 @@ void vu_print_capabilities(void);
 void vu_init(struct ctx *c);
 void vu_cleanup(struct vu_dev *vdev);
 void vu_log_kick(const struct vu_dev *vdev);
+void vu_log_write(const struct vu_dev *vdev, uint64_t address,
+		  uint64_t length);
 void vu_control_handler(struct vu_dev *vdev, int fd, uint32_t events);
 #endif /* VHOST_USER_H */
diff --git a/virtio.c b/virtio.c
index 52d5a4d..2b58e4d 100644
--- a/virtio.c
+++ b/virtio.c
@@ -81,6 +81,7 @@
 
 #include "util.h"
 #include "virtio.h"
+#include "vhost_user.h"
 
 #define VIRTQUEUE_MAX_SIZE 1024
 
@@ -592,7 +593,72 @@ static inline void vring_used_write(const struct vu_dev *vdev,
 	struct vring_used *used = vq->vring.used;
 
 	used->ring[i] = *uelem;
-	(void)vdev;
+	vu_log_write(vdev, vq->vring.log_guest_addr +
+		     offsetof(struct vring_used, ring[i]),
+		     sizeof(used->ring[i]));
+}
+
+/**
+ * vu_log_queue_fill() - Log virtqueue memory update
+ * @dev:	vhost-user device
+ * @vq:		Virtqueue
+ * @index:	Descriptor ring index
+ * @len:	Size of the element
+ */
+static void vu_log_queue_fill(const struct vu_dev *vdev, struct vu_virtq *vq,
+			      unsigned int index, unsigned int len)
+{
+	struct vring_desc desc_buf[VIRTQUEUE_MAX_SIZE];
+	struct vring_desc *desc = vq->vring.desc;
+	unsigned int max, min;
+	unsigned num_bufs = 0;
+	uint64_t read_len;
+
+	if (!vdev->log_table || !len || !vu_has_feature(vdev, VHOST_F_LOG_ALL))
+		return;
+
+	max = vq->vring.num;
+
+	if (le16toh(desc[index].flags) & VRING_DESC_F_INDIRECT) {
+		unsigned int desc_len;
+		uint64_t desc_addr;
+
+		if (le32toh(desc[index].len) % sizeof(struct vring_desc))
+			die("Invalid size for indirect buffer table");
+
+		/* loop over the indirect descriptor table */
+		desc_addr = le64toh(desc[index].addr);
+		desc_len = le32toh(desc[index].len);
+		max = desc_len / sizeof(struct vring_desc);
+		read_len = desc_len;
+		desc = vu_gpa_to_va(vdev, &read_len, desc_addr);
+		if (desc && read_len != desc_len) {
+			/* Failed to use zero copy */
+			desc = NULL;
+			if (!virtqueue_read_indirect_desc(vdev, desc_buf,
+							  desc_addr,
+							  desc_len))
+				desc = desc_buf;
+		}
+
+		if (!desc)
+			die("Invalid indirect buffer table");
+
+		index = 0;
+	}
+
+	do {
+		if (++num_bufs > max)
+			die("Looped descriptor");
+
+		if (le16toh(desc[index].flags) & VRING_DESC_F_WRITE) {
+			min = MIN(le32toh(desc[index].len), len);
+			vu_log_write(vdev, le64toh(desc[index].addr), min);
+			len -= min;
+		}
+	} while (len > 0 &&
+		 (virtqueue_read_next_desc(desc, index, max, &index) ==
+		  VIRTQUEUE_READ_DESC_MORE));
 }
 
 
@@ -614,6 +680,8 @@ void vu_queue_fill_by_index(const struct vu_dev *vdev, struct vu_virtq *vq,
 	if (!vq->vring.avail)
 		return;
 
+	vu_log_queue_fill(vdev, vq, index, len);
+
 	idx = (idx + vq->used_idx) % vq->vring.num;
 
 	uelem.id = htole32(index);
@@ -646,7 +714,9 @@ static inline void vring_used_idx_set(const struct vu_dev *vdev,
 				      struct vu_virtq *vq, uint16_t val)
 {
 	vq->vring.used->idx = htole16(val);
-	(void)vdev;
+	vu_log_write(vdev, vq->vring.log_guest_addr +
+		     offsetof(struct vring_used, idx),
+		     sizeof(vq->vring.used->idx));
 
 	vq->used_idx = val;
 }
diff --git a/virtio.h b/virtio.h
index d95bb07..f572341 100644
--- a/virtio.h
+++ b/virtio.h
@@ -104,6 +104,8 @@ struct vu_dev_region {
  * @features:		Vhost-user features
  * @protocol_features:	Vhost-user protocol features
  * @log_call_fd:	Eventfd to report logging update
+ * @log_size:		Size of the logging memory region
+ * @log_table:		Base of the logging memory region
  */
 struct vu_dev {
 	struct ctx *context;
@@ -113,6 +115,8 @@ struct vu_dev {
 	uint64_t features;
 	uint64_t protocol_features;
 	int log_call_fd;
+	uint64_t log_size;
+	uint8_t *log_table;
 };
 
 /**

From 78c73e9395b13354272010d2f202c819689d48f8 Mon Sep 17 00:00:00 2001
From: Laurent Vivier <lvivier@redhat.com>
Date: Thu, 19 Dec 2024 12:13:57 +0100
Subject: [PATCH 179/382] vhost-user: Report to front-end we support
 VHOST_USER_PROTOCOL_F_LOG_SHMFD

This feature allows QEMU to be migrated. We also need to report
VHOST_F_LOG_ALL.

This protocol feature reports that we can log page updates and that we
implement VHOST_USER_SET_LOG_BASE and VHOST_USER_SET_LOG_FD.

Signed-off-by: Laurent Vivier <lvivier@redhat.com>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 vhost_user.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/vhost_user.c b/vhost_user.c
index 66ded12..747b7f6 100644
--- a/vhost_user.c
+++ b/vhost_user.c
@@ -334,6 +334,7 @@ static bool vu_get_features_exec(struct vu_dev *vdev,
 	uint64_t features =
 		1ULL << VIRTIO_F_VERSION_1 |
 		1ULL << VIRTIO_NET_F_MRG_RXBUF |
+		1ULL << VHOST_F_LOG_ALL |
 		1ULL << VHOST_USER_F_PROTOCOL_FEATURES;
 
 	(void)vdev;
@@ -911,7 +912,8 @@ static bool vu_set_vring_err_exec(struct vu_dev *vdev,
 static bool vu_get_protocol_features_exec(struct vu_dev *vdev,
 					  struct vhost_user_msg *msg)
 {
-	uint64_t features = 1ULL << VHOST_USER_PROTOCOL_F_REPLY_ACK;
+	uint64_t features = 1ULL << VHOST_USER_PROTOCOL_F_REPLY_ACK |
+			    1ULL << VHOST_USER_PROTOCOL_F_LOG_SHMFD;
 
 	(void)vdev;
 	vmsg_set_reply_u64(msg, features);

From 878e16345461eb2745c761f6929fd6e9da0df447 Mon Sep 17 00:00:00 2001
From: Laurent Vivier <lvivier@redhat.com>
Date: Thu, 19 Dec 2024 12:13:58 +0100
Subject: [PATCH 180/382] vhost-user: add VHOST_USER_CHECK_DEVICE_STATE command
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

After transferring the back-end’s internal state during migration,
check whether the back-end was able to successfully process the
entire state.

The value returned indicates success or error;
0 is success, any non-zero value is an error.

Signed-off-by: Laurent Vivier <lvivier@redhat.com>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 vhost_user.c | 21 +++++++++++++++++++++
 virtio.h     | 18 ++++++++++--------
 2 files changed, 31 insertions(+), 8 deletions(-)

diff --git a/vhost_user.c b/vhost_user.c
index 747b7f6..2962709 100644
--- a/vhost_user.c
+++ b/vhost_user.c
@@ -980,6 +980,23 @@ static bool vu_set_vring_enable_exec(struct vu_dev *vdev,
 	return false;
 }
 
+/**
+ * vu_check_device_state_exec() -- Return device state migration result
+ * @vdev:	vhost-user device
+ * @vmsg:	vhost-user message
+ *
+ * Return: True as the reply contains the migration result
+ */
+static bool vu_check_device_state_exec(struct vu_dev *vdev,
+				       struct vhost_user_msg *msg)
+{
+	(void)vdev;
+
+	vmsg_set_reply_u64(msg, vdev->device_state_result);
+
+	return true;
+}
+
 /**
  * vu_init() - Initialize vhost-user device structure
  * @c:		execution context
@@ -1001,6 +1018,7 @@ void vu_init(struct ctx *c)
 	}
 	c->vdev->log_table = NULL;
 	c->vdev->log_call_fd = -1;
+	c->vdev->device_state_result = -1;
 }
 
 
@@ -1049,6 +1067,8 @@ void vu_cleanup(struct vu_dev *vdev)
 	vdev->nregions = 0;
 
 	vu_close_log(vdev);
+
+	vdev->device_state_result = -1;
 }
 
 /**
@@ -1079,6 +1099,7 @@ static bool (*vu_handle[VHOST_USER_MAX])(struct vu_dev *vdev,
 	[VHOST_USER_SET_VRING_CALL]	   = vu_set_vring_call_exec,
 	[VHOST_USER_SET_VRING_ERR]	   = vu_set_vring_err_exec,
 	[VHOST_USER_SET_VRING_ENABLE]	   = vu_set_vring_enable_exec,
+	[VHOST_USER_CHECK_DEVICE_STATE]    = vu_check_device_state_exec,
 };
 
 /**
diff --git a/virtio.h b/virtio.h
index f572341..512ec1b 100644
--- a/virtio.h
+++ b/virtio.h
@@ -98,14 +98,15 @@ struct vu_dev_region {
 
 /**
  * struct vu_dev - vhost-user device information
- * @context:		Execution context
- * @nregions:		Number of shared memory regions
- * @regions:		Guest shared memory regions
- * @features:		Vhost-user features
- * @protocol_features:	Vhost-user protocol features
- * @log_call_fd:	Eventfd to report logging update
- * @log_size:		Size of the logging memory region
- * @log_table:		Base of the logging memory region
+ * @context:			Execution context
+ * @nregions:			Number of shared memory regions
+ * @regions:			Guest shared memory regions
+ * @features:			Vhost-user features
+ * @protocol_features:		Vhost-user protocol features
+ * @log_call_fd:		Eventfd to report logging update
+ * @log_size:			Size of the logging memory region
+ * @log_table:			Base of the logging memory region
+ * @device_state_result:	Device state migration result
  */
 struct vu_dev {
 	struct ctx *context;
@@ -117,6 +118,7 @@ struct vu_dev {
 	int log_call_fd;
 	uint64_t log_size;
 	uint8_t *log_table;
+	int device_state_result;
 };
 
 /**

From 31d70024beda1e49131d7b68dd7554bee16c79f3 Mon Sep 17 00:00:00 2001
From: Laurent Vivier <lvivier@redhat.com>
Date: Thu, 19 Dec 2024 12:13:59 +0100
Subject: [PATCH 181/382] vhost-user: add VHOST_USER_SET_DEVICE_STATE_FD
 command

Set the file descriptor to use to transfer the
backend device state during migration.

Signed-off-by: Laurent Vivier <lvivier@redhat.com>
[sbrivio: Fixed nits and coding style here and there]
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 epoll_type.h |  2 ++
 passt.c      |  4 +++
 vhost_user.c | 82 ++++++++++++++++++++++++++++++++++++++++++++++++++--
 virtio.h     |  2 ++
 vu_common.c  | 49 +++++++++++++++++++++++++++++++
 vu_common.h  |  1 +
 6 files changed, 138 insertions(+), 2 deletions(-)

diff --git a/epoll_type.h b/epoll_type.h
index f3ef415..fd9eac3 100644
--- a/epoll_type.h
+++ b/epoll_type.h
@@ -40,6 +40,8 @@ enum epoll_type {
 	EPOLL_TYPE_VHOST_CMD,
 	/* vhost-user kick event socket */
 	EPOLL_TYPE_VHOST_KICK,
+	/* vhost-user migration socket */
+	EPOLL_TYPE_VHOST_MIGRATION,
 
 	EPOLL_NUM_TYPES,
 };
diff --git a/passt.c b/passt.c
index 1a0c404..b1c8ab6 100644
--- a/passt.c
+++ b/passt.c
@@ -75,6 +75,7 @@ char *epoll_type_str[] = {
 	[EPOLL_TYPE_TAP_LISTEN]		= "listening qemu socket",
 	[EPOLL_TYPE_VHOST_CMD]		= "vhost-user command socket",
 	[EPOLL_TYPE_VHOST_KICK]		= "vhost-user kick socket",
+	[EPOLL_TYPE_VHOST_MIGRATION]	= "vhost-user migration socket",
 };
 static_assert(ARRAY_SIZE(epoll_type_str) == EPOLL_NUM_TYPES,
 	      "epoll_type_str[] doesn't match enum epoll_type");
@@ -356,6 +357,9 @@ loop:
 		case EPOLL_TYPE_VHOST_KICK:
 			vu_kick_cb(c.vdev, ref, &now);
 			break;
+		case EPOLL_TYPE_VHOST_MIGRATION:
+			vu_migrate(c.vdev, eventmask);
+			break;
 		default:
 			/* Can't happen */
 			ASSERT(0);
diff --git a/vhost_user.c b/vhost_user.c
index 2962709..daff9ab 100644
--- a/vhost_user.c
+++ b/vhost_user.c
@@ -981,7 +981,78 @@ static bool vu_set_vring_enable_exec(struct vu_dev *vdev,
 }
 
 /**
- * vu_check_device_state_exec() -- Return device state migration result
+ * vu_set_migration_watch() - Add the migration file descriptor to epoll
+ * @vdev:	vhost-user device
+ * @fd:		File descriptor to add
+ * @direction:	Direction of the migration (save or load backend state)
+ */
+static void vu_set_migration_watch(const struct vu_dev *vdev, int fd,
+				   uint32_t direction)
+{
+	union epoll_ref ref = {
+		.type = EPOLL_TYPE_VHOST_MIGRATION,
+		.fd = fd,
+	 };
+	struct epoll_event ev = { 0 };
+
+	ev.data.u64 = ref.u64;
+	switch (direction) {
+	case VHOST_USER_TRANSFER_STATE_DIRECTION_SAVE:
+		ev.events = EPOLLOUT;
+		break;
+	case VHOST_USER_TRANSFER_STATE_DIRECTION_LOAD:
+		ev.events = EPOLLIN;
+		break;
+	default:
+		ASSERT(0);
+	}
+
+	epoll_ctl(vdev->context->epollfd, EPOLL_CTL_ADD, ref.fd, &ev);
+}
+
+/**
+ * vu_set_device_state_fd_exec() - Set the device state migration channel
+ * @vdev:	vhost-user device
+ * @vmsg:	vhost-user message
+ *
+ * Return: True as the reply contains 0 to indicate success
+ *         with bit 8 set, as we don't provide our own fd.
+ */
+static bool vu_set_device_state_fd_exec(struct vu_dev *vdev,
+					struct vhost_user_msg *msg)
+{
+	unsigned int direction = msg->payload.transfer_state.direction;
+	unsigned int phase = msg->payload.transfer_state.phase;
+
+	if (msg->fd_num != 1)
+		die("Invalid device_state_fd message");
+
+	if (phase != VHOST_USER_TRANSFER_STATE_PHASE_STOPPED)
+		die("Invalid device_state_fd phase: %d", phase);
+
+	if (direction != VHOST_USER_TRANSFER_STATE_DIRECTION_SAVE &&
+	    direction != VHOST_USER_TRANSFER_STATE_DIRECTION_LOAD)
+		die("Invalid device_state_fd direction: %d", direction);
+
+	if (vdev->device_state_fd != -1) {
+		vu_remove_watch(vdev, vdev->device_state_fd);
+		close(vdev->device_state_fd);
+	}
+
+	vdev->device_state_fd = msg->fds[0];
+	vdev->device_state_result = -1;
+	vu_set_migration_watch(vdev, vdev->device_state_fd, direction);
+
+	debug("Got device_state_fd: %d", vdev->device_state_fd);
+
+	/* We don't provide a new fd for the data transfer */
+	vmsg_set_reply_u64(msg, VHOST_USER_VRING_NOFD_MASK);
+
+	return true;
+}
+
+/**
+ * vu_check_device_state_exec() - Return device state migration result
  * @vdev:	vhost-user device
  * @vmsg:	vhost-user message
  *
@@ -1018,6 +1089,7 @@ void vu_init(struct ctx *c)
 	}
 	c->vdev->log_table = NULL;
 	c->vdev->log_call_fd = -1;
+	c->vdev->device_state_fd = -1;
 	c->vdev->device_state_result = -1;
 }
 
@@ -1068,7 +1140,12 @@ void vu_cleanup(struct vu_dev *vdev)
 
 	vu_close_log(vdev);
 
-	vdev->device_state_result = -1;
+	if (vdev->device_state_fd != -1) {
+		vu_remove_watch(vdev, vdev->device_state_fd);
+		close(vdev->device_state_fd);
+		vdev->device_state_fd = -1;
+		vdev->device_state_result = -1;
+	}
 }
 
 /**
@@ -1099,6 +1176,7 @@ static bool (*vu_handle[VHOST_USER_MAX])(struct vu_dev *vdev,
 	[VHOST_USER_SET_VRING_CALL]	   = vu_set_vring_call_exec,
 	[VHOST_USER_SET_VRING_ERR]	   = vu_set_vring_err_exec,
 	[VHOST_USER_SET_VRING_ENABLE]	   = vu_set_vring_enable_exec,
+	[VHOST_USER_SET_DEVICE_STATE_FD]   = vu_set_device_state_fd_exec,
 	[VHOST_USER_CHECK_DEVICE_STATE]    = vu_check_device_state_exec,
 };
 
diff --git a/virtio.h b/virtio.h
index 512ec1b..7bef2d2 100644
--- a/virtio.h
+++ b/virtio.h
@@ -106,6 +106,7 @@ struct vu_dev_region {
  * @log_call_fd:		Eventfd to report logging update
  * @log_size:			Size of the logging memory region
  * @log_table:			Base of the logging memory region
+ * @device_state_fd:		Device state migration channel
  * @device_state_result:	Device state migration result
  */
 struct vu_dev {
@@ -118,6 +119,7 @@ struct vu_dev {
 	int log_call_fd;
 	uint64_t log_size;
 	uint8_t *log_table;
+	int device_state_fd;
 	int device_state_result;
 };
 
diff --git a/vu_common.c b/vu_common.c
index 0ba2351..87a0d94 100644
--- a/vu_common.c
+++ b/vu_common.c
@@ -297,3 +297,52 @@ err:
 
 	return -1;
 }
+
+/**
+ * vu_migrate() - Send/receive passt internal state to/from QEMU
+ * @vdev:	vhost-user device
+ * @events:	epoll events
+ */
+void vu_migrate(struct vu_dev *vdev, uint32_t events)
+{
+	int ret;
+
+	/* TODO: collect/set passt internal state
+	 * and use vdev->device_state_fd to send/receive it
+	 */
+	debug("vu_migrate fd %d events %x", vdev->device_state_fd, events);
+	if (events & EPOLLOUT) {
+		debug("Saving backend state");
+
+		/* send some stuff */
+		ret = write(vdev->device_state_fd, "PASST", 6);
+		/* value to be returned by VHOST_USER_CHECK_DEVICE_STATE */
+		vdev->device_state_result = ret == -1 ? -1 : 0;
+		/* Closing the file descriptor signals the end of transfer */
+		epoll_ctl(vdev->context->epollfd, EPOLL_CTL_DEL,
+			  vdev->device_state_fd, NULL);
+		close(vdev->device_state_fd);
+		vdev->device_state_fd = -1;
+	} else if (events & EPOLLIN) {
+		char buf[6];
+
+		debug("Loading backend state");
+		/* read some stuff */
+		ret = read(vdev->device_state_fd, buf, sizeof(buf));
+		/* value to be returned by VHOST_USER_CHECK_DEVICE_STATE */
+		if (ret != sizeof(buf)) {
+			vdev->device_state_result = -1;
+		} else {
+			ret = strncmp(buf, "PASST", sizeof(buf));
+			vdev->device_state_result = ret == 0 ? 0 : -1;
+		}
+	} else if (events & EPOLLHUP) {
+		debug("Closing migration channel");
+
+		/* The end of file signals the end of the transfer. */
+		epoll_ctl(vdev->context->epollfd, EPOLL_CTL_DEL,
+			  vdev->device_state_fd, NULL);
+		close(vdev->device_state_fd);
+		vdev->device_state_fd = -1;
+	}
+}
diff --git a/vu_common.h b/vu_common.h
index bd70faf..d56c021 100644
--- a/vu_common.h
+++ b/vu_common.h
@@ -57,4 +57,5 @@ void vu_flush(const struct vu_dev *vdev, struct vu_virtq *vq,
 void vu_kick_cb(struct vu_dev *vdev, union epoll_ref ref,
 		const struct timespec *now);
 int vu_send_single(const struct ctx *c, const void *buf, size_t size);
+void vu_migrate(struct vu_dev *vdev, uint32_t events);
 #endif /* VU_COMMON_H */

From 412ed4f09ff2e07545acdc5fe87a55a34aab4f92 Mon Sep 17 00:00:00 2001
From: Laurent Vivier <lvivier@redhat.com>
Date: Thu, 19 Dec 2024 12:14:00 +0100
Subject: [PATCH 182/382] vhost-user: Report to front-end we support
 VHOST_USER_PROTOCOL_F_DEVICE_STATE

Report to the front-end that we support the device state commands:
VHOST_USER_SET_DEVICE_STATE_FD
VHOST_USER_CHECK_DEVICE_STATE

This feature is needed to transfer the back-end state using the
front-end channel.

Signed-off-by: Laurent Vivier <lvivier@redhat.com>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 vhost_user.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/vhost_user.c b/vhost_user.c
index daff9ab..f12dec5 100644
--- a/vhost_user.c
+++ b/vhost_user.c
@@ -913,7 +913,8 @@ static bool vu_get_protocol_features_exec(struct vu_dev *vdev,
 					  struct vhost_user_msg *msg)
 {
 	uint64_t features = 1ULL << VHOST_USER_PROTOCOL_F_REPLY_ACK |
-			    1ULL << VHOST_USER_PROTOCOL_F_LOG_SHMFD;
+			    1ULL << VHOST_USER_PROTOCOL_F_LOG_SHMFD |
+			    1ULL << VHOST_USER_PROTOCOL_F_DEVICE_STATE;
 
 	(void)vdev;
 	vmsg_set_reply_u64(msg, features);

From c96a88d550fcda3f1972aee395fcfda19905d0a4 Mon Sep 17 00:00:00 2001
From: Laurent Vivier <lvivier@redhat.com>
Date: Mon, 20 Jan 2025 14:15:22 +0100
Subject: [PATCH 183/382] vhost_user: remove ASSERT() on iovec number

Replace the ASSERT() on the number of iovec entries in the element,
and the one on the length of the first entry, with debug() messages.

Signed-off-by: Laurent Vivier <lvivier@redhat.com>
[sbrivio: Fix typo in failure message]
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 vu_common.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/vu_common.c b/vu_common.c
index 87a0d94..aa5ca7b 100644
--- a/vu_common.c
+++ b/vu_common.c
@@ -195,8 +195,12 @@ static void vu_handle_tx(struct vu_dev *vdev, int index,
 				        hdrlen);
 		} else {
 			/* vnet header can be in a separate iovec */
-			ASSERT(elem[count].out_num == 2);
-			ASSERT(elem[count].out_sg[0].iov_len == (size_t)hdrlen);
+			if (elem[count].out_num != 2)
+				debug("virtio-net transmit queue contains more than one buffer ([%d]: %u)",
+				      count, elem[count].out_num);
+			if (elem[count].out_sg[0].iov_len != (size_t)hdrlen)
+				debug("virtio-net transmit queue entry not aligned on hdrlen ([%d]: %d != %zu)",
+				count, hdrlen, elem[count].out_sg[0].iov_len);
 			tap_add_packet(vdev->context,
 				       elem[count].out_sg[1].iov_len,
 				       (char *)elem[count].out_sg[1].iov_base);

From 8757834d145a06b845aa0bb6bdfd4f93971b8d74 Mon Sep 17 00:00:00 2001
From: Stefano Brivio <sbrivio@redhat.com>
Date: Mon, 20 Jan 2025 16:49:30 +0100
Subject: [PATCH 184/382] tcp: Buffer sizes are *not* inherited on
 accept()/accept4()

...so it's pointless to set SO_RCVBUF and SO_SNDBUF on listening
sockets.

Call tcp_sock_set_bufsize() after accept4(), for inbound sockets.

As we didn't have large buffer sizes set for inbound sockets for
a long time (they are set explicitly only if the maximum size is
big enough, more than the ~200 KiB default), I ran some more
throughput tests for this one, and I see slightly better numbers
(say, 17 Gbps instead of 15 Gbps guest to host without vhost-user).
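
As a minimal sketch of the point (not passt code; 'listen_fd' is an
assumed listening socket), socket buffer options don't carry over to
connected sockets, so they have to be applied to the file descriptor
returned by accept4():

	int v = 4 * 1024 * 1024;	/* assumed target buffer size */
	int s;

	/* This only affects the listener itself, not accepted sockets */
	setsockopt(listen_fd, SOL_SOCKET, SO_SNDBUF, &v, sizeof(v));

	s = accept4(listen_fd, NULL, NULL, SOCK_NONBLOCK);
	if (s >= 0) {
		/* Buffer sizes have to be set on the new connection */
		setsockopt(s, SOL_SOCKET, SO_SNDBUF, &v, sizeof(v));
		setsockopt(s, SOL_SOCKET, SO_RCVBUF, &v, sizeof(v));
	}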

Fixes: 904b86ade7db ("tcp: Rework window handling, timers, add SO_RCVLOWAT and pools for sockets/pipes")
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
---
 tcp.c | 11 ++++-------
 1 file changed, 4 insertions(+), 7 deletions(-)

diff --git a/tcp.c b/tcp.c
index 3b3193a..a012b81 100644
--- a/tcp.c
+++ b/tcp.c
@@ -2057,6 +2057,8 @@ void tcp_listen_handler(const struct ctx *c, union epoll_ref ref,
 	if (s < 0)
 		goto cancel;
 
+	tcp_sock_set_bufsize(c, s);
+
 	/* FIXME: When listening port has a specific bound address, record that
 	 * as our address
 	 */
@@ -2260,7 +2262,6 @@ static int tcp_sock_init_one(const struct ctx *c, const union inany_addr *addr,
 	if (s < 0)
 		return s;
 
-	tcp_sock_set_bufsize(c, s);
 	return s;
 }
 
@@ -2317,9 +2318,7 @@ static void tcp_ns_sock_init4(const struct ctx *c, in_port_t port)
 
 	s = pif_sock_l4(c, EPOLL_TYPE_TCP_LISTEN, PIF_SPLICE, &inany_loopback4,
 			NULL, port, tref.u32);
-	if (s >= 0)
-		tcp_sock_set_bufsize(c, s);
-	else
+	if (s < 0)
 		s = -1;
 
 	if (c->tcp.fwd_out.mode == FWD_AUTO)
@@ -2343,9 +2342,7 @@ static void tcp_ns_sock_init6(const struct ctx *c, in_port_t port)
 
 	s = pif_sock_l4(c, EPOLL_TYPE_TCP_LISTEN, PIF_SPLICE, &inany_loopback6,
 			NULL, port, tref.u32);
-	if (s >= 0)
-		tcp_sock_set_bufsize(c, s);
-	else
+	if (s < 0)
 		s = -1;
 
 	if (c->tcp.fwd_out.mode == FWD_AUTO)

From 54bb972cfb2637f64a9718023a2351f8f259abdb Mon Sep 17 00:00:00 2001
From: Stefano Brivio <sbrivio@redhat.com>
Date: Fri, 17 Jan 2025 10:10:10 +0100
Subject: [PATCH 185/382] tcp: Disable Nagle's algorithm (set TCP_NODELAY) on
 all sockets

Following up on 725acd111ba3 ("tcp_splice: Set (again) TCP_NODELAY on
both sides"), David argues that, in general, we don't know what kind
of TCP traffic we're dealing with, on any side or path.

TCP segments might have been delivered to our socket with a PSH flag,
but we don't have a way to know about it.

Similarly, the guest might send us segments with PSH or URG set, but
we don't know if we should generally TCP_CORK sockets and uncork on
those flags, because that would assume they're running a Linux kernel
(and a particular version of it) matching the kernel that delivers
outbound packets for us.
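
For reference, the corking approach we're ruling out would look roughly
like this (sketch only, assuming we could infer PSH/URG boundaries,
which we can't):

	/* Hold back partial segments while the peer keeps streaming... */
	setsockopt(s, SOL_TCP, TCP_CORK, &((int){ 1 }), sizeof(int));
	/* ...and flush once we believe a PSH/URG boundary was reached */
	setsockopt(s, SOL_TCP, TCP_CORK, &((int){ 0 }), sizeof(int));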

Given that we can't make any assumption and everything might very well
be interactive traffic, disable Nagle's algorithm on all non-spliced
sockets as well.

After all, John Nagle himself is nowadays recommending that delayed
ACKs should never be enabled together with his algorithm, but we
don't have a practical way to ensure that our environment is free from
delayed ACKs (TCP_QUICKACK is not really usable for this purpose):

  https://news.ycombinator.com/item?id=34180239

Suggested-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
---
 tcp.c | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/tcp.c b/tcp.c
index a012b81..4d6a6b3 100644
--- a/tcp.c
+++ b/tcp.c
@@ -756,6 +756,19 @@ static void tcp_sock_set_bufsize(const struct ctx *c, int s)
 		trace("TCP: failed to set SO_SNDBUF to %i", v);
 }
 
+/**
+ * tcp_sock_set_nodelay() - Set TCP_NODELAY option (disable Nagle's algorithm)
+ * @s:		Socket, can be -1 to avoid check in the caller
+ */
+static void tcp_sock_set_nodelay(int s)
+{
+	if (s == -1)
+		return;
+
+	if (setsockopt(s, SOL_TCP, TCP_NODELAY, &((int){ 1 }), sizeof(int)))
+		debug("TCP: failed to set TCP_NODELAY on socket %i", s);
+}
+
 /**
  * tcp_update_csum() - Calculate TCP checksum
  * @psum:	Unfolded partial checksum of the IPv4 or IPv6 pseudo-header
@@ -1285,6 +1298,7 @@ static int tcp_conn_new_sock(const struct ctx *c, sa_family_t af)
 		return -errno;
 
 	tcp_sock_set_bufsize(c, s);
+	tcp_sock_set_nodelay(s);
 
 	return s;
 }
@@ -2058,6 +2072,7 @@ void tcp_listen_handler(const struct ctx *c, union epoll_ref ref,
 		goto cancel;
 
 	tcp_sock_set_bufsize(c, s);
+	tcp_sock_set_nodelay(s);
 
 	/* FIXME: When listening port has a specific bound address, record that
 	 * as our address

From db2c91ae86c7c0d1d068714db2342b9057506148 Mon Sep 17 00:00:00 2001
From: Stefano Brivio <sbrivio@redhat.com>
Date: Mon, 20 Jan 2025 18:36:30 +0100
Subject: [PATCH 186/382] tcp: Set ACK flag on *all* RST segments, even for
 client in SYN-SENT state

Somewhat curiously, RFC 9293, section 3.10.7.3, states:

   If the state is SYN-SENT, then
   [...]

      Second, check the RST bit:
      -  If the RST bit is set,
      [...]

         o  If the ACK was acceptable, then signal to the user "error:
            connection reset", drop the segment, enter CLOSED state,
            delete TCB, and return.  Otherwise (no ACK), drop the
            segment and return.

which matches verbatim RFC 793, pages 66-67, and is implemented as-is
by tcp_rcv_synsent_state_process() in the Linux kernel, that is:

	/* No ACK in the segment */

	if (th->rst) {
		/* rfc793:
		 * "If the RST bit is set
		 *
		 *      Otherwise (no ACK) drop the segment and return."
		 */

		goto discard_and_undo;
	}

meaning that if a client is in SYN-SENT state, and we send a RST
segment once we realise that we can't establish the outbound
connection, the client will ignore our segment and will need to
pointlessly wait until the connection times out instead of aborting
it right away.

The ACK flag on a RST, in this case, doesn't really seem to have any
function, but we must set it nevertheless. The ACK sequence number is
already correct because we always set it before calling
tcp_prepare_flags(), whenever relevant.

This leaves us with no cases where we should *not* set the ACK flag
on non-SYN segments, so always set the ACK flag for RST segments.

Note that non-SYN, non-RST segments were already covered by commit
4988e2b40631 ("tcp: Unconditionally force ACK for all !SYN, !RST
packets").

Reported-by: Dirk Janssen <Dirk.Janssen@schiphol.nl>
Reported-by: Roeland van de Pol <Roeland.van.de.Pol@schiphol.nl>
Reported-by: Robert Floor <Robert.Floor@schiphol.nl>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 tcp.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tcp.c b/tcp.c
index 4d6a6b3..c89f323 100644
--- a/tcp.c
+++ b/tcp.c
@@ -1147,7 +1147,7 @@ int tcp_prepare_flags(const struct ctx *c, struct tcp_tap_conn *conn,
 
 		*opts = TCP_SYN_OPTS(mss, conn->ws_to_tap);
 		*optlen = sizeof(*opts);
-	} else if (!(flags & RST)) {
+	} else {
 		flags |= ACK;
 	}
 

From ec5c4d936dafcbc5e07caeb594dfd771050da221 Mon Sep 17 00:00:00 2001
From: Stefano Brivio <sbrivio@redhat.com>
Date: Tue, 21 Jan 2025 00:39:06 +0100
Subject: [PATCH 187/382] tcp: Set PSH flag for last incoming packets in a
 batch
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

So far we omitted setting PSH flags for inbound traffic altogether: as
we ignore the nature of the data we're sending, we can't conclude that
some data is more or less urgent. This works fine with Linux guests,
as the Linux kernel doesn't do much with it, on input: it will
generally deliver data to the application layer without delay.

However, with Windows, things change: if we don't set the PSH flag on
interactive inbound traffic, we can expect long delays before the data
is delivered to the application.

This is very visible with RDP, where packets we send on behalf of the
RDP client are delivered with delays exceeding one second:

  $ tshark -r rdp.pcap -td -Y 'frame.number in { 33170 .. 33173 }' --disable-protocol tls
  33170   0.030296 93.235.154.248 → 88.198.0.164 54 TCP 49012 → 3389 [ACK] Seq=13820 Ack=285229 Win=387968 Len=0
  33171   0.985412 88.198.0.164 → 93.235.154.248 105 TCP 3389 → 49012 [PSH, ACK] Seq=285229 Ack=13820 Win=63198 Len=51
  33172   0.030373 93.235.154.248 → 88.198.0.164 54 TCP 49012 → 3389 [ACK] Seq=13820 Ack=285280 Win=387968 Len=0
  33173   1.383776 88.198.0.164 → 93.235.154.248 424 TCP 3389 → 49012 [PSH, ACK] Seq=285280 Ack=13820 Win=63198 Len=370

in this example (packet capture taken by passt), frame #33172 is a
mouse event sent by the RDP client, and frame #33173 is the first
event (display reacting to click) sent back by the server. This
appears as a 1.4 s delay before we get frame #33173.

If we set PSH, instead:

  $ tshark -r rdp_psh.pcap -td -Y 'frame.number in { 314 .. 317 }' --disable-protocol tls
  314   0.002503 93.235.154.248 → 88.198.0.164 170 TCP 51066 → 3389 [PSH, ACK] Seq=7779 Ack=74047 Win=31872 Len=116
  315   0.000557 88.198.0.164 → 93.235.154.248 54 TCP 3389 → 51066 [ACK] Seq=79162 Ack=7895 Win=62872 Len=0
  316   0.012752 93.235.154.248 → 88.198.0.164 170 TCP 51066 → 3389 [PSH, ACK] Seq=7895 Ack=79162 Win=31872 Len=116
  317   0.011927 88.198.0.164 → 93.235.154.248 107 TCP 3389 → 51066 [PSH, ACK] Seq=79162 Ack=8011 Win=62756 Len=53

here, in frame #316, our mouse event is delivered without a delay and
receives a response in approximately 12 ms.

Set PSH on the last segment for any batch we dequeue from the socket,
that is, set it whenever we know that we might not be sending data to
the same port for a while.

Reported-by: NN708
Link: https://bugs.passt.top/show_bug.cgi?id=107
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 tcp_buf.c | 11 ++++++++---
 tcp_vu.c  |  7 +++++--
 2 files changed, 13 insertions(+), 5 deletions(-)

diff --git a/tcp_buf.c b/tcp_buf.c
index cbefa42..72d99c5 100644
--- a/tcp_buf.c
+++ b/tcp_buf.c
@@ -239,9 +239,10 @@ int tcp_buf_send_flag(const struct ctx *c, struct tcp_tap_conn *conn, int flags)
  * @dlen:	TCP payload length
  * @no_csum:	Don't compute IPv4 checksum, use the one from previous buffer
  * @seq:	Sequence number to be sent
+ * @push:	Set PSH flag, last segment in a batch
  */
 static void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn,
-			    ssize_t dlen, int no_csum, uint32_t seq)
+			    ssize_t dlen, int no_csum, uint32_t seq, bool push)
 {
 	struct tcp_payload_t *payload;
 	const uint16_t *check = NULL;
@@ -268,6 +269,7 @@ static void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn,
 	payload->th.th_x2 = 0;
 	payload->th.th_flags = 0;
 	payload->th.ack = 1;
+	payload->th.psh = push;
 	iov[TCP_IOV_PAYLOAD].iov_len = dlen + sizeof(struct tcphdr);
 	tcp_l2_buf_fill_headers(conn, iov, check, seq, false);
 	if (++tcp_payload_used > TCP_FRAMES_MEM - 1)
@@ -402,11 +404,14 @@ int tcp_buf_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn)
 	seq = conn->seq_to_tap;
 	for (i = 0; i < send_bufs; i++) {
 		int no_csum = i && i != send_bufs - 1 && tcp_payload_used;
+		bool push = false;
 
-		if (i == send_bufs - 1)
+		if (i == send_bufs - 1) {
 			dlen = last_len;
+			push = true;
+		}
 
-		tcp_data_to_tap(c, conn, dlen, no_csum, seq);
+		tcp_data_to_tap(c, conn, dlen, no_csum, seq, push);
 		seq += dlen;
 	}
 
diff --git a/tcp_vu.c b/tcp_vu.c
index a216bb1..fad7065 100644
--- a/tcp_vu.c
+++ b/tcp_vu.c
@@ -289,10 +289,11 @@ static ssize_t tcp_vu_sock_recv(const struct ctx *c,
  * @iov_cnt:		Number of entries in @iov
  * @check:		Checksum, if already known
  * @no_tcp_csum:	Do not set TCP checksum
+ * @push:		Set PSH flag, last segment in a batch
  */
 static void tcp_vu_prepare(const struct ctx *c, struct tcp_tap_conn *conn,
 			   struct iovec *iov, size_t iov_cnt,
-			   const uint16_t **check, bool no_tcp_csum)
+			   const uint16_t **check, bool no_tcp_csum, bool push)
 {
 	const struct flowside *toside = TAPFLOW(conn);
 	bool v6 = !(inany_v4(&toside->eaddr) && inany_v4(&toside->oaddr));
@@ -334,6 +335,7 @@ static void tcp_vu_prepare(const struct ctx *c, struct tcp_tap_conn *conn,
 	memset(th, 0, sizeof(*th));
 	th->doff = sizeof(*th) / 4;
 	th->ack = 1;
+	th->psh = push;
 
 	tcp_fill_headers(conn, NULL, ip4h, ip6h, th, &payload,
 			 *check, conn->seq_to_tap, no_tcp_csum);
@@ -443,6 +445,7 @@ int tcp_vu_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn)
 		struct iovec *iov = &elem[head[i]].in_sg[0];
 		int buf_cnt = head[i + 1] - head[i];
 		ssize_t dlen = iov_size(iov, buf_cnt) - hdrlen;
+		bool push = i == head_cnt - 1;
 
 		vu_set_vnethdr(vdev, iov->iov_base, buf_cnt);
 
@@ -451,7 +454,7 @@ int tcp_vu_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn)
 			check = NULL;
 		previous_dlen = dlen;
 
-		tcp_vu_prepare(c, conn, iov, buf_cnt, &check, !*c->pcap);
+		tcp_vu_prepare(c, conn, iov, buf_cnt, &check, !*c->pcap, push);
 
 		if (*c->pcap) {
 			pcap_iov(iov, buf_cnt,

From 4f2c8e79130ef3d6132e34c49746e397745f9d73 Mon Sep 17 00:00:00 2001
From: Laurent Vivier <lvivier@redhat.com>
Date: Tue, 21 Jan 2025 14:16:02 +0100
Subject: [PATCH 188/382] vhost_user: Drop packet with unsupported iovec array

If the iovec array cannot be managed, drop it rather than
passing the second entry to tap_add_packet().

Signed-off-by: Laurent Vivier <lvivier@redhat.com>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 vu_common.c | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/vu_common.c b/vu_common.c
index aa5ca7b..f43d8ac 100644
--- a/vu_common.c
+++ b/vu_common.c
@@ -195,15 +195,17 @@ static void vu_handle_tx(struct vu_dev *vdev, int index,
 				        hdrlen);
 		} else {
 			/* vnet header can be in a separate iovec */
-			if (elem[count].out_num != 2)
+			if (elem[count].out_num != 2) {
 				debug("virtio-net transmit queue contains more than one buffer ([%d]: %u)",
 				      count, elem[count].out_num);
-			if (elem[count].out_sg[0].iov_len != (size_t)hdrlen)
+			} else if (elem[count].out_sg[0].iov_len != (size_t)hdrlen) {
 				debug("virtio-net transmit queue entry not aligned on hdrlen ([%d]: %d != %zu)",
-				count, hdrlen, elem[count].out_sg[0].iov_len);
-			tap_add_packet(vdev->context,
-				       elem[count].out_sg[1].iov_len,
-				       (char *)elem[count].out_sg[1].iov_base);
+				      count, hdrlen, elem[count].out_sg[0].iov_len);
+			} else {
+				tap_add_packet(vdev->context,
+					       elem[count].out_sg[1].iov_len,
+					       (char *)elem[count].out_sg[1].iov_base);
+			}
 		}
 
 		count++;

From d477a1fb03c5995d07e481b25dd94fc9e9bc02f2 Mon Sep 17 00:00:00 2001
From: Stefano Brivio <sbrivio@redhat.com>
Date: Thu, 23 Jan 2025 08:55:49 +0100
Subject: [PATCH 189/382] netlink: Skip loopback interface while looking for a
 template

There might be reasons to have routes on the loopback interface, for
example Any-IP/AnyIP routes as implemented by Linux kernel commit
ab79ad14a2d5 ("ipv6: Implement Any-IP support for IPv6.").

If we use the loopback interface as a template, though, we'll pick
'lo' (typically) as interface name for our tap interface, but we'll
already have an interface called 'lo' in the target namespace, and as
we TUNSETIFF on it, we'll fail with EINVAL, because it's not a tap
interface.

Skip the loopback interface while looking for a template interface or,
more accurately, skip the interface with index 1.

Strictly speaking, we should fetch interface flags via RTM_GETLINK
instead, and check for IFF_LOOPBACK, but interleaving that request
while we're iterating over routes is unnecessarily complicated.
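
For reference, an explicit check could look like this sketch (not what
this patch does, reusing 'thisifi' from the loop, and using
ioctl(SIOCGIFFLAGS) instead of RTM_GETLINK for brevity):

	struct ifreq ifr = { 0 };
	int fd = socket(AF_INET, SOCK_DGRAM, 0);
	bool loopback = false;

	if (fd >= 0 && if_indextoname(thisifi, ifr.ifr_name) &&
	    !ioctl(fd, SIOCGIFFLAGS, &ifr))
		loopback = ifr.ifr_flags & IFF_LOOPBACK;

	if (fd >= 0)
		close(fd);
	if (loopback)
		continue;	/* not a usable template interface */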

Link: https://www.reddit.com/r/podman/comments/1i6pj7u/starting_pod_without_external_network/
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
---
 netlink.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/netlink.c b/netlink.c
index 0407692..37d8b5b 100644
--- a/netlink.c
+++ b/netlink.c
@@ -297,6 +297,10 @@ unsigned int nl_get_ext_if(int s, sa_family_t af)
 		if (!thisifi)
 			continue; /* No interface for this route */
 
+		/* Skip 'lo': we should test IFF_LOOPBACK, but keep it simple */
+		if (thisifi == 1)
+			continue;
+
 		/* Skip routes to link-local addresses */
 		if (af == AF_INET && dst &&
 		    IN4_IS_PREFIX_LINKLOCAL(dst, rtm->rtm_dst_len))

From dd6a6854c73a09c4091c1776ee7f349d1e1f966c Mon Sep 17 00:00:00 2001
From: Laurent Vivier <lvivier@redhat.com>
Date: Fri, 24 Jan 2025 20:07:41 +0100
Subject: [PATCH 190/382] vhost-user: Implement an empty VHOST_USER_SEND_RARP
 command

passt can't, and doesn't need to, manage the broadcast of a fake RARP,
but QEMU will report an error message if passt doesn't implement the
command.

Implement an empty SEND_RARP command to silence the QEMU error message.

Signed-off-by: Laurent Vivier <lvivier@redhat.com>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 vhost_user.c | 30 +++++++++++++++++++++++++++++-
 1 file changed, 29 insertions(+), 1 deletion(-)

diff --git a/vhost_user.c b/vhost_user.c
index f12dec5..6bf0dda 100644
--- a/vhost_user.c
+++ b/vhost_user.c
@@ -914,7 +914,8 @@ static bool vu_get_protocol_features_exec(struct vu_dev *vdev,
 {
 	uint64_t features = 1ULL << VHOST_USER_PROTOCOL_F_REPLY_ACK |
 			    1ULL << VHOST_USER_PROTOCOL_F_LOG_SHMFD |
-			    1ULL << VHOST_USER_PROTOCOL_F_DEVICE_STATE;
+			    1ULL << VHOST_USER_PROTOCOL_F_DEVICE_STATE |
+			    1ULL << VHOST_USER_PROTOCOL_F_RARP;
 
 	(void)vdev;
 	vmsg_set_reply_u64(msg, features);
@@ -981,6 +982,32 @@ static bool vu_set_vring_enable_exec(struct vu_dev *vdev,
 	return false;
 }
 
+/**
+ * vu_send_rarp_exec() - vhost-user specification says: "Broadcast a fake
+ * 			     RARP to notify the migration is terminated",
+ * 			     but passt doesn't need to update any ARP table,
+ * 			     so do nothing to silence QEMU bogus error message
+ * @vdev:	vhost-user device
+ * @vmsg:	vhost-user message
+ *
+ * Return: False as no reply is requested
+ */
+static bool vu_send_rarp_exec(struct vu_dev *vdev,
+			      struct vhost_user_msg *msg)
+{
+	char macstr[ETH_ADDRSTRLEN];
+
+	(void)vdev;
+
+	/* ignore the command */
+
+	debug("Ignore command VHOST_USER_SEND_RARP for %s",
+	      eth_ntop((unsigned char *)&msg->payload.u64, macstr,
+		       sizeof(macstr)));
+
+	return false;
+}
+
 /**
  * vu_set_migration_watch() - Add the migration file descriptor to epoll
  * @vdev:	vhost-user device
@@ -1177,6 +1204,7 @@ static bool (*vu_handle[VHOST_USER_MAX])(struct vu_dev *vdev,
 	[VHOST_USER_SET_VRING_CALL]	   = vu_set_vring_call_exec,
 	[VHOST_USER_SET_VRING_ERR]	   = vu_set_vring_err_exec,
 	[VHOST_USER_SET_VRING_ENABLE]	   = vu_set_vring_enable_exec,
+	[VHOST_USER_SEND_RARP]		   = vu_send_rarp_exec,
 	[VHOST_USER_SET_DEVICE_STATE_FD]   = vu_set_device_state_fd_exec,
 	[VHOST_USER_CHECK_DEVICE_STATE]    = vu_check_device_state_exec,
 };

From 10c4a9e1b383becd7366bda986f886675f7c4cb2 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Thu, 30 Jan 2025 17:52:10 +1100
Subject: [PATCH 191/382] tcp: Always pass NULL event with EPOLL_CTL_DEL

In tcp_epoll_ctl() we pass an event pointer with EPOLL_CTL_DEL, even though
it will be ignored.  It's possible this was a workaround for pre-2.6.9
kernels which required a non-NULL pointer here, but we rely on the kernel
accepting NULL events for EPOLL_CTL_DEL in lots of other places.  Use
NULL instead for simplicity and consistency.
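
For context, the difference boils down to this sketch ('epollfd' and
'fd' assumed):

	/* Pre-2.6.9 kernels insisted on a non-NULL, albeit ignored, pointer */
	struct epoll_event ev = { 0 };
	epoll_ctl(epollfd, EPOLL_CTL_DEL, fd, &ev);

	/* Any kernel we support accepts NULL, which is what we use now */
	epoll_ctl(epollfd, EPOLL_CTL_DEL, fd, NULL);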

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 tcp.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tcp.c b/tcp.c
index c89f323..4eed82b 100644
--- a/tcp.c
+++ b/tcp.c
@@ -468,9 +468,9 @@ static int tcp_epoll_ctl(const struct ctx *c, struct tcp_tap_conn *conn)
 
 	if (conn->events == CLOSED) {
 		if (conn->in_epoll)
-			epoll_ctl(c->epollfd, EPOLL_CTL_DEL, conn->sock, &ev);
+			epoll_ctl(c->epollfd, EPOLL_CTL_DEL, conn->sock, NULL);
 		if (conn->timer != -1)
-			epoll_ctl(c->epollfd, EPOLL_CTL_DEL, conn->timer, &ev);
+			epoll_ctl(c->epollfd, EPOLL_CTL_DEL, conn->timer, NULL);
 		return 0;
 	}
 

From 0349cf637f64a5128846c79d9537849e1ed3e1cc Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Thu, 30 Jan 2025 17:52:11 +1100
Subject: [PATCH 192/382] util: Rename and make global vu_remove_watch()

vu_remove_watch() is used in vhost_user.c to remove an fd from the global
epoll set.  There's nothing really vhost user specific about it though,
so rename, move to util.c and use it in a bunch of places outside
vhost_user.c where it makes things marginally more readable.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 icmp.c       |  2 +-
 tap.c        |  2 +-
 tcp.c        |  4 ++--
 tcp_splice.c |  4 ++--
 udp_flow.c   |  2 +-
 util.c       | 10 ++++++++++
 util.h       |  1 +
 vhost_user.c | 21 +++++----------------
 vu_common.c  |  6 ++----
 9 files changed, 25 insertions(+), 27 deletions(-)

diff --git a/icmp.c b/icmp.c
index 143e93b..bcf498d 100644
--- a/icmp.c
+++ b/icmp.c
@@ -150,7 +150,7 @@ unexpected:
 static void icmp_ping_close(const struct ctx *c,
 			    const struct icmp_ping_flow *pingf)
 {
-	epoll_ctl(c->epollfd, EPOLL_CTL_DEL, pingf->sock, NULL);
+	epoll_del(c, pingf->sock);
 	close(pingf->sock);
 	flow_hash_remove(c, FLOW_SIDX(pingf, INISIDE));
 }
diff --git a/tap.c b/tap.c
index cd32a90..772648f 100644
--- a/tap.c
+++ b/tap.c
@@ -1005,7 +1005,7 @@ void tap_sock_reset(struct ctx *c)
 		exit(EXIT_SUCCESS);
 
 	/* Close the connected socket, wait for a new connection */
-	epoll_ctl(c->epollfd, EPOLL_CTL_DEL, c->fd_tap, NULL);
+	epoll_del(c, c->fd_tap);
 	close(c->fd_tap);
 	c->fd_tap = -1;
 	if (c->mode == MODE_VU)
diff --git a/tcp.c b/tcp.c
index 4eed82b..7787381 100644
--- a/tcp.c
+++ b/tcp.c
@@ -468,9 +468,9 @@ static int tcp_epoll_ctl(const struct ctx *c, struct tcp_tap_conn *conn)
 
 	if (conn->events == CLOSED) {
 		if (conn->in_epoll)
-			epoll_ctl(c->epollfd, EPOLL_CTL_DEL, conn->sock, NULL);
+			epoll_del(c, conn->sock);
 		if (conn->timer != -1)
-			epoll_ctl(c->epollfd, EPOLL_CTL_DEL, conn->timer, NULL);
+			epoll_del(c, conn->timer);
 		return 0;
 	}
 
diff --git a/tcp_splice.c b/tcp_splice.c
index 3a000ff..5db1d62 100644
--- a/tcp_splice.c
+++ b/tcp_splice.c
@@ -200,8 +200,8 @@ static void conn_flag_do(const struct ctx *c, struct tcp_splice_conn *conn,
 	}
 
 	if (flag == CLOSING) {
-		epoll_ctl(c->epollfd, EPOLL_CTL_DEL, conn->s[0], NULL);
-		epoll_ctl(c->epollfd, EPOLL_CTL_DEL, conn->s[1], NULL);
+		epoll_del(c, conn->s[0]);
+		epoll_del(c, conn->s[1]);
 	}
 }
 
diff --git a/udp_flow.c b/udp_flow.c
index 9fd7d06..7fae81d 100644
--- a/udp_flow.c
+++ b/udp_flow.c
@@ -52,7 +52,7 @@ void udp_flow_close(const struct ctx *c, struct udp_flow *uflow)
 
 	if (uflow->s[TGTSIDE] >= 0) {
 		/* But the flow specific one needs to be removed */
-		epoll_ctl(c->epollfd, EPOLL_CTL_DEL, uflow->s[TGTSIDE], NULL);
+		epoll_del(c, uflow->s[TGTSIDE]);
 		close(uflow->s[TGTSIDE]);
 		uflow->s[TGTSIDE] = -1;
 	}
diff --git a/util.c b/util.c
index 11973c4..c7b09f0 100644
--- a/util.c
+++ b/util.c
@@ -837,3 +837,13 @@ void raw_random(void *buf, size_t buflen)
 	if (random_read < buflen)
 		die("Unexpected EOF on random data source");
 }
+
+/**
+ * epoll_del() - Remove a file descriptor from our passt epoll
+ * @c:		Execution context
+ * @fd:		File descriptor to remove
+ */
+void epoll_del(const struct ctx *c, int fd)
+{
+	epoll_ctl(c->epollfd, EPOLL_CTL_DEL, fd, NULL);
+}
diff --git a/util.h b/util.h
index d02333d..800a28b 100644
--- a/util.h
+++ b/util.h
@@ -276,6 +276,7 @@ static inline bool mod_between(unsigned x, unsigned i, unsigned j, unsigned m)
 #define FPRINTF(f, ...)	(void)fprintf(f, __VA_ARGS__)
 
 void raw_random(void *buf, size_t buflen);
+void epoll_del(const struct ctx *c, int fd);
 
 /*
  * Starting from glibc 2.40.9000 and commit 25a5eb4010df ("string: strerror,
diff --git a/vhost_user.c b/vhost_user.c
index 6bf0dda..bbbf504 100644
--- a/vhost_user.c
+++ b/vhost_user.c
@@ -162,17 +162,6 @@ static void vmsg_close_fds(const struct vhost_user_msg *vmsg)
 		close(vmsg->fds[i]);
 }
 
-/**
- * vu_remove_watch() - Remove a file descriptor from our passt epoll
- * 		       file descriptor
- * @vdev:	vhost-user device
- * @fd:		file descriptor to remove
- */
-static void vu_remove_watch(const struct vu_dev *vdev, int fd)
-{
-	epoll_ctl(vdev->context->epollfd, EPOLL_CTL_DEL, fd, NULL);
-}
-
 /**
  * vmsg_set_reply_u64() - Set reply payload.u64 and clear request flags
  * 			  and fd_num
@@ -748,7 +737,7 @@ static bool vu_get_vring_base_exec(struct vu_dev *vdev,
 		vdev->vq[idx].call_fd = -1;
 	}
 	if (vdev->vq[idx].kick_fd != -1) {
-		vu_remove_watch(vdev, vdev->vq[idx].kick_fd);
+		epoll_del(vdev->context, vdev->vq[idx].kick_fd);
 		close(vdev->vq[idx].kick_fd);
 		vdev->vq[idx].kick_fd = -1;
 	}
@@ -816,7 +805,7 @@ static bool vu_set_vring_kick_exec(struct vu_dev *vdev,
 	vu_check_queue_msg_file(msg);
 
 	if (vdev->vq[idx].kick_fd != -1) {
-		vu_remove_watch(vdev, vdev->vq[idx].kick_fd);
+		epoll_del(vdev->context, vdev->vq[idx].kick_fd);
 		close(vdev->vq[idx].kick_fd);
 		vdev->vq[idx].kick_fd = -1;
 	}
@@ -1063,7 +1052,7 @@ static bool vu_set_device_state_fd_exec(struct vu_dev *vdev,
 		die("Invalide device_state_fd direction: %d", direction);
 
 	if (vdev->device_state_fd != -1) {
-		vu_remove_watch(vdev, vdev->device_state_fd);
+		epoll_del(vdev->context, vdev->device_state_fd);
 		close(vdev->device_state_fd);
 	}
 
@@ -1145,7 +1134,7 @@ void vu_cleanup(struct vu_dev *vdev)
 			vq->err_fd = -1;
 		}
 		if (vq->kick_fd != -1) {
-			vu_remove_watch(vdev, vq->kick_fd);
+			epoll_del(vdev->context, vq->kick_fd);
 			close(vq->kick_fd);
 			vq->kick_fd = -1;
 		}
@@ -1169,7 +1158,7 @@ void vu_cleanup(struct vu_dev *vdev)
 	vu_close_log(vdev);
 
 	if (vdev->device_state_fd != -1) {
-		vu_remove_watch(vdev, vdev->device_state_fd);
+		epoll_del(vdev->context, vdev->device_state_fd);
 		close(vdev->device_state_fd);
 		vdev->device_state_fd = -1;
 		vdev->device_state_result = -1;
diff --git a/vu_common.c b/vu_common.c
index f43d8ac..2c12dca 100644
--- a/vu_common.c
+++ b/vu_common.c
@@ -325,8 +325,7 @@ void vu_migrate(struct vu_dev *vdev, uint32_t events)
 		/* value to be returned by VHOST_USER_CHECK_DEVICE_STATE */
 		vdev->device_state_result = ret == -1 ? -1 : 0;
 		/* Closing the file descriptor signals the end of transfer */
-		epoll_ctl(vdev->context->epollfd, EPOLL_CTL_DEL,
-			  vdev->device_state_fd, NULL);
+		epoll_del(vdev->context, vdev->device_state_fd);
 		close(vdev->device_state_fd);
 		vdev->device_state_fd = -1;
 	} else if (events & EPOLLIN) {
@@ -346,8 +345,7 @@ void vu_migrate(struct vu_dev *vdev, uint32_t events)
 		debug("Closing migration channel");
 
 		/* The end of file signals the end of the transfer. */
-		epoll_ctl(vdev->context->epollfd, EPOLL_CTL_DEL,
-			  vdev->device_state_fd, NULL);
+		epoll_del(vdev->context, vdev->device_state_fd);
 		close(vdev->device_state_fd);
 		vdev->device_state_fd = -1;
 	}

From dcd6d8191aa29f232593ad2819a197e135f8cac8 Mon Sep 17 00:00:00 2001
From: Stefano Brivio <sbrivio@redhat.com>
Date: Fri, 31 Jan 2025 19:13:00 +0100
Subject: [PATCH 193/382] tcp: Add HOSTSIDE(x), HOSTFLOW(x) macros

Those are symmetric to TAPSIDE(x)/TAPFLOW(x) and I'll use them in
the next patch to extract 'oport' in order to re-bind sockets to
the original socket-side local port.

Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
---
 tcp_internal.h | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/tcp_internal.h b/tcp_internal.h
index 94e5780..9cf31f5 100644
--- a/tcp_internal.h
+++ b/tcp_internal.h
@@ -38,9 +38,13 @@
 #define OPT_SACK	5
 #define OPT_TS		8
 
-#define TAPSIDE(conn_)	((conn_)->f.pif[1] == PIF_TAP)
-#define TAPFLOW(conn_)	(&((conn_)->f.side[TAPSIDE(conn_)]))
-#define TAP_SIDX(conn_)	(FLOW_SIDX((conn_), TAPSIDE(conn_)))
+#define TAPSIDE(conn_)		((conn_)->f.pif[1] == PIF_TAP)
+#define TAPFLOW(conn_)		(&((conn_)->f.side[TAPSIDE(conn_)]))
+#define TAP_SIDX(conn_)		(FLOW_SIDX((conn_), TAPSIDE(conn_)))
+
+#define HOSTSIDE(conn_)		((conn_)->f.pif[1] == PIF_HOST)
+#define HOSTFLOW(conn_)		(&((conn_)->f.side[HOSTSIDE(conn_)]))
+#define HOST_SIDX(conn_)	(FLOW_SIDX((conn_), HOSTSIDE(conn_)))
 
 #define CONN_V4(conn)		(!!inany_v4(&TAPFLOW(conn)->oaddr))
 #define CONN_V6(conn)		(!CONN_V4(conn))

From bf2860819d868c7d116923e9b5d798d410d38715 Mon Sep 17 00:00:00 2001
From: 7ppKb5bW <pONy4THS@protonmail.com>
Date: Sun, 2 Feb 2025 19:21:21 +0000
Subject: [PATCH 194/382] pasta.te: fix demo.sh and remove one duplicate rule

On Fedora 41, without "allow pasta_t unconfined_t:dir read",
/usr/bin/pasta can't open /proc/[pid]/ns, which is required by
pasta_netns_quit_init().

This patch also removes one duplicate rule, "allow pasta_t nsfs_t:file
read;": the "allow pasta_t nsfs_t:file { open read };" rule at line 123
is enough.

Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 contrib/selinux/pasta.te | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/contrib/selinux/pasta.te b/contrib/selinux/pasta.te
index 69be081..d0ff0cc 100644
--- a/contrib/selinux/pasta.te
+++ b/contrib/selinux/pasta.te
@@ -171,7 +171,7 @@ allow pasta_t init_t:lnk_file read;
 allow pasta_t init_t:unix_stream_socket connectto;
 allow pasta_t init_t:dbus send_msg;
 allow pasta_t init_t:system status;
-allow pasta_t unconfined_t:dir search;
+allow pasta_t unconfined_t:dir { read search };
 allow pasta_t unconfined_t:file read;
 allow pasta_t unconfined_t:lnk_file read;
 allow pasta_t self:process { setpgid setcap };
@@ -192,8 +192,6 @@ allow pasta_t sysctl_net_t:dir search;
 allow pasta_t sysctl_net_t:file { open read write };
 allow pasta_t kernel_t:system module_request;
 
-allow pasta_t nsfs_t:file read;
-
 allow pasta_t proc_t:dir mounton;
 allow pasta_t proc_t:filesystem mount;
 allow pasta_t net_conf_t:lnk_file read;

From 722d347c1932f630a53ba05ea0270a651ed601b2 Mon Sep 17 00:00:00 2001
From: Stefano Brivio <sbrivio@redhat.com>
Date: Mon, 3 Feb 2025 08:19:16 +0100
Subject: [PATCH 195/382] tcp: Don't reset outbound connection on SYN retries

Reported by somebody on IRC: if the server has considerable latency,
it might happen that the client retries sending SYN segments for the
same flow while we're still in a TAP_SYN_RCVD, non-ESTABLISHED state.

In that case, we shouldn't go with the blanket assumption that we need
to reset the connection on any unexpected segment: RFC 9293 explicitly
mentions this case in Figure 8: Recovery from Old Duplicate SYN,
section 3.5. It doesn't make sense for us to set a specific sequence
number, socket-side, but we should definitely wait and see.

Ignoring the duplicate SYN segment should also be compatible with
section 3.10.7.3, SYN-SENT STATE, which mentions updating sequence
numbers socket-side (which we can't do anyway), but certainly not
resetting the connection.

Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
---
 tcp.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/tcp.c b/tcp.c
index 7787381..51ad692 100644
--- a/tcp.c
+++ b/tcp.c
@@ -1920,6 +1920,9 @@ int tcp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af,
 
 	/* Establishing connection from tap */
 	if (conn->events & TAP_SYN_RCVD) {
+		if (th->syn && !th->ack && !th->fin)
+			return 1;	/* SYN retry: ignore and keep waiting */
+
 		if (!(conn->events & TAP_SYN_ACK_SENT))
 			goto reset;
 

From b75ad159e8a13a10ce1fb4b86503636420da126d Mon Sep 17 00:00:00 2001
From: Stefano Brivio <sbrivio@redhat.com>
Date: Sun, 2 Feb 2025 20:49:58 +0100
Subject: [PATCH 196/382] vhost_user: On 32-bit ARM, mmap() is not available,
 mmap2() is used instead

Link: https://buildd.debian.org/status/fetch.php?pkg=passt&arch=armel&ver=0.0%7Egit20250121.4f2c8e7-1&stamp=1737477467&raw=0
Link: https://buildd.debian.org/status/fetch.php?pkg=passt&arch=armhf&ver=0.0%7Egit20250121.4f2c8e7-1&stamp=1737477421&raw=0
Fixes: 31117b27c6c9 ("vhost-user: introduce vhost-user API")
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
---
 vhost_user.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vhost_user.c b/vhost_user.c
index bbbf504..58baee2 100644
--- a/vhost_user.c
+++ b/vhost_user.c
@@ -419,7 +419,7 @@ static bool map_ring(struct vu_dev *vdev, struct vu_virtq *vq)
  *
  * Return: False as no reply is requested
  *
- * #syscalls:vu mmap munmap
+ * #syscalls:vu mmap|mmap2 munmap
  */
 static bool vu_set_mem_table_exec(struct vu_dev *vdev,
 				  struct vhost_user_msg *msg)

From 71fa7362776bfa075d83383b600d2beeab923893 Mon Sep 17 00:00:00 2001
From: Stefano Brivio <sbrivio@redhat.com>
Date: Sun, 2 Feb 2025 20:53:47 +0100
Subject: [PATCH 197/382] tcp_splice, udp_flow: fcntl64() support on PPC64
 depends on glibc version

I explicitly added fcntl64() to the list of allowed system calls for
PPC64 a while ago, and now it turns out it's not available in recent
Debian builds. The warning from seccomp.sh is harmless, because we
unconditionally try to enable fcntl() as well, but take care of it
anyway.

Link: https://buildd.debian.org/status/fetch.php?pkg=passt&arch=ppc64&ver=0.0%7Egit20250121.4f2c8e7-1&stamp=1737477147&raw=0
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
---
 tcp_splice.c | 2 +-
 udp_flow.c   | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/tcp_splice.c b/tcp_splice.c
index 5db1d62..f048a82 100644
--- a/tcp_splice.c
+++ b/tcp_splice.c
@@ -28,7 +28,7 @@
  * - FIN_SENT_0:		FIN (write shutdown) sent to accepted socket
  * - FIN_SENT_1:		FIN (write shutdown) sent to target socket
  *
- * #syscalls:pasta pipe2|pipe fcntl arm:fcntl64 ppc64:fcntl64 i686:fcntl64
+ * #syscalls:pasta pipe2|pipe fcntl arm:fcntl64 ppc64:fcntl64|fcntl i686:fcntl64
  */
 
 #include <sched.h>
diff --git a/udp_flow.c b/udp_flow.c
index 7fae81d..83c2568 100644
--- a/udp_flow.c
+++ b/udp_flow.c
@@ -174,7 +174,7 @@ cancel:
  * @s_in:	Source socket address, filled in by recvmmsg()
  * @now:	Timestamp
  *
- * #syscalls fcntl arm:fcntl64 ppc64:fcntl64 i686:fcntl64
+ * #syscalls fcntl arm:fcntl64 ppc64:fcntl64|fcntl i686:fcntl64
  *
  * Return: sidx for the destination side of the flow for this packet, or
  *         FLOW_SIDX_NONE if we couldn't find or create a flow.

From e25a93032f8c09f1e0bfbc32e81431dd995f9605 Mon Sep 17 00:00:00 2001
From: Stefano Brivio <sbrivio@redhat.com>
Date: Sun, 26 Jan 2025 09:05:03 +0100
Subject: [PATCH 198/382] util: Add read_remainder() and read_all_buf()

These are symmetric to write_remainder() and write_all_buf() and
almost a copy and paste of them, with the most notable differences
being reversed reads/writes and a couple of better-safe-than-sorry
asserts to keep Coverity happy.

I'll use them in the next patch. At least for the moment, they're
going to be used for vhost-user mode only, so I'm not unconditionally
enabling readv() in the seccomp profile: the caller has to ensure it's
there.
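
A usage sketch (hypothetical, not from this series): resume filling an
IO vector after 'skip' bytes were already received in an earlier pass,
with 'hdr' being a header structure and 'body' a byte array:

	struct iovec iov[2] = {
		{ .iov_base = &hdr, .iov_len = sizeof(hdr)  },
		{ .iov_base = body, .iov_len = sizeof(body) },
	};

	if (read_remainder(fd, iov, 2, skip) < 0)
		return -1;	/* errno set by read_remainder() */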

[dgibson: make read_remainder() take const pointer to iovec]
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 util.c | 84 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 util.h |  2 ++
 2 files changed, 86 insertions(+)

diff --git a/util.c b/util.c
index c7b09f0..800c6b5 100644
--- a/util.c
+++ b/util.c
@@ -606,6 +606,90 @@ int write_remainder(int fd, const struct iovec *iov, size_t iovcnt, size_t skip)
 	return 0;
 }
 
+/**
+ * read_all_buf() - Fill a whole buffer from a file descriptor
+ * @fd:		File descriptor
+ * @buf:	Pointer to base of buffer
+ * @len:	Length of buffer
+ *
+ * Return: 0 on success, -1 on error (with errno set)
+ *
+ * #syscalls read
+ */
+int read_all_buf(int fd, void *buf, size_t len)
+{
+	size_t left = len;
+	char *p = buf;
+
+	while (left) {
+		ssize_t rc;
+
+		ASSERT(left <= len);
+
+		do
+			rc = read(fd, p, left);
+		while ((rc < 0) && errno == EINTR);
+
+		if (rc < 0)
+			return -1;
+
+		if (rc == 0) {
+			errno = ENODATA;
+			return -1;
+		}
+
+		p += rc;
+		left -= rc;
+	}
+	return 0;
+}
+
+/**
+ * read_remainder() - Read the tail of an IO vector from a file descriptor
+ * @fd:		File descriptor
+ * @iov:	IO vector
+ * @cnt:	Number of entries in @iov
+ * @skip:	Number of bytes of the vector to skip reading
+ *
+ * Return: 0 on success, -1 on error (with errno set)
+ *
+ * Note: mode-specific seccomp profiles need to enable readv() to use this.
+ */
+/* cppcheck-suppress unusedFunction */
+int read_remainder(int fd, const struct iovec *iov, size_t cnt, size_t skip)
+{
+	size_t i = 0, offset;
+
+	while ((i += iov_skip_bytes(iov + i, cnt - i, skip, &offset)) < cnt) {
+		ssize_t rc;
+
+		if (offset) {
+			ASSERT(offset < iov[i].iov_len);
+			/* Read the remainder of the partially read buffer */
+			if (read_all_buf(fd, (char *)iov[i].iov_base + offset,
+					 iov[i].iov_len - offset) < 0)
+				return -1;
+			i++;
+		}
+
+		if (cnt == i)
+			break;
+
+		/* Fill as many of the remaining buffers as we can */
+		rc = readv(fd, &iov[i], cnt - i);
+		if (rc < 0)
+			return -1;
+
+		if (rc == 0) {
+			errno = ENODATA;
+			return -1;
+		}
+
+		skip = rc;
+	}
+	return 0;
+}
+
 /** sockaddr_ntop() - Convert a socket address to text format
  * @sa:		Socket address
  * @dst:	output buffer, minimum SOCKADDR_STRLEN bytes
diff --git a/util.h b/util.h
index 800a28b..23b165c 100644
--- a/util.h
+++ b/util.h
@@ -203,6 +203,8 @@ int fls(unsigned long x);
 int write_file(const char *path, const char *buf);
 int write_all_buf(int fd, const void *buf, size_t len);
 int write_remainder(int fd, const struct iovec *iov, size_t iovcnt, size_t skip);
+int read_all_buf(int fd, void *buf, size_t len);
+int read_remainder(int fd, const struct iovec *iov, size_t cnt, size_t skip);
 void close_open_files(int argc, char **argv);
 bool snprintf_check(char *str, size_t size, const char *format, ...);
 

From e894d9ae8212c49dc44e52ad583954ed24e6905b Mon Sep 17 00:00:00 2001
From: Stefano Brivio <sbrivio@redhat.com>
Date: Fri, 31 Jan 2025 11:41:51 +0100
Subject: [PATCH 199/382] vhost_user: Turn some vhost-user message reports to
 trace()

Having every vhost-user message printed as part of debug output makes
debugging anything else a bit complicated.

Change per-packet debug() messages in vu_kick_cb() and
vu_send_single() to trace().

[dgibson: switch different messages to trace()]
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 vhost_user.c | 4 ++--
 vu_common.c  | 6 +++---
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/vhost_user.c b/vhost_user.c
index 58baee2..9e38cfd 100644
--- a/vhost_user.c
+++ b/vhost_user.c
@@ -640,8 +640,8 @@ static bool vu_set_vring_num_exec(struct vu_dev *vdev,
 	unsigned int idx = msg->payload.state.index;
 	unsigned int num = msg->payload.state.num;
 
-	debug("State.index: %u", idx);
-	debug("State.num:   %u", num);
+	trace("State.index: %u", idx);
+	trace("State.num:   %u", num);
 	vdev->vq[idx].vring.num = num;
 
 	return false;
diff --git a/vu_common.c b/vu_common.c
index 2c12dca..ab04d31 100644
--- a/vu_common.c
+++ b/vu_common.c
@@ -238,7 +238,7 @@ void vu_kick_cb(struct vu_dev *vdev, union epoll_ref ref,
 	if (rc == -1)
 		die_perror("vhost-user kick eventfd_read()");
 
-	debug("vhost-user: got kick_data: %016"PRIx64" idx: %d",
+	trace("vhost-user: got kick_data: %016"PRIx64" idx: %d",
 	      kick_data, ref.queue);
 	if (VHOST_USER_IS_QUEUE_TX(ref.queue))
 		vu_handle_tx(vdev, ref.queue, now);
@@ -262,7 +262,7 @@ int vu_send_single(const struct ctx *c, const void *buf, size_t size)
 	int elem_cnt;
 	int i;
 
-	debug("vu_send_single size %zu", size);
+	trace("vu_send_single size %zu", size);
 
 	if (!vu_queue_enabled(vq) || !vu_queue_started(vq)) {
 		debug("Got packet, but RX virtqueue not usable yet");
@@ -294,7 +294,7 @@ int vu_send_single(const struct ctx *c, const void *buf, size_t size)
 
 	vu_flush(vdev, vq, elem, elem_cnt);
 
-	debug("vhost-user sent %zu", total);
+	trace("vhost-user sent %zu", total);
 
 	return total;
 err:

From 8c24301462c39027e6eb6f1ad56c1f6c83fb0c23 Mon Sep 17 00:00:00 2001
From: Stefano Brivio <sbrivio@redhat.com>
Date: Tue, 28 Jan 2025 00:03:13 +0100
Subject: [PATCH 200/382] Introduce passt-repair

A privileged helper to set/clear TCP_REPAIR on sockets on behalf of
passt. Not used yet.
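
For reference, the privileged operation performed on behalf of passt
boils down to something like this sketch, with 's' being a connected
TCP socket received over SCM_RIGHTS:

	int v = 1;	/* 1: enter repair mode, 0: leave it */

	/* Needs CAP_NET_ADMIN on current kernels */
	if (setsockopt(s, SOL_TCP, TCP_REPAIR, &v, sizeof(v)))
		perror("TCP_REPAIR");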

Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 .gitignore                            |   1 +
 Makefile                              |  17 ++-
 contrib/apparmor/usr.bin.passt-repair |  29 +++++
 contrib/fedora/passt.spec             |   2 +
 contrib/selinux/passt-repair.fc       |  11 ++
 contrib/selinux/passt-repair.te       |  58 ++++++++++
 hooks/pre-push                        |   1 +
 passt-repair.1                        |  70 ++++++++++++
 passt-repair.c                        | 154 ++++++++++++++++++++++++++
 seccomp.sh                            |   6 +-
 10 files changed, 342 insertions(+), 7 deletions(-)
 create mode 100644 contrib/apparmor/usr.bin.passt-repair
 create mode 100644 contrib/selinux/passt-repair.fc
 create mode 100644 contrib/selinux/passt-repair.te
 create mode 100644 passt-repair.1
 create mode 100644 passt-repair.c

diff --git a/.gitignore b/.gitignore
index d1c8be9..5824a71 100644
--- a/.gitignore
+++ b/.gitignore
@@ -3,6 +3,7 @@
 /passt.avx2
 /pasta
 /pasta.avx2
+/passt-repair
 /qrap
 /pasta.1
 /seccomp.h
diff --git a/Makefile b/Makefile
index 464eef1..6ab8d24 100644
--- a/Makefile
+++ b/Makefile
@@ -42,9 +42,10 @@ PASST_SRCS = arch.c arp.c checksum.c conf.c dhcp.c dhcpv6.c flow.c fwd.c \
 	tcp_buf.c tcp_splice.c tcp_vu.c udp.c udp_flow.c udp_vu.c util.c \
 	vhost_user.c virtio.c vu_common.c
 QRAP_SRCS = qrap.c
-SRCS = $(PASST_SRCS) $(QRAP_SRCS)
+PASST_REPAIR_SRCS = passt-repair.c
+SRCS = $(PASST_SRCS) $(QRAP_SRCS) $(PASST_REPAIR_SRCS)
 
-MANPAGES = passt.1 pasta.1 qrap.1
+MANPAGES = passt.1 pasta.1 qrap.1 passt-repair.1
 
 PASST_HEADERS = arch.h arp.h checksum.h conf.h dhcp.h dhcpv6.h flow.h fwd.h \
 	flow_table.h icmp.h icmp_flow.h inany.h iov.h ip.h isolation.h \
@@ -72,9 +73,9 @@ mandir		?= $(datarootdir)/man
 man1dir		?= $(mandir)/man1
 
 ifeq ($(TARGET_ARCH),x86_64)
-BIN := passt passt.avx2 pasta pasta.avx2 qrap
+BIN := passt passt.avx2 pasta pasta.avx2 qrap passt-repair
 else
-BIN := passt pasta qrap
+BIN := passt pasta qrap passt-repair
 endif
 
 all: $(BIN) $(MANPAGES) docs
@@ -83,7 +84,10 @@ static: FLAGS += -static -DGLIBC_NO_STATIC_NSS
 static: clean all
 
 seccomp.h: seccomp.sh $(PASST_SRCS) $(PASST_HEADERS)
-	@ EXTRA_SYSCALLS="$(EXTRA_SYSCALLS)" ARCH="$(TARGET_ARCH)" CC="$(CC)" ./seccomp.sh $(PASST_SRCS) $(PASST_HEADERS)
+	@ EXTRA_SYSCALLS="$(EXTRA_SYSCALLS)" ARCH="$(TARGET_ARCH)" CC="$(CC)" ./seccomp.sh seccomp.h $(PASST_SRCS) $(PASST_HEADERS)
+
+seccomp_repair.h: seccomp.sh $(PASST_REPAIR_SRCS)
+	@ ARCH="$(TARGET_ARCH)" CC="$(CC)" ./seccomp.sh seccomp_repair.h $(PASST_REPAIR_SRCS)
 
 passt: $(PASST_SRCS) $(HEADERS)
 	$(CC) $(FLAGS) $(CFLAGS) $(CPPFLAGS) $(PASST_SRCS) -o passt $(LDFLAGS)
@@ -101,6 +105,9 @@ pasta.avx2 pasta.1 pasta: pasta%: passt%
 qrap: $(QRAP_SRCS) passt.h
 	$(CC) $(FLAGS) $(CFLAGS) $(CPPFLAGS) -DARCH=\"$(TARGET_ARCH)\" $(QRAP_SRCS) -o qrap $(LDFLAGS)
 
+passt-repair: $(PASST_REPAIR_SRCS) seccomp_repair.h
+	$(CC) $(FLAGS) $(CFLAGS) $(CPPFLAGS) $(PASST_REPAIR_SRCS) -o passt-repair $(LDFLAGS)
+
 valgrind: EXTRA_SYSCALLS += rt_sigprocmask rt_sigtimedwait rt_sigaction	\
 			    rt_sigreturn getpid gettid kill clock_gettime mmap \
 			    mmap2 munmap open unlink gettimeofday futex statx \
diff --git a/contrib/apparmor/usr.bin.passt-repair b/contrib/apparmor/usr.bin.passt-repair
new file mode 100644
index 0000000..901189d
--- /dev/null
+++ b/contrib/apparmor/usr.bin.passt-repair
@@ -0,0 +1,29 @@
+# SPDX-License-Identifier: GPL-2.0-or-later
+#
+# PASST - Plug A Simple Socket Transport
+#  for qemu/UNIX domain socket mode
+#
+# PASTA - Pack A Subtle Tap Abstraction
+#  for network namespace/tap device mode
+#
+# contrib/apparmor/usr.bin.passt-repair - AppArmor profile for passt-repair(1)
+#
+# Copyright (c) 2025 Red Hat GmbH
+# Author: Stefano Brivio <sbrivio@redhat.com>
+
+abi <abi/3.0>,
+
+#include <tunables/global>
+
+profile passt-repair /usr/bin/passt-repair {
+  #include <abstractions/base>
+  /** rw,			# passt's ".repair" socket might be anywhere
+  unix (connect, receive, send) type=stream,
+
+  capability dac_override,	# connect to passt's socket as root
+  capability net_admin,		# currently needed for TCP_REPAIR socket option
+  capability net_raw,		# what TCP_REPAIR should require instead
+
+  network unix stream,		# connect and use UNIX domain socket
+  network inet stream,		# use TCP sockets
+}
diff --git a/contrib/fedora/passt.spec b/contrib/fedora/passt.spec
index 7950fb9..6a83f8b 100644
--- a/contrib/fedora/passt.spec
+++ b/contrib/fedora/passt.spec
@@ -108,9 +108,11 @@ fi
 %{_bindir}/passt
 %{_bindir}/pasta
 %{_bindir}/qrap
+%{_bindir}/passt-repair
 %{_mandir}/man1/passt.1*
 %{_mandir}/man1/pasta.1*
 %{_mandir}/man1/qrap.1*
+%{_mandir}/man1/passt-repair.1*
 %ifarch x86_64
 %{_bindir}/passt.avx2
 %{_mandir}/man1/passt.avx2.1*
diff --git a/contrib/selinux/passt-repair.fc b/contrib/selinux/passt-repair.fc
new file mode 100644
index 0000000..bcd526e
--- /dev/null
+++ b/contrib/selinux/passt-repair.fc
@@ -0,0 +1,11 @@
+# SPDX-License-Identifier: GPL-2.0-or-later
+#
+# PASST - Plug A Simple Socket Transport
+#  for qemu/UNIX domain socket mode
+#
+# contrib/selinux/passt-repair.fc - SELinux: File Context for passt-repair
+#
+# Copyright (c) 2025 Red Hat GmbH
+# Author: Stefano Brivio <sbrivio@redhat.com>
+
+/usr/bin/passt-repair		system_u:object_r:passt_repair_exec_t:s0
diff --git a/contrib/selinux/passt-repair.te b/contrib/selinux/passt-repair.te
new file mode 100644
index 0000000..e3ffbcd
--- /dev/null
+++ b/contrib/selinux/passt-repair.te
@@ -0,0 +1,58 @@
+# SPDX-License-Identifier: GPL-2.0-or-later
+#
+# PASST - Plug A Simple Socket Transport
+#  for qemu/UNIX domain socket mode
+#
+# contrib/selinux/passt-repair.te - SELinux: Type Enforcement for passt-repair
+#
+# Copyright (c) 2025 Red Hat GmbH
+# Author: Stefano Brivio <sbrivio@redhat.com>
+
+policy_module(passt-repair, 0.1)
+
+require {
+	type unconfined_t;
+	type passt_t;
+	role unconfined_r;
+	class process transition;
+
+	class file { read execute execute_no_trans entrypoint open map };
+	class capability { dac_override net_admin net_raw };
+	class chr_file { append open getattr read write ioctl };
+
+	class unix_stream_socket { create connect sendto };
+	class sock_file { read write };
+
+	class tcp_socket { read setopt write };
+
+	type console_device_t;
+	type user_devpts_t;
+	type user_tmp_t;
+}
+
+type passt_repair_t;
+domain_type(passt_repair_t);
+type passt_repair_exec_t;
+files_type(passt_repair_exec_t);
+
+role unconfined_r types passt_repair_t;
+
+allow passt_repair_t passt_repair_exec_t:file { read execute execute_no_trans entrypoint open map };
+type_transition unconfined_t passt_repair_exec_t:process passt_repair_t;
+allow unconfined_t passt_repair_t:process transition;
+
+allow passt_repair_t self:capability { dac_override net_admin net_raw };
+
+allow passt_repair_t console_device_t:chr_file { append open getattr read write ioctl };
+allow passt_repair_t user_devpts_t:chr_file { append open getattr read write ioctl };
+
+allow passt_repair_t unconfined_t:unix_stream_socket { connectto read write };
+allow passt_repair_t passt_t:unix_stream_socket { connectto read write };
+allow passt_repair_t user_tmp_t:unix_stream_socket { connectto read write };
+
+allow passt_repair_t unconfined_t:sock_file { read write };
+allow passt_repair_t passt_t:sock_file { read write };
+allow passt_repair_t user_tmp_t:sock_file { read write };
+
+allow passt_repair_t unconfined_t:tcp_socket { read setopt write };
+allow passt_repair_t passt_t:tcp_socket { read setopt write };
diff --git a/hooks/pre-push b/hooks/pre-push
index 33a2052..8dbfa5f 100755
--- a/hooks/pre-push
+++ b/hooks/pre-push
@@ -56,6 +56,7 @@ cd ..
 make pkgs
 scp passt passt.avx2 passt.1 qrap qrap.1	"${USER_HOST}:${BIN}"
 scp pasta pasta.avx2 pasta.1			"${USER_HOST}:${BIN}"
+scp passt-repair passt-repair.1			"${USER_HOST}:${BIN}"
 
 ssh "${USER_HOST}" 				"rm -f ${BIN}/*.deb"
 ssh "${USER_HOST}"				"rm -f ${BIN}/*.rpm"
diff --git a/passt-repair.1 b/passt-repair.1
new file mode 100644
index 0000000..8d07c97
--- /dev/null
+++ b/passt-repair.1
@@ -0,0 +1,70 @@
+.\" SPDX-License-Identifier: GPL-2.0-or-later
+.\" Copyright (c) 2025 Red Hat GmbH
+.\" Author: Stefano Brivio <sbrivio@redhat.com>
+.TH passt-repair 1
+
+.SH NAME
+.B passt-repair
+\- Helper setting TCP_REPAIR socket options for \fBpasst\fR(1)
+
+.SH SYNOPSIS
+.B passt-repair
+\fIPATH\fR
+
+.SH DESCRIPTION
+
+.B passt-repair
+is a privileged helper setting and clearing repair mode on TCP sockets on behalf
+of \fBpasst\fR(1), as instructed via single-byte commands over a UNIX domain
+socket, specified by \fIPATH\fR.
+
+It can be used to migrate TCP connections between guests without granting
+additional capabilities to \fBpasst\fR(1) itself: to migrate TCP connections,
+\fBpasst\fR(1) leverages repair mode, which needs the \fBCAP_NET_ADMIN\fR
+capability (see \fBcapabilities\fR(7)) to be set or cleared.
+
+.SH PROTOCOL
+
+\fBpasst-repair\fR(1) connects to \fBpasst\fR(1) using the socket specified via
+\fI--repair-path\fR option in \fBpasst\fR(1) itself. By default, the name is the
+same as the UNIX domain socket used for guest communication, suffixed by
+\fI.repair\fR.
+
+The messages consist of one 8-bit signed integer that can be \fITCP_REPAIR_ON\fR
+(1), \fITCP_REPAIR_OFF\fR (2), or \fITCP_REPAIR_OFF_WP\fR (-1), as defined by
+the Linux kernel user API, and one to SCM_MAX_FD (253) sockets as SCM_RIGHTS
+(see \fBunix\fR(7)) ancillary message, sent by the server, \fBpasst\fR(1).
+
+The client, \fBpasst-repair\fR(1), replies with the same byte (and no ancillary
+message) to indicate success, and closes the connection on failure.
+
+The server closes the connection on error or completion.
+
+.SH NOTES
+
+\fBpasst-repair\fR(1) can be granted the \fBCAP_NET_ADMIN\fR capability
+(preferred, as it limits privileges to the strictly necessary ones), or it can
+be run as root.
+
+.SH AUTHOR
+
+Stefano Brivio <sbrivio@redhat.com>.
+
+.SH REPORTING BUGS
+
+Please report issues on the bug tracker at https://bugs.passt.top/, or
+send a message to the passt-user@passt.top mailing list, see
+https://lists.passt.top/.
+
+.SH COPYRIGHT
+
+Copyright (c) 2025 Red Hat GmbH.
+
+\fBpasst-repair\fR is free software: you can redistribute it and/or modify it
+under the terms of the GNU General Public License as published by the Free
+Software Foundation, either version 2 of the License, or (at your option) any
+later version.
+
+.SH SEE ALSO
+
+\fBpasst\fR(1), \fBqemu\fR(1), \fBcapabilities\fR(7), \fBunix\fR(7).
diff --git a/passt-repair.c b/passt-repair.c
new file mode 100644
index 0000000..767a821
--- /dev/null
+++ b/passt-repair.c
@@ -0,0 +1,154 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+/* PASST - Plug A Simple Socket Transport
+ *  for qemu/UNIX domain socket mode
+ *
+ * PASTA - Pack A Subtle Tap Abstraction
+ *  for network namespace/tap device mode
+ *
+ * passt-repair.c - Privileged helper to set/clear TCP_REPAIR on sockets
+ *
+ * Copyright (c) 2025 Red Hat GmbH
+ * Author: Stefano Brivio <sbrivio@redhat.com>
+ *
+ * Connect to passt via UNIX domain socket, receive sockets via SCM_RIGHTS along
+ * with byte commands mapping to TCP_REPAIR values, and switch repair mode on or
+ * off. Reply by echoing the command. Exit on EOF.
+ */
+
+#include <sys/prctl.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <sys/un.h>
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <limits.h>
+#include <unistd.h>
+#include <netdb.h>
+
+#include <netinet/tcp.h>
+
+#include <linux/audit.h>
+#include <linux/capability.h>
+#include <linux/filter.h>
+#include <linux/seccomp.h>
+
+#include "seccomp_repair.h"
+
+#define SCM_MAX_FD 253 /* From Linux kernel (include/net/scm.h), not in UAPI */
+
+/**
+ * main() - Entry point and whole program with loop
+ * @argc:	Argument count, must be 2
+ * @argv:	Argument: path of UNIX domain socket to connect to
+ *
+ * Return: 0 on success (EOF), 1 on error, 2 on usage error
+ *
+ * #syscalls:repair connect setsockopt write exit_group
+ * #syscalls:repair socket s390x:socketcall i686:socketcall
+ * #syscalls:repair recvfrom recvmsg arm:recv ppc64le:recv
+ * #syscalls:repair sendto sendmsg arm:send ppc64le:send
+ */
+int main(int argc, char **argv)
+{
+	char buf[CMSG_SPACE(sizeof(int) * SCM_MAX_FD)]
+	     __attribute__ ((aligned(__alignof__(struct cmsghdr))));
+	struct sockaddr_un a = { AF_UNIX, "" };
+	int fds[SCM_MAX_FD], s, ret, i, n;
+	struct sock_fprog prog;
+	int8_t cmd = INT8_MAX;
+	struct cmsghdr *cmsg;
+	struct msghdr msg;
+	struct iovec iov;
+
+	prctl(PR_SET_DUMPABLE, 0);
+
+	prog.len = (unsigned short)sizeof(filter_repair) /
+				   sizeof(filter_repair[0]);
+	prog.filter = filter_repair;
+	if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) ||
+	    prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog)) {
+		fprintf(stderr, "Failed to apply seccomp filter");
+		return 1;
+	}
+
+	iov = (struct iovec){ &cmd, sizeof(cmd) };
+	msg = (struct msghdr){ NULL, 0, &iov, 1, buf, sizeof(buf), 0 };
+	cmsg = CMSG_FIRSTHDR(&msg);
+
+	if (argc != 2) {
+		fprintf(stderr, "Usage: %s PATH\n", argv[0]);
+		return 2;
+	}
+
+	ret = snprintf(a.sun_path, sizeof(a.sun_path), "%s", argv[1]);
+	if (ret <= 0 || ret >= (int)sizeof(a.sun_path)) {
+		fprintf(stderr, "Invalid socket path: %s\n", argv[1]);
+		return 2;
+	}
+
+	if ((s = socket(AF_UNIX, SOCK_STREAM, 0)) < 0) {
+		perror("Failed to create AF_UNIX socket");
+		return 1;
+	}
+
+	if (connect(s, (struct sockaddr *)&a, sizeof(a))) {
+		fprintf(stderr, "Failed to connect to %s: %s\n", argv[1],
+			strerror(errno));
+		return 1;
+	}
+
+loop:
+	ret = recvmsg(s, &msg, 0);
+	if (ret < 0) {
+		perror("Failed to receive message");
+		return 1;
+	}
+
+	if (!ret)	/* Done */
+		return 0;
+
+	if (!cmsg ||
+	    cmsg->cmsg_len < CMSG_LEN(sizeof(int)) ||
+	    cmsg->cmsg_len > CMSG_LEN(sizeof(int) * SCM_MAX_FD) ||
+	    cmsg->cmsg_type != SCM_RIGHTS) {
+		fprintf(stderr, "No/bad ancillary data from peer\n");
+		return 1;
+	}
+
+	n = cmsg->cmsg_len / CMSG_LEN(sizeof(int));
+	memcpy(fds, CMSG_DATA(cmsg), sizeof(int) * n);
+
+	if (cmd != TCP_REPAIR_ON && cmd != TCP_REPAIR_OFF &&
+	    cmd != TCP_REPAIR_OFF_NO_WP) {
+		fprintf(stderr, "Unsupported command 0x%04x\n", cmd);
+		return 1;
+	}
+
+	for (i = 0; i < n; i++) {
+		int o = cmd;
+
+		if (setsockopt(fds[i], SOL_TCP, TCP_REPAIR, &o, sizeof(o))) {
+			fprintf(stderr,
+				"Setting TCP_REPAIR to %i on socket %i: %s", o,
+				fds[i], strerror(errno));
+			return 1;
+		}
+
+		/* Close _our_ copy */
+		close(fds[i]);
+
+		/* Confirm setting by echoing the command back */
+		if (send(s, &cmd, sizeof(cmd), 0) < 0) {
+			fprintf(stderr, "Reply to command %i: %s\n",
+				o, strerror(errno));
+			return 1;
+		}
+	}
+
+	goto loop;
+
+	return 0;
+}
diff --git a/seccomp.sh b/seccomp.sh
index 6499c58..4c521ae 100755
--- a/seccomp.sh
+++ b/seccomp.sh
@@ -14,8 +14,10 @@
 # Author: Stefano Brivio <sbrivio@redhat.com>
 
 TMP="$(mktemp)"
-IN="$@"
 OUT="$(mktemp)"
+OUT_FINAL="${1}"
+shift
+IN="$@"
 
 [ -z "${ARCH}" ] && ARCH="$(uname -m)"
 [ -z "${CC}" ] && CC="cc"
@@ -268,4 +270,4 @@ for __p in ${__profiles}; do
 	gen_profile "${__p}" ${__calls}
 done
 
-mv "${OUT}" seccomp.h
+mv "${OUT}" "${OUT_FINAL}"
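
The receiving side of the protocol is implemented by passt-repair.c
above; the sending side (what the server, that is, passt, does) boils
down to a single sendmsg() carrying the command byte plus the sockets
as SCM_RIGHTS ancillary data, followed by a read of the echoed byte.
A self-contained sketch, for illustration only, not passt's actual
code (repair_cmd() and its arguments are made up for the example):

/* Illustrative sketch: send one TCP_REPAIR command byte plus file
 * descriptors as SCM_RIGHTS over a connected UNIX domain socket, then
 * wait for the one-byte echo that passt-repair sends back on success.
 */
#include <stdint.h>
#include <string.h>
#include <sys/socket.h>
#include <sys/uio.h>

#define MAX_REPAIR_FDS	253	/* SCM_MAX_FD, from include/net/scm.h */

static int repair_cmd(int helper_s, int8_t cmd, const int *fds, unsigned n)
{
	char buf[CMSG_SPACE(sizeof(int) * MAX_REPAIR_FDS)]
	     __attribute__ ((aligned(__alignof__(struct cmsghdr))));
	struct iovec iov = { &cmd, sizeof(cmd) };
	struct msghdr msg = { .msg_iov = &iov, .msg_iovlen = 1,
			      .msg_control = buf };
	struct cmsghdr *cmsg;
	int8_t reply;

	if (!n || n > MAX_REPAIR_FDS)
		return -1;

	msg.msg_controllen = CMSG_SPACE(sizeof(int) * n);
	cmsg = CMSG_FIRSTHDR(&msg);
	cmsg->cmsg_level = SOL_SOCKET;
	cmsg->cmsg_type = SCM_RIGHTS;
	cmsg->cmsg_len = CMSG_LEN(sizeof(int) * n);
	memcpy(CMSG_DATA(cmsg), fds, sizeof(int) * n);

	if (sendmsg(helper_s, &msg, 0) < 0)
		return -1;

	/* One reply per command: the helper echoes the byte on success */
	if (recv(helper_s, &reply, sizeof(reply), 0) != 1 || reply != cmd)
		return -1;

	return 0;
}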

From 52e57f9c9a6d8ae4153ac592d01d868b31c10171 Mon Sep 17 00:00:00 2001
From: Stefano Brivio <sbrivio@redhat.com>
Date: Fri, 31 Jan 2025 18:27:07 +0100
Subject: [PATCH 201/382] tcp: Get socket port and address using getsockname()
 when connecting from guest

For migration only: we need to store 'oport', our socket-side port,
as we establish a connection from the guest, so that we can bind the
same oport as source port in the migration target.

Similarly for 'oaddr': this is needed in case the migration target has
additional network interfaces, and we need to make sure our socket is
bound to the interface equivalent to the one it was bound to on the
source.

Use getsockname() to fetch them.
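
A minimal sketch of the mechanics using the plain socket API, for
illustration only (the helper below is made up, the actual change to
tcp.c is in the diff):

/* Print the local address and port the kernel chose for a connected
 * IPv4 socket, as returned by getsockname(). Error handling trimmed.
 */
#include <arpa/inet.h>
#include <netinet/in.h>
#include <stdio.h>
#include <sys/socket.h>

static void print_local_endpoint(int s)
{
	struct sockaddr_in sin;
	socklen_t sl = sizeof(sin);
	char addr[INET_ADDRSTRLEN];

	if (getsockname(s, (struct sockaddr *)&sin, &sl))
		return;

	inet_ntop(AF_INET, &sin.sin_addr, addr, sizeof(addr));
	printf("local endpoint: %s:%u\n", addr, (unsigned)ntohs(sin.sin_port));
}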

Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 flow.c       |  4 ++--
 flow_table.h |  4 ++--
 tcp.c        | 28 +++++++++++++++++++++++++++-
 3 files changed, 31 insertions(+), 5 deletions(-)

diff --git a/flow.c b/flow.c
index ee1221b..a6fe6d1 100644
--- a/flow.c
+++ b/flow.c
@@ -414,8 +414,8 @@ const struct flowside *flow_initiate_sa(union flow *flow, uint8_t pif,
  *
  * Return: pointer to the target flowside information
  */
-const struct flowside *flow_target(const struct ctx *c, union flow *flow,
-				   uint8_t proto)
+struct flowside *flow_target(const struct ctx *c, union flow *flow,
+			     uint8_t proto)
 {
 	char estr[INANY_ADDRSTRLEN], fstr[INANY_ADDRSTRLEN];
 	struct flow_common *f = &flow->f;
diff --git a/flow_table.h b/flow_table.h
index f15db53..eeb6f41 100644
--- a/flow_table.h
+++ b/flow_table.h
@@ -168,8 +168,8 @@ const struct flowside *flow_target_af(union flow *flow, uint8_t pif,
 				      sa_family_t af,
 				      const void *saddr, in_port_t sport,
 				      const void *daddr, in_port_t dport);
-const struct flowside *flow_target(const struct ctx *c, union flow *flow,
-				   uint8_t proto);
+struct flowside *flow_target(const struct ctx *c, union flow *flow,
+			     uint8_t proto);
 
 union flow *flow_set_type(union flow *flow, enum flow_type type);
 #define FLOW_SET_TYPE(flow_, t_, var_)	(&flow_set_type((flow_), (t_))->var_)
diff --git a/tcp.c b/tcp.c
index 51ad692..fac322c 100644
--- a/tcp.c
+++ b/tcp.c
@@ -1415,6 +1415,8 @@ static void tcp_bind_outbound(const struct ctx *c,
  * @opts:	Pointer to start of options
  * @optlen:	Bytes in options: caller MUST ensure available length
  * @now:	Current timestamp
+ *
+ * #syscalls:vu getsockname
  */
 static void tcp_conn_from_tap(const struct ctx *c, sa_family_t af,
 			      const void *saddr, const void *daddr,
@@ -1423,9 +1425,10 @@ static void tcp_conn_from_tap(const struct ctx *c, sa_family_t af,
 {
 	in_port_t srcport = ntohs(th->source);
 	in_port_t dstport = ntohs(th->dest);
-	const struct flowside *ini, *tgt;
+	const struct flowside *ini;
 	struct tcp_tap_conn *conn;
 	union sockaddr_inany sa;
+	struct flowside *tgt;
 	union flow *flow;
 	int s = -1, mss;
 	uint64_t hash;
@@ -1530,6 +1533,29 @@ static void tcp_conn_from_tap(const struct ctx *c, sa_family_t af,
 	}
 
 	tcp_epoll_ctl(c, conn);
+
+	if (c->mode == MODE_VU) { /* To rebind to same oport after migration */
+		if (af == AF_INET) {
+			struct sockaddr_in s_in;
+
+			sl = sizeof(s_in);
+			if (!getsockname(s, (struct sockaddr *)&s_in, &sl)) {
+				/* NOLINTNEXTLINE(clang-analyzer-core.CallAndMessage) */
+				tgt->oport = ntohs(s_in.sin_port);
+				tgt->oaddr = inany_from_v4(s_in.sin_addr);
+			}
+		} else {
+			struct sockaddr_in6 s_in6;
+
+			sl = sizeof(s_in6);
+			if (!getsockname(s, (struct sockaddr *)&s_in6, &sl)) {
+				/* NOLINTNEXTLINE(clang-analyzer-core.CallAndMessage) */
+				tgt->oport = ntohs(s_in6.sin6_port);
+				tgt->oaddr.a6 = s_in6.sin6_addr;
+			}
+		}
+	}
+
 	FLOW_ACTIVATE(conn);
 	return;
 

From dcf014be8876d5417b0eddb8b07152c6b2035485 Mon Sep 17 00:00:00 2001
From: Stefano Brivio <sbrivio@redhat.com>
Date: Sun, 2 Feb 2025 10:38:46 +0100
Subject: [PATCH 202/382] doc: Add mock of migration source and target

These test programs show the migration of a TCP connection using the
passt-repair helper.

Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 doc/migration/.gitignore |   2 +
 doc/migration/Makefile   |  20 ++++++++
 doc/migration/README     |  51 ++++++++++++++++++++
 doc/migration/source.c   |  92 +++++++++++++++++++++++++++++++++++
 doc/migration/target.c   | 102 +++++++++++++++++++++++++++++++++++++++
 5 files changed, 267 insertions(+)
 create mode 100644 doc/migration/.gitignore
 create mode 100644 doc/migration/Makefile
 create mode 100644 doc/migration/README
 create mode 100644 doc/migration/source.c
 create mode 100644 doc/migration/target.c

diff --git a/doc/migration/.gitignore b/doc/migration/.gitignore
new file mode 100644
index 0000000..59cb765
--- /dev/null
+++ b/doc/migration/.gitignore
@@ -0,0 +1,2 @@
+/source
+/target
diff --git a/doc/migration/Makefile b/doc/migration/Makefile
new file mode 100644
index 0000000..04f6891
--- /dev/null
+++ b/doc/migration/Makefile
@@ -0,0 +1,20 @@
+# SPDX-License-Identifier: GPL-2.0-or-later
+#
+# PASST - Plug A Simple Socket Transport
+#  for qemu/UNIX domain socket mode
+#
+# PASTA - Pack A Subtle Tap Abstraction
+#  for network namespace/tap device mode
+#
+# Copyright (c) 2025 Red Hat GmbH
+# Author: Stefano Brivio <sbrivio@redhat.com>
+
+TARGETS = source target
+CFLAGS = -Wall -Wextra -pedantic
+
+all: $(TARGETS)
+
+$(TARGETS): %: %.c
+
+clean:
+	rm -f $(TARGETS)
diff --git a/doc/migration/README b/doc/migration/README
new file mode 100644
index 0000000..375603b
--- /dev/null
+++ b/doc/migration/README
@@ -0,0 +1,51 @@
+<!---
+SPDX-License-Identifier: GPL-2.0-or-later
+Copyright (c) 2025 Red Hat GmbH
+Author: Stefano Brivio <sbrivio@redhat.com>
+-->
+
+Migration
+=========
+
+These test programs show a migration of a TCP connection from one process to
+another using the TCP_REPAIR socket option.
+
+The two processes are a mock of the matching implementation in passt(1), and run
+unprivileged, so they rely on the passt-repair helper to connect to them and set
+or clear TCP_REPAIR on the connection socket, transferred to the helper using
+SCM_RIGHTS.
+
+The passt-repair helper needs to have the CAP_NET_ADMIN capability, or run as
+root.
+
+Example of usage
+----------------
+
+* Start the test server
+
+        $ nc -l 9999
+
+* Start the source side of the TCP client (mock of the source instance of passt)
+
+        $ ./source 127.0.0.1 9999 9998 /tmp/repair.sock
+
+* The client sends a test string, and waits for a connection from passt-repair
+
+        # passt-repair /tmp/repair.sock
+
+* The socket is now in repair mode, and `source` dumps sequences, then exits
+
+        sending sequence: 3244673313
+        receiving sequence: 2250449386
+
+* Continue the connection on the target side, restarting from those sequences
+
+        $ ./target 127.0.0.1 9999 9998 /tmp/repair.sock 3244673313 2250449386
+
+* The target side now waits for a connection from passt-repair
+
+        # passt-repair /tmp/repair.sock
+
+* The target side asks passt-repair to switch the socket to repair mode, sets up
+  the TCP sequences, then asks passt-repair to clear repair mode, and sends a
+  test string to the server
diff --git a/doc/migration/source.c b/doc/migration/source.c
new file mode 100644
index 0000000..d44ebf1
--- /dev/null
+++ b/doc/migration/source.c
@@ -0,0 +1,92 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+/* PASST - Plug A Simple Socket Transport
+ *  for qemu/UNIX domain socket mode
+ *
+ * PASTA - Pack A Subtle Tap Abstraction
+ *  for network namespace/tap device mode
+ *
+ * doc/migration/source.c - Mock of TCP migration source, use with passt-repair
+ *
+ * Copyright (c) 2025 Red Hat GmbH
+ * Author: Stefano Brivio <sbrivio@redhat.com>
+ */
+
+#include <arpa/inet.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <sys/un.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <limits.h>
+#include <unistd.h>
+#include <netdb.h>
+#include <netinet/tcp.h>
+
+int main(int argc, char **argv)
+{
+	struct sockaddr_in a = { AF_INET, htons(atoi(argv[3])), { 0 }, { 0 } };
+	struct addrinfo hints = { 0, AF_UNSPEC, SOCK_STREAM, 0, 0,
+				  NULL, NULL, NULL };
+	struct sockaddr_un a_helper = { AF_UNIX, { 0 } };
+	int seq, s, s_helper;
+	int8_t cmd;
+	struct iovec iov = { &cmd, sizeof(cmd) };
+	char buf[CMSG_SPACE(sizeof(int))];
+	struct msghdr msg = { NULL, 0, &iov, 1, buf, sizeof(buf), 0 };
+	struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);
+	socklen_t seqlen = sizeof(int);
+	struct addrinfo *r;
+
+	(void)argc;
+
+	if (argc != 5) {
+		fprintf(stderr, "%s DST_ADDR DST_PORT SRC_PORT HELPER_PATH\n",
+			argv[0]);
+		return -1;
+	}
+
+	strcpy(a_helper.sun_path, argv[4]);
+	getaddrinfo(argv[1], argv[2], &hints, &r);
+
+	/* Connect socket to server and send some data */
+	s = socket(r->ai_family, SOCK_STREAM, IPPROTO_TCP);
+	setsockopt(s, SOL_SOCKET, SO_REUSEADDR, &((int){ 1 }), sizeof(int));
+	bind(s, (struct sockaddr *)&a, sizeof(a));
+	connect(s, r->ai_addr, r->ai_addrlen);
+	send(s, "before migration\n", sizeof("before migration\n"), 0);
+
+	/* Wait for helper */
+	s_helper = socket(AF_UNIX, SOCK_STREAM, 0);
+	unlink(a_helper.sun_path);
+	bind(s_helper, (struct sockaddr *)&a_helper, sizeof(a_helper));
+	listen(s_helper, 1);
+	s_helper = accept(s_helper, NULL, NULL);
+
+	/* Set up message for helper, with socket */
+	cmsg->cmsg_level = SOL_SOCKET;
+	cmsg->cmsg_type = SCM_RIGHTS;
+	cmsg->cmsg_len = CMSG_LEN(sizeof(int));
+	memcpy(CMSG_DATA(cmsg), &s, sizeof(s));
+
+	/* Send command to helper: turn repair mode on, wait for reply */
+	cmd = TCP_REPAIR_ON;
+	sendmsg(s_helper, &msg, 0);
+	recv(s_helper, &((int8_t){ 0 }), 1, 0);
+
+	/* Terminate helper */
+	close(s_helper);
+
+	/* Get sending sequence */
+	seq = TCP_SEND_QUEUE;
+	setsockopt(s, SOL_TCP, TCP_REPAIR_QUEUE, &seq, sizeof(seq));
+	getsockopt(s, SOL_TCP, TCP_QUEUE_SEQ, &seq, &seqlen);
+	fprintf(stdout, "%u ", seq);
+
+	/* Get receiving sequence */
+	seq = TCP_RECV_QUEUE;
+	setsockopt(s, SOL_TCP, TCP_REPAIR_QUEUE, &seq, sizeof(seq));
+	getsockopt(s, SOL_TCP, TCP_QUEUE_SEQ, &seq, &seqlen);
+	fprintf(stdout, "%u\n", seq);
+}
diff --git a/doc/migration/target.c b/doc/migration/target.c
new file mode 100644
index 0000000..f7d3108
--- /dev/null
+++ b/doc/migration/target.c
@@ -0,0 +1,102 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+/* PASST - Plug A Simple Socket Transport
+ *  for qemu/UNIX domain socket mode
+ *
+ * PASTA - Pack A Subtle Tap Abstraction
+ *  for network namespace/tap device mode
+ *
+ * doc/migration/target.c - Mock of TCP migration target, use with passt-repair
+ *
+ * Copyright (c) 2025 Red Hat GmbH
+ * Author: Stefano Brivio <sbrivio@redhat.com>
+ */
+
+#include <arpa/inet.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <sys/un.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <limits.h>
+#include <unistd.h>
+#include <netdb.h>
+#include <netinet/tcp.h>
+
+int main(int argc, char **argv)
+{
+	struct sockaddr_in a = { AF_INET, htons(atoi(argv[3])), { 0 }, { 0 } };
+	struct addrinfo hints = { 0, AF_UNSPEC, SOCK_STREAM, 0, 0,
+				  NULL, NULL, NULL };
+	struct sockaddr_un a_helper = { AF_UNIX, { 0 } };
+	int s, s_helper, seq;
+	int8_t cmd;
+	struct iovec iov = { &cmd, sizeof(cmd) };
+	char buf[CMSG_SPACE(sizeof(int))];
+	struct msghdr msg = { NULL, 0, &iov, 1, buf, sizeof(buf), 0 };
+	struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);
+	struct addrinfo *r;
+
+	(void)argc;
+
+	strcpy(a_helper.sun_path, argv[4]);
+	getaddrinfo(argv[1], argv[2], &hints, &r);
+
+	if (argc != 7) {
+		fprintf(stderr,
+			"%s DST_ADDR DST_PORT SRC_PORT HELPER_PATH SSEQ RSEQ\n",
+			argv[0]);
+		return -1;
+	}
+
+	/* Prepare socket, bind to source port */
+	s = socket(r->ai_family, SOCK_STREAM, IPPROTO_TCP);
+	setsockopt(s, SOL_SOCKET, SO_REUSEADDR, &((int){ 1 }), sizeof(int));
+	bind(s, (struct sockaddr *)&a, sizeof(a));
+
+	/* Wait for helper */
+	s_helper = socket(AF_UNIX, SOCK_STREAM, 0);
+	unlink(a_helper.sun_path);
+	bind(s_helper, (struct sockaddr *)&a_helper, sizeof(a_helper));
+	listen(s_helper, 1);
+	s_helper = accept(s_helper, NULL, NULL);
+
+	/* Set up message for helper, with socket */
+	cmsg->cmsg_level = SOL_SOCKET;
+	cmsg->cmsg_type = SCM_RIGHTS;
+	cmsg->cmsg_len = CMSG_LEN(sizeof(int));
+	memcpy(CMSG_DATA(cmsg), &s, sizeof(s));
+
+	/* Send command to helper: turn repair mode on, wait for reply */
+	cmd = TCP_REPAIR_ON;
+	sendmsg(s_helper, &msg, 0);
+	recv(s_helper, &((int){ 0 }), 1, 0);
+
+	/* Set sending sequence */
+	seq = TCP_SEND_QUEUE;
+	setsockopt(s, SOL_TCP, TCP_REPAIR_QUEUE, &seq, sizeof(seq));
+	seq = atoi(argv[5]);
+	setsockopt(s, SOL_TCP, TCP_QUEUE_SEQ, &seq, sizeof(seq));
+
+	/* Set receiving sequence */
+	seq = TCP_RECV_QUEUE;
+	setsockopt(s, SOL_TCP, TCP_REPAIR_QUEUE, &seq, sizeof(seq));
+	seq = atoi(argv[6]);
+	setsockopt(s, SOL_TCP, TCP_QUEUE_SEQ, &seq, sizeof(seq));
+
+	/* Connect setting kernel state only, without actual SYN / handshake */
+	connect(s, r->ai_addr, r->ai_addrlen);
+
+	/* Send command to helper: turn repair mode off, wait for reply */
+	cmd = TCP_REPAIR_OFF;
+	sendmsg(s_helper, &msg, 0);
+
+	recv(s_helper, &((int8_t){ 0 }), 1, 0);
+
+	/* Terminate helper */
+	close(s_helper);
+
+	/* Send some more data */
+	send(s, "after migration\n", sizeof("after migration\n"), 0);
+}

From b4a7b5d4a66db5f419cb5de87da3403cfba3847d Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Tue, 4 Feb 2025 16:42:13 +1100
Subject: [PATCH 203/382] migrate: Fix several errors with passt-repair

The passt-repair helper is now merged, but alas it contains several small
bugs:
 * close() is not in the seccomp profile, meaning it will immediately
   SIGSYS when you make a request of it
 * The generated header, seccomp_repair.h isn't listed in .gitignore or
   removed by "make clean"

Fixes: 8c24301462c3 ("Introduce passt-repair")
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 .gitignore     | 1 +
 Makefile       | 2 +-
 passt-repair.c | 2 +-
 3 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/.gitignore b/.gitignore
index 5824a71..3c16adc 100644
--- a/.gitignore
+++ b/.gitignore
@@ -7,5 +7,6 @@
 /qrap
 /pasta.1
 /seccomp.h
+/seccomp_repair.h
 /c*.json
 README.plain.md
diff --git a/Makefile b/Makefile
index 6ab8d24..d3d4b78 100644
--- a/Makefile
+++ b/Makefile
@@ -117,7 +117,7 @@ valgrind: all
 
 .PHONY: clean
 clean:
-	$(RM) $(BIN) *~ *.o seccomp.h pasta.1 \
+	$(RM) $(BIN) *~ *.o seccomp.h seccomp_repair.h pasta.1 \
 		passt.tar passt.tar.gz *.deb *.rpm \
 		passt.pid README.plain.md
 
diff --git a/passt-repair.c b/passt-repair.c
index 767a821..dd8578f 100644
--- a/passt-repair.c
+++ b/passt-repair.c
@@ -46,7 +46,7 @@
  *
  * Return: 0 on success (EOF), 1 on error, 2 on usage error
  *
- * #syscalls:repair connect setsockopt write exit_group
+ * #syscalls:repair connect setsockopt write close exit_group
  * #syscalls:repair socket s390x:socketcall i686:socketcall
  * #syscalls:repair recvfrom recvmsg arm:recv ppc64le:recv
  * #syscalls:repair sendto sendmsg arm:send ppc64le:send

From 745c163e60b0e5da7bf6013645d79b4bdbf3e848 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Tue, 4 Feb 2025 16:42:15 +1100
Subject: [PATCH 204/382] tcp: Simplify handling of getsockname()

For migration we need to get the specific local address and port for
connected sockets with getsockname().  We currently open code marshalling
the results into the flow entry.

However, we already have inany_from_sockaddr() which handles the fiddly
parts of this, so use it.  Also report failures, which may make debugging
problems easier.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
[sbrivio: Drop re-declarations of 'sa' and 'sl']
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 tcp.c | 22 +++++-----------------
 1 file changed, 5 insertions(+), 17 deletions(-)

diff --git a/tcp.c b/tcp.c
index fac322c..af6bd95 100644
--- a/tcp.c
+++ b/tcp.c
@@ -1535,24 +1535,12 @@ static void tcp_conn_from_tap(const struct ctx *c, sa_family_t af,
 	tcp_epoll_ctl(c, conn);
 
 	if (c->mode == MODE_VU) { /* To rebind to same oport after migration */
-		if (af == AF_INET) {
-			struct sockaddr_in s_in;
-
-			sl = sizeof(s_in);
-			if (!getsockname(s, (struct sockaddr *)&s_in, &sl)) {
-				/* NOLINTNEXTLINE(clang-analyzer-core.CallAndMessage) */
-				tgt->oport = ntohs(s_in.sin_port);
-				tgt->oaddr = inany_from_v4(s_in.sin_addr);
-			}
+		sl = sizeof(sa);
+		if (!getsockname(s, &sa.sa, &sl)) {
+			inany_from_sockaddr(&tgt->oaddr, &tgt->oport, &sa);
 		} else {
-			struct sockaddr_in6 s_in6;
-
-			sl = sizeof(s_in6);
-			if (!getsockname(s, (struct sockaddr *)&s_in6, &sl)) {
-				/* NOLINTNEXTLINE(clang-analyzer-core.CallAndMessage) */
-				tgt->oport = ntohs(s_in6.sin6_port);
-				tgt->oaddr.a6 = s_in6.sin6_addr;
-			}
+			err("Failed to get local address for socket: %s",
+			    strerror_(errno));
 		}
 	}
 

From d0006fa784a7de881db187756770d2492c75df5d Mon Sep 17 00:00:00 2001
From: Paul Holzinger <pholzing@redhat.com>
Date: Wed, 5 Feb 2025 14:00:41 +0100
Subject: [PATCH 205/382] treewide: use _exit() over exit()

In the podman CI I noticed many seccomp denials in our logs even though
tests passed:
comm="pasta.avx2" exe="/usr/bin/pasta.avx2" sig=31 arch=c000003e
syscall=202 compat=0 ip=0x7fb3d31f69db code=0x80000000

That is futex being called and blocked by the pasta profile. After a
few tries, I managed to reproduce it locally with this loop in ~20 min:
while :;
  do podman run -d --network bridge quay.io/libpod/testimage:20241011 \
	sleep 100 && \
  sleep 10 && \
  podman rm -fa -t0
done

And using a pasta version with prctl(PR_SET_DUMPABLE, 1); set I got the
following stack trace:
Stack trace of thread 1:
  #0  0x00007fc95e6de91b __lll_lock_wait_private (libc.so.6 + 0x9491b)
  #1  0x00007fc95e68d6de __run_exit_handlers (libc.so.6 + 0x436de)
  #2  0x00007fc95e68d70e exit (libc.so.6 + 0x4370e)
  #3  0x000055f31b78c50b n/a (n/a + 0x0)
  #4  0x00007fc95e68d70e exit (libc.so.6 + 0x4370e)
  #5  0x000055f31b78d5a2 n/a (n/a + 0x0)

Pasta got killed in exit(): it seems glibc tries to take a lock when
running exit handlers, even though no exit handlers are defined.

Given that no exit handlers are needed, we can call _exit() instead.
Compared to exit(), this skips exit handlers and does not flush stdio
streams, which should be fine for the use here.

Based on the input from Stefano I did not change the test/doc programs
or qrap as they do not use seccomp filters.
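
For illustration only (not part of this change), the difference boils
down to this: with _exit(), the atexit() handler never runs and
unflushed stdio output is lost, while exit() would run the handler and
flush the stream:

/* exit() vs _exit(): run as-is and neither line appears; switch the
 * _exit(0) to exit(0) and both the buffered output and the handler
 * message show up.
 */
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

static void handler(void)
{
	fprintf(stderr, "atexit() handler ran\n");
}

int main(void)
{
	atexit(handler);
	printf("buffered, not yet flushed");	/* no newline on purpose */

	_exit(0);	/* exit(0) would flush stdout and run handler() */
}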

Signed-off-by: Paul Holzinger <pholzing@redhat.com>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 conf.c       | 8 ++++----
 log.h        | 4 ++--
 passt.c      | 8 ++++----
 pasta.c      | 8 ++++----
 tap.c        | 2 +-
 util.c       | 8 ++++----
 vhost_user.c | 2 +-
 7 files changed, 20 insertions(+), 20 deletions(-)

diff --git a/conf.c b/conf.c
index df2b016..6817377 100644
--- a/conf.c
+++ b/conf.c
@@ -769,7 +769,7 @@ static void conf_ip6_local(struct ip6_ctx *ip6)
  * usage() - Print usage, exit with given status code
  * @name:	Executable name
  * @f:		Stream to print usage info to
- * @status:	Status code for exit()
+ * @status:	Status code for _exit()
  */
 static void usage(const char *name, FILE *f, int status)
 {
@@ -925,7 +925,7 @@ static void usage(const char *name, FILE *f, int status)
 		"    SPEC is as described for TCP above\n"
 		"    default: none\n");
 
-	exit(status);
+	_exit(status);
 
 pasta_opts:
 
@@ -980,7 +980,7 @@ pasta_opts:
 		"  --ns-mac-addr ADDR	Set MAC address on tap interface\n"
 		"  --no-splice		Disable inbound socket splicing\n");
 
-	exit(status);
+	_exit(status);
 }
 
 /**
@@ -1482,7 +1482,7 @@ void conf(struct ctx *c, int argc, char **argv)
 			FPRINTF(stdout,
 				c->mode == MODE_PASTA ? "pasta " : "passt ");
 			FPRINTF(stdout, VERSION_BLOB);
-			exit(EXIT_SUCCESS);
+			_exit(EXIT_SUCCESS);
 		case 15:
 			ret = snprintf(c->ip4.ifname_out,
 				       sizeof(c->ip4.ifname_out), "%s", optarg);
diff --git a/log.h b/log.h
index a30b091..22c7b9a 100644
--- a/log.h
+++ b/log.h
@@ -32,13 +32,13 @@ void logmsg_perror(int pri, const char *format, ...)
 #define die(...)							\
 	do {								\
 		err(__VA_ARGS__);					\
-		exit(EXIT_FAILURE);					\
+		_exit(EXIT_FAILURE);					\
 	} while (0)
 
 #define die_perror(...)							\
 	do {								\
 		err_perror(__VA_ARGS__);				\
-		exit(EXIT_FAILURE);					\
+		_exit(EXIT_FAILURE);					\
 	} while (0)
 
 extern int log_trace;
diff --git a/passt.c b/passt.c
index b1c8ab6..53fdd38 100644
--- a/passt.c
+++ b/passt.c
@@ -167,7 +167,7 @@ void exit_handler(int signal)
 {
 	(void)signal;
 
-	exit(EXIT_SUCCESS);
+	_exit(EXIT_SUCCESS);
 }
 
 /**
@@ -210,7 +210,7 @@ int main(int argc, char **argv)
 	sigaction(SIGQUIT, &sa, NULL);
 
 	if (argc < 1)
-		exit(EXIT_FAILURE);
+		_exit(EXIT_FAILURE);
 
 	strncpy(argv0, argv[0], PATH_MAX - 1);
 	name = basename(argv0);
@@ -226,7 +226,7 @@ int main(int argc, char **argv)
 	} else if (strstr(name, "passt")) {
 		c.mode = MODE_PASST;
 	} else {
-		exit(EXIT_FAILURE);
+		_exit(EXIT_FAILURE);
 	}
 
 	madvise(pkt_buf, TAP_BUF_BYTES, MADV_HUGEPAGE);
@@ -259,7 +259,7 @@ int main(int argc, char **argv)
 	flow_init();
 
 	if ((!c.no_udp && udp_init(&c)) || (!c.no_tcp && tcp_init(&c)))
-		exit(EXIT_FAILURE);
+		_exit(EXIT_FAILURE);
 
 	proto_update_l2_buf(c.guest_mac, c.our_tap_mac);
 
diff --git a/pasta.c b/pasta.c
index ff41c95..f15084d 100644
--- a/pasta.c
+++ b/pasta.c
@@ -73,12 +73,12 @@ void pasta_child_handler(int signal)
 	    !waitid(P_PID, pasta_child_pid, &infop, WEXITED | WNOHANG)) {
 		if (infop.si_pid == pasta_child_pid) {
 			if (infop.si_code == CLD_EXITED)
-				exit(infop.si_status);
+				_exit(infop.si_status);
 
 			/* If killed by a signal, si_status is the number.
 			 * Follow common shell convention of returning it + 128.
 			 */
-			exit(infop.si_status + 128);
+			_exit(infop.si_status + 128);
 
 			/* Nothing to do, detached PID namespace going away */
 		}
@@ -499,7 +499,7 @@ void pasta_netns_quit_inotify_handler(struct ctx *c, int inotify_fd)
 		return;
 
 	info("Namespace %s is gone, exiting", c->netns_base);
-	exit(EXIT_SUCCESS);
+	_exit(EXIT_SUCCESS);
 }
 
 /**
@@ -525,7 +525,7 @@ void pasta_netns_quit_timer_handler(struct ctx *c, union epoll_ref ref)
 			return;
 
 		info("Namespace %s is gone, exiting", c->netns_base);
-		exit(EXIT_SUCCESS);
+		_exit(EXIT_SUCCESS);
 	}
 
 	close(fd);
diff --git a/tap.c b/tap.c
index 772648f..8c92d23 100644
--- a/tap.c
+++ b/tap.c
@@ -1002,7 +1002,7 @@ void tap_sock_reset(struct ctx *c)
 	info("Client connection closed%s", c->one_off ? ", exiting" : "");
 
 	if (c->one_off)
-		exit(EXIT_SUCCESS);
+		_exit(EXIT_SUCCESS);
 
 	/* Close the connected socket, wait for a new connection */
 	epoll_del(c, c->fd_tap);
diff --git a/util.c b/util.c
index 800c6b5..4d51e04 100644
--- a/util.c
+++ b/util.c
@@ -405,7 +405,7 @@ void pidfile_write(int fd, pid_t pid)
 
 	if (write(fd, pid_buf, n) < 0) {
 		perror("PID file write");
-		exit(EXIT_FAILURE);
+		_exit(EXIT_FAILURE);
 	}
 
 	close(fd);
@@ -441,12 +441,12 @@ int __daemon(int pidfile_fd, int devnull_fd)
 
 	if (pid == -1) {
 		perror("fork");
-		exit(EXIT_FAILURE);
+		_exit(EXIT_FAILURE);
 	}
 
 	if (pid) {
 		pidfile_write(pidfile_fd, pid);
-		exit(EXIT_SUCCESS);
+		_exit(EXIT_SUCCESS);
 	}
 
 	if (setsid()				< 0 ||
@@ -454,7 +454,7 @@ int __daemon(int pidfile_fd, int devnull_fd)
 	    dup2(devnull_fd, STDOUT_FILENO)	< 0 ||
 	    dup2(devnull_fd, STDERR_FILENO)	< 0 ||
 	    close(devnull_fd))
-		exit(EXIT_FAILURE);
+		_exit(EXIT_FAILURE);
 
 	return 0;
 }
diff --git a/vhost_user.c b/vhost_user.c
index 9e38cfd..159f0b3 100644
--- a/vhost_user.c
+++ b/vhost_user.c
@@ -60,7 +60,7 @@ void vu_print_capabilities(void)
 	info("{");
 	info("  \"type\": \"net\"");
 	info("}");
-	exit(EXIT_SUCCESS);
+	_exit(EXIT_SUCCESS);
 }
 
 /**

From a9d63f91a59a4c02cd77af41fa70d82e73f17576 Mon Sep 17 00:00:00 2001
From: Paul Holzinger <pholzing@redhat.com>
Date: Wed, 5 Feb 2025 14:00:42 +0100
Subject: [PATCH 206/382] passt-repair: use _exit() over return

Returning from main() does the same as calling exit(), which is not
good, as glibc might try to call futex(), which will be blocked by
seccomp. See the previous commit "treewide: use _exit() over exit()"
for a more detailed explanation.

Signed-off-by: Paul Holzinger <pholzing@redhat.com>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 passt-repair.c | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/passt-repair.c b/passt-repair.c
index dd8578f..6f79423 100644
--- a/passt-repair.c
+++ b/passt-repair.c
@@ -71,7 +71,7 @@ int main(int argc, char **argv)
 	if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) ||
 	    prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog)) {
 		fprintf(stderr, "Failed to apply seccomp filter");
-		return 1;
+		_exit(1);
 	}
 
 	iov = (struct iovec){ &cmd, sizeof(cmd) };
@@ -80,42 +80,42 @@ int main(int argc, char **argv)
 
 	if (argc != 2) {
 		fprintf(stderr, "Usage: %s PATH\n", argv[0]);
-		return 2;
+		_exit(2);
 	}
 
 	ret = snprintf(a.sun_path, sizeof(a.sun_path), "%s", argv[1]);
 	if (ret <= 0 || ret >= (int)sizeof(a.sun_path)) {
 		fprintf(stderr, "Invalid socket path: %s\n", argv[1]);
-		return 2;
+		_exit(2);
 	}
 
 	if ((s = socket(AF_UNIX, SOCK_STREAM, 0)) < 0) {
 		perror("Failed to create AF_UNIX socket");
-		return 1;
+		_exit(1);
 	}
 
 	if (connect(s, (struct sockaddr *)&a, sizeof(a))) {
 		fprintf(stderr, "Failed to connect to %s: %s\n", argv[1],
 			strerror(errno));
-		return 1;
+		_exit(1);
 	}
 
 loop:
 	ret = recvmsg(s, &msg, 0);
 	if (ret < 0) {
 		perror("Failed to receive message");
-		return 1;
+		_exit(1);
 	}
 
 	if (!ret)	/* Done */
-		return 0;
+		_exit(0);
 
 	if (!cmsg ||
 	    cmsg->cmsg_len < CMSG_LEN(sizeof(int)) ||
 	    cmsg->cmsg_len > CMSG_LEN(sizeof(int) * SCM_MAX_FD) ||
 	    cmsg->cmsg_type != SCM_RIGHTS) {
 		fprintf(stderr, "No/bad ancillary data from peer\n");
-		return 1;
+		_exit(1);
 	}
 
 	n = cmsg->cmsg_len / CMSG_LEN(sizeof(int));
@@ -124,7 +124,7 @@ loop:
 	if (cmd != TCP_REPAIR_ON && cmd != TCP_REPAIR_OFF &&
 	    cmd != TCP_REPAIR_OFF_NO_WP) {
 		fprintf(stderr, "Unsupported command 0x%04x\n", cmd);
-		return 1;
+		_exit(1);
 	}
 
 	for (i = 0; i < n; i++) {
@@ -134,7 +134,7 @@ loop:
 			fprintf(stderr,
 				"Setting TCP_REPAIR to %i on socket %i: %s", o,
 				fds[i], strerror(errno));
-			return 1;
+			_exit(1);
 		}
 
 		/* Close _our_ copy */
@@ -144,11 +144,11 @@ loop:
 		if (send(s, &cmd, sizeof(cmd), 0) < 0) {
 			fprintf(stderr, "Reply to command %i: %s\n",
 				o, strerror(errno));
-			return 1;
+			_exit(1);
 		}
 	}
 
 	goto loop;
 
-	return 0;
+	_exit(0);
 }

From 9215f68a0c2ad274b73862bc865fbdbb464e182a Mon Sep 17 00:00:00 2001
From: Stefano Brivio <sbrivio@redhat.com>
Date: Wed, 5 Feb 2025 16:57:55 +0100
Subject: [PATCH 207/382] passt-repair: Build fixes for musl

When building against musl headers:

- sizeof() needs stddef.h, as it should be;

- we can't initialise a struct msghdr by simply listing fields in
  order, as they contain explicit padding fields. Use field names
  instead.
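
For illustration only (not part of this change): on 64-bit musl,
struct msghdr carries explicit int padding members around msg_iovlen
and msg_controllen, so a positional initialiser such as
{ NULL, 0, &iov, 1, buf, sizeof(buf), 0 } assigns values to the wrong
members and breaks the build. Designated initialisers don't depend on
the layout at all:

/* Sketch: initialise struct msghdr by field name, so it builds no
 * matter whether the C library inserts padding members.
 */
#include <stddef.h>
#include <sys/socket.h>
#include <sys/uio.h>

static void msg_setup(struct msghdr *msg, struct iovec *iov,
		      char *cbuf, size_t cbuf_len)
{
	*msg = (struct msghdr){ .msg_name = NULL, .msg_namelen = 0,
				.msg_iov = iov, .msg_iovlen = 1,
				.msg_control = cbuf,
				.msg_controllen = cbuf_len,
				.msg_flags = 0 };
}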

Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
---
 passt-repair.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/passt-repair.c b/passt-repair.c
index 6f79423..3c3247b 100644
--- a/passt-repair.c
+++ b/passt-repair.c
@@ -21,6 +21,7 @@
 #include <sys/socket.h>
 #include <sys/un.h>
 #include <errno.h>
+#include <stddef.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
@@ -75,7 +76,11 @@ int main(int argc, char **argv)
 	}
 
 	iov = (struct iovec){ &cmd, sizeof(cmd) };
-	msg = (struct msghdr){ NULL, 0, &iov, 1, buf, sizeof(buf), 0 };
+	msg = (struct msghdr){ .msg_name = NULL, .msg_namelen = 0,
+			       .msg_iov = &iov, .msg_iovlen = 1,
+			       .msg_control = buf,
+			       .msg_controllen = sizeof(buf),
+			       .msg_flags = 0 };
 	cmsg = CMSG_FIRSTHDR(&msg);
 
 	if (argc != 2) {

From 593be3277429f0a2c06f6bebab4f20736c96abc8 Mon Sep 17 00:00:00 2001
From: Stefano Brivio <sbrivio@redhat.com>
Date: Wed, 5 Feb 2025 17:02:27 +0100
Subject: [PATCH 208/382] passt-repair.1: Fix indication of TCP_REPAIR
 constants

...perhaps I should adopt the healthy habit of actually reading
headers instead of using my mental copy.

Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
---
 passt-repair.1 | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/passt-repair.1 b/passt-repair.1
index 8d07c97..7c1b140 100644
--- a/passt-repair.1
+++ b/passt-repair.1
@@ -31,7 +31,7 @@ same as the UNIX domain socket used for guest communication, suffixed by
 \fI.repair\fR.
 
 The messages consist of one 8-bit signed integer that can be \fITCP_REPAIR_ON\fR
-(1), \fITCP_REPAIR_OFF\fR (2), or \fITCP_REPAIR_OFF_WP\fR (-1), as defined by
+(1), \fITCP_REPAIR_OFF\fR (0), or \fITCP_REPAIR_OFF_NO_WP\fR (-1), as defined by
 the Linux kernel user API, and one to SCM_MAX_FD (253) sockets as SCM_RIGHTS
 (see \fBunix\fR(7)) ancillary message, sent by the server, \fBpasst\fR(1).
 

From f66769c2de82550ac1ee2548960c09a4b052341f Mon Sep 17 00:00:00 2001
From: Stefano Brivio <sbrivio@redhat.com>
Date: Wed, 5 Feb 2025 17:21:59 +0100
Subject: [PATCH 209/382] apparmor: Workaround for unconfined libvirtd when
 triggered by unprivileged user

If libvirtd is triggered by an unprivileged user, the virt-aa-helper
mechanism doesn't work, because per-VM profiles can't be instantiated,
and as a result libvirtd runs unconfined.

This means passt can't start, because the passt subprofile from
libvirt's profile is not loaded either.

Example:

  $ virsh start alpine
  error: Failed to start domain 'alpine'
  error: internal error: Child process (passt --one-off --socket /run/user/1000/libvirt/qemu/run/passt/1-alpine-net0.socket --pid /run/user/1000/libvirt/qemu/run/passt/1-alpine-net0-passt.pid --tcp-ports 40922:2) unexpected fatal signal 11

Add an annoying workaround for the time being. It's much better than
encouraging users to start guests as root, or to disable AppArmor
altogether.

Reported-by: Prafulla Giri <prafulla.giri@protonmail.com>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 contrib/apparmor/usr.bin.passt | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/contrib/apparmor/usr.bin.passt b/contrib/apparmor/usr.bin.passt
index 9568189..62a4514 100644
--- a/contrib/apparmor/usr.bin.passt
+++ b/contrib/apparmor/usr.bin.passt
@@ -27,4 +27,25 @@ profile passt /usr/bin/passt{,.avx2} {
 
   owner @{HOME}/**			w,	# pcap(), pidfile_open(),
 						# pidfile_write()
+
+  # Workaround: libvirt's profile comes with a passt subprofile which includes,
+  # in turn, <abstractions/passt>, and adds libvirt-specific rules on top, to
+  # allow passt (when started by libvirtd) to write socket and PID files in the
+  # location requested by libvirtd itself, and to execute passt itself.
+  #
+  # However, when libvirt runs as unprivileged user, the mechanism based on
+  # virt-aa-helper, designed to build per-VM profiles as guests are started,
+  # doesn't work. The helper needs to create and load profiles on the fly, which
+  # can't be done by unprivileged users, of course.
+  #
+  # As a result, libvirtd runs unconfined if guests are started by unprivileged
+  # users, starting passt unconfined as well, which means that passt runs under
+  # its own stand-alone profile (this one), which implies in turn that execve()
+  # of /usr/bin/passt is not allowed, and socket and PID files can't be written.
+  #
+  # Duplicate libvirt-specific rules here as long as this is not solved in
+  # libvirt's profile itself.
+  /usr/bin/passt r,
+  owner @{run}/user/[0-9]*/libvirt/qemu/run/passt/* rw,
+  owner @{run}/libvirt/qemu/passt/* rw,
 }

From 0da87b393b63747526d162c728987f320b41771e Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Thu, 6 Feb 2025 16:49:42 +1100
Subject: [PATCH 210/382] debug: Add tcpdump to mbuto.img

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 test/passt.mbuto | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/test/passt.mbuto b/test/passt.mbuto
index 138d365..d4d57cb 100755
--- a/test/passt.mbuto
+++ b/test/passt.mbuto
@@ -13,7 +13,7 @@
 PROGS="${PROGS:-ash,dash,bash ip mount ls insmod mkdir ln cat chmod lsmod
        modprobe find grep mknod mv rm umount jq iperf3 dhclient hostname
        sed tr chown sipcalc cut socat dd strace ping tail killall sleep sysctl
-       nproc tcp_rr tcp_crr udp_rr which tee seq bc sshd ssh-keygen cmp}"
+       nproc tcp_rr tcp_crr udp_rr which tee seq bc sshd ssh-keygen cmp tcpdump}"
 
 # OpenSSH 9.8 introduced split binaries, with sshd being the daemon, and
 # sshd-session the per-session program. We need the latter as well, and the path
@@ -65,6 +65,7 @@ EOF
 	# sshd via vsock
 	cat > /etc/passwd << EOF
 root:x:0:0:root:/root:/bin/sh
+tcpdump:x:72:72:tcpdump:/:/sbin/nologin
 sshd:x:100:100:Privilege-separated SSH:/var/empty/sshd:/sbin/nologin
 EOF
 	cat > /etc/shadow << EOF

From a5cca995dee9b4196d41c86034a4948d346266ca Mon Sep 17 00:00:00 2001
From: Stefano Brivio <sbrivio@redhat.com>
Date: Thu, 6 Feb 2025 09:33:05 +0100
Subject: [PATCH 211/382] conf, passt.1: Un-deprecate --host-lo-to-ns-lo

It was established behaviour, and it's now the third report about it:
users ask how to achieve the same functionality, and we don't have a
better answer yet.

The idea behind declaring it deprecated to start with, I guess, was
that we would eventually replace it with more flexible and generic
configuration options, which is still planned. But there's nothing
preventing us from aliasing this to a particular configuration in the
future.

So, stop scaring users off, and un-deprecate this.

Link: https://archives.passt.top/passt-dev/20240925102009.62b9a0ce@elisabeth/
Link: https://github.com/rootless-containers/rootlesskit/pull/482#issuecomment-2591855705
Link: https://github.com/moby/moby/issues/48838
Link: https://github.com/containers/podman/discussions/25243
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
---
 conf.c  | 3 +--
 passt.1 | 2 +-
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/conf.c b/conf.c
index 6817377..f5d04db 100644
--- a/conf.c
+++ b/conf.c
@@ -963,8 +963,7 @@ pasta_opts:
 		"  -U, --udp-ns SPEC	UDP port forwarding to init namespace\n"
 		"    SPEC is as described above\n"
 		"    default: auto\n"
-		"  --host-lo-to-ns-lo	DEPRECATED:\n"
-		"			Translate host-loopback forwards to\n"
+		"  --host-lo-to-ns-lo	Translate host-loopback forwards to\n"
 		"			namespace loopback\n"
 		"  --userns NSPATH 	Target user namespace to join\n"
 		"  --netns PATH|NAME	Target network namespace to join\n"
diff --git a/passt.1 b/passt.1
index d9cd33e..2928af5 100644
--- a/passt.1
+++ b/passt.1
@@ -622,7 +622,7 @@ Configure UDP port forwarding from target namespace to init namespace.
 Default is \fBauto\fR.
 
 .TP
-.BR \-\-host-lo-to-ns-lo " " (DEPRECATED)
+.BR \-\-host-lo-to-ns-lo
 If specified, connections forwarded with \fB\-t\fR and \fB\-u\fR from
 the host's loopback address will appear on the loopback address in the
 guest as well.  Without this option such forwarded packets will appear

From a0b7f56b3a3c220b3d8065d7cfdd83a6e3919467 Mon Sep 17 00:00:00 2001
From: Stefano Brivio <sbrivio@redhat.com>
Date: Fri, 7 Feb 2025 01:51:38 +0100
Subject: [PATCH 212/382] passt-repair: Don't use perror(), accept ECONNRESET
 as termination

If we use glibc's perror(), we need to allow dup() and fcntl() in our
seccomp profiles, which are a bit too much for this simple helper. On
top of that, we would probably need a wrapper to avoid allocation for
translated messages.

While at it: ECONNRESET is just a close() from passt, treat it like
EOF.

Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
---
 passt-repair.c | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/passt-repair.c b/passt-repair.c
index 3c3247b..d137a18 100644
--- a/passt-repair.c
+++ b/passt-repair.c
@@ -95,7 +95,7 @@ int main(int argc, char **argv)
 	}
 
 	if ((s = socket(AF_UNIX, SOCK_STREAM, 0)) < 0) {
-		perror("Failed to create AF_UNIX socket");
+		fprintf(stderr, "Failed to create AF_UNIX socket: %i\n", errno);
 		_exit(1);
 	}
 
@@ -108,8 +108,12 @@ int main(int argc, char **argv)
 loop:
 	ret = recvmsg(s, &msg, 0);
 	if (ret < 0) {
-		perror("Failed to receive message");
-		_exit(1);
+		if (errno == ECONNRESET) {
+			ret = 0;
+		} else {
+			fprintf(stderr, "Failed to read message: %i\n", errno);
+			_exit(1);
+		}
 	}
 
 	if (!ret)	/* Done */

From 0f009ea598707c5978846387d716f4a612d07b36 Mon Sep 17 00:00:00 2001
From: Stefano Brivio <sbrivio@redhat.com>
Date: Fri, 7 Feb 2025 01:55:08 +0100
Subject: [PATCH 213/382] passt-repair: Fix calculation of payload length from
 cmsg_len

There's no inverse function for CMSG_LEN(), so we need to loop over
SCM_MAX_FD (253) possible input values. The previous calculation is
clearly wrong, as not every int takes CMSG_LEN(sizeof(int)) in cmsg
data.
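
To see why, note that CMSG_LEN(n * sizeof(int)) is one cmsghdr header
plus the payload, so dividing it by CMSG_LEN(sizeof(int)) counts the
header once per element. A stand-alone check, for illustration only
(the resulting numbers depend on the platform; on glibc/x86_64 the
"old formula" column is stuck at 1):

/* Show that cmsg_len / CMSG_LEN(sizeof(int)) is not the number of file
 * descriptors carried in the ancillary data.
 */
#include <stdio.h>
#include <sys/socket.h>

int main(void)
{
	unsigned n;

	for (n = 1; n <= 4; n++) {
		size_t len = CMSG_LEN(sizeof(int) * n);

		printf("fds: %u  cmsg_len: %zu  old formula: %zu\n",
		       n, len, len / CMSG_LEN(sizeof(int)));
	}
	return 0;
}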

Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 passt-repair.c | 18 ++++++++++++++++--
 1 file changed, 16 insertions(+), 2 deletions(-)

diff --git a/passt-repair.c b/passt-repair.c
index d137a18..5ad5c9c 100644
--- a/passt-repair.c
+++ b/passt-repair.c
@@ -57,7 +57,7 @@ int main(int argc, char **argv)
 	char buf[CMSG_SPACE(sizeof(int) * SCM_MAX_FD)]
 	     __attribute__ ((aligned(__alignof__(struct cmsghdr))));
 	struct sockaddr_un a = { AF_UNIX, "" };
-	int fds[SCM_MAX_FD], s, ret, i, n;
+	int fds[SCM_MAX_FD], s, ret, i, n = 0;
 	struct sock_fprog prog;
 	int8_t cmd = INT8_MAX;
 	struct cmsghdr *cmsg;
@@ -127,7 +127,21 @@ loop:
 		_exit(1);
 	}
 
-	n = cmsg->cmsg_len / CMSG_LEN(sizeof(int));
+	/* There's no inverse formula for CMSG_LEN(x): building one from CMSG_LEN(0)
+	 * appears to work, but it's not guaranteed to. Search the whole domain.
+	 */
+	for (i = 1; i < SCM_MAX_FD; i++) {
+		if (CMSG_LEN(sizeof(int) * i) == cmsg->cmsg_len) {
+			n = i;
+			break;
+		}
+	}
+	if (!n) {
+		fprintf(stderr, "Invalid ancillary data length %zu from peer\n",
+			cmsg->cmsg_len);
+		_exit(1);
+	}
+
 	memcpy(fds, CMSG_DATA(cmsg), sizeof(int) * n);
 
 	if (cmd != TCP_REPAIR_ON && cmd != TCP_REPAIR_OFF &&

From b7b70ba24369891d79079d247f246c1e357948d2 Mon Sep 17 00:00:00 2001
From: Stefano Brivio <sbrivio@redhat.com>
Date: Fri, 7 Feb 2025 01:58:00 +0100
Subject: [PATCH 214/382] passt-repair: Dodge "structurally unreachable code"
 warning from Coverity

While main() conventionally returns int, and we need a return at the
end of the function to avoid compiler warnings, turning that return
into _exit() to avoid exit handlers triggers a Coverity warning. It's
unreachable code anyway, so switch that single occurence back to a
plain return.

Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
---
 passt-repair.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/passt-repair.c b/passt-repair.c
index 5ad5c9c..322066a 100644
--- a/passt-repair.c
+++ b/passt-repair.c
@@ -173,5 +173,5 @@ loop:
 
 	goto loop;
 
-	_exit(0);
+	return 0;
 }

From fe8b6a7c42625ee1fc63186204d32458b1ba31b9 Mon Sep 17 00:00:00 2001
From: Enrique Llorente <ellorent@redhat.com>
Date: Tue, 4 Feb 2025 10:43:37 +0100
Subject: [PATCH 215/382] dhcp: Don't re-use request message for reply

The logic composing the DHCP reply message currently reuses the request
message to compose it; future long options, such as FQDN, may exceed
the request message limit, making the reply go beyond the lower bound.

This change creates a new reply message with a fixed options size of 308
and fills it in with the proper fields from the request, adding the
generated options on top; this way, the reply's lower bound does not
depend on the request.

Signed-off-by: Enrique Llorente <ellorent@redhat.com>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 dhcp.c | 27 ++++++++++++++++++++-------
 1 file changed, 20 insertions(+), 7 deletions(-)

diff --git a/dhcp.c b/dhcp.c
index d8515aa..2a23ed4 100644
--- a/dhcp.c
+++ b/dhcp.c
@@ -151,9 +151,6 @@ static int fill(struct msg *m)
 {
 	int i, o, offset = 0;
 
-	m->op = BOOTREPLY;
-	m->secs = 0;
-
 	for (o = 0; o < 255; o++)
 		opts[o].sent = 0;
 
@@ -291,8 +288,9 @@ int dhcp(const struct ctx *c, const struct pool *p)
 	const struct ethhdr *eh;
 	const struct iphdr *iph;
 	const struct udphdr *uh;
+	struct msg const *m;
+	struct msg reply;
 	unsigned int i;
-	struct msg *m;
 
 	eh  = packet_get(p, 0, offset, sizeof(*eh),  NULL);
 	offset += sizeof(*eh);
@@ -321,6 +319,22 @@ int dhcp(const struct ctx *c, const struct pool *p)
 	    m->op != BOOTREQUEST)
 		return -1;
 
+	reply.op		= BOOTREPLY;
+	reply.htype		= m->htype;
+	reply.hlen		= m->hlen;
+	reply.hops		= 0;
+	reply.xid		= m->xid;
+	reply.secs		= 0;
+	reply.flags		= m->flags;
+	reply.ciaddr		= m->ciaddr;
+	reply.yiaddr		= c->ip4.addr;
+	reply.siaddr		= 0;
+	reply.giaddr		= m->giaddr;
+	memcpy(&reply.chaddr,	m->chaddr,	sizeof(reply.chaddr));
+	memset(&reply.sname,	0,		sizeof(reply.sname));
+	memset(&reply.file,	0,		sizeof(reply.file));
+	reply.magic		= m->magic;
+
 	offset += offsetof(struct msg, o);
 
 	for (i = 0; i < ARRAY_SIZE(opts); i++)
@@ -364,7 +378,6 @@ int dhcp(const struct ctx *c, const struct pool *p)
 
 	info("    from %s", eth_ntop(m->chaddr, macstr, sizeof(macstr)));
 
-	m->yiaddr = c->ip4.addr;
 	mask.s_addr = htonl(0xffffffff << (32 - c->ip4.prefix_len));
 	memcpy(opts[1].s,  &mask,                sizeof(mask));
 	memcpy(opts[3].s,  &c->ip4.guest_gw,     sizeof(c->ip4.guest_gw));
@@ -401,14 +414,14 @@ int dhcp(const struct ctx *c, const struct pool *p)
 	if (!c->no_dhcp_dns_search)
 		opt_set_dns_search(c, sizeof(m->o));
 
-	dlen = offsetof(struct msg, o) + fill(m);
+	dlen = offsetof(struct msg, o) + fill(&reply);
 
 	if (m->flags & FLAG_BROADCAST)
 		dst = in4addr_broadcast;
 	else
 		dst = c->ip4.addr;
 
-	tap_udp4_send(c, c->ip4.our_tap_addr, 67, dst, 68, m, dlen);
+	tap_udp4_send(c, c->ip4.our_tap_addr, 67, dst, 68, &reply, dlen);
 
 	return 1;
 }

From 864be475d9db58c93540eb883ecf656c3eff861f Mon Sep 17 00:00:00 2001
From: Stefano Brivio <sbrivio@redhat.com>
Date: Fri, 7 Feb 2025 08:59:57 +0100
Subject: [PATCH 216/382] passt-repair: Send one confirmation *per command*,
 not *per socket*

It looks like me, myself and I couldn't agree on the "simple" protocol
between passt and passt-repair. The man page and passt say it's one
confirmation per command, but the passt-repair implementation had one
confirmation per socket instead.

This caused all sorts of mysterious issues, with repair mode
pseudo-randomly enabled, leading to hours of fun (mostly not
mine). Oops.

Switch to one confirmation per command (of course).

Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
---
 passt-repair.c | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/passt-repair.c b/passt-repair.c
index 322066a..614cee0 100644
--- a/passt-repair.c
+++ b/passt-repair.c
@@ -63,6 +63,7 @@ int main(int argc, char **argv)
 	struct cmsghdr *cmsg;
 	struct msghdr msg;
 	struct iovec iov;
+	int op;
 
 	prctl(PR_SET_DUMPABLE, 0);
 
@@ -150,25 +151,24 @@ loop:
 		_exit(1);
 	}
 
-	for (i = 0; i < n; i++) {
-		int o = cmd;
+	op = cmd;
 
-		if (setsockopt(fds[i], SOL_TCP, TCP_REPAIR, &o, sizeof(o))) {
+	for (i = 0; i < n; i++) {
+		if (setsockopt(fds[i], SOL_TCP, TCP_REPAIR, &op, sizeof(op))) {
 			fprintf(stderr,
-				"Setting TCP_REPAIR to %i on socket %i: %s", o,
+				"Setting TCP_REPAIR to %i on socket %i: %s", op,
 				fds[i], strerror(errno));
 			_exit(1);
 		}
 
 		/* Close _our_ copy */
 		close(fds[i]);
+	}
 
-		/* Confirm setting by echoing the command back */
-		if (send(s, &cmd, sizeof(cmd), 0) < 0) {
-			fprintf(stderr, "Reply to command %i: %s\n",
-				o, strerror(errno));
-			_exit(1);
-		}
+	/* Confirm setting by echoing the command back */
+	if (send(s, &cmd, sizeof(cmd), 0) < 0) {
+		fprintf(stderr, "Reply to %i: %s\n", op, strerror(errno));
+		_exit(1);
 	}
 
 	goto loop;

From a3d142a6f64d89fffe26634e158dedd55fa31e7b Mon Sep 17 00:00:00 2001
From: Stefano Brivio <sbrivio@redhat.com>
Date: Mon, 3 Feb 2025 09:22:10 +0100
Subject: [PATCH 217/382] conf: Don't map DNS traffic to host, if host gateway
 is a resolver

This should be a relatively common case and I'm a bit surprised it's
been broken since I added the "gateway mapping" functionality, but it
doesn't happen with Podman, and not with systemd-resolved or similar
local proxies, and also not with servers where typically the gateway
is just a router and not a DNS resolver. That could be the reason why
nobody noticed until now.

By default, we'll map the address of the default gateway, in
containers and guests, to represent "the host", so that we have a
well-defined way to reach the host. Say:

  0.0029:     NAT to host 127.0.0.1: 192.168.100.1

But if the host gateway is also a DNS resolver:

  0.0029: DNS:
  0.0029:     192.168.100.1

then we'll send DNS queries directed to it to the host instead:

  0.0372: Flow 0 (INI): TAP [192.168.100.157]:41892 -> [192.168.100.1]:53 => ?
  0.0372: Flow 0 (TGT): INI -> TGT
  0.0373: Flow 0 (TGT): TAP [192.168.100.157]:41892 -> [192.168.100.1]:53 => HOST [0.0.0.0]:41892 -> [127.0.0.1]:53
  0.0373: Flow 0 (UDP flow): TGT -> TYPED
  0.0373: Flow 0 (UDP flow): TAP [192.168.100.157]:41892 -> [192.168.100.1]:53 => HOST [0.0.0.0]:41892 -> [127.0.0.1]:53
  0.0373: Flow 0 (UDP flow): Side 0 hash table insert: bucket: 31049
  0.0374: Flow 0 (UDP flow): TYPED -> ACTIVE
  0.0374: Flow 0 (UDP flow): TAP [192.168.100.157]:41892 -> [192.168.100.1]:53 => HOST [0.0.0.0]:41892 -> [127.0.0.1]:53

which doesn't quite work, of course:

  0.0374: pasta: epoll event on UDP reply socket 95 (events: 0x00000008)
  0.0374: ICMP error on UDP socket 95: Connection refused

unless the host is a resolver itself... but then we wouldn't find the
address of the gateway in its /etc/resolv.conf, presumably.

Fix this by making an exception for DNS traffic: if the default
gateway is a resolver, match on DNS traffic going to the default
gateway, and explicitly forward it to the configured resolver.

Reported-by: Prafulla Giri <prafulla.giri@protonmail.com>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
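
To make this concrete with the addresses from the excerpts above: if
the host's /etc/resolv.conf lists 192.168.100.1, which is also the
default gateway we map to represent the host, guest queries to
192.168.100.1:53 are now forwarded to the actual 192.168.100.1
resolver, just like a --dns-forward address would be, instead of being
redirected to 127.0.0.1:53 on the host.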
---
 conf.c  | 16 ++++++++++------
 passt.1 | 14 ++++++++++----
 2 files changed, 20 insertions(+), 10 deletions(-)

diff --git a/conf.c b/conf.c
index f5d04db..142dc94 100644
--- a/conf.c
+++ b/conf.c
@@ -426,10 +426,12 @@ static void add_dns_resolv(struct ctx *c, const char *nameserver,
 		if (IN4_IS_ADDR_UNSPECIFIED(&c->ip4.dns_host))
 			c->ip4.dns_host = ns4;
 
-		/* Guest or container can only access local addresses via
-		 * redirect
+		/* Special handling if guest or container can only access local
+		 * addresses via redirect, or if the host gateway is also a
+		 * resolver and we shadow its address
 		 */
-		if (IN4_IS_ADDR_LOOPBACK(&ns4)) {
+		if (IN4_IS_ADDR_LOOPBACK(&ns4) ||
+		    IN4_ARE_ADDR_EQUAL(&ns4, &c->ip4.map_host_loopback)) {
 			if (IN4_IS_ADDR_UNSPECIFIED(&c->ip4.map_host_loopback))
 				return;
 
@@ -445,10 +447,12 @@ static void add_dns_resolv(struct ctx *c, const char *nameserver,
 		if (IN6_IS_ADDR_UNSPECIFIED(&c->ip6.dns_host))
 			c->ip6.dns_host = ns6;
 
-		/* Guest or container can only access local addresses via
-		 * redirect
+		/* Special handling if guest or container can only access local
+		 * addresses via redirect, or if the host gateway is also a
+		 * resolver and we shadow its address
 		 */
-		if (IN6_IS_ADDR_LOOPBACK(&ns6)) {
+		if (IN6_IS_ADDR_LOOPBACK(&ns6) ||
+		    IN6_ARE_ADDR_EQUAL(&ns6, &c->ip6.map_host_loopback)) {
 			if (IN6_IS_ADDR_UNSPECIFIED(&c->ip6.map_host_loopback))
 				return;
 
diff --git a/passt.1 b/passt.1
index 2928af5..29cc3ed 100644
--- a/passt.1
+++ b/passt.1
@@ -941,10 +941,16 @@ with destination 127.0.0.10, and the default IPv4 gateway is 192.0.2.1, while
 the last observed source address from guest or namespace is 192.0.2.2, this will
 be translated to a connection from 192.0.2.1 to 192.0.2.2.
 
-Similarly, for traffic coming from guest or namespace, packets with
-destination address corresponding to the \fB\-\-map-host-loopback\fR
-address will have their destination address translated to a loopback
-address.
+Similarly, for traffic coming from guest or namespace, packets with destination
+address corresponding to the \fB\-\-map-host-loopback\fR address will have their
+destination address translated to a loopback address.
+
+As an exception, traffic identified as DNS and originally directed to the
+\fB\-\-map-host-loopback\fR address is \fBnot\fR translated to loopback if that
+address matches a resolver address on the host. It is handled, instead, as if
+that address had been given as the \fB\-\-dns-forward\fR address, provided no
+such option was specified. In the common case where the host gateway also acts
+as a resolver, this prevents the host mapping from shadowing the gateway/resolver.
 
 .SS Handling of local traffic in pasta
 

From 31e8109a86eeebb473ffba8124a3f399cf0aeccf Mon Sep 17 00:00:00 2001
From: Enrique Llorente <ellorent@redhat.com>
Date: Fri, 7 Feb 2025 12:36:55 +0100
Subject: [PATCH 218/382] dhcp, dhcpv6: Add hostname and client fqdn ops

Both DHCPv4 and DHCPv6 have the capability to pass a hostname to
clients: DHCPv4 uses option 12 (Host Name) while DHCPv6 uses option 39
(Client FQDN). Some virtualisation deployments, such as KubeVirt,
expect the VirtualMachine name to be used as the guest hostname.

This change adds the following arguments:
 - -H, --hostname NAME to configure the hostname, DHCPv4 option 12
 - --fqdn NAME to configure the Client FQDN option for both DHCPv4
   (option 81) and DHCPv6 (option 39)

Signed-off-by: Enrique Llorente <ellorent@redhat.com>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
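
For reference, the Client FQDN options carry the name in the RFC 1035
wire format produced by the encode_domain_name() helper added below:
each label is prefixed by its length, and the name ends with a zero
byte, so the buffer needs strlen(name) + 2 bytes. A small usage sketch
(illustrative only, it assumes linking against util.c for the helper):

#include <stdio.h>
#include <string.h>

void encode_domain_name(char *buf, const char *domain_name);	/* util.h */

int main(void)
{
	const char *fqdn = "fqdn1.passt.test";
	char buf[254 /* PASST_MAXDNAME */];
	size_t i;

	/* "fqdn1.passt.test" encodes to:
	 *   \x05 fqdn1 \x05 passt \x04 test \x00
	 * that is, strlen(fqdn) + 2 bytes in total
	 */
	encode_domain_name(buf, fqdn);

	for (i = 0; i < strlen(fqdn) + 2; i++)
		printf("%02x ", (unsigned char)buf[i]);
	printf("\n");

	return 0;
}
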
---
 conf.c           | 20 ++++++++--
 dhcp.c           | 61 +++++++++++++++++++++++++----
 dhcpv6.c         | 99 ++++++++++++++++++++++++++++++++++++++++--------
 passt.1          | 10 +++++
 passt.h          |  5 +++
 pasta.c          | 17 +++++++--
 test/lib/setup   | 10 ++---
 test/passt.mbuto |  6 ++-
 test/passt/dhcp  | 15 +++++++-
 util.c           | 24 ++++++++++++
 util.h           |  6 +++
 11 files changed, 235 insertions(+), 38 deletions(-)

diff --git a/conf.c b/conf.c
index 142dc94..d9de07c 100644
--- a/conf.c
+++ b/conf.c
@@ -858,7 +858,9 @@ static void usage(const char *name, FILE *f, int status)
 		FPRINTF(f, "    default: use addresses from /etc/resolv.conf\n");
 	FPRINTF(f,
 		"  -S, --search LIST	Space-separated list, search domains\n"
-		"    a single, empty option disables the DNS search list\n");
+		"    a single, empty option disables the DNS search list\n"
+		"  -H, --hostname NAME 	Hostname to configure client with\n"
+		"  --fqdn NAME		FQDN to configure client with\n");
 	if (strstr(name, "pasta"))
 		FPRINTF(f, "    default: don't use any search list\n");
 	else
@@ -1316,6 +1318,7 @@ void conf(struct ctx *c, int argc, char **argv)
 		{"outbound",	required_argument,	NULL,		'o' },
 		{"dns",		required_argument,	NULL,		'D' },
 		{"search",	required_argument,	NULL,		'S' },
+		{"hostname",	required_argument,	NULL,		'H' },
 		{"no-tcp",	no_argument,		&c->no_tcp,	1 },
 		{"no-udp",	no_argument,		&c->no_udp,	1 },
 		{"no-icmp",	no_argument,		&c->no_icmp,	1 },
@@ -1360,6 +1363,7 @@ void conf(struct ctx *c, int argc, char **argv)
 		/* vhost-user backend program convention */
 		{"print-capabilities", no_argument,	NULL,		26 },
 		{"socket-path",	required_argument,	NULL,		's' },
+		{"fqdn",	required_argument,	NULL,		27 },
 		{ 0 },
 	};
 	const char *logname = (c->mode == MODE_PASTA) ? "pasta" : "passt";
@@ -1382,9 +1386,9 @@ void conf(struct ctx *c, int argc, char **argv)
 	if (c->mode == MODE_PASTA) {
 		c->no_dhcp_dns = c->no_dhcp_dns_search = 1;
 		fwd_default = FWD_AUTO;
-		optstring = "+dqfel:hF:I:p:P:m:a:n:M:g:i:o:D:S:46t:u:T:U:";
+		optstring = "+dqfel:hF:I:p:P:m:a:n:M:g:i:o:D:S:H:46t:u:T:U:";
 	} else {
-		optstring = "+dqfel:hs:F:p:P:m:a:n:M:g:i:o:D:S:461t:u:";
+		optstring = "+dqfel:hs:F:p:P:m:a:n:M:g:i:o:D:S:H:461t:u:";
 	}
 
 	c->tcp.fwd_in.mode = c->tcp.fwd_out.mode = FWD_UNSET;
@@ -1561,6 +1565,11 @@ void conf(struct ctx *c, int argc, char **argv)
 		case 26:
 			vu_print_capabilities();
 			break;
+		case 27:
+			if (snprintf_check(c->fqdn, PASST_MAXDNAME,
+					   "%s", optarg))
+				die("Invalid FQDN: %s", optarg);
+			break;
 		case 'd':
 			c->debug = 1;
 			c->quiet = 0;
@@ -1730,6 +1739,11 @@ void conf(struct ctx *c, int argc, char **argv)
 
 			die("Cannot use DNS search domain %s", optarg);
 			break;
+		case 'H':
+			if (snprintf_check(c->hostname, PASST_MAXDNAME,
+					   "%s", optarg))
+				die("Invalid hostname: %s", optarg);
+			break;
 		case '4':
 			v4_only = true;
 			v6_only = false;
diff --git a/dhcp.c b/dhcp.c
index 2a23ed4..401cb5b 100644
--- a/dhcp.c
+++ b/dhcp.c
@@ -63,6 +63,11 @@ static struct opt opts[255];
 
 #define OPT_MIN		60 /* RFC 951 */
 
+/* Total option size (excluding end option) is 576 (RFC 2131), minus
+ * offset of options (268), minus end option and its length (2).
+ */
+#define OPT_MAX		306
+
 /**
  * dhcp_init() - Initialise DHCP options
  */
@@ -122,7 +127,7 @@ struct msg {
 	uint8_t sname[64];
 	uint8_t file[128];
 	uint32_t magic;
-	uint8_t o[308];
+	uint8_t o[OPT_MAX + 2 /* End option and its length */ ];
 } __attribute__((__packed__));
 
 /**
@@ -130,15 +135,28 @@ struct msg {
  * @m:		Message to fill
  * @o:		Option number
  * @offset:	Current offset within options field, updated on insertion
+ *
+ * Return: false if m has space to write the option, true otherwise
  */
-static void fill_one(struct msg *m, int o, int *offset)
+static bool fill_one(struct msg *m, int o, int *offset)
 {
+	size_t slen = opts[o].slen;
+
+	/* If we don't have space to write the option, then just skip */
+	if (*offset + 1 /* length of option */ + slen > OPT_MAX)
+		return true;
+
 	m->o[*offset] = o;
-	m->o[*offset + 1] = opts[o].slen;
-	memcpy(&m->o[*offset + 2], opts[o].s, opts[o].slen);
+	m->o[*offset + 1] = slen;
+
+	/* Move to option */
+	*offset += 2;
+
+	memcpy(&m->o[*offset], opts[o].s, slen);
 
 	opts[o].sent = 1;
-	*offset += 2 + opts[o].slen;
+	*offset += slen;
+	return false;
 }
 
 /**
@@ -159,17 +177,20 @@ static int fill(struct msg *m)
 	 * Put it there explicitly, unless requested via option 55.
 	 */
 	if (opts[55].clen > 0 && !memchr(opts[55].c, 53, opts[55].clen))
-		fill_one(m, 53, &offset);
+		if (fill_one(m, 53, &offset))
+			 debug("DHCP: skipping option 53");
 
 	for (i = 0; i < opts[55].clen; i++) {
 		o = opts[55].c[i];
 		if (opts[o].slen != -1)
-			fill_one(m, o, &offset);
+			if (fill_one(m, o, &offset))
+				debug("DHCP: skipping option %i", o);
 	}
 
 	for (o = 0; o < 255; o++) {
 		if (opts[o].slen != -1 && !opts[o].sent)
-			fill_one(m, o, &offset);
+			if (fill_one(m, o, &offset))
+				debug("DHCP: skipping option %i", o);
 	}
 
 	m->o[offset++] = 255;
@@ -411,6 +432,30 @@ int dhcp(const struct ctx *c, const struct pool *p)
 	if (!opts[6].slen)
 		opts[6].slen = -1;
 
+	opt_len = strlen(c->hostname);
+	if (opt_len > 0) {
+		opts[12].slen = opt_len;
+		memcpy(opts[12].s, &c->hostname, opt_len);
+	}
+
+	opt_len = strlen(c->fqdn);
+	if (opt_len > 0) {
+		opt_len += 3 /* flags */
+			+ 2; /* Length byte for first label, and terminator */
+
+		if (sizeof(opts[81].s) >= opt_len) {
+			opts[81].s[0] = 0x4; /* flags (E) */
+			opts[81].s[1] = 0xff; /* RCODE1 */
+			opts[81].s[2] = 0xff; /* RCODE2 */
+
+			encode_domain_name((char *)opts[81].s + 3, c->fqdn);
+
+			opts[81].slen = opt_len;
+		} else {
+			debug("DHCP: client FQDN option doesn't fit, skipping");
+		}
+	}
+
 	if (!c->no_dhcp_dns_search)
 		opt_set_dns_search(c, sizeof(m->o));
 
diff --git a/dhcpv6.c b/dhcpv6.c
index 0523bba..373a988 100644
--- a/dhcpv6.c
+++ b/dhcpv6.c
@@ -48,6 +48,7 @@ struct opt_hdr {
 # define  STATUS_NOTONLINK	htons_constant(4)
 # define OPT_DNS_SERVERS	htons_constant(23)
 # define OPT_DNS_SEARCH		htons_constant(24)
+# define OPT_CLIENT_FQDN	htons_constant(39)
 #define   STR_NOTONLINK		"Prefix not appropriate for link."
 
 	uint16_t l;
@@ -58,6 +59,9 @@ struct opt_hdr {
 					      sizeof(struct opt_hdr))
 #define OPT_VSIZE(x)		(sizeof(struct opt_##x) - 		\
 				 sizeof(struct opt_hdr))
+#define OPT_MAX_SIZE		IPV6_MIN_MTU - (sizeof(struct ipv6hdr) + \
+						sizeof(struct udphdr) + \
+						sizeof(struct msg_hdr))
 
 /**
  * struct opt_client_id - DHCPv6 Client Identifier option
@@ -163,6 +167,18 @@ struct opt_dns_search {
 	char list[MAXDNSRCH * NS_MAXDNAME];
 } __attribute__((packed));
 
+/**
+ * struct opt_client_fqdn - Client FQDN option (RFC 4704)
+ * @hdr:		Option header
+ * @flags:		Flags described by RFC 4704
+ * @domain_name:	Client FQDN
+ */
+struct opt_client_fqdn {
+	struct opt_hdr hdr;
+	uint8_t flags;
+	char domain_name[PASST_MAXDNAME];
+} __attribute__((packed));
+
 /**
  * struct msg_hdr - DHCPv6 client/server message header
  * @type:		DHCP message type
@@ -193,6 +209,7 @@ struct msg_hdr {
  * @client_id:		Client Identifier, variable length
  * @dns_servers:	DNS Recursive Name Server, here just for storage size
  * @dns_search:		Domain Search List, here just for storage size
+ * @client_fqdn:	Client FQDN, variable length
  */
 static struct resp_t {
 	struct msg_hdr hdr;
@@ -203,6 +220,7 @@ static struct resp_t {
 	struct opt_client_id client_id;
 	struct opt_dns_servers dns_servers;
 	struct opt_dns_search dns_search;
+	struct opt_client_fqdn client_fqdn;
 } __attribute__((__packed__)) resp = {
 	{ 0 },
 	SERVER_ID,
@@ -228,6 +246,10 @@ static struct resp_t {
 	{ { OPT_DNS_SEARCH,	0, },
 	  { 0 },
 	},
+
+	{ { OPT_CLIENT_FQDN, 0, },
+	  0, { 0 },
+	},
 };
 
 static const struct opt_status_code sc_not_on_link = {
@@ -346,7 +368,6 @@ static size_t dhcpv6_dns_fill(const struct ctx *c, char *buf, int offset)
 {
 	struct opt_dns_servers *srv = NULL;
 	struct opt_dns_search *srch = NULL;
-	char *p = NULL;
 	int i;
 
 	if (c->no_dhcp_dns)
@@ -383,34 +404,81 @@ search:
 		if (!name_len)
 			continue;
 
+		name_len += 2; /* Length byte for first label, and terminator */
+		if (name_len >
+		    NS_MAXDNAME + 1 /* Length byte for first label */ ||
+		    name_len > 255) {
+			debug("DHCP: DNS search name '%s' too long, skipping",
+			      c->dns_search[i].n);
+			continue;
+		}
+
 		if (!srch) {
 			srch = (struct opt_dns_search *)(buf + offset);
 			offset += sizeof(struct opt_hdr);
 			srch->hdr.t = OPT_DNS_SEARCH;
 			srch->hdr.l = 0;
-			p = srch->list;
 		}
 
-		*p = '.';
-		p = stpncpy(p + 1, c->dns_search[i].n, name_len);
-		p++;
-		srch->hdr.l += name_len + 2;
-		offset += name_len + 2;
+		encode_domain_name(buf + offset, c->dns_search[i].n);
+
+		srch->hdr.l += name_len;
+		offset += name_len;
+
 	}
 
-	if (srch) {
-		for (i = 0; i < srch->hdr.l; i++) {
-			if (srch->list[i] == '.') {
-				srch->list[i] = strcspn(srch->list + i + 1,
-							".");
-			}
-		}
+	if (srch)
 		srch->hdr.l = htons(srch->hdr.l);
-	}
 
 	return offset;
 }
 
+/**
+ * dhcpv6_client_fqdn_fill() - Fill in client FQDN option
+ * @c:		Execution context
+ * @buf:	Response message buffer where options will be appended
+ * @offset:	Offset in message buffer for new options
+ *
+ * Return: updated length of response message buffer.
+ */
+static size_t dhcpv6_client_fqdn_fill(const struct pool *p, const struct ctx *c,
+				      char *buf, int offset)
+
+{
+	struct opt_client_fqdn const *req_opt;
+	struct opt_client_fqdn *o;
+	size_t opt_len;
+
+	opt_len = strlen(c->fqdn);
+	if (opt_len == 0) {
+		return offset;
+	}
+
+	opt_len += 2; /* Length byte for first label, and terminator */
+	if (opt_len > OPT_MAX_SIZE - (offset +
+				      sizeof(struct opt_hdr) +
+				      1 /* flags */ )) {
+		debug("DHCPv6: client FQDN option doesn't fit, skipping");
+		return offset;
+	}
+
+	o = (struct opt_client_fqdn *)(buf + offset);
+	encode_domain_name(o->domain_name, c->fqdn);
+	req_opt = (struct opt_client_fqdn *)dhcpv6_opt(p, &(size_t){ 0 },
+						       OPT_CLIENT_FQDN);
+	if (req_opt && req_opt->flags & 0x01 /* S flag */)
+		o->flags = 0x02 /* O flag */;
+	else
+		o->flags = 0x00;
+
+	opt_len++;
+
+	o->hdr.t = OPT_CLIENT_FQDN;
+	o->hdr.l = htons(opt_len);
+
+	return offset + sizeof(struct opt_hdr) + opt_len;
+}
+
 /**
  * dhcpv6() - Check if this is a DHCPv6 message, reply as needed
  * @c:		Execution context
@@ -544,6 +612,7 @@ int dhcpv6(struct ctx *c, const struct pool *p,
 	n = offsetof(struct resp_t, client_id) +
 	    sizeof(struct opt_hdr) + ntohs(client_id->l);
 	n = dhcpv6_dns_fill(c, (char *)&resp, n);
+	n = dhcpv6_client_fqdn_fill(p, c, (char *)&resp, n);
 
 	resp.hdr.xid = mh->xid;
 
diff --git a/passt.1 b/passt.1
index 29cc3ed..9d347d8 100644
--- a/passt.1
+++ b/passt.1
@@ -401,6 +401,16 @@ Enable IPv6-only operation. IPv4 traffic will be ignored.
 By default, IPv4 operation is enabled as long as at least an IPv4 route and an
 interface address are configured on a given host interface.
 
+.TP
+.BR \-H ", " \-\-hostname " " \fIname
+Hostname to configure the client with.
+Send \fIname\fR as DHCP option 12 (hostname).
+
+.TP
+.BR \-\-fqdn " " \fIname
+FQDN to configure the client with.
+Send \fIname\fR as Client FQDN: DHCP option 81 and DHCPv6 option 39.
+
 .SS \fBpasst\fR-only options
 
 .TP
diff --git a/passt.h b/passt.h
index 0dd4efa..f3151f0 100644
--- a/passt.h
+++ b/passt.h
@@ -209,6 +209,8 @@ struct ip6_ctx {
  * @ifi4:		Template interface for IPv4, -1: none, 0: IPv4 disabled
  * @ip:			IPv4 configuration
  * @dns_search:		DNS search list
+ * @hostname:		Guest hostname
+ * @fqdn:		Guest FQDN
  * @ifi6:		Template interface for IPv6, -1: none, 0: IPv6 disabled
  * @ip6:		IPv6 configuration
  * @pasta_ifn:		Name of namespace interface for pasta
@@ -269,6 +271,9 @@ struct ctx {
 
 	struct fqdn dns_search[MAXDNSRCH];
 
+	char hostname[PASST_MAXDNAME];
+	char fqdn[PASST_MAXDNAME];
+
 	int ifi6;
 	struct ip6_ctx ip6;
 
diff --git a/pasta.c b/pasta.c
index f15084d..585a51c 100644
--- a/pasta.c
+++ b/pasta.c
@@ -169,10 +169,12 @@ void pasta_open_ns(struct ctx *c, const char *netns)
  * struct pasta_spawn_cmd_arg - Argument for pasta_spawn_cmd()
  * @exe:	Executable to run
  * @argv:	Command and arguments to run
+ * @ctx:	Context to read config from
  */
 struct pasta_spawn_cmd_arg {
 	const char *exe;
 	char *const *argv;
+	struct ctx *c;
 };
 
 /**
@@ -186,6 +188,7 @@ static int pasta_spawn_cmd(void *arg)
 {
 	char hostname[HOST_NAME_MAX + 1] = HOSTNAME_PREFIX;
 	const struct pasta_spawn_cmd_arg *a;
+	size_t conf_hostname_len;
 	sigset_t set;
 
 	/* We run in a detached PID and mount namespace: mount /proc over */
@@ -195,9 +198,15 @@ static int pasta_spawn_cmd(void *arg)
 	if (write_file("/proc/sys/net/ipv4/ping_group_range", "0 0"))
 		warn("Cannot set ping_group_range, ICMP requests might fail");
 
-	if (!gethostname(hostname + sizeof(HOSTNAME_PREFIX) - 1,
-			 HOST_NAME_MAX + 1 - sizeof(HOSTNAME_PREFIX)) ||
-	    errno == ENAMETOOLONG) {
+	a = (const struct pasta_spawn_cmd_arg *)arg;
+
+	conf_hostname_len = strlen(a->c->hostname);
+	if (conf_hostname_len > 0) {
+		if (sethostname(a->c->hostname, conf_hostname_len))
+			warn("Unable to set configured hostname");
+	} else if (!gethostname(hostname + sizeof(HOSTNAME_PREFIX) - 1,
+				HOST_NAME_MAX + 1 - sizeof(HOSTNAME_PREFIX)) ||
+		   errno == ENAMETOOLONG) {
 		hostname[HOST_NAME_MAX] = '\0';
 		if (sethostname(hostname, strlen(hostname)))
 			warn("Unable to set pasta-prefixed hostname");
@@ -208,7 +217,6 @@ static int pasta_spawn_cmd(void *arg)
 	sigaddset(&set, SIGUSR1);
 	sigwaitinfo(&set, NULL);
 
-	a = (const struct pasta_spawn_cmd_arg *)arg;
 	execvp(a->exe, a->argv);
 
 	die_perror("Failed to start command or shell");
@@ -230,6 +238,7 @@ void pasta_start_ns(struct ctx *c, uid_t uid, gid_t gid,
 	struct pasta_spawn_cmd_arg arg = {
 		.exe = argv[0],
 		.argv = argv,
+		.c = c,
 	};
 	char uidmap[BUFSIZ], gidmap[BUFSIZ];
 	char *sh_argv[] = { NULL, NULL };
diff --git a/test/lib/setup b/test/lib/setup
index 580825f..ee67152 100755
--- a/test/lib/setup
+++ b/test/lib/setup
@@ -49,7 +49,7 @@ setup_passt() {
 
 	context_run passt "make clean"
 	context_run passt "make valgrind"
-	context_run_bg passt "valgrind --max-stackframe=$((4 * 1024 * 1024)) --trace-children=yes --vgdb=no --error-exitcode=1 --suppressions=test/valgrind.supp ./passt ${__opts} -s ${STATESETUP}/passt.socket -f -t 10001 -u 10001 -P ${STATESETUP}/passt.pid"
+	context_run_bg passt "valgrind --max-stackframe=$((4 * 1024 * 1024)) --trace-children=yes --vgdb=no --error-exitcode=1 --suppressions=test/valgrind.supp ./passt ${__opts} -s ${STATESETUP}/passt.socket -f -t 10001 -u 10001 -H hostname1 --fqdn fqdn1.passt.test -P ${STATESETUP}/passt.pid"
 
 	# pidfile isn't created until passt is listening
 	wait_for [ -f "${STATESETUP}/passt.pid" ]
@@ -160,11 +160,11 @@ setup_passt_in_ns() {
 	if [ ${VALGRIND} -eq 1 ]; then
 		context_run passt "make clean"
 		context_run passt "make valgrind"
-		context_run_bg passt "valgrind --max-stackframe=$((4 * 1024 * 1024)) --trace-children=yes --vgdb=no --error-exitcode=1 --suppressions=test/valgrind.supp ./passt -f ${__opts} -s ${STATESETUP}/passt.socket -t 10001,10011,10021,10031 -u 10001,10011,10021,10031 -P ${STATESETUP}/passt.pid --map-host-loopback ${__map_ns4} --map-host-loopback ${__map_ns6}"
+		context_run_bg passt "valgrind --max-stackframe=$((4 * 1024 * 1024)) --trace-children=yes --vgdb=no --error-exitcode=1 --suppressions=test/valgrind.supp ./passt -f ${__opts} -s ${STATESETUP}/passt.socket -H hostname1 --fqdn fqdn1.passt.test -t 10001,10011,10021,10031 -u 10001,10011,10021,10031 -P ${STATESETUP}/passt.pid --map-host-loopback ${__map_ns4} --map-host-loopback ${__map_ns6}"
 	else
 		context_run passt "make clean"
 		context_run passt "make"
-		context_run_bg passt "./passt -f ${__opts} -s ${STATESETUP}/passt.socket -t 10001,10011,10021,10031 -u 10001,10011,10021,10031 -P ${STATESETUP}/passt.pid --map-host-loopback ${__map_ns4} --map-host-loopback ${__map_ns6}"
+		context_run_bg passt "./passt -f ${__opts} -s ${STATESETUP}/passt.socket -H hostname1 --fqdn fqdn1.passt.test -t 10001,10011,10021,10031 -u 10001,10011,10021,10031 -P ${STATESETUP}/passt.pid --map-host-loopback ${__map_ns4} --map-host-loopback ${__map_ns6}"
 	fi
 	wait_for [ -f "${STATESETUP}/passt.pid" ]
 
@@ -243,7 +243,7 @@ setup_two_guests() {
 	[ ${TRACE} -eq 1 ] && __opts="${__opts} --trace"
 	[ ${VHOST_USER} -eq 1 ] && __opts="${__opts} --vhost-user"
 
-	context_run_bg passt_1 "./passt -s ${STATESETUP}/passt_1.socket -P ${STATESETUP}/passt_1.pid -f ${__opts} -t 10001 -u 10001"
+	context_run_bg passt_1 "./passt -s ${STATESETUP}/passt_1.socket -P ${STATESETUP}/passt_1.pid -f ${__opts} --fqdn fqdn1.passt.test -H hostname1 -t 10001 -u 10001"
 	wait_for [ -f "${STATESETUP}/passt_1.pid" ]
 
 	__opts=
@@ -252,7 +252,7 @@ setup_two_guests() {
 	[ ${TRACE} -eq 1 ] && __opts="${__opts} --trace"
 	[ ${VHOST_USER} -eq 1 ] && __opts="${__opts} --vhost-user"
 
-	context_run_bg passt_2 "./passt -s ${STATESETUP}/passt_2.socket -P ${STATESETUP}/passt_2.pid -f ${__opts} -t 10004 -u 10004"
+	context_run_bg passt_2 "./passt -s ${STATESETUP}/passt_2.socket -P ${STATESETUP}/passt_2.pid -f ${__opts} --hostname hostname2 --fqdn fqdn2 -t 10004 -u 10004"
 	wait_for [ -f "${STATESETUP}/passt_2.pid" ]
 
 	__vmem="$((${MEM_KIB} / 1024 / 4))"
diff --git a/test/passt.mbuto b/test/passt.mbuto
index d4d57cb..e45a284 100755
--- a/test/passt.mbuto
+++ b/test/passt.mbuto
@@ -13,7 +13,7 @@
 PROGS="${PROGS:-ash,dash,bash ip mount ls insmod mkdir ln cat chmod lsmod
        modprobe find grep mknod mv rm umount jq iperf3 dhclient hostname
        sed tr chown sipcalc cut socat dd strace ping tail killall sleep sysctl
-       nproc tcp_rr tcp_crr udp_rr which tee seq bc sshd ssh-keygen cmp tcpdump}"
+       nproc tcp_rr tcp_crr udp_rr which tee seq bc sshd ssh-keygen cmp tcpdump env}"
 
 # OpenSSH 9.8 introduced split binaries, with sshd being the daemon, and
 # sshd-session the per-session program. We need the latter as well, and the path
@@ -41,6 +41,7 @@ FIXUP="${FIXUP}"'
 #!/bin/sh
 LOG=/var/log/dhclient-script.log
 echo \${reason} \${interface} >> \$LOG
+env >> \$LOG
 set >> \$LOG
 
 [ -n "\${new_interface_mtu}" ]       && ip link set dev \${interface} mtu \${new_interface_mtu}
@@ -54,7 +55,8 @@ set >> \$LOG
 [ -n "\${new_ip6_address}" ]         && ip addr add \${new_ip6_address}/\${new_ip6_prefixlen} dev \${interface}
 [ -n "\${new_dhcp6_name_servers}" ]  && for d in \${new_dhcp6_name_servers}; do echo "nameserver \${d}%\${interface}" >> /etc/resolv.conf; done
 [ -n "\${new_dhcp6_domain_search}" ] && (printf "search"; for d in \${new_dhcp6_domain_search}; do printf " %s" "\${d}"; done; printf "\n") >> /etc/resolv.conf
-[ -n "\${new_host_name}" ]           && hostname "\${new_host_name}"
+[ -n "\${new_host_name}" ]           && echo "\${new_host_name}" > /tmp/new_host_name
+[ -n "\${new_fqdn_fqdn}" ]           && echo "\${new_fqdn_fqdn}" > /tmp/new_fqdn_fqdn
 exit 0
 EOF
 	chmod 755 /sbin/dhclient-script
diff --git a/test/passt/dhcp b/test/passt/dhcp
index 9925ab9..145f1ba 100644
--- a/test/passt/dhcp
+++ b/test/passt/dhcp
@@ -11,7 +11,7 @@
 # Copyright (c) 2021 Red Hat GmbH
 # Author: Stefano Brivio <sbrivio@redhat.com>
 
-gtools	ip jq dhclient sed tr
+gtools	ip jq dhclient sed tr hostname
 htools	ip jq sed tr head
 
 test	Interface name
@@ -47,7 +47,16 @@ gout	SEARCH sed 's/\. / /g' /etc/resolv.conf | sed 's/\.$//g' | sed -n 's/^searc
 hout	HOST_SEARCH sed 's/\. / /g' /etc/resolv.conf | sed 's/\.$//g' | sed -n 's/^search \(.*\)/\1/p' | tr ' \n' ',' | sed 's/,$//;s/$/\n/'
 check	[ "__SEARCH__" = "__HOST_SEARCH__" ]
 
+test	DHCP: Hostname
+gout	NEW_HOST_NAME cat /tmp/new_host_name
+check	[ "__NEW_HOST_NAME__" = "hostname1" ]
+
+test	DHCP: Client FQDN
+gout	NEW_FQDN_FQDN cat /tmp/new_fqdn_fqdn
+check	[ "__NEW_FQDN_FQDN__" = "fqdn1.passt.test" ]
+
 test	DHCPv6: address
+guest	rm /tmp/new_fqdn_fqdn
 guest	/sbin/dhclient -6 __IFNAME__
 # Wait for DAD to complete
 guest	while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done
@@ -70,3 +79,7 @@ test	DHCPv6: search list
 gout	SEARCH6 sed 's/\. / /g' /etc/resolv.conf | sed 's/\.$//g' | sed -n 's/^search \(.*\)/\1/p' | tr ' \n' ',' | sed 's/,$//;s/$/\n/'
 hout	HOST_SEARCH6 sed 's/\. / /g' /etc/resolv.conf | sed 's/\.$//g' | sed -n 's/^search \(.*\)/\1/p' | tr ' \n' ',' | sed 's/,$//;s/$/\n/'
 check	[ "__SEARCH6__" = "__HOST_SEARCH6__" ]
+
+test	DHCPv6: Hostname
+gout	NEW_FQDN_FQDN cat /tmp/new_fqdn_fqdn
+check	[ "__NEW_FQDN_FQDN__" = "fqdn1.passt.test" ]
diff --git a/util.c b/util.c
index 4d51e04..ba33866 100644
--- a/util.c
+++ b/util.c
@@ -930,4 +930,28 @@ void raw_random(void *buf, size_t buflen)
 void epoll_del(const struct ctx *c, int fd)
 {
 	epoll_ctl(c->epollfd, EPOLL_CTL_DEL, fd, NULL);
+
+}
+
+/**
+ * encode_domain_name() - Encode domain name according to RFC 1035, section 3.1
+ * @buf:		Buffer to fill in with encoded domain name
+ * @domain_name:	Input domain name string with terminator
+ *
+ * The buffer's 'buf' size has to be >= strlen(domain_name) + 2
+ */
+void encode_domain_name(char *buf, const char *domain_name)
+{
+	size_t i;
+	char *p;
+
+	buf[0] = strcspn(domain_name, ".");
+	p = buf + 1;
+	for (i = 0; domain_name[i]; i++) {
+		if (domain_name[i] == '.')
+			p[i] = strcspn(domain_name + i + 1, ".");
+		else
+			p[i] = domain_name[i];
+	}
+	p[i] = 0L;
 }
diff --git a/util.h b/util.h
index 23b165c..9c92a37 100644
--- a/util.h
+++ b/util.h
@@ -40,6 +40,9 @@
 #ifndef IP_MAX_MTU
 #define IP_MAX_MTU			USHRT_MAX
 #endif
+#ifndef IPV6_MIN_MTU
+#define IPV6_MIN_MTU			1280
+#endif
 
 #ifndef MIN
 #define MIN(x, y)		(((x) < (y)) ? (x) : (y))
@@ -352,4 +355,7 @@ static inline int wrap_accept4(int sockfd, struct sockaddr *addr,
 #define accept4(s, addr, addrlen, flags) \
 	wrap_accept4((s), (addr), (addrlen), (flags))
 
+#define PASST_MAXDNAME 254 /* 253 (RFC 1035) + 1 (the terminator) */
+void encode_domain_name(char *buf, const char *domain_name);
+
 #endif /* UTIL_H */

From 472e2e930f6e17d9d8664d6cf44c47af1db58bb3 Mon Sep 17 00:00:00 2001
From: Stefano Brivio <sbrivio@redhat.com>
Date: Tue, 11 Feb 2025 20:11:00 +0100
Subject: [PATCH 219/382] tcp: Don't discard window information on keep-alive
 segments

It looks like a detail, but it's critical if we're dealing with
somebody, such as our near-future selves, using TCP_REPAIR to migrate
TCP connections in the guest or container.

The last packet sent from the 'source' process/guest/container
typically reports a small window, or zero, because the guest/container
hadn't been draining it for a while.

The next packet, appearing as the target sets TCP_REPAIR_OFF on the
migrated socket, is a keep-alive (also called "window probe" in CRIU
or TCP_REPAIR-related code), and it comes with an updated window
value, reflecting the pre-migration "regular" value.

If we ignore it, it might take a while/forever before we realise we
can actually restart sending.

Fixes: 238c69f9af45 ("tcp: Acknowledge keep-alive segments, ignore them for the rest")
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
---
 tcp.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/tcp.c b/tcp.c
index af6bd95..2addf4a 100644
--- a/tcp.c
+++ b/tcp.c
@@ -1664,8 +1664,10 @@ static int tcp_data_from_tap(const struct ctx *c, struct tcp_tap_conn *conn,
 			tcp_send_flag(c, conn, ACK);
 			tcp_timer_ctl(c, conn);
 
-			if (p->count == 1)
+			if (p->count == 1) {
+				tcp_tap_window_update(conn, ntohs(th->window));
 				return 1;
+			}
 
 			continue;
 		}

From 90f91fe72673e36c8e071a1750e9c03deb20ab0f Mon Sep 17 00:00:00 2001
From: Stefano Brivio <sbrivio@redhat.com>
Date: Tue, 11 Feb 2025 20:19:05 +0100
Subject: [PATCH 220/382] tcp: Implement conservative zero-window probe on ACK
 timeout

This probably doesn't cover all the cases where we should send a
zero-window probe, but it's rather unobtrusive and obvious, so start
from here, also because I just observed this case (without the fix
from the previous patch, to take into account window information from
keep-alive segments).

If we hit the ACK timeout and try re-sending data from the socket
while the window is zero, we'll just fail again, go back to the timer,
and so on, until we hit the maximum number of retransmissions and
reset the connection.

Don't do that: forcibly try to send something by implementing the
equivalent of a zero-window probe in this case.

Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
---
 tcp.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tcp.c b/tcp.c
index 2addf4a..b87478f 100644
--- a/tcp.c
+++ b/tcp.c
@@ -2175,6 +2175,8 @@ void tcp_timer_handler(const struct ctx *c, union epoll_ref ref)
 			flow_dbg(conn, "ACK timeout, retry");
 			conn->retrans++;
 			conn->seq_to_tap = conn->seq_ack_from_tap;
+			if (!conn->wnd_from_tap)
+				conn->wnd_from_tap = 1; /* Zero-window probe */
 			if (tcp_set_peek_offset(conn->sock, 0)) {
 				tcp_rst(c, conn);
 			} else {

From def7de4690ddb40f7c3b29e6ca81d30e9409fb5d Mon Sep 17 00:00:00 2001
From: Laurent Vivier <lvivier@redhat.com>
Date: Tue, 11 Feb 2025 20:43:32 +0100
Subject: [PATCH 221/382] tcp_vu: Fix off-by one in header count array
 adjustment

head_cnt represents the number of frames we're going to forward to the
guest in tcp_vu_sock_recv(), each of which could require multiple
buffers ("elements").  We initialise it with as many frames as we can
find space for in vu buffers, and we then need to adjust it down to
the number of frames we actually (partially) filled.

We adjust it down based on the number of individual buffers used by the
data from recvmsg().  At this point 'i' is *one greater than* that
number of buffers, so we need to discard all (unused) frames with a
buffer index >= i, instead of > i.

Reported-by: David Gibson <david@gibson.dropbear.id.au>
[david: Contributed actual commit message]
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 tcp_vu.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tcp_vu.c b/tcp_vu.c
index fad7065..0622f17 100644
--- a/tcp_vu.c
+++ b/tcp_vu.c
@@ -261,7 +261,7 @@ static ssize_t tcp_vu_sock_recv(const struct ctx *c,
 		len -= iov->iov_len;
 	}
 	/* adjust head count */
-	while (head_cnt > 0 && head[head_cnt - 1] > i)
+	while (head_cnt > 0 && head[head_cnt - 1] >= i)
 		head_cnt--;
 	/* mark end of array */
 	head[head_cnt] = i;

From 836fe215e049ee423750d3315a02742d8224eab2 Mon Sep 17 00:00:00 2001
From: Stefano Brivio <sbrivio@redhat.com>
Date: Wed, 12 Feb 2025 01:07:33 +0100
Subject: [PATCH 222/382] passt-repair: Fix off-by-one in check for number of
 file descriptors

Actually, 254 is too many, but 253 isn't: SCM_RIGHTS transfers are
capped at SCM_MAX_FD (253) descriptors per message, and the loop
matching the received control message length needs to cover that
maximum too, hence <= instead of <.

Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 passt-repair.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/passt-repair.c b/passt-repair.c
index 614cee0..1174ae3 100644
--- a/passt-repair.c
+++ b/passt-repair.c
@@ -131,7 +131,7 @@ loop:
 	/* No inverse formula for CMSG_LEN(x), and building one with CMSG_LEN(0)
 	 * works but there's no guarantee it does. Search the whole domain.
 	 */
-	for (i = 1; i < SCM_MAX_FD; i++) {
+	for (i = 1; i <= SCM_MAX_FD; i++) {
 		if (CMSG_LEN(sizeof(int) * i) == cmsg->cmsg_len) {
 			n = i;
 			break;

From 5911e08c0f53e46547e7eeb1dd824c8ab96e512e Mon Sep 17 00:00:00 2001
From: Stefano Brivio <sbrivio@redhat.com>
Date: Wed, 12 Feb 2025 18:07:13 +1100
Subject: [PATCH 223/382] migrate: Skeleton of live migration logic

Introduce facilities for guest migration on top of the current
vhost-user infrastructure, moving vu_migrate() and related functions
to migrate.c.

Versioned migration stages define function pointers to be called on
source or target, or data sections that need to be transferred.

The migration header consists of a magic number, a version number for the
encoding, and a "compat_version" representing the oldest version that is
compatible with the current one.  We don't use it yet, but it allows for
backwards-compatible protocol extensions in the future.

Co-authored-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
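
As an illustration of how the skeleton is meant to be extended (a
hypothetical stage, not part of this change; 'some_counter' is an
invented field): a stage is a named pair of source/target callbacks
operating on the transfer descriptor, appended to the NULL-terminated
stages_v1[] array in migrate.c.

/* Hypothetical v1 stage transferring a single 32-bit value */
static int counter_source_v1(struct ctx *c, const struct migrate_stage *stage,
			     int fd)
{
	uint32_t count = htonl(c->some_counter);	/* invented field */

	(void)stage;

	if (write_all_buf(fd, &count, sizeof(count)))
		return errno;

	return 0;
}

static int counter_target_v1(struct ctx *c, const struct migrate_stage *stage,
			     int fd)
{
	uint32_t count;

	(void)stage;

	if (read_all_buf(fd, &count, sizeof(count)))
		return errno;

	c->some_counter = ntohl(count);			/* invented field */

	return 0;
}

static const struct migrate_stage stages_v1[] = {
	{
		.name	= "example counter",
		.source	= counter_source_v1,
		.target	= counter_target_v1,
	},
	{ 0 },
};
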
---
 Makefile     |  12 +--
 epoll_type.h |   2 -
 migrate.c    | 214 +++++++++++++++++++++++++++++++++++++++++++++++++++
 migrate.h    |  51 ++++++++++++
 passt.c      |   8 +-
 passt.h      |   8 ++
 util.h       |  29 +++++++
 vhost_user.c |  60 +++------------
 virtio.h     |   4 -
 vu_common.c  |  49 +-----------
 vu_common.h  |   2 +-
 11 files changed, 324 insertions(+), 115 deletions(-)
 create mode 100644 migrate.c
 create mode 100644 migrate.h

diff --git a/Makefile b/Makefile
index d3d4b78..be89b07 100644
--- a/Makefile
+++ b/Makefile
@@ -38,8 +38,8 @@ FLAGS += -DDUAL_STACK_SOCKETS=$(DUAL_STACK_SOCKETS)
 
 PASST_SRCS = arch.c arp.c checksum.c conf.c dhcp.c dhcpv6.c flow.c fwd.c \
 	icmp.c igmp.c inany.c iov.c ip.c isolation.c lineread.c log.c mld.c \
-	ndp.c netlink.c packet.c passt.c pasta.c pcap.c pif.c tap.c tcp.c \
-	tcp_buf.c tcp_splice.c tcp_vu.c udp.c udp_flow.c udp_vu.c util.c \
+	ndp.c netlink.c migrate.c packet.c passt.c pasta.c pcap.c pif.c tap.c \
+	tcp.c tcp_buf.c tcp_splice.c tcp_vu.c udp.c udp_flow.c udp_vu.c util.c \
 	vhost_user.c virtio.c vu_common.c
 QRAP_SRCS = qrap.c
 PASST_REPAIR_SRCS = passt-repair.c
@@ -49,10 +49,10 @@ MANPAGES = passt.1 pasta.1 qrap.1 passt-repair.1
 
 PASST_HEADERS = arch.h arp.h checksum.h conf.h dhcp.h dhcpv6.h flow.h fwd.h \
 	flow_table.h icmp.h icmp_flow.h inany.h iov.h ip.h isolation.h \
-	lineread.h log.h ndp.h netlink.h packet.h passt.h pasta.h pcap.h pif.h \
-	siphash.h tap.h tcp.h tcp_buf.h tcp_conn.h tcp_internal.h tcp_splice.h \
-	tcp_vu.h udp.h udp_flow.h udp_internal.h udp_vu.h util.h vhost_user.h \
-	virtio.h vu_common.h
+	lineread.h log.h migrate.h ndp.h netlink.h packet.h passt.h pasta.h \
+	pcap.h pif.h siphash.h tap.h tcp.h tcp_buf.h tcp_conn.h tcp_internal.h \
+	tcp_splice.h tcp_vu.h udp.h udp_flow.h udp_internal.h udp_vu.h util.h \
+	vhost_user.h virtio.h vu_common.h
 HEADERS = $(PASST_HEADERS) seccomp.h
 
 C := \#include <sys/random.h>\nint main(){int a=getrandom(0, 0, 0);}
diff --git a/epoll_type.h b/epoll_type.h
index fd9eac3..f3ef415 100644
--- a/epoll_type.h
+++ b/epoll_type.h
@@ -40,8 +40,6 @@ enum epoll_type {
 	EPOLL_TYPE_VHOST_CMD,
 	/* vhost-user kick event socket */
 	EPOLL_TYPE_VHOST_KICK,
-	/* vhost-user migration socket */
-	EPOLL_TYPE_VHOST_MIGRATION,
 
 	EPOLL_NUM_TYPES,
 };
diff --git a/migrate.c b/migrate.c
new file mode 100644
index 0000000..aeac872
--- /dev/null
+++ b/migrate.c
@@ -0,0 +1,214 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+/* PASST - Plug A Simple Socket Transport
+ *  for qemu/UNIX domain socket mode
+ *
+ * PASTA - Pack A Subtle Tap Abstraction
+ *  for network namespace/tap device mode
+ *
+ * migrate.c - Migration sections, layout, and routines
+ *
+ * Copyright (c) 2025 Red Hat GmbH
+ * Author: Stefano Brivio <sbrivio@redhat.com>
+ */
+
+#include <errno.h>
+#include <sys/uio.h>
+
+#include "util.h"
+#include "ip.h"
+#include "passt.h"
+#include "inany.h"
+#include "flow.h"
+#include "flow_table.h"
+
+#include "migrate.h"
+
+/* Magic identifier for migration data */
+#define MIGRATE_MAGIC		0xB1BB1D1B0BB1D1B0
+
+/* Stages for version 1 */
+static const struct migrate_stage stages_v1[] = {
+	{ 0 },
+};
+
+/* Supported encoding versions, from latest (most preferred) to oldest */
+static const struct migrate_version versions[] = {
+	{ 1,	stages_v1, },
+	{ 0 },
+};
+
+/* Current encoding version */
+#define CURRENT_VERSION		(&versions[0])
+
+/**
+ * migrate_source() - Migration as source, send state to hypervisor
+ * @c:		Execution context
+ * @fd:		File descriptor for state transfer
+ *
+ * Return: 0 on success, positive error code on failure
+ */
+static int migrate_source(struct ctx *c, int fd)
+{
+	const struct migrate_version *v = CURRENT_VERSION;
+	const struct migrate_header header = {
+		.magic		= htonll_constant(MIGRATE_MAGIC),
+		.version	= htonl(v->id),
+		.compat_version	= htonl(v->id),
+	};
+	const struct migrate_stage *s;
+	int ret;
+
+	if (write_all_buf(fd, &header, sizeof(header))) {
+		ret = errno;
+		err("Can't send migration header: %s, abort", strerror_(ret));
+		return ret;
+	}
+
+	for (s = v->s; s->name; s++) {
+		if (!s->source)
+			continue;
+
+		debug("Source side migration stage: %s", s->name);
+
+		if ((ret = s->source(c, s, fd))) {
+			err("Source migration stage: %s: %s, abort", s->name,
+			    strerror_(ret));
+			return ret;
+		}
+	}
+
+	return 0;
+}
+
+/**
+ * migrate_target_read_header() - Read header in target
+ * @fd:		Descriptor for state transfer
+ *
+ * Return: version structure on success, NULL on failure with errno set
+ */
+static const struct migrate_version *migrate_target_read_header(int fd)
+{
+	const struct migrate_version *v;
+	struct migrate_header h;
+	uint32_t id, compat_id;
+
+	if (read_all_buf(fd, &h, sizeof(h)))
+		return NULL;
+
+	id = ntohl(h.version);
+	compat_id = ntohl(h.compat_version);
+
+	debug("Source magic: 0x%016" PRIx64 ", version: %u, compat: %u",
+	      ntohll(h.magic), id, compat_id);
+
+	if (ntohll(h.magic) != MIGRATE_MAGIC || !id || !compat_id) {
+		err("Invalid incoming device state");
+		errno = EINVAL;
+		return NULL;
+	}
+
+	for (v = versions; v->id; v++)
+		if (v->id <= id && v->id >= compat_id)
+			return v;
+
+	errno = ENOTSUP;
+	err("Unsupported device state version: %u", id);
+	return NULL;
+}
+
+/**
+ * migrate_target() - Migration as target, receive state from hypervisor
+ * @c:		Execution context
+ * @fd:		File descriptor for state transfer
+ *
+ * Return: 0 on success, positive error code on failure
+ */
+static int migrate_target(struct ctx *c, int fd)
+{
+	const struct migrate_version *v;
+	const struct migrate_stage *s;
+	int ret;
+
+	if (!(v = migrate_target_read_header(fd)))
+		return errno;
+
+	for (s = v->s; s->name; s++) {
+		if (!s->target)
+			continue;
+
+		debug("Target side migration stage: %s", s->name);
+
+		if ((ret = s->target(c, s, fd))) {
+			err("Target migration stage: %s: %s, abort", s->name,
+			    strerror_(ret));
+			return ret;
+		}
+	}
+
+	return 0;
+}
+
+/**
+ * migrate_init() - Set up things necessary for migration
+ * @c:		Execution context
+ */
+void migrate_init(struct ctx *c)
+{
+	c->device_state_result = -1;
+}
+
+/**
+ * migrate_close() - Close migration channel
+ * @c:		Execution context
+ */
+void migrate_close(struct ctx *c)
+{
+	if (c->device_state_fd != -1) {
+		debug("Closing migration channel, fd: %d", c->device_state_fd);
+		close(c->device_state_fd);
+		c->device_state_fd = -1;
+		c->device_state_result = -1;
+	}
+}
+
+/**
+ * migrate_request() - Request a migration of device state
+ * @c:		Execution context
+ * @fd:		fd to transfer state
+ * @target:	Are we the target of the migration?
+ */
+void migrate_request(struct ctx *c, int fd, bool target)
+{
+	debug("Migration requested, fd: %d (was %d)", fd, c->device_state_fd);
+
+	if (c->device_state_fd != -1)
+		migrate_close(c);
+
+	c->device_state_fd = fd;
+	c->migrate_target = target;
+}
+
+/**
+ * migrate_handler() - Send/receive passt internal state to/from hypervisor
+ * @c:		Execution context
+ */
+void migrate_handler(struct ctx *c)
+{
+	int rc;
+
+	if (c->device_state_fd < 0)
+		return;
+
+	debug("Handling migration request from fd: %d, target: %d",
+	      c->device_state_fd, c->migrate_target);
+
+	if (c->migrate_target)
+		rc = migrate_target(c, c->device_state_fd);
+	else
+		rc = migrate_source(c, c->device_state_fd);
+
+	migrate_close(c);
+
+	c->device_state_result = rc;
+}
diff --git a/migrate.h b/migrate.h
new file mode 100644
index 0000000..2c51cd9
--- /dev/null
+++ b/migrate.h
@@ -0,0 +1,51 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later
+ * Copyright (c) 2025 Red Hat GmbH
+ * Author: Stefano Brivio <sbrivio@redhat.com>
+ */
+
+#ifndef MIGRATE_H
+#define MIGRATE_H
+
+/**
+ * struct migrate_header - Migration header from source
+ * @magic:		0xB1BB1D1B0BB1D1B0, network order
+ * @version:		Highest known, target aborts if too old, network order
+ * @compat_version:	Lowest version compatible with @version, target aborts
+ *			if too new, network order
+ */
+struct migrate_header {
+	uint64_t magic;
+	uint32_t version;
+	uint32_t compat_version;
+} __attribute__((packed));
+
+/**
+ * struct migrate_stage - Callbacks and parameters for one stage of migration
+ * @name:	Stage name (for debugging)
+ * @source:	Callback to implement this stage on the source
+ * @target:	Callback to implement this stage on the target
+ */
+struct migrate_stage {
+	const char *name;
+	int (*source)(struct ctx *c, const struct migrate_stage *stage, int fd);
+	int (*target)(struct ctx *c, const struct migrate_stage *stage, int fd);
+
+	/* Add here separate rollback callbacks if needed */
+};
+
+/**
+ * struct migrate_version - Stages for a particular protocol version
+ * @id:		Version number, host order
+ * @s:		Ordered array of stages, NULL-terminated
+ */
+struct migrate_version {
+	uint32_t id;
+	const struct migrate_stage *s;
+};
+
+void migrate_init(struct ctx *c);
+void migrate_close(struct ctx *c);
+void migrate_request(struct ctx *c, int fd, bool target);
+void migrate_handler(struct ctx *c);
+
+#endif /* MIGRATE_H */
diff --git a/passt.c b/passt.c
index 53fdd38..935a69f 100644
--- a/passt.c
+++ b/passt.c
@@ -51,6 +51,7 @@
 #include "tcp_splice.h"
 #include "ndp.h"
 #include "vu_common.h"
+#include "migrate.h"
 
 #define EPOLL_EVENTS		8
 
@@ -75,7 +76,6 @@ char *epoll_type_str[] = {
 	[EPOLL_TYPE_TAP_LISTEN]		= "listening qemu socket",
 	[EPOLL_TYPE_VHOST_CMD]		= "vhost-user command socket",
 	[EPOLL_TYPE_VHOST_KICK]		= "vhost-user kick socket",
-	[EPOLL_TYPE_VHOST_MIGRATION]	= "vhost-user migration socket",
 };
 static_assert(ARRAY_SIZE(epoll_type_str) == EPOLL_NUM_TYPES,
 	      "epoll_type_str[] doesn't match enum epoll_type");
@@ -202,6 +202,7 @@ int main(int argc, char **argv)
 	isolate_initial(argc, argv);
 
 	c.pasta_netns_fd = c.fd_tap = c.pidfile_fd = -1;
+	c.device_state_fd = -1;
 
 	sigemptyset(&sa.sa_mask);
 	sa.sa_flags = 0;
@@ -357,9 +358,6 @@ loop:
 		case EPOLL_TYPE_VHOST_KICK:
 			vu_kick_cb(c.vdev, ref, &now);
 			break;
-		case EPOLL_TYPE_VHOST_MIGRATION:
-			vu_migrate(c.vdev, eventmask);
-			break;
 		default:
 			/* Can't happen */
 			ASSERT(0);
@@ -368,5 +366,7 @@ loop:
 
 	post_handler(&c, &now);
 
+	migrate_handler(&c);
+
 	goto loop;
 }
diff --git a/passt.h b/passt.h
index f3151f0..5fdea52 100644
--- a/passt.h
+++ b/passt.h
@@ -237,6 +237,9 @@ struct ip6_ctx {
  * @low_wmem:		Low probed net.core.wmem_max
  * @low_rmem:		Low probed net.core.rmem_max
  * @vdev:		vhost-user device
+ * @device_state_fd:	Device state migration channel
+ * @device_state_result: Device state migration result
+ * @migrate_target:	Are we the target, on the next migration request?
  */
 struct ctx {
 	enum passt_modes mode;
@@ -305,6 +308,11 @@ struct ctx {
 	int low_rmem;
 
 	struct vu_dev *vdev;
+
+	/* Migration */
+	int device_state_fd;
+	int device_state_result;
+	bool migrate_target;
 };
 
 void proto_update_l2_buf(const unsigned char *eth_d,
diff --git a/util.h b/util.h
index 9c92a37..7df7767 100644
--- a/util.h
+++ b/util.h
@@ -125,14 +125,43 @@
 	 (((x) & 0x0000ff00) <<  8) | (((x) & 0x000000ff) << 24))
 #endif
 
+#ifndef __bswap_constant_32
+#define __bswap_constant_32(x)						\
+	((((x) & 0xff000000) >> 24) | (((x) & 0x00ff0000) >>  8) |	\
+	 (((x) & 0x0000ff00) <<  8) | (((x) & 0x000000ff) << 24))
+#endif
+
+#ifndef __bswap_constant_64
+#define __bswap_constant_64(x) \
+	((((x) & 0xff00000000000000ULL) >> 56) |			\
+	 (((x) & 0x00ff000000000000ULL) >> 40) |			\
+	 (((x) & 0x0000ff0000000000ULL) >> 24) |			\
+	 (((x) & 0x000000ff00000000ULL) >> 8)  |			\
+	 (((x) & 0x00000000ff000000ULL) << 8)  |			\
+	 (((x) & 0x0000000000ff0000ULL) << 24) |			\
+	 (((x) & 0x000000000000ff00ULL) << 40) |			\
+	 (((x) & 0x00000000000000ffULL) << 56))
+#endif
+
 #if __BYTE_ORDER == __BIG_ENDIAN
 #define	htons_constant(x)	(x)
 #define	htonl_constant(x)	(x)
+#define htonll_constant(x)	(x)
+#define	ntohs_constant(x)	(x)
+#define	ntohl_constant(x)	(x)
+#define ntohll_constant(x)	(x)
 #else
 #define	htons_constant(x)	(__bswap_constant_16(x))
 #define	htonl_constant(x)	(__bswap_constant_32(x))
+#define	htonll_constant(x)	(__bswap_constant_64(x))
+#define	ntohs_constant(x)	(__bswap_constant_16(x))
+#define	ntohl_constant(x)	(__bswap_constant_32(x))
+#define	ntohll_constant(x)	(__bswap_constant_64(x))
 #endif
 
+#define ntohll(x)		(be64toh((x)))
+#define htonll(x)		(htobe64((x)))
+
 /**
  * ntohl_unaligned() - Read 32-bit BE value from a possibly unaligned address
  * @p:		Pointer to the BE value in memory
diff --git a/vhost_user.c b/vhost_user.c
index 159f0b3..256c8ab 100644
--- a/vhost_user.c
+++ b/vhost_user.c
@@ -44,6 +44,7 @@
 #include "tap.h"
 #include "vhost_user.h"
 #include "pcap.h"
+#include "migrate.h"
 
 /* vhost-user version we are compatible with */
 #define VHOST_USER_VERSION 1
@@ -997,36 +998,6 @@ static bool vu_send_rarp_exec(struct vu_dev *vdev,
 	return false;
 }
 
-/**
- * vu_set_migration_watch() - Add the migration file descriptor to epoll
- * @vdev:	vhost-user device
- * @fd:		File descriptor to add
- * @direction:	Direction of the migration (save or load backend state)
- */
-static void vu_set_migration_watch(const struct vu_dev *vdev, int fd,
-				   uint32_t direction)
-{
-	union epoll_ref ref = {
-		.type = EPOLL_TYPE_VHOST_MIGRATION,
-		.fd = fd,
-	 };
-	struct epoll_event ev = { 0 };
-
-	ev.data.u64 = ref.u64;
-	switch (direction) {
-	case VHOST_USER_TRANSFER_STATE_DIRECTION_SAVE:
-		ev.events = EPOLLOUT;
-		break;
-	case VHOST_USER_TRANSFER_STATE_DIRECTION_LOAD:
-		ev.events = EPOLLIN;
-		break;
-	default:
-		ASSERT(0);
-	}
-
-	epoll_ctl(vdev->context->epollfd, EPOLL_CTL_ADD, ref.fd, &ev);
-}
-
 /**
  * vu_set_device_state_fd_exec() - Set the device state migration channel
  * @vdev:	vhost-user device
@@ -1051,16 +1022,8 @@ static bool vu_set_device_state_fd_exec(struct vu_dev *vdev,
 	    direction != VHOST_USER_TRANSFER_STATE_DIRECTION_LOAD)
 		die("Invalide device_state_fd direction: %d", direction);
 
-	if (vdev->device_state_fd != -1) {
-		epoll_del(vdev->context, vdev->device_state_fd);
-		close(vdev->device_state_fd);
-	}
-
-	vdev->device_state_fd = msg->fds[0];
-	vdev->device_state_result = -1;
-	vu_set_migration_watch(vdev, vdev->device_state_fd, direction);
-
-	debug("Got device_state_fd: %d", vdev->device_state_fd);
+	migrate_request(vdev->context, msg->fds[0],
+			direction == VHOST_USER_TRANSFER_STATE_DIRECTION_LOAD);
 
 	/* We don't provide a new fd for the data transfer */
 	vmsg_set_reply_u64(msg, VHOST_USER_VRING_NOFD_MASK);
@@ -1075,12 +1038,11 @@ static bool vu_set_device_state_fd_exec(struct vu_dev *vdev,
  *
  * Return: True as the reply contains the migration result
  */
+/* cppcheck-suppress constParameterCallback */
 static bool vu_check_device_state_exec(struct vu_dev *vdev,
 				       struct vhost_user_msg *msg)
 {
-	(void)vdev;
-
-	vmsg_set_reply_u64(msg, vdev->device_state_result);
+	vmsg_set_reply_u64(msg, vdev->context->device_state_result);
 
 	return true;
 }
@@ -1106,8 +1068,8 @@ void vu_init(struct ctx *c)
 	}
 	c->vdev->log_table = NULL;
 	c->vdev->log_call_fd = -1;
-	c->vdev->device_state_fd = -1;
-	c->vdev->device_state_result = -1;
+
+	migrate_init(c);
 }
 
 
@@ -1157,12 +1119,8 @@ void vu_cleanup(struct vu_dev *vdev)
 
 	vu_close_log(vdev);
 
-	if (vdev->device_state_fd != -1) {
-		epoll_del(vdev->context, vdev->device_state_fd);
-		close(vdev->device_state_fd);
-		vdev->device_state_fd = -1;
-		vdev->device_state_result = -1;
-	}
+	/* If we lose the VU dev, we also lose our migration channel */
+	migrate_close(vdev->context);
 }
 
 /**
diff --git a/virtio.h b/virtio.h
index 7bef2d2..0a59441 100644
--- a/virtio.h
+++ b/virtio.h
@@ -106,8 +106,6 @@ struct vu_dev_region {
  * @log_call_fd:		Eventfd to report logging update
  * @log_size:			Size of the logging memory region
  * @log_table:			Base of the logging memory region
- * @device_state_fd:		Device state migration channel
- * @device_state_result:	Device state migration result
  */
 struct vu_dev {
 	struct ctx *context;
@@ -119,8 +117,6 @@ struct vu_dev {
 	int log_call_fd;
 	uint64_t log_size;
 	uint8_t *log_table;
-	int device_state_fd;
-	int device_state_result;
 };
 
 /**
diff --git a/vu_common.c b/vu_common.c
index ab04d31..48826b1 100644
--- a/vu_common.c
+++ b/vu_common.c
@@ -5,6 +5,7 @@
  * common_vu.c - vhost-user common UDP and TCP functions
  */
 
+#include <errno.h>
 #include <unistd.h>
 #include <sys/uio.h>
 #include <sys/eventfd.h>
@@ -17,6 +18,7 @@
 #include "vhost_user.h"
 #include "pcap.h"
 #include "vu_common.h"
+#include "migrate.h"
 
 #define VU_MAX_TX_BUFFER_NB	2
 
@@ -303,50 +305,3 @@ err:
 
 	return -1;
 }
-
-/**
- * vu_migrate() - Send/receive passt insternal state to/from QEMU
- * @vdev:	vhost-user device
- * @events:	epoll events
- */
-void vu_migrate(struct vu_dev *vdev, uint32_t events)
-{
-	int ret;
-
-	/* TODO: collect/set passt internal state
-	 * and use vdev->device_state_fd to send/receive it
-	 */
-	debug("vu_migrate fd %d events %x", vdev->device_state_fd, events);
-	if (events & EPOLLOUT) {
-		debug("Saving backend state");
-
-		/* send some stuff */
-		ret = write(vdev->device_state_fd, "PASST", 6);
-		/* value to be returned by VHOST_USER_CHECK_DEVICE_STATE */
-		vdev->device_state_result = ret == -1 ? -1 : 0;
-		/* Closing the file descriptor signals the end of transfer */
-		epoll_del(vdev->context, vdev->device_state_fd);
-		close(vdev->device_state_fd);
-		vdev->device_state_fd = -1;
-	} else if (events & EPOLLIN) {
-		char buf[6];
-
-		debug("Loading backend state");
-		/* read some stuff */
-		ret = read(vdev->device_state_fd, buf, sizeof(buf));
-		/* value to be returned by VHOST_USER_CHECK_DEVICE_STATE */
-		if (ret != sizeof(buf)) {
-			vdev->device_state_result = -1;
-		} else {
-			ret = strncmp(buf, "PASST", sizeof(buf));
-			vdev->device_state_result = ret == 0 ? 0 : -1;
-		}
-	} else if (events & EPOLLHUP) {
-		debug("Closing migration channel");
-
-		/* The end of file signals the end of the transfer. */
-		epoll_del(vdev->context, vdev->device_state_fd);
-		close(vdev->device_state_fd);
-		vdev->device_state_fd = -1;
-	}
-}
diff --git a/vu_common.h b/vu_common.h
index d56c021..f538f23 100644
--- a/vu_common.h
+++ b/vu_common.h
@@ -57,5 +57,5 @@ void vu_flush(const struct vu_dev *vdev, struct vu_virtq *vq,
 void vu_kick_cb(struct vu_dev *vdev, union epoll_ref ref,
 		const struct timespec *now);
 int vu_send_single(const struct ctx *c, const void *buf, size_t size);
-void vu_migrate(struct vu_dev *vdev, uint32_t events);
+
 #endif /* VU_COMMON_H */

From 155cd0c41e549cea956b7f8506cda7803cf63419 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Wed, 12 Feb 2025 18:07:14 +1100
Subject: [PATCH 224/382] migrate: Migrate guest observed addresses

Most of the information in struct ctx doesn't need to be migrated.
Either it's strictly back end information which is allowed to differ
between the two ends, or it must already be configured identically on
the two ends.

There are a few exceptions though.  In particular passt learns several
addresses of the guest by observing what it sends out.  If we lose
this information across migration we might get away with it, but if
there are active flows we might misdirect some packets before
re-learning the guest address.

Avoid this by migrating the guest's observed addresses.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
[sbrivio: Coding style stuff, comments, etc.]
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 migrate.c | 73 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 73 insertions(+)

diff --git a/migrate.c b/migrate.c
index aeac872..72a6d40 100644
--- a/migrate.c
+++ b/migrate.c
@@ -27,8 +27,81 @@
 /* Magic identifier for migration data */
 #define MIGRATE_MAGIC		0xB1BB1D1B0BB1D1B0
 
+/**
+ * struct migrate_seen_addrs_v1 - Migratable guest addresses for v1 state stream
+ * @addr6:	Observed guest IPv6 address
+ * @addr6_ll:	Observed guest IPv6 link-local address
+ * @addr4:	Observed guest IPv4 address
+ * @mac:	Observed guest MAC address
+ */
+struct migrate_seen_addrs_v1 {
+	struct in6_addr addr6;
+	struct in6_addr addr6_ll;
+	struct in_addr addr4;
+	unsigned char mac[ETH_ALEN];
+} __attribute__((packed));
+
+/**
+ * seen_addrs_source_v1() - Copy and send guest observed addresses from source
+ * @c:		Execution context
+ * @stage:	Migration stage, unused
+ * @fd:		File descriptor for state transfer
+ *
+ * Return: 0 on success, positive error code on failure
+ */
+/* cppcheck-suppress [constParameterCallback, unmatchedSuppression] */
+static int seen_addrs_source_v1(struct ctx *c,
+				const struct migrate_stage *stage, int fd)
+{
+	struct migrate_seen_addrs_v1 addrs = {
+		.addr6 = c->ip6.addr_seen,
+		.addr6_ll = c->ip6.addr_ll_seen,
+		.addr4 = c->ip4.addr_seen,
+	};
+
+	(void)stage;
+
+	memcpy(addrs.mac, c->guest_mac, sizeof(addrs.mac));
+
+	if (write_all_buf(fd, &addrs, sizeof(addrs)))
+		return errno;
+
+	return 0;
+}
+
+/**
+ * seen_addrs_target_v1() - Receive and use guest observed addresses on target
+ * @c:		Execution context
+ * @stage:	Migration stage, unused
+ * @fd:		File descriptor for state transfer
+ *
+ * Return: 0 on success, positive error code on failure
+ */
+static int seen_addrs_target_v1(struct ctx *c,
+				const struct migrate_stage *stage, int fd)
+{
+	struct migrate_seen_addrs_v1 addrs;
+
+	(void)stage;
+
+	if (read_all_buf(fd, &addrs, sizeof(addrs)))
+		return errno;
+
+	c->ip6.addr_seen = addrs.addr6;
+	c->ip6.addr_ll_seen = addrs.addr6_ll;
+	c->ip4.addr_seen = addrs.addr4;
+	memcpy(c->guest_mac, addrs.mac, sizeof(c->guest_mac));
+
+	return 0;
+}
+
 /* Stages for version 1 */
 static const struct migrate_stage stages_v1[] = {
+	{
+		.name = "observed addresses",
+		.source = seen_addrs_source_v1,
+		.target = seen_addrs_target_v1,
+	},
 	{ 0 },
 };
 

From b899141ad52fb417fe608d9c8cfe66f9572207c7 Mon Sep 17 00:00:00 2001
From: Stefano Brivio <sbrivio@redhat.com>
Date: Wed, 12 Feb 2025 18:07:15 +1100
Subject: [PATCH 225/382] Add interfaces and configuration bits for
 passt-repair

In vhost-user mode, by default, create a second UNIX domain socket
accepting connections from passt-repair, alongside the usual listener
socket.

When we need to set or clear TCP_REPAIR on sockets, we'll send them
via SCM_RIGHTS to passt-repair, which sets the socket option values we
ask for.

To that end, introduce batched functions to request TCP_REPAIR
settings on sockets, so that we don't have to send a separate message
for each socket on migration. When needed, repair_flush() will
send the message and check for the reply.

Co-authored-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
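
For example (based on conf_open_files() below): with --vhost-user and
-s /tmp/passt.socket, and no --repair-path given, passt listens for
passt-repair(1) connections on /tmp/passt.socket.repair; --repair-path
can point the listener elsewhere, and --repair-path none disables it.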
---
 Makefile     |  12 +--
 conf.c       |  43 +++++++++-
 epoll_type.h |   4 +
 migrate.c    |   5 +-
 passt.1      |  11 +++
 passt.c      |   9 +++
 passt.h      |   7 ++
 repair.c     | 219 +++++++++++++++++++++++++++++++++++++++++++++++++++
 repair.h     |  16 ++++
 tap.c        |  65 +--------------
 util.c       |  62 +++++++++++++++
 util.h       |   1 +
 12 files changed, 381 insertions(+), 73 deletions(-)
 create mode 100644 repair.c
 create mode 100644 repair.h

diff --git a/Makefile b/Makefile
index be89b07..d4e1096 100644
--- a/Makefile
+++ b/Makefile
@@ -38,9 +38,9 @@ FLAGS += -DDUAL_STACK_SOCKETS=$(DUAL_STACK_SOCKETS)
 
 PASST_SRCS = arch.c arp.c checksum.c conf.c dhcp.c dhcpv6.c flow.c fwd.c \
 	icmp.c igmp.c inany.c iov.c ip.c isolation.c lineread.c log.c mld.c \
-	ndp.c netlink.c migrate.c packet.c passt.c pasta.c pcap.c pif.c tap.c \
-	tcp.c tcp_buf.c tcp_splice.c tcp_vu.c udp.c udp_flow.c udp_vu.c util.c \
-	vhost_user.c virtio.c vu_common.c
+	ndp.c netlink.c migrate.c packet.c passt.c pasta.c pcap.c pif.c \
+	repair.c tap.c tcp.c tcp_buf.c tcp_splice.c tcp_vu.c udp.c udp_flow.c \
+	udp_vu.c util.c vhost_user.c virtio.c vu_common.c
 QRAP_SRCS = qrap.c
 PASST_REPAIR_SRCS = passt-repair.c
 SRCS = $(PASST_SRCS) $(QRAP_SRCS) $(PASST_REPAIR_SRCS)
@@ -50,9 +50,9 @@ MANPAGES = passt.1 pasta.1 qrap.1 passt-repair.1
 PASST_HEADERS = arch.h arp.h checksum.h conf.h dhcp.h dhcpv6.h flow.h fwd.h \
 	flow_table.h icmp.h icmp_flow.h inany.h iov.h ip.h isolation.h \
 	lineread.h log.h migrate.h ndp.h netlink.h packet.h passt.h pasta.h \
-	pcap.h pif.h siphash.h tap.h tcp.h tcp_buf.h tcp_conn.h tcp_internal.h \
-	tcp_splice.h tcp_vu.h udp.h udp_flow.h udp_internal.h udp_vu.h util.h \
-	vhost_user.h virtio.h vu_common.h
+	pcap.h pif.h repair.h siphash.h tap.h tcp.h tcp_buf.h tcp_conn.h \
+	tcp_internal.h tcp_splice.h tcp_vu.h udp.h udp_flow.h udp_internal.h \
+	udp_vu.h util.h vhost_user.h virtio.h vu_common.h
 HEADERS = $(PASST_HEADERS) seccomp.h
 
 C := \#include <sys/random.h>\nint main(){int a=getrandom(0, 0, 0);}
diff --git a/conf.c b/conf.c
index d9de07c..18017f5 100644
--- a/conf.c
+++ b/conf.c
@@ -820,6 +820,9 @@ static void usage(const char *name, FILE *f, int status)
 			"    UNIX domain socket is provided by -s option\n"
 			"  --print-capabilities	print back-end capabilities in JSON format,\n"
 			"    only meaningful for vhost-user mode\n");
+		FPRINTF(f,
+			"  --repair-path PATH	path for passt-repair(1)\n"
+			"    default: append '.repair' to UNIX domain path\n");
 	}
 
 	FPRINTF(f,
@@ -1245,8 +1248,25 @@ static void conf_nat(const char *arg, struct in_addr *addr4,
  */
 static void conf_open_files(struct ctx *c)
 {
-	if (c->mode != MODE_PASTA && c->fd_tap == -1)
-		c->fd_tap_listen = tap_sock_unix_open(c->sock_path);
+	if (c->mode != MODE_PASTA && c->fd_tap == -1) {
+		c->fd_tap_listen = sock_unix(c->sock_path);
+
+		if (c->mode == MODE_VU && strcmp(c->repair_path, "none")) {
+			if (!*c->repair_path &&
+			    snprintf_check(c->repair_path,
+					   sizeof(c->repair_path), "%s.repair",
+					   c->sock_path)) {
+				warn("passt-repair path %s not usable",
+				     c->repair_path);
+				c->fd_repair_listen = -1;
+			} else {
+				c->fd_repair_listen = sock_unix(c->repair_path);
+			}
+		} else {
+			c->fd_repair_listen = -1;
+		}
+		c->fd_repair = -1;
+	}
 
 	if (*c->pidfile) {
 		c->pidfile_fd = output_file_open(c->pidfile, O_WRONLY);
@@ -1360,10 +1380,12 @@ void conf(struct ctx *c, int argc, char **argv)
 		{"host-lo-to-ns-lo", no_argument, 	NULL,		23 },
 		{"dns-host",	required_argument,	NULL,		24 },
 		{"vhost-user",	no_argument,		NULL,		25 },
+
 		/* vhost-user backend program convention */
 		{"print-capabilities", no_argument,	NULL,		26 },
 		{"socket-path",	required_argument,	NULL,		's' },
 		{"fqdn",	required_argument,	NULL,		27 },
+		{"repair-path",	required_argument,	NULL,		28 },
 		{ 0 },
 	};
 	const char *logname = (c->mode == MODE_PASTA) ? "pasta" : "passt";
@@ -1570,6 +1592,9 @@ void conf(struct ctx *c, int argc, char **argv)
 					   "%s", optarg))
 				die("Invalid FQDN: %s", optarg);
 			break;
+		case 28:
+			/* Handle this once we checked --vhost-user */
+			break;
 		case 'd':
 			c->debug = 1;
 			c->quiet = 0;
@@ -1841,8 +1866,8 @@ void conf(struct ctx *c, int argc, char **argv)
 	if (c->ifi4 && IN4_IS_ADDR_UNSPECIFIED(&c->ip4.guest_gw))
 		c->no_dhcp = 1;
 
-	/* Inbound port options & DNS can be parsed now (after IPv4/IPv6
-	 * settings)
+	/* Inbound port options, DNS, and --repair-path can be parsed now, after
+	 * IPv4/IPv6 settings and --vhost-user.
 	 */
 	fwd_probe_ephemeral();
 	udp_portmap_clear();
@@ -1888,6 +1913,16 @@ void conf(struct ctx *c, int argc, char **argv)
 			}
 
 			die("Cannot use DNS address %s", optarg);
+		} else if (name == 28) {
+			if (c->mode != MODE_VU && strcmp(optarg, "none"))
+				die("--repair-path is for vhost-user mode only");
+
+			if (snprintf_check(c->repair_path,
+					   sizeof(c->repair_path), "%s",
+					   optarg))
+				die("Invalid passt-repair path: %s", optarg);
+
+			break;
 		}
 	} while (name != -1);
 
diff --git a/epoll_type.h b/epoll_type.h
index f3ef415..7f2a121 100644
--- a/epoll_type.h
+++ b/epoll_type.h
@@ -40,6 +40,10 @@ enum epoll_type {
 	EPOLL_TYPE_VHOST_CMD,
 	/* vhost-user kick event socket */
 	EPOLL_TYPE_VHOST_KICK,
+	/* TCP_REPAIR helper listening socket */
+	EPOLL_TYPE_REPAIR_LISTEN,
+	/* TCP_REPAIR helper socket */
+	EPOLL_TYPE_REPAIR,
 
 	EPOLL_NUM_TYPES,
 };
diff --git a/migrate.c b/migrate.c
index 72a6d40..1c59016 100644
--- a/migrate.c
+++ b/migrate.c
@@ -23,6 +23,7 @@
 #include "flow_table.h"
 
 #include "migrate.h"
+#include "repair.h"
 
 /* Magic identifier for migration data */
 #define MIGRATE_MAGIC		0xB1BB1D1B0BB1D1B0
@@ -232,7 +233,7 @@ void migrate_init(struct ctx *c)
 }
 
 /**
- * migrate_close() - Close migration channel
+ * migrate_close() - Close migration channel and connection to passt-repair
  * @c:		Execution context
  */
 void migrate_close(struct ctx *c)
@@ -243,6 +244,8 @@ void migrate_close(struct ctx *c)
 		c->device_state_fd = -1;
 		c->device_state_result = -1;
 	}
+
+	repair_close(c);
 }
 
 /**
diff --git a/passt.1 b/passt.1
index 9d347d8..60066c2 100644
--- a/passt.1
+++ b/passt.1
@@ -428,6 +428,17 @@ Enable vhost-user. The vhost-user command socket is provided by \fB--socket\fR.
 .BR \-\-print-capabilities
 Print back-end capabilities in JSON format, only meaningful for vhost-user mode.
 
+.TP
+.BR \-\-repair-path " " \fIpath
+Path for UNIX domain socket used by the \fBpasst-repair\fR(1) helper to connect
+to \fBpasst\fR in order to set or clear the TCP_REPAIR option on sockets, during
+migration. \fB--repair-path none\fR disables this interface (if you need to
+specify a socket path called "none" you can prefix the path by \fI./\fR).
+
+Default, for \-\-vhost-user mode only, is to append \fI.repair\fR to the path
+chosen for the hypervisor UNIX domain socket. No socket is created if not in
+\-\-vhost-user mode.
+
 .TP
 .BR \-F ", " \-\-fd " " \fIFD
 Pass a pre-opened, connected socket to \fBpasst\fR. Usually the socket is opened
diff --git a/passt.c b/passt.c
index 935a69f..6f9fb4d 100644
--- a/passt.c
+++ b/passt.c
@@ -52,6 +52,7 @@
 #include "ndp.h"
 #include "vu_common.h"
 #include "migrate.h"
+#include "repair.h"
 
 #define EPOLL_EVENTS		8
 
@@ -76,6 +77,8 @@ char *epoll_type_str[] = {
 	[EPOLL_TYPE_TAP_LISTEN]		= "listening qemu socket",
 	[EPOLL_TYPE_VHOST_CMD]		= "vhost-user command socket",
 	[EPOLL_TYPE_VHOST_KICK]		= "vhost-user kick socket",
+	[EPOLL_TYPE_REPAIR_LISTEN]	= "TCP_REPAIR helper listening socket",
+	[EPOLL_TYPE_REPAIR]		= "TCP_REPAIR helper socket",
 };
 static_assert(ARRAY_SIZE(epoll_type_str) == EPOLL_NUM_TYPES,
 	      "epoll_type_str[] doesn't match enum epoll_type");
@@ -358,6 +361,12 @@ loop:
 		case EPOLL_TYPE_VHOST_KICK:
 			vu_kick_cb(c.vdev, ref, &now);
 			break;
+		case EPOLL_TYPE_REPAIR_LISTEN:
+			repair_listen_handler(&c, eventmask);
+			break;
+		case EPOLL_TYPE_REPAIR:
+			repair_handler(&c, eventmask);
+			break;
 		default:
 			/* Can't happen */
 			ASSERT(0);
diff --git a/passt.h b/passt.h
index 5fdea52..1f0dab5 100644
--- a/passt.h
+++ b/passt.h
@@ -20,6 +20,7 @@ union epoll_ref;
 #include "siphash.h"
 #include "ip.h"
 #include "inany.h"
+#include "migrate.h"
 #include "flow.h"
 #include "icmp.h"
 #include "fwd.h"
@@ -193,6 +194,7 @@ struct ip6_ctx {
  * @foreground:		Run in foreground, don't log to stderr by default
  * @nofile:		Maximum number of open files (ulimit -n)
  * @sock_path:		Path for UNIX domain socket
+ * @repair_path:	TCP_REPAIR helper path, can be "none", empty for default
  * @pcap:		Path for packet capture file
  * @pidfile:		Path to PID file, empty string if not configured
  * @pidfile_fd:		File descriptor for PID file, -1 if none
@@ -203,6 +205,8 @@ struct ip6_ctx {
  * @epollfd:		File descriptor for epoll instance
  * @fd_tap_listen:	File descriptor for listening AF_UNIX socket, if any
  * @fd_tap:		AF_UNIX socket, tuntap device, or pre-opened socket
+ * @fd_repair_listen:	File descriptor for listening TCP_REPAIR socket, if any
+ * @fd_repair:		Connected AF_UNIX socket for TCP_REPAIR helper
  * @our_tap_mac:	Pasta/passt's MAC on the tap link
  * @guest_mac:		MAC address of guest or namespace, seen or configured
  * @hash_secret:	128-bit secret for siphash functions
@@ -249,6 +253,7 @@ struct ctx {
 	int foreground;
 	int nofile;
 	char sock_path[UNIX_PATH_MAX];
+	char repair_path[UNIX_PATH_MAX];
 	char pcap[PATH_MAX];
 
 	char pidfile[PATH_MAX];
@@ -265,6 +270,8 @@ struct ctx {
 	int epollfd;
 	int fd_tap_listen;
 	int fd_tap;
+	int fd_repair_listen;
+	int fd_repair;
 	unsigned char our_tap_mac[ETH_ALEN];
 	unsigned char guest_mac[ETH_ALEN];
 	uint64_t hash_secret[2];
diff --git a/repair.c b/repair.c
new file mode 100644
index 0000000..d288617
--- /dev/null
+++ b/repair.c
@@ -0,0 +1,219 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+/* PASST - Plug A Simple Socket Transport
+ *  for qemu/UNIX domain socket mode
+ *
+ * PASTA - Pack A Subtle Tap Abstraction
+ *  for network namespace/tap device mode
+ *
+ * repair.c - Interface (server) for passt-repair, set/clear TCP_REPAIR
+ *
+ * Copyright (c) 2025 Red Hat GmbH
+ * Author: Stefano Brivio <sbrivio@redhat.com>
+ */
+
+#include <errno.h>
+#include <sys/uio.h>
+
+#include "util.h"
+#include "ip.h"
+#include "passt.h"
+#include "inany.h"
+#include "flow.h"
+#include "flow_table.h"
+
+#include "repair.h"
+
+#define SCM_MAX_FD 253 /* From Linux kernel (include/net/scm.h), not in UAPI */
+
+/* Pending file descriptors for next repair_flush() call, or command change */
+static int repair_fds[SCM_MAX_FD];
+
+/* Pending command: flush pending file descriptors if it changes */
+static int8_t repair_cmd;
+
+/* Number of pending file descriptors set in @repair_fds */
+static int repair_nfds;
+
+/**
+ * repair_sock_init() - Start listening for connections on helper socket
+ * @c:		Execution context
+ */
+void repair_sock_init(const struct ctx *c)
+{
+	union epoll_ref ref = { .type = EPOLL_TYPE_REPAIR_LISTEN };
+	struct epoll_event ev = { 0 };
+
+	if (c->fd_repair_listen == -1)
+		return;
+
+	if (listen(c->fd_repair_listen, 0)) {
+		err_perror("listen() on repair helper socket, won't migrate");
+		return;
+	}
+
+	ref.fd = c->fd_repair_listen;
+	ev.events = EPOLLIN | EPOLLHUP | EPOLLET;
+	ev.data.u64 = ref.u64;
+	if (epoll_ctl(c->epollfd, EPOLL_CTL_ADD, c->fd_repair_listen, &ev))
+		err_perror("repair helper socket epoll_ctl(), won't migrate");
+}
+
+/**
+ * repair_listen_handler() - Handle events on TCP_REPAIR helper listening socket
+ * @c:		Execution context
+ * @events:	epoll events
+ */
+void repair_listen_handler(struct ctx *c, uint32_t events)
+{
+	union epoll_ref ref = { .type = EPOLL_TYPE_REPAIR };
+	struct epoll_event ev = { 0 };
+	struct ucred ucred;
+	socklen_t len;
+
+	if (events != EPOLLIN) {
+		debug("Spurious event 0x%04x on TCP_REPAIR helper socket",
+		      events);
+		return;
+	}
+
+	len = sizeof(ucred);
+
+	/* Another client is already connected: accept and close right away. */
+	if (c->fd_repair != -1) {
+		int discard = accept4(c->fd_repair_listen, NULL, NULL,
+				      SOCK_NONBLOCK);
+
+		if (discard == -1)
+			return;
+
+		if (!getsockopt(discard, SOL_SOCKET, SO_PEERCRED, &ucred, &len))
+			info("Discarding TCP_REPAIR helper, PID %i", ucred.pid);
+
+		close(discard);
+		return;
+	}
+
+	if ((c->fd_repair = accept4(c->fd_repair_listen, NULL, NULL, 0)) < 0) {
+		debug_perror("accept4() on TCP_REPAIR helper listening socket");
+		return;
+	}
+
+	if (!getsockopt(c->fd_repair, SOL_SOCKET, SO_PEERCRED, &ucred, &len))
+		info("Accepted TCP_REPAIR helper, PID %i", ucred.pid);
+
+	ref.fd = c->fd_repair;
+	ev.events = EPOLLHUP | EPOLLET;
+	ev.data.u64 = ref.u64;
+	if (epoll_ctl(c->epollfd, EPOLL_CTL_ADD, c->fd_repair, &ev)) {
+		debug_perror("epoll_ctl() on TCP_REPAIR helper socket");
+		close(c->fd_repair);
+		c->fd_repair = -1;
+	}
+}
+
+/**
+ * repair_close() - Close connection to TCP_REPAIR helper
+ * @c:		Execution context
+ */
+void repair_close(struct ctx *c)
+{
+	debug("Closing TCP_REPAIR helper socket");
+
+	epoll_ctl(c->epollfd, EPOLL_CTL_DEL, c->fd_repair, NULL);
+	close(c->fd_repair);
+	c->fd_repair = -1;
+}
+
+/**
+ * repair_handler() - Handle EPOLLHUP and EPOLLERR on TCP_REPAIR helper socket
+ * @c:		Execution context
+ * @events:	epoll events
+ */
+void repair_handler(struct ctx *c, uint32_t events)
+{
+	(void)events;
+
+	repair_close(c);
+}
+
+/**
+ * repair_flush() - Flush current set of sockets to helper, with current command
+ * @c:		Execution context
+ *
+ * Return: 0 on success, negative error code on failure
+ */
+int repair_flush(struct ctx *c)
+{
+	struct iovec iov = { &repair_cmd, sizeof(repair_cmd) };
+	char buf[CMSG_SPACE(sizeof(int) * SCM_MAX_FD)]
+	     __attribute__ ((aligned(__alignof__(struct cmsghdr))));
+	struct cmsghdr *cmsg;
+	struct msghdr msg;
+	int8_t reply;
+
+	if (!repair_nfds)
+		return 0;
+
+	msg = (struct msghdr){ NULL, 0, &iov, 1,
+			       buf, CMSG_SPACE(sizeof(int) * repair_nfds), 0 };
+	cmsg = CMSG_FIRSTHDR(&msg);
+
+	cmsg->cmsg_level = SOL_SOCKET;
+	cmsg->cmsg_type = SCM_RIGHTS;
+	cmsg->cmsg_len = CMSG_LEN(sizeof(int) * repair_nfds);
+	memcpy(CMSG_DATA(cmsg), repair_fds, sizeof(int) * repair_nfds);
+
+	repair_nfds = 0;
+
+	if (sendmsg(c->fd_repair, &msg, 0) < 0) {
+		int ret = -errno;
+		err_perror("Failed to send sockets to TCP_REPAIR helper");
+		repair_close(c);
+		return ret;
+	}
+
+	if (recv(c->fd_repair, &reply, sizeof(reply), 0) < 0) {
+		int ret = -errno;
+		err_perror("Failed to receive reply from TCP_REPAIR helper");
+		repair_close(c);
+		return ret;
+	}
+
+	if (reply != repair_cmd) {
+		err("Unexpected reply from TCP_REPAIR helper: %d", reply);
+		repair_close(c);
+		return -ENXIO;
+	}
+
+	return 0;
+}
+
+/**
+ * repair_set() - Add socket to TCP_REPAIR set with given command
+ * @c:		Execution context
+ * @s:		Socket to add
+ * @cmd:	TCP_REPAIR_ON, TCP_REPAIR_OFF, or TCP_REPAIR_OFF_NO_WP
+ *
+ * Return: 0 on success, negative error code on failure
+ */
+/* cppcheck-suppress unusedFunction */
+int repair_set(struct ctx *c, int s, int cmd)
+{
+	int rc;
+
+	if (repair_nfds && repair_cmd != cmd) {
+		if ((rc = repair_flush(c)))
+			return rc;
+	}
+
+	repair_cmd = cmd;
+	repair_fds[repair_nfds++] = s;
+
+	if (repair_nfds >= SCM_MAX_FD) {
+		if ((rc = repair_flush(c)))
+			return rc;
+	}
+
+	return 0;
+}
diff --git a/repair.h b/repair.h
new file mode 100644
index 0000000..de279d6
--- /dev/null
+++ b/repair.h
@@ -0,0 +1,16 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later
+ * Copyright (c) 2025 Red Hat GmbH
+ * Author: Stefano Brivio <sbrivio@redhat.com>
+ */
+
+#ifndef REPAIR_H
+#define REPAIR_H
+
+void repair_sock_init(const struct ctx *c);
+void repair_listen_handler(struct ctx *c, uint32_t events);
+void repair_handler(struct ctx *c, uint32_t events);
+void repair_close(struct ctx *c);
+int repair_flush(struct ctx *c);
+int repair_set(struct ctx *c, int s, int cmd);
+
+#endif /* REPAIR_H */
diff --git a/tap.c b/tap.c
index 8c92d23..d0673e5 100644
--- a/tap.c
+++ b/tap.c
@@ -56,6 +56,7 @@
 #include "netlink.h"
 #include "pasta.h"
 #include "packet.h"
+#include "repair.h"
 #include "tap.h"
 #include "log.h"
 #include "vhost_user.h"
@@ -1151,68 +1152,6 @@ void tap_handler_pasta(struct ctx *c, uint32_t events,
 		tap_pasta_input(c, now);
 }
 
-/**
- * tap_sock_unix_open() - Create and bind AF_UNIX socket
- * @sock_path:	Socket path. If empty, set on return (UNIX_SOCK_PATH as prefix)
- *
- * Return: socket descriptor on success, won't return on failure
- */
-int tap_sock_unix_open(char *sock_path)
-{
-	int fd = socket(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0);
-	struct sockaddr_un addr = {
-		.sun_family = AF_UNIX,
-	};
-	int i;
-
-	if (fd < 0)
-		die_perror("Failed to open UNIX domain socket");
-
-	for (i = 1; i < UNIX_SOCK_MAX; i++) {
-		char *path = addr.sun_path;
-		int ex, ret;
-
-		if (*sock_path)
-			memcpy(path, sock_path, UNIX_PATH_MAX);
-		else if (snprintf_check(path, UNIX_PATH_MAX - 1,
-					UNIX_SOCK_PATH, i))
-			die_perror("Can't build UNIX domain socket path");
-
-		ex = socket(AF_UNIX, SOCK_STREAM | SOCK_NONBLOCK | SOCK_CLOEXEC,
-			    0);
-		if (ex < 0)
-			die_perror("Failed to check for UNIX domain conflicts");
-
-		ret = connect(ex, (const struct sockaddr *)&addr, sizeof(addr));
-		if (!ret || (errno != ENOENT && errno != ECONNREFUSED &&
-			     errno != EACCES)) {
-			if (*sock_path)
-				die("Socket path %s already in use", path);
-
-			close(ex);
-			continue;
-		}
-		close(ex);
-
-		unlink(path);
-		ret = bind(fd, (const struct sockaddr *)&addr, sizeof(addr));
-		if (*sock_path && ret)
-			die_perror("Failed to bind UNIX domain socket");
-
-		if (!ret)
-			break;
-	}
-
-	if (i == UNIX_SOCK_MAX)
-		die_perror("Failed to bind UNIX domain socket");
-
-	info("UNIX domain socket bound at %s", addr.sun_path);
-	if (!*sock_path)
-		memcpy(sock_path, addr.sun_path, UNIX_PATH_MAX);
-
-	return fd;
-}
-
 /**
  * tap_backend_show_hints() - Give help information to start QEMU
  * @c:		Execution context
@@ -1423,6 +1362,8 @@ void tap_backend_init(struct ctx *c)
 		tap_sock_tun_init(c);
 		break;
 	case MODE_VU:
+		repair_sock_init(c);
+		/* fall through */
 	case MODE_PASST:
 		tap_sock_unix_init(c);
 
diff --git a/util.c b/util.c
index ba33866..656e86a 100644
--- a/util.c
+++ b/util.c
@@ -178,6 +178,68 @@ int sock_l4_sa(const struct ctx *c, enum epoll_type type,
 	return fd;
 }
 
+/**
+ * sock_unix() - Create and bind AF_UNIX socket
+ * @sock_path:	Socket path. If empty, set on return (UNIX_SOCK_PATH as prefix)
+ *
+ * Return: socket descriptor on success, won't return on failure
+ */
+int sock_unix(char *sock_path)
+{
+	int fd = socket(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0);
+	struct sockaddr_un addr = {
+		.sun_family = AF_UNIX,
+	};
+	int i;
+
+	if (fd < 0)
+		die_perror("Failed to open UNIX domain socket");
+
+	for (i = 1; i < UNIX_SOCK_MAX; i++) {
+		char *path = addr.sun_path;
+		int ex, ret;
+
+		if (*sock_path)
+			memcpy(path, sock_path, UNIX_PATH_MAX);
+		else if (snprintf_check(path, UNIX_PATH_MAX - 1,
+					UNIX_SOCK_PATH, i))
+			die_perror("Can't build UNIX domain socket path");
+
+		ex = socket(AF_UNIX, SOCK_STREAM | SOCK_NONBLOCK | SOCK_CLOEXEC,
+			    0);
+		if (ex < 0)
+			die_perror("Failed to check for UNIX domain conflicts");
+
+		ret = connect(ex, (const struct sockaddr *)&addr, sizeof(addr));
+		if (!ret || (errno != ENOENT && errno != ECONNREFUSED &&
+			     errno != EACCES)) {
+			if (*sock_path)
+				die("Socket path %s already in use", path);
+
+			close(ex);
+			continue;
+		}
+		close(ex);
+
+		unlink(path);
+		ret = bind(fd, (const struct sockaddr *)&addr, sizeof(addr));
+		if (*sock_path && ret)
+			die_perror("Failed to bind UNIX domain socket");
+
+		if (!ret)
+			break;
+	}
+
+	if (i == UNIX_SOCK_MAX)
+		die_perror("Failed to bind UNIX domain socket");
+
+	info("UNIX domain socket bound at %s", addr.sun_path);
+	if (!*sock_path)
+		memcpy(sock_path, addr.sun_path, UNIX_PATH_MAX);
+
+	return fd;
+}
+
 /**
  * sock_probe_mem() - Check if setting high SO_SNDBUF and SO_RCVBUF is allowed
  * @c:		Execution context
diff --git a/util.h b/util.h
index 7df7767..50e96d3 100644
--- a/util.h
+++ b/util.h
@@ -217,6 +217,7 @@ struct ctx;
 int sock_l4_sa(const struct ctx *c, enum epoll_type type,
 	       const void *sa, socklen_t sl,
 	       const char *ifname, bool v6only, uint32_t data);
+int sock_unix(char *sock_path);
 void sock_probe_mem(struct ctx *c);
 long timespec_diff_ms(const struct timespec *a, const struct timespec *b);
 int64_t timespec_diff_us(const struct timespec *a, const struct timespec *b);

From f3fe795ff58656c39a39dbfac47fe6769f5ce293 Mon Sep 17 00:00:00 2001
From: Stefano Brivio <sbrivio@redhat.com>
Date: Wed, 12 Feb 2025 18:07:16 +1100
Subject: [PATCH 226/382] vhost_user: Make source quit after reporting
 migration state

This closes all the sockets we currently have open in repair mode
and completes our migration tasks as the source. If the hypervisor wants
to have us back at this point, somebody needs to restart us.

Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 vhost_user.c | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/vhost_user.c b/vhost_user.c
index 256c8ab..7ab1377 100644
--- a/vhost_user.c
+++ b/vhost_user.c
@@ -1203,4 +1203,11 @@ void vu_control_handler(struct vu_dev *vdev, int fd, uint32_t events)
 
 	if (reply_requested)
 		vu_send_reply(fd, &msg);
+
+	if (msg.hdr.request == VHOST_USER_CHECK_DEVICE_STATE &&
+	    vdev->context->device_state_result == 0 &&
+	    !vdev->context->migrate_target) {
+		info("Migration complete, exiting");
+		_exit(EXIT_SUCCESS);
+	}
 }

From 6f122f0171fe4bc235d572945e0bf963e81139ea Mon Sep 17 00:00:00 2001
From: Stefano Brivio <sbrivio@redhat.com>
Date: Wed, 12 Feb 2025 18:07:17 +1100
Subject: [PATCH 227/382] tcp: Get bound address for connected inbound sockets
 too

So that we can bind inbound sockets to specific addresses, like we
already do for outbound sockets.

While at it, change the error message in tcp_conn_from_tap() to match
this one.

Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 flow.c       |  6 +++---
 flow_table.h |  6 +++---
 tcp.c        | 22 ++++++++++++++--------
 3 files changed, 20 insertions(+), 14 deletions(-)

diff --git a/flow.c b/flow.c
index a6fe6d1..3ac551b 100644
--- a/flow.c
+++ b/flow.c
@@ -390,9 +390,9 @@ const struct flowside *flow_initiate_af(union flow *flow, uint8_t pif,
  *
  * Return: pointer to the initiating flowside information
  */
-const struct flowside *flow_initiate_sa(union flow *flow, uint8_t pif,
-					const union sockaddr_inany *ssa,
-					in_port_t dport)
+struct flowside *flow_initiate_sa(union flow *flow, uint8_t pif,
+				  const union sockaddr_inany *ssa,
+				  in_port_t dport)
 {
 	struct flowside *ini = &flow->f.side[INISIDE];
 
diff --git a/flow_table.h b/flow_table.h
index eeb6f41..9a2ff24 100644
--- a/flow_table.h
+++ b/flow_table.h
@@ -161,9 +161,9 @@ const struct flowside *flow_initiate_af(union flow *flow, uint8_t pif,
 					sa_family_t af,
 					const void *saddr, in_port_t sport,
 					const void *daddr, in_port_t dport);
-const struct flowside *flow_initiate_sa(union flow *flow, uint8_t pif,
-					const union sockaddr_inany *ssa,
-					in_port_t dport);
+struct flowside *flow_initiate_sa(union flow *flow, uint8_t pif,
+				  const union sockaddr_inany *ssa,
+				  in_port_t dport);
 const struct flowside *flow_target_af(union flow *flow, uint8_t pif,
 				      sa_family_t af,
 				      const void *saddr, in_port_t sport,
diff --git a/tcp.c b/tcp.c
index b87478f..a1d6c53 100644
--- a/tcp.c
+++ b/tcp.c
@@ -1536,12 +1536,10 @@ static void tcp_conn_from_tap(const struct ctx *c, sa_family_t af,
 
 	if (c->mode == MODE_VU) { /* To rebind to same oport after migration */
 		sl = sizeof(sa);
-		if (!getsockname(s, &sa.sa, &sl)) {
+		if (!getsockname(s, &sa.sa, &sl))
 			inany_from_sockaddr(&tgt->oaddr, &tgt->oport, &sa);
-		} else {
-			err("Failed to get local address for socket: %s",
-			    strerror_(errno));
-		}
+		else
+			err_perror("Can't get local address for socket %i", s);
 	}
 
 	FLOW_ACTIVATE(conn);
@@ -2075,9 +2073,9 @@ static void tcp_tap_conn_from_sock(const struct ctx *c, union flow *flow,
 void tcp_listen_handler(const struct ctx *c, union epoll_ref ref,
 			const struct timespec *now)
 {
-	const struct flowside *ini;
 	union sockaddr_inany sa;
 	socklen_t sl = sizeof(sa);
+	struct flowside *ini;
 	union flow *flow;
 	int s;
 
@@ -2093,12 +2091,20 @@ void tcp_listen_handler(const struct ctx *c, union epoll_ref ref,
 	tcp_sock_set_bufsize(c, s);
 	tcp_sock_set_nodelay(s);
 
-	/* FIXME: When listening port has a specific bound address, record that
-	 * as our address
+	/* FIXME: If useful: when the listening port has a specific bound
+	 * address, record that as our address, as implemented for vhost-user
+	 * mode only, below.
 	 */
 	ini = flow_initiate_sa(flow, ref.tcp_listen.pif, &sa,
 			       ref.tcp_listen.port);
 
+	if (c->mode == MODE_VU) { /* Rebind to same address after migration */
+		if (!getsockname(s, &sa.sa, &sl))
+			inany_from_sockaddr(&ini->oaddr, &ini->oport, &sa);
+		else
+			err_perror("Can't get local address for socket %i", s);
+	}
+
 	if (!inany_is_unicast(&ini->eaddr) || ini->eport == 0) {
 		char sastr[SOCKADDR_STRLEN];
 

From a3011584563bb7d6cf46416e8e84873c2615ad63 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Wed, 12 Feb 2025 18:07:19 +1100
Subject: [PATCH 228/382] rampstream: Add utility to test for corruption of
 data streams

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 test/.gitignore             |   1 +
 test/Makefile               |   5 +-
 test/migrate/rampstream_in  |  59 +++++++++++++++
 test/migrate/rampstream_out |  55 ++++++++++++++
 test/passt.mbuto            |   5 +-
 test/rampstream-check.sh    |   3 +
 test/rampstream.c           | 143 ++++++++++++++++++++++++++++++++++++
 7 files changed, 267 insertions(+), 4 deletions(-)
 create mode 100644 test/migrate/rampstream_in
 create mode 100644 test/migrate/rampstream_out
 create mode 100755 test/rampstream-check.sh
 create mode 100644 test/rampstream.c

diff --git a/test/.gitignore b/test/.gitignore
index 6dd4790..3573444 100644
--- a/test/.gitignore
+++ b/test/.gitignore
@@ -8,5 +8,6 @@ QEMU_EFI.fd
 *.raw.xz
 *.bin
 nstool
+rampstream
 guest-key
 guest-key.pub
diff --git a/test/Makefile b/test/Makefile
index 5e49047..bf63db8 100644
--- a/test/Makefile
+++ b/test/Makefile
@@ -52,7 +52,8 @@ UBUNTU_IMGS = $(UBUNTU_OLD_IMGS) $(UBUNTU_NEW_IMGS)
 
 DOWNLOAD_ASSETS = mbuto podman \
 	$(DEBIAN_IMGS) $(FEDORA_IMGS) $(OPENSUSE_IMGS) $(UBUNTU_IMGS)
-TESTDATA_ASSETS = small.bin big.bin medium.bin
+TESTDATA_ASSETS = small.bin big.bin medium.bin \
+	rampstream
 LOCAL_ASSETS = mbuto.img mbuto.mem.img podman/bin/podman QEMU_EFI.fd \
 	$(DEBIAN_IMGS:%=prepared-%) $(FEDORA_IMGS:%=prepared-%) \
 	$(UBUNTU_NEW_IMGS:%=prepared-%) \
@@ -85,7 +86,7 @@ podman/bin/podman: pull-podman
 guest-key guest-key.pub:
 	ssh-keygen -f guest-key -N ''
 
-mbuto.img: passt.mbuto mbuto/mbuto guest-key.pub $(TESTDATA_ASSETS)
+mbuto.img: passt.mbuto mbuto/mbuto guest-key.pub rampstream-check.sh $(TESTDATA_ASSETS)
 	./mbuto/mbuto -p ./$< -c lz4 -f $@
 
 mbuto.mem.img: passt.mem.mbuto mbuto ../passt.avx2
diff --git a/test/migrate/rampstream_in b/test/migrate/rampstream_in
new file mode 100644
index 0000000..46f4143
--- /dev/null
+++ b/test/migrate/rampstream_in
@@ -0,0 +1,59 @@
+# SPDX-License-Identifier: GPL-2.0-or-later
+#
+# PASST - Plug A Simple Socket Transport
+#  for qemu/UNIX domain socket mode
+#
+# PASTA - Pack A Subtle Tap Abstraction
+#  for network namespace/tap device mode
+#
+# test/migrate/rampstream_in - Check data stream integrity, host to guest
+#
+# Copyright (c) 2025 Red Hat GmbH
+# Author: Stefano Brivio <sbrivio@redhat.com>
+
+g1tools	ip jq dhclient socat cat
+htools	ip jq
+
+set	MAP_HOST4 192.0.2.1
+set	MAP_HOST6 2001:db8:9a55::1
+set	MAP_NS4 192.0.2.2
+set	MAP_NS6 2001:db8:9a55::2
+set	RAMPS 6000000
+
+test	Interface name
+g1out	IFNAME1 ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname'
+hout	HOST_IFNAME ip -j -4 route show|jq -rM '[.[] | select(.dst == "default").dev] | .[0]'
+hout	HOST_IFNAME6 ip -j -6 route show|jq -rM '[.[] | select(.dst == "default").dev] | .[0]'
+check	[ -n "__IFNAME1__" ]
+
+test	DHCP: address
+guest1	ip link set dev __IFNAME1__ up
+guest1	/sbin/dhclient -4 __IFNAME1__
+g1out	ADDR1 ip -j -4 addr show|jq -rM '.[] | select(.ifname == "__IFNAME1__").addr_info[0].local'
+hout	HOST_ADDR ip -j -4 addr show|jq -rM '.[] | select(.ifname == "__HOST_IFNAME__").addr_info[0].local'
+check	[ "__ADDR1__" = "__HOST_ADDR__" ]
+
+test	DHCPv6: address
+# Link is up now, wait for DAD to complete
+guest1	while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done
+guest1	/sbin/dhclient -6 __IFNAME1__
+# Wait for DAD to complete on the DHCP address
+guest1	while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done
+g1out	ADDR1_6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__IFNAME1__").addr_info[] | select(.prefixlen == 128).local] | .[0]'
+hout	HOST_ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__HOST_IFNAME6__").addr_info[] | select(.scope == "global" and .deprecated != true).local] | .[0]'
+check	[ "__ADDR1_6__" = "__HOST_ADDR6__" ]
+
+test	TCP/IPv4: host > guest
+g1out	GW1 ip -j -4 route show|jq -rM '.[] | select(.dst == "default").gateway'
+guest1b	socat -u TCP4-LISTEN:10001 EXEC:"rampstream-check.sh __RAMPS__"
+sleep	1
+hostb	socat -u EXEC:"test/rampstream send __RAMPS__" TCP4:__ADDR1__:10001
+
+sleep 1
+
+#mon	echo "migrate tcp:0:20005" | socat -u STDIN UNIX:__STATESETUP__/qemu_1_mon.sock
+
+hostw
+
+guest2	cat rampstream.err
+guest2	[ $(cat rampstream.status) -eq 0 ]
diff --git a/test/migrate/rampstream_out b/test/migrate/rampstream_out
new file mode 100644
index 0000000..91b9c63
--- /dev/null
+++ b/test/migrate/rampstream_out
@@ -0,0 +1,55 @@
+# SPDX-License-Identifier: GPL-2.0-or-later
+#
+# PASST - Plug A Simple Socket Transport
+#  for qemu/UNIX domain socket mode
+#
+# PASTA - Pack A Subtle Tap Abstraction
+#  for network namespace/tap device mode
+#
+# test/migrate/rampstream_out - Check data stream integrity, guest to host
+#
+# Copyright (c) 2025 Red Hat GmbH
+# Author: Stefano Brivio <sbrivio@redhat.com>
+
+g1tools	ip jq dhclient socat cat
+htools	ip jq
+
+set	MAP_HOST4 192.0.2.1
+set	MAP_HOST6 2001:db8:9a55::1
+set	MAP_NS4 192.0.2.2
+set	MAP_NS6 2001:db8:9a55::2
+set	RAMPS 6000000
+
+test	Interface name
+g1out	IFNAME1 ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname'
+hout	HOST_IFNAME ip -j -4 route show|jq -rM '[.[] | select(.dst == "default").dev] | .[0]'
+hout	HOST_IFNAME6 ip -j -6 route show|jq -rM '[.[] | select(.dst == "default").dev] | .[0]'
+check	[ -n "__IFNAME1__" ]
+
+test	DHCP: address
+guest1	ip link set dev __IFNAME1__ up
+guest1	/sbin/dhclient -4 __IFNAME1__
+g1out	ADDR1 ip -j -4 addr show|jq -rM '.[] | select(.ifname == "__IFNAME1__").addr_info[0].local'
+hout	HOST_ADDR ip -j -4 addr show|jq -rM '.[] | select(.ifname == "__HOST_IFNAME__").addr_info[0].local'
+check	[ "__ADDR1__" = "__HOST_ADDR__" ]
+
+test	DHCPv6: address
+# Link is up now, wait for DAD to complete
+guest1	while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done
+guest1	/sbin/dhclient -6 __IFNAME1__
+# Wait for DAD to complete on the DHCP address
+guest1	while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done
+g1out	ADDR1_6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__IFNAME1__").addr_info[] | select(.prefixlen == 128).local] | .[0]'
+hout	HOST_ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__HOST_IFNAME6__").addr_info[] | select(.scope == "global" and .deprecated != true).local] | .[0]'
+check	[ "__ADDR1_6__" = "__HOST_ADDR6__" ]
+
+test	TCP/IPv4: guest > host
+g1out	GW1 ip -j -4 route show|jq -rM '.[] | select(.dst == "default").gateway'
+hostb	socat -u TCP4-LISTEN:10006 EXEC:"test/rampstream check __RAMPS__"
+sleep	1
+guest1b	socat -u EXEC:"rampstream send __RAMPS__" TCP4:__MAP_HOST4__:10006
+sleep	1
+
+mon	echo "migrate tcp:0:20005" | socat -u STDIN UNIX:__STATESETUP__/qemu_1_mon.sock
+
+hostw
diff --git a/test/passt.mbuto b/test/passt.mbuto
index e45a284..5e00132 100755
--- a/test/passt.mbuto
+++ b/test/passt.mbuto
@@ -13,7 +13,8 @@
 PROGS="${PROGS:-ash,dash,bash ip mount ls insmod mkdir ln cat chmod lsmod
        modprobe find grep mknod mv rm umount jq iperf3 dhclient hostname
        sed tr chown sipcalc cut socat dd strace ping tail killall sleep sysctl
-       nproc tcp_rr tcp_crr udp_rr which tee seq bc sshd ssh-keygen cmp tcpdump env}"
+       nproc tcp_rr tcp_crr udp_rr which tee seq bc sshd ssh-keygen cmp tcpdump
+       env}"
 
 # OpenSSH 9.8 introduced split binaries, with sshd being the daemon, and
 # sshd-session the per-session program. We need the latter as well, and the path
@@ -31,7 +32,7 @@ LINKS="${LINKS:-
 
 DIRS="${DIRS} /tmp /usr/sbin /usr/share /var/log /var/lib /etc/ssh /run/sshd /root/.ssh"
 
-COPIES="${COPIES} small.bin,/root/small.bin medium.bin,/root/medium.bin big.bin,/root/big.bin"
+COPIES="${COPIES} small.bin,/root/small.bin medium.bin,/root/medium.bin big.bin,/root/big.bin rampstream,/bin/rampstream rampstream-check.sh,/bin/rampstream-check.sh"
 
 FIXUP="${FIXUP}"'
 	mv /sbin/* /usr/sbin || :
diff --git a/test/rampstream-check.sh b/test/rampstream-check.sh
new file mode 100755
index 0000000..c27acdb
--- /dev/null
+++ b/test/rampstream-check.sh
@@ -0,0 +1,3 @@
+#! /bin/sh
+
+(rampstream check "$@" 2>&1; echo $? > rampstream.status) | tee rampstream.err
diff --git a/test/rampstream.c b/test/rampstream.c
new file mode 100644
index 0000000..8d81296
--- /dev/null
+++ b/test/rampstream.c
@@ -0,0 +1,143 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+/* rampstream - Generate and check a stream of bytes in a ramp pattern
+ *
+ * Copyright Red Hat
+ * Author: David Gibson <david@gibson.dropbear.id.au>
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <sys/types.h>
+#include <unistd.h>
+#include <errno.h>
+#include <string.h>
+
+/* Length of the repeating ramp.  This is deliberately not a "round" number so
+ * that we're very likely to misalign with likely block or chunk sizes of the
+ * transport.  That means we'll detect gaps in the stream, even if they occur
+ * neatly on block boundaries.  Specifically this is the largest 8-bit prime. */
+#define RAMPLEN		251
+
+#define INTERVAL	10000
+
+#define	ARRAY_SIZE(a)	((int)(sizeof(a) / sizeof((a)[0])))
+
+#define die(...)						\
+	do {							\
+		fprintf(stderr, "rampstream: " __VA_ARGS__);	\
+		exit(1);					\
+	} while (0)
+
+static void usage(void)
+{
+	die("Usage:\n"
+	    "  rampstream send <number>\n"
+	    "    Generate a ramp pattern of bytes on stdout, repeated <number>\n"
+	    "    times\n"
+	    "  rampstream check <number>\n"
+	    "    Check a ramp pattern of bytes on stdin, repeated <number>\n"
+	    "    times\n");
+}
+
+static void ramp_send(unsigned long long num, const uint8_t *ramp)
+{
+	unsigned long long i;
+
+	for (i = 0; i < num; i++) {
+		int off = 0;
+		ssize_t rc;
+
+		if (i % INTERVAL == 0)
+			fprintf(stderr, "%llu...\r", i);
+
+		while (off < RAMPLEN) {
+			rc = write(1, ramp + off, RAMPLEN - off);
+			if (rc < 0) {
+				if (errno == EINTR ||
+				    errno == EAGAIN ||
+				    errno == EWOULDBLOCK)
+					continue;
+				die("Error writing ramp: %s\n",
+				    strerror(errno));
+			}
+			if (rc == 0)
+				die("Zero length write\n");
+			off += rc;
+		}
+	}
+}
+
+static void ramp_check(unsigned long long num, const uint8_t *ramp)
+{
+	unsigned long long i;
+
+	for (i = 0; i < num; i++) {
+		uint8_t buf[RAMPLEN];
+		int off = 0;
+		ssize_t rc;
+
+		if (i % INTERVAL == 0)
+			fprintf(stderr, "%llu...\r", i);
+		
+		while (off < RAMPLEN) {
+			rc = read(0, buf + off, RAMPLEN - off);
+			if (rc < 0) {
+				if (errno == EINTR ||
+				    errno == EAGAIN ||
+				    errno == EWOULDBLOCK)
+					continue;
+				die("Error reading ramp: %s\n",
+				    strerror(errno));
+			}
+			if (rc == 0)
+				die("Unexpected EOF, ramp %llu, byte %d\n",
+				    i, off);
+			off += rc;
+		}
+
+		if (memcmp(buf, ramp, sizeof(buf)) != 0) {
+			int j, k;
+
+			for (j = 0; j < RAMPLEN; j++)
+				if (buf[j] != ramp[j])
+					break;
+			for (k = j; k < RAMPLEN && k < j + 16; k++)
+				fprintf(stderr,
+					"Byte %d: expected 0x%02x, got 0x%02x\n",
+					k, ramp[k], buf[k]);
+			die("Data mismatch, ramp %llu, byte %d\n", i, j);
+		}
+	}
+}
+
+int main(int argc, char *argv[])
+{
+	const char *subcmd = argv[1];
+	unsigned long long num;
+	uint8_t ramp[RAMPLEN];
+	char *e;
+	int i;
+
+	if (argc < 2)
+		usage();
+
+	errno = 0;
+	num = strtoull(argv[2], &e, 0);
+	if (*e || errno)
+		usage();
+
+	/* Initialize the ramp block */
+	for (i = 0; i < RAMPLEN; i++)
+		ramp[i] = i;
+
+	if (strcmp(subcmd, "send") == 0)
+		ramp_send(num, ramp);
+	else if (strcmp(subcmd, "check") == 0)
+		ramp_check(num, ramp);
+	else
+		usage();
+
+	exit(0);
+}

From 9a84df4c3f9608c5e814f24ee3306a6c64a73edd Mon Sep 17 00:00:00 2001
From: Stefano Brivio <sbrivio@redhat.com>
Date: Thu, 13 Feb 2025 00:42:52 +0100
Subject: [PATCH 229/382] selinux: Add rules needed to run tests

...other than being convenient, they might be reasonably
representative of typical stand-alone usage.

Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 contrib/selinux/passt.te |  4 ++++
 contrib/selinux/pasta.te | 14 ++++++++++++--
 2 files changed, 16 insertions(+), 2 deletions(-)

diff --git a/contrib/selinux/passt.te b/contrib/selinux/passt.te
index c6cea34..6e7a4cb 100644
--- a/contrib/selinux/passt.te
+++ b/contrib/selinux/passt.te
@@ -20,6 +20,7 @@ require {
 	type fs_t;
 	type tmp_t;
 	type user_tmp_t;
+	type user_home_t;
 	type tmpfs_t;
 	type root_t;
 
@@ -80,6 +81,9 @@ allow passt_t root_t:dir mounton;
 allow passt_t tmp_t:dir { add_name mounton remove_name write };
 allow passt_t tmpfs_t:filesystem mount;
 allow passt_t fs_t:filesystem unmount;
+allow passt_t user_home_t:dir search;
+allow passt_t user_tmp_t:fifo_file append;
+allow passt_t user_tmp_t:file map;
 
 manage_files_pattern(passt_t, user_tmp_t, user_tmp_t)
 files_pid_filetrans(passt_t, user_tmp_t, file)
diff --git a/contrib/selinux/pasta.te b/contrib/selinux/pasta.te
index d0ff0cc..89c8043 100644
--- a/contrib/selinux/pasta.te
+++ b/contrib/selinux/pasta.te
@@ -18,6 +18,7 @@ require {
 	type bin_t;
 	type user_home_t;
 	type user_home_dir_t;
+	type user_tmp_t;
 	type fs_t;
 	type tmp_t;
 	type tmpfs_t;
@@ -56,8 +57,10 @@ require {
 	attribute port_type;
 	type port_t;
 	type http_port_t;
+	type http_cache_port_t;
 	type ssh_port_t;
 	type reserved_port_t;
+	type unreserved_port_t;
 	type dns_port_t;
 	type dhcpc_port_t;
 	type chronyd_port_t;
@@ -122,8 +125,8 @@ domain_auto_trans(pasta_t, ping_exec_t, ping_t);
 
 allow pasta_t nsfs_t:file { open read };
 
-allow pasta_t user_home_t:dir getattr;
-allow pasta_t user_home_t:file { open read getattr setattr };
+allow pasta_t user_home_t:dir { getattr search };
+allow pasta_t user_home_t:file { open read getattr setattr execute execute_no_trans map};
 allow pasta_t user_home_dir_t:dir { search getattr open add_name read write };
 allow pasta_t user_home_dir_t:file { create open read write };
 allow pasta_t tmp_t:dir { add_name mounton remove_name write };
@@ -133,6 +136,11 @@ allow pasta_t root_t:dir mounton;
 manage_files_pattern(pasta_t, pasta_pid_t, pasta_pid_t)
 files_pid_filetrans(pasta_t, pasta_pid_t, file)
 
+allow pasta_t user_tmp_t:dir { add_name remove_name search write };
+allow pasta_t user_tmp_t:fifo_file append;
+allow pasta_t user_tmp_t:file { create open write };
+allow pasta_t user_tmp_t:sock_file { create unlink };
+
 allow pasta_t console_device_t:chr_file { open write getattr ioctl };
 allow pasta_t user_devpts_t:chr_file { getattr read write ioctl };
 logging_send_syslog_msg(pasta_t)
@@ -160,6 +168,8 @@ allow pasta_t self:udp_socket create_stream_socket_perms;
 allow pasta_t reserved_port_t:udp_socket name_bind;
 allow pasta_t llmnr_port_t:tcp_socket name_bind;
 allow pasta_t llmnr_port_t:udp_socket name_bind;
+allow pasta_t http_cache_port_t:tcp_socket { name_bind name_connect };
+allow pasta_t unreserved_port_t:udp_socket name_bind;
 corenet_udp_sendrecv_generic_node(pasta_t)
 corenet_udp_bind_generic_node(pasta_t)
 allow pasta_t node_t:icmp_socket { name_bind node_bind };

From 98d474c8950e9cc5715d5686614fb0f504377303 Mon Sep 17 00:00:00 2001
From: Stefano Brivio <sbrivio@redhat.com>
Date: Thu, 13 Feb 2025 22:00:57 +0100
Subject: [PATCH 230/382] contrib/selinux: Enable mapping guest memory for
 libvirt guests

This doesn't actually belong to passt's own policy: we should export
an interface and libvirt's policy should use it, because passt's
policy shouldn't be aware of svirt_image_t at all.

However, libvirt doesn't maintain its own policy, which makes policy
updates rather involved. Add this workaround to ensure --vhost-user
is working in combination with libvirt, as it might take ages before
we can get the proper rule in libvirt's policy.

Reported-by: Laine Stump <laine@redhat.com>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 contrib/selinux/passt.te | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/contrib/selinux/passt.te b/contrib/selinux/passt.te
index 6e7a4cb..fc1320d 100644
--- a/contrib/selinux/passt.te
+++ b/contrib/selinux/passt.te
@@ -24,6 +24,12 @@ require {
 	type tmpfs_t;
 	type root_t;
 
+	# Workaround: passt --vhost-user needs to map guest memory, but
+	# libvirt doesn't maintain its own policy, which makes updates
+	# particularly complicated. To avoid breakage in the short term,
+	# deal with it in passt's own policy.
+	type svirt_image_t;
+
 	class file { ioctl getattr setattr create read write unlink open relabelto execute execute_no_trans map };
 	class dir { search write add_name remove_name mounton };
 	class chr_file { append read write open getattr ioctl };
@@ -131,3 +137,9 @@ allow passt_t user_tmp_t:dir { add_name write };
 allow passt_t user_tmp_t:file { create open };
 allow passt_t user_tmp_t:sock_file { create read write unlink };
 allow passt_t unconfined_t:unix_stream_socket { read write };
+
+# Workaround: passt --vhost-user needs to map guest memory, but
+# libvirt doesn't maintain its own policy, which makes updates
+# particularly complicated. To avoid breakage in the short term,
+# deal with it in passt's own policy.
+allow passt_t svirt_image_t:file { read write map };

From 30f1e082c3c0cee0a985b3c32e2b05280c596343 Mon Sep 17 00:00:00 2001
From: Stefano Brivio <sbrivio@redhat.com>
Date: Thu, 13 Feb 2025 16:24:55 +0100
Subject: [PATCH 231/382] tcp: Keep updating window and checking for socket
 data after FIN from guest

Once we get a FIN segment from the container/guest, we enter something
resembling CLOSE_WAIT (from the perspective of the peer), but that
doesn't mean that we should stop processing window updates from the
guest and checking for socket data if the guest acknowledges
something.

If we don't do that, we can very easily run into a situation where we
send a burst of data to the tap, get a zero window update, along with
a FIN segment, because the flow is meant to be unidirectional, and now
the connection will be stuck forever, because we'll ignore updates.

Reproducer, server:

  $ pasta --config-net -t 9999 -- sh -c 'echo DONE | socat TCP-LISTEN:9997,shut-down STDIO'

and client:

  $ ./test/rampstream send 50000 | socat -u STDIN TCP:$LOCAL_ADDR:9997
  2025/02/13 09:14:45 socat[2997126] E write(5, 0x55f5dbf47000, 8192): Broken pipe

While at it, update the message string for the third passive close
state (which we see in this case): it's CLOSE_WAIT, not LAST_ACK.

Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 tcp.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/tcp.c b/tcp.c
index a1d6c53..16d01f6 100644
--- a/tcp.c
+++ b/tcp.c
@@ -338,7 +338,7 @@ static const char *tcp_state_str[] __attribute((__unused__)) = {
 	"SYN_RCVD",	/* approximately maps to TAP_SYN_ACK_SENT */
 
 	/* Passive close: */
-	"CLOSE_WAIT", "CLOSE_WAIT", "LAST_ACK", "LAST_ACK", "LAST_ACK",
+	"CLOSE_WAIT", "CLOSE_WAIT", "CLOSE_WAIT", "LAST_ACK", "LAST_ACK",
 	/* Active close (+5): */
 	"CLOSING", "FIN_WAIT_1", "FIN_WAIT_1", "FIN_WAIT_2", "TIME_WAIT",
 };
@@ -1968,6 +1968,8 @@ int tcp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af,
 	/* Established connections not accepting data from tap */
 	if (conn->events & TAP_FIN_RCVD) {
 		tcp_update_seqack_from_tap(c, conn, ntohl(th->ack_seq));
+		tcp_tap_window_update(conn, ntohs(th->window));
+		tcp_data_from_sock(c, conn);
 
 		if (conn->events & SOCK_FIN_RCVD &&
 		    conn->seq_ack_from_tap == conn->seq_to_tap)

From 71249ef3f9bcf1dbb2d6c13cdbc41ba88c794f06 Mon Sep 17 00:00:00 2001
From: Stefano Brivio <sbrivio@redhat.com>
Date: Thu, 13 Feb 2025 20:54:04 +0100
Subject: [PATCH 232/382] tcp, tcp_splice: Don't set SO_SNDBUF and SO_RCVBUF to
 maximum values

I added this a long long time ago because it dramatically improved
throughput back then: with rmem_max and wmem_max >= 4 MiB, we would
force send and receive buffer sizes for TCP sockets to the maximum
allowed value.

This effectively disables TCP auto-tuning, which would otherwise allow
us to exceed those limits, as crazy as it might sound. But in any
case, it made sense.

Now that we have zero (internal) copies on every path, plus vhost-user
support, it turns out that these settings are entirely obsolete. I get
substantially the same throughput in every test we perform, even with
very short durations (one second).

The settings are not just useless: they actually cause us quite some
trouble on guest state migration, because they lead to huge queues
that need to be moved as well.

Drop those settings.
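
For reference, this is, in essence, what the dropped tcp_sock_set_bufsize()
did, and why it was sticky: once SO_RCVBUF (or SO_SNDBUF) is set
explicitly, the kernel stops auto-tuning that buffer for the socket, and
the value read back is the requested size clamped to net.core.rmem_max
(net.core.wmem_max) and then doubled. Illustrative sketch only, assuming
a TCP socket s:

  #include <limits.h>
  #include <stdio.h>
  #include <sys/socket.h>

  static void force_rcvbuf(int s)
  {
          int v = INT_MAX / 2;    /* kernel clamps and rounds */
          socklen_t sl = sizeof(v);

          if (setsockopt(s, SOL_SOCKET, SO_RCVBUF, &v, sizeof(v)))
                  perror("SO_RCVBUF");

          /* From now on, the receive buffer no longer grows or shrinks
           * with the connection.
           */
          if (!getsockopt(s, SOL_SOCKET, SO_RCVBUF, &v, &sl))
                  printf("effective receive buffer: %i\n", v);
  }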

Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 tcp.c        | 41 +++++++++--------------------------------
 tcp_conn.h   |  4 ++--
 tcp_splice.c |  6 +++---
 3 files changed, 14 insertions(+), 37 deletions(-)

diff --git a/tcp.c b/tcp.c
index 16d01f6..b978b30 100644
--- a/tcp.c
+++ b/tcp.c
@@ -738,24 +738,6 @@ static void tcp_get_sndbuf(struct tcp_tap_conn *conn)
 	SNDBUF_SET(conn, MIN(INT_MAX, v));
 }
 
-/**
- * tcp_sock_set_bufsize() - Set SO_RCVBUF and SO_SNDBUF to maximum values
- * @s:		Socket, can be -1 to avoid check in the caller
- */
-static void tcp_sock_set_bufsize(const struct ctx *c, int s)
-{
-	int v = INT_MAX / 2; /* Kernel clamps and rounds, no need to check */
-
-	if (s == -1)
-		return;
-
-	if (!c->low_rmem && setsockopt(s, SOL_SOCKET, SO_RCVBUF, &v, sizeof(v)))
-		trace("TCP: failed to set SO_RCVBUF to %i", v);
-
-	if (!c->low_wmem && setsockopt(s, SOL_SOCKET, SO_SNDBUF, &v, sizeof(v)))
-		trace("TCP: failed to set SO_SNDBUF to %i", v);
-}
-
 /**
  * tcp_sock_set_nodelay() - Set TCP_NODELAY option (disable Nagle's algorithm)
  * @s:		Socket, can be -1 to avoid check in the caller
@@ -1278,12 +1260,11 @@ int tcp_conn_pool_sock(int pool[])
 
 /**
  * tcp_conn_new_sock() - Open and prepare new socket for connection
- * @c:		Execution context
  * @af:		Address family
  *
  * Return: socket number on success, negative code if socket creation failed
  */
-static int tcp_conn_new_sock(const struct ctx *c, sa_family_t af)
+static int tcp_conn_new_sock(sa_family_t af)
 {
 	int s;
 
@@ -1297,7 +1278,6 @@ static int tcp_conn_new_sock(const struct ctx *c, sa_family_t af)
 	if (s < 0)
 		return -errno;
 
-	tcp_sock_set_bufsize(c, s);
 	tcp_sock_set_nodelay(s);
 
 	return s;
@@ -1305,12 +1285,11 @@ static int tcp_conn_new_sock(const struct ctx *c, sa_family_t af)
 
 /**
  * tcp_conn_sock() - Obtain a connectable socket in the host/init namespace
- * @c:		Execution context
  * @af:		Address family (AF_INET or AF_INET6)
  *
  * Return: Socket fd on success, -errno on failure
  */
-int tcp_conn_sock(const struct ctx *c, sa_family_t af)
+int tcp_conn_sock(sa_family_t af)
 {
 	int *pool = af == AF_INET6 ? init_sock_pool6 : init_sock_pool4;
 	int s;
@@ -1321,7 +1300,7 @@ int tcp_conn_sock(const struct ctx *c, sa_family_t af)
 	/* If the pool is empty we just open a new one without refilling the
 	 * pool to keep latency down.
 	 */
-	if ((s = tcp_conn_new_sock(c, af)) >= 0)
+	if ((s = tcp_conn_new_sock(af)) >= 0)
 		return s;
 
 	err("TCP: Unable to open socket for new connection: %s",
@@ -1462,7 +1441,7 @@ static void tcp_conn_from_tap(const struct ctx *c, sa_family_t af,
 		goto cancel;
 	}
 
-	if ((s = tcp_conn_sock(c, af)) < 0)
+	if ((s = tcp_conn_sock(af)) < 0)
 		goto cancel;
 
 	pif_sockaddr(c, &sa, &sl, PIF_HOST, &tgt->eaddr, tgt->eport);
@@ -1483,7 +1462,7 @@ static void tcp_conn_from_tap(const struct ctx *c, sa_family_t af,
 	} else {
 		/* Not a local, bound destination, inconclusive test */
 		close(s);
-		if ((s = tcp_conn_sock(c, af)) < 0)
+		if ((s = tcp_conn_sock(af)) < 0)
 			goto cancel;
 	}
 
@@ -2090,7 +2069,6 @@ void tcp_listen_handler(const struct ctx *c, union epoll_ref ref,
 	if (s < 0)
 		goto cancel;
 
-	tcp_sock_set_bufsize(c, s);
 	tcp_sock_set_nodelay(s);
 
 	/* FIXME: If useful: when the listening port has a specific bound
@@ -2434,13 +2412,12 @@ static int tcp_ns_socks_init(void *arg)
 
 /**
  * tcp_sock_refill_pool() - Refill one pool of pre-opened sockets
- * @c:		Execution context
  * @pool:	Pool of sockets to refill
  * @af:		Address family to use
  *
  * Return: 0 on success, negative error code if there was at least one error
  */
-int tcp_sock_refill_pool(const struct ctx *c, int pool[], sa_family_t af)
+int tcp_sock_refill_pool(int pool[], sa_family_t af)
 {
 	int i;
 
@@ -2450,7 +2427,7 @@ int tcp_sock_refill_pool(const struct ctx *c, int pool[], sa_family_t af)
 		if (pool[i] >= 0)
 			continue;
 
-		if ((fd = tcp_conn_new_sock(c, af)) < 0)
+		if ((fd = tcp_conn_new_sock(af)) < 0)
 			return fd;
 
 		pool[i] = fd;
@@ -2466,13 +2443,13 @@ int tcp_sock_refill_pool(const struct ctx *c, int pool[], sa_family_t af)
 static void tcp_sock_refill_init(const struct ctx *c)
 {
 	if (c->ifi4) {
-		int rc = tcp_sock_refill_pool(c, init_sock_pool4, AF_INET);
+		int rc = tcp_sock_refill_pool(init_sock_pool4, AF_INET);
 		if (rc < 0)
 			warn("TCP: Error refilling IPv4 host socket pool: %s",
 			     strerror_(-rc));
 	}
 	if (c->ifi6) {
-		int rc = tcp_sock_refill_pool(c, init_sock_pool6, AF_INET6);
+		int rc = tcp_sock_refill_pool(init_sock_pool6, AF_INET6);
 		if (rc < 0)
 			warn("TCP: Error refilling IPv6 host socket pool: %s",
 			     strerror_(-rc));
diff --git a/tcp_conn.h b/tcp_conn.h
index d342680..8c20805 100644
--- a/tcp_conn.h
+++ b/tcp_conn.h
@@ -143,8 +143,8 @@ bool tcp_flow_defer(const struct tcp_tap_conn *conn);
 bool tcp_splice_flow_defer(struct tcp_splice_conn *conn);
 void tcp_splice_timer(const struct ctx *c, struct tcp_splice_conn *conn);
 int tcp_conn_pool_sock(int pool[]);
-int tcp_conn_sock(const struct ctx *c, sa_family_t af);
-int tcp_sock_refill_pool(const struct ctx *c, int pool[], sa_family_t af);
+int tcp_conn_sock(sa_family_t af);
+int tcp_sock_refill_pool(int pool[], sa_family_t af);
 void tcp_splice_refill(const struct ctx *c);
 
 #endif /* TCP_CONN_H */
diff --git a/tcp_splice.c b/tcp_splice.c
index f048a82..f1a9223 100644
--- a/tcp_splice.c
+++ b/tcp_splice.c
@@ -351,7 +351,7 @@ static int tcp_splice_connect(const struct ctx *c, struct tcp_splice_conn *conn)
 	int one = 1;
 
 	if (tgtpif == PIF_HOST)
-		conn->s[1] = tcp_conn_sock(c, af);
+		conn->s[1] = tcp_conn_sock(af);
 	else if (tgtpif == PIF_SPLICE)
 		conn->s[1] = tcp_conn_sock_ns(c, af);
 	else
@@ -703,13 +703,13 @@ static int tcp_sock_refill_ns(void *arg)
 	ns_enter(c);
 
 	if (c->ifi4) {
-		int rc = tcp_sock_refill_pool(c, ns_sock_pool4, AF_INET);
+		int rc = tcp_sock_refill_pool(ns_sock_pool4, AF_INET);
 		if (rc < 0)
 			warn("TCP: Error refilling IPv4 ns socket pool: %s",
 			     strerror_(-rc));
 	}
 	if (c->ifi6) {
-		int rc = tcp_sock_refill_pool(c, ns_sock_pool6, AF_INET6);
+		int rc = tcp_sock_refill_pool(ns_sock_pool6, AF_INET6);
 		if (rc < 0)
 			warn("TCP: Error refilling IPv6 ns socket pool: %s",
 			     strerror_(-rc));

From 7c33b1208632a9581d0ee7aabd1e0584a5d1fb20 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Sat, 15 Feb 2025 00:08:41 +1100
Subject: [PATCH 233/382] vhost_user: Clear ring address on GET_VRING_BASE

GET_VRING_BASE stops the queue, clearing the call and kick fds.  However,
we don't clear vring.avail.  That means that if vu_queue_notify() is called
it won't realise the queue isn't ready and will die with an EBADFD.

We get this during migration, because for some reason, qemu reconfigures
the vhost-user device when a migration is triggered.  There's a window
between the GET_VRING_BASE and re-establishing the call fd where the
notify function can be called, causing a crash.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 vhost_user.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/vhost_user.c b/vhost_user.c
index 7ab1377..be1aa94 100644
--- a/vhost_user.c
+++ b/vhost_user.c
@@ -732,6 +732,7 @@ static bool vu_get_vring_base_exec(struct vu_dev *vdev,
 	msg->hdr.size = sizeof(msg->payload.state);
 
 	vdev->vq[idx].started = false;
+	vdev->vq[idx].vring.avail = 0;
 
 	if (vdev->vq[idx].call_fd != -1) {
 		close(vdev->vq[idx].call_fd);

From 667caa09c6d46d937b3076254176eded262b3eca Mon Sep 17 00:00:00 2001
From: Stefano Brivio <sbrivio@redhat.com>
Date: Sun, 16 Feb 2025 08:16:33 +0100
Subject: [PATCH 234/382] tcp_splice: Don't wake up on input data if we can't
 write it anywhere

If we set the OUT_WAIT_* flag (waiting on EPOLLOUT) for a side of a
given flow, it means that we're blocked, waiting for the receiver to
actually receive data, with a full pipe.

In that case, if we keep EPOLLIN set for the socket on the other side
(our receiving side), we'll get into a loop such as:

  41.0230:          pasta: epoll event on connected spliced TCP socket 108 (events: 0x00000001)
  41.0230:          Flow 1 (TCP connection (spliced)): -1 from read-side call
  41.0230:          Flow 1 (TCP connection (spliced)): -1 from write-side call (passed 8192)
  41.0230:          Flow 1 (TCP connection (spliced)): event at tcp_splice_sock_handler:577
  41.0230:          pasta: epoll event on connected spliced TCP socket 108 (events: 0x00000001)
  41.0230:          Flow 1 (TCP connection (spliced)): -1 from read-side call
  41.0230:          Flow 1 (TCP connection (spliced)): -1 from write-side call (passed 8192)
  41.0230:          Flow 1 (TCP connection (spliced)): event at tcp_splice_sock_handler:577

leading to 100% CPU usage, of course.

Drop EPOLLIN on our receiving side for as long as we're waiting for
output readiness on the other side.

Link: https://github.com/containers/podman/issues/23686#issuecomment-2661036584
Link: https://www.reddit.com/r/podman/comments/1iph50j/pasta_high_cpu_on_podman_rootless_container/
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 tcp_splice.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/tcp_splice.c b/tcp_splice.c
index f1a9223..8a39a6f 100644
--- a/tcp_splice.c
+++ b/tcp_splice.c
@@ -131,8 +131,12 @@ static void tcp_splice_conn_epoll_events(uint16_t events,
 		ev[1].events = EPOLLOUT;
 	}
 
-	flow_foreach_sidei(sidei)
-		ev[sidei].events |= (events & OUT_WAIT(sidei)) ? EPOLLOUT : 0;
+	flow_foreach_sidei(sidei) {
+		if (events & OUT_WAIT(sidei)) {
+			ev[sidei].events |= EPOLLOUT;
+			ev[!sidei].events &= ~EPOLLIN;
+		}
+	}
 }
 
 /**

From 01b6a164d94f26be7ad500f71210bdb888f416aa Mon Sep 17 00:00:00 2001
From: Stefano Brivio <sbrivio@redhat.com>
Date: Sun, 16 Feb 2025 08:31:13 +0100
Subject: [PATCH 235/382] tcp_splice: A typo three years ago and SO_RCVLOWAT is
 gone

In commit e5eefe77435a ("tcp: Refactor to use events instead of
states, split out spliced implementation"), this:

			if (!bitmap_isset(rcvlowat_set, conn - ts) &&
			    readlen > (long)c->tcp.pipe_size / 10) {

(note the !) became:

			if (conn->flags & lowat_set_flag &&
			    readlen > (long)c->tcp.pipe_size / 10) {

in the new tcp_splice_sock_handler().

The intention, there, is to set SO_RCVLOWAT only if we haven't set it
already.

But, instead, we check that it's already set before setting it, so
we'll never set it, of course.

Fix the check and re-enable the functionality, which should give us
improved CPU utilisation in non-interactive cases where we are not
transferring at full pipe capacity.
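
As a reminder of the mechanism: with SO_RCVLOWAT set on a TCP socket,
poll/epoll report the socket as readable only once at least that many
bytes are queued (Linux 2.6.28 and later), so a bulk transfer that isn't
filling the pipe wakes us up much less often. Illustrative fragment,
where s and pipe_size stand in for the spliced socket and pipe capacity:

  int lowat = pipe_size / 4;

  if (setsockopt(s, SOL_SOCKET, SO_RCVLOWAT, &lowat, sizeof(lowat)))
          perror("SO_RCVLOWAT");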

Fixes: e5eefe77435a ("tcp: Refactor to use events instead of states, split out spliced implementation")
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 tcp_splice.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tcp_splice.c b/tcp_splice.c
index 8a39a6f..5d845c9 100644
--- a/tcp_splice.c
+++ b/tcp_splice.c
@@ -556,7 +556,7 @@ eintr:
 			if (readlen >= (long)c->tcp.pipe_size * 10 / 100)
 				continue;
 
-			if (conn->flags & lowat_set_flag &&
+			if (!(conn->flags & lowat_set_flag) &&
 			    readlen > (long)c->tcp.pipe_size / 10) {
 				int lowat = c->tcp.pipe_size / 4;
 

From 3e903bbb1f386ebb892b1196d339d2d705bce8a2 Mon Sep 17 00:00:00 2001
From: Stefano Brivio <sbrivio@redhat.com>
Date: Sat, 15 Feb 2025 06:13:13 +0100
Subject: [PATCH 236/382] repair, passt-repair: Build and warning fixes for
 musl

Checked against musl 1.2.5.

Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
---
 passt-repair.c |  4 +++-
 repair.c       | 13 +++++++++----
 2 files changed, 12 insertions(+), 5 deletions(-)

diff --git a/passt-repair.c b/passt-repair.c
index 1174ae3..e0c366e 100644
--- a/passt-repair.c
+++ b/passt-repair.c
@@ -63,6 +63,7 @@ int main(int argc, char **argv)
 	struct cmsghdr *cmsg;
 	struct msghdr msg;
 	struct iovec iov;
+	size_t cmsg_len;
 	int op;
 
 	prctl(PR_SET_DUMPABLE, 0);
@@ -138,8 +139,9 @@ loop:
 		}
 	}
 	if (!n) {
+		cmsg_len = cmsg->cmsg_len; /* socklen_t is 'unsigned' on musl */
 		fprintf(stderr, "Invalid ancillary data length %zu from peer\n",
-			cmsg->cmsg_len);
+			cmsg_len);
 		_exit(1);
 	}
 
diff --git a/repair.c b/repair.c
index d288617..dac28a6 100644
--- a/repair.c
+++ b/repair.c
@@ -13,6 +13,7 @@
  */
 
 #include <errno.h>
+#include <sys/socket.h>
 #include <sys/uio.h>
 
 #include "util.h"
@@ -145,9 +146,9 @@ void repair_handler(struct ctx *c, uint32_t events)
  */
 int repair_flush(struct ctx *c)
 {
-	struct iovec iov = { &repair_cmd, sizeof(repair_cmd) };
 	char buf[CMSG_SPACE(sizeof(int) * SCM_MAX_FD)]
-	     __attribute__ ((aligned(__alignof__(struct cmsghdr))));
+	     __attribute__ ((aligned(__alignof__(struct cmsghdr)))) = { 0 };
+	struct iovec iov = { &repair_cmd, sizeof(repair_cmd) };
 	struct cmsghdr *cmsg;
 	struct msghdr msg;
 	int8_t reply;
@@ -155,8 +156,12 @@ int repair_flush(struct ctx *c)
 	if (!repair_nfds)
 		return 0;
 
-	msg = (struct msghdr){ NULL, 0, &iov, 1,
-			       buf, CMSG_SPACE(sizeof(int) * repair_nfds), 0 };
+	msg = (struct msghdr){ .msg_name = NULL, .msg_namelen = 0,
+			       .msg_iov = &iov, .msg_iovlen = 1,
+			       .msg_control = buf,
+			       .msg_controllen = CMSG_SPACE(sizeof(int) *
+							    repair_nfds),
+			       .msg_flags = 0 };
 	cmsg = CMSG_FIRSTHDR(&msg);
 
 	cmsg->cmsg_level = SOL_SOCKET;

From 89ecf2fd40adab549bdf25cdb68996f56d67b13e Mon Sep 17 00:00:00 2001
From: Stefano Brivio <sbrivio@redhat.com>
Date: Thu, 13 Feb 2025 23:14:13 +1100
Subject: [PATCH 237/382] migrate: Migrate TCP flows

This implements flow preparation on the source, transfer of data with
a format roughly inspired by struct tcp_tap_conn, plus a specific
structure for parameters that don't fit in the flow table, and flow
insertion on the target, with all the appropriate window options,
window scaling, MSS, etc.

Contents of pending queues are transferred as well.

The target side is rather convoluted because we first need to create
sockets and switch them to repair mode, before we can apply options
that are *not* stored in the flow table. This also means that, if
we're testing this on the same machine, in the same namespace, we need
to close the listening socket on the source before we can start moving
data.

Further, we need to connect() the socket on the target before we can
restore data queues, but (again, on the same machine) we can't do that
as long as the matching source socket is open. This implies an
arbitrary limit on the queue sizes we can transfer, because we can
only dump pending queues on the source while its socket is still open.
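
For orientation, the kernel interface both sides rely on is the
TCP_REPAIR family of socket options. On the source, dumping one queue
boils down to something like this (sketch only, error handling mostly
omitted, constants assumed to come from netinet/tcp.h and
linux/sockios.h); the target mirrors it by selecting the queue,
setting the sequence with setsockopt(TCP_QUEUE_SEQ), then connect(),
send() of the saved data, and finally clearing TCP_REPAIR:

	#include <stdint.h>
	#include <sys/ioctl.h>
	#include <sys/socket.h>
	#include <netinet/in.h>
	#include <netinet/tcp.h>
	#include <linux/sockios.h>

	/* Peek at the pending send queue of socket s in repair mode */
	static ssize_t dump_sndq(int s, uint32_t *seq, void *buf, size_t size)
	{
		int on = 1, q = TCP_SEND_QUEUE, len;
		socklen_t sl = sizeof(*seq);

		setsockopt(s, IPPROTO_TCP, TCP_REPAIR, &on, sizeof(on));
		setsockopt(s, IPPROTO_TCP, TCP_REPAIR_QUEUE, &q, sizeof(q));
		getsockopt(s, IPPROTO_TCP, TCP_QUEUE_SEQ, seq, &sl);

		if (ioctl(s, SIOCOUTQ, &len) || (size_t)len > size)
			return -1;

		return recv(s, buf, len, MSG_PEEK);	/* non-destructive */
	}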

Co-authored-by: David Gibson <david@gibson.dropbear.id.au>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
Tested-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 contrib/selinux/passt.te |   4 +-
 flow.c                   | 243 +++++++++++
 flow.h                   |   8 +
 migrate.c                |  10 +
 passt.c                  |   6 +-
 repair.c                 |   1 -
 tcp.c                    | 919 +++++++++++++++++++++++++++++++++++++++
 tcp_conn.h               | 103 +++++
 8 files changed, 1288 insertions(+), 6 deletions(-)

diff --git a/contrib/selinux/passt.te b/contrib/selinux/passt.te
index fc1320d..f595079 100644
--- a/contrib/selinux/passt.te
+++ b/contrib/selinux/passt.te
@@ -45,7 +45,7 @@ require {
 	type net_conf_t;
 	type proc_net_t;
 	type node_t;
-	class tcp_socket { create accept listen name_bind name_connect };
+	class tcp_socket { create accept listen name_bind name_connect getattr };
 	class udp_socket { create accept listen };
 	class icmp_socket { bind create name_bind node_bind setopt read write };
 	class sock_file { create unlink write };
@@ -129,7 +129,7 @@ corenet_udp_sendrecv_all_ports(passt_t)
 allow passt_t node_t:icmp_socket { name_bind node_bind };
 allow passt_t port_t:icmp_socket name_bind;
 
-allow passt_t self:tcp_socket { create getopt setopt connect bind listen accept shutdown read write };
+allow passt_t self:tcp_socket { create getopt setopt connect bind listen accept shutdown read write getattr };
 allow passt_t self:udp_socket { create getopt setopt connect bind read write };
 allow passt_t self:icmp_socket { bind create setopt read write };
 
diff --git a/flow.c b/flow.c
index 3ac551b..cc881e8 100644
--- a/flow.c
+++ b/flow.c
@@ -19,6 +19,7 @@
 #include "inany.h"
 #include "flow.h"
 #include "flow_table.h"
+#include "repair.h"
 
 const char *flow_state_str[] = {
 	[FLOW_STATE_FREE]	= "FREE",
@@ -52,6 +53,35 @@ const uint8_t flow_proto[] = {
 static_assert(ARRAY_SIZE(flow_proto) == FLOW_NUM_TYPES,
 	      "flow_proto[] doesn't match enum flow_type");
 
+#define foreach_flow(i, flow, bound)					\
+	for ((i) = 0, (flow) = &flowtab[(i)];				\
+	     (i) < (bound);						\
+	     (i)++, (flow) = &flowtab[(i)])				\
+		if ((flow)->f.state == FLOW_STATE_FREE)			\
+			(i) += (flow)->free.n - 1;			\
+		else
+
+#define foreach_active_flow(i, flow, bound)				\
+	foreach_flow((i), (flow), (bound))				\
+		if ((flow)->f.state != FLOW_STATE_ACTIVE)		\
+			/* NOLINTNEXTLINE(bugprone-branch-clone) */	\
+			continue;					\
+		else
+
+#define foreach_tcp_flow(i, flow, bound)				\
+	foreach_active_flow((i), (flow), (bound))			\
+		if ((flow)->f.type != FLOW_TCP)				\
+			/* NOLINTNEXTLINE(bugprone-branch-clone) */	\
+			continue;					\
+		else
+
+#define foreach_established_tcp_flow(i, flow, bound)			\
+	foreach_tcp_flow((i), (flow), (bound))				\
+		if (!tcp_flow_is_established(&(flow)->tcp))		\
+			/* NOLINTNEXTLINE(bugprone-branch-clone) */	\
+			continue;					\
+		else
+
 /* Global Flow Table */
 
 /**
@@ -874,6 +904,219 @@ void flow_defer_handler(const struct ctx *c, const struct timespec *now)
 	*last_next = FLOW_MAX;
 }
 
+/**
+ * flow_migrate_source_rollback() - Disable repair mode, return failure
+ * @c:		Execution context
+ * @max_flow:	Maximum index of affected flows
+ * @ret:	Negative error code
+ *
+ * Return: @ret
+ */
+static int flow_migrate_source_rollback(struct ctx *c, unsigned max_flow,
+					int ret)
+{
+	union flow *flow;
+	unsigned i;
+
+	debug("...roll back migration");
+
+	foreach_established_tcp_flow(i, flow, max_flow)
+		if (tcp_flow_repair_off(c, &flow->tcp))
+			die("Failed to roll back TCP_REPAIR mode");
+
+	if (repair_flush(c))
+		die("Failed to roll back TCP_REPAIR mode");
+
+	return ret;
+}
+
+/**
+ * flow_migrate_repair_all() - Turn repair mode on or off for all flows
+ * @c:		Execution context
+ * @enable:	Switch repair mode on if set, off otherwise
+ *
+ * Return: 0 on success, negative error code on failure
+ */
+static int flow_migrate_repair_all(struct ctx *c, bool enable)
+{
+	union flow *flow;
+	unsigned i;
+	int rc;
+
+	foreach_established_tcp_flow(i, flow, FLOW_MAX) {
+		if (enable)
+			rc = tcp_flow_repair_on(c, &flow->tcp);
+		else
+			rc = tcp_flow_repair_off(c, &flow->tcp);
+
+		if (rc) {
+			debug("Can't %s repair mode: %s",
+			      enable ? "enable" : "disable", strerror_(-rc));
+			return flow_migrate_source_rollback(c, i, rc);
+		}
+	}
+
+	if ((rc = repair_flush(c))) {
+		debug("Can't %s repair mode: %s",
+		      enable ? "enable" : "disable", strerror_(-rc));
+		return flow_migrate_source_rollback(c, i, rc);
+	}
+
+	return 0;
+}
+
+/**
+ * flow_migrate_source_pre() - Prepare flows for migration: enable repair mode
+ * @c:		Execution context
+ * @stage:	Migration stage information (unused)
+ * @fd:		Migration file descriptor (unused)
+ *
+ * Return: 0 on success, positive error code on failure
+ */
+int flow_migrate_source_pre(struct ctx *c, const struct migrate_stage *stage,
+			    int fd)
+{
+	int rc;
+
+	(void)stage;
+	(void)fd;
+
+	if ((rc = flow_migrate_repair_all(c, true)))
+		return -rc;
+
+	return 0;
+}
+
+/**
+ * flow_migrate_source() - Dump all the remaining information and send data
+ * @c:		Execution context (unused)
+ * @stage:	Migration stage information (unused)
+ * @fd:		Migration file descriptor
+ *
+ * Return: 0 on success, positive error code on failure
+ */
+int flow_migrate_source(struct ctx *c, const struct migrate_stage *stage,
+			int fd)
+{
+	uint32_t count = 0;
+	bool first = true;
+	union flow *flow;
+	unsigned i;
+	int rc;
+
+	(void)c;
+	(void)stage;
+
+	foreach_established_tcp_flow(i, flow, FLOW_MAX)
+		count++;
+
+	count = htonl(count);
+	if (write_all_buf(fd, &count, sizeof(count))) {
+		rc = errno;
+		err_perror("Can't send flow count (%u)", ntohl(count));
+		return flow_migrate_source_rollback(c, FLOW_MAX, rc);
+	}
+
+	debug("Sending %u flows", ntohl(count));
+
+	/* Dump and send information that can be stored in the flow table.
+	 *
+	 * Limited rollback options here: if we fail to transfer any data (that
+	 * is, on the first flow), undo everything and resume. Otherwise, the
+	 * stream might now be inconsistent, and we might have closed listening
+	 * TCP sockets, so just terminate.
+	 */
+	foreach_established_tcp_flow(i, flow, FLOW_MAX) {
+		rc = tcp_flow_migrate_source(fd, &flow->tcp);
+		if (rc) {
+			err("Can't send data, flow %u: %s", i, strerror_(-rc));
+			if (!first)
+				die("Inconsistent migration state, exiting");
+
+			return flow_migrate_source_rollback(c, FLOW_MAX, -rc);
+		}
+
+		first = false;
+	}
+
+	/* And then "extended" data (including window data we saved previously):
+	 * the target needs to set repair mode on sockets before it can set
+	 * this stuff, but it needs sockets (and flows) for that.
+	 *
+	 * This also closes sockets so that the target can start connecting
+	 * theirs: you can't sendmsg() to queues (using the socket) if the
+	 * socket is not connected (EPIPE), not even in repair mode. And the
+	 * target needs to restore queues now because we're sending the data.
+	 *
+	 * So, no rollback here, just try as hard as we can. Tolerate per-flow
+	 * failures but not if the stream might be inconsistent (reported here
+	 * as EIO).
+	 */
+	foreach_established_tcp_flow(i, flow, FLOW_MAX) {
+		rc = tcp_flow_migrate_source_ext(fd, i, &flow->tcp);
+		if (rc) {
+			err("Extended data for flow %u: %s", i, strerror_(-rc));
+
+			if (rc == -EIO)
+				die("Inconsistent migration state, exiting");
+		}
+	}
+
+	return 0;
+}
+
+/**
+ * flow_migrate_target() - Receive flows and insert in flow table
+ * @c:		Execution context
+ * @stage:	Migration stage information (unused)
+ * @fd:		Migration file descriptor
+ *
+ * Return: 0 on success, positive error code on failure
+ */
+int flow_migrate_target(struct ctx *c, const struct migrate_stage *stage,
+			int fd)
+{
+	uint32_t count;
+	unsigned i;
+	int rc;
+
+	(void)stage;
+
+	if (read_all_buf(fd, &count, sizeof(count)))
+		return errno;
+
+	count = ntohl(count);
+	debug("Receiving %u flows", count);
+
+	if ((rc = flow_migrate_repair_all(c, true)))
+		return -rc;
+
+	repair_flush(c);
+
+	/* TODO: flow header with type, instead? */
+	for (i = 0; i < count; i++) {
+		rc = tcp_flow_migrate_target(c, fd);
+		if (rc) {
+			debug("Migration data failure at flow %u: %s, abort",
+			      i, strerror_(-rc));
+			return -rc;
+		}
+	}
+
+	repair_flush(c);
+
+	for (i = 0; i < count; i++) {
+		rc = tcp_flow_migrate_target_ext(c, flowtab + i, fd);
+		if (rc) {
+			debug("Migration data failure at flow %u: %s, abort",
+			      i, strerror_(-rc));
+			return -rc;
+		}
+	}
+
+	return 0;
+}
+
 /**
  * flow_init() - Initialise flow related data structures
  */
diff --git a/flow.h b/flow.h
index 24ba3ef..675726e 100644
--- a/flow.h
+++ b/flow.h
@@ -249,6 +249,14 @@ union flow;
 
 void flow_init(void);
 void flow_defer_handler(const struct ctx *c, const struct timespec *now);
+int flow_migrate_source_early(struct ctx *c, const struct migrate_stage *stage,
+			      int fd);
+int flow_migrate_source_pre(struct ctx *c, const struct migrate_stage *stage,
+			    int fd);
+int flow_migrate_source(struct ctx *c, const struct migrate_stage *stage,
+			int fd);
+int flow_migrate_target(struct ctx *c, const struct migrate_stage *stage,
+			int fd);
 
 void flow_log_(const struct flow_common *f, int pri, const char *fmt, ...)
 	__attribute__((format(printf, 3, 4)));
diff --git a/migrate.c b/migrate.c
index 1c59016..0fca77b 100644
--- a/migrate.c
+++ b/migrate.c
@@ -103,6 +103,16 @@ static const struct migrate_stage stages_v1[] = {
 		.source = seen_addrs_source_v1,
 		.target = seen_addrs_target_v1,
 	},
+	{
+		.name = "prepare flows",
+		.source = flow_migrate_source_pre,
+		.target = NULL,
+	},
+	{
+		.name = "transfer flows",
+		.source = flow_migrate_source,
+		.target = flow_migrate_target,
+	},
 	{ 0 },
 };
 
diff --git a/passt.c b/passt.c
index 6f9fb4d..68d1a28 100644
--- a/passt.c
+++ b/passt.c
@@ -223,9 +223,6 @@ int main(int argc, char **argv)
 		if (sigaction(SIGCHLD, &sa, NULL))
 			die_perror("Couldn't install signal handlers");
 
-		if (signal(SIGPIPE, SIG_IGN) == SIG_ERR)
-			die_perror("Couldn't set disposition for SIGPIPE");
-
 		c.mode = MODE_PASTA;
 	} else if (strstr(name, "passt")) {
 		c.mode = MODE_PASST;
@@ -233,6 +230,9 @@ int main(int argc, char **argv)
 		_exit(EXIT_FAILURE);
 	}
 
+	if (signal(SIGPIPE, SIG_IGN) == SIG_ERR)
+		die_perror("Couldn't set disposition for SIGPIPE");
+
 	madvise(pkt_buf, TAP_BUF_BYTES, MADV_HUGEPAGE);
 
 	c.epollfd = epoll_create1(EPOLL_CLOEXEC);
diff --git a/repair.c b/repair.c
index dac28a6..3ee089f 100644
--- a/repair.c
+++ b/repair.c
@@ -202,7 +202,6 @@ int repair_flush(struct ctx *c)
  *
  * Return: 0 on success, negative error code on failure
  */
-/* cppcheck-suppress unusedFunction */
 int repair_set(struct ctx *c, int s, int cmd)
 {
 	int rc;
diff --git a/tcp.c b/tcp.c
index b978b30..98e1c6a 100644
--- a/tcp.c
+++ b/tcp.c
@@ -280,6 +280,7 @@
 #include <stddef.h>
 #include <string.h>
 #include <sys/epoll.h>
+#include <sys/ioctl.h>
 #include <sys/socket.h>
 #include <sys/timerfd.h>
 #include <sys/types.h>
@@ -287,6 +288,8 @@
 #include <time.h>
 #include <arpa/inet.h>
 
+#include <linux/sockios.h>
+
 #include "checksum.h"
 #include "util.h"
 #include "iov.h"
@@ -299,6 +302,7 @@
 #include "log.h"
 #include "inany.h"
 #include "flow.h"
+#include "repair.h"
 #include "linux_dep.h"
 
 #include "flow_table.h"
@@ -306,6 +310,21 @@
 #include "tcp_buf.h"
 #include "tcp_vu.h"
 
+#ifndef __USE_MISC
+/* From Linux UAPI, missing in netinet/tcp.h provided by musl */
+struct tcp_repair_opt {
+	__u32	opt_code;
+	__u32	opt_val;
+};
+
+enum {
+	TCP_NO_QUEUE,
+	TCP_RECV_QUEUE,
+	TCP_SEND_QUEUE,
+	TCP_QUEUES_NR,
+};
+#endif
+
 /* MSS rounding: see SET_MSS() */
 #define MSS_DEFAULT			536
 #define WINDOW_DEFAULT			14600		/* RFC 6928 */
@@ -326,6 +345,19 @@
 	 ((conn)->events & (SOCK_FIN_RCVD | TAP_FIN_RCVD)))
 #define CONN_HAS(conn, set)	(((conn)->events & (set)) == (set))
 
+/* Buffers to migrate pending data from send and receive queues. No, they don't
+ * use memory if we don't use them. And we're going away after this, so splurge.
+ */
+#define TCP_MIGRATE_SND_QUEUE_MAX	(64 << 20)
+#define TCP_MIGRATE_RCV_QUEUE_MAX	(64 << 20)
+uint8_t tcp_migrate_snd_queue		[TCP_MIGRATE_SND_QUEUE_MAX];
+uint8_t tcp_migrate_rcv_queue		[TCP_MIGRATE_RCV_QUEUE_MAX];
+
+#define TCP_MIGRATE_RESTORE_CHUNK_MIN	1024 /* Try smaller when above this */
+
+/* "Extended" data (not stored in the flow table) for TCP flow migration */
+static struct tcp_tap_transfer_ext migrate_ext[FLOW_MAX];
+
 static const char *tcp_event_str[] __attribute((__unused__)) = {
 	"SOCK_ACCEPTED", "TAP_SYN_RCVD", "ESTABLISHED", "TAP_SYN_ACK_SENT",
 
@@ -1468,6 +1500,7 @@ static void tcp_conn_from_tap(const struct ctx *c, sa_family_t af,
 
 	conn->sock = s;
 	conn->timer = -1;
+	conn->listening_sock = -1;
 	conn_event(c, conn, TAP_SYN_RCVD);
 
 	conn->wnd_to_tap = WINDOW_DEFAULT;
@@ -1968,10 +2001,27 @@ int tcp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af,
 		ack_due = 1;
 
 	if ((conn->events & TAP_FIN_RCVD) && !(conn->events & SOCK_FIN_SENT)) {
+		socklen_t sl;
+		struct tcp_info tinfo;
+
 		shutdown(conn->sock, SHUT_WR);
 		conn_event(c, conn, SOCK_FIN_SENT);
 		tcp_send_flag(c, conn, ACK);
 		ack_due = 0;
+
+		/* If we received a FIN, but the socket is in TCP_ESTABLISHED
+		 * state, it must be a migrated socket. The kernel saw the FIN
+		 * on the source socket, but not on the target socket.
+		 *
+		 * Approximate the effect of that FIN: as we're sending a FIN
+		 * out ourselves, the socket is now in a state equivalent to
+		 * LAST_ACK. Now that we sent the FIN out, close it with a RST.
+		 */
+		sl = sizeof(tinfo);
+		getsockopt(conn->sock, SOL_TCP, TCP_INFO, &tinfo, &sl);
+		if (tinfo.tcpi_state == TCP_ESTABLISHED &&
+		    conn->events & SOCK_FIN_RCVD)
+			goto reset;
 	}
 
 	if (ack_due)
@@ -2054,6 +2104,7 @@ static void tcp_tap_conn_from_sock(const struct ctx *c, union flow *flow,
 void tcp_listen_handler(const struct ctx *c, union epoll_ref ref,
 			const struct timespec *now)
 {
+	struct tcp_tap_conn *conn;
 	union sockaddr_inany sa;
 	socklen_t sl = sizeof(sa);
 	struct flowside *ini;
@@ -2069,6 +2120,9 @@ void tcp_listen_handler(const struct ctx *c, union epoll_ref ref,
 	if (s < 0)
 		goto cancel;
 
+	conn = (struct tcp_tap_conn *)flow;
+	conn->listening_sock = ref.fd;
+
 	tcp_sock_set_nodelay(s);
 
 	/* FIXME: If useful: when the listening port has a specific bound
@@ -2634,3 +2688,868 @@ void tcp_timer(struct ctx *c, const struct timespec *now)
 	if (c->mode == MODE_PASTA)
 		tcp_splice_refill(c);
 }
+
+/**
+ * tcp_flow_is_established() - Was the connection established? Includes closing
+ * @conn:	Pointer to the TCP connection structure
+ *
+ * Return: true if the connection was established, false otherwise
+ */
+bool tcp_flow_is_established(const struct tcp_tap_conn *conn)
+{
+	return conn->events & ESTABLISHED;
+}
+
+/**
+ * tcp_flow_repair_on() - Enable repair mode for a single TCP flow
+ * @c:		Execution context
+ * @conn:	Pointer to the TCP connection structure
+ *
+ * Return: 0 on success, negative error code on failure
+ */
+int tcp_flow_repair_on(struct ctx *c, const struct tcp_tap_conn *conn)
+{
+	int rc = 0;
+
+	if ((rc = repair_set(c, conn->sock, TCP_REPAIR_ON)))
+		err("Failed to set TCP_REPAIR");
+
+	return rc;
+}
+
+/**
+ * tcp_flow_repair_off() - Clear repair mode for a single TCP flow
+ * @c:		Execution context
+ * @conn:	Pointer to the TCP connection structure
+ *
+ * Return: 0 on success, negative error code on failure
+ */
+int tcp_flow_repair_off(struct ctx *c, const struct tcp_tap_conn *conn)
+{
+	int rc = 0;
+
+	if ((rc = repair_set(c, conn->sock, TCP_REPAIR_OFF)))
+		err("Failed to clear TCP_REPAIR");
+
+	return rc;
+}
+
+/**
+ * tcp_flow_dump_tinfo() - Dump window scale, tcpi_state, tcpi_options
+ * @c:		Execution context
+ * @t:		Extended migration data
+ *
+ * Return: 0 on success, negative error code on failure
+ */
+static int tcp_flow_dump_tinfo(int s, struct tcp_tap_transfer_ext *t)
+{
+	struct tcp_info tinfo;
+	socklen_t sl;
+
+	sl = sizeof(tinfo);
+	if (getsockopt(s, SOL_TCP, TCP_INFO, &tinfo, &sl)) {
+		int rc = -errno;
+		err_perror("Querying TCP_INFO, socket %i", s);
+		return rc;
+	}
+
+	t->snd_ws		= tinfo.tcpi_snd_wscale;
+	t->rcv_ws		= tinfo.tcpi_rcv_wscale;
+	t->tcpi_state		= tinfo.tcpi_state;
+	t->tcpi_options		= tinfo.tcpi_options;
+
+	return 0;
+}
+
+/**
+ * tcp_flow_dump_mss() - Dump MSS clamp (not current MSS) via TCP_MAXSEG
+ * @c:		Execution context
+ * @t:		Extended migration data
+ *
+ * Return: 0 on success, negative error code on failure
+ */
+static int tcp_flow_dump_mss(int s, struct tcp_tap_transfer_ext *t)
+{
+	socklen_t sl = sizeof(t->mss);
+
+	if (getsockopt(s, SOL_TCP, TCP_MAXSEG, &t->mss, &sl)) {
+		int rc = -errno;
+		err_perror("Getting MSS, socket %i", s);
+		return rc;
+	}
+
+	return 0;
+}
+
+/**
+ * tcp_flow_dump_wnd() - Dump current tcp_repair_window parameters
+ * @c:		Execution context
+ * @t:		Extended migration data
+ *
+ * Return: 0 on success, negative error code on failure
+ */
+static int tcp_flow_dump_wnd(int s, struct tcp_tap_transfer_ext *t)
+{
+	struct tcp_repair_window wnd;
+	socklen_t sl = sizeof(wnd);
+
+	if (getsockopt(s, IPPROTO_TCP, TCP_REPAIR_WINDOW, &wnd, &sl)) {
+		int rc = -errno;
+		err_perror("Getting window repair data, socket %i", s);
+		return rc;
+	}
+
+	t->snd_wl1	= wnd.snd_wl1;
+	t->snd_wnd	= wnd.snd_wnd;
+	t->max_window	= wnd.max_window;
+	t->rcv_wnd	= wnd.rcv_wnd;
+	t->rcv_wup	= wnd.rcv_wup;
+
+	/* If we received a FIN, we also need to adjust window parameters.
+	 *
+	 * This must be called after tcp_flow_dump_tinfo(), for t->tcpi_state.
+	 */
+	if (t->tcpi_state == TCP_CLOSE_WAIT || t->tcpi_state == TCP_LAST_ACK) {
+		t->rcv_wup--;
+		t->rcv_wnd++;
+	}
+
+	return 0;
+}
+
+/**
+ * tcp_flow_repair_wnd() - Restore window parameters from extended data
+ * @c:		Execution context
+ * @t:		Extended migration data
+ *
+ * Return: 0 on success, negative error code on failure
+ */
+static int tcp_flow_repair_wnd(int s, const struct tcp_tap_transfer_ext *t)
+{
+	struct tcp_repair_window wnd;
+
+	wnd.snd_wl1	= t->snd_wl1;
+	wnd.snd_wnd	= t->snd_wnd;
+	wnd.max_window	= t->max_window;
+	wnd.rcv_wnd	= t->rcv_wnd;
+	wnd.rcv_wup	= t->rcv_wup;
+
+	if (setsockopt(s, IPPROTO_TCP, TCP_REPAIR_WINDOW, &wnd, sizeof(wnd))) {
+		int rc = -errno;
+		err_perror("Setting window data, socket %i", s);
+		return rc;
+	}
+
+	return 0;
+}
+
+/**
+ * tcp_flow_select_queue() - Select queue (receive or send) for next operation
+ * @s:		Socket
+ * @queue:	TCP_RECV_QUEUE or TCP_SEND_QUEUE
+ *
+ * Return: 0 on success, negative error code on failure
+ */
+static int tcp_flow_select_queue(int s, int queue)
+{
+	if (setsockopt(s, SOL_TCP, TCP_REPAIR_QUEUE, &queue, sizeof(queue))) {
+		int rc = -errno;
+		err_perror("Selecting TCP_SEND_QUEUE, socket %i", s);
+		return rc;
+	}
+
+	return 0;
+}
+
+/**
+ * tcp_flow_dump_sndqueue() - Dump send queue, length of sent and not sent data
+ * @s:		Socket
+ * @t:		Extended migration data
+ *
+ * Return: 0 on success, negative error code on failure
+ *
+ * #syscalls:vu ioctl
+ */
+static int tcp_flow_dump_sndqueue(int s, struct tcp_tap_transfer_ext *t)
+{
+	ssize_t rc;
+
+	if (ioctl(s, SIOCOUTQ, &t->sndq) < 0) {
+		rc = -errno;
+		err_perror("Getting send queue size, socket %i", s);
+		return rc;
+	}
+
+	if (ioctl(s, SIOCOUTQNSD, &t->notsent) < 0) {
+		rc = -errno;
+		err_perror("Getting not sent count, socket %i", s);
+		return rc;
+	}
+
+	/* If we sent a FIN, SIOCOUTQ and SIOCOUTQNSD are one greater than the
+	 * actual pending queue length, because they are based on the sequence
+	 * numbers, not directly on the buffer contents.
+	 *
+	 * This must be called after tcp_flow_dump_tinfo(), for t->tcpi_state.
+	 */
+	if (t->tcpi_state == TCP_FIN_WAIT1 || t->tcpi_state == TCP_FIN_WAIT2 ||
+	    t->tcpi_state == TCP_LAST_ACK  || t->tcpi_state == TCP_CLOSING) {
+		if (t->sndq)
+			t->sndq--;
+		if (t->notsent)
+			t->notsent--;
+	}
+
+	if (t->notsent > t->sndq) {
+		err("Invalid notsent count socket %i, send: %u, not sent: %u",
+		    s, t->sndq, t->notsent);
+		return -EINVAL;
+	}
+
+	if (t->sndq > TCP_MIGRATE_SND_QUEUE_MAX) {
+		err("Send queue too large to migrate socket %i: %u bytes",
+		    s, t->sndq);
+		return -ENOBUFS;
+	}
+
+	rc = recv(s, tcp_migrate_snd_queue,
+		  MIN(t->sndq, TCP_MIGRATE_SND_QUEUE_MAX), MSG_PEEK);
+	if (rc < 0) {
+		if (errno == EAGAIN)  { /* EAGAIN means empty */
+			rc = 0;
+		} else {
+			rc = -errno;
+			err_perror("Can't read send queue, socket %i", s);
+			return rc;
+		}
+	}
+
+	if ((uint32_t)rc < t->sndq) {
+		err("Short read migrating send queue");
+		return -ENXIO;
+	}
+
+	t->notsent = MIN(t->notsent, t->sndq);
+
+	return 0;
+}
+
+/**
+ * tcp_flow_repair_queue() - Restore contents of a given (pre-selected) queue
+ * @s:		Socket
+ * @len:	Length of data to be restored
+ * @buf:	Buffer with content of pending data queue
+ *
+ * Return: 0 on success, negative error code on failure
+ */
+static int tcp_flow_repair_queue(int s, size_t len, uint8_t *buf)
+{
+	size_t chunk = len;
+	uint8_t *p = buf;
+
+	while (len > 0) {
+		ssize_t rc = send(s, p, MIN(len, chunk), 0);
+
+		if (rc < 0) {
+			if ((errno == ENOBUFS || errno == ENOMEM) &&
+			    chunk >= TCP_MIGRATE_RESTORE_CHUNK_MIN) {
+				chunk /= 2;
+				continue;
+			}
+
+			rc = -errno;
+			err_perror("Can't write queue, socket %i", s);
+			return rc;
+		}
+
+		len -= rc;
+		p += rc;
+	}
+
+	return 0;
+}
+
+/**
+ * tcp_flow_dump_seq() - Dump current sequence of pre-selected queue
+ * @s:		Socket
+ * @v:		Sequence value, set on return
+ *
+ * Return: 0 on success, negative error code on failure
+ */
+static int tcp_flow_dump_seq(int s, uint32_t *v)
+{
+	socklen_t sl = sizeof(*v);
+
+	if (getsockopt(s, SOL_TCP, TCP_QUEUE_SEQ, v, &sl)) {
+		int rc = -errno;
+		err_perror("Dumping sequence, socket %i", s);
+		return rc;
+	}
+
+	return 0;
+}
+
+/**
+ * tcp_flow_repair_seq() - Restore sequence for pre-selected queue
+ * @s:		Socket
+ * @v:		Sequence value to be set
+ *
+ * Return: 0 on success, negative error code on failure
+ */
+static int tcp_flow_repair_seq(int s, const uint32_t *v)
+{
+	if (setsockopt(s, SOL_TCP, TCP_QUEUE_SEQ, v, sizeof(*v))) {
+		int rc = -errno;
+		err_perror("Setting sequence, socket %i", s);
+		return rc;
+	}
+
+	return 0;
+}
+
+/**
+ * tcp_flow_dump_rcvqueue() - Dump receive queue and its length, seal/block it
+ * @s:		Socket
+ * @t:		Extended migration data
+ *
+ * Return: 0 on success, negative error code on failure
+ *
+ * #syscalls:vu ioctl
+ */
+static int tcp_flow_dump_rcvqueue(int s, struct tcp_tap_transfer_ext *t)
+{
+	ssize_t rc;
+
+	if (ioctl(s, SIOCINQ, &t->rcvq) < 0) {
+		rc = -errno;
+		err_perror("Get receive queue size, socket %i", s);
+		return rc;
+	}
+
+	/* If we received a FIN, SIOCINQ is one greater than the actual number
+	 * of bytes on the queue, because it's based on the sequence number
+	 * rather than directly on the buffer contents.
+	 *
+	 * This must be called after tcp_flow_dump_tinfo(), for t->tcpi_state.
+	 */
+	if (t->rcvq &&
+	    (t->tcpi_state == TCP_CLOSE_WAIT || t->tcpi_state == TCP_LAST_ACK))
+		t->rcvq--;
+
+	if (t->rcvq > TCP_MIGRATE_RCV_QUEUE_MAX) {
+		err("Receive queue too large to migrate socket %i: %u bytes",
+		    s, t->rcvq);
+		return -ENOBUFS;
+	}
+
+	rc = recv(s, tcp_migrate_rcv_queue, t->rcvq, MSG_PEEK);
+	if (rc < 0) {
+		if (errno == EAGAIN)  { /* EAGAIN means empty */
+			rc = 0;
+		} else {
+			rc = -errno;
+			err_perror("Can't read receive queue for socket %i", s);
+			return rc;
+		}
+	}
+
+	if ((uint32_t)rc < t->rcvq) {
+		err("Short read migrating receive queue");
+		return -ENXIO;
+	}
+
+	return 0;
+}
+
+/**
+ * tcp_flow_repair_opt() - Set repair "options" (MSS, scale, SACK, timestamps)
+ * @s:		Socket
+ * @t:		Extended migration data
+ *
+ * Return: 0 on success, negative error code on failure
+ */
+int tcp_flow_repair_opt(int s, const struct tcp_tap_transfer_ext *t)
+{
+	const struct tcp_repair_opt opts[] = {
+		{ TCPOPT_WINDOW,		t->snd_ws + (t->rcv_ws << 16) },
+		{ TCPOPT_MAXSEG,		t->mss },
+		{ TCPOPT_SACK_PERMITTED,	0 },
+		{ TCPOPT_TIMESTAMP,		0 },
+	};
+	socklen_t sl;
+
+	sl = sizeof(opts[0]) * (2 +
+				!!(t->tcpi_options & TCPI_OPT_SACK) +
+				!!(t->tcpi_options & TCPI_OPT_TIMESTAMPS));
+
+	if (setsockopt(s, SOL_TCP, TCP_REPAIR_OPTIONS, opts, sl)) {
+		int rc = -errno;
+		err_perror("Setting repair options, socket %i", s);
+		return rc;
+	}
+
+	return 0;
+}
+
+/**
+ * tcp_flow_migrate_source() - Send data (flow table) for flow, close listening
+ * @fd:		Descriptor for state migration
+ * @conn:	Pointer to the TCP connection structure
+ *
+ * Return: 0 on success, negative error code on failure
+ */
+int tcp_flow_migrate_source(int fd, struct tcp_tap_conn *conn)
+{
+	struct tcp_tap_transfer t = {
+		.retrans		= conn->retrans,
+		.ws_from_tap		= conn->ws_from_tap,
+		.ws_to_tap		= conn->ws_to_tap,
+		.events			= conn->events,
+
+		.tap_mss		= htonl(MSS_GET(conn)),
+
+		.sndbuf			= htonl(conn->sndbuf),
+
+		.flags			= conn->flags,
+		.seq_dup_ack_approx	= conn->seq_dup_ack_approx,
+
+		.wnd_from_tap		= htons(conn->wnd_from_tap),
+		.wnd_to_tap		= htons(conn->wnd_to_tap),
+
+		.seq_to_tap		= htonl(conn->seq_to_tap),
+		.seq_ack_from_tap	= htonl(conn->seq_ack_from_tap),
+		.seq_from_tap		= htonl(conn->seq_from_tap),
+		.seq_ack_to_tap		= htonl(conn->seq_ack_to_tap),
+		.seq_init_from_tap	= htonl(conn->seq_init_from_tap),
+	};
+
+	memcpy(&t.pif, conn->f.pif, sizeof(t.pif));
+	memcpy(&t.side, conn->f.side, sizeof(t.side));
+
+	if (write_all_buf(fd, &t, sizeof(t))) {
+		int rc = -errno;
+		err_perror("Can't write migration data, socket %i", conn->sock);
+		return rc;
+	}
+
+	if (conn->listening_sock != -1 && !fcntl(conn->listening_sock, F_GETFD))
+		close(conn->listening_sock);
+
+	return 0;
+}
+
+/**
+ * tcp_flow_migrate_source_ext() - Dump queues, close sockets, send final data
+ * @fd:		Descriptor for state migration
+ * @fidx:	Flow index
+ * @conn:	Pointer to the TCP connection structure
+ *
+ * Return: 0 on success, negative (not -EIO) on failure, -EIO on sending failure
+ */
+int tcp_flow_migrate_source_ext(int fd, int fidx,
+				const struct tcp_tap_conn *conn)
+{
+	uint32_t peek_offset = conn->seq_to_tap - conn->seq_ack_from_tap;
+	struct tcp_tap_transfer_ext *t = &migrate_ext[fidx];
+	int s = conn->sock;
+	int rc;
+
+	/* Disable SO_PEEK_OFF, it will make accessing the queues in repair mode
+	 * weird.
+	 */
+	if (tcp_set_peek_offset(s, -1)) {
+		rc = -errno;
+		goto fail;
+	}
+
+	if ((rc = tcp_flow_dump_tinfo(s, t)))
+		goto fail;
+
+	if ((rc = tcp_flow_dump_mss(s, t)))
+		goto fail;
+
+	if ((rc = tcp_flow_dump_wnd(s, t)))
+		goto fail;
+
+	if ((rc = tcp_flow_select_queue(s, TCP_SEND_QUEUE)))
+		goto fail;
+
+	if ((rc = tcp_flow_dump_sndqueue(s, t)))
+		goto fail;
+
+	if ((rc = tcp_flow_dump_seq(s, &t->seq_snd)))
+		goto fail;
+
+	if ((rc = tcp_flow_select_queue(s, TCP_RECV_QUEUE)))
+		goto fail;
+
+	if ((rc = tcp_flow_dump_rcvqueue(s, t)))
+		goto fail;
+
+	if ((rc = tcp_flow_dump_seq(s, &t->seq_rcv)))
+		goto fail;
+
+	close(s);
+
+	/* Adjustments unrelated to FIN segments: sequence numbers we dumped are
+	 * based on the end of the queues.
+	 */
+	t->seq_rcv	-= t->rcvq;
+	t->seq_snd	-= t->sndq;
+
+	debug("Extended migration data, socket %i sequences send %u receive %u",
+	      s, t->seq_snd, t->seq_rcv);
+	debug("  pending queues: send %u not sent %u receive %u",
+	      t->sndq, t->notsent, t->rcvq);
+	debug("  window: snd_wl1 %u snd_wnd %u max %u rcv_wnd %u rcv_wup %u",
+	      t->snd_wl1, t->snd_wnd, t->max_window, t->rcv_wnd, t->rcv_wup);
+	debug("  SO_PEEK_OFF %s  offset=%"PRIu32,
+	      peek_offset_cap ? "enabled" : "disabled", peek_offset);
+
+	/* Endianness fix-ups */
+	t->seq_snd	= htonl(t->seq_snd);
+	t->seq_rcv 	= htonl(t->seq_rcv);
+	t->sndq		= htonl(t->sndq);
+	t->notsent	= htonl(t->notsent);
+	t->rcvq		= htonl(t->rcvq);
+
+	t->snd_wl1	= htonl(t->snd_wl1);
+	t->snd_wnd	= htonl(t->snd_wnd);
+	t->max_window	= htonl(t->max_window);
+	t->rcv_wnd	= htonl(t->rcv_wnd);
+	t->rcv_wup	= htonl(t->rcv_wup);
+
+	if (write_all_buf(fd, t, sizeof(*t))) {
+		err_perror("Failed to write extended data, socket %i", s);
+		return -EIO;
+	}
+
+	if (write_all_buf(fd, tcp_migrate_snd_queue, ntohl(t->sndq))) {
+		err_perror("Failed to write send queue data, socket %i", s);
+		return -EIO;
+	}
+
+	if (write_all_buf(fd, tcp_migrate_rcv_queue, ntohl(t->rcvq))) {
+		err_perror("Failed to write receive queue data, socket %i", s);
+		return -EIO;
+	}
+
+	return 0;
+
+fail:
+	/* For any type of failure dumping data, write an invalid extended data
+	 * descriptor that allows us to keep the stream in sync, but tells the
+	 * target to skip the flow. If we fail to transfer data, that's fatal:
+	 * return -EIO in that case (and only in that case).
+	 */
+	t->tcpi_state = 0; /* Not defined: tell the target to skip this flow */
+
+	if (write_all_buf(fd, t, sizeof(*t))) {
+		err_perror("Failed to write extended data, socket %i", s);
+		return -EIO;
+	}
+
+	if (rc == -EIO) /* but not a migration data transfer failure */
+		return -ENODATA;
+
+	return rc;
+}
+
+/**
+ * tcp_flow_repair_socket() - Open and bind socket, request repair mode
+ * @c:		Execution context
+ * @conn:	Pointer to the TCP connection structure
+ *
+ * Return: 0 on success, negative error code on failure
+ */
+int tcp_flow_repair_socket(struct ctx *c, struct tcp_tap_conn *conn)
+{
+	sa_family_t af = CONN_V4(conn) ? AF_INET : AF_INET6;
+	const struct flowside *sockside = HOSTFLOW(conn);
+	union sockaddr_inany a;
+	socklen_t sl;
+	int s, rc;
+
+	pif_sockaddr(c, &a, &sl, PIF_HOST, &sockside->oaddr, sockside->oport);
+
+	if ((conn->sock = socket(af, SOCK_STREAM | SOCK_NONBLOCK | SOCK_CLOEXEC,
+				 IPPROTO_TCP)) < 0) {
+		rc = -errno;
+		err_perror("Failed to create socket for migrated flow");
+		return rc;
+	}
+	s = conn->sock;
+
+	if (setsockopt(s, SOL_SOCKET, SO_REUSEADDR, &(int){ 1 }, sizeof(int)))
+		debug_perror("Setting SO_REUSEADDR on socket %i", s);
+
+	tcp_sock_set_nodelay(s);
+
+	if ((rc = bind(s, &a.sa, sizeof(a)))) {
+		err_perror("Failed to bind socket for migrated flow");
+		goto err;
+	}
+
+	if ((rc = tcp_flow_repair_on(c, conn)))
+		goto err;
+
+	return 0;
+
+err:
+	close(s);
+	conn->sock = -1;
+	return rc;
+}
+
+/**
+ * tcp_flow_repair_connect() - Connect socket in repair mode, then turn it off
+ * @c:		Execution context
+ * @conn:	Pointer to the TCP connection structure
+ *
+ * Return: 0 on success, negative error code on failure
+ */
+static int tcp_flow_repair_connect(const struct ctx *c,
+				   struct tcp_tap_conn *conn)
+{
+	const struct flowside *tgt = HOSTFLOW(conn);
+	int rc;
+
+	rc = flowside_connect(c, conn->sock, PIF_HOST, tgt);
+	if (rc) {
+		rc = -errno;
+		err_perror("Failed to connect migrated socket %i", conn->sock);
+		return rc;
+	}
+
+	conn->in_epoll = 0;
+	conn->timer = -1;
+	conn->listening_sock = -1;
+
+	return 0;
+}
+
+/**
+ * tcp_flow_migrate_target() - Receive data (flow table part) for flow, insert
+ * @c:		Execution context
+ * @fd:		Descriptor for state migration
+ *
+ * Return: 0 on success, negative on fatal failure, but 0 on single flow failure
+ */
+int tcp_flow_migrate_target(struct ctx *c, int fd)
+{
+	struct tcp_tap_transfer t;
+	struct tcp_tap_conn *conn;
+	union flow *flow;
+	int rc;
+
+	if (!(flow = flow_alloc())) {
+		err("Flow table full on migration target");
+		return 0;
+	}
+
+	if (read_all_buf(fd, &t, sizeof(t))) {
+		flow_alloc_cancel(flow);
+		err_perror("Failed to receive migration data");
+		return -errno;
+	}
+
+	flow->f.state = FLOW_STATE_TGT;
+	memcpy(&flow->f.pif, &t.pif, sizeof(flow->f.pif));
+	memcpy(&flow->f.side, &t.side, sizeof(flow->f.side));
+	conn = FLOW_SET_TYPE(flow, FLOW_TCP, tcp);
+
+	conn->retrans			= t.retrans;
+	conn->ws_from_tap		= t.ws_from_tap;
+	conn->ws_to_tap			= t.ws_to_tap;
+	conn->events			= t.events;
+
+	conn->sndbuf			= htonl(t.sndbuf);
+
+	conn->flags			= t.flags;
+	conn->seq_dup_ack_approx	= t.seq_dup_ack_approx;
+
+	MSS_SET(conn,			  ntohl(t.tap_mss));
+
+	conn->wnd_from_tap		= ntohs(t.wnd_from_tap);
+	conn->wnd_to_tap		= ntohs(t.wnd_to_tap);
+
+	conn->seq_to_tap		= ntohl(t.seq_to_tap);
+	conn->seq_ack_from_tap		= ntohl(t.seq_ack_from_tap);
+	conn->seq_from_tap		= ntohl(t.seq_from_tap);
+	conn->seq_ack_to_tap		= ntohl(t.seq_ack_to_tap);
+	conn->seq_init_from_tap		= ntohl(t.seq_init_from_tap);
+
+	if ((rc = tcp_flow_repair_socket(c, conn))) {
+		flow_err(flow, "Can't set up socket: %s, drop", strerror_(rc));
+		flow_alloc_cancel(flow);
+		return 0;
+	}
+
+	flow_hash_insert(c, TAP_SIDX(conn));
+	FLOW_ACTIVATE(conn);
+
+	return 0;
+}
+
+/**
+ * tcp_flow_migrate_target_ext() - Receive extended data for flow, set, connect
+ * @c:		Execution context
+ * @flow:	Existing flow for this connection data
+ * @fd:		Descriptor for state migration
+ *
+ * Return: 0 on success, negative on fatal failure, but 0 on single flow failure
+ */
+int tcp_flow_migrate_target_ext(struct ctx *c, union flow *flow, int fd)
+{
+	struct tcp_tap_conn *conn = &flow->tcp;
+	uint32_t peek_offset = conn->seq_to_tap - conn->seq_ack_from_tap;
+	struct tcp_tap_transfer_ext t;
+	int s = conn->sock, rc;
+
+	if (read_all_buf(fd, &t, sizeof(t))) {
+		rc = -errno;
+		err_perror("Failed to read extended data for socket %i", s);
+		return rc;
+	}
+
+	if (!t.tcpi_state) { /* Source wants us to skip this flow */
+		flow_err(flow, "Dropping as requested by source");
+		goto fail;
+	}
+
+	/* Endianness fix-ups */
+	t.seq_snd	= ntohl(t.seq_snd);
+	t.seq_rcv 	= ntohl(t.seq_rcv);
+	t.sndq		= ntohl(t.sndq);
+	t.notsent	= ntohl(t.notsent);
+	t.rcvq		= ntohl(t.rcvq);
+
+	t.snd_wl1	= ntohl(t.snd_wl1);
+	t.snd_wnd	= ntohl(t.snd_wnd);
+	t.max_window	= ntohl(t.max_window);
+	t.rcv_wnd	= ntohl(t.rcv_wnd);
+	t.rcv_wup	= ntohl(t.rcv_wup);
+
+	debug("Extended migration data, socket %i sequences send %u receive %u",
+	      s, t.seq_snd, t.seq_rcv);
+	debug("  pending queues: send %u not sent %u receive %u",
+	      t.sndq, t.notsent, t.rcvq);
+	debug("  window: snd_wl1 %u snd_wnd %u max %u rcv_wnd %u rcv_wup %u",
+	      t.snd_wl1, t.snd_wnd, t.max_window, t.rcv_wnd, t.rcv_wup);
+	debug("  SO_PEEK_OFF %s  offset=%"PRIu32,
+	      peek_offset_cap ? "enabled" : "disabled", peek_offset);
+
+	if (t.sndq > TCP_MIGRATE_SND_QUEUE_MAX || t.notsent > t.sndq ||
+	    t.rcvq > TCP_MIGRATE_RCV_QUEUE_MAX) {
+		err("Bad queues socket %i, send: %u, not sent: %u, receive: %u",
+		    s, t.sndq, t.notsent, t.rcvq);
+		return -EINVAL;
+	}
+
+	if (read_all_buf(fd, tcp_migrate_snd_queue, t.sndq)) {
+		rc = -errno;
+		err_perror("Failed to read send queue data, socket %i", s);
+		return rc;
+	}
+
+	if (read_all_buf(fd, tcp_migrate_rcv_queue, t.rcvq)) {
+		rc = -errno;
+		err_perror("Failed to read receive queue data, socket %i", s);
+		return rc;
+	}
+
+	if (tcp_flow_select_queue(s, TCP_SEND_QUEUE))
+		goto fail;
+
+	if (tcp_flow_repair_seq(s, &t.seq_snd))
+		goto fail;
+
+	if (tcp_flow_select_queue(s, TCP_RECV_QUEUE))
+		goto fail;
+
+	if (tcp_flow_repair_seq(s, &t.seq_rcv))
+		goto fail;
+
+	if (tcp_flow_repair_connect(c, conn))
+		goto fail;
+
+	if (tcp_flow_repair_queue(s, t.rcvq, tcp_migrate_rcv_queue))
+		goto fail;
+
+	if (tcp_flow_select_queue(s, TCP_SEND_QUEUE))
+		goto fail;
+
+	if (tcp_flow_repair_queue(s, t.sndq - t.notsent,
+				  tcp_migrate_snd_queue))
+		goto fail;
+
+	if (tcp_flow_repair_opt(s, &t))
+		goto fail;
+
+	/* If we sent a FIN and it was acknowledged (TCP_FIN_WAIT2), don't
+	 * send it out, because we already sent it for sure.
+	 *
+	 * Call shutdown(x, SHUT_WR) in repair mode, so that we move to
+	 * FIN_WAIT_1 (tcp_shutdown()) without sending anything
+	 * (goto in tcp_write_xmit()).
+	 */
+	if (t.tcpi_state == TCP_FIN_WAIT2) {
+		int v;
+
+		v = TCP_SEND_QUEUE;
+		if (setsockopt(s, SOL_TCP, TCP_REPAIR_QUEUE, &v, sizeof(v)))
+			debug_perror("Selecting repair queue, socket %i", s);
+		else
+			shutdown(s, SHUT_WR);
+	}
+
+	if (tcp_flow_repair_wnd(s, &t))
+		goto fail;
+
+	tcp_flow_repair_off(c, conn);
+	repair_flush(c);
+
+	if (t.notsent) {
+		if (tcp_flow_repair_queue(s, t.notsent,
+					  tcp_migrate_snd_queue +
+					  (t.sndq - t.notsent))) {
+			/* This sometimes seems to fail for unclear reasons.
+			 * Don't fail the whole migration, just reset the flow
+			 * and carry on to the next one.
+			 */
+			goto fail;
+		}
+	}
+
+	/* If we sent a FIN but it wasn't acknowledged yet (TCP_FIN_WAIT1), send
+	 * it out, because we don't know if we already sent it.
+	 *
+	 * Call shutdown(x, SHUT_WR) *not* in repair mode, which moves us to
+	 * TCP_FIN_WAIT1.
+	 */
+	if (t.tcpi_state == TCP_FIN_WAIT1)
+		shutdown(s, SHUT_WR);
+
+	if (tcp_set_peek_offset(conn->sock, peek_offset))
+		goto fail;
+
+	tcp_send_flag(c, conn, ACK);
+	tcp_data_from_sock(c, conn);
+
+	if ((rc = tcp_epoll_ctl(c, conn))) {
+		debug("Failed to subscribe to epoll for migrated socket %i: %s",
+		      conn->sock, strerror_(-rc));
+		goto fail;
+	}
+
+	return 0;
+
+fail:
+	tcp_flow_repair_off(c, conn);
+	repair_flush(c);
+
+	conn->flags = 0; /* Not waiting for ACK, don't schedule timer */
+	tcp_rst(c, conn);
+
+	return 0;
+}
diff --git a/tcp_conn.h b/tcp_conn.h
index 8c20805..42dff48 100644
--- a/tcp_conn.h
+++ b/tcp_conn.h
@@ -19,6 +19,7 @@
  * @tap_mss:		MSS advertised by tap/guest, rounded to 2 ^ TCP_MSS_BITS
  * @sock:		Socket descriptor number
  * @events:		Connection events, implying connection states
+ * @listening_sock:	Listening socket this socket was accept()ed from, or -1
  * @timer:		timerfd descriptor for timeout events
  * @flags:		Connection flags representing internal attributes
  * @sndbuf:		Sending buffer in kernel, rounded to 2 ^ SNDBUF_BITS
@@ -68,6 +69,7 @@ struct tcp_tap_conn {
 #define	CONN_STATE_BITS		/* Setting these clears other flags */	\
 	(SOCK_ACCEPTED | TAP_SYN_RCVD | ESTABLISHED)
 
+	int		listening_sock;
 
 	int		timer		:FD_REF_BITS;
 
@@ -96,6 +98,93 @@ struct tcp_tap_conn {
 	uint32_t	seq_init_from_tap;
 };
 
+/**
+ * struct tcp_tap_transfer - Migrated TCP data, flow table part, network order
+ * @pif:		Interfaces for each side of the flow
+ * @side:		Addresses and ports for each side of the flow
+ * @retrans:		Number of retransmissions occurred due to ACK_TIMEOUT
+ * @ws_from_tap:	Window scaling factor advertised from tap/guest
+ * @ws_to_tap:		Window scaling factor advertised to tap/guest
+ * @events:		Connection events, implying connection states
+ * @tap_mss:		MSS advertised by tap/guest, rounded to 2 ^ TCP_MSS_BITS
+ * @sndbuf:		Sending buffer in kernel, rounded to 2 ^ SNDBUF_BITS
+ * @flags:		Connection flags representing internal attributes
+ * @seq_dup_ack_approx:	Last duplicate ACK number sent to tap
+ * @wnd_from_tap:	Last window size from tap, unscaled (as received)
+ * @wnd_to_tap:		Sending window advertised to tap, unscaled (as sent)
+ * @seq_to_tap:		Next sequence for packets to tap
+ * @seq_ack_from_tap:	Last ACK number received from tap
+ * @seq_from_tap:	Next sequence for packets from tap (not actually sent)
+ * @seq_ack_to_tap:	Last ACK number sent to tap
+ * @seq_init_from_tap:	Initial sequence number from tap
+*/
+struct tcp_tap_transfer {
+	uint8_t		pif[SIDES];
+	struct flowside	side[SIDES];
+
+	uint8_t		retrans;
+	uint8_t		ws_from_tap;
+	uint8_t		ws_to_tap;
+	uint8_t		events;
+
+	uint32_t	tap_mss;
+
+	uint32_t	sndbuf;
+
+	uint8_t		flags;
+	uint8_t		seq_dup_ack_approx;
+
+	uint16_t	wnd_from_tap;
+	uint16_t	wnd_to_tap;
+
+	uint32_t	seq_to_tap;
+	uint32_t	seq_ack_from_tap;
+	uint32_t	seq_from_tap;
+	uint32_t	seq_ack_to_tap;
+	uint32_t	seq_init_from_tap;
+} __attribute__((packed, aligned(__alignof__(uint32_t))));
+
+/**
+ * struct tcp_tap_transfer_ext - Migrated TCP data, outside flow, network order
+ * @seq_snd:		Socket-side send sequence
+ * @seq_rcv:		Socket-side receive sequence
+ * @sndq:		Length of pending send queue (unacknowledged / not sent)
+ * @notsent:		Part of pending send queue that wasn't sent out yet
+ * @rcvq:		Length of pending receive queue
+ * @mss:		Socket-side MSS clamp
+ * @snd_wl1:		Next sequence used in window probe (next sequence - 1)
+ * @snd_wnd:		Socket-side sending window
+ * @max_window:		Window clamp
+ * @rcv_wnd:		Socket-side receive window
+ * @rcv_wup:		rcv_nxt on last window update sent
+ * @snd_ws:		Window scaling factor, send
+ * @rcv_ws:		Window scaling factor, receive
+ * @tcpi_state:		Connection state in TCP_INFO style (enum, tcp_states.h)
+ * @tcpi_options:	TCPI_OPT_* constants (timestamps, selective ACK)
+ */
+struct tcp_tap_transfer_ext {
+	uint32_t	seq_snd;
+	uint32_t	seq_rcv;
+
+	uint32_t	sndq;
+	uint32_t	notsent;
+	uint32_t	rcvq;
+
+	uint32_t	mss;
+
+	/* We can't just use struct tcp_repair_window: we need network order */
+	uint32_t	snd_wl1;
+	uint32_t	snd_wnd;
+	uint32_t	max_window;
+	uint32_t	rcv_wnd;
+	uint32_t	rcv_wup;
+
+	uint8_t		snd_ws;
+	uint8_t		rcv_ws;
+	uint8_t		tcpi_state;
+	uint8_t		tcpi_options;
+} __attribute__((packed, aligned(__alignof__(uint32_t))));
+
 /**
  * struct tcp_splice_conn - Descriptor for a spliced TCP connection
  * @f:			Generic flow information
@@ -140,6 +229,20 @@ extern int init_sock_pool4	[TCP_SOCK_POOL_SIZE];
 extern int init_sock_pool6	[TCP_SOCK_POOL_SIZE];
 
 bool tcp_flow_defer(const struct tcp_tap_conn *conn);
+
+int tcp_flow_repair_on(struct ctx *c, const struct tcp_tap_conn *conn);
+int tcp_flow_repair_off(struct ctx *c, const struct tcp_tap_conn *conn);
+
+int tcp_flow_migrate_shrink_window(int fidx, const struct tcp_tap_conn *conn);
+int tcp_flow_migrate_source(int fd, struct tcp_tap_conn *conn);
+int tcp_flow_migrate_source_ext(int fd, int fidx,
+				const struct tcp_tap_conn *conn);
+
+int tcp_flow_migrate_target(struct ctx *c, int fd);
+int tcp_flow_migrate_target_ext(struct ctx *c, union flow *flow, int fd);
+
+bool tcp_flow_is_established(const struct tcp_tap_conn *conn);
+
 bool tcp_splice_flow_defer(struct tcp_splice_conn *conn);
 void tcp_splice_timer(const struct ctx *c, struct tcp_splice_conn *conn);
 int tcp_conn_pool_sock(int pool[]);

From a1e48a02ff3550eb7875a7df6726086e9b3a1213 Mon Sep 17 00:00:00 2001
From: Stefano Brivio <sbrivio@redhat.com>
Date: Thu, 13 Feb 2025 23:14:14 +1100
Subject: [PATCH 238/382] test: Add migration tests

PCAP=1 ./run migrate/bidirectional gives an overview of how the
whole thing works.

Add 12 tests in total, checking basic functionality with and without
flows in both directions, with and without sockets in half-closed
states (both inbound and outbound), migration behaviour under traffic
flood, under traffic flood with > 253 flows, and strict checking of
sequences under flood with ramp patterns in both directions.

These tests need preparation and teardown for each case, as we need
to restore the source guest in its own context and pane before we can
test again. Eventually, we could consider alternating source and
target so that we don't need to restart from scratch every time, but
that's beyond the scope of this initial test implementation.

Trick: './run migrate/*' runs all the tests with preparation and
teardown steps.

Co-authored-by: David Gibson <david@gibson.dropbear.id.au>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 test/lib/layout                |  57 +++++++++++++-
 test/lib/setup                 | 138 ++++++++++++++++++++++++++++++++-
 test/lib/test                  |  48 ++++++++++++
 test/migrate/basic             |  59 ++++++++++++++
 test/migrate/basic_fin         |  62 +++++++++++++++
 test/migrate/bidirectional     |  64 +++++++++++++++
 test/migrate/bidirectional_fin |  64 +++++++++++++++
 test/migrate/iperf3_bidir6     |  58 ++++++++++++++
 test/migrate/iperf3_in4        |  50 ++++++++++++
 test/migrate/iperf3_in6        |  58 ++++++++++++++
 test/migrate/iperf3_many_out6  |  60 ++++++++++++++
 test/migrate/iperf3_out4       |  47 +++++++++++
 test/migrate/iperf3_out6       |  58 ++++++++++++++
 test/migrate/rampstream_in     |  12 +--
 test/migrate/rampstream_out    |   8 +-
 test/run                       |  42 +++++++++-
 16 files changed, 871 insertions(+), 14 deletions(-)
 create mode 100644 test/migrate/basic
 create mode 100644 test/migrate/basic_fin
 create mode 100644 test/migrate/bidirectional
 create mode 100644 test/migrate/bidirectional_fin
 create mode 100644 test/migrate/iperf3_bidir6
 create mode 100644 test/migrate/iperf3_in4
 create mode 100644 test/migrate/iperf3_in6
 create mode 100644 test/migrate/iperf3_many_out6
 create mode 100644 test/migrate/iperf3_out4
 create mode 100644 test/migrate/iperf3_out6

diff --git a/test/lib/layout b/test/lib/layout
index 4d03572..fddcdc4 100644
--- a/test/lib/layout
+++ b/test/lib/layout
@@ -135,7 +135,7 @@ layout_two_guests() {
 	get_info_cols
 
 	pane_watch_contexts ${PANE_GUEST_1} "guest #1 in namespace #1" qemu_1 guest_1
-	pane_watch_contexts ${PANE_GUEST_2} "guest #2 in namespace #2" qemu_2 guest_2
+	pane_watch_contexts ${PANE_GUEST_2} "guest #2 in namespace #1" qemu_2 guest_2
 
 	tmux send-keys -l -t ${PANE_INFO} 'while cat '"$STATEBASE/log_pipe"'; do :; done'
 	tmux send-keys -t ${PANE_INFO} -N 100 C-m
@@ -143,13 +143,66 @@ layout_two_guests() {
 
 	pane_watch_contexts ${PANE_HOST} host host
 	pane_watch_contexts ${PANE_PASST_1} "passt #1 in namespace #1" pasta_1 passt_1
-	pane_watch_contexts ${PANE_PASST_2} "passt #2 in namespace #2" pasta_2 passt_2
+	pane_watch_contexts ${PANE_PASST_2} "passt #2 in namespace #1" pasta_1 passt_2
 
 	info_layout "two guests, two passt instances, in namespaces"
 
 	sleep 1
 }
 
+# layout_migrate() - Two guest panes, two passt panes, two passt-repair panes,
+#		     plus host and log
+layout_migrate() {
+	sleep 1
+
+	tmux kill-pane -a -t 0
+	cmd_write 0 clear
+
+	tmux split-window -v -t passt_test
+	tmux split-window -h -l '33%'
+	tmux split-window -h -t passt_test:1.1
+
+	tmux split-window -h -l '35%' -t passt_test:1.0
+	tmux split-window -v -t passt_test:1.0
+
+	tmux split-window -v -t passt_test:1.4
+	tmux split-window -v -t passt_test:1.6
+
+	tmux split-window -v -t passt_test:1.3
+
+	PANE_GUEST_1=0
+	PANE_GUEST_2=1
+	PANE_INFO=2
+	PANE_MON=3
+	PANE_HOST=4
+	PANE_PASST_REPAIR_1=5
+	PANE_PASST_1=6
+	PANE_PASST_REPAIR_2=7
+	PANE_PASST_2=8
+
+	get_info_cols
+
+	pane_watch_contexts ${PANE_GUEST_1} "guest #1 in namespace #1" qemu_1 guest_1
+	pane_watch_contexts ${PANE_GUEST_2} "guest #2 in namespace #2" qemu_2 guest_2
+
+	tmux send-keys -l -t ${PANE_INFO} 'while cat '"$STATEBASE/log_pipe"'; do :; done'
+	tmux send-keys -t ${PANE_INFO} -N 100 C-m
+	tmux select-pane -t ${PANE_INFO} -T "test log"
+
+	pane_watch_contexts ${PANE_MON} "QEMU monitor" mon mon
+
+	pane_watch_contexts ${PANE_HOST} host host
+	pane_watch_contexts ${PANE_PASST_REPAIR_1} "passt-repair #1 in namespace #1" repair_1 passt_repair_1
+	pane_watch_contexts ${PANE_PASST_1} "passt #1 in namespace #1" pasta_1 passt_1
+
+	pane_watch_contexts ${PANE_PASST_REPAIR_2} "passt-repair #2 in namespace #2" repair_2 passt_repair_2
+	pane_watch_contexts ${PANE_PASST_2} "passt #2 in namespace #2" pasta_2 passt_2
+
+	info_layout "two guests, two passt + passt-repair instances, in namespaces"
+
+	sleep 1
+}
+
 # layout_demo_pasta() - Four panes for pasta demo
 layout_demo_pasta() {
 	sleep 1
diff --git a/test/lib/setup b/test/lib/setup
index ee67152..575bc21 100755
--- a/test/lib/setup
+++ b/test/lib/setup
@@ -305,6 +305,117 @@ setup_two_guests() {
 	context_setup_guest guest_2 ${GUEST_2_CID}
 }
 
+# setup_migrate() - Set up two namespace, run qemu, passt/passt-repair in both
+setup_migrate() {
+	context_setup_host host
+	context_setup_host mon
+	context_setup_host pasta_1
+	context_setup_host pasta_2
+
+	layout_migrate
+
+	# Ports:
+	#
+	#         guest #1  |  guest #2 |   ns #1   |    host
+	#         --------- |-----------|-----------|------------
+	#  10001  as server |           | to guest  |  to ns #1
+	#  10002            |           | as server |  to ns #1
+	#  10003            |           |  to init  |  as server
+	#  10004            | as server | to guest  |  to ns #1
+
+	__opts=
+	[ ${PCAP} -eq 1 ] && __opts="${__opts} -p ${LOGDIR}/pasta_1.pcap"
+	[ ${DEBUG} -eq 1 ] && __opts="${__opts} -d"
+	[ ${TRACE} -eq 1 ] && __opts="${__opts} --trace"
+
+	__map_host4=192.0.2.1
+	__map_host6=2001:db8:9a55::1
+	__map_ns4=192.0.2.2
+	__map_ns6=2001:db8:9a55::2
+
+	# Option 1: send stuff via spliced path in pasta
+	# context_run_bg pasta_1 "./pasta ${__opts} -P ${STATESETUP}/pasta_1.pid -t 10001,10002 -T 10003 -u 10001,10002 -U 10003 --config-net ${NSTOOL} hold ${STATESETUP}/ns1.hold"
+	# Option 2: send stuff via tap (--map-guest-addr) instead (useful to see capture of full migration)
+	context_run_bg pasta_1 "./pasta ${__opts} -P ${STATESETUP}/pasta_1.pid -t 10001,10002,10004 -T 10003 -u 10001,10002,10004 -U 10003 --map-guest-addr ${__map_host4} --map-guest-addr ${__map_host6} --config-net ${NSTOOL} hold ${STATESETUP}/ns1.hold"
+	context_setup_nstool passt_1 ${STATESETUP}/ns1.hold
+	context_setup_nstool passt_repair_1 ${STATESETUP}/ns1.hold
+
+	context_setup_nstool passt_2 ${STATESETUP}/ns1.hold
+	context_setup_nstool passt_repair_2 ${STATESETUP}/ns1.hold
+
+	context_setup_nstool qemu_1 ${STATESETUP}/ns1.hold
+	context_setup_nstool qemu_2 ${STATESETUP}/ns1.hold
+
+	__ifname="$(context_run qemu_1 "ip -j link show | jq -rM '.[] | select(.link_type == \"ether\").ifname'")"
+
+	sleep 1
+
+	__opts="--vhost-user"
+	[ ${PCAP} -eq 1 ] && __opts="${__opts} -p ${LOGDIR}/passt_1.pcap"
+	[ ${DEBUG} -eq 1 ] && __opts="${__opts} -d"
+	[ ${TRACE} -eq 1 ] && __opts="${__opts} --trace"
+
+	context_run_bg passt_1 "./passt -s ${STATESETUP}/passt_1.socket -P ${STATESETUP}/passt_1.pid -f ${__opts} -t 10001 -u 10001"
+	wait_for [ -f "${STATESETUP}/passt_1.pid" ]
+
+	context_run_bg passt_repair_1 "./passt-repair ${STATESETUP}/passt_1.socket.repair"
+
+	__opts="--vhost-user"
+	[ ${PCAP} -eq 1 ] && __opts="${__opts} -p ${LOGDIR}/passt_2.pcap"
+	[ ${DEBUG} -eq 1 ] && __opts="${__opts} -d"
+	[ ${TRACE} -eq 1 ] && __opts="${__opts} --trace"
+
+	context_run_bg passt_2 "./passt -s ${STATESETUP}/passt_2.socket -P ${STATESETUP}/passt_2.pid -f ${__opts} -t 10004 -u 10004"
+	wait_for [ -f "${STATESETUP}/passt_2.pid" ]
+
+	context_run_bg passt_repair_2 "./passt-repair ${STATESETUP}/passt_2.socket.repair"
+
+	__vmem="512M"	# Keep migration fast
+	__qemu_netdev1="					       \
+		-chardev socket,id=c,path=${STATESETUP}/passt_1.socket \
+		-netdev vhost-user,id=v,chardev=c		       \
+		-device virtio-net,netdev=v			       \
+		-object memory-backend-memfd,id=m,share=on,size=${__vmem} \
+		-numa node,memdev=m"
+	__qemu_netdev2="					       \
+		-chardev socket,id=c,path=${STATESETUP}/passt_2.socket \
+		-netdev vhost-user,id=v,chardev=c		       \
+		-device virtio-net,netdev=v			       \
+		-object memory-backend-memfd,id=m,share=on,size=${__vmem} \
+		-numa node,memdev=m"
+
+	GUEST_1_CID=94557
+	context_run_bg qemu_1 'qemu-system-'"${QEMU_ARCH}"		     \
+		' -M accel=kvm:tcg'                                          \
+		' -m '${__vmem}' -cpu host -smp '${VCPUS}		     \
+		' -kernel '"${KERNEL}"					     \
+		' -initrd '${INITRAMFS}' -nographic -serial stdio'	     \
+		' -nodefaults'						     \
+		' -append "console=ttyS0 mitigations=off apparmor=0" '	     \
+		" ${__qemu_netdev1}"					     \
+		" -pidfile ${STATESETUP}/qemu_1.pid"			     \
+		" -device vhost-vsock-pci,guest-cid=$GUEST_1_CID"	     \
+		" -monitor unix:${STATESETUP}/qemu_1_mon.sock,server,nowait"
+
+	GUEST_2_CID=94558
+	context_run_bg qemu_2 'qemu-system-'"${QEMU_ARCH}"		     \
+		' -M accel=kvm:tcg'                                          \
+		' -m '${__vmem}' -cpu host -smp '${VCPUS}		     \
+		' -kernel '"${KERNEL}"					     \
+		' -initrd '${INITRAMFS}' -nographic -serial stdio'	     \
+		' -nodefaults'						     \
+		' -append "console=ttyS0 mitigations=off apparmor=0" '	     \
+		" ${__qemu_netdev2}"					     \
+		" -pidfile ${STATESETUP}/qemu_2.pid"			     \
+		" -device vhost-vsock-pci,guest-cid=$GUEST_2_CID"	     \
+		" -monitor unix:${STATESETUP}/qemu_2_mon.sock,server,nowait" \
+		" -incoming tcp:0:20005"
+
+	context_setup_guest guest_1 ${GUEST_1_CID}
+	# Only available after migration:
+	( context_setup_guest guest_2 ${GUEST_2_CID} & )
+}
+
 # teardown_context_watch() - Remove contexts and stop panes watching them
 # $1:	Pane number watching
 # $@:	Context names
@@ -375,7 +486,8 @@ teardown_two_guests() {
 	context_wait pasta_1
 	context_wait pasta_2
 
-	rm -f "${STATESETUP}/passt__[12].pid" "${STATESETUP}/pasta_[12].pid"
+	rm "${STATESETUP}/passt_1.pid" "${STATESETUP}/passt_2.pid"
+	rm "${STATESETUP}/pasta_1.pid" "${STATESETUP}/pasta_2.pid"
 
 	teardown_context_watch ${PANE_HOST} host
 	teardown_context_watch ${PANE_GUEST_1} qemu_1 guest_1
@@ -384,6 +496,30 @@ teardown_two_guests() {
 	teardown_context_watch ${PANE_PASST_2} pasta_2 passt_2
 }
 
+# teardown_migrate() - Exit namespaces, kill qemu processes, passt and pasta
+teardown_migrate() {
+	${NSTOOL} exec ${STATESETUP}/ns1.hold -- kill $(cat "${STATESETUP}/qemu_1.pid")
+	${NSTOOL} exec ${STATESETUP}/ns1.hold -- kill $(cat "${STATESETUP}/qemu_2.pid")
+	context_wait qemu_1
+	context_wait qemu_2
+
+	${NSTOOL} exec ${STATESETUP}/ns1.hold -- kill $(cat "${STATESETUP}/passt_2.pid")
+	context_wait passt_1
+	context_wait passt_2
+	${NSTOOL} stop "${STATESETUP}/ns1.hold"
+	context_wait pasta_1
+
+	rm -f "${STATESETUP}/passt_1.pid" "${STATESETUP}/passt_2.pid"
+	rm -f "${STATESETUP}/pasta_1.pid" "${STATESETUP}/pasta_2.pid"
+
+	teardown_context_watch ${PANE_HOST} host
+
+	teardown_context_watch ${PANE_GUEST_1} qemu_1 guest_1
+	teardown_context_watch ${PANE_GUEST_2} qemu_2 guest_2
+	teardown_context_watch ${PANE_PASST_1} pasta_1 passt_1
+	teardown_context_watch ${PANE_PASST_2} pasta_1 passt_2
+}
+
 # teardown_demo_passt() - Exit namespace, kill qemu, passt and pasta
 teardown_demo_passt() {
 	tmux send-keys -t ${PANE_GUEST} "C-c"
diff --git a/test/lib/test b/test/lib/test
index e6726be..758250a 100755
--- a/test/lib/test
+++ b/test/lib/test
@@ -68,6 +68,45 @@ test_iperf3() {
 	TEST_ONE_subs="$(list_add_pair "${TEST_ONE_subs}" "__${__var}__" "${__bw}" )"
 }
 
+# test_iperf3m() - Ugly helper for iperf3 directive, guest migration variant
+# $1:	Variable name: to put the measured bandwidth into
+# $2:	Initial source/client context
+# $3:	Second source/client context the guest is moving to
+# $4:	Destination name or address for client
+# $5:	Port number, ${i} is translated to process index
+# $6:	Run time, in seconds
+# $7:	Client options
+test_iperf3m() {
+	__var="${1}"; shift
+	__cctx="${1}"; shift
+	__cctx2="${1}"; shift
+	__dest="${1}"; shift
+	__port="${1}"; shift
+	__time="${1}"; shift
+
+	pane_or_context_run "${__cctx}" 'rm -f c.json'
+
+	# A 1s wait for connection on what's basically a local link
+	# indicates something is pretty wrong
+	__timeout=1000
+	pane_or_context_run_bg "${__cctx}" 				\
+		 'iperf3 -J -c '${__dest}' -p '${__port}		\
+		 '	 --connect-timeout '${__timeout}		\
+		 '	 -t'${__time}' -i0 '"${@}"' > c.json'
+
+	__jval=".end.sum_received.bits_per_second"
+
+	sleep $((${__time} + 3))
+
+	pane_or_context_output "${__cctx2}"				\
+		 'cat c.json'
+
+	__bw=$(pane_or_context_output "${__cctx2}"			\
+		 'cat c.json | jq -rMs "map('${__jval}') | add"')
+
+	TEST_ONE_subs="$(list_add_pair "${TEST_ONE_subs}" "__${__var}__" "${__bw}" )"
+}
+
 test_one_line() {
 	__line="${1}"
 
@@ -177,6 +216,12 @@ test_one_line() {
 	"guest2w")
 		pane_or_context_wait guest_2 || TEST_ONE_nok=1
 		;;
+	"mon")
+		pane_or_context_run mon "${__arg}" || TEST_ONE_nok=1
+		;;
+	"monb")
+		pane_or_context_run_bg mon "${__arg}"
+		;;
 	"ns")
 		pane_or_context_run ns "${__arg}" || TEST_ONE_nok=1
 		;;
@@ -292,6 +337,9 @@ test_one_line() {
 	"iperf3")
 		test_iperf3 ${__arg}
 		;;
+	"iperf3m")
+		test_iperf3m ${__arg}
+		;;
 	"set")
 		TEST_ONE_subs="$(list_add_pair "${TEST_ONE_subs}" "__${__arg%% *}__" "${__arg#* }")"
 		;;
diff --git a/test/migrate/basic b/test/migrate/basic
new file mode 100644
index 0000000..3f11f7d
--- /dev/null
+++ b/test/migrate/basic
@@ -0,0 +1,59 @@
+# SPDX-License-Identifier: GPL-2.0-or-later
+#
+# PASST - Plug A Simple Socket Transport
+#  for qemu/UNIX domain socket mode
+#
+# PASTA - Pack A Subtle Tap Abstraction
+#  for network namespace/tap device mode
+#
+# test/migrate/basic - Check basic migration functionality
+#
+# Copyright (c) 2025 Red Hat GmbH
+# Author: Stefano Brivio <sbrivio@redhat.com>
+
+g1tools	ip jq dhclient socat cat
+htools	ip jq
+
+set	MAP_HOST4 192.0.2.1
+set	MAP_HOST6 2001:db8:9a55::1
+set	MAP_NS4 192.0.2.2
+set	MAP_NS6 2001:db8:9a55::2
+
+test	Interface name
+g1out	IFNAME1 ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname'
+hout	HOST_IFNAME ip -j -4 route show|jq -rM '[.[] | select(.dst == "default").dev] | .[0]'
+hout	HOST_IFNAME6 ip -j -6 route show|jq -rM '[.[] | select(.dst == "default").dev] | .[0]'
+check	[ -n "__IFNAME1__" ]
+
+test	DHCP: address
+guest1	ip link set dev __IFNAME1__ up
+guest1	/sbin/dhclient -4 __IFNAME1__
+g1out	ADDR1 ip -j -4 addr show|jq -rM '.[] | select(.ifname == "__IFNAME1__").addr_info[0].local'
+hout	HOST_ADDR ip -j -4 addr show|jq -rM '.[] | select(.ifname == "__HOST_IFNAME__").addr_info[0].local'
+check	[ "__ADDR1__" = "__HOST_ADDR__" ]
+
+test	DHCPv6: address
+# Link is up now, wait for DAD to complete
+guest1	while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done
+guest1	/sbin/dhclient -6 __IFNAME1__
+# Wait for DAD to complete on the DHCP address
+guest1	while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done
+g1out	ADDR1_6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__IFNAME1__").addr_info[] | select(.prefixlen == 128).local] | .[0]'
+hout	HOST_ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__HOST_IFNAME6__").addr_info[] | select(.scope == "global" and .deprecated != true).local] | .[0]'
+check	[ "__ADDR1_6__" = "__HOST_ADDR6__" ]
+
+test	TCP/IPv4: guest1/guest2 > host
+g1out	GW1 ip -j -4 route show|jq -rM '.[] | select(.dst == "default").gateway'
+hostb	socat -u TCP4-LISTEN:10006 OPEN:__STATESETUP__/msg,create,trunc
+sleep	1
+# Option 1: via spliced path in pasta, namespace to host
+# guest1b	{ printf "Hello from guest 1"; sleep 10; printf " and from guest 2\n"; } | socat -u STDIN TCP4:__GW1__:10003
+# Option 2: via --map-guest-addr (tap) in pasta, namespace to host
+guest1b	{ printf "Hello from guest 1"; sleep 3; printf " and from guest 2\n"; } | socat -u STDIN TCP4:__MAP_HOST4__:10006
+sleep	1
+
+mon	echo "migrate tcp:0:20005" | socat -u STDIN UNIX:__STATESETUP__/qemu_1_mon.sock
+
+hostw
+hout	MSG cat __STATESETUP__/msg
+check	[ "__MSG__" = "Hello from guest 1 and from guest 2" ]
diff --git a/test/migrate/basic_fin b/test/migrate/basic_fin
new file mode 100644
index 0000000..aa61ec5
--- /dev/null
+++ b/test/migrate/basic_fin
@@ -0,0 +1,62 @@
+# SPDX-License-Identifier: GPL-2.0-or-later
+#
+# PASST - Plug A Simple Socket Transport
+#  for qemu/UNIX domain socket mode
+#
+# PASTA - Pack A Subtle Tap Abstraction
+#  for network namespace/tap device mode
+#
+# test/migrate/basic_fin - Outbound traffic across migration, half-closed socket
+#
+# Copyright (c) 2025 Red Hat GmbH
+# Author: Stefano Brivio <sbrivio@redhat.com>
+
+g1tools	ip jq dhclient socat cat
+htools	ip jq
+
+set	MAP_HOST4 192.0.2.1
+set	MAP_HOST6 2001:db8:9a55::1
+set	MAP_NS4 192.0.2.2
+set	MAP_NS6 2001:db8:9a55::2
+
+test	Interface name
+g1out	IFNAME1 ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname'
+hout	HOST_IFNAME ip -j -4 route show|jq -rM '[.[] | select(.dst == "default").dev] | .[0]'
+hout	HOST_IFNAME6 ip -j -6 route show|jq -rM '[.[] | select(.dst == "default").dev] | .[0]'
+check	[ -n "__IFNAME1__" ]
+
+test	DHCP: address
+guest1	ip link set dev __IFNAME1__ up
+guest1	/sbin/dhclient -4 __IFNAME1__
+g1out	ADDR1 ip -j -4 addr show|jq -rM '.[] | select(.ifname == "__IFNAME1__").addr_info[0].local'
+hout	HOST_ADDR ip -j -4 addr show|jq -rM '.[] | select(.ifname == "__HOST_IFNAME__").addr_info[0].local'
+check	[ "__ADDR1__" = "__HOST_ADDR__" ]
+
+test	DHCPv6: address
+# Link is up now, wait for DAD to complete
+guest1	while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done
+guest1	/sbin/dhclient -6 __IFNAME1__
+# Wait for DAD to complete on the DHCP address
+guest1	while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done
+g1out	ADDR1_6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__IFNAME1__").addr_info[] | select(.prefixlen == 128).local] | .[0]'
+hout	HOST_ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__HOST_IFNAME6__").addr_info[] | select(.scope == "global" and .deprecated != true).local] | .[0]'
+check	[ "__ADDR1_6__" = "__HOST_ADDR6__" ]
+
+test	TCP/IPv4: guest1, half-close, guest2 > host
+g1out	GW1 ip -j -4 route show|jq -rM '.[] | select(.dst == "default").gateway'
+
+hostb	echo FIN | socat TCP4-LISTEN:10006,shut-down STDIO,ignoreeof > __STATESETUP__/msg
+#hostb	socat -u TCP4-LISTEN:10006 OPEN:__STATESETUP__/msg,create,trunc
+
+#sleep	20
+# Option 1: via spliced path in pasta, namespace to host
+# guest1b	{ printf "Hello from guest 1"; sleep 10; printf " and from guest 2\n"; } | socat -u STDIN TCP4:__GW1__:10003
+# Option 2: via --map-guest-addr (tap) in pasta, namespace to host
+guest1b	{ printf "Hello from guest 1"; sleep 3; printf " and from guest 2\n"; } | socat -u STDIN TCP4:__MAP_HOST4__:10006
+sleep	1
+
+mon	echo "migrate tcp:0:20005" | socat -u STDIN UNIX:__STATESETUP__/qemu_1_mon.sock
+
+hostw
+hout	MSG cat __STATESETUP__/msg
+check	[ "__MSG__" = "Hello from guest 1 and from guest 2" ]
diff --git a/test/migrate/bidirectional b/test/migrate/bidirectional
new file mode 100644
index 0000000..4c04081
--- /dev/null
+++ b/test/migrate/bidirectional
@@ -0,0 +1,64 @@
+# SPDX-License-Identifier: GPL-2.0-or-later
+#
+# PASST - Plug A Simple Socket Transport
+#  for qemu/UNIX domain socket mode
+#
+# PASTA - Pack A Subtle Tap Abstraction
+#  for network namespace/tap device mode
+#
+# test/migrate/bidirectional - Check migration with messages in both directions
+#
+# Copyright (c) 2025 Red Hat GmbH
+# Author: Stefano Brivio <sbrivio@redhat.com>
+
+g1tools	ip jq dhclient socat cat
+htools	ip jq
+
+set	MAP_HOST4 192.0.2.1
+set	MAP_HOST6 2001:db8:9a55::1
+set	MAP_NS4 192.0.2.2
+set	MAP_NS6 2001:db8:9a55::2
+
+test	Interface name
+g1out	IFNAME1 ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname'
+hout	HOST_IFNAME ip -j -4 route show|jq -rM '[.[] | select(.dst == "default").dev] | .[0]'
+hout	HOST_IFNAME6 ip -j -6 route show|jq -rM '[.[] | select(.dst == "default").dev] | .[0]'
+check	[ -n "__IFNAME1__" ]
+
+test	DHCP: address
+guest1	ip link set dev __IFNAME1__ up
+guest1	/sbin/dhclient -4 __IFNAME1__
+g1out	ADDR1 ip -j -4 addr show|jq -rM '.[] | select(.ifname == "__IFNAME1__").addr_info[0].local'
+hout	HOST_ADDR ip -j -4 addr show|jq -rM '.[] | select(.ifname == "__HOST_IFNAME__").addr_info[0].local'
+check	[ "__ADDR1__" = "__HOST_ADDR__" ]
+
+test	TCP/IPv4: guest1/guest2 > host, host > guest1/guest2
+g1out	GW1 ip -j -4 route show|jq -rM '.[] | select(.dst == "default").gateway'
+
+hostb	socat -u TCP4-LISTEN:10006 OPEN:__STATESETUP__/msg,create,trunc
+guest1b	socat -u TCP4-LISTEN:10001 OPEN:msg,create,trunc
+sleep	1
+
+guest1b	socat -u UNIX-RECV:proxy.sock,null-eof TCP4:__MAP_HOST4__:10006
+hostb	socat -u UNIX-RECV:__STATESETUP__/proxy.sock,null-eof TCP4:__ADDR1__:10001
+sleep	1
+guest1	printf "Hello from guest 1" | socat -u STDIN UNIX:proxy.sock
+host	printf "Dear guest 1," | socat -u STDIN UNIX:__STATESETUP__/proxy.sock
+sleep	1
+
+mon	echo "migrate tcp:0:20005" | socat -u STDIN UNIX:__STATESETUP__/qemu_1_mon.sock
+
+sleep	2
+guest2	printf " and from guest 2" | socat -u STDIN UNIX:proxy.sock,shut-null
+host	printf " you are now guest 2" | socat -u STDIN UNIX:__STATESETUP__/proxy.sock,shut-null
+
+hostw
+# FIXME: guest2w doesn't work here because shell jobs are (also) from guest #1,
+# use sleep 1 for the moment
+sleep	1
+
+hout	MSG cat __STATESETUP__/msg
+check	[ "__MSG__" = "Hello from guest 1 and from guest 2" ]
+
+g2out	MSG cat msg
+check	[ "__MSG__" = "Dear guest 1, you are now guest 2" ]
diff --git a/test/migrate/bidirectional_fin b/test/migrate/bidirectional_fin
new file mode 100644
index 0000000..1c13527
--- /dev/null
+++ b/test/migrate/bidirectional_fin
@@ -0,0 +1,64 @@
+# SPDX-License-Identifier: GPL-2.0-or-later
+#
+# PASST - Plug A Simple Socket Transport
+#  for qemu/UNIX domain socket mode
+#
+# PASTA - Pack A Subtle Tap Abstraction
+#  for network namespace/tap device mode
+#
+# test/migrate/bidirectional_fin - Both directions, half-closed sockets
+#
+# Copyright (c) 2025 Red Hat GmbH
+# Author: Stefano Brivio <sbrivio@redhat.com>
+
+g1tools	ip jq dhclient socat cat
+htools	ip jq
+
+set	MAP_HOST4 192.0.2.1
+set	MAP_HOST6 2001:db8:9a55::1
+set	MAP_NS4 192.0.2.2
+set	MAP_NS6 2001:db8:9a55::2
+
+test	Interface name
+g1out	IFNAME1 ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname'
+hout	HOST_IFNAME ip -j -4 route show|jq -rM '[.[] | select(.dst == "default").dev] | .[0]'
+hout	HOST_IFNAME6 ip -j -6 route show|jq -rM '[.[] | select(.dst == "default").dev] | .[0]'
+check	[ -n "__IFNAME1__" ]
+
+test	DHCP: address
+guest1	ip link set dev __IFNAME1__ up
+guest1	/sbin/dhclient -4 __IFNAME1__
+g1out	ADDR1 ip -j -4 addr show|jq -rM '.[] | select(.ifname == "__IFNAME1__").addr_info[0].local'
+hout	HOST_ADDR ip -j -4 addr show|jq -rM '.[] | select(.ifname == "__HOST_IFNAME__").addr_info[0].local'
+check	[ "__ADDR1__" = "__HOST_ADDR__" ]
+
+test	TCP/IPv4: guest1/guest2 <- (half closed) -> host
+g1out	GW1 ip -j -4 route show|jq -rM '.[] | select(.dst == "default").gateway'
+
+hostb	echo FIN | socat TCP4-LISTEN:10006,shut-down STDIO,ignoreeof > __STATESETUP__/msg
+guest1b	echo FIN | socat TCP4-LISTEN:10001,shut-down STDIO,ignoreeof > msg
+sleep	1
+
+guest1b	socat -u UNIX-RECV:proxy.sock,null-eof TCP4:__MAP_HOST4__:10006
+hostb	socat -u UNIX-RECV:__STATESETUP__/proxy.sock,null-eof TCP4:__ADDR1__:10001
+sleep	1
+guest1	printf "Hello from guest 1" | socat -u STDIN UNIX:proxy.sock
+host	printf "Dear guest 1," | socat -u STDIN UNIX:__STATESETUP__/proxy.sock
+sleep	1
+
+mon	echo "migrate tcp:0:20005" | socat -u STDIN UNIX:__STATESETUP__/qemu_1_mon.sock
+
+sleep	2
+guest2	printf " and from guest 2" | socat -u STDIN UNIX:proxy.sock,shut-null
+host	printf " you are now guest 2" | socat -u STDIN UNIX:__STATESETUP__/proxy.sock,shut-null
+
+hostw
+# FIXME: guest2w doesn't work here because shell jobs are (also) from guest #1,
+# use sleep 1 for the moment
+sleep	1
+
+hout	MSG cat __STATESETUP__/msg
+check	[ "__MSG__" = "Hello from guest 1 and from guest 2" ]
+
+g2out	MSG cat msg
+check	[ "__MSG__" = "Dear guest 1, you are now guest 2" ]
diff --git a/test/migrate/iperf3_bidir6 b/test/migrate/iperf3_bidir6
new file mode 100644
index 0000000..4bfefb5
--- /dev/null
+++ b/test/migrate/iperf3_bidir6
@@ -0,0 +1,58 @@
+# SPDX-License-Identifier: GPL-2.0-or-later
+#
+# PASST - Plug A Simple Socket Transport
+#  for qemu/UNIX domain socket mode
+#
+# PASTA - Pack A Subtle Tap Abstraction
+#  for network namespace/tap device mode
+#
+# test/migrate/iperf3_bidir6 - Migration behaviour with many bidirectional flows
+#
+# Copyright (c) 2025 Red Hat GmbH
+# Author: Stefano Brivio <sbrivio@redhat.com>
+
+g1tools	ip jq dhclient socat cat
+htools	ip jq
+
+set	MAP_HOST4 192.0.2.1
+set	MAP_HOST6 2001:db8:9a55::1
+set	MAP_NS4 192.0.2.2
+set	MAP_NS6 2001:db8:9a55::2
+
+set	THREADS 128
+set	TIME 3
+set	OMIT 0.1
+set	OPTS -Z -P __THREADS__ -O__OMIT__ -N --bidir
+
+test	Interface name
+g1out	IFNAME1 ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname'
+hout	HOST_IFNAME ip -j -4 route show|jq -rM '[.[] | select(.dst == "default").dev] | .[0]'
+hout	HOST_IFNAME6 ip -j -6 route show|jq -rM '[.[] | select(.dst == "default").dev] | .[0]'
+check	[ -n "__IFNAME1__" ]
+
+test	DHCP: address
+guest1	ip link set dev __IFNAME1__ up
+guest1	/sbin/dhclient -4 __IFNAME1__
+g1out	ADDR1 ip -j -4 addr show|jq -rM '.[] | select(.ifname == "__IFNAME1__").addr_info[0].local'
+hout	HOST_ADDR ip -j -4 addr show|jq -rM '.[] | select(.ifname == "__HOST_IFNAME__").addr_info[0].local'
+check	[ "__ADDR1__" = "__HOST_ADDR__" ]
+
+test	DHCPv6: address
+# Link is up now, wait for DAD to complete
+guest1	while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done
+guest1	/sbin/dhclient -6 __IFNAME1__
+# Wait for DAD to complete on the DHCP address
+guest1	while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done
+g1out	ADDR1_6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__IFNAME1__").addr_info[] | select(.prefixlen == 128).local] | .[0]'
+hout	HOST_ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__HOST_IFNAME6__").addr_info[] | select(.scope == "global" and .deprecated != true).local] | .[0]'
+check	[ "__ADDR1_6__" = "__HOST_ADDR6__" ]
+
+test	TCP/IPv6 host <-> guest flood, many flows, during migration
+
+monb	sleep 1; echo "migrate tcp:0:20005" | socat -u STDIN UNIX:__STATESETUP__/qemu_1_mon.sock
+
+iperf3s	host 10006
+iperf3m	BW guest_1 guest_2 __MAP_HOST6__ 10006 __TIME__ __OPTS__
+bw	__BW__ 1 2
+
+iperf3k	host
diff --git a/test/migrate/iperf3_in4 b/test/migrate/iperf3_in4
new file mode 100644
index 0000000..c5f3916
--- /dev/null
+++ b/test/migrate/iperf3_in4
@@ -0,0 +1,50 @@
+# SPDX-License-Identifier: GPL-2.0-or-later
+#
+# PASST - Plug A Simple Socket Transport
+#  for qemu/UNIX domain socket mode
+#
+# PASTA - Pack A Subtle Tap Abstraction
+#  for network namespace/tap device mode
+#
+# test/migrate/iperf3_in4 - Migration behaviour under inbound IPv4 flood
+#
+# Copyright (c) 2025 Red Hat GmbH
+# Author: Stefano Brivio <sbrivio@redhat.com>
+
+g1tools	ip jq dhclient socat cat
+htools	ip jq
+
+set	MAP_HOST4 192.0.2.1
+set	MAP_HOST6 2001:db8:9a55::1
+set	MAP_NS4 192.0.2.2
+set	MAP_NS6 2001:db8:9a55::2
+
+guest1	/sbin/sysctl -w net.core.rmem_max=33554432
+guest1	/sbin/sysctl -w net.core.wmem_max=33554432
+
+set	THREADS 1
+set	TIME 4
+set	OMIT 0.1
+set	OPTS -Z -P __THREADS__ -O__OMIT__ -N -R
+
+test	Interface name
+g1out	IFNAME1 ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname'
+hout	HOST_IFNAME ip -j -4 route show|jq -rM '[.[] | select(.dst == "default").dev] | .[0]'
+check	[ -n "__IFNAME1__" ]
+
+test	DHCP: address
+guest1	ip link set dev __IFNAME1__ up
+guest1	/sbin/dhclient -4 __IFNAME1__
+g1out	ADDR1 ip -j -4 addr show|jq -rM '.[] | select(.ifname == "__IFNAME1__").addr_info[0].local'
+hout	HOST_ADDR ip -j -4 addr show|jq -rM '.[] | select(.ifname == "__HOST_IFNAME__").addr_info[0].local'
+check	[ "__ADDR1__" = "__HOST_ADDR__" ]
+
+test	TCP/IPv4 host to guest throughput during migration
+
+monb	sleep 1; echo "migrate tcp:0:20005" | socat -u STDIN UNIX:__STATESETUP__/qemu_1_mon.sock
+
+iperf3s	host 10006
+iperf3m	BW guest_1 guest_2 __MAP_HOST4__ 10006 __TIME__ __OPTS__
+bw	__BW__ 1 2
+
+iperf3k	host
diff --git a/test/migrate/iperf3_in6 b/test/migrate/iperf3_in6
new file mode 100644
index 0000000..16cf504
--- /dev/null
+++ b/test/migrate/iperf3_in6
@@ -0,0 +1,58 @@
+# SPDX-License-Identifier: GPL-2.0-or-later
+#
+# PASST - Plug A Simple Socket Transport
+#  for qemu/UNIX domain socket mode
+#
+# PASTA - Pack A Subtle Tap Abstraction
+#  for network namespace/tap device mode
+#
+# test/migrate/iperf3_in6 - Migration behaviour under inbound IPv6 flood
+#
+# Copyright (c) 2025 Red Hat GmbH
+# Author: Stefano Brivio <sbrivio@redhat.com>
+
+g1tools	ip jq dhclient socat cat
+htools	ip jq
+
+set	MAP_HOST4 192.0.2.1
+set	MAP_HOST6 2001:db8:9a55::1
+set	MAP_NS4 192.0.2.2
+set	MAP_NS6 2001:db8:9a55::2
+
+set	THREADS 4
+set	TIME 3
+set	OMIT 0.1
+set	OPTS -Z -P __THREADS__ -O__OMIT__ -N -R
+
+test	Interface name
+g1out	IFNAME1 ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname'
+hout	HOST_IFNAME ip -j -4 route show|jq -rM '[.[] | select(.dst == "default").dev] | .[0]'
+hout	HOST_IFNAME6 ip -j -6 route show|jq -rM '[.[] | select(.dst == "default").dev] | .[0]'
+check	[ -n "__IFNAME1__" ]
+
+test	DHCP: address
+guest1	ip link set dev __IFNAME1__ up
+guest1	/sbin/dhclient -4 __IFNAME1__
+g1out	ADDR1 ip -j -4 addr show|jq -rM '.[] | select(.ifname == "__IFNAME1__").addr_info[0].local'
+hout	HOST_ADDR ip -j -4 addr show|jq -rM '.[] | select(.ifname == "__HOST_IFNAME__").addr_info[0].local'
+check	[ "__ADDR1__" = "__HOST_ADDR__" ]
+
+test	DHCPv6: address
+# Link is up now, wait for DAD to complete
+guest1	while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done
+guest1	/sbin/dhclient -6 __IFNAME1__
+# Wait for DAD to complete on the DHCP address
+guest1	while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done
+g1out	ADDR1_6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__IFNAME1__").addr_info[] | select(.prefixlen == 128).local] | .[0]'
+hout	HOST_ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__HOST_IFNAME6__").addr_info[] | select(.scope == "global" and .deprecated != true).local] | .[0]'
+check	[ "__ADDR1_6__" = "__HOST_ADDR6__" ]
+
+test	TCP/IPv6 host to guest throughput during migration
+
+monb	sleep 1; echo "migrate tcp:0:20005" | socat -u STDIN UNIX:__STATESETUP__/qemu_1_mon.sock
+
+iperf3s	host 10006
+iperf3m	BW guest_1 guest_2 __MAP_HOST6__ 10006 __TIME__ __OPTS__
+bw	__BW__ 1 2
+
+iperf3k	host
diff --git a/test/migrate/iperf3_many_out6 b/test/migrate/iperf3_many_out6
new file mode 100644
index 0000000..88133f2
--- /dev/null
+++ b/test/migrate/iperf3_many_out6
@@ -0,0 +1,60 @@
+# SPDX-License-Identifier: GPL-2.0-or-later
+#
+# PASST - Plug A Simple Socket Transport
+#  for qemu/UNIX domain socket mode
+#
+# PASTA - Pack A Subtle Tap Abstraction
+#  for network namespace/tap device mode
+#
+# test/migrate/iperf3_many_out6 - Migration behaviour with many outbound flows
+#
+# Copyright (c) 2025 Red Hat GmbH
+# Author: Stefano Brivio <sbrivio@redhat.com>
+
+g1tools	ip jq dhclient socat cat
+htools	ip jq
+
+set	MAP_HOST4 192.0.2.1
+set	MAP_HOST6 2001:db8:9a55::1
+set	MAP_NS4 192.0.2.2
+set	MAP_NS6 2001:db8:9a55::2
+
+set	THREADS 16
+set	TIME 3
+set	OMIT 0.1
+set	OPTS -Z -P __THREADS__ -O__OMIT__ -N -l 1M
+
+test	Interface name
+g1out	IFNAME1 ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname'
+hout	HOST_IFNAME ip -j -4 route show|jq -rM '[.[] | select(.dst == "default").dev] | .[0]'
+hout	HOST_IFNAME6 ip -j -6 route show|jq -rM '[.[] | select(.dst == "default").dev] | .[0]'
+check	[ -n "__IFNAME1__" ]
+
+test	DHCP: address
+guest1	ip link set dev __IFNAME1__ up
+guest1	/sbin/dhclient -4 __IFNAME1__
+g1out	ADDR1 ip -j -4 addr show|jq -rM '.[] | select(.ifname == "__IFNAME1__").addr_info[0].local'
+hout	HOST_ADDR ip -j -4 addr show|jq -rM '.[] | select(.ifname == "__HOST_IFNAME__").addr_info[0].local'
+check	[ "__ADDR1__" = "__HOST_ADDR__" ]
+
+test	DHCPv6: address
+# Link is up now, wait for DAD to complete
+guest1	while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done
+guest1	/sbin/dhclient -6 __IFNAME1__
+# Wait for DAD to complete on the DHCP address
+guest1	while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done
+g1out	ADDR1_6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__IFNAME1__").addr_info[] | select(.prefixlen == 128).local] | .[0]'
+hout	HOST_ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__HOST_IFNAME6__").addr_info[] | select(.scope == "global" and .deprecated != true).local] | .[0]'
+check	[ "__ADDR1_6__" = "__HOST_ADDR6__" ]
+
+test	TCP/IPv6 guest to host flood, many flows, during migration
+
+test	TCP/IPv6 host to guest throughput during migration
+
+monb	sleep 1; echo "migrate tcp:0:20005" | socat -u STDIN UNIX:__STATESETUP__/qemu_1_mon.sock
+
+iperf3s	host 10006
+iperf3m	BW guest_1 guest_2 __MAP_HOST6__ 10006 __TIME__ __OPTS__
+bw	__BW__ 1 2
+
+iperf3k	host
diff --git a/test/migrate/iperf3_out4 b/test/migrate/iperf3_out4
new file mode 100644
index 0000000..968057b
--- /dev/null
+++ b/test/migrate/iperf3_out4
@@ -0,0 +1,47 @@
+# SPDX-License-Identifier: GPL-2.0-or-later
+#
+# PASST - Plug A Simple Socket Transport
+#  for qemu/UNIX domain socket mode
+#
+# PASTA - Pack A Subtle Tap Abstraction
+#  for network namespace/tap device mode
+#
+# test/migrate/iperf3_out4 - Migration behaviour under outbound IPv4 flood
+#
+# Copyright (c) 2025 Red Hat GmbH
+# Author: Stefano Brivio <sbrivio@redhat.com>
+
+g1tools	ip jq dhclient socat cat
+htools	ip jq
+
+set	MAP_HOST4 192.0.2.1
+set	MAP_HOST6 2001:db8:9a55::1
+set	MAP_NS4 192.0.2.2
+set	MAP_NS6 2001:db8:9a55::2
+
+set	THREADS 6
+set	TIME 2
+set	OMIT 0.1
+set	OPTS -P __THREADS__ -O__OMIT__ -Z -N -l 1M
+
+test	Interface name
+g1out	IFNAME1 ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname'
+hout	HOST_IFNAME ip -j -4 route show|jq -rM '[.[] | select(.dst == "default").dev] | .[0]'
+check	[ -n "__IFNAME1__" ]
+
+test	DHCP: address
+guest1	ip link set dev __IFNAME1__ up
+guest1	/sbin/dhclient -4 __IFNAME1__
+g1out	ADDR1 ip -j -4 addr show|jq -rM '.[] | select(.ifname == "__IFNAME1__").addr_info[0].local'
+hout	HOST_ADDR ip -j -4 addr show|jq -rM '.[] | select(.ifname == "__HOST_IFNAME__").addr_info[0].local'
+check	[ "__ADDR1__" = "__HOST_ADDR__" ]
+
+test	TCP/IPv4 guest to host throughput during migration
+
+monb	sleep 1; echo "migrate tcp:0:20005" | socat -u STDIN UNIX:__STATESETUP__/qemu_1_mon.sock
+
+iperf3s	host 10006
+iperf3m	BW guest_1 guest_2 __MAP_HOST4__ 10006 __TIME__ __OPTS__
+bw	__BW__ 1 2
+
+iperf3k	host
diff --git a/test/migrate/iperf3_out6 b/test/migrate/iperf3_out6
new file mode 100644
index 0000000..21fbfcd
--- /dev/null
+++ b/test/migrate/iperf3_out6
@@ -0,0 +1,58 @@
+# SPDX-License-Identifier: GPL-2.0-or-later
+#
+# PASST - Plug A Simple Socket Transport
+#  for qemu/UNIX domain socket mode
+#
+# PASTA - Pack A Subtle Tap Abstraction
+#  for network namespace/tap device mode
+#
+# test/migrate/iperf3_out6 - Migration behaviour under outbound IPv6 flood
+#
+# Copyright (c) 2025 Red Hat GmbH
+# Author: Stefano Brivio <sbrivio@redhat.com>
+
+g1tools	ip jq dhclient socat cat
+htools	ip jq
+
+set	MAP_HOST4 192.0.2.1
+set	MAP_HOST6 2001:db8:9a55::1
+set	MAP_NS4 192.0.2.2
+set	MAP_NS6 2001:db8:9a55::2
+
+set	THREADS 6
+set	TIME 2
+set	OMIT 0.1
+set	OPTS -P __THREADS__ -O__OMIT__ -Z -N -l 1M
+
+test	Interface name
+g1out	IFNAME1 ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname'
+hout	HOST_IFNAME ip -j -4 route show|jq -rM '[.[] | select(.dst == "default").dev] | .[0]'
+hout	HOST_IFNAME6 ip -j -6 route show|jq -rM '[.[] | select(.dst == "default").dev] | .[0]'
+check	[ -n "__IFNAME1__" ]
+
+test	DHCP: address
+guest1	ip link set dev __IFNAME1__ up
+guest1	/sbin/dhclient -4 __IFNAME1__
+g1out	ADDR1 ip -j -4 addr show|jq -rM '.[] | select(.ifname == "__IFNAME1__").addr_info[0].local'
+hout	HOST_ADDR ip -j -4 addr show|jq -rM '.[] | select(.ifname == "__HOST_IFNAME__").addr_info[0].local'
+check	[ "__ADDR1__" = "__HOST_ADDR__" ]
+
+test	DHCPv6: address
+# Link is up now, wait for DAD to complete
+guest1	while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done
+guest1	/sbin/dhclient -6 __IFNAME1__
+# Wait for DAD to complete on the DHCP address
+guest1	while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done
+g1out	ADDR1_6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__IFNAME1__").addr_info[] | select(.prefixlen == 128).local] | .[0]'
+hout	HOST_ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__HOST_IFNAME6__").addr_info[] | select(.scope == "global" and .deprecated != true).local] | .[0]'
+check	[ "__ADDR1_6__" = "__HOST_ADDR6__" ]
+
+test	TCP/IPv6 guest to host throughput during migration
+
+monb	sleep 1; echo "migrate tcp:0:20005" | socat -u STDIN UNIX:__STATESETUP__/qemu_1_mon.sock
+
+iperf3s	host 10006
+iperf3m	BW guest_1 guest_2 __MAP_HOST6__ 10006 __TIME__ __OPTS__
+bw	__BW__ 1 2
+
+iperf3k	host
diff --git a/test/migrate/rampstream_in b/test/migrate/rampstream_in
index 46f4143..df333ba 100644
--- a/test/migrate/rampstream_in
+++ b/test/migrate/rampstream_in
@@ -6,10 +6,10 @@
 # PASTA - Pack A Subtle Tap Abstraction
 #  for network namespace/tap device mode
 #
-# test/migrate/basic - Check basic migration functionality
+# test/migrate/rampstream_in - Check sequence correctness with inbound ramp
 #
-# Copyright (c) 2025 Red Hat GmbH
-# Author: Stefano Brivio <sbrivio@redhat.com>
+# Copyright (c) 2025 Red Hat
+# Author: David Gibson <david@gibson.dropbear.id.au>
 
 g1tools	ip jq dhclient socat cat
 htools	ip jq
@@ -43,15 +43,15 @@ g1out	ADDR1_6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__IFNAME1__")
 hout	HOST_ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__HOST_IFNAME6__").addr_info[] | select(.scope == "global" and .deprecated != true).local] | .[0]'
 check	[ "__ADDR1_6__" = "__HOST_ADDR6__" ]
 
-test	TCP/IPv4: host > guest
+test	TCP/IPv4: sequence check, ramps, inbound
 g1out	GW1 ip -j -4 route show|jq -rM '.[] | select(.dst == "default").gateway'
 guest1b	socat -u TCP4-LISTEN:10001 EXEC:"rampstream-check.sh __RAMPS__"
 sleep	1
 hostb	socat -u EXEC:"test/rampstream send __RAMPS__" TCP4:__ADDR1__:10001
 
-sleep 1
+sleep	1
 
-#mon	echo "migrate tcp:0:20005" | socat -u STDIN UNIX:__STATESETUP__/qemu_1_mon.sock
+monb	echo "migrate tcp:0:20005" | socat -u STDIN UNIX:__STATESETUP__/qemu_1_mon.sock
 
 hostw
 
diff --git a/test/migrate/rampstream_out b/test/migrate/rampstream_out
index 91b9c63..8ed3229 100644
--- a/test/migrate/rampstream_out
+++ b/test/migrate/rampstream_out
@@ -6,10 +6,10 @@
 # PASTA - Pack A Subtle Tap Abstraction
 #  for network namespace/tap device mode
 #
-# test/migrate/basic - Check basic migration functionality
+# test/migrate/rampstream_out - Check sequence correctness with outbound ramp
 #
-# Copyright (c) 2025 Red Hat GmbH
-# Author: Stefano Brivio <sbrivio@redhat.com>
+# Copyright (c) 2025 Red Hat
+# Author: David Gibson <david@gibson.dropbear.id.au>
 
 g1tools	ip jq dhclient socat cat
 htools	ip jq
@@ -43,7 +43,7 @@ g1out	ADDR1_6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__IFNAME1__")
 hout	HOST_ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__HOST_IFNAME6__").addr_info[] | select(.scope == "global" and .deprecated != true).local] | .[0]'
 check	[ "__ADDR1_6__" = "__HOST_ADDR6__" ]
 
-test	TCP/IPv4: guest > host
+test	TCP/IPv4: sequence check, ramps, outbound
 g1out	GW1 ip -j -4 route show|jq -rM '.[] | select(.dst == "default").gateway'
 hostb	socat -u TCP4-LISTEN:10006 EXEC:"test/rampstream check __RAMPS__"
 sleep	1
diff --git a/test/run b/test/run
index f188d8e..4e86f30 100755
--- a/test/run
+++ b/test/run
@@ -130,6 +130,43 @@ run() {
 	test two_guests_vu/basic
 	teardown two_guests
 
+	setup migrate
+	test migrate/basic
+	teardown migrate
+	setup migrate
+	test migrate/basic_fin
+	teardown migrate
+	setup migrate
+	test migrate/bidirectional
+	teardown migrate
+	setup migrate
+	test migrate/bidirectional_fin
+	teardown migrate
+	setup migrate
+	test migrate/iperf3_out4
+	teardown migrate
+	setup migrate
+	test migrate/iperf3_out6
+	teardown migrate
+	setup migrate
+	test migrate/iperf3_in4
+	teardown migrate
+	setup migrate
+	test migrate/iperf3_in6
+	teardown migrate
+	setup migrate
+	test migrate/iperf3_bidir6
+	teardown migrate
+	setup migrate
+	test migrate/iperf3_many_out6
+	teardown migrate
+	setup migrate
+	test migrate/rampstream_in
+	teardown migrate
+	setup migrate
+	test migrate/rampstream_out
+	teardown migrate
+
 	VALGRIND=0
 	VHOST_USER=0
 	setup passt_in_ns
@@ -186,7 +223,10 @@ run_selected() {
 
 	__setup=
 	for __test; do
-		if [ "${__test%%/*}" != "${__setup}" ]; then
+		# HACK: the migrate tests need the setup repeated for
+		#       each test
+		if [ "${__test%%/*}" != "${__setup}" -o		\
+		     "${__test%%/*}" = "migrate" ]; then
 			[ -n "${__setup}" ] && teardown "${__setup}"
 			__setup="${__test%%/*}"
 			setup "${__setup}"

From bcc4908c2b4a20c581f2b03fed40da97b804106f Mon Sep 17 00:00:00 2001
From: Enrique Llorente <ellorent@redhat.com>
Date: Mon, 17 Feb 2025 10:28:14 +0100
Subject: [PATCH 239/382] dhcp: Remove option 255 length byte

Option 255 (end of options) does not need a length byte; removing it
frees one extra byte for other dynamic options.
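
As a sketch only (helper names made up here, not taken from dhcp.c),
this is how RFC 2132 lays options out: regular options carry a code
byte, a length byte and data, while the end option is the single byte
255, with no length and no data:

	#include <stdint.h>
	#include <stddef.h>

	/* Append a regular option: code, length, then 'len' data bytes */
	static size_t opt_put(uint8_t *o, size_t off, uint8_t code,
			      uint8_t len, const uint8_t *data)
	{
		o[off++] = code;
		o[off++] = len;
		for (size_t i = 0; i < len; i++)
			o[off++] = data[i];
		return off;
	}

	/* The end option is just the code 255: no length byte follows */
	static size_t opt_end(uint8_t *o, size_t off)
	{
		o[off++] = 255;
		return off;
	}

Dropping the stray length byte after code 255 is what frees the one
extra byte accounted for by the OPT_MAX change below.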

Signed-off-by: Enrique Llorente <ellorent@redhat.com>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 dhcp.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/dhcp.c b/dhcp.c
index 401cb5b..4a209f1 100644
--- a/dhcp.c
+++ b/dhcp.c
@@ -64,9 +64,9 @@ static struct opt opts[255];
 #define OPT_MIN		60 /* RFC 951 */
 
 /* Total option size (excluding end option) is 576 (RFC 2131), minus
- * offset of options (268), minus end option and its length (2).
+ * offset of options (268), minus end option (1).
  */
-#define OPT_MAX		306
+#define OPT_MAX		307
 
 /**
  * dhcp_init() - Initialise DHCP options
@@ -127,7 +127,7 @@ struct msg {
 	uint8_t sname[64];
 	uint8_t file[128];
 	uint32_t magic;
-	uint8_t o[OPT_MAX + 2 /* End option and its length */ ];
+	uint8_t o[OPT_MAX + 1 /* End option */ ];
 } __attribute__((__packed__));
 
 /**
@@ -194,7 +194,6 @@ static int fill(struct msg *m)
 	}
 
 	m->o[offset++] = 255;
-	m->o[offset++] = 0;
 
 	if (offset < OPT_MIN) {
 		memset(&m->o[offset], 0, OPT_MIN - offset);

From 0a51060f7ac3e1e1a9d87ffdb037b9c367a2a4d9 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Tue, 18 Feb 2025 13:07:17 +1100
Subject: [PATCH 240/382] packet: Use flexible array member in struct pool

Currently we have a dummy pkt[1] array, which we alias with an array of
a different size via various macros.  However, we already require C11, which
includes flexible array members, so we can do better.
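
For illustration (generic names below, not passt's actual struct pool
or allocator): with a flexible array member, the allocation size alone
decides how many trailing entries exist, so no dummy one-element array
or aliasing macros are needed:

	#include <stdlib.h>
	#include <sys/uio.h>

	struct pkt_pool {
		size_t size;		/* Number of slots in pkt[] */
		size_t count;		/* Slots currently in use */
		struct iovec pkt[];	/* C11 flexible array member */
	};

	/* Allocate a pool with room for n packet descriptors */
	static struct pkt_pool *pool_alloc(size_t n)
	{
		struct pkt_pool *p;

		p = calloc(1, sizeof(*p) + n * sizeof(struct iovec));
		if (p)
			p->size = n;
		return p;
	}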

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 packet.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/packet.h b/packet.h
index 3f70e94..85ee550 100644
--- a/packet.h
+++ b/packet.h
@@ -21,7 +21,7 @@ struct pool {
 	size_t buf_size;
 	size_t size;
 	size_t count;
-	struct iovec pkt[1];
+	struct iovec pkt[];
 };
 
 int vu_packet_check_range(void *buf, size_t offset, size_t len,

From 354bc0bab1cb6095592288674d375511443427fd Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Tue, 18 Feb 2025 13:07:18 +1100
Subject: [PATCH 241/382] packet: Don't pass start and offset separately to
 packet_check_range()

Fundamentally what packet_check_range() does is to check whether a given
memory range is within the allowed / expected memory set aside for packets
from a particular pool.  That range could represent a whole packet (from
packet_add_do()) or part of a packet (from packet_get_do()), but it doesn't
really matter which.

However, we pass the start of the range as two parameters: @start which is
the start of the packet, and @offset which is the offset within the packet
of the range we're interested in.  We never use these separately, only as
(start + offset).  Simplify the interface of packet_check_range() and
vu_packet_check_range() to directly take the start of the relevant range.
This will allow some additional future improvements.
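
Put differently, callers compute the pointer once and the callee only
needs the start of the range. A simplified sketch of such a check
(hypothetical names, without the vhost-user path or the tracing that
the real packet_check_range() does):

	#include <stddef.h>

	/* Return 0 if [ptr, ptr + len) fits in [buf, buf + buf_size) */
	static int range_ok(const char *buf, size_t buf_size,
			    const char *ptr, size_t len)
	{
		if (ptr < buf)
			return -1;
		if (len > buf_size ||
		    (size_t)(ptr - buf) > buf_size - len)
			return -1;	/* Avoids pointer overflow, too */
		return 0;
	}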

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 packet.c    | 36 +++++++++++++++++++-----------------
 packet.h    |  3 +--
 vu_common.c | 11 ++++-------
 3 files changed, 24 insertions(+), 26 deletions(-)

diff --git a/packet.c b/packet.c
index 03a11e6..0330b54 100644
--- a/packet.c
+++ b/packet.c
@@ -23,23 +23,22 @@
 #include "log.h"
 
 /**
- * packet_check_range() - Check if a packet memory range is valid
+ * packet_check_range() - Check if a memory range is valid for a pool
  * @p:		Packet pool
- * @offset:	Offset of data range in packet descriptor
+ * @ptr:	Start of desired data range
  * @len:	Length of desired data range
- * @start:	Start of the packet descriptor
  * @func:	For tracing: name of calling function
  * @line:	For tracing: caller line of function call
  *
  * Return: 0 if the range is valid, -1 otherwise
  */
-static int packet_check_range(const struct pool *p, size_t offset, size_t len,
-			      const char *start, const char *func, int line)
+static int packet_check_range(const struct pool *p, const char *ptr, size_t len,
+			      const char *func, int line)
 {
 	if (p->buf_size == 0) {
 		int ret;
 
-		ret = vu_packet_check_range((void *)p->buf, offset, len, start);
+		ret = vu_packet_check_range((void *)p->buf, ptr, len);
 
 		if (ret == -1)
 			trace("cannot find region, %s:%i", func, line);
@@ -47,16 +46,16 @@ static int packet_check_range(const struct pool *p, size_t offset, size_t len,
 		return ret;
 	}
 
-	if (start < p->buf) {
-		trace("packet start %p before buffer start %p, "
-		      "%s:%i", (void *)start, (void *)p->buf, func, line);
+	if (ptr < p->buf) {
+		trace("packet range start %p before buffer start %p, %s:%i",
+		      (void *)ptr, (void *)p->buf, func, line);
 		return -1;
 	}
 
-	if (start + len + offset > p->buf + p->buf_size) {
-		trace("packet offset plus length %zu from size %zu, "
-		      "%s:%i", start - p->buf + len + offset,
-		      p->buf_size, func, line);
+	if (ptr + len > p->buf + p->buf_size) {
+		trace("packet range end %p after buffer end %p, %s:%i",
+		      (void *)(ptr + len), (void *)(p->buf + p->buf_size),
+		      func, line);
 		return -1;
 	}
 
@@ -81,7 +80,7 @@ void packet_add_do(struct pool *p, size_t len, const char *start,
 		return;
 	}
 
-	if (packet_check_range(p, 0, len, start, func, line))
+	if (packet_check_range(p, start, len, func, line))
 		return;
 
 	if (len > UINT16_MAX) {
@@ -110,6 +109,8 @@ void packet_add_do(struct pool *p, size_t len, const char *start,
 void *packet_get_do(const struct pool *p, size_t idx, size_t offset,
 		    size_t len, size_t *left, const char *func, int line)
 {
+	char *ptr;
+
 	if (idx >= p->size || idx >= p->count) {
 		if (func) {
 			trace("packet %zu from pool size: %zu, count: %zu, "
@@ -135,14 +136,15 @@ void *packet_get_do(const struct pool *p, size_t idx, size_t offset,
 		return NULL;
 	}
 
-	if (packet_check_range(p, offset, len, p->pkt[idx].iov_base,
-			       func, line))
+	ptr = (char *)p->pkt[idx].iov_base + offset;
+
+	if (packet_check_range(p, ptr, len, func, line))
 		return NULL;
 
 	if (left)
 		*left = p->pkt[idx].iov_len - offset - len;
 
-	return (char *)p->pkt[idx].iov_base + offset;
+	return ptr;
 }
 
 /**
diff --git a/packet.h b/packet.h
index 85ee550..bdc07fe 100644
--- a/packet.h
+++ b/packet.h
@@ -24,8 +24,7 @@ struct pool {
 	struct iovec pkt[];
 };
 
-int vu_packet_check_range(void *buf, size_t offset, size_t len,
-			  const char *start);
+int vu_packet_check_range(void *buf, const char *ptr, size_t len);
 void packet_add_do(struct pool *p, size_t len, const char *start,
 		   const char *func, int line);
 void *packet_get_do(const struct pool *p, const size_t idx,
diff --git a/vu_common.c b/vu_common.c
index 48826b1..686a09b 100644
--- a/vu_common.c
+++ b/vu_common.c
@@ -26,14 +26,12 @@
  * vu_packet_check_range() - Check if a given memory zone is contained in
  * 			     a mapped guest memory region
  * @buf:	Array of the available memory regions
- * @offset:	Offset of data range in packet descriptor
+ * @ptr:	Start of desired data range
  * @size:	Length of desired data range
- * @start:	Start of the packet descriptor
  *
  * Return: 0 if the zone is in a mapped memory region, -1 otherwise
  */
-int vu_packet_check_range(void *buf, size_t offset, size_t len,
-			  const char *start)
+int vu_packet_check_range(void *buf, const char *ptr, size_t len)
 {
 	struct vu_dev_region *dev_region;
 
@@ -41,9 +39,8 @@ int vu_packet_check_range(void *buf, size_t offset, size_t len,
 		/* NOLINTNEXTLINE(performance-no-int-to-ptr) */
 		char *m = (char *)(uintptr_t)dev_region->mmap_addr;
 
-		if (m <= start &&
-		    start + offset + len <= m + dev_region->mmap_offset +
-					       dev_region->size)
+		if (m <= ptr &&
+		    ptr + len <= m + dev_region->mmap_offset + dev_region->size)
 			return 0;
 	}
 

From 6b4065153c67e7578d448927e49f244deea70e4d Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Tue, 18 Feb 2025 13:07:19 +1100
Subject: [PATCH 242/382] tap: Remove unused ETH_HDR_INIT() macro

The uses of this macro were removed in d4598e1d18ac ("udp: Use the same
buffer for the L2 header for all frames").

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 tap.h | 2 --
 1 file changed, 2 deletions(-)

diff --git a/tap.h b/tap.h
index dfbd8b9..a476a12 100644
--- a/tap.h
+++ b/tap.h
@@ -6,8 +6,6 @@
 #ifndef TAP_H
 #define TAP_H
 
-#define ETH_HDR_INIT(proto) { .h_proto = htons_constant(proto) }
-
 /**
  * struct tap_hdr - tap backend specific headers
  * @vnet_len:	Frame length (for qemu socket transport)

From 5a07eb3cccf1abf0a44d6ab01819f8f605c87ef4 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Tue, 18 Feb 2025 13:50:13 +1100
Subject: [PATCH 243/382] tcp_vu: head_cnt need not be global

head_cnt is a global variable which tracks how many entries in head[] are
currently used.  The fact that it's global obscures the fact that the
lifetime over which it has a meaningful value is quite short: a single
call to tcp_vu_data_from_sock().

Make it local to tcp_vu_data_from_sock() to make that lifetime clearer.
We keep the head[] array global for now - although technically it has the
same valid lifetime - because it's large enough we might not want to put
it on the stack.
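
A small sketch of the resulting pattern (generic names, not the actual
tcp_vu code): the large scratch array stays static so it doesn't land
on the stack, while the count only has a meaningful value within one
call and travels through an out-parameter:

	#define MAX_HEADS 1024

	static int head[MAX_HEADS + 1];	/* Large: keep it off the stack */

	/* Fill head[] for this call only; report entries via *head_cnt */
	static void collect_heads(int *head_cnt)
	{
		int n = 0;

		/* ... head[n++] = ...; while collecting buffers ... */
		head[n] = -1;		/* Mark end of array */
		*head_cnt = n;
	}

	static void one_call(void)
	{
		int head_cnt;		/* Only valid within this call */

		collect_heads(&head_cnt);
		/* ... use head[0] to head[head_cnt - 1] ... */
	}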

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Reviewed-by: Laurent Vivier <lvivier@redhat.com>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 tcp_vu.c | 21 +++++++++++----------
 1 file changed, 11 insertions(+), 10 deletions(-)

diff --git a/tcp_vu.c b/tcp_vu.c
index 0622f17..6891ed1 100644
--- a/tcp_vu.c
+++ b/tcp_vu.c
@@ -38,7 +38,6 @@
 static struct iovec iov_vu[VIRTQUEUE_MAX_SIZE + 1];
 static struct vu_virtq_element elem[VIRTQUEUE_MAX_SIZE];
 static int head[VIRTQUEUE_MAX_SIZE + 1];
-static int head_cnt;
 
 /**
  * tcp_vu_hdrlen() - return the size of the header in level 2 frame (TCP)
@@ -183,7 +182,7 @@ int tcp_vu_send_flag(const struct ctx *c, struct tcp_tap_conn *conn, int flags)
 static ssize_t tcp_vu_sock_recv(const struct ctx *c,
 				const struct tcp_tap_conn *conn, bool v6,
 				uint32_t already_sent, size_t fillsize,
-				int *iov_cnt)
+				int *iov_cnt, int *head_cnt)
 {
 	struct vu_dev *vdev = c->vdev;
 	struct vu_virtq *vq = &vdev->vq[VHOST_USER_RX_QUEUE];
@@ -202,7 +201,7 @@ static ssize_t tcp_vu_sock_recv(const struct ctx *c,
 	vu_init_elem(elem, &iov_vu[1], VIRTQUEUE_MAX_SIZE);
 
 	elem_cnt = 0;
-	head_cnt = 0;
+	*head_cnt = 0;
 	while (fillsize > 0 && elem_cnt < VIRTQUEUE_MAX_SIZE) {
 		struct iovec *iov;
 		size_t frame_size, dlen;
@@ -221,7 +220,7 @@ static ssize_t tcp_vu_sock_recv(const struct ctx *c,
 		ASSERT(iov->iov_len >= hdrlen);
 		iov->iov_base = (char *)iov->iov_base + hdrlen;
 		iov->iov_len -= hdrlen;
-		head[head_cnt++] = elem_cnt;
+		head[(*head_cnt)++] = elem_cnt;
 
 		fillsize -= dlen;
 		elem_cnt += cnt;
@@ -261,17 +260,18 @@ static ssize_t tcp_vu_sock_recv(const struct ctx *c,
 		len -= iov->iov_len;
 	}
 	/* adjust head count */
-	while (head_cnt > 0 && head[head_cnt - 1] >= i)
-		head_cnt--;
+	while (*head_cnt > 0 && head[*head_cnt - 1] >= i)
+		(*head_cnt)--;
+
 	/* mark end of array */
-	head[head_cnt] = i;
+	head[*head_cnt] = i;
 	*iov_cnt = i;
 
 	/* release unused buffers */
 	vu_queue_rewind(vq, elem_cnt - i);
 
 	/* restore space for headers in iov */
-	for (i = 0; i < head_cnt; i++) {
+	for (i = 0; i < *head_cnt; i++) {
 		struct iovec *iov = &elem[head[i]].in_sg[0];
 
 		iov->iov_base = (char *)iov->iov_base - hdrlen;
@@ -357,11 +357,11 @@ int tcp_vu_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn)
 	struct vu_dev *vdev = c->vdev;
 	struct vu_virtq *vq = &vdev->vq[VHOST_USER_RX_QUEUE];
 	ssize_t len, previous_dlen;
+	int i, iov_cnt, head_cnt;
 	size_t hdrlen, fillsize;
 	int v6 = CONN_V6(conn);
 	uint32_t already_sent;
 	const uint16_t *check;
-	int i, iov_cnt;
 
 	if (!vu_queue_enabled(vq) || !vu_queue_started(vq)) {
 		debug("Got packet, but RX virtqueue not usable yet");
@@ -396,7 +396,8 @@ int tcp_vu_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn)
 	/* collect the buffers from vhost-user and fill them with the
 	 * data from the socket
 	 */
-	len = tcp_vu_sock_recv(c, conn, v6, already_sent, fillsize, &iov_cnt);
+	len = tcp_vu_sock_recv(c, conn, v6, already_sent, fillsize,
+			       &iov_cnt, &head_cnt);
 	if (len < 0) {
 		if (len != -EAGAIN && len != -EWOULDBLOCK) {
 			tcp_rst(c, conn);

From e56c8038fc23a349ff4a457c6b447f927ac1a56e Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Tue, 18 Feb 2025 19:59:21 +1100
Subject: [PATCH 244/382] tcp: More type safety for
 tcp_flow_migrate_target_ext()

tcp_flow_migrate_target_ext() takes a raw union flow *, although it is TCP
specific, and requires a FLOW_TYPE_TCP entry.  Our usual convention is that
such functions should take a struct tcp_tap_conn * instead.  Convert it to
do so.
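
By way of illustration (placeholder fields, not the real definitions):
once a TCP-only helper takes struct tcp_tap_conn *, callers must name
the TCP member of the union explicitly, and the compiler rejects a raw
union flow * argument:

	struct tcp_tap_conn	{ int sock; /* ... */ };
	struct udp_flow		{ int s[2]; /* ... */ };

	union flow {
		struct tcp_tap_conn tcp;
		struct udp_flow udp;
	};

	/* TCP-specific helper: takes the TCP view, not the raw union */
	static int tcp_conn_fd(const struct tcp_tap_conn *conn)
	{
		return conn->sock;
	}

	/* tcp_conn_fd(&flowtab[i].tcp) compiles;
	 * tcp_conn_fd(&flowtab[i]) does not.
	 */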

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 flow.c     | 2 +-
 tcp.c      | 7 +++----
 tcp_conn.h | 2 +-
 3 files changed, 5 insertions(+), 6 deletions(-)

diff --git a/flow.c b/flow.c
index cc881e8..abe95b2 100644
--- a/flow.c
+++ b/flow.c
@@ -1106,7 +1106,7 @@ int flow_migrate_target(struct ctx *c, const struct migrate_stage *stage,
 	repair_flush(c);
 
 	for (i = 0; i < count; i++) {
-		rc = tcp_flow_migrate_target_ext(c, flowtab + i, fd);
+		rc = tcp_flow_migrate_target_ext(c, &flowtab[i].tcp, fd);
 		if (rc) {
 			debug("Migration data failure at flow %u: %s, abort",
 			      i, strerror_(-rc));
diff --git a/tcp.c b/tcp.c
index 98e1c6a..272e4cd 100644
--- a/tcp.c
+++ b/tcp.c
@@ -3394,14 +3394,13 @@ int tcp_flow_migrate_target(struct ctx *c, int fd)
 /**
  * tcp_flow_migrate_target_ext() - Receive extended data for flow, set, connect
  * @c:		Execution context
- * @flow:	Existing flow for this connection data
+ * @conn:	Connection entry to complete with extra data
  * @fd:		Descriptor for state migration
  *
  * Return: 0 on success, negative on fatal failure, but 0 on single flow failure
  */
-int tcp_flow_migrate_target_ext(struct ctx *c, union flow *flow, int fd)
+int tcp_flow_migrate_target_ext(struct ctx *c, struct tcp_tap_conn *conn, int fd)
 {
-	struct tcp_tap_conn *conn = &flow->tcp;
 	uint32_t peek_offset = conn->seq_to_tap - conn->seq_ack_from_tap;
 	struct tcp_tap_transfer_ext t;
 	int s = conn->sock, rc;
@@ -3413,7 +3412,7 @@ int tcp_flow_migrate_target_ext(struct ctx *c, union flow *flow, int fd)
 	}
 
 	if (!t.tcpi_state) { /* Source wants us to skip this flow */
-		flow_err(flow, "Dropping as requested by source");
+		flow_err(conn, "Dropping as requested by source");
 		goto fail;
 	}
 
diff --git a/tcp_conn.h b/tcp_conn.h
index 42dff48..53887c0 100644
--- a/tcp_conn.h
+++ b/tcp_conn.h
@@ -239,7 +239,7 @@ int tcp_flow_migrate_source_ext(int fd, int fidx,
 				const struct tcp_tap_conn *conn);
 
 int tcp_flow_migrate_target(struct ctx *c, int fd);
-int tcp_flow_migrate_target_ext(struct ctx *c, union flow *flow, int fd);
+int tcp_flow_migrate_target_ext(struct ctx *c, struct tcp_tap_conn *conn, int fd);
 
 bool tcp_flow_is_established(const struct tcp_tap_conn *conn);
 

From 854bc7b1a3b4e5443ea071e49b3a68198dbb88b3 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Tue, 18 Feb 2025 19:59:22 +1100
Subject: [PATCH 245/382] tcp: Remove spurious prototype for
 tcp_flow_migrate_shrink_window

This function existed in drafts of the migration code, but not the final
version.  Get rid of the prototype.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 tcp_conn.h | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tcp_conn.h b/tcp_conn.h
index 53887c0..8a15b08 100644
--- a/tcp_conn.h
+++ b/tcp_conn.h
@@ -233,7 +233,6 @@ bool tcp_flow_defer(const struct tcp_tap_conn *conn);
 int tcp_flow_repair_on(struct ctx *c, const struct tcp_tap_conn *conn);
 int tcp_flow_repair_off(struct ctx *c, const struct tcp_tap_conn *conn);
 
-int tcp_flow_migrate_shrink_window(int fidx, const struct tcp_tap_conn *conn);
 int tcp_flow_migrate_source(int fd, struct tcp_tap_conn *conn);
 int tcp_flow_migrate_source_ext(int fd, int fidx,
 				const struct tcp_tap_conn *conn);

From ba0823f8a0e60d4fc0cb21179aaf64940509156a Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Tue, 18 Feb 2025 19:59:23 +1100
Subject: [PATCH 246/382] tcp: Don't pass both flow pointer and flow index

tcp_flow_migrate_source_ext() is passed both the index of the flow it
operates on and the pointer to the connection structure.  However, the
former is trivially derived from the latter.  Simplify the interface.
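
For illustration, a sketch of why the index carries no extra
information: flows live in a single table, so the index is plain
pointer arithmetic on the entry address (FLOW_IDX() wraps this in
passt; the definition below uses made-up names and only shows the
idea, not the actual macro):

	#include <stddef.h>

	#define FLOW_MAX 128

	struct conn { int sock; };

	static struct conn flowtab[FLOW_MAX];

	/* Index of a table entry, derived from its address alone */
	static size_t conn_idx(const struct conn *conn)
	{
		return (size_t)(conn - flowtab);
	}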

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 flow.c     | 2 +-
 tcp.c      | 6 ++----
 tcp_conn.h | 3 +--
 3 files changed, 4 insertions(+), 7 deletions(-)

diff --git a/flow.c b/flow.c
index abe95b2..cc393e0 100644
--- a/flow.c
+++ b/flow.c
@@ -1053,7 +1053,7 @@ int flow_migrate_source(struct ctx *c, const struct migrate_stage *stage,
 	 * as EIO).
 	 */
 	foreach_established_tcp_flow(i, flow, FLOW_MAX) {
-		rc = tcp_flow_migrate_source_ext(fd, i, &flow->tcp);
+		rc = tcp_flow_migrate_source_ext(fd, &flow->tcp);
 		if (rc) {
 			err("Extended data for flow %u: %s", i, strerror_(-rc));
 
diff --git a/tcp.c b/tcp.c
index 272e4cd..21b6c6c 100644
--- a/tcp.c
+++ b/tcp.c
@@ -3141,16 +3141,14 @@ int tcp_flow_migrate_source(int fd, struct tcp_tap_conn *conn)
 /**
  * tcp_flow_migrate_source_ext() - Dump queues, close sockets, send final data
  * @fd:		Descriptor for state migration
- * @fidx:	Flow index
  * @conn:	Pointer to the TCP connection structure
  *
  * Return: 0 on success, negative (not -EIO) on failure, -EIO on sending failure
  */
-int tcp_flow_migrate_source_ext(int fd, int fidx,
-				const struct tcp_tap_conn *conn)
+int tcp_flow_migrate_source_ext(int fd, const struct tcp_tap_conn *conn)
 {
 	uint32_t peek_offset = conn->seq_to_tap - conn->seq_ack_from_tap;
-	struct tcp_tap_transfer_ext *t = &migrate_ext[fidx];
+	struct tcp_tap_transfer_ext *t = &migrate_ext[FLOW_IDX(conn)];
 	int s = conn->sock;
 	int rc;
 
diff --git a/tcp_conn.h b/tcp_conn.h
index 8a15b08..9126a36 100644
--- a/tcp_conn.h
+++ b/tcp_conn.h
@@ -234,8 +234,7 @@ int tcp_flow_repair_on(struct ctx *c, const struct tcp_tap_conn *conn);
 int tcp_flow_repair_off(struct ctx *c, const struct tcp_tap_conn *conn);
 
 int tcp_flow_migrate_source(int fd, struct tcp_tap_conn *conn);
-int tcp_flow_migrate_source_ext(int fd, int fidx,
-				const struct tcp_tap_conn *conn);
+int tcp_flow_migrate_source_ext(int fd, const struct tcp_tap_conn *conn);
 
 int tcp_flow_migrate_target(struct ctx *c, int fd);
 int tcp_flow_migrate_target_ext(struct ctx *c, struct tcp_tap_conn *conn, int fd);

From adb46c11d0ea67824cf8c4ef2113ec0b2c563c0e Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Tue, 18 Feb 2025 19:59:24 +1100
Subject: [PATCH 247/382] flow: Add flow_perror() helper

Our general logging helpers include a number of _perror() variants which,
like perror(3), include the description of the current errno.  We didn't
have those for our flow specific logging helpers, though.  Fill this gap
with flow_perror() and flow_dbg_perror(), and use them where it's useful.
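
One detail, mirrored by the errno_ capture in the macro added below:
errno has to be saved before any formatting happens, because the
formatting calls may themselves change it. A stand-alone sketch of the
same pattern, with plain fprintf() instead of passt's logmsg():

	#include <errno.h>
	#include <stdio.h>
	#include <string.h>

	/* Like perror(3), but with printf-style context; save errno
	 * first so fprintf() can't overwrite the value we report.
	 */
	#define log_perror(...)						\
		do {							\
			int errno_ = errno;				\
			fprintf(stderr, __VA_ARGS__);			\
			fprintf(stderr, ": %s\n", strerror(errno_));	\
		} while (0)

For example, log_perror("recvfrom() on socket %i", fd) prints the
message followed by the description of the current error.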

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 flow.c       | 12 +++++++-----
 flow.h       | 18 ++++++++++++++----
 icmp.c       |  5 ++---
 tcp.c        | 33 +++++++++++++++------------------
 tcp_splice.c |  9 ++++-----
 udp_flow.c   | 19 +++++++------------
 6 files changed, 49 insertions(+), 47 deletions(-)

diff --git a/flow.c b/flow.c
index cc393e0..c68f6bb 100644
--- a/flow.c
+++ b/flow.c
@@ -289,11 +289,13 @@ int flowside_connect(const struct ctx *c, int s,
 
 /** flow_log_ - Log flow-related message
  * @f:		flow the message is related to
+ * @newline:	Append newline at the end of the message, if missing
  * @pri:	Log priority
  * @fmt:	Format string
  * @...:	printf-arguments
  */
-void flow_log_(const struct flow_common *f, int pri, const char *fmt, ...)
+void flow_log_(const struct flow_common *f, bool newline, int pri,
+	       const char *fmt, ...)
 {
 	const char *type_or_state;
 	char msg[BUFSIZ];
@@ -309,7 +311,7 @@ void flow_log_(const struct flow_common *f, int pri, const char *fmt, ...)
 	else
 		type_or_state = FLOW_TYPE(f);
 
-	logmsg(true, false, pri,
+	logmsg(newline, false, pri,
 	       "Flow %u (%s): %s", flow_idx(f), type_or_state, msg);
 }
 
@@ -329,7 +331,7 @@ void flow_log_details_(const struct flow_common *f, int pri,
 	const struct flowside *tgt = &f->side[TGTSIDE];
 
 	if (state >= FLOW_STATE_TGT)
-		flow_log_(f, pri,
+		flow_log_(f, true, pri,
 			  "%s [%s]:%hu -> [%s]:%hu => %s [%s]:%hu -> [%s]:%hu",
 			  pif_name(f->pif[INISIDE]),
 			  inany_ntop(&ini->eaddr, estr0, sizeof(estr0)),
@@ -342,7 +344,7 @@ void flow_log_details_(const struct flow_common *f, int pri,
 			  inany_ntop(&tgt->eaddr, estr1, sizeof(estr1)),
 			  tgt->eport);
 	else if (state >= FLOW_STATE_INI)
-		flow_log_(f, pri, "%s [%s]:%hu -> [%s]:%hu => ?",
+		flow_log_(f, true, pri, "%s [%s]:%hu -> [%s]:%hu => ?",
 			  pif_name(f->pif[INISIDE]),
 			  inany_ntop(&ini->eaddr, estr0, sizeof(estr0)),
 			  ini->eport,
@@ -363,7 +365,7 @@ static void flow_set_state(struct flow_common *f, enum flow_state state)
 	ASSERT(oldstate < FLOW_NUM_STATES);
 
 	f->state = state;
-	flow_log_(f, LOG_DEBUG, "%s -> %s", flow_state_str[oldstate],
+	flow_log_(f, true, LOG_DEBUG, "%s -> %s", flow_state_str[oldstate],
 		  FLOW_STATE(f));
 
 	flow_log_details_(f, LOG_DEBUG, MAX(state, oldstate));
diff --git a/flow.h b/flow.h
index 675726e..dcf7645 100644
--- a/flow.h
+++ b/flow.h
@@ -258,11 +258,11 @@ int flow_migrate_source(struct ctx *c, const struct migrate_stage *stage,
 int flow_migrate_target(struct ctx *c, const struct migrate_stage *stage,
 			int fd);
 
-void flow_log_(const struct flow_common *f, int pri, const char *fmt, ...)
-	__attribute__((format(printf, 3, 4)));
-
-#define flow_log(f_, pri, ...)	flow_log_(&(f_)->f, (pri), __VA_ARGS__)
+void flow_log_(const struct flow_common *f, bool newline, int pri,
+	       const char *fmt, ...)
+	__attribute__((format(printf, 4, 5)));
 
+#define flow_log(f_, pri, ...)	flow_log_(&(f_)->f, true, (pri), __VA_ARGS__)
 #define flow_dbg(f, ...)	flow_log((f), LOG_DEBUG, __VA_ARGS__)
 #define flow_err(f, ...)	flow_log((f), LOG_ERR, __VA_ARGS__)
 
@@ -272,6 +272,16 @@ void flow_log_(const struct flow_common *f, int pri, const char *fmt, ...)
 			flow_dbg((f), __VA_ARGS__);			\
 	} while (0)
 
+#define flow_log_perror_(f, pri, ...)					\
+	do {								\
+		int errno_ = errno;					\
+		flow_log_((f), false, (pri), __VA_ARGS__);		\
+		logmsg(true, true, (pri), ": %s", strerror_(errno_));	\
+	} while (0)
+
+#define flow_dbg_perror(f_, ...) flow_log_perror_(&(f_)->f, LOG_DEBUG, __VA_ARGS__)
+#define flow_perror(f_, ...)	flow_log_perror_(&(f_)->f, LOG_ERR, __VA_ARGS__)
+
 void flow_log_details_(const struct flow_common *f, int pri,
 		       enum flow_state state);
 #define flow_log_details(f_, pri) \
diff --git a/icmp.c b/icmp.c
index bcf498d..7e2b342 100644
--- a/icmp.c
+++ b/icmp.c
@@ -85,7 +85,7 @@ void icmp_sock_handler(const struct ctx *c, union epoll_ref ref)
 
 	n = recvfrom(ref.fd, buf, sizeof(buf), 0, &sr.sa, &sl);
 	if (n < 0) {
-		flow_err(pingf, "recvfrom() error: %s", strerror_(errno));
+		flow_perror(pingf, "recvfrom() error");
 		return;
 	}
 
@@ -300,8 +300,7 @@ int icmp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af,
 
 	pif_sockaddr(c, &sa, &sl, PIF_HOST, &tgt->eaddr, 0);
 	if (sendto(pingf->sock, pkt, l4len, MSG_NOSIGNAL, &sa.sa, sl) < 0) {
-		flow_dbg(pingf, "failed to relay request to socket: %s",
-			 strerror_(errno));
+		flow_dbg_perror(pingf, "failed to relay request to socket");
 	} else {
 		flow_dbg(pingf,
 			 "echo request to socket, ID: %"PRIu16", seq: %"PRIu16,
diff --git a/tcp.c b/tcp.c
index 21b6c6c..f498f5b 100644
--- a/tcp.c
+++ b/tcp.c
@@ -551,8 +551,7 @@ static void tcp_timer_ctl(const struct ctx *c, struct tcp_tap_conn *conn)
 
 		fd = timerfd_create(CLOCK_MONOTONIC, 0);
 		if (fd == -1 || fd > FD_REF_MAX) {
-			flow_dbg(conn, "failed to get timer: %s",
-				 strerror_(errno));
+			flow_dbg_perror(conn, "failed to get timer");
 			if (fd > -1)
 				close(fd);
 			conn->timer = -1;
@@ -561,8 +560,7 @@ static void tcp_timer_ctl(const struct ctx *c, struct tcp_tap_conn *conn)
 		conn->timer = fd;
 
 		if (epoll_ctl(c->epollfd, EPOLL_CTL_ADD, conn->timer, &ev)) {
-			flow_dbg(conn, "failed to add timer: %s",
-				 strerror_(errno));
+			flow_dbg_perror(conn, "failed to add timer");
 			close(conn->timer);
 			conn->timer = -1;
 			return;
@@ -587,7 +585,7 @@ static void tcp_timer_ctl(const struct ctx *c, struct tcp_tap_conn *conn)
 		 (unsigned long long)it.it_value.tv_nsec / 1000 / 1000);
 
 	if (timerfd_settime(conn->timer, 0, &it, NULL))
-		flow_err(conn, "failed to set timer: %s", strerror_(errno));
+		flow_perror(conn, "failed to set timer");
 }
 
 /**
@@ -1386,10 +1384,10 @@ static void tcp_bind_outbound(const struct ctx *c,
 		if (bind(s, &bind_sa.sa, sl)) {
 			char sstr[INANY_ADDRSTRLEN];
 
-			flow_dbg(conn,
-				 "Can't bind TCP outbound socket to %s:%hu: %s",
-				 inany_ntop(&tgt->oaddr, sstr, sizeof(sstr)),
-				 tgt->oport, strerror_(errno));
+			flow_dbg_perror(conn,
+					"Can't bind TCP outbound socket to %s:%hu",
+					inany_ntop(&tgt->oaddr, sstr, sizeof(sstr)),
+					tgt->oport);
 		}
 	}
 
@@ -1398,9 +1396,9 @@ static void tcp_bind_outbound(const struct ctx *c,
 			if (setsockopt(s, SOL_SOCKET, SO_BINDTODEVICE,
 				       c->ip4.ifname_out,
 				       strlen(c->ip4.ifname_out))) {
-				flow_dbg(conn, "Can't bind IPv4 TCP socket to"
-					 " interface %s: %s", c->ip4.ifname_out,
-					 strerror_(errno));
+				flow_dbg_perror(conn,
+						"Can't bind IPv4 TCP socket to interface %s",
+						c->ip4.ifname_out);
 			}
 		}
 	} else if (bind_sa.sa_family == AF_INET6) {
@@ -1408,9 +1406,9 @@ static void tcp_bind_outbound(const struct ctx *c,
 			if (setsockopt(s, SOL_SOCKET, SO_BINDTODEVICE,
 				       c->ip6.ifname_out,
 				       strlen(c->ip6.ifname_out))) {
-				flow_dbg(conn, "Can't bind IPv6 TCP socket to"
-					 " interface %s: %s", c->ip6.ifname_out,
-					 strerror_(errno));
+				flow_dbg_perror(conn,
+						"Can't bind IPv6 TCP socket to interface %s",
+						c->ip6.ifname_out);
 			}
 		}
 	}
@@ -2193,7 +2191,7 @@ void tcp_timer_handler(const struct ctx *c, union epoll_ref ref)
 	 * and we just set the timer to a new point in the future: discard it.
 	 */
 	if (timerfd_gettime(conn->timer, &check_armed))
-		flow_err(conn, "failed to read timer: %s", strerror_(errno));
+		flow_perror(conn, "failed to read timer");
 
 	if (check_armed.it_value.tv_sec || check_armed.it_value.tv_nsec)
 		return;
@@ -2235,8 +2233,7 @@ void tcp_timer_handler(const struct ctx *c, union epoll_ref ref)
 		 * ~ACK_TO_TAP_DUE or ~ACK_FROM_TAP_DUE.
 		 */
 		if (timerfd_settime(conn->timer, 0, &new, &old))
-			flow_err(conn, "failed to set timer: %s",
-				 strerror_(errno));
+			flow_perror(conn, "failed to set timer");
 
 		if (old.it_value.tv_sec == ACT_TIMEOUT) {
 			flow_dbg(conn, "activity timeout");
diff --git a/tcp_splice.c b/tcp_splice.c
index 5d845c9..0d10e3d 100644
--- a/tcp_splice.c
+++ b/tcp_splice.c
@@ -164,7 +164,7 @@ static int tcp_splice_epoll_ctl(const struct ctx *c,
 	if (epoll_ctl(c->epollfd, m, conn->s[0], &ev[0]) ||
 	    epoll_ctl(c->epollfd, m, conn->s[1], &ev[1])) {
 		int ret = -errno;
-		flow_err(conn, "ERROR on epoll_ctl(): %s", strerror_(errno));
+		flow_perror(conn, "ERROR on epoll_ctl()");
 		return ret;
 	}
 
@@ -317,8 +317,8 @@ static int tcp_splice_connect_finish(const struct ctx *c,
 
 		if (conn->pipe[sidei][0] < 0) {
 			if (pipe2(conn->pipe[sidei], O_NONBLOCK | O_CLOEXEC)) {
-				flow_err(conn, "cannot create %d->%d pipe: %s",
-					 sidei, !sidei, strerror_(errno));
+				flow_perror(conn, "cannot create %d->%d pipe",
+					    sidei, !sidei);
 				conn_flag(c, conn, CLOSING);
 				return -EIO;
 			}
@@ -482,8 +482,7 @@ void tcp_splice_sock_handler(struct ctx *c, union epoll_ref ref,
 
 		rc = getsockopt(ref.fd, SOL_SOCKET, SO_ERROR, &err, &sl);
 		if (rc)
-			flow_err(conn, "Error retrieving SO_ERROR: %s",
-				 strerror_(errno));
+			flow_perror(conn, "Error retrieving SO_ERROR");
 		else
 			flow_trace(conn, "Error event on socket: %s",
 				   strerror_(err));
diff --git a/udp_flow.c b/udp_flow.c
index 83c2568..c6b8630 100644
--- a/udp_flow.c
+++ b/udp_flow.c
@@ -93,9 +93,8 @@ static flow_sidx_t udp_flow_new(const struct ctx *c, union flow *flow,
 		 */
 		uflow->s[INISIDE] = fcntl(s_ini, F_DUPFD_CLOEXEC, 0);
 		if (uflow->s[INISIDE] < 0) {
-			flow_err(uflow,
-				 "Couldn't duplicate listening socket: %s",
-				 strerror_(errno));
+			flow_perror(uflow,
+				    "Couldn't duplicate listening socket");
 			goto cancel;
 		}
 	}
@@ -113,16 +112,13 @@ static flow_sidx_t udp_flow_new(const struct ctx *c, union flow *flow,
 		uflow->s[TGTSIDE] = flowside_sock_l4(c, EPOLL_TYPE_UDP_REPLY,
 						     tgtpif, tgt, fref.data);
 		if (uflow->s[TGTSIDE] < 0) {
-			flow_dbg(uflow,
-				 "Couldn't open socket for spliced flow: %s",
-				 strerror_(errno));
+			flow_dbg_perror(uflow,
+					"Couldn't open socket for spliced flow");
 			goto cancel;
 		}
 
 		if (flowside_connect(c, uflow->s[TGTSIDE], tgtpif, tgt) < 0) {
-			flow_dbg(uflow,
-				 "Couldn't connect flow socket: %s",
-				 strerror_(errno));
+			flow_dbg_perror(uflow, "Couldn't connect flow socket");
 			goto cancel;
 		}
 
@@ -142,9 +138,8 @@ static flow_sidx_t udp_flow_new(const struct ctx *c, union flow *flow,
 			flow_trace(uflow,
 				   "Discarded %d spurious reply datagrams", rc);
 		} else if (errno != EAGAIN) {
-			flow_err(uflow,
-				 "Unexpected error discarding datagrams: %s",
-				 strerror_(errno));
+			flow_perror(uflow,
+				    "Unexpected error discarding datagrams");
 		}
 	}
 

From 7ffca35fddf1568698199c931ba1877c1908b443 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Wed, 19 Feb 2025 13:28:34 +1100
Subject: [PATCH 248/382] flow: Remove unneeded index from foreach_* macros

The foreach macros are odd in that they take two loop counters: an integer
index, and a pointer to the flow.  We nearly always want the latter, not
the former, and we can get the index from the pointer trivially when we
need it.  So, rearrange the macros not to need the integer index.
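
As a standalone sketch of the pattern (toy types and a toy table, not
the real flow table; the actual FLOW_IDX() in flow_table.h may be
defined differently):

  #include <stdio.h>

  struct toy_flow { int state; };

  static struct toy_flow toytab[8];

  #define TOY_IDX(f)	((unsigned)((f) - toytab))
  #define toy_foreach(f)	for ((f) = toytab; TOY_IDX(f) < 8; (f)++)

  int main(void)
  {
  	struct toy_flow *f;

  	toytab[3].state = 1;
  	toy_foreach(f)
  		if (f->state)	/* index recovered from pointer on demand */
  			printf("active flow at index %u\n", TOY_IDX(f));
  	return 0;
  }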

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 flow.c | 44 +++++++++++++++++++++-----------------------
 1 file changed, 21 insertions(+), 23 deletions(-)

diff --git a/flow.c b/flow.c
index c68f6bb..3fcdd9f 100644
--- a/flow.c
+++ b/flow.c
@@ -53,30 +53,28 @@ const uint8_t flow_proto[] = {
 static_assert(ARRAY_SIZE(flow_proto) == FLOW_NUM_TYPES,
 	      "flow_proto[] doesn't match enum flow_type");
 
-#define foreach_flow(i, flow, bound)					\
-	for ((i) = 0, (flow) = &flowtab[(i)];				\
-	     (i) < (bound);						\
-	     (i)++, (flow) = &flowtab[(i)])				\
+#define foreach_flow(flow, bound)					\
+	for ((flow) = flowtab; FLOW_IDX(flow) < (bound); (flow)++)	\
 		if ((flow)->f.state == FLOW_STATE_FREE)			\
-			(i) += (flow)->free.n - 1;			\
+			(flow) += (flow)->free.n - 1;			\
 		else
 
-#define foreach_active_flow(i, flow, bound)				\
-	foreach_flow((i), (flow), (bound))				\
+#define foreach_active_flow(flow, bound)				\
+	foreach_flow((flow), (bound))					\
 		if ((flow)->f.state != FLOW_STATE_ACTIVE)		\
 			/* NOLINTNEXTLINE(bugprone-branch-clone) */	\
 			continue;					\
 		else
 
-#define foreach_tcp_flow(i, flow, bound)				\
-	foreach_active_flow((i), (flow), (bound))			\
+#define foreach_tcp_flow(flow, bound)					\
+	foreach_active_flow((flow), (bound))				\
 		if ((flow)->f.type != FLOW_TCP)				\
 			/* NOLINTNEXTLINE(bugprone-branch-clone) */	\
 			continue;					\
 		else
 
-#define foreach_established_tcp_flow(i, flow, bound)			\
-	foreach_tcp_flow((i), (flow), (bound))				\
+#define foreach_established_tcp_flow(flow, bound)			\
+	foreach_tcp_flow((flow), (bound))				\
 		if (!tcp_flow_is_established(&(flow)->tcp))		\
 			/* NOLINTNEXTLINE(bugprone-branch-clone) */	\
 			continue;					\
@@ -918,11 +916,10 @@ static int flow_migrate_source_rollback(struct ctx *c, unsigned max_flow,
 					int ret)
 {
 	union flow *flow;
-	unsigned i;
 
 	debug("...roll back migration");
 
-	foreach_established_tcp_flow(i, flow, max_flow)
+	foreach_established_tcp_flow(flow, max_flow)
 		if (tcp_flow_repair_off(c, &flow->tcp))
 			die("Failed to roll back TCP_REPAIR mode");
 
@@ -942,10 +939,9 @@ static int flow_migrate_source_rollback(struct ctx *c, unsigned max_flow,
 static int flow_migrate_repair_all(struct ctx *c, bool enable)
 {
 	union flow *flow;
-	unsigned i;
 	int rc;
 
-	foreach_established_tcp_flow(i, flow, FLOW_MAX) {
+	foreach_established_tcp_flow(flow, FLOW_MAX) {
 		if (enable)
 			rc = tcp_flow_repair_on(c, &flow->tcp);
 		else
@@ -954,14 +950,15 @@ static int flow_migrate_repair_all(struct ctx *c, bool enable)
 		if (rc) {
 			debug("Can't %s repair mode: %s",
 			      enable ? "enable" : "disable", strerror_(-rc));
-			return flow_migrate_source_rollback(c, i, rc);
+			return flow_migrate_source_rollback(c, FLOW_IDX(flow),
+							    rc);
 		}
 	}
 
 	if ((rc = repair_flush(c))) {
 		debug("Can't %s repair mode: %s",
 		      enable ? "enable" : "disable", strerror_(-rc));
-		return flow_migrate_source_rollback(c, i, rc);
+		return flow_migrate_source_rollback(c, FLOW_IDX(flow), rc);
 	}
 
 	return 0;
@@ -1003,13 +1000,12 @@ int flow_migrate_source(struct ctx *c, const struct migrate_stage *stage,
 	uint32_t count = 0;
 	bool first = true;
 	union flow *flow;
-	unsigned i;
 	int rc;
 
 	(void)c;
 	(void)stage;
 
-	foreach_established_tcp_flow(i, flow, FLOW_MAX)
+	foreach_established_tcp_flow(flow, FLOW_MAX)
 		count++;
 
 	count = htonl(count);
@@ -1028,10 +1024,11 @@ int flow_migrate_source(struct ctx *c, const struct migrate_stage *stage,
 	 * stream might now be inconsistent, and we might have closed listening
 	 * TCP sockets, so just terminate.
 	 */
-	foreach_established_tcp_flow(i, flow, FLOW_MAX) {
+	foreach_established_tcp_flow(flow, FLOW_MAX) {
 		rc = tcp_flow_migrate_source(fd, &flow->tcp);
 		if (rc) {
-			err("Can't send data, flow %u: %s", i, strerror_(-rc));
+			err("Can't send data, flow %u: %s", FLOW_IDX(flow),
+			    strerror_(-rc));
 			if (!first)
 				die("Inconsistent migration state, exiting");
 
@@ -1054,10 +1051,11 @@ int flow_migrate_source(struct ctx *c, const struct migrate_stage *stage,
 	 * failures but not if the stream might be inconsistent (reported here
 	 * as EIO).
 	 */
-	foreach_established_tcp_flow(i, flow, FLOW_MAX) {
+	foreach_established_tcp_flow(flow, FLOW_MAX) {
 		rc = tcp_flow_migrate_source_ext(fd, &flow->tcp);
 		if (rc) {
-			err("Extended data for flow %u: %s", i, strerror_(-rc));
+			err("Extended data for flow %u: %s", FLOW_IDX(flow),
+			    strerror_(-rc));
 
 			if (rc == -EIO)
 				die("Inconsistent migration state, exiting");

From b79a22d3601b69cf58b1803c5ead7f4667c46827 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Wed, 19 Feb 2025 13:28:35 +1100
Subject: [PATCH 249/382] flow: Remove unneeded bound parameter from flow
 traversal macros

The foreach macros used to step through flows each take a 'bound' parameter
to only scan part of the flow table.  Only one place actually passes a
bound different from FLOW_MAX.  So we can simplify every other invocation
by having that one case manually handle the bound.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 flow.c | 34 ++++++++++++++++++----------------
 1 file changed, 18 insertions(+), 16 deletions(-)

diff --git a/flow.c b/flow.c
index 3fcdd9f..602fea7 100644
--- a/flow.c
+++ b/flow.c
@@ -53,28 +53,28 @@ const uint8_t flow_proto[] = {
 static_assert(ARRAY_SIZE(flow_proto) == FLOW_NUM_TYPES,
 	      "flow_proto[] doesn't match enum flow_type");
 
-#define foreach_flow(flow, bound)					\
-	for ((flow) = flowtab; FLOW_IDX(flow) < (bound); (flow)++)	\
+#define foreach_flow(flow)						\
+	for ((flow) = flowtab; FLOW_IDX(flow) < FLOW_MAX; (flow)++)	\
 		if ((flow)->f.state == FLOW_STATE_FREE)			\
 			(flow) += (flow)->free.n - 1;			\
 		else
 
-#define foreach_active_flow(flow, bound)				\
-	foreach_flow((flow), (bound))					\
+#define foreach_active_flow(flow)					\
+	foreach_flow((flow))						\
 		if ((flow)->f.state != FLOW_STATE_ACTIVE)		\
 			/* NOLINTNEXTLINE(bugprone-branch-clone) */	\
 			continue;					\
 		else
 
-#define foreach_tcp_flow(flow, bound)					\
-	foreach_active_flow((flow), (bound))				\
+#define foreach_tcp_flow(flow)						\
+	foreach_active_flow((flow))					\
 		if ((flow)->f.type != FLOW_TCP)				\
 			/* NOLINTNEXTLINE(bugprone-branch-clone) */	\
 			continue;					\
 		else
 
-#define foreach_established_tcp_flow(flow, bound)			\
-	foreach_tcp_flow((flow), (bound))				\
+#define foreach_established_tcp_flow(flow)				\
+	foreach_tcp_flow((flow))					\
 		if (!tcp_flow_is_established(&(flow)->tcp))		\
 			/* NOLINTNEXTLINE(bugprone-branch-clone) */	\
 			continue;					\
@@ -907,21 +907,23 @@ void flow_defer_handler(const struct ctx *c, const struct timespec *now)
 /**
  * flow_migrate_source_rollback() - Disable repair mode, return failure
  * @c:		Execution context
- * @max_flow:	Maximum index of affected flows
+ * @bound:	No need to roll back flow indices >= @bound
  * @ret:	Negative error code
  *
  * Return: @ret
  */
-static int flow_migrate_source_rollback(struct ctx *c, unsigned max_flow,
-					int ret)
+static int flow_migrate_source_rollback(struct ctx *c, unsigned bound, int ret)
 {
 	union flow *flow;
 
 	debug("...roll back migration");
 
-	foreach_established_tcp_flow(flow, max_flow)
+	foreach_established_tcp_flow(flow) {
+		if (FLOW_IDX(flow) >= bound)
+			break;
 		if (tcp_flow_repair_off(c, &flow->tcp))
 			die("Failed to roll back TCP_REPAIR mode");
+	}
 
 	if (repair_flush(c))
 		die("Failed to roll back TCP_REPAIR mode");
@@ -941,7 +943,7 @@ static int flow_migrate_repair_all(struct ctx *c, bool enable)
 	union flow *flow;
 	int rc;
 
-	foreach_established_tcp_flow(flow, FLOW_MAX) {
+	foreach_established_tcp_flow(flow) {
 		if (enable)
 			rc = tcp_flow_repair_on(c, &flow->tcp);
 		else
@@ -1005,7 +1007,7 @@ int flow_migrate_source(struct ctx *c, const struct migrate_stage *stage,
 	(void)c;
 	(void)stage;
 
-	foreach_established_tcp_flow(flow, FLOW_MAX)
+	foreach_established_tcp_flow(flow)
 		count++;
 
 	count = htonl(count);
@@ -1024,7 +1026,7 @@ int flow_migrate_source(struct ctx *c, const struct migrate_stage *stage,
 	 * stream might now be inconsistent, and we might have closed listening
 	 * TCP sockets, so just terminate.
 	 */
-	foreach_established_tcp_flow(flow, FLOW_MAX) {
+	foreach_established_tcp_flow(flow) {
 		rc = tcp_flow_migrate_source(fd, &flow->tcp);
 		if (rc) {
 			err("Can't send data, flow %u: %s", FLOW_IDX(flow),
@@ -1051,7 +1053,7 @@ int flow_migrate_source(struct ctx *c, const struct migrate_stage *stage,
 	 * failures but not if the stream might be inconsistent (reported here
 	 * as EIO).
 	 */
-	foreach_established_tcp_flow(flow, FLOW_MAX) {
+	foreach_established_tcp_flow(flow) {
 		rc = tcp_flow_migrate_source_ext(fd, &flow->tcp);
 		if (rc) {
 			err("Extended data for flow %u: %s", FLOW_IDX(flow),

From 65e317a8fca4eaf9efbfe642cc7e4322c56aa1f7 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Wed, 19 Feb 2025 13:28:36 +1100
Subject: [PATCH 250/382] flow: Clean up and generalise flow traversal macros

The migration code introduced a number of 'foreach' macros to traverse the
flow table.  These aren't inherently tied to migration, so polish up their
naming, move them to flow_table.h and also use in flow_defer_handler()
which is the other place we need to traverse the whole table.

For now we keep foreach_established_tcp_flow() as is.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 flow.c       | 36 ++++++++----------------------------
 flow_table.h | 36 ++++++++++++++++++++++++++++++++++++
 2 files changed, 44 insertions(+), 28 deletions(-)

diff --git a/flow.c b/flow.c
index 602fea7..bb5dcc3 100644
--- a/flow.c
+++ b/flow.c
@@ -53,28 +53,8 @@ const uint8_t flow_proto[] = {
 static_assert(ARRAY_SIZE(flow_proto) == FLOW_NUM_TYPES,
 	      "flow_proto[] doesn't match enum flow_type");
 
-#define foreach_flow(flow)						\
-	for ((flow) = flowtab; FLOW_IDX(flow) < FLOW_MAX; (flow)++)	\
-		if ((flow)->f.state == FLOW_STATE_FREE)			\
-			(flow) += (flow)->free.n - 1;			\
-		else
-
-#define foreach_active_flow(flow)					\
-	foreach_flow((flow))						\
-		if ((flow)->f.state != FLOW_STATE_ACTIVE)		\
-			/* NOLINTNEXTLINE(bugprone-branch-clone) */	\
-			continue;					\
-		else
-
-#define foreach_tcp_flow(flow)						\
-	foreach_active_flow((flow))					\
-		if ((flow)->f.type != FLOW_TCP)				\
-			/* NOLINTNEXTLINE(bugprone-branch-clone) */	\
-			continue;					\
-		else
-
 #define foreach_established_tcp_flow(flow)				\
-	foreach_tcp_flow((flow))					\
+	flow_foreach_of_type((flow), FLOW_TCP)				\
 		if (!tcp_flow_is_established(&(flow)->tcp))		\
 			/* NOLINTNEXTLINE(bugprone-branch-clone) */	\
 			continue;					\
@@ -801,7 +781,7 @@ void flow_defer_handler(const struct ctx *c, const struct timespec *now)
 	struct flow_free_cluster *free_head = NULL;
 	unsigned *last_next = &flow_first_free;
 	bool timer = false;
-	unsigned idx;
+	union flow *flow;
 
 	if (timespec_diff_ms(now, &flow_timer_run) >= FLOW_TIMER_INTERVAL) {
 		timer = true;
@@ -810,8 +790,7 @@ void flow_defer_handler(const struct ctx *c, const struct timespec *now)
 
 	ASSERT(!flow_new_entry); /* Incomplete flow at end of cycle */
 
-	for (idx = 0; idx < FLOW_MAX; idx++) {
-		union flow *flow = &flowtab[idx];
+	flow_foreach_slot(flow) {
 		bool closed = false;
 
 		switch (flow->f.state) {
@@ -828,12 +807,12 @@ void flow_defer_handler(const struct ctx *c, const struct timespec *now)
 			} else {
 				/* New free cluster, add to chain */
 				free_head = &flow->free;
-				*last_next = idx;
+				*last_next = FLOW_IDX(flow);
 				last_next = &free_head->next;
 			}
 
 			/* Skip remaining empty entries */
-			idx += skip - 1;
+			flow += skip - 1;
 			continue;
 		}
 
@@ -886,14 +865,15 @@ void flow_defer_handler(const struct ctx *c, const struct timespec *now)
 
 			if (free_head) {
 				/* Add slot to current free cluster */
-				ASSERT(idx == FLOW_IDX(free_head) + free_head->n);
+				ASSERT(FLOW_IDX(flow) ==
+				       FLOW_IDX(free_head) + free_head->n);
 				free_head->n++;
 				flow->free.n = flow->free.next = 0;
 			} else {
 				/* Create new free cluster */
 				free_head = &flow->free;
 				free_head->n = 1;
-				*last_next = idx;
+				*last_next = FLOW_IDX(flow);
 				last_next = &free_head->next;
 			}
 		} else {
diff --git a/flow_table.h b/flow_table.h
index 9a2ff24..fd2c57b 100644
--- a/flow_table.h
+++ b/flow_table.h
@@ -50,6 +50,42 @@ extern union flow flowtab[];
 #define flow_foreach_sidei(sidei_) \
 	for ((sidei_) = INISIDE; (sidei_) < SIDES; (sidei_)++)
 
+
+/**
+ * flow_foreach_slot() - Step through each flow table entry
+ * @flow:	Takes values of pointer to each flow table entry
+ *
+ * Includes FREE slots.
+ */
+#define flow_foreach_slot(flow)						\
+	for ((flow) = flowtab; FLOW_IDX(flow) < FLOW_MAX; (flow)++)
+
+/**
+ * flow_foreach() - Step through each active flow
+ * @flow:	Takes values of pointer to each active flow
+ */
+#define flow_foreach(flow)						\
+	flow_foreach_slot((flow))					\
+		if ((flow)->f.state == FLOW_STATE_FREE)			\
+			(flow) += (flow)->free.n - 1;			\
+		else if ((flow)->f.state != FLOW_STATE_ACTIVE) {	\
+			flow_err((flow), "Bad flow state during traversal"); \
+			continue;					\
+		} else
+
+/**
+ * flow_foreach_of_type() - Step through each active flow of given type
+ * @flow:	Takes values of pointer to each flow
+ * @type_:	Type of flow to traverse
+ */
+#define flow_foreach_of_type(flow, type_)				\
+	flow_foreach((flow))						\
+	if ((flow)->f.type != (type_))					\
+			/* NOLINTNEXTLINE(bugprone-branch-clone) */	\
+			continue;					\
+		else
+
+
 /** flow_idx() - Index of flow from common structure
  * @f:	Common flow fields pointer
  *

From 3dc7da68a2731f661d7251a5fc759daffe24ca70 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Wed, 19 Feb 2025 14:14:27 +1100
Subject: [PATCH 251/382] conf: More thorough error checking when parsing --mtu
 option

We're a bit sloppy with parsing MTU which can lead to some surprising,
though fairly harmless, results:
  * Passing a non-number like '-m xyz' will not give an error and act like
    -m 0
  * Junk after a number (e.g. '-m 1500pqr') will be ignored rather than
    giving an error
  * We parse the MTU as a long, then immediately assign to an int, so on
    some platforms certain ludicrously out of bounds values will be
    silently truncated, rather than giving an error

Be a bit more thorough with the error checking to avoid that.
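
A minimal standalone sketch of the stricter parse (the bounds and names
here are illustrative stand-ins for ETH_MIN_MTU/ETH_MAX_MTU, not the
exact conf.c code):

  #include <errno.h>
  #include <stdio.h>
  #include <stdlib.h>

  /* Return 0 and set *mtu on success, -1 on any parse error */
  static int parse_mtu(const char *arg, unsigned *mtu)
  {
  	unsigned long val;
  	char *e;

  	errno = 0;
  	val = strtoul(arg, &e, 0);
  	if (errno || *e || e == arg)
  		return -1;		/* overflow, junk, or empty string */
  	if (val && (val < 68 || val > 65535))
  		return -1;		/* out of range; 0 still means "unset" */

  	*mtu = val;
  	return 0;
  }

  int main(int argc, char **argv)
  {
  	unsigned mtu;

  	if (argc < 2 || parse_mtu(argv[1], &mtu)) {
  		fprintf(stderr, "Invalid MTU\n");
  		return 1;
  	}
  	printf("MTU: %u\n", mtu);
  	return 0;
  }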

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 conf.c | 23 ++++++++++++++++-------
 1 file changed, 16 insertions(+), 7 deletions(-)

diff --git a/conf.c b/conf.c
index 18017f5..335f37c 100644
--- a/conf.c
+++ b/conf.c
@@ -1652,20 +1652,29 @@ void conf(struct ctx *c, int argc, char **argv)
 				die("Invalid PID file: %s", optarg);
 
 			break;
-		case 'm':
-			errno = 0;
-			c->mtu = strtol(optarg, NULL, 0);
+		case 'm': {
+			unsigned long mtu;
+			char *e;
 
-			if (!c->mtu) {
+			errno = 0;
+			mtu = strtoul(optarg, &e, 0);
+
+			if (errno || *e)
+				die("Invalid MTU: %s", optarg);
+
+			if (!mtu) {
 				c->mtu = -1;
 				break;
 			}
 
-			if (c->mtu < ETH_MIN_MTU || c->mtu > (int)ETH_MAX_MTU ||
-			    errno)
-				die("Invalid MTU: %s", optarg);
+			if (mtu < ETH_MIN_MTU || mtu > ETH_MAX_MTU) {
+				die("MTU %lu out of range (%u..%u)", mtu,
+				    ETH_MIN_MTU, ETH_MAX_MTU);
+			}
 
+			c->mtu = mtu;
 			break;
+		}
 		case 'a':
 			if (inet_pton(AF_INET6, optarg, &c->ip6.addr)	&&
 			    !IN6_IS_ADDR_UNSPECIFIED(&c->ip6.addr)	&&

From 1cc5d4c9fe0a84d3d39fc07358996989ca1b5875 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Wed, 19 Feb 2025 14:14:28 +1100
Subject: [PATCH 252/382] conf: Use 0 instead of -1 as "unassigned" mtu value

On the command line -m 0 means "don't assign an MTU" (letting the guest use
its default).  However, internally we use (c->mtu == -1) to represent that
state.  We use (c->mtu == 0) to represent "the user didn't specify on the
command line, so use the default" - but this is only used during conf(),
never afterwards.

This is unnecessarily confusing.  We can instead just initialise c->mtu to
its default (65520) before parsing options and use 0 on both the command
line and internally to represent the "don't assign" special case.  This
ensures that c->mtu is always 0..65535, so we can store it in a uint16_t
which is more natural.
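
The 65520 default mentioned above falls out of the arithmetic; as a
compile-time check, assuming the usual ETH_MAX_MTU of 65535, ETH_HLEN of
14 and an align-down ROUND_DOWN() (the macro in the tree may be defined
differently):

  #include <assert.h>
  #include <stdint.h>

  #define ETH_MAX_MTU	65535U
  #define ETH_HLEN	14U
  #define ROUND_DOWN(x, y)	((x) & ~((y) - 1))

  static_assert(ROUND_DOWN(ETH_MAX_MTU - ETH_HLEN, sizeof(uint32_t)) == 65520,
  	      "65535 - 14 = 65521, rounded down to a multiple of 4");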

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 conf.c  | 11 ++---------
 dhcp.c  |  2 +-
 ndp.c   |  2 +-
 passt.h |  3 ++-
 pasta.c |  2 +-
 tcp.c   |  2 +-
 6 files changed, 8 insertions(+), 14 deletions(-)

diff --git a/conf.c b/conf.c
index 335f37c..c5ee07b 100644
--- a/conf.c
+++ b/conf.c
@@ -1413,6 +1413,7 @@ void conf(struct ctx *c, int argc, char **argv)
 		optstring = "+dqfel:hs:F:p:P:m:a:n:M:g:i:o:D:S:H:461t:u:";
 	}
 
+	c->mtu = ROUND_DOWN(ETH_MAX_MTU - ETH_HLEN, sizeof(uint32_t));
 	c->tcp.fwd_in.mode = c->tcp.fwd_out.mode = FWD_UNSET;
 	c->udp.fwd_in.mode = c->udp.fwd_out.mode = FWD_UNSET;
 	memcpy(c->our_tap_mac, MAC_OUR_LAA, ETH_ALEN);
@@ -1662,12 +1663,7 @@ void conf(struct ctx *c, int argc, char **argv)
 			if (errno || *e)
 				die("Invalid MTU: %s", optarg);
 
-			if (!mtu) {
-				c->mtu = -1;
-				break;
-			}
-
-			if (mtu < ETH_MIN_MTU || mtu > ETH_MAX_MTU) {
+			if (mtu && (mtu < ETH_MIN_MTU || mtu > ETH_MAX_MTU)) {
 				die("MTU %lu out of range (%u..%u)", mtu,
 				    ETH_MIN_MTU, ETH_MAX_MTU);
 			}
@@ -1980,9 +1976,6 @@ void conf(struct ctx *c, int argc, char **argv)
 		c->no_dhcpv6 = 1;
 	}
 
-	if (!c->mtu)
-		c->mtu = ROUND_DOWN(ETH_MAX_MTU - ETH_HLEN, sizeof(uint32_t));
-
 	get_dns(c);
 
 	if (!*c->pasta_ifn) {
diff --git a/dhcp.c b/dhcp.c
index 4a209f1..66a716e 100644
--- a/dhcp.c
+++ b/dhcp.c
@@ -417,7 +417,7 @@ int dhcp(const struct ctx *c, const struct pool *p)
 		       &c->ip4.guest_gw, sizeof(c->ip4.guest_gw));
 	}
 
-	if (c->mtu != -1) {
+	if (c->mtu) {
 		opts[26].slen = 2;
 		opts[26].s[0] = c->mtu / 256;
 		opts[26].s[1] = c->mtu % 256;
diff --git a/ndp.c b/ndp.c
index 37bf7a3..ded2081 100644
--- a/ndp.c
+++ b/ndp.c
@@ -256,7 +256,7 @@ static void ndp_ra(const struct ctx *c, const struct in6_addr *dst)
 
 	ptr = &ra.var[0];
 
-	if (c->mtu != -1) {
+	if (c->mtu) {
 		struct opt_mtu *mtu = (struct opt_mtu *)ptr;
 		*mtu = (struct opt_mtu) {
 			.header = {
diff --git a/passt.h b/passt.h
index 1f0dab5..28d1389 100644
--- a/passt.h
+++ b/passt.h
@@ -274,6 +274,8 @@ struct ctx {
 	int fd_repair;
 	unsigned char our_tap_mac[ETH_ALEN];
 	unsigned char guest_mac[ETH_ALEN];
+	uint16_t mtu;
+
 	uint64_t hash_secret[2];
 
 	int ifi4;
@@ -298,7 +300,6 @@ struct ctx {
 	int no_icmp;
 	struct icmp_ctx icmp;
 
-	int mtu;
 	int no_dns;
 	int no_dns_search;
 	int no_dhcp_dns;
diff --git a/pasta.c b/pasta.c
index 585a51c..fa3e7de 100644
--- a/pasta.c
+++ b/pasta.c
@@ -319,7 +319,7 @@ void pasta_ns_conf(struct ctx *c)
 	if (c->pasta_conf_ns) {
 		unsigned int flags = IFF_UP;
 
-		if (c->mtu != -1)
+		if (c->mtu)
 			nl_link_set_mtu(nl_sock_ns, c->pasta_ifi, c->mtu);
 
 		if (c->ifi6) /* Avoid duplicate address detection on link up */
diff --git a/tcp.c b/tcp.c
index f498f5b..e3c0a53 100644
--- a/tcp.c
+++ b/tcp.c
@@ -1139,7 +1139,7 @@ int tcp_prepare_flags(const struct ctx *c, struct tcp_tap_conn *conn,
 	if (flags & SYN) {
 		int mss;
 
-		if (c->mtu == -1) {
+		if (!c->mtu) {
 			mss = tinfo.tcpi_snd_mss;
 		} else {
 			mss = c->mtu - sizeof(struct tcphdr);

From 183bedf478e34079244fe4cfbb2c1a0f02a5a037 Mon Sep 17 00:00:00 2001
From: Stefano Brivio <sbrivio@redhat.com>
Date: Tue, 18 Feb 2025 09:34:26 +0100
Subject: [PATCH 253/382] Makefile: Use mmap2() as alternative for mmap() in
 valgrind extra syscalls

...instead of unconditionally trying to enable both: mmap2() is the
32-bit ARM variant of mmap() (and perhaps for other architectures),
but if mmap() is available, valgrind will use that one.

This avoids seccomp.sh warning us about missing mmap2() if mmap() is
present, and is consistent with what we do in vhost-user code.

Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 Makefile | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/Makefile b/Makefile
index d4e1096..f2ac8e5 100644
--- a/Makefile
+++ b/Makefile
@@ -109,9 +109,9 @@ passt-repair: $(PASST_REPAIR_SRCS) seccomp_repair.h
 	$(CC) $(FLAGS) $(CFLAGS) $(CPPFLAGS) $(PASST_REPAIR_SRCS) -o passt-repair $(LDFLAGS)
 
 valgrind: EXTRA_SYSCALLS += rt_sigprocmask rt_sigtimedwait rt_sigaction	\
-			    rt_sigreturn getpid gettid kill clock_gettime mmap \
-			    mmap2 munmap open unlink gettimeofday futex statx \
-			    readlink
+			    rt_sigreturn getpid gettid kill clock_gettime \
+			    mmap|mmap2 munmap open unlink gettimeofday futex \
+			    statx readlink
 valgrind: FLAGS += -g -DVALGRIND
 valgrind: all
 

From 16553c82806e0a55508baf553cb79e902638c10f Mon Sep 17 00:00:00 2001
From: Stefano Brivio <sbrivio@redhat.com>
Date: Tue, 18 Feb 2025 09:42:28 +0100
Subject: [PATCH 254/382] dhcp: Add option code byte in calculation for OPT_MAX
 boundary check

Otherwise we'll limit messages to 577 bytes, instead of 576 bytes as
intended:

  $ fqdn="thirtytwocharactersforeachlabel.thirtytwocharactersforeachlabel.thirtytwocharactersforeachlabel.thirtytwocharactersforeachlabel.thirtytwocharactersforeachlabel.thirtytwocharactersforeachlabel.thirtytwocharactersforeachlabel.then_make_it_251_with_this"
  $ hostname="__eighteen_bytes__"
  $ ./pasta --fqdn ${fqdn} -H ${hostname} -p dhcp.pcap -- /sbin/dhclient -4
  Saving packet capture to dhcp.pcap
  $ tshark -r dhcp.pcap -V -Y 'dhcp.option.value == 5' | grep "Total Length"
      Total Length: 577

This was hidden by the issue fixed by commit bcc4908c2b4a ("dhcp:
Remove option 255 length byte") until now.
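
For reference, each option on the wire takes one code byte and one
length byte ahead of its slen payload bytes, so it consumes 2 + slen
bytes in total; accounting for only 1 + slen lets the final option end
one byte past the intended limit, hence 577 instead of 576 above.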

Fixes: 31e8109a86ee ("dhcp, dhcpv6: Add hostname and client fqdn ops")
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
Reviewed-by: Enrique Llorente <ellorent@redhat.com>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 dhcp.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dhcp.c b/dhcp.c
index 66a716e..b0de04b 100644
--- a/dhcp.c
+++ b/dhcp.c
@@ -143,7 +143,7 @@ static bool fill_one(struct msg *m, int o, int *offset)
 	size_t slen = opts[o].slen;
 
 	/* If we don't have space to write the option, then just skip */
-	if (*offset + 1 /* length of option */ + slen > OPT_MAX)
+	if (*offset + 2 /* code and length of option */ + slen > OPT_MAX)
 		return true;
 
 	m->o[*offset] = o;

From 4dac2351fae5534c01e144273f849ce9ece0dca7 Mon Sep 17 00:00:00 2001
From: Stefano Brivio <sbrivio@redhat.com>
Date: Tue, 18 Feb 2025 09:49:40 +0100
Subject: [PATCH 255/382] contrib/fedora: Actually install passt-repair SELinux
 policy file

Otherwise we build it, but we don't install it. Not an issue that
warrants a release right away, as it's usable anyway.

Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 contrib/fedora/passt.spec | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/contrib/fedora/passt.spec b/contrib/fedora/passt.spec
index 6a83f8b..745cf01 100644
--- a/contrib/fedora/passt.spec
+++ b/contrib/fedora/passt.spec
@@ -44,7 +44,7 @@ Requires(preun): %{name}
 Requires(preun): policycoreutils
 
 %description selinux
-This package adds SELinux enforcement to passt(1) and pasta(1).
+This package adds SELinux enforcement to passt(1), pasta(1), passt-repair(1).
 
 %prep
 %setup -q -n passt-%{git_hash}
@@ -82,6 +82,7 @@ make -f %{_datadir}/selinux/devel/Makefile
 install -p -m 644 -D passt.pp %{buildroot}%{_datadir}/selinux/packages/%{selinuxtype}/passt.pp
 install -p -m 644 -D passt.if %{buildroot}%{_datadir}/selinux/devel/include/distributed/passt.if
 install -p -m 644 -D pasta.pp %{buildroot}%{_datadir}/selinux/packages/%{selinuxtype}/pasta.pp
+install -p -m 644 -D passt-repair.pp %{buildroot}%{_datadir}/selinux/packages/%{selinuxtype}/passt-repair.pp
 popd
 
 %pre selinux
@@ -90,11 +91,13 @@ popd
 %post selinux
 %selinux_modules_install -s %{selinuxtype} %{_datadir}/selinux/packages/%{selinuxtype}/passt.pp
 %selinux_modules_install -s %{selinuxtype} %{_datadir}/selinux/packages/%{selinuxtype}/pasta.pp
+%selinux_modules_install -s %{selinuxtype} %{_datadir}/selinux/packages/%{selinuxtype}/passt-repair.pp
 
 %postun selinux
 if [ $1 -eq 0 ]; then
 	%selinux_modules_uninstall -s %{selinuxtype} passt
 	%selinux_modules_uninstall -s %{selinuxtype} pasta
+	%selinux_modules_uninstall -s %{selinuxtype} passt-repair
 fi
 
 %posttrans selinux
@@ -124,6 +127,7 @@ fi
 %{_datadir}/selinux/packages/%{selinuxtype}/passt.pp
 %{_datadir}/selinux/devel/include/distributed/passt.if
 %{_datadir}/selinux/packages/%{selinuxtype}/pasta.pp
+%{_datadir}/selinux/packages/%{selinuxtype}/passt-repair.pp
 
 %changelog
 {{{ passt_git_changelog }}}

From ea69ca6a20ac7408a913fd5de383a5383d679678 Mon Sep 17 00:00:00 2001
From: Jon Maloy <jmaloy@redhat.com>
Date: Wed, 19 Feb 2025 10:20:41 -0500
Subject: [PATCH 256/382] tap: always set the no_frag flag in IPv4 headers

When studying the Linux source code and Wireshark dumps, it seems that
the no_frag flag in the IPv4 header is always set. Discussions on the
Internet on this subject indicate that modern routers never
fragment packets, and that it isn't even supported in many cases.

Adding to this that incoming messages forwarded on the tap interface
never even pass through a router, it seems safe to always set this flag.

This makes the IPv4 headers of forwarded messages identical to those
sent by the external sockets, something we must consider desirable.

Signed-off-by: Jon Maloy <jmaloy@redhat.com>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 ip.h  | 3 ++-
 tap.c | 2 +-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/ip.h b/ip.h
index 1544dbf..858cc89 100644
--- a/ip.h
+++ b/ip.h
@@ -36,13 +36,14 @@
 		.tos		= 0,					\
 		.tot_len	= 0,					\
 		.id		= 0,					\
-		.frag_off	= 0,					\
+		.frag_off	= htons(IP_DF), 			\
 		.ttl		= 0xff,					\
 		.protocol	= (proto),				\
 		.saddr		= 0,					\
 		.daddr		= 0,					\
 	}
 #define L2_BUF_IP4_PSUM(proto)	((uint32_t)htons_constant(0x4500) +	\
+				 (uint32_t)htons_constant(IP_DF) +	\
 				 (uint32_t)htons(0xff00 | (proto)))
 
 
diff --git a/tap.c b/tap.c
index d0673e5..44b0fc0 100644
--- a/tap.c
+++ b/tap.c
@@ -153,7 +153,7 @@ static void *tap_push_ip4h(struct iphdr *ip4h, struct in_addr src,
 	ip4h->tos = 0;
 	ip4h->tot_len = htons(l3len);
 	ip4h->id = 0;
-	ip4h->frag_off = 0;
+	ip4h->frag_off = htons(IP_DF);
 	ip4h->ttl = 255;
 	ip4h->protocol = proto;
 	ip4h->saddr = src.s_addr;

From be86232f72dcfbd51a889206e80d587fbcaa1c5b Mon Sep 17 00:00:00 2001
From: Michal Privoznik <mprivozn@redhat.com>
Date: Fri, 21 Feb 2025 12:53:13 +0100
Subject: [PATCH 257/382] seccomp.sh: Silence stty errors

When printing list of allowed syscalls the width of terminal is
obtained for nicer output (see commit below). The width is
obtained by running 'stty'. While this works when building from a
console, it doesn't work during rpmbuild/emerge/.. as stdout is
usually not a console but a logfile and stdin is usually
/dev/null or something. This results in stty reporting errors
like this:

  stty: 'standard input': Inappropriate ioctl for device

Redirect stty's stderr to /dev/null to silence it.

Fixes: 712ca3235329 ("seccomp.sh: Try to account for terminal width while formatting list of system calls")
Signed-off-by: Michal Privoznik <mprivozn@redhat.com>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 seccomp.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/seccomp.sh b/seccomp.sh
index 4c521ae..a7bc417 100755
--- a/seccomp.sh
+++ b/seccomp.sh
@@ -255,7 +255,7 @@ for __p in ${__profiles}; do
 	__calls="${__calls} ${EXTRA_SYSCALLS:-}"
 	__calls="$(filter ${__calls})"
 
-	cols="$(stty -a | sed -n 's/.*columns \([0-9]*\).*/\1/p' || :)" 2>/dev/null
+	cols="$(stty -a 2>/dev/null | sed -n 's/.*columns \([0-9]*\).*/\1/p' || :)" 2>/dev/null
 	case $cols in [0-9]*) col_args="-w ${cols}";; *) col_args="";; esac
 	echo "seccomp profile ${__p} allows: ${__calls}" | tr '\n' ' ' | fmt -t ${col_args}
 

From 87471731e6bb0b5df3a50277527caf3381b45ee4 Mon Sep 17 00:00:00 2001
From: Stefano Brivio <sbrivio@redhat.com>
Date: Fri, 28 Feb 2025 01:14:01 +0100
Subject: [PATCH 258/382] selinux: Fixes/workarounds for passt and
 passt-repair, mostly for libvirt usage

Here are a bunch of workarounds and a couple of fixes for libvirt
usage which are rather hard to split into single logical patches
as there appear to be some obscure dependencies between some of them:

- passt-repair needs to have an exec_type typeattribute (otherwise
  the policy for lsmd(1) causes a violation on getattr on its
  executable file), and that typeattribute just happened to be there
  for passt as a result of init_daemon_domain(), but passt-repair
  isn't a daemon, so we need an explicit corecmd_executable_file()

- passt-repair needs a workaround, which I'll revisit once
  https://github.com/fedora-selinux/selinux-policy/issues/2579 is
  solved, for usage with libvirt: allow it to use qemu_var_run_t
  and virt_var_run_t sockets

- add 'bpf' and 'dac_read_search' capabilities for passt-repair:
  they are needed (for whatever reason I didn't investigate) to
  actually receive socket files via SCM_RIGHTS

- passt needs further workarounds in the sense of
  https://github.com/fedora-selinux/selinux-policy/issues/2579:
  allow it to map and use svirt_tmpfs_t (not just svirt_image_t):
  it depends on where the libvirt guest image is

- ...it also needs to map /dev/null if <access mode='shared'/> is
  enabled in libvirt's XML for the memoryBacking object, for
  vhost-user operation

- and 'ioctl' on the TCP socket appears to be actually needed, on top
  of 'getattr', to dump some socket parameters

Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 contrib/selinux/passt-repair.te | 33 +++++++++++++++++++++++++++++++--
 contrib/selinux/passt.te        |  9 +++++++--
 2 files changed, 38 insertions(+), 4 deletions(-)

diff --git a/contrib/selinux/passt-repair.te b/contrib/selinux/passt-repair.te
index e3ffbcd..f171be6 100644
--- a/contrib/selinux/passt-repair.te
+++ b/contrib/selinux/passt-repair.te
@@ -28,12 +28,22 @@ require {
 	type console_device_t;
 	type user_devpts_t;
 	type user_tmp_t;
+
+	# Workaround: passt-repair needs to access socket files
+	# that passt, started by libvirt, might create under different
+	# labels, depending on whether passt is started as root or not.
+	#
+	# However, libvirt doesn't maintain its own policy, which makes
+	# updates particularly complicated. To avoid breakage in the short
+	# term, deal with that in passt's own policy.
+	type qemu_var_run_t;
+	type virt_var_run_t;
 }
 
 type passt_repair_t;
 domain_type(passt_repair_t);
 type passt_repair_exec_t;
-files_type(passt_repair_exec_t);
+corecmd_executable_file(passt_repair_exec_t);
 
 role unconfined_r types passt_repair_t;
 
@@ -41,7 +51,8 @@ allow passt_repair_t passt_repair_exec_t:file { read execute execute_no_trans en
 type_transition unconfined_t passt_repair_exec_t:process passt_repair_t;
 allow unconfined_t passt_repair_t:process transition;
 
-allow passt_repair_t self:capability { dac_override net_admin net_raw };
+allow passt_repair_t self:capability { dac_override dac_read_search net_admin net_raw };
+allow passt_repair_t self:capability2 bpf;
 
 allow passt_repair_t console_device_t:chr_file { append open getattr read write ioctl };
 allow passt_repair_t user_devpts_t:chr_file { append open getattr read write ioctl };
@@ -50,9 +61,27 @@ allow passt_repair_t unconfined_t:unix_stream_socket { connectto read write };
 allow passt_repair_t passt_t:unix_stream_socket { connectto read write };
 allow passt_repair_t user_tmp_t:unix_stream_socket { connectto read write };
 
+allow passt_repair_t user_tmp_t:dir search;
+
 allow passt_repair_t unconfined_t:sock_file { read write };
 allow passt_repair_t passt_t:sock_file { read write };
 allow passt_repair_t user_tmp_t:sock_file { read write };
 
 allow passt_repair_t unconfined_t:tcp_socket { read setopt write };
 allow passt_repair_t passt_t:tcp_socket { read setopt write };
+
+# Workaround: passt-repair needs to access socket files
+# that passt, started by libvirt, might create under different
+# labels, depending on whether passt is started as root or not.
+#
+# However, libvirt doesn't maintain its own policy, which makes
+# updates particularly complicated. To avoid breakage in the short
+# term, deal with that in passt's own policy.
+allow passt_repair_t qemu_var_run_t:unix_stream_socket { connectto read write };
+allow passt_repair_t virt_var_run_t:unix_stream_socket { connectto read write };
+
+allow passt_repair_t qemu_var_run_t:dir search;
+allow passt_repair_t virt_var_run_t:dir search;
+
+allow passt_repair_t qemu_var_run_t:sock_file { read write };
+allow passt_repair_t virt_var_run_t:sock_file { read write };
diff --git a/contrib/selinux/passt.te b/contrib/selinux/passt.te
index f595079..f8ea672 100644
--- a/contrib/selinux/passt.te
+++ b/contrib/selinux/passt.te
@@ -29,6 +29,9 @@ require {
 	# particularly complicated. To avoid breakage in the short term,
 	# deal with it in passt's own policy.
 	type svirt_image_t;
+	type svirt_tmpfs_t;
+	type svirt_t;
+	type null_device_t;
 
 	class file { ioctl getattr setattr create read write unlink open relabelto execute execute_no_trans map };
 	class dir { search write add_name remove_name mounton };
@@ -45,7 +48,7 @@ require {
 	type net_conf_t;
 	type proc_net_t;
 	type node_t;
-	class tcp_socket { create accept listen name_bind name_connect getattr };
+	class tcp_socket { create accept listen name_bind name_connect getattr ioctl };
 	class udp_socket { create accept listen };
 	class icmp_socket { bind create name_bind node_bind setopt read write };
 	class sock_file { create unlink write };
@@ -129,7 +132,7 @@ corenet_udp_sendrecv_all_ports(passt_t)
 allow passt_t node_t:icmp_socket { name_bind node_bind };
 allow passt_t port_t:icmp_socket name_bind;
 
-allow passt_t self:tcp_socket { create getopt setopt connect bind listen accept shutdown read write getattr };
+allow passt_t self:tcp_socket { create getopt setopt connect bind listen accept shutdown read write getattr ioctl };
 allow passt_t self:udp_socket { create getopt setopt connect bind read write };
 allow passt_t self:icmp_socket { bind create setopt read write };
 
@@ -143,3 +146,5 @@ allow passt_t unconfined_t:unix_stream_socket { read write };
 # particularly complicated. To avoid breakage in the short term,
 # deal with it in passt's own policy.
 allow passt_t svirt_image_t:file { read write map };
+allow passt_t svirt_tmpfs_t:file { read write map };
+allow passt_t null_device_t:chr_file map;

From 7b92f2e8525a94fb6f80d5e0bedba7eacc378714 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Thu, 27 Feb 2025 16:55:13 +1100
Subject: [PATCH 259/382] migrate, flow: Trivially succeed if migrating with no
 flows

We could get a migration request when we have no active flows; or at least
none that we need or are able to migrate.  In this case after sending or
receiving the number of flows we continue to step through various lists.

In the target case, this could include communication with passt-repair.  If
passt-repair wasn't started that could cause further errors, but of course
they shouldn't matter if we have nothing to repair.

Make it more obvious that there's nothing to do and avoid such errors by
short-circuiting flow_migrate_{source,target}() if there are no migratable
flows.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 flow.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/flow.c b/flow.c
index bb5dcc3..6cf96c2 100644
--- a/flow.c
+++ b/flow.c
@@ -999,6 +999,9 @@ int flow_migrate_source(struct ctx *c, const struct migrate_stage *stage,
 
 	debug("Sending %u flows", ntohl(count));
 
+	if (!count)
+		return 0;
+
 	/* Dump and send information that can be stored in the flow table.
 	 *
 	 * Limited rollback options here: if we fail to transfer any data (that
@@ -1070,6 +1073,9 @@ int flow_migrate_target(struct ctx *c, const struct migrate_stage *stage,
 	count = ntohl(count);
 	debug("Receiving %u flows", count);
 
+	if (!count)
+		return 0;
+
 	if ((rc = flow_migrate_repair_all(c, true)))
 		return -rc;
 

From 39f85bce1a3b9da3bd11458c521e589f674e587a Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Thu, 27 Feb 2025 16:55:14 +1100
Subject: [PATCH 260/382] migrate, flow: Don't attempt to migrate TCP flows
 without passt-repair

Migrating TCP flows requires passt-repair in order to use TCP_REPAIR.  If
passt-repair is not started, our failure mode is pretty ugly though: we'll
attempt the migration, hitting various problems when we can't enter repair
mode.  In some cases we may not roll back these changes properly, meaning
we break network connections on the source.

Our general approach is not to completely block migration if there are
problems, but simply to break any flows we can't migrate.  So, if we have
no connection from passt-repair, carry on with the migration, but don't
attempt to migrate any TCP connections.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 flow.c | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/flow.c b/flow.c
index 6cf96c2..749c498 100644
--- a/flow.c
+++ b/flow.c
@@ -923,6 +923,10 @@ static int flow_migrate_repair_all(struct ctx *c, bool enable)
 	union flow *flow;
 	int rc;
 
+	/* If we don't have a repair helper, there's nothing we can do */
+	if (c->fd_repair < 0)
+		return 0;
+
 	foreach_established_tcp_flow(flow) {
 		if (enable)
 			rc = tcp_flow_repair_on(c, &flow->tcp);
@@ -987,8 +991,11 @@ int flow_migrate_source(struct ctx *c, const struct migrate_stage *stage,
 	(void)c;
 	(void)stage;
 
-	foreach_established_tcp_flow(flow)
-		count++;
+	/* If we don't have a repair helper, we can't migrate TCP flows */
+	if (c->fd_repair >= 0) {
+		foreach_established_tcp_flow(flow)
+			count++;
+	}
 
 	count = htonl(count);
 	if (write_all_buf(fd, &count, sizeof(count))) {

From 56ce03ed0acf2a41c67d44e353c00a018604ccb7 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Thu, 27 Feb 2025 16:55:15 +1100
Subject: [PATCH 261/382] tcp: Correct error code handling from
 tcp_flow_repair_socket()

There are two small bugs in error returns from tcp_flow_repair_socket(),
which is supposed to return a negative errno code:

1) On bind() failures, we directly pass on the return code from bind(),
   which is just 0 or -1, instead of an error code.

2) In the caller, tcp_flow_migrate_target(), we call strerror_() directly
   on the negative error code, but strerror() requires a positive error
   code.

Correct both of these.
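
A minimal sketch of the convention both fixes converge on (illustrative
code, not the actual tcp.c paths):

  #include <arpa/inet.h>
  #include <errno.h>
  #include <netinet/in.h>
  #include <stdio.h>
  #include <string.h>
  #include <sys/socket.h>

  /* Return 0 on success, negative errno code on failure */
  static int bind_to(int s, const struct sockaddr_in *a)
  {
  	if (bind(s, (const struct sockaddr *)a, sizeof(*a)))
  		return -errno;	/* not bind()'s own -1 return value */
  	return 0;
  }

  int main(void)
  {
  	struct sockaddr_in a = { .sin_family = AF_INET,
  				 .sin_port = htons(22) };
  	int s = socket(AF_INET, SOCK_STREAM, 0);
  	int rc = bind_to(s, &a);

  	if (rc < 0)	/* flip the sign back before strerror() */
  		fprintf(stderr, "bind: %s\n", strerror(-rc));
  	return 0;
  }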

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 tcp.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/tcp.c b/tcp.c
index e3c0a53..8528ee3 100644
--- a/tcp.c
+++ b/tcp.c
@@ -3280,7 +3280,8 @@ int tcp_flow_repair_socket(struct ctx *c, struct tcp_tap_conn *conn)
 
 	tcp_sock_set_nodelay(s);
 
-	if ((rc = bind(s, &a.sa, sizeof(a)))) {
+	if (bind(s, &a.sa, sizeof(a))) {
+		rc = -errno;
 		err_perror("Failed to bind socket for migrated flow");
 		goto err;
 	}
@@ -3375,7 +3376,7 @@ int tcp_flow_migrate_target(struct ctx *c, int fd)
 	conn->seq_init_from_tap		= ntohl(t.seq_init_from_tap);
 
 	if ((rc = tcp_flow_repair_socket(c, conn))) {
-		flow_err(flow, "Can't set up socket: %s, drop", strerror_(rc));
+		flow_err(flow, "Can't set up socket: %s, drop", strerror_(-rc));
 		flow_alloc_cancel(flow);
 		return 0;
 	}

From b2708218a6eec82fad98da52d7569d13cf35e05c Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Thu, 27 Feb 2025 16:55:16 +1100
Subject: [PATCH 262/382] tcp: Unconditionally move to CLOSED state on
 tcp_rst()

tcp_rst() attempts to send an RST packet to the guest, and if that succeeds
moves the flow to CLOSED state.  However, even if the tcp_send_flag() fails
the flow is still dead: we've usually closed the socket already, and
something has already gone irretrievably wrong.  So we should still mark
the flow as CLOSED.  That will cause it to be cleaned up, meaning any
future packets from the guest for it won't match a flow, so should generate
new RSTs (they don't at the moment, but that's a separate bug).

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 tcp.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tcp.c b/tcp.c
index 8528ee3..d23b6d9 100644
--- a/tcp.c
+++ b/tcp.c
@@ -1214,8 +1214,8 @@ void tcp_rst_do(const struct ctx *c, struct tcp_tap_conn *conn)
 	if (conn->events == CLOSED)
 		return;
 
-	if (!tcp_send_flag(c, conn, RST))
-		conn_event(c, conn, CLOSED);
+	tcp_send_flag(c, conn, RST);
+	conn_event(c, conn, CLOSED);
 }
 
 /**

From 52419a64f2dfa31707b31148e6a311bb57be6e5f Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Thu, 27 Feb 2025 16:55:17 +1100
Subject: [PATCH 263/382] migrate, tcp: Don't flow_alloc_cancel() during
 incoming migration

In tcp_flow_migrate_target(), if we're unable to create and bind the new
socket, we print an error, cancel the flow and carry on.  This seems to
make sense based on our policy of generally letting the migration complete
even if some or all flows are lost in the process.  But it doesn't quite
work: the flow_alloc_cancel() means that the flows in the target's flow
table are no longer a one-to-one match for the flows which the source is
sending data for.  This means that data for later flows will be mismatched
to a different flow.  Most likely that will cause some nasty error later,
but even worse it might appear to succeed but lead to data corruption due
to incorrectly restoring one of the flows.

Instead, we should leave the flow in the table until we've read all the
data for it, *then* discard it.  Technically removing the
flow_alloc_cancel() would be enough for this: if tcp_flow_repair_socket()
fails it leaves conn->sock == -1, which will cause the restore functions
in tcp_flow_migrate_target_ext() to fail, discarding the flow.  To make
what's going on clearer (and with fewer extraneous error messages), put
several explicit tests for a missing socket later in the migration path to
read the data associated with the flow but explicitly discard it.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 tcp.c | 19 ++++++++++++++++---
 1 file changed, 16 insertions(+), 3 deletions(-)

diff --git a/tcp.c b/tcp.c
index d23b6d9..b3aa9a2 100644
--- a/tcp.c
+++ b/tcp.c
@@ -2708,6 +2708,9 @@ int tcp_flow_repair_on(struct ctx *c, const struct tcp_tap_conn *conn)
 {
 	int rc = 0;
 
+	if (conn->sock < 0)
+		return 0;
+
 	if ((rc = repair_set(c, conn->sock, TCP_REPAIR_ON)))
 		err("Failed to set TCP_REPAIR");
 
@@ -2725,6 +2728,9 @@ int tcp_flow_repair_off(struct ctx *c, const struct tcp_tap_conn *conn)
 {
 	int rc = 0;
 
+	if (conn->sock < 0)
+		return 0;
+
 	if ((rc = repair_set(c, conn->sock, TCP_REPAIR_OFF)))
 		err("Failed to clear TCP_REPAIR");
 
@@ -3377,7 +3383,8 @@ int tcp_flow_migrate_target(struct ctx *c, int fd)
 
 	if ((rc = tcp_flow_repair_socket(c, conn))) {
 		flow_err(flow, "Can't set up socket: %s, drop", strerror_(-rc));
-		flow_alloc_cancel(flow);
+		/* Can't leave the flow in an incomplete state */
+		FLOW_ACTIVATE(conn);
 		return 0;
 	}
 
@@ -3453,6 +3460,10 @@ int tcp_flow_migrate_target_ext(struct ctx *c, struct tcp_tap_conn *conn, int fd
 		return rc;
 	}
 
+	if (conn->sock < 0)
+		/* We weren't able to create the socket, discard flow */
+		goto fail;
+
 	if (tcp_flow_select_queue(s, TCP_SEND_QUEUE))
 		goto fail;
 
@@ -3540,8 +3551,10 @@ int tcp_flow_migrate_target_ext(struct ctx *c, struct tcp_tap_conn *conn, int fd
 	return 0;
 
 fail:
-	tcp_flow_repair_off(c, conn);
-	repair_flush(c);
+	if (conn->sock >= 0) {
+		tcp_flow_repair_off(c, conn);
+		repair_flush(c);
+	}
 
 	conn->flags = 0; /* Not waiting for ACK, don't schedule timer */
 	tcp_rst(c, conn);

From 008175636c789d36ef585a94eee4d62536cac7d6 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Wed, 5 Mar 2025 15:32:28 +1100
Subject: [PATCH 264/382] ip: Helpers to access IPv6 flow label

The flow label is a 20-bit field in the IPv6 header.  The length and
alignment make it awkward to pass around as is.  Obviously, it can be
packed into a 32-bit integer though, and we do this in two places.  We
have some further upcoming places where we want to manipulate the flow
label, so make some helpers for marshalling and unmarshalling it to an
integer.
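
A standalone round-trip check of the packing (toy struct with just the
label bytes; the real helpers added below operate on struct ipv6hdr):

  #include <assert.h>
  #include <stdint.h>
  #include <stdio.h>

  struct lbl { uint8_t flow_lbl[3]; };

  static void set_lbl(struct lbl *h, uint32_t flow)
  {
  	h->flow_lbl[0] = (flow >> 16) & 0xf;
  	h->flow_lbl[1] = (flow >>  8) & 0xff;
  	h->flow_lbl[2] = (flow >>  0) & 0xff;
  }

  static uint32_t get_lbl(const struct lbl *h)
  {
  	return (h->flow_lbl[0] & 0xf) << 16 |
  		h->flow_lbl[1] << 8 | h->flow_lbl[2];
  }

  int main(void)
  {
  	struct lbl h;

  	set_lbl(&h, 0xfffabcdeU);	/* bits above the low 20 are dropped */
  	assert(get_lbl(&h) == 0xabcde);
  	printf("label: %#x\n", get_lbl(&h));
  	return 0;
  }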

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 ip.h  | 25 +++++++++++++++++++++++++
 tap.c |  4 +---
 tcp.c |  4 +---
 3 files changed, 27 insertions(+), 6 deletions(-)

diff --git a/ip.h b/ip.h
index 858cc89..5edb7e7 100644
--- a/ip.h
+++ b/ip.h
@@ -91,6 +91,31 @@ struct ipv6_opt_hdr {
 	 */
 } __attribute__((packed));	/* required for some archs */
 
+/**
+ * ip6_set_flow_lbl() - Set flow label in an IPv6 header
+ * @ip6h:	Pointer to IPv6 header, updated
+ * @flow:	Set @ip6h flow label to the low 20 bits of this integer
+ */
+static inline void ip6_set_flow_lbl(struct ipv6hdr *ip6h, uint32_t flow)
+{
+	ip6h->flow_lbl[0] = (flow >> 16) & 0xf;
+	ip6h->flow_lbl[1] = (flow >> 8) & 0xff;
+	ip6h->flow_lbl[2] = (flow >> 0) & 0xff;
+}
+
+/** ip6_get_flow_lbl() - Get flow label from an IPv6 header
+ * @ip6h:	Pointer to IPv6 header
+ *
+ * Return: flow label from @ip6h as an integer (<= 20 bits)
+ */
+/* cppcheck-suppress unusedFunction */
+static inline uint32_t ip6_get_flow_lbl(const struct ipv6hdr *ip6h)
+{
+	return (ip6h->flow_lbl[0] & 0xf) << 16 |
+		ip6h->flow_lbl[1] << 8 |
+		ip6h->flow_lbl[2];
+}
+
 char *ipv6_l4hdr(const struct pool *p, int idx, size_t offset, uint8_t *proto,
 		 size_t *dlen);
 
diff --git a/tap.c b/tap.c
index 44b0fc0..3908262 100644
--- a/tap.c
+++ b/tap.c
@@ -241,9 +241,7 @@ static void *tap_push_ip6h(struct ipv6hdr *ip6h,
 	ip6h->hop_limit = 255;
 	ip6h->saddr = *src;
 	ip6h->daddr = *dst;
-	ip6h->flow_lbl[0] = (flow >> 16) & 0xf;
-	ip6h->flow_lbl[1] = (flow >> 8) & 0xff;
-	ip6h->flow_lbl[2] = (flow >> 0) & 0xff;
+	ip6_set_flow_lbl(ip6h, flow);
 	return ip6h + 1;
 }
 
diff --git a/tcp.c b/tcp.c
index b3aa9a2..7459803 100644
--- a/tcp.c
+++ b/tcp.c
@@ -963,9 +963,7 @@ void tcp_fill_headers(const struct tcp_tap_conn *conn,
 		ip6h->version = 6;
 		ip6h->nexthdr = IPPROTO_TCP;
 
-		ip6h->flow_lbl[0] = (conn->sock >> 16) & 0xf;
-		ip6h->flow_lbl[1] = (conn->sock >> 8) & 0xff;
-		ip6h->flow_lbl[2] = (conn->sock >> 0) & 0xff;
+		ip6_set_flow_lbl(ip6h, conn->sock);
 
 		if (!no_tcp_csum) {
 			psum = proto_ipv6_header_psum(l4len, IPPROTO_TCP,

From 1f236817ea715e9215e0fe4ecb0938d0a9809ce1 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Wed, 5 Mar 2025 15:32:29 +1100
Subject: [PATCH 265/382] tap: Consider IPv6 flow label when building packet
 sequences

To allow more batching, we group together related packets into "seqs" in
the tap layer, before passing them to the L4 protocol layers.  Currently
we consider the IP protocol, both IP addresses and also the L4 ports when
grouping things into seqs.  We ignore the IPv6 flow label.

We have some future cases where we want to consider the flow label in
the L4 code, which is awkward if we could be given a single batch with
multiple labels.  Add the flow label to tap6_l4_t and group by it as well
as the other criteria.  In future we could possibly use the flow label
_instead_ of peeking into the L4 header for the ports, but we don't do so
for now.

The guest should use the same flow label for all packets in a flow, but if
it doesn't, this change won't break anything; it just means we'll batch
things a bit sub-optimally.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 ip.h  | 1 -
 tap.c | 4 ++++
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/ip.h b/ip.h
index 5edb7e7..c82431e 100644
--- a/ip.h
+++ b/ip.h
@@ -108,7 +108,6 @@ static inline void ip6_set_flow_lbl(struct ipv6hdr *ip6h, uint32_t flow)
  *
  * Return: flow label from @ip6h as an integer (<= 20 bits)
  */
-/* cppcheck-suppress unusedFunction */
 static inline uint32_t ip6_get_flow_lbl(const struct ipv6hdr *ip6h)
 {
 	return (ip6h->flow_lbl[0] & 0xf) << 16 |
diff --git a/tap.c b/tap.c
index 3908262..202abae 100644
--- a/tap.c
+++ b/tap.c
@@ -489,6 +489,7 @@ static struct tap4_l4_t {
  * struct l4_seq6_t - Message sequence for one protocol handler call, IPv6
  * @msgs:	Count of messages in sequence
  * @protocol:	Protocol number
+ * @flow_lbl:	IPv6 flow label
  * @source:	Source port
  * @dest:	Destination port
  * @saddr:	Source address
@@ -497,6 +498,7 @@ static struct tap4_l4_t {
  */
 static struct tap6_l4_t {
 	uint8_t protocol;
+	uint32_t flow_lbl :20;
 
 	uint16_t source;
 	uint16_t dest;
@@ -870,6 +872,7 @@ resume:
 		((seq)->protocol == (proto)                &&		\
 		 (seq)->source   == (uh)->source           &&		\
 		 (seq)->dest == (uh)->dest                 &&		\
+		 (seq)->flow_lbl == ip6_get_flow_lbl(ip6h) &&		\
 		 IN6_ARE_ADDR_EQUAL(&(seq)->saddr, saddr)  &&		\
 		 IN6_ARE_ADDR_EQUAL(&(seq)->daddr, daddr))
 
@@ -878,6 +881,7 @@ resume:
 		(seq)->protocol	= (proto);				\
 		(seq)->source	= (uh)->source;				\
 		(seq)->dest	= (uh)->dest;				\
+		(seq)->flow_lbl	= ip6_get_flow_lbl(ip6h);		\
 		(seq)->saddr	= *saddr;				\
 		(seq)->daddr	= *daddr;				\
 	} while (0)

From 672d786de1c1f2aca32caedbcf440f710c4aecb5 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Wed, 5 Mar 2025 15:32:30 +1100
Subject: [PATCH 266/382] tcp: Send RST in response to guest packets that match
 no connection

Currently, if a non-SYN TCP packet arrives which doesn't match any existing
connection, we simply ignore it.  However, RFC 9293, section 3.10.7.1 says
we should respond with an RST to a non-SYN, non-RST packet that's for a
CLOSED (i.e. non-existent) connection.

This can arise in practice with migration, in cases where some error means
we have to discard a connection.  We destroy the connection with tcp_rst()
in that case, but because the guest is stopped, we may not be able to
deliver the RST packet on the tap interface immediately.  This change
ensures an RST will be sent if the guest tries to use the connection again.

A similar situation can arise if a passt/pasta instance is killed or
crashes, but is then replaced with another attached to the same guest.
This can leave the guest with stale connections that the new passt instance
isn't aware of.  It's better to send an RST so the guest knows quickly
these are broken, rather than letting them linger until they time out.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 tap.c | 17 +++++++-------
 tap.h |  6 +++++
 tcp.c | 74 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 tcp.h |  2 +-
 4 files changed, 88 insertions(+), 11 deletions(-)

diff --git a/tap.c b/tap.c
index 202abae..86d051e 100644
--- a/tap.c
+++ b/tap.c
@@ -122,7 +122,7 @@ const struct in6_addr *tap_ip6_daddr(const struct ctx *c,
  *
  * Return: pointer at which to write the packet's payload
  */
-static void *tap_push_l2h(const struct ctx *c, void *buf, uint16_t proto)
+void *tap_push_l2h(const struct ctx *c, void *buf, uint16_t proto)
 {
 	struct ethhdr *eh = (struct ethhdr *)buf;
 
@@ -143,8 +143,8 @@ static void *tap_push_l2h(const struct ctx *c, void *buf, uint16_t proto)
  *
  * Return: pointer at which to write the packet's payload
  */
-static void *tap_push_ip4h(struct iphdr *ip4h, struct in_addr src,
-			   struct in_addr dst, size_t l4len, uint8_t proto)
+void *tap_push_ip4h(struct iphdr *ip4h, struct in_addr src,
+		    struct in_addr dst, size_t l4len, uint8_t proto)
 {
 	uint16_t l3len = l4len + sizeof(*ip4h);
 
@@ -229,10 +229,9 @@ void tap_icmp4_send(const struct ctx *c, struct in_addr src, struct in_addr dst,
  *
  * Return: pointer at which to write the packet's payload
  */
-static void *tap_push_ip6h(struct ipv6hdr *ip6h,
-			   const struct in6_addr *src,
-			   const struct in6_addr *dst,
-			   size_t l4len, uint8_t proto, uint32_t flow)
+void *tap_push_ip6h(struct ipv6hdr *ip6h,
+		    const struct in6_addr *src, const struct in6_addr *dst,
+		    size_t l4len, uint8_t proto, uint32_t flow)
 {
 	ip6h->payload_len = htons(l4len);
 	ip6h->priority = 0;
@@ -744,7 +743,7 @@ append:
 			for (k = 0; k < p->count; )
 				k += tcp_tap_handler(c, PIF_TAP, AF_INET,
 						     &seq->saddr, &seq->daddr,
-						     p, k, now);
+						     0, p, k, now);
 		} else if (seq->protocol == IPPROTO_UDP) {
 			if (c->no_udp)
 				continue;
@@ -927,7 +926,7 @@ append:
 			for (k = 0; k < p->count; )
 				k += tcp_tap_handler(c, PIF_TAP, AF_INET6,
 						     &seq->saddr, &seq->daddr,
-						     p, k, now);
+						     seq->flow_lbl, p, k, now);
 		} else if (seq->protocol == IPPROTO_UDP) {
 			if (c->no_udp)
 				continue;
diff --git a/tap.h b/tap.h
index a476a12..390ac12 100644
--- a/tap.h
+++ b/tap.h
@@ -42,6 +42,9 @@ static inline void tap_hdr_update(struct tap_hdr *thdr, size_t l2len)
 		thdr->vnet_len = htonl(l2len);
 }
 
+void *tap_push_l2h(const struct ctx *c, void *buf, uint16_t proto);
+void *tap_push_ip4h(struct iphdr *ip4h, struct in_addr src,
+		     struct in_addr dst, size_t l4len, uint8_t proto);
 void tap_udp4_send(const struct ctx *c, struct in_addr src, in_port_t sport,
 		   struct in_addr dst, in_port_t dport,
 		   const void *in, size_t dlen);
@@ -49,6 +52,9 @@ void tap_icmp4_send(const struct ctx *c, struct in_addr src, struct in_addr dst,
 		    const void *in, size_t l4len);
 const struct in6_addr *tap_ip6_daddr(const struct ctx *c,
 				     const struct in6_addr *src);
+void *tap_push_ip6h(struct ipv6hdr *ip6h,
+		    const struct in6_addr *src, const struct in6_addr *dst,
+		    size_t l4len, uint8_t proto, uint32_t flow);
 void tap_udp6_send(const struct ctx *c,
 		   const struct in6_addr *src, in_port_t sport,
 		   const struct in6_addr *dst, in_port_t dport,
diff --git a/tcp.c b/tcp.c
index 7459803..fb04e2e 100644
--- a/tcp.c
+++ b/tcp.c
@@ -1866,6 +1866,75 @@ static void tcp_conn_from_sock_finish(const struct ctx *c,
 	tcp_data_from_sock(c, conn);
 }
 
+/**
+ * tcp_rst_no_conn() - Send RST in response to a packet with no connection
+ * @c:		Execution context
+ * @af:		Address family, AF_INET or AF_INET6
+ * @saddr:	Source address of the packet we're responding to
+ * @daddr:	Destination address of the packet we're responding to
+ * @flow_lbl:	IPv6 flow label (ignored for IPv4)
+ * @th:		TCP header of the packet we're responding to
+ * @l4len:	Packet length, including TCP header
+ */
+static void tcp_rst_no_conn(const struct ctx *c, int af,
+			    const void *saddr, const void *daddr,
+			    uint32_t flow_lbl,
+			    const struct tcphdr *th, size_t l4len)
+{
+	struct iov_tail payload = IOV_TAIL(NULL, 0, 0);
+	struct tcphdr *rsth;
+	char buf[USHRT_MAX];
+	uint32_t psum = 0;
+	size_t rst_l2len;
+
+	/* Don't respond to RSTs without a connection */
+	if (th->rst)
+		return;
+
+	if (af == AF_INET) {
+		struct iphdr *ip4h = tap_push_l2h(c, buf, ETH_P_IP);
+		const struct in_addr *rst_src = daddr;
+		const struct in_addr *rst_dst = saddr;
+
+		rsth = tap_push_ip4h(ip4h, *rst_src, *rst_dst,
+				     sizeof(*rsth), IPPROTO_TCP);
+		psum = proto_ipv4_header_psum(sizeof(*rsth), IPPROTO_TCP,
+					      *rst_src, *rst_dst);
+
+	} else {
+		struct ipv6hdr *ip6h = tap_push_l2h(c, buf, ETH_P_IPV6);
+		const struct in6_addr *rst_src = daddr;
+		const struct in6_addr *rst_dst = saddr;
+
+		rsth = tap_push_ip6h(ip6h, rst_src, rst_dst,
+				     sizeof(*rsth), IPPROTO_TCP, flow_lbl);
+		psum = proto_ipv6_header_psum(sizeof(*rsth), IPPROTO_TCP,
+					      rst_src, rst_dst);
+	}
+
+	memset(rsth, 0, sizeof(*rsth));
+
+	rsth->source = th->dest;
+	rsth->dest = th->source;
+	rsth->rst = 1;
+	rsth->doff = sizeof(*rsth) / 4UL;
+
+	/* Sequence matching logic from RFC 9293 section 3.10.7.1 */
+	if (th->ack) {
+		rsth->seq = th->ack_seq;
+	} else {
+		size_t dlen = l4len - th->doff * 4UL;
+		uint32_t ack = ntohl(th->seq) + dlen;
+
+		rsth->ack_seq = htonl(ack);
+		rsth->ack = 1;
+	}
+
+	tcp_update_csum(psum, rsth, &payload);
+	rst_l2len = ((char *)rsth - buf) + sizeof(*rsth);
+	tap_send_single(c, buf, rst_l2len);
+}
+
 /**
  * tcp_tap_handler() - Handle packets from tap and state transitions
  * @c:		Execution context
@@ -1873,6 +1942,7 @@ static void tcp_conn_from_sock_finish(const struct ctx *c,
  * @af:		Address family, AF_INET or AF_INET6
  * @saddr:	Source address
  * @daddr:	Destination address
+ * @flow_lbl:	IPv6 flow label (ignored for IPv4)
  * @p:		Pool of TCP packets, with TCP headers
  * @idx:	Index of first packet in pool to process
  * @now:	Current timestamp
@@ -1880,7 +1950,7 @@ static void tcp_conn_from_sock_finish(const struct ctx *c,
  * Return: count of consumed packets
  */
 int tcp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af,
-		    const void *saddr, const void *daddr,
+		    const void *saddr, const void *daddr, uint32_t flow_lbl,
 		    const struct pool *p, int idx, const struct timespec *now)
 {
 	struct tcp_tap_conn *conn;
@@ -1913,6 +1983,8 @@ int tcp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af,
 		if (opts && th->syn && !th->ack)
 			tcp_conn_from_tap(c, af, saddr, daddr, th,
 					  opts, optlen, now);
+		else
+			tcp_rst_no_conn(c, af, saddr, daddr, flow_lbl, th, len);
 		return 1;
 	}
 
diff --git a/tcp.h b/tcp.h
index cf30744..9142eca 100644
--- a/tcp.h
+++ b/tcp.h
@@ -16,7 +16,7 @@ void tcp_listen_handler(const struct ctx *c, union epoll_ref ref,
 void tcp_sock_handler(const struct ctx *c, union epoll_ref ref,
 		      uint32_t events);
 int tcp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af,
-		    const void *saddr, const void *daddr,
+		    const void *saddr, const void *daddr, uint32_t flow_lbl,
 		    const struct pool *p, int idx, const struct timespec *now);
 int tcp_sock_init(const struct ctx *c, const union inany_addr *addr,
 		  const char *ifname, in_port_t port);

From 1924e25f0723c0a86c1e33812f8e1d8aa045a146 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Wed, 5 Mar 2025 17:20:03 +1100
Subject: [PATCH 267/382] conf: Be more precise about minimum MTUs

Currently we reject the -m option if given a value less than ETH_MIN_MTU
(68).  That define is derived from the kernel, but its name is misleading:
it doesn't really have anything to do with Ethernet per se, but is rather
the minimum payload any L2 link must be able to handle in order to carry
IPv4.  For IPv6, it's not sufficient: that requires an MTU of at least
1280.
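
For reference, both minimums come straight from the protocol
specifications:

  68   = 60 (maximum IPv4 header) + 8 (minimum fragment)    [RFC 791]
  1280 = minimum IPv6 link MTU                               [RFC 8200, 5]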

Newer kernels have better-named constants, IPV4_MIN_MTU and IPV6_MIN_MTU.
Copy and use those constants instead, along with some more specific error
messages.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 conf.c | 18 +++++++++++++++---
 ip.h   |  7 +++++++
 util.h |  6 ------
 3 files changed, 22 insertions(+), 9 deletions(-)

diff --git a/conf.c b/conf.c
index c5ee07b..065e720 100644
--- a/conf.c
+++ b/conf.c
@@ -1663,9 +1663,9 @@ void conf(struct ctx *c, int argc, char **argv)
 			if (errno || *e)
 				die("Invalid MTU: %s", optarg);
 
-			if (mtu && (mtu < ETH_MIN_MTU || mtu > ETH_MAX_MTU)) {
-				die("MTU %lu out of range (%u..%u)", mtu,
-				    ETH_MIN_MTU, ETH_MAX_MTU);
+			if (mtu > ETH_MAX_MTU) {
+				die("MTU %lu too large (max %u)",
+				    mtu, ETH_MAX_MTU);
 			}
 
 			c->mtu = mtu;
@@ -1842,9 +1842,21 @@ void conf(struct ctx *c, int argc, char **argv)
 		c->ifi4 = conf_ip4(ifi4, &c->ip4);
 	if (!v4_only)
 		c->ifi6 = conf_ip6(ifi6, &c->ip6);
+
+	if (c->ifi4 && c->mtu < IPV4_MIN_MTU) {
+		warn("MTU %"PRIu16" is too small for IPv4 (minimum %u)",
+		     c->mtu, IPV4_MIN_MTU);
+	}
+	if (c->ifi6 && c->mtu < IPV6_MIN_MTU) {
+		warn("MTU %"PRIu16" is too small for IPv6 (minimum %u)",
+			     c->mtu, IPV6_MIN_MTU);
+	}
+
 	if ((*c->ip4.ifname_out && !c->ifi4) ||
 	    (*c->ip6.ifname_out && !c->ifi6))
 		die("External interface not usable");
+
+
 	if (!c->ifi4 && !c->ifi6) {
 		info("No external interface as template, switch to local mode");
 
diff --git a/ip.h b/ip.h
index c82431e..471c57e 100644
--- a/ip.h
+++ b/ip.h
@@ -129,4 +129,11 @@ static const struct in6_addr in6addr_ll_all_nodes = {
 /* IPv4 Limited Broadcast (RFC 919, Section 7), 255.255.255.255 */
 static const struct in_addr in4addr_broadcast = { 0xffffffff };
 
+#ifndef IPV4_MIN_MTU
+#define IPV4_MIN_MTU		68
+#endif
+#ifndef IPV6_MIN_MTU
+#define IPV6_MIN_MTU		1280
+#endif
+
 #endif /* IP_H */
diff --git a/util.h b/util.h
index 50e96d3..0f70f4d 100644
--- a/util.h
+++ b/util.h
@@ -34,15 +34,9 @@
 #ifndef ETH_MAX_MTU
 #define ETH_MAX_MTU			USHRT_MAX
 #endif
-#ifndef ETH_MIN_MTU
-#define ETH_MIN_MTU			68
-#endif
 #ifndef IP_MAX_MTU
 #define IP_MAX_MTU			USHRT_MAX
 #endif
-#ifndef IPV6_MIN_MTU
-#define IPV6_MIN_MTU			1280
-#endif
 
 #ifndef MIN
 #define MIN(x, y)		(((x) < (y)) ? (x) : (y))

From 82a839be988ecfdb013b5823afc93211200a9f55 Mon Sep 17 00:00:00 2001
From: Jon Maloy <jmaloy@redhat.com>
Date: Thu, 6 Mar 2025 13:00:03 -0500
Subject: [PATCH 268/382] tap: break out building of udp header from
 tap_udp4_send function

We will need to build the UDP header in places other than tap_udp4_send(),
so break that part out into a separate function.

Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Jon Maloy <jmaloy@redhat.com>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 tap.c | 34 +++++++++++++++++++++++++++-------
 tap.h |  5 +++++
 2 files changed, 32 insertions(+), 7 deletions(-)

diff --git a/tap.c b/tap.c
index 86d051e..6f7063e 100644
--- a/tap.c
+++ b/tap.c
@@ -163,7 +163,7 @@ void *tap_push_ip4h(struct iphdr *ip4h, struct in_addr src,
 }
 
 /**
- * tap_udp4_send() - Send UDP over IPv4 packet
+ * tap_push_uh4() - Build UDPv4 header with checksum
  * @c:		Execution context
  * @src:	IPv4 source address
  * @sport:	UDP source port
@@ -171,16 +171,14 @@ void *tap_push_ip4h(struct iphdr *ip4h, struct in_addr src,
  * @dport:	UDP destination port
  * @in:		UDP payload contents (not including UDP header)
  * @dlen:	UDP payload length (not including UDP header)
+ *
+ * Return: pointer at which to write the packet's payload
  */
-void tap_udp4_send(const struct ctx *c, struct in_addr src, in_port_t sport,
+void *tap_push_uh4(struct udphdr *uh, struct in_addr src, in_port_t sport,
 		   struct in_addr dst, in_port_t dport,
 		   const void *in, size_t dlen)
 {
 	size_t l4len = dlen + sizeof(struct udphdr);
-	char buf[USHRT_MAX];
-	struct iphdr *ip4h = tap_push_l2h(c, buf, ETH_P_IP);
-	struct udphdr *uh = tap_push_ip4h(ip4h, src, dst, l4len, IPPROTO_UDP);
-	char *data = (char *)(uh + 1);
 	const struct iovec iov = {
 		.iov_base = (void *)in,
 		.iov_len = dlen
@@ -191,8 +189,30 @@ void tap_udp4_send(const struct ctx *c, struct in_addr src, in_port_t sport,
 	uh->dest = htons(dport);
 	uh->len = htons(l4len);
 	csum_udp4(uh, src, dst, &payload);
-	memcpy(data, in, dlen);
+	return (char *)uh + sizeof(*uh);
+}
 
+/**
+ * tap_udp4_send() - Send UDP over IPv4 packet
+ * @c:		Execution context
+ * @src:	IPv4 source address
+ * @sport:	UDP source port
+ * @dst:	IPv4 destination address
+ * @dport:	UDP destination port
+ * @in:	UDP payload contents (not including UDP header)
+ * @dlen:	UDP payload length (not including UDP header)
+ */
+void tap_udp4_send(const struct ctx *c, struct in_addr src, in_port_t sport,
+		   struct in_addr dst, in_port_t dport,
+		   const void *in, size_t dlen)
+{
+	size_t l4len = dlen + sizeof(struct udphdr);
+	char buf[USHRT_MAX];
+	struct iphdr *ip4h = tap_push_l2h(c, buf, ETH_P_IP);
+	struct udphdr *uh = tap_push_ip4h(ip4h, src, dst, l4len, IPPROTO_UDP);
+	char *data = tap_push_uh4(uh, src, sport, dst, dport, in, dlen);
+
+	memcpy(data, in, dlen);
 	tap_send_single(c, buf, dlen + (data - buf));
 }
 
diff --git a/tap.h b/tap.h
index 390ac12..a2cf9bc 100644
--- a/tap.h
+++ b/tap.h
@@ -6,6 +6,8 @@
 #ifndef TAP_H
 #define TAP_H
 
+struct udphdr;
+
 /**
  * struct tap_hdr - tap backend specific headers
  * @vnet_len:	Frame length (for qemu socket transport)
@@ -45,6 +47,9 @@ static inline void tap_hdr_update(struct tap_hdr *thdr, size_t l2len)
 void *tap_push_l2h(const struct ctx *c, void *buf, uint16_t proto);
 void *tap_push_ip4h(struct iphdr *ip4h, struct in_addr src,
 		     struct in_addr dst, size_t l4len, uint8_t proto);
+void *tap_push_uh4(struct udphdr *uh, struct in_addr src, in_port_t sport,
+		   struct in_addr dst, in_port_t dport,
+		   const void *in, size_t dlen);
 void tap_udp4_send(const struct ctx *c, struct in_addr src, in_port_t sport,
 		   struct in_addr dst, in_port_t dport,
 		   const void *in, size_t dlen);

From 55431f0077b6a25c264bd2492680d7f99815cc5f Mon Sep 17 00:00:00 2001
From: Jon Maloy <jmaloy@redhat.com>
Date: Thu, 6 Mar 2025 13:00:04 -0500
Subject: [PATCH 269/382] udp: create and send ICMPv4 to local peer when
 applicable

When a local peer sends a UDP message to a non-existing port on an
existing remote host, that host will return an ICMP message containing
the error code ICMP_PORT_UNREACH, plus the header and the first eight
bytes of the original message. If the sender socket has been connected,
it uses this message to issue a "Connection Refused" event to the user.

Until now, we have only read such events from the externally facing
socket, but we don't forward them back to the local sender because
we cannot read the ICMP message directly into user space. Because of
this, the local peer will hang and wait for a response that never
arrives.

We now fix this for IPv4 by recreating and forwarding a correct ICMP
message back to the internal sender. We synthesize the message based
on the information in the extended error structure, plus the returned
part of the original message body.

Note that for the sake of completeness, we even produce ICMP messages
for other error codes. We have noticed that at least ICMP_PROT_UNREACH
is propagated as an error event back to the user.
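
For context, the extended error structure and the returned bytes come
from the socket error queue. A minimal sketch of that mechanism follows
(illustrative names, not the patch code):

  int on = 1;
  struct sock_extended_err *ee;  /* from <linux/errqueue.h> */
  struct cmsghdr *cmsg;
  char data[8], cbuf[CMSG_SPACE(sizeof(*ee))];
  struct iovec iov = { .iov_base = data, .iov_len = sizeof(data) };
  struct msghdr mh = { .msg_iov = &iov, .msg_iovlen = 1,
                       .msg_control = cbuf, .msg_controllen = sizeof(cbuf) };

  setsockopt(s, IPPROTO_IP, IP_RECVERR, &on, sizeof(on));
  /* ...later, once the socket reports an error condition... */
  if (recvmsg(s, &mh, MSG_ERRQUEUE) >= 0 &&
      (cmsg = CMSG_FIRSTHDR(&mh)) &&
      cmsg->cmsg_level == IPPROTO_IP && cmsg->cmsg_type == IP_RECVERR) {
          ee = (struct sock_extended_err *)CMSG_DATA(cmsg);
          /* ee->ee_type and ee->ee_code mirror the ICMP type and code,
           * and 'data' now holds the leading bytes of the original
           * datagram: enough to rebuild an ICMP message for the guest */
  }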

Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Jon Maloy <jmaloy@redhat.com>
[sbrivio: fix cppcheck warning: udp_send_conn_fail_icmp4() doesn't
 modify 'in', it can be declared as const]
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 tap.c          |  2 +-
 tap.h          |  2 ++
 udp.c          | 87 +++++++++++++++++++++++++++++++++++++++++++-------
 udp_internal.h |  2 +-
 udp_vu.c       |  4 +--
 5 files changed, 81 insertions(+), 16 deletions(-)

diff --git a/tap.c b/tap.c
index 6f7063e..57d0795 100644
--- a/tap.c
+++ b/tap.c
@@ -159,7 +159,7 @@ void *tap_push_ip4h(struct iphdr *ip4h, struct in_addr src,
 	ip4h->saddr = src.s_addr;
 	ip4h->daddr = dst.s_addr;
 	ip4h->check = csum_ip4_header(l3len, proto, src, dst);
-	return ip4h + 1;
+	return (char *)ip4h + sizeof(*ip4h);
 }
 
 /**
diff --git a/tap.h b/tap.h
index a2cf9bc..9ac17ce 100644
--- a/tap.h
+++ b/tap.h
@@ -50,6 +50,8 @@ void *tap_push_ip4h(struct iphdr *ip4h, struct in_addr src,
 void *tap_push_uh4(struct udphdr *uh, struct in_addr src, in_port_t sport,
 		   struct in_addr dst, in_port_t dport,
 		   const void *in, size_t dlen);
+void *tap_push_ip4h(struct iphdr *ip4h, struct in_addr src,
+		    struct in_addr dst, size_t l4len, uint8_t proto);
 void tap_udp4_send(const struct ctx *c, struct in_addr src, in_port_t sport,
 		   struct in_addr dst, in_port_t dport,
 		   const void *in, size_t dlen);
diff --git a/udp.c b/udp.c
index 923cc38..b72c3ce 100644
--- a/udp.c
+++ b/udp.c
@@ -87,6 +87,7 @@
 #include <netinet/in.h>
 #include <netinet/ip.h>
 #include <netinet/udp.h>
+#include <netinet/ip_icmp.h>
 #include <stdint.h>
 #include <stddef.h>
 #include <string.h>
@@ -112,6 +113,9 @@
 #include "udp_internal.h"
 #include "udp_vu.h"
 
+/* Maximum UDP data to be returned in ICMP messages */
+#define ICMP4_MAX_DLEN 8
+
 /* "Spliced" sockets indexed by bound port (host order) */
 static int udp_splice_ns  [IP_VERSIONS][NUM_PORTS];
 static int udp_splice_init[IP_VERSIONS][NUM_PORTS];
@@ -402,25 +406,76 @@ static void udp_tap_prepare(const struct mmsghdr *mmh,
 	(*tap_iov)[UDP_IOV_PAYLOAD].iov_len = l4len;
 }
 
+/**
+ * udp_send_conn_fail_icmp4() - Construct and send ICMPv4 to local peer
+ * @c:		Execution context
+ * @ee:	Extended error descriptor
+ * @toside:	Destination side of flow
+ * @saddr:	Address of ICMP generating node
+ * @in:	First bytes (max 8) of original UDP message body
+ * @dlen:	Length of the read part of original UDP message body
+ */
+static void udp_send_conn_fail_icmp4(const struct ctx *c,
+				     const struct sock_extended_err *ee,
+				     const struct flowside *toside,
+				     struct in_addr saddr,
+				     const void *in, size_t dlen)
+{
+	struct in_addr oaddr = toside->oaddr.v4mapped.a4;
+	struct in_addr eaddr = toside->eaddr.v4mapped.a4;
+	in_port_t eport = toside->eport;
+	in_port_t oport = toside->oport;
+	struct {
+		struct icmphdr icmp4h;
+		struct iphdr ip4h;
+		struct udphdr uh;
+		char data[ICMP4_MAX_DLEN];
+	} __attribute__((packed, aligned(__alignof__(max_align_t)))) msg;
+	size_t msglen = sizeof(msg) - sizeof(msg.data) + dlen;
+	size_t l4len = dlen + sizeof(struct udphdr);
+
+	ASSERT(dlen <= ICMP4_MAX_DLEN);
+	memset(&msg, 0, sizeof(msg));
+	msg.icmp4h.type = ee->ee_type;
+	msg.icmp4h.code = ee->ee_code;
+	if (ee->ee_type == ICMP_DEST_UNREACH && ee->ee_code == ICMP_FRAG_NEEDED)
+		msg.icmp4h.un.frag.mtu = htons((uint16_t) ee->ee_info);
+
+	/* Reconstruct the original headers as returned in the ICMP message */
+	tap_push_ip4h(&msg.ip4h, eaddr, oaddr, l4len, IPPROTO_UDP);
+	tap_push_uh4(&msg.uh, eaddr, eport, oaddr, oport, in, dlen);
+	memcpy(&msg.data, in, dlen);
+
+	tap_icmp4_send(c, saddr, eaddr, &msg, msglen);
+}
+
 /**
  * udp_sock_recverr() - Receive and clear an error from a socket
- * @s:		Socket to receive from
+ * @c:		Execution context
+ * @ref:	epoll reference
  *
  * Return: 1 if error received and processed, 0 if no more errors in queue, < 0
  *         if there was an error reading the queue
  *
  * #syscalls recvmsg
  */
-static int udp_sock_recverr(int s)
+static int udp_sock_recverr(const struct ctx *c, union epoll_ref ref)
 {
 	const struct sock_extended_err *ee;
 	const struct cmsghdr *hdr;
+	union sockaddr_inany saddr;
 	char buf[CMSG_SPACE(sizeof(*ee))];
+	char data[ICMP4_MAX_DLEN];
+	int s = ref.fd;
+	struct iovec iov = {
+		.iov_base = data,
+		.iov_len = sizeof(data)
+	};
 	struct msghdr mh = {
-		.msg_name = NULL,
-		.msg_namelen = 0,
-		.msg_iov = NULL,
-		.msg_iovlen = 0,
+		.msg_name = &saddr,
+		.msg_namelen = sizeof(saddr),
+		.msg_iov = &iov,
+		.msg_iovlen = 1,
 		.msg_control = buf,
 		.msg_controllen = sizeof(buf),
 	};
@@ -450,8 +505,15 @@ static int udp_sock_recverr(int s)
 	}
 
 	ee = (const struct sock_extended_err *)CMSG_DATA(hdr);
+	if (ref.type == EPOLL_TYPE_UDP_REPLY) {
+		flow_sidx_t sidx = flow_sidx_opposite(ref.flowside);
+		const struct flowside *toside = flowside_at_sidx(sidx);
 
-	/* TODO: When possible propagate and otherwise handle errors */
+		udp_send_conn_fail_icmp4(c, ee, toside, saddr.sa4.sin_addr,
+					 data, rc);
+	} else {
+		trace("Ignoring received IP_RECVERR cmsg on listener socket");
+	}
 	debug("%s error on UDP socket %i: %s",
 	      str_ee_origin(ee), s, strerror_(ee->ee_errno));
 
@@ -461,15 +523,16 @@ static int udp_sock_recverr(int s)
 /**
  * udp_sock_errs() - Process errors on a socket
  * @c:		Execution context
- * @s:		Socket to receive from
+ * @ref:	epoll reference
  * @events:	epoll events bitmap
  *
  * Return: Number of errors handled, or < 0 if we have an unrecoverable error
  */
-int udp_sock_errs(const struct ctx *c, int s, uint32_t events)
+int udp_sock_errs(const struct ctx *c, union epoll_ref ref, uint32_t events)
 {
 	unsigned n_err = 0;
 	socklen_t errlen;
+	int s = ref.fd;
 	int rc, err;
 
 	ASSERT(!c->no_udp);
@@ -478,7 +541,7 @@ int udp_sock_errs(const struct ctx *c, int s, uint32_t events)
 		return 0; /* Nothing to do */
 
 	/* Empty the error queue */
-	while ((rc = udp_sock_recverr(s)) > 0)
+	while ((rc = udp_sock_recverr(c, ref)) > 0)
 		n_err += rc;
 
 	if (rc < 0)
@@ -558,7 +621,7 @@ static void udp_buf_listen_sock_handler(const struct ctx *c,
 	const socklen_t sasize = sizeof(udp_meta[0].s_in);
 	int n, i;
 
-	if (udp_sock_errs(c, ref.fd, events) < 0) {
+	if (udp_sock_errs(c, ref, events) < 0) {
 		err("UDP: Unrecoverable error on listening socket:"
 		    " (%s port %hu)", pif_name(ref.udp.pif), ref.udp.port);
 		/* FIXME: what now?  close/re-open socket? */
@@ -661,7 +724,7 @@ static void udp_buf_reply_sock_handler(const struct ctx *c, union epoll_ref ref,
 
 	from_s = uflow->s[ref.flowside.sidei];
 
-	if (udp_sock_errs(c, from_s, events) < 0) {
+	if (udp_sock_errs(c, ref, events) < 0) {
 		flow_err(uflow, "Unrecoverable error on reply socket");
 		flow_err_details(uflow);
 		udp_flow_close(c, uflow);
diff --git a/udp_internal.h b/udp_internal.h
index cc80e30..3b081f5 100644
--- a/udp_internal.h
+++ b/udp_internal.h
@@ -30,5 +30,5 @@ size_t udp_update_hdr4(struct iphdr *ip4h, struct udp_payload_t *bp,
 size_t udp_update_hdr6(struct ipv6hdr *ip6h, struct udp_payload_t *bp,
                        const struct flowside *toside, size_t dlen,
 		       bool no_udp_csum);
-int udp_sock_errs(const struct ctx *c, int s, uint32_t events);
+int udp_sock_errs(const struct ctx *c, union epoll_ref ref, uint32_t events);
 #endif /* UDP_INTERNAL_H */
diff --git a/udp_vu.c b/udp_vu.c
index 4123510..c26a223 100644
--- a/udp_vu.c
+++ b/udp_vu.c
@@ -227,7 +227,7 @@ void udp_vu_listen_sock_handler(const struct ctx *c, union epoll_ref ref,
 	struct vu_virtq *vq = &vdev->vq[VHOST_USER_RX_QUEUE];
 	int i;
 
-	if (udp_sock_errs(c, ref.fd, events) < 0) {
+	if (udp_sock_errs(c, ref, events) < 0) {
 		err("UDP: Unrecoverable error on listening socket:"
 		    " (%s port %hu)", pif_name(ref.udp.pif), ref.udp.port);
 		return;
@@ -302,7 +302,7 @@ void udp_vu_reply_sock_handler(const struct ctx *c, union epoll_ref ref,
 
 	ASSERT(!c->no_udp);
 
-	if (udp_sock_errs(c, from_s, events) < 0) {
+	if (udp_sock_errs(c, ref, events) < 0) {
 		flow_err(uflow, "Unrecoverable error on reply socket");
 		flow_err_details(uflow);
 		udp_flow_close(c, uflow);

From 87e6a464429372dfaa7212b61e5062dad87179dc Mon Sep 17 00:00:00 2001
From: Jon Maloy <jmaloy@redhat.com>
Date: Thu, 6 Mar 2025 13:00:05 -0500
Subject: [PATCH 270/382] tap: break out building of udp header from
 tap_udp6_send function

We will need to build the UDP header in places other than tap_udp6_send(),
so break that part out into a separate function.

Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Jon Maloy <jmaloy@redhat.com>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 tap.c | 46 ++++++++++++++++++++++++++++++++++------------
 tap.h |  4 ++++
 2 files changed, 38 insertions(+), 12 deletions(-)

diff --git a/tap.c b/tap.c
index 57d0795..7082620 100644
--- a/tap.c
+++ b/tap.c
@@ -265,7 +265,7 @@ void *tap_push_ip6h(struct ipv6hdr *ip6h,
 }
 
 /**
- * tap_udp6_send() - Send UDP over IPv6 packet
+ * tap_push_uh6() - Build UDPv6 header with checksum
  * @c:		Execution context
  * @src:	IPv6 source address
  * @sport:	UDP source port
@@ -274,6 +274,38 @@ void *tap_push_ip6h(struct ipv6hdr *ip6h,
  * @flow:	Flow label
  * @in:		UDP payload contents (not including UDP header)
  * @dlen:	UDP payload length (not including UDP header)
+ *
+ * Return: pointer at which to write the packet's payload
+ */
+void *tap_push_uh6(struct udphdr *uh,
+		   const struct in6_addr *src, in_port_t sport,
+		   const struct in6_addr *dst, in_port_t dport,
+		   void *in, size_t dlen)
+{
+	size_t l4len = dlen + sizeof(struct udphdr);
+	const struct iovec iov = {
+		.iov_base = in,
+		.iov_len = dlen
+	};
+	struct iov_tail payload = IOV_TAIL(&iov, 1, 0);
+
+	uh->source = htons(sport);
+	uh->dest = htons(dport);
+	uh->len = htons(l4len);
+	csum_udp6(uh, src, dst, &payload);
+	return (char *)uh + sizeof(*uh);
+}
+
+/**
+ * tap_udp6_send() - Send UDP over IPv6 packet
+ * @c:		Execution context
+ * @src:	IPv6 source address
+ * @sport:	UDP source port
+ * @dst:	IPv6 destination address
+ * @dport:	UDP destination port
+ * @flow:	Flow label
+ * @in:	UDP payload contents (not including UDP header)
+ * @dlen:	UDP payload length (not including UDP header)
  */
 void tap_udp6_send(const struct ctx *c,
 		   const struct in6_addr *src, in_port_t sport,
@@ -285,19 +317,9 @@ void tap_udp6_send(const struct ctx *c,
 	struct ipv6hdr *ip6h = tap_push_l2h(c, buf, ETH_P_IPV6);
 	struct udphdr *uh = tap_push_ip6h(ip6h, src, dst,
 					  l4len, IPPROTO_UDP, flow);
-	char *data = (char *)(uh + 1);
-	const struct iovec iov = {
-		.iov_base = in,
-		.iov_len = dlen
-	};
-	struct iov_tail payload = IOV_TAIL(&iov, 1, 0);
+	char *data = tap_push_uh6(uh, src, sport, dst, dport, in, dlen);
 
-	uh->source = htons(sport);
-	uh->dest = htons(dport);
-	uh->len = htons(l4len);
-	csum_udp6(uh, src, dst, &payload);
 	memcpy(data, in, dlen);
-
 	tap_send_single(c, buf, dlen + (data - buf));
 }
 
diff --git a/tap.h b/tap.h
index 9ac17ce..b53a5b8 100644
--- a/tap.h
+++ b/tap.h
@@ -50,6 +50,10 @@ void *tap_push_ip4h(struct iphdr *ip4h, struct in_addr src,
 void *tap_push_uh4(struct udphdr *uh, struct in_addr src, in_port_t sport,
 		   struct in_addr dst, in_port_t dport,
 		   const void *in, size_t dlen);
+void *tap_push_uh6(struct udphdr *uh,
+		   const struct in6_addr *src, in_port_t sport,
+		   const struct in6_addr *dst, in_port_t dport,
+		   void *in, size_t dlen);
 void *tap_push_ip4h(struct iphdr *ip4h, struct in_addr src,
 		    struct in_addr dst, size_t l4len, uint8_t proto);
 void tap_udp4_send(const struct ctx *c, struct in_addr src, in_port_t sport,

From 68b04182e07da6a437479cb191e5468db382bc56 Mon Sep 17 00:00:00 2001
From: Jon Maloy <jmaloy@redhat.com>
Date: Thu, 6 Mar 2025 13:00:06 -0500
Subject: [PATCH 271/382] udp: create and send ICMPv6 to local peer when
 applicable

When a local peer sends a UDP message to a non-existing port on an
existing remote host, that host will return an ICMPv6 message containing
the error code ICMP6_DST_UNREACH_NOPORT, plus the IPv6 header, UDP header
and the first 1232 bytes of the original message, if any. If the sender
socket has been connected, it uses this message to issue a
"Connection Refused" event to the user.

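The 1232-byte figure simply mirrors the new ICMP6_MAX_DLEN definition,
that is, what fits next to the reflected headers within the IPv6 minimum
MTU:

  1280 (IPV6_MIN_MTU) - 40 (struct ipv6hdr) - 8 (struct udphdr) = 1232
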
Until now, we have only read such events from the externally facing
socket, but we don't forward them back to the local sender because
we cannot read the ICMP message directly into user space. Because of
this, the local peer will hang and wait for a response that never
arrives.

We now fix this for IPv6 by recreating and forwarding a correct ICMP
message back to the internal sender. We synthesize the message based
on the information in the extended error structure, plus the returned
part of the original message body.

Note that for the sake of completeness, we even produce ICMP messages
for other error types and codes. We have noticed that at least
ICMP_PROT_UNREACH is propagated as an error event back to the user.

Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Jon Maloy <jmaloy@redhat.com>
[sbrivio: fix cppcheck warning, udp_send_conn_fail_icmp6() doesn't
 modify saddr which can be declared as const]
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 tap.c |  2 +-
 tap.h |  4 ++++
 udp.c | 63 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++---
 3 files changed, 65 insertions(+), 4 deletions(-)

diff --git a/tap.c b/tap.c
index 7082620..4541f51 100644
--- a/tap.c
+++ b/tap.c
@@ -261,7 +261,7 @@ void *tap_push_ip6h(struct ipv6hdr *ip6h,
 	ip6h->saddr = *src;
 	ip6h->daddr = *dst;
 	ip6_set_flow_lbl(ip6h, flow);
-	return ip6h + 1;
+	return (char *)ip6h + sizeof(*ip6h);
 }
 
 /**
diff --git a/tap.h b/tap.h
index b53a5b8..a2c3b87 100644
--- a/tap.h
+++ b/tap.h
@@ -56,6 +56,10 @@ void *tap_push_uh6(struct udphdr *uh,
 		   void *in, size_t dlen);
 void *tap_push_ip4h(struct iphdr *ip4h, struct in_addr src,
 		    struct in_addr dst, size_t l4len, uint8_t proto);
+void *tap_push_ip6h(struct ipv6hdr *ip6h,
+		    const struct in6_addr *src,
+		    const struct in6_addr *dst,
+		    size_t l4len, uint8_t proto, uint32_t flow);
 void tap_udp4_send(const struct ctx *c, struct in_addr src, in_port_t sport,
 		   struct in_addr dst, in_port_t dport,
 		   const void *in, size_t dlen);
diff --git a/udp.c b/udp.c
index b72c3ce..80520cb 100644
--- a/udp.c
+++ b/udp.c
@@ -88,6 +88,7 @@
 #include <netinet/ip.h>
 #include <netinet/udp.h>
 #include <netinet/ip_icmp.h>
+#include <netinet/icmp6.h>
 #include <stdint.h>
 #include <stddef.h>
 #include <string.h>
@@ -115,6 +116,9 @@
 
 /* Maximum UDP data to be returned in ICMP messages */
 #define ICMP4_MAX_DLEN 8
+#define ICMP6_MAX_DLEN (IPV6_MIN_MTU			\
+			- sizeof(struct udphdr)	\
+			- sizeof(struct ipv6hdr))
 
 /* "Spliced" sockets indexed by bound port (host order) */
 static int udp_splice_ns  [IP_VERSIONS][NUM_PORTS];
@@ -449,6 +453,51 @@ static void udp_send_conn_fail_icmp4(const struct ctx *c,
 	tap_icmp4_send(c, saddr, eaddr, &msg, msglen);
 }
 
+
+/**
+ * udp_send_conn_fail_icmp6() - Construct and send ICMPv6 to local peer
+ * @c:		Execution context
+ * @ee:	Extended error descriptor
+ * @toside:	Destination side of flow
+ * @saddr:	Address of ICMP generating node
+ * @in:	First bytes (max 1232) of original UDP message body
+ * @dlen:	Length of the read part of original UDP message body
+ * @flow:	IPv6 flow identifier
+ */
+static void udp_send_conn_fail_icmp6(const struct ctx *c,
+				     const struct sock_extended_err *ee,
+				     const struct flowside *toside,
+				     const struct in6_addr *saddr,
+				     void *in, size_t dlen, uint32_t flow)
+{
+	const struct in6_addr *oaddr = &toside->oaddr.a6;
+	const struct in6_addr *eaddr = &toside->eaddr.a6;
+	in_port_t eport = toside->eport;
+	in_port_t oport = toside->oport;
+	struct {
+		struct icmp6_hdr icmp6h;
+		struct ipv6hdr ip6h;
+		struct udphdr uh;
+		char data[ICMP6_MAX_DLEN];
+	} __attribute__((packed, aligned(__alignof__(max_align_t)))) msg;
+	size_t msglen = sizeof(msg) - sizeof(msg.data) + dlen;
+	size_t l4len = dlen + sizeof(struct udphdr);
+
+	ASSERT(dlen <= ICMP6_MAX_DLEN);
+	memset(&msg, 0, sizeof(msg));
+	msg.icmp6h.icmp6_type = ee->ee_type;
+	msg.icmp6h.icmp6_code = ee->ee_code;
+	if (ee->ee_type == ICMP6_PACKET_TOO_BIG)
+		msg.icmp6h.icmp6_dataun.icmp6_un_data32[0] = htonl(ee->ee_info);
+
+	/* Reconstruct the original headers as returned in the ICMP message */
+	tap_push_ip6h(&msg.ip6h, eaddr, oaddr, l4len, IPPROTO_UDP, flow);
+	tap_push_uh6(&msg.uh, eaddr, eport, oaddr, oport, in, dlen);
+	memcpy(&msg.data, in, dlen);
+
+	tap_icmp6_send(c, saddr, eaddr, &msg, msglen);
+}
+
 /**
  * udp_sock_recverr() - Receive and clear an error from a socket
  * @c:		Execution context
@@ -465,7 +514,7 @@ static int udp_sock_recverr(const struct ctx *c, union epoll_ref ref)
 	const struct cmsghdr *hdr;
 	union sockaddr_inany saddr;
 	char buf[CMSG_SPACE(sizeof(*ee))];
-	char data[ICMP4_MAX_DLEN];
+	char data[ICMP6_MAX_DLEN];
 	int s = ref.fd;
 	struct iovec iov = {
 		.iov_base = data,
@@ -508,9 +557,17 @@ static int udp_sock_recverr(const struct ctx *c, union epoll_ref ref)
 	if (ref.type == EPOLL_TYPE_UDP_REPLY) {
 		flow_sidx_t sidx = flow_sidx_opposite(ref.flowside);
 		const struct flowside *toside = flowside_at_sidx(sidx);
+		size_t dlen = rc;
 
-		udp_send_conn_fail_icmp4(c, ee, toside, saddr.sa4.sin_addr,
-					 data, rc);
+		if (hdr->cmsg_level == IPPROTO_IP) {
+			dlen = MIN(dlen, ICMP4_MAX_DLEN);
+			udp_send_conn_fail_icmp4(c, ee, toside, saddr.sa4.sin_addr,
+						 data, dlen);
+		} else if (hdr->cmsg_level == IPPROTO_IPV6) {
+			udp_send_conn_fail_icmp6(c, ee, toside,
+						 &saddr.sa6.sin6_addr,
+						 data, dlen, sidx.flowi);
+		}
 	} else {
 		trace("Ignoring received IP_RECVERR cmsg on listener socket");
 	}

From 57d2db370b9c12aca84901d968c2c31db89ca462 Mon Sep 17 00:00:00 2001
From: David Gibson <dgibson@redhat.com>
Date: Wed, 5 Mar 2025 17:15:03 +1100
Subject: [PATCH 272/382] treewide: Mark assorted functions static

This marks static a number of functions which are only used in their .c
file, have no prototypes in a .h and were never intended to be globally
exposed.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 log.c     | 2 +-
 netlink.c | 2 +-
 passt.c   | 2 +-
 tcp.c     | 6 +++---
 4 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/log.c b/log.c
index 95e4576..b6bce21 100644
--- a/log.c
+++ b/log.c
@@ -56,7 +56,7 @@ bool		log_stderr = true;	/* Not daemonised, no shell spawned */
  *
  * Return: pointer to @now, or NULL if there was an error retrieving the time
  */
-const struct timespec *logtime(struct timespec *ts)
+static const struct timespec *logtime(struct timespec *ts)
 {
 	if (clock_gettime(CLOCK_MONOTONIC, ts))
 		return NULL;
diff --git a/netlink.c b/netlink.c
index 37d8b5b..a052504 100644
--- a/netlink.c
+++ b/netlink.c
@@ -355,7 +355,7 @@ unsigned int nl_get_ext_if(int s, sa_family_t af)
  *
  * Return: true if a gateway was found, false otherwise
  */
-bool nl_route_get_def_multipath(struct rtattr *rta, void *gw)
+static bool nl_route_get_def_multipath(struct rtattr *rta, void *gw)
 {
 	int nh_len = RTA_PAYLOAD(rta);
 	struct rtnexthop *rtnh;
diff --git a/passt.c b/passt.c
index 68d1a28..868842b 100644
--- a/passt.c
+++ b/passt.c
@@ -166,7 +166,7 @@ void proto_update_l2_buf(const unsigned char *eth_d, const unsigned char *eth_s)
  *
  * #syscalls exit_group
  */
-void exit_handler(int signal)
+static void exit_handler(int signal)
 {
 	(void)signal;
 
diff --git a/tcp.c b/tcp.c
index fb04e2e..4c24367 100644
--- a/tcp.c
+++ b/tcp.c
@@ -2497,7 +2497,7 @@ static void tcp_ns_sock_init6(const struct ctx *c, in_port_t port)
  * @c:		Execution context
  * @port:	Port, host order
  */
-void tcp_ns_sock_init(const struct ctx *c, in_port_t port)
+static void tcp_ns_sock_init(const struct ctx *c, in_port_t port)
 {
 	ASSERT(!c->no_tcp);
 
@@ -3141,7 +3141,7 @@ static int tcp_flow_dump_rcvqueue(int s, struct tcp_tap_transfer_ext *t)
  *
  * Return: 0 on success, negative error code on failure
  */
-int tcp_flow_repair_opt(int s, const struct tcp_tap_transfer_ext *t)
+static int tcp_flow_repair_opt(int s, const struct tcp_tap_transfer_ext *t)
 {
 	const struct tcp_repair_opt opts[] = {
 		{ TCPOPT_WINDOW,		t->snd_ws + (t->rcv_ws << 16) },
@@ -3333,7 +3333,7 @@ fail:
  *
  * Return: 0 on success, negative error code on failure
  */
-int tcp_flow_repair_socket(struct ctx *c, struct tcp_tap_conn *conn)
+static int tcp_flow_repair_socket(struct ctx *c, struct tcp_tap_conn *conn)
 {
 	sa_family_t af = CONN_V4(conn) ? AF_INET : AF_INET6;
 	const struct flowside *sockside = HOSTFLOW(conn);

From e36c35c952ef0848383cba8ef71e13cf25dab2da Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Wed, 5 Mar 2025 17:15:04 +1100
Subject: [PATCH 273/382] log: Don't export passt_vsyslog()

passt_vsyslog() is an exposed function in log.h.  However it shouldn't
be called from outside log.c: it writes specifically to the system log,
and most code should call passt's logging helpers which might go to the
syslog or to a log file.

Make passt_vsyslog() local to log.c.  This requires some code motion to
avoid a forward declaration.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 log.c | 48 ++++++++++++++++++++++++------------------------
 log.h |  1 -
 2 files changed, 24 insertions(+), 25 deletions(-)

diff --git a/log.c b/log.c
index b6bce21..6eda4c4 100644
--- a/log.c
+++ b/log.c
@@ -249,6 +249,30 @@ static void logfile_write(bool newline, bool cont, int pri,
 		log_written += n;
 }
 
+/**
+ * passt_vsyslog() - vsyslog() implementation not using heap memory
+ * @newline:	Append newline at the end of the message, if missing
+ * @pri:	Facility and level map, same as priority for vsyslog()
+ * @format:	Same as vsyslog() format
+ * @ap:		Same as vsyslog() ap
+ */
+static void passt_vsyslog(bool newline, int pri, const char *format, va_list ap)
+{
+	char buf[BUFSIZ];
+	int n;
+
+	/* Send without timestamp, the system logger should add it */
+	n = snprintf(buf, BUFSIZ, "<%i> %s: ", pri, log_ident);
+
+	n += vsnprintf(buf + n, BUFSIZ - n, format, ap);
+
+	if (newline && format[strlen(format)] != '\n')
+		n += snprintf(buf + n, BUFSIZ - n, "\n");
+
+	if (log_sock >= 0 && send(log_sock, buf, n, 0) != n && log_stderr)
+		FPRINTF(stderr, "Failed to send %i bytes to syslog\n", n);
+}
+
 /**
  * vlogmsg() - Print or send messages to log or output files as configured
  * @newline:	Append newline at the end of the message, if missing
@@ -373,30 +397,6 @@ void __setlogmask(int mask)
 	setlogmask(mask);
 }
 
-/**
- * passt_vsyslog() - vsyslog() implementation not using heap memory
- * @newline:	Append newline at the end of the message, if missing
- * @pri:	Facility and level map, same as priority for vsyslog()
- * @format:	Same as vsyslog() format
- * @ap:		Same as vsyslog() ap
- */
-void passt_vsyslog(bool newline, int pri, const char *format, va_list ap)
-{
-	char buf[BUFSIZ];
-	int n;
-
-	/* Send without timestamp, the system logger should add it */
-	n = snprintf(buf, BUFSIZ, "<%i> %s: ", pri, log_ident);
-
-	n += vsnprintf(buf + n, BUFSIZ - n, format, ap);
-
-	if (newline && format[strlen(format)] != '\n')
-		n += snprintf(buf + n, BUFSIZ - n, "\n");
-
-	if (log_sock >= 0 && send(log_sock, buf, n, 0) != n && log_stderr)
-		FPRINTF(stderr, "Failed to send %i bytes to syslog\n", n);
-}
-
 /**
  * logfile_init() - Open log file and write header with PID, version, path
  * @name:	Identifier for header: passt or pasta
diff --git a/log.h b/log.h
index 22c7b9a..08aa88c 100644
--- a/log.h
+++ b/log.h
@@ -55,7 +55,6 @@ void trace_init(int enable);
 
 void __openlog(const char *ident, int option, int facility);
 void logfile_init(const char *name, const char *path, size_t size);
-void passt_vsyslog(bool newline, int pri, const char *format, va_list ap);
 void __setlogmask(int mask);
 
 #endif /* LOG_H */

From 12d5b36b2f17a1ddc9447b925dbec161b4da346a Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Wed, 5 Mar 2025 17:15:05 +1100
Subject: [PATCH 274/382] checksum: Don't export various functions

Several of the exposed functions in checksum.h are no longer directly used.
Remove them from the header, and make static.  In particular sum_16b()
should not be used outside: generally csum_unfolded() should be used which
will automatically use either the AVX2 optimized version or sum_16b() as
necessary.

csum_fold() and csum() could have external uses, but they're not used right
now.  We can expose them again if we need to.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 checksum.c | 34 +++++++++++++++++-----------------
 checksum.h |  3 ---
 2 files changed, 17 insertions(+), 20 deletions(-)

diff --git a/checksum.c b/checksum.c
index b01e0fe..0894eca 100644
--- a/checksum.c
+++ b/checksum.c
@@ -85,7 +85,7 @@
  */
 /* NOLINTNEXTLINE(clang-diagnostic-unknown-attributes) */
 __attribute__((optimize("-fno-strict-aliasing")))
-uint32_t sum_16b(const void *buf, size_t len)
+static uint32_t sum_16b(const void *buf, size_t len)
 {
 	const uint16_t *p = buf;
 	uint32_t sum = 0;
@@ -107,7 +107,7 @@ uint32_t sum_16b(const void *buf, size_t len)
  *
  * Return: 16-bit folded sum
  */
-uint16_t csum_fold(uint32_t sum)
+static uint16_t csum_fold(uint32_t sum)
 {
 	while (sum >> 16)
 		sum = (sum & 0xffff) + (sum >> 16);
@@ -161,6 +161,21 @@ uint32_t proto_ipv4_header_psum(uint16_t l4len, uint8_t protocol,
 	return psum;
 }
 
+/**
+ * csum() - Compute TCP/IP-style checksum
+ * @buf:	Input buffer
+ * @len:	Input length
+ * @init:	Initial 32-bit checksum, 0 for no pre-computed checksum
+ *
+ * Return: 16-bit folded, complemented checksum
+ */
+/* NOLINTNEXTLINE(clang-diagnostic-unknown-attributes) */
+__attribute__((optimize("-fno-strict-aliasing")))	/* See csum_16b() */
+static uint16_t csum(const void *buf, size_t len, uint32_t init)
+{
+	return (uint16_t)~csum_fold(csum_unfolded(buf, len, init));
+}
+
 /**
  * csum_udp4() - Calculate and set checksum for a UDP over IPv4 packet
  * @udp4hr:	UDP header, initialised apart from checksum
@@ -482,21 +497,6 @@ uint32_t csum_unfolded(const void *buf, size_t len, uint32_t init)
 }
 #endif /* !__AVX2__ */
 
-/**
- * csum() - Compute TCP/IP-style checksum
- * @buf:	Input buffer
- * @len:	Input length
- * @init:	Initial 32-bit checksum, 0 for no pre-computed checksum
- *
- * Return: 16-bit folded, complemented checksum
- */
-/* NOLINTNEXTLINE(clang-diagnostic-unknown-attributes) */
-__attribute__((optimize("-fno-strict-aliasing")))	/* See csum_16b() */
-uint16_t csum(const void *buf, size_t len, uint32_t init)
-{
-	return (uint16_t)~csum_fold(csum_unfolded(buf, len, init));
-}
-
 /**
  * csum_iov_tail() - Calculate unfolded checksum for the tail of an IO vector
  * @tail:	IO vector tail to checksum
diff --git a/checksum.h b/checksum.h
index e243c97..683a09b 100644
--- a/checksum.h
+++ b/checksum.h
@@ -11,8 +11,6 @@ struct icmphdr;
 struct icmp6hdr;
 struct iov_tail;
 
-uint32_t sum_16b(const void *buf, size_t len);
-uint16_t csum_fold(uint32_t sum);
 uint16_t csum_unaligned(const void *buf, size_t len, uint32_t init);
 uint16_t csum_ip4_header(uint16_t l3len, uint8_t protocol,
 			 struct in_addr saddr, struct in_addr daddr);
@@ -32,7 +30,6 @@ void csum_icmp6(struct icmp6hdr *icmp6hr,
 		const struct in6_addr *saddr, const struct in6_addr *daddr,
 		const void *payload, size_t dlen);
 uint32_t csum_unfolded(const void *buf, size_t len, uint32_t init);
-uint16_t csum(const void *buf, size_t len, uint32_t init);
 uint16_t csum_iov_tail(struct iov_tail *tail, uint32_t init);
 
 #endif /* CHECKSUM_H */

From 27395e67c26a73e2e035360195b5928a07996dd5 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Wed, 5 Mar 2025 17:15:06 +1100
Subject: [PATCH 275/382] tcp: Don't export tcp_update_csum()

tcp_update_csum() is exposed in tcp_internal.h, but is only used in tcp.c.
Remove the unneeded prototype and make it static.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 tcp.c          | 3 ++-
 tcp_internal.h | 2 --
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/tcp.c b/tcp.c
index 4c24367..32a08bd 100644
--- a/tcp.c
+++ b/tcp.c
@@ -787,7 +787,8 @@ static void tcp_sock_set_nodelay(int s)
  * @th:		TCP header (updated)
  * @payload:	TCP payload
  */
-void tcp_update_csum(uint32_t psum, struct tcphdr *th, struct iov_tail *payload)
+static void tcp_update_csum(uint32_t psum, struct tcphdr *th,
+			    struct iov_tail *payload)
 {
 	th->check = 0;
 	psum = csum_unfolded(th, sizeof(*th), psum);
diff --git a/tcp_internal.h b/tcp_internal.h
index 9cf31f5..6f5e054 100644
--- a/tcp_internal.h
+++ b/tcp_internal.h
@@ -166,8 +166,6 @@ void tcp_rst_do(const struct ctx *c, struct tcp_tap_conn *conn);
 
 struct tcp_info_linux;
 
-void tcp_update_csum(uint32_t psum, struct tcphdr *th,
-		     struct iov_tail *payload);
 void tcp_fill_headers(const struct tcp_tap_conn *conn,
 		      struct tap_hdr *taph,
 		      struct iphdr *ip4h, struct ipv6hdr *ip6h,

From a83c806d1786fbe19bc6a3014f248e928e00651b Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Wed, 5 Mar 2025 17:15:07 +1100
Subject: [PATCH 276/382] vhost_user: Don't export several functions

vhost-user added several functions which are exposed in headers, but not
used outside the file where they're defined.  I can't tell if these are
really internal functions, or if they're logically supposed to be exported,
but we don't happen to have anything using them yet.

For the time being, just remove the exports.  We can add them back if we
need to.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 vhost_user.c | 2 +-
 vhost_user.h | 1 -
 virtio.c     | 9 +++++----
 virtio.h     | 4 ----
 4 files changed, 6 insertions(+), 10 deletions(-)

diff --git a/vhost_user.c b/vhost_user.c
index be1aa94..105f77a 100644
--- a/vhost_user.c
+++ b/vhost_user.c
@@ -517,7 +517,7 @@ static void vu_close_log(struct vu_dev *vdev)
  * vu_log_kick() - Inform the front-end that the log has been modified
  * @vdev:	vhost-user device
  */
-void vu_log_kick(const struct vu_dev *vdev)
+static void vu_log_kick(const struct vu_dev *vdev)
 {
 	if (vdev->log_call_fd != -1) {
 		int rc;
diff --git a/vhost_user.h b/vhost_user.h
index e769cb1..1daacd1 100644
--- a/vhost_user.h
+++ b/vhost_user.h
@@ -241,7 +241,6 @@ static inline bool vu_queue_started(const struct vu_virtq *vq)
 void vu_print_capabilities(void);
 void vu_init(struct ctx *c);
 void vu_cleanup(struct vu_dev *vdev);
-void vu_log_kick(const struct vu_dev *vdev);
 void vu_log_write(const struct vu_dev *vdev, uint64_t address,
 		  uint64_t length);
 void vu_control_handler(struct vu_dev *vdev, int fd, uint32_t events);
diff --git a/virtio.c b/virtio.c
index 2b58e4d..bc2b89a 100644
--- a/virtio.c
+++ b/virtio.c
@@ -286,7 +286,7 @@ static int virtqueue_read_next_desc(const struct vring_desc *desc,
  *
  * Return: true if the virtqueue is empty, false otherwise
  */
-bool vu_queue_empty(struct vu_virtq *vq)
+static bool vu_queue_empty(struct vu_virtq *vq)
 {
 	if (!vq->vring.avail)
 		return true;
@@ -671,9 +671,10 @@ static void vu_log_queue_fill(const struct vu_dev *vdev, struct vu_virtq *vq,
  * @len:	Size of the element
  * @idx:	Used ring entry index
  */
-void vu_queue_fill_by_index(const struct vu_dev *vdev, struct vu_virtq *vq,
-			    unsigned int index, unsigned int len,
-			    unsigned int idx)
+static void vu_queue_fill_by_index(const struct vu_dev *vdev,
+				   struct vu_virtq *vq,
+				   unsigned int index, unsigned int len,
+				   unsigned int idx)
 {
 	struct vring_used_elem uelem;
 
diff --git a/virtio.h b/virtio.h
index 0a59441..7a370bd 100644
--- a/virtio.h
+++ b/virtio.h
@@ -174,16 +174,12 @@ static inline bool vu_has_protocol_feature(const struct vu_dev *vdev,
 	return has_feature(vdev->protocol_features, fbit);
 }
 
-bool vu_queue_empty(struct vu_virtq *vq);
 void vu_queue_notify(const struct vu_dev *dev, struct vu_virtq *vq);
 int vu_queue_pop(const struct vu_dev *dev, struct vu_virtq *vq,
 		 struct vu_virtq_element *elem);
 void vu_queue_detach_element(struct vu_virtq *vq);
 void vu_queue_unpop(struct vu_virtq *vq);
 bool vu_queue_rewind(struct vu_virtq *vq, unsigned int num);
-void vu_queue_fill_by_index(const struct vu_dev *vdev, struct vu_virtq *vq,
-			    unsigned int index, unsigned int len,
-			    unsigned int idx);
 void vu_queue_fill(const struct vu_dev *vdev, struct vu_virtq *vq,
 		   const struct vu_virtq_element *elem, unsigned int len,
 		   unsigned int idx);

From 2b58b22845a76baf24141155eb4d4a882f509e97 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Wed, 5 Mar 2025 17:15:08 +1100
Subject: [PATCH 277/382] cppcheck: Add suppressions for "logically" exported
 functions

We have some functions in our headers which are definitely there on
purpose.  However, they're not yet used outside the files in which they're
defined.  That causes sufficiently recent cppcheck versions (2.17) to
complain they should be static.

Suppress the errors for these "logically" exported functions.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 iov.c | 1 +
 log.c | 1 +
 2 files changed, 2 insertions(+)

diff --git a/iov.c b/iov.c
index 3b12272..8c63b7e 100644
--- a/iov.c
+++ b/iov.c
@@ -203,6 +203,7 @@ size_t iov_tail_size(struct iov_tail *tail)
  *	    overruns the IO vector, is not contiguous or doesn't have the
  *	    requested alignment.
  */
+/* cppcheck-suppress [staticFunction,unmatchedSuppression] */
 void *iov_peek_header_(struct iov_tail *tail, size_t len, size_t align)
 {
 	char *p;
diff --git a/log.c b/log.c
index 6eda4c4..d40d7ae 100644
--- a/log.c
+++ b/log.c
@@ -281,6 +281,7 @@ static void passt_vsyslog(bool newline, int pri, const char *format, va_list ap)
  * @format:	Message
  * @ap:		Variable argument list
  */
+/* cppcheck-suppress [staticFunction,unmatchedSuppression] */
 void vlogmsg(bool newline, bool cont, int pri, const char *format, va_list ap)
 {
 	bool debug_print = (log_mask & LOG_MASK(LOG_DEBUG)) && log_file == -1;

From 04701702471ececee362669cc6b49ed9e20a1b6d Mon Sep 17 00:00:00 2001
From: Stefano Brivio <sbrivio@redhat.com>
Date: Fri, 7 Mar 2025 23:27:03 +0100
Subject: [PATCH 278/382] passt-repair: Add directory watch

It might not be feasible for users to start passt-repair after passt
is started, on a migration target, but before the migration process
starts.

For instance, with libvirt, the guest domain (and, hence, passt) is
started on the target as part of the migration process. At least for
the moment being, there's no hook a libvirt user (including KubeVirt)
can use to start passt-repair before the migration starts.

Add a directory watch using inotify: if PATH is a directory, instead
of connecting to it, we'll watch for a .repair socket file to appear
in it, and then attempt to connect to that socket.
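
A typical invocation can then point at the directory where passt will
later create its socket, for example (hypothetical paths):

  # Watch the directory instead of naming the socket directly:
  passt-repair /run/user/1000/libvirt/qemu/run/passt/

  # passt-repair waits for a *.repair socket, say 1-fedora.repair, to
  # appear in there, and connects as soon as it does.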

Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
---
 contrib/selinux/passt-repair.te | 16 +++----
 passt-repair.1                  |  6 ++-
 passt-repair.c                  | 84 +++++++++++++++++++++++++++++----
 3 files changed, 89 insertions(+), 17 deletions(-)

diff --git a/contrib/selinux/passt-repair.te b/contrib/selinux/passt-repair.te
index f171be6..7157dfb 100644
--- a/contrib/selinux/passt-repair.te
+++ b/contrib/selinux/passt-repair.te
@@ -61,11 +61,11 @@ allow passt_repair_t unconfined_t:unix_stream_socket { connectto read write };
 allow passt_repair_t passt_t:unix_stream_socket { connectto read write };
 allow passt_repair_t user_tmp_t:unix_stream_socket { connectto read write };
 
-allow passt_repair_t user_tmp_t:dir search;
+allow passt_repair_t user_tmp_t:dir { getattr read search watch };
 
-allow passt_repair_t unconfined_t:sock_file { read write };
-allow passt_repair_t passt_t:sock_file { read write };
-allow passt_repair_t user_tmp_t:sock_file { read write };
+allow passt_repair_t unconfined_t:sock_file { getattr read write };
+allow passt_repair_t passt_t:sock_file { getattr read write };
+allow passt_repair_t user_tmp_t:sock_file { getattr read write };
 
 allow passt_repair_t unconfined_t:tcp_socket { read setopt write };
 allow passt_repair_t passt_t:tcp_socket { read setopt write };
@@ -80,8 +80,8 @@ allow passt_repair_t passt_t:tcp_socket { read setopt write };
 allow passt_repair_t qemu_var_run_t:unix_stream_socket { connectto read write };
 allow passt_repair_t virt_var_run_t:unix_stream_socket { connectto read write };
 
-allow passt_repair_t qemu_var_run_t:dir search;
-allow passt_repair_t virt_var_run_t:dir search;
+allow passt_repair_t qemu_var_run_t:dir { getattr read search watch };
+allow passt_repair_t virt_var_run_t:dir { getattr read search watch };
 
-allow passt_repair_t qemu_var_run_t:sock_file { read write };
-allow passt_repair_t virt_var_run_t:sock_file { read write };
+allow passt_repair_t qemu_var_run_t:sock_file { getattr read write };
+allow passt_repair_t virt_var_run_t:sock_file { getattr read write };
diff --git a/passt-repair.1 b/passt-repair.1
index 7c1b140..e65aadd 100644
--- a/passt-repair.1
+++ b/passt-repair.1
@@ -16,13 +16,17 @@
 .B passt-repair
 is a privileged helper setting and clearing repair mode on TCP sockets on behalf
 of \fBpasst\fR(1), as instructed via single-byte commands over a UNIX domain
-socket, specified by \fIPATH\fR.
+socket.
 
 It can be used to migrate TCP connections between guests without granting
 additional capabilities to \fBpasst\fR(1) itself: to migrate TCP connections,
 \fBpasst\fR(1) leverages repair mode, which needs the \fBCAP_NET_ADMIN\fR
 capability (see \fBcapabilities\fR(7)) to be set or cleared.
 
+If \fIPATH\fR represents a UNIX domain socket, \fBpasst-repair\fR(1) attempts to
+connect to it. If it is a directory, \fBpasst-repair\fR(1) waits until a file
+ending with \fI.repair\fR appears in it, and then attempts to connect to it.
+
 .SH PROTOCOL
 
 \fBpasst-repair\fR(1) connects to \fBpasst\fR(1) using the socket specified via
diff --git a/passt-repair.c b/passt-repair.c
index e0c366e..8bb3f00 100644
--- a/passt-repair.c
+++ b/passt-repair.c
@@ -16,11 +16,14 @@
  * off. Reply by echoing the command. Exit on EOF.
  */
 
+#include <sys/inotify.h>
 #include <sys/prctl.h>
 #include <sys/types.h>
 #include <sys/socket.h>
+#include <sys/stat.h>
 #include <sys/un.h>
 #include <errno.h>
+#include <stdbool.h>
 #include <stddef.h>
 #include <stdio.h>
 #include <stdlib.h>
@@ -39,6 +42,8 @@
 #include "seccomp_repair.h"
 
 #define SCM_MAX_FD 253 /* From Linux kernel (include/net/scm.h), not in UAPI */
+#define REPAIR_EXT		".repair"
+#define REPAIR_EXT_LEN		strlen(REPAIR_EXT)
 
 /**
  * main() - Entry point and whole program with loop
@@ -51,6 +56,9 @@
  * #syscalls:repair socket s390x:socketcall i686:socketcall
  * #syscalls:repair recvfrom recvmsg arm:recv ppc64le:recv
  * #syscalls:repair sendto sendmsg arm:send ppc64le:send
+ * #syscalls:repair stat|statx stat64|statx statx
+ * #syscalls:repair fstat|fstat64 newfstatat|fstatat64
+ * #syscalls:repair inotify_init1 inotify_add_watch
  */
 int main(int argc, char **argv)
 {
@@ -58,12 +66,14 @@ int main(int argc, char **argv)
 	     __attribute__ ((aligned(__alignof__(struct cmsghdr))));
 	struct sockaddr_un a = { AF_UNIX, "" };
 	int fds[SCM_MAX_FD], s, ret, i, n = 0;
+	bool inotify_dir = false;
 	struct sock_fprog prog;
 	int8_t cmd = INT8_MAX;
 	struct cmsghdr *cmsg;
 	struct msghdr msg;
 	struct iovec iov;
 	size_t cmsg_len;
+	struct stat sb;
 	int op;
 
 	prctl(PR_SET_DUMPABLE, 0);
@@ -90,19 +100,77 @@ int main(int argc, char **argv)
 		_exit(2);
 	}
 
-	ret = snprintf(a.sun_path, sizeof(a.sun_path), "%s", argv[1]);
-	if (ret <= 0 || ret >= (int)sizeof(a.sun_path)) {
-		fprintf(stderr, "Invalid socket path: %s\n", argv[1]);
-		_exit(2);
-	}
-
 	if ((s = socket(AF_UNIX, SOCK_STREAM, 0)) < 0) {
 		fprintf(stderr, "Failed to create AF_UNIX socket: %i\n", errno);
 		_exit(1);
 	}
 
-	if (connect(s, (struct sockaddr *)&a, sizeof(a))) {
-		fprintf(stderr, "Failed to connect to %s: %s\n", argv[1],
+	if ((stat(argv[1], &sb))) {
+		fprintf(stderr, "Can't stat() %s: %i\n", argv[1], errno);
+		_exit(1);
+	}
+
+	if ((sb.st_mode & S_IFMT) == S_IFDIR) {
+		char buf[sizeof(struct inotify_event) + NAME_MAX + 1];
+		const struct inotify_event *ev;
+		char path[PATH_MAX + 1];
+		ssize_t n;
+		int fd;
+
+		ev = (struct inotify_event *)buf;
+
+		if ((fd = inotify_init1(IN_CLOEXEC)) < 0) {
+			fprintf(stderr, "inotify_init1: %i\n", errno);
+			_exit(1);
+		}
+
+		if (inotify_add_watch(fd, argv[1], IN_CREATE) < 0) {
+			fprintf(stderr, "inotify_add_watch: %i\n", errno);
+			_exit(1);
+		}
+
+		do {
+			n = read(fd, buf, sizeof(buf));
+			if (n < 0) {
+				fprintf(stderr, "inotify read: %i", errno);
+				_exit(1);
+			}
+
+			if (n < (ssize_t)sizeof(*ev)) {
+				fprintf(stderr, "Short inotify read: %zi", n);
+				_exit(1);
+			}
+		} while (ev->len < REPAIR_EXT_LEN ||
+			 memcmp(ev->name + strlen(ev->name) - REPAIR_EXT_LEN,
+				REPAIR_EXT, REPAIR_EXT_LEN));
+
+		snprintf(path, sizeof(path), "%s/%s", argv[1], ev->name);
+		if ((stat(path, &sb))) {
+			fprintf(stderr, "Can't stat() %s: %i\n", path, errno);
+			_exit(1);
+		}
+
+		ret = snprintf(a.sun_path, sizeof(a.sun_path), path);
+		inotify_dir = true;
+	} else {
+		ret = snprintf(a.sun_path, sizeof(a.sun_path), "%s", argv[1]);
+	}
+
+	if (ret <= 0 || ret >= (int)sizeof(a.sun_path)) {
+		fprintf(stderr, "Invalid socket path");
+		_exit(2);
+	}
+
+	if ((sb.st_mode & S_IFMT) != S_IFSOCK) {
+		fprintf(stderr, "%s is not a socket\n", a.sun_path);
+		_exit(2);
+	}
+
+	while (connect(s, (struct sockaddr *)&a, sizeof(a))) {
+		if (inotify_dir && errno == ECONNREFUSED)
+			continue;
+
+		fprintf(stderr, "Failed to connect to %s: %s\n", a.sun_path,
 			strerror(errno));
 		_exit(1);
 	}

From c8b520c0625b440d0dcd588af085d35cf46aae2c Mon Sep 17 00:00:00 2001
From: Stefano Brivio <sbrivio@redhat.com>
Date: Thu, 6 Mar 2025 20:00:51 +0100
Subject: [PATCH 279/382] flow, repair: Wait for a short while for passt-repair
 to connect

...and time out after that. This will be needed by an upcoming change
to passt-repair that enables it to start before passt is started, on
both source and target, by means of an inotify watch.

Once the inotify watch triggers, passt-repair will connect right away,
but we have no guarantee that the connection completes before we
start the migration process, so wait for it (for a reasonable amount
of time).
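
As a standalone illustration of the mechanism (not part of the patch,
names are made up): on Linux, a blocking accept() honours the listening
socket's SO_RCVTIMEO, so a short receive timeout is enough to wait for
the helper without hanging forever.  A minimal sketch, with the same
"timeout below one second" constraint as the static_assert below:

  /* Sketch only; needs <sys/socket.h>, <sys/time.h>, <errno.h> */
  static int accept_with_timeout(int listen_fd, long timeout_us)
  {
  	const struct timeval tv = { .tv_sec = 0, .tv_usec = timeout_us };
  	const struct timeval zero = { 0 };
  	int fd;

  	if (setsockopt(listen_fd, SOL_SOCKET, SO_RCVTIMEO,
  		       &tv, sizeof(tv)))
  		return -errno;

  	fd = accept(listen_fd, NULL, NULL);
  	if (fd < 0)
  		fd = -errno;	/* -EAGAIN/-EWOULDBLOCK on timeout */

  	/* Restore fully blocking behaviour either way */
  	setsockopt(listen_fd, SOL_SOCKET, SO_RCVTIMEO,
  		   &zero, sizeof(zero));

  	return fd;
  }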

Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
---
 flow.c   | 20 ++++++++++++++++++++
 repair.c | 32 ++++++++++++++++++++++++++++++++
 repair.h |  1 +
 3 files changed, 53 insertions(+)

diff --git a/flow.c b/flow.c
index 749c498..5e64b79 100644
--- a/flow.c
+++ b/flow.c
@@ -911,6 +911,21 @@ static int flow_migrate_source_rollback(struct ctx *c, unsigned bound, int ret)
 	return ret;
 }
 
+/**
+ * flow_migrate_need_repair() - Do we need to set repair mode for any flow?
+ *
+ * Return: true if repair mode is needed, false otherwise
+ */
+static bool flow_migrate_need_repair(void)
+{
+	union flow *flow;
+
+	foreach_established_tcp_flow(flow)
+		return true;
+
+	return false;
+}
+
 /**
  * flow_migrate_repair_all() - Turn repair mode on or off for all flows
  * @c:		Execution context
@@ -966,6 +981,9 @@ int flow_migrate_source_pre(struct ctx *c, const struct migrate_stage *stage,
 	(void)stage;
 	(void)fd;
 
+	if (flow_migrate_need_repair())
+		repair_wait(c);
+
 	if ((rc = flow_migrate_repair_all(c, true)))
 		return -rc;
 
@@ -1083,6 +1101,8 @@ int flow_migrate_target(struct ctx *c, const struct migrate_stage *stage,
 	if (!count)
 		return 0;
 
+	repair_wait(c);
+
 	if ((rc = flow_migrate_repair_all(c, true)))
 		return -rc;
 
diff --git a/repair.c b/repair.c
index 3ee089f..149fe51 100644
--- a/repair.c
+++ b/repair.c
@@ -27,6 +27,10 @@
 
 #define SCM_MAX_FD 253 /* From Linux kernel (include/net/scm.h), not in UAPI */
 
+/* Wait for a while for TCP_REPAIR helper to connect if it's not there yet */
+#define REPAIR_ACCEPT_TIMEOUT_MS	10
+#define REPAIR_ACCEPT_TIMEOUT_US	(REPAIR_ACCEPT_TIMEOUT_MS * 1000)
+
 /* Pending file descriptors for next repair_flush() call, or command change */
 static int repair_fds[SCM_MAX_FD];
 
@@ -138,6 +142,34 @@ void repair_handler(struct ctx *c, uint32_t events)
 	repair_close(c);
 }
 
+/**
+ * repair_wait() - Wait (with timeout) for TCP_REPAIR helper to connect
+ * @c:		Execution context
+ */
+void repair_wait(struct ctx *c)
+{
+	struct timeval tv = { .tv_sec = 0,
+			      .tv_usec = (long)(REPAIR_ACCEPT_TIMEOUT_US) };
+	static_assert(REPAIR_ACCEPT_TIMEOUT_US < 1000 * 1000,
+		      ".tv_usec is greater than 1000 * 1000");
+
+	if (c->fd_repair >= 0 || c->fd_repair_listen == -1)
+		return;
+
+	if (setsockopt(c->fd_repair_listen, SOL_SOCKET, SO_RCVTIMEO,
+		       &tv, sizeof(tv))) {
+		err_perror("Set timeout on TCP_REPAIR listening socket");
+		return;
+	}
+
+	repair_listen_handler(c, EPOLLIN);
+
+	tv.tv_usec = 0;
+	if (setsockopt(c->fd_repair_listen, SOL_SOCKET, SO_RCVTIMEO,
+		       &tv, sizeof(tv)))
+		err_perror("Clear timeout on TCP_REPAIR listening socket");
+}
+
 /**
  * repair_flush() - Flush current set of sockets to helper, with current command
  * @c:		Execution context
diff --git a/repair.h b/repair.h
index de279d6..1d37922 100644
--- a/repair.h
+++ b/repair.h
@@ -10,6 +10,7 @@ void repair_sock_init(const struct ctx *c);
 void repair_listen_handler(struct ctx *c, uint32_t events);
 void repair_handler(struct ctx *c, uint32_t events);
 void repair_close(struct ctx *c);
+void repair_wait(struct ctx *c);
 int repair_flush(struct ctx *c);
 int repair_set(struct ctx *c, int s, int cmd);
 

From bb00a0499fc9130e4b00a88928958b8b094ee2c9 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Wed, 12 Mar 2025 13:18:31 +1100
Subject: [PATCH 280/382] conf: Use the same optstring for passt and pasta
 modes

Currently we rely on detecting our mode first and use different sets of
(single character) options for each.  This means that if you use an option
that's valid in only one mode while running in another, you'll get the
generic usage() message.

We can give more helpful errors with little extra effort by combining all
the options into a single option string and giving bespoke messages if an
option for the wrong mode is used; in fact we already did this for some
single-mode options like '-1'.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 conf.c | 20 ++++++++++++++------
 1 file changed, 14 insertions(+), 6 deletions(-)

diff --git a/conf.c b/conf.c
index 065e720..7f20bc8 100644
--- a/conf.c
+++ b/conf.c
@@ -1388,6 +1388,7 @@ void conf(struct ctx *c, int argc, char **argv)
 		{"repair-path",	required_argument,	NULL,		28 },
 		{ 0 },
 	};
+	const char *optstring = "+dqfel:hs:F:I:p:P:m:a:n:M:g:i:o:D:S:H:461t:u:T:U:";
 	const char *logname = (c->mode == MODE_PASTA) ? "pasta" : "passt";
 	char userns[PATH_MAX] = { 0 }, netns[PATH_MAX] = { 0 };
 	bool copy_addrs_opt = false, copy_routes_opt = false;
@@ -1397,7 +1398,6 @@ void conf(struct ctx *c, int argc, char **argv)
 	struct fqdn *dnss = c->dns_search;
 	unsigned int ifi4 = 0, ifi6 = 0;
 	const char *logfile = NULL;
-	const char *optstring;
 	size_t logsize = 0;
 	char *runas = NULL;
 	long fd_tap_opt;
@@ -1408,9 +1408,6 @@ void conf(struct ctx *c, int argc, char **argv)
 	if (c->mode == MODE_PASTA) {
 		c->no_dhcp_dns = c->no_dhcp_dns_search = 1;
 		fwd_default = FWD_AUTO;
-		optstring = "+dqfel:hF:I:p:P:m:a:n:M:g:i:o:D:S:H:46t:u:T:U:";
-	} else {
-		optstring = "+dqfel:hs:F:p:P:m:a:n:M:g:i:o:D:S:H:461t:u:";
 	}
 
 	c->mtu = ROUND_DOWN(ETH_MAX_MTU - ETH_HLEN, sizeof(uint32_t));
@@ -1614,6 +1611,9 @@ void conf(struct ctx *c, int argc, char **argv)
 			c->foreground = 1;
 			break;
 		case 's':
+			if (c->mode == MODE_PASTA)
+				die("-s is for passt / vhost-user mode only");
+
 			ret = snprintf(c->sock_path, sizeof(c->sock_path), "%s",
 				       optarg);
 			if (ret <= 0 || ret >= (int)sizeof(c->sock_path))
@@ -1634,6 +1634,9 @@ void conf(struct ctx *c, int argc, char **argv)
 			*c->sock_path = 0;
 			break;
 		case 'I':
+			if (c->mode != MODE_PASTA)
+				die("-I is for pasta mode only");
+
 			ret = snprintf(c->pasta_ifn, IFNAMSIZ, "%s",
 				       optarg);
 			if (ret <= 0 || ret >= IFNAMSIZ)
@@ -1790,11 +1793,16 @@ void conf(struct ctx *c, int argc, char **argv)
 			break;
 		case 't':
 		case 'u':
-		case 'T':
-		case 'U':
 		case 'D':
 			/* Handle these later, once addresses are configured */
 			break;
+		case 'T':
+		case 'U':
+			if (c->mode != MODE_PASTA)
+				die("-%c is for pasta mode only", name);
+
+			/* Handle properly later, once addresses are configured */
+			break;
 		case 'h':
 			usage(argv[0], stdout, EXIT_SUCCESS);
 			break;

From 4b17d042c7e4f6e5b5a770181e2ebd53ec8e73d4 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Wed, 12 Mar 2025 13:18:32 +1100
Subject: [PATCH 281/382] conf: Move mode detection into helper function

One of the first things we need to do is determine if we're in passt mode
or pasta mode.  Currently this is open-coded in main(), by examining
argv[0].  We want to extend this a bit in future to cover vhost-user
mode as well.  Prepare for this by moving the mode detection into a new
conf_mode() function.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 conf.c  | 26 ++++++++++++++++++++++++++
 conf.h  |  1 +
 passt.c | 14 ++------------
 3 files changed, 29 insertions(+), 12 deletions(-)

diff --git a/conf.c b/conf.c
index 7f20bc8..2022ea1 100644
--- a/conf.c
+++ b/conf.c
@@ -991,6 +991,32 @@ pasta_opts:
 	_exit(status);
 }
 
+/**
+ * conf_mode() - Determine passt/pasta's operating mode from command line
+ * @argc:	Argument count
+ * @argv:	Command line arguments
+ *
+ * Return: mode to operate in, PASTA or PASST
+ */
+/* cppcheck-suppress constParameter */
+enum passt_modes conf_mode(int argc, char *argv[])
+{
+	char argv0[PATH_MAX], *basearg0;
+
+	if (argc < 1)
+		die("Cannot determine argv[0]");
+
+	strncpy(argv0, argv[0], PATH_MAX - 1);
+	basearg0 = basename(argv0);
+	if (strstr(basearg0, "pasta"))
+		return MODE_PASTA;
+
+	if (strstr(basearg0, "passt"))
+		return MODE_PASST;
+
+	die("Cannot determine mode, invoke as \"passt\" or \"pasta\"");
+}
+
 /**
  * conf_print() - Print fundamental configuration parameters
  * @c:		Execution context
diff --git a/conf.h b/conf.h
index 9d2143d..b45ad74 100644
--- a/conf.h
+++ b/conf.h
@@ -6,6 +6,7 @@
 #ifndef CONF_H
 #define CONF_H
 
+enum passt_modes conf_mode(int argc, char *argv[]);
 void conf(struct ctx *c, int argc, char **argv);
 
 #endif /* CONF_H */
diff --git a/passt.c b/passt.c
index 868842b..0bd2a29 100644
--- a/passt.c
+++ b/passt.c
@@ -191,7 +191,6 @@ int main(int argc, char **argv)
 {
 	struct epoll_event events[EPOLL_EVENTS];
 	int nfds, i, devnull_fd = -1;
-	char argv0[PATH_MAX], *name;
 	struct ctx c = { 0 };
 	struct rlimit limit;
 	struct timespec now;
@@ -213,21 +212,12 @@ int main(int argc, char **argv)
 	sigaction(SIGTERM, &sa, NULL);
 	sigaction(SIGQUIT, &sa, NULL);
 
-	if (argc < 1)
-		_exit(EXIT_FAILURE);
+	c.mode = conf_mode(argc, argv);
 
-	strncpy(argv0, argv[0], PATH_MAX - 1);
-	name = basename(argv0);
-	if (strstr(name, "pasta")) {
+	if (c.mode == MODE_PASTA) {
 		sa.sa_handler = pasta_child_handler;
 		if (sigaction(SIGCHLD, &sa, NULL))
 			die_perror("Couldn't install signal handlers");
-
-		c.mode = MODE_PASTA;
-	} else if (strstr(name, "passt")) {
-		c.mode = MODE_PASST;
-	} else {
-		_exit(EXIT_FAILURE);
 	}
 
 	if (signal(SIGPIPE, SIG_IGN) == SIG_ERR)

From 74cd82adc87552c7ef6d255069a974b4ebeab4a1 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Wed, 12 Mar 2025 13:18:33 +1100
Subject: [PATCH 282/382] conf: Detect vhost-user mode earlier

We detect our operating mode in conf_mode(), unless we're using vhost-user
mode, in which case we change it later when we parse the --vhost-user
option.  That means we need to delay parsing the --repair-path option (for
vhost-user only) until still later.

However, there are many other places in the main option parsing loop which
also rely on the mode.  We get away with those because they happen to treat
passt and vhost-user modes identically.  This is potentially
confusing, though.  So, move setting of MODE_VU into conf_mode() so
c->mode always has its final value from that point onwards.

To match, we move the parsing of --repair-path back into the main option
parsing loop.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 conf.c | 43 ++++++++++++++++++++++++++-----------------
 1 file changed, 26 insertions(+), 17 deletions(-)

diff --git a/conf.c b/conf.c
index 2022ea1..b58e2a6 100644
--- a/conf.c
+++ b/conf.c
@@ -998,10 +998,23 @@ pasta_opts:
  *
  * Return: mode to operate in, PASTA or PASST
  */
-/* cppcheck-suppress constParameter */
 enum passt_modes conf_mode(int argc, char *argv[])
 {
+	int vhost_user = 0;
+	const struct option optvu[] = {
+		{"vhost-user",	no_argument,		&vhost_user,	1 },
+		{ 0 },
+	};
 	char argv0[PATH_MAX], *basearg0;
+	int name;
+
+	optind = 0;
+	do {
+		name = getopt_long(argc, argv, "-:", optvu, NULL);
+	} while (name != -1);
+
+	if (vhost_user)
+		return MODE_VU;
 
 	if (argc < 1)
 		die("Cannot determine argv[0]");
@@ -1604,9 +1617,8 @@ void conf(struct ctx *c, int argc, char **argv)
 
 			die("Invalid host nameserver address: %s", optarg);
 		case 25:
-			if (c->mode == MODE_PASTA)
-				die("--vhost-user is for passt mode only");
-			c->mode = MODE_VU;
+			/* Already handled in conf_mode() */
+			ASSERT(c->mode == MODE_VU);
 			break;
 		case 26:
 			vu_print_capabilities();
@@ -1617,7 +1629,14 @@ void conf(struct ctx *c, int argc, char **argv)
 				die("Invalid FQDN: %s", optarg);
 			break;
 		case 28:
-			/* Handle this once we checked --vhost-user */
+			if (c->mode != MODE_VU && strcmp(optarg, "none"))
+				die("--repair-path is for vhost-user mode only");
+
+			if (snprintf_check(c->repair_path,
+					   sizeof(c->repair_path), "%s",
+					   optarg))
+				die("Invalid passt-repair path: %s", optarg);
+
 			break;
 		case 'd':
 			c->debug = 1;
@@ -1917,8 +1936,8 @@ void conf(struct ctx *c, int argc, char **argv)
 	if (c->ifi4 && IN4_IS_ADDR_UNSPECIFIED(&c->ip4.guest_gw))
 		c->no_dhcp = 1;
 
-	/* Inbound port options, DNS, and --repair-path can be parsed now, after
-	 * IPv4/IPv6 settings and --vhost-user.
+	/* Inbound port options and DNS can be parsed now, after IPv4/IPv6
+	 * settings
 	 */
 	fwd_probe_ephemeral();
 	udp_portmap_clear();
@@ -1964,16 +1983,6 @@ void conf(struct ctx *c, int argc, char **argv)
 			}
 
 			die("Cannot use DNS address %s", optarg);
-		} else if (name == 28) {
-			if (c->mode != MODE_VU && strcmp(optarg, "none"))
-				die("--repair-path is for vhost-user mode only");
-
-			if (snprintf_check(c->repair_path,
-					   sizeof(c->repair_path), "%s",
-					   optarg))
-				die("Invalid passt-repair path: %s", optarg);
-
-			break;
 		}
 	} while (name != -1);
 

From c43972ad67806fb403cdbc05179441917f2a776b Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Wed, 12 Mar 2025 13:18:34 +1100
Subject: [PATCH 283/382] packet: Give explicit name to maximum packet size

We verify that every packet we store in a pool (and every partial packet
we retrieve from it) has a length no longer than UINT16_MAX.  This
originated in the older packet pool implementation, which stored packet
lengths in a uint16_t.  Now that packets are represented by a struct
iovec with its size_t length, this check serves only as a sanity / security
check that we don't have some wildly out of range length due to a bug
elsewhere.

We may have reasons to (slightly) increase this limit in future, so in
preparation, give this quantity an explicit name - PACKET_MAX_LEN.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 packet.c | 4 ++--
 packet.h | 3 +++
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/packet.c b/packet.c
index 0330b54..bcac037 100644
--- a/packet.c
+++ b/packet.c
@@ -83,7 +83,7 @@ void packet_add_do(struct pool *p, size_t len, const char *start,
 	if (packet_check_range(p, start, len, func, line))
 		return;
 
-	if (len > UINT16_MAX) {
+	if (len > PACKET_MAX_LEN) {
 		trace("add packet length %zu, %s:%i", len, func, line);
 		return;
 	}
@@ -119,7 +119,7 @@ void *packet_get_do(const struct pool *p, size_t idx, size_t offset,
 		return NULL;
 	}
 
-	if (len > UINT16_MAX) {
+	if (len > PACKET_MAX_LEN) {
 		if (func) {
 			trace("packet data length %zu, %s:%i",
 			      len, func, line);
diff --git a/packet.h b/packet.h
index bdc07fe..d099f02 100644
--- a/packet.h
+++ b/packet.h
@@ -6,6 +6,9 @@
 #ifndef PACKET_H
 #define PACKET_H
 
+/* Maximum size of a single packet stored in pool, including headers */
+#define PACKET_MAX_LEN	UINT16_MAX
+
 /**
  * struct pool - Generic pool of packets stored in a buffer
  * @buf:	Buffer storing packet descriptors,

From 1eda8de4384a93778a781257781c5b0967c8abfe Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Wed, 12 Mar 2025 13:18:35 +1100
Subject: [PATCH 284/382] packet: Remove redundant TAP_BUF_BYTES define

Currently we define both TAP_BUF_BYTES and PKT_BUF_BYTES as essentially
the same thing.  They'll be different only if TAP_BUF_BYTES is negative,
which makes no sense.  So, remove TAP_BUF_BYTES and just use PKT_BUF_BYTES.

In addition, in most places we use this simply to mean the size of the main
packet buffer (pkt_buf), for which we can directly use sizeof.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 passt.c | 2 +-
 passt.h | 5 ++---
 tap.c   | 4 ++--
 3 files changed, 5 insertions(+), 6 deletions(-)

diff --git a/passt.c b/passt.c
index 0bd2a29..cd06772 100644
--- a/passt.c
+++ b/passt.c
@@ -223,7 +223,7 @@ int main(int argc, char **argv)
 	if (signal(SIGPIPE, SIG_IGN) == SIG_ERR)
 		die_perror("Couldn't set disposition for SIGPIPE");
 
-	madvise(pkt_buf, TAP_BUF_BYTES, MADV_HUGEPAGE);
+	madvise(pkt_buf, sizeof(pkt_buf), MADV_HUGEPAGE);
 
 	c.epollfd = epoll_create1(EPOLL_CLOEXEC);
 	if (c.epollfd == -1)
diff --git a/passt.h b/passt.h
index 28d1389..6b24805 100644
--- a/passt.h
+++ b/passt.h
@@ -69,12 +69,11 @@ union epoll_ref {
 static_assert(sizeof(union epoll_ref) <= sizeof(union epoll_data),
 	      "epoll_ref must have same size as epoll_data");
 
-#define TAP_BUF_BYTES							\
+#define PKT_BUF_BYTES							\
 	ROUND_DOWN(((ETH_MAX_MTU + sizeof(uint32_t)) * 128), PAGE_SIZE)
 #define TAP_MSGS							\
-	DIV_ROUND_UP(TAP_BUF_BYTES, ETH_ZLEN - 2 * ETH_ALEN + sizeof(uint32_t))
+	DIV_ROUND_UP(PKT_BUF_BYTES, ETH_ZLEN - 2 * ETH_ALEN + sizeof(uint32_t))
 
-#define PKT_BUF_BYTES		MAX(TAP_BUF_BYTES, 0)
 extern char pkt_buf		[PKT_BUF_BYTES];
 
 extern char *epoll_type_str[];
diff --git a/tap.c b/tap.c
index 4541f51..fb306e7 100644
--- a/tap.c
+++ b/tap.c
@@ -1080,7 +1080,7 @@ static void tap_passt_input(struct ctx *c, const struct timespec *now)
 
 	do {
 		n = recv(c->fd_tap, pkt_buf + partial_len,
-			 TAP_BUF_BYTES - partial_len, MSG_DONTWAIT);
+			 sizeof(pkt_buf) - partial_len, MSG_DONTWAIT);
 	} while ((n < 0) && errno == EINTR);
 
 	if (n < 0) {
@@ -1151,7 +1151,7 @@ static void tap_pasta_input(struct ctx *c, const struct timespec *now)
 
 	tap_flush_pools();
 
-	for (n = 0; n <= (ssize_t)(TAP_BUF_BYTES - ETH_MAX_MTU); n += len) {
+	for (n = 0; n <= (ssize_t)(sizeof(pkt_buf) - ETH_MAX_MTU); n += len) {
 		len = read(c->fd_tap, pkt_buf + n, ETH_MAX_MTU);
 
 		if (len == 0) {

From c4bfa3339cea586172d4b0fcd613b5638498651e Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Wed, 12 Mar 2025 13:18:36 +1100
Subject: [PATCH 285/382] tap: Use explicit defines for maximum length of L2
 frame

Currently in tap.c we (mostly) use ETH_MAX_MTU as the maximum length of
an L2 frame.  This define comes from the kernel, but it's badly named and
used confusingly.

First, it doesn't really have anything to do with Ethernet, which has no
structural limit on frame lengths.  It comes more from either a) IP, which
imposes a 64k datagram limit, or b) internal buffers used in various
places in the kernel (and in passt).

Worse, MTU generally means the maximum size of the IP (L3) datagram which
may be transferred, _not_ counting the L2 headers.  In the kernel
ETH_MAX_MTU is sometimes used that way, but sometimes seems to be used as
a maximum frame length, _including_ L2 headers.  In tap.c we're mostly
using it in the second way.

Finally, each of our tap backends could have different limits on the frame
size imposed by the mechanisms they're using.

Start clearing up this confusion by replacing it in tap.c with new
L2_MAX_LEN_* defines which specifically refer to the maximum L2 frame
length for each backend.
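
To make the two readings concrete (an illustration, not from the patch),
take the 65535 that ETH_MAX_MTU expands to:

  read as an L3 limit (a real MTU): frames up to 65535 + ETH_HLEN (14) = 65549 bytes
  read as an L2 frame limit:        IP datagrams up to 65535 - 14      = 65521 bytes

The L2_MAX_LEN_* defines pin down the second reading explicitly.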

Signed-off-by: David Gibson <dgibson@redhat.com>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 tap.c | 23 +++++++++++++++++++----
 tap.h | 25 +++++++++++++++++++++++++
 2 files changed, 44 insertions(+), 4 deletions(-)

diff --git a/tap.c b/tap.c
index fb306e7..ede547c 100644
--- a/tap.c
+++ b/tap.c
@@ -62,6 +62,19 @@
 #include "vhost_user.h"
 #include "vu_common.h"
 
+/* Maximum allowed frame lengths (including L2 header) */
+
+/* Verify that an L2 frame length limit is large enough to contain the header,
+ * but small enough to fit in the packet pool
+ */
+#define CHECK_FRAME_LEN(len) \
+	static_assert((len) >= ETH_HLEN && (len) <= PACKET_MAX_LEN,	\
+		      #len " has bad value")
+
+CHECK_FRAME_LEN(L2_MAX_LEN_PASTA);
+CHECK_FRAME_LEN(L2_MAX_LEN_PASST);
+CHECK_FRAME_LEN(L2_MAX_LEN_VU);
+
 /* IPv4 (plus ARP) and IPv6 message batches from tap/guest to IP handlers */
 static PACKET_POOL_NOINIT(pool_tap4, TAP_MSGS, pkt_buf);
 static PACKET_POOL_NOINIT(pool_tap6, TAP_MSGS, pkt_buf);
@@ -1097,7 +1110,7 @@ static void tap_passt_input(struct ctx *c, const struct timespec *now)
 	while (n >= (ssize_t)sizeof(uint32_t)) {
 		uint32_t l2len = ntohl_unaligned(p);
 
-		if (l2len < sizeof(struct ethhdr) || l2len > ETH_MAX_MTU) {
+		if (l2len < sizeof(struct ethhdr) || l2len > L2_MAX_LEN_PASST) {
 			err("Bad frame size from guest, resetting connection");
 			tap_sock_reset(c);
 			return;
@@ -1151,8 +1164,10 @@ static void tap_pasta_input(struct ctx *c, const struct timespec *now)
 
 	tap_flush_pools();
 
-	for (n = 0; n <= (ssize_t)(sizeof(pkt_buf) - ETH_MAX_MTU); n += len) {
-		len = read(c->fd_tap, pkt_buf + n, ETH_MAX_MTU);
+	for (n = 0;
+	     n <= (ssize_t)(sizeof(pkt_buf) - L2_MAX_LEN_PASTA);
+	     n += len) {
+		len = read(c->fd_tap, pkt_buf + n, L2_MAX_LEN_PASTA);
 
 		if (len == 0) {
 			die("EOF on tap device, exiting");
@@ -1170,7 +1185,7 @@ static void tap_pasta_input(struct ctx *c, const struct timespec *now)
 
 		/* Ignore frames of bad length */
 		if (len < (ssize_t)sizeof(struct ethhdr) ||
-		    len > (ssize_t)ETH_MAX_MTU)
+		    len > (ssize_t)L2_MAX_LEN_PASTA)
 			continue;
 
 		tap_add_packet(c, len, pkt_buf + n);
diff --git a/tap.h b/tap.h
index a2c3b87..84e9fdb 100644
--- a/tap.h
+++ b/tap.h
@@ -6,6 +6,31 @@
 #ifndef TAP_H
 #define TAP_H
 
+/** L2_MAX_LEN_PASTA - Maximum frame length for pasta mode (with L2 header)
+ *
+ * The kernel tuntap device imposes a maximum frame size of 65535 including
+ * 'hard_header_len' (14 bytes for L2 Ethernet in the case of "tap" mode).
+ */
+#define L2_MAX_LEN_PASTA	USHRT_MAX
+
+/** L2_MAX_LEN_PASST - Maximum frame length for passt mode (with L2 header)
+ *
+ * The only structural limit the QEMU socket protocol imposes on frames is
+ * (2^32-1) bytes, but that would be ludicrously long in practice.  For now,
+ * limit it somewhat arbitrarily to 65535 bytes.  FIXME: Work out an appropriate
+ * limit with more precision.
+ */
+#define L2_MAX_LEN_PASST	USHRT_MAX
+
+/** L2_MAX_LEN_VU - Maximum frame length for vhost-user mode (with L2 header)
+ *
+ * vhost-user allows multiple buffers per frame, each of which can be quite
+ * large, so the inherent frame size limit is rather large.  Much larger than is
+ * actually useful for IP.  For now limit arbitrarily to 65535 bytes. FIXME:
+ * Work out an appropriate limit with more precision.
+ */
+#define L2_MAX_LEN_VU		USHRT_MAX
+
 struct udphdr;
 
 /**

From b6945e055376be944867479dcd8deb77e47b1fa4 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Wed, 12 Mar 2025 13:18:37 +1100
Subject: [PATCH 286/382] Simplify sizing of pkt_buf

We define the size of pkt_buf as large enough to hold 128 maximum size
packets.  Well, approximately, since we round down to the page size.  We
don't have any specific reliance on how many packets can fit in the buffer;
we just want it to be big enough to allow reasonable batching.  The
current definition relies on the confusingly named ETH_MAX_MTU and adds
in sizeof(uint32_t) rather non-obviously for the pseudo-physical header
used by the qemu socket (passt mode) protocol.

Instead, just define it to be 8 MiB, which is what that complex calculation
works out to.
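
A worked check of that calculation (illustration only), assuming
ETH_MAX_MTU = 65535, sizeof(uint32_t) = 4 and 4 KiB pages:

  (65535 + 4) * 128         = 8388992
  ROUND_DOWN(8388992, 4096) = 2048 * 4096 = 8388608 = 8 << 20

so the new constant is exactly what the old formula produced.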

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 passt.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/passt.h b/passt.h
index 6b24805..8f45091 100644
--- a/passt.h
+++ b/passt.h
@@ -69,8 +69,8 @@ union epoll_ref {
 static_assert(sizeof(union epoll_ref) <= sizeof(union epoll_data),
 	      "epoll_ref must have same size as epoll_data");
 
-#define PKT_BUF_BYTES							\
-	ROUND_DOWN(((ETH_MAX_MTU + sizeof(uint32_t)) * 128), PAGE_SIZE)
+/* Large enough for ~128 maximum size frames */
+#define PKT_BUF_BYTES		(8UL << 20)
 #define TAP_MSGS							\
 	DIV_ROUND_UP(PKT_BUF_BYTES, ETH_ZLEN - 2 * ETH_ALEN + sizeof(uint32_t))
 

From 9d1a6b3eba9e6e5c4db4bfa0e395edc45ca6c39d Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Wed, 12 Mar 2025 13:18:38 +1100
Subject: [PATCH 287/382] pcap: Correctly set snaplen based on tap backend type

The pcap header includes a value indicating how much of each frame is
captured.  We always capture the entire frame, so we want to set this to
the maximum possible frame size.  Currently we do that by setting it to
ETH_MAX_MTU, but that's a confusingly named constant which might not always
be correct depending on the details of our tap backend.

Instead add a tap_l2_max_len() function that explicitly returns the maximum
frame size for the current mode and use that to set snaplen.  While we're
there, there's no particular need for the pcap header to be defined as a
global; make it local to pcap_init() instead.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 pcap.c | 46 ++++++++++++++++++++++++----------------------
 tap.c  | 19 +++++++++++++++++++
 tap.h  |  1 +
 3 files changed, 44 insertions(+), 22 deletions(-)

diff --git a/pcap.c b/pcap.c
index 3d623cf..e95aa6f 100644
--- a/pcap.c
+++ b/pcap.c
@@ -33,33 +33,12 @@
 #include "log.h"
 #include "pcap.h"
 #include "iov.h"
+#include "tap.h"
 
 #define PCAP_VERSION_MINOR 4
 
 static int pcap_fd = -1;
 
-/* See pcap.h from libpcap, or pcap-savefile(5) */
-static const struct {
-	uint32_t magic;
-#define PCAP_MAGIC		0xa1b2c3d4
-
-	uint16_t major;
-#define PCAP_VERSION_MAJOR	2
-
-	uint16_t minor;
-#define PCAP_VERSION_MINOR	4
-
-	int32_t thiszone;
-	uint32_t sigfigs;
-	uint32_t snaplen;
-
-	uint32_t linktype;
-#define PCAP_LINKTYPE_ETHERNET	1
-} pcap_hdr = {
-	PCAP_MAGIC, PCAP_VERSION_MAJOR, PCAP_VERSION_MINOR, 0, 0, ETH_MAX_MTU,
-	PCAP_LINKTYPE_ETHERNET
-};
-
 struct pcap_pkthdr {
 	uint32_t tv_sec;
 	uint32_t tv_usec;
@@ -162,6 +141,29 @@ void pcap_iov(const struct iovec *iov, size_t iovcnt, size_t offset)
  */
 void pcap_init(struct ctx *c)
 {
+	/* See pcap.h from libpcap, or pcap-savefile(5) */
+#define PCAP_MAGIC		0xa1b2c3d4
+#define PCAP_VERSION_MAJOR	2
+#define PCAP_VERSION_MINOR	4
+#define PCAP_LINKTYPE_ETHERNET	1
+	const struct {
+		uint32_t magic;
+		uint16_t major;
+		uint16_t minor;
+
+		int32_t thiszone;
+		uint32_t sigfigs;
+		uint32_t snaplen;
+
+		uint32_t linktype;
+	} pcap_hdr = {
+		.magic = PCAP_MAGIC,
+		.major = PCAP_VERSION_MAJOR,
+		.minor = PCAP_VERSION_MINOR,
+		.snaplen = tap_l2_max_len(c),
+		.linktype = PCAP_LINKTYPE_ETHERNET
+	};
+
 	if (pcap_fd != -1)
 		return;
 
diff --git a/tap.c b/tap.c
index ede547c..182a115 100644
--- a/tap.c
+++ b/tap.c
@@ -82,6 +82,25 @@ static PACKET_POOL_NOINIT(pool_tap6, TAP_MSGS, pkt_buf);
 #define TAP_SEQS		128 /* Different L4 tuples in one batch */
 #define FRAGMENT_MSG_RATE	10  /* # seconds between fragment warnings */
 
+/**
+ * tap_l2_max_len() - Maximum frame size (including L2 header) for current mode
+ * @c:		Execution context
+ */
+unsigned long tap_l2_max_len(const struct ctx *c)
+{
+	/* NOLINTBEGIN(bugprone-branch-clone): values can be the same */
+	switch (c->mode) {
+	case MODE_PASST:
+		return L2_MAX_LEN_PASST;
+	case MODE_PASTA:
+		return L2_MAX_LEN_PASTA;
+	case MODE_VU:
+		return L2_MAX_LEN_VU;
+	}
+	/* NOLINTEND(bugprone-branch-clone) */
+	ASSERT(0);
+}
+
 /**
  * tap_send_single() - Send a single frame
  * @c:		Execution context
diff --git a/tap.h b/tap.h
index 84e9fdb..dd39fd8 100644
--- a/tap.h
+++ b/tap.h
@@ -69,6 +69,7 @@ static inline void tap_hdr_update(struct tap_hdr *thdr, size_t l2len)
 		thdr->vnet_len = htonl(l2len);
 }
 
+unsigned long tap_l2_max_len(const struct ctx *c);
 void *tap_push_l2h(const struct ctx *c, void *buf, uint16_t proto);
 void *tap_push_ip4h(struct iphdr *ip4h, struct in_addr src,
 		     struct in_addr dst, size_t l4len, uint8_t proto);

From 26df8a3608e7b006c00f44a9029bcadb6d5e4153 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Wed, 12 Mar 2025 13:18:39 +1100
Subject: [PATCH 288/382] conf: Limit maximum MTU based on backend frame size

The -m option controls the MTU, that is, the maximum size of a
transmissible L3 datagram, not including L2 headers.  We currently limit it
to ETH_MAX_MTU, which sounds like it makes sense.  But ETH_MAX_MTU is
confusing: it's used inconsistently, sometimes meaning the maximum L3
datagram size and sometimes the maximum L2 frame size.  Even within conf()
we explicitly account for
the L2 header size when computing the default --mtu value, but not when
we compute the maximum --mtu value.

Clean this up by reworking the maximum MTU computation to be the minimum of
IP_MAX_MTU (65535) and the largest IP datagram which can fit into our L2
frames once we account for the L2 header.  The latter can vary
depending on our tap backend, although it doesn't right now.
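
A worked example with the current backends, all of which report a
65535 byte frame limit (illustration only, not from the patch):

  max MTU     = MIN(IP_MAX_MTU, tap_l2_max_len(c) - ETH_HLEN)
              = MIN(65535, 65535 - 14) = 65521
  default MTU = ROUND_DOWN(65521, sizeof(uint32_t)) = 65520   (as before)

so "-m 65521" is now the largest accepted value, while "-m 65535" is
rejected.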

Link: https://bugs.passt.top/show_bug.cgi?id=66
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 conf.c | 11 +++++++----
 util.h |  3 ---
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/conf.c b/conf.c
index b58e2a6..c760f79 100644
--- a/conf.c
+++ b/conf.c
@@ -1434,6 +1434,7 @@ void conf(struct ctx *c, int argc, char **argv)
 	enum fwd_ports_mode fwd_default = FWD_NONE;
 	bool v4_only = false, v6_only = false;
 	unsigned dns4_idx = 0, dns6_idx = 0;
+	unsigned long max_mtu = IP_MAX_MTU;
 	struct fqdn *dnss = c->dns_search;
 	unsigned int ifi4 = 0, ifi6 = 0;
 	const char *logfile = NULL;
@@ -1449,7 +1450,9 @@ void conf(struct ctx *c, int argc, char **argv)
 		fwd_default = FWD_AUTO;
 	}
 
-	c->mtu = ROUND_DOWN(ETH_MAX_MTU - ETH_HLEN, sizeof(uint32_t));
+	if (tap_l2_max_len(c) - ETH_HLEN < max_mtu)
+		max_mtu = tap_l2_max_len(c) - ETH_HLEN;
+	c->mtu = ROUND_DOWN(max_mtu, sizeof(uint32_t));
 	c->tcp.fwd_in.mode = c->tcp.fwd_out.mode = FWD_UNSET;
 	c->udp.fwd_in.mode = c->udp.fwd_out.mode = FWD_UNSET;
 	memcpy(c->our_tap_mac, MAC_OUR_LAA, ETH_ALEN);
@@ -1711,9 +1714,9 @@ void conf(struct ctx *c, int argc, char **argv)
 			if (errno || *e)
 				die("Invalid MTU: %s", optarg);
 
-			if (mtu > ETH_MAX_MTU) {
-				die("MTU %lu too large (max %u)",
-				    mtu, ETH_MAX_MTU);
+			if (mtu > max_mtu) {
+				die("MTU %lu too large (max %lu)",
+				    mtu, max_mtu);
 			}
 
 			c->mtu = mtu;
diff --git a/util.h b/util.h
index 0f70f4d..4d512fa 100644
--- a/util.h
+++ b/util.h
@@ -31,9 +31,6 @@
 #ifndef SECCOMP_RET_KILL_PROCESS
 #define SECCOMP_RET_KILL_PROCESS	SECCOMP_RET_KILL
 #endif
-#ifndef ETH_MAX_MTU
-#define ETH_MAX_MTU			USHRT_MAX
-#endif
 #ifndef IP_MAX_MTU
 #define IP_MAX_MTU			USHRT_MAX
 #endif

From 78f1f0fdfc1831f2ca3a65c2cee98c44ff3c30ab Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Wed, 12 Mar 2025 16:26:57 +1100
Subject: [PATCH 289/382] test/perf: Simplify iperf3 server lifetime management

After we start the iperf3 server in the background, we have a sleep to
make sure it's ready to receive connections.  We can simplify this slightly
by using the -D option to have iperf3 background itself rather than
backgrounding it manually.  With -D, iperf3 doesn't return until the server
is ready to use.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 test/lib/test | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/test/lib/test b/test/lib/test
index 758250a..7349674 100755
--- a/test/lib/test
+++ b/test/lib/test
@@ -20,10 +20,7 @@ test_iperf3s() {
 	__sctx="${1}"
 	__port="${2}"
 
-	pane_or_context_run_bg "${__sctx}" 				\
-		 'iperf3 -s -p'${__port}' & echo $! > s.pid'		\
-
-	sleep 1		# Wait for server to be ready
+	pane_or_context_run "${__sctx}" 'iperf3 -s -p'${__port}' -D -I s.pid'
 }
 
 # test_iperf3k() - Kill iperf3 server
@@ -31,7 +28,7 @@ test_iperf3s() {
 test_iperf3k() {
 	__sctx="${1}"
 
-	pane_or_context_run "${__sctx}" 'kill -INT $(cat s.pid); rm s.pid'
+	pane_or_context_run "${__sctx}" 'kill -INT $(cat s.pid)'
 
 	sleep 1		# Wait for kernel to free up ports
 }

From 96fe5548cb16fe2664ad121c2976048ccad6a1ab Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Wed, 12 Mar 2025 14:43:59 +1100
Subject: [PATCH 290/382] conf: Unify several paths in conf_ports()

In conf_ports() we have three different paths which actually do the setup
of an individual forwarded port: one for the "all" case, one for the
exclusions-only case, and one for the case of a range of ports with
possible exclusions.

We can unify those cases using a new helper which handles a single range
of ports, with a bitmap of exclusions.  Although this is slightly longer
(largely due to the new helper's function comment), it reduces duplicated
logic.  It will also make future improvements to the tracking of port
forwards easier.

The new conf_ports_range_except() function has a pretty prodigious
parameter list, but I still think it's an overall improvement in conceptual
complexity.
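
As a rough illustration of how a specifier maps onto the helper
(hypothetical values, not taken from the patch): assuming
"-t 2000-2010,~2005" parses as a single range chunk plus one exclusion,
it would reach the helper more or less as:

  conf_ports_range_except(c, 't', optarg, fwd, addr, ifname,
                          2000, 2010, exclude /* bit 2005 set */,
                          2000, false);

that is, with a zero delta (to == first), so ports forward to
themselves.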

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 conf.c | 173 ++++++++++++++++++++++++++++++---------------------------
 1 file changed, 90 insertions(+), 83 deletions(-)

diff --git a/conf.c b/conf.c
index c760f79..0e2e8dc 100644
--- a/conf.c
+++ b/conf.c
@@ -123,6 +123,75 @@ static int parse_port_range(const char *s, char **endptr,
 	return 0;
 }
 
+/**
+ * conf_ports_range_except() - Set up forwarding for a range of ports minus a
+ *                             bitmap of exclusions
+ * @c:		Execution context
+ * @optname:	Short option name, t, T, u, or U
+ * @optarg:	Option argument (port specification)
+ * @fwd:	Pointer to @fwd_ports to be updated
+ * @addr:	Listening address
+ * @ifname:	Listening interface
+ * @first:	First port to forward
+ * @last:	Last port to forward
+ * @exclude:	Bitmap of ports to exclude
+ * @to:		Port to translate @first to when forwarding
+ * @weak:	Ignore errors, as long as at least one port is mapped
+ */
+static void conf_ports_range_except(const struct ctx *c, char optname,
+				    const char *optarg, struct fwd_ports *fwd,
+				    const union inany_addr *addr,
+				    const char *ifname,
+				    uint16_t first, uint16_t last,
+				    const uint8_t *exclude, uint16_t to,
+				    bool weak)
+{
+	bool bound_one = false;
+	unsigned i;
+	int ret;
+
+	if (first == 0) {
+		die("Can't forward port 0 for option '-%c %s'",
+		    optname, optarg);
+	}
+
+	for (i = first; i <= last; i++) {
+		if (bitmap_isset(exclude, i))
+			continue;
+
+		if (bitmap_isset(fwd->map, i)) {
+			warn(
+"Altering mapping of already mapped port number: %s", optarg);
+		}
+
+		bitmap_set(fwd->map, i);
+		fwd->delta[i] = to - first;
+
+		if (optname == 't')
+			ret = tcp_sock_init(c, addr, ifname, i);
+		else if (optname == 'u')
+			ret = udp_sock_init(c, 0, addr, ifname, i);
+		else
+			/* No way to check in advance for -T and -U */
+			ret = 0;
+
+		if (ret == -ENFILE || ret == -EMFILE) {
+			die("Can't open enough sockets for port specifier: %s",
+			    optarg);
+		}
+
+		if (!ret) {
+			bound_one = true;
+		} else if (!weak) {
+			die("Failed to bind port %u (%s) for option '-%c %s'",
+			    i, strerror_(-ret), optname, optarg);
+		}
+	}
+
+	if (!bound_one)
+		die("Failed to bind any port for '-%c %s'", optname, optarg);
+}
+
 /**
  * conf_ports() - Parse port configuration options, initialise UDP/TCP sockets
  * @c:		Execution context
@@ -135,10 +204,9 @@ static void conf_ports(const struct ctx *c, char optname, const char *optarg,
 {
 	union inany_addr addr_buf = inany_any6, *addr = &addr_buf;
 	char buf[BUFSIZ], *spec, *ifname = NULL, *p;
-	bool exclude_only = true, bound_one = false;
 	uint8_t exclude[PORT_BITMAP_SIZE] = { 0 };
+	bool exclude_only = true;
 	unsigned i;
-	int ret;
 
 	if (!strcmp(optarg, "none")) {
 		if (fwd->mode)
@@ -173,32 +241,15 @@ static void conf_ports(const struct ctx *c, char optname, const char *optarg,
 
 		fwd->mode = FWD_ALL;
 
-		/* Skip port 0.  It has special meaning for many socket APIs, so
-		 * trying to bind it is not really safe.
-		 */
-		for (i = 1; i < NUM_PORTS; i++) {
+		/* Exclude ephemeral ports */
+		for (i = 0; i < NUM_PORTS; i++)
 			if (fwd_port_is_ephemeral(i))
-				continue;
-
-			bitmap_set(fwd->map, i);
-			if (optname == 't') {
-				ret = tcp_sock_init(c, NULL, NULL, i);
-				if (ret == -ENFILE || ret == -EMFILE)
-					goto enfile;
-				if (!ret)
-					bound_one = true;
-			} else if (optname == 'u') {
-				ret = udp_sock_init(c, 0, NULL, NULL, i);
-				if (ret == -ENFILE || ret == -EMFILE)
-					goto enfile;
-				if (!ret)
-					bound_one = true;
-			}
-		}
-
-		if (!bound_one)
-			goto bind_all_fail;
+				bitmap_set(exclude, i);
 
+		conf_ports_range_except(c, optname, optarg, fwd,
+					NULL, NULL,
+					1, NUM_PORTS - 1, exclude,
+					1, true);
 		return;
 	}
 
@@ -275,37 +326,15 @@ static void conf_ports(const struct ctx *c, char optname, const char *optarg,
 	} while ((p = next_chunk(p, ',')));
 
 	if (exclude_only) {
-		/* Skip port 0.  It has special meaning for many socket APIs, so
-		 * trying to bind it is not really safe.
-		 */
-		for (i = 1; i < NUM_PORTS; i++) {
-			if (fwd_port_is_ephemeral(i) ||
-			    bitmap_isset(exclude, i))
-				continue;
-
-			bitmap_set(fwd->map, i);
-
-			if (optname == 't') {
-				ret = tcp_sock_init(c, addr, ifname, i);
-				if (ret == -ENFILE || ret == -EMFILE)
-					goto enfile;
-				if (!ret)
-					bound_one = true;
-			} else if (optname == 'u') {
-				ret = udp_sock_init(c, 0, addr, ifname, i);
-				if (ret == -ENFILE || ret == -EMFILE)
-					goto enfile;
-				if (!ret)
-					bound_one = true;
-			} else {
-				/* No way to check in advance for -T and -U */
-				bound_one = true;
-			}
-		}
-
-		if (!bound_one)
-			goto bind_all_fail;
+		/* Exclude ephemeral ports */
+		for (i = 0; i < NUM_PORTS; i++)
+			if (fwd_port_is_ephemeral(i))
+				bitmap_set(exclude, i);
 
+		conf_ports_range_except(c, optname, optarg, fwd,
+					addr, ifname,
+					1, NUM_PORTS - 1, exclude,
+					1, true);
 		return;
 	}
 
@@ -334,40 +363,18 @@ static void conf_ports(const struct ctx *c, char optname, const char *optarg,
 		if ((*p != '\0')  && (*p != ',')) /* Garbage after the ranges */
 			goto bad;
 
-		for (i = orig_range.first; i <= orig_range.last; i++) {
-			if (bitmap_isset(fwd->map, i))
-				warn(
-"Altering mapping of already mapped port number: %s", optarg);
-
-			if (bitmap_isset(exclude, i))
-				continue;
-
-			bitmap_set(fwd->map, i);
-
-			fwd->delta[i] = mapped_range.first - orig_range.first;
-
-			ret = 0;
-			if (optname == 't')
-				ret = tcp_sock_init(c, addr, ifname, i);
-			else if (optname == 'u')
-				ret = udp_sock_init(c, 0, addr, ifname, i);
-			if (ret)
-				goto bind_fail;
-		}
+		conf_ports_range_except(c, optname, optarg, fwd,
+					addr, ifname,
+					orig_range.first, orig_range.last,
+					exclude,
+					mapped_range.first, false);
 	} while ((p = next_chunk(p, ',')));
 
 	return;
-enfile:
-	die("Can't open enough sockets for port specifier: %s", optarg);
 bad:
 	die("Invalid port specifier %s", optarg);
 mode_conflict:
 	die("Port forwarding mode '%s' conflicts with previous mode", optarg);
-bind_fail:
-	die("Failed to bind port %u (%s) for option '-%c %s', exiting",
-	    i, strerror_(-ret), optname, optarg);
-bind_all_fail:
-	die("Failed to bind any port for '-%c %s', exiting", optname, optarg);
 }
 
 /**

From cb5b593563402680bee850245667f2e71b0d1bda Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Thu, 13 Mar 2025 13:56:17 +1100
Subject: [PATCH 291/382] tcp, flow: Better use flow-specific logging helpers

A number of places in the TCP code use general logging functions instead
of the flow-specific ones.  That includes a few older ones as well as many
places in the new migration code.  Thus they either don't identify which
flow the problem happened on, or identify it in a non-standard way.

Convert many of these to use the existing flow-specific helpers.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 flow.c         |  16 ++--
 tcp.c          | 252 +++++++++++++++++++++++++++----------------------
 tcp.h          |   1 -
 tcp_buf.c      |   4 +-
 tcp_internal.h |   1 +
 tcp_vu.c       |   2 +-
 6 files changed, 149 insertions(+), 127 deletions(-)

diff --git a/flow.c b/flow.c
index 5e64b79..8622242 100644
--- a/flow.c
+++ b/flow.c
@@ -1037,8 +1037,8 @@ int flow_migrate_source(struct ctx *c, const struct migrate_stage *stage,
 	foreach_established_tcp_flow(flow) {
 		rc = tcp_flow_migrate_source(fd, &flow->tcp);
 		if (rc) {
-			err("Can't send data, flow %u: %s", FLOW_IDX(flow),
-			    strerror_(-rc));
+			flow_err(flow, "Can't send data: %s",
+				 strerror_(-rc));
 			if (!first)
 				die("Inconsistent migration state, exiting");
 
@@ -1064,8 +1064,8 @@ int flow_migrate_source(struct ctx *c, const struct migrate_stage *stage,
 	foreach_established_tcp_flow(flow) {
 		rc = tcp_flow_migrate_source_ext(fd, &flow->tcp);
 		if (rc) {
-			err("Extended data for flow %u: %s", FLOW_IDX(flow),
-			    strerror_(-rc));
+			flow_err(flow, "Can't send extended data: %s",
+				 strerror_(-rc));
 
 			if (rc == -EIO)
 				die("Inconsistent migration state, exiting");
@@ -1112,8 +1112,8 @@ int flow_migrate_target(struct ctx *c, const struct migrate_stage *stage,
 	for (i = 0; i < count; i++) {
 		rc = tcp_flow_migrate_target(c, fd);
 		if (rc) {
-			debug("Migration data failure at flow %u: %s, abort",
-			      i, strerror_(-rc));
+			flow_dbg(FLOW(i), "Migration data failure, abort: %s",
+				 strerror_(-rc));
 			return -rc;
 		}
 	}
@@ -1123,8 +1123,8 @@ int flow_migrate_target(struct ctx *c, const struct migrate_stage *stage,
 	for (i = 0; i < count; i++) {
 		rc = tcp_flow_migrate_target_ext(c, &flowtab[i].tcp, fd);
 		if (rc) {
-			debug("Migration data failure at flow %u: %s, abort",
-			      i, strerror_(-rc));
+			flow_dbg(FLOW(i), "Migration data failure, abort: %s",
+				 strerror_(-rc));
 			return -rc;
 		}
 	}
diff --git a/tcp.c b/tcp.c
index 32a08bd..a4c840e 100644
--- a/tcp.c
+++ b/tcp.c
@@ -434,19 +434,20 @@ static struct tcp_tap_conn *conn_at_sidx(flow_sidx_t sidx)
 }
 
 /**
- * tcp_set_peek_offset() - Set SO_PEEK_OFF offset on a socket if supported
- * @s:          Socket to update
+ * tcp_set_peek_offset() - Set SO_PEEK_OFF offset on connection if supported
+ * @conn:	Pointer to the TCP connection structure
  * @offset:     Offset in bytes
  *
  * Return:      -1 when it fails, 0 otherwise.
  */
-int tcp_set_peek_offset(int s, int offset)
+int tcp_set_peek_offset(const struct tcp_tap_conn *conn, int offset)
 {
 	if (!peek_offset_cap)
 		return 0;
 
-	if (setsockopt(s, SOL_SOCKET, SO_PEEK_OFF, &offset, sizeof(offset))) {
-		err("Failed to set SO_PEEK_OFF to %i in socket %i", offset, s);
+	if (setsockopt(conn->sock, SOL_SOCKET, SO_PEEK_OFF,
+		       &offset, sizeof(offset))) {
+		flow_perror(conn, "Failed to set SO_PEEK_OFF to %i", offset);
 		return -1;
 	}
 	return 0;
@@ -1757,7 +1758,7 @@ static int tcp_data_from_tap(const struct ctx *c, struct tcp_tap_conn *conn,
 			   "fast re-transmit, ACK: %u, previous sequence: %u",
 			   max_ack_seq, conn->seq_to_tap);
 		conn->seq_to_tap = max_ack_seq;
-		if (tcp_set_peek_offset(conn->sock, 0)) {
+		if (tcp_set_peek_offset(conn, 0)) {
 			tcp_rst(c, conn);
 			return -1;
 		}
@@ -1854,7 +1855,7 @@ static void tcp_conn_from_sock_finish(const struct ctx *c,
 	conn->seq_ack_to_tap = conn->seq_from_tap;
 
 	conn_event(c, conn, ESTABLISHED);
-	if (tcp_set_peek_offset(conn->sock, 0)) {
+	if (tcp_set_peek_offset(conn, 0)) {
 		tcp_rst(c, conn);
 		return;
 	}
@@ -2022,7 +2023,7 @@ int tcp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af,
 			goto reset;
 
 		conn_event(c, conn, ESTABLISHED);
-		if (tcp_set_peek_offset(conn->sock, 0))
+		if (tcp_set_peek_offset(conn, 0))
 			goto reset;
 
 		if (th->fin) {
@@ -2286,7 +2287,7 @@ void tcp_timer_handler(const struct ctx *c, union epoll_ref ref)
 			conn->seq_to_tap = conn->seq_ack_from_tap;
 			if (!conn->wnd_from_tap)
 				conn->wnd_from_tap = 1; /* Zero-window probe */
-			if (tcp_set_peek_offset(conn->sock, 0)) {
+			if (tcp_set_peek_offset(conn, 0)) {
 				tcp_rst(c, conn);
 			} else {
 				tcp_data_from_sock(c, conn);
@@ -2810,20 +2811,21 @@ int tcp_flow_repair_off(struct ctx *c, const struct tcp_tap_conn *conn)
 
 /**
  * tcp_flow_dump_tinfo() - Dump window scale, tcpi_state, tcpi_options
- * @c:		Execution context
+ * @conn:	Pointer to the TCP connection structure
  * @t:		Extended migration data
  *
  * Return: 0 on success, negative error code on failure
  */
-static int tcp_flow_dump_tinfo(int s, struct tcp_tap_transfer_ext *t)
+static int tcp_flow_dump_tinfo(const struct tcp_tap_conn *conn,
+			       struct tcp_tap_transfer_ext *t)
 {
 	struct tcp_info tinfo;
 	socklen_t sl;
 
 	sl = sizeof(tinfo);
-	if (getsockopt(s, SOL_TCP, TCP_INFO, &tinfo, &sl)) {
+	if (getsockopt(conn->sock, SOL_TCP, TCP_INFO, &tinfo, &sl)) {
 		int rc = -errno;
-		err_perror("Querying TCP_INFO, socket %i", s);
+		flow_perror(conn, "Querying TCP_INFO");
 		return rc;
 	}
 
@@ -2837,18 +2839,19 @@ static int tcp_flow_dump_tinfo(int s, struct tcp_tap_transfer_ext *t)
 
 /**
  * tcp_flow_dump_mss() - Dump MSS clamp (not current MSS) via TCP_MAXSEG
- * @c:		Execution context
+ * @conn:	Pointer to the TCP connection structure
  * @t:		Extended migration data
  *
  * Return: 0 on success, negative error code on failure
  */
-static int tcp_flow_dump_mss(int s, struct tcp_tap_transfer_ext *t)
+static int tcp_flow_dump_mss(const struct tcp_tap_conn *conn,
+			     struct tcp_tap_transfer_ext *t)
 {
 	socklen_t sl = sizeof(t->mss);
 
-	if (getsockopt(s, SOL_TCP, TCP_MAXSEG, &t->mss, &sl)) {
+	if (getsockopt(conn->sock, SOL_TCP, TCP_MAXSEG, &t->mss, &sl)) {
 		int rc = -errno;
-		err_perror("Getting MSS, socket %i", s);
+		flow_perror(conn, "Getting MSS");
 		return rc;
 	}
 
@@ -2857,19 +2860,20 @@ static int tcp_flow_dump_mss(int s, struct tcp_tap_transfer_ext *t)
 
 /**
  * tcp_flow_dump_wnd() - Dump current tcp_repair_window parameters
- * @c:		Execution context
+ * @conn:	Pointer to the TCP connection structure
  * @t:		Extended migration data
  *
  * Return: 0 on success, negative error code on failure
  */
-static int tcp_flow_dump_wnd(int s, struct tcp_tap_transfer_ext *t)
+static int tcp_flow_dump_wnd(const struct tcp_tap_conn *conn,
+			     struct tcp_tap_transfer_ext *t)
 {
 	struct tcp_repair_window wnd;
 	socklen_t sl = sizeof(wnd);
 
-	if (getsockopt(s, IPPROTO_TCP, TCP_REPAIR_WINDOW, &wnd, &sl)) {
+	if (getsockopt(conn->sock, IPPROTO_TCP, TCP_REPAIR_WINDOW, &wnd, &sl)) {
 		int rc = -errno;
-		err_perror("Getting window repair data, socket %i", s);
+		flow_perror(conn, "Getting window repair data");
 		return rc;
 	}
 
@@ -2893,12 +2897,13 @@ static int tcp_flow_dump_wnd(int s, struct tcp_tap_transfer_ext *t)
 
 /**
  * tcp_flow_repair_wnd() - Restore window parameters from extended data
- * @c:		Execution context
+ * @conn:	Pointer to the TCP connection structure
  * @t:		Extended migration data
  *
  * Return: 0 on success, negative error code on failure
  */
-static int tcp_flow_repair_wnd(int s, const struct tcp_tap_transfer_ext *t)
+static int tcp_flow_repair_wnd(const struct tcp_tap_conn *conn,
+			       const struct tcp_tap_transfer_ext *t)
 {
 	struct tcp_repair_window wnd;
 
@@ -2908,9 +2913,10 @@ static int tcp_flow_repair_wnd(int s, const struct tcp_tap_transfer_ext *t)
 	wnd.rcv_wnd	= t->rcv_wnd;
 	wnd.rcv_wup	= t->rcv_wup;
 
-	if (setsockopt(s, IPPROTO_TCP, TCP_REPAIR_WINDOW, &wnd, sizeof(wnd))) {
+	if (setsockopt(conn->sock, IPPROTO_TCP, TCP_REPAIR_WINDOW,
+		       &wnd, sizeof(wnd))) {
 		int rc = -errno;
-		err_perror("Setting window data, socket %i", s);
+		flow_perror(conn, "Setting window data");
 		return rc;
 	}
 
@@ -2919,16 +2925,17 @@ static int tcp_flow_repair_wnd(int s, const struct tcp_tap_transfer_ext *t)
 
 /**
  * tcp_flow_select_queue() - Select queue (receive or send) for next operation
- * @s:		Socket
+ * @conn:	Connection to select queue for
  * @queue:	TCP_RECV_QUEUE or TCP_SEND_QUEUE
  *
  * Return: 0 on success, negative error code on failure
  */
-static int tcp_flow_select_queue(int s, int queue)
+static int tcp_flow_select_queue(const struct tcp_tap_conn *conn, int queue)
 {
-	if (setsockopt(s, SOL_TCP, TCP_REPAIR_QUEUE, &queue, sizeof(queue))) {
+	if (setsockopt(conn->sock, SOL_TCP, TCP_REPAIR_QUEUE,
+		       &queue, sizeof(queue))) {
 		int rc = -errno;
-		err_perror("Selecting TCP_SEND_QUEUE, socket %i", s);
+		flow_perror(conn, "Selecting TCP_SEND_QUEUE");
 		return rc;
 	}
 
@@ -2937,26 +2944,28 @@ static int tcp_flow_select_queue(int s, int queue)
 
 /**
  * tcp_flow_dump_sndqueue() - Dump send queue, length of sent and not sent data
- * @s:		Socket
+ * @conn:	Connection to dump queue for
  * @t:		Extended migration data
  *
  * Return: 0 on success, negative error code on failure
  *
  * #syscalls:vu ioctl
  */
-static int tcp_flow_dump_sndqueue(int s, struct tcp_tap_transfer_ext *t)
+static int tcp_flow_dump_sndqueue(const struct tcp_tap_conn *conn,
+				  struct tcp_tap_transfer_ext *t)
 {
+	int s = conn->sock;
 	ssize_t rc;
 
 	if (ioctl(s, SIOCOUTQ, &t->sndq) < 0) {
 		rc = -errno;
-		err_perror("Getting send queue size, socket %i", s);
+		flow_perror(conn, "Getting send queue size");
 		return rc;
 	}
 
 	if (ioctl(s, SIOCOUTQNSD, &t->notsent) < 0) {
 		rc = -errno;
-		err_perror("Getting not sent count, socket %i", s);
+		flow_perror(conn, "Getting not sent count");
 		return rc;
 	}
 
@@ -2975,14 +2984,16 @@ static int tcp_flow_dump_sndqueue(int s, struct tcp_tap_transfer_ext *t)
 	}
 
 	if (t->notsent > t->sndq) {
-		err("Invalid notsent count socket %i, send: %u, not sent: %u",
-		    s, t->sndq, t->notsent);
+		flow_err(conn,
+			 "Invalid notsent count socket %i, send: %u, not sent: %u",
+			 s, t->sndq, t->notsent);
 		return -EINVAL;
 	}
 
 	if (t->sndq > TCP_MIGRATE_SND_QUEUE_MAX) {
-		err("Send queue too large to migrate socket %i: %u bytes",
-		    s, t->sndq);
+		flow_err(conn,
+			 "Send queue too large to migrate socket %i: %u bytes",
+			 s, t->sndq);
 		return -ENOBUFS;
 	}
 
@@ -2993,13 +3004,13 @@ static int tcp_flow_dump_sndqueue(int s, struct tcp_tap_transfer_ext *t)
 			rc = 0;
 		} else {
 			rc = -errno;
-			err_perror("Can't read send queue, socket %i", s);
+			flow_perror(conn, "Can't read send queue");
 			return rc;
 		}
 	}
 
 	if ((uint32_t)rc < t->sndq) {
-		err("Short read migrating send queue");
+		flow_err(conn, "Short read migrating send queue");
 		return -ENXIO;
 	}
 
@@ -3010,19 +3021,20 @@ static int tcp_flow_dump_sndqueue(int s, struct tcp_tap_transfer_ext *t)
 
 /**
  * tcp_flow_repair_queue() - Restore contents of a given (pre-selected) queue
- * @s:		Socket
+ * @conn:	Connection to repair queue for
  * @len:	Length of data to be restored
  * @buf:	Buffer with content of pending data queue
  *
  * Return: 0 on success, negative error code on failure
  */
-static int tcp_flow_repair_queue(int s, size_t len, uint8_t *buf)
+static int tcp_flow_repair_queue(const struct tcp_tap_conn *conn,
+				 size_t len, uint8_t *buf)
 {
 	size_t chunk = len;
 	uint8_t *p = buf;
 
 	while (len > 0) {
-		ssize_t rc = send(s, p, MIN(len, chunk), 0);
+		ssize_t rc = send(conn->sock, p, MIN(len, chunk), 0);
 
 		if (rc < 0) {
 			if ((errno == ENOBUFS || errno == ENOMEM) &&
@@ -3032,7 +3044,7 @@ static int tcp_flow_repair_queue(int s, size_t len, uint8_t *buf)
 			}
 
 			rc = -errno;
-			err_perror("Can't write queue, socket %i", s);
+			flow_perror(conn, "Can't write queue");
 			return rc;
 		}
 
@@ -3045,18 +3057,18 @@ static int tcp_flow_repair_queue(int s, size_t len, uint8_t *buf)
 
 /**
  * tcp_flow_dump_seq() - Dump current sequence of pre-selected queue
- * @s:		Socket
+ * @conn:	Pointer to the TCP connection structure
  * @v:		Sequence value, set on return
  *
  * Return: 0 on success, negative error code on failure
  */
-static int tcp_flow_dump_seq(int s, uint32_t *v)
+static int tcp_flow_dump_seq(const struct tcp_tap_conn *conn, uint32_t *v)
 {
 	socklen_t sl = sizeof(*v);
 
-	if (getsockopt(s, SOL_TCP, TCP_QUEUE_SEQ, v, &sl)) {
+	if (getsockopt(conn->sock, SOL_TCP, TCP_QUEUE_SEQ, v, &sl)) {
 		int rc = -errno;
-		err_perror("Dumping sequence, socket %i", s);
+		flow_perror(conn, "Dumping sequence");
 		return rc;
 	}
 
@@ -3065,16 +3077,17 @@ static int tcp_flow_dump_seq(int s, uint32_t *v)
 
 /**
  * tcp_flow_repair_seq() - Restore sequence for pre-selected queue
- * @s:		Socket
+ * @conn:	Connection to repair sequences for
  * @v:		Sequence value to be set
  *
  * Return: 0 on success, negative error code on failure
  */
-static int tcp_flow_repair_seq(int s, const uint32_t *v)
+static int tcp_flow_repair_seq(const struct tcp_tap_conn *conn,
+			       const uint32_t *v)
 {
-	if (setsockopt(s, SOL_TCP, TCP_QUEUE_SEQ, v, sizeof(*v))) {
+	if (setsockopt(conn->sock, SOL_TCP, TCP_QUEUE_SEQ, v, sizeof(*v))) {
 		int rc = -errno;
-		err_perror("Setting sequence, socket %i", s);
+		flow_perror(conn, "Setting sequence");
 		return rc;
 	}
 
@@ -3083,15 +3096,17 @@ static int tcp_flow_repair_seq(int s, const uint32_t *v)
 
 /**
  * tcp_flow_dump_rcvqueue() - Dump receive queue and its length, seal/block it
- * @s:		Socket
+ * @conn:	Pointer to the TCP connection structure
  * @t:		Extended migration data
  *
  * Return: 0 on success, negative error code on failure
  *
  * #syscalls:vu ioctl
  */
-static int tcp_flow_dump_rcvqueue(int s, struct tcp_tap_transfer_ext *t)
+static int tcp_flow_dump_rcvqueue(const struct tcp_tap_conn *conn,
+				  struct tcp_tap_transfer_ext *t)
 {
+	int s = conn->sock;
 	ssize_t rc;
 
 	if (ioctl(s, SIOCINQ, &t->rcvq) < 0) {
@@ -3111,8 +3126,9 @@ static int tcp_flow_dump_rcvqueue(int s, struct tcp_tap_transfer_ext *t)
 		t->rcvq--;
 
 	if (t->rcvq > TCP_MIGRATE_RCV_QUEUE_MAX) {
-		err("Receive queue too large to migrate socket %i: %u bytes",
-		    s, t->rcvq);
+		flow_err(conn,
+			 "Receive queue too large to migrate socket: %u bytes",
+			 t->rcvq);
 		return -ENOBUFS;
 	}
 
@@ -3122,13 +3138,13 @@ static int tcp_flow_dump_rcvqueue(int s, struct tcp_tap_transfer_ext *t)
 			rc = 0;
 		} else {
 			rc = -errno;
-			err_perror("Can't read receive queue for socket %i", s);
+			flow_perror(conn, "Can't read receive queue");
 			return rc;
 		}
 	}
 
 	if ((uint32_t)rc < t->rcvq) {
-		err("Short read migrating receive queue");
+		flow_err(conn, "Short read migrating receive queue");
 		return -ENXIO;
 	}
 
@@ -3137,12 +3153,13 @@ static int tcp_flow_dump_rcvqueue(int s, struct tcp_tap_transfer_ext *t)
 
 /**
  * tcp_flow_repair_opt() - Set repair "options" (MSS, scale, SACK, timestamps)
- * @s:		Socket
+ * @conn:	Pointer to the TCP connection structure
  * @t:		Extended migration data
  *
  * Return: 0 on success, negative error code on failure
  */
-static int tcp_flow_repair_opt(int s, const struct tcp_tap_transfer_ext *t)
+static int tcp_flow_repair_opt(const struct tcp_tap_conn *conn,
+			       const struct tcp_tap_transfer_ext *t)
 {
 	const struct tcp_repair_opt opts[] = {
 		{ TCPOPT_WINDOW,		t->snd_ws + (t->rcv_ws << 16) },
@@ -3156,9 +3173,9 @@ static int tcp_flow_repair_opt(int s, const struct tcp_tap_transfer_ext *t)
 				!!(t->tcpi_options & TCPI_OPT_SACK) +
 				!!(t->tcpi_options & TCPI_OPT_TIMESTAMPS));
 
-	if (setsockopt(s, SOL_TCP, TCP_REPAIR_OPTIONS, opts, sl)) {
+	if (setsockopt(conn->sock, SOL_TCP, TCP_REPAIR_OPTIONS, opts, sl)) {
 		int rc = -errno;
-		err_perror("Setting repair options, socket %i", s);
+		flow_perror(conn, "Setting repair options");
 		return rc;
 	}
 
@@ -3229,36 +3246,36 @@ int tcp_flow_migrate_source_ext(int fd, const struct tcp_tap_conn *conn)
 	/* Disable SO_PEEK_OFF, it will make accessing the queues in repair mode
 	 * weird.
 	 */
-	if (tcp_set_peek_offset(s, -1)) {
+	if (tcp_set_peek_offset(conn, -1)) {
 		rc = -errno;
 		goto fail;
 	}
 
-	if ((rc = tcp_flow_dump_tinfo(s, t)))
+	if ((rc = tcp_flow_dump_tinfo(conn, t)))
 		goto fail;
 
-	if ((rc = tcp_flow_dump_mss(s, t)))
+	if ((rc = tcp_flow_dump_mss(conn, t)))
 		goto fail;
 
-	if ((rc = tcp_flow_dump_wnd(s, t)))
+	if ((rc = tcp_flow_dump_wnd(conn, t)))
 		goto fail;
 
-	if ((rc = tcp_flow_select_queue(s, TCP_SEND_QUEUE)))
+	if ((rc = tcp_flow_select_queue(conn, TCP_SEND_QUEUE)))
 		goto fail;
 
-	if ((rc = tcp_flow_dump_sndqueue(s, t)))
+	if ((rc = tcp_flow_dump_sndqueue(conn, t)))
 		goto fail;
 
-	if ((rc = tcp_flow_dump_seq(s, &t->seq_snd)))
+	if ((rc = tcp_flow_dump_seq(conn, &t->seq_snd)))
 		goto fail;
 
-	if ((rc = tcp_flow_select_queue(s, TCP_RECV_QUEUE)))
+	if ((rc = tcp_flow_select_queue(conn, TCP_RECV_QUEUE)))
 		goto fail;
 
-	if ((rc = tcp_flow_dump_rcvqueue(s, t)))
+	if ((rc = tcp_flow_dump_rcvqueue(conn, t)))
 		goto fail;
 
-	if ((rc = tcp_flow_dump_seq(s, &t->seq_rcv)))
+	if ((rc = tcp_flow_dump_seq(conn, &t->seq_rcv)))
 		goto fail;
 
 	close(s);
@@ -3269,14 +3286,14 @@ int tcp_flow_migrate_source_ext(int fd, const struct tcp_tap_conn *conn)
 	t->seq_rcv	-= t->rcvq;
 	t->seq_snd	-= t->sndq;
 
-	debug("Extended migration data, socket %i sequences send %u receive %u",
-	      s, t->seq_snd, t->seq_rcv);
-	debug("  pending queues: send %u not sent %u receive %u",
-	      t->sndq, t->notsent, t->rcvq);
-	debug("  window: snd_wl1 %u snd_wnd %u max %u rcv_wnd %u rcv_wup %u",
-	      t->snd_wl1, t->snd_wnd, t->max_window, t->rcv_wnd, t->rcv_wup);
-	debug("  SO_PEEK_OFF %s  offset=%"PRIu32,
-	      peek_offset_cap ? "enabled" : "disabled", peek_offset);
+	flow_dbg(conn, "Extended migration data, socket %i sequences send %u receive %u",
+		 s, t->seq_snd, t->seq_rcv);
+	flow_dbg(conn, "  pending queues: send %u not sent %u receive %u",
+		 t->sndq, t->notsent, t->rcvq);
+	flow_dbg(conn, "  window: snd_wl1 %u snd_wnd %u max %u rcv_wnd %u rcv_wup %u",
+		 t->snd_wl1, t->snd_wnd, t->max_window, t->rcv_wnd, t->rcv_wup);
+	flow_dbg(conn, "  SO_PEEK_OFF %s  offset=%"PRIu32,
+		 peek_offset_cap ? "enabled" : "disabled", peek_offset);
 
 	/* Endianness fix-ups */
 	t->seq_snd	= htonl(t->seq_snd);
@@ -3292,17 +3309,17 @@ int tcp_flow_migrate_source_ext(int fd, const struct tcp_tap_conn *conn)
 	t->rcv_wup	= htonl(t->rcv_wup);
 
 	if (write_all_buf(fd, t, sizeof(*t))) {
-		err_perror("Failed to write extended data, socket %i", s);
+		flow_perror(conn, "Failed to write extended data");
 		return -EIO;
 	}
 
 	if (write_all_buf(fd, tcp_migrate_snd_queue, ntohl(t->sndq))) {
-		err_perror("Failed to write send queue data, socket %i", s);
+		flow_perror(conn, "Failed to write send queue data");
 		return -EIO;
 	}
 
 	if (write_all_buf(fd, tcp_migrate_rcv_queue, ntohl(t->rcvq))) {
-		err_perror("Failed to write receive queue data, socket %i", s);
+		flow_perror(conn, "Failed to write receive queue data");
 		return -EIO;
 	}
 
@@ -3317,7 +3334,7 @@ fail:
 	t->tcpi_state = 0; /* Not defined: tell the target to skip this flow */
 
 	if (write_all_buf(fd, t, sizeof(*t))) {
-		err_perror("Failed to write extended data, socket %i", s);
+		flow_perror(conn, "Failed to write extended data");
 		return -EIO;
 	}
 
@@ -3347,19 +3364,20 @@ static int tcp_flow_repair_socket(struct ctx *c, struct tcp_tap_conn *conn)
 	if ((conn->sock = socket(af, SOCK_STREAM | SOCK_NONBLOCK | SOCK_CLOEXEC,
 				 IPPROTO_TCP)) < 0) {
 		rc = -errno;
-		err_perror("Failed to create socket for migrated flow");
+		flow_perror(conn, "Failed to create socket for migrated flow");
 		return rc;
 	}
 	s = conn->sock;
 
 	if (setsockopt(s, SOL_SOCKET, SO_REUSEADDR, &(int){ 1 }, sizeof(int)))
-		debug_perror("Setting SO_REUSEADDR on socket %i", s);
+		flow_dbg_perror(conn, "Failed to set SO_REUSEADDR on socket %i",
+				s);
 
 	tcp_sock_set_nodelay(s);
 
 	if (bind(s, &a.sa, sizeof(a))) {
 		rc = -errno;
-		err_perror("Failed to bind socket for migrated flow");
+		flow_perror(conn, "Failed to bind socket for migrated flow");
 		goto err;
 	}
 
@@ -3390,7 +3408,7 @@ static int tcp_flow_repair_connect(const struct ctx *c,
 	rc = flowside_connect(c, conn->sock, PIF_HOST, tgt);
 	if (rc) {
 		rc = -errno;
-		err_perror("Failed to connect migrated socket %i", conn->sock);
+		flow_perror(conn, "Failed to connect migrated socket");
 		return rc;
 	}
 
@@ -3421,8 +3439,8 @@ int tcp_flow_migrate_target(struct ctx *c, int fd)
 	}
 
 	if (read_all_buf(fd, &t, sizeof(t))) {
+		flow_perror(flow, "Failed to receive migration data");
 		flow_alloc_cancel(flow);
-		err_perror("Failed to receive migration data");
 		return -errno;
 	}
 
@@ -3481,7 +3499,7 @@ int tcp_flow_migrate_target_ext(struct ctx *c, struct tcp_tap_conn *conn, int fd
 
 	if (read_all_buf(fd, &t, sizeof(t))) {
 		rc = -errno;
-		err_perror("Failed to read extended data for socket %i", s);
+		flow_perror(conn, "Failed to read extended data");
 		return rc;
 	}
 
@@ -3503,31 +3521,34 @@ int tcp_flow_migrate_target_ext(struct ctx *c, struct tcp_tap_conn *conn, int fd
 	t.rcv_wnd	= ntohl(t.rcv_wnd);
 	t.rcv_wup	= ntohl(t.rcv_wup);
 
-	debug("Extended migration data, socket %i sequences send %u receive %u",
-	      s, t.seq_snd, t.seq_rcv);
-	debug("  pending queues: send %u not sent %u receive %u",
-	      t.sndq, t.notsent, t.rcvq);
-	debug("  window: snd_wl1 %u snd_wnd %u max %u rcv_wnd %u rcv_wup %u",
-	      t.snd_wl1, t.snd_wnd, t.max_window, t.rcv_wnd, t.rcv_wup);
-	debug("  SO_PEEK_OFF %s  offset=%"PRIu32,
-	      peek_offset_cap ? "enabled" : "disabled", peek_offset);
+	flow_dbg(conn,
+		 "Extended migration data, socket %i sequences send %u receive %u",
+		 s, t.seq_snd, t.seq_rcv);
+	flow_dbg(conn, "  pending queues: send %u not sent %u receive %u",
+		 t.sndq, t.notsent, t.rcvq);
+	flow_dbg(conn,
+		 "  window: snd_wl1 %u snd_wnd %u max %u rcv_wnd %u rcv_wup %u",
+		 t.snd_wl1, t.snd_wnd, t.max_window, t.rcv_wnd, t.rcv_wup);
+	flow_dbg(conn, "  SO_PEEK_OFF %s  offset=%"PRIu32,
+		 peek_offset_cap ? "enabled" : "disabled", peek_offset);
 
 	if (t.sndq > TCP_MIGRATE_SND_QUEUE_MAX || t.notsent > t.sndq ||
 	    t.rcvq > TCP_MIGRATE_RCV_QUEUE_MAX) {
-		err("Bad queues socket %i, send: %u, not sent: %u, receive: %u",
-		    s, t.sndq, t.notsent, t.rcvq);
+		flow_err(conn,
+			 "Bad queues socket %i, send: %u, not sent: %u, receive: %u",
+			 s, t.sndq, t.notsent, t.rcvq);
 		return -EINVAL;
 	}
 
 	if (read_all_buf(fd, tcp_migrate_snd_queue, t.sndq)) {
 		rc = -errno;
-		err_perror("Failed to read send queue data, socket %i", s);
+		flow_perror(conn, "Failed to read send queue data");
 		return rc;
 	}
 
 	if (read_all_buf(fd, tcp_migrate_rcv_queue, t.rcvq)) {
 		rc = -errno;
-		err_perror("Failed to read receive queue data, socket %i", s);
+		flow_perror(conn, "Failed to read receive queue data");
 		return rc;
 	}
 
@@ -3535,32 +3556,32 @@ int tcp_flow_migrate_target_ext(struct ctx *c, struct tcp_tap_conn *conn, int fd
 		/* We weren't able to create the socket, discard flow */
 		goto fail;
 
-	if (tcp_flow_select_queue(s, TCP_SEND_QUEUE))
+	if (tcp_flow_select_queue(conn, TCP_SEND_QUEUE))
 		goto fail;
 
-	if (tcp_flow_repair_seq(s, &t.seq_snd))
+	if (tcp_flow_repair_seq(conn, &t.seq_snd))
 		goto fail;
 
-	if (tcp_flow_select_queue(s, TCP_RECV_QUEUE))
+	if (tcp_flow_select_queue(conn, TCP_RECV_QUEUE))
 		goto fail;
 
-	if (tcp_flow_repair_seq(s, &t.seq_rcv))
+	if (tcp_flow_repair_seq(conn, &t.seq_rcv))
 		goto fail;
 
 	if (tcp_flow_repair_connect(c, conn))
 		goto fail;
 
-	if (tcp_flow_repair_queue(s, t.rcvq, tcp_migrate_rcv_queue))
+	if (tcp_flow_repair_queue(conn, t.rcvq, tcp_migrate_rcv_queue))
 		goto fail;
 
-	if (tcp_flow_select_queue(s, TCP_SEND_QUEUE))
+	if (tcp_flow_select_queue(conn, TCP_SEND_QUEUE))
 		goto fail;
 
-	if (tcp_flow_repair_queue(s, t.sndq - t.notsent,
+	if (tcp_flow_repair_queue(conn, t.sndq - t.notsent,
 				  tcp_migrate_snd_queue))
 		goto fail;
 
-	if (tcp_flow_repair_opt(s, &t))
+	if (tcp_flow_repair_opt(conn, &t))
 		goto fail;
 
 	/* If we sent a FIN sent and it was acknowledged (TCP_FIN_WAIT2), don't
@@ -3575,19 +3596,19 @@ int tcp_flow_migrate_target_ext(struct ctx *c, struct tcp_tap_conn *conn, int fd
 
 		v = TCP_SEND_QUEUE;
 		if (setsockopt(s, SOL_TCP, TCP_REPAIR_QUEUE, &v, sizeof(v)))
-			debug_perror("Selecting repair queue, socket %i", s);
+			flow_perror(conn, "Selecting repair queue");
 		else
 			shutdown(s, SHUT_WR);
 	}
 
-	if (tcp_flow_repair_wnd(s, &t))
+	if (tcp_flow_repair_wnd(conn, &t))
 		goto fail;
 
 	tcp_flow_repair_off(c, conn);
 	repair_flush(c);
 
 	if (t.notsent) {
-		if (tcp_flow_repair_queue(s, t.notsent,
+		if (tcp_flow_repair_queue(conn, t.notsent,
 					  tcp_migrate_snd_queue +
 					  (t.sndq - t.notsent))) {
 			/* This sometimes seems to fail for unclear reasons.
@@ -3607,15 +3628,16 @@ int tcp_flow_migrate_target_ext(struct ctx *c, struct tcp_tap_conn *conn, int fd
 	if (t.tcpi_state == TCP_FIN_WAIT1)
 		shutdown(s, SHUT_WR);
 
-	if (tcp_set_peek_offset(conn->sock, peek_offset))
+	if (tcp_set_peek_offset(conn, peek_offset))
 		goto fail;
 
 	tcp_send_flag(c, conn, ACK);
 	tcp_data_from_sock(c, conn);
 
 	if ((rc = tcp_epoll_ctl(c, conn))) {
-		debug("Failed to subscribe to epoll for migrated socket %i: %s",
-		      conn->sock, strerror_(-rc));
+		flow_dbg(conn,
+			 "Failed to subscribe to epoll for migrated socket: %s",
+			 strerror_(-rc));
 		goto fail;
 	}
 
diff --git a/tcp.h b/tcp.h
index 9142eca..234a803 100644
--- a/tcp.h
+++ b/tcp.h
@@ -25,7 +25,6 @@ void tcp_timer(struct ctx *c, const struct timespec *now);
 void tcp_defer_handler(struct ctx *c);
 
 void tcp_update_l2_buf(const unsigned char *eth_d, const unsigned char *eth_s);
-int tcp_set_peek_offset(int s, int offset);
 
 extern bool peek_offset_cap;
 
diff --git a/tcp_buf.c b/tcp_buf.c
index 72d99c5..0530563 100644
--- a/tcp_buf.c
+++ b/tcp_buf.c
@@ -125,7 +125,7 @@ static void tcp_revert_seq(const struct ctx *c, struct tcp_tap_conn **conns,
 
 		conn->seq_to_tap = seq;
 		peek_offset = conn->seq_to_tap - conn->seq_ack_from_tap;
-		if (tcp_set_peek_offset(conn->sock, peek_offset))
+		if (tcp_set_peek_offset(conn, peek_offset))
 			tcp_rst(c, conn);
 	}
 }
@@ -304,7 +304,7 @@ int tcp_buf_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn)
 			   conn->seq_ack_from_tap, conn->seq_to_tap);
 		conn->seq_to_tap = conn->seq_ack_from_tap;
 		already_sent = 0;
-		if (tcp_set_peek_offset(s, 0)) {
+		if (tcp_set_peek_offset(conn, 0)) {
 			tcp_rst(c, conn);
 			return -1;
 		}
diff --git a/tcp_internal.h b/tcp_internal.h
index 6f5e054..36c6533 100644
--- a/tcp_internal.h
+++ b/tcp_internal.h
@@ -177,5 +177,6 @@ int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_tap_conn *conn,
 int tcp_prepare_flags(const struct ctx *c, struct tcp_tap_conn *conn,
 		      int flags, struct tcphdr *th, struct tcp_syn_opts *opts,
 		      size_t *optlen);
+int tcp_set_peek_offset(const struct tcp_tap_conn *conn, int offset);
 
 #endif /* TCP_INTERNAL_H */
diff --git a/tcp_vu.c b/tcp_vu.c
index 6891ed1..57587cc 100644
--- a/tcp_vu.c
+++ b/tcp_vu.c
@@ -376,7 +376,7 @@ int tcp_vu_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn)
 			   conn->seq_ack_from_tap, conn->seq_to_tap);
 		conn->seq_to_tap = conn->seq_ack_from_tap;
 		already_sent = 0;
-		if (tcp_set_peek_offset(conn->sock, 0)) {
+		if (tcp_set_peek_offset(conn, 0)) {
 			tcp_rst(c, conn);
 			return -1;
 		}

From 51f3c071a76bd20677e72b49007b822dca71e755 Mon Sep 17 00:00:00 2001
From: Stefano Brivio <sbrivio@redhat.com>
Date: Tue, 18 Mar 2025 17:18:47 +0100
Subject: [PATCH 292/382] passt-repair: Fix build with -Werror=format-security

Fixes: 04701702471e ("passt-repair: Add directory watch")
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 passt-repair.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/passt-repair.c b/passt-repair.c
index 8bb3f00..120f7aa 100644
--- a/passt-repair.c
+++ b/passt-repair.c
@@ -150,7 +150,7 @@ int main(int argc, char **argv)
 			_exit(1);
 		}
 
-		ret = snprintf(a.sun_path, sizeof(a.sun_path), path);
+		ret = snprintf(a.sun_path, sizeof(a.sun_path), "%s", path);
 		inotify_dir = true;
 	} else {
 		ret = snprintf(a.sun_path, sizeof(a.sun_path), "%s", argv[1]);

From 28772ee91a60b34786023496ea17c2c2f4e5f7f5 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Wed, 19 Mar 2025 16:14:21 +1100
Subject: [PATCH 293/382] migrate, tcp: More careful marshalling of mss
 parameter during migration

During migration we extract the limit on segment size using TCP_MAXSEG,
and set it on the other side with TCP_REPAIR_OPTIONS.  However, unlike most
32-bit values we transfer, we send this one in native endian rather than
network endian.  This is not correct; add it to the list of endian fixups we make.

In addition, while MAXSEG will be 32-bits in practice, and is given as such
to TCP_REPAIR_OPTIONS, the TCP_MAXSEG sockopt treats it as an 'int'.  It's
not strictly safe to pass a uint32_t to a getsockopt() expecting an int,
although we'll get away with it on most (maybe all) platforms.  Correct
this as well.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
[sbrivio: Minor coding style fix]
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 tcp.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/tcp.c b/tcp.c
index a4c840e..43ee76b 100644
--- a/tcp.c
+++ b/tcp.c
@@ -2848,13 +2848,16 @@ static int tcp_flow_dump_mss(const struct tcp_tap_conn *conn,
 			     struct tcp_tap_transfer_ext *t)
 {
 	socklen_t sl = sizeof(t->mss);
+	int val;
 
-	if (getsockopt(conn->sock, SOL_TCP, TCP_MAXSEG, &t->mss, &sl)) {
+	if (getsockopt(conn->sock, SOL_TCP, TCP_MAXSEG, &val, &sl)) {
 		int rc = -errno;
 		flow_perror(conn, "Getting MSS");
 		return rc;
 	}
 
+	t->mss = (uint32_t)val;
+
 	return 0;
 }
 
@@ -3301,6 +3304,7 @@ int tcp_flow_migrate_source_ext(int fd, const struct tcp_tap_conn *conn)
 	t->sndq		= htonl(t->sndq);
 	t->notsent	= htonl(t->notsent);
 	t->rcvq		= htonl(t->rcvq);
+	t->mss		= htonl(t->mss);
 
 	t->snd_wl1	= htonl(t->snd_wl1);
 	t->snd_wnd	= htonl(t->snd_wnd);
@@ -3514,6 +3518,7 @@ int tcp_flow_migrate_target_ext(struct ctx *c, struct tcp_tap_conn *conn, int fd
 	t.sndq		= ntohl(t.sndq);
 	t.notsent	= ntohl(t.notsent);
 	t.rcvq		= ntohl(t.rcvq);
+	t.mss		= ntohl(t.mss);
 
 	t.snd_wl1	= ntohl(t.snd_wl1);
 	t.snd_wnd	= ntohl(t.snd_wnd);

From cfb3740568ab291d7be00e457658c45ce9367ed5 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Wed, 19 Mar 2025 16:14:22 +1100
Subject: [PATCH 294/382] migrate, tcp: Migrate RFC 7323 timestamp

Currently our migration of the state of TCP sockets omits the RFC 7323
timestamp.  In some circumstances that can result in data sent from the
target machine not being received, because it is discarded on the peer due
to PAWS checking.

Add code to dump and restore the timestamp across migration.

Link: https://bugs.passt.top/show_bug.cgi?id=115
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
[sbrivio: Minor style fixes]
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 tcp.c      | 59 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 tcp_conn.h |  2 ++
 2 files changed, 61 insertions(+)

diff --git a/tcp.c b/tcp.c
index 43ee76b..68af43d 100644
--- a/tcp.c
+++ b/tcp.c
@@ -2861,6 +2861,57 @@ static int tcp_flow_dump_mss(const struct tcp_tap_conn *conn,
 	return 0;
 }
 
+
+/**
+ * tcp_flow_dump_timestamp() - Dump RFC 7323 timestamp via TCP_TIMESTAMP
+ * @conn:	Pointer to the TCP connection structure
+ * @t:		Extended migration data (tcpi_options must be populated)
+ *
+ * Return: 0 on success, negative error code on failure
+ */
+static int tcp_flow_dump_timestamp(const struct tcp_tap_conn *conn,
+				   struct tcp_tap_transfer_ext *t)
+{
+	int val = 0;
+
+	if (t->tcpi_options & TCPI_OPT_TIMESTAMPS) {
+		socklen_t sl = sizeof(val);
+
+		if (getsockopt(conn->sock, SOL_TCP, TCP_TIMESTAMP, &val, &sl)) {
+			int rc = -errno;
+			flow_perror(conn, "Getting RFC 7323 timestamp");
+			return rc;
+		}
+	}
+
+	t->timestamp = (uint32_t)val;
+	return 0;
+}
+
+/**
+ * tcp_flow_repair_timestamp() - Restore RFC 7323 timestamp via TCP_TIMESTAMP
+ * @conn:	Pointer to the TCP connection structure
+ * @t:		Extended migration data
+ *
+ * Return: 0 on success, negative error code on failure
+ */
+static int tcp_flow_repair_timestamp(const struct tcp_tap_conn *conn,
+				   const struct tcp_tap_transfer_ext *t)
+{
+	int val = (int)t->timestamp;
+
+	if (t->tcpi_options & TCPI_OPT_TIMESTAMPS) {
+		if (setsockopt(conn->sock, SOL_TCP, TCP_TIMESTAMP,
+			       &val, sizeof(val))) {
+			int rc = -errno;
+			flow_perror(conn, "Setting RFC 7323 timestamp");
+			return rc;
+		}
+	}
+
+	return 0;
+}
+
 /**
  * tcp_flow_dump_wnd() - Dump current tcp_repair_window parameters
  * @conn:	Pointer to the TCP connection structure
@@ -3260,6 +3311,9 @@ int tcp_flow_migrate_source_ext(int fd, const struct tcp_tap_conn *conn)
 	if ((rc = tcp_flow_dump_mss(conn, t)))
 		goto fail;
 
+	if ((rc = tcp_flow_dump_timestamp(conn, t)))
+		goto fail;
+
 	if ((rc = tcp_flow_dump_wnd(conn, t)))
 		goto fail;
 
@@ -3305,6 +3359,7 @@ int tcp_flow_migrate_source_ext(int fd, const struct tcp_tap_conn *conn)
 	t->notsent	= htonl(t->notsent);
 	t->rcvq		= htonl(t->rcvq);
 	t->mss		= htonl(t->mss);
+	t->timestamp	= htonl(t->timestamp);
 
 	t->snd_wl1	= htonl(t->snd_wl1);
 	t->snd_wnd	= htonl(t->snd_wnd);
@@ -3519,6 +3574,7 @@ int tcp_flow_migrate_target_ext(struct ctx *c, struct tcp_tap_conn *conn, int fd
 	t.notsent	= ntohl(t.notsent);
 	t.rcvq		= ntohl(t.rcvq);
 	t.mss		= ntohl(t.mss);
+	t.timestamp	= ntohl(t.timestamp);
 
 	t.snd_wl1	= ntohl(t.snd_wl1);
 	t.snd_wnd	= ntohl(t.snd_wnd);
@@ -3561,6 +3617,9 @@ int tcp_flow_migrate_target_ext(struct ctx *c, struct tcp_tap_conn *conn, int fd
 		/* We weren't able to create the socket, discard flow */
 		goto fail;
 
+	if (tcp_flow_repair_timestamp(conn, &t))
+		goto fail;
+
 	if (tcp_flow_select_queue(conn, TCP_SEND_QUEUE))
 		goto fail;
 
diff --git a/tcp_conn.h b/tcp_conn.h
index 9126a36..35d813d 100644
--- a/tcp_conn.h
+++ b/tcp_conn.h
@@ -152,6 +152,7 @@ struct tcp_tap_transfer {
  * @notsent:		Part of pending send queue that wasn't sent out yet
  * @rcvq:		Length of pending receive queue
  * @mss:		Socket-side MSS clamp
+ * @timestamp:		RFC 7323 timestamp
  * @snd_wl1:		Next sequence used in window probe (next sequence - 1)
  * @snd_wnd:		Socket-side sending window
  * @max_window:		Window clamp
@@ -171,6 +172,7 @@ struct tcp_tap_transfer_ext {
 	uint32_t	rcvq;
 
 	uint32_t	mss;
+	uint32_t	timestamp;
 
 	/* We can't just use struct tcp_repair_window: we need network order */
 	uint32_t	snd_wl1;

From c250ffc5c11385d9618b3a8165e676d68d5cbfa2 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Wed, 19 Mar 2025 16:14:23 +1100
Subject: [PATCH 295/382] migrate: Bump migration version number

v1 of the migration stream format had some flaws: it didn't properly
handle endianness of the MSS field, and it didn't transfer the RFC 7323
timestamp.  We've now fixed those bugs, but doing so requires incompatible
changes to the stream format.

Because of the timestamps in particular, v1 is not really usable, so there
is little point maintaining compatible support for it.  However, v1 is in
released packages, both upstream and downstream (RHEL at least).  Just
updating the stream format without bumping the version would lead to very
cryptic errors if anyone did attempt to migrate between an old and new
passt.

So, bump the migration version to v2, so we'll get a clear error message if
anyone attempts this.  We don't attempt to maintain backwards compatibility
with v1, however: we'll simply fail if given a v1 stream.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 migrate.c | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/migrate.c b/migrate.c
index 0fca77b..48d63a0 100644
--- a/migrate.c
+++ b/migrate.c
@@ -96,8 +96,8 @@ static int seen_addrs_target_v1(struct ctx *c,
 	return 0;
 }
 
-/* Stages for version 1 */
-static const struct migrate_stage stages_v1[] = {
+/* Stages for version 2 */
+static const struct migrate_stage stages_v2[] = {
 	{
 		.name = "observed addresses",
 		.source = seen_addrs_source_v1,
@@ -118,7 +118,11 @@ static const struct migrate_stage stages_v1[] = {
 
 /* Supported encoding versions, from latest (most preferred) to oldest */
 static const struct migrate_version versions[] = {
-	{ 1,	stages_v1, },
+	{ 2,	stages_v2, },
+	/* v1 was released, but not widely used.  It had bad endianness for the
+	 * MSS and omitted timestamps, which meant it usually wouldn't work.
+	 * Therefore we don't attempt to support compatibility with it.
+	 */
 	{ 0 },
 };
 

From ebdd46367ce1acba235013d97e362b8677b538d5 Mon Sep 17 00:00:00 2001
From: Stefano Brivio <sbrivio@redhat.com>
Date: Wed, 19 Mar 2025 17:57:45 +0100
Subject: [PATCH 296/382] tcp: Flush socket before checking for more data in
 active close state
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Otherwise, if all the pending data is acknowledged:

- tcp_update_seqack_from_tap() updates the current tap-side ACK
  sequence (conn->seq_ack_from_tap)

- next, we compare the sequence we sent (conn->seq_to_tap) to the
  ACK sequence (conn->seq_ack_from_tap) in tcp_data_from_sock() to
  understand if there's more data we can send.

  If they match, we conclude that we haven't sent any of that data,
  and keep re-sending it.

We need, instead, to flush the socket (drop acknowledged data) before
calling tcp_update_seqack_from_tap(), so that once we update
conn->seq_ack_from_tap, we can be sure that all the data up to that
sequence is gone from the socket.
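
A minimal sketch of why (simplified from the buffer path, not the literal
code): the socket-to-tap handlers roughly derive the amount of data still
in flight as

	already_sent = conn->seq_to_tap - conn->seq_ack_from_tap;

so if conn->seq_ack_from_tap is advanced all the way to conn->seq_to_tap
while the acknowledged bytes are still sitting in the socket, already_sent
becomes 0 and the same bytes are peeked and sent to the tap again.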

Link: https://bugs.passt.top/show_bug.cgi?id=114
Reported-by: Marek Marczykowski-Górecki <marmarek@invisiblethingslab.com>
Fixes: 30f1e082c3c0 ("tcp: Keep updating window and checking for socket data after FIN from guest")
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
---
 tcp.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tcp.c b/tcp.c
index 68af43d..fa1d885 100644
--- a/tcp.c
+++ b/tcp.c
@@ -2049,6 +2049,7 @@ int tcp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af,
 
 	/* Established connections not accepting data from tap */
 	if (conn->events & TAP_FIN_RCVD) {
+		tcp_sock_consume(conn, ntohl(th->ack_seq));
 		tcp_update_seqack_from_tap(c, conn, ntohl(th->ack_seq));
 		tcp_tap_window_update(conn, ntohs(th->window));
 		tcp_data_from_sock(c, conn);

From 07c2d584b334b0c405a5702a4f2fad104d03940b Mon Sep 17 00:00:00 2001
From: Stefano Brivio <sbrivio@redhat.com>
Date: Wed, 19 Mar 2025 20:43:47 +0100
Subject: [PATCH 297/382] conf: Include libgen.h for basename(), fix build
 against musl

Fixes: 4b17d042c7e4 ("conf: Move mode detection into helper function")
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
---
 conf.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/conf.c b/conf.c
index 0e2e8dc..b54c55d 100644
--- a/conf.c
+++ b/conf.c
@@ -16,6 +16,7 @@
 #include <errno.h>
 #include <fcntl.h>
 #include <getopt.h>
+#include <libgen.h>
 #include <string.h>
 #include <sched.h>
 #include <sys/types.h>

From 32f6212551c5db3b7b3548e8483e5d73f07a35ac Mon Sep 17 00:00:00 2001
From: Stefano Brivio <sbrivio@redhat.com>
Date: Wed, 19 Mar 2025 20:45:12 +0100
Subject: [PATCH 298/382] Makefile: Enable -Wformat-security

It looks like an easy win to prevent a number of possible security
flaws.

Suggested-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
---
 Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Makefile b/Makefile
index f2ac8e5..31cbac3 100644
--- a/Makefile
+++ b/Makefile
@@ -29,7 +29,7 @@ ifeq ($(shell $(CC) -O2 -dM -E - < /dev/null 2>&1 | grep ' _FORTIFY_SOURCE ' > /
 FORTIFY_FLAG := -D_FORTIFY_SOURCE=2
 endif
 
-FLAGS := -Wall -Wextra -Wno-format-zero-length
+FLAGS := -Wall -Wextra -Wno-format-zero-length -Wformat-security
 FLAGS += -pedantic -std=c11 -D_XOPEN_SOURCE=700 -D_GNU_SOURCE
 FLAGS +=  $(FORTIFY_FLAG) -O2 -pie -fPIE
 FLAGS += -DPAGE_SIZE=$(shell getconf PAGE_SIZE)

From 4592719a744bcb47db2ff5680be4b8f6362a97ce Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Mon, 17 Mar 2025 20:24:14 +1100
Subject: [PATCH 299/382] vu_common: Tighten vu_packet_check_range()

This function verifies that the given packet is within the mmap()ed memory
region of the vhost-user device.  We can do better, however.  The packet
should be not only within the mmap()ed range, but specifically in the
subsection of that range set aside for shared buffers, which starts at
dev_region->mmap_offset within that region.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 vu_common.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/vu_common.c b/vu_common.c
index 686a09b..9eea4f2 100644
--- a/vu_common.c
+++ b/vu_common.c
@@ -37,10 +37,10 @@ int vu_packet_check_range(void *buf, const char *ptr, size_t len)
 
 	for (dev_region = buf; dev_region->mmap_addr; dev_region++) {
 		/* NOLINTNEXTLINE(performance-no-int-to-ptr) */
-		char *m = (char *)(uintptr_t)dev_region->mmap_addr;
+		char *m = (char *)(uintptr_t)dev_region->mmap_addr +
+			dev_region->mmap_offset;
 
-		if (m <= ptr &&
-		    ptr + len <= m + dev_region->mmap_offset + dev_region->size)
+		if (m <= ptr && ptr + len <= m + dev_region->size)
 			return 0;
 	}
 

From e43e00719d7701301e4bc4fb179dc7adff175409 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Mon, 17 Mar 2025 20:24:15 +1100
Subject: [PATCH 300/382] packet: More cautious checks to avoid pointer
 arithmetic UB

packet_check_range() and vu_packet_check_range() verify that the packet or
section of packet we're interested in lies in the packet buffer pool we
expect it to.  However, in doing so they don't avoid the possibility of
an integer overflow while performing pointer arithmetic, which is UB.  In
fact, AFAICT it's UB even to use arbitrary pointer arithmetic to construct
a pointer outside of a known valid buffer.

To do this safely, we can't calculate the end of a memory region with
pointer addition when the length is untrusted.  Instead we must work
out the offset of one memory region within another using pointer
subtraction, then do integer checks against the length of the outer region.
We then need to be careful about the order of checks so that those integer
checks can't themselves overflow.
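
A minimal sketch of the resulting pattern, assuming a region starting at
buf with length buf_size and an untrusted ptr/len pair:

	/* Sketch only: the check order matters so nothing can wrap */
	if (ptr < buf)
		return -1;			/* starts before the buffer */
	if (len > buf_size)
		return -1;			/* longer than the whole buffer */
	if ((size_t)(ptr - buf) > buf_size - len)
		return -1;			/* would run past the end */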

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 packet.c    | 12 +++++++++---
 vu_common.c | 10 +++++++---
 2 files changed, 16 insertions(+), 6 deletions(-)

diff --git a/packet.c b/packet.c
index bcac037..d1a51a5 100644
--- a/packet.c
+++ b/packet.c
@@ -52,9 +52,15 @@ static int packet_check_range(const struct pool *p, const char *ptr, size_t len,
 		return -1;
 	}
 
-	if (ptr + len > p->buf + p->buf_size) {
-		trace("packet range end %p after buffer end %p, %s:%i",
-		      (void *)(ptr + len), (void *)(p->buf + p->buf_size),
+	if (len > p->buf_size) {
+		trace("packet range length %zu larger than buffer %zu, %s:%i",
+		      len, p->buf_size, func, line);
+		return -1;
+	}
+
+	if ((size_t)(ptr - p->buf) > p->buf_size - len) {
+		trace("packet range %p, len %zu after buffer end %p, %s:%i",
+		      (void *)ptr, len, (void *)(p->buf + p->buf_size),
 		      func, line);
 		return -1;
 	}
diff --git a/vu_common.c b/vu_common.c
index 9eea4f2..cefe5e2 100644
--- a/vu_common.c
+++ b/vu_common.c
@@ -36,11 +36,15 @@ int vu_packet_check_range(void *buf, const char *ptr, size_t len)
 	struct vu_dev_region *dev_region;
 
 	for (dev_region = buf; dev_region->mmap_addr; dev_region++) {
-		/* NOLINTNEXTLINE(performance-no-int-to-ptr) */
-		char *m = (char *)(uintptr_t)dev_region->mmap_addr +
+		uintptr_t base_addr = dev_region->mmap_addr +
 			dev_region->mmap_offset;
+		/* NOLINTNEXTLINE(performance-no-int-to-ptr) */
+		const char *base = (const char *)base_addr;
 
-		if (m <= ptr && ptr + len <= m + dev_region->size)
+		ASSERT(base_addr >= dev_region->mmap_addr);
+
+		if (len <= dev_region->size && base <= ptr &&
+		    (size_t)(ptr - base) <= dev_region->size - len)
 			return 0;
 	}
 

From a41d6d125eca5ac8c54bed8157098be141557b03 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Mon, 17 Mar 2025 20:24:16 +1100
Subject: [PATCH 301/382] tap: Make size of pool_tap[46] purely a tuning
 parameter

Currently we attempt to size pool_tap[46] so they have room for the maximum
possible number of packets that could fit in pkt_buf (TAP_MSGS).  However,
the calculation isn't quite correct: TAP_MSGS is based on ETH_ZLEN (60) as
the minimum possible L2 frame size.  But ETH_ZLEN is based on physical
constraints of Ethernet, which don't apply to our virtual devices.  It is
possible to generate a legitimate frame smaller than this, for example an
empty payload UDP/IPv4 frame on the 'pasta' backend is only 42 bytes long.

Furthermore, the same limit applies for vhost-user, which is not limited
by the size of pkt_buf like the other backends.  In that case we don't even
have full control of the maximum buffer size, so we can't really calculate
how many packets could fit in there.

If we do exceed TAP_MSGS we'll drop packets, not just use more batches,
which is moderately bad.  The fact that this needs to be sized just so for
correctness, not merely for tuning, is a fairly non-obvious coupling between
different parts of the code.

To make this more robust, alter the tap code so it doesn't rely on
everything fitting in a single batch of TAP_MSGS packets, instead breaking
into multiple batches as necessary.  This leaves TAP_MSGS as purely a
tuning parameter, which we can freely adjust based on performance measures.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 packet.c    | 13 ++++++++++++-
 packet.h    |  3 +++
 passt.h     |  2 --
 tap.c       | 19 ++++++++++++++++---
 tap.h       |  3 ++-
 vu_common.c |  5 +++--
 6 files changed, 36 insertions(+), 9 deletions(-)

diff --git a/packet.c b/packet.c
index d1a51a5..08076d5 100644
--- a/packet.c
+++ b/packet.c
@@ -67,6 +67,17 @@ static int packet_check_range(const struct pool *p, const char *ptr, size_t len,
 
 	return 0;
 }
+/**
+ * pool_full() - Is a packet pool full?
+ * @p:		Pointer to packet pool
+ *
+ * Return: true if the pool is full, false if more packets can be added
+ */
+bool pool_full(const struct pool *p)
+{
+	return p->count >= p->size;
+}
+
 /**
  * packet_add_do() - Add data as packet descriptor to given pool
  * @p:		Existing pool
@@ -80,7 +91,7 @@ void packet_add_do(struct pool *p, size_t len, const char *start,
 {
 	size_t idx = p->count;
 
-	if (idx >= p->size) {
+	if (pool_full(p)) {
 		trace("add packet index %zu to pool with size %zu, %s:%i",
 		      idx, p->size, func, line);
 		return;
diff --git a/packet.h b/packet.h
index d099f02..dd18461 100644
--- a/packet.h
+++ b/packet.h
@@ -6,6 +6,8 @@
 #ifndef PACKET_H
 #define PACKET_H
 
+#include <stdbool.h>
+
 /* Maximum size of a single packet stored in pool, including headers */
 #define PACKET_MAX_LEN	UINT16_MAX
 
@@ -33,6 +35,7 @@ void packet_add_do(struct pool *p, size_t len, const char *start,
 void *packet_get_do(const struct pool *p, const size_t idx,
 		    size_t offset, size_t len, size_t *left,
 		    const char *func, int line);
+bool pool_full(const struct pool *p);
 void pool_flush(struct pool *p);
 
 #define packet_add(p, len, start)					\
diff --git a/passt.h b/passt.h
index 8f45091..8693794 100644
--- a/passt.h
+++ b/passt.h
@@ -71,8 +71,6 @@ static_assert(sizeof(union epoll_ref) <= sizeof(union epoll_data),
 
 /* Large enough for ~128 maximum size frames */
 #define PKT_BUF_BYTES		(8UL << 20)
-#define TAP_MSGS							\
-	DIV_ROUND_UP(PKT_BUF_BYTES, ETH_ZLEN - 2 * ETH_ALEN + sizeof(uint32_t))
 
 extern char pkt_buf		[PKT_BUF_BYTES];
 
diff --git a/tap.c b/tap.c
index 182a115..34e6774 100644
--- a/tap.c
+++ b/tap.c
@@ -75,6 +75,9 @@ CHECK_FRAME_LEN(L2_MAX_LEN_PASTA);
 CHECK_FRAME_LEN(L2_MAX_LEN_PASST);
 CHECK_FRAME_LEN(L2_MAX_LEN_VU);
 
+#define TAP_MSGS							\
+	DIV_ROUND_UP(sizeof(pkt_buf), ETH_ZLEN - 2 * ETH_ALEN + sizeof(uint32_t))
+
 /* IPv4 (plus ARP) and IPv6 message batches from tap/guest to IP handlers */
 static PACKET_POOL_NOINIT(pool_tap4, TAP_MSGS, pkt_buf);
 static PACKET_POOL_NOINIT(pool_tap6, TAP_MSGS, pkt_buf);
@@ -1042,8 +1045,10 @@ void tap_handler(struct ctx *c, const struct timespec *now)
  * @c:		Execution context
  * @l2len:	Total L2 packet length
  * @p:		Packet buffer
+ * @now:	Current timestamp
  */
-void tap_add_packet(struct ctx *c, ssize_t l2len, char *p)
+void tap_add_packet(struct ctx *c, ssize_t l2len, char *p,
+		    const struct timespec *now)
 {
 	const struct ethhdr *eh;
 
@@ -1059,9 +1064,17 @@ void tap_add_packet(struct ctx *c, ssize_t l2len, char *p)
 	switch (ntohs(eh->h_proto)) {
 	case ETH_P_ARP:
 	case ETH_P_IP:
+		if (pool_full(pool_tap4)) {
+			tap4_handler(c, pool_tap4, now);
+			pool_flush(pool_tap4);
+		}
 		packet_add(pool_tap4, l2len, p);
 		break;
 	case ETH_P_IPV6:
+		if (pool_full(pool_tap6)) {
+			tap6_handler(c, pool_tap6, now);
+			pool_flush(pool_tap6);
+		}
 		packet_add(pool_tap6, l2len, p);
 		break;
 	default:
@@ -1142,7 +1155,7 @@ static void tap_passt_input(struct ctx *c, const struct timespec *now)
 		p += sizeof(uint32_t);
 		n -= sizeof(uint32_t);
 
-		tap_add_packet(c, l2len, p);
+		tap_add_packet(c, l2len, p, now);
 
 		p += l2len;
 		n -= l2len;
@@ -1207,7 +1220,7 @@ static void tap_pasta_input(struct ctx *c, const struct timespec *now)
 		    len > (ssize_t)L2_MAX_LEN_PASTA)
 			continue;
 
-		tap_add_packet(c, len, pkt_buf + n);
+		tap_add_packet(c, len, pkt_buf + n, now);
 	}
 
 	tap_handler(c, now);
diff --git a/tap.h b/tap.h
index dd39fd8..6fe3d15 100644
--- a/tap.h
+++ b/tap.h
@@ -119,6 +119,7 @@ void tap_sock_update_pool(void *base, size_t size);
 void tap_backend_init(struct ctx *c);
 void tap_flush_pools(void);
 void tap_handler(struct ctx *c, const struct timespec *now);
-void tap_add_packet(struct ctx *c, ssize_t l2len, char *p);
+void tap_add_packet(struct ctx *c, ssize_t l2len, char *p,
+		    const struct timespec *now);
 
 #endif /* TAP_H */
diff --git a/vu_common.c b/vu_common.c
index cefe5e2..5e6fd4a 100644
--- a/vu_common.c
+++ b/vu_common.c
@@ -195,7 +195,7 @@ static void vu_handle_tx(struct vu_dev *vdev, int index,
 			tap_add_packet(vdev->context,
 				       elem[count].out_sg[0].iov_len - hdrlen,
 				       (char *)elem[count].out_sg[0].iov_base +
-				        hdrlen);
+				       hdrlen, now);
 		} else {
 			/* vnet header can be in a separate iovec */
 			if (elem[count].out_num != 2) {
@@ -207,7 +207,8 @@ static void vu_handle_tx(struct vu_dev *vdev, int index,
 			} else {
 				tap_add_packet(vdev->context,
 					       elem[count].out_sg[1].iov_len,
-					       (char *)elem[count].out_sg[1].iov_base);
+					       (char *)elem[count].out_sg[1].iov_base,
+					       now);
 			}
 		}
 

From 9866d146e654975dd7f5fd3f1294d5fc4628cef3 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Mon, 17 Mar 2025 20:24:17 +1100
Subject: [PATCH 302/382] tap: Clarify calculation of TAP_MSGS

The rationale behind the calculation of TAP_MSGS isn't necessarily obvious.
It's supposed to be the maximum number of packets that can fit in pkt_buf.
However, the calculation is wrong in several ways:
 * It's based on ETH_ZLEN which isn't meaningful for virtual devices
 * It always includes the qemu socket header which isn't used for pasta
 * The size of pkt_buf isn't relevant for vhost-user

We've already made sure this is just a tuning parameter, not a hard limit.
Clarify what we're calculating here and why.
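
For instance, assuming the 8 MiB pkt_buf defined in passt.h and the usual
header sizes (ETH_HLEN 14, struct iphdr 20, struct ipv6hdr 40, struct
udphdr 8), the estimates below work out to roughly:

	TAP_MSGS_IP4 = DIV_ROUND_UP(8388608, 14 + 20 + 8)	/* ~200000 */
	TAP_MSGS_IP6 = DIV_ROUND_UP(8388608, 14 + 40 + 8)	/* ~135000 */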

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 tap.c | 28 ++++++++++++++++++++++------
 1 file changed, 22 insertions(+), 6 deletions(-)

diff --git a/tap.c b/tap.c
index 34e6774..3a6fcbe 100644
--- a/tap.c
+++ b/tap.c
@@ -75,12 +75,28 @@ CHECK_FRAME_LEN(L2_MAX_LEN_PASTA);
 CHECK_FRAME_LEN(L2_MAX_LEN_PASST);
 CHECK_FRAME_LEN(L2_MAX_LEN_VU);
 
-#define TAP_MSGS							\
-	DIV_ROUND_UP(sizeof(pkt_buf), ETH_ZLEN - 2 * ETH_ALEN + sizeof(uint32_t))
+/* We try size the packet pools so that we can use a single batch for the entire
+ * packet buffer.  This might be exceeded for vhost-user, though, which uses its
+ * own buffers rather than pkt_buf.
+ *
+ * This is just a tuning parameter, the code will work with slightly more
+ * overhead if it's incorrect.  So, we estimate based on the minimum practical
+ * frame size - an empty UDP datagram - rather than the minimum theoretical
+ * frame size.
+ *
+ * FIXME: Profile to work out how big this actually needs to be to amortise
+ *        per-batch syscall overheads
+ */
+#define TAP_MSGS_IP4							\
+	DIV_ROUND_UP(sizeof(pkt_buf),					\
+		     ETH_HLEN + sizeof(struct iphdr) + sizeof(struct udphdr))
+#define TAP_MSGS_IP6							\
+	DIV_ROUND_UP(sizeof(pkt_buf),					\
+		     ETH_HLEN + sizeof(struct ipv6hdr) + sizeof(struct udphdr))
 
 /* IPv4 (plus ARP) and IPv6 message batches from tap/guest to IP handlers */
-static PACKET_POOL_NOINIT(pool_tap4, TAP_MSGS, pkt_buf);
-static PACKET_POOL_NOINIT(pool_tap6, TAP_MSGS, pkt_buf);
+static PACKET_POOL_NOINIT(pool_tap4, TAP_MSGS_IP4, pkt_buf);
+static PACKET_POOL_NOINIT(pool_tap6, TAP_MSGS_IP6, pkt_buf);
 
 #define TAP_SEQS		128 /* Different L4 tuples in one batch */
 #define FRAGMENT_MSG_RATE	10  /* # seconds between fragment warnings */
@@ -1418,8 +1434,8 @@ void tap_sock_update_pool(void *base, size_t size)
 {
 	int i;
 
-	pool_tap4_storage = PACKET_INIT(pool_tap4, TAP_MSGS, base, size);
-	pool_tap6_storage = PACKET_INIT(pool_tap6, TAP_MSGS, base, size);
+	pool_tap4_storage = PACKET_INIT(pool_tap4, TAP_MSGS_IP4, base, size);
+	pool_tap6_storage = PACKET_INIT(pool_tap6, TAP_MSGS_IP6, base, size);
 
 	for (i = 0; i < TAP_SEQS; i++) {
 		tap4_l4[i].p = PACKET_INIT(pool_l4, UIO_MAXIOV, base, size);

From c48331ca51399fe1779529511be395b576aaf0af Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Mon, 17 Mar 2025 20:24:18 +1100
Subject: [PATCH 303/382] packet: Correct type of PACKET_MAX_LEN

PACKET_MAX_LEN is usually involved in calculations on size_t values - the
type of the iov_len field in struct iovec.  However, being defined bare as
UINT16_MAX, the compiler is likely to assign it a shorter type.  This can
lead to unexpected promotions (or lack thereof).  Add a cast to force the
type to be what we expect.
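
One concrete spot, assuming the range checks added later in this series:
PACKET_MAX_LEN ends up as a %zu argument in a trace/debug call, which
expects a size_t through varargs:

	/* With a bare UINT16_MAX this would pass an int where %zu expects
	 * a size_t; the explicit cast makes the argument type match.
	 */
	debug("packet range length %zu (max %zu), %s:%i",
	      len, PACKET_MAX_LEN, func, line);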

Fixes: c43972ad6 ("packet: Give explicit name to maximum packet size")
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 packet.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/packet.h b/packet.h
index dd18461..9061dad 100644
--- a/packet.h
+++ b/packet.h
@@ -9,7 +9,7 @@
 #include <stdbool.h>
 
 /* Maximum size of a single packet stored in pool, including headers */
-#define PACKET_MAX_LEN	UINT16_MAX
+#define PACKET_MAX_LEN	((size_t)UINT16_MAX)
 
 /**
  * struct pool - Generic pool of packets stored in a buffer

From 37d9f374d9f0c47c092f80a5d85d4505ae4a9af7 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Mon, 17 Mar 2025 20:24:19 +1100
Subject: [PATCH 304/382] packet: Avoid integer overflows in packet_get_do()

In packet_get_do() both offset and len are essentially untrusted.  We do
some validation of len (check it's < PACKET_MAX_LEN), but that's not enough
to ensure that (len + offset) doesn't overflow.  Rearrange our calculation
to make sure it's safe regardless of the given offset & len values.
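
For instance, an offset close to SIZE_MAX makes (len + offset) wrap around
to a small value, slipping past a naive (len + offset > iov_len) test.  A
sketch of the overflow-safe ordering:

	/* Check the offset first, then len against what's left after it */
	if (offset > p->pkt[idx].iov_len ||
	    len > p->pkt[idx].iov_len - offset)
		return NULL;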

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 packet.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/packet.c b/packet.c
index 08076d5..fdc4be7 100644
--- a/packet.c
+++ b/packet.c
@@ -144,7 +144,8 @@ void *packet_get_do(const struct pool *p, size_t idx, size_t offset,
 		return NULL;
 	}
 
-	if (len + offset > p->pkt[idx].iov_len) {
+	if (offset > p->pkt[idx].iov_len ||
+	    len > (p->pkt[idx].iov_len - offset)) {
 		if (func) {
 			trace("data length %zu, offset %zu from length %zu, "
 			      "%s:%i", len, offset, p->pkt[idx].iov_len,

From 961aa6a0eb7fce956a34f8ccd883bfe12392d3d3 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Mon, 17 Mar 2025 20:24:20 +1100
Subject: [PATCH 305/382] packet: Move checks against PACKET_MAX_LEN to
 packet_check_range()

Both the callers of packet_check_range() separately verify that the given
length does not exceed PACKET_MAX_LEN.  Fold that check into
packet_check_range() instead.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 packet.c | 19 ++++++-------------
 1 file changed, 6 insertions(+), 13 deletions(-)

diff --git a/packet.c b/packet.c
index fdc4be7..7cbe95d 100644
--- a/packet.c
+++ b/packet.c
@@ -35,6 +35,12 @@
 static int packet_check_range(const struct pool *p, const char *ptr, size_t len,
 			      const char *func, int line)
 {
+	if (len > PACKET_MAX_LEN) {
+		trace("packet range length %zu (max %zu), %s:%i",
+		      len, PACKET_MAX_LEN, func, line);
+		return -1;
+	}
+
 	if (p->buf_size == 0) {
 		int ret;
 
@@ -100,11 +106,6 @@ void packet_add_do(struct pool *p, size_t len, const char *start,
 	if (packet_check_range(p, start, len, func, line))
 		return;
 
-	if (len > PACKET_MAX_LEN) {
-		trace("add packet length %zu, %s:%i", len, func, line);
-		return;
-	}
-
 	p->pkt[idx].iov_base = (void *)start;
 	p->pkt[idx].iov_len = len;
 
@@ -136,14 +137,6 @@ void *packet_get_do(const struct pool *p, size_t idx, size_t offset,
 		return NULL;
 	}
 
-	if (len > PACKET_MAX_LEN) {
-		if (func) {
-			trace("packet data length %zu, %s:%i",
-			      len, func, line);
-		}
-		return NULL;
-	}
-
 	if (offset > p->pkt[idx].iov_len ||
 	    len > (p->pkt[idx].iov_len - offset)) {
 		if (func) {

From 38bcce997763f2e0c4bb6c0a3926674317796544 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Mon, 17 Mar 2025 20:24:21 +1100
Subject: [PATCH 306/382] packet: Rework packet_get() versus packet_get_try()

Most failures of packet_get() indicate a serious problem, and log messages
accordingly.  However, a few callers expect failures here, because they're
probing for a certain range which might or might not be in a packet.  They
use packet_get_try() which passes a NULL func to packet_get_do() to
suppress the logging which is unwanted in this case.

However, this doesn't just suppress the log when packet_get_do() finds the
requested region isn't in the packet.  It suppresses logging for all other
errors too, which do indicate serious problems, even for the callers of
packet_get_try().  Worse, it will pass the NULL func on to
packet_check_range() which doesn't expect it, meaning we'll get unhelpful
messages from there if there is a failure.

Fix this by making packet_get_try_do() the primary function which doesn't
log for the case of a range outside the packet.  packet_get_do() becomes a
trivial wrapper around that which logs a message if packet_get_try_do()
returns NULL.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 packet.c | 51 +++++++++++++++++++++++++++++++++++----------------
 packet.h |  8 +++++---
 2 files changed, 40 insertions(+), 19 deletions(-)

diff --git a/packet.c b/packet.c
index 7cbe95d..b3e8c79 100644
--- a/packet.c
+++ b/packet.c
@@ -89,7 +89,7 @@ bool pool_full(const struct pool *p)
  * @p:		Existing pool
  * @len:	Length of new descriptor
  * @start:	Start of data
- * @func:	For tracing: name of calling function, NULL means no trace()
+ * @func:	For tracing: name of calling function
  * @line:	For tracing: caller line of function call
  */
 void packet_add_do(struct pool *p, size_t len, const char *start,
@@ -113,39 +113,31 @@ void packet_add_do(struct pool *p, size_t len, const char *start,
 }
 
 /**
- * packet_get_do() - Get data range from packet descriptor from given pool
+ * packet_get_try_do() - Get data range from packet descriptor from given pool
  * @p:		Packet pool
  * @idx:	Index of packet descriptor in pool
  * @offset:	Offset of data range in packet descriptor
  * @len:	Length of desired data range
  * @left:	Length of available data after range, set on return, can be NULL
- * @func:	For tracing: name of calling function, NULL means no trace()
+ * @func:	For tracing: name of calling function
  * @line:	For tracing: caller line of function call
  *
  * Return: pointer to start of data range, NULL on invalid range or descriptor
  */
-void *packet_get_do(const struct pool *p, size_t idx, size_t offset,
-		    size_t len, size_t *left, const char *func, int line)
+void *packet_get_try_do(const struct pool *p, size_t idx, size_t offset,
+			size_t len, size_t *left, const char *func, int line)
 {
 	char *ptr;
 
 	if (idx >= p->size || idx >= p->count) {
-		if (func) {
-			trace("packet %zu from pool size: %zu, count: %zu, "
-			      "%s:%i", idx, p->size, p->count, func, line);
-		}
+		trace("packet %zu from pool size: %zu, count: %zu, %s:%i",
+		      idx, p->size, p->count, func, line);
 		return NULL;
 	}
 
 	if (offset > p->pkt[idx].iov_len ||
-	    len > (p->pkt[idx].iov_len - offset)) {
-		if (func) {
-			trace("data length %zu, offset %zu from length %zu, "
-			      "%s:%i", len, offset, p->pkt[idx].iov_len,
-			      func, line);
-		}
+	    len > (p->pkt[idx].iov_len - offset))
 		return NULL;
-	}
 
 	ptr = (char *)p->pkt[idx].iov_base + offset;
 
@@ -158,6 +150,33 @@ void *packet_get_do(const struct pool *p, size_t idx, size_t offset,
 	return ptr;
 }
 
+/**
+ * packet_get_do() - Get data range from packet descriptor from given pool
+ * @p:		Packet pool
+ * @idx:	Index of packet descriptor in pool
+ * @offset:	Offset of data range in packet descriptor
+ * @len:	Length of desired data range
+ * @left:	Length of available data after range, set on return, can be NULL
+ * @func:	For tracing: name of calling function
+ * @line:	For tracing: caller line of function call
+ *
+ * Return: as packet_get_try_do() but log a trace message when returning NULL
+ */
+void *packet_get_do(const struct pool *p, const size_t idx,
+		    size_t offset, size_t len, size_t *left,
+		    const char *func, int line)
+{
+	void *r = packet_get_try_do(p, idx, offset, len, left, func, line);
+
+	if (!r) {
+		trace("missing packet data length %zu, offset %zu from "
+		      "length %zu, %s:%i",
+		      len, offset, p->pkt[idx].iov_len, func, line);
+	}
+
+	return r;
+}
+
 /**
  * pool_flush() - Flush a packet pool
  * @p:		Pointer to packet pool
diff --git a/packet.h b/packet.h
index 9061dad..c94780a 100644
--- a/packet.h
+++ b/packet.h
@@ -32,6 +32,9 @@ struct pool {
 int vu_packet_check_range(void *buf, const char *ptr, size_t len);
 void packet_add_do(struct pool *p, size_t len, const char *start,
 		   const char *func, int line);
+void *packet_get_try_do(const struct pool *p, const size_t idx,
+			size_t offset, size_t len, size_t *left,
+			const char *func, int line);
 void *packet_get_do(const struct pool *p, const size_t idx,
 		    size_t offset, size_t len, size_t *left,
 		    const char *func, int line);
@@ -41,12 +44,11 @@ void pool_flush(struct pool *p);
 #define packet_add(p, len, start)					\
 	packet_add_do(p, len, start, __func__, __LINE__)
 
+#define packet_get_try(p, idx, offset, len, left)			\
+	packet_get_try_do(p, idx, offset, len, left, __func__, __LINE__)
 #define packet_get(p, idx, offset, len, left)				\
 	packet_get_do(p, idx, offset, len, left, __func__, __LINE__)
 
-#define packet_get_try(p, idx, offset, len, left)			\
-	packet_get_do(p, idx, offset, len, left, NULL, 0)
-
 #define PACKET_POOL_DECL(_name, _size, _buf)				\
 struct _name ## _t {							\
 	char *buf;							\

From 9153aca15bc1150e450dd56e79bc035cc2dbf27c Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Mon, 17 Mar 2025 20:24:22 +1100
Subject: [PATCH 307/382] util: Add abort_with_msg() and ASSERT_WITH_MSG()
 helpers

We already have the ASSERT() macro which will abort() passt based on a
condition.  It always has a fixed error message based on its location and
the asserted expression.  We have some upcoming cases where we want to
customise the message when hitting an assert.

Add abort_with_msg() and ASSERT_WITH_MSG() helpers to allow this.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 util.c | 19 +++++++++++++++++++
 util.h | 25 ++++++++++---------------
 2 files changed, 29 insertions(+), 15 deletions(-)

diff --git a/util.c b/util.c
index 656e86a..b9a3d43 100644
--- a/util.c
+++ b/util.c
@@ -1017,3 +1017,22 @@ void encode_domain_name(char *buf, const char *domain_name)
 	}
 	p[i] = 0L;
 }
+
+/**
+ * abort_with_msg() - Print error message and abort
+ * @fmt:	Format string
+ * @...:	Format parameters
+ */
+void abort_with_msg(const char *fmt, ...)
+{
+	va_list ap;
+
+	va_start(ap, fmt);
+	vlogmsg(true, false, LOG_CRIT, fmt, ap);
+	va_end(ap);
+
+	/* This may actually cause a SIGSYS instead of SIGABRT, due to seccomp,
+	 * but that will still get the job done.
+	 */
+	abort();
+}
diff --git a/util.h b/util.h
index 4d512fa..b1e7e79 100644
--- a/util.h
+++ b/util.h
@@ -61,27 +61,22 @@
 #define STRINGIFY(x)	#x
 #define STR(x)		STRINGIFY(x)
 
-#ifdef CPPCHECK_6936
+void abort_with_msg(const char *fmt, ...)
+	__attribute__((format(printf, 1, 2), noreturn));
+
 /* Some cppcheck versions get confused by aborts inside a loop, causing
  * it to give false positive uninitialised variable warnings later in
  * the function, because it doesn't realise the non-initialising path
  * already exited.  See https://trac.cppcheck.net/ticket/13227
+ *
+ * Therefore, avoid using the usual do while wrapper we use to force the macro
+ * to act like a single statement requiring a ';'.
  */
-#define ASSERT(expr)		\
-	((expr) ? (void)0 : abort())
-#else
+#define ASSERT_WITH_MSG(expr, ...)					\
+	((expr) ? (void)0 : abort_with_msg(__VA_ARGS__))
 #define ASSERT(expr)							\
-	do {								\
-		if (!(expr)) {						\
-			err("ASSERTION FAILED in %s (%s:%d): %s",	\
-			    __func__, __FILE__, __LINE__, STRINGIFY(expr)); \
-			/* This may actually SIGSYS, due to seccomp,	\
-			 * but that will still get the job done		\
-			 */						\
-			abort();					\
-		}							\
-	} while (0)
-#endif
+	ASSERT_WITH_MSG((expr), "ASSSERTION FAILED in %s (%s:%d): %s",	\
+			__func__, __FILE__, __LINE__, STRINGIFY(expr))
 
 #ifdef P_tmpdir
 #define TMPDIR		P_tmpdir

From 0857515c943d439eade80710c16f15f146dfa9e8 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Mon, 17 Mar 2025 20:24:23 +1100
Subject: [PATCH 308/382] packet: ASSERT on signs of pool corruption

If packet_check_range() fails in packet_get_try_do() we just return NULL.
But this check only takes place after we've already validated the given
range against the packet it's in.  That means that if packet_check_range()
fails, the packet pool is already in a corrupted state (we should have
made strictly stronger checks when the packet was added).  Simply returning
NULL and logging a trace() level message isn't really adequate for that
situation; ASSERT instead.

Similarly we check the given idx against both p->count and p->size.  The
latter should be redundant, because count should always be <= size.  If
that's not the case then, again, the pool is already in a corrupted state
and we may have overwritten unknown memory.  Assert for this case too.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 packet.c | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/packet.c b/packet.c
index b3e8c79..be28f27 100644
--- a/packet.c
+++ b/packet.c
@@ -129,9 +129,13 @@ void *packet_get_try_do(const struct pool *p, size_t idx, size_t offset,
 {
 	char *ptr;
 
-	if (idx >= p->size || idx >= p->count) {
-		trace("packet %zu from pool size: %zu, count: %zu, %s:%i",
-		      idx, p->size, p->count, func, line);
+	ASSERT_WITH_MSG(p->count <= p->size,
+			"Corrupt pool count: %zu, size: %zu, %s:%i",
+			p->count, p->size, func, line);
+
+	if (idx >= p->count) {
+		trace("packet %zu from pool count: %zu, %s:%i",
+		      idx, p->count, func, line);
 		return NULL;
 	}
 
@@ -141,8 +145,8 @@ void *packet_get_try_do(const struct pool *p, size_t idx, size_t offset,
 
 	ptr = (char *)p->pkt[idx].iov_base + offset;
 
-	if (packet_check_range(p, ptr, len, func, line))
-		return NULL;
+	ASSERT_WITH_MSG(!packet_check_range(p, ptr, len, func, line),
+			"Corrupt packet pool, %s:%i", func, line);
 
 	if (left)
 		*left = p->pkt[idx].iov_len - offset - len;

From cf4d3f05c9263d1b0a88dbbcf9e48d34cac6708e Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Mon, 17 Mar 2025 20:24:24 +1100
Subject: [PATCH 309/382] packet: Upgrade severity of most packet errors

All errors from packet_range_check(), packet_add() and packet_get() are
trace level.  However, these are for the most part actual error conditions.
They're states that should not happen, in many cases indicating a bug
in the caller or elsewhere.

We don't promote these to err() or ASSERT() level, for fear of a localised
bug on very specific input crashing the entire program, or flooding the
logs, but we can at least upgrade them to debug level.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 packet.c | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/packet.c b/packet.c
index be28f27..72c6158 100644
--- a/packet.c
+++ b/packet.c
@@ -36,7 +36,7 @@ static int packet_check_range(const struct pool *p, const char *ptr, size_t len,
 			      const char *func, int line)
 {
 	if (len > PACKET_MAX_LEN) {
-		trace("packet range length %zu (max %zu), %s:%i",
+		debug("packet range length %zu (max %zu), %s:%i",
 		      len, PACKET_MAX_LEN, func, line);
 		return -1;
 	}
@@ -47,25 +47,25 @@ static int packet_check_range(const struct pool *p, const char *ptr, size_t len,
 		ret = vu_packet_check_range((void *)p->buf, ptr, len);
 
 		if (ret == -1)
-			trace("cannot find region, %s:%i", func, line);
+			debug("cannot find region, %s:%i", func, line);
 
 		return ret;
 	}
 
 	if (ptr < p->buf) {
-		trace("packet range start %p before buffer start %p, %s:%i",
+		debug("packet range start %p before buffer start %p, %s:%i",
 		      (void *)ptr, (void *)p->buf, func, line);
 		return -1;
 	}
 
 	if (len > p->buf_size) {
-		trace("packet range length %zu larger than buffer %zu, %s:%i",
+		debug("packet range length %zu larger than buffer %zu, %s:%i",
 		      len, p->buf_size, func, line);
 		return -1;
 	}
 
 	if ((size_t)(ptr - p->buf) > p->buf_size - len) {
-		trace("packet range %p, len %zu after buffer end %p, %s:%i",
+		debug("packet range %p, len %zu after buffer end %p, %s:%i",
 		      (void *)ptr, len, (void *)(p->buf + p->buf_size),
 		      func, line);
 		return -1;
@@ -98,7 +98,7 @@ void packet_add_do(struct pool *p, size_t len, const char *start,
 	size_t idx = p->count;
 
 	if (pool_full(p)) {
-		trace("add packet index %zu to pool with size %zu, %s:%i",
+		debug("add packet index %zu to pool with size %zu, %s:%i",
 		      idx, p->size, func, line);
 		return;
 	}
@@ -134,7 +134,7 @@ void *packet_get_try_do(const struct pool *p, size_t idx, size_t offset,
 			p->count, p->size, func, line);
 
 	if (idx >= p->count) {
-		trace("packet %zu from pool count: %zu, %s:%i",
+		debug("packet %zu from pool count: %zu, %s:%i",
 		      idx, p->count, func, line);
 		return NULL;
 	}

From 89b203b851f32a532cc0406cf26a1d24950a207c Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Wed, 26 Mar 2025 14:44:01 +1100
Subject: [PATCH 310/382] udp: Common invocation of udp_sock_errs() for
 vhost-user and "buf" paths

The vhost-user and non-vhost-user paths for both udp_listen_sock_handler()
and udp_reply_sock_handler() are more or less completely separate.  Both,
however, start with essentially the same invocation of udp_sock_errs(), so
that can be made common.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 udp.c          | 37 ++++++++++++++++++++-----------------
 udp_internal.h |  2 +-
 udp_vu.c       | 15 ---------------
 3 files changed, 21 insertions(+), 33 deletions(-)

diff --git a/udp.c b/udp.c
index 80520cb..4a06b16 100644
--- a/udp.c
+++ b/udp.c
@@ -585,7 +585,8 @@ static int udp_sock_recverr(const struct ctx *c, union epoll_ref ref)
  *
  * Return: Number of errors handled, or < 0 if we have an unrecoverable error
  */
-int udp_sock_errs(const struct ctx *c, union epoll_ref ref, uint32_t events)
+static int udp_sock_errs(const struct ctx *c, union epoll_ref ref,
+			 uint32_t events)
 {
 	unsigned n_err = 0;
 	socklen_t errlen;
@@ -678,13 +679,6 @@ static void udp_buf_listen_sock_handler(const struct ctx *c,
 	const socklen_t sasize = sizeof(udp_meta[0].s_in);
 	int n, i;
 
-	if (udp_sock_errs(c, ref, events) < 0) {
-		err("UDP: Unrecoverable error on listening socket:"
-		    " (%s port %hu)", pif_name(ref.udp.pif), ref.udp.port);
-		/* FIXME: what now?  close/re-open socket? */
-		return;
-	}
-
 	if ((n = udp_sock_recv(c, ref.fd, events, udp_mh_recv)) <= 0)
 		return;
 
@@ -750,6 +744,13 @@ void udp_listen_sock_handler(const struct ctx *c,
 			     union epoll_ref ref, uint32_t events,
 			     const struct timespec *now)
 {
+	if (udp_sock_errs(c, ref, events) < 0) {
+		err("UDP: Unrecoverable error on listening socket:"
+		    " (%s port %hu)", pif_name(ref.udp.pif), ref.udp.port);
+		/* FIXME: what now?  close/re-open socket? */
+		return;
+	}
+
 	if (c->mode == MODE_VU) {
 		udp_vu_listen_sock_handler(c, ref, events, now);
 		return;
@@ -777,17 +778,8 @@ static void udp_buf_reply_sock_handler(const struct ctx *c, union epoll_ref ref,
 	uint8_t topif = pif_at_sidx(tosidx);
 	int n, i, from_s;
 
-	ASSERT(!c->no_udp && uflow);
-
 	from_s = uflow->s[ref.flowside.sidei];
 
-	if (udp_sock_errs(c, ref, events) < 0) {
-		flow_err(uflow, "Unrecoverable error on reply socket");
-		flow_err_details(uflow);
-		udp_flow_close(c, uflow);
-		return;
-	}
-
 	if ((n = udp_sock_recv(c, from_s, events, udp_mh_recv)) <= 0)
 		return;
 
@@ -825,6 +817,17 @@ static void udp_buf_reply_sock_handler(const struct ctx *c, union epoll_ref ref,
 void udp_reply_sock_handler(const struct ctx *c, union epoll_ref ref,
 			    uint32_t events, const struct timespec *now)
 {
+	struct udp_flow *uflow = udp_at_sidx(ref.flowside);
+
+	ASSERT(!c->no_udp && uflow);
+
+	if (udp_sock_errs(c, ref, events) < 0) {
+		flow_err(uflow, "Unrecoverable error on reply socket");
+		flow_err_details(uflow);
+		udp_flow_close(c, uflow);
+		return;
+	}
+
 	if (c->mode == MODE_VU) {
 		udp_vu_reply_sock_handler(c, ref, events, now);
 		return;
diff --git a/udp_internal.h b/udp_internal.h
index 3b081f5..02724e5 100644
--- a/udp_internal.h
+++ b/udp_internal.h
@@ -30,5 +30,5 @@ size_t udp_update_hdr4(struct iphdr *ip4h, struct udp_payload_t *bp,
 size_t udp_update_hdr6(struct ipv6hdr *ip6h, struct udp_payload_t *bp,
                        const struct flowside *toside, size_t dlen,
 		       bool no_udp_csum);
-int udp_sock_errs(const struct ctx *c, union epoll_ref ref, uint32_t events);
+
 #endif /* UDP_INTERNAL_H */
diff --git a/udp_vu.c b/udp_vu.c
index c26a223..84f52af 100644
--- a/udp_vu.c
+++ b/udp_vu.c
@@ -227,12 +227,6 @@ void udp_vu_listen_sock_handler(const struct ctx *c, union epoll_ref ref,
 	struct vu_virtq *vq = &vdev->vq[VHOST_USER_RX_QUEUE];
 	int i;
 
-	if (udp_sock_errs(c, ref, events) < 0) {
-		err("UDP: Unrecoverable error on listening socket:"
-		    " (%s port %hu)", pif_name(ref.udp.pif), ref.udp.port);
-		return;
-	}
-
 	for (i = 0; i < UDP_MAX_FRAMES; i++) {
 		const struct flowside *toside;
 		union sockaddr_inany s_in;
@@ -300,15 +294,6 @@ void udp_vu_reply_sock_handler(const struct ctx *c, union epoll_ref ref,
 	struct vu_virtq *vq = &vdev->vq[VHOST_USER_RX_QUEUE];
 	int i;
 
-	ASSERT(!c->no_udp);
-
-	if (udp_sock_errs(c, ref, events) < 0) {
-		flow_err(uflow, "Unrecoverable error on reply socket");
-		flow_err_details(uflow);
-		udp_flow_close(c, uflow);
-		return;
-	}
-
 	for (i = 0; i < UDP_MAX_FRAMES; i++) {
 		uint8_t topif = pif_at_sidx(tosidx);
 		ssize_t dlen;

From 5a977c2f4ee8926673554b2b456e7791962b2ce2 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Wed, 26 Mar 2025 14:44:02 +1100
Subject: [PATCH 311/382] udp: Simplify checking of epoll event bits

udp_{listen,reply}_sock_handler() can accept both EPOLLERR and EPOLLIN
events.  However, unlike most epoll event handlers we don't check the
event bits right there.  EPOLLERR is checked within udp_sock_errs() which
we call unconditionally.  The EPOLLIN check is buried even deeper, within
both udp_sock_recv() and udp_vu_sock_recv().

We can simplify the logic and pass fewer extraneous parameters around by
moving the checking of the event bits to the top level event handlers.

This makes udp_{buf,vu}_{listen,reply}_sock_handler() no longer general
event handlers, but specific to EPOLLIN events, meaning new data.  So,
rename those functions to udp_{buf,vu}_{listen,reply}_sock_data() to better
reflect their function.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 udp.c    | 78 ++++++++++++++++++++++++--------------------------------
 udp_vu.c | 25 +++++++-----------
 udp_vu.h |  8 +++---
 3 files changed, 47 insertions(+), 64 deletions(-)

diff --git a/udp.c b/udp.c
index 4a06b16..26a91c9 100644
--- a/udp.c
+++ b/udp.c
@@ -581,12 +581,10 @@ static int udp_sock_recverr(const struct ctx *c, union epoll_ref ref)
  * udp_sock_errs() - Process errors on a socket
  * @c:		Execution context
  * @ref:	epoll reference
- * @events:	epoll events bitmap
  *
  * Return: Number of errors handled, or < 0 if we have an unrecoverable error
  */
-static int udp_sock_errs(const struct ctx *c, union epoll_ref ref,
-			 uint32_t events)
+static int udp_sock_errs(const struct ctx *c, union epoll_ref ref)
 {
 	unsigned n_err = 0;
 	socklen_t errlen;
@@ -595,9 +593,6 @@ static int udp_sock_errs(const struct ctx *c, union epoll_ref ref,
 
 	ASSERT(!c->no_udp);
 
-	if (!(events & EPOLLERR))
-		return 0; /* Nothing to do */
-
 	/* Empty the error queue */
 	while ((rc = udp_sock_recverr(c, ref)) > 0)
 		n_err += rc;
@@ -630,15 +625,13 @@ static int udp_sock_errs(const struct ctx *c, union epoll_ref ref,
  * udp_sock_recv() - Receive datagrams from a socket
  * @c:		Execution context
  * @s:		Socket to receive from
- * @events:	epoll events bitmap
  * @mmh		mmsghdr array to receive into
  *
  * Return: Number of datagrams received
  *
  * #syscalls recvmmsg arm:recvmmsg_time64 i686:recvmmsg_time64
  */
-static int udp_sock_recv(const struct ctx *c, int s, uint32_t events,
-			 struct mmsghdr *mmh)
+static int udp_sock_recv(const struct ctx *c, int s, struct mmsghdr *mmh)
 {
 	/* For not entirely clear reasons (data locality?) pasta gets better
 	 * throughput if we receive tap datagrams one at a atime.  For small
@@ -651,9 +644,6 @@ static int udp_sock_recv(const struct ctx *c, int s, uint32_t events,
 
 	ASSERT(!c->no_udp);
 
-	if (!(events & EPOLLIN))
-		return 0;
-
 	n = recvmmsg(s, mmh, n, 0, NULL);
 	if (n < 0) {
 		err_perror("Error receiving datagrams");
@@ -664,22 +654,20 @@ static int udp_sock_recv(const struct ctx *c, int s, uint32_t events,
 }
 
 /**
- * udp_buf_listen_sock_handler() - Handle new data from socket
+ * udp_buf_listen_sock_data() - Handle new data from socket
  * @c:		Execution context
  * @ref:	epoll reference
- * @events:	epoll events bitmap
  * @now:	Current timestamp
  *
  * #syscalls recvmmsg
  */
-static void udp_buf_listen_sock_handler(const struct ctx *c,
-					union epoll_ref ref, uint32_t events,
-					const struct timespec *now)
+static void udp_buf_listen_sock_data(const struct ctx *c, union epoll_ref ref,
+				     const struct timespec *now)
 {
 	const socklen_t sasize = sizeof(udp_meta[0].s_in);
 	int n, i;
 
-	if ((n = udp_sock_recv(c, ref.fd, events, udp_mh_recv)) <= 0)
+	if ((n = udp_sock_recv(c, ref.fd, udp_mh_recv)) <= 0)
 		return;
 
 	/* We divide datagrams into batches based on how we need to send them,
@@ -744,33 +732,33 @@ void udp_listen_sock_handler(const struct ctx *c,
 			     union epoll_ref ref, uint32_t events,
 			     const struct timespec *now)
 {
-	if (udp_sock_errs(c, ref, events) < 0) {
-		err("UDP: Unrecoverable error on listening socket:"
-		    " (%s port %hu)", pif_name(ref.udp.pif), ref.udp.port);
-		/* FIXME: what now?  close/re-open socket? */
-		return;
+	if (events & EPOLLERR) {
+		if (udp_sock_errs(c, ref) < 0) {
+			err("UDP: Unrecoverable error on listening socket:"
+			    " (%s port %hu)", pif_name(ref.udp.pif), ref.udp.port);
+			/* FIXME: what now?  close/re-open socket? */
+			return;
+		}
 	}
 
-	if (c->mode == MODE_VU) {
-		udp_vu_listen_sock_handler(c, ref, events, now);
-		return;
+	if (events & EPOLLIN) {
+		if (c->mode == MODE_VU)
+			udp_vu_listen_sock_data(c, ref, now);
+		else
+			udp_buf_listen_sock_data(c, ref, now);
 	}
-
-	udp_buf_listen_sock_handler(c, ref, events, now);
 }
 
 /**
- * udp_buf_reply_sock_handler() - Handle new data from flow specific socket
+ * udp_buf_reply_sock_data() - Handle new data from flow specific socket
  * @c:		Execution context
  * @ref:	epoll reference
- * @events:	epoll events bitmap
  * @now:	Current timestamp
  *
  * #syscalls recvmmsg
  */
-static void udp_buf_reply_sock_handler(const struct ctx *c, union epoll_ref ref,
-				       uint32_t events,
-				       const struct timespec *now)
+static void udp_buf_reply_sock_data(const struct ctx *c, union epoll_ref ref,
+				    const struct timespec *now)
 {
 	flow_sidx_t tosidx = flow_sidx_opposite(ref.flowside);
 	const struct flowside *toside = flowside_at_sidx(tosidx);
@@ -780,7 +768,7 @@ static void udp_buf_reply_sock_handler(const struct ctx *c, union epoll_ref ref,
 
 	from_s = uflow->s[ref.flowside.sidei];
 
-	if ((n = udp_sock_recv(c, from_s, events, udp_mh_recv)) <= 0)
+	if ((n = udp_sock_recv(c, from_s, udp_mh_recv)) <= 0)
 		return;
 
 	flow_trace(uflow, "Received %d datagrams on reply socket", n);
@@ -821,19 +809,21 @@ void udp_reply_sock_handler(const struct ctx *c, union epoll_ref ref,
 
 	ASSERT(!c->no_udp && uflow);
 
-	if (udp_sock_errs(c, ref, events) < 0) {
-		flow_err(uflow, "Unrecoverable error on reply socket");
-		flow_err_details(uflow);
-		udp_flow_close(c, uflow);
-		return;
+	if (events & EPOLLERR) {
+		if (udp_sock_errs(c, ref) < 0) {
+			flow_err(uflow, "Unrecoverable error on reply socket");
+			flow_err_details(uflow);
+			udp_flow_close(c, uflow);
+			return;
+		}
 	}
 
-	if (c->mode == MODE_VU) {
-		udp_vu_reply_sock_handler(c, ref, events, now);
-		return;
+	if (events & EPOLLIN) {
+		if (c->mode == MODE_VU)
+			udp_vu_reply_sock_data(c, ref, now);
+		else
+			udp_buf_reply_sock_data(c, ref, now);
 	}
-
-	udp_buf_reply_sock_handler(c, ref, events, now);
 }
 
 /**
diff --git a/udp_vu.c b/udp_vu.c
index 84f52af..698667f 100644
--- a/udp_vu.c
+++ b/udp_vu.c
@@ -78,14 +78,12 @@ static int udp_vu_sock_info(int s, union sockaddr_inany *s_in)
  * udp_vu_sock_recv() - Receive datagrams from socket into vhost-user buffers
  * @c:		Execution context
  * @s:		Socket to receive from
- * @events:	epoll events bitmap
  * @v6:		Set for IPv6 connections
  * @dlen:	Size of received data (output)
  *
  * Return: Number of iov entries used to store the datagram
  */
-static int udp_vu_sock_recv(const struct ctx *c, int s, uint32_t events,
-			    bool v6, ssize_t *dlen)
+static int udp_vu_sock_recv(const struct ctx *c, int s, bool v6, ssize_t *dlen)
 {
 	struct vu_dev *vdev = c->vdev;
 	struct vu_virtq *vq = &vdev->vq[VHOST_USER_RX_QUEUE];
@@ -95,9 +93,6 @@ static int udp_vu_sock_recv(const struct ctx *c, int s, uint32_t events,
 
 	ASSERT(!c->no_udp);
 
-	if (!(events & EPOLLIN))
-		return 0;
-
 	/* compute L2 header length */
 	hdrlen = udp_vu_hdrlen(v6);
 
@@ -214,14 +209,13 @@ static void udp_vu_csum(const struct flowside *toside, int iov_used)
 }
 
 /**
- * udp_vu_listen_sock_handler() - Handle new data from socket
+ * udp_vu_listen_sock_data() - Handle new data from socket
  * @c:		Execution context
  * @ref:	epoll reference
- * @events:	epoll events bitmap
  * @now:	Current timestamp
  */
-void udp_vu_listen_sock_handler(const struct ctx *c, union epoll_ref ref,
-				uint32_t events, const struct timespec *now)
+void udp_vu_listen_sock_data(const struct ctx *c, union epoll_ref ref,
+			     const struct timespec *now)
 {
 	struct vu_dev *vdev = c->vdev;
 	struct vu_virtq *vq = &vdev->vq[VHOST_USER_RX_QUEUE];
@@ -262,7 +256,7 @@ void udp_vu_listen_sock_handler(const struct ctx *c, union epoll_ref ref,
 
 		v6 = !(inany_v4(&toside->eaddr) && inany_v4(&toside->oaddr));
 
-		iov_used = udp_vu_sock_recv(c, ref.fd, events, v6, &dlen);
+		iov_used = udp_vu_sock_recv(c, ref.fd, v6, &dlen);
 		if (iov_used <= 0)
 			break;
 
@@ -277,14 +271,13 @@ void udp_vu_listen_sock_handler(const struct ctx *c, union epoll_ref ref,
 }
 
 /**
- * udp_vu_reply_sock_handler() - Handle new data from flow specific socket
+ * udp_vu_reply_sock_data() - Handle new data from flow specific socket
  * @c:		Execution context
  * @ref:	epoll reference
- * @events:	epoll events bitmap
  * @now:	Current timestamp
  */
-void udp_vu_reply_sock_handler(const struct ctx *c, union epoll_ref ref,
-			        uint32_t events, const struct timespec *now)
+void udp_vu_reply_sock_data(const struct ctx *c, union epoll_ref ref,
+			    const struct timespec *now)
 {
 	flow_sidx_t tosidx = flow_sidx_opposite(ref.flowside);
 	const struct flowside *toside = flowside_at_sidx(tosidx);
@@ -313,7 +306,7 @@ void udp_vu_reply_sock_handler(const struct ctx *c, union epoll_ref ref,
 
 		v6 = !(inany_v4(&toside->eaddr) && inany_v4(&toside->oaddr));
 
-		iov_used = udp_vu_sock_recv(c, from_s, events, v6, &dlen);
+		iov_used = udp_vu_sock_recv(c, from_s, v6, &dlen);
 		if (iov_used <= 0)
 			break;
 		flow_trace(uflow, "Received 1 datagram on reply socket");
diff --git a/udp_vu.h b/udp_vu.h
index ba7018d..4f2262d 100644
--- a/udp_vu.h
+++ b/udp_vu.h
@@ -6,8 +6,8 @@
 #ifndef UDP_VU_H
 #define UDP_VU_H
 
-void udp_vu_listen_sock_handler(const struct ctx *c, union epoll_ref ref,
-				uint32_t events, const struct timespec *now);
-void udp_vu_reply_sock_handler(const struct ctx *c, union epoll_ref ref,
-			       uint32_t events, const struct timespec *now);
+void udp_vu_listen_sock_data(const struct ctx *c, union epoll_ref ref,
+			     const struct timespec *now);
+void udp_vu_reply_sock_data(const struct ctx *c, union epoll_ref ref,
+			    const struct timespec *now);
 #endif /* UDP_VU_H */

From d924b7dfc40cfaf9ebc64fe052efd8b0c45c6478 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Wed, 26 Mar 2025 14:44:03 +1100
Subject: [PATCH 312/382] udp_vu: Factor things out of udp_vu_reply_sock_data()
 loop

At the start of every cycle of the loop in udp_vu_reply_sock_data() we:
 - ASSERT that uflow is not NULL
 - Check if the target pif is PIF_TAP
 - Initialize the v6 boolean

However, all of these depend only on the flow, which doesn't change across
the loop.  This was probably carried over from udp_vu_listen_sock_data(),
where the flow can indeed differ for each packet.  For the reply socket
case, however, factor that logic out of the loop.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 udp_vu.c | 28 +++++++++++++---------------
 1 file changed, 13 insertions(+), 15 deletions(-)

diff --git a/udp_vu.c b/udp_vu.c
index 698667f..6e1823a 100644
--- a/udp_vu.c
+++ b/udp_vu.c
@@ -281,30 +281,28 @@ void udp_vu_reply_sock_data(const struct ctx *c, union epoll_ref ref,
 {
 	flow_sidx_t tosidx = flow_sidx_opposite(ref.flowside);
 	const struct flowside *toside = flowside_at_sidx(tosidx);
+	bool v6 = !(inany_v4(&toside->eaddr) && inany_v4(&toside->oaddr));
 	struct udp_flow *uflow = udp_at_sidx(ref.flowside);
 	int from_s = uflow->s[ref.flowside.sidei];
 	struct vu_dev *vdev = c->vdev;
 	struct vu_virtq *vq = &vdev->vq[VHOST_USER_RX_QUEUE];
+	uint8_t topif = pif_at_sidx(tosidx);
 	int i;
 
+	ASSERT(uflow);
+
+	if (topif != PIF_TAP) {
+		uint8_t frompif = pif_at_sidx(ref.flowside);
+
+		flow_err(uflow,
+			 "No support for forwarding UDP from %s to %s",
+			 pif_name(frompif), pif_name(topif));
+		return;
+	}
+
 	for (i = 0; i < UDP_MAX_FRAMES; i++) {
-		uint8_t topif = pif_at_sidx(tosidx);
 		ssize_t dlen;
 		int iov_used;
-		bool v6;
-
-		ASSERT(uflow);
-
-		if (topif != PIF_TAP) {
-			uint8_t frompif = pif_at_sidx(ref.flowside);
-
-			flow_err(uflow,
-				 "No support for forwarding UDP from %s to %s",
-				 pif_name(frompif), pif_name(topif));
-			continue;
-		}
-
-		v6 = !(inany_v4(&toside->eaddr) && inany_v4(&toside->oaddr));
 
 		iov_used = udp_vu_sock_recv(c, from_s, v6, &dlen);
 		if (iov_used <= 0)

From 269cf6a12a5f89683daa8da9232cc2524d7a4ae2 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Wed, 26 Mar 2025 14:44:04 +1100
Subject: [PATCH 313/382] udp: Share more logic between vu and non-vu reply
 socket paths

Share some additional miscellaneous logic between the vhost-user and "buf"
paths for data on udp reply sockets.  The biggest piece is error handling
of cases where we can't forward between the two pifs of the flow.  We also
make common some simpler logic that locates the correct flow and its
parameters.

This adds some lines of code due to extra comment lines, but nonetheless
reduces logic duplication.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 udp.c    | 41 ++++++++++++++++++++++++++---------------
 udp_vu.c | 26 +++++++++++---------------
 udp_vu.h |  3 ++-
 3 files changed, 39 insertions(+), 31 deletions(-)

diff --git a/udp.c b/udp.c
index 26a91c9..f417cea 100644
--- a/udp.c
+++ b/udp.c
@@ -752,24 +752,25 @@ void udp_listen_sock_handler(const struct ctx *c,
 /**
  * udp_buf_reply_sock_data() - Handle new data from flow specific socket
  * @c:		Execution context
- * @ref:	epoll reference
+ * @s:		Socket to read data from
+ * @tosidx:	Flow & side to forward data from @s to
  * @now:	Current timestamp
  *
+ * Return: true on success, false if can't forward from socket to flow's pif
+ *
  * #syscalls recvmmsg
  */
-static void udp_buf_reply_sock_data(const struct ctx *c, union epoll_ref ref,
+static bool udp_buf_reply_sock_data(const struct ctx *c,
+				    int s, flow_sidx_t tosidx,
 				    const struct timespec *now)
 {
-	flow_sidx_t tosidx = flow_sidx_opposite(ref.flowside);
 	const struct flowside *toside = flowside_at_sidx(tosidx);
-	struct udp_flow *uflow = udp_at_sidx(ref.flowside);
+	struct udp_flow *uflow = udp_at_sidx(tosidx);
 	uint8_t topif = pif_at_sidx(tosidx);
-	int n, i, from_s;
+	int n, i;
 
-	from_s = uflow->s[ref.flowside.sidei];
-
-	if ((n = udp_sock_recv(c, from_s, udp_mh_recv)) <= 0)
-		return;
+	if ((n = udp_sock_recv(c, s, udp_mh_recv)) <= 0)
+		return true;
 
 	flow_trace(uflow, "Received %d datagrams on reply socket", n);
 	uflow->ts = now->tv_sec;
@@ -788,11 +789,10 @@ static void udp_buf_reply_sock_data(const struct ctx *c, union epoll_ref ref,
 	} else if (topif == PIF_TAP) {
 		tap_send_frames(c, &udp_l2_iov[0][0], UDP_NUM_IOVS, n);
 	} else {
-		uint8_t frompif = pif_at_sidx(ref.flowside);
-
-		flow_err(uflow, "No support for forwarding UDP from %s to %s",
-			 pif_name(frompif), pif_name(topif));
+		return false;
 	}
+
+	return true;
 }
 
 /**
@@ -819,10 +819,21 @@ void udp_reply_sock_handler(const struct ctx *c, union epoll_ref ref,
 	}
 
 	if (events & EPOLLIN) {
+		flow_sidx_t tosidx = flow_sidx_opposite(ref.flowside);
+		int s = ref.fd;
+		bool ret;
+
 		if (c->mode == MODE_VU)
-			udp_vu_reply_sock_data(c, ref, now);
+			ret = udp_vu_reply_sock_data(c, s, tosidx, now);
 		else
-			udp_buf_reply_sock_data(c, ref, now);
+			ret = udp_buf_reply_sock_data(c, s, tosidx, now);
+
+		if (!ret) {
+			flow_err(uflow,
+				 "No support for forwarding UDP from %s to %s",
+				 pif_name(pif_at_sidx(ref.flowside)),
+				 pif_name(pif_at_sidx(tosidx)));
+		}
 	}
 }
 
diff --git a/udp_vu.c b/udp_vu.c
index 6e1823a..06bdeae 100644
--- a/udp_vu.c
+++ b/udp_vu.c
@@ -273,38 +273,32 @@ void udp_vu_listen_sock_data(const struct ctx *c, union epoll_ref ref,
 /**
  * udp_vu_reply_sock_data() - Handle new data from flow specific socket
  * @c:		Execution context
- * @ref:	epoll reference
+ * @s:		Socket to read data from
+ * @tosidx:	Flow & side to forward data from @s to
  * @now:	Current timestamp
+ *
+ * Return: true on success, false if can't forward from socket to flow's pif
  */
-void udp_vu_reply_sock_data(const struct ctx *c, union epoll_ref ref,
+bool udp_vu_reply_sock_data(const struct ctx *c, int s, flow_sidx_t tosidx,
 			    const struct timespec *now)
 {
-	flow_sidx_t tosidx = flow_sidx_opposite(ref.flowside);
 	const struct flowside *toside = flowside_at_sidx(tosidx);
 	bool v6 = !(inany_v4(&toside->eaddr) && inany_v4(&toside->oaddr));
-	struct udp_flow *uflow = udp_at_sidx(ref.flowside);
-	int from_s = uflow->s[ref.flowside.sidei];
+	struct udp_flow *uflow = udp_at_sidx(tosidx);
 	struct vu_dev *vdev = c->vdev;
 	struct vu_virtq *vq = &vdev->vq[VHOST_USER_RX_QUEUE];
-	uint8_t topif = pif_at_sidx(tosidx);
 	int i;
 
 	ASSERT(uflow);
 
-	if (topif != PIF_TAP) {
-		uint8_t frompif = pif_at_sidx(ref.flowside);
-
-		flow_err(uflow,
-			 "No support for forwarding UDP from %s to %s",
-			 pif_name(frompif), pif_name(topif));
-		return;
-	}
+	if (pif_at_sidx(tosidx) != PIF_TAP)
+		return false;
 
 	for (i = 0; i < UDP_MAX_FRAMES; i++) {
 		ssize_t dlen;
 		int iov_used;
 
-		iov_used = udp_vu_sock_recv(c, from_s, v6, &dlen);
+		iov_used = udp_vu_sock_recv(c, s, v6, &dlen);
 		if (iov_used <= 0)
 			break;
 		flow_trace(uflow, "Received 1 datagram on reply socket");
@@ -318,4 +312,6 @@ void udp_vu_reply_sock_data(const struct ctx *c, union epoll_ref ref,
 		}
 		vu_flush(vdev, vq, elem, iov_used);
 	}
+
+	return true;
 }
diff --git a/udp_vu.h b/udp_vu.h
index 4f2262d..2299b51 100644
--- a/udp_vu.h
+++ b/udp_vu.h
@@ -8,6 +8,7 @@
 
 void udp_vu_listen_sock_data(const struct ctx *c, union epoll_ref ref,
 			     const struct timespec *now);
-void udp_vu_reply_sock_data(const struct ctx *c, union epoll_ref ref,
+bool udp_vu_reply_sock_data(const struct ctx *c, int s, flow_sidx_t tosidx,
 			    const struct timespec *now);
+
 #endif /* UDP_VU_H */

From f67c488b81ca2a4d9f819b625fceab10b71fc3a5 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Wed, 26 Mar 2025 14:44:05 +1100
Subject: [PATCH 314/382] udp: Better handling of failure to forward from reply
 socket

In udp_reply_sock_handler() if we're unable to forward the datagrams we
just print an error.  Generally this means we have an unsupported pair of
pifs in the flow table, though, and that hasn't changed.  So, next time we
get a matching packet we'll just get the same failure.  In vhost-user mode
we don't even dequeue the incoming packets which triggered this, so we're
likely to get the same failure immediately.

Instead, close the flow, in the same way we do for an unrecoverable error.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 udp.c | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/udp.c b/udp.c
index f417cea..96e48dd 100644
--- a/udp.c
+++ b/udp.c
@@ -812,9 +812,7 @@ void udp_reply_sock_handler(const struct ctx *c, union epoll_ref ref,
 	if (events & EPOLLERR) {
 		if (udp_sock_errs(c, ref) < 0) {
 			flow_err(uflow, "Unrecoverable error on reply socket");
-			flow_err_details(uflow);
-			udp_flow_close(c, uflow);
-			return;
+			goto fail;
 		}
 	}
 
@@ -829,12 +827,15 @@ void udp_reply_sock_handler(const struct ctx *c, union epoll_ref ref,
 			ret = udp_buf_reply_sock_data(c, s, tosidx, now);
 
 		if (!ret) {
-			flow_err(uflow,
-				 "No support for forwarding UDP from %s to %s",
-				 pif_name(pif_at_sidx(ref.flowside)),
-				 pif_name(pif_at_sidx(tosidx)));
+			flow_err(uflow, "Unable to forward UDP");
+			goto fail;
 		}
 	}
+	return;
+
+fail:
+	flow_err_details(uflow);
+	udp_flow_close(c, uflow);
 }
 
 /**

From 37d78c9ef3944c1b060e3e8259b82fea3f8ec6bf Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Wed, 26 Mar 2025 14:44:06 +1100
Subject: [PATCH 315/382] udp: Always hash socket facing flowsides

For UDP packets from the tap interface (like TCP) we use a hash table to
look up which flow they belong to.  Unlike TCP, we sometimes also create a
hash table entry for the socket side of UDP flows.  We need that when we
receive a UDP packet from a "listening" socket which isn't specific to a
single flow.

At present we only do this for the initiating side of flows, which re-use
the listening socket.  For the target side we use a connected "reply"
socket specific to the single flow.

We have in mind changes that may introduce some edge cases where we could
receive UDP packets on a non flow specific socket more often.  To allow for
those changes - and to slightly simplify things in the meantime - always
put both sides of a UDP flow - tap or socket - in the hash table.  It's
not that costly, and means we always have the option of falling back to a
hash lookup.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 udp_flow.c | 41 ++++++++++++++++++++---------------------
 1 file changed, 20 insertions(+), 21 deletions(-)

diff --git a/udp_flow.c b/udp_flow.c
index c6b8630..7e80924 100644
--- a/udp_flow.c
+++ b/udp_flow.c
@@ -41,25 +41,23 @@ struct udp_flow *udp_at_sidx(flow_sidx_t sidx)
  */
 void udp_flow_close(const struct ctx *c, struct udp_flow *uflow)
 {
+	unsigned sidei;
+
 	if (uflow->closed)
 		return; /* Nothing to do */
 
-	if (uflow->s[INISIDE] >= 0) {
-		/* The listening socket needs to stay in epoll */
-		close(uflow->s[INISIDE]);
-		uflow->s[INISIDE] = -1;
+	flow_foreach_sidei(sidei) {
+		flow_hash_remove(c, FLOW_SIDX(uflow, sidei));
+		if (uflow->s[sidei] >= 0) {
+			/* The listening socket needs to stay in epoll, but the
+			 * flow specific one needs to be removed */
+			if (sidei == TGTSIDE)
+				epoll_del(c, uflow->s[sidei]);
+			close(uflow->s[sidei]);
+			uflow->s[sidei] = -1;
+		}
 	}
 
-	if (uflow->s[TGTSIDE] >= 0) {
-		/* But the flow specific one needs to be removed */
-		epoll_del(c, uflow->s[TGTSIDE]);
-		close(uflow->s[TGTSIDE]);
-		uflow->s[TGTSIDE] = -1;
-	}
-	flow_hash_remove(c, FLOW_SIDX(uflow, INISIDE));
-	if (!pif_is_socket(uflow->f.pif[TGTSIDE]))
-		flow_hash_remove(c, FLOW_SIDX(uflow, TGTSIDE));
-
 	uflow->closed = true;
 }
 
@@ -77,6 +75,7 @@ static flow_sidx_t udp_flow_new(const struct ctx *c, union flow *flow,
 {
 	struct udp_flow *uflow = NULL;
 	const struct flowside *tgt;
+	unsigned sidei;
 	uint8_t tgtpif;
 
 	if (!(tgt = flow_target(c, flow, IPPROTO_UDP)))
@@ -143,14 +142,14 @@ static flow_sidx_t udp_flow_new(const struct ctx *c, union flow *flow,
 		}
 	}
 
-	flow_hash_insert(c, FLOW_SIDX(uflow, INISIDE));
-
-	/* If the target side is a socket, it will be a reply socket that knows
-	 * its own flowside.  But if it's tap, then we need to look it up by
-	 * hash.
+	/* Tap sides always need to be looked up by hash.  Socket sides don't
+	 * always, but sometimes do (receiving packets on a socket not specific
+	 * to one flow).  Unconditionally hash both sides so all our bases are
+	 * covered
 	 */
-	if (!pif_is_socket(tgtpif))
-		flow_hash_insert(c, FLOW_SIDX(uflow, TGTSIDE));
+	flow_foreach_sidei(sidei)
+		flow_hash_insert(c, FLOW_SIDX(uflow, sidei));
+
 	FLOW_ACTIVATE(uflow);
 
 	return FLOW_SIDX(uflow, TGTSIDE);

From 77883fbdd17e836247f746d888dcad3f611a6a59 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Wed, 26 Mar 2025 14:44:07 +1100
Subject: [PATCH 316/382] udp: Add helper function for creating connected UDP
 socket

Currently udp_flow_new() open codes creating and connecting a socket to use
for reply messages.  We have in mind some more places to use this logic,
plus it just makes for a rather large function.  Split this handling out
into a new udp_flow_sock() function.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 udp_flow.c | 104 +++++++++++++++++++++++++++++------------------------
 1 file changed, 58 insertions(+), 46 deletions(-)

diff --git a/udp_flow.c b/udp_flow.c
index 7e80924..bf4b896 100644
--- a/udp_flow.c
+++ b/udp_flow.c
@@ -61,6 +61,61 @@ void udp_flow_close(const struct ctx *c, struct udp_flow *uflow)
 	uflow->closed = true;
 }
 
+/**
+ * udp_flow_sock() - Create, bind and connect a flow specific UDP socket
+ * @c:		Execution context
+ * @uflow:	UDP flow to open socket for
+ * @sidei:	Side of @uflow to open socket for
+ *
+ * Return: fd of new socket on success, -ve error code on failure
+ */
+static int udp_flow_sock(const struct ctx *c,
+			 const struct udp_flow *uflow, unsigned sidei)
+{
+	const struct flowside *side = &uflow->f.side[sidei];
+	struct mmsghdr discard[UIO_MAXIOV] = { 0 };
+	uint8_t pif = uflow->f.pif[sidei];
+	union {
+		flow_sidx_t sidx;
+		uint32_t data;
+	} fref = { .sidx = FLOW_SIDX(uflow, sidei) };
+	int rc, s;
+
+	s = flowside_sock_l4(c, EPOLL_TYPE_UDP_REPLY, pif, side, fref.data);
+	if (s < 0) {
+		flow_dbg_perror(uflow, "Couldn't open flow specific socket");
+		return s;
+	}
+
+	if (flowside_connect(c, s, pif, side) < 0) {
+		rc = -errno;
+		flow_dbg_perror(uflow, "Couldn't connect flow socket");
+		return rc;
+	}
+
+	/* It's possible, if unlikely, that we could receive some unrelated
+	 * packets in between the bind() and connect() of this socket.  For now
+	 * we just discard these.
+	 *
+	 * FIXME: Redirect these to an appropriate handler
+	 */
+	rc = recvmmsg(s, discard, ARRAY_SIZE(discard), MSG_DONTWAIT, NULL);
+	if (rc >= ARRAY_SIZE(discard)) {
+		flow_dbg(uflow, "Too many (%d) spurious reply datagrams", rc);
+		return -E2BIG;
+	}
+
+	if (rc > 0) {
+		flow_trace(uflow, "Discarded %d spurious reply datagrams", rc);
+	} else if (errno != EAGAIN) {
+		rc = -errno;
+		flow_perror(uflow, "Unexpected error discarding datagrams");
+		return rc;
+	}
+
+	return s;
+}
+
 /**
  * udp_flow_new() - Common setup for a new UDP flow
  * @c:		Execution context
@@ -74,13 +129,10 @@ static flow_sidx_t udp_flow_new(const struct ctx *c, union flow *flow,
 				int s_ini, const struct timespec *now)
 {
 	struct udp_flow *uflow = NULL;
-	const struct flowside *tgt;
 	unsigned sidei;
-	uint8_t tgtpif;
 
-	if (!(tgt = flow_target(c, flow, IPPROTO_UDP)))
+	if (!flow_target(c, flow, IPPROTO_UDP))
 		goto cancel;
-	tgtpif = flow->f.pif[TGTSIDE];
 
 	uflow = FLOW_SET_TYPE(flow, FLOW_UDP, udp);
 	uflow->ts = now->tv_sec;
@@ -98,49 +150,9 @@ static flow_sidx_t udp_flow_new(const struct ctx *c, union flow *flow,
 		}
 	}
 
-	if (pif_is_socket(tgtpif)) {
-		struct mmsghdr discard[UIO_MAXIOV] = { 0 };
-		union {
-			flow_sidx_t sidx;
-			uint32_t data;
-		} fref = {
-			.sidx = FLOW_SIDX(flow, TGTSIDE),
-		};
-		int rc;
-
-		uflow->s[TGTSIDE] = flowside_sock_l4(c, EPOLL_TYPE_UDP_REPLY,
-						     tgtpif, tgt, fref.data);
-		if (uflow->s[TGTSIDE] < 0) {
-			flow_dbg_perror(uflow,
-					"Couldn't open socket for spliced flow");
+	if (pif_is_socket(flow->f.pif[TGTSIDE]))
+		if ((uflow->s[TGTSIDE] = udp_flow_sock(c, uflow, TGTSIDE)) < 0)
 			goto cancel;
-		}
-
-		if (flowside_connect(c, uflow->s[TGTSIDE], tgtpif, tgt) < 0) {
-			flow_dbg_perror(uflow, "Couldn't connect flow socket");
-			goto cancel;
-		}
-
-		/* It's possible, if unlikely, that we could receive some
-		 * unrelated packets in between the bind() and connect() of this
-		 * socket.  For now we just discard these.  We could consider
-		 * trying to redirect these to an appropriate handler, if we
-		 * need to.
-		 */
-		rc = recvmmsg(uflow->s[TGTSIDE], discard, ARRAY_SIZE(discard),
-			      MSG_DONTWAIT, NULL);
-		if (rc >= ARRAY_SIZE(discard)) {
-			flow_dbg(uflow,
-				 "Too many (%d) spurious reply datagrams", rc);
-			goto cancel;
-		} else if (rc > 0) {
-			flow_trace(uflow,
-				   "Discarded %d spurious reply datagrams", rc);
-		} else if (errno != EAGAIN) {
-			flow_perror(uflow,
-				    "Unexpected error discarding datagrams");
-		}
-	}
 
 	/* Tap sides always need to be looked up by hash.  Socket sides don't
 	 * always, but sometimes do (receiving packets on a socket not specific

From 664c588be752bf590adb55bf1f613d4a36f02e7c Mon Sep 17 00:00:00 2001
From: Julian Wundrak <julian@wundrak.net>
Date: Wed, 26 Mar 2025 20:14:31 +0000
Subject: [PATCH 317/382] build: normalize arm targets
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Linux distributions use different dumpmachine outputs for the ARM
architecture: arm, armv6l, armv7l.
For the syscall annotation, these variants are normalized to “arm”.

Link: https://bugs.passt.top/show_bug.cgi?id=117
Signed-off-by: Julian Wundrak <julian@wundrak.net>
[sbrivio: Fix typo: assign from TARGET_ARCH, not from TARGET]
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 Makefile | 1 +
 1 file changed, 1 insertion(+)

diff --git a/Makefile b/Makefile
index 31cbac3..3328f83 100644
--- a/Makefile
+++ b/Makefile
@@ -20,6 +20,7 @@ $(if $(TARGET),,$(error Failed to get target architecture))
 # Get 'uname -m'-like architecture description for target
 TARGET_ARCH := $(firstword $(subst -, ,$(TARGET)))
 TARGET_ARCH := $(patsubst [:upper:],[:lower:],$(TARGET_ARCH))
+TARGET_ARCH := $(patsubst arm%,arm,$(TARGET_ARCH))
 TARGET_ARCH := $(subst powerpc,ppc,$(TARGET_ARCH))
 
 # On some systems enabling optimization also enables source fortification,

From 65cca54be84ffc5d2e18fcb8229dcc9d1f229479 Mon Sep 17 00:00:00 2001
From: Jon Maloy <jmaloy@redhat.com>
Date: Wed, 26 Mar 2025 11:59:02 -0400
Subject: [PATCH 318/382] udp: correct source address for ICMP messages

While developing traceroute forwarding tap-to-sock we found that
struct msghdr.msg_name for the ICMPs in the opposite direction always
contains the destination address of the original UDP message, and not,
as one might expect, the one of the host which created the error message.

Study of the kernel code reveals that this address instead is appended
as extra data after the received struct sock_extended_err area.

We now change the ICMP receive code accordingly.
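
As an illustration (not part of this patch), the kernel's own accessor
for that appended address is the SO_EE_OFFENDER() macro from
<linux/errqueue.h>, which simply points just past the extended error
structure:

    const struct sock_extended_err *ee;
    const struct sockaddr *from;

    ee = (const struct sock_extended_err *)CMSG_DATA(hdr);
    from = SO_EE_OFFENDER(ee);  /* address of the host that sent the ICMP */

The struct errhdr introduced below spells out the same layout, with a
union sockaddr_inany directly following the extended error area.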

Fixes: 55431f0077b6 ("udp: create and send ICMPv4 to local peer when applicable")
Fixes: 68b04182e07d ("udp: create and send ICMPv6 to local peer when applicable")
Signed-off-by: Jon Maloy <jmaloy@redhat.com>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 udp.c | 22 ++++++++++++----------
 1 file changed, 12 insertions(+), 10 deletions(-)

diff --git a/udp.c b/udp.c
index 96e48dd..0c223b4 100644
--- a/udp.c
+++ b/udp.c
@@ -510,10 +510,13 @@ static void udp_send_conn_fail_icmp6(const struct ctx *c,
  */
 static int udp_sock_recverr(const struct ctx *c, union epoll_ref ref)
 {
-	const struct sock_extended_err *ee;
+	struct errhdr {
+		struct sock_extended_err ee;
+		union sockaddr_inany saddr;
+	};
+	const struct errhdr *eh;
 	const struct cmsghdr *hdr;
-	union sockaddr_inany saddr;
-	char buf[CMSG_SPACE(sizeof(*ee))];
+	char buf[CMSG_SPACE(sizeof(struct errhdr))];
 	char data[ICMP6_MAX_DLEN];
 	int s = ref.fd;
 	struct iovec iov = {
@@ -521,8 +524,6 @@ static int udp_sock_recverr(const struct ctx *c, union epoll_ref ref)
 		.iov_len = sizeof(data)
 	};
 	struct msghdr mh = {
-		.msg_name = &saddr,
-		.msg_namelen = sizeof(saddr),
 		.msg_iov = &iov,
 		.msg_iovlen = 1,
 		.msg_control = buf,
@@ -553,7 +554,7 @@ static int udp_sock_recverr(const struct ctx *c, union epoll_ref ref)
 		return -1;
 	}
 
-	ee = (const struct sock_extended_err *)CMSG_DATA(hdr);
+	eh = (const struct errhdr *)CMSG_DATA(hdr);
 	if (ref.type == EPOLL_TYPE_UDP_REPLY) {
 		flow_sidx_t sidx = flow_sidx_opposite(ref.flowside);
 		const struct flowside *toside = flowside_at_sidx(sidx);
@@ -561,18 +562,19 @@ static int udp_sock_recverr(const struct ctx *c, union epoll_ref ref)
 
 		if (hdr->cmsg_level == IPPROTO_IP) {
 			dlen = MIN(dlen, ICMP4_MAX_DLEN);
-			udp_send_conn_fail_icmp4(c, ee, toside, saddr.sa4.sin_addr,
+			udp_send_conn_fail_icmp4(c, &eh->ee, toside,
+						 eh->saddr.sa4.sin_addr,
 						 data, dlen);
 		} else if (hdr->cmsg_level == IPPROTO_IPV6) {
-			udp_send_conn_fail_icmp6(c, ee, toside,
-						 &saddr.sa6.sin6_addr,
+			udp_send_conn_fail_icmp6(c, &eh->ee, toside,
+						 &eh->saddr.sa6.sin6_addr,
 						 data, dlen, sidx.flowi);
 		}
 	} else {
 		trace("Ignoring received IP_RECVERR cmsg on listener socket");
 	}
 	debug("%s error on UDP socket %i: %s",
-	      str_ee_origin(ee), s, strerror_(ee->ee_errno));
+	      str_ee_origin(&eh->ee), s, strerror_(eh->ee.ee_errno));
 
 	return 1;
 }

From 42a854a52b6fa2bbd70cbc0c7657c8a49a9c3d2d Mon Sep 17 00:00:00 2001
From: Stefano Brivio <sbrivio@redhat.com>
Date: Fri, 28 Mar 2025 11:39:58 +1100
Subject: [PATCH 319/382] pasta, passt-repair: Support multiple events per
 read() in inotify handlers

The current code assumes that we'll get one event per read() on
inotify descriptors, but that's not the case: neither the documentation
nor reports from the field support that assumption.

Add loops in the two inotify handlers we have, in pasta-specific code
and passt-repair, to go through all the events we receive.
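
A minimal sketch of the iteration pattern (illustrative only; the actual
handlers differ in how they match and act on each event):

    char buf[sizeof(struct inotify_event) + NAME_MAX + 1]
        __attribute__ ((aligned(__alignof__(struct inotify_event))));
    const struct inotify_event *ev;
    ssize_t n;
    char *p;

    n = read(fd, buf, sizeof(buf));
    for (p = buf; n >= (ssize_t)sizeof(*ev) && p < buf + n;
         p += sizeof(*ev) + ev->len) {
            ev = (const struct inotify_event *)p;
            /* inspect ev->mask, ev->len and ev->name here */
    }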

Link: https://bugs.passt.top/show_bug.cgi?id=119
[dwg: Remove unnecessary buffer expansion, use strnlen instead of strlen
 to make Coverity happier]
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
[sbrivio: Add additional check on ev->name and ev->len in passt-repair]
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 passt-repair.c | 32 +++++++++++++++++++++++++-------
 pasta.c        | 20 +++++++++++++-------
 2 files changed, 38 insertions(+), 14 deletions(-)

diff --git a/passt-repair.c b/passt-repair.c
index 120f7aa..86f0293 100644
--- a/passt-repair.c
+++ b/passt-repair.c
@@ -111,14 +111,14 @@ int main(int argc, char **argv)
 	}
 
 	if ((sb.st_mode & S_IFMT) == S_IFDIR) {
-		char buf[sizeof(struct inotify_event) + NAME_MAX + 1];
+		char buf[sizeof(struct inotify_event) + NAME_MAX + 1]
+		   __attribute__ ((aligned(__alignof__(struct inotify_event))));
 		const struct inotify_event *ev;
 		char path[PATH_MAX + 1];
+		bool found = false;
 		ssize_t n;
 		int fd;
 
-		ev = (struct inotify_event *)buf;
-
 		if ((fd = inotify_init1(IN_CLOEXEC)) < 0) {
 			fprintf(stderr, "inotify_init1: %i\n", errno);
 			_exit(1);
@@ -130,6 +130,8 @@ int main(int argc, char **argv)
 		}
 
 		do {
+			char *p;
+
 			n = read(fd, buf, sizeof(buf));
 			if (n < 0) {
 				fprintf(stderr, "inotify read: %i", errno);
@@ -138,11 +140,27 @@ int main(int argc, char **argv)
 
 			if (n < (ssize_t)sizeof(*ev)) {
 				fprintf(stderr, "Short inotify read: %zi", n);
-				_exit(1);
+				continue;
 			}
-		} while (ev->len < REPAIR_EXT_LEN ||
-			 memcmp(ev->name + strlen(ev->name) - REPAIR_EXT_LEN,
-				REPAIR_EXT, REPAIR_EXT_LEN));
+
+			for (p = buf; p < buf + n; p += sizeof(*ev) + ev->len) {
+				ev = (const struct inotify_event *)p;
+
+				if (ev->len >= REPAIR_EXT_LEN &&
+				    !memcmp(ev->name +
+					    strnlen(ev->name, ev->len) -
+					    REPAIR_EXT_LEN,
+					    REPAIR_EXT, REPAIR_EXT_LEN)) {
+					found = true;
+					break;
+				}
+			}
+		} while (!found);
+
+		if (ev->len > NAME_MAX + 1 || ev->name[ev->len] != '\0') {
+			fprintf(stderr, "Invalid filename from inotify\n");
+			_exit(1);
+		}
 
 		snprintf(path, sizeof(path), "%s/%s", argv[1], ev->name);
 		if ((stat(path, &sb))) {
diff --git a/pasta.c b/pasta.c
index fa3e7de..017fa32 100644
--- a/pasta.c
+++ b/pasta.c
@@ -498,17 +498,23 @@ void pasta_netns_quit_init(const struct ctx *c)
  */
 void pasta_netns_quit_inotify_handler(struct ctx *c, int inotify_fd)
 {
-	char buf[sizeof(struct inotify_event) + NAME_MAX + 1];
-	const struct inotify_event *in_ev = (struct inotify_event *)buf;
+	char buf[sizeof(struct inotify_event) + NAME_MAX + 1]
+		__attribute__ ((aligned(__alignof__(struct inotify_event))));
+	const struct inotify_event *ev;
+	ssize_t n;
+	char *p;
 
-	if (read(inotify_fd, buf, sizeof(buf)) < (ssize_t)sizeof(*in_ev))
+	if ((n = read(inotify_fd, buf, sizeof(buf))) < (ssize_t)sizeof(*ev))
 		return;
 
-	if (strncmp(in_ev->name, c->netns_base, sizeof(c->netns_base)))
-		return;
+	for (p = buf; p < buf + n; p += sizeof(*ev) + ev->len) {
+		ev = (const struct inotify_event *)p;
 
-	info("Namespace %s is gone, exiting", c->netns_base);
-	_exit(EXIT_SUCCESS);
+		if (!strncmp(ev->name, c->netns_base, sizeof(c->netns_base))) {
+			info("Namespace %s is gone, exiting", c->netns_base);
+			_exit(EXIT_SUCCESS);
+		}
+	}
 }
 
 /**

From 025a3c2686b06be3fd09e29b2e3408d2c4ad6239 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Fri, 28 Mar 2025 14:34:14 +1100
Subject: [PATCH 320/382] udp: Don't attempt to forward ICMP socket errors to
 other sockets

Recently we added support for detecting ICMP triggered errors on UDP
sockets and forwarding them to the tap interface.  However, in
udp_sock_recverr() where this is handled we don't know for certain that
the tap interface is the other side of the UDP flow.  It could be a spliced
connection with another socket on the other side.

To forward errors in that case, we'd need to force the other side's socket
to trigger an ICMP error.  I'm not sure if there's a way to do that;
probably not for an arbitrary ICMP but it might be possible for certain
error conditions.

Nonetheless what we do now - synthesise an ICMP on the tap interface - is
certainly wrong.  It's probably harmless; for a spliced connection it will
have loopback addresses, meaning we can expect the guest to discard it.
But correct this for now by not attempting to propagate errors when the
other side of the flow is a socket.

Fixes: 55431f0077b6 ("udp: create and send ICMPv4 to local peer when applicable")
Fixes: 68b04182e07d ("udp: create and send ICMPv6 to local peer when applicable")
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Acked-by: Jon Maloy <jmaloy@redhat.com>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 udp.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/udp.c b/udp.c
index 0c223b4..e410f55 100644
--- a/udp.c
+++ b/udp.c
@@ -560,7 +560,10 @@ static int udp_sock_recverr(const struct ctx *c, union epoll_ref ref)
 		const struct flowside *toside = flowside_at_sidx(sidx);
 		size_t dlen = rc;
 
-		if (hdr->cmsg_level == IPPROTO_IP) {
+		if (pif_is_socket(pif_at_sidx(sidx))) {
+			/* XXX Is there any way to propagate ICMPs from socket
+			 * to socket? */
+		} else if (hdr->cmsg_level == IPPROTO_IP) {
 			dlen = MIN(dlen, ICMP4_MAX_DLEN);
 			udp_send_conn_fail_icmp4(c, &eh->ee, toside,
 						 eh->saddr.sa4.sin_addr,

From 3de5af6e4145c6971be2597d7fb0386332d44a45 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Fri, 28 Mar 2025 14:34:15 +1100
Subject: [PATCH 321/382] udp: Improve name of UDP related ICMP sending
 functions

udp_send_conn_fail_icmp[46]() aren't actually specific to connections
failing: they can propagate a variety of ICMP errors, which might or might
not break a "connection".  They are, however, specific to sending ICMP
errors to the tap interface, not splice or host.  Rename them to better
reflect that.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Acked-by: Jon Maloy <jmaloy@redhat.com>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 udp.c | 35 +++++++++++++++++------------------
 1 file changed, 17 insertions(+), 18 deletions(-)

diff --git a/udp.c b/udp.c
index e410f55..39431d7 100644
--- a/udp.c
+++ b/udp.c
@@ -411,7 +411,7 @@ static void udp_tap_prepare(const struct mmsghdr *mmh,
 }
 
 /**
- * udp_send_conn_fail_icmp4() - Construct and send ICMPv4 to local peer
+ * udp_send_tap_icmp4() - Construct and send ICMPv4 to local peer
  * @c:		Execution context
  * @ee:	Extended error descriptor
  * @toside:	Destination side of flow
@@ -419,11 +419,11 @@ static void udp_tap_prepare(const struct mmsghdr *mmh,
  * @in:	First bytes (max 8) of original UDP message body
  * @dlen:	Length of the read part of original UDP message body
  */
-static void udp_send_conn_fail_icmp4(const struct ctx *c,
-				     const struct sock_extended_err *ee,
-				     const struct flowside *toside,
-				     struct in_addr saddr,
-				     const void *in, size_t dlen)
+static void udp_send_tap_icmp4(const struct ctx *c,
+			       const struct sock_extended_err *ee,
+			       const struct flowside *toside,
+			       struct in_addr saddr,
+			       const void *in, size_t dlen)
 {
 	struct in_addr oaddr = toside->oaddr.v4mapped.a4;
 	struct in_addr eaddr = toside->eaddr.v4mapped.a4;
@@ -455,7 +455,7 @@ static void udp_send_conn_fail_icmp4(const struct ctx *c,
 
 
 /**
- * udp_send_conn_fail_icmp6() - Construct and send ICMPv6 to local peer
+ * udp_send_tap_icmp6() - Construct and send ICMPv6 to local peer
  * @c:		Execution context
  * @ee:	Extended error descriptor
  * @toside:	Destination side of flow
@@ -464,11 +464,11 @@ static void udp_send_conn_fail_icmp4(const struct ctx *c,
  * @dlen:	Length of the read part of original UDP message body
  * @flow:	IPv6 flow identifier
  */
-static void udp_send_conn_fail_icmp6(const struct ctx *c,
-				     const struct sock_extended_err *ee,
-				     const struct flowside *toside,
-				     const struct in6_addr *saddr,
-				     void *in, size_t dlen, uint32_t flow)
+static void udp_send_tap_icmp6(const struct ctx *c,
+			       const struct sock_extended_err *ee,
+			       const struct flowside *toside,
+			       const struct in6_addr *saddr,
+			       void *in, size_t dlen, uint32_t flow)
 {
 	const struct in6_addr *oaddr = &toside->oaddr.a6;
 	const struct in6_addr *eaddr = &toside->eaddr.a6;
@@ -565,13 +565,12 @@ static int udp_sock_recverr(const struct ctx *c, union epoll_ref ref)
 			 * to socket? */
 		} else if (hdr->cmsg_level == IPPROTO_IP) {
 			dlen = MIN(dlen, ICMP4_MAX_DLEN);
-			udp_send_conn_fail_icmp4(c, &eh->ee, toside,
-						 eh->saddr.sa4.sin_addr,
-						 data, dlen);
+			udp_send_tap_icmp4(c, &eh->ee, toside,
+					   eh->saddr.sa4.sin_addr, data, dlen);
 		} else if (hdr->cmsg_level == IPPROTO_IPV6) {
-			udp_send_conn_fail_icmp6(c, &eh->ee, toside,
-						 &eh->saddr.sa6.sin6_addr,
-						 data, dlen, sidx.flowi);
+			udp_send_tap_icmp6(c, &eh->ee, toside,
+					   &eh->saddr.sa6.sin6_addr, data,
+					   dlen, sidx.flowi);
 		}
 	} else {
 		trace("Ignoring received IP_RECVERR cmsg on listener socket");

From 2ed2d59def758b049f42e7c75bfb48957a73bd39 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Wed, 2 Apr 2025 14:13:16 +1100
Subject: [PATCH 322/382] platform requirements: Fix clang-tidy warning

Recent clang-tidy versions complain about enums defined with some but not
all entries given explicit values.  I'm not entirely convinced about
whether that's a useful warning, but in any case we really don't need the
explicit values in doc/platform-requirements/reuseaddr-priority.c, so
remove them to make clang happy.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 doc/platform-requirements/reuseaddr-priority.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/doc/platform-requirements/reuseaddr-priority.c b/doc/platform-requirements/reuseaddr-priority.c
index 701b6ff..af39a39 100644
--- a/doc/platform-requirements/reuseaddr-priority.c
+++ b/doc/platform-requirements/reuseaddr-priority.c
@@ -46,13 +46,13 @@
 /* Different cases for receiving socket configuration */
 enum sock_type {
 	/* Socket is bound to 0.0.0.0:DSTPORT and not connected */
-	SOCK_BOUND_ANY = 0,
+	SOCK_BOUND_ANY,
 
 	/* Socket is bound to 127.0.0.1:DSTPORT and not connected */
-	SOCK_BOUND_LO = 1,
+	SOCK_BOUND_LO,
 
 	/* Socket is bound to 0.0.0.0:DSTPORT and connected to 127.0.0.1:SRCPORT */
-	SOCK_CONNECTED = 2,
+	SOCK_CONNECTED,
 
 	NUM_SOCK_TYPES,
 };

From 8e32881ef1d6d5867223a164052f8ff39d4ebb4e Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Wed, 2 Apr 2025 14:13:17 +1100
Subject: [PATCH 323/382] platform requirements: Add attributes to die()
 function

Add both format string and ((noreturn)) attributes to the version of die()
used in the test programs in doc/platform-requirements.  As well as
potentially catching problems in format strings, this means that the
compiler and static checkers can properly reason about the fact that it
will exit, preventing bogus warnings.
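
As a sketch of the effect on a hypothetical caller (not part of this
change): without the noreturn attribute the compiler can't tell that
code after die() is unreachable, and without the format attribute it
can't check the arguments against the format string:

    static int parse_port(const char *s)
    {
            long v = strtol(s, NULL, 10);

            if (v >= 1 && v <= 65535)
                    return v;

            die("invalid port: %s\n", s);   /* arguments now checked */
            /* without noreturn, the compiler would warn that control
             * reaches the end of a non-void function here */
    }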

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 doc/platform-requirements/common.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/doc/platform-requirements/common.h b/doc/platform-requirements/common.h
index 8844b1e..e85fc2b 100644
--- a/doc/platform-requirements/common.h
+++ b/doc/platform-requirements/common.h
@@ -15,6 +15,7 @@
 #include <stdio.h>
 #include <stdlib.h>
 
+__attribute__((format(printf, 1, 2), noreturn))
 static inline void die(const char *fmt, ...)
 {
 	va_list ap;

From 6bfc60b09522bd6f47660b835f0681977a28e1de Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Wed, 2 Apr 2025 14:13:18 +1100
Subject: [PATCH 324/382] platform requirements: Add test for address conflicts
 with TCP_REPAIR

Simple test program to check the behaviour we need for bind() address
conflicts between listening sockets and repair mode sockets.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 doc/platform-requirements/.gitignore         |   1 +
 doc/platform-requirements/Makefile           |   4 +-
 doc/platform-requirements/listen-vs-repair.c | 128 +++++++++++++++++++
 3 files changed, 131 insertions(+), 2 deletions(-)
 create mode 100644 doc/platform-requirements/listen-vs-repair.c

diff --git a/doc/platform-requirements/.gitignore b/doc/platform-requirements/.gitignore
index 3b5a10a..f6272cf 100644
--- a/doc/platform-requirements/.gitignore
+++ b/doc/platform-requirements/.gitignore
@@ -1,3 +1,4 @@
+/listen-vs-repair
 /reuseaddr-priority
 /recv-zero
 /udp-close-dup
diff --git a/doc/platform-requirements/Makefile b/doc/platform-requirements/Makefile
index 6a7d374..83930ef 100644
--- a/doc/platform-requirements/Makefile
+++ b/doc/platform-requirements/Makefile
@@ -3,8 +3,8 @@
 # Copyright Red Hat
 # Author: David Gibson <david@gibson.dropbear.id.au>
 
-TARGETS = reuseaddr-priority recv-zero udp-close-dup
-SRCS = reuseaddr-priority.c recv-zero.c udp-close-dup.c
+TARGETS = reuseaddr-priority recv-zero udp-close-dup listen-vs-repair
+SRCS = reuseaddr-priority.c recv-zero.c udp-close-dup.c listen-vs-repair.c
 CFLAGS = -Wall
 
 all: cppcheck clang-tidy $(TARGETS:%=check-%)
diff --git a/doc/platform-requirements/listen-vs-repair.c b/doc/platform-requirements/listen-vs-repair.c
new file mode 100644
index 0000000..d31fe3f
--- /dev/null
+++ b/doc/platform-requirements/listen-vs-repair.c
@@ -0,0 +1,128 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+/* listen-vs-repair.c
+ *
+ * Do listening sockets have address conflicts with sockets under repair
+ * ====================================================================
+ *
+ * When we accept() an incoming connection the accept()ed socket will have the
+ * same local address as the listening socket.  This can be a complication on
+ * migration.  On the migration target we've already set up listening sockets
+ * according to the command line.  However to restore connections that we're
+ * migrating in we need to bind the new sockets to the same address, which would
+ * be an address conflict on the face of it.  This test program verifies that
+ * enabling repair mode before bind() correctly suppresses that conflict.
+ *
+ * Copyright Red Hat
+ * Author: David Gibson <david@gibson.dropbear.id.au>
+ */
+
+/* NOLINTNEXTLINE(bugprone-reserved-identifier,cert-dcl37-c,cert-dcl51-cpp) */
+#define _GNU_SOURCE
+
+#include <arpa/inet.h>
+#include <errno.h>
+#include <linux/netlink.h>
+#include <linux/rtnetlink.h>
+#include <net/if.h>
+#include <netinet/in.h>
+#include <netinet/tcp.h>
+#include <sched.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+#include "common.h"
+
+#define PORT	13256U
+#define CPORT	13257U
+
+/* 127.0.0.1:PORT */
+static const struct sockaddr_in addr = SOCKADDR_INIT(INADDR_LOOPBACK, PORT);
+
+/* 127.0.0.1:CPORT */
+static const struct sockaddr_in caddr = SOCKADDR_INIT(INADDR_LOOPBACK, CPORT);
+
+/* Put ourselves into a network sandbox */
+static void net_sandbox(void)
+{
+	/* NOLINTNEXTLINE(altera-struct-pack-align) */
+	const struct req_t {
+		struct nlmsghdr nlh;
+		struct ifinfomsg ifm;
+	} __attribute__((packed)) req = {
+		.nlh.nlmsg_type		= RTM_NEWLINK,
+		.nlh.nlmsg_flags	= NLM_F_REQUEST,
+		.nlh.nlmsg_len		= sizeof(req),
+		.nlh.nlmsg_seq		= 1,
+		.ifm.ifi_family		= AF_UNSPEC,
+                .ifm.ifi_index		= 1,
+                .ifm.ifi_flags		= IFF_UP,
+                .ifm.ifi_change		= IFF_UP,
+	};
+	int nl;
+
+	if (unshare(CLONE_NEWUSER | CLONE_NEWNET))
+		die("unshare(): %s\n", strerror(errno));
+
+	/* Bring up lo in the new netns */
+	nl = socket(AF_NETLINK, SOCK_RAW | SOCK_CLOEXEC, NETLINK_ROUTE);
+	if (nl < 0)
+		die("Can't create netlink socket: %s\n", strerror(errno));
+
+	if (send(nl, &req, sizeof(req), 0) < 0)
+		die("Netlink send(): %s\n", strerror(errno));
+	close(nl);
+}
+
+static void check(void)
+{
+	int s1, s2, op;
+
+	s1 = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP);
+	if (s1 < 0)
+		die("socket() 1: %s\n", strerror(errno));
+
+	if (bind(s1, (struct sockaddr *)&addr, sizeof(addr)))
+		die("bind() 1: %s\n", strerror(errno));
+
+	if (listen(s1, 0))
+		die("listen(): %s\n", strerror(errno));
+
+	s2 = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP);
+	if (s2 < 0)
+		die("socket() 2: %s\n", strerror(errno));
+
+	op = TCP_REPAIR_ON;
+	if (setsockopt(s2, SOL_TCP, TCP_REPAIR, &op, sizeof(op)))
+		die("TCP_REPAIR: %s\n", strerror(errno));
+
+	if (bind(s2, (struct sockaddr *)&addr, sizeof(addr)))
+		die("bind() 2: %s\n", strerror(errno));
+
+	if (connect(s2, (struct sockaddr *)&caddr, sizeof(caddr)))
+		die("connect(): %s\n", strerror(errno));
+
+	op = TCP_REPAIR_OFF_NO_WP;
+	if (setsockopt(s2, SOL_TCP, TCP_REPAIR, &op, sizeof(op)))
+		die("TCP_REPAIR: %s\n", strerror(errno));
+
+	close(s1);
+	close(s2);
+}
+
+int main(int argc, char *argv[])
+{
+	(void)argc;
+	(void)argv;
+
+	net_sandbox();
+
+	check();
+
+	printf("Repair mode appears to properly suppress conflicts with listening sockets\n");
+
+	exit(0);
+}

From dec3d73e1e8e007d05f9dce9a48aca7cb8532992 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Wed, 2 Apr 2025 14:13:19 +1100
Subject: [PATCH 325/382] migrate, tcp: bind() migrated sockets in repair mode

Currently on a migration target, we create then immediately bind() new
sockets for the TCP connections we're reconstructing.  Mostly, this works,
since a socket() that is bound but hasn't had listen() or connect() called
is essentially passive.  However, this bind() is subject to the usual
address conflict checking.  In particular that means if we already have
a listening socket on that port, we'll get an EADDRINUSE.  This will happen
for every connection we try to migrate that was initiated from outside to
the guest, since we necessarily created a listening socket for that case.

We set SO_REUSEADDR on the socket in an attempt to avoid this, but that's
not sufficient; even with SO_REUSEADDR address conflicts are still
prohibited for listening sockets.  Of course once these incoming sockets
are fully repaired and connect()ed they'll no longer conflict, but that
doesn't help us if we fail at the bind().

We can avoid this by not calling bind() until we're already in repair mode,
which suppresses this transient conflict.  Because of the batching of
setting repair mode, to do that we need to move the bind to a step in
tcp_flow_migrate_target_ext().
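
The ordering that matters is roughly the following (sketch only: 's',
'a' and 'peer' are placeholders, and the real code goes through
tcp_flow_repair_on() and the batched repair helpers rather than setting
TCP_REPAIR directly):

    int on = TCP_REPAIR_ON;

    s = socket(af, SOCK_STREAM | SOCK_NONBLOCK | SOCK_CLOEXEC, IPPROTO_TCP);
    setsockopt(s, SOL_TCP, TCP_REPAIR, &on, sizeof(on)); /* before bind() */
    bind(s, &a.sa, sizeof(a));       /* no EADDRINUSE against the listener */
    connect(s, &peer.sa, sizeof(peer)); /* repair mode: no SYN is sent */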

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 tcp.c | 38 +++++++++++++++++++++++++++-----------
 1 file changed, 27 insertions(+), 11 deletions(-)

diff --git a/tcp.c b/tcp.c
index fa1d885..35626c9 100644
--- a/tcp.c
+++ b/tcp.c
@@ -3414,13 +3414,8 @@ fail:
 static int tcp_flow_repair_socket(struct ctx *c, struct tcp_tap_conn *conn)
 {
 	sa_family_t af = CONN_V4(conn) ? AF_INET : AF_INET6;
-	const struct flowside *sockside = HOSTFLOW(conn);
-	union sockaddr_inany a;
-	socklen_t sl;
 	int s, rc;
 
-	pif_sockaddr(c, &a, &sl, PIF_HOST, &sockside->oaddr, sockside->oport);
-
 	if ((conn->sock = socket(af, SOCK_STREAM | SOCK_NONBLOCK | SOCK_CLOEXEC,
 				 IPPROTO_TCP)) < 0) {
 		rc = -errno;
@@ -3435,12 +3430,6 @@ static int tcp_flow_repair_socket(struct ctx *c, struct tcp_tap_conn *conn)
 
 	tcp_sock_set_nodelay(s);
 
-	if (bind(s, &a.sa, sizeof(a))) {
-		rc = -errno;
-		flow_perror(conn, "Failed to bind socket for migrated flow");
-		goto err;
-	}
-
 	if ((rc = tcp_flow_repair_on(c, conn)))
 		goto err;
 
@@ -3452,6 +3441,30 @@ err:
 	return rc;
 }
 
+/**
+ * tcp_flow_repair_bind() - Bind socket in repair mode
+ * @c:		Execution context
+ * @conn:	Pointer to the TCP connection structure
+ *
+ * Return: 0 on success, negative error code on failure
+ */
+static int tcp_flow_repair_bind(const struct ctx *c, struct tcp_tap_conn *conn)
+{
+	const struct flowside *sockside = HOSTFLOW(conn);
+	union sockaddr_inany a;
+	socklen_t sl;
+
+	pif_sockaddr(c, &a, &sl, PIF_HOST, &sockside->oaddr, sockside->oport);
+
+	if (bind(conn->sock, &a.sa, sizeof(a))) {
+		int rc = -errno;
+		flow_perror(conn, "Failed to bind socket for migrated flow");
+		return rc;
+	}
+
+	return 0;
+}
+
 /**
  * tcp_flow_repair_connect() - Connect socket in repair mode, then turn it off
  * @c:		Execution context
@@ -3618,6 +3631,9 @@ int tcp_flow_migrate_target_ext(struct ctx *c, struct tcp_tap_conn *conn, int fd
 		/* We weren't able to create the socket, discard flow */
 		goto fail;
 
+	if (tcp_flow_repair_bind(c, conn))
+		goto fail;
+
 	if (tcp_flow_repair_timestamp(conn, &t))
 		goto fail;
 

From 3d41e4d8389578e5d5f3cf2e47b9ff9cdd29ffd1 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Wed, 2 Apr 2025 15:43:40 +1100
Subject: [PATCH 326/382] passt-repair: Correct off-by-one error verifying name

passt-repair will generate an error if the name it gets from the kernel is
too long or not NUL terminated.  Downstream testing has reported
occasionally seeing this error in practice.

It turns out there is a trivial off-by-one error in the check: ev->len is
the length of the name, including the terminating \0 characters, so to
check for a \0 at the end of the buffer we need to check
ev->name[ev->len - 1], not ev->name[ev->len].
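
That is, with ev pointing at the inotify_event we just read, the check
becomes (as in the hunk below, with the reasoning spelled out as a comment):

	/* ev->len counts the bytes of ev->name including its NUL padding, so
	 * the last byte of the name buffer is ev->name[ev->len - 1], while
	 * ev->name[ev->len] is already one past its end.
	 */
	if (ev->len > NAME_MAX + 1 || ev->name[ev->len - 1] != '\0') {
		fprintf(stderr, "Invalid filename from inotify\n");
		_exit(1);
	}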

Fixes: 42a854a52b6f ("pasta, passt-repair: Support multiple events per read() in inotify handlers")
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 passt-repair.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/passt-repair.c b/passt-repair.c
index 86f0293..440c77a 100644
--- a/passt-repair.c
+++ b/passt-repair.c
@@ -157,7 +157,7 @@ int main(int argc, char **argv)
 			}
 		} while (!found);
 
-		if (ev->len > NAME_MAX + 1 || ev->name[ev->len] != '\0') {
+		if (ev->len > NAME_MAX + 1 || ev->name[ev->len - 1] != '\0') {
 			fprintf(stderr, "Invalid filename from inotify\n");
 			_exit(1);
 		}

From 8aa2d90c8d95d0fa1dad7027fdf92b48a1bbf3c6 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Tue, 1 Apr 2025 19:57:08 +1100
Subject: [PATCH 327/382] udp: Remove redundant udp_at_sidx() call in
 udp_tap_handler()

We already have a pointer to the UDP flow in the variable uflow, so we can
just re-use it.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 udp.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/udp.c b/udp.c
index 39431d7..ac168db 100644
--- a/udp.c
+++ b/udp.c
@@ -907,7 +907,7 @@ int udp_tap_handler(const struct ctx *c, uint8_t pif,
 	}
 	toside = flowside_at_sidx(tosidx);
 
-	s = udp_at_sidx(tosidx)->s[tosidx.sidei];
+	s = uflow->s[tosidx.sidei];
 	ASSERT(s >= 0);
 
 	pif_sockaddr(c, &to_sa, &sl, topif, &toside->eaddr, toside->eport);

From 76e554d9ec8dc80c1856621e17e45be811d198d0 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Tue, 1 Apr 2025 19:57:09 +1100
Subject: [PATCH 328/382] udp: Simplify updates to UDP flow timestamp

Since UDP has no built-in knowledge of connections, the only way we
know when we're done with a UDP flow is a timeout with no activity.
To keep track of this, struct udp_flow includes a timestamp recording
the last time we saw traffic on the flow.

For data from listening sockets and from tap, this is done implicitly via
udp_flow_from_{sock,tap}() but for reply sockets it's done explicitly.
However, that logic is duplicated between the vhost-user and "buf" paths.
Make it common in udp_reply_sock_handler() instead.

Technically this is a behavioural change: previously, if we got an EPOLLIN
event but there wasn't actually any data, we wouldn't update the timestamp;
now we will.  This should be harmless: if there's an EPOLLIN we expect
there to be data, and even if there isn't, the worst we can do is mildly
delay the cleanup of a stale flow.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 udp.c    | 15 ++++++---------
 udp_vu.c |  9 +--------
 udp_vu.h |  3 +--
 3 files changed, 8 insertions(+), 19 deletions(-)

diff --git a/udp.c b/udp.c
index ac168db..44b58d1 100644
--- a/udp.c
+++ b/udp.c
@@ -758,27 +758,21 @@ void udp_listen_sock_handler(const struct ctx *c,
  * @c:		Execution context
  * @s:		Socket to read data from
  * @tosidx:	Flow & side to forward data from @s to
- * @now:	Current timestamp
  *
  * Return: true on success, false if can't forward from socket to flow's pif
  *
  * #syscalls recvmmsg
  */
 static bool udp_buf_reply_sock_data(const struct ctx *c,
-				    int s, flow_sidx_t tosidx,
-				    const struct timespec *now)
+				    int s, flow_sidx_t tosidx)
 {
 	const struct flowside *toside = flowside_at_sidx(tosidx);
-	struct udp_flow *uflow = udp_at_sidx(tosidx);
 	uint8_t topif = pif_at_sidx(tosidx);
 	int n, i;
 
 	if ((n = udp_sock_recv(c, s, udp_mh_recv)) <= 0)
 		return true;
 
-	flow_trace(uflow, "Received %d datagrams on reply socket", n);
-	uflow->ts = now->tv_sec;
-
 	for (i = 0; i < n; i++) {
 		if (pif_is_socket(topif))
 			udp_splice_prepare(udp_mh_recv, i);
@@ -825,10 +819,13 @@ void udp_reply_sock_handler(const struct ctx *c, union epoll_ref ref,
 		int s = ref.fd;
 		bool ret;
 
+		flow_trace(uflow, "Received data on reply socket");
+		uflow->ts = now->tv_sec;
+
 		if (c->mode == MODE_VU)
-			ret = udp_vu_reply_sock_data(c, s, tosidx, now);
+			ret = udp_vu_reply_sock_data(c, s, tosidx);
 		else
-			ret = udp_buf_reply_sock_data(c, s, tosidx, now);
+			ret = udp_buf_reply_sock_data(c, s, tosidx);
 
 		if (!ret) {
 			flow_err(uflow, "Unable to forward UDP");
diff --git a/udp_vu.c b/udp_vu.c
index 06bdeae..4153b6c 100644
--- a/udp_vu.c
+++ b/udp_vu.c
@@ -275,22 +275,17 @@ void udp_vu_listen_sock_data(const struct ctx *c, union epoll_ref ref,
  * @c:		Execution context
  * @s:		Socket to read data from
  * @tosidx:	Flow & side to forward data from @s to
- * @now:	Current timestamp
  *
  * Return: true on success, false if can't forward from socket to flow's pif
  */
-bool udp_vu_reply_sock_data(const struct ctx *c, int s, flow_sidx_t tosidx,
-			    const struct timespec *now)
+bool udp_vu_reply_sock_data(const struct ctx *c, int s, flow_sidx_t tosidx)
 {
 	const struct flowside *toside = flowside_at_sidx(tosidx);
 	bool v6 = !(inany_v4(&toside->eaddr) && inany_v4(&toside->oaddr));
-	struct udp_flow *uflow = udp_at_sidx(tosidx);
 	struct vu_dev *vdev = c->vdev;
 	struct vu_virtq *vq = &vdev->vq[VHOST_USER_RX_QUEUE];
 	int i;
 
-	ASSERT(uflow);
-
 	if (pif_at_sidx(tosidx) != PIF_TAP)
 		return false;
 
@@ -301,8 +296,6 @@ bool udp_vu_reply_sock_data(const struct ctx *c, int s, flow_sidx_t tosidx,
 		iov_used = udp_vu_sock_recv(c, s, v6, &dlen);
 		if (iov_used <= 0)
 			break;
-		flow_trace(uflow, "Received 1 datagram on reply socket");
-		uflow->ts = now->tv_sec;
 
 		udp_vu_prepare(c, toside, dlen);
 		if (*c->pcap) {
diff --git a/udp_vu.h b/udp_vu.h
index 2299b51..6d541a4 100644
--- a/udp_vu.h
+++ b/udp_vu.h
@@ -8,7 +8,6 @@
 
 void udp_vu_listen_sock_data(const struct ctx *c, union epoll_ref ref,
 			     const struct timespec *now);
-bool udp_vu_reply_sock_data(const struct ctx *c, int s, flow_sidx_t tosidx,
-			    const struct timespec *now);
+bool udp_vu_reply_sock_data(const struct ctx *c, int s, flow_sidx_t tosidx);
 
 #endif /* UDP_VU_H */

From 684870a766e7f024a5720464ad070e666cb4793e Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Tue, 1 Apr 2025 19:57:10 +1100
Subject: [PATCH 329/382] udp: Correct some seccomp filter annotations

Both udp_buf_listen_sock_data() and udp_buf_reply_sock_data() have comments
stating they use recvmmsg().  That's not correct, they only do so via
udp_sock_recv() which lists recvmmsg() itself.

In contrast udp_splice_send() and udp_tap_handler() both directly use
sendmmsg(), but only the latter lists it.  Add it to the former as well.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 udp.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/udp.c b/udp.c
index 44b58d1..ab3e9d2 100644
--- a/udp.c
+++ b/udp.c
@@ -272,6 +272,8 @@ static void udp_splice_prepare(struct mmsghdr *mmh, unsigned idx)
  * @dst:	Destination port for datagrams (target side)
  * @ref:	epoll reference for origin socket
  * @now:	Timestamp
+ *
+ * #syscalls sendmmsg
  */
 static void udp_splice_send(const struct ctx *c, size_t start, size_t n,
 			    flow_sidx_t tosidx)
@@ -662,8 +664,6 @@ static int udp_sock_recv(const struct ctx *c, int s, struct mmsghdr *mmh)
  * @c:		Execution context
  * @ref:	epoll reference
  * @now:	Current timestamp
- *
- * #syscalls recvmmsg
  */
 static void udp_buf_listen_sock_data(const struct ctx *c, union epoll_ref ref,
 				     const struct timespec *now)
@@ -760,8 +760,6 @@ void udp_listen_sock_handler(const struct ctx *c,
  * @tosidx:	Flow & side to forward data from @s to
  *
  * Return: true on success, false if can't forward from socket to flow's pif
- *
- * #syscalls recvmmsg
  */
 static bool udp_buf_reply_sock_data(const struct ctx *c,
 				    int s, flow_sidx_t tosidx)

From 06784d7fc6761528d587837b241d27c6d17c0842 Mon Sep 17 00:00:00 2001
From: Stefano Brivio <sbrivio@redhat.com>
Date: Thu, 3 Apr 2025 19:01:02 +0200
Subject: [PATCH 330/382] passt-repair: Ensure that read buffer is
 NULL-terminated

After 3d41e4d83895 ("passt-repair: Correct off-by-one error verifying
name"), Coverity Scan is no longer convinced that the ev->name used in
the snprintf() is NULL-terminated.

It comes from a read() call, and read() of course doesn't terminate
it, but we already check that the byte at ev->len - 1 is a NULL
terminator, so this is actually a false positive.

In any case, the logic ensuring that ev->name is NULL-terminated isn't
necessarily obvious, and additionally checking that the last byte in
the buffer we read is a NULL terminator is harmless, so do that
explicitly, even if it's redundant.

Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
---
 passt-repair.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/passt-repair.c b/passt-repair.c
index 440c77a..256a8c9 100644
--- a/passt-repair.c
+++ b/passt-repair.c
@@ -137,6 +137,7 @@ int main(int argc, char **argv)
 				fprintf(stderr, "inotify read: %i", errno);
 				_exit(1);
 			}
+			buf[n - 1] = '\0';
 
 			if (n < (ssize_t)sizeof(*ev)) {
 				fprintf(stderr, "Short inotify read: %zi", n);

From a7775e9550fa698e4af1322f6ef63924c24d1fab Mon Sep 17 00:00:00 2001
From: Jon Maloy <jmaloy@redhat.com>
Date: Sat, 5 Apr 2025 15:21:26 -0400
Subject: [PATCH 331/382] udp: support traceroute in direction tap-socket

Now that ICMP pass-through from socket to tap is in place, it is
easy to support UDP-based traceroute functionality in the
tap-to-socket direction.

This commit adds that support.
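
The mechanism, condensed from the udp_tap_handler() hunk below for the IPv4
case (the IPv6 path does the same with IPV6_UNICAST_HOPS and an int-sized
value):

	if (ttl != uflow->ttl[tosidx.sidei]) {
		uflow->ttl[tosidx.sidei] = ttl;
		/* Make outgoing datagrams carry the guest's TTL, so the host
		 * network generates the ICMP Time Exceeded replies that
		 * traceroute relies on.
		 */
		if (setsockopt(s, IPPROTO_IP, IP_TTL, &ttl, sizeof(ttl)) < 0)
			flow_perror(uflow, "setsockopt IP_TTL");
	}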

Link: https://bugs.passt.top/show_bug.cgi?id=64
Signed-off-by: Jon Maloy <jmaloy@redhat.com>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 tap.c      | 17 +++++++++++++----
 udp.c      | 22 +++++++++++++++++++++-
 udp.h      |  3 ++-
 udp_flow.c |  1 +
 udp_flow.h |  4 +++-
 5 files changed, 40 insertions(+), 7 deletions(-)

diff --git a/tap.c b/tap.c
index 3a6fcbe..d630f6d 100644
--- a/tap.c
+++ b/tap.c
@@ -559,6 +559,7 @@ PACKET_POOL_DECL(pool_l4, UIO_MAXIOV, pkt_buf);
  * struct l4_seq4_t - Message sequence for one protocol handler call, IPv4
  * @msgs:	Count of messages in sequence
  * @protocol:	Protocol number
+ * @ttl:	Time to live
  * @source:	Source port
  * @dest:	Destination port
  * @saddr:	Source address
@@ -567,6 +568,7 @@ PACKET_POOL_DECL(pool_l4, UIO_MAXIOV, pkt_buf);
  */
 static struct tap4_l4_t {
 	uint8_t protocol;
+	uint8_t ttl;
 
 	uint16_t source;
 	uint16_t dest;
@@ -586,6 +588,7 @@ static struct tap4_l4_t {
  * @dest:	Destination port
  * @saddr:	Source address
  * @daddr:	Destination address
+ * @hop_limit:	Hop limit
  * @msg:	Array of messages that can be handled in a single call
  */
 static struct tap6_l4_t {
@@ -598,6 +601,8 @@ static struct tap6_l4_t {
 	struct in6_addr saddr;
 	struct in6_addr daddr;
 
+	uint8_t hop_limit;
+
 	struct pool_l4_t p;
 } tap6_l4[TAP_SEQS /* Arbitrary: TAP_MSGS in theory, so limit in users */];
 
@@ -786,7 +791,8 @@ resume:
 #define L4_MATCH(iph, uh, seq)							\
 	((seq)->protocol == (iph)->protocol &&					\
 	 (seq)->source   == (uh)->source    && (seq)->dest  == (uh)->dest &&	\
-	 (seq)->saddr.s_addr == (iph)->saddr && (seq)->daddr.s_addr == (iph)->daddr)
+	 (seq)->saddr.s_addr == (iph)->saddr &&				\
+	 (seq)->daddr.s_addr == (iph)->daddr && (seq)->ttl == (iph)->ttl)
 
 #define L4_SET(iph, uh, seq)						\
 	do {								\
@@ -795,6 +801,7 @@ resume:
 		(seq)->dest		= (uh)->dest;			\
 		(seq)->saddr.s_addr	= (iph)->saddr;			\
 		(seq)->daddr.s_addr	= (iph)->daddr;			\
+		(seq)->ttl		= (iph)->ttl;			\
 	} while (0)
 
 		if (seq && L4_MATCH(iph, uh, seq) && seq->p.count < UIO_MAXIOV)
@@ -843,7 +850,7 @@ append:
 			for (k = 0; k < p->count; )
 				k += udp_tap_handler(c, PIF_TAP, AF_INET,
 						     &seq->saddr, &seq->daddr,
-						     p, k, now);
+						     seq->ttl, p, k, now);
 		}
 	}
 
@@ -966,7 +973,8 @@ resume:
 		 (seq)->dest == (uh)->dest                 &&		\
 		 (seq)->flow_lbl == ip6_get_flow_lbl(ip6h) &&		\
 		 IN6_ARE_ADDR_EQUAL(&(seq)->saddr, saddr)  &&		\
-		 IN6_ARE_ADDR_EQUAL(&(seq)->daddr, daddr))
+		 IN6_ARE_ADDR_EQUAL(&(seq)->daddr, daddr)  &&		\
+		 (seq)->hop_limit == (ip6h)->hop_limit)
 
 #define L4_SET(ip6h, proto, uh, seq)					\
 	do {								\
@@ -976,6 +984,7 @@ resume:
 		(seq)->flow_lbl	= ip6_get_flow_lbl(ip6h);		\
 		(seq)->saddr	= *saddr;				\
 		(seq)->daddr	= *daddr;				\
+		(seq)->hop_limit = (ip6h)->hop_limit;			\
 	} while (0)
 
 		if (seq && L4_MATCH(ip6h, proto, uh, seq) &&
@@ -1026,7 +1035,7 @@ append:
 			for (k = 0; k < p->count; )
 				k += udp_tap_handler(c, PIF_TAP, AF_INET6,
 						     &seq->saddr, &seq->daddr,
-						     p, k, now);
+						     seq->hop_limit, p, k, now);
 		}
 	}
 
diff --git a/udp.c b/udp.c
index ab3e9d2..5a251df 100644
--- a/udp.c
+++ b/udp.c
@@ -844,6 +844,7 @@ fail:
  * @af:		Address family, AF_INET or AF_INET6
  * @saddr:	Source address
  * @daddr:	Destination address
+ * @ttl:	TTL or hop limit for packets to be sent in this call
  * @p:		Pool of UDP packets, with UDP headers
  * @idx:	Index of first packet to process
  * @now:	Current timestamp
@@ -854,7 +855,8 @@ fail:
  */
 int udp_tap_handler(const struct ctx *c, uint8_t pif,
 		    sa_family_t af, const void *saddr, const void *daddr,
-		    const struct pool *p, int idx, const struct timespec *now)
+		    uint8_t ttl, const struct pool *p, int idx,
+		    const struct timespec *now)
 {
 	const struct flowside *toside;
 	struct mmsghdr mm[UIO_MAXIOV];
@@ -933,6 +935,24 @@ int udp_tap_handler(const struct ctx *c, uint8_t pif,
 		mm[i].msg_hdr.msg_controllen = 0;
 		mm[i].msg_hdr.msg_flags = 0;
 
+		if (ttl != uflow->ttl[tosidx.sidei]) {
+			uflow->ttl[tosidx.sidei] = ttl;
+			if (af == AF_INET) {
+				if (setsockopt(s, IPPROTO_IP, IP_TTL,
+					       &ttl, sizeof(ttl)) < 0)
+					flow_perror(uflow,
+						    "setsockopt IP_TTL");
+			} else {
+				/* IPv6 hop_limit cannot be only 1 byte */
+				int hop_limit = ttl;
+
+				if (setsockopt(s, SOL_IPV6, IPV6_UNICAST_HOPS,
+					       &hop_limit, sizeof(hop_limit)) < 0)
+					flow_perror(uflow,
+						    "setsockopt IPV6_UNICAST_HOPS");
+			}
+		}
+
 		count++;
 	}
 
diff --git a/udp.h b/udp.h
index de2df6d..a811475 100644
--- a/udp.h
+++ b/udp.h
@@ -15,7 +15,8 @@ void udp_reply_sock_handler(const struct ctx *c, union epoll_ref ref,
 			    uint32_t events, const struct timespec *now);
 int udp_tap_handler(const struct ctx *c, uint8_t pif,
 		    sa_family_t af, const void *saddr, const void *daddr,
-		    const struct pool *p, int idx, const struct timespec *now);
+		    uint8_t ttl, const struct pool *p, int idx,
+		    const struct timespec *now);
 int udp_sock_init(const struct ctx *c, int ns, const union inany_addr *addr,
 		  const char *ifname, in_port_t port);
 int udp_init(struct ctx *c);
diff --git a/udp_flow.c b/udp_flow.c
index bf4b896..99ae490 100644
--- a/udp_flow.c
+++ b/udp_flow.c
@@ -137,6 +137,7 @@ static flow_sidx_t udp_flow_new(const struct ctx *c, union flow *flow,
 	uflow = FLOW_SET_TYPE(flow, FLOW_UDP, udp);
 	uflow->ts = now->tv_sec;
 	uflow->s[INISIDE] = uflow->s[TGTSIDE] = -1;
+	uflow->ttl[INISIDE] = uflow->ttl[TGTSIDE] = 0;
 
 	if (s_ini >= 0) {
 		/* When using auto port-scanning the listening port could go
diff --git a/udp_flow.h b/udp_flow.h
index 9a1b059..520de62 100644
--- a/udp_flow.h
+++ b/udp_flow.h
@@ -8,11 +8,12 @@
 #define UDP_FLOW_H
 
 /**
- * struct udp - Descriptor for a flow of UDP packets
+ * struct udp_flow - Descriptor for a flow of UDP packets
  * @f:		Generic flow information
  * @closed:	Flow is already closed
  * @ts:		Activity timestamp
  * @s:		Socket fd (or -1) for each side of the flow
+ * @ttl:	TTL or hop_limit for both sides
  */
 struct udp_flow {
 	/* Must be first element */
@@ -21,6 +22,7 @@ struct udp_flow {
 	bool closed :1;
 	time_t ts;
 	int s[SIDES];
+	uint8_t ttl[SIDES];
 };
 
 struct udp_flow *udp_at_sidx(flow_sidx_t sidx);

From d74b5a7c107006b95df6a69e5f1e6b9a373c7f53 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Fri, 4 Apr 2025 21:15:31 +1100
Subject: [PATCH 332/382] udp: Use connect()ed sockets for initiating side

Currently we have an asymmetry in how we handle UDP sockets.  For flows
where the target side is a socket, we create a new connect()ed socket
- the "reply socket" specifically for that flow used for sending and
receiving datagrams on that flow and only that flow.  For flows where the
initiating side is a socket, we continue to use the "listening" socket (or
rather, a dup() of it).  This has some disadvantages:

 * We need a hash lookup for every datagram on the listening socket in
   order to work out what flow it belongs to
 * The dup() keeps the socket alive even if automatic forwarding removes
   the listening socket.  However, the epoll data remains the same,
   including the now stale original fd.  This causes bug 103.
 * We can't (easily) set flow-specific options on an initiating side
   socket, because that could affect other flows as well

Alter the code to use a connect()ed socket on the initiating side as well
as the target side.  There's no way to "clone and connect" the listening
socket (a loose equivalent of accept() for UDP), so we have to create a
new socket.  We have to bind() this socket before we connect() it, which
is allowed thanks to SO_REUSEADDR, but does leave a small window where it
could receive datagrams not intended for this flow.  For now we handle this
by simply discarding any datagrams received between bind() and connect(),
but I intend to improve this in a later patch.
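
Schematically, setting up the new initiating-side socket looks like this (a
simplified sketch, not the exact udp_flow_sock() code; 'local' is the
listening socket's bound address, 'peer' the flow's remote endpoint, and
error handling is omitted):

	int s = socket(AF_INET, SOCK_DGRAM | SOCK_NONBLOCK, IPPROTO_UDP);
	int one = 1;

	/* SO_REUSEADDR allows binding the same address the (still open)
	 * listening socket is bound to.
	 */
	setsockopt(s, SOL_SOCKET, SO_REUSEADDR, &one, sizeof(one));
	bind(s, (struct sockaddr *)&local, sizeof(local));

	/* Between bind() and connect() this socket may briefly receive
	 * datagrams belonging to other flows; for now those are discarded.
	 */
	connect(s, (struct sockaddr *)&peer, sizeof(peer));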

Link: https://bugs.passt.top/show_bug.cgi?id=103
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 epoll_type.h |  4 ++--
 passt.c      |  6 +++---
 udp.c        | 50 ++++++++++++++++++++++++++------------------------
 udp.h        |  4 ++--
 udp_flow.c   | 32 +++++++++-----------------------
 util.c       |  2 +-
 6 files changed, 43 insertions(+), 55 deletions(-)

diff --git a/epoll_type.h b/epoll_type.h
index 7f2a121..12ac59b 100644
--- a/epoll_type.h
+++ b/epoll_type.h
@@ -22,8 +22,8 @@ enum epoll_type {
 	EPOLL_TYPE_TCP_TIMER,
 	/* UDP "listening" sockets */
 	EPOLL_TYPE_UDP_LISTEN,
-	/* UDP socket for replies on a specific flow */
-	EPOLL_TYPE_UDP_REPLY,
+	/* UDP socket for a specific flow */
+	EPOLL_TYPE_UDP,
 	/* ICMP/ICMPv6 ping sockets */
 	EPOLL_TYPE_PING,
 	/* inotify fd watching for end of netns (pasta) */
diff --git a/passt.c b/passt.c
index cd06772..388d10f 100644
--- a/passt.c
+++ b/passt.c
@@ -68,7 +68,7 @@ char *epoll_type_str[] = {
 	[EPOLL_TYPE_TCP_LISTEN]		= "listening TCP socket",
 	[EPOLL_TYPE_TCP_TIMER]		= "TCP timer",
 	[EPOLL_TYPE_UDP_LISTEN]		= "listening UDP socket",
-	[EPOLL_TYPE_UDP_REPLY]		= "UDP reply socket",
+	[EPOLL_TYPE_UDP]		= "UDP flow socket",
 	[EPOLL_TYPE_PING]	= "ICMP/ICMPv6 ping socket",
 	[EPOLL_TYPE_NSQUIT_INOTIFY]	= "namespace inotify watch",
 	[EPOLL_TYPE_NSQUIT_TIMER]	= "namespace timer watch",
@@ -339,8 +339,8 @@ loop:
 		case EPOLL_TYPE_UDP_LISTEN:
 			udp_listen_sock_handler(&c, ref, eventmask, &now);
 			break;
-		case EPOLL_TYPE_UDP_REPLY:
-			udp_reply_sock_handler(&c, ref, eventmask, &now);
+		case EPOLL_TYPE_UDP:
+			udp_sock_handler(&c, ref, eventmask, &now);
 			break;
 		case EPOLL_TYPE_PING:
 			icmp_sock_handler(&c, ref);
diff --git a/udp.c b/udp.c
index 5a251df..1b3fffd 100644
--- a/udp.c
+++ b/udp.c
@@ -39,27 +39,30 @@
  * could receive packets from multiple flows, so we use a hash table match to
  * find the specific flow for a datagram.
  *
- * When a UDP flow is initiated from a listening socket we take a duplicate of
- * the socket and store it in uflow->s[INISIDE].  This will last for the
+ * Flow sockets
+ * ============
+ *
+ * When a UDP flow targets a socket, we create a "flow" socket in
+ * uflow->s[TGTSIDE] both to deliver datagrams to the target side and receive
+ * replies on the target side.  This socket is both bound and connected and has
+ * EPOLL_TYPE_UDP.  The connect() means it will only receive datagrams
+ * associated with this flow, so the epoll reference directly points to the flow
+ * and we don't need a hash lookup.
+ *
+ * When a flow is initiated from a listening socket, we create a "flow" socket
+ * with the same bound address as the listening socket, but also connect()ed to
+ * the flow's peer.  This is stored in uflow->s[INISIDE] and will last for the
  * lifetime of the flow, even if the original listening socket is closed due to
  * port auto-probing.  The duplicate is used to deliver replies back to the
  * originating side.
  *
- * Reply sockets
- * =============
- *
- * When a UDP flow targets a socket, we create a "reply" socket in
- * uflow->s[TGTSIDE] both to deliver datagrams to the target side and receive
- * replies on the target side.  This socket is both bound and connected and has
- * EPOLL_TYPE_UDP_REPLY.  The connect() means it will only receive datagrams
- * associated with this flow, so the epoll reference directly points to the flow
- * and we don't need a hash lookup.
- *
- * NOTE: it's possible that the reply socket could have a bound address
- * overlapping with an unrelated listening socket.  We assume datagrams for the
- * flow will come to the reply socket in preference to a listening socket.  The
- * sample program doc/platform-requirements/reuseaddr-priority.c documents and
- * tests that assumption.
+ * NOTE: A flow socket can have a bound address overlapping with a listening
+ * socket.  That will happen naturally for flows initiated from a socket, but is
+ * also possible (though unlikely) for tap initiated flows, depending on the
+ * source port.  We assume datagrams for the flow will come to a connect()ed
+ * socket in preference to a listening socket.  The sample program
+ * doc/platform-requirements/reuseaddr-priority.c documents and tests that
+ * assumption.
  *
  * "Spliced" flows
  * ===============
@@ -71,8 +74,7 @@
  * actually used; it doesn't make sense for datagrams and instead a pair of
  * recvmmsg() and sendmmsg() is used to forward the datagrams.
  *
- * Note that a spliced flow will have *both* a duplicated listening socket and a
- * reply socket (see above).
+ * Note that a spliced flow will have two flow sockets (see above).
  */
 
 #include <sched.h>
@@ -557,7 +559,7 @@ static int udp_sock_recverr(const struct ctx *c, union epoll_ref ref)
 	}
 
 	eh = (const struct errhdr *)CMSG_DATA(hdr);
-	if (ref.type == EPOLL_TYPE_UDP_REPLY) {
+	if (ref.type == EPOLL_TYPE_UDP) {
 		flow_sidx_t sidx = flow_sidx_opposite(ref.flowside);
 		const struct flowside *toside = flowside_at_sidx(sidx);
 		size_t dlen = rc;
@@ -792,14 +794,14 @@ static bool udp_buf_reply_sock_data(const struct ctx *c,
 }
 
 /**
- * udp_reply_sock_handler() - Handle new data from flow specific socket
+ * udp_sock_handler() - Handle new data from flow specific socket
  * @c:		Execution context
  * @ref:	epoll reference
  * @events:	epoll events bitmap
  * @now:	Current timestamp
  */
-void udp_reply_sock_handler(const struct ctx *c, union epoll_ref ref,
-			    uint32_t events, const struct timespec *now)
+void udp_sock_handler(const struct ctx *c, union epoll_ref ref,
+		      uint32_t events, const struct timespec *now)
 {
 	struct udp_flow *uflow = udp_at_sidx(ref.flowside);
 
@@ -807,7 +809,7 @@ void udp_reply_sock_handler(const struct ctx *c, union epoll_ref ref,
 
 	if (events & EPOLLERR) {
 		if (udp_sock_errs(c, ref) < 0) {
-			flow_err(uflow, "Unrecoverable error on reply socket");
+			flow_err(uflow, "Unrecoverable error on flow socket");
 			goto fail;
 		}
 	}
diff --git a/udp.h b/udp.h
index a811475..8f8531a 100644
--- a/udp.h
+++ b/udp.h
@@ -11,8 +11,8 @@
 void udp_portmap_clear(void);
 void udp_listen_sock_handler(const struct ctx *c, union epoll_ref ref,
 			     uint32_t events, const struct timespec *now);
-void udp_reply_sock_handler(const struct ctx *c, union epoll_ref ref,
-			    uint32_t events, const struct timespec *now);
+void udp_sock_handler(const struct ctx *c, union epoll_ref ref,
+		      uint32_t events, const struct timespec *now);
 int udp_tap_handler(const struct ctx *c, uint8_t pif,
 		    sa_family_t af, const void *saddr, const void *daddr,
 		    uint8_t ttl, const struct pool *p, int idx,
diff --git a/udp_flow.c b/udp_flow.c
index 99ae490..a2d417f 100644
--- a/udp_flow.c
+++ b/udp_flow.c
@@ -49,10 +49,7 @@ void udp_flow_close(const struct ctx *c, struct udp_flow *uflow)
 	flow_foreach_sidei(sidei) {
 		flow_hash_remove(c, FLOW_SIDX(uflow, sidei));
 		if (uflow->s[sidei] >= 0) {
-			/* The listening socket needs to stay in epoll, but the
-			 * flow specific one needs to be removed */
-			if (sidei == TGTSIDE)
-				epoll_del(c, uflow->s[sidei]);
+			epoll_del(c, uflow->s[sidei]);
 			close(uflow->s[sidei]);
 			uflow->s[sidei] = -1;
 		}
@@ -81,7 +78,7 @@ static int udp_flow_sock(const struct ctx *c,
 	} fref = { .sidx = FLOW_SIDX(uflow, sidei) };
 	int rc, s;
 
-	s = flowside_sock_l4(c, EPOLL_TYPE_UDP_REPLY, pif, side, fref.data);
+	s = flowside_sock_l4(c, EPOLL_TYPE_UDP, pif, side, fref.data);
 	if (s < 0) {
 		flow_dbg_perror(uflow, "Couldn't open flow specific socket");
 		return s;
@@ -120,13 +117,12 @@ static int udp_flow_sock(const struct ctx *c,
  * udp_flow_new() - Common setup for a new UDP flow
  * @c:		Execution context
  * @flow:	Initiated flow
- * @s_ini:	Initiating socket (or -1)
  * @now:	Timestamp
  *
  * Return: UDP specific flow, if successful, NULL on failure
  */
 static flow_sidx_t udp_flow_new(const struct ctx *c, union flow *flow,
-				int s_ini, const struct timespec *now)
+				const struct timespec *now)
 {
 	struct udp_flow *uflow = NULL;
 	unsigned sidei;
@@ -139,22 +135,12 @@ static flow_sidx_t udp_flow_new(const struct ctx *c, union flow *flow,
 	uflow->s[INISIDE] = uflow->s[TGTSIDE] = -1;
 	uflow->ttl[INISIDE] = uflow->ttl[TGTSIDE] = 0;
 
-	if (s_ini >= 0) {
-		/* When using auto port-scanning the listening port could go
-		 * away, so we need to duplicate the socket
-		 */
-		uflow->s[INISIDE] = fcntl(s_ini, F_DUPFD_CLOEXEC, 0);
-		if (uflow->s[INISIDE] < 0) {
-			flow_perror(uflow,
-				    "Couldn't duplicate listening socket");
-			goto cancel;
-		}
+	flow_foreach_sidei(sidei) {
+		if (pif_is_socket(uflow->f.pif[sidei]))
+			if ((uflow->s[sidei] = udp_flow_sock(c, uflow, sidei)) < 0)
+				goto cancel;
 	}
 
-	if (pif_is_socket(flow->f.pif[TGTSIDE]))
-		if ((uflow->s[TGTSIDE] = udp_flow_sock(c, uflow, TGTSIDE)) < 0)
-			goto cancel;
-
 	/* Tap sides always need to be looked up by hash.  Socket sides don't
 	 * always, but sometimes do (receiving packets on a socket not specific
 	 * to one flow).  Unconditionally hash both sides so all our bases are
@@ -225,7 +211,7 @@ flow_sidx_t udp_flow_from_sock(const struct ctx *c, union epoll_ref ref,
 		return FLOW_SIDX_NONE;
 	}
 
-	return udp_flow_new(c, flow, ref.fd, now);
+	return udp_flow_new(c, flow, now);
 }
 
 /**
@@ -281,7 +267,7 @@ flow_sidx_t udp_flow_from_tap(const struct ctx *c,
 		return FLOW_SIDX_NONE;
 	}
 
-	return udp_flow_new(c, flow, -1, now);
+	return udp_flow_new(c, flow, now);
 }
 
 /**
diff --git a/util.c b/util.c
index b9a3d43..0f68cf5 100644
--- a/util.c
+++ b/util.c
@@ -71,7 +71,7 @@ int sock_l4_sa(const struct ctx *c, enum epoll_type type,
 	case EPOLL_TYPE_UDP_LISTEN:
 		freebind = c->freebind;
 		/* fallthrough */
-	case EPOLL_TYPE_UDP_REPLY:
+	case EPOLL_TYPE_UDP:
 		proto = IPPROTO_UDP;
 		socktype = SOCK_DGRAM | SOCK_NONBLOCK;
 		break;

From 1d7bbb101a0b1dcbc99c51cd65abb90a0144ac7b Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Fri, 4 Apr 2025 21:15:32 +1100
Subject: [PATCH 333/382] udp: Make udp_sock_recv() take max number of frames
 as a parameter

Currently udp_sock_recv() decides the maximum number of frames it is
willing to receive based on the mode.  However, we have upcoming use cases
with different criteria for how many frames we want, based on information
that's not naturally available here but is in the caller.  So make the
maximum number of frames a parameter.
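
In short, the batch size decision moves from udp_sock_recv() into its
callers (condensed from the hunks below):

	/* Caller picks the batch size... */
	int n = (c->mode == MODE_PASTA ? 1 : UDP_MAX_FRAMES);

	/* ...and udp_sock_recv() simply honours it */
	n = udp_sock_recv(c, ref.fd, udp_mh_recv, n);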

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
[sbrivio: Fix typo in comment in udp_buf_reply_sock_data()]
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 udp.c | 27 +++++++++++++--------------
 1 file changed, 13 insertions(+), 14 deletions(-)

diff --git a/udp.c b/udp.c
index 1b3fffd..53403bf 100644
--- a/udp.c
+++ b/udp.c
@@ -634,22 +634,14 @@ static int udp_sock_errs(const struct ctx *c, union epoll_ref ref)
  * @c:		Execution context
  * @s:		Socket to receive from
  * @mmh		mmsghdr array to receive into
+ * @n:		Maximum number of datagrams to receive
  *
  * Return: Number of datagrams received
  *
  * #syscalls recvmmsg arm:recvmmsg_time64 i686:recvmmsg_time64
  */
-static int udp_sock_recv(const struct ctx *c, int s, struct mmsghdr *mmh)
+static int udp_sock_recv(const struct ctx *c, int s, struct mmsghdr *mmh, int n)
 {
-	/* For not entirely clear reasons (data locality?) pasta gets better
-	 * throughput if we receive tap datagrams one at a atime.  For small
-	 * splice datagrams throughput is slightly better if we do batch, but
-	 * it's slightly worse for large splice datagrams.  Since we don't know
-	 * before we receive whether we'll use tap or splice, always go one at a
-	 * time for pasta mode.
-	 */
-	int n = (c->mode == MODE_PASTA ? 1 : UDP_MAX_FRAMES);
-
 	ASSERT(!c->no_udp);
 
 	n = recvmmsg(s, mmh, n, 0, NULL);
@@ -671,9 +663,10 @@ static void udp_buf_listen_sock_data(const struct ctx *c, union epoll_ref ref,
 				     const struct timespec *now)
 {
 	const socklen_t sasize = sizeof(udp_meta[0].s_in);
-	int n, i;
+	/* See udp_buf_sock_data() comment */
+	int n = (c->mode == MODE_PASTA ? 1 : UDP_MAX_FRAMES), i;
 
-	if ((n = udp_sock_recv(c, ref.fd, udp_mh_recv)) <= 0)
+	if ((n = udp_sock_recv(c, ref.fd, udp_mh_recv, n)) <= 0)
 		return;
 
 	/* We divide datagrams into batches based on how we need to send them,
@@ -768,9 +761,15 @@ static bool udp_buf_reply_sock_data(const struct ctx *c,
 {
 	const struct flowside *toside = flowside_at_sidx(tosidx);
 	uint8_t topif = pif_at_sidx(tosidx);
-	int n, i;
+	/* For not entirely clear reasons (data locality?) pasta gets better
+	 * throughput if we receive tap datagrams one at a time.  For small
+	 * splice datagrams throughput is slightly better if we do batch, but
+	 * it's slightly worse for large splice datagrams.  Since we don't know
+	 * the size before we receive, always go one at a time for pasta mode.
+	 */
+	int n = (c->mode == MODE_PASTA ? 1 : UDP_MAX_FRAMES), i;
 
-	if ((n = udp_sock_recv(c, s, udp_mh_recv)) <= 0)
+	if ((n = udp_sock_recv(c, s, udp_mh_recv, n)) <= 0)
 		return true;
 
 	for (i = 0; i < n; i++) {

From 84ab1305fabaf07b5badf433e55a458de5b86918 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Fri, 4 Apr 2025 21:15:33 +1100
Subject: [PATCH 334/382] udp: Polish udp_vu_sock_info() and remove from vu
 specific code

udp_vu_sock_info() uses MSG_PEEK to look ahead at the next datagram to be
received and gets its source address.  Currently we only use it in the
vhost-user path, but there's nothing inherently vhost-user specific about
it.  We have upcoming uses for it elsewhere, so rename it and move it to
udp.c.

While we're there, polish its error reporting a little.
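
For reference, the peek boils down to a recvmsg() with MSG_PEEK (condensed
from the udp_peek_addr() hunk below):

	struct msghdr msg = {
		.msg_name = src,
		.msg_namelen = sizeof(*src),
	};

	/* MSG_PEEK leaves the datagram queued for a later real receive;
	 * MSG_DONTWAIT returns immediately if nothing is pending.
	 */
	rc = recvmsg(s, &msg, MSG_PEEK | MSG_DONTWAIT);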

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
[sbrivio: Drop excess newline before udp_sock_recv()]
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 udp.c          | 24 ++++++++++++++++++++++++
 udp_internal.h |  1 +
 udp_vu.c       | 19 +------------------
 3 files changed, 26 insertions(+), 18 deletions(-)

diff --git a/udp.c b/udp.c
index 53403bf..1e241c8 100644
--- a/udp.c
+++ b/udp.c
@@ -629,6 +629,30 @@ static int udp_sock_errs(const struct ctx *c, union epoll_ref ref)
 	return n_err;
 }
 
+/**
+ * udp_peek_addr() - Get source address for next packet
+ * @s:		Socket to get information from
+ * @src:	Socket address (output)
+ *
+ * Return: 0 on success, -1 otherwise
+ */
+int udp_peek_addr(int s, union sockaddr_inany *src)
+{
+	struct msghdr msg = {
+		.msg_name = src,
+		.msg_namelen = sizeof(*src),
+	};
+	int rc;
+
+	rc = recvmsg(s, &msg, MSG_PEEK | MSG_DONTWAIT);
+	if (rc < 0) {
+		if (errno != EAGAIN && errno != EWOULDBLOCK)
+			warn_perror("Error peeking at socket address");
+		return rc;
+	}
+	return 0;
+}
+
 /**
  * udp_sock_recv() - Receive datagrams from a socket
  * @c:		Execution context
diff --git a/udp_internal.h b/udp_internal.h
index 02724e5..43a6109 100644
--- a/udp_internal.h
+++ b/udp_internal.h
@@ -30,5 +30,6 @@ size_t udp_update_hdr4(struct iphdr *ip4h, struct udp_payload_t *bp,
 size_t udp_update_hdr6(struct ipv6hdr *ip6h, struct udp_payload_t *bp,
                        const struct flowside *toside, size_t dlen,
 		       bool no_udp_csum);
+int udp_peek_addr(int s, union sockaddr_inany *src);
 
 #endif /* UDP_INTERNAL_H */
diff --git a/udp_vu.c b/udp_vu.c
index 4153b6c..5faf1e1 100644
--- a/udp_vu.c
+++ b/udp_vu.c
@@ -57,23 +57,6 @@ static size_t udp_vu_hdrlen(bool v6)
 	return hdrlen;
 }
 
-/**
- * udp_vu_sock_info() - get socket information
- * @s:		Socket to get information from
- * @s_in:	Socket address (output)
- *
- * Return: 0 if socket address can be read, -1 otherwise
- */
-static int udp_vu_sock_info(int s, union sockaddr_inany *s_in)
-{
-	struct msghdr msg = {
-		.msg_name = s_in,
-		.msg_namelen = sizeof(union sockaddr_inany),
-	};
-
-	return recvmsg(s, &msg, MSG_PEEK | MSG_DONTWAIT);
-}
-
 /**
  * udp_vu_sock_recv() - Receive datagrams from socket into vhost-user buffers
  * @c:		Execution context
@@ -230,7 +213,7 @@ void udp_vu_listen_sock_data(const struct ctx *c, union epoll_ref ref,
 		int iov_used;
 		bool v6;
 
-		if (udp_vu_sock_info(ref.fd, &s_in) < 0)
+		if (udp_peek_addr(ref.fd, &s_in) < 0)
 			break;
 
 		sidx = udp_flow_from_sock(c, ref, &s_in, now);

From 3a0881dfd02d758b0dc8ca6f5732bcb666b6d21e Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Fri, 4 Apr 2025 21:15:34 +1100
Subject: [PATCH 335/382] udp: Don't bother to batch datagrams from "listening"
 socket

A "listening" UDP socket can receive datagrams from multiple flows.  So,
we currently have some quite subtle and complex code in
udp_buf_listen_sock_data() to group contiguously received packets for the
same flow into batches for forwarding.

However, since we are now always using flow specific connect()ed sockets
once a flow is established, handling of datagrams on listening sockets is
essentially a slow path.  Given that, it's not worth the complexity.
Substantially simplify the code by using an approach more like vhost-user:
"peek" at the address of the next datagram, one at a time, to determine the
correct flow before we actually receive the data.

This removes all meaningful use of the s_in and tosidx fields in
udp_meta_t, so they too can be removed, along with setting of msg_name and
msg_namelen in the msghdr arrays which referenced them.
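
The resulting listening-socket loop, roughly (condensed from the
udp_buf_listen_sock_data() hunk below):

	union sockaddr_inany src;

	while (udp_peek_addr(ref.fd, &src) == 0) {
		/* Resolve the flow from the peeked source address first... */
		flow_sidx_t tosidx = udp_flow_from_sock(c, ref, &src, now);

		/* ...then receive and forward this single datagram on
		 * whichever path (splice, tap, or discard) the flow's target
		 * side needs.
		 */
	}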

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 udp.c | 75 +++++++++++++++--------------------------------------------
 1 file changed, 19 insertions(+), 56 deletions(-)

diff --git a/udp.c b/udp.c
index 1e241c8..4d32124 100644
--- a/udp.c
+++ b/udp.c
@@ -138,20 +138,15 @@ static struct ethhdr udp4_eth_hdr;
 static struct ethhdr udp6_eth_hdr;
 
 /**
- * struct udp_meta_t - Pre-cooked headers and metadata for UDP packets
+ * struct udp_meta_t - Pre-cooked headers for UDP packets
  * @ip6h:	Pre-filled IPv6 header (except for payload_len and addresses)
  * @ip4h:	Pre-filled IPv4 header (except for tot_len and saddr)
  * @taph:	Tap backend specific header
- * @s_in:	Source socket address, filled in by recvmmsg()
- * @tosidx:	sidx for the destination side of this datagram's flow
  */
 static struct udp_meta_t {
 	struct ipv6hdr ip6h;
 	struct iphdr ip4h;
 	struct tap_hdr taph;
-
-	union sockaddr_inany s_in;
-	flow_sidx_t tosidx;
 }
 #ifdef __AVX2__
 __attribute__ ((aligned(32)))
@@ -234,8 +229,6 @@ static void udp_iov_init_one(const struct ctx *c, size_t i)
 	tiov[UDP_IOV_TAP] = tap_hdr_iov(c, &meta->taph);
 	tiov[UDP_IOV_PAYLOAD].iov_base = payload;
 
-	mh->msg_name	= &meta->s_in;
-	mh->msg_namelen	= sizeof(meta->s_in);
 	mh->msg_iov	= siov;
 	mh->msg_iovlen	= 1;
 }
@@ -686,60 +679,32 @@ static int udp_sock_recv(const struct ctx *c, int s, struct mmsghdr *mmh, int n)
 static void udp_buf_listen_sock_data(const struct ctx *c, union epoll_ref ref,
 				     const struct timespec *now)
 {
-	const socklen_t sasize = sizeof(udp_meta[0].s_in);
-	/* See udp_buf_sock_data() comment */
-	int n = (c->mode == MODE_PASTA ? 1 : UDP_MAX_FRAMES), i;
+	union sockaddr_inany src;
 
-	if ((n = udp_sock_recv(c, ref.fd, udp_mh_recv, n)) <= 0)
-		return;
+	while (udp_peek_addr(ref.fd, &src) == 0) {
+		flow_sidx_t tosidx = udp_flow_from_sock(c, ref, &src, now);
+		uint8_t topif = pif_at_sidx(tosidx);
 
-	/* We divide datagrams into batches based on how we need to send them,
-	 * determined by udp_meta[i].tosidx.  To avoid either two passes through
-	 * the array, or recalculating tosidx for a single entry, we have to
-	 * populate it one entry *ahead* of the loop counter.
-	 */
-	udp_meta[0].tosidx = udp_flow_from_sock(c, ref, &udp_meta[0].s_in, now);
-	udp_mh_recv[0].msg_hdr.msg_namelen = sasize;
-	for (i = 0; i < n; ) {
-		flow_sidx_t batchsidx = udp_meta[i].tosidx;
-		uint8_t batchpif = pif_at_sidx(batchsidx);
-		int batchstart = i;
+		if (udp_sock_recv(c, ref.fd, udp_mh_recv, 1) <= 0)
+			break;
 
-		do {
-			if (pif_is_socket(batchpif)) {
-				udp_splice_prepare(udp_mh_recv, i);
-			} else if (batchpif == PIF_TAP) {
-				udp_tap_prepare(udp_mh_recv, i,
-						flowside_at_sidx(batchsidx),
-						false);
-			}
-
-			if (++i >= n)
-				break;
-
-			udp_meta[i].tosidx = udp_flow_from_sock(c, ref,
-								&udp_meta[i].s_in,
-								now);
-			udp_mh_recv[i].msg_hdr.msg_namelen = sasize;
-		} while (flow_sidx_eq(udp_meta[i].tosidx, batchsidx));
-
-		if (pif_is_socket(batchpif)) {
-			udp_splice_send(c, batchstart, i - batchstart,
-					batchsidx);
-		} else if (batchpif == PIF_TAP) {
-			tap_send_frames(c, &udp_l2_iov[batchstart][0],
-					UDP_NUM_IOVS, i - batchstart);
-		} else if (flow_sidx_valid(batchsidx)) {
-			flow_sidx_t fromsidx = flow_sidx_opposite(batchsidx);
-			struct udp_flow *uflow = udp_at_sidx(batchsidx);
+		if (pif_is_socket(topif)) {
+			udp_splice_prepare(udp_mh_recv, 0);
+			udp_splice_send(c, 0, 1, tosidx);
+		} else if (topif == PIF_TAP) {
+			udp_tap_prepare(udp_mh_recv, 0, flowside_at_sidx(tosidx),
+					false);
+			tap_send_frames(c, &udp_l2_iov[0][0], UDP_NUM_IOVS, 1);
+		} else if (flow_sidx_valid(tosidx)) {
+			flow_sidx_t fromsidx = flow_sidx_opposite(tosidx);
+			struct udp_flow *uflow = udp_at_sidx(tosidx);
 
 			flow_err(uflow,
 				 "No support for forwarding UDP from %s to %s",
 				 pif_name(pif_at_sidx(fromsidx)),
-				 pif_name(batchpif));
+				 pif_name(topif));
 		} else {
-			debug("Discarding %d datagrams without flow",
-			      i - batchstart);
+			debug("Discarding datagram without flow");
 		}
 	}
 }
@@ -801,8 +766,6 @@ static bool udp_buf_reply_sock_data(const struct ctx *c,
 			udp_splice_prepare(udp_mh_recv, i);
 		else if (topif == PIF_TAP)
 			udp_tap_prepare(udp_mh_recv, i, toside, false);
-		/* Restore sockaddr length clobbered by recvmsg() */
-		udp_mh_recv[i].msg_hdr.msg_namelen = sizeof(udp_meta[i].s_in);
 	}
 
 	if (pif_is_socket(topif)) {

From 5221e177e132b8b5001ec97f42975ad1251f7110 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Fri, 4 Apr 2025 21:15:35 +1100
Subject: [PATCH 336/382] udp: Parameterize number of datagrams handled by
 udp_*_reply_sock_data()

Both udp_buf_reply_sock_data() and udp_vu_reply_sock_data() internally
decide the maximum number of datagrams they will forward.  We have
some upcoming reasons to allow the caller to decide that instead, so make
the maximum number of datagrams a parameter for both of them.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 udp.c    | 31 ++++++++++++++++++-------------
 udp_vu.c |  6 ++++--
 udp_vu.h |  3 ++-
 3 files changed, 24 insertions(+), 16 deletions(-)

diff --git a/udp.c b/udp.c
index 4d32124..0f09e67 100644
--- a/udp.c
+++ b/udp.c
@@ -741,22 +741,17 @@ void udp_listen_sock_handler(const struct ctx *c,
  * udp_buf_reply_sock_data() - Handle new data from flow specific socket
  * @c:		Execution context
  * @s:		Socket to read data from
+ * @n:		Maximum number of datagrams to forward
  * @tosidx:	Flow & side to forward data from @s to
  *
  * Return: true on success, false if can't forward from socket to flow's pif
  */
-static bool udp_buf_reply_sock_data(const struct ctx *c,
-				    int s, flow_sidx_t tosidx)
+static bool udp_buf_reply_sock_data(const struct ctx *c, int s, int n,
+				    flow_sidx_t tosidx)
 {
 	const struct flowside *toside = flowside_at_sidx(tosidx);
 	uint8_t topif = pif_at_sidx(tosidx);
-	/* For not entirely clear reasons (data locality?) pasta gets better
-	 * throughput if we receive tap datagrams one at a time.  For small
-	 * splice datagrams throughput is slightly better if we do batch, but
-	 * it's slightly worse for large splice datagrams.  Since we don't know
-	 * the size before we receive, always go one at a time for pasta mode.
-	 */
-	int n = (c->mode == MODE_PASTA ? 1 : UDP_MAX_FRAMES), i;
+	int i;
 
 	if ((n = udp_sock_recv(c, s, udp_mh_recv, n)) <= 0)
 		return true;
@@ -801,6 +796,14 @@ void udp_sock_handler(const struct ctx *c, union epoll_ref ref,
 	}
 
 	if (events & EPOLLIN) {
+		/* For not entirely clear reasons (data locality?) pasta gets
+		 * better throughput if we receive tap datagrams one at a
+		 * time.  For small splice datagrams throughput is slightly
+		 * better if we do batch, but it's slightly worse for large
+		 * splice datagrams.  Since we don't know the size before we
+		 * receive, always go one at a time for pasta mode.
+		 */
+		size_t n = (c->mode == MODE_PASTA ? 1 : UDP_MAX_FRAMES);
 		flow_sidx_t tosidx = flow_sidx_opposite(ref.flowside);
 		int s = ref.fd;
 		bool ret;
@@ -808,10 +811,12 @@ void udp_sock_handler(const struct ctx *c, union epoll_ref ref,
 		flow_trace(uflow, "Received data on reply socket");
 		uflow->ts = now->tv_sec;
 
-		if (c->mode == MODE_VU)
-			ret = udp_vu_reply_sock_data(c, s, tosidx);
-		else
-			ret = udp_buf_reply_sock_data(c, s, tosidx);
+		if (c->mode == MODE_VU) {
+			ret = udp_vu_reply_sock_data(c, s, UDP_MAX_FRAMES,
+						     tosidx);
+		} else {
+			ret = udp_buf_reply_sock_data(c, s, n, tosidx);
+		}
 
 		if (!ret) {
 			flow_err(uflow, "Unable to forward UDP");
diff --git a/udp_vu.c b/udp_vu.c
index 5faf1e1..b2618b3 100644
--- a/udp_vu.c
+++ b/udp_vu.c
@@ -257,11 +257,13 @@ void udp_vu_listen_sock_data(const struct ctx *c, union epoll_ref ref,
  * udp_vu_reply_sock_data() - Handle new data from flow specific socket
  * @c:		Execution context
  * @s:		Socket to read data from
+ * @n:		Maximum number of datagrams to forward
  * @tosidx:	Flow & side to forward data from @s to
  *
  * Return: true on success, false if can't forward from socket to flow's pif
  */
-bool udp_vu_reply_sock_data(const struct ctx *c, int s, flow_sidx_t tosidx)
+bool udp_vu_reply_sock_data(const struct ctx *c, int s, int n,
+			    flow_sidx_t tosidx)
 {
 	const struct flowside *toside = flowside_at_sidx(tosidx);
 	bool v6 = !(inany_v4(&toside->eaddr) && inany_v4(&toside->oaddr));
@@ -272,7 +274,7 @@ bool udp_vu_reply_sock_data(const struct ctx *c, int s, flow_sidx_t tosidx)
 	if (pif_at_sidx(tosidx) != PIF_TAP)
 		return false;
 
-	for (i = 0; i < UDP_MAX_FRAMES; i++) {
+	for (i = 0; i < n; i++) {
 		ssize_t dlen;
 		int iov_used;
 
diff --git a/udp_vu.h b/udp_vu.h
index 6d541a4..c897c36 100644
--- a/udp_vu.h
+++ b/udp_vu.h
@@ -8,6 +8,7 @@
 
 void udp_vu_listen_sock_data(const struct ctx *c, union epoll_ref ref,
 			     const struct timespec *now);
-bool udp_vu_reply_sock_data(const struct ctx *c, int s, flow_sidx_t tosidx);
+bool udp_vu_reply_sock_data(const struct ctx *c, int s, int n,
+			    flow_sidx_t tosidx);
 
 #endif /* UDP_VU_H */

From 0304dd9c34a7dd29c3a8a2058626a971d4e71a8e Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Fri, 4 Apr 2025 21:15:36 +1100
Subject: [PATCH 337/382] udp: Split spliced forwarding path from
 udp_buf_reply_sock_data()

udp_buf_reply_sock_data() can handle forwarding data either from socket
to socket ("splicing") or from socket to tap.  It has a test on each
datagram for which case we're in, but that will be the same for everything
in the batch.

Split out the spliced path into a separate udp_sock_to_sock() function.
This leaves udp_{buf,vu}_reply_sock_data() handling only forwards from
socket to tap, so rename and simplify them accordingly.

This makes the code slightly longer for now, but will allow future cleanups
to shrink it back down again.
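
The caller then dispatches once on the target pif instead of testing each
datagram (condensed from the udp_sock_handler() hunk below):

	if (pif_is_socket(topif))
		udp_sock_to_sock(c, s, n, tosidx);
	else if (topif == PIF_TAP)
		udp_buf_sock_to_tap(c, s, n, tosidx); /* or the vhost-user variant */
	else
		flow_err(uflow, "No support for forwarding UDP from %s to %s",
			 pif_name(pif_at_sidx(ref.flowside)), pif_name(topif));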

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
[sbrivio: Fix typos in comments to udp_sock_recv() and
 udp_vu_listen_sock_data()]
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 udp.c    | 103 ++++++++++++++++++++++++++++++-------------------------
 udp_vu.c |  12 ++-----
 udp_vu.h |   3 +-
 3 files changed, 60 insertions(+), 58 deletions(-)

diff --git a/udp.c b/udp.c
index 0f09e67..2745e5d 100644
--- a/udp.c
+++ b/udp.c
@@ -670,6 +670,49 @@ static int udp_sock_recv(const struct ctx *c, int s, struct mmsghdr *mmh, int n)
 	return n;
 }
 
+/**
+ * udp_sock_to_sock() - Forward datagrams from socket to socket
+ * @c:		Execution context
+ * @from_s:	Socket to receive datagrams from
+ * @n:		Maximum number of datagrams to forward
+ * @tosidx:	Flow & side to forward datagrams to
+ */
+static void udp_sock_to_sock(const struct ctx *c, int from_s, int n,
+			     flow_sidx_t tosidx)
+{
+	int i;
+
+	if ((n = udp_sock_recv(c, from_s, udp_mh_recv, n)) <= 0)
+		return;
+
+	for (i = 0; i < n; i++)
+		udp_splice_prepare(udp_mh_recv, i);
+
+	udp_splice_send(c, 0, n, tosidx);
+}
+
+/**
+ * udp_buf_sock_to_tap() - Forward datagrams from socket to tap
+ * @c:		Execution context
+ * @s:		Socket to read data from
+ * @n:		Maximum number of datagrams to forward
+ * @tosidx:	Flow & side to forward data from @s to
+ */
+static void udp_buf_sock_to_tap(const struct ctx *c, int s, int n,
+				flow_sidx_t tosidx)
+{
+	const struct flowside *toside = flowside_at_sidx(tosidx);
+	int i;
+
+	if ((n = udp_sock_recv(c, s, udp_mh_recv, n)) <= 0)
+		return;
+
+	for (i = 0; i < n; i++)
+		udp_tap_prepare(udp_mh_recv, i, toside, false);
+
+	tap_send_frames(c, &udp_l2_iov[0][0], UDP_NUM_IOVS, n);
+}
+
 /**
  * udp_buf_listen_sock_data() - Handle new data from socket
  * @c:		Execution context
@@ -737,43 +780,6 @@ void udp_listen_sock_handler(const struct ctx *c,
 	}
 }
 
-/**
- * udp_buf_reply_sock_data() - Handle new data from flow specific socket
- * @c:		Execution context
- * @s:		Socket to read data from
- * @n:		Maximum number of datagrams to forward
- * @tosidx:	Flow & side to forward data from @s to
- *
- * Return: true on success, false if can't forward from socket to flow's pif
- */
-static bool udp_buf_reply_sock_data(const struct ctx *c, int s, int n,
-				    flow_sidx_t tosidx)
-{
-	const struct flowside *toside = flowside_at_sidx(tosidx);
-	uint8_t topif = pif_at_sidx(tosidx);
-	int i;
-
-	if ((n = udp_sock_recv(c, s, udp_mh_recv, n)) <= 0)
-		return true;
-
-	for (i = 0; i < n; i++) {
-		if (pif_is_socket(topif))
-			udp_splice_prepare(udp_mh_recv, i);
-		else if (topif == PIF_TAP)
-			udp_tap_prepare(udp_mh_recv, i, toside, false);
-	}
-
-	if (pif_is_socket(topif)) {
-		udp_splice_send(c, 0, n, tosidx);
-	} else if (topif == PIF_TAP) {
-		tap_send_frames(c, &udp_l2_iov[0][0], UDP_NUM_IOVS, n);
-	} else {
-		return false;
-	}
-
-	return true;
-}
-
 /**
  * udp_sock_handler() - Handle new data from flow specific socket
  * @c:		Execution context
@@ -805,21 +811,26 @@ void udp_sock_handler(const struct ctx *c, union epoll_ref ref,
 		 */
 		size_t n = (c->mode == MODE_PASTA ? 1 : UDP_MAX_FRAMES);
 		flow_sidx_t tosidx = flow_sidx_opposite(ref.flowside);
+		uint8_t topif = pif_at_sidx(tosidx);
 		int s = ref.fd;
-		bool ret;
 
 		flow_trace(uflow, "Received data on reply socket");
 		uflow->ts = now->tv_sec;
 
-		if (c->mode == MODE_VU) {
-			ret = udp_vu_reply_sock_data(c, s, UDP_MAX_FRAMES,
-						     tosidx);
+		if (pif_is_socket(topif)) {
+			udp_sock_to_sock(c, ref.fd, n, tosidx);
+		} else if (topif == PIF_TAP) {
+			if (c->mode == MODE_VU) {
+				udp_vu_sock_to_tap(c, s, UDP_MAX_FRAMES,
+						   tosidx);
+			} else {
+				udp_buf_sock_to_tap(c, s, n, tosidx);
+			}
 		} else {
-			ret = udp_buf_reply_sock_data(c, s, n, tosidx);
-		}
-
-		if (!ret) {
-			flow_err(uflow, "Unable to forward UDP");
+			flow_err(uflow,
+				 "No support for forwarding UDP from %s to %s",
+				 pif_name(pif_at_sidx(ref.flowside)),
+				 pif_name(topif));
 			goto fail;
 		}
 	}
diff --git a/udp_vu.c b/udp_vu.c
index b2618b3..8e02093 100644
--- a/udp_vu.c
+++ b/udp_vu.c
@@ -254,16 +254,13 @@ void udp_vu_listen_sock_data(const struct ctx *c, union epoll_ref ref,
 }
 
 /**
- * udp_vu_reply_sock_data() - Handle new data from flow specific socket
+ * udp_vu_sock_to_tap() - Forward datagrams from socket to tap
  * @c:		Execution context
  * @s:		Socket to read data from
  * @n:		Maximum number of datagrams to forward
  * @tosidx:	Flow & side to forward data from @s to
- *
- * Return: true on success, false if can't forward from socket to flow's pif
  */
-bool udp_vu_reply_sock_data(const struct ctx *c, int s, int n,
-			    flow_sidx_t tosidx)
+void udp_vu_sock_to_tap(const struct ctx *c, int s, int n, flow_sidx_t tosidx)
 {
 	const struct flowside *toside = flowside_at_sidx(tosidx);
 	bool v6 = !(inany_v4(&toside->eaddr) && inany_v4(&toside->oaddr));
@@ -271,9 +268,6 @@ bool udp_vu_reply_sock_data(const struct ctx *c, int s, int n,
 	struct vu_virtq *vq = &vdev->vq[VHOST_USER_RX_QUEUE];
 	int i;
 
-	if (pif_at_sidx(tosidx) != PIF_TAP)
-		return false;
-
 	for (i = 0; i < n; i++) {
 		ssize_t dlen;
 		int iov_used;
@@ -290,6 +284,4 @@ bool udp_vu_reply_sock_data(const struct ctx *c, int s, int n,
 		}
 		vu_flush(vdev, vq, elem, iov_used);
 	}
-
-	return true;
 }
diff --git a/udp_vu.h b/udp_vu.h
index c897c36..576b0e7 100644
--- a/udp_vu.h
+++ b/udp_vu.h
@@ -8,7 +8,6 @@
 
 void udp_vu_listen_sock_data(const struct ctx *c, union epoll_ref ref,
 			     const struct timespec *now);
-bool udp_vu_reply_sock_data(const struct ctx *c, int s, int n,
-			    flow_sidx_t tosidx);
+void udp_vu_sock_to_tap(const struct ctx *c, int s, int n, flow_sidx_t tosidx);
 
 #endif /* UDP_VU_H */

From fc6ee68ad3a8863cba534dfa4b88767114a6701e Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Fri, 4 Apr 2025 21:15:37 +1100
Subject: [PATCH 338/382] udp: Merge vhost-user and "buf" listening socket
 paths

udp_buf_listen_sock_data() and udp_vu_listen_sock_data() now have
effectively identical structure.  The forwarding functions used for flow
specific sockets (udp_buf_sock_to_tap(), udp_vu_sock_to_tap() and
udp_sock_to_sock()) also now take a number of datagrams.  This means we
can re-use them for the listening socket path, just passing '1' so they
handle a single datagram at a time.

This allows us to merge both the vhost-user and flow specific paths into
a single, simpler udp_listen_sock_data().

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 udp.c          | 27 ++++++++--------------
 udp_internal.h |  1 -
 udp_vu.c       | 62 --------------------------------------------------
 3 files changed, 10 insertions(+), 80 deletions(-)

diff --git a/udp.c b/udp.c
index 2745e5d..b0a7bf7 100644
--- a/udp.c
+++ b/udp.c
@@ -629,7 +629,7 @@ static int udp_sock_errs(const struct ctx *c, union epoll_ref ref)
  *
  * Return: 0 on success, -1 otherwise
  */
-int udp_peek_addr(int s, union sockaddr_inany *src)
+static int udp_peek_addr(int s, union sockaddr_inany *src)
 {
 	struct msghdr msg = {
 		.msg_name = src,
@@ -714,12 +714,12 @@ static void udp_buf_sock_to_tap(const struct ctx *c, int s, int n,
 }
 
 /**
- * udp_buf_listen_sock_data() - Handle new data from socket
+ * udp_listen_sock_data() - Handle new data from listening socket
  * @c:		Execution context
  * @ref:	epoll reference
  * @now:	Current timestamp
  */
-static void udp_buf_listen_sock_data(const struct ctx *c, union epoll_ref ref,
+static void udp_listen_sock_data(const struct ctx *c, union epoll_ref ref,
 				     const struct timespec *now)
 {
 	union sockaddr_inany src;
@@ -728,16 +728,13 @@ static void udp_buf_listen_sock_data(const struct ctx *c, union epoll_ref ref,
 		flow_sidx_t tosidx = udp_flow_from_sock(c, ref, &src, now);
 		uint8_t topif = pif_at_sidx(tosidx);
 
-		if (udp_sock_recv(c, ref.fd, udp_mh_recv, 1) <= 0)
-			break;
-
 		if (pif_is_socket(topif)) {
-			udp_splice_prepare(udp_mh_recv, 0);
-			udp_splice_send(c, 0, 1, tosidx);
+			udp_sock_to_sock(c, ref.fd, 1, tosidx);
 		} else if (topif == PIF_TAP) {
-			udp_tap_prepare(udp_mh_recv, 0, flowside_at_sidx(tosidx),
-					false);
-			tap_send_frames(c, &udp_l2_iov[0][0], UDP_NUM_IOVS, 1);
+			if (c->mode == MODE_VU)
+				udp_vu_sock_to_tap(c, ref.fd, 1, tosidx);
+			else
+				udp_buf_sock_to_tap(c, ref.fd, 1, tosidx);
 		} else if (flow_sidx_valid(tosidx)) {
 			flow_sidx_t fromsidx = flow_sidx_opposite(tosidx);
 			struct udp_flow *uflow = udp_at_sidx(tosidx);
@@ -772,12 +769,8 @@ void udp_listen_sock_handler(const struct ctx *c,
 		}
 	}
 
-	if (events & EPOLLIN) {
-		if (c->mode == MODE_VU)
-			udp_vu_listen_sock_data(c, ref, now);
-		else
-			udp_buf_listen_sock_data(c, ref, now);
-	}
+	if (events & EPOLLIN)
+		udp_listen_sock_data(c, ref, now);
 }
 
 /**
diff --git a/udp_internal.h b/udp_internal.h
index 43a6109..02724e5 100644
--- a/udp_internal.h
+++ b/udp_internal.h
@@ -30,6 +30,5 @@ size_t udp_update_hdr4(struct iphdr *ip4h, struct udp_payload_t *bp,
 size_t udp_update_hdr6(struct ipv6hdr *ip6h, struct udp_payload_t *bp,
                        const struct flowside *toside, size_t dlen,
 		       bool no_udp_csum);
-int udp_peek_addr(int s, union sockaddr_inany *src);
 
 #endif /* UDP_INTERNAL_H */
diff --git a/udp_vu.c b/udp_vu.c
index 8e02093..1f89509 100644
--- a/udp_vu.c
+++ b/udp_vu.c
@@ -191,68 +191,6 @@ static void udp_vu_csum(const struct flowside *toside, int iov_used)
 	}
 }
 
-/**
- * udp_vu_listen_sock_data() - Handle new data from socket
- * @c:		Execution context
- * @ref:	epoll reference
- * @now:	Current timestamp
- */
-void udp_vu_listen_sock_data(const struct ctx *c, union epoll_ref ref,
-			     const struct timespec *now)
-{
-	struct vu_dev *vdev = c->vdev;
-	struct vu_virtq *vq = &vdev->vq[VHOST_USER_RX_QUEUE];
-	int i;
-
-	for (i = 0; i < UDP_MAX_FRAMES; i++) {
-		const struct flowside *toside;
-		union sockaddr_inany s_in;
-		flow_sidx_t sidx;
-		uint8_t pif;
-		ssize_t dlen;
-		int iov_used;
-		bool v6;
-
-		if (udp_peek_addr(ref.fd, &s_in) < 0)
-			break;
-
-		sidx = udp_flow_from_sock(c, ref, &s_in, now);
-		pif = pif_at_sidx(sidx);
-
-		if (pif != PIF_TAP) {
-			if (flow_sidx_valid(sidx)) {
-				flow_sidx_t fromsidx = flow_sidx_opposite(sidx);
-				struct udp_flow *uflow = udp_at_sidx(sidx);
-
-				flow_err(uflow,
-					"No support for forwarding UDP from %s to %s",
-					pif_name(pif_at_sidx(fromsidx)),
-					pif_name(pif));
-			} else {
-				debug("Discarding 1 datagram without flow");
-			}
-
-			continue;
-		}
-
-		toside = flowside_at_sidx(sidx);
-
-		v6 = !(inany_v4(&toside->eaddr) && inany_v4(&toside->oaddr));
-
-		iov_used = udp_vu_sock_recv(c, ref.fd, v6, &dlen);
-		if (iov_used <= 0)
-			break;
-
-		udp_vu_prepare(c, toside, dlen);
-		if (*c->pcap) {
-			udp_vu_csum(toside, iov_used);
-			pcap_iov(iov_vu, iov_used,
-				 sizeof(struct virtio_net_hdr_mrg_rxbuf));
-		}
-		vu_flush(vdev, vq, elem, iov_used);
-	}
-}
-
 /**
  * udp_vu_sock_to_tap() - Forward datagrams from socket to tap
  * @c:		Execution context

From fd844a90bce0274d2488370ed7fadd850b6a0294 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Fri, 4 Apr 2025 21:15:38 +1100
Subject: [PATCH 339/382] udp: Move UDP_MAX_FRAMES to udp.c

Recent changes mean that this define is no longer used anywhere except in
udp.c.  Move it back into udp.c from udp_internal.h.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 udp.c          | 2 ++
 udp_internal.h | 2 --
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/udp.c b/udp.c
index b0a7bf7..f74a992 100644
--- a/udp.c
+++ b/udp.c
@@ -116,6 +116,8 @@
 #include "udp_internal.h"
 #include "udp_vu.h"
 
+#define UDP_MAX_FRAMES		32  /* max # of frames to receive at once */
+
 /* Maximum UDP data to be returned in ICMP messages */
 #define ICMP4_MAX_DLEN 8
 #define ICMP6_MAX_DLEN (IPV6_MIN_MTU			\
diff --git a/udp_internal.h b/udp_internal.h
index 02724e5..f7d8426 100644
--- a/udp_internal.h
+++ b/udp_internal.h
@@ -8,8 +8,6 @@
 
 #include "tap.h" /* needed by udp_meta_t */
 
-#define UDP_MAX_FRAMES		32  /* max # of frames to receive at once */
-
 /**
  * struct udp_payload_t - UDP header and data for inbound messages
  * @uh:		UDP header

From 159beefa36a09fc36cc9669fd536926d84c7c342 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Fri, 4 Apr 2025 21:15:39 +1100
Subject: [PATCH 340/382] udp_flow: Take pif and port as explicit parameters to
 udp_flow_from_sock()

Currently udp_flow_from_sock() is only used when receiving a datagram
from a "listening" socket.  It takes the listening socket's epoll
reference to get the interface and port on which the datagram arrived.

We have some upcoming cases where we want to use this in different
contexts, so make it take the pif and port as direct parameters instead.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
[sbrivio: Drop @ref from comment to udp_flow_from_sock()]
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 udp.c      |  4 +++-
 udp_flow.c | 16 +++++++---------
 udp_flow.h |  2 +-
 3 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/udp.c b/udp.c
index f74a992..157697e 100644
--- a/udp.c
+++ b/udp.c
@@ -727,7 +727,9 @@ static void udp_listen_sock_data(const struct ctx *c, union epoll_ref ref,
 	union sockaddr_inany src;
 
 	while (udp_peek_addr(ref.fd, &src) == 0) {
-		flow_sidx_t tosidx = udp_flow_from_sock(c, ref, &src, now);
+		flow_sidx_t tosidx = udp_flow_from_sock(c, ref.udp.pif,
+							ref.udp.port, &src,
+							now);
 		uint8_t topif = pif_at_sidx(tosidx);
 
 		if (pif_is_socket(topif)) {
diff --git a/udp_flow.c b/udp_flow.c
index a2d417f..5afe6e5 100644
--- a/udp_flow.c
+++ b/udp_flow.c
@@ -161,9 +161,10 @@ cancel:
 }
 
 /**
- * udp_flow_from_sock() - Find or create UDP flow for "listening" socket
+ * udp_flow_from_sock() - Find or create UDP flow for incoming datagram
  * @c:		Execution context
- * @ref:	epoll reference of the receiving socket
+ * @pif:	Interface the datagram is arriving from
+ * @port:	Our (local) port number to which the datagram is arriving
  * @s_in:	Source socket address, filled in by recvmmsg()
  * @now:	Timestamp
  *
@@ -172,7 +173,7 @@ cancel:
  * Return: sidx for the destination side of the flow for this packet, or
  *         FLOW_SIDX_NONE if we couldn't find or create a flow.
  */
-flow_sidx_t udp_flow_from_sock(const struct ctx *c, union epoll_ref ref,
+flow_sidx_t udp_flow_from_sock(const struct ctx *c, uint8_t pif, in_port_t port,
 			       const union sockaddr_inany *s_in,
 			       const struct timespec *now)
 {
@@ -181,9 +182,7 @@ flow_sidx_t udp_flow_from_sock(const struct ctx *c, union epoll_ref ref,
 	union flow *flow;
 	flow_sidx_t sidx;
 
-	ASSERT(ref.type == EPOLL_TYPE_UDP_LISTEN);
-
-	sidx = flow_lookup_sa(c, IPPROTO_UDP, ref.udp.pif, s_in, ref.udp.port);
+	sidx = flow_lookup_sa(c, IPPROTO_UDP, pif, s_in, port);
 	if ((uflow = udp_at_sidx(sidx))) {
 		uflow->ts = now->tv_sec;
 		return flow_sidx_opposite(sidx);
@@ -193,12 +192,11 @@ flow_sidx_t udp_flow_from_sock(const struct ctx *c, union epoll_ref ref,
 		char sastr[SOCKADDR_STRLEN];
 
 		debug("Couldn't allocate flow for UDP datagram from %s %s",
-		      pif_name(ref.udp.pif),
-		      sockaddr_ntop(s_in, sastr, sizeof(sastr)));
+		      pif_name(pif), sockaddr_ntop(s_in, sastr, sizeof(sastr)));
 		return FLOW_SIDX_NONE;
 	}
 
-	ini = flow_initiate_sa(flow, ref.udp.pif, s_in, ref.udp.port);
+	ini = flow_initiate_sa(flow, pif, s_in, port);
 
 	if (!inany_is_unicast(&ini->eaddr) ||
 	    ini->eport == 0 || ini->oport == 0) {
diff --git a/udp_flow.h b/udp_flow.h
index 520de62..bbdeb2a 100644
--- a/udp_flow.h
+++ b/udp_flow.h
@@ -26,7 +26,7 @@ struct udp_flow {
 };
 
 struct udp_flow *udp_at_sidx(flow_sidx_t sidx);
-flow_sidx_t udp_flow_from_sock(const struct ctx *c, union epoll_ref ref,
+flow_sidx_t udp_flow_from_sock(const struct ctx *c, uint8_t pif, in_port_t port,
 			       const union sockaddr_inany *s_in,
 			       const struct timespec *now);
 flow_sidx_t udp_flow_from_tap(const struct ctx *c,

From bd6a41ee76bb9a0da2150d76dbabf9a3212d0fca Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Fri, 4 Apr 2025 21:15:40 +1100
Subject: [PATCH 341/382] udp: Rework udp_listen_sock_data() into
 udp_sock_fwd()

udp_listen_sock_data() forwards datagrams from a "listening" socket until
there are no more (for now).  We have an upcoming use case where we want
to do that for a socket that's not a "listening" socket, and uses a
different epoll reference.  So, adjust the function to take the pieces it
needs from the reference as direct parameters and rename to udp_sock_fwd().

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 udp.c | 29 ++++++++++++++---------------
 1 file changed, 14 insertions(+), 15 deletions(-)

diff --git a/udp.c b/udp.c
index 157697e..20d8f0c 100644
--- a/udp.c
+++ b/udp.c
@@ -716,37 +716,36 @@ static void udp_buf_sock_to_tap(const struct ctx *c, int s, int n,
 }
 
 /**
- * udp_listen_sock_data() - Handle new data from listening socket
+ * udp_sock_fwd() - Forward datagrams from a possibly unconnected socket
  * @c:		Execution context
- * @ref:	epoll reference
+ * @s:		Socket to forward from
+ * @frompif:	Interface to which @s belongs
+ * @port:	Our (local) port number of @s
  * @now:	Current timestamp
  */
-static void udp_listen_sock_data(const struct ctx *c, union epoll_ref ref,
-				     const struct timespec *now)
+static void udp_sock_fwd(const struct ctx *c, int s, uint8_t frompif,
+			 in_port_t port, const struct timespec *now)
 {
 	union sockaddr_inany src;
 
-	while (udp_peek_addr(ref.fd, &src) == 0) {
-		flow_sidx_t tosidx = udp_flow_from_sock(c, ref.udp.pif,
-							ref.udp.port, &src,
-							now);
+	while (udp_peek_addr(s, &src) == 0) {
+		flow_sidx_t tosidx = udp_flow_from_sock(c, frompif, port,
+							&src, now);
 		uint8_t topif = pif_at_sidx(tosidx);
 
 		if (pif_is_socket(topif)) {
-			udp_sock_to_sock(c, ref.fd, 1, tosidx);
+			udp_sock_to_sock(c, s, 1, tosidx);
 		} else if (topif == PIF_TAP) {
 			if (c->mode == MODE_VU)
-				udp_vu_sock_to_tap(c, ref.fd, 1, tosidx);
+				udp_vu_sock_to_tap(c, s, 1, tosidx);
 			else
-				udp_buf_sock_to_tap(c, ref.fd, 1, tosidx);
+				udp_buf_sock_to_tap(c, s, 1, tosidx);
 		} else if (flow_sidx_valid(tosidx)) {
-			flow_sidx_t fromsidx = flow_sidx_opposite(tosidx);
 			struct udp_flow *uflow = udp_at_sidx(tosidx);
 
 			flow_err(uflow,
 				 "No support for forwarding UDP from %s to %s",
-				 pif_name(pif_at_sidx(fromsidx)),
-				 pif_name(topif));
+				 pif_name(frompif), pif_name(topif));
 		} else {
 			debug("Discarding datagram without flow");
 		}
@@ -774,7 +773,7 @@ void udp_listen_sock_handler(const struct ctx *c,
 	}
 
 	if (events & EPOLLIN)
-		udp_listen_sock_data(c, ref, now);
+		udp_sock_fwd(c, ref.fd, ref.udp.pif, ref.udp.port, now);
 }
 
 /**

From 9eb540626047bece3f25f38e47ec3b2b0030f9f4 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Fri, 4 Apr 2025 21:15:41 +1100
Subject: [PATCH 342/382] udp: Fold udp_splice_prepare and udp_splice_send into
 udp_sock_to_sock

udp_splice_prepare() and udp_splice_send() are both quite simple functions
that now have only one caller: udp_sock_to_sock().  Fold them both into
that caller.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 udp.c | 55 +++++++++++++++----------------------------------------
 1 file changed, 15 insertions(+), 40 deletions(-)

diff --git a/udp.c b/udp.c
index 20d8f0c..d9d2183 100644
--- a/udp.c
+++ b/udp.c
@@ -250,43 +250,6 @@ static void udp_iov_init(const struct ctx *c)
 		udp_iov_init_one(c, i);
 }
 
-/**
- * udp_splice_prepare() - Prepare one datagram for splicing
- * @mmh:	Receiving mmsghdr array
- * @idx:	Index of the datagram to prepare
- */
-static void udp_splice_prepare(struct mmsghdr *mmh, unsigned idx)
-{
-	udp_mh_splice[idx].msg_hdr.msg_iov->iov_len = mmh[idx].msg_len;
-}
-
-/**
- * udp_splice_send() - Send a batch of datagrams from socket to socket
- * @c:		Execution context
- * @start:	Index of batch's first datagram in udp[46]_l2_buf
- * @n:		Number of datagrams in batch
- * @src:	Source port for datagram (target side)
- * @dst:	Destination port for datagrams (target side)
- * @ref:	epoll reference for origin socket
- * @now:	Timestamp
- *
- * #syscalls sendmmsg
- */
-static void udp_splice_send(const struct ctx *c, size_t start, size_t n,
-			    flow_sidx_t tosidx)
-{
-	const struct flowside *toside = flowside_at_sidx(tosidx);
-	const struct udp_flow *uflow = udp_at_sidx(tosidx);
-	uint8_t topif = pif_at_sidx(tosidx);
-	int s = uflow->s[tosidx.sidei];
-	socklen_t sl;
-
-	pif_sockaddr(c, &udp_splice_to, &sl, topif,
-		     &toside->eaddr, toside->eport);
-
-	sendmmsg(s, udp_mh_splice + start, n, MSG_NOSIGNAL);
-}
-
 /**
  * udp_update_hdr4() - Update headers for one IPv4 datagram
  * @ip4h:		Pre-filled IPv4 header (except for tot_len and saddr)
@@ -678,19 +641,31 @@ static int udp_sock_recv(const struct ctx *c, int s, struct mmsghdr *mmh, int n)
  * @from_s:	Socket to receive datagrams from
  * @n:		Maximum number of datagrams to forward
  * @tosidx:	Flow & side to forward datagrams to
+ *
+ * #syscalls sendmmsg
  */
 static void udp_sock_to_sock(const struct ctx *c, int from_s, int n,
 			     flow_sidx_t tosidx)
 {
+	const struct flowside *toside = flowside_at_sidx(tosidx);
+	const struct udp_flow *uflow = udp_at_sidx(tosidx);
+	uint8_t topif = pif_at_sidx(tosidx);
+	int to_s = uflow->s[tosidx.sidei];
+	socklen_t sl;
 	int i;
 
 	if ((n = udp_sock_recv(c, from_s, udp_mh_recv, n)) <= 0)
 		return;
 
-	for (i = 0; i < n; i++)
-		udp_splice_prepare(udp_mh_recv, i);
+	for (i = 0; i < n; i++) {
+		udp_mh_splice[i].msg_hdr.msg_iov->iov_len
+			= udp_mh_recv[i].msg_len;
+	}
 
-	udp_splice_send(c, 0, n, tosidx);
+	pif_sockaddr(c, &udp_splice_to, &sl, topif,
+		     &toside->eaddr, toside->eport);
+
+	sendmmsg(to_s, udp_mh_splice, n, MSG_NOSIGNAL);
 }
 
 /**

From 9725e79888374a4e4060a2d798f3407c0006cc8a Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Fri, 4 Apr 2025 21:15:42 +1100
Subject: [PATCH 343/382] udp_flow: Don't discard packets that arrive between
 bind() and connect()

When we establish a new UDP flow we create connect()ed sockets that will
only handle datagrams for this flow.  However, there is a race between
bind() and connect() where they might get some packets queued for a
different flow.  Currently we handle this by simply discarding any
queued datagrams after the connect.  UDP protocols should be able to handle
such packet loss, but it's not ideal.

We now have the tools we need to handle this better, by redirecting any
datagrams received during that race to the appropriate flow.  We need to
use a deferred handler for this to avoid unexpectedly re-ordering datagrams
in some edge cases.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
[sbrivio: Update comment to udp_flow_defer()]
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 flow.c         |  2 +-
 udp.c          |  4 +--
 udp_flow.c     | 77 +++++++++++++++++++++++++++++++++++---------------
 udp_flow.h     |  6 +++-
 udp_internal.h |  2 ++
 5 files changed, 64 insertions(+), 27 deletions(-)

diff --git a/flow.c b/flow.c
index 8622242..29a83e1 100644
--- a/flow.c
+++ b/flow.c
@@ -850,7 +850,7 @@ void flow_defer_handler(const struct ctx *c, const struct timespec *now)
 				closed = icmp_ping_timer(c, &flow->ping, now);
 			break;
 		case FLOW_UDP:
-			closed = udp_flow_defer(&flow->udp);
+			closed = udp_flow_defer(c, &flow->udp, now);
 			if (!closed && timer)
 				closed = udp_flow_timer(c, &flow->udp, now);
 			break;
diff --git a/udp.c b/udp.c
index d9d2183..ed6edc1 100644
--- a/udp.c
+++ b/udp.c
@@ -698,8 +698,8 @@ static void udp_buf_sock_to_tap(const struct ctx *c, int s, int n,
  * @port:	Our (local) port number of @s
  * @now:	Current timestamp
  */
-static void udp_sock_fwd(const struct ctx *c, int s, uint8_t frompif,
-			 in_port_t port, const struct timespec *now)
+void udp_sock_fwd(const struct ctx *c, int s, uint8_t frompif,
+		  in_port_t port, const struct timespec *now)
 {
 	union sockaddr_inany src;
 
diff --git a/udp_flow.c b/udp_flow.c
index 5afe6e5..75f5a0b 100644
--- a/udp_flow.c
+++ b/udp_flow.c
@@ -9,10 +9,12 @@
 #include <fcntl.h>
 #include <sys/uio.h>
 #include <unistd.h>
+#include <netinet/udp.h>
 
 #include "util.h"
 #include "passt.h"
 #include "flow_table.h"
+#include "udp_internal.h"
 
 #define UDP_CONN_TIMEOUT	180 /* s, timeout for ephemeral or local bind */
 
@@ -67,16 +69,15 @@ void udp_flow_close(const struct ctx *c, struct udp_flow *uflow)
  * Return: fd of new socket on success, -ve error code on failure
  */
 static int udp_flow_sock(const struct ctx *c,
-			 const struct udp_flow *uflow, unsigned sidei)
+			 struct udp_flow *uflow, unsigned sidei)
 {
 	const struct flowside *side = &uflow->f.side[sidei];
-	struct mmsghdr discard[UIO_MAXIOV] = { 0 };
 	uint8_t pif = uflow->f.pif[sidei];
 	union {
 		flow_sidx_t sidx;
 		uint32_t data;
 	} fref = { .sidx = FLOW_SIDX(uflow, sidei) };
-	int rc, s;
+	int s;
 
 	s = flowside_sock_l4(c, EPOLL_TYPE_UDP, pif, side, fref.data);
 	if (s < 0) {
@@ -85,30 +86,32 @@ static int udp_flow_sock(const struct ctx *c,
 	}
 
 	if (flowside_connect(c, s, pif, side) < 0) {
-		rc = -errno;
+		int rc = -errno;
 		flow_dbg_perror(uflow, "Couldn't connect flow socket");
 		return rc;
 	}
 
-	/* It's possible, if unlikely, that we could receive some unrelated
-	 * packets in between the bind() and connect() of this socket.  For now
-	 * we just discard these.
+	/* It's possible, if unlikely, that we could receive some packets in
+	 * between the bind() and connect() which may or may not be for this
+	 * flow.  Being UDP we could just discard them, but it's not ideal.
 	 *
-	 * FIXME: Redirect these to an appropriate handler
+	 * There's also a tricky case if a bunch of datagrams for a new flow
+	 * arrive in rapid succession, the first going to the original listening
+	 * socket and later ones going to this new socket.  If we forwarded the
+	 * datagrams from the new socket immediately here they would go before
+	 * the datagram which established the flow.  Again, not strictly wrong
+	 * for UDP, but not ideal.
+	 *
+	 * So, we flag that the new socket is in a transient state where it
+	 * might have datagrams for a different flow queued.  Before the next
+	 * epoll cycle, udp_flow_defer() will flush out any such datagrams, and
+	 * thereafter everything on the new socket should be strictly for this
+	 * flow.
 	 */
-	rc = recvmmsg(s, discard, ARRAY_SIZE(discard), MSG_DONTWAIT, NULL);
-	if (rc >= ARRAY_SIZE(discard)) {
-		flow_dbg(uflow, "Too many (%d) spurious reply datagrams", rc);
-		return -E2BIG;
-	}
-
-	if (rc > 0) {
-		flow_trace(uflow, "Discarded %d spurious reply datagrams", rc);
-	} else if (errno != EAGAIN) {
-		rc = -errno;
-		flow_perror(uflow, "Unexpected error discarding datagrams");
-		return rc;
-	}
+	if (sidei)
+		uflow->flush1 = true;
+	else
+		uflow->flush0 = true;
 
 	return s;
 }
@@ -269,13 +272,41 @@ flow_sidx_t udp_flow_from_tap(const struct ctx *c,
 }
 
 /**
- * udp_flow_defer() - Deferred per-flow handling (clean up aborted flows)
+ * udp_flush_flow() - Flush datagrams that might not be for this flow
+ * @c:		Execution context
  * @uflow:	Flow to handle
+ * @sidei:	Side of the flow to flush
+ * @now:	Current timestamp
+ */
+static void udp_flush_flow(const struct ctx *c,
+			   const struct udp_flow *uflow, unsigned sidei,
+			   const struct timespec *now)
+{
+	/* We don't know exactly where the datagrams will come from, but we know
+	 * they'll have an interface and oport matching this flow */
+	udp_sock_fwd(c, uflow->s[sidei], uflow->f.pif[sidei],
+		     uflow->f.side[sidei].oport, now);
+}
+
+/**
+ * udp_flow_defer() - Deferred per-flow handling (clean up aborted flows)
+ * @c:		Execution context
+ * @uflow:	Flow to handle
+ * @now:	Current timestamp
  *
  * Return: true if the connection is ready to free, false otherwise
  */
-bool udp_flow_defer(const struct udp_flow *uflow)
+bool udp_flow_defer(const struct ctx *c, struct udp_flow *uflow,
+		    const struct timespec *now)
 {
+	if (uflow->flush0) {
+		udp_flush_flow(c, uflow, INISIDE, now);
+		uflow->flush0 = false;
+	}
+	if (uflow->flush1) {
+		udp_flush_flow(c, uflow, TGTSIDE, now);
+		uflow->flush1 = false;
+	}
 	return uflow->closed;
 }
 
diff --git a/udp_flow.h b/udp_flow.h
index bbdeb2a..90d3b29 100644
--- a/udp_flow.h
+++ b/udp_flow.h
@@ -11,6 +11,8 @@
  * struct udp_flow - Descriptor for a flow of UDP packets
  * @f:		Generic flow information
  * @closed:	Flow is already closed
+ * @flush0:	@s[0] may have datagrams queued for other flows
+ * @flush1:	@s[1] may have datagrams queued for other flows
  * @ts:		Activity timestamp
  * @s:		Socket fd (or -1) for each side of the flow
  * @ttl:	TTL or hop_limit for both sides
@@ -20,6 +22,7 @@ struct udp_flow {
 	struct flow_common f;
 
 	bool closed :1;
+	bool flush0, flush1 :1;
 	time_t ts;
 	int s[SIDES];
 	uint8_t ttl[SIDES];
@@ -35,7 +38,8 @@ flow_sidx_t udp_flow_from_tap(const struct ctx *c,
 			      in_port_t srcport, in_port_t dstport,
 			      const struct timespec *now);
 void udp_flow_close(const struct ctx *c, struct udp_flow *uflow);
-bool udp_flow_defer(const struct udp_flow *uflow);
+bool udp_flow_defer(const struct ctx *c, struct udp_flow *uflow,
+		    const struct timespec *now);
 bool udp_flow_timer(const struct ctx *c, struct udp_flow *uflow,
 		    const struct timespec *now);
 
diff --git a/udp_internal.h b/udp_internal.h
index f7d8426..96d11cf 100644
--- a/udp_internal.h
+++ b/udp_internal.h
@@ -28,5 +28,7 @@ size_t udp_update_hdr4(struct iphdr *ip4h, struct udp_payload_t *bp,
 size_t udp_update_hdr6(struct ipv6hdr *ip6h, struct udp_payload_t *bp,
                        const struct flowside *toside, size_t dlen,
 		       bool no_udp_csum);
+void udp_sock_fwd(const struct ctx *c, int s, uint8_t frompif,
+		  in_port_t port, const struct timespec *now);
 
 #endif /* UDP_INTERNAL_H */

From 06ef64cdb72475fd02c72cdd607a31a86605e734 Mon Sep 17 00:00:00 2001
From: Stefano Brivio <sbrivio@redhat.com>
Date: Tue, 8 Apr 2025 07:49:55 +0200
Subject: [PATCH 344/382] udp_flow: Save 8 bytes in struct udp_flow on 64-bit
 architectures

Shuffle the fields just added by commits a7775e9550fa ("udp: support
traceroute in direction tap-socket") and 9725e7988837 ("udp_flow:
Don't discard packets that arrive between bind() and connect()").

On x86_64, as reported by pahole(1), before:

struct udp_flow {
        struct flow_common         f;                    /*     0    76 */
        /* --- cacheline 1 boundary (64 bytes) was 12 bytes ago --- */
        _Bool                      closed:1;             /*    76: 0  1 */

        /* XXX 7 bits hole, try to pack */

        _Bool                      flush0;               /*    77     1 */
        _Bool                      flush1:1;             /*    78: 0  1 */

        /* XXX 7 bits hole, try to pack */
        /* XXX 1 byte hole, try to pack */

        time_t                     ts;                   /*    80     8 */
        int                        s[2];                 /*    88     8 */
        uint8_t                    ttl[2];               /*    96     2 */

        /* size: 104, cachelines: 2, members: 7 */
        /* sum members: 95, holes: 1, sum holes: 1 */
        /* sum bitfield members: 2 bits, bit holes: 2, sum bit holes: 14 bits */
        /* padding: 6 */
        /* last cacheline: 40 bytes */
};

and after:

struct udp_flow {
        struct flow_common         f;                    /*     0    76 */
        /* --- cacheline 1 boundary (64 bytes) was 12 bytes ago --- */
        uint8_t                    ttl[2];               /*    76     2 */
        _Bool                      closed:1;             /*    78: 0  1 */
        _Bool                      flush0:1;             /*    78: 1  1 */
        _Bool                      flush1:1;             /*    78: 2  1 */

        /* XXX 5 bits hole, try to pack */
        /* XXX 1 byte hole, try to pack */

        time_t                     ts;                   /*    80     8 */
        int                        s[2];                 /*    88     8 */

        /* size: 96, cachelines: 2, members: 7 */
        /* sum members: 94, holes: 1, sum holes: 1 */
        /* sum bitfield members: 3 bits, bit holes: 1, sum bit holes: 5 bits */
        /* last cacheline: 32 bytes */
};

It doesn't matter much in practice, because the typical storage for struct
udp_flow is given by union flow:

union flow {
        struct flow_common         f;                  /*     0    76 */
        struct flow_free_cluster   free;               /*     0    84 */
        struct tcp_tap_conn        tcp;                /*     0   120 */
        struct tcp_splice_conn     tcp_splice;         /*     0   120 */
        struct icmp_ping_flow      ping;               /*     0    96 */
        struct udp_flow            udp;                /*     0    96 */
};

but it still improves data locality somewhat, so let me fix this up
now that the commits are fresh.
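
For comparison, a standalone toy program (struct flow_common replaced by
a 76-byte placeholder, everything else laid out as in struct udp_flow)
should print 104 and 96 on x86_64, matching the pahole output above:

  #include <stdbool.h>
  #include <stdint.h>
  #include <stdio.h>
  #include <time.h>

  /* 76-byte placeholder stands in for struct flow_common */
  struct udp_flow_before {
          uint8_t f[76];
          bool closed :1;
          bool flush0, flush1 :1;         /* flush0 takes a whole byte */
          time_t ts;
          int s[2];
          uint8_t ttl[2];
  };

  struct udp_flow_after {
          uint8_t f[76];
          uint8_t ttl[2];
          bool closed :1, flush0 :1, flush1 :1;   /* share a single byte */
          time_t ts;
          int s[2];
  };

  int main(void)
  {
          printf("%zu %zu\n", sizeof(struct udp_flow_before),
                 sizeof(struct udp_flow_after));
          return 0;
  }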

Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
---
 udp_flow.h | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/udp_flow.h b/udp_flow.h
index 90d3b29..e289122 100644
--- a/udp_flow.h
+++ b/udp_flow.h
@@ -10,22 +10,25 @@
 /**
  * struct udp_flow - Descriptor for a flow of UDP packets
  * @f:		Generic flow information
+ * @ttl:	TTL or hop_limit for both sides
  * @closed:	Flow is already closed
  * @flush0:	@s[0] may have datagrams queued for other flows
  * @flush1:	@s[1] may have datagrams queued for other flows
  * @ts:		Activity timestamp
  * @s:		Socket fd (or -1) for each side of the flow
- * @ttl:	TTL or hop_limit for both sides
  */
 struct udp_flow {
 	/* Must be first element */
 	struct flow_common f;
 
-	bool closed :1;
-	bool flush0, flush1 :1;
+	uint8_t ttl[SIDES];
+
+	bool	closed	:1,
+		flush0	:1,
+		flush1	:1;
+
 	time_t ts;
 	int s[SIDES];
-	uint8_t ttl[SIDES];
 };
 
 struct udp_flow *udp_at_sidx(flow_sidx_t sidx);

From ffbef85e975ba117ed1c20f733d989ac08ebf325 Mon Sep 17 00:00:00 2001
From: Stefano Brivio <sbrivio@redhat.com>
Date: Tue, 8 Apr 2025 07:57:51 +0200
Subject: [PATCH 345/382] conf: Add missing return in conf_nat(), fix
 --map-guest-addr none

As reported by somebody on IRC:

  $ pasta --map-guest-addr none
  Invalid address to remap to host: none

that's because, once we've parsed "none", we go on to try to parse it as an
address as well. But we already handled that case, so stop once we're done.

Fixes: e813a4df7da2 ("conf: Allow address remapped to host to be configured")
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
---
 conf.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/conf.c b/conf.c
index b54c55d..168646f 100644
--- a/conf.c
+++ b/conf.c
@@ -1272,6 +1272,8 @@ static void conf_nat(const char *arg, struct in_addr *addr4,
 		*addr6 = in6addr_any;
 		if (no_map_gw)
 			*no_map_gw = 1;
+
+		return;
 	}
 
 	if (inet_pton(AF_INET6, arg, addr6)	&&

From d3f33f3b8ec4646dae3584b648cba142a73d3208 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Wed, 9 Apr 2025 16:35:40 +1000
Subject: [PATCH 346/382] tcp_splice: Don't double count bytes read on EINTR

In tcp_splice_sock_handler(), if we get an EINTR on our second splice()
(pipe to output socket) we - as we should - go back and retry it.  However,
we do so *after* we've already updated our byte counters.  That does no
harm for the conn->written[] counter - since the second splice() returned
an error it will be advanced by 0.  However we also advance the
conn->read[] counter, and then do so again when the splice() succeeds.
This results in the counters being out of sync, and us thinking we have
remaining data in the pipe when we don't, which can leave us in an
infinite loop once the stream finishes.

Fix this by moving the EINTR handling to directly next to the splice()
call (which is what we usually do for EINTR).  As a bonus this removes one
mildly confusing goto.

For symmetry, also rework the EINTR handling on the first splice() the same
way, although that doesn't (as far as I can tell) have buggy side effects.
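
The idiom being introduced, retrying the syscall in place on EINTR before
any counters are touched, looks like this in isolation (a generic sketch,
not the passt code; the helper name and parameters are made up):

  #define _GNU_SOURCE
  #include <errno.h>
  #include <fcntl.h>
  #include <sys/types.h>

  /* Retry the splice() itself on EINTR, before any bookkeeping happens */
  static ssize_t splice_retry(int from_fd, int to_fd, size_t len)
  {
          ssize_t n;

          do
                  n = splice(from_fd, NULL, to_fd, NULL, len,
                             SPLICE_F_MOVE | SPLICE_F_NONBLOCK);
          while (n < 0 && errno == EINTR);

          return n;       /* negative only for errors other than EINTR */
  }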

Link: https://github.com/containers/podman/issues/23686#issuecomment-2779347687
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 tcp_splice.c | 27 +++++++++++++--------------
 1 file changed, 13 insertions(+), 14 deletions(-)

diff --git a/tcp_splice.c b/tcp_splice.c
index 0d10e3d..7c3b56f 100644
--- a/tcp_splice.c
+++ b/tcp_splice.c
@@ -520,15 +520,14 @@ swap:
 		int more = 0;
 
 retry:
-		readlen = splice(conn->s[fromsidei], NULL,
-				 conn->pipe[fromsidei][1], NULL,
-				 c->tcp.pipe_size,
-				 SPLICE_F_MOVE | SPLICE_F_NONBLOCK);
+		do
+			readlen = splice(conn->s[fromsidei], NULL,
+					 conn->pipe[fromsidei][1], NULL,
+					 c->tcp.pipe_size,
+					 SPLICE_F_MOVE | SPLICE_F_NONBLOCK);
+		while (readlen < 0 && errno == EINTR);
 		flow_trace(conn, "%zi from read-side call", readlen);
 		if (readlen < 0) {
-			if (errno == EINTR)
-				goto retry;
-
 			if (errno != EAGAIN)
 				goto close;
 		} else if (!readlen) {
@@ -543,10 +542,13 @@ retry:
 				conn_flag(c, conn, lowat_act_flag);
 		}
 
-eintr:
-		written = splice(conn->pipe[fromsidei][0], NULL,
-				 conn->s[!fromsidei], NULL, c->tcp.pipe_size,
-				 SPLICE_F_MOVE | more | SPLICE_F_NONBLOCK);
+		do
+			written = splice(conn->pipe[fromsidei][0], NULL,
+					 conn->s[!fromsidei], NULL,
+					 c->tcp.pipe_size,
+					 SPLICE_F_MOVE | more | SPLICE_F_NONBLOCK);
+		while (written < 0 && errno == EINTR);
+
 		flow_trace(conn, "%zi from write-side call (passed %zi)",
 			   written, c->tcp.pipe_size);
 
@@ -578,9 +580,6 @@ eintr:
 		conn->written[fromsidei] += written > 0 ? written : 0;
 
 		if (written < 0) {
-			if (errno == EINTR)
-				goto eintr;
-
 			if (errno != EAGAIN)
 				goto close;
 

From 6693fa115824d198b7cde46c272514be194500a9 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Wed, 9 Apr 2025 16:35:41 +1000
Subject: [PATCH 347/382] tcp_splice: Don't clobber errno before checking for
 EAGAIN

Like many places, tcp_splice_sock_handler() needs to handle EAGAIN
specially, in this case for both of its splice() calls.  Unfortunately it
tests for EAGAIN some time after those calls.  In between there is at least
a flow_trace() call, which could have clobbered errno by the time we check
it.  Move the test on errno closer to the relevant system calls to avoid
this problem.
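
As a generic standalone sketch of the hazard (not the passt code; the
helper and its use of read() are made up for illustration), the safe
patterns are to test errno immediately after the call, as this patch does,
or to snapshot it before anything else runs:

  #include <errno.h>
  #include <stdio.h>
  #include <string.h>
  #include <unistd.h>

  /* Snapshot errno right after the syscall: the logging call below may
   * overwrite it before we get to check for EAGAIN.
   */
  static ssize_t read_and_log(int fd, void *buf, size_t len)
  {
          ssize_t n = read(fd, buf, len);
          int saved_errno = errno;

          fprintf(stderr, "read() returned %zd\n", n);

          if (n < 0 && saved_errno != EAGAIN)
                  fprintf(stderr, "read(): %s\n", strerror(saved_errno));

          return n;
  }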

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 tcp_splice.c | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

diff --git a/tcp_splice.c b/tcp_splice.c
index 7c3b56f..60455d6 100644
--- a/tcp_splice.c
+++ b/tcp_splice.c
@@ -526,13 +526,15 @@ retry:
 					 c->tcp.pipe_size,
 					 SPLICE_F_MOVE | SPLICE_F_NONBLOCK);
 		while (readlen < 0 && errno == EINTR);
+
+		if (readlen < 0 && errno != EAGAIN)
+			goto close;
+
 		flow_trace(conn, "%zi from read-side call", readlen);
-		if (readlen < 0) {
-			if (errno != EAGAIN)
-				goto close;
-		} else if (!readlen) {
+
+		if (!readlen) {
 			eof = 1;
-		} else {
+		} else if (readlen > 0) {
 			never_read = 0;
 
 			if (readlen >= (long)c->tcp.pipe_size * 90 / 100)
@@ -549,6 +551,9 @@ retry:
 					 SPLICE_F_MOVE | more | SPLICE_F_NONBLOCK);
 		while (written < 0 && errno == EINTR);
 
+		if (written < 0 && errno != EAGAIN)
+			goto close;
+
 		flow_trace(conn, "%zi from write-side call (passed %zi)",
 			   written, c->tcp.pipe_size);
 
@@ -580,9 +585,6 @@ retry:
 		conn->written[fromsidei] += written > 0 ? written : 0;
 
 		if (written < 0) {
-			if (errno != EAGAIN)
-				goto close;
-
 			if (conn->read[fromsidei] == conn->written[fromsidei])
 				break;
 

From f4b0dd8b06850bacb2da57c8576e3377daa88572 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Thu, 10 Apr 2025 17:16:38 +1000
Subject: [PATCH 348/382] udp: Use PKTINFO cmsgs to get destination address for
 received datagrams

Currently we get the source address for received datagrams from recvmsg(),
but we don't get the local destination address.  Sometimes we implicitly
know this because the receiving socket is bound to a specific address, but
when listening on 0.0.0.0 or ::, we don't.

We need this information to properly direct replies to flows which come in
to a non-default local address.  So, enable the IP_PKTINFO and IPV6_PKTINFO
control messages to obtain this information in udp_peek_addr().  For now
we log a trace message but don't do anything more with the information.
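
For reference, a minimal standalone sketch of the mechanism (IPv4 only,
error handling trimmed, helper name made up; the actual code lives in
udp_peek_addr()): once IP_PKTINFO is enabled with setsockopt(), each
recvmsg() can return a control message carrying the local destination
address of the datagram:

  #define _GNU_SOURCE
  #include <netinet/in.h>
  #include <sys/socket.h>
  #include <sys/types.h>
  #include <sys/uio.h>

  /* Assumes IP_PKTINFO was already enabled on @s, for example with:
   *   setsockopt(s, IPPROTO_IP, IP_PKTINFO, &(int){ 1 }, sizeof(int));
   */
  static ssize_t recv_with_dst(int s, void *buf, size_t len,
                               struct in_addr *dst)
  {
          char cbuf[CMSG_SPACE(sizeof(struct in_pktinfo))];
          struct iovec iov = { .iov_base = buf, .iov_len = len };
          struct msghdr mh = {
                  .msg_iov = &iov,
                  .msg_iovlen = 1,
                  .msg_control = cbuf,
                  .msg_controllen = sizeof(cbuf),
          };
          struct cmsghdr *hdr;
          ssize_t n = recvmsg(s, &mh, 0);

          if (n < 0)
                  return n;

          for (hdr = CMSG_FIRSTHDR(&mh); hdr; hdr = CMSG_NXTHDR(&mh, hdr)) {
                  if (hdr->cmsg_level == IPPROTO_IP &&
                      hdr->cmsg_type == IP_PKTINFO) {
                          const struct in_pktinfo *pi = (void *)CMSG_DATA(hdr);

                          *dst = pi->ipi_addr;    /* local destination */
                  }
          }

          return n;
  }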

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 udp.c  | 37 +++++++++++++++++++++++++++++++++++--
 util.c |  8 ++++++--
 2 files changed, 41 insertions(+), 4 deletions(-)

diff --git a/udp.c b/udp.c
index ed6edc1..a71141a 100644
--- a/udp.c
+++ b/udp.c
@@ -587,18 +587,29 @@ static int udp_sock_errs(const struct ctx *c, union epoll_ref ref)
 	return n_err;
 }
 
+#define PKTINFO_SPACE					\
+	MAX(CMSG_SPACE(sizeof(struct in_pktinfo)),	\
+	    CMSG_SPACE(sizeof(struct in6_pktinfo)))
+
 /**
  * udp_peek_addr() - Get source address for next packet
  * @s:		Socket to get information from
  * @src:	Socket address (output)
+ * @dst:	(Local) destination address (output)
  *
  * Return: 0 on success, -1 otherwise
  */
-static int udp_peek_addr(int s, union sockaddr_inany *src)
+static int udp_peek_addr(int s, union sockaddr_inany *src,
+			 union inany_addr *dst)
 {
+	char sastr[SOCKADDR_STRLEN], dstr[INANY_ADDRSTRLEN];
+	const struct cmsghdr *hdr;
+	char cmsg[PKTINFO_SPACE];
 	struct msghdr msg = {
 		.msg_name = src,
 		.msg_namelen = sizeof(*src),
+		.msg_control = cmsg,
+		.msg_controllen = sizeof(cmsg),
 	};
 	int rc;
 
@@ -608,6 +619,27 @@ static int udp_peek_addr(int s, union sockaddr_inany *src)
 			warn_perror("Error peeking at socket address");
 		return rc;
 	}
+
+	hdr = CMSG_FIRSTHDR(&msg);
+	if (hdr && hdr->cmsg_level == IPPROTO_IP &&
+	    hdr->cmsg_type == IP_PKTINFO) {
+		const struct in_pktinfo *info4 = (void *)CMSG_DATA(hdr);
+
+		*dst = inany_from_v4(info4->ipi_addr);
+	} else if (hdr && hdr->cmsg_level == IPPROTO_IPV6 &&
+		   hdr->cmsg_type == IPV6_PKTINFO) {
+		const struct in6_pktinfo *info6 = (void *)CMSG_DATA(hdr);
+
+		dst->a6 = info6->ipi6_addr;
+	} else {
+		debug("Unexpected cmsg on UDP datagram");
+		*dst = inany_any6;
+	}
+
+	trace("Peeked UDP datagram: %s -> %s",
+	      sockaddr_ntop(src, sastr, sizeof(sastr)),
+	      inany_ntop(dst, dstr, sizeof(dstr)));
+
 	return 0;
 }
 
@@ -702,8 +734,9 @@ void udp_sock_fwd(const struct ctx *c, int s, uint8_t frompif,
 		  in_port_t port, const struct timespec *now)
 {
 	union sockaddr_inany src;
+	union inany_addr dst;
 
-	while (udp_peek_addr(s, &src) == 0) {
+	while (udp_peek_addr(s, &src, &dst) == 0) {
 		flow_sidx_t tosidx = udp_flow_from_sock(c, frompif, port,
 							&src, now);
 		uint8_t topif = pif_at_sidx(tosidx);
diff --git a/util.c b/util.c
index 0f68cf5..62a6003 100644
--- a/util.c
+++ b/util.c
@@ -109,11 +109,15 @@ int sock_l4_sa(const struct ctx *c, enum epoll_type type,
 		debug("Failed to set SO_REUSEADDR on socket %i", fd);
 
 	if (proto == IPPROTO_UDP) {
+		int pktinfo = af == AF_INET ? IP_PKTINFO : IPV6_RECVPKTINFO;
+		int recverr = af == AF_INET ? IP_RECVERR : IPV6_RECVERR;
 		int level = af == AF_INET ? IPPROTO_IP : IPPROTO_IPV6;
-		int opt = af == AF_INET ? IP_RECVERR : IPV6_RECVERR;
 
-		if (setsockopt(fd, level, opt, &y, sizeof(y)))
+		if (setsockopt(fd, level, recverr, &y, sizeof(y)))
 			die_perror("Failed to set RECVERR on socket %i", fd);
+
+		if (setsockopt(fd, level, pktinfo, &y, sizeof(y)))
+			die_perror("Failed to set PKTINFO on socket %i", fd);
 	}
 
 	if (ifname && *ifname) {

From 695c62396eb3f4627c1114ce444394e3ba34373a Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Thu, 10 Apr 2025 17:16:39 +1000
Subject: [PATCH 349/382] inany: Improve ASSERT message for bad socket family

inany_from_sockaddr() can only handle sockaddrs of family AF_INET or
AF_INET6 and asserts if given something else.  I hit this assertion while
debugging something else, and wanted to see what the bad sockaddr family
was.  Now that we have ASSERT_WITH_MSG() its easy to add this information.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 inany.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/inany.h b/inany.h
index 6a12c29..1c247e1 100644
--- a/inany.h
+++ b/inany.h
@@ -252,7 +252,8 @@ static inline void inany_from_sockaddr(union inany_addr *aa, in_port_t *port,
 		*port = ntohs(sa->sa4.sin_port);
 	} else {
 		/* Not valid to call with other address families */
-		ASSERT(0);
+		ASSERT_WITH_MSG(0, "Unexpected sockaddr family: %u",
+				sa->sa_family);
 	}
 }
 

From 59cc89f4cc018988428637d97745cc4c919126cb Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Thu, 10 Apr 2025 17:16:40 +1000
Subject: [PATCH 350/382] udp, udp_flow: Track our specific address on socket
 interfaces

So far for UDP flows (like TCP connections) we didn't record our address
(oaddr) in the flow table entry for socket based pifs.  That's because we
didn't have that information when a flow was initiated by a datagram coming
to a "listening" socket with 0.0.0.0 or :: address.  Even when we did have
the information, we didn't record it, to simplify address matching on
lookups.

This meant that in some circumstances we could send replies on a UDP flow
from a different address than the originating request came to, which is
surprising and breaks certain setups.

We now have code in udp_peek_addr() which does determine our address for
incoming UDP datagrams.  We can use that information to properly populate
oaddr in the flow table for flow initiated from a socket.

In order to be able to consistently match datagrams to flows, we must
*always* have a specific oaddr, not an unspecified address (that's how the
flow hash table works).  So, we also need to fill in oaddr correctly for
flows we initiate *to* sockets.  Our forwarding logic doesn't specify
oaddr here, letting the kernel decide based on the routing table.  In this
case we need to call getsockname() after connect()ing the socket to find
which local address the kernel picked.

This adds getsockname() to our seccomp profile for all variants.
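
A minimal standalone sketch of that step (IPv4 only, error handling
trimmed, helper name made up): connect() implicitly binds an unbound UDP
socket, and getsockname() then reveals the source address the kernel chose:

  #include <netinet/in.h>
  #include <sys/socket.h>
  #include <unistd.h>

  /* connect() implicitly binds an unbound UDP socket; getsockname() then
   * tells us which local address the kernel picked for it.
   */
  static int connect_and_learn_local(const struct sockaddr_in *dst,
                                     struct in_addr *local)
  {
          struct sockaddr_in sa;
          socklen_t sl = sizeof(sa);
          int s = socket(AF_INET, SOCK_DGRAM, 0);

          if (s < 0)
                  return -1;

          if (connect(s, (const struct sockaddr *)dst, sizeof(*dst)) < 0 ||
              getsockname(s, (struct sockaddr *)&sa, &sl) < 0) {
                  close(s);
                  return -1;
          }

          *local = sa.sin_addr;   /* source address chosen by the kernel */
          return s;
  }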

Link: https://bugs.passt.top/show_bug.cgi?id=99
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 flow.c       | 14 +++++++++++---
 flow.h       |  3 ++-
 flow_table.h |  1 +
 tcp.c        |  2 +-
 udp.c        |  4 ++--
 udp_flow.c   | 36 ++++++++++++++++++++++++++++++++----
 udp_flow.h   |  3 ++-
 util.h       | 10 ++++++++++
 8 files changed, 61 insertions(+), 12 deletions(-)

diff --git a/flow.c b/flow.c
index 29a83e1..3c81cb4 100644
--- a/flow.c
+++ b/flow.c
@@ -396,18 +396,22 @@ const struct flowside *flow_initiate_af(union flow *flow, uint8_t pif,
  * @flow:	Flow to change state
  * @pif:	pif of the initiating side
  * @ssa:	Source socket address
+ * @daddr:	Destination address (may be NULL)
  * @dport:	Destination port
  *
  * Return: pointer to the initiating flowside information
  */
 struct flowside *flow_initiate_sa(union flow *flow, uint8_t pif,
 				  const union sockaddr_inany *ssa,
+				  const union inany_addr *daddr,
 				  in_port_t dport)
 {
 	struct flowside *ini = &flow->f.side[INISIDE];
 
 	inany_from_sockaddr(&ini->eaddr, &ini->eport, ssa);
-	if (inany_v4(&ini->eaddr))
+	if (daddr)
+		ini->oaddr = *daddr;
+	else if (inany_v4(&ini->eaddr))
 		ini->oaddr = inany_any4;
 	else
 		ini->oaddr = inany_any6;
@@ -751,19 +755,23 @@ flow_sidx_t flow_lookup_af(const struct ctx *c,
  * @proto:	Protocol of the flow (IP L4 protocol number)
  * @pif:	Interface of the flow
  * @esa:	Socket address of the endpoint
+ * @oaddr:	Our address (may be NULL)
  * @oport:	Our port number
  *
  * Return: sidx of the matching flow & side, FLOW_SIDX_NONE if not found
  */
 flow_sidx_t flow_lookup_sa(const struct ctx *c, uint8_t proto, uint8_t pif,
-			   const void *esa, in_port_t oport)
+			   const void *esa,
+			   const union inany_addr *oaddr, in_port_t oport)
 {
 	struct flowside side = {
 		.oport = oport,
 	};
 
 	inany_from_sockaddr(&side.eaddr, &side.eport, esa);
-	if (inany_v4(&side.eaddr))
+	if (oaddr)
+		side.oaddr = *oaddr;
+	else if (inany_v4(&side.eaddr))
 		side.oaddr = inany_any4;
 	else
 		side.oaddr = inany_any6;
diff --git a/flow.h b/flow.h
index dcf7645..cac618a 100644
--- a/flow.h
+++ b/flow.h
@@ -243,7 +243,8 @@ flow_sidx_t flow_lookup_af(const struct ctx *c,
 			   const void *eaddr, const void *oaddr,
 			   in_port_t eport, in_port_t oport);
 flow_sidx_t flow_lookup_sa(const struct ctx *c, uint8_t proto, uint8_t pif,
-			   const void *esa, in_port_t oport);
+			   const void *esa,
+			   const union inany_addr *oaddr, in_port_t oport);
 
 union flow;
 
diff --git a/flow_table.h b/flow_table.h
index fd2c57b..2d5c65c 100644
--- a/flow_table.h
+++ b/flow_table.h
@@ -199,6 +199,7 @@ const struct flowside *flow_initiate_af(union flow *flow, uint8_t pif,
 					const void *daddr, in_port_t dport);
 struct flowside *flow_initiate_sa(union flow *flow, uint8_t pif,
 				  const union sockaddr_inany *ssa,
+				  const union inany_addr *daddr,
 				  in_port_t dport);
 const struct flowside *flow_target_af(union flow *flow, uint8_t pif,
 				      sa_family_t af,
diff --git a/tcp.c b/tcp.c
index 35626c9..9c6bc52 100644
--- a/tcp.c
+++ b/tcp.c
@@ -2201,7 +2201,7 @@ void tcp_listen_handler(const struct ctx *c, union epoll_ref ref,
 	 * mode only, below.
 	 */
 	ini = flow_initiate_sa(flow, ref.tcp_listen.pif, &sa,
-			       ref.tcp_listen.port);
+			       NULL, ref.tcp_listen.port);
 
 	if (c->mode == MODE_VU) { /* Rebind to same address after migration */
 		if (!getsockname(s, &sa.sa, &sl))
diff --git a/udp.c b/udp.c
index a71141a..40af7df 100644
--- a/udp.c
+++ b/udp.c
@@ -737,8 +737,8 @@ void udp_sock_fwd(const struct ctx *c, int s, uint8_t frompif,
 	union inany_addr dst;
 
 	while (udp_peek_addr(s, &src, &dst) == 0) {
-		flow_sidx_t tosidx = udp_flow_from_sock(c, frompif, port,
-							&src, now);
+		flow_sidx_t tosidx = udp_flow_from_sock(c, frompif,
+							&dst, port, &src, now);
 		uint8_t topif = pif_at_sidx(tosidx);
 
 		if (pif_is_socket(topif)) {
diff --git a/udp_flow.c b/udp_flow.c
index 75f5a0b..ef2cbb0 100644
--- a/udp_flow.c
+++ b/udp_flow.c
@@ -123,14 +123,17 @@ static int udp_flow_sock(const struct ctx *c,
  * @now:	Timestamp
  *
  * Return: UDP specific flow, if successful, NULL on failure
+ *
+ * #syscalls getsockname
  */
 static flow_sidx_t udp_flow_new(const struct ctx *c, union flow *flow,
 				const struct timespec *now)
 {
 	struct udp_flow *uflow = NULL;
+	const struct flowside *tgt;
 	unsigned sidei;
 
-	if (!flow_target(c, flow, IPPROTO_UDP))
+	if (!(tgt = flow_target(c, flow, IPPROTO_UDP)))
 		goto cancel;
 
 	uflow = FLOW_SET_TYPE(flow, FLOW_UDP, udp);
@@ -144,6 +147,29 @@ static flow_sidx_t udp_flow_new(const struct ctx *c, union flow *flow,
 				goto cancel;
 	}
 
+	if (uflow->s[TGTSIDE] >= 0 && inany_is_unspecified(&tgt->oaddr)) {
+		/* When we target a socket, we connect() it, but might not
+		 * always bind(), leaving the kernel to pick our address.  In
+		 * that case connect() will implicitly bind() the socket, but we
+		 * need to determine its local address so that we can match
+		 * reply packets back to the correct flow.  Update the flow with
+		 * the information from getsockname() */
+		union sockaddr_inany sa;
+		socklen_t sl = sizeof(sa);
+		in_port_t port;
+
+		if (getsockname(uflow->s[TGTSIDE], &sa.sa, &sl) < 0) {
+			flow_perror(uflow, "Unable to determine local address");
+			goto cancel;
+		}
+		inany_from_sockaddr(&uflow->f.side[TGTSIDE].oaddr,
+				    &port, &sa);
+		if (port != tgt->oport) {
+			flow_err(uflow, "Unexpected local port");
+			goto cancel;
+		}
+	}
+
 	/* Tap sides always need to be looked up by hash.  Socket sides don't
 	 * always, but sometimes do (receiving packets on a socket not specific
 	 * to one flow).  Unconditionally hash both sides so all our bases are
@@ -167,6 +193,7 @@ cancel:
  * udp_flow_from_sock() - Find or create UDP flow for incoming datagram
  * @c:		Execution context
  * @pif:	Interface the datagram is arriving from
+ * @dst:	Our (local) address to which the datagram is arriving
  * @port:	Our (local) port number to which the datagram is arriving
  * @s_in:	Source socket address, filled in by recvmmsg()
  * @now:	Timestamp
@@ -176,7 +203,8 @@ cancel:
  * Return: sidx for the destination side of the flow for this packet, or
  *         FLOW_SIDX_NONE if we couldn't find or create a flow.
  */
-flow_sidx_t udp_flow_from_sock(const struct ctx *c, uint8_t pif, in_port_t port,
+flow_sidx_t udp_flow_from_sock(const struct ctx *c, uint8_t pif,
+			       const union inany_addr *dst, in_port_t port,
 			       const union sockaddr_inany *s_in,
 			       const struct timespec *now)
 {
@@ -185,7 +213,7 @@ flow_sidx_t udp_flow_from_sock(const struct ctx *c, uint8_t pif, in_port_t port,
 	union flow *flow;
 	flow_sidx_t sidx;
 
-	sidx = flow_lookup_sa(c, IPPROTO_UDP, pif, s_in, port);
+	sidx = flow_lookup_sa(c, IPPROTO_UDP, pif, s_in, dst, port);
 	if ((uflow = udp_at_sidx(sidx))) {
 		uflow->ts = now->tv_sec;
 		return flow_sidx_opposite(sidx);
@@ -199,7 +227,7 @@ flow_sidx_t udp_flow_from_sock(const struct ctx *c, uint8_t pif, in_port_t port,
 		return FLOW_SIDX_NONE;
 	}
 
-	ini = flow_initiate_sa(flow, pif, s_in, port);
+	ini = flow_initiate_sa(flow, pif, s_in, dst, port);
 
 	if (!inany_is_unicast(&ini->eaddr) ||
 	    ini->eport == 0 || ini->oport == 0) {
diff --git a/udp_flow.h b/udp_flow.h
index e289122..4c528e9 100644
--- a/udp_flow.h
+++ b/udp_flow.h
@@ -32,7 +32,8 @@ struct udp_flow {
 };
 
 struct udp_flow *udp_at_sidx(flow_sidx_t sidx);
-flow_sidx_t udp_flow_from_sock(const struct ctx *c, uint8_t pif, in_port_t port,
+flow_sidx_t udp_flow_from_sock(const struct ctx *c, uint8_t pif,
+			       const union inany_addr *dst, in_port_t port,
 			       const union sockaddr_inany *s_in,
 			       const struct timespec *now);
 flow_sidx_t udp_flow_from_tap(const struct ctx *c,
diff --git a/util.h b/util.h
index b1e7e79..cc7d084 100644
--- a/util.h
+++ b/util.h
@@ -371,6 +371,16 @@ static inline int wrap_accept4(int sockfd, struct sockaddr *addr,
 #define accept4(s, addr, addrlen, flags) \
 	wrap_accept4((s), (addr), (addrlen), (flags))
 
+static inline int wrap_getsockname(int sockfd, struct sockaddr *addr,
+/* cppcheck-suppress constParameterPointer */
+				   socklen_t *addrlen)
+{
+	sa_init(addr, addrlen);
+	return getsockname(sockfd, addr, addrlen);
+}
+#define getsockname(s, addr, addrlen) \
+	wrap_getsockname((s), (addr), (addrlen))
+
 #define PASST_MAXDNAME 254 /* 253 (RFC 1035) + 1 (the terminator) */
 void encode_domain_name(char *buf, const char *domain_name);
 

From bbff3653d6412690eee1a079d584a7365d2ed886 Mon Sep 17 00:00:00 2001
From: Stefano Brivio <sbrivio@redhat.com>
Date: Fri, 11 Apr 2025 09:58:31 +0200
Subject: [PATCH 351/382] conf: Split add_dns_resolv() into separate IPv4 and
 IPv6 versions

Not really valuable by itself, but dropping one level of nested blocks
makes the next change more convenient.

No functional changes intended.

Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Reviewed-by: Paul Holzinger <pholzing@redhat.com>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
---
 conf.c | 101 ++++++++++++++++++++++++++++++++++-----------------------
 1 file changed, 60 insertions(+), 41 deletions(-)

diff --git a/conf.c b/conf.c
index 168646f..18ed11c 100644
--- a/conf.c
+++ b/conf.c
@@ -414,6 +414,62 @@ static unsigned add_dns6(struct ctx *c, const struct in6_addr *addr,
 	return 1;
 }
 
+/**
+ * add_dns_resolv4() - Possibly add one IPv4 nameserver from host's resolv.conf
+ * @c:		Execution context
+ * @ns:		Nameserver address
+ * @idx:	Pointer to index of current IPv4 resolver entry, set on return
+ */
+static void add_dns_resolv4(struct ctx *c, struct in_addr *ns, unsigned *idx)
+{
+	if (IN4_IS_ADDR_UNSPECIFIED(&c->ip4.dns_host))
+		c->ip4.dns_host = *ns;
+
+	/* Special handling if guest or container can only access local
+	 * addresses via redirect, or if the host gateway is also a resolver and
+	 * we shadow its address
+	 */
+	if (IN4_IS_ADDR_LOOPBACK(ns) ||
+	    IN4_ARE_ADDR_EQUAL(ns, &c->ip4.map_host_loopback)) {
+		if (IN4_IS_ADDR_UNSPECIFIED(&c->ip4.map_host_loopback))
+			return;
+
+		*ns = c->ip4.map_host_loopback;
+		if (IN4_IS_ADDR_UNSPECIFIED(&c->ip4.dns_match))
+			c->ip4.dns_match = c->ip4.map_host_loopback;
+	}
+
+	*idx += add_dns4(c, ns, *idx);
+}
+
+/**
+ * add_dns_resolv6() - Possibly add one IPv6 nameserver from host's resolv.conf
+ * @c:		Execution context
+ * @ns:		Nameserver address
+ * @idx:	Pointer to index of current IPv6 resolver entry, set on return
+ */
+static void add_dns_resolv6(struct ctx *c, struct in6_addr *ns, unsigned *idx)
+{
+	if (IN6_IS_ADDR_UNSPECIFIED(&c->ip6.dns_host))
+		c->ip6.dns_host = *ns;
+
+	/* Special handling if guest or container can only access local
+	 * addresses via redirect, or if the host gateway is also a resolver and
+	 * we shadow its address
+	 */
+	if (IN6_IS_ADDR_LOOPBACK(ns) ||
+	    IN6_ARE_ADDR_EQUAL(ns, &c->ip6.map_host_loopback)) {
+		if (IN6_IS_ADDR_UNSPECIFIED(&c->ip6.map_host_loopback))
+			return;
+
+		*ns = c->ip6.map_host_loopback;
+		if (IN6_IS_ADDR_UNSPECIFIED(&c->ip6.dns_match))
+			c->ip6.dns_match = c->ip6.map_host_loopback;
+	}
+
+	*idx += add_dns6(c, ns, *idx);
+}
+
 /**
  * add_dns_resolv() - Possibly add ns from host resolv.conf to configuration
  * @c:		Execution context
@@ -430,48 +486,11 @@ static void add_dns_resolv(struct ctx *c, const char *nameserver,
 	struct in6_addr ns6;
 	struct in_addr ns4;
 
-	if (idx4 && inet_pton(AF_INET, nameserver, &ns4)) {
-		if (IN4_IS_ADDR_UNSPECIFIED(&c->ip4.dns_host))
-			c->ip4.dns_host = ns4;
+	if (idx4 && inet_pton(AF_INET, nameserver, &ns4))
+		add_dns_resolv4(c, &ns4, idx4);
 
-		/* Special handling if guest or container can only access local
-		 * addresses via redirect, or if the host gateway is also a
-		 * resolver and we shadow its address
-		 */
-		if (IN4_IS_ADDR_LOOPBACK(&ns4) ||
-		    IN4_ARE_ADDR_EQUAL(&ns4, &c->ip4.map_host_loopback)) {
-			if (IN4_IS_ADDR_UNSPECIFIED(&c->ip4.map_host_loopback))
-				return;
-
-			ns4 = c->ip4.map_host_loopback;
-			if (IN4_IS_ADDR_UNSPECIFIED(&c->ip4.dns_match))
-				c->ip4.dns_match = c->ip4.map_host_loopback;
-		}
-
-		*idx4 += add_dns4(c, &ns4, *idx4);
-	}
-
-	if (idx6 && inet_pton(AF_INET6, nameserver, &ns6)) {
-		if (IN6_IS_ADDR_UNSPECIFIED(&c->ip6.dns_host))
-			c->ip6.dns_host = ns6;
-
-		/* Special handling if guest or container can only access local
-		 * addresses via redirect, or if the host gateway is also a
-		 * resolver and we shadow its address
-		 */
-		if (IN6_IS_ADDR_LOOPBACK(&ns6) ||
-		    IN6_ARE_ADDR_EQUAL(&ns6, &c->ip6.map_host_loopback)) {
-			if (IN6_IS_ADDR_UNSPECIFIED(&c->ip6.map_host_loopback))
-				return;
-
-			ns6 = c->ip6.map_host_loopback;
-
-			if (IN6_IS_ADDR_UNSPECIFIED(&c->ip6.dns_match))
-				c->ip6.dns_match = c->ip6.map_host_loopback;
-		}
-
-		*idx6 += add_dns6(c, &ns6, *idx6);
-	}
+	if (idx6 && inet_pton(AF_INET6, nameserver, &ns6))
+		add_dns_resolv6(c, &ns6, idx6);
 }
 
 /**

From 50249086a967c54ff5b2521038cbe1d27303958c Mon Sep 17 00:00:00 2001
From: Stefano Brivio <sbrivio@redhat.com>
Date: Fri, 11 Apr 2025 10:50:00 +0200
Subject: [PATCH 352/382] conf: Honour --dns-forward for local resolver even
 with --no-map-gw

If the first resolver listed in the host's /etc/resolv.conf is a
loopback address, and --no-map-gw is given, we automatically conclude
that the resolver is not reachable, discard it, and, if it's the only
nameserver listed in /etc/resolv.conf, we'll warn that we:

  Couldn't get any nameserver address

However, this isn't true in the general case: the user might have passed
--dns-forward, and in that case, while we won't map the address of the
default gateway to the host, we're still supposed to map that
particular address. Otherwise, in this common Podman usage:

  pasta --config-net --dns-forward 169.254.1.1 -t none -u none -T none -U none --no-map-gw --netns /run/user/1000/netns/netns-c02a8d8f-6ee3-902e-33c5-317e0f24e0af --map-guest-addr 169.254.1.2

and with a loopback address in /etc/resolv.conf, we'll unexpectedly
refuse to forward DNS queries:

  # nslookup passt.top 169.254.1.1
  ;; connection timed out; no servers could be reached

To fix this, make an exception for --dns-forward: if &c->ip4.dns_match
or &c->ip6.dns_match are set in add_dns_resolv4() / add_dns_resolv6(),
use that address as guest-facing resolver.

We already set 'dns_host' to the address we found in /etc/resolv.conf;
that's correct in this case, and it makes us forward queries as
expected.

I'm not changing the man page as the current description of
--dns-forward is already consistent with the new behaviour: there's no
described way in which --no-map-gw should affect it.

Reported-by: Andrew Sayers <andrew-bugs.passt.top@pileofstuff.org>
Link: https://bugs.passt.top/show_bug.cgi?id=111
Suggested-by: Paul Holzinger <pholzing@redhat.com>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Reviewed-by: Paul Holzinger <pholzing@redhat.com>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
---
 conf.c | 30 ++++++++++++++++++++++--------
 1 file changed, 22 insertions(+), 8 deletions(-)

diff --git a/conf.c b/conf.c
index 18ed11c..f942851 100644
--- a/conf.c
+++ b/conf.c
@@ -431,12 +431,19 @@ static void add_dns_resolv4(struct ctx *c, struct in_addr *ns, unsigned *idx)
 	 */
 	if (IN4_IS_ADDR_LOOPBACK(ns) ||
 	    IN4_ARE_ADDR_EQUAL(ns, &c->ip4.map_host_loopback)) {
-		if (IN4_IS_ADDR_UNSPECIFIED(&c->ip4.map_host_loopback))
-			return;
+		if (IN4_IS_ADDR_UNSPECIFIED(&c->ip4.dns_match)) {
+			if (IN4_IS_ADDR_UNSPECIFIED(&c->ip4.map_host_loopback))
+				return;		/* Address unreachable */
 
-		*ns = c->ip4.map_host_loopback;
-		if (IN4_IS_ADDR_UNSPECIFIED(&c->ip4.dns_match))
+			*ns = c->ip4.map_host_loopback;
 			c->ip4.dns_match = c->ip4.map_host_loopback;
+		} else {
+			/* No general host mapping, but requested for DNS
+			 * (--dns-forward and --no-map-gw): advertise resolver
+			 * address from --dns-forward, and map that to loopback
+			 */
+			*ns = c->ip4.dns_match;
+		}
 	}
 
 	*idx += add_dns4(c, ns, *idx);
@@ -459,12 +466,19 @@ static void add_dns_resolv6(struct ctx *c, struct in6_addr *ns, unsigned *idx)
 	 */
 	if (IN6_IS_ADDR_LOOPBACK(ns) ||
 	    IN6_ARE_ADDR_EQUAL(ns, &c->ip6.map_host_loopback)) {
-		if (IN6_IS_ADDR_UNSPECIFIED(&c->ip6.map_host_loopback))
-			return;
+		if (IN6_IS_ADDR_UNSPECIFIED(&c->ip6.dns_match)) {
+			if (IN6_IS_ADDR_UNSPECIFIED(&c->ip6.map_host_loopback))
+				return;		/* Address unreachable */
 
-		*ns = c->ip6.map_host_loopback;
-		if (IN6_IS_ADDR_UNSPECIFIED(&c->ip6.dns_match))
+			*ns = c->ip6.map_host_loopback;
 			c->ip6.dns_match = c->ip6.map_host_loopback;
+		} else {
+			/* No general host mapping, but requested for DNS
+			 * (--dns-forward and --no-map-gw): advertise resolver
+			 * address from --dns-forward, and map that to loopback
+			 */
+			*ns = c->ip6.dns_match;
+		}
 	}
 
 	*idx += add_dns6(c, ns, *idx);

From baf049f8e06b7f0a73dfa7913297679a75aad381 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Tue, 15 Apr 2025 17:16:18 +1000
Subject: [PATCH 353/382] udp: Fix breakage of UDP error handling by PKTINFO
 support

We recently enabled the IP_PKTINFO / IPV6_RECVPKTINFO socket options on our
UDP sockets.  This lets us obtain and properly handle the specific local
address used when we're "listening" with a socket on 0.0.0.0 or ::.

However, the PKTINFO cmsgs this option generates appear on error queue
messages as well as regular datagrams.  udp_sock_recverr() doesn't expect
this and so flags an unrecoverable error when it can't parse the control
message.

Correct this by adding space in udp_sock_recverr()'s control buffer for the
additional PKTINFO data, and scan through all cmsgs for the RECVERR, rather
than only looking at the first one.

Link: https://bugs.passt.top/show_bug.cgi?id=99
Fixes: f4b0dd8b0685 ("udp: Use PKTINFO cmsgs to get destination address for received datagrams")
Reported-by: Stefano Brivio <sbrivio@redhat.com>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 udp.c | 30 +++++++++++++++++-------------
 1 file changed, 17 insertions(+), 13 deletions(-)

diff --git a/udp.c b/udp.c
index 40af7df..f5fb98c 100644
--- a/udp.c
+++ b/udp.c
@@ -155,6 +155,10 @@ __attribute__ ((aligned(32)))
 #endif
 udp_meta[UDP_MAX_FRAMES];
 
+#define PKTINFO_SPACE					\
+	MAX(CMSG_SPACE(sizeof(struct in_pktinfo)),	\
+	    CMSG_SPACE(sizeof(struct in6_pktinfo)))
+
 /**
  * enum udp_iov_idx - Indices for the buffers making up a single UDP frame
  * @UDP_IOV_TAP         tap specific header
@@ -476,10 +480,10 @@ static int udp_sock_recverr(const struct ctx *c, union epoll_ref ref)
 		struct sock_extended_err ee;
 		union sockaddr_inany saddr;
 	};
-	const struct errhdr *eh;
-	const struct cmsghdr *hdr;
-	char buf[CMSG_SPACE(sizeof(struct errhdr))];
+	char buf[PKTINFO_SPACE + CMSG_SPACE(sizeof(struct errhdr))];
 	char data[ICMP6_MAX_DLEN];
+	const struct errhdr *eh;
+	struct cmsghdr *hdr;
 	int s = ref.fd;
 	struct iovec iov = {
 		.iov_base = data,
@@ -507,12 +511,16 @@ static int udp_sock_recverr(const struct ctx *c, union epoll_ref ref)
 		return -1;
 	}
 
-	hdr = CMSG_FIRSTHDR(&mh);
-	if (!((hdr->cmsg_level == IPPROTO_IP &&
-	       hdr->cmsg_type == IP_RECVERR) ||
-	      (hdr->cmsg_level == IPPROTO_IPV6 &&
-	       hdr->cmsg_type == IPV6_RECVERR))) {
-		err("Unexpected cmsg reading error queue");
+	for (hdr = CMSG_FIRSTHDR(&mh); hdr; hdr = CMSG_NXTHDR(&mh, hdr)) {
+		if ((hdr->cmsg_level == IPPROTO_IP &&
+		      hdr->cmsg_type == IP_RECVERR) ||
+		     (hdr->cmsg_level == IPPROTO_IPV6 &&
+		      hdr->cmsg_type == IPV6_RECVERR))
+		    break;
+	}
+
+	if (!hdr) {
+		err("Missing RECVERR cmsg in error queue");
 		return -1;
 	}
 
@@ -587,10 +595,6 @@ static int udp_sock_errs(const struct ctx *c, union epoll_ref ref)
 	return n_err;
 }
 
-#define PKTINFO_SPACE					\
-	MAX(CMSG_SPACE(sizeof(struct in_pktinfo)),	\
-	    CMSG_SPACE(sizeof(struct in6_pktinfo)))
-
 /**
  * udp_peek_addr() - Get source address for next packet
  * @s:		Socket to get information from

From 1bb8145c221a9124ca1671e64b27de173ff2d82d Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Tue, 15 Apr 2025 17:16:19 +1000
Subject: [PATCH 354/382] udp: Be quieter about errors on UDP receive

If we get an error on UDP receive, either in udp_peek_addr() or
udp_sock_recv(), we'll print an error message.  However, this could be
a perfectly routine UDP error triggered by an ICMP, which need not go to
the error log.

This doesn't usually happen, because before receiving we typically clear
the error queue from udp_sock_errs().  However, it's possible an error
could be flagged after udp_sock_errs() but before we receive.  So it's
better to handle this error "silently" (trace level only).  We'll bail out
of the receive, return to the epoll loop, and get an EPOLLERR where we'll
handle and report the error properly.

In particular there's one situation that can trigger this case much more
easily.  If we start a new outbound UDP flow to a local destination with
nothing listening, we'll get a more or less immediate connection refused
error.  So, we'll get that error on the very first receive after the
connect().  That will occur in udp_flow_defer() -> udp_flush_flow() ->
udp_sock_fwd() -> udp_peek_addr() -> recvmsg().  This path doesn't call
udp_sock_errs() first, so isn't (imperfectly) protected the way we are
most of the time.

Fixes: 84ab1305faba ("udp: Polish udp_vu_sock_info() and remove from vu specific code")
Fixes: 69e5393c3722 ("udp: Move some more of sock_handler tasks into sub-functions")
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 udp.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/udp.c b/udp.c
index f5fb98c..154f99b 100644
--- a/udp.c
+++ b/udp.c
@@ -619,8 +619,8 @@ static int udp_peek_addr(int s, union sockaddr_inany *src,
 
 	rc = recvmsg(s, &msg, MSG_PEEK | MSG_DONTWAIT);
 	if (rc < 0) {
-		if (errno != EAGAIN && errno != EWOULDBLOCK)
-			warn_perror("Error peeking at socket address");
+		trace("Error peeking at socket address: %s", strerror_(errno));
+		/* Bail out and let the EPOLLERR handler deal with it */
 		return rc;
 	}
 
@@ -664,7 +664,8 @@ static int udp_sock_recv(const struct ctx *c, int s, struct mmsghdr *mmh, int n)
 
 	n = recvmmsg(s, mmh, n, 0, NULL);
 	if (n < 0) {
-		err_perror("Error receiving datagrams");
+		trace("Error receiving datagrams: %s", strerror_(errno));
+		/* Bail out and let the EPOLLERR handler deal with it */
 		return 0;
 	}
 

From 3f995586b35494b08631081fbf609ff932110849 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Tue, 15 Apr 2025 17:16:20 +1000
Subject: [PATCH 355/382] udp: Pass socket & flow information directly to
 error handling functions

udp_sock_recverr() and udp_sock_errs() take an epoll reference from which
they obtain both the socket fd to receive errors from, and - for flow
specific sockets - the flow and side the socket is associated with.

We have some upcoming cases where we want to clear errors when we're not
directly associated with receiving an epoll event, so it's not natural to
have an epoll reference.  Therefore, make these functions take the socket
and flow from explicit parameters.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 udp.c | 26 +++++++++++++-------------
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/udp.c b/udp.c
index 154f99b..c51ac95 100644
--- a/udp.c
+++ b/udp.c
@@ -467,14 +467,15 @@ static void udp_send_tap_icmp6(const struct ctx *c,
 /**
  * udp_sock_recverr() - Receive and clear an error from a socket
  * @c:		Execution context
- * @ref:	epoll reference
+ * @s:		Socket to receive errors from
+ * @sidx:	Flow and side of @s, or FLOW_SIDX_NONE if unknown
  *
  * Return: 1 if error received and processed, 0 if no more errors in queue, < 0
  *         if there was an error reading the queue
  *
  * #syscalls recvmsg
  */
-static int udp_sock_recverr(const struct ctx *c, union epoll_ref ref)
+static int udp_sock_recverr(const struct ctx *c, int s, flow_sidx_t sidx)
 {
 	struct errhdr {
 		struct sock_extended_err ee;
@@ -484,7 +485,6 @@ static int udp_sock_recverr(const struct ctx *c, union epoll_ref ref)
 	char data[ICMP6_MAX_DLEN];
 	const struct errhdr *eh;
 	struct cmsghdr *hdr;
-	int s = ref.fd;
 	struct iovec iov = {
 		.iov_base = data,
 		.iov_len = sizeof(data)
@@ -525,12 +525,12 @@ static int udp_sock_recverr(const struct ctx *c, union epoll_ref ref)
 	}
 
 	eh = (const struct errhdr *)CMSG_DATA(hdr);
-	if (ref.type == EPOLL_TYPE_UDP) {
-		flow_sidx_t sidx = flow_sidx_opposite(ref.flowside);
-		const struct flowside *toside = flowside_at_sidx(sidx);
+	if (flow_sidx_valid(sidx)) {
+		flow_sidx_t tosidx = flow_sidx_opposite(sidx);
+		const struct flowside *toside = flowside_at_sidx(tosidx);
 		size_t dlen = rc;
 
-		if (pif_is_socket(pif_at_sidx(sidx))) {
+		if (pif_is_socket(pif_at_sidx(tosidx))) {
 			/* XXX Is there any way to propagate ICMPs from socket
 			 * to socket? */
 		} else if (hdr->cmsg_level == IPPROTO_IP) {
@@ -554,21 +554,21 @@ static int udp_sock_recverr(const struct ctx *c, union epoll_ref ref)
 /**
  * udp_sock_errs() - Process errors on a socket
  * @c:		Execution context
- * @ref:	epoll reference
+ * @s:		Socket to receive errors from
+ * @sidx:	Flow and side of @s, or FLOW_SIDX_NONE if unknown
  *
  * Return: Number of errors handled, or < 0 if we have an unrecoverable error
  */
-static int udp_sock_errs(const struct ctx *c, union epoll_ref ref)
+static int udp_sock_errs(const struct ctx *c, int s, flow_sidx_t sidx)
 {
 	unsigned n_err = 0;
 	socklen_t errlen;
-	int s = ref.fd;
 	int rc, err;
 
 	ASSERT(!c->no_udp);
 
 	/* Empty the error queue */
-	while ((rc = udp_sock_recverr(c, ref)) > 0)
+	while ((rc = udp_sock_recverr(c, s, sidx)) > 0)
 		n_err += rc;
 
 	if (rc < 0)
@@ -777,7 +777,7 @@ void udp_listen_sock_handler(const struct ctx *c,
 			     const struct timespec *now)
 {
 	if (events & EPOLLERR) {
-		if (udp_sock_errs(c, ref) < 0) {
+		if (udp_sock_errs(c, ref.fd, FLOW_SIDX_NONE) < 0) {
 			err("UDP: Unrecoverable error on listening socket:"
 			    " (%s port %hu)", pif_name(ref.udp.pif), ref.udp.port);
 			/* FIXME: what now?  close/re-open socket? */
@@ -804,7 +804,7 @@ void udp_sock_handler(const struct ctx *c, union epoll_ref ref,
 	ASSERT(!c->no_udp && uflow);
 
 	if (events & EPOLLERR) {
-		if (udp_sock_errs(c, ref) < 0) {
+		if (udp_sock_errs(c, ref.fd, ref.flowside) < 0) {
 			flow_err(uflow, "Unrecoverable error on flow socket");
 			goto fail;
 		}

From 04984578b00f7507a05544b7a5490b03ab2d5135 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Tue, 15 Apr 2025 17:16:21 +1000
Subject: [PATCH 356/382] udp: Deal with errors as we go in udp_sock_fwd()

When we get an epoll event on a listening socket, we first deal with any
errors (udp_sock_errs()), then with any received packets (udp_sock_fwd()).
However, it's theoretically possible that new errors could get flagged on
the socket after we call udp_sock_errs(), in which case we could get errors
returned in udp_sock_fwd() -> udp_peek_addr() -> recvmsg().

In fact, we do deal with this correctly, although the path is somewhat
non-obvious.  The recvmsg() error will cause us to bail out of
udp_sock_fwd(), but the EPOLLERR event will now be flagged, so we'll come
back here next epoll loop and call udp_sock_errs().

Except.. we call udp_sock_fwd() from udp_flush_flow() as well as from
epoll events.  This is to deal with any packets that arrived between bind()
and connect(), and so might not be associated with the socket's intended
flow.  This expects udp_sock_fwd() to flush _all_ queued datagrams, so that
anything received later must be for the correct flow.

At the moment, udp_sock_fwd() might fail to flush all datagrams if errors
occur.  In particular this can happen in practice for locally reported
errors which occur immediately after connect() (e.g. connecting to a local
port with nothing listening).

We can deal with the problem case, and also make the flow a little more
natural for the common case by having udp_sock_fwd() call udp_sock_errs()
to handle errors as they occur, rather than trying to deal with all errors
in advance.
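
As a generic, self-contained sketch of that pattern (not the passt code,
which keeps the MSG_PEEK and PKTINFO handling shown in the hunk below;
drain_udp_socket() is an illustrative name):

  #include <errno.h>
  #include <sys/socket.h>

  /* Drain a non-blocking UDP socket completely, clearing asynchronous
   * errors (e.g. ECONNREFUSED triggered by an ICMP message) as they show
   * up, so that nothing is left queued when we return.
   */
  static void drain_udp_socket(int s)
  {
          char buf[65536];

          for (;;) {
                  if (recv(s, buf, sizeof(buf), MSG_DONTWAIT) >= 0)
                          continue;       /* a datagram: forward it here */

                  if (errno == EAGAIN || errno == EWOULDBLOCK)
                          return;         /* receive queue is empty */

                  /* An error was flagged between datagrams: pop one entry
                   * from the error queue and keep draining.  If the error
                   * queue is empty too, give up on this socket.
                   */
                  if (recv(s, buf, sizeof(buf),
                           MSG_ERRQUEUE | MSG_DONTWAIT) < 0)
                          return;
          }
  }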

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 udp.c | 45 ++++++++++++++++++++++++++-------------------
 1 file changed, 26 insertions(+), 19 deletions(-)

diff --git a/udp.c b/udp.c
index c51ac95..0bec499 100644
--- a/udp.c
+++ b/udp.c
@@ -601,7 +601,7 @@ static int udp_sock_errs(const struct ctx *c, int s, flow_sidx_t sidx)
  * @src:	Socket address (output)
  * @dst:	(Local) destination address (output)
  *
- * Return: 0 on success, -1 otherwise
+ * Return: 0 if no more packets, 1 on success, -ve error code on error
  */
 static int udp_peek_addr(int s, union sockaddr_inany *src,
 			 union inany_addr *dst)
@@ -619,9 +619,9 @@ static int udp_peek_addr(int s, union sockaddr_inany *src,
 
 	rc = recvmsg(s, &msg, MSG_PEEK | MSG_DONTWAIT);
 	if (rc < 0) {
-		trace("Error peeking at socket address: %s", strerror_(errno));
-		/* Bail out and let the EPOLLERR handler deal with it */
-		return rc;
+		if (errno == EAGAIN || errno == EWOULDBLOCK)
+			return 0;
+		return -errno;
 	}
 
 	hdr = CMSG_FIRSTHDR(&msg);
@@ -644,7 +644,7 @@ static int udp_peek_addr(int s, union sockaddr_inany *src,
 	      sockaddr_ntop(src, sastr, sizeof(sastr)),
 	      inany_ntop(dst, dstr, sizeof(dstr)));
 
-	return 0;
+	return 1;
 }
 
 /**
@@ -740,11 +740,27 @@ void udp_sock_fwd(const struct ctx *c, int s, uint8_t frompif,
 {
 	union sockaddr_inany src;
 	union inany_addr dst;
+	int rc;
 
-	while (udp_peek_addr(s, &src, &dst) == 0) {
-		flow_sidx_t tosidx = udp_flow_from_sock(c, frompif,
-							&dst, port, &src, now);
-		uint8_t topif = pif_at_sidx(tosidx);
+	while ((rc = udp_peek_addr(s, &src, &dst)) != 0) {
+		flow_sidx_t tosidx;
+		uint8_t topif;
+
+		if (rc < 0) {
+			trace("Error peeking at socket address: %s",
+			      strerror_(-rc));
+			/* Clear errors & carry on */
+			if (udp_sock_errs(c, s, FLOW_SIDX_NONE) < 0) {
+				err(
+"UDP: Unrecoverable error on listening socket: (%s port %hu)",
+				    pif_name(frompif), port);
+				/* FIXME: what now?  close/re-open socket? */
+			}
+			continue;
+		}
+
+		tosidx = udp_flow_from_sock(c, frompif, &dst, port, &src, now);
+		topif = pif_at_sidx(tosidx);
 
 		if (pif_is_socket(topif)) {
 			udp_sock_to_sock(c, s, 1, tosidx);
@@ -776,16 +792,7 @@ void udp_listen_sock_handler(const struct ctx *c,
 			     union epoll_ref ref, uint32_t events,
 			     const struct timespec *now)
 {
-	if (events & EPOLLERR) {
-		if (udp_sock_errs(c, ref.fd, FLOW_SIDX_NONE) < 0) {
-			err("UDP: Unrecoverable error on listening socket:"
-			    " (%s port %hu)", pif_name(ref.udp.pif), ref.udp.port);
-			/* FIXME: what now?  close/re-open socket? */
-			return;
-		}
-	}
-
-	if (events & EPOLLIN)
+	if (events & (EPOLLERR | EPOLLIN))
 		udp_sock_fwd(c, ref.fd, ref.udp.pif, ref.udp.port, now);
 }
 

From f107a86cc05c83c5755861b00b85cdf0eb5c9534 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Tue, 15 Apr 2025 17:16:22 +1000
Subject: [PATCH 357/382] udp: Add udp_pktinfo() helper

Currently we open code parsing the control message for IP_PKTINFO in
udp_peek_addr().  We have an upcoming case where we want to parse PKTINFO
in another place, so split this out into a helper function.

While we're there, make the parsing a bit more robust: scan all cmsgs to
look for the one we want, rather than assuming there's only one.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
[sbrivio: udp_pktinfo(): Fix typo in comment and change err() to debug()]
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 udp.c | 52 ++++++++++++++++++++++++++++++++++++----------------
 1 file changed, 36 insertions(+), 16 deletions(-)

diff --git a/udp.c b/udp.c
index 0bec499..97034f6 100644
--- a/udp.c
+++ b/udp.c
@@ -464,6 +464,41 @@ static void udp_send_tap_icmp6(const struct ctx *c,
 	tap_icmp6_send(c, saddr, eaddr, &msg, msglen);
 }
 
+/**
+ * udp_pktinfo() - Retrieve packet destination address from cmsg
+ * @msg:	msghdr into which message has been received
+ * @dst:	(Local) destination address of message in @mh (output)
+ *
+ * Return: 0 on success, -1 if the information was missing (@dst is set to
+ *         inany_any6).
+ */
+static int udp_pktinfo(struct msghdr *msg, union inany_addr *dst)
+{
+	struct cmsghdr *hdr;
+
+	for (hdr = CMSG_FIRSTHDR(msg); hdr; hdr = CMSG_NXTHDR(msg, hdr)) {
+		if (hdr->cmsg_level == IPPROTO_IP &&
+		    hdr->cmsg_type == IP_PKTINFO) {
+			const struct in_pktinfo *i4 = (void *)CMSG_DATA(hdr);
+
+			*dst = inany_from_v4(i4->ipi_addr);
+			return 0;
+		}
+
+		if (hdr->cmsg_level == IPPROTO_IPV6 &&
+			   hdr->cmsg_type == IPV6_PKTINFO) {
+			const struct in6_pktinfo *i6 = (void *)CMSG_DATA(hdr);
+
+			dst->a6 = i6->ipi6_addr;
+			return 0;
+		}
+	}
+
+	debug("Missing PKTINFO cmsg on datagram");
+	*dst = inany_any6;
+	return -1;
+}
+
 /**
  * udp_sock_recverr() - Receive and clear an error from a socket
  * @c:		Execution context
@@ -607,7 +642,6 @@ static int udp_peek_addr(int s, union sockaddr_inany *src,
 			 union inany_addr *dst)
 {
 	char sastr[SOCKADDR_STRLEN], dstr[INANY_ADDRSTRLEN];
-	const struct cmsghdr *hdr;
 	char cmsg[PKTINFO_SPACE];
 	struct msghdr msg = {
 		.msg_name = src,
@@ -624,21 +658,7 @@ static int udp_peek_addr(int s, union sockaddr_inany *src,
 		return -errno;
 	}
 
-	hdr = CMSG_FIRSTHDR(&msg);
-	if (hdr && hdr->cmsg_level == IPPROTO_IP &&
-	    hdr->cmsg_type == IP_PKTINFO) {
-		const struct in_pktinfo *info4 = (void *)CMSG_DATA(hdr);
-
-		*dst = inany_from_v4(info4->ipi_addr);
-	} else if (hdr && hdr->cmsg_level == IPPROTO_IPV6 &&
-		   hdr->cmsg_type == IPV6_PKTINFO) {
-		const struct in6_pktinfo *info6 = (void *)CMSG_DATA(hdr);
-
-		dst->a6 = info6->ipi6_addr;
-	} else {
-		debug("Unexpected cmsg on UDP datagram");
-		*dst = inany_any6;
-	}
+	udp_pktinfo(&msg, dst);
 
 	trace("Peeked UDP datagram: %s -> %s",
 	      sockaddr_ntop(src, sastr, sizeof(sastr)),

From cfc0ee145a5cdd29b6e584171085dac6539b86c0 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Tue, 15 Apr 2025 17:16:23 +1000
Subject: [PATCH 358/382] udp: Minor re-organisation of udp_sock_recverr()

Usually we work with the "exit early" flow style, where we return early
on "error" conditions in functions.  We don't currently do this in
udp_sock_recverr() for the case where we don't have a flow to associate
the error with.

Reorganise to use the "exit early" style, which will make some subsequent
changes less awkward.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 udp.c | 44 +++++++++++++++++++++++++-------------------
 1 file changed, 25 insertions(+), 19 deletions(-)

diff --git a/udp.c b/udp.c
index 97034f6..e8240fe 100644
--- a/udp.c
+++ b/udp.c
@@ -530,6 +530,9 @@ static int udp_sock_recverr(const struct ctx *c, int s, flow_sidx_t sidx)
 		.msg_control = buf,
 		.msg_controllen = sizeof(buf),
 	};
+	const struct flowside *toside;
+	flow_sidx_t tosidx;
+	size_t dlen;
 	ssize_t rc;
 
 	rc = recvmsg(s, &mh, MSG_ERRQUEUE);
@@ -560,29 +563,32 @@ static int udp_sock_recverr(const struct ctx *c, int s, flow_sidx_t sidx)
 	}
 
 	eh = (const struct errhdr *)CMSG_DATA(hdr);
-	if (flow_sidx_valid(sidx)) {
-		flow_sidx_t tosidx = flow_sidx_opposite(sidx);
-		const struct flowside *toside = flowside_at_sidx(tosidx);
-		size_t dlen = rc;
 
-		if (pif_is_socket(pif_at_sidx(tosidx))) {
-			/* XXX Is there any way to propagate ICMPs from socket
-			 * to socket? */
-		} else if (hdr->cmsg_level == IPPROTO_IP) {
-			dlen = MIN(dlen, ICMP4_MAX_DLEN);
-			udp_send_tap_icmp4(c, &eh->ee, toside,
-					   eh->saddr.sa4.sin_addr, data, dlen);
-		} else if (hdr->cmsg_level == IPPROTO_IPV6) {
-			udp_send_tap_icmp6(c, &eh->ee, toside,
-					   &eh->saddr.sa6.sin6_addr, data,
-					   dlen, sidx.flowi);
-		}
-	} else {
-		trace("Ignoring received IP_RECVERR cmsg on listener socket");
-	}
 	debug("%s error on UDP socket %i: %s",
 	      str_ee_origin(&eh->ee), s, strerror_(eh->ee.ee_errno));
 
+	if (!flow_sidx_valid(sidx)) {
+		trace("Ignoring received IP_RECVERR cmsg on listener socket");
+		return 1;
+	}
+
+	tosidx = flow_sidx_opposite(sidx);
+	toside = flowside_at_sidx(tosidx);
+	dlen = rc;
+
+	if (pif_is_socket(pif_at_sidx(tosidx))) {
+		/* XXX Is there any way to propagate ICMPs from socket to
+		 * socket? */
+	} else if (hdr->cmsg_level == IPPROTO_IP) {
+		dlen = MIN(dlen, ICMP4_MAX_DLEN);
+		udp_send_tap_icmp4(c, &eh->ee, toside,
+				   eh->saddr.sa4.sin_addr, data, dlen);
+	} else if (hdr->cmsg_level == IPPROTO_IPV6) {
+		udp_send_tap_icmp6(c, &eh->ee, toside,
+				   &eh->saddr.sa6.sin6_addr, data,
+				   dlen, sidx.flowi);
+	}
+
 	return 1;
 }
 

From 2340bbf867e6c3c3b5ac67345b0e841ab49bbaa5 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Tue, 15 Apr 2025 17:16:24 +1000
Subject: [PATCH 359/382] udp: Propagate errors on listening and brand new
 sockets

udp_sock_recverr() processes errors on UDP sockets and attempts to
propagate them as ICMP packets on the tap interface.  To do this it
currently requires the flow with which the error is associated as a
parameter.  If that's missing it will clear the error condition, but not
propagate it.

That means that we largely ignore errors on "listening" sockets.  It also
means we may discard some errors on flow specific sockets if they occur
very shortly after the socket is created.  In udp_flush_flow() we need to
clear any datagrams received between bind() and connect() which might not
be associated with the "final" flow for the socket.  If we get errors
before that point we'll ignore them in the same way because we don't know
the flow they're associated with in advance.

This can happen in practice if we have errors which occur almost
immediately after connect(), such as ECONNREFUSED when we connect() to a
local address where nothing is listening.

Between the extended error message itself and the PKTINFO information we
do actually have enough information to find the correct flow.  So, rather
than ignoring errors where we don't have a flow "hint", determine the flow
the hard way in udp_sock_recverr().
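
For reference, this is the shape of the information available on each
error queue entry, sketched with the plain socket API (recv_udp_error()
is an illustrative name; the real lookup uses flow_lookup_sa() as shown
in the hunk below): msg_name carries the remote endpoint the original
datagram was sent to, and the PKTINFO cmsg carries the local address,
which together with the local port identifies the flow.

  #define _GNU_SOURCE     /* for struct in6_pktinfo on glibc */
  #include <string.h>
  #include <netinet/in.h>
  #include <sys/socket.h>
  #include <linux/errqueue.h>

  /* Pull one entry off a UDP socket's error queue and return the pieces
   * needed to find the flow: the remote endpoint (msg_name) and, via the
   * PKTINFO cmsg, the local address of the original datagram.
   */
  static int recv_udp_error(int s, struct sockaddr_storage *remote,
                            struct in_pktinfo *local4,
                            struct in6_pktinfo *local6,
                            struct sock_extended_err *ee)
  {
          char data[1280], cbuf[256];
          struct iovec iov = { .iov_base = data, .iov_len = sizeof(data) };
          struct msghdr mh = {
                  .msg_name = remote, .msg_namelen = sizeof(*remote),
                  .msg_iov = &iov, .msg_iovlen = 1,
                  .msg_control = cbuf, .msg_controllen = sizeof(cbuf),
          };
          struct cmsghdr *hdr;
          int found = 0;

          if (recvmsg(s, &mh, MSG_ERRQUEUE | MSG_DONTWAIT) < 0)
                  return -1;

          for (hdr = CMSG_FIRSTHDR(&mh); hdr; hdr = CMSG_NXTHDR(&mh, hdr)) {
                  int v4 = hdr->cmsg_level == IPPROTO_IP;
                  int v6 = hdr->cmsg_level == IPPROTO_IPV6;

                  if ((v4 && hdr->cmsg_type == IP_RECVERR) ||
                      (v6 && hdr->cmsg_type == IPV6_RECVERR)) {
                          memcpy(ee, CMSG_DATA(hdr), sizeof(*ee));
                          found = 1;
                  } else if (v4 && hdr->cmsg_type == IP_PKTINFO) {
                          memcpy(local4, CMSG_DATA(hdr), sizeof(*local4));
                  } else if (v6 && hdr->cmsg_type == IPV6_PKTINFO) {
                          memcpy(local6, CMSG_DATA(hdr), sizeof(*local6));
                  }
          }

          return found ? 0 : -1;
  }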

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
[sbrivio: Change warn() to debug() in udp_sock_recverr()]
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 udp.c | 41 ++++++++++++++++++++++++++++++++---------
 1 file changed, 32 insertions(+), 9 deletions(-)

diff --git a/udp.c b/udp.c
index e8240fe..57769d0 100644
--- a/udp.c
+++ b/udp.c
@@ -504,27 +504,34 @@ static int udp_pktinfo(struct msghdr *msg, union inany_addr *dst)
  * @c:		Execution context
  * @s:		Socket to receive errors from
  * @sidx:	Flow and side of @s, or FLOW_SIDX_NONE if unknown
+ * @pif:	Interface on which the error occurred
+ *              (only used if @sidx == FLOW_SIDX_NONE)
+ * @port:	Local port number of @s (only used if @sidx == FLOW_SIDX_NONE)
  *
  * Return: 1 if error received and processed, 0 if no more errors in queue, < 0
  *         if there was an error reading the queue
  *
  * #syscalls recvmsg
  */
-static int udp_sock_recverr(const struct ctx *c, int s, flow_sidx_t sidx)
+static int udp_sock_recverr(const struct ctx *c, int s, flow_sidx_t sidx,
+			    uint8_t pif, in_port_t port)
 {
 	struct errhdr {
 		struct sock_extended_err ee;
 		union sockaddr_inany saddr;
 	};
 	char buf[PKTINFO_SPACE + CMSG_SPACE(sizeof(struct errhdr))];
+	const struct errhdr *eh = NULL;
 	char data[ICMP6_MAX_DLEN];
-	const struct errhdr *eh;
 	struct cmsghdr *hdr;
 	struct iovec iov = {
 		.iov_base = data,
 		.iov_len = sizeof(data)
 	};
+	union sockaddr_inany src;
 	struct msghdr mh = {
+		.msg_name = &src,
+		.msg_namelen = sizeof(src),
 		.msg_iov = &iov,
 		.msg_iovlen = 1,
 		.msg_control = buf,
@@ -554,7 +561,7 @@ static int udp_sock_recverr(const struct ctx *c, int s, flow_sidx_t sidx)
 		      hdr->cmsg_type == IP_RECVERR) ||
 		     (hdr->cmsg_level == IPPROTO_IPV6 &&
 		      hdr->cmsg_type == IPV6_RECVERR))
-		    break;
+			break;
 	}
 
 	if (!hdr) {
@@ -568,8 +575,19 @@ static int udp_sock_recverr(const struct ctx *c, int s, flow_sidx_t sidx)
 	      str_ee_origin(&eh->ee), s, strerror_(eh->ee.ee_errno));
 
 	if (!flow_sidx_valid(sidx)) {
-		trace("Ignoring received IP_RECVERR cmsg on listener socket");
-		return 1;
+		/* No hint from the socket, determine flow from addresses */
+		union inany_addr dst;
+
+		if (udp_pktinfo(&mh, &dst) < 0) {
+			debug("Missing PKTINFO on UDP error");
+			return 1;
+		}
+
+		sidx = flow_lookup_sa(c, IPPROTO_UDP, pif, &src, &dst, port);
+		if (!flow_sidx_valid(sidx)) {
+			debug("Ignoring UDP error without flow");
+			return 1;
+		}
 	}
 
 	tosidx = flow_sidx_opposite(sidx);
@@ -597,10 +615,14 @@ static int udp_sock_recverr(const struct ctx *c, int s, flow_sidx_t sidx)
  * @c:		Execution context
  * @s:		Socket to receive errors from
  * @sidx:	Flow and side of @s, or FLOW_SIDX_NONE if unknown
+ * @pif:	Interface on which the error occurred
+ *              (only used if @sidx == FLOW_SIDX_NONE)
+ * @port:	Local port number of @s (only used if @sidx == FLOW_SIDX_NONE)
  *
  * Return: Number of errors handled, or < 0 if we have an unrecoverable error
  */
-static int udp_sock_errs(const struct ctx *c, int s, flow_sidx_t sidx)
+static int udp_sock_errs(const struct ctx *c, int s, flow_sidx_t sidx,
+			 uint8_t pif, in_port_t port)
 {
 	unsigned n_err = 0;
 	socklen_t errlen;
@@ -609,7 +631,7 @@ static int udp_sock_errs(const struct ctx *c, int s, flow_sidx_t sidx)
 	ASSERT(!c->no_udp);
 
 	/* Empty the error queue */
-	while ((rc = udp_sock_recverr(c, s, sidx)) > 0)
+	while ((rc = udp_sock_recverr(c, s, sidx, pif, port)) > 0)
 		n_err += rc;
 
 	if (rc < 0)
@@ -776,7 +798,8 @@ void udp_sock_fwd(const struct ctx *c, int s, uint8_t frompif,
 			trace("Error peeking at socket address: %s",
 			      strerror_(-rc));
 			/* Clear errors & carry on */
-			if (udp_sock_errs(c, s, FLOW_SIDX_NONE) < 0) {
+			if (udp_sock_errs(c, s, FLOW_SIDX_NONE,
+					  frompif, port) < 0) {
 				err(
 "UDP: Unrecoverable error on listening socket: (%s port %hu)",
 				    pif_name(frompif), port);
@@ -837,7 +860,7 @@ void udp_sock_handler(const struct ctx *c, union epoll_ref ref,
 	ASSERT(!c->no_udp && uflow);
 
 	if (events & EPOLLERR) {
-		if (udp_sock_errs(c, ref.fd, ref.flowside) < 0) {
+		if (udp_sock_errs(c, ref.fd, ref.flowside, PIF_NONE, 0) < 0) {
 			flow_err(uflow, "Unrecoverable error on flow socket");
 			goto fail;
 		}

From 9128f6e8f47d94c761b5fd8c0d0b8308758cbdc5 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Thu, 17 Apr 2025 11:55:40 +1000
Subject: [PATCH 360/382] fwd: Split out helpers for port-independent NAT

Currently the functions fwd_nat_from_*() make some address translations
based on both the IP address and protocol port numbers, and others based
only on the address.  We have some upcoming cases where it's useful to use
the IP-address-only translations separately, so split them out into helper
functions.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 fwd.c | 87 ++++++++++++++++++++++++++++++++++++++++++-----------------
 1 file changed, 62 insertions(+), 25 deletions(-)

diff --git a/fwd.c b/fwd.c
index 2829cd2..5c70e83 100644
--- a/fwd.c
+++ b/fwd.c
@@ -323,6 +323,30 @@ static bool fwd_guest_accessible(const struct ctx *c,
 	return fwd_guest_accessible6(c, &addr->a6);
 }
 
+/**
+ * nat_outbound() - Apply address translation for outbound (TAP to HOST)
+ * @c:		Execution context
+ * @addr:	Input address (as seen on TAP interface)
+ * @translated:	Output address (as seen on HOST interface)
+ *
+ * Only handles translations that depend *only* on the address.  Anything
+ * related to specific ports or flows is handled elsewhere.
+ */
+static void nat_outbound(const struct ctx *c, const union inany_addr *addr,
+			 union inany_addr *translated)
+{
+	if (inany_equals4(addr, &c->ip4.map_host_loopback))
+		*translated = inany_loopback4;
+	else if (inany_equals6(addr, &c->ip6.map_host_loopback))
+		*translated = inany_loopback6;
+	else if (inany_equals4(addr, &c->ip4.map_guest_addr))
+		*translated = inany_from_v4(c->ip4.addr);
+	else if (inany_equals6(addr, &c->ip6.map_guest_addr))
+		translated->a6 = c->ip6.addr;
+	else
+		*translated = *addr;
+}
+
 /**
  * fwd_nat_from_tap() - Determine to forward a flow from the tap interface
  * @c:		Execution context
@@ -342,16 +366,8 @@ uint8_t fwd_nat_from_tap(const struct ctx *c, uint8_t proto,
 	else if (is_dns_flow(proto, ini) &&
 		   inany_equals6(&ini->oaddr, &c->ip6.dns_match))
 		tgt->eaddr.a6 = c->ip6.dns_host;
-	else if (inany_equals4(&ini->oaddr, &c->ip4.map_host_loopback))
-		tgt->eaddr = inany_loopback4;
-	else if (inany_equals6(&ini->oaddr, &c->ip6.map_host_loopback))
-		tgt->eaddr = inany_loopback6;
-	else if (inany_equals4(&ini->oaddr, &c->ip4.map_guest_addr))
-		tgt->eaddr = inany_from_v4(c->ip4.addr);
-	else if (inany_equals6(&ini->oaddr, &c->ip6.map_guest_addr))
-		tgt->eaddr.a6 = c->ip6.addr;
 	else
-		tgt->eaddr = ini->oaddr;
+		nat_outbound(c, &ini->oaddr, &tgt->eaddr);
 
 	tgt->eport = ini->oport;
 
@@ -423,6 +439,42 @@ uint8_t fwd_nat_from_splice(const struct ctx *c, uint8_t proto,
 	return PIF_HOST;
 }
 
+/**
+ * nat_inbound() - Apply address translation for outbound (HOST to TAP)
+ * @c:		Execution context
+ * @addr:	Input address (as seen on HOST interface)
+ * @translated:	Output address (as seen on TAP interface)
+ *
+ * Return: true on success, false if it couldn't translate the address
+ *
+ * Only handles translations that depend *only* on the address.  Anything
+ * related to specific ports or flows is handled elsewhere.
+ */
+static bool nat_inbound(const struct ctx *c, const union inany_addr *addr,
+			 union inany_addr *translated)
+{
+	if (!IN4_IS_ADDR_UNSPECIFIED(&c->ip4.map_host_loopback) &&
+	    inany_equals4(addr, &in4addr_loopback)) {
+		/* Specifically 127.0.0.1, not 127.0.0.0/8 */
+		*translated = inany_from_v4(c->ip4.map_host_loopback);
+	} else if (!IN6_IS_ADDR_UNSPECIFIED(&c->ip6.map_host_loopback) &&
+		   inany_equals6(addr, &in6addr_loopback)) {
+		translated->a6 = c->ip6.map_host_loopback;
+	} else if (!IN4_IS_ADDR_UNSPECIFIED(&c->ip4.map_guest_addr) &&
+		   inany_equals4(addr, &c->ip4.addr)) {
+		*translated = inany_from_v4(c->ip4.map_guest_addr);
+	} else if (!IN6_IS_ADDR_UNSPECIFIED(&c->ip6.map_guest_addr) &&
+		   inany_equals6(addr, &c->ip6.addr)) {
+		translated->a6 = c->ip6.map_guest_addr;
+	} else if (fwd_guest_accessible(c, addr)) {
+		*translated = *addr;
+	} else {
+		return false;
+	}
+
+	return true;
+}
+
 /**
  * fwd_nat_from_host() - Determine to forward a flow from the host interface
  * @c:		Execution context
@@ -479,20 +531,7 @@ uint8_t fwd_nat_from_host(const struct ctx *c, uint8_t proto,
 		return PIF_SPLICE;
 	}
 
-	if (!IN4_IS_ADDR_UNSPECIFIED(&c->ip4.map_host_loopback) &&
-	    inany_equals4(&ini->eaddr, &in4addr_loopback)) {
-		/* Specifically 127.0.0.1, not 127.0.0.0/8 */
-		tgt->oaddr = inany_from_v4(c->ip4.map_host_loopback);
-	} else if (!IN6_IS_ADDR_UNSPECIFIED(&c->ip6.map_host_loopback) &&
-		   inany_equals6(&ini->eaddr, &in6addr_loopback)) {
-		tgt->oaddr.a6 = c->ip6.map_host_loopback;
-	} else if (!IN4_IS_ADDR_UNSPECIFIED(&c->ip4.map_guest_addr) &&
-		   inany_equals4(&ini->eaddr, &c->ip4.addr)) {
-		tgt->oaddr = inany_from_v4(c->ip4.map_guest_addr);
-	} else if (!IN6_IS_ADDR_UNSPECIFIED(&c->ip6.map_guest_addr) &&
-		   inany_equals6(&ini->eaddr, &c->ip6.addr)) {
-		tgt->oaddr.a6 = c->ip6.map_guest_addr;
-	} else if (!fwd_guest_accessible(c, &ini->eaddr)) {
+	if (!nat_inbound(c, &ini->eaddr, &tgt->oaddr)) {
 		if (inany_v4(&ini->eaddr)) {
 			if (IN4_IS_ADDR_UNSPECIFIED(&c->ip4.our_tap_addr))
 				/* No source address we can use */
@@ -501,8 +540,6 @@ uint8_t fwd_nat_from_host(const struct ctx *c, uint8_t proto,
 		} else {
 			tgt->oaddr.a6 = c->ip6.our_tap_ll;
 		}
-	} else {
-		tgt->oaddr = ini->eaddr;
 	}
 	tgt->oport = ini->eport;
 

From 4668e9137806b551f6ee44609064cc40243c2b6b Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Thu, 17 Apr 2025 11:55:41 +1000
Subject: [PATCH 361/382] treewide: Improve robustness against sockaddrs of
 unexpected family

inany_from_sockaddr() expects a socket address of family AF_INET or
AF_INET6 and ASSERT()s if it gets anything else.  In many of the callers we
can handle an unexpected family more gracefully, though, e.g. by failing
a single flow rather than killing passt.

Change inany_from_sockaddr() to return an error instead of ASSERT()ing,
and handle those errors in the callers.  Improve the reporting of any such
errors while we're at it.

With this greater robustness, allow inany_from_sockaddr() to take a void *
rather than specifically a union sockaddr_inany *.
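
The resulting calling pattern, as a standalone sketch with plain socket
types (port_from_sockaddr() and local_port() are illustrative names;
passt's inany_from_sockaddr() and its callers are in the hunks below): a
bad address family now fails a single lookup instead of aborting the
whole process.

  #include <stdio.h>
  #include <netinet/in.h>
  #include <sys/socket.h>

  /* Report an unexpected address family to the caller instead of
   * asserting on it.
   */
  static int port_from_sockaddr(unsigned short *port, const void *addr)
  {
          const struct sockaddr *sa = addr;

          if (sa->sa_family == AF_INET6) {
                  *port = ntohs(((const struct sockaddr_in6 *)sa)->sin6_port);
                  return 0;
          }

          if (sa->sa_family == AF_INET) {
                  *port = ntohs(((const struct sockaddr_in *)sa)->sin_port);
                  return 0;
          }

          return -1;      /* unknown family: let the caller decide */
  }

  static int local_port(int s, unsigned short *port)
  {
          struct sockaddr_storage sa;
          socklen_t sl = sizeof(sa);

          /* Fail this one lookup, not the whole process */
          if (getsockname(s, (struct sockaddr *)&sa, &sl) ||
              port_from_sockaddr(port, &sa) < 0) {
                  perror("can't get local port");
                  return -1;
          }

          return 0;
  }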

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 flow.c     | 16 ++++++++++++++--
 inany.h    | 30 ++++++++++++++++++------------
 tcp.c      | 10 ++++------
 udp_flow.c |  6 +++---
 4 files changed, 39 insertions(+), 23 deletions(-)

diff --git a/flow.c b/flow.c
index 3c81cb4..447c021 100644
--- a/flow.c
+++ b/flow.c
@@ -408,7 +408,12 @@ struct flowside *flow_initiate_sa(union flow *flow, uint8_t pif,
 {
 	struct flowside *ini = &flow->f.side[INISIDE];
 
-	inany_from_sockaddr(&ini->eaddr, &ini->eport, ssa);
+	if (inany_from_sockaddr(&ini->eaddr, &ini->eport, ssa) < 0) {
+		char str[SOCKADDR_STRLEN];
+
+		ASSERT_WITH_MSG(0, "Bad socket address %s",
+				sockaddr_ntop(ssa, str, sizeof(str)));
+	}
 	if (daddr)
 		ini->oaddr = *daddr;
 	else if (inany_v4(&ini->eaddr))
@@ -768,7 +773,14 @@ flow_sidx_t flow_lookup_sa(const struct ctx *c, uint8_t proto, uint8_t pif,
 		.oport = oport,
 	};
 
-	inany_from_sockaddr(&side.eaddr, &side.eport, esa);
+	if (inany_from_sockaddr(&side.eaddr, &side.eport, esa) < 0) {
+		char str[SOCKADDR_STRLEN];
+
+		warn("Flow lookup on bad socket address %s",
+		     sockaddr_ntop(esa, str, sizeof(str)));
+		return FLOW_SIDX_NONE;
+	}
+
 	if (oaddr)
 		side.oaddr = *oaddr;
 	else if (inany_v4(&side.eaddr))
diff --git a/inany.h b/inany.h
index 1c247e1..7ca5cbd 100644
--- a/inany.h
+++ b/inany.h
@@ -237,24 +237,30 @@ static inline void inany_from_af(union inany_addr *aa,
 }
 
 /** inany_from_sockaddr - Extract IPv[46] address and port number from sockaddr
- * @aa:		Pointer to store IPv[46] address
+ * @dst:	Pointer to store IPv[46] address (output)
  * @port:	Pointer to store port number, host order
- * @addr:	AF_INET or AF_INET6 socket address
+ * @addr:	Socket address
+ *
+ * Return: 0 on success, -1 on error (bad address family)
  */
-static inline void inany_from_sockaddr(union inany_addr *aa, in_port_t *port,
-				       const union sockaddr_inany *sa)
+static inline int inany_from_sockaddr(union inany_addr *dst, in_port_t *port,
+				      const void *addr)
 {
+	const union sockaddr_inany *sa = (const union sockaddr_inany *)addr;
+
 	if (sa->sa_family == AF_INET6) {
-		inany_from_af(aa, AF_INET6, &sa->sa6.sin6_addr);
+		inany_from_af(dst, AF_INET6, &sa->sa6.sin6_addr);
 		*port = ntohs(sa->sa6.sin6_port);
-	} else if (sa->sa_family == AF_INET) {
-		inany_from_af(aa, AF_INET, &sa->sa4.sin_addr);
-		*port = ntohs(sa->sa4.sin_port);
-	} else {
-		/* Not valid to call with other address families */
-		ASSERT_WITH_MSG(0, "Unexpected sockaddr family: %u",
-				sa->sa_family);
+		return 0;
 	}
+
+	if (sa->sa_family == AF_INET) {
+		inany_from_af(dst, AF_INET, &sa->sa4.sin_addr);
+		*port = ntohs(sa->sa4.sin_port);
+		return 0;
+	}
+
+	return -1;
 }
 
 /** inany_siphash_feed- Fold IPv[46] address into an in-progress siphash
diff --git a/tcp.c b/tcp.c
index 9c6bc52..0ac298a 100644
--- a/tcp.c
+++ b/tcp.c
@@ -1546,9 +1546,8 @@ static void tcp_conn_from_tap(const struct ctx *c, sa_family_t af,
 
 	if (c->mode == MODE_VU) { /* To rebind to same oport after migration */
 		sl = sizeof(sa);
-		if (!getsockname(s, &sa.sa, &sl))
-			inany_from_sockaddr(&tgt->oaddr, &tgt->oport, &sa);
-		else
+		if (getsockname(s, &sa.sa, &sl) ||
+		    inany_from_sockaddr(&tgt->oaddr, &tgt->oport, &sa) < 0)
 			err_perror("Can't get local address for socket %i", s);
 	}
 
@@ -2204,9 +2203,8 @@ void tcp_listen_handler(const struct ctx *c, union epoll_ref ref,
 			       NULL, ref.tcp_listen.port);
 
 	if (c->mode == MODE_VU) { /* Rebind to same address after migration */
-		if (!getsockname(s, &sa.sa, &sl))
-			inany_from_sockaddr(&ini->oaddr, &ini->oport, &sa);
-		else
+		if (getsockname(s, &sa.sa, &sl) ||
+		    inany_from_sockaddr(&ini->oaddr, &ini->oport, &sa) < 0)
 			err_perror("Can't get local address for socket %i", s);
 	}
 
diff --git a/udp_flow.c b/udp_flow.c
index ef2cbb0..fea1cf3 100644
--- a/udp_flow.c
+++ b/udp_flow.c
@@ -158,12 +158,12 @@ static flow_sidx_t udp_flow_new(const struct ctx *c, union flow *flow,
 		socklen_t sl = sizeof(sa);
 		in_port_t port;
 
-		if (getsockname(uflow->s[TGTSIDE], &sa.sa, &sl) < 0) {
+		if (getsockname(uflow->s[TGTSIDE], &sa.sa, &sl) < 0 ||
+		    inany_from_sockaddr(&uflow->f.side[TGTSIDE].oaddr,
+					&port, &sa) < 0) {
 			flow_perror(uflow, "Unable to determine local address");
 			goto cancel;
 		}
-		inany_from_sockaddr(&uflow->f.side[TGTSIDE].oaddr,
-				    &port, &sa);
 		if (port != tgt->oport) {
 			flow_err(uflow, "Unexpected local port");
 			goto cancel;

From 08e617ec2ba916d8250a41d3ac68183124a6ec3e Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Thu, 17 Apr 2025 11:55:42 +1000
Subject: [PATCH 362/382] udp: Rework offender address handling in
 udp_sock_recverr()

Make a number of changes to udp_sock_recverr() to improve the robustness
of how we handle addresses.

 * Get the "offender" address (source of the ICMP packet) using the
   SO_EE_OFFENDER() macro, reducing assumptions about structure layout (a
   minimal sketch of this follows the list).
 * Parse the offender sockaddr using inany_from_sockaddr()
 * Check explicitly that the source and destination pifs are what we
   expect.  Previously we checked something that was probably equivalent
   in practice, but isn't strictly speaking what we require for the rest
   of the code.
 * Verify that for an ICMPv4 error we also have an IPv4 source/offender
   and destination/endpoint address
 * Verify that for an ICMPv6 error we have an IPv6 endpoint
 * Improve debug reporting of any failures
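
A minimal standalone sketch of the SO_EE_OFFENDER() usage from the first
point above (err_offender() is an illustrative name; the real handling,
including the inany_from_sockaddr() parsing and the pif checks, is in the
hunk below):

  #include <stddef.h>
  #include <sys/socket.h>
  #include <linux/errqueue.h>

  /* SO_EE_OFFENDER() points at the sockaddr of the node that generated
   * the error (for ICMP errors, the source of the ICMP message), stored
   * right after the extended error structure; the family is AF_UNSPEC
   * when no offender address is available.
   */
  static const struct sockaddr *err_offender(struct sock_extended_err *ee)
  {
          const struct sockaddr *sa = SO_EE_OFFENDER(ee);

          return sa->sa_family == AF_UNSPEC ? NULL : sa;
  }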

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 udp.c | 69 +++++++++++++++++++++++++++++++++++++++++------------------
 1 file changed, 48 insertions(+), 21 deletions(-)

diff --git a/udp.c b/udp.c
index 57769d0..d09b3eb 100644
--- a/udp.c
+++ b/udp.c
@@ -159,6 +159,12 @@ udp_meta[UDP_MAX_FRAMES];
 	MAX(CMSG_SPACE(sizeof(struct in_pktinfo)),	\
 	    CMSG_SPACE(sizeof(struct in6_pktinfo)))
 
+#define RECVERR_SPACE							\
+	MAX(CMSG_SPACE(sizeof(struct sock_extended_err) +		\
+		       sizeof(struct sockaddr_in)),			\
+	    CMSG_SPACE(sizeof(struct sock_extended_err) +		\
+		       sizeof(struct sockaddr_in6)))
+
 /**
  * enum udp_iov_idx - Indices for the buffers making up a single UDP frame
  * @UDP_IOV_TAP         tap specific header
@@ -516,12 +522,8 @@ static int udp_pktinfo(struct msghdr *msg, union inany_addr *dst)
 static int udp_sock_recverr(const struct ctx *c, int s, flow_sidx_t sidx,
 			    uint8_t pif, in_port_t port)
 {
-	struct errhdr {
-		struct sock_extended_err ee;
-		union sockaddr_inany saddr;
-	};
-	char buf[PKTINFO_SPACE + CMSG_SPACE(sizeof(struct errhdr))];
-	const struct errhdr *eh = NULL;
+	char buf[PKTINFO_SPACE + RECVERR_SPACE];
+	const struct sock_extended_err *ee;
 	char data[ICMP6_MAX_DLEN];
 	struct cmsghdr *hdr;
 	struct iovec iov = {
@@ -538,7 +540,13 @@ static int udp_sock_recverr(const struct ctx *c, int s, flow_sidx_t sidx,
 		.msg_controllen = sizeof(buf),
 	};
 	const struct flowside *toside;
-	flow_sidx_t tosidx;
+	char astr[INANY_ADDRSTRLEN];
+	char sastr[SOCKADDR_STRLEN];
+	union inany_addr offender;
+	const struct in_addr *o4;
+	in_port_t offender_port;
+	struct udp_flow *uflow;
+	uint8_t topif;
 	size_t dlen;
 	ssize_t rc;
 
@@ -569,10 +577,10 @@ static int udp_sock_recverr(const struct ctx *c, int s, flow_sidx_t sidx,
 		return -1;
 	}
 
-	eh = (const struct errhdr *)CMSG_DATA(hdr);
+	ee = (const struct sock_extended_err *)CMSG_DATA(hdr);
 
 	debug("%s error on UDP socket %i: %s",
-	      str_ee_origin(&eh->ee), s, strerror_(eh->ee.ee_errno));
+	      str_ee_origin(ee), s, strerror_(ee->ee_errno));
 
 	if (!flow_sidx_valid(sidx)) {
 		/* No hint from the socket, determine flow from addresses */
@@ -588,25 +596,44 @@ static int udp_sock_recverr(const struct ctx *c, int s, flow_sidx_t sidx,
 			debug("Ignoring UDP error without flow");
 			return 1;
 		}
+	} else {
+		pif = pif_at_sidx(sidx);
 	}
 
-	tosidx = flow_sidx_opposite(sidx);
-	toside = flowside_at_sidx(tosidx);
+	uflow = udp_at_sidx(sidx);
+	ASSERT(uflow);
+	toside = &uflow->f.side[!sidx.sidei];
+	topif = uflow->f.pif[!sidx.sidei];
 	dlen = rc;
 
-	if (pif_is_socket(pif_at_sidx(tosidx))) {
-		/* XXX Is there any way to propagate ICMPs from socket to
-		 * socket? */
-	} else if (hdr->cmsg_level == IPPROTO_IP) {
+	if (inany_from_sockaddr(&offender, &offender_port,
+				SO_EE_OFFENDER(ee)) < 0)
+		goto fail;
+
+	if (pif != PIF_HOST || topif != PIF_TAP)
+		/* XXX Can we support any other cases? */
+		goto fail;
+
+	if (hdr->cmsg_level == IPPROTO_IP &&
+	    (o4 = inany_v4(&offender)) && inany_v4(&toside->eaddr)) {
 		dlen = MIN(dlen, ICMP4_MAX_DLEN);
-		udp_send_tap_icmp4(c, &eh->ee, toside,
-				   eh->saddr.sa4.sin_addr, data, dlen);
-	} else if (hdr->cmsg_level == IPPROTO_IPV6) {
-		udp_send_tap_icmp6(c, &eh->ee, toside,
-				   &eh->saddr.sa6.sin6_addr, data,
-				   dlen, sidx.flowi);
+		udp_send_tap_icmp4(c, ee, toside, *o4, data, dlen);
+		return 1;
 	}
 
+	if (hdr->cmsg_level == IPPROTO_IPV6 && !inany_v4(&toside->eaddr)) {
+		udp_send_tap_icmp6(c, ee, toside, &offender.a6, data, dlen,
+				   FLOW_IDX(uflow));
+		return 1;
+	}
+
+fail:
+	flow_dbg(uflow, "Can't propagate %s error from %s %s to %s %s",
+		 str_ee_origin(ee),
+		 pif_name(pif),
+		 sockaddr_ntop(SO_EE_OFFENDER(ee), sastr, sizeof(sastr)),
+		 pif_name(topif),
+		 inany_ntop(&toside->eaddr, astr, sizeof(astr)));
 	return 1;
 }
 

From 436afc30447c6f0ce516f2b38c769833114bb5f8 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Thu, 17 Apr 2025 11:55:43 +1000
Subject: [PATCH 363/382] udp: Translate offender addresses for ICMP messages

We've recently added support for propagating ICMP errors related to a UDP
flow from the host to the guest, by handling the extended UDP error on the
socket and synthesizing a suitable ICMP on the tap interface.

Currently we create that ICMP with a source address of the "offender" from
the extended error information - the source of the ICMP error received on
the host.  However, we don't translate this address for cases where we NAT
between host and guest.  This means (amongst other things) that we won't
get a "Connection refused" error as expected if send data from the guest to
the --map-host-loopback address.  The error comes from 127.0.0.1 on the
host, which doesn't make sense on the tap interface and will be discarded
by the guest.

Because ICMP errors can be sent by an intermediate host, not just by the
endpoints of the flow, we can't handle this translation purely with the
information in the flow table entry.  We need to explicitly translate this
address by our NAT rules, which we can do with the nat_inbound() helper.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 fwd.c |  4 ++--
 fwd.h |  3 +++
 udp.c | 18 ++++++++++++++----
 3 files changed, 19 insertions(+), 6 deletions(-)

diff --git a/fwd.c b/fwd.c
index 5c70e83..b73c2c8 100644
--- a/fwd.c
+++ b/fwd.c
@@ -450,8 +450,8 @@ uint8_t fwd_nat_from_splice(const struct ctx *c, uint8_t proto,
  * Only handles translations that depend *only* on the address.  Anything
  * related to specific ports or flows is handled elsewhere.
  */
-static bool nat_inbound(const struct ctx *c, const union inany_addr *addr,
-			 union inany_addr *translated)
+bool nat_inbound(const struct ctx *c, const union inany_addr *addr,
+		 union inany_addr *translated)
 {
 	if (!IN4_IS_ADDR_UNSPECIFIED(&c->ip4.map_host_loopback) &&
 	    inany_equals4(addr, &in4addr_loopback)) {
diff --git a/fwd.h b/fwd.h
index 3562f3c..0458a3c 100644
--- a/fwd.h
+++ b/fwd.h
@@ -7,6 +7,7 @@
 #ifndef FWD_H
 #define FWD_H
 
+union inany_addr;
 struct flowside;
 
 /* Number of ports for both TCP and UDP */
@@ -47,6 +48,8 @@ void fwd_scan_ports_udp(struct fwd_ports *fwd, const struct fwd_ports *rev,
 			const struct fwd_ports *tcp_rev);
 void fwd_scan_ports_init(struct ctx *c);
 
+bool nat_inbound(const struct ctx *c, const union inany_addr *addr,
+		 union inany_addr *translated);
 uint8_t fwd_nat_from_tap(const struct ctx *c, uint8_t proto,
 			 const struct flowside *ini, struct flowside *tgt);
 uint8_t fwd_nat_from_splice(const struct ctx *c, uint8_t proto,
diff --git a/udp.c b/udp.c
index d09b3eb..f5a5cd1 100644
--- a/udp.c
+++ b/udp.c
@@ -539,10 +539,10 @@ static int udp_sock_recverr(const struct ctx *c, int s, flow_sidx_t sidx,
 		.msg_control = buf,
 		.msg_controllen = sizeof(buf),
 	};
-	const struct flowside *toside;
+	const struct flowside *fromside, *toside;
+	union inany_addr offender, otap;
 	char astr[INANY_ADDRSTRLEN];
 	char sastr[SOCKADDR_STRLEN];
-	union inany_addr offender;
 	const struct in_addr *o4;
 	in_port_t offender_port;
 	struct udp_flow *uflow;
@@ -602,6 +602,7 @@ static int udp_sock_recverr(const struct ctx *c, int s, flow_sidx_t sidx,
 
 	uflow = udp_at_sidx(sidx);
 	ASSERT(uflow);
+	fromside = &uflow->f.side[sidx.sidei];
 	toside = &uflow->f.side[!sidx.sidei];
 	topif = uflow->f.pif[!sidx.sidei];
 	dlen = rc;
@@ -614,15 +615,24 @@ static int udp_sock_recverr(const struct ctx *c, int s, flow_sidx_t sidx,
 		/* XXX Can we support any other cases? */
 		goto fail;
 
+	/* If the offender *is* the endpoint, make sure our translation is
+	 * consistent with the flow's translation.  This matters if the flow
+	 * endpoint has a port specific translation (like --dns-match).
+	 */
+	if (inany_equals(&offender, &fromside->eaddr))
+		otap = toside->oaddr;
+	else if (!nat_inbound(c, &offender, &otap))
+		goto fail;
+
 	if (hdr->cmsg_level == IPPROTO_IP &&
-	    (o4 = inany_v4(&offender)) && inany_v4(&toside->eaddr)) {
+	    (o4 = inany_v4(&otap)) && inany_v4(&toside->eaddr)) {
 		dlen = MIN(dlen, ICMP4_MAX_DLEN);
 		udp_send_tap_icmp4(c, ee, toside, *o4, data, dlen);
 		return 1;
 	}
 
 	if (hdr->cmsg_level == IPPROTO_IPV6 && !inany_v4(&toside->eaddr)) {
-		udp_send_tap_icmp6(c, ee, toside, &offender.a6, data, dlen,
+		udp_send_tap_icmp6(c, ee, toside, &otap.a6, data, dlen,
 				   FLOW_IDX(uflow));
 		return 1;
 	}

From aa1cc8922867b8f7c17742f8da3b9fcc6291bbeb Mon Sep 17 00:00:00 2001
From: Alyssa Ross <hi@alyssa.is>
Date: Sat, 26 Apr 2025 10:44:25 +0200
Subject: [PATCH 364/382] conf: allow --fd 0
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

inetd-style socket passing traditionally starts a service with a
connected socket on file descriptors 0 and 1.  passt disallowing
obtaining its socket from either of these descriptors made it
difficult to use with super-servers providing this interface — in my
case I wanted to use passt with s6-ipcserver[1].  Since (as far as I
can tell) passt does not use standard input for anything else (unlike
standard output), it should be safe to relax the restrictions on --fd
to allow setting it to 0, enabling this use case.
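
Restated as a tiny standalone helper (fd_tap_option_valid() is an
illustrative name; the actual conf.c and util.c checks are in the hunks
below):

  #include <limits.h>
  #include <unistd.h>

  /* --fd is valid if it's stdin (inetd-style socket passing) or any
   * descriptor above stderr; standard output and standard error stay
   * reserved.
   */
  static int fd_tap_option_valid(long fd)
  {
          return fd <= INT_MAX &&
                 (fd == STDIN_FILENO || fd > STDERR_FILENO);
  }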

Link: https://skarnet.org/software/s6/s6-ipcserver.html [1]
Signed-off-by: Alyssa Ross <hi@alyssa.is>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 conf.c | 3 ++-
 util.c | 4 +++-
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/conf.c b/conf.c
index f942851..a6d7e22 100644
--- a/conf.c
+++ b/conf.c
@@ -1717,7 +1717,8 @@ void conf(struct ctx *c, int argc, char **argv)
 			fd_tap_opt = strtol(optarg, NULL, 0);
 
 			if (errno ||
-			    fd_tap_opt <= STDERR_FILENO || fd_tap_opt > INT_MAX)
+			    (fd_tap_opt != STDIN_FILENO && fd_tap_opt <= STDERR_FILENO) ||
+			    fd_tap_opt > INT_MAX)
 				die("Invalid --fd: %s", optarg);
 
 			c->fd_tap = fd_tap_opt;
diff --git a/util.c b/util.c
index 62a6003..f5497d4 100644
--- a/util.c
+++ b/util.c
@@ -875,7 +875,9 @@ void close_open_files(int argc, char **argv)
 			errno = 0;
 			fd = strtol(optarg, NULL, 0);
 
-			if (errno || fd <= STDERR_FILENO || fd > INT_MAX)
+			if (errno ||
+			    (fd != STDIN_FILENO && fd <= STDERR_FILENO) ||
+			    fd > INT_MAX)
 				die("Invalid --fd: %s", optarg);
 		}
 	} while (name != -1);

From ea0a1240df671de221f469327899564ed74b5edd Mon Sep 17 00:00:00 2001
From: Stefano Brivio <sbrivio@redhat.com>
Date: Wed, 30 Apr 2025 16:48:34 +0200
Subject: [PATCH 365/382] passt-repair: Hide bogus gcc warning from -Og
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When building with gcc 13 and -Og, we get:

passt-repair.c: In function ‘main’:
passt-repair.c:161:23: warning: ‘ev’ may be used uninitialized [-Wmaybe-uninitialized]
  161 |                 if (ev->len > NAME_MAX + 1 || ev->name[ev->len - 1] != '\0') {
      |                     ~~^~~~~

but that can't actually happen, because we only exit the preceding
while loop if 'found' is true, and that only happens, in turn, as we
assign 'ev'.

Get rid of the warning by (redundantly) initialising ev to NULL.

Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 passt-repair.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/passt-repair.c b/passt-repair.c
index 256a8c9..ff1c44f 100644
--- a/passt-repair.c
+++ b/passt-repair.c
@@ -113,7 +113,7 @@ int main(int argc, char **argv)
 	if ((sb.st_mode & S_IFMT) == S_IFDIR) {
 		char buf[sizeof(struct inotify_event) + NAME_MAX + 1]
 		   __attribute__ ((aligned(__alignof__(struct inotify_event))));
-		const struct inotify_event *ev;
+		const struct inotify_event *ev = NULL;
 		char path[PATH_MAX + 1];
 		bool found = false;
 		ssize_t n;

From 6a96cd97a5fda26a8f12531a72f6a969e476ad9e Mon Sep 17 00:00:00 2001
From: Stefano Brivio <sbrivio@redhat.com>
Date: Wed, 30 Apr 2025 16:59:13 +0200
Subject: [PATCH 366/382] util: Fix typo, ASSSERTION -> ASSERTION

Fixes: 9153aca15bc1 ("util: Add abort_with_msg() and ASSERT_WITH_MSG() helpers")
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
---
 util.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/util.h b/util.h
index cc7d084..5947337 100644
--- a/util.h
+++ b/util.h
@@ -75,7 +75,7 @@ void abort_with_msg(const char *fmt, ...)
 #define ASSERT_WITH_MSG(expr, ...)					\
 	((expr) ? (void)0 : abort_with_msg(__VA_ARGS__))
 #define ASSERT(expr)							\
-	ASSERT_WITH_MSG((expr), "ASSSERTION FAILED in %s (%s:%d): %s",	\
+	ASSERT_WITH_MSG((expr), "ASSERTION FAILED in %s (%s:%d): %s",	\
 			__func__, __FILE__, __LINE__, STRINGIFY(expr))
 
 #ifdef P_tmpdir

From 11be695f5c0a6a7d74e9628e9863e665f59d511f Mon Sep 17 00:00:00 2001
From: Laurent Vivier <lvivier@redhat.com>
Date: Wed, 30 Apr 2025 18:05:25 +0200
Subject: [PATCH 367/382] flow: fix podman issue #25959

While running piHole using podman, traffic can trigger the following
assert:

ASSSERTION FAILED in flow_alloc (flow.c:521): flow->f.state == FLOW_STATE_FREE

Backtrace shows that this happens in flow_defer_handler():

    #4  0x00005610d6f5b481 flow_alloc (passt + 0xb481)
    #5  0x00005610d6f74f86 udp_flow_from_sock (passt + 0x24f86)
    #6  0x00005610d6f737c3 udp_sock_fwd (passt + 0x237c3)
    #7  0x00005610d6f74c07 udp_flush_flow (passt + 0x24c07)
    #8  0x00005610d6f752c2 udp_flow_defer (passt + 0x252c2)
    #9  0x00005610d6f5bce1 flow_defer_handler (passt + 0xbce1)

We are trying to allocate a new flow inside the loop that frees them.

Inside the loop free_head points to the first free flow entry in the
current cluster. But if we allocate a new entry during the loop,
free_head is not updated and can point now to the entry we have just
allocated.

We can fix the problem by splitting the loop in two parts (see the toy
sketch after the list):
- first part where we can close some of them and allocate some new
  flow entries,
- second part where we free the entries closed in the previous loop
  and we aggregate the free entries to merge consecutive clusters.
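
A toy sketch of the two-pass structure (illustration only, over a
simplified table; the real code keeps the flow state machine and the
free cluster merging shown in the hunk below):

  #include <stdbool.h>
  #include <stddef.h>

  #define TABLE_MAX 128

  struct entry {
          bool in_use;
          /* ...per-entry state... */
  };

  static struct entry table[TABLE_MAX];

  /* First pass: only decide what to close.  The callback may allocate
   * new entries in the same table without invalidating this walk,
   * because nothing is freed yet.
   */
  static void mark_pass(bool to_free[TABLE_MAX],
                        bool (*should_close)(struct entry *))
  {
          size_t i;

          for (i = 0; i < TABLE_MAX; i++)
                  to_free[i] = table[i].in_use && should_close(&table[i]);
  }

  /* Second pass: actually free the marked entries (and, in the real
   * code, merge the resulting free slots into clusters), with no
   * allocation possible in between.
   */
  static void sweep_pass(const bool to_free[TABLE_MAX])
  {
          size_t i;

          for (i = 0; i < TABLE_MAX; i++)
                  if (to_free[i])
                          table[i].in_use = false;
  }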

Reported-by: Martin Rijntjes <bugs@air-global.nl>
Link: https://github.com/containers/podman/issues/25959
Fixes: 9725e7988837 ("udp_flow: Don't discard packets that arrive between bind() and connect()")
Signed-off-by: Laurent Vivier <lvivier@redhat.com>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 flow.c | 109 ++++++++++++++++++++++++++++++---------------------------
 1 file changed, 58 insertions(+), 51 deletions(-)

diff --git a/flow.c b/flow.c
index 447c021..c5718e3 100644
--- a/flow.c
+++ b/flow.c
@@ -800,6 +800,7 @@ void flow_defer_handler(const struct ctx *c, const struct timespec *now)
 {
 	struct flow_free_cluster *free_head = NULL;
 	unsigned *last_next = &flow_first_free;
+	bool to_free[FLOW_MAX] = { 0 };
 	bool timer = false;
 	union flow *flow;
 
@@ -810,9 +811,44 @@ void flow_defer_handler(const struct ctx *c, const struct timespec *now)
 
 	ASSERT(!flow_new_entry); /* Incomplete flow at end of cycle */
 
-	flow_foreach_slot(flow) {
+	/* Check which flows we might need to close first, but don't free them
+	 * yet as it's not safe to do that in the middle of flow_foreach().
+	 */
+	flow_foreach(flow) {
 		bool closed = false;
 
+		switch (flow->f.type) {
+		case FLOW_TYPE_NONE:
+			ASSERT(false);
+			break;
+		case FLOW_TCP:
+			closed = tcp_flow_defer(&flow->tcp);
+			break;
+		case FLOW_TCP_SPLICE:
+			closed = tcp_splice_flow_defer(&flow->tcp_splice);
+			if (!closed && timer)
+				tcp_splice_timer(c, &flow->tcp_splice);
+			break;
+		case FLOW_PING4:
+		case FLOW_PING6:
+			if (timer)
+				closed = icmp_ping_timer(c, &flow->ping, now);
+			break;
+		case FLOW_UDP:
+			closed = udp_flow_defer(c, &flow->udp, now);
+			if (!closed && timer)
+				closed = udp_flow_timer(c, &flow->udp, now);
+			break;
+		default:
+			/* Assume other flow types don't need any handling */
+			;
+		}
+
+		to_free[FLOW_IDX(flow)] = closed;
+	}
+
+	/* Second step: actually free the flows */
+	flow_foreach_slot(flow) {
 		switch (flow->f.state) {
 		case FLOW_STATE_FREE: {
 			unsigned skip = flow->free.n;
@@ -845,59 +881,30 @@ void flow_defer_handler(const struct ctx *c, const struct timespec *now)
 			break;
 
 		case FLOW_STATE_ACTIVE:
-			/* Nothing to do */
-			break;
+			if (to_free[FLOW_IDX(flow)]) {
+				flow_set_state(&flow->f, FLOW_STATE_FREE);
+				memset(flow, 0, sizeof(*flow));
 
-		default:
-			ASSERT(false);
-		}
-
-		switch (flow->f.type) {
-		case FLOW_TYPE_NONE:
-			ASSERT(false);
-			break;
-		case FLOW_TCP:
-			closed = tcp_flow_defer(&flow->tcp);
-			break;
-		case FLOW_TCP_SPLICE:
-			closed = tcp_splice_flow_defer(&flow->tcp_splice);
-			if (!closed && timer)
-				tcp_splice_timer(c, &flow->tcp_splice);
-			break;
-		case FLOW_PING4:
-		case FLOW_PING6:
-			if (timer)
-				closed = icmp_ping_timer(c, &flow->ping, now);
-			break;
-		case FLOW_UDP:
-			closed = udp_flow_defer(c, &flow->udp, now);
-			if (!closed && timer)
-				closed = udp_flow_timer(c, &flow->udp, now);
-			break;
-		default:
-			/* Assume other flow types don't need any handling */
-			;
-		}
-
-		if (closed) {
-			flow_set_state(&flow->f, FLOW_STATE_FREE);
-			memset(flow, 0, sizeof(*flow));
-
-			if (free_head) {
-				/* Add slot to current free cluster */
-				ASSERT(FLOW_IDX(flow) ==
-				       FLOW_IDX(free_head) + free_head->n);
-				free_head->n++;
-				flow->free.n = flow->free.next = 0;
+				if (free_head) {
+					/* Add slot to current free cluster */
+					ASSERT(FLOW_IDX(flow) ==
+					    FLOW_IDX(free_head) + free_head->n);
+					free_head->n++;
+					flow->free.n = flow->free.next = 0;
+				} else {
+					/* Create new free cluster */
+					free_head = &flow->free;
+					free_head->n = 1;
+					*last_next = FLOW_IDX(flow);
+					last_next = &free_head->next;
+				}
 			} else {
-				/* Create new free cluster */
-				free_head = &flow->free;
-				free_head->n = 1;
-				*last_next = FLOW_IDX(flow);
-				last_next = &free_head->next;
+				free_head = NULL;
 			}
-		} else {
-			free_head = NULL;
+			break;
+
+		default:
+			ASSERT(false);
 		}
 	}
 

From 93394f4ef0966602b2ada8f72beaf75352add7b1 Mon Sep 17 00:00:00 2001
From: Janne Grunau <janne-psst@jannau.net>
Date: Thu, 1 May 2025 11:54:07 +0200
Subject: [PATCH 368/382] selinux: Add getattr to class udp_socket

Commit 59cc89f ("udp, udp_flow: Track our specific address on socket
interfaces") added a getsockname() call in udp_flow_new(). This requires
getattr. Fixes "Flow 0 (UDP flow): Unable to determine local address:
Permission denied" errors in muvm/passt on Fedora Linux 42 with SELinux.

The SELinux audit message is

| type=AVC msg=audit(1746083799.606:235): avc:  denied  { getattr } for
|   pid=2961 comm="passt" laddr=127.0.0.1 lport=49221
|   faddr=127.0.0.53 fport=53
|   scontext=unconfined_u:unconfined_r:passt_t:s0-s0:c0.c1023
|   tcontext=unconfined_u:unconfined_r:passt_t:s0-s0:c0.c1023
|   tclass=udp_socket permissive=0

Fixes: 59cc89f4cc01 ("udp, udp_flow: Track our specific address on socket interfaces")
Link: https://bugzilla.redhat.com/show_bug.cgi?id=2363238
Signed-off-by: Janne Grunau <janne-psst@jannau.net>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 contrib/selinux/passt.te | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/contrib/selinux/passt.te b/contrib/selinux/passt.te
index f8ea672..eb9ce72 100644
--- a/contrib/selinux/passt.te
+++ b/contrib/selinux/passt.te
@@ -49,7 +49,7 @@ require {
 	type proc_net_t;
 	type node_t;
 	class tcp_socket { create accept listen name_bind name_connect getattr ioctl };
-	class udp_socket { create accept listen };
+	class udp_socket { create accept listen getattr };
 	class icmp_socket { bind create name_bind node_bind setopt read write };
 	class sock_file { create unlink write };
 
@@ -133,7 +133,7 @@ allow passt_t node_t:icmp_socket { name_bind node_bind };
 allow passt_t port_t:icmp_socket name_bind;
 
 allow passt_t self:tcp_socket { create getopt setopt connect bind listen accept shutdown read write getattr ioctl };
-allow passt_t self:udp_socket { create getopt setopt connect bind read write };
+allow passt_t self:udp_socket { create getopt setopt connect bind read write getattr };
 allow passt_t self:icmp_socket { bind create setopt read write };
 
 allow passt_t user_tmp_t:dir { add_name write };

From f0021f9e1d4f118f4167149b256346f3dfea9d2b Mon Sep 17 00:00:00 2001
From: Emanuel Valasiadis <emanuel@valasiadis.space>
Date: Fri, 2 May 2025 15:31:39 +0200
Subject: [PATCH 369/382] fwd: fix doc typo

Signed-off-by: Emanuel Valasiadis <emanuel@valasiadis.space>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 fwd.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fwd.c b/fwd.c
index b73c2c8..49aabc3 100644
--- a/fwd.c
+++ b/fwd.c
@@ -440,7 +440,7 @@ uint8_t fwd_nat_from_splice(const struct ctx *c, uint8_t proto,
 }
 
 /**
- * nat_inbound() - Apply address translation for outbound (HOST to TAP)
+ * nat_inbound() - Apply address translation for inbound (HOST to TAP)
  * @c:		Execution context
  * @addr:	Input address (as seen on HOST interface)
  * @translated:	Output address (as seen on TAP interface)

From 587980ca1e9d5645f6738f67ec3f15cc61a7efa3 Mon Sep 17 00:00:00 2001
From: Stefano Brivio <sbrivio@redhat.com>
Date: Fri, 2 May 2025 21:56:30 +0200
Subject: [PATCH 370/382] udp: Actually discard datagrams we can't forward

Given that udp_sock_fwd() now loops on udp_peek_addr() to get endpoint
addresses for datagrams, if we can't forward one of these datagrams,
we need to make sure we actually discard it. Otherwise, with MSG_PEEK,
we never dequeue it and loop on it forever.
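
One way to drop a datagram that was only peeked at is a receive with no
buffer; a minimal sketch (udp_discard_datagram() is an illustrative name,
the mechanism actually used by this patch is the one in the hunk below):

  #include <sys/socket.h>

  /* A datagram examined with MSG_PEEK stays queued: if it can't be
   * forwarded, dequeue it explicitly, or the next peek returns the same
   * datagram again.  Receiving into no buffer is enough to drop it.
   */
  static void udp_discard_datagram(int s)
  {
          struct msghdr mh = { 0 };

          (void)recvmsg(s, &mh, MSG_DONTWAIT);
  }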

For example, if we fail to create a socket for a new flow, because,
say, the destination of an inbound packet is multicast, and we can't
bind() to a multicast address, the loop will look like this:

18.0563: Flow 0 (NEW): FREE -> NEW
18.0563: Flow 0 (INI): NEW -> INI
18.0563: Flow 0 (INI): HOST [127.0.0.1]:42487 -> [127.0.0.1]:9997 => ?
18.0563: Flow 0 (TGT): INI -> TGT
18.0563: Flow 0 (TGT): HOST [127.0.0.1]:42487 -> [ff02::c]:9997 => SPLICE [0.0.0.0]:42487 -> [88.198.0.164]:9997
18.0563: Flow 0 (UDP flow): TGT -> TYPED
18.0564: Flow 0 (UDP flow): HOST [127.0.0.1]:42487 -> [ff02::c]:9997 => SPLICE [0.0.0.0]:42487 -> [88.198.0.164]:9997
18.0564: Flow 0 (UDP flow): Couldn't open flow specific socket: Invalid argument
18.0564: Flow 0 (FREE): TYPED -> FREE
18.0564: Flow 0 (FREE): HOST [127.0.0.1]:42487 -> [ff02::c]:9997 => SPLICE [0.0.0.0]:42487 -> [88.198.0.164]:9997
18.0564: Discarding datagram without flow
18.0564: Flow 0 (NEW): FREE -> NEW
18.0564: Flow 0 (INI): NEW -> INI
18.0564: Flow 0 (INI): HOST [127.0.0.1]:42487 -> [127.0.0.1]:9997 => ?
18.0564: Flow 0 (TGT): INI -> TGT
18.0564: Flow 0 (TGT): HOST [127.0.0.1]:42487 -> [ff02::c]:9997 => SPLICE [0.0.0.0]:42487 -> [88.198.0.164]:9997
18.0564: Flow 0 (UDP flow): TGT -> TYPED
18.0564: Flow 0 (UDP flow): HOST [127.0.0.1]:42487 -> [ff02::c]:9997 => SPLICE [0.0.0.0]:42487 -> [88.198.0.164]:9997
18.0564: Flow 0 (UDP flow): Couldn't open flow specific socket: Invalid argument
18.0564: Flow 0 (FREE): TYPED -> FREE
18.0564: Flow 0 (FREE): HOST [127.0.0.1]:42487 -> [ff02::c]:9997 => SPLICE [0.0.0.0]:42487 -> [88.198.0.164]:9997
18.0564: Discarding datagram without flow

and seen from strace:

epoll_wait(3, [{events=EPOLLIN, data=0x1076c00000705}], 8, 1000) = 1
recvmsg(7, {msg_name={sa_family=AF_INET6, sin6_port=htons(55899), sin6_flowinfo=htonl(0), inet_pton(AF_INET6, "fe80::26e8:53ff:fef3:13b6", &sin6_addr), sin6_scope_id=if_nametoindex("wlp4s0")}, msg_namelen=28, msg_iov=NULL, msg_iovlen=0, msg_control=[{cmsg_len=36, cmsg_level=SOL_IPV6, cmsg_type=0x32, cmsg_data="\xff\x02\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x0c\x03\x00\x00\x00"}], msg_controllen=40, msg_flags=MSG_TRUNC}, MSG_PEEK|MSG_DONTWAIT) = 0
socket(AF_INET6, SOCK_DGRAM|SOCK_NONBLOCK, IPPROTO_UDP) = 12
setsockopt(12, SOL_IPV6, IPV6_V6ONLY, [1], 4) = 0
setsockopt(12, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
setsockopt(12, SOL_IPV6, IPV6_RECVERR, [1], 4) = 0
setsockopt(12, SOL_IPV6, IPV6_RECVPKTINFO, [1], 4) = 0
bind(12, {sa_family=AF_INET6, sin6_port=htons(1900), sin6_flowinfo=htonl(0), inet_pton(AF_INET6, "ff02::c", &sin6_addr), sin6_scope_id=0}, 28) = -1 EINVAL (Invalid argument)
close(12)                               = 0
recvmsg(7, {msg_name={sa_family=AF_INET6, sin6_port=htons(55899), sin6_flowinfo=htonl(0), inet_pton(AF_INET6, "fe80::26e8:53ff:fef3:13b6", &sin6_addr), sin6_scope_id=if_nametoindex("wlp4s0")}, msg_namelen=28, msg_iov=NULL, msg_iovlen=0, msg_control=[{cmsg_len=36, cmsg_level=SOL_IPV6, cmsg_type=0x32, cmsg_data="\xff\x02\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x0c\x03\x00\x00\x00"}], msg_controllen=40, msg_flags=MSG_TRUNC}, MSG_PEEK|MSG_DONTWAIT) = 0
socket(AF_INET6, SOCK_DGRAM|SOCK_NONBLOCK, IPPROTO_UDP) = 12
setsockopt(12, SOL_IPV6, IPV6_V6ONLY, [1], 4) = 0
setsockopt(12, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
setsockopt(12, SOL_IPV6, IPV6_RECVERR, [1], 4) = 0
setsockopt(12, SOL_IPV6, IPV6_RECVPKTINFO, [1], 4) = 0
bind(12, {sa_family=AF_INET6, sin6_port=htons(1900), sin6_flowinfo=htonl(0), inet_pton(AF_INET6, "ff02::c", &sin6_addr), sin6_scope_id=0}, 28) = -1 EINVAL (Invalid argument)
close(12)                               = 0
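
The discard itself amounts to a recvmsg() without MSG_PEEK and with no data
buffers, which dequeues (and truncates) the head-of-queue datagram. A
minimal sketch of just that step (the function name is illustrative; the
actual change below also logs failures):

#include <sys/socket.h>

/* Sketch only, not the exact passt code */
static void udp_discard_one(int s)
{
	struct msghdr msg = { 0 };

	/* No data buffers: the payload is truncated, but the datagram is
	 * dequeued, unlike with the MSG_PEEK reads above.
	 */
	(void)recvmsg(s, &msg, MSG_DONTWAIT);
}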

Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
---
 udp.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/udp.c b/udp.c
index f5a5cd1..ca28b37 100644
--- a/udp.c
+++ b/udp.c
@@ -828,6 +828,7 @@ void udp_sock_fwd(const struct ctx *c, int s, uint8_t frompif,
 	int rc;
 
 	while ((rc = udp_peek_addr(s, &src, &dst)) != 0) {
+		bool discard = false;
 		flow_sidx_t tosidx;
 		uint8_t topif;
 
@@ -861,8 +862,17 @@ void udp_sock_fwd(const struct ctx *c, int s, uint8_t frompif,
 			flow_err(uflow,
 				 "No support for forwarding UDP from %s to %s",
 				 pif_name(frompif), pif_name(topif));
+			discard = true;
 		} else {
 			debug("Discarding datagram without flow");
+			discard = true;
+		}
+
+		if (discard) {
+			struct msghdr msg = { 0 };
+
+			if (recvmsg(s, &msg, MSG_DONTWAIT) < 0)
+				debug_perror("Failed to discard datagram");
 		}
 	}
 }

From eea8a76caf85f4bae5f92b695d09b9ddea354b57 Mon Sep 17 00:00:00 2001
From: Laurent Vivier <lvivier@redhat.com>
Date: Wed, 7 May 2025 14:36:34 +0200
Subject: [PATCH 371/382] flow: fix podman issue #26073

While running pasta, we trigger the following assert:

  ASSERTION FAILED in udp_at_sidx (udp_flow.c:35): flow->f.type == FLOW_UDP

in udp_at_sidx() in the following path:

 902 void udp_sock_handler(const struct ctx *c, union epoll_ref ref,
 903                       uint32_t events, const struct timespec *now)
 904 {
 905         struct udp_flow *uflow = udp_at_sidx(ref.flowside);

The invalid sidx comes from the epoll_ref provided by epoll_wait().

The assertion is triggered right after the following error:

  Couldn't connect flow socket: Permission denied

It appears that an error happens in udp_flow_sock() and the recently
created fd is not removed from the epoll_ctl() pool:

 71 static int udp_flow_sock(const struct ctx *c,
 72                          struct udp_flow *uflow, unsigned sidei)
 73 {
...
 82         s = flowside_sock_l4(c, EPOLL_TYPE_UDP, pif, side, fref.data);
 83         if (s < 0) {
 84                 flow_dbg_perror(uflow, "Couldn't open flow specific socket");
 85                 return s;
 86         }
 87
 88         if (flowside_connect(c, s, pif, side) < 0) {
 89                 int rc = -errno;
 90                 flow_dbg_perror(uflow, "Couldn't connect flow socket");
 91                 return rc;
 92         }
...

flowside_sock_l4() calls sock_l4_sa() that adds 's' to the epoll_ctl()
pool.

So, to cleanly handle a flowside_connect() error, we need to remove
's' from the epoll_ctl() pool using epoll_del().
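
The pattern, sketched with a raw epoll_ctl() call instead of passt's
epoll_del() helper (names are illustrative):

#include <sys/epoll.h>

/* Sketch only: passt uses its epoll_del() helper for this */
static void forget_flow_socket(int epollfd, int s)
{
	/* Otherwise epoll keeps delivering events whose data still refers
	 * to the flow we are about to free on this error path.
	 */
	epoll_ctl(epollfd, EPOLL_CTL_DEL, s, NULL);
}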

Link: https://github.com/containers/podman/issues/26073
Signed-off-by: Laurent Vivier <lvivier@redhat.com>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 udp_flow.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/udp_flow.c b/udp_flow.c
index fea1cf3..b3a13b7 100644
--- a/udp_flow.c
+++ b/udp_flow.c
@@ -87,6 +87,10 @@ static int udp_flow_sock(const struct ctx *c,
 
 	if (flowside_connect(c, s, pif, side) < 0) {
 		int rc = -errno;
+
+		if (pif == PIF_HOST)
+			epoll_del(c, s);
+
 		flow_dbg_perror(uflow, "Couldn't connect flow socket");
 		return rc;
 	}

From 92d5d680134455f1a5b51fd8a3e9e64c99ac6d13 Mon Sep 17 00:00:00 2001
From: Laurent Vivier <lvivier@redhat.com>
Date: Tue, 6 May 2025 16:13:25 +0200
Subject: [PATCH 372/382] flow: fix wrong macro name in comments

The name of the macro for the maximum number of flows is FLOW_MAX, not
MAX_FLOW.

Signed-off-by: Laurent Vivier <lvivier@redhat.com>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 flow.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/flow.c b/flow.c
index c5718e3..6a5c8aa 100644
--- a/flow.c
+++ b/flow.c
@@ -81,7 +81,7 @@ static_assert(ARRAY_SIZE(flow_proto) == FLOW_NUM_TYPES,
  *
  * Free cluster list
  *    flow_first_free gives the index of the first (lowest index) free cluster.
- *    Each free cluster has the index of the next free cluster, or MAX_FLOW if
+ *    Each free cluster has the index of the next free cluster, or FLOW_MAX if
  *    it is the last free cluster.  Together these form a linked list of free
  *    clusters, in strictly increasing order of index.
  *

From 8ec134109eb136432a29bdf5a14f8b1fd4e46208 Mon Sep 17 00:00:00 2001
From: Laurent Vivier <lvivier@redhat.com>
Date: Mon, 12 May 2025 18:47:00 +0200
Subject: [PATCH 373/382] flow: close socket fd on error

In eea8a76caf85 ("flow: fix podman issue #26073"), we unregister
the fd from epoll_ctl() in case of error, but we also need to close it.

As flowside_sock_l4() also calls sock_l4_sa() via flowside_sock_splice(),
we can do it unconditionally.

Fixes: eea8a76caf85 ("flow: fix podman issue #26073")
Signed-off-by: Laurent Vivier <lvivier@redhat.com>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 udp_flow.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/udp_flow.c b/udp_flow.c
index b3a13b7..4c6b3c2 100644
--- a/udp_flow.c
+++ b/udp_flow.c
@@ -88,8 +88,8 @@ static int udp_flow_sock(const struct ctx *c,
 	if (flowside_connect(c, s, pif, side) < 0) {
 		int rc = -errno;
 
-		if (pif == PIF_HOST)
-			epoll_del(c, s);
+		epoll_del(c, s);
+		close(s);
 
 		flow_dbg_perror(uflow, "Couldn't connect flow socket");
 		return rc;

From 570e7b4454f2f879180ae3ca13dedd759aff5243 Mon Sep 17 00:00:00 2001
From: Laurent Vivier <lvivier@redhat.com>
Date: Tue, 13 May 2025 11:40:59 +0200
Subject: [PATCH 374/382] dhcpv6: fix GCC error
 (unterminated-string-initialization)

The status_msg field, initialized from STR_NOTONLINK, is intentionally
not NUL-terminated. Suppress the GCC error using
__attribute__((nonstring)).

This error is reported by GCC 15.1.1 on Fedora 42. However,
Clang 20.1.3 does not support __attribute__((nonstring)).
Therefore, NOLINTNEXTLINE(clang-diagnostic-unknown-attributes)
is also added to suppress Clang's unknown attribute warning.
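
A reduced example of the construct, with a stand-in literal rather than the
real dhcpv6.c definitions:

#include <stdint.h>

#define STR_SKETCH "NOTONLINK"	/* stand-in for STR_NOTONLINK */

/* Reduced sketch, not the real struct opt_status_code */
struct status_sketch {
	uint16_t code;
	/* Exactly fits the characters and drops the literal's NUL; without
	 * "nonstring", GCC 15 warns via -Wunterminated-string-initialization.
	 */
	__attribute__((nonstring)) char status_msg[sizeof(STR_SKETCH) - 1];
};

static const struct status_sketch sc = {
	.code = 4,	/* arbitrary value for the sketch */
	.status_msg = STR_SKETCH,
};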

Signed-off-by: Laurent Vivier <lvivier@redhat.com>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 dhcpv6.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/dhcpv6.c b/dhcpv6.c
index 373a988..ba16c66 100644
--- a/dhcpv6.c
+++ b/dhcpv6.c
@@ -144,7 +144,9 @@ struct opt_ia_addr {
 struct opt_status_code {
 	struct opt_hdr hdr;
 	uint16_t code;
-	char status_msg[sizeof(STR_NOTONLINK) - 1];
+	/* "nonstring" is only supported since clang 23 */
+	/* NOLINTNEXTLINE(clang-diagnostic-unknown-attributes) */
+	__attribute__((nonstring)) char status_msg[sizeof(STR_NOTONLINK) - 1];
 } __attribute__((packed));
 
 /**

From a6b9832e495be636bcccf25e0aebdeb564addf06 Mon Sep 17 00:00:00 2001
From: Laurent Vivier <lvivier@redhat.com>
Date: Tue, 13 May 2025 11:41:00 +0200
Subject: [PATCH 375/382] virtio: Fix Clang warning
 (bugprone-sizeof-expression, cert-arr39-c)

In `virtqueue_read_indirect_desc()`, the pointer arithmetic involving
`desc` is intentional. We add the length in bytes (`read_len`)
divided by the size of `struct vring_desc` to `desc`, which is
an array of `struct vring_desc`. This correctly calculates the
offset in terms of the number of `struct vring_desc` elements.

Clang issues the following warning due to this explicit scaling:

virtio.c:238:8: error: suspicious usage of 'sizeof(...)' in pointer
arithmetic; this scaled value will be scaled again by the '+='
operator [bugprone-sizeof-expression,cert-arr39-c,-Werror]
  238 |         desc += read_len / sizeof(struct vring_desc);
      |               ^            ~~~~~~~~~~~~~~~~~~~~~~~~~
virtio.c:238:8: note: '+=' in pointer arithmetic internally scales
with 'sizeof(struct vring_desc)' == 16

This behavior is intended, so the warning can be considered a
false positive in this context. The code correctly advances the
pointer by the desired number of descriptor entries.
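
A worked illustration of the intended scaling, with a stand-in type of the
same 16-byte size rather than the real struct vring_desc:

#include <stddef.h>
#include <stdint.h>

struct vring_desc_sketch {	/* stand-in: 8 + 4 + 2 + 2 = 16 bytes */
	uint64_t addr;
	uint32_t len;
	uint16_t flags;
	uint16_t next;
};

static struct vring_desc_sketch *
advance_bytes(struct vring_desc_sketch *desc, size_t read_len)
{
	/* '+=' (or '+') on a typed pointer already scales by sizeof(*desc),
	 * so dividing the byte count by the element size advances the
	 * pointer by read_len bytes in total.
	 */
	return desc + read_len / sizeof(*desc);
}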

Signed-off-by: Laurent Vivier <lvivier@redhat.com>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 virtio.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/virtio.c b/virtio.c
index bc2b89a..f7db007 100644
--- a/virtio.c
+++ b/virtio.c
@@ -235,6 +235,7 @@ static int virtqueue_read_indirect_desc(const struct vu_dev *dev,
 		memcpy(desc, orig_desc, read_len);
 		len -= read_len;
 		addr += read_len;
+		/* NOLINTNEXTLINE(bugprone-sizeof-expression,cert-arr39-c) */
 		desc += read_len / sizeof(struct vring_desc);
 	}
 

From 0f7bf10b0a5542690dc6c75e4b56a6030ca8a663 Mon Sep 17 00:00:00 2001
From: Laurent Vivier <lvivier@redhat.com>
Date: Tue, 13 May 2025 11:41:01 +0200
Subject: [PATCH 376/382] ndp: Fix Clang analyzer warning
 (clang-analyzer-security.PointerSub)

Addresses Clang warning: "Subtraction of two pointers that do not
point into the same array is undefined behavior" for the line:
  `ndp_send(c, dst, &ra, ptr - (unsigned char *)&ra);`

Here, `ptr` is `&ra.var[0]`. The subtraction calculates the offset
of `var[0]` within the `struct ra_options ra`. Since `ptr` points
inside `ra`, this pointer arithmetic is well-defined for
calculating the size of the data to send, even if `ptr` and `&ra`
are not strictly considered part of the same "array" by the analyzer.
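
A reduced illustration of why the subtraction is well defined here; the
layout and names are illustrative only, not the real ndp.c structures:

#include <stddef.h>
#include <string.h>

struct ra_sketch {		/* illustrative layout only */
	unsigned char hdr[16];	/* fixed header part */
	unsigned char var[64];	/* options appended after it */
};

static size_t ra_fill(struct ra_sketch *ra)
{
	unsigned char *ptr = ra->var;

	memset(ptr, 0, 8);	/* pretend one 8-byte option was appended */
	ptr += 8;

	/* ptr still points within *ra, so the offset is well defined:
	 * offsetof(struct ra_sketch, var) + 8 = 24 bytes to send here.
	 */
	return (size_t)(ptr - (unsigned char *)ra);
}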

Signed-off-by: Laurent Vivier <lvivier@redhat.com>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 ndp.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/ndp.c b/ndp.c
index ded2081..b664034 100644
--- a/ndp.c
+++ b/ndp.c
@@ -328,6 +328,7 @@ static void ndp_ra(const struct ctx *c, const struct in6_addr *dst)
 
 	memcpy(&ra.source_ll.mac, c->our_tap_mac, ETH_ALEN);
 
+	/* NOLINTNEXTLINE(clang-analyzer-security.PointerSub) */
 	ndp_send(c, dst, &ra, ptr - (unsigned char *)&ra);
 }
 

From 2d3d69c5c348d18112596bd3fdeed95689c613c8 Mon Sep 17 00:00:00 2001
From: Laurent Vivier <lvivier@redhat.com>
Date: Tue, 13 May 2025 11:41:02 +0200
Subject: [PATCH 377/382] flow: Fix clang error
 (clang-analyzer-security.PointerSub)

Fixes the following clang-analyzer warning:

flow_table.h:96:25: note: Subtraction of two pointers that do not point into the same array is undefined behavior
   96 |         return (union flow *)f - flowtab;

The `flow_idx()` function is called via `FLOW_IDX()` from
`flow_foreach_slot()`, where `f` is set to `&flowtab[idx].f`.
Therefore, `f` and `flowtab` do point to the same array.
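
The same reasoning in a self-contained form, with illustrative types and
sizes instead of the real flow table:

/* Illustrative stand-ins for union flow / flowtab[] */
union flow_sketch {
	int common;
	char pad[64];
};

static union flow_sketch table_sketch[128];

static unsigned flow_idx_sketch(const int *f)
{
	/* f points at table_sketch[i].common; a pointer to a union member,
	 * suitably converted, also points to the union itself, so both
	 * operands point into the same array and the result is i.
	 */
	return (unsigned)((const union flow_sketch *)(const void *)f
			  - table_sketch);
}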

Signed-off-by: Laurent Vivier <lvivier@redhat.com>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 flow_table.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/flow_table.h b/flow_table.h
index 2d5c65c..3f3f4b7 100644
--- a/flow_table.h
+++ b/flow_table.h
@@ -93,6 +93,7 @@ extern union flow flowtab[];
  */
 static inline unsigned flow_idx(const struct flow_common *f)
 {
+	/* NOLINTNEXTLINE(clang-analyzer-security.PointerSub) */
 	return (union flow *)f - flowtab;
 }
 

From 4234ace84cdf989cbcdb96a8165221dc83a11c85 Mon Sep 17 00:00:00 2001
From: Laurent Vivier <lvivier@redhat.com>
Date: Wed, 14 May 2025 15:45:09 +0200
Subject: [PATCH 378/382] test: Display count of skipped tests in status and
 summary

This commit enhances test reporting by tracking and displaying the
number of skipped tests.

The skipped test count is now visible in the tmux status bar during
execution and included in the final test summary log. This provides
a more complete overview of test suite results.

Signed-off-by: Laurent Vivier <lvivier@redhat.com>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 test/lib/term | 7 +++++--
 test/run      | 6 +++---
 2 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/test/lib/term b/test/lib/term
index ed690de..089364c 100755
--- a/test/lib/term
+++ b/test/lib/term
@@ -19,6 +19,7 @@ STATUS_FILE_INDEX=0
 STATUS_COLS=
 STATUS_PASS=0
 STATUS_FAIL=0
+STATUS_SKIPPED=0
 
 PR_RED='\033[1;31m'
 PR_GREEN='\033[1;32m'
@@ -439,19 +440,21 @@ info_layout() {
 # status_test_ok() - Update counter of passed tests, log and display message
 status_test_ok() {
 	STATUS_PASS=$((STATUS_PASS + 1))
-	tmux set status-right "PASS: ${STATUS_PASS} | FAIL: ${STATUS_FAIL} | #(TZ="UTC" date -Iseconds)"
+	tmux set status-right "PASS: ${STATUS_PASS} | FAIL: ${STATUS_FAIL} | SKIPPED: ${STATUS_SKIPPED} | #(TZ="UTC" date -Iseconds)"
 	info_passed
 }
 
 # status_test_fail() - Update counter of failed tests, log and display message
 status_test_fail() {
 	STATUS_FAIL=$((STATUS_FAIL + 1))
-	tmux set status-right "PASS: ${STATUS_PASS} | FAIL: ${STATUS_FAIL} | #(TZ="UTC" date -Iseconds)"
+	tmux set status-right "PASS: ${STATUS_PASS} | FAIL: ${STATUS_FAIL} | SKIPPED: ${STATUS_SKIPPED} | #(TZ="UTC" date -Iseconds)"
 	info_failed
 }
 
 # status_test_fail() - Update counter of failed tests, log and display message
 status_test_skip() {
+	STATUS_SKIPPED=$((STATUS_SKIPPED + 1))
+	tmux set status-right "PASS: ${STATUS_PASS} | FAIL: ${STATUS_FAIL} | SKIPPED: ${STATUS_SKIPPED} | #(TZ="UTC" date -Iseconds)"
 	info_skipped
 }
 
diff --git a/test/run b/test/run
index 4e86f30..f73c311 100755
--- a/test/run
+++ b/test/run
@@ -202,7 +202,7 @@ skip_distro() {
 	perf_finish
 	[ ${CI} -eq 1 ] && video_stop
 
-	log "PASS: ${STATUS_PASS}, FAIL: ${STATUS_FAIL}"
+	log "PASS: ${STATUS_PASS}, FAIL: ${STATUS_FAIL}, SKIPPED: ${STATUS_SKIPPED}"
 
 	pause_continue \
 		"Press any key to keep test session open"	\
@@ -236,7 +236,7 @@ run_selected() {
 	done
 	teardown "${__setup}"
 
-	log "PASS: ${STATUS_PASS}, FAIL: ${STATUS_FAIL}"
+	log "PASS: ${STATUS_PASS}, FAIL: ${STATUS_FAIL}, SKIPPED: ${STATUS_SKIPPED}"
 
 	pause_continue \
 		"Press any key to keep test session open"	\
@@ -307,4 +307,4 @@ fi
 
 tail -n1 ${LOGFILE}
 echo "Log at ${LOGFILE}"
-exit $(tail -n1 ${LOGFILE} | sed -n 's/.*FAIL: \(.*\)$/\1/p')
+exit $(tail -n1 ${LOGFILE} | sed -n 's/.*FAIL: \(.*\),.*$/\1/p')

From 2046976866dd1f983cb0417a1d3ee3f64190805d Mon Sep 17 00:00:00 2001
From: Laurent Vivier <lvivier@redhat.com>
Date: Thu, 15 May 2025 11:41:51 +0200
Subject: [PATCH 379/382] codespell: Correct typos in comments and error
 message

This commit addresses several spelling errors identified by the `codespell`
tool. The corrections apply to:
- Code comments in `fwd.c`, `ip.h`, `isolation.c`, and `log.c`.
- An error message string in `vhost_user.c`.

Specifically, the following misspellings were corrected:
- "adddress" to "address"
- "capabilites" to "capabilities"
- "Musn't" to "Mustn't"
- "calculatd" to "calculated"
- "Invalide" to "Invalid"

Signed-off-by: Laurent Vivier <lvivier@redhat.com>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 fwd.c        | 2 +-
 ip.h         | 2 +-
 isolation.c  | 8 ++++----
 log.c        | 2 +-
 vhost_user.c | 2 +-
 5 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/fwd.c b/fwd.c
index 49aabc3..250cf56 100644
--- a/fwd.c
+++ b/fwd.c
@@ -418,7 +418,7 @@ uint8_t fwd_nat_from_splice(const struct ctx *c, uint8_t proto,
 	else
 		tgt->eaddr = inany_loopback6;
 
-	/* Preserve the specific loopback adddress used, but let the kernel pick
+	/* Preserve the specific loopback address used, but let the kernel pick
 	 * a source port on the target side
 	 */
 	tgt->oaddr = ini->eaddr;
diff --git a/ip.h b/ip.h
index 471c57e..24509d9 100644
--- a/ip.h
+++ b/ip.h
@@ -118,7 +118,7 @@ static inline uint32_t ip6_get_flow_lbl(const struct ipv6hdr *ip6h)
 char *ipv6_l4hdr(const struct pool *p, int idx, size_t offset, uint8_t *proto,
 		 size_t *dlen);
 
-/* IPv6 link-local all-nodes multicast adddress, ff02::1 */
+/* IPv6 link-local all-nodes multicast address, ff02::1 */
 static const struct in6_addr in6addr_ll_all_nodes = {
 	.s6_addr = {
 		0xff, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
diff --git a/isolation.c b/isolation.c
index c944fb3..bbcd23b 100644
--- a/isolation.c
+++ b/isolation.c
@@ -129,7 +129,7 @@ static void drop_caps_ep_except(uint64_t keep)
  * additional layer of protection.  Executing this requires
  * CAP_SETPCAP, which we will have within our userns.
  *
- * Note that dropping capabilites from the bounding set limits
+ * Note that dropping capabilities from the bounding set limits
  * exec()ed processes, but does not remove them from the effective or
  * permitted sets, so it doesn't reduce our own capabilities.
  */
@@ -174,8 +174,8 @@ static void clamp_caps(void)
  * Should:
  *  - drop unneeded capabilities
  *  - close all open files except for standard streams and the one from --fd
- * Musn't:
- *  - remove filesytem access (we need to access files during setup)
+ * Mustn't:
+ *  - remove filesystem access (we need to access files during setup)
  */
 void isolate_initial(int argc, char **argv)
 {
@@ -194,7 +194,7 @@ void isolate_initial(int argc, char **argv)
 	 *
 	 * It's debatable whether it's useful to drop caps when we
 	 * retain SETUID and SYS_ADMIN, but we might as well.  We drop
-	 * further capabilites in isolate_user() and
+	 * further capabilities in isolate_user() and
 	 * isolate_prefork().
 	 */
 	keep = BIT(CAP_NET_BIND_SERVICE) | BIT(CAP_SETUID) | BIT(CAP_SETGID) |
diff --git a/log.c b/log.c
index d40d7ae..5d7d76f 100644
--- a/log.c
+++ b/log.c
@@ -402,7 +402,7 @@ void __setlogmask(int mask)
  * logfile_init() - Open log file and write header with PID, version, path
  * @name:	Identifier for header: passt or pasta
  * @path:	Path to log file
- * @size:	Maximum size of log file: log_cut_size is calculatd here
+ * @size:	Maximum size of log file: log_cut_size is calculated here
  */
 void logfile_init(const char *name, const char *path, size_t size)
 {
diff --git a/vhost_user.c b/vhost_user.c
index 105f77a..ca36763 100644
--- a/vhost_user.c
+++ b/vhost_user.c
@@ -1021,7 +1021,7 @@ static bool vu_set_device_state_fd_exec(struct vu_dev *vdev,
 
 	if (direction != VHOST_USER_TRANSFER_STATE_DIRECTION_SAVE &&
 	    direction != VHOST_USER_TRANSFER_STATE_DIRECTION_LOAD)
-		die("Invalide device_state_fd direction: %d", direction);
+		die("Invalid device_state_fd direction: %d", direction);
 
 	migrate_request(vdev->context, msg->fds[0],
 			direction == VHOST_USER_TRANSFER_STATE_DIRECTION_LOAD);

From 2fd0944f21d6b9fce53c328acf1faaeb46b98528 Mon Sep 17 00:00:00 2001
From: Laurent Vivier <lvivier@redhat.com>
Date: Fri, 16 May 2025 14:42:26 +0200
Subject: [PATCH 380/382] vhost_user: Correct and align function comment
 headers

This commit cleans up function comment headers in vhost_user.c to ensure
accuracy and consistency with the code. Changes include correcting
parameter names in comments and signatures (e.g., standardizing on vmsg
for vhost messages, fixing dev to vdev), updating function names in
comment descriptions, and removing/rectifying erroneous parameter
documentation.

Signed-off-by: Laurent Vivier <lvivier@redhat.com>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 vhost_user.c | 221 +++++++++++++++++++++++++--------------------------
 vhost_user.h |   2 +-
 2 files changed, 111 insertions(+), 112 deletions(-)

diff --git a/vhost_user.c b/vhost_user.c
index ca36763..e8377bb 100644
--- a/vhost_user.c
+++ b/vhost_user.c
@@ -302,13 +302,13 @@ static void vu_message_write(int conn_fd, struct vhost_user_msg *vmsg)
  * @conn_fd:	vhost-user command socket
  * @vmsg:	vhost-user message
  */
-static void vu_send_reply(int conn_fd, struct vhost_user_msg *msg)
+static void vu_send_reply(int conn_fd, struct vhost_user_msg *vmsg)
 {
-	msg->hdr.flags &= ~VHOST_USER_VERSION_MASK;
-	msg->hdr.flags |= VHOST_USER_VERSION;
-	msg->hdr.flags |= VHOST_USER_REPLY_MASK;
+	vmsg->hdr.flags &= ~VHOST_USER_VERSION_MASK;
+	vmsg->hdr.flags |= VHOST_USER_VERSION;
+	vmsg->hdr.flags |= VHOST_USER_REPLY_MASK;
 
-	vu_message_write(conn_fd, msg);
+	vu_message_write(conn_fd, vmsg);
 }
 
 /**
@@ -319,7 +319,7 @@ static void vu_send_reply(int conn_fd, struct vhost_user_msg *msg)
  * Return: True as a reply is requested
  */
 static bool vu_get_features_exec(struct vu_dev *vdev,
-				 struct vhost_user_msg *msg)
+				 struct vhost_user_msg *vmsg)
 {
 	uint64_t features =
 		1ULL << VIRTIO_F_VERSION_1 |
@@ -329,9 +329,9 @@ static bool vu_get_features_exec(struct vu_dev *vdev,
 
 	(void)vdev;
 
-	vmsg_set_reply_u64(msg, features);
+	vmsg_set_reply_u64(vmsg, features);
 
-	debug("Sending back to guest u64: 0x%016"PRIx64, msg->payload.u64);
+	debug("Sending back to guest u64: 0x%016"PRIx64, vmsg->payload.u64);
 
 	return true;
 }
@@ -357,11 +357,11 @@ static void vu_set_enable_all_rings(struct vu_dev *vdev, bool enable)
  * Return: False as no reply is requested
  */
 static bool vu_set_features_exec(struct vu_dev *vdev,
-				 struct vhost_user_msg *msg)
+				 struct vhost_user_msg *vmsg)
 {
-	debug("u64: 0x%016"PRIx64, msg->payload.u64);
+	debug("u64: 0x%016"PRIx64, vmsg->payload.u64);
 
-	vdev->features = msg->payload.u64;
+	vdev->features = vmsg->payload.u64;
 	/* We only support devices conforming to VIRTIO 1.0 or
 	 * later
 	 */
@@ -382,10 +382,10 @@ static bool vu_set_features_exec(struct vu_dev *vdev,
  * Return: False as no reply is requested
  */
 static bool vu_set_owner_exec(struct vu_dev *vdev,
-			      struct vhost_user_msg *msg)
+			      struct vhost_user_msg *vmsg)
 {
 	(void)vdev;
-	(void)msg;
+	(void)vmsg;
 
 	return false;
 }
@@ -423,9 +423,9 @@ static bool map_ring(struct vu_dev *vdev, struct vu_virtq *vq)
  * #syscalls:vu mmap|mmap2 munmap
  */
 static bool vu_set_mem_table_exec(struct vu_dev *vdev,
-				  struct vhost_user_msg *msg)
+				  struct vhost_user_msg *vmsg)
 {
-	struct vhost_user_memory m = msg->payload.memory, *memory = &m;
+	struct vhost_user_memory m = vmsg->payload.memory, *memory = &m;
 	unsigned int i;
 
 	for (i = 0; i < vdev->nregions; i++) {
@@ -465,7 +465,7 @@ static bool vu_set_mem_table_exec(struct vu_dev *vdev,
 		 */
 		mmap_addr = mmap(0, dev_region->size + dev_region->mmap_offset,
 				 PROT_READ | PROT_WRITE, MAP_SHARED |
-				 MAP_NORESERVE, msg->fds[i], 0);
+				 MAP_NORESERVE, vmsg->fds[i], 0);
 
 		if (mmap_addr == MAP_FAILED)
 			die_perror("vhost-user region mmap error");
@@ -474,7 +474,7 @@ static bool vu_set_mem_table_exec(struct vu_dev *vdev,
 		debug("    mmap_addr:       0x%016"PRIx64,
 		      dev_region->mmap_addr);
 
-		close(msg->fds[i]);
+		close(vmsg->fds[i]);
 	}
 
 	for (i = 0; i < VHOST_USER_MAX_QUEUES; i++) {
@@ -541,7 +541,7 @@ static void vu_log_page(uint8_t *log_table, uint64_t page)
 
 /**
  * vu_log_write() - Log memory write
- * @dev:	vhost-user device
+ * @vdev:	vhost-user device
  * @address:	Memory address
  * @length:	Memory size
  */
@@ -566,23 +566,23 @@ void vu_log_write(const struct vu_dev *vdev, uint64_t address, uint64_t length)
  * @vdev:	vhost-user device
  * @vmsg:	vhost-user message
  *
- * Return: False as no reply is requested
+ * Return: True as a reply is requested
  *
  * #syscalls:vu mmap|mmap2 munmap
  */
 static bool vu_set_log_base_exec(struct vu_dev *vdev,
-				 struct vhost_user_msg *msg)
+				 struct vhost_user_msg *vmsg)
 {
 	uint64_t log_mmap_size, log_mmap_offset;
 	void *base;
 	int fd;
 
-	if (msg->fd_num != 1 || msg->hdr.size != sizeof(msg->payload.log))
+	if (vmsg->fd_num != 1 || vmsg->hdr.size != sizeof(vmsg->payload.log))
 		die("vhost-user: Invalid log_base message");
 
-	fd = msg->fds[0];
-	log_mmap_offset = msg->payload.log.mmap_offset;
-	log_mmap_size = msg->payload.log.mmap_size;
+	fd = vmsg->fds[0];
+	log_mmap_offset = vmsg->payload.log.mmap_offset;
+	log_mmap_size = vmsg->payload.log.mmap_size;
 
 	debug("vhost-user log mmap_offset: %"PRId64, log_mmap_offset);
 	debug("vhost-user log mmap_size:   %"PRId64, log_mmap_size);
@@ -599,8 +599,8 @@ static bool vu_set_log_base_exec(struct vu_dev *vdev,
 	vdev->log_table = base;
 	vdev->log_size = log_mmap_size;
 
-	msg->hdr.size = sizeof(msg->payload.u64);
-	msg->fd_num = 0;
+	vmsg->hdr.size = sizeof(vmsg->payload.u64);
+	vmsg->fd_num = 0;
 
 	return true;
 }
@@ -613,15 +613,15 @@ static bool vu_set_log_base_exec(struct vu_dev *vdev,
  * Return: False as no reply is requested
  */
 static bool vu_set_log_fd_exec(struct vu_dev *vdev,
-			       struct vhost_user_msg *msg)
+			       struct vhost_user_msg *vmsg)
 {
-	if (msg->fd_num != 1)
+	if (vmsg->fd_num != 1)
 		die("Invalid log_fd message");
 
 	if (vdev->log_call_fd != -1)
 		close(vdev->log_call_fd);
 
-	vdev->log_call_fd = msg->fds[0];
+	vdev->log_call_fd = vmsg->fds[0];
 
 	debug("Got log_call_fd: %d", vdev->log_call_fd);
 
@@ -636,10 +636,10 @@ static bool vu_set_log_fd_exec(struct vu_dev *vdev,
  * Return: False as no reply is requested
  */
 static bool vu_set_vring_num_exec(struct vu_dev *vdev,
-				  struct vhost_user_msg *msg)
+				  struct vhost_user_msg *vmsg)
 {
-	unsigned int idx = msg->payload.state.index;
-	unsigned int num = msg->payload.state.num;
+	unsigned int idx = vmsg->payload.state.index;
+	unsigned int num = vmsg->payload.state.num;
 
 	trace("State.index: %u", idx);
 	trace("State.num:   %u", num);
@@ -656,13 +656,13 @@ static bool vu_set_vring_num_exec(struct vu_dev *vdev,
  * Return: False as no reply is requested
  */
 static bool vu_set_vring_addr_exec(struct vu_dev *vdev,
-				   struct vhost_user_msg *msg)
+				   struct vhost_user_msg *vmsg)
 {
 	/* We need to copy the payload to vhost_vring_addr structure
-         * to access index because address of msg->payload.addr
+         * to access index because address of vmsg->payload.addr
          * can be unaligned as it is packed.
          */
-	struct vhost_vring_addr addr = msg->payload.addr;
+	struct vhost_vring_addr addr = vmsg->payload.addr;
 	struct vu_virtq *vq = &vdev->vq[addr.index];
 
 	debug("vhost_vring_addr:");
@@ -677,7 +677,7 @@ static bool vu_set_vring_addr_exec(struct vu_dev *vdev,
 	debug("    log_guest_addr:   0x%016" PRIx64,
 	      (uint64_t)addr.log_guest_addr);
 
-	vq->vra = msg->payload.addr;
+	vq->vra = vmsg->payload.addr;
 	vq->vring.flags = addr.flags;
 	vq->vring.log_guest_addr = addr.log_guest_addr;
 
@@ -702,10 +702,10 @@ static bool vu_set_vring_addr_exec(struct vu_dev *vdev,
  * Return: False as no reply is requested
  */
 static bool vu_set_vring_base_exec(struct vu_dev *vdev,
-				   struct vhost_user_msg *msg)
+				   struct vhost_user_msg *vmsg)
 {
-	unsigned int idx = msg->payload.state.index;
-	unsigned int num = msg->payload.state.num;
+	unsigned int idx = vmsg->payload.state.index;
+	unsigned int num = vmsg->payload.state.num;
 
 	debug("State.index: %u", idx);
 	debug("State.num:   %u", num);
@@ -723,13 +723,13 @@ static bool vu_set_vring_base_exec(struct vu_dev *vdev,
  * Return: True as a reply is requested
  */
 static bool vu_get_vring_base_exec(struct vu_dev *vdev,
-				   struct vhost_user_msg *msg)
+				   struct vhost_user_msg *vmsg)
 {
-	unsigned int idx = msg->payload.state.index;
+	unsigned int idx = vmsg->payload.state.index;
 
 	debug("State.index: %u", idx);
-	msg->payload.state.num = vdev->vq[idx].last_avail_idx;
-	msg->hdr.size = sizeof(msg->payload.state);
+	vmsg->payload.state.num = vdev->vq[idx].last_avail_idx;
+	vmsg->hdr.size = sizeof(vmsg->payload.state);
 
 	vdev->vq[idx].started = false;
 	vdev->vq[idx].vring.avail = 0;
@@ -771,21 +771,21 @@ static void vu_set_watch(const struct vu_dev *vdev, int idx)
  * 			       close fds if NOFD bit is set
  * @vmsg:	vhost-user message
  */
-static void vu_check_queue_msg_file(struct vhost_user_msg *msg)
+static void vu_check_queue_msg_file(struct vhost_user_msg *vmsg)
 {
-	bool nofd = msg->payload.u64 & VHOST_USER_VRING_NOFD_MASK;
-	int idx = msg->payload.u64 & VHOST_USER_VRING_IDX_MASK;
+	bool nofd = vmsg->payload.u64 & VHOST_USER_VRING_NOFD_MASK;
+	int idx = vmsg->payload.u64 & VHOST_USER_VRING_IDX_MASK;
 
 	if (idx >= VHOST_USER_MAX_QUEUES)
 		die("Invalid vhost-user queue index: %u", idx);
 
 	if (nofd) {
-		vmsg_close_fds(msg);
+		vmsg_close_fds(vmsg);
 		return;
 	}
 
-	if (msg->fd_num != 1)
-		die("Invalid fds in vhost-user request: %d", msg->hdr.request);
+	if (vmsg->fd_num != 1)
+		die("Invalid fds in vhost-user request: %d", vmsg->hdr.request);
 }
 
 /**
@@ -797,14 +797,14 @@ static void vu_check_queue_msg_file(struct vhost_user_msg *msg)
  * Return: False as no reply is requested
  */
 static bool vu_set_vring_kick_exec(struct vu_dev *vdev,
-				   struct vhost_user_msg *msg)
+				   struct vhost_user_msg *vmsg)
 {
-	bool nofd = msg->payload.u64 & VHOST_USER_VRING_NOFD_MASK;
-	int idx = msg->payload.u64 & VHOST_USER_VRING_IDX_MASK;
+	bool nofd = vmsg->payload.u64 & VHOST_USER_VRING_NOFD_MASK;
+	int idx = vmsg->payload.u64 & VHOST_USER_VRING_IDX_MASK;
 
-	debug("u64: 0x%016"PRIx64, msg->payload.u64);
+	debug("u64: 0x%016"PRIx64, vmsg->payload.u64);
 
-	vu_check_queue_msg_file(msg);
+	vu_check_queue_msg_file(vmsg);
 
 	if (vdev->vq[idx].kick_fd != -1) {
 		epoll_del(vdev->context, vdev->vq[idx].kick_fd);
@@ -813,7 +813,7 @@ static bool vu_set_vring_kick_exec(struct vu_dev *vdev,
 	}
 
 	if (!nofd)
-		vdev->vq[idx].kick_fd = msg->fds[0];
+		vdev->vq[idx].kick_fd = vmsg->fds[0];
 
 	debug("Got kick_fd: %d for vq: %d", vdev->vq[idx].kick_fd, idx);
 
@@ -837,14 +837,14 @@ static bool vu_set_vring_kick_exec(struct vu_dev *vdev,
  * Return: False as no reply is requested
  */
 static bool vu_set_vring_call_exec(struct vu_dev *vdev,
-				   struct vhost_user_msg *msg)
+				   struct vhost_user_msg *vmsg)
 {
-	bool nofd = msg->payload.u64 & VHOST_USER_VRING_NOFD_MASK;
-	int idx = msg->payload.u64 & VHOST_USER_VRING_IDX_MASK;
+	bool nofd = vmsg->payload.u64 & VHOST_USER_VRING_NOFD_MASK;
+	int idx = vmsg->payload.u64 & VHOST_USER_VRING_IDX_MASK;
 
-	debug("u64: 0x%016"PRIx64, msg->payload.u64);
+	debug("u64: 0x%016"PRIx64, vmsg->payload.u64);
 
-	vu_check_queue_msg_file(msg);
+	vu_check_queue_msg_file(vmsg);
 
 	if (vdev->vq[idx].call_fd != -1) {
 		close(vdev->vq[idx].call_fd);
@@ -852,11 +852,11 @@ static bool vu_set_vring_call_exec(struct vu_dev *vdev,
 	}
 
 	if (!nofd)
-		vdev->vq[idx].call_fd = msg->fds[0];
+		vdev->vq[idx].call_fd = vmsg->fds[0];
 
 	/* in case of I/O hang after reconnecting */
 	if (vdev->vq[idx].call_fd != -1)
-		eventfd_write(msg->fds[0], 1);
+		eventfd_write(vmsg->fds[0], 1);
 
 	debug("Got call_fd: %d for vq: %d", vdev->vq[idx].call_fd, idx);
 
@@ -872,14 +872,14 @@ static bool vu_set_vring_call_exec(struct vu_dev *vdev,
  * Return: False as no reply is requested
  */
 static bool vu_set_vring_err_exec(struct vu_dev *vdev,
-				  struct vhost_user_msg *msg)
+				  struct vhost_user_msg *vmsg)
 {
-	bool nofd = msg->payload.u64 & VHOST_USER_VRING_NOFD_MASK;
-	int idx = msg->payload.u64 & VHOST_USER_VRING_IDX_MASK;
+	bool nofd = vmsg->payload.u64 & VHOST_USER_VRING_NOFD_MASK;
+	int idx = vmsg->payload.u64 & VHOST_USER_VRING_IDX_MASK;
 
-	debug("u64: 0x%016"PRIx64, msg->payload.u64);
+	debug("u64: 0x%016"PRIx64, vmsg->payload.u64);
 
-	vu_check_queue_msg_file(msg);
+	vu_check_queue_msg_file(vmsg);
 
 	if (vdev->vq[idx].err_fd != -1) {
 		close(vdev->vq[idx].err_fd);
@@ -887,7 +887,7 @@ static bool vu_set_vring_err_exec(struct vu_dev *vdev,
 	}
 
 	if (!nofd)
-		vdev->vq[idx].err_fd = msg->fds[0];
+		vdev->vq[idx].err_fd = vmsg->fds[0];
 
 	return false;
 }
@@ -901,7 +901,7 @@ static bool vu_set_vring_err_exec(struct vu_dev *vdev,
  * Return: True as a reply is requested
  */
 static bool vu_get_protocol_features_exec(struct vu_dev *vdev,
-					  struct vhost_user_msg *msg)
+					  struct vhost_user_msg *vmsg)
 {
 	uint64_t features = 1ULL << VHOST_USER_PROTOCOL_F_REPLY_ACK |
 			    1ULL << VHOST_USER_PROTOCOL_F_LOG_SHMFD |
@@ -909,7 +909,7 @@ static bool vu_get_protocol_features_exec(struct vu_dev *vdev,
 			    1ULL << VHOST_USER_PROTOCOL_F_RARP;
 
 	(void)vdev;
-	vmsg_set_reply_u64(msg, features);
+	vmsg_set_reply_u64(vmsg, features);
 
 	return true;
 }
@@ -922,13 +922,13 @@ static bool vu_get_protocol_features_exec(struct vu_dev *vdev,
  * Return: False as no reply is requested
  */
 static bool vu_set_protocol_features_exec(struct vu_dev *vdev,
-					  struct vhost_user_msg *msg)
+					  struct vhost_user_msg *vmsg)
 {
-	uint64_t features = msg->payload.u64;
+	uint64_t features = vmsg->payload.u64;
 
 	debug("u64: 0x%016"PRIx64, features);
 
-	vdev->protocol_features = msg->payload.u64;
+	vdev->protocol_features = vmsg->payload.u64;
 
 	return false;
 }
@@ -941,11 +941,11 @@ static bool vu_set_protocol_features_exec(struct vu_dev *vdev,
  * Return: True as a reply is requested
  */
 static bool vu_get_queue_num_exec(struct vu_dev *vdev,
-				  struct vhost_user_msg *msg)
+				  struct vhost_user_msg *vmsg)
 {
 	(void)vdev;
 
-	vmsg_set_reply_u64(msg, VHOST_USER_MAX_QUEUES);
+	vmsg_set_reply_u64(vmsg, VHOST_USER_MAX_QUEUES);
 
 	return true;
 }
@@ -958,10 +958,10 @@ static bool vu_get_queue_num_exec(struct vu_dev *vdev,
  * Return: False as no reply is requested
  */
 static bool vu_set_vring_enable_exec(struct vu_dev *vdev,
-				     struct vhost_user_msg *msg)
+				     struct vhost_user_msg *vmsg)
 {
-	unsigned int enable = msg->payload.state.num;
-	unsigned int idx = msg->payload.state.index;
+	unsigned int enable = vmsg->payload.state.num;
+	unsigned int idx = vmsg->payload.state.index;
 
 	debug("State.index:  %u", idx);
 	debug("State.enable: %u", enable);
@@ -974,17 +974,17 @@ static bool vu_set_vring_enable_exec(struct vu_dev *vdev,
 }
 
 /**
- * vu_set_send_rarp_exec() - vhost-user specification says: "Broadcast a fake
- * 			     RARP to notify the migration is terminated",
- * 			     but passt doesn't need to update any ARP table,
- * 			     so do nothing to silence QEMU bogus error message
+ * vu_send_rarp_exec() - vhost-user specification says: "Broadcast a fake
+ * 			 RARP to notify the migration is terminated",
+ * 			 but passt doesn't need to update any ARP table,
+ * 			 so do nothing to silence QEMU bogus error message
  * @vdev:	vhost-user device
  * @vmsg:	vhost-user message
  *
  * Return: False as no reply is requested
  */
 static bool vu_send_rarp_exec(struct vu_dev *vdev,
-			      struct vhost_user_msg *msg)
+			      struct vhost_user_msg *vmsg)
 {
 	char macstr[ETH_ADDRSTRLEN];
 
@@ -993,7 +993,7 @@ static bool vu_send_rarp_exec(struct vu_dev *vdev,
 	/* ignore the command */
 
 	debug("Ignore command VHOST_USER_SEND_RARP for %s",
-	      eth_ntop((unsigned char *)&msg->payload.u64, macstr,
+	      eth_ntop((unsigned char *)&vmsg->payload.u64, macstr,
 		       sizeof(macstr)));
 
 	return false;
@@ -1008,12 +1008,12 @@ static bool vu_send_rarp_exec(struct vu_dev *vdev,
  *         and set bit 8 as we don't provide our own fd.
  */
 static bool vu_set_device_state_fd_exec(struct vu_dev *vdev,
-					struct vhost_user_msg *msg)
+					struct vhost_user_msg *vmsg)
 {
-	unsigned int direction = msg->payload.transfer_state.direction;
-	unsigned int phase = msg->payload.transfer_state.phase;
+	unsigned int direction = vmsg->payload.transfer_state.direction;
+	unsigned int phase = vmsg->payload.transfer_state.phase;
 
-	if (msg->fd_num != 1)
+	if (vmsg->fd_num != 1)
 		die("Invalid device_state_fd message");
 
 	if (phase != VHOST_USER_TRANSFER_STATE_PHASE_STOPPED)
@@ -1023,11 +1023,11 @@ static bool vu_set_device_state_fd_exec(struct vu_dev *vdev,
 	    direction != VHOST_USER_TRANSFER_STATE_DIRECTION_LOAD)
 		die("Invalid device_state_fd direction: %d", direction);
 
-	migrate_request(vdev->context, msg->fds[0],
+	migrate_request(vdev->context, vmsg->fds[0],
 			direction == VHOST_USER_TRANSFER_STATE_DIRECTION_LOAD);
 
 	/* We don't provide a new fd for the data transfer */
-	vmsg_set_reply_u64(msg, VHOST_USER_VRING_NOFD_MASK);
+	vmsg_set_reply_u64(vmsg, VHOST_USER_VRING_NOFD_MASK);
 
 	return true;
 }
@@ -1041,9 +1041,9 @@ static bool vu_set_device_state_fd_exec(struct vu_dev *vdev,
  */
 /* cppcheck-suppress constParameterCallback */
 static bool vu_check_device_state_exec(struct vu_dev *vdev,
-				       struct vhost_user_msg *msg)
+				       struct vhost_user_msg *vmsg)
 {
-	vmsg_set_reply_u64(msg, vdev->context->device_state_result);
+	vmsg_set_reply_u64(vmsg, vdev->context->device_state_result);
 
 	return true;
 }
@@ -1051,7 +1051,6 @@ static bool vu_check_device_state_exec(struct vu_dev *vdev,
 /**
  * vu_init() - Initialize vhost-user device structure
  * @c:		execution context
- * @vdev:	vhost-user device
  */
 void vu_init(struct ctx *c)
 {
@@ -1134,7 +1133,7 @@ static void vu_sock_reset(struct vu_dev *vdev)
 }
 
 static bool (*vu_handle[VHOST_USER_MAX])(struct vu_dev *vdev,
-					struct vhost_user_msg *msg) = {
+					struct vhost_user_msg *vmsg) = {
 	[VHOST_USER_GET_FEATURES]	   = vu_get_features_exec,
 	[VHOST_USER_SET_FEATURES]	   = vu_set_features_exec,
 	[VHOST_USER_GET_PROTOCOL_FEATURES] = vu_get_protocol_features_exec,
@@ -1165,7 +1164,7 @@ static bool (*vu_handle[VHOST_USER_MAX])(struct vu_dev *vdev,
  */
 void vu_control_handler(struct vu_dev *vdev, int fd, uint32_t events)
 {
-	struct vhost_user_msg msg = { 0 };
+	struct vhost_user_msg vmsg = { 0 };
 	bool need_reply, reply_requested;
 	int ret;
 
@@ -1174,38 +1173,38 @@ void vu_control_handler(struct vu_dev *vdev, int fd, uint32_t events)
 		return;
 	}
 
-	ret = vu_message_read_default(fd, &msg);
+	ret = vu_message_read_default(fd, &vmsg);
 	if (ret == 0) {
 		vu_sock_reset(vdev);
 		return;
 	}
 	debug("================ Vhost user message ================");
-	debug("Request: %s (%d)", vu_request_to_string(msg.hdr.request),
-		msg.hdr.request);
-	debug("Flags:   0x%x", msg.hdr.flags);
-	debug("Size:    %u", msg.hdr.size);
+	debug("Request: %s (%d)", vu_request_to_string(vmsg.hdr.request),
+		vmsg.hdr.request);
+	debug("Flags:   0x%x", vmsg.hdr.flags);
+	debug("Size:    %u", vmsg.hdr.size);
 
-	need_reply = msg.hdr.flags & VHOST_USER_NEED_REPLY_MASK;
+	need_reply = vmsg.hdr.flags & VHOST_USER_NEED_REPLY_MASK;
 
-	if (msg.hdr.request >= 0 && msg.hdr.request < VHOST_USER_MAX &&
-	    vu_handle[msg.hdr.request])
-		reply_requested = vu_handle[msg.hdr.request](vdev, &msg);
+	if (vmsg.hdr.request >= 0 && vmsg.hdr.request < VHOST_USER_MAX &&
+	    vu_handle[vmsg.hdr.request])
+		reply_requested = vu_handle[vmsg.hdr.request](vdev, &vmsg);
 	else
-		die("Unhandled request: %d", msg.hdr.request);
+		die("Unhandled request: %d", vmsg.hdr.request);
 
 	/* cppcheck-suppress legacyUninitvar */
 	if (!reply_requested && need_reply) {
-		msg.payload.u64 = 0;
-		msg.hdr.flags = 0;
-		msg.hdr.size = sizeof(msg.payload.u64);
-		msg.fd_num = 0;
+		vmsg.payload.u64 = 0;
+		vmsg.hdr.flags = 0;
+		vmsg.hdr.size = sizeof(vmsg.payload.u64);
+		vmsg.fd_num = 0;
 		reply_requested = true;
 	}
 
 	if (reply_requested)
-		vu_send_reply(fd, &msg);
+		vu_send_reply(fd, &vmsg);
 
-	if (msg.hdr.request == VHOST_USER_CHECK_DEVICE_STATE &&
+	if (vmsg.hdr.request == VHOST_USER_CHECK_DEVICE_STATE &&
 	    vdev->context->device_state_result == 0 &&
 	    !vdev->context->migrate_target) {
 		info("Migration complete, exiting");
diff --git a/vhost_user.h b/vhost_user.h
index 1daacd1..f2ae2da 100644
--- a/vhost_user.h
+++ b/vhost_user.h
@@ -184,7 +184,7 @@ union vhost_user_payload {
 };
 
 /**
- * struct vhost_user_msg - vhost-use message
+ * struct vhost_user_msg - vhost-user message
  * @hdr:		Message header
  * @payload:		Message payload
  * @fds:		File descriptors associated with the message

From b915375a421d70065baa90444da49954ceacde38 Mon Sep 17 00:00:00 2001
From: Laurent Vivier <lvivier@redhat.com>
Date: Fri, 16 May 2025 14:42:27 +0200
Subject: [PATCH 381/382] virtio: Correct and align comment headers

Standardize and fix issues in `virtio.c` and `virtio.h` comment headers.

Improvements include:
- Added `()` to function names in comment summaries.
- Added colons after parameter and enum member tags.
- Changed `/*` to `/**` for `virtq_avail_event()` comment.
- Fixed typos (e.g., "file"->"fill", "virqueue"->"virtqueue").
- Added missing `Return:` tag for `vu_queue_rewind()`.
- Corrected parameter names in `virtio.h` comments to match code.

Signed-off-by: Laurent Vivier <lvivier@redhat.com>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 virtio.c | 29 ++++++++++++++++-------------
 virtio.h |  4 ++--
 2 files changed, 18 insertions(+), 15 deletions(-)

diff --git a/virtio.c b/virtio.c
index f7db007..83906aa 100644
--- a/virtio.c
+++ b/virtio.c
@@ -156,9 +156,9 @@ static inline uint16_t vring_avail_ring(const struct vu_virtq *vq, int i)
 }
 
 /**
- * virtq_used_event - Get location of used event indices
+ * virtq_used_event() - Get location of used event indices
  *		      (only with VIRTIO_F_EVENT_IDX)
- * @vq		Virtqueue
+ * @vq:		Virtqueue
  *
  * Return: return the location of the used event index
  */
@@ -170,7 +170,7 @@ static inline uint16_t *virtq_used_event(const struct vu_virtq *vq)
 
 /**
  * vring_get_used_event() - Get the used event from the available ring
- * @vq		Virtqueue
+ * @vq:		Virtqueue
  *
  * Return: the used event (available only if VIRTIO_RING_F_EVENT_IDX is set)
  *         used_event is a performant alternative where the driver
@@ -244,9 +244,9 @@ static int virtqueue_read_indirect_desc(const struct vu_dev *dev,
 
 /**
  * enum virtqueue_read_desc_state - State in the descriptor chain
- * @VIRTQUEUE_READ_DESC_ERROR	Found an invalid descriptor
- * @VIRTQUEUE_READ_DESC_DONE	No more descriptors in the chain
- * @VIRTQUEUE_READ_DESC_MORE	there are more descriptors in the chain
+ * @VIRTQUEUE_READ_DESC_ERROR:	Found an invalid descriptor
+ * @VIRTQUEUE_READ_DESC_DONE:	No more descriptors in the chain
+ * @VIRTQUEUE_READ_DESC_MORE:	there are more descriptors in the chain
  */
 enum virtqueue_read_desc_state {
 	VIRTQUEUE_READ_DESC_ERROR = -1,
@@ -347,8 +347,9 @@ void vu_queue_notify(const struct vu_dev *dev, struct vu_virtq *vq)
 		die_perror("Error writing vhost-user queue eventfd");
 }
 
-/* virtq_avail_event() -  Get location of available event indices
- *			      (only with VIRTIO_F_EVENT_IDX)
+/**
+ * virtq_avail_event() -  Get location of available event indices
+ *			  (only with VIRTIO_F_EVENT_IDX)
  * @vq:		Virtqueue
  *
  * Return: return the location of the available event index
@@ -421,8 +422,8 @@ static bool virtqueue_map_desc(const struct vu_dev *dev,
 }
 
 /**
- * vu_queue_map_desc - Map the virtqueue descriptor ring into our virtual
- * 		       address space
+ * vu_queue_map_desc() - Map the virtqueue descriptor ring into our virtual
+ * 			 address space
  * @dev:	Vhost-user device
  * @vq:		Virtqueue
  * @idx:	First descriptor ring entry to map
@@ -505,7 +506,7 @@ static int vu_queue_map_desc(const struct vu_dev *dev,
  * vu_queue_pop() - Pop an entry from the virtqueue
  * @dev:	Vhost-user device
  * @vq:		Virtqueue
- * @elem:	Virtqueue element to file with the entry information
+ * @elem:	Virtqueue element to fill with the entry information
  *
  * Return: -1 if there is an error, 0 otherwise
  */
@@ -545,7 +546,7 @@ int vu_queue_pop(const struct vu_dev *dev, struct vu_virtq *vq,
 }
 
 /**
- * vu_queue_detach_element() - Detach an element from the virqueue
+ * vu_queue_detach_element() - Detach an element from the virtqueue
  * @vq:		Virtqueue
  */
 void vu_queue_detach_element(struct vu_virtq *vq)
@@ -555,7 +556,7 @@ void vu_queue_detach_element(struct vu_virtq *vq)
 }
 
 /**
- * vu_queue_unpop() - Push back the previously popped element from the virqueue
+ * vu_queue_unpop() - Push back the previously popped element from the virtqueue
  * @vq:		Virtqueue
  */
 /* cppcheck-suppress unusedFunction */
@@ -569,6 +570,8 @@ void vu_queue_unpop(struct vu_virtq *vq)
  * vu_queue_rewind() - Push back a given number of popped elements
  * @vq:		Virtqueue
  * @num:	Number of element to unpop
+ *
+ * Return: True on success, false if not
  */
 bool vu_queue_rewind(struct vu_virtq *vq, unsigned int num)
 {
diff --git a/virtio.h b/virtio.h
index 7a370bd..d8beb88 100644
--- a/virtio.h
+++ b/virtio.h
@@ -150,7 +150,7 @@ static inline bool has_feature(uint64_t features, unsigned int fbit)
 /**
  * vu_has_feature() - Check if a virtio-net feature is available
  * @vdev:	Vhost-user device
- * @bit:	Feature to check
+ * @fbit:	Feature to check
  *
  * Return:	True if the feature is available
  */
@@ -163,7 +163,7 @@ static inline bool vu_has_feature(const struct vu_dev *vdev,
 /**
  * vu_has_protocol_feature() - Check if a vhost-user feature is available
  * @vdev:	Vhost-user device
- * @bit:	Feature to check
+ * @fbit:	Feature to check
  *
  * Return:	True if the feature is available
  */

From 3262c9b088288902f28b5d09f61220fae5376082 Mon Sep 17 00:00:00 2001
From: Laurent Vivier <lvivier@redhat.com>
Date: Fri, 16 May 2025 14:42:28 +0200
Subject: [PATCH 382/382] iov: Standardize function comment headers

Update function comment headers in iov.c to a consistent and
standardized format.

This change ensures:
- Comment blocks for functions consistently start with /**.
- Function names in the comment summary line include parentheses ().

This improves overall comment clarity and uniformity within the file.

Signed-off-by: Laurent Vivier <lvivier@redhat.com>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 iov.c | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/iov.c b/iov.c
index 8c63b7e..91e87a7 100644
--- a/iov.c
+++ b/iov.c
@@ -26,7 +26,8 @@
 #include "iov.h"
 
 
-/* iov_skip_bytes() - Skip leading bytes of an IO vector
+/**
+ * iov_skip_bytes() - Skip leading bytes of an IO vector
  * @iov:	IO vector
  * @n:		Number of entries in @iov
  * @skip:	Number of leading bytes of @iov to skip
@@ -56,8 +57,8 @@ size_t iov_skip_bytes(const struct iovec *iov, size_t n,
 }
 
 /**
- * iov_from_buf - Copy data from a buffer to an I/O vector (struct iovec)
- *                efficiently.
+ * iov_from_buf() - Copy data from a buffer to an I/O vector (struct iovec)
+ *                  efficiently.
  *
  * @iov:       Pointer to the array of struct iovec describing the
  *             scatter/gather I/O vector.
@@ -96,8 +97,8 @@ size_t iov_from_buf(const struct iovec *iov, size_t iov_cnt,
 }
 
 /**
- * iov_to_buf - Copy data from a scatter/gather I/O vector (struct iovec) to
- *		a buffer efficiently.
+ * iov_to_buf() - Copy data from a scatter/gather I/O vector (struct iovec) to
+ *		  a buffer efficiently.
  *
  * @iov:       Pointer to the array of struct iovec describing the scatter/gather
  *             I/O vector.
@@ -136,8 +137,8 @@ size_t iov_to_buf(const struct iovec *iov, size_t iov_cnt,
 }
 
 /**
- * iov_size - Calculate the total size of a scatter/gather I/O vector
- *            (struct iovec).
+ * iov_size() - Calculate the total size of a scatter/gather I/O vector
+ *              (struct iovec).
  *
  * @iov:       Pointer to the array of struct iovec describing the
  *             scatter/gather I/O vector.