From bcc4908c2b4a20c581f2b03fed40da97b804106f Mon Sep 17 00:00:00 2001
From: Enrique Llorente <ellorent@redhat.com>
Date: Mon, 17 Feb 2025 10:28:14 +0100
Subject: [PATCH 001/144] dhcp: Remove option 255 length byte

Option 255 (end of options) does not need a length byte: this change
removes it, leaving one extra byte available for other dynamic options.

Signed-off-by: Enrique Llorente <ellorent@redhat.com>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 dhcp.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/dhcp.c b/dhcp.c
index 401cb5b..4a209f1 100644
--- a/dhcp.c
+++ b/dhcp.c
@@ -64,9 +64,9 @@ static struct opt opts[255];
 #define OPT_MIN		60 /* RFC 951 */
 
 /* Total option size (excluding end option) is 576 (RFC 2131), minus
- * offset of options (268), minus end option and its length (2).
+ * offset of options (268), minus end option (1).
  */
-#define OPT_MAX		306
+#define OPT_MAX		307
 
 /**
  * dhcp_init() - Initialise DHCP options
@@ -127,7 +127,7 @@ struct msg {
 	uint8_t sname[64];
 	uint8_t file[128];
 	uint32_t magic;
-	uint8_t o[OPT_MAX + 2 /* End option and its length */ ];
+	uint8_t o[OPT_MAX + 1 /* End option */ ];
 } __attribute__((__packed__));
 
 /**
@@ -194,7 +194,6 @@ static int fill(struct msg *m)
 	}
 
 	m->o[offset++] = 255;
-	m->o[offset++] = 0;
 
 	if (offset < OPT_MIN) {
 		memset(&m->o[offset], 0, OPT_MIN - offset);

From 0a51060f7ac3e1e1a9d87ffdb037b9c367a2a4d9 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Tue, 18 Feb 2025 13:07:17 +1100
Subject: [PATCH 002/144] packet: Use flexible array member in struct pool

Currently we have a dummy pkt[1] array, which we alias with an array of
a different size via various macros.  However, we already require C11 which
includes flexible array members, so we can do better.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 packet.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/packet.h b/packet.h
index 3f70e94..85ee550 100644
--- a/packet.h
+++ b/packet.h
@@ -21,7 +21,7 @@ struct pool {
 	size_t buf_size;
 	size_t size;
 	size_t count;
-	struct iovec pkt[1];
+	struct iovec pkt[];
 };
 
 int vu_packet_check_range(void *buf, size_t offset, size_t len,

From 354bc0bab1cb6095592288674d375511443427fd Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Tue, 18 Feb 2025 13:07:18 +1100
Subject: [PATCH 003/144] packet: Don't pass start and offset separately to
 packet_check_range()

Fundamentally what packet_check_range() does is to check whether a given
memory range is within the allowed / expected memory set aside for packets
from a particular pool.  That range could represent a whole packet (from
packet_add_do()) or part of a packet (from packet_get_do()), but it doesn't
really matter which.

However, we pass the start of the range as two parameters: @start which is
the start of the packet, and @offset which is the offset within the packet
of the range we're interested in.  We never use these separately, only as
(start + offset).  Simplify the interface of packet_check_range() and
vu_packet_check_range() to directly take the start of the relevant range.
This will allow some additional future improvements.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 packet.c    | 36 +++++++++++++++++++-----------------
 packet.h    |  3 +--
 vu_common.c | 11 ++++-------
 3 files changed, 24 insertions(+), 26 deletions(-)

diff --git a/packet.c b/packet.c
index 03a11e6..0330b54 100644
--- a/packet.c
+++ b/packet.c
@@ -23,23 +23,22 @@
 #include "log.h"
 
 /**
- * packet_check_range() - Check if a packet memory range is valid
+ * packet_check_range() - Check if a memory range is valid for a pool
  * @p:		Packet pool
- * @offset:	Offset of data range in packet descriptor
+ * @ptr:	Start of desired data range
  * @len:	Length of desired data range
- * @start:	Start of the packet descriptor
  * @func:	For tracing: name of calling function
  * @line:	For tracing: caller line of function call
  *
  * Return: 0 if the range is valid, -1 otherwise
  */
-static int packet_check_range(const struct pool *p, size_t offset, size_t len,
-			      const char *start, const char *func, int line)
+static int packet_check_range(const struct pool *p, const char *ptr, size_t len,
+			      const char *func, int line)
 {
 	if (p->buf_size == 0) {
 		int ret;
 
-		ret = vu_packet_check_range((void *)p->buf, offset, len, start);
+		ret = vu_packet_check_range((void *)p->buf, ptr, len);
 
 		if (ret == -1)
 			trace("cannot find region, %s:%i", func, line);
@@ -47,16 +46,16 @@ static int packet_check_range(const struct pool *p, size_t offset, size_t len,
 		return ret;
 	}
 
-	if (start < p->buf) {
-		trace("packet start %p before buffer start %p, "
-		      "%s:%i", (void *)start, (void *)p->buf, func, line);
+	if (ptr < p->buf) {
+		trace("packet range start %p before buffer start %p, %s:%i",
+		      (void *)ptr, (void *)p->buf, func, line);
 		return -1;
 	}
 
-	if (start + len + offset > p->buf + p->buf_size) {
-		trace("packet offset plus length %zu from size %zu, "
-		      "%s:%i", start - p->buf + len + offset,
-		      p->buf_size, func, line);
+	if (ptr + len > p->buf + p->buf_size) {
+		trace("packet range end %p after buffer end %p, %s:%i",
+		      (void *)(ptr + len), (void *)(p->buf + p->buf_size),
+		      func, line);
 		return -1;
 	}
 
@@ -81,7 +80,7 @@ void packet_add_do(struct pool *p, size_t len, const char *start,
 		return;
 	}
 
-	if (packet_check_range(p, 0, len, start, func, line))
+	if (packet_check_range(p, start, len, func, line))
 		return;
 
 	if (len > UINT16_MAX) {
@@ -110,6 +109,8 @@ void packet_add_do(struct pool *p, size_t len, const char *start,
 void *packet_get_do(const struct pool *p, size_t idx, size_t offset,
 		    size_t len, size_t *left, const char *func, int line)
 {
+	char *ptr;
+
 	if (idx >= p->size || idx >= p->count) {
 		if (func) {
 			trace("packet %zu from pool size: %zu, count: %zu, "
@@ -135,14 +136,15 @@ void *packet_get_do(const struct pool *p, size_t idx, size_t offset,
 		return NULL;
 	}
 
-	if (packet_check_range(p, offset, len, p->pkt[idx].iov_base,
-			       func, line))
+	ptr = (char *)p->pkt[idx].iov_base + offset;
+
+	if (packet_check_range(p, ptr, len, func, line))
 		return NULL;
 
 	if (left)
 		*left = p->pkt[idx].iov_len - offset - len;
 
-	return (char *)p->pkt[idx].iov_base + offset;
+	return ptr;
 }
 
 /**
diff --git a/packet.h b/packet.h
index 85ee550..bdc07fe 100644
--- a/packet.h
+++ b/packet.h
@@ -24,8 +24,7 @@ struct pool {
 	struct iovec pkt[];
 };
 
-int vu_packet_check_range(void *buf, size_t offset, size_t len,
-			  const char *start);
+int vu_packet_check_range(void *buf, const char *ptr, size_t len);
 void packet_add_do(struct pool *p, size_t len, const char *start,
 		   const char *func, int line);
 void *packet_get_do(const struct pool *p, const size_t idx,
diff --git a/vu_common.c b/vu_common.c
index 48826b1..686a09b 100644
--- a/vu_common.c
+++ b/vu_common.c
@@ -26,14 +26,12 @@
  * vu_packet_check_range() - Check if a given memory zone is contained in
  * 			     a mapped guest memory region
  * @buf:	Array of the available memory regions
- * @offset:	Offset of data range in packet descriptor
+ * @ptr:	Start of desired data range
  * @size:	Length of desired data range
- * @start:	Start of the packet descriptor
  *
  * Return: 0 if the zone is in a mapped memory region, -1 otherwise
  */
-int vu_packet_check_range(void *buf, size_t offset, size_t len,
-			  const char *start)
+int vu_packet_check_range(void *buf, const char *ptr, size_t len)
 {
 	struct vu_dev_region *dev_region;
 
@@ -41,9 +39,8 @@ int vu_packet_check_range(void *buf, size_t offset, size_t len,
 		/* NOLINTNEXTLINE(performance-no-int-to-ptr) */
 		char *m = (char *)(uintptr_t)dev_region->mmap_addr;
 
-		if (m <= start &&
-		    start + offset + len <= m + dev_region->mmap_offset +
-					       dev_region->size)
+		if (m <= ptr &&
+		    ptr + len <= m + dev_region->mmap_offset + dev_region->size)
 			return 0;
 	}
 

From 6b4065153c67e7578d448927e49f244deea70e4d Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Tue, 18 Feb 2025 13:07:19 +1100
Subject: [PATCH 004/144] tap: Remove unused ETH_HDR_INIT() macro

The uses of this macro were removed in d4598e1d18ac ("udp: Use the same
buffer for the L2 header for all frames").

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 tap.h | 2 --
 1 file changed, 2 deletions(-)

diff --git a/tap.h b/tap.h
index dfbd8b9..a476a12 100644
--- a/tap.h
+++ b/tap.h
@@ -6,8 +6,6 @@
 #ifndef TAP_H
 #define TAP_H
 
-#define ETH_HDR_INIT(proto) { .h_proto = htons_constant(proto) }
-
 /**
  * struct tap_hdr - tap backend specific headers
  * @vnet_len:	Frame length (for qemu socket transport)

From 5a07eb3cccf1abf0a44d6ab01819f8f605c87ef4 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Tue, 18 Feb 2025 13:50:13 +1100
Subject: [PATCH 005/144] tcp_vu: head_cnt need not be global

head_cnt is a global variable which tracks how many entries in head[] are
currently used.  The fact that it's global obscures the fact that the
lifetime over which it has a meaningful value is quite short: a single
call to tcp_vu_data_from_sock().

Make it a local to tcp_vu_data_from_sock() to make that lifetime clearer.
We keep the head[] array global for now - although technically it has the
same valid lifetime - because it's large enough we might not want to put
it on the stack.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Reviewed-by: Laurent Vivier <lvivier@redhat.com>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 tcp_vu.c | 21 +++++++++++----------
 1 file changed, 11 insertions(+), 10 deletions(-)

diff --git a/tcp_vu.c b/tcp_vu.c
index 0622f17..6891ed1 100644
--- a/tcp_vu.c
+++ b/tcp_vu.c
@@ -38,7 +38,6 @@
 static struct iovec iov_vu[VIRTQUEUE_MAX_SIZE + 1];
 static struct vu_virtq_element elem[VIRTQUEUE_MAX_SIZE];
 static int head[VIRTQUEUE_MAX_SIZE + 1];
-static int head_cnt;
 
 /**
  * tcp_vu_hdrlen() - return the size of the header in level 2 frame (TCP)
@@ -183,7 +182,7 @@ int tcp_vu_send_flag(const struct ctx *c, struct tcp_tap_conn *conn, int flags)
 static ssize_t tcp_vu_sock_recv(const struct ctx *c,
 				const struct tcp_tap_conn *conn, bool v6,
 				uint32_t already_sent, size_t fillsize,
-				int *iov_cnt)
+				int *iov_cnt, int *head_cnt)
 {
 	struct vu_dev *vdev = c->vdev;
 	struct vu_virtq *vq = &vdev->vq[VHOST_USER_RX_QUEUE];
@@ -202,7 +201,7 @@ static ssize_t tcp_vu_sock_recv(const struct ctx *c,
 	vu_init_elem(elem, &iov_vu[1], VIRTQUEUE_MAX_SIZE);
 
 	elem_cnt = 0;
-	head_cnt = 0;
+	*head_cnt = 0;
 	while (fillsize > 0 && elem_cnt < VIRTQUEUE_MAX_SIZE) {
 		struct iovec *iov;
 		size_t frame_size, dlen;
@@ -221,7 +220,7 @@ static ssize_t tcp_vu_sock_recv(const struct ctx *c,
 		ASSERT(iov->iov_len >= hdrlen);
 		iov->iov_base = (char *)iov->iov_base + hdrlen;
 		iov->iov_len -= hdrlen;
-		head[head_cnt++] = elem_cnt;
+		head[(*head_cnt)++] = elem_cnt;
 
 		fillsize -= dlen;
 		elem_cnt += cnt;
@@ -261,17 +260,18 @@ static ssize_t tcp_vu_sock_recv(const struct ctx *c,
 		len -= iov->iov_len;
 	}
 	/* adjust head count */
-	while (head_cnt > 0 && head[head_cnt - 1] >= i)
-		head_cnt--;
+	while (*head_cnt > 0 && head[*head_cnt - 1] >= i)
+		(*head_cnt)--;
+
 	/* mark end of array */
-	head[head_cnt] = i;
+	head[*head_cnt] = i;
 	*iov_cnt = i;
 
 	/* release unused buffers */
 	vu_queue_rewind(vq, elem_cnt - i);
 
 	/* restore space for headers in iov */
-	for (i = 0; i < head_cnt; i++) {
+	for (i = 0; i < *head_cnt; i++) {
 		struct iovec *iov = &elem[head[i]].in_sg[0];
 
 		iov->iov_base = (char *)iov->iov_base - hdrlen;
@@ -357,11 +357,11 @@ int tcp_vu_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn)
 	struct vu_dev *vdev = c->vdev;
 	struct vu_virtq *vq = &vdev->vq[VHOST_USER_RX_QUEUE];
 	ssize_t len, previous_dlen;
+	int i, iov_cnt, head_cnt;
 	size_t hdrlen, fillsize;
 	int v6 = CONN_V6(conn);
 	uint32_t already_sent;
 	const uint16_t *check;
-	int i, iov_cnt;
 
 	if (!vu_queue_enabled(vq) || !vu_queue_started(vq)) {
 		debug("Got packet, but RX virtqueue not usable yet");
@@ -396,7 +396,8 @@ int tcp_vu_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn)
 	/* collect the buffers from vhost-user and fill them with the
 	 * data from the socket
 	 */
-	len = tcp_vu_sock_recv(c, conn, v6, already_sent, fillsize, &iov_cnt);
+	len = tcp_vu_sock_recv(c, conn, v6, already_sent, fillsize,
+			       &iov_cnt, &head_cnt);
 	if (len < 0) {
 		if (len != -EAGAIN && len != -EWOULDBLOCK) {
 			tcp_rst(c, conn);

From e56c8038fc23a349ff4a457c6b447f927ac1a56e Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Tue, 18 Feb 2025 19:59:21 +1100
Subject: [PATCH 006/144] tcp: More type safety for
 tcp_flow_migrate_target_ext()

tcp_flow_migrate_target_ext() takes a raw union flow *, although it is TCP
specific, and requires a FLOW_TYPE_TCP entry.  Our usual convention is that
such functions should take a struct tcp_tap_conn * instead.  Convert it to
do so.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 flow.c     | 2 +-
 tcp.c      | 7 +++----
 tcp_conn.h | 2 +-
 3 files changed, 5 insertions(+), 6 deletions(-)

diff --git a/flow.c b/flow.c
index cc881e8..abe95b2 100644
--- a/flow.c
+++ b/flow.c
@@ -1106,7 +1106,7 @@ int flow_migrate_target(struct ctx *c, const struct migrate_stage *stage,
 	repair_flush(c);
 
 	for (i = 0; i < count; i++) {
-		rc = tcp_flow_migrate_target_ext(c, flowtab + i, fd);
+		rc = tcp_flow_migrate_target_ext(c, &flowtab[i].tcp, fd);
 		if (rc) {
 			debug("Migration data failure at flow %u: %s, abort",
 			      i, strerror_(-rc));
diff --git a/tcp.c b/tcp.c
index 98e1c6a..272e4cd 100644
--- a/tcp.c
+++ b/tcp.c
@@ -3394,14 +3394,13 @@ int tcp_flow_migrate_target(struct ctx *c, int fd)
 /**
  * tcp_flow_migrate_target_ext() - Receive extended data for flow, set, connect
  * @c:		Execution context
- * @flow:	Existing flow for this connection data
+ * @conn:	Connection entry to complete with extra data
  * @fd:		Descriptor for state migration
  *
  * Return: 0 on success, negative on fatal failure, but 0 on single flow failure
  */
-int tcp_flow_migrate_target_ext(struct ctx *c, union flow *flow, int fd)
+int tcp_flow_migrate_target_ext(struct ctx *c, struct tcp_tap_conn *conn, int fd)
 {
-	struct tcp_tap_conn *conn = &flow->tcp;
 	uint32_t peek_offset = conn->seq_to_tap - conn->seq_ack_from_tap;
 	struct tcp_tap_transfer_ext t;
 	int s = conn->sock, rc;
@@ -3413,7 +3412,7 @@ int tcp_flow_migrate_target_ext(struct ctx *c, union flow *flow, int fd)
 	}
 
 	if (!t.tcpi_state) { /* Source wants us to skip this flow */
-		flow_err(flow, "Dropping as requested by source");
+		flow_err(conn, "Dropping as requested by source");
 		goto fail;
 	}
 
diff --git a/tcp_conn.h b/tcp_conn.h
index 42dff48..53887c0 100644
--- a/tcp_conn.h
+++ b/tcp_conn.h
@@ -239,7 +239,7 @@ int tcp_flow_migrate_source_ext(int fd, int fidx,
 				const struct tcp_tap_conn *conn);
 
 int tcp_flow_migrate_target(struct ctx *c, int fd);
-int tcp_flow_migrate_target_ext(struct ctx *c, union flow *flow, int fd);
+int tcp_flow_migrate_target_ext(struct ctx *c, struct tcp_tap_conn *conn, int fd);
 
 bool tcp_flow_is_established(const struct tcp_tap_conn *conn);
 

From 854bc7b1a3b4e5443ea071e49b3a68198dbb88b3 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Tue, 18 Feb 2025 19:59:22 +1100
Subject: [PATCH 007/144] tcp: Remove spurious prototype for
 tcp_flow_migrate_shrink_window

This function existed in drafts of the migration code, but not the final
version.  Get rid of the prototype.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 tcp_conn.h | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tcp_conn.h b/tcp_conn.h
index 53887c0..8a15b08 100644
--- a/tcp_conn.h
+++ b/tcp_conn.h
@@ -233,7 +233,6 @@ bool tcp_flow_defer(const struct tcp_tap_conn *conn);
 int tcp_flow_repair_on(struct ctx *c, const struct tcp_tap_conn *conn);
 int tcp_flow_repair_off(struct ctx *c, const struct tcp_tap_conn *conn);
 
-int tcp_flow_migrate_shrink_window(int fidx, const struct tcp_tap_conn *conn);
 int tcp_flow_migrate_source(int fd, struct tcp_tap_conn *conn);
 int tcp_flow_migrate_source_ext(int fd, int fidx,
 				const struct tcp_tap_conn *conn);

From ba0823f8a0e60d4fc0cb21179aaf64940509156a Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Tue, 18 Feb 2025 19:59:23 +1100
Subject: [PATCH 008/144] tcp: Don't pass both flow pointer and flow index

tcp_flow_migrate_source_ext() is passed both the index of the flow it
operates on and the pointer to the connection structure.  However, the
former is trivially derived from the latter.  Simplify the interface.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 flow.c     | 2 +-
 tcp.c      | 6 ++----
 tcp_conn.h | 3 +--
 3 files changed, 4 insertions(+), 7 deletions(-)

diff --git a/flow.c b/flow.c
index abe95b2..cc393e0 100644
--- a/flow.c
+++ b/flow.c
@@ -1053,7 +1053,7 @@ int flow_migrate_source(struct ctx *c, const struct migrate_stage *stage,
 	 * as EIO).
 	 */
 	foreach_established_tcp_flow(i, flow, FLOW_MAX) {
-		rc = tcp_flow_migrate_source_ext(fd, i, &flow->tcp);
+		rc = tcp_flow_migrate_source_ext(fd, &flow->tcp);
 		if (rc) {
 			err("Extended data for flow %u: %s", i, strerror_(-rc));
 
diff --git a/tcp.c b/tcp.c
index 272e4cd..21b6c6c 100644
--- a/tcp.c
+++ b/tcp.c
@@ -3141,16 +3141,14 @@ int tcp_flow_migrate_source(int fd, struct tcp_tap_conn *conn)
 /**
  * tcp_flow_migrate_source_ext() - Dump queues, close sockets, send final data
  * @fd:		Descriptor for state migration
- * @fidx:	Flow index
  * @conn:	Pointer to the TCP connection structure
  *
  * Return: 0 on success, negative (not -EIO) on failure, -EIO on sending failure
  */
-int tcp_flow_migrate_source_ext(int fd, int fidx,
-				const struct tcp_tap_conn *conn)
+int tcp_flow_migrate_source_ext(int fd, const struct tcp_tap_conn *conn)
 {
 	uint32_t peek_offset = conn->seq_to_tap - conn->seq_ack_from_tap;
-	struct tcp_tap_transfer_ext *t = &migrate_ext[fidx];
+	struct tcp_tap_transfer_ext *t = &migrate_ext[FLOW_IDX(conn)];
 	int s = conn->sock;
 	int rc;
 
diff --git a/tcp_conn.h b/tcp_conn.h
index 8a15b08..9126a36 100644
--- a/tcp_conn.h
+++ b/tcp_conn.h
@@ -234,8 +234,7 @@ int tcp_flow_repair_on(struct ctx *c, const struct tcp_tap_conn *conn);
 int tcp_flow_repair_off(struct ctx *c, const struct tcp_tap_conn *conn);
 
 int tcp_flow_migrate_source(int fd, struct tcp_tap_conn *conn);
-int tcp_flow_migrate_source_ext(int fd, int fidx,
-				const struct tcp_tap_conn *conn);
+int tcp_flow_migrate_source_ext(int fd, const struct tcp_tap_conn *conn);
 
 int tcp_flow_migrate_target(struct ctx *c, int fd);
 int tcp_flow_migrate_target_ext(struct ctx *c, struct tcp_tap_conn *conn, int fd);

From adb46c11d0ea67824cf8c4ef2113ec0b2c563c0e Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Tue, 18 Feb 2025 19:59:24 +1100
Subject: [PATCH 009/144] flow: Add flow_perror() helper

Our general logging helpers include a number of _perror() variants which,
like perror(3), include the description of the current errno.  We didn't
have those for our flow specific logging helpers, though.  Fill this gap
with flow_perror() and flow_dbg_perror(), and use them where it's useful.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 flow.c       | 12 +++++++-----
 flow.h       | 18 ++++++++++++++----
 icmp.c       |  5 ++---
 tcp.c        | 33 +++++++++++++++------------------
 tcp_splice.c |  9 ++++-----
 udp_flow.c   | 19 +++++++------------
 6 files changed, 49 insertions(+), 47 deletions(-)

diff --git a/flow.c b/flow.c
index cc393e0..c68f6bb 100644
--- a/flow.c
+++ b/flow.c
@@ -289,11 +289,13 @@ int flowside_connect(const struct ctx *c, int s,
 
 /** flow_log_ - Log flow-related message
  * @f:		flow the message is related to
+ * @newline:	Append newline at the end of the message, if missing
  * @pri:	Log priority
  * @fmt:	Format string
  * @...:	printf-arguments
  */
-void flow_log_(const struct flow_common *f, int pri, const char *fmt, ...)
+void flow_log_(const struct flow_common *f, bool newline, int pri,
+	       const char *fmt, ...)
 {
 	const char *type_or_state;
 	char msg[BUFSIZ];
@@ -309,7 +311,7 @@ void flow_log_(const struct flow_common *f, int pri, const char *fmt, ...)
 	else
 		type_or_state = FLOW_TYPE(f);
 
-	logmsg(true, false, pri,
+	logmsg(newline, false, pri,
 	       "Flow %u (%s): %s", flow_idx(f), type_or_state, msg);
 }
 
@@ -329,7 +331,7 @@ void flow_log_details_(const struct flow_common *f, int pri,
 	const struct flowside *tgt = &f->side[TGTSIDE];
 
 	if (state >= FLOW_STATE_TGT)
-		flow_log_(f, pri,
+		flow_log_(f, true, pri,
 			  "%s [%s]:%hu -> [%s]:%hu => %s [%s]:%hu -> [%s]:%hu",
 			  pif_name(f->pif[INISIDE]),
 			  inany_ntop(&ini->eaddr, estr0, sizeof(estr0)),
@@ -342,7 +344,7 @@ void flow_log_details_(const struct flow_common *f, int pri,
 			  inany_ntop(&tgt->eaddr, estr1, sizeof(estr1)),
 			  tgt->eport);
 	else if (state >= FLOW_STATE_INI)
-		flow_log_(f, pri, "%s [%s]:%hu -> [%s]:%hu => ?",
+		flow_log_(f, true, pri, "%s [%s]:%hu -> [%s]:%hu => ?",
 			  pif_name(f->pif[INISIDE]),
 			  inany_ntop(&ini->eaddr, estr0, sizeof(estr0)),
 			  ini->eport,
@@ -363,7 +365,7 @@ static void flow_set_state(struct flow_common *f, enum flow_state state)
 	ASSERT(oldstate < FLOW_NUM_STATES);
 
 	f->state = state;
-	flow_log_(f, LOG_DEBUG, "%s -> %s", flow_state_str[oldstate],
+	flow_log_(f, true, LOG_DEBUG, "%s -> %s", flow_state_str[oldstate],
 		  FLOW_STATE(f));
 
 	flow_log_details_(f, LOG_DEBUG, MAX(state, oldstate));
diff --git a/flow.h b/flow.h
index 675726e..dcf7645 100644
--- a/flow.h
+++ b/flow.h
@@ -258,11 +258,11 @@ int flow_migrate_source(struct ctx *c, const struct migrate_stage *stage,
 int flow_migrate_target(struct ctx *c, const struct migrate_stage *stage,
 			int fd);
 
-void flow_log_(const struct flow_common *f, int pri, const char *fmt, ...)
-	__attribute__((format(printf, 3, 4)));
-
-#define flow_log(f_, pri, ...)	flow_log_(&(f_)->f, (pri), __VA_ARGS__)
+void flow_log_(const struct flow_common *f, bool newline, int pri,
+	       const char *fmt, ...)
+	__attribute__((format(printf, 4, 5)));
 
+#define flow_log(f_, pri, ...)	flow_log_(&(f_)->f, true, (pri), __VA_ARGS__)
 #define flow_dbg(f, ...)	flow_log((f), LOG_DEBUG, __VA_ARGS__)
 #define flow_err(f, ...)	flow_log((f), LOG_ERR, __VA_ARGS__)
 
@@ -272,6 +272,16 @@ void flow_log_(const struct flow_common *f, int pri, const char *fmt, ...)
 			flow_dbg((f), __VA_ARGS__);			\
 	} while (0)
 
+#define flow_log_perror_(f, pri, ...)					\
+	do {								\
+		int errno_ = errno;					\
+		flow_log_((f), false, (pri), __VA_ARGS__);		\
+		logmsg(true, true, (pri), ": %s", strerror_(errno_));	\
+	} while (0)
+
+#define flow_dbg_perror(f_, ...) flow_log_perror_(&(f_)->f, LOG_DEBUG, __VA_ARGS__)
+#define flow_perror(f_, ...)	flow_log_perror_(&(f_)->f, LOG_ERR, __VA_ARGS__)
+
 void flow_log_details_(const struct flow_common *f, int pri,
 		       enum flow_state state);
 #define flow_log_details(f_, pri) \
diff --git a/icmp.c b/icmp.c
index bcf498d..7e2b342 100644
--- a/icmp.c
+++ b/icmp.c
@@ -85,7 +85,7 @@ void icmp_sock_handler(const struct ctx *c, union epoll_ref ref)
 
 	n = recvfrom(ref.fd, buf, sizeof(buf), 0, &sr.sa, &sl);
 	if (n < 0) {
-		flow_err(pingf, "recvfrom() error: %s", strerror_(errno));
+		flow_perror(pingf, "recvfrom() error");
 		return;
 	}
 
@@ -300,8 +300,7 @@ int icmp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af,
 
 	pif_sockaddr(c, &sa, &sl, PIF_HOST, &tgt->eaddr, 0);
 	if (sendto(pingf->sock, pkt, l4len, MSG_NOSIGNAL, &sa.sa, sl) < 0) {
-		flow_dbg(pingf, "failed to relay request to socket: %s",
-			 strerror_(errno));
+		flow_dbg_perror(pingf, "failed to relay request to socket");
 	} else {
 		flow_dbg(pingf,
 			 "echo request to socket, ID: %"PRIu16", seq: %"PRIu16,
diff --git a/tcp.c b/tcp.c
index 21b6c6c..f498f5b 100644
--- a/tcp.c
+++ b/tcp.c
@@ -551,8 +551,7 @@ static void tcp_timer_ctl(const struct ctx *c, struct tcp_tap_conn *conn)
 
 		fd = timerfd_create(CLOCK_MONOTONIC, 0);
 		if (fd == -1 || fd > FD_REF_MAX) {
-			flow_dbg(conn, "failed to get timer: %s",
-				 strerror_(errno));
+			flow_dbg_perror(conn, "failed to get timer");
 			if (fd > -1)
 				close(fd);
 			conn->timer = -1;
@@ -561,8 +560,7 @@ static void tcp_timer_ctl(const struct ctx *c, struct tcp_tap_conn *conn)
 		conn->timer = fd;
 
 		if (epoll_ctl(c->epollfd, EPOLL_CTL_ADD, conn->timer, &ev)) {
-			flow_dbg(conn, "failed to add timer: %s",
-				 strerror_(errno));
+			flow_dbg_perror(conn, "failed to add timer");
 			close(conn->timer);
 			conn->timer = -1;
 			return;
@@ -587,7 +585,7 @@ static void tcp_timer_ctl(const struct ctx *c, struct tcp_tap_conn *conn)
 		 (unsigned long long)it.it_value.tv_nsec / 1000 / 1000);
 
 	if (timerfd_settime(conn->timer, 0, &it, NULL))
-		flow_err(conn, "failed to set timer: %s", strerror_(errno));
+		flow_perror(conn, "failed to set timer");
 }
 
 /**
@@ -1386,10 +1384,10 @@ static void tcp_bind_outbound(const struct ctx *c,
 		if (bind(s, &bind_sa.sa, sl)) {
 			char sstr[INANY_ADDRSTRLEN];
 
-			flow_dbg(conn,
-				 "Can't bind TCP outbound socket to %s:%hu: %s",
-				 inany_ntop(&tgt->oaddr, sstr, sizeof(sstr)),
-				 tgt->oport, strerror_(errno));
+			flow_dbg_perror(conn,
+					"Can't bind TCP outbound socket to %s:%hu",
+					inany_ntop(&tgt->oaddr, sstr, sizeof(sstr)),
+					tgt->oport);
 		}
 	}
 
@@ -1398,9 +1396,9 @@ static void tcp_bind_outbound(const struct ctx *c,
 			if (setsockopt(s, SOL_SOCKET, SO_BINDTODEVICE,
 				       c->ip4.ifname_out,
 				       strlen(c->ip4.ifname_out))) {
-				flow_dbg(conn, "Can't bind IPv4 TCP socket to"
-					 " interface %s: %s", c->ip4.ifname_out,
-					 strerror_(errno));
+				flow_dbg_perror(conn,
+						"Can't bind IPv4 TCP socket to interface %s",
+						c->ip4.ifname_out);
 			}
 		}
 	} else if (bind_sa.sa_family == AF_INET6) {
@@ -1408,9 +1406,9 @@ static void tcp_bind_outbound(const struct ctx *c,
 			if (setsockopt(s, SOL_SOCKET, SO_BINDTODEVICE,
 				       c->ip6.ifname_out,
 				       strlen(c->ip6.ifname_out))) {
-				flow_dbg(conn, "Can't bind IPv6 TCP socket to"
-					 " interface %s: %s", c->ip6.ifname_out,
-					 strerror_(errno));
+				flow_dbg_perror(conn,
+						"Can't bind IPv6 TCP socket to interface %s",
+						c->ip6.ifname_out);
 			}
 		}
 	}
@@ -2193,7 +2191,7 @@ void tcp_timer_handler(const struct ctx *c, union epoll_ref ref)
 	 * and we just set the timer to a new point in the future: discard it.
 	 */
 	if (timerfd_gettime(conn->timer, &check_armed))
-		flow_err(conn, "failed to read timer: %s", strerror_(errno));
+		flow_perror(conn, "failed to read timer");
 
 	if (check_armed.it_value.tv_sec || check_armed.it_value.tv_nsec)
 		return;
@@ -2235,8 +2233,7 @@ void tcp_timer_handler(const struct ctx *c, union epoll_ref ref)
 		 * ~ACK_TO_TAP_DUE or ~ACK_FROM_TAP_DUE.
 		 */
 		if (timerfd_settime(conn->timer, 0, &new, &old))
-			flow_err(conn, "failed to set timer: %s",
-				 strerror_(errno));
+			flow_perror(conn, "failed to set timer");
 
 		if (old.it_value.tv_sec == ACT_TIMEOUT) {
 			flow_dbg(conn, "activity timeout");
diff --git a/tcp_splice.c b/tcp_splice.c
index 5d845c9..0d10e3d 100644
--- a/tcp_splice.c
+++ b/tcp_splice.c
@@ -164,7 +164,7 @@ static int tcp_splice_epoll_ctl(const struct ctx *c,
 	if (epoll_ctl(c->epollfd, m, conn->s[0], &ev[0]) ||
 	    epoll_ctl(c->epollfd, m, conn->s[1], &ev[1])) {
 		int ret = -errno;
-		flow_err(conn, "ERROR on epoll_ctl(): %s", strerror_(errno));
+		flow_perror(conn, "ERROR on epoll_ctl()");
 		return ret;
 	}
 
@@ -317,8 +317,8 @@ static int tcp_splice_connect_finish(const struct ctx *c,
 
 		if (conn->pipe[sidei][0] < 0) {
 			if (pipe2(conn->pipe[sidei], O_NONBLOCK | O_CLOEXEC)) {
-				flow_err(conn, "cannot create %d->%d pipe: %s",
-					 sidei, !sidei, strerror_(errno));
+				flow_perror(conn, "cannot create %d->%d pipe",
+					    sidei, !sidei);
 				conn_flag(c, conn, CLOSING);
 				return -EIO;
 			}
@@ -482,8 +482,7 @@ void tcp_splice_sock_handler(struct ctx *c, union epoll_ref ref,
 
 		rc = getsockopt(ref.fd, SOL_SOCKET, SO_ERROR, &err, &sl);
 		if (rc)
-			flow_err(conn, "Error retrieving SO_ERROR: %s",
-				 strerror_(errno));
+			flow_perror(conn, "Error retrieving SO_ERROR");
 		else
 			flow_trace(conn, "Error event on socket: %s",
 				   strerror_(err));
diff --git a/udp_flow.c b/udp_flow.c
index 83c2568..c6b8630 100644
--- a/udp_flow.c
+++ b/udp_flow.c
@@ -93,9 +93,8 @@ static flow_sidx_t udp_flow_new(const struct ctx *c, union flow *flow,
 		 */
 		uflow->s[INISIDE] = fcntl(s_ini, F_DUPFD_CLOEXEC, 0);
 		if (uflow->s[INISIDE] < 0) {
-			flow_err(uflow,
-				 "Couldn't duplicate listening socket: %s",
-				 strerror_(errno));
+			flow_perror(uflow,
+				    "Couldn't duplicate listening socket");
 			goto cancel;
 		}
 	}
@@ -113,16 +112,13 @@ static flow_sidx_t udp_flow_new(const struct ctx *c, union flow *flow,
 		uflow->s[TGTSIDE] = flowside_sock_l4(c, EPOLL_TYPE_UDP_REPLY,
 						     tgtpif, tgt, fref.data);
 		if (uflow->s[TGTSIDE] < 0) {
-			flow_dbg(uflow,
-				 "Couldn't open socket for spliced flow: %s",
-				 strerror_(errno));
+			flow_dbg_perror(uflow,
+					"Couldn't open socket for spliced flow");
 			goto cancel;
 		}
 
 		if (flowside_connect(c, uflow->s[TGTSIDE], tgtpif, tgt) < 0) {
-			flow_dbg(uflow,
-				 "Couldn't connect flow socket: %s",
-				 strerror_(errno));
+			flow_dbg_perror(uflow, "Couldn't connect flow socket");
 			goto cancel;
 		}
 
@@ -142,9 +138,8 @@ static flow_sidx_t udp_flow_new(const struct ctx *c, union flow *flow,
 			flow_trace(uflow,
 				   "Discarded %d spurious reply datagrams", rc);
 		} else if (errno != EAGAIN) {
-			flow_err(uflow,
-				 "Unexpected error discarding datagrams: %s",
-				 strerror_(errno));
+			flow_perror(uflow,
+				    "Unexpected error discarding datagrams");
 		}
 	}
 

From 7ffca35fddf1568698199c931ba1877c1908b443 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Wed, 19 Feb 2025 13:28:34 +1100
Subject: [PATCH 010/144] flow: Remove unneeded index from foreach_* macros

The foreach macros are odd in that they take two loop counters: an integer
index, and a pointer to the flow.  We nearly always want the latter, not
the former, and we can get the index from the pointer trivially when we
need it.  So, rearrange the macros not to need the integer index.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 flow.c | 44 +++++++++++++++++++++-----------------------
 1 file changed, 21 insertions(+), 23 deletions(-)

diff --git a/flow.c b/flow.c
index c68f6bb..3fcdd9f 100644
--- a/flow.c
+++ b/flow.c
@@ -53,30 +53,28 @@ const uint8_t flow_proto[] = {
 static_assert(ARRAY_SIZE(flow_proto) == FLOW_NUM_TYPES,
 	      "flow_proto[] doesn't match enum flow_type");
 
-#define foreach_flow(i, flow, bound)					\
-	for ((i) = 0, (flow) = &flowtab[(i)];				\
-	     (i) < (bound);						\
-	     (i)++, (flow) = &flowtab[(i)])				\
+#define foreach_flow(flow, bound)					\
+	for ((flow) = flowtab; FLOW_IDX(flow) < (bound); (flow)++)	\
 		if ((flow)->f.state == FLOW_STATE_FREE)			\
-			(i) += (flow)->free.n - 1;			\
+			(flow) += (flow)->free.n - 1;			\
 		else
 
-#define foreach_active_flow(i, flow, bound)				\
-	foreach_flow((i), (flow), (bound))				\
+#define foreach_active_flow(flow, bound)				\
+	foreach_flow((flow), (bound))					\
 		if ((flow)->f.state != FLOW_STATE_ACTIVE)		\
 			/* NOLINTNEXTLINE(bugprone-branch-clone) */	\
 			continue;					\
 		else
 
-#define foreach_tcp_flow(i, flow, bound)				\
-	foreach_active_flow((i), (flow), (bound))			\
+#define foreach_tcp_flow(flow, bound)					\
+	foreach_active_flow((flow), (bound))				\
 		if ((flow)->f.type != FLOW_TCP)				\
 			/* NOLINTNEXTLINE(bugprone-branch-clone) */	\
 			continue;					\
 		else
 
-#define foreach_established_tcp_flow(i, flow, bound)			\
-	foreach_tcp_flow((i), (flow), (bound))				\
+#define foreach_established_tcp_flow(flow, bound)			\
+	foreach_tcp_flow((flow), (bound))				\
 		if (!tcp_flow_is_established(&(flow)->tcp))		\
 			/* NOLINTNEXTLINE(bugprone-branch-clone) */	\
 			continue;					\
@@ -918,11 +916,10 @@ static int flow_migrate_source_rollback(struct ctx *c, unsigned max_flow,
 					int ret)
 {
 	union flow *flow;
-	unsigned i;
 
 	debug("...roll back migration");
 
-	foreach_established_tcp_flow(i, flow, max_flow)
+	foreach_established_tcp_flow(flow, max_flow)
 		if (tcp_flow_repair_off(c, &flow->tcp))
 			die("Failed to roll back TCP_REPAIR mode");
 
@@ -942,10 +939,9 @@ static int flow_migrate_source_rollback(struct ctx *c, unsigned max_flow,
 static int flow_migrate_repair_all(struct ctx *c, bool enable)
 {
 	union flow *flow;
-	unsigned i;
 	int rc;
 
-	foreach_established_tcp_flow(i, flow, FLOW_MAX) {
+	foreach_established_tcp_flow(flow, FLOW_MAX) {
 		if (enable)
 			rc = tcp_flow_repair_on(c, &flow->tcp);
 		else
@@ -954,14 +950,15 @@ static int flow_migrate_repair_all(struct ctx *c, bool enable)
 		if (rc) {
 			debug("Can't %s repair mode: %s",
 			      enable ? "enable" : "disable", strerror_(-rc));
-			return flow_migrate_source_rollback(c, i, rc);
+			return flow_migrate_source_rollback(c, FLOW_IDX(flow),
+							    rc);
 		}
 	}
 
 	if ((rc = repair_flush(c))) {
 		debug("Can't %s repair mode: %s",
 		      enable ? "enable" : "disable", strerror_(-rc));
-		return flow_migrate_source_rollback(c, i, rc);
+		return flow_migrate_source_rollback(c, FLOW_IDX(flow), rc);
 	}
 
 	return 0;
@@ -1003,13 +1000,12 @@ int flow_migrate_source(struct ctx *c, const struct migrate_stage *stage,
 	uint32_t count = 0;
 	bool first = true;
 	union flow *flow;
-	unsigned i;
 	int rc;
 
 	(void)c;
 	(void)stage;
 
-	foreach_established_tcp_flow(i, flow, FLOW_MAX)
+	foreach_established_tcp_flow(flow, FLOW_MAX)
 		count++;
 
 	count = htonl(count);
@@ -1028,10 +1024,11 @@ int flow_migrate_source(struct ctx *c, const struct migrate_stage *stage,
 	 * stream might now be inconsistent, and we might have closed listening
 	 * TCP sockets, so just terminate.
 	 */
-	foreach_established_tcp_flow(i, flow, FLOW_MAX) {
+	foreach_established_tcp_flow(flow, FLOW_MAX) {
 		rc = tcp_flow_migrate_source(fd, &flow->tcp);
 		if (rc) {
-			err("Can't send data, flow %u: %s", i, strerror_(-rc));
+			err("Can't send data, flow %u: %s", FLOW_IDX(flow),
+			    strerror_(-rc));
 			if (!first)
 				die("Inconsistent migration state, exiting");
 
@@ -1054,10 +1051,11 @@ int flow_migrate_source(struct ctx *c, const struct migrate_stage *stage,
 	 * failures but not if the stream might be inconsistent (reported here
 	 * as EIO).
 	 */
-	foreach_established_tcp_flow(i, flow, FLOW_MAX) {
+	foreach_established_tcp_flow(flow, FLOW_MAX) {
 		rc = tcp_flow_migrate_source_ext(fd, &flow->tcp);
 		if (rc) {
-			err("Extended data for flow %u: %s", i, strerror_(-rc));
+			err("Extended data for flow %u: %s", FLOW_IDX(flow),
+			    strerror_(-rc));
 
 			if (rc == -EIO)
 				die("Inconsistent migration state, exiting");

From b79a22d3601b69cf58b1803c5ead7f4667c46827 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Wed, 19 Feb 2025 13:28:35 +1100
Subject: [PATCH 011/144] flow: Remove unneeded bound parameter from flow
 traversal macros

The foreach macros used to step through flows each take a 'bound' parameter
to only scan part of the flow table.  Only one place actually passes a
bound different from FLOW_MAX.  So we can simplify every other invocation
by having that one case manually handle the bound.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 flow.c | 34 ++++++++++++++++++----------------
 1 file changed, 18 insertions(+), 16 deletions(-)

diff --git a/flow.c b/flow.c
index 3fcdd9f..602fea7 100644
--- a/flow.c
+++ b/flow.c
@@ -53,28 +53,28 @@ const uint8_t flow_proto[] = {
 static_assert(ARRAY_SIZE(flow_proto) == FLOW_NUM_TYPES,
 	      "flow_proto[] doesn't match enum flow_type");
 
-#define foreach_flow(flow, bound)					\
-	for ((flow) = flowtab; FLOW_IDX(flow) < (bound); (flow)++)	\
+#define foreach_flow(flow)						\
+	for ((flow) = flowtab; FLOW_IDX(flow) < FLOW_MAX; (flow)++)	\
 		if ((flow)->f.state == FLOW_STATE_FREE)			\
 			(flow) += (flow)->free.n - 1;			\
 		else
 
-#define foreach_active_flow(flow, bound)				\
-	foreach_flow((flow), (bound))					\
+#define foreach_active_flow(flow)					\
+	foreach_flow((flow))						\
 		if ((flow)->f.state != FLOW_STATE_ACTIVE)		\
 			/* NOLINTNEXTLINE(bugprone-branch-clone) */	\
 			continue;					\
 		else
 
-#define foreach_tcp_flow(flow, bound)					\
-	foreach_active_flow((flow), (bound))				\
+#define foreach_tcp_flow(flow)						\
+	foreach_active_flow((flow))					\
 		if ((flow)->f.type != FLOW_TCP)				\
 			/* NOLINTNEXTLINE(bugprone-branch-clone) */	\
 			continue;					\
 		else
 
-#define foreach_established_tcp_flow(flow, bound)			\
-	foreach_tcp_flow((flow), (bound))				\
+#define foreach_established_tcp_flow(flow)				\
+	foreach_tcp_flow((flow))					\
 		if (!tcp_flow_is_established(&(flow)->tcp))		\
 			/* NOLINTNEXTLINE(bugprone-branch-clone) */	\
 			continue;					\
@@ -907,21 +907,23 @@ void flow_defer_handler(const struct ctx *c, const struct timespec *now)
 /**
  * flow_migrate_source_rollback() - Disable repair mode, return failure
  * @c:		Execution context
- * @max_flow:	Maximum index of affected flows
+ * @bound:	No need to roll back flow indices >= @bound
  * @ret:	Negative error code
  *
  * Return: @ret
  */
-static int flow_migrate_source_rollback(struct ctx *c, unsigned max_flow,
-					int ret)
+static int flow_migrate_source_rollback(struct ctx *c, unsigned bound, int ret)
 {
 	union flow *flow;
 
 	debug("...roll back migration");
 
-	foreach_established_tcp_flow(flow, max_flow)
+	foreach_established_tcp_flow(flow) {
+		if (FLOW_IDX(flow) >= bound)
+			break;
 		if (tcp_flow_repair_off(c, &flow->tcp))
 			die("Failed to roll back TCP_REPAIR mode");
+	}
 
 	if (repair_flush(c))
 		die("Failed to roll back TCP_REPAIR mode");
@@ -941,7 +943,7 @@ static int flow_migrate_repair_all(struct ctx *c, bool enable)
 	union flow *flow;
 	int rc;
 
-	foreach_established_tcp_flow(flow, FLOW_MAX) {
+	foreach_established_tcp_flow(flow) {
 		if (enable)
 			rc = tcp_flow_repair_on(c, &flow->tcp);
 		else
@@ -1005,7 +1007,7 @@ int flow_migrate_source(struct ctx *c, const struct migrate_stage *stage,
 	(void)c;
 	(void)stage;
 
-	foreach_established_tcp_flow(flow, FLOW_MAX)
+	foreach_established_tcp_flow(flow)
 		count++;
 
 	count = htonl(count);
@@ -1024,7 +1026,7 @@ int flow_migrate_source(struct ctx *c, const struct migrate_stage *stage,
 	 * stream might now be inconsistent, and we might have closed listening
 	 * TCP sockets, so just terminate.
 	 */
-	foreach_established_tcp_flow(flow, FLOW_MAX) {
+	foreach_established_tcp_flow(flow) {
 		rc = tcp_flow_migrate_source(fd, &flow->tcp);
 		if (rc) {
 			err("Can't send data, flow %u: %s", FLOW_IDX(flow),
@@ -1051,7 +1053,7 @@ int flow_migrate_source(struct ctx *c, const struct migrate_stage *stage,
 	 * failures but not if the stream might be inconsistent (reported here
 	 * as EIO).
 	 */
-	foreach_established_tcp_flow(flow, FLOW_MAX) {
+	foreach_established_tcp_flow(flow) {
 		rc = tcp_flow_migrate_source_ext(fd, &flow->tcp);
 		if (rc) {
 			err("Extended data for flow %u: %s", FLOW_IDX(flow),

From 65e317a8fca4eaf9efbfe642cc7e4322c56aa1f7 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Wed, 19 Feb 2025 13:28:36 +1100
Subject: [PATCH 012/144] flow: Clean up and generalise flow traversal macros

The migration code introduced a number of 'foreach' macros to traverse the
flow table.  These aren't inherently tied to migration, so polish up their
naming, move them to flow_table.h and also use in flow_defer_handler()
which is the other place we need to traverse the whole table.

For now we keep foreach_established_tcp_flow() as is.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 flow.c       | 36 ++++++++----------------------------
 flow_table.h | 36 ++++++++++++++++++++++++++++++++++++
 2 files changed, 44 insertions(+), 28 deletions(-)

diff --git a/flow.c b/flow.c
index 602fea7..bb5dcc3 100644
--- a/flow.c
+++ b/flow.c
@@ -53,28 +53,8 @@ const uint8_t flow_proto[] = {
 static_assert(ARRAY_SIZE(flow_proto) == FLOW_NUM_TYPES,
 	      "flow_proto[] doesn't match enum flow_type");
 
-#define foreach_flow(flow)						\
-	for ((flow) = flowtab; FLOW_IDX(flow) < FLOW_MAX; (flow)++)	\
-		if ((flow)->f.state == FLOW_STATE_FREE)			\
-			(flow) += (flow)->free.n - 1;			\
-		else
-
-#define foreach_active_flow(flow)					\
-	foreach_flow((flow))						\
-		if ((flow)->f.state != FLOW_STATE_ACTIVE)		\
-			/* NOLINTNEXTLINE(bugprone-branch-clone) */	\
-			continue;					\
-		else
-
-#define foreach_tcp_flow(flow)						\
-	foreach_active_flow((flow))					\
-		if ((flow)->f.type != FLOW_TCP)				\
-			/* NOLINTNEXTLINE(bugprone-branch-clone) */	\
-			continue;					\
-		else
-
 #define foreach_established_tcp_flow(flow)				\
-	foreach_tcp_flow((flow))					\
+	flow_foreach_of_type((flow), FLOW_TCP)				\
 		if (!tcp_flow_is_established(&(flow)->tcp))		\
 			/* NOLINTNEXTLINE(bugprone-branch-clone) */	\
 			continue;					\
@@ -801,7 +781,7 @@ void flow_defer_handler(const struct ctx *c, const struct timespec *now)
 	struct flow_free_cluster *free_head = NULL;
 	unsigned *last_next = &flow_first_free;
 	bool timer = false;
-	unsigned idx;
+	union flow *flow;
 
 	if (timespec_diff_ms(now, &flow_timer_run) >= FLOW_TIMER_INTERVAL) {
 		timer = true;
@@ -810,8 +790,7 @@ void flow_defer_handler(const struct ctx *c, const struct timespec *now)
 
 	ASSERT(!flow_new_entry); /* Incomplete flow at end of cycle */
 
-	for (idx = 0; idx < FLOW_MAX; idx++) {
-		union flow *flow = &flowtab[idx];
+	flow_foreach_slot(flow) {
 		bool closed = false;
 
 		switch (flow->f.state) {
@@ -828,12 +807,12 @@ void flow_defer_handler(const struct ctx *c, const struct timespec *now)
 			} else {
 				/* New free cluster, add to chain */
 				free_head = &flow->free;
-				*last_next = idx;
+				*last_next = FLOW_IDX(flow);
 				last_next = &free_head->next;
 			}
 
 			/* Skip remaining empty entries */
-			idx += skip - 1;
+			flow += skip - 1;
 			continue;
 		}
 
@@ -886,14 +865,15 @@ void flow_defer_handler(const struct ctx *c, const struct timespec *now)
 
 			if (free_head) {
 				/* Add slot to current free cluster */
-				ASSERT(idx == FLOW_IDX(free_head) + free_head->n);
+				ASSERT(FLOW_IDX(flow) ==
+				       FLOW_IDX(free_head) + free_head->n);
 				free_head->n++;
 				flow->free.n = flow->free.next = 0;
 			} else {
 				/* Create new free cluster */
 				free_head = &flow->free;
 				free_head->n = 1;
-				*last_next = idx;
+				*last_next = FLOW_IDX(flow);
 				last_next = &free_head->next;
 			}
 		} else {
diff --git a/flow_table.h b/flow_table.h
index 9a2ff24..fd2c57b 100644
--- a/flow_table.h
+++ b/flow_table.h
@@ -50,6 +50,42 @@ extern union flow flowtab[];
 #define flow_foreach_sidei(sidei_) \
 	for ((sidei_) = INISIDE; (sidei_) < SIDES; (sidei_)++)
 
+
+/**
+ * flow_foreach_slot() - Step through each flow table entry
+ * @flow:	Takes values of pointer to each flow table entry
+ *
+ * Includes FREE slots.
+ */
+#define flow_foreach_slot(flow)						\
+	for ((flow) = flowtab; FLOW_IDX(flow) < FLOW_MAX; (flow)++)
+
+/**
+ * flow_foreach() - Step through each active flow
+ * @flow:	Takes values of pointer to each active flow
+ */
+#define flow_foreach(flow)						\
+	flow_foreach_slot((flow))					\
+		if ((flow)->f.state == FLOW_STATE_FREE)			\
+			(flow) += (flow)->free.n - 1;			\
+		else if ((flow)->f.state != FLOW_STATE_ACTIVE) {	\
+			flow_err((flow), "Bad flow state during traversal"); \
+			continue;					\
+		} else
+
+/**
+ * flow_foreach_of_type() - Step through each active flow of given type
+ * @flow:	Takes values of pointer to each flow
+ * @type_:	Type of flow to traverse
+ */
+#define flow_foreach_of_type(flow, type_)				\
+	flow_foreach((flow))						\
+	if ((flow)->f.type != (type_))					\
+			/* NOLINTNEXTLINE(bugprone-branch-clone) */	\
+			continue;					\
+		else
+
+
 /** flow_idx() - Index of flow from common structure
  * @f:	Common flow fields pointer
  *

From 3dc7da68a2731f661d7251a5fc759daffe24ca70 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Wed, 19 Feb 2025 14:14:27 +1100
Subject: [PATCH 013/144] conf: More thorough error checking when parsing --mtu
 option

We're a bit sloppy with parsing MTU which can lead to some surprising,
though fairly harmless, results:
  * Passing a non-number like '-m xyz' will not give an error and act like
    -m 0
  * Junk after a number (e.g. '-m 1500pqr') will be ignored rather than
    giving an error
  * We parse the MTU as a long, then immediately assign to an int, so on
    some platforms certain ludicrously out of bounds values will be
    silently truncated, rather than giving an error

Be a bit more thorough with the error checking to avoid that.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 conf.c | 23 ++++++++++++++++-------
 1 file changed, 16 insertions(+), 7 deletions(-)

diff --git a/conf.c b/conf.c
index 18017f5..335f37c 100644
--- a/conf.c
+++ b/conf.c
@@ -1652,20 +1652,29 @@ void conf(struct ctx *c, int argc, char **argv)
 				die("Invalid PID file: %s", optarg);
 
 			break;
-		case 'm':
-			errno = 0;
-			c->mtu = strtol(optarg, NULL, 0);
+		case 'm': {
+			unsigned long mtu;
+			char *e;
 
-			if (!c->mtu) {
+			errno = 0;
+			mtu = strtoul(optarg, &e, 0);
+
+			if (errno || *e)
+				die("Invalid MTU: %s", optarg);
+
+			if (!mtu) {
 				c->mtu = -1;
 				break;
 			}
 
-			if (c->mtu < ETH_MIN_MTU || c->mtu > (int)ETH_MAX_MTU ||
-			    errno)
-				die("Invalid MTU: %s", optarg);
+			if (mtu < ETH_MIN_MTU || mtu > ETH_MAX_MTU) {
+				die("MTU %lu out of range (%u..%u)", mtu,
+				    ETH_MIN_MTU, ETH_MAX_MTU);
+			}
 
+			c->mtu = mtu;
 			break;
+		}
 		case 'a':
 			if (inet_pton(AF_INET6, optarg, &c->ip6.addr)	&&
 			    !IN6_IS_ADDR_UNSPECIFIED(&c->ip6.addr)	&&

From 1cc5d4c9fe0a84d3d39fc07358996989ca1b5875 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Wed, 19 Feb 2025 14:14:28 +1100
Subject: [PATCH 014/144] conf: Use 0 instead of -1 as "unassigned" mtu value

On the command line -m 0 means "don't assign an MTU" (letting the guest use
its default).  However, internally we use (c->mtu == -1) to represent that
state.  We use (c->mtu == 0) to represent "the user didn't specify on the
command line, so use the default" - but this is only used during conf(),
never afterwards.

This is unnecessarily confusing.  We can instead just initialise c->mtu to
its default (65520) before parsing options and use 0 on both the command
line and internally to represent the "don't assign" special case.  This
ensures that c->mtu is always 0..65535, so we can store it in a uint16_t
which is more natural.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 conf.c  | 11 ++---------
 dhcp.c  |  2 +-
 ndp.c   |  2 +-
 passt.h |  3 ++-
 pasta.c |  2 +-
 tcp.c   |  2 +-
 6 files changed, 8 insertions(+), 14 deletions(-)

diff --git a/conf.c b/conf.c
index 335f37c..c5ee07b 100644
--- a/conf.c
+++ b/conf.c
@@ -1413,6 +1413,7 @@ void conf(struct ctx *c, int argc, char **argv)
 		optstring = "+dqfel:hs:F:p:P:m:a:n:M:g:i:o:D:S:H:461t:u:";
 	}
 
+	c->mtu = ROUND_DOWN(ETH_MAX_MTU - ETH_HLEN, sizeof(uint32_t));
 	c->tcp.fwd_in.mode = c->tcp.fwd_out.mode = FWD_UNSET;
 	c->udp.fwd_in.mode = c->udp.fwd_out.mode = FWD_UNSET;
 	memcpy(c->our_tap_mac, MAC_OUR_LAA, ETH_ALEN);
@@ -1662,12 +1663,7 @@ void conf(struct ctx *c, int argc, char **argv)
 			if (errno || *e)
 				die("Invalid MTU: %s", optarg);
 
-			if (!mtu) {
-				c->mtu = -1;
-				break;
-			}
-
-			if (mtu < ETH_MIN_MTU || mtu > ETH_MAX_MTU) {
+			if (mtu && (mtu < ETH_MIN_MTU || mtu > ETH_MAX_MTU)) {
 				die("MTU %lu out of range (%u..%u)", mtu,
 				    ETH_MIN_MTU, ETH_MAX_MTU);
 			}
@@ -1980,9 +1976,6 @@ void conf(struct ctx *c, int argc, char **argv)
 		c->no_dhcpv6 = 1;
 	}
 
-	if (!c->mtu)
-		c->mtu = ROUND_DOWN(ETH_MAX_MTU - ETH_HLEN, sizeof(uint32_t));
-
 	get_dns(c);
 
 	if (!*c->pasta_ifn) {
diff --git a/dhcp.c b/dhcp.c
index 4a209f1..66a716e 100644
--- a/dhcp.c
+++ b/dhcp.c
@@ -417,7 +417,7 @@ int dhcp(const struct ctx *c, const struct pool *p)
 		       &c->ip4.guest_gw, sizeof(c->ip4.guest_gw));
 	}
 
-	if (c->mtu != -1) {
+	if (c->mtu) {
 		opts[26].slen = 2;
 		opts[26].s[0] = c->mtu / 256;
 		opts[26].s[1] = c->mtu % 256;
diff --git a/ndp.c b/ndp.c
index 37bf7a3..ded2081 100644
--- a/ndp.c
+++ b/ndp.c
@@ -256,7 +256,7 @@ static void ndp_ra(const struct ctx *c, const struct in6_addr *dst)
 
 	ptr = &ra.var[0];
 
-	if (c->mtu != -1) {
+	if (c->mtu) {
 		struct opt_mtu *mtu = (struct opt_mtu *)ptr;
 		*mtu = (struct opt_mtu) {
 			.header = {
diff --git a/passt.h b/passt.h
index 1f0dab5..28d1389 100644
--- a/passt.h
+++ b/passt.h
@@ -274,6 +274,8 @@ struct ctx {
 	int fd_repair;
 	unsigned char our_tap_mac[ETH_ALEN];
 	unsigned char guest_mac[ETH_ALEN];
+	uint16_t mtu;
+
 	uint64_t hash_secret[2];
 
 	int ifi4;
@@ -298,7 +300,6 @@ struct ctx {
 	int no_icmp;
 	struct icmp_ctx icmp;
 
-	int mtu;
 	int no_dns;
 	int no_dns_search;
 	int no_dhcp_dns;
diff --git a/pasta.c b/pasta.c
index 585a51c..fa3e7de 100644
--- a/pasta.c
+++ b/pasta.c
@@ -319,7 +319,7 @@ void pasta_ns_conf(struct ctx *c)
 	if (c->pasta_conf_ns) {
 		unsigned int flags = IFF_UP;
 
-		if (c->mtu != -1)
+		if (c->mtu)
 			nl_link_set_mtu(nl_sock_ns, c->pasta_ifi, c->mtu);
 
 		if (c->ifi6) /* Avoid duplicate address detection on link up */
diff --git a/tcp.c b/tcp.c
index f498f5b..e3c0a53 100644
--- a/tcp.c
+++ b/tcp.c
@@ -1139,7 +1139,7 @@ int tcp_prepare_flags(const struct ctx *c, struct tcp_tap_conn *conn,
 	if (flags & SYN) {
 		int mss;
 
-		if (c->mtu == -1) {
+		if (!c->mtu) {
 			mss = tinfo.tcpi_snd_mss;
 		} else {
 			mss = c->mtu - sizeof(struct tcphdr);

From 183bedf478e34079244fe4cfbb2c1a0f02a5a037 Mon Sep 17 00:00:00 2001
From: Stefano Brivio <sbrivio@redhat.com>
Date: Tue, 18 Feb 2025 09:34:26 +0100
Subject: [PATCH 015/144] Makefile: Use mmap2() as alternative for mmap() in
 valgrind extra syscalls

...instead of unconditionally trying to enable both: mmap2() is the
32-bit ARM variant for mmap() (and perhaps for other architectures),
but if mmap() is available, valgrind will use that one.

This avoids seccomp.sh warning us about missing mmap2() if mmap() is
present, and is consistent with what we do in vhost-user code.

Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 Makefile | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/Makefile b/Makefile
index d4e1096..f2ac8e5 100644
--- a/Makefile
+++ b/Makefile
@@ -109,9 +109,9 @@ passt-repair: $(PASST_REPAIR_SRCS) seccomp_repair.h
 	$(CC) $(FLAGS) $(CFLAGS) $(CPPFLAGS) $(PASST_REPAIR_SRCS) -o passt-repair $(LDFLAGS)
 
 valgrind: EXTRA_SYSCALLS += rt_sigprocmask rt_sigtimedwait rt_sigaction	\
-			    rt_sigreturn getpid gettid kill clock_gettime mmap \
-			    mmap2 munmap open unlink gettimeofday futex statx \
-			    readlink
+			    rt_sigreturn getpid gettid kill clock_gettime \
+			    mmap|mmap2 munmap open unlink gettimeofday futex \
+			    statx readlink
 valgrind: FLAGS += -g -DVALGRIND
 valgrind: all
 

From 16553c82806e0a55508baf553cb79e902638c10f Mon Sep 17 00:00:00 2001
From: Stefano Brivio <sbrivio@redhat.com>
Date: Tue, 18 Feb 2025 09:42:28 +0100
Subject: [PATCH 016/144] dhcp: Add option code byte in calculation for OPT_MAX
 boundary check

Otherwise we'll limit messages to 577 bytes, instead of 576 bytes as
intended:

  $ fqdn="thirtytwocharactersforeachlabel.thirtytwocharactersforeachlabel.thirtytwocharactersforeachlabel.thirtytwocharactersforeachlabel.thirtytwocharactersforeachlabel.thirtytwocharactersforeachlabel.thirtytwocharactersforeachlabel.then_make_it_251_with_this"
  $ hostname="__eighteen_bytes__"
  $ ./pasta --fqdn ${fqdn} -H ${hostname} -p dhcp.pcap -- /sbin/dhclient -4
  Saving packet capture to dhcp.pcap
  $ tshark -r dhcp.pcap -V -Y 'dhcp.option.value == 5' | grep "Total Length"
      Total Length: 577

This was hidden by the issue fixed by commit bcc4908c2b4a ("dhcp:
Remove option 255 length byte") until now.

Fixes: 31e8109a86ee ("dhcp, dhcpv6: Add hostname and client fqdn ops")
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
Reviewed-by: Enrique Llorente <ellorent@redhat.com>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 dhcp.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dhcp.c b/dhcp.c
index 66a716e..b0de04b 100644
--- a/dhcp.c
+++ b/dhcp.c
@@ -143,7 +143,7 @@ static bool fill_one(struct msg *m, int o, int *offset)
 	size_t slen = opts[o].slen;
 
 	/* If we don't have space to write the option, then just skip */
-	if (*offset + 1 /* length of option */ + slen > OPT_MAX)
+	if (*offset + 2 /* code and length of option */ + slen > OPT_MAX)
 		return true;
 
 	m->o[*offset] = o;

From 4dac2351fae5534c01e144273f849ce9ece0dca7 Mon Sep 17 00:00:00 2001
From: Stefano Brivio <sbrivio@redhat.com>
Date: Tue, 18 Feb 2025 09:49:40 +0100
Subject: [PATCH 017/144] contrib/fedora: Actually install passt-repair SELinux
 policy file

Otherwise we build it, but we don't install it. Not an issue that
warrants a release right away as it's anyway usable.

Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 contrib/fedora/passt.spec | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/contrib/fedora/passt.spec b/contrib/fedora/passt.spec
index 6a83f8b..745cf01 100644
--- a/contrib/fedora/passt.spec
+++ b/contrib/fedora/passt.spec
@@ -44,7 +44,7 @@ Requires(preun): %{name}
 Requires(preun): policycoreutils
 
 %description selinux
-This package adds SELinux enforcement to passt(1) and pasta(1).
+This package adds SELinux enforcement to passt(1), pasta(1), passt-repair(1).
 
 %prep
 %setup -q -n passt-%{git_hash}
@@ -82,6 +82,7 @@ make -f %{_datadir}/selinux/devel/Makefile
 install -p -m 644 -D passt.pp %{buildroot}%{_datadir}/selinux/packages/%{selinuxtype}/passt.pp
 install -p -m 644 -D passt.if %{buildroot}%{_datadir}/selinux/devel/include/distributed/passt.if
 install -p -m 644 -D pasta.pp %{buildroot}%{_datadir}/selinux/packages/%{selinuxtype}/pasta.pp
+install -p -m 644 -D passt-repair.pp %{buildroot}%{_datadir}/selinux/packages/%{selinuxtype}/passt-repair.pp
 popd
 
 %pre selinux
@@ -90,11 +91,13 @@ popd
 %post selinux
 %selinux_modules_install -s %{selinuxtype} %{_datadir}/selinux/packages/%{selinuxtype}/passt.pp
 %selinux_modules_install -s %{selinuxtype} %{_datadir}/selinux/packages/%{selinuxtype}/pasta.pp
+%selinux_modules_install -s %{selinuxtype} %{_datadir}/selinux/packages/%{selinuxtype}/passt-repair.pp
 
 %postun selinux
 if [ $1 -eq 0 ]; then
 	%selinux_modules_uninstall -s %{selinuxtype} passt
 	%selinux_modules_uninstall -s %{selinuxtype} pasta
+	%selinux_modules_uninstall -s %{selinuxtype} passt-repair
 fi
 
 %posttrans selinux
@@ -124,6 +127,7 @@ fi
 %{_datadir}/selinux/packages/%{selinuxtype}/passt.pp
 %{_datadir}/selinux/devel/include/distributed/passt.if
 %{_datadir}/selinux/packages/%{selinuxtype}/pasta.pp
+%{_datadir}/selinux/packages/%{selinuxtype}/passt-repair.pp
 
 %changelog
 {{{ passt_git_changelog }}}

From ea69ca6a20ac7408a913fd5de383a5383d679678 Mon Sep 17 00:00:00 2001
From: Jon Maloy <jmaloy@redhat.com>
Date: Wed, 19 Feb 2025 10:20:41 -0500
Subject: [PATCH 018/144] tap: always set the no_frag flag in IPv4 headers

When studying the Linux source code and Wireshark dumps it seems like
the no_frag flag in the IPv4 header is always set. Discussions on the
Internet on this subject indicate that modern routers never fragment
packets, and that fragmentation isn't even supported in many cases.

Given, in addition, that incoming messages forwarded on the tap interface
never even pass through a router, it seems safe to always set this flag.

This makes the IPv4 headers of forwarded messages identical to those
sent by the external sockets, something we must consider desirable.

Signed-off-by: Jon Maloy <jmaloy@redhat.com>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 ip.h  | 3 ++-
 tap.c | 2 +-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/ip.h b/ip.h
index 1544dbf..858cc89 100644
--- a/ip.h
+++ b/ip.h
@@ -36,13 +36,14 @@
 		.tos		= 0,					\
 		.tot_len	= 0,					\
 		.id		= 0,					\
-		.frag_off	= 0,					\
+		.frag_off	= htons(IP_DF), 			\
 		.ttl		= 0xff,					\
 		.protocol	= (proto),				\
 		.saddr		= 0,					\
 		.daddr		= 0,					\
 	}
 #define L2_BUF_IP4_PSUM(proto)	((uint32_t)htons_constant(0x4500) +	\
+				 (uint32_t)htons_constant(IP_DF) +	\
 				 (uint32_t)htons(0xff00 | (proto)))
 
 
diff --git a/tap.c b/tap.c
index d0673e5..44b0fc0 100644
--- a/tap.c
+++ b/tap.c
@@ -153,7 +153,7 @@ static void *tap_push_ip4h(struct iphdr *ip4h, struct in_addr src,
 	ip4h->tos = 0;
 	ip4h->tot_len = htons(l3len);
 	ip4h->id = 0;
-	ip4h->frag_off = 0;
+	ip4h->frag_off = htons(IP_DF);
 	ip4h->ttl = 255;
 	ip4h->protocol = proto;
 	ip4h->saddr = src.s_addr;

From be86232f72dcfbd51a889206e80d587fbcaa1c5b Mon Sep 17 00:00:00 2001
From: Michal Privoznik <mprivozn@redhat.com>
Date: Fri, 21 Feb 2025 12:53:13 +0100
Subject: [PATCH 019/144] seccomp.sh: Silence stty errors

When printing the list of allowed syscalls, the width of the terminal is
obtained for nicer output (see commit below). The width is
obtained by running 'stty'. While this works when building from a
console, it doesn't work during rpmbuild/emerge/.. as stdout is
usually not a console but a logfile and stdin is usually
/dev/null or something. This results in stty reporting errors
like this:

  stty: 'standard input': Inappropriate ioctl for device

Redirect stty's stderr to /dev/null to silence it.

Fixes: 712ca3235329 ("seccomp.sh: Try to account for terminal width while formatting list of system calls")
Signed-off-by: Michal Privoznik <mprivozn@redhat.com>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 seccomp.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/seccomp.sh b/seccomp.sh
index 4c521ae..a7bc417 100755
--- a/seccomp.sh
+++ b/seccomp.sh
@@ -255,7 +255,7 @@ for __p in ${__profiles}; do
 	__calls="${__calls} ${EXTRA_SYSCALLS:-}"
 	__calls="$(filter ${__calls})"
 
-	cols="$(stty -a | sed -n 's/.*columns \([0-9]*\).*/\1/p' || :)" 2>/dev/null
+	cols="$(stty -a 2>/dev/null | sed -n 's/.*columns \([0-9]*\).*/\1/p' || :)" 2>/dev/null
 	case $cols in [0-9]*) col_args="-w ${cols}";; *) col_args="";; esac
 	echo "seccomp profile ${__p} allows: ${__calls}" | tr '\n' ' ' | fmt -t ${col_args}
 

From 87471731e6bb0b5df3a50277527caf3381b45ee4 Mon Sep 17 00:00:00 2001
From: Stefano Brivio <sbrivio@redhat.com>
Date: Fri, 28 Feb 2025 01:14:01 +0100
Subject: [PATCH 020/144] selinux: Fixes/workarounds for passt and
 passt-repair, mostly for libvirt usage

Here are a bunch of workarounds and a couple of fixes for libvirt
usage which are rather hard to split into single logical patches
as there appear to be some obscure dependencies between some of them:

- passt-repair needs to have an exec_type typeattribute (otherwise
  the policy for lsmd(1) causes a violation on getattr on its
  executable file), and that typeattribute just happened to be there
  for passt as a result of init_daemon_domain(), but passt-repair
  isn't a daemon, so we need an explicit corecmd_executable_file()

- passt-repair needs a workaround, which I'll revisit once
  https://github.com/fedora-selinux/selinux-policy/issues/2579 is
  solved, for usage with libvirt: allow it to use qemu_var_run_t
  and virt_var_run_t sockets

- add 'bpf' and 'dac_read_search' capabilities for passt-repair:
  they are needed (for whatever reason I didn't investigate) to
  actually receive socket files via SCM_RIGHTS

- passt needs further workarounds in the sense of
  https://github.com/fedora-selinux/selinux-policy/issues/2579:
  allow it to use map and use svirt_tmpfs_t (not just svirt_image_t):
  it depends on where the libvirt guest image is

- ...it also needs to map /dev/null if <access mode='shared'/> is
  enabled in libvirt's XML for the memoryBacking object, for
  vhost-user operation

- and 'ioctl' on the TCP socket appears to be actually needed, on top
  of 'getattr', to dump some socket parameters

Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 contrib/selinux/passt-repair.te | 33 +++++++++++++++++++++++++++++++--
 contrib/selinux/passt.te        |  9 +++++++--
 2 files changed, 38 insertions(+), 4 deletions(-)

diff --git a/contrib/selinux/passt-repair.te b/contrib/selinux/passt-repair.te
index e3ffbcd..f171be6 100644
--- a/contrib/selinux/passt-repair.te
+++ b/contrib/selinux/passt-repair.te
@@ -28,12 +28,22 @@ require {
 	type console_device_t;
 	type user_devpts_t;
 	type user_tmp_t;
+
+	# Workaround: passt-repair needs to needs to access socket files
+	# that passt, started by libvirt, might create under different
+	# labels, depending on whether passt is started as root or not.
+	#
+	# However, libvirt doesn't maintain its own policy, which makes
+	# updates particularly complicated. To avoid breakage in the short
+	# term, deal with that in passt's own policy.
+	type qemu_var_run_t;
+	type virt_var_run_t;
 }
 
 type passt_repair_t;
 domain_type(passt_repair_t);
 type passt_repair_exec_t;
-files_type(passt_repair_exec_t);
+corecmd_executable_file(passt_repair_exec_t);
 
 role unconfined_r types passt_repair_t;
 
@@ -41,7 +51,8 @@ allow passt_repair_t passt_repair_exec_t:file { read execute execute_no_trans en
 type_transition unconfined_t passt_repair_exec_t:process passt_repair_t;
 allow unconfined_t passt_repair_t:process transition;
 
-allow passt_repair_t self:capability { dac_override net_admin net_raw };
+allow passt_repair_t self:capability { dac_override dac_read_search net_admin net_raw };
+allow passt_repair_t self:capability2 bpf;
 
 allow passt_repair_t console_device_t:chr_file { append open getattr read write ioctl };
 allow passt_repair_t user_devpts_t:chr_file { append open getattr read write ioctl };
@@ -50,9 +61,27 @@ allow passt_repair_t unconfined_t:unix_stream_socket { connectto read write };
 allow passt_repair_t passt_t:unix_stream_socket { connectto read write };
 allow passt_repair_t user_tmp_t:unix_stream_socket { connectto read write };
 
+allow passt_repair_t user_tmp_t:dir search;
+
 allow passt_repair_t unconfined_t:sock_file { read write };
 allow passt_repair_t passt_t:sock_file { read write };
 allow passt_repair_t user_tmp_t:sock_file { read write };
 
 allow passt_repair_t unconfined_t:tcp_socket { read setopt write };
 allow passt_repair_t passt_t:tcp_socket { read setopt write };
+
+# Workaround: passt-repair needs to access socket files
+# that passt, started by libvirt, might create under different
+# labels, depending on whether passt is started as root or not.
+#
+# However, libvirt doesn't maintain its own policy, which makes
+# updates particularly complicated. To avoid breakage in the short
+# term, deal with that in passt's own policy.
+allow passt_repair_t qemu_var_run_t:unix_stream_socket { connectto read write };
+allow passt_repair_t virt_var_run_t:unix_stream_socket { connectto read write };
+
+allow passt_repair_t qemu_var_run_t:dir search;
+allow passt_repair_t virt_var_run_t:dir search;
+
+allow passt_repair_t qemu_var_run_t:sock_file { read write };
+allow passt_repair_t virt_var_run_t:sock_file { read write };
diff --git a/contrib/selinux/passt.te b/contrib/selinux/passt.te
index f595079..f8ea672 100644
--- a/contrib/selinux/passt.te
+++ b/contrib/selinux/passt.te
@@ -29,6 +29,9 @@ require {
 	# particularly complicated. To avoid breakage in the short term,
 	# deal with it in passt's own policy.
 	type svirt_image_t;
+	type svirt_tmpfs_t;
+	type svirt_t;
+	type null_device_t;
 
 	class file { ioctl getattr setattr create read write unlink open relabelto execute execute_no_trans map };
 	class dir { search write add_name remove_name mounton };
@@ -45,7 +48,7 @@ require {
 	type net_conf_t;
 	type proc_net_t;
 	type node_t;
-	class tcp_socket { create accept listen name_bind name_connect getattr };
+	class tcp_socket { create accept listen name_bind name_connect getattr ioctl };
 	class udp_socket { create accept listen };
 	class icmp_socket { bind create name_bind node_bind setopt read write };
 	class sock_file { create unlink write };
@@ -129,7 +132,7 @@ corenet_udp_sendrecv_all_ports(passt_t)
 allow passt_t node_t:icmp_socket { name_bind node_bind };
 allow passt_t port_t:icmp_socket name_bind;
 
-allow passt_t self:tcp_socket { create getopt setopt connect bind listen accept shutdown read write getattr };
+allow passt_t self:tcp_socket { create getopt setopt connect bind listen accept shutdown read write getattr ioctl };
 allow passt_t self:udp_socket { create getopt setopt connect bind read write };
 allow passt_t self:icmp_socket { bind create setopt read write };
 
@@ -143,3 +146,5 @@ allow passt_t unconfined_t:unix_stream_socket { read write };
 # particularly complicated. To avoid breakage in the short term,
 # deal with it in passt's own policy.
 allow passt_t svirt_image_t:file { read write map };
+allow passt_t svirt_tmpfs_t:file { read write map };
+allow passt_t null_device_t:chr_file map;

From 7b92f2e8525a94fb6f80d5e0bedba7eacc378714 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Thu, 27 Feb 2025 16:55:13 +1100
Subject: [PATCH 021/144] migrate, flow: Trivially succeed if migrating with no
 flows

We could get a migration request when we have no active flows; or at least
none that we need or are able to migrate.  In this case after sending or
receiving the number of flows we continue to step through various lists.

In the target case, this could include communication with passt-repair.  If
passt-repair wasn't started that could cause further errors, but of course
they shouldn't matter if we have nothing to repair.

Make it more obvious that there's nothing to do and avoid such errors by
short-circuiting flow_migrate_{source,target}() if there are no migratable
flows.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 flow.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/flow.c b/flow.c
index bb5dcc3..6cf96c2 100644
--- a/flow.c
+++ b/flow.c
@@ -999,6 +999,9 @@ int flow_migrate_source(struct ctx *c, const struct migrate_stage *stage,
 
 	debug("Sending %u flows", ntohl(count));
 
+	if (!count)
+		return 0;
+
 	/* Dump and send information that can be stored in the flow table.
 	 *
 	 * Limited rollback options here: if we fail to transfer any data (that
@@ -1070,6 +1073,9 @@ int flow_migrate_target(struct ctx *c, const struct migrate_stage *stage,
 	count = ntohl(count);
 	debug("Receiving %u flows", count);
 
+	if (!count)
+		return 0;
+
 	if ((rc = flow_migrate_repair_all(c, true)))
 		return -rc;
 

From 39f85bce1a3b9da3bd11458c521e589f674e587a Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Thu, 27 Feb 2025 16:55:14 +1100
Subject: [PATCH 022/144] migrate, flow: Don't attempt to migrate TCP flows
 without passt-repair

Migrating TCP flows requires passt-repair in order to use TCP_REPAIR.  If
passt-repair is not started, our failure mode is pretty ugly though: we'll
attempt the migration, hitting various problems when we can't enter repair
mode.  In some cases we may not roll back these changes properly, meaning
we break network connections on the source.

Our general approach is not to completely block migration if there are
problems, but simply to break any flows we can't migrate.  So, if we have
no connection from passt-repair carry on with the migration, but don't
attempt to migrate any TCP connections.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 flow.c | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/flow.c b/flow.c
index 6cf96c2..749c498 100644
--- a/flow.c
+++ b/flow.c
@@ -923,6 +923,10 @@ static int flow_migrate_repair_all(struct ctx *c, bool enable)
 	union flow *flow;
 	int rc;
 
+	/* If we don't have a repair helper, there's nothing we can do */
+	if (c->fd_repair < 0)
+		return 0;
+
 	foreach_established_tcp_flow(flow) {
 		if (enable)
 			rc = tcp_flow_repair_on(c, &flow->tcp);
@@ -987,8 +991,11 @@ int flow_migrate_source(struct ctx *c, const struct migrate_stage *stage,
 	(void)c;
 	(void)stage;
 
-	foreach_established_tcp_flow(flow)
-		count++;
+	/* If we don't have a repair helper, we can't migrate TCP flows */
+	if (c->fd_repair >= 0) {
+		foreach_established_tcp_flow(flow)
+			count++;
+	}
 
 	count = htonl(count);
 	if (write_all_buf(fd, &count, sizeof(count))) {

From 56ce03ed0acf2a41c67d44e353c00a018604ccb7 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Thu, 27 Feb 2025 16:55:15 +1100
Subject: [PATCH 023/144] tcp: Correct error code handling from
 tcp_flow_repair_socket()

There are two small bugs in error returns from tcp_flow_repair_socket(),
which is supposed to return a negative errno code:

1) On bind() failures, we directly pass on the return code from bind(),
   which is just 0 or -1, instead of an error code.

2) In the caller, tcp_flow_migrate_target() we call strerror_() directly
   on the negative error code, but strerror() requires a positive error
   code.

Correct both of these.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 tcp.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/tcp.c b/tcp.c
index e3c0a53..8528ee3 100644
--- a/tcp.c
+++ b/tcp.c
@@ -3280,7 +3280,8 @@ int tcp_flow_repair_socket(struct ctx *c, struct tcp_tap_conn *conn)
 
 	tcp_sock_set_nodelay(s);
 
-	if ((rc = bind(s, &a.sa, sizeof(a)))) {
+	if (bind(s, &a.sa, sizeof(a))) {
+		rc = -errno;
 		err_perror("Failed to bind socket for migrated flow");
 		goto err;
 	}
@@ -3375,7 +3376,7 @@ int tcp_flow_migrate_target(struct ctx *c, int fd)
 	conn->seq_init_from_tap		= ntohl(t.seq_init_from_tap);
 
 	if ((rc = tcp_flow_repair_socket(c, conn))) {
-		flow_err(flow, "Can't set up socket: %s, drop", strerror_(rc));
+		flow_err(flow, "Can't set up socket: %s, drop", strerror_(-rc));
 		flow_alloc_cancel(flow);
 		return 0;
 	}

From b2708218a6eec82fad98da52d7569d13cf35e05c Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Thu, 27 Feb 2025 16:55:16 +1100
Subject: [PATCH 024/144] tcp: Unconditionally move to CLOSED state on
 tcp_rst()

tcp_rst() attempts to send an RST packet to the guest, and if that succeeds
moves the flow to CLOSED state.  However, even if the tcp_send_flag() fails
the flow is still dead: we've usually closed the socket already, and
something has already gone irretrievably wrong.  So we should still mark
the flow as CLOSED.  That will cause it to be cleaned up, meaning any
future packets from the guest for it won't match a flow, so should generate
new RSTs (they don't at the moment, but that's a separate bug).

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 tcp.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tcp.c b/tcp.c
index 8528ee3..d23b6d9 100644
--- a/tcp.c
+++ b/tcp.c
@@ -1214,8 +1214,8 @@ void tcp_rst_do(const struct ctx *c, struct tcp_tap_conn *conn)
 	if (conn->events == CLOSED)
 		return;
 
-	if (!tcp_send_flag(c, conn, RST))
-		conn_event(c, conn, CLOSED);
+	tcp_send_flag(c, conn, RST);
+	conn_event(c, conn, CLOSED);
 }
 
 /**

From 52419a64f2dfa31707b31148e6a311bb57be6e5f Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Thu, 27 Feb 2025 16:55:17 +1100
Subject: [PATCH 025/144] migrate, tcp: Don't flow_alloc_cancel() during
 incoming migration

In tcp_flow_migrate_target(), if we're unable to create and bind the new
socket, we print an error, cancel the flow and carry on.  This seems to
make sense based on our policy of generally letting the migration complete
even if some or all flows are lost in the process.  But it doesn't quite
work: the flow_alloc_cancel() means that the flows in the target's flow
table are no longer a one-to-one match for the flows which the source is
sending data for.  This means that data for later flows will be mismatched
to a different flow.  Most likely that will cause some nasty error later,
but even worse it might appear to succeed but lead to data corruption due
to incorrectly restoring one of the flows.

Instead, we should leave the flow in the table until we've read all the
data for it, *then* discard it.  Technically removing the
flow_alloc_cancel() would be enough for this: if tcp_flow_repair_socket()
fails it leaves conn->sock == -1, which will cause the restore functions
in tcp_flow_migrate_target_ext() to fail, discarding the flow.  To make
what's going on clearer (and with fewer extraneous error messages), put
several explicit tests for a missing socket later in the migration path to
read the data associated with the flow but explicitly discard it.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 tcp.c | 19 ++++++++++++++++---
 1 file changed, 16 insertions(+), 3 deletions(-)

diff --git a/tcp.c b/tcp.c
index d23b6d9..b3aa9a2 100644
--- a/tcp.c
+++ b/tcp.c
@@ -2708,6 +2708,9 @@ int tcp_flow_repair_on(struct ctx *c, const struct tcp_tap_conn *conn)
 {
 	int rc = 0;
 
+	if (conn->sock < 0)
+		return 0;
+
 	if ((rc = repair_set(c, conn->sock, TCP_REPAIR_ON)))
 		err("Failed to set TCP_REPAIR");
 
@@ -2725,6 +2728,9 @@ int tcp_flow_repair_off(struct ctx *c, const struct tcp_tap_conn *conn)
 {
 	int rc = 0;
 
+	if (conn->sock < 0)
+		return 0;
+
 	if ((rc = repair_set(c, conn->sock, TCP_REPAIR_OFF)))
 		err("Failed to clear TCP_REPAIR");
 
@@ -3377,7 +3383,8 @@ int tcp_flow_migrate_target(struct ctx *c, int fd)
 
 	if ((rc = tcp_flow_repair_socket(c, conn))) {
 		flow_err(flow, "Can't set up socket: %s, drop", strerror_(-rc));
-		flow_alloc_cancel(flow);
+		/* Can't leave the flow in an incomplete state */
+		FLOW_ACTIVATE(conn);
 		return 0;
 	}
 
@@ -3453,6 +3460,10 @@ int tcp_flow_migrate_target_ext(struct ctx *c, struct tcp_tap_conn *conn, int fd
 		return rc;
 	}
 
+	if (conn->sock < 0)
+		/* We weren't able to create the socket, discard flow */
+		goto fail;
+
 	if (tcp_flow_select_queue(s, TCP_SEND_QUEUE))
 		goto fail;
 
@@ -3540,8 +3551,10 @@ int tcp_flow_migrate_target_ext(struct ctx *c, struct tcp_tap_conn *conn, int fd
 	return 0;
 
 fail:
-	tcp_flow_repair_off(c, conn);
-	repair_flush(c);
+	if (conn->sock >= 0) {
+		tcp_flow_repair_off(c, conn);
+		repair_flush(c);
+	}
 
 	conn->flags = 0; /* Not waiting for ACK, don't schedule timer */
 	tcp_rst(c, conn);

From 008175636c789d36ef585a94eee4d62536cac7d6 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Wed, 5 Mar 2025 15:32:28 +1100
Subject: [PATCH 026/144] ip: Helpers to access IPv6 flow label

The flow label is a 20-bit field in the IPv6 header.  The length and
alignment make it awkward to pass around as is.  Obviously, it can be
packed into a 32-bit integer though, and we do this in two places.  We
have some further upcoming places where we want to manipulate the flow
label, so make some helpers for marshalling and unmarshalling it to an
integer.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 ip.h  | 25 +++++++++++++++++++++++++
 tap.c |  4 +---
 tcp.c |  4 +---
 3 files changed, 27 insertions(+), 6 deletions(-)

diff --git a/ip.h b/ip.h
index 858cc89..5edb7e7 100644
--- a/ip.h
+++ b/ip.h
@@ -91,6 +91,31 @@ struct ipv6_opt_hdr {
 	 */
 } __attribute__((packed));	/* required for some archs */
 
+/**
+ * ip6_set_flow_lbl() - Set flow label in an IPv6 header
+ * @ip6h:	Pointer to IPv6 header, updated
+ * @flow:	Set @ip6h flow label to the low 20 bits of this integer
+ */
+static inline void ip6_set_flow_lbl(struct ipv6hdr *ip6h, uint32_t flow)
+{
+	ip6h->flow_lbl[0] = (flow >> 16) & 0xf;
+	ip6h->flow_lbl[1] = (flow >> 8) & 0xff;
+	ip6h->flow_lbl[2] = (flow >> 0) & 0xff;
+}
+
+/** ip6_get_flow_lbl() - Get flow label from an IPv6 header
+ * @ip6h:	Pointer to IPv6 header
+ *
+ * Return: flow label from @ip6h as an integer (<= 20 bits)
+ */
+/* cppcheck-suppress unusedFunction */
+static inline uint32_t ip6_get_flow_lbl(const struct ipv6hdr *ip6h)
+{
+	return (ip6h->flow_lbl[0] & 0xf) << 16 |
+		ip6h->flow_lbl[1] << 8 |
+		ip6h->flow_lbl[2];
+}
+
 char *ipv6_l4hdr(const struct pool *p, int idx, size_t offset, uint8_t *proto,
 		 size_t *dlen);
 
diff --git a/tap.c b/tap.c
index 44b0fc0..3908262 100644
--- a/tap.c
+++ b/tap.c
@@ -241,9 +241,7 @@ static void *tap_push_ip6h(struct ipv6hdr *ip6h,
 	ip6h->hop_limit = 255;
 	ip6h->saddr = *src;
 	ip6h->daddr = *dst;
-	ip6h->flow_lbl[0] = (flow >> 16) & 0xf;
-	ip6h->flow_lbl[1] = (flow >> 8) & 0xff;
-	ip6h->flow_lbl[2] = (flow >> 0) & 0xff;
+	ip6_set_flow_lbl(ip6h, flow);
 	return ip6h + 1;
 }
 
diff --git a/tcp.c b/tcp.c
index b3aa9a2..7459803 100644
--- a/tcp.c
+++ b/tcp.c
@@ -963,9 +963,7 @@ void tcp_fill_headers(const struct tcp_tap_conn *conn,
 		ip6h->version = 6;
 		ip6h->nexthdr = IPPROTO_TCP;
 
-		ip6h->flow_lbl[0] = (conn->sock >> 16) & 0xf;
-		ip6h->flow_lbl[1] = (conn->sock >> 8) & 0xff;
-		ip6h->flow_lbl[2] = (conn->sock >> 0) & 0xff;
+		ip6_set_flow_lbl(ip6h, conn->sock);
 
 		if (!no_tcp_csum) {
 			psum = proto_ipv6_header_psum(l4len, IPPROTO_TCP,

From 1f236817ea715e9215e0fe4ecb0938d0a9809ce1 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Wed, 5 Mar 2025 15:32:29 +1100
Subject: [PATCH 027/144] tap: Consider IPv6 flow label when building packet
 sequences

To allow more batching, we group together related packets into "seqs" in
the tap layer, before passing them to the L4 protocol layers.  Currently
we consider the IP protocol, both IP addresses and also the L4 ports when
grouping things into seqs.  We ignore the IPv6 flow label.

We have some future cases where we want to consider the flow label in
the L4 code, which is awkward if we could be given a single batch with
multiple labels.  Add the flow label to tap6_l4_t and group by it as well
as the other criteria.  In future we could possibly use the flow label
_instead_ of peeking into the L4 header for the ports, but we don't do so
for now.

The guest should use the same flow label for all packets in a flow, but if
it doesn't this change won't break anything, it just means we'll batch
things a bit sub-optimally.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 ip.h  | 1 -
 tap.c | 4 ++++
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/ip.h b/ip.h
index 5edb7e7..c82431e 100644
--- a/ip.h
+++ b/ip.h
@@ -108,7 +108,6 @@ static inline void ip6_set_flow_lbl(struct ipv6hdr *ip6h, uint32_t flow)
  *
  * Return: flow label from @ip6h as an integer (<= 20 bits)
  */
-/* cppcheck-suppress unusedFunction */
 static inline uint32_t ip6_get_flow_lbl(const struct ipv6hdr *ip6h)
 {
 	return (ip6h->flow_lbl[0] & 0xf) << 16 |
diff --git a/tap.c b/tap.c
index 3908262..202abae 100644
--- a/tap.c
+++ b/tap.c
@@ -489,6 +489,7 @@ static struct tap4_l4_t {
  * struct l4_seq6_t - Message sequence for one protocol handler call, IPv6
  * @msgs:	Count of messages in sequence
  * @protocol:	Protocol number
+ * @flow_lbl:	IPv6 flow label
  * @source:	Source port
  * @dest:	Destination port
  * @saddr:	Source address
@@ -497,6 +498,7 @@ static struct tap4_l4_t {
  */
 static struct tap6_l4_t {
 	uint8_t protocol;
+	uint32_t flow_lbl :20;
 
 	uint16_t source;
 	uint16_t dest;
@@ -870,6 +872,7 @@ resume:
 		((seq)->protocol == (proto)                &&		\
 		 (seq)->source   == (uh)->source           &&		\
 		 (seq)->dest == (uh)->dest                 &&		\
+		 (seq)->flow_lbl == ip6_get_flow_lbl(ip6h) &&		\
 		 IN6_ARE_ADDR_EQUAL(&(seq)->saddr, saddr)  &&		\
 		 IN6_ARE_ADDR_EQUAL(&(seq)->daddr, daddr))
 
@@ -878,6 +881,7 @@ resume:
 		(seq)->protocol	= (proto);				\
 		(seq)->source	= (uh)->source;				\
 		(seq)->dest	= (uh)->dest;				\
+		(seq)->flow_lbl	= ip6_get_flow_lbl(ip6h);		\
 		(seq)->saddr	= *saddr;				\
 		(seq)->daddr	= *daddr;				\
 	} while (0)

From 672d786de1c1f2aca32caedbcf440f710c4aecb5 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Wed, 5 Mar 2025 15:32:30 +1100
Subject: [PATCH 028/144] tcp: Send RST in response to guest packets that match
 no connection

Currently, if a non-SYN TCP packet arrives which doesn't match any existing
connection, we simply ignore it.  However RFC 9293, section 3.10.7.1 says
we should respond with an RST to a non-SYN, non-RST packet that's for a
CLOSED (i.e. non-existent) connection.

This can arise in practice with migration, in cases where some error means
we have to discard a connection.  We destroy the connection with tcp_rst()
in that case, but because the guest is stopped, we may not be able to
deliver the RST packet on the tap interface immediately.  This change
ensures an RST will be sent if the guest tries to use the connection again.

A similar situation can arise if a passt/pasta instance is killed or
crashes, but is then replaced with another attached to the same guest.
This can leave the guest with stale connections that the new passt instance
isn't aware of.  It's better to send an RST so the guest knows quickly
these are broken, rather than letting them linger until they time out.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 tap.c | 17 +++++++-------
 tap.h |  6 +++++
 tcp.c | 74 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 tcp.h |  2 +-
 4 files changed, 88 insertions(+), 11 deletions(-)

diff --git a/tap.c b/tap.c
index 202abae..86d051e 100644
--- a/tap.c
+++ b/tap.c
@@ -122,7 +122,7 @@ const struct in6_addr *tap_ip6_daddr(const struct ctx *c,
  *
  * Return: pointer at which to write the packet's payload
  */
-static void *tap_push_l2h(const struct ctx *c, void *buf, uint16_t proto)
+void *tap_push_l2h(const struct ctx *c, void *buf, uint16_t proto)
 {
 	struct ethhdr *eh = (struct ethhdr *)buf;
 
@@ -143,8 +143,8 @@ static void *tap_push_l2h(const struct ctx *c, void *buf, uint16_t proto)
  *
  * Return: pointer at which to write the packet's payload
  */
-static void *tap_push_ip4h(struct iphdr *ip4h, struct in_addr src,
-			   struct in_addr dst, size_t l4len, uint8_t proto)
+void *tap_push_ip4h(struct iphdr *ip4h, struct in_addr src,
+		    struct in_addr dst, size_t l4len, uint8_t proto)
 {
 	uint16_t l3len = l4len + sizeof(*ip4h);
 
@@ -229,10 +229,9 @@ void tap_icmp4_send(const struct ctx *c, struct in_addr src, struct in_addr dst,
  *
  * Return: pointer at which to write the packet's payload
  */
-static void *tap_push_ip6h(struct ipv6hdr *ip6h,
-			   const struct in6_addr *src,
-			   const struct in6_addr *dst,
-			   size_t l4len, uint8_t proto, uint32_t flow)
+void *tap_push_ip6h(struct ipv6hdr *ip6h,
+		    const struct in6_addr *src, const struct in6_addr *dst,
+		    size_t l4len, uint8_t proto, uint32_t flow)
 {
 	ip6h->payload_len = htons(l4len);
 	ip6h->priority = 0;
@@ -744,7 +743,7 @@ append:
 			for (k = 0; k < p->count; )
 				k += tcp_tap_handler(c, PIF_TAP, AF_INET,
 						     &seq->saddr, &seq->daddr,
-						     p, k, now);
+						     0, p, k, now);
 		} else if (seq->protocol == IPPROTO_UDP) {
 			if (c->no_udp)
 				continue;
@@ -927,7 +926,7 @@ append:
 			for (k = 0; k < p->count; )
 				k += tcp_tap_handler(c, PIF_TAP, AF_INET6,
 						     &seq->saddr, &seq->daddr,
-						     p, k, now);
+						     seq->flow_lbl, p, k, now);
 		} else if (seq->protocol == IPPROTO_UDP) {
 			if (c->no_udp)
 				continue;
diff --git a/tap.h b/tap.h
index a476a12..390ac12 100644
--- a/tap.h
+++ b/tap.h
@@ -42,6 +42,9 @@ static inline void tap_hdr_update(struct tap_hdr *thdr, size_t l2len)
 		thdr->vnet_len = htonl(l2len);
 }
 
+void *tap_push_l2h(const struct ctx *c, void *buf, uint16_t proto);
+void *tap_push_ip4h(struct iphdr *ip4h, struct in_addr src,
+		     struct in_addr dst, size_t l4len, uint8_t proto);
 void tap_udp4_send(const struct ctx *c, struct in_addr src, in_port_t sport,
 		   struct in_addr dst, in_port_t dport,
 		   const void *in, size_t dlen);
@@ -49,6 +52,9 @@ void tap_icmp4_send(const struct ctx *c, struct in_addr src, struct in_addr dst,
 		    const void *in, size_t l4len);
 const struct in6_addr *tap_ip6_daddr(const struct ctx *c,
 				     const struct in6_addr *src);
+void *tap_push_ip6h(struct ipv6hdr *ip6h,
+		    const struct in6_addr *src, const struct in6_addr *dst,
+		    size_t l4len, uint8_t proto, uint32_t flow);
 void tap_udp6_send(const struct ctx *c,
 		   const struct in6_addr *src, in_port_t sport,
 		   const struct in6_addr *dst, in_port_t dport,
diff --git a/tcp.c b/tcp.c
index 7459803..fb04e2e 100644
--- a/tcp.c
+++ b/tcp.c
@@ -1866,6 +1866,75 @@ static void tcp_conn_from_sock_finish(const struct ctx *c,
 	tcp_data_from_sock(c, conn);
 }
 
+/**
+ * tcp_rst_no_conn() - Send RST in response to a packet with no connection
+ * @c:		Execution context
+ * @af:		Address family, AF_INET or AF_INET6
+ * @saddr:	Source address of the packet we're responding to
+ * @daddr:	Destination address of the packet we're responding to
+ * @flow_lbl:	IPv6 flow label (ignored for IPv4)
+ * @th:		TCP header of the packet we're responding to
+ * @l4len:	Packet length, including TCP header
+ */
+static void tcp_rst_no_conn(const struct ctx *c, int af,
+			    const void *saddr, const void *daddr,
+			    uint32_t flow_lbl,
+			    const struct tcphdr *th, size_t l4len)
+{
+	struct iov_tail payload = IOV_TAIL(NULL, 0, 0);
+	struct tcphdr *rsth;
+	char buf[USHRT_MAX];
+	uint32_t psum = 0;
+	size_t rst_l2len;
+
+	/* Don't respond to RSTs without a connection */
+	if (th->rst)
+		return;
+
+	if (af == AF_INET) {
+		struct iphdr *ip4h = tap_push_l2h(c, buf, ETH_P_IP);
+		const struct in_addr *rst_src = daddr;
+		const struct in_addr *rst_dst = saddr;
+
+		rsth = tap_push_ip4h(ip4h, *rst_src, *rst_dst,
+				     sizeof(*rsth), IPPROTO_TCP);
+		psum = proto_ipv4_header_psum(sizeof(*rsth), IPPROTO_TCP,
+					      *rst_src, *rst_dst);
+
+	} else {
+		struct ipv6hdr *ip6h = tap_push_l2h(c, buf, ETH_P_IPV6);
+		const struct in6_addr *rst_src = daddr;
+		const struct in6_addr *rst_dst = saddr;
+
+		rsth = tap_push_ip6h(ip6h, rst_src, rst_dst,
+				     sizeof(*rsth), IPPROTO_TCP, flow_lbl);
+		psum = proto_ipv6_header_psum(sizeof(*rsth), IPPROTO_TCP,
+					      rst_src, rst_dst);
+	}
+
+	memset(rsth, 0, sizeof(*rsth));
+
+	rsth->source = th->dest;
+	rsth->dest = th->source;
+	rsth->rst = 1;
+	rsth->doff = sizeof(*rsth) / 4UL;
+
+	/* Sequence matching logic from RFC 9293 section 3.10.7.1 */
+	if (th->ack) {
+		rsth->seq = th->ack_seq;
+	} else {
+		size_t dlen = l4len - th->doff * 4UL;
+		uint32_t ack = ntohl(th->seq) + dlen;
+
+		rsth->ack_seq = htonl(ack);
+		rsth->ack = 1;
+	}
+
+	tcp_update_csum(psum, rsth, &payload);
+	rst_l2len = ((char *)rsth - buf) + sizeof(*rsth);
+	tap_send_single(c, buf, rst_l2len);
+}
+
 /**
  * tcp_tap_handler() - Handle packets from tap and state transitions
  * @c:		Execution context
@@ -1873,6 +1942,7 @@ static void tcp_conn_from_sock_finish(const struct ctx *c,
  * @af:		Address family, AF_INET or AF_INET6
  * @saddr:	Source address
  * @daddr:	Destination address
+ * @flow_lbl:	IPv6 flow label (ignored for IPv4)
  * @p:		Pool of TCP packets, with TCP headers
  * @idx:	Index of first packet in pool to process
  * @now:	Current timestamp
@@ -1880,7 +1950,7 @@ static void tcp_conn_from_sock_finish(const struct ctx *c,
  * Return: count of consumed packets
  */
 int tcp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af,
-		    const void *saddr, const void *daddr,
+		    const void *saddr, const void *daddr, uint32_t flow_lbl,
 		    const struct pool *p, int idx, const struct timespec *now)
 {
 	struct tcp_tap_conn *conn;
@@ -1913,6 +1983,8 @@ int tcp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af,
 		if (opts && th->syn && !th->ack)
 			tcp_conn_from_tap(c, af, saddr, daddr, th,
 					  opts, optlen, now);
+		else
+			tcp_rst_no_conn(c, af, saddr, daddr, flow_lbl, th, len);
 		return 1;
 	}
 
diff --git a/tcp.h b/tcp.h
index cf30744..9142eca 100644
--- a/tcp.h
+++ b/tcp.h
@@ -16,7 +16,7 @@ void tcp_listen_handler(const struct ctx *c, union epoll_ref ref,
 void tcp_sock_handler(const struct ctx *c, union epoll_ref ref,
 		      uint32_t events);
 int tcp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af,
-		    const void *saddr, const void *daddr,
+		    const void *saddr, const void *daddr, uint32_t flow_lbl,
 		    const struct pool *p, int idx, const struct timespec *now);
 int tcp_sock_init(const struct ctx *c, const union inany_addr *addr,
 		  const char *ifname, in_port_t port);

From 1924e25f0723c0a86c1e33812f8e1d8aa045a146 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Wed, 5 Mar 2025 17:20:03 +1100
Subject: [PATCH 029/144] conf: Be more precise about minimum MTUs

Currently we reject the -m option if given a value less than ETH_MIN_MTU
(68).  That define is derived from the kernel, but its name is misleading:
it doesn't really have anything to do with Ethernet per se, but is rather
the minimum payload any L2 link must be able to handle in order to carry
IPv4.  For IPv6, it's not sufficient: that requires an MTU of at least
1280.

Newer kernels have better named constants IPV4_MIN_MTU and IPV6_MIN_MTU.
Copy and use those constants instead, along with some more specific error
messages.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 conf.c | 18 +++++++++++++++---
 ip.h   |  7 +++++++
 util.h |  6 ------
 3 files changed, 22 insertions(+), 9 deletions(-)

diff --git a/conf.c b/conf.c
index c5ee07b..065e720 100644
--- a/conf.c
+++ b/conf.c
@@ -1663,9 +1663,9 @@ void conf(struct ctx *c, int argc, char **argv)
 			if (errno || *e)
 				die("Invalid MTU: %s", optarg);
 
-			if (mtu && (mtu < ETH_MIN_MTU || mtu > ETH_MAX_MTU)) {
-				die("MTU %lu out of range (%u..%u)", mtu,
-				    ETH_MIN_MTU, ETH_MAX_MTU);
+			if (mtu > ETH_MAX_MTU) {
+				die("MTU %lu too large (max %u)",
+				    mtu, ETH_MAX_MTU);
 			}
 
 			c->mtu = mtu;
@@ -1842,9 +1842,21 @@ void conf(struct ctx *c, int argc, char **argv)
 		c->ifi4 = conf_ip4(ifi4, &c->ip4);
 	if (!v4_only)
 		c->ifi6 = conf_ip6(ifi6, &c->ip6);
+
+	if (c->ifi4 && c->mtu < IPV4_MIN_MTU) {
+		warn("MTU %"PRIu16" is too small for IPv4 (minimum %u)",
+		     c->mtu, IPV4_MIN_MTU);
+	}
+	if (c->ifi6 && c->mtu < IPV6_MIN_MTU) {
+		warn("MTU %"PRIu16" is too small for IPv6 (minimum %u)",
+			     c->mtu, IPV6_MIN_MTU);
+	}
+
 	if ((*c->ip4.ifname_out && !c->ifi4) ||
 	    (*c->ip6.ifname_out && !c->ifi6))
 		die("External interface not usable");
+
+
 	if (!c->ifi4 && !c->ifi6) {
 		info("No external interface as template, switch to local mode");
 
diff --git a/ip.h b/ip.h
index c82431e..471c57e 100644
--- a/ip.h
+++ b/ip.h
@@ -129,4 +129,11 @@ static const struct in6_addr in6addr_ll_all_nodes = {
 /* IPv4 Limited Broadcast (RFC 919, Section 7), 255.255.255.255 */
 static const struct in_addr in4addr_broadcast = { 0xffffffff };
 
+#ifndef IPV4_MIN_MTU
+#define IPV4_MIN_MTU		68
+#endif
+#ifndef IPV6_MIN_MTU
+#define IPV6_MIN_MTU		1280
+#endif
+
 #endif /* IP_H */
diff --git a/util.h b/util.h
index 50e96d3..0f70f4d 100644
--- a/util.h
+++ b/util.h
@@ -34,15 +34,9 @@
 #ifndef ETH_MAX_MTU
 #define ETH_MAX_MTU			USHRT_MAX
 #endif
-#ifndef ETH_MIN_MTU
-#define ETH_MIN_MTU			68
-#endif
 #ifndef IP_MAX_MTU
 #define IP_MAX_MTU			USHRT_MAX
 #endif
-#ifndef IPV6_MIN_MTU
-#define IPV6_MIN_MTU			1280
-#endif
 
 #ifndef MIN
 #define MIN(x, y)		(((x) < (y)) ? (x) : (y))

From 82a839be988ecfdb013b5823afc93211200a9f55 Mon Sep 17 00:00:00 2001
From: Jon Maloy <jmaloy@redhat.com>
Date: Thu, 6 Mar 2025 13:00:03 -0500
Subject: [PATCH 030/144] tap: break out building of udp header from
 tap_udp4_send function

We will need to build the UDP header at other locations than in function
tap_udp4_send(), so we break that part out to a separate function.

Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Jon Maloy <jmaloy@redhat.com>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 tap.c | 34 +++++++++++++++++++++++++++-------
 tap.h |  5 +++++
 2 files changed, 32 insertions(+), 7 deletions(-)

diff --git a/tap.c b/tap.c
index 86d051e..6f7063e 100644
--- a/tap.c
+++ b/tap.c
@@ -163,7 +163,7 @@ void *tap_push_ip4h(struct iphdr *ip4h, struct in_addr src,
 }
 
 /**
- * tap_udp4_send() - Send UDP over IPv4 packet
+ * tap_push_uh4() - Build UDPv4 header with checksum
  * @c:		Execution context
  * @src:	IPv4 source address
  * @sport:	UDP source port
@@ -171,16 +171,14 @@ void *tap_push_ip4h(struct iphdr *ip4h, struct in_addr src,
  * @dport:	UDP destination port
  * @in:		UDP payload contents (not including UDP header)
  * @dlen:	UDP payload length (not including UDP header)
+ *
+ * Return: pointer at which to write the packet's payload
  */
-void tap_udp4_send(const struct ctx *c, struct in_addr src, in_port_t sport,
+void *tap_push_uh4(struct udphdr *uh, struct in_addr src, in_port_t sport,
 		   struct in_addr dst, in_port_t dport,
 		   const void *in, size_t dlen)
 {
 	size_t l4len = dlen + sizeof(struct udphdr);
-	char buf[USHRT_MAX];
-	struct iphdr *ip4h = tap_push_l2h(c, buf, ETH_P_IP);
-	struct udphdr *uh = tap_push_ip4h(ip4h, src, dst, l4len, IPPROTO_UDP);
-	char *data = (char *)(uh + 1);
 	const struct iovec iov = {
 		.iov_base = (void *)in,
 		.iov_len = dlen
@@ -191,8 +189,30 @@ void tap_udp4_send(const struct ctx *c, struct in_addr src, in_port_t sport,
 	uh->dest = htons(dport);
 	uh->len = htons(l4len);
 	csum_udp4(uh, src, dst, &payload);
-	memcpy(data, in, dlen);
+	return (char *)uh + sizeof(*uh);
+}
 
+/**
+ * tap_udp4_send() - Send UDP over IPv4 packet
+ * @c:		Execution context
+ * @src:	IPv4 source address
+ * @sport:	UDP source port
+ * @dst:	IPv4 destination address
+ * @dport:	UDP destination port
+ * @in:	UDP payload contents (not including UDP header)
+ * @dlen:	UDP payload length (not including UDP header)
+ */
+void tap_udp4_send(const struct ctx *c, struct in_addr src, in_port_t sport,
+		   struct in_addr dst, in_port_t dport,
+		   const void *in, size_t dlen)
+{
+	size_t l4len = dlen + sizeof(struct udphdr);
+	char buf[USHRT_MAX];
+	struct iphdr *ip4h = tap_push_l2h(c, buf, ETH_P_IP);
+	struct udphdr *uh = tap_push_ip4h(ip4h, src, dst, l4len, IPPROTO_UDP);
+	char *data = tap_push_uh4(uh, src, sport, dst, dport, in, dlen);
+
+	memcpy(data, in, dlen);
 	tap_send_single(c, buf, dlen + (data - buf));
 }
 
diff --git a/tap.h b/tap.h
index 390ac12..a2cf9bc 100644
--- a/tap.h
+++ b/tap.h
@@ -6,6 +6,8 @@
 #ifndef TAP_H
 #define TAP_H
 
+struct udphdr;
+
 /**
  * struct tap_hdr - tap backend specific headers
  * @vnet_len:	Frame length (for qemu socket transport)
@@ -45,6 +47,9 @@ static inline void tap_hdr_update(struct tap_hdr *thdr, size_t l2len)
 void *tap_push_l2h(const struct ctx *c, void *buf, uint16_t proto);
 void *tap_push_ip4h(struct iphdr *ip4h, struct in_addr src,
 		     struct in_addr dst, size_t l4len, uint8_t proto);
+void *tap_push_uh4(struct udphdr *uh, struct in_addr src, in_port_t sport,
+		   struct in_addr dst, in_port_t dport,
+		   const void *in, size_t dlen);
 void tap_udp4_send(const struct ctx *c, struct in_addr src, in_port_t sport,
 		   struct in_addr dst, in_port_t dport,
 		   const void *in, size_t dlen);

From 55431f0077b6a25c264bd2492680d7f99815cc5f Mon Sep 17 00:00:00 2001
From: Jon Maloy <jmaloy@redhat.com>
Date: Thu, 6 Mar 2025 13:00:04 -0500
Subject: [PATCH 031/144] udp: create and send ICMPv4 to local peer when
 applicable

When a local peer sends a UDP message to a non-existing port on an
existing remote host, that host will return an ICMP message containing
the error code ICMP_PORT_UNREACH, plus the header and the first eight
bytes of the original message. If the sender socket has been connected,
it uses this message to issue a "Connection Refused" event to the user.

Until now, we have only read such events from the externally facing
socket, but we don't forward them back to the local sender because
we cannot read the ICMP message directly to user space. Because of
this, the local peer will hang and wait for a response that never
arrives.

We now fix this for IPv4 by recreating and forwarding a correct ICMP
message back to the internal sender. We synthesize the message based
on the information in the extended error structure, plus the returned
part of the original message body.

Note that for the sake of completeness, we even produce ICMP messages
for other error codes. We have noticed that at least ICMP_PROT_UNREACH
is propagated as an error event back to the user.

Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Jon Maloy <jmaloy@redhat.com>
[sbrivio: fix cppcheck warning: udp_send_conn_fail_icmp4() doesn't
 modify 'in', it can be declared as const]
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 tap.c          |  2 +-
 tap.h          |  2 ++
 udp.c          | 87 +++++++++++++++++++++++++++++++++++++++++++-------
 udp_internal.h |  2 +-
 udp_vu.c       |  4 +--
 5 files changed, 81 insertions(+), 16 deletions(-)

diff --git a/tap.c b/tap.c
index 6f7063e..57d0795 100644
--- a/tap.c
+++ b/tap.c
@@ -159,7 +159,7 @@ void *tap_push_ip4h(struct iphdr *ip4h, struct in_addr src,
 	ip4h->saddr = src.s_addr;
 	ip4h->daddr = dst.s_addr;
 	ip4h->check = csum_ip4_header(l3len, proto, src, dst);
-	return ip4h + 1;
+	return (char *)ip4h + sizeof(*ip4h);
 }
 
 /**
diff --git a/tap.h b/tap.h
index a2cf9bc..9ac17ce 100644
--- a/tap.h
+++ b/tap.h
@@ -50,6 +50,8 @@ void *tap_push_ip4h(struct iphdr *ip4h, struct in_addr src,
 void *tap_push_uh4(struct udphdr *uh, struct in_addr src, in_port_t sport,
 		   struct in_addr dst, in_port_t dport,
 		   const void *in, size_t dlen);
+void *tap_push_ip4h(struct iphdr *ip4h, struct in_addr src,
+		    struct in_addr dst, size_t l4len, uint8_t proto);
 void tap_udp4_send(const struct ctx *c, struct in_addr src, in_port_t sport,
 		   struct in_addr dst, in_port_t dport,
 		   const void *in, size_t dlen);
diff --git a/udp.c b/udp.c
index 923cc38..b72c3ce 100644
--- a/udp.c
+++ b/udp.c
@@ -87,6 +87,7 @@
 #include <netinet/in.h>
 #include <netinet/ip.h>
 #include <netinet/udp.h>
+#include <netinet/ip_icmp.h>
 #include <stdint.h>
 #include <stddef.h>
 #include <string.h>
@@ -112,6 +113,9 @@
 #include "udp_internal.h"
 #include "udp_vu.h"
 
+/* Maximum UDP data to be returned in ICMP messages */
+#define ICMP4_MAX_DLEN 8
+
 /* "Spliced" sockets indexed by bound port (host order) */
 static int udp_splice_ns  [IP_VERSIONS][NUM_PORTS];
 static int udp_splice_init[IP_VERSIONS][NUM_PORTS];
@@ -402,25 +406,76 @@ static void udp_tap_prepare(const struct mmsghdr *mmh,
 	(*tap_iov)[UDP_IOV_PAYLOAD].iov_len = l4len;
 }
 
+/**
+ * udp_send_conn_fail_icmp4() - Construct and send ICMPv4 to local peer
+ * @c:		Execution context
+ * @ee:	Extended error descriptor
+ * @toside:	Destination side of flow
+ * @saddr:	Address of ICMP generating node
+ * @in:	First bytes (max 8) of original UDP message body
+ * @dlen:	Length of the read part of original UDP message body
+ */
+static void udp_send_conn_fail_icmp4(const struct ctx *c,
+				     const struct sock_extended_err *ee,
+				     const struct flowside *toside,
+				     struct in_addr saddr,
+				     const void *in, size_t dlen)
+{
+	struct in_addr oaddr = toside->oaddr.v4mapped.a4;
+	struct in_addr eaddr = toside->eaddr.v4mapped.a4;
+	in_port_t eport = toside->eport;
+	in_port_t oport = toside->oport;
+	struct {
+		struct icmphdr icmp4h;
+		struct iphdr ip4h;
+		struct udphdr uh;
+		char data[ICMP4_MAX_DLEN];
+	} __attribute__((packed, aligned(__alignof__(max_align_t)))) msg;
+	size_t msglen = sizeof(msg) - sizeof(msg.data) + dlen;
+	size_t l4len = dlen + sizeof(struct udphdr);
+
+	ASSERT(dlen <= ICMP4_MAX_DLEN);
+	memset(&msg, 0, sizeof(msg));
+	msg.icmp4h.type = ee->ee_type;
+	msg.icmp4h.code = ee->ee_code;
+	if (ee->ee_type == ICMP_DEST_UNREACH && ee->ee_code == ICMP_FRAG_NEEDED)
+		msg.icmp4h.un.frag.mtu = htons((uint16_t) ee->ee_info);
+
+	/* Reconstruct the original headers as returned in the ICMP message */
+	tap_push_ip4h(&msg.ip4h, eaddr, oaddr, l4len, IPPROTO_UDP);
+	tap_push_uh4(&msg.uh, eaddr, eport, oaddr, oport, in, dlen);
+	memcpy(&msg.data, in, dlen);
+
+	tap_icmp4_send(c, saddr, eaddr, &msg, msglen);
+}
+
 /**
  * udp_sock_recverr() - Receive and clear an error from a socket
- * @s:		Socket to receive from
+ * @c:		Execution context
+ * @ref:	epoll reference
  *
  * Return: 1 if error received and processed, 0 if no more errors in queue, < 0
  *         if there was an error reading the queue
  *
  * #syscalls recvmsg
  */
-static int udp_sock_recverr(int s)
+static int udp_sock_recverr(const struct ctx *c, union epoll_ref ref)
 {
 	const struct sock_extended_err *ee;
 	const struct cmsghdr *hdr;
+	union sockaddr_inany saddr;
 	char buf[CMSG_SPACE(sizeof(*ee))];
+	char data[ICMP4_MAX_DLEN];
+	int s = ref.fd;
+	struct iovec iov = {
+		.iov_base = data,
+		.iov_len = sizeof(data)
+	};
 	struct msghdr mh = {
-		.msg_name = NULL,
-		.msg_namelen = 0,
-		.msg_iov = NULL,
-		.msg_iovlen = 0,
+		.msg_name = &saddr,
+		.msg_namelen = sizeof(saddr),
+		.msg_iov = &iov,
+		.msg_iovlen = 1,
 		.msg_control = buf,
 		.msg_controllen = sizeof(buf),
 	};
@@ -450,8 +505,15 @@ static int udp_sock_recverr(int s)
 	}
 
 	ee = (const struct sock_extended_err *)CMSG_DATA(hdr);
+	if (ref.type == EPOLL_TYPE_UDP_REPLY) {
+		flow_sidx_t sidx = flow_sidx_opposite(ref.flowside);
+		const struct flowside *toside = flowside_at_sidx(sidx);
 
-	/* TODO: When possible propagate and otherwise handle errors */
+		udp_send_conn_fail_icmp4(c, ee, toside, saddr.sa4.sin_addr,
+					 data, rc);
+	} else {
+		trace("Ignoring received IP_RECVERR cmsg on listener socket");
+	}
 	debug("%s error on UDP socket %i: %s",
 	      str_ee_origin(ee), s, strerror_(ee->ee_errno));
 
@@ -461,15 +523,16 @@ static int udp_sock_recverr(int s)
 /**
  * udp_sock_errs() - Process errors on a socket
  * @c:		Execution context
- * @s:		Socket to receive from
+ * @ref:	epoll reference
  * @events:	epoll events bitmap
  *
  * Return: Number of errors handled, or < 0 if we have an unrecoverable error
  */
-int udp_sock_errs(const struct ctx *c, int s, uint32_t events)
+int udp_sock_errs(const struct ctx *c, union epoll_ref ref, uint32_t events)
 {
 	unsigned n_err = 0;
 	socklen_t errlen;
+	int s = ref.fd;
 	int rc, err;
 
 	ASSERT(!c->no_udp);
@@ -478,7 +541,7 @@ int udp_sock_errs(const struct ctx *c, int s, uint32_t events)
 		return 0; /* Nothing to do */
 
 	/* Empty the error queue */
-	while ((rc = udp_sock_recverr(s)) > 0)
+	while ((rc = udp_sock_recverr(c, ref)) > 0)
 		n_err += rc;
 
 	if (rc < 0)
@@ -558,7 +621,7 @@ static void udp_buf_listen_sock_handler(const struct ctx *c,
 	const socklen_t sasize = sizeof(udp_meta[0].s_in);
 	int n, i;
 
-	if (udp_sock_errs(c, ref.fd, events) < 0) {
+	if (udp_sock_errs(c, ref, events) < 0) {
 		err("UDP: Unrecoverable error on listening socket:"
 		    " (%s port %hu)", pif_name(ref.udp.pif), ref.udp.port);
 		/* FIXME: what now?  close/re-open socket? */
@@ -661,7 +724,7 @@ static void udp_buf_reply_sock_handler(const struct ctx *c, union epoll_ref ref,
 
 	from_s = uflow->s[ref.flowside.sidei];
 
-	if (udp_sock_errs(c, from_s, events) < 0) {
+	if (udp_sock_errs(c, ref, events) < 0) {
 		flow_err(uflow, "Unrecoverable error on reply socket");
 		flow_err_details(uflow);
 		udp_flow_close(c, uflow);
diff --git a/udp_internal.h b/udp_internal.h
index cc80e30..3b081f5 100644
--- a/udp_internal.h
+++ b/udp_internal.h
@@ -30,5 +30,5 @@ size_t udp_update_hdr4(struct iphdr *ip4h, struct udp_payload_t *bp,
 size_t udp_update_hdr6(struct ipv6hdr *ip6h, struct udp_payload_t *bp,
                        const struct flowside *toside, size_t dlen,
 		       bool no_udp_csum);
-int udp_sock_errs(const struct ctx *c, int s, uint32_t events);
+int udp_sock_errs(const struct ctx *c, union epoll_ref ref, uint32_t events);
 #endif /* UDP_INTERNAL_H */
diff --git a/udp_vu.c b/udp_vu.c
index 4123510..c26a223 100644
--- a/udp_vu.c
+++ b/udp_vu.c
@@ -227,7 +227,7 @@ void udp_vu_listen_sock_handler(const struct ctx *c, union epoll_ref ref,
 	struct vu_virtq *vq = &vdev->vq[VHOST_USER_RX_QUEUE];
 	int i;
 
-	if (udp_sock_errs(c, ref.fd, events) < 0) {
+	if (udp_sock_errs(c, ref, events) < 0) {
 		err("UDP: Unrecoverable error on listening socket:"
 		    " (%s port %hu)", pif_name(ref.udp.pif), ref.udp.port);
 		return;
@@ -302,7 +302,7 @@ void udp_vu_reply_sock_handler(const struct ctx *c, union epoll_ref ref,
 
 	ASSERT(!c->no_udp);
 
-	if (udp_sock_errs(c, from_s, events) < 0) {
+	if (udp_sock_errs(c, ref, events) < 0) {
 		flow_err(uflow, "Unrecoverable error on reply socket");
 		flow_err_details(uflow);
 		udp_flow_close(c, uflow);

From 87e6a464429372dfaa7212b61e5062dad87179dc Mon Sep 17 00:00:00 2001
From: Jon Maloy <jmaloy@redhat.com>
Date: Thu, 6 Mar 2025 13:00:05 -0500
Subject: [PATCH 032/144] tap: break out building of udp header from
 tap_udp6_send function

We will need to build the UDP header at other locations than in function
tap_udp6_send(), so we break that part out to a separate function.

Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Jon Maloy <jmaloy@redhat.com>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 tap.c | 46 ++++++++++++++++++++++++++++++++++------------
 tap.h |  4 ++++
 2 files changed, 38 insertions(+), 12 deletions(-)

diff --git a/tap.c b/tap.c
index 57d0795..7082620 100644
--- a/tap.c
+++ b/tap.c
@@ -265,7 +265,7 @@ void *tap_push_ip6h(struct ipv6hdr *ip6h,
 }
 
 /**
- * tap_udp6_send() - Send UDP over IPv6 packet
+ * tap_push_uh6() - Build UDPv6 header with checksum
  * @c:		Execution context
  * @src:	IPv6 source address
  * @sport:	UDP source port
@@ -274,6 +274,38 @@ void *tap_push_ip6h(struct ipv6hdr *ip6h,
  * @flow:	Flow label
  * @in:		UDP payload contents (not including UDP header)
  * @dlen:	UDP payload length (not including UDP header)
+ *
+ * Return: pointer at which to write the packet's payload
+ */
+void *tap_push_uh6(struct udphdr *uh,
+		   const struct in6_addr *src, in_port_t sport,
+		   const struct in6_addr *dst, in_port_t dport,
+		   void *in, size_t dlen)
+{
+	size_t l4len = dlen + sizeof(struct udphdr);
+	const struct iovec iov = {
+		.iov_base = in,
+		.iov_len = dlen
+	};
+	struct iov_tail payload = IOV_TAIL(&iov, 1, 0);
+
+	uh->source = htons(sport);
+	uh->dest = htons(dport);
+	uh->len = htons(l4len);
+	csum_udp6(uh, src, dst, &payload);
+	return (char *)uh + sizeof(*uh);
+}
+
+/**
+ * tap_udp6_send() - Send UDP over IPv6 packet
+ * @c:		Execution context
+ * @src:	IPv6 source address
+ * @sport:	UDP source port
+ * @dst:	IPv6 destination address
+ * @dport:	UDP destination port
+ * @flow:	Flow label
+ * @in:	UDP payload contents (not including UDP header)
+ * @dlen:	UDP payload length (not including UDP header)
  */
 void tap_udp6_send(const struct ctx *c,
 		   const struct in6_addr *src, in_port_t sport,
@@ -285,19 +317,9 @@ void tap_udp6_send(const struct ctx *c,
 	struct ipv6hdr *ip6h = tap_push_l2h(c, buf, ETH_P_IPV6);
 	struct udphdr *uh = tap_push_ip6h(ip6h, src, dst,
 					  l4len, IPPROTO_UDP, flow);
-	char *data = (char *)(uh + 1);
-	const struct iovec iov = {
-		.iov_base = in,
-		.iov_len = dlen
-	};
-	struct iov_tail payload = IOV_TAIL(&iov, 1, 0);
+	char *data = tap_push_uh6(uh, src, sport, dst, dport, in, dlen);
 
-	uh->source = htons(sport);
-	uh->dest = htons(dport);
-	uh->len = htons(l4len);
-	csum_udp6(uh, src, dst, &payload);
 	memcpy(data, in, dlen);
-
 	tap_send_single(c, buf, dlen + (data - buf));
 }
 
diff --git a/tap.h b/tap.h
index 9ac17ce..b53a5b8 100644
--- a/tap.h
+++ b/tap.h
@@ -50,6 +50,10 @@ void *tap_push_ip4h(struct iphdr *ip4h, struct in_addr src,
 void *tap_push_uh4(struct udphdr *uh, struct in_addr src, in_port_t sport,
 		   struct in_addr dst, in_port_t dport,
 		   const void *in, size_t dlen);
+void *tap_push_uh6(struct udphdr *uh,
+		   const struct in6_addr *src, in_port_t sport,
+		   const struct in6_addr *dst, in_port_t dport,
+		   void *in, size_t dlen);
 void *tap_push_ip4h(struct iphdr *ip4h, struct in_addr src,
 		    struct in_addr dst, size_t l4len, uint8_t proto);
 void tap_udp4_send(const struct ctx *c, struct in_addr src, in_port_t sport,

From 68b04182e07da6a437479cb191e5468db382bc56 Mon Sep 17 00:00:00 2001
From: Jon Maloy <jmaloy@redhat.com>
Date: Thu, 6 Mar 2025 13:00:06 -0500
Subject: [PATCH 033/144] udp: create and send ICMPv6 to local peer when
 applicable

When a local peer sends a UDP message to a non-existing port on an
existing remote host, that host will return an ICMPv6 message containing
the error code ICMP6_DST_UNREACH_NOPORT, plus the IPv6 header, UDP header
and the first 1232 bytes of the original message, if any. If the sender
socket has been connected, it uses this message to issue a
"Connection Refused" event to the user.

Until now, we have only read such events from the externally facing
socket, but we don't forward them back to the local sender because
we cannot read the ICMP message directly to user space. Because of
this, the local peer will hang and wait for a response that never
arrives.

We now fix this for IPv6 by recreating and forwarding a correct ICMP
message back to the internal sender. We synthesize the message based
on the information in the extended error structure, plus the returned
part of the original message body.

Note that for the sake of completeness, we even produce ICMP messages
for other error types and codes. We have noticed that at least
ICMP_PROT_UNREACH is propagated as an error event back to the user.

Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Jon Maloy <jmaloy@redhat.com>
[sbrivio: fix cppcheck warning, udp_send_conn_fail_icmp6() doesn't
 modify saddr which can be declared as const]
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 tap.c |  2 +-
 tap.h |  4 ++++
 udp.c | 63 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++---
 3 files changed, 65 insertions(+), 4 deletions(-)

diff --git a/tap.c b/tap.c
index 7082620..4541f51 100644
--- a/tap.c
+++ b/tap.c
@@ -261,7 +261,7 @@ void *tap_push_ip6h(struct ipv6hdr *ip6h,
 	ip6h->saddr = *src;
 	ip6h->daddr = *dst;
 	ip6_set_flow_lbl(ip6h, flow);
-	return ip6h + 1;
+	return (char *)ip6h + sizeof(*ip6h);
 }
 
 /**
diff --git a/tap.h b/tap.h
index b53a5b8..a2c3b87 100644
--- a/tap.h
+++ b/tap.h
@@ -56,6 +56,10 @@ void *tap_push_uh6(struct udphdr *uh,
 		   void *in, size_t dlen);
 void *tap_push_ip4h(struct iphdr *ip4h, struct in_addr src,
 		    struct in_addr dst, size_t l4len, uint8_t proto);
+void *tap_push_ip6h(struct ipv6hdr *ip6h,
+		    const struct in6_addr *src,
+		    const struct in6_addr *dst,
+		    size_t l4len, uint8_t proto, uint32_t flow);
 void tap_udp4_send(const struct ctx *c, struct in_addr src, in_port_t sport,
 		   struct in_addr dst, in_port_t dport,
 		   const void *in, size_t dlen);
diff --git a/udp.c b/udp.c
index b72c3ce..80520cb 100644
--- a/udp.c
+++ b/udp.c
@@ -88,6 +88,7 @@
 #include <netinet/ip.h>
 #include <netinet/udp.h>
 #include <netinet/ip_icmp.h>
+#include <netinet/icmp6.h>
 #include <stdint.h>
 #include <stddef.h>
 #include <string.h>
@@ -115,6 +116,9 @@
 
 /* Maximum UDP data to be returned in ICMP messages */
 #define ICMP4_MAX_DLEN 8
+#define ICMP6_MAX_DLEN (IPV6_MIN_MTU			\
+			- sizeof(struct udphdr)	\
+			- sizeof(struct ipv6hdr))
 
 /* "Spliced" sockets indexed by bound port (host order) */
 static int udp_splice_ns  [IP_VERSIONS][NUM_PORTS];
@@ -449,6 +453,51 @@ static void udp_send_conn_fail_icmp4(const struct ctx *c,
 	tap_icmp4_send(c, saddr, eaddr, &msg, msglen);
 }
 
+
+/**
+ * udp_send_conn_fail_icmp6() - Construct and send ICMPv6 to local peer
+ * @c:		Execution context
+ * @ee:	Extended error descriptor
+ * @toside:	Destination side of flow
+ * @saddr:	Address of ICMP generating node
+ * @in:	First bytes (max 1232) of original UDP message body
+ * @dlen:	Length of the read part of original UDP message body
+ * @flow:	IPv6 flow identifier
+ */
+static void udp_send_conn_fail_icmp6(const struct ctx *c,
+				     const struct sock_extended_err *ee,
+				     const struct flowside *toside,
+				     const struct in6_addr *saddr,
+				     void *in, size_t dlen, uint32_t flow)
+{
+	const struct in6_addr *oaddr = &toside->oaddr.a6;
+	const struct in6_addr *eaddr = &toside->eaddr.a6;
+	in_port_t eport = toside->eport;
+	in_port_t oport = toside->oport;
+	struct {
+		struct icmp6_hdr icmp6h;
+		struct ipv6hdr ip6h;
+		struct udphdr uh;
+		char data[ICMP6_MAX_DLEN];
+	} __attribute__((packed, aligned(__alignof__(max_align_t)))) msg;
+	size_t msglen = sizeof(msg) - sizeof(msg.data) + dlen;
+	size_t l4len = dlen + sizeof(struct udphdr);
+
+	ASSERT(dlen <= ICMP6_MAX_DLEN);
+	memset(&msg, 0, sizeof(msg));
+	msg.icmp6h.icmp6_type = ee->ee_type;
+	msg.icmp6h.icmp6_code = ee->ee_code;
+	if (ee->ee_type == ICMP6_PACKET_TOO_BIG)
+		msg.icmp6h.icmp6_dataun.icmp6_un_data32[0] = htonl(ee->ee_info);
+
+	/* Reconstruct the original headers as returned in the ICMP message */
+	tap_push_ip6h(&msg.ip6h, eaddr, oaddr, l4len, IPPROTO_UDP, flow);
+	tap_push_uh6(&msg.uh, eaddr, eport, oaddr, oport, in, dlen);
+	memcpy(&msg.data, in, dlen);
+
+	tap_icmp6_send(c, saddr, eaddr, &msg, msglen);
+}
+
 /**
  * udp_sock_recverr() - Receive and clear an error from a socket
  * @c:		Execution context
@@ -465,7 +514,7 @@ static int udp_sock_recverr(const struct ctx *c, union epoll_ref ref)
 	const struct cmsghdr *hdr;
 	union sockaddr_inany saddr;
 	char buf[CMSG_SPACE(sizeof(*ee))];
-	char data[ICMP4_MAX_DLEN];
+	char data[ICMP6_MAX_DLEN];
 	int s = ref.fd;
 	struct iovec iov = {
 		.iov_base = data,
@@ -508,9 +557,17 @@ static int udp_sock_recverr(const struct ctx *c, union epoll_ref ref)
 	if (ref.type == EPOLL_TYPE_UDP_REPLY) {
 		flow_sidx_t sidx = flow_sidx_opposite(ref.flowside);
 		const struct flowside *toside = flowside_at_sidx(sidx);
+		size_t dlen = rc;
 
-		udp_send_conn_fail_icmp4(c, ee, toside, saddr.sa4.sin_addr,
-					 data, rc);
+		if (hdr->cmsg_level == IPPROTO_IP) {
+			dlen = MIN(dlen, ICMP4_MAX_DLEN);
+			udp_send_conn_fail_icmp4(c, ee, toside, saddr.sa4.sin_addr,
+						 data, dlen);
+		} else if (hdr->cmsg_level == IPPROTO_IPV6) {
+			udp_send_conn_fail_icmp6(c, ee, toside,
+						 &saddr.sa6.sin6_addr,
+						 data, dlen, sidx.flowi);
+		}
 	} else {
 		trace("Ignoring received IP_RECVERR cmsg on listener socket");
 	}

From 57d2db370b9c12aca84901d968c2c31db89ca462 Mon Sep 17 00:00:00 2001
From: David Gibson <dgibson@redhat.com>
Date: Wed, 5 Mar 2025 17:15:03 +1100
Subject: [PATCH 034/144] treewide: Mark assorted functions static

This marks static a number of functions which are only used in their .c
file, have no prototypes in a .h and were never intended to be globally
exposed.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 log.c     | 2 +-
 netlink.c | 2 +-
 passt.c   | 2 +-
 tcp.c     | 6 +++---
 4 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/log.c b/log.c
index 95e4576..b6bce21 100644
--- a/log.c
+++ b/log.c
@@ -56,7 +56,7 @@ bool		log_stderr = true;	/* Not daemonised, no shell spawned */
  *
  * Return: pointer to @now, or NULL if there was an error retrieving the time
  */
-const struct timespec *logtime(struct timespec *ts)
+static const struct timespec *logtime(struct timespec *ts)
 {
 	if (clock_gettime(CLOCK_MONOTONIC, ts))
 		return NULL;
diff --git a/netlink.c b/netlink.c
index 37d8b5b..a052504 100644
--- a/netlink.c
+++ b/netlink.c
@@ -355,7 +355,7 @@ unsigned int nl_get_ext_if(int s, sa_family_t af)
  *
  * Return: true if a gateway was found, false otherwise
  */
-bool nl_route_get_def_multipath(struct rtattr *rta, void *gw)
+static bool nl_route_get_def_multipath(struct rtattr *rta, void *gw)
 {
 	int nh_len = RTA_PAYLOAD(rta);
 	struct rtnexthop *rtnh;
diff --git a/passt.c b/passt.c
index 68d1a28..868842b 100644
--- a/passt.c
+++ b/passt.c
@@ -166,7 +166,7 @@ void proto_update_l2_buf(const unsigned char *eth_d, const unsigned char *eth_s)
  *
  * #syscalls exit_group
  */
-void exit_handler(int signal)
+static void exit_handler(int signal)
 {
 	(void)signal;
 
diff --git a/tcp.c b/tcp.c
index fb04e2e..4c24367 100644
--- a/tcp.c
+++ b/tcp.c
@@ -2497,7 +2497,7 @@ static void tcp_ns_sock_init6(const struct ctx *c, in_port_t port)
  * @c:		Execution context
  * @port:	Port, host order
  */
-void tcp_ns_sock_init(const struct ctx *c, in_port_t port)
+static void tcp_ns_sock_init(const struct ctx *c, in_port_t port)
 {
 	ASSERT(!c->no_tcp);
 
@@ -3141,7 +3141,7 @@ static int tcp_flow_dump_rcvqueue(int s, struct tcp_tap_transfer_ext *t)
  *
  * Return: 0 on success, negative error code on failure
  */
-int tcp_flow_repair_opt(int s, const struct tcp_tap_transfer_ext *t)
+static int tcp_flow_repair_opt(int s, const struct tcp_tap_transfer_ext *t)
 {
 	const struct tcp_repair_opt opts[] = {
 		{ TCPOPT_WINDOW,		t->snd_ws + (t->rcv_ws << 16) },
@@ -3333,7 +3333,7 @@ fail:
  *
  * Return: 0 on success, negative error code on failure
  */
-int tcp_flow_repair_socket(struct ctx *c, struct tcp_tap_conn *conn)
+static int tcp_flow_repair_socket(struct ctx *c, struct tcp_tap_conn *conn)
 {
 	sa_family_t af = CONN_V4(conn) ? AF_INET : AF_INET6;
 	const struct flowside *sockside = HOSTFLOW(conn);

From e36c35c952ef0848383cba8ef71e13cf25dab2da Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Wed, 5 Mar 2025 17:15:04 +1100
Subject: [PATCH 035/144] log: Don't export passt_vsyslog()

passt_vsyslog() is an exposed function in log.h.  However it shouldn't
be called from outside log.c: it writes specifically to the system log,
and most code should call passt's logging helpers which might go to the
syslog or to a log file.

Make passt_vsyslog() local to log.c.  This requires a code motion to avoid
a forward declaration.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 log.c | 48 ++++++++++++++++++++++++------------------------
 log.h |  1 -
 2 files changed, 24 insertions(+), 25 deletions(-)

diff --git a/log.c b/log.c
index b6bce21..6eda4c4 100644
--- a/log.c
+++ b/log.c
@@ -249,6 +249,30 @@ static void logfile_write(bool newline, bool cont, int pri,
 		log_written += n;
 }
 
+/**
+ * passt_vsyslog() - vsyslog() implementation not using heap memory
+ * @newline:	Append newline at the end of the message, if missing
+ * @pri:	Facility and level map, same as priority for vsyslog()
+ * @format:	Same as vsyslog() format
+ * @ap:		Same as vsyslog() ap
+ */
+static void passt_vsyslog(bool newline, int pri, const char *format, va_list ap)
+{
+	char buf[BUFSIZ];
+	int n;
+
+	/* Send without timestamp, the system logger should add it */
+	n = snprintf(buf, BUFSIZ, "<%i> %s: ", pri, log_ident);
+
+	n += vsnprintf(buf + n, BUFSIZ - n, format, ap);
+
+	if (newline && format[strlen(format)] != '\n')
+		n += snprintf(buf + n, BUFSIZ - n, "\n");
+
+	if (log_sock >= 0 && send(log_sock, buf, n, 0) != n && log_stderr)
+		FPRINTF(stderr, "Failed to send %i bytes to syslog\n", n);
+}
+
 /**
  * vlogmsg() - Print or send messages to log or output files as configured
  * @newline:	Append newline at the end of the message, if missing
@@ -373,30 +397,6 @@ void __setlogmask(int mask)
 	setlogmask(mask);
 }
 
-/**
- * passt_vsyslog() - vsyslog() implementation not using heap memory
- * @newline:	Append newline at the end of the message, if missing
- * @pri:	Facility and level map, same as priority for vsyslog()
- * @format:	Same as vsyslog() format
- * @ap:		Same as vsyslog() ap
- */
-void passt_vsyslog(bool newline, int pri, const char *format, va_list ap)
-{
-	char buf[BUFSIZ];
-	int n;
-
-	/* Send without timestamp, the system logger should add it */
-	n = snprintf(buf, BUFSIZ, "<%i> %s: ", pri, log_ident);
-
-	n += vsnprintf(buf + n, BUFSIZ - n, format, ap);
-
-	if (newline && format[strlen(format)] != '\n')
-		n += snprintf(buf + n, BUFSIZ - n, "\n");
-
-	if (log_sock >= 0 && send(log_sock, buf, n, 0) != n && log_stderr)
-		FPRINTF(stderr, "Failed to send %i bytes to syslog\n", n);
-}
-
 /**
  * logfile_init() - Open log file and write header with PID, version, path
  * @name:	Identifier for header: passt or pasta
diff --git a/log.h b/log.h
index 22c7b9a..08aa88c 100644
--- a/log.h
+++ b/log.h
@@ -55,7 +55,6 @@ void trace_init(int enable);
 
 void __openlog(const char *ident, int option, int facility);
 void logfile_init(const char *name, const char *path, size_t size);
-void passt_vsyslog(bool newline, int pri, const char *format, va_list ap);
 void __setlogmask(int mask);
 
 #endif /* LOG_H */

From 12d5b36b2f17a1ddc9447b925dbec161b4da346a Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Wed, 5 Mar 2025 17:15:05 +1100
Subject: [PATCH 036/144] checksum: Don't export various functions

Several of the exposed functions in checksum.h are no longer directly used.
Remove them from the header, and make them static.  In particular,
sum_16b() should not be used outside checksum.c: generally, csum_unfolded()
should be used instead, which will automatically use either the AVX2
optimized version or sum_16b() as necessary.

csum_fold() and csum() could have external uses, but they're not used right
now.  We can expose them again if we need to.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 checksum.c | 34 +++++++++++++++++-----------------
 checksum.h |  3 ---
 2 files changed, 17 insertions(+), 20 deletions(-)

diff --git a/checksum.c b/checksum.c
index b01e0fe..0894eca 100644
--- a/checksum.c
+++ b/checksum.c
@@ -85,7 +85,7 @@
  */
 /* NOLINTNEXTLINE(clang-diagnostic-unknown-attributes) */
 __attribute__((optimize("-fno-strict-aliasing")))
-uint32_t sum_16b(const void *buf, size_t len)
+static uint32_t sum_16b(const void *buf, size_t len)
 {
 	const uint16_t *p = buf;
 	uint32_t sum = 0;
@@ -107,7 +107,7 @@ uint32_t sum_16b(const void *buf, size_t len)
  *
  * Return: 16-bit folded sum
  */
-uint16_t csum_fold(uint32_t sum)
+static uint16_t csum_fold(uint32_t sum)
 {
 	while (sum >> 16)
 		sum = (sum & 0xffff) + (sum >> 16);
@@ -161,6 +161,21 @@ uint32_t proto_ipv4_header_psum(uint16_t l4len, uint8_t protocol,
 	return psum;
 }
 
+/**
+ * csum() - Compute TCP/IP-style checksum
+ * @buf:	Input buffer
+ * @len:	Input length
+ * @init:	Initial 32-bit checksum, 0 for no pre-computed checksum
+ *
+ * Return: 16-bit folded, complemented checksum
+ */
+/* NOLINTNEXTLINE(clang-diagnostic-unknown-attributes) */
+__attribute__((optimize("-fno-strict-aliasing")))	/* See csum_16b() */
+static uint16_t csum(const void *buf, size_t len, uint32_t init)
+{
+	return (uint16_t)~csum_fold(csum_unfolded(buf, len, init));
+}
+
 /**
  * csum_udp4() - Calculate and set checksum for a UDP over IPv4 packet
  * @udp4hr:	UDP header, initialised apart from checksum
@@ -482,21 +497,6 @@ uint32_t csum_unfolded(const void *buf, size_t len, uint32_t init)
 }
 #endif /* !__AVX2__ */
 
-/**
- * csum() - Compute TCP/IP-style checksum
- * @buf:	Input buffer
- * @len:	Input length
- * @init:	Initial 32-bit checksum, 0 for no pre-computed checksum
- *
- * Return: 16-bit folded, complemented checksum
- */
-/* NOLINTNEXTLINE(clang-diagnostic-unknown-attributes) */
-__attribute__((optimize("-fno-strict-aliasing")))	/* See csum_16b() */
-uint16_t csum(const void *buf, size_t len, uint32_t init)
-{
-	return (uint16_t)~csum_fold(csum_unfolded(buf, len, init));
-}
-
 /**
  * csum_iov_tail() - Calculate unfolded checksum for the tail of an IO vector
  * @tail:	IO vector tail to checksum
diff --git a/checksum.h b/checksum.h
index e243c97..683a09b 100644
--- a/checksum.h
+++ b/checksum.h
@@ -11,8 +11,6 @@ struct icmphdr;
 struct icmp6hdr;
 struct iov_tail;
 
-uint32_t sum_16b(const void *buf, size_t len);
-uint16_t csum_fold(uint32_t sum);
 uint16_t csum_unaligned(const void *buf, size_t len, uint32_t init);
 uint16_t csum_ip4_header(uint16_t l3len, uint8_t protocol,
 			 struct in_addr saddr, struct in_addr daddr);
@@ -32,7 +30,6 @@ void csum_icmp6(struct icmp6hdr *icmp6hr,
 		const struct in6_addr *saddr, const struct in6_addr *daddr,
 		const void *payload, size_t dlen);
 uint32_t csum_unfolded(const void *buf, size_t len, uint32_t init);
-uint16_t csum(const void *buf, size_t len, uint32_t init);
 uint16_t csum_iov_tail(struct iov_tail *tail, uint32_t init);
 
 #endif /* CHECKSUM_H */

From 27395e67c26a73e2e035360195b5928a07996dd5 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Wed, 5 Mar 2025 17:15:06 +1100
Subject: [PATCH 037/144] tcp: Don't export tcp_update_csum()

tcp_update_csum() is exposed in tcp_internal.h, but is only used in tcp.c.
Remove the unneeded prototype and make it static.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 tcp.c          | 3 ++-
 tcp_internal.h | 2 --
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/tcp.c b/tcp.c
index 4c24367..32a08bd 100644
--- a/tcp.c
+++ b/tcp.c
@@ -787,7 +787,8 @@ static void tcp_sock_set_nodelay(int s)
  * @th:		TCP header (updated)
  * @payload:	TCP payload
  */
-void tcp_update_csum(uint32_t psum, struct tcphdr *th, struct iov_tail *payload)
+static void tcp_update_csum(uint32_t psum, struct tcphdr *th,
+			    struct iov_tail *payload)
 {
 	th->check = 0;
 	psum = csum_unfolded(th, sizeof(*th), psum);
diff --git a/tcp_internal.h b/tcp_internal.h
index 9cf31f5..6f5e054 100644
--- a/tcp_internal.h
+++ b/tcp_internal.h
@@ -166,8 +166,6 @@ void tcp_rst_do(const struct ctx *c, struct tcp_tap_conn *conn);
 
 struct tcp_info_linux;
 
-void tcp_update_csum(uint32_t psum, struct tcphdr *th,
-		     struct iov_tail *payload);
 void tcp_fill_headers(const struct tcp_tap_conn *conn,
 		      struct tap_hdr *taph,
 		      struct iphdr *ip4h, struct ipv6hdr *ip6h,

From a83c806d1786fbe19bc6a3014f248e928e00651b Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Wed, 5 Mar 2025 17:15:07 +1100
Subject: [PATCH 038/144] vhost_user: Don't export several functions

vhost-user added several functions which are exposed in headers, but not
used outside the file where they're defined.  I can't tell if these are
really internal functions, or if they're logically supposed to be exported,
but we don't happen to have anything using them yet.

For the time being, just remove the exports.  We can add them back if we
need to.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 vhost_user.c | 2 +-
 vhost_user.h | 1 -
 virtio.c     | 9 +++++----
 virtio.h     | 4 ----
 4 files changed, 6 insertions(+), 10 deletions(-)

diff --git a/vhost_user.c b/vhost_user.c
index be1aa94..105f77a 100644
--- a/vhost_user.c
+++ b/vhost_user.c
@@ -517,7 +517,7 @@ static void vu_close_log(struct vu_dev *vdev)
  * vu_log_kick() - Inform the front-end that the log has been modified
  * @vdev:	vhost-user device
  */
-void vu_log_kick(const struct vu_dev *vdev)
+static void vu_log_kick(const struct vu_dev *vdev)
 {
 	if (vdev->log_call_fd != -1) {
 		int rc;
diff --git a/vhost_user.h b/vhost_user.h
index e769cb1..1daacd1 100644
--- a/vhost_user.h
+++ b/vhost_user.h
@@ -241,7 +241,6 @@ static inline bool vu_queue_started(const struct vu_virtq *vq)
 void vu_print_capabilities(void);
 void vu_init(struct ctx *c);
 void vu_cleanup(struct vu_dev *vdev);
-void vu_log_kick(const struct vu_dev *vdev);
 void vu_log_write(const struct vu_dev *vdev, uint64_t address,
 		  uint64_t length);
 void vu_control_handler(struct vu_dev *vdev, int fd, uint32_t events);
diff --git a/virtio.c b/virtio.c
index 2b58e4d..bc2b89a 100644
--- a/virtio.c
+++ b/virtio.c
@@ -286,7 +286,7 @@ static int virtqueue_read_next_desc(const struct vring_desc *desc,
  *
  * Return: true if the virtqueue is empty, false otherwise
  */
-bool vu_queue_empty(struct vu_virtq *vq)
+static bool vu_queue_empty(struct vu_virtq *vq)
 {
 	if (!vq->vring.avail)
 		return true;
@@ -671,9 +671,10 @@ static void vu_log_queue_fill(const struct vu_dev *vdev, struct vu_virtq *vq,
  * @len:	Size of the element
  * @idx:	Used ring entry index
  */
-void vu_queue_fill_by_index(const struct vu_dev *vdev, struct vu_virtq *vq,
-			    unsigned int index, unsigned int len,
-			    unsigned int idx)
+static void vu_queue_fill_by_index(const struct vu_dev *vdev,
+				   struct vu_virtq *vq,
+				   unsigned int index, unsigned int len,
+				   unsigned int idx)
 {
 	struct vring_used_elem uelem;
 
diff --git a/virtio.h b/virtio.h
index 0a59441..7a370bd 100644
--- a/virtio.h
+++ b/virtio.h
@@ -174,16 +174,12 @@ static inline bool vu_has_protocol_feature(const struct vu_dev *vdev,
 	return has_feature(vdev->protocol_features, fbit);
 }
 
-bool vu_queue_empty(struct vu_virtq *vq);
 void vu_queue_notify(const struct vu_dev *dev, struct vu_virtq *vq);
 int vu_queue_pop(const struct vu_dev *dev, struct vu_virtq *vq,
 		 struct vu_virtq_element *elem);
 void vu_queue_detach_element(struct vu_virtq *vq);
 void vu_queue_unpop(struct vu_virtq *vq);
 bool vu_queue_rewind(struct vu_virtq *vq, unsigned int num);
-void vu_queue_fill_by_index(const struct vu_dev *vdev, struct vu_virtq *vq,
-			    unsigned int index, unsigned int len,
-			    unsigned int idx);
 void vu_queue_fill(const struct vu_dev *vdev, struct vu_virtq *vq,
 		   const struct vu_virtq_element *elem, unsigned int len,
 		   unsigned int idx);

From 2b58b22845a76baf24141155eb4d4a882f509e97 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Wed, 5 Mar 2025 17:15:08 +1100
Subject: [PATCH 039/144] cppcheck: Add suppressions for "logically" exported
 functions

We have some functions in our headers which are definitely there on
purpose.  However, they're not yet used outside the files in which they're
defined.  That causes sufficiently recent cppcheck versions (2.17) to
complain they should be static.

Suppress the errors for these "logically" exported functions.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 iov.c | 1 +
 log.c | 1 +
 2 files changed, 2 insertions(+)

diff --git a/iov.c b/iov.c
index 3b12272..8c63b7e 100644
--- a/iov.c
+++ b/iov.c
@@ -203,6 +203,7 @@ size_t iov_tail_size(struct iov_tail *tail)
  *	    overruns the IO vector, is not contiguous or doesn't have the
  *	    requested alignment.
  */
+/* cppcheck-suppress [staticFunction,unmatchedSuppression] */
 void *iov_peek_header_(struct iov_tail *tail, size_t len, size_t align)
 {
 	char *p;
diff --git a/log.c b/log.c
index 6eda4c4..d40d7ae 100644
--- a/log.c
+++ b/log.c
@@ -281,6 +281,7 @@ static void passt_vsyslog(bool newline, int pri, const char *format, va_list ap)
  * @format:	Message
  * @ap:		Variable argument list
  */
+/* cppcheck-suppress [staticFunction,unmatchedSuppression] */
 void vlogmsg(bool newline, bool cont, int pri, const char *format, va_list ap)
 {
 	bool debug_print = (log_mask & LOG_MASK(LOG_DEBUG)) && log_file == -1;

From 04701702471ececee362669cc6b49ed9e20a1b6d Mon Sep 17 00:00:00 2001
From: Stefano Brivio <sbrivio@redhat.com>
Date: Fri, 7 Mar 2025 23:27:03 +0100
Subject: [PATCH 040/144] passt-repair: Add directory watch

It might not be feasible for users to start passt-repair after passt
is started, on a migration target, but before the migration process
starts.

For instance, with libvirt, the guest domain (and, hence, passt) is
started on the target as part of the migration process. At least for
the time being, there's no hook a libvirt user (including KubeVirt)
can use to start passt-repair before the migration starts.

Add a directory watch using inotify: if PATH is a directory, instead
of connecting to it, we'll watch for a .repair socket file to appear
in it, and then attempt to connect to that socket.

Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
---
 contrib/selinux/passt-repair.te | 16 +++----
 passt-repair.1                  |  6 ++-
 passt-repair.c                  | 84 +++++++++++++++++++++++++++++----
 3 files changed, 89 insertions(+), 17 deletions(-)

diff --git a/contrib/selinux/passt-repair.te b/contrib/selinux/passt-repair.te
index f171be6..7157dfb 100644
--- a/contrib/selinux/passt-repair.te
+++ b/contrib/selinux/passt-repair.te
@@ -61,11 +61,11 @@ allow passt_repair_t unconfined_t:unix_stream_socket { connectto read write };
 allow passt_repair_t passt_t:unix_stream_socket { connectto read write };
 allow passt_repair_t user_tmp_t:unix_stream_socket { connectto read write };
 
-allow passt_repair_t user_tmp_t:dir search;
+allow passt_repair_t user_tmp_t:dir { getattr read search watch };
 
-allow passt_repair_t unconfined_t:sock_file { read write };
-allow passt_repair_t passt_t:sock_file { read write };
-allow passt_repair_t user_tmp_t:sock_file { read write };
+allow passt_repair_t unconfined_t:sock_file { getattr read write };
+allow passt_repair_t passt_t:sock_file { getattr read write };
+allow passt_repair_t user_tmp_t:sock_file { getattr read write };
 
 allow passt_repair_t unconfined_t:tcp_socket { read setopt write };
 allow passt_repair_t passt_t:tcp_socket { read setopt write };
@@ -80,8 +80,8 @@ allow passt_repair_t passt_t:tcp_socket { read setopt write };
 allow passt_repair_t qemu_var_run_t:unix_stream_socket { connectto read write };
 allow passt_repair_t virt_var_run_t:unix_stream_socket { connectto read write };
 
-allow passt_repair_t qemu_var_run_t:dir search;
-allow passt_repair_t virt_var_run_t:dir search;
+allow passt_repair_t qemu_var_run_t:dir { getattr read search watch };
+allow passt_repair_t virt_var_run_t:dir { getattr read search watch };
 
-allow passt_repair_t qemu_var_run_t:sock_file { read write };
-allow passt_repair_t virt_var_run_t:sock_file { read write };
+allow passt_repair_t qemu_var_run_t:sock_file { getattr read write };
+allow passt_repair_t virt_var_run_t:sock_file { getattr read write };
diff --git a/passt-repair.1 b/passt-repair.1
index 7c1b140..e65aadd 100644
--- a/passt-repair.1
+++ b/passt-repair.1
@@ -16,13 +16,17 @@
 .B passt-repair
 is a privileged helper setting and clearing repair mode on TCP sockets on behalf
 of \fBpasst\fR(1), as instructed via single-byte commands over a UNIX domain
-socket, specified by \fIPATH\fR.
+socket.
 
 It can be used to migrate TCP connections between guests without granting
 additional capabilities to \fBpasst\fR(1) itself: to migrate TCP connections,
 \fBpasst\fR(1) leverages repair mode, which needs the \fBCAP_NET_ADMIN\fR
 capability (see \fBcapabilities\fR(7)) to be set or cleared.
 
+If \fIPATH\fR represents a UNIX domain socket, \fBpasst-repair\fR(1) attempts to
+connect to it. If it is a directory, \fBpasst-repair\fR(1) waits until a file
+ending with \fI.repair\fR appears in it, and then attempts to connect to it.
+
 .SH PROTOCOL
 
 \fBpasst-repair\fR(1) connects to \fBpasst\fR(1) using the socket specified via
diff --git a/passt-repair.c b/passt-repair.c
index e0c366e..8bb3f00 100644
--- a/passt-repair.c
+++ b/passt-repair.c
@@ -16,11 +16,14 @@
  * off. Reply by echoing the command. Exit on EOF.
  */
 
+#include <sys/inotify.h>
 #include <sys/prctl.h>
 #include <sys/types.h>
 #include <sys/socket.h>
+#include <sys/stat.h>
 #include <sys/un.h>
 #include <errno.h>
+#include <stdbool.h>
 #include <stddef.h>
 #include <stdio.h>
 #include <stdlib.h>
@@ -39,6 +42,8 @@
 #include "seccomp_repair.h"
 
 #define SCM_MAX_FD 253 /* From Linux kernel (include/net/scm.h), not in UAPI */
+#define REPAIR_EXT		".repair"
+#define REPAIR_EXT_LEN		strlen(REPAIR_EXT)
 
 /**
  * main() - Entry point and whole program with loop
@@ -51,6 +56,9 @@
  * #syscalls:repair socket s390x:socketcall i686:socketcall
  * #syscalls:repair recvfrom recvmsg arm:recv ppc64le:recv
  * #syscalls:repair sendto sendmsg arm:send ppc64le:send
+ * #syscalls:repair stat|statx stat64|statx statx
+ * #syscalls:repair fstat|fstat64 newfstatat|fstatat64
+ * #syscalls:repair inotify_init1 inotify_add_watch
  */
 int main(int argc, char **argv)
 {
@@ -58,12 +66,14 @@ int main(int argc, char **argv)
 	     __attribute__ ((aligned(__alignof__(struct cmsghdr))));
 	struct sockaddr_un a = { AF_UNIX, "" };
 	int fds[SCM_MAX_FD], s, ret, i, n = 0;
+	bool inotify_dir = false;
 	struct sock_fprog prog;
 	int8_t cmd = INT8_MAX;
 	struct cmsghdr *cmsg;
 	struct msghdr msg;
 	struct iovec iov;
 	size_t cmsg_len;
+	struct stat sb;
 	int op;
 
 	prctl(PR_SET_DUMPABLE, 0);
@@ -90,19 +100,77 @@ int main(int argc, char **argv)
 		_exit(2);
 	}
 
-	ret = snprintf(a.sun_path, sizeof(a.sun_path), "%s", argv[1]);
-	if (ret <= 0 || ret >= (int)sizeof(a.sun_path)) {
-		fprintf(stderr, "Invalid socket path: %s\n", argv[1]);
-		_exit(2);
-	}
-
 	if ((s = socket(AF_UNIX, SOCK_STREAM, 0)) < 0) {
 		fprintf(stderr, "Failed to create AF_UNIX socket: %i\n", errno);
 		_exit(1);
 	}
 
-	if (connect(s, (struct sockaddr *)&a, sizeof(a))) {
-		fprintf(stderr, "Failed to connect to %s: %s\n", argv[1],
+	if ((stat(argv[1], &sb))) {
+		fprintf(stderr, "Can't stat() %s: %i\n", argv[1], errno);
+		_exit(1);
+	}
+
+	if ((sb.st_mode & S_IFMT) == S_IFDIR) {
+		char buf[sizeof(struct inotify_event) + NAME_MAX + 1];
+		const struct inotify_event *ev;
+		char path[PATH_MAX + 1];
+		ssize_t n;
+		int fd;
+
+		ev = (struct inotify_event *)buf;
+
+		if ((fd = inotify_init1(IN_CLOEXEC)) < 0) {
+			fprintf(stderr, "inotify_init1: %i\n", errno);
+			_exit(1);
+		}
+
+		if (inotify_add_watch(fd, argv[1], IN_CREATE) < 0) {
+			fprintf(stderr, "inotify_add_watch: %i\n", errno);
+			_exit(1);
+		}
+
+		do {
+			n = read(fd, buf, sizeof(buf));
+			if (n < 0) {
+				fprintf(stderr, "inotify read: %i", errno);
+				_exit(1);
+			}
+
+			if (n < (ssize_t)sizeof(*ev)) {
+				fprintf(stderr, "Short inotify read: %zi", n);
+				_exit(1);
+			}
+		} while (ev->len < REPAIR_EXT_LEN ||
+			 memcmp(ev->name + strlen(ev->name) - REPAIR_EXT_LEN,
+				REPAIR_EXT, REPAIR_EXT_LEN));
+
+		snprintf(path, sizeof(path), "%s/%s", argv[1], ev->name);
+		if ((stat(path, &sb))) {
+			fprintf(stderr, "Can't stat() %s: %i\n", path, errno);
+			_exit(1);
+		}
+
+		ret = snprintf(a.sun_path, sizeof(a.sun_path), path);
+		inotify_dir = true;
+	} else {
+		ret = snprintf(a.sun_path, sizeof(a.sun_path), "%s", argv[1]);
+	}
+
+	if (ret <= 0 || ret >= (int)sizeof(a.sun_path)) {
+		fprintf(stderr, "Invalid socket path");
+		_exit(2);
+	}
+
+	if ((sb.st_mode & S_IFMT) != S_IFSOCK) {
+		fprintf(stderr, "%s is not a socket\n", a.sun_path);
+		_exit(2);
+	}
+
+	while (connect(s, (struct sockaddr *)&a, sizeof(a))) {
+		if (inotify_dir && errno == ECONNREFUSED)
+			continue;
+
+		fprintf(stderr, "Failed to connect to %s: %s\n", a.sun_path,
 			strerror(errno));
 		_exit(1);
 	}

From c8b520c0625b440d0dcd588af085d35cf46aae2c Mon Sep 17 00:00:00 2001
From: Stefano Brivio <sbrivio@redhat.com>
Date: Thu, 6 Mar 2025 20:00:51 +0100
Subject: [PATCH 041/144] flow, repair: Wait for a short while for passt-repair
 to connect

...and time out after that. This will be needed because of an upcoming
change to passt-repair enabling it to start before passt is started,
on both source and target, by means of an inotify watch.

Once the inotify watch triggers, passt-repair will connect right away,
but we have no guarantees that the connection completes before we
start the migration process, so wait for it (for a reasonable amount
of time).

Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
---
 flow.c   | 20 ++++++++++++++++++++
 repair.c | 32 ++++++++++++++++++++++++++++++++
 repair.h |  1 +
 3 files changed, 53 insertions(+)

diff --git a/flow.c b/flow.c
index 749c498..5e64b79 100644
--- a/flow.c
+++ b/flow.c
@@ -911,6 +911,21 @@ static int flow_migrate_source_rollback(struct ctx *c, unsigned bound, int ret)
 	return ret;
 }
 
+/**
+ * flow_migrate_need_repair() - Do we need to set repair mode for any flow?
+ *
+ * Return: true if repair mode is needed, false otherwise
+ */
+static bool flow_migrate_need_repair(void)
+{
+	union flow *flow;
+
+	foreach_established_tcp_flow(flow)
+		return true;
+
+	return false;
+}
+
 /**
  * flow_migrate_repair_all() - Turn repair mode on or off for all flows
  * @c:		Execution context
@@ -966,6 +981,9 @@ int flow_migrate_source_pre(struct ctx *c, const struct migrate_stage *stage,
 	(void)stage;
 	(void)fd;
 
+	if (flow_migrate_need_repair())
+		repair_wait(c);
+
 	if ((rc = flow_migrate_repair_all(c, true)))
 		return -rc;
 
@@ -1083,6 +1101,8 @@ int flow_migrate_target(struct ctx *c, const struct migrate_stage *stage,
 	if (!count)
 		return 0;
 
+	repair_wait(c);
+
 	if ((rc = flow_migrate_repair_all(c, true)))
 		return -rc;
 
diff --git a/repair.c b/repair.c
index 3ee089f..149fe51 100644
--- a/repair.c
+++ b/repair.c
@@ -27,6 +27,10 @@
 
 #define SCM_MAX_FD 253 /* From Linux kernel (include/net/scm.h), not in UAPI */
 
+/* Wait for a while for TCP_REPAIR helper to connect if it's not there yet */
+#define REPAIR_ACCEPT_TIMEOUT_MS	10
+#define REPAIR_ACCEPT_TIMEOUT_US	(REPAIR_ACCEPT_TIMEOUT_MS * 1000)
+
 /* Pending file descriptors for next repair_flush() call, or command change */
 static int repair_fds[SCM_MAX_FD];
 
@@ -138,6 +142,34 @@ void repair_handler(struct ctx *c, uint32_t events)
 	repair_close(c);
 }
 
+/**
+ * repair_wait() - Wait (with timeout) for TCP_REPAIR helper to connect
+ * @c:		Execution context
+ */
+void repair_wait(struct ctx *c)
+{
+	struct timeval tv = { .tv_sec = 0,
+			      .tv_usec = (long)(REPAIR_ACCEPT_TIMEOUT_US) };
+	static_assert(REPAIR_ACCEPT_TIMEOUT_US < 1000 * 1000,
+		      ".tv_usec is greater than 1000 * 1000");
+
+	if (c->fd_repair >= 0 || c->fd_repair_listen == -1)
+		return;
+
+	if (setsockopt(c->fd_repair_listen, SOL_SOCKET, SO_RCVTIMEO,
+		       &tv, sizeof(tv))) {
+		err_perror("Set timeout on TCP_REPAIR listening socket");
+		return;
+	}
+
+	repair_listen_handler(c, EPOLLIN);
+
+	tv.tv_usec = 0;
+	if (setsockopt(c->fd_repair_listen, SOL_SOCKET, SO_RCVTIMEO,
+		       &tv, sizeof(tv)))
+		err_perror("Clear timeout on TCP_REPAIR listening socket");
+}
+
 /**
  * repair_flush() - Flush current set of sockets to helper, with current command
  * @c:		Execution context
diff --git a/repair.h b/repair.h
index de279d6..1d37922 100644
--- a/repair.h
+++ b/repair.h
@@ -10,6 +10,7 @@ void repair_sock_init(const struct ctx *c);
 void repair_listen_handler(struct ctx *c, uint32_t events);
 void repair_handler(struct ctx *c, uint32_t events);
 void repair_close(struct ctx *c);
+void repair_wait(struct ctx *c);
 int repair_flush(struct ctx *c);
 int repair_set(struct ctx *c, int s, int cmd);
 

From bb00a0499fc9130e4b00a88928958b8b094ee2c9 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Wed, 12 Mar 2025 13:18:31 +1100
Subject: [PATCH 042/144] conf: Use the same optstring for passt and pasta
 modes

Currently we rely on detecting our mode first and use different sets of
(single character) options for each.  This means that if you use an option
valid in only one mode in another you'll get the generic usage() message.

We can give more helpful errors with little extra effort by combining all
the options into a single value of the option string and giving bespoke
messages if an option for the wrong mode is used; in fact we already did
this for some single mode options like '-1'.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 conf.c | 20 ++++++++++++++------
 1 file changed, 14 insertions(+), 6 deletions(-)

diff --git a/conf.c b/conf.c
index 065e720..7f20bc8 100644
--- a/conf.c
+++ b/conf.c
@@ -1388,6 +1388,7 @@ void conf(struct ctx *c, int argc, char **argv)
 		{"repair-path",	required_argument,	NULL,		28 },
 		{ 0 },
 	};
+	const char *optstring = "+dqfel:hs:F:I:p:P:m:a:n:M:g:i:o:D:S:H:461t:u:T:U:";
 	const char *logname = (c->mode == MODE_PASTA) ? "pasta" : "passt";
 	char userns[PATH_MAX] = { 0 }, netns[PATH_MAX] = { 0 };
 	bool copy_addrs_opt = false, copy_routes_opt = false;
@@ -1397,7 +1398,6 @@ void conf(struct ctx *c, int argc, char **argv)
 	struct fqdn *dnss = c->dns_search;
 	unsigned int ifi4 = 0, ifi6 = 0;
 	const char *logfile = NULL;
-	const char *optstring;
 	size_t logsize = 0;
 	char *runas = NULL;
 	long fd_tap_opt;
@@ -1408,9 +1408,6 @@ void conf(struct ctx *c, int argc, char **argv)
 	if (c->mode == MODE_PASTA) {
 		c->no_dhcp_dns = c->no_dhcp_dns_search = 1;
 		fwd_default = FWD_AUTO;
-		optstring = "+dqfel:hF:I:p:P:m:a:n:M:g:i:o:D:S:H:46t:u:T:U:";
-	} else {
-		optstring = "+dqfel:hs:F:p:P:m:a:n:M:g:i:o:D:S:H:461t:u:";
 	}
 
 	c->mtu = ROUND_DOWN(ETH_MAX_MTU - ETH_HLEN, sizeof(uint32_t));
@@ -1614,6 +1611,9 @@ void conf(struct ctx *c, int argc, char **argv)
 			c->foreground = 1;
 			break;
 		case 's':
+			if (c->mode == MODE_PASTA)
+				die("-s is for passt / vhost-user mode only");
+
 			ret = snprintf(c->sock_path, sizeof(c->sock_path), "%s",
 				       optarg);
 			if (ret <= 0 || ret >= (int)sizeof(c->sock_path))
@@ -1634,6 +1634,9 @@ void conf(struct ctx *c, int argc, char **argv)
 			*c->sock_path = 0;
 			break;
 		case 'I':
+			if (c->mode != MODE_PASTA)
+				die("-I is for pasta mode only");
+
 			ret = snprintf(c->pasta_ifn, IFNAMSIZ, "%s",
 				       optarg);
 			if (ret <= 0 || ret >= IFNAMSIZ)
@@ -1790,11 +1793,16 @@ void conf(struct ctx *c, int argc, char **argv)
 			break;
 		case 't':
 		case 'u':
-		case 'T':
-		case 'U':
 		case 'D':
 			/* Handle these later, once addresses are configured */
 			break;
+		case 'T':
+		case 'U':
+			if (c->mode != MODE_PASTA)
+				die("-%c is for pasta mode only", name);
+
+			/* Handle properly later, once addresses are configured */
+			break;
 		case 'h':
 			usage(argv[0], stdout, EXIT_SUCCESS);
 			break;

From 4b17d042c7e4f6e5b5a770181e2ebd53ec8e73d4 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Wed, 12 Mar 2025 13:18:32 +1100
Subject: [PATCH 043/144] conf: Move mode detection into helper function

One of the first things we need to do is determine if we're in passt mode
or pasta mode.  Currently this is open-coded in main(), by examining
argv[0].  We want to complexify this a bit in future to cover vhost-user
mode as well.  Prepare for this, by moving the mode detection into a new
conf_mode() function.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 conf.c  | 26 ++++++++++++++++++++++++++
 conf.h  |  1 +
 passt.c | 14 ++------------
 3 files changed, 29 insertions(+), 12 deletions(-)

diff --git a/conf.c b/conf.c
index 7f20bc8..2022ea1 100644
--- a/conf.c
+++ b/conf.c
@@ -991,6 +991,32 @@ pasta_opts:
 	_exit(status);
 }
 
+/**
+ * conf_mode() - Determine passt/pasta's operating mode from command line
+ * @argc:	Argument count
+ * @argv:	Command line arguments
+ *
+ * Return: mode to operate in, PASTA or PASST
+ */
+/* cppcheck-suppress constParameter */
+enum passt_modes conf_mode(int argc, char *argv[])
+{
+	char argv0[PATH_MAX], *basearg0;
+
+	if (argc < 1)
+		die("Cannot determine argv[0]");
+
+	strncpy(argv0, argv[0], PATH_MAX - 1);
+	basearg0 = basename(argv0);
+	if (strstr(basearg0, "pasta"))
+		return MODE_PASTA;
+
+	if (strstr(basearg0, "passt"))
+		return MODE_PASST;
+
+	die("Cannot determine mode, invoke as \"passt\" or \"pasta\"");
+}
+
 /**
  * conf_print() - Print fundamental configuration parameters
  * @c:		Execution context
diff --git a/conf.h b/conf.h
index 9d2143d..b45ad74 100644
--- a/conf.h
+++ b/conf.h
@@ -6,6 +6,7 @@
 #ifndef CONF_H
 #define CONF_H
 
+enum passt_modes conf_mode(int argc, char *argv[]);
 void conf(struct ctx *c, int argc, char **argv);
 
 #endif /* CONF_H */
diff --git a/passt.c b/passt.c
index 868842b..0bd2a29 100644
--- a/passt.c
+++ b/passt.c
@@ -191,7 +191,6 @@ int main(int argc, char **argv)
 {
 	struct epoll_event events[EPOLL_EVENTS];
 	int nfds, i, devnull_fd = -1;
-	char argv0[PATH_MAX], *name;
 	struct ctx c = { 0 };
 	struct rlimit limit;
 	struct timespec now;
@@ -213,21 +212,12 @@ int main(int argc, char **argv)
 	sigaction(SIGTERM, &sa, NULL);
 	sigaction(SIGQUIT, &sa, NULL);
 
-	if (argc < 1)
-		_exit(EXIT_FAILURE);
+	c.mode = conf_mode(argc, argv);
 
-	strncpy(argv0, argv[0], PATH_MAX - 1);
-	name = basename(argv0);
-	if (strstr(name, "pasta")) {
+	if (c.mode == MODE_PASTA) {
 		sa.sa_handler = pasta_child_handler;
 		if (sigaction(SIGCHLD, &sa, NULL))
 			die_perror("Couldn't install signal handlers");
-
-		c.mode = MODE_PASTA;
-	} else if (strstr(name, "passt")) {
-		c.mode = MODE_PASST;
-	} else {
-		_exit(EXIT_FAILURE);
 	}
 
 	if (signal(SIGPIPE, SIG_IGN) == SIG_ERR)

From 74cd82adc87552c7ef6d255069a974b4ebeab4a1 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Wed, 12 Mar 2025 13:18:33 +1100
Subject: [PATCH 044/144] conf: Detect vhost-user mode earlier

We detect our operating mode in conf_mode(), unless we're using vhost-user
mode, in which case we change it later when we parse the --vhost-user
option.  That means we need to delay parsing the --repair-path option (for
vhost-user only) until still later.

However, there are many other places in the main option parsing loop which
also rely on mode.  We get away with those, because they happen to be able
to treat passt and vhost-user modes identically.  This is potentially
confusing, though.  So, move setting of MODE_VU into conf_mode() so
c->mode always has its final value from that point onwards.

To match, we move the parsing of --repair-path back into the main option
parsing loop.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 conf.c | 43 ++++++++++++++++++++++++++-----------------
 1 file changed, 26 insertions(+), 17 deletions(-)

diff --git a/conf.c b/conf.c
index 2022ea1..b58e2a6 100644
--- a/conf.c
+++ b/conf.c
@@ -998,10 +998,23 @@ pasta_opts:
  *
  * Return: mode to operate in, PASTA or PASST
  */
-/* cppcheck-suppress constParameter */
 enum passt_modes conf_mode(int argc, char *argv[])
 {
+	int vhost_user = 0;
+	const struct option optvu[] = {
+		{"vhost-user",	no_argument,		&vhost_user,	1 },
+		{ 0 },
+	};
 	char argv0[PATH_MAX], *basearg0;
+	int name;
+
+	optind = 0;
+	do {
+		name = getopt_long(argc, argv, "-:", optvu, NULL);
+	} while (name != -1);
+
+	if (vhost_user)
+		return MODE_VU;
 
 	if (argc < 1)
 		die("Cannot determine argv[0]");
@@ -1604,9 +1617,8 @@ void conf(struct ctx *c, int argc, char **argv)
 
 			die("Invalid host nameserver address: %s", optarg);
 		case 25:
-			if (c->mode == MODE_PASTA)
-				die("--vhost-user is for passt mode only");
-			c->mode = MODE_VU;
+			/* Already handled in conf_mode() */
+			ASSERT(c->mode == MODE_VU);
 			break;
 		case 26:
 			vu_print_capabilities();
@@ -1617,7 +1629,14 @@ void conf(struct ctx *c, int argc, char **argv)
 				die("Invalid FQDN: %s", optarg);
 			break;
 		case 28:
-			/* Handle this once we checked --vhost-user */
+			if (c->mode != MODE_VU && strcmp(optarg, "none"))
+				die("--repair-path is for vhost-user mode only");
+
+			if (snprintf_check(c->repair_path,
+					   sizeof(c->repair_path), "%s",
+					   optarg))
+				die("Invalid passt-repair path: %s", optarg);
+
 			break;
 		case 'd':
 			c->debug = 1;
@@ -1917,8 +1936,8 @@ void conf(struct ctx *c, int argc, char **argv)
 	if (c->ifi4 && IN4_IS_ADDR_UNSPECIFIED(&c->ip4.guest_gw))
 		c->no_dhcp = 1;
 
-	/* Inbound port options, DNS, and --repair-path can be parsed now, after
-	 * IPv4/IPv6 settings and --vhost-user.
+	/* Inbound port options and DNS can be parsed now, after IPv4/IPv6
+	 * settings
 	 */
 	fwd_probe_ephemeral();
 	udp_portmap_clear();
@@ -1964,16 +1983,6 @@ void conf(struct ctx *c, int argc, char **argv)
 			}
 
 			die("Cannot use DNS address %s", optarg);
-		} else if (name == 28) {
-			if (c->mode != MODE_VU && strcmp(optarg, "none"))
-				die("--repair-path is for vhost-user mode only");
-
-			if (snprintf_check(c->repair_path,
-					   sizeof(c->repair_path), "%s",
-					   optarg))
-				die("Invalid passt-repair path: %s", optarg);
-
-			break;
 		}
 	} while (name != -1);
 

From c43972ad67806fb403cdbc05179441917f2a776b Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Wed, 12 Mar 2025 13:18:34 +1100
Subject: [PATCH 045/144] packet: Give explicit name to maximum packet size

We verify that every packet we store in a pool (and every partial packet
we retrieve from it) has a length no longer than UINT16_MAX.  This
originated in the older packet pool implementation which stored packet
lengths in a uint16_t.  Now that packets are represented by a struct
iovec with its size_t length, this check serves only as a sanity / security
check that we don't have some wildly out of range length due to a bug
elsewhere.

We may have reasons to (slightly) increase this limit in future, so in
preparation, give this quantity an explicit name - PACKET_MAX_LEN.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 packet.c | 4 ++--
 packet.h | 3 +++
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/packet.c b/packet.c
index 0330b54..bcac037 100644
--- a/packet.c
+++ b/packet.c
@@ -83,7 +83,7 @@ void packet_add_do(struct pool *p, size_t len, const char *start,
 	if (packet_check_range(p, start, len, func, line))
 		return;
 
-	if (len > UINT16_MAX) {
+	if (len > PACKET_MAX_LEN) {
 		trace("add packet length %zu, %s:%i", len, func, line);
 		return;
 	}
@@ -119,7 +119,7 @@ void *packet_get_do(const struct pool *p, size_t idx, size_t offset,
 		return NULL;
 	}
 
-	if (len > UINT16_MAX) {
+	if (len > PACKET_MAX_LEN) {
 		if (func) {
 			trace("packet data length %zu, %s:%i",
 			      len, func, line);
diff --git a/packet.h b/packet.h
index bdc07fe..d099f02 100644
--- a/packet.h
+++ b/packet.h
@@ -6,6 +6,9 @@
 #ifndef PACKET_H
 #define PACKET_H
 
+/* Maximum size of a single packet stored in pool, including headers */
+#define PACKET_MAX_LEN	UINT16_MAX
+
 /**
  * struct pool - Generic pool of packets stored in a buffer
  * @buf:	Buffer storing packet descriptors,

From 1eda8de4384a93778a781257781c5b0967c8abfe Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Wed, 12 Mar 2025 13:18:35 +1100
Subject: [PATCH 046/144] packet: Remove redundant TAP_BUF_BYTES define

Currently we define both TAP_BUF_BYTES and PKT_BUF_BYTES as essentially
the same thing.  They'll be different only if TAP_BUF_BYTES is negative,
which makes no sense.  So, remove TAP_BUF_BYTES and just use PKT_BUF_BYTES.

In addition, most places we use this to just mean the size of the main
packet buffer (pkt_buf) for which we can just directly use sizeof.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 passt.c | 2 +-
 passt.h | 5 ++---
 tap.c   | 4 ++--
 3 files changed, 5 insertions(+), 6 deletions(-)

diff --git a/passt.c b/passt.c
index 0bd2a29..cd06772 100644
--- a/passt.c
+++ b/passt.c
@@ -223,7 +223,7 @@ int main(int argc, char **argv)
 	if (signal(SIGPIPE, SIG_IGN) == SIG_ERR)
 		die_perror("Couldn't set disposition for SIGPIPE");
 
-	madvise(pkt_buf, TAP_BUF_BYTES, MADV_HUGEPAGE);
+	madvise(pkt_buf, sizeof(pkt_buf), MADV_HUGEPAGE);
 
 	c.epollfd = epoll_create1(EPOLL_CLOEXEC);
 	if (c.epollfd == -1)
diff --git a/passt.h b/passt.h
index 28d1389..6b24805 100644
--- a/passt.h
+++ b/passt.h
@@ -69,12 +69,11 @@ union epoll_ref {
 static_assert(sizeof(union epoll_ref) <= sizeof(union epoll_data),
 	      "epoll_ref must have same size as epoll_data");
 
-#define TAP_BUF_BYTES							\
+#define PKT_BUF_BYTES							\
 	ROUND_DOWN(((ETH_MAX_MTU + sizeof(uint32_t)) * 128), PAGE_SIZE)
 #define TAP_MSGS							\
-	DIV_ROUND_UP(TAP_BUF_BYTES, ETH_ZLEN - 2 * ETH_ALEN + sizeof(uint32_t))
+	DIV_ROUND_UP(PKT_BUF_BYTES, ETH_ZLEN - 2 * ETH_ALEN + sizeof(uint32_t))
 
-#define PKT_BUF_BYTES		MAX(TAP_BUF_BYTES, 0)
 extern char pkt_buf		[PKT_BUF_BYTES];
 
 extern char *epoll_type_str[];
diff --git a/tap.c b/tap.c
index 4541f51..fb306e7 100644
--- a/tap.c
+++ b/tap.c
@@ -1080,7 +1080,7 @@ static void tap_passt_input(struct ctx *c, const struct timespec *now)
 
 	do {
 		n = recv(c->fd_tap, pkt_buf + partial_len,
-			 TAP_BUF_BYTES - partial_len, MSG_DONTWAIT);
+			 sizeof(pkt_buf) - partial_len, MSG_DONTWAIT);
 	} while ((n < 0) && errno == EINTR);
 
 	if (n < 0) {
@@ -1151,7 +1151,7 @@ static void tap_pasta_input(struct ctx *c, const struct timespec *now)
 
 	tap_flush_pools();
 
-	for (n = 0; n <= (ssize_t)(TAP_BUF_BYTES - ETH_MAX_MTU); n += len) {
+	for (n = 0; n <= (ssize_t)(sizeof(pkt_buf) - ETH_MAX_MTU); n += len) {
 		len = read(c->fd_tap, pkt_buf + n, ETH_MAX_MTU);
 
 		if (len == 0) {

From c4bfa3339cea586172d4b0fcd613b5638498651e Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Wed, 12 Mar 2025 13:18:36 +1100
Subject: [PATCH 047/144] tap: Use explicit defines for maximum length of L2
 frame

Currently in tap.c we (mostly) use ETH_MAX_MTU as the maximum length of
an L2 frame.  This define comes from the kernel, but it's badly named and
used confusingly.

First, it doesn't really have anything to do with Ethernet, which has no
structural limit on frame lengths.  It comes more from either a) IP which
imposes a 64k datagram limit or b) from internal buffers used in various
places in the kernel (and in passt).

Worse, MTU generally means the maximum size of the IP (L3) datagram which
may be transferred, _not_ counting the L2 headers.  In the kernel
ETH_MAX_MTU is sometimes used that way, but sometimes seems to be used as
a maximum frame length, _including_ L2 headers.  In tap.c we're mostly
using it in the second way.

Finally, each of our tap backends could have different limits on the frame
size imposed by the mechanisms they're using.

Start clearing up this confusion by replacing it in tap.c with new
L2_MAX_LEN_* defines which specifically refer to the maximum L2 frame
length for each backend.

Signed-off-by: David Gibson <dgibson@redhat.com>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 tap.c | 23 +++++++++++++++++++----
 tap.h | 25 +++++++++++++++++++++++++
 2 files changed, 44 insertions(+), 4 deletions(-)

diff --git a/tap.c b/tap.c
index fb306e7..ede547c 100644
--- a/tap.c
+++ b/tap.c
@@ -62,6 +62,19 @@
 #include "vhost_user.h"
 #include "vu_common.h"
 
+/* Maximum allowed frame lengths (including L2 header) */
+
+/* Verify that an L2 frame length limit is large enough to contain the header,
+ * but small enough to fit in the packet pool
+ */
+#define CHECK_FRAME_LEN(len) \
+	static_assert((len) >= ETH_HLEN && (len) <= PACKET_MAX_LEN,	\
+		      #len " has bad value")
+
+CHECK_FRAME_LEN(L2_MAX_LEN_PASTA);
+CHECK_FRAME_LEN(L2_MAX_LEN_PASST);
+CHECK_FRAME_LEN(L2_MAX_LEN_VU);
+
 /* IPv4 (plus ARP) and IPv6 message batches from tap/guest to IP handlers */
 static PACKET_POOL_NOINIT(pool_tap4, TAP_MSGS, pkt_buf);
 static PACKET_POOL_NOINIT(pool_tap6, TAP_MSGS, pkt_buf);
@@ -1097,7 +1110,7 @@ static void tap_passt_input(struct ctx *c, const struct timespec *now)
 	while (n >= (ssize_t)sizeof(uint32_t)) {
 		uint32_t l2len = ntohl_unaligned(p);
 
-		if (l2len < sizeof(struct ethhdr) || l2len > ETH_MAX_MTU) {
+		if (l2len < sizeof(struct ethhdr) || l2len > L2_MAX_LEN_PASST) {
 			err("Bad frame size from guest, resetting connection");
 			tap_sock_reset(c);
 			return;
@@ -1151,8 +1164,10 @@ static void tap_pasta_input(struct ctx *c, const struct timespec *now)
 
 	tap_flush_pools();
 
-	for (n = 0; n <= (ssize_t)(sizeof(pkt_buf) - ETH_MAX_MTU); n += len) {
-		len = read(c->fd_tap, pkt_buf + n, ETH_MAX_MTU);
+	for (n = 0;
+	     n <= (ssize_t)(sizeof(pkt_buf) - L2_MAX_LEN_PASTA);
+	     n += len) {
+		len = read(c->fd_tap, pkt_buf + n, L2_MAX_LEN_PASTA);
 
 		if (len == 0) {
 			die("EOF on tap device, exiting");
@@ -1170,7 +1185,7 @@ static void tap_pasta_input(struct ctx *c, const struct timespec *now)
 
 		/* Ignore frames of bad length */
 		if (len < (ssize_t)sizeof(struct ethhdr) ||
-		    len > (ssize_t)ETH_MAX_MTU)
+		    len > (ssize_t)L2_MAX_LEN_PASTA)
 			continue;
 
 		tap_add_packet(c, len, pkt_buf + n);
diff --git a/tap.h b/tap.h
index a2c3b87..84e9fdb 100644
--- a/tap.h
+++ b/tap.h
@@ -6,6 +6,31 @@
 #ifndef TAP_H
 #define TAP_H
 
+/** L2_MAX_LEN_PASTA - Maximum frame length for pasta mode (with L2 header)
+ *
+ * The kernel tuntap device imposes a maximum frame size of 65535 including
+ * 'hard_header_len' (14 bytes for L2 Ethernet in the case of "tap" mode).
+ */
+#define L2_MAX_LEN_PASTA	USHRT_MAX
+
+/** L2_MAX_LEN_PASST - Maximum frame length for passt mode (with L2 header)
+ *
+ * The only structural limit the QEMU socket protocol imposes on frames is
+ * (2^32-1) bytes, but that would be ludicrously long in practice.  For now,
+ * limit it somewhat arbitrarily to 65535 bytes.  FIXME: Work out an appropriate
+ * limit with more precision.
+ */
+#define L2_MAX_LEN_PASST	USHRT_MAX
+
+/** L2_MAX_LEN_VU - Maximum frame length for vhost-user mode (with L2 header)
+ *
+ * vhost-user allows multiple buffers per frame, each of which can be quite
+ * large, so the inherent frame size limit is rather large.  Much larger than is
+ * actually useful for IP.  For now limit arbitrarily to 65535 bytes. FIXME:
+ * Work out an appropriate limit with more precision.
+ */
+#define L2_MAX_LEN_VU		USHRT_MAX
+
 struct udphdr;
 
 /**

From b6945e055376be944867479dcd8deb77e47b1fa4 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Wed, 12 Mar 2025 13:18:37 +1100
Subject: [PATCH 048/144] Simplify sizing of pkt_buf

We define the size of pkt_buf as large enough to hold 128 maximum size
packets.  Well, approximately, since we round down to the page size.  We
don't have any specific reliance on how many packets can fit in the buffer,
we just want it to be big enough to allow reasonable batching.  The
current definition relies on the confusingly named ETH_MAX_MTU and adds
in sizeof(uint32_t) rather non-obviously for the pseudo-physical header
used by the qemu socket (passt mode) protocol.

Instead, just define it to be 8MiB, which is what that complex calculation
works out to.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 passt.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/passt.h b/passt.h
index 6b24805..8f45091 100644
--- a/passt.h
+++ b/passt.h
@@ -69,8 +69,8 @@ union epoll_ref {
 static_assert(sizeof(union epoll_ref) <= sizeof(union epoll_data),
 	      "epoll_ref must have same size as epoll_data");
 
-#define PKT_BUF_BYTES							\
-	ROUND_DOWN(((ETH_MAX_MTU + sizeof(uint32_t)) * 128), PAGE_SIZE)
+/* Large enough for ~128 maximum size frames */
+#define PKT_BUF_BYTES		(8UL << 20)
 #define TAP_MSGS							\
 	DIV_ROUND_UP(PKT_BUF_BYTES, ETH_ZLEN - 2 * ETH_ALEN + sizeof(uint32_t))
 

From 9d1a6b3eba9e6e5c4db4bfa0e395edc45ca6c39d Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Wed, 12 Mar 2025 13:18:38 +1100
Subject: [PATCH 049/144] pcap: Correctly set snaplen based on tap backend type

The pcap header includes a value indicating how much of each frame is
captured.  We always capture the entire frame, so we want to set this to
the maximum possible frame size.  Currently we do that by setting it to
ETH_MAX_MTU, but that's a confusingly named constant which might not always
be correct depending on the details of our tap backend.

Instead add a tap_l2_max_len() function that explicitly returns the maximum
frame size for the current mode and use that to set snaplen.  While we're
there, there's no particular need for the pcap header to be defined in a
global; make it local to pcap_init() instead.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 pcap.c | 46 ++++++++++++++++++++++++----------------------
 tap.c  | 19 +++++++++++++++++++
 tap.h  |  1 +
 3 files changed, 44 insertions(+), 22 deletions(-)

diff --git a/pcap.c b/pcap.c
index 3d623cf..e95aa6f 100644
--- a/pcap.c
+++ b/pcap.c
@@ -33,33 +33,12 @@
 #include "log.h"
 #include "pcap.h"
 #include "iov.h"
+#include "tap.h"
 
 #define PCAP_VERSION_MINOR 4
 
 static int pcap_fd = -1;
 
-/* See pcap.h from libpcap, or pcap-savefile(5) */
-static const struct {
-	uint32_t magic;
-#define PCAP_MAGIC		0xa1b2c3d4
-
-	uint16_t major;
-#define PCAP_VERSION_MAJOR	2
-
-	uint16_t minor;
-#define PCAP_VERSION_MINOR	4
-
-	int32_t thiszone;
-	uint32_t sigfigs;
-	uint32_t snaplen;
-
-	uint32_t linktype;
-#define PCAP_LINKTYPE_ETHERNET	1
-} pcap_hdr = {
-	PCAP_MAGIC, PCAP_VERSION_MAJOR, PCAP_VERSION_MINOR, 0, 0, ETH_MAX_MTU,
-	PCAP_LINKTYPE_ETHERNET
-};
-
 struct pcap_pkthdr {
 	uint32_t tv_sec;
 	uint32_t tv_usec;
@@ -162,6 +141,29 @@ void pcap_iov(const struct iovec *iov, size_t iovcnt, size_t offset)
  */
 void pcap_init(struct ctx *c)
 {
+	/* See pcap.h from libpcap, or pcap-savefile(5) */
+#define PCAP_MAGIC		0xa1b2c3d4
+#define PCAP_VERSION_MAJOR	2
+#define PCAP_VERSION_MINOR	4
+#define PCAP_LINKTYPE_ETHERNET	1
+	const struct {
+		uint32_t magic;
+		uint16_t major;
+		uint16_t minor;
+
+		int32_t thiszone;
+		uint32_t sigfigs;
+		uint32_t snaplen;
+
+		uint32_t linktype;
+	} pcap_hdr = {
+		.magic = PCAP_MAGIC,
+		.major = PCAP_VERSION_MAJOR,
+		.minor = PCAP_VERSION_MINOR,
+		.snaplen = tap_l2_max_len(c),
+		.linktype = PCAP_LINKTYPE_ETHERNET
+	};
+
 	if (pcap_fd != -1)
 		return;
 
diff --git a/tap.c b/tap.c
index ede547c..182a115 100644
--- a/tap.c
+++ b/tap.c
@@ -82,6 +82,25 @@ static PACKET_POOL_NOINIT(pool_tap6, TAP_MSGS, pkt_buf);
 #define TAP_SEQS		128 /* Different L4 tuples in one batch */
 #define FRAGMENT_MSG_RATE	10  /* # seconds between fragment warnings */
 
+/**
+ * tap_l2_max_len() - Maximum frame size (including L2 header) for current mode
+ * @c:		Execution context
+ */
+unsigned long tap_l2_max_len(const struct ctx *c)
+{
+	/* NOLINTBEGIN(bugprone-branch-clone): values can be the same */
+	switch (c->mode) {
+	case MODE_PASST:
+		return L2_MAX_LEN_PASST;
+	case MODE_PASTA:
+		return L2_MAX_LEN_PASTA;
+	case MODE_VU:
+		return L2_MAX_LEN_VU;
+	}
+	/* NOLINTEND(bugprone-branch-clone) */
+	ASSERT(0);
+}
+
 /**
  * tap_send_single() - Send a single frame
  * @c:		Execution context
diff --git a/tap.h b/tap.h
index 84e9fdb..dd39fd8 100644
--- a/tap.h
+++ b/tap.h
@@ -69,6 +69,7 @@ static inline void tap_hdr_update(struct tap_hdr *thdr, size_t l2len)
 		thdr->vnet_len = htonl(l2len);
 }
 
+unsigned long tap_l2_max_len(const struct ctx *c);
 void *tap_push_l2h(const struct ctx *c, void *buf, uint16_t proto);
 void *tap_push_ip4h(struct iphdr *ip4h, struct in_addr src,
 		     struct in_addr dst, size_t l4len, uint8_t proto);

From 26df8a3608e7b006c00f44a9029bcadb6d5e4153 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Wed, 12 Mar 2025 13:18:39 +1100
Subject: [PATCH 050/144] conf: Limit maximum MTU based on backend frame size

The -m option controls the MTU, that is the maximum transmissible L3
datagram, not including L2 headers.  We currently limit it to ETH_MAX_MTU
which sounds like it makes sense.  But ETH_MAX_MTU is confusing: it's not
consistently used as to whether it means the maximum L3 datagram size or
the maximum L2 frame size.  Even within conf() we explicitly account for
the L2 header size when computing the default --mtu value, but not when
we compute the maximum --mtu value.

Clean this up by reworking the maximum MTU computation to be the minimum of
IP_MAX_MTU (65535) and the maximum sized IP datagram which can fit into
our L2 frames when we account for the L2 header.  The latter can vary
depending on our tap backend, although it doesn't right now.

Link: https://bugs.passt.top/show_bug.cgi?id=66
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 conf.c | 11 +++++++----
 util.h |  3 ---
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/conf.c b/conf.c
index b58e2a6..c760f79 100644
--- a/conf.c
+++ b/conf.c
@@ -1434,6 +1434,7 @@ void conf(struct ctx *c, int argc, char **argv)
 	enum fwd_ports_mode fwd_default = FWD_NONE;
 	bool v4_only = false, v6_only = false;
 	unsigned dns4_idx = 0, dns6_idx = 0;
+	unsigned long max_mtu = IP_MAX_MTU;
 	struct fqdn *dnss = c->dns_search;
 	unsigned int ifi4 = 0, ifi6 = 0;
 	const char *logfile = NULL;
@@ -1449,7 +1450,9 @@ void conf(struct ctx *c, int argc, char **argv)
 		fwd_default = FWD_AUTO;
 	}
 
-	c->mtu = ROUND_DOWN(ETH_MAX_MTU - ETH_HLEN, sizeof(uint32_t));
+	if (tap_l2_max_len(c) - ETH_HLEN < max_mtu)
+		max_mtu = tap_l2_max_len(c) - ETH_HLEN;
+	c->mtu = ROUND_DOWN(max_mtu, sizeof(uint32_t));
 	c->tcp.fwd_in.mode = c->tcp.fwd_out.mode = FWD_UNSET;
 	c->udp.fwd_in.mode = c->udp.fwd_out.mode = FWD_UNSET;
 	memcpy(c->our_tap_mac, MAC_OUR_LAA, ETH_ALEN);
@@ -1711,9 +1714,9 @@ void conf(struct ctx *c, int argc, char **argv)
 			if (errno || *e)
 				die("Invalid MTU: %s", optarg);
 
-			if (mtu > ETH_MAX_MTU) {
-				die("MTU %lu too large (max %u)",
-				    mtu, ETH_MAX_MTU);
+			if (mtu > max_mtu) {
+				die("MTU %lu too large (max %lu)",
+				    mtu, max_mtu);
 			}
 
 			c->mtu = mtu;
diff --git a/util.h b/util.h
index 0f70f4d..4d512fa 100644
--- a/util.h
+++ b/util.h
@@ -31,9 +31,6 @@
 #ifndef SECCOMP_RET_KILL_PROCESS
 #define SECCOMP_RET_KILL_PROCESS	SECCOMP_RET_KILL
 #endif
-#ifndef ETH_MAX_MTU
-#define ETH_MAX_MTU			USHRT_MAX
-#endif
 #ifndef IP_MAX_MTU
 #define IP_MAX_MTU			USHRT_MAX
 #endif

From 78f1f0fdfc1831f2ca3a65c2cee98c44ff3c30ab Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Wed, 12 Mar 2025 16:26:57 +1100
Subject: [PATCH 051/144] test/perf: Simplify iperf3 server lifetime management

After we start the iperf3 server in the background, we have a sleep to
make sure it's ready to receive connections.  We can simplify this slightly
by using the -D option to have iperf3 background itself rather than
backgrounding it manually.  That won't return until the server is ready to
use.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 test/lib/test | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/test/lib/test b/test/lib/test
index 758250a..7349674 100755
--- a/test/lib/test
+++ b/test/lib/test
@@ -20,10 +20,7 @@ test_iperf3s() {
 	__sctx="${1}"
 	__port="${2}"
 
-	pane_or_context_run_bg "${__sctx}" 				\
-		 'iperf3 -s -p'${__port}' & echo $! > s.pid'		\
-
-	sleep 1		# Wait for server to be ready
+	pane_or_context_run "${__sctx}" 'iperf3 -s -p'${__port}' -D -I s.pid'
 }
 
 # test_iperf3k() - Kill iperf3 server
@@ -31,7 +28,7 @@ test_iperf3s() {
 test_iperf3k() {
 	__sctx="${1}"
 
-	pane_or_context_run "${__sctx}" 'kill -INT $(cat s.pid); rm s.pid'
+	pane_or_context_run "${__sctx}" 'kill -INT $(cat s.pid)'
 
 	sleep 1		# Wait for kernel to free up ports
 }

From 96fe5548cb16fe2664ad121c2976048ccad6a1ab Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Wed, 12 Mar 2025 14:43:59 +1100
Subject: [PATCH 052/144] conf: Unify several paths in conf_ports()

In conf_ports() we have three different paths which actually do the setup
of an individual forwarded port: one for the "all" case, one for the
exclusions only case and one for the range of ports with possible
exclusions case.

We can unify those cases using a new helper which handles a single range
of ports, with a bitmap of exclusions.  Although this is slightly longer
(largely due to the new helper's function comment), it reduces duplicated
logic.  It will also make future improvements to the tracking of port
forwards easier.

The new conf_ports_range_except() function has a pretty prodigious
parameter list, but I still think it's an overall improvement in conceptual
complexity.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 conf.c | 173 ++++++++++++++++++++++++++++++---------------------------
 1 file changed, 90 insertions(+), 83 deletions(-)

diff --git a/conf.c b/conf.c
index c760f79..0e2e8dc 100644
--- a/conf.c
+++ b/conf.c
@@ -123,6 +123,75 @@ static int parse_port_range(const char *s, char **endptr,
 	return 0;
 }
 
+/**
+ * conf_ports_range_except() - Set up forwarding for a range of ports minus a
+ *                             bitmap of exclusions
+ * @c:		Execution context
+ * @optname:	Short option name, t, T, u, or U
+ * @optarg:	Option argument (port specification)
+ * @fwd:	Pointer to @fwd_ports to be updated
+ * @addr:	Listening address
+ * @ifname:	Listening interface
+ * @first:	First port to forward
+ * @last:	Last port to forward
+ * @exclude:	Bitmap of ports to exclude
+ * @to:		Port to translate @first to when forwarding
+ * @weak:	Ignore errors, as long as at least one port is mapped
+ */
+static void conf_ports_range_except(const struct ctx *c, char optname,
+				    const char *optarg, struct fwd_ports *fwd,
+				    const union inany_addr *addr,
+				    const char *ifname,
+				    uint16_t first, uint16_t last,
+				    const uint8_t *exclude, uint16_t to,
+				    bool weak)
+{
+	bool bound_one = false;
+	unsigned i;
+	int ret;
+
+	if (first == 0) {
+		die("Can't forward port 0 for option '-%c %s'",
+		    optname, optarg);
+	}
+
+	for (i = first; i <= last; i++) {
+		if (bitmap_isset(exclude, i))
+			continue;
+
+		if (bitmap_isset(fwd->map, i)) {
+			warn(
+"Altering mapping of already mapped port number: %s", optarg);
+		}
+
+		bitmap_set(fwd->map, i);
+		fwd->delta[i] = to - first;
+
+		if (optname == 't')
+			ret = tcp_sock_init(c, addr, ifname, i);
+		else if (optname == 'u')
+			ret = udp_sock_init(c, 0, addr, ifname, i);
+		else
+			/* No way to check in advance for -T and -U */
+			ret = 0;
+
+		if (ret == -ENFILE || ret == -EMFILE) {
+			die("Can't open enough sockets for port specifier: %s",
+			    optarg);
+		}
+
+		if (!ret) {
+			bound_one = true;
+		} else if (!weak) {
+			die("Failed to bind port %u (%s) for option '-%c %s'",
+			    i, strerror_(-ret), optname, optarg);
+		}
+	}
+
+	if (!bound_one)
+		die("Failed to bind any port for '-%c %s'", optname, optarg);
+}
+
 /**
  * conf_ports() - Parse port configuration options, initialise UDP/TCP sockets
  * @c:		Execution context
@@ -135,10 +204,9 @@ static void conf_ports(const struct ctx *c, char optname, const char *optarg,
 {
 	union inany_addr addr_buf = inany_any6, *addr = &addr_buf;
 	char buf[BUFSIZ], *spec, *ifname = NULL, *p;
-	bool exclude_only = true, bound_one = false;
 	uint8_t exclude[PORT_BITMAP_SIZE] = { 0 };
+	bool exclude_only = true;
 	unsigned i;
-	int ret;
 
 	if (!strcmp(optarg, "none")) {
 		if (fwd->mode)
@@ -173,32 +241,15 @@ static void conf_ports(const struct ctx *c, char optname, const char *optarg,
 
 		fwd->mode = FWD_ALL;
 
-		/* Skip port 0.  It has special meaning for many socket APIs, so
-		 * trying to bind it is not really safe.
-		 */
-		for (i = 1; i < NUM_PORTS; i++) {
+		/* Exclude ephemeral ports */
+		for (i = 0; i < NUM_PORTS; i++)
 			if (fwd_port_is_ephemeral(i))
-				continue;
-
-			bitmap_set(fwd->map, i);
-			if (optname == 't') {
-				ret = tcp_sock_init(c, NULL, NULL, i);
-				if (ret == -ENFILE || ret == -EMFILE)
-					goto enfile;
-				if (!ret)
-					bound_one = true;
-			} else if (optname == 'u') {
-				ret = udp_sock_init(c, 0, NULL, NULL, i);
-				if (ret == -ENFILE || ret == -EMFILE)
-					goto enfile;
-				if (!ret)
-					bound_one = true;
-			}
-		}
-
-		if (!bound_one)
-			goto bind_all_fail;
+				bitmap_set(exclude, i);
 
+		conf_ports_range_except(c, optname, optarg, fwd,
+					NULL, NULL,
+					1, NUM_PORTS - 1, exclude,
+					1, true);
 		return;
 	}
 
@@ -275,37 +326,15 @@ static void conf_ports(const struct ctx *c, char optname, const char *optarg,
 	} while ((p = next_chunk(p, ',')));
 
 	if (exclude_only) {
-		/* Skip port 0.  It has special meaning for many socket APIs, so
-		 * trying to bind it is not really safe.
-		 */
-		for (i = 1; i < NUM_PORTS; i++) {
-			if (fwd_port_is_ephemeral(i) ||
-			    bitmap_isset(exclude, i))
-				continue;
-
-			bitmap_set(fwd->map, i);
-
-			if (optname == 't') {
-				ret = tcp_sock_init(c, addr, ifname, i);
-				if (ret == -ENFILE || ret == -EMFILE)
-					goto enfile;
-				if (!ret)
-					bound_one = true;
-			} else if (optname == 'u') {
-				ret = udp_sock_init(c, 0, addr, ifname, i);
-				if (ret == -ENFILE || ret == -EMFILE)
-					goto enfile;
-				if (!ret)
-					bound_one = true;
-			} else {
-				/* No way to check in advance for -T and -U */
-				bound_one = true;
-			}
-		}
-
-		if (!bound_one)
-			goto bind_all_fail;
+		/* Exclude ephemeral ports */
+		for (i = 0; i < NUM_PORTS; i++)
+			if (fwd_port_is_ephemeral(i))
+				bitmap_set(exclude, i);
 
+		conf_ports_range_except(c, optname, optarg, fwd,
+					addr, ifname,
+					1, NUM_PORTS - 1, exclude,
+					1, true);
 		return;
 	}
 
@@ -334,40 +363,18 @@ static void conf_ports(const struct ctx *c, char optname, const char *optarg,
 		if ((*p != '\0')  && (*p != ',')) /* Garbage after the ranges */
 			goto bad;
 
-		for (i = orig_range.first; i <= orig_range.last; i++) {
-			if (bitmap_isset(fwd->map, i))
-				warn(
-"Altering mapping of already mapped port number: %s", optarg);
-
-			if (bitmap_isset(exclude, i))
-				continue;
-
-			bitmap_set(fwd->map, i);
-
-			fwd->delta[i] = mapped_range.first - orig_range.first;
-
-			ret = 0;
-			if (optname == 't')
-				ret = tcp_sock_init(c, addr, ifname, i);
-			else if (optname == 'u')
-				ret = udp_sock_init(c, 0, addr, ifname, i);
-			if (ret)
-				goto bind_fail;
-		}
+		conf_ports_range_except(c, optname, optarg, fwd,
+					addr, ifname,
+					orig_range.first, orig_range.last,
+					exclude,
+					mapped_range.first, false);
 	} while ((p = next_chunk(p, ',')));
 
 	return;
-enfile:
-	die("Can't open enough sockets for port specifier: %s", optarg);
 bad:
 	die("Invalid port specifier %s", optarg);
 mode_conflict:
 	die("Port forwarding mode '%s' conflicts with previous mode", optarg);
-bind_fail:
-	die("Failed to bind port %u (%s) for option '-%c %s', exiting",
-	    i, strerror_(-ret), optname, optarg);
-bind_all_fail:
-	die("Failed to bind any port for '-%c %s', exiting", optname, optarg);
 }
 
 /**

From cb5b593563402680bee850245667f2e71b0d1bda Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Thu, 13 Mar 2025 13:56:17 +1100
Subject: [PATCH 053/144] tcp, flow: Better use flow specific logging helpers

A number of places in the TCP code use general logging functions, instead
of the flow specific ones.  That includes a few older ones as well as many
places in the new migration code.  Thus they either don't identify which
flow the problem happened on, or identify it in a non-standard way.

Convert many of these to use the existing flow specific helpers.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 flow.c         |  16 ++--
 tcp.c          | 252 +++++++++++++++++++++++++++----------------------
 tcp.h          |   1 -
 tcp_buf.c      |   4 +-
 tcp_internal.h |   1 +
 tcp_vu.c       |   2 +-
 6 files changed, 149 insertions(+), 127 deletions(-)

diff --git a/flow.c b/flow.c
index 5e64b79..8622242 100644
--- a/flow.c
+++ b/flow.c
@@ -1037,8 +1037,8 @@ int flow_migrate_source(struct ctx *c, const struct migrate_stage *stage,
 	foreach_established_tcp_flow(flow) {
 		rc = tcp_flow_migrate_source(fd, &flow->tcp);
 		if (rc) {
-			err("Can't send data, flow %u: %s", FLOW_IDX(flow),
-			    strerror_(-rc));
+			flow_err(flow, "Can't send data: %s",
+				 strerror_(-rc));
 			if (!first)
 				die("Inconsistent migration state, exiting");
 
@@ -1064,8 +1064,8 @@ int flow_migrate_source(struct ctx *c, const struct migrate_stage *stage,
 	foreach_established_tcp_flow(flow) {
 		rc = tcp_flow_migrate_source_ext(fd, &flow->tcp);
 		if (rc) {
-			err("Extended data for flow %u: %s", FLOW_IDX(flow),
-			    strerror_(-rc));
+			flow_err(flow, "Can't send extended data: %s",
+				 strerror_(-rc));
 
 			if (rc == -EIO)
 				die("Inconsistent migration state, exiting");
@@ -1112,8 +1112,8 @@ int flow_migrate_target(struct ctx *c, const struct migrate_stage *stage,
 	for (i = 0; i < count; i++) {
 		rc = tcp_flow_migrate_target(c, fd);
 		if (rc) {
-			debug("Migration data failure at flow %u: %s, abort",
-			      i, strerror_(-rc));
+			flow_dbg(FLOW(i), "Migration data failure, abort: %s",
+				 strerror_(-rc));
 			return -rc;
 		}
 	}
@@ -1123,8 +1123,8 @@ int flow_migrate_target(struct ctx *c, const struct migrate_stage *stage,
 	for (i = 0; i < count; i++) {
 		rc = tcp_flow_migrate_target_ext(c, &flowtab[i].tcp, fd);
 		if (rc) {
-			debug("Migration data failure at flow %u: %s, abort",
-			      i, strerror_(-rc));
+			flow_dbg(FLOW(i), "Migration data failure, abort: %s",
+				 strerror_(-rc));
 			return -rc;
 		}
 	}
diff --git a/tcp.c b/tcp.c
index 32a08bd..a4c840e 100644
--- a/tcp.c
+++ b/tcp.c
@@ -434,19 +434,20 @@ static struct tcp_tap_conn *conn_at_sidx(flow_sidx_t sidx)
 }
 
 /**
- * tcp_set_peek_offset() - Set SO_PEEK_OFF offset on a socket if supported
- * @s:          Socket to update
+ * tcp_set_peek_offset() - Set SO_PEEK_OFF offset on connection if supported
+ * @conn:	Pointer to the TCP connection structure
  * @offset:     Offset in bytes
  *
  * Return:      -1 when it fails, 0 otherwise.
  */
-int tcp_set_peek_offset(int s, int offset)
+int tcp_set_peek_offset(const struct tcp_tap_conn *conn, int offset)
 {
 	if (!peek_offset_cap)
 		return 0;
 
-	if (setsockopt(s, SOL_SOCKET, SO_PEEK_OFF, &offset, sizeof(offset))) {
-		err("Failed to set SO_PEEK_OFF to %i in socket %i", offset, s);
+	if (setsockopt(conn->sock, SOL_SOCKET, SO_PEEK_OFF,
+		       &offset, sizeof(offset))) {
+		flow_perror(conn, "Failed to set SO_PEEK_OFF to %i", offset);
 		return -1;
 	}
 	return 0;
@@ -1757,7 +1758,7 @@ static int tcp_data_from_tap(const struct ctx *c, struct tcp_tap_conn *conn,
 			   "fast re-transmit, ACK: %u, previous sequence: %u",
 			   max_ack_seq, conn->seq_to_tap);
 		conn->seq_to_tap = max_ack_seq;
-		if (tcp_set_peek_offset(conn->sock, 0)) {
+		if (tcp_set_peek_offset(conn, 0)) {
 			tcp_rst(c, conn);
 			return -1;
 		}
@@ -1854,7 +1855,7 @@ static void tcp_conn_from_sock_finish(const struct ctx *c,
 	conn->seq_ack_to_tap = conn->seq_from_tap;
 
 	conn_event(c, conn, ESTABLISHED);
-	if (tcp_set_peek_offset(conn->sock, 0)) {
+	if (tcp_set_peek_offset(conn, 0)) {
 		tcp_rst(c, conn);
 		return;
 	}
@@ -2022,7 +2023,7 @@ int tcp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af,
 			goto reset;
 
 		conn_event(c, conn, ESTABLISHED);
-		if (tcp_set_peek_offset(conn->sock, 0))
+		if (tcp_set_peek_offset(conn, 0))
 			goto reset;
 
 		if (th->fin) {
@@ -2286,7 +2287,7 @@ void tcp_timer_handler(const struct ctx *c, union epoll_ref ref)
 			conn->seq_to_tap = conn->seq_ack_from_tap;
 			if (!conn->wnd_from_tap)
 				conn->wnd_from_tap = 1; /* Zero-window probe */
-			if (tcp_set_peek_offset(conn->sock, 0)) {
+			if (tcp_set_peek_offset(conn, 0)) {
 				tcp_rst(c, conn);
 			} else {
 				tcp_data_from_sock(c, conn);
@@ -2810,20 +2811,21 @@ int tcp_flow_repair_off(struct ctx *c, const struct tcp_tap_conn *conn)
 
 /**
  * tcp_flow_dump_tinfo() - Dump window scale, tcpi_state, tcpi_options
- * @c:		Execution context
+ * @conn:	Pointer to the TCP connection structure
  * @t:		Extended migration data
  *
  * Return: 0 on success, negative error code on failure
  */
-static int tcp_flow_dump_tinfo(int s, struct tcp_tap_transfer_ext *t)
+static int tcp_flow_dump_tinfo(const struct tcp_tap_conn *conn,
+			       struct tcp_tap_transfer_ext *t)
 {
 	struct tcp_info tinfo;
 	socklen_t sl;
 
 	sl = sizeof(tinfo);
-	if (getsockopt(s, SOL_TCP, TCP_INFO, &tinfo, &sl)) {
+	if (getsockopt(conn->sock, SOL_TCP, TCP_INFO, &tinfo, &sl)) {
 		int rc = -errno;
-		err_perror("Querying TCP_INFO, socket %i", s);
+		flow_perror(conn, "Querying TCP_INFO");
 		return rc;
 	}
 
@@ -2837,18 +2839,19 @@ static int tcp_flow_dump_tinfo(int s, struct tcp_tap_transfer_ext *t)
 
 /**
  * tcp_flow_dump_mss() - Dump MSS clamp (not current MSS) via TCP_MAXSEG
- * @c:		Execution context
+ * @conn:	Pointer to the TCP connection structure
  * @t:		Extended migration data
  *
  * Return: 0 on success, negative error code on failure
  */
-static int tcp_flow_dump_mss(int s, struct tcp_tap_transfer_ext *t)
+static int tcp_flow_dump_mss(const struct tcp_tap_conn *conn,
+			     struct tcp_tap_transfer_ext *t)
 {
 	socklen_t sl = sizeof(t->mss);
 
-	if (getsockopt(s, SOL_TCP, TCP_MAXSEG, &t->mss, &sl)) {
+	if (getsockopt(conn->sock, SOL_TCP, TCP_MAXSEG, &t->mss, &sl)) {
 		int rc = -errno;
-		err_perror("Getting MSS, socket %i", s);
+		flow_perror(conn, "Getting MSS");
 		return rc;
 	}
 
@@ -2857,19 +2860,20 @@ static int tcp_flow_dump_mss(int s, struct tcp_tap_transfer_ext *t)
 
 /**
  * tcp_flow_dump_wnd() - Dump current tcp_repair_window parameters
- * @c:		Execution context
+ * @conn:	Pointer to the TCP connection structure
  * @t:		Extended migration data
  *
  * Return: 0 on success, negative error code on failure
  */
-static int tcp_flow_dump_wnd(int s, struct tcp_tap_transfer_ext *t)
+static int tcp_flow_dump_wnd(const struct tcp_tap_conn *conn,
+			     struct tcp_tap_transfer_ext *t)
 {
 	struct tcp_repair_window wnd;
 	socklen_t sl = sizeof(wnd);
 
-	if (getsockopt(s, IPPROTO_TCP, TCP_REPAIR_WINDOW, &wnd, &sl)) {
+	if (getsockopt(conn->sock, IPPROTO_TCP, TCP_REPAIR_WINDOW, &wnd, &sl)) {
 		int rc = -errno;
-		err_perror("Getting window repair data, socket %i", s);
+		flow_perror(conn, "Getting window repair data");
 		return rc;
 	}
 
@@ -2893,12 +2897,13 @@ static int tcp_flow_dump_wnd(int s, struct tcp_tap_transfer_ext *t)
 
 /**
  * tcp_flow_repair_wnd() - Restore window parameters from extended data
- * @c:		Execution context
+ * @conn:	Pointer to the TCP connection structure
  * @t:		Extended migration data
  *
  * Return: 0 on success, negative error code on failure
  */
-static int tcp_flow_repair_wnd(int s, const struct tcp_tap_transfer_ext *t)
+static int tcp_flow_repair_wnd(const struct tcp_tap_conn *conn,
+			       const struct tcp_tap_transfer_ext *t)
 {
 	struct tcp_repair_window wnd;
 
@@ -2908,9 +2913,10 @@ static int tcp_flow_repair_wnd(int s, const struct tcp_tap_transfer_ext *t)
 	wnd.rcv_wnd	= t->rcv_wnd;
 	wnd.rcv_wup	= t->rcv_wup;
 
-	if (setsockopt(s, IPPROTO_TCP, TCP_REPAIR_WINDOW, &wnd, sizeof(wnd))) {
+	if (setsockopt(conn->sock, IPPROTO_TCP, TCP_REPAIR_WINDOW,
+		       &wnd, sizeof(wnd))) {
 		int rc = -errno;
-		err_perror("Setting window data, socket %i", s);
+		flow_perror(conn, "Setting window data");
 		return rc;
 	}
 
@@ -2919,16 +2925,17 @@ static int tcp_flow_repair_wnd(int s, const struct tcp_tap_transfer_ext *t)
 
 /**
  * tcp_flow_select_queue() - Select queue (receive or send) for next operation
- * @s:		Socket
+ * @conn:	Connection to select queue for
  * @queue:	TCP_RECV_QUEUE or TCP_SEND_QUEUE
  *
  * Return: 0 on success, negative error code on failure
  */
-static int tcp_flow_select_queue(int s, int queue)
+static int tcp_flow_select_queue(const struct tcp_tap_conn *conn, int queue)
 {
-	if (setsockopt(s, SOL_TCP, TCP_REPAIR_QUEUE, &queue, sizeof(queue))) {
+	if (setsockopt(conn->sock, SOL_TCP, TCP_REPAIR_QUEUE,
+		       &queue, sizeof(queue))) {
 		int rc = -errno;
-		err_perror("Selecting TCP_SEND_QUEUE, socket %i", s);
+		flow_perror(conn, "Selecting TCP_SEND_QUEUE");
 		return rc;
 	}
 
@@ -2937,26 +2944,28 @@ static int tcp_flow_select_queue(int s, int queue)
 
 /**
  * tcp_flow_dump_sndqueue() - Dump send queue, length of sent and not sent data
- * @s:		Socket
+ * @conn:	Connection to dump queue for
  * @t:		Extended migration data
  *
  * Return: 0 on success, negative error code on failure
  *
  * #syscalls:vu ioctl
  */
-static int tcp_flow_dump_sndqueue(int s, struct tcp_tap_transfer_ext *t)
+static int tcp_flow_dump_sndqueue(const struct tcp_tap_conn *conn,
+				  struct tcp_tap_transfer_ext *t)
 {
+	int s = conn->sock;
 	ssize_t rc;
 
 	if (ioctl(s, SIOCOUTQ, &t->sndq) < 0) {
 		rc = -errno;
-		err_perror("Getting send queue size, socket %i", s);
+		flow_perror(conn, "Getting send queue size");
 		return rc;
 	}
 
 	if (ioctl(s, SIOCOUTQNSD, &t->notsent) < 0) {
 		rc = -errno;
-		err_perror("Getting not sent count, socket %i", s);
+		flow_perror(conn, "Getting not sent count");
 		return rc;
 	}
 
@@ -2975,14 +2984,16 @@ static int tcp_flow_dump_sndqueue(int s, struct tcp_tap_transfer_ext *t)
 	}
 
 	if (t->notsent > t->sndq) {
-		err("Invalid notsent count socket %i, send: %u, not sent: %u",
-		    s, t->sndq, t->notsent);
+		flow_err(conn,
+			 "Invalid notsent count socket %i, send: %u, not sent: %u",
+			 s, t->sndq, t->notsent);
 		return -EINVAL;
 	}
 
 	if (t->sndq > TCP_MIGRATE_SND_QUEUE_MAX) {
-		err("Send queue too large to migrate socket %i: %u bytes",
-		    s, t->sndq);
+		flow_err(conn,
+			 "Send queue too large to migrate socket %i: %u bytes",
+			 s, t->sndq);
 		return -ENOBUFS;
 	}
 
@@ -2993,13 +3004,13 @@ static int tcp_flow_dump_sndqueue(int s, struct tcp_tap_transfer_ext *t)
 			rc = 0;
 		} else {
 			rc = -errno;
-			err_perror("Can't read send queue, socket %i", s);
+			flow_perror(conn, "Can't read send queue");
 			return rc;
 		}
 	}
 
 	if ((uint32_t)rc < t->sndq) {
-		err("Short read migrating send queue");
+		flow_err(conn, "Short read migrating send queue");
 		return -ENXIO;
 	}
 
@@ -3010,19 +3021,20 @@ static int tcp_flow_dump_sndqueue(int s, struct tcp_tap_transfer_ext *t)
 
 /**
  * tcp_flow_repair_queue() - Restore contents of a given (pre-selected) queue
- * @s:		Socket
+ * @conn:	Connection to repair queue for
  * @len:	Length of data to be restored
  * @buf:	Buffer with content of pending data queue
  *
  * Return: 0 on success, negative error code on failure
  */
-static int tcp_flow_repair_queue(int s, size_t len, uint8_t *buf)
+static int tcp_flow_repair_queue(const struct tcp_tap_conn *conn,
+				 size_t len, uint8_t *buf)
 {
 	size_t chunk = len;
 	uint8_t *p = buf;
 
 	while (len > 0) {
-		ssize_t rc = send(s, p, MIN(len, chunk), 0);
+		ssize_t rc = send(conn->sock, p, MIN(len, chunk), 0);
 
 		if (rc < 0) {
 			if ((errno == ENOBUFS || errno == ENOMEM) &&
@@ -3032,7 +3044,7 @@ static int tcp_flow_repair_queue(int s, size_t len, uint8_t *buf)
 			}
 
 			rc = -errno;
-			err_perror("Can't write queue, socket %i", s);
+			flow_perror(conn, "Can't write queue");
 			return rc;
 		}
 
@@ -3045,18 +3057,18 @@ static int tcp_flow_repair_queue(int s, size_t len, uint8_t *buf)
 
 /**
  * tcp_flow_dump_seq() - Dump current sequence of pre-selected queue
- * @s:		Socket
+ * @conn:	Pointer to the TCP connection structure
  * @v:		Sequence value, set on return
  *
  * Return: 0 on success, negative error code on failure
  */
-static int tcp_flow_dump_seq(int s, uint32_t *v)
+static int tcp_flow_dump_seq(const struct tcp_tap_conn *conn, uint32_t *v)
 {
 	socklen_t sl = sizeof(*v);
 
-	if (getsockopt(s, SOL_TCP, TCP_QUEUE_SEQ, v, &sl)) {
+	if (getsockopt(conn->sock, SOL_TCP, TCP_QUEUE_SEQ, v, &sl)) {
 		int rc = -errno;
-		err_perror("Dumping sequence, socket %i", s);
+		flow_perror(conn, "Dumping sequence");
 		return rc;
 	}
 
@@ -3065,16 +3077,17 @@ static int tcp_flow_dump_seq(int s, uint32_t *v)
 
 /**
  * tcp_flow_repair_seq() - Restore sequence for pre-selected queue
- * @s:		Socket
+ * @conn:	Connection to repair sequences for
  * @v:		Sequence value to be set
  *
  * Return: 0 on success, negative error code on failure
  */
-static int tcp_flow_repair_seq(int s, const uint32_t *v)
+static int tcp_flow_repair_seq(const struct tcp_tap_conn *conn,
+			       const uint32_t *v)
 {
-	if (setsockopt(s, SOL_TCP, TCP_QUEUE_SEQ, v, sizeof(*v))) {
+	if (setsockopt(conn->sock, SOL_TCP, TCP_QUEUE_SEQ, v, sizeof(*v))) {
 		int rc = -errno;
-		err_perror("Setting sequence, socket %i", s);
+		flow_perror(conn, "Setting sequence");
 		return rc;
 	}
 
@@ -3083,15 +3096,17 @@ static int tcp_flow_repair_seq(int s, const uint32_t *v)
 
 /**
  * tcp_flow_dump_rcvqueue() - Dump receive queue and its length, seal/block it
- * @s:		Socket
+ * @conn:	Pointer to the TCP connection structure
  * @t:		Extended migration data
  *
  * Return: 0 on success, negative error code on failure
  *
  * #syscalls:vu ioctl
  */
-static int tcp_flow_dump_rcvqueue(int s, struct tcp_tap_transfer_ext *t)
+static int tcp_flow_dump_rcvqueue(const struct tcp_tap_conn *conn,
+				  struct tcp_tap_transfer_ext *t)
 {
+	int s = conn->sock;
 	ssize_t rc;
 
 	if (ioctl(s, SIOCINQ, &t->rcvq) < 0) {
@@ -3111,8 +3126,9 @@ static int tcp_flow_dump_rcvqueue(int s, struct tcp_tap_transfer_ext *t)
 		t->rcvq--;
 
 	if (t->rcvq > TCP_MIGRATE_RCV_QUEUE_MAX) {
-		err("Receive queue too large to migrate socket %i: %u bytes",
-		    s, t->rcvq);
+		flow_err(conn,
+			 "Receive queue too large to migrate socket: %u bytes",
+			 t->rcvq);
 		return -ENOBUFS;
 	}
 
@@ -3122,13 +3138,13 @@ static int tcp_flow_dump_rcvqueue(int s, struct tcp_tap_transfer_ext *t)
 			rc = 0;
 		} else {
 			rc = -errno;
-			err_perror("Can't read receive queue for socket %i", s);
+			flow_perror(conn, "Can't read receive queue");
 			return rc;
 		}
 	}
 
 	if ((uint32_t)rc < t->rcvq) {
-		err("Short read migrating receive queue");
+		flow_err(conn, "Short read migrating receive queue");
 		return -ENXIO;
 	}
 
@@ -3137,12 +3153,13 @@ static int tcp_flow_dump_rcvqueue(int s, struct tcp_tap_transfer_ext *t)
 
 /**
  * tcp_flow_repair_opt() - Set repair "options" (MSS, scale, SACK, timestamps)
- * @s:		Socket
+ * @conn:	Pointer to the TCP connection structure
  * @t:		Extended migration data
  *
  * Return: 0 on success, negative error code on failure
  */
-static int tcp_flow_repair_opt(int s, const struct tcp_tap_transfer_ext *t)
+static int tcp_flow_repair_opt(const struct tcp_tap_conn *conn,
+			       const struct tcp_tap_transfer_ext *t)
 {
 	const struct tcp_repair_opt opts[] = {
 		{ TCPOPT_WINDOW,		t->snd_ws + (t->rcv_ws << 16) },
@@ -3156,9 +3173,9 @@ static int tcp_flow_repair_opt(int s, const struct tcp_tap_transfer_ext *t)
 				!!(t->tcpi_options & TCPI_OPT_SACK) +
 				!!(t->tcpi_options & TCPI_OPT_TIMESTAMPS));
 
-	if (setsockopt(s, SOL_TCP, TCP_REPAIR_OPTIONS, opts, sl)) {
+	if (setsockopt(conn->sock, SOL_TCP, TCP_REPAIR_OPTIONS, opts, sl)) {
 		int rc = -errno;
-		err_perror("Setting repair options, socket %i", s);
+		flow_perror(conn, "Setting repair options");
 		return rc;
 	}
 
@@ -3229,36 +3246,36 @@ int tcp_flow_migrate_source_ext(int fd, const struct tcp_tap_conn *conn)
 	/* Disable SO_PEEK_OFF, it will make accessing the queues in repair mode
 	 * weird.
 	 */
-	if (tcp_set_peek_offset(s, -1)) {
+	if (tcp_set_peek_offset(conn, -1)) {
 		rc = -errno;
 		goto fail;
 	}
 
-	if ((rc = tcp_flow_dump_tinfo(s, t)))
+	if ((rc = tcp_flow_dump_tinfo(conn, t)))
 		goto fail;
 
-	if ((rc = tcp_flow_dump_mss(s, t)))
+	if ((rc = tcp_flow_dump_mss(conn, t)))
 		goto fail;
 
-	if ((rc = tcp_flow_dump_wnd(s, t)))
+	if ((rc = tcp_flow_dump_wnd(conn, t)))
 		goto fail;
 
-	if ((rc = tcp_flow_select_queue(s, TCP_SEND_QUEUE)))
+	if ((rc = tcp_flow_select_queue(conn, TCP_SEND_QUEUE)))
 		goto fail;
 
-	if ((rc = tcp_flow_dump_sndqueue(s, t)))
+	if ((rc = tcp_flow_dump_sndqueue(conn, t)))
 		goto fail;
 
-	if ((rc = tcp_flow_dump_seq(s, &t->seq_snd)))
+	if ((rc = tcp_flow_dump_seq(conn, &t->seq_snd)))
 		goto fail;
 
-	if ((rc = tcp_flow_select_queue(s, TCP_RECV_QUEUE)))
+	if ((rc = tcp_flow_select_queue(conn, TCP_RECV_QUEUE)))
 		goto fail;
 
-	if ((rc = tcp_flow_dump_rcvqueue(s, t)))
+	if ((rc = tcp_flow_dump_rcvqueue(conn, t)))
 		goto fail;
 
-	if ((rc = tcp_flow_dump_seq(s, &t->seq_rcv)))
+	if ((rc = tcp_flow_dump_seq(conn, &t->seq_rcv)))
 		goto fail;
 
 	close(s);
@@ -3269,14 +3286,14 @@ int tcp_flow_migrate_source_ext(int fd, const struct tcp_tap_conn *conn)
 	t->seq_rcv	-= t->rcvq;
 	t->seq_snd	-= t->sndq;
 
-	debug("Extended migration data, socket %i sequences send %u receive %u",
-	      s, t->seq_snd, t->seq_rcv);
-	debug("  pending queues: send %u not sent %u receive %u",
-	      t->sndq, t->notsent, t->rcvq);
-	debug("  window: snd_wl1 %u snd_wnd %u max %u rcv_wnd %u rcv_wup %u",
-	      t->snd_wl1, t->snd_wnd, t->max_window, t->rcv_wnd, t->rcv_wup);
-	debug("  SO_PEEK_OFF %s  offset=%"PRIu32,
-	      peek_offset_cap ? "enabled" : "disabled", peek_offset);
+	flow_dbg(conn, "Extended migration data, socket %i sequences send %u receive %u",
+		 s, t->seq_snd, t->seq_rcv);
+	flow_dbg(conn, "  pending queues: send %u not sent %u receive %u",
+		 t->sndq, t->notsent, t->rcvq);
+	flow_dbg(conn, "  window: snd_wl1 %u snd_wnd %u max %u rcv_wnd %u rcv_wup %u",
+		 t->snd_wl1, t->snd_wnd, t->max_window, t->rcv_wnd, t->rcv_wup);
+	flow_dbg(conn, "  SO_PEEK_OFF %s  offset=%"PRIu32,
+		 peek_offset_cap ? "enabled" : "disabled", peek_offset);
 
 	/* Endianness fix-ups */
 	t->seq_snd	= htonl(t->seq_snd);
@@ -3292,17 +3309,17 @@ int tcp_flow_migrate_source_ext(int fd, const struct tcp_tap_conn *conn)
 	t->rcv_wup	= htonl(t->rcv_wup);
 
 	if (write_all_buf(fd, t, sizeof(*t))) {
-		err_perror("Failed to write extended data, socket %i", s);
+		flow_perror(conn, "Failed to write extended data");
 		return -EIO;
 	}
 
 	if (write_all_buf(fd, tcp_migrate_snd_queue, ntohl(t->sndq))) {
-		err_perror("Failed to write send queue data, socket %i", s);
+		flow_perror(conn, "Failed to write send queue data");
 		return -EIO;
 	}
 
 	if (write_all_buf(fd, tcp_migrate_rcv_queue, ntohl(t->rcvq))) {
-		err_perror("Failed to write receive queue data, socket %i", s);
+		flow_perror(conn, "Failed to write receive queue data");
 		return -EIO;
 	}
 
@@ -3317,7 +3334,7 @@ fail:
 	t->tcpi_state = 0; /* Not defined: tell the target to skip this flow */
 
 	if (write_all_buf(fd, t, sizeof(*t))) {
-		err_perror("Failed to write extended data, socket %i", s);
+		flow_perror(conn, "Failed to write extended data");
 		return -EIO;
 	}
 
@@ -3347,19 +3364,20 @@ static int tcp_flow_repair_socket(struct ctx *c, struct tcp_tap_conn *conn)
 	if ((conn->sock = socket(af, SOCK_STREAM | SOCK_NONBLOCK | SOCK_CLOEXEC,
 				 IPPROTO_TCP)) < 0) {
 		rc = -errno;
-		err_perror("Failed to create socket for migrated flow");
+		flow_perror(conn, "Failed to create socket for migrated flow");
 		return rc;
 	}
 	s = conn->sock;
 
 	if (setsockopt(s, SOL_SOCKET, SO_REUSEADDR, &(int){ 1 }, sizeof(int)))
-		debug_perror("Setting SO_REUSEADDR on socket %i", s);
+		flow_dbg_perror(conn, "Failed to set SO_REUSEADDR on socket %i",
+				s);
 
 	tcp_sock_set_nodelay(s);
 
 	if (bind(s, &a.sa, sizeof(a))) {
 		rc = -errno;
-		err_perror("Failed to bind socket for migrated flow");
+		flow_perror(conn, "Failed to bind socket for migrated flow");
 		goto err;
 	}
 
@@ -3390,7 +3408,7 @@ static int tcp_flow_repair_connect(const struct ctx *c,
 	rc = flowside_connect(c, conn->sock, PIF_HOST, tgt);
 	if (rc) {
 		rc = -errno;
-		err_perror("Failed to connect migrated socket %i", conn->sock);
+		flow_perror(conn, "Failed to connect migrated socket");
 		return rc;
 	}
 
@@ -3421,8 +3439,8 @@ int tcp_flow_migrate_target(struct ctx *c, int fd)
 	}
 
 	if (read_all_buf(fd, &t, sizeof(t))) {
+		flow_perror(flow, "Failed to receive migration data");
 		flow_alloc_cancel(flow);
-		err_perror("Failed to receive migration data");
 		return -errno;
 	}
 
@@ -3481,7 +3499,7 @@ int tcp_flow_migrate_target_ext(struct ctx *c, struct tcp_tap_conn *conn, int fd
 
 	if (read_all_buf(fd, &t, sizeof(t))) {
 		rc = -errno;
-		err_perror("Failed to read extended data for socket %i", s);
+		flow_perror(conn, "Failed to read extended data");
 		return rc;
 	}
 
@@ -3503,31 +3521,34 @@ int tcp_flow_migrate_target_ext(struct ctx *c, struct tcp_tap_conn *conn, int fd
 	t.rcv_wnd	= ntohl(t.rcv_wnd);
 	t.rcv_wup	= ntohl(t.rcv_wup);
 
-	debug("Extended migration data, socket %i sequences send %u receive %u",
-	      s, t.seq_snd, t.seq_rcv);
-	debug("  pending queues: send %u not sent %u receive %u",
-	      t.sndq, t.notsent, t.rcvq);
-	debug("  window: snd_wl1 %u snd_wnd %u max %u rcv_wnd %u rcv_wup %u",
-	      t.snd_wl1, t.snd_wnd, t.max_window, t.rcv_wnd, t.rcv_wup);
-	debug("  SO_PEEK_OFF %s  offset=%"PRIu32,
-	      peek_offset_cap ? "enabled" : "disabled", peek_offset);
+	flow_dbg(conn,
+		 "Extended migration data, socket %i sequences send %u receive %u",
+		 s, t.seq_snd, t.seq_rcv);
+	flow_dbg(conn, "  pending queues: send %u not sent %u receive %u",
+		 t.sndq, t.notsent, t.rcvq);
+	flow_dbg(conn,
+		 "  window: snd_wl1 %u snd_wnd %u max %u rcv_wnd %u rcv_wup %u",
+		 t.snd_wl1, t.snd_wnd, t.max_window, t.rcv_wnd, t.rcv_wup);
+	flow_dbg(conn, "  SO_PEEK_OFF %s  offset=%"PRIu32,
+		 peek_offset_cap ? "enabled" : "disabled", peek_offset);
 
 	if (t.sndq > TCP_MIGRATE_SND_QUEUE_MAX || t.notsent > t.sndq ||
 	    t.rcvq > TCP_MIGRATE_RCV_QUEUE_MAX) {
-		err("Bad queues socket %i, send: %u, not sent: %u, receive: %u",
-		    s, t.sndq, t.notsent, t.rcvq);
+		flow_err(conn,
+			 "Bad queues socket %i, send: %u, not sent: %u, receive: %u",
+			 s, t.sndq, t.notsent, t.rcvq);
 		return -EINVAL;
 	}
 
 	if (read_all_buf(fd, tcp_migrate_snd_queue, t.sndq)) {
 		rc = -errno;
-		err_perror("Failed to read send queue data, socket %i", s);
+		flow_perror(conn, "Failed to read send queue data");
 		return rc;
 	}
 
 	if (read_all_buf(fd, tcp_migrate_rcv_queue, t.rcvq)) {
 		rc = -errno;
-		err_perror("Failed to read receive queue data, socket %i", s);
+		flow_perror(conn, "Failed to read receive queue data");
 		return rc;
 	}
 
@@ -3535,32 +3556,32 @@ int tcp_flow_migrate_target_ext(struct ctx *c, struct tcp_tap_conn *conn, int fd
 		/* We weren't able to create the socket, discard flow */
 		goto fail;
 
-	if (tcp_flow_select_queue(s, TCP_SEND_QUEUE))
+	if (tcp_flow_select_queue(conn, TCP_SEND_QUEUE))
 		goto fail;
 
-	if (tcp_flow_repair_seq(s, &t.seq_snd))
+	if (tcp_flow_repair_seq(conn, &t.seq_snd))
 		goto fail;
 
-	if (tcp_flow_select_queue(s, TCP_RECV_QUEUE))
+	if (tcp_flow_select_queue(conn, TCP_RECV_QUEUE))
 		goto fail;
 
-	if (tcp_flow_repair_seq(s, &t.seq_rcv))
+	if (tcp_flow_repair_seq(conn, &t.seq_rcv))
 		goto fail;
 
 	if (tcp_flow_repair_connect(c, conn))
 		goto fail;
 
-	if (tcp_flow_repair_queue(s, t.rcvq, tcp_migrate_rcv_queue))
+	if (tcp_flow_repair_queue(conn, t.rcvq, tcp_migrate_rcv_queue))
 		goto fail;
 
-	if (tcp_flow_select_queue(s, TCP_SEND_QUEUE))
+	if (tcp_flow_select_queue(conn, TCP_SEND_QUEUE))
 		goto fail;
 
-	if (tcp_flow_repair_queue(s, t.sndq - t.notsent,
+	if (tcp_flow_repair_queue(conn, t.sndq - t.notsent,
 				  tcp_migrate_snd_queue))
 		goto fail;
 
-	if (tcp_flow_repair_opt(s, &t))
+	if (tcp_flow_repair_opt(conn, &t))
 		goto fail;
 
 	/* If we sent a FIN sent and it was acknowledged (TCP_FIN_WAIT2), don't
@@ -3575,19 +3596,19 @@ int tcp_flow_migrate_target_ext(struct ctx *c, struct tcp_tap_conn *conn, int fd
 
 		v = TCP_SEND_QUEUE;
 		if (setsockopt(s, SOL_TCP, TCP_REPAIR_QUEUE, &v, sizeof(v)))
-			debug_perror("Selecting repair queue, socket %i", s);
+			flow_perror(conn, "Selecting repair queue");
 		else
 			shutdown(s, SHUT_WR);
 	}
 
-	if (tcp_flow_repair_wnd(s, &t))
+	if (tcp_flow_repair_wnd(conn, &t))
 		goto fail;
 
 	tcp_flow_repair_off(c, conn);
 	repair_flush(c);
 
 	if (t.notsent) {
-		if (tcp_flow_repair_queue(s, t.notsent,
+		if (tcp_flow_repair_queue(conn, t.notsent,
 					  tcp_migrate_snd_queue +
 					  (t.sndq - t.notsent))) {
 			/* This sometimes seems to fail for unclear reasons.
@@ -3607,15 +3628,16 @@ int tcp_flow_migrate_target_ext(struct ctx *c, struct tcp_tap_conn *conn, int fd
 	if (t.tcpi_state == TCP_FIN_WAIT1)
 		shutdown(s, SHUT_WR);
 
-	if (tcp_set_peek_offset(conn->sock, peek_offset))
+	if (tcp_set_peek_offset(conn, peek_offset))
 		goto fail;
 
 	tcp_send_flag(c, conn, ACK);
 	tcp_data_from_sock(c, conn);
 
 	if ((rc = tcp_epoll_ctl(c, conn))) {
-		debug("Failed to subscribe to epoll for migrated socket %i: %s",
-		      conn->sock, strerror_(-rc));
+		flow_dbg(conn,
+			 "Failed to subscribe to epoll for migrated socket: %s",
+			 strerror_(-rc));
 		goto fail;
 	}
 
diff --git a/tcp.h b/tcp.h
index 9142eca..234a803 100644
--- a/tcp.h
+++ b/tcp.h
@@ -25,7 +25,6 @@ void tcp_timer(struct ctx *c, const struct timespec *now);
 void tcp_defer_handler(struct ctx *c);
 
 void tcp_update_l2_buf(const unsigned char *eth_d, const unsigned char *eth_s);
-int tcp_set_peek_offset(int s, int offset);
 
 extern bool peek_offset_cap;
 
diff --git a/tcp_buf.c b/tcp_buf.c
index 72d99c5..0530563 100644
--- a/tcp_buf.c
+++ b/tcp_buf.c
@@ -125,7 +125,7 @@ static void tcp_revert_seq(const struct ctx *c, struct tcp_tap_conn **conns,
 
 		conn->seq_to_tap = seq;
 		peek_offset = conn->seq_to_tap - conn->seq_ack_from_tap;
-		if (tcp_set_peek_offset(conn->sock, peek_offset))
+		if (tcp_set_peek_offset(conn, peek_offset))
 			tcp_rst(c, conn);
 	}
 }
@@ -304,7 +304,7 @@ int tcp_buf_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn)
 			   conn->seq_ack_from_tap, conn->seq_to_tap);
 		conn->seq_to_tap = conn->seq_ack_from_tap;
 		already_sent = 0;
-		if (tcp_set_peek_offset(s, 0)) {
+		if (tcp_set_peek_offset(conn, 0)) {
 			tcp_rst(c, conn);
 			return -1;
 		}
diff --git a/tcp_internal.h b/tcp_internal.h
index 6f5e054..36c6533 100644
--- a/tcp_internal.h
+++ b/tcp_internal.h
@@ -177,5 +177,6 @@ int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_tap_conn *conn,
 int tcp_prepare_flags(const struct ctx *c, struct tcp_tap_conn *conn,
 		      int flags, struct tcphdr *th, struct tcp_syn_opts *opts,
 		      size_t *optlen);
+int tcp_set_peek_offset(const struct tcp_tap_conn *conn, int offset);
 
 #endif /* TCP_INTERNAL_H */
diff --git a/tcp_vu.c b/tcp_vu.c
index 6891ed1..57587cc 100644
--- a/tcp_vu.c
+++ b/tcp_vu.c
@@ -376,7 +376,7 @@ int tcp_vu_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn)
 			   conn->seq_ack_from_tap, conn->seq_to_tap);
 		conn->seq_to_tap = conn->seq_ack_from_tap;
 		already_sent = 0;
-		if (tcp_set_peek_offset(conn->sock, 0)) {
+		if (tcp_set_peek_offset(conn, 0)) {
 			tcp_rst(c, conn);
 			return -1;
 		}

From 51f3c071a76bd20677e72b49007b822dca71e755 Mon Sep 17 00:00:00 2001
From: Stefano Brivio <sbrivio@redhat.com>
Date: Tue, 18 Mar 2025 17:18:47 +0100
Subject: [PATCH 054/144] passt-repair: Fix build with -Werror=format-security

Fixes: 04701702471e ("passt-repair: Add directory watch")
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 passt-repair.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/passt-repair.c b/passt-repair.c
index 8bb3f00..120f7aa 100644
--- a/passt-repair.c
+++ b/passt-repair.c
@@ -150,7 +150,7 @@ int main(int argc, char **argv)
 			_exit(1);
 		}
 
-		ret = snprintf(a.sun_path, sizeof(a.sun_path), path);
+		ret = snprintf(a.sun_path, sizeof(a.sun_path), "%s", path);
 		inotify_dir = true;
 	} else {
 		ret = snprintf(a.sun_path, sizeof(a.sun_path), "%s", argv[1]);

From 28772ee91a60b34786023496ea17c2c2f4e5f7f5 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Wed, 19 Mar 2025 16:14:21 +1100
Subject: [PATCH 055/144] migrate, tcp: More careful marshalling of mss
 parameter during migration

During migration we extract the limit on segment size using TCP_MAXSEG,
and set it on the other side with TCP_REPAIR_OPTIONS.  However, unlike most
other 32-bit values we transfer, we send it in native endian, not network
endian.  This is not correct; add it to the list of endian fixups we make.

In addition, while MAXSEG will be 32-bits in practice, and is given as such
to TCP_REPAIR_OPTIONS, the TCP_MAXSEG sockopt treats it as an 'int'.  It's
not strictly safe to pass a uint32_t to a getsockopt() expecting an int,
although we'll get away with it on most (maybe all) platforms.  Correct
this as well.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
[sbrivio: Minor coding style fix]
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 tcp.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/tcp.c b/tcp.c
index a4c840e..43ee76b 100644
--- a/tcp.c
+++ b/tcp.c
@@ -2848,13 +2848,16 @@ static int tcp_flow_dump_mss(const struct tcp_tap_conn *conn,
 			     struct tcp_tap_transfer_ext *t)
 {
 	socklen_t sl = sizeof(t->mss);
+	int val;
 
-	if (getsockopt(conn->sock, SOL_TCP, TCP_MAXSEG, &t->mss, &sl)) {
+	if (getsockopt(conn->sock, SOL_TCP, TCP_MAXSEG, &val, &sl)) {
 		int rc = -errno;
 		flow_perror(conn, "Getting MSS");
 		return rc;
 	}
 
+	t->mss = (uint32_t)val;
+
 	return 0;
 }
 
@@ -3301,6 +3304,7 @@ int tcp_flow_migrate_source_ext(int fd, const struct tcp_tap_conn *conn)
 	t->sndq		= htonl(t->sndq);
 	t->notsent	= htonl(t->notsent);
 	t->rcvq		= htonl(t->rcvq);
+	t->mss		= htonl(t->mss);
 
 	t->snd_wl1	= htonl(t->snd_wl1);
 	t->snd_wnd	= htonl(t->snd_wnd);
@@ -3514,6 +3518,7 @@ int tcp_flow_migrate_target_ext(struct ctx *c, struct tcp_tap_conn *conn, int fd
 	t.sndq		= ntohl(t.sndq);
 	t.notsent	= ntohl(t.notsent);
 	t.rcvq		= ntohl(t.rcvq);
+	t.mss		= ntohl(t.mss);
 
 	t.snd_wl1	= ntohl(t.snd_wl1);
 	t.snd_wnd	= ntohl(t.snd_wnd);

From cfb3740568ab291d7be00e457658c45ce9367ed5 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Wed, 19 Mar 2025 16:14:22 +1100
Subject: [PATCH 056/144] migrate, tcp: Migrate RFC 7323 timestamp

Currently our migration of the state of TCP sockets omits the RFC 7323
timestamp.  In some circumstances that can result in data sent from the
target machine not being received, because it is discarded on the peer due
to PAWS checking.

Add code to dump and restore the timestamp across migration.

Link: https://bugs.passt.top/show_bug.cgi?id=115
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
[sbrivio: Minor style fixes]
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 tcp.c      | 59 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 tcp_conn.h |  2 ++
 2 files changed, 61 insertions(+)

diff --git a/tcp.c b/tcp.c
index 43ee76b..68af43d 100644
--- a/tcp.c
+++ b/tcp.c
@@ -2861,6 +2861,57 @@ static int tcp_flow_dump_mss(const struct tcp_tap_conn *conn,
 	return 0;
 }
 
+
+/**
+ * tcp_flow_dump_timestamp() - Dump RFC 7323 timestamp via TCP_TIMESTAMP
+ * @conn:	Pointer to the TCP connection structure
+ * @t:		Extended migration data (tcpi_options must be populated)
+ *
+ * Return: 0 on success, negative error code on failure
+ */
+static int tcp_flow_dump_timestamp(const struct tcp_tap_conn *conn,
+				   struct tcp_tap_transfer_ext *t)
+{
+	int val = 0;
+
+	if (t->tcpi_options & TCPI_OPT_TIMESTAMPS) {
+		socklen_t sl = sizeof(val);
+
+		if (getsockopt(conn->sock, SOL_TCP, TCP_TIMESTAMP, &val, &sl)) {
+			int rc = -errno;
+			flow_perror(conn, "Getting RFC 7323 timestamp");
+			return rc;
+		}
+	}
+
+	t->timestamp = (uint32_t)val;
+	return 0;
+}
+
+/**
+ * tcp_flow_repair_timestamp() - Restore RFC 7323 timestamp via TCP_TIMESTAMP
+ * @conn:	Pointer to the TCP connection structure
+ * @t:		Extended migration data
+ *
+ * Return: 0 on success, negative error code on failure
+ */
+static int tcp_flow_repair_timestamp(const struct tcp_tap_conn *conn,
+				   const struct tcp_tap_transfer_ext *t)
+{
+	int val = (int)t->timestamp;
+
+	if (t->tcpi_options & TCPI_OPT_TIMESTAMPS) {
+		if (setsockopt(conn->sock, SOL_TCP, TCP_TIMESTAMP,
+			       &val, sizeof(val))) {
+			int rc = -errno;
+			flow_perror(conn, "Setting RFC 7323 timestamp");
+			return rc;
+		}
+	}
+
+	return 0;
+}
+
 /**
  * tcp_flow_dump_wnd() - Dump current tcp_repair_window parameters
  * @conn:	Pointer to the TCP connection structure
@@ -3260,6 +3311,9 @@ int tcp_flow_migrate_source_ext(int fd, const struct tcp_tap_conn *conn)
 	if ((rc = tcp_flow_dump_mss(conn, t)))
 		goto fail;
 
+	if ((rc = tcp_flow_dump_timestamp(conn, t)))
+		goto fail;
+
 	if ((rc = tcp_flow_dump_wnd(conn, t)))
 		goto fail;
 
@@ -3305,6 +3359,7 @@ int tcp_flow_migrate_source_ext(int fd, const struct tcp_tap_conn *conn)
 	t->notsent	= htonl(t->notsent);
 	t->rcvq		= htonl(t->rcvq);
 	t->mss		= htonl(t->mss);
+	t->timestamp	= htonl(t->timestamp);
 
 	t->snd_wl1	= htonl(t->snd_wl1);
 	t->snd_wnd	= htonl(t->snd_wnd);
@@ -3519,6 +3574,7 @@ int tcp_flow_migrate_target_ext(struct ctx *c, struct tcp_tap_conn *conn, int fd
 	t.notsent	= ntohl(t.notsent);
 	t.rcvq		= ntohl(t.rcvq);
 	t.mss		= ntohl(t.mss);
+	t.timestamp	= ntohl(t.timestamp);
 
 	t.snd_wl1	= ntohl(t.snd_wl1);
 	t.snd_wnd	= ntohl(t.snd_wnd);
@@ -3561,6 +3617,9 @@ int tcp_flow_migrate_target_ext(struct ctx *c, struct tcp_tap_conn *conn, int fd
 		/* We weren't able to create the socket, discard flow */
 		goto fail;
 
+	if (tcp_flow_repair_timestamp(conn, &t))
+		goto fail;
+
 	if (tcp_flow_select_queue(conn, TCP_SEND_QUEUE))
 		goto fail;
 
diff --git a/tcp_conn.h b/tcp_conn.h
index 9126a36..35d813d 100644
--- a/tcp_conn.h
+++ b/tcp_conn.h
@@ -152,6 +152,7 @@ struct tcp_tap_transfer {
  * @notsent:		Part of pending send queue that wasn't sent out yet
  * @rcvq:		Length of pending receive queue
  * @mss:		Socket-side MSS clamp
+ * @timestamp:		RFC 7323 timestamp
  * @snd_wl1:		Next sequence used in window probe (next sequence - 1)
  * @snd_wnd:		Socket-side sending window
  * @max_window:		Window clamp
@@ -171,6 +172,7 @@ struct tcp_tap_transfer_ext {
 	uint32_t	rcvq;
 
 	uint32_t	mss;
+	uint32_t	timestamp;
 
 	/* We can't just use struct tcp_repair_window: we need network order */
 	uint32_t	snd_wl1;

From c250ffc5c11385d9618b3a8165e676d68d5cbfa2 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Wed, 19 Mar 2025 16:14:23 +1100
Subject: [PATCH 057/144] migrate: Bump migration version number

v1 of the migration stream format had some flaws: it didn't properly
handle endianness of the MSS field, and it didn't transfer the RFC7323
timestamp.  We've now fixed those bugs, but it requires incompatible
changes to the stream format.

Because of the timestamps in particular, v1 is not really usable, so there
is little point maintaining compatible support for it.  However, v1 is in
released packages, both upstream and downstream (RHEL at least).  Just
updating the stream format without bumping the version would lead to very
cryptic errors if anyone did attempt to migrate between an old and new
passt.

So, bump the migration version to v2, so we'll get a clear error message if
anyone attempts this.  We don't attempt to maintain backwards compatibility
with v1, however: we'll simply fail if given a v1 stream.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 migrate.c | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/migrate.c b/migrate.c
index 0fca77b..48d63a0 100644
--- a/migrate.c
+++ b/migrate.c
@@ -96,8 +96,8 @@ static int seen_addrs_target_v1(struct ctx *c,
 	return 0;
 }
 
-/* Stages for version 1 */
-static const struct migrate_stage stages_v1[] = {
+/* Stages for version 2 */
+static const struct migrate_stage stages_v2[] = {
 	{
 		.name = "observed addresses",
 		.source = seen_addrs_source_v1,
@@ -118,7 +118,11 @@ static const struct migrate_stage stages_v1[] = {
 
 /* Supported encoding versions, from latest (most preferred) to oldest */
 static const struct migrate_version versions[] = {
-	{ 1,	stages_v1, },
+	{ 2,	stages_v2, },
+	/* v1 was released, but not widely used.  It had bad endianness for the
+	 * MSS and omitted timestamps, which meant it usually wouldn't work.
+	 * Therefore we don't attempt to support compatibility with it.
+	 */
 	{ 0 },
 };
 

From ebdd46367ce1acba235013d97e362b8677b538d5 Mon Sep 17 00:00:00 2001
From: Stefano Brivio <sbrivio@redhat.com>
Date: Wed, 19 Mar 2025 17:57:45 +0100
Subject: [PATCH 058/144] tcp: Flush socket before checking for more data in
 active close state
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Otherwise, if all the pending data is acknowledged:

- tcp_update_seqack_from_tap() updates the current tap-side ACK
  sequence (conn->seq_ack_from_tap)

- next, we compare the sequence we sent (conn->seq_to_tap) to the
  ACK sequence (conn->seq_ack_from_tap) in tcp_data_from_sock() to
  understand if there's more data we can send.

  If they match, we conclude that we haven't sent any of that data,
  and keep re-sending it.

We need, instead, to flush the socket (drop acknowledged data) before
calling tcp_update_seqack_from_tap(), so that once we update
conn->seq_ack_from_tap, we can be sure that all data until there is
gone from the socket.

Link: https://bugs.passt.top/show_bug.cgi?id=114
Reported-by: Marek Marczykowski-Górecki <marmarek@invisiblethingslab.com>
Fixes: 30f1e082c3c0 ("tcp: Keep updating window and checking for socket data after FIN from guest")
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
---
 tcp.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tcp.c b/tcp.c
index 68af43d..fa1d885 100644
--- a/tcp.c
+++ b/tcp.c
@@ -2049,6 +2049,7 @@ int tcp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af,
 
 	/* Established connections not accepting data from tap */
 	if (conn->events & TAP_FIN_RCVD) {
+		tcp_sock_consume(conn, ntohl(th->ack_seq));
 		tcp_update_seqack_from_tap(c, conn, ntohl(th->ack_seq));
 		tcp_tap_window_update(conn, ntohs(th->window));
 		tcp_data_from_sock(c, conn);

From 07c2d584b334b0c405a5702a4f2fad104d03940b Mon Sep 17 00:00:00 2001
From: Stefano Brivio <sbrivio@redhat.com>
Date: Wed, 19 Mar 2025 20:43:47 +0100
Subject: [PATCH 059/144] conf: Include libgen.h for basename(), fix build
 against musl

Fixes: 4b17d042c7e4 ("conf: Move mode detection into helper function")
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
---
 conf.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/conf.c b/conf.c
index 0e2e8dc..b54c55d 100644
--- a/conf.c
+++ b/conf.c
@@ -16,6 +16,7 @@
 #include <errno.h>
 #include <fcntl.h>
 #include <getopt.h>
+#include <libgen.h>
 #include <string.h>
 #include <sched.h>
 #include <sys/types.h>

From 32f6212551c5db3b7b3548e8483e5d73f07a35ac Mon Sep 17 00:00:00 2001
From: Stefano Brivio <sbrivio@redhat.com>
Date: Wed, 19 Mar 2025 20:45:12 +0100
Subject: [PATCH 060/144] Makefile: Enable -Wformat-security

It looks like an easy win to prevent a number of possible security
flaws.

Suggested-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
---
 Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Makefile b/Makefile
index f2ac8e5..31cbac3 100644
--- a/Makefile
+++ b/Makefile
@@ -29,7 +29,7 @@ ifeq ($(shell $(CC) -O2 -dM -E - < /dev/null 2>&1 | grep ' _FORTIFY_SOURCE ' > /
 FORTIFY_FLAG := -D_FORTIFY_SOURCE=2
 endif
 
-FLAGS := -Wall -Wextra -Wno-format-zero-length
+FLAGS := -Wall -Wextra -Wno-format-zero-length -Wformat-security
 FLAGS += -pedantic -std=c11 -D_XOPEN_SOURCE=700 -D_GNU_SOURCE
 FLAGS +=  $(FORTIFY_FLAG) -O2 -pie -fPIE
 FLAGS += -DPAGE_SIZE=$(shell getconf PAGE_SIZE)

From 4592719a744bcb47db2ff5680be4b8f6362a97ce Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Mon, 17 Mar 2025 20:24:14 +1100
Subject: [PATCH 061/144] vu_common: Tighten vu_packet_check_range()

This function verifies that the given packet is within the mmap()ed memory
region of the vhost-user device.  We can do better, however.  The packet
should be not only within the mmap()ed range, but specifically in the
subsection of that range set aside for shared buffers, which starts at
dev_region->mmap_offset within there.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 vu_common.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/vu_common.c b/vu_common.c
index 686a09b..9eea4f2 100644
--- a/vu_common.c
+++ b/vu_common.c
@@ -37,10 +37,10 @@ int vu_packet_check_range(void *buf, const char *ptr, size_t len)
 
 	for (dev_region = buf; dev_region->mmap_addr; dev_region++) {
 		/* NOLINTNEXTLINE(performance-no-int-to-ptr) */
-		char *m = (char *)(uintptr_t)dev_region->mmap_addr;
+		char *m = (char *)(uintptr_t)dev_region->mmap_addr +
+			dev_region->mmap_offset;
 
-		if (m <= ptr &&
-		    ptr + len <= m + dev_region->mmap_offset + dev_region->size)
+		if (m <= ptr && ptr + len <= m + dev_region->size)
 			return 0;
 	}
 

From e43e00719d7701301e4bc4fb179dc7adff175409 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Mon, 17 Mar 2025 20:24:15 +1100
Subject: [PATCH 062/144] packet: More cautious checks to avoid pointer
 arithmetic UB

packet_check_range() and vu_packet_check_range() verify that the packet or
section of packet we're interested in lies in the packet buffer pool we
expect it to.  However, in doing so they don't avoid the possibility of
an integer overflow while performing pointer arithmetic, which is UB.  In
fact, AFAICT it's UB even to use arbitrary pointer arithmetic to construct
a pointer outside of a known valid buffer.

To do this safely, we can't calculate the end of a memory region with
pointer addition when the length is untrusted.  Instead we must work
out the offset of one memory region within another using pointer
subtraction, then do integer checks against the length of the outer region.
We then need to be careful about the order of checks so that those integer
checks can't themselves overflow.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 packet.c    | 12 +++++++++---
 vu_common.c | 10 +++++++---
 2 files changed, 16 insertions(+), 6 deletions(-)

diff --git a/packet.c b/packet.c
index bcac037..d1a51a5 100644
--- a/packet.c
+++ b/packet.c
@@ -52,9 +52,15 @@ static int packet_check_range(const struct pool *p, const char *ptr, size_t len,
 		return -1;
 	}
 
-	if (ptr + len > p->buf + p->buf_size) {
-		trace("packet range end %p after buffer end %p, %s:%i",
-		      (void *)(ptr + len), (void *)(p->buf + p->buf_size),
+	if (len > p->buf_size) {
+		trace("packet range length %zu larger than buffer %zu, %s:%i",
+		      len, p->buf_size, func, line);
+		return -1;
+	}
+
+	if ((size_t)(ptr - p->buf) > p->buf_size - len) {
+		trace("packet range %p, len %zu after buffer end %p, %s:%i",
+		      (void *)ptr, len, (void *)(p->buf + p->buf_size),
 		      func, line);
 		return -1;
 	}
diff --git a/vu_common.c b/vu_common.c
index 9eea4f2..cefe5e2 100644
--- a/vu_common.c
+++ b/vu_common.c
@@ -36,11 +36,15 @@ int vu_packet_check_range(void *buf, const char *ptr, size_t len)
 	struct vu_dev_region *dev_region;
 
 	for (dev_region = buf; dev_region->mmap_addr; dev_region++) {
-		/* NOLINTNEXTLINE(performance-no-int-to-ptr) */
-		char *m = (char *)(uintptr_t)dev_region->mmap_addr +
+		uintptr_t base_addr = dev_region->mmap_addr +
 			dev_region->mmap_offset;
+		/* NOLINTNEXTLINE(performance-no-int-to-ptr) */
+		const char *base = (const char *)base_addr;
 
-		if (m <= ptr && ptr + len <= m + dev_region->size)
+		ASSERT(base_addr >= dev_region->mmap_addr);
+
+		if (len <= dev_region->size && base <= ptr &&
+		    (size_t)(ptr - base) <= dev_region->size - len)
 			return 0;
 	}
 

From a41d6d125eca5ac8c54bed8157098be141557b03 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Mon, 17 Mar 2025 20:24:16 +1100
Subject: [PATCH 063/144] tap: Make size of pool_tap[46] purely a tuning
 parameter

Currently we attempt to size pool_tap[46] so they have room for the maximum
possible number of packets that could fit in pkt_buf (TAP_MSGS).  However,
the calculation isn't quite correct: TAP_MSGS is based on ETH_ZLEN (60) as
the minimum possible L2 frame size.  But ETH_ZLEN is based on physical
constraints of Ethernet, which don't apply to our virtual devices.  It is
possible to generate a legitimate frame smaller than this, for example an
empty payload UDP/IPv4 frame on the 'pasta' backend is only 42 bytes long.

Furthermore, the same limit applies for vhost-user, which is not limited
by the size of pkt_buf like the other backends.  In that case we don't even
have full control of the maximum buffer size, so we can't really calculate
how many packets could fit in there.

If we do exceed TAP_MSGS we'll drop packets, not just use more batches,
which is moderately bad.  The fact that this needs to be sized just so for
correctness not merely for tuning is a fairly non-obvious coupling between
different parts of the code.

To make this more robust, alter the tap code so it doesn't rely on
everything fitting in a single batch of TAP_MSGS packets, instead breaking
into multiple batches as necessary.  This leaves TAP_MSGS as purely a
tuning parameter, which we can freely adjust based on performance measures.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 packet.c    | 13 ++++++++++++-
 packet.h    |  3 +++
 passt.h     |  2 --
 tap.c       | 19 ++++++++++++++++---
 tap.h       |  3 ++-
 vu_common.c |  5 +++--
 6 files changed, 36 insertions(+), 9 deletions(-)

diff --git a/packet.c b/packet.c
index d1a51a5..08076d5 100644
--- a/packet.c
+++ b/packet.c
@@ -67,6 +67,17 @@ static int packet_check_range(const struct pool *p, const char *ptr, size_t len,
 
 	return 0;
 }
+/**
+ * pool_full() - Is a packet pool full?
+ * @p:		Pointer to packet pool
+ *
+ * Return: true if the pool is full, false if more packets can be added
+ */
+bool pool_full(const struct pool *p)
+{
+	return p->count >= p->size;
+}
+
 /**
  * packet_add_do() - Add data as packet descriptor to given pool
  * @p:		Existing pool
@@ -80,7 +91,7 @@ void packet_add_do(struct pool *p, size_t len, const char *start,
 {
 	size_t idx = p->count;
 
-	if (idx >= p->size) {
+	if (pool_full(p)) {
 		trace("add packet index %zu to pool with size %zu, %s:%i",
 		      idx, p->size, func, line);
 		return;
diff --git a/packet.h b/packet.h
index d099f02..dd18461 100644
--- a/packet.h
+++ b/packet.h
@@ -6,6 +6,8 @@
 #ifndef PACKET_H
 #define PACKET_H
 
+#include <stdbool.h>
+
 /* Maximum size of a single packet stored in pool, including headers */
 #define PACKET_MAX_LEN	UINT16_MAX
 
@@ -33,6 +35,7 @@ void packet_add_do(struct pool *p, size_t len, const char *start,
 void *packet_get_do(const struct pool *p, const size_t idx,
 		    size_t offset, size_t len, size_t *left,
 		    const char *func, int line);
+bool pool_full(const struct pool *p);
 void pool_flush(struct pool *p);
 
 #define packet_add(p, len, start)					\
diff --git a/passt.h b/passt.h
index 8f45091..8693794 100644
--- a/passt.h
+++ b/passt.h
@@ -71,8 +71,6 @@ static_assert(sizeof(union epoll_ref) <= sizeof(union epoll_data),
 
 /* Large enough for ~128 maximum size frames */
 #define PKT_BUF_BYTES		(8UL << 20)
-#define TAP_MSGS							\
-	DIV_ROUND_UP(PKT_BUF_BYTES, ETH_ZLEN - 2 * ETH_ALEN + sizeof(uint32_t))
 
 extern char pkt_buf		[PKT_BUF_BYTES];
 
diff --git a/tap.c b/tap.c
index 182a115..34e6774 100644
--- a/tap.c
+++ b/tap.c
@@ -75,6 +75,9 @@ CHECK_FRAME_LEN(L2_MAX_LEN_PASTA);
 CHECK_FRAME_LEN(L2_MAX_LEN_PASST);
 CHECK_FRAME_LEN(L2_MAX_LEN_VU);
 
+#define TAP_MSGS							\
+	DIV_ROUND_UP(sizeof(pkt_buf), ETH_ZLEN - 2 * ETH_ALEN + sizeof(uint32_t))
+
 /* IPv4 (plus ARP) and IPv6 message batches from tap/guest to IP handlers */
 static PACKET_POOL_NOINIT(pool_tap4, TAP_MSGS, pkt_buf);
 static PACKET_POOL_NOINIT(pool_tap6, TAP_MSGS, pkt_buf);
@@ -1042,8 +1045,10 @@ void tap_handler(struct ctx *c, const struct timespec *now)
  * @c:		Execution context
  * @l2len:	Total L2 packet length
  * @p:		Packet buffer
+ * @now:	Current timestamp
  */
-void tap_add_packet(struct ctx *c, ssize_t l2len, char *p)
+void tap_add_packet(struct ctx *c, ssize_t l2len, char *p,
+		    const struct timespec *now)
 {
 	const struct ethhdr *eh;
 
@@ -1059,9 +1064,17 @@ void tap_add_packet(struct ctx *c, ssize_t l2len, char *p)
 	switch (ntohs(eh->h_proto)) {
 	case ETH_P_ARP:
 	case ETH_P_IP:
+		if (pool_full(pool_tap4)) {
+			tap4_handler(c, pool_tap4, now);
+			pool_flush(pool_tap4);
+		}
 		packet_add(pool_tap4, l2len, p);
 		break;
 	case ETH_P_IPV6:
+		if (pool_full(pool_tap6)) {
+			tap6_handler(c, pool_tap6, now);
+			pool_flush(pool_tap6);
+		}
 		packet_add(pool_tap6, l2len, p);
 		break;
 	default:
@@ -1142,7 +1155,7 @@ static void tap_passt_input(struct ctx *c, const struct timespec *now)
 		p += sizeof(uint32_t);
 		n -= sizeof(uint32_t);
 
-		tap_add_packet(c, l2len, p);
+		tap_add_packet(c, l2len, p, now);
 
 		p += l2len;
 		n -= l2len;
@@ -1207,7 +1220,7 @@ static void tap_pasta_input(struct ctx *c, const struct timespec *now)
 		    len > (ssize_t)L2_MAX_LEN_PASTA)
 			continue;
 
-		tap_add_packet(c, len, pkt_buf + n);
+		tap_add_packet(c, len, pkt_buf + n, now);
 	}
 
 	tap_handler(c, now);
diff --git a/tap.h b/tap.h
index dd39fd8..6fe3d15 100644
--- a/tap.h
+++ b/tap.h
@@ -119,6 +119,7 @@ void tap_sock_update_pool(void *base, size_t size);
 void tap_backend_init(struct ctx *c);
 void tap_flush_pools(void);
 void tap_handler(struct ctx *c, const struct timespec *now);
-void tap_add_packet(struct ctx *c, ssize_t l2len, char *p);
+void tap_add_packet(struct ctx *c, ssize_t l2len, char *p,
+		    const struct timespec *now);
 
 #endif /* TAP_H */
diff --git a/vu_common.c b/vu_common.c
index cefe5e2..5e6fd4a 100644
--- a/vu_common.c
+++ b/vu_common.c
@@ -195,7 +195,7 @@ static void vu_handle_tx(struct vu_dev *vdev, int index,
 			tap_add_packet(vdev->context,
 				       elem[count].out_sg[0].iov_len - hdrlen,
 				       (char *)elem[count].out_sg[0].iov_base +
-				        hdrlen);
+				       hdrlen, now);
 		} else {
 			/* vnet header can be in a separate iovec */
 			if (elem[count].out_num != 2) {
@@ -207,7 +207,8 @@ static void vu_handle_tx(struct vu_dev *vdev, int index,
 			} else {
 				tap_add_packet(vdev->context,
 					       elem[count].out_sg[1].iov_len,
-					       (char *)elem[count].out_sg[1].iov_base);
+					       (char *)elem[count].out_sg[1].iov_base,
+					       now);
 			}
 		}
 

From 9866d146e654975dd7f5fd3f1294d5fc4628cef3 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Mon, 17 Mar 2025 20:24:17 +1100
Subject: [PATCH 064/144] tap: Clarify calculation of TAP_MSGS

The rationale behind the calculation of TAP_MSGS isn't necessarily obvious.
It's supposed to be the maximum number of packets that can fit in pkt_buf.
However, the calculation is wrong in several ways:
 * It's based on ETH_ZLEN which isn't meaningful for virtual devices
 * It always includes the qemu socket header which isn't used for pasta
 * The size of pkt_buf isn't relevant for vhost-user

We've already made sure this is just a tuning parameter, not a hard limit.
Clarify what we're calculating here and why.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 tap.c | 28 ++++++++++++++++++++++------
 1 file changed, 22 insertions(+), 6 deletions(-)

diff --git a/tap.c b/tap.c
index 34e6774..3a6fcbe 100644
--- a/tap.c
+++ b/tap.c
@@ -75,12 +75,28 @@ CHECK_FRAME_LEN(L2_MAX_LEN_PASTA);
 CHECK_FRAME_LEN(L2_MAX_LEN_PASST);
 CHECK_FRAME_LEN(L2_MAX_LEN_VU);
 
-#define TAP_MSGS							\
-	DIV_ROUND_UP(sizeof(pkt_buf), ETH_ZLEN - 2 * ETH_ALEN + sizeof(uint32_t))
+/* We try size the packet pools so that we can use a single batch for the entire
+ * packet buffer.  This might be exceeded for vhost-user, though, which uses its
+ * own buffers rather than pkt_buf.
+ *
+ * This is just a tuning parameter, the code will work with slightly more
+ * overhead if it's incorrect.  So, we estimate based on the minimum practical
+ * frame size - an empty UDP datagram - rather than the minimum theoretical
+ * frame size.
+ *
+ * FIXME: Profile to work out how big this actually needs to be to amortise
+ *        per-batch syscall overheads
+ */
+#define TAP_MSGS_IP4							\
+	DIV_ROUND_UP(sizeof(pkt_buf),					\
+		     ETH_HLEN + sizeof(struct iphdr) + sizeof(struct udphdr))
+#define TAP_MSGS_IP6							\
+	DIV_ROUND_UP(sizeof(pkt_buf),					\
+		     ETH_HLEN + sizeof(struct ipv6hdr) + sizeof(struct udphdr))
 
 /* IPv4 (plus ARP) and IPv6 message batches from tap/guest to IP handlers */
-static PACKET_POOL_NOINIT(pool_tap4, TAP_MSGS, pkt_buf);
-static PACKET_POOL_NOINIT(pool_tap6, TAP_MSGS, pkt_buf);
+static PACKET_POOL_NOINIT(pool_tap4, TAP_MSGS_IP4, pkt_buf);
+static PACKET_POOL_NOINIT(pool_tap6, TAP_MSGS_IP6, pkt_buf);
 
 #define TAP_SEQS		128 /* Different L4 tuples in one batch */
 #define FRAGMENT_MSG_RATE	10  /* # seconds between fragment warnings */
@@ -1418,8 +1434,8 @@ void tap_sock_update_pool(void *base, size_t size)
 {
 	int i;
 
-	pool_tap4_storage = PACKET_INIT(pool_tap4, TAP_MSGS, base, size);
-	pool_tap6_storage = PACKET_INIT(pool_tap6, TAP_MSGS, base, size);
+	pool_tap4_storage = PACKET_INIT(pool_tap4, TAP_MSGS_IP4, base, size);
+	pool_tap6_storage = PACKET_INIT(pool_tap6, TAP_MSGS_IP6, base, size);
 
 	for (i = 0; i < TAP_SEQS; i++) {
 		tap4_l4[i].p = PACKET_INIT(pool_l4, UIO_MAXIOV, base, size);

From c48331ca51399fe1779529511be395b576aaf0af Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Mon, 17 Mar 2025 20:24:18 +1100
Subject: [PATCH 065/144] packet: Correct type of PACKET_MAX_LEN

PACKET_MAX_LEN is usually involved in calculations on size_t values - the
type of the iov_len field in struct iovec.  However, being defined bare as
UINT16_MAX, the compiler is likely to assign it a shorter type.  This can
lead to unexpected promotions (or lack thereof).  Add a cast to force the
type to be what we expect.

Fixes: c43972ad6 ("packet: Give explicit name to maximum packet size")
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 packet.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/packet.h b/packet.h
index dd18461..9061dad 100644
--- a/packet.h
+++ b/packet.h
@@ -9,7 +9,7 @@
 #include <stdbool.h>
 
 /* Maximum size of a single packet stored in pool, including headers */
-#define PACKET_MAX_LEN	UINT16_MAX
+#define PACKET_MAX_LEN	((size_t)UINT16_MAX)
 
 /**
  * struct pool - Generic pool of packets stored in a buffer

From 37d9f374d9f0c47c092f80a5d85d4505ae4a9af7 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Mon, 17 Mar 2025 20:24:19 +1100
Subject: [PATCH 066/144] packet: Avoid integer overflows in packet_get_do()

In packet_get_do() both offset and len are essentially untrusted.  We do
some validation of len (check it's < PACKET_MAX_LEN), but that's not enough
to ensure that (len + offset) doesn't overflow.  Rearrange our calculation
to make sure it's safe regardless of the given offset & len values.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 packet.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/packet.c b/packet.c
index 08076d5..fdc4be7 100644
--- a/packet.c
+++ b/packet.c
@@ -144,7 +144,8 @@ void *packet_get_do(const struct pool *p, size_t idx, size_t offset,
 		return NULL;
 	}
 
-	if (len + offset > p->pkt[idx].iov_len) {
+	if (offset > p->pkt[idx].iov_len ||
+	    len > (p->pkt[idx].iov_len - offset)) {
 		if (func) {
 			trace("data length %zu, offset %zu from length %zu, "
 			      "%s:%i", len, offset, p->pkt[idx].iov_len,

From 961aa6a0eb7fce956a34f8ccd883bfe12392d3d3 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Mon, 17 Mar 2025 20:24:20 +1100
Subject: [PATCH 067/144] packet: Move checks against PACKET_MAX_LEN to
 packet_check_range()

Both the callers of packet_check_range() separately verify that the given
length does not exceed PACKET_MAX_LEN.  Fold that check into
packet_check_range() instead.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 packet.c | 19 ++++++-------------
 1 file changed, 6 insertions(+), 13 deletions(-)

diff --git a/packet.c b/packet.c
index fdc4be7..7cbe95d 100644
--- a/packet.c
+++ b/packet.c
@@ -35,6 +35,12 @@
 static int packet_check_range(const struct pool *p, const char *ptr, size_t len,
 			      const char *func, int line)
 {
+	if (len > PACKET_MAX_LEN) {
+		trace("packet range length %zu (max %zu), %s:%i",
+		      len, PACKET_MAX_LEN, func, line);
+		return -1;
+	}
+
 	if (p->buf_size == 0) {
 		int ret;
 
@@ -100,11 +106,6 @@ void packet_add_do(struct pool *p, size_t len, const char *start,
 	if (packet_check_range(p, start, len, func, line))
 		return;
 
-	if (len > PACKET_MAX_LEN) {
-		trace("add packet length %zu, %s:%i", len, func, line);
-		return;
-	}
-
 	p->pkt[idx].iov_base = (void *)start;
 	p->pkt[idx].iov_len = len;
 
@@ -136,14 +137,6 @@ void *packet_get_do(const struct pool *p, size_t idx, size_t offset,
 		return NULL;
 	}
 
-	if (len > PACKET_MAX_LEN) {
-		if (func) {
-			trace("packet data length %zu, %s:%i",
-			      len, func, line);
-		}
-		return NULL;
-	}
-
 	if (offset > p->pkt[idx].iov_len ||
 	    len > (p->pkt[idx].iov_len - offset)) {
 		if (func) {

From 38bcce997763f2e0c4bb6c0a3926674317796544 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Mon, 17 Mar 2025 20:24:21 +1100
Subject: [PATCH 068/144] packet: Rework packet_get() versus packet_get_try()

Most failures of packet_get() indicate a serious problem, and log messages
accordingly.  However, a few callers expect failures here, because they're
probing for a certain range which might or might not be in a packet.  They
use packet_get_try() which passes a NULL func to packet_get_do() to
suppress the logging which is unwanted in this case.

However, this doesn't just suppress the log when packet_get_do() finds the
requested region isn't in the packet.  It suppresses logging for all other
errors too, which do indicate serious problems, even for the callers of
packet_get_try().  Worse it will pass the NULL func on to
packet_check_range() which doesn't expect it, meaning we'll get unhelpful
messages from there if there is a failure.

Fix this by making packet_get_try_do() the primary function which doesn't
log for the case of a range outside the packet.  packet_get_do() becomes a
trivial wrapper around that which logs a message if packet_get_try_do()
returns NULL.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 packet.c | 51 +++++++++++++++++++++++++++++++++++----------------
 packet.h |  8 +++++---
 2 files changed, 40 insertions(+), 19 deletions(-)

diff --git a/packet.c b/packet.c
index 7cbe95d..b3e8c79 100644
--- a/packet.c
+++ b/packet.c
@@ -89,7 +89,7 @@ bool pool_full(const struct pool *p)
  * @p:		Existing pool
  * @len:	Length of new descriptor
  * @start:	Start of data
- * @func:	For tracing: name of calling function, NULL means no trace()
+ * @func:	For tracing: name of calling function
  * @line:	For tracing: caller line of function call
  */
 void packet_add_do(struct pool *p, size_t len, const char *start,
@@ -113,39 +113,31 @@ void packet_add_do(struct pool *p, size_t len, const char *start,
 }
 
 /**
- * packet_get_do() - Get data range from packet descriptor from given pool
+ * packet_get_try_do() - Get data range from packet descriptor from given pool
  * @p:		Packet pool
  * @idx:	Index of packet descriptor in pool
  * @offset:	Offset of data range in packet descriptor
  * @len:	Length of desired data range
  * @left:	Length of available data after range, set on return, can be NULL
- * @func:	For tracing: name of calling function, NULL means no trace()
+ * @func:	For tracing: name of calling function
  * @line:	For tracing: caller line of function call
  *
  * Return: pointer to start of data range, NULL on invalid range or descriptor
  */
-void *packet_get_do(const struct pool *p, size_t idx, size_t offset,
-		    size_t len, size_t *left, const char *func, int line)
+void *packet_get_try_do(const struct pool *p, size_t idx, size_t offset,
+			size_t len, size_t *left, const char *func, int line)
 {
 	char *ptr;
 
 	if (idx >= p->size || idx >= p->count) {
-		if (func) {
-			trace("packet %zu from pool size: %zu, count: %zu, "
-			      "%s:%i", idx, p->size, p->count, func, line);
-		}
+		trace("packet %zu from pool size: %zu, count: %zu, %s:%i",
+		      idx, p->size, p->count, func, line);
 		return NULL;
 	}
 
 	if (offset > p->pkt[idx].iov_len ||
-	    len > (p->pkt[idx].iov_len - offset)) {
-		if (func) {
-			trace("data length %zu, offset %zu from length %zu, "
-			      "%s:%i", len, offset, p->pkt[idx].iov_len,
-			      func, line);
-		}
+	    len > (p->pkt[idx].iov_len - offset))
 		return NULL;
-	}
 
 	ptr = (char *)p->pkt[idx].iov_base + offset;
 
@@ -158,6 +150,33 @@ void *packet_get_do(const struct pool *p, size_t idx, size_t offset,
 	return ptr;
 }
 
+/**
+ * packet_get_do() - Get data range from packet descriptor from given pool
+ * @p:		Packet pool
+ * @idx:	Index of packet descriptor in pool
+ * @offset:	Offset of data range in packet descriptor
+ * @len:	Length of desired data range
+ * @left:	Length of available data after range, set on return, can be NULL
+ * @func:	For tracing: name of calling function
+ * @line:	For tracing: caller line of function call
+ *
+ * Return: as packet_get_try_do() but log a trace message when returning NULL
+ */
+void *packet_get_do(const struct pool *p, const size_t idx,
+		    size_t offset, size_t len, size_t *left,
+		    const char *func, int line)
+{
+	void *r = packet_get_try_do(p, idx, offset, len, left, func, line);
+
+	if (!r) {
+		trace("missing packet data length %zu, offset %zu from "
+		      "length %zu, %s:%i",
+		      len, offset, p->pkt[idx].iov_len, func, line);
+	}
+
+	return r;
+}
+
 /**
  * pool_flush() - Flush a packet pool
  * @p:		Pointer to packet pool
diff --git a/packet.h b/packet.h
index 9061dad..c94780a 100644
--- a/packet.h
+++ b/packet.h
@@ -32,6 +32,9 @@ struct pool {
 int vu_packet_check_range(void *buf, const char *ptr, size_t len);
 void packet_add_do(struct pool *p, size_t len, const char *start,
 		   const char *func, int line);
+void *packet_get_try_do(const struct pool *p, const size_t idx,
+			size_t offset, size_t len, size_t *left,
+			const char *func, int line);
 void *packet_get_do(const struct pool *p, const size_t idx,
 		    size_t offset, size_t len, size_t *left,
 		    const char *func, int line);
@@ -41,12 +44,11 @@ void pool_flush(struct pool *p);
 #define packet_add(p, len, start)					\
 	packet_add_do(p, len, start, __func__, __LINE__)
 
+#define packet_get_try(p, idx, offset, len, left)			\
+	packet_get_try_do(p, idx, offset, len, left, __func__, __LINE__)
 #define packet_get(p, idx, offset, len, left)				\
 	packet_get_do(p, idx, offset, len, left, __func__, __LINE__)
 
-#define packet_get_try(p, idx, offset, len, left)			\
-	packet_get_do(p, idx, offset, len, left, NULL, 0)
-
 #define PACKET_POOL_DECL(_name, _size, _buf)				\
 struct _name ## _t {							\
 	char *buf;							\

From 9153aca15bc1150e450dd56e79bc035cc2dbf27c Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Mon, 17 Mar 2025 20:24:22 +1100
Subject: [PATCH 069/144] util: Add abort_with_msg() and ASSERT_WITH_MSG()
 helpers

We already have the ASSERT() macro which will abort() passt based on a
condition.  It always has a fixed error message based on its location and
the asserted expression.  We have some upcoming cases where we want to
customise the message when hitting an assert.

Add abort_with_msg() and ASSERT_WITH_MSG() helpers to allow this.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 util.c | 19 +++++++++++++++++++
 util.h | 25 ++++++++++---------------
 2 files changed, 29 insertions(+), 15 deletions(-)

diff --git a/util.c b/util.c
index 656e86a..b9a3d43 100644
--- a/util.c
+++ b/util.c
@@ -1017,3 +1017,22 @@ void encode_domain_name(char *buf, const char *domain_name)
 	}
 	p[i] = 0L;
 }
+
+/**
+ * abort_with_msg() - Print error message and abort
+ * @fmt:	Format string
+ * @...:	Format parameters
+ */
+void abort_with_msg(const char *fmt, ...)
+{
+	va_list ap;
+
+	va_start(ap, fmt);
+	vlogmsg(true, false, LOG_CRIT, fmt, ap);
+	va_end(ap);
+
+	/* This may actually cause a SIGSYS instead of SIGABRT, due to seccomp,
+	 * but that will still get the job done.
+	 */
+	abort();
+}
diff --git a/util.h b/util.h
index 4d512fa..b1e7e79 100644
--- a/util.h
+++ b/util.h
@@ -61,27 +61,22 @@
 #define STRINGIFY(x)	#x
 #define STR(x)		STRINGIFY(x)
 
-#ifdef CPPCHECK_6936
+void abort_with_msg(const char *fmt, ...)
+	__attribute__((format(printf, 1, 2), noreturn));
+
 /* Some cppcheck versions get confused by aborts inside a loop, causing
  * it to give false positive uninitialised variable warnings later in
  * the function, because it doesn't realise the non-initialising path
  * already exited.  See https://trac.cppcheck.net/ticket/13227
+ *
+ * Therefore, avoid using the usual do while wrapper we use to force the macro
+ * to act like a single statement requiring a ';'.
  */
-#define ASSERT(expr)		\
-	((expr) ? (void)0 : abort())
-#else
+#define ASSERT_WITH_MSG(expr, ...)					\
+	((expr) ? (void)0 : abort_with_msg(__VA_ARGS__))
 #define ASSERT(expr)							\
-	do {								\
-		if (!(expr)) {						\
-			err("ASSERTION FAILED in %s (%s:%d): %s",	\
-			    __func__, __FILE__, __LINE__, STRINGIFY(expr)); \
-			/* This may actually SIGSYS, due to seccomp,	\
-			 * but that will still get the job done		\
-			 */						\
-			abort();					\
-		}							\
-	} while (0)
-#endif
+	ASSERT_WITH_MSG((expr), "ASSSERTION FAILED in %s (%s:%d): %s",	\
+			__func__, __FILE__, __LINE__, STRINGIFY(expr))
 
 #ifdef P_tmpdir
 #define TMPDIR		P_tmpdir

From 0857515c943d439eade80710c16f15f146dfa9e8 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Mon, 17 Mar 2025 20:24:23 +1100
Subject: [PATCH 070/144] packet: ASSERT on signs of pool corruption

If packet_check_range() fails in packet_get_try_do() we just return NULL.
But this check only takes place after we've already validated the given
range against the packet it's in.  That means that if packet_check_range()
fails, the packet pool is already in a corrupted state (we should have
made strictly stronger checks when the packet was added).  Simply returning
NULL and logging a trace() level message isn't really adequate for that
situation; ASSERT instead.

Similarly we check the given idx against both p->count and p->size.  The
latter should be redundant, because count should always be <= size.  If
that's not the case then, again, the pool is already in a corrupted state
and we may have overwritten unknown memory.  Assert for this case too.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 packet.c | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/packet.c b/packet.c
index b3e8c79..be28f27 100644
--- a/packet.c
+++ b/packet.c
@@ -129,9 +129,13 @@ void *packet_get_try_do(const struct pool *p, size_t idx, size_t offset,
 {
 	char *ptr;
 
-	if (idx >= p->size || idx >= p->count) {
-		trace("packet %zu from pool size: %zu, count: %zu, %s:%i",
-		      idx, p->size, p->count, func, line);
+	ASSERT_WITH_MSG(p->count <= p->size,
+			"Corrupt pool count: %zu, size: %zu, %s:%i",
+			p->count, p->size, func, line);
+
+	if (idx >= p->count) {
+		trace("packet %zu from pool count: %zu, %s:%i",
+		      idx, p->count, func, line);
 		return NULL;
 	}
 
@@ -141,8 +145,8 @@ void *packet_get_try_do(const struct pool *p, size_t idx, size_t offset,
 
 	ptr = (char *)p->pkt[idx].iov_base + offset;
 
-	if (packet_check_range(p, ptr, len, func, line))
-		return NULL;
+	ASSERT_WITH_MSG(!packet_check_range(p, ptr, len, func, line),
+			"Corrupt packet pool, %s:%i", func, line);
 
 	if (left)
 		*left = p->pkt[idx].iov_len - offset - len;

From cf4d3f05c9263d1b0a88dbbcf9e48d34cac6708e Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Mon, 17 Mar 2025 20:24:24 +1100
Subject: [PATCH 071/144] packet: Upgrade severity of most packet errors

All errors from packet_check_range(), packet_add() and packet_get() are
trace level.  However, these are for the most part actual error conditions.
They're states that should not happen, in many cases indicating a bug
in the caller or elsewhere.

We don't promote these to err() or ASSERT() level, for fear of a localised
bug on very specific input crashing the entire program, or flooding the
logs, but we can at least upgrade them to debug level.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 packet.c | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/packet.c b/packet.c
index be28f27..72c6158 100644
--- a/packet.c
+++ b/packet.c
@@ -36,7 +36,7 @@ static int packet_check_range(const struct pool *p, const char *ptr, size_t len,
 			      const char *func, int line)
 {
 	if (len > PACKET_MAX_LEN) {
-		trace("packet range length %zu (max %zu), %s:%i",
+		debug("packet range length %zu (max %zu), %s:%i",
 		      len, PACKET_MAX_LEN, func, line);
 		return -1;
 	}
@@ -47,25 +47,25 @@ static int packet_check_range(const struct pool *p, const char *ptr, size_t len,
 		ret = vu_packet_check_range((void *)p->buf, ptr, len);
 
 		if (ret == -1)
-			trace("cannot find region, %s:%i", func, line);
+			debug("cannot find region, %s:%i", func, line);
 
 		return ret;
 	}
 
 	if (ptr < p->buf) {
-		trace("packet range start %p before buffer start %p, %s:%i",
+		debug("packet range start %p before buffer start %p, %s:%i",
 		      (void *)ptr, (void *)p->buf, func, line);
 		return -1;
 	}
 
 	if (len > p->buf_size) {
-		trace("packet range length %zu larger than buffer %zu, %s:%i",
+		debug("packet range length %zu larger than buffer %zu, %s:%i",
 		      len, p->buf_size, func, line);
 		return -1;
 	}
 
 	if ((size_t)(ptr - p->buf) > p->buf_size - len) {
-		trace("packet range %p, len %zu after buffer end %p, %s:%i",
+		debug("packet range %p, len %zu after buffer end %p, %s:%i",
 		      (void *)ptr, len, (void *)(p->buf + p->buf_size),
 		      func, line);
 		return -1;
@@ -98,7 +98,7 @@ void packet_add_do(struct pool *p, size_t len, const char *start,
 	size_t idx = p->count;
 
 	if (pool_full(p)) {
-		trace("add packet index %zu to pool with size %zu, %s:%i",
+		debug("add packet index %zu to pool with size %zu, %s:%i",
 		      idx, p->size, func, line);
 		return;
 	}
@@ -134,7 +134,7 @@ void *packet_get_try_do(const struct pool *p, size_t idx, size_t offset,
 			p->count, p->size, func, line);
 
 	if (idx >= p->count) {
-		trace("packet %zu from pool count: %zu, %s:%i",
+		debug("packet %zu from pool count: %zu, %s:%i",
 		      idx, p->count, func, line);
 		return NULL;
 	}

From 89b203b851f32a532cc0406cf26a1d24950a207c Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Wed, 26 Mar 2025 14:44:01 +1100
Subject: [PATCH 072/144] udp: Common invocation of udp_sock_errs() for
 vhost-user and "buf" paths

The vhost-user and non-vhost-user paths for both udp_listen_sock_handler()
and udp_reply_sock_handler() are more or less completely separate.  Both,
however, start with essentially the same invocation of udp_sock_errs(), so
that can be made common.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 udp.c          | 37 ++++++++++++++++++++-----------------
 udp_internal.h |  2 +-
 udp_vu.c       | 15 ---------------
 3 files changed, 21 insertions(+), 33 deletions(-)

diff --git a/udp.c b/udp.c
index 80520cb..4a06b16 100644
--- a/udp.c
+++ b/udp.c
@@ -585,7 +585,8 @@ static int udp_sock_recverr(const struct ctx *c, union epoll_ref ref)
  *
  * Return: Number of errors handled, or < 0 if we have an unrecoverable error
  */
-int udp_sock_errs(const struct ctx *c, union epoll_ref ref, uint32_t events)
+static int udp_sock_errs(const struct ctx *c, union epoll_ref ref,
+			 uint32_t events)
 {
 	unsigned n_err = 0;
 	socklen_t errlen;
@@ -678,13 +679,6 @@ static void udp_buf_listen_sock_handler(const struct ctx *c,
 	const socklen_t sasize = sizeof(udp_meta[0].s_in);
 	int n, i;
 
-	if (udp_sock_errs(c, ref, events) < 0) {
-		err("UDP: Unrecoverable error on listening socket:"
-		    " (%s port %hu)", pif_name(ref.udp.pif), ref.udp.port);
-		/* FIXME: what now?  close/re-open socket? */
-		return;
-	}
-
 	if ((n = udp_sock_recv(c, ref.fd, events, udp_mh_recv)) <= 0)
 		return;
 
@@ -750,6 +744,13 @@ void udp_listen_sock_handler(const struct ctx *c,
 			     union epoll_ref ref, uint32_t events,
 			     const struct timespec *now)
 {
+	if (udp_sock_errs(c, ref, events) < 0) {
+		err("UDP: Unrecoverable error on listening socket:"
+		    " (%s port %hu)", pif_name(ref.udp.pif), ref.udp.port);
+		/* FIXME: what now?  close/re-open socket? */
+		return;
+	}
+
 	if (c->mode == MODE_VU) {
 		udp_vu_listen_sock_handler(c, ref, events, now);
 		return;
@@ -777,17 +778,8 @@ static void udp_buf_reply_sock_handler(const struct ctx *c, union epoll_ref ref,
 	uint8_t topif = pif_at_sidx(tosidx);
 	int n, i, from_s;
 
-	ASSERT(!c->no_udp && uflow);
-
 	from_s = uflow->s[ref.flowside.sidei];
 
-	if (udp_sock_errs(c, ref, events) < 0) {
-		flow_err(uflow, "Unrecoverable error on reply socket");
-		flow_err_details(uflow);
-		udp_flow_close(c, uflow);
-		return;
-	}
-
 	if ((n = udp_sock_recv(c, from_s, events, udp_mh_recv)) <= 0)
 		return;
 
@@ -825,6 +817,17 @@ static void udp_buf_reply_sock_handler(const struct ctx *c, union epoll_ref ref,
 void udp_reply_sock_handler(const struct ctx *c, union epoll_ref ref,
 			    uint32_t events, const struct timespec *now)
 {
+	struct udp_flow *uflow = udp_at_sidx(ref.flowside);
+
+	ASSERT(!c->no_udp && uflow);
+
+	if (udp_sock_errs(c, ref, events) < 0) {
+		flow_err(uflow, "Unrecoverable error on reply socket");
+		flow_err_details(uflow);
+		udp_flow_close(c, uflow);
+		return;
+	}
+
 	if (c->mode == MODE_VU) {
 		udp_vu_reply_sock_handler(c, ref, events, now);
 		return;
diff --git a/udp_internal.h b/udp_internal.h
index 3b081f5..02724e5 100644
--- a/udp_internal.h
+++ b/udp_internal.h
@@ -30,5 +30,5 @@ size_t udp_update_hdr4(struct iphdr *ip4h, struct udp_payload_t *bp,
 size_t udp_update_hdr6(struct ipv6hdr *ip6h, struct udp_payload_t *bp,
                        const struct flowside *toside, size_t dlen,
 		       bool no_udp_csum);
-int udp_sock_errs(const struct ctx *c, union epoll_ref ref, uint32_t events);
+
 #endif /* UDP_INTERNAL_H */
diff --git a/udp_vu.c b/udp_vu.c
index c26a223..84f52af 100644
--- a/udp_vu.c
+++ b/udp_vu.c
@@ -227,12 +227,6 @@ void udp_vu_listen_sock_handler(const struct ctx *c, union epoll_ref ref,
 	struct vu_virtq *vq = &vdev->vq[VHOST_USER_RX_QUEUE];
 	int i;
 
-	if (udp_sock_errs(c, ref, events) < 0) {
-		err("UDP: Unrecoverable error on listening socket:"
-		    " (%s port %hu)", pif_name(ref.udp.pif), ref.udp.port);
-		return;
-	}
-
 	for (i = 0; i < UDP_MAX_FRAMES; i++) {
 		const struct flowside *toside;
 		union sockaddr_inany s_in;
@@ -300,15 +294,6 @@ void udp_vu_reply_sock_handler(const struct ctx *c, union epoll_ref ref,
 	struct vu_virtq *vq = &vdev->vq[VHOST_USER_RX_QUEUE];
 	int i;
 
-	ASSERT(!c->no_udp);
-
-	if (udp_sock_errs(c, ref, events) < 0) {
-		flow_err(uflow, "Unrecoverable error on reply socket");
-		flow_err_details(uflow);
-		udp_flow_close(c, uflow);
-		return;
-	}
-
 	for (i = 0; i < UDP_MAX_FRAMES; i++) {
 		uint8_t topif = pif_at_sidx(tosidx);
 		ssize_t dlen;

From 5a977c2f4ee8926673554b2b456e7791962b2ce2 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Wed, 26 Mar 2025 14:44:02 +1100
Subject: [PATCH 073/144] udp: Simplify checking of epoll event bits

udp_{listen,reply}_sock_handler() can accept both EPOLLERR and EPOLLIN
events.  However, unlike most epoll event handlers we don't check the
event bits right there.  EPOLLERR is checked within udp_sock_errs() which
we call unconditionally.  Checking EPOLLIN is still more buried: it is
checked within both udp_sock_recv() and udp_vu_sock_recv().

We can simplify the logic and pass fewer extraneous parameters around by
moving the checking of the event bits to the top level event handlers.

This makes udp_{buf,vu}_{listen,reply}_sock_handler() no longer general
event handlers, but specific to EPOLLIN events, meaning new data.  So,
rename those functions to udp_{buf,vu}_{listen,reply}_sock_data() to better
reflect their function.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 udp.c    | 78 ++++++++++++++++++++++++--------------------------------
 udp_vu.c | 25 +++++++-----------
 udp_vu.h |  8 +++---
 3 files changed, 47 insertions(+), 64 deletions(-)

diff --git a/udp.c b/udp.c
index 4a06b16..26a91c9 100644
--- a/udp.c
+++ b/udp.c
@@ -581,12 +581,10 @@ static int udp_sock_recverr(const struct ctx *c, union epoll_ref ref)
  * udp_sock_errs() - Process errors on a socket
  * @c:		Execution context
  * @ref:	epoll reference
- * @events:	epoll events bitmap
  *
  * Return: Number of errors handled, or < 0 if we have an unrecoverable error
  */
-static int udp_sock_errs(const struct ctx *c, union epoll_ref ref,
-			 uint32_t events)
+static int udp_sock_errs(const struct ctx *c, union epoll_ref ref)
 {
 	unsigned n_err = 0;
 	socklen_t errlen;
@@ -595,9 +593,6 @@ static int udp_sock_errs(const struct ctx *c, union epoll_ref ref,
 
 	ASSERT(!c->no_udp);
 
-	if (!(events & EPOLLERR))
-		return 0; /* Nothing to do */
-
 	/* Empty the error queue */
 	while ((rc = udp_sock_recverr(c, ref)) > 0)
 		n_err += rc;
@@ -630,15 +625,13 @@ static int udp_sock_errs(const struct ctx *c, union epoll_ref ref,
  * udp_sock_recv() - Receive datagrams from a socket
  * @c:		Execution context
  * @s:		Socket to receive from
- * @events:	epoll events bitmap
  * @mmh		mmsghdr array to receive into
  *
  * Return: Number of datagrams received
  *
  * #syscalls recvmmsg arm:recvmmsg_time64 i686:recvmmsg_time64
  */
-static int udp_sock_recv(const struct ctx *c, int s, uint32_t events,
-			 struct mmsghdr *mmh)
+static int udp_sock_recv(const struct ctx *c, int s, struct mmsghdr *mmh)
 {
 	/* For not entirely clear reasons (data locality?) pasta gets better
 	 * throughput if we receive tap datagrams one at a atime.  For small
@@ -651,9 +644,6 @@ static int udp_sock_recv(const struct ctx *c, int s, uint32_t events,
 
 	ASSERT(!c->no_udp);
 
-	if (!(events & EPOLLIN))
-		return 0;
-
 	n = recvmmsg(s, mmh, n, 0, NULL);
 	if (n < 0) {
 		err_perror("Error receiving datagrams");
@@ -664,22 +654,20 @@ static int udp_sock_recv(const struct ctx *c, int s, uint32_t events,
 }
 
 /**
- * udp_buf_listen_sock_handler() - Handle new data from socket
+ * udp_buf_listen_sock_data() - Handle new data from socket
  * @c:		Execution context
  * @ref:	epoll reference
- * @events:	epoll events bitmap
  * @now:	Current timestamp
  *
  * #syscalls recvmmsg
  */
-static void udp_buf_listen_sock_handler(const struct ctx *c,
-					union epoll_ref ref, uint32_t events,
-					const struct timespec *now)
+static void udp_buf_listen_sock_data(const struct ctx *c, union epoll_ref ref,
+				     const struct timespec *now)
 {
 	const socklen_t sasize = sizeof(udp_meta[0].s_in);
 	int n, i;
 
-	if ((n = udp_sock_recv(c, ref.fd, events, udp_mh_recv)) <= 0)
+	if ((n = udp_sock_recv(c, ref.fd, udp_mh_recv)) <= 0)
 		return;
 
 	/* We divide datagrams into batches based on how we need to send them,
@@ -744,33 +732,33 @@ void udp_listen_sock_handler(const struct ctx *c,
 			     union epoll_ref ref, uint32_t events,
 			     const struct timespec *now)
 {
-	if (udp_sock_errs(c, ref, events) < 0) {
-		err("UDP: Unrecoverable error on listening socket:"
-		    " (%s port %hu)", pif_name(ref.udp.pif), ref.udp.port);
-		/* FIXME: what now?  close/re-open socket? */
-		return;
+	if (events & EPOLLERR) {
+		if (udp_sock_errs(c, ref) < 0) {
+			err("UDP: Unrecoverable error on listening socket:"
+			    " (%s port %hu)", pif_name(ref.udp.pif), ref.udp.port);
+			/* FIXME: what now?  close/re-open socket? */
+			return;
+		}
 	}
 
-	if (c->mode == MODE_VU) {
-		udp_vu_listen_sock_handler(c, ref, events, now);
-		return;
+	if (events & EPOLLIN) {
+		if (c->mode == MODE_VU)
+			udp_vu_listen_sock_data(c, ref, now);
+		else
+			udp_buf_listen_sock_data(c, ref, now);
 	}
-
-	udp_buf_listen_sock_handler(c, ref, events, now);
 }
 
 /**
- * udp_buf_reply_sock_handler() - Handle new data from flow specific socket
+ * udp_buf_reply_sock_data() - Handle new data from flow specific socket
  * @c:		Execution context
  * @ref:	epoll reference
- * @events:	epoll events bitmap
  * @now:	Current timestamp
  *
  * #syscalls recvmmsg
  */
-static void udp_buf_reply_sock_handler(const struct ctx *c, union epoll_ref ref,
-				       uint32_t events,
-				       const struct timespec *now)
+static void udp_buf_reply_sock_data(const struct ctx *c, union epoll_ref ref,
+				    const struct timespec *now)
 {
 	flow_sidx_t tosidx = flow_sidx_opposite(ref.flowside);
 	const struct flowside *toside = flowside_at_sidx(tosidx);
@@ -780,7 +768,7 @@ static void udp_buf_reply_sock_handler(const struct ctx *c, union epoll_ref ref,
 
 	from_s = uflow->s[ref.flowside.sidei];
 
-	if ((n = udp_sock_recv(c, from_s, events, udp_mh_recv)) <= 0)
+	if ((n = udp_sock_recv(c, from_s, udp_mh_recv)) <= 0)
 		return;
 
 	flow_trace(uflow, "Received %d datagrams on reply socket", n);
@@ -821,19 +809,21 @@ void udp_reply_sock_handler(const struct ctx *c, union epoll_ref ref,
 
 	ASSERT(!c->no_udp && uflow);
 
-	if (udp_sock_errs(c, ref, events) < 0) {
-		flow_err(uflow, "Unrecoverable error on reply socket");
-		flow_err_details(uflow);
-		udp_flow_close(c, uflow);
-		return;
+	if (events & EPOLLERR) {
+		if (udp_sock_errs(c, ref) < 0) {
+			flow_err(uflow, "Unrecoverable error on reply socket");
+			flow_err_details(uflow);
+			udp_flow_close(c, uflow);
+			return;
+		}
 	}
 
-	if (c->mode == MODE_VU) {
-		udp_vu_reply_sock_handler(c, ref, events, now);
-		return;
+	if (events & EPOLLIN) {
+		if (c->mode == MODE_VU)
+			udp_vu_reply_sock_data(c, ref, now);
+		else
+			udp_buf_reply_sock_data(c, ref, now);
 	}
-
-	udp_buf_reply_sock_handler(c, ref, events, now);
 }
 
 /**
diff --git a/udp_vu.c b/udp_vu.c
index 84f52af..698667f 100644
--- a/udp_vu.c
+++ b/udp_vu.c
@@ -78,14 +78,12 @@ static int udp_vu_sock_info(int s, union sockaddr_inany *s_in)
  * udp_vu_sock_recv() - Receive datagrams from socket into vhost-user buffers
  * @c:		Execution context
  * @s:		Socket to receive from
- * @events:	epoll events bitmap
  * @v6:		Set for IPv6 connections
  * @dlen:	Size of received data (output)
  *
  * Return: Number of iov entries used to store the datagram
  */
-static int udp_vu_sock_recv(const struct ctx *c, int s, uint32_t events,
-			    bool v6, ssize_t *dlen)
+static int udp_vu_sock_recv(const struct ctx *c, int s, bool v6, ssize_t *dlen)
 {
 	struct vu_dev *vdev = c->vdev;
 	struct vu_virtq *vq = &vdev->vq[VHOST_USER_RX_QUEUE];
@@ -95,9 +93,6 @@ static int udp_vu_sock_recv(const struct ctx *c, int s, uint32_t events,
 
 	ASSERT(!c->no_udp);
 
-	if (!(events & EPOLLIN))
-		return 0;
-
 	/* compute L2 header length */
 	hdrlen = udp_vu_hdrlen(v6);
 
@@ -214,14 +209,13 @@ static void udp_vu_csum(const struct flowside *toside, int iov_used)
 }
 
 /**
- * udp_vu_listen_sock_handler() - Handle new data from socket
+ * udp_vu_listen_sock_data() - Handle new data from socket
  * @c:		Execution context
  * @ref:	epoll reference
- * @events:	epoll events bitmap
  * @now:	Current timestamp
  */
-void udp_vu_listen_sock_handler(const struct ctx *c, union epoll_ref ref,
-				uint32_t events, const struct timespec *now)
+void udp_vu_listen_sock_data(const struct ctx *c, union epoll_ref ref,
+			     const struct timespec *now)
 {
 	struct vu_dev *vdev = c->vdev;
 	struct vu_virtq *vq = &vdev->vq[VHOST_USER_RX_QUEUE];
@@ -262,7 +256,7 @@ void udp_vu_listen_sock_handler(const struct ctx *c, union epoll_ref ref,
 
 		v6 = !(inany_v4(&toside->eaddr) && inany_v4(&toside->oaddr));
 
-		iov_used = udp_vu_sock_recv(c, ref.fd, events, v6, &dlen);
+		iov_used = udp_vu_sock_recv(c, ref.fd, v6, &dlen);
 		if (iov_used <= 0)
 			break;
 
@@ -277,14 +271,13 @@ void udp_vu_listen_sock_handler(const struct ctx *c, union epoll_ref ref,
 }
 
 /**
- * udp_vu_reply_sock_handler() - Handle new data from flow specific socket
+ * udp_vu_reply_sock_data() - Handle new data from flow specific socket
  * @c:		Execution context
  * @ref:	epoll reference
- * @events:	epoll events bitmap
  * @now:	Current timestamp
  */
-void udp_vu_reply_sock_handler(const struct ctx *c, union epoll_ref ref,
-			        uint32_t events, const struct timespec *now)
+void udp_vu_reply_sock_data(const struct ctx *c, union epoll_ref ref,
+			    const struct timespec *now)
 {
 	flow_sidx_t tosidx = flow_sidx_opposite(ref.flowside);
 	const struct flowside *toside = flowside_at_sidx(tosidx);
@@ -313,7 +306,7 @@ void udp_vu_reply_sock_handler(const struct ctx *c, union epoll_ref ref,
 
 		v6 = !(inany_v4(&toside->eaddr) && inany_v4(&toside->oaddr));
 
-		iov_used = udp_vu_sock_recv(c, from_s, events, v6, &dlen);
+		iov_used = udp_vu_sock_recv(c, from_s, v6, &dlen);
 		if (iov_used <= 0)
 			break;
 		flow_trace(uflow, "Received 1 datagram on reply socket");
diff --git a/udp_vu.h b/udp_vu.h
index ba7018d..4f2262d 100644
--- a/udp_vu.h
+++ b/udp_vu.h
@@ -6,8 +6,8 @@
 #ifndef UDP_VU_H
 #define UDP_VU_H
 
-void udp_vu_listen_sock_handler(const struct ctx *c, union epoll_ref ref,
-				uint32_t events, const struct timespec *now);
-void udp_vu_reply_sock_handler(const struct ctx *c, union epoll_ref ref,
-			       uint32_t events, const struct timespec *now);
+void udp_vu_listen_sock_data(const struct ctx *c, union epoll_ref ref,
+			     const struct timespec *now);
+void udp_vu_reply_sock_data(const struct ctx *c, union epoll_ref ref,
+			    const struct timespec *now);
 #endif /* UDP_VU_H */

From d924b7dfc40cfaf9ebc64fe052efd8b0c45c6478 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Wed, 26 Mar 2025 14:44:03 +1100
Subject: [PATCH 074/144] udp_vu: Factor things out of udp_vu_reply_sock_data()
 loop

At the start of every cycle of the loop in udp_vu_reply_sock_data() we:
 - ASSERT that uflow is not NULL
 - Check if the target pif is PIF_TAP
 - Initialize the v6 boolean

However, all of these depend only on the flow, which doesn't change across
the loop.  This is probably a duplication from udp_vu_listen_sock_data(),
where the flow can be different for each packet.  For the reply socket
case, however, factor that logic out of the loop.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 udp_vu.c | 28 +++++++++++++---------------
 1 file changed, 13 insertions(+), 15 deletions(-)

diff --git a/udp_vu.c b/udp_vu.c
index 698667f..6e1823a 100644
--- a/udp_vu.c
+++ b/udp_vu.c
@@ -281,30 +281,28 @@ void udp_vu_reply_sock_data(const struct ctx *c, union epoll_ref ref,
 {
 	flow_sidx_t tosidx = flow_sidx_opposite(ref.flowside);
 	const struct flowside *toside = flowside_at_sidx(tosidx);
+	bool v6 = !(inany_v4(&toside->eaddr) && inany_v4(&toside->oaddr));
 	struct udp_flow *uflow = udp_at_sidx(ref.flowside);
 	int from_s = uflow->s[ref.flowside.sidei];
 	struct vu_dev *vdev = c->vdev;
 	struct vu_virtq *vq = &vdev->vq[VHOST_USER_RX_QUEUE];
+	uint8_t topif = pif_at_sidx(tosidx);
 	int i;
 
+	ASSERT(uflow);
+
+	if (topif != PIF_TAP) {
+		uint8_t frompif = pif_at_sidx(ref.flowside);
+
+		flow_err(uflow,
+			 "No support for forwarding UDP from %s to %s",
+			 pif_name(frompif), pif_name(topif));
+		return;
+	}
+
 	for (i = 0; i < UDP_MAX_FRAMES; i++) {
-		uint8_t topif = pif_at_sidx(tosidx);
 		ssize_t dlen;
 		int iov_used;
-		bool v6;
-
-		ASSERT(uflow);
-
-		if (topif != PIF_TAP) {
-			uint8_t frompif = pif_at_sidx(ref.flowside);
-
-			flow_err(uflow,
-				 "No support for forwarding UDP from %s to %s",
-				 pif_name(frompif), pif_name(topif));
-			continue;
-		}
-
-		v6 = !(inany_v4(&toside->eaddr) && inany_v4(&toside->oaddr));
 
 		iov_used = udp_vu_sock_recv(c, from_s, v6, &dlen);
 		if (iov_used <= 0)

From 269cf6a12a5f89683daa8da9232cc2524d7a4ae2 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Wed, 26 Mar 2025 14:44:04 +1100
Subject: [PATCH 075/144] udp: Share more logic between vu and non-vu reply
 socket paths

Share some additional miscellaneous logic between the vhost-user and "buf"
paths for data on udp reply sockets.  The biggest piece is error handling
of cases where we can't forward between the two pifs of the flow.  We also
make common some more simple logic locating the correct flow and its
parameters.

This adds some lines of code due to extra comment lines, but nonetheless
reduces logic duplication.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 udp.c    | 41 ++++++++++++++++++++++++++---------------
 udp_vu.c | 26 +++++++++++---------------
 udp_vu.h |  3 ++-
 3 files changed, 39 insertions(+), 31 deletions(-)

diff --git a/udp.c b/udp.c
index 26a91c9..f417cea 100644
--- a/udp.c
+++ b/udp.c
@@ -752,24 +752,25 @@ void udp_listen_sock_handler(const struct ctx *c,
 /**
  * udp_buf_reply_sock_data() - Handle new data from flow specific socket
  * @c:		Execution context
- * @ref:	epoll reference
+ * @s:		Socket to read data from
+ * @tosidx:	Flow & side to forward data from @s to
  * @now:	Current timestamp
  *
+ * Return: true on success, false if can't forward from socket to flow's pif
+ *
  * #syscalls recvmmsg
  */
-static void udp_buf_reply_sock_data(const struct ctx *c, union epoll_ref ref,
+static bool udp_buf_reply_sock_data(const struct ctx *c,
+				    int s, flow_sidx_t tosidx,
 				    const struct timespec *now)
 {
-	flow_sidx_t tosidx = flow_sidx_opposite(ref.flowside);
 	const struct flowside *toside = flowside_at_sidx(tosidx);
-	struct udp_flow *uflow = udp_at_sidx(ref.flowside);
+	struct udp_flow *uflow = udp_at_sidx(tosidx);
 	uint8_t topif = pif_at_sidx(tosidx);
-	int n, i, from_s;
+	int n, i;
 
-	from_s = uflow->s[ref.flowside.sidei];
-
-	if ((n = udp_sock_recv(c, from_s, udp_mh_recv)) <= 0)
-		return;
+	if ((n = udp_sock_recv(c, s, udp_mh_recv)) <= 0)
+		return true;
 
 	flow_trace(uflow, "Received %d datagrams on reply socket", n);
 	uflow->ts = now->tv_sec;
@@ -788,11 +789,10 @@ static void udp_buf_reply_sock_data(const struct ctx *c, union epoll_ref ref,
 	} else if (topif == PIF_TAP) {
 		tap_send_frames(c, &udp_l2_iov[0][0], UDP_NUM_IOVS, n);
 	} else {
-		uint8_t frompif = pif_at_sidx(ref.flowside);
-
-		flow_err(uflow, "No support for forwarding UDP from %s to %s",
-			 pif_name(frompif), pif_name(topif));
+		return false;
 	}
+
+	return true;
 }
 
 /**
@@ -819,10 +819,21 @@ void udp_reply_sock_handler(const struct ctx *c, union epoll_ref ref,
 	}
 
 	if (events & EPOLLIN) {
+		flow_sidx_t tosidx = flow_sidx_opposite(ref.flowside);
+		int s = ref.fd;
+		bool ret;
+
 		if (c->mode == MODE_VU)
-			udp_vu_reply_sock_data(c, ref, now);
+			ret = udp_vu_reply_sock_data(c, s, tosidx, now);
 		else
-			udp_buf_reply_sock_data(c, ref, now);
+			ret = udp_buf_reply_sock_data(c, s, tosidx, now);
+
+		if (!ret) {
+			flow_err(uflow,
+				 "No support for forwarding UDP from %s to %s",
+				 pif_name(pif_at_sidx(ref.flowside)),
+				 pif_name(pif_at_sidx(tosidx)));
+		}
 	}
 }
 
diff --git a/udp_vu.c b/udp_vu.c
index 6e1823a..06bdeae 100644
--- a/udp_vu.c
+++ b/udp_vu.c
@@ -273,38 +273,32 @@ void udp_vu_listen_sock_data(const struct ctx *c, union epoll_ref ref,
 /**
  * udp_vu_reply_sock_data() - Handle new data from flow specific socket
  * @c:		Execution context
- * @ref:	epoll reference
+ * @s:		Socket to read data from
+ * @tosidx:	Flow & side to forward data from @s to
  * @now:	Current timestamp
+ *
+ * Return: true on success, false if can't forward from socket to flow's pif
  */
-void udp_vu_reply_sock_data(const struct ctx *c, union epoll_ref ref,
+bool udp_vu_reply_sock_data(const struct ctx *c, int s, flow_sidx_t tosidx,
 			    const struct timespec *now)
 {
-	flow_sidx_t tosidx = flow_sidx_opposite(ref.flowside);
 	const struct flowside *toside = flowside_at_sidx(tosidx);
 	bool v6 = !(inany_v4(&toside->eaddr) && inany_v4(&toside->oaddr));
-	struct udp_flow *uflow = udp_at_sidx(ref.flowside);
-	int from_s = uflow->s[ref.flowside.sidei];
+	struct udp_flow *uflow = udp_at_sidx(tosidx);
 	struct vu_dev *vdev = c->vdev;
 	struct vu_virtq *vq = &vdev->vq[VHOST_USER_RX_QUEUE];
-	uint8_t topif = pif_at_sidx(tosidx);
 	int i;
 
 	ASSERT(uflow);
 
-	if (topif != PIF_TAP) {
-		uint8_t frompif = pif_at_sidx(ref.flowside);
-
-		flow_err(uflow,
-			 "No support for forwarding UDP from %s to %s",
-			 pif_name(frompif), pif_name(topif));
-		return;
-	}
+	if (pif_at_sidx(tosidx) != PIF_TAP)
+		return false;
 
 	for (i = 0; i < UDP_MAX_FRAMES; i++) {
 		ssize_t dlen;
 		int iov_used;
 
-		iov_used = udp_vu_sock_recv(c, from_s, v6, &dlen);
+		iov_used = udp_vu_sock_recv(c, s, v6, &dlen);
 		if (iov_used <= 0)
 			break;
 		flow_trace(uflow, "Received 1 datagram on reply socket");
@@ -318,4 +312,6 @@ void udp_vu_reply_sock_data(const struct ctx *c, union epoll_ref ref,
 		}
 		vu_flush(vdev, vq, elem, iov_used);
 	}
+
+	return true;
 }
diff --git a/udp_vu.h b/udp_vu.h
index 4f2262d..2299b51 100644
--- a/udp_vu.h
+++ b/udp_vu.h
@@ -8,6 +8,7 @@
 
 void udp_vu_listen_sock_data(const struct ctx *c, union epoll_ref ref,
 			     const struct timespec *now);
-void udp_vu_reply_sock_data(const struct ctx *c, union epoll_ref ref,
+bool udp_vu_reply_sock_data(const struct ctx *c, int s, flow_sidx_t tosidx,
 			    const struct timespec *now);
+
 #endif /* UDP_VU_H */

From f67c488b81ca2a4d9f819b625fceab10b71fc3a5 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Wed, 26 Mar 2025 14:44:05 +1100
Subject: [PATCH 076/144] udp: Better handling of failure to forward from reply
 socket

In udp_reply_sock_handler() if we're unable to forward the datagrams we
just print an error.  Generally this means we have an unsupported pair of
pifs in the flow table, though, and that hasn't changed.  So, next time we
get a matching packet we'll just get the same failure.  In vhost-user mode
we don't even dequeue the incoming packets which triggered this so we're
likely to get the same failure immediately.

Instead, close the flow, in the same way we do for an unrecoverable error.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 udp.c | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/udp.c b/udp.c
index f417cea..96e48dd 100644
--- a/udp.c
+++ b/udp.c
@@ -812,9 +812,7 @@ void udp_reply_sock_handler(const struct ctx *c, union epoll_ref ref,
 	if (events & EPOLLERR) {
 		if (udp_sock_errs(c, ref) < 0) {
 			flow_err(uflow, "Unrecoverable error on reply socket");
-			flow_err_details(uflow);
-			udp_flow_close(c, uflow);
-			return;
+			goto fail;
 		}
 	}
 
@@ -829,12 +827,15 @@ void udp_reply_sock_handler(const struct ctx *c, union epoll_ref ref,
 			ret = udp_buf_reply_sock_data(c, s, tosidx, now);
 
 		if (!ret) {
-			flow_err(uflow,
-				 "No support for forwarding UDP from %s to %s",
-				 pif_name(pif_at_sidx(ref.flowside)),
-				 pif_name(pif_at_sidx(tosidx)));
+			flow_err(uflow, "Unable to forward UDP");
+			goto fail;
 		}
 	}
+	return;
+
+fail:
+	flow_err_details(uflow);
+	udp_flow_close(c, uflow);
 }
 
 /**

From 37d78c9ef3944c1b060e3e8259b82fea3f8ec6bf Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Wed, 26 Mar 2025 14:44:06 +1100
Subject: [PATCH 077/144] udp: Always hash socket facing flowsides

For UDP packets from the tap interface (like TCP) we use a hash table to
look up which flow they belong to.  Unlike TCP, we sometimes also create a
hash table entry for the socket side of UDP flows.  We need that when we
receive a UDP packet from a "listening" socket which isn't specific to a
single flow.

At present we only do this for the initiating side of flows, which re-use
the listening socket.  For the target side we use a connected "reply"
socket specific to the single flow.

We have in mind changes that may introduce some edge cases where we could
receive UDP packets on a non flow specific socket more often.  To allow for
those changes - and slightly simplifying things in the meantime - always
put both sides of a UDP flow - tap or socket - in the hash table.  It's
not that costly, and means we always have the option of falling back to a
hash lookup.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 udp_flow.c | 41 ++++++++++++++++++++---------------------
 1 file changed, 20 insertions(+), 21 deletions(-)

diff --git a/udp_flow.c b/udp_flow.c
index c6b8630..7e80924 100644
--- a/udp_flow.c
+++ b/udp_flow.c
@@ -41,25 +41,23 @@ struct udp_flow *udp_at_sidx(flow_sidx_t sidx)
  */
 void udp_flow_close(const struct ctx *c, struct udp_flow *uflow)
 {
+	unsigned sidei;
+
 	if (uflow->closed)
 		return; /* Nothing to do */
 
-	if (uflow->s[INISIDE] >= 0) {
-		/* The listening socket needs to stay in epoll */
-		close(uflow->s[INISIDE]);
-		uflow->s[INISIDE] = -1;
+	flow_foreach_sidei(sidei) {
+		flow_hash_remove(c, FLOW_SIDX(uflow, sidei));
+		if (uflow->s[sidei] >= 0) {
+			/* The listening socket needs to stay in epoll, but the
+			 * flow specific one needs to be removed */
+			if (sidei == TGTSIDE)
+				epoll_del(c, uflow->s[sidei]);
+			close(uflow->s[sidei]);
+			uflow->s[sidei] = -1;
+		}
 	}
 
-	if (uflow->s[TGTSIDE] >= 0) {
-		/* But the flow specific one needs to be removed */
-		epoll_del(c, uflow->s[TGTSIDE]);
-		close(uflow->s[TGTSIDE]);
-		uflow->s[TGTSIDE] = -1;
-	}
-	flow_hash_remove(c, FLOW_SIDX(uflow, INISIDE));
-	if (!pif_is_socket(uflow->f.pif[TGTSIDE]))
-		flow_hash_remove(c, FLOW_SIDX(uflow, TGTSIDE));
-
 	uflow->closed = true;
 }
 
@@ -77,6 +75,7 @@ static flow_sidx_t udp_flow_new(const struct ctx *c, union flow *flow,
 {
 	struct udp_flow *uflow = NULL;
 	const struct flowside *tgt;
+	unsigned sidei;
 	uint8_t tgtpif;
 
 	if (!(tgt = flow_target(c, flow, IPPROTO_UDP)))
@@ -143,14 +142,14 @@ static flow_sidx_t udp_flow_new(const struct ctx *c, union flow *flow,
 		}
 	}
 
-	flow_hash_insert(c, FLOW_SIDX(uflow, INISIDE));
-
-	/* If the target side is a socket, it will be a reply socket that knows
-	 * its own flowside.  But if it's tap, then we need to look it up by
-	 * hash.
+	/* Tap sides always need to be looked up by hash.  Socket sides don't
+	 * always, but sometimes do (receiving packets on a socket not specific
+	 * to one flow).  Unconditionally hash both sides so all our bases are
+	 * covered
 	 */
-	if (!pif_is_socket(tgtpif))
-		flow_hash_insert(c, FLOW_SIDX(uflow, TGTSIDE));
+	flow_foreach_sidei(sidei)
+		flow_hash_insert(c, FLOW_SIDX(uflow, sidei));
+
 	FLOW_ACTIVATE(uflow);
 
 	return FLOW_SIDX(uflow, TGTSIDE);

From 77883fbdd17e836247f746d888dcad3f611a6a59 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Wed, 26 Mar 2025 14:44:07 +1100
Subject: [PATCH 078/144] udp: Add helper function for creating connected UDP
 socket

Currently udp_flow_new() open codes creating and connecting a socket to use
for reply messages.  We have in mind some more places to use this logic,
plus it just makes for a rather large function.  Split this handling out
into a new udp_flow_sock() function.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 udp_flow.c | 104 +++++++++++++++++++++++++++++------------------------
 1 file changed, 58 insertions(+), 46 deletions(-)

diff --git a/udp_flow.c b/udp_flow.c
index 7e80924..bf4b896 100644
--- a/udp_flow.c
+++ b/udp_flow.c
@@ -61,6 +61,61 @@ void udp_flow_close(const struct ctx *c, struct udp_flow *uflow)
 	uflow->closed = true;
 }
 
+/**
+ * udp_flow_sock() - Create, bind and connect a flow specific UDP socket
+ * @c:		Execution context
+ * @uflow:	UDP flow to open socket for
+ * @sidei:	Side of @uflow to open socket for
+ *
+ * Return: fd of new socket on success, -ve error code on failure
+ */
+static int udp_flow_sock(const struct ctx *c,
+			 const struct udp_flow *uflow, unsigned sidei)
+{
+	const struct flowside *side = &uflow->f.side[sidei];
+	struct mmsghdr discard[UIO_MAXIOV] = { 0 };
+	uint8_t pif = uflow->f.pif[sidei];
+	union {
+		flow_sidx_t sidx;
+		uint32_t data;
+	} fref = { .sidx = FLOW_SIDX(uflow, sidei) };
+	int rc, s;
+
+	s = flowside_sock_l4(c, EPOLL_TYPE_UDP_REPLY, pif, side, fref.data);
+	if (s < 0) {
+		flow_dbg_perror(uflow, "Couldn't open flow specific socket");
+		return s;
+	}
+
+	if (flowside_connect(c, s, pif, side) < 0) {
+		rc = -errno;
+		flow_dbg_perror(uflow, "Couldn't connect flow socket");
+		return rc;
+	}
+
+	/* It's possible, if unlikely, that we could receive some unrelated
+	 * packets in between the bind() and connect() of this socket.  For now
+	 * we just discard these.
+	 *
+	 * FIXME: Redirect these to an appropriate handler
+	 */
+	rc = recvmmsg(s, discard, ARRAY_SIZE(discard), MSG_DONTWAIT, NULL);
+	if (rc >= ARRAY_SIZE(discard)) {
+		flow_dbg(uflow, "Too many (%d) spurious reply datagrams", rc);
+		return -E2BIG;
+	}
+
+	if (rc > 0) {
+		flow_trace(uflow, "Discarded %d spurious reply datagrams", rc);
+	} else if (errno != EAGAIN) {
+		rc = -errno;
+		flow_perror(uflow, "Unexpected error discarding datagrams");
+		return rc;
+	}
+
+	return s;
+}
+
 /**
  * udp_flow_new() - Common setup for a new UDP flow
  * @c:		Execution context
@@ -74,13 +129,10 @@ static flow_sidx_t udp_flow_new(const struct ctx *c, union flow *flow,
 				int s_ini, const struct timespec *now)
 {
 	struct udp_flow *uflow = NULL;
-	const struct flowside *tgt;
 	unsigned sidei;
-	uint8_t tgtpif;
 
-	if (!(tgt = flow_target(c, flow, IPPROTO_UDP)))
+	if (!flow_target(c, flow, IPPROTO_UDP))
 		goto cancel;
-	tgtpif = flow->f.pif[TGTSIDE];
 
 	uflow = FLOW_SET_TYPE(flow, FLOW_UDP, udp);
 	uflow->ts = now->tv_sec;
@@ -98,49 +150,9 @@ static flow_sidx_t udp_flow_new(const struct ctx *c, union flow *flow,
 		}
 	}
 
-	if (pif_is_socket(tgtpif)) {
-		struct mmsghdr discard[UIO_MAXIOV] = { 0 };
-		union {
-			flow_sidx_t sidx;
-			uint32_t data;
-		} fref = {
-			.sidx = FLOW_SIDX(flow, TGTSIDE),
-		};
-		int rc;
-
-		uflow->s[TGTSIDE] = flowside_sock_l4(c, EPOLL_TYPE_UDP_REPLY,
-						     tgtpif, tgt, fref.data);
-		if (uflow->s[TGTSIDE] < 0) {
-			flow_dbg_perror(uflow,
-					"Couldn't open socket for spliced flow");
+	if (pif_is_socket(flow->f.pif[TGTSIDE]))
+		if ((uflow->s[TGTSIDE] = udp_flow_sock(c, uflow, TGTSIDE)) < 0)
 			goto cancel;
-		}
-
-		if (flowside_connect(c, uflow->s[TGTSIDE], tgtpif, tgt) < 0) {
-			flow_dbg_perror(uflow, "Couldn't connect flow socket");
-			goto cancel;
-		}
-
-		/* It's possible, if unlikely, that we could receive some
-		 * unrelated packets in between the bind() and connect() of this
-		 * socket.  For now we just discard these.  We could consider
-		 * trying to redirect these to an appropriate handler, if we
-		 * need to.
-		 */
-		rc = recvmmsg(uflow->s[TGTSIDE], discard, ARRAY_SIZE(discard),
-			      MSG_DONTWAIT, NULL);
-		if (rc >= ARRAY_SIZE(discard)) {
-			flow_dbg(uflow,
-				 "Too many (%d) spurious reply datagrams", rc);
-			goto cancel;
-		} else if (rc > 0) {
-			flow_trace(uflow,
-				   "Discarded %d spurious reply datagrams", rc);
-		} else if (errno != EAGAIN) {
-			flow_perror(uflow,
-				    "Unexpected error discarding datagrams");
-		}
-	}
 
 	/* Tap sides always need to be looked up by hash.  Socket sides don't
 	 * always, but sometimes do (receiving packets on a socket not specific

From 664c588be752bf590adb55bf1f613d4a36f02e7c Mon Sep 17 00:00:00 2001
From: Julian Wundrak <julian@wundrak.net>
Date: Wed, 26 Mar 2025 20:14:31 +0000
Subject: [PATCH 079/144] build: normalize arm targets
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Linux distributions use different dumpmachine outputs for the ARM
architecture: arm, armv6l, armv7l.
For the syscall annotation, these variants are standardized to “arm”.

Link: https://bugs.passt.top/show_bug.cgi?id=117
Signed-off-by: Julian Wundrak <julian@wundrak.net>
[sbrivio: Fix typo: assign from TARGET_ARCH, not from TARGET]
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 Makefile | 1 +
 1 file changed, 1 insertion(+)

diff --git a/Makefile b/Makefile
index 31cbac3..3328f83 100644
--- a/Makefile
+++ b/Makefile
@@ -20,6 +20,7 @@ $(if $(TARGET),,$(error Failed to get target architecture))
 # Get 'uname -m'-like architecture description for target
 TARGET_ARCH := $(firstword $(subst -, ,$(TARGET)))
 TARGET_ARCH := $(patsubst [:upper:],[:lower:],$(TARGET_ARCH))
+TARGET_ARCH := $(patsubst arm%,arm,$(TARGET_ARCH))
 TARGET_ARCH := $(subst powerpc,ppc,$(TARGET_ARCH))
 
 # On some systems enabling optimization also enables source fortification,

From 65cca54be84ffc5d2e18fcb8229dcc9d1f229479 Mon Sep 17 00:00:00 2001
From: Jon Maloy <jmaloy@redhat.com>
Date: Wed, 26 Mar 2025 11:59:02 -0400
Subject: [PATCH 080/144] udp: correct source address for ICMP messages

While developing traceroute forwarding tap-to-sock we found that
struct msghdr.msg_name for the ICMPs in the opposite direction always
contains the destination address of the original UDP message, and not,
as one might expect, the one of the host which created the error message.

Study of the kernel code reveals that this address instead is appended
as extra data after the received struct sock_extended_err area.

We now change the ICMP receive code accordingly.

Fixes: 55431f0077b6 ("udp: create and send ICMPv4 to local peer when applicable")
Fixes: 68b04182e07d ("udp: create and send ICMPv6 to local peer when applicable")
Signed-off-by: Jon Maloy <jmaloy@redhat.com>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 udp.c | 22 ++++++++++++----------
 1 file changed, 12 insertions(+), 10 deletions(-)

diff --git a/udp.c b/udp.c
index 96e48dd..0c223b4 100644
--- a/udp.c
+++ b/udp.c
@@ -510,10 +510,13 @@ static void udp_send_conn_fail_icmp6(const struct ctx *c,
  */
 static int udp_sock_recverr(const struct ctx *c, union epoll_ref ref)
 {
-	const struct sock_extended_err *ee;
+	struct errhdr {
+		struct sock_extended_err ee;
+		union sockaddr_inany saddr;
+	};
+	const struct errhdr *eh;
 	const struct cmsghdr *hdr;
-	union sockaddr_inany saddr;
-	char buf[CMSG_SPACE(sizeof(*ee))];
+	char buf[CMSG_SPACE(sizeof(struct errhdr))];
 	char data[ICMP6_MAX_DLEN];
 	int s = ref.fd;
 	struct iovec iov = {
@@ -521,8 +524,6 @@ static int udp_sock_recverr(const struct ctx *c, union epoll_ref ref)
 		.iov_len = sizeof(data)
 	};
 	struct msghdr mh = {
-		.msg_name = &saddr,
-		.msg_namelen = sizeof(saddr),
 		.msg_iov = &iov,
 		.msg_iovlen = 1,
 		.msg_control = buf,
@@ -553,7 +554,7 @@ static int udp_sock_recverr(const struct ctx *c, union epoll_ref ref)
 		return -1;
 	}
 
-	ee = (const struct sock_extended_err *)CMSG_DATA(hdr);
+	eh = (const struct errhdr *)CMSG_DATA(hdr);
 	if (ref.type == EPOLL_TYPE_UDP_REPLY) {
 		flow_sidx_t sidx = flow_sidx_opposite(ref.flowside);
 		const struct flowside *toside = flowside_at_sidx(sidx);
@@ -561,18 +562,19 @@ static int udp_sock_recverr(const struct ctx *c, union epoll_ref ref)
 
 		if (hdr->cmsg_level == IPPROTO_IP) {
 			dlen = MIN(dlen, ICMP4_MAX_DLEN);
-			udp_send_conn_fail_icmp4(c, ee, toside, saddr.sa4.sin_addr,
+			udp_send_conn_fail_icmp4(c, &eh->ee, toside,
+						 eh->saddr.sa4.sin_addr,
 						 data, dlen);
 		} else if (hdr->cmsg_level == IPPROTO_IPV6) {
-			udp_send_conn_fail_icmp6(c, ee, toside,
-						 &saddr.sa6.sin6_addr,
+			udp_send_conn_fail_icmp6(c, &eh->ee, toside,
+						 &eh->saddr.sa6.sin6_addr,
 						 data, dlen, sidx.flowi);
 		}
 	} else {
 		trace("Ignoring received IP_RECVERR cmsg on listener socket");
 	}
 	debug("%s error on UDP socket %i: %s",
-	      str_ee_origin(ee), s, strerror_(ee->ee_errno));
+	      str_ee_origin(&eh->ee), s, strerror_(eh->ee.ee_errno));
 
 	return 1;
 }

From 42a854a52b6fa2bbd70cbc0c7657c8a49a9c3d2d Mon Sep 17 00:00:00 2001
From: Stefano Brivio <sbrivio@redhat.com>
Date: Fri, 28 Mar 2025 11:39:58 +1100
Subject: [PATCH 081/144] pasta, passt-repair: Support multiple events per
 read() in inotify handlers

The current code assumes that we'll get one event per read() on
inotify descriptors, but that's not the case, not from documentation,
and not from reports.

Add loops in the two inotify handlers we have, in pasta-specific code
and passt-repair, to go through all the events we receive.

Link: https://bugs.passt.top/show_bug.cgi?id=119
[dwg: Remove unnecessary buffer expansion, use strnlen instead of strlen
 to make Coverity happier]
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
[sbrivio: Add additional check on ev->name and ev->len in passt-repair]
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 passt-repair.c | 32 +++++++++++++++++++++++++-------
 pasta.c        | 20 +++++++++++++-------
 2 files changed, 38 insertions(+), 14 deletions(-)

diff --git a/passt-repair.c b/passt-repair.c
index 120f7aa..86f0293 100644
--- a/passt-repair.c
+++ b/passt-repair.c
@@ -111,14 +111,14 @@ int main(int argc, char **argv)
 	}
 
 	if ((sb.st_mode & S_IFMT) == S_IFDIR) {
-		char buf[sizeof(struct inotify_event) + NAME_MAX + 1];
+		char buf[sizeof(struct inotify_event) + NAME_MAX + 1]
+		   __attribute__ ((aligned(__alignof__(struct inotify_event))));
 		const struct inotify_event *ev;
 		char path[PATH_MAX + 1];
+		bool found = false;
 		ssize_t n;
 		int fd;
 
-		ev = (struct inotify_event *)buf;
-
 		if ((fd = inotify_init1(IN_CLOEXEC)) < 0) {
 			fprintf(stderr, "inotify_init1: %i\n", errno);
 			_exit(1);
@@ -130,6 +130,8 @@ int main(int argc, char **argv)
 		}
 
 		do {
+			char *p;
+
 			n = read(fd, buf, sizeof(buf));
 			if (n < 0) {
 				fprintf(stderr, "inotify read: %i", errno);
@@ -138,11 +140,27 @@ int main(int argc, char **argv)
 
 			if (n < (ssize_t)sizeof(*ev)) {
 				fprintf(stderr, "Short inotify read: %zi", n);
-				_exit(1);
+				continue;
 			}
-		} while (ev->len < REPAIR_EXT_LEN ||
-			 memcmp(ev->name + strlen(ev->name) - REPAIR_EXT_LEN,
-				REPAIR_EXT, REPAIR_EXT_LEN));
+
+			for (p = buf; p < buf + n; p += sizeof(*ev) + ev->len) {
+				ev = (const struct inotify_event *)p;
+
+				if (ev->len >= REPAIR_EXT_LEN &&
+				    !memcmp(ev->name +
+					    strnlen(ev->name, ev->len) -
+					    REPAIR_EXT_LEN,
+					    REPAIR_EXT, REPAIR_EXT_LEN)) {
+					found = true;
+					break;
+				}
+			}
+		} while (!found);
+
+		if (ev->len > NAME_MAX + 1 || ev->name[ev->len] != '\0') {
+			fprintf(stderr, "Invalid filename from inotify\n");
+			_exit(1);
+		}
 
 		snprintf(path, sizeof(path), "%s/%s", argv[1], ev->name);
 		if ((stat(path, &sb))) {
diff --git a/pasta.c b/pasta.c
index fa3e7de..017fa32 100644
--- a/pasta.c
+++ b/pasta.c
@@ -498,17 +498,23 @@ void pasta_netns_quit_init(const struct ctx *c)
  */
 void pasta_netns_quit_inotify_handler(struct ctx *c, int inotify_fd)
 {
-	char buf[sizeof(struct inotify_event) + NAME_MAX + 1];
-	const struct inotify_event *in_ev = (struct inotify_event *)buf;
+	char buf[sizeof(struct inotify_event) + NAME_MAX + 1]
+		__attribute__ ((aligned(__alignof__(struct inotify_event))));
+	const struct inotify_event *ev;
+	ssize_t n;
+	char *p;
 
-	if (read(inotify_fd, buf, sizeof(buf)) < (ssize_t)sizeof(*in_ev))
+	if ((n = read(inotify_fd, buf, sizeof(buf))) < (ssize_t)sizeof(*ev))
 		return;
 
-	if (strncmp(in_ev->name, c->netns_base, sizeof(c->netns_base)))
-		return;
+	for (p = buf; p < buf + n; p += sizeof(*ev) + ev->len) {
+		ev = (const struct inotify_event *)p;
 
-	info("Namespace %s is gone, exiting", c->netns_base);
-	_exit(EXIT_SUCCESS);
+		if (!strncmp(ev->name, c->netns_base, sizeof(c->netns_base))) {
+			info("Namespace %s is gone, exiting", c->netns_base);
+			_exit(EXIT_SUCCESS);
+		}
+	}
 }
 
 /**

From 025a3c2686b06be3fd09e29b2e3408d2c4ad6239 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Fri, 28 Mar 2025 14:34:14 +1100
Subject: [PATCH 082/144] udp: Don't attempt to forward ICMP socket errors to
 other sockets

Recently we added support for detecting ICMP triggered errors on UDP
sockets and forwarding them to the tap interface.  However, in
udp_sock_recverr() where this is handled we don't know for certain that
the tap interface is the other side of the UDP flow.  It could be a spliced
connection with another socket on the other side.

To forward errors in that case, we'd need to force the other side's socket
to issue an ICMP error.  I'm not sure if there's a way to do that;
probably not for an arbitrary ICMP but it might be possible for certain
error conditions.

Nonetheless what we do now - synthesise an ICMP on the tap interface - is
certainly wrong.  It's probably harmless; for a spliced connection it will
have loopback addresses meaning we can expect the guest to discard it.
But, correct this for now, by not attempting to propagate errors when the
other side of the flow is a socket.

Fixes: 55431f0077b6 ("udp: create and send ICMPv4 to local peer when applicable")
Fixes: 68b04182e07d ("udp: create and send ICMPv6 to local peer when applicable")
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Acked-by: Jon Maloy <jmaloy@redhat.com>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 udp.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/udp.c b/udp.c
index 0c223b4..e410f55 100644
--- a/udp.c
+++ b/udp.c
@@ -560,7 +560,10 @@ static int udp_sock_recverr(const struct ctx *c, union epoll_ref ref)
 		const struct flowside *toside = flowside_at_sidx(sidx);
 		size_t dlen = rc;
 
-		if (hdr->cmsg_level == IPPROTO_IP) {
+		if (pif_is_socket(pif_at_sidx(sidx))) {
+			/* XXX Is there any way to propagate ICMPs from socket
+			 * to socket? */
+		} else if (hdr->cmsg_level == IPPROTO_IP) {
 			dlen = MIN(dlen, ICMP4_MAX_DLEN);
 			udp_send_conn_fail_icmp4(c, &eh->ee, toside,
 						 eh->saddr.sa4.sin_addr,

From 3de5af6e4145c6971be2597d7fb0386332d44a45 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Fri, 28 Mar 2025 14:34:15 +1100
Subject: [PATCH 083/144] udp: Improve name of UDP related ICMP sending
 functions

udp_send_conn_fail_icmp[46]() aren't actually specific to connections
failing: they can propagate a variety of ICMP errors, which might or might
not break a "connection".  They are, however, specific to sending ICMP
errors to the tap connection, not splice or host.  Rename them to better
reflect that.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Acked-by: Jon Maloy <jmaloy@redhat.com>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 udp.c | 35 +++++++++++++++++------------------
 1 file changed, 17 insertions(+), 18 deletions(-)

diff --git a/udp.c b/udp.c
index e410f55..39431d7 100644
--- a/udp.c
+++ b/udp.c
@@ -411,7 +411,7 @@ static void udp_tap_prepare(const struct mmsghdr *mmh,
 }
 
 /**
- * udp_send_conn_fail_icmp4() - Construct and send ICMPv4 to local peer
+ * udp_send_tap_icmp4() - Construct and send ICMPv4 to local peer
  * @c:		Execution context
  * @ee:	Extended error descriptor
  * @toside:	Destination side of flow
@@ -419,11 +419,11 @@ static void udp_tap_prepare(const struct mmsghdr *mmh,
  * @in:	First bytes (max 8) of original UDP message body
  * @dlen:	Length of the read part of original UDP message body
  */
-static void udp_send_conn_fail_icmp4(const struct ctx *c,
-				     const struct sock_extended_err *ee,
-				     const struct flowside *toside,
-				     struct in_addr saddr,
-				     const void *in, size_t dlen)
+static void udp_send_tap_icmp4(const struct ctx *c,
+			       const struct sock_extended_err *ee,
+			       const struct flowside *toside,
+			       struct in_addr saddr,
+			       const void *in, size_t dlen)
 {
 	struct in_addr oaddr = toside->oaddr.v4mapped.a4;
 	struct in_addr eaddr = toside->eaddr.v4mapped.a4;
@@ -455,7 +455,7 @@ static void udp_send_conn_fail_icmp4(const struct ctx *c,
 
 
 /**
- * udp_send_conn_fail_icmp6() - Construct and send ICMPv6 to local peer
+ * udp_send_tap_icmp6() - Construct and send ICMPv6 to local peer
  * @c:		Execution context
  * @ee:	Extended error descriptor
  * @toside:	Destination side of flow
@@ -464,11 +464,11 @@ static void udp_send_conn_fail_icmp4(const struct ctx *c,
  * @dlen:	Length of the read part of original UDP message body
  * @flow:	IPv6 flow identifier
  */
-static void udp_send_conn_fail_icmp6(const struct ctx *c,
-				     const struct sock_extended_err *ee,
-				     const struct flowside *toside,
-				     const struct in6_addr *saddr,
-				     void *in, size_t dlen, uint32_t flow)
+static void udp_send_tap_icmp6(const struct ctx *c,
+			       const struct sock_extended_err *ee,
+			       const struct flowside *toside,
+			       const struct in6_addr *saddr,
+			       void *in, size_t dlen, uint32_t flow)
 {
 	const struct in6_addr *oaddr = &toside->oaddr.a6;
 	const struct in6_addr *eaddr = &toside->eaddr.a6;
@@ -565,13 +565,12 @@ static int udp_sock_recverr(const struct ctx *c, union epoll_ref ref)
 			 * to socket? */
 		} else if (hdr->cmsg_level == IPPROTO_IP) {
 			dlen = MIN(dlen, ICMP4_MAX_DLEN);
-			udp_send_conn_fail_icmp4(c, &eh->ee, toside,
-						 eh->saddr.sa4.sin_addr,
-						 data, dlen);
+			udp_send_tap_icmp4(c, &eh->ee, toside,
+					   eh->saddr.sa4.sin_addr, data, dlen);
 		} else if (hdr->cmsg_level == IPPROTO_IPV6) {
-			udp_send_conn_fail_icmp6(c, &eh->ee, toside,
-						 &eh->saddr.sa6.sin6_addr,
-						 data, dlen, sidx.flowi);
+			udp_send_tap_icmp6(c, &eh->ee, toside,
+					   &eh->saddr.sa6.sin6_addr, data,
+					   dlen, sidx.flowi);
 		}
 	} else {
 		trace("Ignoring received IP_RECVERR cmsg on listener socket");

From 2ed2d59def758b049f42e7c75bfb48957a73bd39 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Wed, 2 Apr 2025 14:13:16 +1100
Subject: [PATCH 084/144] platform requirements: Fix clang-tidy warning

Recent clang-tidy versions complain about enums defined with some but not
all entries given explicit values.  I'm not entirely convinced about
whether that's a useful warning, but in any case we really don't need the
explicit values in doc/platform-requirements/reuseaddr-priority.c, so
remove them to make clang happy.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 doc/platform-requirements/reuseaddr-priority.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/doc/platform-requirements/reuseaddr-priority.c b/doc/platform-requirements/reuseaddr-priority.c
index 701b6ff..af39a39 100644
--- a/doc/platform-requirements/reuseaddr-priority.c
+++ b/doc/platform-requirements/reuseaddr-priority.c
@@ -46,13 +46,13 @@
 /* Different cases for receiving socket configuration */
 enum sock_type {
 	/* Socket is bound to 0.0.0.0:DSTPORT and not connected */
-	SOCK_BOUND_ANY = 0,
+	SOCK_BOUND_ANY,
 
 	/* Socket is bound to 127.0.0.1:DSTPORT and not connected */
-	SOCK_BOUND_LO = 1,
+	SOCK_BOUND_LO,
 
 	/* Socket is bound to 0.0.0.0:DSTPORT and connected to 127.0.0.1:SRCPORT */
-	SOCK_CONNECTED = 2,
+	SOCK_CONNECTED,
 
 	NUM_SOCK_TYPES,
 };

From 8e32881ef1d6d5867223a164052f8ff39d4ebb4e Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Wed, 2 Apr 2025 14:13:17 +1100
Subject: [PATCH 085/144] platform requirements: Add attributes to die()
 function

Add both format string and ((noreturn)) attributes to the version of die()
used in the test programs in doc/platform-requirements.  As well as
potentially catching problems in format strings, this means that the
compiler and static checkers can properly reason about the fact that it
will exit, preventing bogus warnings.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 doc/platform-requirements/common.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/doc/platform-requirements/common.h b/doc/platform-requirements/common.h
index 8844b1e..e85fc2b 100644
--- a/doc/platform-requirements/common.h
+++ b/doc/platform-requirements/common.h
@@ -15,6 +15,7 @@
 #include <stdio.h>
 #include <stdlib.h>
 
+__attribute__((format(printf, 1, 2), noreturn))
 static inline void die(const char *fmt, ...)
 {
 	va_list ap;

From 6bfc60b09522bd6f47660b835f0681977a28e1de Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Wed, 2 Apr 2025 14:13:18 +1100
Subject: [PATCH 086/144] platform requirements: Add test for address conflicts
 with TCP_REPAIR

Simple test program to check the behaviour we need for bind() address
conflicts between listening sockets and repair mode sockets.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 doc/platform-requirements/.gitignore         |   1 +
 doc/platform-requirements/Makefile           |   4 +-
 doc/platform-requirements/listen-vs-repair.c | 128 +++++++++++++++++++
 3 files changed, 131 insertions(+), 2 deletions(-)
 create mode 100644 doc/platform-requirements/listen-vs-repair.c

diff --git a/doc/platform-requirements/.gitignore b/doc/platform-requirements/.gitignore
index 3b5a10a..f6272cf 100644
--- a/doc/platform-requirements/.gitignore
+++ b/doc/platform-requirements/.gitignore
@@ -1,3 +1,4 @@
+/listen-vs-repair
 /reuseaddr-priority
 /recv-zero
 /udp-close-dup
diff --git a/doc/platform-requirements/Makefile b/doc/platform-requirements/Makefile
index 6a7d374..83930ef 100644
--- a/doc/platform-requirements/Makefile
+++ b/doc/platform-requirements/Makefile
@@ -3,8 +3,8 @@
 # Copyright Red Hat
 # Author: David Gibson <david@gibson.dropbear.id.au>
 
-TARGETS = reuseaddr-priority recv-zero udp-close-dup
-SRCS = reuseaddr-priority.c recv-zero.c udp-close-dup.c
+TARGETS = reuseaddr-priority recv-zero udp-close-dup listen-vs-repair
+SRCS = reuseaddr-priority.c recv-zero.c udp-close-dup.c listen-vs-repair.c
 CFLAGS = -Wall
 
 all: cppcheck clang-tidy $(TARGETS:%=check-%)
diff --git a/doc/platform-requirements/listen-vs-repair.c b/doc/platform-requirements/listen-vs-repair.c
new file mode 100644
index 0000000..d31fe3f
--- /dev/null
+++ b/doc/platform-requirements/listen-vs-repair.c
@@ -0,0 +1,128 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+/* listen-vs-repair.c
+ *
+ * Do listening sockets have address conflicts with sockets under repair
+ * ====================================================================
+ *
+ * When we accept() an incoming connection the accept()ed socket will have the
+ * same local address as the listening socket.  This can be a complication on
+ * migration.  On the migration target we've already set up listening sockets
+ * according to the command line.  However to restore connections that we're
+ * migrating in we need to bind the new sockets to the same address, which would
+ * be an address conflict on the face of it.  This test program verifies that
+ * enabling repair mode before bind() correctly suppresses that conflict.
+ *
+ * Copyright Red Hat
+ * Author: David Gibson <david@gibson.dropbear.id.au>
+ */
+
+/* NOLINTNEXTLINE(bugprone-reserved-identifier,cert-dcl37-c,cert-dcl51-cpp) */
+#define _GNU_SOURCE
+
+#include <arpa/inet.h>
+#include <errno.h>
+#include <linux/netlink.h>
+#include <linux/rtnetlink.h>
+#include <net/if.h>
+#include <netinet/in.h>
+#include <netinet/tcp.h>
+#include <sched.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+#include "common.h"
+
+#define PORT	13256U
+#define CPORT	13257U
+
+/* 127.0.0.1:PORT */
+static const struct sockaddr_in addr = SOCKADDR_INIT(INADDR_LOOPBACK, PORT);
+
+/* 127.0.0.1:CPORT */
+static const struct sockaddr_in caddr = SOCKADDR_INIT(INADDR_LOOPBACK, CPORT);
+
+/* Put ourselves into a network sandbox */
+static void net_sandbox(void)
+{
+	/* NOLINTNEXTLINE(altera-struct-pack-align) */
+	const struct req_t {
+		struct nlmsghdr nlh;
+		struct ifinfomsg ifm;
+	} __attribute__((packed)) req = {
+		.nlh.nlmsg_type		= RTM_NEWLINK,
+		.nlh.nlmsg_flags	= NLM_F_REQUEST,
+		.nlh.nlmsg_len		= sizeof(req),
+		.nlh.nlmsg_seq		= 1,
+		.ifm.ifi_family		= AF_UNSPEC,
+                .ifm.ifi_index		= 1,
+                .ifm.ifi_flags		= IFF_UP,
+                .ifm.ifi_change		= IFF_UP,
+	};
+	int nl;
+
+	if (unshare(CLONE_NEWUSER | CLONE_NEWNET))
+		die("unshare(): %s\n", strerror(errno));
+
+	/* Bring up lo in the new netns */
+	nl = socket(AF_NETLINK, SOCK_RAW | SOCK_CLOEXEC, NETLINK_ROUTE);
+	if (nl < 0)
+		die("Can't create netlink socket: %s\n", strerror(errno));
+
+	if (send(nl, &req, sizeof(req), 0) < 0)
+		die("Netlink send(): %s\n", strerror(errno));
+	close(nl);
+}
+
+static void check(void)
+{
+	int s1, s2, op;
+
+	s1 = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP);
+	if (s1 < 0)
+		die("socket() 1: %s\n", strerror(errno));
+
+	if (bind(s1, (struct sockaddr *)&addr, sizeof(addr)))
+		die("bind() 1: %s\n", strerror(errno));
+
+	if (listen(s1, 0))
+		die("listen(): %s\n", strerror(errno));
+
+	s2 = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP);
+	if (s2 < 0)
+		die("socket() 2: %s\n", strerror(errno));
+
+	op = TCP_REPAIR_ON;
+	if (setsockopt(s2, SOL_TCP, TCP_REPAIR, &op, sizeof(op)))
+		die("TCP_REPAIR: %s\n", strerror(errno));
+
+	if (bind(s2, (struct sockaddr *)&addr, sizeof(addr)))
+		die("bind() 2: %s\n", strerror(errno));
+
+	if (connect(s2, (struct sockaddr *)&caddr, sizeof(caddr)))
+		die("connect(): %s\n", strerror(errno));
+
+	op = TCP_REPAIR_OFF_NO_WP;
+	if (setsockopt(s2, SOL_TCP, TCP_REPAIR, &op, sizeof(op)))
+		die("TCP_REPAIR: %s\n", strerror(errno));
+
+	close(s1);
+	close(s2);
+}
+
+int main(int argc, char *argv[])
+{
+	(void)argc;
+	(void)argv;
+
+	net_sandbox();
+
+	check();
+
+	printf("Repair mode appears to properly suppress conflicts with listening sockets\n");
+
+	exit(0);
+}

From dec3d73e1e8e007d05f9dce9a48aca7cb8532992 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Wed, 2 Apr 2025 14:13:19 +1100
Subject: [PATCH 087/144] migrate, tcp: bind() migrated sockets in repair mode

Currently on a migration target, we create then immediately bind() new
sockets for the TCP connections we're reconstructing.  Mostly, this works,
since a socket() that is bound but hasn't had listen() or connect() called
is essentially passive.  However, this bind() is subject to the usual
address conflict checking.  In particular that means if we already have
a listening socket on that port, we'll get an EADDRINUSE.  This will happen
for every connection we try to migrate that was initiated from outside to
the guest, since we necessarily created a listening socket for that case.

We set SO_REUSEADDR on the socket in an attempt to avoid this, but that's
not sufficient; even with SO_REUSEADDR address conflicts are still
prohibited for listening sockets.  Of course once these incoming sockets
are fully repaired and connect()ed they'll no longer conflict, but that
doesn't help us if we fail at the bind().

We can avoid this by not calling bind() until we're already in repair mode
which suppresses this transient conflict.  Because of the batching of
setting repair mode, to do that we need to move the bind to a step in
tcp_flow_migrate_target_ext().

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 tcp.c | 38 +++++++++++++++++++++++++++-----------
 1 file changed, 27 insertions(+), 11 deletions(-)

diff --git a/tcp.c b/tcp.c
index fa1d885..35626c9 100644
--- a/tcp.c
+++ b/tcp.c
@@ -3414,13 +3414,8 @@ fail:
 static int tcp_flow_repair_socket(struct ctx *c, struct tcp_tap_conn *conn)
 {
 	sa_family_t af = CONN_V4(conn) ? AF_INET : AF_INET6;
-	const struct flowside *sockside = HOSTFLOW(conn);
-	union sockaddr_inany a;
-	socklen_t sl;
 	int s, rc;
 
-	pif_sockaddr(c, &a, &sl, PIF_HOST, &sockside->oaddr, sockside->oport);
-
 	if ((conn->sock = socket(af, SOCK_STREAM | SOCK_NONBLOCK | SOCK_CLOEXEC,
 				 IPPROTO_TCP)) < 0) {
 		rc = -errno;
@@ -3435,12 +3430,6 @@ static int tcp_flow_repair_socket(struct ctx *c, struct tcp_tap_conn *conn)
 
 	tcp_sock_set_nodelay(s);
 
-	if (bind(s, &a.sa, sizeof(a))) {
-		rc = -errno;
-		flow_perror(conn, "Failed to bind socket for migrated flow");
-		goto err;
-	}
-
 	if ((rc = tcp_flow_repair_on(c, conn)))
 		goto err;
 
@@ -3452,6 +3441,30 @@ err:
 	return rc;
 }
 
+/**
+ * tcp_flow_repair_bind() - Bind socket in repair mode
+ * @c:		Execution context
+ * @conn:	Pointer to the TCP connection structure
+ *
+ * Return: 0 on success, negative error code on failure
+ */
+static int tcp_flow_repair_bind(const struct ctx *c, struct tcp_tap_conn *conn)
+{
+	const struct flowside *sockside = HOSTFLOW(conn);
+	union sockaddr_inany a;
+	socklen_t sl;
+
+	pif_sockaddr(c, &a, &sl, PIF_HOST, &sockside->oaddr, sockside->oport);
+
+	if (bind(conn->sock, &a.sa, sizeof(a))) {
+		int rc = -errno;
+		flow_perror(conn, "Failed to bind socket for migrated flow");
+		return rc;
+	}
+
+	return 0;
+}
+
 /**
  * tcp_flow_repair_connect() - Connect socket in repair mode, then turn it off
  * @c:		Execution context
@@ -3618,6 +3631,9 @@ int tcp_flow_migrate_target_ext(struct ctx *c, struct tcp_tap_conn *conn, int fd
 		/* We weren't able to create the socket, discard flow */
 		goto fail;
 
+	if (tcp_flow_repair_bind(c, conn))
+		goto fail;
+
 	if (tcp_flow_repair_timestamp(conn, &t))
 		goto fail;
 

From 3d41e4d8389578e5d5f3cf2e47b9ff9cdd29ffd1 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Wed, 2 Apr 2025 15:43:40 +1100
Subject: [PATCH 088/144] passt-repair: Correct off-by-one error verifying name

passt-repair will generate an error if the name it gets from the kernel is
too long or not NUL terminated.  Downstream testing has reported
occasionally seeing this error in practice.

It turns out there is a trivial off-by-one error in the check: ev->len is
the length of the name, including terminating \0 characters, so to check
for a \0 at the end of the buffer we need to check ev->name[len - 1] not
ev->name[len].

Fixes: 42a854a52b6f ("pasta, passt-repair: Support multiple events per read() in inotify handlers")
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 passt-repair.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/passt-repair.c b/passt-repair.c
index 86f0293..440c77a 100644
--- a/passt-repair.c
+++ b/passt-repair.c
@@ -157,7 +157,7 @@ int main(int argc, char **argv)
 			}
 		} while (!found);
 
-		if (ev->len > NAME_MAX + 1 || ev->name[ev->len] != '\0') {
+		if (ev->len > NAME_MAX + 1 || ev->name[ev->len - 1] != '\0') {
 			fprintf(stderr, "Invalid filename from inotify\n");
 			_exit(1);
 		}

From 8aa2d90c8d95d0fa1dad7027fdf92b48a1bbf3c6 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Tue, 1 Apr 2025 19:57:08 +1100
Subject: [PATCH 089/144] udp: Remove redundant udp_at_sidx() call in
 udp_tap_handler()

We already have a pointer to the UDP flow in variable uflow, so we can just
re-use it.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 udp.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/udp.c b/udp.c
index 39431d7..ac168db 100644
--- a/udp.c
+++ b/udp.c
@@ -907,7 +907,7 @@ int udp_tap_handler(const struct ctx *c, uint8_t pif,
 	}
 	toside = flowside_at_sidx(tosidx);
 
-	s = udp_at_sidx(tosidx)->s[tosidx.sidei];
+	s = uflow->s[tosidx.sidei];
 	ASSERT(s >= 0);
 
 	pif_sockaddr(c, &to_sa, &sl, topif, &toside->eaddr, toside->eport);

From 76e554d9ec8dc80c1856621e17e45be811d198d0 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Tue, 1 Apr 2025 19:57:09 +1100
Subject: [PATCH 090/144] udp: Simplify updates to UDP flow timestamp

Since UDP has no built in knowledge of connections, the only way we
know when we're done with a UDP flow is a timeout with no activity.
To keep track of this struct udp_flow includes a timestamp to record
the last time we saw traffic on the flow.

For data from listening sockets and from tap, this is done implicitly via
udp_flow_from_{sock,tap}() but for reply sockets it's done explicitly.
However, that logic is duplicated between the vhost-user and "buf" paths.
Make it common in udp_reply_sock_handler() instead.

Technically this is a behavioural change: previously if we got an EPOLLIN
event, but there wasn't actually any data we wouldn't update the timestamp,
now we will.  This should be harmless: if there's an EPOLLIN we expect
there to be data, and even if there isn't the worst we can do is mildly
delay the cleanup of a stale flow.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 udp.c    | 15 ++++++---------
 udp_vu.c |  9 +--------
 udp_vu.h |  3 +--
 3 files changed, 8 insertions(+), 19 deletions(-)

diff --git a/udp.c b/udp.c
index ac168db..44b58d1 100644
--- a/udp.c
+++ b/udp.c
@@ -758,27 +758,21 @@ void udp_listen_sock_handler(const struct ctx *c,
  * @c:		Execution context
  * @s:		Socket to read data from
  * @tosidx:	Flow & side to forward data from @s to
- * @now:	Current timestamp
  *
  * Return: true on success, false if can't forward from socket to flow's pif
  *
  * #syscalls recvmmsg
  */
 static bool udp_buf_reply_sock_data(const struct ctx *c,
-				    int s, flow_sidx_t tosidx,
-				    const struct timespec *now)
+				    int s, flow_sidx_t tosidx)
 {
 	const struct flowside *toside = flowside_at_sidx(tosidx);
-	struct udp_flow *uflow = udp_at_sidx(tosidx);
 	uint8_t topif = pif_at_sidx(tosidx);
 	int n, i;
 
 	if ((n = udp_sock_recv(c, s, udp_mh_recv)) <= 0)
 		return true;
 
-	flow_trace(uflow, "Received %d datagrams on reply socket", n);
-	uflow->ts = now->tv_sec;
-
 	for (i = 0; i < n; i++) {
 		if (pif_is_socket(topif))
 			udp_splice_prepare(udp_mh_recv, i);
@@ -825,10 +819,13 @@ void udp_reply_sock_handler(const struct ctx *c, union epoll_ref ref,
 		int s = ref.fd;
 		bool ret;
 
+		flow_trace(uflow, "Received data on reply socket");
+		uflow->ts = now->tv_sec;
+
 		if (c->mode == MODE_VU)
-			ret = udp_vu_reply_sock_data(c, s, tosidx, now);
+			ret = udp_vu_reply_sock_data(c, s, tosidx);
 		else
-			ret = udp_buf_reply_sock_data(c, s, tosidx, now);
+			ret = udp_buf_reply_sock_data(c, s, tosidx);
 
 		if (!ret) {
 			flow_err(uflow, "Unable to forward UDP");
diff --git a/udp_vu.c b/udp_vu.c
index 06bdeae..4153b6c 100644
--- a/udp_vu.c
+++ b/udp_vu.c
@@ -275,22 +275,17 @@ void udp_vu_listen_sock_data(const struct ctx *c, union epoll_ref ref,
  * @c:		Execution context
  * @s:		Socket to read data from
  * @tosidx:	Flow & side to forward data from @s to
- * @now:	Current timestamp
  *
  * Return: true on success, false if can't forward from socket to flow's pif
  */
-bool udp_vu_reply_sock_data(const struct ctx *c, int s, flow_sidx_t tosidx,
-			    const struct timespec *now)
+bool udp_vu_reply_sock_data(const struct ctx *c, int s, flow_sidx_t tosidx)
 {
 	const struct flowside *toside = flowside_at_sidx(tosidx);
 	bool v6 = !(inany_v4(&toside->eaddr) && inany_v4(&toside->oaddr));
-	struct udp_flow *uflow = udp_at_sidx(tosidx);
 	struct vu_dev *vdev = c->vdev;
 	struct vu_virtq *vq = &vdev->vq[VHOST_USER_RX_QUEUE];
 	int i;
 
-	ASSERT(uflow);
-
 	if (pif_at_sidx(tosidx) != PIF_TAP)
 		return false;
 
@@ -301,8 +296,6 @@ bool udp_vu_reply_sock_data(const struct ctx *c, int s, flow_sidx_t tosidx,
 		iov_used = udp_vu_sock_recv(c, s, v6, &dlen);
 		if (iov_used <= 0)
 			break;
-		flow_trace(uflow, "Received 1 datagram on reply socket");
-		uflow->ts = now->tv_sec;
 
 		udp_vu_prepare(c, toside, dlen);
 		if (*c->pcap) {
diff --git a/udp_vu.h b/udp_vu.h
index 2299b51..6d541a4 100644
--- a/udp_vu.h
+++ b/udp_vu.h
@@ -8,7 +8,6 @@
 
 void udp_vu_listen_sock_data(const struct ctx *c, union epoll_ref ref,
 			     const struct timespec *now);
-bool udp_vu_reply_sock_data(const struct ctx *c, int s, flow_sidx_t tosidx,
-			    const struct timespec *now);
+bool udp_vu_reply_sock_data(const struct ctx *c, int s, flow_sidx_t tosidx);
 
 #endif /* UDP_VU_H */

From 684870a766e7f024a5720464ad070e666cb4793e Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Tue, 1 Apr 2025 19:57:10 +1100
Subject: [PATCH 091/144] udp: Correct some seccomp filter annotations

Both udp_buf_listen_sock_data() and udp_buf_reply_sock_data() have comments
stating they use recvmmsg().  That's not correct, they only do so via
udp_sock_recv() which lists recvmmsg() itself.

In contrast udp_splice_send() and udp_tap_handler() both directly use
sendmmsg(), but only the latter lists it.  Add it to the former as well.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 udp.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/udp.c b/udp.c
index 44b58d1..ab3e9d2 100644
--- a/udp.c
+++ b/udp.c
@@ -272,6 +272,8 @@ static void udp_splice_prepare(struct mmsghdr *mmh, unsigned idx)
  * @dst:	Destination port for datagrams (target side)
  * @ref:	epoll reference for origin socket
  * @now:	Timestamp
+ *
+ * #syscalls sendmmsg
  */
 static void udp_splice_send(const struct ctx *c, size_t start, size_t n,
 			    flow_sidx_t tosidx)
@@ -662,8 +664,6 @@ static int udp_sock_recv(const struct ctx *c, int s, struct mmsghdr *mmh)
  * @c:		Execution context
  * @ref:	epoll reference
  * @now:	Current timestamp
- *
- * #syscalls recvmmsg
  */
 static void udp_buf_listen_sock_data(const struct ctx *c, union epoll_ref ref,
 				     const struct timespec *now)
@@ -760,8 +760,6 @@ void udp_listen_sock_handler(const struct ctx *c,
  * @tosidx:	Flow & side to forward data from @s to
  *
  * Return: true on success, false if can't forward from socket to flow's pif
- *
- * #syscalls recvmmsg
  */
 static bool udp_buf_reply_sock_data(const struct ctx *c,
 				    int s, flow_sidx_t tosidx)

From 06784d7fc6761528d587837b241d27c6d17c0842 Mon Sep 17 00:00:00 2001
From: Stefano Brivio <sbrivio@redhat.com>
Date: Thu, 3 Apr 2025 19:01:02 +0200
Subject: [PATCH 092/144] passt-repair: Ensure that read buffer is
 NULL-terminated

After 3d41e4d83895 ("passt-repair: Correct off-by-one error verifying
name"), Coverity Scan isn't convinced anymore about the fact that the
ev->name used in the snprintf() is NULL-terminated.

It comes from a read() call, and read() of course doesn't terminate
it, but we already check that the byte at ev->len - 1 is a NULL
terminator, so this is actually a false positive.

In any case, the logic ensuring that ev->name is NULL-terminated isn't
necessarily obvious, and additionally checking that the last byte in
the buffer we read is a NULL terminator is harmless, so do that
explicitly, even if it's redundant.

Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
---
 passt-repair.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/passt-repair.c b/passt-repair.c
index 440c77a..256a8c9 100644
--- a/passt-repair.c
+++ b/passt-repair.c
@@ -137,6 +137,7 @@ int main(int argc, char **argv)
 				fprintf(stderr, "inotify read: %i", errno);
 				_exit(1);
 			}
+			buf[n - 1] = '\0';
 
 			if (n < (ssize_t)sizeof(*ev)) {
 				fprintf(stderr, "Short inotify read: %zi", n);

From a7775e9550fa698e4af1322f6ef63924c24d1fab Mon Sep 17 00:00:00 2001
From: Jon Maloy <jmaloy@redhat.com>
Date: Sat, 5 Apr 2025 15:21:26 -0400
Subject: [PATCH 093/144] udp: support traceroute in direction tap-socket

Now that ICMP pass-through from socket-to-tap is in place, it is
easy to support UDP based traceroute functionality in direction
tap-to-socket.

We fix that in this commit.

Link: https://bugs.passt.top/show_bug.cgi?id=64
Signed-off-by: Jon Maloy <jmaloy@redhat.com>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 tap.c      | 17 +++++++++++++----
 udp.c      | 22 +++++++++++++++++++++-
 udp.h      |  3 ++-
 udp_flow.c |  1 +
 udp_flow.h |  4 +++-
 5 files changed, 40 insertions(+), 7 deletions(-)

diff --git a/tap.c b/tap.c
index 3a6fcbe..d630f6d 100644
--- a/tap.c
+++ b/tap.c
@@ -559,6 +559,7 @@ PACKET_POOL_DECL(pool_l4, UIO_MAXIOV, pkt_buf);
  * struct l4_seq4_t - Message sequence for one protocol handler call, IPv4
  * @msgs:	Count of messages in sequence
  * @protocol:	Protocol number
+ * @ttl:	Time to live
  * @source:	Source port
  * @dest:	Destination port
  * @saddr:	Source address
@@ -567,6 +568,7 @@ PACKET_POOL_DECL(pool_l4, UIO_MAXIOV, pkt_buf);
  */
 static struct tap4_l4_t {
 	uint8_t protocol;
+	uint8_t ttl;
 
 	uint16_t source;
 	uint16_t dest;
@@ -586,6 +588,7 @@ static struct tap4_l4_t {
  * @dest:	Destination port
  * @saddr:	Source address
  * @daddr:	Destination address
+ * @hop_limit:	Hop limit
  * @msg:	Array of messages that can be handled in a single call
  */
 static struct tap6_l4_t {
@@ -598,6 +601,8 @@ static struct tap6_l4_t {
 	struct in6_addr saddr;
 	struct in6_addr daddr;
 
+	uint8_t hop_limit;
+
 	struct pool_l4_t p;
 } tap6_l4[TAP_SEQS /* Arbitrary: TAP_MSGS in theory, so limit in users */];
 
@@ -786,7 +791,8 @@ resume:
 #define L4_MATCH(iph, uh, seq)							\
 	((seq)->protocol == (iph)->protocol &&					\
 	 (seq)->source   == (uh)->source    && (seq)->dest  == (uh)->dest &&	\
-	 (seq)->saddr.s_addr == (iph)->saddr && (seq)->daddr.s_addr == (iph)->daddr)
+	 (seq)->saddr.s_addr == (iph)->saddr &&				\
+	 (seq)->daddr.s_addr == (iph)->daddr && (seq)->ttl == (iph)->ttl)
 
 #define L4_SET(iph, uh, seq)						\
 	do {								\
@@ -795,6 +801,7 @@ resume:
 		(seq)->dest		= (uh)->dest;			\
 		(seq)->saddr.s_addr	= (iph)->saddr;			\
 		(seq)->daddr.s_addr	= (iph)->daddr;			\
+		(seq)->ttl		= (iph)->ttl;			\
 	} while (0)
 
 		if (seq && L4_MATCH(iph, uh, seq) && seq->p.count < UIO_MAXIOV)
@@ -843,7 +850,7 @@ append:
 			for (k = 0; k < p->count; )
 				k += udp_tap_handler(c, PIF_TAP, AF_INET,
 						     &seq->saddr, &seq->daddr,
-						     p, k, now);
+						     seq->ttl, p, k, now);
 		}
 	}
 
@@ -966,7 +973,8 @@ resume:
 		 (seq)->dest == (uh)->dest                 &&		\
 		 (seq)->flow_lbl == ip6_get_flow_lbl(ip6h) &&		\
 		 IN6_ARE_ADDR_EQUAL(&(seq)->saddr, saddr)  &&		\
-		 IN6_ARE_ADDR_EQUAL(&(seq)->daddr, daddr))
+		 IN6_ARE_ADDR_EQUAL(&(seq)->daddr, daddr)  &&		\
+		 (seq)->hop_limit == (ip6h)->hop_limit)
 
 #define L4_SET(ip6h, proto, uh, seq)					\
 	do {								\
@@ -976,6 +984,7 @@ resume:
 		(seq)->flow_lbl	= ip6_get_flow_lbl(ip6h);		\
 		(seq)->saddr	= *saddr;				\
 		(seq)->daddr	= *daddr;				\
+		(seq)->hop_limit = (ip6h)->hop_limit;			\
 	} while (0)
 
 		if (seq && L4_MATCH(ip6h, proto, uh, seq) &&
@@ -1026,7 +1035,7 @@ append:
 			for (k = 0; k < p->count; )
 				k += udp_tap_handler(c, PIF_TAP, AF_INET6,
 						     &seq->saddr, &seq->daddr,
-						     p, k, now);
+						     seq->hop_limit, p, k, now);
 		}
 	}
 
diff --git a/udp.c b/udp.c
index ab3e9d2..5a251df 100644
--- a/udp.c
+++ b/udp.c
@@ -844,6 +844,7 @@ fail:
  * @af:		Address family, AF_INET or AF_INET6
  * @saddr:	Source address
  * @daddr:	Destination address
+ * @ttl:	TTL or hop limit for packets to be sent in this call
  * @p:		Pool of UDP packets, with UDP headers
  * @idx:	Index of first packet to process
  * @now:	Current timestamp
@@ -854,7 +855,8 @@ fail:
  */
 int udp_tap_handler(const struct ctx *c, uint8_t pif,
 		    sa_family_t af, const void *saddr, const void *daddr,
-		    const struct pool *p, int idx, const struct timespec *now)
+		    uint8_t ttl, const struct pool *p, int idx,
+		    const struct timespec *now)
 {
 	const struct flowside *toside;
 	struct mmsghdr mm[UIO_MAXIOV];
@@ -933,6 +935,24 @@ int udp_tap_handler(const struct ctx *c, uint8_t pif,
 		mm[i].msg_hdr.msg_controllen = 0;
 		mm[i].msg_hdr.msg_flags = 0;
 
+		if (ttl != uflow->ttl[tosidx.sidei]) {
+			uflow->ttl[tosidx.sidei] = ttl;
+			if (af == AF_INET) {
+				if (setsockopt(s, IPPROTO_IP, IP_TTL,
+					       &ttl, sizeof(ttl)) < 0)
+					flow_perror(uflow,
+						    "setsockopt IP_TTL");
+			} else {
+				/* IPv6 hop_limit cannot be only 1 byte */
+				int hop_limit = ttl;
+
+				if (setsockopt(s, SOL_IPV6, IPV6_UNICAST_HOPS,
+					       &hop_limit, sizeof(hop_limit)) < 0)
+					flow_perror(uflow,
+						    "setsockopt IPV6_UNICAST_HOPS");
+			}
+		}
+
 		count++;
 	}
 
diff --git a/udp.h b/udp.h
index de2df6d..a811475 100644
--- a/udp.h
+++ b/udp.h
@@ -15,7 +15,8 @@ void udp_reply_sock_handler(const struct ctx *c, union epoll_ref ref,
 			    uint32_t events, const struct timespec *now);
 int udp_tap_handler(const struct ctx *c, uint8_t pif,
 		    sa_family_t af, const void *saddr, const void *daddr,
-		    const struct pool *p, int idx, const struct timespec *now);
+		    uint8_t ttl, const struct pool *p, int idx,
+		    const struct timespec *now);
 int udp_sock_init(const struct ctx *c, int ns, const union inany_addr *addr,
 		  const char *ifname, in_port_t port);
 int udp_init(struct ctx *c);
diff --git a/udp_flow.c b/udp_flow.c
index bf4b896..99ae490 100644
--- a/udp_flow.c
+++ b/udp_flow.c
@@ -137,6 +137,7 @@ static flow_sidx_t udp_flow_new(const struct ctx *c, union flow *flow,
 	uflow = FLOW_SET_TYPE(flow, FLOW_UDP, udp);
 	uflow->ts = now->tv_sec;
 	uflow->s[INISIDE] = uflow->s[TGTSIDE] = -1;
+	uflow->ttl[INISIDE] = uflow->ttl[TGTSIDE] = 0;
 
 	if (s_ini >= 0) {
 		/* When using auto port-scanning the listening port could go
diff --git a/udp_flow.h b/udp_flow.h
index 9a1b059..520de62 100644
--- a/udp_flow.h
+++ b/udp_flow.h
@@ -8,11 +8,12 @@
 #define UDP_FLOW_H
 
 /**
- * struct udp - Descriptor for a flow of UDP packets
+ * struct udp_flow - Descriptor for a flow of UDP packets
  * @f:		Generic flow information
  * @closed:	Flow is already closed
  * @ts:		Activity timestamp
  * @s:		Socket fd (or -1) for each side of the flow
+ * @ttl:	TTL or hop_limit for both sides
  */
 struct udp_flow {
 	/* Must be first element */
@@ -21,6 +22,7 @@ struct udp_flow {
 	bool closed :1;
 	time_t ts;
 	int s[SIDES];
+	uint8_t ttl[SIDES];
 };
 
 struct udp_flow *udp_at_sidx(flow_sidx_t sidx);

From d74b5a7c107006b95df6a69e5f1e6b9a373c7f53 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Fri, 4 Apr 2025 21:15:31 +1100
Subject: [PATCH 094/144] udp: Use connect()ed sockets for initiating side

Currently we have an asymmetry in how we handle UDP sockets.  For flows
where the target side is a socket, we create a new connect()ed socket
- the "reply socket" - specifically for that flow, used for sending and
receiving datagrams on that flow and only that flow.  For flows where the
initiating side is a socket, we continue to use the "listening" socket (or
rather, a dup() of it).  This has some disadvantages:

 * We need a hash lookup for every datagram on the listening socket in
   order to work out what flow it belongs to
 * The dup() keeps the socket alive even if automatic forwarding removes
   the listening socket.  However, the epoll data remains the same
   including containing the now stale original fd.  This causes bug 103.
 * We can't (easily) set flow-specific options on an initiating side
   socket, because that could affect other flows as well

Alter the code to use a connect()ed socket on the initiating side as well
as the target side.  There's no way to "clone and connect" the listening
socket (a loose equivalent of accept() for UDP), so we have to create a
new socket.  We have to bind() this socket before we connect() it, which
is allowed thanks to SO_REUSEADDR, but does leave a small window where it
could receive datagrams not intended for this flow.  For now we handle this
by simply discarding any datagrams received between bind() and connect(),
but I intend to improve this in a later patch.

Link: https://bugs.passt.top/show_bug.cgi?id=103
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 epoll_type.h |  4 ++--
 passt.c      |  6 +++---
 udp.c        | 50 ++++++++++++++++++++++++++------------------------
 udp.h        |  4 ++--
 udp_flow.c   | 32 +++++++++-----------------------
 util.c       |  2 +-
 6 files changed, 43 insertions(+), 55 deletions(-)

diff --git a/epoll_type.h b/epoll_type.h
index 7f2a121..12ac59b 100644
--- a/epoll_type.h
+++ b/epoll_type.h
@@ -22,8 +22,8 @@ enum epoll_type {
 	EPOLL_TYPE_TCP_TIMER,
 	/* UDP "listening" sockets */
 	EPOLL_TYPE_UDP_LISTEN,
-	/* UDP socket for replies on a specific flow */
-	EPOLL_TYPE_UDP_REPLY,
+	/* UDP socket for a specific flow */
+	EPOLL_TYPE_UDP,
 	/* ICMP/ICMPv6 ping sockets */
 	EPOLL_TYPE_PING,
 	/* inotify fd watching for end of netns (pasta) */
diff --git a/passt.c b/passt.c
index cd06772..388d10f 100644
--- a/passt.c
+++ b/passt.c
@@ -68,7 +68,7 @@ char *epoll_type_str[] = {
 	[EPOLL_TYPE_TCP_LISTEN]		= "listening TCP socket",
 	[EPOLL_TYPE_TCP_TIMER]		= "TCP timer",
 	[EPOLL_TYPE_UDP_LISTEN]		= "listening UDP socket",
-	[EPOLL_TYPE_UDP_REPLY]		= "UDP reply socket",
+	[EPOLL_TYPE_UDP]		= "UDP flow socket",
 	[EPOLL_TYPE_PING]	= "ICMP/ICMPv6 ping socket",
 	[EPOLL_TYPE_NSQUIT_INOTIFY]	= "namespace inotify watch",
 	[EPOLL_TYPE_NSQUIT_TIMER]	= "namespace timer watch",
@@ -339,8 +339,8 @@ loop:
 		case EPOLL_TYPE_UDP_LISTEN:
 			udp_listen_sock_handler(&c, ref, eventmask, &now);
 			break;
-		case EPOLL_TYPE_UDP_REPLY:
-			udp_reply_sock_handler(&c, ref, eventmask, &now);
+		case EPOLL_TYPE_UDP:
+			udp_sock_handler(&c, ref, eventmask, &now);
 			break;
 		case EPOLL_TYPE_PING:
 			icmp_sock_handler(&c, ref);
diff --git a/udp.c b/udp.c
index 5a251df..1b3fffd 100644
--- a/udp.c
+++ b/udp.c
@@ -39,27 +39,30 @@
  * could receive packets from multiple flows, so we use a hash table match to
  * find the specific flow for a datagram.
  *
- * When a UDP flow is initiated from a listening socket we take a duplicate of
- * the socket and store it in uflow->s[INISIDE].  This will last for the
+ * Flow sockets
+ * ============
+ *
+ * When a UDP flow targets a socket, we create a "flow" socket in
+ * uflow->s[TGTSIDE] both to deliver datagrams to the target side and receive
+ * replies on the target side.  This socket is both bound and connected and has
+ * EPOLL_TYPE_UDP.  The connect() means it will only receive datagrams
+ * associated with this flow, so the epoll reference directly points to the flow
+ * and we don't need a hash lookup.
+ *
+ * When a flow is initiated from a listening socket, we create a "flow" socket
+ * with the same bound address as the listening socket, but also connect()ed to
+ * the flow's peer.  This is stored in uflow->s[INISIDE] and will last for the
  * lifetime of the flow, even if the original listening socket is closed due to
  * port auto-probing.  The duplicate is used to deliver replies back to the
  * originating side.
  *
- * Reply sockets
- * =============
- *
- * When a UDP flow targets a socket, we create a "reply" socket in
- * uflow->s[TGTSIDE] both to deliver datagrams to the target side and receive
- * replies on the target side.  This socket is both bound and connected and has
- * EPOLL_TYPE_UDP_REPLY.  The connect() means it will only receive datagrams
- * associated with this flow, so the epoll reference directly points to the flow
- * and we don't need a hash lookup.
- *
- * NOTE: it's possible that the reply socket could have a bound address
- * overlapping with an unrelated listening socket.  We assume datagrams for the
- * flow will come to the reply socket in preference to a listening socket.  The
- * sample program doc/platform-requirements/reuseaddr-priority.c documents and
- * tests that assumption.
+ * NOTE: A flow socket can have a bound address overlapping with a listening
+ * socket.  That will happen naturally for flows initiated from a socket, but is
+ * also possible (though unlikely) for tap initiated flows, depending on the
+ * source port.  We assume datagrams for the flow will come to a connect()ed
+ * socket in preference to a listening socket.  The sample program
+ * doc/platform-requirements/reuseaddr-priority.c documents and tests that
+ * assumption.
  *
  * "Spliced" flows
  * ===============
@@ -71,8 +74,7 @@
  * actually used; it doesn't make sense for datagrams and instead a pair of
  * recvmmsg() and sendmmsg() is used to forward the datagrams.
  *
- * Note that a spliced flow will have *both* a duplicated listening socket and a
- * reply socket (see above).
+ * Note that a spliced flow will have two flow sockets (see above).
  */
 
 #include <sched.h>
@@ -557,7 +559,7 @@ static int udp_sock_recverr(const struct ctx *c, union epoll_ref ref)
 	}
 
 	eh = (const struct errhdr *)CMSG_DATA(hdr);
-	if (ref.type == EPOLL_TYPE_UDP_REPLY) {
+	if (ref.type == EPOLL_TYPE_UDP) {
 		flow_sidx_t sidx = flow_sidx_opposite(ref.flowside);
 		const struct flowside *toside = flowside_at_sidx(sidx);
 		size_t dlen = rc;
@@ -792,14 +794,14 @@ static bool udp_buf_reply_sock_data(const struct ctx *c,
 }
 
 /**
- * udp_reply_sock_handler() - Handle new data from flow specific socket
+ * udp_sock_handler() - Handle new data from flow specific socket
  * @c:		Execution context
  * @ref:	epoll reference
  * @events:	epoll events bitmap
  * @now:	Current timestamp
  */
-void udp_reply_sock_handler(const struct ctx *c, union epoll_ref ref,
-			    uint32_t events, const struct timespec *now)
+void udp_sock_handler(const struct ctx *c, union epoll_ref ref,
+		      uint32_t events, const struct timespec *now)
 {
 	struct udp_flow *uflow = udp_at_sidx(ref.flowside);
 
@@ -807,7 +809,7 @@ void udp_reply_sock_handler(const struct ctx *c, union epoll_ref ref,
 
 	if (events & EPOLLERR) {
 		if (udp_sock_errs(c, ref) < 0) {
-			flow_err(uflow, "Unrecoverable error on reply socket");
+			flow_err(uflow, "Unrecoverable error on flow socket");
 			goto fail;
 		}
 	}
diff --git a/udp.h b/udp.h
index a811475..8f8531a 100644
--- a/udp.h
+++ b/udp.h
@@ -11,8 +11,8 @@
 void udp_portmap_clear(void);
 void udp_listen_sock_handler(const struct ctx *c, union epoll_ref ref,
 			     uint32_t events, const struct timespec *now);
-void udp_reply_sock_handler(const struct ctx *c, union epoll_ref ref,
-			    uint32_t events, const struct timespec *now);
+void udp_sock_handler(const struct ctx *c, union epoll_ref ref,
+		      uint32_t events, const struct timespec *now);
 int udp_tap_handler(const struct ctx *c, uint8_t pif,
 		    sa_family_t af, const void *saddr, const void *daddr,
 		    uint8_t ttl, const struct pool *p, int idx,
diff --git a/udp_flow.c b/udp_flow.c
index 99ae490..a2d417f 100644
--- a/udp_flow.c
+++ b/udp_flow.c
@@ -49,10 +49,7 @@ void udp_flow_close(const struct ctx *c, struct udp_flow *uflow)
 	flow_foreach_sidei(sidei) {
 		flow_hash_remove(c, FLOW_SIDX(uflow, sidei));
 		if (uflow->s[sidei] >= 0) {
-			/* The listening socket needs to stay in epoll, but the
-			 * flow specific one needs to be removed */
-			if (sidei == TGTSIDE)
-				epoll_del(c, uflow->s[sidei]);
+			epoll_del(c, uflow->s[sidei]);
 			close(uflow->s[sidei]);
 			uflow->s[sidei] = -1;
 		}
@@ -81,7 +78,7 @@ static int udp_flow_sock(const struct ctx *c,
 	} fref = { .sidx = FLOW_SIDX(uflow, sidei) };
 	int rc, s;
 
-	s = flowside_sock_l4(c, EPOLL_TYPE_UDP_REPLY, pif, side, fref.data);
+	s = flowside_sock_l4(c, EPOLL_TYPE_UDP, pif, side, fref.data);
 	if (s < 0) {
 		flow_dbg_perror(uflow, "Couldn't open flow specific socket");
 		return s;
@@ -120,13 +117,12 @@ static int udp_flow_sock(const struct ctx *c,
  * udp_flow_new() - Common setup for a new UDP flow
  * @c:		Execution context
  * @flow:	Initiated flow
- * @s_ini:	Initiating socket (or -1)
  * @now:	Timestamp
  *
  * Return: UDP specific flow, if successful, NULL on failure
  */
 static flow_sidx_t udp_flow_new(const struct ctx *c, union flow *flow,
-				int s_ini, const struct timespec *now)
+				const struct timespec *now)
 {
 	struct udp_flow *uflow = NULL;
 	unsigned sidei;
@@ -139,22 +135,12 @@ static flow_sidx_t udp_flow_new(const struct ctx *c, union flow *flow,
 	uflow->s[INISIDE] = uflow->s[TGTSIDE] = -1;
 	uflow->ttl[INISIDE] = uflow->ttl[TGTSIDE] = 0;
 
-	if (s_ini >= 0) {
-		/* When using auto port-scanning the listening port could go
-		 * away, so we need to duplicate the socket
-		 */
-		uflow->s[INISIDE] = fcntl(s_ini, F_DUPFD_CLOEXEC, 0);
-		if (uflow->s[INISIDE] < 0) {
-			flow_perror(uflow,
-				    "Couldn't duplicate listening socket");
-			goto cancel;
-		}
+	flow_foreach_sidei(sidei) {
+		if (pif_is_socket(uflow->f.pif[sidei]))
+			if ((uflow->s[sidei] = udp_flow_sock(c, uflow, sidei)) < 0)
+				goto cancel;
 	}
 
-	if (pif_is_socket(flow->f.pif[TGTSIDE]))
-		if ((uflow->s[TGTSIDE] = udp_flow_sock(c, uflow, TGTSIDE)) < 0)
-			goto cancel;
-
 	/* Tap sides always need to be looked up by hash.  Socket sides don't
 	 * always, but sometimes do (receiving packets on a socket not specific
 	 * to one flow).  Unconditionally hash both sides so all our bases are
@@ -225,7 +211,7 @@ flow_sidx_t udp_flow_from_sock(const struct ctx *c, union epoll_ref ref,
 		return FLOW_SIDX_NONE;
 	}
 
-	return udp_flow_new(c, flow, ref.fd, now);
+	return udp_flow_new(c, flow, now);
 }
 
 /**
@@ -281,7 +267,7 @@ flow_sidx_t udp_flow_from_tap(const struct ctx *c,
 		return FLOW_SIDX_NONE;
 	}
 
-	return udp_flow_new(c, flow, -1, now);
+	return udp_flow_new(c, flow, now);
 }
 
 /**
diff --git a/util.c b/util.c
index b9a3d43..0f68cf5 100644
--- a/util.c
+++ b/util.c
@@ -71,7 +71,7 @@ int sock_l4_sa(const struct ctx *c, enum epoll_type type,
 	case EPOLL_TYPE_UDP_LISTEN:
 		freebind = c->freebind;
 		/* fallthrough */
-	case EPOLL_TYPE_UDP_REPLY:
+	case EPOLL_TYPE_UDP:
 		proto = IPPROTO_UDP;
 		socktype = SOCK_DGRAM | SOCK_NONBLOCK;
 		break;

From 1d7bbb101a0b1dcbc99c51cd65abb90a0144ac7b Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Fri, 4 Apr 2025 21:15:32 +1100
Subject: [PATCH 095/144] udp: Make udp_sock_recv() take max number of frames
 as a parameter

Currently udp_sock_recv() decides the maximum number of frames it is
willing to receive based on the mode.  However, we have upcoming use cases
where we will have different criteria for how many frames we want with
information that's not naturally available here but is in the caller.  So
make the maximum number of frames a parameter.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
[sbrivio: Fix typo in comment in udp_buf_reply_sock_data()]
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 udp.c | 27 +++++++++++++--------------
 1 file changed, 13 insertions(+), 14 deletions(-)

diff --git a/udp.c b/udp.c
index 1b3fffd..53403bf 100644
--- a/udp.c
+++ b/udp.c
@@ -634,22 +634,14 @@ static int udp_sock_errs(const struct ctx *c, union epoll_ref ref)
  * @c:		Execution context
  * @s:		Socket to receive from
  * @mmh		mmsghdr array to receive into
+ * @n:		Maximum number of datagrams to receive
  *
  * Return: Number of datagrams received
  *
  * #syscalls recvmmsg arm:recvmmsg_time64 i686:recvmmsg_time64
  */
-static int udp_sock_recv(const struct ctx *c, int s, struct mmsghdr *mmh)
+static int udp_sock_recv(const struct ctx *c, int s, struct mmsghdr *mmh, int n)
 {
-	/* For not entirely clear reasons (data locality?) pasta gets better
-	 * throughput if we receive tap datagrams one at a atime.  For small
-	 * splice datagrams throughput is slightly better if we do batch, but
-	 * it's slightly worse for large splice datagrams.  Since we don't know
-	 * before we receive whether we'll use tap or splice, always go one at a
-	 * time for pasta mode.
-	 */
-	int n = (c->mode == MODE_PASTA ? 1 : UDP_MAX_FRAMES);
-
 	ASSERT(!c->no_udp);
 
 	n = recvmmsg(s, mmh, n, 0, NULL);
@@ -671,9 +663,10 @@ static void udp_buf_listen_sock_data(const struct ctx *c, union epoll_ref ref,
 				     const struct timespec *now)
 {
 	const socklen_t sasize = sizeof(udp_meta[0].s_in);
-	int n, i;
+	/* See udp_buf_sock_data() comment */
+	int n = (c->mode == MODE_PASTA ? 1 : UDP_MAX_FRAMES), i;
 
-	if ((n = udp_sock_recv(c, ref.fd, udp_mh_recv)) <= 0)
+	if ((n = udp_sock_recv(c, ref.fd, udp_mh_recv, n)) <= 0)
 		return;
 
 	/* We divide datagrams into batches based on how we need to send them,
@@ -768,9 +761,15 @@ static bool udp_buf_reply_sock_data(const struct ctx *c,
 {
 	const struct flowside *toside = flowside_at_sidx(tosidx);
 	uint8_t topif = pif_at_sidx(tosidx);
-	int n, i;
+	/* For not entirely clear reasons (data locality?) pasta gets better
+	 * throughput if we receive tap datagrams one at a time.  For small
+	 * splice datagrams throughput is slightly better if we do batch, but
+	 * it's slightly worse for large splice datagrams.  Since we don't know
+	 * the size before we receive, always go one at a time for pasta mode.
+	 */
+	int n = (c->mode == MODE_PASTA ? 1 : UDP_MAX_FRAMES), i;
 
-	if ((n = udp_sock_recv(c, s, udp_mh_recv)) <= 0)
+	if ((n = udp_sock_recv(c, s, udp_mh_recv, n)) <= 0)
 		return true;
 
 	for (i = 0; i < n; i++) {

From 84ab1305fabaf07b5badf433e55a458de5b86918 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Fri, 4 Apr 2025 21:15:33 +1100
Subject: [PATCH 096/144] udp: Polish udp_vu_sock_info() and remove from vu
 specific code

udp_vu_sock_info() uses MSG_PEEK to look ahead at the next datagram to be
received and gets its source address.  Currently we only use it in the
vhost-user path, but there's nothing inherently vhost-user specific about
it.  We have upcoming uses for it elsewhere so rename and move to udp.c.

While we're there, polish its error reporting a little.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
[sbrivio: Drop excess newline before udp_sock_recv()]
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 udp.c          | 24 ++++++++++++++++++++++++
 udp_internal.h |  1 +
 udp_vu.c       | 19 +------------------
 3 files changed, 26 insertions(+), 18 deletions(-)

diff --git a/udp.c b/udp.c
index 53403bf..1e241c8 100644
--- a/udp.c
+++ b/udp.c
@@ -629,6 +629,30 @@ static int udp_sock_errs(const struct ctx *c, union epoll_ref ref)
 	return n_err;
 }
 
+/**
+ * udp_peek_addr() - Get source address for next packet
+ * @s:		Socket to get information from
+ * @src:	Socket address (output)
+ *
+ * Return: 0 on success, -1 otherwise
+ */
+int udp_peek_addr(int s, union sockaddr_inany *src)
+{
+	struct msghdr msg = {
+		.msg_name = src,
+		.msg_namelen = sizeof(*src),
+	};
+	int rc;
+
+	rc = recvmsg(s, &msg, MSG_PEEK | MSG_DONTWAIT);
+	if (rc < 0) {
+		if (errno != EAGAIN && errno != EWOULDBLOCK)
+			warn_perror("Error peeking at socket address");
+		return rc;
+	}
+	return 0;
+}
+
 /**
  * udp_sock_recv() - Receive datagrams from a socket
  * @c:		Execution context
diff --git a/udp_internal.h b/udp_internal.h
index 02724e5..43a6109 100644
--- a/udp_internal.h
+++ b/udp_internal.h
@@ -30,5 +30,6 @@ size_t udp_update_hdr4(struct iphdr *ip4h, struct udp_payload_t *bp,
 size_t udp_update_hdr6(struct ipv6hdr *ip6h, struct udp_payload_t *bp,
                        const struct flowside *toside, size_t dlen,
 		       bool no_udp_csum);
+int udp_peek_addr(int s, union sockaddr_inany *src);
 
 #endif /* UDP_INTERNAL_H */
diff --git a/udp_vu.c b/udp_vu.c
index 4153b6c..5faf1e1 100644
--- a/udp_vu.c
+++ b/udp_vu.c
@@ -57,23 +57,6 @@ static size_t udp_vu_hdrlen(bool v6)
 	return hdrlen;
 }
 
-/**
- * udp_vu_sock_info() - get socket information
- * @s:		Socket to get information from
- * @s_in:	Socket address (output)
- *
- * Return: 0 if socket address can be read, -1 otherwise
- */
-static int udp_vu_sock_info(int s, union sockaddr_inany *s_in)
-{
-	struct msghdr msg = {
-		.msg_name = s_in,
-		.msg_namelen = sizeof(union sockaddr_inany),
-	};
-
-	return recvmsg(s, &msg, MSG_PEEK | MSG_DONTWAIT);
-}
-
 /**
  * udp_vu_sock_recv() - Receive datagrams from socket into vhost-user buffers
  * @c:		Execution context
@@ -230,7 +213,7 @@ void udp_vu_listen_sock_data(const struct ctx *c, union epoll_ref ref,
 		int iov_used;
 		bool v6;
 
-		if (udp_vu_sock_info(ref.fd, &s_in) < 0)
+		if (udp_peek_addr(ref.fd, &s_in) < 0)
 			break;
 
 		sidx = udp_flow_from_sock(c, ref, &s_in, now);

From 3a0881dfd02d758b0dc8ca6f5732bcb666b6d21e Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Fri, 4 Apr 2025 21:15:34 +1100
Subject: [PATCH 097/144] udp: Don't bother to batch datagrams from "listening"
 socket

A "listening" UDP socket can receive datagrams from multiple flows.  So,
we currently have some quite subtle and complex code in
udp_buf_listen_sock_data() to group contiguously received packets for the
same flow into batches for forwarding.

However, since we are now always using flow specific connect()ed sockets
once a flow is established, handling of datagrams on listening sockets is
essentially a slow path.  Given that, it's not worth the complexity.
Substantially simplify the code by using an approach more like vhost-user,
and "peeking" at the address of the next datagram, one at a time to
determine the correct flow before we actually receive the data.

This removes all meaningful use of the s_in and tosidx fields in
udp_meta_t, so they too can be removed, along with setting of msg_name and
msg_namelen in the msghdr arrays which referenced them.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 udp.c | 75 +++++++++++++++--------------------------------------------
 1 file changed, 19 insertions(+), 56 deletions(-)

diff --git a/udp.c b/udp.c
index 1e241c8..4d32124 100644
--- a/udp.c
+++ b/udp.c
@@ -138,20 +138,15 @@ static struct ethhdr udp4_eth_hdr;
 static struct ethhdr udp6_eth_hdr;
 
 /**
- * struct udp_meta_t - Pre-cooked headers and metadata for UDP packets
+ * struct udp_meta_t - Pre-cooked headers for UDP packets
  * @ip6h:	Pre-filled IPv6 header (except for payload_len and addresses)
  * @ip4h:	Pre-filled IPv4 header (except for tot_len and saddr)
  * @taph:	Tap backend specific header
- * @s_in:	Source socket address, filled in by recvmmsg()
- * @tosidx:	sidx for the destination side of this datagram's flow
  */
 static struct udp_meta_t {
 	struct ipv6hdr ip6h;
 	struct iphdr ip4h;
 	struct tap_hdr taph;
-
-	union sockaddr_inany s_in;
-	flow_sidx_t tosidx;
 }
 #ifdef __AVX2__
 __attribute__ ((aligned(32)))
@@ -234,8 +229,6 @@ static void udp_iov_init_one(const struct ctx *c, size_t i)
 	tiov[UDP_IOV_TAP] = tap_hdr_iov(c, &meta->taph);
 	tiov[UDP_IOV_PAYLOAD].iov_base = payload;
 
-	mh->msg_name	= &meta->s_in;
-	mh->msg_namelen	= sizeof(meta->s_in);
 	mh->msg_iov	= siov;
 	mh->msg_iovlen	= 1;
 }
@@ -686,60 +679,32 @@ static int udp_sock_recv(const struct ctx *c, int s, struct mmsghdr *mmh, int n)
 static void udp_buf_listen_sock_data(const struct ctx *c, union epoll_ref ref,
 				     const struct timespec *now)
 {
-	const socklen_t sasize = sizeof(udp_meta[0].s_in);
-	/* See udp_buf_sock_data() comment */
-	int n = (c->mode == MODE_PASTA ? 1 : UDP_MAX_FRAMES), i;
+	union sockaddr_inany src;
 
-	if ((n = udp_sock_recv(c, ref.fd, udp_mh_recv, n)) <= 0)
-		return;
+	while (udp_peek_addr(ref.fd, &src) == 0) {
+		flow_sidx_t tosidx = udp_flow_from_sock(c, ref, &src, now);
+		uint8_t topif = pif_at_sidx(tosidx);
 
-	/* We divide datagrams into batches based on how we need to send them,
-	 * determined by udp_meta[i].tosidx.  To avoid either two passes through
-	 * the array, or recalculating tosidx for a single entry, we have to
-	 * populate it one entry *ahead* of the loop counter.
-	 */
-	udp_meta[0].tosidx = udp_flow_from_sock(c, ref, &udp_meta[0].s_in, now);
-	udp_mh_recv[0].msg_hdr.msg_namelen = sasize;
-	for (i = 0; i < n; ) {
-		flow_sidx_t batchsidx = udp_meta[i].tosidx;
-		uint8_t batchpif = pif_at_sidx(batchsidx);
-		int batchstart = i;
+		if (udp_sock_recv(c, ref.fd, udp_mh_recv, 1) <= 0)
+			break;
 
-		do {
-			if (pif_is_socket(batchpif)) {
-				udp_splice_prepare(udp_mh_recv, i);
-			} else if (batchpif == PIF_TAP) {
-				udp_tap_prepare(udp_mh_recv, i,
-						flowside_at_sidx(batchsidx),
-						false);
-			}
-
-			if (++i >= n)
-				break;
-
-			udp_meta[i].tosidx = udp_flow_from_sock(c, ref,
-								&udp_meta[i].s_in,
-								now);
-			udp_mh_recv[i].msg_hdr.msg_namelen = sasize;
-		} while (flow_sidx_eq(udp_meta[i].tosidx, batchsidx));
-
-		if (pif_is_socket(batchpif)) {
-			udp_splice_send(c, batchstart, i - batchstart,
-					batchsidx);
-		} else if (batchpif == PIF_TAP) {
-			tap_send_frames(c, &udp_l2_iov[batchstart][0],
-					UDP_NUM_IOVS, i - batchstart);
-		} else if (flow_sidx_valid(batchsidx)) {
-			flow_sidx_t fromsidx = flow_sidx_opposite(batchsidx);
-			struct udp_flow *uflow = udp_at_sidx(batchsidx);
+		if (pif_is_socket(topif)) {
+			udp_splice_prepare(udp_mh_recv, 0);
+			udp_splice_send(c, 0, 1, tosidx);
+		} else if (topif == PIF_TAP) {
+			udp_tap_prepare(udp_mh_recv, 0, flowside_at_sidx(tosidx),
+					false);
+			tap_send_frames(c, &udp_l2_iov[0][0], UDP_NUM_IOVS, 1);
+		} else if (flow_sidx_valid(tosidx)) {
+			flow_sidx_t fromsidx = flow_sidx_opposite(tosidx);
+			struct udp_flow *uflow = udp_at_sidx(tosidx);
 
 			flow_err(uflow,
 				 "No support for forwarding UDP from %s to %s",
 				 pif_name(pif_at_sidx(fromsidx)),
-				 pif_name(batchpif));
+				 pif_name(topif));
 		} else {
-			debug("Discarding %d datagrams without flow",
-			      i - batchstart);
+			debug("Discarding datagram without flow");
 		}
 	}
 }
@@ -801,8 +766,6 @@ static bool udp_buf_reply_sock_data(const struct ctx *c,
 			udp_splice_prepare(udp_mh_recv, i);
 		else if (topif == PIF_TAP)
 			udp_tap_prepare(udp_mh_recv, i, toside, false);
-		/* Restore sockaddr length clobbered by recvmsg() */
-		udp_mh_recv[i].msg_hdr.msg_namelen = sizeof(udp_meta[i].s_in);
 	}
 
 	if (pif_is_socket(topif)) {

From 5221e177e132b8b5001ec97f42975ad1251f7110 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Fri, 4 Apr 2025 21:15:35 +1100
Subject: [PATCH 098/144] udp: Parameterize number of datagrams handled by
 udp_*_reply_sock_data()

Both udp_buf_reply_sock_data() and udp_vu_reply_sock_data() internally
decide what the maximum number of datagrams they will forward is.  We have
some upcoming reasons to allow the caller to decide that instead, so make
the maximum number of datagrams a parameter for both of them.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 udp.c    | 31 ++++++++++++++++++-------------
 udp_vu.c |  6 ++++--
 udp_vu.h |  3 ++-
 3 files changed, 24 insertions(+), 16 deletions(-)

diff --git a/udp.c b/udp.c
index 4d32124..0f09e67 100644
--- a/udp.c
+++ b/udp.c
@@ -741,22 +741,17 @@ void udp_listen_sock_handler(const struct ctx *c,
  * udp_buf_reply_sock_data() - Handle new data from flow specific socket
  * @c:		Execution context
  * @s:		Socket to read data from
+ * @n:		Maximum number of datagrams to forward
  * @tosidx:	Flow & side to forward data from @s to
  *
  * Return: true on success, false if can't forward from socket to flow's pif
  */
-static bool udp_buf_reply_sock_data(const struct ctx *c,
-				    int s, flow_sidx_t tosidx)
+static bool udp_buf_reply_sock_data(const struct ctx *c, int s, int n,
+				    flow_sidx_t tosidx)
 {
 	const struct flowside *toside = flowside_at_sidx(tosidx);
 	uint8_t topif = pif_at_sidx(tosidx);
-	/* For not entirely clear reasons (data locality?) pasta gets better
-	 * throughput if we receive tap datagrams one at a time.  For small
-	 * splice datagrams throughput is slightly better if we do batch, but
-	 * it's slightly worse for large splice datagrams.  Since we don't know
-	 * the size before we receive, always go one at a time for pasta mode.
-	 */
-	int n = (c->mode == MODE_PASTA ? 1 : UDP_MAX_FRAMES), i;
+	int i;
 
 	if ((n = udp_sock_recv(c, s, udp_mh_recv, n)) <= 0)
 		return true;
@@ -801,6 +796,14 @@ void udp_sock_handler(const struct ctx *c, union epoll_ref ref,
 	}
 
 	if (events & EPOLLIN) {
+		/* For not entirely clear reasons (data locality?) pasta gets
+		 * better throughput if we receive tap datagrams one at a
+		 * time.  For small splice datagrams throughput is slightly
+		 * better if we do batch, but it's slightly worse for large
+		 * splice datagrams.  Since we don't know the size before we
+		 * receive, always go one at a time for pasta mode.
+		 */
+		size_t n = (c->mode == MODE_PASTA ? 1 : UDP_MAX_FRAMES);
 		flow_sidx_t tosidx = flow_sidx_opposite(ref.flowside);
 		int s = ref.fd;
 		bool ret;
@@ -808,10 +811,12 @@ void udp_sock_handler(const struct ctx *c, union epoll_ref ref,
 		flow_trace(uflow, "Received data on reply socket");
 		uflow->ts = now->tv_sec;
 
-		if (c->mode == MODE_VU)
-			ret = udp_vu_reply_sock_data(c, s, tosidx);
-		else
-			ret = udp_buf_reply_sock_data(c, s, tosidx);
+		if (c->mode == MODE_VU) {
+			ret = udp_vu_reply_sock_data(c, s, UDP_MAX_FRAMES,
+						     tosidx);
+		} else {
+			ret = udp_buf_reply_sock_data(c, s, n, tosidx);
+		}
 
 		if (!ret) {
 			flow_err(uflow, "Unable to forward UDP");
diff --git a/udp_vu.c b/udp_vu.c
index 5faf1e1..b2618b3 100644
--- a/udp_vu.c
+++ b/udp_vu.c
@@ -257,11 +257,13 @@ void udp_vu_listen_sock_data(const struct ctx *c, union epoll_ref ref,
  * udp_vu_reply_sock_data() - Handle new data from flow specific socket
  * @c:		Execution context
  * @s:		Socket to read data from
+ * @n:		Maximum number of datagrams to forward
  * @tosidx:	Flow & side to forward data from @s to
  *
  * Return: true on success, false if can't forward from socket to flow's pif
  */
-bool udp_vu_reply_sock_data(const struct ctx *c, int s, flow_sidx_t tosidx)
+bool udp_vu_reply_sock_data(const struct ctx *c, int s, int n,
+			    flow_sidx_t tosidx)
 {
 	const struct flowside *toside = flowside_at_sidx(tosidx);
 	bool v6 = !(inany_v4(&toside->eaddr) && inany_v4(&toside->oaddr));
@@ -272,7 +274,7 @@ bool udp_vu_reply_sock_data(const struct ctx *c, int s, flow_sidx_t tosidx)
 	if (pif_at_sidx(tosidx) != PIF_TAP)
 		return false;
 
-	for (i = 0; i < UDP_MAX_FRAMES; i++) {
+	for (i = 0; i < n; i++) {
 		ssize_t dlen;
 		int iov_used;
 
diff --git a/udp_vu.h b/udp_vu.h
index 6d541a4..c897c36 100644
--- a/udp_vu.h
+++ b/udp_vu.h
@@ -8,6 +8,7 @@
 
 void udp_vu_listen_sock_data(const struct ctx *c, union epoll_ref ref,
 			     const struct timespec *now);
-bool udp_vu_reply_sock_data(const struct ctx *c, int s, flow_sidx_t tosidx);
+bool udp_vu_reply_sock_data(const struct ctx *c, int s, int n,
+			    flow_sidx_t tosidx);
 
 #endif /* UDP_VU_H */

From 0304dd9c34a7dd29c3a8a2058626a971d4e71a8e Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Fri, 4 Apr 2025 21:15:36 +1100
Subject: [PATCH 099/144] udp: Split spliced forwarding path from
 udp_buf_reply_sock_data()

udp_buf_reply_sock_data() can handle forwarding data either from socket
to socket ("splicing") or from socket to tap.  It has a test on each
datagram for which case we're in, but that will be the same for everything
in the batch.

Split out the spliced path into a separate udp_sock_to_sock() function.
This leaves udp_{buf,vu}_reply_sock_data() handling only forwards from
socket to tap, so rename and simplify them accordingly.

This makes the code slightly longer for now, but will allow future cleanups
to shrink it back down again.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
[sbrivio: Fix typos in comments to udp_sock_recv() and
 udp_vu_listen_sock_data()]
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 udp.c    | 103 ++++++++++++++++++++++++++++++-------------------------
 udp_vu.c |  12 ++-----
 udp_vu.h |   3 +-
 3 files changed, 60 insertions(+), 58 deletions(-)

diff --git a/udp.c b/udp.c
index 0f09e67..2745e5d 100644
--- a/udp.c
+++ b/udp.c
@@ -670,6 +670,49 @@ static int udp_sock_recv(const struct ctx *c, int s, struct mmsghdr *mmh, int n)
 	return n;
 }
 
+/**
+ * udp_sock_to_sock() - Forward datagrams from socket to socket
+ * @c:		Execution context
+ * @from_s:	Socket to receive datagrams from
+ * @n:		Maximum number of datagrams to forward
+ * @tosidx:	Flow & side to forward datagrams to
+ */
+static void udp_sock_to_sock(const struct ctx *c, int from_s, int n,
+			     flow_sidx_t tosidx)
+{
+	int i;
+
+	if ((n = udp_sock_recv(c, from_s, udp_mh_recv, n)) <= 0)
+		return;
+
+	for (i = 0; i < n; i++)
+		udp_splice_prepare(udp_mh_recv, i);
+
+	udp_splice_send(c, 0, n, tosidx);
+}
+
+/**
+ * udp_buf_sock_to_tap() - Forward datagrams from socket to tap
+ * @c:		Execution context
+ * @s:		Socket to read data from
+ * @n:		Maximum number of datagrams to forward
+ * @tosidx:	Flow & side to forward data from @s to
+ */
+static void udp_buf_sock_to_tap(const struct ctx *c, int s, int n,
+				flow_sidx_t tosidx)
+{
+	const struct flowside *toside = flowside_at_sidx(tosidx);
+	int i;
+
+	if ((n = udp_sock_recv(c, s, udp_mh_recv, n)) <= 0)
+		return;
+
+	for (i = 0; i < n; i++)
+		udp_tap_prepare(udp_mh_recv, i, toside, false);
+
+	tap_send_frames(c, &udp_l2_iov[0][0], UDP_NUM_IOVS, n);
+}
+
 /**
  * udp_buf_listen_sock_data() - Handle new data from socket
  * @c:		Execution context
@@ -737,43 +780,6 @@ void udp_listen_sock_handler(const struct ctx *c,
 	}
 }
 
-/**
- * udp_buf_reply_sock_data() - Handle new data from flow specific socket
- * @c:		Execution context
- * @s:		Socket to read data from
- * @n:		Maximum number of datagrams to forward
- * @tosidx:	Flow & side to forward data from @s to
- *
- * Return: true on success, false if can't forward from socket to flow's pif
- */
-static bool udp_buf_reply_sock_data(const struct ctx *c, int s, int n,
-				    flow_sidx_t tosidx)
-{
-	const struct flowside *toside = flowside_at_sidx(tosidx);
-	uint8_t topif = pif_at_sidx(tosidx);
-	int i;
-
-	if ((n = udp_sock_recv(c, s, udp_mh_recv, n)) <= 0)
-		return true;
-
-	for (i = 0; i < n; i++) {
-		if (pif_is_socket(topif))
-			udp_splice_prepare(udp_mh_recv, i);
-		else if (topif == PIF_TAP)
-			udp_tap_prepare(udp_mh_recv, i, toside, false);
-	}
-
-	if (pif_is_socket(topif)) {
-		udp_splice_send(c, 0, n, tosidx);
-	} else if (topif == PIF_TAP) {
-		tap_send_frames(c, &udp_l2_iov[0][0], UDP_NUM_IOVS, n);
-	} else {
-		return false;
-	}
-
-	return true;
-}
-
 /**
  * udp_sock_handler() - Handle new data from flow specific socket
  * @c:		Execution context
@@ -805,21 +811,26 @@ void udp_sock_handler(const struct ctx *c, union epoll_ref ref,
 		 */
 		size_t n = (c->mode == MODE_PASTA ? 1 : UDP_MAX_FRAMES);
 		flow_sidx_t tosidx = flow_sidx_opposite(ref.flowside);
+		uint8_t topif = pif_at_sidx(tosidx);
 		int s = ref.fd;
-		bool ret;
 
 		flow_trace(uflow, "Received data on reply socket");
 		uflow->ts = now->tv_sec;
 
-		if (c->mode == MODE_VU) {
-			ret = udp_vu_reply_sock_data(c, s, UDP_MAX_FRAMES,
-						     tosidx);
+		if (pif_is_socket(topif)) {
+			udp_sock_to_sock(c, ref.fd, n, tosidx);
+		} else if (topif == PIF_TAP) {
+			if (c->mode == MODE_VU) {
+				udp_vu_sock_to_tap(c, s, UDP_MAX_FRAMES,
+						   tosidx);
+			} else {
+				udp_buf_sock_to_tap(c, s, n, tosidx);
+			}
 		} else {
-			ret = udp_buf_reply_sock_data(c, s, n, tosidx);
-		}
-
-		if (!ret) {
-			flow_err(uflow, "Unable to forward UDP");
+			flow_err(uflow,
+				 "No support for forwarding UDP from %s to %s",
+				 pif_name(pif_at_sidx(ref.flowside)),
+				 pif_name(topif));
 			goto fail;
 		}
 	}
diff --git a/udp_vu.c b/udp_vu.c
index b2618b3..8e02093 100644
--- a/udp_vu.c
+++ b/udp_vu.c
@@ -254,16 +254,13 @@ void udp_vu_listen_sock_data(const struct ctx *c, union epoll_ref ref,
 }
 
 /**
- * udp_vu_reply_sock_data() - Handle new data from flow specific socket
+ * udp_vu_sock_to_tap() - Forward datagrams from socket to tap
  * @c:		Execution context
  * @s:		Socket to read data from
  * @n:		Maximum number of datagrams to forward
  * @tosidx:	Flow & side to forward data from @s to
- *
- * Return: true on success, false if can't forward from socket to flow's pif
  */
-bool udp_vu_reply_sock_data(const struct ctx *c, int s, int n,
-			    flow_sidx_t tosidx)
+void udp_vu_sock_to_tap(const struct ctx *c, int s, int n, flow_sidx_t tosidx)
 {
 	const struct flowside *toside = flowside_at_sidx(tosidx);
 	bool v6 = !(inany_v4(&toside->eaddr) && inany_v4(&toside->oaddr));
@@ -271,9 +268,6 @@ bool udp_vu_reply_sock_data(const struct ctx *c, int s, int n,
 	struct vu_virtq *vq = &vdev->vq[VHOST_USER_RX_QUEUE];
 	int i;
 
-	if (pif_at_sidx(tosidx) != PIF_TAP)
-		return false;
-
 	for (i = 0; i < n; i++) {
 		ssize_t dlen;
 		int iov_used;
@@ -290,6 +284,4 @@ bool udp_vu_reply_sock_data(const struct ctx *c, int s, int n,
 		}
 		vu_flush(vdev, vq, elem, iov_used);
 	}
-
-	return true;
 }
diff --git a/udp_vu.h b/udp_vu.h
index c897c36..576b0e7 100644
--- a/udp_vu.h
+++ b/udp_vu.h
@@ -8,7 +8,6 @@
 
 void udp_vu_listen_sock_data(const struct ctx *c, union epoll_ref ref,
 			     const struct timespec *now);
-bool udp_vu_reply_sock_data(const struct ctx *c, int s, int n,
-			    flow_sidx_t tosidx);
+void udp_vu_sock_to_tap(const struct ctx *c, int s, int n, flow_sidx_t tosidx);
 
 #endif /* UDP_VU_H */

From fc6ee68ad3a8863cba534dfa4b88767114a6701e Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Fri, 4 Apr 2025 21:15:37 +1100
Subject: [PATCH 100/144] udp: Merge vhost-user and "buf" listening socket
 paths

udp_buf_listen_sock_data() and udp_vu_listen_sock_data() now have
effectively identical structure.  The forwarding functions used for flow
specific sockets (udp_buf_sock_to_tap(), udp_vu_sock_to_tap() and
udp_sock_to_sock()) also now take a number of datagrams.  This means we
can re-use them for the listening socket path, just passing '1' so they
handle a single datagram at a time.

This allows us to merge both the vhost-user and "buf" listening socket
paths into a single, simpler udp_listen_sock_data().

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 udp.c          | 27 ++++++++--------------
 udp_internal.h |  1 -
 udp_vu.c       | 62 --------------------------------------------------
 3 files changed, 10 insertions(+), 80 deletions(-)

diff --git a/udp.c b/udp.c
index 2745e5d..b0a7bf7 100644
--- a/udp.c
+++ b/udp.c
@@ -629,7 +629,7 @@ static int udp_sock_errs(const struct ctx *c, union epoll_ref ref)
  *
  * Return: 0 on success, -1 otherwise
  */
-int udp_peek_addr(int s, union sockaddr_inany *src)
+static int udp_peek_addr(int s, union sockaddr_inany *src)
 {
 	struct msghdr msg = {
 		.msg_name = src,
@@ -714,12 +714,12 @@ static void udp_buf_sock_to_tap(const struct ctx *c, int s, int n,
 }
 
 /**
- * udp_buf_listen_sock_data() - Handle new data from socket
+ * udp_listen_sock_data() - Handle new data from listening socket
  * @c:		Execution context
  * @ref:	epoll reference
  * @now:	Current timestamp
  */
-static void udp_buf_listen_sock_data(const struct ctx *c, union epoll_ref ref,
+static void udp_listen_sock_data(const struct ctx *c, union epoll_ref ref,
 				     const struct timespec *now)
 {
 	union sockaddr_inany src;
@@ -728,16 +728,13 @@ static void udp_buf_listen_sock_data(const struct ctx *c, union epoll_ref ref,
 		flow_sidx_t tosidx = udp_flow_from_sock(c, ref, &src, now);
 		uint8_t topif = pif_at_sidx(tosidx);
 
-		if (udp_sock_recv(c, ref.fd, udp_mh_recv, 1) <= 0)
-			break;
-
 		if (pif_is_socket(topif)) {
-			udp_splice_prepare(udp_mh_recv, 0);
-			udp_splice_send(c, 0, 1, tosidx);
+			udp_sock_to_sock(c, ref.fd, 1, tosidx);
 		} else if (topif == PIF_TAP) {
-			udp_tap_prepare(udp_mh_recv, 0, flowside_at_sidx(tosidx),
-					false);
-			tap_send_frames(c, &udp_l2_iov[0][0], UDP_NUM_IOVS, 1);
+			if (c->mode == MODE_VU)
+				udp_vu_sock_to_tap(c, ref.fd, 1, tosidx);
+			else
+				udp_buf_sock_to_tap(c, ref.fd, 1, tosidx);
 		} else if (flow_sidx_valid(tosidx)) {
 			flow_sidx_t fromsidx = flow_sidx_opposite(tosidx);
 			struct udp_flow *uflow = udp_at_sidx(tosidx);
@@ -772,12 +769,8 @@ void udp_listen_sock_handler(const struct ctx *c,
 		}
 	}
 
-	if (events & EPOLLIN) {
-		if (c->mode == MODE_VU)
-			udp_vu_listen_sock_data(c, ref, now);
-		else
-			udp_buf_listen_sock_data(c, ref, now);
-	}
+	if (events & EPOLLIN)
+		udp_listen_sock_data(c, ref, now);
 }
 
 /**
diff --git a/udp_internal.h b/udp_internal.h
index 43a6109..02724e5 100644
--- a/udp_internal.h
+++ b/udp_internal.h
@@ -30,6 +30,5 @@ size_t udp_update_hdr4(struct iphdr *ip4h, struct udp_payload_t *bp,
 size_t udp_update_hdr6(struct ipv6hdr *ip6h, struct udp_payload_t *bp,
                        const struct flowside *toside, size_t dlen,
 		       bool no_udp_csum);
-int udp_peek_addr(int s, union sockaddr_inany *src);
 
 #endif /* UDP_INTERNAL_H */
diff --git a/udp_vu.c b/udp_vu.c
index 8e02093..1f89509 100644
--- a/udp_vu.c
+++ b/udp_vu.c
@@ -191,68 +191,6 @@ static void udp_vu_csum(const struct flowside *toside, int iov_used)
 	}
 }
 
-/**
- * udp_vu_listen_sock_data() - Handle new data from socket
- * @c:		Execution context
- * @ref:	epoll reference
- * @now:	Current timestamp
- */
-void udp_vu_listen_sock_data(const struct ctx *c, union epoll_ref ref,
-			     const struct timespec *now)
-{
-	struct vu_dev *vdev = c->vdev;
-	struct vu_virtq *vq = &vdev->vq[VHOST_USER_RX_QUEUE];
-	int i;
-
-	for (i = 0; i < UDP_MAX_FRAMES; i++) {
-		const struct flowside *toside;
-		union sockaddr_inany s_in;
-		flow_sidx_t sidx;
-		uint8_t pif;
-		ssize_t dlen;
-		int iov_used;
-		bool v6;
-
-		if (udp_peek_addr(ref.fd, &s_in) < 0)
-			break;
-
-		sidx = udp_flow_from_sock(c, ref, &s_in, now);
-		pif = pif_at_sidx(sidx);
-
-		if (pif != PIF_TAP) {
-			if (flow_sidx_valid(sidx)) {
-				flow_sidx_t fromsidx = flow_sidx_opposite(sidx);
-				struct udp_flow *uflow = udp_at_sidx(sidx);
-
-				flow_err(uflow,
-					"No support for forwarding UDP from %s to %s",
-					pif_name(pif_at_sidx(fromsidx)),
-					pif_name(pif));
-			} else {
-				debug("Discarding 1 datagram without flow");
-			}
-
-			continue;
-		}
-
-		toside = flowside_at_sidx(sidx);
-
-		v6 = !(inany_v4(&toside->eaddr) && inany_v4(&toside->oaddr));
-
-		iov_used = udp_vu_sock_recv(c, ref.fd, v6, &dlen);
-		if (iov_used <= 0)
-			break;
-
-		udp_vu_prepare(c, toside, dlen);
-		if (*c->pcap) {
-			udp_vu_csum(toside, iov_used);
-			pcap_iov(iov_vu, iov_used,
-				 sizeof(struct virtio_net_hdr_mrg_rxbuf));
-		}
-		vu_flush(vdev, vq, elem, iov_used);
-	}
-}
-
 /**
  * udp_vu_sock_to_tap() - Forward datagrams from socket to tap
  * @c:		Execution context

From fd844a90bce0274d2488370ed7fadd850b6a0294 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Fri, 4 Apr 2025 21:15:38 +1100
Subject: [PATCH 101/144] udp: Move UDP_MAX_FRAMES to udp.c

Recent changes mean that this define is no longer used anywhere except in
udp.c.  Move it back into udp.c from udp_internal.h.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 udp.c          | 2 ++
 udp_internal.h | 2 --
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/udp.c b/udp.c
index b0a7bf7..f74a992 100644
--- a/udp.c
+++ b/udp.c
@@ -116,6 +116,8 @@
 #include "udp_internal.h"
 #include "udp_vu.h"
 
+#define UDP_MAX_FRAMES		32  /* max # of frames to receive at once */
+
 /* Maximum UDP data to be returned in ICMP messages */
 #define ICMP4_MAX_DLEN 8
 #define ICMP6_MAX_DLEN (IPV6_MIN_MTU			\
diff --git a/udp_internal.h b/udp_internal.h
index 02724e5..f7d8426 100644
--- a/udp_internal.h
+++ b/udp_internal.h
@@ -8,8 +8,6 @@
 
 #include "tap.h" /* needed by udp_meta_t */
 
-#define UDP_MAX_FRAMES		32  /* max # of frames to receive at once */
-
 /**
  * struct udp_payload_t - UDP header and data for inbound messages
  * @uh:		UDP header

From 159beefa36a09fc36cc9669fd536926d84c7c342 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Fri, 4 Apr 2025 21:15:39 +1100
Subject: [PATCH 102/144] udp_flow: Take pif and port as explicit parameters to
 udp_flow_from_sock()

Currently udp_flow_from_sock() is only used when receiving a datagram
from a "listening" socket.  It takes the listening socket's epoll
reference to get the interface and port on which the datagram arrived.

We have some upcoming cases where we want to use this in different
contexts, so make it take the pif and port as direct parameters instead.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
[sbrivio: Drop @ref from comment to udp_flow_from_sock()]
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 udp.c      |  4 +++-
 udp_flow.c | 16 +++++++---------
 udp_flow.h |  2 +-
 3 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/udp.c b/udp.c
index f74a992..157697e 100644
--- a/udp.c
+++ b/udp.c
@@ -727,7 +727,9 @@ static void udp_listen_sock_data(const struct ctx *c, union epoll_ref ref,
 	union sockaddr_inany src;
 
 	while (udp_peek_addr(ref.fd, &src) == 0) {
-		flow_sidx_t tosidx = udp_flow_from_sock(c, ref, &src, now);
+		flow_sidx_t tosidx = udp_flow_from_sock(c, ref.udp.pif,
+							ref.udp.port, &src,
+							now);
 		uint8_t topif = pif_at_sidx(tosidx);
 
 		if (pif_is_socket(topif)) {
diff --git a/udp_flow.c b/udp_flow.c
index a2d417f..5afe6e5 100644
--- a/udp_flow.c
+++ b/udp_flow.c
@@ -161,9 +161,10 @@ cancel:
 }
 
 /**
- * udp_flow_from_sock() - Find or create UDP flow for "listening" socket
+ * udp_flow_from_sock() - Find or create UDP flow for incoming datagram
  * @c:		Execution context
- * @ref:	epoll reference of the receiving socket
+ * @pif:	Interface the datagram is arriving from
+ * @port:	Our (local) port number to which the datagram is arriving
  * @s_in:	Source socket address, filled in by recvmmsg()
  * @now:	Timestamp
  *
@@ -172,7 +173,7 @@ cancel:
  * Return: sidx for the destination side of the flow for this packet, or
  *         FLOW_SIDX_NONE if we couldn't find or create a flow.
  */
-flow_sidx_t udp_flow_from_sock(const struct ctx *c, union epoll_ref ref,
+flow_sidx_t udp_flow_from_sock(const struct ctx *c, uint8_t pif, in_port_t port,
 			       const union sockaddr_inany *s_in,
 			       const struct timespec *now)
 {
@@ -181,9 +182,7 @@ flow_sidx_t udp_flow_from_sock(const struct ctx *c, union epoll_ref ref,
 	union flow *flow;
 	flow_sidx_t sidx;
 
-	ASSERT(ref.type == EPOLL_TYPE_UDP_LISTEN);
-
-	sidx = flow_lookup_sa(c, IPPROTO_UDP, ref.udp.pif, s_in, ref.udp.port);
+	sidx = flow_lookup_sa(c, IPPROTO_UDP, pif, s_in, port);
 	if ((uflow = udp_at_sidx(sidx))) {
 		uflow->ts = now->tv_sec;
 		return flow_sidx_opposite(sidx);
@@ -193,12 +192,11 @@ flow_sidx_t udp_flow_from_sock(const struct ctx *c, union epoll_ref ref,
 		char sastr[SOCKADDR_STRLEN];
 
 		debug("Couldn't allocate flow for UDP datagram from %s %s",
-		      pif_name(ref.udp.pif),
-		      sockaddr_ntop(s_in, sastr, sizeof(sastr)));
+		      pif_name(pif), sockaddr_ntop(s_in, sastr, sizeof(sastr)));
 		return FLOW_SIDX_NONE;
 	}
 
-	ini = flow_initiate_sa(flow, ref.udp.pif, s_in, ref.udp.port);
+	ini = flow_initiate_sa(flow, pif, s_in, port);
 
 	if (!inany_is_unicast(&ini->eaddr) ||
 	    ini->eport == 0 || ini->oport == 0) {
diff --git a/udp_flow.h b/udp_flow.h
index 520de62..bbdeb2a 100644
--- a/udp_flow.h
+++ b/udp_flow.h
@@ -26,7 +26,7 @@ struct udp_flow {
 };
 
 struct udp_flow *udp_at_sidx(flow_sidx_t sidx);
-flow_sidx_t udp_flow_from_sock(const struct ctx *c, union epoll_ref ref,
+flow_sidx_t udp_flow_from_sock(const struct ctx *c, uint8_t pif, in_port_t port,
 			       const union sockaddr_inany *s_in,
 			       const struct timespec *now);
 flow_sidx_t udp_flow_from_tap(const struct ctx *c,

From bd6a41ee76bb9a0da2150d76dbabf9a3212d0fca Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Fri, 4 Apr 2025 21:15:40 +1100
Subject: [PATCH 103/144] udp: Rework udp_listen_sock_data() into
 udp_sock_fwd()

udp_listen_sock_data() forwards datagrams from a "listening" socket until
there are no more (for now).  We have an upcoming use case where we want
to do that for a socket that's not a "listening" socket, and uses a
different epoll reference.  So, adjust the function to take the pieces it
needs from the reference as direct parameters and rename to udp_sock_fwd().

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 udp.c | 29 ++++++++++++++---------------
 1 file changed, 14 insertions(+), 15 deletions(-)

diff --git a/udp.c b/udp.c
index 157697e..20d8f0c 100644
--- a/udp.c
+++ b/udp.c
@@ -716,37 +716,36 @@ static void udp_buf_sock_to_tap(const struct ctx *c, int s, int n,
 }
 
 /**
- * udp_listen_sock_data() - Handle new data from listening socket
+ * udp_sock_fwd() - Forward datagrams from a possibly unconnected socket
  * @c:		Execution context
- * @ref:	epoll reference
+ * @s:		Socket to forward from
+ * @frompif:	Interface to which @s belongs
+ * @port:	Our (local) port number of @s
  * @now:	Current timestamp
  */
-static void udp_listen_sock_data(const struct ctx *c, union epoll_ref ref,
-				     const struct timespec *now)
+static void udp_sock_fwd(const struct ctx *c, int s, uint8_t frompif,
+			 in_port_t port, const struct timespec *now)
 {
 	union sockaddr_inany src;
 
-	while (udp_peek_addr(ref.fd, &src) == 0) {
-		flow_sidx_t tosidx = udp_flow_from_sock(c, ref.udp.pif,
-							ref.udp.port, &src,
-							now);
+	while (udp_peek_addr(s, &src) == 0) {
+		flow_sidx_t tosidx = udp_flow_from_sock(c, frompif, port,
+							&src, now);
 		uint8_t topif = pif_at_sidx(tosidx);
 
 		if (pif_is_socket(topif)) {
-			udp_sock_to_sock(c, ref.fd, 1, tosidx);
+			udp_sock_to_sock(c, s, 1, tosidx);
 		} else if (topif == PIF_TAP) {
 			if (c->mode == MODE_VU)
-				udp_vu_sock_to_tap(c, ref.fd, 1, tosidx);
+				udp_vu_sock_to_tap(c, s, 1, tosidx);
 			else
-				udp_buf_sock_to_tap(c, ref.fd, 1, tosidx);
+				udp_buf_sock_to_tap(c, s, 1, tosidx);
 		} else if (flow_sidx_valid(tosidx)) {
-			flow_sidx_t fromsidx = flow_sidx_opposite(tosidx);
 			struct udp_flow *uflow = udp_at_sidx(tosidx);
 
 			flow_err(uflow,
 				 "No support for forwarding UDP from %s to %s",
-				 pif_name(pif_at_sidx(fromsidx)),
-				 pif_name(topif));
+				 pif_name(frompif), pif_name(topif));
 		} else {
 			debug("Discarding datagram without flow");
 		}
@@ -774,7 +773,7 @@ void udp_listen_sock_handler(const struct ctx *c,
 	}
 
 	if (events & EPOLLIN)
-		udp_listen_sock_data(c, ref, now);
+		udp_sock_fwd(c, ref.fd, ref.udp.pif, ref.udp.port, now);
 }
 
 /**

From 9eb540626047bece3f25f38e47ec3b2b0030f9f4 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Fri, 4 Apr 2025 21:15:41 +1100
Subject: [PATCH 104/144] udp: Fold udp_splice_prepare and udp_splice_send into
 udp_sock_to_sock

udp_splice_prepare() and udp_splice_send() are both quite simple functions
that now have only one caller: udp_sock_to_sock().  Fold them both into
that caller.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 udp.c | 55 +++++++++++++++----------------------------------------
 1 file changed, 15 insertions(+), 40 deletions(-)

diff --git a/udp.c b/udp.c
index 20d8f0c..d9d2183 100644
--- a/udp.c
+++ b/udp.c
@@ -250,43 +250,6 @@ static void udp_iov_init(const struct ctx *c)
 		udp_iov_init_one(c, i);
 }
 
-/**
- * udp_splice_prepare() - Prepare one datagram for splicing
- * @mmh:	Receiving mmsghdr array
- * @idx:	Index of the datagram to prepare
- */
-static void udp_splice_prepare(struct mmsghdr *mmh, unsigned idx)
-{
-	udp_mh_splice[idx].msg_hdr.msg_iov->iov_len = mmh[idx].msg_len;
-}
-
-/**
- * udp_splice_send() - Send a batch of datagrams from socket to socket
- * @c:		Execution context
- * @start:	Index of batch's first datagram in udp[46]_l2_buf
- * @n:		Number of datagrams in batch
- * @src:	Source port for datagram (target side)
- * @dst:	Destination port for datagrams (target side)
- * @ref:	epoll reference for origin socket
- * @now:	Timestamp
- *
- * #syscalls sendmmsg
- */
-static void udp_splice_send(const struct ctx *c, size_t start, size_t n,
-			    flow_sidx_t tosidx)
-{
-	const struct flowside *toside = flowside_at_sidx(tosidx);
-	const struct udp_flow *uflow = udp_at_sidx(tosidx);
-	uint8_t topif = pif_at_sidx(tosidx);
-	int s = uflow->s[tosidx.sidei];
-	socklen_t sl;
-
-	pif_sockaddr(c, &udp_splice_to, &sl, topif,
-		     &toside->eaddr, toside->eport);
-
-	sendmmsg(s, udp_mh_splice + start, n, MSG_NOSIGNAL);
-}
-
 /**
  * udp_update_hdr4() - Update headers for one IPv4 datagram
  * @ip4h:		Pre-filled IPv4 header (except for tot_len and saddr)
@@ -678,19 +641,31 @@ static int udp_sock_recv(const struct ctx *c, int s, struct mmsghdr *mmh, int n)
  * @from_s:	Socket to receive datagrams from
  * @n:		Maximum number of datagrams to forward
  * @tosidx:	Flow & side to forward datagrams to
+ *
+ * #syscalls sendmmsg
  */
 static void udp_sock_to_sock(const struct ctx *c, int from_s, int n,
 			     flow_sidx_t tosidx)
 {
+	const struct flowside *toside = flowside_at_sidx(tosidx);
+	const struct udp_flow *uflow = udp_at_sidx(tosidx);
+	uint8_t topif = pif_at_sidx(tosidx);
+	int to_s = uflow->s[tosidx.sidei];
+	socklen_t sl;
 	int i;
 
 	if ((n = udp_sock_recv(c, from_s, udp_mh_recv, n)) <= 0)
 		return;
 
-	for (i = 0; i < n; i++)
-		udp_splice_prepare(udp_mh_recv, i);
+	for (i = 0; i < n; i++) {
+		udp_mh_splice[i].msg_hdr.msg_iov->iov_len
+			= udp_mh_recv[i].msg_len;
+	}
 
-	udp_splice_send(c, 0, n, tosidx);
+	pif_sockaddr(c, &udp_splice_to, &sl, topif,
+		     &toside->eaddr, toside->eport);
+
+	sendmmsg(to_s, udp_mh_splice, n, MSG_NOSIGNAL);
 }
 
 /**

From 9725e79888374a4e4060a2d798f3407c0006cc8a Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Fri, 4 Apr 2025 21:15:42 +1100
Subject: [PATCH 105/144] udp_flow: Don't discard packets that arrive between
 bind() and connect()

When we establish a new UDP flow we create connect()ed sockets that will
only handle datagrams for this flow.  However, there is a race between
bind() and connect() where they might get some packets queued for a
different flow.  Currently we handle this by simply discarding any
queued datagrams after the connect.  UDP protocols should be able to handle
such packet loss, but it's not ideal.

We now have the tools we need to handle this better, by redirecting any
datagrams received during that race to the appropriate flow.  We need to
use a deferred handler for this to avoid unexpectedly re-ordering datagrams
in some edge cases.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
[sbrivio: Update comment to udp_flow_defer()]
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 flow.c         |  2 +-
 udp.c          |  4 +--
 udp_flow.c     | 77 +++++++++++++++++++++++++++++++++++---------------
 udp_flow.h     |  6 +++-
 udp_internal.h |  2 ++
 5 files changed, 64 insertions(+), 27 deletions(-)

diff --git a/flow.c b/flow.c
index 8622242..29a83e1 100644
--- a/flow.c
+++ b/flow.c
@@ -850,7 +850,7 @@ void flow_defer_handler(const struct ctx *c, const struct timespec *now)
 				closed = icmp_ping_timer(c, &flow->ping, now);
 			break;
 		case FLOW_UDP:
-			closed = udp_flow_defer(&flow->udp);
+			closed = udp_flow_defer(c, &flow->udp, now);
 			if (!closed && timer)
 				closed = udp_flow_timer(c, &flow->udp, now);
 			break;
diff --git a/udp.c b/udp.c
index d9d2183..ed6edc1 100644
--- a/udp.c
+++ b/udp.c
@@ -698,8 +698,8 @@ static void udp_buf_sock_to_tap(const struct ctx *c, int s, int n,
  * @port:	Our (local) port number of @s
  * @now:	Current timestamp
  */
-static void udp_sock_fwd(const struct ctx *c, int s, uint8_t frompif,
-			 in_port_t port, const struct timespec *now)
+void udp_sock_fwd(const struct ctx *c, int s, uint8_t frompif,
+		  in_port_t port, const struct timespec *now)
 {
 	union sockaddr_inany src;
 
diff --git a/udp_flow.c b/udp_flow.c
index 5afe6e5..75f5a0b 100644
--- a/udp_flow.c
+++ b/udp_flow.c
@@ -9,10 +9,12 @@
 #include <fcntl.h>
 #include <sys/uio.h>
 #include <unistd.h>
+#include <netinet/udp.h>
 
 #include "util.h"
 #include "passt.h"
 #include "flow_table.h"
+#include "udp_internal.h"
 
 #define UDP_CONN_TIMEOUT	180 /* s, timeout for ephemeral or local bind */
 
@@ -67,16 +69,15 @@ void udp_flow_close(const struct ctx *c, struct udp_flow *uflow)
  * Return: fd of new socket on success, -ve error code on failure
  */
 static int udp_flow_sock(const struct ctx *c,
-			 const struct udp_flow *uflow, unsigned sidei)
+			 struct udp_flow *uflow, unsigned sidei)
 {
 	const struct flowside *side = &uflow->f.side[sidei];
-	struct mmsghdr discard[UIO_MAXIOV] = { 0 };
 	uint8_t pif = uflow->f.pif[sidei];
 	union {
 		flow_sidx_t sidx;
 		uint32_t data;
 	} fref = { .sidx = FLOW_SIDX(uflow, sidei) };
-	int rc, s;
+	int s;
 
 	s = flowside_sock_l4(c, EPOLL_TYPE_UDP, pif, side, fref.data);
 	if (s < 0) {
@@ -85,30 +86,32 @@ static int udp_flow_sock(const struct ctx *c,
 	}
 
 	if (flowside_connect(c, s, pif, side) < 0) {
-		rc = -errno;
+		int rc = -errno;
 		flow_dbg_perror(uflow, "Couldn't connect flow socket");
 		return rc;
 	}
 
-	/* It's possible, if unlikely, that we could receive some unrelated
-	 * packets in between the bind() and connect() of this socket.  For now
-	 * we just discard these.
+	/* It's possible, if unlikely, that we could receive some packets in
+	 * between the bind() and connect() which may or may not be for this
+	 * flow.  Being UDP we could just discard them, but it's not ideal.
 	 *
-	 * FIXME: Redirect these to an appropriate handler
+	 * There's also a tricky case if a bunch of datagrams for a new flow
+	 * arrive in rapid succession, the first going to the original listening
+	 * socket and later ones going to this new socket.  If we forwarded the
+	 * datagrams from the new socket immediately here they would go before
+	 * the datagram which established the flow.  Again, not strictly wrong
+	 * for UDP, but not ideal.
+	 *
+	 * So, we flag that the new socket is in a transient state where it
+	 * might have datagrams for a different flow queued.  Before the next
+	 * epoll cycle, udp_flow_defer() will flush out any such datagrams, and
+	 * thereafter everything on the new socket should be strictly for this
+	 * flow.
 	 */
-	rc = recvmmsg(s, discard, ARRAY_SIZE(discard), MSG_DONTWAIT, NULL);
-	if (rc >= ARRAY_SIZE(discard)) {
-		flow_dbg(uflow, "Too many (%d) spurious reply datagrams", rc);
-		return -E2BIG;
-	}
-
-	if (rc > 0) {
-		flow_trace(uflow, "Discarded %d spurious reply datagrams", rc);
-	} else if (errno != EAGAIN) {
-		rc = -errno;
-		flow_perror(uflow, "Unexpected error discarding datagrams");
-		return rc;
-	}
+	if (sidei)
+		uflow->flush1 = true;
+	else
+		uflow->flush0 = true;
 
 	return s;
 }
@@ -269,13 +272,41 @@ flow_sidx_t udp_flow_from_tap(const struct ctx *c,
 }
 
 /**
- * udp_flow_defer() - Deferred per-flow handling (clean up aborted flows)
+ * udp_flush_flow() - Flush datagrams that might not be for this flow
+ * @c:		Execution context
  * @uflow:	Flow to handle
+ * @sidei:	Side of the flow to flush
+ * @now:	Current timestamp
+ */
+static void udp_flush_flow(const struct ctx *c,
+			   const struct udp_flow *uflow, unsigned sidei,
+			   const struct timespec *now)
+{
+	/* We don't know exactly where the datagrams will come from, but we know
+	 * they'll have an interface and oport matching this flow */
+	udp_sock_fwd(c, uflow->s[sidei], uflow->f.pif[sidei],
+		     uflow->f.side[sidei].oport, now);
+}
+
+/**
+ * udp_flow_defer() - Deferred per-flow handling (clean up aborted flows)
+ * @c:		Execution context
+ * @uflow:	Flow to handle
+ * @now:	Current timestamp
  *
  * Return: true if the connection is ready to free, false otherwise
  */
-bool udp_flow_defer(const struct udp_flow *uflow)
+bool udp_flow_defer(const struct ctx *c, struct udp_flow *uflow,
+		    const struct timespec *now)
 {
+	if (uflow->flush0) {
+		udp_flush_flow(c, uflow, INISIDE, now);
+		uflow->flush0 = false;
+	}
+	if (uflow->flush1) {
+		udp_flush_flow(c, uflow, TGTSIDE, now);
+		uflow->flush1 = false;
+	}
 	return uflow->closed;
 }
 
diff --git a/udp_flow.h b/udp_flow.h
index bbdeb2a..90d3b29 100644
--- a/udp_flow.h
+++ b/udp_flow.h
@@ -11,6 +11,8 @@
  * struct udp_flow - Descriptor for a flow of UDP packets
  * @f:		Generic flow information
  * @closed:	Flow is already closed
+ * @flush0:	@s[0] may have datagrams queued for other flows
+ * @flush1:	@s[1] may have datagrams queued for other flows
  * @ts:		Activity timestamp
  * @s:		Socket fd (or -1) for each side of the flow
  * @ttl:	TTL or hop_limit for both sides
@@ -20,6 +22,7 @@ struct udp_flow {
 	struct flow_common f;
 
 	bool closed :1;
+	bool flush0, flush1 :1;
 	time_t ts;
 	int s[SIDES];
 	uint8_t ttl[SIDES];
@@ -35,7 +38,8 @@ flow_sidx_t udp_flow_from_tap(const struct ctx *c,
 			      in_port_t srcport, in_port_t dstport,
 			      const struct timespec *now);
 void udp_flow_close(const struct ctx *c, struct udp_flow *uflow);
-bool udp_flow_defer(const struct udp_flow *uflow);
+bool udp_flow_defer(const struct ctx *c, struct udp_flow *uflow,
+		    const struct timespec *now);
 bool udp_flow_timer(const struct ctx *c, struct udp_flow *uflow,
 		    const struct timespec *now);
 
diff --git a/udp_internal.h b/udp_internal.h
index f7d8426..96d11cf 100644
--- a/udp_internal.h
+++ b/udp_internal.h
@@ -28,5 +28,7 @@ size_t udp_update_hdr4(struct iphdr *ip4h, struct udp_payload_t *bp,
 size_t udp_update_hdr6(struct ipv6hdr *ip6h, struct udp_payload_t *bp,
                        const struct flowside *toside, size_t dlen,
 		       bool no_udp_csum);
+void udp_sock_fwd(const struct ctx *c, int s, uint8_t frompif,
+		  in_port_t port, const struct timespec *now);
 
 #endif /* UDP_INTERNAL_H */

From 06ef64cdb72475fd02c72cdd607a31a86605e734 Mon Sep 17 00:00:00 2001
From: Stefano Brivio <sbrivio@redhat.com>
Date: Tue, 8 Apr 2025 07:49:55 +0200
Subject: [PATCH 106/144] udp_flow: Save 8 bytes in struct udp_flow on 64-bit
 architectures

Shuffle the fields just added by commits a7775e9550fa ("udp: support
traceroute in direction tap-socket") and 9725e7988837 ("udp_flow:
Don't discard packets that arrive between bind() and connect()").

On x86_64, as reported by pahole(1), before:

struct udp_flow {
        struct flow_common         f;                    /*     0    76 */
        /* --- cacheline 1 boundary (64 bytes) was 12 bytes ago --- */
        _Bool                      closed:1;             /*    76: 0  1 */

        /* XXX 7 bits hole, try to pack */

        _Bool                      flush0;               /*    77     1 */
        _Bool                      flush1:1;             /*    78: 0  1 */

        /* XXX 7 bits hole, try to pack */
        /* XXX 1 byte hole, try to pack */

        time_t                     ts;                   /*    80     8 */
        int                        s[2];                 /*    88     8 */
        uint8_t                    ttl[2];               /*    96     2 */

        /* size: 104, cachelines: 2, members: 7 */
        /* sum members: 95, holes: 1, sum holes: 1 */
        /* sum bitfield members: 2 bits, bit holes: 2, sum bit holes: 14 bits */
        /* padding: 6 */
        /* last cacheline: 40 bytes */
};

and after:

struct udp_flow {
        struct flow_common         f;                    /*     0    76 */
        /* --- cacheline 1 boundary (64 bytes) was 12 bytes ago --- */
        uint8_t                    ttl[2];               /*    76     2 */
        _Bool                      closed:1;             /*    78: 0  1 */
        _Bool                      flush0:1;             /*    78: 1  1 */
        _Bool                      flush1:1;             /*    78: 2  1 */

        /* XXX 5 bits hole, try to pack */
        /* XXX 1 byte hole, try to pack */

        time_t                     ts;                   /*    80     8 */
        int                        s[2];                 /*    88     8 */

        /* size: 96, cachelines: 2, members: 7 */
        /* sum members: 94, holes: 1, sum holes: 1 */
        /* sum bitfield members: 3 bits, bit holes: 1, sum bit holes: 5 bits */
        /* last cacheline: 32 bytes */
};

It doesn't matter much because anyway the typical storage for struct
udp_flow is given by union flow:

union flow {
        struct flow_common         f;                  /*     0    76 */
        struct flow_free_cluster   free;               /*     0    84 */
        struct tcp_tap_conn        tcp;                /*     0   120 */
        struct tcp_splice_conn     tcp_splice;         /*     0   120 */
        struct icmp_ping_flow      ping;               /*     0    96 */
        struct udp_flow            udp;                /*     0    96 */
};

but it still improves data locality somewhat, so let me fix this up
now that commits are fresh.

Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
---
 udp_flow.h | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/udp_flow.h b/udp_flow.h
index 90d3b29..e289122 100644
--- a/udp_flow.h
+++ b/udp_flow.h
@@ -10,22 +10,25 @@
 /**
  * struct udp_flow - Descriptor for a flow of UDP packets
  * @f:		Generic flow information
+ * @ttl:	TTL or hop_limit for both sides
  * @closed:	Flow is already closed
  * @flush0:	@s[0] may have datagrams queued for other flows
  * @flush1:	@s[1] may have datagrams queued for other flows
  * @ts:		Activity timestamp
  * @s:		Socket fd (or -1) for each side of the flow
- * @ttl:	TTL or hop_limit for both sides
  */
 struct udp_flow {
 	/* Must be first element */
 	struct flow_common f;
 
-	bool closed :1;
-	bool flush0, flush1 :1;
+	uint8_t ttl[SIDES];
+
+	bool	closed	:1,
+		flush0	:1,
+		flush1	:1;
+
 	time_t ts;
 	int s[SIDES];
-	uint8_t ttl[SIDES];
 };
 
 struct udp_flow *udp_at_sidx(flow_sidx_t sidx);

From ffbef85e975ba117ed1c20f733d989ac08ebf325 Mon Sep 17 00:00:00 2001
From: Stefano Brivio <sbrivio@redhat.com>
Date: Tue, 8 Apr 2025 07:57:51 +0200
Subject: [PATCH 107/144] conf: Add missing return in conf_nat(), fix
 --map-guest-addr none

As reported by somebody on IRC:

  $ pasta --map-guest-addr none
  Invalid address to remap to host: none

that's because once we parsed "none", we try to parse it as an address
as well. But we already handled it, so stop once we're done.

Fixes: e813a4df7da2 ("conf: Allow address remapped to host to be configured")
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
---
 conf.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/conf.c b/conf.c
index b54c55d..168646f 100644
--- a/conf.c
+++ b/conf.c
@@ -1272,6 +1272,8 @@ static void conf_nat(const char *arg, struct in_addr *addr4,
 		*addr6 = in6addr_any;
 		if (no_map_gw)
 			*no_map_gw = 1;
+
+		return;
 	}
 
 	if (inet_pton(AF_INET6, arg, addr6)	&&

From d3f33f3b8ec4646dae3584b648cba142a73d3208 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Wed, 9 Apr 2025 16:35:40 +1000
Subject: [PATCH 108/144] tcp_splice: Don't double count bytes read on EINTR

In tcp_splice_sock_handler(), if we get an EINTR on our second splice()
(pipe to output socket) we - as we should - go back and retry it.  However,
we do so *after* we've already updated our byte counters.  That does no
harm for the conn->written[] counter - since the second splice() returned
an error it will be advanced by 0.  However we also advance the
conn->read[] counter, and then do so again when the splice() succeeds.
This results in the counters being out of sync, and us thinking we have
remaining data in the pipe when we don't, which can leave us in an
infinite loop once the stream finishes.

Fix this by moving the EINTR handling to directly next to the splice()
call (which is what we usually do for EINTR).  As a bonus this removes one
mildly confusing goto.

For symmetry, also rework the EINTR handling on the first splice() the same
way, although that doesn't (as far as I can tell) have buggy side effects.

Link: https://github.com/containers/podman/issues/23686#issuecomment-2779347687
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 tcp_splice.c | 27 +++++++++++++--------------
 1 file changed, 13 insertions(+), 14 deletions(-)

diff --git a/tcp_splice.c b/tcp_splice.c
index 0d10e3d..7c3b56f 100644
--- a/tcp_splice.c
+++ b/tcp_splice.c
@@ -520,15 +520,14 @@ swap:
 		int more = 0;
 
 retry:
-		readlen = splice(conn->s[fromsidei], NULL,
-				 conn->pipe[fromsidei][1], NULL,
-				 c->tcp.pipe_size,
-				 SPLICE_F_MOVE | SPLICE_F_NONBLOCK);
+		do
+			readlen = splice(conn->s[fromsidei], NULL,
+					 conn->pipe[fromsidei][1], NULL,
+					 c->tcp.pipe_size,
+					 SPLICE_F_MOVE | SPLICE_F_NONBLOCK);
+		while (readlen < 0 && errno == EINTR);
 		flow_trace(conn, "%zi from read-side call", readlen);
 		if (readlen < 0) {
-			if (errno == EINTR)
-				goto retry;
-
 			if (errno != EAGAIN)
 				goto close;
 		} else if (!readlen) {
@@ -543,10 +542,13 @@ retry:
 				conn_flag(c, conn, lowat_act_flag);
 		}
 
-eintr:
-		written = splice(conn->pipe[fromsidei][0], NULL,
-				 conn->s[!fromsidei], NULL, c->tcp.pipe_size,
-				 SPLICE_F_MOVE | more | SPLICE_F_NONBLOCK);
+		do
+			written = splice(conn->pipe[fromsidei][0], NULL,
+					 conn->s[!fromsidei], NULL,
+					 c->tcp.pipe_size,
+					 SPLICE_F_MOVE | more | SPLICE_F_NONBLOCK);
+		while (written < 0 && errno == EINTR);
+
 		flow_trace(conn, "%zi from write-side call (passed %zi)",
 			   written, c->tcp.pipe_size);
 
@@ -578,9 +580,6 @@ eintr:
 		conn->written[fromsidei] += written > 0 ? written : 0;
 
 		if (written < 0) {
-			if (errno == EINTR)
-				goto eintr;
-
 			if (errno != EAGAIN)
 				goto close;
 

From 6693fa115824d198b7cde46c272514be194500a9 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Wed, 9 Apr 2025 16:35:41 +1000
Subject: [PATCH 109/144] tcp_splice: Don't clobber errno before checking for
 EAGAIN

Like many places, tcp_splice_sock_handler() needs to handle EAGAIN
specially, in this case for both of its splice() calls.  Unfortunately it
tests for EAGAIN some time after those calls.  In between there has been
at least a flow_trace() which could have clobbered errno.  Move the test on
errno closer to the relevant system calls to avoid this problem.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 tcp_splice.c | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

diff --git a/tcp_splice.c b/tcp_splice.c
index 7c3b56f..60455d6 100644
--- a/tcp_splice.c
+++ b/tcp_splice.c
@@ -526,13 +526,15 @@ retry:
 					 c->tcp.pipe_size,
 					 SPLICE_F_MOVE | SPLICE_F_NONBLOCK);
 		while (readlen < 0 && errno == EINTR);
+
+		if (readlen < 0 && errno != EAGAIN)
+			goto close;
+
 		flow_trace(conn, "%zi from read-side call", readlen);
-		if (readlen < 0) {
-			if (errno != EAGAIN)
-				goto close;
-		} else if (!readlen) {
+
+		if (!readlen) {
 			eof = 1;
-		} else {
+		} else if (readlen > 0) {
 			never_read = 0;
 
 			if (readlen >= (long)c->tcp.pipe_size * 90 / 100)
@@ -549,6 +551,9 @@ retry:
 					 SPLICE_F_MOVE | more | SPLICE_F_NONBLOCK);
 		while (written < 0 && errno == EINTR);
 
+		if (written < 0 && errno != EAGAIN)
+			goto close;
+
 		flow_trace(conn, "%zi from write-side call (passed %zi)",
 			   written, c->tcp.pipe_size);
 
@@ -580,9 +585,6 @@ retry:
 		conn->written[fromsidei] += written > 0 ? written : 0;
 
 		if (written < 0) {
-			if (errno != EAGAIN)
-				goto close;
-
 			if (conn->read[fromsidei] == conn->written[fromsidei])
 				break;
 

From f4b0dd8b06850bacb2da57c8576e3377daa88572 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Thu, 10 Apr 2025 17:16:38 +1000
Subject: [PATCH 110/144] udp: Use PKTINFO cmsgs to get destination address for
 received datagrams

Currently we get the source address for received datagrams from recvmsg(),
but we don't get the local destination address.  Sometimes we implicitly
know this because the receiving socket is bound to a specific address, but
when listening on 0.0.0.0 or ::, we don't.

We need this information to properly direct replies to flows which come in
to a non-default local address.  So, enable the IP_PKTINFO and IPV6_PKTINFO
control messages to obtain this information in udp_peek_addr().  For now
we log a trace message but don't do anything more with the information.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 udp.c  | 37 +++++++++++++++++++++++++++++++++++--
 util.c |  8 ++++++--
 2 files changed, 41 insertions(+), 4 deletions(-)

diff --git a/udp.c b/udp.c
index ed6edc1..a71141a 100644
--- a/udp.c
+++ b/udp.c
@@ -587,18 +587,29 @@ static int udp_sock_errs(const struct ctx *c, union epoll_ref ref)
 	return n_err;
 }
 
+#define PKTINFO_SPACE					\
+	MAX(CMSG_SPACE(sizeof(struct in_pktinfo)),	\
+	    CMSG_SPACE(sizeof(struct in6_pktinfo)))
+
 /**
  * udp_peek_addr() - Get source address for next packet
  * @s:		Socket to get information from
  * @src:	Socket address (output)
+ * @dst:	(Local) destination address (output)
  *
  * Return: 0 on success, -1 otherwise
  */
-static int udp_peek_addr(int s, union sockaddr_inany *src)
+static int udp_peek_addr(int s, union sockaddr_inany *src,
+			 union inany_addr *dst)
 {
+	char sastr[SOCKADDR_STRLEN], dstr[INANY_ADDRSTRLEN];
+	const struct cmsghdr *hdr;
+	char cmsg[PKTINFO_SPACE];
 	struct msghdr msg = {
 		.msg_name = src,
 		.msg_namelen = sizeof(*src),
+		.msg_control = cmsg,
+		.msg_controllen = sizeof(cmsg),
 	};
 	int rc;
 
@@ -608,6 +619,27 @@ static int udp_peek_addr(int s, union sockaddr_inany *src)
 			warn_perror("Error peeking at socket address");
 		return rc;
 	}
+
+	hdr = CMSG_FIRSTHDR(&msg);
+	if (hdr && hdr->cmsg_level == IPPROTO_IP &&
+	    hdr->cmsg_type == IP_PKTINFO) {
+		const struct in_pktinfo *info4 = (void *)CMSG_DATA(hdr);
+
+		*dst = inany_from_v4(info4->ipi_addr);
+	} else if (hdr && hdr->cmsg_level == IPPROTO_IPV6 &&
+		   hdr->cmsg_type == IPV6_PKTINFO) {
+		const struct in6_pktinfo *info6 = (void *)CMSG_DATA(hdr);
+
+		dst->a6 = info6->ipi6_addr;
+	} else {
+		debug("Unexpected cmsg on UDP datagram");
+		*dst = inany_any6;
+	}
+
+	trace("Peeked UDP datagram: %s -> %s",
+	      sockaddr_ntop(src, sastr, sizeof(sastr)),
+	      inany_ntop(dst, dstr, sizeof(dstr)));
+
 	return 0;
 }
 
@@ -702,8 +734,9 @@ void udp_sock_fwd(const struct ctx *c, int s, uint8_t frompif,
 		  in_port_t port, const struct timespec *now)
 {
 	union sockaddr_inany src;
+	union inany_addr dst;
 
-	while (udp_peek_addr(s, &src) == 0) {
+	while (udp_peek_addr(s, &src, &dst) == 0) {
 		flow_sidx_t tosidx = udp_flow_from_sock(c, frompif, port,
 							&src, now);
 		uint8_t topif = pif_at_sidx(tosidx);
diff --git a/util.c b/util.c
index 0f68cf5..62a6003 100644
--- a/util.c
+++ b/util.c
@@ -109,11 +109,15 @@ int sock_l4_sa(const struct ctx *c, enum epoll_type type,
 		debug("Failed to set SO_REUSEADDR on socket %i", fd);
 
 	if (proto == IPPROTO_UDP) {
+		int pktinfo = af == AF_INET ? IP_PKTINFO : IPV6_RECVPKTINFO;
+		int recverr = af == AF_INET ? IP_RECVERR : IPV6_RECVERR;
 		int level = af == AF_INET ? IPPROTO_IP : IPPROTO_IPV6;
-		int opt = af == AF_INET ? IP_RECVERR : IPV6_RECVERR;
 
-		if (setsockopt(fd, level, opt, &y, sizeof(y)))
+		if (setsockopt(fd, level, recverr, &y, sizeof(y)))
 			die_perror("Failed to set RECVERR on socket %i", fd);
+
+		if (setsockopt(fd, level, pktinfo, &y, sizeof(y)))
+			die_perror("Failed to set PKTINFO on socket %i", fd);
 	}
 
 	if (ifname && *ifname) {

From 695c62396eb3f4627c1114ce444394e3ba34373a Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Thu, 10 Apr 2025 17:16:39 +1000
Subject: [PATCH 111/144] inany: Improve ASSERT message for bad socket family

inany_from_sockaddr() can only handle sockaddrs of family AF_INET or
AF_INET6 and asserts if given something else.  I hit this assertion while
debugging something else, and wanted to see what the bad sockaddr family
was.  Now that we have ASSERT_WITH_MSG() it's easy to add this information.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 inany.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/inany.h b/inany.h
index 6a12c29..1c247e1 100644
--- a/inany.h
+++ b/inany.h
@@ -252,7 +252,8 @@ static inline void inany_from_sockaddr(union inany_addr *aa, in_port_t *port,
 		*port = ntohs(sa->sa4.sin_port);
 	} else {
 		/* Not valid to call with other address families */
-		ASSERT(0);
+		ASSERT_WITH_MSG(0, "Unexpected sockaddr family: %u",
+				sa->sa_family);
 	}
 }
 

From 59cc89f4cc018988428637d97745cc4c919126cb Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Thu, 10 Apr 2025 17:16:40 +1000
Subject: [PATCH 112/144] udp, udp_flow: Track our specific address on socket
 interfaces

So far for UDP flows (like TCP connections) we didn't record our address
(oaddr) in the flow table entry for socket based pifs.  That's because we
didn't have that information when a flow was initiated by a datagram coming
to a "listening" socket with 0.0.0.0 or :: address.  Even when we did have
the information, we didn't record it, to simplify address matching on
lookups.

This meant that in some circumstances we could send replies on a UDP flow
from a different address than the originating request came to, which is
surprising and breaks certain setups.

We now have code in udp_peek_addr() which does determine our address for
incoming UDP datagrams.  We can use that information to properly populate
oaddr in the flow table for flows initiated from a socket.

In order to be able to consistently match datagrams to flows, we must
*always* have a specific oaddr, not an unspecified address (that's how the
flow hash table works).  So, we also need to fill in oaddr correctly for
flows we initiate *to* sockets.  Our forwarding logic doesn't specify
oaddr here, letting the kernel decide based on the routing table.  In this
case we need to call getsockname() after connect()ing the socket to find
which local address the kernel picked.

This adds getsockname() to our seccomp profile for all variants.

Link: https://bugs.passt.top/show_bug.cgi?id=99
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 flow.c       | 14 +++++++++++---
 flow.h       |  3 ++-
 flow_table.h |  1 +
 tcp.c        |  2 +-
 udp.c        |  4 ++--
 udp_flow.c   | 36 ++++++++++++++++++++++++++++++++----
 udp_flow.h   |  3 ++-
 util.h       | 10 ++++++++++
 8 files changed, 61 insertions(+), 12 deletions(-)

diff --git a/flow.c b/flow.c
index 29a83e1..3c81cb4 100644
--- a/flow.c
+++ b/flow.c
@@ -396,18 +396,22 @@ const struct flowside *flow_initiate_af(union flow *flow, uint8_t pif,
  * @flow:	Flow to change state
  * @pif:	pif of the initiating side
  * @ssa:	Source socket address
+ * @daddr:	Destination address (may be NULL)
  * @dport:	Destination port
  *
  * Return: pointer to the initiating flowside information
  */
 struct flowside *flow_initiate_sa(union flow *flow, uint8_t pif,
 				  const union sockaddr_inany *ssa,
+				  const union inany_addr *daddr,
 				  in_port_t dport)
 {
 	struct flowside *ini = &flow->f.side[INISIDE];
 
 	inany_from_sockaddr(&ini->eaddr, &ini->eport, ssa);
-	if (inany_v4(&ini->eaddr))
+	if (daddr)
+		ini->oaddr = *daddr;
+	else if (inany_v4(&ini->eaddr))
 		ini->oaddr = inany_any4;
 	else
 		ini->oaddr = inany_any6;
@@ -751,19 +755,23 @@ flow_sidx_t flow_lookup_af(const struct ctx *c,
  * @proto:	Protocol of the flow (IP L4 protocol number)
  * @pif:	Interface of the flow
  * @esa:	Socket address of the endpoint
+ * @oaddr:	Our address (may be NULL)
  * @oport:	Our port number
  *
  * Return: sidx of the matching flow & side, FLOW_SIDX_NONE if not found
  */
 flow_sidx_t flow_lookup_sa(const struct ctx *c, uint8_t proto, uint8_t pif,
-			   const void *esa, in_port_t oport)
+			   const void *esa,
+			   const union inany_addr *oaddr, in_port_t oport)
 {
 	struct flowside side = {
 		.oport = oport,
 	};
 
 	inany_from_sockaddr(&side.eaddr, &side.eport, esa);
-	if (inany_v4(&side.eaddr))
+	if (oaddr)
+		side.oaddr = *oaddr;
+	else if (inany_v4(&side.eaddr))
 		side.oaddr = inany_any4;
 	else
 		side.oaddr = inany_any6;
diff --git a/flow.h b/flow.h
index dcf7645..cac618a 100644
--- a/flow.h
+++ b/flow.h
@@ -243,7 +243,8 @@ flow_sidx_t flow_lookup_af(const struct ctx *c,
 			   const void *eaddr, const void *oaddr,
 			   in_port_t eport, in_port_t oport);
 flow_sidx_t flow_lookup_sa(const struct ctx *c, uint8_t proto, uint8_t pif,
-			   const void *esa, in_port_t oport);
+			   const void *esa,
+			   const union inany_addr *oaddr, in_port_t oport);
 
 union flow;
 
diff --git a/flow_table.h b/flow_table.h
index fd2c57b..2d5c65c 100644
--- a/flow_table.h
+++ b/flow_table.h
@@ -199,6 +199,7 @@ const struct flowside *flow_initiate_af(union flow *flow, uint8_t pif,
 					const void *daddr, in_port_t dport);
 struct flowside *flow_initiate_sa(union flow *flow, uint8_t pif,
 				  const union sockaddr_inany *ssa,
+				  const union inany_addr *daddr,
 				  in_port_t dport);
 const struct flowside *flow_target_af(union flow *flow, uint8_t pif,
 				      sa_family_t af,
diff --git a/tcp.c b/tcp.c
index 35626c9..9c6bc52 100644
--- a/tcp.c
+++ b/tcp.c
@@ -2201,7 +2201,7 @@ void tcp_listen_handler(const struct ctx *c, union epoll_ref ref,
 	 * mode only, below.
 	 */
 	ini = flow_initiate_sa(flow, ref.tcp_listen.pif, &sa,
-			       ref.tcp_listen.port);
+			       NULL, ref.tcp_listen.port);
 
 	if (c->mode == MODE_VU) { /* Rebind to same address after migration */
 		if (!getsockname(s, &sa.sa, &sl))
diff --git a/udp.c b/udp.c
index a71141a..40af7df 100644
--- a/udp.c
+++ b/udp.c
@@ -737,8 +737,8 @@ void udp_sock_fwd(const struct ctx *c, int s, uint8_t frompif,
 	union inany_addr dst;
 
 	while (udp_peek_addr(s, &src, &dst) == 0) {
-		flow_sidx_t tosidx = udp_flow_from_sock(c, frompif, port,
-							&src, now);
+		flow_sidx_t tosidx = udp_flow_from_sock(c, frompif,
+							&dst, port, &src, now);
 		uint8_t topif = pif_at_sidx(tosidx);
 
 		if (pif_is_socket(topif)) {
diff --git a/udp_flow.c b/udp_flow.c
index 75f5a0b..ef2cbb0 100644
--- a/udp_flow.c
+++ b/udp_flow.c
@@ -123,14 +123,17 @@ static int udp_flow_sock(const struct ctx *c,
  * @now:	Timestamp
  *
  * Return: UDP specific flow, if successful, NULL on failure
+ *
+ * #syscalls getsockname
  */
 static flow_sidx_t udp_flow_new(const struct ctx *c, union flow *flow,
 				const struct timespec *now)
 {
 	struct udp_flow *uflow = NULL;
+	const struct flowside *tgt;
 	unsigned sidei;
 
-	if (!flow_target(c, flow, IPPROTO_UDP))
+	if (!(tgt = flow_target(c, flow, IPPROTO_UDP)))
 		goto cancel;
 
 	uflow = FLOW_SET_TYPE(flow, FLOW_UDP, udp);
@@ -144,6 +147,29 @@ static flow_sidx_t udp_flow_new(const struct ctx *c, union flow *flow,
 				goto cancel;
 	}
 
+	if (uflow->s[TGTSIDE] >= 0 && inany_is_unspecified(&tgt->oaddr)) {
+		/* When we target a socket, we connect() it, but might not
+		 * always bind(), leaving the kernel to pick our address.  In
+		 * that case connect() will implicitly bind() the socket, but we
+		 * need to determine its local address so that we can match
+		 * reply packets back to the correct flow.  Update the flow with
+		 * the information from getsockname() */
+		union sockaddr_inany sa;
+		socklen_t sl = sizeof(sa);
+		in_port_t port;
+
+		if (getsockname(uflow->s[TGTSIDE], &sa.sa, &sl) < 0) {
+			flow_perror(uflow, "Unable to determine local address");
+			goto cancel;
+		}
+		inany_from_sockaddr(&uflow->f.side[TGTSIDE].oaddr,
+				    &port, &sa);
+		if (port != tgt->oport) {
+			flow_err(uflow, "Unexpected local port");
+			goto cancel;
+		}
+	}
+
 	/* Tap sides always need to be looked up by hash.  Socket sides don't
 	 * always, but sometimes do (receiving packets on a socket not specific
 	 * to one flow).  Unconditionally hash both sides so all our bases are
@@ -167,6 +193,7 @@ cancel:
  * udp_flow_from_sock() - Find or create UDP flow for incoming datagram
  * @c:		Execution context
  * @pif:	Interface the datagram is arriving from
+ * @dst:	Our (local) address to which the datagram is arriving
  * @port:	Our (local) port number to which the datagram is arriving
  * @s_in:	Source socket address, filled in by recvmmsg()
  * @now:	Timestamp
@@ -176,7 +203,8 @@ cancel:
  * Return: sidx for the destination side of the flow for this packet, or
  *         FLOW_SIDX_NONE if we couldn't find or create a flow.
  */
-flow_sidx_t udp_flow_from_sock(const struct ctx *c, uint8_t pif, in_port_t port,
+flow_sidx_t udp_flow_from_sock(const struct ctx *c, uint8_t pif,
+			       const union inany_addr *dst, in_port_t port,
 			       const union sockaddr_inany *s_in,
 			       const struct timespec *now)
 {
@@ -185,7 +213,7 @@ flow_sidx_t udp_flow_from_sock(const struct ctx *c, uint8_t pif, in_port_t port,
 	union flow *flow;
 	flow_sidx_t sidx;
 
-	sidx = flow_lookup_sa(c, IPPROTO_UDP, pif, s_in, port);
+	sidx = flow_lookup_sa(c, IPPROTO_UDP, pif, s_in, dst, port);
 	if ((uflow = udp_at_sidx(sidx))) {
 		uflow->ts = now->tv_sec;
 		return flow_sidx_opposite(sidx);
@@ -199,7 +227,7 @@ flow_sidx_t udp_flow_from_sock(const struct ctx *c, uint8_t pif, in_port_t port,
 		return FLOW_SIDX_NONE;
 	}
 
-	ini = flow_initiate_sa(flow, pif, s_in, port);
+	ini = flow_initiate_sa(flow, pif, s_in, dst, port);
 
 	if (!inany_is_unicast(&ini->eaddr) ||
 	    ini->eport == 0 || ini->oport == 0) {
diff --git a/udp_flow.h b/udp_flow.h
index e289122..4c528e9 100644
--- a/udp_flow.h
+++ b/udp_flow.h
@@ -32,7 +32,8 @@ struct udp_flow {
 };
 
 struct udp_flow *udp_at_sidx(flow_sidx_t sidx);
-flow_sidx_t udp_flow_from_sock(const struct ctx *c, uint8_t pif, in_port_t port,
+flow_sidx_t udp_flow_from_sock(const struct ctx *c, uint8_t pif,
+			       const union inany_addr *dst, in_port_t port,
 			       const union sockaddr_inany *s_in,
 			       const struct timespec *now);
 flow_sidx_t udp_flow_from_tap(const struct ctx *c,
diff --git a/util.h b/util.h
index b1e7e79..cc7d084 100644
--- a/util.h
+++ b/util.h
@@ -371,6 +371,16 @@ static inline int wrap_accept4(int sockfd, struct sockaddr *addr,
 #define accept4(s, addr, addrlen, flags) \
 	wrap_accept4((s), (addr), (addrlen), (flags))
 
+static inline int wrap_getsockname(int sockfd, struct sockaddr *addr,
+/* cppcheck-suppress constParameterPointer */
+				   socklen_t *addrlen)
+{
+	sa_init(addr, addrlen);
+	return getsockname(sockfd, addr, addrlen);
+}
+#define getsockname(s, addr, addrlen) \
+	wrap_getsockname((s), (addr), (addrlen))
+
 #define PASST_MAXDNAME 254 /* 253 (RFC 1035) + 1 (the terminator) */
 void encode_domain_name(char *buf, const char *domain_name);
 

From bbff3653d6412690eee1a079d584a7365d2ed886 Mon Sep 17 00:00:00 2001
From: Stefano Brivio <sbrivio@redhat.com>
Date: Fri, 11 Apr 2025 09:58:31 +0200
Subject: [PATCH 113/144] conf: Split add_dns_resolv() into separate IPv4 and
 IPv6 versions

Not really valuable by itself, but dropping one level of nested blocks
makes the next change more convenient.

No functional changes intended.

Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Reviewed-by: Paul Holzinger <pholzing@redhat.com>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
---
 conf.c | 101 ++++++++++++++++++++++++++++++++++-----------------------
 1 file changed, 60 insertions(+), 41 deletions(-)

diff --git a/conf.c b/conf.c
index 168646f..18ed11c 100644
--- a/conf.c
+++ b/conf.c
@@ -414,6 +414,62 @@ static unsigned add_dns6(struct ctx *c, const struct in6_addr *addr,
 	return 1;
 }
 
+/**
+ * add_dns_resolv4() - Possibly add one IPv4 nameserver from host's resolv.conf
+ * @c:		Execution context
+ * @ns:		Nameserver address
+ * @idx:	Pointer to index of current IPv4 resolver entry, set on return
+ */
+static void add_dns_resolv4(struct ctx *c, struct in_addr *ns, unsigned *idx)
+{
+	if (IN4_IS_ADDR_UNSPECIFIED(&c->ip4.dns_host))
+		c->ip4.dns_host = *ns;
+
+	/* Special handling if guest or container can only access local
+	 * addresses via redirect, or if the host gateway is also a resolver and
+	 * we shadow its address
+	 */
+	if (IN4_IS_ADDR_LOOPBACK(ns) ||
+	    IN4_ARE_ADDR_EQUAL(ns, &c->ip4.map_host_loopback)) {
+		if (IN4_IS_ADDR_UNSPECIFIED(&c->ip4.map_host_loopback))
+			return;
+
+		*ns = c->ip4.map_host_loopback;
+		if (IN4_IS_ADDR_UNSPECIFIED(&c->ip4.dns_match))
+			c->ip4.dns_match = c->ip4.map_host_loopback;
+	}
+
+	*idx += add_dns4(c, ns, *idx);
+}
+
+/**
+ * add_dns_resolv6() - Possibly add one IPv6 nameserver from host's resolv.conf
+ * @c:		Execution context
+ * @ns:		Nameserver address
+ * @idx:	Pointer to index of current IPv6 resolver entry, set on return
+ */
+static void add_dns_resolv6(struct ctx *c, struct in6_addr *ns, unsigned *idx)
+{
+	if (IN6_IS_ADDR_UNSPECIFIED(&c->ip6.dns_host))
+		c->ip6.dns_host = *ns;
+
+	/* Special handling if guest or container can only access local
+	 * addresses via redirect, or if the host gateway is also a resolver and
+	 * we shadow its address
+	 */
+	if (IN6_IS_ADDR_LOOPBACK(ns) ||
+	    IN6_ARE_ADDR_EQUAL(ns, &c->ip6.map_host_loopback)) {
+		if (IN6_IS_ADDR_UNSPECIFIED(&c->ip6.map_host_loopback))
+			return;
+
+		*ns = c->ip6.map_host_loopback;
+		if (IN6_IS_ADDR_UNSPECIFIED(&c->ip6.dns_match))
+			c->ip6.dns_match = c->ip6.map_host_loopback;
+	}
+
+	*idx += add_dns6(c, ns, *idx);
+}
+
 /**
  * add_dns_resolv() - Possibly add ns from host resolv.conf to configuration
  * @c:		Execution context
@@ -430,48 +486,11 @@ static void add_dns_resolv(struct ctx *c, const char *nameserver,
 	struct in6_addr ns6;
 	struct in_addr ns4;
 
-	if (idx4 && inet_pton(AF_INET, nameserver, &ns4)) {
-		if (IN4_IS_ADDR_UNSPECIFIED(&c->ip4.dns_host))
-			c->ip4.dns_host = ns4;
+	if (idx4 && inet_pton(AF_INET, nameserver, &ns4))
+		add_dns_resolv4(c, &ns4, idx4);
 
-		/* Special handling if guest or container can only access local
-		 * addresses via redirect, or if the host gateway is also a
-		 * resolver and we shadow its address
-		 */
-		if (IN4_IS_ADDR_LOOPBACK(&ns4) ||
-		    IN4_ARE_ADDR_EQUAL(&ns4, &c->ip4.map_host_loopback)) {
-			if (IN4_IS_ADDR_UNSPECIFIED(&c->ip4.map_host_loopback))
-				return;
-
-			ns4 = c->ip4.map_host_loopback;
-			if (IN4_IS_ADDR_UNSPECIFIED(&c->ip4.dns_match))
-				c->ip4.dns_match = c->ip4.map_host_loopback;
-		}
-
-		*idx4 += add_dns4(c, &ns4, *idx4);
-	}
-
-	if (idx6 && inet_pton(AF_INET6, nameserver, &ns6)) {
-		if (IN6_IS_ADDR_UNSPECIFIED(&c->ip6.dns_host))
-			c->ip6.dns_host = ns6;
-
-		/* Special handling if guest or container can only access local
-		 * addresses via redirect, or if the host gateway is also a
-		 * resolver and we shadow its address
-		 */
-		if (IN6_IS_ADDR_LOOPBACK(&ns6) ||
-		    IN6_ARE_ADDR_EQUAL(&ns6, &c->ip6.map_host_loopback)) {
-			if (IN6_IS_ADDR_UNSPECIFIED(&c->ip6.map_host_loopback))
-				return;
-
-			ns6 = c->ip6.map_host_loopback;
-
-			if (IN6_IS_ADDR_UNSPECIFIED(&c->ip6.dns_match))
-				c->ip6.dns_match = c->ip6.map_host_loopback;
-		}
-
-		*idx6 += add_dns6(c, &ns6, *idx6);
-	}
+	if (idx6 && inet_pton(AF_INET6, nameserver, &ns6))
+		add_dns_resolv6(c, &ns6, idx6);
 }
 
 /**

From 50249086a967c54ff5b2521038cbe1d27303958c Mon Sep 17 00:00:00 2001
From: Stefano Brivio <sbrivio@redhat.com>
Date: Fri, 11 Apr 2025 10:50:00 +0200
Subject: [PATCH 114/144] conf: Honour --dns-forward for local resolver even
 with --no-map-gw

If the first resolver listed in the host's /etc/resolv.conf is a
loopback address, and --no-map-gw is given, we automatically conclude
that the resolver is not reachable, discard it, and, if it's the only
nameserver listed in /etc/resolv.conf, we'll warn that we:

  Couldn't get any nameserver address

However, this isn't true in a general case: the user might have passed
--dns-forward, and in that case, while we won't map the address of the
default gateway to the host, we're still supposed to map that
particular address. Otherwise, in this common Podman usage:

  pasta --config-net --dns-forward 169.254.1.1 -t none -u none -T none -U none --no-map-gw --netns /run/user/1000/netns/netns-c02a8d8f-6ee3-902e-33c5-317e0f24e0af --map-guest-addr 169.254.1.2

and with a loopback address in /etc/resolv.conf, we'll unexpectedly
refuse to forward DNS queries:

  # nslookup passt.top 169.254.1.1
  ;; connection timed out; no servers could be reached

To fix this, make an exception for --dns-forward: if &c->ip4.dns_match
or &c->ip6.dns_match are set in add_dns_resolv4() / add_dns_resolv6(),
use that address as guest-facing resolver.

We already set 'dns_host' to the address we found in /etc/resolv.conf,
that's correct in this case and it makes us forward queries as
expected.

I'm not changing the man page as the current description of
--dns-forward is already consistent with the new behaviour: there's no
described way in which --no-map-gw should affect it.

Reported-by: Andrew Sayers <andrew-bugs.passt.top@pileofstuff.org>
Link: https://bugs.passt.top/show_bug.cgi?id=111
Suggested-by: Paul Holzinger <pholzing@redhat.com>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Reviewed-by: Paul Holzinger <pholzing@redhat.com>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
---
 conf.c | 30 ++++++++++++++++++++++--------
 1 file changed, 22 insertions(+), 8 deletions(-)

diff --git a/conf.c b/conf.c
index 18ed11c..f942851 100644
--- a/conf.c
+++ b/conf.c
@@ -431,12 +431,19 @@ static void add_dns_resolv4(struct ctx *c, struct in_addr *ns, unsigned *idx)
 	 */
 	if (IN4_IS_ADDR_LOOPBACK(ns) ||
 	    IN4_ARE_ADDR_EQUAL(ns, &c->ip4.map_host_loopback)) {
-		if (IN4_IS_ADDR_UNSPECIFIED(&c->ip4.map_host_loopback))
-			return;
+		if (IN4_IS_ADDR_UNSPECIFIED(&c->ip4.dns_match)) {
+			if (IN4_IS_ADDR_UNSPECIFIED(&c->ip4.map_host_loopback))
+				return;		/* Address unreachable */
 
-		*ns = c->ip4.map_host_loopback;
-		if (IN4_IS_ADDR_UNSPECIFIED(&c->ip4.dns_match))
+			*ns = c->ip4.map_host_loopback;
 			c->ip4.dns_match = c->ip4.map_host_loopback;
+		} else {
+			/* No general host mapping, but requested for DNS
+			 * (--dns-forward and --no-map-gw): advertise resolver
+			 * address from --dns-forward, and map that to loopback
+			 */
+			*ns = c->ip4.dns_match;
+		}
 	}
 
 	*idx += add_dns4(c, ns, *idx);
@@ -459,12 +466,19 @@ static void add_dns_resolv6(struct ctx *c, struct in6_addr *ns, unsigned *idx)
 	 */
 	if (IN6_IS_ADDR_LOOPBACK(ns) ||
 	    IN6_ARE_ADDR_EQUAL(ns, &c->ip6.map_host_loopback)) {
-		if (IN6_IS_ADDR_UNSPECIFIED(&c->ip6.map_host_loopback))
-			return;
+		if (IN6_IS_ADDR_UNSPECIFIED(&c->ip6.dns_match)) {
+			if (IN6_IS_ADDR_UNSPECIFIED(&c->ip6.map_host_loopback))
+				return;		/* Address unreachable */
 
-		*ns = c->ip6.map_host_loopback;
-		if (IN6_IS_ADDR_UNSPECIFIED(&c->ip6.dns_match))
+			*ns = c->ip6.map_host_loopback;
 			c->ip6.dns_match = c->ip6.map_host_loopback;
+		} else {
+			/* No general host mapping, but requested for DNS
+			 * (--dns-forward and --no-map-gw): advertise resolver
+			 * address from --dns-forward, and map that to loopback
+			 */
+			*ns = c->ip6.dns_match;
+		}
 	}
 
 	*idx += add_dns6(c, ns, *idx);

From baf049f8e06b7f0a73dfa7913297679a75aad381 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Tue, 15 Apr 2025 17:16:18 +1000
Subject: [PATCH 115/144] udp: Fix breakage of UDP error handling by PKTINFO
 support

We recently enabled the IP_PKTINFO / IPV6_RECVPKTINFO socket options on our
UDP sockets.  This lets us obtain and properly handle the specific local
address used when we're "listening" with a socket on 0.0.0.0 or ::.

However, the PKTINFO cmsgs this option generates appear on error queue
messages as well as regular datagrams.  udp_sock_recverr() doesn't expect
this and so flags an unrecoverable error when it can't parse the control
message.

Correct this by adding space in udp_sock_recverr()'s control buffer for the
additional PKTINFO data, and scan through all cmsgs for the RECVERR, rather
than only looking at the first one.

Link: https://bugs.passt.top/show_bug.cgi?id=99
Fixes: f4b0dd8b0685 ("udp: Use PKTINFO cmsgs to get destination address for received datagrams")
Reported-by: Stefano Brivio <sbrivio@redhat.com>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 udp.c | 30 +++++++++++++++++-------------
 1 file changed, 17 insertions(+), 13 deletions(-)

diff --git a/udp.c b/udp.c
index 40af7df..f5fb98c 100644
--- a/udp.c
+++ b/udp.c
@@ -155,6 +155,10 @@ __attribute__ ((aligned(32)))
 #endif
 udp_meta[UDP_MAX_FRAMES];
 
+#define PKTINFO_SPACE					\
+	MAX(CMSG_SPACE(sizeof(struct in_pktinfo)),	\
+	    CMSG_SPACE(sizeof(struct in6_pktinfo)))
+
 /**
  * enum udp_iov_idx - Indices for the buffers making up a single UDP frame
  * @UDP_IOV_TAP         tap specific header
@@ -476,10 +480,10 @@ static int udp_sock_recverr(const struct ctx *c, union epoll_ref ref)
 		struct sock_extended_err ee;
 		union sockaddr_inany saddr;
 	};
-	const struct errhdr *eh;
-	const struct cmsghdr *hdr;
-	char buf[CMSG_SPACE(sizeof(struct errhdr))];
+	char buf[PKTINFO_SPACE + CMSG_SPACE(sizeof(struct errhdr))];
 	char data[ICMP6_MAX_DLEN];
+	const struct errhdr *eh;
+	struct cmsghdr *hdr;
 	int s = ref.fd;
 	struct iovec iov = {
 		.iov_base = data,
@@ -507,12 +511,16 @@ static int udp_sock_recverr(const struct ctx *c, union epoll_ref ref)
 		return -1;
 	}
 
-	hdr = CMSG_FIRSTHDR(&mh);
-	if (!((hdr->cmsg_level == IPPROTO_IP &&
-	       hdr->cmsg_type == IP_RECVERR) ||
-	      (hdr->cmsg_level == IPPROTO_IPV6 &&
-	       hdr->cmsg_type == IPV6_RECVERR))) {
-		err("Unexpected cmsg reading error queue");
+	for (hdr = CMSG_FIRSTHDR(&mh); hdr; hdr = CMSG_NXTHDR(&mh, hdr)) {
+		if ((hdr->cmsg_level == IPPROTO_IP &&
+		      hdr->cmsg_type == IP_RECVERR) ||
+		     (hdr->cmsg_level == IPPROTO_IPV6 &&
+		      hdr->cmsg_type == IPV6_RECVERR))
+		    break;
+	}
+
+	if (!hdr) {
+		err("Missing RECVERR cmsg in error queue");
 		return -1;
 	}
 
@@ -587,10 +595,6 @@ static int udp_sock_errs(const struct ctx *c, union epoll_ref ref)
 	return n_err;
 }
 
-#define PKTINFO_SPACE					\
-	MAX(CMSG_SPACE(sizeof(struct in_pktinfo)),	\
-	    CMSG_SPACE(sizeof(struct in6_pktinfo)))
-
 /**
  * udp_peek_addr() - Get source address for next packet
  * @s:		Socket to get information from

From 1bb8145c221a9124ca1671e64b27de173ff2d82d Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Tue, 15 Apr 2025 17:16:19 +1000
Subject: [PATCH 116/144] udp: Be quieter about errors on UDP receive

If we get an error on UDP receive, either in udp_peek_addr() or
udp_sock_recv(), we'll print an error message.  However, this could be
a perfectly routine UDP error triggered by an ICMP, which need not go to
the error log.

This doesn't usually happen, because before receiving we typically clear
the error queue from udp_sock_errs().  However, it's possible an error
could be flagged after udp_sock_errs() but before we receive.  So it's
better to handle this error "silently" (trace level only).  We'll bail out
of the receive, return to the epoll loop, and get an EPOLLERR where we'll
handle and report the error properly.

In particular there's one situation that can trigger this case much more
easily.  If we start a new outbound UDP flow to a local destination with
nothing listening, we'll get a more or less immediate connection refused
error.  So, we'll get that error on the very first receive after the
connect().  That will occur in udp_flow_defer() -> udp_flush_flow() ->
udp_sock_fwd() -> udp_peek_addr() -> recvmsg().  This path doesn't call
udp_sock_errs() first, so isn't (imperfectly) protected the way we are
most of the time.

Fixes: 84ab1305faba ("udp: Polish udp_vu_sock_info() and remove from vu specific code")
Fixes: 69e5393c3722 ("udp: Move some more of sock_handler tasks into sub-functions")
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 udp.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/udp.c b/udp.c
index f5fb98c..154f99b 100644
--- a/udp.c
+++ b/udp.c
@@ -619,8 +619,8 @@ static int udp_peek_addr(int s, union sockaddr_inany *src,
 
 	rc = recvmsg(s, &msg, MSG_PEEK | MSG_DONTWAIT);
 	if (rc < 0) {
-		if (errno != EAGAIN && errno != EWOULDBLOCK)
-			warn_perror("Error peeking at socket address");
+		trace("Error peeking at socket address: %s", strerror_(errno));
+		/* Bail out and let the EPOLLERR handler deal with it */
 		return rc;
 	}
 
@@ -664,7 +664,8 @@ static int udp_sock_recv(const struct ctx *c, int s, struct mmsghdr *mmh, int n)
 
 	n = recvmmsg(s, mmh, n, 0, NULL);
 	if (n < 0) {
-		err_perror("Error receiving datagrams");
+		trace("Error receiving datagrams: %s", strerror_(errno));
+		/* Bail out and let the EPOLLERR handler deal with it */
 		return 0;
 	}
 

From 3f995586b35494b08631081fbf609ff932110849 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Tue, 15 Apr 2025 17:16:20 +1000
Subject: [PATCH 117/144] udp: Pass socket & flow information directly to
 error handling functions

udp_sock_recverr() and udp_sock_errs() take an epoll reference from which
they obtain both the socket fd to receive errors from, and - for flow
specific sockets - the flow and side the socket is associated with.

We have some upcoming cases where we want to clear errors when we're not
directly associated with receiving an epoll event, so it's not natural to
have an epoll reference.  Therefore, make these functions take the socket
and flow from explicit parameters.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 udp.c | 26 +++++++++++++-------------
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/udp.c b/udp.c
index 154f99b..c51ac95 100644
--- a/udp.c
+++ b/udp.c
@@ -467,14 +467,15 @@ static void udp_send_tap_icmp6(const struct ctx *c,
 /**
  * udp_sock_recverr() - Receive and clear an error from a socket
  * @c:		Execution context
- * @ref:	epoll reference
+ * @s:		Socket to receive errors from
+ * @sidx:	Flow and side of @s, or FLOW_SIDX_NONE if unknown
  *
  * Return: 1 if error received and processed, 0 if no more errors in queue, < 0
  *         if there was an error reading the queue
  *
  * #syscalls recvmsg
  */
-static int udp_sock_recverr(const struct ctx *c, union epoll_ref ref)
+static int udp_sock_recverr(const struct ctx *c, int s, flow_sidx_t sidx)
 {
 	struct errhdr {
 		struct sock_extended_err ee;
@@ -484,7 +485,6 @@ static int udp_sock_recverr(const struct ctx *c, union epoll_ref ref)
 	char data[ICMP6_MAX_DLEN];
 	const struct errhdr *eh;
 	struct cmsghdr *hdr;
-	int s = ref.fd;
 	struct iovec iov = {
 		.iov_base = data,
 		.iov_len = sizeof(data)
@@ -525,12 +525,12 @@ static int udp_sock_recverr(const struct ctx *c, union epoll_ref ref)
 	}
 
 	eh = (const struct errhdr *)CMSG_DATA(hdr);
-	if (ref.type == EPOLL_TYPE_UDP) {
-		flow_sidx_t sidx = flow_sidx_opposite(ref.flowside);
-		const struct flowside *toside = flowside_at_sidx(sidx);
+	if (flow_sidx_valid(sidx)) {
+		flow_sidx_t tosidx = flow_sidx_opposite(sidx);
+		const struct flowside *toside = flowside_at_sidx(tosidx);
 		size_t dlen = rc;
 
-		if (pif_is_socket(pif_at_sidx(sidx))) {
+		if (pif_is_socket(pif_at_sidx(tosidx))) {
 			/* XXX Is there any way to propagate ICMPs from socket
 			 * to socket? */
 		} else if (hdr->cmsg_level == IPPROTO_IP) {
@@ -554,21 +554,21 @@ static int udp_sock_recverr(const struct ctx *c, union epoll_ref ref)
 /**
  * udp_sock_errs() - Process errors on a socket
  * @c:		Execution context
- * @ref:	epoll reference
+ * @s:		Socket to receive errors from
+ * @sidx:	Flow and side of @s, or FLOW_SIDX_NONE if unknown
  *
  * Return: Number of errors handled, or < 0 if we have an unrecoverable error
  */
-static int udp_sock_errs(const struct ctx *c, union epoll_ref ref)
+static int udp_sock_errs(const struct ctx *c, int s, flow_sidx_t sidx)
 {
 	unsigned n_err = 0;
 	socklen_t errlen;
-	int s = ref.fd;
 	int rc, err;
 
 	ASSERT(!c->no_udp);
 
 	/* Empty the error queue */
-	while ((rc = udp_sock_recverr(c, ref)) > 0)
+	while ((rc = udp_sock_recverr(c, s, sidx)) > 0)
 		n_err += rc;
 
 	if (rc < 0)
@@ -777,7 +777,7 @@ void udp_listen_sock_handler(const struct ctx *c,
 			     const struct timespec *now)
 {
 	if (events & EPOLLERR) {
-		if (udp_sock_errs(c, ref) < 0) {
+		if (udp_sock_errs(c, ref.fd, FLOW_SIDX_NONE) < 0) {
 			err("UDP: Unrecoverable error on listening socket:"
 			    " (%s port %hu)", pif_name(ref.udp.pif), ref.udp.port);
 			/* FIXME: what now?  close/re-open socket? */
@@ -804,7 +804,7 @@ void udp_sock_handler(const struct ctx *c, union epoll_ref ref,
 	ASSERT(!c->no_udp && uflow);
 
 	if (events & EPOLLERR) {
-		if (udp_sock_errs(c, ref) < 0) {
+		if (udp_sock_errs(c, ref.fd, ref.flowside) < 0) {
 			flow_err(uflow, "Unrecoverable error on flow socket");
 			goto fail;
 		}

From 04984578b00f7507a05544b7a5490b03ab2d5135 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Tue, 15 Apr 2025 17:16:21 +1000
Subject: [PATCH 118/144] udp: Deal with errors as we go in udp_sock_fwd()

When we get an epoll event on a listening socket, we first deal with any
errors (udp_sock_errs()), then with any received packets (udp_sock_fwd()).
However, it's theoretically possible that new errors could get flagged on
the socket after we call udp_sock_errs(), in which case we could get errors
returned in udp_sock_fwd() -> udp_peek_addr() -> recvmsg().

In fact, we do deal with this correctly, although the path is somewhat
non-obvious.  The recvmsg() error will cause us to bail out of
udp_sock_fwd(), but the EPOLLERR event will now be flagged, so we'll come
back here next epoll loop and call udp_sock_errs().

Except.. we call udp_sock_fwd() from udp_flush_flow() as well as from
epoll events.  This is to deal with any packets that arrived between bind()
and connect(), and so might not be associated with the socket's intended
flow.  This expects udp_sock_fwd() to flush _all_ queued datagrams, so that
anything received later must be for the correct flow.

At the moment, udp_sock_fwd() might fail to flush all datagrams if errors
occur.  In particular this can happen in practice for locally reported
errors which occur immediately after connect() (e.g. connecting to a local
port with nothing listening).

We can deal with the problem case, and also make the flow a little more
natural for the common case by having udp_sock_fwd() call udp_sock_errs()
to handle errors as they occur, rather than trying to deal with all errors
in advance.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 udp.c | 45 ++++++++++++++++++++++++++-------------------
 1 file changed, 26 insertions(+), 19 deletions(-)

diff --git a/udp.c b/udp.c
index c51ac95..0bec499 100644
--- a/udp.c
+++ b/udp.c
@@ -601,7 +601,7 @@ static int udp_sock_errs(const struct ctx *c, int s, flow_sidx_t sidx)
  * @src:	Socket address (output)
  * @dst:	(Local) destination address (output)
  *
- * Return: 0 on success, -1 otherwise
+ * Return: 0 if no more packets, 1 on success, -ve error code on error
  */
 static int udp_peek_addr(int s, union sockaddr_inany *src,
 			 union inany_addr *dst)
@@ -619,9 +619,9 @@ static int udp_peek_addr(int s, union sockaddr_inany *src,
 
 	rc = recvmsg(s, &msg, MSG_PEEK | MSG_DONTWAIT);
 	if (rc < 0) {
-		trace("Error peeking at socket address: %s", strerror_(errno));
-		/* Bail out and let the EPOLLERR handler deal with it */
-		return rc;
+		if (errno == EAGAIN || errno == EWOULDBLOCK)
+			return 0;
+		return -errno;
 	}
 
 	hdr = CMSG_FIRSTHDR(&msg);
@@ -644,7 +644,7 @@ static int udp_peek_addr(int s, union sockaddr_inany *src,
 	      sockaddr_ntop(src, sastr, sizeof(sastr)),
 	      inany_ntop(dst, dstr, sizeof(dstr)));
 
-	return 0;
+	return 1;
 }
 
 /**
@@ -740,11 +740,27 @@ void udp_sock_fwd(const struct ctx *c, int s, uint8_t frompif,
 {
 	union sockaddr_inany src;
 	union inany_addr dst;
+	int rc;
 
-	while (udp_peek_addr(s, &src, &dst) == 0) {
-		flow_sidx_t tosidx = udp_flow_from_sock(c, frompif,
-							&dst, port, &src, now);
-		uint8_t topif = pif_at_sidx(tosidx);
+	while ((rc = udp_peek_addr(s, &src, &dst)) != 0) {
+		flow_sidx_t tosidx;
+		uint8_t topif;
+
+		if (rc < 0) {
+			trace("Error peeking at socket address: %s",
+			      strerror_(-rc));
+			/* Clear errors & carry on */
+			if (udp_sock_errs(c, s, FLOW_SIDX_NONE) < 0) {
+				err(
+"UDP: Unrecoverable error on listening socket: (%s port %hu)",
+				    pif_name(frompif), port);
+				/* FIXME: what now?  close/re-open socket? */
+			}
+			continue;
+		}
+
+		tosidx = udp_flow_from_sock(c, frompif, &dst, port, &src, now);
+		topif = pif_at_sidx(tosidx);
 
 		if (pif_is_socket(topif)) {
 			udp_sock_to_sock(c, s, 1, tosidx);
@@ -776,16 +792,7 @@ void udp_listen_sock_handler(const struct ctx *c,
 			     union epoll_ref ref, uint32_t events,
 			     const struct timespec *now)
 {
-	if (events & EPOLLERR) {
-		if (udp_sock_errs(c, ref.fd, FLOW_SIDX_NONE) < 0) {
-			err("UDP: Unrecoverable error on listening socket:"
-			    " (%s port %hu)", pif_name(ref.udp.pif), ref.udp.port);
-			/* FIXME: what now?  close/re-open socket? */
-			return;
-		}
-	}
-
-	if (events & EPOLLIN)
+	if (events & (EPOLLERR | EPOLLIN))
 		udp_sock_fwd(c, ref.fd, ref.udp.pif, ref.udp.port, now);
 }
 

From f107a86cc05c83c5755861b00b85cdf0eb5c9534 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Tue, 15 Apr 2025 17:16:22 +1000
Subject: [PATCH 119/144] udp: Add udp_pktinfo() helper

Currently we open code parsing the control message for IP_PKTINFO in
udp_peek_addr().  We have an upcoming case where we want to parse PKTINFO
in another place, so split this out into a helper function.

While we're there, make the parsing a bit more robust: scan all cmsgs to
look for the one we want, rather than assuming there's only one.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
[sbrivio: udp_pktinfo(): Fix typo in comment and change err() to debug()]
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 udp.c | 52 ++++++++++++++++++++++++++++++++++++----------------
 1 file changed, 36 insertions(+), 16 deletions(-)

diff --git a/udp.c b/udp.c
index 0bec499..97034f6 100644
--- a/udp.c
+++ b/udp.c
@@ -464,6 +464,41 @@ static void udp_send_tap_icmp6(const struct ctx *c,
 	tap_icmp6_send(c, saddr, eaddr, &msg, msglen);
 }
 
+/**
+ * udp_pktinfo() - Retrieve packet destination address from cmsg
+ * @msg:	msghdr into which message has been received
+ * @dst:	(Local) destination address of message in @mh (output)
+ *
+ * Return: 0 on success, -1 if the information was missing (@dst is set to
+ *         inany_any6).
+ */
+static int udp_pktinfo(struct msghdr *msg, union inany_addr *dst)
+{
+	struct cmsghdr *hdr;
+
+	for (hdr = CMSG_FIRSTHDR(msg); hdr; hdr = CMSG_NXTHDR(msg, hdr)) {
+		if (hdr->cmsg_level == IPPROTO_IP &&
+		    hdr->cmsg_type == IP_PKTINFO) {
+			const struct in_pktinfo *i4 = (void *)CMSG_DATA(hdr);
+
+			*dst = inany_from_v4(i4->ipi_addr);
+			return 0;
+		}
+
+		if (hdr->cmsg_level == IPPROTO_IPV6 &&
+			   hdr->cmsg_type == IPV6_PKTINFO) {
+			const struct in6_pktinfo *i6 = (void *)CMSG_DATA(hdr);
+
+			dst->a6 = i6->ipi6_addr;
+			return 0;
+		}
+	}
+
+	debug("Missing PKTINFO cmsg on datagram");
+	*dst = inany_any6;
+	return -1;
+}
+
 /**
  * udp_sock_recverr() - Receive and clear an error from a socket
  * @c:		Execution context
@@ -607,7 +642,6 @@ static int udp_peek_addr(int s, union sockaddr_inany *src,
 			 union inany_addr *dst)
 {
 	char sastr[SOCKADDR_STRLEN], dstr[INANY_ADDRSTRLEN];
-	const struct cmsghdr *hdr;
 	char cmsg[PKTINFO_SPACE];
 	struct msghdr msg = {
 		.msg_name = src,
@@ -624,21 +658,7 @@ static int udp_peek_addr(int s, union sockaddr_inany *src,
 		return -errno;
 	}
 
-	hdr = CMSG_FIRSTHDR(&msg);
-	if (hdr && hdr->cmsg_level == IPPROTO_IP &&
-	    hdr->cmsg_type == IP_PKTINFO) {
-		const struct in_pktinfo *info4 = (void *)CMSG_DATA(hdr);
-
-		*dst = inany_from_v4(info4->ipi_addr);
-	} else if (hdr && hdr->cmsg_level == IPPROTO_IPV6 &&
-		   hdr->cmsg_type == IPV6_PKTINFO) {
-		const struct in6_pktinfo *info6 = (void *)CMSG_DATA(hdr);
-
-		dst->a6 = info6->ipi6_addr;
-	} else {
-		debug("Unexpected cmsg on UDP datagram");
-		*dst = inany_any6;
-	}
+	udp_pktinfo(&msg, dst);
 
 	trace("Peeked UDP datagram: %s -> %s",
 	      sockaddr_ntop(src, sastr, sizeof(sastr)),

From cfc0ee145a5cdd29b6e584171085dac6539b86c0 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Tue, 15 Apr 2025 17:16:23 +1000
Subject: [PATCH 120/144] udp: Minor re-organisation of udp_sock_recverr()

Usually we work with the "exit early" flow style, where we return early
on "error" conditions in functions.  We don't currently do this in
udp_sock_recverr() for the case where we don't have a flow to associate
the error with.

Reorganise to use the "exit early" style, which will make some subsequent
changes less awkward.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 udp.c | 44 +++++++++++++++++++++++++-------------------
 1 file changed, 25 insertions(+), 19 deletions(-)

diff --git a/udp.c b/udp.c
index 97034f6..e8240fe 100644
--- a/udp.c
+++ b/udp.c
@@ -530,6 +530,9 @@ static int udp_sock_recverr(const struct ctx *c, int s, flow_sidx_t sidx)
 		.msg_control = buf,
 		.msg_controllen = sizeof(buf),
 	};
+	const struct flowside *toside;
+	flow_sidx_t tosidx;
+	size_t dlen;
 	ssize_t rc;
 
 	rc = recvmsg(s, &mh, MSG_ERRQUEUE);
@@ -560,29 +563,32 @@ static int udp_sock_recverr(const struct ctx *c, int s, flow_sidx_t sidx)
 	}
 
 	eh = (const struct errhdr *)CMSG_DATA(hdr);
-	if (flow_sidx_valid(sidx)) {
-		flow_sidx_t tosidx = flow_sidx_opposite(sidx);
-		const struct flowside *toside = flowside_at_sidx(tosidx);
-		size_t dlen = rc;
 
-		if (pif_is_socket(pif_at_sidx(tosidx))) {
-			/* XXX Is there any way to propagate ICMPs from socket
-			 * to socket? */
-		} else if (hdr->cmsg_level == IPPROTO_IP) {
-			dlen = MIN(dlen, ICMP4_MAX_DLEN);
-			udp_send_tap_icmp4(c, &eh->ee, toside,
-					   eh->saddr.sa4.sin_addr, data, dlen);
-		} else if (hdr->cmsg_level == IPPROTO_IPV6) {
-			udp_send_tap_icmp6(c, &eh->ee, toside,
-					   &eh->saddr.sa6.sin6_addr, data,
-					   dlen, sidx.flowi);
-		}
-	} else {
-		trace("Ignoring received IP_RECVERR cmsg on listener socket");
-	}
 	debug("%s error on UDP socket %i: %s",
 	      str_ee_origin(&eh->ee), s, strerror_(eh->ee.ee_errno));
 
+	if (!flow_sidx_valid(sidx)) {
+		trace("Ignoring received IP_RECVERR cmsg on listener socket");
+		return 1;
+	}
+
+	tosidx = flow_sidx_opposite(sidx);
+	toside = flowside_at_sidx(tosidx);
+	dlen = rc;
+
+	if (pif_is_socket(pif_at_sidx(tosidx))) {
+		/* XXX Is there any way to propagate ICMPs from socket to
+		 * socket? */
+	} else if (hdr->cmsg_level == IPPROTO_IP) {
+		dlen = MIN(dlen, ICMP4_MAX_DLEN);
+		udp_send_tap_icmp4(c, &eh->ee, toside,
+				   eh->saddr.sa4.sin_addr, data, dlen);
+	} else if (hdr->cmsg_level == IPPROTO_IPV6) {
+		udp_send_tap_icmp6(c, &eh->ee, toside,
+				   &eh->saddr.sa6.sin6_addr, data,
+				   dlen, sidx.flowi);
+	}
+
 	return 1;
 }
 

From 2340bbf867e6c3c3b5ac67345b0e841ab49bbaa5 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Tue, 15 Apr 2025 17:16:24 +1000
Subject: [PATCH 121/144] udp: Propagate errors on listening and brand new
 sockets

udp_sock_recverr() processes errors on UDP sockets and attempts to
propagate them as ICMP packets on the tap interface.  To do this it
currently requires the flow with which the error is associated as a
parameter.  If that's missing it will clear the error condition, but not
propagate it.

That means that we largely ignore errors on "listening" sockets.  It also
means we may discard some errors on flow specific sockets if they occur
very shortly after the socket is created.  In udp_flush_flow() we need to
clear any datagrams received between bind() and connect() which might not
be associated with the "final" flow for the socket.  If we get errors
before that point we'll ignore them in the same way because we don't know
the flow they're associated with in advance.

This can happen in practice if we have errors which occur almost
immediately after connect(), such as ECONNREFUSED when we connect() to a
local address where nothing is listening.

Between the extended error message itself and the PKTINFO information we
do actually have enough information to find the correct flow.  So, rather
than ignoring errors where we don't have a flow "hint", determine the flow
the hard way in udp_sock_recverr().

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
[sbrivio: Change warn() to debug() in udp_sock_recverr()]
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 udp.c | 41 ++++++++++++++++++++++++++++++++---------
 1 file changed, 32 insertions(+), 9 deletions(-)

diff --git a/udp.c b/udp.c
index e8240fe..57769d0 100644
--- a/udp.c
+++ b/udp.c
@@ -504,27 +504,34 @@ static int udp_pktinfo(struct msghdr *msg, union inany_addr *dst)
  * @c:		Execution context
  * @s:		Socket to receive errors from
  * @sidx:	Flow and side of @s, or FLOW_SIDX_NONE if unknown
+ * @pif:	Interface on which the error occurred
+ *              (only used if @sidx == FLOW_SIDX_NONE)
+ * @port:	Local port number of @s (only used if @sidx == FLOW_SIDX_NONE)
  *
  * Return: 1 if error received and processed, 0 if no more errors in queue, < 0
  *         if there was an error reading the queue
  *
  * #syscalls recvmsg
  */
-static int udp_sock_recverr(const struct ctx *c, int s, flow_sidx_t sidx)
+static int udp_sock_recverr(const struct ctx *c, int s, flow_sidx_t sidx,
+			    uint8_t pif, in_port_t port)
 {
 	struct errhdr {
 		struct sock_extended_err ee;
 		union sockaddr_inany saddr;
 	};
 	char buf[PKTINFO_SPACE + CMSG_SPACE(sizeof(struct errhdr))];
+	const struct errhdr *eh = NULL;
 	char data[ICMP6_MAX_DLEN];
-	const struct errhdr *eh;
 	struct cmsghdr *hdr;
 	struct iovec iov = {
 		.iov_base = data,
 		.iov_len = sizeof(data)
 	};
+	union sockaddr_inany src;
 	struct msghdr mh = {
+		.msg_name = &src,
+		.msg_namelen = sizeof(src),
 		.msg_iov = &iov,
 		.msg_iovlen = 1,
 		.msg_control = buf,
@@ -554,7 +561,7 @@ static int udp_sock_recverr(const struct ctx *c, int s, flow_sidx_t sidx)
 		      hdr->cmsg_type == IP_RECVERR) ||
 		     (hdr->cmsg_level == IPPROTO_IPV6 &&
 		      hdr->cmsg_type == IPV6_RECVERR))
-		    break;
+			break;
 	}
 
 	if (!hdr) {
@@ -568,8 +575,19 @@ static int udp_sock_recverr(const struct ctx *c, int s, flow_sidx_t sidx)
 	      str_ee_origin(&eh->ee), s, strerror_(eh->ee.ee_errno));
 
 	if (!flow_sidx_valid(sidx)) {
-		trace("Ignoring received IP_RECVERR cmsg on listener socket");
-		return 1;
+		/* No hint from the socket, determine flow from addresses */
+		union inany_addr dst;
+
+		if (udp_pktinfo(&mh, &dst) < 0) {
+			debug("Missing PKTINFO on UDP error");
+			return 1;
+		}
+
+		sidx = flow_lookup_sa(c, IPPROTO_UDP, pif, &src, &dst, port);
+		if (!flow_sidx_valid(sidx)) {
+			debug("Ignoring UDP error without flow");
+			return 1;
+		}
 	}
 
 	tosidx = flow_sidx_opposite(sidx);
@@ -597,10 +615,14 @@ static int udp_sock_recverr(const struct ctx *c, int s, flow_sidx_t sidx)
  * @c:		Execution context
  * @s:		Socket to receive errors from
  * @sidx:	Flow and side of @s, or FLOW_SIDX_NONE if unknown
+ * @pif:	Interface on which the error occurred
+ *              (only used if @sidx == FLOW_SIDX_NONE)
+ * @port:	Local port number of @s (only used if @sidx == FLOW_SIDX_NONE)
  *
  * Return: Number of errors handled, or < 0 if we have an unrecoverable error
  */
-static int udp_sock_errs(const struct ctx *c, int s, flow_sidx_t sidx)
+static int udp_sock_errs(const struct ctx *c, int s, flow_sidx_t sidx,
+			 uint8_t pif, in_port_t port)
 {
 	unsigned n_err = 0;
 	socklen_t errlen;
@@ -609,7 +631,7 @@ static int udp_sock_errs(const struct ctx *c, int s, flow_sidx_t sidx)
 	ASSERT(!c->no_udp);
 
 	/* Empty the error queue */
-	while ((rc = udp_sock_recverr(c, s, sidx)) > 0)
+	while ((rc = udp_sock_recverr(c, s, sidx, pif, port)) > 0)
 		n_err += rc;
 
 	if (rc < 0)
@@ -776,7 +798,8 @@ void udp_sock_fwd(const struct ctx *c, int s, uint8_t frompif,
 			trace("Error peeking at socket address: %s",
 			      strerror_(-rc));
 			/* Clear errors & carry on */
-			if (udp_sock_errs(c, s, FLOW_SIDX_NONE) < 0) {
+			if (udp_sock_errs(c, s, FLOW_SIDX_NONE,
+					  frompif, port) < 0) {
 				err(
 "UDP: Unrecoverable error on listening socket: (%s port %hu)",
 				    pif_name(frompif), port);
@@ -837,7 +860,7 @@ void udp_sock_handler(const struct ctx *c, union epoll_ref ref,
 	ASSERT(!c->no_udp && uflow);
 
 	if (events & EPOLLERR) {
-		if (udp_sock_errs(c, ref.fd, ref.flowside) < 0) {
+		if (udp_sock_errs(c, ref.fd, ref.flowside, PIF_NONE, 0) < 0) {
 			flow_err(uflow, "Unrecoverable error on flow socket");
 			goto fail;
 		}

From 9128f6e8f47d94c761b5fd8c0d0b8308758cbdc5 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Thu, 17 Apr 2025 11:55:40 +1000
Subject: [PATCH 122/144] fwd: Split out helpers for port-independent NAT

Currently the functions fwd_nat_from_*() make some address translations
based on both the IP address and protocol port numbers, and others based
only on the address.  We have some upcoming cases where it's useful to use
the IP-address-only translations separately, so split them out into helper
functions.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 fwd.c | 87 ++++++++++++++++++++++++++++++++++++++++++-----------------
 1 file changed, 62 insertions(+), 25 deletions(-)

diff --git a/fwd.c b/fwd.c
index 2829cd2..5c70e83 100644
--- a/fwd.c
+++ b/fwd.c
@@ -323,6 +323,30 @@ static bool fwd_guest_accessible(const struct ctx *c,
 	return fwd_guest_accessible6(c, &addr->a6);
 }
 
+/**
+ * nat_outbound() - Apply address translation for outbound (TAP to HOST)
+ * @c:		Execution context
+ * @addr:	Input address (as seen on TAP interface)
+ * @translated:	Output address (as seen on HOST interface)
+ *
+ * Only handles translations that depend *only* on the address.  Anything
+ * related to specific ports or flows is handled elsewhere.
+ */
+static void nat_outbound(const struct ctx *c, const union inany_addr *addr,
+			 union inany_addr *translated)
+{
+	if (inany_equals4(addr, &c->ip4.map_host_loopback))
+		*translated = inany_loopback4;
+	else if (inany_equals6(addr, &c->ip6.map_host_loopback))
+		*translated = inany_loopback6;
+	else if (inany_equals4(addr, &c->ip4.map_guest_addr))
+		*translated = inany_from_v4(c->ip4.addr);
+	else if (inany_equals6(addr, &c->ip6.map_guest_addr))
+		translated->a6 = c->ip6.addr;
+	else
+		*translated = *addr;
+}
+
 /**
  * fwd_nat_from_tap() - Determine to forward a flow from the tap interface
  * @c:		Execution context
@@ -342,16 +366,8 @@ uint8_t fwd_nat_from_tap(const struct ctx *c, uint8_t proto,
 	else if (is_dns_flow(proto, ini) &&
 		   inany_equals6(&ini->oaddr, &c->ip6.dns_match))
 		tgt->eaddr.a6 = c->ip6.dns_host;
-	else if (inany_equals4(&ini->oaddr, &c->ip4.map_host_loopback))
-		tgt->eaddr = inany_loopback4;
-	else if (inany_equals6(&ini->oaddr, &c->ip6.map_host_loopback))
-		tgt->eaddr = inany_loopback6;
-	else if (inany_equals4(&ini->oaddr, &c->ip4.map_guest_addr))
-		tgt->eaddr = inany_from_v4(c->ip4.addr);
-	else if (inany_equals6(&ini->oaddr, &c->ip6.map_guest_addr))
-		tgt->eaddr.a6 = c->ip6.addr;
 	else
-		tgt->eaddr = ini->oaddr;
+		nat_outbound(c, &ini->oaddr, &tgt->eaddr);
 
 	tgt->eport = ini->oport;
 
@@ -423,6 +439,42 @@ uint8_t fwd_nat_from_splice(const struct ctx *c, uint8_t proto,
 	return PIF_HOST;
 }
 
+/**
+ * nat_inbound() - Apply address translation for inbound (HOST to TAP)
+ * @c:		Execution context
+ * @addr:	Input address (as seen on HOST interface)
+ * @translated:	Output address (as seen on TAP interface)
+ *
+ * Return: true on success, false if it couldn't translate the address
+ *
+ * Only handles translations that depend *only* on the address.  Anything
+ * related to specific ports or flows is handled elsewhere.
+ */
+static bool nat_inbound(const struct ctx *c, const union inany_addr *addr,
+			 union inany_addr *translated)
+{
+	if (!IN4_IS_ADDR_UNSPECIFIED(&c->ip4.map_host_loopback) &&
+	    inany_equals4(addr, &in4addr_loopback)) {
+		/* Specifically 127.0.0.1, not 127.0.0.0/8 */
+		*translated = inany_from_v4(c->ip4.map_host_loopback);
+	} else if (!IN6_IS_ADDR_UNSPECIFIED(&c->ip6.map_host_loopback) &&
+		   inany_equals6(addr, &in6addr_loopback)) {
+		translated->a6 = c->ip6.map_host_loopback;
+	} else if (!IN4_IS_ADDR_UNSPECIFIED(&c->ip4.map_guest_addr) &&
+		   inany_equals4(addr, &c->ip4.addr)) {
+		*translated = inany_from_v4(c->ip4.map_guest_addr);
+	} else if (!IN6_IS_ADDR_UNSPECIFIED(&c->ip6.map_guest_addr) &&
+		   inany_equals6(addr, &c->ip6.addr)) {
+		translated->a6 = c->ip6.map_guest_addr;
+	} else if (fwd_guest_accessible(c, addr)) {
+		*translated = *addr;
+	} else {
+		return false;
+	}
+
+	return true;
+}
+
 /**
  * fwd_nat_from_host() - Determine to forward a flow from the host interface
  * @c:		Execution context
@@ -479,20 +531,7 @@ uint8_t fwd_nat_from_host(const struct ctx *c, uint8_t proto,
 		return PIF_SPLICE;
 	}
 
-	if (!IN4_IS_ADDR_UNSPECIFIED(&c->ip4.map_host_loopback) &&
-	    inany_equals4(&ini->eaddr, &in4addr_loopback)) {
-		/* Specifically 127.0.0.1, not 127.0.0.0/8 */
-		tgt->oaddr = inany_from_v4(c->ip4.map_host_loopback);
-	} else if (!IN6_IS_ADDR_UNSPECIFIED(&c->ip6.map_host_loopback) &&
-		   inany_equals6(&ini->eaddr, &in6addr_loopback)) {
-		tgt->oaddr.a6 = c->ip6.map_host_loopback;
-	} else if (!IN4_IS_ADDR_UNSPECIFIED(&c->ip4.map_guest_addr) &&
-		   inany_equals4(&ini->eaddr, &c->ip4.addr)) {
-		tgt->oaddr = inany_from_v4(c->ip4.map_guest_addr);
-	} else if (!IN6_IS_ADDR_UNSPECIFIED(&c->ip6.map_guest_addr) &&
-		   inany_equals6(&ini->eaddr, &c->ip6.addr)) {
-		tgt->oaddr.a6 = c->ip6.map_guest_addr;
-	} else if (!fwd_guest_accessible(c, &ini->eaddr)) {
+	if (!nat_inbound(c, &ini->eaddr, &tgt->oaddr)) {
 		if (inany_v4(&ini->eaddr)) {
 			if (IN4_IS_ADDR_UNSPECIFIED(&c->ip4.our_tap_addr))
 				/* No source address we can use */
@@ -501,8 +540,6 @@ uint8_t fwd_nat_from_host(const struct ctx *c, uint8_t proto,
 		} else {
 			tgt->oaddr.a6 = c->ip6.our_tap_ll;
 		}
-	} else {
-		tgt->oaddr = ini->eaddr;
 	}
 	tgt->oport = ini->eport;
 

From 4668e9137806b551f6ee44609064cc40243c2b6b Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Thu, 17 Apr 2025 11:55:41 +1000
Subject: [PATCH 123/144] treewide: Improve robustness against sockaddrs of
 unexpected family

inany_from_sockaddr() expects a socket address of family AF_INET or
AF_INET6 and ASSERT()s if it gets anything else.  In many of the callers we
can handle an unexpected family more gracefully, though, e.g. by failing
a single flow rather than killing passt.

Change inany_from_sockaddr() to return an error instead of ASSERT()ing,
and handle those errors in the callers.  Improve the reporting of any such
errors while we're at it.

With this greater robustness, allow inany_from_sockaddr() to take a void *
rather than specifically a union sockaddr_inany *.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 flow.c     | 16 ++++++++++++++--
 inany.h    | 30 ++++++++++++++++++------------
 tcp.c      | 10 ++++------
 udp_flow.c |  6 +++---
 4 files changed, 39 insertions(+), 23 deletions(-)

diff --git a/flow.c b/flow.c
index 3c81cb4..447c021 100644
--- a/flow.c
+++ b/flow.c
@@ -408,7 +408,12 @@ struct flowside *flow_initiate_sa(union flow *flow, uint8_t pif,
 {
 	struct flowside *ini = &flow->f.side[INISIDE];
 
-	inany_from_sockaddr(&ini->eaddr, &ini->eport, ssa);
+	if (inany_from_sockaddr(&ini->eaddr, &ini->eport, ssa) < 0) {
+		char str[SOCKADDR_STRLEN];
+
+		ASSERT_WITH_MSG(0, "Bad socket address %s",
+				sockaddr_ntop(ssa, str, sizeof(str)));
+	}
 	if (daddr)
 		ini->oaddr = *daddr;
 	else if (inany_v4(&ini->eaddr))
@@ -768,7 +773,14 @@ flow_sidx_t flow_lookup_sa(const struct ctx *c, uint8_t proto, uint8_t pif,
 		.oport = oport,
 	};
 
-	inany_from_sockaddr(&side.eaddr, &side.eport, esa);
+	if (inany_from_sockaddr(&side.eaddr, &side.eport, esa) < 0) {
+		char str[SOCKADDR_STRLEN];
+
+		warn("Flow lookup on bad socket address %s",
+		     sockaddr_ntop(esa, str, sizeof(str)));
+		return FLOW_SIDX_NONE;
+	}
+
 	if (oaddr)
 		side.oaddr = *oaddr;
 	else if (inany_v4(&side.eaddr))
diff --git a/inany.h b/inany.h
index 1c247e1..7ca5cbd 100644
--- a/inany.h
+++ b/inany.h
@@ -237,24 +237,30 @@ static inline void inany_from_af(union inany_addr *aa,
 }
 
 /** inany_from_sockaddr - Extract IPv[46] address and port number from sockaddr
- * @aa:		Pointer to store IPv[46] address
+ * @dst:	Pointer to store IPv[46] address (output)
  * @port:	Pointer to store port number, host order
- * @addr:	AF_INET or AF_INET6 socket address
+ * @addr:	Socket address
+ *
+ * Return: 0 on success, -1 on error (bad address family)
  */
-static inline void inany_from_sockaddr(union inany_addr *aa, in_port_t *port,
-				       const union sockaddr_inany *sa)
+static inline int inany_from_sockaddr(union inany_addr *dst, in_port_t *port,
+				      const void *addr)
 {
+	const union sockaddr_inany *sa = (const union sockaddr_inany *)addr;
+
 	if (sa->sa_family == AF_INET6) {
-		inany_from_af(aa, AF_INET6, &sa->sa6.sin6_addr);
+		inany_from_af(dst, AF_INET6, &sa->sa6.sin6_addr);
 		*port = ntohs(sa->sa6.sin6_port);
-	} else if (sa->sa_family == AF_INET) {
-		inany_from_af(aa, AF_INET, &sa->sa4.sin_addr);
-		*port = ntohs(sa->sa4.sin_port);
-	} else {
-		/* Not valid to call with other address families */
-		ASSERT_WITH_MSG(0, "Unexpected sockaddr family: %u",
-				sa->sa_family);
+		return 0;
 	}
+
+	if (sa->sa_family == AF_INET) {
+		inany_from_af(dst, AF_INET, &sa->sa4.sin_addr);
+		*port = ntohs(sa->sa4.sin_port);
+		return 0;
+	}
+
+	return -1;
 }
 
 /** inany_siphash_feed- Fold IPv[46] address into an in-progress siphash
diff --git a/tcp.c b/tcp.c
index 9c6bc52..0ac298a 100644
--- a/tcp.c
+++ b/tcp.c
@@ -1546,9 +1546,8 @@ static void tcp_conn_from_tap(const struct ctx *c, sa_family_t af,
 
 	if (c->mode == MODE_VU) { /* To rebind to same oport after migration */
 		sl = sizeof(sa);
-		if (!getsockname(s, &sa.sa, &sl))
-			inany_from_sockaddr(&tgt->oaddr, &tgt->oport, &sa);
-		else
+		if (getsockname(s, &sa.sa, &sl) ||
+		    inany_from_sockaddr(&tgt->oaddr, &tgt->oport, &sa) < 0)
 			err_perror("Can't get local address for socket %i", s);
 	}
 
@@ -2204,9 +2203,8 @@ void tcp_listen_handler(const struct ctx *c, union epoll_ref ref,
 			       NULL, ref.tcp_listen.port);
 
 	if (c->mode == MODE_VU) { /* Rebind to same address after migration */
-		if (!getsockname(s, &sa.sa, &sl))
-			inany_from_sockaddr(&ini->oaddr, &ini->oport, &sa);
-		else
+		if (getsockname(s, &sa.sa, &sl) ||
+		    inany_from_sockaddr(&ini->oaddr, &ini->oport, &sa) < 0)
 			err_perror("Can't get local address for socket %i", s);
 	}
 
diff --git a/udp_flow.c b/udp_flow.c
index ef2cbb0..fea1cf3 100644
--- a/udp_flow.c
+++ b/udp_flow.c
@@ -158,12 +158,12 @@ static flow_sidx_t udp_flow_new(const struct ctx *c, union flow *flow,
 		socklen_t sl = sizeof(sa);
 		in_port_t port;
 
-		if (getsockname(uflow->s[TGTSIDE], &sa.sa, &sl) < 0) {
+		if (getsockname(uflow->s[TGTSIDE], &sa.sa, &sl) < 0 ||
+		    inany_from_sockaddr(&uflow->f.side[TGTSIDE].oaddr,
+					&port, &sa) < 0) {
 			flow_perror(uflow, "Unable to determine local address");
 			goto cancel;
 		}
-		inany_from_sockaddr(&uflow->f.side[TGTSIDE].oaddr,
-				    &port, &sa);
 		if (port != tgt->oport) {
 			flow_err(uflow, "Unexpected local port");
 			goto cancel;

From 08e617ec2ba916d8250a41d3ac68183124a6ec3e Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Thu, 17 Apr 2025 11:55:42 +1000
Subject: [PATCH 124/144] udp: Rework offender address handling in
 udp_sock_recverr()

Make a number of changes to udp_sock_recverr() to improve the robustness
of how we handle addresses.

 * Get the "offender" address (source of the ICMP packet) using the
   SO_EE_OFFENDER() macro, reducing assumptions about structure layout.
 * Parse the offender sockaddr using inany_from_sockaddr()
 * Check explicitly that the source and destination pifs are what we
   expect.  Previously we checked something that was probably equivalent
   in practice, but isn't strictly speaking what we require for the rest
   of the code.
 * Verify that for an ICMPv4 error we also have an IPv4 source/offender
   and destination/endpoint address
 * Verify that for an ICMPv6 error we have an IPv6 endpoint
 * Improve debug reporting of any failures

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 udp.c | 69 +++++++++++++++++++++++++++++++++++++++++------------------
 1 file changed, 48 insertions(+), 21 deletions(-)

diff --git a/udp.c b/udp.c
index 57769d0..d09b3eb 100644
--- a/udp.c
+++ b/udp.c
@@ -159,6 +159,12 @@ udp_meta[UDP_MAX_FRAMES];
 	MAX(CMSG_SPACE(sizeof(struct in_pktinfo)),	\
 	    CMSG_SPACE(sizeof(struct in6_pktinfo)))
 
+#define RECVERR_SPACE							\
+	MAX(CMSG_SPACE(sizeof(struct sock_extended_err) +		\
+		       sizeof(struct sockaddr_in)),			\
+	    CMSG_SPACE(sizeof(struct sock_extended_err) +		\
+		       sizeof(struct sockaddr_in6)))
+
 /**
  * enum udp_iov_idx - Indices for the buffers making up a single UDP frame
  * @UDP_IOV_TAP         tap specific header
@@ -516,12 +522,8 @@ static int udp_pktinfo(struct msghdr *msg, union inany_addr *dst)
 static int udp_sock_recverr(const struct ctx *c, int s, flow_sidx_t sidx,
 			    uint8_t pif, in_port_t port)
 {
-	struct errhdr {
-		struct sock_extended_err ee;
-		union sockaddr_inany saddr;
-	};
-	char buf[PKTINFO_SPACE + CMSG_SPACE(sizeof(struct errhdr))];
-	const struct errhdr *eh = NULL;
+	char buf[PKTINFO_SPACE + RECVERR_SPACE];
+	const struct sock_extended_err *ee;
 	char data[ICMP6_MAX_DLEN];
 	struct cmsghdr *hdr;
 	struct iovec iov = {
@@ -538,7 +540,13 @@ static int udp_sock_recverr(const struct ctx *c, int s, flow_sidx_t sidx,
 		.msg_controllen = sizeof(buf),
 	};
 	const struct flowside *toside;
-	flow_sidx_t tosidx;
+	char astr[INANY_ADDRSTRLEN];
+	char sastr[SOCKADDR_STRLEN];
+	union inany_addr offender;
+	const struct in_addr *o4;
+	in_port_t offender_port;
+	struct udp_flow *uflow;
+	uint8_t topif;
 	size_t dlen;
 	ssize_t rc;
 
@@ -569,10 +577,10 @@ static int udp_sock_recverr(const struct ctx *c, int s, flow_sidx_t sidx,
 		return -1;
 	}
 
-	eh = (const struct errhdr *)CMSG_DATA(hdr);
+	ee = (const struct sock_extended_err *)CMSG_DATA(hdr);
 
 	debug("%s error on UDP socket %i: %s",
-	      str_ee_origin(&eh->ee), s, strerror_(eh->ee.ee_errno));
+	      str_ee_origin(ee), s, strerror_(ee->ee_errno));
 
 	if (!flow_sidx_valid(sidx)) {
 		/* No hint from the socket, determine flow from addresses */
@@ -588,25 +596,44 @@ static int udp_sock_recverr(const struct ctx *c, int s, flow_sidx_t sidx,
 			debug("Ignoring UDP error without flow");
 			return 1;
 		}
+	} else {
+		pif = pif_at_sidx(sidx);
 	}
 
-	tosidx = flow_sidx_opposite(sidx);
-	toside = flowside_at_sidx(tosidx);
+	uflow = udp_at_sidx(sidx);
+	ASSERT(uflow);
+	toside = &uflow->f.side[!sidx.sidei];
+	topif = uflow->f.pif[!sidx.sidei];
 	dlen = rc;
 
-	if (pif_is_socket(pif_at_sidx(tosidx))) {
-		/* XXX Is there any way to propagate ICMPs from socket to
-		 * socket? */
-	} else if (hdr->cmsg_level == IPPROTO_IP) {
+	if (inany_from_sockaddr(&offender, &offender_port,
+				SO_EE_OFFENDER(ee)) < 0)
+		goto fail;
+
+	if (pif != PIF_HOST || topif != PIF_TAP)
+		/* XXX Can we support any other cases? */
+		goto fail;
+
+	if (hdr->cmsg_level == IPPROTO_IP &&
+	    (o4 = inany_v4(&offender)) && inany_v4(&toside->eaddr)) {
 		dlen = MIN(dlen, ICMP4_MAX_DLEN);
-		udp_send_tap_icmp4(c, &eh->ee, toside,
-				   eh->saddr.sa4.sin_addr, data, dlen);
-	} else if (hdr->cmsg_level == IPPROTO_IPV6) {
-		udp_send_tap_icmp6(c, &eh->ee, toside,
-				   &eh->saddr.sa6.sin6_addr, data,
-				   dlen, sidx.flowi);
+		udp_send_tap_icmp4(c, ee, toside, *o4, data, dlen);
+		return 1;
 	}
 
+	if (hdr->cmsg_level == IPPROTO_IPV6 && !inany_v4(&toside->eaddr)) {
+		udp_send_tap_icmp6(c, ee, toside, &offender.a6, data, dlen,
+				   FLOW_IDX(uflow));
+		return 1;
+	}
+
+fail:
+	flow_dbg(uflow, "Can't propagate %s error from %s %s to %s %s",
+		 str_ee_origin(ee),
+		 pif_name(pif),
+		 sockaddr_ntop(SO_EE_OFFENDER(ee), sastr, sizeof(sastr)),
+		 pif_name(topif),
+		 inany_ntop(&toside->eaddr, astr, sizeof(astr)));
 	return 1;
 }
 

From 436afc30447c6f0ce516f2b38c769833114bb5f8 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Thu, 17 Apr 2025 11:55:43 +1000
Subject: [PATCH 125/144] udp: Translate offender addresses for ICMP messages

We've recently added support for propagating ICMP errors related to a UDP
flow from the host to the guest, by handling the extended UDP error on the
socket and synthesizing a suitable ICMP on the tap interface.

Currently we create that ICMP with a source address of the "offender" from
the extended error information - the source of the ICMP error received on
the host.  However, we don't translate this address for cases where we NAT
between host and guest.  This means (amongst other things) that we won't
get a "Connection refused" error as expected if we send data from the
guest to the --map-host-loopback address.  The error comes from 127.0.0.1
on the host, which doesn't make sense on the tap interface and will be
discarded by the guest.

Because ICMP errors can be sent by an intermediate host, not just by the
endpoints of the flow, we can't handle this translation purely with the
information in the flow table entry.  We need to explicitly translate this
address by our NAT rules, which we can do with the nat_inbound() helper.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 fwd.c |  4 ++--
 fwd.h |  3 +++
 udp.c | 18 ++++++++++++++----
 3 files changed, 19 insertions(+), 6 deletions(-)

diff --git a/fwd.c b/fwd.c
index 5c70e83..b73c2c8 100644
--- a/fwd.c
+++ b/fwd.c
@@ -450,8 +450,8 @@ uint8_t fwd_nat_from_splice(const struct ctx *c, uint8_t proto,
  * Only handles translations that depend *only* on the address.  Anything
  * related to specific ports or flows is handled elsewhere.
  */
-static bool nat_inbound(const struct ctx *c, const union inany_addr *addr,
-			 union inany_addr *translated)
+bool nat_inbound(const struct ctx *c, const union inany_addr *addr,
+		 union inany_addr *translated)
 {
 	if (!IN4_IS_ADDR_UNSPECIFIED(&c->ip4.map_host_loopback) &&
 	    inany_equals4(addr, &in4addr_loopback)) {
diff --git a/fwd.h b/fwd.h
index 3562f3c..0458a3c 100644
--- a/fwd.h
+++ b/fwd.h
@@ -7,6 +7,7 @@
 #ifndef FWD_H
 #define FWD_H
 
+union inany_addr;
 struct flowside;
 
 /* Number of ports for both TCP and UDP */
@@ -47,6 +48,8 @@ void fwd_scan_ports_udp(struct fwd_ports *fwd, const struct fwd_ports *rev,
 			const struct fwd_ports *tcp_rev);
 void fwd_scan_ports_init(struct ctx *c);
 
+bool nat_inbound(const struct ctx *c, const union inany_addr *addr,
+		 union inany_addr *translated);
 uint8_t fwd_nat_from_tap(const struct ctx *c, uint8_t proto,
 			 const struct flowside *ini, struct flowside *tgt);
 uint8_t fwd_nat_from_splice(const struct ctx *c, uint8_t proto,
diff --git a/udp.c b/udp.c
index d09b3eb..f5a5cd1 100644
--- a/udp.c
+++ b/udp.c
@@ -539,10 +539,10 @@ static int udp_sock_recverr(const struct ctx *c, int s, flow_sidx_t sidx,
 		.msg_control = buf,
 		.msg_controllen = sizeof(buf),
 	};
-	const struct flowside *toside;
+	const struct flowside *fromside, *toside;
+	union inany_addr offender, otap;
 	char astr[INANY_ADDRSTRLEN];
 	char sastr[SOCKADDR_STRLEN];
-	union inany_addr offender;
 	const struct in_addr *o4;
 	in_port_t offender_port;
 	struct udp_flow *uflow;
@@ -602,6 +602,7 @@ static int udp_sock_recverr(const struct ctx *c, int s, flow_sidx_t sidx,
 
 	uflow = udp_at_sidx(sidx);
 	ASSERT(uflow);
+	fromside = &uflow->f.side[sidx.sidei];
 	toside = &uflow->f.side[!sidx.sidei];
 	topif = uflow->f.pif[!sidx.sidei];
 	dlen = rc;
@@ -614,15 +615,24 @@ static int udp_sock_recverr(const struct ctx *c, int s, flow_sidx_t sidx,
 		/* XXX Can we support any other cases? */
 		goto fail;
 
+	/* If the offender *is* the endpoint, make sure our translation is
+	 * consistent with the flow's translation.  This matters if the flow
+	 * endpoint has a port specific translation (like --dns-match).
+	 */
+	if (inany_equals(&offender, &fromside->eaddr))
+		otap = toside->oaddr;
+	else if (!nat_inbound(c, &offender, &otap))
+		goto fail;
+
 	if (hdr->cmsg_level == IPPROTO_IP &&
-	    (o4 = inany_v4(&offender)) && inany_v4(&toside->eaddr)) {
+	    (o4 = inany_v4(&otap)) && inany_v4(&toside->eaddr)) {
 		dlen = MIN(dlen, ICMP4_MAX_DLEN);
 		udp_send_tap_icmp4(c, ee, toside, *o4, data, dlen);
 		return 1;
 	}
 
 	if (hdr->cmsg_level == IPPROTO_IPV6 && !inany_v4(&toside->eaddr)) {
-		udp_send_tap_icmp6(c, ee, toside, &offender.a6, data, dlen,
+		udp_send_tap_icmp6(c, ee, toside, &otap.a6, data, dlen,
 				   FLOW_IDX(uflow));
 		return 1;
 	}

From aa1cc8922867b8f7c17742f8da3b9fcc6291bbeb Mon Sep 17 00:00:00 2001
From: Alyssa Ross <hi@alyssa.is>
Date: Sat, 26 Apr 2025 10:44:25 +0200
Subject: [PATCH 126/144] conf: allow --fd 0
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

inetd-style socket passing traditionally starts a service with a
connected socket on file descriptors 0 and 1.  passt disallowing
obtaining its socket from either of these descriptors made it
difficult to use with super-servers providing this interface — in my
case I wanted to use passt with s6-ipcserver[1].  Since (as far as I
can tell) passt does not use standard input for anything else (unlike
standard output), it should be safe to relax the restrictions on --fd
to allow setting it to 0, enabling this use case.

Link: https://skarnet.org/software/s6/s6-ipcserver.html [1]
Signed-off-by: Alyssa Ross <hi@alyssa.is>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 conf.c | 3 ++-
 util.c | 4 +++-
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/conf.c b/conf.c
index f942851..a6d7e22 100644
--- a/conf.c
+++ b/conf.c
@@ -1717,7 +1717,8 @@ void conf(struct ctx *c, int argc, char **argv)
 			fd_tap_opt = strtol(optarg, NULL, 0);
 
 			if (errno ||
-			    fd_tap_opt <= STDERR_FILENO || fd_tap_opt > INT_MAX)
+			    (fd_tap_opt != STDIN_FILENO && fd_tap_opt <= STDERR_FILENO) ||
+			    fd_tap_opt > INT_MAX)
 				die("Invalid --fd: %s", optarg);
 
 			c->fd_tap = fd_tap_opt;
diff --git a/util.c b/util.c
index 62a6003..f5497d4 100644
--- a/util.c
+++ b/util.c
@@ -875,7 +875,9 @@ void close_open_files(int argc, char **argv)
 			errno = 0;
 			fd = strtol(optarg, NULL, 0);
 
-			if (errno || fd <= STDERR_FILENO || fd > INT_MAX)
+			if (errno ||
+			    (fd != STDIN_FILENO && fd <= STDERR_FILENO) ||
+			    fd > INT_MAX)
 				die("Invalid --fd: %s", optarg);
 		}
 	} while (name != -1);

From ea0a1240df671de221f469327899564ed74b5edd Mon Sep 17 00:00:00 2001
From: Stefano Brivio <sbrivio@redhat.com>
Date: Wed, 30 Apr 2025 16:48:34 +0200
Subject: [PATCH 127/144] passt-repair: Hide bogus gcc warning from -Og
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When building with gcc 13 and -Og, we get:

passt-repair.c: In function ‘main’:
passt-repair.c:161:23: warning: ‘ev’ may be used uninitialized [-Wmaybe-uninitialized]
  161 |                 if (ev->len > NAME_MAX + 1 || ev->name[ev->len - 1] != '\0') {
      |                     ~~^~~~~

but that can't actually happen, because we only exit the preceding
while loop if 'found' is true, and that only happens, in turn, as we
assign 'ev'.

Get rid of the warning by (redundantly) initialising ev to NULL.

Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 passt-repair.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/passt-repair.c b/passt-repair.c
index 256a8c9..ff1c44f 100644
--- a/passt-repair.c
+++ b/passt-repair.c
@@ -113,7 +113,7 @@ int main(int argc, char **argv)
 	if ((sb.st_mode & S_IFMT) == S_IFDIR) {
 		char buf[sizeof(struct inotify_event) + NAME_MAX + 1]
 		   __attribute__ ((aligned(__alignof__(struct inotify_event))));
-		const struct inotify_event *ev;
+		const struct inotify_event *ev = NULL;
 		char path[PATH_MAX + 1];
 		bool found = false;
 		ssize_t n;

From 6a96cd97a5fda26a8f12531a72f6a969e476ad9e Mon Sep 17 00:00:00 2001
From: Stefano Brivio <sbrivio@redhat.com>
Date: Wed, 30 Apr 2025 16:59:13 +0200
Subject: [PATCH 128/144] util: Fix typo, ASSSERTION -> ASSERTION

Fixes: 9153aca15bc1 ("util: Add abort_with_msg() and ASSERT_WITH_MSG() helpers")
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
---
 util.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/util.h b/util.h
index cc7d084..5947337 100644
--- a/util.h
+++ b/util.h
@@ -75,7 +75,7 @@ void abort_with_msg(const char *fmt, ...)
 #define ASSERT_WITH_MSG(expr, ...)					\
 	((expr) ? (void)0 : abort_with_msg(__VA_ARGS__))
 #define ASSERT(expr)							\
-	ASSERT_WITH_MSG((expr), "ASSSERTION FAILED in %s (%s:%d): %s",	\
+	ASSERT_WITH_MSG((expr), "ASSERTION FAILED in %s (%s:%d): %s",	\
 			__func__, __FILE__, __LINE__, STRINGIFY(expr))
 
 #ifdef P_tmpdir

From 11be695f5c0a6a7d74e9628e9863e665f59d511f Mon Sep 17 00:00:00 2001
From: Laurent Vivier <lvivier@redhat.com>
Date: Wed, 30 Apr 2025 18:05:25 +0200
Subject: [PATCH 129/144] flow: fix podman issue #25959

While running piHole using podman, traffic can trigger the following
assert:

ASSSERTION FAILED in flow_alloc (flow.c:521): flow->f.state == FLOW_STATE_FREE

Backtrace shows that this happens in flow_defer_handler():

    #4  0x00005610d6f5b481 flow_alloc (passt + 0xb481)
    #5  0x00005610d6f74f86 udp_flow_from_sock (passt + 0x24f86)
    #6  0x00005610d6f737c3 udp_sock_fwd (passt + 0x237c3)
    #7  0x00005610d6f74c07 udp_flush_flow (passt + 0x24c07)
    #8  0x00005610d6f752c2 udp_flow_defer (passt + 0x252c2)
    #9  0x00005610d6f5bce1 flow_defer_handler (passt + 0xbce1)

We are trying to allocate a new flow inside the loop freeing them.

Inside the loop free_head points to the first free flow entry in the
current cluster. But if we allocate a new entry during the loop,
free_head is not updated and can point now to the entry we have just
allocated.

We can fix the problem by splitting the loop in two parts:
- first part where we can close some of them and allocate some new
  flow entries,
- second part where we free the entries closed in the previous loop
  and we aggregate the free entries to merge consecutive clusters.

Reported-by: Martin Rijntjes <bugs@air-global.nl>
Link: https://github.com/containers/podman/issues/25959
Fixes: 9725e7988837 ("udp_flow: Don't discard packets that arrive between bind() and connect()")
Signed-off-by: Laurent Vivier <lvivier@redhat.com>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 flow.c | 109 ++++++++++++++++++++++++++++++---------------------------
 1 file changed, 58 insertions(+), 51 deletions(-)

diff --git a/flow.c b/flow.c
index 447c021..c5718e3 100644
--- a/flow.c
+++ b/flow.c
@@ -800,6 +800,7 @@ void flow_defer_handler(const struct ctx *c, const struct timespec *now)
 {
 	struct flow_free_cluster *free_head = NULL;
 	unsigned *last_next = &flow_first_free;
+	bool to_free[FLOW_MAX] = { 0 };
 	bool timer = false;
 	union flow *flow;
 
@@ -810,9 +811,44 @@ void flow_defer_handler(const struct ctx *c, const struct timespec *now)
 
 	ASSERT(!flow_new_entry); /* Incomplete flow at end of cycle */
 
-	flow_foreach_slot(flow) {
+	/* Check which flows we might need to close first, but don't free them
+	 * yet as it's not safe to do that in the middle of flow_foreach().
+	 */
+	flow_foreach(flow) {
 		bool closed = false;
 
+		switch (flow->f.type) {
+		case FLOW_TYPE_NONE:
+			ASSERT(false);
+			break;
+		case FLOW_TCP:
+			closed = tcp_flow_defer(&flow->tcp);
+			break;
+		case FLOW_TCP_SPLICE:
+			closed = tcp_splice_flow_defer(&flow->tcp_splice);
+			if (!closed && timer)
+				tcp_splice_timer(c, &flow->tcp_splice);
+			break;
+		case FLOW_PING4:
+		case FLOW_PING6:
+			if (timer)
+				closed = icmp_ping_timer(c, &flow->ping, now);
+			break;
+		case FLOW_UDP:
+			closed = udp_flow_defer(c, &flow->udp, now);
+			if (!closed && timer)
+				closed = udp_flow_timer(c, &flow->udp, now);
+			break;
+		default:
+			/* Assume other flow types don't need any handling */
+			;
+		}
+
+		to_free[FLOW_IDX(flow)] = closed;
+	}
+
+	/* Second step: actually free the flows */
+	flow_foreach_slot(flow) {
 		switch (flow->f.state) {
 		case FLOW_STATE_FREE: {
 			unsigned skip = flow->free.n;
@@ -845,59 +881,30 @@ void flow_defer_handler(const struct ctx *c, const struct timespec *now)
 			break;
 
 		case FLOW_STATE_ACTIVE:
-			/* Nothing to do */
-			break;
+			if (to_free[FLOW_IDX(flow)]) {
+				flow_set_state(&flow->f, FLOW_STATE_FREE);
+				memset(flow, 0, sizeof(*flow));
 
-		default:
-			ASSERT(false);
-		}
-
-		switch (flow->f.type) {
-		case FLOW_TYPE_NONE:
-			ASSERT(false);
-			break;
-		case FLOW_TCP:
-			closed = tcp_flow_defer(&flow->tcp);
-			break;
-		case FLOW_TCP_SPLICE:
-			closed = tcp_splice_flow_defer(&flow->tcp_splice);
-			if (!closed && timer)
-				tcp_splice_timer(c, &flow->tcp_splice);
-			break;
-		case FLOW_PING4:
-		case FLOW_PING6:
-			if (timer)
-				closed = icmp_ping_timer(c, &flow->ping, now);
-			break;
-		case FLOW_UDP:
-			closed = udp_flow_defer(c, &flow->udp, now);
-			if (!closed && timer)
-				closed = udp_flow_timer(c, &flow->udp, now);
-			break;
-		default:
-			/* Assume other flow types don't need any handling */
-			;
-		}
-
-		if (closed) {
-			flow_set_state(&flow->f, FLOW_STATE_FREE);
-			memset(flow, 0, sizeof(*flow));
-
-			if (free_head) {
-				/* Add slot to current free cluster */
-				ASSERT(FLOW_IDX(flow) ==
-				       FLOW_IDX(free_head) + free_head->n);
-				free_head->n++;
-				flow->free.n = flow->free.next = 0;
+				if (free_head) {
+					/* Add slot to current free cluster */
+					ASSERT(FLOW_IDX(flow) ==
+					    FLOW_IDX(free_head) + free_head->n);
+					free_head->n++;
+					flow->free.n = flow->free.next = 0;
+				} else {
+					/* Create new free cluster */
+					free_head = &flow->free;
+					free_head->n = 1;
+					*last_next = FLOW_IDX(flow);
+					last_next = &free_head->next;
+				}
 			} else {
-				/* Create new free cluster */
-				free_head = &flow->free;
-				free_head->n = 1;
-				*last_next = FLOW_IDX(flow);
-				last_next = &free_head->next;
+				free_head = NULL;
 			}
-		} else {
-			free_head = NULL;
+			break;
+
+		default:
+			ASSERT(false);
 		}
 	}
 

From 93394f4ef0966602b2ada8f72beaf75352add7b1 Mon Sep 17 00:00:00 2001
From: Janne Grunau <janne-psst@jannau.net>
Date: Thu, 1 May 2025 11:54:07 +0200
Subject: [PATCH 130/144] selinux: Add getattr to class udp_socket

Commit 59cc89f ("udp, udp_flow: Track our specific address on socket
interfaces") added a getsockname() call in udp_flow_new(). This requires
getattr. Fixes "Flow 0 (UDP flow): Unable to determine local address:
Permission denied" errors in muvm/passt on Fedora Linux 42 with SELinux.

The SELinux audit message is

| type=AVC msg=audit(1746083799.606:235): avc:  denied  { getattr } for
|   pid=2961 comm="passt" laddr=127.0.0.1 lport=49221
|   faddr=127.0.0.53 fport=53
|   scontext=unconfined_u:unconfined_r:passt_t:s0-s0:c0.c1023
|   tcontext=unconfined_u:unconfined_r:passt_t:s0-s0:c0.c1023
|   tclass=udp_socket permissive=0

Fixes: 59cc89f4cc01 ("udp, udp_flow: Track our specific address on socket interfaces")
Link: https://bugzilla.redhat.com/show_bug.cgi?id=2363238
Signed-off-by: Janne Grunau <janne-psst@jannau.net>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 contrib/selinux/passt.te | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/contrib/selinux/passt.te b/contrib/selinux/passt.te
index f8ea672..eb9ce72 100644
--- a/contrib/selinux/passt.te
+++ b/contrib/selinux/passt.te
@@ -49,7 +49,7 @@ require {
 	type proc_net_t;
 	type node_t;
 	class tcp_socket { create accept listen name_bind name_connect getattr ioctl };
-	class udp_socket { create accept listen };
+	class udp_socket { create accept listen getattr };
 	class icmp_socket { bind create name_bind node_bind setopt read write };
 	class sock_file { create unlink write };
 
@@ -133,7 +133,7 @@ allow passt_t node_t:icmp_socket { name_bind node_bind };
 allow passt_t port_t:icmp_socket name_bind;
 
 allow passt_t self:tcp_socket { create getopt setopt connect bind listen accept shutdown read write getattr ioctl };
-allow passt_t self:udp_socket { create getopt setopt connect bind read write };
+allow passt_t self:udp_socket { create getopt setopt connect bind read write getattr };
 allow passt_t self:icmp_socket { bind create setopt read write };
 
 allow passt_t user_tmp_t:dir { add_name write };

From f0021f9e1d4f118f4167149b256346f3dfea9d2b Mon Sep 17 00:00:00 2001
From: Emanuel Valasiadis <emanuel@valasiadis.space>
Date: Fri, 2 May 2025 15:31:39 +0200
Subject: [PATCH 131/144] fwd: fix doc typo

Signed-off-by: Emanuel Valasiadis <emanuel@valasiadis.space>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 fwd.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fwd.c b/fwd.c
index b73c2c8..49aabc3 100644
--- a/fwd.c
+++ b/fwd.c
@@ -440,7 +440,7 @@ uint8_t fwd_nat_from_splice(const struct ctx *c, uint8_t proto,
 }
 
 /**
- * nat_inbound() - Apply address translation for outbound (HOST to TAP)
+ * nat_inbound() - Apply address translation for inbound (HOST to TAP)
  * @c:		Execution context
  * @addr:	Input address (as seen on HOST interface)
  * @translated:	Output address (as seen on TAP interface)

From 587980ca1e9d5645f6738f67ec3f15cc61a7efa3 Mon Sep 17 00:00:00 2001
From: Stefano Brivio <sbrivio@redhat.com>
Date: Fri, 2 May 2025 21:56:30 +0200
Subject: [PATCH 132/144] udp: Actually discard datagrams we can't forward

Given that udp_sock_fwd() now loops on udp_peek_addr() to get endpoint
addresses for datagrams, if we can't forward one of these datagrams,
we need to make sure we actually discard it. Otherwise, with MSG_PEEK,
we won't dequeue it, and we'll loop on it forever.

For example, if we fail to create a socket for a new flow, because,
say, the destination of an inbound packet is multicast, and we can't
bind() to a multicast address, the loop will look like this:

18.0563: Flow 0 (NEW): FREE -> NEW
18.0563: Flow 0 (INI): NEW -> INI
18.0563: Flow 0 (INI): HOST [127.0.0.1]:42487 -> [127.0.0.1]:9997 => ?
18.0563: Flow 0 (TGT): INI -> TGT
18.0563: Flow 0 (TGT): HOST [127.0.0.1]:42487 -> [ff02::c]:9997 => SPLICE [0.0.0.0]:42487 -> [88.198.0.164]:9997
18.0563: Flow 0 (UDP flow): TGT -> TYPED
18.0564: Flow 0 (UDP flow): HOST [127.0.0.1]:42487 -> [ff02::c]:9997 => SPLICE [0.0.0.0]:42487 -> [88.198.0.164]:9997
18.0564: Flow 0 (UDP flow): Couldn't open flow specific socket: Invalid argument
18.0564: Flow 0 (FREE): TYPED -> FREE
18.0564: Flow 0 (FREE): HOST [127.0.0.1]:42487 -> [ff02::c]:9997 => SPLICE [0.0.0.0]:42487 -> [88.198.0.164]:9997
18.0564: Discarding datagram without flow
18.0564: Flow 0 (NEW): FREE -> NEW
18.0564: Flow 0 (INI): NEW -> INI
18.0564: Flow 0 (INI): HOST [127.0.0.1]:42487 -> [127.0.0.1]:9997 => ?
18.0564: Flow 0 (TGT): INI -> TGT
18.0564: Flow 0 (TGT): HOST [127.0.0.1]:42487 -> [ff02::c]:9997 => SPLICE [0.0.0.0]:42487 -> [88.198.0.164]:9997
18.0564: Flow 0 (UDP flow): TGT -> TYPED
18.0564: Flow 0 (UDP flow): HOST [127.0.0.1]:42487 -> [ff02::c]:9997 => SPLICE [0.0.0.0]:42487 -> [88.198.0.164]:9997
18.0564: Flow 0 (UDP flow): Couldn't open flow specific socket: Invalid argument
18.0564: Flow 0 (FREE): TYPED -> FREE
18.0564: Flow 0 (FREE): HOST [127.0.0.1]:42487 -> [ff02::c]:9997 => SPLICE [0.0.0.0]:42487 -> [88.198.0.164]:9997
18.0564: Discarding datagram without flow

and seen from strace:

epoll_wait(3, [{events=EPOLLIN, data=0x1076c00000705}], 8, 1000) = 1
recvmsg(7, {msg_name={sa_family=AF_INET6, sin6_port=htons(55899), sin6_flowinfo=htonl(0), inet_pton(AF_INET6, "fe80::26e8:53ff:fef3:13b6", &sin6_addr), sin6_scope_id=if_nametoindex("wlp4s0")}, msg_namelen=28, msg_iov=NULL, msg_iovlen=0, msg_control=[{cmsg_len=36, cmsg_level=SOL_IPV6, cmsg_type=0x32, cmsg_data="\xff\x02\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x0c\x03\x00\x00\x00"}], msg_controllen=40, msg_flags=MSG_TRUNC}, MSG_PEEK|MSG_DONTWAIT) = 0
socket(AF_INET6, SOCK_DGRAM|SOCK_NONBLOCK, IPPROTO_UDP) = 12
setsockopt(12, SOL_IPV6, IPV6_V6ONLY, [1], 4) = 0
setsockopt(12, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
setsockopt(12, SOL_IPV6, IPV6_RECVERR, [1], 4) = 0
setsockopt(12, SOL_IPV6, IPV6_RECVPKTINFO, [1], 4) = 0
bind(12, {sa_family=AF_INET6, sin6_port=htons(1900), sin6_flowinfo=htonl(0), inet_pton(AF_INET6, "ff02::c", &sin6_addr), sin6_scope_id=0}, 28) = -1 EINVAL (Invalid argument)
close(12)                               = 0
recvmsg(7, {msg_name={sa_family=AF_INET6, sin6_port=htons(55899), sin6_flowinfo=htonl(0), inet_pton(AF_INET6, "fe80::26e8:53ff:fef3:13b6", &sin6_addr), sin6_scope_id=if_nametoindex("wlp4s0")}, msg_namelen=28, msg_iov=NULL, msg_iovlen=0, msg_control=[{cmsg_len=36, cmsg_level=SOL_IPV6, cmsg_type=0x32, cmsg_data="\xff\x02\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x0c\x03\x00\x00\x00"}], msg_controllen=40, msg_flags=MSG_TRUNC}, MSG_PEEK|MSG_DONTWAIT) = 0
socket(AF_INET6, SOCK_DGRAM|SOCK_NONBLOCK, IPPROTO_UDP) = 12
setsockopt(12, SOL_IPV6, IPV6_V6ONLY, [1], 4) = 0
setsockopt(12, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
setsockopt(12, SOL_IPV6, IPV6_RECVERR, [1], 4) = 0
setsockopt(12, SOL_IPV6, IPV6_RECVPKTINFO, [1], 4) = 0
bind(12, {sa_family=AF_INET6, sin6_port=htons(1900), sin6_flowinfo=htonl(0), inet_pton(AF_INET6, "ff02::c", &sin6_addr), sin6_scope_id=0}, 28) = -1 EINVAL (Invalid argument)
close(12)                               = 0

Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
---
 udp.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/udp.c b/udp.c
index f5a5cd1..ca28b37 100644
--- a/udp.c
+++ b/udp.c
@@ -828,6 +828,7 @@ void udp_sock_fwd(const struct ctx *c, int s, uint8_t frompif,
 	int rc;
 
 	while ((rc = udp_peek_addr(s, &src, &dst)) != 0) {
+		bool discard = false;
 		flow_sidx_t tosidx;
 		uint8_t topif;
 
@@ -861,8 +862,17 @@ void udp_sock_fwd(const struct ctx *c, int s, uint8_t frompif,
 			flow_err(uflow,
 				 "No support for forwarding UDP from %s to %s",
 				 pif_name(frompif), pif_name(topif));
+			discard = true;
 		} else {
 			debug("Discarding datagram without flow");
+			discard = true;
+		}
+
+		if (discard) {
+			struct msghdr msg = { 0 };
+
+			if (recvmsg(s, &msg, MSG_DONTWAIT) < 0)
+				debug_perror("Failed to discard datagram");
 		}
 	}
 }

From eea8a76caf85f4bae5f92b695d09b9ddea354b57 Mon Sep 17 00:00:00 2001
From: Laurent Vivier <lvivier@redhat.com>
Date: Wed, 7 May 2025 14:36:34 +0200
Subject: [PATCH 133/144] flow: fix podman issue #26073

While running pasta, we trigger the following assert:

  ASSERTION FAILED in udp_at_sidx (udp_flow.c:35): flow->f.type == FLOW_UDP

in udp_at_sidx() in the following path:

 902 void udp_sock_handler(const struct ctx *c, union epoll_ref ref,
 903                       uint32_t events, const struct timespec *now)
 904 {
 905         struct udp_flow *uflow = udp_at_sidx(ref.flowside);

The invalid sidx is coming from the epoll_ref provided by epoll_wait().

This assertion failure is preceded by the following error:

  Couldn't connect flow socket: Permission denied

It appears that an error happens in udp_flow_sock() and the recently
created fd is not removed from the epoll_ctl() pool:

 71 static int udp_flow_sock(const struct ctx *c,
 72                          struct udp_flow *uflow, unsigned sidei)
 73 {
...
 82         s = flowside_sock_l4(c, EPOLL_TYPE_UDP, pif, side, fref.data);
 83         if (s < 0) {
 84                 flow_dbg_perror(uflow, "Couldn't open flow specific socket");
 85                 return s;
 86         }
 87
 88         if (flowside_connect(c, s, pif, side) < 0) {
 89                 int rc = -errno;
 90                 flow_dbg_perror(uflow, "Couldn't connect flow socket");
 91                 return rc;
 92         }
...

flowside_sock_l4() calls sock_l4_sa() that adds 's' to the epoll_ctl()
pool.

So to cleanly manage the error of flowside_connect() we need to remove
's' from the epoll_ctl() pool using epoll_del().

Link: https://github.com/containers/podman/issues/26073
Signed-off-by: Laurent Vivier <lvivier@redhat.com>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 udp_flow.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/udp_flow.c b/udp_flow.c
index fea1cf3..b3a13b7 100644
--- a/udp_flow.c
+++ b/udp_flow.c
@@ -87,6 +87,10 @@ static int udp_flow_sock(const struct ctx *c,
 
 	if (flowside_connect(c, s, pif, side) < 0) {
 		int rc = -errno;
+
+		if (pif == PIF_HOST)
+			epoll_del(c, s);
+
 		flow_dbg_perror(uflow, "Couldn't connect flow socket");
 		return rc;
 	}

From 92d5d680134455f1a5b51fd8a3e9e64c99ac6d13 Mon Sep 17 00:00:00 2001
From: Laurent Vivier <lvivier@redhat.com>
Date: Tue, 6 May 2025 16:13:25 +0200
Subject: [PATCH 134/144] flow: fix wrong macro name in comments

The macro for the maximum number of flows is named FLOW_MAX, not MAX_FLOW.

Signed-off-by: Laurent Vivier <lvivier@redhat.com>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 flow.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/flow.c b/flow.c
index c5718e3..6a5c8aa 100644
--- a/flow.c
+++ b/flow.c
@@ -81,7 +81,7 @@ static_assert(ARRAY_SIZE(flow_proto) == FLOW_NUM_TYPES,
  *
  * Free cluster list
  *    flow_first_free gives the index of the first (lowest index) free cluster.
- *    Each free cluster has the index of the next free cluster, or MAX_FLOW if
+ *    Each free cluster has the index of the next free cluster, or FLOW_MAX if
  *    it is the last free cluster.  Together these form a linked list of free
  *    clusters, in strictly increasing order of index.
  *

From 8ec134109eb136432a29bdf5a14f8b1fd4e46208 Mon Sep 17 00:00:00 2001
From: Laurent Vivier <lvivier@redhat.com>
Date: Mon, 12 May 2025 18:47:00 +0200
Subject: [PATCH 135/144] flow: close socket fd on error

In eea8a76caf85 ("flow: fix podman issue #26073"), we unregister
the fd from epoll_ctl() in case of error, but we also need to close it.

As flowside_sock_l4() also calls sock_l4_sa() via flowside_sock_splice()
we can do it unconditionally.

Fixes: eea8a76caf85 ("flow: fix podman issue #26073")
Signed-off-by: Laurent Vivier <lvivier@redhat.com>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 udp_flow.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/udp_flow.c b/udp_flow.c
index b3a13b7..4c6b3c2 100644
--- a/udp_flow.c
+++ b/udp_flow.c
@@ -88,8 +88,8 @@ static int udp_flow_sock(const struct ctx *c,
 	if (flowside_connect(c, s, pif, side) < 0) {
 		int rc = -errno;
 
-		if (pif == PIF_HOST)
-			epoll_del(c, s);
+		epoll_del(c, s);
+		close(s);
 
 		flow_dbg_perror(uflow, "Couldn't connect flow socket");
 		return rc;

From 570e7b4454f2f879180ae3ca13dedd759aff5243 Mon Sep 17 00:00:00 2001
From: Laurent Vivier <lvivier@redhat.com>
Date: Tue, 13 May 2025 11:40:59 +0200
Subject: [PATCH 136/144] dhcpv6: fix GCC error
 (unterminated-string-initialization)

The string STR_NOTONLINK is intentionally not NUL-terminated.
Ignore the GCC error using __attribute__((nonstring)).

This error is reported by GCC 15.1.1 on Fedora 42. However,
Clang 20.1.3 does not support __attribute__((nonstring)).
Therefore, NOLINTNEXTLINE(clang-diagnostic-unknown-attributes)
is also added to suppress Clang's unknown attribute warning.

Signed-off-by: Laurent Vivier <lvivier@redhat.com>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 dhcpv6.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/dhcpv6.c b/dhcpv6.c
index 373a988..ba16c66 100644
--- a/dhcpv6.c
+++ b/dhcpv6.c
@@ -144,7 +144,9 @@ struct opt_ia_addr {
 struct opt_status_code {
 	struct opt_hdr hdr;
 	uint16_t code;
-	char status_msg[sizeof(STR_NOTONLINK) - 1];
+	/* "nonstring" is only supported since clang 23 */
+	/* NOLINTNEXTLINE(clang-diagnostic-unknown-attributes) */
+	__attribute__((nonstring)) char status_msg[sizeof(STR_NOTONLINK) - 1];
 } __attribute__((packed));
 
 /**

From a6b9832e495be636bcccf25e0aebdeb564addf06 Mon Sep 17 00:00:00 2001
From: Laurent Vivier <lvivier@redhat.com>
Date: Tue, 13 May 2025 11:41:00 +0200
Subject: [PATCH 137/144] virtio: Fix Clang warning
 (bugprone-sizeof-expression, cert-arr39-c)

In `virtqueue_read_indirect_desc()`, the pointer arithmetic involving
`desc` is intentional. We add the length in bytes (`read_len`)
divided by the size of `struct vring_desc` to `desc`, which is
an array of `struct vring_desc`. This correctly calculates the
offset in terms of the number of `struct vring_desc` elements.

Clang issues the following warning due to this explicit scaling:

virtio.c:238:8: error: suspicious usage of 'sizeof(...)' in pointer
arithmetic; this scaled value will be scaled again by the '+='
operator [bugprone-sizeof-expression,cert-arr39-c,-Werror]
  238 |         desc += read_len / sizeof(struct vring_desc);
      |               ^            ~~~~~~~~~~~~~~~~~~~~~~~~~
virtio.c:238:8: note: '+=' in pointer arithmetic internally scales
with 'sizeof(struct vring_desc)' == 16

This behavior is intended, so the warning can be considered a
false positive in this context. The code correctly advances the
pointer by the desired number of descriptor entries.

Signed-off-by: Laurent Vivier <lvivier@redhat.com>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 virtio.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/virtio.c b/virtio.c
index bc2b89a..f7db007 100644
--- a/virtio.c
+++ b/virtio.c
@@ -235,6 +235,7 @@ static int virtqueue_read_indirect_desc(const struct vu_dev *dev,
 		memcpy(desc, orig_desc, read_len);
 		len -= read_len;
 		addr += read_len;
+		/* NOLINTNEXTLINE(bugprone-sizeof-expression,cert-arr39-c) */
 		desc += read_len / sizeof(struct vring_desc);
 	}
 

From 0f7bf10b0a5542690dc6c75e4b56a6030ca8a663 Mon Sep 17 00:00:00 2001
From: Laurent Vivier <lvivier@redhat.com>
Date: Tue, 13 May 2025 11:41:01 +0200
Subject: [PATCH 138/144] ndp: Fix Clang analyzer warning
 (clang-analyzer-security.PointerSub)

Addresses Clang warning: "Subtraction of two pointers that do not
point into the same array is undefined behavior" for the line:
  `ndp_send(c, dst, &ra, ptr - (unsigned char *)&ra);`

Here, `ptr` is `&ra.var[0]`. The subtraction calculates the offset
of `var[0]` within the `struct ra_options ra`. Since `ptr` points
inside `ra`, this pointer arithmetic is well-defined for
calculating the size of the data to send, even if `ptr` and `&ra`
are not strictly considered part of the same "array" by the analyzer.

Signed-off-by: Laurent Vivier <lvivier@redhat.com>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 ndp.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/ndp.c b/ndp.c
index ded2081..b664034 100644
--- a/ndp.c
+++ b/ndp.c
@@ -328,6 +328,7 @@ static void ndp_ra(const struct ctx *c, const struct in6_addr *dst)
 
 	memcpy(&ra.source_ll.mac, c->our_tap_mac, ETH_ALEN);
 
+	/* NOLINTNEXTLINE(clang-analyzer-security.PointerSub) */
 	ndp_send(c, dst, &ra, ptr - (unsigned char *)&ra);
 }
 

From 2d3d69c5c348d18112596bd3fdeed95689c613c8 Mon Sep 17 00:00:00 2001
From: Laurent Vivier <lvivier@redhat.com>
Date: Tue, 13 May 2025 11:41:02 +0200
Subject: [PATCH 139/144] flow: Fix clang error
 (clang-analyzer-security.PointerSub)

Fixes the following clang-analyzer warning:

flow_table.h:96:25: note: Subtraction of two pointers that do not point into the same array is undefined behavior
   96 |         return (union flow *)f - flowtab;

The `flow_idx()` function is called via `FLOW_IDX()` from
`flow_foreach_slot()`, where `f` is set to `&flowtab[idx].f`.
Therefore, `f` and `flowtab` do point to the same array.

Signed-off-by: Laurent Vivier <lvivier@redhat.com>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 flow_table.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/flow_table.h b/flow_table.h
index 2d5c65c..3f3f4b7 100644
--- a/flow_table.h
+++ b/flow_table.h
@@ -93,6 +93,7 @@ extern union flow flowtab[];
  */
 static inline unsigned flow_idx(const struct flow_common *f)
 {
+	/* NOLINTNEXTLINE(clang-analyzer-security.PointerSub) */
 	return (union flow *)f - flowtab;
 }
 

From 4234ace84cdf989cbcdb96a8165221dc83a11c85 Mon Sep 17 00:00:00 2001
From: Laurent Vivier <lvivier@redhat.com>
Date: Wed, 14 May 2025 15:45:09 +0200
Subject: [PATCH 140/144] test: Display count of skipped tests in status and
 summary

This commit enhances test reporting by tracking and displaying the
number of skipped tests.

The skipped test count is now visible in the tmux status bar during
execution and included in the final test summary log. This provides
a more complete overview of test suite results.

Signed-off-by: Laurent Vivier <lvivier@redhat.com>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 test/lib/term | 7 +++++--
 test/run      | 6 +++---
 2 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/test/lib/term b/test/lib/term
index ed690de..089364c 100755
--- a/test/lib/term
+++ b/test/lib/term
@@ -19,6 +19,7 @@ STATUS_FILE_INDEX=0
 STATUS_COLS=
 STATUS_PASS=0
 STATUS_FAIL=0
+STATUS_SKIPPED=0
 
 PR_RED='\033[1;31m'
 PR_GREEN='\033[1;32m'
@@ -439,19 +440,21 @@ info_layout() {
 # status_test_ok() - Update counter of passed tests, log and display message
 status_test_ok() {
 	STATUS_PASS=$((STATUS_PASS + 1))
-	tmux set status-right "PASS: ${STATUS_PASS} | FAIL: ${STATUS_FAIL} | #(TZ="UTC" date -Iseconds)"
+	tmux set status-right "PASS: ${STATUS_PASS} | FAIL: ${STATUS_FAIL} | SKIPPED: ${STATUS_SKIPPED} | #(TZ="UTC" date -Iseconds)"
 	info_passed
 }
 
 # status_test_fail() - Update counter of failed tests, log and display message
 status_test_fail() {
 	STATUS_FAIL=$((STATUS_FAIL + 1))
-	tmux set status-right "PASS: ${STATUS_PASS} | FAIL: ${STATUS_FAIL} | #(TZ="UTC" date -Iseconds)"
+	tmux set status-right "PASS: ${STATUS_PASS} | FAIL: ${STATUS_FAIL} | SKIPPED: ${STATUS_SKIPPED} | #(TZ="UTC" date -Iseconds)"
 	info_failed
 }
 
 # status_test_fail() - Update counter of failed tests, log and display message
 status_test_skip() {
+	STATUS_SKIPPED=$((STATUS_SKIPPED + 1))
+	tmux set status-right "PASS: ${STATUS_PASS} | FAIL: ${STATUS_FAIL} | SKIPPED: ${STATUS_SKIPPED} | #(TZ="UTC" date -Iseconds)"
 	info_skipped
 }
 
diff --git a/test/run b/test/run
index 4e86f30..f73c311 100755
--- a/test/run
+++ b/test/run
@@ -202,7 +202,7 @@ skip_distro() {
 	perf_finish
 	[ ${CI} -eq 1 ] && video_stop
 
-	log "PASS: ${STATUS_PASS}, FAIL: ${STATUS_FAIL}"
+	log "PASS: ${STATUS_PASS}, FAIL: ${STATUS_FAIL}, SKIPPED: ${STATUS_SKIPPED}"
 
 	pause_continue \
 		"Press any key to keep test session open"	\
@@ -236,7 +236,7 @@ run_selected() {
 	done
 	teardown "${__setup}"
 
-	log "PASS: ${STATUS_PASS}, FAIL: ${STATUS_FAIL}"
+	log "PASS: ${STATUS_PASS}, FAIL: ${STATUS_FAIL}, SKIPPED: ${STATUS_SKIPPED}"
 
 	pause_continue \
 		"Press any key to keep test session open"	\
@@ -307,4 +307,4 @@ fi
 
 tail -n1 ${LOGFILE}
 echo "Log at ${LOGFILE}"
-exit $(tail -n1 ${LOGFILE} | sed -n 's/.*FAIL: \(.*\)$/\1/p')
+exit $(tail -n1 ${LOGFILE} | sed -n 's/.*FAIL: \(.*\),.*$/\1/p')

From 2046976866dd1f983cb0417a1d3ee3f64190805d Mon Sep 17 00:00:00 2001
From: Laurent Vivier <lvivier@redhat.com>
Date: Thu, 15 May 2025 11:41:51 +0200
Subject: [PATCH 141/144] codespell: Correct typos in comments and error
 message

This commit addresses several spelling errors identified by the `codespell`
tool. The corrections apply to:
- Code comments in `fwd.c`, `ip.h`, `isolation.c`, and `log.c`.
- An error message string in `vhost_user.c`.

Specifically, the following misspellings were corrected:
- "adddress" to "address"
- "capabilites" to "capabilities"
- "Musn't" to "Mustn't"
- "calculatd" to "calculated"
- "Invalide" to "Invalid"

Signed-off-by: Laurent Vivier <lvivier@redhat.com>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 fwd.c        | 2 +-
 ip.h         | 2 +-
 isolation.c  | 8 ++++----
 log.c        | 2 +-
 vhost_user.c | 2 +-
 5 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/fwd.c b/fwd.c
index 49aabc3..250cf56 100644
--- a/fwd.c
+++ b/fwd.c
@@ -418,7 +418,7 @@ uint8_t fwd_nat_from_splice(const struct ctx *c, uint8_t proto,
 	else
 		tgt->eaddr = inany_loopback6;
 
-	/* Preserve the specific loopback adddress used, but let the kernel pick
+	/* Preserve the specific loopback address used, but let the kernel pick
 	 * a source port on the target side
 	 */
 	tgt->oaddr = ini->eaddr;
diff --git a/ip.h b/ip.h
index 471c57e..24509d9 100644
--- a/ip.h
+++ b/ip.h
@@ -118,7 +118,7 @@ static inline uint32_t ip6_get_flow_lbl(const struct ipv6hdr *ip6h)
 char *ipv6_l4hdr(const struct pool *p, int idx, size_t offset, uint8_t *proto,
 		 size_t *dlen);
 
-/* IPv6 link-local all-nodes multicast adddress, ff02::1 */
+/* IPv6 link-local all-nodes multicast address, ff02::1 */
 static const struct in6_addr in6addr_ll_all_nodes = {
 	.s6_addr = {
 		0xff, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
diff --git a/isolation.c b/isolation.c
index c944fb3..bbcd23b 100644
--- a/isolation.c
+++ b/isolation.c
@@ -129,7 +129,7 @@ static void drop_caps_ep_except(uint64_t keep)
  * additional layer of protection.  Executing this requires
  * CAP_SETPCAP, which we will have within our userns.
  *
- * Note that dropping capabilites from the bounding set limits
+ * Note that dropping capabilities from the bounding set limits
  * exec()ed processes, but does not remove them from the effective or
  * permitted sets, so it doesn't reduce our own capabilities.
  */
@@ -174,8 +174,8 @@ static void clamp_caps(void)
  * Should:
  *  - drop unneeded capabilities
  *  - close all open files except for standard streams and the one from --fd
- * Musn't:
- *  - remove filesytem access (we need to access files during setup)
+ * Mustn't:
+ *  - remove filesystem access (we need to access files during setup)
  */
 void isolate_initial(int argc, char **argv)
 {
@@ -194,7 +194,7 @@ void isolate_initial(int argc, char **argv)
 	 *
 	 * It's debatable whether it's useful to drop caps when we
 	 * retain SETUID and SYS_ADMIN, but we might as well.  We drop
-	 * further capabilites in isolate_user() and
+	 * further capabilities in isolate_user() and
 	 * isolate_prefork().
 	 */
 	keep = BIT(CAP_NET_BIND_SERVICE) | BIT(CAP_SETUID) | BIT(CAP_SETGID) |
diff --git a/log.c b/log.c
index d40d7ae..5d7d76f 100644
--- a/log.c
+++ b/log.c
@@ -402,7 +402,7 @@ void __setlogmask(int mask)
  * logfile_init() - Open log file and write header with PID, version, path
  * @name:	Identifier for header: passt or pasta
  * @path:	Path to log file
- * @size:	Maximum size of log file: log_cut_size is calculatd here
+ * @size:	Maximum size of log file: log_cut_size is calculated here
  */
 void logfile_init(const char *name, const char *path, size_t size)
 {
diff --git a/vhost_user.c b/vhost_user.c
index 105f77a..ca36763 100644
--- a/vhost_user.c
+++ b/vhost_user.c
@@ -1021,7 +1021,7 @@ static bool vu_set_device_state_fd_exec(struct vu_dev *vdev,
 
 	if (direction != VHOST_USER_TRANSFER_STATE_DIRECTION_SAVE &&
 	    direction != VHOST_USER_TRANSFER_STATE_DIRECTION_LOAD)
-		die("Invalide device_state_fd direction: %d", direction);
+		die("Invalid device_state_fd direction: %d", direction);
 
 	migrate_request(vdev->context, msg->fds[0],
 			direction == VHOST_USER_TRANSFER_STATE_DIRECTION_LOAD);

From 2fd0944f21d6b9fce53c328acf1faaeb46b98528 Mon Sep 17 00:00:00 2001
From: Laurent Vivier <lvivier@redhat.com>
Date: Fri, 16 May 2025 14:42:26 +0200
Subject: [PATCH 142/144] vhost_user: Correct and align function comment
 headers

This commit cleans up function comment headers in vhost_user.c to ensure
accuracy and consistency with the code. Changes include correcting
parameter names in comments and signatures (e.g., standardizing on vmsg
for vhost messages, fixing dev to vdev), updating function names in
comment descriptions, and removing/rectifying erroneous parameter
documentation.

Signed-off-by: Laurent Vivier <lvivier@redhat.com>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 vhost_user.c | 221 +++++++++++++++++++++++++--------------------------
 vhost_user.h |   2 +-
 2 files changed, 111 insertions(+), 112 deletions(-)

diff --git a/vhost_user.c b/vhost_user.c
index ca36763..e8377bb 100644
--- a/vhost_user.c
+++ b/vhost_user.c
@@ -302,13 +302,13 @@ static void vu_message_write(int conn_fd, struct vhost_user_msg *vmsg)
  * @conn_fd:	vhost-user command socket
  * @vmsg:	vhost-user message
  */
-static void vu_send_reply(int conn_fd, struct vhost_user_msg *msg)
+static void vu_send_reply(int conn_fd, struct vhost_user_msg *vmsg)
 {
-	msg->hdr.flags &= ~VHOST_USER_VERSION_MASK;
-	msg->hdr.flags |= VHOST_USER_VERSION;
-	msg->hdr.flags |= VHOST_USER_REPLY_MASK;
+	vmsg->hdr.flags &= ~VHOST_USER_VERSION_MASK;
+	vmsg->hdr.flags |= VHOST_USER_VERSION;
+	vmsg->hdr.flags |= VHOST_USER_REPLY_MASK;
 
-	vu_message_write(conn_fd, msg);
+	vu_message_write(conn_fd, vmsg);
 }
 
 /**
@@ -319,7 +319,7 @@ static void vu_send_reply(int conn_fd, struct vhost_user_msg *msg)
  * Return: True as a reply is requested
  */
 static bool vu_get_features_exec(struct vu_dev *vdev,
-				 struct vhost_user_msg *msg)
+				 struct vhost_user_msg *vmsg)
 {
 	uint64_t features =
 		1ULL << VIRTIO_F_VERSION_1 |
@@ -329,9 +329,9 @@ static bool vu_get_features_exec(struct vu_dev *vdev,
 
 	(void)vdev;
 
-	vmsg_set_reply_u64(msg, features);
+	vmsg_set_reply_u64(vmsg, features);
 
-	debug("Sending back to guest u64: 0x%016"PRIx64, msg->payload.u64);
+	debug("Sending back to guest u64: 0x%016"PRIx64, vmsg->payload.u64);
 
 	return true;
 }
@@ -357,11 +357,11 @@ static void vu_set_enable_all_rings(struct vu_dev *vdev, bool enable)
  * Return: False as no reply is requested
  */
 static bool vu_set_features_exec(struct vu_dev *vdev,
-				 struct vhost_user_msg *msg)
+				 struct vhost_user_msg *vmsg)
 {
-	debug("u64: 0x%016"PRIx64, msg->payload.u64);
+	debug("u64: 0x%016"PRIx64, vmsg->payload.u64);
 
-	vdev->features = msg->payload.u64;
+	vdev->features = vmsg->payload.u64;
 	/* We only support devices conforming to VIRTIO 1.0 or
 	 * later
 	 */
@@ -382,10 +382,10 @@ static bool vu_set_features_exec(struct vu_dev *vdev,
  * Return: False as no reply is requested
  */
 static bool vu_set_owner_exec(struct vu_dev *vdev,
-			      struct vhost_user_msg *msg)
+			      struct vhost_user_msg *vmsg)
 {
 	(void)vdev;
-	(void)msg;
+	(void)vmsg;
 
 	return false;
 }
@@ -423,9 +423,9 @@ static bool map_ring(struct vu_dev *vdev, struct vu_virtq *vq)
  * #syscalls:vu mmap|mmap2 munmap
  */
 static bool vu_set_mem_table_exec(struct vu_dev *vdev,
-				  struct vhost_user_msg *msg)
+				  struct vhost_user_msg *vmsg)
 {
-	struct vhost_user_memory m = msg->payload.memory, *memory = &m;
+	struct vhost_user_memory m = vmsg->payload.memory, *memory = &m;
 	unsigned int i;
 
 	for (i = 0; i < vdev->nregions; i++) {
@@ -465,7 +465,7 @@ static bool vu_set_mem_table_exec(struct vu_dev *vdev,
 		 */
 		mmap_addr = mmap(0, dev_region->size + dev_region->mmap_offset,
 				 PROT_READ | PROT_WRITE, MAP_SHARED |
-				 MAP_NORESERVE, msg->fds[i], 0);
+				 MAP_NORESERVE, vmsg->fds[i], 0);
 
 		if (mmap_addr == MAP_FAILED)
 			die_perror("vhost-user region mmap error");
@@ -474,7 +474,7 @@ static bool vu_set_mem_table_exec(struct vu_dev *vdev,
 		debug("    mmap_addr:       0x%016"PRIx64,
 		      dev_region->mmap_addr);
 
-		close(msg->fds[i]);
+		close(vmsg->fds[i]);
 	}
 
 	for (i = 0; i < VHOST_USER_MAX_QUEUES; i++) {
@@ -541,7 +541,7 @@ static void vu_log_page(uint8_t *log_table, uint64_t page)
 
 /**
  * vu_log_write() - Log memory write
- * @dev:	vhost-user device
+ * @vdev:	vhost-user device
  * @address:	Memory address
  * @length:	Memory size
  */
@@ -566,23 +566,23 @@ void vu_log_write(const struct vu_dev *vdev, uint64_t address, uint64_t length)
  * @vdev:	vhost-user device
  * @vmsg:	vhost-user message
  *
- * Return: False as no reply is requested
+ * Return: True as a reply is requested
  *
  * #syscalls:vu mmap|mmap2 munmap
  */
 static bool vu_set_log_base_exec(struct vu_dev *vdev,
-				 struct vhost_user_msg *msg)
+				 struct vhost_user_msg *vmsg)
 {
 	uint64_t log_mmap_size, log_mmap_offset;
 	void *base;
 	int fd;
 
-	if (msg->fd_num != 1 || msg->hdr.size != sizeof(msg->payload.log))
+	if (vmsg->fd_num != 1 || vmsg->hdr.size != sizeof(vmsg->payload.log))
 		die("vhost-user: Invalid log_base message");
 
-	fd = msg->fds[0];
-	log_mmap_offset = msg->payload.log.mmap_offset;
-	log_mmap_size = msg->payload.log.mmap_size;
+	fd = vmsg->fds[0];
+	log_mmap_offset = vmsg->payload.log.mmap_offset;
+	log_mmap_size = vmsg->payload.log.mmap_size;
 
 	debug("vhost-user log mmap_offset: %"PRId64, log_mmap_offset);
 	debug("vhost-user log mmap_size:   %"PRId64, log_mmap_size);
@@ -599,8 +599,8 @@ static bool vu_set_log_base_exec(struct vu_dev *vdev,
 	vdev->log_table = base;
 	vdev->log_size = log_mmap_size;
 
-	msg->hdr.size = sizeof(msg->payload.u64);
-	msg->fd_num = 0;
+	vmsg->hdr.size = sizeof(vmsg->payload.u64);
+	vmsg->fd_num = 0;
 
 	return true;
 }
@@ -613,15 +613,15 @@ static bool vu_set_log_base_exec(struct vu_dev *vdev,
  * Return: False as no reply is requested
  */
 static bool vu_set_log_fd_exec(struct vu_dev *vdev,
-			       struct vhost_user_msg *msg)
+			       struct vhost_user_msg *vmsg)
 {
-	if (msg->fd_num != 1)
+	if (vmsg->fd_num != 1)
 		die("Invalid log_fd message");
 
 	if (vdev->log_call_fd != -1)
 		close(vdev->log_call_fd);
 
-	vdev->log_call_fd = msg->fds[0];
+	vdev->log_call_fd = vmsg->fds[0];
 
 	debug("Got log_call_fd: %d", vdev->log_call_fd);
 
@@ -636,10 +636,10 @@ static bool vu_set_log_fd_exec(struct vu_dev *vdev,
  * Return: False as no reply is requested
  */
 static bool vu_set_vring_num_exec(struct vu_dev *vdev,
-				  struct vhost_user_msg *msg)
+				  struct vhost_user_msg *vmsg)
 {
-	unsigned int idx = msg->payload.state.index;
-	unsigned int num = msg->payload.state.num;
+	unsigned int idx = vmsg->payload.state.index;
+	unsigned int num = vmsg->payload.state.num;
 
 	trace("State.index: %u", idx);
 	trace("State.num:   %u", num);
@@ -656,13 +656,13 @@ static bool vu_set_vring_num_exec(struct vu_dev *vdev,
  * Return: False as no reply is requested
  */
 static bool vu_set_vring_addr_exec(struct vu_dev *vdev,
-				   struct vhost_user_msg *msg)
+				   struct vhost_user_msg *vmsg)
 {
 	/* We need to copy the payload to vhost_vring_addr structure
-         * to access index because address of msg->payload.addr
+         * to access index because address of vmsg->payload.addr
          * can be unaligned as it is packed.
          */
-	struct vhost_vring_addr addr = msg->payload.addr;
+	struct vhost_vring_addr addr = vmsg->payload.addr;
 	struct vu_virtq *vq = &vdev->vq[addr.index];
 
 	debug("vhost_vring_addr:");
@@ -677,7 +677,7 @@ static bool vu_set_vring_addr_exec(struct vu_dev *vdev,
 	debug("    log_guest_addr:   0x%016" PRIx64,
 	      (uint64_t)addr.log_guest_addr);
 
-	vq->vra = msg->payload.addr;
+	vq->vra = vmsg->payload.addr;
 	vq->vring.flags = addr.flags;
 	vq->vring.log_guest_addr = addr.log_guest_addr;
 
@@ -702,10 +702,10 @@ static bool vu_set_vring_addr_exec(struct vu_dev *vdev,
  * Return: False as no reply is requested
  */
 static bool vu_set_vring_base_exec(struct vu_dev *vdev,
-				   struct vhost_user_msg *msg)
+				   struct vhost_user_msg *vmsg)
 {
-	unsigned int idx = msg->payload.state.index;
-	unsigned int num = msg->payload.state.num;
+	unsigned int idx = vmsg->payload.state.index;
+	unsigned int num = vmsg->payload.state.num;
 
 	debug("State.index: %u", idx);
 	debug("State.num:   %u", num);
@@ -723,13 +723,13 @@ static bool vu_set_vring_base_exec(struct vu_dev *vdev,
  * Return: True as a reply is requested
  */
 static bool vu_get_vring_base_exec(struct vu_dev *vdev,
-				   struct vhost_user_msg *msg)
+				   struct vhost_user_msg *vmsg)
 {
-	unsigned int idx = msg->payload.state.index;
+	unsigned int idx = vmsg->payload.state.index;
 
 	debug("State.index: %u", idx);
-	msg->payload.state.num = vdev->vq[idx].last_avail_idx;
-	msg->hdr.size = sizeof(msg->payload.state);
+	vmsg->payload.state.num = vdev->vq[idx].last_avail_idx;
+	vmsg->hdr.size = sizeof(vmsg->payload.state);
 
 	vdev->vq[idx].started = false;
 	vdev->vq[idx].vring.avail = 0;
@@ -771,21 +771,21 @@ static void vu_set_watch(const struct vu_dev *vdev, int idx)
  * 			       close fds if NOFD bit is set
  * @vmsg:	vhost-user message
  */
-static void vu_check_queue_msg_file(struct vhost_user_msg *msg)
+static void vu_check_queue_msg_file(struct vhost_user_msg *vmsg)
 {
-	bool nofd = msg->payload.u64 & VHOST_USER_VRING_NOFD_MASK;
-	int idx = msg->payload.u64 & VHOST_USER_VRING_IDX_MASK;
+	bool nofd = vmsg->payload.u64 & VHOST_USER_VRING_NOFD_MASK;
+	int idx = vmsg->payload.u64 & VHOST_USER_VRING_IDX_MASK;
 
 	if (idx >= VHOST_USER_MAX_QUEUES)
 		die("Invalid vhost-user queue index: %u", idx);
 
 	if (nofd) {
-		vmsg_close_fds(msg);
+		vmsg_close_fds(vmsg);
 		return;
 	}
 
-	if (msg->fd_num != 1)
-		die("Invalid fds in vhost-user request: %d", msg->hdr.request);
+	if (vmsg->fd_num != 1)
+		die("Invalid fds in vhost-user request: %d", vmsg->hdr.request);
 }
 
 /**
@@ -797,14 +797,14 @@ static void vu_check_queue_msg_file(struct vhost_user_msg *msg)
  * Return: False as no reply is requested
  */
 static bool vu_set_vring_kick_exec(struct vu_dev *vdev,
-				   struct vhost_user_msg *msg)
+				   struct vhost_user_msg *vmsg)
 {
-	bool nofd = msg->payload.u64 & VHOST_USER_VRING_NOFD_MASK;
-	int idx = msg->payload.u64 & VHOST_USER_VRING_IDX_MASK;
+	bool nofd = vmsg->payload.u64 & VHOST_USER_VRING_NOFD_MASK;
+	int idx = vmsg->payload.u64 & VHOST_USER_VRING_IDX_MASK;
 
-	debug("u64: 0x%016"PRIx64, msg->payload.u64);
+	debug("u64: 0x%016"PRIx64, vmsg->payload.u64);
 
-	vu_check_queue_msg_file(msg);
+	vu_check_queue_msg_file(vmsg);
 
 	if (vdev->vq[idx].kick_fd != -1) {
 		epoll_del(vdev->context, vdev->vq[idx].kick_fd);
@@ -813,7 +813,7 @@ static bool vu_set_vring_kick_exec(struct vu_dev *vdev,
 	}
 
 	if (!nofd)
-		vdev->vq[idx].kick_fd = msg->fds[0];
+		vdev->vq[idx].kick_fd = vmsg->fds[0];
 
 	debug("Got kick_fd: %d for vq: %d", vdev->vq[idx].kick_fd, idx);
 
@@ -837,14 +837,14 @@ static bool vu_set_vring_kick_exec(struct vu_dev *vdev,
  * Return: False as no reply is requested
  */
 static bool vu_set_vring_call_exec(struct vu_dev *vdev,
-				   struct vhost_user_msg *msg)
+				   struct vhost_user_msg *vmsg)
 {
-	bool nofd = msg->payload.u64 & VHOST_USER_VRING_NOFD_MASK;
-	int idx = msg->payload.u64 & VHOST_USER_VRING_IDX_MASK;
+	bool nofd = vmsg->payload.u64 & VHOST_USER_VRING_NOFD_MASK;
+	int idx = vmsg->payload.u64 & VHOST_USER_VRING_IDX_MASK;
 
-	debug("u64: 0x%016"PRIx64, msg->payload.u64);
+	debug("u64: 0x%016"PRIx64, vmsg->payload.u64);
 
-	vu_check_queue_msg_file(msg);
+	vu_check_queue_msg_file(vmsg);
 
 	if (vdev->vq[idx].call_fd != -1) {
 		close(vdev->vq[idx].call_fd);
@@ -852,11 +852,11 @@ static bool vu_set_vring_call_exec(struct vu_dev *vdev,
 	}
 
 	if (!nofd)
-		vdev->vq[idx].call_fd = msg->fds[0];
+		vdev->vq[idx].call_fd = vmsg->fds[0];
 
 	/* in case of I/O hang after reconnecting */
 	if (vdev->vq[idx].call_fd != -1)
-		eventfd_write(msg->fds[0], 1);
+		eventfd_write(vmsg->fds[0], 1);
 
 	debug("Got call_fd: %d for vq: %d", vdev->vq[idx].call_fd, idx);
 
@@ -872,14 +872,14 @@ static bool vu_set_vring_call_exec(struct vu_dev *vdev,
  * Return: False as no reply is requested
  */
 static bool vu_set_vring_err_exec(struct vu_dev *vdev,
-				  struct vhost_user_msg *msg)
+				  struct vhost_user_msg *vmsg)
 {
-	bool nofd = msg->payload.u64 & VHOST_USER_VRING_NOFD_MASK;
-	int idx = msg->payload.u64 & VHOST_USER_VRING_IDX_MASK;
+	bool nofd = vmsg->payload.u64 & VHOST_USER_VRING_NOFD_MASK;
+	int idx = vmsg->payload.u64 & VHOST_USER_VRING_IDX_MASK;
 
-	debug("u64: 0x%016"PRIx64, msg->payload.u64);
+	debug("u64: 0x%016"PRIx64, vmsg->payload.u64);
 
-	vu_check_queue_msg_file(msg);
+	vu_check_queue_msg_file(vmsg);
 
 	if (vdev->vq[idx].err_fd != -1) {
 		close(vdev->vq[idx].err_fd);
@@ -887,7 +887,7 @@ static bool vu_set_vring_err_exec(struct vu_dev *vdev,
 	}
 
 	if (!nofd)
-		vdev->vq[idx].err_fd = msg->fds[0];
+		vdev->vq[idx].err_fd = vmsg->fds[0];
 
 	return false;
 }
@@ -901,7 +901,7 @@ static bool vu_set_vring_err_exec(struct vu_dev *vdev,
  * Return: True as a reply is requested
  */
 static bool vu_get_protocol_features_exec(struct vu_dev *vdev,
-					  struct vhost_user_msg *msg)
+					  struct vhost_user_msg *vmsg)
 {
 	uint64_t features = 1ULL << VHOST_USER_PROTOCOL_F_REPLY_ACK |
 			    1ULL << VHOST_USER_PROTOCOL_F_LOG_SHMFD |
@@ -909,7 +909,7 @@ static bool vu_get_protocol_features_exec(struct vu_dev *vdev,
 			    1ULL << VHOST_USER_PROTOCOL_F_RARP;
 
 	(void)vdev;
-	vmsg_set_reply_u64(msg, features);
+	vmsg_set_reply_u64(vmsg, features);
 
 	return true;
 }
@@ -922,13 +922,13 @@ static bool vu_get_protocol_features_exec(struct vu_dev *vdev,
  * Return: False as no reply is requested
  */
 static bool vu_set_protocol_features_exec(struct vu_dev *vdev,
-					  struct vhost_user_msg *msg)
+					  struct vhost_user_msg *vmsg)
 {
-	uint64_t features = msg->payload.u64;
+	uint64_t features = vmsg->payload.u64;
 
 	debug("u64: 0x%016"PRIx64, features);
 
-	vdev->protocol_features = msg->payload.u64;
+	vdev->protocol_features = vmsg->payload.u64;
 
 	return false;
 }
@@ -941,11 +941,11 @@ static bool vu_set_protocol_features_exec(struct vu_dev *vdev,
  * Return: True as a reply is requested
  */
 static bool vu_get_queue_num_exec(struct vu_dev *vdev,
-				  struct vhost_user_msg *msg)
+				  struct vhost_user_msg *vmsg)
 {
 	(void)vdev;
 
-	vmsg_set_reply_u64(msg, VHOST_USER_MAX_QUEUES);
+	vmsg_set_reply_u64(vmsg, VHOST_USER_MAX_QUEUES);
 
 	return true;
 }
@@ -958,10 +958,10 @@ static bool vu_get_queue_num_exec(struct vu_dev *vdev,
  * Return: False as no reply is requested
  */
 static bool vu_set_vring_enable_exec(struct vu_dev *vdev,
-				     struct vhost_user_msg *msg)
+				     struct vhost_user_msg *vmsg)
 {
-	unsigned int enable = msg->payload.state.num;
-	unsigned int idx = msg->payload.state.index;
+	unsigned int enable = vmsg->payload.state.num;
+	unsigned int idx = vmsg->payload.state.index;
 
 	debug("State.index:  %u", idx);
 	debug("State.enable: %u", enable);
@@ -974,17 +974,17 @@ static bool vu_set_vring_enable_exec(struct vu_dev *vdev,
 }
 
 /**
- * vu_set_send_rarp_exec() - vhost-user specification says: "Broadcast a fake
- * 			     RARP to notify the migration is terminated",
- * 			     but passt doesn't need to update any ARP table,
- * 			     so do nothing to silence QEMU bogus error message
+ * vu_send_rarp_exec() - vhost-user specification says: "Broadcast a fake
+ * 			 RARP to notify the migration is terminated",
+ * 			 but passt doesn't need to update any ARP table,
+ * 			 so do nothing to silence QEMU bogus error message
  * @vdev:	vhost-user device
  * @vmsg:	vhost-user message
  *
  * Return: False as no reply is requested
  */
 static bool vu_send_rarp_exec(struct vu_dev *vdev,
-			      struct vhost_user_msg *msg)
+			      struct vhost_user_msg *vmsg)
 {
 	char macstr[ETH_ADDRSTRLEN];
 
@@ -993,7 +993,7 @@ static bool vu_send_rarp_exec(struct vu_dev *vdev,
 	/* ignore the command */
 
 	debug("Ignore command VHOST_USER_SEND_RARP for %s",
-	      eth_ntop((unsigned char *)&msg->payload.u64, macstr,
+	      eth_ntop((unsigned char *)&vmsg->payload.u64, macstr,
 		       sizeof(macstr)));
 
 	return false;
@@ -1008,12 +1008,12 @@ static bool vu_send_rarp_exec(struct vu_dev *vdev,
  *         and set bit 8 as we don't provide our own fd.
  */
 static bool vu_set_device_state_fd_exec(struct vu_dev *vdev,
-					struct vhost_user_msg *msg)
+					struct vhost_user_msg *vmsg)
 {
-	unsigned int direction = msg->payload.transfer_state.direction;
-	unsigned int phase = msg->payload.transfer_state.phase;
+	unsigned int direction = vmsg->payload.transfer_state.direction;
+	unsigned int phase = vmsg->payload.transfer_state.phase;
 
-	if (msg->fd_num != 1)
+	if (vmsg->fd_num != 1)
 		die("Invalid device_state_fd message");
 
 	if (phase != VHOST_USER_TRANSFER_STATE_PHASE_STOPPED)
@@ -1023,11 +1023,11 @@ static bool vu_set_device_state_fd_exec(struct vu_dev *vdev,
 	    direction != VHOST_USER_TRANSFER_STATE_DIRECTION_LOAD)
 		die("Invalid device_state_fd direction: %d", direction);
 
-	migrate_request(vdev->context, msg->fds[0],
+	migrate_request(vdev->context, vmsg->fds[0],
 			direction == VHOST_USER_TRANSFER_STATE_DIRECTION_LOAD);
 
 	/* We don't provide a new fd for the data transfer */
-	vmsg_set_reply_u64(msg, VHOST_USER_VRING_NOFD_MASK);
+	vmsg_set_reply_u64(vmsg, VHOST_USER_VRING_NOFD_MASK);
 
 	return true;
 }
@@ -1041,9 +1041,9 @@ static bool vu_set_device_state_fd_exec(struct vu_dev *vdev,
  */
 /* cppcheck-suppress constParameterCallback */
 static bool vu_check_device_state_exec(struct vu_dev *vdev,
-				       struct vhost_user_msg *msg)
+				       struct vhost_user_msg *vmsg)
 {
-	vmsg_set_reply_u64(msg, vdev->context->device_state_result);
+	vmsg_set_reply_u64(vmsg, vdev->context->device_state_result);
 
 	return true;
 }
@@ -1051,7 +1051,6 @@ static bool vu_check_device_state_exec(struct vu_dev *vdev,
 /**
  * vu_init() - Initialize vhost-user device structure
  * @c:		execution context
- * @vdev:	vhost-user device
  */
 void vu_init(struct ctx *c)
 {
@@ -1134,7 +1133,7 @@ static void vu_sock_reset(struct vu_dev *vdev)
 }
 
 static bool (*vu_handle[VHOST_USER_MAX])(struct vu_dev *vdev,
-					struct vhost_user_msg *msg) = {
+					struct vhost_user_msg *vmsg) = {
 	[VHOST_USER_GET_FEATURES]	   = vu_get_features_exec,
 	[VHOST_USER_SET_FEATURES]	   = vu_set_features_exec,
 	[VHOST_USER_GET_PROTOCOL_FEATURES] = vu_get_protocol_features_exec,
@@ -1165,7 +1164,7 @@ static bool (*vu_handle[VHOST_USER_MAX])(struct vu_dev *vdev,
  */
 void vu_control_handler(struct vu_dev *vdev, int fd, uint32_t events)
 {
-	struct vhost_user_msg msg = { 0 };
+	struct vhost_user_msg vmsg = { 0 };
 	bool need_reply, reply_requested;
 	int ret;
 
@@ -1174,38 +1173,38 @@ void vu_control_handler(struct vu_dev *vdev, int fd, uint32_t events)
 		return;
 	}
 
-	ret = vu_message_read_default(fd, &msg);
+	ret = vu_message_read_default(fd, &vmsg);
 	if (ret == 0) {
 		vu_sock_reset(vdev);
 		return;
 	}
 	debug("================ Vhost user message ================");
-	debug("Request: %s (%d)", vu_request_to_string(msg.hdr.request),
-		msg.hdr.request);
-	debug("Flags:   0x%x", msg.hdr.flags);
-	debug("Size:    %u", msg.hdr.size);
+	debug("Request: %s (%d)", vu_request_to_string(vmsg.hdr.request),
+		vmsg.hdr.request);
+	debug("Flags:   0x%x", vmsg.hdr.flags);
+	debug("Size:    %u", vmsg.hdr.size);
 
-	need_reply = msg.hdr.flags & VHOST_USER_NEED_REPLY_MASK;
+	need_reply = vmsg.hdr.flags & VHOST_USER_NEED_REPLY_MASK;
 
-	if (msg.hdr.request >= 0 && msg.hdr.request < VHOST_USER_MAX &&
-	    vu_handle[msg.hdr.request])
-		reply_requested = vu_handle[msg.hdr.request](vdev, &msg);
+	if (vmsg.hdr.request >= 0 && vmsg.hdr.request < VHOST_USER_MAX &&
+	    vu_handle[vmsg.hdr.request])
+		reply_requested = vu_handle[vmsg.hdr.request](vdev, &vmsg);
 	else
-		die("Unhandled request: %d", msg.hdr.request);
+		die("Unhandled request: %d", vmsg.hdr.request);
 
 	/* cppcheck-suppress legacyUninitvar */
 	if (!reply_requested && need_reply) {
-		msg.payload.u64 = 0;
-		msg.hdr.flags = 0;
-		msg.hdr.size = sizeof(msg.payload.u64);
-		msg.fd_num = 0;
+		vmsg.payload.u64 = 0;
+		vmsg.hdr.flags = 0;
+		vmsg.hdr.size = sizeof(vmsg.payload.u64);
+		vmsg.fd_num = 0;
 		reply_requested = true;
 	}
 
 	if (reply_requested)
-		vu_send_reply(fd, &msg);
+		vu_send_reply(fd, &vmsg);
 
-	if (msg.hdr.request == VHOST_USER_CHECK_DEVICE_STATE &&
+	if (vmsg.hdr.request == VHOST_USER_CHECK_DEVICE_STATE &&
 	    vdev->context->device_state_result == 0 &&
 	    !vdev->context->migrate_target) {
 		info("Migration complete, exiting");
diff --git a/vhost_user.h b/vhost_user.h
index 1daacd1..f2ae2da 100644
--- a/vhost_user.h
+++ b/vhost_user.h
@@ -184,7 +184,7 @@ union vhost_user_payload {
 };
 
 /**
- * struct vhost_user_msg - vhost-use message
+ * struct vhost_user_msg - vhost-user message
  * @hdr:		Message header
  * @payload:		Message payload
  * @fds:		File descriptors associated with the message

From b915375a421d70065baa90444da49954ceacde38 Mon Sep 17 00:00:00 2001
From: Laurent Vivier <lvivier@redhat.com>
Date: Fri, 16 May 2025 14:42:27 +0200
Subject: [PATCH 143/144] virtio: Correct and align comment headers

Standardize and fix issues in `virtio.c` and `virtio.h` comment headers.

Improvements include:
- Added `()` to function names in comment summaries.
- Added colons after parameter and enum member tags.
- Changed `/*` to `/**` for `virtq_avail_event()` comment.
- Fixed typos (e.g., "file"->"fill", "virqueue"->"virtqueue").
- Added missing `Return:` tag for `vu_queue_rewind()`.
- Corrected parameter names in `virtio.h` comments to match code.

Signed-off-by: Laurent Vivier <lvivier@redhat.com>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 virtio.c | 29 ++++++++++++++++-------------
 virtio.h |  4 ++--
 2 files changed, 18 insertions(+), 15 deletions(-)

diff --git a/virtio.c b/virtio.c
index f7db007..83906aa 100644
--- a/virtio.c
+++ b/virtio.c
@@ -156,9 +156,9 @@ static inline uint16_t vring_avail_ring(const struct vu_virtq *vq, int i)
 }
 
 /**
- * virtq_used_event - Get location of used event indices
+ * virtq_used_event() - Get location of used event indices
  *		      (only with VIRTIO_F_EVENT_IDX)
- * @vq		Virtqueue
+ * @vq:		Virtqueue
  *
  * Return: return the location of the used event index
  */
@@ -170,7 +170,7 @@ static inline uint16_t *virtq_used_event(const struct vu_virtq *vq)
 
 /**
  * vring_get_used_event() - Get the used event from the available ring
- * @vq		Virtqueue
+ * @vq:		Virtqueue
  *
  * Return: the used event (available only if VIRTIO_RING_F_EVENT_IDX is set)
  *         used_event is a performant alternative where the driver
@@ -244,9 +244,9 @@ static int virtqueue_read_indirect_desc(const struct vu_dev *dev,
 
 /**
  * enum virtqueue_read_desc_state - State in the descriptor chain
- * @VIRTQUEUE_READ_DESC_ERROR	Found an invalid descriptor
- * @VIRTQUEUE_READ_DESC_DONE	No more descriptors in the chain
- * @VIRTQUEUE_READ_DESC_MORE	there are more descriptors in the chain
+ * @VIRTQUEUE_READ_DESC_ERROR:	Found an invalid descriptor
+ * @VIRTQUEUE_READ_DESC_DONE:	No more descriptors in the chain
+ * @VIRTQUEUE_READ_DESC_MORE:	There are more descriptors in the chain
  */
 enum virtqueue_read_desc_state {
 	VIRTQUEUE_READ_DESC_ERROR = -1,
@@ -347,8 +347,9 @@ void vu_queue_notify(const struct vu_dev *dev, struct vu_virtq *vq)
 		die_perror("Error writing vhost-user queue eventfd");
 }
 
-/* virtq_avail_event() -  Get location of available event indices
- *			      (only with VIRTIO_F_EVENT_IDX)
+/**
+ * virtq_avail_event() - Get location of available event indices
+ *			 (only with VIRTIO_F_EVENT_IDX)
  * @vq:		Virtqueue
  *
  * Return: return the location of the available event index
@@ -421,8 +422,8 @@ static bool virtqueue_map_desc(const struct vu_dev *dev,
 }
 
 /**
- * vu_queue_map_desc - Map the virtqueue descriptor ring into our virtual
- * 		       address space
+ * vu_queue_map_desc() - Map the virtqueue descriptor ring into our virtual
+ * 			 address space
  * @dev:	Vhost-user device
  * @vq:		Virtqueue
  * @idx:	First descriptor ring entry to map
@@ -505,7 +506,7 @@ static int vu_queue_map_desc(const struct vu_dev *dev,
  * vu_queue_pop() - Pop an entry from the virtqueue
  * @dev:	Vhost-user device
  * @vq:		Virtqueue
- * @elem:	Virtqueue element to file with the entry information
+ * @elem:	Virtqueue element to fill with the entry information
  *
  * Return: -1 if there is an error, 0 otherwise
  */
@@ -545,7 +546,7 @@ int vu_queue_pop(const struct vu_dev *dev, struct vu_virtq *vq,
 }
 
 /**
- * vu_queue_detach_element() - Detach an element from the virqueue
+ * vu_queue_detach_element() - Detach an element from the virtqueue
  * @vq:		Virtqueue
  */
 void vu_queue_detach_element(struct vu_virtq *vq)
@@ -555,7 +556,7 @@ void vu_queue_detach_element(struct vu_virtq *vq)
 }
 
 /**
- * vu_queue_unpop() - Push back the previously popped element from the virqueue
+ * vu_queue_unpop() - Push back the previously popped element from the virtqueue
  * @vq:		Virtqueue
  */
 /* cppcheck-suppress unusedFunction */
@@ -569,6 +570,8 @@ void vu_queue_unpop(struct vu_virtq *vq)
  * vu_queue_rewind() - Push back a given number of popped elements
  * @vq:		Virtqueue
  * @num:	Number of element to unpop
+ *
+ * Return: True on success, false if not
  */
 bool vu_queue_rewind(struct vu_virtq *vq, unsigned int num)
 {
diff --git a/virtio.h b/virtio.h
index 7a370bd..d8beb88 100644
--- a/virtio.h
+++ b/virtio.h
@@ -150,7 +150,7 @@ static inline bool has_feature(uint64_t features, unsigned int fbit)
 /**
  * vu_has_feature() - Check if a virtio-net feature is available
  * @vdev:	Vhost-user device
- * @bit:	Feature to check
+ * @fbit:	Feature to check
  *
  * Return:	True if the feature is available
  */
@@ -163,7 +163,7 @@ static inline bool vu_has_feature(const struct vu_dev *vdev,
 /**
  * vu_has_protocol_feature() - Check if a vhost-user feature is available
  * @vdev:	Vhost-user device
- * @bit:	Feature to check
+ * @fbit:	Feature to check
  *
  * Return:	True if the feature is available
  */

From 3262c9b088288902f28b5d09f61220fae5376082 Mon Sep 17 00:00:00 2001
From: Laurent Vivier <lvivier@redhat.com>
Date: Fri, 16 May 2025 14:42:28 +0200
Subject: [PATCH 144/144] iov: Standardize function comment headers

Update function comment headers in iov.c to a consistent and
standardized format.

This change ensures:
- Comment blocks for functions consistently start with /**.
- Function names in the comment summary line include parentheses ().

This improves overall comment clarity and uniformity within the file.

Signed-off-by: Laurent Vivier <lvivier@redhat.com>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 iov.c | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/iov.c b/iov.c
index 8c63b7e..91e87a7 100644
--- a/iov.c
+++ b/iov.c
@@ -26,7 +26,8 @@
 #include "iov.h"
 
 
-/* iov_skip_bytes() - Skip leading bytes of an IO vector
+/**
+ * iov_skip_bytes() - Skip leading bytes of an IO vector
  * @iov:	IO vector
  * @n:		Number of entries in @iov
  * @skip:	Number of leading bytes of @iov to skip
@@ -56,8 +57,8 @@ size_t iov_skip_bytes(const struct iovec *iov, size_t n,
 }
 
 /**
- * iov_from_buf - Copy data from a buffer to an I/O vector (struct iovec)
- *                efficiently.
+ * iov_from_buf() - Copy data from a buffer to an I/O vector (struct iovec)
+ *                  efficiently.
  *
  * @iov:       Pointer to the array of struct iovec describing the
  *             scatter/gather I/O vector.
@@ -96,8 +97,8 @@ size_t iov_from_buf(const struct iovec *iov, size_t iov_cnt,
 }
 
 /**
- * iov_to_buf - Copy data from a scatter/gather I/O vector (struct iovec) to
- *		a buffer efficiently.
+ * iov_to_buf() - Copy data from a scatter/gather I/O vector (struct iovec) to
+ *		  a buffer efficiently.
  *
  * @iov:       Pointer to the array of struct iovec describing the scatter/gather
  *             I/O vector.
@@ -136,8 +137,8 @@ size_t iov_to_buf(const struct iovec *iov, size_t iov_cnt,
 }
 
 /**
- * iov_size - Calculate the total size of a scatter/gather I/O vector
- *            (struct iovec).
+ * iov_size() - Calculate the total size of a scatter/gather I/O vector
+ *              (struct iovec).
  *
  * @iov:       Pointer to the array of struct iovec describing the
  *             scatter/gather I/O vector.