tcp: Move tcp_l2_buf_fill_headers() to tcp_buf.c

This function only has callers in tcp_buf.c. More importantly, it's inherently tied to the "buf" path, because it uses internal knowledge of how we lay out the various headers across our locally allocated buffers. Therefore, move it to tcp_buf.c. Slightly reformat the prototypes while we're at it. Signed-off-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Laurent Vivier <lvivier@redhat.com>
tcp_vu: Share more header construction between IPv4 and IPv6 paths
2025-04-11 07:45:01 +02:00 · 2024-11-15 10:55:53 +01:00 · 2024-11-15 10:55:53 +01:00 · 2024-11-15 10:55:53 +01:00 · 2024-11-15 10:55:53 +01:00 · 2024-11-15 10:54:01 +01:00
107 changed files with 1816 additions and 7528 deletions
--- a/.gitignore
+++ b/.gitignore
@ -3,10 +3,8 @@
 /passt.avx2
 /pasta
 /pasta.avx2
-/passt-repair
 /qrap
 /pasta.1
 /seccomp.h
-/seccomp_repair.h
 /c*.json
 README.plain.md
--- a/48
+++ b/48
@ -16,12 +16,9 @@ VERSION ?= $(shell git describe --tags HEAD 2>/dev/null || echo "unknown\ versio
 DUAL_STACK_SOCKETS := 1

 TARGET ?= $(shell $(CC) -dumpmachine)
-$(if $(TARGET),,$(error Failed to get target architecture))
 # Get 'uname -m'-like architecture description for target
-TARGET_ARCH := $(firstword $(subst -, ,$(TARGET)))
-TARGET_ARCH := $(patsubst [:upper:],[:lower:],$(TARGET_ARCH))
-TARGET_ARCH := $(patsubst arm%,arm,$(TARGET_ARCH))
-TARGET_ARCH := $(subst powerpc,ppc,$(TARGET_ARCH))
+TARGET_ARCH := $(shell echo $(TARGET) | cut -f1 -d- | tr [A-Z] [a-z])
+TARGET_ARCH := $(shell echo $(TARGET_ARCH) | sed 's/powerpc/ppc/')

 # On some systems enabling optimization also enables source fortification,
 # automagically. Do not override it.
@ -30,7 +27,7 @@ ifeq ($(shell $(CC) -O2 -dM -E - < /dev/null 2>&1 | grep ' _FORTIFY_SOURCE ' > /
 FORTIFY_FLAG := -D_FORTIFY_SOURCE=2
 endif

-FLAGS := -Wall -Wextra -Wno-format-zero-length -Wformat-security
+FLAGS := -Wall -Wextra -Wno-format-zero-length
 FLAGS += -pedantic -std=c11 -D_XOPEN_SOURCE=700 -D_GNU_SOURCE
 FLAGS +=  $(FORTIFY_FLAG) -O2 -pie -fPIE
 FLAGS += -DPAGE_SIZE=$(shell getconf PAGE_SIZE)
@ -39,21 +36,20 @@ FLAGS += -DDUAL_STACK_SOCKETS=$(DUAL_STACK_SOCKETS)

 PASST_SRCS = arch.c arp.c checksum.c conf.c dhcp.c dhcpv6.c flow.c fwd.c \
 	icmp.c igmp.c inany.c iov.c ip.c isolation.c lineread.c log.c mld.c \
-	ndp.c netlink.c migrate.c packet.c passt.c pasta.c pcap.c pif.c \
-	repair.c tap.c tcp.c tcp_buf.c tcp_splice.c tcp_vu.c udp.c udp_flow.c \
-	udp_vu.c util.c vhost_user.c virtio.c vu_common.c
+	ndp.c netlink.c packet.c passt.c pasta.c pcap.c pif.c tap.c tcp.c \
+	tcp_buf.c tcp_splice.c tcp_vu.c udp.c udp_flow.c udp_vu.c util.c \
+	vhost_user.c virtio.c vu_common.c
 QRAP_SRCS = qrap.c
-PASST_REPAIR_SRCS = passt-repair.c
-SRCS = $(PASST_SRCS) $(QRAP_SRCS) $(PASST_REPAIR_SRCS)
+SRCS = $(PASST_SRCS) $(QRAP_SRCS)

-MANPAGES = passt.1 pasta.1 qrap.1 passt-repair.1
+MANPAGES = passt.1 pasta.1 qrap.1

 PASST_HEADERS = arch.h arp.h checksum.h conf.h dhcp.h dhcpv6.h flow.h fwd.h \
 	flow_table.h icmp.h icmp_flow.h inany.h iov.h ip.h isolation.h \
-	lineread.h log.h migrate.h ndp.h netlink.h packet.h passt.h pasta.h \
-	pcap.h pif.h repair.h siphash.h tap.h tcp.h tcp_buf.h tcp_conn.h \
-	tcp_internal.h tcp_splice.h tcp_vu.h udp.h udp_flow.h udp_internal.h \
-	udp_vu.h util.h vhost_user.h virtio.h vu_common.h
+	lineread.h log.h ndp.h netlink.h packet.h passt.h pasta.h pcap.h pif.h \
+	siphash.h tap.h tcp.h tcp_buf.h tcp_conn.h tcp_internal.h tcp_splice.h \
+	tcp_vu.h udp.h udp_flow.h udp_internal.h udp_vu.h util.h vhost_user.h \
+	virtio.h vu_common.h
 HEADERS = $(PASST_HEADERS) seccomp.h

 C := \#include <sys/random.h>\nint main(){int a=getrandom(0, 0, 0);}
@ -74,9 +70,9 @@ mandir		?= $(datarootdir)/man
 man1dir		?= $(mandir)/man1

 ifeq ($(TARGET_ARCH),x86_64)
-BIN := passt passt.avx2 pasta pasta.avx2 qrap passt-repair
+BIN := passt passt.avx2 pasta pasta.avx2 qrap
 else
-BIN := passt pasta qrap passt-repair
+BIN := passt pasta qrap
 endif

 all: $(BIN) $(MANPAGES) docs
@ -85,10 +81,7 @@ static: FLAGS += -static -DGLIBC_NO_STATIC_NSS
 static: clean all

 seccomp.h: seccomp.sh $(PASST_SRCS) $(PASST_HEADERS)
-	@ EXTRA_SYSCALLS="$(EXTRA_SYSCALLS)" ARCH="$(TARGET_ARCH)" CC="$(CC)" ./seccomp.sh seccomp.h $(PASST_SRCS) $(PASST_HEADERS)
-
-seccomp_repair.h: seccomp.sh $(PASST_REPAIR_SRCS)
-	@ ARCH="$(TARGET_ARCH)" CC="$(CC)" ./seccomp.sh seccomp_repair.h $(PASST_REPAIR_SRCS)
+	@ EXTRA_SYSCALLS="$(EXTRA_SYSCALLS)" ARCH="$(TARGET_ARCH)" CC="$(CC)" ./seccomp.sh $(PASST_SRCS) $(PASST_HEADERS)

 passt: $(PASST_SRCS) $(HEADERS)
 	$(CC) $(FLAGS) $(CFLAGS) $(CPPFLAGS) $(PASST_SRCS) -o passt $(LDFLAGS)
@ -106,19 +99,16 @@ pasta.avx2 pasta.1 pasta: pasta%: passt%
 qrap: $(QRAP_SRCS) passt.h
 	$(CC) $(FLAGS) $(CFLAGS) $(CPPFLAGS) -DARCH=\"$(TARGET_ARCH)\" $(QRAP_SRCS) -o qrap $(LDFLAGS)

-passt-repair: $(PASST_REPAIR_SRCS) seccomp_repair.h
-	$(CC) $(FLAGS) $(CFLAGS) $(CPPFLAGS) $(PASST_REPAIR_SRCS) -o passt-repair $(LDFLAGS)
-
 valgrind: EXTRA_SYSCALLS += rt_sigprocmask rt_sigtimedwait rt_sigaction	\
-			    rt_sigreturn getpid gettid kill clock_gettime \
-			    mmap|mmap2 munmap open unlink gettimeofday futex \
-			    statx readlink
+			    rt_sigreturn getpid gettid kill clock_gettime mmap \
+			    mmap2 munmap open unlink gettimeofday futex statx \
+			    readlink
 valgrind: FLAGS += -g -DVALGRIND
 valgrind: all

 .PHONY: clean
 clean:
-	$(RM) $(BIN) *~ *.o seccomp.h seccomp_repair.h pasta.1 \
+	$(RM) $(BIN) *~ *.o seccomp.h pasta.1 \
 		passt.tar passt.tar.gz *.deb *.rpm \
 		passt.pid README.plain.md

--- a/README.md
+++ b/README.md
@ -321,7 +321,7 @@ speeding up local connections, and usually requiring NAT. _pasta_:
  protocol
 * ✅ 4 to 50 times IPv4 TCP throughput of existing, conceptually similar
  solutions depending on MTU (UDP and IPv6 hard to compare)
-* ✅ [_vhost-user_ support](https://bugs.passt.top/show_bug.cgi?id=25) for
+* 🛠 [_vhost-user_ support](https://bugs.passt.top/show_bug.cgi?id=25) for
  maximum one copy on every data path and lower request-response latency
 * ⌚ [multithreading](https://bugs.passt.top/show_bug.cgi?id=13)
 * ⌚ [raw IP socket support](https://bugs.passt.top/show_bug.cgi?id=14) if
--- a/checksum.c
+++ b/checksum.c
@ -85,7 +85,7 @@
 */
 /* NOLINTNEXTLINE(clang-diagnostic-unknown-attributes) */
 __attribute__((optimize("-fno-strict-aliasing")))
-static uint32_t sum_16b(const void *buf, size_t len)
+uint32_t sum_16b(const void *buf, size_t len)
 {
 	const uint16_t *p = buf;
 	uint32_t sum = 0;
@ -107,7 +107,7 @@ static uint32_t sum_16b(const void *buf, size_t len)
 *
 * Return: 16-bit folded sum
 */
-static uint16_t csum_fold(uint32_t sum)
+uint16_t csum_fold(uint32_t sum)
 {
 	while (sum >> 16)
 		sum = (sum & 0xffff) + (sum >> 16);
@ -161,42 +161,29 @@ uint32_t proto_ipv4_header_psum(uint16_t l4len, uint8_t protocol,
 	return psum;
 }

-/**
- * csum() - Compute TCP/IP-style checksum
- * @buf:	Input buffer
- * @len:	Input length
- * @init:	Initial 32-bit checksum, 0 for no pre-computed checksum
- *
- * Return: 16-bit folded, complemented checksum
- */
-/* NOLINTNEXTLINE(clang-diagnostic-unknown-attributes) */
-__attribute__((optimize("-fno-strict-aliasing")))	/* See csum_16b() */
-static uint16_t csum(const void *buf, size_t len, uint32_t init)
-{
-	return (uint16_t)~csum_fold(csum_unfolded(buf, len, init));
-}
-
 /**
 * csum_udp4() - Calculate and set checksum for a UDP over IPv4 packet
 * @udp4hr:	UDP header, initialised apart from checksum
 * @saddr:	IPv4 source address
 * @daddr:	IPv4 destination address
- * @data:	UDP payload (as IO vector tail)
+ * @iov:	Pointer to the array of IO vectors
+ * @iov_cnt:	Length of the array
+ * @offset:	UDP payload offset in the iovec array
 */
 void csum_udp4(struct udphdr *udp4hr,
 	       struct in_addr saddr, struct in_addr daddr,
-	       struct iov_tail *data)
+	       const struct iovec *iov, int iov_cnt, size_t offset)
 {
 	/* UDP checksums are optional, so don't bother */
 	udp4hr->check = 0;

 	if (UDP4_REAL_CHECKSUMS) {
-		uint16_t l4len = iov_tail_size(data) + sizeof(struct udphdr);
+		uint16_t l4len = iov_size(iov, iov_cnt) - offset +
+				 sizeof(struct udphdr);
 		uint32_t psum = proto_ipv4_header_psum(l4len, IPPROTO_UDP,
 						       saddr, daddr);
-
 		psum = csum_unfolded(udp4hr, sizeof(struct udphdr), psum);
-		udp4hr->check = csum_iov_tail(data, psum);
+		udp4hr->check = csum_iov(iov, iov_cnt, offset, psum);
 	}
 }

@ -244,20 +231,22 @@ uint32_t proto_ipv6_header_psum(uint16_t payload_len, uint8_t protocol,
 * @udp6hr:	UDP header, initialised apart from checksum
 * @saddr:	Source address
 * @daddr:	Destination address
- * @data:	UDP payload (as IO vector tail)
+ * @iov:	Pointer to the array of IO vectors
+ * @iov_cnt:	Length of the array
+ * @offset:	UDP payload offset in the iovec array
 */
 void csum_udp6(struct udphdr *udp6hr,
 	       const struct in6_addr *saddr, const struct in6_addr *daddr,
-	       struct iov_tail *data)
+	       const struct iovec *iov, int iov_cnt, size_t offset)
 {
-	uint16_t l4len = iov_tail_size(data) + sizeof(struct udphdr);
+	uint16_t l4len = iov_size(iov, iov_cnt) - offset +
+			 sizeof(struct udphdr);
 	uint32_t psum = proto_ipv6_header_psum(l4len, IPPROTO_UDP,
 					       saddr, daddr);
-
 	udp6hr->check = 0;

 	psum = csum_unfolded(udp6hr, sizeof(struct udphdr), psum);
-	udp6hr->check = csum_iov_tail(data, psum);
+	udp6hr->check = csum_iov(iov, iov_cnt, offset, psum);
 }

 /**
@ -467,8 +456,7 @@ uint32_t csum_unfolded(const void *buf, size_t len, uint32_t init)
 	intptr_t align = ROUND_UP((intptr_t)buf, sizeof(__m256i));
 	unsigned int pad = align - (intptr_t)buf;

-	/* Don't mix sum_16b() and csum_avx2() with odd padding lengths */
-	if (pad & 1 || len < pad)
+	if (len < pad)
 		pad = len;

 	if (pad)
@ -498,23 +486,46 @@ uint32_t csum_unfolded(const void *buf, size_t len, uint32_t init)
 #endif /* !__AVX2__ */

 /**
- * csum_iov_tail() - Calculate unfolded checksum for the tail of an IO vector
- * @tail:	IO vector tail to checksum
+ * csum() - Compute TCP/IP-style checksum
+ * @buf:	Input buffer
+ * @len:	Input length
+ * @init:	Initial 32-bit checksum, 0 for no pre-computed checksum
+ *
+ * Return: 16-bit folded, complemented checksum
+ */
+/* NOLINTNEXTLINE(clang-diagnostic-unknown-attributes) */
+__attribute__((optimize("-fno-strict-aliasing")))	/* See csum_16b() */
+uint16_t csum(const void *buf, size_t len, uint32_t init)
+{
+	return (uint16_t)~csum_fold(csum_unfolded(buf, len, init));
+}
+
+/**
+ * csum_iov() - Calculates the unfolded checksum over an array of IO vectors
+ *
+ * @iov		Pointer to the array of IO vectors
+ * @n		Length of the array
+ * @offset:	Offset of the data to checksum within the full data length
 * @init	Initial 32-bit checksum, 0 for no pre-computed checksum
 *
 * Return: 16-bit folded, complemented checksum
 */
-uint16_t csum_iov_tail(struct iov_tail *tail, uint32_t init)
+uint16_t csum_iov(const struct iovec *iov, size_t n, size_t offset,
+		  uint32_t init)
 {
-	if (iov_tail_prune(tail)) {
-		size_t i;
+	unsigned int i;
+	size_t first;
+
+	i = iov_skip_bytes(iov, n, offset, &first);
+	if (i >= n)
+		return (uint16_t)~csum_fold(init);
+
+	init = csum_unfolded((char *)iov[i].iov_base + first,
+			     iov[i].iov_len - first, init);
+	i++;
+
+	for (; i < n; i++)
+		init = csum_unfolded(iov[i].iov_base, iov[i].iov_len, init);

-		init = csum_unfolded((char *)tail->iov[0].iov_base + tail->off,
-				     tail->iov[0].iov_len - tail->off, init);
-		for (i = 1; i < tail->cnt; i++) {
-			const struct iovec *iov = &tail->iov[i];
-			init = csum_unfolded(iov->iov_base, iov->iov_len, init);
-		}
-	}
 	return (uint16_t)~csum_fold(init);
 }
--- a/checksum.h
+++ b/checksum.h
@ -9,8 +9,9 @@
 struct udphdr;
 struct icmphdr;
 struct icmp6hdr;
-struct iov_tail;

+uint32_t sum_16b(const void *buf, size_t len);
+uint16_t csum_fold(uint32_t sum);
 uint16_t csum_unaligned(const void *buf, size_t len, uint32_t init);
 uint16_t csum_ip4_header(uint16_t l3len, uint8_t protocol,
 			 struct in_addr saddr, struct in_addr daddr);
@ -18,18 +19,20 @@ uint32_t proto_ipv4_header_psum(uint16_t l4len, uint8_t protocol,
 				struct in_addr saddr, struct in_addr daddr);
 void csum_udp4(struct udphdr *udp4hr,
 	       struct in_addr saddr, struct in_addr daddr,
-	       struct iov_tail *data);
+	       const struct iovec *iov, int iov_cnt, size_t offset);
 void csum_icmp4(struct icmphdr *icmp4hr, const void *payload, size_t dlen);
 uint32_t proto_ipv6_header_psum(uint16_t payload_len, uint8_t protocol,
 				const struct in6_addr *saddr,
 				const struct in6_addr *daddr);
 void csum_udp6(struct udphdr *udp6hr,
 	       const struct in6_addr *saddr, const struct in6_addr *daddr,
-	       struct iov_tail *data);
+	       const struct iovec *iov, int iov_cnt, size_t offset);
 void csum_icmp6(struct icmp6hdr *icmp6hr,
 		const struct in6_addr *saddr, const struct in6_addr *daddr,
 		const void *payload, size_t dlen);
 uint32_t csum_unfolded(const void *buf, size_t len, uint32_t init);
-uint16_t csum_iov_tail(struct iov_tail *tail, uint32_t init);
+uint16_t csum(const void *buf, size_t len, uint32_t init);
+uint16_t csum_iov(const struct iovec *iov, size_t n, size_t offset,
+		  uint32_t init);

 #endif /* CHECKSUM_H */
--- a/conf.c
+++ b/conf.c
@ -16,7 +16,6 @@
 #include <errno.h>
 #include <fcntl.h>
 #include <getopt.h>
-#include <libgen.h>
 #include <string.h>
 #include <sched.h>
 #include <sys/types.h>
@ -50,20 +49,6 @@

 #define NETNS_RUN_DIR	"/run/netns"

-#define IP4_LL_GUEST_ADDR	(struct in_addr){ htonl_constant(0xa9fe0201) }
-				/* 169.254.2.1, libslirp default: 10.0.2.1 */
-
-#define IP4_LL_GUEST_GW		(struct in_addr){ htonl_constant(0xa9fe0202) }
-				/* 169.254.2.2, libslirp default: 10.0.2.2 */
-
-#define IP4_LL_PREFIX_LEN	16
-
-#define IP6_LL_GUEST_GW		(struct in6_addr)			\
-				{{{ 0xfe, 0x80, 0, 0, 0, 0, 0, 0,	\
-				       0, 0, 0, 0, 0, 0, 0, 0x01 }}}
-
-const char *pasta_default_ifn = "tap0";
-
 /**
 * next_chunk - Return the next piece of a string delimited by a character
 * @s:		String to search
@ -124,75 +109,6 @@ static int parse_port_range(const char *s, char **endptr,
 	return 0;
 }

-/**
- * conf_ports_range_except() - Set up forwarding for a range of ports minus a
- *                             bitmap of exclusions
- * @c:		Execution context
- * @optname:	Short option name, t, T, u, or U
- * @optarg:	Option argument (port specification)
- * @fwd:	Pointer to @fwd_ports to be updated
- * @addr:	Listening address
- * @ifname:	Listening interface
- * @first:	First port to forward
- * @last:	Last port to forward
- * @exclude:	Bitmap of ports to exclude
- * @to:		Port to translate @first to when forwarding
- * @weak:	Ignore errors, as long as at least one port is mapped
- */
-static void conf_ports_range_except(const struct ctx *c, char optname,
-				    const char *optarg, struct fwd_ports *fwd,
-				    const union inany_addr *addr,
-				    const char *ifname,
-				    uint16_t first, uint16_t last,
-				    const uint8_t *exclude, uint16_t to,
-				    bool weak)
-{
-	bool bound_one = false;
-	unsigned i;
-	int ret;
-
-	if (first == 0) {
-		die("Can't forward port 0 for option '-%c %s'",
-		    optname, optarg);
-	}
-
-	for (i = first; i <= last; i++) {
-		if (bitmap_isset(exclude, i))
-			continue;
-
-		if (bitmap_isset(fwd->map, i)) {
-			warn(
-"Altering mapping of already mapped port number: %s", optarg);
-		}
-
-		bitmap_set(fwd->map, i);
-		fwd->delta[i] = to - first;
-
-		if (optname == 't')
-			ret = tcp_sock_init(c, addr, ifname, i);
-		else if (optname == 'u')
-			ret = udp_sock_init(c, 0, addr, ifname, i);
-		else
-			/* No way to check in advance for -T and -U */
-			ret = 0;
-
-		if (ret == -ENFILE || ret == -EMFILE) {
-			die("Can't open enough sockets for port specifier: %s",
-			    optarg);
-		}
-
-		if (!ret) {
-			bound_one = true;
-		} else if (!weak) {
-			die("Failed to bind port %u (%s) for option '-%c %s'",
-			    i, strerror_(-ret), optname, optarg);
-		}
-	}
-
-	if (!bound_one)
-		die("Failed to bind any port for '-%c %s'", optname, optarg);
-}
-
 /**
 * conf_ports() - Parse port configuration options, initialise UDP/TCP sockets
 * @c:		Execution context
@ -205,9 +121,10 @@ static void conf_ports(const struct ctx *c, char optname, const char *optarg,
 {
 	union inany_addr addr_buf = inany_any6, *addr = &addr_buf;
 	char buf[BUFSIZ], *spec, *ifname = NULL, *p;
+	bool exclude_only = true, bound_one = false;
 	uint8_t exclude[PORT_BITMAP_SIZE] = { 0 };
-	bool exclude_only = true;
 	unsigned i;
+	int ret;

 	if (!strcmp(optarg, "none")) {
 		if (fwd->mode)
@ -242,15 +159,32 @@ static void conf_ports(const struct ctx *c, char optname, const char *optarg,

 		fwd->mode = FWD_ALL;

-		/* Exclude ephemeral ports */
-		for (i = 0; i < NUM_PORTS; i++)
+		/* Skip port 0.  It has special meaning for many socket APIs, so
+		 * trying to bind it is not really safe.
+		 */
+		for (i = 1; i < NUM_PORTS; i++) {
 			if (fwd_port_is_ephemeral(i))
-				bitmap_set(exclude, i);
+				continue;
+
+			bitmap_set(fwd->map, i);
+			if (optname == 't') {
+				ret = tcp_sock_init(c, NULL, NULL, i);
+				if (ret == -ENFILE || ret == -EMFILE)
+					goto enfile;
+				if (!ret)
+					bound_one = true;
+			} else if (optname == 'u') {
+				ret = udp_sock_init(c, 0, NULL, NULL, i);
+				if (ret == -ENFILE || ret == -EMFILE)
+					goto enfile;
+				if (!ret)
+					bound_one = true;
+			}
+		}
+
+		if (!bound_one)
+			goto bind_all_fail;

-		conf_ports_range_except(c, optname, optarg, fwd,
-					NULL, NULL,
-					1, NUM_PORTS - 1, exclude,
-					1, true);
 		return;
 	}

@ -327,15 +261,37 @@ static void conf_ports(const struct ctx *c, char optname, const char *optarg,
 	} while ((p = next_chunk(p, ',')));

 	if (exclude_only) {
-		/* Exclude ephemeral ports */
-		for (i = 0; i < NUM_PORTS; i++)
-			if (fwd_port_is_ephemeral(i))
-				bitmap_set(exclude, i);
+		/* Skip port 0.  It has special meaning for many socket APIs, so
+		 * trying to bind it is not really safe.
+		 */
+		for (i = 1; i < NUM_PORTS; i++) {
+			if (fwd_port_is_ephemeral(i) ||
+			    bitmap_isset(exclude, i))
+				continue;
+
+			bitmap_set(fwd->map, i);
+
+			if (optname == 't') {
+				ret = tcp_sock_init(c, addr, ifname, i);
+				if (ret == -ENFILE || ret == -EMFILE)
+					goto enfile;
+				if (!ret)
+					bound_one = true;
+			} else if (optname == 'u') {
+				ret = udp_sock_init(c, 0, addr, ifname, i);
+				if (ret == -ENFILE || ret == -EMFILE)
+					goto enfile;
+				if (!ret)
+					bound_one = true;
+			} else {
+				/* No way to check in advance for -T and -U */
+				bound_one = true;
+			}
+		}
+
+		if (!bound_one)
+			goto bind_all_fail;

-		conf_ports_range_except(c, optname, optarg, fwd,
-					addr, ifname,
-					1, NUM_PORTS - 1, exclude,
-					1, true);
 		return;
 	}

@ -364,18 +320,40 @@ static void conf_ports(const struct ctx *c, char optname, const char *optarg,
 		if ((*p != '\0')  && (*p != ',')) /* Garbage after the ranges */
 			goto bad;

-		conf_ports_range_except(c, optname, optarg, fwd,
-					addr, ifname,
-					orig_range.first, orig_range.last,
-					exclude,
-					mapped_range.first, false);
+		for (i = orig_range.first; i <= orig_range.last; i++) {
+			if (bitmap_isset(fwd->map, i))
+				warn(
+"Altering mapping of already mapped port number: %s", optarg);
+
+			if (bitmap_isset(exclude, i))
+				continue;
+
+			bitmap_set(fwd->map, i);
+
+			fwd->delta[i] = mapped_range.first - orig_range.first;
+
+			ret = 0;
+			if (optname == 't')
+				ret = tcp_sock_init(c, addr, ifname, i);
+			else if (optname == 'u')
+				ret = udp_sock_init(c, 0, addr, ifname, i);
+			if (ret)
+				goto bind_fail;
+		}
 	} while ((p = next_chunk(p, ',')));

 	return;
+enfile:
+	die("Can't open enough sockets for port specifier: %s", optarg);
 bad:
 	die("Invalid port specifier %s", optarg);
 mode_conflict:
 	die("Port forwarding mode '%s' conflicts with previous mode", optarg);
+bind_fail:
+	die("Failed to bind port %u (%s) for option '-%c %s', exiting",
+	    i, strerror(-ret), optname, optarg);
+bind_all_fail:
+	die("Failed to bind any port for '-%c %s', exiting", optname, optarg);
 }

 /**
@ -434,12 +412,10 @@ static void add_dns_resolv(struct ctx *c, const char *nameserver,
 		if (IN4_IS_ADDR_UNSPECIFIED(&c->ip4.dns_host))
 			c->ip4.dns_host = ns4;

-		/* Special handling if guest or container can only access local
-		 * addresses via redirect, or if the host gateway is also a
-		 * resolver and we shadow its address
+		/* Guest or container can only access local addresses via
+		 * redirect
 		 */
-		if (IN4_IS_ADDR_LOOPBACK(&ns4) ||
-		    IN4_ARE_ADDR_EQUAL(&ns4, &c->ip4.map_host_loopback)) {
+		if (IN4_IS_ADDR_LOOPBACK(&ns4)) {
 			if (IN4_IS_ADDR_UNSPECIFIED(&c->ip4.map_host_loopback))
 				return;

@ -455,12 +431,10 @@ static void add_dns_resolv(struct ctx *c, const char *nameserver,
 		if (IN6_IS_ADDR_UNSPECIFIED(&c->ip6.dns_host))
 			c->ip6.dns_host = ns6;

-		/* Special handling if guest or container can only access local
-		 * addresses via redirect, or if the host gateway is also a
-		 * resolver and we shadow its address
+		/* Guest or container can only access local addresses via
+		 * redirect
 		 */
-		if (IN6_IS_ADDR_LOOPBACK(&ns6) ||
-		    IN6_ARE_ADDR_EQUAL(&ns6, &c->ip6.map_host_loopback)) {
+		if (IN6_IS_ADDR_LOOPBACK(&ns6)) {
 			if (IN6_IS_ADDR_UNSPECIFIED(&c->ip6.map_host_loopback))
 				return;

@ -658,7 +632,7 @@ static unsigned int conf_ip4(unsigned int ifi, struct ip4_ctx *ip4)
 		ifi = nl_get_ext_if(nl_sock, AF_INET);

 	if (!ifi) {
-		debug("Failed to detect external interface for IPv4");
+		info("Couldn't pick external interface: disabling IPv4");
 		return 0;
 	}

@ -666,8 +640,8 @@ static unsigned int conf_ip4(unsigned int ifi, struct ip4_ctx *ip4)
 		int rc = nl_route_get_def(nl_sock, ifi, AF_INET,
 					  &ip4->guest_gw);
 		if (rc < 0) {
-			debug("Couldn't discover IPv4 gateway address: %s",
-			      strerror_(-rc));
+			err("Couldn't discover IPv4 gateway address: %s",
+			    strerror(-rc));
 			return 0;
 		}
 	}
@ -676,8 +650,8 @@ static unsigned int conf_ip4(unsigned int ifi, struct ip4_ctx *ip4)
 		int rc = nl_addr_get(nl_sock, ifi, AF_INET,
 				     &ip4->addr, &ip4->prefix_len, NULL);
 		if (rc < 0) {
-			debug("Couldn't discover IPv4 address: %s",
-			      strerror_(-rc));
+			err("Couldn't discover IPv4 address: %s",
+			    strerror(-rc));
 			return 0;
 		}
 	}
@ -704,19 +678,6 @@ static unsigned int conf_ip4(unsigned int ifi, struct ip4_ctx *ip4)
 	return ifi;
 }

-/**
- * conf_ip4_local() - Configure IPv4 addresses and attributes for local mode
- * @ip4:	IPv4 context (will be written)
- */
-static void conf_ip4_local(struct ip4_ctx *ip4)
-{
-	ip4->addr_seen = ip4->addr = IP4_LL_GUEST_ADDR;
-	ip4->our_tap_addr = ip4->guest_gw = IP4_LL_GUEST_GW;
-	ip4->prefix_len = IP4_LL_PREFIX_LEN;
-
-	ip4->no_copy_addrs = ip4->no_copy_routes = true;
-}
-
 /**
 * conf_ip6() - Verify or detect IPv6 support, get relevant addresses
 * @ifi:	Host interface to attempt (0 to determine one)
@ -733,15 +694,15 @@ static unsigned int conf_ip6(unsigned int ifi, struct ip6_ctx *ip6)
 		ifi = nl_get_ext_if(nl_sock, AF_INET6);

 	if (!ifi) {
-		debug("Failed to detect external interface for IPv6");
+		info("Couldn't pick external interface: disabling IPv6");
 		return 0;
 	}

 	if (IN6_IS_ADDR_UNSPECIFIED(&ip6->guest_gw)) {
 		rc = nl_route_get_def(nl_sock, ifi, AF_INET6, &ip6->guest_gw);
 		if (rc < 0) {
-			debug("Couldn't discover IPv6 gateway address: %s",
-			      strerror_(-rc));
+			err("Couldn't discover IPv6 gateway address: %s",
+			    strerror(-rc));
 			return 0;
 		}
 	}
@ -750,7 +711,7 @@ static unsigned int conf_ip6(unsigned int ifi, struct ip6_ctx *ip6)
 			 IN6_IS_ADDR_UNSPECIFIED(&ip6->addr) ? &ip6->addr : NULL,
 			 &prefix_len, &ip6->our_tap_ll);
 	if (rc < 0) {
-		debug("Couldn't discover IPv6 address: %s", strerror_(-rc));
+		err("Couldn't discover IPv6 address: %s", strerror(-rc));
 		return 0;
 	}

@ -766,22 +727,11 @@ static unsigned int conf_ip6(unsigned int ifi, struct ip6_ctx *ip6)
 	return ifi;
 }

-/**
- * conf_ip6_local() - Configure IPv6 addresses and attributes for local mode
- * @ip6:	IPv6 context (will be written)
- */
-static void conf_ip6_local(struct ip6_ctx *ip6)
-{
-	ip6->our_tap_ll = ip6->guest_gw = IP6_LL_GUEST_GW;
-
-	ip6->no_copy_addrs = ip6->no_copy_routes = true;
-}
-
 /**
 * usage() - Print usage, exit with given status code
 * @name:	Executable name
 * @f:		Stream to print usage info to
- * @status:	Status code for _exit()
+ * @status:	Status code for exit()
 */
 static void usage(const char *name, FILE *f, int status)
 {
@ -828,9 +778,6 @@ static void usage(const char *name, FILE *f, int status)
 			"    UNIX domain socket is provided by -s option\n"
 			"  --print-capabilities	print back-end capabilities in JSON format,\n"
 			"    only meaningful for vhost-user mode\n");
-		FPRINTF(f,
-			"  --repair-path PATH	path for passt-repair(1)\n"
-			"    default: append '.repair' to UNIX domain path\n");
 	}

 	FPRINTF(f,
@ -847,7 +794,7 @@ static void usage(const char *name, FILE *f, int status)
 		"  -n, --netmask MASK	Assign IPv4 MASK, dot-decimal or bits\n"
 		"    default: netmask from matching address on the host\n"
 		"  -M, --mac-addr ADDR	Use source MAC address ADDR\n"
-		"    default: 9a:55:9a:55:9a:55 (locally administered)\n"
+		"    default: MAC address from interface with default route\n"
 		"  -g, --gateway ADDR	Pass IPv4 or IPv6 address as gateway\n"
 		"    default: gateway from interface with default route\n"
 		"  -i, --interface NAME	Interface for addresses and routes\n"
@ -869,9 +816,7 @@ static void usage(const char *name, FILE *f, int status)
 		FPRINTF(f, "    default: use addresses from /etc/resolv.conf\n");
 	FPRINTF(f,
 		"  -S, --search LIST	Space-separated list, search domains\n"
-		"    a single, empty option disables the DNS search list\n"
-		"  -H, --hostname NAME 	Hostname to configure client with\n"
-		"  --fqdn NAME		FQDN to configure client with\n");
+		"    a single, empty option disables the DNS search list\n");
 	if (strstr(name, "pasta"))
 		FPRINTF(f, "    default: don't use any search list\n");
 	else
@ -942,7 +887,7 @@ static void usage(const char *name, FILE *f, int status)
 		"    SPEC is as described for TCP above\n"
 		"    default: none\n");

-	_exit(status);
+	exit(status);

 pasta_opts:

@ -980,7 +925,8 @@ pasta_opts:
 		"  -U, --udp-ns SPEC	UDP port forwarding to init namespace\n"
 		"    SPEC is as described above\n"
 		"    default: auto\n"
-		"  --host-lo-to-ns-lo	Translate host-loopback forwards to\n"
+		"  --host-lo-to-ns-lo	DEPRECATED:\n"
+		"			Translate host-loopback forwards to\n"
 		"			namespace loopback\n"
 		"  --userns NSPATH 	Target user namespace to join\n"
 		"  --netns PATH|NAME	Target network namespace to join\n"
@ -993,49 +939,9 @@ pasta_opts:
 		"			Don't copy all routes to namespace\n"
 		"  --no-copy-addrs	DEPRECATED:\n"
 		"			Don't copy all addresses to namespace\n"
-		"  --ns-mac-addr ADDR	Set MAC address on tap interface\n"
-		"  --no-splice		Disable inbound socket splicing\n");
+		"  --ns-mac-addr ADDR	Set MAC address on tap interface\n");

-	_exit(status);
-}
-
-/**
- * conf_mode() - Determine passt/pasta's operating mode from command line
- * @argc:	Argument count
- * @argv:	Command line arguments
- *
- * Return: mode to operate in, PASTA or PASST
- */
-enum passt_modes conf_mode(int argc, char *argv[])
-{
-	int vhost_user = 0;
-	const struct option optvu[] = {
-		{"vhost-user",	no_argument,		&vhost_user,	1 },
-		{ 0 },
-	};
-	char argv0[PATH_MAX], *basearg0;
-	int name;
-
-	optind = 0;
-	do {
-		name = getopt_long(argc, argv, "-:", optvu, NULL);
-	} while (name != -1);
-
-	if (vhost_user)
-		return MODE_VU;
-
-	if (argc < 1)
-		die("Cannot determine argv[0]");
-
-	strncpy(argv0, argv[0], PATH_MAX - 1);
-	basearg0 = basename(argv0);
-	if (strstr(basearg0, "pasta"))
-		return MODE_PASTA;
-
-	if (strstr(basearg0, "passt"))
-		return MODE_PASST;
-
-	die("Cannot determine mode, invoke as \"passt\" or \"pasta\"");
+	exit(status);
 }

 /**
@ -1048,14 +954,12 @@ static void conf_print(const struct ctx *c)
 	char bufmac[ETH_ADDRSTRLEN], ifn[IFNAMSIZ];
 	int i;

-	if (c->ifi4 > 0 || c->ifi6 > 0) {
-		info("Template interface: %s%s%s%s%s",
-		     c->ifi4 > 0 ? if_indextoname(c->ifi4, ifn) : "",
-		     c->ifi4 > 0 ? " (IPv4)" : "",
-		     (c->ifi4 && c->ifi6) ? ", " : "",
-		     c->ifi6 > 0 ? if_indextoname(c->ifi6, ifn) : "",
-		     c->ifi6 > 0 ? " (IPv6)" : "");
-	}
+	info("Template interface: %s%s%s%s%s",
+	     c->ifi4 ? if_indextoname(c->ifi4, ifn) : "",
+	     c->ifi4 ? " (IPv4)" : "",
+	     (c->ifi4 && c->ifi6) ? ", " : "",
+	     c->ifi6 ? if_indextoname(c->ifi6, ifn) : "",
+	     c->ifi6 ? " (IPv6)" : "");

 	if (*c->ip4.ifname_out || *c->ip6.ifname_out) {
 		info("Outbound interface: %s%s%s%s%s",
@ -1126,9 +1030,9 @@ static void conf_print(const struct ctx *c)

 		if (!c->no_ndp && !c->no_dhcpv6)
 			info("NDP/DHCPv6:");
-		else if (!c->no_dhcpv6)
-			info("DHCPv6:");
 		else if (!c->no_ndp)
+			info("DHCPv6:");
+		else if (!c->no_dhcpv6)
 			info("NDP:");
 		else
 			goto dns6;
@ -1272,8 +1176,6 @@ static void conf_nat(const char *arg, struct in_addr *addr4,
 		*addr6 = in6addr_any;
 		if (no_map_gw)
 			*no_map_gw = 1;
-
-		return;
 	}

 	if (inet_pton(AF_INET6, arg, addr6)	&&
@ -1297,25 +1199,8 @@ static void conf_nat(const char *arg, struct in_addr *addr4,
 */
 static void conf_open_files(struct ctx *c)
 {
-	if (c->mode != MODE_PASTA && c->fd_tap == -1) {
-		c->fd_tap_listen = sock_unix(c->sock_path);
-
-		if (c->mode == MODE_VU && strcmp(c->repair_path, "none")) {
-			if (!*c->repair_path &&
-			    snprintf_check(c->repair_path,
-					   sizeof(c->repair_path), "%s.repair",
-					   c->sock_path)) {
-				warn("passt-repair path %s not usable",
-				     c->repair_path);
-				c->fd_repair_listen = -1;
-			} else {
-				c->fd_repair_listen = sock_unix(c->repair_path);
-			}
-		} else {
-			c->fd_repair_listen = -1;
-		}
-		c->fd_repair = -1;
-	}
+	if (c->mode != MODE_PASTA && c->fd_tap == -1)
+		c->fd_tap_listen = tap_sock_unix_open(c->sock_path);

 	if (*c->pidfile) {
 		c->pidfile_fd = output_file_open(c->pidfile, O_WRONLY);
@ -1387,7 +1272,6 @@ void conf(struct ctx *c, int argc, char **argv)
 		{"outbound",	required_argument,	NULL,		'o' },
 		{"dns",		required_argument,	NULL,		'D' },
 		{"search",	required_argument,	NULL,		'S' },
-		{"hostname",	required_argument,	NULL,		'H' },
 		{"no-tcp",	no_argument,		&c->no_tcp,	1 },
 		{"no-udp",	no_argument,		&c->no_udp,	1 },
 		{"no-icmp",	no_argument,		&c->no_icmp,	1 },
@ -1395,7 +1279,6 @@ void conf(struct ctx *c, int argc, char **argv)
 		{"no-dhcpv6",	no_argument,		&c->no_dhcpv6,	1 },
 		{"no-ndp",	no_argument,		&c->no_ndp,	1 },
 		{"no-ra",	no_argument,		&c->no_ra,	1 },
-		{"no-splice",	no_argument,		&c->no_splice,	1 },
 		{"freebind",	no_argument,		&c->freebind,	1 },
 		{"no-map-gw",	no_argument,		&no_map_gw,	1 },
 		{"ipv4-only",	no_argument,		NULL,		'4' },
@ -1429,25 +1312,21 @@ void conf(struct ctx *c, int argc, char **argv)
 		{"host-lo-to-ns-lo", no_argument, 	NULL,		23 },
 		{"dns-host",	required_argument,	NULL,		24 },
 		{"vhost-user",	no_argument,		NULL,		25 },
-
 		/* vhost-user backend program convention */
 		{"print-capabilities", no_argument,	NULL,		26 },
 		{"socket-path",	required_argument,	NULL,		's' },
-		{"fqdn",	required_argument,	NULL,		27 },
-		{"repair-path",	required_argument,	NULL,		28 },
 		{ 0 },
 	};
-	const char *optstring = "+dqfel:hs:F:I:p:P:m:a:n:M:g:i:o:D:S:H:461t:u:T:U:";
 	const char *logname = (c->mode == MODE_PASTA) ? "pasta" : "passt";
 	char userns[PATH_MAX] = { 0 }, netns[PATH_MAX] = { 0 };
 	bool copy_addrs_opt = false, copy_routes_opt = false;
 	enum fwd_ports_mode fwd_default = FWD_NONE;
 	bool v4_only = false, v6_only = false;
 	unsigned dns4_idx = 0, dns6_idx = 0;
-	unsigned long max_mtu = IP_MAX_MTU;
 	struct fqdn *dnss = c->dns_search;
 	unsigned int ifi4 = 0, ifi6 = 0;
 	const char *logfile = NULL;
+	const char *optstring;
 	size_t logsize = 0;
 	char *runas = NULL;
 	long fd_tap_opt;
@ -1458,11 +1337,11 @@ void conf(struct ctx *c, int argc, char **argv)
 	if (c->mode == MODE_PASTA) {
 		c->no_dhcp_dns = c->no_dhcp_dns_search = 1;
 		fwd_default = FWD_AUTO;
+		optstring = "+dqfel:hF:I:p:P:m:a:n:M:g:i:o:D:S:46t:u:T:U:";
+	} else {
+		optstring = "+dqfel:hs:F:p:P:m:a:n:M:g:i:o:D:S:461t:u:";
 	}

-	if (tap_l2_max_len(c) - ETH_HLEN < max_mtu)
-		max_mtu = tap_l2_max_len(c) - ETH_HLEN;
-	c->mtu = ROUND_DOWN(max_mtu, sizeof(uint32_t));
 	c->tcp.fwd_in.mode = c->tcp.fwd_out.mode = FWD_UNSET;
 	c->udp.fwd_in.mode = c->udp.fwd_out.mode = FWD_UNSET;
 	memcpy(c->our_tap_mac, MAC_OUR_LAA, ETH_ALEN);
@ -1561,7 +1440,7 @@ void conf(struct ctx *c, int argc, char **argv)
 			FPRINTF(stdout,
 				c->mode == MODE_PASTA ? "pasta " : "passt ");
 			FPRINTF(stdout, VERSION_BLOB);
-			_exit(EXIT_SUCCESS);
+			exit(EXIT_SUCCESS);
 		case 15:
 			ret = snprintf(c->ip4.ifname_out,
 				       sizeof(c->ip4.ifname_out), "%s", optarg);
@ -1630,26 +1509,14 @@ void conf(struct ctx *c, int argc, char **argv)

 			die("Invalid host nameserver address: %s", optarg);
 		case 25:
-			/* Already handled in conf_mode() */
-			ASSERT(c->mode == MODE_VU);
+			if (c->mode == MODE_PASTA) {
+				err("--vhost-user is for passt mode only");
+				usage(argv[0], stdout, EXIT_SUCCESS);
+			}
+			c->mode = MODE_VU;
 			break;
 		case 26:
 			vu_print_capabilities();
-			break;
-		case 27:
-			if (snprintf_check(c->fqdn, PASST_MAXDNAME,
-					   "%s", optarg))
-				die("Invalid FQDN: %s", optarg);
-			break;
-		case 28:
-			if (c->mode != MODE_VU && strcmp(optarg, "none"))
-				die("--repair-path is for vhost-user mode only");
-
-			if (snprintf_check(c->repair_path,
-					   sizeof(c->repair_path), "%s",
-					   optarg))
-				die("Invalid passt-repair path: %s", optarg);
-
 			break;
 		case 'd':
 			c->debug = 1;
@ -1669,9 +1536,6 @@ void conf(struct ctx *c, int argc, char **argv)
 			c->foreground = 1;
 			break;
 		case 's':
-			if (c->mode == MODE_PASTA)
-				die("-s is for passt / vhost-user mode only");
-
 			ret = snprintf(c->sock_path, sizeof(c->sock_path), "%s",
 				       optarg);
 			if (ret <= 0 || ret >= (int)sizeof(c->sock_path))
@ -1692,9 +1556,6 @@ void conf(struct ctx *c, int argc, char **argv)
 			*c->sock_path = 0;
 			break;
 		case 'I':
-			if (c->mode != MODE_PASTA)
-				die("-I is for pasta mode only");
-
 			ret = snprintf(c->pasta_ifn, IFNAMSIZ, "%s",
 				       optarg);
 			if (ret <= 0 || ret >= IFNAMSIZ)
@ -1714,24 +1575,20 @@ void conf(struct ctx *c, int argc, char **argv)
 				die("Invalid PID file: %s", optarg);

 			break;
-		case 'm': {
-			unsigned long mtu;
-			char *e;
-
+		case 'm':
 			errno = 0;
-			mtu = strtoul(optarg, &e, 0);
+			c->mtu = strtol(optarg, NULL, 0);

-			if (errno || *e)
-				die("Invalid MTU: %s", optarg);
-
-			if (mtu > max_mtu) {
-				die("MTU %lu too large (max %lu)",
-				    mtu, max_mtu);
+			if (!c->mtu) {
+				c->mtu = -1;
+				break;
 			}

-			c->mtu = mtu;
+			if (c->mtu < ETH_MIN_MTU || c->mtu > (int)ETH_MAX_MTU ||
+			    errno)
+				die("Invalid MTU: %s", optarg);
+
 			break;
-		}
 		case 'a':
 			if (inet_pton(AF_INET6, optarg, &c->ip6.addr)	&&
 			    !IN6_IS_ADDR_UNSPECIFIED(&c->ip6.addr)	&&
@ -1830,11 +1687,6 @@ void conf(struct ctx *c, int argc, char **argv)

 			die("Cannot use DNS search domain %s", optarg);
 			break;
-		case 'H':
-			if (snprintf_check(c->hostname, PASST_MAXDNAME,
-					   "%s", optarg))
-				die("Invalid hostname: %s", optarg);
-			break;
 		case '4':
 			v4_only = true;
 			v6_only = false;
@ -1851,15 +1703,10 @@ void conf(struct ctx *c, int argc, char **argv)
 			break;
 		case 't':
 		case 'u':
-		case 'D':
-			/* Handle these later, once addresses are configured */
-			break;
 		case 'T':
 		case 'U':
-			if (c->mode != MODE_PASTA)
-				die("-%c is for pasta mode only", name);
-
-			/* Handle properly later, once addresses are configured */
+		case 'D':
+			/* Handle these later, once addresses are configured */
 			break;
 		case 'h':
 			usage(argv[0], stdout, EXIT_SUCCESS);
@ -1871,9 +1718,6 @@ void conf(struct ctx *c, int argc, char **argv)
 		}
 	} while (name != -1);

-	if (c->mode != MODE_PASTA)
-		c->no_splice = 1;
-
 	if (c->mode == MODE_PASTA && !c->pasta_conf_ns) {
 		if (copy_routes_opt)
 			die("--no-copy-routes needs --config-net");
@ -1908,36 +1752,11 @@ void conf(struct ctx *c, int argc, char **argv)
 		c->ifi4 = conf_ip4(ifi4, &c->ip4);
 	if (!v4_only)
 		c->ifi6 = conf_ip6(ifi6, &c->ip6);
-
-	if (c->ifi4 && c->mtu < IPV4_MIN_MTU) {
-		warn("MTU %"PRIu16" is too small for IPv4 (minimum %u)",
-		     c->mtu, IPV4_MIN_MTU);
-	}
-	if (c->ifi6 && c->mtu < IPV6_MIN_MTU) {
-		warn("MTU %"PRIu16" is too small for IPv6 (minimum %u)",
-			     c->mtu, IPV6_MIN_MTU);
-	}
-
-	if ((*c->ip4.ifname_out && !c->ifi4) ||
+	if ((!c->ifi4 && !c->ifi6) ||
+	    (*c->ip4.ifname_out && !c->ifi4) ||
 	    (*c->ip6.ifname_out && !c->ifi6))
 		die("External interface not usable");

-
-	if (!c->ifi4 && !c->ifi6) {
-		info("No external interface as template, switch to local mode");
-
-		conf_ip4_local(&c->ip4);
-		c->ifi4 = -1;
-
-		conf_ip6_local(&c->ip6);
-		c->ifi6 = -1;
-
-		if (!*c->pasta_ifn) {
-			strncpy(c->pasta_ifn, pasta_default_ifn,
-				sizeof(c->pasta_ifn) - 1);
-		}
-	}
-
 	if (c->ifi4 && !no_map_gw &&
 	    IN4_IS_ADDR_UNSPECIFIED(&c->ip4.map_host_loopback))
 		c->ip4.map_host_loopback = c->ip4.guest_gw;
@ -1949,8 +1768,8 @@ void conf(struct ctx *c, int argc, char **argv)
 	if (c->ifi4 && IN4_IS_ADDR_UNSPECIFIED(&c->ip4.guest_gw))
 		c->no_dhcp = 1;

-	/* Inbound port options and DNS can be parsed now, after IPv4/IPv6
-	 * settings
+	/* Inbound port options & DNS can be parsed now (after IPv4/IPv6
+	 * settings)
 	 */
 	fwd_probe_ephemeral();
 	udp_portmap_clear();
@ -2040,16 +1859,17 @@ void conf(struct ctx *c, int argc, char **argv)
 	if (!c->ifi6) {
 		c->no_ndp = 1;
 		c->no_dhcpv6 = 1;
-	} else if (IN6_IS_ADDR_UNSPECIFIED(&c->ip6.addr)) {
-		c->no_dhcpv6 = 1;
 	}

+	if (!c->mtu)
+		c->mtu = ROUND_DOWN(ETH_MAX_MTU - ETH_HLEN, sizeof(uint32_t));
+
 	get_dns(c);

 	if (!*c->pasta_ifn) {
-		if (c->ifi4 > 0)
+		if (c->ifi4)
 			if_indextoname(c->ifi4, c->pasta_ifn);
-		else if (c->ifi6 > 0)
+		else
 			if_indextoname(c->ifi6, c->pasta_ifn);
 	}

--- a/conf.h
+++ b/conf.h
@ -6,7 +6,6 @@
 #ifndef CONF_H
 #define CONF_H

-enum passt_modes conf_mode(int argc, char *argv[]);
 void conf(struct ctx *c, int argc, char **argv);

 #endif /* CONF_H */
--- a/contrib/apparmor/usr.bin.passt
+++ b/contrib/apparmor/usr.bin.passt
@ -27,25 +27,4 @@ profile passt /usr/bin/passt{,.avx2} {

  owner @{HOME}/**			w,	# pcap(), pidfile_open(),
 						# pidfile_write()
-
-  # Workaround: libvirt's profile comes with a passt subprofile which includes,
-  # in turn, <abstractions/passt>, and adds libvirt-specific rules on top, to
-  # allow passt (when started by libvirtd) to write socket and PID files in the
-  # location requested by libvirtd itself, and to execute passt itself.
-  #
-  # However, when libvirt runs as unprivileged user, the mechanism based on
-  # virt-aa-helper, designed to build per-VM profiles as guests are started,
-  # doesn't work. The helper needs to create and load profiles on the fly, which
-  # can't be done by unprivileged users, of course.
-  #
-  # As a result, libvirtd runs unconfined if guests are started by unprivileged
-  # users, starting passt unconfined as well, which means that passt runs under
-  # its own stand-alone profile (this one), which implies in turn that execve()
-  # of /usr/bin/passt is not allowed, and socket and PID files can't be written.
-  #
-  # Duplicate libvirt-specific rules here as long as this is not solved in
-  # libvirt's profile itself.
-  /usr/bin/passt r,
-  owner @{run}/user/[0-9]*/libvirt/qemu/run/passt/* rw,
-  owner @{run}/libvirt/qemu/passt/* rw,
 }
--- a/contrib/apparmor/usr.bin.passt-repair
+++ b/contrib/apparmor/usr.bin.passt-repair
@ -1,29 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0-or-later
-#
-# PASST - Plug A Simple Socket Transport
-#  for qemu/UNIX domain socket mode
-#
-# PASTA - Pack A Subtle Tap Abstraction
-#  for network namespace/tap device mode
-#
-# contrib/apparmor/usr.bin.passt-repair - AppArmor profile for passt-repair(1)
-#
-# Copyright (c) 2025 Red Hat GmbH
-# Author: Stefano Brivio <sbrivio@redhat.com>
-
-abi <abi/3.0>,
-
-#include <tunables/global>
-
-profile passt-repair /usr/bin/passt-repair {
-  #include <abstractions/base>
-  /** rw,			# passt's ".repair" socket might be anywhere
-  unix (connect, receive, send) type=stream,
-
-  capability dac_override,	# connect to passt's socket as root
-  capability net_admin,		# currently needed for TCP_REPAIR socket option
-  capability net_raw,		# what TCP_REPAIR should require instead
-
-  network unix stream,		# connect and use UNIX domain socket
-  network inet stream,		# use TCP sockets
-}
--- a/contrib/fedora/passt.spec
+++ b/contrib/fedora/passt.spec
@ -44,7 +44,7 @@ Requires(preun): %{name}
 Requires(preun): policycoreutils

 %description selinux
-This package adds SELinux enforcement to passt(1), pasta(1), passt-repair(1).
+This package adds SELinux enforcement to passt(1) and pasta(1).

 %prep
 %setup -q -n passt-%{git_hash}
@ -82,7 +82,6 @@ make -f %{_datadir}/selinux/devel/Makefile
 install -p -m 644 -D passt.pp %{buildroot}%{_datadir}/selinux/packages/%{selinuxtype}/passt.pp
 install -p -m 644 -D passt.if %{buildroot}%{_datadir}/selinux/devel/include/distributed/passt.if
 install -p -m 644 -D pasta.pp %{buildroot}%{_datadir}/selinux/packages/%{selinuxtype}/pasta.pp
-install -p -m 644 -D passt-repair.pp %{buildroot}%{_datadir}/selinux/packages/%{selinuxtype}/passt-repair.pp
 popd

 %pre selinux
@ -91,13 +90,11 @@ popd
 %post selinux
 %selinux_modules_install -s %{selinuxtype} %{_datadir}/selinux/packages/%{selinuxtype}/passt.pp
 %selinux_modules_install -s %{selinuxtype} %{_datadir}/selinux/packages/%{selinuxtype}/pasta.pp
-%selinux_modules_install -s %{selinuxtype} %{_datadir}/selinux/packages/%{selinuxtype}/passt-repair.pp

 %postun selinux
 if [ $1 -eq 0 ]; then
 	%selinux_modules_uninstall -s %{selinuxtype} passt
 	%selinux_modules_uninstall -s %{selinuxtype} pasta
-	%selinux_modules_uninstall -s %{selinuxtype} passt-repair
 fi

 %posttrans selinux
@ -111,11 +108,9 @@ fi
 %{_bindir}/passt
 %{_bindir}/pasta
 %{_bindir}/qrap
-%{_bindir}/passt-repair
 %{_mandir}/man1/passt.1*
 %{_mandir}/man1/pasta.1*
 %{_mandir}/man1/qrap.1*
-%{_mandir}/man1/passt-repair.1*
 %ifarch x86_64
 %{_bindir}/passt.avx2
 %{_mandir}/man1/passt.avx2.1*
@ -127,7 +122,6 @@ fi
 %{_datadir}/selinux/packages/%{selinuxtype}/passt.pp
 %{_datadir}/selinux/devel/include/distributed/passt.if
 %{_datadir}/selinux/packages/%{selinuxtype}/pasta.pp
-%{_datadir}/selinux/packages/%{selinuxtype}/passt-repair.pp

 %changelog
 {{{ passt_git_changelog }}}
--- a/contrib/selinux/passt-repair.fc
+++ b/contrib/selinux/passt-repair.fc
@ -1,11 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0-or-later
-#
-# PASST - Plug A Simple Socket Transport
-#  for qemu/UNIX domain socket mode
-#
-# contrib/selinux/passt-repair.fc - SELinux: File Context for passt-repair
-#
-# Copyright (c) 2025 Red Hat GmbH
-# Author: Stefano Brivio <sbrivio@redhat.com>
-
-/usr/bin/passt-repair		system_u:object_r:passt_repair_exec_t:s0
--- a/contrib/selinux/passt-repair.te
+++ b/contrib/selinux/passt-repair.te
@ -1,87 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0-or-later
-#
-# PASST - Plug A Simple Socket Transport
-#  for qemu/UNIX domain socket mode
-#
-# contrib/selinux/passt-repair.te - SELinux: Type Enforcement for passt-repair
-#
-# Copyright (c) 2025 Red Hat GmbH
-# Author: Stefano Brivio <sbrivio@redhat.com>
-
-policy_module(passt-repair, 0.1)
-
-require {
-	type unconfined_t;
-	type passt_t;
-	role unconfined_r;
-	class process transition;
-
-	class file { read execute execute_no_trans entrypoint open map };
-	class capability { dac_override net_admin net_raw };
-	class chr_file { append open getattr read write ioctl };
-
-	class unix_stream_socket { create connect sendto };
-	class sock_file { read write };
-
-	class tcp_socket { read setopt write };
-
-	type console_device_t;
-	type user_devpts_t;
-	type user_tmp_t;
-
-	# Workaround: passt-repair needs to needs to access socket files
-	# that passt, started by libvirt, might create under different
-	# labels, depending on whether passt is started as root or not.
-	#
-	# However, libvirt doesn't maintain its own policy, which makes
-	# updates particularly complicated. To avoid breakage in the short
-	# term, deal with that in passt's own policy.
-	type qemu_var_run_t;
-	type virt_var_run_t;
-}
-
-type passt_repair_t;
-domain_type(passt_repair_t);
-type passt_repair_exec_t;
-corecmd_executable_file(passt_repair_exec_t);
-
-role unconfined_r types passt_repair_t;
-
-allow passt_repair_t passt_repair_exec_t:file { read execute execute_no_trans entrypoint open map };
-type_transition unconfined_t passt_repair_exec_t:process passt_repair_t;
-allow unconfined_t passt_repair_t:process transition;
-
-allow passt_repair_t self:capability { dac_override dac_read_search net_admin net_raw };
-allow passt_repair_t self:capability2 bpf;
-
-allow passt_repair_t console_device_t:chr_file { append open getattr read write ioctl };
-allow passt_repair_t user_devpts_t:chr_file { append open getattr read write ioctl };
-
-allow passt_repair_t unconfined_t:unix_stream_socket { connectto read write };
-allow passt_repair_t passt_t:unix_stream_socket { connectto read write };
-allow passt_repair_t user_tmp_t:unix_stream_socket { connectto read write };
-
-allow passt_repair_t user_tmp_t:dir { getattr read search watch };
-
-allow passt_repair_t unconfined_t:sock_file { getattr read write };
-allow passt_repair_t passt_t:sock_file { getattr read write };
-allow passt_repair_t user_tmp_t:sock_file { getattr read write };
-
-allow passt_repair_t unconfined_t:tcp_socket { read setopt write };
-allow passt_repair_t passt_t:tcp_socket { read setopt write };
-
-# Workaround: passt-repair needs to needs to access socket files
-# that passt, started by libvirt, might create under different
-# labels, depending on whether passt is started as root or not.
-#
-# However, libvirt doesn't maintain its own policy, which makes
-# updates particularly complicated. To avoid breakage in the short
-# term, deal with that in passt's own policy.
-allow passt_repair_t qemu_var_run_t:unix_stream_socket { connectto read write };
-allow passt_repair_t virt_var_run_t:unix_stream_socket { connectto read write };
-
-allow passt_repair_t qemu_var_run_t:dir { getattr read search watch };
-allow passt_repair_t virt_var_run_t:dir { getattr read search watch };
-
-allow passt_repair_t qemu_var_run_t:sock_file { getattr read write };
-allow passt_repair_t virt_var_run_t:sock_file { getattr read write };
--- a/contrib/selinux/passt.te
+++ b/contrib/selinux/passt.te
@ -20,19 +20,9 @@ require {
 	type fs_t;
 	type tmp_t;
 	type user_tmp_t;
-	type user_home_t;
 	type tmpfs_t;
 	type root_t;

-	# Workaround: passt --vhost-user needs to map guest memory, but
-	# libvirt doesn't maintain its own policy, which makes updates
-	# particularly complicated. To avoid breakage in the short term,
-	# deal with it in passt's own policy.
-	type svirt_image_t;
-	type svirt_tmpfs_t;
-	type svirt_t;
-	type null_device_t;
-
 	class file { ioctl getattr setattr create read write unlink open relabelto execute execute_no_trans map };
 	class dir { search write add_name remove_name mounton };
 	class chr_file { append read write open getattr ioctl };
@ -48,7 +38,7 @@ require {
 	type net_conf_t;
 	type proc_net_t;
 	type node_t;
-	class tcp_socket { create accept listen name_bind name_connect getattr ioctl };
+	class tcp_socket { create accept listen name_bind name_connect };
 	class udp_socket { create accept listen };
 	class icmp_socket { bind create name_bind node_bind setopt read write };
 	class sock_file { create unlink write };
@ -90,9 +80,6 @@ allow passt_t root_t:dir mounton;
 allow passt_t tmp_t:dir { add_name mounton remove_name write };
 allow passt_t tmpfs_t:filesystem mount;
 allow passt_t fs_t:filesystem unmount;
-allow passt_t user_home_t:dir search;
-allow passt_t user_tmp_t:fifo_file append;
-allow passt_t user_tmp_t:file map;

 manage_files_pattern(passt_t, user_tmp_t, user_tmp_t)
 files_pid_filetrans(passt_t, user_tmp_t, file)
@ -132,7 +119,7 @@ corenet_udp_sendrecv_all_ports(passt_t)
 allow passt_t node_t:icmp_socket { name_bind node_bind };
 allow passt_t port_t:icmp_socket name_bind;

-allow passt_t self:tcp_socket { create getopt setopt connect bind listen accept shutdown read write getattr ioctl };
+allow passt_t self:tcp_socket { create getopt setopt connect bind listen accept shutdown read write };
 allow passt_t self:udp_socket { create getopt setopt connect bind read write };
 allow passt_t self:icmp_socket { bind create setopt read write };

@ -140,11 +127,3 @@ allow passt_t user_tmp_t:dir { add_name write };
 allow passt_t user_tmp_t:file { create open };
 allow passt_t user_tmp_t:sock_file { create read write unlink };
 allow passt_t unconfined_t:unix_stream_socket { read write };
-
-# Workaround: passt --vhost-user needs to map guest memory, but
-# libvirt doesn't maintain its own policy, which makes updates
-# particularly complicated. To avoid breakage in the short term,
-# deal with it in passt's own policy.
-allow passt_t svirt_image_t:file { read write map };
-allow passt_t svirt_tmpfs_t:file { read write map };
-allow passt_t null_device_t:chr_file map;
--- a/contrib/selinux/pasta.te
+++ b/contrib/selinux/pasta.te
@ -18,7 +18,6 @@ require {
 	type bin_t;
 	type user_home_t;
 	type user_home_dir_t;
-	type user_tmp_t;
 	type fs_t;
 	type tmp_t;
 	type tmpfs_t;
@ -57,10 +56,8 @@ require {
 	attribute port_type;
 	type port_t;
 	type http_port_t;
-	type http_cache_port_t;
 	type ssh_port_t;
 	type reserved_port_t;
-	type unreserved_port_t;
 	type dns_port_t;
 	type dhcpc_port_t;
 	type chronyd_port_t;
@ -125,8 +122,8 @@ domain_auto_trans(pasta_t, ping_exec_t, ping_t);

 allow pasta_t nsfs_t:file { open read };

-allow pasta_t user_home_t:dir { getattr search };
-allow pasta_t user_home_t:file { open read getattr setattr execute execute_no_trans map};
+allow pasta_t user_home_t:dir getattr;
+allow pasta_t user_home_t:file { open read getattr setattr };
 allow pasta_t user_home_dir_t:dir { search getattr open add_name read write };
 allow pasta_t user_home_dir_t:file { create open read write };
 allow pasta_t tmp_t:dir { add_name mounton remove_name write };
@ -136,11 +133,6 @@ allow pasta_t root_t:dir mounton;
 manage_files_pattern(pasta_t, pasta_pid_t, pasta_pid_t)
 files_pid_filetrans(pasta_t, pasta_pid_t, file)

-allow pasta_t user_tmp_t:dir { add_name remove_name search write };
-allow pasta_t user_tmp_t:fifo_file append;
-allow pasta_t user_tmp_t:file { create open write };
-allow pasta_t user_tmp_t:sock_file { create unlink };
-
 allow pasta_t console_device_t:chr_file { open write getattr ioctl };
 allow pasta_t user_devpts_t:chr_file { getattr read write ioctl };
 logging_send_syslog_msg(pasta_t)
@ -168,8 +160,6 @@ allow pasta_t self:udp_socket create_stream_socket_perms;
 allow pasta_t reserved_port_t:udp_socket name_bind;
 allow pasta_t llmnr_port_t:tcp_socket name_bind;
 allow pasta_t llmnr_port_t:udp_socket name_bind;
-allow pasta_t http_cache_port_t:tcp_socket { name_bind name_connect };
-allow pasta_t unreserved_port_t:udp_socket name_bind;
 corenet_udp_sendrecv_generic_node(pasta_t)
 corenet_udp_bind_generic_node(pasta_t)
 allow pasta_t node_t:icmp_socket { name_bind node_bind };
@ -181,7 +171,7 @@ allow pasta_t init_t:lnk_file read;
 allow pasta_t init_t:unix_stream_socket connectto;
 allow pasta_t init_t:dbus send_msg;
 allow pasta_t init_t:system status;
-allow pasta_t unconfined_t:dir { read search };
+allow pasta_t unconfined_t:dir search;
 allow pasta_t unconfined_t:file read;
 allow pasta_t unconfined_t:lnk_file read;
 allow pasta_t self:process { setpgid setcap };
@ -202,6 +192,8 @@ allow pasta_t sysctl_net_t:dir search;
 allow pasta_t sysctl_net_t:file { open read write };
 allow pasta_t kernel_t:system module_request;

+allow pasta_t nsfs_t:file read;
+
 allow pasta_t proc_t:dir mounton;
 allow pasta_t proc_t:filesystem mount;
 allow pasta_t net_conf_t:lnk_file read;
--- a/dhcp.c
+++ b/dhcp.c
@ -36,9 +36,9 @@
 /**
 * struct opt - DHCP option
 * @sent:	Convenience flag, set while filling replies
- * @slen:	Length of option defined for server, -1 if not going to be sent
+ * @slen:	Length of option defined for server
 * @s:		Option payload from server
- * @clen:	Length of option received from client, -1 if not received
+ * @clen:	Length of option received from client
 * @c:		Option payload from client
 */
 struct opt {
@ -63,21 +63,11 @@ static struct opt opts[255];

 #define OPT_MIN		60 /* RFC 951 */

-/* Total option size (excluding end option) is 576 (RFC 2131), minus
- * offset of options (268), minus end option (1).
- */
-#define OPT_MAX		307
-
 /**
 * dhcp_init() - Initialise DHCP options
 */
 void dhcp_init(void)
 {
-	int i;
-
-	for (i = 0; i < ARRAY_SIZE(opts); i++)
-		opts[i].slen = -1;
-
 	opts[1]  = (struct opt) { 0, 4, {     0 }, 0, { 0 }, };	/* Mask */
 	opts[3]  = (struct opt) { 0, 4, {     0 }, 0, { 0 }, };	/* Router */
 	opts[51] = (struct opt) { 0, 4, {  0xff,
@ -117,8 +107,6 @@ struct msg {
 	uint32_t xid;
 	uint16_t secs;
 	uint16_t flags;
-#define FLAG_BROADCAST	htons_constant(0x8000)
-
 	uint32_t ciaddr;
 	struct in_addr yiaddr;
 	uint32_t siaddr;
@ -127,7 +115,7 @@ struct msg {
 	uint8_t sname[64];
 	uint8_t file[128];
 	uint32_t magic;
-	uint8_t o[OPT_MAX + 1 /* End option */ ];
+	uint8_t o[308];
 } __attribute__((__packed__));

 /**
@ -135,28 +123,15 @@ struct msg {
 * @m:		Message to fill
 * @o:		Option number
 * @offset:	Current offset within options field, updated on insertion
- *
- * Return: false if m has space to write the option, true otherwise
 */
-static bool fill_one(struct msg *m, int o, int *offset)
+static void fill_one(struct msg *m, int o, int *offset)
 {
-	size_t slen = opts[o].slen;
-
-	/* If we don't have space to write the option, then just skip */
-	if (*offset + 2 /* code and length of option */ + slen > OPT_MAX)
-		return true;
-
 	m->o[*offset] = o;
-	m->o[*offset + 1] = slen;
-
-	/* Move to option */
-	*offset += 2;
-
-	memcpy(&m->o[*offset], opts[o].s, slen);
+	m->o[*offset + 1] = opts[o].slen;
+	memcpy(&m->o[*offset + 2], opts[o].s, opts[o].slen);

 	opts[o].sent = 1;
-	*offset += slen;
-	return false;
+	*offset += 2 + opts[o].slen;
 }

 /**
@ -169,6 +144,9 @@ static int fill(struct msg *m)
 {
 	int i, o, offset = 0;

+	m->op = BOOTREPLY;
+	m->secs = 0;
+
 	for (o = 0; o < 255; o++)
 		opts[o].sent = 0;

@ -176,24 +154,22 @@ static int fill(struct msg *m)
 	 * option 53 at the beginning of the list.
 	 * Put it there explicitly, unless requested via option 55.
 	 */
-	if (opts[55].clen > 0 && !memchr(opts[55].c, 53, opts[55].clen))
-		if (fill_one(m, 53, &offset))
-			 debug("DHCP: skipping option 53");
+	if (!memchr(opts[55].c, 53, opts[55].clen))
+		fill_one(m, 53, &offset);

 	for (i = 0; i < opts[55].clen; i++) {
 		o = opts[55].c[i];
-		if (opts[o].slen != -1)
-			if (fill_one(m, o, &offset))
-				debug("DHCP: skipping option %i", o);
+		if (opts[o].slen)
+			fill_one(m, o, &offset);
 	}

 	for (o = 0; o < 255; o++) {
-		if (opts[o].slen != -1 && !opts[o].sent)
-			if (fill_one(m, o, &offset))
-				debug("DHCP: skipping option %i", o);
+		if (opts[o].slen && !opts[o].sent)
+			fill_one(m, o, &offset);
 	}

 	m->o[offset++] = 255;
+	m->o[offset++] = 0;

 	if (offset < OPT_MIN) {
 		memset(&m->o[offset], 0, OPT_MIN - offset);
@ -288,9 +264,6 @@ static void opt_set_dns_search(const struct ctx *c, size_t max_len)
 						 ".\xc0");
 		}
 	}
-
-	if (!opts[119].slen)
-		opts[119].slen = -1;
 }

 /**
@ -304,13 +277,12 @@ int dhcp(const struct ctx *c, const struct pool *p)
 {
 	size_t mlen, dlen, offset = 0, opt_len, opt_off = 0;
 	char macstr[ETH_ADDRSTRLEN];
-	struct in_addr mask, dst;
 	const struct ethhdr *eh;
 	const struct iphdr *iph;
 	const struct udphdr *uh;
-	struct msg const *m;
-	struct msg reply;
+	struct in_addr mask;
 	unsigned int i;
+	struct msg *m;

 	eh  = packet_get(p, 0, offset, sizeof(*eh),  NULL);
 	offset += sizeof(*eh);
@ -339,27 +311,8 @@ int dhcp(const struct ctx *c, const struct pool *p)
 	    m->op != BOOTREQUEST)
 		return -1;

-	reply.op		= BOOTREPLY;
-	reply.htype		= m->htype;
-	reply.hlen		= m->hlen;
-	reply.hops		= 0;
-	reply.xid		= m->xid;
-	reply.secs		= 0;
-	reply.flags		= m->flags;
-	reply.ciaddr		= m->ciaddr;
-	reply.yiaddr		= c->ip4.addr;
-	reply.siaddr		= 0;
-	reply.giaddr		= m->giaddr;
-	memcpy(&reply.chaddr,	m->chaddr,	sizeof(reply.chaddr));
-	memset(&reply.sname,	0,		sizeof(reply.sname));
-	memset(&reply.file,	0,		sizeof(reply.file));
-	reply.magic		= m->magic;
-
 	offset += offsetof(struct msg, o);

-	for (i = 0; i < ARRAY_SIZE(opts); i++)
-		opts[i].clen = -1;
-
 	while (opt_off + 2 < opt_len) {
 		const uint8_t *olen, *val;
 		uint8_t *type;
@ -378,19 +331,11 @@ int dhcp(const struct ctx *c, const struct pool *p)
 		opt_off += *olen + 2;
 	}

-	opts[80].slen = -1;
-	if (opts[53].clen > 0 && opts[53].c[0] == DHCPDISCOVER) {
-		if (opts[80].clen == -1) {
-			info("DHCP: offer to discover");
-			opts[53].s[0] = DHCPOFFER;
-		} else {
-			info("DHCP: ack to discover (Rapid Commit)");
-			opts[53].s[0] = DHCPACK;
-			opts[80].slen = 0;
-		}
-	} else if (opts[53].clen <= 0 || opts[53].c[0] == DHCPREQUEST) {
-		info("%s: ack to request", /* DHCP needs a valid message type */
-		     (opts[53].clen <= 0) ? "BOOTP" : "DHCP");
+	if (opts[53].c[0] == DHCPDISCOVER) {
+		info("DHCP: offer to discover");
+		opts[53].s[0] = DHCPOFFER;
+	} else if (opts[53].c[0] == DHCPREQUEST || !opts[53].clen) {
+		info("%s: ack to request", opts[53].clen ? "DHCP" : "BOOTP");
 		opts[53].s[0] = DHCPACK;
 	} else {
 		return -1;
@ -398,6 +343,7 @@ int dhcp(const struct ctx *c, const struct pool *p)

 	info("    from %s", eth_ntop(m->chaddr, macstr, sizeof(macstr)));

+	m->yiaddr = c->ip4.addr;
 	mask.s_addr = htonl(0xffffffff << (32 - c->ip4.prefix_len));
 	memcpy(opts[1].s,  &mask,                sizeof(mask));
 	memcpy(opts[3].s,  &c->ip4.guest_gw,     sizeof(c->ip4.guest_gw));
@ -417,7 +363,7 @@ int dhcp(const struct ctx *c, const struct pool *p)
 		       &c->ip4.guest_gw, sizeof(c->ip4.guest_gw));
 	}

-	if (c->mtu) {
+	if (c->mtu != -1) {
 		opts[26].slen = 2;
 		opts[26].s[0] = c->mtu / 256;
 		opts[26].s[1] = c->mtu % 256;
@ -428,44 +374,12 @@ int dhcp(const struct ctx *c, const struct pool *p)
 		((struct in_addr *)opts[6].s)[i] = c->ip4.dns[i];
 		opts[6].slen += sizeof(uint32_t);
 	}
-	if (!opts[6].slen)
-		opts[6].slen = -1;
-
-	opt_len = strlen(c->hostname);
-	if (opt_len > 0) {
-		opts[12].slen = opt_len;
-		memcpy(opts[12].s, &c->hostname, opt_len);
-	}
-
-	opt_len = strlen(c->fqdn);
-	if (opt_len > 0) {
-		opt_len += 3 /* flags */
-			+ 2; /* Length byte for first label, and terminator */
-
-		if (sizeof(opts[81].s) >= opt_len) {
-			opts[81].s[0] = 0x4; /* flags (E) */
-			opts[81].s[1] = 0xff; /* RCODE1 */
-			opts[81].s[2] = 0xff; /* RCODE2 */
-
-			encode_domain_name((char *)opts[81].s + 3, c->fqdn);
-
-			opts[81].slen = opt_len;
-		} else {
-			debug("DHCP: client FQDN option doesn't fit, skipping");
-		}
-	}

 	if (!c->no_dhcp_dns_search)
 		opt_set_dns_search(c, sizeof(m->o));

-	dlen = offsetof(struct msg, o) + fill(&reply);
-
-	if (m->flags & FLAG_BROADCAST)
-		dst = in4addr_broadcast;
-	else
-		dst = c->ip4.addr;
-
-	tap_udp4_send(c, c->ip4.our_tap_addr, 67, dst, 68, &reply, dlen);
+	dlen = offsetof(struct msg, o) + fill(m);
+	tap_udp4_send(c, c->ip4.our_tap_addr, 67, c->ip4.addr, 68, m, dlen);

 	return 1;
 }
--- a/dhcpv6.c
+++ b/dhcpv6.c
@ -48,7 +48,6 @@ struct opt_hdr {
 # define  STATUS_NOTONLINK	htons_constant(4)
 # define OPT_DNS_SERVERS	htons_constant(23)
 # define OPT_DNS_SEARCH		htons_constant(24)
-# define OPT_CLIENT_FQDN	htons_constant(39)
 #define   STR_NOTONLINK		"Prefix not appropriate for link."

 	uint16_t l;
@ -59,9 +58,6 @@ struct opt_hdr {
 					      sizeof(struct opt_hdr))
 #define OPT_VSIZE(x)		(sizeof(struct opt_##x) - 		\
 				 sizeof(struct opt_hdr))
-#define OPT_MAX_SIZE		IPV6_MIN_MTU - (sizeof(struct ipv6hdr) + \
-						sizeof(struct udphdr) + \
-						sizeof(struct msg_hdr))

 /**
 * struct opt_client_id - DHCPv6 Client Identifier option
@ -167,18 +163,6 @@ struct opt_dns_search {
 	char list[MAXDNSRCH * NS_MAXDNAME];
 } __attribute__((packed));

-/**
- * struct opt_client_fqdn - Client FQDN option (RFC 4704)
- * @hdr:		Option header
- * @flags:		Flags described by RFC 4704
- * @domain_name:	Client FQDN
- */
-struct opt_client_fqdn {
-	struct opt_hdr hdr;
-	uint8_t flags;
-	char domain_name[PASST_MAXDNAME];
-} __attribute__((packed));
-
 /**
 * struct msg_hdr - DHCPv6 client/server message header
 * @type:		DHCP message type
@ -209,7 +193,6 @@ struct msg_hdr {
 * @client_id:		Client Identifier, variable length
 * @dns_servers:	DNS Recursive Name Server, here just for storage size
 * @dns_search:		Domain Search List, here just for storage size
- * @client_fqdn:	Client FQDN, variable length
 */
 static struct resp_t {
 	struct msg_hdr hdr;
@ -220,7 +203,6 @@ static struct resp_t {
 	struct opt_client_id client_id;
 	struct opt_dns_servers dns_servers;
 	struct opt_dns_search dns_search;
-	struct opt_client_fqdn client_fqdn;
 } __attribute__((__packed__)) resp = {
 	{ 0 },
 	SERVER_ID,
@ -246,10 +228,6 @@ static struct resp_t {
 	{ { OPT_DNS_SEARCH,	0, },
 	  { 0 },
 	},
-
-	{ { OPT_CLIENT_FQDN, 0, },
-	  0, { 0 },
-	},
 };

 static const struct opt_status_code sc_not_on_link = {
@ -368,6 +346,7 @@ static size_t dhcpv6_dns_fill(const struct ctx *c, char *buf, int offset)
 {
 	struct opt_dns_servers *srv = NULL;
 	struct opt_dns_search *srch = NULL;
+	char *p = NULL;
 	int i;

 	if (c->no_dhcp_dns)
@ -404,81 +383,34 @@ search:
 		if (!name_len)
 			continue;

-		name_len += 2; /* Length byte for first label, and terminator */
-		if (name_len >
-		    NS_MAXDNAME + 1 /* Length byte for first label */ ||
-		    name_len > 255) {
-			debug("DHCP: DNS search name '%s' too long, skipping",
-			      c->dns_search[i].n);
-			continue;
-		}
-
 		if (!srch) {
 			srch = (struct opt_dns_search *)(buf + offset);
 			offset += sizeof(struct opt_hdr);
 			srch->hdr.t = OPT_DNS_SEARCH;
 			srch->hdr.l = 0;
+			p = srch->list;
 		}

-		encode_domain_name(buf + offset, c->dns_search[i].n);
-
-		srch->hdr.l += name_len;
-		offset += name_len;
-
+		*p = '.';
+		p = stpncpy(p + 1, c->dns_search[i].n, name_len);
+		p++;
+		srch->hdr.l += name_len + 2;
+		offset += name_len + 2;
 	}

-	if (srch)
+	if (srch) {
+		for (i = 0; i < srch->hdr.l; i++) {
+			if (srch->list[i] == '.') {
+				srch->list[i] = strcspn(srch->list + i + 1,
+							".");
+			}
+		}
 		srch->hdr.l = htons(srch->hdr.l);
+	}

 	return offset;
 }

-/**
- * dhcpv6_client_fqdn_fill() - Fill in client FQDN option
- * @c:		Execution context
- * @buf:	Response message buffer where options will be appended
- * @offset:	Offset in message buffer for new options
- *
- * Return: updated length of response message buffer.
- */
-static size_t dhcpv6_client_fqdn_fill(const struct pool *p, const struct ctx *c,
-				      char *buf, int offset)
-
-{
-	struct opt_client_fqdn const *req_opt;
-	struct opt_client_fqdn *o;
-	size_t opt_len;
-
-	opt_len = strlen(c->fqdn);
-	if (opt_len == 0) {
-		return offset;
-	}
-
-	opt_len += 2; /* Length byte for first label, and terminator */
-	if (opt_len > OPT_MAX_SIZE - (offset +
-				      sizeof(struct opt_hdr) +
-				      1 /* flags */ )) {
-		debug("DHCPv6: client FQDN option doesn't fit, skipping");
-		return offset;
-	}
-
-	o = (struct opt_client_fqdn *)(buf + offset);
-	encode_domain_name(o->domain_name, c->fqdn);
-	req_opt = (struct opt_client_fqdn *)dhcpv6_opt(p, &(size_t){ 0 },
-						       OPT_CLIENT_FQDN);
-	if (req_opt && req_opt->flags & 0x01 /* S flag */)
-		o->flags = 0x02 /* O flag */;
-	else
-		o->flags = 0x00;
-
-	opt_len++;
-
-	o->hdr.t = OPT_CLIENT_FQDN;
-	o->hdr.l = htons(opt_len);
-
-	return offset + sizeof(struct opt_hdr) + opt_len;
-}
-
 /**
 * dhcpv6() - Check if this is a DHCPv6 message, reply as needed
 * @c:		Execution context
@ -612,7 +544,6 @@ int dhcpv6(struct ctx *c, const struct pool *p,
 	n = offsetof(struct resp_t, client_id) +
 	    sizeof(struct opt_hdr) + ntohs(client_id->l);
 	n = dhcpv6_dns_fill(c, (char *)&resp, n);
-	n = dhcpv6_client_fqdn_fill(p, c, (char *)&resp, n);

 	resp.hdr.xid = mh->xid;

--- a/doc/migration/.gitignore
+++ b/doc/migration/.gitignore
@ -1,2 +0,0 @@
-/source
-/target
--- a/doc/migration/Makefile
+++ b/doc/migration/Makefile
@ -1,20 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0-or-later
-#
-# PASST - Plug A Simple Socket Transport
-#  for qemu/UNIX domain socket mode
-#
-# PASTA - Pack A Subtle Tap Abstraction
-#  for network namespace/tap device mode
-#
-# Copyright (c) 2025 Red Hat GmbH
-# Author: Stefano Brivio <sbrivio@redhat.com>
-
-TARGETS = source target
-CFLAGS = -Wall -Wextra -pedantic
-
-all: $(TARGETS)
-
-$(TARGETS): %: %.c
-
-clean:
-	rm -f $(TARGETS)
--- a/doc/migration/README
+++ b/doc/migration/README
@ -1,51 +0,0 @@
-<!---
-SPDX-License-Identifier: GPL-2.0-or-later
-Copyright (c) 2025 Red Hat GmbH
-Author: Stefano Brivio <sbrivio@redhat.com>
-->
-
-Migration
-=========
-
-These test programs show a migration of a TCP connection from one process to
-another using the TCP_REPAIR socket option.
-
-The two processes are a mock of the matching implementation in passt(1), and run
-unprivileged, so they rely on the passt-repair helper to connect to them and set
-or clear TCP_REPAIR on the connection socket, transferred to the helper using
-SCM_RIGHTS.
-
-The passt-repair helper needs to have the CAP_NET_ADMIN capability, or run as
-root.
-
-Example of usage
----------------
-
-* Start the test server
-
-        $ nc -l 9999
-
-* Start the source side of the TCP client (mock of the source instance of passt)
-
-        $ ./source 127.0.0.1 9999 9998 /tmp/repair.sock
-
-* The client sends a test string, and waits for a connection from passt-repair
-
-        # passt-repair /tmp/repair.sock
-
-* The socket is now in repair mode, and `source` dumps sequences, then exits
-
-        sending sequence: 3244673313
-        receiving sequence: 2250449386
-
-* Continue the connection on the target side, restarting from those sequences
-
-        $ ./target 127.0.0.1 9999 9998 /tmp/repair.sock 3244673313 2250449386
-
-* The target side now waits for a connection from passt-repair
-
-        # passt-repair /tmp/repair.sock
-
-* The target side asks passt-repair to switch the socket to repair mode, sets up
-  the TCP sequences, then asks passt-repair to clear repair mode, and sends a
-  test string to the server
--- a/doc/migration/source.c
+++ b/doc/migration/source.c
@ -1,92 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-
-/* PASST - Plug A Simple Socket Transport
- *  for qemu/UNIX domain socket mode
- *
- * PASTA - Pack A Subtle Tap Abstraction
- *  for network namespace/tap device mode
- *
- * doc/migration/source.c - Mock of TCP migration source, use with passt-repair
- *
- * Copyright (c) 2025 Red Hat GmbH
- * Author: Stefano Brivio <sbrivio@redhat.com>
- */
-
-#include <arpa/inet.h>
-#include <sys/types.h>
-#include <sys/socket.h>
-#include <sys/un.h>
-#include <stdint.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <limits.h>
-#include <unistd.h>
-#include <netdb.h>
-#include <netinet/tcp.h>
-
-int main(int argc, char **argv)
-{
-	struct sockaddr_in a = { AF_INET, htons(atoi(argv[3])), { 0 }, { 0 } };
-	struct addrinfo hints = { 0, AF_UNSPEC, SOCK_STREAM, 0, 0,
-				  NULL, NULL, NULL };
-	struct sockaddr_un a_helper = { AF_UNIX, { 0 } };
-	int seq, s, s_helper;
-	int8_t cmd;
-	struct iovec iov = { &cmd, sizeof(cmd) };
-	char buf[CMSG_SPACE(sizeof(int))];
-	struct msghdr msg = { NULL, 0, &iov, 1, buf, sizeof(buf), 0 };
-	struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);
-	socklen_t seqlen = sizeof(int);
-	struct addrinfo *r;
-
-	(void)argc;
-
-	if (argc != 5) {
-		fprintf(stderr, "%s DST_ADDR DST_PORT SRC_PORT HELPER_PATH\n",
-			argv[0]);
-		return -1;
-	}
-
-	strcpy(a_helper.sun_path, argv[4]);
-	getaddrinfo(argv[1], argv[2], &hints, &r);
-
-	/* Connect socket to server and send some data */
-	s = socket(r->ai_family, SOCK_STREAM, IPPROTO_TCP);
-	setsockopt(s, SOL_SOCKET, SO_REUSEADDR, &((int){ 1 }), sizeof(int));
-	bind(s, (struct sockaddr *)&a, sizeof(a));
-	connect(s, r->ai_addr, r->ai_addrlen);
-	send(s, "before migration\n", sizeof("before migration\n"), 0);
-
-	/* Wait for helper */
-	s_helper = socket(AF_UNIX, SOCK_STREAM, 0);
-	unlink(a_helper.sun_path);
-	bind(s_helper, (struct sockaddr *)&a_helper, sizeof(a_helper));
-	listen(s_helper, 1);
-	s_helper = accept(s_helper, NULL, NULL);
-
-	/* Set up message for helper, with socket */
-	cmsg->cmsg_level = SOL_SOCKET;
-	cmsg->cmsg_type = SCM_RIGHTS;
-	cmsg->cmsg_len = CMSG_LEN(sizeof(int));
-	memcpy(CMSG_DATA(cmsg), &s, sizeof(s));
-
-	/* Send command to helper: turn repair mode on, wait for reply */
-	cmd = TCP_REPAIR_ON;
-	sendmsg(s_helper, &msg, 0);
-	recv(s_helper, &((int8_t){ 0 }), 1, 0);
-
-	/* Terminate helper */
-	close(s_helper);
-
-	/* Get sending sequence */
-	seq = TCP_SEND_QUEUE;
-	setsockopt(s, SOL_TCP, TCP_REPAIR_QUEUE, &seq, sizeof(seq));
-	getsockopt(s, SOL_TCP, TCP_QUEUE_SEQ, &seq, &seqlen);
-	fprintf(stdout, "%u ", seq);
-
-	/* Get receiving sequence */
-	seq = TCP_RECV_QUEUE;
-	setsockopt(s, SOL_TCP, TCP_REPAIR_QUEUE, &seq, sizeof(seq));
-	getsockopt(s, SOL_TCP, TCP_QUEUE_SEQ, &seq, &seqlen);
-	fprintf(stdout, "%u\n", seq);
-}
--- a/doc/migration/target.c
+++ b/doc/migration/target.c
@ -1,102 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-
-/* PASST - Plug A Simple Socket Transport
- *  for qemu/UNIX domain socket mode
- *
- * PASTA - Pack A Subtle Tap Abstraction
- *  for network namespace/tap device mode
- *
- * doc/migration/target.c - Mock of TCP migration target, use with passt-repair
- *
- * Copyright (c) 2025 Red Hat GmbH
- * Author: Stefano Brivio <sbrivio@redhat.com>
- */
-
-#include <arpa/inet.h>
-#include <sys/types.h>
-#include <sys/socket.h>
-#include <sys/un.h>
-#include <stdint.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <limits.h>
-#include <unistd.h>
-#include <netdb.h>
-#include <netinet/tcp.h>
-
-int main(int argc, char **argv)
-{
-	struct sockaddr_in a = { AF_INET, htons(atoi(argv[3])), { 0 }, { 0 } };
-	struct addrinfo hints = { 0, AF_UNSPEC, SOCK_STREAM, 0, 0,
-				  NULL, NULL, NULL };
-	struct sockaddr_un a_helper = { AF_UNIX, { 0 } };
-	int s, s_helper, seq;
-	int8_t cmd;
-	struct iovec iov = { &cmd, sizeof(cmd) };
-	char buf[CMSG_SPACE(sizeof(int))];
-	struct msghdr msg = { NULL, 0, &iov, 1, buf, sizeof(buf), 0 };
-	struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);
-	struct addrinfo *r;
-
-	(void)argc;
-
-	strcpy(a_helper.sun_path, argv[4]);
-	getaddrinfo(argv[1], argv[2], &hints, &r);
-
-	if (argc != 7) {
-		fprintf(stderr,
-			"%s DST_ADDR DST_PORT SRC_PORT HELPER_PATH SSEQ RSEQ\n",
-			argv[0]);
-		return -1;
-	}
-
-	/* Prepare socket, bind to source port */
-	s = socket(r->ai_family, SOCK_STREAM, IPPROTO_TCP);
-	setsockopt(s, SOL_SOCKET, SO_REUSEADDR, &((int){ 1 }), sizeof(int));
-	bind(s, (struct sockaddr *)&a, sizeof(a));
-
-	/* Wait for helper */
-	s_helper = socket(AF_UNIX, SOCK_STREAM, 0);
-	unlink(a_helper.sun_path);
-	bind(s_helper, (struct sockaddr *)&a_helper, sizeof(a_helper));
-	listen(s_helper, 1);
-	s_helper = accept(s_helper, NULL, NULL);
-
-	/* Set up message for helper, with socket */
-	cmsg->cmsg_level = SOL_SOCKET;
-	cmsg->cmsg_type = SCM_RIGHTS;
-	cmsg->cmsg_len = CMSG_LEN(sizeof(int));
-	memcpy(CMSG_DATA(cmsg), &s, sizeof(s));
-
-	/* Send command to helper: turn repair mode on, wait for reply */
-	cmd = TCP_REPAIR_ON;
-	sendmsg(s_helper, &msg, 0);
-	recv(s_helper, &((int){ 0 }), 1, 0);
-
-	/* Set sending sequence */
-	seq = TCP_SEND_QUEUE;
-	setsockopt(s, SOL_TCP, TCP_REPAIR_QUEUE, &seq, sizeof(seq));
-	seq = atoi(argv[5]);
-	setsockopt(s, SOL_TCP, TCP_QUEUE_SEQ, &seq, sizeof(seq));
-
-	/* Set receiving sequence */
-	seq = TCP_RECV_QUEUE;
-	setsockopt(s, SOL_TCP, TCP_REPAIR_QUEUE, &seq, sizeof(seq));
-	seq = atoi(argv[6]);
-	setsockopt(s, SOL_TCP, TCP_QUEUE_SEQ, &seq, sizeof(seq));
-
-	/* Connect setting kernel state only, without actual SYN / handshake */
-	connect(s, r->ai_addr, r->ai_addrlen);
-
-	/* Send command to helper: turn repair mode off, wait for reply */
-	cmd = TCP_REPAIR_OFF;
-	sendmsg(s_helper, &msg, 0);
-
-	recv(s_helper, &((int8_t){ 0 }), 1, 0);
-
-	/* Terminate helper */
-	close(s_helper);
-
-	/* Send some more data */
-	send(s, "after migration\n", sizeof("after migration\n"), 0);
-}
--- a/doc/platform-requirements/.gitignore
+++ b/doc/platform-requirements/.gitignore
@ -1,4 +1,3 @@
-/listen-vs-repair
 /reuseaddr-priority
 /recv-zero
 /udp-close-dup
--- a/doc/platform-requirements/Makefile
+++ b/doc/platform-requirements/Makefile
@ -3,8 +3,8 @@
 # Copyright Red Hat
 # Author: David Gibson <david@gibson.dropbear.id.au>

-TARGETS = reuseaddr-priority recv-zero udp-close-dup listen-vs-repair
-SRCS = reuseaddr-priority.c recv-zero.c udp-close-dup.c listen-vs-repair.c
+TARGETS = reuseaddr-priority recv-zero udp-close-dup
+SRCS = reuseaddr-priority.c recv-zero.c udp-close-dup.c
 CFLAGS = -Wall

 all: cppcheck clang-tidy $(TARGETS:%=check-%)
--- a/doc/platform-requirements/common.h
+++ b/doc/platform-requirements/common.h
@ -15,7 +15,6 @@
 #include <stdio.h>
 #include <stdlib.h>

-__attribute__((format(printf, 1, 2), noreturn))
 static inline void die(const char *fmt, ...)
 {
 	va_list ap;
--- a/doc/platform-requirements/listen-vs-repair.c
+++ b/doc/platform-requirements/listen-vs-repair.c
@ -1,128 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-
-/* liste-vs-repair.c
- *
- * Do listening sockets have address conflicts with sockets under repair
- * ====================================================================
- *
- * When we accept() an incoming connection the accept()ed socket will have the
- * same local address as the listening socket.  This can be a complication on
- * migration.  On the migration target we've already set up listening sockets
- * according to the command line.  However to restore connections that we're
- * migrating in we need to bind the new sockets to the same address, which would
- * be an address conflict on the face of it.  This test program verifies that
- * enabling repair mode before bind() correctly suppresses that conflict.
- *
- * Copyright Red Hat
- * Author: David Gibson <david@gibson.dropbear.id.au>
- */
-
-/* NOLINTNEXTLINE(bugprone-reserved-identifier,cert-dcl37-c,cert-dcl51-cpp) */
-#define _GNU_SOURCE
-
-#include <arpa/inet.h>
-#include <errno.h>
-#include <linux/netlink.h>
-#include <linux/rtnetlink.h>
-#include <net/if.h>
-#include <netinet/in.h>
-#include <netinet/tcp.h>
-#include <sched.h>
-#include <stdbool.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <unistd.h>
-
-#include "common.h"
-
-#define PORT	13256U
-#define CPORT	13257U
-
-/* 127.0.0.1:PORT */
-static const struct sockaddr_in addr = SOCKADDR_INIT(INADDR_LOOPBACK, PORT);
-
-/* 127.0.0.1:CPORT */
-static const struct sockaddr_in caddr = SOCKADDR_INIT(INADDR_LOOPBACK, CPORT);
-
-/* Put ourselves into a network sandbox */
-static void net_sandbox(void)
-{
-	/* NOLINTNEXTLINE(altera-struct-pack-align) */
-	const struct req_t {
-		struct nlmsghdr nlh;
-		struct ifinfomsg ifm;
-	} __attribute__((packed)) req = {
-		.nlh.nlmsg_type		= RTM_NEWLINK,
-		.nlh.nlmsg_flags	= NLM_F_REQUEST,
-		.nlh.nlmsg_len		= sizeof(req),
-		.nlh.nlmsg_seq		= 1,
-		.ifm.ifi_family		= AF_UNSPEC,
-                .ifm.ifi_index		= 1,
-                .ifm.ifi_flags		= IFF_UP,
-                .ifm.ifi_change		= IFF_UP,
-	};
-	int nl;
-
-	if (unshare(CLONE_NEWUSER | CLONE_NEWNET))
-		die("unshare(): %s\n", strerror(errno));
-
-	/* Bring up lo in the new netns */
-	nl = socket(AF_NETLINK, SOCK_RAW | SOCK_CLOEXEC, NETLINK_ROUTE);
-	if (nl < 0)
-		die("Can't create netlink socket: %s\n", strerror(errno));
-
-	if (send(nl, &req, sizeof(req), 0) < 0)
-		die("Netlink send(): %s\n", strerror(errno));
-	close(nl);
-}
-
-static void check(void)
-{
-	int s1, s2, op;
-
-	s1 = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP);
-	if (s1 < 0)
-		die("socket() 1: %s\n", strerror(errno));
-
-	if (bind(s1, (struct sockaddr *)&addr, sizeof(addr)))
-		die("bind() 1: %s\n", strerror(errno));
-
-	if (listen(s1, 0))
-		die("listen(): %s\n", strerror(errno));
-
-	s2 = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP);
-	if (s2 < 0)
-		die("socket() 2: %s\n", strerror(errno));
-
-	op = TCP_REPAIR_ON;
-	if (setsockopt(s2, SOL_TCP, TCP_REPAIR, &op, sizeof(op)))
-		die("TCP_REPAIR: %s\n", strerror(errno));
-
-	if (bind(s2, (struct sockaddr *)&addr, sizeof(addr)))
-		die("bind() 2: %s\n", strerror(errno));
-
-	if (connect(s2, (struct sockaddr *)&caddr, sizeof(caddr)))
-		die("connect(): %s\n", strerror(errno));
-
-	op = TCP_REPAIR_OFF_NO_WP;
-	if (setsockopt(s2, SOL_TCP, TCP_REPAIR, &op, sizeof(op)))
-		die("TCP_REPAIR: %s\n", strerror(errno));
-
-	close(s1);
-	close(s2);
-}
-
-int main(int argc, char *argv[])
-{
-	(void)argc;
-	(void)argv;
-
-	net_sandbox();
-
-	check();
-
-	printf("Repair mode appears to properly suppress conflicts with listening sockets\n");
-
-	exit(0);
-}
--- a/doc/platform-requirements/reuseaddr-priority.c
+++ b/doc/platform-requirements/reuseaddr-priority.c
@ -46,13 +46,13 @@
 /* Different cases for receiving socket configuration */
 enum sock_type {
 	/* Socket is bound to 0.0.0.0:DSTPORT and not connected */
-	SOCK_BOUND_ANY,
+	SOCK_BOUND_ANY = 0,

 	/* Socket is bound to 127.0.0.1:DSTPORT and not connected */
-	SOCK_BOUND_LO,
+	SOCK_BOUND_LO = 1,

 	/* Socket is bound to 0.0.0.0:DSTPORT and connected to 127.0.0.1:SRCPORT */
-	SOCK_CONNECTED,
+	SOCK_CONNECTED = 2,

 	NUM_SOCK_TYPES,
 };
--- a/epoll_type.h
+++ b/epoll_type.h
@ -22,8 +22,8 @@ enum epoll_type {
 	EPOLL_TYPE_TCP_TIMER,
 	/* UDP "listening" sockets */
 	EPOLL_TYPE_UDP_LISTEN,
-	/* UDP socket for a specific flow */
-	EPOLL_TYPE_UDP,
+	/* UDP socket for replies on a specific flow */
+	EPOLL_TYPE_UDP_REPLY,
 	/* ICMP/ICMPv6 ping sockets */
 	EPOLL_TYPE_PING,
 	/* inotify fd watching for end of netns (pasta) */
@ -40,10 +40,6 @@ enum epoll_type {
 	EPOLL_TYPE_VHOST_CMD,
 	/* vhost-user kick event socket */
 	EPOLL_TYPE_VHOST_KICK,
-	/* TCP_REPAIR helper listening socket */
-	EPOLL_TYPE_REPAIR_LISTEN,
-	/* TCP_REPAIR helper socket */
-	EPOLL_TYPE_REPAIR,

 	EPOLL_NUM_TYPES,
 };
--- a/flow.c
+++ b/flow.c
@ -19,7 +19,6 @@
 #include "inany.h"
 #include "flow.h"
 #include "flow_table.h"
-#include "repair.h"

 const char *flow_state_str[] = {
 	[FLOW_STATE_FREE]	= "FREE",
@ -53,13 +52,6 @@ const uint8_t flow_proto[] = {
 static_assert(ARRAY_SIZE(flow_proto) == FLOW_NUM_TYPES,
 	      "flow_proto[] doesn't match enum flow_type");

-#define foreach_established_tcp_flow(flow)				\
-	flow_foreach_of_type((flow), FLOW_TCP)				\
-		if (!tcp_flow_is_established(&(flow)->tcp))		\
-			/* NOLINTNEXTLINE(bugprone-branch-clone) */	\
-			continue;					\
-		else
-
 /* Global Flow Table */

 /**
@ -267,13 +259,11 @@ int flowside_connect(const struct ctx *c, int s,

 /** flow_log_ - Log flow-related message
 * @f:		flow the message is related to
- * @newline:	Append newline at the end of the message, if missing
 * @pri:	Log priority
 * @fmt:	Format string
 * @...:	printf-arguments
 */
-void flow_log_(const struct flow_common *f, bool newline, int pri,
-	       const char *fmt, ...)
+void flow_log_(const struct flow_common *f, int pri, const char *fmt, ...)
 {
 	const char *type_or_state;
 	char msg[BUFSIZ];
@ -289,7 +279,7 @@ void flow_log_(const struct flow_common *f, bool newline, int pri,
 	else
 		type_or_state = FLOW_TYPE(f);

-	logmsg(newline, false, pri,
+	logmsg(true, false, pri,
 	       "Flow %u (%s): %s", flow_idx(f), type_or_state, msg);
 }

@ -309,7 +299,7 @@ void flow_log_details_(const struct flow_common *f, int pri,
 	const struct flowside *tgt = &f->side[TGTSIDE];

 	if (state >= FLOW_STATE_TGT)
-		flow_log_(f, true, pri,
+		flow_log_(f, pri,
 			  "%s [%s]:%hu -> [%s]:%hu => %s [%s]:%hu -> [%s]:%hu",
 			  pif_name(f->pif[INISIDE]),
 			  inany_ntop(&ini->eaddr, estr0, sizeof(estr0)),
@ -322,7 +312,7 @@ void flow_log_details_(const struct flow_common *f, int pri,
 			  inany_ntop(&tgt->eaddr, estr1, sizeof(estr1)),
 			  tgt->eport);
 	else if (state >= FLOW_STATE_INI)
-		flow_log_(f, true, pri, "%s [%s]:%hu -> [%s]:%hu => ?",
+		flow_log_(f, pri, "%s [%s]:%hu -> [%s]:%hu => ?",
 			  pif_name(f->pif[INISIDE]),
 			  inany_ntop(&ini->eaddr, estr0, sizeof(estr0)),
 			  ini->eport,
@ -343,7 +333,7 @@ static void flow_set_state(struct flow_common *f, enum flow_state state)
 	ASSERT(oldstate < FLOW_NUM_STATES);

 	f->state = state;
-	flow_log_(f, true, LOG_DEBUG, "%s -> %s", flow_state_str[oldstate],
+	flow_log_(f, LOG_DEBUG, "%s -> %s", flow_state_str[oldstate],
 		  FLOW_STATE(f));

 	flow_log_details_(f, LOG_DEBUG, MAX(state, oldstate));
@ -396,22 +386,18 @@ const struct flowside *flow_initiate_af(union flow *flow, uint8_t pif,
 * @flow:	Flow to change state
 * @pif:	pif of the initiating side
 * @ssa:	Source socket address
- * @daddr:	Destination address (may be NULL)
 * @dport:	Destination port
 *
 * Return: pointer to the initiating flowside information
 */
-struct flowside *flow_initiate_sa(union flow *flow, uint8_t pif,
-				  const union sockaddr_inany *ssa,
-				  const union inany_addr *daddr,
-				  in_port_t dport)
+const struct flowside *flow_initiate_sa(union flow *flow, uint8_t pif,
+					const union sockaddr_inany *ssa,
+					in_port_t dport)
 {
 	struct flowside *ini = &flow->f.side[INISIDE];

 	inany_from_sockaddr(&ini->eaddr, &ini->eport, ssa);
-	if (daddr)
-		ini->oaddr = *daddr;
-	else if (inany_v4(&ini->eaddr))
+	if (inany_v4(&ini->eaddr))
 		ini->oaddr = inany_any4;
 	else
 		ini->oaddr = inany_any6;
@ -428,8 +414,8 @@ struct flowside *flow_initiate_sa(union flow *flow, uint8_t pif,
 *
 * Return: pointer to the target flowside information
 */
-struct flowside *flow_target(const struct ctx *c, union flow *flow,
-			     uint8_t proto)
+const struct flowside *flow_target(const struct ctx *c, union flow *flow,
+				   uint8_t proto)
 {
 	char estr[INANY_ADDRSTRLEN], fstr[INANY_ADDRSTRLEN];
 	struct flow_common *f = &flow->f;
@ -611,7 +597,12 @@ static uint64_t flow_sidx_hash(const struct ctx *c, flow_sidx_t sidx)
 	const struct flowside *side = &f->side[sidx.sidei];
 	uint8_t pif = f->pif[sidx.sidei];

-	ASSERT(pif != PIF_NONE);
+	/* For the hash table to work, entries must have complete endpoint
+	 * information, and at least a forwarding port.
+	 */
+	ASSERT(pif != PIF_NONE && !inany_is_unspecified(&side->eaddr) &&
+	       side->eport != 0 && side->oport != 0);
+
 	return flow_hash(c, FLOW_PROTO(f), pif, side);
 }

@ -755,23 +746,19 @@ flow_sidx_t flow_lookup_af(const struct ctx *c,
 * @proto:	Protocol of the flow (IP L4 protocol number)
 * @pif:	Interface of the flow
 * @esa:	Socket address of the endpoint
- * @oaddr:	Our address (may be NULL)
 * @oport:	Our port number
 *
 * Return: sidx of the matching flow & side, FLOW_SIDX_NONE if not found
 */
 flow_sidx_t flow_lookup_sa(const struct ctx *c, uint8_t proto, uint8_t pif,
-			   const void *esa,
-			   const union inany_addr *oaddr, in_port_t oport)
+			   const void *esa, in_port_t oport)
 {
 	struct flowside side = {
 		.oport = oport,
 	};

 	inany_from_sockaddr(&side.eaddr, &side.eport, esa);
-	if (oaddr)
-		side.oaddr = *oaddr;
-	else if (inany_v4(&side.eaddr))
+	if (inany_v4(&side.eaddr))
 		side.oaddr = inany_any4;
 	else
 		side.oaddr = inany_any6;
@ -789,7 +776,7 @@ void flow_defer_handler(const struct ctx *c, const struct timespec *now)
 	struct flow_free_cluster *free_head = NULL;
 	unsigned *last_next = &flow_first_free;
 	bool timer = false;
-	union flow *flow;
+	unsigned idx;

 	if (timespec_diff_ms(now, &flow_timer_run) >= FLOW_TIMER_INTERVAL) {
 		timer = true;
@ -798,7 +785,8 @@ void flow_defer_handler(const struct ctx *c, const struct timespec *now)

 	ASSERT(!flow_new_entry); /* Incomplete flow at end of cycle */

-	flow_foreach_slot(flow) {
+	for (idx = 0; idx < FLOW_MAX; idx++) {
+		union flow *flow = &flowtab[idx];
 		bool closed = false;

 		switch (flow->f.state) {
@ -815,12 +803,12 @@ void flow_defer_handler(const struct ctx *c, const struct timespec *now)
 			} else {
 				/* New free cluster, add to chain */
 				free_head = &flow->free;
-				*last_next = FLOW_IDX(flow);
+				*last_next = idx;
 				last_next = &free_head->next;
 			}

 			/* Skip remaining empty entries */
-			flow += skip - 1;
+			idx += skip - 1;
 			continue;
 		}

@ -858,7 +846,7 @@ void flow_defer_handler(const struct ctx *c, const struct timespec *now)
 				closed = icmp_ping_timer(c, &flow->ping, now);
 			break;
 		case FLOW_UDP:
-			closed = udp_flow_defer(c, &flow->udp, now);
+			closed = udp_flow_defer(&flow->udp);
 			if (!closed && timer)
 				closed = udp_flow_timer(c, &flow->udp, now);
 			break;
@ -873,15 +861,14 @@ void flow_defer_handler(const struct ctx *c, const struct timespec *now)

 			if (free_head) {
 				/* Add slot to current free cluster */
-				ASSERT(FLOW_IDX(flow) ==
-				       FLOW_IDX(free_head) + free_head->n);
+				ASSERT(idx == FLOW_IDX(free_head) + free_head->n);
 				free_head->n++;
 				flow->free.n = flow->free.next = 0;
 			} else {
 				/* Create new free cluster */
 				free_head = &flow->free;
 				free_head->n = 1;
-				*last_next = FLOW_IDX(flow);
+				*last_next = idx;
 				last_next = &free_head->next;
 			}
 		} else {
@ -892,254 +879,6 @@ void flow_defer_handler(const struct ctx *c, const struct timespec *now)
 	*last_next = FLOW_MAX;
 }

-/**
- * flow_migrate_source_rollback() - Disable repair mode, return failure
- * @c:		Execution context
- * @bound:	No need to roll back flow indices >= @bound
- * @ret:	Negative error code
- *
- * Return: @ret
- */
-static int flow_migrate_source_rollback(struct ctx *c, unsigned bound, int ret)
-{
-	union flow *flow;
-
-	debug("...roll back migration");
-
-	foreach_established_tcp_flow(flow) {
-		if (FLOW_IDX(flow) >= bound)
-			break;
-		if (tcp_flow_repair_off(c, &flow->tcp))
-			die("Failed to roll back TCP_REPAIR mode");
-	}
-
-	if (repair_flush(c))
-		die("Failed to roll back TCP_REPAIR mode");
-
-	return ret;
-}
-
-/**
- * flow_migrate_need_repair() - Do we need to set repair mode for any flow?
- *
- * Return: true if repair mode is needed, false otherwise
- */
-static bool flow_migrate_need_repair(void)
-{
-	union flow *flow;
-
-	foreach_established_tcp_flow(flow)
-		return true;
-
-	return false;
-}
-
-/**
- * flow_migrate_repair_all() - Turn repair mode on or off for all flows
- * @c:		Execution context
- * @enable:	Switch repair mode on if set, off otherwise
- *
- * Return: 0 on success, negative error code on failure
- */
-static int flow_migrate_repair_all(struct ctx *c, bool enable)
-{
-	union flow *flow;
-	int rc;
-
-	/* If we don't have a repair helper, there's nothing we can do */
-	if (c->fd_repair < 0)
-		return 0;
-
-	foreach_established_tcp_flow(flow) {
-		if (enable)
-			rc = tcp_flow_repair_on(c, &flow->tcp);
-		else
-			rc = tcp_flow_repair_off(c, &flow->tcp);
-
-		if (rc) {
-			debug("Can't %s repair mode: %s",
-			      enable ? "enable" : "disable", strerror_(-rc));
-			return flow_migrate_source_rollback(c, FLOW_IDX(flow),
-							    rc);
-		}
-	}
-
-	if ((rc = repair_flush(c))) {
-		debug("Can't %s repair mode: %s",
-		      enable ? "enable" : "disable", strerror_(-rc));
-		return flow_migrate_source_rollback(c, FLOW_IDX(flow), rc);
-	}
-
-	return 0;
-}
-
-/**
- * flow_migrate_source_pre() - Prepare flows for migration: enable repair mode
- * @c:		Execution context
- * @stage:	Migration stage information (unused)
- * @fd:		Migration file descriptor (unused)
- *
- * Return: 0 on success, positive error code on failure
- */
-int flow_migrate_source_pre(struct ctx *c, const struct migrate_stage *stage,
-			    int fd)
-{
-	int rc;
-
-	(void)stage;
-	(void)fd;
-
-	if (flow_migrate_need_repair())
-		repair_wait(c);
-
-	if ((rc = flow_migrate_repair_all(c, true)))
-		return -rc;
-
-	return 0;
-}
-
-/**
- * flow_migrate_source() - Dump all the remaining information and send data
- * @c:		Execution context (unused)
- * @stage:	Migration stage information (unused)
- * @fd:		Migration file descriptor
- *
- * Return: 0 on success, positive error code on failure
- */
-int flow_migrate_source(struct ctx *c, const struct migrate_stage *stage,
-			int fd)
-{
-	uint32_t count = 0;
-	bool first = true;
-	union flow *flow;
-	int rc;
-
-	(void)c;
-	(void)stage;
-
-	/* If we don't have a repair helper, we can't migrate TCP flows */
-	if (c->fd_repair >= 0) {
-		foreach_established_tcp_flow(flow)
-			count++;
-	}
-
-	count = htonl(count);
-	if (write_all_buf(fd, &count, sizeof(count))) {
-		rc = errno;
-		err_perror("Can't send flow count (%u)", ntohl(count));
-		return flow_migrate_source_rollback(c, FLOW_MAX, rc);
-	}
-
-	debug("Sending %u flows", ntohl(count));
-
-	if (!count)
-		return 0;
-
-	/* Dump and send information that can be stored in the flow table.
-	 *
-	 * Limited rollback options here: if we fail to transfer any data (that
-	 * is, on the first flow), undo everything and resume. Otherwise, the
-	 * stream might now be inconsistent, and we might have closed listening
-	 * TCP sockets, so just terminate.
-	 */
-	foreach_established_tcp_flow(flow) {
-		rc = tcp_flow_migrate_source(fd, &flow->tcp);
-		if (rc) {
-			flow_err(flow, "Can't send data: %s",
-				 strerror_(-rc));
-			if (!first)
-				die("Inconsistent migration state, exiting");
-
-			return flow_migrate_source_rollback(c, FLOW_MAX, -rc);
-		}
-
-		first = false;
-	}
-
-	/* And then "extended" data (including window data we saved previously):
-	 * the target needs to set repair mode on sockets before it can set
-	 * this stuff, but it needs sockets (and flows) for that.
-	 *
-	 * This also closes sockets so that the target can start connecting
-	 * theirs: you can't sendmsg() to queues (using the socket) if the
-	 * socket is not connected (EPIPE), not even in repair mode. And the
-	 * target needs to restore queues now because we're sending the data.
-	 *
-	 * So, no rollback here, just try as hard as we can. Tolerate per-flow
-	 * failures but not if the stream might be inconsistent (reported here
-	 * as EIO).
-	 */
-	foreach_established_tcp_flow(flow) {
-		rc = tcp_flow_migrate_source_ext(fd, &flow->tcp);
-		if (rc) {
-			flow_err(flow, "Can't send extended data: %s",
-				 strerror_(-rc));
-
-			if (rc == -EIO)
-				die("Inconsistent migration state, exiting");
-		}
-	}
-
-	return 0;
-}
-
-/**
- * flow_migrate_target() - Receive flows and insert in flow table
- * @c:		Execution context
- * @stage:	Migration stage information (unused)
- * @fd:		Migration file descriptor
- *
- * Return: 0 on success, positive error code on failure
- */
-int flow_migrate_target(struct ctx *c, const struct migrate_stage *stage,
-			int fd)
-{
-	uint32_t count;
-	unsigned i;
-	int rc;
-
-	(void)stage;
-
-	if (read_all_buf(fd, &count, sizeof(count)))
-		return errno;
-
-	count = ntohl(count);
-	debug("Receiving %u flows", count);
-
-	if (!count)
-		return 0;
-
-	repair_wait(c);
-
-	if ((rc = flow_migrate_repair_all(c, true)))
-		return -rc;
-
-	repair_flush(c);
-
-	/* TODO: flow header with type, instead? */
-	for (i = 0; i < count; i++) {
-		rc = tcp_flow_migrate_target(c, fd);
-		if (rc) {
-			flow_dbg(FLOW(i), "Migration data failure, abort: %s",
-				 strerror_(-rc));
-			return -rc;
-		}
-	}
-
-	repair_flush(c);
-
-	for (i = 0; i < count; i++) {
-		rc = tcp_flow_migrate_target_ext(c, &flowtab[i].tcp, fd);
-		if (rc) {
-			flow_dbg(FLOW(i), "Migration data failure, abort: %s",
-				 strerror_(-rc));
-			return -rc;
-		}
-	}
-
-	return 0;
-}
-
 /**
 * flow_init() - Initialise flow related data structures
 */
--- a/flow.h
+++ b/flow.h
@ -243,27 +243,18 @@ flow_sidx_t flow_lookup_af(const struct ctx *c,
 			   const void *eaddr, const void *oaddr,
 			   in_port_t eport, in_port_t oport);
 flow_sidx_t flow_lookup_sa(const struct ctx *c, uint8_t proto, uint8_t pif,
-			   const void *esa,
-			   const union inany_addr *oaddr, in_port_t oport);
+			   const void *esa, in_port_t oport);

 union flow;

 void flow_init(void);
 void flow_defer_handler(const struct ctx *c, const struct timespec *now);
-int flow_migrate_source_early(struct ctx *c, const struct migrate_stage *stage,
-			      int fd);
-int flow_migrate_source_pre(struct ctx *c, const struct migrate_stage *stage,
-			    int fd);
-int flow_migrate_source(struct ctx *c, const struct migrate_stage *stage,
-			int fd);
-int flow_migrate_target(struct ctx *c, const struct migrate_stage *stage,
-			int fd);

-void flow_log_(const struct flow_common *f, bool newline, int pri,
-	       const char *fmt, ...)
-	__attribute__((format(printf, 4, 5)));
+void flow_log_(const struct flow_common *f, int pri, const char *fmt, ...)
+	__attribute__((format(printf, 3, 4)));
+
+#define flow_log(f_, pri, ...)	flow_log_(&(f_)->f, (pri), __VA_ARGS__)

-#define flow_log(f_, pri, ...)	flow_log_(&(f_)->f, true, (pri), __VA_ARGS__)
 #define flow_dbg(f, ...)	flow_log((f), LOG_DEBUG, __VA_ARGS__)
 #define flow_err(f, ...)	flow_log((f), LOG_ERR, __VA_ARGS__)

@ -273,16 +264,6 @@ void flow_log_(const struct flow_common *f, bool newline, int pri,
 			flow_dbg((f), __VA_ARGS__);			\
 	} while (0)

-#define flow_log_perror_(f, pri, ...)					\
-	do {								\
-		int errno_ = errno;					\
-		flow_log_((f), false, (pri), __VA_ARGS__);		\
-		logmsg(true, true, (pri), ": %s", strerror_(errno_));	\
-	} while (0)
-
-#define flow_dbg_perror(f_, ...) flow_log_perror_(&(f_)->f, LOG_DEBUG, __VA_ARGS__)
-#define flow_perror(f_, ...)	flow_log_perror_(&(f_)->f, LOG_ERR, __VA_ARGS__)
-
 void flow_log_details_(const struct flow_common *f, int pri,
 		       enum flow_state state);
 #define flow_log_details(f_, pri) \
--- a/flow_table.h
+++ b/flow_table.h
@ -50,42 +50,6 @@ extern union flow flowtab[];
 #define flow_foreach_sidei(sidei_) \
 	for ((sidei_) = INISIDE; (sidei_) < SIDES; (sidei_)++)

-
-/**
- * flow_foreach_slot() - Step through each flow table entry
- * @flow:	Takes values of pointer to each flow table entry
- *
- * Includes FREE slots.
- */
-#define flow_foreach_slot(flow)						\
-	for ((flow) = flowtab; FLOW_IDX(flow) < FLOW_MAX; (flow)++)
-
-/**
- * flow_foreach() - Step through each active flow
- * @flow:	Takes values of pointer to each active flow
- */
-#define flow_foreach(flow)						\
-	flow_foreach_slot((flow))					\
-		if ((flow)->f.state == FLOW_STATE_FREE)			\
-			(flow) += (flow)->free.n - 1;			\
-		else if ((flow)->f.state != FLOW_STATE_ACTIVE) {	\
-			flow_err((flow), "Bad flow state during traversal"); \
-			continue;					\
-		} else
-
-/**
- * flow_foreach_of_type() - Step through each active flow of given type
- * @flow:	Takes values of pointer to each flow
- * @type_:	Type of flow to traverse
- */
-#define flow_foreach_of_type(flow, type_)				\
-	flow_foreach((flow))						\
-	if ((flow)->f.type != (type_))					\
-			/* NOLINTNEXTLINE(bugprone-branch-clone) */	\
-			continue;					\
-		else
-
-
 /** flow_idx() - Index of flow from common structure
 * @f:	Common flow fields pointer
 *
@ -197,16 +161,15 @@ const struct flowside *flow_initiate_af(union flow *flow, uint8_t pif,
 					sa_family_t af,
 					const void *saddr, in_port_t sport,
 					const void *daddr, in_port_t dport);
-struct flowside *flow_initiate_sa(union flow *flow, uint8_t pif,
-				  const union sockaddr_inany *ssa,
-				  const union inany_addr *daddr,
-				  in_port_t dport);
+const struct flowside *flow_initiate_sa(union flow *flow, uint8_t pif,
+					const union sockaddr_inany *ssa,
+					in_port_t dport);
 const struct flowside *flow_target_af(union flow *flow, uint8_t pif,
 				      sa_family_t af,
 				      const void *saddr, in_port_t sport,
 				      const void *daddr, in_port_t dport);
-struct flowside *flow_target(const struct ctx *c, union flow *flow,
-			     uint8_t proto);
+const struct flowside *flow_target(const struct ctx *c, union flow *flow,
+				   uint8_t proto);

 union flow *flow_set_type(union flow *flow, enum flow_type type);
 #define FLOW_SET_TYPE(flow_, t_, var_)	(&flow_set_type((flow_), (t_))->var_)
--- a/fwd.c
+++ b/fwd.c
@ -443,7 +443,7 @@ uint8_t fwd_nat_from_host(const struct ctx *c, uint8_t proto,
 	else if (proto == IPPROTO_UDP)
 		tgt->eport += c->udp.fwd_in.delta[tgt->eport];

-	if (!c->no_splice && inany_is_loopback(&ini->eaddr) &&
+	if (c->mode == MODE_PASTA && inany_is_loopback(&ini->eaddr) &&
 	    (proto == IPPROTO_TCP || proto == IPPROTO_UDP)) {
 		/* spliceable */

--- a/hooks/pre-push
+++ b/hooks/pre-push
@ -56,7 +56,6 @@ cd ..
 make pkgs
 scp passt passt.avx2 passt.1 qrap qrap.1	"${USER_HOST}:${BIN}"
 scp pasta pasta.avx2 pasta.1			"${USER_HOST}:${BIN}"
-scp passt-repair passt-repair.1			"${USER_HOST}:${BIN}"

 ssh "${USER_HOST}" 				"rm -f ${BIN}/*.deb"
 ssh "${USER_HOST}"				"rm -f ${BIN}/*.rpm"
--- a/icmp.c
+++ b/icmp.c
@ -85,7 +85,7 @@ void icmp_sock_handler(const struct ctx *c, union epoll_ref ref)

 	n = recvfrom(ref.fd, buf, sizeof(buf), 0, &sr.sa, &sl);
 	if (n < 0) {
-		flow_perror(pingf, "recvfrom() error");
+		flow_err(pingf, "recvfrom() error: %s", strerror(errno));
 		return;
 	}

@ -150,7 +150,7 @@ unexpected:
 static void icmp_ping_close(const struct ctx *c,
 			    const struct icmp_ping_flow *pingf)
 {
-	epoll_del(c, pingf->sock);
+	epoll_ctl(c->epollfd, EPOLL_CTL_DEL, pingf->sock, NULL);
 	close(pingf->sock);
 	flow_hash_remove(c, FLOW_SIDX(pingf, INISIDE));
 }
@ -300,7 +300,8 @@ int icmp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af,

 	pif_sockaddr(c, &sa, &sl, PIF_HOST, &tgt->eaddr, 0);
 	if (sendto(pingf->sock, pkt, l4len, MSG_NOSIGNAL, &sa.sa, sl) < 0) {
-		flow_dbg_perror(pingf, "failed to relay request to socket");
+		flow_dbg(pingf, "failed to relay request to socket: %s",
+			 strerror(errno));
 	} else {
 		flow_dbg(pingf,
 			 "echo request to socket, ID: %"PRIu16", seq: %"PRIu16,
--- a/inany.h
+++ b/inany.h
@ -252,8 +252,7 @@ static inline void inany_from_sockaddr(union inany_addr *aa, in_port_t *port,
 		*port = ntohs(sa->sa4.sin_port);
 	} else {
 		/* Not valid to call with other address families */
-		ASSERT_WITH_MSG(0, "Unexpected sockaddr family: %u",
-				sa->sa_family);
+		ASSERT(0);
 	}
 }

--- a/iov.c
+++ b/iov.c
@ -155,95 +155,3 @@ size_t iov_size(const struct iovec *iov, size_t iov_cnt)

 	return len;
 }
-
-/**
- * iov_tail_prune() - Remove any unneeded buffers from an IOV tail
- * @tail:	IO vector tail (modified)
- *
- * If an IOV tail's offset is large enough, it may not include any bytes from
- * the first (or first several) buffers in the underlying IO vector.  Modify the
- * tail's representation so it contains the same logical bytes, but only
- * includes buffers that are actually needed.  This will avoid stepping through
- * unnecessary elements of the underlying IO vector on future operations.
- *
- * Return:	true if the tail still contains any bytes, otherwise false
- */
-bool iov_tail_prune(struct iov_tail *tail)
-{
-	size_t i;
-
-	i = iov_skip_bytes(tail->iov, tail->cnt, tail->off, &tail->off);
-	tail->iov += i;
-	tail->cnt -= i;
-
-	return !!tail->cnt;
-}
-
-/**
- * iov_tail_size - Calculate the total size of an IO vector tail
- * @tail:	IO vector tail
- *
- * Returns:    The total size in bytes.
- */
-size_t iov_tail_size(struct iov_tail *tail)
-{
-	iov_tail_prune(tail);
-	return iov_size(tail->iov, tail->cnt) - tail->off;
-}
-
-/**
- * iov_peek_header_() - Get pointer to a header from an IOV tail
- * @tail:	IOV tail to get header from
- * @len:	Length of header to get, in bytes
- * @align:	Required alignment of header, in bytes
- *
- * @tail may be pruned, but will represent the same bytes as before.
- *
- * Returns: Pointer to the first @len logical bytes of the tail, NULL if that
- *	    overruns the IO vector, is not contiguous or doesn't have the
- *	    requested alignment.
- */
-/* cppcheck-suppress [staticFunction,unmatchedSuppression] */
-void *iov_peek_header_(struct iov_tail *tail, size_t len, size_t align)
-{
-	char *p;
-
-	if (!iov_tail_prune(tail))
-		return NULL; /* Nothing left */
-
-	if (tail->off + len < tail->off)
-		return NULL; /* Overflow */
-
-	if (tail->off + len > tail->iov[0].iov_len)
-		return NULL; /* Not contiguous */
-
-	p = (char *)tail->iov[0].iov_base + tail->off;
-	if ((uintptr_t)p % align)
-		return NULL; /* not aligned */
-
-	return p;
-}
-
-/**
- * iov_remove_header_() - Remove a header from an IOV tail
- * @tail:	IOV tail to remove header from (modified)
- * @len:	Length of header to remove, in bytes
- * @align:	Required alignment of header, in bytes
- *
- * On success, @tail is updated so that it longer includes the bytes of the
- * returned header.
- *
- * Returns: Pointer to the first @len logical bytes of the tail, NULL if that
- *	    overruns the IO vector, is not contiguous or doesn't have the
- *	    requested alignment.
- */
-void *iov_remove_header_(struct iov_tail *tail, size_t len, size_t align)
-{
-	char *p = iov_peek_header_(tail, len, align);
-
-	if (!p)
-		return NULL;
-
-	tail->off = tail->off + len;
-	return p;
-}
--- a/iov.h
+++ b/iov.h
@ -28,80 +28,4 @@ size_t iov_from_buf(const struct iovec *iov, size_t iov_cnt,
 size_t iov_to_buf(const struct iovec *iov, size_t iov_cnt,
                  size_t offset, void *buf, size_t bytes);
 size_t iov_size(const struct iovec *iov, size_t iov_cnt);
-
-/*
- * DOC: Theory of Operation, struct iov_tail
- *
- * Sometimes a single logical network frame is split across multiple buffers,
- * represented by an IO vector (struct iovec[]).  We often want to process this
- * one header / network layer at a time.  So, it's useful to maintain a "tail"
- * of the vector representing the parts we haven't yet extracted.
- *
- * The headers we extract need not line up with buffer boundaries (though we do
- * assume they're contiguous within a single buffer for now).  So, we could
- * represent that tail as another struct iovec[], but that would mean copying
- * the whole array of struct iovecs, just so we can adjust the offset and length
- * on the first one.
- *
- * So, instead represent the tail as pointer into an existing struct iovec[],
- * with an explicit offset for where the "tail" starts within it.  If we extract
- * enough headers that some buffers of the original vector no longer contain
- * part of the tail, we (lazily) advance our struct iovec * to the first buffer
- * we still need, and adjust the vector length and offset to match.
- */
-
-/**
- * struct iov_tail - An IO vector which may have some headers logically removed
- * @iov:	IO vector
- * @cnt:	Number of entries in @iov
- * @off:	Current offset in @iov
- */
-struct iov_tail {
-	const struct iovec *iov;
-	size_t cnt, off;
-};
-
-/**
- * IOV_TAIL() - Create a new IOV tail
- * @iov_:	IO vector to create tail from
- * @cnt_:	Length of the IO vector at @iov_
- * @off_:	Byte offset in the IO vector where the tail begins
- */
-#define IOV_TAIL(iov_, cnt_, off_) \
-	(struct iov_tail){ .iov = (iov_), .cnt = (cnt_), .off = (off_) }
-
-bool iov_tail_prune(struct iov_tail *tail);
-size_t iov_tail_size(struct iov_tail *tail);
-void *iov_peek_header_(struct iov_tail *tail, size_t len, size_t align);
-void *iov_remove_header_(struct iov_tail *tail, size_t len, size_t align);
-
-/**
- * IOV_PEEK_HEADER() - Get typed pointer to a header from an IOV tail
- * @tail_:	IOV tail to get header from
- * @type_:	Data type of the header
- *
- * @tail_ may be pruned, but will represent the same bytes as before.
- *
- * Returns: Pointer of type (@type_ *) located at the start of @tail_, NULL if
- *          we can't get a contiguous and aligned pointer.
- */
-#define IOV_PEEK_HEADER(tail_, type_)					\
-	((type_ *)(iov_peek_header_((tail_),				\
-				    sizeof(type_), __alignof__(type_))))
-
-/**
- * IOV_REMOVE_HEADER() - Remove and return typed header from an IOV tail
- * @tail_:	IOV tail to remove header from (modified)
- * @type_:	Data type of the header to remove
- *
- * On success, @tail_ is updated so that it longer includes the bytes of the
- * returned header.
- *
- * Returns: Pointer of type (@type_ *) located at the old start of @tail_, NULL
- *          if we can't get a contiguous and aligned pointer.
- */
-#define IOV_REMOVE_HEADER(tail_, type_)					\
-	((type_ *)(iov_remove_header_((tail_),				\
-				      sizeof(type_), __alignof__(type_))))
-
 #endif /* IOVEC_H */
--- a/ip.h
+++ b/ip.h
@ -36,14 +36,13 @@
 		.tos		= 0,					\
 		.tot_len	= 0,					\
 		.id		= 0,					\
-		.frag_off	= htons(IP_DF), 			\
+		.frag_off	= 0,					\
 		.ttl		= 0xff,					\
 		.protocol	= (proto),				\
 		.saddr		= 0,					\
 		.daddr		= 0,					\
 	}
 #define L2_BUF_IP4_PSUM(proto)	((uint32_t)htons_constant(0x4500) +	\
-				 (uint32_t)htons_constant(IP_DF) +	\
 				 (uint32_t)htons(0xff00 | (proto)))


@ -91,30 +90,6 @@ struct ipv6_opt_hdr {
 	 */
 } __attribute__((packed));	/* required for some archs */

-/**
- * ip6_set_flow_lbl() - Set flow label in an IPv6 header
- * @ip6h:	Pointer to IPv6 header, updated
- * @flow:	Set @ip6h flow label to the low 20 bits of this integer
- */
-static inline void ip6_set_flow_lbl(struct ipv6hdr *ip6h, uint32_t flow)
-{
-	ip6h->flow_lbl[0] = (flow >> 16) & 0xf;
-	ip6h->flow_lbl[1] = (flow >> 8) & 0xff;
-	ip6h->flow_lbl[2] = (flow >> 0) & 0xff;
-}
-
-/** ip6_get_flow_lbl() - Get flow label from an IPv6 header
- * @ip6h:	Pointer to IPv6 header
- *
- * Return: flow label from @ip6h as an integer (<= 20 bits)
- */
-static inline uint32_t ip6_get_flow_lbl(const struct ipv6hdr *ip6h)
-{
-	return (ip6h->flow_lbl[0] & 0xf) << 16 |
-		ip6h->flow_lbl[1] << 8 |
-		ip6h->flow_lbl[2];
-}
-
 char *ipv6_l4hdr(const struct pool *p, int idx, size_t offset, uint8_t *proto,
 		 size_t *dlen);

@ -126,14 +101,4 @@ static const struct in6_addr in6addr_ll_all_nodes = {
 	},
 };

-/* IPv4 Limited Broadcast (RFC 919, Section 7), 255.255.255.255 */
-static const struct in_addr in4addr_broadcast = { 0xffffffff };
-
-#ifndef IPV4_MIN_MTU
-#define IPV4_MIN_MTU		68
-#endif
-#ifndef IPV6_MIN_MTU
-#define IPV6_MIN_MTU		1280
-#endif
-
 #endif /* IP_H */
--- a/log.c
+++ b/log.c
@ -56,7 +56,7 @@ bool		log_stderr = true;	/* Not daemonised, no shell spawned */
 *
 * Return: pointer to @now, or NULL if there was an error retrieving the time
 */
-static const struct timespec *logtime(struct timespec *ts)
+const struct timespec *logtime(struct timespec *ts)
 {
 	if (clock_gettime(CLOCK_MONOTONIC, ts))
 		return NULL;
@ -249,30 +249,6 @@ static void logfile_write(bool newline, bool cont, int pri,
 		log_written += n;
 }

-/**
- * passt_vsyslog() - vsyslog() implementation not using heap memory
- * @newline:	Append newline at the end of the message, if missing
- * @pri:	Facility and level map, same as priority for vsyslog()
- * @format:	Same as vsyslog() format
- * @ap:		Same as vsyslog() ap
- */
-static void passt_vsyslog(bool newline, int pri, const char *format, va_list ap)
-{
-	char buf[BUFSIZ];
-	int n;
-
-	/* Send without timestamp, the system logger should add it */
-	n = snprintf(buf, BUFSIZ, "<%i> %s: ", pri, log_ident);
-
-	n += vsnprintf(buf + n, BUFSIZ - n, format, ap);
-
-	if (newline && format[strlen(format)] != '\n')
-		n += snprintf(buf + n, BUFSIZ - n, "\n");
-
-	if (log_sock >= 0 && send(log_sock, buf, n, 0) != n && log_stderr)
-		FPRINTF(stderr, "Failed to send %i bytes to syslog\n", n);
-}
-
 /**
 * vlogmsg() - Print or send messages to log or output files as configured
 * @newline:	Append newline at the end of the message, if missing
@ -281,7 +257,6 @@ static void passt_vsyslog(bool newline, int pri, const char *format, va_list ap)
 * @format:	Message
 * @ap:		Variable argument list
 */
-/* cppcheck-suppress [staticFunction,unmatchedSuppression] */
 void vlogmsg(bool newline, bool cont, int pri, const char *format, va_list ap)
 {
 	bool debug_print = (log_mask & LOG_MASK(LOG_DEBUG)) && log_file == -1;
@ -347,7 +322,7 @@ void logmsg_perror(int pri, const char *format, ...)
 	vlogmsg(false, false, pri, format, ap);
 	va_end(ap);

-	logmsg(true, true, pri, ": %s", strerror_(errno_copy));
+	logmsg(true, true, pri, ": %s", strerror(errno_copy));
 }

 /**
@ -398,6 +373,30 @@ void __setlogmask(int mask)
 	setlogmask(mask);
 }

+/**
+ * passt_vsyslog() - vsyslog() implementation not using heap memory
+ * @newline:	Append newline at the end of the message, if missing
+ * @pri:	Facility and level map, same as priority for vsyslog()
+ * @format:	Same as vsyslog() format
+ * @ap:		Same as vsyslog() ap
+ */
+void passt_vsyslog(bool newline, int pri, const char *format, va_list ap)
+{
+	char buf[BUFSIZ];
+	int n;
+
+	/* Send without timestamp, the system logger should add it */
+	n = snprintf(buf, BUFSIZ, "<%i> %s: ", pri, log_ident);
+
+	n += vsnprintf(buf + n, BUFSIZ - n, format, ap);
+
+	if (newline && format[strlen(format)] != '\n')
+		n += snprintf(buf + n, BUFSIZ - n, "\n");
+
+	if (log_sock >= 0 && send(log_sock, buf, n, 0) != n && log_stderr)
+		FPRINTF(stderr, "Failed to send %i bytes to syslog\n", n);
+}
+
 /**
 * logfile_init() - Open log file and write header with PID, version, path
 * @name:	Identifier for header: passt or pasta
--- a/log.h
+++ b/log.h
@ -32,13 +32,13 @@ void logmsg_perror(int pri, const char *format, ...)
 #define die(...)							\
 	do {								\
 		err(__VA_ARGS__);					\
-		_exit(EXIT_FAILURE);					\
+		exit(EXIT_FAILURE);					\
 	} while (0)

 #define die_perror(...)							\
 	do {								\
 		err_perror(__VA_ARGS__);				\
-		_exit(EXIT_FAILURE);					\
+		exit(EXIT_FAILURE);					\
 	} while (0)

 extern int log_trace;
@ -55,6 +55,7 @@ void trace_init(int enable);

 void __openlog(const char *ident, int option, int facility);
 void logfile_init(const char *name, const char *path, size_t size);
+void passt_vsyslog(bool newline, int pri, const char *format, va_list ap);
 void __setlogmask(int mask);

 #endif /* LOG_H */
--- a/migrate.c
+++ b/migrate.c
@ -1,304 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-
-/* PASST - Plug A Simple Socket Transport
- *  for qemu/UNIX domain socket mode
- *
- * PASTA - Pack A Subtle Tap Abstraction
- *  for network namespace/tap device mode
- *
- * migrate.c - Migration sections, layout, and routines
- *
- * Copyright (c) 2025 Red Hat GmbH
- * Author: Stefano Brivio <sbrivio@redhat.com>
- */
-
-#include <errno.h>
-#include <sys/uio.h>
-
-#include "util.h"
-#include "ip.h"
-#include "passt.h"
-#include "inany.h"
-#include "flow.h"
-#include "flow_table.h"
-
-#include "migrate.h"
-#include "repair.h"
-
-/* Magic identifier for migration data */
-#define MIGRATE_MAGIC		0xB1BB1D1B0BB1D1B0
-
-/**
- * struct migrate_seen_addrs_v1 - Migratable guest addresses for v1 state stream
- * @addr6:	Observed guest IPv6 address
- * @addr6_ll:	Observed guest IPv6 link-local address
- * @addr4:	Observed guest IPv4 address
- * @mac:	Observed guest MAC address
- */
-struct migrate_seen_addrs_v1 {
-	struct in6_addr addr6;
-	struct in6_addr addr6_ll;
-	struct in_addr addr4;
-	unsigned char mac[ETH_ALEN];
-} __attribute__((packed));
-
-/**
- * seen_addrs_source_v1() - Copy and send guest observed addresses from source
- * @c:		Execution context
- * @stage:	Migration stage, unused
- * @fd:		File descriptor for state transfer
- *
- * Return: 0 on success, positive error code on failure
- */
-/* cppcheck-suppress [constParameterCallback, unmatchedSuppression] */
-static int seen_addrs_source_v1(struct ctx *c,
-				const struct migrate_stage *stage, int fd)
-{
-	struct migrate_seen_addrs_v1 addrs = {
-		.addr6 = c->ip6.addr_seen,
-		.addr6_ll = c->ip6.addr_ll_seen,
-		.addr4 = c->ip4.addr_seen,
-	};
-
-	(void)stage;
-
-	memcpy(addrs.mac, c->guest_mac, sizeof(addrs.mac));
-
-	if (write_all_buf(fd, &addrs, sizeof(addrs)))
-		return errno;
-
-	return 0;
-}
-
-/**
- * seen_addrs_target_v1() - Receive and use guest observed addresses on target
- * @c:		Execution context
- * @stage:	Migration stage, unused
- * @fd:		File descriptor for state transfer
- *
- * Return: 0 on success, positive error code on failure
- */
-static int seen_addrs_target_v1(struct ctx *c,
-				const struct migrate_stage *stage, int fd)
-{
-	struct migrate_seen_addrs_v1 addrs;
-
-	(void)stage;
-
-	if (read_all_buf(fd, &addrs, sizeof(addrs)))
-		return errno;
-
-	c->ip6.addr_seen = addrs.addr6;
-	c->ip6.addr_ll_seen = addrs.addr6_ll;
-	c->ip4.addr_seen = addrs.addr4;
-	memcpy(c->guest_mac, addrs.mac, sizeof(c->guest_mac));
-
-	return 0;
-}
-
-/* Stages for version 2 */
-static const struct migrate_stage stages_v2[] = {
-	{
-		.name = "observed addresses",
-		.source = seen_addrs_source_v1,
-		.target = seen_addrs_target_v1,
-	},
-	{
-		.name = "prepare flows",
-		.source = flow_migrate_source_pre,
-		.target = NULL,
-	},
-	{
-		.name = "transfer flows",
-		.source = flow_migrate_source,
-		.target = flow_migrate_target,
-	},
-	{ 0 },
-};
-
-/* Supported encoding versions, from latest (most preferred) to oldest */
-static const struct migrate_version versions[] = {
-	{ 2,	stages_v2, },
-	/* v1 was released, but not widely used.  It had bad endianness for the
-	 * MSS and omitted timestamps, which meant it usually wouldn't work.
-	 * Therefore we don't attempt to support compatibility with it.
-	 */
-	{ 0 },
-};
-
-/* Current encoding version */
-#define CURRENT_VERSION		(&versions[0])
-
-/**
- * migrate_source() - Migration as source, send state to hypervisor
- * @c:		Execution context
- * @fd:		File descriptor for state transfer
- *
- * Return: 0 on success, positive error code on failure
- */
-static int migrate_source(struct ctx *c, int fd)
-{
-	const struct migrate_version *v = CURRENT_VERSION;
-	const struct migrate_header header = {
-		.magic		= htonll_constant(MIGRATE_MAGIC),
-		.version	= htonl(v->id),
-		.compat_version	= htonl(v->id),
-	};
-	const struct migrate_stage *s;
-	int ret;
-
-	if (write_all_buf(fd, &header, sizeof(header))) {
-		ret = errno;
-		err("Can't send migration header: %s, abort", strerror_(ret));
-		return ret;
-	}
-
-	for (s = v->s; s->name; s++) {
-		if (!s->source)
-			continue;
-
-		debug("Source side migration stage: %s", s->name);
-
-		if ((ret = s->source(c, s, fd))) {
-			err("Source migration stage: %s: %s, abort", s->name,
-			    strerror_(ret));
-			return ret;
-		}
-	}
-
-	return 0;
-}
-
-/**
- * migrate_target_read_header() - Read header in target
- * @fd:		Descriptor for state transfer
- *
- * Return: version structure on success, NULL on failure with errno set
- */
-static const struct migrate_version *migrate_target_read_header(int fd)
-{
-	const struct migrate_version *v;
-	struct migrate_header h;
-	uint32_t id, compat_id;
-
-	if (read_all_buf(fd, &h, sizeof(h)))
-		return NULL;
-
-	id = ntohl(h.version);
-	compat_id = ntohl(h.compat_version);
-
-	debug("Source magic: 0x%016" PRIx64 ", version: %u, compat: %u",
-	      ntohll(h.magic), id, compat_id);
-
-	if (ntohll(h.magic) != MIGRATE_MAGIC || !id || !compat_id) {
-		err("Invalid incoming device state");
-		errno = EINVAL;
-		return NULL;
-	}
-
-	for (v = versions; v->id; v++)
-		if (v->id <= id && v->id >= compat_id)
-			return v;
-
-	errno = ENOTSUP;
-	err("Unsupported device state version: %u", id);
-	return NULL;
-}
-
-/**
- * migrate_target() - Migration as target, receive state from hypervisor
- * @c:		Execution context
- * @fd:		File descriptor for state transfer
- *
- * Return: 0 on success, positive error code on failure
- */
-static int migrate_target(struct ctx *c, int fd)
-{
-	const struct migrate_version *v;
-	const struct migrate_stage *s;
-	int ret;
-
-	if (!(v = migrate_target_read_header(fd)))
-		return errno;
-
-	for (s = v->s; s->name; s++) {
-		if (!s->target)
-			continue;
-
-		debug("Target side migration stage: %s", s->name);
-
-		if ((ret = s->target(c, s, fd))) {
-			err("Target migration stage: %s: %s, abort", s->name,
-			    strerror_(ret));
-			return ret;
-		}
-	}
-
-	return 0;
-}
-
-/**
- * migrate_init() - Set up things necessary for migration
- * @c:		Execution context
- */
-void migrate_init(struct ctx *c)
-{
-	c->device_state_result = -1;
-}
-
-/**
- * migrate_close() - Close migration channel and connection to passt-repair
- * @c:		Execution context
- */
-void migrate_close(struct ctx *c)
-{
-	if (c->device_state_fd != -1) {
-		debug("Closing migration channel, fd: %d", c->device_state_fd);
-		close(c->device_state_fd);
-		c->device_state_fd = -1;
-		c->device_state_result = -1;
-	}
-
-	repair_close(c);
-}
-
-/**
- * migrate_request() - Request a migration of device state
- * @c:		Execution context
- * @fd:		fd to transfer state
- * @target:	Are we the target of the migration?
- */
-void migrate_request(struct ctx *c, int fd, bool target)
-{
-	debug("Migration requested, fd: %d (was %d)", fd, c->device_state_fd);
-
-	if (c->device_state_fd != -1)
-		migrate_close(c);
-
-	c->device_state_fd = fd;
-	c->migrate_target = target;
-}
-
-/**
- * migrate_handler() - Send/receive passt internal state to/from hypervisor
- * @c:		Execution context
- */
-void migrate_handler(struct ctx *c)
-{
-	int rc;
-
-	if (c->device_state_fd < 0)
-		return;
-
-	debug("Handling migration request from fd: %d, target: %d",
-	      c->device_state_fd, c->migrate_target);
-
-	if (c->migrate_target)
-		rc = migrate_target(c, c->device_state_fd);
-	else
-		rc = migrate_source(c, c->device_state_fd);
-
-	migrate_close(c);
-
-	c->device_state_result = rc;
-}
--- a/migrate.h
+++ b/migrate.h
@ -1,51 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later
- * Copyright (c) 2025 Red Hat GmbH
- * Author: Stefano Brivio <sbrivio@redhat.com>
- */
-
-#ifndef MIGRATE_H
-#define MIGRATE_H
-
-/**
- * struct migrate_header - Migration header from source
- * @magic:		0xB1BB1D1B0BB1D1B0, network order
- * @version:		Highest known, target aborts if too old, network order
- * @compat_version:	Lowest version compatible with @version, target aborts
- *			if too new, network order
- */
-struct migrate_header {
-	uint64_t magic;
-	uint32_t version;
-	uint32_t compat_version;
-} __attribute__((packed));
-
-/**
- * struct migrate_stage - Callbacks and parameters for one stage of migration
- * @name:	Stage name (for debugging)
- * @source:	Callback to implement this stage on the source
- * @target:	Callback to implement this stage on the target
- */
-struct migrate_stage {
-	const char *name;
-	int (*source)(struct ctx *c, const struct migrate_stage *stage, int fd);
-	int (*target)(struct ctx *c, const struct migrate_stage *stage, int fd);
-
-	/* Add here separate rollback callbacks if needed */
-};
-
-/**
- * struct migrate_version - Stages for a particular protocol version
- * @id:		Version number, host order
- * @s:		Ordered array of stages, NULL-terminated
- */
-struct migrate_version {
-	uint32_t id;
-	const struct migrate_stage *s;
-};
-
-void migrate_init(struct ctx *c);
-void migrate_close(struct ctx *c);
-void migrate_request(struct ctx *c, int fd, bool target);
-void migrate_handler(struct ctx *c);
-
-#endif /* MIGRATE_H */
--- a/ndp.c
+++ b/ndp.c
@ -256,7 +256,7 @@ static void ndp_ra(const struct ctx *c, const struct in6_addr *dst)

 	ptr = &ra.var[0];

-	if (c->mtu) {
+	if (c->mtu != -1) {
 		struct opt_mtu *mtu = (struct opt_mtu *)ptr;
 		*mtu = (struct opt_mtu) {
 			.header = {
@ -391,7 +391,7 @@ void ndp_timer(const struct ctx *c, const struct timespec *now)
 	time_t max_rtr_adv_interval = DEFAULT_MAX_RTR_ADV_INTERVAL;
 	time_t min_rtr_adv_interval, interval;

-	if (c->fd_tap < 0 || c->no_ra || now->tv_sec < next_ra)
+	if (c->no_ra || now->tv_sec < next_ra)
 		return;

 	/* We must advertise before the route's lifetime expires */
@ -420,13 +420,9 @@ void ndp_timer(const struct ctx *c, const struct timespec *now)
 	interval = min_rtr_adv_interval +
 		random() % (max_rtr_adv_interval - min_rtr_adv_interval);

-	if (!next_ra)
-		goto first;
-
 	info("NDP: sending unsolicited RA, next in %llds", (long long)interval);

 	ndp_ra(c, &in6addr_ll_all_nodes);

-first:
 	next_ra = now->tv_sec + interval;
 }
--- a/netlink.c
+++ b/netlink.c
@ -297,10 +297,6 @@ unsigned int nl_get_ext_if(int s, sa_family_t af)
 		if (!thisifi)
 			continue; /* No interface for this route */

-		/* Skip 'lo': we should test IFF_LOOPBACK, but keep it simple */
-		if (thisifi == 1)
-			continue;
-
 		/* Skip routes to link-local addresses */
 		if (af == AF_INET && dst &&
 		    IN4_IS_PREFIX_LINKLOCAL(dst, rtm->rtm_dst_len))
@ -324,7 +320,7 @@ unsigned int nl_get_ext_if(int s, sa_family_t af)
 	}

 	if (status < 0)
-		warn("netlink: RTM_GETROUTE failed: %s", strerror_(-status));
+		warn("netlink: RTM_GETROUTE failed: %s", strerror(-status));

 	if (defifi) {
 		if (ndef > 1) {
@ -355,7 +351,7 @@ unsigned int nl_get_ext_if(int s, sa_family_t af)
 *
 * Return: true if a gateway was found, false otherwise
 */
-static bool nl_route_get_def_multipath(struct rtattr *rta, void *gw)
+bool nl_route_get_def_multipath(struct rtattr *rta, void *gw)
 {
 	int nh_len = RTA_PAYLOAD(rta);
 	struct rtnexthop *rtnh;
--- a/packet.c
+++ b/packet.c
@ -23,73 +23,51 @@
 #include "log.h"

 /**
- * packet_check_range() - Check if a memory range is valid for a pool
+ * packet_check_range() - Check if a packet memory range is valid
 * @p:		Packet pool
- * @ptr:	Start of desired data range
+ * @offset:	Offset of data range in packet descriptor
 * @len:	Length of desired data range
+ * @start:	Start of the packet descriptor
 * @func:	For tracing: name of calling function
 * @line:	For tracing: caller line of function call
 *
 * Return: 0 if the range is valid, -1 otherwise
 */
-static int packet_check_range(const struct pool *p, const char *ptr, size_t len,
-			      const char *func, int line)
+static int packet_check_range(const struct pool *p, size_t offset, size_t len,
+			      const char *start, const char *func, int line)
 {
-	if (len > PACKET_MAX_LEN) {
-		debug("packet range length %zu (max %zu), %s:%i",
-		      len, PACKET_MAX_LEN, func, line);
-		return -1;
-	}
-
 	if (p->buf_size == 0) {
 		int ret;

-		ret = vu_packet_check_range((void *)p->buf, ptr, len);
+		ret = vu_packet_check_range((void *)p->buf, offset, len, start);

 		if (ret == -1)
-			debug("cannot find region, %s:%i", func, line);
+			trace("cannot find region, %s:%i", func, line);

 		return ret;
 	}

-	if (ptr < p->buf) {
-		debug("packet range start %p before buffer start %p, %s:%i",
-		      (void *)ptr, (void *)p->buf, func, line);
+	if (start < p->buf) {
+		trace("packet start %p before buffer start %p, "
+		      "%s:%i", (void *)start, (void *)p->buf, func, line);
 		return -1;
 	}

-	if (len > p->buf_size) {
-		debug("packet range length %zu larger than buffer %zu, %s:%i",
-		      len, p->buf_size, func, line);
-		return -1;
-	}
-
-	if ((size_t)(ptr - p->buf) > p->buf_size - len) {
-		debug("packet range %p, len %zu after buffer end %p, %s:%i",
-		      (void *)ptr, len, (void *)(p->buf + p->buf_size),
-		      func, line);
+	if (start + len + offset > p->buf + p->buf_size) {
+		trace("packet offset plus length %lu from size %lu, "
+		      "%s:%i", start - p->buf + len + offset,
+		      p->buf_size, func, line);
 		return -1;
 	}

 	return 0;
 }
-/**
- * pool_full() - Is a packet pool full?
- * @p:		Pointer to packet pool
- *
- * Return: true if the pool is full, false if more packets can be added
- */
-bool pool_full(const struct pool *p)
-{
-	return p->count >= p->size;
-}
-
 /**
 * packet_add_do() - Add data as packet descriptor to given pool
 * @p:		Existing pool
 * @len:	Length of new descriptor
 * @start:	Start of data
- * @func:	For tracing: name of calling function
+ * @func:	For tracing: name of calling function, NULL means no trace()
 * @line:	For tracing: caller line of function call
 */
 void packet_add_do(struct pool *p, size_t len, const char *start,
@ -97,63 +75,26 @@ void packet_add_do(struct pool *p, size_t len, const char *start,
 {
 	size_t idx = p->count;

-	if (pool_full(p)) {
-		debug("add packet index %zu to pool with size %zu, %s:%i",
+	if (idx >= p->size) {
+		trace("add packet index %zu to pool with size %zu, %s:%i",
 		      idx, p->size, func, line);
 		return;
 	}

-	if (packet_check_range(p, start, len, func, line))
+	if (packet_check_range(p, 0, len, start, func, line))
 		return;

+	if (len > UINT16_MAX) {
+		trace("add packet length %zu, %s:%i", len, func, line);
+		return;
+	}
+
 	p->pkt[idx].iov_base = (void *)start;
 	p->pkt[idx].iov_len = len;

 	p->count++;
 }

-/**
- * packet_get_try_do() - Get data range from packet descriptor from given pool
- * @p:		Packet pool
- * @idx:	Index of packet descriptor in pool
- * @offset:	Offset of data range in packet descriptor
- * @len:	Length of desired data range
- * @left:	Length of available data after range, set on return, can be NULL
- * @func:	For tracing: name of calling function
- * @line:	For tracing: caller line of function call
- *
- * Return: pointer to start of data range, NULL on invalid range or descriptor
- */
-void *packet_get_try_do(const struct pool *p, size_t idx, size_t offset,
-			size_t len, size_t *left, const char *func, int line)
-{
-	char *ptr;
-
-	ASSERT_WITH_MSG(p->count <= p->size,
-			"Corrupt pool count: %zu, size: %zu, %s:%i",
-			p->count, p->size, func, line);
-
-	if (idx >= p->count) {
-		debug("packet %zu from pool count: %zu, %s:%i",
-		      idx, p->count, func, line);
-		return NULL;
-	}
-
-	if (offset > p->pkt[idx].iov_len ||
-	    len > (p->pkt[idx].iov_len - offset))
-		return NULL;
-
-	ptr = (char *)p->pkt[idx].iov_base + offset;
-
-	ASSERT_WITH_MSG(!packet_check_range(p, ptr, len, func, line),
-			"Corrupt packet pool, %s:%i", func, line);
-
-	if (left)
-		*left = p->pkt[idx].iov_len - offset - len;
-
-	return ptr;
-}
-
 /**
 * packet_get_do() - Get data range from packet descriptor from given pool
 * @p:		Packet pool
@ -161,24 +102,47 @@ void *packet_get_try_do(const struct pool *p, size_t idx, size_t offset,
 * @offset:	Offset of data range in packet descriptor
 * @len:	Length of desired data range
 * @left:	Length of available data after range, set on return, can be NULL
- * @func:	For tracing: name of calling function
+ * @func:	For tracing: name of calling function, NULL means no trace()
 * @line:	For tracing: caller line of function call
 *
- * Return: as packet_get_try_do() but log a trace message when returning NULL
+ * Return: pointer to start of data range, NULL on invalid range or descriptor
 */
-void *packet_get_do(const struct pool *p, const size_t idx,
-		    size_t offset, size_t len, size_t *left,
-		    const char *func, int line)
+void *packet_get_do(const struct pool *p, size_t idx, size_t offset,
+		    size_t len, size_t *left, const char *func, int line)
 {
-	void *r = packet_get_try_do(p, idx, offset, len, left, func, line);
-
-	if (!r) {
-		trace("missing packet data length %zu, offset %zu from "
-		      "length %zu, %s:%i",
-		      len, offset, p->pkt[idx].iov_len, func, line);
+	if (idx >= p->size || idx >= p->count) {
+		if (func) {
+			trace("packet %zu from pool size: %zu, count: %zu, "
+			      "%s:%i", idx, p->size, p->count, func, line);
+		}
+		return NULL;
 	}

-	return r;
+	if (len > UINT16_MAX) {
+		if (func) {
+			trace("packet data length %zu, %s:%i",
+			      len, func, line);
+		}
+		return NULL;
+	}
+
+	if (len + offset > p->pkt[idx].iov_len) {
+		if (func) {
+			trace("data length %zu, offset %zu from length %zu, "
+			      "%s:%i", len, offset, p->pkt[idx].iov_len,
+			      func, line);
+		}
+		return NULL;
+	}
+
+	if (packet_check_range(p, offset, len, p->pkt[idx].iov_base,
+			       func, line))
+		return NULL;
+
+	if (left)
+		*left = p->pkt[idx].iov_len - offset - len;
+
+	return (char *)p->pkt[idx].iov_base + offset;
 }

 /**
--- a/packet.h
+++ b/packet.h
@ -6,11 +6,6 @@
 #ifndef PACKET_H
 #define PACKET_H

-#include <stdbool.h>
-
-/* Maximum size of a single packet stored in pool, including headers */
-#define PACKET_MAX_LEN	((size_t)UINT16_MAX)
-
 /**
 * struct pool - Generic pool of packets stored in a buffer
 * @buf:	Buffer storing packet descriptors,
@ -26,29 +21,27 @@ struct pool {
 	size_t buf_size;
 	size_t size;
 	size_t count;
-	struct iovec pkt[];
+	struct iovec pkt[1];
 };

-int vu_packet_check_range(void *buf, const char *ptr, size_t len);
+int vu_packet_check_range(void *buf, size_t offset, size_t len,
+			  const char *start);
 void packet_add_do(struct pool *p, size_t len, const char *start,
 		   const char *func, int line);
-void *packet_get_try_do(const struct pool *p, const size_t idx,
-			size_t offset, size_t len, size_t *left,
-			const char *func, int line);
 void *packet_get_do(const struct pool *p, const size_t idx,
 		    size_t offset, size_t len, size_t *left,
 		    const char *func, int line);
-bool pool_full(const struct pool *p);
 void pool_flush(struct pool *p);

 #define packet_add(p, len, start)					\
 	packet_add_do(p, len, start, __func__, __LINE__)

-#define packet_get_try(p, idx, offset, len, left)			\
-	packet_get_try_do(p, idx, offset, len, left, __func__, __LINE__)
 #define packet_get(p, idx, offset, len, left)				\
 	packet_get_do(p, idx, offset, len, left, __func__, __LINE__)

+#define packet_get_try(p, idx, offset, len, left)			\
+	packet_get_do(p, idx, offset, len, left, NULL, 0)
+
 #define PACKET_POOL_DECL(_name, _size, _buf)				\
 struct _name ## _t {							\
 	char *buf;							\
--- a/passt-repair.1
+++ b/passt-repair.1
@ -1,74 +0,0 @@
-.\" SPDX-License-Identifier: GPL-2.0-or-later
-.\" Copyright (c) 2025 Red Hat GmbH
-.\" Author: Stefano Brivio <sbrivio@redhat.com>
-.TH passt-repair 1
-
-.SH NAME
-.B passt-repair
-\- Helper setting TCP_REPAIR socket options for \fBpasst\fR(1)
-
-.SH SYNOPSIS
-.B passt-repair
-\fIPATH\fR
-
-.SH DESCRIPTION
-
-.B passt-repair
-is a privileged helper setting and clearing repair mode on TCP sockets on behalf
-of \fBpasst\fR(1), as instructed via single-byte commands over a UNIX domain
-socket.
-
-It can be used to migrate TCP connections between guests without granting
-additional capabilities to \fBpasst\fR(1) itself: to migrate TCP connections,
-\fBpasst\fR(1) leverages repair mode, which needs the \fBCAP_NET_ADMIN\fR
-capability (see \fBcapabilities\fR(7)) to be set or cleared.
-
-If \fIPATH\fR represents a UNIX domain socket, \fBpasst-repair\fR(1) attempts to
-connect to it. If it is a directory, \fBpasst-repair\fR(1) waits until a file
-ending with \fI.repair\fR appears in it, and then attempts to connect to it.
-
-.SH PROTOCOL
-
-\fBpasst-repair\fR(1) connects to \fBpasst\fR(1) using the socket specified via
-\fI--repair-path\fR option in \fBpasst\fR(1) itself. By default, the name is the
-same as the UNIX domain socket used for guest communication, suffixed by
-\fI.repair\fR.
-
-The messages consist of one 8-bit signed integer that can be \fITCP_REPAIR_ON\fR
-(1), \fITCP_REPAIR_OFF\fR (0), or \fITCP_REPAIR_OFF_NO_WP\fR (-1), as defined by
-the Linux kernel user API, and one to SCM_MAX_FD (253) sockets as SCM_RIGHTS
-(see \fBunix\fR(7)) ancillary message, sent by the server, \fBpasst\fR(1).
-
-The client, \fBpasst-repair\fR(1), replies with the same byte (and no ancillary
-message) to indicate success, and closes the connection on failure.
-
-The server closes the connection on error or completion.
-
-.SH NOTES
-
-\fBpasst-repair\fR(1) can be granted the \fBCAP_NET_ADMIN\fR capability
-(preferred, as it limits privileges to the strictly necessary ones), or it can
-be run as root.
-
-.SH AUTHOR
-
-Stefano Brivio <sbrivio@redhat.com>.
-
-.SH REPORTING BUGS
-
-Please report issues on the bug tracker at https://bugs.passt.top/, or
-send a message to the passt-user@passt.top mailing list, see
-https://lists.passt.top/.
-
-.SH COPYRIGHT
-
-Copyright (c) 2025 Red Hat GmbH.
-
-\fBpasst-repair\fR is free software: you can redistribute them and/or modify
-them under the terms of the GNU General Public License as published by the Free
-Software Foundation, either version 2 of the License, or (at your option) any
-later version. 
-
-.SH SEE ALSO
-
-\fBpasst\fR(1), \fBqemu\fR(1), \fBcapabilities\fR(7), \fBunix\fR(7).
--- a/passt-repair.c
+++ b/passt-repair.c
@ -1,266 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-
-/* PASST - Plug A Simple Socket Transport
- *  for qemu/UNIX domain socket mode
- *
- * PASTA - Pack A Subtle Tap Abstraction
- *  for network namespace/tap device mode
- *
- * passt-repair.c - Privileged helper to set/clear TCP_REPAIR on sockets
- *
- * Copyright (c) 2025 Red Hat GmbH
- * Author: Stefano Brivio <sbrivio@redhat.com>
- *
- * Connect to passt via UNIX domain socket, receive sockets via SCM_RIGHTS along
- * with byte commands mapping to TCP_REPAIR values, and switch repair mode on or
- * off. Reply by echoing the command. Exit on EOF.
- */
-
-#include <sys/inotify.h>
-#include <sys/prctl.h>
-#include <sys/types.h>
-#include <sys/socket.h>
-#include <sys/stat.h>
-#include <sys/un.h>
-#include <errno.h>
-#include <stdbool.h>
-#include <stddef.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <limits.h>
-#include <unistd.h>
-#include <netdb.h>
-
-#include <netinet/tcp.h>
-
-#include <linux/audit.h>
-#include <linux/capability.h>
-#include <linux/filter.h>
-#include <linux/seccomp.h>
-
-#include "seccomp_repair.h"
-
-#define SCM_MAX_FD 253 /* From Linux kernel (include/net/scm.h), not in UAPI */
-#define REPAIR_EXT		".repair"
-#define REPAIR_EXT_LEN		strlen(REPAIR_EXT)
-
-/**
- * main() - Entry point and whole program with loop
- * @argc:	Argument count, must be 2
- * @argv:	Argument: path of UNIX domain socket to connect to
- *
- * Return: 0 on success (EOF), 1 on error, 2 on usage error
- *
- * #syscalls:repair connect setsockopt write close exit_group
- * #syscalls:repair socket s390x:socketcall i686:socketcall
- * #syscalls:repair recvfrom recvmsg arm:recv ppc64le:recv
- * #syscalls:repair sendto sendmsg arm:send ppc64le:send
- * #syscalls:repair stat|statx stat64|statx statx
- * #syscalls:repair fstat|fstat64 newfstatat|fstatat64
- * #syscalls:repair inotify_init1 inotify_add_watch
- */
-int main(int argc, char **argv)
-{
-	char buf[CMSG_SPACE(sizeof(int) * SCM_MAX_FD)]
-	     __attribute__ ((aligned(__alignof__(struct cmsghdr))));
-	struct sockaddr_un a = { AF_UNIX, "" };
-	int fds[SCM_MAX_FD], s, ret, i, n = 0;
-	bool inotify_dir = false;
-	struct sock_fprog prog;
-	int8_t cmd = INT8_MAX;
-	struct cmsghdr *cmsg;
-	struct msghdr msg;
-	struct iovec iov;
-	size_t cmsg_len;
-	struct stat sb;
-	int op;
-
-	prctl(PR_SET_DUMPABLE, 0);
-
-	prog.len = (unsigned short)sizeof(filter_repair) /
-				   sizeof(filter_repair[0]);
-	prog.filter = filter_repair;
-	if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) ||
-	    prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog)) {
-		fprintf(stderr, "Failed to apply seccomp filter");
-		_exit(1);
-	}
-
-	iov = (struct iovec){ &cmd, sizeof(cmd) };
-	msg = (struct msghdr){ .msg_name = NULL, .msg_namelen = 0,
-			       .msg_iov = &iov, .msg_iovlen = 1,
-			       .msg_control = buf,
-			       .msg_controllen = sizeof(buf),
-			       .msg_flags = 0 };
-	cmsg = CMSG_FIRSTHDR(&msg);
-
-	if (argc != 2) {
-		fprintf(stderr, "Usage: %s PATH\n", argv[0]);
-		_exit(2);
-	}
-
-	if ((s = socket(AF_UNIX, SOCK_STREAM, 0)) < 0) {
-		fprintf(stderr, "Failed to create AF_UNIX socket: %i\n", errno);
-		_exit(1);
-	}
-
-	if ((stat(argv[1], &sb))) {
-		fprintf(stderr, "Can't stat() %s: %i\n", argv[1], errno);
-		_exit(1);
-	}
-
-	if ((sb.st_mode & S_IFMT) == S_IFDIR) {
-		char buf[sizeof(struct inotify_event) + NAME_MAX + 1]
-		   __attribute__ ((aligned(__alignof__(struct inotify_event))));
-		const struct inotify_event *ev;
-		char path[PATH_MAX + 1];
-		bool found = false;
-		ssize_t n;
-		int fd;
-
-		if ((fd = inotify_init1(IN_CLOEXEC)) < 0) {
-			fprintf(stderr, "inotify_init1: %i\n", errno);
-			_exit(1);
-		}
-
-		if (inotify_add_watch(fd, argv[1], IN_CREATE) < 0) {
-			fprintf(stderr, "inotify_add_watch: %i\n", errno);
-			_exit(1);
-		}
-
-		do {
-			char *p;
-
-			n = read(fd, buf, sizeof(buf));
-			if (n < 0) {
-				fprintf(stderr, "inotify read: %i", errno);
-				_exit(1);
-			}
-			buf[n - 1] = '\0';
-
-			if (n < (ssize_t)sizeof(*ev)) {
-				fprintf(stderr, "Short inotify read: %zi", n);
-				continue;
-			}
-
-			for (p = buf; p < buf + n; p += sizeof(*ev) + ev->len) {
-				ev = (const struct inotify_event *)p;
-
-				if (ev->len >= REPAIR_EXT_LEN &&
-				    !memcmp(ev->name +
-					    strnlen(ev->name, ev->len) -
-					    REPAIR_EXT_LEN,
-					    REPAIR_EXT, REPAIR_EXT_LEN)) {
-					found = true;
-					break;
-				}
-			}
-		} while (!found);
-
-		if (ev->len > NAME_MAX + 1 || ev->name[ev->len - 1] != '\0') {
-			fprintf(stderr, "Invalid filename from inotify\n");
-			_exit(1);
-		}
-
-		snprintf(path, sizeof(path), "%s/%s", argv[1], ev->name);
-		if ((stat(path, &sb))) {
-			fprintf(stderr, "Can't stat() %s: %i\n", path, errno);
-			_exit(1);
-		}
-
-		ret = snprintf(a.sun_path, sizeof(a.sun_path), "%s", path);
-		inotify_dir = true;
-	} else {
-		ret = snprintf(a.sun_path, sizeof(a.sun_path), "%s", argv[1]);
-	}
-
-	if (ret <= 0 || ret >= (int)sizeof(a.sun_path)) {
-		fprintf(stderr, "Invalid socket path");
-		_exit(2);
-	}
-
-	if ((sb.st_mode & S_IFMT) != S_IFSOCK) {
-		fprintf(stderr, "%s is not a socket\n", a.sun_path);
-		_exit(2);
-	}
-
-	while (connect(s, (struct sockaddr *)&a, sizeof(a))) {
-		if (inotify_dir && errno == ECONNREFUSED)
-			continue;
-
-		fprintf(stderr, "Failed to connect to %s: %s\n", a.sun_path,
-			strerror(errno));
-		_exit(1);
-	}
-
-loop:
-	ret = recvmsg(s, &msg, 0);
-	if (ret < 0) {
-		if (errno == ECONNRESET) {
-			ret = 0;
-		} else {
-			fprintf(stderr, "Failed to read message: %i\n", errno);
-			_exit(1);
-		}
-	}
-
-	if (!ret)	/* Done */
-		_exit(0);
-
-	if (!cmsg ||
-	    cmsg->cmsg_len < CMSG_LEN(sizeof(int)) ||
-	    cmsg->cmsg_len > CMSG_LEN(sizeof(int) * SCM_MAX_FD) ||
-	    cmsg->cmsg_type != SCM_RIGHTS) {
-		fprintf(stderr, "No/bad ancillary data from peer\n");
-		_exit(1);
-	}
-
-	/* No inverse formula for CMSG_LEN(x), and building one with CMSG_LEN(0)
-	 * works but there's no guarantee it does. Search the whole domain.
-	 */
-	for (i = 1; i <= SCM_MAX_FD; i++) {
-		if (CMSG_LEN(sizeof(int) * i) == cmsg->cmsg_len) {
-			n = i;
-			break;
-		}
-	}
-	if (!n) {
-		cmsg_len = cmsg->cmsg_len; /* socklen_t is 'unsigned' on musl */
-		fprintf(stderr, "Invalid ancillary data length %zu from peer\n",
-			cmsg_len);
-		_exit(1);
-	}
-
-	memcpy(fds, CMSG_DATA(cmsg), sizeof(int) * n);
-
-	if (cmd != TCP_REPAIR_ON && cmd != TCP_REPAIR_OFF &&
-	    cmd != TCP_REPAIR_OFF_NO_WP) {
-		fprintf(stderr, "Unsupported command 0x%04x\n", cmd);
-		_exit(1);
-	}
-
-	op = cmd;
-
-	for (i = 0; i < n; i++) {
-		if (setsockopt(fds[i], SOL_TCP, TCP_REPAIR, &op, sizeof(op))) {
-			fprintf(stderr,
-				"Setting TCP_REPAIR to %i on socket %i: %s", op,
-				fds[i], strerror(errno));
-			_exit(1);
-		}
-
-		/* Close _our_ copy */
-		close(fds[i]);
-	}
-
-	/* Confirm setting by echoing the command back */
-	if (send(s, &cmd, sizeof(cmd), 0) < 0) {
-		fprintf(stderr, "Reply to %i: %s\n", op, strerror(errno));
-		_exit(1);
-	}
-
-	goto loop;
-
-	return 0;
-}
--- a/passt.1
+++ b/passt.1
@ -160,9 +160,7 @@ once for IPv6).
 By default, assigned IPv4 and IPv6 addresses are taken from the host interfaces
 with the first default route, if any, for the corresponding IP version. If no
 default routes are available and there is any interface with any route for a
-given IP version, the first of these interfaces will be chosen instead. If no
-such interface exists, the link-local address 169.254.2.1 is assigned for IPv4,
-and no additional address will be assigned for IPv6.
+given IP version, the first of these interfaces will be chosen instead.

 .TP
 .BR \-n ", " \-\-netmask " " \fImask
@ -176,7 +174,8 @@ according to the CIDR block of the assigned address (RFC 4632).
 .BR \-M ", " \-\-mac-addr " " \fIaddr
 Use source MAC address \fIaddr\fR when communicating to the guest or to the
 target namespace.
-Default is the locally administered MAC addresses 9a:55:9a:55:9a:55.
+Default is to use the MAC address of the interface with the first IPv4 default
+route on the host.

 .TP
 .BR \-g ", " \-\-gateway " " \fIaddr
@ -189,9 +188,7 @@ first default route, if any, for the corresponding IP version. If the default
 route is a multipath one, the gateway is the first nexthop router returned by
 the kernel which has the highest weight in the set of paths. If no default
 routes are available and there is just one interface with any route, that
-interface will be chosen instead. If no such interface exists, the link-local
-address 169.254.2.2 is used for IPv4, and the link-local address fe80::1 is used
-for IPv6.
+interface will be chosen instead.

 Note: these addresses are also used as source address for packets directed to
 the guest or to the target namespace having a loopback or local source address,
@ -206,9 +203,7 @@ Default is to use the interfaces specified by \fB--outbound-if4\fR and

 If no interfaces are given, the interface with the first default routes for each
 IP version is selected. If no default routes are available and there is just one
-interface with any route, that interface will be chosen instead. If no such
-interface exists, host interfaces will be ignored for the purposes of assigning
-addresses and routes, and link-local addresses will be used instead.
+interface with any route, that interface will be chosen instead.

 .TP
 .BR \-o ", " \-\-outbound " " \fIaddr
@ -227,8 +222,7 @@ derive IPv4 addresses and routes.

 By default, the interface given by the default route is selected. If no default
 routes are available and there is just one interface with any route, that
-interface will be chosen instead. If no such interface exists, outbound sockets
-will not be bound to any specific interface.
+interface will be chosen instead.

 .TP
 .BR \-\-outbound-if6 " " \fIname
@ -238,8 +232,7 @@ derive IPv6 addresses and routes.

 By default, the interface given by the default route is selected. If no default
 routes are available and there is just one interface with any route, that
-interface will be chosen instead. If no such interface exists, outbound sockets
-will not be bound to any specific interface.
+interface will be chosen instead.

 .TP
 .BR \-D ", " \-\-dns " " \fIaddr
@ -380,14 +373,14 @@ Translate \fIaddr\fR in the guest to be equal to the guest's assigned
 address on the host.  That is, packets from the guest to \fIaddr\fR
 will be redirected to the address assigned to the guest with \fB-a\fR,
 or by default the host's global address.  This allows the guest to
-access services available on the host's global address, even though its
+access services availble on the host's global address, even though its
 own address shadows that of the host.

 If \fIaddr\fR is 'none', no address is mapped.  Only one IPv4 and one
 IPv6 address can be translated, and if the option is specified
 multiple times, the last one for each address type takes effect.

-By default, mapping happens as described for the \-\-map-host-loopback option.
+Default is no mapping.

 .TP
 .BR \-4 ", " \-\-ipv4-only
@ -401,16 +394,6 @@ Enable IPv6-only operation. IPv4 traffic will be ignored.
 By default, IPv4 operation is enabled as long as at least an IPv4 route and an
 interface address are configured on a given host interface.

-.TP
-.BR \-H ", " \-\-hostname " " \fIname
-Hostname to configure the client with.
-Send \fIname\fR as DHCP option 12 (hostname).
-
-.TP
-.BR \-\-fqdn " " \fIname
-FQDN to configure the client with.
-Send \fIname\fR as Client FQDN: DHCP option 81 and DHCPv6 option 39.
-
 .SS \fBpasst\fR-only options

 .TP
@ -428,17 +411,6 @@ Enable vhost-user. The vhost-user command socket is provided by \fB--socket\fR.
 .BR \-\-print-capabilities
 Print back-end capabilities in JSON format, only meaningful for vhost-user mode.

-.TP
-.BR \-\-repair-path " " \fIpath
-Path for UNIX domain socket used by the \fBpasst-repair\fR(1) helper to connect
-to \fBpasst\fR in order to set or clear the TCP_REPAIR option on sockets, during
-migration. \fB--repair-path none\fR disables this interface (if you need to
-specify a socket path called "none" you can prefix the path by \fI./\fR).
-
-Default, for \-\-vhost-user mode only, is to append \fI.repair\fR to the path
-chosen for the hypervisor UNIX domain socket. No socket is created if not in
-\-\-vhost-user mode.
-
 .TP
 .BR \-F ", " \-\-fd " " \fIFD
 Pass a pre-opened, connected socket to \fBpasst\fR. Usually the socket is opened
@ -540,7 +512,6 @@ Default is \fBnone\fR.
 .BR \-I ", " \-\-ns-ifname " " \fIname
 Name of tap interface to be created in target namespace.
 By default, the same interface name as the external, routable interface is used.
-If no such interface exists, the name \fItap0\fR will be used instead.

 .TP
 .BR \-t ", " \-\-tcp-ports " " \fIspec
@ -643,7 +614,7 @@ Configure UDP port forwarding from target namespace to init namespace.
 Default is \fBauto\fR.

 .TP
-.BR \-\-host-lo-to-ns-lo
+.BR \-\-host-lo-to-ns-lo " " (DEPRECATED)
 If specified, connections forwarded with \fB\-t\fR and \fB\-u\fR from
 the host's loopback address will appear on the loopback address in the
 guest as well.  Without this option such forwarded packets will appear
@ -716,11 +687,6 @@ Configure MAC address \fIaddr\fR on the tap interface in the namespace.

 Default is to let the tap driver build a pseudorandom hardware address.

-.TP
-.BR \-\-no-splice
-Disable the bypass path for inbound, local traffic. See the section \fBHandling
-of local traffic in pasta\fR in the \fBNOTES\fR for more details.
-
 .SH EXAMPLES

 .SS \fBpasta
@ -962,16 +928,10 @@ with destination 127.0.0.10, and the default IPv4 gateway is 192.0.2.1, while
 the last observed source address from guest or namespace is 192.0.2.2, this will
 be translated to a connection from 192.0.2.1 to 192.0.2.2.

-Similarly, for traffic coming from guest or namespace, packets with destination
-address corresponding to the \fB\-\-map-host-loopback\fR address will have their
-destination address translated to a loopback address.
-
-As an exception, traffic identified as DNS, originally directed to the
-\fB\-\-map-host-loopback\fR address, if this address matches a resolver address
-on the host, is \fBnot\fR translated to loopback, but rather handled in the same
-way as if specified as \-\-dns-forward address, if no such option was given.
-In the common case where the host gateway also acts a resolver, this avoids that
-the host mapping shadows the gateway/resolver itself.
+Similarly, for traffic coming from guest or namespace, packets with
+destination address corresponding to the \fB\-\-map-host-loopback\fR
+address will have their destination address translated to a loopback
+address.

 .SS Handling of local traffic in pasta

@ -1080,20 +1040,6 @@ If the sending window cannot be queried, it will always be announced as the
 current sending buffer size to guest or target namespace. This might affect
 throughput of TCP connections.

-.SS Local mode for disconnected setups
-
-If \fBpasst\fR and \fBpasta\fR fail to find a host interface with a configured
-address, other than loopback addresses, they will, obviously, not attempt to
-source addresses or routes from the host.
-
-In this case, unless configured otherwise, they will assign the IPv4 link-local
-address 169.254.2.1 to the guest or target namespace, and no IPv6 address. The
-notion of the guest or target namespace IPv6 address is derived from the first
-link-local address observed.
-
-Default gateways will be assigned as the link-local address 169.254.2.2 for
-IPv4, and as the link-local address fe80::1 for IPv6.
-
 .SH LIMITATIONS

 Currently, IGMP/MLD proxying (RFC 4605) and support for SCTP (RFC 4960) are not
--- a/passt.c
+++ b/passt.c
@ -51,8 +51,6 @@
 #include "tcp_splice.h"
 #include "ndp.h"
 #include "vu_common.h"
-#include "migrate.h"
-#include "repair.h"

 #define EPOLL_EVENTS		8

@ -68,7 +66,7 @@ char *epoll_type_str[] = {
 	[EPOLL_TYPE_TCP_LISTEN]		= "listening TCP socket",
 	[EPOLL_TYPE_TCP_TIMER]		= "TCP timer",
 	[EPOLL_TYPE_UDP_LISTEN]		= "listening UDP socket",
-	[EPOLL_TYPE_UDP]		= "UDP flow socket",
+	[EPOLL_TYPE_UDP_REPLY]		= "UDP reply socket",
 	[EPOLL_TYPE_PING]	= "ICMP/ICMPv6 ping socket",
 	[EPOLL_TYPE_NSQUIT_INOTIFY]	= "namespace inotify watch",
 	[EPOLL_TYPE_NSQUIT_TIMER]	= "namespace timer watch",
@ -77,8 +75,6 @@ char *epoll_type_str[] = {
 	[EPOLL_TYPE_TAP_LISTEN]		= "listening qemu socket",
 	[EPOLL_TYPE_VHOST_CMD]		= "vhost-user command socket",
 	[EPOLL_TYPE_VHOST_KICK]		= "vhost-user kick socket",
-	[EPOLL_TYPE_REPAIR_LISTEN]	= "TCP_REPAIR helper listening socket",
-	[EPOLL_TYPE_REPAIR]		= "TCP_REPAIR helper socket",
 };
 static_assert(ARRAY_SIZE(epoll_type_str) == EPOLL_NUM_TYPES,
 	      "epoll_type_str[] doesn't match enum epoll_type");
@ -116,8 +112,7 @@ static void post_handler(struct ctx *c, const struct timespec *now)
 	flow_defer_handler(c, now);
 #undef CALL_PROTO_HANDLER

-	if (!c->no_ndp)
-		ndp_timer(c, now);
+	ndp_timer(c, now);
 }

 /**
@ -166,11 +161,11 @@ void proto_update_l2_buf(const unsigned char *eth_d, const unsigned char *eth_s)
 *
 * #syscalls exit_group
 */
-static void exit_handler(int signal)
+void exit_handler(int signal)
 {
 	(void)signal;

-	_exit(EXIT_SUCCESS);
+	exit(EXIT_SUCCESS);
 }

 /**
@ -184,13 +179,14 @@ static void exit_handler(int signal)
 * #syscalls socket getsockopt setsockopt s390x:socketcall i686:socketcall close
 * #syscalls bind connect recvfrom sendto shutdown
 * #syscalls arm:recv ppc64le:recv arm:send ppc64le:send
- * #syscalls accept4 accept listen epoll_ctl epoll_wait|epoll_pwait epoll_pwait
+ * #syscalls accept4|accept listen epoll_ctl epoll_wait|epoll_pwait epoll_pwait
 * #syscalls clock_gettime arm:clock_gettime64 i686:clock_gettime64
 */
 int main(int argc, char **argv)
 {
 	struct epoll_event events[EPOLL_EVENTS];
 	int nfds, i, devnull_fd = -1;
+	char argv0[PATH_MAX], *name;
 	struct ctx c = { 0 };
 	struct rlimit limit;
 	struct timespec now;
@ -204,7 +200,6 @@ int main(int argc, char **argv)
 	isolate_initial(argc, argv);

 	c.pasta_netns_fd = c.fd_tap = c.pidfile_fd = -1;
-	c.device_state_fd = -1;

 	sigemptyset(&sa.sa_mask);
 	sa.sa_flags = 0;
@ -212,18 +207,27 @@ int main(int argc, char **argv)
 	sigaction(SIGTERM, &sa, NULL);
 	sigaction(SIGQUIT, &sa, NULL);

-	c.mode = conf_mode(argc, argv);
+	if (argc < 1)
+		exit(EXIT_FAILURE);

-	if (c.mode == MODE_PASTA) {
+	strncpy(argv0, argv[0], PATH_MAX - 1);
+	name = basename(argv0);
+	if (strstr(name, "pasta")) {
 		sa.sa_handler = pasta_child_handler;
 		if (sigaction(SIGCHLD, &sa, NULL))
 			die_perror("Couldn't install signal handlers");
+
+		if (signal(SIGPIPE, SIG_IGN) == SIG_ERR)
+			die_perror("Couldn't set disposition for SIGPIPE");
+
+		c.mode = MODE_PASTA;
+	} else if (strstr(name, "passt")) {
+		c.mode = MODE_PASST;
+	} else {
+		exit(EXIT_FAILURE);
 	}

-	if (signal(SIGPIPE, SIG_IGN) == SIG_ERR)
-		die_perror("Couldn't set disposition for SIGPIPE");
-
-	madvise(pkt_buf, sizeof(pkt_buf), MADV_HUGEPAGE);
+	madvise(pkt_buf, TAP_BUF_BYTES, MADV_HUGEPAGE);

 	c.epollfd = epoll_create1(EPOLL_CLOEXEC);
 	if (c.epollfd == -1)
@ -253,7 +257,7 @@ int main(int argc, char **argv)
 	flow_init();

 	if ((!c.no_udp && udp_init(&c)) || (!c.no_tcp && tcp_init(&c)))
-		_exit(EXIT_FAILURE);
+		exit(EXIT_FAILURE);

 	proto_update_l2_buf(c.guest_mac, c.our_tap_mac);

@ -339,8 +343,8 @@ loop:
 		case EPOLL_TYPE_UDP_LISTEN:
 			udp_listen_sock_handler(&c, ref, eventmask, &now);
 			break;
-		case EPOLL_TYPE_UDP:
-			udp_sock_handler(&c, ref, eventmask, &now);
+		case EPOLL_TYPE_UDP_REPLY:
+			udp_reply_sock_handler(&c, ref, eventmask, &now);
 			break;
 		case EPOLL_TYPE_PING:
 			icmp_sock_handler(&c, ref);
@ -351,12 +355,6 @@ loop:
 		case EPOLL_TYPE_VHOST_KICK:
 			vu_kick_cb(c.vdev, ref, &now);
 			break;
-		case EPOLL_TYPE_REPAIR_LISTEN:
-			repair_listen_handler(&c, eventmask);
-			break;
-		case EPOLL_TYPE_REPAIR:
-			repair_handler(&c, eventmask);
-			break;
 		default:
 			/* Can't happen */
 			ASSERT(0);
@ -365,7 +363,5 @@ loop:

 	post_handler(&c, &now);

-	migrate_handler(&c);
-
 	goto loop;
 }
--- a/passt.h
+++ b/passt.h
@ -20,7 +20,6 @@ union epoll_ref;
 #include "siphash.h"
 #include "ip.h"
 #include "inany.h"
-#include "migrate.h"
 #include "flow.h"
 #include "icmp.h"
 #include "fwd.h"
@ -69,9 +68,12 @@ union epoll_ref {
 static_assert(sizeof(union epoll_ref) <= sizeof(union epoll_data),
 	      "epoll_ref must have same size as epoll_data");

-/* Large enough for ~128 maximum size frames */
-#define PKT_BUF_BYTES		(8UL << 20)
+#define TAP_BUF_BYTES							\
+	ROUND_DOWN(((ETH_MAX_MTU + sizeof(uint32_t)) * 128), PAGE_SIZE)
+#define TAP_MSGS							\
+	DIV_ROUND_UP(TAP_BUF_BYTES, ETH_ZLEN - 2 * ETH_ALEN + sizeof(uint32_t))

+#define PKT_BUF_BYTES		MAX(TAP_BUF_BYTES, 0)
 extern char pkt_buf		[PKT_BUF_BYTES];

 extern char *epoll_type_str[];
@ -191,7 +193,6 @@ struct ip6_ctx {
 * @foreground:		Run in foreground, don't log to stderr by default
 * @nofile:		Maximum number of open files (ulimit -n)
 * @sock_path:		Path for UNIX domain socket
- * @repair_path:	TCP_REPAIR helper path, can be "none", empty for default
 * @pcap:		Path for packet capture file
 * @pidfile:		Path to PID file, empty string if not configured
 * @pidfile_fd:		File descriptor for PID file, -1 if none
@ -202,17 +203,13 @@ struct ip6_ctx {
 * @epollfd:		File descriptor for epoll instance
 * @fd_tap_listen:	File descriptor for listening AF_UNIX socket, if any
 * @fd_tap:		AF_UNIX socket, tuntap device, or pre-opened socket
- * @fd_repair_listen:	File descriptor for listening TCP_REPAIR socket, if any
- * @fd_repair:		Connected AF_UNIX socket for TCP_REPAIR helper
 * @our_tap_mac:	Pasta/passt's MAC on the tap link
 * @guest_mac:		MAC address of guest or namespace, seen or configured
 * @hash_secret:	128-bit secret for siphash functions
- * @ifi4:		Template interface for IPv4, -1: none, 0: IPv4 disabled
+ * @ifi4:		Index of template interface for IPv4, 0 if IPv4 disabled
 * @ip:			IPv4 configuration
 * @dns_search:		DNS search list
- * @hostname:		Guest hostname
- * @fqdn:		Guest FQDN
- * @ifi6:		Template interface for IPv6, -1: none, 0: IPv6 disabled
+ * @ifi6:		Index of template interface for IPv6, 0 if IPv6 disabled
 * @ip6:		IPv6 configuration
 * @pasta_ifn:		Name of namespace interface for pasta
 * @pasta_ifi:		Index of namespace interface for pasta
@ -232,15 +229,11 @@ struct ip6_ctx {
 * @no_dhcpv6:		Disable DHCPv6 server
 * @no_ndp:		Disable NDP handler altogether
 * @no_ra:		Disable router advertisements
- * @no_splice:		Disable socket splicing for inbound traffic
 * @host_lo_to_ns_lo:	Map host loopback addresses to ns loopback addresses
 * @freebind:		Allow binding of non-local addresses for forwarding
 * @low_wmem:		Low probed net.core.wmem_max
 * @low_rmem:		Low probed net.core.rmem_max
 * @vdev:		vhost-user device
- * @device_state_fd:	Device state migration channel
- * @device_state_result: Device state migration result
- * @migrate_target:	Are we the target, on the next migration request?
 */
 struct ctx {
 	enum passt_modes mode;
@ -250,7 +243,6 @@ struct ctx {
 	int foreground;
 	int nofile;
 	char sock_path[UNIX_PATH_MAX];
-	char repair_path[UNIX_PATH_MAX];
 	char pcap[PATH_MAX];

 	char pidfile[PATH_MAX];
@ -267,23 +259,16 @@ struct ctx {
 	int epollfd;
 	int fd_tap_listen;
 	int fd_tap;
-	int fd_repair_listen;
-	int fd_repair;
 	unsigned char our_tap_mac[ETH_ALEN];
 	unsigned char guest_mac[ETH_ALEN];
-	uint16_t mtu;
-
 	uint64_t hash_secret[2];

-	int ifi4;
+	unsigned int ifi4;
 	struct ip4_ctx ip4;

 	struct fqdn dns_search[MAXDNSRCH];

-	char hostname[PASST_MAXDNAME];
-	char fqdn[PASST_MAXDNAME];
-
-	int ifi6;
+	unsigned int ifi6;
 	struct ip6_ctx ip6;

 	char pasta_ifn[IF_NAMESIZE];
@ -297,6 +282,7 @@ struct ctx {
 	int no_icmp;
 	struct icmp_ctx icmp;

+	int mtu;
 	int no_dns;
 	int no_dns_search;
 	int no_dhcp_dns;
@ -305,7 +291,6 @@ struct ctx {
 	int no_dhcpv6;
 	int no_ndp;
 	int no_ra;
-	int no_splice;
 	int host_lo_to_ns_lo;
 	int freebind;

@ -313,11 +298,6 @@ struct ctx {
 	int low_rmem;

 	struct vu_dev *vdev;
-
-	/* Migration */
-	int device_state_fd;
-	int device_state_result;
-	bool migrate_target;
 };

 void proto_update_l2_buf(const unsigned char *eth_d,
--- a/pasta.c
+++ b/pasta.c
@ -73,12 +73,12 @@ void pasta_child_handler(int signal)
 	    !waitid(P_PID, pasta_child_pid, &infop, WEXITED | WNOHANG)) {
 		if (infop.si_pid == pasta_child_pid) {
 			if (infop.si_code == CLD_EXITED)
-				_exit(infop.si_status);
+				exit(infop.si_status);

 			/* If killed by a signal, si_status is the number.
 			 * Follow common shell convention of returning it + 128.
 			 */
-			_exit(infop.si_status + 128);
+			exit(infop.si_status + 128);

 			/* Nothing to do, detached PID namespace going away */
 		}
@ -169,12 +169,10 @@ void pasta_open_ns(struct ctx *c, const char *netns)
 * struct pasta_spawn_cmd_arg - Argument for pasta_spawn_cmd()
 * @exe:	Executable to run
 * @argv:	Command and arguments to run
- * @ctx:	Context to read config from
 */
 struct pasta_spawn_cmd_arg {
 	const char *exe;
 	char *const *argv;
-	struct ctx *c;
 };

 /**
@ -188,7 +186,6 @@ static int pasta_spawn_cmd(void *arg)
 {
 	char hostname[HOST_NAME_MAX + 1] = HOSTNAME_PREFIX;
 	const struct pasta_spawn_cmd_arg *a;
-	size_t conf_hostname_len;
 	sigset_t set;

 	/* We run in a detached PID and mount namespace: mount /proc over */
@ -198,15 +195,9 @@ static int pasta_spawn_cmd(void *arg)
 	if (write_file("/proc/sys/net/ipv4/ping_group_range", "0 0"))
 		warn("Cannot set ping_group_range, ICMP requests might fail");

-	a = (const struct pasta_spawn_cmd_arg *)arg;
-
-	conf_hostname_len = strlen(a->c->hostname);
-	if (conf_hostname_len > 0) {
-		if (sethostname(a->c->hostname, conf_hostname_len))
-			warn("Unable to set configured hostname");
-	} else if (!gethostname(hostname + sizeof(HOSTNAME_PREFIX) - 1,
-				HOST_NAME_MAX + 1 - sizeof(HOSTNAME_PREFIX)) ||
-		   errno == ENAMETOOLONG) {
+	if (!gethostname(hostname + sizeof(HOSTNAME_PREFIX) - 1,
+			 HOST_NAME_MAX + 1 - sizeof(HOSTNAME_PREFIX)) ||
+	    errno == ENAMETOOLONG) {
 		hostname[HOST_NAME_MAX] = '\0';
 		if (sethostname(hostname, strlen(hostname)))
 			warn("Unable to set pasta-prefixed hostname");
@ -217,6 +208,7 @@ static int pasta_spawn_cmd(void *arg)
 	sigaddset(&set, SIGUSR1);
 	sigwaitinfo(&set, NULL);

+	a = (const struct pasta_spawn_cmd_arg *)arg;
 	execvp(a->exe, a->argv);

 	die_perror("Failed to start command or shell");
@ -238,7 +230,6 @@ void pasta_start_ns(struct ctx *c, uid_t uid, gid_t gid,
 	struct pasta_spawn_cmd_arg arg = {
 		.exe = argv[0],
 		.argv = argv,
-		.c = c,
 	};
 	char uidmap[BUFSIZ], gidmap[BUFSIZ];
 	char *sh_argv[] = { NULL, NULL };
@ -305,7 +296,7 @@ void pasta_ns_conf(struct ctx *c)
 	rc = nl_link_set_flags(nl_sock_ns, 1 /* lo */, IFF_UP, IFF_UP);
 	if (rc < 0)
 		die("Couldn't bring up loopback interface in namespace: %s",
-		    strerror_(-rc));
+		    strerror(-rc));

 	/* Get or set MAC in target namespace */
 	if (MAC_IS_ZERO(c->guest_mac))
@ -314,12 +305,12 @@ void pasta_ns_conf(struct ctx *c)
 		rc = nl_link_set_mac(nl_sock_ns, c->pasta_ifi, c->guest_mac);
 	if (rc < 0)
 		die("Couldn't set MAC address in namespace: %s",
-		    strerror_(-rc));
+		    strerror(-rc));

 	if (c->pasta_conf_ns) {
 		unsigned int flags = IFF_UP;

-		if (c->mtu)
+		if (c->mtu != -1)
 			nl_link_set_mtu(nl_sock_ns, c->pasta_ifi, c->mtu);

 		if (c->ifi6) /* Avoid duplicate address detection on link up */
@ -341,7 +332,7 @@ void pasta_ns_conf(struct ctx *c)

 			if (rc < 0) {
 				die("Couldn't set IPv4 address(es) in namespace: %s",
-				    strerror_(-rc));
+				    strerror(-rc));
 			}

 			if (c->ip4.no_copy_routes) {
@ -355,7 +346,7 @@ void pasta_ns_conf(struct ctx *c)

 			if (rc < 0) {
 				die("Couldn't set IPv4 route(s) in guest: %s",
-				    strerror_(-rc));
+				    strerror(-rc));
 			}
 		}

@ -364,13 +355,13 @@ void pasta_ns_conf(struct ctx *c)
 					    &c->ip6.addr_ll_seen);
 			if (rc < 0) {
 				warn("Can't get LL address from namespace: %s",
-				    strerror_(-rc));
+				    strerror(-rc));
 			}

 			rc = nl_addr_set_ll_nodad(nl_sock_ns, c->pasta_ifi);
 			if (rc < 0) {
 				warn("Can't set nodad for LL in namespace: %s",
-				    strerror_(-rc));
+				    strerror(-rc));
 			}

 			/* We dodged DAD: re-enable neighbour solicitations */
@ -378,11 +369,8 @@ void pasta_ns_conf(struct ctx *c)
 					  0, IFF_NOARP);

 			if (c->ip6.no_copy_addrs) {
-				if (!IN6_IS_ADDR_UNSPECIFIED(&c->ip6.addr)) {
-					rc = nl_addr_set(nl_sock_ns,
-							 c->pasta_ifi, AF_INET6,
-							 &c->ip6.addr, 64);
-				}
+				rc = nl_addr_set(nl_sock_ns, c->pasta_ifi,
+						 AF_INET6, &c->ip6.addr, 64);
 			} else {
 				rc = nl_addr_dup(nl_sock, c->ifi6,
 						 nl_sock_ns, c->pasta_ifi,
@ -391,7 +379,7 @@ void pasta_ns_conf(struct ctx *c)

 			if (rc < 0) {
 				die("Couldn't set IPv6 address(es) in namespace: %s",
-				    strerror_(-rc));
+				    strerror(-rc));
 			}

 			if (c->ip6.no_copy_routes) {
@ -406,7 +394,7 @@ void pasta_ns_conf(struct ctx *c)

 			if (rc < 0) {
 				die("Couldn't set IPv6 route(s) in guest: %s",
-				    strerror_(-rc));
+				    strerror(-rc));
 			}
 		}
 	}
@ -455,18 +443,18 @@ void pasta_netns_quit_init(const struct ctx *c)
 		return;

 	if ((dir_fd = open(c->netns_dir, O_CLOEXEC | O_RDONLY)) < 0)
-		die("netns dir open: %s, exiting", strerror_(errno));
+		die("netns dir open: %s, exiting", strerror(errno));

 	if (fstatfs(dir_fd, &s)          || s.f_type == DEVPTS_SUPER_MAGIC ||
 	    s.f_type == PROC_SUPER_MAGIC || s.f_type == SYSFS_MAGIC)
 		try_inotify = false;

 	if (try_inotify && (fd = inotify_init1(flags)) < 0)
-		warn("inotify_init1(): %s, use a timer", strerror_(errno));
+		warn("inotify_init1(): %s, use a timer", strerror(errno));

 	if (fd >= 0 && inotify_add_watch(fd, c->netns_dir, IN_DELETE) < 0) {
 		warn("inotify_add_watch(): %s, use a timer",
-		     strerror_(errno));
+		     strerror(errno));
 		close(fd);
 		fd = -1;
 	}
@ -498,23 +486,17 @@ void pasta_netns_quit_init(const struct ctx *c)
 */
 void pasta_netns_quit_inotify_handler(struct ctx *c, int inotify_fd)
 {
-	char buf[sizeof(struct inotify_event) + NAME_MAX + 1]
-		__attribute__ ((aligned(__alignof__(struct inotify_event))));
-	const struct inotify_event *ev;
-	ssize_t n;
-	char *p;
+	char buf[sizeof(struct inotify_event) + NAME_MAX + 1];
+	const struct inotify_event *in_ev = (struct inotify_event *)buf;

-	if ((n = read(inotify_fd, buf, sizeof(buf))) < (ssize_t)sizeof(*ev))
+	if (read(inotify_fd, buf, sizeof(buf)) < (ssize_t)sizeof(*in_ev))
 		return;

-	for (p = buf; p < buf + n; p += sizeof(*ev) + ev->len) {
-		ev = (const struct inotify_event *)p;
+	if (strncmp(in_ev->name, c->netns_base, sizeof(c->netns_base)))
+		return;

-		if (!strncmp(ev->name, c->netns_base, sizeof(c->netns_base))) {
-			info("Namespace %s is gone, exiting", c->netns_base);
-			_exit(EXIT_SUCCESS);
-		}
-	}
+	info("Namespace %s is gone, exiting", c->netns_base);
+	exit(EXIT_SUCCESS);
 }

 /**
@ -540,7 +522,7 @@ void pasta_netns_quit_timer_handler(struct ctx *c, union epoll_ref ref)
 			return;

 		info("Namespace %s is gone, exiting", c->netns_base);
-		_exit(EXIT_SUCCESS);
+		exit(EXIT_SUCCESS);
 	}

 	close(fd);
--- a/pcap.c
+++ b/pcap.c
@ -33,12 +33,33 @@
 #include "log.h"
 #include "pcap.h"
 #include "iov.h"
-#include "tap.h"

 #define PCAP_VERSION_MINOR 4

 static int pcap_fd = -1;

+/* See pcap.h from libpcap, or pcap-savefile(5) */
+static const struct {
+	uint32_t magic;
+#define PCAP_MAGIC		0xa1b2c3d4
+
+	uint16_t major;
+#define PCAP_VERSION_MAJOR	2
+
+	uint16_t minor;
+#define PCAP_VERSION_MINOR	4
+
+	int32_t thiszone;
+	uint32_t sigfigs;
+	uint32_t snaplen;
+
+	uint32_t linktype;
+#define PCAP_LINKTYPE_ETHERNET	1
+} pcap_hdr = {
+	PCAP_MAGIC, PCAP_VERSION_MAJOR, PCAP_VERSION_MINOR, 0, 0, ETH_MAX_MTU,
+	PCAP_LINKTYPE_ETHERNET
+};
+
 struct pcap_pkthdr {
 	uint32_t tv_sec;
 	uint32_t tv_usec;
@ -141,29 +162,6 @@ void pcap_iov(const struct iovec *iov, size_t iovcnt, size_t offset)
 */
 void pcap_init(struct ctx *c)
 {
-	/* See pcap.h from libpcap, or pcap-savefile(5) */
-#define PCAP_MAGIC		0xa1b2c3d4
-#define PCAP_VERSION_MAJOR	2
-#define PCAP_VERSION_MINOR	4
-#define PCAP_LINKTYPE_ETHERNET	1
-	const struct {
-		uint32_t magic;
-		uint16_t major;
-		uint16_t minor;
-
-		int32_t thiszone;
-		uint32_t sigfigs;
-		uint32_t snaplen;
-
-		uint32_t linktype;
-	} pcap_hdr = {
-		.magic = PCAP_MAGIC,
-		.major = PCAP_VERSION_MAJOR,
-		.minor = PCAP_VERSION_MINOR,
-		.snaplen = tap_l2_max_len(c),
-		.linktype = PCAP_LINKTYPE_ETHERNET
-	};
-
 	if (pcap_fd != -1)
 		return;

--- a/repair.c
+++ b/repair.c
@ -1,255 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-
-/* PASST - Plug A Simple Socket Transport
- *  for qemu/UNIX domain socket mode
- *
- * PASTA - Pack A Subtle Tap Abstraction
- *  for network namespace/tap device mode
- *
- * repair.c - Interface (server) for passt-repair, set/clear TCP_REPAIR
- *
- * Copyright (c) 2025 Red Hat GmbH
- * Author: Stefano Brivio <sbrivio@redhat.com>
- */
-
-#include <errno.h>
-#include <sys/socket.h>
-#include <sys/uio.h>
-
-#include "util.h"
-#include "ip.h"
-#include "passt.h"
-#include "inany.h"
-#include "flow.h"
-#include "flow_table.h"
-
-#include "repair.h"
-
-#define SCM_MAX_FD 253 /* From Linux kernel (include/net/scm.h), not in UAPI */
-
-/* Wait for a while for TCP_REPAIR helper to connect if it's not there yet */
-#define REPAIR_ACCEPT_TIMEOUT_MS	10
-#define REPAIR_ACCEPT_TIMEOUT_US	(REPAIR_ACCEPT_TIMEOUT_MS * 1000)
-
-/* Pending file descriptors for next repair_flush() call, or command change */
-static int repair_fds[SCM_MAX_FD];
-
-/* Pending command: flush pending file descriptors if it changes */
-static int8_t repair_cmd;
-
-/* Number of pending file descriptors set in @repair_fds */
-static int repair_nfds;
-
-/**
- * repair_sock_init() - Start listening for connections on helper socket
- * @c:		Execution context
- */
-void repair_sock_init(const struct ctx *c)
-{
-	union epoll_ref ref = { .type = EPOLL_TYPE_REPAIR_LISTEN };
-	struct epoll_event ev = { 0 };
-
-	if (c->fd_repair_listen == -1)
-		return;
-
-	if (listen(c->fd_repair_listen, 0)) {
-		err_perror("listen() on repair helper socket, won't migrate");
-		return;
-	}
-
-	ref.fd = c->fd_repair_listen;
-	ev.events = EPOLLIN | EPOLLHUP | EPOLLET;
-	ev.data.u64 = ref.u64;
-	if (epoll_ctl(c->epollfd, EPOLL_CTL_ADD, c->fd_repair_listen, &ev))
-		err_perror("repair helper socket epoll_ctl(), won't migrate");
-}
-
-/**
- * repair_listen_handler() - Handle events on TCP_REPAIR helper listening socket
- * @c:		Execution context
- * @events:	epoll events
- */
-void repair_listen_handler(struct ctx *c, uint32_t events)
-{
-	union epoll_ref ref = { .type = EPOLL_TYPE_REPAIR };
-	struct epoll_event ev = { 0 };
-	struct ucred ucred;
-	socklen_t len;
-
-	if (events != EPOLLIN) {
-		debug("Spurious event 0x%04x on TCP_REPAIR helper socket",
-		      events);
-		return;
-	}
-
-	len = sizeof(ucred);
-
-	/* Another client is already connected: accept and close right away. */
-	if (c->fd_repair != -1) {
-		int discard = accept4(c->fd_repair_listen, NULL, NULL,
-				      SOCK_NONBLOCK);
-
-		if (discard == -1)
-			return;
-
-		if (!getsockopt(discard, SOL_SOCKET, SO_PEERCRED, &ucred, &len))
-			info("Discarding TCP_REPAIR helper, PID %i", ucred.pid);
-
-		close(discard);
-		return;
-	}
-
-	if ((c->fd_repair = accept4(c->fd_repair_listen, NULL, NULL, 0)) < 0) {
-		debug_perror("accept4() on TCP_REPAIR helper listening socket");
-		return;
-	}
-
-	if (!getsockopt(c->fd_repair, SOL_SOCKET, SO_PEERCRED, &ucred, &len))
-		info("Accepted TCP_REPAIR helper, PID %i", ucred.pid);
-
-	ref.fd = c->fd_repair;
-	ev.events = EPOLLHUP | EPOLLET;
-	ev.data.u64 = ref.u64;
-	if (epoll_ctl(c->epollfd, EPOLL_CTL_ADD, c->fd_repair, &ev)) {
-		debug_perror("epoll_ctl() on TCP_REPAIR helper socket");
-		close(c->fd_repair);
-		c->fd_repair = -1;
-	}
-}
-
-/**
- * repair_close() - Close connection to TCP_REPAIR helper
- * @c:		Execution context
- */
-void repair_close(struct ctx *c)
-{
-	debug("Closing TCP_REPAIR helper socket");
-
-	epoll_ctl(c->epollfd, EPOLL_CTL_DEL, c->fd_repair, NULL);
-	close(c->fd_repair);
-	c->fd_repair = -1;
-}
-
-/**
- * repair_handler() - Handle EPOLLHUP and EPOLLERR on TCP_REPAIR helper socket
- * @c:		Execution context
- * @events:	epoll events
- */
-void repair_handler(struct ctx *c, uint32_t events)
-{
-	(void)events;
-
-	repair_close(c);
-}
-
-/**
- * repair_wait() - Wait (with timeout) for TCP_REPAIR helper to connect
- * @c:		Execution context
- */
-void repair_wait(struct ctx *c)
-{
-	struct timeval tv = { .tv_sec = 0,
-			      .tv_usec = (long)(REPAIR_ACCEPT_TIMEOUT_US) };
-	static_assert(REPAIR_ACCEPT_TIMEOUT_US < 1000 * 1000,
-		      ".tv_usec is greater than 1000 * 1000");
-
-	if (c->fd_repair >= 0 || c->fd_repair_listen == -1)
-		return;
-
-	if (setsockopt(c->fd_repair_listen, SOL_SOCKET, SO_RCVTIMEO,
-		       &tv, sizeof(tv))) {
-		err_perror("Set timeout on TCP_REPAIR listening socket");
-		return;
-	}
-
-	repair_listen_handler(c, EPOLLIN);
-
-	tv.tv_usec = 0;
-	if (setsockopt(c->fd_repair_listen, SOL_SOCKET, SO_RCVTIMEO,
-		       &tv, sizeof(tv)))
-		err_perror("Clear timeout on TCP_REPAIR listening socket");
-}
-
-/**
- * repair_flush() - Flush current set of sockets to helper, with current command
- * @c:		Execution context
- *
- * Return: 0 on success, negative error code on failure
- */
-int repair_flush(struct ctx *c)
-{
-	char buf[CMSG_SPACE(sizeof(int) * SCM_MAX_FD)]
-	     __attribute__ ((aligned(__alignof__(struct cmsghdr)))) = { 0 };
-	struct iovec iov = { &repair_cmd, sizeof(repair_cmd) };
-	struct cmsghdr *cmsg;
-	struct msghdr msg;
-	int8_t reply;
-
-	if (!repair_nfds)
-		return 0;
-
-	msg = (struct msghdr){ .msg_name = NULL, .msg_namelen = 0,
-			       .msg_iov = &iov, .msg_iovlen = 1,
-			       .msg_control = buf,
-			       .msg_controllen = CMSG_SPACE(sizeof(int) *
-							    repair_nfds),
-			       .msg_flags = 0 };
-	cmsg = CMSG_FIRSTHDR(&msg);
-
-	cmsg->cmsg_level = SOL_SOCKET;
-	cmsg->cmsg_type = SCM_RIGHTS;
-	cmsg->cmsg_len = CMSG_LEN(sizeof(int) * repair_nfds);
-	memcpy(CMSG_DATA(cmsg), repair_fds, sizeof(int) * repair_nfds);
-
-	repair_nfds = 0;
-
-	if (sendmsg(c->fd_repair, &msg, 0) < 0) {
-		int ret = -errno;
-		err_perror("Failed to send sockets to TCP_REPAIR helper");
-		repair_close(c);
-		return ret;
-	}
-
-	if (recv(c->fd_repair, &reply, sizeof(reply), 0) < 0) {
-		int ret = -errno;
-		err_perror("Failed to receive reply from TCP_REPAIR helper");
-		repair_close(c);
-		return ret;
-	}
-
-	if (reply != repair_cmd) {
-		err("Unexpected reply from TCP_REPAIR helper: %d", reply);
-		repair_close(c);
-		return -ENXIO;
-	}
-
-	return 0;
-}
-
-/**
- * repair_set() - Add socket to TCP_REPAIR set with given command
- * @c:		Execution context
- * @s:		Socket to add
- * @cmd:	TCP_REPAIR_ON, TCP_REPAIR_OFF, or TCP_REPAIR_OFF_NO_WP
- *
- * Return: 0 on success, negative error code on failure
- */
-int repair_set(struct ctx *c, int s, int cmd)
-{
-	int rc;
-
-	if (repair_nfds && repair_cmd != cmd) {
-		if ((rc = repair_flush(c)))
-			return rc;
-	}
-
-	repair_cmd = cmd;
-	repair_fds[repair_nfds++] = s;
-
-	if (repair_nfds >= SCM_MAX_FD) {
-		if ((rc = repair_flush(c)))
-			return rc;
-	}
-
-	return 0;
-}
--- a/repair.h
+++ b/repair.h
@ -1,17 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later
- * Copyright (c) 2025 Red Hat GmbH
- * Author: Stefano Brivio <sbrivio@redhat.com>
- */
-
-#ifndef REPAIR_H
-#define REPAIR_H
-
-void repair_sock_init(const struct ctx *c);
-void repair_listen_handler(struct ctx *c, uint32_t events);
-void repair_handler(struct ctx *c, uint32_t events);
-void repair_close(struct ctx *c);
-void repair_wait(struct ctx *c);
-int repair_flush(struct ctx *c);
-int repair_set(struct ctx *c, int s, int cmd);
-
-#endif /* REPAIR_H */
--- a/seccomp.sh
+++ b/seccomp.sh
@ -14,10 +14,8 @@
 # Author: Stefano Brivio <sbrivio@redhat.com>

 TMP="$(mktemp)"
-OUT="$(mktemp)"
-OUT_FINAL="${1}"
-shift
 IN="$@"
+OUT="$(mktemp)"

 [ -z "${ARCH}" ] && ARCH="$(uname -m)"
 [ -z "${CC}" ] && CC="cc"
@ -255,7 +253,7 @@ for __p in ${__profiles}; do
 	__calls="${__calls} ${EXTRA_SYSCALLS:-}"
 	__calls="$(filter ${__calls})"

-	cols="$(stty -a 2>/dev/null | sed -n 's/.*columns \([0-9]*\).*/\1/p' || :)" 2>/dev/null
+	cols="$(stty -a | sed -n 's/.*columns \([0-9]*\).*/\1/p' || :)" 2>/dev/null
 	case $cols in [0-9]*) col_args="-w ${cols}";; *) col_args="";; esac
 	echo "seccomp profile ${__p} allows: ${__calls}" | tr '\n' ' ' | fmt -t ${col_args}

@ -270,4 +268,4 @@ for __p in ${__profiles}; do
 	gen_profile "${__p}" ${__calls}
 done

-mv "${OUT}" "${OUT_FINAL}"
+mv "${OUT}" seccomp.h
--- a/tap.c
+++ b/tap.c
@ -56,70 +56,18 @@
 #include "netlink.h"
 #include "pasta.h"
 #include "packet.h"
-#include "repair.h"
 #include "tap.h"
 #include "log.h"
 #include "vhost_user.h"
 #include "vu_common.h"

-/* Maximum allowed frame lengths (including L2 header) */
-
-/* Verify that an L2 frame length limit is large enough to contain the header,
- * but small enough to fit in the packet pool
- */
-#define CHECK_FRAME_LEN(len) \
-	static_assert((len) >= ETH_HLEN && (len) <= PACKET_MAX_LEN,	\
-		      #len " has bad value")
-
-CHECK_FRAME_LEN(L2_MAX_LEN_PASTA);
-CHECK_FRAME_LEN(L2_MAX_LEN_PASST);
-CHECK_FRAME_LEN(L2_MAX_LEN_VU);
-
-/* We try size the packet pools so that we can use a single batch for the entire
- * packet buffer.  This might be exceeded for vhost-user, though, which uses its
- * own buffers rather than pkt_buf.
- *
- * This is just a tuning parameter, the code will work with slightly more
- * overhead if it's incorrect.  So, we estimate based on the minimum practical
- * frame size - an empty UDP datagram - rather than the minimum theoretical
- * frame size.
- *
- * FIXME: Profile to work out how big this actually needs to be to amortise
- *        per-batch syscall overheads
- */
-#define TAP_MSGS_IP4							\
-	DIV_ROUND_UP(sizeof(pkt_buf),					\
-		     ETH_HLEN + sizeof(struct iphdr) + sizeof(struct udphdr))
-#define TAP_MSGS_IP6							\
-	DIV_ROUND_UP(sizeof(pkt_buf),					\
-		     ETH_HLEN + sizeof(struct ipv6hdr) + sizeof(struct udphdr))
-
 /* IPv4 (plus ARP) and IPv6 message batches from tap/guest to IP handlers */
-static PACKET_POOL_NOINIT(pool_tap4, TAP_MSGS_IP4, pkt_buf);
-static PACKET_POOL_NOINIT(pool_tap6, TAP_MSGS_IP6, pkt_buf);
+static PACKET_POOL_NOINIT(pool_tap4, TAP_MSGS, pkt_buf);
+static PACKET_POOL_NOINIT(pool_tap6, TAP_MSGS, pkt_buf);

 #define TAP_SEQS		128 /* Different L4 tuples in one batch */
 #define FRAGMENT_MSG_RATE	10  /* # seconds between fragment warnings */

-/**
- * tap_l2_max_len() - Maximum frame size (including L2 header) for current mode
- * @c:		Execution context
- */
-unsigned long tap_l2_max_len(const struct ctx *c)
-{
-	/* NOLINTBEGIN(bugprone-branch-clone): values can be the same */
-	switch (c->mode) {
-	case MODE_PASST:
-		return L2_MAX_LEN_PASST;
-	case MODE_PASTA:
-		return L2_MAX_LEN_PASTA;
-	case MODE_VU:
-		return L2_MAX_LEN_VU;
-	}
-	/* NOLINTEND(bugprone-branch-clone) */
-	ASSERT(0);
-}
-
 /**
 * tap_send_single() - Send a single frame
 * @c:		Execution context
@ -173,7 +121,7 @@ const struct in6_addr *tap_ip6_daddr(const struct ctx *c,
 *
 * Return: pointer at which to write the packet's payload
 */
-void *tap_push_l2h(const struct ctx *c, void *buf, uint16_t proto)
+static void *tap_push_l2h(const struct ctx *c, void *buf, uint16_t proto)
 {
 	struct ethhdr *eh = (struct ethhdr *)buf;

@ -194,8 +142,8 @@ void *tap_push_l2h(const struct ctx *c, void *buf, uint16_t proto)
 *
 * Return: pointer at which to write the packet's payload
 */
-void *tap_push_ip4h(struct iphdr *ip4h, struct in_addr src,
-		    struct in_addr dst, size_t l4len, uint8_t proto)
+static void *tap_push_ip4h(struct iphdr *ip4h, struct in_addr src,
+			   struct in_addr dst, size_t l4len, uint8_t proto)
 {
 	uint16_t l3len = l4len + sizeof(*ip4h);

@ -204,43 +152,13 @@ void *tap_push_ip4h(struct iphdr *ip4h, struct in_addr src,
 	ip4h->tos = 0;
 	ip4h->tot_len = htons(l3len);
 	ip4h->id = 0;
-	ip4h->frag_off = htons(IP_DF);
+	ip4h->frag_off = 0;
 	ip4h->ttl = 255;
 	ip4h->protocol = proto;
 	ip4h->saddr = src.s_addr;
 	ip4h->daddr = dst.s_addr;
 	ip4h->check = csum_ip4_header(l3len, proto, src, dst);
-	return (char *)ip4h + sizeof(*ip4h);
-}
-
-/**
- * tap_push_uh4() - Build UDPv4 header with checksum
- * @c:		Execution context
- * @src:	IPv4 source address
- * @sport:	UDP source port
- * @dst:	IPv4 destination address
- * @dport:	UDP destination port
- * @in:		UDP payload contents (not including UDP header)
- * @dlen:	UDP payload length (not including UDP header)
- *
- * Return: pointer at which to write the packet's payload
- */
-void *tap_push_uh4(struct udphdr *uh, struct in_addr src, in_port_t sport,
-		   struct in_addr dst, in_port_t dport,
-		   const void *in, size_t dlen)
-{
-	size_t l4len = dlen + sizeof(struct udphdr);
-	const struct iovec iov = {
-		.iov_base = (void *)in,
-		.iov_len = dlen
-	};
-	struct iov_tail payload = IOV_TAIL(&iov, 1, 0);
-
-	uh->source = htons(sport);
-	uh->dest = htons(dport);
-	uh->len = htons(l4len);
-	csum_udp4(uh, src, dst, &payload);
-	return (char *)uh + sizeof(*uh);
+	return ip4h + 1;
 }

 /**
@ -250,7 +168,7 @@ void *tap_push_uh4(struct udphdr *uh, struct in_addr src, in_port_t sport,
 * @sport:	UDP source port
 * @dst:	IPv4 destination address
 * @dport:	UDP destination port
- * @in:	UDP payload contents (not including UDP header)
+ * @in:		UDP payload contents (not including UDP header)
 * @dlen:	UDP payload length (not including UDP header)
 */
 void tap_udp4_send(const struct ctx *c, struct in_addr src, in_port_t sport,
@ -261,9 +179,18 @@ void tap_udp4_send(const struct ctx *c, struct in_addr src, in_port_t sport,
 	char buf[USHRT_MAX];
 	struct iphdr *ip4h = tap_push_l2h(c, buf, ETH_P_IP);
 	struct udphdr *uh = tap_push_ip4h(ip4h, src, dst, l4len, IPPROTO_UDP);
-	char *data = tap_push_uh4(uh, src, sport, dst, dport, in, dlen);
+	char *data = (char *)(uh + 1);
+	const struct iovec iov = {
+		.iov_base = (void *)in,
+		.iov_len = dlen
+	};

+	uh->source = htons(sport);
+	uh->dest = htons(dport);
+	uh->len = htons(l4len);
+	csum_udp4(uh, src, dst, &iov, 1, 0);
 	memcpy(data, in, dlen);
+
 	tap_send_single(c, buf, dlen + (data - buf));
 }

@ -300,9 +227,10 @@ void tap_icmp4_send(const struct ctx *c, struct in_addr src, struct in_addr dst,
 *
 * Return: pointer at which to write the packet's payload
 */
-void *tap_push_ip6h(struct ipv6hdr *ip6h,
-		    const struct in6_addr *src, const struct in6_addr *dst,
-		    size_t l4len, uint8_t proto, uint32_t flow)
+static void *tap_push_ip6h(struct ipv6hdr *ip6h,
+			   const struct in6_addr *src,
+			   const struct in6_addr *dst,
+			   size_t l4len, uint8_t proto, uint32_t flow)
 {
 	ip6h->payload_len = htons(l4len);
 	ip6h->priority = 0;
@ -311,40 +239,10 @@ void *tap_push_ip6h(struct ipv6hdr *ip6h,
 	ip6h->hop_limit = 255;
 	ip6h->saddr = *src;
 	ip6h->daddr = *dst;
-	ip6_set_flow_lbl(ip6h, flow);
-	return (char *)ip6h + sizeof(*ip6h);
-}
-
-/**
- * tap_push_uh6() - Build UDPv6 header with checksum
- * @c:		Execution context
- * @src:	IPv6 source address
- * @sport:	UDP source port
- * @dst:	IPv6 destination address
- * @dport:	UDP destination port
- * @flow:	Flow label
- * @in:		UDP payload contents (not including UDP header)
- * @dlen:	UDP payload length (not including UDP header)
- *
- * Return: pointer at which to write the packet's payload
- */
-void *tap_push_uh6(struct udphdr *uh,
-		   const struct in6_addr *src, in_port_t sport,
-		   const struct in6_addr *dst, in_port_t dport,
-		   void *in, size_t dlen)
-{
-	size_t l4len = dlen + sizeof(struct udphdr);
-	const struct iovec iov = {
-		.iov_base = in,
-		.iov_len = dlen
-	};
-	struct iov_tail payload = IOV_TAIL(&iov, 1, 0);
-
-	uh->source = htons(sport);
-	uh->dest = htons(dport);
-	uh->len = htons(l4len);
-	csum_udp6(uh, src, dst, &payload);
-	return (char *)uh + sizeof(*uh);
+	ip6h->flow_lbl[0] = (flow >> 16) & 0xf;
+	ip6h->flow_lbl[1] = (flow >> 8) & 0xff;
+	ip6h->flow_lbl[2] = (flow >> 0) & 0xff;
+	return ip6h + 1;
 }

 /**
@ -355,7 +253,7 @@ void *tap_push_uh6(struct udphdr *uh,
 * @dst:	IPv6 destination address
 * @dport:	UDP destination port
 * @flow:	Flow label
- * @in:	UDP payload contents (not including UDP header)
+ * @in:		UDP payload contents (not including UDP header)
 * @dlen:	UDP payload length (not including UDP header)
 */
 void tap_udp6_send(const struct ctx *c,
@ -368,9 +266,18 @@ void tap_udp6_send(const struct ctx *c,
 	struct ipv6hdr *ip6h = tap_push_l2h(c, buf, ETH_P_IPV6);
 	struct udphdr *uh = tap_push_ip6h(ip6h, src, dst,
 					  l4len, IPPROTO_UDP, flow);
-	char *data = tap_push_uh6(uh, src, sport, dst, dport, in, dlen);
+	char *data = (char *)(uh + 1);
+	const struct iovec iov = {
+		.iov_base = in,
+		.iov_len = dlen
+	};

+	uh->source = htons(sport);
+	uh->dest = htons(dport);
+	uh->len = htons(l4len);
+	csum_udp6(uh, src, dst, &iov, 1, 0);
 	memcpy(data, in, dlen);
+
 	tap_send_single(c, buf, dlen + (data - buf));
 }

@ -559,7 +466,6 @@ PACKET_POOL_DECL(pool_l4, UIO_MAXIOV, pkt_buf);
 * struct l4_seq4_t - Message sequence for one protocol handler call, IPv4
 * @msgs:	Count of messages in sequence
 * @protocol:	Protocol number
- * @ttl:	Time to live
 * @source:	Source port
 * @dest:	Destination port
 * @saddr:	Source address
@ -568,7 +474,6 @@ PACKET_POOL_DECL(pool_l4, UIO_MAXIOV, pkt_buf);
 */
 static struct tap4_l4_t {
 	uint8_t protocol;
-	uint8_t ttl;

 	uint16_t source;
 	uint16_t dest;
@ -583,17 +488,14 @@ static struct tap4_l4_t {
 * struct l4_seq6_t - Message sequence for one protocol handler call, IPv6
 * @msgs:	Count of messages in sequence
 * @protocol:	Protocol number
- * @flow_lbl:	IPv6 flow label
 * @source:	Source port
 * @dest:	Destination port
 * @saddr:	Source address
 * @daddr:	Destination address
- * @hop_limit:	Hop limit
 * @msg:	Array of messages that can be handled in a single call
 */
 static struct tap6_l4_t {
 	uint8_t protocol;
-	uint32_t flow_lbl :20;

 	uint16_t source;
 	uint16_t dest;
@ -601,8 +503,6 @@ static struct tap6_l4_t {
 	struct in6_addr saddr;
 	struct in6_addr daddr;

-	uint8_t hop_limit;
-
 	struct pool_l4_t p;
 } tap6_l4[TAP_SEQS /* Arbitrary: TAP_MSGS in theory, so limit in users */];

@ -791,8 +691,7 @@ resume:
 #define L4_MATCH(iph, uh, seq)							\
 	((seq)->protocol == (iph)->protocol &&					\
 	 (seq)->source   == (uh)->source    && (seq)->dest  == (uh)->dest &&	\
-	 (seq)->saddr.s_addr == (iph)->saddr &&				\
-	 (seq)->daddr.s_addr == (iph)->daddr && (seq)->ttl == (iph)->ttl)
+	 (seq)->saddr.s_addr == (iph)->saddr && (seq)->daddr.s_addr == (iph)->daddr)

 #define L4_SET(iph, uh, seq)						\
 	do {								\
@ -801,7 +700,6 @@ resume:
 		(seq)->dest		= (uh)->dest;			\
 		(seq)->saddr.s_addr	= (iph)->saddr;			\
 		(seq)->daddr.s_addr	= (iph)->daddr;			\
-		(seq)->ttl		= (iph)->ttl;			\
 	} while (0)

 		if (seq && L4_MATCH(iph, uh, seq) && seq->p.count < UIO_MAXIOV)
@ -843,14 +741,14 @@ append:
 			for (k = 0; k < p->count; )
 				k += tcp_tap_handler(c, PIF_TAP, AF_INET,
 						     &seq->saddr, &seq->daddr,
-						     0, p, k, now);
+						     p, k, now);
 		} else if (seq->protocol == IPPROTO_UDP) {
 			if (c->no_udp)
 				continue;
 			for (k = 0; k < p->count; )
 				k += udp_tap_handler(c, PIF_TAP, AF_INET,
 						     &seq->saddr, &seq->daddr,
-						     seq->ttl, p, k, now);
+						     p, k, now);
 		}
 	}

@ -921,9 +819,6 @@ resume:
 			if (IN6_IS_ADDR_UNSPECIFIED(&c->ip6.addr_seen)) {
 				c->ip6.addr_seen = *saddr;
 			}
-
-			if (IN6_IS_ADDR_UNSPECIFIED(&c->ip6.addr))
-				c->ip6.addr = *saddr;
 		} else if (!IN6_IS_ADDR_UNSPECIFIED(saddr)){
 			c->ip6.addr_seen = *saddr;
 		}
@ -971,20 +866,16 @@ resume:
 		((seq)->protocol == (proto)                &&		\
 		 (seq)->source   == (uh)->source           &&		\
 		 (seq)->dest == (uh)->dest                 &&		\
-		 (seq)->flow_lbl == ip6_get_flow_lbl(ip6h) &&		\
 		 IN6_ARE_ADDR_EQUAL(&(seq)->saddr, saddr)  &&		\
-		 IN6_ARE_ADDR_EQUAL(&(seq)->daddr, daddr)  &&		\
-		 (seq)->hop_limit == (ip6h)->hop_limit)
+		 IN6_ARE_ADDR_EQUAL(&(seq)->daddr, daddr))

 #define L4_SET(ip6h, proto, uh, seq)					\
 	do {								\
 		(seq)->protocol	= (proto);				\
 		(seq)->source	= (uh)->source;				\
 		(seq)->dest	= (uh)->dest;				\
-		(seq)->flow_lbl	= ip6_get_flow_lbl(ip6h);		\
 		(seq)->saddr	= *saddr;				\
 		(seq)->daddr	= *daddr;				\
-		(seq)->hop_limit = (ip6h)->hop_limit;			\
 	} while (0)

 		if (seq && L4_MATCH(ip6h, proto, uh, seq) &&
@ -1028,14 +919,14 @@ append:
 			for (k = 0; k < p->count; )
 				k += tcp_tap_handler(c, PIF_TAP, AF_INET6,
 						     &seq->saddr, &seq->daddr,
-						     seq->flow_lbl, p, k, now);
+						     p, k, now);
 		} else if (seq->protocol == IPPROTO_UDP) {
 			if (c->no_udp)
 				continue;
 			for (k = 0; k < p->count; )
 				k += udp_tap_handler(c, PIF_TAP, AF_INET6,
 						     &seq->saddr, &seq->daddr,
-						     seq->hop_limit, p, k, now);
+						     p, k, now);
 		}
 	}

@ -1070,10 +961,8 @@ void tap_handler(struct ctx *c, const struct timespec *now)
 * @c:		Execution context
 * @l2len:	Total L2 packet length
 * @p:		Packet buffer
- * @now:	Current timestamp
 */
-void tap_add_packet(struct ctx *c, ssize_t l2len, char *p,
-		    const struct timespec *now)
+void tap_add_packet(struct ctx *c, ssize_t l2len, char *p)
 {
 	const struct ethhdr *eh;

@ -1089,17 +978,9 @@ void tap_add_packet(struct ctx *c, ssize_t l2len, char *p,
 	switch (ntohs(eh->h_proto)) {
 	case ETH_P_ARP:
 	case ETH_P_IP:
-		if (pool_full(pool_tap4)) {
-			tap4_handler(c, pool_tap4, now);
-			pool_flush(pool_tap4);
-		}
 		packet_add(pool_tap4, l2len, p);
 		break;
 	case ETH_P_IPV6:
-		if (pool_full(pool_tap6)) {
-			tap6_handler(c, pool_tap6, now);
-			pool_flush(pool_tap6);
-		}
 		packet_add(pool_tap6, l2len, p);
 		break;
 	default:
@ -1116,10 +997,10 @@ void tap_sock_reset(struct ctx *c)
 	info("Client connection closed%s", c->one_off ? ", exiting" : "");

 	if (c->one_off)
-		_exit(EXIT_SUCCESS);
+		exit(EXIT_SUCCESS);

 	/* Close the connected socket, wait for a new connection */
-	epoll_del(c, c->fd_tap);
+	epoll_ctl(c->epollfd, EPOLL_CTL_DEL, c->fd_tap, NULL);
 	close(c->fd_tap);
 	c->fd_tap = -1;
 	if (c->mode == MODE_VU)
@ -1150,7 +1031,7 @@ static void tap_passt_input(struct ctx *c, const struct timespec *now)

 	do {
 		n = recv(c->fd_tap, pkt_buf + partial_len,
-			 sizeof(pkt_buf) - partial_len, MSG_DONTWAIT);
+			 TAP_BUF_BYTES - partial_len, MSG_DONTWAIT);
 	} while ((n < 0) && errno == EINTR);

 	if (n < 0) {
@ -1167,7 +1048,7 @@ static void tap_passt_input(struct ctx *c, const struct timespec *now)
 	while (n >= (ssize_t)sizeof(uint32_t)) {
 		uint32_t l2len = ntohl_unaligned(p);

-		if (l2len < sizeof(struct ethhdr) || l2len > L2_MAX_LEN_PASST) {
+		if (l2len < sizeof(struct ethhdr) || l2len > ETH_MAX_MTU) {
 			err("Bad frame size from guest, resetting connection");
 			tap_sock_reset(c);
 			return;
@ -1180,7 +1061,7 @@ static void tap_passt_input(struct ctx *c, const struct timespec *now)
 		p += sizeof(uint32_t);
 		n -= sizeof(uint32_t);

-		tap_add_packet(c, l2len, p, now);
+		tap_add_packet(c, l2len, p);

 		p += l2len;
 		n -= l2len;
@ -1221,10 +1102,8 @@ static void tap_pasta_input(struct ctx *c, const struct timespec *now)

 	tap_flush_pools();

-	for (n = 0;
-	     n <= (ssize_t)(sizeof(pkt_buf) - L2_MAX_LEN_PASTA);
-	     n += len) {
-		len = read(c->fd_tap, pkt_buf + n, L2_MAX_LEN_PASTA);
+	for (n = 0; n <= (ssize_t)(TAP_BUF_BYTES - ETH_MAX_MTU); n += len) {
+		len = read(c->fd_tap, pkt_buf + n, ETH_MAX_MTU);

 		if (len == 0) {
 			die("EOF on tap device, exiting");
@ -1242,10 +1121,10 @@ static void tap_pasta_input(struct ctx *c, const struct timespec *now)

 		/* Ignore frames of bad length */
 		if (len < (ssize_t)sizeof(struct ethhdr) ||
-		    len > (ssize_t)L2_MAX_LEN_PASTA)
+		    len > (ssize_t)ETH_MAX_MTU)
 			continue;

-		tap_add_packet(c, len, pkt_buf + n, now);
+		tap_add_packet(c, len, pkt_buf + n);
 	}

 	tap_handler(c, now);
@ -1267,6 +1146,68 @@ void tap_handler_pasta(struct ctx *c, uint32_t events,
 		tap_pasta_input(c, now);
 }

+/**
+ * tap_sock_unix_open() - Create and bind AF_UNIX socket
+ * @sock_path:	Socket path. If empty, set on return (UNIX_SOCK_PATH as prefix)
+ *
+ * Return: socket descriptor on success, won't return on failure
+ */
+int tap_sock_unix_open(char *sock_path)
+{
+	int fd = socket(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0);
+	struct sockaddr_un addr = {
+		.sun_family = AF_UNIX,
+	};
+	int i;
+
+	if (fd < 0)
+		die_perror("Failed to open UNIX domain socket");
+
+	for (i = 1; i < UNIX_SOCK_MAX; i++) {
+		char *path = addr.sun_path;
+		int ex, ret;
+
+		if (*sock_path)
+			memcpy(path, sock_path, UNIX_PATH_MAX);
+		else if (snprintf_check(path, UNIX_PATH_MAX - 1,
+					UNIX_SOCK_PATH, i))
+			die_perror("Can't build UNIX domain socket path");
+
+		ex = socket(AF_UNIX, SOCK_STREAM | SOCK_NONBLOCK | SOCK_CLOEXEC,
+			    0);
+		if (ex < 0)
+			die_perror("Failed to check for UNIX domain conflicts");
+
+		ret = connect(ex, (const struct sockaddr *)&addr, sizeof(addr));
+		if (!ret || (errno != ENOENT && errno != ECONNREFUSED &&
+			     errno != EACCES)) {
+			if (*sock_path)
+				die("Socket path %s already in use", path);
+
+			close(ex);
+			continue;
+		}
+		close(ex);
+
+		unlink(path);
+		ret = bind(fd, (const struct sockaddr *)&addr, sizeof(addr));
+		if (*sock_path && ret)
+			die_perror("Failed to bind UNIX domain socket");
+
+		if (!ret)
+			break;
+	}
+
+	if (i == UNIX_SOCK_MAX)
+		die_perror("Failed to bind UNIX domain socket");
+
+	info("UNIX domain socket bound at %s", addr.sun_path);
+	if (!*sock_path)
+		memcpy(sock_path, addr.sun_path, UNIX_PATH_MAX);
+
+	return fd;
+}
+
 /**
 * tap_backend_show_hints() - Give help information to start QEMU
 * @c:		Execution context
@ -1309,33 +1250,6 @@ static void tap_sock_unix_init(const struct ctx *c)
 	epoll_ctl(c->epollfd, EPOLL_CTL_ADD, c->fd_tap_listen, &ev);
 }

-/**
- * tap_start_connection() - start a new connection
- * @c:		Execution context
- */
-static void tap_start_connection(const struct ctx *c)
-{
-	struct epoll_event ev = { 0 };
-	union epoll_ref ref = { 0 };
-
-	ref.fd = c->fd_tap;
-	switch (c->mode) {
-	case MODE_PASST:
-		ref.type = EPOLL_TYPE_TAP_PASST;
-		break;
-	case MODE_PASTA:
-		ref.type = EPOLL_TYPE_TAP_PASTA;
-		break;
-	case MODE_VU:
-		ref.type = EPOLL_TYPE_VHOST_CMD;
-		break;
-	}
-
-	ev.events = EPOLLIN | EPOLLRDHUP;
-	ev.data.u64 = ref.u64;
-	epoll_ctl(c->epollfd, EPOLL_CTL_ADD, c->fd_tap, &ev);
-}
-
 /**
 * tap_listen_handler() - Handle new connection on listening socket
 * @c:		Execution context
@ -1343,6 +1257,8 @@ static void tap_start_connection(const struct ctx *c)
 */
 void tap_listen_handler(struct ctx *c, uint32_t events)
 {
+	struct epoll_event ev = { 0 };
+	union epoll_ref ref = { 0 };
 	int v = INT_MAX / 2;
 	struct ucred ucred;
 	socklen_t len;
@ -1381,7 +1297,14 @@ void tap_listen_handler(struct ctx *c, uint32_t events)
 	    setsockopt(c->fd_tap, SOL_SOCKET, SO_SNDBUF, &v, sizeof(v)))
 		trace("tap: failed to set SO_SNDBUF to %i", v);

-	tap_start_connection(c);
+	ref.fd = c->fd_tap;
+	if (c->mode == MODE_VU)
+		ref.type = EPOLL_TYPE_VHOST_CMD;
+	else
+		ref.type = EPOLL_TYPE_TAP_PASST;
+	ev.events = EPOLLIN | EPOLLRDHUP;
+	ev.data.u64 = ref.u64;
+	epoll_ctl(c->epollfd, EPOLL_CTL_ADD, c->fd_tap, &ev);
 }

 /**
@ -1425,13 +1348,19 @@ static int tap_ns_tun(void *arg)
 */
 static void tap_sock_tun_init(struct ctx *c)
 {
+	union epoll_ref ref = { .type = EPOLL_TYPE_TAP_PASTA };
+	struct epoll_event ev = { 0 };
+
 	NS_CALL(tap_ns_tun, c);
 	if (c->fd_tap == -1)
 		die("Failed to set up tap device in namespace");

 	pasta_ns_conf(c);

-	tap_start_connection(c);
+	ref.fd = c->fd_tap;
+	ev.events = EPOLLIN | EPOLLRDHUP;
+	ev.data.u64 = ref.u64;
+	epoll_ctl(c->epollfd, EPOLL_CTL_ADD, c->fd_tap, &ev);
 }

 /**
@ -1443,8 +1372,8 @@ void tap_sock_update_pool(void *base, size_t size)
 {
 	int i;

-	pool_tap4_storage = PACKET_INIT(pool_tap4, TAP_MSGS_IP4, base, size);
-	pool_tap6_storage = PACKET_INIT(pool_tap6, TAP_MSGS_IP6, base, size);
+	pool_tap4_storage = PACKET_INIT(pool_tap4, TAP_MSGS, base, size);
+	pool_tap6_storage = PACKET_INIT(pool_tap6, TAP_MSGS, base, size);

 	for (i = 0; i < TAP_SEQS; i++) {
 		tap4_l4[i].p = PACKET_INIT(pool_l4, UIO_MAXIOV, base, size);
@ -1459,16 +1388,32 @@ void tap_sock_update_pool(void *base, size_t size)
 */
 void tap_backend_init(struct ctx *c)
 {
-	if (c->mode == MODE_VU) {
+	if (c->mode == MODE_VU)
 		tap_sock_update_pool(NULL, 0);
-		vu_init(c);
-	} else {
+	else
 		tap_sock_update_pool(pkt_buf, sizeof(pkt_buf));
-	}

 	if (c->fd_tap != -1) { /* Passed as --fd */
+		struct epoll_event ev = { 0 };
+		union epoll_ref ref;
+
 		ASSERT(c->one_off);
-		tap_start_connection(c);
+		ref.fd = c->fd_tap;
+		switch (c->mode) {
+		case MODE_PASST:
+			ref.type = EPOLL_TYPE_TAP_PASST;
+			break;
+		case MODE_PASTA:
+			ref.type = EPOLL_TYPE_TAP_PASTA;
+			break;
+		case MODE_VU:
+			ref.type = EPOLL_TYPE_VHOST_CMD;
+			break;
+		}
+
+		ev.events = EPOLLIN | EPOLLRDHUP;
+		ev.data.u64 = ref.u64;
+		epoll_ctl(c->epollfd, EPOLL_CTL_ADD, c->fd_tap, &ev);
 		return;
 	}

@ -1477,7 +1422,7 @@ void tap_backend_init(struct ctx *c)
 		tap_sock_tun_init(c);
 		break;
 	case MODE_VU:
-		repair_sock_init(c);
+		vu_init(c);
 		/* fall through */
 	case MODE_PASST:
 		tap_sock_unix_init(c);
--- a/tap.h
+++ b/tap.h
@ -6,32 +6,7 @@
 #ifndef TAP_H
 #define TAP_H

-/** L2_MAX_LEN_PASTA - Maximum frame length for pasta mode (with L2 header)
- *
- * The kernel tuntap device imposes a maximum frame size of 65535 including
- * 'hard_header_len' (14 bytes for L2 Ethernet in the case of "tap" mode).
- */
-#define L2_MAX_LEN_PASTA	USHRT_MAX
-
-/** L2_MAX_LEN_PASST - Maximum frame length for passt mode (with L2 header)
- *
- * The only structural limit the QEMU socket protocol imposes on frames is
- * (2^32-1) bytes, but that would be ludicrously long in practice.  For now,
- * limit it somewhat arbitrarily to 65535 bytes.  FIXME: Work out an appropriate
- * limit with more precision.
- */
-#define L2_MAX_LEN_PASST	USHRT_MAX
-
-/** L2_MAX_LEN_VU - Maximum frame length for vhost-user mode (with L2 header)
- *
- * vhost-user allows multiple buffers per frame, each of which can be quite
- * large, so the inherent frame size limit is rather large.  Much larger than is
- * actually useful for IP.  For now limit arbitrarily to 65535 bytes. FIXME:
- * Work out an appropriate limit with more precision.
- */
-#define L2_MAX_LEN_VU		USHRT_MAX
-
-struct udphdr;
+#define ETH_HDR_INIT(proto) { .h_proto = htons_constant(proto) }

 /**
 * struct tap_hdr - tap backend specific headers
@ -69,23 +44,6 @@ static inline void tap_hdr_update(struct tap_hdr *thdr, size_t l2len)
 		thdr->vnet_len = htonl(l2len);
 }

-unsigned long tap_l2_max_len(const struct ctx *c);
-void *tap_push_l2h(const struct ctx *c, void *buf, uint16_t proto);
-void *tap_push_ip4h(struct iphdr *ip4h, struct in_addr src,
-		     struct in_addr dst, size_t l4len, uint8_t proto);
-void *tap_push_uh4(struct udphdr *uh, struct in_addr src, in_port_t sport,
-		   struct in_addr dst, in_port_t dport,
-		   const void *in, size_t dlen);
-void *tap_push_uh6(struct udphdr *uh,
-		   const struct in6_addr *src, in_port_t sport,
-		   const struct in6_addr *dst, in_port_t dport,
-		   void *in, size_t dlen);
-void *tap_push_ip4h(struct iphdr *ip4h, struct in_addr src,
-		    struct in_addr dst, size_t l4len, uint8_t proto);
-void *tap_push_ip6h(struct ipv6hdr *ip6h,
-		    const struct in6_addr *src,
-		    const struct in6_addr *dst,
-		    size_t l4len, uint8_t proto, uint32_t flow);
 void tap_udp4_send(const struct ctx *c, struct in_addr src, in_port_t sport,
 		   struct in_addr dst, in_port_t dport,
 		   const void *in, size_t dlen);
@ -93,9 +51,6 @@ void tap_icmp4_send(const struct ctx *c, struct in_addr src, struct in_addr dst,
 		    const void *in, size_t l4len);
 const struct in6_addr *tap_ip6_daddr(const struct ctx *c,
 				     const struct in6_addr *src);
-void *tap_push_ip6h(struct ipv6hdr *ip6h,
-		    const struct in6_addr *src, const struct in6_addr *dst,
-		    size_t l4len, uint8_t proto, uint32_t flow);
 void tap_udp6_send(const struct ctx *c,
 		   const struct in6_addr *src, in_port_t sport,
 		   const struct in6_addr *dst, in_port_t dport,
@ -119,7 +74,6 @@ void tap_sock_update_pool(void *base, size_t size);
 void tap_backend_init(struct ctx *c);
 void tap_flush_pools(void);
 void tap_handler(struct ctx *c, const struct timespec *now);
-void tap_add_packet(struct ctx *c, ssize_t l2len, char *p,
-		    const struct timespec *now);
+void tap_add_packet(struct ctx *c, ssize_t l2len, char *p);

 #endif /* TAP_H */
--- a/tcp.c
+++ b/tcp.c
--- a/tcp.h
+++ b/tcp.h
@ -16,7 +16,7 @@ void tcp_listen_handler(const struct ctx *c, union epoll_ref ref,
 void tcp_sock_handler(const struct ctx *c, union epoll_ref ref,
 		      uint32_t events);
 int tcp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af,
-		    const void *saddr, const void *daddr, uint32_t flow_lbl,
+		    const void *saddr, const void *daddr,
 		    const struct pool *p, int idx, const struct timespec *now);
 int tcp_sock_init(const struct ctx *c, const union inany_addr *addr,
 		  const char *ifname, in_port_t port);
@ -25,6 +25,7 @@ void tcp_timer(struct ctx *c, const struct timespec *now);
 void tcp_defer_handler(struct ctx *c);

 void tcp_update_l2_buf(const unsigned char *eth_d, const unsigned char *eth_s);
+int tcp_set_peek_offset(int s, int offset);

 extern bool peek_offset_cap;

--- a/tcp_buf.c
+++ b/tcp_buf.c
@ -125,7 +125,7 @@ static void tcp_revert_seq(const struct ctx *c, struct tcp_tap_conn **conns,

 		conn->seq_to_tap = seq;
 		peek_offset = conn->seq_to_tap - conn->seq_ack_from_tap;
-		if (tcp_set_peek_offset(conn, peek_offset))
+		if (tcp_set_peek_offset(conn->sock, peek_offset))
 			tcp_rst(c, conn);
 	}
 }
@ -151,29 +151,30 @@ void tcp_payload_flush(const struct ctx *c)
 * tcp_buf_fill_headers() - Fill 802.3, IP, TCP headers in pre-cooked buffers
 * @conn:	Connection pointer
 * @iov:	Pointer to an array of iovec of TCP pre-cooked buffers
+ * @dlen:	TCP payload length
 * @check:	Checksum, if already known
 * @seq:	Sequence number for this segment
 * @no_tcp_csum: Do not set TCP checksum
 */
 static void tcp_l2_buf_fill_headers(const struct tcp_tap_conn *conn,
-				    struct iovec *iov, const uint16_t *check,
-				    uint32_t seq, bool no_tcp_csum)
+				    struct iovec *iov, size_t dlen,
+				    const uint16_t *check, uint32_t seq,
+				    bool no_tcp_csum)
 {
-	struct iov_tail tail = IOV_TAIL(&iov[TCP_IOV_PAYLOAD], 1, 0);
-	struct tcphdr *th = IOV_REMOVE_HEADER(&tail, struct tcphdr);
-	struct tap_hdr *taph = iov[TCP_IOV_TAP].iov_base;
 	const struct flowside *tapside = TAPFLOW(conn);
 	const struct in_addr *a4 = inany_v4(&tapside->oaddr);
-	struct ipv6hdr *ip6h = NULL;
-	struct iphdr *ip4h = NULL;

-	if (a4)
-		ip4h = iov[TCP_IOV_IP].iov_base;
-	else
-		ip6h = iov[TCP_IOV_IP].iov_base;
-
-	tcp_fill_headers(conn, taph, ip4h, ip6h, th, &tail,
-			 check, seq, no_tcp_csum);
+	if (a4) {
+		tcp_fill_headers4(conn, iov[TCP_IOV_TAP].iov_base,
+				  iov[TCP_IOV_IP].iov_base,
+				  iov[TCP_IOV_PAYLOAD].iov_base, dlen,
+				  check, seq, no_tcp_csum);
+	} else {
+		tcp_fill_headers6(conn, iov[TCP_IOV_TAP].iov_base,
+				  iov[TCP_IOV_IP].iov_base,
+				  iov[TCP_IOV_PAYLOAD].iov_base, dlen,
+				  seq, no_tcp_csum);
+	}
 }

 /**
@ -212,7 +213,7 @@ int tcp_buf_send_flag(const struct ctx *c, struct tcp_tap_conn *conn, int flags)
 	tcp_payload_used++;
 	l4len = optlen + sizeof(struct tcphdr);
 	iov[TCP_IOV_PAYLOAD].iov_len = l4len;
-	tcp_l2_buf_fill_headers(conn, iov, NULL, seq, false);
+	tcp_l2_buf_fill_headers(conn, iov, optlen, NULL, seq, false);

 	if (flags & DUP_ACK) {
 		struct iovec *dup_iov = tcp_l2_iov[tcp_payload_used++];
@ -239,10 +240,9 @@ int tcp_buf_send_flag(const struct ctx *c, struct tcp_tap_conn *conn, int flags)
 * @dlen:	TCP payload length
 * @no_csum:	Don't compute IPv4 checksum, use the one from previous buffer
 * @seq:	Sequence number to be sent
- * @push:	Set PSH flag, last segment in a batch
 */
 static void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn,
-			    ssize_t dlen, int no_csum, uint32_t seq, bool push)
+			    ssize_t dlen, int no_csum, uint32_t seq)
 {
 	struct tcp_payload_t *payload;
 	const uint16_t *check = NULL;
@ -269,9 +269,8 @@ static void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn,
 	payload->th.th_x2 = 0;
 	payload->th.th_flags = 0;
 	payload->th.ack = 1;
-	payload->th.psh = push;
 	iov[TCP_IOV_PAYLOAD].iov_len = dlen + sizeof(struct tcphdr);
-	tcp_l2_buf_fill_headers(conn, iov, check, seq, false);
+	tcp_l2_buf_fill_headers(conn, iov, dlen, check, seq, false);
 	if (++tcp_payload_used > TCP_FRAMES_MEM - 1)
 		tcp_payload_flush(c);
 }
@ -304,14 +303,13 @@ int tcp_buf_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn)
 			   conn->seq_ack_from_tap, conn->seq_to_tap);
 		conn->seq_to_tap = conn->seq_ack_from_tap;
 		already_sent = 0;
-		if (tcp_set_peek_offset(conn, 0)) {
+		if (tcp_set_peek_offset(s, 0)) {
 			tcp_rst(c, conn);
 			return -1;
 		}
 	}

 	if (!wnd_scaled || already_sent >= wnd_scaled) {
-		conn_flag(c, conn, ACK_FROM_TAP_BLOCKS);
 		conn_flag(c, conn, STALLED);
 		conn_flag(c, conn, ACK_FROM_TAP_DUE);
 		return 0;
@ -362,9 +360,6 @@ int tcp_buf_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn)
 			return -errno;
 		}

-		if (already_sent) /* No new data and EAGAIN: set EPOLLET */
-			conn_flag(c, conn, STALLED);
-
 		return 0;
 	}

@ -390,7 +385,6 @@ int tcp_buf_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn)
 		return 0;
 	}

-	conn_flag(c, conn, ~ACK_FROM_TAP_BLOCKS);
 	conn_flag(c, conn, ~STALLED);

 	send_bufs = DIV_ROUND_UP(len, mss);
@ -404,14 +398,11 @@ int tcp_buf_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn)
 	seq = conn->seq_to_tap;
 	for (i = 0; i < send_bufs; i++) {
 		int no_csum = i && i != send_bufs - 1 && tcp_payload_used;
-		bool push = false;

-		if (i == send_bufs - 1) {
+		if (i == send_bufs - 1)
 			dlen = last_len;
-			push = true;
-		}

-		tcp_data_to_tap(c, conn, dlen, no_csum, seq, push);
+		tcp_data_to_tap(c, conn, dlen, no_csum, seq);
 		seq += dlen;
 	}

--- a/tcp_conn.h
+++ b/tcp_conn.h
@ -19,7 +19,6 @@
 * @tap_mss:		MSS advertised by tap/guest, rounded to 2 ^ TCP_MSS_BITS
 * @sock:		Socket descriptor number
 * @events:		Connection events, implying connection states
- * @listening_sock:	Listening socket this socket was accept()ed from, or -1
 * @timer:		timerfd descriptor for timeout events
 * @flags:		Connection flags representing internal attributes
 * @sndbuf:		Sending buffer in kernel, rounded to 2 ^ SNDBUF_BITS
@ -69,7 +68,6 @@ struct tcp_tap_conn {
 #define	CONN_STATE_BITS		/* Setting these clears other flags */	\
 	(SOCK_ACCEPTED | TAP_SYN_RCVD | ESTABLISHED)

-	int		listening_sock;

 	int		timer		:FD_REF_BITS;

@ -79,7 +77,6 @@ struct tcp_tap_conn {
 #define ACTIVE_CLOSE		BIT(2)
 #define ACK_TO_TAP_DUE		BIT(3)
 #define ACK_FROM_TAP_DUE	BIT(4)
-#define ACK_FROM_TAP_BLOCKS	BIT(5)

 #define SNDBUF_BITS		24
 	unsigned int	sndbuf		:SNDBUF_BITS;
@ -98,95 +95,6 @@ struct tcp_tap_conn {
 	uint32_t	seq_init_from_tap;
 };

-/**
- * struct tcp_tap_transfer - Migrated TCP data, flow table part, network order
- * @pif:		Interfaces for each side of the flow
- * @side:		Addresses and ports for each side of the flow
- * @retrans:		Number of retransmissions occurred due to ACK_TIMEOUT
- * @ws_from_tap:	Window scaling factor advertised from tap/guest
- * @ws_to_tap:		Window scaling factor advertised to tap/guest
- * @events:		Connection events, implying connection states
- * @tap_mss:		MSS advertised by tap/guest, rounded to 2 ^ TCP_MSS_BITS
- * @sndbuf:		Sending buffer in kernel, rounded to 2 ^ SNDBUF_BITS
- * @flags:		Connection flags representing internal attributes
- * @seq_dup_ack_approx:	Last duplicate ACK number sent to tap
- * @wnd_from_tap:	Last window size from tap, unscaled (as received)
- * @wnd_to_tap:		Sending window advertised to tap, unscaled (as sent)
- * @seq_to_tap:		Next sequence for packets to tap
- * @seq_ack_from_tap:	Last ACK number received from tap
- * @seq_from_tap:	Next sequence for packets from tap (not actually sent)
- * @seq_ack_to_tap:	Last ACK number sent to tap
- * @seq_init_from_tap:	Initial sequence number from tap
-*/
-struct tcp_tap_transfer {
-	uint8_t		pif[SIDES];
-	struct flowside	side[SIDES];
-
-	uint8_t		retrans;
-	uint8_t		ws_from_tap;
-	uint8_t		ws_to_tap;
-	uint8_t		events;
-
-	uint32_t	tap_mss;
-
-	uint32_t	sndbuf;
-
-	uint8_t		flags;
-	uint8_t		seq_dup_ack_approx;
-
-	uint16_t	wnd_from_tap;
-	uint16_t	wnd_to_tap;
-
-	uint32_t	seq_to_tap;
-	uint32_t	seq_ack_from_tap;
-	uint32_t	seq_from_tap;
-	uint32_t	seq_ack_to_tap;
-	uint32_t	seq_init_from_tap;
-} __attribute__((packed, aligned(__alignof__(uint32_t))));
-
-/**
- * struct tcp_tap_transfer_ext - Migrated TCP data, outside flow, network order
- * @seq_snd:		Socket-side send sequence
- * @seq_rcv:		Socket-side receive sequence
- * @sndq:		Length of pending send queue (unacknowledged / not sent)
- * @notsent:		Part of pending send queue that wasn't sent out yet
- * @rcvq:		Length of pending receive queue
- * @mss:		Socket-side MSS clamp
- * @timestamp:		RFC 7323 timestamp
- * @snd_wl1:		Next sequence used in window probe (next sequence - 1)
- * @snd_wnd:		Socket-side sending window
- * @max_window:		Window clamp
- * @rcv_wnd:		Socket-side receive window
- * @rcv_wup:		rcv_nxt on last window update sent
- * @snd_ws:		Window scaling factor, send
- * @rcv_ws:		Window scaling factor, receive
- * @tcpi_state:		Connection state in TCP_INFO style (enum, tcp_states.h)
- * @tcpi_options:	TCPI_OPT_* constants (timestamps, selective ACK)
- */
-struct tcp_tap_transfer_ext {
-	uint32_t	seq_snd;
-	uint32_t	seq_rcv;
-
-	uint32_t	sndq;
-	uint32_t	notsent;
-	uint32_t	rcvq;
-
-	uint32_t	mss;
-	uint32_t	timestamp;
-
-	/* We can't just use struct tcp_repair_window: we need network order */
-	uint32_t	snd_wl1;
-	uint32_t	snd_wnd;
-	uint32_t	max_window;
-	uint32_t	rcv_wnd;
-	uint32_t	rcv_wup;
-
-	uint8_t		snd_ws;
-	uint8_t		rcv_ws;
-	uint8_t		tcpi_state;
-	uint8_t		tcpi_options;
-} __attribute__((packed, aligned(__alignof__(uint32_t))));
-
 /**
 * struct tcp_splice_conn - Descriptor for a spliced TCP connection
 * @f:			Generic flow information
@ -231,23 +139,11 @@ extern int init_sock_pool4	[TCP_SOCK_POOL_SIZE];
 extern int init_sock_pool6	[TCP_SOCK_POOL_SIZE];

 bool tcp_flow_defer(const struct tcp_tap_conn *conn);
-
-int tcp_flow_repair_on(struct ctx *c, const struct tcp_tap_conn *conn);
-int tcp_flow_repair_off(struct ctx *c, const struct tcp_tap_conn *conn);
-
-int tcp_flow_migrate_source(int fd, struct tcp_tap_conn *conn);
-int tcp_flow_migrate_source_ext(int fd, const struct tcp_tap_conn *conn);
-
-int tcp_flow_migrate_target(struct ctx *c, int fd);
-int tcp_flow_migrate_target_ext(struct ctx *c, struct tcp_tap_conn *conn, int fd);
-
-bool tcp_flow_is_established(const struct tcp_tap_conn *conn);
-
 bool tcp_splice_flow_defer(struct tcp_splice_conn *conn);
 void tcp_splice_timer(const struct ctx *c, struct tcp_splice_conn *conn);
 int tcp_conn_pool_sock(int pool[]);
-int tcp_conn_sock(sa_family_t af);
-int tcp_sock_refill_pool(int pool[], sa_family_t af);
+int tcp_conn_sock(const struct ctx *c, sa_family_t af);
+int tcp_sock_refill_pool(const struct ctx *c, int pool[], sa_family_t af);
 void tcp_splice_refill(const struct ctx *c);

 #endif /* TCP_CONN_H */
--- a/tcp_internal.h
+++ b/tcp_internal.h
@ -38,13 +38,9 @@
 #define OPT_SACK	5
 #define OPT_TS		8

-#define TAPSIDE(conn_)		((conn_)->f.pif[1] == PIF_TAP)
-#define TAPFLOW(conn_)		(&((conn_)->f.side[TAPSIDE(conn_)]))
-#define TAP_SIDX(conn_)		(FLOW_SIDX((conn_), TAPSIDE(conn_)))
-
-#define HOSTSIDE(conn_)		((conn_)->f.pif[1] == PIF_HOST)
-#define HOSTFLOW(conn_)		(&((conn_)->f.side[HOSTSIDE(conn_)]))
-#define HOST_SIDX(conn_)	(FLOW_SIDX((conn_), TAPSIDE(conn_)))
+#define TAPSIDE(conn_)	((conn_)->f.pif[1] == PIF_TAP)
+#define TAPFLOW(conn_)	(&((conn_)->f.side[TAPSIDE(conn_)]))
+#define TAP_SIDX(conn_)	(FLOW_SIDX((conn_), TAPSIDE(conn_)))

 #define CONN_V4(conn)		(!!inany_v4(&TAPFLOW(conn)->oaddr))
 #define CONN_V6(conn)		(!CONN_V4(conn))
@ -166,17 +162,25 @@ void tcp_rst_do(const struct ctx *c, struct tcp_tap_conn *conn);

 struct tcp_info_linux;

-void tcp_fill_headers(const struct tcp_tap_conn *conn,
-		      struct tap_hdr *taph,
-		      struct iphdr *ip4h, struct ipv6hdr *ip6h,
-		      struct tcphdr *th, struct iov_tail *payload,
-		      const uint16_t *ip4_check, uint32_t seq, bool no_tcp_csum);
+void tcp_update_check_tcp4(const struct iphdr *iph,
+			   const struct iovec *iov, int iov_cnt,
+			   size_t l4offset);
+void tcp_update_check_tcp6(const struct ipv6hdr *ip6h,
+			   const struct iovec *iov, int iov_cnt,
+			   size_t l4offset);
+void tcp_fill_headers4(const struct tcp_tap_conn *conn,
+		       struct tap_hdr *taph, struct iphdr *iph,
+		       struct tcp_payload_t *bp, size_t dlen,
+		       const uint16_t *check, uint32_t seq, bool no_tcp_csum);
+void tcp_fill_headers6(const struct tcp_tap_conn *conn,
+		       struct tap_hdr *taph, struct ipv6hdr *ip6h,
+		       struct tcp_payload_t *bp, size_t dlen,
+		       uint32_t seq, bool no_tcp_csum);

 int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_tap_conn *conn,
 			  bool force_seq, struct tcp_info_linux *tinfo);
 int tcp_prepare_flags(const struct ctx *c, struct tcp_tap_conn *conn,
 		      int flags, struct tcphdr *th, struct tcp_syn_opts *opts,
 		      size_t *optlen);
-int tcp_set_peek_offset(const struct tcp_tap_conn *conn, int offset);

 #endif /* TCP_INTERNAL_H */
--- a/tcp_splice.c
+++ b/tcp_splice.c
@ -28,7 +28,7 @@
 * - FIN_SENT_0:		FIN (write shutdown) sent to accepted socket
 * - FIN_SENT_1:		FIN (write shutdown) sent to target socket
 *
- * #syscalls:pasta pipe2|pipe fcntl arm:fcntl64 ppc64:fcntl64|fcntl i686:fcntl64
+ * #syscalls:pasta pipe2|pipe fcntl arm:fcntl64 ppc64:fcntl64 i686:fcntl64
 */

 #include <sched.h>
@ -131,12 +131,8 @@ static void tcp_splice_conn_epoll_events(uint16_t events,
 		ev[1].events = EPOLLOUT;
 	}

-	flow_foreach_sidei(sidei) {
-		if (events & OUT_WAIT(sidei)) {
-			ev[sidei].events |= EPOLLOUT;
-			ev[!sidei].events &= ~EPOLLIN;
-		}
-	}
+	flow_foreach_sidei(sidei)
+		ev[sidei].events |= (events & OUT_WAIT(sidei)) ? EPOLLOUT : 0;
 }

 /**
@ -164,7 +160,7 @@ static int tcp_splice_epoll_ctl(const struct ctx *c,
 	if (epoll_ctl(c->epollfd, m, conn->s[0], &ev[0]) ||
 	    epoll_ctl(c->epollfd, m, conn->s[1], &ev[1])) {
 		int ret = -errno;
-		flow_perror(conn, "ERROR on epoll_ctl()");
+		flow_err(conn, "ERROR on epoll_ctl(): %s", strerror(errno));
 		return ret;
 	}

@ -204,8 +200,8 @@ static void conn_flag_do(const struct ctx *c, struct tcp_splice_conn *conn,
 	}

 	if (flag == CLOSING) {
-		epoll_del(c, conn->s[0]);
-		epoll_del(c, conn->s[1]);
+		epoll_ctl(c->epollfd, EPOLL_CTL_DEL, conn->s[0], NULL);
+		epoll_ctl(c->epollfd, EPOLL_CTL_DEL, conn->s[1], NULL);
 	}
 }

@ -317,8 +313,8 @@ static int tcp_splice_connect_finish(const struct ctx *c,

 		if (conn->pipe[sidei][0] < 0) {
 			if (pipe2(conn->pipe[sidei], O_NONBLOCK | O_CLOEXEC)) {
-				flow_perror(conn, "cannot create %d->%d pipe",
-					    sidei, !sidei);
+				flow_err(conn, "cannot create %d->%d pipe: %s",
+					 sidei, !sidei, strerror(errno));
 				conn_flag(c, conn, CLOSING);
 				return -EIO;
 			}
@ -352,10 +348,9 @@ static int tcp_splice_connect(const struct ctx *c, struct tcp_splice_conn *conn)
 	uint8_t tgtpif = conn->f.pif[TGTSIDE];
 	union sockaddr_inany sa;
 	socklen_t sl;
-	int one = 1;

 	if (tgtpif == PIF_HOST)
-		conn->s[1] = tcp_conn_sock(af);
+		conn->s[1] = tcp_conn_sock(c, af);
 	else if (tgtpif == PIF_SPLICE)
 		conn->s[1] = tcp_conn_sock_ns(c, af);
 	else
@ -364,27 +359,18 @@ static int tcp_splice_connect(const struct ctx *c, struct tcp_splice_conn *conn)
 	if (conn->s[1] < 0)
 		return -1;

-	if (setsockopt(conn->s[1], SOL_TCP, TCP_QUICKACK, &one, sizeof(one))) {
+	if (setsockopt(conn->s[1], SOL_TCP, TCP_QUICKACK,
+		       &((int){ 1 }), sizeof(int))) {
 		flow_trace(conn, "failed to set TCP_QUICKACK on socket %i",
 			   conn->s[1]);
 	}

-	if (setsockopt(conn->s[0], SOL_TCP, TCP_NODELAY, &one, sizeof(one))) {
-		flow_trace(conn, "failed to set TCP_NODELAY on socket %i",
-			   conn->s[0]);
-	}
-
-	if (setsockopt(conn->s[1], SOL_TCP, TCP_NODELAY, &one, sizeof(one))) {
-		flow_trace(conn, "failed to set TCP_NODELAY on socket %i",
-			   conn->s[1]);
-	}
-
 	pif_sockaddr(c, &sa, &sl, tgtpif, &tgt->eaddr, tgt->eport);

 	if (connect(conn->s[1], &sa.sa, sl)) {
 		if (errno != EINPROGRESS) {
 			flow_trace(conn, "Couldn't connect socket for splice: %s",
-				   strerror_(errno));
+				   strerror(errno));
 			return -errno;
 		}

@ -482,10 +468,11 @@ void tcp_splice_sock_handler(struct ctx *c, union epoll_ref ref,

 		rc = getsockopt(ref.fd, SOL_SOCKET, SO_ERROR, &err, &sl);
 		if (rc)
-			flow_perror(conn, "Error retrieving SO_ERROR");
+			flow_err(conn, "Error retrieving SO_ERROR: %s",
+				 strerror(errno));
 		else
 			flow_trace(conn, "Error event on socket: %s",
-				   strerror_(err));
+				   strerror(err));

 		goto close;
 	}
@ -520,21 +507,20 @@ swap:
 		int more = 0;

 retry:
-		do
-			readlen = splice(conn->s[fromsidei], NULL,
-					 conn->pipe[fromsidei][1], NULL,
-					 c->tcp.pipe_size,
-					 SPLICE_F_MOVE | SPLICE_F_NONBLOCK);
-		while (readlen < 0 && errno == EINTR);
-
-		if (readlen < 0 && errno != EAGAIN)
-			goto close;
-
+		readlen = splice(conn->s[fromsidei], NULL,
+				 conn->pipe[fromsidei][1], NULL,
+				 c->tcp.pipe_size,
+				 SPLICE_F_MOVE | SPLICE_F_NONBLOCK);
 		flow_trace(conn, "%zi from read-side call", readlen);
+		if (readlen < 0) {
+			if (errno == EINTR)
+				goto retry;

-		if (!readlen) {
+			if (errno != EAGAIN)
+				goto close;
+		} else if (!readlen) {
 			eof = 1;
-		} else if (readlen > 0) {
+		} else {
 			never_read = 0;

 			if (readlen >= (long)c->tcp.pipe_size * 90 / 100)
@ -544,16 +530,10 @@ retry:
 				conn_flag(c, conn, lowat_act_flag);
 		}

-		do
-			written = splice(conn->pipe[fromsidei][0], NULL,
-					 conn->s[!fromsidei], NULL,
-					 c->tcp.pipe_size,
-					 SPLICE_F_MOVE | more | SPLICE_F_NONBLOCK);
-		while (written < 0 && errno == EINTR);
-
-		if (written < 0 && errno != EAGAIN)
-			goto close;
-
+eintr:
+		written = splice(conn->pipe[fromsidei][0], NULL,
+				 conn->s[!fromsidei], NULL, c->tcp.pipe_size,
+				 SPLICE_F_MOVE | more | SPLICE_F_NONBLOCK);
 		flow_trace(conn, "%zi from write-side call (passed %zi)",
 			   written, c->tcp.pipe_size);

@ -562,7 +542,7 @@ retry:
 			if (readlen >= (long)c->tcp.pipe_size * 10 / 100)
 				continue;

-			if (!(conn->flags & lowat_set_flag) &&
+			if (conn->flags & lowat_set_flag &&
 			    readlen > (long)c->tcp.pipe_size / 10) {
 				int lowat = c->tcp.pipe_size / 4;

@ -571,7 +551,7 @@ retry:
 					       &lowat, sizeof(lowat))) {
 					flow_trace(conn,
 						   "Setting SO_RCVLOWAT %i: %s",
-						   lowat, strerror_(errno));
+						   lowat, strerror(errno));
 				} else {
 					conn_flag(c, conn, lowat_set_flag);
 					conn_flag(c, conn, lowat_act_flag);
@ -585,6 +565,12 @@ retry:
 		conn->written[fromsidei] += written > 0 ? written : 0;

 		if (written < 0) {
+			if (errno == EINTR)
+				goto eintr;
+
+			if (errno != EAGAIN)
+				goto close;
+
 			if (conn->read[fromsidei] == conn->written[fromsidei])
 				break;

@ -707,16 +693,16 @@ static int tcp_sock_refill_ns(void *arg)
 	ns_enter(c);

 	if (c->ifi4) {
-		int rc = tcp_sock_refill_pool(ns_sock_pool4, AF_INET);
+		int rc = tcp_sock_refill_pool(c, ns_sock_pool4, AF_INET);
 		if (rc < 0)
 			warn("TCP: Error refilling IPv4 ns socket pool: %s",
-			     strerror_(-rc));
+			     strerror(-rc));
 	}
 	if (c->ifi6) {
-		int rc = tcp_sock_refill_pool(ns_sock_pool6, AF_INET6);
+		int rc = tcp_sock_refill_pool(c, ns_sock_pool6, AF_INET6);
 		if (rc < 0)
 			warn("TCP: Error refilling IPv6 ns socket pool: %s",
-			     strerror_(-rc));
+			     strerror(-rc));
 	}

 	return 0;
--- a/tcp_vu.c
+++ b/tcp_vu.c
@ -14,7 +14,6 @@

 #include <sys/socket.h>

-#include <netinet/if_ether.h>
 #include <linux/virtio_net.h>

 #include "util.h"
@ -37,7 +36,6 @@

 static struct iovec iov_vu[VIRTQUEUE_MAX_SIZE + 1];
 static struct vu_virtq_element elem[VIRTQUEUE_MAX_SIZE];
-static int head[VIRTQUEUE_MAX_SIZE + 1];

 /**
 * tcp_vu_hdrlen() - return the size of the header in level 2 frame (TCP)
@ -60,6 +58,30 @@ static size_t tcp_vu_hdrlen(bool v6)
 	return hdrlen;
 }

+/**
+ * tcp_vu_update_check() - Calculate TCP checksum
+ * @tapside:	Address information for one side of the flow
+ * @iov:	Pointer to the array of IO vectors
+ * @iov_used:	Length of the array
+ */
+static void tcp_vu_update_check(const struct flowside *tapside,
+			        struct iovec *iov, int iov_used)
+{
+	char *base = iov[0].iov_base;
+
+	if (inany_v4(&tapside->oaddr)) {
+		const struct iphdr *iph = vu_ip(base);
+
+		tcp_update_check_tcp4(iph, iov, iov_used,
+				      (char *)vu_payloadv4(base) - base);
+	} else {
+		const struct ipv6hdr *ip6h = vu_ip(base);
+
+		tcp_update_check_tcp6(ip6h, iov, iov_used,
+				      (char *)vu_payloadv6(base) - base);
+	}
+}
+
 /**
 * tcp_vu_send_flag() - Send segment with flags to vhost-user (no payload)
 * @c:		Execution context
@ -72,14 +94,13 @@ int tcp_vu_send_flag(const struct ctx *c, struct tcp_tap_conn *conn, int flags)
 {
 	struct vu_dev *vdev = c->vdev;
 	struct vu_virtq *vq = &vdev->vq[VHOST_USER_RX_QUEUE];
+	const struct flowside *tapside = TAPFLOW(conn);
 	size_t optlen, hdrlen;
 	struct vu_virtq_element flags_elem[2];
+	struct tcp_payload_t *payload;
 	struct ipv6hdr *ip6h = NULL;
-	struct iphdr *ip4h = NULL;
 	struct iovec flags_iov[2];
-	struct tcp_syn_opts *opts;
-	struct iov_tail payload;
-	struct tcphdr *th;
+	struct iphdr *iph = NULL;
 	struct ethhdr *eh;
 	uint32_t seq;
 	int elem_cnt;
@ -95,9 +116,6 @@ int tcp_vu_send_flag(const struct ctx *c, struct tcp_tap_conn *conn, int flags)
 	if (elem_cnt != 1)
 		return -1;

-	ASSERT(flags_elem[0].in_sg[0].iov_len >=
-	       hdrlen + sizeof(struct tcp_syn_opts));
-
 	vu_set_vnethdr(vdev, flags_elem[0].in_sg[0].iov_base, 1);

 	eh = vu_eth(flags_elem[0].in_sg[0].iov_base);
@ -108,37 +126,42 @@ int tcp_vu_send_flag(const struct ctx *c, struct tcp_tap_conn *conn, int flags)
 	if (CONN_V4(conn)) {
 		eh->h_proto = htons(ETH_P_IP);

-		ip4h = vu_ip(flags_elem[0].in_sg[0].iov_base);
-		*ip4h = (struct iphdr)L2_BUF_IP4_INIT(IPPROTO_TCP);
+		iph = vu_ip(flags_elem[0].in_sg[0].iov_base);
+		*iph = (struct iphdr)L2_BUF_IP4_INIT(IPPROTO_TCP);

-		th = vu_payloadv4(flags_elem[0].in_sg[0].iov_base);
+		payload = vu_payloadv4(flags_elem[0].in_sg[0].iov_base);
 	} else {
 		eh->h_proto = htons(ETH_P_IPV6);

 		ip6h = vu_ip(flags_elem[0].in_sg[0].iov_base);
 		*ip6h = (struct ipv6hdr)L2_BUF_IP6_INIT(IPPROTO_TCP);
-		th = vu_payloadv6(flags_elem[0].in_sg[0].iov_base);
+		payload = vu_payloadv6(flags_elem[0].in_sg[0].iov_base);
 	}

-	memset(th, 0, sizeof(*th));
-	th->doff = sizeof(*th) / 4;
-	th->ack = 1;
+	memset(&payload->th, 0, sizeof(payload->th));
+	payload->th.doff = offsetof(struct tcp_payload_t, data) / 4;
+	payload->th.ack = 1;

 	seq = conn->seq_to_tap;
-	opts = (struct tcp_syn_opts *)(th + 1);
-	ret = tcp_prepare_flags(c, conn, flags, th, opts, &optlen);
+	ret = tcp_prepare_flags(c, conn, flags, &payload->th,
+				(struct tcp_syn_opts *)payload->data,
+				&optlen);
 	if (ret <= 0) {
 		vu_queue_rewind(vq, 1);
 		return ret;
 	}

 	flags_elem[0].in_sg[0].iov_len = hdrlen + optlen;
-	payload = IOV_TAIL(flags_elem[0].in_sg, 1, hdrlen);

-	tcp_fill_headers(conn, NULL, ip4h, ip6h, th, &payload,
-			 NULL, seq, !*c->pcap);
+	if (CONN_V4(conn)) {
+		tcp_fill_headers4(conn, NULL, iph, payload, optlen, NULL, seq,
+				  true);
+	} else {
+		tcp_fill_headers6(conn, NULL, ip6h, payload, optlen, seq, true);
+	}

 	if (*c->pcap) {
+		tcp_vu_update_check(tapside, &flags_elem[0].in_sg[0], 1);
 		pcap_iov(&flags_elem[0].in_sg[0], 1,
 			 sizeof(struct virtio_net_hdr_mrg_rxbuf));
 	}
@ -149,18 +172,14 @@ int tcp_vu_send_flag(const struct ctx *c, struct tcp_tap_conn *conn, int flags)

 		elem_cnt = vu_collect(vdev, vq, &flags_elem[1], 1,
 				      flags_elem[0].in_sg[0].iov_len, NULL);
-		if (elem_cnt == 1 &&
-		    flags_elem[1].in_sg[0].iov_len >=
-		    flags_elem[0].in_sg[0].iov_len) {
+		if (elem_cnt == 1) {
 			memcpy(flags_elem[1].in_sg[0].iov_base,
 			       flags_elem[0].in_sg[0].iov_base,
 			       flags_elem[0].in_sg[0].iov_len);
 			nb_ack++;

-			if (*c->pcap) {
-				pcap_iov(&flags_elem[1].in_sg[0], 1,
-					 sizeof(struct virtio_net_hdr_mrg_rxbuf));
-			}
+			if (*c->pcap)
+				pcap_iov(&flags_elem[1].in_sg[0], 1, 0);
 		}
 	}

@ -174,25 +193,24 @@ int tcp_vu_send_flag(const struct ctx *c, struct tcp_tap_conn *conn, int flags)
 * @conn:		Connection pointer
 * @v6:			Set for IPv6 connections
 * @already_sent:	Number of bytes already sent
- * @fillsize:		Maximum bytes to fill in guest-side receiving window
+ * @fillsize:		Number of bytes we can receive
 * @iov_cnt:		number of iov (output)
 *
- * Return: Number of iov entries used to store the data or negative error code
+ * Return: Number of iov entries used to store the data
 */
 static ssize_t tcp_vu_sock_recv(const struct ctx *c,
 				const struct tcp_tap_conn *conn, bool v6,
 				uint32_t already_sent, size_t fillsize,
-				int *iov_cnt, int *head_cnt)
+				int *iov_cnt)
 {
 	struct vu_dev *vdev = c->vdev;
 	struct vu_virtq *vq = &vdev->vq[VHOST_USER_RX_QUEUE];
 	struct msghdr mh_sock = { 0 };
 	uint16_t mss = MSS_GET(conn);
 	int s = conn->sock;
-	ssize_t ret, len;
 	size_t hdrlen;
 	int elem_cnt;
-	int i;
+	ssize_t ret;

 	*iov_cnt = 0;

@ -201,29 +219,36 @@ static ssize_t tcp_vu_sock_recv(const struct ctx *c,
 	vu_init_elem(elem, &iov_vu[1], VIRTQUEUE_MAX_SIZE);

 	elem_cnt = 0;
-	*head_cnt = 0;
+
 	while (fillsize > 0 && elem_cnt < VIRTQUEUE_MAX_SIZE) {
 		struct iovec *iov;
-		size_t frame_size, dlen;
+		size_t frame_size;
 		int cnt;

+		if (mss > fillsize)
+			mss = fillsize;
+
 		cnt = vu_collect(vdev, vq, &elem[elem_cnt],
 				 VIRTQUEUE_MAX_SIZE - elem_cnt,
-				 MIN(mss, fillsize) + hdrlen, &frame_size);
+				 mss + hdrlen, &frame_size);
 		if (cnt == 0)
 			break;

-		dlen = frame_size - hdrlen;
-
-		/* reserve space for headers in iov */
+		frame_size -= hdrlen;
 		iov = &elem[elem_cnt].in_sg[0];
-		ASSERT(iov->iov_len >= hdrlen);
 		iov->iov_base = (char *)iov->iov_base + hdrlen;
 		iov->iov_len -= hdrlen;
-		head[(*head_cnt)++] = elem_cnt;

-		fillsize -= dlen;
+		fillsize -= frame_size;
 		elem_cnt += cnt;
+
+		/* All the frames must have the same size (except the last one),
+		 * otherwise we will no able to scan the iov array
+		 * to find iov entries with headers
+		 * (headers are spread every frame_size in the the array
+		 */
+		if (frame_size < mss)
+			break;
 	}

 	if (peek_offset_cap) {
@ -241,74 +266,33 @@ static ssize_t tcp_vu_sock_recv(const struct ctx *c,
 		ret = recvmsg(s, &mh_sock, MSG_PEEK);
 	while (ret < 0 && errno == EINTR);

-	if (ret < 0) {
-		vu_queue_rewind(vq, elem_cnt);
-		return -errno;
-	}
-
-	if (!peek_offset_cap)
-		ret -= already_sent;
-
-	/* adjust iov number and length of the last iov */
-	len = ret;
-	for (i = 0; len && i < elem_cnt; i++) {
-		struct iovec *iov = &elem[i].in_sg[0];
-
-		if (iov->iov_len > (size_t)len)
-			iov->iov_len = len;
-
-		len -= iov->iov_len;
-	}
-	/* adjust head count */
-	while (*head_cnt > 0 && head[*head_cnt - 1] >= i)
-		(*head_cnt)--;
-
-	/* mark end of array */
-	head[*head_cnt] = i;
-	*iov_cnt = i;
-
-	/* release unused buffers */
-	vu_queue_rewind(vq, elem_cnt - i);
-
-	/* restore space for headers in iov */
-	for (i = 0; i < *head_cnt; i++) {
-		struct iovec *iov = &elem[head[i]].in_sg[0];
-
-		iov->iov_base = (char *)iov->iov_base - hdrlen;
-		iov->iov_len += hdrlen;
-	}
+	*iov_cnt = elem_cnt;

 	return ret;
 }

 /**
 * tcp_vu_prepare() - Prepare the frame header
- * @c:			Execution context
- * @conn:		Connection pointer
- * @iov:		Pointer to the array of IO vectors
- * @iov_cnt:		Number of entries in @iov
- * @check:		Checksum, if already known
- * @no_tcp_csum:	Do not set TCP checksum
- * @push:		Set PSH flag, last segment in a batch
+ * @c:		Execution context
+ * @conn:	Connection pointer
+ * @first:	Pointer to the array of IO vectors
+ * @dlen:	Packet data length
+ * @check:	Checksum, if already known
 */
-static void tcp_vu_prepare(const struct ctx *c, struct tcp_tap_conn *conn,
-			   struct iovec *iov, size_t iov_cnt,
-			   const uint16_t **check, bool no_tcp_csum, bool push)
+static void tcp_vu_prepare(const struct ctx *c,
+			   struct tcp_tap_conn *conn, struct iovec *first,
+			   size_t dlen, const uint16_t **check)
 {
 	const struct flowside *toside = TAPFLOW(conn);
-	bool v6 = !(inany_v4(&toside->eaddr) && inany_v4(&toside->oaddr));
-	size_t hdrlen = tcp_vu_hdrlen(v6);
-	struct iov_tail payload = IOV_TAIL(iov, iov_cnt, hdrlen);
-	char *base = iov[0].iov_base;
+	struct tcp_payload_t *payload;
+	char *base = first->iov_base;
 	struct ipv6hdr *ip6h = NULL;
-	struct iphdr *ip4h = NULL;
-	struct tcphdr *th;
+	struct iphdr *iph = NULL;
 	struct ethhdr *eh;

 	/* we guess the first iovec provided by the guest can embed
 	 * all the headers needed by L2 frame
 	 */
-	ASSERT(iov[0].iov_len >= hdrlen);

 	eh = vu_eth(base);

@ -317,30 +301,37 @@ static void tcp_vu_prepare(const struct ctx *c, struct tcp_tap_conn *conn,

 	/* initialize header */

-	if (!v6) {
+	if (inany_v4(&toside->eaddr) && inany_v4(&toside->oaddr)) {
+		ASSERT(first[0].iov_len >= tcp_vu_hdrlen(false));
+
 		eh->h_proto = htons(ETH_P_IP);

-		ip4h = vu_ip(base);
-		*ip4h = (struct iphdr)L2_BUF_IP4_INIT(IPPROTO_TCP);
-		th = vu_payloadv4(base);
+		iph = vu_ip(base);
+		*iph = (struct iphdr)L2_BUF_IP4_INIT(IPPROTO_TCP);
+		payload = vu_payloadv4(base);
 	} else {
+		ASSERT(first[0].iov_len >= tcp_vu_hdrlen(true));
+
 		eh->h_proto = htons(ETH_P_IPV6);

 		ip6h = vu_ip(base);
 		*ip6h = (struct ipv6hdr)L2_BUF_IP6_INIT(IPPROTO_TCP);

-		th = vu_payloadv6(base);
+		payload = vu_payloadv6(base);
 	}

-	memset(th, 0, sizeof(*th));
-	th->doff = sizeof(*th) / 4;
-	th->ack = 1;
-	th->psh = push;
+	memset(&payload->th, 0, sizeof(payload->th));
+	payload->th.doff = offsetof(struct tcp_payload_t, data) / 4;
+	payload->th.ack = 1;

-	tcp_fill_headers(conn, NULL, ip4h, ip6h, th, &payload,
-			 *check, conn->seq_to_tap, no_tcp_csum);
-	if (ip4h)
-		*check = &ip4h->check;
+	if (inany_v4(&toside->eaddr) && inany_v4(&toside->oaddr)) {
+		tcp_fill_headers4(conn, NULL, iph, payload, dlen,
+				  *check, conn->seq_to_tap, true);
+		*check = &iph->check;
+	} else {
+		tcp_fill_headers6(conn, NULL, ip6h, payload, dlen,
+				  conn->seq_to_tap, true);
+	}
 }

 /**
@ -356,15 +347,21 @@ int tcp_vu_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn)
 	uint32_t wnd_scaled = conn->wnd_from_tap << conn->ws_from_tap;
 	struct vu_dev *vdev = c->vdev;
 	struct vu_virtq *vq = &vdev->vq[VHOST_USER_RX_QUEUE];
-	ssize_t len, previous_dlen;
-	int i, iov_cnt, head_cnt;
+	const struct flowside *tapside = TAPFLOW(conn);
+	uint16_t mss = MSS_GET(conn);
 	size_t hdrlen, fillsize;
+	int i, iov_cnt, iov_used;
 	int v6 = CONN_V6(conn);
-	uint32_t already_sent;
+	uint32_t already_sent = 0;
 	const uint16_t *check;
+	struct iovec *first;
+	int frame_size;
+	int num_buffers;
+	ssize_t len;

 	if (!vu_queue_enabled(vq) || !vu_queue_started(vq)) {
-		debug("Got packet, but RX virtqueue not usable yet");
+		flow_err(conn,
+			 "Got packet, but RX virtqueue not usable yet");
 		return 0;
 	}

@ -376,14 +373,13 @@ int tcp_vu_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn)
 			   conn->seq_ack_from_tap, conn->seq_to_tap);
 		conn->seq_to_tap = conn->seq_ack_from_tap;
 		already_sent = 0;
-		if (tcp_set_peek_offset(conn, 0)) {
+		if (tcp_set_peek_offset(conn->sock, 0)) {
 			tcp_rst(c, conn);
 			return -1;
 		}
 	}

 	if (!wnd_scaled || already_sent >= wnd_scaled) {
-		conn_flag(c, conn, ACK_FROM_TAP_BLOCKS);
 		conn_flag(c, conn, STALLED);
 		conn_flag(c, conn, ACK_FROM_TAP_DUE);
 		return 0;
@ -396,25 +392,19 @@ int tcp_vu_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn)
 	/* collect the buffers from vhost-user and fill them with the
 	 * data from the socket
 	 */
-	len = tcp_vu_sock_recv(c, conn, v6, already_sent, fillsize,
-			       &iov_cnt, &head_cnt);
+	len = tcp_vu_sock_recv(c, conn, v6, already_sent, fillsize, &iov_cnt);
 	if (len < 0) {
-		if (len != -EAGAIN && len != -EWOULDBLOCK) {
+		vu_queue_rewind(vq, iov_cnt);
+		if (errno != EAGAIN && errno != EWOULDBLOCK) {
 			tcp_rst(c, conn);
-			return len;
+			return -errno;
 		}
-
-		if (already_sent) /* No new data and EAGAIN: set EPOLLET */
-			conn_flag(c, conn, STALLED);
-
 		return 0;
 	}

 	if (!len) {
-		if (already_sent) {
-			conn_flag(c, conn, STALLED);
-		} else if ((conn->events & (SOCK_FIN_RCVD | TAP_FIN_SENT)) ==
-			   SOCK_FIN_RCVD) {
+		vu_queue_rewind(vq, iov_cnt);
+		if ((conn->events & (SOCK_FIN_RCVD | TAP_FIN_SENT)) == SOCK_FIN_RCVD) {
 			int ret = tcp_vu_send_flag(c, conn, FIN | ACK);
 			if (ret) {
 				tcp_rst(c, conn);
@ -427,46 +417,76 @@ int tcp_vu_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn)
 		return 0;
 	}

-	conn_flag(c, conn, ~ACK_FROM_TAP_BLOCKS);
+	if (!peek_offset_cap)
+		len -= already_sent;
+
+	if (len <= 0) {
+		vu_queue_rewind(vq, iov_cnt);
+		conn_flag(c, conn, STALLED);
+		return 0;
+	}
+
 	conn_flag(c, conn, ~STALLED);

 	/* Likely, some new data was acked too. */
 	tcp_update_seqack_wnd(c, conn, false, NULL);

 	/* initialize headers */
+	hdrlen = tcp_vu_hdrlen(v6);
+	iov_used = 0;
+	num_buffers = 0;
+	check = NULL;
+	frame_size = 0;
+
 	/* iov_vu is an array of buffers and the buffer size can be
 	 * smaller than the frame size we want to use but with
 	 * num_buffer we can merge several virtio iov buffers in one packet
 	 * we need only to set the packet headers in the first iov and
 	 * num_buffer to the number of iov entries
 	 */
+	for (i = 0; i < iov_cnt && len; i++) {

-	hdrlen = tcp_vu_hdrlen(v6);
-	for (i = 0, previous_dlen = -1, check = NULL; i < head_cnt; i++) {
-		struct iovec *iov = &elem[head[i]].in_sg[0];
-		int buf_cnt = head[i + 1] - head[i];
-		ssize_t dlen = iov_size(iov, buf_cnt) - hdrlen;
-		bool push = i == head_cnt - 1;
+		if (frame_size == 0)
+			first = &iov_vu[i + 1];

-		vu_set_vnethdr(vdev, iov->iov_base, buf_cnt);
+		if (iov_vu[i + 1].iov_len > (size_t)len)
+			iov_vu[i + 1].iov_len = len;

-		/* The IPv4 header checksum varies only with dlen */
-		if (previous_dlen != dlen)
-			check = NULL;
-		previous_dlen = dlen;
+		len -= iov_vu[i + 1].iov_len;
+		iov_used++;

-		tcp_vu_prepare(c, conn, iov, buf_cnt, &check, !*c->pcap, push);
+		frame_size += iov_vu[i + 1].iov_len;
+		num_buffers++;

-		if (*c->pcap) {
-			pcap_iov(iov, buf_cnt,
-				 sizeof(struct virtio_net_hdr_mrg_rxbuf));
+		if (frame_size >= mss || len == 0 ||
+		    i + 1 == iov_cnt || !vu_has_feature(vdev, VIRTIO_NET_F_MRG_RXBUF)) {
+			if (i + 1 == iov_cnt)
+				check = NULL;
+
+			/* restore first iovec base: point to vnet header */
+			first->iov_base = (char *)first->iov_base - hdrlen;
+			first->iov_len += hdrlen;
+			vu_set_vnethdr(vdev, first->iov_base, num_buffers);
+
+			tcp_vu_prepare(c, conn, first, frame_size, &check);
+			if (*c->pcap)  {
+				tcp_vu_update_check(tapside, first, num_buffers);
+				pcap_iov(first, num_buffers,
+					 sizeof(struct virtio_net_hdr_mrg_rxbuf));
+			}
+
+			conn->seq_to_tap += frame_size;
+
+			frame_size = 0;
+			num_buffers = 0;
 		}
-
-		conn->seq_to_tap += dlen;
 	}

+	/* release unused buffers */
+	vu_queue_rewind(vq, iov_cnt - iov_used);
+
 	/* send packets */
-	vu_flush(vdev, vq, elem, iov_cnt);
+	vu_flush(vdev, vq, elem, iov_used);

 	conn_flag(c, conn, ACK_FROM_TAP_DUE);

--- a/test/.gitignore
+++ b/test/.gitignore
@ -8,6 +8,5 @@ QEMU_EFI.fd
 *.raw.xz
 *.bin
 nstool
-rampstream
 guest-key
 guest-key.pub
--- a/test/Makefile
+++ b/test/Makefile
@ -52,8 +52,7 @@ UBUNTU_IMGS = $(UBUNTU_OLD_IMGS) $(UBUNTU_NEW_IMGS)

 DOWNLOAD_ASSETS = mbuto podman \
 	$(DEBIAN_IMGS) $(FEDORA_IMGS) $(OPENSUSE_IMGS) $(UBUNTU_IMGS)
-TESTDATA_ASSETS = small.bin big.bin medium.bin \
-	rampstream
+TESTDATA_ASSETS = small.bin big.bin medium.bin
 LOCAL_ASSETS = mbuto.img mbuto.mem.img podman/bin/podman QEMU_EFI.fd \
 	$(DEBIAN_IMGS:%=prepared-%) $(FEDORA_IMGS:%=prepared-%) \
 	$(UBUNTU_NEW_IMGS:%=prepared-%) \
@ -86,7 +85,7 @@ podman/bin/podman: pull-podman
 guest-key guest-key.pub:
 	ssh-keygen -f guest-key -N ''

-mbuto.img: passt.mbuto mbuto/mbuto guest-key.pub rampstream-check.sh $(TESTDATA_ASSETS)
+mbuto.img: passt.mbuto mbuto/mbuto guest-key.pub $(TESTDATA_ASSETS)
 	./mbuto/mbuto -p ./$< -c lz4 -f $@

 mbuto.mem.img: passt.mem.mbuto mbuto ../passt.avx2
--- a/test/lib/layout
+++ b/test/lib/layout
@ -134,54 +134,6 @@ layout_two_guests() {

 	get_info_cols

-	pane_watch_contexts ${PANE_GUEST_1} "guest #1 in namespace #1" qemu_1 guest_1
-	pane_watch_contexts ${PANE_GUEST_2} "guest #2 in namespace #1" qemu_2 guest_2
-
-	tmux send-keys -l -t ${PANE_INFO} 'while cat '"$STATEBASE/log_pipe"'; do :; done'
-	tmux send-keys -t ${PANE_INFO} -N 100 C-m
-	tmux select-pane -t ${PANE_INFO} -T "test log"
-
-	pane_watch_contexts ${PANE_HOST} host host
-	pane_watch_contexts ${PANE_PASST_1} "passt #1 in namespace #1" pasta_1 passt_1
-	pane_watch_contexts ${PANE_PASST_2} "passt #2 in namespace #1" pasta_1 passt_2
-
-	info_layout "two guests, two passt instances, in namespaces"
-
-	sleep 1
-}
-
-# layout_migrate() - Two guest panes, two passt panes, two passt-repair panes,
-#		     plus host and log
-layout_migrate() {
-	sleep 1
-
-	tmux kill-pane -a -t 0
-	cmd_write 0 clear
-
-	tmux split-window -v -t passt_test
-	tmux split-window -h -l '33%'
-	tmux split-window -h -t passt_test:1.1
-
-	tmux split-window -h -l '35%' -t passt_test:1.0
-	tmux split-window -v -t passt_test:1.0
-
-	tmux split-window -v -t passt_test:1.4
-	tmux split-window -v -t passt_test:1.6
-
-	tmux split-window -v -t passt_test:1.3
-
-	PANE_GUEST_1=0
-	PANE_GUEST_2=1
-	PANE_INFO=2
-	PANE_MON=3
-	PANE_HOST=4
-	PANE_PASST_REPAIR_1=5
-	PANE_PASST_1=6
-	PANE_PASST_REPAIR_2=7
-	PANE_PASST_2=8
-
-	get_info_cols
-
 	pane_watch_contexts ${PANE_GUEST_1} "guest #1 in namespace #1" qemu_1 guest_1
 	pane_watch_contexts ${PANE_GUEST_2} "guest #2 in namespace #2" qemu_2 guest_2

@ -189,16 +141,11 @@ layout_migrate() {
 	tmux send-keys -t ${PANE_INFO} -N 100 C-m
 	tmux select-pane -t ${PANE_INFO} -T "test log"

-	pane_watch_contexts ${PANE_MON} "QEMU monitor" mon mon
-
 	pane_watch_contexts ${PANE_HOST} host host
-	pane_watch_contexts ${PANE_PASST_REPAIR_1} "passt-repair #1 in namespace #1" repair_1 passt_repair_1
 	pane_watch_contexts ${PANE_PASST_1} "passt #1 in namespace #1" pasta_1 passt_1
-
-	pane_watch_contexts ${PANE_PASST_REPAIR_2} "passt-repair #2 in namespace #2" repair_2 passt_repair_2
 	pane_watch_contexts ${PANE_PASST_2} "passt #2 in namespace #2" pasta_2 passt_2

-	info_layout "two guests, two passt + passt-repair instances, in namespaces"
+	info_layout "two guests, two passt instances, in namespaces"

 	sleep 1
 }
--- a/test/lib/setup
+++ b/test/lib/setup
@ -49,7 +49,7 @@ setup_passt() {

 	context_run passt "make clean"
 	context_run passt "make valgrind"
-	context_run_bg passt "valgrind --max-stackframe=$((4 * 1024 * 1024)) --trace-children=yes --vgdb=no --error-exitcode=1 --suppressions=test/valgrind.supp ./passt ${__opts} -s ${STATESETUP}/passt.socket -f -t 10001 -u 10001 -H hostname1 --fqdn fqdn1.passt.test -P ${STATESETUP}/passt.pid"
+	context_run_bg passt "valgrind --max-stackframe=$((4 * 1024 * 1024)) --trace-children=yes --vgdb=no --error-exitcode=1 --suppressions=test/valgrind.supp ./passt ${__opts} -s ${STATESETUP}/passt.socket -f -t 10001 -u 10001 -P ${STATESETUP}/passt.pid"

 	# pidfile isn't created until passt is listening
 	wait_for [ -f "${STATESETUP}/passt.pid" ]
@ -160,11 +160,11 @@ setup_passt_in_ns() {
 	if [ ${VALGRIND} -eq 1 ]; then
 		context_run passt "make clean"
 		context_run passt "make valgrind"
-		context_run_bg passt "valgrind --max-stackframe=$((4 * 1024 * 1024)) --trace-children=yes --vgdb=no --error-exitcode=1 --suppressions=test/valgrind.supp ./passt -f ${__opts} -s ${STATESETUP}/passt.socket -H hostname1 --fqdn fqdn1.passt.test -t 10001,10011,10021,10031 -u 10001,10011,10021,10031 -P ${STATESETUP}/passt.pid --map-host-loopback ${__map_ns4} --map-host-loopback ${__map_ns6}"
+		context_run_bg passt "valgrind --max-stackframe=$((4 * 1024 * 1024)) --trace-children=yes --vgdb=no --error-exitcode=1 --suppressions=test/valgrind.supp ./passt -f ${__opts} -s ${STATESETUP}/passt.socket -t 10001,10011,10021,10031 -u 10001,10011,10021,10031 -P ${STATESETUP}/passt.pid --map-host-loopback ${__map_ns4} --map-host-loopback ${__map_ns6}"
 	else
 		context_run passt "make clean"
 		context_run passt "make"
-		context_run_bg passt "./passt -f ${__opts} -s ${STATESETUP}/passt.socket -H hostname1 --fqdn fqdn1.passt.test -t 10001,10011,10021,10031 -u 10001,10011,10021,10031 -P ${STATESETUP}/passt.pid --map-host-loopback ${__map_ns4} --map-host-loopback ${__map_ns6}"
+		context_run_bg passt "./passt -f ${__opts} -s ${STATESETUP}/passt.socket -t 10001,10011,10021,10031 -u 10001,10011,10021,10031 -P ${STATESETUP}/passt.pid --map-host-loopback ${__map_ns4} --map-host-loopback ${__map_ns6}"
 	fi
 	wait_for [ -f "${STATESETUP}/passt.pid" ]

@ -243,7 +243,7 @@ setup_two_guests() {
 	[ ${TRACE} -eq 1 ] && __opts="${__opts} --trace"
 	[ ${VHOST_USER} -eq 1 ] && __opts="${__opts} --vhost-user"

-	context_run_bg passt_1 "./passt -s ${STATESETUP}/passt_1.socket -P ${STATESETUP}/passt_1.pid -f ${__opts} --fqdn fqdn1.passt.test -H hostname1 -t 10001 -u 10001"
+	context_run_bg passt_1 "./passt -s ${STATESETUP}/passt_1.socket -P ${STATESETUP}/passt_1.pid -f ${__opts} -t 10001 -u 10001"
 	wait_for [ -f "${STATESETUP}/passt_1.pid" ]

 	__opts=
@ -252,7 +252,7 @@ setup_two_guests() {
 	[ ${TRACE} -eq 1 ] && __opts="${__opts} --trace"
 	[ ${VHOST_USER} -eq 1 ] && __opts="${__opts} --vhost-user"

-	context_run_bg passt_2 "./passt -s ${STATESETUP}/passt_2.socket -P ${STATESETUP}/passt_2.pid -f ${__opts} --hostname hostname2 --fqdn fqdn2 -t 10004 -u 10004"
+	context_run_bg passt_2 "./passt -s ${STATESETUP}/passt_2.socket -P ${STATESETUP}/passt_2.pid -f ${__opts} -t 10004 -u 10004"
 	wait_for [ -f "${STATESETUP}/passt_2.pid" ]

 	__vmem="$((${MEM_KIB} / 1024 / 4))"
@ -305,117 +305,6 @@ setup_two_guests() {
 	context_setup_guest guest_2 ${GUEST_2_CID}
 }

-# setup_migrate() - Set up two namespace, run qemu, passt/passt-repair in both
-setup_migrate() {
-	context_setup_host host
-	context_setup_host mon
-	context_setup_host pasta_1
-	context_setup_host pasta_2
-
-	layout_migrate
-
-	# Ports:
-	#
-	#         guest #1  |  guest #2 |   ns #1   |    host
-	#         --------- |-----------|-----------|------------
-	#  10001  as server |           | to guest  |  to ns #1
-	#  10002            |           | as server |  to ns #1
-	#  10003            |           |  to init  |  as server
-	#  10004            | as server | to guest  |  to ns #1
-
-	__opts=
-	[ ${PCAP} -eq 1 ] && __opts="${__opts} -p ${LOGDIR}/pasta_1.pcap"
-	[ ${DEBUG} -eq 1 ] && __opts="${__opts} -d"
-	[ ${TRACE} -eq 1 ] && __opts="${__opts} --trace"
-
-	__map_host4=192.0.2.1
-	__map_host6=2001:db8:9a55::1
-	__map_ns4=192.0.2.2
-	__map_ns6=2001:db8:9a55::2
-
-	# Option 1: send stuff via spliced path in pasta
-	# context_run_bg pasta_1 "./pasta ${__opts} -P ${STATESETUP}/pasta_1.pid -t 10001,10002 -T 10003 -u 10001,10002 -U 10003 --config-net ${NSTOOL} hold ${STATESETUP}/ns1.hold"
-	# Option 2: send stuff via tap (--map-guest-addr) instead (useful to see capture of full migration)
-	context_run_bg pasta_1 "./pasta ${__opts} -P ${STATESETUP}/pasta_1.pid -t 10001,10002,10004 -T 10003 -u 10001,10002,10004 -U 10003 --map-guest-addr ${__map_host4} --map-guest-addr ${__map_host6} --config-net ${NSTOOL} hold ${STATESETUP}/ns1.hold"
-	context_setup_nstool passt_1 ${STATESETUP}/ns1.hold
-	context_setup_nstool passt_repair_1 ${STATESETUP}/ns1.hold
-
-	context_setup_nstool passt_2 ${STATESETUP}/ns1.hold
-	context_setup_nstool passt_repair_2 ${STATESETUP}/ns1.hold
-
-	context_setup_nstool qemu_1 ${STATESETUP}/ns1.hold
-	context_setup_nstool qemu_2 ${STATESETUP}/ns1.hold
-
-	__ifname="$(context_run qemu_1 "ip -j link show | jq -rM '.[] | select(.link_type == \"ether\").ifname'")"
-
-	sleep 1
-
-	__opts="--vhost-user"
-	[ ${PCAP} -eq 1 ] && __opts="${__opts} -p ${LOGDIR}/passt_1.pcap"
-	[ ${DEBUG} -eq 1 ] && __opts="${__opts} -d"
-	[ ${TRACE} -eq 1 ] && __opts="${__opts} --trace"
-
-	context_run_bg passt_1 "./passt -s ${STATESETUP}/passt_1.socket -P ${STATESETUP}/passt_1.pid -f ${__opts} -t 10001 -u 10001"
-	wait_for [ -f "${STATESETUP}/passt_1.pid" ]
-
-	context_run_bg passt_repair_1 "./passt-repair ${STATESETUP}/passt_1.socket.repair"
-
-	__opts="--vhost-user"
-	[ ${PCAP} -eq 1 ] && __opts="${__opts} -p ${LOGDIR}/passt_2.pcap"
-	[ ${DEBUG} -eq 1 ] && __opts="${__opts} -d"
-	[ ${TRACE} -eq 1 ] && __opts="${__opts} --trace"
-
-	context_run_bg passt_2 "./passt -s ${STATESETUP}/passt_2.socket -P ${STATESETUP}/passt_2.pid -f ${__opts} -t 10004 -u 10004"
-	wait_for [ -f "${STATESETUP}/passt_2.pid" ]
-
-	context_run_bg passt_repair_2 "./passt-repair ${STATESETUP}/passt_2.socket.repair"
-
-	__vmem="512M"	# Keep migration fast
-	__qemu_netdev1="					       \
-		-chardev socket,id=c,path=${STATESETUP}/passt_1.socket \
-		-netdev vhost-user,id=v,chardev=c		       \
-		-device virtio-net,netdev=v			       \
-		-object memory-backend-memfd,id=m,share=on,size=${__vmem} \
-		-numa node,memdev=m"
-	__qemu_netdev2="					       \
-		-chardev socket,id=c,path=${STATESETUP}/passt_2.socket \
-		-netdev vhost-user,id=v,chardev=c		       \
-		-device virtio-net,netdev=v			       \
-		-object memory-backend-memfd,id=m,share=on,size=${__vmem} \
-		-numa node,memdev=m"
-
-	GUEST_1_CID=94557
-	context_run_bg qemu_1 'qemu-system-'"${QEMU_ARCH}"		     \
-		' -M accel=kvm:tcg'                                          \
-		' -m '${__vmem}' -cpu host -smp '${VCPUS}		     \
-		' -kernel '"${KERNEL}"					     \
-		' -initrd '${INITRAMFS}' -nographic -serial stdio'	     \
-		' -nodefaults'						     \
-		' -append "console=ttyS0 mitigations=off apparmor=0" '	     \
-		" ${__qemu_netdev1}"					     \
-		" -pidfile ${STATESETUP}/qemu_1.pid"			     \
-		" -device vhost-vsock-pci,guest-cid=$GUEST_1_CID"	     \
-		" -monitor unix:${STATESETUP}/qemu_1_mon.sock,server,nowait"
-
-	GUEST_2_CID=94558
-	context_run_bg qemu_2 'qemu-system-'"${QEMU_ARCH}"		     \
-		' -M accel=kvm:tcg'                                          \
-		' -m '${__vmem}' -cpu host -smp '${VCPUS}		     \
-		' -kernel '"${KERNEL}"					     \
-		' -initrd '${INITRAMFS}' -nographic -serial stdio'	     \
-		' -nodefaults'						     \
-		' -append "console=ttyS0 mitigations=off apparmor=0" '	     \
-		" ${__qemu_netdev2}"					     \
-		" -pidfile ${STATESETUP}/qemu_2.pid"			     \
-		" -device vhost-vsock-pci,guest-cid=$GUEST_2_CID"	     \
-		" -monitor unix:${STATESETUP}/qemu_2_mon.sock,server,nowait" \
-		" -incoming tcp:0:20005"
-
-	context_setup_guest guest_1 ${GUEST_1_CID}
-	# Only available after migration:
-	( context_setup_guest guest_2 ${GUEST_2_CID} & )
-}
-
 # teardown_context_watch() - Remove contexts and stop panes watching them
 # $1:	Pane number watching
 # $@:	Context names
@ -486,8 +375,7 @@ teardown_two_guests() {
 	context_wait pasta_1
 	context_wait pasta_2

-	rm "${STATESETUP}/passt_1.pid" "${STATESETUP}/passt_2.pid"
-	rm "${STATESETUP}/pasta_1.pid" "${STATESETUP}/pasta_2.pid"
+	rm -f "${STATESETUP}/passt__[12].pid" "${STATESETUP}/pasta_[12].pid"

 	teardown_context_watch ${PANE_HOST} host
 	teardown_context_watch ${PANE_GUEST_1} qemu_1 guest_1
@ -496,30 +384,6 @@ teardown_two_guests() {
 	teardown_context_watch ${PANE_PASST_2} pasta_2 passt_2
 }

-# teardown_migrate() - Exit namespaces, kill qemu processes, passt and pasta
-teardown_migrate() {
-	${NSTOOL} exec ${STATESETUP}/ns1.hold -- kill $(cat "${STATESETUP}/qemu_1.pid")
-	${NSTOOL} exec ${STATESETUP}/ns1.hold -- kill $(cat "${STATESETUP}/qemu_2.pid")
-	context_wait qemu_1
-	context_wait qemu_2
-
-	${NSTOOL} exec ${STATESETUP}/ns1.hold -- kill $(cat "${STATESETUP}/passt_2.pid")
-	context_wait passt_1
-	context_wait passt_2
-	${NSTOOL} stop "${STATESETUP}/ns1.hold"
-	context_wait pasta_1
-
-	rm -f "${STATESETUP}/passt_1.pid" "${STATESETUP}/passt_2.pid"
-	rm -f "${STATESETUP}/pasta_1.pid" "${STATESETUP}/pasta_2.pid"
-
-	teardown_context_watch ${PANE_HOST} host
-
-	teardown_context_watch ${PANE_GUEST_1} qemu_1 guest_1
-	teardown_context_watch ${PANE_GUEST_2} qemu_2 guest_2
-	teardown_context_watch ${PANE_PASST_1} pasta_1 passt_1
-	teardown_context_watch ${PANE_PASST_2} pasta_1 passt_2
-}
-
 # teardown_demo_passt() - Exit namespace, kill qemu, passt and pasta
 teardown_demo_passt() {
 	tmux send-keys -t ${PANE_GUEST} "C-c"
--- a/test/lib/test
+++ b/test/lib/test
@ -20,7 +20,10 @@ test_iperf3s() {
 	__sctx="${1}"
 	__port="${2}"

-	pane_or_context_run "${__sctx}" 'iperf3 -s -p'${__port}' -D -I s.pid'
+	pane_or_context_run_bg "${__sctx}" 				\
+		 'iperf3 -s -p'${__port}' & echo $! > s.pid'		\
+
+	sleep 1		# Wait for server to be ready
 }

 # test_iperf3k() - Kill iperf3 server
@ -28,7 +31,7 @@ test_iperf3s() {
 test_iperf3k() {
 	__sctx="${1}"

-	pane_or_context_run "${__sctx}" 'kill -INT $(cat s.pid)'
+	pane_or_context_run "${__sctx}" 'kill -INT $(cat s.pid); rm s.pid'

 	sleep 1		# Wait for kernel to free up ports
 }
@ -65,45 +68,6 @@ test_iperf3() {
 	TEST_ONE_subs="$(list_add_pair "${TEST_ONE_subs}" "__${__var}__" "${__bw}" )"
 }

-# test_iperf3m() - Ugly helper for iperf3 directive, guest migration variant
-# $1:	Variable name: to put the measure bandwidth into
-# $2:	Initial source/client context
-# $3:	Second source/client context the guest is moving to
-# $4:	Destination name or address for client
-# $5:	Port number, ${i} is translated to process index
-# $6:	Run time, in seconds
-# $7:	Client options
-test_iperf3m() {
-	__var="${1}"; shift
-	__cctx="${1}"; shift
-	__cctx2="${1}"; shift
-	__dest="${1}"; shift
-	__port="${1}"; shift
-	__time="${1}"; shift
-
-	pane_or_context_run "${__cctx}" 'rm -f c.json'
-
-        # A 1s wait for connection on what's basically a local link
-        # indicates something is pretty wrong
-        __timeout=1000
-	pane_or_context_run_bg "${__cctx}" 				\
-		 'iperf3 -J -c '${__dest}' -p '${__port}		\
-		 '	 --connect-timeout '${__timeout}		\
-		 '	 -t'${__time}' -i0 '"${@}"' > c.json'		\
-
-	__jval=".end.sum_received.bits_per_second"
-
-	sleep $((${__time} + 3))
-
-	pane_or_context_output "${__cctx2}"				\
-		 'cat c.json'
-
-	__bw=$(pane_or_context_output "${__cctx2}"			\
-		 'cat c.json | jq -rMs "map('${__jval}') | add"')
-
-	TEST_ONE_subs="$(list_add_pair "${TEST_ONE_subs}" "__${__var}__" "${__bw}" )"
-}
-
 test_one_line() {
 	__line="${1}"

@ -213,12 +177,6 @@ test_one_line() {
 	"guest2w")
 		pane_or_context_wait guest_2 || TEST_ONE_nok=1
 		;;
-	"mon")
-		pane_or_context_run mon "${__arg}" || TEST_ONE_nok=1
-		;;
-	"monb")
-		pane_or_context_run_bg mon "${__arg}"
-		;;
 	"ns")
 		pane_or_context_run ns "${__arg}" || TEST_ONE_nok=1
 		;;
@ -334,9 +292,6 @@ test_one_line() {
 	"iperf3")
 		test_iperf3 ${__arg}
 		;;
-	"iperf3m")
-		test_iperf3m ${__arg}
-		;;
 	"set")
 		TEST_ONE_subs="$(list_add_pair "${TEST_ONE_subs}" "__${__arg%% *}__" "${__arg#* }")"
 		;;
--- a/test/migrate/basic
+++ b/test/migrate/basic
@ -1,59 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0-or-later
-#
-# PASST - Plug A Simple Socket Transport
-#  for qemu/UNIX domain socket mode
-#
-# PASTA - Pack A Subtle Tap Abstraction
-#  for network namespace/tap device mode
-#
-# test/migrate/basic - Check basic migration functionality
-#
-# Copyright (c) 2025 Red Hat GmbH
-# Author: Stefano Brivio <sbrivio@redhat.com>
-
-g1tools	ip jq dhclient socat cat
-htools	ip jq
-
-set	MAP_HOST4 192.0.2.1
-set	MAP_HOST6 2001:db8:9a55::1
-set	MAP_NS4 192.0.2.2
-set	MAP_NS6 2001:db8:9a55::2
-
-test	Interface name
-g1out	IFNAME1 ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname'
-hout	HOST_IFNAME ip -j -4 route show|jq -rM '[.[] | select(.dst == "default").dev] | .[0]'
-hout	HOST_IFNAME6 ip -j -6 route show|jq -rM '[.[] | select(.dst == "default").dev] | .[0]'
-check	[ -n "__IFNAME1__" ]
-
-test	DHCP: address
-guest1	ip link set dev __IFNAME1__ up
-guest1	/sbin/dhclient -4 __IFNAME1__
-g1out	ADDR1 ip -j -4 addr show|jq -rM '.[] | select(.ifname == "__IFNAME1__").addr_info[0].local'
-hout	HOST_ADDR ip -j -4 addr show|jq -rM '.[] | select(.ifname == "__HOST_IFNAME__").addr_info[0].local'
-check	[ "__ADDR1__" = "__HOST_ADDR__" ]
-
-test	DHCPv6: address
-# Link is up now, wait for DAD to complete
-guest1	while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done
-guest1	/sbin/dhclient -6 __IFNAME1__
-# Wait for DAD to complete on the DHCP address
-guest1	while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done
-g1out	ADDR1_6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__IFNAME1__").addr_info[] | select(.prefixlen == 128).local] | .[0]'
-hout	HOST_ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__HOST_IFNAME6__").addr_info[] | select(.scope == "global" and .deprecated != true).local] | .[0]'
-check	[ "__ADDR1_6__" = "__HOST_ADDR6__" ]
-
-test	TCP/IPv4: guest1/guest2 > host
-g1out	GW1 ip -j -4 route show|jq -rM '.[] | select(.dst == "default").gateway'
-hostb	socat -u TCP4-LISTEN:10006 OPEN:__STATESETUP__/msg,create,trunc
-sleep	1
-# Option 1: via spliced path in pasta, namespace to host
-# guest1b	{ printf "Hello from guest 1"; sleep 10; printf " and from guest 2\n"; } | socat -u STDIN TCP4:__GW1__:10003
-# Option 2: via --map-guest-addr (tap) in pasta, namespace to host
-guest1b	{ printf "Hello from guest 1"; sleep 3; printf " and from guest 2\n"; } | socat -u STDIN TCP4:__MAP_HOST4__:10006
-sleep	1
-
-mon	echo "migrate tcp:0:20005" | socat -u STDIN UNIX:__STATESETUP__/qemu_1_mon.sock
-
-hostw
-hout	MSG cat __STATESETUP__/msg
-check	[ "__MSG__" = "Hello from guest 1 and from guest 2" ]
--- a/test/migrate/basic_fin
+++ b/test/migrate/basic_fin
@ -1,62 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0-or-later
-#
-# PASST - Plug A Simple Socket Transport
-#  for qemu/UNIX domain socket mode
-#
-# PASTA - Pack A Subtle Tap Abstraction
-#  for network namespace/tap device mode
-#
-# test/migrate/basic_fin - Outbound traffic across migration, half-closed socket
-#
-# Copyright (c) 2025 Red Hat GmbH
-# Author: Stefano Brivio <sbrivio@redhat.com>
-
-g1tools	ip jq dhclient socat cat
-htools	ip jq
-
-set	MAP_HOST4 192.0.2.1
-set	MAP_HOST6 2001:db8:9a55::1
-set	MAP_NS4 192.0.2.2
-set	MAP_NS6 2001:db8:9a55::2
-
-test	Interface name
-g1out	IFNAME1 ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname'
-hout	HOST_IFNAME ip -j -4 route show|jq -rM '[.[] | select(.dst == "default").dev] | .[0]'
-hout	HOST_IFNAME6 ip -j -6 route show|jq -rM '[.[] | select(.dst == "default").dev] | .[0]'
-check	[ -n "__IFNAME1__" ]
-
-test	DHCP: address
-guest1	ip link set dev __IFNAME1__ up
-guest1	/sbin/dhclient -4 __IFNAME1__
-g1out	ADDR1 ip -j -4 addr show|jq -rM '.[] | select(.ifname == "__IFNAME1__").addr_info[0].local'
-hout	HOST_ADDR ip -j -4 addr show|jq -rM '.[] | select(.ifname == "__HOST_IFNAME__").addr_info[0].local'
-check	[ "__ADDR1__" = "__HOST_ADDR__" ]
-
-test	DHCPv6: address
-# Link is up now, wait for DAD to complete
-guest1	while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done
-guest1	/sbin/dhclient -6 __IFNAME1__
-# Wait for DAD to complete on the DHCP address
-guest1	while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done
-g1out	ADDR1_6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__IFNAME1__").addr_info[] | select(.prefixlen == 128).local] | .[0]'
-hout	HOST_ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__HOST_IFNAME6__").addr_info[] | select(.scope == "global" and .deprecated != true).local] | .[0]'
-check	[ "__ADDR1_6__" = "__HOST_ADDR6__" ]
-
-test	TCP/IPv4: guest1, half-close, guest2 > host
-g1out	GW1 ip -j -4 route show|jq -rM '.[] | select(.dst == "default").gateway'
-
-hostb	echo FIN | socat TCP4-LISTEN:10006,shut-down STDIO,ignoreeof > __STATESETUP__/msg
-#hostb	socat -u TCP4-LISTEN:10006 OPEN:__STATESETUP__/msg,create,trunc
-
-#sleep	20
-# Option 1: via spliced path in pasta, namespace to host
-# guest1b	{ printf "Hello from guest 1"; sleep 10; printf " and from guest 2\n"; } | socat -u STDIN TCP4:__GW1__:10003
-# Option 2: via --map-guest-addr (tap) in pasta, namespace to host
-guest1b	{ printf "Hello from guest 1"; sleep 3; printf " and from guest 2\n"; } | socat -u STDIN TCP4:__MAP_HOST4__:10006
-sleep	1
-
-mon	echo "migrate tcp:0:20005" | socat -u STDIN UNIX:__STATESETUP__/qemu_1_mon.sock
-
-hostw
-hout	MSG cat __STATESETUP__/msg
-check	[ "__MSG__" = "Hello from guest 1 and from guest 2" ]
--- a/test/migrate/bidirectional
+++ b/test/migrate/bidirectional
@ -1,64 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0-or-later
-#
-# PASST - Plug A Simple Socket Transport
-#  for qemu/UNIX domain socket mode
-#
-# PASTA - Pack A Subtle Tap Abstraction
-#  for network namespace/tap device mode
-#
-# test/migrate/bidirectional - Check migration with messages in both directions
-#
-# Copyright (c) 2025 Red Hat GmbH
-# Author: Stefano Brivio <sbrivio@redhat.com>
-
-g1tools	ip jq dhclient socat cat
-htools	ip jq
-
-set	MAP_HOST4 192.0.2.1
-set	MAP_HOST6 2001:db8:9a55::1
-set	MAP_NS4 192.0.2.2
-set	MAP_NS6 2001:db8:9a55::2
-
-test	Interface name
-g1out	IFNAME1 ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname'
-hout	HOST_IFNAME ip -j -4 route show|jq -rM '[.[] | select(.dst == "default").dev] | .[0]'
-hout	HOST_IFNAME6 ip -j -6 route show|jq -rM '[.[] | select(.dst == "default").dev] | .[0]'
-check	[ -n "__IFNAME1__" ]
-
-test	DHCP: address
-guest1	ip link set dev __IFNAME1__ up
-guest1	/sbin/dhclient -4 __IFNAME1__
-g1out	ADDR1 ip -j -4 addr show|jq -rM '.[] | select(.ifname == "__IFNAME1__").addr_info[0].local'
-hout	HOST_ADDR ip -j -4 addr show|jq -rM '.[] | select(.ifname == "__HOST_IFNAME__").addr_info[0].local'
-check	[ "__ADDR1__" = "__HOST_ADDR__" ]
-
-test	TCP/IPv4: guest1/guest2 > host, host > guest1/guest2
-g1out	GW1 ip -j -4 route show|jq -rM '.[] | select(.dst == "default").gateway'
-
-hostb	socat -u TCP4-LISTEN:10006 OPEN:__STATESETUP__/msg,create,trunc
-guest1b	socat -u TCP4-LISTEN:10001 OPEN:msg,create,trunc
-sleep	1
-
-guest1b	socat -u UNIX-RECV:proxy.sock,null-eof TCP4:__MAP_HOST4__:10006
-hostb	socat -u UNIX-RECV:__STATESETUP__/proxy.sock,null-eof TCP4:__ADDR1__:10001
-sleep	1
-guest1	printf "Hello from guest 1" | socat -u STDIN UNIX:proxy.sock
-host	printf "Dear guest 1," | socat -u STDIN UNIX:__STATESETUP__/proxy.sock
-sleep	1
-
-mon	echo "migrate tcp:0:20005" | socat -u STDIN UNIX:__STATESETUP__/qemu_1_mon.sock
-
-sleep	2
-guest2	printf " and from guest 2" | socat -u STDIN UNIX:proxy.sock,shut-null
-host	printf " you are now guest 2" | socat -u STDIN UNIX:__STATESETUP__/proxy.sock,shut-null
-
-hostw
-# FIXME: guest2w doesn't work here because shell jobs are (also) from guest #1,
-# use sleep 1 for the moment
-sleep	1
-
-hout	MSG cat __STATESETUP__/msg
-check	[ "__MSG__" = "Hello from guest 1 and from guest 2" ]
-
-g2out	MSG cat msg
-check	[ "__MSG__" = "Dear guest 1, you are now guest 2" ]
--- a/test/migrate/bidirectional_fin
+++ b/test/migrate/bidirectional_fin
@ -1,64 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0-or-later
-#
-# PASST - Plug A Simple Socket Transport
-#  for qemu/UNIX domain socket mode
-#
-# PASTA - Pack A Subtle Tap Abstraction
-#  for network namespace/tap device mode
-#
-# test/migrate/bidirectional_fin - Both directions, half-closed sockets
-#
-# Copyright (c) 2025 Red Hat GmbH
-# Author: Stefano Brivio <sbrivio@redhat.com>
-
-g1tools	ip jq dhclient socat cat
-htools	ip jq
-
-set	MAP_HOST4 192.0.2.1
-set	MAP_HOST6 2001:db8:9a55::1
-set	MAP_NS4 192.0.2.2
-set	MAP_NS6 2001:db8:9a55::2
-
-test	Interface name
-g1out	IFNAME1 ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname'
-hout	HOST_IFNAME ip -j -4 route show|jq -rM '[.[] | select(.dst == "default").dev] | .[0]'
-hout	HOST_IFNAME6 ip -j -6 route show|jq -rM '[.[] | select(.dst == "default").dev] | .[0]'
-check	[ -n "__IFNAME1__" ]
-
-test	DHCP: address
-guest1	ip link set dev __IFNAME1__ up
-guest1	/sbin/dhclient -4 __IFNAME1__
-g1out	ADDR1 ip -j -4 addr show|jq -rM '.[] | select(.ifname == "__IFNAME1__").addr_info[0].local'
-hout	HOST_ADDR ip -j -4 addr show|jq -rM '.[] | select(.ifname == "__HOST_IFNAME__").addr_info[0].local'
-check	[ "__ADDR1__" = "__HOST_ADDR__" ]
-
-test	TCP/IPv4: guest1/guest2 <- (half closed) -> host
-g1out	GW1 ip -j -4 route show|jq -rM '.[] | select(.dst == "default").gateway'
-
-hostb	echo FIN | socat TCP4-LISTEN:10006,shut-down STDIO,ignoreeof > __STATESETUP__/msg
-guest1b	echo FIN | socat TCP4-LISTEN:10001,shut-down STDIO,ignoreeof > msg
-sleep	1
-
-guest1b	socat -u UNIX-RECV:proxy.sock,null-eof TCP4:__MAP_HOST4__:10006
-hostb	socat -u UNIX-RECV:__STATESETUP__/proxy.sock,null-eof TCP4:__ADDR1__:10001
-sleep	1
-guest1	printf "Hello from guest 1" | socat -u STDIN UNIX:proxy.sock
-host	printf "Dear guest 1," | socat -u STDIN UNIX:__STATESETUP__/proxy.sock
-sleep	1
-
-mon	echo "migrate tcp:0:20005" | socat -u STDIN UNIX:__STATESETUP__/qemu_1_mon.sock
-
-sleep	2
-guest2	printf " and from guest 2" | socat -u STDIN UNIX:proxy.sock,shut-null
-host	printf " you are now guest 2" | socat -u STDIN UNIX:__STATESETUP__/proxy.sock,shut-null
-
-hostw
-# FIXME: guest2w doesn't work here because shell jobs are (also) from guest #1,
-# use sleep 1 for the moment
-sleep	1
-
-hout	MSG cat __STATESETUP__/msg
-check	[ "__MSG__" = "Hello from guest 1 and from guest 2" ]
-
-g2out	MSG cat msg
-check	[ "__MSG__" = "Dear guest 1, you are now guest 2" ]
--- a/test/migrate/iperf3_bidir6
+++ b/test/migrate/iperf3_bidir6
@ -1,58 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0-or-later
-#
-# PASST - Plug A Simple Socket Transport
-#  for qemu/UNIX domain socket mode
-#
-# PASTA - Pack A Subtle Tap Abstraction
-#  for network namespace/tap device mode
-#
-# test/migrate/iperf3_bidir6 - Migration behaviour with many bidirectional flows
-#
-# Copyright (c) 2025 Red Hat GmbH
-# Author: Stefano Brivio <sbrivio@redhat.com>
-
-g1tools	ip jq dhclient socat cat
-htools	ip jq
-
-set	MAP_HOST4 192.0.2.1
-set	MAP_HOST6 2001:db8:9a55::1
-set	MAP_NS4 192.0.2.2
-set	MAP_NS6 2001:db8:9a55::2
-
-set	THREADS 128
-set	TIME 3
-set	OMIT 0.1
-set	OPTS -Z -P __THREADS__ -O__OMIT__ -N --bidir
-
-test	Interface name
-g1out	IFNAME1 ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname'
-hout	HOST_IFNAME ip -j -4 route show|jq -rM '[.[] | select(.dst == "default").dev] | .[0]'
-hout	HOST_IFNAME6 ip -j -6 route show|jq -rM '[.[] | select(.dst == "default").dev] | .[0]'
-check	[ -n "__IFNAME1__" ]
-
-test	DHCP: address
-guest1	ip link set dev __IFNAME1__ up
-guest1	/sbin/dhclient -4 __IFNAME1__
-g1out	ADDR1 ip -j -4 addr show|jq -rM '.[] | select(.ifname == "__IFNAME1__").addr_info[0].local'
-hout	HOST_ADDR ip -j -4 addr show|jq -rM '.[] | select(.ifname == "__HOST_IFNAME__").addr_info[0].local'
-check	[ "__ADDR1__" = "__HOST_ADDR__" ]
-
-test	DHCPv6: address
-# Link is up now, wait for DAD to complete
-guest1	while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done
-guest1	/sbin/dhclient -6 __IFNAME1__
-# Wait for DAD to complete on the DHCP address
-guest1	while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done
-g1out	ADDR1_6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__IFNAME1__").addr_info[] | select(.prefixlen == 128).local] | .[0]'
-hout	HOST_ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__HOST_IFNAME6__").addr_info[] | select(.scope == "global" and .deprecated != true).local] | .[0]'
-check	[ "__ADDR1_6__" = "__HOST_ADDR6__" ]
-
-test	TCP/IPv6 host <-> guest flood, many flows, during migration
-
-monb	sleep 1; echo "migrate tcp:0:20005" | socat -u STDIN UNIX:__STATESETUP__/qemu_1_mon.sock
-
-iperf3s	host 10006
-iperf3m	BW guest_1 guest_2 __MAP_HOST6__ 10006 __TIME__ __OPTS__
-bw	__BW__ 1 2
-
-iperf3k	host
--- a/test/migrate/iperf3_in4
+++ b/test/migrate/iperf3_in4
@ -1,50 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0-or-later
-#
-# PASST - Plug A Simple Socket Transport
-#  for qemu/UNIX domain socket mode
-#
-# PASTA - Pack A Subtle Tap Abstraction
-#  for network namespace/tap device mode
-#
-# test/migrate/iperf3_in4 - Migration behaviour under inbound IPv4 flood
-#
-# Copyright (c) 2025 Red Hat GmbH
-# Author: Stefano Brivio <sbrivio@redhat.com>
-
-g1tools	ip jq dhclient socat cat
-htools	ip jq
-
-set	MAP_HOST4 192.0.2.1
-set	MAP_HOST6 2001:db8:9a55::1
-set	MAP_NS4 192.0.2.2
-set	MAP_NS6 2001:db8:9a55::2
-
-guest1	/sbin/sysctl -w net.core.rmem_max=33554432
-guest1	/sbin/sysctl -w net.core.wmem_max=33554432
-
-set	THREADS 1
-set	TIME 4
-set	OMIT 0.1
-set	OPTS -Z -P __THREADS__ -O__OMIT__ -N -R
-
-test	Interface name
-g1out	IFNAME1 ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname'
-hout	HOST_IFNAME ip -j -4 route show|jq -rM '[.[] | select(.dst == "default").dev] | .[0]'
-check	[ -n "__IFNAME1__" ]
-
-test	DHCP: address
-guest1	ip link set dev __IFNAME1__ up
-guest1	/sbin/dhclient -4 __IFNAME1__
-g1out	ADDR1 ip -j -4 addr show|jq -rM '.[] | select(.ifname == "__IFNAME1__").addr_info[0].local'
-hout	HOST_ADDR ip -j -4 addr show|jq -rM '.[] | select(.ifname == "__HOST_IFNAME__").addr_info[0].local'
-check	[ "__ADDR1__" = "__HOST_ADDR__" ]
-
-test	TCP/IPv4 host to guest throughput during migration
-
-monb	sleep 1; echo "migrate tcp:0:20005" | socat -u STDIN UNIX:__STATESETUP__/qemu_1_mon.sock
-
-iperf3s	host 10006
-iperf3m	BW guest_1 guest_2 __MAP_HOST4__ 10006 __TIME__ __OPTS__
-bw	__BW__ 1 2
-
-iperf3k	host
--- a/test/migrate/iperf3_in6
+++ b/test/migrate/iperf3_in6
@ -1,58 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0-or-later
-#
-# PASST - Plug A Simple Socket Transport
-#  for qemu/UNIX domain socket mode
-#
-# PASTA - Pack A Subtle Tap Abstraction
-#  for network namespace/tap device mode
-#
-# test/migrate/iperf3_in6 - Migration behaviour under inbound IPv6 flood
-#
-# Copyright (c) 2025 Red Hat GmbH
-# Author: Stefano Brivio <sbrivio@redhat.com>
-
-g1tools	ip jq dhclient socat cat
-htools	ip jq
-
-set	MAP_HOST4 192.0.2.1
-set	MAP_HOST6 2001:db8:9a55::1
-set	MAP_NS4 192.0.2.2
-set	MAP_NS6 2001:db8:9a55::2
-
-set	THREADS 4
-set	TIME 3
-set	OMIT 0.1
-set	OPTS -Z -P __THREADS__ -O__OMIT__ -N -R
-
-test	Interface name
-g1out	IFNAME1 ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname'
-hout	HOST_IFNAME ip -j -4 route show|jq -rM '[.[] | select(.dst == "default").dev] | .[0]'
-hout	HOST_IFNAME6 ip -j -6 route show|jq -rM '[.[] | select(.dst == "default").dev] | .[0]'
-check	[ -n "__IFNAME1__" ]
-
-test	DHCP: address
-guest1	ip link set dev __IFNAME1__ up
-guest1	/sbin/dhclient -4 __IFNAME1__
-g1out	ADDR1 ip -j -4 addr show|jq -rM '.[] | select(.ifname == "__IFNAME1__").addr_info[0].local'
-hout	HOST_ADDR ip -j -4 addr show|jq -rM '.[] | select(.ifname == "__HOST_IFNAME__").addr_info[0].local'
-check	[ "__ADDR1__" = "__HOST_ADDR__" ]
-
-test	DHCPv6: address
-# Link is up now, wait for DAD to complete
-guest1	while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done
-guest1	/sbin/dhclient -6 __IFNAME1__
-# Wait for DAD to complete on the DHCP address
-guest1	while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done
-g1out	ADDR1_6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__IFNAME1__").addr_info[] | select(.prefixlen == 128).local] | .[0]'
-hout	HOST_ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__HOST_IFNAME6__").addr_info[] | select(.scope == "global" and .deprecated != true).local] | .[0]'
-check	[ "__ADDR1_6__" = "__HOST_ADDR6__" ]
-
-test	TCP/IPv6 host to guest throughput during migration
-
-monb	sleep 1; echo "migrate tcp:0:20005" | socat -u STDIN UNIX:__STATESETUP__/qemu_1_mon.sock
-
-iperf3s	host 10006
-iperf3m	BW guest_1 guest_2 __MAP_HOST6__ 10006 __TIME__ __OPTS__
-bw	__BW__ 1 2
-
-iperf3k	host
--- a/test/migrate/iperf3_many_out6
+++ b/test/migrate/iperf3_many_out6
@ -1,60 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0-or-later
-#
-# PASST - Plug A Simple Socket Transport
-#  for qemu/UNIX domain socket mode
-#
-# PASTA - Pack A Subtle Tap Abstraction
-#  for network namespace/tap device mode
-#
-# test/migrate/iperf3_many_out6 - Migration behaviour with many outbound flows
-#
-# Copyright (c) 2025 Red Hat GmbH
-# Author: Stefano Brivio <sbrivio@redhat.com>
-
-g1tools	ip jq dhclient socat cat
-htools	ip jq
-
-set	MAP_HOST4 192.0.2.1
-set	MAP_HOST6 2001:db8:9a55::1
-set	MAP_NS4 192.0.2.2
-set	MAP_NS6 2001:db8:9a55::2
-
-set	THREADS 16
-set	TIME 3
-set	OMIT 0.1
-set	OPTS -Z -P __THREADS__ -O__OMIT__ -N -l 1M
-
-test	Interface name
-g1out	IFNAME1 ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname'
-hout	HOST_IFNAME ip -j -4 route show|jq -rM '[.[] | select(.dst == "default").dev] | .[0]'
-hout	HOST_IFNAME6 ip -j -6 route show|jq -rM '[.[] | select(.dst == "default").dev] | .[0]'
-check	[ -n "__IFNAME1__" ]
-
-test	DHCP: address
-guest1	ip link set dev __IFNAME1__ up
-guest1	/sbin/dhclient -4 __IFNAME1__
-g1out	ADDR1 ip -j -4 addr show|jq -rM '.[] | select(.ifname == "__IFNAME1__").addr_info[0].local'
-hout	HOST_ADDR ip -j -4 addr show|jq -rM '.[] | select(.ifname == "__HOST_IFNAME__").addr_info[0].local'
-check	[ "__ADDR1__" = "__HOST_ADDR__" ]
-
-test	DHCPv6: address
-# Link is up now, wait for DAD to complete
-guest1	while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done
-guest1	/sbin/dhclient -6 __IFNAME1__
-# Wait for DAD to complete on the DHCP address
-guest1	while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done
-g1out	ADDR1_6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__IFNAME1__").addr_info[] | select(.prefixlen == 128).local] | .[0]'
-hout	HOST_ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__HOST_IFNAME6__").addr_info[] | select(.scope == "global" and .deprecated != true).local] | .[0]'
-check	[ "__ADDR1_6__" = "__HOST_ADDR6__" ]
-
-test	TCP/IPv6 guest to host flood, many flows, during migration
-
-test	TCP/IPv6 host to guest throughput during migration
-
-monb	sleep 1; echo "migrate tcp:0:20005" | socat -u STDIN UNIX:__STATESETUP__/qemu_1_mon.sock
-
-iperf3s	host 10006
-iperf3m	BW guest_1 guest_2 __MAP_HOST6__ 10006 __TIME__ __OPTS__
-bw	__BW__ 1 2
-
-iperf3k	host
--- a/test/migrate/iperf3_out4
+++ b/test/migrate/iperf3_out4
@ -1,47 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0-or-later
-#
-# PASST - Plug A Simple Socket Transport
-#  for qemu/UNIX domain socket mode
-#
-# PASTA - Pack A Subtle Tap Abstraction
-#  for network namespace/tap device mode
-#
-# test/migrate/iperf3_out4 - Migration behaviour under outbound IPv4 flood
-#
-# Copyright (c) 2025 Red Hat GmbH
-# Author: Stefano Brivio <sbrivio@redhat.com>
-
-g1tools	ip jq dhclient socat cat
-htools	ip jq
-
-set	MAP_HOST4 192.0.2.1
-set	MAP_HOST6 2001:db8:9a55::1
-set	MAP_NS4 192.0.2.2
-set	MAP_NS6 2001:db8:9a55::2
-
-set	THREADS 6
-set	TIME 2
-set	OMIT 0.1
-set	OPTS -P __THREADS__ -O__OMIT__ -Z -N -l 1M
-
-test	Interface name
-g1out	IFNAME1 ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname'
-hout	HOST_IFNAME ip -j -4 route show|jq -rM '[.[] | select(.dst == "default").dev] | .[0]'
-check	[ -n "__IFNAME1__" ]
-
-test	DHCP: address
-guest1	ip link set dev __IFNAME1__ up
-guest1	/sbin/dhclient -4 __IFNAME1__
-g1out	ADDR1 ip -j -4 addr show|jq -rM '.[] | select(.ifname == "__IFNAME1__").addr_info[0].local'
-hout	HOST_ADDR ip -j -4 addr show|jq -rM '.[] | select(.ifname == "__HOST_IFNAME__").addr_info[0].local'
-check	[ "__ADDR1__" = "__HOST_ADDR__" ]
-
-test	TCP/IPv4 guest to host throughput during migration
-
-monb	sleep 1; echo "migrate tcp:0:20005" | socat -u STDIN UNIX:__STATESETUP__/qemu_1_mon.sock
-
-iperf3s	host 10006
-iperf3m	BW guest_1 guest_2 __MAP_HOST4__ 10006 __TIME__ __OPTS__
-bw	__BW__ 1 2
-
-iperf3k	host
--- a/test/migrate/iperf3_out6
+++ b/test/migrate/iperf3_out6
@ -1,58 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0-or-later
-#
-# PASST - Plug A Simple Socket Transport
-#  for qemu/UNIX domain socket mode
-#
-# PASTA - Pack A Subtle Tap Abstraction
-#  for network namespace/tap device mode
-#
-# test/migrate/iperf3_out6 - Migration behaviour under outbound IPv6 flood
-#
-# Copyright (c) 2025 Red Hat GmbH
-# Author: Stefano Brivio <sbrivio@redhat.com>
-
-g1tools	ip jq dhclient socat cat
-htools	ip jq
-
-set	MAP_HOST4 192.0.2.1
-set	MAP_HOST6 2001:db8:9a55::1
-set	MAP_NS4 192.0.2.2
-set	MAP_NS6 2001:db8:9a55::2
-
-set	THREADS 6
-set	TIME 2
-set	OMIT 0.1
-set	OPTS -P __THREADS__ -O__OMIT__ -Z -N -l 1M
-
-test	Interface name
-g1out	IFNAME1 ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname'
-hout	HOST_IFNAME ip -j -4 route show|jq -rM '[.[] | select(.dst == "default").dev] | .[0]'
-hout	HOST_IFNAME6 ip -j -6 route show|jq -rM '[.[] | select(.dst == "default").dev] | .[0]'
-check	[ -n "__IFNAME1__" ]
-
-test	DHCP: address
-guest1	ip link set dev __IFNAME1__ up
-guest1	/sbin/dhclient -4 __IFNAME1__
-g1out	ADDR1 ip -j -4 addr show|jq -rM '.[] | select(.ifname == "__IFNAME1__").addr_info[0].local'
-hout	HOST_ADDR ip -j -4 addr show|jq -rM '.[] | select(.ifname == "__HOST_IFNAME__").addr_info[0].local'
-check	[ "__ADDR1__" = "__HOST_ADDR__" ]
-
-test	DHCPv6: address
-# Link is up now, wait for DAD to complete
-guest1	while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done
-guest1	/sbin/dhclient -6 __IFNAME1__
-# Wait for DAD to complete on the DHCP address
-guest1	while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done
-g1out	ADDR1_6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__IFNAME1__").addr_info[] | select(.prefixlen == 128).local] | .[0]'
-hout	HOST_ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__HOST_IFNAME6__").addr_info[] | select(.scope == "global" and .deprecated != true).local] | .[0]'
-check	[ "__ADDR1_6__" = "__HOST_ADDR6__" ]
-
-test	TCP/IPv6 guest to host throughput during migration
-
-monb	sleep 1; echo "migrate tcp:0:20005" | socat -u STDIN UNIX:__STATESETUP__/qemu_1_mon.sock
-
-iperf3s	host 10006
-iperf3m	BW guest_1 guest_2 __MAP_HOST6__ 10006 __TIME__ __OPTS__
-bw	__BW__ 1 2
-
-iperf3k	host
--- a/test/migrate/rampstream_in
+++ b/test/migrate/rampstream_in
@ -1,59 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0-or-later
-#
-# PASST - Plug A Simple Socket Transport
-#  for qemu/UNIX domain socket mode
-#
-# PASTA - Pack A Subtle Tap Abstraction
-#  for network namespace/tap device mode
-#
-# test/migrate/rampstream_in - Check sequence correctness with inbound ramp
-#
-# Copyright (c) 2025 Red Hat
-# Author: David Gibson <david@gibson.dropbear.id.au>
-
-g1tools	ip jq dhclient socat cat
-htools	ip jq
-
-set	MAP_HOST4 192.0.2.1
-set	MAP_HOST6 2001:db8:9a55::1
-set	MAP_NS4 192.0.2.2
-set	MAP_NS6 2001:db8:9a55::2
-set	RAMPS 6000000
-
-test	Interface name
-g1out	IFNAME1 ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname'
-hout	HOST_IFNAME ip -j -4 route show|jq -rM '[.[] | select(.dst == "default").dev] | .[0]'
-hout	HOST_IFNAME6 ip -j -6 route show|jq -rM '[.[] | select(.dst == "default").dev] | .[0]'
-check	[ -n "__IFNAME1__" ]
-
-test	DHCP: address
-guest1	ip link set dev __IFNAME1__ up
-guest1	/sbin/dhclient -4 __IFNAME1__
-g1out	ADDR1 ip -j -4 addr show|jq -rM '.[] | select(.ifname == "__IFNAME1__").addr_info[0].local'
-hout	HOST_ADDR ip -j -4 addr show|jq -rM '.[] | select(.ifname == "__HOST_IFNAME__").addr_info[0].local'
-check	[ "__ADDR1__" = "__HOST_ADDR__" ]
-
-test	DHCPv6: address
-# Link is up now, wait for DAD to complete
-guest1	while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done
-guest1	/sbin/dhclient -6 __IFNAME1__
-# Wait for DAD to complete on the DHCP address
-guest1	while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done
-g1out	ADDR1_6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__IFNAME1__").addr_info[] | select(.prefixlen == 128).local] | .[0]'
-hout	HOST_ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__HOST_IFNAME6__").addr_info[] | select(.scope == "global" and .deprecated != true).local] | .[0]'
-check	[ "__ADDR1_6__" = "__HOST_ADDR6__" ]
-
-test	TCP/IPv4: sequence check, ramps, inbound
-g1out	GW1 ip -j -4 route show|jq -rM '.[] | select(.dst == "default").gateway'
-guest1b	socat -u TCP4-LISTEN:10001 EXEC:"rampstream-check.sh __RAMPS__"
-sleep	1
-hostb	socat -u EXEC:"test/rampstream send __RAMPS__" TCP4:__ADDR1__:10001
-
-sleep	1
-
-monb	echo "migrate tcp:0:20005" | socat -u STDIN UNIX:__STATESETUP__/qemu_1_mon.sock
-
-hostw
-
-guest2	cat rampstream.err
-guest2	[ $(cat rampstream.status) -eq 0 ]
--- a/test/migrate/rampstream_out
+++ b/test/migrate/rampstream_out
@ -1,55 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0-or-later
-#
-# PASST - Plug A Simple Socket Transport
-#  for qemu/UNIX domain socket mode
-#
-# PASTA - Pack A Subtle Tap Abstraction
-#  for network namespace/tap device mode
-#
-# test/migrate/rampstream_out - Check sequence correctness with outbound ramp
-#
-# Copyright (c) 2025 Red Hat
-# Author: David Gibson <david@gibson.dropbear.id.au>
-
-g1tools	ip jq dhclient socat cat
-htools	ip jq
-
-set	MAP_HOST4 192.0.2.1
-set	MAP_HOST6 2001:db8:9a55::1
-set	MAP_NS4 192.0.2.2
-set	MAP_NS6 2001:db8:9a55::2
-set	RAMPS 6000000
-
-test	Interface name
-g1out	IFNAME1 ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname'
-hout	HOST_IFNAME ip -j -4 route show|jq -rM '[.[] | select(.dst == "default").dev] | .[0]'
-hout	HOST_IFNAME6 ip -j -6 route show|jq -rM '[.[] | select(.dst == "default").dev] | .[0]'
-check	[ -n "__IFNAME1__" ]
-
-test	DHCP: address
-guest1	ip link set dev __IFNAME1__ up
-guest1	/sbin/dhclient -4 __IFNAME1__
-g1out	ADDR1 ip -j -4 addr show|jq -rM '.[] | select(.ifname == "__IFNAME1__").addr_info[0].local'
-hout	HOST_ADDR ip -j -4 addr show|jq -rM '.[] | select(.ifname == "__HOST_IFNAME__").addr_info[0].local'
-check	[ "__ADDR1__" = "__HOST_ADDR__" ]
-
-test	DHCPv6: address
-# Link is up now, wait for DAD to complete
-guest1	while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done
-guest1	/sbin/dhclient -6 __IFNAME1__
-# Wait for DAD to complete on the DHCP address
-guest1	while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done
-g1out	ADDR1_6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__IFNAME1__").addr_info[] | select(.prefixlen == 128).local] | .[0]'
-hout	HOST_ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__HOST_IFNAME6__").addr_info[] | select(.scope == "global" and .deprecated != true).local] | .[0]'
-check	[ "__ADDR1_6__" = "__HOST_ADDR6__" ]
-
-test	TCP/IPv4: sequence check, ramps, outbound
-g1out	GW1 ip -j -4 route show|jq -rM '.[] | select(.dst == "default").gateway'
-hostb	socat -u TCP4-LISTEN:10006 EXEC:"test/rampstream check __RAMPS__"
-sleep	1
-guest1b	socat -u EXEC:"rampstream send __RAMPS__" TCP4:__MAP_HOST4__:10006
-sleep	1
-
-mon	echo "migrate tcp:0:20005" | socat -u STDIN UNIX:__STATESETUP__/qemu_1_mon.sock
-
-hostw
--- a/test/passt.mbuto
+++ b/test/passt.mbuto
@ -13,8 +13,7 @@
 PROGS="${PROGS:-ash,dash,bash ip mount ls insmod mkdir ln cat chmod lsmod
       modprobe find grep mknod mv rm umount jq iperf3 dhclient hostname
       sed tr chown sipcalc cut socat dd strace ping tail killall sleep sysctl
-       nproc tcp_rr tcp_crr udp_rr which tee seq bc sshd ssh-keygen cmp tcpdump
-       env}"
+       nproc tcp_rr tcp_crr udp_rr which tee seq bc sshd ssh-keygen cmp}"

 # OpenSSH 9.8 introduced split binaries, with sshd being the daemon, and
 # sshd-session the per-session program. We need the latter as well, and the path
@ -32,7 +31,7 @@ LINKS="${LINKS:-

 DIRS="${DIRS} /tmp /usr/sbin /usr/share /var/log /var/lib /etc/ssh /run/sshd /root/.ssh"

-COPIES="${COPIES} small.bin,/root/small.bin medium.bin,/root/medium.bin big.bin,/root/big.bin rampstream,/bin/rampstream rampstream-check.sh,/bin/rampstream-check.sh"
+COPIES="${COPIES} small.bin,/root/small.bin medium.bin,/root/medium.bin big.bin,/root/big.bin"

 FIXUP="${FIXUP}"'
 	mv /sbin/* /usr/sbin || :
@ -42,7 +41,6 @@ FIXUP="${FIXUP}"'
 #!/bin/sh
 LOG=/var/log/dhclient-script.log
 echo \${reason} \${interface} >> \$LOG
-env >> \$LOG
 set >> \$LOG

 [ -n "\${new_interface_mtu}" ]       && ip link set dev \${interface} mtu \${new_interface_mtu}
@ -56,8 +54,7 @@ set >> \$LOG
 [ -n "\${new_ip6_address}" ]         && ip addr add \${new_ip6_address}/\${new_ip6_prefixlen} dev \${interface}
 [ -n "\${new_dhcp6_name_servers}" ]  && for d in \${new_dhcp6_name_servers}; do echo "nameserver \${d}%\${interface}" >> /etc/resolv.conf; done
 [ -n "\${new_dhcp6_domain_search}" ] && (printf "search"; for d in \${new_dhcp6_domain_search}; do printf " %s" "\${d}"; done; printf "\n") >> /etc/resolv.conf
-[ -n "\${new_host_name}" ]           && echo "\${new_host_name}" > /tmp/new_host_name
-[ -n "\${new_fqdn_fqdn}" ]           && echo "\${new_fqdn_fqdn}" > /tmp/new_fqdn_fqdn
+[ -n "\${new_host_name}" ]           && hostname "\${new_host_name}"
 exit 0
 EOF
 	chmod 755 /sbin/dhclient-script
@ -68,7 +65,6 @@ EOF
 	# sshd via vsock
 	cat > /etc/passwd << EOF
 root:x:0:0:root:/root:/bin/sh
-tcpdump:x:72:72:tcpdump:/:/sbin/nologin
 sshd:x:100:100:Privilege-separated SSH:/var/empty/sshd:/sbin/nologin
 EOF
 	cat > /etc/shadow << EOF
--- a/test/passt/dhcp
+++ b/test/passt/dhcp
@ -11,7 +11,7 @@
 # Copyright (c) 2021 Red Hat GmbH
 # Author: Stefano Brivio <sbrivio@redhat.com>

-gtools	ip jq dhclient sed tr hostname
+gtools	ip jq dhclient sed tr
 htools	ip jq sed tr head

 test	Interface name
@ -47,16 +47,7 @@ gout	SEARCH sed 's/\. / /g' /etc/resolv.conf | sed 's/\.$//g' | sed -n 's/^searc
 hout	HOST_SEARCH sed 's/\. / /g' /etc/resolv.conf | sed 's/\.$//g' | sed -n 's/^search \(.*\)/\1/p' | tr ' \n' ',' | sed 's/,$//;s/$/\n/'
 check	[ "__SEARCH__" = "__HOST_SEARCH__" ]

-test	DHCP: Hostname
-gout	NEW_HOST_NAME cat /tmp/new_host_name
-check	[ "__NEW_HOST_NAME__" = "hostname1" ]
-
-test	DHCP: Client FQDN
-gout	NEW_FQDN_FQDN cat /tmp/new_fqdn_fqdn
-check	[ "__NEW_FQDN_FQDN__" = "fqdn1.passt.test" ]
-
 test	DHCPv6: address
-guest	rm /tmp/new_fqdn_fqdn
 guest	/sbin/dhclient -6 __IFNAME__
 # Wait for DAD to complete
 guest	while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done
@ -79,7 +70,3 @@ test	DHCPv6: search list
 gout	SEARCH6 sed 's/\. / /g' /etc/resolv.conf | sed 's/\.$//g' | sed -n 's/^search \(.*\)/\1/p' | tr ' \n' ',' | sed 's/,$//;s/$/\n/'
 hout	HOST_SEARCH6 sed 's/\. / /g' /etc/resolv.conf | sed 's/\.$//g' | sed -n 's/^search \(.*\)/\1/p' | tr ' \n' ',' | sed 's/,$//;s/$/\n/'
 check	[ "__SEARCH6__" = "__HOST_SEARCH6__" ]
-
-test	DHCPv6: Hostname
-gout	NEW_FQDN_FQDN cat /tmp/new_fqdn_fqdn
-check	[ "__NEW_FQDN_FQDN__" = "fqdn1.passt.test" ]
--- a/test/passt/ndp
+++ b/test/passt/ndp
@ -17,13 +17,13 @@ htools	ip jq sipcalc grep cut
 test	Interface name
 gout	IFNAME ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname'
 guest	ip link set dev __IFNAME__ up
-# Wait for SLAAC & DAD to complete
-guest	while ! ip -j -6 addr show dev __IFNAME__ | jq -e '.[].addr_info.[] | select(.protocol == "kernel_ra")'; do sleep 0.1; done
+# Wait for DAD to complete
+guest	while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done
 hout	HOST_IFNAME6 ip -j -6 route show|jq -rM '[.[] | select(.dst == "default").dev] | .[0]'
 check	[ -n "__IFNAME__" ]

 test	SLAAC: prefix
-gout	ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__IFNAME__").addr_info[] | select(.protocol == "kernel_ra") | .local + "/" + (.prefixlen | tostring)] | .[0]'
+gout	ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__IFNAME__").addr_info[] | select(.scope == "global" and .protocol == "kernel_ra") | .local + "/" + (.prefixlen | tostring)] | .[0]'
 gout	PREFIX6 sipcalc __ADDR6__ | grep prefix | cut -d' ' -f4
 hout	HOST_ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__HOST_IFNAME6__").addr_info[] | select(.scope == "global" and .deprecated != true).local] | .[0]'
 hout	HOST_PREFIX6 sipcalc __HOST_ADDR6__/64 | grep prefix | cut -d' ' -f4
--- a/test/pasta/ndp
+++ b/test/pasta/ndp
@ -18,11 +18,11 @@ test	Interface name
 nsout	IFNAME ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname'
 check	[ -n "__IFNAME__" ]
 ns	ip link set dev __IFNAME__ up
-# Wait for SLAAC & DAD to complete
-ns	while ! ip -j -6 addr show dev __IFNAME__ | jq -e '.[].addr_info.[] | select(.protocol == "kernel_ra")'; do sleep 0.1; done
+# Wait for DAD to complete
+ns	while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done

 test	SLAAC: prefix
-nsout	ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__IFNAME__").addr_info[] | select(.protocol == "kernel_ra") | .local + "/" + (.prefixlen | tostring)] | .[0]'
+nsout	ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__IFNAME__").addr_info[] | select(.scope == "global" and .protocol == "kernel_ra") | .local + "/" + (.prefixlen | tostring)] | .[0]'
 nsout	PREFIX6 sipcalc __ADDR6__ | grep prefix | cut -d' ' -f4
 hout	HOST_ADDR6 ip -j -6 addr show|jq -rM ['.[] | select(.ifname == "__IFNAME__").addr_info[] | select(.scope == "global" and .deprecated != true).local] | .[0]'
 hout	HOST_PREFIX6 sipcalc __HOST_ADDR6__/64 | grep prefix | cut -d' ' -f4
--- a/test/pasta_podman/bats
+++ b/test/pasta_podman/bats
@ -23,4 +23,4 @@ check	[ "__PASTA_BIN__" = "__WD__/pasta" ]

 test	Podman system test with bats

-host	PODMAN="__PODMAN__" CONTAINERS_HELPER_BINARY_DIR="__WD__" taskset -c 1 bats test/podman/test/system/505-networking-pasta.bats
+host	PODMAN="__PODMAN__" CONTAINERS_HELPER_BINARY_DIR="__WD__" bats test/podman/test/system/505-networking-pasta.bats
--- a/test/perf/passt_vu_tcp
+++ b/test/perf/passt_vu_tcp
@ -38,10 +38,10 @@ hout	FREQ_PROCFS (echo "scale=1"; sed -n 's/cpu MHz.*: \([0-9]*\)\..*$/(\1+10^2\
 hout	FREQ_CPUFREQ (echo "scale=1"; printf '( %i + 10^5 / 2 ) / 10^6\n' $(cat /sys/devices/system/cpu/cpu0/cpufreq/scaling_max_freq) ) | bc -l
 hout	FREQ [ -n "__FREQ_CPUFREQ__" ] && echo __FREQ_CPUFREQ__ || echo __FREQ_PROCFS__

-set	THREADS 6
-set	TIME 2
+set	THREADS 4
+set	TIME 5
 set	OMIT 0.1
-set	OPTS -Z -P __THREADS__ -O__OMIT__ -N
+set	OPTS -Z -P __THREADS__ -l 1M -O__OMIT__ -N

 info	Throughput in Gbps, latency in µs, __THREADS__ threads at __FREQ__ GHz
 report	passt_vu tcp __THREADS__ __FREQ__
@ -55,16 +55,16 @@ iperf3s	ns 10002
 bw	-
 bw	-
 guest	ip link set dev __IFNAME__ mtu 1280
-iperf3	BW guest __MAP_NS6__ 10002 __TIME__ __OPTS__ -w 16M -l 1M
+iperf3	BW guest __MAP_NS6__ 10002 __TIME__ __OPTS__ -w 16M
 bw	__BW__ 1.2 1.5
 guest	ip link set dev __IFNAME__ mtu 1500
-iperf3	BW guest __MAP_NS6__ 10002 __TIME__ __OPTS__ -w 32M -l 1M
+iperf3	BW guest __MAP_NS6__ 10002 __TIME__ __OPTS__ -w 32M
 bw	__BW__ 1.6 1.8
 guest	ip link set dev __IFNAME__ mtu 9000
-iperf3	BW guest __MAP_NS6__ 10002 __TIME__ __OPTS__ -w 64M -l 1M
+iperf3	BW guest __MAP_NS6__ 10002 __TIME__ __OPTS__ -w 64M
 bw	__BW__ 4.0 5.0
 guest	ip link set dev __IFNAME__ mtu 65520
-iperf3	BW guest __MAP_NS6__ 10002 __TIME__ __OPTS__ -w 64M -l 1M
+iperf3	BW guest __MAP_NS6__ 10002 __TIME__ __OPTS__ -w 64M
 bw	__BW__ 7.0 8.0

 iperf3k	ns
@ -93,22 +93,22 @@ tr	TCP throughput over IPv4: guest to host
 iperf3s	ns 10002

 guest	ip link set dev __IFNAME__ mtu 256
-iperf3	BW guest __MAP_NS4__ 10002 __TIME__ __OPTS__ -w 2M -l 1M
+iperf3	BW guest __MAP_NS4__ 10002 __TIME__ __OPTS__ -w 2M
 bw	__BW__ 0.2 0.3
 guest	ip link set dev __IFNAME__ mtu 576
-iperf3	BW guest __MAP_NS4__ 10002 __TIME__ __OPTS__ -w 4M -l 1M
+iperf3	BW guest __MAP_NS4__ 10002 __TIME__ __OPTS__ -w 4M
 bw	__BW__ 0.5 0.8
 guest	ip link set dev __IFNAME__ mtu 1280
-iperf3	BW guest __MAP_NS4__ 10002 __TIME__ __OPTS__ -w 8M -l 1M
+iperf3	BW guest __MAP_NS4__ 10002 __TIME__ __OPTS__ -w 8M
 bw	__BW__ 1.2 1.5
 guest	ip link set dev __IFNAME__ mtu 1500
-iperf3	BW guest __MAP_NS4__ 10002 __TIME__ __OPTS__ -w 16M -l 1M
+iperf3	BW guest __MAP_NS4__ 10002 __TIME__ __OPTS__ -w 16M
 bw	__BW__ 1.6 1.8
 guest	ip link set dev __IFNAME__ mtu 9000
-iperf3	BW guest __MAP_NS4__ 10002 __TIME__ __OPTS__ -w 64M -l 1M
+iperf3	BW guest __MAP_NS4__ 10002 __TIME__ __OPTS__ -w 64M
 bw	__BW__ 4.0 5.0
 guest	ip link set dev __IFNAME__ mtu 65520
-iperf3	BW guest __MAP_NS4__ 10002 __TIME__ __OPTS__ -w 64M -l 1M
+iperf3	BW guest __MAP_NS4__ 10002 __TIME__ __OPTS__ -w 64M
 bw	__BW__ 7.0 8.0

 iperf3k	ns
@ -145,7 +145,7 @@ bw	-
 bw	-
 bw	-
 bw	-
-iperf3	BW ns ::1 10001 __TIME__ __OPTS__ -w 256M -l 16k
+iperf3	BW ns ::1 10001 __TIME__ __OPTS__ -w 32M
 bw	__BW__ 6.0 6.8

 iperf3k	guest
@ -181,7 +181,7 @@ bw	-
 bw	-
 bw	-
 bw	-
-iperf3	BW ns 127.0.0.1 10001 __TIME__ __OPTS__ -w 256M -l 16k
+iperf3	BW ns 127.0.0.1 10001 __TIME__ __OPTS__ -w 32M
 bw	__BW__ 6.0 6.8

 iperf3k	guest
--- a/test/perf/pasta_tcp
+++ b/test/perf/pasta_tcp
@ -211,7 +211,7 @@ tr	TCP throughput over IPv6: host to ns
 iperf3s	ns 10002

 nsout	IFNAME ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname'
-nsout	ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__IFNAME__").addr_info[] | select(.scope == "global").local] | .[0]'
+nsout	ADDR6 ip -j -6 addr show|jq -rM '.[] | select(.ifname == "__IFNAME__").addr_info[] | select(.scope == "global").local'
 bw	-
 bw	-
 bw	-
--- a/test/perf/pasta_udp
+++ b/test/perf/pasta_udp
@ -196,7 +196,7 @@ tr	UDP throughput over IPv6: host to ns
 iperf3s	ns 10002

 nsout	IFNAME ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname'
-nsout	ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__IFNAME__").addr_info[] | select(.scope == "global").local] | .[0]'
+nsout	ADDR6 ip -j -6 addr show|jq -rM '.[] | select(.ifname == "__IFNAME__").addr_info[] | select(.scope == "global").local'
 iperf3	BW host __ADDR6__ 10002 __TIME__ __OPTS__ -b 8G -l 1472
 bw	__BW__ 0.3 0.5
 iperf3	BW host __ADDR6__ 10002 __TIME__ __OPTS__ -b 12G -l 3972
--- a/test/rampstream-check.sh
+++ b/test/rampstream-check.sh
@ -1,3 +0,0 @@
-#! /bin/sh
-
-(rampstream check "$@" 2>&1; echo $? > rampstream.status) | tee rampstream.err
--- a/test/rampstream.c
+++ b/test/rampstream.c
@ -1,143 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-
-/* rampstream - Generate a check and stream of bytes in a ramp pattern
- *
- * Copyright Red Hat
- * Author: David Gibson <david@gibson.dropbear.id.au>
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <stdint.h>
-#include <sys/types.h>
-#include <unistd.h>
-#include <errno.h>
-#include <string.h>
-
-/* Length of the repeating ramp.  This is a deliberately not a "round" number so
- * that we're very likely to misalign with likely block or chunk sizes of the
- * transport.  That means we'll detect gaps in the stream, even if they occur
- * neatly on block boundaries.  Specifically this is the largest 8-bit prime. */
-#define RAMPLEN		251
-
-#define INTERVAL	10000
-
-#define	ARRAY_SIZE(a)	((int)(sizeof(a) / sizeof((a)[0])))
-
-#define die(...)						\
-	do {							\
-		fprintf(stderr, "rampstream: " __VA_ARGS__);	\
-		exit(1);					\
-	} while (0)
-
-static void usage(void)
-{
-	die("Usage:\n"
-	    "  rampstream send <number>\n"
-	    "    Generate a ramp pattern of bytes on stdout, repeated <number>\n"
-	    "    times\n"
-	    "  rampstream check <number>\n"
-	    "    Check a ramp pattern of bytes on stdin, repeater <number>\n"
-	    "    times\n");
-}
-
-static void ramp_send(unsigned long long num, const uint8_t *ramp)
-{
-	unsigned long long i;
-
-	for (i = 0; i < num; i++) {
-		int off = 0;
-		ssize_t rc;
-
-		if (i % INTERVAL == 0)
-			fprintf(stderr, "%llu...\r", i);
-
-		while (off < RAMPLEN) {
-			rc = write(1, ramp + off, RAMPLEN - off);
-			if (rc < 0) {
-				if (errno == EINTR ||
-				    errno == EAGAIN ||
-				    errno == EWOULDBLOCK)
-					continue;
-				die("Error writing ramp: %s\n",
-				    strerror(errno));
-			}
-			if (rc == 0)
-				die("Zero length write\n");
-			off += rc;
-		}
-	}
-}
-
-static void ramp_check(unsigned long long num, const uint8_t *ramp)
-{
-	unsigned long long i;
-
-	for (i = 0; i < num; i++) {
-		uint8_t buf[RAMPLEN];
-		int off = 0;
-		ssize_t rc;
-
-		if (i % INTERVAL == 0)
-			fprintf(stderr, "%llu...\r", i);
-		
-		while (off < RAMPLEN) {
-			rc = read(0, buf + off, RAMPLEN - off);
-			if (rc < 0) {
-				if (errno == EINTR ||
-				    errno == EAGAIN ||
-				    errno == EWOULDBLOCK)
-					continue;
-				die("Error reading ramp: %s\n",
-				    strerror(errno));
-			}
-			if (rc == 0)
-				die("Unexpected EOF, ramp %llu, byte %d\n",
-				    i, off);
-			off += rc;
-		}
-
-		if (memcmp(buf, ramp, sizeof(buf)) != 0) {
-			int j, k;
-
-			for (j = 0; j < RAMPLEN; j++)
-				if (buf[j] != ramp[j])
-					break;
-			for (k = j; k < RAMPLEN && k < j + 16; k++)
-				fprintf(stderr,
-					"Byte %d: expected 0x%02x, got 0x%02x\n",
-					k, ramp[k], buf[k]);
-			die("Data mismatch, ramp %llu, byte %d\n", i, j);
-		}
-	}
-}
-
-int main(int argc, char *argv[])
-{
-	const char *subcmd = argv[1];
-	unsigned long long num;
-	uint8_t ramp[RAMPLEN];
-	char *e;
-	int i;
-
-	if (argc < 2)
-		usage();
-
-	errno = 0;
-	num = strtoull(argv[2], &e, 0);
-	if (*e || errno)
-		usage();
-
-	/* Initialize the ramp block */
-	for (i = 0; i < RAMPLEN; i++)
-		ramp[i] = i;
-
-	if (strcmp(subcmd, "send") == 0)
-		ramp_send(num, ramp);
-	else if (strcmp(subcmd, "check") == 0)
-		ramp_check(num, ramp);
-	else
-		usage();
-
-	exit(0);
-}
--- a/test/run
+++ b/test/run
@ -130,43 +130,6 @@ run() {
 	test two_guests_vu/basic
 	teardown two_guests

-	setup migrate
-	test migrate/basic
-	teardown migrate
-	setup migrate
-	test migrate/basic_fin
-	teardown migrate
-	setup migrate
-	test migrate/bidirectional
-	teardown migrate
-	setup migrate
-	test migrate/bidirectional_fin
-	teardown migrate
-	setup migrate
-	test migrate/iperf3_out4
-	teardown migrate
-	setup migrate
-	test migrate/iperf3_out6
-	teardown migrate
-	setup migrate
-	test migrate/iperf3_in4
-	teardown migrate
-	setup migrate
-	test migrate/iperf3_in6
-	teardown migrate
-	setup migrate
-	test migrate/iperf3_bidir6
-	teardown migrate
-	setup migrate
-	test migrate/iperf3_many_out6
-	teardown migrate
-	setup migrate
-	test migrate/rampstream_in
-	teardown migrate
-	setup migrate
-	test migrate/rampstream_out
-	teardown migrate
-
 	VALGRIND=0
 	VHOST_USER=0
 	setup passt_in_ns
@ -223,10 +186,7 @@ run_selected() {

 	__setup=
 	for __test; do
-		# HACK: the migrate tests need the setup repeated for
-		#       each test
-		if [ "${__test%%/*}" != "${__setup}" -o		\
-		     "${__test%%/*}" = "migrate" ]; then
+		if [ "${__test%%/*}" != "${__setup}" ]; then
 			[ -n "${__setup}" ] && teardown "${__setup}"
 			__setup="${__test%%/*}"
 			setup "${__setup}"
--- a/udp.c
+++ b/udp.c
@ -39,30 +39,27 @@
 * could receive packets from multiple flows, so we use a hash table match to
 * find the specific flow for a datagram.
 *
- * Flow sockets
- * ============
- *
- * When a UDP flow targets a socket, we create a "flow" socket in
- * uflow->s[TGTSIDE] both to deliver datagrams to the target side and receive
- * replies on the target side.  This socket is both bound and connected and has
- * EPOLL_TYPE_UDP.  The connect() means it will only receive datagrams
- * associated with this flow, so the epoll reference directly points to the flow
- * and we don't need a hash lookup.
- *
- * When a flow is initiated from a listening socket, we create a "flow" socket
- * with the same bound address as the listening socket, but also connect()ed to
- * the flow's peer.  This is stored in uflow->s[INISIDE] and will last for the
+ * When a UDP flow is initiated from a listening socket we take a duplicate of
+ * the socket and store it in uflow->s[INISIDE].  This will last for the
 * lifetime of the flow, even if the original listening socket is closed due to
 * port auto-probing.  The duplicate is used to deliver replies back to the
 * originating side.
 *
- * NOTE: A flow socket can have a bound address overlapping with a listening
- * socket.  That will happen naturally for flows initiated from a socket, but is
- * also possible (though unlikely) for tap initiated flows, depending on the
- * source port.  We assume datagrams for the flow will come to a connect()ed
- * socket in preference to a listening socket.  The sample program
- * doc/platform-requirements/reuseaddr-priority.c documents and tests that
- * assumption.
+ * Reply sockets
+ * =============
+ *
+ * When a UDP flow targets a socket, we create a "reply" socket in
+ * uflow->s[TGTSIDE] both to deliver datagrams to the target side and receive
+ * replies on the target side.  This socket is both bound and connected and has
+ * EPOLL_TYPE_UDP_REPLY.  The connect() means it will only receive datagrams
+ * associated with this flow, so the epoll reference directly points to the flow
+ * and we don't need a hash lookup.
+ *
+ * NOTE: it's possible that the reply socket could have a bound address
+ * overlapping with an unrelated listening socket.  We assume datagrams for the
+ * flow will come to the reply socket in preference to a listening socket.  The
+ * sample program doc/platform-requirements/reuseaddr-priority.c documents and
+ * tests that assumption.
 *
 * "Spliced" flows
 * ===============
@ -74,7 +71,8 @@
 * actually used; it doesn't make sense for datagrams and instead a pair of
 * recvmmsg() and sendmmsg() is used to forward the datagrams.
 *
- * Note that a spliced flow will have two flow sockets (see above).
+ * Note that a spliced flow will have *both* a duplicated listening socket and a
+ * reply socket (see above).
 */

 #include <sched.h>
@ -89,8 +87,6 @@
 #include <netinet/in.h>
 #include <netinet/ip.h>
 #include <netinet/udp.h>
-#include <netinet/ip_icmp.h>
-#include <netinet/icmp6.h>
 #include <stdint.h>
 #include <stddef.h>
 #include <string.h>
@ -116,14 +112,6 @@
 #include "udp_internal.h"
 #include "udp_vu.h"

-#define UDP_MAX_FRAMES		32  /* max # of frames to receive at once */
-
-/* Maximum UDP data to be returned in ICMP messages */
-#define ICMP4_MAX_DLEN 8
-#define ICMP6_MAX_DLEN (IPV6_MIN_MTU			\
-			- sizeof(struct udphdr)	\
-			- sizeof(struct ipv6hdr))
-
 /* "Spliced" sockets indexed by bound port (host order) */
 static int udp_splice_ns  [IP_VERSIONS][NUM_PORTS];
 static int udp_splice_init[IP_VERSIONS][NUM_PORTS];
@ -140,15 +128,20 @@ static struct ethhdr udp4_eth_hdr;
 static struct ethhdr udp6_eth_hdr;

 /**
- * struct udp_meta_t - Pre-cooked headers for UDP packets
+ * struct udp_meta_t - Pre-cooked headers and metadata for UDP packets
 * @ip6h:	Pre-filled IPv6 header (except for payload_len and addresses)
 * @ip4h:	Pre-filled IPv4 header (except for tot_len and saddr)
 * @taph:	Tap backend specific header
+ * @s_in:	Source socket address, filled in by recvmmsg()
+ * @tosidx:	sidx for the destination side of this datagram's flow
 */
 static struct udp_meta_t {
 	struct ipv6hdr ip6h;
 	struct iphdr ip4h;
 	struct tap_hdr taph;
+
+	union sockaddr_inany s_in;
+	flow_sidx_t tosidx;
 }
 #ifdef __AVX2__
 __attribute__ ((aligned(32)))
@ -231,6 +224,8 @@ static void udp_iov_init_one(const struct ctx *c, size_t i)
 	tiov[UDP_IOV_TAP] = tap_hdr_iov(c, &meta->taph);
 	tiov[UDP_IOV_PAYLOAD].iov_base = payload;

+	mh->msg_name	= &meta->s_in;
+	mh->msg_namelen	= sizeof(meta->s_in);
 	mh->msg_iov	= siov;
 	mh->msg_iovlen	= 1;
 }
@ -250,6 +245,41 @@ static void udp_iov_init(const struct ctx *c)
 		udp_iov_init_one(c, i);
 }

+/**
+ * udp_splice_prepare() - Prepare one datagram for splicing
+ * @mmh:	Receiving mmsghdr array
+ * @idx:	Index of the datagram to prepare
+ */
+static void udp_splice_prepare(struct mmsghdr *mmh, unsigned idx)
+{
+	udp_mh_splice[idx].msg_hdr.msg_iov->iov_len = mmh[idx].msg_len;
+}
+
+/**
+ * udp_splice_send() - Send a batch of datagrams from socket to socket
+ * @c:		Execution context
+ * @start:	Index of batch's first datagram in udp[46]_l2_buf
+ * @n:		Number of datagrams in batch
+ * @src:	Source port for datagram (target side)
+ * @dst:	Destination port for datagrams (target side)
+ * @ref:	epoll reference for origin socket
+ * @now:	Timestamp
+ */
+static void udp_splice_send(const struct ctx *c, size_t start, size_t n,
+			    flow_sidx_t tosidx)
+{
+	const struct flowside *toside = flowside_at_sidx(tosidx);
+	const struct udp_flow *uflow = udp_at_sidx(tosidx);
+	uint8_t topif = pif_at_sidx(tosidx);
+	int s = uflow->s[tosidx.sidei];
+	socklen_t sl;
+
+	pif_sockaddr(c, &udp_splice_to, &sl, topif,
+		     &toside->eaddr, toside->eport);
+
+	sendmmsg(s, udp_mh_splice + start, n, MSG_NOSIGNAL);
+}
+
 /**
 * udp_update_hdr4() - Update headers for one IPv4 datagram
 * @ip4h:		Pre-filled IPv4 header (except for tot_len and saddr)
@ -286,8 +316,7 @@ size_t udp_update_hdr4(struct iphdr *ip4h, struct udp_payload_t *bp,
 			.iov_base = bp->data,
 			.iov_len = dlen
 		};
-		struct iov_tail data = IOV_TAIL(&iov, 1, 0);
-		csum_udp4(&bp->uh, *src, *dst, &data);
+		csum_udp4(&bp->uh, *src, *dst, &iov, 1, 0);
 	}

 	return l4len;
@ -331,8 +360,8 @@ size_t udp_update_hdr6(struct ipv6hdr *ip6h, struct udp_payload_t *bp,
 			.iov_base = bp->data,
 			.iov_len = dlen
 		};
-		struct iov_tail data = IOV_TAIL(&iov, 1, 0);
-		csum_udp6(&bp->uh, &toside->oaddr.a6, &toside->eaddr.a6, &data);
+		csum_udp6(&bp->uh, &toside->oaddr.a6, &toside->eaddr.a6,
+			  &iov, 1, 0);
 	}

 	return l4len;
@ -372,122 +401,25 @@ static void udp_tap_prepare(const struct mmsghdr *mmh,
 	(*tap_iov)[UDP_IOV_PAYLOAD].iov_len = l4len;
 }

-/**
- * udp_send_tap_icmp4() - Construct and send ICMPv4 to local peer
- * @c:		Execution context
- * @ee:	Extended error descriptor
- * @toside:	Destination side of flow
- * @saddr:	Address of ICMP generating node
- * @in:	First bytes (max 8) of original UDP message body
- * @dlen:	Length of the read part of original UDP message body
- */
-static void udp_send_tap_icmp4(const struct ctx *c,
-			       const struct sock_extended_err *ee,
-			       const struct flowside *toside,
-			       struct in_addr saddr,
-			       const void *in, size_t dlen)
-{
-	struct in_addr oaddr = toside->oaddr.v4mapped.a4;
-	struct in_addr eaddr = toside->eaddr.v4mapped.a4;
-	in_port_t eport = toside->eport;
-	in_port_t oport = toside->oport;
-	struct {
-		struct icmphdr icmp4h;
-		struct iphdr ip4h;
-		struct udphdr uh;
-		char data[ICMP4_MAX_DLEN];
-	} __attribute__((packed, aligned(__alignof__(max_align_t)))) msg;
-	size_t msglen = sizeof(msg) - sizeof(msg.data) + dlen;
-	size_t l4len = dlen + sizeof(struct udphdr);
-
-	ASSERT(dlen <= ICMP4_MAX_DLEN);
-	memset(&msg, 0, sizeof(msg));
-	msg.icmp4h.type = ee->ee_type;
-	msg.icmp4h.code = ee->ee_code;
-	if (ee->ee_type == ICMP_DEST_UNREACH && ee->ee_code == ICMP_FRAG_NEEDED)
-		msg.icmp4h.un.frag.mtu = htons((uint16_t) ee->ee_info);
-
-	/* Reconstruct the original headers as returned in the ICMP message */
-	tap_push_ip4h(&msg.ip4h, eaddr, oaddr, l4len, IPPROTO_UDP);
-	tap_push_uh4(&msg.uh, eaddr, eport, oaddr, oport, in, dlen);
-	memcpy(&msg.data, in, dlen);
-
-	tap_icmp4_send(c, saddr, eaddr, &msg, msglen);
-}
-
-
-/**
- * udp_send_tap_icmp6() - Construct and send ICMPv6 to local peer
- * @c:		Execution context
- * @ee:	Extended error descriptor
- * @toside:	Destination side of flow
- * @saddr:	Address of ICMP generating node
- * @in:	First bytes (max 1232) of original UDP message body
- * @dlen:	Length of the read part of original UDP message body
- * @flow:	IPv6 flow identifier
- */
-static void udp_send_tap_icmp6(const struct ctx *c,
-			       const struct sock_extended_err *ee,
-			       const struct flowside *toside,
-			       const struct in6_addr *saddr,
-			       void *in, size_t dlen, uint32_t flow)
-{
-	const struct in6_addr *oaddr = &toside->oaddr.a6;
-	const struct in6_addr *eaddr = &toside->eaddr.a6;
-	in_port_t eport = toside->eport;
-	in_port_t oport = toside->oport;
-	struct {
-		struct icmp6_hdr icmp6h;
-		struct ipv6hdr ip6h;
-		struct udphdr uh;
-		char data[ICMP6_MAX_DLEN];
-	} __attribute__((packed, aligned(__alignof__(max_align_t)))) msg;
-	size_t msglen = sizeof(msg) - sizeof(msg.data) + dlen;
-	size_t l4len = dlen + sizeof(struct udphdr);
-
-	ASSERT(dlen <= ICMP6_MAX_DLEN);
-	memset(&msg, 0, sizeof(msg));
-	msg.icmp6h.icmp6_type = ee->ee_type;
-	msg.icmp6h.icmp6_code = ee->ee_code;
-	if (ee->ee_type == ICMP6_PACKET_TOO_BIG)
-		msg.icmp6h.icmp6_dataun.icmp6_un_data32[0] = htonl(ee->ee_info);
-
-	/* Reconstruct the original headers as returned in the ICMP message */
-	tap_push_ip6h(&msg.ip6h, eaddr, oaddr, l4len, IPPROTO_UDP, flow);
-	tap_push_uh6(&msg.uh, eaddr, eport, oaddr, oport, in, dlen);
-	memcpy(&msg.data, in, dlen);
-
-	tap_icmp6_send(c, saddr, eaddr, &msg, msglen);
-}
-
 /**
 * udp_sock_recverr() - Receive and clear an error from a socket
- * @c:		Execution context
- * @ref:	epoll reference
+ * @s:		Socket to receive from
 *
 * Return: 1 if error received and processed, 0 if no more errors in queue, < 0
 *         if there was an error reading the queue
 *
 * #syscalls recvmsg
 */
-static int udp_sock_recverr(const struct ctx *c, union epoll_ref ref)
+static int udp_sock_recverr(int s)
 {
-	struct errhdr {
-		struct sock_extended_err ee;
-		union sockaddr_inany saddr;
-	};
-	const struct errhdr *eh;
+	const struct sock_extended_err *ee;
 	const struct cmsghdr *hdr;
-	char buf[CMSG_SPACE(sizeof(struct errhdr))];
-	char data[ICMP6_MAX_DLEN];
-	int s = ref.fd;
-	struct iovec iov = {
-		.iov_base = data,
-		.iov_len = sizeof(data)
-	};
+	char buf[CMSG_SPACE(sizeof(*ee))];
 	struct msghdr mh = {
-		.msg_iov = &iov,
-		.msg_iovlen = 1,
+		.msg_name = NULL,
+		.msg_namelen = 0,
+		.msg_iov = NULL,
+		.msg_iovlen = 0,
 		.msg_control = buf,
 		.msg_controllen = sizeof(buf),
 	};
@ -516,29 +448,11 @@ static int udp_sock_recverr(const struct ctx *c, union epoll_ref ref)
 		return -1;
 	}

-	eh = (const struct errhdr *)CMSG_DATA(hdr);
-	if (ref.type == EPOLL_TYPE_UDP) {
-		flow_sidx_t sidx = flow_sidx_opposite(ref.flowside);
-		const struct flowside *toside = flowside_at_sidx(sidx);
-		size_t dlen = rc;
+	ee = (const struct sock_extended_err *)CMSG_DATA(hdr);

-		if (pif_is_socket(pif_at_sidx(sidx))) {
-			/* XXX Is there any way to propagate ICMPs from socket
-			 * to socket? */
-		} else if (hdr->cmsg_level == IPPROTO_IP) {
-			dlen = MIN(dlen, ICMP4_MAX_DLEN);
-			udp_send_tap_icmp4(c, &eh->ee, toside,
-					   eh->saddr.sa4.sin_addr, data, dlen);
-		} else if (hdr->cmsg_level == IPPROTO_IPV6) {
-			udp_send_tap_icmp6(c, &eh->ee, toside,
-					   &eh->saddr.sa6.sin6_addr, data,
-					   dlen, sidx.flowi);
-		}
-	} else {
-		trace("Ignoring received IP_RECVERR cmsg on listener socket");
-	}
+	/* TODO: When possible propagate and otherwise handle errors */
 	debug("%s error on UDP socket %i: %s",
-	      str_ee_origin(&eh->ee), s, strerror_(eh->ee.ee_errno));
+	      str_ee_origin(ee), s, strerror(ee->ee_errno));

 	return 1;
 }
@ -546,21 +460,24 @@ static int udp_sock_recverr(const struct ctx *c, union epoll_ref ref)
 /**
 * udp_sock_errs() - Process errors on a socket
 * @c:		Execution context
- * @ref:	epoll reference
+ * @s:		Socket to receive from
+ * @events:	epoll events bitmap
 *
 * Return: Number of errors handled, or < 0 if we have an unrecoverable error
 */
-static int udp_sock_errs(const struct ctx *c, union epoll_ref ref)
+int udp_sock_errs(const struct ctx *c, int s, uint32_t events)
 {
 	unsigned n_err = 0;
 	socklen_t errlen;
-	int s = ref.fd;
 	int rc, err;

 	ASSERT(!c->no_udp);

+	if (!(events & EPOLLERR))
+		return 0; /* Nothing to do */
+
 	/* Empty the error queue */
-	while ((rc = udp_sock_recverr(c, ref)) > 0)
+	while ((rc = udp_sock_recverr(s)) > 0)
 		n_err += rc;

 	if (rc < 0)
@ -574,7 +491,7 @@ static int udp_sock_errs(const struct ctx *c, union epoll_ref ref)
 	}

 	if (err) {
-		debug("Unqueued error on UDP socket %i: %s", s, strerror_(err));
+		debug("Unqueued error on UDP socket %i: %s", s, strerror(err));
 		n_err++;
 	}

@ -587,77 +504,34 @@ static int udp_sock_errs(const struct ctx *c, union epoll_ref ref)
 	return n_err;
 }

-#define PKTINFO_SPACE					\
-	MAX(CMSG_SPACE(sizeof(struct in_pktinfo)),	\
-	    CMSG_SPACE(sizeof(struct in6_pktinfo)))
-
-/**
- * udp_peek_addr() - Get source address for next packet
- * @s:		Socket to get information from
- * @src:	Socket address (output)
- * @dst:	(Local) destination address (output)
- *
- * Return: 0 on success, -1 otherwise
- */
-static int udp_peek_addr(int s, union sockaddr_inany *src,
-			 union inany_addr *dst)
-{
-	char sastr[SOCKADDR_STRLEN], dstr[INANY_ADDRSTRLEN];
-	const struct cmsghdr *hdr;
-	char cmsg[PKTINFO_SPACE];
-	struct msghdr msg = {
-		.msg_name = src,
-		.msg_namelen = sizeof(*src),
-		.msg_control = cmsg,
-		.msg_controllen = sizeof(cmsg),
-	};
-	int rc;
-
-	rc = recvmsg(s, &msg, MSG_PEEK | MSG_DONTWAIT);
-	if (rc < 0) {
-		if (errno != EAGAIN && errno != EWOULDBLOCK)
-			warn_perror("Error peeking at socket address");
-		return rc;
-	}
-
-	hdr = CMSG_FIRSTHDR(&msg);
-	if (hdr && hdr->cmsg_level == IPPROTO_IP &&
-	    hdr->cmsg_type == IP_PKTINFO) {
-		const struct in_pktinfo *info4 = (void *)CMSG_DATA(hdr);
-
-		*dst = inany_from_v4(info4->ipi_addr);
-	} else if (hdr && hdr->cmsg_level == IPPROTO_IPV6 &&
-		   hdr->cmsg_type == IPV6_PKTINFO) {
-		const struct in6_pktinfo *info6 = (void *)CMSG_DATA(hdr);
-
-		dst->a6 = info6->ipi6_addr;
-	} else {
-		debug("Unexpected cmsg on UDP datagram");
-		*dst = inany_any6;
-	}
-
-	trace("Peeked UDP datagram: %s -> %s",
-	      sockaddr_ntop(src, sastr, sizeof(sastr)),
-	      inany_ntop(dst, dstr, sizeof(dstr)));
-
-	return 0;
-}
-
 /**
 * udp_sock_recv() - Receive datagrams from a socket
 * @c:		Execution context
 * @s:		Socket to receive from
+ * @events:	epoll events bitmap
 * @mmh		mmsghdr array to receive into
- * @n:		Maximum number of datagrams to receive
 *
 * Return: Number of datagrams received
 *
 * #syscalls recvmmsg arm:recvmmsg_time64 i686:recvmmsg_time64
 */
-static int udp_sock_recv(const struct ctx *c, int s, struct mmsghdr *mmh, int n)
+static int udp_sock_recv(const struct ctx *c, int s, uint32_t events,
+			 struct mmsghdr *mmh)
 {
+	/* For not entirely clear reasons (data locality?) pasta gets better
+	 * throughput if we receive tap datagrams one at a atime.  For small
+	 * splice datagrams throughput is slightly better if we do batch, but
+	 * it's slightly worse for large splice datagrams.  Since we don't know
+	 * before we receive whether we'll use tap or splice, always go one at a
+	 * time for pasta mode.
+	 */
+	int n = (c->mode == MODE_PASTA ? 1 : UDP_MAX_FRAMES);
+
 	ASSERT(!c->no_udp);

+	if (!(events & EPOLLIN))
+		return 0;
+
 	n = recvmmsg(s, mmh, n, 0, NULL);
 	if (n < 0) {
 		err_perror("Error receiving datagrams");
@ -668,94 +542,78 @@ static int udp_sock_recv(const struct ctx *c, int s, struct mmsghdr *mmh, int n)
 }

 /**
- * udp_sock_to_sock() - Forward datagrams from socket to socket
+ * udp_buf_listen_sock_handler() - Handle new data from socket
 * @c:		Execution context
- * @from_s:	Socket to receive datagrams from
- * @n:		Maximum number of datagrams to forward
- * @tosidx:	Flow & side to forward datagrams to
+ * @ref:	epoll reference
+ * @events:	epoll events bitmap
+ * @now:	Current timestamp
 *
- * #syscalls sendmmsg
+ * #syscalls recvmmsg
 */
-static void udp_sock_to_sock(const struct ctx *c, int from_s, int n,
-			     flow_sidx_t tosidx)
+static void udp_buf_listen_sock_handler(const struct ctx *c,
+					union epoll_ref ref, uint32_t events,
+					const struct timespec *now)
 {
-	const struct flowside *toside = flowside_at_sidx(tosidx);
-	const struct udp_flow *uflow = udp_at_sidx(tosidx);
-	uint8_t topif = pif_at_sidx(tosidx);
-	int to_s = uflow->s[tosidx.sidei];
-	socklen_t sl;
-	int i;
+	const socklen_t sasize = sizeof(udp_meta[0].s_in);
+	int n, i;

-	if ((n = udp_sock_recv(c, from_s, udp_mh_recv, n)) <= 0)
+	if (udp_sock_errs(c, ref.fd, events) < 0) {
+		err("UDP: Unrecoverable error on listening socket:"
+		    " (%s port %hu)", pif_name(ref.udp.pif), ref.udp.port);
+		/* FIXME: what now?  close/re-open socket? */
 		return;
-
-	for (i = 0; i < n; i++) {
-		udp_mh_splice[i].msg_hdr.msg_iov->iov_len
-			= udp_mh_recv[i].msg_len;
 	}

-	pif_sockaddr(c, &udp_splice_to, &sl, topif,
-		     &toside->eaddr, toside->eport);
-
-	sendmmsg(to_s, udp_mh_splice, n, MSG_NOSIGNAL);
-}
-
-/**
- * udp_buf_sock_to_tap() - Forward datagrams from socket to tap
- * @c:		Execution context
- * @s:		Socket to read data from
- * @n:		Maximum number of datagrams to forward
- * @tosidx:	Flow & side to forward data from @s to
- */
-static void udp_buf_sock_to_tap(const struct ctx *c, int s, int n,
-				flow_sidx_t tosidx)
-{
-	const struct flowside *toside = flowside_at_sidx(tosidx);
-	int i;
-
-	if ((n = udp_sock_recv(c, s, udp_mh_recv, n)) <= 0)
+	if ((n = udp_sock_recv(c, ref.fd, events, udp_mh_recv)) <= 0)
 		return;

-	for (i = 0; i < n; i++)
-		udp_tap_prepare(udp_mh_recv, i, toside, false);
+	/* We divide datagrams into batches based on how we need to send them,
+	 * determined by udp_meta[i].tosidx.  To avoid either two passes through
+	 * the array, or recalculating tosidx for a single entry, we have to
+	 * populate it one entry *ahead* of the loop counter.
+	 */
+	udp_meta[0].tosidx = udp_flow_from_sock(c, ref, &udp_meta[0].s_in, now);
+	udp_mh_recv[0].msg_hdr.msg_namelen = sasize;
+	for (i = 0; i < n; ) {
+		flow_sidx_t batchsidx = udp_meta[i].tosidx;
+		uint8_t batchpif = pif_at_sidx(batchsidx);
+		int batchstart = i;

-	tap_send_frames(c, &udp_l2_iov[0][0], UDP_NUM_IOVS, n);
-}
+		do {
+			if (pif_is_socket(batchpif)) {
+				udp_splice_prepare(udp_mh_recv, i);
+			} else if (batchpif == PIF_TAP) {
+				udp_tap_prepare(udp_mh_recv, i,
+						flowside_at_sidx(batchsidx),
+						false);
+			}

-/**
- * udp_sock_fwd() - Forward datagrams from a possibly unconnected socket
- * @c:		Execution context
- * @s:		Socket to forward from
- * @frompif:	Interface to which @s belongs
- * @port:	Our (local) port number of @s
- * @now:	Current timestamp
- */
-void udp_sock_fwd(const struct ctx *c, int s, uint8_t frompif,
-		  in_port_t port, const struct timespec *now)
-{
-	union sockaddr_inany src;
-	union inany_addr dst;
+			if (++i >= n)
+				break;

-	while (udp_peek_addr(s, &src, &dst) == 0) {
-		flow_sidx_t tosidx = udp_flow_from_sock(c, frompif,
-							&dst, port, &src, now);
-		uint8_t topif = pif_at_sidx(tosidx);
+			udp_meta[i].tosidx = udp_flow_from_sock(c, ref,
+								&udp_meta[i].s_in,
+								now);
+			udp_mh_recv[i].msg_hdr.msg_namelen = sasize;
+		} while (flow_sidx_eq(udp_meta[i].tosidx, batchsidx));

-		if (pif_is_socket(topif)) {
-			udp_sock_to_sock(c, s, 1, tosidx);
-		} else if (topif == PIF_TAP) {
-			if (c->mode == MODE_VU)
-				udp_vu_sock_to_tap(c, s, 1, tosidx);
-			else
-				udp_buf_sock_to_tap(c, s, 1, tosidx);
-		} else if (flow_sidx_valid(tosidx)) {
-			struct udp_flow *uflow = udp_at_sidx(tosidx);
+		if (pif_is_socket(batchpif)) {
+			udp_splice_send(c, batchstart, i - batchstart,
+					batchsidx);
+		} else if (batchpif == PIF_TAP) {
+			tap_send_frames(c, &udp_l2_iov[batchstart][0],
+					UDP_NUM_IOVS, i - batchstart);
+		} else if (flow_sidx_valid(batchsidx)) {
+			flow_sidx_t fromsidx = flow_sidx_opposite(batchsidx);
+			struct udp_flow *uflow = udp_at_sidx(batchsidx);

 			flow_err(uflow,
 				 "No support for forwarding UDP from %s to %s",
-				 pif_name(frompif), pif_name(topif));
+				 pif_name(pif_at_sidx(fromsidx)),
+				 pif_name(batchpif));
 		} else {
-			debug("Discarding datagram without flow");
+			debug("Discarding %d datagrams without flow",
+			      i - batchstart);
 		}
 	}
 }
@ -771,78 +629,87 @@ void udp_listen_sock_handler(const struct ctx *c,
 			     union epoll_ref ref, uint32_t events,
 			     const struct timespec *now)
 {
-	if (events & EPOLLERR) {
-		if (udp_sock_errs(c, ref) < 0) {
-			err("UDP: Unrecoverable error on listening socket:"
-			    " (%s port %hu)", pif_name(ref.udp.pif), ref.udp.port);
-			/* FIXME: what now?  close/re-open socket? */
-			return;
-		}
+	if (c->mode == MODE_VU) {
+		udp_vu_listen_sock_handler(c, ref, events, now);
+		return;
 	}

-	if (events & EPOLLIN)
-		udp_sock_fwd(c, ref.fd, ref.udp.pif, ref.udp.port, now);
+	udp_buf_listen_sock_handler(c, ref, events, now);
 }

 /**
- * udp_sock_handler() - Handle new data from flow specific socket
+ * udp_buf_reply_sock_handler() - Handle new data from flow specific socket
+ * @c:		Execution context
+ * @ref:	epoll reference
+ * @events:	epoll events bitmap
+ * @now:	Current timestamp
+ *
+ * #syscalls recvmmsg
+ */
+static void udp_buf_reply_sock_handler(const struct ctx *c, union epoll_ref ref,
+				       uint32_t events,
+				       const struct timespec *now)
+{
+	flow_sidx_t tosidx = flow_sidx_opposite(ref.flowside);
+	const struct flowside *toside = flowside_at_sidx(tosidx);
+	struct udp_flow *uflow = udp_at_sidx(ref.flowside);
+	uint8_t topif = pif_at_sidx(tosidx);
+	int n, i, from_s;
+
+	ASSERT(!c->no_udp && uflow);
+
+	from_s = uflow->s[ref.flowside.sidei];
+
+	if (udp_sock_errs(c, from_s, events) < 0) {
+		flow_err(uflow, "Unrecoverable error on reply socket");
+		flow_err_details(uflow);
+		udp_flow_close(c, uflow);
+		return;
+	}
+
+	if ((n = udp_sock_recv(c, from_s, events, udp_mh_recv)) <= 0)
+		return;
+
+	flow_trace(uflow, "Received %d datagrams on reply socket", n);
+	uflow->ts = now->tv_sec;
+
+	for (i = 0; i < n; i++) {
+		if (pif_is_socket(topif))
+			udp_splice_prepare(udp_mh_recv, i);
+		else if (topif == PIF_TAP)
+			udp_tap_prepare(udp_mh_recv, i, toside, false);
+		/* Restore sockaddr length clobbered by recvmsg() */
+		udp_mh_recv[i].msg_hdr.msg_namelen = sizeof(udp_meta[i].s_in);
+	}
+
+	if (pif_is_socket(topif)) {
+		udp_splice_send(c, 0, n, tosidx);
+	} else if (topif == PIF_TAP) {
+		tap_send_frames(c, &udp_l2_iov[0][0], UDP_NUM_IOVS, n);
+	} else {
+		uint8_t frompif = pif_at_sidx(ref.flowside);
+
+		flow_err(uflow, "No support for forwarding UDP from %s to %s",
+			 pif_name(frompif), pif_name(topif));
+	}
+}
+
+/**
+ * udp_reply_sock_handler() - Handle new data from flow specific socket
 * @c:		Execution context
 * @ref:	epoll reference
 * @events:	epoll events bitmap
 * @now:	Current timestamp
 */
-void udp_sock_handler(const struct ctx *c, union epoll_ref ref,
-		      uint32_t events, const struct timespec *now)
+void udp_reply_sock_handler(const struct ctx *c, union epoll_ref ref,
+			    uint32_t events, const struct timespec *now)
 {
-	struct udp_flow *uflow = udp_at_sidx(ref.flowside);
-
-	ASSERT(!c->no_udp && uflow);
-
-	if (events & EPOLLERR) {
-		if (udp_sock_errs(c, ref) < 0) {
-			flow_err(uflow, "Unrecoverable error on flow socket");
-			goto fail;
-		}
+	if (c->mode == MODE_VU) {
+		udp_vu_reply_sock_handler(c, ref, events, now);
+		return;
 	}

-	if (events & EPOLLIN) {
-		/* For not entirely clear reasons (data locality?) pasta gets
-		 * better throughput if we receive tap datagrams one at a
-		 * time.  For small splice datagrams throughput is slightly
-		 * better if we do batch, but it's slightly worse for large
-		 * splice datagrams.  Since we don't know the size before we
-		 * receive, always go one at a time for pasta mode.
-		 */
-		size_t n = (c->mode == MODE_PASTA ? 1 : UDP_MAX_FRAMES);
-		flow_sidx_t tosidx = flow_sidx_opposite(ref.flowside);
-		uint8_t topif = pif_at_sidx(tosidx);
-		int s = ref.fd;
-
-		flow_trace(uflow, "Received data on reply socket");
-		uflow->ts = now->tv_sec;
-
-		if (pif_is_socket(topif)) {
-			udp_sock_to_sock(c, ref.fd, n, tosidx);
-		} else if (topif == PIF_TAP) {
-			if (c->mode == MODE_VU) {
-				udp_vu_sock_to_tap(c, s, UDP_MAX_FRAMES,
-						   tosidx);
-			} else {
-				udp_buf_sock_to_tap(c, s, n, tosidx);
-			}
-		} else {
-			flow_err(uflow,
-				 "No support for forwarding UDP from %s to %s",
-				 pif_name(pif_at_sidx(ref.flowside)),
-				 pif_name(topif));
-			goto fail;
-		}
-	}
-	return;
-
-fail:
-	flow_err_details(uflow);
-	udp_flow_close(c, uflow);
+	udp_buf_reply_sock_handler(c, ref, events, now);
 }

 /**
@ -852,7 +719,6 @@ fail:
 * @af:		Address family, AF_INET or AF_INET6
 * @saddr:	Source address
 * @daddr:	Destination address
- * @ttl:	TTL or hop limit for packets to be sent in this call
 * @p:		Pool of UDP packets, with UDP headers
 * @idx:	Index of first packet to process
 * @now:	Current timestamp
@ -863,8 +729,7 @@ fail:
 */
 int udp_tap_handler(const struct ctx *c, uint8_t pif,
 		    sa_family_t af, const void *saddr, const void *daddr,
-		    uint8_t ttl, const struct pool *p, int idx,
-		    const struct timespec *now)
+		    const struct pool *p, int idx, const struct timespec *now)
 {
 	const struct flowside *toside;
 	struct mmsghdr mm[UIO_MAXIOV];
@ -912,7 +777,7 @@ int udp_tap_handler(const struct ctx *c, uint8_t pif,
 	}
 	toside = flowside_at_sidx(tosidx);

-	s = uflow->s[tosidx.sidei];
+	s = udp_at_sidx(tosidx)->s[tosidx.sidei];
 	ASSERT(s >= 0);

 	pif_sockaddr(c, &to_sa, &sl, topif, &toside->eaddr, toside->eport);
@ -943,24 +808,6 @@ int udp_tap_handler(const struct ctx *c, uint8_t pif,
 		mm[i].msg_hdr.msg_controllen = 0;
 		mm[i].msg_hdr.msg_flags = 0;

-		if (ttl != uflow->ttl[tosidx.sidei]) {
-			uflow->ttl[tosidx.sidei] = ttl;
-			if (af == AF_INET) {
-				if (setsockopt(s, IPPROTO_IP, IP_TTL,
-					       &ttl, sizeof(ttl)) < 0)
-					flow_perror(uflow,
-						    "setsockopt IP_TTL");
-			} else {
-				/* IPv6 hop_limit cannot be only 1 byte */
-				int hop_limit = ttl;
-
-				if (setsockopt(s, SOL_IPV6, IPV6_UNICAST_HOPS,
-					       &hop_limit, sizeof(hop_limit)) < 0)
-					flow_perror(uflow,
-						    "setsockopt IPV6_UNICAST_HOPS");
-			}
-		}
-
 		count++;
 	}

--- a/udp.h
+++ b/udp.h
@ -11,12 +11,11 @@
 void udp_portmap_clear(void);
 void udp_listen_sock_handler(const struct ctx *c, union epoll_ref ref,
 			     uint32_t events, const struct timespec *now);
-void udp_sock_handler(const struct ctx *c, union epoll_ref ref,
-		      uint32_t events, const struct timespec *now);
+void udp_reply_sock_handler(const struct ctx *c, union epoll_ref ref,
+			    uint32_t events, const struct timespec *now);
 int udp_tap_handler(const struct ctx *c, uint8_t pif,
 		    sa_family_t af, const void *saddr, const void *daddr,
-		    uint8_t ttl, const struct pool *p, int idx,
-		    const struct timespec *now);
+		    const struct pool *p, int idx, const struct timespec *now);
 int udp_sock_init(const struct ctx *c, int ns, const union inany_addr *addr,
 		  const char *ifname, in_port_t port);
 int udp_init(struct ctx *c);
--- a/udp_flow.c
+++ b/udp_flow.c
@ -9,12 +9,10 @@
 #include <fcntl.h>
 #include <sys/uio.h>
 #include <unistd.h>
-#include <netinet/udp.h>

 #include "util.h"
 #include "passt.h"
 #include "flow_table.h"
-#include "udp_internal.h"

 #define UDP_CONN_TIMEOUT	180 /* s, timeout for ephemeral or local bind */

@ -43,141 +41,127 @@ struct udp_flow *udp_at_sidx(flow_sidx_t sidx)
 */
 void udp_flow_close(const struct ctx *c, struct udp_flow *uflow)
 {
-	unsigned sidei;
-
 	if (uflow->closed)
 		return; /* Nothing to do */

-	flow_foreach_sidei(sidei) {
-		flow_hash_remove(c, FLOW_SIDX(uflow, sidei));
-		if (uflow->s[sidei] >= 0) {
-			epoll_del(c, uflow->s[sidei]);
-			close(uflow->s[sidei]);
-			uflow->s[sidei] = -1;
-		}
+	if (uflow->s[INISIDE] >= 0) {
+		/* The listening socket needs to stay in epoll */
+		close(uflow->s[INISIDE]);
+		uflow->s[INISIDE] = -1;
 	}

+	if (uflow->s[TGTSIDE] >= 0) {
+		/* But the flow specific one needs to be removed */
+		epoll_ctl(c->epollfd, EPOLL_CTL_DEL, uflow->s[TGTSIDE], NULL);
+		close(uflow->s[TGTSIDE]);
+		uflow->s[TGTSIDE] = -1;
+	}
+	flow_hash_remove(c, FLOW_SIDX(uflow, INISIDE));
+	if (!pif_is_socket(uflow->f.pif[TGTSIDE]))
+		flow_hash_remove(c, FLOW_SIDX(uflow, TGTSIDE));
+
 	uflow->closed = true;
 }

-/**
- * udp_flow_sock() - Create, bind and connect a flow specific UDP socket
- * @c:		Execution context
- * @uflow:	UDP flow to open socket for
- * @sidei:	Side of @uflow to open socket for
- *
- * Return: fd of new socket on success, -ve error code on failure
- */
-static int udp_flow_sock(const struct ctx *c,
-			 struct udp_flow *uflow, unsigned sidei)
-{
-	const struct flowside *side = &uflow->f.side[sidei];
-	uint8_t pif = uflow->f.pif[sidei];
-	union {
-		flow_sidx_t sidx;
-		uint32_t data;
-	} fref = { .sidx = FLOW_SIDX(uflow, sidei) };
-	int s;
-
-	s = flowside_sock_l4(c, EPOLL_TYPE_UDP, pif, side, fref.data);
-	if (s < 0) {
-		flow_dbg_perror(uflow, "Couldn't open flow specific socket");
-		return s;
-	}
-
-	if (flowside_connect(c, s, pif, side) < 0) {
-		int rc = -errno;
-		flow_dbg_perror(uflow, "Couldn't connect flow socket");
-		return rc;
-	}
-
-	/* It's possible, if unlikely, that we could receive some packets in
-	 * between the bind() and connect() which may or may not be for this
-	 * flow.  Being UDP we could just discard them, but it's not ideal.
-	 *
-	 * There's also a tricky case if a bunch of datagrams for a new flow
-	 * arrive in rapid succession, the first going to the original listening
-	 * socket and later ones going to this new socket.  If we forwarded the
-	 * datagrams from the new socket immediately here they would go before
-	 * the datagram which established the flow.  Again, not strictly wrong
-	 * for UDP, but not ideal.
-	 *
-	 * So, we flag that the new socket is in a transient state where it
-	 * might have datagrams for a different flow queued.  Before the next
-	 * epoll cycle, udp_flow_defer() will flush out any such datagrams, and
-	 * thereafter everything on the new socket should be strictly for this
-	 * flow.
-	 */
-	if (sidei)
-		uflow->flush1 = true;
-	else
-		uflow->flush0 = true;
-
-	return s;
-}
-
 /**
 * udp_flow_new() - Common setup for a new UDP flow
 * @c:		Execution context
 * @flow:	Initiated flow
+ * @s_ini:	Initiating socket (or -1)
 * @now:	Timestamp
 *
 * Return: UDP specific flow, if successful, NULL on failure
- *
- * #syscalls getsockname
 */
 static flow_sidx_t udp_flow_new(const struct ctx *c, union flow *flow,
-				const struct timespec *now)
+				int s_ini, const struct timespec *now)
 {
+	const struct flowside *ini = &flow->f.side[INISIDE];
 	struct udp_flow *uflow = NULL;
 	const struct flowside *tgt;
-	unsigned sidei;
+	uint8_t tgtpif;
+
+	if (!inany_is_unicast(&ini->eaddr) || ini->eport == 0) {
+		flow_trace(flow, "Invalid endpoint to initiate UDP flow");
+		goto cancel;
+	}

 	if (!(tgt = flow_target(c, flow, IPPROTO_UDP)))
 		goto cancel;
+	tgtpif = flow->f.pif[TGTSIDE];

 	uflow = FLOW_SET_TYPE(flow, FLOW_UDP, udp);
 	uflow->ts = now->tv_sec;
 	uflow->s[INISIDE] = uflow->s[TGTSIDE] = -1;
-	uflow->ttl[INISIDE] = uflow->ttl[TGTSIDE] = 0;

-	flow_foreach_sidei(sidei) {
-		if (pif_is_socket(uflow->f.pif[sidei]))
-			if ((uflow->s[sidei] = udp_flow_sock(c, uflow, sidei)) < 0)
-				goto cancel;
-	}
-
-	if (uflow->s[TGTSIDE] >= 0 && inany_is_unspecified(&tgt->oaddr)) {
-		/* When we target a socket, we connect() it, but might not
-		 * always bind(), leaving the kernel to pick our address.  In
-		 * that case connect() will implicitly bind() the socket, but we
-		 * need to determine its local address so that we can match
-		 * reply packets back to the correct flow.  Update the flow with
-		 * the information from getsockname() */
-		union sockaddr_inany sa;
-		socklen_t sl = sizeof(sa);
-		in_port_t port;
-
-		if (getsockname(uflow->s[TGTSIDE], &sa.sa, &sl) < 0) {
-			flow_perror(uflow, "Unable to determine local address");
-			goto cancel;
-		}
-		inany_from_sockaddr(&uflow->f.side[TGTSIDE].oaddr,
-				    &port, &sa);
-		if (port != tgt->oport) {
-			flow_err(uflow, "Unexpected local port");
+	if (s_ini >= 0) {
+		/* When using auto port-scanning the listening port could go
+		 * away, so we need to duplicate the socket
+		 */
+		uflow->s[INISIDE] = fcntl(s_ini, F_DUPFD_CLOEXEC, 0);
+		if (uflow->s[INISIDE] < 0) {
+			flow_err(uflow,
+				 "Couldn't duplicate listening socket: %s",
+				 strerror(errno));
 			goto cancel;
 		}
 	}

-	/* Tap sides always need to be looked up by hash.  Socket sides don't
-	 * always, but sometimes do (receiving packets on a socket not specific
-	 * to one flow).  Unconditionally hash both sides so all our bases are
-	 * covered
+	if (pif_is_socket(tgtpif)) {
+		struct mmsghdr discard[UIO_MAXIOV] = { 0 };
+		union {
+			flow_sidx_t sidx;
+			uint32_t data;
+		} fref = {
+			.sidx = FLOW_SIDX(flow, TGTSIDE),
+		};
+		int rc;
+
+		uflow->s[TGTSIDE] = flowside_sock_l4(c, EPOLL_TYPE_UDP_REPLY,
+						     tgtpif, tgt, fref.data);
+		if (uflow->s[TGTSIDE] < 0) {
+			flow_dbg(uflow,
+				 "Couldn't open socket for spliced flow: %s",
+				 strerror(errno));
+			goto cancel;
+		}
+
+		if (flowside_connect(c, uflow->s[TGTSIDE], tgtpif, tgt) < 0) {
+			flow_dbg(uflow,
+				 "Couldn't connect flow socket: %s",
+				 strerror(errno));
+			goto cancel;
+		}
+
+		/* It's possible, if unlikely, that we could receive some
+		 * unrelated packets in between the bind() and connect() of this
+		 * socket.  For now we just discard these.  We could consider
+		 * trying to redirect these to an appropriate handler, if we
+		 * need to.
+		 */
+		rc = recvmmsg(uflow->s[TGTSIDE], discard, ARRAY_SIZE(discard),
+			      MSG_DONTWAIT, NULL);
+		if (rc >= ARRAY_SIZE(discard)) {
+			flow_dbg(uflow,
+				 "Too many (%d) spurious reply datagrams", rc);
+			goto cancel;
+		} else if (rc > 0) {
+			flow_trace(uflow,
+				   "Discarded %d spurious reply datagrams", rc);
+		} else if (errno != EAGAIN) {
+			flow_err(uflow,
+				 "Unexpected error discarding datagrams: %s",
+				 strerror(errno));
+		}
+	}
+
+	flow_hash_insert(c, FLOW_SIDX(uflow, INISIDE));
+
+	/* If the target side is a socket, it will be a reply socket that knows
+	 * its own flowside.  But if it's tap, then we need to look it up by
+	 * hash.
 	 */
-	flow_foreach_sidei(sidei)
-		flow_hash_insert(c, FLOW_SIDX(uflow, sidei));
-
+	if (!pif_is_socket(tgtpif))
+		flow_hash_insert(c, FLOW_SIDX(uflow, TGTSIDE));
 	FLOW_ACTIVATE(uflow);

 	return FLOW_SIDX(uflow, TGTSIDE);
@ -190,30 +174,28 @@ cancel:
 }

 /**
- * udp_flow_from_sock() - Find or create UDP flow for incoming datagram
+ * udp_flow_from_sock() - Find or create UDP flow for "listening" socket
 * @c:		Execution context
- * @pif:	Interface the datagram is arriving from
- * @dst:	Our (local) address to which the datagram is arriving
- * @port:	Our (local) port number to which the datagram is arriving
+ * @ref:	epoll reference of the receiving socket
 * @s_in:	Source socket address, filled in by recvmmsg()
 * @now:	Timestamp
 *
- * #syscalls fcntl arm:fcntl64 ppc64:fcntl64|fcntl i686:fcntl64
+ * #syscalls fcntl arm:fcntl64 ppc64:fcntl64 i686:fcntl64
 *
 * Return: sidx for the destination side of the flow for this packet, or
 *         FLOW_SIDX_NONE if we couldn't find or create a flow.
 */
-flow_sidx_t udp_flow_from_sock(const struct ctx *c, uint8_t pif,
-			       const union inany_addr *dst, in_port_t port,
+flow_sidx_t udp_flow_from_sock(const struct ctx *c, union epoll_ref ref,
 			       const union sockaddr_inany *s_in,
 			       const struct timespec *now)
 {
-	const struct flowside *ini;
 	struct udp_flow *uflow;
 	union flow *flow;
 	flow_sidx_t sidx;

-	sidx = flow_lookup_sa(c, IPPROTO_UDP, pif, s_in, dst, port);
+	ASSERT(ref.type == EPOLL_TYPE_UDP_LISTEN);
+
+	sidx = flow_lookup_sa(c, IPPROTO_UDP, ref.udp.pif, s_in, ref.udp.port);
 	if ((uflow = udp_at_sidx(sidx))) {
 		uflow->ts = now->tv_sec;
 		return flow_sidx_opposite(sidx);
@ -223,24 +205,13 @@ flow_sidx_t udp_flow_from_sock(const struct ctx *c, uint8_t pif,
 		char sastr[SOCKADDR_STRLEN];

 		debug("Couldn't allocate flow for UDP datagram from %s %s",
-		      pif_name(pif), sockaddr_ntop(s_in, sastr, sizeof(sastr)));
+		      pif_name(ref.udp.pif),
+		      sockaddr_ntop(s_in, sastr, sizeof(sastr)));
 		return FLOW_SIDX_NONE;
 	}

-	ini = flow_initiate_sa(flow, pif, s_in, dst, port);
-
-	if (!inany_is_unicast(&ini->eaddr) ||
-	    ini->eport == 0 || ini->oport == 0) {
-		/* In principle ini->oddr also must be specified, but when we've
-		 * been initiated from a socket bound to 0.0.0.0 or ::, we don't
-		 * know our address, so we have to leave it unpopulated.
-		 */
-		flow_err(flow, "Invalid endpoint on UDP recvfrom()");
-		flow_alloc_cancel(flow);
-		return FLOW_SIDX_NONE;
-	}
-
-	return udp_flow_new(c, flow, now);
+	flow_initiate_sa(flow, ref.udp.pif, s_in, ref.udp.port);
+	return udp_flow_new(c, flow, ref.fd, now);
 }

 /**
@ -262,7 +233,6 @@ flow_sidx_t udp_flow_from_tap(const struct ctx *c,
 			      in_port_t srcport, in_port_t dstport,
 			      const struct timespec *now)
 {
-	const struct flowside *ini;
 	struct udp_flow *uflow;
 	union flow *flow;
 	flow_sidx_t sidx;
@ -286,55 +256,19 @@ flow_sidx_t udp_flow_from_tap(const struct ctx *c,
 		return FLOW_SIDX_NONE;
 	}

-	ini = flow_initiate_af(flow, PIF_TAP, af, saddr, srcport,
-			       daddr, dstport);
+	flow_initiate_af(flow, PIF_TAP, af, saddr, srcport, daddr, dstport);

-	if (inany_is_unspecified(&ini->eaddr) || ini->eport == 0 ||
-	    inany_is_unspecified(&ini->oaddr) || ini->oport == 0) {
-		flow_dbg(flow, "Invalid endpoint on UDP packet");
-		flow_alloc_cancel(flow);
-		return FLOW_SIDX_NONE;
-	}
-
-	return udp_flow_new(c, flow, now);
-}
-
-/**
- * udp_flush_flow() - Flush datagrams that might not be for this flow
- * @c:		Execution context
- * @uflow:	Flow to handle
- * @sidei:	Side of the flow to flush
- * @now:	Current timestamp
- */
-static void udp_flush_flow(const struct ctx *c,
-			   const struct udp_flow *uflow, unsigned sidei,
-			   const struct timespec *now)
-{
-	/* We don't know exactly where the datagrams will come from, but we know
-	 * they'll have an interface and oport matching this flow */
-	udp_sock_fwd(c, uflow->s[sidei], uflow->f.pif[sidei],
-		     uflow->f.side[sidei].oport, now);
+	return udp_flow_new(c, flow, -1, now);
 }

 /**
 * udp_flow_defer() - Deferred per-flow handling (clean up aborted flows)
- * @c:		Execution context
 * @uflow:	Flow to handle
- * @now:	Current timestamp
 *
 * Return: true if the connection is ready to free, false otherwise
 */
-bool udp_flow_defer(const struct ctx *c, struct udp_flow *uflow,
-		    const struct timespec *now)
+bool udp_flow_defer(const struct udp_flow *uflow)
 {
-	if (uflow->flush0) {
-		udp_flush_flow(c, uflow, INISIDE, now);
-		uflow->flush0 = false;
-	}
-	if (uflow->flush1) {
-		udp_flush_flow(c, uflow, TGTSIDE, now);
-		uflow->flush1 = false;
-	}
 	return uflow->closed;
 }

--- a/udp_flow.h
+++ b/udp_flow.h
@ -8,12 +8,9 @@
 #define UDP_FLOW_H

 /**
- * struct udp_flow - Descriptor for a flow of UDP packets
+ * struct udp - Descriptor for a flow of UDP packets
 * @f:		Generic flow information
- * @ttl:	TTL or hop_limit for both sides
 * @closed:	Flow is already closed
- * @flush0:	@s[0] may have datagrams queued for other flows
- * @flush1:	@s[1] may have datagrams queued for other flows
 * @ts:		Activity timestamp
 * @s:		Socket fd (or -1) for each side of the flow
 */
@ -21,19 +18,13 @@ struct udp_flow {
 	/* Must be first element */
 	struct flow_common f;

-	uint8_t ttl[SIDES];
-
-	bool	closed	:1,
-		flush0	:1,
-		flush1	:1;
-
+	bool closed :1;
 	time_t ts;
 	int s[SIDES];
 };

 struct udp_flow *udp_at_sidx(flow_sidx_t sidx);
-flow_sidx_t udp_flow_from_sock(const struct ctx *c, uint8_t pif,
-			       const union inany_addr *dst, in_port_t port,
+flow_sidx_t udp_flow_from_sock(const struct ctx *c, union epoll_ref ref,
 			       const union sockaddr_inany *s_in,
 			       const struct timespec *now);
 flow_sidx_t udp_flow_from_tap(const struct ctx *c,
@ -42,8 +33,7 @@ flow_sidx_t udp_flow_from_tap(const struct ctx *c,
 			      in_port_t srcport, in_port_t dstport,
 			      const struct timespec *now);
 void udp_flow_close(const struct ctx *c, struct udp_flow *uflow);
-bool udp_flow_defer(const struct ctx *c, struct udp_flow *uflow,
-		    const struct timespec *now);
+bool udp_flow_defer(const struct udp_flow *uflow);
 bool udp_flow_timer(const struct ctx *c, struct udp_flow *uflow,
 		    const struct timespec *now);

--- a/udp_internal.h
+++ b/udp_internal.h
@ -8,6 +8,8 @@

 #include "tap.h" /* needed by udp_meta_t */

+#define UDP_MAX_FRAMES		32  /* max # of frames to receive at once */
+
 /**
 * struct udp_payload_t - UDP header and data for inbound messages
 * @uh:		UDP header
@ -28,7 +30,5 @@ size_t udp_update_hdr4(struct iphdr *ip4h, struct udp_payload_t *bp,
 size_t udp_update_hdr6(struct ipv6hdr *ip6h, struct udp_payload_t *bp,
                       const struct flowside *toside, size_t dlen,
 		       bool no_udp_csum);
-void udp_sock_fwd(const struct ctx *c, int s, uint8_t frompif,
-		  in_port_t port, const struct timespec *now);
-
+int udp_sock_errs(const struct ctx *c, int s, uint32_t events);
 #endif /* UDP_INTERNAL_H */
--- a/udp_vu.c
+++ b/udp_vu.c
@ -57,16 +57,28 @@ static size_t udp_vu_hdrlen(bool v6)
 	return hdrlen;
 }

+static int udp_vu_sock_init(int s, union sockaddr_inany *s_in)
+{
+	struct msghdr msg = {
+		.msg_name = s_in,
+		.msg_namelen = sizeof(union sockaddr_inany),
+	};
+
+	return recvmsg(s, &msg, MSG_PEEK | MSG_DONTWAIT);
+}
+
 /**
 * udp_vu_sock_recv() - Receive datagrams from socket into vhost-user buffers
 * @c:		Execution context
 * @s:		Socket to receive from
+ * @events:	epoll events bitmap
 * @v6:		Set for IPv6 connections
 * @dlen:	Size of received data (output)
 *
 * Return: Number of iov entries used to store the datagram
 */
-static int udp_vu_sock_recv(const struct ctx *c, int s, bool v6, ssize_t *dlen)
+static int udp_vu_sock_recv(const struct ctx *c, int s, uint32_t events,
+			    bool v6, ssize_t *dlen)
 {
 	struct vu_dev *vdev = c->vdev;
 	struct vu_virtq *vq = &vdev->vq[VHOST_USER_RX_QUEUE];
@ -76,20 +88,21 @@ static int udp_vu_sock_recv(const struct ctx *c, int s, bool v6, ssize_t *dlen)

 	ASSERT(!c->no_udp);

+	if (!(events & EPOLLIN))
+		return 0;
+
 	/* compute L2 header length */
 	hdrlen = udp_vu_hdrlen(v6);

 	vu_init_elem(elem, iov_vu, VIRTQUEUE_MAX_SIZE);

 	iov_cnt = vu_collect(vdev, vq, elem, VIRTQUEUE_MAX_SIZE,
-			     IP_MAX_MTU + ETH_HLEN +
-			     sizeof(struct virtio_net_hdr_mrg_rxbuf),
+			     IP_MAX_MTU - sizeof(struct udphdr) + hdrlen,
 			     NULL);
 	if (iov_cnt == 0)
 		return 0;

 	/* reserve space for the headers */
-	ASSERT(iov_vu[0].iov_len >= hdrlen);
 	iov_vu[0].iov_base = (char *)iov_vu[0].iov_base + hdrlen;
 	iov_vu[0].iov_len -= hdrlen;

@ -169,8 +182,9 @@ static size_t udp_vu_prepare(const struct ctx *c,

 /**
 * udp_vu_csum() - Calculate and set checksum for a UDP packet
- * @toside:	Address information for one side of the flow
- * @iov_used:	Number of used iov_vu items
+ * @toside:	ddress information for one side of the flow
+ * @l4len:	IPv4 Payload length
+ * @iov_used:	Length of the array
 */
 static void udp_vu_csum(const struct flowside *toside, int iov_used)
 {
@ -178,39 +192,74 @@ static void udp_vu_csum(const struct flowside *toside, int iov_used)
 	const struct in_addr *dst4 = inany_v4(&toside->eaddr);
 	char *base = iov_vu[0].iov_base;
 	struct udp_payload_t *bp;
-	struct iov_tail data;

 	if (src4 && dst4) {
 		bp = vu_payloadv4(base);
-		data = IOV_TAIL(iov_vu, iov_used, (char *)&bp->data - base);
-		csum_udp4(&bp->uh, *src4, *dst4, &data);
+		csum_udp4(&bp->uh, *src4, *dst4, iov_vu, iov_used,
+			  (char *)&bp->data - base);
 	} else {
 		bp = vu_payloadv6(base);
-		data = IOV_TAIL(iov_vu, iov_used, (char *)&bp->data - base);
-		csum_udp6(&bp->uh, &toside->oaddr.a6, &toside->eaddr.a6, &data);
+		csum_udp6(&bp->uh, &toside->oaddr.a6, &toside->eaddr.a6,
+			  iov_vu, iov_used, (char *)&bp->data - base);
 	}
 }

 /**
- * udp_vu_sock_to_tap() - Forward datagrams from socket to tap
+ * udp_vu_listen_sock_handler() - Handle new data from socket
 * @c:		Execution context
- * @s:		Socket to read data from
- * @n:		Maximum number of datagrams to forward
- * @tosidx:	Flow & side to forward data from @s to
+ * @ref:	epoll reference
+ * @events:	epoll events bitmap
+ * @now:	Current timestamp
 */
-void udp_vu_sock_to_tap(const struct ctx *c, int s, int n, flow_sidx_t tosidx)
+void udp_vu_listen_sock_handler(const struct ctx *c, union epoll_ref ref,
+				uint32_t events, const struct timespec *now)
 {
-	const struct flowside *toside = flowside_at_sidx(tosidx);
-	bool v6 = !(inany_v4(&toside->eaddr) && inany_v4(&toside->oaddr));
 	struct vu_dev *vdev = c->vdev;
 	struct vu_virtq *vq = &vdev->vq[VHOST_USER_RX_QUEUE];
 	int i;

-	for (i = 0; i < n; i++) {
+	if (udp_sock_errs(c, ref.fd, events) < 0) {
+		err("UDP: Unrecoverable error on listening socket:"
+		    " (%s port %hu)", pif_name(ref.udp.pif), ref.udp.port);
+		return;
+	}
+
+	for (i = 0; i < UDP_MAX_FRAMES; i++) {
+		const struct flowside *toside;
+		union sockaddr_inany s_in;
+		flow_sidx_t sidx;
+		uint8_t pif;
 		ssize_t dlen;
 		int iov_used;
+		bool v6;

-		iov_used = udp_vu_sock_recv(c, s, v6, &dlen);
+		if (udp_vu_sock_init(ref.fd, &s_in) < 0)
+			break;
+
+		sidx = udp_flow_from_sock(c, ref, &s_in, now);
+		pif = pif_at_sidx(sidx);
+
+		if (pif != PIF_TAP) {
+			if (flow_sidx_valid(sidx)) {
+				flow_sidx_t fromsidx = flow_sidx_opposite(sidx);
+				struct udp_flow *uflow = udp_at_sidx(sidx);
+
+				flow_err(uflow,
+					"No support for forwarding UDP from %s to %s",
+					pif_name(pif_at_sidx(fromsidx)),
+					pif_name(pif));
+			} else {
+				debug("Discarding 1 datagram without flow");
+			}
+
+			continue;
+		}
+
+		toside = flowside_at_sidx(sidx);
+
+		v6 = !(inany_v4(&toside->eaddr) && inany_v4(&toside->oaddr));
+
+		iov_used = udp_vu_sock_recv(c, ref.fd, events, v6, &dlen);
 		if (iov_used <= 0)
 			break;

@ -223,3 +272,65 @@ void udp_vu_sock_to_tap(const struct ctx *c, int s, int n, flow_sidx_t tosidx)
 		vu_flush(vdev, vq, elem, iov_used);
 	}
 }
+
+/**
+ * udp_vu_reply_sock_handler() - Handle new data from flow specific socket
+ * @c:		Execution context
+ * @ref:	epoll reference
+ * @events:	epoll events bitmap
+ * @now:	Current timestamp
+ */
+void udp_vu_reply_sock_handler(const struct ctx *c, union epoll_ref ref,
+			        uint32_t events, const struct timespec *now)
+{
+	flow_sidx_t tosidx = flow_sidx_opposite(ref.flowside);
+	const struct flowside *toside = flowside_at_sidx(tosidx);
+	struct udp_flow *uflow = udp_at_sidx(ref.flowside);
+	int from_s = uflow->s[ref.flowside.sidei];
+	struct vu_dev *vdev = c->vdev;
+	struct vu_virtq *vq = &vdev->vq[VHOST_USER_RX_QUEUE];
+	int i;
+
+	ASSERT(!c->no_udp);
+
+	if (udp_sock_errs(c, from_s, events) < 0) {
+		flow_err(uflow, "Unrecoverable error on reply socket");
+		flow_err_details(uflow);
+		udp_flow_close(c, uflow);
+		return;
+	}
+
+	for (i = 0; i < UDP_MAX_FRAMES; i++) {
+		uint8_t topif = pif_at_sidx(tosidx);
+		ssize_t dlen;
+		int iov_used;
+		bool v6;
+
+		ASSERT(uflow);
+
+		if (topif != PIF_TAP) {
+			uint8_t frompif = pif_at_sidx(ref.flowside);
+
+			flow_err(uflow,
+				 "No support for forwarding UDP from %s to %s",
+				 pif_name(frompif), pif_name(topif));
+			continue;
+		}
+
+		v6 = !(inany_v4(&toside->eaddr) && inany_v4(&toside->oaddr));
+
+		iov_used = udp_vu_sock_recv(c, from_s, events, v6, &dlen);
+		if (iov_used <= 0)
+			break;
+		flow_trace(uflow, "Received 1 datagram on reply socket");
+		uflow->ts = now->tv_sec;
+
+		udp_vu_prepare(c, toside, dlen);
+		if (*c->pcap) {
+			udp_vu_csum(toside, iov_used);
+			pcap_iov(iov_vu, iov_used,
+				 sizeof(struct virtio_net_hdr_mrg_rxbuf));
+		}
+		vu_flush(vdev, vq, elem, iov_used);
+	}
+}
--- a/udp_vu.h
+++ b/udp_vu.h
@ -6,8 +6,8 @@
 #ifndef UDP_VU_H
 #define UDP_VU_H

-void udp_vu_listen_sock_data(const struct ctx *c, union epoll_ref ref,
-			     const struct timespec *now);
-void udp_vu_sock_to_tap(const struct ctx *c, int s, int n, flow_sidx_t tosidx);
-
+void udp_vu_listen_sock_handler(const struct ctx *c, union epoll_ref ref,
+				uint32_t events, const struct timespec *now);
+void udp_vu_reply_sock_handler(const struct ctx *c, union epoll_ref ref,
+			       uint32_t events, const struct timespec *now);
 #endif /* UDP_VU_H */
--- a/util.c
+++ b/util.c
@ -71,7 +71,7 @@ int sock_l4_sa(const struct ctx *c, enum epoll_type type,
 	case EPOLL_TYPE_UDP_LISTEN:
 		freebind = c->freebind;
 		/* fallthrough */
-	case EPOLL_TYPE_UDP:
+	case EPOLL_TYPE_UDP_REPLY:
 		proto = IPPROTO_UDP;
 		socktype = SOCK_DGRAM | SOCK_NONBLOCK;
 		break;
@ -90,7 +90,7 @@ int sock_l4_sa(const struct ctx *c, enum epoll_type type,

 	ret = -errno;
 	if (fd < 0) {
-		warn("L4 socket: %s", strerror_(-ret));
+		warn("L4 socket: %s", strerror(-ret));
 		return ret;
 	}

@ -109,15 +109,11 @@ int sock_l4_sa(const struct ctx *c, enum epoll_type type,
 		debug("Failed to set SO_REUSEADDR on socket %i", fd);

 	if (proto == IPPROTO_UDP) {
-		int pktinfo = af == AF_INET ? IP_PKTINFO : IPV6_RECVPKTINFO;
-		int recverr = af == AF_INET ? IP_RECVERR : IPV6_RECVERR;
 		int level = af == AF_INET ? IPPROTO_IP : IPPROTO_IPV6;
+		int opt = af == AF_INET ? IP_RECVERR : IPV6_RECVERR;

-		if (setsockopt(fd, level, recverr, &y, sizeof(y)))
+		if (setsockopt(fd, level, opt, &y, sizeof(y)))
 			die_perror("Failed to set RECVERR on socket %i", fd);
-
-		if (setsockopt(fd, level, pktinfo, &y, sizeof(y)))
-			die_perror("Failed to set PKTINFO on socket %i", fd);
 	}

 	if (ifname && *ifname) {
@ -166,7 +162,7 @@ int sock_l4_sa(const struct ctx *c, enum epoll_type type,

 	if (type == EPOLL_TYPE_TCP_LISTEN && listen(fd, 128) < 0) {
 		ret = -errno;
-		warn("TCP socket listen: %s", strerror_(-ret));
+		warn("TCP socket listen: %s", strerror(-ret));
 		close(fd);
 		return ret;
 	}
@ -175,75 +171,13 @@ int sock_l4_sa(const struct ctx *c, enum epoll_type type,
 	ev.data.u64 = ref.u64;
 	if (epoll_ctl(c->epollfd, EPOLL_CTL_ADD, fd, &ev) == -1) {
 		ret = -errno;
-		warn("L4 epoll_ctl: %s", strerror_(-ret));
+		warn("L4 epoll_ctl: %s", strerror(-ret));
 		return ret;
 	}

 	return fd;
 }

-/**
- * sock_unix() - Create and bind AF_UNIX socket
- * @sock_path:	Socket path. If empty, set on return (UNIX_SOCK_PATH as prefix)
- *
- * Return: socket descriptor on success, won't return on failure
- */
-int sock_unix(char *sock_path)
-{
-	int fd = socket(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0);
-	struct sockaddr_un addr = {
-		.sun_family = AF_UNIX,
-	};
-	int i;
-
-	if (fd < 0)
-		die_perror("Failed to open UNIX domain socket");
-
-	for (i = 1; i < UNIX_SOCK_MAX; i++) {
-		char *path = addr.sun_path;
-		int ex, ret;
-
-		if (*sock_path)
-			memcpy(path, sock_path, UNIX_PATH_MAX);
-		else if (snprintf_check(path, UNIX_PATH_MAX - 1,
-					UNIX_SOCK_PATH, i))
-			die_perror("Can't build UNIX domain socket path");
-
-		ex = socket(AF_UNIX, SOCK_STREAM | SOCK_NONBLOCK | SOCK_CLOEXEC,
-			    0);
-		if (ex < 0)
-			die_perror("Failed to check for UNIX domain conflicts");
-
-		ret = connect(ex, (const struct sockaddr *)&addr, sizeof(addr));
-		if (!ret || (errno != ENOENT && errno != ECONNREFUSED &&
-			     errno != EACCES)) {
-			if (*sock_path)
-				die("Socket path %s already in use", path);
-
-			close(ex);
-			continue;
-		}
-		close(ex);
-
-		unlink(path);
-		ret = bind(fd, (const struct sockaddr *)&addr, sizeof(addr));
-		if (*sock_path && ret)
-			die_perror("Failed to bind UNIX domain socket");
-
-		if (!ret)
-			break;
-	}
-
-	if (i == UNIX_SOCK_MAX)
-		die_perror("Failed to bind UNIX domain socket");
-
-	info("UNIX domain socket bound at %s", addr.sun_path);
-	if (!*sock_path)
-		memcpy(sock_path, addr.sun_path, UNIX_PATH_MAX);
-
-	return fd;
-}
-
 /**
 * sock_probe_mem() - Check if setting high SO_SNDBUF and SO_RCVBUF is allowed
 * @c:		Execution context
@ -471,7 +405,7 @@ void pidfile_write(int fd, pid_t pid)

 	if (write(fd, pid_buf, n) < 0) {
 		perror("PID file write");
-		_exit(EXIT_FAILURE);
+		exit(EXIT_FAILURE);
 	}

 	close(fd);
@ -507,12 +441,12 @@ int __daemon(int pidfile_fd, int devnull_fd)

 	if (pid == -1) {
 		perror("fork");
-		_exit(EXIT_FAILURE);
+		exit(EXIT_FAILURE);
 	}

 	if (pid) {
 		pidfile_write(pidfile_fd, pid);
-		_exit(EXIT_SUCCESS);
+		exit(EXIT_SUCCESS);
 	}

 	if (setsid()				< 0 ||
@ -520,7 +454,7 @@ int __daemon(int pidfile_fd, int devnull_fd)
 	    dup2(devnull_fd, STDOUT_FILENO)	< 0 ||
 	    dup2(devnull_fd, STDERR_FILENO)	< 0 ||
 	    close(devnull_fd))
-		_exit(EXIT_FAILURE);
+		exit(EXIT_FAILURE);

 	return 0;
 }
@ -672,90 +606,6 @@ int write_remainder(int fd, const struct iovec *iov, size_t iovcnt, size_t skip)
 	return 0;
 }

-/**
- * read_all_buf() - Fill a whole buffer from a file descriptor
- * @fd:		File descriptor
- * @buf:	Pointer to base of buffer
- * @len:	Length of buffer
- *
- * Return: 0 on success, -1 on error (with errno set)
- *
- * #syscalls read
- */
-int read_all_buf(int fd, void *buf, size_t len)
-{
-	size_t left = len;
-	char *p = buf;
-
-	while (left) {
-		ssize_t rc;
-
-		ASSERT(left <= len);
-
-		do
-			rc = read(fd, p, left);
-		while ((rc < 0) && errno == EINTR);
-
-		if (rc < 0)
-			return -1;
-
-		if (rc == 0) {
-			errno = ENODATA;
-			return -1;
-		}
-
-		p += rc;
-		left -= rc;
-	}
-	return 0;
-}
-
-/**
- * read_remainder() - Read the tail of an IO vector from a file descriptor
- * @fd:		File descriptor
- * @iov:	IO vector
- * @cnt:	Number of entries in @iov
- * @skip:	Number of bytes of the vector to skip reading
- *
- * Return: 0 on success, -1 on error (with errno set)
- *
- * Note: mode-specific seccomp profiles need to enable readv() to use this.
- */
-/* cppcheck-suppress unusedFunction */
-int read_remainder(int fd, const struct iovec *iov, size_t cnt, size_t skip)
-{
-	size_t i = 0, offset;
-
-	while ((i += iov_skip_bytes(iov + i, cnt - i, skip, &offset)) < cnt) {
-		ssize_t rc;
-
-		if (offset) {
-			ASSERT(offset < iov[i].iov_len);
-			/* Read the remainder of the partially read buffer */
-			if (read_all_buf(fd, (char *)iov[i].iov_base + offset,
-					 iov[i].iov_len - offset) < 0)
-				return -1;
-			i++;
-		}
-
-		if (cnt == i)
-			break;
-
-		/* Fill as many of the remaining buffers as we can */
-		rc = readv(fd, &iov[i], cnt - i);
-		if (rc < 0)
-			return -1;
-
-		if (rc == 0) {
-			errno = ENODATA;
-			return -1;
-		}
-
-		skip = rc;
-	}
-	return 0;
-}
-
 /** sockaddr_ntop() - Convert a socket address to text format
 * @sa:		Socket address
 * @dst:	output buffer, minimum SOCKADDR_STRLEN bytes
@ -987,56 +837,3 @@ void raw_random(void *buf, size_t buflen)
 	if (random_read < buflen)
 		die("Unexpected EOF on random data source");
 }
-
-/**
- * epoll_del() - Remove a file descriptor from our passt epoll
- * @c:		Execution context
- * @fd:		File descriptor to remove
- */
-void epoll_del(const struct ctx *c, int fd)
-{
-	epoll_ctl(c->epollfd, EPOLL_CTL_DEL, fd, NULL);
-
-}
-
-/**
- * encode_domain_name() - Encode domain name according to RFC 1035, section 3.1
- * @buf:		Buffer to fill in with encoded domain name
- * @domain_name:	Input domain name string with terminator
- *
- * The buffer's 'buf' size has to be >= strlen(domain_name) + 2
- */
-void encode_domain_name(char *buf, const char *domain_name)
-{
-	size_t i;
-	char *p;
-
-	buf[0] = strcspn(domain_name, ".");
-	p = buf + 1;
-	for (i = 0; domain_name[i]; i++) {
-		if (domain_name[i] == '.')
-			p[i] = strcspn(domain_name + i + 1, ".");
-		else
-			p[i] = domain_name[i];
-	}
-	p[i] = 0L;
-}
-
-/**
- * abort_with_msg() - Print error message and abort
- * @fmt:	Format string
- * @...:	Format parameters
- */
-void abort_with_msg(const char *fmt, ...)
-{
-	va_list ap;
-
-	va_start(ap, fmt);
-	vlogmsg(true, false, LOG_CRIT, fmt, ap);
-	va_end(ap);
-
-	/* This may actually cause a SIGSYS instead of SIGABRT, due to seccomp,
-	 * but that will still get the job done.
-	 */
-	abort();
-}
--- a/Show more
+++ b/Show more
Author	SHA1	Message	Date
David Gibson	71a16dbc49	tcp: Move tcp_l2_buf_fill_headers() to tcp_buf.c This function only has callers in tcp_buf.c. More importantly, it's inherently tied to the "buf" path, because it uses internal knowledge of how we lay out the various headers across our locally allocated buffers. Therefore, move it to tcp_buf.c. Slightly reformat the prototypes while we're at it. Signed-off-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Laurent Vivier <lvivier@redhat.com>	2024-11-15 10:55:53 +01:00
David Gibson	3958736de5	tcp_vu: Share more header construction between IPv4 and IPv6 paths tcp_vu_send_flag() and tcp_vu_prepare() both needs to do some different things for IPv4 vs. IPv6. However the two paths have a number of lines of duplicated code. We can share those at the expense of an additional conditional (which we might be able to simplify again later). Signed-off-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Laurent Vivier <lvivier@redhat.com>	2024-11-15 10:55:53 +01:00
Stefano Brivio	9392ea7e5a	test: Add tests for passt in vhost-user mode Run functional and performance tests for vhost-user mode as well. For functional tests, we add passt_vu and passt_vu_in_ns as symbolic links to their non-vhost-user counterparts, as no differences are intended but we want to distinguish them in test logs. For performance tests, instead, we add separate perf/passt_vu_tcp and perf/passt_vu_udp files, as we need longer test duration, as well as higher UDP sending bandwidths and larger TCP windows, to actually get the highest throughput vhost-user mode offers. For valgrind tests, vhost-user mode needs two extra system calls: statx and readlink. Add them as EXTRA_SYSCALLS for the valgrind target. Signed-off-by: Laurent Vivier <lvivier@redhat.com> Reviewed-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com>	2024-11-15 10:55:53 +01:00
Laurent Vivier	92fe7e967a	vhost-user: add vhost-user add virtio and vhost-user functions to connect with QEMU. $ ./passt --vhost-user and # qemu-system-x86_64 ... -m 4G \ -object memory-backend-memfd,id=memfd0,share=on,size=4G \ -numa node,memdev=memfd0 \ -chardev socket,id=chr0,path=/tmp/passt_1.socket \ -netdev vhost-user,id=netdev0,chardev=chr0 \ -device virtio-net,mac=9a:2b:2c:2d:2e:2f,netdev=netdev0 \ ... Signed-off-by: Laurent Vivier <lvivier@redhat.com>	2024-11-15 10:55:53 +01:00
Laurent Vivier	007af94bb9	passt: rename tap_sock_init() to tap_backend_init() Extract pool storage initialization loop to tap_sock_update_pool(), extract QEMU hints to tap_backend_show_hints(). Signed-off-by: Laurent Vivier <lvivier@redhat.com> Reviewed-by: David Gibson <david@gibson.dropbear.id.au>	2024-11-15 10:54:01 +01:00
Laurent Vivier	1ceee36c57	tcp: Export headers functions Export tcp_fill_headers[4\|6]() and tcp_update_check_tcp[4\|6](). They'll be needed by vhost-user. Signed-off-by: Laurent Vivier <lvivier@redhat.com> Reviewed-by: David Gibson <david@gibson.dropbear.id.au>	2024-11-15 10:53:40 +01:00
Laurent Vivier	7f6b184fb8	udp: Prepare udp.c to be shared with vhost-user Export udp_payload_t, udp_update_hdr4(), udp_update_hdr6() and udp_sock_errs(). Rename udp_listen_sock_handler() to udp_buf_listen_sock_handler() and udp_reply_sock_handler to udp_buf_reply_sock_handler(). Signed-off-by: Laurent Vivier <lvivier@redhat.com> Reviewed-by: David Gibson <david@gibson.dropbear.id.au>	2024-11-15 10:53:40 +01:00
Laurent Vivier	23cc8f892f	vhost-user: introduce vhost-user API Add vhost_user.c and vhost_user.h that define the functions needed to implement vhost-user backend. Signed-off-by: Laurent Vivier <lvivier@redhat.com>	2024-11-15 10:53:40 +01:00
Laurent Vivier	119b45358c	vhost-user: introduce virtio API Add virtio.c and virtio.h that define the functions needed to manage virtqueues. Signed-off-by: Laurent Vivier <lvivier@redhat.com>	2024-11-15 10:53:40 +01:00
Laurent Vivier	8ac20f4795	packet: replace struct desc by struct iovec To be able to manage buffers inside a shared memory provided by a VM via a vhost-user interface, we cannot rely on the fact that buffers are located in a pre-defined memory area and use a base address and a 32bit offset to address them. We need a 64bit address, so replace struct desc by struct iovec and update range checking. Signed-off-by: Laurent Vivier <lvivier@redhat.com> Reviewed-by: David Gibson <david@gibson.dropbear.id.au>	2024-11-15 10:53:40 +01:00
Stefano Brivio	7f6c10626d	selinux: Use auth_read_passwd() interface for all our getpwnam() needs If passt or pasta are started as root, we need to read the passwd file (be it /etc/passwd or whatever sssd provides) to find out UID and GID of 'nobody' so that we can switch to it. Instead of a bunch of allow rules for passwd_file_t and sssd macros, use the more convenient auth_read_passwd() interface which should cover our usage of getpwnam(). The existing rules weren't actually enough: # strace -e openat passt -f [...] Started as root, will change to nobody. openat(AT_FDCWD, "/etc/nsswitch.conf", O_RDONLY\|O_CLOEXEC) = 4 openat(AT_FDCWD, "/etc/ld.so.cache", O_RDONLY\|O_CLOEXEC) = 4 openat(AT_FDCWD, "/lib64/libnss_sss.so.2", O_RDONLY\|O_CLOEXEC) = 4 openat(AT_FDCWD, "/var/lib/sss/mc/passwd", O_RDONLY\|O_CLOEXEC) = -1 EACCES (Permission denied) openat(AT_FDCWD, "/var/lib/sss/mc/passwd", O_RDONLY\|O_CLOEXEC) = -1 EACCES (Permission denied) openat(AT_FDCWD, "/etc/passwd", O_RDONLY\|O_CLOEXEC) = 4 with corresponding SELinux warnings logged in audit.log. Reported-by: Minxi Hou <mhou@redhat.com> Analysed-by: Miloš Malik <mmalik@redhat.com> Signed-off-by: Stefano Brivio <sbrivio@redhat.com>	2024-11-14 23:41:52 +01:00