cppcheck: Don't check the system headers

We pass -I options to cppcheck so that it will find the system headers. Then we need to pass a bunch more options to suppress the zillions of cppcheck errors found in those headers. It turns out, however, that it's not recommended to give the system headers to cppcheck anyway. Instead it has built-in knowledge of the ANSI libc and uses that as the basis of its checks. We do need to suppress missingIncludeSystem warnings instead though. Not bothering with the system headers makes the cppcheck runtime go from ~37s to ~14s on my machine, which is a pretty nice win.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
linux_dep: Fix CLOSE_RANGE_UNSHARE availability handling
2024-11-08 08:26:21 +01:00 · 2024-11-08 08:26:17 +01:00 · 2024-11-08 08:26:15 +01:00 · 2024-11-08 08:25:58 +01:00 · 2024-11-08 08:24:58 +01:00 · 2024-11-08 08:24:52 +01:00
64 changed files with 1767 additions and 1123 deletions
--- a/.clang-format
+++ b/.clang-format
@ -0,0 +1,126 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+# clang-format configuration file. Intended for clang-format >= 11.
+#
+# For more information, see:
+#
+#   Documentation/dev-tools/clang-format.rst
+#   https://clang.llvm.org/docs/ClangFormat.html
+#   https://clang.llvm.org/docs/ClangFormatStyleOptions.html
+#
+---
+AccessModifierOffset: -4
+AlignAfterOpenBracket: Align
+AlignConsecutiveAssignments: false
+AlignConsecutiveDeclarations: false
+AlignEscapedNewlines: Left
+AlignOperands: true
+AlignTrailingComments: false
+AllowAllParametersOfDeclarationOnNextLine: false
+AllowShortBlocksOnASingleLine: false
+AllowShortCaseLabelsOnASingleLine: false
+AllowShortFunctionsOnASingleLine: None
+AllowShortIfStatementsOnASingleLine: false
+AllowShortLoopsOnASingleLine: false
+AlwaysBreakAfterDefinitionReturnType: None
+AlwaysBreakAfterReturnType: None
+AlwaysBreakBeforeMultilineStrings: false
+AlwaysBreakTemplateDeclarations: false
+BinPackArguments: true
+BinPackParameters: true
+BraceWrapping:
+  AfterClass: false
+  AfterControlStatement: false
+  AfterEnum: false
+  AfterFunction: true
+  AfterNamespace: true
+  AfterObjCDeclaration: false
+  AfterStruct: false
+  AfterUnion: false
+  AfterExternBlock: false
+  BeforeCatch: false
+  BeforeElse: false
+  IndentBraces: false
+  SplitEmptyFunction: true
+  SplitEmptyRecord: true
+  SplitEmptyNamespace: true
+BreakBeforeBinaryOperators: None
+BreakBeforeBraces: Custom
+BreakBeforeInheritanceComma: false
+BreakBeforeTernaryOperators: false
+BreakConstructorInitializersBeforeComma: false
+BreakConstructorInitializers: BeforeComma
+BreakAfterJavaFieldAnnotations: false
+BreakStringLiterals: false
+ColumnLimit: 80
+CommentPragmas: '^ IWYU pragma:'
+CompactNamespaces: false
+ConstructorInitializerAllOnOneLineOrOnePerLine: false
+ConstructorInitializerIndentWidth: 8
+ContinuationIndentWidth: 8
+Cpp11BracedListStyle: false
+DerivePointerAlignment: false
+DisableFormat: false
+ExperimentalAutoDetectBinPacking: false
+FixNamespaceComments: false
+
+# Taken from:
+#   git grep -h '^#define [^[:space:]]*for_each[^[:space:]]*(' include/ tools/ \
+#   | sed "s,^#define \([^[:space:]]*for_each[^[:space:]]*\)(.*$,  - '\1'," \
+#   | LC_ALL=C sort -u
+ForEachMacros:
+  - 'for_each_nst'
+
+IncludeBlocks: Preserve
+IncludeCategories:
+  - Regex: '.*'
+    Priority: 1
+IncludeIsMainRegex: '(Test)?$'
+IndentCaseLabels: false
+IndentGotoLabels: false
+IndentPPDirectives: None
+IndentWidth: 8
+IndentWrappedFunctionNames: false
+JavaScriptQuotes: Leave
+JavaScriptWrapImports: true
+KeepEmptyLinesAtTheStartOfBlocks: false
+MacroBlockBegin: ''
+MacroBlockEnd: ''
+MaxEmptyLinesToKeep: 1
+NamespaceIndentation: None
+ObjCBinPackProtocolList: Auto
+ObjCBlockIndentWidth: 8
+ObjCSpaceAfterProperty: true
+ObjCSpaceBeforeProtocolList: true
+
+# Taken from git's rules
+PenaltyBreakAssignment: 10
+PenaltyBreakBeforeFirstCallParameter: 30
+PenaltyBreakComment: 10
+PenaltyBreakFirstLessLess: 0
+PenaltyBreakString: 10
+PenaltyExcessCharacter: 100
+PenaltyReturnTypeOnItsOwnLine: 60
+
+PointerAlignment: Right
+ReflowComments: false
+SortIncludes: false
+SortUsingDeclarations: false
+SpaceAfterCStyleCast: false
+SpaceAfterTemplateKeyword: true
+SpaceBeforeAssignmentOperators: true
+SpaceBeforeCtorInitializerColon: true
+SpaceBeforeInheritanceColon: true
+SpaceBeforeParens: ControlStatementsExceptForEachMacros
+SpaceBeforeRangeBasedForLoopColon: true
+SpaceInEmptyParentheses: false
+SpacesBeforeTrailingComments: 1
+SpacesInAngles: false
+SpacesInContainerLiterals: false
+SpacesInCStyleCastParentheses: false
+SpacesInParentheses: false
+SpacesInSquareBrackets: false
+Standard: Cpp03
+TabWidth: 8
+UseTab: Always
+...
--- a/.clang-tidy
+++ b/.clang-tidy
@ -0,0 +1,93 @@
+---
+Checks:
+    - "clang-diagnostic-*,clang-analyzer-*,*,-modernize-*"
+
+    #	TODO: enable once https://bugs.llvm.org/show_bug.cgi?id=41311 is fixed
+    - "-clang-analyzer-valist.Uninitialized"
+
+    #	Dubious value, would kill readability
+    - "-cppcoreguidelines-init-variables"
+
+    #	Dubious value over the compiler's built-in warning.  Would
+    #	increase verbosity.
+    - "-bugprone-assignment-in-if-condition"
+
+    #	Debatable whether these improve readability, right now it would look
+    #	like a mess
+    - "-google-readability-braces-around-statements"
+    - "-hicpp-braces-around-statements"
+    - "-readability-braces-around-statements"
+
+    #	TODO: in most cases they are justified, but probably not everywhere
+    #
+    - "-readability-magic-numbers"
+    - "-cppcoreguidelines-avoid-magic-numbers"
+
+    #	TODO: this is Linux-only for the moment, nice to fix eventually
+    - "-llvmlibc-restrict-system-libc-headers"
+
+    #	Those are needed for syscalls, epoll_wait flags, etc.
+    - "-hicpp-signed-bitwise"
+
+    #	Probably not doable to implement this without plain memcpy(), memset()
+    - "-clang-analyzer-security.insecureAPI.DeprecatedOrUnsafeBufferHandling"
+
+    #	TODO: not really important, but nice to fix eventually
+    - "-llvm-include-order"
+
+    #	Dubious value, would kill readability
+    - "-readability-isolate-declaration"
+
+    #	TODO: nice to fix eventually
+    - "-bugprone-narrowing-conversions"
+    - "-cppcoreguidelines-narrowing-conversions"
+
+    #	TODO: check, fix, and more in general constify wherever possible
+    - "-cppcoreguidelines-avoid-non-const-global-variables"
+
+    #	TODO: check paths where it might make sense to improve performance
+    - "-altera-unroll-loops"
+    - "-altera-id-dependent-backward-branch"
+
+    #	Not much can be done about them other than being careful
+    - "-bugprone-easily-swappable-parameters"
+
+    #	TODO: split reported functions
+    - "-readability-function-cognitive-complexity"
+
+    #	"Poor" alignment needed for structs reflecting message formats/headers
+    - "-altera-struct-pack-align"
+
+    #	TODO: check again if multithreading is implemented
+    - "-concurrency-mt-unsafe"
+
+    #	Complains about any identifier <3 characters, reasonable for
+    #	globals, pointlessly verbose for locals and parameters.
+    - "-readability-identifier-length"
+
+    #	Wants to include headers which *directly* provide the things
+    #	we use.  That sounds nice, but means it will often want an OS
+    #	specific header instead of a mostly standard one, such as
+    #	<linux/limits.h> instead of <limits.h>.
+    - "-misc-include-cleaner"
+
+    #	Want to replace all #defines of integers with enums.  Kind of
+    #	makes sense when those defines form an enum-like set, but
+    #	weird for cases like standalone constants, and causes other
+    #	awkwardness for a bunch of cases we use
+    - "-cppcoreguidelines-macro-to-enum"
+
+    #	It's been a couple of centuries since multiplication has been granted
+    #	precedence over addition in modern mathematical notation. Adding
+    #	parentheses to reinforce that certainly won't improve readability.
+    - "-readability-math-missing-parentheses"
+WarningsAsErrors: "*"
+HeaderFileExtensions:
+    - h
+ImplementationFileExtensions:
+    - c
+HeaderFilterRegex: ""
+FormatStyle: none
+CheckOptions:
+    bugprone-suspicious-string-compare.WarnOnImplicitComparison: "false"
+SystemHeaders: false
--- a/.clangd
+++ b/.clangd
@ -0,0 +1,3 @@
+CompileFlags:
+    # Don't try to interpret our headers as C++
+    Add: [-xc, -Wall]
--- a/Makefile
+++ b/Makefile
@ -15,24 +15,11 @@ VERSION ?= $(shell git describe --tags HEAD 2>/dev/null || echo "unknown\ versio
 # the IPv6 socket API? (Linux does)
 DUAL_STACK_SOCKETS := 1

-RLIMIT_STACK_VAL := $(shell /bin/sh -c 'ulimit -s')
-ifeq ($(RLIMIT_STACK_VAL),unlimited)
-RLIMIT_STACK_VAL := 1024
-endif
-
 TARGET ?= $(shell $(CC) -dumpmachine)
 # Get 'uname -m'-like architecture description for target
 TARGET_ARCH := $(shell echo $(TARGET) | cut -f1 -d- | tr [A-Z] [a-z])
 TARGET_ARCH := $(shell echo $(TARGET_ARCH) | sed 's/powerpc/ppc/')

-AUDIT_ARCH := $(shell echo $(TARGET_ARCH) | tr [a-z] [A-Z] | sed 's/^ARM.*/ARM/')
-AUDIT_ARCH := $(shell echo $(AUDIT_ARCH) | sed 's/I[456]86/I386/')
-AUDIT_ARCH := $(shell echo $(AUDIT_ARCH) | sed 's/PPC64/PPC/')
-AUDIT_ARCH := $(shell echo $(AUDIT_ARCH) | sed 's/PPCLE/PPC64LE/')
-AUDIT_ARCH := $(shell echo $(AUDIT_ARCH) | sed 's/MIPS64EL/MIPSEL64/')
-AUDIT_ARCH := $(shell echo $(AUDIT_ARCH) | sed 's/HPPA/PARISC/')
-AUDIT_ARCH := $(shell echo $(AUDIT_ARCH) | sed 's/SH4/SH/')
-
 # On some systems enabling optimization also enables source fortification,
 # automagically. Do not override it.
 FORTIFY_FLAG :=
@ -44,10 +31,6 @@ FLAGS := -Wall -Wextra -Wno-format-zero-length
 FLAGS += -pedantic -std=c11 -D_XOPEN_SOURCE=700 -D_GNU_SOURCE
 FLAGS +=  $(FORTIFY_FLAG) -O2 -pie -fPIE
 FLAGS += -DPAGE_SIZE=$(shell getconf PAGE_SIZE)
-FLAGS += -DNETNS_RUN_DIR=\"/run/netns\"
-FLAGS += -DPASST_AUDIT_ARCH=AUDIT_ARCH_$(AUDIT_ARCH)
-FLAGS += -DRLIMIT_STACK_VAL=$(RLIMIT_STACK_VAL)
-FLAGS += -DARCH=\"$(TARGET_ARCH)\"
 FLAGS += -DVERSION=\"$(VERSION)\"
 FLAGS += -DDUAL_STACK_SOCKETS=$(DUAL_STACK_SOCKETS)

@ -67,21 +50,6 @@ PASST_HEADERS = arch.h arp.h checksum.h conf.h dhcp.h dhcpv6.h flow.h fwd.h \
 	udp.h udp_flow.h util.h
 HEADERS = $(PASST_HEADERS) seccomp.h

-C := \#include <linux/tcp.h>\nstruct tcp_info x = { .tcpi_snd_wnd = 0 };
-ifeq ($(shell printf "$(C)" | $(CC) -S -xc - -o - >/dev/null 2>&1; echo $$?),0)
-	FLAGS += -DHAS_SND_WND
-endif
-
-C := \#include <linux/tcp.h>\nstruct tcp_info x = { .tcpi_bytes_acked = 0 };
-ifeq ($(shell printf "$(C)" | $(CC) -S -xc - -o - >/dev/null 2>&1; echo $$?),0)
-	FLAGS += -DHAS_BYTES_ACKED
-endif
-
-C := \#include <linux/tcp.h>\nstruct tcp_info x = { .tcpi_min_rtt = 0 };
-ifeq ($(shell printf "$(C)" | $(CC) -S -xc - -o - >/dev/null 2>&1; echo $$?),0)
-	FLAGS += -DHAS_MIN_RTT
-endif
-
 C := \#include <sys/random.h>\nint main(){int a=getrandom(0, 0, 0);}
 ifeq ($(shell printf "$(C)" | $(CC) -S -xc - -o - >/dev/null 2>&1; echo $$?),0)
 	FLAGS += -DHAS_GETRANDOM
@ -91,11 +59,6 @@ ifeq ($(shell :|$(CC) -fstack-protector-strong -S -xc - -o - >/dev/null 2>&1; ec
 	FLAGS += -fstack-protector-strong
 endif

-C := \#define _GNU_SOURCE\n\#include <fcntl.h>\nint x = FALLOC_FL_COLLAPSE_RANGE;
-ifeq ($(shell printf "$(C)" | $(CC) -S -xc - -o - >/dev/null 2>&1; echo $$?),0)
-	EXTRA_SYSCALLS += fallocate
-endif
-
 prefix		?= /usr/local
 exec_prefix	?= $(prefix)
 bindir		?= $(exec_prefix)/bin
@ -132,7 +95,7 @@ pasta.avx2 pasta.1 pasta: pasta%: passt%
 	ln -sf $< $@

 qrap: $(QRAP_SRCS) passt.h
-	$(CC) $(FLAGS) $(CFLAGS) $(CPPFLAGS) $(QRAP_SRCS) -o qrap $(LDFLAGS)
+	$(CC) $(FLAGS) $(CFLAGS) $(CPPFLAGS) -DARCH=\"$(TARGET_ARCH)\" $(QRAP_SRCS) -o qrap $(LDFLAGS)

 valgrind: EXTRA_SYSCALLS += rt_sigprocmask rt_sigtimedwait rt_sigaction	\
 			    rt_sigreturn getpid gettid kill clock_gettime mmap \
@ -196,116 +159,11 @@ docs: README.md
 		done < README.md;					\
 	) > README.plain.md

-# Checkers currently disabled for clang-tidy:
-# - llvmlibc-restrict-system-libc-headers
-#	TODO: this is Linux-only for the moment, nice to fix eventually
-#
-# - google-readability-braces-around-statements
-# - hicpp-braces-around-statements
-# - readability-braces-around-statements
-#	Debatable whether that improves readability, right now it would look
-#	like a mess
-#
-# - readability-magic-numbers
-# - cppcoreguidelines-avoid-magic-numbers
-#	TODO: in most cases they are justified, but probably not everywhere
-#
-# - clang-analyzer-valist.Uninitialized
-#	TODO: enable once https://bugs.llvm.org/show_bug.cgi?id=41311 is fixed
-#
-# - clang-analyzer-security.insecureAPI.DeprecatedOrUnsafeBufferHandling
-#	Probably not doable to impement this without plain memcpy(), memset()
-#
-# - cppcoreguidelines-init-variables
-#	Dubious value, would kill readability
-#
-# - hicpp-signed-bitwise
-#	Those are needed for syscalls, epoll_wait flags, etc.
-#
-# - llvm-include-order
-#	TODO: not really important, but nice to fix eventually
-#
-# - readability-isolate-declaration
-#	Dubious value, would kill readability
-#
-# - bugprone-narrowing-conversions
-# - cppcoreguidelines-narrowing-conversions
-#	TODO: nice to fix eventually
-#
-# - cppcoreguidelines-avoid-non-const-global-variables
-#	TODO: check, fix, and more in general constify wherever possible
-#
-# - altera-unroll-loops
-# - altera-id-dependent-backward-branch
-#	TODO: check paths where it might make sense to improve performance
-#
-# - bugprone-easily-swappable-parameters
-#	Not much can be done about them other than being careful
-#
-# - readability-function-cognitive-complexity
-#	TODO: split reported functions
-#
-# - altera-struct-pack-align
-#	"Poor" alignment needed for structs reflecting message formats/headers
-#
-# - concurrency-mt-unsafe
-#	TODO: check again if multithreading is implemented
-#
-# - readability-identifier-length
-#	Complains about any identifier <3 characters, reasonable for
-#	globals, pointlessly verbose for locals and parameters.
-#
-# - bugprone-assignment-in-if-condition
-#	Dubious value over the compiler's built-in warning.  Would
-#	increase verbosity.
-#
-# - misc-include-cleaner
-#	Wants to include headers which *directly* provide the things
-#	we use.  That sounds nice, but means it will often want a OS
-#	specific header instead of a mostly standard one, such as
-#	<linux/limits.h> instead of <limits.h>.
-#
-# - cppcoreguidelines-macro-to-enum
-#	Want to replace all #defines of integers with enums.  Kind of
-#	makes sense when those defines form an enum-like set, but
-#	weird for cases like standalone constants, and causes other
-#	awkwardness for a bunch of cases we use
+clang-tidy: $(PASST_SRCS) $(HEADERS)
+	clang-tidy $(PASST_SRCS) -- $(filter-out -pie,$(FLAGS) $(CFLAGS) $(CPPFLAGS)) \
+	           -DCLANG_TIDY_58992

-clang-tidy: $(SRCS) $(HEADERS)
-	clang-tidy -checks=*,-modernize-*,\
-	-clang-analyzer-valist.Uninitialized,\
-	-cppcoreguidelines-init-variables,\
-	-bugprone-assignment-in-if-condition,\
-	-google-readability-braces-around-statements,\
-	-hicpp-braces-around-statements,\
-	-readability-braces-around-statements,\
-	-readability-magic-numbers,\
-	-llvmlibc-restrict-system-libc-headers,\
-	-hicpp-signed-bitwise,\
-	-clang-analyzer-security.insecureAPI.DeprecatedOrUnsafeBufferHandling,\
-	-llvm-include-order,\
-	-cppcoreguidelines-avoid-magic-numbers,\
-	-readability-isolate-declaration,\
-	-bugprone-narrowing-conversions,\
-	-cppcoreguidelines-narrowing-conversions,\
-	-cppcoreguidelines-avoid-non-const-global-variables,\
-	-altera-unroll-loops,-altera-id-dependent-backward-branch,\
-	-bugprone-easily-swappable-parameters,\
-	-readability-function-cognitive-complexity,\
-	-altera-struct-pack-align,\
-	-concurrency-mt-unsafe,\
-	-readability-identifier-length,\
-	-misc-include-cleaner,\
-	-cppcoreguidelines-macro-to-enum \
-	-config='{CheckOptions: [{key: bugprone-suspicious-string-compare.WarnOnImplicitComparison, value: "false"}]}' \
-	--warnings-as-errors=* $(SRCS) -- $(filter-out -pie,$(FLAGS) $(CFLAGS) $(CPPFLAGS)) -DCLANG_TIDY_58992
-
-SYSTEM_INCLUDES := /usr/include $(wildcard /usr/include/$(TARGET))
-ifeq ($(shell $(CC) -v 2>&1 | grep -c "gcc version"),1)
-VER := $(shell $(CC) -dumpversion)
-SYSTEM_INCLUDES += /usr/lib/gcc/$(TARGET)/$(VER)/include
-endif
-cppcheck: $(SRCS) $(HEADERS)
+cppcheck: $(PASST_SRCS) $(HEADERS)
 	if cppcheck --check-level=exhaustive /dev/null > /dev/null 2>&1; then \
 		CPPCHECK_EXHAUSTIVE="--check-level=exhaustive";		\
 	else								\
@ -314,11 +172,8 @@ cppcheck: $(SRCS) $(HEADERS)
 	cppcheck --std=c11 --error-exitcode=1 --enable=all --force	\
 	--inconclusive --library=posix --quiet				\
 	$${CPPCHECK_EXHAUSTIVE}						\
-	$(SYSTEM_INCLUDES:%=-I%)					\
-	$(SYSTEM_INCLUDES:%=--config-exclude=%)				\
-	$(SYSTEM_INCLUDES:%=--suppress=*:%/*)				\
-	$(SYSTEM_INCLUDES:%=--suppress=unmatchedSuppression:%/*)	\
 	--inline-suppr							\
+	--suppress=missingIncludeSystem \
 	--suppress=unusedStructMember					\
-	$(filter -D%,$(FLAGS) $(CFLAGS) $(CPPFLAGS))			\
-	$(SRCS) $(HEADERS)
+	$(filter -D%,$(FLAGS) $(CFLAGS) $(CPPFLAGS)) -D CPPCHECK_6936  \
+	$(PASST_SRCS) $(HEADERS)
--- a/arch.c
+++ b/arch.c
@ -19,6 +19,7 @@
 #include <unistd.h>

 #include "log.h"
+#include "util.h"

 /**
 * arch_avx2_exec() - Switch to AVX2 build if supported
@ -40,8 +41,11 @@ void arch_avx2_exec(char **argv)
 	if (__builtin_cpu_supports("avx2")) {
 		char new_path[PATH_MAX + sizeof(".avx2")];

-		snprintf(new_path, PATH_MAX + sizeof(".avx2"), "%s.avx2", exe);
-		execve(new_path, argv, environ);
+		if (snprintf_check(new_path, PATH_MAX + sizeof(".avx2"),
+				   "%s.avx2", exe))
+			die_perror("Can't build AVX2 executable path");
+
+		execv(new_path, argv);
 		warn_perror("Can't run AVX2 build, using non-AVX2 version");
 	}
 }
--- a/arp.c
+++ b/arp.c
@ -59,14 +59,12 @@ int arp(const struct ctx *c, const struct pool *p)
 	    ah->ar_op  != htons(ARPOP_REQUEST))
 		return 1;

-	/* Discard announcements (but not 0.0.0.0 "probes"): we might have the
-	 * same IP address, hide that.
-	 */
-	if (memcmp(am->sip, (unsigned char[4]){ 0 }, sizeof(am->tip)) &&
+	/* Discard announcements, but not 0.0.0.0 "probes" */
+	if (memcmp(am->sip, &in4addr_any, sizeof(am->sip)) &&
 	    !memcmp(am->sip, am->tip, sizeof(am->sip)))
 		return 1;

-	/* Don't resolve our own address, either. */
+	/* Don't resolve the guest's assigned address, either. */
 	if (!memcmp(am->tip, &c->ip4.addr, sizeof(am->tip)))
 		return 1;

--- a/checksum.c
+++ b/checksum.c
@ -59,6 +59,7 @@
 #include "util.h"
 #include "ip.h"
 #include "checksum.h"
+#include "iov.h"

 /* Checksums are optional for UDP over IPv4, so we usually just set
 * them to 0.  Change this to 1 to calculate real UDP over IPv4
@ -165,22 +166,24 @@ uint32_t proto_ipv4_header_psum(uint16_t l4len, uint8_t protocol,
 * @udp4hr:	UDP header, initialised apart from checksum
 * @saddr:	IPv4 source address
 * @daddr:	IPv4 destination address
- * @payload:	UDP packet payload
- * @dlen:	Length of @payload (not including UDP header)
+ * @iov:	Pointer to the array of IO vectors
+ * @iov_cnt:	Length of the array
+ * @offset:	UDP payload offset in the iovec array
 */
 void csum_udp4(struct udphdr *udp4hr,
 	       struct in_addr saddr, struct in_addr daddr,
-	       const void *payload, size_t dlen)
+	       const struct iovec *iov, int iov_cnt, size_t offset)
 {
 	/* UDP checksums are optional, so don't bother */
 	udp4hr->check = 0;

 	if (UDP4_REAL_CHECKSUMS) {
-		uint16_t l4len = dlen + sizeof(struct udphdr);
+		uint16_t l4len = iov_size(iov, iov_cnt) - offset +
+				 sizeof(struct udphdr);
 		uint32_t psum = proto_ipv4_header_psum(l4len, IPPROTO_UDP,
 						       saddr, daddr);
 		psum = csum_unfolded(udp4hr, sizeof(struct udphdr), psum);
-		udp4hr->check = csum(payload, dlen, psum);
+		udp4hr->check = csum_iov(iov, iov_cnt, offset, psum);
 	}
 }

@ -226,19 +229,24 @@ uint32_t proto_ipv6_header_psum(uint16_t payload_len, uint8_t protocol,
 /**
 * csum_udp6() - Calculate and set checksum for a UDP over IPv6 packet
 * @udp6hr:	UDP header, initialised apart from checksum
- * @payload:	UDP packet payload
- * @dlen:	Length of @payload (not including UDP header)
+ * @saddr:	Source address
+ * @daddr:	Destination address
+ * @iov:	Pointer to the array of IO vectors
+ * @iov_cnt:	Length of the array
+ * @offset:	UDP payload offset in the iovec array
 */
 void csum_udp6(struct udphdr *udp6hr,
 	       const struct in6_addr *saddr, const struct in6_addr *daddr,
-	       const void *payload, size_t dlen)
+	       const struct iovec *iov, int iov_cnt, size_t offset)
 {
-	uint32_t psum = proto_ipv6_header_psum(dlen + sizeof(struct udphdr),
-					       IPPROTO_UDP, saddr, daddr);
+	uint16_t l4len = iov_size(iov, iov_cnt) - offset +
+			 sizeof(struct udphdr);
+	uint32_t psum = proto_ipv6_header_psum(l4len, IPPROTO_UDP,
+					       saddr, daddr);
 	udp6hr->check = 0;

 	psum = csum_unfolded(udp6hr, sizeof(struct udphdr), psum);
-	udp6hr->check = csum(payload, dlen, psum);
+	udp6hr->check = csum_iov(iov, iov_cnt, offset, psum);
 }

 /**
@ -497,16 +505,26 @@ uint16_t csum(const void *buf, size_t len, uint32_t init)
 *
 * @iov		Pointer to the array of IO vectors
 * @n		Length of the array
+ * @offset:	Offset of the data to checksum within the full data length
 * @init	Initial 32-bit checksum, 0 for no pre-computed checksum
 *
 * Return: 16-bit folded, complemented checksum
 */
-/* cppcheck-suppress unusedFunction */
-uint16_t csum_iov(const struct iovec *iov, size_t n, uint32_t init)
+uint16_t csum_iov(const struct iovec *iov, size_t n, size_t offset,
+		  uint32_t init)
 {
 	unsigned int i;
+	size_t first;

-	for (i = 0; i < n; i++)
+	i = iov_skip_bytes(iov, n, offset, &first);
+	if (i >= n)
+		return (uint16_t)~csum_fold(init);
+
+	init = csum_unfolded((char *)iov[i].iov_base + first,
+			     iov[i].iov_len - first, init);
+	i++;
+
+	for (; i < n; i++)
 		init = csum_unfolded(iov[i].iov_base, iov[i].iov_len, init);

 	return (uint16_t)~csum_fold(init);
--- a/checksum.h
+++ b/checksum.h
@ -19,19 +19,20 @@ uint32_t proto_ipv4_header_psum(uint16_t l4len, uint8_t protocol,
 				struct in_addr saddr, struct in_addr daddr);
 void csum_udp4(struct udphdr *udp4hr,
 	       struct in_addr saddr, struct in_addr daddr,
-	       const void *payload, size_t dlen);
+	       const struct iovec *iov, int iov_cnt, size_t offset);
 void csum_icmp4(struct icmphdr *icmp4hr, const void *payload, size_t dlen);
 uint32_t proto_ipv6_header_psum(uint16_t payload_len, uint8_t protocol,
 				const struct in6_addr *saddr,
 				const struct in6_addr *daddr);
 void csum_udp6(struct udphdr *udp6hr,
 	       const struct in6_addr *saddr, const struct in6_addr *daddr,
-	       const void *payload, size_t dlen);
+	       const struct iovec *iov, int iov_cnt, size_t offset);
 void csum_icmp6(struct icmp6hdr *icmp6hr,
 		const struct in6_addr *saddr, const struct in6_addr *daddr,
 		const void *payload, size_t dlen);
 uint32_t csum_unfolded(const void *buf, size_t len, uint32_t init);
 uint16_t csum(const void *buf, size_t len, uint32_t init);
-uint16_t csum_iov(const struct iovec *iov, size_t n, uint32_t init);
+uint16_t csum_iov(const struct iovec *iov, size_t n, size_t offset,
+		  uint32_t init);

 #endif /* CHECKSUM_H */
--- a/conf.c
+++ b/conf.c
@ -46,6 +46,8 @@
 #include "isolation.h"
 #include "log.h"

+#define NETNS_RUN_DIR	"/run/netns"
+
 /**
 * next_chunk - Return the next piece of a string delimited by a character
 * @s:		String to search
@ -116,11 +118,10 @@ static int parse_port_range(const char *s, char **endptr,
 static void conf_ports(const struct ctx *c, char optname, const char *optarg,
 		       struct fwd_ports *fwd)
 {
-	char addr_buf[sizeof(struct in6_addr)] = { 0 }, *addr = addr_buf;
+	union inany_addr addr_buf = inany_any6, *addr = &addr_buf;
 	char buf[BUFSIZ], *spec, *ifname = NULL, *p;
 	bool exclude_only = true, bound_one = false;
 	uint8_t exclude[PORT_BITMAP_SIZE] = { 0 };
-	sa_family_t af = AF_UNSPEC;
 	unsigned i;
 	int ret;

@ -166,15 +167,13 @@ static void conf_ports(const struct ctx *c, char optname, const char *optarg,

 			bitmap_set(fwd->map, i);
 			if (optname == 't') {
-				ret = tcp_sock_init(c, AF_UNSPEC, NULL, NULL,
-						    i);
+				ret = tcp_sock_init(c, NULL, NULL, i);
 				if (ret == -ENFILE || ret == -EMFILE)
 					goto enfile;
 				if (!ret)
 					bound_one = true;
 			} else if (optname == 'u') {
-				ret = udp_sock_init(c, 0, AF_UNSPEC, NULL, NULL,
-						    i);
+				ret = udp_sock_init(c, 0, NULL, NULL, i);
 				if (ret == -ENFILE || ret == -EMFILE)
 					goto enfile;
 				if (!ret)
@ -226,11 +225,7 @@ static void conf_ports(const struct ctx *c, char optname, const char *optarg,
 				p++;
 			}

-			if (inet_pton(AF_INET, p, addr))
-				af = AF_INET;
-			else if (inet_pton(AF_INET6, p, addr))
-				af = AF_INET6;
-			else
+			if (!inany_pton(p, addr))
 				goto bad;
 		}
 	} else {
@ -276,13 +271,13 @@ static void conf_ports(const struct ctx *c, char optname, const char *optarg,
 			bitmap_set(fwd->map, i);

 			if (optname == 't') {
-				ret = tcp_sock_init(c, af, addr, ifname, i);
+				ret = tcp_sock_init(c, addr, ifname, i);
 				if (ret == -ENFILE || ret == -EMFILE)
 					goto enfile;
 				if (!ret)
 					bound_one = true;
 			} else if (optname == 'u') {
-				ret = udp_sock_init(c, 0, af, addr, ifname, i);
+				ret = udp_sock_init(c, 0, addr, ifname, i);
 				if (ret == -ENFILE || ret == -EMFILE)
 					goto enfile;
 				if (!ret)
@ -338,9 +333,9 @@ static void conf_ports(const struct ctx *c, char optname, const char *optarg,

 			ret = 0;
 			if (optname == 't')
-				ret = tcp_sock_init(c, af, addr, ifname, i);
+				ret = tcp_sock_init(c, addr, ifname, i);
 			else if (optname == 'u')
-				ret = udp_sock_init(c, 0, af, addr, ifname, i);
+				ret = udp_sock_init(c, 0, addr, ifname, i);
 			if (ret)
 				goto bind_fail;
 		}
@ -581,10 +576,15 @@ static void conf_pasta_ns(int *netns_only, char *userns, char *netns,
 			if (pidval < 0 || pidval > INT_MAX)
 				die("Invalid PID %s", argv[optind]);

-			snprintf(netns, PATH_MAX, "/proc/%ld/ns/net", pidval);
-			if (!*userns)
-				snprintf(userns, PATH_MAX, "/proc/%ld/ns/user",
-					 pidval);
+			if (snprintf_check(netns, PATH_MAX,
+					   "/proc/%ld/ns/net", pidval))
+				die_perror("Can't build netns path");
+
+			if (!*userns) {
+				if (snprintf_check(userns, PATH_MAX,
+						   "/proc/%ld/ns/user", pidval))
+					die_perror("Can't build userns path");
+			}
 		}
 	}

@ -735,19 +735,19 @@ static unsigned int conf_ip6(unsigned int ifi, struct ip6_ctx *ip6)
 static void usage(const char *name, FILE *f, int status)
 {
 	if (strstr(name, "pasta")) {
-		fprintf(f, "Usage: %s [OPTION]... [COMMAND] [ARGS]...\n", name);
-		fprintf(f, "       %s [OPTION]... PID\n", name);
-		fprintf(f, "       %s [OPTION]... --netns [PATH|NAME]\n", name);
-		fprintf(f,
+		FPRINTF(f, "Usage: %s [OPTION]... [COMMAND] [ARGS]...\n", name);
+		FPRINTF(f, "       %s [OPTION]... PID\n", name);
+		FPRINTF(f, "       %s [OPTION]... --netns [PATH|NAME]\n", name);
+		FPRINTF(f,
 			"\n"
 			"Without PID or --netns, run the given command or a\n"
 			"default shell in a new network and user namespace, and\n"
 			"connect it via pasta.\n");
 	} else {
-		fprintf(f, "Usage: %s [OPTION]...\n", name);
+		FPRINTF(f, "Usage: %s [OPTION]...\n", name);
 	}

-	fprintf(f,
+	FPRINTF(f,
 		"\n"
 		"  -d, --debug		Be verbose\n"
 		"      --trace		Be extra verbose, implies --debug\n"
@ -764,17 +764,17 @@ static void usage(const char *name, FILE *f, int status)
 		"  --version		Show version and exit\n");

 	if (strstr(name, "pasta")) {
-		fprintf(f,
+		FPRINTF(f,
 			"  -I, --ns-ifname NAME	namespace interface name\n"
 			"    default: same interface name as external one\n");
 	} else {
-		fprintf(f,
+		FPRINTF(f,
 			"  -s, --socket PATH	UNIX domain socket path\n"
 			"    default: probe free path starting from "
 			UNIX_SOCK_PATH "\n", 1);
 	}

-	fprintf(f,
+	FPRINTF(f,
 		"  -F, --fd FD		Use FD as pre-opened connected socket\n"
 		"  -p, --pcap FILE	Log tap-facing traffic to pcap file\n"
 		"  -P, --pid FILE	Write own PID to the given file\n"
@ -805,28 +805,28 @@ static void usage(const char *name, FILE *f, int status)
 		"    can be specified multiple times\n"
 		"    a single, empty option disables DNS information\n");
 	if (strstr(name, "pasta"))
-		fprintf(f, "    default: don't use any addresses\n");
+		FPRINTF(f, "    default: don't use any addresses\n");
 	else
-		fprintf(f, "    default: use addresses from /etc/resolv.conf\n");
-	fprintf(f,
+		FPRINTF(f, "    default: use addresses from /etc/resolv.conf\n");
+	FPRINTF(f,
 		"  -S, --search LIST	Space-separated list, search domains\n"
 		"    a single, empty option disables the DNS search list\n");
 	if (strstr(name, "pasta"))
-		fprintf(f, "    default: don't use any search list\n");
+		FPRINTF(f, "    default: don't use any search list\n");
 	else
-		fprintf(f, "    default: use search list from /etc/resolv.conf\n");
+		FPRINTF(f, "    default: use search list from /etc/resolv.conf\n");

 	if (strstr(name, "pasta"))
-		fprintf(f, "  --dhcp-dns	\tPass DNS list via DHCP/DHCPv6/NDP\n");
+		FPRINTF(f, "  --dhcp-dns	\tPass DNS list via DHCP/DHCPv6/NDP\n");
 	else
-		fprintf(f, "  --no-dhcp-dns	No DNS list in DHCP/DHCPv6/NDP\n");
+		FPRINTF(f, "  --no-dhcp-dns	No DNS list in DHCP/DHCPv6/NDP\n");

 	if (strstr(name, "pasta"))
-		fprintf(f, "  --dhcp-search	Pass list via DHCP/DHCPv6/NDP\n");
+		FPRINTF(f, "  --dhcp-search	Pass list via DHCP/DHCPv6/NDP\n");
 	else
-		fprintf(f, "  --no-dhcp-search	No list in DHCP/DHCPv6/NDP\n");
+		FPRINTF(f, "  --no-dhcp-search	No list in DHCP/DHCPv6/NDP\n");

-	fprintf(f,
+	FPRINTF(f,
 		"  --map-host-loopback ADDR	Translate ADDR to refer to host\n"
 	        "    can be specified zero to two times (for IPv4 and IPv6)\n"
 		"    default: gateway address\n"
@ -836,6 +836,9 @@ static void usage(const char *name, FILE *f, int status)
 		"  --dns-forward ADDR	Forward DNS queries sent to ADDR\n"
 		"    can be specified zero to two times (for IPv4 and IPv6)\n"
 		"    default: don't forward DNS queries\n"
+		"  --dns-host ADDR	Host nameserver to direct queries to\n"
+		"    can be specified zero to two times (for IPv4 and IPv6)\n"
+		"    default: first nameserver from host's /etc/resolv.conf\n"
 		"  --no-tcp		Disable TCP protocol handler\n"
 		"  --no-udp		Disable UDP protocol handler\n"
 		"  --no-icmp		Disable ICMP/ICMPv6 protocol handler\n"
@ -843,6 +846,7 @@ static void usage(const char *name, FILE *f, int status)
 		"  --no-ndp		Disable NDP responses\n"
 		"  --no-dhcpv6		Disable DHCPv6 server\n"
 		"  --no-ra		Disable router advertisements\n"
+		"  --freebind		Bind to any address for forwarding\n"
 		"  --no-map-gw		Don't map gateway address to host\n"
 		"  -4, --ipv4-only	Enable IPv4 operation only\n"
 		"  -6, --ipv6-only	Enable IPv6 operation only\n");
@ -850,7 +854,7 @@ static void usage(const char *name, FILE *f, int status)
 	if (strstr(name, "pasta"))
 		goto pasta_opts;

-	fprintf(f,
+	FPRINTF(f,
 		"  -1, --one-off	Quit after handling one single client\n"
 		"  -t, --tcp-ports SPEC	TCP port forwarding to guest\n"
 		"    can be specified multiple times\n"
@ -881,7 +885,7 @@ static void usage(const char *name, FILE *f, int status)

 pasta_opts:

-	fprintf(f,
+	FPRINTF(f,
 		"  -t, --tcp-ports SPEC	TCP port forwarding to namespace\n"
 		"    can be specified multiple times\n"
 		"    SPEC can be:\n"
@ -915,6 +919,9 @@ pasta_opts:
 		"  -U, --udp-ns SPEC	UDP port forwarding to init namespace\n"
 		"    SPEC is as described above\n"
 		"    default: auto\n"
+		"  --host-lo-to-ns-lo	DEPRECATED:\n"
+		"			Translate host-loopback forwards to\n"
+		"			namespace loopback\n"
 		"  --userns NSPATH 	Target user namespace to join\n"
 		"  --netns PATH|NAME	Target network namespace to join\n"
 		"  --netns-only		Don't join existing user namespace\n"
@ -1189,7 +1196,11 @@ static void conf_open_files(struct ctx *c)
 	if (c->mode != MODE_PASTA && c->fd_tap == -1)
 		c->fd_tap_listen = tap_sock_unix_open(c->sock_path);

-	c->pidfile_fd = pidfile_open(c->pidfile);
+	if (*c->pidfile) {
+		c->pidfile_fd = output_file_open(c->pidfile, O_WRONLY);
+		if (c->pidfile_fd < 0)
+			die_perror("Couldn't open PID file %s", c->pidfile);
+	}
 }

 /**
@ -1262,6 +1273,7 @@ void conf(struct ctx *c, int argc, char **argv)
 		{"no-dhcpv6",	no_argument,		&c->no_dhcpv6,	1 },
 		{"no-ndp",	no_argument,		&c->no_ndp,	1 },
 		{"no-ra",	no_argument,		&c->no_ra,	1 },
+		{"freebind",	no_argument,		&c->freebind,	1 },
 		{"no-map-gw",	no_argument,		&no_map_gw,	1 },
 		{"ipv4-only",	no_argument,		NULL,		'4' },
 		{"ipv6-only",	no_argument,		NULL,		'6' },
@ -1291,6 +1303,8 @@ void conf(struct ctx *c, int argc, char **argv)
 		{"netns-only",	no_argument,		NULL,		20 },
 		{"map-host-loopback", required_argument, NULL,		21 },
 		{"map-guest-addr", required_argument,	NULL,		22 },
+		{"host-lo-to-ns-lo", no_argument, 	NULL,		23 },
+		{"dns-host",	required_argument,	NULL,		24 },
 		{ 0 },
 	};
 	const char *logname = (c->mode == MODE_PASTA) ? "pasta" : "passt";
@ -1413,9 +1427,9 @@ void conf(struct ctx *c, int argc, char **argv)

 			break;
 		case 14:
-			fprintf(stdout,
+			FPRINTF(stdout,
 				c->mode == MODE_PASTA ? "pasta " : "passt ");
-			fprintf(stdout, VERSION_BLOB);
+			FPRINTF(stdout, VERSION_BLOB);
 			exit(EXIT_SUCCESS);
 		case 15:
 			ret = snprintf(c->ip4.ifname_out,
@ -1468,6 +1482,23 @@ void conf(struct ctx *c, int argc, char **argv)
 			conf_nat(optarg, &c->ip4.map_guest_addr,
 				 &c->ip6.map_guest_addr, NULL);
 			break;
+		case 23:
+			if (c->mode != MODE_PASTA)
+				die("--host-lo-to-ns-lo is for pasta mode only");
+			c->host_lo_to_ns_lo = 1;
+			break;
+		case 24:
+			if (inet_pton(AF_INET6, optarg, &c->ip6.dns_host) &&
+			    !IN6_IS_ADDR_UNSPECIFIED(&c->ip6.dns_host))
+				break;
+
+			if (inet_pton(AF_INET, optarg, &c->ip4.dns_host) &&
+			    !IN4_IS_ADDR_UNSPECIFIED(&c->ip4.dns_host)   &&
+			    !IN4_IS_ADDR_BROADCAST(&c->ip4.dns_host))
+				break;
+
+			die("Invalid host nameserver address: %s", optarg);
+			break;
 		case 'd':
 			c->debug = 1;
 			c->quiet = 0;
--- a/contrib/apparmor/abstractions/passt
+++ b/contrib/apparmor/abstractions/passt
@ -34,6 +34,8 @@

  owner @{PROC}/@{pid}/uid_map		r,	# conf_ugid()

+  @{PROC}/sys/net/ipv4/ip_local_port_range r,	# fwd_probe_ephemeral()
+
  network netlink raw,				# nl_sock_init_do(), netlink.c

  network inet stream,				# tcp.c
--- a/contrib/selinux/passt.te
+++ b/contrib/selinux/passt.te
@ -50,6 +50,7 @@ require {
 	type passwd_file_t;

 	class netlink_route_socket { bind create nlmsg_read };
+	type sysctl_net_t;

 	class capability { sys_tty_config setuid setgid };
 	class cap_userns { setpcap sys_admin sys_ptrace };
@ -104,6 +105,8 @@ allow passt_t net_conf_t:lnk_file read;
 allow passt_t tmp_t:sock_file { create unlink write };
 allow passt_t self:netlink_route_socket { bind create nlmsg_read read write setopt };
 kernel_search_network_sysctl(passt_t)
+allow passt_t sysctl_net_t:dir search;
+allow passt_t sysctl_net_t:file { open read };

 corenet_tcp_bind_all_nodes(passt_t)
 corenet_udp_bind_all_nodes(passt_t)
--- a/contrib/selinux/pasta.te
+++ b/contrib/selinux/pasta.te
@ -196,7 +196,7 @@ allow pasta_t ifconfig_var_run_t:dir { read search watch };
 allow pasta_t self:tun_socket create;
 allow pasta_t tun_tap_device_t:chr_file { ioctl open read write };
 allow pasta_t sysctl_net_t:dir search;
-allow pasta_t sysctl_net_t:file { open write };
+allow pasta_t sysctl_net_t:file { open read write };
 allow pasta_t kernel_t:system module_request;

 allow pasta_t nsfs_t:file read;
--- a/dhcpv6.c
+++ b/dhcpv6.c
@ -296,47 +296,42 @@ static struct opt_hdr *dhcpv6_opt(const struct pool *p, size_t *offset,
 static struct opt_hdr *dhcpv6_ia_notonlink(const struct pool *p,
 					   struct in6_addr *la)
 {
+	int ia_types[2] = { OPT_IA_NA, OPT_IA_TA }, *ia_type;
+	const struct opt_ia_addr *opt_addr;
 	char buf[INET6_ADDRSTRLEN];
 	struct in6_addr req_addr;
 	const struct opt_hdr *h;
 	struct opt_hdr *ia;
 	size_t offset;
-	int ia_type;

-	ia_type = OPT_IA_NA;
-ia_ta:
-	offset = 0;
-	while ((ia = dhcpv6_opt(p, &offset, ia_type))) {
-		if (ntohs(ia->l) < OPT_VSIZE(ia_na))
-			return NULL;
-
-		offset += sizeof(struct opt_ia_na);
-
-		while ((h = dhcpv6_opt(p, &offset, OPT_IAAADR))) {
-			const struct opt_ia_addr *opt_addr;
-
-			if (ntohs(h->l) != OPT_VSIZE(ia_addr))
+	foreach(ia_type, ia_types) {
+		offset = 0;
+		while ((ia = dhcpv6_opt(p, &offset, *ia_type))) {
+			if (ntohs(ia->l) < OPT_VSIZE(ia_na))
 				return NULL;

-			opt_addr = (const struct opt_ia_addr *)h;
-			req_addr = opt_addr->addr;
-			if (!IN6_ARE_ADDR_EQUAL(la, &req_addr)) {
-				info("DHCPv6: requested address %s not on link",
-				     inet_ntop(AF_INET6, &req_addr,
-					       buf, sizeof(buf)));
-				return ia;
-			}
+			offset += sizeof(struct opt_ia_na);

-			offset += sizeof(struct opt_ia_addr);
+			while ((h = dhcpv6_opt(p, &offset, OPT_IAAADR))) {
+				if (ntohs(h->l) != OPT_VSIZE(ia_addr))
+					return NULL;
+
+				opt_addr = (const struct opt_ia_addr *)h;
+				req_addr = opt_addr->addr;
+				if (!IN6_ARE_ADDR_EQUAL(la, &req_addr))
+					goto err;
+
+				offset += sizeof(struct opt_ia_addr);
+			}
 		}
 	}

-	if (ia_type == OPT_IA_NA) {
-		ia_type = OPT_IA_TA;
-		goto ia_ta;
-	}
-
 	return NULL;
+
+err:
+	info("DHCPv6: requested address %s not on link",
+	     inet_ntop(AF_INET6, &req_addr, buf, sizeof(buf)));
+	return ia;
 }

 /**
@ -428,11 +423,11 @@ search:
 int dhcpv6(struct ctx *c, const struct pool *p,
 	   const struct in6_addr *saddr, const struct in6_addr *daddr)
 {
-	struct opt_hdr *ia, *bad_ia, *client_id;
-	const struct opt_hdr *server_id;
+	const struct opt_hdr *client_id, *server_id, *ia;
 	const struct in6_addr *src;
 	const struct msg_hdr *mh;
 	const struct udphdr *uh;
+	struct opt_hdr *bad_ia;
 	size_t mlen, n;

 	uh = packet_get(p, 0, 0, sizeof(*uh), &mlen);
--- a/flow.c
+++ b/flow.c
@ -283,28 +283,23 @@ void flow_log_(const struct flow_common *f, int pri, const char *fmt, ...)
 	       "Flow %u (%s): %s", flow_idx(f), type_or_state, msg);
 }

-/**
- * flow_set_state() - Change flow's state
- * @f:		Flow changing state
- * @state:	New state
+/** flow_log_details_() - Log the details of a flow
+ * @f:		flow to log
+ * @pri:	Log priority
+ * @state:	State to log details according to
+ *
+ * Logs the details of the flow: endpoints, interfaces, type etc.
 */
-static void flow_set_state(struct flow_common *f, enum flow_state state)
+void flow_log_details_(const struct flow_common *f, int pri,
+		       enum flow_state state)
 {
 	char estr0[INANY_ADDRSTRLEN], fstr0[INANY_ADDRSTRLEN];
 	char estr1[INANY_ADDRSTRLEN], fstr1[INANY_ADDRSTRLEN];
 	const struct flowside *ini = &f->side[INISIDE];
 	const struct flowside *tgt = &f->side[TGTSIDE];
-	uint8_t oldstate = f->state;

-	ASSERT(state < FLOW_NUM_STATES);
-	ASSERT(oldstate < FLOW_NUM_STATES);
-
-	f->state = state;
-	flow_log_(f, LOG_DEBUG, "%s -> %s", flow_state_str[oldstate],
-		  FLOW_STATE(f));
-
-	if (MAX(state, oldstate) >= FLOW_STATE_TGT)
-		flow_log_(f, LOG_DEBUG,
+	if (state >= FLOW_STATE_TGT)
+		flow_log_(f, pri,
 			  "%s [%s]:%hu -> [%s]:%hu => %s [%s]:%hu -> [%s]:%hu",
 			  pif_name(f->pif[INISIDE]),
 			  inany_ntop(&ini->eaddr, estr0, sizeof(estr0)),
@ -316,8 +311,8 @@ static void flow_set_state(struct flow_common *f, enum flow_state state)
 			  tgt->oport,
 			  inany_ntop(&tgt->eaddr, estr1, sizeof(estr1)),
 			  tgt->eport);
-	else if (MAX(state, oldstate) >= FLOW_STATE_INI)
-		flow_log_(f, LOG_DEBUG, "%s [%s]:%hu -> [%s]:%hu => ?",
+	else if (state >= FLOW_STATE_INI)
+		flow_log_(f, pri, "%s [%s]:%hu -> [%s]:%hu => ?",
 			  pif_name(f->pif[INISIDE]),
 			  inany_ntop(&ini->eaddr, estr0, sizeof(estr0)),
 			  ini->eport,
@ -325,6 +320,25 @@ static void flow_set_state(struct flow_common *f, enum flow_state state)
 			  ini->oport);
 }

+/**
+ * flow_set_state() - Change flow's state
+ * @f:		Flow changing state
+ * @state:	New state
+ */
+static void flow_set_state(struct flow_common *f, enum flow_state state)
+{
+	uint8_t oldstate = f->state;
+
+	ASSERT(state < FLOW_NUM_STATES);
+	ASSERT(oldstate < FLOW_NUM_STATES);
+
+	f->state = state;
+	flow_log_(f, LOG_DEBUG, "%s -> %s", flow_state_str[oldstate],
+		  FLOW_STATE(f));
+
+	flow_log_details_(f, LOG_DEBUG, MAX(state, oldstate));
+}
+
 /**
 * flow_initiate_() - Move flow to INI, setting pif[INISIDE]
 * @flow:	Flow to change state
@ -697,7 +711,7 @@ static flow_sidx_t flowside_lookup(const struct ctx *c, uint8_t proto,
 	       !(FLOW_PROTO(&flow->f) == proto &&
 		 flow->f.pif[sidx.sidei] == pif &&
 		 flowside_eq(&flow->f.side[sidx.sidei], side)))
-		b = (b + 1) % FLOW_HASH_SIZE;
+		b = mod_sub(b, 1, FLOW_HASH_SIZE);

 	return flow_hashtab[b];
 }
@ -832,7 +846,8 @@ void flow_defer_handler(const struct ctx *c, const struct timespec *now)
 				closed = icmp_ping_timer(c, &flow->ping, now);
 			break;
 		case FLOW_UDP:
-			if (timer)
+			closed = udp_flow_defer(&flow->udp);
+			if (!closed && timer)
 				closed = udp_flow_timer(c, &flow->udp, now);
 			break;
 		default:
--- a/flow.h
+++ b/flow.h
@ -264,4 +264,11 @@ void flow_log_(const struct flow_common *f, int pri, const char *fmt, ...)
 			flow_dbg((f), __VA_ARGS__);			\
 	} while (0)

+void flow_log_details_(const struct flow_common *f, int pri,
+		       enum flow_state state);
+#define flow_log_details(f_, pri) \
+	flow_log_details_(&((f_)->f), (pri), (f_)->f.state)
+#define flow_dbg_details(f_)	flow_log_details((f_), LOG_DEBUG)
+#define flow_err_details(f_)	flow_log_details((f_), LOG_ERR)
+
 #endif /* FLOW_H */
--- a/flow_table.h
+++ b/flow_table.h
@ -110,7 +110,7 @@ static inline const struct flowside *flowside_at_sidx(flow_sidx_t sidx)
 	const union flow *flow = flow_at_sidx(sidx);

 	if (!flow)
-		return PIF_NONE;
+		return NULL;

 	return &flow->f.side[sidx.sidei];
 }
--- a/fwd.c
+++ b/fwd.c
@ -75,8 +75,8 @@ void fwd_probe_ephemeral(void)
 	if (*end || errno)
 		goto parse_err;

-	if (min < 0 || min >= NUM_PORTS ||
-	    max < 0 || max >= NUM_PORTS)
+	if (min < 0 || min >= (long)NUM_PORTS ||
+	    max < 0 || max >= (long)NUM_PORTS)
 		goto parse_err;

 	fwd_ephemeral_min = min;
@ -447,20 +447,35 @@ uint8_t fwd_nat_from_host(const struct ctx *c, uint8_t proto,
 	    (proto == IPPROTO_TCP || proto == IPPROTO_UDP)) {
 		/* spliceable */

-		/* Preserve the specific loopback adddress used, but let the
-		 * kernel pick a source port on the target side
+		/* The traffic will go over the guest's 'lo' interface, but by
+		 * default use its external address, so we don't inadvertently
+		 * expose services that listen only on the guest's loopback
+		 * address.  That can be overridden by --host-lo-to-ns-lo which
+		 * will instead forward to the loopback address in the guest.
+		 *
+		 * In either case, let the kernel pick the source address to
+		 * match.
 		 */
-		tgt->oaddr = ini->eaddr;
+		if (inany_v4(&ini->eaddr)) {
+			if (c->host_lo_to_ns_lo)
+				tgt->eaddr = inany_loopback4;
+			else
+				tgt->eaddr = inany_from_v4(c->ip4.addr_seen);
+			tgt->oaddr = inany_any4;
+		} else {
+			if (c->host_lo_to_ns_lo)
+				tgt->eaddr = inany_loopback6;
+			else
+				tgt->eaddr.a6 = c->ip6.addr_seen;
+			tgt->oaddr = inany_any6;
+		}
+
+		/* Let the kernel pick source port */
 		tgt->oport = 0;
 		if (proto == IPPROTO_UDP)
 			/* But for UDP preserve the source port */
 			tgt->oport = ini->eport;

-		if (inany_v4(&ini->eaddr))
-			tgt->eaddr = inany_loopback4;
-		else
-			tgt->eaddr = inany_loopback6;
-
 		return PIF_SPLICE;
 	}

--- a/inany.c
+++ b/inany.c
@ -36,3 +36,23 @@ const char *inany_ntop(const union inany_addr *src, char *dst, socklen_t size)

 	return inet_ntop(AF_INET6, &src->a6, dst, size);
 }
+
+/** inany_pton - Parse an IPv[46] address from text format
+ * @src:	IPv[46] address
+ * @dst:	output buffer, filled with parsed address
+ *
+ * Return: On success, 1, if no parseable address is found, 0
+ */
+int inany_pton(const char *src, union inany_addr *dst)
+{
+	if (inet_pton(AF_INET, src, &dst->v4mapped.a4)) {
+		memset(&dst->v4mapped.zero, 0, sizeof(dst->v4mapped.zero));
+		memset(&dst->v4mapped.one, 0xff, sizeof(dst->v4mapped.one));
+		return 1;
+	}
+
+	if (inet_pton(AF_INET6, src, &dst->a6))
+		return 1;
+
+	return 0;
+}
--- a/inany.h
+++ b/inany.h
@ -270,5 +270,6 @@ static inline void inany_siphash_feed(struct siphash_state *state,
 #define INANY_ADDRSTRLEN	MAX(INET_ADDRSTRLEN, INET6_ADDRSTRLEN)

 const char *inany_ntop(const union inany_addr *src, char *dst, socklen_t size);
+int inany_pton(const char *src, union inany_addr *dst);

 #endif /* INANY_H */
--- a/linux_dep.h
+++ b/linux_dep.h
@ -0,0 +1,144 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later
+ * Copyright Red Hat
+ *
+ * Declarations for Linux specific dependencies
+ */
+
+#ifndef LINUX_DEP_H
+#define LINUX_DEP_H
+
+/* struct tcp_info_linux - Information from Linux TCP_INFO getsockopt()
+ *
+ * Largely derived from include/linux/tcp.h in the Linux kernel
+ *
+ * Some fields returned by TCP_INFO have been there for ages and are shared with
+ * BSD.  struct tcp_info from netinet/tcp.h has only those fields.  There are
+ * also many Linux specific extensions to the structure, which are only found
+ * in the linux/tcp.h version of struct tcp_info.
+ *
+ * We want to use some of those extension fields, when available.  We can test
+ * for availability in the runtime kernel using the length returned from
+ * getsockopt(). However, we won't necessarily be compiled against the same
+ * kernel headers as we'll run with, so compiling directly against linux/tcp.h
+ * means wrapping every field access in an #ifdef whose #else does the same
+ * thing as when the field is missing at runtime.  This rapidly gets messy.
+ *
+ * Instead we define here struct tcp_info_linux which includes all the Linux
+ * extensions that we want to use.  This is taken from v6.11 of the kernel.
+ */
+struct tcp_info_linux {
+	uint8_t		tcpi_state;
+	uint8_t		tcpi_ca_state;
+	uint8_t		tcpi_retransmits;
+	uint8_t		tcpi_probes;
+	uint8_t		tcpi_backoff;
+	uint8_t		tcpi_options;
+	uint8_t		tcpi_snd_wscale : 4, tcpi_rcv_wscale : 4;
+	uint8_t		tcpi_delivery_rate_app_limited:1, tcpi_fastopen_client_fail:2;
+
+	uint32_t	tcpi_rto;
+	uint32_t	tcpi_ato;
+	uint32_t	tcpi_snd_mss;
+	uint32_t	tcpi_rcv_mss;
+
+	uint32_t	tcpi_unacked;
+	uint32_t	tcpi_sacked;
+	uint32_t	tcpi_lost;
+	uint32_t	tcpi_retrans;
+	uint32_t	tcpi_fackets;
+
+	/* Times. */
+	uint32_t	tcpi_last_data_sent;
+	uint32_t	tcpi_last_ack_sent;
+	uint32_t	tcpi_last_data_recv;
+	uint32_t	tcpi_last_ack_recv;
+
+	/* Metrics. */
+	uint32_t	tcpi_pmtu;
+	uint32_t	tcpi_rcv_ssthresh;
+	uint32_t	tcpi_rtt;
+	uint32_t	tcpi_rttvar;
+	uint32_t	tcpi_snd_ssthresh;
+	uint32_t	tcpi_snd_cwnd;
+	uint32_t	tcpi_advmss;
+	uint32_t	tcpi_reordering;
+
+	uint32_t	tcpi_rcv_rtt;
+	uint32_t	tcpi_rcv_space;
+
+	uint32_t	tcpi_total_retrans;
+
+	/* Linux extensions */
+	uint64_t	tcpi_pacing_rate;
+	uint64_t	tcpi_max_pacing_rate;
+	uint64_t	tcpi_bytes_acked;    /* RFC4898 tcpEStatsAppHCThruOctetsAcked */
+	uint64_t	tcpi_bytes_received; /* RFC4898 tcpEStatsAppHCThruOctetsReceived */
+	uint32_t	tcpi_segs_out;	     /* RFC4898 tcpEStatsPerfSegsOut */
+	uint32_t	tcpi_segs_in;	     /* RFC4898 tcpEStatsPerfSegsIn */
+
+	uint32_t	tcpi_notsent_bytes;
+	uint32_t	tcpi_min_rtt;
+	uint32_t	tcpi_data_segs_in;	/* RFC4898 tcpEStatsDataSegsIn */
+	uint32_t	tcpi_data_segs_out;	/* RFC4898 tcpEStatsDataSegsOut */
+
+	uint64_t	tcpi_delivery_rate;
+
+	uint64_t	tcpi_busy_time;      /* Time (usec) busy sending data */
+	uint64_t	tcpi_rwnd_limited;   /* Time (usec) limited by receive window */
+	uint64_t	tcpi_sndbuf_limited; /* Time (usec) limited by send buffer */
+
+	uint32_t	tcpi_delivered;
+	uint32_t	tcpi_delivered_ce;
+
+	uint64_t	tcpi_bytes_sent;     /* RFC4898 tcpEStatsPerfHCDataOctetsOut */
+	uint64_t	tcpi_bytes_retrans;  /* RFC4898 tcpEStatsPerfOctetsRetrans */
+	uint32_t	tcpi_dsack_dups;     /* RFC4898 tcpEStatsStackDSACKDups */
+	uint32_t	tcpi_reord_seen;     /* reordering events seen */
+
+	uint32_t	tcpi_rcv_ooopack;    /* Out-of-order packets received */
+
+	uint32_t	tcpi_snd_wnd;	     /* peer's advertised receive window after
+					      * scaling (bytes)
+					      */
+	uint32_t	tcpi_rcv_wnd;	     /* local advertised receive window after
+					      * scaling (bytes)
+					      */
+
+	uint32_t 	tcpi_rehash;         /* PLB or timeout triggered rehash attempts */
+
+	uint16_t	tcpi_total_rto;	/* Total number of RTO timeouts, including
+					 * SYN/SYN-ACK and recurring timeouts.
+					 */
+	uint16_t	tcpi_total_rto_recoveries;	/* Total number of RTO
+							 * recoveries, including any
+							 * unfinished recovery.
+							 */
+	uint32_t	tcpi_total_rto_time;	/* Total time spent in RTO recoveries
+						 * in milliseconds, including any
+						 * unfinished recovery.
+						 */
+};
+
+#include <linux/falloc.h>
+
+#ifndef FALLOC_FL_COLLAPSE_RANGE
+#define FALLOC_FL_COLLAPSE_RANGE	0x08
+#endif
+
+#include <linux/close_range.h>
+
+/* glibc < 2.34 and musl as of 1.2.5 need these */
+#ifndef SYS_close_range
+#define SYS_close_range		436
+#endif
+#ifndef CLOSE_RANGE_UNSHARE	/* Linux kernel < 5.9 */
+#define CLOSE_RANGE_UNSHARE	(1U << 1)
+#endif
+
+__attribute__ ((weak))
+/* cppcheck-suppress funcArgNamesDifferent */
+int close_range(unsigned int first, unsigned int last, int flags) {
+	return syscall(SYS_close_range, first, last, flags);
+}
+
+#endif /* LINUX_DEP_H */
--- a/log.c
+++ b/log.c
@ -26,6 +26,7 @@
 #include <stdarg.h>
 #include <sys/socket.h>

+#include "linux_dep.h"
 #include "log.h"
 #include "util.h"
 #include "passt.h"
@ -92,7 +93,6 @@ const char *logfile_prefix[] = {
 	"         ",		/* LOG_DEBUG */
 };

-#ifdef FALLOC_FL_COLLAPSE_RANGE
 /**
 * logfile_rotate_fallocate() - Write header, set log_written after fallocate()
 * @fd:		Log file descriptor
@ -126,7 +126,6 @@ static void logfile_rotate_fallocate(int fd, const struct timespec *now)

 	log_written -= log_cut_size;
 }
-#endif /* FALLOC_FL_COLLAPSE_RANGE */

 /**
 * logfile_rotate_move() - Fallback: move recent entries toward start, then cut
@ -198,21 +197,17 @@ out:
 *
 * Return: 0 on success, negative error code on failure
 *
- * #syscalls fcntl
- *
- * fallocate() passed as EXTRA_SYSCALL only if FALLOC_FL_COLLAPSE_RANGE is there
+ * #syscalls fcntl fallocate
 */
 static int logfile_rotate(int fd, const struct timespec *now)
 {
 	if (fcntl(fd, F_SETFL, O_RDWR /* Drop O_APPEND: explicit lseek() */))
 		return -errno;

-#ifdef FALLOC_FL_COLLAPSE_RANGE
 	/* Only for Linux >= 3.15, extent-based ext4 or XFS, glibc >= 2.18 */
 	if (!fallocate(fd, FALLOC_FL_COLLAPSE_RANGE, 0, log_cut_size))
 		logfile_rotate_fallocate(fd, now);
 	else
-#endif
 		logfile_rotate_move(fd, now);

 	if (fcntl(fd, F_SETFL, O_RDWR | O_APPEND))
@ -224,19 +219,23 @@ static int logfile_rotate(int fd, const struct timespec *now)
 /**
 * logfile_write() - Write entry to log file, trigger rotation if full
 * @newline:	Append newline at the end of the message, if missing
+ * @cont:	Continuation of a previous message, on the same line
 * @pri:	Facility and level map, same as priority for vsyslog()
 * @now:	Timestamp
 * @format:	Same as vsyslog() format
 * @ap:		Same as vsyslog() ap
 */
-static void logfile_write(bool newline, int pri, const struct timespec *now,
+static void logfile_write(bool newline, bool cont, int pri,
+			  const struct timespec *now,
 			  const char *format, va_list ap)
 {
 	char buf[BUFSIZ];
-	int n;
+	int n = 0;

-	n  = logtime_fmt(buf, BUFSIZ, now);
-	n += snprintf(buf + n, BUFSIZ - n, ": %s", logfile_prefix[pri]);
+	if (!cont) {
+		n += logtime_fmt(buf, BUFSIZ, now);
+		n += snprintf(buf + n, BUFSIZ - n, ": %s", logfile_prefix[pri]);
+	}

 	n += vsnprintf(buf + n, BUFSIZ - n, format, ap);

@ -270,7 +269,7 @@ void vlogmsg(bool newline, bool cont, int pri, const char *format, va_list ap)
 		char timestr[LOGTIME_STRLEN];

 		logtime_fmt(timestr, sizeof(timestr), now);
-		fprintf(stderr, "%s: ", timestr);
+		FPRINTF(stderr, "%s: ", timestr);
 	}

 	if ((log_mask & LOG_MASK(LOG_PRI(pri))) || !log_conf_parsed) {
@ -278,7 +277,7 @@ void vlogmsg(bool newline, bool cont, int pri, const char *format, va_list ap)

 		va_copy(ap2, ap); /* Don't clobber ap, we need it again */
 		if (log_file != -1)
-			logfile_write(newline, pri, now, format, ap2);
+			logfile_write(newline, cont, pri, now, format, ap2);
 		else if (!(log_mask & LOG_MASK(LOG_DEBUG)))
 			passt_vsyslog(newline, pri, format, ap2);

@ -289,7 +288,7 @@ void vlogmsg(bool newline, bool cont, int pri, const char *format, va_list ap)
 	    (log_stderr && (log_mask & LOG_MASK(LOG_PRI(pri))))) {
 		(void)vfprintf(stderr, format, ap);
 		if (newline && format[strlen(format)] != '\n')
-			fprintf(stderr, "\n");
+			FPRINTF(stderr, "\n");
 	}
 }

@ -395,7 +394,7 @@ void passt_vsyslog(bool newline, int pri, const char *format, va_list ap)
 		n += snprintf(buf + n, BUFSIZ - n, "\n");

 	if (log_sock >= 0 && send(log_sock, buf, n, 0) != n && log_stderr)
-		fprintf(stderr, "Failed to send %i bytes to syslog\n", n);
+		FPRINTF(stderr, "Failed to send %i bytes to syslog\n", n);
 }

 /**
@ -412,8 +411,7 @@ void logfile_init(const char *name, const char *path, size_t size)
 	if (readlink("/proc/self/exe", exe, PATH_MAX - 1) < 0)
 		die_perror("Failed to read own /proc/self/exe link");

-	log_file = open(path, O_CREAT | O_TRUNC | O_APPEND | O_RDWR | O_CLOEXEC,
-			S_IRUSR | S_IWUSR);
+	log_file = output_file_open(path, O_APPEND | O_RDWR);
 	if (log_file == -1)
 		die_perror("Couldn't open log file %s", path);

@ -429,4 +427,3 @@ void logfile_init(const char *name, const char *path, size_t size)
 	/* For FALLOC_FL_COLLAPSE_RANGE: VFS block size can be up to one page */
 	log_cut_size = ROUND_UP(log_size * LOGFILE_CUT_RATIO / 100, PAGE_SIZE);
 }
-
--- a/ndp.c
+++ b/ndp.c
@ -234,8 +234,8 @@ int ndp(struct ctx *c, const struct icmp6hdr *ih, const struct in6_addr *saddr,
 		return 1;

 	if (ih->icmp6_type == NS) {
-		struct ndp_ns *ns = packet_get(p, 0, 0, sizeof(struct ndp_ns),
-					       NULL);
+		const struct ndp_ns *ns =
+			packet_get(p, 0, 0, sizeof(struct ndp_ns), NULL);

 		if (!ns)
 			return -1;
--- a/netlink.c
+++ b/netlink.c
@ -353,7 +353,7 @@ unsigned int nl_get_ext_if(int s, sa_family_t af)
 */
 bool nl_route_get_def_multipath(struct rtattr *rta, void *gw)
 {
-	size_t nh_len = RTA_PAYLOAD(rta);
+	int nh_len = RTA_PAYLOAD(rta);
 	struct rtnexthop *rtnh;
 	bool found = false;
 	int hops = -1;
@ -582,7 +582,7 @@ int nl_route_dup(int s_src, unsigned int ifi_src,

 				*(unsigned int *)RTA_DATA(rta) = ifi_dst;
 			} else if (rta->rta_type == RTA_MULTIPATH) {
-				size_t nh_len = RTA_PAYLOAD(rta);
+				int nh_len = RTA_PAYLOAD(rta);
 				struct rtnexthop *rtnh;

 				for (rtnh = (struct rtnexthop *)RTA_DATA(rta);
--- a/passt.1
+++ b/passt.1
@ -95,7 +95,7 @@ detached PID namespace after starting, because the PID itself cannot change.
 Default is to fork into background.

 .TP
-.BR \-e ", " \-\-stderr
+.BR \-e ", " \-\-stderr " " (DEPRECATED)
 This option has no effect, and is maintained for compatibility purposes only.

 Note that this configuration option is \fBdeprecated\fR and will be removed in a
@ -249,10 +249,19 @@ the host.
 .TP
 .BR \-\-dns-forward " " \fIaddr
 Map \fIaddr\fR (IPv4 or IPv6) as seen from guest or namespace to the
-first configured DNS resolver (with corresponding IP version). Maps
-only UDP and TCP traffic to port 53 or port 853.  Replies are
-translated back with a reverse mapping.  This option can be specified
-zero to two times (once for IPv4, once for IPv6).
+nameserver (with corresponding IP version) specified by the
+\fB\-\-dns-host\fR option. Maps only UDP and TCP traffic to port 53 or
+port 853.  Replies are translated back with a reverse mapping.  This
+option can be specified zero to two times (once for IPv4, once for
+IPv6).
+
+.TP
+.BR \-\-dns-host " " \fIaddr
+Configure the host nameserver which guest or namespace queries to the
+\fB\-\-dns-forward\fR address will be redirected to. This option can
+be specified zero to two times (once for IPv4, once for IPv6).
+By default, the first nameserver from the host's
+\fI/etc/resolv.conf\fR is used.

 .TP
 .BR \-S ", " \-\-search " " \fIlist
@ -327,6 +336,16 @@ namespace will be silently dropped.
 Disable Router Advertisements. Router Solicitations coming from guest or target
 namespace will be ignored.

+.TP
+.BR \-\-freebind
+Allow any binding address to be specified for \fB-t\fR and \fB-u\fR
+options.  Usually binding addresses must be addresses currently
+configured on the host.  With \fB\-\-freebind\fR, the
+\fBIP_FREEBIND\fR or \fBIPV6_FREEBIND\fR socket option is enabled
+allowing any address to be used.  This is typically used to bind
+addresses which might be configured on the host in future, at which
+point the forwarding will immediately start operating.
+
 .TP
 .BR \-\-map-host-loopback " " \fIaddr
 Translate \fIaddr\fR to refer to the host. Packets from the guest to
@ -586,6 +605,13 @@ Configure UDP port forwarding from target namespace to init namespace.

 Default is \fBauto\fR.

+.TP
+.BR \-\-host-lo-to-ns-lo " " (DEPRECATED)
+If specified, connections forwarded with \fB\-t\fR and \fB\-u\fR from
+the host's loopback address will appear on the loopback address in the
+guest as well.  Without this option such forwarded packets will appear
+to come from the guest's public address.
+
 .TP
 .BR \-\-userns " " \fIspec
 Target user namespace to join, as a path. If PID is given, without this option,
@ -863,38 +889,41 @@ root@localhost's password:

 .SH NOTES

-.SS Handling of traffic with local destination and source addresses
+.SS Handling of traffic with loopback destination and source addresses

-Both \fBpasst\fR and \fBpasta\fR can bind on ports with a local address,
-depending on the configuration. Local destination or source addresses need to be
-changed before packets are delivered to the guest or target namespace: most
-operating systems would drop packets received from non-loopback interfaces with
-local addresses, and it would also be impossible for guest or target namespace
-to route answers back.
+Both \fBpasst\fR and \fBpasta\fR can bind on ports with a loopback
+address (127.0.0.0/8 or ::1), depending on the configuration. Loopback
+destination or source addresses need to be changed before packets are
+delivered to the guest or target namespace: most operating systems
+would drop packets received with loopback addresses on non-loopback
+interfaces, and it would also be impossible for guest or target
+namespace to route answers back.

-For convenience, and somewhat arbitrarily, the source address on these packets
-is translated to the address of the default IPv4 or IPv6 gateway (if any) --
-this is known to be an existing, valid address on the same subnet.
+For convenience, the source address on these packets is translated to
+the address specified by the \fB\-\-map-host-loopback\fR option (with
+some exceptions in pasta mode, see next section below).  If not
+specified, this defaults, somewhat arbitrarily, to the address of the
+default IPv4 or IPv6 gateway (if any) -- this is known to be an
+existing, valid address on the same subnet.  If \fB\-\-no-map-gw\fR or
+\fB\-\-map-host-loopback none\fR is specified, this translation is
+disabled and packets with loopback addresses are simply dropped.

-Loopback destination addresses are instead translated to the observed external
-address of the guest or target namespace. For IPv6 packets, if usage of a
-link-local address by guest or namespace has ever been observed, and the
-original destination address is also a link-local address, the observed
-link-local address is used. Otherwise, the observed global address is used. For
-both IPv4 and IPv6, if no addresses have been seen yet, the configured addresses
-will be used instead.
+Loopback destination addresses are translated to the observed external
+address of the guest or target namespace. For IPv6, the observed
+link-local address is used if the translated source address is
+link-local, otherwise the observed global address is used. For both
+IPv4 and IPv6, if no addresses have been seen yet, the configured
+addresses will be used instead.

 For example, if \fBpasst\fR or \fBpasta\fR receive a connection from 127.0.0.1,
 with destination 127.0.0.10, and the default IPv4 gateway is 192.0.2.1, while
 the last observed source address from guest or namespace is 192.0.2.2, this will
 be translated to a connection from 192.0.2.1 to 192.0.2.2.

-Similarly, for traffic coming from guest or namespace, packets with destination
-address corresponding to the default gateway will have their destination address
-translated to a loopback address, if and only if a packet, in the opposite
-direction, with a loopback destination or source address, port-wise matching for
-UDP, or connection-wise for TCP, has been recently forwarded to guest or
-namespace. This behaviour can be disabled with \-\-no\-map\-gw.
+Similarly, for traffic coming from guest or namespace, packets with
+destination address corresponding to the \fB\-\-map-host-loopback\fR
+address will have their destination address translated to a loopback
+address.

 .SS Handling of local traffic in pasta

@ -910,8 +939,15 @@ and the new socket using the \fBsplice\fR(2) system call, and for UDP, a pair
 of \fBrecvmmsg\fR(2) and \fBsendmmsg\fR(2) system calls deals with packet
 transfers.

-This bypass only applies to local connections and traffic, because it's not
-possible to bind sockets to foreign addresses.
+Because it's not possible to bind sockets to foreign addresses, this
+bypass only applies to local connections and traffic.  It also means
+that the address translation differs slightly from passt mode.
+Connections from loopback to loopback on the host will appear to come
+from the target namespace's public address within the guest, unless
+\fB\-\-host-lo-to-ns-lo\fR is specified, in which case they will
+appear to come from loopback in the namespace as well.  The latter
+behaviour used to be the default, but is usually undesirable, since it
+can unintentionally expose namespace local services to the host.

 .SS Binding to low numbered ports (well-known or system ports, up to 1023)

--- a/passt.c
+++ b/passt.c
@ -207,7 +207,8 @@ int main(int argc, char **argv)
 	struct timespec now;
 	struct sigaction sa;

-	clock_gettime(CLOCK_MONOTONIC, &log_start);
+	if (clock_gettime(CLOCK_MONOTONIC, &log_start))
+		die_perror("Failed to get CLOCK_MONOTONIC time");

 	arch_avx2_exec(argv);

@ -265,7 +266,8 @@ int main(int argc, char **argv)

 	secret_init(&c);

-	clock_gettime(CLOCK_MONOTONIC, &now);
+	if (clock_gettime(CLOCK_MONOTONIC, &now))
+		die_perror("Failed to get CLOCK_MONOTONIC time");

 	flow_init();

@ -307,13 +309,15 @@ int main(int argc, char **argv)
 	timer_init(&c, &now);

 loop:
-	/* NOLINTNEXTLINE(bugprone-branch-clone): intervals can be the same */
+	/* NOLINTBEGIN(bugprone-branch-clone): intervals can be the same */
 	/* cppcheck-suppress [duplicateValueTernary, unmatchedSuppression] */
 	nfds = epoll_wait(c.epollfd, events, EPOLL_EVENTS, TIMER_INTERVAL);
+	/* NOLINTEND(bugprone-branch-clone) */
 	if (nfds == -1 && errno != EINTR)
 		die_perror("epoll_wait() failed in main loop");

-	clock_gettime(CLOCK_MONOTONIC, &now);
+	if (clock_gettime(CLOCK_MONOTONIC, &now))
+		err_perror("Failed to get CLOCK_MONOTONIC time");

 	for (i = 0; i < nfds; i++) {
 		union epoll_ref ref = *((union epoll_ref *)&events[i].data.u64);
--- a/passt.h
+++ b/passt.h
@ -225,6 +225,8 @@ struct ip6_ctx {
 * @no_dhcpv6:		Disable DHCPv6 server
 * @no_ndp:		Disable NDP handler altogether
 * @no_ra:		Disable router advertisements
+ * @host_lo_to_ns_lo:	Map host loopback addresses to ns loopback addresses
+ * @freebind:		Allow binding of non-local addresses for forwarding
 * @low_wmem:		Low probed net.core.wmem_max
 * @low_rmem:		Low probed net.core.rmem_max
 */
@ -284,6 +286,8 @@ struct ctx {
 	int no_dhcpv6;
 	int no_ndp;
 	int no_ra;
+	int host_lo_to_ns_lo;
+	int freebind;

 	int low_wmem;
 	int low_rmem;
--- a/pasta.c
+++ b/pasta.c
@ -102,7 +102,9 @@ static int pasta_wait_for_ns(void *arg)
 	int flags = O_RDONLY | O_CLOEXEC;
 	char ns[PATH_MAX];

-	snprintf(ns, PATH_MAX, "/proc/%i/ns/net", pasta_child_pid);
+	if (snprintf_check(ns, PATH_MAX, "/proc/%i/ns/net", pasta_child_pid))
+		die_perror("Can't build netns path");
+
 	do {
 		while ((c->pasta_netns_fd = open(ns, flags)) < 0) {
 			if (errno != ENOENT)
@ -239,8 +241,11 @@ void pasta_start_ns(struct ctx *c, uid_t uid, gid_t gid,
 		c->quiet = 1;

 	/* Configure user and group mappings */
-	snprintf(uidmap, BUFSIZ, "0 %u 1", uid);
-	snprintf(gidmap, BUFSIZ, "0 %u 1", gid);
+	if (snprintf_check(uidmap, BUFSIZ, "0 %u 1", uid))
+		die_perror("Can't build uidmap");
+
+	if (snprintf_check(gidmap, BUFSIZ, "0 %u 1", gid))
+		die_perror("Can't build gidmap");

 	if (write_file("/proc/self/uid_map", uidmap) ||
 	    write_file("/proc/self/setgroups", "deny") ||
@ -427,12 +432,12 @@ static int pasta_netns_quit_timer(void)
 */
 void pasta_netns_quit_init(const struct ctx *c)
 {
-	union epoll_ref ref = { .type = EPOLL_TYPE_NSQUIT_INOTIFY };
 	struct epoll_event ev = { .events = EPOLLIN };
 	int flags = O_NONBLOCK | O_CLOEXEC;
 	struct statfs s = { 0 };
 	bool try_inotify = true;
 	int fd = -1, dir_fd;
+	union epoll_ref ref;

 	if (c->mode != MODE_PASTA || c->no_netns_quit || !*c->netns_base)
 		return;
@ -463,6 +468,7 @@ void pasta_netns_quit_init(const struct ctx *c)
 		ref.type = EPOLL_TYPE_NSQUIT_TIMER;
 	} else {
 		close(dir_fd);
+		ref.type = EPOLL_TYPE_NSQUIT_INOTIFY;
 	}

 	if (fd > FD_REF_MAX)
--- a/pcap.c
+++ b/pcap.c
@ -86,9 +86,8 @@ static void pcap_frame(const struct iovec *iov, size_t iovcnt,
 		.caplen = l2len,
 		.len = l2len
 	};
-	struct iovec hiov = { &h, sizeof(h) };

-	if (write_remainder(pcap_fd, &hiov, 1, 0) < 0 ||
+	if (write_all_buf(pcap_fd, &h, sizeof(h)) < 0 ||
 	    write_remainder(pcap_fd, iov, iovcnt, offset) < 0)
 		debug_perror("Cannot log packet, length %zu", l2len);
 }
@ -101,12 +100,14 @@ static void pcap_frame(const struct iovec *iov, size_t iovcnt,
 void pcap(const char *pkt, size_t l2len)
 {
 	struct iovec iov = { (char *)pkt, l2len };
-	struct timespec now;
+	struct timespec now = { 0 };

 	if (pcap_fd == -1)
 		return;

-	clock_gettime(CLOCK_REALTIME, &now);
+	if (clock_gettime(CLOCK_REALTIME, &now))
+		err_perror("Failed to get CLOCK_REALTIME time");
+
 	pcap_frame(&iov, 1, 0, &now);
 }

@ -120,13 +121,14 @@ void pcap(const char *pkt, size_t l2len)
 void pcap_multiple(const struct iovec *iov, size_t frame_parts, unsigned int n,
 		   size_t offset)
 {
-	struct timespec now;
+	struct timespec now = { 0 };
 	unsigned int i;

 	if (pcap_fd == -1)
 		return;

-	clock_gettime(CLOCK_REALTIME, &now);
+	if (clock_gettime(CLOCK_REALTIME, &now))
+		err_perror("Failed to get CLOCK_REALTIME time");

 	for (i = 0; i < n; i++)
 		pcap_frame(iov + i * frame_parts, frame_parts, offset, &now);
@ -139,17 +141,20 @@ void pcap_multiple(const struct iovec *iov, size_t frame_parts, unsigned int n,
 * @iov:	Pointer to the array of struct iovec describing the I/O vector
 *		containing packet data to write, including L2 header
 * @iovcnt:	Number of buffers (@iov entries)
+ * @offset:	Offset of the L2 frame within the full data length
 */
 /* cppcheck-suppress unusedFunction */
-void pcap_iov(const struct iovec *iov, size_t iovcnt)
+void pcap_iov(const struct iovec *iov, size_t iovcnt, size_t offset)
 {
-	struct timespec now;
+	struct timespec now = { 0 };

 	if (pcap_fd == -1)
 		return;

-	clock_gettime(CLOCK_REALTIME, &now);
-	pcap_frame(iov, iovcnt, 0, &now);
+	if (clock_gettime(CLOCK_REALTIME, &now))
+		err_perror("Failed to get CLOCK_REALTIME time");
+
+	pcap_frame(iov, iovcnt, offset, &now);
 }

 /**
@ -158,18 +163,15 @@ void pcap_iov(const struct iovec *iov, size_t iovcnt)
 */
 void pcap_init(struct ctx *c)
 {
-	int flags = O_WRONLY | O_CREAT | O_TRUNC;
-
 	if (pcap_fd != -1)
 		return;

 	if (!*c->pcap)
 		return;

-	flags |= c->foreground ? O_CLOEXEC : 0;
-	pcap_fd = open(c->pcap, flags, S_IRUSR | S_IWUSR);
+	pcap_fd = output_file_open(c->pcap, O_WRONLY);
 	if (pcap_fd == -1) {
-		perror("open");
+		err_perror("Couldn't open pcap file %s", c->pcap);
 		return;
 	}

--- a/pcap.h
+++ b/pcap.h
@ -9,7 +9,7 @@
 void pcap(const char *pkt, size_t l2len);
 void pcap_multiple(const struct iovec *iov, size_t frame_parts, unsigned int n,
 		   size_t offset);
-void pcap_iov(const struct iovec *iov, size_t iovcnt);
+void pcap_iov(const struct iovec *iov, size_t iovcnt, size_t offset);
 void pcap_init(struct ctx *c);

 #endif /* PCAP_H */
--- a/pif.c
+++ b/pif.c
@ -59,3 +59,45 @@ void pif_sockaddr(const struct ctx *c, union sockaddr_inany *sa, socklen_t *sl,
 		*sl = sizeof(sa->sa6);
 	}
 }
+
+/** pif_sock_l4() - Open a socket bound to an address on a specified interface
+ * @c:		Execution context
+ * @type:	Socket epoll type
+ * @pif:	Interface for this socket, must satisfy pif_is_socket()
+ * @addr:	Address to bind to, or NULL for dual-stack any (IPv6 wildcard)
+ * @ifname:	Interface for binding, NULL for any; must be NULL for PIF_SPLICE
+ * @port:	Port number to bind to (host byte order)
+ * @data:	epoll reference portion for protocol handlers
+ *
+ * NOTE: For namespace pifs, call this after entering the relevant namespace.
+ * For PIF_SPLICE, @addr must be a (non-NULL) loopback address.
+ *
+ * Return: newly created socket, negative error code on failure
+ */
+int pif_sock_l4(const struct ctx *c, enum epoll_type type, uint8_t pif,
+		const union inany_addr *addr, const char *ifname,
+		in_port_t port, uint32_t data)
+{
+	union sockaddr_inany sa = {
+		.sa6.sin6_family = AF_INET6,
+		.sa6.sin6_addr = in6addr_any,
+		.sa6.sin6_port = htons(port),
+	};
+	socklen_t sl;
+
+	ASSERT(pif_is_socket(pif));
+
+	if (pif == PIF_SPLICE) {
+		/* Sanity checks */
+		ASSERT(!ifname);
+		ASSERT(addr && inany_is_loopback(addr));
+	}
+
+	if (!addr)
+		return sock_l4_sa(c, type, &sa, sizeof(sa.sa6),
+				  ifname, false, data);
+
+	pif_sockaddr(c, &sa, &sl, pif, addr, port);
+	return sock_l4_sa(c, type, &sa, sl,
+			  ifname, sa.sa_family == AF_INET6, data);
+}
--- a/pif.h
+++ b/pif.h
@ -59,5 +59,8 @@ static inline bool pif_is_socket(uint8_t pif)

 void pif_sockaddr(const struct ctx *c, union sockaddr_inany *sa, socklen_t *sl,
 		  uint8_t pif, const union inany_addr *addr, in_port_t port);
+int pif_sock_l4(const struct ctx *c, enum epoll_type type, uint8_t pif,
+		const union inany_addr *addr, const char *ifname,
+		in_port_t port, uint32_t data);

 #endif /* PIF_H */
--- a/seccomp.sh
+++ b/seccomp.sh
@ -20,6 +20,15 @@ OUT="$(mktemp)"
 [ -z "${ARCH}" ] && ARCH="$(uname -m)"
 [ -z "${CC}" ] && CC="cc"

+AUDIT_ARCH="AUDIT_ARCH_$(echo "${ARCH}" | tr '[a-z]' '[A-Z]'       \
+                                      | sed 's/^ARM.*/ARM/'        \
+                                      | sed 's/I[456]86/I386/'     \
+                                      | sed 's/PPC64/PPC/'         \
+                                      | sed 's/PPCLE/PPC64LE/'     \
+                                      | sed 's/MIPS64EL/MIPSEL64/' \
+                                      | sed 's/HPPA/PARISC/'       \
+                                      | sed 's/SH4/SH/')" # ARCH -> AUDIT_ARCH_*
+
 HEADER="/* This file was automatically generated by $(basename ${0}) */

 #ifndef AUDIT_ARCH_PPC64LE
@ -32,7 +41,7 @@ struct sock_filter filter_@PROFILE@[] = {
 	/* cppcheck-suppress [badBitmaskCheck, unmatchedSuppression] */
 	BPF_STMT(BPF_LD | BPF_W | BPF_ABS,
 		 (offsetof(struct seccomp_data, arch))),
-	BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, PASST_AUDIT_ARCH, 0, @KILL@),
+	BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, @AUDIT_ARCH@, 0, @KILL@),
 	/* cppcheck-suppress [badBitmaskCheck, unmatchedSuppression] */
 	BPF_STMT(BPF_LD | BPF_W | BPF_ABS,
 		 (offsetof(struct seccomp_data, nr))),
@ -233,7 +242,8 @@ gen_profile() {
 		sub ${__i} CALL "NR:${__nr}" "NAME:${__name}" "ALLOW:${__allow}"
 	done

-	finish PRE "PROFILE:${__profile}" "KILL:$(( __statements + 1))"
+	finish PRE "PROFILE:${__profile}" "KILL:$(( __statements + 1))" \
+	       "AUDIT_ARCH:${AUDIT_ARCH}"
 }

 printf '%s\n' "${HEADER}" > "${OUT}"
--- a/tap.c
+++ b/tap.c
@ -172,11 +172,15 @@ void tap_udp4_send(const struct ctx *c, struct in_addr src, in_port_t sport,
 	struct iphdr *ip4h = tap_push_l2h(c, buf, ETH_P_IP);
 	struct udphdr *uh = tap_push_ip4h(ip4h, src, dst, l4len, IPPROTO_UDP);
 	char *data = (char *)(uh + 1);
+	const struct iovec iov = {
+		.iov_base = (void *)in,
+		.iov_len = dlen
+	};

 	uh->source = htons(sport);
 	uh->dest = htons(dport);
 	uh->len = htons(l4len);
-	csum_udp4(uh, src, dst, in, dlen);
+	csum_udp4(uh, src, dst, &iov, 1, 0);
 	memcpy(data, in, dlen);

 	tap_send_single(c, buf, dlen + (data - buf));
@ -247,7 +251,7 @@ static void *tap_push_ip6h(struct ipv6hdr *ip6h,
 void tap_udp6_send(const struct ctx *c,
 		   const struct in6_addr *src, in_port_t sport,
 		   const struct in6_addr *dst, in_port_t dport,
-		   uint32_t flow, const void *in, size_t dlen)
+		   uint32_t flow, void *in, size_t dlen)
 {
 	size_t l4len = dlen + sizeof(struct udphdr);
 	char buf[USHRT_MAX];
@ -255,11 +259,15 @@ void tap_udp6_send(const struct ctx *c,
 	struct udphdr *uh = tap_push_ip6h(ip6h, src, dst,
 					  l4len, IPPROTO_UDP, flow);
 	char *data = (char *)(uh + 1);
+	const struct iovec iov = {
+		.iov_base = in,
+		.iov_len = dlen
+	};

 	uh->source = htons(sport);
 	uh->dest = htons(dport);
 	uh->len = htons(l4len);
-	csum_udp6(uh, src, dst, in, dlen);
+	csum_udp6(uh, src, dst, &iov, 1, 0);
 	memcpy(data, in, dlen);

 	tap_send_single(c, buf, dlen + (data - buf));
@ -982,24 +990,17 @@ static void tap_sock_reset(struct ctx *c)
 }

 /**
- * tap_handler_passt() - Packet handler for AF_UNIX file descriptor
+ * tap_passt_input() - Handler for new data on the socket to qemu
 * @c:		Execution context
- * @events:	epoll events
 * @now:	Current timestamp
 */
-void tap_handler_passt(struct ctx *c, uint32_t events,
-		       const struct timespec *now)
+static void tap_passt_input(struct ctx *c, const struct timespec *now)
 {
 	static const char *partial_frame;
 	static ssize_t partial_len = 0;
 	ssize_t n;
 	char *p;

-	if (events & (EPOLLRDHUP | EPOLLHUP | EPOLLERR)) {
-		tap_sock_reset(c);
-		return;
-	}
-
 	tap_flush_pools();

 	if (partial_len) {
@ -1010,10 +1011,13 @@ void tap_handler_passt(struct ctx *c, uint32_t events,
 		memmove(pkt_buf, partial_frame, partial_len);
 	}

-	n = recv(c->fd_tap, pkt_buf + partial_len, TAP_BUF_BYTES - partial_len,
-		 MSG_DONTWAIT);
+	do {
+		n = recv(c->fd_tap, pkt_buf + partial_len,
+			 TAP_BUF_BYTES - partial_len, MSG_DONTWAIT);
+	} while ((n < 0) && errno == EINTR);
+
 	if (n < 0) {
-		if (errno != EINTR && errno != EAGAIN && errno != EWOULDBLOCK) {
+		if (errno != EAGAIN && errno != EWOULDBLOCK) {
 			err_perror("Receive error on guest connection, reset");
 			tap_sock_reset(c);
 		}
@ -1051,6 +1055,63 @@ void tap_handler_passt(struct ctx *c, uint32_t events,
 	tap_handler(c, now);
 }

+/**
+ * tap_handler_passt() - Event handler for AF_UNIX file descriptor
+ * @c:		Execution context
+ * @events:	epoll events; hang-up/error bits take precedence over EPOLLIN
+ * @now:	Current timestamp, passed on to tap_passt_input()
+ */
+void tap_handler_passt(struct ctx *c, uint32_t events,
+		       const struct timespec *now)
+{
+	if (events & (EPOLLRDHUP | EPOLLHUP | EPOLLERR)) {
+		tap_sock_reset(c);
+		return;	/* skip input handling after a reset */
+	}
+
+	if (events & EPOLLIN)
+		tap_passt_input(c, now);
+}
+
+/**
+ * tap_pasta_input() - Handler for new data on the tap device (c->fd_tap)
+ * @c:		Execution context
+ * @now:	Current timestamp
+ */
+static void tap_pasta_input(struct ctx *c, const struct timespec *now)
+{
+	ssize_t n, len;
+
+	tap_flush_pools();
+
+	for (n = 0; n <= (ssize_t)(TAP_BUF_BYTES - ETH_MAX_MTU); n += len) {
+		len = read(c->fd_tap, pkt_buf + n, ETH_MAX_MTU);
+
+		if (len == 0) {
+			die("EOF on tap device, exiting");
+		} else if (len < 0) {
+			if (errno == EINTR) {
+				len = 0; /* nothing read: don't advance n */
+				continue;
+			}
+
+			if (errno == EAGAIN || errno == EWOULDBLOCK)
+				break; /* all done for now */
+
+			die("Error on tap device, exiting");
+		}
+
+		/* Ignore frames of bad length */
+		if (len < (ssize_t)sizeof(struct ethhdr) ||
+		    len > (ssize_t)ETH_MAX_MTU)
+			continue;
+
+		tap_add_packet(c, len, pkt_buf + n);
+	}
+
+	tap_handler(c, now);
+}
+
 /**
 * tap_handler_pasta() - Packet handler for /dev/net/tun file descriptor
 * @c:		Execution context
@ -1060,46 +1121,11 @@ void tap_handler_passt(struct ctx *c, uint32_t events,
 void tap_handler_pasta(struct ctx *c, uint32_t events,
 		       const struct timespec *now)
 {
-	ssize_t n, len;
-	int ret;
-
 	if (events & (EPOLLRDHUP | EPOLLHUP | EPOLLERR))
 		die("Disconnect event on /dev/net/tun device, exiting");

-redo:
-	n = 0;
-
-	tap_flush_pools();
-restart:
-	while ((len = read(c->fd_tap, pkt_buf + n, TAP_BUF_BYTES - n)) > 0) {
-
-		if (len < (ssize_t)sizeof(struct ethhdr) ||
-		    len > (ssize_t)ETH_MAX_MTU) {
-			n += len;
-			continue;
-		}
-
-
-		tap_add_packet(c, len, pkt_buf + n);
-
-		if ((n += len) == TAP_BUF_BYTES)
-			break;
-	}
-
-	if (len < 0 && errno == EINTR)
-		goto restart;
-
-	ret = errno;
-
-	tap_handler(c, now);
-
-	if (len > 0 || ret == EAGAIN)
-		return;
-
-	if (n == TAP_BUF_BYTES)
-		goto redo;
-
-	die("Error on tap device, exiting");
+	if (events & EPOLLIN)
+		tap_pasta_input(c, now);
 }

 /**
@ -1110,7 +1136,7 @@ restart:
 */
 int tap_sock_unix_open(char *sock_path)
 {
-	int fd = socket(AF_UNIX, SOCK_STREAM, 0);
+	int fd = socket(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0);
 	struct sockaddr_un addr = {
 		.sun_family = AF_UNIX,
 	};
@ -1125,10 +1151,12 @@ int tap_sock_unix_open(char *sock_path)

 		if (*sock_path)
 			memcpy(path, sock_path, UNIX_PATH_MAX);
-		else
-			snprintf(path, UNIX_PATH_MAX - 1, UNIX_SOCK_PATH, i);
+		else if (snprintf_check(path, UNIX_PATH_MAX - 1,
+					UNIX_SOCK_PATH, i))
+			die_perror("Can't build UNIX domain socket path");

-		ex = socket(AF_UNIX, SOCK_STREAM | SOCK_NONBLOCK, 0);
+		ex = socket(AF_UNIX, SOCK_STREAM | SOCK_NONBLOCK | SOCK_CLOEXEC,
+			    0);
 		if (ex < 0)
 			die_perror("Failed to check for UNIX domain conflicts");

@ -1261,7 +1289,7 @@ static int tap_ns_tun(void *arg)
 	if (fd < 0)
 		die_perror("Failed to open() /dev/net/tun");

-	rc = ioctl(fd, TUNSETIFF, &ifr);
+	rc = ioctl(fd, (int)TUNSETIFF, &ifr);
 	if (rc < 0)
 		die_perror("TUNSETIFF ioctl on /dev/net/tun failed");

--- a/tap.h
+++ b/tap.h
@ -53,7 +53,7 @@ const struct in6_addr *tap_ip6_daddr(const struct ctx *c,
 void tap_udp6_send(const struct ctx *c,
 		   const struct in6_addr *src, in_port_t sport,
 		   const struct in6_addr *dst, in_port_t dport,
-		   uint32_t flow, const void *in, size_t dlen);
+		   uint32_t flow, void *in, size_t dlen);
 void tap_icmp6_send(const struct ctx *c,
 		    const struct in6_addr *src, const struct in6_addr *dst,
 		    const void *in, size_t l4len);
--- a/tcp.c
+++ b/tcp.c
@ -274,6 +274,7 @@
 #include <net/if.h>
 #include <netinet/in.h>
 #include <netinet/ip.h>
+#include <netinet/tcp.h>
 #include <stdint.h>
 #include <stdbool.h>
 #include <stddef.h>
@ -286,8 +287,6 @@
 #include <time.h>
 #include <arpa/inet.h>

-#include <linux/tcp.h> /* For struct tcp_info */
-
 #include "checksum.h"
 #include "util.h"
 #include "iov.h"
@ -300,6 +299,7 @@
 #include "log.h"
 #include "inany.h"
 #include "flow.h"
+#include "linux_dep.h"

 #include "flow_table.h"
 #include "tcp_internal.h"
@ -308,11 +308,6 @@
 /* MSS rounding: see SET_MSS() */
 #define MSS_DEFAULT			536
 #define WINDOW_DEFAULT			14600		/* RFC 6928 */
-#ifdef HAS_SND_WND
-# define KERNEL_REPORTS_SND_WND(c)	((c)->tcp.kernel_snd_wnd)
-#else
-# define KERNEL_REPORTS_SND_WND(c)	(0 && (c))
-#endif

 #define ACK_INTERVAL			10		/* ms */
 #define SYN_TIMEOUT			10		/* s */
@ -323,11 +318,6 @@
 #define LOW_RTT_TABLE_SIZE		8
 #define LOW_RTT_THRESHOLD		10 /* us */

-/* We need to include <linux/tcp.h> for tcpi_bytes_acked, instead of
- * <netinet/tcp.h>, but that doesn't include a definition for SOL_TCP
- */
-#define SOL_TCP				IPPROTO_TCP
-
 #define ACK_IF_NEEDED	0		/* See tcp_send_flag() */

 #define CONN_IS_CLOSING(conn)						\
@ -371,6 +361,20 @@ char		tcp_buf_discard		[MAX_WINDOW];
 /* Does the kernel support TCP_PEEK_OFF? */
 bool peek_offset_cap;

+/* Size of data returned by TCP_INFO getsockopt() */
+socklen_t tcp_info_size;
+
+#define tcp_info_cap(f_)						\
+	((offsetof(struct tcp_info_linux, tcpi_##f_) +			\
+	  sizeof(((struct tcp_info_linux *)NULL)->tcpi_##f_)) <= tcp_info_size)
+
+/* Kernel reports sending window in TCP_INFO (kernel commit 8f7baad7f035) */
+#define snd_wnd_cap	tcp_info_cap(snd_wnd)
+/* Kernel reports bytes acked in TCP_INFO (kernel commit 0df48c26d84) */
+#define bytes_acked_cap	tcp_info_cap(bytes_acked)
+/* Kernel reports minimum RTT in TCP_INFO (kernel commit cd9b266095f4) */
+#define min_rtt_cap	tcp_info_cap(min_rtt)
+
 /* sendmsg() to socket */
 static struct iovec	tcp_iov			[UIO_MAXIOV];

@ -440,7 +444,7 @@ static uint32_t tcp_conn_epoll_events(uint8_t events, uint8_t conn_flags)
 	if (events == TAP_SYN_RCVD)
 		return EPOLLOUT | EPOLLET | EPOLLRDHUP;

-	return EPOLLRDHUP;
+	return EPOLLET | EPOLLRDHUP;
 }

 /**
@ -545,7 +549,8 @@ static void tcp_timer_ctl(const struct ctx *c, struct tcp_tap_conn *conn)
 		 (unsigned long long)it.it_value.tv_sec,
 		 (unsigned long long)it.it_value.tv_nsec / 1000 / 1000);

-	timerfd_settime(conn->timer, 0, &it, NULL);
+	if (timerfd_settime(conn->timer, 0, &it, NULL))
+		flow_err(conn, "failed to set timer: %s", strerror(errno));
 }

 /**
@ -675,13 +680,12 @@ static int tcp_rtt_dst_low(const struct tcp_tap_conn *conn)
 * @tinfo:	Pointer to struct tcp_info for socket
 */
 static void tcp_rtt_dst_check(const struct tcp_tap_conn *conn,
-			      const struct tcp_info *tinfo)
+			      const struct tcp_info_linux *tinfo)
 {
-#ifdef HAS_MIN_RTT
 	const struct flowside *tapside = TAPFLOW(conn);
 	int i, hole = -1;

-	if (!tinfo->tcpi_min_rtt ||
+	if (!min_rtt_cap ||
 	    (int)tinfo->tcpi_min_rtt > LOW_RTT_THRESHOLD)
 		return;

@ -702,10 +706,6 @@ static void tcp_rtt_dst_check(const struct tcp_tap_conn *conn,
 	if (hole == LOW_RTT_TABLE_SIZE)
 		hole = 0;
 	inany_from_af(low_rtt_dst + hole, AF_INET6, &in6addr_any);
-#else
-	(void)conn;
-	(void)tinfo;
-#endif /* HAS_MIN_RTT */
 }

 /**
@ -752,34 +752,106 @@ static void tcp_sock_set_bufsize(const struct ctx *c, int s)
 }

 /**
- * tcp_update_check_tcp4() - Update TCP checksum from stored one
+ * tcp_update_check_tcp4() - Calculate TCP checksum for IPv4
 * @iph:	IPv4 header
- * @th:		TCP header followed by TCP payload
+ * @iov:	Pointer to the array of IO vectors
+ * @iov_cnt:	Length of the array
+ * @l4offset:	IPv4 payload offset in the iovec array
 */
-static void tcp_update_check_tcp4(const struct iphdr *iph, struct tcphdr *th)
+static void tcp_update_check_tcp4(const struct iphdr *iph,
+				  const struct iovec *iov, int iov_cnt,
+				  size_t l4offset)
 {
 	uint16_t l4len = ntohs(iph->tot_len) - sizeof(struct iphdr);
 	struct in_addr saddr = { .s_addr = iph->saddr };
 	struct in_addr daddr = { .s_addr = iph->daddr };
-	uint32_t sum = proto_ipv4_header_psum(l4len, IPPROTO_TCP, saddr, daddr);
+	size_t check_ofs;
+	uint16_t *check;
+	int check_idx;
+	uint32_t sum;
+	char *ptr;

-	th->check = 0;
-	th->check = csum(th, l4len, sum);
+	sum = proto_ipv4_header_psum(l4len, IPPROTO_TCP, saddr, daddr);
+
+	check_idx = iov_skip_bytes(iov, iov_cnt,
+				   l4offset + offsetof(struct tcphdr, check),
+				   &check_ofs);
+
+	if (check_idx >= iov_cnt) {
+		err("TCP4 buffer is too small, iov size %zu, check offset %zu",
+		    iov_size(iov, iov_cnt),
+		    l4offset + offsetof(struct tcphdr, check));
+		return;
+	}
+
+	if (check_ofs + sizeof(*check) > iov[check_idx].iov_len) {
+		err("TCP4 checksum field memory is not contiguous "
+		    "check_ofs %zu check_idx %d iov_len %zu",
+		    check_ofs, check_idx, iov[check_idx].iov_len);
+		return;
+	}
+
+	ptr = (char *)iov[check_idx].iov_base + check_ofs;
+	if ((uintptr_t)ptr & (__alignof__(*check) - 1)) {
+		err("TCP4 checksum field is not correctly aligned in memory");
+		return;
+	}
+
+	check = (uint16_t *)ptr;
+
+	*check = 0;	/* exclude the stale value from the sum below */
+	*check = csum_iov(iov, iov_cnt, l4offset, sum);
 }

 /**
 * tcp_update_check_tcp6() - Calculate TCP checksum for IPv6
 * @ip6h:	IPv6 header
- * @th:		TCP header followed by TCP payload
+ * @iov:	Pointer to the array of IO vectors
+ * @iov_cnt:	Length of the array
+ * @l4offset:	IPv6 payload offset in the iovec array
 */
-static void tcp_update_check_tcp6(struct ipv6hdr *ip6h, struct tcphdr *th)
+static void tcp_update_check_tcp6(const struct ipv6hdr *ip6h,
+				  const struct iovec *iov, int iov_cnt,
+				  size_t l4offset)
 {
 	uint16_t l4len = ntohs(ip6h->payload_len);
-	uint32_t sum = proto_ipv6_header_psum(l4len, IPPROTO_TCP,
-					      &ip6h->saddr, &ip6h->daddr);
+	size_t check_ofs;
+	uint16_t *check;
+	int check_idx;
+	uint32_t sum;
+	char *ptr;

-	th->check = 0;
-	th->check = csum(th, l4len, sum);
+	sum = proto_ipv6_header_psum(l4len, IPPROTO_TCP, &ip6h->saddr,
+				     &ip6h->daddr);
+
+	check_idx = iov_skip_bytes(iov, iov_cnt,
+				   l4offset + offsetof(struct tcphdr, check),
+				   &check_ofs);
+
+	if (check_idx >= iov_cnt) {
+		err("TCP6 buffer is too small, iov size %zu, check offset %zu",
+		    iov_size(iov, iov_cnt),
+		    l4offset + offsetof(struct tcphdr, check));
+		return;
+	}
+
+	if (check_ofs + sizeof(*check) > iov[check_idx].iov_len) {
+		err("TCP6 checksum field memory is not contiguous "
+		    "check_ofs %zu check_idx %d iov_len %zu",
+		    check_ofs, check_idx, iov[check_idx].iov_len);
+		return;
+	}
+
+	ptr = (char *)iov[check_idx].iov_base + check_ofs;
+	if ((uintptr_t)ptr & (__alignof__(*check) - 1)) {
+		err("TCP6 checksum field is not correctly aligned in memory");
+		return;
+	}
+
+	check = (uint16_t *)ptr;
+
+	*check = 0;	/* exclude the stale value from the sum below */
+	*check = csum_iov(iov, iov_cnt, l4offset, sum);
 }

 /**
@ -865,7 +937,6 @@ bool tcp_flow_defer(const struct tcp_tap_conn *conn)
 /* cppcheck-suppress [constParameterPointer, unmatchedSuppression] */
 void tcp_defer_handler(struct ctx *c)
 {
-	tcp_flags_flush(c);
 	tcp_payload_flush(c);
 }

@ -896,26 +967,27 @@ static void tcp_fill_header(struct tcphdr *th,

 /**
 * tcp_fill_headers4() - Fill 802.3, IPv4, TCP headers in pre-cooked buffers
- * @conn:	Connection pointer
- * @taph:	tap backend specific header
- * @iph:	Pointer to IPv4 header
- * @th:		Pointer to TCP header
- * @dlen:	TCP payload length
- * @check:	Checksum, if already known
- * @seq:	Sequence number for this segment
+ * @conn:		Connection pointer
+ * @taph:		tap backend specific header
+ * @iph:		Pointer to IPv4 header
+ * @bp:			Pointer to TCP header followed by TCP payload
+ * @dlen:		TCP payload length
+ * @check:		Checksum, if already known
+ * @seq:		Sequence number for this segment
+ * @no_tcp_csum:	Do not set TCP checksum
 *
 * Return: The IPv4 payload length, host order
 */
 static size_t tcp_fill_headers4(const struct tcp_tap_conn *conn,
 				struct tap_hdr *taph,
-				struct iphdr *iph, struct tcphdr *th,
+				struct iphdr *iph, struct tcp_payload_t *bp,
 				size_t dlen, const uint16_t *check,
-				uint32_t seq)
+				uint32_t seq, bool no_tcp_csum)
 {
 	const struct flowside *tapside = TAPFLOW(conn);
 	const struct in_addr *src4 = inany_v4(&tapside->oaddr);
 	const struct in_addr *dst4 = inany_v4(&tapside->eaddr);
-	size_t l4len = dlen + sizeof(*th);
+	size_t l4len = dlen + sizeof(bp->th);
 	size_t l3len = l4len + sizeof(*iph);

 	ASSERT(src4 && dst4);
@ -927,9 +999,18 @@ static size_t tcp_fill_headers4(const struct tcp_tap_conn *conn,
 	iph->check = check ? *check :
 			     csum_ip4_header(l3len, IPPROTO_TCP, *src4, *dst4);

-	tcp_fill_header(th, conn, seq);
+	tcp_fill_header(&bp->th, conn, seq);

-	tcp_update_check_tcp4(iph, th);
+	if (no_tcp_csum) {
+		bp->th.check = 0;
+	} else {
+		const struct iovec iov = {
+			.iov_base = bp,
+			.iov_len = ntohs(iph->tot_len) - sizeof(struct iphdr),
+		};
+
+		tcp_update_check_tcp4(iph, &iov, 1, 0);
+	}

 	tap_hdr_update(taph, l3len + sizeof(struct ethhdr));

@ -938,23 +1019,24 @@ static size_t tcp_fill_headers4(const struct tcp_tap_conn *conn,

 /**
 * tcp_fill_headers6() - Fill 802.3, IPv6, TCP headers in pre-cooked buffers
- * @conn:	Connection pointer
- * @taph:	tap backend specific header
- * @ip6h:	Pointer to IPv6 header
- * @th:		Pointer to TCP header
- * @dlen:	TCP payload length
- * @check:	Checksum, if already known
- * @seq:	Sequence number for this segment
+ * @conn:		Connection pointer
+ * @taph:		tap backend specific header
+ * @ip6h:		Pointer to IPv6 header
+ * @bp:			Pointer to TCP header followed by TCP payload
+ * @dlen:		TCP payload length
+ * @check:		Checksum, if already known
+ * @seq:		Sequence number for this segment
+ * @no_tcp_csum:	Do not set TCP checksum
 *
 * Return: The IPv6 payload length, host order
 */
 static size_t tcp_fill_headers6(const struct tcp_tap_conn *conn,
 				struct tap_hdr *taph,
-				struct ipv6hdr *ip6h, struct tcphdr *th,
-				size_t dlen, uint32_t seq)
+				struct ipv6hdr *ip6h, struct tcp_payload_t *bp,
+				size_t dlen, uint32_t seq, bool no_tcp_csum)
 {
 	const struct flowside *tapside = TAPFLOW(conn);
-	size_t l4len = dlen + sizeof(*th);
+	size_t l4len = dlen + sizeof(bp->th);

 	ip6h->payload_len = htons(l4len);
 	ip6h->saddr = tapside->oaddr.a6;
@ -968,9 +1050,18 @@ static size_t tcp_fill_headers6(const struct tcp_tap_conn *conn,
 	ip6h->flow_lbl[1] = (conn->sock >> 8) & 0xff;
 	ip6h->flow_lbl[2] = (conn->sock >> 0) & 0xff;

-	tcp_fill_header(th, conn, seq);
+	tcp_fill_header(&bp->th, conn, seq);

-	tcp_update_check_tcp6(ip6h, th);
+	if (no_tcp_csum) {
+		bp->th.check = 0;
+	} else {
+		const struct iovec iov = {
+			.iov_base = bp,
+			.iov_len = ntohs(ip6h->payload_len)
+		};
+
+		tcp_update_check_tcp6(ip6h, &iov, 1, 0);
+	}

 	tap_hdr_update(taph, l4len + sizeof(*ip6h) + sizeof(struct ethhdr));

@ -984,12 +1075,14 @@ static size_t tcp_fill_headers6(const struct tcp_tap_conn *conn,
 * @dlen:	TCP payload length
 * @check:	Checksum, if already known
 * @seq:	Sequence number for this segment
+ * @no_tcp_csum: Do not set TCP checksum
 *
 * Return: IP payload length, host order
 */
 size_t tcp_l2_buf_fill_headers(const struct tcp_tap_conn *conn,
 			       struct iovec *iov, size_t dlen,
-			       const uint16_t *check, uint32_t seq)
+			       const uint16_t *check, uint32_t seq,
+			       bool no_tcp_csum)
 {
 	const struct flowside *tapside = TAPFLOW(conn);
 	const struct in_addr *a4 = inany_v4(&tapside->oaddr);
@ -998,13 +1091,13 @@ size_t tcp_l2_buf_fill_headers(const struct tcp_tap_conn *conn,
 		return tcp_fill_headers4(conn, iov[TCP_IOV_TAP].iov_base,
 					 iov[TCP_IOV_IP].iov_base,
 					 iov[TCP_IOV_PAYLOAD].iov_base, dlen,
-					 check, seq);
+					 check, seq, no_tcp_csum);
 	}

 	return tcp_fill_headers6(conn, iov[TCP_IOV_TAP].iov_base,
 				 iov[TCP_IOV_IP].iov_base,
 				 iov[TCP_IOV_PAYLOAD].iov_base, dlen,
-				 seq);
+				 seq, no_tcp_csum);
 }

 /**
@ -1017,42 +1110,41 @@ size_t tcp_l2_buf_fill_headers(const struct tcp_tap_conn *conn,
 * Return: 1 if sequence or window were updated, 0 otherwise
 */
 int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_tap_conn *conn,
-			  int force_seq, struct tcp_info *tinfo)
+			  bool force_seq, struct tcp_info_linux *tinfo)
 {
 	uint32_t prev_wnd_to_tap = conn->wnd_to_tap << conn->ws_to_tap;
 	uint32_t prev_ack_to_tap = conn->seq_ack_to_tap;
 	/* cppcheck-suppress [ctunullpointer, unmatchedSuppression] */
 	socklen_t sl = sizeof(*tinfo);
-	struct tcp_info tinfo_new;
+	struct tcp_info_linux tinfo_new;
 	uint32_t new_wnd_to_tap = prev_wnd_to_tap;
 	int s = conn->sock;

-#ifndef HAS_BYTES_ACKED
-	(void)force_seq;
-
-	conn->seq_ack_to_tap = conn->seq_from_tap;
-	if (SEQ_LT(conn->seq_ack_to_tap, prev_ack_to_tap))
-		conn->seq_ack_to_tap = prev_ack_to_tap;
-#else
-	if ((unsigned)SNDBUF_GET(conn) < SNDBUF_SMALL || tcp_rtt_dst_low(conn)
-	    || CONN_IS_CLOSING(conn) || (conn->flags & LOCAL) || force_seq) {
+	if (!bytes_acked_cap) {
 		conn->seq_ack_to_tap = conn->seq_from_tap;
-	} else if (conn->seq_ack_to_tap != conn->seq_from_tap) {
-		if (!tinfo) {
-			tinfo = &tinfo_new;
-			if (getsockopt(s, SOL_TCP, TCP_INFO, tinfo, &sl))
-				return 0;
-		}
-
-		conn->seq_ack_to_tap = tinfo->tcpi_bytes_acked +
-				       conn->seq_init_from_tap;
-
 		if (SEQ_LT(conn->seq_ack_to_tap, prev_ack_to_tap))
 			conn->seq_ack_to_tap = prev_ack_to_tap;
-	}
-#endif /* !HAS_BYTES_ACKED */
+	} else {
+		if ((unsigned)SNDBUF_GET(conn) < SNDBUF_SMALL ||
+		    tcp_rtt_dst_low(conn) || CONN_IS_CLOSING(conn) ||
+		    (conn->flags & LOCAL) || force_seq) {
+			conn->seq_ack_to_tap = conn->seq_from_tap;
+		} else if (conn->seq_ack_to_tap != conn->seq_from_tap) {
+			if (!tinfo) {
+				tinfo = &tinfo_new;
+				if (getsockopt(s, SOL_TCP, TCP_INFO, tinfo, &sl))
+					return 0;
+			}

-	if (!KERNEL_REPORTS_SND_WND(c)) {
+			conn->seq_ack_to_tap = tinfo->tcpi_bytes_acked +
+				conn->seq_init_from_tap;
+
+			if (SEQ_LT(conn->seq_ack_to_tap, prev_ack_to_tap))
+				conn->seq_ack_to_tap = prev_ack_to_tap;
+		}
+	}
+
+	if (!snd_wnd_cap) {
 		tcp_get_sndbuf(conn);
 		new_wnd_to_tap = MIN(SNDBUF_GET(conn), MAX_WINDOW);
 		conn->wnd_to_tap = MIN(new_wnd_to_tap >> conn->ws_to_tap,
@ -1063,14 +1155,13 @@ int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_tap_conn *conn,
 	if (!tinfo) {
 		if (prev_wnd_to_tap > WINDOW_DEFAULT) {
 			goto out;
-}
+		}
 		tinfo = &tinfo_new;
 		if (getsockopt(s, SOL_TCP, TCP_INFO, tinfo, &sl)) {
 			goto out;
-}
+		}
 	}

-#ifdef HAS_SND_WND
 	if ((conn->flags & LOCAL) || tcp_rtt_dst_low(conn)) {
 		new_wnd_to_tap = tinfo->tcpi_snd_wnd;
 	} else {
@ -1078,7 +1169,6 @@ int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_tap_conn *conn,
 		new_wnd_to_tap = MIN((int)tinfo->tcpi_snd_wnd,
 				     SNDBUF_GET(conn));
 	}
-#endif

 	new_wnd_to_tap = MIN(new_wnd_to_tap, MAX_WINDOW);
 	if (!(conn->events & ESTABLISHED))
@ -1136,11 +1226,11 @@ static void tcp_update_seqack_from_tap(const struct ctx *c,
 *	     0 if there is no flag to send
 *	     1 otherwise
 */
-int tcp_prepare_flags(struct ctx *c, struct tcp_tap_conn *conn,
-		      int flags, struct tcphdr *th, char *data,
+int tcp_prepare_flags(const struct ctx *c, struct tcp_tap_conn *conn,
+		      int flags, struct tcphdr *th, struct tcp_syn_opts *opts,
 		      size_t *optlen)
 {
-	struct tcp_info tinfo = { 0 };
+	struct tcp_info_linux tinfo = { 0 };
 	socklen_t sl = sizeof(tinfo);
 	int s = conn->sock;

@ -1153,27 +1243,16 @@ int tcp_prepare_flags(struct ctx *c, struct tcp_tap_conn *conn,
 		return -ECONNRESET;
 	}

-#ifdef HAS_SND_WND
-	if (!c->tcp.kernel_snd_wnd && tinfo.tcpi_snd_wnd)
-		c->tcp.kernel_snd_wnd = 1;
-#endif
-
 	if (!(conn->flags & LOCAL))
 		tcp_rtt_dst_check(conn, &tinfo);

-	if (!tcp_update_seqack_wnd(c, conn, flags, &tinfo) && !flags)
+	if (!tcp_update_seqack_wnd(c, conn, !!flags, &tinfo) && !flags)
 		return 0;

 	*optlen = 0;
 	if (flags & SYN) {
 		int mss;

-		/* Options: MSS, NOP and window scale (8 bytes) */
-		*optlen = OPT_MSS_LEN + 1 + OPT_WS_LEN;
-
-		*data++ = OPT_MSS;
-		*data++ = OPT_MSS_LEN;
-
 		if (c->mtu == -1) {
 			mss = tinfo.tcpi_snd_mss;
 		} else {
@ -1189,16 +1268,11 @@ int tcp_prepare_flags(struct ctx *c, struct tcp_tap_conn *conn,
 			else if (mss > PAGE_SIZE)
 				mss = ROUND_DOWN(mss, PAGE_SIZE);
 		}
-		*(uint16_t *)data = htons(MIN(USHRT_MAX, mss));
-
-		data += OPT_MSS_LEN - 2;

 		conn->ws_to_tap = MIN(MAX_WS, tinfo.tcpi_snd_wscale);

-		*data++ = OPT_NOP;
-		*data++ = OPT_WS;
-		*data++ = OPT_WS_LEN;
-		*data++ = conn->ws_to_tap;
+		*opts = TCP_SYN_OPTS(mss, conn->ws_to_tap);
+		*optlen = sizeof(*opts);
 	} else if (!(flags & RST)) {
 		flags |= ACK;
 	}
@ -1235,7 +1309,8 @@ int tcp_prepare_flags(struct ctx *c, struct tcp_tap_conn *conn,
 *
 * Return: negative error code on connection reset, 0 otherwise
 */
-int tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags)
+static int tcp_send_flag(const struct ctx *c, struct tcp_tap_conn *conn,
+			 int flags)
 {
 	return tcp_buf_send_flag(c, conn, flags);
 }
@ -1245,7 +1320,7 @@ int tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags)
 * @c:		Execution context
 * @conn:	Connection pointer
 */
-void tcp_rst_do(struct ctx *c, struct tcp_tap_conn *conn)
+void tcp_rst_do(const struct ctx *c, struct tcp_tap_conn *conn)
 {
 	if (conn->events == CLOSED)
 		return;
@ -1335,7 +1410,7 @@ static int tcp_conn_new_sock(const struct ctx *c, sa_family_t af)
 {
 	int s;

-	s = socket(af, SOCK_STREAM | SOCK_NONBLOCK, IPPROTO_TCP);
+	s = socket(af, SOCK_STREAM | SOCK_NONBLOCK | SOCK_CLOEXEC, IPPROTO_TCP);

 	if (s > FD_REF_MAX) {
 		close(s);
@ -1463,7 +1538,7 @@ static void tcp_bind_outbound(const struct ctx *c,
 * @optlen:	Bytes in options: caller MUST ensure available length
 * @now:	Current timestamp
 */
-static void tcp_conn_from_tap(struct ctx *c, sa_family_t af,
+static void tcp_conn_from_tap(const struct ctx *c, sa_family_t af,
 			      const void *saddr, const void *daddr,
 			      const struct tcphdr *th, const char *opts,
 			      size_t optlen, const struct timespec *now)
@ -1628,7 +1703,7 @@ static int tcp_sock_consume(const struct tcp_tap_conn *conn, uint32_t ack_seq)
 *
 * #syscalls recvmsg
 */
-static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn)
+static int tcp_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn)
 {
 	return tcp_buf_data_from_sock(c, conn);
 }
@ -1644,8 +1719,8 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn)
 *
 * Return: count of consumed packets
 */
-static int tcp_data_from_tap(struct ctx *c, struct tcp_tap_conn *conn,
-			      const struct pool *p, int idx)
+static int tcp_data_from_tap(const struct ctx *c, struct tcp_tap_conn *conn,
+			     const struct pool *p, int idx)
 {
 	int i, iov_i, ack = 0, fin = 0, retr = 0, keep = -1, partial_send = 0;
 	uint16_t max_ack_seq_wnd = conn->wnd_from_tap;
@ -1842,7 +1917,8 @@ out:
 * @opts:	Pointer to start of options
 * @optlen:	Bytes in options: caller MUST ensure available length
 */
-static void tcp_conn_from_sock_finish(struct ctx *c, struct tcp_tap_conn *conn,
+static void tcp_conn_from_sock_finish(const struct ctx *c,
+				      struct tcp_tap_conn *conn,
 				      const struct tcphdr *th,
 				      const char *opts, size_t optlen)
 {
@ -1865,11 +1941,12 @@ static void tcp_conn_from_sock_finish(struct ctx *c, struct tcp_tap_conn *conn,
 		return;
 	}

+	tcp_send_flag(c, conn, ACK);
+
 	/* The client might have sent data already, which we didn't
 	 * dequeue waiting for SYN,ACK from tap -- check now.
 	 */
 	tcp_data_from_sock(c, conn);
-	tcp_send_flag(c, conn, ACK);
 }

 /**
@ -1885,7 +1962,7 @@ static void tcp_conn_from_sock_finish(struct ctx *c, struct tcp_tap_conn *conn,
 *
 * Return: count of consumed packets
 */
-int tcp_tap_handler(struct ctx *c, uint8_t pif, sa_family_t af,
+int tcp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af,
 		    const void *saddr, const void *daddr,
 		    const struct pool *p, int idx, const struct timespec *now)
 {
@ -2023,7 +2100,7 @@ reset:
 * @c:		Execution context
 * @conn:	Connection pointer
 */
-static void tcp_connect_finish(struct ctx *c, struct tcp_tap_conn *conn)
+static void tcp_connect_finish(const struct ctx *c, struct tcp_tap_conn *conn)
 {
 	socklen_t sl;
 	int so;
@ -2049,8 +2126,8 @@ static void tcp_connect_finish(struct ctx *c, struct tcp_tap_conn *conn)
 * @sa:		Peer socket address (from accept())
 * @now:	Current timestamp
 */
-static void tcp_tap_conn_from_sock(struct ctx *c, union flow *flow, int s,
-				   const struct timespec *now)
+static void tcp_tap_conn_from_sock(const struct ctx *c, union flow *flow,
+				   int s, const struct timespec *now)
 {
 	struct tcp_tap_conn *conn = FLOW_SET_TYPE(flow, FLOW_TCP, tcp);
 	uint64_t hash;
@ -2081,7 +2158,7 @@ static void tcp_tap_conn_from_sock(struct ctx *c, union flow *flow, int s,
 * @ref:	epoll reference of listening socket
 * @now:	Current timestamp
 */
-void tcp_listen_handler(struct ctx *c, union epoll_ref ref,
+void tcp_listen_handler(const struct ctx *c, union epoll_ref ref,
 			const struct timespec *now)
 {
 	const struct flowside *ini;
@ -2146,7 +2223,7 @@ cancel:
 *
 * #syscalls timerfd_gettime arm:timerfd_gettime64 i686:timerfd_gettime64
 */
-void tcp_timer_handler(struct ctx *c, union epoll_ref ref)
+void tcp_timer_handler(const struct ctx *c, union epoll_ref ref)
 {
 	struct itimerspec check_armed = { { 0 }, { 0 } };
 	struct tcp_tap_conn *conn = &FLOW(ref.flow)->tcp;
@ -2158,7 +2235,9 @@ void tcp_timer_handler(struct ctx *c, union epoll_ref ref)
 	 * timer is currently armed, this event came from a previous setting,
 	 * and we just set the timer to a new point in the future: discard it.
 	 */
-	timerfd_gettime(conn->timer, &check_armed);
+	if (timerfd_gettime(conn->timer, &check_armed))
+		flow_err(conn, "failed to read timer: %s", strerror(errno));
+
 	if (check_armed.it_value.tv_sec || check_armed.it_value.tv_nsec)
 		return;

@ -2196,7 +2275,10 @@ void tcp_timer_handler(struct ctx *c, union epoll_ref ref)
 		 * case. This avoids having to preemptively reset the timer on
 		 * ~ACK_TO_TAP_DUE or ~ACK_FROM_TAP_DUE.
 		 */
-		timerfd_settime(conn->timer, 0, &new, &old);
+		if (timerfd_settime(conn->timer, 0, &new, &old))
+			flow_err(conn, "failed to set timer: %s",
+				 strerror(errno));
+
 		if (old.it_value.tv_sec == ACT_TIMEOUT) {
 			flow_dbg(conn, "activity timeout");
 			tcp_rst(c, conn);
@ -2210,7 +2292,8 @@ void tcp_timer_handler(struct ctx *c, union epoll_ref ref)
 * @ref:	epoll reference
 * @events:	epoll events bitmap
 */
-void tcp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events)
+void tcp_sock_handler(const struct ctx *c, union epoll_ref ref,
+		      uint32_t events)
 {
 	struct tcp_tap_conn *conn = conn_at_sidx(ref.flowside);

@ -2241,7 +2324,7 @@ void tcp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events)
 			tcp_data_from_sock(c, conn);

 		if (events & EPOLLOUT)
-			tcp_update_seqack_wnd(c, conn, 0, NULL);
+			tcp_update_seqack_wnd(c, conn, false, NULL);

 		return;
 	}
@ -2264,17 +2347,16 @@ void tcp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events)
 }

 /**
- * tcp_sock_init_af() - Initialise listening socket for a given af and port
+ * tcp_sock_init_one() - Initialise listening socket for address and port
 * @c:		Execution context
- * @af:		Address family to listen on
- * @port:	Port, host order
- * @addr:	Pointer to address for binding, NULL if not configured
+ * @addr:	Pointer to address for binding, NULL for dual stack any
 * @ifname:	Name of interface to bind to, NULL if not configured
+ * @port:	Port, host order
 *
 * Return: fd for the new listening socket, negative error code on failure
 */
-static int tcp_sock_init_af(const struct ctx *c, sa_family_t af, in_port_t port,
-			    const void *addr, const char *ifname)
+static int tcp_sock_init_one(const struct ctx *c, const union inany_addr *addr,
+			     const char *ifname, in_port_t port)
 {
 	union tcp_listen_epoll_ref tref = {
 		.port = port,
@ -2282,12 +2364,13 @@ static int tcp_sock_init_af(const struct ctx *c, sa_family_t af, in_port_t port,
 	};
 	int s;

-	s = sock_l4(c, af, EPOLL_TYPE_TCP_LISTEN, addr, ifname, port, tref.u32);
+	s = pif_sock_l4(c, EPOLL_TYPE_TCP_LISTEN, PIF_HOST, addr,
+				ifname, port, tref.u32);

 	if (c->tcp.fwd_in.mode == FWD_AUTO) {
-		if (af == AF_INET  || af == AF_UNSPEC)
+		if (!addr || inany_v4(addr))
 			tcp_sock_init_ext[port][V4] = s < 0 ? -1 : s;
-		if (af == AF_INET6 || af == AF_UNSPEC)
+		if (!addr || !inany_v4(addr))
 			tcp_sock_init_ext[port][V6] = s < 0 ? -1 : s;
 	}

@ -2301,31 +2384,32 @@ static int tcp_sock_init_af(const struct ctx *c, sa_family_t af, in_port_t port,
 /**
 * tcp_sock_init() - Create listening sockets for a given host ("inbound") port
 * @c:		Execution context
- * @af:		Address family to select a specific IP version, or AF_UNSPEC
 * @addr:	Pointer to address for binding, NULL if not configured
 * @ifname:	Name of interface to bind to, NULL if not configured
 * @port:	Port, host order
 *
 * Return: 0 on (partial) success, negative error code on (complete) failure
 */
-int tcp_sock_init(const struct ctx *c, sa_family_t af, const void *addr,
+int tcp_sock_init(const struct ctx *c, const union inany_addr *addr,
 		  const char *ifname, in_port_t port)
 {
 	int r4 = FD_REF_MAX + 1, r6 = FD_REF_MAX + 1;

 	ASSERT(!c->no_tcp);

-	if (af == AF_UNSPEC && c->ifi4 && c->ifi6)
+	if (!addr && c->ifi4 && c->ifi6)
 		/* Attempt to get a dual stack socket */
-		if (tcp_sock_init_af(c, AF_UNSPEC, port, addr, ifname) >= 0)
+		if (tcp_sock_init_one(c, NULL, ifname, port) >= 0)
 			return 0;

 	/* Otherwise create a socket per IP version */
-	if ((af == AF_INET  || af == AF_UNSPEC) && c->ifi4)
-		r4 = tcp_sock_init_af(c, AF_INET, port, addr, ifname);
+	if ((!addr || inany_v4(addr)) && c->ifi4)
+		r4 = tcp_sock_init_one(c, addr ? addr : &inany_any4,
+				       ifname, port);

-	if ((af == AF_INET6 || af == AF_UNSPEC) && c->ifi6)
-		r6 = tcp_sock_init_af(c, AF_INET6, port, addr, ifname);
+	if ((!addr || !inany_v4(addr)) && c->ifi6)
+		r6 = tcp_sock_init_one(c, addr ? addr : &inany_any6,
+				       ifname, port);

 	if (IN_INTERVAL(0, FD_REF_MAX, r4) || IN_INTERVAL(0, FD_REF_MAX, r6))
 		return 0;
@ -2348,8 +2432,8 @@ static void tcp_ns_sock_init4(const struct ctx *c, in_port_t port)

 	ASSERT(c->mode == MODE_PASTA);

-	s = sock_l4(c, AF_INET, EPOLL_TYPE_TCP_LISTEN, &in4addr_loopback,
-		    NULL, port, tref.u32);
+	s = pif_sock_l4(c, EPOLL_TYPE_TCP_LISTEN, PIF_SPLICE, &inany_loopback4,
+			NULL, port, tref.u32);
 	if (s >= 0)
 		tcp_sock_set_bufsize(c, s);
 	else
@ -2374,8 +2458,8 @@ static void tcp_ns_sock_init6(const struct ctx *c, in_port_t port)

 	ASSERT(c->mode == MODE_PASTA);

-	s = sock_l4(c, AF_INET6, EPOLL_TYPE_TCP_LISTEN, &in6addr_loopback,
-		    NULL, port, tref.u32);
+	s = pif_sock_l4(c, EPOLL_TYPE_TCP_LISTEN, PIF_SPLICE, &inany_loopback6,
+			NULL, port, tref.u32);
 	if (s >= 0)
 		tcp_sock_set_bufsize(c, s);
 	else
@ -2477,7 +2561,7 @@ static void tcp_sock_refill_init(const struct ctx *c)
 *
 * Return: true if supported, false otherwise
 */
-bool tcp_probe_peek_offset_cap(sa_family_t af)
+static bool tcp_probe_peek_offset_cap(sa_family_t af)
 {
 	bool ret = false;
 	int s, optv = 0;
@ -2494,6 +2578,34 @@ bool tcp_probe_peek_offset_cap(sa_family_t af)
 	return ret;
 }

+/**
+ * tcp_probe_tcp_info() - Check what data TCP_INFO reports
+ *
+ * Return: number of bytes returned by TCP_INFO getsockopt(), 0 on failure
+ */
+static socklen_t tcp_probe_tcp_info(void)
+{
+	struct tcp_info_linux tinfo;
+	socklen_t sl = sizeof(tinfo);
+	int s;
+
+	s = socket(AF_INET, SOCK_STREAM | SOCK_CLOEXEC, IPPROTO_TCP);
+	if (s < 0) {
+		warn_perror("Temporary TCP socket creation failed");
+		return 0;	/* a byte count, not a boolean */
+	}
+
+	if (getsockopt(s, SOL_TCP, TCP_INFO, &tinfo, &sl)) {
+		warn_perror("Failed to get TCP_INFO on temporary socket");
+		close(s);
+		return 0;	/* a byte count, not a boolean */
+	}
+
+	close(s);
+
+	return sl;
+}
+
 /**
 * tcp_init() - Get initial sequence, hash secret, initialise per-socket data
 * @c:		Execution context
@ -2504,11 +2616,7 @@ int tcp_init(struct ctx *c)
 {
 	ASSERT(!c->no_tcp);

-	if (c->ifi4)
-		tcp_sock4_iov_init(c);
-
-	if (c->ifi6)
-		tcp_sock6_iov_init(c);
+	tcp_sock_iov_init(c);

 	memset(init_sock_pool4,		0xff,	sizeof(init_sock_pool4));
 	memset(init_sock_pool6,		0xff,	sizeof(init_sock_pool6));
@ -2527,6 +2635,15 @@ int tcp_init(struct ctx *c)
 			  (!c->ifi6 || tcp_probe_peek_offset_cap(AF_INET6));
 	debug("SO_PEEK_OFF%ssupported", peek_offset_cap ? " " : " not ");

+	tcp_info_size = tcp_probe_tcp_info();
+
+#define dbg_tcpi(f_)	debug("TCP_INFO tcpi_%s field%s supported",	\
+			      STRINGIFY(f_), tcp_info_cap(f_) ? " " : " not ")
+	dbg_tcpi(snd_wnd);
+	dbg_tcpi(bytes_acked);
+	dbg_tcpi(min_rtt);
+#undef dbg_tcpi
+
 	return 0;
 }

@ -2568,7 +2685,7 @@ static void tcp_port_rebind(struct ctx *c, bool outbound)
 			if (outbound)
 				tcp_ns_sock_init(c, port);
 			else
-				tcp_sock_init(c, AF_UNSPEC, NULL, NULL, port);
+				tcp_sock_init(c, NULL, NULL, port);
 		}
 	}
 }
--- a/tcp.h
+++ b/tcp.h
@ -10,14 +10,15 @@

 struct ctx;

-void tcp_timer_handler(struct ctx *c, union epoll_ref ref);
-void tcp_listen_handler(struct ctx *c, union epoll_ref ref,
+void tcp_timer_handler(const struct ctx *c, union epoll_ref ref);
+void tcp_listen_handler(const struct ctx *c, union epoll_ref ref,
 			const struct timespec *now);
-void tcp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events);
-int tcp_tap_handler(struct ctx *c, uint8_t pif, sa_family_t af,
+void tcp_sock_handler(const struct ctx *c, union epoll_ref ref,
+		      uint32_t events);
+int tcp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af,
 		    const void *saddr, const void *daddr,
 		    const struct pool *p, int idx, const struct timespec *now);
-int tcp_sock_init(const struct ctx *c, sa_family_t af, const void *addr,
+int tcp_sock_init(const struct ctx *c, const union inany_addr *addr,
 		  const char *ifname, in_port_t port);
 int tcp_init(struct ctx *c);
 void tcp_timer(struct ctx *c, const struct timespec *now);
@ -58,16 +59,12 @@ union tcp_listen_epoll_ref {
 * @fwd_in:		Port forwarding configuration for inbound packets
 * @fwd_out:		Port forwarding configuration for outbound packets
 * @timer_run:		Timestamp of most recent timer run
- * @kernel_snd_wnd:	Kernel reports sending window (with commit 8f7baad7f035)
 * @pipe_size:		Size of pipes for spliced connections
 */
 struct tcp_ctx {
 	struct fwd_ports fwd_in;
 	struct fwd_ports fwd_out;
 	struct timespec timer_run;
-#ifdef HAS_SND_WND
-	int kernel_snd_wnd;
-#endif
 	size_t pipe_size;
 };

--- a/tcp_buf.c
+++ b/tcp_buf.c
@ -20,7 +20,7 @@

 #include <netinet/ip.h>

-#include <linux/tcp.h>
+#include <netinet/tcp.h>

 #include "util.h"
 #include "ip.h"
@ -38,88 +38,32 @@
 	(c->mode == MODE_PASTA ? 1 : TCP_FRAMES_MEM)

 /* Static buffers */
-/**
- * struct tcp_payload_t - TCP header and data to send segments with payload
- * @th:		TCP header
- * @data:	TCP data
- */
-struct tcp_payload_t {
-	struct tcphdr th;
-	uint8_t data[IP_MAX_MTU - sizeof(struct tcphdr)];
-#ifdef __AVX2__
-} __attribute__ ((packed, aligned(32)));    /* For AVX2 checksum routines */
-#else
-} __attribute__ ((packed, aligned(__alignof__(unsigned int))));
-#endif

-/**
- * struct tcp_flags_t - TCP header and data to send zero-length
- *                      segments (flags)
- * @th:		TCP header
- * @opts	TCP options
- */
-struct tcp_flags_t {
-	struct tcphdr th;
-	char opts[OPT_MSS_LEN + OPT_WS_LEN + 1];
-#ifdef __AVX2__
-} __attribute__ ((packed, aligned(32)));
-#else
-} __attribute__ ((packed, aligned(__alignof__(unsigned int))));
-#endif
-
-/* Ethernet header for IPv4 frames */
+/* Ethernet header for IPv4 and IPv6 frames */
 static struct ethhdr		tcp4_eth_src;
-
-static struct tap_hdr		tcp4_payload_tap_hdr[TCP_FRAMES_MEM];
-/* IPv4 headers */
-static struct iphdr		tcp4_payload_ip[TCP_FRAMES_MEM];
-/* TCP segments with payload for IPv4 frames */
-static struct tcp_payload_t	tcp4_payload[TCP_FRAMES_MEM];
-
-static_assert(MSS4 <= sizeof(tcp4_payload[0].data), "MSS4 is greater than 65516");
-
-/* References tracking the owner connection of frames in the tap outqueue */
-static struct tcp_tap_conn *tcp4_frame_conns[TCP_FRAMES_MEM];
-static unsigned int tcp4_payload_used;
-
-static struct tap_hdr		tcp4_flags_tap_hdr[TCP_FRAMES_MEM];
-/* IPv4 headers for TCP segment without payload */
-static struct iphdr		tcp4_flags_ip[TCP_FRAMES_MEM];
-/* TCP segments without payload for IPv4 frames */
-static struct tcp_flags_t	tcp4_flags[TCP_FRAMES_MEM];
-
-static unsigned int tcp4_flags_used;
-
-/* Ethernet header for IPv6 frames */
 static struct ethhdr		tcp6_eth_src;

-static struct tap_hdr		tcp6_payload_tap_hdr[TCP_FRAMES_MEM];
-/* IPv6 headers */
-static struct ipv6hdr		tcp6_payload_ip[TCP_FRAMES_MEM];
-/* TCP headers and data for IPv6 frames */
-static struct tcp_payload_t	tcp6_payload[TCP_FRAMES_MEM];
+static struct tap_hdr		tcp_payload_tap_hdr[TCP_FRAMES_MEM];

-static_assert(MSS6 <= sizeof(tcp6_payload[0].data), "MSS6 is greater than 65516");
+/* IP headers for IPv4 and IPv6 */
+struct iphdr		tcp4_payload_ip[TCP_FRAMES_MEM];
+struct ipv6hdr		tcp6_payload_ip[TCP_FRAMES_MEM];
+
+/* TCP segments with payload for IPv4 and IPv6 frames */
+static struct tcp_payload_t	tcp_payload[TCP_FRAMES_MEM];
+
+static_assert(MSS4 <= sizeof(tcp_payload[0].data), "MSS4 is greater than 65516");
+static_assert(MSS6 <= sizeof(tcp_payload[0].data), "MSS6 is greater than 65516");

 /* References tracking the owner connection of frames in the tap outqueue */
-static struct tcp_tap_conn *tcp6_frame_conns[TCP_FRAMES_MEM];
-static unsigned int tcp6_payload_used;
-
-static struct tap_hdr		tcp6_flags_tap_hdr[TCP_FRAMES_MEM];
-/* IPv6 headers for TCP segment without payload */
-static struct ipv6hdr		tcp6_flags_ip[TCP_FRAMES_MEM];
-/* TCP segment without payload for IPv6 frames */
-static struct tcp_flags_t	tcp6_flags[TCP_FRAMES_MEM];
-
-static unsigned int tcp6_flags_used;
+static struct tcp_tap_conn *tcp_frame_conns[TCP_FRAMES_MEM];
+static unsigned int tcp_payload_used;

 /* recvmsg()/sendmsg() data for tap */
 static struct iovec	iov_sock		[TCP_FRAMES_MEM + 1];

-static struct iovec	tcp4_l2_iov		[TCP_FRAMES_MEM][TCP_NUM_IOVS];
-static struct iovec	tcp6_l2_iov		[TCP_FRAMES_MEM][TCP_NUM_IOVS];
-static struct iovec	tcp4_l2_flags_iov	[TCP_FRAMES_MEM][TCP_NUM_IOVS];
-static struct iovec	tcp6_l2_flags_iov	[TCP_FRAMES_MEM][TCP_NUM_IOVS];
+static struct iovec	tcp_l2_iov[TCP_FRAMES_MEM][TCP_NUM_IOVS];
+
 /**
 * tcp_update_l2_buf() - Update Ethernet header buffers with addresses
 * @eth_d:	Ethernet destination address, NULL if unchanged
@ -132,105 +76,30 @@ void tcp_update_l2_buf(const unsigned char *eth_d, const unsigned char *eth_s)
 }

 /**
- * tcp_sock4_iov_init() - Initialise scatter-gather L2 buffers for IPv4 sockets
+ * tcp_sock_iov_init() - Initialise shared IPv4/IPv6 scatter-gather L2 buffers
 * @c:		Execution context
 */
-void tcp_sock4_iov_init(const struct ctx *c)
-{
-	struct iphdr iph = L2_BUF_IP4_INIT(IPPROTO_TCP);
-	struct iovec *iov;
-	int i;
-
-	tcp4_eth_src.h_proto = htons_constant(ETH_P_IP);
-
-	for (i = 0; i < ARRAY_SIZE(tcp4_payload); i++) {
-		tcp4_payload_ip[i] = iph;
-		tcp4_payload[i].th.doff = sizeof(struct tcphdr) / 4;
-		tcp4_payload[i].th.ack = 1;
-	}
-
-	for (i = 0; i < ARRAY_SIZE(tcp4_flags); i++) {
-		tcp4_flags_ip[i] = iph;
-		tcp4_flags[i].th.doff = sizeof(struct tcphdr) / 4;
-		tcp4_flags[i].th.ack = 1;
-	}
-
-	for (i = 0; i < TCP_FRAMES_MEM; i++) {
-		iov = tcp4_l2_iov[i];
-
-		iov[TCP_IOV_TAP] = tap_hdr_iov(c, &tcp4_payload_tap_hdr[i]);
-		iov[TCP_IOV_ETH] = IOV_OF_LVALUE(tcp4_eth_src);
-		iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp4_payload_ip[i]);
-		iov[TCP_IOV_PAYLOAD].iov_base = &tcp4_payload[i];
-	}
-
-	for (i = 0; i < TCP_FRAMES_MEM; i++) {
-		iov = tcp4_l2_flags_iov[i];
-
-		iov[TCP_IOV_TAP] = tap_hdr_iov(c, &tcp4_flags_tap_hdr[i]);
-		iov[TCP_IOV_ETH].iov_base = &tcp4_eth_src;
-		iov[TCP_IOV_ETH] = IOV_OF_LVALUE(tcp4_eth_src);
-		iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp4_flags_ip[i]);
-		iov[TCP_IOV_PAYLOAD].iov_base = &tcp4_flags[i];
-	}
-}
-
-/**
- * tcp_sock6_iov_init() - Initialise scatter-gather L2 buffers for IPv6 sockets
- * @c:		Execution context
- */
-void tcp_sock6_iov_init(const struct ctx *c)
+void tcp_sock_iov_init(const struct ctx *c)
 {
 	struct ipv6hdr ip6 = L2_BUF_IP6_INIT(IPPROTO_TCP);
-	struct iovec *iov;
+	struct iphdr iph = L2_BUF_IP4_INIT(IPPROTO_TCP);
 	int i;

 	tcp6_eth_src.h_proto = htons_constant(ETH_P_IPV6);
+	tcp4_eth_src.h_proto = htons_constant(ETH_P_IP);

-	for (i = 0; i < ARRAY_SIZE(tcp6_payload); i++) {
+	for (i = 0; i < ARRAY_SIZE(tcp_payload); i++) {
 		tcp6_payload_ip[i] = ip6;
-		tcp6_payload[i].th.doff = sizeof(struct tcphdr) / 4;
-		tcp6_payload[i].th.ack = 1;
-	}
-
-	for (i = 0; i < ARRAY_SIZE(tcp6_flags); i++) {
-		tcp6_flags_ip[i] = ip6;
-		tcp6_flags[i].th.doff = sizeof(struct tcphdr) / 4;
-		tcp6_flags[i].th .ack = 1;
+		tcp4_payload_ip[i] = iph;
 	}

 	for (i = 0; i < TCP_FRAMES_MEM; i++) {
-		iov = tcp6_l2_iov[i];
+		struct iovec *iov = tcp_l2_iov[i];

-		iov[TCP_IOV_TAP] = tap_hdr_iov(c, &tcp6_payload_tap_hdr[i]);
-		iov[TCP_IOV_ETH] = IOV_OF_LVALUE(tcp6_eth_src);
-		iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp6_payload_ip[i]);
-		iov[TCP_IOV_PAYLOAD].iov_base = &tcp6_payload[i];
+		iov[TCP_IOV_TAP] = tap_hdr_iov(c, &tcp_payload_tap_hdr[i]);
+		iov[TCP_IOV_ETH].iov_len = sizeof(struct ethhdr);
+		iov[TCP_IOV_PAYLOAD].iov_base = &tcp_payload[i];
 	}
-
-	for (i = 0; i < TCP_FRAMES_MEM; i++) {
-		iov = tcp6_l2_flags_iov[i];
-
-		iov[TCP_IOV_TAP] = tap_hdr_iov(c, &tcp6_flags_tap_hdr[i]);
-		iov[TCP_IOV_ETH] = IOV_OF_LVALUE(tcp6_eth_src);
-		iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp6_flags_ip[i]);
-		iov[TCP_IOV_PAYLOAD].iov_base = &tcp6_flags[i];
-	}
-}
-
-/**
- * tcp_flags_flush() - Send out buffers for segments with no data (flags)
- * @c:		Execution context
- */
-void tcp_flags_flush(const struct ctx *c)
-{
-	tap_send_frames(c, &tcp6_l2_flags_iov[0][0], TCP_NUM_IOVS,
-			tcp6_flags_used);
-	tcp6_flags_used = 0;
-
-	tap_send_frames(c, &tcp4_l2_flags_iov[0][0], TCP_NUM_IOVS,
-			tcp4_flags_used);
-	tcp4_flags_used = 0;
 }

 /**
@ -240,7 +109,7 @@ void tcp_flags_flush(const struct ctx *c)
 * @frames:	Two-dimensional array containing queued frames with sub-iovs
 * @num_frames:	Number of entries in the two arrays to be compared
 */
-static void tcp_revert_seq(struct ctx *c, struct tcp_tap_conn **conns,
+static void tcp_revert_seq(const struct ctx *c, struct tcp_tap_conn **conns,
 			   struct iovec (*frames)[TCP_NUM_IOVS], int num_frames)
 {
 	int i;
@ -262,28 +131,20 @@ static void tcp_revert_seq(struct ctx *c, struct tcp_tap_conn **conns,
 }

 /**
- * tcp_payload_flush() - Send out buffers for segments with data
+ * tcp_payload_flush() - Send out buffers for segments with data or flags
 * @c:		Execution context
 */
-void tcp_payload_flush(struct ctx *c)
+void tcp_payload_flush(const struct ctx *c)
 {
 	size_t m;

-	m = tap_send_frames(c, &tcp6_l2_iov[0][0], TCP_NUM_IOVS,
-			    tcp6_payload_used);
-	if (m != tcp6_payload_used) {
-		tcp_revert_seq(c, &tcp6_frame_conns[m], &tcp6_l2_iov[m],
-			       tcp6_payload_used - m);
+	m = tap_send_frames(c, &tcp_l2_iov[0][0], TCP_NUM_IOVS,
+			    tcp_payload_used);
+	if (m != tcp_payload_used) {
+		tcp_revert_seq(c, &tcp_frame_conns[m], &tcp_l2_iov[m],
+			       tcp_payload_used - m);
 	}
-	tcp6_payload_used = 0;
-
-	m = tap_send_frames(c, &tcp4_l2_iov[0][0], TCP_NUM_IOVS,
-			    tcp4_payload_used);
-	if (m != tcp4_payload_used) {
-		tcp_revert_seq(c, &tcp4_frame_conns[m], &tcp4_l2_iov[m],
-			       tcp4_payload_used - m);
-	}
-	tcp4_payload_used = 0;
+	tcp_payload_used = 0;
 }

 /**
@ -294,58 +155,48 @@ void tcp_payload_flush(struct ctx *c)
 *
 * Return: negative error code on connection reset, 0 otherwise
 */
-int tcp_buf_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags)
+int tcp_buf_send_flag(const struct ctx *c, struct tcp_tap_conn *conn, int flags)
 {
-	struct tcp_flags_t *payload;
+	struct tcp_payload_t *payload;
 	struct iovec *iov;
 	size_t optlen;
 	size_t l4len;
 	uint32_t seq;
 	int ret;

-	if (CONN_V4(conn))
-		iov = tcp4_l2_flags_iov[tcp4_flags_used++];
-	else
-		iov = tcp6_l2_flags_iov[tcp6_flags_used++];
+	iov = tcp_l2_iov[tcp_payload_used];
+	if (CONN_V4(conn)) {
+		iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp4_payload_ip[tcp_payload_used]);
+		iov[TCP_IOV_ETH].iov_base = &tcp4_eth_src;
+	} else {
+		iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp6_payload_ip[tcp_payload_used]);
+		iov[TCP_IOV_ETH].iov_base = &tcp6_eth_src;
+	}

 	payload = iov[TCP_IOV_PAYLOAD].iov_base;
-
 	seq = conn->seq_to_tap;
 	ret = tcp_prepare_flags(c, conn, flags, &payload->th,
-				payload->opts, &optlen);
-	if (ret <= 0) {
-		if (CONN_V4(conn))
-			tcp4_flags_used--;
-		else
-			tcp6_flags_used--;
+				(struct tcp_syn_opts *)&payload->data, &optlen);
+	if (ret <= 0)
 		return ret;
-	}

-	l4len = tcp_l2_buf_fill_headers(conn, iov, optlen, NULL, seq);
+	tcp_payload_used++;
+	l4len = tcp_l2_buf_fill_headers(conn, iov, optlen, NULL, seq, false);
 	iov[TCP_IOV_PAYLOAD].iov_len = l4len;
-
 	if (flags & DUP_ACK) {
-		struct iovec *dup_iov;
-		int i;
+		struct iovec *dup_iov = tcp_l2_iov[tcp_payload_used++];

-		if (CONN_V4(conn))
-			dup_iov = tcp4_l2_flags_iov[tcp4_flags_used++];
-		else
-			dup_iov = tcp6_l2_flags_iov[tcp6_flags_used++];
-
-		for (i = 0; i < TCP_NUM_IOVS; i++)
-			memcpy(dup_iov[i].iov_base, iov[i].iov_base,
-			       iov[i].iov_len);
-		dup_iov[TCP_IOV_PAYLOAD].iov_len = iov[TCP_IOV_PAYLOAD].iov_len;
+		memcpy(dup_iov[TCP_IOV_TAP].iov_base, iov[TCP_IOV_TAP].iov_base,
+		       iov[TCP_IOV_TAP].iov_len);
+		dup_iov[TCP_IOV_ETH].iov_base = iov[TCP_IOV_ETH].iov_base;
+		dup_iov[TCP_IOV_IP] = iov[TCP_IOV_IP];
+		memcpy(dup_iov[TCP_IOV_PAYLOAD].iov_base,
+		       iov[TCP_IOV_PAYLOAD].iov_base, l4len);
+		dup_iov[TCP_IOV_PAYLOAD].iov_len = l4len;
 	}

-	if (CONN_V4(conn)) {
-		if (tcp4_flags_used > TCP_FRAMES_MEM - 2)
-			tcp_flags_flush(c);
-	} else {
-		if (tcp6_flags_used > TCP_FRAMES_MEM - 2)
-			tcp_flags_flush(c);
-	}
+	if (tcp_payload_used > TCP_FRAMES_MEM - 2)
+		tcp_payload_flush(c);

 	return 0;
 }
@ -358,39 +209,39 @@ int tcp_buf_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags)
 * @no_csum:	Don't compute IPv4 checksum, use the one from previous buffer
 * @seq:	Sequence number to be sent
 */
-static void tcp_data_to_tap(struct ctx *c, struct tcp_tap_conn *conn,
+static void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn,
 			    ssize_t dlen, int no_csum, uint32_t seq)
 {
+	struct tcp_payload_t *payload;
+	const uint16_t *check = NULL;
 	struct iovec *iov;
 	size_t l4len;

 	conn->seq_to_tap = seq + dlen;
-
+	tcp_frame_conns[tcp_payload_used] = conn;
+	iov = tcp_l2_iov[tcp_payload_used];
 	if (CONN_V4(conn)) {
-		struct iovec *iov_prev = tcp4_l2_iov[tcp4_payload_used - 1];
-		const uint16_t *check = NULL;
-
 		if (no_csum) {
+			struct iovec *iov_prev = tcp_l2_iov[tcp_payload_used - 1];
 			struct iphdr *iph = iov_prev[TCP_IOV_IP].iov_base;
+
 			check = &iph->check;
 		}
-
-		tcp4_frame_conns[tcp4_payload_used] = conn;
-
-		iov = tcp4_l2_iov[tcp4_payload_used++];
-		l4len = tcp_l2_buf_fill_headers(conn, iov, dlen, check, seq);
-		iov[TCP_IOV_PAYLOAD].iov_len = l4len;
-		if (tcp4_payload_used > TCP_FRAMES_MEM - 1)
-			tcp_payload_flush(c);
+		iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp4_payload_ip[tcp_payload_used]);
+		iov[TCP_IOV_ETH].iov_base = &tcp4_eth_src;
 	} else if (CONN_V6(conn)) {
-		tcp6_frame_conns[tcp6_payload_used] = conn;
-
-		iov = tcp6_l2_iov[tcp6_payload_used++];
-		l4len = tcp_l2_buf_fill_headers(conn, iov, dlen, NULL, seq);
-		iov[TCP_IOV_PAYLOAD].iov_len = l4len;
-		if (tcp6_payload_used > TCP_FRAMES_MEM - 1)
-			tcp_payload_flush(c);
+		iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp6_payload_ip[tcp_payload_used]);
+		iov[TCP_IOV_ETH].iov_base = &tcp6_eth_src;
 	}
+	payload = iov[TCP_IOV_PAYLOAD].iov_base;
+	payload->th.th_off = sizeof(struct tcphdr) / 4;
+	payload->th.th_x2 = 0;
+	payload->th.th_flags = 0;
+	payload->th.ack = 1;
+	l4len = tcp_l2_buf_fill_headers(conn, iov, dlen, check, seq, false);
+	iov[TCP_IOV_PAYLOAD].iov_len = l4len;
+	if (++tcp_payload_used > TCP_FRAMES_MEM - 1)
+		tcp_payload_flush(c);
 }

 /**
@ -402,12 +253,11 @@ static void tcp_data_to_tap(struct ctx *c, struct tcp_tap_conn *conn,
 *
 * #syscalls recvmsg
 */
-int tcp_buf_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn)
+int tcp_buf_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn)
 {
 	uint32_t wnd_scaled = conn->wnd_from_tap << conn->ws_from_tap;
 	int fill_bufs, send_bufs = 0, last_len, iov_rem = 0;
-	int sendlen, len, dlen, v4 = CONN_V4(conn);
-	int s = conn->sock, i, ret = 0;
+	int len, dlen, i, s = conn->sock;
 	struct msghdr mh_sock = { 0 };
 	uint16_t mss = MSS_GET(conn);
 	uint32_t already_sent, seq;
@ -454,19 +304,15 @@ int tcp_buf_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn)
 		mh_sock.msg_iovlen = fill_bufs;
 	}

-	if (( v4 && tcp4_payload_used + fill_bufs > TCP_FRAMES_MEM) ||
-	    (!v4 && tcp6_payload_used + fill_bufs > TCP_FRAMES_MEM)) {
+	if (tcp_payload_used + fill_bufs > TCP_FRAMES_MEM) {
 		tcp_payload_flush(c);

 		/* Silence Coverity CWE-125 false positive */
-		tcp4_payload_used = tcp6_payload_used = 0;
+		tcp_payload_used = 0;
 	}

 	for (i = 0, iov = iov_sock + 1; i < fill_bufs; i++, iov++) {
-		if (v4)
-			iov->iov_base = &tcp4_payload[tcp4_payload_used + i].data;
-		else
-			iov->iov_base = &tcp6_payload[tcp6_payload_used + i].data;
+		iov->iov_base = &tcp_payload[tcp_payload_used + i].data;
 		iov->iov_len = mss;
 	}
 	if (iov_rem)
@ -477,12 +323,19 @@ int tcp_buf_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn)
 		len = recvmsg(s, &mh_sock, MSG_PEEK);
 	while (len < 0 && errno == EINTR);

-	if (len < 0)
-		goto err;
+	if (len < 0) {
+		int err = errno;	/* tcp_rst() may overwrite errno */
+
+		if (err == EAGAIN || err == EWOULDBLOCK)
+			return 0;
+		tcp_rst(c, conn);
+		return -err;
+	}

 	if (!len) {
 		if ((conn->events & (SOCK_FIN_RCVD | TAP_FIN_SENT)) == SOCK_FIN_RCVD) {
-			if ((ret = tcp_buf_send_flag(c, conn, FIN | ACK))) {
+			int ret = tcp_buf_send_flag(c, conn, FIN | ACK);
+			if (ret) {
 				tcp_rst(c, conn);
 				return ret;
 			}
@ -493,28 +346,27 @@ int tcp_buf_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn)
 		return 0;
 	}

-	sendlen = len;
 	if (!peek_offset_cap)
-		sendlen -= already_sent;
+		len -= already_sent;

-	if (sendlen <= 0) {
+	if (len <= 0) {
 		conn_flag(c, conn, STALLED);
 		return 0;
 	}

 	conn_flag(c, conn, ~STALLED);

-	send_bufs = DIV_ROUND_UP(sendlen, mss);
-	last_len = sendlen - (send_bufs - 1) * mss;
+	send_bufs = DIV_ROUND_UP(len, mss);
+	last_len = len - (send_bufs - 1) * mss;

 	/* Likely, some new data was acked too. */
-	tcp_update_seqack_wnd(c, conn, 0, NULL);
+	tcp_update_seqack_wnd(c, conn, false, NULL);

 	/* Finally, queue to tap */
 	dlen = mss;
 	seq = conn->seq_to_tap;
 	for (i = 0; i < send_bufs; i++) {
-		int no_csum = i && i != send_bufs - 1 && tcp4_payload_used;
+		int no_csum = i && i != send_bufs - 1 && tcp_payload_used;

 		if (i == send_bufs - 1)
 			dlen = last_len;
@ -526,12 +378,4 @@ int tcp_buf_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn)
 	conn_flag(c, conn, ACK_FROM_TAP_DUE);

 	return 0;
-
-err:
-	if (errno != EAGAIN && errno != EWOULDBLOCK) {
-		ret = -errno;
-		tcp_rst(c, conn);
-	}
-
-	return ret;
 }
--- a/tcp_buf.h
+++ b/tcp_buf.h
@ -6,11 +6,9 @@
 #ifndef TCP_BUF_H
 #define TCP_BUF_H

-void tcp_sock4_iov_init(const struct ctx *c);
-void tcp_sock6_iov_init(const struct ctx *c);
-void tcp_flags_flush(const struct ctx *c);
-void tcp_payload_flush(struct ctx *c);
-int tcp_buf_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn);
-int tcp_buf_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags);
+void tcp_sock_iov_init(const struct ctx *c);
+void tcp_payload_flush(const struct ctx *c);
+int tcp_buf_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn);
+int tcp_buf_send_flag(const struct ctx *c, struct tcp_tap_conn *conn, int flags);

 #endif  /*TCP_BUF_H */
--- a/tcp_internal.h
+++ b/tcp_internal.h
@ -33,9 +33,7 @@
 #define OPT_EOL		0
 #define OPT_NOP		1
 #define OPT_MSS		2
-#define OPT_MSS_LEN	4
 #define OPT_WS		3
-#define OPT_WS_LEN	3
 #define OPT_SACKP	4
 #define OPT_SACK	5
 #define OPT_TS		8
@ -63,6 +61,79 @@ enum tcp_iov_parts {
 	TCP_NUM_IOVS
 };

+/**
+ * struct tcp_payload_t - TCP header and data to send segments with payload
+ * @th:		TCP header
+ * @data:	TCP data, sized so that @th plus @data take IP_MAX_MTU bytes
+ *
+ * The structure is packed. It's aligned to 32 bytes if __AVX2__ is defined,
+ * and to the alignment of unsigned int otherwise.
+ */
+struct tcp_payload_t {
+	struct tcphdr th;
+	uint8_t data[IP_MAX_MTU - sizeof(struct tcphdr)];
+#ifdef __AVX2__
+} __attribute__ ((packed, aligned(32)));    /* For AVX2 checksum routines */
+#else
+} __attribute__ ((packed, aligned(__alignof__(unsigned int))));
+#endif
+
+/** struct tcp_opt_nop - TCP NOP option
+ * @kind:	Option kind (OPT_NOP == 1)
+ *
+ * TCP_OPT_NOP evaluates to a compound literal of this type, with @kind set to
+ * OPT_NOP.
+ */
+struct tcp_opt_nop {
+	uint8_t kind;
+} __attribute__ ((packed));
+#define TCP_OPT_NOP		((struct tcp_opt_nop){ .kind = OPT_NOP, })
+
+/** struct tcp_opt_mss - TCP MSS option
+ * @kind:	Option kind (OPT_MSS == 2)
+ * @len:	Option length (4)
+ * @mss:	Maximum Segment Size, in network byte order
+ *
+ * TCP_OPT_MSS(mss_) evaluates to a compound literal of this type: @len is the
+ * size of the (packed) structure, and mss_ is converted with htons().
+ */
+struct tcp_opt_mss {
+	uint8_t kind;
+	uint8_t len;
+	uint16_t mss;
+} __attribute__ ((packed));
+#define TCP_OPT_MSS(mss_)				\
+	((struct tcp_opt_mss) {				\
+		.kind = OPT_MSS,			\
+		.len = sizeof(struct tcp_opt_mss),	\
+		.mss = htons(mss_),			\
+	})
+
+/** struct tcp_opt_ws - TCP Window Scaling option
+ * @kind:	Option kind (OPT_WS == 3)
+ * @len:	Option length (3)
+ * @shift:	Window scaling shift
+ *
+ * TCP_OPT_WS(shift_) evaluates to a compound literal of this type: @len is the
+ * size of the (packed) structure, and @shift is set to shift_ as it is.
+ */
+struct tcp_opt_ws {
+	uint8_t kind;
+	uint8_t len;
+	uint8_t shift;
+} __attribute__ ((packed));
+#define TCP_OPT_WS(shift_)				\
+	((struct tcp_opt_ws) {				\
+		.kind = OPT_WS,				\
+		.len = sizeof(struct tcp_opt_ws),	\
+		.shift = (shift_),			\
+	})
+
+/** struct tcp_syn_opts - TCP options we apply to SYN packets
+ * @mss:	Maximum Segment Size (MSS) option
+ * @nop:	NOP opt (for alignment)
+ * @ws:		Window Scaling (WS) option
+ *
+ * The members are packed back to back: MSS (4 bytes), NOP (1 byte) and WS
+ * (3 bytes). TCP_SYN_OPTS(mss_, ws_) evaluates to a compound literal of this
+ * type, built from TCP_OPT_MSS(mss_), TCP_OPT_NOP and TCP_OPT_WS(ws_).
+ */
+struct tcp_syn_opts {
+	struct tcp_opt_mss mss;
+	struct tcp_opt_nop nop;
+	struct tcp_opt_ws ws;
+} __attribute__ ((packed));
+#define TCP_SYN_OPTS(mss_, ws_)				\
+	((struct tcp_syn_opts){				\
+		.mss = TCP_OPT_MSS(mss_),		\
+		.nop = TCP_OPT_NOP,			\
+		.ws = TCP_OPT_WS(ws_),			\
+	})
+
 extern char tcp_buf_discard [MAX_WINDOW];

 void conn_flag_do(const struct ctx *c, struct tcp_tap_conn *conn,
@ -82,19 +153,23 @@ void conn_event_do(const struct ctx *c, struct tcp_tap_conn *conn,
 		conn_event_do(c, conn, event);				\
 	} while (0)

-void tcp_rst_do(struct ctx *c, struct tcp_tap_conn *conn);
+void tcp_rst_do(const struct ctx *c, struct tcp_tap_conn *conn);
 #define tcp_rst(c, conn)						\
 	do {								\
 		flow_dbg((conn), "TCP reset at %s:%i", __func__, __LINE__); \
 		tcp_rst_do(c, conn);					\
 	} while (0)

+struct tcp_info_linux;
+
 size_t tcp_l2_buf_fill_headers(const struct tcp_tap_conn *conn,
 			       struct iovec *iov, size_t dlen,
-			       const uint16_t *check, uint32_t seq);
+			       const uint16_t *check, uint32_t seq,
+			       bool no_tcp_csum);
 int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_tap_conn *conn,
-			  int force_seq, struct tcp_info *tinfo);
-int tcp_prepare_flags(struct ctx *c, struct tcp_tap_conn *conn, int flags,
-		      struct tcphdr *th, char *data, size_t *optlen);
+			  bool force_seq, struct tcp_info_linux *tinfo);
+int tcp_prepare_flags(const struct ctx *c, struct tcp_tap_conn *conn,
+		      int flags, struct tcphdr *th, struct tcp_syn_opts *opts,
+		      size_t *optlen);

 #endif /* TCP_INTERNAL_H */
--- a/tcp_splice.c
+++ b/tcp_splice.c
@ -320,7 +320,7 @@ static int tcp_splice_connect_finish(const struct ctx *c,
 			}

 			if (fcntl(conn->pipe[sidei][0], F_SETPIPE_SZ,
-				  c->tcp.pipe_size)) {
+				  c->tcp.pipe_size) != (int)c->tcp.pipe_size) {
 				flow_trace(conn,
 					   "cannot set %d->%d pipe size to %zu",
 					   sidei, !sidei, c->tcp.pipe_size);
@ -503,7 +503,7 @@ swap:
 	lowat_act_flag = RCVLOWAT_ACT(fromsidei);

 	while (1) {
-		ssize_t readlen, to_write = 0, written;
+		ssize_t readlen, written, pending;
 		int more = 0;

 retry:
@ -518,14 +518,11 @@ retry:

 			if (errno != EAGAIN)
 				goto close;
-
-			to_write = c->tcp.pipe_size;
 		} else if (!readlen) {
 			eof = 1;
-			to_write = c->tcp.pipe_size;
 		} else {
 			never_read = 0;
-			to_write += readlen;
+
 			if (readlen >= (long)c->tcp.pipe_size * 90 / 100)
 				more = SPLICE_F_MORE;

@ -535,10 +532,10 @@ retry:

 eintr:
 		written = splice(conn->pipe[fromsidei][0], NULL,
-				 conn->s[!fromsidei], NULL, to_write,
+				 conn->s[!fromsidei], NULL, c->tcp.pipe_size,
 				 SPLICE_F_MOVE | more | SPLICE_F_NONBLOCK);
 		flow_trace(conn, "%zi from write-side call (passed %zi)",
-			   written, to_write);
+			   written, c->tcp.pipe_size);

 		/* Most common case: skip updating counters. */
 		if (readlen > 0 && readlen == written) {
@ -584,10 +581,9 @@ eintr:
 		if (never_read && written == (long)(c->tcp.pipe_size))
 			goto retry;

-		if (!never_read && written < to_write) {
-			to_write -= written;
+		pending = conn->read[fromsidei] - conn->written[fromsidei];
+		if (!never_read && written > 0 && written < pending)
 			goto retry;
-		}

 		if (eof)
 			break;
@ -676,7 +672,7 @@ static void tcp_splice_pipe_refill(const struct ctx *c)
 			continue;

 		if (fcntl(splice_pipe_pool[i][0], F_SETPIPE_SZ,
-			  c->tcp.pipe_size)) {
+			  c->tcp.pipe_size) != (int)c->tcp.pipe_size) {
 			trace("TCP (spliced): cannot set pool pipe size to %zu",
 			      c->tcp.pipe_size);
 		}
--- a/test/Makefile
+++ b/test/Makefile
@ -8,7 +8,6 @@
 WGET = wget -c

 DEBIAN_IMGS = debian-8.11.0-openstack-amd64.qcow2 \
-	debian-9-nocloud-amd64-daily-20200210-166.qcow2 \
 	debian-10-nocloud-amd64.qcow2 \
 	debian-10-generic-arm64.qcow2 \
 	debian-10-generic-ppc64el-20220911-1135.qcow2 \
@ -42,8 +41,7 @@ OPENSUSE_IMGS = openSUSE-Leap-15.1-JeOS.x86_64-kvm-and-xen.qcow2 \
 	openSUSE-Leap-15.2-JeOS.x86_64-kvm-and-xen.qcow2 \
 	openSUSE-Leap-15.3-JeOS.x86_64-kvm-and-xen.qcow2 \
 	openSUSE-Tumbleweed-ARM-JeOS-efi.aarch64.raw.xz \
-	openSUSE-Tumbleweed-ARM-JeOS-efi.armv7l.raw.xz \
-	openSUSE-Tumbleweed-JeOS.x86_64-kvm-and-xen.qcow2
+	openSUSE-Tumbleweed-ARM-JeOS-efi.armv7l.raw.xz

 UBUNTU_OLD_IMGS = trusty-server-cloudimg-amd64-disk1.img \
 	trusty-server-cloudimg-i386-disk1.img \
@ -135,9 +133,6 @@ realclean: clean
 debian-8.11.0-openstack-%.qcow2:
 	$(WGET) -O $@ https://cloud.debian.org/images/cloud/OpenStack/archive/8.11.0/debian-8.11.0-openstack-$*.qcow2

-debian-9-nocloud-%-daily-20200210-166.qcow2:
-	$(WGET) -O $@ https://cloud.debian.org/images/cloud/stretch/daily/20200210-166/debian-9-nocloud-$*-daily-20200210-166.qcow2
-
 debian-10-nocloud-%.qcow2:
 	$(WGET) -O $@ https://cloud.debian.org/images/cloud/buster/latest/debian-10-nocloud-$*.qcow2

@ -203,9 +198,6 @@ openSUSE-Tumbleweed-ARM-JeOS-efi.aarch64.raw.xz:
 openSUSE-Tumbleweed-ARM-JeOS-efi.armv7l.raw.xz:
 	$(WGET) -O $@ http://download.opensuse.org/ports/armv7hl/tumbleweed/appliances/openSUSE-Tumbleweed-ARM-JeOS-efi.armv7l.raw.xz

-openSUSE-Tumbleweed-JeOS.x86_64-kvm-and-xen.qcow2:
-	$(WGET) -O $@ https://download.opensuse.org/tumbleweed/appliances/openSUSE-Tumbleweed-JeOS.x86_64-kvm-and-xen.qcow2
-
 # Ubuntu downloads
 trusty-server-cloudimg-%-disk1.img:
 	$(WGET) -O $@ https://cloud-images.ubuntu.com/trusty/current/trusty-server-cloudimg-$*-disk1.img
--- a/test/lib/setup
+++ b/test/lib/setup
@ -58,7 +58,7 @@ setup_passt() {
 	context_run_bg qemu 'qemu-system-'"${QEMU_ARCH}"		   \
 		' -machine accel=kvm'                                      \
 		' -m '${VMEM}' -cpu host -smp '${VCPUS}                    \
-		' -kernel ' "/boot/vmlinuz-$(uname -r)"			   \
+		' -kernel '"${KERNEL}"					   \
 		' -initrd '${INITRAMFS}' -nographic -serial stdio'	   \
 		' -nodefaults'						   \
 		' -append "console=ttyS0 mitigations=off apparmor=0" '	   \
@ -159,7 +159,7 @@ setup_passt_in_ns() {
 		' -machine accel=kvm'                                      \
 		' -M accel=kvm:tcg'                                        \
 		' -m '${VMEM}' -cpu host -smp '${VCPUS}                    \
-		' -kernel ' "/boot/vmlinuz-$(uname -r)"			   \
+		' -kernel '"${KERNEL}"					   \
 		' -initrd '${INITRAMFS}' -nographic -serial stdio'	   \
 		' -nodefaults'						   \
 		' -append "console=ttyS0 mitigations=off apparmor=0" '	   \
@ -230,7 +230,7 @@ setup_two_guests() {
 	context_run_bg qemu_1 'qemu-system-'"${QEMU_ARCH}"		     \
 		' -M accel=kvm:tcg'                                          \
 		' -m '${VMEM}' -cpu host -smp '${VCPUS}                      \
-		' -kernel ' "/boot/vmlinuz-$(uname -r)"			     \
+		' -kernel '"${KERNEL}"					     \
 		' -initrd '${INITRAMFS}' -nographic -serial stdio'	     \
 		' -nodefaults'						     \
 		' -append "console=ttyS0 mitigations=off apparmor=0" '	     \
@ -243,7 +243,7 @@ setup_two_guests() {
 	context_run_bg qemu_2 'qemu-system-'"${QEMU_ARCH}"		     \
 		' -M accel=kvm:tcg'                                          \
 		' -m '${VMEM}' -cpu host -smp '${VCPUS}                      \
-		' -kernel ' "/boot/vmlinuz-$(uname -r)"			     \
+		' -kernel '"${KERNEL}"					     \
 		' -initrd '${INITRAMFS}' -nographic -serial stdio'	     \
 		' -nodefaults'						     \
 		' -append "console=ttyS0 mitigations=off apparmor=0" '	     \
--- a/test/lib/term
+++ b/test/lib/term
@ -31,8 +31,8 @@ PR_DELAY_INIT=100 # ms
 # $@:	Message to print
 info() {
 	tmux select-pane -t ${PANE_INFO}
-	echo "${@}" >> $STATEBASE/log_pipe
-	echo "${@}" >> "${LOGFILE}"
+	printf "${@}\n" >> $STATEBASE/log_pipe
+	printf "${@}\n" >> "${LOGFILE}"
 }

 # info_n() - Highlight, print message to pane and to log file without newline
@ -47,13 +47,13 @@ info_n() {
 # $@:	Message to print
 info_nolog() {
 	tmux select-pane -t ${PANE_INFO}
-	echo "${@}" >> $STATEBASE/log_pipe
+	printf "${@}\n" >> $STATEBASE/log_pipe
 }

-# info_nolog() - Print message to log file
+# log() - Print message to log file
 # $@:	Message to print
 log() {
-	echo "${@}" >> "${LOGFILE}"
+	printf "${@}\n" >> "${LOGFILE}"
 }

 # info_nolog_n() - Send message to pane without highlighting it, without newline
@ -664,7 +664,7 @@ pause_continue() {

 # run_term() - Start tmux session, running entry point, with recording if needed
 run_term() {
-	TMUX="tmux new-session -s passt_test -eSTATEBASE=$STATEBASE -ePCAP=$PCAP -eDEBUG=$DEBUG"
+	TMUX="tmux new-session -s passt_test -eSTATEBASE=$STATEBASE -ePCAP=$PCAP -eDEBUG=$DEBUG -eTRACE=$TRACE -eKERNEL=$KERNEL"

 	if [ ${CI} -eq 1 ]; then
 		printf '\e[8;50;240t'
--- a/test/nstool.c
+++ b/test/nstool.c
@ -31,10 +31,15 @@

 #define	ARRAY_SIZE(a)	((int)(sizeof(a) / sizeof((a)[0])))

-#define die(...)				\
-	do {					\
-		fprintf(stderr, __VA_ARGS__);	\
-		exit(1);			\
+#define die(...)						\
+	do {							\
+		fprintf(stderr, "nstool: " __VA_ARGS__);	\
+		exit(1);					\
+	} while (0)
+
+#define err(...)						\
+	do {							\
+		fprintf(stderr, "nstool: " __VA_ARGS__);	\
 	} while (0)

 struct ns_type {
@ -156,6 +161,9 @@ static int connect_ctl(const char *sockpath, bool wait,

 static void cmd_hold(int argc, char *argv[])
 {
+	struct sigaction sa = {
+		.sa_handler = SIG_IGN,
+	};
 	int fd = socket(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, PF_UNIX);
 	struct sockaddr_un addr;
 	const char *sockpath = argv[1];
@ -185,6 +193,10 @@ static void cmd_hold(int argc, char *argv[])
 	if (!getcwd(info.cwd, sizeof(info.cwd)))
 		die("getcwd(): %s\n", strerror(errno));

+	rc = sigaction(SIGPIPE, &sa, NULL);
+	if (rc)
+		die("sigaction(SIGPIPE): %s\n", strerror(errno));
+
 	do {
 		int afd = accept(fd, NULL, NULL);
 		char buf;
@ -193,17 +205,21 @@ static void cmd_hold(int argc, char *argv[])
 			die("accept(): %s\n", strerror(errno));

 		rc = write(afd, &info, sizeof(info));
-		if (rc < 0)
-			die("write(): %s\n", strerror(errno));
+		if (rc < 0) {
+			err("holder write() to control socket: %s\n",
+			    strerror(errno));
+		}
 		if ((size_t)rc < sizeof(info))
-			die("short write() on control socket\n");
+			err("holder short write() on control socket\n");

 		rc = read(afd, &buf, sizeof(buf));
-		if (rc < 0)
-			die("read(): %s\n", strerror(errno));
+		if (rc < 0) {
+			err("holder read() on control socket: %s\n",
+			    strerror(errno));
+		}

 		close(afd);
-	} while (rc == 0);
+	} while (rc <= 0);

 	unlink(sockpath);
 }
@ -346,7 +362,7 @@ static int openns(const char *fmt, ...)
 }

 static pid_t sig_pid;
-static void sig_handler(int signum)
+static void sig_propagate(int signum)
 {
 	int err;

@ -358,7 +374,7 @@ static void sig_handler(int signum)
 static void wait_for_child(pid_t pid)
 {
 	struct sigaction sa = {
-		.sa_handler = sig_handler,
+		.sa_handler = sig_propagate,
 		.sa_flags = SA_RESETHAND,
 	};
 	int status, err;
--- a/test/passt/dhcp
+++ b/test/passt/dhcp
@ -49,6 +49,8 @@ check	[ "__SEARCH__" = "__HOST_SEARCH__" ]

 test	DHCPv6: address
 guest	/sbin/dhclient -6 __IFNAME__
+# Wait for DAD to complete
+guest	while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done
 gout	ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__IFNAME__").addr_info[] | select(.prefixlen == 128).local] | .[0]'
 hout	HOST_ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__HOST_IFNAME6__").addr_info[] | select(.scope == "global" and .deprecated != true).local] | .[0]'
 check	[ "__ADDR6__" = "__HOST_ADDR6__" ]
--- a/test/passt/ndp
+++ b/test/passt/ndp
@ -16,13 +16,15 @@ htools	ip jq sipcalc grep cut

 test	Interface name
 gout	IFNAME ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname'
-guest	ip link set dev __IFNAME__ up && sleep 2
+guest	ip link set dev __IFNAME__ up
+# Wait for DAD to complete
+guest	while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done
 hout	HOST_IFNAME6 ip -j -6 route show|jq -rM '[.[] | select(.dst == "default").dev] | .[0]'
 check	[ -n "__IFNAME__" ]

 test	SLAAC: prefix
-gout	ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__IFNAME__").addr_info[] | select(.scope == "global" and .prefixlen == 64).local] | .[0]'
-gout	PREFIX6 sipcalc __ADDR6__/64 | grep prefix | cut -d' ' -f4
+gout	ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__IFNAME__").addr_info[] | select(.scope == "global" and .protocol == "kernel_ra") | .local + "/" + (.prefixlen | tostring)] | .[0]'
+gout	PREFIX6 sipcalc __ADDR6__ | grep prefix | cut -d' ' -f4
 hout	HOST_ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__HOST_IFNAME6__").addr_info[] | select(.scope == "global" and .deprecated != true).local] | .[0]'
 hout	HOST_PREFIX6 sipcalc __HOST_ADDR6__/64 | grep prefix | cut -d' ' -f4
 check	[ "__PREFIX6__" = "__HOST_PREFIX6__" ]
--- a/test/passt_in_ns/dhcp
+++ b/test/passt_in_ns/dhcp
@ -52,6 +52,8 @@ check	[ "__SEARCH__" = "__HOST_SEARCH__" ]

 test	DHCPv6: address
 guest	/sbin/dhclient -6 __IFNAME__
+# Wait for DAD to complete
+guest	while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done
 gout	ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__IFNAME__").addr_info[] | select(.prefixlen == 128).local] | .[0]'
 hout	HOST_ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__HOST_IFNAME6__").addr_info[] | select(.scope == "global" and .deprecated != true).local] | .[0]'
 check	[ "__ADDR6__" = "__HOST_ADDR6__" ]
--- a/test/passt_in_ns/tcp
+++ b/test/passt_in_ns/tcp
@ -32,7 +32,7 @@ host	socat -u OPEN:__BASEPATH__/big.bin TCP4:127.0.0.1:10001
 guestw
 guest	cmp test_big.bin /root/big.bin

-test	TCP/IPv4: host to ns: big transfer
+test	TCP/IPv4: host to ns (spliced): big transfer
 nsb	socat -u TCP4-LISTEN:10002 OPEN:__TEMP_NS_BIG__,create,trunc
 sleep	1
 host	socat -u OPEN:__BASEPATH__/big.bin TCP4:127.0.0.1:10002
@ -90,7 +90,7 @@ host	socat -u OPEN:__BASEPATH__/small.bin TCP4:127.0.0.1:10001
 guestw
 guest	cmp test_small.bin /root/small.bin

-test	TCP/IPv4: host to ns: small transfer
+test	TCP/IPv4: host to ns (spliced): small transfer
 nsb	socat -u TCP4-LISTEN:10002 OPEN:__TEMP_NS_SMALL__,create,trunc
 sleep	1
 host	socat -u OPEN:__BASEPATH__/small.bin TCP4:127.0.0.1:10002
@ -146,7 +146,7 @@ host	socat -u OPEN:__BASEPATH__/big.bin TCP6:[::1]:10001
 guestw
 guest	cmp test_big.bin /root/big.bin

-test	TCP/IPv6: host to ns: big transfer
+test	TCP/IPv6: host to ns (spliced): big transfer
 nsb	socat -u TCP6-LISTEN:10002 OPEN:__TEMP_NS_BIG__,create,trunc
 sleep	1
 host	socat -u OPEN:__BASEPATH__/big.bin TCP6:[::1]:10002
@ -204,7 +204,7 @@ host	socat -u OPEN:__BASEPATH__/small.bin TCP6:[::1]:10001
 guestw
 guest	cmp test_small.bin /root/small.bin

-test	TCP/IPv6: host to ns: small transfer
+test	TCP/IPv6: host to ns (spliced): small transfer
 nsb	socat -u TCP6-LISTEN:10002 OPEN:__TEMP_NS_SMALL__,create,trunc
 sleep	1
 host	socat -u OPEN:__BASEPATH__/small.bin TCP6:[::1]:10002
--- a/test/passt_in_ns/udp
+++ b/test/passt_in_ns/udp
@ -30,7 +30,7 @@ host	socat -u OPEN:__BASEPATH__/medium.bin UDP4:127.0.0.1:10001,shut-null
 guestw
 guest	cmp test.bin /root/medium.bin

-test	UDP/IPv4: host to ns
+test	UDP/IPv4: host to ns (recvmmsg/sendmmsg)
 nsb	socat -u UDP4-LISTEN:10002,null-eof OPEN:__TEMP_NS__,create,trunc
 sleep	1
 host	socat -u OPEN:__BASEPATH__/medium.bin UDP4:127.0.0.1:10002,shut-null
@ -88,7 +88,7 @@ host	socat -u OPEN:__BASEPATH__/medium.bin UDP6:[::1]:10001,shut-null
 guestw
 guest	cmp test.bin /root/medium.bin

-test	UDP/IPv6: host to ns
+test	UDP/IPv6: host to ns (recvmmsg/sendmmsg)
 nsb	socat -u UDP6-LISTEN:10002,null-eof OPEN:__TEMP_NS__,create,trunc
 sleep	1
 host	socat -u OPEN:__BASEPATH__/medium.bin UDP6:[::1]:10002,shut-null
--- a/test/pasta/dhcp
+++ b/test/pasta/dhcp
@ -35,6 +35,8 @@ check	[ __MTU__ = 65520 ]

 test	DHCPv6: address
 ns	/sbin/dhclient -6 --no-pid __IFNAME__
+# Wait for DAD to complete
+ns	while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done
 hout	HOST_IFNAME6 ip -j -6 route show|jq -rM '[.[] | select(.dst == "default").dev] | .[0]'
 nsout	ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__IFNAME__").addr_info[] | select(.prefixlen == 128).local] | .[0]'
 hout	HOST_ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__HOST_IFNAME6__").addr_info[] | select(.scope == "global" and .deprecated != true).local] | .[0]'
--- a/test/pasta/ndp
+++ b/test/pasta/ndp
@ -18,11 +18,12 @@ test	Interface name
 nsout	IFNAME ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname'
 check	[ -n "__IFNAME__" ]
 ns	ip link set dev __IFNAME__ up
-sleep	2
+# Wait for DAD to complete
+ns	while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done

 test	SLAAC: prefix
-nsout	ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__IFNAME__").addr_info[] | select(.scope == "global" and .prefixlen == 64).local] | .[0]'
-nsout	PREFIX6 sipcalc __ADDR6__/64 | grep prefix | cut -d' ' -f4
+nsout	ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__IFNAME__").addr_info[] | select(.scope == "global" and .protocol == "kernel_ra") | .local + "/" + (.prefixlen | tostring)] | .[0]'
+nsout	PREFIX6 sipcalc __ADDR6__ | grep prefix | cut -d' ' -f4
 hout	HOST_ADDR6 ip -j -6 addr show|jq -rM ['.[] | select(.ifname == "__IFNAME__").addr_info[] | select(.scope == "global" and .deprecated != true).local] | .[0]'
 hout	HOST_PREFIX6 sipcalc __HOST_ADDR6__/64 | grep prefix | cut -d' ' -f4
 check	[ "__PREFIX6__" = "__HOST_PREFIX6__" ]
--- a/test/pasta/tcp
+++ b/test/pasta/tcp
@ -19,8 +19,8 @@ set	TEMP_NS_BIG __STATEDIR__/test_ns_big.bin
 set	TEMP_SMALL __STATEDIR__/test_small.bin
 set	TEMP_NS_SMALL __STATEDIR__/test_ns_small.bin

-test	TCP/IPv4: host to ns: big transfer
-nsb	socat -u TCP4-LISTEN:10002,bind=127.0.0.1 OPEN:__TEMP_NS_BIG__,create,trunc
+test	TCP/IPv4: host to ns (spliced): big transfer
+nsb	socat -u TCP4-LISTEN:10002 OPEN:__TEMP_NS_BIG__,create,trunc
 host	socat -u OPEN:__BASEPATH__/big.bin TCP4:127.0.0.1:10002
 nsw
 check	cmp __BASEPATH__/big.bin __TEMP_NS_BIG__
@ -38,8 +38,8 @@ ns	socat -u OPEN:__BASEPATH__/big.bin TCP4:__GW__:10003
 hostw
 check	cmp __BASEPATH__/big.bin __TEMP_BIG__

-test	TCP/IPv4: host to ns: small transfer
-nsb	socat -u TCP4-LISTEN:10002,bind=127.0.0.1 OPEN:__TEMP_NS_SMALL__,create,trunc
+test	TCP/IPv4: host to ns (spliced): small transfer
+nsb	socat -u TCP4-LISTEN:10002 OPEN:__TEMP_NS_SMALL__,create,trunc
 host	socat OPEN:__BASEPATH__/small.bin TCP4:127.0.0.1:10002
 nsw
 check	cmp __BASEPATH__/small.bin __TEMP_NS_SMALL__
@ -57,8 +57,8 @@ ns	socat -u OPEN:__BASEPATH__/small.bin TCP4:__GW__:10003
 hostw
 check	cmp __BASEPATH__/small.bin __TEMP_SMALL__

-test	TCP/IPv6: host to ns: big transfer
-nsb	socat -u TCP6-LISTEN:10002,bind=[::1] OPEN:__TEMP_NS_BIG__,create,trunc
+test	TCP/IPv6: host to ns (spliced): big transfer
+nsb	socat -u TCP6-LISTEN:10002 OPEN:__TEMP_NS_BIG__,create,trunc
 host	socat -u OPEN:__BASEPATH__/big.bin TCP6:[::1]:10002
 nsw
 check	cmp __BASEPATH__/big.bin __TEMP_NS_BIG__
@ -77,8 +77,8 @@ ns	socat -u OPEN:__BASEPATH__/big.bin TCP6:[__GW6__%__IFNAME__]:10003
 hostw
 check	cmp __BASEPATH__/big.bin __TEMP_BIG__

-test	TCP/IPv6: host to ns: small transfer
-nsb	socat -u TCP6-LISTEN:10002,bind=[::1] OPEN:__TEMP_NS_SMALL__,create,trunc
+test	TCP/IPv6: host to ns (spliced): small transfer
+nsb	socat -u TCP6-LISTEN:10002 OPEN:__TEMP_NS_SMALL__,create,trunc
 host	socat -u OPEN:__BASEPATH__/small.bin TCP6:[::1]:10002
 nsw
 check	cmp __BASEPATH__/small.bin __TEMP_NS_SMALL__
--- a/test/pasta/udp
+++ b/test/pasta/udp
@ -17,8 +17,8 @@ htools	dd socat ip jq
 set	TEMP __STATEDIR__/test.bin
 set	TEMP_NS __STATEDIR__/test_ns.bin

-test	UDP/IPv4: host to ns
-nsb	socat -u UDP4-LISTEN:10002,bind=127.0.0.1,null-eof OPEN:__TEMP_NS__,create,trunc
+test	UDP/IPv4: host to ns (recvmmsg/sendmmsg)
+nsb	socat -u UDP4-LISTEN:10002,null-eof OPEN:__TEMP_NS__,create,trunc
 host	socat OPEN:__BASEPATH__/medium.bin UDP4:127.0.0.1:10002,shut-null
 nsw
 check	cmp __BASEPATH__/medium.bin __TEMP_NS__
@ -37,8 +37,8 @@ ns	socat -u OPEN:__BASEPATH__/medium.bin UDP4:__GW__:10003,shut-null
 hostw
 check	cmp __BASEPATH__/medium.bin __TEMP__

-test	UDP/IPv6: host to ns
-nsb	socat -u UDP6-LISTEN:10002,bind=[::1],null-eof OPEN:__TEMP_NS__,create,trunc
+test	UDP/IPv6: host to ns (recvmmsg/sendmmsg)
+nsb	socat -u UDP6-LISTEN:10002,null-eof OPEN:__TEMP_NS__,create,trunc
 host	socat -u OPEN:__BASEPATH__/medium.bin UDP6:[::1]:10002,shut-null
 nsw
 check	cmp __BASEPATH__/medium.bin __TEMP_NS__
--- a/test/perf/passt_tcp
+++ b/test/perf/passt_tcp
@ -116,6 +116,8 @@ iperf3k	ns
 # Reducing MTU below 1280 deconfigures IPv6, get our address back
 guest	dhclient -6 -x
 guest	dhclient -6 __IFNAME__
+# Wait for DAD to complete
+guest	while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done

 tl	TCP RR latency over IPv4: guest to host
 lat	-
--- a/test/perf/pasta_tcp
+++ b/test/perf/pasta_tcp
@ -211,7 +211,7 @@ tr	TCP throughput over IPv6: host to ns
 iperf3s	ns 10002

 nsout	IFNAME ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname'
-nsout	ADDR6 ip -j -6 addr show|jq -rM '.[] | select(.ifname == "__IFNAME__").addr_info[] | select(.scope == "global" and .prefixlen == 64).local'
+nsout	ADDR6 ip -j -6 addr show|jq -rM '.[] | select(.ifname == "__IFNAME__").addr_info[] | select(.scope == "global").local'
 bw	-
 bw	-
 bw	-
--- a/test/perf/pasta_udp
+++ b/test/perf/pasta_udp
@ -196,7 +196,7 @@ tr	UDP throughput over IPv6: host to ns
 iperf3s	ns 10002

 nsout	IFNAME ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname'
-nsout	ADDR6 ip -j -6 addr show|jq -rM '.[] | select(.ifname == "__IFNAME__").addr_info[] | select(.scope == "global" and .prefixlen == 64).local'
+nsout	ADDR6 ip -j -6 addr show|jq -rM '.[] | select(.ifname == "__IFNAME__").addr_info[] | select(.scope == "global").local'
 iperf3	BW host __ADDR6__ 10002 __TIME__ __OPTS__ -b 8G -l 1472
 bw	__BW__ 0.3 0.5
 iperf3	BW host __ADDR6__ 10002 __TIME__ __OPTS__ -b 12G -l 3972
--- a/test/run
+++ b/test/run
@ -38,6 +38,9 @@ TRACE=${TRACE:-0}
 # If set, tell passt and pasta to take packet captures
 PCAP=${PCAP:-0}

+# Custom kernel to boot guests with, if given
+KERNEL=${KERNEL:-"/boot/vmlinuz-$(uname -r)"}
+
 COMMIT="$(git log --oneline --no-decorate -1)"

 . lib/util
--- a/test/two_guests/basic
+++ b/test/two_guests/basic
@ -36,9 +36,13 @@ check	[ "__ADDR2__" = "__HOST_ADDR__" ]

 test	DHCPv6: addresses
 # Link is up now, wait for DAD to complete
-sleep	2
+guest1	while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done
+guest2	while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done
 guest1	/sbin/dhclient -6 __IFNAME1__
 guest2	/sbin/dhclient -6 __IFNAME2__
+# Wait for DAD to complete on the DHCP address
+guest1	while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done
+guest2	while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done
 g1out	ADDR1_6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__IFNAME1__").addr_info[] | select(.prefixlen == 128).local] | .[0]'
 g2out	ADDR2_6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__IFNAME2__").addr_info[] | select(.prefixlen == 128).local] | .[0]'
 hout	HOST_ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__HOST_IFNAME6__").addr_info[] | select(.scope == "global" and .deprecated != true).local] | .[0]'
@ -48,33 +52,33 @@ check	[ "__ADDR2_6__" = "__HOST_ADDR6__" ]
 test	TCP/IPv4: guest 1 > guest 2
 g1out	GW1 ip -j -4 route show|jq -rM '.[] | select(.dst == "default").gateway'
 guest2b	socat -u TCP4-LISTEN:10004 OPEN:msg,create,trunc
+sleep	1
 guest1	echo "Hello_from_guest_1" | socat -u STDIN TCP4:__GW1__:10004
 guest2w
-sleep	1
 g2out	MSG2 cat msg
 check	[ "__MSG2__" = "Hello_from_guest_1" ]

 test	TCP/IPv6: guest 2 > guest 1
 g2out	GW2_6 ip -j -6 route show|jq -rM '.[] | select(.dst == "default").gateway'
 guest1b	socat -u TCP6-LISTEN:10001 OPEN:msg,create,trunc
+sleep	1
 guest2	echo "Hello_from_guest_2" | socat -u STDIN TCP6:[__GW2_6__%__IFNAME2__]:10001
 guest1w
-sleep	1
 g1out	MSG1 cat msg
 check	[ "__MSG1__" = "Hello_from_guest_2" ]

 test	UDP/IPv4: guest 1 > guest 2
 guest2b	socat -u TCP4-LISTEN:10004 OPEN:msg,create,trunc
+sleep	1
 guest1	echo "Hello_from_guest_1" | socat -u STDIN TCP4:__GW1__:10004
 guest2w
-sleep	1
 g2out	MSG2 cat msg
 check	[ "__MSG2__" = "Hello_from_guest_1" ]

 test	UDP/IPv6: guest 2 > guest 1
 guest1b	socat -u TCP6-LISTEN:10001 OPEN:msg,create,trunc
+sleep	1
 guest2	echo "Hello_from_guest_2" | socat -u STDIN TCP6:[__GW2_6__%__IFNAME2__]:10001
 guest1w
-sleep	1
 g1out	MSG1 cat msg
 check	[ "__MSG1__" = "Hello_from_guest_2" ]
--- a/udp.c
+++ b/udp.c
@ -169,11 +169,11 @@ udp_meta[UDP_MAX_FRAMES];
 * @UDP_NUM_IOVS        the number of entries in the iovec array
 */
 enum udp_iov_idx {
-	UDP_IOV_TAP	= 0,
-	UDP_IOV_ETH	= 1,
-	UDP_IOV_IP	= 2,
-	UDP_IOV_PAYLOAD	= 3,
-	UDP_NUM_IOVS
+	UDP_IOV_TAP,
+	UDP_IOV_ETH,
+	UDP_IOV_IP,
+	UDP_IOV_PAYLOAD,
+	UDP_NUM_IOVS,
 };

 /* IOVs and msghdr arrays for receiving datagrams from sockets */
@ -294,15 +294,17 @@ static void udp_splice_send(const struct ctx *c, size_t start, size_t n,

 /**
 * udp_update_hdr4() - Update headers for one IPv4 datagram
- * @ip4h:	Pre-filled IPv4 header (except for tot_len and saddr)
- * @bp:		Pointer to udp_payload_t to update
- * @toside:	Flowside for destination side
- * @dlen:	Length of UDP payload
+ * @ip4h:		Pre-filled IPv4 header (except for tot_len and saddr)
+ * @bp:			Pointer to udp_payload_t to update
+ * @toside:		Flowside for destination side
+ * @dlen:		Length of UDP payload
+ * @no_udp_csum:	Do not set UDP checksum
 *
 * Return: size of IPv4 payload (UDP header + data)
 */
 static size_t udp_update_hdr4(struct iphdr *ip4h, struct udp_payload_t *bp,
-			      const struct flowside *toside, size_t dlen)
+			      const struct flowside *toside, size_t dlen,
+			      bool no_udp_csum)
 {
 	const struct in_addr *src = inany_v4(&toside->oaddr);
 	const struct in_addr *dst = inany_v4(&toside->eaddr);
@ -319,22 +321,33 @@ static size_t udp_update_hdr4(struct iphdr *ip4h, struct udp_payload_t *bp,
 	bp->uh.source = htons(toside->oport);
 	bp->uh.dest = htons(toside->eport);
 	bp->uh.len = htons(l4len);
-	csum_udp4(&bp->uh, *src, *dst, bp->data, dlen);
+	if (no_udp_csum) {
+		bp->uh.check = 0;
+	} else {
+		const struct iovec iov = {
+			.iov_base = bp->data,
+			.iov_len = dlen
+		};
+		csum_udp4(&bp->uh, *src, *dst, &iov, 1, 0);
+	}

 	return l4len;
 }

 /**
 * udp_update_hdr6() - Update headers for one IPv6 datagram
- * @ip6h:	Pre-filled IPv6 header (except for payload_len and addresses)
- * @bp:		Pointer to udp_payload_t to update
- * @toside:	Flowside for destination side
- * @dlen:	Length of UDP payload
+ * @ip6h:		Pre-filled IPv6 header (except for payload_len and
+ * 			addresses)
+ * @bp:			Pointer to udp_payload_t to update
+ * @toside:		Flowside for destination side
+ * @dlen:		Length of UDP payload
+ * @no_udp_csum:	Do not set UDP checksum
 *
 * Return: size of IPv6 payload (UDP header + data)
 */
 static size_t udp_update_hdr6(struct ipv6hdr *ip6h, struct udp_payload_t *bp,
-			      const struct flowside *toside, size_t dlen)
+			      const struct flowside *toside, size_t dlen,
+			      bool no_udp_csum)
 {
 	uint16_t l4len = dlen + sizeof(bp->uh);

@ -348,7 +361,20 @@ static size_t udp_update_hdr6(struct ipv6hdr *ip6h, struct udp_payload_t *bp,
 	bp->uh.source = htons(toside->oport);
 	bp->uh.dest = htons(toside->eport);
 	bp->uh.len = ip6h->payload_len;
-	csum_udp6(&bp->uh, &toside->oaddr.a6, &toside->eaddr.a6, bp->data, dlen);
+	if (no_udp_csum) {
+		/* 0 is an invalid checksum for UDP IPv6 and dropped by
+		 * the kernel stack, even if the checksum is disabled by virtio
+		 * flags. We need to put any non-zero value here.
+		 */
+		bp->uh.check = 0xffff;
+	} else {
+		const struct iovec iov = {
+			.iov_base = bp->data,
+			.iov_len = dlen
+		};
+		csum_udp6(&bp->uh, &toside->oaddr.a6, &toside->eaddr.a6,
+			  &iov, 1, 0);
+	}

 	return l4len;
 }
@ -358,9 +384,11 @@ static size_t udp_update_hdr6(struct ipv6hdr *ip6h, struct udp_payload_t *bp,
 * @mmh:	Receiving mmsghdr array
 * @idx:	Index of the datagram to prepare
 * @toside:	Flowside for destination side
+ * @no_udp_csum: Do not set UDP checksum
 */
-static void udp_tap_prepare(const struct mmsghdr *mmh, unsigned idx,
-			    const struct flowside *toside)
+static void udp_tap_prepare(const struct mmsghdr *mmh,
+			    unsigned idx, const struct flowside *toside,
+			    bool no_udp_csum)
 {
 	struct iovec (*tap_iov)[UDP_NUM_IOVS] = &udp_l2_iov[idx];
 	struct udp_payload_t *bp = &udp_payload[idx];
@ -368,13 +396,15 @@ static void udp_tap_prepare(const struct mmsghdr *mmh, unsigned idx,
 	size_t l4len;

 	if (!inany_v4(&toside->eaddr) || !inany_v4(&toside->oaddr)) {
-		l4len = udp_update_hdr6(&bm->ip6h, bp, toside, mmh[idx].msg_len);
+		l4len = udp_update_hdr6(&bm->ip6h, bp, toside,
+					mmh[idx].msg_len, no_udp_csum);
 		tap_hdr_update(&bm->taph, l4len + sizeof(bm->ip6h) +
 			       sizeof(udp6_eth_hdr));
 		(*tap_iov)[UDP_IOV_ETH] = IOV_OF_LVALUE(udp6_eth_hdr);
 		(*tap_iov)[UDP_IOV_IP] = IOV_OF_LVALUE(bm->ip6h);
 	} else {
-		l4len = udp_update_hdr4(&bm->ip4h, bp, toside, mmh[idx].msg_len);
+		l4len = udp_update_hdr4(&bm->ip4h, bp, toside,
+					mmh[idx].msg_len, no_udp_csum);
 		tap_hdr_update(&bm->taph, l4len + sizeof(bm->ip4h) +
 			       sizeof(udp4_eth_hdr));
 		(*tap_iov)[UDP_IOV_ETH] = IOV_OF_LVALUE(udp4_eth_hdr);
@ -387,11 +417,12 @@ static void udp_tap_prepare(const struct mmsghdr *mmh, unsigned idx,
 * udp_sock_recverr() - Receive and clear an error from a socket
 * @s:		Socket to receive from
 *
- * Return: true if errors received and processed, false if no more errors
+ * Return: 1 if error received and processed, 0 if no more errors in queue, < 0
+ *         if there was an error reading the queue
 *
 * #syscalls recvmsg
 */
-static bool udp_sock_recverr(int s)
+static int udp_sock_recverr(int s)
 {
 	const struct sock_extended_err *ee;
 	const struct cmsghdr *hdr;
@ -408,14 +439,16 @@ static bool udp_sock_recverr(int s)

 	rc = recvmsg(s, &mh, MSG_ERRQUEUE);
 	if (rc < 0) {
-		if (errno != EAGAIN && errno != EWOULDBLOCK)
-			err_perror("Failed to read error queue");
-		return false;
+		if (errno == EAGAIN || errno == EWOULDBLOCK)
+			return 0;
+
+		err_perror("UDP: Failed to read error queue");
+		return -1;
 	}

 	if (!(mh.msg_flags & MSG_ERRQUEUE)) {
 		err("Missing MSG_ERRQUEUE flag reading error queue");
-		return false;
+		return -1;
 	}

 	hdr = CMSG_FIRSTHDR(&mh);
@ -424,7 +457,7 @@ static bool udp_sock_recverr(int s)
 	      (hdr->cmsg_level == IPPROTO_IPV6 &&
 	       hdr->cmsg_type == IPV6_RECVERR))) {
 		err("Unexpected cmsg reading error queue");
-		return false;
+		return -1;
 	}

 	ee = (const struct sock_extended_err *)CMSG_DATA(hdr);
@ -433,7 +466,54 @@ static bool udp_sock_recverr(int s)
 	debug("%s error on UDP socket %i: %s",
 	      str_ee_origin(ee), s, strerror(ee->ee_errno));

-	return true;
+	return 1;
+}
+
+/**
+ * udp_sock_errs() - Process errors on a socket
+ * @c:		Execution context
+ * @s:		Socket to receive from
+ * @events:	epoll events bitmap
+ *
+ * Return: Number of errors handled, or < 0 if we have an unrecoverable error
+ */
+static int udp_sock_errs(const struct ctx *c, int s, uint32_t events)
+{
+	unsigned n_err = 0;
+	socklen_t errlen;
+	int rc, err;
+
+	ASSERT(!c->no_udp);
+
+	if (!(events & EPOLLERR))
+		return 0; /* Nothing to do */
+
+	/* Empty the error queue */
+	while ((rc = udp_sock_recverr(s)) > 0)
+		n_err += rc;
+
+	if (rc < 0)
+		return -1; /* error reading error, unrecoverable */
+
+	errlen = sizeof(err);
+	if (getsockopt(s, SOL_SOCKET, SO_ERROR, &err, &errlen) < 0 ||
+	    errlen != sizeof(err)) {
+		err_perror("Error reading SO_ERROR");
+		return -1;  /* error reading error, unrecoverable */
+	}
+
+	if (err) {
+		debug("Unqueued error on UDP socket %i: %s", s, strerror(err));
+		n_err++;
+	}
+
+	if (!n_err) {
+		/* EPOLLERR, but no errors to clear !? */
+		err("EPOLLERR event without reported errors on socket %i", s);
+		return -1; /* no way to clear, unrecoverable */
+	}
+
+	return n_err;
 }

 /**
@ -443,6 +523,8 @@ static bool udp_sock_recverr(int s)
 * @events:	epoll events bitmap
 * @mmh		mmsghdr array to receive into
 *
+ * Return: Number of datagrams received
+ *
 * #syscalls recvmmsg arm:recvmmsg_time64 i686:recvmmsg_time64
 */
 static int udp_sock_recv(const struct ctx *c, int s, uint32_t events,
@ -459,12 +541,6 @@ static int udp_sock_recv(const struct ctx *c, int s, uint32_t events,

 	ASSERT(!c->no_udp);

-	/* Clear any errors first */
-	if (events & EPOLLERR) {
-		while (udp_sock_recverr(s))
-			;
-	}
-
 	if (!(events & EPOLLIN))
 		return 0;

@ -492,6 +568,13 @@ void udp_listen_sock_handler(const struct ctx *c, union epoll_ref ref,
 	const socklen_t sasize = sizeof(udp_meta[0].s_in);
 	int n, i;

+	if (udp_sock_errs(c, ref.fd, events) < 0) {
+		err("UDP: Unrecoverable error on listening socket:"
+		    " (%s port %hu)", pif_name(ref.udp.pif), ref.udp.port);
+		/* FIXME: what now?  close/re-open socket? */
+		return;
+	}
+
 	if ((n = udp_sock_recv(c, ref.fd, events, udp_mh_recv)) <= 0)
 		return;

@ -512,7 +595,8 @@ void udp_listen_sock_handler(const struct ctx *c, union epoll_ref ref,
 				udp_splice_prepare(udp_mh_recv, i);
 			} else if (batchpif == PIF_TAP) {
 				udp_tap_prepare(udp_mh_recv, i,
-						flowside_at_sidx(batchsidx));
+						flowside_at_sidx(batchsidx),
+						false);
 			}

 			if (++i >= n)
@ -560,12 +644,20 @@ void udp_reply_sock_handler(const struct ctx *c, union epoll_ref ref,
 	flow_sidx_t tosidx = flow_sidx_opposite(ref.flowside);
 	const struct flowside *toside = flowside_at_sidx(tosidx);
 	struct udp_flow *uflow = udp_at_sidx(ref.flowside);
-	int from_s = uflow->s[ref.flowside.sidei];
 	uint8_t topif = pif_at_sidx(tosidx);
-	int n, i;
+	int n, i, from_s;

 	ASSERT(!c->no_udp && uflow);

+	from_s = uflow->s[ref.flowside.sidei];
+
+	if (udp_sock_errs(c, from_s, events) < 0) {
+		flow_err(uflow, "Unrecoverable error on reply socket");
+		flow_err_details(uflow);
+		udp_flow_close(c, uflow);
+		return;
+	}
+
 	if ((n = udp_sock_recv(c, from_s, events, udp_mh_recv)) <= 0)
 		return;

@ -576,7 +668,7 @@ void udp_reply_sock_handler(const struct ctx *c, union epoll_ref ref,
 		if (pif_is_socket(topif))
 			udp_splice_prepare(udp_mh_recv, i);
 		else if (topif == PIF_TAP)
-			udp_tap_prepare(udp_mh_recv, i, toside);
+			udp_tap_prepare(udp_mh_recv, i, toside, false);
 		/* Restore sockaddr length clobbered by recvmsg() */
 		udp_mh_recv[i].msg_hdr.msg_namelen = sizeof(udp_meta[i].s_in);
 	}
@ -703,69 +795,61 @@ int udp_tap_handler(const struct ctx *c, uint8_t pif,
 * udp_sock_init() - Initialise listening sockets for a given port
 * @c:		Execution context
 * @ns:		In pasta mode, if set, bind with loopback address in namespace
- * @af:		Address family to select a specific IP version, or AF_UNSPEC
 * @addr:	Pointer to address for binding, NULL if not configured
 * @ifname:	Name of interface to bind to, NULL if not configured
 * @port:	Port, host order
 *
 * Return: 0 on (partial) success, negative error code on (complete) failure
 */
-int udp_sock_init(const struct ctx *c, int ns, sa_family_t af,
-		  const void *addr, const char *ifname, in_port_t port)
+int udp_sock_init(const struct ctx *c, int ns, const union inany_addr *addr,
+		  const char *ifname, in_port_t port)
 {
-	union udp_listen_epoll_ref uref = { .port = port };
+	union udp_listen_epoll_ref uref = {
+		.pif = ns ? PIF_SPLICE : PIF_HOST,
+		.port = port,
+	};
 	int r4 = FD_REF_MAX + 1, r6 = FD_REF_MAX + 1;

 	ASSERT(!c->no_udp);

-	if (ns)
-		uref.pif = PIF_SPLICE;
-	else
-		uref.pif = PIF_HOST;
-
-	if (af == AF_UNSPEC && c->ifi4 && c->ifi6) {
+	if (!addr && c->ifi4 && c->ifi6 && !ns) {
 		int s;

 		/* Attempt to get a dual stack socket */
-		if (!ns) {
-			s = sock_l4(c, AF_UNSPEC, EPOLL_TYPE_UDP_LISTEN,
-				    addr, ifname, port, uref.u32);
-			udp_splice_init[V4][port] = s < 0 ? -1 : s;
-			udp_splice_init[V6][port] = s < 0 ? -1 : s;
-		} else {
-			s = sock_l4(c, AF_UNSPEC, EPOLL_TYPE_UDP_LISTEN,
-				    &in4addr_loopback, ifname, port, uref.u32);
-			udp_splice_ns[V4][port] = s < 0 ? -1 : s;
-			udp_splice_ns[V6][port] = s < 0 ? -1 : s;
-		}
+		s = pif_sock_l4(c, EPOLL_TYPE_UDP_LISTEN, PIF_HOST,
+				NULL, ifname, port, uref.u32);
+		udp_splice_init[V4][port] = s < 0 ? -1 : s;
+		udp_splice_init[V6][port] = s < 0 ? -1 : s;
 		if (IN_INTERVAL(0, FD_REF_MAX, s))
 			return 0;
 	}

-	if ((af == AF_INET || af == AF_UNSPEC) && c->ifi4) {
+	if ((!addr || inany_v4(addr)) && c->ifi4) {
 		if (!ns) {
-			r4 = sock_l4(c, AF_INET, EPOLL_TYPE_UDP_LISTEN,
-				     addr, ifname, port, uref.u32);
+			r4 = pif_sock_l4(c, EPOLL_TYPE_UDP_LISTEN, PIF_HOST,
+					 addr ? addr : &inany_any4, ifname,
+					 port, uref.u32);

 			udp_splice_init[V4][port] = r4 < 0 ? -1 : r4;
 		} else {
-			r4  = sock_l4(c, AF_INET, EPOLL_TYPE_UDP_LISTEN,
-				      &in4addr_loopback,
-				      ifname, port, uref.u32);
+			r4  = pif_sock_l4(c, EPOLL_TYPE_UDP_LISTEN, PIF_SPLICE,
+					  &inany_loopback4, ifname,
+					  port, uref.u32);
 			udp_splice_ns[V4][port] = r4 < 0 ? -1 : r4;
 		}
 	}

-	if ((af == AF_INET6 || af == AF_UNSPEC) && c->ifi6) {
+	if ((!addr || !inany_v4(addr)) && c->ifi6) {
 		if (!ns) {
-			r6 = sock_l4(c, AF_INET6, EPOLL_TYPE_UDP_LISTEN,
-				     addr, ifname, port, uref.u32);
+			r6 = pif_sock_l4(c, EPOLL_TYPE_UDP_LISTEN, PIF_HOST,
+					 addr ? addr : &inany_any6, ifname,
+					 port, uref.u32);

 			udp_splice_init[V6][port] = r6 < 0 ? -1 : r6;
 		} else {
-			r6 = sock_l4(c, AF_INET6, EPOLL_TYPE_UDP_LISTEN,
-				     &in6addr_loopback,
-				     ifname, port, uref.u32);
+			r6 = pif_sock_l4(c, EPOLL_TYPE_UDP_LISTEN, PIF_SPLICE,
+					 &inany_loopback6, ifname,
+					 port, uref.u32);
 			udp_splice_ns[V6][port] = r6 < 0 ? -1 : r6;
 		}
 	}
@ -833,7 +917,7 @@ static void udp_port_rebind(struct ctx *c, bool outbound)

 		if ((c->ifi4 && socks[V4][port] == -1) ||
 		    (c->ifi6 && socks[V6][port] == -1))
-			udp_sock_init(c, outbound, AF_UNSPEC, NULL, NULL, port);
+			udp_sock_init(c, outbound, NULL, NULL, port);
 	}
 }

--- a/udp.h
+++ b/udp.h
@ -16,8 +16,8 @@ void udp_reply_sock_handler(const struct ctx *c, union epoll_ref ref,
 int udp_tap_handler(const struct ctx *c, uint8_t pif,
 		    sa_family_t af, const void *saddr, const void *daddr,
 		    const struct pool *p, int idx, const struct timespec *now);
-int udp_sock_init(const struct ctx *c, int ns, sa_family_t af,
-		  const void *addr, const char *ifname, in_port_t port);
+int udp_sock_init(const struct ctx *c, int ns, const union inany_addr *addr,
+		  const char *ifname, in_port_t port);
 int udp_init(struct ctx *c);
 void udp_timer(struct ctx *c, const struct timespec *now);
 void udp_update_l2_buf(const unsigned char *eth_d, const unsigned char *eth_s);
--- a/udp_flow.c
+++ b/udp_flow.c
@ -39,8 +39,11 @@ struct udp_flow *udp_at_sidx(flow_sidx_t sidx)
 * @c:		Execution context
 * @uflow:	UDP flow
 */
-static void udp_flow_close(const struct ctx *c, struct udp_flow *uflow)
+void udp_flow_close(const struct ctx *c, struct udp_flow *uflow)
 {
+	if (uflow->closed)
+		return; /* Nothing to do */
+
 	if (uflow->s[INISIDE] >= 0) {
 		/* The listening socket needs to stay in epoll */
 		close(uflow->s[INISIDE]);
@ -56,6 +59,8 @@ static void udp_flow_close(const struct ctx *c, struct udp_flow *uflow)
 	flow_hash_remove(c, FLOW_SIDX(uflow, INISIDE));
 	if (!pif_is_socket(uflow->f.pif[TGTSIDE]))
 		flow_hash_remove(c, FLOW_SIDX(uflow, TGTSIDE));
+
+	uflow->closed = true;
 }

 /**
@ -256,6 +261,17 @@ flow_sidx_t udp_flow_from_tap(const struct ctx *c,
 	return udp_flow_new(c, flow, -1, now);
 }

+/**
+ * udp_flow_defer() - Deferred per-flow handling (clean up aborted flows)
+ * @uflow:	Flow to handle
+ *
+ * Return: true if the flow is ready to free, false otherwise
+ */
+bool udp_flow_defer(const struct udp_flow *uflow)
+{
+	return uflow->closed;
+}
+
 /**
 * udp_flow_timer() - Handler for timed events related to a given flow
 * @c:		Execution context
--- a/udp_flow.h
+++ b/udp_flow.h
@ -10,6 +10,7 @@
 /**
 * struct udp - Descriptor for a flow of UDP packets
 * @f:		Generic flow information
+ * @closed:	Flow is already closed
 * @ts:		Activity timestamp
 * @s:		Socket fd (or -1) for each side of the flow
 */
@ -17,6 +18,7 @@ struct udp_flow {
 	/* Must be first element */
 	struct flow_common f;

+	bool closed :1;
 	time_t ts;
 	int s[SIDES];
 };
@ -30,6 +32,8 @@ flow_sidx_t udp_flow_from_tap(const struct ctx *c,
 			      const void *saddr, const void *daddr,
 			      in_port_t srcport, in_port_t dstport,
 			      const struct timespec *now);
+void udp_flow_close(const struct ctx *c, struct udp_flow *uflow);
+bool udp_flow_defer(const struct udp_flow *uflow);
 bool udp_flow_timer(const struct ctx *c, struct udp_flow *uflow,
 		    const struct timespec *now);

--- a/util.c
+++ b/util.c
@ -28,6 +28,7 @@
 #include <linux/errqueue.h>
 #include <getopt.h>

+#include "linux_dep.h"
 #include "util.h"
 #include "iov.h"
 #include "passt.h"
@ -52,6 +53,7 @@ int sock_l4_sa(const struct ctx *c, enum epoll_type type,
 {
 	sa_family_t af = ((const struct sockaddr *)sa)->sa_family;
 	union epoll_ref ref = { .type = type, .data = data };
+	bool freebind = false;
 	struct epoll_event ev;
 	int fd, y = 1, ret;
 	uint8_t proto;
@ -61,8 +63,11 @@ int sock_l4_sa(const struct ctx *c, enum epoll_type type,
 	case EPOLL_TYPE_TCP_LISTEN:
 		proto = IPPROTO_TCP;
 		socktype = SOCK_STREAM | SOCK_NONBLOCK;
+		freebind = c->freebind;
 		break;
 	case EPOLL_TYPE_UDP_LISTEN:
+		freebind = c->freebind;
+		/* fallthrough */
 	case EPOLL_TYPE_UDP_REPLY:
 		proto = IPPROTO_UDP;
 		socktype = SOCK_DGRAM | SOCK_NONBLOCK;
@ -127,6 +132,18 @@ int sock_l4_sa(const struct ctx *c, enum epoll_type type,
 		}
 	}

+	if (freebind) {
+		int level = af == AF_INET ? IPPROTO_IP : IPPROTO_IPV6;
+		int opt = af == AF_INET ? IP_FREEBIND : IPV6_FREEBIND;
+
+		if (setsockopt(fd, level, opt, &y, sizeof(y))) {
+			err_perror("Failed to set %s on socket %i",
+				   af == AF_INET ? "IP_FREEBIND"
+				                 : "IPV6_FREEBIND",
+				   fd);
+		}
+	}
+
 	if (bind(fd, sa, sl) < 0) {
 		/* We'll fail to bind to low ports if we don't have enough
 		 * capabilities, and we'll fail to bind on already bound ports,
@ -157,58 +174,6 @@ int sock_l4_sa(const struct ctx *c, enum epoll_type type,

 	return fd;
 }
-/**
- * sock_l4() - Create and bind socket for given L4, add to epoll list
- * @c:		Execution context
- * @af:		Address family, AF_INET or AF_INET6
- * @type:	epoll type
- * @bind_addr:	Address for binding, NULL for any
- * @ifname:	Interface for binding, NULL for any
- * @port:	Port, host order
- * @data:	epoll reference portion for protocol handlers
- *
- * Return: newly created socket, negative error code on failure
- */
-int sock_l4(const struct ctx *c, sa_family_t af, enum epoll_type type,
-	    const void *bind_addr, const char *ifname, uint16_t port,
-	    uint32_t data)
-{
-	switch (af) {
-	case AF_INET: {
-		struct sockaddr_in addr4 = {
-			.sin_family = AF_INET,
-			.sin_port = htons(port),
-			{ 0 }, { 0 },
-		};
-		if (bind_addr)
-			addr4.sin_addr = *(struct in_addr *)bind_addr;
-		return sock_l4_sa(c, type, &addr4, sizeof(addr4), ifname,
-				  false, data);
-	}
-
-	case AF_UNSPEC:
-		if (!DUAL_STACK_SOCKETS || bind_addr)
-			 return -EINVAL;
-		/* fallthrough */
-	case AF_INET6: {
-		struct sockaddr_in6 addr6 = {
-			.sin6_family = AF_INET6,
-			.sin6_port = htons(port),
-			0, IN6ADDR_ANY_INIT, 0,
-		};
-		if (bind_addr) {
-			addr6.sin6_addr = *(struct in6_addr *)bind_addr;
-
-			if (IN6_IS_ADDR_LINKLOCAL(bind_addr))
-				addr6.sin6_scope_id = c->ifi6;
-		}
-		return sock_l4_sa(c, type, &addr6, sizeof(addr6), ifname,
-				  af == AF_INET6, data);
-	}
-	default:
-		return -EINVAL;
-	}
-}

 /**
 * sock_probe_mem() - Check if setting high SO_SNDBUF and SO_RCVBUF is allowed
@ -219,7 +184,8 @@ void sock_probe_mem(struct ctx *c)
 	int v = INT_MAX / 2, s;
 	socklen_t sl;

-	if ((s = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP)) < 0) {
+	s = socket(AF_INET, SOCK_STREAM | SOCK_CLOEXEC, IPPROTO_TCP);
+	if (s < 0) {
 		c->low_wmem = c->low_rmem = 1;
 		return;
 	}
@ -249,7 +215,7 @@ void sock_probe_mem(struct ctx *c)
 int64_t timespec_diff_us(const struct timespec *a, const struct timespec *b)
 {
 	if (a->tv_nsec < b->tv_nsec) {
-		return (b->tv_nsec - a->tv_nsec) / 1000 +
+		return (a->tv_nsec + 1000000000 - b->tv_nsec) / 1000 +
 		       (a->tv_sec - b->tv_sec - 1) * 1000000;
 	}

@ -443,25 +409,20 @@ void pidfile_write(int fd, pid_t pid)
 }

 /**
- * pidfile_open() - Open PID file if needed
- * @path:	Path for PID file, empty string if no PID file is requested
+ * output_file_open() - Open file for output, creating or truncating it
+ * @path:	Path for output file
+ * @flags:	Flags for open() other than O_CREAT, O_TRUNC, O_CLOEXEC
 *
- * Return: descriptor for PID file, -1 if path is NULL, won't return on failure
+ * Return: file descriptor on success, -1 on failure with errno set by open()
 */
-int pidfile_open(const char *path)
+int output_file_open(const char *path, int flags)
 {
-	int fd;
-
-	if (!*path)
-		return -1;
-
-	if ((fd = open(path, O_CREAT | O_TRUNC | O_WRONLY | O_CLOEXEC,
-			     S_IRUSR | S_IWUSR)) < 0) {
-		perror("PID file open");
-		exit(EXIT_FAILURE);
-	}
-
-	return fd;
+	/* We use O_CLOEXEC here, but clang-tidy as of LLVM 16 to 19 looks for
+	 * it in the 'mode' argument if we have one
+	 */
+	return open(path, O_CREAT | O_TRUNC | O_CLOEXEC | flags,
+		    /* NOLINTNEXTLINE(android-cloexec-open) */
+		    S_IRUSR | S_IWUSR);
 }

 /**
@ -485,16 +446,11 @@ int __daemon(int pidfile_fd, int devnull_fd)
 		exit(EXIT_SUCCESS);
 	}

-	errno = 0;
-
-	setsid();
-
-	dup2(devnull_fd, STDIN_FILENO);
-	dup2(devnull_fd, STDOUT_FILENO);
-	dup2(devnull_fd, STDERR_FILENO);
-	close(devnull_fd);
-
-	if (errno)
+	if (setsid()				< 0 ||
+	    dup2(devnull_fd, STDIN_FILENO)	< 0 ||
+	    dup2(devnull_fd, STDOUT_FILENO)	< 0 ||
+	    dup2(devnull_fd, STDERR_FILENO)	< 0 ||
+	    close(devnull_fd))
 		exit(EXIT_FAILURE);

 	return 0;
@ -582,6 +538,36 @@ int do_clone(int (*fn)(void *), char *stack_area, size_t stack_size, int flags,
 #endif
 }

+/* write_all_buf() - write all of a buffer to an fd
+ * @fd:		File descriptor
+ * @buf:	Pointer to base of buffer
+ * @len:	Length of buffer
+ *
+ * Return: 0 on success, -1 on error (with errno set)
+ *
+ * #syscalls write
+ */
+int write_all_buf(int fd, const void *buf, size_t len)
+{
+	const char *p = buf;
+	size_t left = len;
+
+	while (left) {
+		ssize_t rc;
+
+		do
+			rc = write(fd, p, left);
+		while ((rc < 0) && errno == EINTR);
+
+		if (rc < 0)
+			return -1;
+
+		p += rc;
+		left -= rc;
+	}
+	return 0;
+}
+
 /* write_remainder() - write the tail of an IO vector to an fd
 * @fd:		File descriptor
 * @iov:	IO vector
@ -590,28 +576,30 @@ int do_clone(int (*fn)(void *), char *stack_area, size_t stack_size, int flags,
 *
 * Return: 0 on success, -1 on error (with errno set)
 *
- * #syscalls write writev
+ * #syscalls writev
 */
 int write_remainder(int fd, const struct iovec *iov, size_t iovcnt, size_t skip)
 {
-	size_t offset, i;
+	size_t i = 0, offset;

-	while ((i = iov_skip_bytes(iov, iovcnt, skip, &offset)) < iovcnt) {
+	while ((i += iov_skip_bytes(iov + i, iovcnt - i, skip, &offset)) < iovcnt) {
 		ssize_t rc;

 		if (offset) {
-			rc = write(fd, (char *)iov[i].iov_base + offset,
-				   iov[i].iov_len - offset);
-		} else {
-			rc = writev(fd, &iov[i], iovcnt - i);
+			/* Write the remainder of the partially written buffer */
+			if (write_all_buf(fd, (char *)iov[i].iov_base + offset,
+					  iov[i].iov_len - offset) < 0)
+				return -1;
+			i++;
 		}

+		/* Write as much of the remaining whole buffers as we can */
+		rc = writev(fd, &iov[i], iovcnt - i);
 		if (rc < 0)
 			return -1;

-		skip += rc;
+		skip = rc;
 	}
-
 	return 0;
 }

@ -750,6 +738,48 @@ void close_open_files(int argc, char **argv)
 			rc = close_range(fd + 1, ~0U, CLOSE_RANGE_UNSHARE);
 	}

-	if (rc)
-		die_perror("Failed to close files leaked by parent");
+	if (rc) {
+		if (errno == ENOSYS || errno == EINVAL) {
+			/* This probably means close_range() or the
+			 * CLOSE_RANGE_UNSHARE flag is not supported by the
+			 * kernel.  Not much we can do here except carry on and
+			 * hope for the best.
+			 */
+			warn(
+"Can't use close_range() to ensure no files leaked by parent");
+		} else {
+			die_perror("Failed to close files leaked by parent");
+		}
+	}
+
+}
+
+/**
+ * snprintf_check() - snprintf() wrapper, checking for truncation and errors
+ * @str:	Output buffer
+ * @size:	Maximum size to write to @str
+ * @format:	printf()-style format string, followed by its arguments
+ *
+ * Return: false on success, true on truncation or error, sets errno on failure
+ */
+bool snprintf_check(char *str, size_t size, const char *format, ...)
+{
+	va_list ap;
+	int rc;
+
+	va_start(ap, format);
+	rc = vsnprintf(str, size, format, ap);
+	va_end(ap);
+
+	if (rc < 0) {
+		errno = EIO;
+		return true;
+	}
+
+	if ((size_t)rc >= size) {
+		errno = ENOBUFS;
+		return true;
+	}
+
+	return false;
 }
--- a/util.h
+++ b/util.h
@ -11,12 +11,12 @@
 #include <stdbool.h>
 #include <stddef.h>
 #include <stdint.h>
+#include <stdio.h>
 #include <string.h>
 #include <signal.h>
 #include <arpa/inet.h>
 #include <unistd.h>
 #include <sys/syscall.h>
-#include <linux/close_range.h>

 #include "log.h"

@ -67,6 +67,15 @@
 #define STRINGIFY(x)	#x
 #define STR(x)		STRINGIFY(x)

+#ifdef CPPCHECK_6936
+/* Some cppcheck versions get confused by aborts inside a loop, causing
+ * it to give false positive uninitialised variable warnings later in
+ * the function, because it doesn't realise the non-initialising path
+ * already exited.  See https://trac.cppcheck.net/ticket/13227
+ */
+#define ASSERT(expr)		\
+	((expr) ? (void)0 : abort())
+#else
 #define ASSERT(expr)							\
 	do {								\
 		if (!(expr)) {						\
@ -78,6 +87,7 @@
 			abort();					\
 		}							\
 	} while (0)
+#endif

 #ifdef P_tmpdir
 #define TMPDIR		P_tmpdir
@ -91,6 +101,9 @@

 #define ARRAY_SIZE(a)		((int)(sizeof(a) / sizeof((a)[0])))

+#define foreach(item, array)						\
+	for ((item) = (array); (item) - (array) < ARRAY_SIZE(array); (item)++)
+
 #define IN_INTERVAL(a, b, x)	((x) >= (a) && (x) <= (b))
 #define FD_PROTO(x, proto)						\
 	(IN_INTERVAL(c->proto.fd_min, c->proto.fd_max, (x)))
@ -131,7 +144,7 @@ static inline uint32_t ntohl_unaligned(const void *p)
 	return ntohl(val);
 }

-#define NS_FN_STACK_SIZE	(RLIMIT_STACK_VAL * 1024 / 8)
+#define NS_FN_STACK_SIZE	(1024 * 1024) /* 1MiB */
 int do_clone(int (*fn)(void *), char *stack_area, size_t stack_size, int flags,
 	     void *arg);
 #define NS_CALL(fn, arg)						\
@ -144,9 +157,9 @@ int do_clone(int (*fn)(void *), char *stack_area, size_t stack_size, int flags,
 			 (void *)(arg));				\
 	} while (0)

-#define RCVBUF_BIG		(2UL * 1024 * 1024)
-#define SNDBUF_BIG		(4UL * 1024 * 1024)
-#define SNDBUF_SMALL		(128UL * 1024)
+#define RCVBUF_BIG		(2ULL * 1024 * 1024)
+#define SNDBUF_BIG		(4ULL * 1024 * 1024)
+#define SNDBUF_SMALL		(128ULL * 1024)

 #include <net/if.h>
 #include <limits.h>
@ -157,33 +170,9 @@ int do_clone(int (*fn)(void *), char *stack_area, size_t stack_size, int flags,

 struct ctx;

-/* cppcheck-suppress funcArgNamesDifferent */
-__attribute__ ((weak)) int ffsl(long int i) { return __builtin_ffsl(i); }
-
-#ifdef CLOSE_RANGE_UNSHARE	/* Linux kernel >= 5.9 */
-/* glibc < 2.34 and musl as of 1.2.5 need these */
-#ifndef SYS_close_range
-#define SYS_close_range		436
-#endif
-__attribute__ ((weak))
-/* cppcheck-suppress funcArgNamesDifferent */
-int close_range(unsigned int first, unsigned int last, int flags) {
-	return syscall(SYS_close_range, first, last, flags);
-}
-#else
-/* No reasonable fallback option */
-/* cppcheck-suppress funcArgNamesDifferent */
-int close_range(unsigned int first, unsigned int last, int flags) {
-	return 0;
-}
-#endif
-
 int sock_l4_sa(const struct ctx *c, enum epoll_type type,
 	       const void *sa, socklen_t sl,
 	       const char *ifname, bool v6only, uint32_t data);
-int sock_l4(const struct ctx *c, sa_family_t af, enum epoll_type type,
-	    const void *bind_addr, const char *ifname, uint16_t port,
-	    uint32_t data);
 void sock_probe_mem(struct ctx *c);
 long timespec_diff_ms(const struct timespec *a, const struct timespec *b);
 int64_t timespec_diff_us(const struct timespec *a, const struct timespec *b);
@ -195,13 +184,15 @@ char *line_read(char *buf, size_t len, int fd);
 void ns_enter(const struct ctx *c);
 bool ns_is_init(void);
 int open_in_ns(const struct ctx *c, const char *path, int flags);
-int pidfile_open(const char *path);
+int output_file_open(const char *path, int flags);
 void pidfile_write(int fd, pid_t pid);
 int __daemon(int pidfile_fd, int devnull_fd);
 int fls(unsigned long x);
 int write_file(const char *path, const char *buf);
+int write_all_buf(int fd, const void *buf, size_t len);
 int write_remainder(int fd, const struct iovec *iov, size_t iovcnt, size_t skip);
 void close_open_files(int argc, char **argv);
+bool snprintf_check(char *str, size_t size, const char *format, ...);

 /**
 * af_name() - Return name of an address family
@ -269,6 +260,9 @@ static inline bool mod_between(unsigned x, unsigned i, unsigned j, unsigned m)
 	return mod_sub(x, i, m) < mod_sub(j, i, m);
 }

+/* FPRINTF() intentionally silences cert-err33-c clang-tidy warnings */
+#define FPRINTF(f, ...)	(void)fprintf(f, __VA_ARGS__)
+
 /*
 * Workarounds for https://github.com/llvm/llvm-project/issues/58992
 *