1
0
Fork 0
mirror of https://passt.top/passt synced 2025-07-26 11:28:00 +02:00

Compare commits

..

No commits in common. "master" and "2024_09_06.6b38f07" have entirely different histories.

141 changed files with 2600 additions and 13161 deletions

View file

@ -1,126 +0,0 @@
# SPDX-License-Identifier: GPL-2.0
#
# clang-format configuration file. Intended for clang-format >= 11.
#
# For more information, see:
#
# Documentation/dev-tools/clang-format.rst
# https://clang.llvm.org/docs/ClangFormat.html
# https://clang.llvm.org/docs/ClangFormatStyleOptions.html
#
---
AccessModifierOffset: -4
AlignAfterOpenBracket: Align
AlignConsecutiveAssignments: false
AlignConsecutiveDeclarations: false
AlignEscapedNewlines: Left
AlignOperands: true
AlignTrailingComments: false
AllowAllParametersOfDeclarationOnNextLine: false
AllowShortBlocksOnASingleLine: false
AllowShortCaseLabelsOnASingleLine: false
AllowShortFunctionsOnASingleLine: None
AllowShortIfStatementsOnASingleLine: false
AllowShortLoopsOnASingleLine: false
AlwaysBreakAfterDefinitionReturnType: None
AlwaysBreakAfterReturnType: None
AlwaysBreakBeforeMultilineStrings: false
AlwaysBreakTemplateDeclarations: false
BinPackArguments: true
BinPackParameters: true
BraceWrapping:
AfterClass: false
AfterControlStatement: false
AfterEnum: false
AfterFunction: true
AfterNamespace: true
AfterObjCDeclaration: false
AfterStruct: false
AfterUnion: false
AfterExternBlock: false
BeforeCatch: false
BeforeElse: false
IndentBraces: false
SplitEmptyFunction: true
SplitEmptyRecord: true
SplitEmptyNamespace: true
BreakBeforeBinaryOperators: None
BreakBeforeBraces: Custom
BreakBeforeInheritanceComma: false
BreakBeforeTernaryOperators: false
BreakConstructorInitializersBeforeComma: false
BreakConstructorInitializers: BeforeComma
BreakAfterJavaFieldAnnotations: false
BreakStringLiterals: false
ColumnLimit: 80
CommentPragmas: '^ IWYU pragma:'
CompactNamespaces: false
ConstructorInitializerAllOnOneLineOrOnePerLine: false
ConstructorInitializerIndentWidth: 8
ContinuationIndentWidth: 8
Cpp11BracedListStyle: false
DerivePointerAlignment: false
DisableFormat: false
ExperimentalAutoDetectBinPacking: false
FixNamespaceComments: false
# Taken from:
# git grep -h '^#define [^[:space:]]*for_each[^[:space:]]*(' include/ tools/ \
# | sed "s,^#define \([^[:space:]]*for_each[^[:space:]]*\)(.*$, - '\1'," \
# | LC_ALL=C sort -u
ForEachMacros:
- 'for_each_nst'
IncludeBlocks: Preserve
IncludeCategories:
- Regex: '.*'
Priority: 1
IncludeIsMainRegex: '(Test)?$'
IndentCaseLabels: false
IndentGotoLabels: false
IndentPPDirectives: None
IndentWidth: 8
IndentWrappedFunctionNames: false
JavaScriptQuotes: Leave
JavaScriptWrapImports: true
KeepEmptyLinesAtTheStartOfBlocks: false
MacroBlockBegin: ''
MacroBlockEnd: ''
MaxEmptyLinesToKeep: 1
NamespaceIndentation: None
ObjCBinPackProtocolList: Auto
ObjCBlockIndentWidth: 8
ObjCSpaceAfterProperty: true
ObjCSpaceBeforeProtocolList: true
# Taken from git's rules
PenaltyBreakAssignment: 10
PenaltyBreakBeforeFirstCallParameter: 30
PenaltyBreakComment: 10
PenaltyBreakFirstLessLess: 0
PenaltyBreakString: 10
PenaltyExcessCharacter: 100
PenaltyReturnTypeOnItsOwnLine: 60
PointerAlignment: Right
ReflowComments: false
SortIncludes: false
SortUsingDeclarations: false
SpaceAfterCStyleCast: false
SpaceAfterTemplateKeyword: true
SpaceBeforeAssignmentOperators: true
SpaceBeforeCtorInitializerColon: true
SpaceBeforeInheritanceColon: true
SpaceBeforeParens: ControlStatementsExceptForEachMacros
SpaceBeforeRangeBasedForLoopColon: true
SpaceInEmptyParentheses: false
SpacesBeforeTrailingComments: 1
SpacesInAngles: false
SpacesInContainerLiterals: false
SpacesInCStyleCastParentheses: false
SpacesInParentheses: false
SpacesInSquareBrackets: false
Standard: Cpp03
TabWidth: 8
UseTab: Always
...

View file

@ -1,93 +0,0 @@
---
Checks:
- "clang-diagnostic-*,clang-analyzer-*,*,-modernize-*"
# TODO: enable once https://bugs.llvm.org/show_bug.cgi?id=41311 is fixed
- "-clang-analyzer-valist.Uninitialized"
# Dubious value, would kill readability
- "-cppcoreguidelines-init-variables"
# Dubious value over the compiler's built-in warning. Would
# increase verbosity.
- "-bugprone-assignment-in-if-condition"
# Debatable whether these improve readability, right now it would look
# like a mess
- "-google-readability-braces-around-statements"
- "-hicpp-braces-around-statements"
- "-readability-braces-around-statements"
# TODO: in most cases they are justified, but probably not everywhere
#
- "-readability-magic-numbers"
- "-cppcoreguidelines-avoid-magic-numbers"
# TODO: this is Linux-only for the moment, nice to fix eventually
- "-llvmlibc-restrict-system-libc-headers"
# Those are needed for syscalls, epoll_wait flags, etc.
- "-hicpp-signed-bitwise"
# Probably not doable to implement this without plain memcpy(), memset()
- "-clang-analyzer-security.insecureAPI.DeprecatedOrUnsafeBufferHandling"
# TODO: not really important, but nice to fix eventually
- "-llvm-include-order"
# Dubious value, would kill readability
- "-readability-isolate-declaration"
# TODO: nice to fix eventually
- "-bugprone-narrowing-conversions"
- "-cppcoreguidelines-narrowing-conversions"
# TODO: check, fix, and more in general constify wherever possible
- "-cppcoreguidelines-avoid-non-const-global-variables"
# TODO: check paths where it might make sense to improve performance
- "-altera-unroll-loops"
- "-altera-id-dependent-backward-branch"
# Not much can be done about them other than being careful
- "-bugprone-easily-swappable-parameters"
# TODO: split reported functions
- "-readability-function-cognitive-complexity"
# "Poor" alignment needed for structs reflecting message formats/headers
- "-altera-struct-pack-align"
# TODO: check again if multithreading is implemented
- "-concurrency-mt-unsafe"
# Complains about any identifier <3 characters, reasonable for
# globals, pointlessly verbose for locals and parameters.
- "-readability-identifier-length"
# Wants to include headers which *directly* provide the things
# we use. That sounds nice, but means it will often want an OS
# specific header instead of a mostly standard one, such as
# <linux/limits.h> instead of <limits.h>.
- "-misc-include-cleaner"
# Want to replace all #defines of integers with enums. Kind of
# makes sense when those defines form an enum-like set, but
# weird for cases like standalone constants, and causes other
# awkwardness for a bunch of cases we use
- "-cppcoreguidelines-macro-to-enum"
# It's been a couple of centuries since multiplication has been granted
# precedence over addition in modern mathematical notation. Adding
# parentheses to reinforce that certainly won't improve readability.
- "-readability-math-missing-parentheses"
WarningsAsErrors: "*"
HeaderFileExtensions:
- h
ImplementationFileExtensions:
- c
HeaderFilterRegex: ""
FormatStyle: none
CheckOptions:
bugprone-suspicious-string-compare.WarnOnImplicitComparison: "false"
SystemHeaders: false

View file

@ -1,3 +0,0 @@
CompileFlags:
# Don't try to interpret our headers as C++
Add: [-xc, -Wall]

2
.gitignore vendored
View file

@ -3,10 +3,8 @@
/passt.avx2
/pasta
/pasta.avx2
/passt-repair
/qrap
/pasta.1
/seccomp.h
/seccomp_repair.h
/c*.json
README.plain.md

206
Makefile
View file

@ -15,13 +15,23 @@ VERSION ?= $(shell git describe --tags HEAD 2>/dev/null || echo "unknown\ versio
# the IPv6 socket API? (Linux does)
DUAL_STACK_SOCKETS := 1
RLIMIT_STACK_VAL := $(shell /bin/sh -c 'ulimit -s')
ifeq ($(RLIMIT_STACK_VAL),unlimited)
RLIMIT_STACK_VAL := 1024
endif
TARGET ?= $(shell $(CC) -dumpmachine)
$(if $(TARGET),,$(error Failed to get target architecture))
# Get 'uname -m'-like architecture description for target
TARGET_ARCH := $(firstword $(subst -, ,$(TARGET)))
TARGET_ARCH := $(patsubst [:upper:],[:lower:],$(TARGET_ARCH))
TARGET_ARCH := $(patsubst arm%,arm,$(TARGET_ARCH))
TARGET_ARCH := $(subst powerpc,ppc,$(TARGET_ARCH))
TARGET_ARCH := $(shell echo $(TARGET) | cut -f1 -d- | tr [A-Z] [a-z])
TARGET_ARCH := $(shell echo $(TARGET_ARCH) | sed 's/powerpc/ppc/')
AUDIT_ARCH := $(shell echo $(TARGET_ARCH) | tr [a-z] [A-Z] | sed 's/^ARM.*/ARM/')
AUDIT_ARCH := $(shell echo $(AUDIT_ARCH) | sed 's/I[456]86/I386/')
AUDIT_ARCH := $(shell echo $(AUDIT_ARCH) | sed 's/PPC64/PPC/')
AUDIT_ARCH := $(shell echo $(AUDIT_ARCH) | sed 's/PPCLE/PPC64LE/')
AUDIT_ARCH := $(shell echo $(AUDIT_ARCH) | sed 's/MIPS64EL/MIPSEL64/')
AUDIT_ARCH := $(shell echo $(AUDIT_ARCH) | sed 's/HPPA/PARISC/')
AUDIT_ARCH := $(shell echo $(AUDIT_ARCH) | sed 's/SH4/SH/')
# On some systems enabling optimization also enables source fortification,
# automagically. Do not override it.
@ -30,32 +40,48 @@ ifeq ($(shell $(CC) -O2 -dM -E - < /dev/null 2>&1 | grep ' _FORTIFY_SOURCE ' > /
FORTIFY_FLAG := -D_FORTIFY_SOURCE=2
endif
FLAGS := -Wall -Wextra -Wno-format-zero-length -Wformat-security
FLAGS := -Wall -Wextra -Wno-format-zero-length
FLAGS += -pedantic -std=c11 -D_XOPEN_SOURCE=700 -D_GNU_SOURCE
FLAGS += $(FORTIFY_FLAG) -O2 -pie -fPIE
FLAGS += -DPAGE_SIZE=$(shell getconf PAGE_SIZE)
FLAGS += -DNETNS_RUN_DIR=\"/run/netns\"
FLAGS += -DPASST_AUDIT_ARCH=AUDIT_ARCH_$(AUDIT_ARCH)
FLAGS += -DRLIMIT_STACK_VAL=$(RLIMIT_STACK_VAL)
FLAGS += -DARCH=\"$(TARGET_ARCH)\"
FLAGS += -DVERSION=\"$(VERSION)\"
FLAGS += -DDUAL_STACK_SOCKETS=$(DUAL_STACK_SOCKETS)
PASST_SRCS = arch.c arp.c checksum.c conf.c dhcp.c dhcpv6.c flow.c fwd.c \
icmp.c igmp.c inany.c iov.c ip.c isolation.c lineread.c log.c mld.c \
ndp.c netlink.c migrate.c packet.c passt.c pasta.c pcap.c pif.c \
repair.c tap.c tcp.c tcp_buf.c tcp_splice.c tcp_vu.c udp.c udp_flow.c \
udp_vu.c util.c vhost_user.c virtio.c vu_common.c
ndp.c netlink.c packet.c passt.c pasta.c pcap.c pif.c tap.c tcp.c \
tcp_buf.c tcp_splice.c udp.c udp_flow.c util.c
QRAP_SRCS = qrap.c
PASST_REPAIR_SRCS = passt-repair.c
SRCS = $(PASST_SRCS) $(QRAP_SRCS) $(PASST_REPAIR_SRCS)
SRCS = $(PASST_SRCS) $(QRAP_SRCS)
MANPAGES = passt.1 pasta.1 qrap.1 passt-repair.1
MANPAGES = passt.1 pasta.1 qrap.1
PASST_HEADERS = arch.h arp.h checksum.h conf.h dhcp.h dhcpv6.h flow.h fwd.h \
flow_table.h icmp.h icmp_flow.h inany.h iov.h ip.h isolation.h \
lineread.h log.h migrate.h ndp.h netlink.h packet.h passt.h pasta.h \
pcap.h pif.h repair.h siphash.h tap.h tcp.h tcp_buf.h tcp_conn.h \
tcp_internal.h tcp_splice.h tcp_vu.h udp.h udp_flow.h udp_internal.h \
udp_vu.h util.h vhost_user.h virtio.h vu_common.h
lineread.h log.h ndp.h netlink.h packet.h passt.h pasta.h pcap.h pif.h \
siphash.h tap.h tcp.h tcp_buf.h tcp_conn.h tcp_internal.h tcp_splice.h \
udp.h udp_flow.h util.h
HEADERS = $(PASST_HEADERS) seccomp.h
C := \#include <linux/tcp.h>\nstruct tcp_info x = { .tcpi_snd_wnd = 0 };
ifeq ($(shell printf "$(C)" | $(CC) -S -xc - -o - >/dev/null 2>&1; echo $$?),0)
FLAGS += -DHAS_SND_WND
endif
C := \#include <linux/tcp.h>\nstruct tcp_info x = { .tcpi_bytes_acked = 0 };
ifeq ($(shell printf "$(C)" | $(CC) -S -xc - -o - >/dev/null 2>&1; echo $$?),0)
FLAGS += -DHAS_BYTES_ACKED
endif
C := \#include <linux/tcp.h>\nstruct tcp_info x = { .tcpi_min_rtt = 0 };
ifeq ($(shell printf "$(C)" | $(CC) -S -xc - -o - >/dev/null 2>&1; echo $$?),0)
FLAGS += -DHAS_MIN_RTT
endif
C := \#include <sys/random.h>\nint main(){int a=getrandom(0, 0, 0);}
ifeq ($(shell printf "$(C)" | $(CC) -S -xc - -o - >/dev/null 2>&1; echo $$?),0)
FLAGS += -DHAS_GETRANDOM
@ -65,6 +91,11 @@ ifeq ($(shell :|$(CC) -fstack-protector-strong -S -xc - -o - >/dev/null 2>&1; ec
FLAGS += -fstack-protector-strong
endif
C := \#define _GNU_SOURCE\n\#include <fcntl.h>\nint x = FALLOC_FL_COLLAPSE_RANGE;
ifeq ($(shell printf "$(C)" | $(CC) -S -xc - -o - >/dev/null 2>&1; echo $$?),0)
EXTRA_SYSCALLS += fallocate
endif
prefix ?= /usr/local
exec_prefix ?= $(prefix)
bindir ?= $(exec_prefix)/bin
@ -74,9 +105,9 @@ mandir ?= $(datarootdir)/man
man1dir ?= $(mandir)/man1
ifeq ($(TARGET_ARCH),x86_64)
BIN := passt passt.avx2 pasta pasta.avx2 qrap passt-repair
BIN := passt passt.avx2 pasta pasta.avx2 qrap
else
BIN := passt pasta qrap passt-repair
BIN := passt pasta qrap
endif
all: $(BIN) $(MANPAGES) docs
@ -85,10 +116,7 @@ static: FLAGS += -static -DGLIBC_NO_STATIC_NSS
static: clean all
seccomp.h: seccomp.sh $(PASST_SRCS) $(PASST_HEADERS)
@ EXTRA_SYSCALLS="$(EXTRA_SYSCALLS)" ARCH="$(TARGET_ARCH)" CC="$(CC)" ./seccomp.sh seccomp.h $(PASST_SRCS) $(PASST_HEADERS)
seccomp_repair.h: seccomp.sh $(PASST_REPAIR_SRCS)
@ ARCH="$(TARGET_ARCH)" CC="$(CC)" ./seccomp.sh seccomp_repair.h $(PASST_REPAIR_SRCS)
@ EXTRA_SYSCALLS="$(EXTRA_SYSCALLS)" ARCH="$(TARGET_ARCH)" CC="$(CC)" ./seccomp.sh $(PASST_SRCS) $(PASST_HEADERS)
passt: $(PASST_SRCS) $(HEADERS)
$(CC) $(FLAGS) $(CFLAGS) $(CPPFLAGS) $(PASST_SRCS) -o passt $(LDFLAGS)
@ -104,21 +132,17 @@ pasta.avx2 pasta.1 pasta: pasta%: passt%
ln -sf $< $@
qrap: $(QRAP_SRCS) passt.h
$(CC) $(FLAGS) $(CFLAGS) $(CPPFLAGS) -DARCH=\"$(TARGET_ARCH)\" $(QRAP_SRCS) -o qrap $(LDFLAGS)
passt-repair: $(PASST_REPAIR_SRCS) seccomp_repair.h
$(CC) $(FLAGS) $(CFLAGS) $(CPPFLAGS) $(PASST_REPAIR_SRCS) -o passt-repair $(LDFLAGS)
$(CC) $(FLAGS) $(CFLAGS) $(CPPFLAGS) $(QRAP_SRCS) -o qrap $(LDFLAGS)
valgrind: EXTRA_SYSCALLS += rt_sigprocmask rt_sigtimedwait rt_sigaction \
rt_sigreturn getpid gettid kill clock_gettime \
mmap|mmap2 munmap open unlink gettimeofday futex \
statx readlink
rt_sigreturn getpid gettid kill clock_gettime mmap \
mmap2 munmap open unlink gettimeofday futex
valgrind: FLAGS += -g -DVALGRIND
valgrind: all
.PHONY: clean
clean:
$(RM) $(BIN) *~ *.o seccomp.h seccomp_repair.h pasta.1 \
$(RM) $(BIN) *~ *.o seccomp.h pasta.1 \
passt.tar passt.tar.gz *.deb *.rpm \
passt.pid README.plain.md
@ -172,11 +196,116 @@ docs: README.md
done < README.md; \
) > README.plain.md
clang-tidy: $(PASST_SRCS) $(HEADERS)
clang-tidy $(PASST_SRCS) -- $(filter-out -pie,$(FLAGS) $(CFLAGS) $(CPPFLAGS)) \
-DCLANG_TIDY_58992
# Checkers currently disabled for clang-tidy:
# - llvmlibc-restrict-system-libc-headers
# TODO: this is Linux-only for the moment, nice to fix eventually
#
# - google-readability-braces-around-statements
# - hicpp-braces-around-statements
# - readability-braces-around-statements
# Debatable whether that improves readability, right now it would look
# like a mess
#
# - readability-magic-numbers
# - cppcoreguidelines-avoid-magic-numbers
# TODO: in most cases they are justified, but probably not everywhere
#
# - clang-analyzer-valist.Uninitialized
# TODO: enable once https://bugs.llvm.org/show_bug.cgi?id=41311 is fixed
#
# - clang-analyzer-security.insecureAPI.DeprecatedOrUnsafeBufferHandling
# Probably not doable to implement this without plain memcpy(), memset()
#
# - cppcoreguidelines-init-variables
# Dubious value, would kill readability
#
# - hicpp-signed-bitwise
# Those are needed for syscalls, epoll_wait flags, etc.
#
# - llvm-include-order
# TODO: not really important, but nice to fix eventually
#
# - readability-isolate-declaration
# Dubious value, would kill readability
#
# - bugprone-narrowing-conversions
# - cppcoreguidelines-narrowing-conversions
# TODO: nice to fix eventually
#
# - cppcoreguidelines-avoid-non-const-global-variables
# TODO: check, fix, and more in general constify wherever possible
#
# - altera-unroll-loops
# - altera-id-dependent-backward-branch
# TODO: check paths where it might make sense to improve performance
#
# - bugprone-easily-swappable-parameters
# Not much can be done about them other than being careful
#
# - readability-function-cognitive-complexity
# TODO: split reported functions
#
# - altera-struct-pack-align
# "Poor" alignment needed for structs reflecting message formats/headers
#
# - concurrency-mt-unsafe
# TODO: check again if multithreading is implemented
#
# - readability-identifier-length
# Complains about any identifier <3 characters, reasonable for
# globals, pointlessly verbose for locals and parameters.
#
# - bugprone-assignment-in-if-condition
# Dubious value over the compiler's built-in warning. Would
# increase verbosity.
#
# - misc-include-cleaner
# Wants to include headers which *directly* provide the things
# we use. That sounds nice, but means it will often want an OS
# specific header instead of a mostly standard one, such as
# <linux/limits.h> instead of <limits.h>.
#
# - cppcoreguidelines-macro-to-enum
# Want to replace all #defines of integers with enums. Kind of
# makes sense when those defines form an enum-like set, but
# weird for cases like standalone constants, and causes other
# awkwardness for a bunch of cases we use
cppcheck: $(PASST_SRCS) $(HEADERS)
clang-tidy: $(SRCS) $(HEADERS)
clang-tidy -checks=*,-modernize-*,\
-clang-analyzer-valist.Uninitialized,\
-cppcoreguidelines-init-variables,\
-bugprone-assignment-in-if-condition,\
-google-readability-braces-around-statements,\
-hicpp-braces-around-statements,\
-readability-braces-around-statements,\
-readability-magic-numbers,\
-llvmlibc-restrict-system-libc-headers,\
-hicpp-signed-bitwise,\
-clang-analyzer-security.insecureAPI.DeprecatedOrUnsafeBufferHandling,\
-llvm-include-order,\
-cppcoreguidelines-avoid-magic-numbers,\
-readability-isolate-declaration,\
-bugprone-narrowing-conversions,\
-cppcoreguidelines-narrowing-conversions,\
-cppcoreguidelines-avoid-non-const-global-variables,\
-altera-unroll-loops,-altera-id-dependent-backward-branch,\
-bugprone-easily-swappable-parameters,\
-readability-function-cognitive-complexity,\
-altera-struct-pack-align,\
-concurrency-mt-unsafe,\
-readability-identifier-length,\
-misc-include-cleaner,\
-cppcoreguidelines-macro-to-enum \
-config='{CheckOptions: [{key: bugprone-suspicious-string-compare.WarnOnImplicitComparison, value: "false"}]}' \
--warnings-as-errors=* $(SRCS) -- $(filter-out -pie,$(FLAGS) $(CFLAGS) $(CPPFLAGS)) -DCLANG_TIDY_58992
SYSTEM_INCLUDES := /usr/include $(wildcard /usr/include/$(TARGET))
ifeq ($(shell $(CC) -v 2>&1 | grep -c "gcc version"),1)
VER := $(shell $(CC) -dumpversion)
SYSTEM_INCLUDES += /usr/lib/gcc/$(TARGET)/$(VER)/include
endif
cppcheck: $(SRCS) $(HEADERS)
if cppcheck --check-level=exhaustive /dev/null > /dev/null 2>&1; then \
CPPCHECK_EXHAUSTIVE="--check-level=exhaustive"; \
else \
@ -185,8 +314,11 @@ cppcheck: $(PASST_SRCS) $(HEADERS)
cppcheck --std=c11 --error-exitcode=1 --enable=all --force \
--inconclusive --library=posix --quiet \
$${CPPCHECK_EXHAUSTIVE} \
$(SYSTEM_INCLUDES:%=-I%) \
$(SYSTEM_INCLUDES:%=--config-exclude=%) \
$(SYSTEM_INCLUDES:%=--suppress=*:%/*) \
$(SYSTEM_INCLUDES:%=--suppress=unmatchedSuppression:%/*) \
--inline-suppr \
--suppress=missingIncludeSystem \
--suppress=unusedStructMember \
$(filter -D%,$(FLAGS) $(CFLAGS) $(CPPFLAGS)) -D CPPCHECK_6936 \
$(PASST_SRCS) $(HEADERS)
$(filter -D%,$(FLAGS) $(CFLAGS) $(CPPFLAGS)) \
$(SRCS) $(HEADERS)

View file

@ -321,7 +321,7 @@ speeding up local connections, and usually requiring NAT. _pasta_:
protocol
* ✅ 4 to 50 times IPv4 TCP throughput of existing, conceptually similar
solutions depending on MTU (UDP and IPv6 hard to compare)
* [_vhost-user_ support](https://bugs.passt.top/show_bug.cgi?id=25) for
* 🛠 [_vhost-user_ support](https://bugs.passt.top/show_bug.cgi?id=25) for
maximum one copy on every data path and lower request-response latency
* ⌚ [multithreading](https://bugs.passt.top/show_bug.cgi?id=13)
* ⌚ [raw IP socket support](https://bugs.passt.top/show_bug.cgi?id=14) if

8
arch.c
View file

@ -19,7 +19,6 @@
#include <unistd.h>
#include "log.h"
#include "util.h"
/**
* arch_avx2_exec() - Switch to AVX2 build if supported
@ -41,11 +40,8 @@ void arch_avx2_exec(char **argv)
if (__builtin_cpu_supports("avx2")) {
char new_path[PATH_MAX + sizeof(".avx2")];
if (snprintf_check(new_path, PATH_MAX + sizeof(".avx2"),
"%s.avx2", exe))
die_perror("Can't build AVX2 executable path");
execv(new_path, argv);
snprintf(new_path, PATH_MAX + sizeof(".avx2"), "%s.avx2", exe);
execve(new_path, argv, environ);
warn_perror("Can't run AVX2 build, using non-AVX2 version");
}
}

8
arp.c
View file

@ -59,12 +59,14 @@ int arp(const struct ctx *c, const struct pool *p)
ah->ar_op != htons(ARPOP_REQUEST))
return 1;
/* Discard announcements, but not 0.0.0.0 "probes" */
if (memcmp(am->sip, &in4addr_any, sizeof(am->sip)) &&
/* Discard announcements (but not 0.0.0.0 "probes"): we might have the
* same IP address, hide that.
*/
if (memcmp(am->sip, (unsigned char[4]){ 0 }, sizeof(am->tip)) &&
!memcmp(am->sip, am->tip, sizeof(am->sip)))
return 1;
/* Don't resolve the guest's assigned address, either. */
/* Don't resolve our own address, either. */
if (!memcmp(am->tip, &c->ip4.addr, sizeof(am->tip)))
return 1;

View file

@ -59,7 +59,6 @@
#include "util.h"
#include "ip.h"
#include "checksum.h"
#include "iov.h"
/* Checksums are optional for UDP over IPv4, so we usually just set
* them to 0. Change this to 1 to calculate real UDP over IPv4
@ -85,7 +84,7 @@
*/
/* NOLINTNEXTLINE(clang-diagnostic-unknown-attributes) */
__attribute__((optimize("-fno-strict-aliasing")))
static uint32_t sum_16b(const void *buf, size_t len)
uint32_t sum_16b(const void *buf, size_t len)
{
const uint16_t *p = buf;
uint32_t sum = 0;
@ -107,7 +106,7 @@ static uint32_t sum_16b(const void *buf, size_t len)
*
* Return: 16-bit folded sum
*/
static uint16_t csum_fold(uint32_t sum)
uint16_t csum_fold(uint32_t sum)
{
while (sum >> 16)
sum = (sum & 0xffff) + (sum >> 16);
@ -145,7 +144,7 @@ uint16_t csum_ip4_header(uint16_t l3len, uint8_t protocol,
* @proto: Protocol number
* @saddr: Source address
* @daddr: Destination address
* Return: partial checksum of the IPv4 header
* Returns: Partial checksum of the IPv4 header
*/
uint32_t proto_ipv4_header_psum(uint16_t l4len, uint8_t protocol,
struct in_addr saddr, struct in_addr daddr)
@ -161,42 +160,27 @@ uint32_t proto_ipv4_header_psum(uint16_t l4len, uint8_t protocol,
return psum;
}
/**
* csum() - Compute TCP/IP-style checksum
* @buf: Input buffer
* @len: Input length
* @init: Initial 32-bit checksum, 0 for no pre-computed checksum
*
* Return: 16-bit folded, complemented checksum
*/
/* NOLINTNEXTLINE(clang-diagnostic-unknown-attributes) */
__attribute__((optimize("-fno-strict-aliasing"))) /* See sum_16b() */
static uint16_t csum(const void *buf, size_t len, uint32_t init)
{
return (uint16_t)~csum_fold(csum_unfolded(buf, len, init));
}
/**
* csum_udp4() - Calculate and set checksum for a UDP over IPv4 packet
* @udp4hr: UDP header, initialised apart from checksum
* @saddr: IPv4 source address
* @daddr: IPv4 destination address
* @data: UDP payload (as IO vector tail)
* @payload: UDP packet payload
* @dlen: Length of @payload (not including UDP header)
*/
void csum_udp4(struct udphdr *udp4hr,
struct in_addr saddr, struct in_addr daddr,
struct iov_tail *data)
const void *payload, size_t dlen)
{
/* UDP checksums are optional, so don't bother */
udp4hr->check = 0;
if (UDP4_REAL_CHECKSUMS) {
uint16_t l4len = iov_tail_size(data) + sizeof(struct udphdr);
uint16_t l4len = dlen + sizeof(struct udphdr);
uint32_t psum = proto_ipv4_header_psum(l4len, IPPROTO_UDP,
saddr, daddr);
psum = csum_unfolded(udp4hr, sizeof(struct udphdr), psum);
udp4hr->check = csum_iov_tail(data, psum);
udp4hr->check = csum(payload, dlen, psum);
}
}
@ -225,7 +209,7 @@ void csum_icmp4(struct icmphdr *icmp4hr, const void *payload, size_t dlen)
* @proto: Protocol number
* @saddr: Source address
* @daddr: Destination address
* Return: partial checksum of the IPv6 header
* Returns: Partial checksum of the IPv6 header
*/
uint32_t proto_ipv6_header_psum(uint16_t payload_len, uint8_t protocol,
const struct in6_addr *saddr,
@ -242,22 +226,19 @@ uint32_t proto_ipv6_header_psum(uint16_t payload_len, uint8_t protocol,
/**
* csum_udp6() - Calculate and set checksum for a UDP over IPv6 packet
* @udp6hr: UDP header, initialised apart from checksum
* @saddr: Source address
* @daddr: Destination address
* @data: UDP payload (as IO vector tail)
* @payload: UDP packet payload
* @dlen: Length of @payload (not including UDP header)
*/
void csum_udp6(struct udphdr *udp6hr,
const struct in6_addr *saddr, const struct in6_addr *daddr,
struct iov_tail *data)
const void *payload, size_t dlen)
{
uint16_t l4len = iov_tail_size(data) + sizeof(struct udphdr);
uint32_t psum = proto_ipv6_header_psum(l4len, IPPROTO_UDP,
saddr, daddr);
uint32_t psum = proto_ipv6_header_psum(dlen + sizeof(struct udphdr),
IPPROTO_UDP, saddr, daddr);
udp6hr->check = 0;
psum = csum_unfolded(udp6hr, sizeof(struct udphdr), psum);
udp6hr->check = csum_iov_tail(data, psum);
udp6hr->check = csum(payload, dlen, psum);
}
/**
@ -452,7 +433,7 @@ less_than_128_bytes:
}
/**
* csum_unfolded() - Calculate the unfolded checksum of a data buffer.
* csum_unfolded - Calculate the unfolded checksum of a data buffer.
*
* @buf: Input buffer
* @len: Input length
@ -467,8 +448,7 @@ uint32_t csum_unfolded(const void *buf, size_t len, uint32_t init)
intptr_t align = ROUND_UP((intptr_t)buf, sizeof(__m256i));
unsigned int pad = align - (intptr_t)buf;
/* Don't mix sum_16b() and csum_avx2() with odd padding lengths */
if (pad & 1 || len < pad)
if (len < pad)
pad = len;
if (pad)
@ -481,7 +461,7 @@ uint32_t csum_unfolded(const void *buf, size_t len, uint32_t init)
}
#else /* __AVX2__ */
/**
* csum_unfolded() - Calculate the unfolded checksum of a data buffer.
* csum_unfolded - Calculate the unfolded checksum of a data buffer.
*
* @buf: Input buffer
* @len: Input length
@ -498,23 +478,36 @@ uint32_t csum_unfolded(const void *buf, size_t len, uint32_t init)
#endif /* !__AVX2__ */
/**
* csum_iov_tail() - Calculate unfolded checksum for the tail of an IO vector
* @tail: IO vector tail to checksum
* csum() - Compute TCP/IP-style checksum
* @buf: Input buffer
* @len: Input length
* @init: Initial 32-bit checksum, 0 for no pre-computed checksum
*
* Return: 16-bit folded, complemented checksum
*/
/* NOLINTNEXTLINE(clang-diagnostic-unknown-attributes) */
__attribute__((optimize("-fno-strict-aliasing"))) /* See sum_16b() */
uint16_t csum(const void *buf, size_t len, uint32_t init)
{
return (uint16_t)~csum_fold(csum_unfolded(buf, len, init));
}
/**
* csum_iov() - Calculate the folded checksum over an array of IO vectors
*
* @iov: Pointer to the array of IO vectors
* @n: Length of the array
* @init: Initial 32-bit checksum, 0 for no pre-computed checksum
*
* Return: 16-bit folded, complemented checksum
*/
uint16_t csum_iov_tail(struct iov_tail *tail, uint32_t init)
/* cppcheck-suppress unusedFunction */
uint16_t csum_iov(const struct iovec *iov, size_t n, uint32_t init)
{
if (iov_tail_prune(tail)) {
size_t i;
unsigned int i;
for (i = 0; i < n; i++)
init = csum_unfolded(iov[i].iov_base, iov[i].iov_len, init);
init = csum_unfolded((char *)tail->iov[0].iov_base + tail->off,
tail->iov[0].iov_len - tail->off, init);
for (i = 1; i < tail->cnt; i++) {
const struct iovec *iov = &tail->iov[i];
init = csum_unfolded(iov->iov_base, iov->iov_len, init);
}
}
return (uint16_t)~csum_fold(init);
}

View file

@ -9,8 +9,9 @@
struct udphdr;
struct icmphdr;
struct icmp6hdr;
struct iov_tail;
uint32_t sum_16b(const void *buf, size_t len);
uint16_t csum_fold(uint32_t sum);
uint16_t csum_unaligned(const void *buf, size_t len, uint32_t init);
uint16_t csum_ip4_header(uint16_t l3len, uint8_t protocol,
struct in_addr saddr, struct in_addr daddr);
@ -18,18 +19,19 @@ uint32_t proto_ipv4_header_psum(uint16_t l4len, uint8_t protocol,
struct in_addr saddr, struct in_addr daddr);
void csum_udp4(struct udphdr *udp4hr,
struct in_addr saddr, struct in_addr daddr,
struct iov_tail *data);
const void *payload, size_t dlen);
void csum_icmp4(struct icmphdr *icmp4hr, const void *payload, size_t dlen);
uint32_t proto_ipv6_header_psum(uint16_t payload_len, uint8_t protocol,
const struct in6_addr *saddr,
const struct in6_addr *daddr);
void csum_udp6(struct udphdr *udp6hr,
const struct in6_addr *saddr, const struct in6_addr *daddr,
struct iov_tail *data);
const void *payload, size_t dlen);
void csum_icmp6(struct icmp6hdr *icmp6hr,
const struct in6_addr *saddr, const struct in6_addr *daddr,
const void *payload, size_t dlen);
uint32_t csum_unfolded(const void *buf, size_t len, uint32_t init);
uint16_t csum_iov_tail(struct iov_tail *tail, uint32_t init);
uint16_t csum(const void *buf, size_t len, uint32_t init);
uint16_t csum_iov(const struct iovec *iov, size_t n, uint32_t init);
#endif /* CHECKSUM_H */

703
conf.c

File diff suppressed because it is too large Load diff

1
conf.h
View file

@ -6,7 +6,6 @@
#ifndef CONF_H
#define CONF_H
enum passt_modes conf_mode(int argc, char *argv[]);
void conf(struct ctx *c, int argc, char **argv);
#endif /* CONF_H */

View file

@ -27,25 +27,4 @@ profile passt /usr/bin/passt{,.avx2} {
owner @{HOME}/** w, # pcap(), pidfile_open(),
# pidfile_write()
# Workaround: libvirt's profile comes with a passt subprofile which includes,
# in turn, <abstractions/passt>, and adds libvirt-specific rules on top, to
# allow passt (when started by libvirtd) to write socket and PID files in the
# location requested by libvirtd itself, and to execute passt itself.
#
# However, when libvirt runs as unprivileged user, the mechanism based on
# virt-aa-helper, designed to build per-VM profiles as guests are started,
# doesn't work. The helper needs to create and load profiles on the fly, which
# can't be done by unprivileged users, of course.
#
# As a result, libvirtd runs unconfined if guests are started by unprivileged
# users, starting passt unconfined as well, which means that passt runs under
# its own stand-alone profile (this one), which implies in turn that execve()
# of /usr/bin/passt is not allowed, and socket and PID files can't be written.
#
# Duplicate libvirt-specific rules here as long as this is not solved in
# libvirt's profile itself.
/usr/bin/passt r,
owner @{run}/user/[0-9]*/libvirt/qemu/run/passt/* rw,
owner @{run}/libvirt/qemu/passt/* rw,
}

View file

@ -1,29 +0,0 @@
# SPDX-License-Identifier: GPL-2.0-or-later
#
# PASST - Plug A Simple Socket Transport
# for qemu/UNIX domain socket mode
#
# PASTA - Pack A Subtle Tap Abstraction
# for network namespace/tap device mode
#
# contrib/apparmor/usr.bin.passt-repair - AppArmor profile for passt-repair(1)
#
# Copyright (c) 2025 Red Hat GmbH
# Author: Stefano Brivio <sbrivio@redhat.com>
abi <abi/3.0>,
#include <tunables/global>
profile passt-repair /usr/bin/passt-repair {
#include <abstractions/base>
/** rw, # passt's ".repair" socket might be anywhere
unix (connect, receive, send) type=stream,
capability dac_override, # connect to passt's socket as root
capability net_admin, # currently needed for TCP_REPAIR socket option
capability net_raw, # what TCP_REPAIR should require instead
network unix stream, # connect and use UNIX domain socket
network inet stream, # use TCP sockets
}

View file

@ -9,7 +9,6 @@
%global git_hash {{{ git_head }}}
%global selinuxtype targeted
%global selinux_policy_version 41.41
Name: passt
Version: {{{ git_version }}}
@ -34,22 +33,18 @@ for network namespaces: traffic is forwarded using a tap interface inside the
namespace, without the need to create further interfaces on the host, hence not
requiring any capabilities or privileges.
%package selinux
BuildArch: noarch
Summary: SELinux support for passt and pasta
Requires: selinux-policy-%{selinuxtype}
Requires: container-selinux
Requires(post): selinux-policy-%{selinuxtype}
Requires(post): container-selinux
Requires(post): policycoreutils
Requires(post): libselinux-utils
Requires(preun): policycoreutils
BuildRequires: selinux-policy-devel
BuildRequires: pkgconfig(systemd)
Recommends: selinux-policy-%{selinuxtype} >= %{selinux_policy_version}
%package selinux
BuildArch: noarch
Summary: SELinux support for passt and pasta
Requires: %{name} = %{version}-%{release}
Requires: selinux-policy
Requires(post): %{name}
Requires(post): policycoreutils
Requires(preun): %{name}
Requires(preun): policycoreutils
%description selinux
This package adds SELinux enforcement to passt(1), pasta(1), passt-repair(1).
This package adds SELinux enforcement to passt(1) and pasta(1).
%prep
%setup -q -n passt-%{git_hash}
@ -87,33 +82,23 @@ make -f %{_datadir}/selinux/devel/Makefile
install -p -m 644 -D passt.pp %{buildroot}%{_datadir}/selinux/packages/%{selinuxtype}/passt.pp
install -p -m 644 -D passt.if %{buildroot}%{_datadir}/selinux/devel/include/distributed/passt.if
install -p -m 644 -D pasta.pp %{buildroot}%{_datadir}/selinux/packages/%{selinuxtype}/pasta.pp
install -p -m 644 -D passt-repair.pp %{buildroot}%{_datadir}/selinux/packages/%{selinuxtype}/passt-repair.pp
popd
%pre selinux
%selinux_relabel_pre -s %{selinuxtype}
%post selinux
%selinux_modules_install -s %{selinuxtype} %{_datadir}/selinux/packages/%{selinuxtype}/passt.pp %{_datadir}/selinux/packages/%{selinuxtype}/pasta.pp %{_datadir}/selinux/packages/%{selinuxtype}/passt-repair.pp
%selinux_modules_install -s %{selinuxtype} %{_datadir}/selinux/packages/%{selinuxtype}/passt.pp
%selinux_modules_install -s %{selinuxtype} %{_datadir}/selinux/packages/%{selinuxtype}/pasta.pp
%postun selinux
if [ $1 -eq 0 ]; then
%selinux_modules_uninstall -s %{selinuxtype} passt pasta passt-repair
%selinux_modules_uninstall -s %{selinuxtype} passt
%selinux_modules_uninstall -s %{selinuxtype} pasta
fi
%posttrans selinux
%selinux_relabel_post -s %{selinuxtype}
# %selinux_relabel_post calls fixfiles(8) with the previous file_contexts file
# (see selabel_file(5)) in order to restore only the file contexts which
# actually changed. However, as file_contexts doesn't support %{USERID}
# substitutions, this will not work for specific file contexts that pasta needs
# to have under /run/user.
#
# Restore those explicitly, hiding errors from restorecon(8): we can't pass a
# path that's more specific than this, but at the same time /run/user often
# contains FUSE mountpoints that can't be accessed as root, leading to
# "Permission denied" messages, but not failures.
restorecon -R /run/user 2>/dev/null
%files
%license LICENSES/{GPL-2.0-or-later.txt,BSD-3-Clause.txt}
@ -123,11 +108,9 @@ restorecon -R /run/user 2>/dev/null
%{_bindir}/passt
%{_bindir}/pasta
%{_bindir}/qrap
%{_bindir}/passt-repair
%{_mandir}/man1/passt.1*
%{_mandir}/man1/pasta.1*
%{_mandir}/man1/qrap.1*
%{_mandir}/man1/passt-repair.1*
%ifarch x86_64
%{_bindir}/passt.avx2
%{_mandir}/man1/passt.avx2.1*
@ -139,7 +122,6 @@ restorecon -R /run/user 2>/dev/null
%{_datadir}/selinux/packages/%{selinuxtype}/passt.pp
%{_datadir}/selinux/devel/include/distributed/passt.if
%{_datadir}/selinux/packages/%{selinuxtype}/pasta.pp
%{_datadir}/selinux/packages/%{selinuxtype}/passt-repair.pp
%changelog
{{{ passt_git_changelog }}}

View file

@ -1,11 +0,0 @@
# SPDX-License-Identifier: GPL-2.0-or-later
#
# PASST - Plug A Simple Socket Transport
# for qemu/UNIX domain socket mode
#
# contrib/selinux/passt-repair.fc - SELinux: File Context for passt-repair
#
# Copyright (c) 2025 Red Hat GmbH
# Author: Stefano Brivio <sbrivio@redhat.com>
/usr/bin/passt-repair system_u:object_r:passt_repair_exec_t:s0

View file

@ -1,87 +0,0 @@
# SPDX-License-Identifier: GPL-2.0-or-later
#
# PASST - Plug A Simple Socket Transport
# for qemu/UNIX domain socket mode
#
# contrib/selinux/passt-repair.te - SELinux: Type Enforcement for passt-repair
#
# Copyright (c) 2025 Red Hat GmbH
# Author: Stefano Brivio <sbrivio@redhat.com>
policy_module(passt-repair, 0.1)
require {
type unconfined_t;
type passt_t;
role unconfined_r;
class process transition;
class file { read execute execute_no_trans entrypoint open map };
class capability { dac_override net_admin net_raw };
class chr_file { append open getattr read write ioctl };
class unix_stream_socket { create connect sendto };
class sock_file { read write };
class tcp_socket { read setopt write };
type console_device_t;
type user_devpts_t;
type user_tmp_t;
# Workaround: passt-repair needs to access socket files
# that passt, started by libvirt, might create under different
# labels, depending on whether passt is started as root or not.
#
# However, libvirt doesn't maintain its own policy, which makes
# updates particularly complicated. To avoid breakage in the short
# term, deal with that in passt's own policy.
type qemu_var_run_t;
type virt_var_run_t;
}
type passt_repair_t;
domain_type(passt_repair_t);
type passt_repair_exec_t;
corecmd_executable_file(passt_repair_exec_t);
role unconfined_r types passt_repair_t;
allow passt_repair_t passt_repair_exec_t:file { read execute execute_no_trans entrypoint open map };
type_transition unconfined_t passt_repair_exec_t:process passt_repair_t;
allow unconfined_t passt_repair_t:process transition;
allow passt_repair_t self:capability { dac_override dac_read_search net_admin net_raw };
allow passt_repair_t self:capability2 bpf;
allow passt_repair_t console_device_t:chr_file { append open getattr read write ioctl };
allow passt_repair_t user_devpts_t:chr_file { append open getattr read write ioctl };
allow passt_repair_t unconfined_t:unix_stream_socket { connectto read write };
allow passt_repair_t passt_t:unix_stream_socket { connectto read write };
allow passt_repair_t user_tmp_t:unix_stream_socket { connectto read write };
allow passt_repair_t user_tmp_t:dir { getattr read search watch };
allow passt_repair_t unconfined_t:sock_file { getattr read write };
allow passt_repair_t passt_t:sock_file { getattr read write };
allow passt_repair_t user_tmp_t:sock_file { getattr read write };
allow passt_repair_t unconfined_t:tcp_socket { read setopt write };
allow passt_repair_t passt_t:tcp_socket { read setopt write };
# Workaround: passt-repair needs to access socket files
# that passt, started by libvirt, might create under different
# labels, depending on whether passt is started as root or not.
#
# However, libvirt doesn't maintain its own policy, which makes
# updates particularly complicated. To avoid breakage in the short
# term, deal with that in passt's own policy.
allow passt_repair_t qemu_var_run_t:unix_stream_socket { connectto read write };
allow passt_repair_t virt_var_run_t:unix_stream_socket { connectto read write };
allow passt_repair_t qemu_var_run_t:dir { getattr read search watch };
allow passt_repair_t virt_var_run_t:dir { getattr read search watch };
allow passt_repair_t qemu_var_run_t:sock_file { getattr read write };
allow passt_repair_t virt_var_run_t:sock_file { getattr read write };

View file

@ -20,19 +20,9 @@ require {
type fs_t;
type tmp_t;
type user_tmp_t;
type user_home_t;
type tmpfs_t;
type root_t;
# Workaround: passt --vhost-user needs to map guest memory, but
# libvirt doesn't maintain its own policy, which makes updates
# particularly complicated. To avoid breakage in the short term,
# deal with it in passt's own policy.
type svirt_image_t;
type svirt_tmpfs_t;
type svirt_t;
type null_device_t;
class file { ioctl getattr setattr create read write unlink open relabelto execute execute_no_trans map };
class dir { search write add_name remove_name mounton };
class chr_file { append read write open getattr ioctl };
@ -48,8 +38,8 @@ require {
type net_conf_t;
type proc_net_t;
type node_t;
class tcp_socket { create accept listen name_bind name_connect getattr ioctl };
class udp_socket { create accept listen getattr };
class tcp_socket { create accept listen name_bind name_connect };
class udp_socket { create accept listen };
class icmp_socket { bind create name_bind node_bind setopt read write };
class sock_file { create unlink write };
@ -57,6 +47,8 @@ require {
type port_t;
type http_port_t;
type passwd_file_t;
class netlink_route_socket { bind create nlmsg_read };
type sysctl_net_t;
@ -90,9 +82,6 @@ allow passt_t root_t:dir mounton;
allow passt_t tmp_t:dir { add_name mounton remove_name write };
allow passt_t tmpfs_t:filesystem mount;
allow passt_t fs_t:filesystem unmount;
allow passt_t user_home_t:dir search;
allow passt_t user_tmp_t:fifo_file append;
allow passt_t user_tmp_t:file map;
manage_files_pattern(passt_t, user_tmp_t, user_tmp_t)
files_pid_filetrans(passt_t, user_tmp_t, file)
@ -107,7 +96,8 @@ allow passt_t self:capability { sys_tty_config setpcap net_bind_service setuid s
allow passt_t self:cap_userns { setpcap sys_admin sys_ptrace };
allow passt_t self:user_namespace create;
auth_read_passwd(passt_t)
allow passt_t passwd_file_t:file read_file_perms;
sssd_search_lib(passt_t)
allow passt_t proc_net_t:file read;
allow passt_t net_conf_t:file { open read };
@ -132,19 +122,11 @@ corenet_udp_sendrecv_all_ports(passt_t)
allow passt_t node_t:icmp_socket { name_bind node_bind };
allow passt_t port_t:icmp_socket name_bind;
allow passt_t self:tcp_socket { create getopt setopt connect bind listen accept shutdown read write getattr ioctl };
allow passt_t self:udp_socket { create getopt setopt connect bind read write getattr };
allow passt_t self:tcp_socket { create getopt setopt connect bind listen accept shutdown read write };
allow passt_t self:udp_socket { create getopt setopt connect bind read write };
allow passt_t self:icmp_socket { bind create setopt read write };
allow passt_t user_tmp_t:dir { add_name write };
allow passt_t user_tmp_t:file { create open };
allow passt_t user_tmp_t:sock_file { create read write unlink };
allow passt_t unconfined_t:unix_stream_socket { read write };
# Workaround: passt --vhost-user needs to map guest memory, but
# libvirt doesn't maintain its own policy, which makes updates
# particularly complicated. To avoid breakage in the short term,
# deal with it in passt's own policy.
allow passt_t svirt_image_t:file { read write map };
allow passt_t svirt_tmpfs_t:file { read write map };
allow passt_t null_device_t:chr_file map;

View file

@ -8,9 +8,7 @@
# Copyright (c) 2022 Red Hat GmbH
# Author: Stefano Brivio <sbrivio@redhat.com>
/usr/bin/pasta system_u:object_r:pasta_exec_t:s0
/usr/bin/pasta.avx2 system_u:object_r:pasta_exec_t:s0
/tmp/pasta\.pcap system_u:object_r:pasta_log_t:s0
/var/run/pasta\.pid system_u:object_r:pasta_pid_t:s0
/run/user/%{USERID}/netns system_u:object_r:ifconfig_var_run_t:s0
/run/user/%{USERID}/containers/networks/rootless-netns system_u:object_r:ifconfig_var_run_t:s0
/usr/bin/pasta system_u:object_r:pasta_exec_t:s0
/usr/bin/pasta.avx2 system_u:object_r:pasta_exec_t:s0
/tmp/pasta\.pcap system_u:object_r:pasta_log_t:s0
/var/run/pasta\.pid system_u:object_r:pasta_pid_t:s0

View file

@ -18,7 +18,6 @@ require {
type bin_t;
type user_home_t;
type user_home_dir_t;
type user_tmp_t;
type fs_t;
type tmp_t;
type tmpfs_t;
@ -57,10 +56,8 @@ require {
attribute port_type;
type port_t;
type http_port_t;
type http_cache_port_t;
type ssh_port_t;
type reserved_port_t;
type unreserved_port_t;
type dns_port_t;
type dhcpc_port_t;
type chronyd_port_t;
@ -71,6 +68,9 @@ require {
type system_dbusd_t;
type systemd_hostnamed_t;
type systemd_systemctl_exec_t;
type passwd_file_t;
type sssd_public_t;
type sssd_var_lib_t;
class dbus send_msg;
class system module_request;
class system status;
@ -89,15 +89,6 @@ require {
class capability { sys_tty_config setuid setgid };
class cap_userns { setpcap sys_admin sys_ptrace net_bind_service net_admin };
class user_namespace create;
# Container requires
attribute_role usernetctl_roles;
role container_user_r;
role staff_r;
role user_r;
type container_runtime_t;
type container_t;
type systemd_user_runtimedir_t;
}
type pasta_t;
@ -122,12 +113,10 @@ init_daemon_domain(pasta_t, pasta_exec_t)
allow pasta_t self:capability { setpcap net_bind_service sys_tty_config dac_read_search net_admin sys_resource setuid setgid };
allow pasta_t self:cap_userns { setpcap sys_admin sys_ptrace net_admin net_bind_service };
# pasta only calls setuid and setgid with the current UID and GID, so this
# denial is harmless. See https://bugzilla.redhat.com/show_bug.cgi?id=2330512#c10
dontaudit pasta_t self:cap_userns { setgid setuid };
allow pasta_t self:user_namespace create;
auth_read_passwd(pasta_t)
allow pasta_t passwd_file_t:file read_file_perms;
sssd_search_lib(pasta_t)
domain_auto_trans(pasta_t, bin_t, unconfined_t);
domain_auto_trans(pasta_t, shell_exec_t, unconfined_t);
@ -137,22 +126,17 @@ domain_auto_trans(pasta_t, ping_exec_t, ping_t);
allow pasta_t nsfs_t:file { open read };
allow pasta_t user_home_t:dir { getattr search };
allow pasta_t user_home_t:file { open read getattr setattr execute execute_no_trans map};
allow pasta_t user_home_t:dir getattr;
allow pasta_t user_home_t:file { open read getattr setattr };
allow pasta_t user_home_dir_t:dir { search getattr open add_name read write };
allow pasta_t user_home_dir_t:file { create open read write };
allow pasta_t tmp_t:dir { add_name mounton remove_name write };
allow pasta_t tmpfs_t:filesystem { getattr mount };
allow pasta_t tmpfs_t:filesystem mount;
allow pasta_t fs_t:filesystem unmount;
allow pasta_t root_t:dir mounton;
manage_files_pattern(pasta_t, pasta_pid_t, pasta_pid_t)
files_pid_filetrans(pasta_t, pasta_pid_t, file)
allow pasta_t user_tmp_t:dir { add_name remove_name search write };
allow pasta_t user_tmp_t:fifo_file append;
allow pasta_t user_tmp_t:file { create open write };
allow pasta_t user_tmp_t:sock_file { create unlink };
allow pasta_t console_device_t:chr_file { open write getattr ioctl };
allow pasta_t user_devpts_t:chr_file { getattr read write ioctl };
logging_send_syslog_msg(pasta_t)
@ -168,11 +152,6 @@ allow pasta_t tmp_t:sock_file { create unlink write };
allow pasta_t self:tcp_socket create_stream_socket_perms;
corenet_tcp_sendrecv_generic_node(pasta_t)
corenet_tcp_bind_generic_node(pasta_t)
allow pasta_t container_runtime_t:dir { open read search };
allow pasta_t container_runtime_t:fifo_file { getattr write };
allow pasta_t container_runtime_t:file read;
allow pasta_t container_runtime_t:lnk_file read;
allow pasta_t container_t:lnk_file read;
allow pasta_t pasta_port_t:tcp_socket { name_bind name_connect };
allow pasta_t pasta_port_t:udp_socket { name_bind };
allow pasta_t http_port_t:tcp_socket { name_bind name_connect };
@ -185,8 +164,6 @@ allow pasta_t self:udp_socket create_stream_socket_perms;
allow pasta_t reserved_port_t:udp_socket name_bind;
allow pasta_t llmnr_port_t:tcp_socket name_bind;
allow pasta_t llmnr_port_t:udp_socket name_bind;
allow pasta_t http_cache_port_t:tcp_socket { name_bind name_connect };
allow pasta_t unreserved_port_t:udp_socket name_bind;
corenet_udp_sendrecv_generic_node(pasta_t)
corenet_udp_bind_generic_node(pasta_t)
allow pasta_t node_t:icmp_socket { name_bind node_bind };
@ -198,12 +175,15 @@ allow pasta_t init_t:lnk_file read;
allow pasta_t init_t:unix_stream_socket connectto;
allow pasta_t init_t:dbus send_msg;
allow pasta_t init_t:system status;
allow pasta_t unconfined_t:dir { read search };
allow pasta_t unconfined_t:dir search;
allow pasta_t unconfined_t:file read;
allow pasta_t unconfined_t:lnk_file read;
allow pasta_t passwd_file_t:file { getattr open read };
allow pasta_t self:process { setpgid setcap };
allow pasta_t shell_exec_t:file { execute execute_no_trans map };
allow pasta_t sssd_var_lib_t:dir search;
allow pasta_t sssd_public_t:dir search;
allow pasta_t hostname_exec_t:file { execute execute_no_trans getattr open read map };
allow pasta_t system_dbusd_t:unix_stream_socket connectto;
allow pasta_t system_dbusd_t:dbus send_msg;
@ -219,6 +199,8 @@ allow pasta_t sysctl_net_t:dir search;
allow pasta_t sysctl_net_t:file { open read write };
allow pasta_t kernel_t:system module_request;
allow pasta_t nsfs_t:file read;
allow pasta_t proc_t:dir mounton;
allow pasta_t proc_t:filesystem mount;
allow pasta_t net_conf_t:lnk_file read;
@ -230,28 +212,3 @@ allow pasta_t netutils_t:process { noatsecure rlimitinh siginh };
allow pasta_t ping_t:process { noatsecure rlimitinh siginh };
allow pasta_t user_tty_device_t:chr_file { append read write };
allow pasta_t user_devpts_t:chr_file { append read write };
# Allow network administration commands for non-privileged users
roleattribute container_user_r usernetctl_roles;
roleattribute staff_r usernetctl_roles;
roleattribute user_r usernetctl_roles;
role usernetctl_roles types pasta_t;
# Make pasta in a container run under the pasta_t context
type_transition container_runtime_t pasta_exec_t : process pasta_t;
allow container_runtime_t pasta_t:process transition;
# Label the user network namespace files
type_transition container_runtime_t user_tmp_t : dir ifconfig_var_run_t "netns";
type_transition container_runtime_t user_tmp_t : dir ifconfig_var_run_t "rootless-netns";
allow pasta_t ifconfig_var_run_t:dir { add_name open rmdir write };
allow pasta_t ifconfig_var_run_t:file { create open write };
allow systemd_user_runtimedir_t ifconfig_var_run_t:dir rmdir;
# Allow pasta to bind to any port
bool pasta_bind_all_ports true;
if (pasta_bind_all_ports) {
allow pasta_t port_type:icmp_socket { accept getopt name_bind };
allow pasta_t port_type:tcp_socket { accept getopt name_bind name_connect };
allow pasta_t port_type:udp_socket { accept getopt name_bind };
}

142
dhcp.c
View file

@ -36,9 +36,9 @@
/**
* struct opt - DHCP option
* @sent: Convenience flag, set while filling replies
* @slen: Length of option defined for server, -1 if not going to be sent
* @slen: Length of option defined for server
* @s: Option payload from server
* @clen: Length of option received from client, -1 if not received
* @clen: Length of option received from client
* @c: Option payload from client
*/
struct opt {
@ -63,21 +63,11 @@ static struct opt opts[255];
#define OPT_MIN 60 /* RFC 951 */
/* Total option size (excluding end option) is 576 (RFC 2131), minus
* offset of options (268), minus end option (1).
*/
#define OPT_MAX 307
/**
* dhcp_init() - Initialise DHCP options
*/
void dhcp_init(void)
{
int i;
for (i = 0; i < ARRAY_SIZE(opts); i++)
opts[i].slen = -1;
opts[1] = (struct opt) { 0, 4, { 0 }, 0, { 0 }, }; /* Mask */
opts[3] = (struct opt) { 0, 4, { 0 }, 0, { 0 }, }; /* Router */
opts[51] = (struct opt) { 0, 4, { 0xff,
@ -117,8 +107,6 @@ struct msg {
uint32_t xid;
uint16_t secs;
uint16_t flags;
#define FLAG_BROADCAST htons_constant(0x8000)
uint32_t ciaddr;
struct in_addr yiaddr;
uint32_t siaddr;
@ -127,7 +115,7 @@ struct msg {
uint8_t sname[64];
uint8_t file[128];
uint32_t magic;
uint8_t o[OPT_MAX + 1 /* End option */ ];
uint8_t o[308];
} __attribute__((__packed__));
/**
@ -135,28 +123,15 @@ struct msg {
* @m: Message to fill
* @o: Option number
* @offset: Current offset within options field, updated on insertion
*
* Return: false if m has space to write the option, true otherwise
*/
static bool fill_one(struct msg *m, int o, int *offset)
static void fill_one(struct msg *m, int o, int *offset)
{
size_t slen = opts[o].slen;
/* If we don't have space to write the option, then just skip */
if (*offset + 2 /* code and length of option */ + slen > OPT_MAX)
return true;
m->o[*offset] = o;
m->o[*offset + 1] = slen;
/* Move to option */
*offset += 2;
memcpy(&m->o[*offset], opts[o].s, slen);
m->o[*offset + 1] = opts[o].slen;
memcpy(&m->o[*offset + 2], opts[o].s, opts[o].slen);
opts[o].sent = 1;
*offset += slen;
return false;
*offset += 2 + opts[o].slen;
}
/**
@ -169,6 +144,9 @@ static int fill(struct msg *m)
{
int i, o, offset = 0;
m->op = BOOTREPLY;
m->secs = 0;
for (o = 0; o < 255; o++)
opts[o].sent = 0;
@ -176,24 +154,22 @@ static int fill(struct msg *m)
* option 53 at the beginning of the list.
* Put it there explicitly, unless requested via option 55.
*/
if (opts[55].clen > 0 && !memchr(opts[55].c, 53, opts[55].clen))
if (fill_one(m, 53, &offset))
debug("DHCP: skipping option 53");
if (!memchr(opts[55].c, 53, opts[55].clen))
fill_one(m, 53, &offset);
for (i = 0; i < opts[55].clen; i++) {
o = opts[55].c[i];
if (opts[o].slen != -1)
if (fill_one(m, o, &offset))
debug("DHCP: skipping option %i", o);
if (opts[o].slen)
fill_one(m, o, &offset);
}
for (o = 0; o < 255; o++) {
if (opts[o].slen != -1 && !opts[o].sent)
if (fill_one(m, o, &offset))
debug("DHCP: skipping option %i", o);
if (opts[o].slen && !opts[o].sent)
fill_one(m, o, &offset);
}
m->o[offset++] = 255;
m->o[offset++] = 0;
if (offset < OPT_MIN) {
memset(&m->o[offset], 0, OPT_MIN - offset);
@ -288,9 +264,6 @@ static void opt_set_dns_search(const struct ctx *c, size_t max_len)
".\xc0");
}
}
if (!opts[119].slen)
opts[119].slen = -1;
}
/**
@ -304,13 +277,12 @@ int dhcp(const struct ctx *c, const struct pool *p)
{
size_t mlen, dlen, offset = 0, opt_len, opt_off = 0;
char macstr[ETH_ADDRSTRLEN];
struct in_addr mask, dst;
const struct ethhdr *eh;
const struct iphdr *iph;
const struct udphdr *uh;
struct msg const *m;
struct msg reply;
struct in_addr mask;
unsigned int i;
struct msg *m;
eh = packet_get(p, 0, offset, sizeof(*eh), NULL);
offset += sizeof(*eh);
@ -339,27 +311,8 @@ int dhcp(const struct ctx *c, const struct pool *p)
m->op != BOOTREQUEST)
return -1;
reply.op = BOOTREPLY;
reply.htype = m->htype;
reply.hlen = m->hlen;
reply.hops = 0;
reply.xid = m->xid;
reply.secs = 0;
reply.flags = m->flags;
reply.ciaddr = m->ciaddr;
reply.yiaddr = c->ip4.addr;
reply.siaddr = 0;
reply.giaddr = m->giaddr;
memcpy(&reply.chaddr, m->chaddr, sizeof(reply.chaddr));
memset(&reply.sname, 0, sizeof(reply.sname));
memset(&reply.file, 0, sizeof(reply.file));
reply.magic = m->magic;
offset += offsetof(struct msg, o);
for (i = 0; i < ARRAY_SIZE(opts); i++)
opts[i].clen = -1;
while (opt_off + 2 < opt_len) {
const uint8_t *olen, *val;
uint8_t *type;
@ -378,19 +331,11 @@ int dhcp(const struct ctx *c, const struct pool *p)
opt_off += *olen + 2;
}
opts[80].slen = -1;
if (opts[53].clen > 0 && opts[53].c[0] == DHCPDISCOVER) {
if (opts[80].clen == -1) {
info("DHCP: offer to discover");
opts[53].s[0] = DHCPOFFER;
} else {
info("DHCP: ack to discover (Rapid Commit)");
opts[53].s[0] = DHCPACK;
opts[80].slen = 0;
}
} else if (opts[53].clen <= 0 || opts[53].c[0] == DHCPREQUEST) {
info("%s: ack to request", /* DHCP needs a valid message type */
(opts[53].clen <= 0) ? "BOOTP" : "DHCP");
if (opts[53].c[0] == DHCPDISCOVER) {
info("DHCP: offer to discover");
opts[53].s[0] = DHCPOFFER;
} else if (opts[53].c[0] == DHCPREQUEST || !opts[53].clen) {
info("%s: ack to request", opts[53].clen ? "DHCP" : "BOOTP");
opts[53].s[0] = DHCPACK;
} else {
return -1;
@ -398,6 +343,7 @@ int dhcp(const struct ctx *c, const struct pool *p)
info(" from %s", eth_ntop(m->chaddr, macstr, sizeof(macstr)));
m->yiaddr = c->ip4.addr;
mask.s_addr = htonl(0xffffffff << (32 - c->ip4.prefix_len));
memcpy(opts[1].s, &mask, sizeof(mask));
memcpy(opts[3].s, &c->ip4.guest_gw, sizeof(c->ip4.guest_gw));
@ -417,7 +363,7 @@ int dhcp(const struct ctx *c, const struct pool *p)
&c->ip4.guest_gw, sizeof(c->ip4.guest_gw));
}
if (c->mtu) {
if (c->mtu != -1) {
opts[26].slen = 2;
opts[26].s[0] = c->mtu / 256;
opts[26].s[1] = c->mtu % 256;
@ -428,44 +374,12 @@ int dhcp(const struct ctx *c, const struct pool *p)
((struct in_addr *)opts[6].s)[i] = c->ip4.dns[i];
opts[6].slen += sizeof(uint32_t);
}
if (!opts[6].slen)
opts[6].slen = -1;
opt_len = strlen(c->hostname);
if (opt_len > 0) {
opts[12].slen = opt_len;
memcpy(opts[12].s, &c->hostname, opt_len);
}
opt_len = strlen(c->fqdn);
if (opt_len > 0) {
opt_len += 3 /* flags */
+ 2; /* Length byte for first label, and terminator */
if (sizeof(opts[81].s) >= opt_len) {
opts[81].s[0] = 0x4; /* flags (E) */
opts[81].s[1] = 0xff; /* RCODE1 */
opts[81].s[2] = 0xff; /* RCODE2 */
encode_domain_name((char *)opts[81].s + 3, c->fqdn);
opts[81].slen = opt_len;
} else {
debug("DHCP: client FQDN option doesn't fit, skipping");
}
}
if (!c->no_dhcp_dns_search)
opt_set_dns_search(c, sizeof(m->o));
dlen = offsetof(struct msg, o) + fill(&reply);
if (m->flags & FLAG_BROADCAST)
dst = in4addr_broadcast;
else
dst = c->ip4.addr;
tap_udp4_send(c, c->ip4.our_tap_addr, 67, dst, 68, &reply, dlen);
dlen = offsetof(struct msg, o) + fill(m);
tap_udp4_send(c, c->ip4.our_tap_addr, 67, c->ip4.addr, 68, m, dlen);
return 1;
}

158
dhcpv6.c
View file

@ -48,7 +48,6 @@ struct opt_hdr {
# define STATUS_NOTONLINK htons_constant(4)
# define OPT_DNS_SERVERS htons_constant(23)
# define OPT_DNS_SEARCH htons_constant(24)
# define OPT_CLIENT_FQDN htons_constant(39)
#define STR_NOTONLINK "Prefix not appropriate for link."
uint16_t l;
@ -59,9 +58,6 @@ struct opt_hdr {
sizeof(struct opt_hdr))
#define OPT_VSIZE(x) (sizeof(struct opt_##x) - \
sizeof(struct opt_hdr))
#define OPT_MAX_SIZE IPV6_MIN_MTU - (sizeof(struct ipv6hdr) + \
sizeof(struct udphdr) + \
sizeof(struct msg_hdr))
/**
* struct opt_client_id - DHCPv6 Client Identifier option
@ -144,9 +140,7 @@ struct opt_ia_addr {
struct opt_status_code {
struct opt_hdr hdr;
uint16_t code;
/* "nonstring" is only supported since clang 23 */
/* NOLINTNEXTLINE(clang-diagnostic-unknown-attributes) */
__attribute__((nonstring)) char status_msg[sizeof(STR_NOTONLINK) - 1];
char status_msg[sizeof(STR_NOTONLINK) - 1];
} __attribute__((packed));
/**
@ -169,18 +163,6 @@ struct opt_dns_search {
char list[MAXDNSRCH * NS_MAXDNAME];
} __attribute__((packed));
/**
* struct opt_client_fqdn - Client FQDN option (RFC 4704)
* @hdr: Option header
* @flags: Flags described by RFC 4704
* @domain_name: Client FQDN
*/
struct opt_client_fqdn {
struct opt_hdr hdr;
uint8_t flags;
char domain_name[PASST_MAXDNAME];
} __attribute__((packed));
/**
* struct msg_hdr - DHCPv6 client/server message header
* @type: DHCP message type
@ -211,7 +193,6 @@ struct msg_hdr {
* @client_id: Client Identifier, variable length
* @dns_servers: DNS Recursive Name Server, here just for storage size
* @dns_search: Domain Search List, here just for storage size
* @client_fqdn: Client FQDN, variable length
*/
static struct resp_t {
struct msg_hdr hdr;
@ -222,7 +203,6 @@ static struct resp_t {
struct opt_client_id client_id;
struct opt_dns_servers dns_servers;
struct opt_dns_search dns_search;
struct opt_client_fqdn client_fqdn;
} __attribute__((__packed__)) resp = {
{ 0 },
SERVER_ID,
@ -248,10 +228,6 @@ static struct resp_t {
{ { OPT_DNS_SEARCH, 0, },
{ 0 },
},
{ { OPT_CLIENT_FQDN, 0, },
0, { 0 },
},
};
static const struct opt_status_code sc_not_on_link = {
@ -320,42 +296,47 @@ static struct opt_hdr *dhcpv6_opt(const struct pool *p, size_t *offset,
static struct opt_hdr *dhcpv6_ia_notonlink(const struct pool *p,
struct in6_addr *la)
{
int ia_types[2] = { OPT_IA_NA, OPT_IA_TA }, *ia_type;
const struct opt_ia_addr *opt_addr;
char buf[INET6_ADDRSTRLEN];
struct in6_addr req_addr;
const struct opt_hdr *h;
struct opt_hdr *ia;
size_t offset;
int ia_type;
foreach(ia_type, ia_types) {
offset = 0;
while ((ia = dhcpv6_opt(p, &offset, *ia_type))) {
if (ntohs(ia->l) < OPT_VSIZE(ia_na))
ia_type = OPT_IA_NA;
ia_ta:
offset = 0;
while ((ia = dhcpv6_opt(p, &offset, ia_type))) {
if (ntohs(ia->l) < OPT_VSIZE(ia_na))
return NULL;
offset += sizeof(struct opt_ia_na);
while ((h = dhcpv6_opt(p, &offset, OPT_IAAADR))) {
const struct opt_ia_addr *opt_addr;
if (ntohs(h->l) != OPT_VSIZE(ia_addr))
return NULL;
offset += sizeof(struct opt_ia_na);
while ((h = dhcpv6_opt(p, &offset, OPT_IAAADR))) {
if (ntohs(h->l) != OPT_VSIZE(ia_addr))
return NULL;
opt_addr = (const struct opt_ia_addr *)h;
req_addr = opt_addr->addr;
if (!IN6_ARE_ADDR_EQUAL(la, &req_addr))
goto err;
offset += sizeof(struct opt_ia_addr);
opt_addr = (const struct opt_ia_addr *)h;
req_addr = opt_addr->addr;
if (!IN6_ARE_ADDR_EQUAL(la, &req_addr)) {
info("DHCPv6: requested address %s not on link",
inet_ntop(AF_INET6, &req_addr,
buf, sizeof(buf)));
return ia;
}
offset += sizeof(struct opt_ia_addr);
}
}
return NULL;
if (ia_type == OPT_IA_NA) {
ia_type = OPT_IA_TA;
goto ia_ta;
}
err:
info("DHCPv6: requested address %s not on link",
inet_ntop(AF_INET6, &req_addr, buf, sizeof(buf)));
return ia;
return NULL;
}
/**
@ -370,6 +351,7 @@ static size_t dhcpv6_dns_fill(const struct ctx *c, char *buf, int offset)
{
struct opt_dns_servers *srv = NULL;
struct opt_dns_search *srch = NULL;
char *p = NULL;
int i;
if (c->no_dhcp_dns)
@ -406,81 +388,34 @@ search:
if (!name_len)
continue;
name_len += 2; /* Length byte for first label, and terminator */
if (name_len >
NS_MAXDNAME + 1 /* Length byte for first label */ ||
name_len > 255) {
debug("DHCP: DNS search name '%s' too long, skipping",
c->dns_search[i].n);
continue;
}
if (!srch) {
srch = (struct opt_dns_search *)(buf + offset);
offset += sizeof(struct opt_hdr);
srch->hdr.t = OPT_DNS_SEARCH;
srch->hdr.l = 0;
p = srch->list;
}
encode_domain_name(buf + offset, c->dns_search[i].n);
srch->hdr.l += name_len;
offset += name_len;
*p = '.';
p = stpncpy(p + 1, c->dns_search[i].n, name_len);
p++;
srch->hdr.l += name_len + 2;
offset += name_len + 2;
}
if (srch)
if (srch) {
for (i = 0; i < srch->hdr.l; i++) {
if (srch->list[i] == '.') {
srch->list[i] = strcspn(srch->list + i + 1,
".");
}
}
srch->hdr.l = htons(srch->hdr.l);
}
return offset;
}
/**
* dhcpv6_client_fqdn_fill() - Fill in client FQDN option
* @p: Packet pool, searched for the FQDN option sent by the client
* @c: Execution context
* @buf: Response message buffer where options will be appended
* @offset: Offset in message buffer for new options
*
* Return: updated length of response message buffer.
*/
static size_t dhcpv6_client_fqdn_fill(const struct pool *p, const struct ctx *c,
char *buf, int offset)
{
struct opt_client_fqdn const *req_opt;
struct opt_client_fqdn *o;
size_t opt_len;
opt_len = strlen(c->fqdn);
if (opt_len == 0) {
return offset;
}
opt_len += 2; /* Length byte for first label, and terminator */
if (opt_len > OPT_MAX_SIZE - (offset +
sizeof(struct opt_hdr) +
1 /* flags */ )) {
debug("DHCPv6: client FQDN option doesn't fit, skipping");
return offset;
}
o = (struct opt_client_fqdn *)(buf + offset);
encode_domain_name(o->domain_name, c->fqdn);
req_opt = (struct opt_client_fqdn *)dhcpv6_opt(p, &(size_t){ 0 },
OPT_CLIENT_FQDN);
if (req_opt && req_opt->flags & 0x01 /* S flag */)
o->flags = 0x02 /* O flag */;
else
o->flags = 0x00;
opt_len++;
o->hdr.t = OPT_CLIENT_FQDN;
o->hdr.l = htons(opt_len);
return offset + sizeof(struct opt_hdr) + opt_len;
}
/**
* dhcpv6() - Check if this is a DHCPv6 message, reply as needed
* @c: Execution context
@ -493,11 +428,11 @@ static size_t dhcpv6_client_fqdn_fill(const struct pool *p, const struct ctx *c,
int dhcpv6(struct ctx *c, const struct pool *p,
const struct in6_addr *saddr, const struct in6_addr *daddr)
{
const struct opt_hdr *client_id, *server_id, *ia;
struct opt_hdr *ia, *bad_ia, *client_id;
const struct opt_hdr *server_id;
const struct in6_addr *src;
const struct msg_hdr *mh;
const struct udphdr *uh;
struct opt_hdr *bad_ia;
size_t mlen, n;
uh = packet_get(p, 0, 0, sizeof(*uh), &mlen);
@ -614,7 +549,6 @@ int dhcpv6(struct ctx *c, const struct pool *p,
n = offsetof(struct resp_t, client_id) +
sizeof(struct opt_hdr) + ntohs(client_id->l);
n = dhcpv6_dns_fill(c, (char *)&resp, n);
n = dhcpv6_client_fqdn_fill(p, c, (char *)&resp, n);
resp.hdr.xid = mh->xid;

View file

@ -1,2 +0,0 @@
/source
/target

View file

@ -1,20 +0,0 @@
# SPDX-License-Identifier: GPL-2.0-or-later
#
# PASST - Plug A Simple Socket Transport
# for qemu/UNIX domain socket mode
#
# PASTA - Pack A Subtle Tap Abstraction
# for network namespace/tap device mode
#
# Copyright (c) 2025 Red Hat GmbH
# Author: Stefano Brivio <sbrivio@redhat.com>
TARGETS = source target
CFLAGS = -Wall -Wextra -pedantic
all: $(TARGETS)
$(TARGETS): %: %.c
clean:
rm -f $(TARGETS)

View file

@ -1,51 +0,0 @@
<!---
SPDX-License-Identifier: GPL-2.0-or-later
Copyright (c) 2025 Red Hat GmbH
Author: Stefano Brivio <sbrivio@redhat.com>
-->
Migration
=========
These test programs show a migration of a TCP connection from one process to
another using the TCP_REPAIR socket option.
The two processes are a mock of the matching implementation in passt(1), and run
unprivileged, so they rely on the passt-repair helper to connect to them and set
or clear TCP_REPAIR on the connection socket, transferred to the helper using
SCM_RIGHTS.
The passt-repair helper needs to have the CAP_NET_ADMIN capability, or run as
root.
Example of usage
----------------
* Start the test server
$ nc -l 9999
* Start the source side of the TCP client (mock of the source instance of passt)
$ ./source 127.0.0.1 9999 9998 /tmp/repair.sock
* The client sends a test string, and waits for a connection from passt-repair
# passt-repair /tmp/repair.sock
* The socket is now in repair mode, and `source` dumps sequences, then exits
sending sequence: 3244673313
receiving sequence: 2250449386
* Continue the connection on the target side, restarting from those sequences
$ ./target 127.0.0.1 9999 9998 /tmp/repair.sock 3244673313 2250449386
* The target side now waits for a connection from passt-repair
# passt-repair /tmp/repair.sock
* The target side asks passt-repair to switch the socket to repair mode, sets up
the TCP sequences, then asks passt-repair to clear repair mode, and sends a
test string to the server

View file

@ -1,92 +0,0 @@
// SPDX-License-Identifier: GPL-2.0-or-later
/* PASST - Plug A Simple Socket Transport
* for qemu/UNIX domain socket mode
*
* PASTA - Pack A Subtle Tap Abstraction
* for network namespace/tap device mode
*
* doc/migration/source.c - Mock of TCP migration source, use with passt-repair
*
* Copyright (c) 2025 Red Hat GmbH
* Author: Stefano Brivio <sbrivio@redhat.com>
*/
#include <arpa/inet.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <sys/un.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <limits.h>
#include <unistd.h>
#include <netdb.h>
#include <netinet/tcp.h>
/**
 * die_perror() - Report a failed call via perror() and exit with status 1
 * @what:	Description of the failed operation
 */
static void die_perror(const char *what)
{
	perror(what);
	exit(1);
}

/**
 * main() - Mock of the migration source: connect, send, dump TCP sequences
 * @argc:	Argument count, must be 5
 * @argv:	DST_ADDR DST_PORT SRC_PORT HELPER_PATH
 *
 * Connect to the server from SRC_PORT, send a test string, hand the socket to
 * the helper connecting on HELPER_PATH so that it can switch it to repair
 * mode, then print sending and receiving sequences as "SSEQ RSEQ\n".
 *
 * Return: 0 on success, -1 on usage error, 1 (via exit) on any failure
 */
int main(int argc, char **argv)
{
	/* The source port is bound via a struct sockaddr_in: resolve IPv4 only */
	struct addrinfo hints = { 0, AF_INET, SOCK_STREAM, 0, 0,
				  NULL, NULL, NULL };
	struct sockaddr_un a_helper = { AF_UNIX, { 0 } };
	struct sockaddr_in a = { 0 };
	int seq, s, s_listen, s_helper, rc;
	int8_t cmd, reply;
	struct iovec iov = { &cmd, sizeof(cmd) };
	char buf[CMSG_SPACE(sizeof(int))];
	struct msghdr msg = { NULL, 0, &iov, 1, buf, sizeof(buf), 0 };
	struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);
	socklen_t seqlen = sizeof(int);
	struct addrinfo *r;

	/* Check arguments before touching any argv[] entry */
	if (argc != 5) {
		fprintf(stderr, "%s DST_ADDR DST_PORT SRC_PORT HELPER_PATH\n",
			argv[0]);
		return -1;
	}

	a.sin_family = AF_INET;
	a.sin_port = htons(atoi(argv[3]));

	/* Bounded copy: sun_path is a small, fixed-size array */
	rc = snprintf(a_helper.sun_path, sizeof(a_helper.sun_path), "%s",
		      argv[4]);
	if (rc < 0 || (size_t)rc >= sizeof(a_helper.sun_path)) {
		fprintf(stderr, "Helper socket path too long: %s\n", argv[4]);
		return 1;
	}

	if ((rc = getaddrinfo(argv[1], argv[2], &hints, &r))) {
		fprintf(stderr, "getaddrinfo(): %s\n", gai_strerror(rc));
		return 1;
	}

	/* Connect socket to server and send some data */
	s = socket(r->ai_family, SOCK_STREAM, IPPROTO_TCP);
	if (s < 0)
		die_perror("socket");
	if (setsockopt(s, SOL_SOCKET, SO_REUSEADDR, &((int){ 1 }), sizeof(int)))
		die_perror("setsockopt SO_REUSEADDR");
	if (bind(s, (struct sockaddr *)&a, sizeof(a)))
		die_perror("bind");
	if (connect(s, r->ai_addr, r->ai_addrlen))
		die_perror("connect");
	freeaddrinfo(r);

	/* sizeof() includes the terminating NUL byte, which is sent as well */
	if (send(s, "before migration\n", sizeof("before migration\n"), 0) < 0)
		die_perror("send");

	/* Wait for helper */
	s_listen = socket(AF_UNIX, SOCK_STREAM, 0);
	if (s_listen < 0)
		die_perror("socket (helper)");
	unlink(a_helper.sun_path);	/* Stale path might not exist: ignore */
	if (bind(s_listen, (struct sockaddr *)&a_helper, sizeof(a_helper)))
		die_perror("bind (helper)");
	if (listen(s_listen, 1))
		die_perror("listen (helper)");
	s_helper = accept(s_listen, NULL, NULL);
	if (s_helper < 0)
		die_perror("accept (helper)");
	close(s_listen);	/* One helper connection is all we need */

	/* Set up message for helper, with socket */
	cmsg->cmsg_level = SOL_SOCKET;
	cmsg->cmsg_type = SCM_RIGHTS;
	cmsg->cmsg_len = CMSG_LEN(sizeof(int));
	memcpy(CMSG_DATA(cmsg), &s, sizeof(s));

	/* Send command to helper: turn repair mode on, wait for reply */
	cmd = TCP_REPAIR_ON;
	if (sendmsg(s_helper, &msg, 0) < 0)
		die_perror("sendmsg (helper)");
	/* A closed connection (0 bytes) means the helper gave up */
	if (recv(s_helper, &reply, 1, 0) != 1) {
		fprintf(stderr, "No reply from helper\n");
		return 1;
	}

	/* Terminate helper */
	close(s_helper);

	/* Get sending sequence */
	seq = TCP_SEND_QUEUE;
	if (setsockopt(s, SOL_TCP, TCP_REPAIR_QUEUE, &seq, sizeof(seq)))
		die_perror("setsockopt TCP_REPAIR_QUEUE (send)");
	if (getsockopt(s, SOL_TCP, TCP_QUEUE_SEQ, &seq, &seqlen))
		die_perror("getsockopt TCP_QUEUE_SEQ (send)");
	fprintf(stdout, "%u ", (unsigned)seq);

	/* Get receiving sequence */
	seq = TCP_RECV_QUEUE;
	if (setsockopt(s, SOL_TCP, TCP_REPAIR_QUEUE, &seq, sizeof(seq)))
		die_perror("setsockopt TCP_REPAIR_QUEUE (receive)");
	if (getsockopt(s, SOL_TCP, TCP_QUEUE_SEQ, &seq, &seqlen))
		die_perror("getsockopt TCP_QUEUE_SEQ (receive)");
	fprintf(stdout, "%u\n", (unsigned)seq);

	/* The connection socket is intentionally left open until exit */
	return 0;
}

View file

@ -1,102 +0,0 @@
// SPDX-License-Identifier: GPL-2.0-or-later
/* PASST - Plug A Simple Socket Transport
* for qemu/UNIX domain socket mode
*
* PASTA - Pack A Subtle Tap Abstraction
* for network namespace/tap device mode
*
* doc/migration/target.c - Mock of TCP migration target, use with passt-repair
*
* Copyright (c) 2025 Red Hat GmbH
* Author: Stefano Brivio <sbrivio@redhat.com>
*/
#include <arpa/inet.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <sys/un.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <limits.h>
#include <unistd.h>
#include <netdb.h>
#include <netinet/tcp.h>
/**
 * die_perror() - Report a failed call via perror() and exit with status 1
 * @what:	Description of the failed operation
 */
static void die_perror(const char *what)
{
	perror(what);
	exit(1);
}

/**
 * parse_seq() - Parse a 32-bit TCP sequence number given in decimal
 * @arg:	String to parse
 * @seq:	Parsed value, set on success only
 *
 * Return: 0 on success, -1 if @arg is not a number fitting 32 bits
 */
static int parse_seq(const char *arg, uint32_t *seq)
{
	unsigned long v;
	char *end;

	v = strtoul(arg, &end, 10);
	if (end == arg || *end || v > UINT32_MAX)
		return -1;

	*seq = v;
	return 0;
}

/**
 * main() - Mock of the migration target: restore sequences, resume connection
 * @argc:	Argument count, must be 7
 * @argv:	DST_ADDR DST_PORT SRC_PORT HELPER_PATH SSEQ RSEQ
 *
 * Bind a socket to SRC_PORT, have the helper connecting on HELPER_PATH switch
 * it to repair mode, set sending and receiving sequences, connect(), have the
 * helper switch repair mode off, then send a test string.
 *
 * Return: 0 on success, -1 on usage error, 1 (possibly via exit) on failure
 */
int main(int argc, char **argv)
{
	/* The source port is bound via a struct sockaddr_in: resolve IPv4 only */
	struct addrinfo hints = { 0, AF_INET, SOCK_STREAM, 0, 0,
				  NULL, NULL, NULL };
	struct sockaddr_un a_helper = { AF_UNIX, { 0 } };
	struct sockaddr_in a = { 0 };
	int s, s_listen, s_helper, queue, rc;
	uint32_t sseq, rseq;
	int8_t cmd, reply;
	struct iovec iov = { &cmd, sizeof(cmd) };
	char buf[CMSG_SPACE(sizeof(int))];
	struct msghdr msg = { NULL, 0, &iov, 1, buf, sizeof(buf), 0 };
	struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);
	struct addrinfo *r;

	/* Check arguments before touching any argv[] entry */
	if (argc != 7) {
		fprintf(stderr,
			"%s DST_ADDR DST_PORT SRC_PORT HELPER_PATH SSEQ RSEQ\n",
			argv[0]);
		return -1;
	}

	a.sin_family = AF_INET;
	a.sin_port = htons(atoi(argv[3]));

	/* Bounded copy: sun_path is a small, fixed-size array */
	rc = snprintf(a_helper.sun_path, sizeof(a_helper.sun_path), "%s",
		      argv[4]);
	if (rc < 0 || (size_t)rc >= sizeof(a_helper.sun_path)) {
		fprintf(stderr, "Helper socket path too long: %s\n", argv[4]);
		return 1;
	}

	/* Sequences use the full 32-bit range: atoi() would overflow int */
	if (parse_seq(argv[5], &sseq) || parse_seq(argv[6], &rseq)) {
		fprintf(stderr, "Invalid sequence number\n");
		return 1;
	}

	if ((rc = getaddrinfo(argv[1], argv[2], &hints, &r))) {
		fprintf(stderr, "getaddrinfo(): %s\n", gai_strerror(rc));
		return 1;
	}

	/* Prepare socket, bind to source port */
	s = socket(r->ai_family, SOCK_STREAM, IPPROTO_TCP);
	if (s < 0)
		die_perror("socket");
	if (setsockopt(s, SOL_SOCKET, SO_REUSEADDR, &((int){ 1 }), sizeof(int)))
		die_perror("setsockopt SO_REUSEADDR");
	if (bind(s, (struct sockaddr *)&a, sizeof(a)))
		die_perror("bind");

	/* Wait for helper */
	s_listen = socket(AF_UNIX, SOCK_STREAM, 0);
	if (s_listen < 0)
		die_perror("socket (helper)");
	unlink(a_helper.sun_path);	/* Stale path might not exist: ignore */
	if (bind(s_listen, (struct sockaddr *)&a_helper, sizeof(a_helper)))
		die_perror("bind (helper)");
	if (listen(s_listen, 1))
		die_perror("listen (helper)");
	s_helper = accept(s_listen, NULL, NULL);
	if (s_helper < 0)
		die_perror("accept (helper)");
	close(s_listen);	/* One helper connection is all we need */

	/* Set up message for helper, with socket */
	cmsg->cmsg_level = SOL_SOCKET;
	cmsg->cmsg_type = SCM_RIGHTS;
	cmsg->cmsg_len = CMSG_LEN(sizeof(int));
	memcpy(CMSG_DATA(cmsg), &s, sizeof(s));

	/* Send command to helper: turn repair mode on, wait for reply */
	cmd = TCP_REPAIR_ON;
	if (sendmsg(s_helper, &msg, 0) < 0)
		die_perror("sendmsg (helper, repair on)");
	/* A closed connection (0 bytes) means the helper gave up */
	if (recv(s_helper, &reply, 1, 0) != 1) {
		fprintf(stderr, "No reply from helper\n");
		return 1;
	}

	/* Set sending sequence */
	queue = TCP_SEND_QUEUE;
	if (setsockopt(s, SOL_TCP, TCP_REPAIR_QUEUE, &queue, sizeof(queue)))
		die_perror("setsockopt TCP_REPAIR_QUEUE (send)");
	if (setsockopt(s, SOL_TCP, TCP_QUEUE_SEQ, &sseq, sizeof(sseq)))
		die_perror("setsockopt TCP_QUEUE_SEQ (send)");

	/* Set receiving sequence */
	queue = TCP_RECV_QUEUE;
	if (setsockopt(s, SOL_TCP, TCP_REPAIR_QUEUE, &queue, sizeof(queue)))
		die_perror("setsockopt TCP_REPAIR_QUEUE (receive)");
	if (setsockopt(s, SOL_TCP, TCP_QUEUE_SEQ, &rseq, sizeof(rseq)))
		die_perror("setsockopt TCP_QUEUE_SEQ (receive)");

	/* Connect setting kernel state only, without actual SYN / handshake */
	if (connect(s, r->ai_addr, r->ai_addrlen))
		die_perror("connect");
	freeaddrinfo(r);

	/* Send command to helper: turn repair mode off, wait for reply */
	cmd = TCP_REPAIR_OFF;
	if (sendmsg(s_helper, &msg, 0) < 0)
		die_perror("sendmsg (helper, repair off)");
	if (recv(s_helper, &reply, 1, 0) != 1) {
		fprintf(stderr, "No reply from helper\n");
		return 1;
	}

	/* Terminate helper */
	close(s_helper);

	/* Send some more data: sizeof() includes the terminating NUL byte */
	if (send(s, "after migration\n", sizeof("after migration\n"), 0) < 0)
		die_perror("send");

	return 0;
}

View file

@ -1,4 +1,3 @@
/listen-vs-repair
/reuseaddr-priority
/recv-zero
/udp-close-dup

View file

@ -3,8 +3,8 @@
# Copyright Red Hat
# Author: David Gibson <david@gibson.dropbear.id.au>
TARGETS = reuseaddr-priority recv-zero udp-close-dup listen-vs-repair
SRCS = reuseaddr-priority.c recv-zero.c udp-close-dup.c listen-vs-repair.c
TARGETS = reuseaddr-priority recv-zero udp-close-dup
SRCS = reuseaddr-priority.c recv-zero.c udp-close-dup.c
CFLAGS = -Wall
all: cppcheck clang-tidy $(TARGETS:%=check-%)

View file

@ -15,7 +15,6 @@
#include <stdio.h>
#include <stdlib.h>
__attribute__((format(printf, 1, 2), noreturn))
static inline void die(const char *fmt, ...)
{
va_list ap;

View file

@ -1,128 +0,0 @@
// SPDX-License-Identifier: GPL-2.0-or-later
/* listen-vs-repair.c
*
* Do listening sockets have address conflicts with sockets under repair
* ====================================================================
*
* When we accept() an incoming connection the accept()ed socket will have the
* same local address as the listening socket. This can be a complication on
* migration. On the migration target we've already set up listening sockets
* according to the command line. However to restore connections that we're
* migrating in we need to bind the new sockets to the same address, which would
* be an address conflict on the face of it. This test program verifies that
* enabling repair mode before bind() correctly suppresses that conflict.
*
* Copyright Red Hat
* Author: David Gibson <david@gibson.dropbear.id.au>
*/
/* NOLINTNEXTLINE(bugprone-reserved-identifier,cert-dcl37-c,cert-dcl51-cpp) */
#define _GNU_SOURCE
#include <arpa/inet.h>
#include <errno.h>
#include <linux/netlink.h>
#include <linux/rtnetlink.h>
#include <net/if.h>
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <sched.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include "common.h"
#define PORT 13256U
#define CPORT 13257U
/* 127.0.0.1:PORT */
static const struct sockaddr_in addr = SOCKADDR_INIT(INADDR_LOOPBACK, PORT);
/* 127.0.0.1:CPORT */
static const struct sockaddr_in caddr = SOCKADDR_INIT(INADDR_LOOPBACK, CPORT);
/**
 * net_sandbox() - Isolate ourselves in new user and network namespaces
 *
 * Detach into fresh namespaces, then send a single RTM_NEWLINK request setting
 * IFF_UP on interface index 1 (lo). Any failure is fatal via die().
 */
static void net_sandbox(void)
{
	/* NOLINTNEXTLINE(altera-struct-pack-align) */
	struct req_t {
		struct nlmsghdr nlh;
		struct ifinfomsg ifm;
	} __attribute__((packed));
	const struct req_t lo_up = {
		.nlh = {
			.nlmsg_len	= sizeof(lo_up),
			.nlmsg_type	= RTM_NEWLINK,
			.nlmsg_flags	= NLM_F_REQUEST,
			.nlmsg_seq	= 1,
		},
		.ifm = {
			.ifi_family	= AF_UNSPEC,
			.ifi_index	= 1,
			.ifi_flags	= IFF_UP,
			.ifi_change	= IFF_UP,
		},
	};
	int fd;

	if (unshare(CLONE_NEWUSER | CLONE_NEWNET) != 0)
		die("unshare(): %s\n", strerror(errno));

	/* Bring up lo in the new netns */
	fd = socket(AF_NETLINK, SOCK_RAW | SOCK_CLOEXEC, NETLINK_ROUTE);
	if (fd < 0)
		die("Can't create netlink socket: %s\n", strerror(errno));

	if (send(fd, &lo_up, sizeof(lo_up), 0) < 0)
		die("Netlink send(): %s\n", strerror(errno));

	close(fd);
}
/**
 * check() - Bind a socket in repair mode to an address already listened on
 *
 * Set up a listening socket on addr, then create a second socket, enable
 * repair mode on it *before* binding it to the same addr, connect it to caddr
 * and switch repair mode off again. Any failing step is fatal via die().
 */
static void check(void)
{
	int listener, repaired, mode;

	/* First socket: plain listener on addr */
	listener = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP);
	if (listener < 0)
		die("socket() 1: %s\n", strerror(errno));

	if (bind(listener, (struct sockaddr *)&addr, sizeof(addr)) != 0)
		die("bind() 1: %s\n", strerror(errno));

	if (listen(listener, 0) != 0)
		die("listen(): %s\n", strerror(errno));

	/* Second socket: repair mode on, then the very same local address */
	repaired = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP);
	if (repaired < 0)
		die("socket() 2: %s\n", strerror(errno));

	mode = TCP_REPAIR_ON;
	if (setsockopt(repaired, SOL_TCP, TCP_REPAIR, &mode, sizeof(mode)) != 0)
		die("TCP_REPAIR: %s\n", strerror(errno));

	if (bind(repaired, (struct sockaddr *)&addr, sizeof(addr)) != 0)
		die("bind() 2: %s\n", strerror(errno));

	if (connect(repaired, (struct sockaddr *)&caddr, sizeof(caddr)) != 0)
		die("connect(): %s\n", strerror(errno));

	mode = TCP_REPAIR_OFF_NO_WP;
	if (setsockopt(repaired, SOL_TCP, TCP_REPAIR, &mode, sizeof(mode)) != 0)
		die("TCP_REPAIR: %s\n", strerror(errno));

	close(listener);
	close(repaired);
}
/**
 * main() - Run the listen-vs-repair check inside a network sandbox
 * @argc:	Argument count (unused)
 * @argv:	Argument vector (unused)
 *
 * Return: 0, failures terminate the program from within the helpers
 */
int main(int argc, char *argv[])
{
	(void)argv;
	(void)argc;

	net_sandbox();
	check();

	printf("Repair mode appears to properly suppress conflicts with listening sockets\n");

	return 0;
}

View file

@ -46,13 +46,13 @@
/* Different cases for receiving socket configuration */
enum sock_type {
/* Socket is bound to 0.0.0.0:DSTPORT and not connected */
SOCK_BOUND_ANY,
SOCK_BOUND_ANY = 0,
/* Socket is bound to 127.0.0.1:DSTPORT and not connected */
SOCK_BOUND_LO,
SOCK_BOUND_LO = 1,
/* Socket is bound to 0.0.0.0:DSTPORT and connected to 127.0.0.1:SRCPORT */
SOCK_CONNECTED,
SOCK_CONNECTED = 2,
NUM_SOCK_TYPES,
};

View file

@ -22,8 +22,8 @@ enum epoll_type {
EPOLL_TYPE_TCP_TIMER,
/* UDP "listening" sockets */
EPOLL_TYPE_UDP_LISTEN,
/* UDP socket for a specific flow */
EPOLL_TYPE_UDP,
/* UDP socket for replies on a specific flow */
EPOLL_TYPE_UDP_REPLY,
/* ICMP/ICMPv6 ping sockets */
EPOLL_TYPE_PING,
/* inotify fd watching for end of netns (pasta) */
@ -36,14 +36,6 @@ enum epoll_type {
EPOLL_TYPE_TAP_PASST,
/* socket listening for qemu socket connections */
EPOLL_TYPE_TAP_LISTEN,
/* vhost-user command socket */
EPOLL_TYPE_VHOST_CMD,
/* vhost-user kick event socket */
EPOLL_TYPE_VHOST_KICK,
/* TCP_REPAIR helper listening socket */
EPOLL_TYPE_REPAIR_LISTEN,
/* TCP_REPAIR helper socket */
EPOLL_TYPE_REPAIR,
EPOLL_NUM_TYPES,
};

441
flow.c
View file

@ -19,7 +19,6 @@
#include "inany.h"
#include "flow.h"
#include "flow_table.h"
#include "repair.h"
const char *flow_state_str[] = {
[FLOW_STATE_FREE] = "FREE",
@ -53,13 +52,6 @@ const uint8_t flow_proto[] = {
static_assert(ARRAY_SIZE(flow_proto) == FLOW_NUM_TYPES,
"flow_proto[] doesn't match enum flow_type");
/**
* foreach_established_tcp_flow() - Step through each established TCP flow
* @flow: Takes values of pointer to each matching flow
*
* Wraps flow_foreach_of_type() for FLOW_TCP, skipping flows for which
* tcp_flow_is_established() is false. The expansion ends with a dangling
* "else": the statement or block following the macro is the loop body.
*/
#define foreach_established_tcp_flow(flow) \
flow_foreach_of_type((flow), FLOW_TCP) \
if (!tcp_flow_is_established(&(flow)->tcp)) \
/* NOLINTNEXTLINE(bugprone-branch-clone) */ \
continue; \
else
/* Global Flow Table */
/**
@ -81,7 +73,7 @@ static_assert(ARRAY_SIZE(flow_proto) == FLOW_NUM_TYPES,
*
* Free cluster list
* flow_first_free gives the index of the first (lowest index) free cluster.
* Each free cluster has the index of the next free cluster, or FLOW_MAX if
* Each free cluster has the index of the next free cluster, or MAX_FLOW if
* it is the last free cluster. Together these form a linked list of free
* clusters, in strictly increasing order of index.
*
@ -267,13 +259,11 @@ int flowside_connect(const struct ctx *c, int s,
/** flow_log_ - Log flow-related message
* @f: flow the message is related to
* @newline: Append newline at the end of the message, if missing
* @pri: Log priority
* @fmt: Format string
* @...: printf-arguments
*/
void flow_log_(const struct flow_common *f, bool newline, int pri,
const char *fmt, ...)
void flow_log_(const struct flow_common *f, int pri, const char *fmt, ...)
{
const char *type_or_state;
char msg[BUFSIZ];
@ -289,7 +279,7 @@ void flow_log_(const struct flow_common *f, bool newline, int pri,
else
type_or_state = FLOW_TYPE(f);
logmsg(newline, false, pri,
logmsg(true, false, pri,
"Flow %u (%s): %s", flow_idx(f), type_or_state, msg);
}
@ -309,7 +299,7 @@ void flow_log_details_(const struct flow_common *f, int pri,
const struct flowside *tgt = &f->side[TGTSIDE];
if (state >= FLOW_STATE_TGT)
flow_log_(f, true, pri,
flow_log_(f, pri,
"%s [%s]:%hu -> [%s]:%hu => %s [%s]:%hu -> [%s]:%hu",
pif_name(f->pif[INISIDE]),
inany_ntop(&ini->eaddr, estr0, sizeof(estr0)),
@ -322,7 +312,7 @@ void flow_log_details_(const struct flow_common *f, int pri,
inany_ntop(&tgt->eaddr, estr1, sizeof(estr1)),
tgt->eport);
else if (state >= FLOW_STATE_INI)
flow_log_(f, true, pri, "%s [%s]:%hu -> [%s]:%hu => ?",
flow_log_(f, pri, "%s [%s]:%hu -> [%s]:%hu => ?",
pif_name(f->pif[INISIDE]),
inany_ntop(&ini->eaddr, estr0, sizeof(estr0)),
ini->eport,
@ -343,7 +333,7 @@ static void flow_set_state(struct flow_common *f, enum flow_state state)
ASSERT(oldstate < FLOW_NUM_STATES);
f->state = state;
flow_log_(f, true, LOG_DEBUG, "%s -> %s", flow_state_str[oldstate],
flow_log_(f, LOG_DEBUG, "%s -> %s", flow_state_str[oldstate],
FLOW_STATE(f));
flow_log_details_(f, LOG_DEBUG, MAX(state, oldstate));
@ -396,27 +386,18 @@ const struct flowside *flow_initiate_af(union flow *flow, uint8_t pif,
* @flow: Flow to change state
* @pif: pif of the initiating side
* @ssa: Source socket address
* @daddr: Destination address (may be NULL)
* @dport: Destination port
*
* Return: pointer to the initiating flowside information
*/
struct flowside *flow_initiate_sa(union flow *flow, uint8_t pif,
const union sockaddr_inany *ssa,
const union inany_addr *daddr,
in_port_t dport)
const struct flowside *flow_initiate_sa(union flow *flow, uint8_t pif,
const union sockaddr_inany *ssa,
in_port_t dport)
{
struct flowside *ini = &flow->f.side[INISIDE];
if (inany_from_sockaddr(&ini->eaddr, &ini->eport, ssa) < 0) {
char str[SOCKADDR_STRLEN];
ASSERT_WITH_MSG(0, "Bad socket address %s",
sockaddr_ntop(ssa, str, sizeof(str)));
}
if (daddr)
ini->oaddr = *daddr;
else if (inany_v4(&ini->eaddr))
inany_from_sockaddr(&ini->eaddr, &ini->eport, ssa);
if (inany_v4(&ini->eaddr))
ini->oaddr = inany_any4;
else
ini->oaddr = inany_any6;
@ -433,8 +414,8 @@ struct flowside *flow_initiate_sa(union flow *flow, uint8_t pif,
*
* Return: pointer to the target flowside information
*/
struct flowside *flow_target(const struct ctx *c, union flow *flow,
uint8_t proto)
const struct flowside *flow_target(const struct ctx *c, union flow *flow,
uint8_t proto)
{
char estr[INANY_ADDRSTRLEN], fstr[INANY_ADDRSTRLEN];
struct flow_common *f = &flow->f;
@ -480,9 +461,7 @@ struct flowside *flow_target(const struct ctx *c, union flow *flow,
/**
* flow_set_type() - Set type and move to TYPED
* @flow: Flow to change state
* @type: New flow type to assign
*
* Return: pointer to the modified flow structure.
* @pif: pif of the initiating side
*/
union flow *flow_set_type(union flow *flow, enum flow_type type)
{
@ -618,7 +597,12 @@ static uint64_t flow_sidx_hash(const struct ctx *c, flow_sidx_t sidx)
const struct flowside *side = &f->side[sidx.sidei];
uint8_t pif = f->pif[sidx.sidei];
ASSERT(pif != PIF_NONE);
/* For the hash table to work, entries must have complete endpoint
* information, and at least a forwarding port.
*/
ASSERT(pif != PIF_NONE && !inany_is_unspecified(&side->eaddr) &&
side->eport != 0 && side->oport != 0);
return flow_hash(c, FLOW_PROTO(f), pif, side);
}
@ -627,7 +611,7 @@ static uint64_t flow_sidx_hash(const struct ctx *c, flow_sidx_t sidx)
* @hash: Raw hash value for flow & side
* @sidx: Flow and side to find bucket for
*
* Return: if @sidx is in the hash table, its current bucket, otherwise a
* Return: If @sidx is in the hash table, its current bucket, otherwise a
* suitable free bucket for it.
*/
static inline unsigned flow_hash_probe_(uint64_t hash, flow_sidx_t sidx)
@ -647,7 +631,7 @@ static inline unsigned flow_hash_probe_(uint64_t hash, flow_sidx_t sidx)
* @c: Execution context
* @sidx: Flow and side to find bucket for
*
* Return: if @sidx is in the hash table, its current bucket, otherwise a
* Return: If @sidx is in the hash table, its current bucket, otherwise a
* suitable free bucket for it.
*/
static inline unsigned flow_hash_probe(const struct ctx *c, flow_sidx_t sidx)
@ -762,30 +746,19 @@ flow_sidx_t flow_lookup_af(const struct ctx *c,
* @proto: Protocol of the flow (IP L4 protocol number)
* @pif: Interface of the flow
* @esa: Socket address of the endpoint
* @oaddr: Our address (may be NULL)
* @oport: Our port number
*
* Return: sidx of the matching flow & side, FLOW_SIDX_NONE if not found
*/
flow_sidx_t flow_lookup_sa(const struct ctx *c, uint8_t proto, uint8_t pif,
const void *esa,
const union inany_addr *oaddr, in_port_t oport)
const void *esa, in_port_t oport)
{
struct flowside side = {
.oport = oport,
};
if (inany_from_sockaddr(&side.eaddr, &side.eport, esa) < 0) {
char str[SOCKADDR_STRLEN];
warn("Flow lookup on bad socket address %s",
sockaddr_ntop(esa, str, sizeof(str)));
return FLOW_SIDX_NONE;
}
if (oaddr)
side.oaddr = *oaddr;
else if (inany_v4(&side.eaddr))
inany_from_sockaddr(&side.eaddr, &side.eport, esa);
if (inany_v4(&side.eaddr))
side.oaddr = inany_any4;
else
side.oaddr = inany_any6;
@ -802,9 +775,8 @@ void flow_defer_handler(const struct ctx *c, const struct timespec *now)
{
struct flow_free_cluster *free_head = NULL;
unsigned *last_next = &flow_first_free;
bool to_free[FLOW_MAX] = { 0 };
bool timer = false;
union flow *flow;
unsigned idx;
if (timespec_diff_ms(now, &flow_timer_run) >= FLOW_TIMER_INTERVAL) {
timer = true;
@ -813,12 +785,49 @@ void flow_defer_handler(const struct ctx *c, const struct timespec *now)
ASSERT(!flow_new_entry); /* Incomplete flow at end of cycle */
/* Check which flows we might need to close first, but don't free them
* yet as it's not safe to do that in the middle of flow_foreach().
*/
flow_foreach(flow) {
for (idx = 0; idx < FLOW_MAX; idx++) {
union flow *flow = &flowtab[idx];
bool closed = false;
switch (flow->f.state) {
case FLOW_STATE_FREE: {
unsigned skip = flow->free.n;
/* First entry of a free cluster must have n >= 1 */
ASSERT(skip);
if (free_head) {
/* Merge into preceding free cluster */
free_head->n += flow->free.n;
flow->free.n = flow->free.next = 0;
} else {
/* New free cluster, add to chain */
free_head = &flow->free;
*last_next = idx;
last_next = &free_head->next;
}
/* Skip remaining empty entries */
idx += skip - 1;
continue;
}
case FLOW_STATE_NEW:
case FLOW_STATE_INI:
case FLOW_STATE_TGT:
case FLOW_STATE_TYPED:
/* Incomplete flow at end of cycle */
ASSERT(false);
break;
case FLOW_STATE_ACTIVE:
/* Nothing to do */
break;
default:
ASSERT(false);
}
switch (flow->f.type) {
case FLOW_TYPE_NONE:
ASSERT(false);
@ -837,7 +846,7 @@ void flow_defer_handler(const struct ctx *c, const struct timespec *now)
closed = icmp_ping_timer(c, &flow->ping, now);
break;
case FLOW_UDP:
closed = udp_flow_defer(c, &flow->udp, now);
closed = udp_flow_defer(&flow->udp);
if (!closed && timer)
closed = udp_flow_timer(c, &flow->udp, now);
break;
@ -846,322 +855,30 @@ void flow_defer_handler(const struct ctx *c, const struct timespec *now)
;
}
to_free[FLOW_IDX(flow)] = closed;
}
/* Second step: actually free the flows */
flow_foreach_slot(flow) {
switch (flow->f.state) {
case FLOW_STATE_FREE: {
unsigned skip = flow->free.n;
/* First entry of a free cluster must have n >= 1 */
ASSERT(skip);
if (closed) {
flow_set_state(&flow->f, FLOW_STATE_FREE);
memset(flow, 0, sizeof(*flow));
if (free_head) {
/* Merge into preceding free cluster */
free_head->n += flow->free.n;
/* Add slot to current free cluster */
ASSERT(idx == FLOW_IDX(free_head) + free_head->n);
free_head->n++;
flow->free.n = flow->free.next = 0;
} else {
/* New free cluster, add to chain */
/* Create new free cluster */
free_head = &flow->free;
*last_next = FLOW_IDX(flow);
free_head->n = 1;
*last_next = idx;
last_next = &free_head->next;
}
/* Skip remaining empty entries */
flow += skip - 1;
continue;
}
case FLOW_STATE_NEW:
case FLOW_STATE_INI:
case FLOW_STATE_TGT:
case FLOW_STATE_TYPED:
/* Incomplete flow at end of cycle */
ASSERT(false);
break;
case FLOW_STATE_ACTIVE:
if (to_free[FLOW_IDX(flow)]) {
flow_set_state(&flow->f, FLOW_STATE_FREE);
memset(flow, 0, sizeof(*flow));
if (free_head) {
/* Add slot to current free cluster */
ASSERT(FLOW_IDX(flow) ==
FLOW_IDX(free_head) + free_head->n);
free_head->n++;
flow->free.n = flow->free.next = 0;
} else {
/* Create new free cluster */
free_head = &flow->free;
free_head->n = 1;
*last_next = FLOW_IDX(flow);
last_next = &free_head->next;
}
} else {
free_head = NULL;
}
break;
default:
ASSERT(false);
} else {
free_head = NULL;
}
}
*last_next = FLOW_MAX;
}
/**
* flow_migrate_source_rollback() - Disable repair mode, return failure
* @c: Execution context
* @bound: No need to roll back flow indices >= @bound
* @ret: Error code to hand back, returned unchanged (callers in this file
* pass negative as well as positive codes)
*
* Calls tcp_flow_repair_off() on established TCP flows with index below @bound,
* then repair_flush(). A failure of either is fatal: die() is called.
*
* Return: @ret
*/
static int flow_migrate_source_rollback(struct ctx *c, unsigned bound, int ret)
{
union flow *flow;
debug("...roll back migration");
foreach_established_tcp_flow(flow) {
/* Traversal goes by increasing index: nothing left below @bound */
if (FLOW_IDX(flow) >= bound)
break;
if (tcp_flow_repair_off(c, &flow->tcp))
die("Failed to roll back TCP_REPAIR mode");
}
if (repair_flush(c))
die("Failed to roll back TCP_REPAIR mode");
return ret;
}
/**
* flow_migrate_need_repair() - Do we need to set repair mode for any flow?
*
* Return: true if repair mode is needed, that is, if there is at least one
* established TCP flow, false otherwise
*/
static bool flow_migrate_need_repair(void)
{
union flow *flow;
foreach_established_tcp_flow(flow)
/* Loop body: the first matching flow is enough to answer */
return true;
return false;
}
/**
* flow_migrate_repair_all() - Turn repair mode on or off for all flows
* @c: Execution context
* @enable: Switch repair mode on if set, off otherwise
*
* Applies to established TCP flows only. On any failure, repair mode is
* switched off again via flow_migrate_source_rollback() for the flows handled
* so far, regardless of @enable.
*
* Return: 0 on success, negative error code on failure
*/
static int flow_migrate_repair_all(struct ctx *c, bool enable)
{
union flow *flow;
int rc;
/* If we don't have a repair helper, there's nothing we can do */
if (c->fd_repair < 0)
return 0;
foreach_established_tcp_flow(flow) {
if (enable)
rc = tcp_flow_repair_on(c, &flow->tcp);
else
rc = tcp_flow_repair_off(c, &flow->tcp);
if (rc) {
debug("Can't %s repair mode: %s",
enable ? "enable" : "disable", strerror_(-rc));
/* Only flows before the failing one need to be rolled back */
return flow_migrate_source_rollback(c, FLOW_IDX(flow),
rc);
}
}
if ((rc = repair_flush(c))) {
debug("Can't %s repair mode: %s",
enable ? "enable" : "disable", strerror_(-rc));
/* The traversal above completed, so @flow is past the last table slot
* and this bound covers every flow
*/
return flow_migrate_source_rollback(c, FLOW_IDX(flow), rc);
}
return 0;
}
/**
* flow_migrate_source_pre() - Prepare flows for migration: enable repair mode
* @c: Execution context
* @stage: Migration stage information (unused)
* @fd: Migration file descriptor (unused)
*
* Return: 0 on success, positive error code on failure
*/
int flow_migrate_source_pre(struct ctx *c, const struct migrate_stage *stage,
int fd)
{
int rc;
(void)stage;
(void)fd;
/* The result of repair_wait() is not checked here: without a helper
* (c->fd_repair < 0), flow_migrate_repair_all() below returns 0 without
* touching any flow
*/
if (flow_migrate_need_repair())
repair_wait(c);
/* flow_migrate_repair_all() returns negative codes: flip the sign */
if ((rc = flow_migrate_repair_all(c, true)))
return -rc;
return 0;
}
/**
 * flow_migrate_source() - Dump all the remaining information and send data
 * @c:		Execution context
 * @stage:	Migration stage information (unused)
 * @fd:		Migration file descriptor
 *
 * Sends, in this order: the number of established TCP flows (32 bits, network
 * order), per-flow data for each of them, then per-flow "extended" data.
 *
 * Return: 0 on success, positive error code on failure
 */
int flow_migrate_source(struct ctx *c, const struct migrate_stage *stage,
			int fd)
{
	uint32_t count = 0;
	bool first = true;
	union flow *flow;
	int rc;

	(void)stage;

	/* If we don't have a repair helper, we can't migrate TCP flows */
	if (c->fd_repair >= 0) {
		foreach_established_tcp_flow(flow)
			count++;
	}

	/* From here on, @count is in network order: zero is zero either way */
	count = htonl(count);
	if (write_all_buf(fd, &count, sizeof(count))) {
		rc = errno;
		err_perror("Can't send flow count (%u)", ntohl(count));
		return flow_migrate_source_rollback(c, FLOW_MAX, rc);
	}

	debug("Sending %u flows", ntohl(count));

	if (!count)
		return 0;

	/* Dump and send information that can be stored in the flow table.
	 *
	 * Limited rollback options here: if we fail to transfer any data (that
	 * is, on the first flow), undo everything and resume. Otherwise, the
	 * stream might now be inconsistent, and we might have closed listening
	 * TCP sockets, so just terminate.
	 */
	foreach_established_tcp_flow(flow) {
		rc = tcp_flow_migrate_source(fd, &flow->tcp);
		if (rc) {
			flow_err(flow, "Can't send data: %s",
				 strerror_(-rc));
			if (!first)
				die("Inconsistent migration state, exiting");

			return flow_migrate_source_rollback(c, FLOW_MAX, -rc);
		}

		first = false;
	}

	/* And then "extended" data (including window data we saved previously):
	 * the target needs to set repair mode on sockets before it can set
	 * this stuff, but it needs sockets (and flows) for that.
	 *
	 * This also closes sockets so that the target can start connecting
	 * theirs: you can't sendmsg() to queues (using the socket) if the
	 * socket is not connected (EPIPE), not even in repair mode. And the
	 * target needs to restore queues now because we're sending the data.
	 *
	 * So, no rollback here, just try as hard as we can. Tolerate per-flow
	 * failures but not if the stream might be inconsistent (reported here
	 * as EIO).
	 */
	foreach_established_tcp_flow(flow) {
		rc = tcp_flow_migrate_source_ext(fd, &flow->tcp);
		if (rc) {
			flow_err(flow, "Can't send extended data: %s",
				 strerror_(-rc));

			if (rc == -EIO)
				die("Inconsistent migration state, exiting");
		}
	}

	return 0;
}
/**
* flow_migrate_target() - Receive flows and insert in flow table
* @c: Execution context
* @stage: Migration stage information (unused)
* @fd: Migration file descriptor
*
* Reads the flow count (32 bits, network order), then per-flow data for each
* flow, then per-flow "extended" data for each flow, in the same order.
*
* Return: 0 on success, positive error code on failure
*/
int flow_migrate_target(struct ctx *c, const struct migrate_stage *stage,
int fd)
{
uint32_t count;
unsigned i;
int rc;
(void)stage;
if (read_all_buf(fd, &count, sizeof(count)))
return errno;
count = ntohl(count);
debug("Receiving %u flows", count);
if (!count)
return 0;
/* repair_wait() and flow_migrate_repair_all() return negative codes */
if ((rc = repair_wait(c)))
return -rc;
if ((rc = flow_migrate_repair_all(c, true)))
return -rc;
/* NOTE(review): return value of repair_flush() is ignored here and
* below, unlike in flow_migrate_repair_all() -- confirm this is intended
*/
repair_flush(c);
/* TODO: flow header with type, instead? */
for (i = 0; i < count; i++) {
rc = tcp_flow_migrate_target(c, fd);
if (rc) {
flow_dbg(FLOW(i), "Migration data failure, abort: %s",
strerror_(-rc));
return -rc;
}
}
repair_flush(c);
/* NOTE(review): indexing flowtab[] with @i presumes the flows received
* above ended up in slots 0 to count - 1 -- confirm against
* tcp_flow_migrate_target()
*/
for (i = 0; i < count; i++) {
rc = tcp_flow_migrate_target_ext(c, &flowtab[i].tcp, fd);
if (rc) {
flow_dbg(FLOW(i), "Migration data failure, abort: %s",
strerror_(-rc));
return -rc;
}
}
return 0;
}
/**
* flow_init() - Initialise flow related data structures
*/

29
flow.h
View file

@ -243,27 +243,18 @@ flow_sidx_t flow_lookup_af(const struct ctx *c,
const void *eaddr, const void *oaddr,
in_port_t eport, in_port_t oport);
flow_sidx_t flow_lookup_sa(const struct ctx *c, uint8_t proto, uint8_t pif,
const void *esa,
const union inany_addr *oaddr, in_port_t oport);
const void *esa, in_port_t oport);
union flow;
void flow_init(void);
void flow_defer_handler(const struct ctx *c, const struct timespec *now);
int flow_migrate_source_early(struct ctx *c, const struct migrate_stage *stage,
int fd);
int flow_migrate_source_pre(struct ctx *c, const struct migrate_stage *stage,
int fd);
int flow_migrate_source(struct ctx *c, const struct migrate_stage *stage,
int fd);
int flow_migrate_target(struct ctx *c, const struct migrate_stage *stage,
int fd);
void flow_log_(const struct flow_common *f, bool newline, int pri,
const char *fmt, ...)
__attribute__((format(printf, 4, 5)));
void flow_log_(const struct flow_common *f, int pri, const char *fmt, ...)
__attribute__((format(printf, 3, 4)));
#define flow_log(f_, pri, ...) flow_log_(&(f_)->f, (pri), __VA_ARGS__)
#define flow_log(f_, pri, ...) flow_log_(&(f_)->f, true, (pri), __VA_ARGS__)
#define flow_dbg(f, ...) flow_log((f), LOG_DEBUG, __VA_ARGS__)
#define flow_err(f, ...) flow_log((f), LOG_ERR, __VA_ARGS__)
@ -273,16 +264,6 @@ void flow_log_(const struct flow_common *f, bool newline, int pri,
flow_dbg((f), __VA_ARGS__); \
} while (0)
#define flow_log_perror_(f, pri, ...) \
do { \
int errno_ = errno; \
flow_log_((f), false, (pri), __VA_ARGS__); \
logmsg(true, true, (pri), ": %s", strerror_(errno_)); \
} while (0)
#define flow_dbg_perror(f_, ...) flow_log_perror_(&(f_)->f, LOG_DEBUG, __VA_ARGS__)
#define flow_perror(f_, ...) flow_log_perror_(&(f_)->f, LOG_ERR, __VA_ARGS__)
void flow_log_details_(const struct flow_common *f, int pri,
enum flow_state state);
#define flow_log_details(f_, pri) \

View file

@ -50,42 +50,6 @@ extern union flow flowtab[];
#define flow_foreach_sidei(sidei_) \
for ((sidei_) = INISIDE; (sidei_) < SIDES; (sidei_)++)
/**
* flow_foreach_slot() - Step through each flow table entry
* @flow: Takes values of pointer to each flow table entry
*
* Includes FREE slots. Starts at the first entry of flowtab and advances one
* entry at a time until the index of @flow reaches FLOW_MAX. The loop body may
* advance @flow further itself, to skip entries.
*/
#define flow_foreach_slot(flow) \
for ((flow) = flowtab; FLOW_IDX(flow) < FLOW_MAX; (flow)++)
/**
* flow_foreach() - Step through each active flow
* @flow: Takes values of pointer to each active flow
*
* For an entry in FREE state, @flow is advanced by free.n - 1, so that the
* increment of the enclosing loop moves past the whole free cluster. Entries
* in any other state than ACTIVE are reported with flow_err() and skipped.
*
* The expansion ends with a dangling "else": the statement or block following
* the macro is the loop body, and runs for ACTIVE entries only.
*/
#define flow_foreach(flow) \
flow_foreach_slot((flow)) \
if ((flow)->f.state == FLOW_STATE_FREE) \
(flow) += (flow)->free.n - 1; \
else if ((flow)->f.state != FLOW_STATE_ACTIVE) { \
flow_err((flow), "Bad flow state during traversal"); \
continue; \
} else
/**
* flow_foreach_of_type() - Step through each active flow of given type
* @flow: Takes values of pointer to each flow
* @type_: Type of flow to traverse
*
* Built on flow_foreach(), skipping flows whose f.type differs from @type_. As
* the expansion ends with a dangling "else", the statement or block following
* the macro is the loop body.
*/
#define flow_foreach_of_type(flow, type_) \
flow_foreach((flow)) \
if ((flow)->f.type != (type_)) \
/* NOLINTNEXTLINE(bugprone-branch-clone) */ \
continue; \
else
/** flow_idx() - Index of flow from common structure
* @f: Common flow fields pointer
*
@ -93,7 +57,6 @@ extern union flow flowtab[];
*/
static inline unsigned flow_idx(const struct flow_common *f)
{
/* NOLINTNEXTLINE(clang-analyzer-security.PointerSub) */
return (union flow *)f - flowtab;
}
@ -140,14 +103,14 @@ static inline uint8_t pif_at_sidx(flow_sidx_t sidx)
/** flowside_at_sidx() - Retrieve a specific flowside
* @sidx: Flow & side index
*
* Return: flowside for the flow & side given by @sidx
* Return: Flowside for the flow & side given by @sidx
*/
static inline const struct flowside *flowside_at_sidx(flow_sidx_t sidx)
{
const union flow *flow = flow_at_sidx(sidx);
if (!flow)
return NULL;
return PIF_NONE;
return &flow->f.side[sidx.sidei];
}
@ -198,16 +161,15 @@ const struct flowside *flow_initiate_af(union flow *flow, uint8_t pif,
sa_family_t af,
const void *saddr, in_port_t sport,
const void *daddr, in_port_t dport);
struct flowside *flow_initiate_sa(union flow *flow, uint8_t pif,
const union sockaddr_inany *ssa,
const union inany_addr *daddr,
in_port_t dport);
const struct flowside *flow_initiate_sa(union flow *flow, uint8_t pif,
const union sockaddr_inany *ssa,
in_port_t dport);
const struct flowside *flow_target_af(union flow *flow, uint8_t pif,
sa_family_t af,
const void *saddr, in_port_t sport,
const void *daddr, in_port_t dport);
struct flowside *flow_target(const struct ctx *c, union flow *flow,
uint8_t proto);
const struct flowside *flow_target(const struct ctx *c, union flow *flow,
uint8_t proto);
union flow *flow_set_type(union flow *flow, enum flow_type type);
#define FLOW_SET_TYPE(flow_, t_, var_) (&flow_set_type((flow_), (t_))->var_)

126
fwd.c
View file

@ -75,8 +75,8 @@ void fwd_probe_ephemeral(void)
if (*end || errno)
goto parse_err;
if (min < 0 || min >= (long)NUM_PORTS ||
max < 0 || max >= (long)NUM_PORTS)
if (min < 0 || min >= NUM_PORTS ||
max < 0 || max >= NUM_PORTS)
goto parse_err;
fwd_ephemeral_min = min;
@ -323,30 +323,6 @@ static bool fwd_guest_accessible(const struct ctx *c,
return fwd_guest_accessible6(c, &addr->a6);
}
/**
 * nat_outbound() - Translate an address for outbound (TAP to HOST) traffic
 * @c:		Execution context
 * @addr:	Address to translate (as seen on TAP interface)
 * @translated:	Resulting address (as seen on HOST interface)
 *
 * This covers address-only mappings: the first matching rule wins, and an
 * address matching none of them is copied unchanged. Anything depending on
 * specific ports or flows is handled elsewhere.
 */
static void nat_outbound(const struct ctx *c, const union inany_addr *addr,
			 union inany_addr *translated)
{
	/* Addresses mapped to the host's loopback */
	if (inany_equals4(addr, &c->ip4.map_host_loopback)) {
		*translated = inany_loopback4;
		return;
	}

	if (inany_equals6(addr, &c->ip6.map_host_loopback)) {
		*translated = inany_loopback6;
		return;
	}

	/* Addresses mapped to the configured guest addresses */
	if (inany_equals4(addr, &c->ip4.map_guest_addr)) {
		*translated = inany_from_v4(c->ip4.addr);
		return;
	}

	if (inany_equals6(addr, &c->ip6.map_guest_addr)) {
		translated->a6 = c->ip6.addr;
		return;
	}

	/* No mapping applies */
	*translated = *addr;
}
/**
* fwd_nat_from_tap() - Determine to forward a flow from the tap interface
* @c: Execution context
@ -366,8 +342,16 @@ uint8_t fwd_nat_from_tap(const struct ctx *c, uint8_t proto,
else if (is_dns_flow(proto, ini) &&
inany_equals6(&ini->oaddr, &c->ip6.dns_match))
tgt->eaddr.a6 = c->ip6.dns_host;
else if (inany_equals4(&ini->oaddr, &c->ip4.map_host_loopback))
tgt->eaddr = inany_loopback4;
else if (inany_equals6(&ini->oaddr, &c->ip6.map_host_loopback))
tgt->eaddr = inany_loopback6;
else if (inany_equals4(&ini->oaddr, &c->ip4.map_guest_addr))
tgt->eaddr = inany_from_v4(c->ip4.addr);
else if (inany_equals6(&ini->oaddr, &c->ip6.map_guest_addr))
tgt->eaddr.a6 = c->ip6.addr;
else
nat_outbound(c, &ini->oaddr, &tgt->eaddr);
tgt->eaddr = ini->oaddr;
tgt->eport = ini->oport;
@ -418,7 +402,7 @@ uint8_t fwd_nat_from_splice(const struct ctx *c, uint8_t proto,
else
tgt->eaddr = inany_loopback6;
/* Preserve the specific loopback address used, but let the kernel pick
/* Preserve the specific loopback adddress used, but let the kernel pick
* a source port on the target side
*/
tgt->oaddr = ini->eaddr;
@ -439,42 +423,6 @@ uint8_t fwd_nat_from_splice(const struct ctx *c, uint8_t proto,
return PIF_HOST;
}
/**
 * nat_inbound() - Translate an address for inbound (HOST to TAP) traffic
 * @c:		Execution context
 * @addr:	Address as seen on the HOST interface (input)
 * @translated:	Address as seen on the TAP interface (output)
 *
 * Purely address-based translation: anything depending on specific ports or
 * flows is dealt with elsewhere. A mapping is only considered if the
 * corresponding map address is configured (not unspecified).
 *
 * Return: true if @translated was set, false if @addr can't be translated
 */
bool nat_inbound(const struct ctx *c, const union inany_addr *addr,
		 union inany_addr *translated)
{
	/* Specifically 127.0.0.1, not 127.0.0.0/8 */
	if (!IN4_IS_ADDR_UNSPECIFIED(&c->ip4.map_host_loopback) &&
	    inany_equals4(addr, &in4addr_loopback)) {
		*translated = inany_from_v4(c->ip4.map_host_loopback);
		return true;
	}

	if (!IN6_IS_ADDR_UNSPECIFIED(&c->ip6.map_host_loopback) &&
	    inany_equals6(addr, &in6addr_loopback)) {
		translated->a6 = c->ip6.map_host_loopback;
		return true;
	}

	if (!IN4_IS_ADDR_UNSPECIFIED(&c->ip4.map_guest_addr) &&
	    inany_equals4(addr, &c->ip4.addr)) {
		*translated = inany_from_v4(c->ip4.map_guest_addr);
		return true;
	}

	if (!IN6_IS_ADDR_UNSPECIFIED(&c->ip6.map_guest_addr) &&
	    inany_equals6(addr, &c->ip6.addr)) {
		translated->a6 = c->ip6.map_guest_addr;
		return true;
	}

	/* No explicit mapping: usable as it is only if the guest can reach it */
	if (!fwd_guest_accessible(c, addr))
		return false;

	*translated = *addr;
	return true;
}
/**
* fwd_nat_from_host() - Determine to forward a flow from the host interface
* @c: Execution context
@ -495,43 +443,41 @@ uint8_t fwd_nat_from_host(const struct ctx *c, uint8_t proto,
else if (proto == IPPROTO_UDP)
tgt->eport += c->udp.fwd_in.delta[tgt->eport];
if (!c->no_splice && inany_is_loopback(&ini->eaddr) &&
if (c->mode == MODE_PASTA && inany_is_loopback(&ini->eaddr) &&
(proto == IPPROTO_TCP || proto == IPPROTO_UDP)) {
/* spliceable */
/* The traffic will go over the guest's 'lo' interface, but by
* default use its external address, so we don't inadvertently
* expose services that listen only on the guest's loopback
* address. That can be overridden by --host-lo-to-ns-lo which
* will instead forward to the loopback address in the guest.
*
* In either case, let the kernel pick the source address to
* match.
/* Preserve the specific loopback address used, but let the
* kernel pick a source port on the target side
*/
if (inany_v4(&ini->eaddr)) {
if (c->host_lo_to_ns_lo)
tgt->eaddr = inany_loopback4;
else
tgt->eaddr = inany_from_v4(c->ip4.addr_seen);
tgt->oaddr = inany_any4;
} else {
if (c->host_lo_to_ns_lo)
tgt->eaddr = inany_loopback6;
else
tgt->eaddr.a6 = c->ip6.addr_seen;
tgt->oaddr = inany_any6;
}
/* Let the kernel pick source port */
tgt->oaddr = ini->eaddr;
tgt->oport = 0;
if (proto == IPPROTO_UDP)
/* But for UDP preserve the source port */
tgt->oport = ini->eport;
if (inany_v4(&ini->eaddr))
tgt->eaddr = inany_loopback4;
else
tgt->eaddr = inany_loopback6;
return PIF_SPLICE;
}
if (!nat_inbound(c, &ini->eaddr, &tgt->oaddr)) {
if (!IN4_IS_ADDR_UNSPECIFIED(&c->ip4.map_host_loopback) &&
inany_equals4(&ini->eaddr, &in4addr_loopback)) {
/* Specifically 127.0.0.1, not 127.0.0.0/8 */
tgt->oaddr = inany_from_v4(c->ip4.map_host_loopback);
} else if (!IN6_IS_ADDR_UNSPECIFIED(&c->ip6.map_host_loopback) &&
inany_equals6(&ini->eaddr, &in6addr_loopback)) {
tgt->oaddr.a6 = c->ip6.map_host_loopback;
} else if (!IN4_IS_ADDR_UNSPECIFIED(&c->ip4.map_guest_addr) &&
inany_equals4(&ini->eaddr, &c->ip4.addr)) {
tgt->oaddr = inany_from_v4(c->ip4.map_guest_addr);
} else if (!IN6_IS_ADDR_UNSPECIFIED(&c->ip6.map_guest_addr) &&
inany_equals6(&ini->eaddr, &c->ip6.addr)) {
tgt->oaddr.a6 = c->ip6.map_guest_addr;
} else if (!fwd_guest_accessible(c, &ini->eaddr)) {
if (inany_v4(&ini->eaddr)) {
if (IN4_IS_ADDR_UNSPECIFIED(&c->ip4.our_tap_addr))
/* No source address we can use */
@ -540,6 +486,8 @@ uint8_t fwd_nat_from_host(const struct ctx *c, uint8_t proto,
} else {
tgt->oaddr.a6 = c->ip6.our_tap_ll;
}
} else {
tgt->oaddr = ini->eaddr;
}
tgt->oport = ini->eport;

5
fwd.h
View file

@ -7,7 +7,6 @@
#ifndef FWD_H
#define FWD_H
union inany_addr;
struct flowside;
/* Number of ports for both TCP and UDP */
@ -27,7 +26,7 @@ enum fwd_ports_mode {
#define PORT_BITMAP_SIZE DIV_ROUND_UP(NUM_PORTS, 8)
/**
* fwd_ports() - Describes port forwarding for one protocol and direction
* fwd_ports - Describes port forwarding for one protocol and direction
* @mode: Overall forwarding mode (all, none, auto, specific ports)
* @scan4: /proc/net fd to scan for IPv4 ports when in AUTO mode
* @scan6: /proc/net fd to scan for IPv6 ports when in AUTO mode
@ -48,8 +47,6 @@ void fwd_scan_ports_udp(struct fwd_ports *fwd, const struct fwd_ports *rev,
const struct fwd_ports *tcp_rev);
void fwd_scan_ports_init(struct ctx *c);
bool nat_inbound(const struct ctx *c, const union inany_addr *addr,
union inany_addr *translated);
uint8_t fwd_nat_from_tap(const struct ctx *c, uint8_t proto,
const struct flowside *ini, struct flowside *tgt);
uint8_t fwd_nat_from_splice(const struct ctx *c, uint8_t proto,

View file

@ -56,7 +56,6 @@ cd ..
make pkgs
scp passt passt.avx2 passt.1 qrap qrap.1 "${USER_HOST}:${BIN}"
scp pasta pasta.avx2 pasta.1 "${USER_HOST}:${BIN}"
scp passt-repair passt-repair.1 "${USER_HOST}:${BIN}"
ssh "${USER_HOST}" "rm -f ${BIN}/*.deb"
ssh "${USER_HOST}" "rm -f ${BIN}/*.rpm"

9
icmp.c
View file

@ -85,7 +85,7 @@ void icmp_sock_handler(const struct ctx *c, union epoll_ref ref)
n = recvfrom(ref.fd, buf, sizeof(buf), 0, &sr.sa, &sl);
if (n < 0) {
flow_perror(pingf, "recvfrom() error");
flow_err(pingf, "recvfrom() error: %s", strerror(errno));
return;
}
@ -150,7 +150,7 @@ unexpected:
static void icmp_ping_close(const struct ctx *c,
const struct icmp_ping_flow *pingf)
{
epoll_del(c, pingf->sock);
epoll_ctl(c->epollfd, EPOLL_CTL_DEL, pingf->sock, NULL);
close(pingf->sock);
flow_hash_remove(c, FLOW_SIDX(pingf, INISIDE));
}
@ -163,7 +163,7 @@ static void icmp_ping_close(const struct ctx *c,
* @saddr: Source address
* @daddr: Destination address
*
* Return: newly opened ping flow, or NULL on failure
* Return: Newly opened ping flow, or NULL on failure
*/
static struct icmp_ping_flow *icmp_ping_new(const struct ctx *c,
sa_family_t af, uint16_t id,
@ -300,7 +300,8 @@ int icmp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af,
pif_sockaddr(c, &sa, &sl, PIF_HOST, &tgt->eaddr, 0);
if (sendto(pingf->sock, pkt, l4len, MSG_NOSIGNAL, &sa.sa, sl) < 0) {
flow_dbg_perror(pingf, "failed to relay request to socket");
flow_dbg(pingf, "failed to relay request to socket: %s",
strerror(errno));
} else {
flow_dbg(pingf,
"echo request to socket, ID: %"PRIu16", seq: %"PRIu16,

22
inany.c
View file

@ -25,7 +25,7 @@ const union inany_addr inany_any4 = INANY_INIT4(IN4ADDR_ANY_INIT);
* @dst: output buffer, minimum INANY_ADDRSTRLEN bytes
* @size: size of buffer at @dst
*
* Return: on success, a non-null pointer to @dst, NULL on failure
* Return: On success, a non-null pointer to @dst, NULL on failure
*/
const char *inany_ntop(const union inany_addr *src, char *dst, socklen_t size)
{
@ -36,23 +36,3 @@ const char *inany_ntop(const union inany_addr *src, char *dst, socklen_t size)
return inet_ntop(AF_INET6, &src->a6, dst, size);
}
/** inany_pton - Parse an IPv[46] address from text format
 * @src:	Text representation of an IPv4 or IPv6 address
 * @dst:	Output buffer, filled with the parsed address
 *
 * IPv4 is attempted first: on a match, the result is stored in v4-mapped
 * form (zero prefix, all-ones marker, then the IPv4 address). Otherwise,
 * @src is parsed as an IPv6 address.
 *
 * Return: on success, 1, if no parseable address is found, 0
 */
int inany_pton(const char *src, union inany_addr *dst)
{
	if (inet_pton(AF_INET, src, &dst->v4mapped.a4) != 0) {
		/* Complete the v4-mapped prefix around the parsed address */
		memset(&dst->v4mapped.zero, 0, sizeof(dst->v4mapped.zero));
		memset(&dst->v4mapped.one, 0xff, sizeof(dst->v4mapped.one));

		return 1;
	}

	return inet_pton(AF_INET6, src, &dst->a6) != 0;
}

28
inany.h
View file

@ -237,30 +237,23 @@ static inline void inany_from_af(union inany_addr *aa,
}
/** inany_from_sockaddr - Extract IPv[46] address and port number from sockaddr
* @dst: Pointer to store IPv[46] address (output)
* @aa: Pointer to store IPv[46] address
* @port: Pointer to store port number, host order
* @addr: Socket address
*
* Return: 0 on success, -1 on error (bad address family)
* @addr: AF_INET or AF_INET6 socket address
*/
static inline int inany_from_sockaddr(union inany_addr *dst, in_port_t *port,
const void *addr)
static inline void inany_from_sockaddr(union inany_addr *aa, in_port_t *port,
const union sockaddr_inany *sa)
{
const union sockaddr_inany *sa = (const union sockaddr_inany *)addr;
if (sa->sa_family == AF_INET6) {
inany_from_af(dst, AF_INET6, &sa->sa6.sin6_addr);
inany_from_af(aa, AF_INET6, &sa->sa6.sin6_addr);
*port = ntohs(sa->sa6.sin6_port);
return 0;
}
if (sa->sa_family == AF_INET) {
inany_from_af(dst, AF_INET, &sa->sa4.sin_addr);
} else if (sa->sa_family == AF_INET) {
inany_from_af(aa, AF_INET, &sa->sa4.sin_addr);
*port = ntohs(sa->sa4.sin_port);
return 0;
} else {
/* Not valid to call with other address families */
ASSERT(0);
}
return -1;
}
/** inany_siphash_feed- Fold IPv[46] address into an in-progress siphash
@ -277,6 +270,5 @@ static inline void inany_siphash_feed(struct siphash_state *state,
#define INANY_ADDRSTRLEN MAX(INET_ADDRSTRLEN, INET6_ADDRSTRLEN)
const char *inany_ntop(const union inany_addr *src, char *dst, socklen_t size);
int inany_pton(const char *src, union inany_addr *dst);
#endif /* INANY_H */

114
iov.c
View file

@ -26,8 +26,7 @@
#include "iov.h"
/**
* iov_skip_bytes() - Skip leading bytes of an IO vector
/* iov_skip_bytes() - Skip leading bytes of an IO vector
* @iov: IO vector
* @n: Number of entries in @iov
* @skip: Number of leading bytes of @iov to skip
@ -57,8 +56,8 @@ size_t iov_skip_bytes(const struct iovec *iov, size_t n,
}
/**
* iov_from_buf() - Copy data from a buffer to an I/O vector (struct iovec)
* efficiently.
* iov_from_buf - Copy data from a buffer to an I/O vector (struct iovec)
* efficiently.
*
* @iov: Pointer to the array of struct iovec describing the
* scatter/gather I/O vector.
@ -67,8 +66,9 @@ size_t iov_skip_bytes(const struct iovec *iov, size_t n,
* @buf: Pointer to the source buffer containing the data to copy.
* @bytes: Total number of bytes to copy from buf to iov.
*
* Return: the number of bytes successfully copied.
* Returns: The number of bytes successfully copied.
*/
/* cppcheck-suppress unusedFunction */
size_t iov_from_buf(const struct iovec *iov, size_t iov_cnt,
size_t offset, const void *buf, size_t bytes)
{
@ -97,8 +97,8 @@ size_t iov_from_buf(const struct iovec *iov, size_t iov_cnt,
}
/**
* iov_to_buf() - Copy data from a scatter/gather I/O vector (struct iovec) to
* a buffer efficiently.
* iov_to_buf - Copy data from a scatter/gather I/O vector (struct iovec) to
* a buffer efficiently.
*
* @iov: Pointer to the array of struct iovec describing the scatter/gather
* I/O vector.
@ -107,7 +107,7 @@ size_t iov_from_buf(const struct iovec *iov, size_t iov_cnt,
* @buf: Pointer to the destination buffer where data will be copied.
* @bytes: Total number of bytes to copy from iov to buf.
*
* Return: the number of bytes successfully copied.
* Returns: The number of bytes successfully copied.
*/
/* cppcheck-suppress unusedFunction */
size_t iov_to_buf(const struct iovec *iov, size_t iov_cnt,
@ -137,14 +137,14 @@ size_t iov_to_buf(const struct iovec *iov, size_t iov_cnt,
}
/**
* iov_size() - Calculate the total size of a scatter/gather I/O vector
* (struct iovec).
* iov_size - Calculate the total size of a scatter/gather I/O vector
* (struct iovec).
*
* @iov: Pointer to the array of struct iovec describing the
* scatter/gather I/O vector.
* @iov_cnt: Number of elements in the iov array.
*
* Return: the total size in bytes.
* Returns: The total size in bytes.
*/
size_t iov_size(const struct iovec *iov, size_t iov_cnt)
{
@ -156,95 +156,3 @@ size_t iov_size(const struct iovec *iov, size_t iov_cnt)
return len;
}
/**
 * iov_tail_prune() - Drop buffers no longer needed from an IOV tail
 * @tail:	IO vector tail (modified)
 *
 * With a large enough offset, the first buffer (or first several buffers) of
 * the underlying IO vector might not contribute any bytes to the tail. Rewrite
 * the tail so that it describes the same logical bytes, but starts from the
 * first buffer actually needed: later operations then don't have to step
 * through the unused entries again.
 *
 * Return: true if the tail still contains any bytes, otherwise false
 */
bool iov_tail_prune(struct iov_tail *tail)
{
	size_t skipped = iov_skip_bytes(tail->iov, tail->cnt,
					tail->off, &tail->off);

	/* Advance past the entries fully covered by the old offset */
	tail->iov += skipped;
	tail->cnt -= skipped;

	return tail->cnt != 0;
}
/**
 * iov_tail_size() - Total number of bytes in an IO vector tail
 * @tail:	IO vector tail (may be pruned)
 *
 * Return: the total size in bytes.
 */
size_t iov_tail_size(struct iov_tail *tail)
{
	size_t total;

	/* Prune first, so that the offset refers to the first entry left */
	iov_tail_prune(tail);
	total = iov_size(tail->iov, tail->cnt);

	return total - tail->off;
}
/**
 * iov_peek_header_() - Get pointer to a header at the start of an IOV tail
 * @tail:	IOV tail to get header from
 * @len:	Length of header to get, in bytes
 * @align:	Required alignment of header, in bytes
 *
 * @tail may be pruned, but will represent the same bytes as before.
 *
 * Return: pointer to the first @len logical bytes of the tail, NULL if that
 *	   overruns the IO vector, is not contiguous or doesn't have the
 *	   requested alignment.
 */
/* cppcheck-suppress [staticFunction,unmatchedSuppression] */
void *iov_peek_header_(struct iov_tail *tail, size_t len, size_t align)
{
	const struct iovec *first;
	size_t end;
	char *hdr;

	if (!iov_tail_prune(tail))
		return NULL;	/* Nothing left */

	end = tail->off + len;
	if (end < tail->off)
		return NULL;	/* Overflow */

	first = &tail->iov[0];
	if (end > first->iov_len)
		return NULL;	/* Not contiguous */

	hdr = (char *)first->iov_base + tail->off;
	if ((uintptr_t)hdr % align)
		return NULL;	/* not aligned */

	return hdr;
}
/**
 * iov_remove_header_() - Remove a header from the start of an IOV tail
 * @tail:	IOV tail to remove header from (modified)
 * @len:	Length of header to remove, in bytes
 * @align:	Required alignment of header, in bytes
 *
 * On success, @tail is updated so that it no longer includes the bytes of the
 * returned header. On failure, the offset of @tail is not advanced.
 *
 * Return: pointer to the first @len logical bytes of the tail, NULL if that
 *	   overruns the IO vector, is not contiguous or doesn't have the
 *	   requested alignment.
 */
void *iov_remove_header_(struct iov_tail *tail, size_t len, size_t align)
{
	char *hdr = iov_peek_header_(tail, len, align);

	/* Consume the header only if we could actually get at it */
	if (hdr)
		tail->off += len;

	return hdr;
}

76
iov.h
View file

@ -28,80 +28,4 @@ size_t iov_from_buf(const struct iovec *iov, size_t iov_cnt,
size_t iov_to_buf(const struct iovec *iov, size_t iov_cnt,
size_t offset, void *buf, size_t bytes);
size_t iov_size(const struct iovec *iov, size_t iov_cnt);
/*
* DOC: Theory of Operation, struct iov_tail
*
* Sometimes a single logical network frame is split across multiple buffers,
* represented by an IO vector (struct iovec[]). We often want to process this
* one header / network layer at a time. So, it's useful to maintain a "tail"
* of the vector representing the parts we haven't yet extracted.
*
* The headers we extract need not line up with buffer boundaries (though we do
* assume they're contiguous within a single buffer for now). So, we could
* represent that tail as another struct iovec[], but that would mean copying
* the whole array of struct iovecs, just so we can adjust the offset and length
* on the first one.
*
* So, instead represent the tail as a pointer into an existing struct iovec[],
* with an explicit offset for where the "tail" starts within it. If we extract
* enough headers that some buffers of the original vector no longer contain
* part of the tail, we (lazily) advance our struct iovec * to the first buffer
* we still need, and adjust the vector length and offset to match.
*/
/**
 * struct iov_tail - IO vector with some leading headers logically removed
 * @iov:	IO vector
 * @cnt:	Number of entries in @iov
 * @off:	Current offset in @iov
 */
struct iov_tail {
	const struct iovec *iov;
	size_t cnt;
	size_t off;
};
/**
* IOV_TAIL() - Create a new IOV tail
* @iov_: IO vector to create tail from
* @cnt_: Length of the IO vector at @iov_
* @off_: Byte offset in the IO vector where the tail begins
*/
#define IOV_TAIL(iov_, cnt_, off_) \
(struct iov_tail){ .iov = (iov_), .cnt = (cnt_), .off = (off_) }
bool iov_tail_prune(struct iov_tail *tail);
size_t iov_tail_size(struct iov_tail *tail);
void *iov_peek_header_(struct iov_tail *tail, size_t len, size_t align);
void *iov_remove_header_(struct iov_tail *tail, size_t len, size_t align);
/**
* IOV_PEEK_HEADER() - Get typed pointer to a header from an IOV tail
* @tail_: IOV tail to get header from
* @type_: Data type of the header
*
* @tail_ may be pruned, but will represent the same bytes as before.
*
* Return: pointer of type (@type_ *) located at the start of @tail_, NULL if
* we can't get a contiguous and aligned pointer.
*/
#define IOV_PEEK_HEADER(tail_, type_) \
((type_ *)(iov_peek_header_((tail_), \
sizeof(type_), __alignof__(type_))))
/**
* IOV_REMOVE_HEADER() - Remove and return typed header from an IOV tail
* @tail_: IOV tail to remove header from (modified)
* @type_: Data type of the header to remove
*
* On success, @tail_ is updated so that it no longer includes the bytes of the
* returned header.
*
* Return: pointer of type (@type_ *) located at the old start of @tail_, NULL
* if we can't get a contiguous and aligned pointer.
*/
#define IOV_REMOVE_HEADER(tail_, type_) \
((type_ *)(iov_remove_header_((tail_), \
sizeof(type_), __alignof__(type_))))
#endif /* IOVEC_H */

46
ip.h
View file

@ -36,14 +36,13 @@
.tos = 0, \
.tot_len = 0, \
.id = 0, \
.frag_off = htons(IP_DF), \
.frag_off = 0, \
.ttl = 0xff, \
.protocol = (proto), \
.saddr = 0, \
.daddr = 0, \
}
#define L2_BUF_IP4_PSUM(proto) ((uint32_t)htons_constant(0x4500) + \
(uint32_t)htons_constant(IP_DF) + \
(uint32_t)htons(0xff00 | (proto)))
@ -91,49 +90,6 @@ struct ipv6_opt_hdr {
*/
} __attribute__((packed)); /* required for some archs */
/**
 * ip6_set_flow_lbl() - Store a flow label into an IPv6 header
 * @ip6h:	Pointer to IPv6 header, updated
 * @flow:	Flow label: only the low 20 bits of this integer are used
 */
static inline void ip6_set_flow_lbl(struct ipv6hdr *ip6h, uint32_t flow)
{
	const uint32_t lbl = flow & 0xfffff;

	/* Most significant bits first: 4 bits, then two full bytes */
	ip6h->flow_lbl[0] = lbl >> 16;
	ip6h->flow_lbl[1] = (lbl >> 8) & 0xff;
	ip6h->flow_lbl[2] = lbl & 0xff;
}
/** ip6_get_flow_lbl() - Read the flow label out of an IPv6 header
 * @ip6h:	Pointer to IPv6 header
 *
 * Return: flow label from @ip6h as an integer (<= 20 bits)
 */
static inline uint32_t ip6_get_flow_lbl(const struct ipv6hdr *ip6h)
{
	/* Only the low 4 bits of the first byte belong to the label */
	uint32_t lbl = ip6h->flow_lbl[0] & 0xf;

	lbl = (lbl << 8) | ip6h->flow_lbl[1];
	lbl = (lbl << 8) | ip6h->flow_lbl[2];

	return lbl;
}
char *ipv6_l4hdr(const struct pool *p, int idx, size_t offset, uint8_t *proto,
size_t *dlen);
/* IPv6 link-local all-nodes multicast address, ff02::1 */
static const struct in6_addr in6addr_ll_all_nodes = {
.s6_addr = {
0xff, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01,
},
};
/* IPv4 Limited Broadcast (RFC 919, Section 7), 255.255.255.255 */
static const struct in_addr in4addr_broadcast = { 0xffffffff };
#ifndef IPV4_MIN_MTU
#define IPV4_MIN_MTU 68
#endif
#ifndef IPV6_MIN_MTU
#define IPV6_MIN_MTU 1280
#endif
#endif /* IP_H */

View file

@ -129,7 +129,7 @@ static void drop_caps_ep_except(uint64_t keep)
* additional layer of protection. Executing this requires
* CAP_SETPCAP, which we will have within our userns.
*
* Note that dropping capabilities from the bounding set limits
* Note that dropping capabilites from the bounding set limits
* exec()ed processes, but does not remove them from the effective or
* permitted sets, so it doesn't reduce our own capabilities.
*/
@ -174,8 +174,8 @@ static void clamp_caps(void)
* Should:
* - drop unneeded capabilities
* - close all open files except for standard streams and the one from --fd
* Mustn't:
* - remove filesystem access (we need to access files during setup)
* Musn't:
* - remove filesytem access (we need to access files during setup)
*/
void isolate_initial(int argc, char **argv)
{
@ -194,7 +194,7 @@ void isolate_initial(int argc, char **argv)
*
* It's debatable whether it's useful to drop caps when we
* retain SETUID and SYS_ADMIN, but we might as well. We drop
* further capabilities in isolate_user() and
* further capabilites in isolate_user() and
* isolate_prefork().
*/
keep = BIT(CAP_NET_BIND_SERVICE) | BIT(CAP_SETUID) | BIT(CAP_SETGID) |
@ -379,21 +379,12 @@ void isolate_postfork(const struct ctx *c)
prctl(PR_SET_DUMPABLE, 0);
switch (c->mode) {
case MODE_PASST:
prog.len = (unsigned short)ARRAY_SIZE(filter_passt);
prog.filter = filter_passt;
break;
case MODE_PASTA:
if (c->mode == MODE_PASTA) {
prog.len = (unsigned short)ARRAY_SIZE(filter_pasta);
prog.filter = filter_pasta;
break;
case MODE_VU:
prog.len = (unsigned short)ARRAY_SIZE(filter_vu);
prog.filter = filter_vu;
break;
default:
ASSERT(0);
} else {
prog.len = (unsigned short)ARRAY_SIZE(filter_passt);
prog.filter = filter_passt;
}
if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) ||

View file

@ -70,7 +70,7 @@ static ssize_t peek_line(struct lineread *lr, bool eof)
* @lr: Line reader state structure
* @line: Place a pointer to the next line in this variable
*
* Return: length of line read on success, 0 on EOF, negative on error
* Return: Length of line read on success, 0 on EOF, negative on error
*/
ssize_t lineread_get(struct lineread *lr, char **line)
{

View file

@ -1,144 +0,0 @@
/* SPDX-License-Identifier: GPL-2.0-or-later
* Copyright Red Hat
*
* Declarations for Linux specific dependencies
*/
#ifndef LINUX_DEP_H
#define LINUX_DEP_H
/* struct tcp_info_linux - Information from Linux TCP_INFO getsockopt()
*
* Largely derived from include/linux/tcp.h in the Linux kernel
*
* Some fields returned by TCP_INFO have been there for ages and are shared with
* BSD. struct tcp_info from netinet/tcp.h has only those fields. There are
* also many Linux-specific extensions to the structure, which are only found
* in the linux/tcp.h version of struct tcp_info.
*
* We want to use some of those extension fields, when available. We can test
* for availability in the runtime kernel using the length returned from
* getsockopt(). However, we won't necessarily be compiled against the same
* kernel headers as we'll run with, so compiling directly against linux/tcp.h
* means wrapping every field access in an #ifdef whose #else does the same
* thing as when the field is missing at runtime. This rapidly gets messy.
*
* Instead we define here struct tcp_info_linux which includes all the Linux
* extensions that we want to use. This is taken from v6.11 of the kernel.
*/
struct tcp_info_linux {
uint8_t tcpi_state;
uint8_t tcpi_ca_state;
uint8_t tcpi_retransmits;
uint8_t tcpi_probes;
uint8_t tcpi_backoff;
uint8_t tcpi_options;
uint8_t tcpi_snd_wscale : 4, tcpi_rcv_wscale : 4;
uint8_t tcpi_delivery_rate_app_limited:1, tcpi_fastopen_client_fail:2;
uint32_t tcpi_rto;
uint32_t tcpi_ato;
uint32_t tcpi_snd_mss;
uint32_t tcpi_rcv_mss;
uint32_t tcpi_unacked;
uint32_t tcpi_sacked;
uint32_t tcpi_lost;
uint32_t tcpi_retrans;
uint32_t tcpi_fackets;
/* Times. */
uint32_t tcpi_last_data_sent;
uint32_t tcpi_last_ack_sent;
uint32_t tcpi_last_data_recv;
uint32_t tcpi_last_ack_recv;
/* Metrics. */
uint32_t tcpi_pmtu;
uint32_t tcpi_rcv_ssthresh;
uint32_t tcpi_rtt;
uint32_t tcpi_rttvar;
uint32_t tcpi_snd_ssthresh;
uint32_t tcpi_snd_cwnd;
uint32_t tcpi_advmss;
uint32_t tcpi_reordering;
uint32_t tcpi_rcv_rtt;
uint32_t tcpi_rcv_space;
uint32_t tcpi_total_retrans;
/* Linux extensions */
uint64_t tcpi_pacing_rate;
uint64_t tcpi_max_pacing_rate;
uint64_t tcpi_bytes_acked; /* RFC4898 tcpEStatsAppHCThruOctetsAcked */
uint64_t tcpi_bytes_received; /* RFC4898 tcpEStatsAppHCThruOctetsReceived */
uint32_t tcpi_segs_out; /* RFC4898 tcpEStatsPerfSegsOut */
uint32_t tcpi_segs_in; /* RFC4898 tcpEStatsPerfSegsIn */
uint32_t tcpi_notsent_bytes;
uint32_t tcpi_min_rtt;
uint32_t tcpi_data_segs_in; /* RFC4898 tcpEStatsDataSegsIn */
uint32_t tcpi_data_segs_out; /* RFC4898 tcpEStatsDataSegsOut */
uint64_t tcpi_delivery_rate;
uint64_t tcpi_busy_time; /* Time (usec) busy sending data */
uint64_t tcpi_rwnd_limited; /* Time (usec) limited by receive window */
uint64_t tcpi_sndbuf_limited; /* Time (usec) limited by send buffer */
uint32_t tcpi_delivered;
uint32_t tcpi_delivered_ce;
uint64_t tcpi_bytes_sent; /* RFC4898 tcpEStatsPerfHCDataOctetsOut */
uint64_t tcpi_bytes_retrans; /* RFC4898 tcpEStatsPerfOctetsRetrans */
uint32_t tcpi_dsack_dups; /* RFC4898 tcpEStatsStackDSACKDups */
uint32_t tcpi_reord_seen; /* reordering events seen */
uint32_t tcpi_rcv_ooopack; /* Out-of-order packets received */
uint32_t tcpi_snd_wnd; /* peer's advertised receive window after
* scaling (bytes)
*/
uint32_t tcpi_rcv_wnd; /* local advertised receive window after
* scaling (bytes)
*/
uint32_t tcpi_rehash; /* PLB or timeout triggered rehash attempts */
uint16_t tcpi_total_rto; /* Total number of RTO timeouts, including
* SYN/SYN-ACK and recurring timeouts.
*/
uint16_t tcpi_total_rto_recoveries; /* Total number of RTO
* recoveries, including any
* unfinished recovery.
*/
uint32_t tcpi_total_rto_time; /* Total time spent in RTO recoveries
* in milliseconds, including any
* unfinished recovery.
*/
};
#include <linux/falloc.h>
#ifndef FALLOC_FL_COLLAPSE_RANGE
#define FALLOC_FL_COLLAPSE_RANGE 0x08
#endif
#include <linux/close_range.h>
/* glibc < 2.34 and musl as of 1.2.5 need these */
#ifndef SYS_close_range
#define SYS_close_range 436
#endif
#ifndef CLOSE_RANGE_UNSHARE /* Linux kernel < 5.9 */
#define CLOSE_RANGE_UNSHARE (1U << 1)
#endif
/* Fallback close_range() wrapper: issues the close_range system call directly
 * through syscall(), passing @first, @last and @flags through unchanged, and
 * returns whatever syscall() returns.
 *
 * NOTE(review): declared weak, presumably so that this definition doesn't
 * clash with one provided by the C library or by another translation unit
 * including this header -- confirm the intended linking behaviour.
 */
__attribute__ ((weak))
/* cppcheck-suppress funcArgNamesDifferent */
int close_range(unsigned int first, unsigned int last, int flags) {
return syscall(SYS_close_range, first, last, flags);
}
#endif /* LINUX_DEP_H */

75
log.c
View file

@ -26,7 +26,6 @@
#include <stdarg.h>
#include <sys/socket.h>
#include "linux_dep.h"
#include "log.h"
#include "util.h"
#include "passt.h"
@ -54,10 +53,9 @@ bool log_stderr = true; /* Not daemonised, no shell spawned */
* logtime() - Get the current time for logging purposes
* @ts: Buffer into which to store the timestamp
*
* Return: pointer to @ts on success, or NULL if there was
* an error retrieving the time
* Return: pointer to @now, or NULL if there was an error retrieving the time
*/
static const struct timespec *logtime(struct timespec *ts)
const struct timespec *logtime(struct timespec *ts)
{
if (clock_gettime(CLOCK_MONOTONIC, ts))
return NULL;
@ -94,6 +92,7 @@ const char *logfile_prefix[] = {
" ", /* LOG_DEBUG */
};
#ifdef FALLOC_FL_COLLAPSE_RANGE
/**
* logfile_rotate_fallocate() - Write header, set log_written after fallocate()
* @fd: Log file descriptor
@ -127,6 +126,7 @@ static void logfile_rotate_fallocate(int fd, const struct timespec *now)
log_written -= log_cut_size;
}
#endif /* FALLOC_FL_COLLAPSE_RANGE */
/**
* logfile_rotate_move() - Fallback: move recent entries toward start, then cut
@ -198,17 +198,21 @@ out:
*
* Return: 0 on success, negative error code on failure
*
* #syscalls fcntl fallocate
* #syscalls fcntl
*
* fallocate() passed as EXTRA_SYSCALL only if FALLOC_FL_COLLAPSE_RANGE is there
*/
static int logfile_rotate(int fd, const struct timespec *now)
{
if (fcntl(fd, F_SETFL, O_RDWR /* Drop O_APPEND: explicit lseek() */))
return -errno;
#ifdef FALLOC_FL_COLLAPSE_RANGE
/* Only for Linux >= 3.15, extent-based ext4 or XFS, glibc >= 2.18 */
if (!fallocate(fd, FALLOC_FL_COLLAPSE_RANGE, 0, log_cut_size))
logfile_rotate_fallocate(fd, now);
else
#endif
logfile_rotate_move(fd, now);
if (fcntl(fd, F_SETFL, O_RDWR | O_APPEND))
@ -250,30 +254,6 @@ static void logfile_write(bool newline, bool cont, int pri,
log_written += n;
}
/**
 * passt_vsyslog() - vsyslog() implementation not using heap memory
 * @newline:	Append newline at the end of the message, if missing
 * @pri:	Facility and level map, same as priority for vsyslog()
 * @format:	Same as vsyslog() format
 * @ap:		Same as vsyslog() ap
 *
 * The message is assembled in a fixed-size buffer on the stack: if it doesn't
 * fit, it's truncated to what fits in BUFSIZ - 1 bytes.
 */
static void passt_vsyslog(bool newline, int pri, const char *format, va_list ap)
{
	size_t fmt_len = strlen(format);
	char buf[BUFSIZ];
	int n, ret;

	/* Send without timestamp, the system logger should add it */
	n = snprintf(buf, BUFSIZ, "<%i> %s: ", pri, log_ident);
	if (n < 0)
		n = 0;
	else if (n >= BUFSIZ)
		n = BUFSIZ - 1;

	/* On truncation, vsnprintf() returns the length the output would have
	 * had: clamp it, so that n never goes past the terminating NUL and we
	 * never write to, or send from, memory outside buf
	 */
	ret = vsnprintf(buf + n, BUFSIZ - n, format, ap);
	if (ret > 0) {
		if (ret >= BUFSIZ - n)
			n = BUFSIZ - 1;
		else
			n += ret;
	}

	/* Look at the last character of @format, not at its NUL terminator,
	 * and add the newline only if there's still room for it
	 */
	if (newline && (!fmt_len || format[fmt_len - 1] != '\n') &&
	    n < BUFSIZ - 1) {
		buf[n++] = '\n';
		buf[n] = '\0';
	}

	if (log_sock >= 0 && send(log_sock, buf, n, 0) != n && log_stderr)
		FPRINTF(stderr, "Failed to send %i bytes to syslog\n", n);
}
/**
* vlogmsg() - Print or send messages to log or output files as configured
* @newline: Append newline at the end of the message, if missing
@ -282,7 +262,6 @@ static void passt_vsyslog(bool newline, int pri, const char *format, va_list ap)
* @format: Message
* @ap: Variable argument list
*/
/* cppcheck-suppress [staticFunction,unmatchedSuppression] */
void vlogmsg(bool newline, bool cont, int pri, const char *format, va_list ap)
{
bool debug_print = (log_mask & LOG_MASK(LOG_DEBUG)) && log_file == -1;
@ -295,7 +274,7 @@ void vlogmsg(bool newline, bool cont, int pri, const char *format, va_list ap)
char timestr[LOGTIME_STRLEN];
logtime_fmt(timestr, sizeof(timestr), now);
FPRINTF(stderr, "%s: ", timestr);
fprintf(stderr, "%s: ", timestr);
}
if ((log_mask & LOG_MASK(LOG_PRI(pri))) || !log_conf_parsed) {
@ -314,7 +293,7 @@ void vlogmsg(bool newline, bool cont, int pri, const char *format, va_list ap)
(log_stderr && (log_mask & LOG_MASK(LOG_PRI(pri))))) {
(void)vfprintf(stderr, format, ap);
if (newline && format[strlen(format)] != '\n')
FPRINTF(stderr, "\n");
fprintf(stderr, "\n");
}
}
@ -348,7 +327,7 @@ void logmsg_perror(int pri, const char *format, ...)
vlogmsg(false, false, pri, format, ap);
va_end(ap);
logmsg(true, true, pri, ": %s", strerror_(errno_copy));
logmsg(true, true, pri, ": %s", strerror(errno_copy));
}
/**
@ -399,11 +378,35 @@ void __setlogmask(int mask)
setlogmask(mask);
}
/**
 * passt_vsyslog() - vsyslog() implementation not using heap memory
 * @newline:	Append newline at the end of the message, if missing
 * @pri:	Facility and level map, same as priority for vsyslog()
 * @format:	Same as vsyslog() format
 * @ap:		Same as vsyslog() ap
 *
 * The message is assembled in a fixed-size buffer on the stack: if it doesn't
 * fit, it's truncated to what fits in BUFSIZ - 1 bytes.
 */
void passt_vsyslog(bool newline, int pri, const char *format, va_list ap)
{
	size_t fmt_len = strlen(format);
	char buf[BUFSIZ];
	int n, ret;

	/* Send without timestamp, the system logger should add it */
	n = snprintf(buf, BUFSIZ, "<%i> %s: ", pri, log_ident);
	if (n < 0)
		n = 0;
	else if (n >= BUFSIZ)
		n = BUFSIZ - 1;

	/* On truncation, vsnprintf() returns the length the output would have
	 * had: clamp it, so that n never goes past the terminating NUL and we
	 * never write to, or send from, memory outside buf
	 */
	ret = vsnprintf(buf + n, BUFSIZ - n, format, ap);
	if (ret > 0) {
		if (ret >= BUFSIZ - n)
			n = BUFSIZ - 1;
		else
			n += ret;
	}

	/* Look at the last character of @format, not at its NUL terminator,
	 * and add the newline only if there's still room for it
	 */
	if (newline && (!fmt_len || format[fmt_len - 1] != '\n') &&
	    n < BUFSIZ - 1) {
		buf[n++] = '\n';
		buf[n] = '\0';
	}

	if (log_sock >= 0 && send(log_sock, buf, n, 0) != n && log_stderr)
		fprintf(stderr, "Failed to send %i bytes to syslog\n", n);
}
/**
* logfile_init() - Open log file and write header with PID, version, path
* @name: Identifier for header: passt or pasta
* @path: Path to log file
* @size: Maximum size of log file: log_cut_size is calculated here
* @size: Maximum size of log file: log_cut_size is calculatd here
*/
void logfile_init(const char *name, const char *path, size_t size)
{
@ -413,7 +416,8 @@ void logfile_init(const char *name, const char *path, size_t size)
if (readlink("/proc/self/exe", exe, PATH_MAX - 1) < 0)
die_perror("Failed to read own /proc/self/exe link");
log_file = output_file_open(path, O_APPEND | O_RDWR);
log_file = open(path, O_CREAT | O_TRUNC | O_APPEND | O_RDWR | O_CLOEXEC,
S_IRUSR | S_IWUSR);
if (log_file == -1)
die_perror("Couldn't open log file %s", path);
@ -429,3 +433,4 @@ void logfile_init(const char *name, const char *path, size_t size)
/* For FALLOC_FL_COLLAPSE_RANGE: VFS block size can be up to one page */
log_cut_size = ROUND_UP(log_size * LOGFILE_CUT_RATIO / 100, PAGE_SIZE);
}

5
log.h
View file

@ -32,13 +32,13 @@ void logmsg_perror(int pri, const char *format, ...)
#define die(...) \
do { \
err(__VA_ARGS__); \
_exit(EXIT_FAILURE); \
exit(EXIT_FAILURE); \
} while (0)
#define die_perror(...) \
do { \
err_perror(__VA_ARGS__); \
_exit(EXIT_FAILURE); \
exit(EXIT_FAILURE); \
} while (0)
extern int log_trace;
@ -55,6 +55,7 @@ void trace_init(int enable);
void __openlog(const char *ident, int option, int facility);
void logfile_init(const char *name, const char *path, size_t size);
void passt_vsyslog(bool newline, int pri, const char *format, va_list ap);
void __setlogmask(int mask);
#endif /* LOG_H */

304
migrate.c
View file

@ -1,304 +0,0 @@
// SPDX-License-Identifier: GPL-2.0-or-later
/* PASST - Plug A Simple Socket Transport
* for qemu/UNIX domain socket mode
*
* PASTA - Pack A Subtle Tap Abstraction
* for network namespace/tap device mode
*
* migrate.c - Migration sections, layout, and routines
*
* Copyright (c) 2025 Red Hat GmbH
* Author: Stefano Brivio <sbrivio@redhat.com>
*/
#include <errno.h>
#include <sys/uio.h>
#include "util.h"
#include "ip.h"
#include "passt.h"
#include "inany.h"
#include "flow.h"
#include "flow_table.h"
#include "migrate.h"
#include "repair.h"
/* Magic identifier for migration data: sent by the source, in network byte
 * order, as first field of the migration header, and checked by the target
 * before any version matching is attempted
 */
#define MIGRATE_MAGIC 0xB1BB1D1B0BB1D1B0
/**
* struct migrate_seen_addrs_v1 - Migratable guest addresses for v1 state stream
* @addr6: Observed guest IPv6 address
* @addr6_ll: Observed guest IPv6 link-local address
* @addr4: Observed guest IPv4 address
* @mac: Observed guest MAC address
*
* This structure is packed, and it is written to and read from the state
* transfer file descriptor as a single raw buffer: field order and sizes
* define the layout of this section of the stream.
*/
struct migrate_seen_addrs_v1 {
struct in6_addr addr6;
struct in6_addr addr6_ll;
struct in_addr addr4;
unsigned char mac[ETH_ALEN];
} __attribute__((packed));
/**
* seen_addrs_source_v1() - Copy and send guest observed addresses from source
* @c: Execution context
* @stage: Migration stage, unused
* @fd: File descriptor for state transfer
*
* Return: 0 on success, positive error code on failure
*/
/* cppcheck-suppress [constParameterCallback, unmatchedSuppression] */
static int seen_addrs_source_v1(struct ctx *c,
const struct migrate_stage *stage, int fd)
{
struct migrate_seen_addrs_v1 addrs = {
.addr6 = c->ip6.addr_seen,
.addr6_ll = c->ip6.addr_ll_seen,
.addr4 = c->ip4.addr_seen,
};
(void)stage;
memcpy(addrs.mac, c->guest_mac, sizeof(addrs.mac));
if (write_all_buf(fd, &addrs, sizeof(addrs)))
return errno;
return 0;
}
/**
* seen_addrs_target_v1() - Receive and use guest observed addresses on target
* @c: Execution context
* @stage: Migration stage, unused
* @fd: File descriptor for state transfer
*
* Return: 0 on success, positive error code on failure
*/
static int seen_addrs_target_v1(struct ctx *c,
const struct migrate_stage *stage, int fd)
{
struct migrate_seen_addrs_v1 addrs;
(void)stage;
if (read_all_buf(fd, &addrs, sizeof(addrs)))
return errno;
c->ip6.addr_seen = addrs.addr6;
c->ip6.addr_ll_seen = addrs.addr6_ll;
c->ip4.addr_seen = addrs.addr4;
memcpy(c->guest_mac, addrs.mac, sizeof(c->guest_mac));
return 0;
}
/* Stages for version 2, in execution order. A side without a callback for a
 * given stage skips it. The table ends with an entry having no name.
 */
static const struct migrate_stage stages_v2[] = {
	{
		.name	= "observed addresses",
		.source	= seen_addrs_source_v1,
		.target	= seen_addrs_target_v1,
	},
	{
		/* Source-only stage: .target is left NULL */
		.name	= "prepare flows",
		.source	= flow_migrate_source_pre,
	},
	{
		.name	= "transfer flows",
		.source	= flow_migrate_source,
		.target	= flow_migrate_target,
	},
	{ .name = NULL },
};
/* Supported encoding versions, from latest (most preferred) to oldest. The
 * table ends with an entry whose version identifier is zero.
 */
static const struct migrate_version versions[] = {
	{ .id = 2, .s = stages_v2 },
	/* v1 was released, but not widely used. It had bad endianness for the
	 * MSS and omitted timestamps, which meant it usually wouldn't work.
	 * Therefore we don't attempt to support compatibility with it.
	 */
	{ .id = 0 },
};

/* Current encoding version: first, that is, most preferred, table entry */
#define CURRENT_VERSION		(&versions[0])
/**
 * migrate_source() - Act as migration source: send our state to the hypervisor
 * @c:		Execution context
 * @fd:		File descriptor for state transfer
 *
 * Send the migration header, then run the source callback of every stage of
 * the current version, in order, stopping at the first failure.
 *
 * Return: 0 on success, positive error code on failure
 */
static int migrate_source(struct ctx *c, int fd)
{
	const struct migrate_version *ver = CURRENT_VERSION;
	const struct migrate_header header = {
		.magic		= htonll_constant(MIGRATE_MAGIC),
		.version	= htonl(ver->id),
		.compat_version	= htonl(ver->id),
	};
	const struct migrate_stage *stage;
	int rc;

	if (write_all_buf(fd, &header, sizeof(header))) {
		/* Save errno right away, logging might alter it */
		rc = errno;
		err("Can't send migration header: %s, abort", strerror_(rc));
		return rc;
	}

	for (stage = ver->s; stage->name; stage++) {
		if (stage->source) {
			debug("Source side migration stage: %s", stage->name);

			rc = stage->source(c, stage, fd);
			if (rc) {
				err("Source migration stage: %s: %s, abort",
				    stage->name, strerror_(rc));
				return rc;
			}
		}
	}

	return 0;
}
/**
 * migrate_target_read_header() - Read header in target
 * @fd:		Descriptor for state transfer
 *
 * Read the migration header, validate the magic and the version fields, then
 * pick the first (most preferred) entry of the versions table that falls
 * between the compatibility version and the version sent by the source.
 *
 * Return: version structure on success, NULL on failure with errno set
 */
static const struct migrate_version *migrate_target_read_header(int fd)
{
	const struct migrate_version *v;
	struct migrate_header h;
	uint32_t id, compat_id;
	uint64_t magic;

	if (read_all_buf(fd, &h, sizeof(h)))
		return NULL;

	magic = ntohll(h.magic);
	id = ntohl(h.version);
	compat_id = ntohl(h.compat_version);

	debug("Source magic: 0x%016" PRIx64 ", version: %u, compat: %u",
	      magic, id, compat_id);

	if (magic != MIGRATE_MAGIC || !id || !compat_id) {
		err("Invalid incoming device state");
		errno = EINVAL;
		return NULL;
	}

	for (v = versions; v->id; v++)
		if (v->id <= id && v->id >= compat_id)
			return v;

	/* Log first, then set errno: logging might alter errno, and the caller
	 * reports whatever value we leave there
	 */
	err("Unsupported device state version: %u", id);
	errno = ENOTSUP;
	return NULL;
}
/**
 * migrate_target() - Act as migration target: load state from the hypervisor
 * @c:		Execution context
 * @fd:		File descriptor for state transfer
 *
 * Read and check the header, then run the target callback of every stage of
 * the selected version, in order, stopping at the first failure.
 *
 * Return: 0 on success, positive error code on failure
 */
static int migrate_target(struct ctx *c, int fd)
{
	const struct migrate_version *ver;
	const struct migrate_stage *stage;
	int rc;

	ver = migrate_target_read_header(fd);
	if (!ver)
		return errno;

	for (stage = ver->s; stage->name; stage++) {
		if (stage->target) {
			debug("Target side migration stage: %s", stage->name);

			rc = stage->target(c, stage, fd);
			if (rc) {
				err("Target migration stage: %s: %s, abort",
				    stage->name, strerror_(rc));
				return rc;
			}
		}
	}

	return 0;
}
/**
 * migrate_init() - Prepare the execution context for migration
 * @c:		Execution context
 *
 * Only the stored migration result is reset here, to -1.
 */
void migrate_init(struct ctx *c)
{
	c->device_state_result = -1;
}
/**
 * migrate_close() - Tear down migration channel and passt-repair connection
 * @c:		Execution context
 *
 * If a state transfer descriptor is open, close it and reset both descriptor
 * and stored result to -1. The repair connection is closed in any case.
 */
void migrate_close(struct ctx *c)
{
	int fd = c->device_state_fd;

	if (fd != -1) {
		debug("Closing migration channel, fd: %d", fd);
		close(fd);

		c->device_state_result = -1;
		c->device_state_fd = -1;
	}

	repair_close(c);
}
/**
 * migrate_request() - Record a request to migrate device state
 * @c:		Execution context
 * @fd:		fd to transfer state
 * @target:	Are we the target of the migration?
 *
 * A channel left open by a previous request is closed first. The new
 * descriptor and direction are only stored here, no transfer is done.
 */
void migrate_request(struct ctx *c, int fd, bool target)
{
	int old_fd = c->device_state_fd;

	debug("Migration requested, fd: %d (was %d)", fd, old_fd);

	if (old_fd != -1)
		migrate_close(c);

	c->migrate_target = target;
	c->device_state_fd = fd;
}
/**
 * migrate_handler() - Run a pending migration, as source or as target
 * @c:		Execution context
 *
 * Do nothing if no state transfer descriptor is set. Otherwise, send or
 * receive the state depending on the requested direction, close the channel,
 * and store the outcome in the context.
 */
void migrate_handler(struct ctx *c)
{
	int fd = c->device_state_fd;
	int rc;

	if (fd < 0)
		return;

	debug("Handling migration request from fd: %d, target: %d",
	      fd, c->migrate_target);

	rc = c->migrate_target ? migrate_target(c, fd) : migrate_source(c, fd);

	/* Closing resets the stored result: set the actual one afterwards */
	migrate_close(c);
	c->device_state_result = rc;
}

View file

@ -1,51 +0,0 @@
/* SPDX-License-Identifier: GPL-2.0-or-later
* Copyright (c) 2025 Red Hat GmbH
* Author: Stefano Brivio <sbrivio@redhat.com>
*/
#ifndef MIGRATE_H
#define MIGRATE_H
/**
* struct migrate_header - Migration header from source
* @magic: 0xB1BB1D1B0BB1D1B0, network order
* @version: Highest known, target aborts if too old, network order
* @compat_version: Lowest version compatible with @version, target aborts
* if too new, network order
*
* The header is packed and transferred as a single raw buffer at the very
* beginning of the state stream. A zero @version or @compat_version is
* rejected by the target as invalid.
*/
struct migrate_header {
uint64_t magic;
uint32_t version;
uint32_t compat_version;
} __attribute__((packed));
/**
* struct migrate_stage - Callbacks and parameters for one stage of migration
* @name: Stage name (for debugging), NULL marks the end of a stage array
* @source: Callback to implement this stage on the source, NULL to skip
* @target: Callback to implement this stage on the target, NULL to skip
*
* Callbacks get the execution context, a pointer to their own stage, and the
* state transfer descriptor. A non-zero return value is treated as an error
* code and aborts the migration.
*/
struct migrate_stage {
const char *name;
int (*source)(struct ctx *c, const struct migrate_stage *stage, int fd);
int (*target)(struct ctx *c, const struct migrate_stage *stage, int fd);
/* Add here separate rollback callbacks if needed */
};
/**
* struct migrate_version - Stages for a particular protocol version
* @id: Version number, host order, 0 marks the end of a version array
* @s: Ordered array of stages, terminated by an entry with NULL name
*/
struct migrate_version {
uint32_t id;
const struct migrate_stage *s;
};
void migrate_init(struct ctx *c);
void migrate_close(struct ctx *c);
void migrate_request(struct ctx *c, int fd, bool target);
void migrate_handler(struct ctx *c);
#endif /* MIGRATE_H */

223
ndp.c
View file

@ -33,8 +33,6 @@
#include "tap.h"
#include "log.h"
#define RT_LIFETIME 65535
#define RS 133
#define RA 134
#define NS 135
@ -160,7 +158,7 @@ struct ndp_ra {
unsigned char var[sizeof(struct opt_mtu) + sizeof(struct opt_rdnss) +
sizeof(struct opt_dnssl)];
} __attribute__((packed, aligned(__alignof__(struct in6_addr))));
} __attribute__((packed));
/**
* struct ndp_ns - NDP Neighbor Solicitation (NS) message
@ -170,31 +168,19 @@ struct ndp_ra {
struct ndp_ns {
struct icmp6hdr ih;
struct in6_addr target_addr;
} __attribute__((packed, aligned(__alignof__(struct in6_addr))));
} __attribute__((packed));
/**
* ndp_send() - Send an NDP message
* ndp() - Check for NDP solicitations, reply as needed
* @c: Execution context
* @dst: IPv6 address to send the message to
* @buf: ICMPv6 header + message payload
* @l4len: Length of message, including ICMPv6 header
* @ih: ICMPv6 header
* @saddr: Source IPv6 address
* @p: Packet pool
*
* Return: 0 if not handled here, 1 if handled, -1 on failure
*/
static void ndp_send(const struct ctx *c, const struct in6_addr *dst,
const void *buf, size_t l4len)
{
const struct in6_addr *src = &c->ip6.our_tap_ll;
tap_icmp6_send(c, src, dst, buf, l4len);
}
/**
* ndp_na() - Send an NDP Neighbour Advertisement (NA) message
* @c: Execution context
* @dst: IPv6 address to send the NA to
* @addr: IPv6 address to advertise
*/
static void ndp_na(const struct ctx *c, const struct in6_addr *dst,
const struct in6_addr *addr)
int ndp(struct ctx *c, const struct icmp6hdr *ih, const struct in6_addr *saddr,
const struct pool *p)
{
struct ndp_na na = {
.ih = {
@ -204,7 +190,6 @@ static void ndp_na(const struct ctx *c, const struct in6_addr *dst,
.icmp6_solicited = 1,
.icmp6_override = 1,
},
.target_addr = *addr,
.target_l2_addr = {
.header = {
.type = OPT_TARGET_L2_ADDR,
@ -212,26 +197,13 @@ static void ndp_na(const struct ctx *c, const struct in6_addr *dst,
},
}
};
memcpy(na.target_l2_addr.mac, c->our_tap_mac, ETH_ALEN);
ndp_send(c, dst, &na, sizeof(na));
}
/**
* ndp_ra() - Send an NDP Router Advertisement (RA) message
* @c: Execution context
* @dst: IPv6 address to send the RA to
*/
static void ndp_ra(const struct ctx *c, const struct in6_addr *dst)
{
struct ndp_ra ra = {
.ih = {
.icmp6_type = RA,
.icmp6_code = 0,
.icmp6_hop_limit = 255,
/* RFC 8319 */
.icmp6_rt_lifetime = htons_constant(RT_LIFETIME),
.icmp6_rt_lifetime = htons_constant(65535),
.icmp6_addrconf_managed = 1,
},
.prefix_info = {
@ -244,7 +216,6 @@ static void ndp_ra(const struct ctx *c, const struct in6_addr *dst)
.valid_lifetime = ~0U,
.pref_lifetime = ~0U,
},
.prefix = c->ip6.addr,
.source_ll = {
.header = {
.type = OPT_SRC_L2_ADDR,
@ -252,26 +223,59 @@ static void ndp_ra(const struct ctx *c, const struct in6_addr *dst)
},
},
};
const struct in6_addr *rsaddr; /* src addr for reply */
unsigned char *ptr = NULL;
size_t dlen;
ptr = &ra.var[0];
if (ih->icmp6_type < RS || ih->icmp6_type > NA)
return 0;
if (c->mtu) {
struct opt_mtu *mtu = (struct opt_mtu *)ptr;
*mtu = (struct opt_mtu) {
.header = {
.type = OPT_MTU,
.len = 1,
},
.value = htonl(c->mtu),
};
ptr += sizeof(struct opt_mtu);
}
if (c->no_ndp)
return 1;
if (!c->no_dhcp_dns) {
if (ih->icmp6_type == NS) {
struct ndp_ns *ns = packet_get(p, 0, 0, sizeof(struct ndp_ns),
NULL);
if (!ns)
return -1;
if (IN6_IS_ADDR_UNSPECIFIED(saddr))
return 1;
info("NDP: received NS, sending NA");
memcpy(&na.target_addr, &ns->target_addr,
sizeof(na.target_addr));
memcpy(na.target_l2_addr.mac, c->our_tap_mac, ETH_ALEN);
} else if (ih->icmp6_type == RS) {
size_t dns_s_len = 0;
int i, n;
if (c->no_ra)
return 1;
info("NDP: received RS, sending RA");
memcpy(&ra.prefix, &c->ip6.addr, sizeof(ra.prefix));
ptr = &ra.var[0];
if (c->mtu != -1) {
struct opt_mtu *mtu = (struct opt_mtu *)ptr;
*mtu = (struct opt_mtu) {
.header = {
.type = OPT_MTU,
.len = 1,
},
.value = htonl(c->mtu),
};
ptr += sizeof(struct opt_mtu);
}
if (c->no_dhcp_dns)
goto dns_done;
for (n = 0; !IN6_IS_ADDR_UNSPECIFIED(&c->ip6.dns[n]); n++);
if (n) {
struct opt_rdnss *rdnss = (struct opt_rdnss *)ptr;
@ -283,7 +287,8 @@ static void ndp_ra(const struct ctx *c, const struct in6_addr *dst)
.lifetime = ~0U,
};
for (i = 0; i < n; i++) {
rdnss->dns[i] = c->ip6.dns[i];
memcpy(&rdnss->dns[i], &c->ip6.dns[i],
sizeof(rdnss->dns[i]));
}
ptr += offsetof(struct opt_rdnss, dns) +
i * sizeof(rdnss->dns[0]);
@ -324,109 +329,27 @@ static void ndp_ra(const struct ctx *c, const struct in6_addr *dst)
memset(ptr, 0, 8 - dns_s_len % 8); /* padding */
ptr += 8 - dns_s_len % 8;
}
dns_done:
memcpy(&ra.source_ll.mac, c->our_tap_mac, ETH_ALEN);
} else {
return 1;
}
memcpy(&ra.source_ll.mac, c->our_tap_mac, ETH_ALEN);
if (IN6_IS_ADDR_LINKLOCAL(saddr))
c->ip6.addr_ll_seen = *saddr;
else
c->ip6.addr_seen = *saddr;
/* NOLINTNEXTLINE(clang-analyzer-security.PointerSub) */
ndp_send(c, dst, &ra, ptr - (unsigned char *)&ra);
}
/**
* ndp() - Check for NDP solicitations, reply as needed
* @c: Execution context
* @saddr: Source IPv6 address
* @p: Packet pool
*
* Return: 0 if not handled here, 1 if handled, -1 on failure
*/
int ndp(const struct ctx *c, const struct icmp6hdr *ih,
const struct in6_addr *saddr, const struct pool *p)
{
if (ih->icmp6_type < RS || ih->icmp6_type > NA)
return 0;
if (c->no_ndp)
return 1;
rsaddr = &c->ip6.our_tap_ll;
if (ih->icmp6_type == NS) {
const struct ndp_ns *ns;
ns = packet_get(p, 0, 0, sizeof(struct ndp_ns), NULL);
if (!ns)
return -1;
if (IN6_IS_ADDR_UNSPECIFIED(saddr))
return 1;
info("NDP: received NS, sending NA");
ndp_na(c, saddr, &ns->target_addr);
dlen = sizeof(struct ndp_na);
tap_icmp6_send(c, rsaddr, saddr, &na, dlen);
} else if (ih->icmp6_type == RS) {
if (c->no_ra)
return 1;
info("NDP: received RS, sending RA");
ndp_ra(c, saddr);
dlen = ptr - (unsigned char *)&ra;
tap_icmp6_send(c, rsaddr, saddr, &ra, dlen);
}
return 1;
}
/* Default interval between unsolicited RAs (seconds) */
#define DEFAULT_MAX_RTR_ADV_INTERVAL 600 /* RFC 4861, 6.2.1 */
/* Minimum required interval between RAs (seconds) */
#define MIN_DELAY_BETWEEN_RAS 3 /* RFC 4861, 10 */
static time_t next_ra;
/**
* ndp_timer() - Send unsolicited NDP messages if necessary
* @c: Execution context
* @now: Current (monotonic) time
*/
void ndp_timer(const struct ctx *c, const struct timespec *now)
{
time_t max_rtr_adv_interval = DEFAULT_MAX_RTR_ADV_INTERVAL;
time_t min_rtr_adv_interval, interval;
if (c->fd_tap < 0 || c->no_ra || now->tv_sec < next_ra)
return;
/* We must advertise before the route's lifetime expires */
max_rtr_adv_interval = MIN(max_rtr_adv_interval, RT_LIFETIME - 1);
/* But we must not go smaller than the minimum delay */
max_rtr_adv_interval = MAX(max_rtr_adv_interval, MIN_DELAY_BETWEEN_RAS);
/* RFC 4861, 6.2.1 */
min_rtr_adv_interval = MAX(max_rtr_adv_interval / 3,
MIN_DELAY_BETWEEN_RAS);
/* As required by RFC 4861, we randomise the interval between
* unsolicited RAs. This is to prevent multiple routers on a link
* getting synchronised (e.g. after booting a bunch of routers at once)
* and causing flurries of RAs at the same time.
*
* This random doesn't need to be cryptographically strong, so random(3)
* is fine. Other routers on the link also want to avoid
* synchronisation, and anything malicious has much easier ways to cause
* trouble.
*
* The modulus also makes this not strictly a uniform distribution, but,
* again, it's close enough for our purposes.
*/
interval = min_rtr_adv_interval +
random() % (max_rtr_adv_interval - min_rtr_adv_interval);
if (!next_ra)
goto first;
info("NDP: sending unsolicited RA, next in %llds", (long long)interval);
ndp_ra(c, &in6addr_ll_all_nodes);
first:
next_ra = now->tv_sec + interval;
}

7
ndp.h
View file

@ -6,10 +6,7 @@
#ifndef NDP_H
#define NDP_H
struct icmp6hdr;
int ndp(const struct ctx *c, const struct icmp6hdr *ih,
const struct in6_addr *saddr, const struct pool *p);
void ndp_timer(const struct ctx *c, const struct timespec *now);
int ndp(struct ctx *c, const struct icmp6hdr *ih, const struct in6_addr *saddr,
const struct pool *p);
#endif /* NDP_H */

View file

@ -199,7 +199,7 @@ static struct nlmsghdr *nl_next(int s, char *buf, struct nlmsghdr *nh, ssize_t *
}
/**
* nl_foreach() - 'for' type macro to step through netlink response messages
* nl_foreach - 'for' type macro to step through netlink response messages
* nl_foreach_oftype - as above, but only messages of expected type
* @nh: Steps through each response header (struct nlmsghdr *)
* @status: When loop exits indicates if there was an error (ssize_t)
@ -297,10 +297,6 @@ unsigned int nl_get_ext_if(int s, sa_family_t af)
if (!thisifi)
continue; /* No interface for this route */
/* Skip 'lo': we should test IFF_LOOPBACK, but keep it simple */
if (thisifi == 1)
continue;
/* Skip routes to link-local addresses */
if (af == AF_INET && dst &&
IN4_IS_PREFIX_LINKLOCAL(dst, rtm->rtm_dst_len))
@ -324,7 +320,7 @@ unsigned int nl_get_ext_if(int s, sa_family_t af)
}
if (status < 0)
warn("netlink: RTM_GETROUTE failed: %s", strerror_(-status));
warn("netlink: RTM_GETROUTE failed: %s", strerror(-status));
if (defifi) {
if (ndef > 1) {
@ -355,9 +351,9 @@ unsigned int nl_get_ext_if(int s, sa_family_t af)
*
* Return: true if a gateway was found, false otherwise
*/
static bool nl_route_get_def_multipath(struct rtattr *rta, void *gw)
bool nl_route_get_def_multipath(struct rtattr *rta, void *gw)
{
int nh_len = RTA_PAYLOAD(rta);
size_t nh_len = RTA_PAYLOAD(rta);
struct rtnexthop *rtnh;
bool found = false;
int hops = -1;
@ -586,7 +582,7 @@ int nl_route_dup(int s_src, unsigned int ifi_src,
*(unsigned int *)RTA_DATA(rta) = ifi_dst;
} else if (rta->rta_type == RTA_MULTIPATH) {
int nh_len = RTA_PAYLOAD(rta);
size_t nh_len = RTA_PAYLOAD(rta);
struct rtnexthop *rtnh;
for (rtnh = (struct rtnexthop *)RTA_DATA(rta);
@ -1024,6 +1020,7 @@ int nl_link_get_mac(int s, unsigned int ifi, void *mac)
/**
* nl_link_set_mac() - Set link MAC address
* @s: Netlink socket
* @ns: Use netlink socket in namespace
* @ifi: Interface index
* @mac: MAC address to set
*

185
packet.c
View file

@ -22,74 +22,12 @@
#include "util.h"
#include "log.h"
/**
* packet_check_range() - Check if a memory range is valid for a pool
* @p: Packet pool
* @ptr: Start of desired data range
* @len: Length of desired data range
* @func: For tracing: name of calling function
* @line: For tracing: caller line of function call
*
* Return: 0 if the range is valid, -1 otherwise
*/
static int packet_check_range(const struct pool *p, const char *ptr, size_t len,
const char *func, int line)
{
if (len > PACKET_MAX_LEN) {
debug("packet range length %zu (max %zu), %s:%i",
len, PACKET_MAX_LEN, func, line);
return -1;
}
if (p->buf_size == 0) {
int ret;
ret = vu_packet_check_range((void *)p->buf, ptr, len);
if (ret == -1)
debug("cannot find region, %s:%i", func, line);
return ret;
}
if (ptr < p->buf) {
debug("packet range start %p before buffer start %p, %s:%i",
(void *)ptr, (void *)p->buf, func, line);
return -1;
}
if (len > p->buf_size) {
debug("packet range length %zu larger than buffer %zu, %s:%i",
len, p->buf_size, func, line);
return -1;
}
if ((size_t)(ptr - p->buf) > p->buf_size - len) {
debug("packet range %p, len %zu after buffer end %p, %s:%i",
(void *)ptr, len, (void *)(p->buf + p->buf_size),
func, line);
return -1;
}
return 0;
}
/**
* pool_full() - Is a packet pool full?
* @p: Pointer to packet pool
*
* Return: true if the pool is full, false if more packets can be added
*/
bool pool_full(const struct pool *p)
{
return p->count >= p->size;
}
/**
* packet_add_do() - Add data as packet descriptor to given pool
* @p: Existing pool
* @len: Length of new descriptor
* @start: Start of data
* @func: For tracing: name of calling function
* @func: For tracing: name of calling function, NULL means no trace()
* @line: For tracing: caller line of function call
*/
void packet_add_do(struct pool *p, size_t len, const char *start,
@ -97,61 +35,42 @@ void packet_add_do(struct pool *p, size_t len, const char *start,
{
size_t idx = p->count;
if (pool_full(p)) {
debug("add packet index %zu to pool with size %zu, %s:%i",
if (idx >= p->size) {
trace("add packet index %zu to pool with size %zu, %s:%i",
idx, p->size, func, line);
return;
}
if (packet_check_range(p, start, len, func, line))
if (start < p->buf) {
trace("add packet start %p before buffer start %p, %s:%i",
(void *)start, (void *)p->buf, func, line);
return;
p->pkt[idx].iov_base = (void *)start;
p->pkt[idx].iov_len = len;
p->count++;
}
/**
* packet_get_try_do() - Get data range from packet descriptor from given pool
* @p: Packet pool
* @idx: Index of packet descriptor in pool
* @offset: Offset of data range in packet descriptor
* @len: Length of desired data range
* @left: Length of available data after range, set on return, can be NULL
* @func: For tracing: name of calling function
* @line: For tracing: caller line of function call
*
* Return: pointer to start of data range, NULL on invalid range or descriptor
*/
void *packet_get_try_do(const struct pool *p, size_t idx, size_t offset,
size_t len, size_t *left, const char *func, int line)
{
char *ptr;
ASSERT_WITH_MSG(p->count <= p->size,
"Corrupt pool count: %zu, size: %zu, %s:%i",
p->count, p->size, func, line);
if (idx >= p->count) {
debug("packet %zu from pool count: %zu, %s:%i",
idx, p->count, func, line);
return NULL;
}
if (offset > p->pkt[idx].iov_len ||
len > (p->pkt[idx].iov_len - offset))
return NULL;
if (start + len > p->buf + p->buf_size) {
trace("add packet start %p, length: %zu, buffer end %p, %s:%i",
(void *)start, len, (void *)(p->buf + p->buf_size),
func, line);
return;
}
ptr = (char *)p->pkt[idx].iov_base + offset;
if (len > UINT16_MAX) {
trace("add packet length %zu, %s:%i", len, func, line);
return;
}
ASSERT_WITH_MSG(!packet_check_range(p, ptr, len, func, line),
"Corrupt packet pool, %s:%i", func, line);
#if UINTPTR_MAX == UINT64_MAX
if ((uintptr_t)start - (uintptr_t)p->buf > UINT32_MAX) {
trace("add packet start %p, buffer start %p, %s:%i",
(void *)start, (void *)p->buf, func, line);
return;
}
#endif
if (left)
*left = p->pkt[idx].iov_len - offset - len;
p->pkt[idx].offset = start - p->buf;
p->pkt[idx].len = len;
return ptr;
p->count++;
}
/**
@ -161,24 +80,52 @@ void *packet_get_try_do(const struct pool *p, size_t idx, size_t offset,
* @offset: Offset of data range in packet descriptor
* @len: Length of desired data range
* @left: Length of available data after range, set on return, can be NULL
* @func: For tracing: name of calling function
* @func: For tracing: name of calling function, NULL means no trace()
* @line: For tracing: caller line of function call
*
* Return: as packet_get_try_do() but log a trace message when returning NULL
* Return: pointer to start of data range, NULL on invalid range or descriptor
*/
void *packet_get_do(const struct pool *p, const size_t idx,
size_t offset, size_t len, size_t *left,
const char *func, int line)
void *packet_get_do(const struct pool *p, size_t idx, size_t offset,
size_t len, size_t *left, const char *func, int line)
{
void *r = packet_get_try_do(p, idx, offset, len, left, func, line);
if (!r) {
trace("missing packet data length %zu, offset %zu from "
"length %zu, %s:%i",
len, offset, p->pkt[idx].iov_len, func, line);
if (idx >= p->size || idx >= p->count) {
if (func) {
trace("packet %zu from pool size: %zu, count: %zu, "
"%s:%i", idx, p->size, p->count, func, line);
}
return NULL;
}
return r;
if (len > UINT16_MAX || len + offset > UINT32_MAX) {
if (func) {
trace("packet data length %zu, offset %zu, %s:%i",
len, offset, func, line);
}
return NULL;
}
if (p->pkt[idx].offset + len + offset > p->buf_size) {
if (func) {
trace("packet offset plus length %zu from size %zu, "
"%s:%i", p->pkt[idx].offset + len + offset,
p->buf_size, func, line);
}
return NULL;
}
if (len + offset > p->pkt[idx].len) {
if (func) {
trace("data length %zu, offset %zu from length %u, "
"%s:%i", len, offset, p->pkt[idx].len,
func, line);
}
return NULL;
}
if (left)
*left = p->pkt[idx].len - offset - len;
return p->buf + p->pkt[idx].offset + offset;
}
/**

View file

@ -6,17 +6,20 @@
#ifndef PACKET_H
#define PACKET_H
#include <stdbool.h>
/* Maximum size of a single packet stored in pool, including headers */
#define PACKET_MAX_LEN ((size_t)UINT16_MAX)
/**
* struct desc - Generic offset-based descriptor within buffer
* @offset: Offset of descriptor relative to buffer start, 32-bit limit
* @len: Length of descriptor, host order, 16-bit limit
*/
struct desc {
uint32_t offset;
uint16_t len;
};
/**
* struct pool - Generic pool of packets stored in a buffer
* @buf: Buffer storing packet descriptors,
* a struct vu_dev_region array for passt vhost-user mode
* @buf_size: Total size of buffer,
* 0 for passt vhost-user mode
* @buf: Buffer storing packet descriptors
* @buf_size: Total size of buffer
* @size: Number of usable descriptors for the pool
* @count: Number of used descriptors for the pool
* @pkt: Descriptors: see macros below
@ -26,36 +29,32 @@ struct pool {
size_t buf_size;
size_t size;
size_t count;
struct iovec pkt[];
struct desc pkt[1];
};
int vu_packet_check_range(void *buf, const char *ptr, size_t len);
void packet_add_do(struct pool *p, size_t len, const char *start,
const char *func, int line);
void *packet_get_try_do(const struct pool *p, const size_t idx,
size_t offset, size_t len, size_t *left,
const char *func, int line);
void *packet_get_do(const struct pool *p, const size_t idx,
size_t offset, size_t len, size_t *left,
const char *func, int line);
bool pool_full(const struct pool *p);
void pool_flush(struct pool *p);
#define packet_add(p, len, start) \
packet_add_do(p, len, start, __func__, __LINE__)
#define packet_get_try(p, idx, offset, len, left) \
packet_get_try_do(p, idx, offset, len, left, __func__, __LINE__)
#define packet_get(p, idx, offset, len, left) \
packet_get_do(p, idx, offset, len, left, __func__, __LINE__)
#define packet_get_try(p, idx, offset, len, left) \
packet_get_do(p, idx, offset, len, left, NULL, 0)
#define PACKET_POOL_DECL(_name, _size, _buf) \
struct _name ## _t { \
char *buf; \
size_t buf_size; \
size_t size; \
size_t count; \
struct iovec pkt[_size]; \
struct desc pkt[_size]; \
}
#define PACKET_POOL_INIT_NOCAST(_size, _buf, _buf_size) \

View file

@ -1,74 +0,0 @@
.\" SPDX-License-Identifier: GPL-2.0-or-later
.\" Copyright (c) 2025 Red Hat GmbH
.\" Author: Stefano Brivio <sbrivio@redhat.com>
.TH passt-repair 1
.SH NAME
.B passt-repair
\- Helper setting TCP_REPAIR socket options for \fBpasst\fR(1)
.SH SYNOPSIS
.B passt-repair
\fIPATH\fR
.SH DESCRIPTION
.B passt-repair
is a privileged helper setting and clearing repair mode on TCP sockets on behalf
of \fBpasst\fR(1), as instructed via single-byte commands over a UNIX domain
socket.
It can be used to migrate TCP connections between guests without granting
additional capabilities to \fBpasst\fR(1) itself: to migrate TCP connections,
\fBpasst\fR(1) leverages repair mode, which needs the \fBCAP_NET_ADMIN\fR
capability (see \fBcapabilities\fR(7)) to be set or cleared.
If \fIPATH\fR represents a UNIX domain socket, \fBpasst-repair\fR(1) attempts to
connect to it. If it is a directory, \fBpasst-repair\fR(1) waits until a file
ending with \fI.repair\fR appears in it, and then attempts to connect to it.
.SH PROTOCOL
\fBpasst-repair\fR(1) connects to \fBpasst\fR(1) using the socket specified via
\fI--repair-path\fR option in \fBpasst\fR(1) itself. By default, the name is the
same as the UNIX domain socket used for guest communication, suffixed by
\fI.repair\fR.
The messages consist of one 8-bit signed integer that can be \fITCP_REPAIR_ON\fR
(1), \fITCP_REPAIR_OFF\fR (0), or \fITCP_REPAIR_OFF_NO_WP\fR (-1), as defined by
the Linux kernel user API, and one to SCM_MAX_FD (253) sockets as SCM_RIGHTS
(see \fBunix\fR(7)) ancillary message, sent by the server, \fBpasst\fR(1).
The client, \fBpasst-repair\fR(1), replies with the same byte (and no ancillary
message) to indicate success, and closes the connection on failure.
The server closes the connection on error or completion.
.SH NOTES
\fBpasst-repair\fR(1) can be granted the \fBCAP_NET_ADMIN\fR capability
(preferred, as it limits privileges to the strictly necessary ones), or it can
be run as root.
.SH AUTHOR
Stefano Brivio <sbrivio@redhat.com>.
.SH REPORTING BUGS
Please report issues on the bug tracker at https://bugs.passt.top/, or
send a message to the passt-user@passt.top mailing list, see
https://lists.passt.top/.
.SH COPYRIGHT
Copyright (c) 2025 Red Hat GmbH.
\fBpasst-repair\fR is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by the Free
Software Foundation, either version 2 of the License, or (at your option) any
later version.
.SH SEE ALSO
\fBpasst\fR(1), \fBqemu\fR(1), \fBcapabilities\fR(7), \fBunix\fR(7).

View file

@ -1,266 +0,0 @@
// SPDX-License-Identifier: GPL-2.0-or-later
/* PASST - Plug A Simple Socket Transport
* for qemu/UNIX domain socket mode
*
* PASTA - Pack A Subtle Tap Abstraction
* for network namespace/tap device mode
*
* passt-repair.c - Privileged helper to set/clear TCP_REPAIR on sockets
*
* Copyright (c) 2025 Red Hat GmbH
* Author: Stefano Brivio <sbrivio@redhat.com>
*
* Connect to passt via UNIX domain socket, receive sockets via SCM_RIGHTS along
* with byte commands mapping to TCP_REPAIR values, and switch repair mode on or
* off. Reply by echoing the command. Exit on EOF.
*/
#include <sys/inotify.h>
#include <sys/prctl.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <sys/stat.h>
#include <sys/un.h>
#include <errno.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <limits.h>
#include <unistd.h>
#include <netdb.h>
#include <netinet/tcp.h>
#include <linux/audit.h>
#include <linux/capability.h>
#include <linux/filter.h>
#include <linux/seccomp.h>
#include "seccomp_repair.h"
#define SCM_MAX_FD 253 /* From Linux kernel (include/net/scm.h), not in UAPI */
#define REPAIR_EXT ".repair"
#define REPAIR_EXT_LEN strlen(REPAIR_EXT)
/**
* main() - Entry point and whole program with loop
* @argc: Argument count, must be 2
* @argv: Argument: path of UNIX domain socket to connect to
*
* Return: 0 on success (EOF), 1 on error, 2 on usage error
*
* #syscalls:repair connect setsockopt write close exit_group
* #syscalls:repair socket s390x:socketcall i686:socketcall
* #syscalls:repair recvfrom recvmsg arm:recv ppc64le:recv
* #syscalls:repair sendto sendmsg arm:send ppc64le:send
* #syscalls:repair stat|statx stat64|statx statx
* #syscalls:repair fstat|fstat64 newfstatat|fstatat64
* #syscalls:repair inotify_init1 inotify_add_watch
*/
int main(int argc, char **argv)
{
	/* Ancillary data buffer, large enough for SCM_MAX_FD descriptors */
	char buf[CMSG_SPACE(sizeof(int) * SCM_MAX_FD)]
	     __attribute__ ((aligned(__alignof__(struct cmsghdr))));
	struct sockaddr_un a = { AF_UNIX, "" };
	int fds[SCM_MAX_FD], s, ret, i, n;
	bool inotify_dir = false;
	struct sock_fprog prog;
	int8_t cmd = INT8_MAX;
	struct cmsghdr *cmsg;
	struct msghdr msg;
	struct iovec iov;
	size_t cmsg_len;
	struct stat sb;
	int op;

	prctl(PR_SET_DUMPABLE, 0);

	/* Install the seccomp filter before doing anything else */
	prog.len = (unsigned short)sizeof(filter_repair) /
		   sizeof(filter_repair[0]);
	prog.filter = filter_repair;
	if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) ||
	    prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog)) {
		fprintf(stderr, "Failed to apply seccomp filter\n");
		_exit(1);
	}

	/* One command byte per message, descriptors come as ancillary data */
	iov = (struct iovec){ &cmd, sizeof(cmd) };
	msg = (struct msghdr){ .msg_name = NULL, .msg_namelen = 0,
			       .msg_iov = &iov, .msg_iovlen = 1,
			       .msg_control = buf,
			       .msg_controllen = sizeof(buf),
			       .msg_flags = 0 };

	if (argc != 2) {
		fprintf(stderr, "Usage: %s PATH\n", argv[0]);
		_exit(2);
	}

	if ((s = socket(AF_UNIX, SOCK_STREAM, 0)) < 0) {
		fprintf(stderr, "Failed to create AF_UNIX socket: %i\n", errno);
		_exit(1);
	}

	if ((stat(argv[1], &sb))) {
		fprintf(stderr, "Can't stat() %s: %i\n", argv[1], errno);
		_exit(1);
	}

	if ((sb.st_mode & S_IFMT) == S_IFDIR) {
		/* PATH is a directory: wait for a "*.repair" entry to appear */
		char ev_buf[sizeof(struct inotify_event) + NAME_MAX + 1]
		   __attribute__ ((aligned(__alignof__(struct inotify_event))));
		const struct inotify_event *ev = NULL;
		char path[PATH_MAX + 1];
		bool found = false;
		ssize_t len;
		int fd;

		if ((fd = inotify_init1(IN_CLOEXEC)) < 0) {
			fprintf(stderr, "inotify_init1: %i\n", errno);
			_exit(1);
		}

		if (inotify_add_watch(fd, argv[1], IN_CREATE) < 0) {
			fprintf(stderr, "inotify_add_watch: %i\n", errno);
			_exit(1);
		}

		do {
			char *p;

			len = read(fd, ev_buf, sizeof(ev_buf));
			if (len < 0) {
				fprintf(stderr, "inotify read: %i\n", errno);
				_exit(1);
			}

			if (len < (ssize_t)sizeof(*ev)) {
				fprintf(stderr, "Short inotify read: %zi\n",
					len);
				continue;
			}

			/* Terminate only once we know len is positive, so that
			 * a zero-length read can't write before the buffer
			 */
			ev_buf[len - 1] = '\0';

			for (p = ev_buf; p < ev_buf + len;
			     p += sizeof(*ev) + ev->len) {
				size_t name_len;

				ev = (const struct inotify_event *)p;

				/* ev->len includes padding: compare against the
				 * actual name length, otherwise short names
				 * make the suffix pointer precede ev->name
				 */
				name_len = strnlen(ev->name, ev->len);
				if (name_len >= REPAIR_EXT_LEN &&
				    !memcmp(ev->name + name_len -
					    REPAIR_EXT_LEN,
					    REPAIR_EXT, REPAIR_EXT_LEN)) {
					found = true;
					break;
				}
			}
		} while (!found);

		if (ev->len > NAME_MAX + 1 || ev->name[ev->len - 1] != '\0') {
			fprintf(stderr, "Invalid filename from inotify\n");
			_exit(1);
		}

		snprintf(path, sizeof(path), "%s/%s", argv[1], ev->name);

		/* The name was copied to path, the watch isn't needed anymore */
		close(fd);

		if ((stat(path, &sb))) {
			fprintf(stderr, "Can't stat() %s: %i\n", path, errno);
			_exit(1);
		}

		ret = snprintf(a.sun_path, sizeof(a.sun_path), "%s", path);
		inotify_dir = true;
	} else {
		ret = snprintf(a.sun_path, sizeof(a.sun_path), "%s", argv[1]);
	}

	if (ret <= 0 || ret >= (int)sizeof(a.sun_path)) {
		fprintf(stderr, "Invalid socket path\n");
		_exit(2);
	}

	if ((sb.st_mode & S_IFMT) != S_IFSOCK) {
		fprintf(stderr, "%s is not a socket\n", a.sun_path);
		_exit(2);
	}

	/* In directory mode, the socket file might exist before the peer
	 * listens on it: retry on ECONNREFUSED
	 */
	while (connect(s, (struct sockaddr *)&a, sizeof(a))) {
		if (inotify_dir && errno == ECONNREFUSED)
			continue;

		fprintf(stderr, "Failed to connect to %s: %s\n", a.sun_path,
			strerror(errno));
		_exit(1);
	}

loop:
	/* recvmsg() stores the received control length in msg_controllen and
	 * updates msg_flags: restore them so that every message can use the
	 * whole buffer, and forget the descriptor count of the previous message
	 */
	msg.msg_controllen = sizeof(buf);
	msg.msg_flags = 0;
	n = 0;

	ret = recvmsg(s, &msg, 0);
	if (ret < 0) {
		if (errno == ECONNRESET) {
			ret = 0;
		} else {
			fprintf(stderr, "Failed to read message: %i\n", errno);
			_exit(1);
		}
	}

	if (!ret)	/* Done */
		_exit(0);

	/* Look up the header only now: it's NULL if this message carried no
	 * ancillary data, instead of pointing to leftovers from a previous one
	 */
	cmsg = CMSG_FIRSTHDR(&msg);
	if (!cmsg ||
	    cmsg->cmsg_len < CMSG_LEN(sizeof(int)) ||
	    cmsg->cmsg_len > CMSG_LEN(sizeof(int) * SCM_MAX_FD) ||
	    cmsg->cmsg_type != SCM_RIGHTS) {
		fprintf(stderr, "No/bad ancillary data from peer\n");
		_exit(1);
	}

	/* No inverse formula for CMSG_LEN(x), and building one with CMSG_LEN(0)
	 * works but there's no guarantee it does. Search the whole domain.
	 */
	for (i = 1; i <= SCM_MAX_FD; i++) {
		if (CMSG_LEN(sizeof(int) * i) == cmsg->cmsg_len) {
			n = i;
			break;
		}
	}
	if (!n) {
		cmsg_len = cmsg->cmsg_len; /* socklen_t is 'unsigned' on musl */
		fprintf(stderr, "Invalid ancillary data length %zu from peer\n",
			cmsg_len);
		_exit(1);
	}

	memcpy(fds, CMSG_DATA(cmsg), sizeof(int) * n);

	if (cmd != TCP_REPAIR_ON && cmd != TCP_REPAIR_OFF &&
	    cmd != TCP_REPAIR_OFF_NO_WP) {
		fprintf(stderr, "Unsupported command 0x%04x\n", cmd);
		_exit(1);
	}

	/* setsockopt() takes an int, the command on the wire is one byte */
	op = cmd;

	for (i = 0; i < n; i++) {
		if (setsockopt(fds[i], SOL_TCP, TCP_REPAIR, &op, sizeof(op))) {
			fprintf(stderr,
				"Setting TCP_REPAIR to %i on socket %i: %s\n",
				op, fds[i], strerror(errno));
			_exit(1);
		}

		/* Close _our_ copy */
		close(fds[i]);
	}

	/* Confirm setting by echoing the command back */
	if (send(s, &cmd, sizeof(cmd), 0) < 0) {
		fprintf(stderr, "Reply to %i: %s\n", op, strerror(errno));
		_exit(1);
	}

	goto loop;

	return 0;
}

176
passt.1
View file

@ -95,7 +95,7 @@ detached PID namespace after starting, because the PID itself cannot change.
Default is to fork into background.
.TP
.BR \-e ", " \-\-stderr " " (DEPRECATED)
.BR \-e ", " \-\-stderr
This option has no effect, and is maintained for compatibility purposes only.
Note that this configuration option is \fBdeprecated\fR and will be removed in a
@ -160,9 +160,7 @@ once for IPv6).
By default, assigned IPv4 and IPv6 addresses are taken from the host interfaces
with the first default route, if any, for the corresponding IP version. If no
default routes are available and there is any interface with any route for a
given IP version, the first of these interfaces will be chosen instead. If no
such interface exists, the link-local address 169.254.2.1 is assigned for IPv4,
and no additional address will be assigned for IPv6.
given IP version, the first of these interfaces will be chosen instead.
.TP
.BR \-n ", " \-\-netmask " " \fImask
@ -176,7 +174,8 @@ according to the CIDR block of the assigned address (RFC 4632).
.BR \-M ", " \-\-mac-addr " " \fIaddr
Use source MAC address \fIaddr\fR when communicating to the guest or to the
target namespace.
Default is the locally administered MAC address 9a:55:9a:55:9a:55.
Default is to use the MAC address of the interface with the first IPv4 default
route on the host.
.TP
.BR \-g ", " \-\-gateway " " \fIaddr
@ -189,9 +188,7 @@ first default route, if any, for the corresponding IP version. If the default
route is a multipath one, the gateway is the first nexthop router returned by
the kernel which has the highest weight in the set of paths. If no default
routes are available and there is just one interface with any route, that
interface will be chosen instead. If no such interface exists, the link-local
address 169.254.2.2 is used for IPv4, and the link-local address fe80::1 is used
for IPv6.
interface will be chosen instead.
Note: these addresses are also used as source address for packets directed to
the guest or to the target namespace having a loopback or local source address,
@ -206,9 +203,7 @@ Default is to use the interfaces specified by \fB--outbound-if4\fR and
If no interfaces are given, the interface with the first default routes for each
IP version is selected. If no default routes are available and there is just one
interface with any route, that interface will be chosen instead. If no such
interface exists, host interfaces will be ignored for the purposes of assigning
addresses and routes, and link-local addresses will be used instead.
interface with any route, that interface will be chosen instead.
.TP
.BR \-o ", " \-\-outbound " " \fIaddr
@ -227,8 +222,7 @@ derive IPv4 addresses and routes.
By default, the interface given by the default route is selected. If no default
routes are available and there is just one interface with any route, that
interface will be chosen instead. If no such interface exists, outbound sockets
will not be bound to any specific interface.
interface will be chosen instead.
.TP
.BR \-\-outbound-if6 " " \fIname
@ -238,8 +232,7 @@ derive IPv6 addresses and routes.
By default, the interface given by the default route is selected. If no default
routes are available and there is just one interface with any route, that
interface will be chosen instead. If no such interface exists, outbound sockets
will not be bound to any specific interface.
interface will be chosen instead.
.TP
.BR \-D ", " \-\-dns " " \fIaddr
@ -256,19 +249,10 @@ the host.
.TP
.BR \-\-dns-forward " " \fIaddr
Map \fIaddr\fR (IPv4 or IPv6) as seen from guest or namespace to the
nameserver (with corresponding IP version) specified by the
\fB\-\-dns-host\fR option. Maps only UDP and TCP traffic to port 53 or
port 853. Replies are translated back with a reverse mapping. This
option can be specified zero to two times (once for IPv4, once for
IPv6).
.TP
.BR \-\-dns-host " " \fIaddr
Configure the host nameserver which guest or namespace queries to the
\fB\-\-dns-forward\fR address will be redirected to. This option can
be specified zero to two times (once for IPv4, once for IPv6).
By default, the first nameserver from the host's
\fI/etc/resolv.conf\fR.
first configured DNS resolver (with corresponding IP version). Maps
only UDP and TCP traffic to port 53 or port 853. Replies are
translated back with a reverse mapping. This option can be specified
zero to two times (once for IPv4, once for IPv6).
.TP
.BR \-S ", " \-\-search " " \fIlist
@ -343,16 +327,6 @@ namespace will be silently dropped.
Disable Router Advertisements. Router Solicitations coming from guest or target
namespace will be ignored.
.TP
.BR \-\-freebind
Allow any binding address to be specified for \fB-t\fR and \fB-u\fR
options. Usually binding addresses must be addresses currently
configured on the host. With \fB\-\-freebind\fR, the
\fBIP_FREEBIND\fR or \fBIPV6_FREEBIND\fR socket option is enabled
allowing any address to be used. This is typically used to bind
addresses which might be configured on the host in future, at which
point the forwarding will immediately start operating.
.TP
.BR \-\-map-host-loopback " " \fIaddr
Translate \fIaddr\fR to refer to the host. Packets from the guest to
@ -380,14 +354,14 @@ Translate \fIaddr\fR in the guest to be equal to the guest's assigned
address on the host. That is, packets from the guest to \fIaddr\fR
will be redirected to the address assigned to the guest with \fB-a\fR,
or by default the host's global address. This allows the guest to
access services available on the host's global address, even though its
access services available on the host's global address, even though its
own address shadows that of the host.
If \fIaddr\fR is 'none', no address is mapped. Only one IPv4 and one
IPv6 address can be translated, and if the option is specified
multiple times, the last one for each address type takes effect.
By default, mapping happens as described for the \-\-map-host-loopback option.
Default is no mapping.
.TP
.BR \-4 ", " \-\-ipv4-only
@ -401,44 +375,15 @@ Enable IPv6-only operation. IPv4 traffic will be ignored.
By default, IPv4 operation is enabled as long as at least an IPv4 route and an
interface address are configured on a given host interface.
.TP
.BR \-H ", " \-\-hostname " " \fIname
Hostname to configure the client with.
Send \fIname\fR as DHCP option 12 (hostname).
.TP
.BR \-\-fqdn " " \fIname
FQDN to configure the client with.
Send \fIname\fR as Client FQDN: DHCP option 81 and DHCPv6 option 39.
.SS \fBpasst\fR-only options
.TP
.BR \-s ", " \-\-socket-path ", " \-\-socket " " \fIpath
.BR \-s ", " \-\-socket " " \fIpath
Path for UNIX domain socket used by \fBqemu\fR(1) or \fBqrap\fR(1) to connect to
\fBpasst\fR.
Default is to probe a free socket, not accepting connections, starting from
\fI/tmp/passt_1.socket\fR to \fI/tmp/passt_64.socket\fR.
.TP
.BR \-\-vhost-user
Enable vhost-user. The vhost-user command socket is provided by \fB--socket\fR.
.TP
.BR \-\-print-capabilities
Print back-end capabilities in JSON format, only meaningful for vhost-user mode.
.TP
.BR \-\-repair-path " " \fIpath
Path for UNIX domain socket used by the \fBpasst-repair\fR(1) helper to connect
to \fBpasst\fR in order to set or clear the TCP_REPAIR option on sockets, during
migration. \fB--repair-path none\fR disables this interface (if you need to
specify a socket path called "none" you can prefix the path by \fI./\fR).
Default, for \-\-vhost-user mode only, is to append \fI.repair\fR to the path
chosen for the hypervisor UNIX domain socket. No socket is created if not in
\-\-vhost-user mode.
.TP
.BR \-F ", " \-\-fd " " \fIFD
Pass a pre-opened, connected socket to \fBpasst\fR. Usually the socket is opened
@ -540,7 +485,6 @@ Default is \fBnone\fR.
.BR \-I ", " \-\-ns-ifname " " \fIname
Name of tap interface to be created in target namespace.
By default, the same interface name as the external, routable interface is used.
If no such interface exists, the name \fItap0\fR will be used instead.
.TP
.BR \-t ", " \-\-tcp-ports " " \fIspec
@ -642,13 +586,6 @@ Configure UDP port forwarding from target namespace to init namespace.
Default is \fBauto\fR.
.TP
.BR \-\-host-lo-to-ns-lo
If specified, connections forwarded with \fB\-t\fR and \fB\-u\fR from
the host's loopback address will appear on the loopback address in the
guest as well. Without this option such forwarded packets will appear
to come from the guest's public address.
.TP
.BR \-\-userns " " \fIspec
Target user namespace to join, as a path. If PID is given, without this option,
@ -716,11 +653,6 @@ Configure MAC address \fIaddr\fR on the tap interface in the namespace.
Default is to let the tap driver build a pseudorandom hardware address.
.TP
.BR \-\-no-splice
Disable the bypass path for inbound, local traffic. See the section \fBHandling
of local traffic in pasta\fR in the \fBNOTES\fR for more details.
.SH EXAMPLES
.SS \fBpasta
@ -931,31 +863,26 @@ root@localhost's password:
.SH NOTES
.SS Handling of traffic with loopback destination and source addresses
.SS Handling of traffic with local destination and source addresses
Both \fBpasst\fR and \fBpasta\fR can bind on ports with a loopback
address (127.0.0.0/8 or ::1), depending on the configuration. Loopback
destination or source addresses need to be changed before packets are
delivered to the guest or target namespace: most operating systems
would drop packets received with loopback addresses on non-loopback
interfaces, and it would also be impossible for guest or target
namespace to route answers back.
Both \fBpasst\fR and \fBpasta\fR can bind on ports with a local address,
depending on the configuration. Local destination or source addresses need to be
changed before packets are delivered to the guest or target namespace: most
operating systems would drop packets received from non-loopback interfaces with
local addresses, and it would also be impossible for guest or target namespace
to route answers back.
For convenience, the source address on these packets is translated to
the address specified by the \fB\-\-map-host-loopback\fR option (with
some exceptions in pasta mode, see next section below). If not
specified this defaults, somewhat arbitrarily, to the address of
default IPv4 or IPv6 gateway (if any) -- this is known to be an
existing, valid address on the same subnet. If \fB\-\-no-map-gw\fR or
\fB\-\-map-host-loopback none\fR are specified this translation is
disabled and packets with loopback addresses are simply dropped.
For convenience, and somewhat arbitrarily, the source address on these packets
is translated to the address of the default IPv4 or IPv6 gateway (if any) --
this is known to be an existing, valid address on the same subnet.
Loopback destination addresses are translated to the observed external
address of the guest or target namespace. For IPv6, the observed
link-local address is used if the translated source address is
link-local, otherwise the observed global address is used. For both
IPv4 and IPv6, if no addresses have been seen yet, the configured
addresses will be used instead.
Loopback destination addresses are instead translated to the observed external
address of the guest or target namespace. For IPv6 packets, if usage of a
link-local address by guest or namespace has ever been observed, and the
original destination address is also a link-local address, the observed
link-local address is used. Otherwise, the observed global address is used. For
both IPv4 and IPv6, if no addresses have been seen yet, the configured addresses
will be used instead.
For example, if \fBpasst\fR or \fBpasta\fR receive a connection from 127.0.0.1,
with destination 127.0.0.10, and the default IPv4 gateway is 192.0.2.1, while
@ -963,15 +890,11 @@ the last observed source address from guest or namespace is 192.0.2.2, this will
be translated to a connection from 192.0.2.1 to 192.0.2.2.
Similarly, for traffic coming from guest or namespace, packets with destination
address corresponding to the \fB\-\-map-host-loopback\fR address will have their
destination address translated to a loopback address.
As an exception, traffic identified as DNS, originally directed to the
\fB\-\-map-host-loopback\fR address, if this address matches a resolver address
on the host, is \fBnot\fR translated to loopback, but rather handled in the same
way as if specified as \-\-dns-forward address, if no such option was given.
In the common case where the host gateway also acts as a resolver, this avoids that
the host mapping shadows the gateway/resolver itself.
address corresponding to the default gateway will have their destination address
translated to a loopback address, if and only if a packet, in the opposite
direction, with a loopback destination or source address, port-wise matching for
UDP, or connection-wise for TCP, has been recently forwarded to guest or
namespace. This behaviour can be disabled with \-\-no\-map\-gw.
.SS Handling of local traffic in pasta
@ -987,15 +910,8 @@ and the new socket using the \fBsplice\fR(2) system call, and for UDP, a pair
of \fBrecvmmsg\fR(2) and \fBsendmmsg\fR(2) system calls deals with packet
transfers.
Because it's not possible to bind sockets to foreign addresses, this
bypass only applies to local connections and traffic. It also means
that the address translation differs slightly from passt mode.
Connections from loopback to loopback on the host will appear to come
from the target namespace's public address within the guest, unless
\fB\-\-host-lo-to-ns-lo\fR is specified, in which case they will
appear to come from loopback in the namespace as well. The latter
behaviour used to be the default, but is usually undesirable, since it
can unintentionally expose namespace local services to the host.
This bypass only applies to local connections and traffic, because it's not
possible to bind sockets to foreign addresses.
.SS Binding to low numbered ports (well-known or system ports, up to 1023)
@ -1080,20 +996,6 @@ If the sending window cannot be queried, it will always be announced as the
current sending buffer size to guest or target namespace. This might affect
throughput of TCP connections.
.SS Local mode for disconnected setups
If \fBpasst\fR and \fBpasta\fR fail to find a host interface with a configured
address, other than loopback addresses, they will, obviously, not attempt to
source addresses or routes from the host.
In this case, unless configured otherwise, they will assign the IPv4 link-local
address 169.254.2.1 to the guest or target namespace, and no IPv6 address. The
notion of the guest or target namespace IPv6 address is derived from the first
link-local address observed.
Default gateways will be assigned as the link-local address 169.254.2.2 for
IPv4, and as the link-local address fe80::1 for IPv6.
.SH LIMITATIONS
Currently, IGMP/MLD proxying (RFC 4605) and support for SCTP (RFC 4960) are not

115
passt.c
View file

@ -36,6 +36,9 @@
#include <sys/prctl.h>
#include <netinet/if_ether.h>
#include <libgen.h>
#ifdef HAS_GETRANDOM
#include <sys/random.h>
#endif
#include "util.h"
#include "passt.h"
@ -49,10 +52,6 @@
#include "arch.h"
#include "log.h"
#include "tcp_splice.h"
#include "ndp.h"
#include "vu_common.h"
#include "migrate.h"
#include "repair.h"
#define EPOLL_EVENTS 8
@ -68,17 +67,13 @@ char *epoll_type_str[] = {
[EPOLL_TYPE_TCP_LISTEN] = "listening TCP socket",
[EPOLL_TYPE_TCP_TIMER] = "TCP timer",
[EPOLL_TYPE_UDP_LISTEN] = "listening UDP socket",
[EPOLL_TYPE_UDP] = "UDP flow socket",
[EPOLL_TYPE_UDP_REPLY] = "UDP reply socket",
[EPOLL_TYPE_PING] = "ICMP/ICMPv6 ping socket",
[EPOLL_TYPE_NSQUIT_INOTIFY] = "namespace inotify watch",
[EPOLL_TYPE_NSQUIT_TIMER] = "namespace timer watch",
[EPOLL_TYPE_TAP_PASTA] = "/dev/net/tun device",
[EPOLL_TYPE_TAP_PASST] = "connected qemu socket",
[EPOLL_TYPE_TAP_LISTEN] = "listening qemu socket",
[EPOLL_TYPE_VHOST_CMD] = "vhost-user command socket",
[EPOLL_TYPE_VHOST_KICK] = "vhost-user kick socket",
[EPOLL_TYPE_REPAIR_LISTEN] = "TCP_REPAIR helper listening socket",
[EPOLL_TYPE_REPAIR] = "TCP_REPAIR helper socket",
};
static_assert(ARRAY_SIZE(epoll_type_str) == EPOLL_NUM_TYPES,
"epoll_type_str[] doesn't match enum epoll_type");
@ -115,25 +110,40 @@ static void post_handler(struct ctx *c, const struct timespec *now)
flow_defer_handler(c, now);
#undef CALL_PROTO_HANDLER
if (!c->no_ndp)
ndp_timer(c, now);
}
/**
* random_init() - Initialise things based on random data
* secret_init() - Create secret value for SipHash calculations
* @c: Execution context
*/
static void random_init(struct ctx *c)
static void secret_init(struct ctx *c)
{
unsigned int seed;
#ifndef HAS_GETRANDOM
int dev_random = open("/dev/random", O_RDONLY);
unsigned int random_read = 0;
/* Create secret value for SipHash calculations */
raw_random(&c->hash_secret, sizeof(c->hash_secret));
while (dev_random && random_read < sizeof(c->hash_secret)) {
int ret = read(dev_random,
(uint8_t *)&c->hash_secret + random_read,
sizeof(c->hash_secret) - random_read);
/* Seed pseudo-RNG for things that need non-cryptographic random */
raw_random(&seed, sizeof(seed));
srandom(seed);
if (ret == -1 && errno == EINTR)
continue;
if (ret <= 0)
break;
random_read += ret;
}
if (dev_random >= 0)
close(dev_random);
if (random_read < sizeof(c->hash_secret))
#else
if (getrandom(&c->hash_secret, sizeof(c->hash_secret),
GRND_RANDOM) < 0)
#endif /* !HAS_GETRANDOM */
die_perror("Failed to get random bytes for hash table and TCP");
}
/**
@ -166,11 +176,11 @@ void proto_update_l2_buf(const unsigned char *eth_d, const unsigned char *eth_s)
*
* #syscalls exit_group
*/
static void exit_handler(int signal)
void exit_handler(int signal)
{
(void)signal;
_exit(EXIT_SUCCESS);
exit(EXIT_SUCCESS);
}
/**
@ -184,27 +194,26 @@ static void exit_handler(int signal)
* #syscalls socket getsockopt setsockopt s390x:socketcall i686:socketcall close
* #syscalls bind connect recvfrom sendto shutdown
* #syscalls arm:recv ppc64le:recv arm:send ppc64le:send
* #syscalls accept4 accept listen epoll_ctl epoll_wait|epoll_pwait epoll_pwait
* #syscalls accept4|accept listen epoll_ctl epoll_wait|epoll_pwait epoll_pwait
* #syscalls clock_gettime arm:clock_gettime64 i686:clock_gettime64
*/
int main(int argc, char **argv)
{
struct epoll_event events[EPOLL_EVENTS];
int nfds, i, devnull_fd = -1;
char argv0[PATH_MAX], *name;
struct ctx c = { 0 };
struct rlimit limit;
struct timespec now;
struct sigaction sa;
if (clock_gettime(CLOCK_MONOTONIC, &log_start))
die_perror("Failed to get CLOCK_MONOTONIC time");
clock_gettime(CLOCK_MONOTONIC, &log_start);
arch_avx2_exec(argv);
isolate_initial(argc, argv);
c.pasta_netns_fd = c.fd_tap = c.pidfile_fd = -1;
c.device_state_fd = -1;
sigemptyset(&sa.sa_mask);
sa.sa_flags = 0;
@ -212,18 +221,27 @@ int main(int argc, char **argv)
sigaction(SIGTERM, &sa, NULL);
sigaction(SIGQUIT, &sa, NULL);
c.mode = conf_mode(argc, argv);
if (argc < 1)
exit(EXIT_FAILURE);
if (c.mode == MODE_PASTA) {
strncpy(argv0, argv[0], PATH_MAX - 1);
name = basename(argv0);
if (strstr(name, "pasta")) {
sa.sa_handler = pasta_child_handler;
if (sigaction(SIGCHLD, &sa, NULL))
die_perror("Couldn't install signal handlers");
if (signal(SIGPIPE, SIG_IGN) == SIG_ERR)
die_perror("Couldn't set disposition for SIGPIPE");
c.mode = MODE_PASTA;
} else if (strstr(name, "passt")) {
c.mode = MODE_PASST;
} else {
exit(EXIT_FAILURE);
}
if (signal(SIGPIPE, SIG_IGN) == SIG_ERR)
die_perror("Couldn't set disposition for SIGPIPE");
madvise(pkt_buf, sizeof(pkt_buf), MADV_HUGEPAGE);
madvise(pkt_buf, TAP_BUF_BYTES, MADV_HUGEPAGE);
c.epollfd = epoll_create1(EPOLL_CLOEXEC);
if (c.epollfd == -1)
@ -243,17 +261,16 @@ int main(int argc, char **argv)
pasta_netns_quit_init(&c);
tap_backend_init(&c);
tap_sock_init(&c);
random_init(&c);
secret_init(&c);
if (clock_gettime(CLOCK_MONOTONIC, &now))
die_perror("Failed to get CLOCK_MONOTONIC time");
clock_gettime(CLOCK_MONOTONIC, &now);
flow_init();
if ((!c.no_udp && udp_init(&c)) || (!c.no_tcp && tcp_init(&c)))
_exit(EXIT_FAILURE);
exit(EXIT_FAILURE);
proto_update_l2_buf(c.guest_mac, c.our_tap_mac);
@ -290,15 +307,13 @@ int main(int argc, char **argv)
timer_init(&c, &now);
loop:
/* NOLINTBEGIN(bugprone-branch-clone): intervals can be the same */
/* NOLINTNEXTLINE(bugprone-branch-clone): intervals can be the same */
/* cppcheck-suppress [duplicateValueTernary, unmatchedSuppression] */
nfds = epoll_wait(c.epollfd, events, EPOLL_EVENTS, TIMER_INTERVAL);
/* NOLINTEND(bugprone-branch-clone) */
if (nfds == -1 && errno != EINTR)
die_perror("epoll_wait() failed in main loop");
if (clock_gettime(CLOCK_MONOTONIC, &now))
err_perror("Failed to get CLOCK_MONOTONIC time");
clock_gettime(CLOCK_MONOTONIC, &now);
for (i = 0; i < nfds; i++) {
union epoll_ref ref = *((union epoll_ref *)&events[i].data.u64);
@ -339,24 +354,12 @@ loop:
case EPOLL_TYPE_UDP_LISTEN:
udp_listen_sock_handler(&c, ref, eventmask, &now);
break;
case EPOLL_TYPE_UDP:
udp_sock_handler(&c, ref, eventmask, &now);
case EPOLL_TYPE_UDP_REPLY:
udp_reply_sock_handler(&c, ref, eventmask, &now);
break;
case EPOLL_TYPE_PING:
icmp_sock_handler(&c, ref);
break;
case EPOLL_TYPE_VHOST_CMD:
vu_control_handler(c.vdev, c.fd_tap, eventmask);
break;
case EPOLL_TYPE_VHOST_KICK:
vu_kick_cb(c.vdev, ref, &now);
break;
case EPOLL_TYPE_REPAIR_LISTEN:
repair_listen_handler(&c, eventmask);
break;
case EPOLL_TYPE_REPAIR:
repair_handler(&c, eventmask);
break;
default:
/* Can't happen */
ASSERT(0);
@ -365,7 +368,5 @@ loop:
post_handler(&c, &now);
migrate_handler(&c);
goto loop;
}

51
passt.h
View file

@ -20,13 +20,11 @@ union epoll_ref;
#include "siphash.h"
#include "ip.h"
#include "inany.h"
#include "migrate.h"
#include "flow.h"
#include "icmp.h"
#include "fwd.h"
#include "tcp.h"
#include "udp.h"
#include "vhost_user.h"
/* Default address for our end on the tap interface. Bit 0 of byte 0 must be 0
* (unicast) and bit 1 of byte 1 must be 1 (locally administered). Otherwise
@ -45,7 +43,6 @@ union epoll_ref;
* @icmp: ICMP-specific reference part
* @data: Data handled by protocol handlers
* @nsdir_fd: netns dirfd for fallback timer checking if namespace is gone
* @queue: vhost-user queue index for this fd
* @u64: Opaque reference for epoll_ctl() and epoll_wait()
*/
union epoll_ref {
@ -61,7 +58,6 @@ union epoll_ref {
union udp_listen_epoll_ref udp;
uint32_t data;
int nsdir_fd;
int queue;
};
};
uint64_t u64;
@ -69,9 +65,12 @@ union epoll_ref {
static_assert(sizeof(union epoll_ref) <= sizeof(union epoll_data),
"epoll_ref must have same size as epoll_data");
/* Large enough for ~128 maximum size frames */
#define PKT_BUF_BYTES (8UL << 20)
#define TAP_BUF_BYTES \
ROUND_DOWN(((ETH_MAX_MTU + sizeof(uint32_t)) * 128), PAGE_SIZE)
#define TAP_MSGS \
DIV_ROUND_UP(TAP_BUF_BYTES, ETH_ZLEN - 2 * ETH_ALEN + sizeof(uint32_t))
#define PKT_BUF_BYTES MAX(TAP_BUF_BYTES, 0)
extern char pkt_buf [PKT_BUF_BYTES];
extern char *epoll_type_str[];
@ -95,7 +94,6 @@ struct fqdn {
enum passt_modes {
MODE_PASST,
MODE_PASTA,
MODE_VU,
};
/**
@ -191,7 +189,6 @@ struct ip6_ctx {
* @foreground: Run in foreground, don't log to stderr by default
* @nofile: Maximum number of open files (ulimit -n)
* @sock_path: Path for UNIX domain socket
* @repair_path: TCP_REPAIR helper path, can be "none", empty for default
* @pcap: Path for packet capture file
* @pidfile: Path to PID file, empty string if not configured
* @pidfile_fd: File descriptor for PID file, -1 if none
@ -202,17 +199,13 @@ struct ip6_ctx {
* @epollfd: File descriptor for epoll instance
* @fd_tap_listen: File descriptor for listening AF_UNIX socket, if any
* @fd_tap: AF_UNIX socket, tuntap device, or pre-opened socket
* @fd_repair_listen: File descriptor for listening TCP_REPAIR socket, if any
* @fd_repair: Connected AF_UNIX socket for TCP_REPAIR helper
* @our_tap_mac: Pasta/passt's MAC on the tap link
* @guest_mac: MAC address of guest or namespace, seen or configured
* @hash_secret: 128-bit secret for siphash functions
* @ifi4: Template interface for IPv4, -1: none, 0: IPv4 disabled
* @ifi4: Index of template interface for IPv4, 0 if IPv4 disabled
* @ip: IPv4 configuration
* @dns_search: DNS search list
* @hostname: Guest hostname
* @fqdn: Guest FQDN
* @ifi6: Template interface for IPv6, -1: none, 0: IPv6 disabled
* @ifi6: Index of template interface for IPv6, 0 if IPv6 disabled
* @ip6: IPv6 configuration
* @pasta_ifn: Name of namespace interface for pasta
* @pasta_ifi: Index of namespace interface for pasta
@ -232,15 +225,8 @@ struct ip6_ctx {
* @no_dhcpv6: Disable DHCPv6 server
* @no_ndp: Disable NDP handler altogether
* @no_ra: Disable router advertisements
* @no_splice: Disable socket splicing for inbound traffic
* @host_lo_to_ns_lo: Map host loopback addresses to ns loopback addresses
* @freebind: Allow binding of non-local addresses for forwarding
* @low_wmem: Low probed net.core.wmem_max
* @low_rmem: Low probed net.core.rmem_max
* @vdev: vhost-user device
* @device_state_fd: Device state migration channel
* @device_state_result: Device state migration result
* @migrate_target: Are we the target, on the next migration request?
*/
struct ctx {
enum passt_modes mode;
@ -250,7 +236,6 @@ struct ctx {
int foreground;
int nofile;
char sock_path[UNIX_PATH_MAX];
char repair_path[UNIX_PATH_MAX];
char pcap[PATH_MAX];
char pidfile[PATH_MAX];
@ -267,23 +252,16 @@ struct ctx {
int epollfd;
int fd_tap_listen;
int fd_tap;
int fd_repair_listen;
int fd_repair;
unsigned char our_tap_mac[ETH_ALEN];
unsigned char guest_mac[ETH_ALEN];
uint16_t mtu;
uint64_t hash_secret[2];
int ifi4;
unsigned int ifi4;
struct ip4_ctx ip4;
struct fqdn dns_search[MAXDNSRCH];
char hostname[PASST_MAXDNAME];
char fqdn[PASST_MAXDNAME];
int ifi6;
unsigned int ifi6;
struct ip6_ctx ip6;
char pasta_ifn[IF_NAMESIZE];
@ -297,6 +275,7 @@ struct ctx {
int no_icmp;
struct icmp_ctx icmp;
int mtu;
int no_dns;
int no_dns_search;
int no_dhcp_dns;
@ -305,19 +284,9 @@ struct ctx {
int no_dhcpv6;
int no_ndp;
int no_ra;
int no_splice;
int host_lo_to_ns_lo;
int freebind;
int low_wmem;
int low_rmem;
struct vu_dev *vdev;
/* Migration */
int device_state_fd;
int device_state_result;
bool migrate_target;
};
void proto_update_l2_buf(const unsigned char *eth_d,

89
pasta.c
View file

@ -57,13 +57,15 @@ int pasta_child_pid;
/**
* pasta_child_handler() - Exit once shell exits (if we started it), reap clones
* @signal: Signal number; this handler deals with SIGCHLD only
* @signal: Unused, handler deals with SIGCHLD only
*/
void pasta_child_handler(int signal)
{
int errno_save = errno;
siginfo_t infop;
(void)signal;
if (signal != SIGCHLD)
return;
@ -71,12 +73,12 @@ void pasta_child_handler(int signal)
!waitid(P_PID, pasta_child_pid, &infop, WEXITED | WNOHANG)) {
if (infop.si_pid == pasta_child_pid) {
if (infop.si_code == CLD_EXITED)
_exit(infop.si_status);
exit(infop.si_status);
/* If killed by a signal, si_status is the number.
* Follow common shell convention of returning it + 128.
*/
_exit(infop.si_status + 128);
exit(infop.si_status + 128);
/* Nothing to do, detached PID namespace going away */
}
@ -100,9 +102,7 @@ static int pasta_wait_for_ns(void *arg)
int flags = O_RDONLY | O_CLOEXEC;
char ns[PATH_MAX];
if (snprintf_check(ns, PATH_MAX, "/proc/%i/ns/net", pasta_child_pid))
die_perror("Can't build netns path");
snprintf(ns, PATH_MAX, "/proc/%i/ns/net", pasta_child_pid);
do {
while ((c->pasta_netns_fd = open(ns, flags)) < 0) {
if (errno != ENOENT)
@ -167,12 +167,10 @@ void pasta_open_ns(struct ctx *c, const char *netns)
* struct pasta_spawn_cmd_arg - Argument for pasta_spawn_cmd()
* @exe: Executable to run
* @argv: Command and arguments to run
* @ctx: Context to read config from
*/
struct pasta_spawn_cmd_arg {
const char *exe;
char *const *argv;
struct ctx *c;
};
/**
@ -186,7 +184,6 @@ static int pasta_spawn_cmd(void *arg)
{
char hostname[HOST_NAME_MAX + 1] = HOSTNAME_PREFIX;
const struct pasta_spawn_cmd_arg *a;
size_t conf_hostname_len;
sigset_t set;
/* We run in a detached PID and mount namespace: mount /proc over */
@ -196,15 +193,9 @@ static int pasta_spawn_cmd(void *arg)
if (write_file("/proc/sys/net/ipv4/ping_group_range", "0 0"))
warn("Cannot set ping_group_range, ICMP requests might fail");
a = (const struct pasta_spawn_cmd_arg *)arg;
conf_hostname_len = strlen(a->c->hostname);
if (conf_hostname_len > 0) {
if (sethostname(a->c->hostname, conf_hostname_len))
warn("Unable to set configured hostname");
} else if (!gethostname(hostname + sizeof(HOSTNAME_PREFIX) - 1,
HOST_NAME_MAX + 1 - sizeof(HOSTNAME_PREFIX)) ||
errno == ENAMETOOLONG) {
if (!gethostname(hostname + sizeof(HOSTNAME_PREFIX) - 1,
HOST_NAME_MAX + 1 - sizeof(HOSTNAME_PREFIX)) ||
errno == ENAMETOOLONG) {
hostname[HOST_NAME_MAX] = '\0';
if (sethostname(hostname, strlen(hostname)))
warn("Unable to set pasta-prefixed hostname");
@ -215,6 +206,7 @@ static int pasta_spawn_cmd(void *arg)
sigaddset(&set, SIGUSR1);
sigwaitinfo(&set, NULL);
a = (const struct pasta_spawn_cmd_arg *)arg;
execvp(a->exe, a->argv);
die_perror("Failed to start command or shell");
@ -236,7 +228,6 @@ void pasta_start_ns(struct ctx *c, uid_t uid, gid_t gid,
struct pasta_spawn_cmd_arg arg = {
.exe = argv[0],
.argv = argv,
.c = c,
};
char uidmap[BUFSIZ], gidmap[BUFSIZ];
char *sh_argv[] = { NULL, NULL };
@ -248,11 +239,8 @@ void pasta_start_ns(struct ctx *c, uid_t uid, gid_t gid,
c->quiet = 1;
/* Configure user and group mappings */
if (snprintf_check(uidmap, BUFSIZ, "0 %u 1", uid))
die_perror("Can't build uidmap");
if (snprintf_check(gidmap, BUFSIZ, "0 %u 1", gid))
die_perror("Can't build gidmap");
snprintf(uidmap, BUFSIZ, "0 %u 1", uid);
snprintf(gidmap, BUFSIZ, "0 %u 1", gid);
if (write_file("/proc/self/uid_map", uidmap) ||
write_file("/proc/self/setgroups", "deny") ||
@ -303,7 +291,7 @@ void pasta_ns_conf(struct ctx *c)
rc = nl_link_set_flags(nl_sock_ns, 1 /* lo */, IFF_UP, IFF_UP);
if (rc < 0)
die("Couldn't bring up loopback interface in namespace: %s",
strerror_(-rc));
strerror(-rc));
/* Get or set MAC in target namespace */
if (MAC_IS_ZERO(c->guest_mac))
@ -312,12 +300,12 @@ void pasta_ns_conf(struct ctx *c)
rc = nl_link_set_mac(nl_sock_ns, c->pasta_ifi, c->guest_mac);
if (rc < 0)
die("Couldn't set MAC address in namespace: %s",
strerror_(-rc));
strerror(-rc));
if (c->pasta_conf_ns) {
unsigned int flags = IFF_UP;
if (c->mtu)
if (c->mtu != -1)
nl_link_set_mtu(nl_sock_ns, c->pasta_ifi, c->mtu);
if (c->ifi6) /* Avoid duplicate address detection on link up */
@ -339,7 +327,7 @@ void pasta_ns_conf(struct ctx *c)
if (rc < 0) {
die("Couldn't set IPv4 address(es) in namespace: %s",
strerror_(-rc));
strerror(-rc));
}
if (c->ip4.no_copy_routes) {
@ -353,7 +341,7 @@ void pasta_ns_conf(struct ctx *c)
if (rc < 0) {
die("Couldn't set IPv4 route(s) in guest: %s",
strerror_(-rc));
strerror(-rc));
}
}
@ -362,13 +350,13 @@ void pasta_ns_conf(struct ctx *c)
&c->ip6.addr_ll_seen);
if (rc < 0) {
warn("Can't get LL address from namespace: %s",
strerror_(-rc));
strerror(-rc));
}
rc = nl_addr_set_ll_nodad(nl_sock_ns, c->pasta_ifi);
if (rc < 0) {
warn("Can't set nodad for LL in namespace: %s",
strerror_(-rc));
strerror(-rc));
}
/* We dodged DAD: re-enable neighbour solicitations */
@ -376,11 +364,8 @@ void pasta_ns_conf(struct ctx *c)
0, IFF_NOARP);
if (c->ip6.no_copy_addrs) {
if (!IN6_IS_ADDR_UNSPECIFIED(&c->ip6.addr)) {
rc = nl_addr_set(nl_sock_ns,
c->pasta_ifi, AF_INET6,
&c->ip6.addr, 64);
}
rc = nl_addr_set(nl_sock_ns, c->pasta_ifi,
AF_INET6, &c->ip6.addr, 64);
} else {
rc = nl_addr_dup(nl_sock, c->ifi6,
nl_sock_ns, c->pasta_ifi,
@ -389,7 +374,7 @@ void pasta_ns_conf(struct ctx *c)
if (rc < 0) {
die("Couldn't set IPv6 address(es) in namespace: %s",
strerror_(-rc));
strerror(-rc));
}
if (c->ip6.no_copy_routes) {
@ -404,7 +389,7 @@ void pasta_ns_conf(struct ctx *c)
if (rc < 0) {
die("Couldn't set IPv6 route(s) in guest: %s",
strerror_(-rc));
strerror(-rc));
}
}
}
@ -453,18 +438,18 @@ void pasta_netns_quit_init(const struct ctx *c)
return;
if ((dir_fd = open(c->netns_dir, O_CLOEXEC | O_RDONLY)) < 0)
die("netns dir open: %s, exiting", strerror_(errno));
die("netns dir open: %s, exiting", strerror(errno));
if (fstatfs(dir_fd, &s) || s.f_type == DEVPTS_SUPER_MAGIC ||
s.f_type == PROC_SUPER_MAGIC || s.f_type == SYSFS_MAGIC)
try_inotify = false;
if (try_inotify && (fd = inotify_init1(flags)) < 0)
warn("inotify_init1(): %s, use a timer", strerror_(errno));
warn("inotify_init1(): %s, use a timer", strerror(errno));
if (fd >= 0 && inotify_add_watch(fd, c->netns_dir, IN_DELETE) < 0) {
warn("inotify_add_watch(): %s, use a timer",
strerror_(errno));
strerror(errno));
close(fd);
fd = -1;
}
@ -496,23 +481,17 @@ void pasta_netns_quit_init(const struct ctx *c)
*/
void pasta_netns_quit_inotify_handler(struct ctx *c, int inotify_fd)
{
char buf[sizeof(struct inotify_event) + NAME_MAX + 1]
__attribute__ ((aligned(__alignof__(struct inotify_event))));
const struct inotify_event *ev;
ssize_t n;
char *p;
char buf[sizeof(struct inotify_event) + NAME_MAX + 1];
const struct inotify_event *in_ev = (struct inotify_event *)buf;
if ((n = read(inotify_fd, buf, sizeof(buf))) < (ssize_t)sizeof(*ev))
if (read(inotify_fd, buf, sizeof(buf)) < (ssize_t)sizeof(*in_ev))
return;
for (p = buf; p < buf + n; p += sizeof(*ev) + ev->len) {
ev = (const struct inotify_event *)p;
if (strncmp(in_ev->name, c->netns_base, sizeof(c->netns_base)))
return;
if (!strncmp(ev->name, c->netns_base, sizeof(c->netns_base))) {
info("Namespace %s is gone, exiting", c->netns_base);
_exit(EXIT_SUCCESS);
}
}
info("Namespace %s is gone, exiting", c->netns_base);
exit(EXIT_SUCCESS);
}
/**
@ -538,7 +517,7 @@ void pasta_netns_quit_timer_handler(struct ctx *c, union epoll_ref ref)
return;
info("Namespace %s is gone, exiting", c->netns_base);
_exit(EXIT_SUCCESS);
exit(EXIT_SUCCESS);
}
close(fd);

84
pcap.c
View file

@ -33,12 +33,33 @@
#include "log.h"
#include "pcap.h"
#include "iov.h"
#include "tap.h"
#define PCAP_VERSION_MINOR 4
static int pcap_fd = -1;
/* See pcap.h from libpcap, or pcap-savefile(5) */
static const struct {
uint32_t magic;
#define PCAP_MAGIC 0xa1b2c3d4
uint16_t major;
#define PCAP_VERSION_MAJOR 2
uint16_t minor;
#define PCAP_VERSION_MINOR 4
int32_t thiszone;
uint32_t sigfigs;
uint32_t snaplen;
uint32_t linktype;
#define PCAP_LINKTYPE_ETHERNET 1
} pcap_hdr = {
PCAP_MAGIC, PCAP_VERSION_MAJOR, PCAP_VERSION_MINOR, 0, 0, ETH_MAX_MTU,
PCAP_LINKTYPE_ETHERNET
};
struct pcap_pkthdr {
uint32_t tv_sec;
uint32_t tv_usec;
@ -52,6 +73,8 @@ struct pcap_pkthdr {
* @iovcnt: Number of buffers (@iov entries) in frame
* @offset: Byte offset of the L2 headers within @iov
* @now: Timestamp
*
* Returns: 0 on success, -errno on error writing to the file
*/
static void pcap_frame(const struct iovec *iov, size_t iovcnt,
size_t offset, const struct timespec *now)
@ -63,8 +86,9 @@ static void pcap_frame(const struct iovec *iov, size_t iovcnt,
.caplen = l2len,
.len = l2len
};
struct iovec hiov = { &h, sizeof(h) };
if (write_all_buf(pcap_fd, &h, sizeof(h)) < 0 ||
if (write_remainder(pcap_fd, &hiov, 1, 0) < 0 ||
write_remainder(pcap_fd, iov, iovcnt, offset) < 0)
debug_perror("Cannot log packet, length %zu", l2len);
}
@ -77,14 +101,12 @@ static void pcap_frame(const struct iovec *iov, size_t iovcnt,
void pcap(const char *pkt, size_t l2len)
{
struct iovec iov = { (char *)pkt, l2len };
struct timespec now = { 0 };
struct timespec now;
if (pcap_fd == -1)
return;
if (clock_gettime(CLOCK_REALTIME, &now))
err_perror("Failed to get CLOCK_REALTIME time");
clock_gettime(CLOCK_REALTIME, &now);
pcap_frame(&iov, 1, 0, &now);
}
@ -98,38 +120,36 @@ void pcap(const char *pkt, size_t l2len)
void pcap_multiple(const struct iovec *iov, size_t frame_parts, unsigned int n,
size_t offset)
{
struct timespec now = { 0 };
struct timespec now;
unsigned int i;
if (pcap_fd == -1)
return;
if (clock_gettime(CLOCK_REALTIME, &now))
err_perror("Failed to get CLOCK_REALTIME time");
clock_gettime(CLOCK_REALTIME, &now);
for (i = 0; i < n; i++)
pcap_frame(iov + i * frame_parts, frame_parts, offset, &now);
}
/**
* pcap_iov() - Write packet data described by an I/O vector
/*
* pcap_iov - Write packet data described by an I/O vector
* to a pcap file descriptor.
*
* @iov: Pointer to the array of struct iovec describing the I/O vector
* containing packet data to write, including L2 header
* @iovcnt: Number of buffers (@iov entries)
* @offset: Offset of the L2 frame within the full data length
*/
void pcap_iov(const struct iovec *iov, size_t iovcnt, size_t offset)
/* cppcheck-suppress unusedFunction */
void pcap_iov(const struct iovec *iov, size_t iovcnt)
{
struct timespec now = { 0 };
struct timespec now;
if (pcap_fd == -1)
return;
if (clock_gettime(CLOCK_REALTIME, &now))
err_perror("Failed to get CLOCK_REALTIME time");
pcap_frame(iov, iovcnt, offset, &now);
clock_gettime(CLOCK_REALTIME, &now);
pcap_frame(iov, iovcnt, 0, &now);
}
/**
@ -138,28 +158,7 @@ void pcap_iov(const struct iovec *iov, size_t iovcnt, size_t offset)
*/
void pcap_init(struct ctx *c)
{
/* See pcap.h from libpcap, or pcap-savefile(5) */
#define PCAP_MAGIC 0xa1b2c3d4
#define PCAP_VERSION_MAJOR 2
#define PCAP_VERSION_MINOR 4
#define PCAP_LINKTYPE_ETHERNET 1
const struct {
uint32_t magic;
uint16_t major;
uint16_t minor;
int32_t thiszone;
uint32_t sigfigs;
uint32_t snaplen;
uint32_t linktype;
} pcap_hdr = {
.magic = PCAP_MAGIC,
.major = PCAP_VERSION_MAJOR,
.minor = PCAP_VERSION_MINOR,
.snaplen = tap_l2_max_len(c),
.linktype = PCAP_LINKTYPE_ETHERNET
};
int flags = O_WRONLY | O_CREAT | O_TRUNC;
if (pcap_fd != -1)
return;
@ -167,9 +166,10 @@ void pcap_init(struct ctx *c)
if (!*c->pcap)
return;
pcap_fd = output_file_open(c->pcap, O_WRONLY);
flags |= c->foreground ? O_CLOEXEC : 0;
pcap_fd = open(c->pcap, flags, S_IRUSR | S_IWUSR);
if (pcap_fd == -1) {
err_perror("Couldn't open pcap file %s", c->pcap);
perror("open");
return;
}

2
pcap.h
View file

@ -9,7 +9,7 @@
void pcap(const char *pkt, size_t l2len);
void pcap_multiple(const struct iovec *iov, size_t frame_parts, unsigned int n,
size_t offset);
void pcap_iov(const struct iovec *iov, size_t iovcnt, size_t offset);
void pcap_iov(const struct iovec *iov, size_t iovcnt);
void pcap_init(struct ctx *c);
#endif /* PCAP_H */

42
pif.c
View file

@ -59,45 +59,3 @@ void pif_sockaddr(const struct ctx *c, union sockaddr_inany *sa, socklen_t *sl,
*sl = sizeof(sa->sa6);
}
}
/**
 * pif_sock_l4() - Open a socket bound to an address on a specified interface
 * @c:		Execution context
 * @type:	Socket epoll type
 * @pif:	Interface for this socket
 * @addr:	Address to bind to, or NULL for dual-stack any
 * @ifname:	Interface for binding, NULL for any
 * @port:	Port number to bind to (host byte order)
 * @data:	epoll reference portion for protocol handlers
 *
 * NOTE: For namespace pifs, this must be called having already entered the
 * relevant namespace.
 *
 * Return: newly created socket, negative error code on failure
 */
int pif_sock_l4(const struct ctx *c, enum epoll_type type, uint8_t pif,
		const union inany_addr *addr, const char *ifname,
		in_port_t port, uint32_t data)
{
	/* Default: IPv6 wildcard address, used as is for the dual-stack case */
	union sockaddr_inany sa = {
		.sa6.sin6_family = AF_INET6,
		.sa6.sin6_addr = in6addr_any,
		.sa6.sin6_port = htons(port),
	};
	socklen_t sl = sizeof(sa.sa6);
	bool v6only = false;

	ASSERT(pif_is_socket(pif));

	if (pif == PIF_SPLICE) {
		/* Sanity checks */
		ASSERT(!ifname);
		ASSERT(addr && inany_is_loopback(addr));
	}

	if (addr) {
		/* Specific address: let pif_sockaddr() fill in address and
		 * length, and restrict to IPv6 only if that's what we got
		 */
		pif_sockaddr(c, &sa, &sl, pif, addr, port);
		v6only = sa.sa_family == AF_INET6;
	}

	return sock_l4_sa(c, type, &sa, sl, ifname, v6only, data);
}

3
pif.h
View file

@ -59,8 +59,5 @@ static inline bool pif_is_socket(uint8_t pif)
void pif_sockaddr(const struct ctx *c, union sockaddr_inany *sa, socklen_t *sl,
uint8_t pif, const union inany_addr *addr, in_port_t port);
int pif_sock_l4(const struct ctx *c, enum epoll_type type, uint8_t pif,
const union inany_addr *addr, const char *ifname,
in_port_t port, uint32_t data);
#endif /* PIF_H */

273
repair.c
View file

@ -1,273 +0,0 @@
// SPDX-License-Identifier: GPL-2.0-or-later
/* PASST - Plug A Simple Socket Transport
* for qemu/UNIX domain socket mode
*
* PASTA - Pack A Subtle Tap Abstraction
* for network namespace/tap device mode
*
* repair.c - Interface (server) for passt-repair, set/clear TCP_REPAIR
*
* Copyright (c) 2025 Red Hat GmbH
* Author: Stefano Brivio <sbrivio@redhat.com>
*/
#include <errno.h>
#include <sys/socket.h>
#include <sys/uio.h>
#include "util.h"
#include "ip.h"
#include "passt.h"
#include "inany.h"
#include "flow.h"
#include "flow_table.h"
#include "repair.h"
/* Upper bound on the number of descriptors batched into a single SCM_RIGHTS
 * message by repair_flush(), also sizing @repair_fds below
 */
#define SCM_MAX_FD 253 /* From Linux kernel (include/net/scm.h), not in UAPI */
/* Wait for a while for TCP_REPAIR helper to connect if it's not there yet.
 * The microseconds value is used as SO_RCVTIMEO on the listening socket in
 * repair_wait(), and must stay below one second (checked there)
 */
#define REPAIR_ACCEPT_TIMEOUT_MS 10
#define REPAIR_ACCEPT_TIMEOUT_US (REPAIR_ACCEPT_TIMEOUT_MS * 1000)
/* Pending file descriptors for next repair_flush() call, or command change:
 * filled by repair_set(), sent as ancillary data by repair_flush()
 */
static int repair_fds[SCM_MAX_FD];
/* Pending command: flush pending file descriptors if it changes. This single
 * byte is also the payload sent to the helper, which is expected to echo it
 */
static int8_t repair_cmd;
/* Number of pending file descriptors set in @repair_fds, reset to zero by
 * repair_flush() before the batch is sent
 */
static int repair_nfds;
/**
 * repair_sock_init() - Listen on helper socket and register it with epoll
 * @c:		Execution context
 *
 * Failures are logged only: the socket is simply not watched in that case.
 */
void repair_sock_init(const struct ctx *c)
{
	union epoll_ref ref = { .type = EPOLL_TYPE_REPAIR_LISTEN };
	struct epoll_event ev = { 0 };
	int s = c->fd_repair_listen;

	/* -1: no listening socket to set up */
	if (s == -1)
		return;

	if (listen(s, 0) != 0) {
		err_perror("listen() on repair helper socket, won't migrate");
		return;
	}

	ref.fd = s;
	ev.data.u64 = ref.u64;
	ev.events = EPOLLIN | EPOLLHUP | EPOLLET;

	if (epoll_ctl(c->epollfd, EPOLL_CTL_ADD, s, &ev) != 0)
		err_perror("repair helper socket epoll_ctl(), won't migrate");
}
/**
* repair_listen_handler() - Handle events on TCP_REPAIR helper listening socket
* @c: Execution context
* @events: epoll events
*
* Return: 0 on valid event with new connected socket, error code on failure
*/
int repair_listen_handler(struct ctx *c, uint32_t events)
{
union epoll_ref ref = { .type = EPOLL_TYPE_REPAIR };
struct epoll_event ev = { 0 };
struct ucred ucred;
socklen_t len;
int rc;
if (events != EPOLLIN) {
debug("Spurious event 0x%04x on TCP_REPAIR helper socket",
events);
return EINVAL;
}
len = sizeof(ucred);
/* Another client is already connected: accept and close right away. */
if (c->fd_repair != -1) {
int discard = accept4(c->fd_repair_listen, NULL, NULL,
SOCK_NONBLOCK);
if (discard == -1)
return errno;
if (!getsockopt(discard, SOL_SOCKET, SO_PEERCRED, &ucred, &len))
info("Discarding TCP_REPAIR helper, PID %i", ucred.pid);
close(discard);
return EEXIST;
}
if ((c->fd_repair = accept4(c->fd_repair_listen, NULL, NULL, 0)) < 0) {
rc = errno;
debug_perror("accept4() on TCP_REPAIR helper listening socket");
return rc;
}
if (!getsockopt(c->fd_repair, SOL_SOCKET, SO_PEERCRED, &ucred, &len))
info("Accepted TCP_REPAIR helper, PID %i", ucred.pid);
ref.fd = c->fd_repair;
ev.events = EPOLLHUP | EPOLLET;
ev.data.u64 = ref.u64;
if (epoll_ctl(c->epollfd, EPOLL_CTL_ADD, c->fd_repair, &ev)) {
rc = errno;
debug_perror("epoll_ctl() on TCP_REPAIR helper socket");
close(c->fd_repair);
c->fd_repair = -1;
return rc;
}
return 0;
}
/**
 * repair_close() - Close connection to TCP_REPAIR helper
 * @c:		Execution context
 *
 * Removes the helper socket from the epoll set, closes it, and marks it as
 * not connected (-1) in the context.
 */
void repair_close(struct ctx *c)
{
	int s = c->fd_repair;

	debug("Closing TCP_REPAIR helper socket");

	epoll_ctl(c->epollfd, EPOLL_CTL_DEL, s, NULL);
	close(s);

	c->fd_repair = -1;
}
/**
 * repair_handler() - Handle EPOLLHUP and EPOLLERR on TCP_REPAIR helper socket
 * @c: Execution context
 * @events: epoll events (not inspected: any event closes the helper socket)
 */
void repair_handler(struct ctx *c, uint32_t events)
{
/* Parameter is intentionally unused, the cast documents that */
(void)events;
/* Whatever happened, drop the helper connection: see repair_close() */
repair_close(c);
}
/**
 * repair_wait() - Wait (with timeout) for TCP_REPAIR helper to connect
 * @c:		Execution context
 *
 * Temporarily sets a receive timeout on the listening socket, tries to accept
 * a connection via repair_listen_handler(), then clears the timeout again.
 *
 * Return: 0 on success or if already connected, error code on failure
 */
int repair_wait(struct ctx *c)
{
	const struct timeval timeout = {
		.tv_sec = 0,
		.tv_usec = (long)(REPAIR_ACCEPT_TIMEOUT_US),
	};
	const struct timeval no_timeout = { .tv_sec = 0, .tv_usec = 0 };
	int rc;

	static_assert(REPAIR_ACCEPT_TIMEOUT_US < 1000 * 1000,
		      ".tv_usec is greater than 1000 * 1000");

	/* Helper already connected: nothing to wait for */
	if (c->fd_repair >= 0)
		return 0;

	if (c->fd_repair_listen == -1)
		return ENOENT;

	if (setsockopt(c->fd_repair_listen, SOL_SOCKET, SO_RCVTIMEO,
		       &timeout, sizeof(timeout)) != 0) {
		rc = errno;
		err_perror("Set timeout on TCP_REPAIR listening socket");
		return rc;
	}

	rc = repair_listen_handler(c, EPOLLIN);

	/* Failing to clear the timeout is logged, but the result of the accept
	 * attempt is what we report
	 */
	if (setsockopt(c->fd_repair_listen, SOL_SOCKET, SO_RCVTIMEO,
		       &no_timeout, sizeof(no_timeout)) != 0)
		err_perror("Clear timeout on TCP_REPAIR listening socket");

	return rc;
}
/**
 * repair_flush() - Flush current set of sockets to helper, with current command
 * @c:		Execution context
 *
 * Sends the pending command byte together with all pending file descriptors
 * (as SCM_RIGHTS ancillary data) to the helper, then waits for the helper to
 * echo the command back. The pending set is emptied before sending, whether
 * or not the exchange succeeds. On any failure, the helper connection is
 * closed.
 *
 * Return: 0 on success (or nothing to flush), negative error code on failure:
 *	   -errno from sendmsg() or recv(), -ECONNRESET if the helper closed
 *	   the connection without replying, -ENXIO on unexpected reply
 */
int repair_flush(struct ctx *c)
{
	char buf[CMSG_SPACE(sizeof(int) * SCM_MAX_FD)]
	     __attribute__ ((aligned(__alignof__(struct cmsghdr)))) = { 0 };
	struct iovec iov = { &repair_cmd, sizeof(repair_cmd) };
	struct cmsghdr *cmsg;
	struct msghdr msg;
	int8_t reply = 0;
	ssize_t n;

	if (!repair_nfds)
		return 0;

	msg = (struct msghdr){ .msg_name = NULL, .msg_namelen = 0,
			       .msg_iov = &iov, .msg_iovlen = 1,
			       .msg_control = buf,
			       .msg_controllen = CMSG_SPACE(sizeof(int) *
							    repair_nfds),
			       .msg_flags = 0 };
	cmsg = CMSG_FIRSTHDR(&msg);

	cmsg->cmsg_level = SOL_SOCKET;
	cmsg->cmsg_type = SCM_RIGHTS;
	cmsg->cmsg_len = CMSG_LEN(sizeof(int) * repair_nfds);
	memcpy(CMSG_DATA(cmsg), repair_fds, sizeof(int) * repair_nfds);

	/* The batch is consumed at this point, regardless of the outcome */
	repair_nfds = 0;

	if (sendmsg(c->fd_repair, &msg, 0) < 0) {
		int ret = -errno;
		err_perror("Failed to send sockets to TCP_REPAIR helper");
		repair_close(c);
		return ret;
	}

	n = recv(c->fd_repair, &reply, sizeof(reply), 0);
	if (n < 0) {
		int ret = -errno;
		err_perror("Failed to receive reply from TCP_REPAIR helper");
		repair_close(c);
		return ret;
	}

	/* recv() returning zero means the helper closed the connection: there's
	 * no reply byte to look at, don't compare stale data to the command
	 */
	if (n == 0) {
		err("TCP_REPAIR helper closed connection without replying");
		repair_close(c);
		return -ECONNRESET;
	}

	if (reply != repair_cmd) {
		err("Unexpected reply from TCP_REPAIR helper: %d", reply);
		repair_close(c);
		return -ENXIO;
	}

	return 0;
}
/**
 * repair_set() - Add socket to TCP_REPAIR set with given command
 * @c:		Execution context
 * @s:		Socket to add
 * @cmd:	TCP_REPAIR_ON, TCP_REPAIR_OFF, or TCP_REPAIR_OFF_NO_WP
 *
 * Sockets are batched: the pending batch is flushed first if it was queued
 * for a different command, and right after adding @s if the batch is full.
 *
 * Return: 0 on success, negative error code on failure
 */
int repair_set(struct ctx *c, int s, int cmd)
{
	/* A batch carries a single command: send what's pending on change */
	if (repair_nfds && repair_cmd != cmd) {
		int rc = repair_flush(c);

		if (rc)
			return rc;
	}

	repair_cmd = cmd;
	repair_fds[repair_nfds] = s;
	repair_nfds++;

	/* Still room for more sockets: wait for the next flush */
	if (repair_nfds < SCM_MAX_FD)
		return 0;

	return repair_flush(c);
}

View file

@ -1,17 +0,0 @@
/* SPDX-License-Identifier: GPL-2.0-or-later
* Copyright (c) 2025 Red Hat GmbH
* Author: Stefano Brivio <sbrivio@redhat.com>
*/
#ifndef REPAIR_H
#define REPAIR_H
void repair_sock_init(const struct ctx *c);
int repair_listen_handler(struct ctx *c, uint32_t events);
void repair_handler(struct ctx *c, uint32_t events);
void repair_close(struct ctx *c);
int repair_wait(struct ctx *c);
int repair_flush(struct ctx *c);
int repair_set(struct ctx *c, int s, int cmd);
#endif /* REPAIR_H */

View file

@ -14,23 +14,12 @@
# Author: Stefano Brivio <sbrivio@redhat.com>
TMP="$(mktemp)"
OUT="$(mktemp)"
OUT_FINAL="${1}"
shift
IN="$@"
OUT="$(mktemp)"
[ -z "${ARCH}" ] && ARCH="$(uname -m)"
[ -z "${CC}" ] && CC="cc"
AUDIT_ARCH="AUDIT_ARCH_$(echo ${ARCH} | tr [a-z] [A-Z] \
| sed 's/^ARM.*/ARM/' \
| sed 's/I[456]86/I386/' \
| sed 's/PPC64/PPC/' \
| sed 's/PPCLE/PPC64LE/' \
| sed 's/MIPS64EL/MIPSEL64/' \
| sed 's/HPPA/PARISC/' \
| sed 's/SH4/SH/')"
HEADER="/* This file was automatically generated by $(basename ${0}) */
#ifndef AUDIT_ARCH_PPC64LE
@ -43,7 +32,7 @@ struct sock_filter filter_@PROFILE@[] = {
/* cppcheck-suppress [badBitmaskCheck, unmatchedSuppression] */
BPF_STMT(BPF_LD | BPF_W | BPF_ABS,
(offsetof(struct seccomp_data, arch))),
BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, @AUDIT_ARCH@, 0, @KILL@),
BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, PASST_AUDIT_ARCH, 0, @KILL@),
/* cppcheck-suppress [badBitmaskCheck, unmatchedSuppression] */
BPF_STMT(BPF_LD | BPF_W | BPF_ABS,
(offsetof(struct seccomp_data, nr))),
@ -244,8 +233,7 @@ gen_profile() {
sub ${__i} CALL "NR:${__nr}" "NAME:${__name}" "ALLOW:${__allow}"
done
finish PRE "PROFILE:${__profile}" "KILL:$(( __statements + 1))" \
"AUDIT_ARCH:${AUDIT_ARCH}"
finish PRE "PROFILE:${__profile}" "KILL:$(( __statements + 1))"
}
printf '%s\n' "${HEADER}" > "${OUT}"
@ -255,7 +243,7 @@ for __p in ${__profiles}; do
__calls="${__calls} ${EXTRA_SYSCALLS:-}"
__calls="$(filter ${__calls})"
cols="$(stty -a 2>/dev/null | sed -n 's/.*columns \([0-9]*\).*/\1/p' || :)" 2>/dev/null
cols="$(stty -a | sed -n 's/.*columns \([0-9]*\).*/\1/p' || :)" 2>/dev/null
case $cols in [0-9]*) col_args="-w ${cols}";; *) col_args="";; esac
echo "seccomp profile ${__p} allows: ${__calls}" | tr '\n' ' ' | fmt -t ${col_args}
@ -270,4 +258,4 @@ for __p in ${__profiles}; do
gen_profile "${__p}" ${__calls}
done
mv "${OUT}" "${OUT_FINAL}"
mv "${OUT}" seccomp.h

View file

@ -99,7 +99,7 @@ static inline void siphash_feed(struct siphash_state *state, uint64_t in)
}
/**
* siphash_final() - Finalize SipHash calculations
* siphash_final - Finalize SipHash calculations
* @v: siphash state (4 x 64-bit integers)
* @len: Total length of input data
* @tail: Final data for the hash (<= 7 bytes)

446
tap.c
View file

@ -56,72 +56,16 @@
#include "netlink.h"
#include "pasta.h"
#include "packet.h"
#include "repair.h"
#include "tap.h"
#include "log.h"
#include "vhost_user.h"
#include "vu_common.h"
/* Maximum allowed frame lengths (including L2 header) */
/* Verify that an L2 frame length limit is large enough to contain the header,
* but small enough to fit in the packet pool
*/
#define CHECK_FRAME_LEN(len) \
static_assert((len) >= ETH_HLEN && (len) <= PACKET_MAX_LEN, \
#len " has bad value")
CHECK_FRAME_LEN(L2_MAX_LEN_PASTA);
CHECK_FRAME_LEN(L2_MAX_LEN_PASST);
CHECK_FRAME_LEN(L2_MAX_LEN_VU);
/* We try size the packet pools so that we can use a single batch for the entire
* packet buffer. This might be exceeded for vhost-user, though, which uses its
* own buffers rather than pkt_buf.
*
* This is just a tuning parameter, the code will work with slightly more
* overhead if it's incorrect. So, we estimate based on the minimum practical
* frame size - an empty UDP datagram - rather than the minimum theoretical
* frame size.
*
* FIXME: Profile to work out how big this actually needs to be to amortise
* per-batch syscall overheads
*/
#define TAP_MSGS_IP4 \
DIV_ROUND_UP(sizeof(pkt_buf), \
ETH_HLEN + sizeof(struct iphdr) + sizeof(struct udphdr))
#define TAP_MSGS_IP6 \
DIV_ROUND_UP(sizeof(pkt_buf), \
ETH_HLEN + sizeof(struct ipv6hdr) + sizeof(struct udphdr))
/* IPv4 (plus ARP) and IPv6 message batches from tap/guest to IP handlers */
static PACKET_POOL_NOINIT(pool_tap4, TAP_MSGS_IP4, pkt_buf);
static PACKET_POOL_NOINIT(pool_tap6, TAP_MSGS_IP6, pkt_buf);
static PACKET_POOL_NOINIT(pool_tap4, TAP_MSGS, pkt_buf);
static PACKET_POOL_NOINIT(pool_tap6, TAP_MSGS, pkt_buf);
#define TAP_SEQS 128 /* Different L4 tuples in one batch */
#define FRAGMENT_MSG_RATE 10 /* # seconds between fragment warnings */
/**
 * tap_l2_max_len() - Maximum frame size (including L2 header) for current mode
 * @c:		Execution context
 *
 * Return: frame length limit matching @c->mode, asserts on unknown mode
 */
unsigned long tap_l2_max_len(const struct ctx *c)
{
	/* NOLINTBEGIN(bugprone-branch-clone): values can be the same */
	if (c->mode == MODE_PASST)
		return L2_MAX_LEN_PASST;

	if (c->mode == MODE_PASTA)
		return L2_MAX_LEN_PASTA;

	if (c->mode == MODE_VU)
		return L2_MAX_LEN_VU;
	/* NOLINTEND(bugprone-branch-clone) */

	/* No other mode is expected here */
	ASSERT(0);

	return 0; /* Unreachable, for cppcheck's sake */
}
/**
* tap_send_single() - Send a single frame
* @c: Execution context
@ -134,22 +78,16 @@ void tap_send_single(const struct ctx *c, const void *data, size_t l2len)
struct iovec iov[2];
size_t iovcnt = 0;
switch (c->mode) {
case MODE_PASST:
if (c->mode == MODE_PASST) {
iov[iovcnt] = IOV_OF_LVALUE(vnet_len);
iovcnt++;
/* fall through */
case MODE_PASTA:
iov[iovcnt].iov_base = (void *)data;
iov[iovcnt].iov_len = l2len;
iovcnt++;
tap_send_frames(c, iov, iovcnt, 1);
break;
case MODE_VU:
vu_send_single(c, data, l2len);
break;
}
iov[iovcnt].iov_base = (void *)data;
iov[iovcnt].iov_len = l2len;
iovcnt++;
tap_send_frames(c, iov, iovcnt, 1);
}
/**
@ -175,7 +113,7 @@ const struct in6_addr *tap_ip6_daddr(const struct ctx *c,
*
* Return: pointer at which to write the packet's payload
*/
void *tap_push_l2h(const struct ctx *c, void *buf, uint16_t proto)
static void *tap_push_l2h(const struct ctx *c, void *buf, uint16_t proto)
{
struct ethhdr *eh = (struct ethhdr *)buf;
@ -196,8 +134,8 @@ void *tap_push_l2h(const struct ctx *c, void *buf, uint16_t proto)
*
* Return: pointer at which to write the packet's payload
*/
void *tap_push_ip4h(struct iphdr *ip4h, struct in_addr src,
struct in_addr dst, size_t l4len, uint8_t proto)
static void *tap_push_ip4h(struct iphdr *ip4h, struct in_addr src,
struct in_addr dst, size_t l4len, uint8_t proto)
{
uint16_t l3len = l4len + sizeof(*ip4h);
@ -206,43 +144,13 @@ void *tap_push_ip4h(struct iphdr *ip4h, struct in_addr src,
ip4h->tos = 0;
ip4h->tot_len = htons(l3len);
ip4h->id = 0;
ip4h->frag_off = htons(IP_DF);
ip4h->frag_off = 0;
ip4h->ttl = 255;
ip4h->protocol = proto;
ip4h->saddr = src.s_addr;
ip4h->daddr = dst.s_addr;
ip4h->check = csum_ip4_header(l3len, proto, src, dst);
return (char *)ip4h + sizeof(*ip4h);
}
/**
* tap_push_uh4() - Build UDPv4 header with checksum
* @c: Execution context
* @src: IPv4 source address
* @sport: UDP source port
* @dst: IPv4 destination address
* @dport: UDP destination port
* @in: UDP payload contents (not including UDP header)
* @dlen: UDP payload length (not including UDP header)
*
* Return: pointer at which to write the packet's payload
*/
void *tap_push_uh4(struct udphdr *uh, struct in_addr src, in_port_t sport,
struct in_addr dst, in_port_t dport,
const void *in, size_t dlen)
{
size_t l4len = dlen + sizeof(struct udphdr);
const struct iovec iov = {
.iov_base = (void *)in,
.iov_len = dlen
};
struct iov_tail payload = IOV_TAIL(&iov, 1, 0);
uh->source = htons(sport);
uh->dest = htons(dport);
uh->len = htons(l4len);
csum_udp4(uh, src, dst, &payload);
return (char *)uh + sizeof(*uh);
return ip4h + 1;
}
/**
@ -252,7 +160,7 @@ void *tap_push_uh4(struct udphdr *uh, struct in_addr src, in_port_t sport,
* @sport: UDP source port
* @dst: IPv4 destination address
* @dport: UDP destination port
* @in: UDP payload contents (not including UDP header)
* @in: UDP payload contents (not including UDP header)
* @dlen: UDP payload length (not including UDP header)
*/
void tap_udp4_send(const struct ctx *c, struct in_addr src, in_port_t sport,
@ -263,9 +171,14 @@ void tap_udp4_send(const struct ctx *c, struct in_addr src, in_port_t sport,
char buf[USHRT_MAX];
struct iphdr *ip4h = tap_push_l2h(c, buf, ETH_P_IP);
struct udphdr *uh = tap_push_ip4h(ip4h, src, dst, l4len, IPPROTO_UDP);
char *data = tap_push_uh4(uh, src, sport, dst, dport, in, dlen);
char *data = (char *)(uh + 1);
uh->source = htons(sport);
uh->dest = htons(dport);
uh->len = htons(l4len);
csum_udp4(uh, src, dst, in, dlen);
memcpy(data, in, dlen);
tap_send_single(c, buf, dlen + (data - buf));
}
@ -302,9 +215,10 @@ void tap_icmp4_send(const struct ctx *c, struct in_addr src, struct in_addr dst,
*
* Return: pointer at which to write the packet's payload
*/
void *tap_push_ip6h(struct ipv6hdr *ip6h,
const struct in6_addr *src, const struct in6_addr *dst,
size_t l4len, uint8_t proto, uint32_t flow)
static void *tap_push_ip6h(struct ipv6hdr *ip6h,
const struct in6_addr *src,
const struct in6_addr *dst,
size_t l4len, uint8_t proto, uint32_t flow)
{
ip6h->payload_len = htons(l4len);
ip6h->priority = 0;
@ -313,40 +227,10 @@ void *tap_push_ip6h(struct ipv6hdr *ip6h,
ip6h->hop_limit = 255;
ip6h->saddr = *src;
ip6h->daddr = *dst;
ip6_set_flow_lbl(ip6h, flow);
return (char *)ip6h + sizeof(*ip6h);
}
/**
* tap_push_uh6() - Build UDPv6 header with checksum
* @c: Execution context
* @src: IPv6 source address
* @sport: UDP source port
* @dst: IPv6 destination address
* @dport: UDP destination port
* @flow: Flow label
* @in: UDP payload contents (not including UDP header)
* @dlen: UDP payload length (not including UDP header)
*
* Return: pointer at which to write the packet's payload
*/
void *tap_push_uh6(struct udphdr *uh,
const struct in6_addr *src, in_port_t sport,
const struct in6_addr *dst, in_port_t dport,
void *in, size_t dlen)
{
size_t l4len = dlen + sizeof(struct udphdr);
const struct iovec iov = {
.iov_base = in,
.iov_len = dlen
};
struct iov_tail payload = IOV_TAIL(&iov, 1, 0);
uh->source = htons(sport);
uh->dest = htons(dport);
uh->len = htons(l4len);
csum_udp6(uh, src, dst, &payload);
return (char *)uh + sizeof(*uh);
ip6h->flow_lbl[0] = (flow >> 16) & 0xf;
ip6h->flow_lbl[1] = (flow >> 8) & 0xff;
ip6h->flow_lbl[2] = (flow >> 0) & 0xff;
return ip6h + 1;
}
/**
@ -357,22 +241,27 @@ void *tap_push_uh6(struct udphdr *uh,
* @dst: IPv6 destination address
* @dport: UDP destination port
* @flow: Flow label
* @in: UDP payload contents (not including UDP header)
* @in: UDP payload contents (not including UDP header)
* @dlen: UDP payload length (not including UDP header)
*/
void tap_udp6_send(const struct ctx *c,
const struct in6_addr *src, in_port_t sport,
const struct in6_addr *dst, in_port_t dport,
uint32_t flow, void *in, size_t dlen)
uint32_t flow, const void *in, size_t dlen)
{
size_t l4len = dlen + sizeof(struct udphdr);
char buf[USHRT_MAX];
struct ipv6hdr *ip6h = tap_push_l2h(c, buf, ETH_P_IPV6);
struct udphdr *uh = tap_push_ip6h(ip6h, src, dst,
l4len, IPPROTO_UDP, flow);
char *data = tap_push_uh6(uh, src, sport, dst, dport, in, dlen);
char *data = (char *)(uh + 1);
uh->source = htons(sport);
uh->dest = htons(dport);
uh->len = htons(l4len);
csum_udp6(uh, src, dst, in, dlen);
memcpy(data, in, dlen);
tap_send_single(c, buf, dlen + (data - buf));
}
@ -517,18 +406,10 @@ size_t tap_send_frames(const struct ctx *c, const struct iovec *iov,
if (!nframes)
return 0;
switch (c->mode) {
case MODE_PASTA:
if (c->mode == MODE_PASTA)
m = tap_send_frames_pasta(c, iov, bufs_per_frame, nframes);
break;
case MODE_PASST:
else
m = tap_send_frames_passt(c, iov, bufs_per_frame, nframes);
break;
case MODE_VU:
/* fall through */
default:
ASSERT(0);
}
if (m < nframes)
debug("tap: failed to send %zu frames of %zu",
@ -561,7 +442,6 @@ PACKET_POOL_DECL(pool_l4, UIO_MAXIOV, pkt_buf);
* struct l4_seq4_t - Message sequence for one protocol handler call, IPv4
* @msgs: Count of messages in sequence
* @protocol: Protocol number
* @ttl: Time to live
* @source: Source port
* @dest: Destination port
* @saddr: Source address
@ -570,7 +450,6 @@ PACKET_POOL_DECL(pool_l4, UIO_MAXIOV, pkt_buf);
*/
static struct tap4_l4_t {
uint8_t protocol;
uint8_t ttl;
uint16_t source;
uint16_t dest;
@ -585,17 +464,14 @@ static struct tap4_l4_t {
* struct l4_seq6_t - Message sequence for one protocol handler call, IPv6
* @msgs: Count of messages in sequence
* @protocol: Protocol number
* @flow_lbl: IPv6 flow label
* @source: Source port
* @dest: Destination port
* @saddr: Source address
* @daddr: Destination address
* @hop_limit: Hop limit
* @msg: Array of messages that can be handled in a single call
*/
static struct tap6_l4_t {
uint8_t protocol;
uint32_t flow_lbl :20;
uint16_t source;
uint16_t dest;
@ -603,8 +479,6 @@ static struct tap6_l4_t {
struct in6_addr saddr;
struct in6_addr daddr;
uint8_t hop_limit;
struct pool_l4_t p;
} tap6_l4[TAP_SEQS /* Arbitrary: TAP_MSGS in theory, so limit in users */];
@ -793,8 +667,7 @@ resume:
#define L4_MATCH(iph, uh, seq) \
((seq)->protocol == (iph)->protocol && \
(seq)->source == (uh)->source && (seq)->dest == (uh)->dest && \
(seq)->saddr.s_addr == (iph)->saddr && \
(seq)->daddr.s_addr == (iph)->daddr && (seq)->ttl == (iph)->ttl)
(seq)->saddr.s_addr == (iph)->saddr && (seq)->daddr.s_addr == (iph)->daddr)
#define L4_SET(iph, uh, seq) \
do { \
@ -803,7 +676,6 @@ resume:
(seq)->dest = (uh)->dest; \
(seq)->saddr.s_addr = (iph)->saddr; \
(seq)->daddr.s_addr = (iph)->daddr; \
(seq)->ttl = (iph)->ttl; \
} while (0)
if (seq && L4_MATCH(iph, uh, seq) && seq->p.count < UIO_MAXIOV)
@ -845,14 +717,14 @@ append:
for (k = 0; k < p->count; )
k += tcp_tap_handler(c, PIF_TAP, AF_INET,
&seq->saddr, &seq->daddr,
0, p, k, now);
p, k, now);
} else if (seq->protocol == IPPROTO_UDP) {
if (c->no_udp)
continue;
for (k = 0; k < p->count; )
k += udp_tap_handler(c, PIF_TAP, AF_INET,
&seq->saddr, &seq->daddr,
seq->ttl, p, k, now);
p, k, now);
}
}
@ -923,9 +795,6 @@ resume:
if (IN6_IS_ADDR_UNSPECIFIED(&c->ip6.addr_seen)) {
c->ip6.addr_seen = *saddr;
}
if (IN6_IS_ADDR_UNSPECIFIED(&c->ip6.addr))
c->ip6.addr = *saddr;
} else if (!IN6_IS_ADDR_UNSPECIFIED(saddr)){
c->ip6.addr_seen = *saddr;
}
@ -973,20 +842,16 @@ resume:
((seq)->protocol == (proto) && \
(seq)->source == (uh)->source && \
(seq)->dest == (uh)->dest && \
(seq)->flow_lbl == ip6_get_flow_lbl(ip6h) && \
IN6_ARE_ADDR_EQUAL(&(seq)->saddr, saddr) && \
IN6_ARE_ADDR_EQUAL(&(seq)->daddr, daddr) && \
(seq)->hop_limit == (ip6h)->hop_limit)
IN6_ARE_ADDR_EQUAL(&(seq)->daddr, daddr))
#define L4_SET(ip6h, proto, uh, seq) \
do { \
(seq)->protocol = (proto); \
(seq)->source = (uh)->source; \
(seq)->dest = (uh)->dest; \
(seq)->flow_lbl = ip6_get_flow_lbl(ip6h); \
(seq)->saddr = *saddr; \
(seq)->daddr = *daddr; \
(seq)->hop_limit = (ip6h)->hop_limit; \
} while (0)
if (seq && L4_MATCH(ip6h, proto, uh, seq) &&
@ -1030,14 +895,14 @@ append:
for (k = 0; k < p->count; )
k += tcp_tap_handler(c, PIF_TAP, AF_INET6,
&seq->saddr, &seq->daddr,
seq->flow_lbl, p, k, now);
p, k, now);
} else if (seq->protocol == IPPROTO_UDP) {
if (c->no_udp)
continue;
for (k = 0; k < p->count; )
k += udp_tap_handler(c, PIF_TAP, AF_INET6,
&seq->saddr, &seq->daddr,
seq->hop_limit, p, k, now);
p, k, now);
}
}
@ -1072,10 +937,8 @@ void tap_handler(struct ctx *c, const struct timespec *now)
* @c: Execution context
* @l2len: Total L2 packet length
* @p: Packet buffer
* @now: Current timestamp
*/
void tap_add_packet(struct ctx *c, ssize_t l2len, char *p,
const struct timespec *now)
void tap_add_packet(struct ctx *c, ssize_t l2len, char *p)
{
const struct ethhdr *eh;
@ -1091,17 +954,9 @@ void tap_add_packet(struct ctx *c, ssize_t l2len, char *p,
switch (ntohs(eh->h_proto)) {
case ETH_P_ARP:
case ETH_P_IP:
if (pool_full(pool_tap4)) {
tap4_handler(c, pool_tap4, now);
pool_flush(pool_tap4);
}
packet_add(pool_tap4, l2len, p);
break;
case ETH_P_IPV6:
if (pool_full(pool_tap6)) {
tap6_handler(c, pool_tap6, now);
pool_flush(pool_tap6);
}
packet_add(pool_tap6, l2len, p);
break;
default:
@ -1113,19 +968,17 @@ void tap_add_packet(struct ctx *c, ssize_t l2len, char *p,
* tap_sock_reset() - Handle closing or failure of connect AF_UNIX socket
* @c: Execution context
*/
void tap_sock_reset(struct ctx *c)
static void tap_sock_reset(struct ctx *c)
{
info("Client connection closed%s", c->one_off ? ", exiting" : "");
if (c->one_off)
_exit(EXIT_SUCCESS);
exit(EXIT_SUCCESS);
/* Close the connected socket, wait for a new connection */
epoll_del(c, c->fd_tap);
epoll_ctl(c->epollfd, EPOLL_CTL_DEL, c->fd_tap, NULL);
close(c->fd_tap);
c->fd_tap = -1;
if (c->mode == MODE_VU)
vu_cleanup(c->vdev);
}
/**
@ -1152,7 +1005,7 @@ static void tap_passt_input(struct ctx *c, const struct timespec *now)
do {
n = recv(c->fd_tap, pkt_buf + partial_len,
sizeof(pkt_buf) - partial_len, MSG_DONTWAIT);
TAP_BUF_BYTES - partial_len, MSG_DONTWAIT);
} while ((n < 0) && errno == EINTR);
if (n < 0) {
@ -1169,7 +1022,7 @@ static void tap_passt_input(struct ctx *c, const struct timespec *now)
while (n >= (ssize_t)sizeof(uint32_t)) {
uint32_t l2len = ntohl_unaligned(p);
if (l2len < sizeof(struct ethhdr) || l2len > L2_MAX_LEN_PASST) {
if (l2len < sizeof(struct ethhdr) || l2len > ETH_MAX_MTU) {
err("Bad frame size from guest, resetting connection");
tap_sock_reset(c);
return;
@ -1182,7 +1035,7 @@ static void tap_passt_input(struct ctx *c, const struct timespec *now)
p += sizeof(uint32_t);
n -= sizeof(uint32_t);
tap_add_packet(c, l2len, p, now);
tap_add_packet(c, l2len, p);
p += l2len;
n -= l2len;
@ -1223,10 +1076,8 @@ static void tap_pasta_input(struct ctx *c, const struct timespec *now)
tap_flush_pools();
for (n = 0;
n <= (ssize_t)(sizeof(pkt_buf) - L2_MAX_LEN_PASTA);
n += len) {
len = read(c->fd_tap, pkt_buf + n, L2_MAX_LEN_PASTA);
for (n = 0; n <= (ssize_t)TAP_BUF_BYTES - ETH_MAX_MTU; n += len) {
len = read(c->fd_tap, pkt_buf + n, ETH_MAX_MTU);
if (len == 0) {
die("EOF on tap device, exiting");
@ -1244,10 +1095,10 @@ static void tap_pasta_input(struct ctx *c, const struct timespec *now)
/* Ignore frames of bad length */
if (len < (ssize_t)sizeof(struct ethhdr) ||
len > (ssize_t)L2_MAX_LEN_PASTA)
len > (ssize_t)ETH_MAX_MTU)
continue;
tap_add_packet(c, len, pkt_buf + n, now);
tap_add_packet(c, len, pkt_buf + n);
}
tap_handler(c, now);
@ -1270,35 +1121,70 @@ void tap_handler_pasta(struct ctx *c, uint32_t events,
}
/**
* tap_backend_show_hints() - Give help information to start QEMU
* @c: Execution context
* tap_sock_unix_open() - Create and bind AF_UNIX socket
* @sock_path: Socket path. If empty, set on return (UNIX_SOCK_PATH as prefix)
*
* Return: socket descriptor on success, won't return on failure
*/
static void tap_backend_show_hints(struct ctx *c)
int tap_sock_unix_open(char *sock_path)
{
switch (c->mode) {
case MODE_PASTA:
/* No hints */
break;
case MODE_PASST:
info("\nYou can now start qemu (>= 7.2, with commit 13c6be96618c):");
info(" kvm ... -device virtio-net-pci,netdev=s -netdev stream,id=s,server=off,addr.type=unix,addr.path=%s",
c->sock_path);
info("or qrap, for earlier qemu versions:");
info(" ./qrap 5 kvm ... -net socket,fd=5 -net nic,model=virtio");
break;
case MODE_VU:
info("You can start qemu with:");
info(" kvm ... -chardev socket,id=chr0,path=%s -netdev vhost-user,id=netdev0,chardev=chr0 -device virtio-net,netdev=netdev0 -object memory-backend-memfd,id=memfd0,share=on,size=$RAMSIZE -numa node,memdev=memfd0\n",
c->sock_path);
break;
int fd = socket(AF_UNIX, SOCK_STREAM, 0);
struct sockaddr_un addr = {
.sun_family = AF_UNIX,
};
int i;
if (fd < 0)
die_perror("Failed to open UNIX domain socket");
for (i = 1; i < UNIX_SOCK_MAX; i++) {
char *path = addr.sun_path;
int ex, ret;
if (*sock_path)
memcpy(path, sock_path, UNIX_PATH_MAX);
else
snprintf(path, UNIX_PATH_MAX - 1, UNIX_SOCK_PATH, i);
ex = socket(AF_UNIX, SOCK_STREAM | SOCK_NONBLOCK, 0);
if (ex < 0)
die_perror("Failed to check for UNIX domain conflicts");
ret = connect(ex, (const struct sockaddr *)&addr, sizeof(addr));
if (!ret || (errno != ENOENT && errno != ECONNREFUSED &&
errno != EACCES)) {
if (*sock_path)
die("Socket path %s already in use", path);
close(ex);
continue;
}
close(ex);
unlink(path);
ret = bind(fd, (const struct sockaddr *)&addr, sizeof(addr));
if (*sock_path && ret)
die_perror("Failed to bind UNIX domain socket");
if (!ret)
break;
}
if (i == UNIX_SOCK_MAX)
die_perror("Failed to bind UNIX domain socket");
info("UNIX domain socket bound at %s", addr.sun_path);
if (!*sock_path)
memcpy(sock_path, addr.sun_path, UNIX_PATH_MAX);
return fd;
}
/**
* tap_sock_unix_init() - Start listening for connections on AF_UNIX socket
* @c: Execution context
*/
static void tap_sock_unix_init(const struct ctx *c)
static void tap_sock_unix_init(struct ctx *c)
{
union epoll_ref ref = { .type = EPOLL_TYPE_TAP_LISTEN };
struct epoll_event ev = { 0 };
@ -1309,33 +1195,12 @@ static void tap_sock_unix_init(const struct ctx *c)
ev.events = EPOLLIN | EPOLLET;
ev.data.u64 = ref.u64;
epoll_ctl(c->epollfd, EPOLL_CTL_ADD, c->fd_tap_listen, &ev);
}
/**
* tap_start_connection() - start a new connection
* @c: Execution context
*/
static void tap_start_connection(const struct ctx *c)
{
struct epoll_event ev = { 0 };
union epoll_ref ref = { 0 };
ref.fd = c->fd_tap;
switch (c->mode) {
case MODE_PASST:
ref.type = EPOLL_TYPE_TAP_PASST;
break;
case MODE_PASTA:
ref.type = EPOLL_TYPE_TAP_PASTA;
break;
case MODE_VU:
ref.type = EPOLL_TYPE_VHOST_CMD;
break;
}
ev.events = EPOLLIN | EPOLLRDHUP;
ev.data.u64 = ref.u64;
epoll_ctl(c->epollfd, EPOLL_CTL_ADD, c->fd_tap, &ev);
info("\nYou can now start qemu (>= 7.2, with commit 13c6be96618c):");
info(" kvm ... -device virtio-net-pci,netdev=s -netdev stream,id=s,server=off,addr.type=unix,addr.path=%s",
c->sock_path);
info("or qrap, for earlier qemu versions:");
info(" ./qrap 5 kvm ... -net socket,fd=5 -net nic,model=virtio");
}
/**
@ -1345,6 +1210,8 @@ static void tap_start_connection(const struct ctx *c)
*/
void tap_listen_handler(struct ctx *c, uint32_t events)
{
union epoll_ref ref = { .type = EPOLL_TYPE_TAP_PASST };
struct epoll_event ev = { 0 };
int v = INT_MAX / 2;
struct ucred ucred;
socklen_t len;
@ -1383,7 +1250,10 @@ void tap_listen_handler(struct ctx *c, uint32_t events)
setsockopt(c->fd_tap, SOL_SOCKET, SO_SNDBUF, &v, sizeof(v)))
trace("tap: failed to set SO_SNDBUF to %i", v);
tap_start_connection(c);
ref.fd = c->fd_tap;
ev.events = EPOLLIN | EPOLLRDHUP;
ev.data.u64 = ref.u64;
epoll_ctl(c->epollfd, EPOLL_CTL_ADD, c->fd_tap, &ev);
}
/**
@ -1409,7 +1279,7 @@ static int tap_ns_tun(void *arg)
if (fd < 0)
die_perror("Failed to open() /dev/net/tun");
rc = ioctl(fd, (int)TUNSETIFF, &ifr);
rc = ioctl(fd, TUNSETIFF, &ifr);
if (rc < 0)
die_perror("TUNSETIFF ioctl on /dev/net/tun failed");
@ -1427,61 +1297,58 @@ static int tap_ns_tun(void *arg)
*/
static void tap_sock_tun_init(struct ctx *c)
{
union epoll_ref ref = { .type = EPOLL_TYPE_TAP_PASTA };
struct epoll_event ev = { 0 };
NS_CALL(tap_ns_tun, c);
if (c->fd_tap == -1)
die("Failed to set up tap device in namespace");
pasta_ns_conf(c);
tap_start_connection(c);
ref.fd = c->fd_tap;
ev.events = EPOLLIN | EPOLLRDHUP;
ev.data.u64 = ref.u64;
epoll_ctl(c->epollfd, EPOLL_CTL_ADD, c->fd_tap, &ev);
}
/**
* tap_sock_update_pool() - Set the buffer base and size for the pool of packets
* @base: Buffer base
* @size: Buffer size
*
* Every pool initialised here (the IPv4 and IPv6 tap pools and each
* per-sequence L4 pool) is given the same @base and @size.
*/
void tap_sock_update_pool(void *base, size_t size)
{
int i;
/* Pools that tap_add_packet() fills with IPv4/ARP and IPv6 frames */
pool_tap4_storage = PACKET_INIT(pool_tap4, TAP_MSGS_IP4, base, size);
pool_tap6_storage = PACKET_INIT(pool_tap6, TAP_MSGS_IP6, base, size);
/* One L4 pool per batched sequence slot, for both IP versions */
for (i = 0; i < TAP_SEQS; i++) {
tap4_l4[i].p = PACKET_INIT(pool_l4, UIO_MAXIOV, base, size);
tap6_l4[i].p = PACKET_INIT(pool_l4, UIO_MAXIOV, base, size);
}
}
/**
* tap_backend_init() - Create and set up AF_UNIX socket or
* tuntap file descriptor
* tap_sock_init() - Create and set up AF_UNIX socket or tuntap file descriptor
* @c: Execution context
*/
void tap_backend_init(struct ctx *c)
void tap_sock_init(struct ctx *c)
{
if (c->mode == MODE_VU) {
tap_sock_update_pool(NULL, 0);
vu_init(c);
} else {
tap_sock_update_pool(pkt_buf, sizeof(pkt_buf));
size_t sz = sizeof(pkt_buf);
int i;
pool_tap4_storage = PACKET_INIT(pool_tap4, TAP_MSGS, pkt_buf, sz);
pool_tap6_storage = PACKET_INIT(pool_tap6, TAP_MSGS, pkt_buf, sz);
for (i = 0; i < TAP_SEQS; i++) {
tap4_l4[i].p = PACKET_INIT(pool_l4, UIO_MAXIOV, pkt_buf, sz);
tap6_l4[i].p = PACKET_INIT(pool_l4, UIO_MAXIOV, pkt_buf, sz);
}
if (c->fd_tap != -1) { /* Passed as --fd */
struct epoll_event ev = { 0 };
union epoll_ref ref;
ASSERT(c->one_off);
tap_start_connection(c);
ref.fd = c->fd_tap;
if (c->mode == MODE_PASST)
ref.type = EPOLL_TYPE_TAP_PASST;
else
ref.type = EPOLL_TYPE_TAP_PASTA;
ev.events = EPOLLIN | EPOLLRDHUP;
ev.data.u64 = ref.u64;
epoll_ctl(c->epollfd, EPOLL_CTL_ADD, c->fd_tap, &ev);
return;
}
switch (c->mode) {
case MODE_PASTA:
if (c->mode == MODE_PASTA) {
tap_sock_tun_init(c);
break;
case MODE_VU:
repair_sock_init(c);
/* fall through */
case MODE_PASST:
} else {
tap_sock_unix_init(c);
/* In passt mode, we don't know the guest's MAC address until it
@ -1489,8 +1356,5 @@ void tap_backend_init(struct ctx *c)
* first packets will reach it.
*/
memset(&c->guest_mac, 0xff, sizeof(c->guest_mac));
break;
}
tap_backend_show_hints(c);
}

63
tap.h
View file

@ -6,32 +6,7 @@
#ifndef TAP_H
#define TAP_H
/** L2_MAX_LEN_PASTA - Maximum frame length for pasta mode (with L2 header)
*
* The kernel tuntap device imposes a maximum frame size of 65535 including
* 'hard_header_len' (14 bytes for L2 Ethernet in the case of "tap" mode).
*/
#define L2_MAX_LEN_PASTA USHRT_MAX
/** L2_MAX_LEN_PASST - Maximum frame length for passt mode (with L2 header)
*
* The only structural limit the QEMU socket protocol imposes on frames is
* (2^32-1) bytes, but that would be ludicrously long in practice. For now,
* limit it somewhat arbitrarily to 65535 bytes. FIXME: Work out an appropriate
* limit with more precision.
*/
#define L2_MAX_LEN_PASST USHRT_MAX
/** L2_MAX_LEN_VU - Maximum frame length for vhost-user mode (with L2 header)
*
* vhost-user allows multiple buffers per frame, each of which can be quite
* large, so the inherent frame size limit is rather large. Much larger than is
* actually useful for IP. For now limit arbitrarily to 65535 bytes. FIXME:
* Work out an appropriate limit with more precision.
*/
#define L2_MAX_LEN_VU USHRT_MAX
struct udphdr;
#define ETH_HDR_INIT(proto) { .h_proto = htons_constant(proto) }
/**
* struct tap_hdr - tap backend specific headers
@ -46,8 +21,8 @@ struct tap_hdr {
* @c: Execution context
* @taph: Pointer to tap specific header buffer
*
* Return: a struct iovec covering the correct portion of @taph to use as the
* tap specific header in the current configuration.
* Returns: A struct iovec covering the correct portion of @taph to use as the
* tap specific header in the current configuration.
*/
static inline struct iovec tap_hdr_iov(const struct ctx *c,
struct tap_hdr *thdr)
@ -65,27 +40,9 @@ static inline struct iovec tap_hdr_iov(const struct ctx *c,
*/
static inline void tap_hdr_update(struct tap_hdr *thdr, size_t l2len)
{
if (thdr)
thdr->vnet_len = htonl(l2len);
thdr->vnet_len = htonl(l2len);
}
unsigned long tap_l2_max_len(const struct ctx *c);
void *tap_push_l2h(const struct ctx *c, void *buf, uint16_t proto);
void *tap_push_ip4h(struct iphdr *ip4h, struct in_addr src,
struct in_addr dst, size_t l4len, uint8_t proto);
void *tap_push_uh4(struct udphdr *uh, struct in_addr src, in_port_t sport,
struct in_addr dst, in_port_t dport,
const void *in, size_t dlen);
void *tap_push_uh6(struct udphdr *uh,
const struct in6_addr *src, in_port_t sport,
const struct in6_addr *dst, in_port_t dport,
void *in, size_t dlen);
void *tap_push_ip4h(struct iphdr *ip4h, struct in_addr src,
struct in_addr dst, size_t l4len, uint8_t proto);
void *tap_push_ip6h(struct ipv6hdr *ip6h,
const struct in6_addr *src,
const struct in6_addr *dst,
size_t l4len, uint8_t proto, uint32_t flow);
void tap_udp4_send(const struct ctx *c, struct in_addr src, in_port_t sport,
struct in_addr dst, in_port_t dport,
const void *in, size_t dlen);
@ -93,13 +50,10 @@ void tap_icmp4_send(const struct ctx *c, struct in_addr src, struct in_addr dst,
const void *in, size_t l4len);
const struct in6_addr *tap_ip6_daddr(const struct ctx *c,
const struct in6_addr *src);
void *tap_push_ip6h(struct ipv6hdr *ip6h,
const struct in6_addr *src, const struct in6_addr *dst,
size_t l4len, uint8_t proto, uint32_t flow);
void tap_udp6_send(const struct ctx *c,
const struct in6_addr *src, in_port_t sport,
const struct in6_addr *dst, in_port_t dport,
uint32_t flow, void *in, size_t dlen);
uint32_t flow, const void *in, size_t dlen);
void tap_icmp6_send(const struct ctx *c,
const struct in6_addr *src, const struct in6_addr *dst,
const void *in, size_t l4len);
@ -114,12 +68,9 @@ void tap_handler_pasta(struct ctx *c, uint32_t events,
void tap_handler_passt(struct ctx *c, uint32_t events,
const struct timespec *now);
int tap_sock_unix_open(char *sock_path);
void tap_sock_reset(struct ctx *c);
void tap_sock_update_pool(void *base, size_t size);
void tap_backend_init(struct ctx *c);
void tap_sock_init(struct ctx *c);
void tap_flush_pools(void);
void tap_handler(struct ctx *c, const struct timespec *now);
void tap_add_packet(struct ctx *c, ssize_t l2len, char *p,
const struct timespec *now);
void tap_add_packet(struct ctx *c, ssize_t l2len, char *p);
#endif /* TAP_H */

1759
tcp.c

File diff suppressed because it is too large Load diff

18
tcp.h
View file

@ -10,21 +10,21 @@
struct ctx;
void tcp_timer_handler(const struct ctx *c, union epoll_ref ref);
void tcp_listen_handler(const struct ctx *c, union epoll_ref ref,
void tcp_timer_handler(struct ctx *c, union epoll_ref ref);
void tcp_listen_handler(struct ctx *c, union epoll_ref ref,
const struct timespec *now);
void tcp_sock_handler(const struct ctx *c, union epoll_ref ref,
uint32_t events);
int tcp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af,
const void *saddr, const void *daddr, uint32_t flow_lbl,
void tcp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events);
int tcp_tap_handler(struct ctx *c, uint8_t pif, sa_family_t af,
const void *saddr, const void *daddr,
const struct pool *p, int idx, const struct timespec *now);
int tcp_sock_init(const struct ctx *c, const union inany_addr *addr,
int tcp_sock_init(const struct ctx *c, sa_family_t af, const void *addr,
const char *ifname, in_port_t port);
int tcp_init(struct ctx *c);
void tcp_timer(struct ctx *c, const struct timespec *now);
void tcp_defer_handler(struct ctx *c);
void tcp_update_l2_buf(const unsigned char *eth_d, const unsigned char *eth_s);
int tcp_set_peek_offset(int s, int offset);
extern bool peek_offset_cap;
@ -58,12 +58,16 @@ union tcp_listen_epoll_ref {
* @fwd_in: Port forwarding configuration for inbound packets
* @fwd_out: Port forwarding configuration for outbound packets
* @timer_run: Timestamp of most recent timer run
* @kernel_snd_wnd: Kernel reports sending window (with commit 8f7baad7f035)
* @pipe_size: Size of pipes for spliced connections
*/
struct tcp_ctx {
struct fwd_ports fwd_in;
struct fwd_ports fwd_out;
struct timespec timer_run;
/* This member only exists in builds that define HAS_SND_WND */
#ifdef HAS_SND_WND
int kernel_snd_wnd;
#endif
size_t pipe_size;
};

422
tcp_buf.c
View file

@ -20,7 +20,7 @@
#include <netinet/ip.h>
#include <netinet/tcp.h>
#include <linux/tcp.h>
#include "util.h"
#include "ip.h"
@ -38,32 +38,88 @@
(c->mode == MODE_PASTA ? 1 : TCP_FRAMES_MEM)
/* Static buffers */
/**
* struct tcp_payload_t - TCP header and data to send segments with payload
* @th: TCP header
* @data: TCP data
*
* @data is sized so that the TCP header and the data together occupy
* IP_MAX_MTU bytes (the struct is packed).
*/
struct tcp_payload_t {
struct tcphdr th;
uint8_t data[IP_MAX_MTU - sizeof(struct tcphdr)];
/* Only the alignment differs between the two build variants below */
#ifdef __AVX2__
} __attribute__ ((packed, aligned(32))); /* For AVX2 checksum routines */
#else
} __attribute__ ((packed, aligned(__alignof__(unsigned int))));
#endif
/* Ethernet header for IPv4 and IPv6 frames */
/**
* struct tcp_flags_t - TCP header and data to send zero-length
* segments (flags)
* @th: TCP header
* @opts: TCP options
*
* @opts holds OPT_MSS_LEN + OPT_WS_LEN + 1 bytes.
*/
struct tcp_flags_t {
struct tcphdr th;
char opts[OPT_MSS_LEN + OPT_WS_LEN + 1];
/* Same alignment choice as struct tcp_payload_t: 32 bytes with AVX2 */
#ifdef __AVX2__
} __attribute__ ((packed, aligned(32)));
#else
} __attribute__ ((packed, aligned(__alignof__(unsigned int))));
#endif
/* Ethernet header for IPv4 frames */
static struct ethhdr tcp4_eth_src;
static struct ethhdr tcp6_eth_src;
static struct tap_hdr tcp_payload_tap_hdr[TCP_FRAMES_MEM];
static struct tap_hdr tcp4_payload_tap_hdr[TCP_FRAMES_MEM];
/* IPv4 headers */
static struct iphdr tcp4_payload_ip[TCP_FRAMES_MEM];
/* TCP segments with payload for IPv4 frames */
static struct tcp_payload_t tcp4_payload[TCP_FRAMES_MEM];
/* IP headers for IPv4 and IPv6 */
struct iphdr tcp4_payload_ip[TCP_FRAMES_MEM];
struct ipv6hdr tcp6_payload_ip[TCP_FRAMES_MEM];
/* TCP segments with payload for IPv4 and IPv6 frames */
static struct tcp_payload_t tcp_payload[TCP_FRAMES_MEM];
static_assert(MSS4 <= sizeof(tcp_payload[0].data), "MSS4 is greater than 65516");
static_assert(MSS6 <= sizeof(tcp_payload[0].data), "MSS6 is greater than 65516");
static_assert(MSS4 <= sizeof(tcp4_payload[0].data), "MSS4 is greater than 65516");
/* References tracking the owner connection of frames in the tap outqueue */
static struct tcp_tap_conn *tcp_frame_conns[TCP_FRAMES_MEM];
static unsigned int tcp_payload_used;
static struct tcp_tap_conn *tcp4_frame_conns[TCP_FRAMES_MEM];
static unsigned int tcp4_payload_used;
static struct tap_hdr tcp4_flags_tap_hdr[TCP_FRAMES_MEM];
/* IPv4 headers for TCP segment without payload */
static struct iphdr tcp4_flags_ip[TCP_FRAMES_MEM];
/* TCP segments without payload for IPv4 frames */
static struct tcp_flags_t tcp4_flags[TCP_FRAMES_MEM];
static unsigned int tcp4_flags_used;
/* Ethernet header for IPv6 frames */
static struct ethhdr tcp6_eth_src;
static struct tap_hdr tcp6_payload_tap_hdr[TCP_FRAMES_MEM];
/* IPv6 headers */
static struct ipv6hdr tcp6_payload_ip[TCP_FRAMES_MEM];
/* TCP headers and data for IPv6 frames */
static struct tcp_payload_t tcp6_payload[TCP_FRAMES_MEM];
static_assert(MSS6 <= sizeof(tcp6_payload[0].data), "MSS6 is greater than 65516");
/* References tracking the owner connection of frames in the tap outqueue */
static struct tcp_tap_conn *tcp6_frame_conns[TCP_FRAMES_MEM];
static unsigned int tcp6_payload_used;
static struct tap_hdr tcp6_flags_tap_hdr[TCP_FRAMES_MEM];
/* IPv6 headers for TCP segment without payload */
static struct ipv6hdr tcp6_flags_ip[TCP_FRAMES_MEM];
/* TCP segment without payload for IPv6 frames */
static struct tcp_flags_t tcp6_flags[TCP_FRAMES_MEM];
static unsigned int tcp6_flags_used;
/* recvmsg()/sendmsg() data for tap */
static struct iovec iov_sock [TCP_FRAMES_MEM + 1];
static struct iovec tcp_l2_iov[TCP_FRAMES_MEM][TCP_NUM_IOVS];
static struct iovec tcp4_l2_iov [TCP_FRAMES_MEM][TCP_NUM_IOVS];
static struct iovec tcp6_l2_iov [TCP_FRAMES_MEM][TCP_NUM_IOVS];
static struct iovec tcp4_l2_flags_iov [TCP_FRAMES_MEM][TCP_NUM_IOVS];
static struct iovec tcp6_l2_flags_iov [TCP_FRAMES_MEM][TCP_NUM_IOVS];
/**
* tcp_update_l2_buf() - Update Ethernet header buffers with addresses
* @eth_d: Ethernet destination address, NULL if unchanged
@ -76,40 +132,115 @@ void tcp_update_l2_buf(const unsigned char *eth_d, const unsigned char *eth_s)
}
/**
* tcp_sock_iov_init() - Initialise scatter-gather L2 buffers for IPv4 sockets
* tcp_sock4_iov_init() - Initialise scatter-gather L2 buffers for IPv4 sockets
* @c: Execution context
*/
void tcp_sock_iov_init(const struct ctx *c)
void tcp_sock4_iov_init(const struct ctx *c)
{
struct ipv6hdr ip6 = L2_BUF_IP6_INIT(IPPROTO_TCP);
struct iphdr iph = L2_BUF_IP4_INIT(IPPROTO_TCP);
struct iovec *iov;
int i;
tcp6_eth_src.h_proto = htons_constant(ETH_P_IPV6);
tcp4_eth_src.h_proto = htons_constant(ETH_P_IP);
for (i = 0; i < ARRAY_SIZE(tcp_payload); i++) {
tcp6_payload_ip[i] = ip6;
for (i = 0; i < ARRAY_SIZE(tcp4_payload); i++) {
tcp4_payload_ip[i] = iph;
tcp4_payload[i].th.doff = sizeof(struct tcphdr) / 4;
tcp4_payload[i].th.ack = 1;
}
for (i = 0; i < ARRAY_SIZE(tcp4_flags); i++) {
tcp4_flags_ip[i] = iph;
tcp4_flags[i].th.doff = sizeof(struct tcphdr) / 4;
tcp4_flags[i].th.ack = 1;
}
for (i = 0; i < TCP_FRAMES_MEM; i++) {
struct iovec *iov = tcp_l2_iov[i];
iov = tcp4_l2_iov[i];
iov[TCP_IOV_TAP] = tap_hdr_iov(c, &tcp_payload_tap_hdr[i]);
iov[TCP_IOV_ETH].iov_len = sizeof(struct ethhdr);
iov[TCP_IOV_PAYLOAD].iov_base = &tcp_payload[i];
iov[TCP_IOV_TAP] = tap_hdr_iov(c, &tcp4_payload_tap_hdr[i]);
iov[TCP_IOV_ETH] = IOV_OF_LVALUE(tcp4_eth_src);
iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp4_payload_ip[i]);
iov[TCP_IOV_PAYLOAD].iov_base = &tcp4_payload[i];
}
for (i = 0; i < TCP_FRAMES_MEM; i++) {
iov = tcp4_l2_flags_iov[i];
iov[TCP_IOV_TAP] = tap_hdr_iov(c, &tcp4_flags_tap_hdr[i]);
iov[TCP_IOV_ETH].iov_base = &tcp4_eth_src;
iov[TCP_IOV_ETH] = IOV_OF_LVALUE(tcp4_eth_src);
iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp4_flags_ip[i]);
iov[TCP_IOV_PAYLOAD].iov_base = &tcp4_flags[i];
}
}
/**
* tcp_revert_seq() - Revert affected conn->seq_to_tap after failed transmission
* tcp_sock6_iov_init() - Initialise scatter-gather L2 buffers for IPv6 sockets
* @c: Execution context
*/
void tcp_sock6_iov_init(const struct ctx *c)
{
struct ipv6hdr ip6 = L2_BUF_IP6_INIT(IPPROTO_TCP);
struct iovec *iov;
int i;
tcp6_eth_src.h_proto = htons_constant(ETH_P_IPV6);
for (i = 0; i < ARRAY_SIZE(tcp6_payload); i++) {
tcp6_payload_ip[i] = ip6;
tcp6_payload[i].th.doff = sizeof(struct tcphdr) / 4;
tcp6_payload[i].th.ack = 1;
}
for (i = 0; i < ARRAY_SIZE(tcp6_flags); i++) {
tcp6_flags_ip[i] = ip6;
tcp6_flags[i].th.doff = sizeof(struct tcphdr) / 4;
tcp6_flags[i].th .ack = 1;
}
for (i = 0; i < TCP_FRAMES_MEM; i++) {
iov = tcp6_l2_iov[i];
iov[TCP_IOV_TAP] = tap_hdr_iov(c, &tcp6_payload_tap_hdr[i]);
iov[TCP_IOV_ETH] = IOV_OF_LVALUE(tcp6_eth_src);
iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp6_payload_ip[i]);
iov[TCP_IOV_PAYLOAD].iov_base = &tcp6_payload[i];
}
for (i = 0; i < TCP_FRAMES_MEM; i++) {
iov = tcp6_l2_flags_iov[i];
iov[TCP_IOV_TAP] = tap_hdr_iov(c, &tcp6_flags_tap_hdr[i]);
iov[TCP_IOV_ETH] = IOV_OF_LVALUE(tcp6_eth_src);
iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp6_flags_ip[i]);
iov[TCP_IOV_PAYLOAD].iov_base = &tcp6_flags[i];
}
}
/**
 * tcp_flags_flush() - Send out buffers for segments with no data (flags)
 * @c:		Execution context
 *
 * The queued IPv6 frames are handed to the tap first, then the IPv4 ones. The
 * result of tap_send_frames() is not checked: each counter is reset to zero
 * regardless of how many frames were actually sent.
 */
void tcp_flags_flush(const struct ctx *c)
{
	/* Row 0 of each table decays to a pointer to its first iovec */
	tap_send_frames(c, tcp6_l2_flags_iov[0], TCP_NUM_IOVS, tcp6_flags_used);
	tcp6_flags_used = 0;

	tap_send_frames(c, tcp4_l2_flags_iov[0], TCP_NUM_IOVS, tcp4_flags_used);
	tcp4_flags_used = 0;
}
/**
* tcp_revert_seq() - Revert affected conn->seq_to_tap after failed transmission
* @c: Execution context
* @conns: Array of connection pointers corresponding to queued frames
* @frames: Two-dimensional array containing queued frames with sub-iovs
* @num_frames: Number of entries in the two arrays to be compared
*/
static void tcp_revert_seq(const struct ctx *c, struct tcp_tap_conn **conns,
static void tcp_revert_seq(struct ctx *c, struct tcp_tap_conn **conns,
struct iovec (*frames)[TCP_NUM_IOVS], int num_frames)
{
int i;
@ -125,55 +256,34 @@ static void tcp_revert_seq(const struct ctx *c, struct tcp_tap_conn **conns,
conn->seq_to_tap = seq;
peek_offset = conn->seq_to_tap - conn->seq_ack_from_tap;
if (tcp_set_peek_offset(conn, peek_offset))
if (tcp_set_peek_offset(conn->sock, peek_offset))
tcp_rst(c, conn);
}
}
/**
* tcp_payload_flush() - Send out buffers for segments with data or flags
* tcp_payload_flush() - Send out buffers for segments with data
* @c: Execution context
*/
void tcp_payload_flush(const struct ctx *c)
void tcp_payload_flush(struct ctx *c)
{
size_t m;
m = tap_send_frames(c, &tcp_l2_iov[0][0], TCP_NUM_IOVS,
tcp_payload_used);
if (m != tcp_payload_used) {
tcp_revert_seq(c, &tcp_frame_conns[m], &tcp_l2_iov[m],
tcp_payload_used - m);
m = tap_send_frames(c, &tcp6_l2_iov[0][0], TCP_NUM_IOVS,
tcp6_payload_used);
if (m != tcp6_payload_used) {
tcp_revert_seq(c, &tcp6_frame_conns[m], &tcp6_l2_iov[m],
tcp6_payload_used - m);
}
tcp_payload_used = 0;
}
tcp6_payload_used = 0;
/**
* tcp_l2_buf_fill_headers() - Fill 802.3, IP, TCP headers in pre-cooked buffers
* @conn: Connection pointer
* @iov: Pointer to an array of iovec of TCP pre-cooked buffers
* @check: Checksum, if already known
* @seq: Sequence number for this segment
* @no_tcp_csum: Do not set TCP checksum
*/
static void tcp_l2_buf_fill_headers(const struct tcp_tap_conn *conn,
struct iovec *iov, const uint16_t *check,
uint32_t seq, bool no_tcp_csum)
{
struct iov_tail tail = IOV_TAIL(&iov[TCP_IOV_PAYLOAD], 1, 0);
struct tcphdr *th = IOV_REMOVE_HEADER(&tail, struct tcphdr);
struct tap_hdr *taph = iov[TCP_IOV_TAP].iov_base;
const struct flowside *tapside = TAPFLOW(conn);
const struct in_addr *a4 = inany_v4(&tapside->oaddr);
struct ipv6hdr *ip6h = NULL;
struct iphdr *ip4h = NULL;
if (a4)
ip4h = iov[TCP_IOV_IP].iov_base;
else
ip6h = iov[TCP_IOV_IP].iov_base;
tcp_fill_headers(conn, taph, ip4h, ip6h, th, &tail,
check, seq, no_tcp_csum);
m = tap_send_frames(c, &tcp4_l2_iov[0][0], TCP_NUM_IOVS,
tcp4_payload_used);
if (m != tcp4_payload_used) {
tcp_revert_seq(c, &tcp4_frame_conns[m], &tcp4_l2_iov[m],
tcp4_payload_used - m);
}
tcp4_payload_used = 0;
}
/**
@ -184,50 +294,58 @@ static void tcp_l2_buf_fill_headers(const struct tcp_tap_conn *conn,
*
* Return: negative error code on connection reset, 0 otherwise
*/
int tcp_buf_send_flag(const struct ctx *c, struct tcp_tap_conn *conn, int flags)
int tcp_buf_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags)
{
struct tcp_payload_t *payload;
struct tcp_flags_t *payload;
struct iovec *iov;
size_t optlen;
size_t l4len;
uint32_t seq;
int ret;
iov = tcp_l2_iov[tcp_payload_used];
if (CONN_V4(conn)) {
iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp4_payload_ip[tcp_payload_used]);
iov[TCP_IOV_ETH].iov_base = &tcp4_eth_src;
} else {
iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp6_payload_ip[tcp_payload_used]);
iov[TCP_IOV_ETH].iov_base = &tcp6_eth_src;
}
if (CONN_V4(conn))
iov = tcp4_l2_flags_iov[tcp4_flags_used++];
else
iov = tcp6_l2_flags_iov[tcp6_flags_used++];
payload = iov[TCP_IOV_PAYLOAD].iov_base;
seq = conn->seq_to_tap;
ret = tcp_prepare_flags(c, conn, flags, &payload->th,
(struct tcp_syn_opts *)&payload->data, &optlen);
if (ret <= 0)
payload->opts, &optlen);
if (ret <= 0) {
if (CONN_V4(conn))
tcp4_flags_used--;
else
tcp6_flags_used--;
return ret;
tcp_payload_used++;
l4len = optlen + sizeof(struct tcphdr);
iov[TCP_IOV_PAYLOAD].iov_len = l4len;
tcp_l2_buf_fill_headers(conn, iov, NULL, seq, false);
if (flags & DUP_ACK) {
struct iovec *dup_iov = tcp_l2_iov[tcp_payload_used++];
memcpy(dup_iov[TCP_IOV_TAP].iov_base, iov[TCP_IOV_TAP].iov_base,
iov[TCP_IOV_TAP].iov_len);
dup_iov[TCP_IOV_ETH].iov_base = iov[TCP_IOV_ETH].iov_base;
dup_iov[TCP_IOV_IP] = iov[TCP_IOV_IP];
memcpy(dup_iov[TCP_IOV_PAYLOAD].iov_base,
iov[TCP_IOV_PAYLOAD].iov_base, l4len);
dup_iov[TCP_IOV_PAYLOAD].iov_len = l4len;
}
if (tcp_payload_used > TCP_FRAMES_MEM - 2)
tcp_payload_flush(c);
l4len = tcp_l2_buf_fill_headers(conn, iov, optlen, NULL, seq);
iov[TCP_IOV_PAYLOAD].iov_len = l4len;
if (flags & DUP_ACK) {
struct iovec *dup_iov;
int i;
if (CONN_V4(conn))
dup_iov = tcp4_l2_flags_iov[tcp4_flags_used++];
else
dup_iov = tcp6_l2_flags_iov[tcp6_flags_used++];
for (i = 0; i < TCP_NUM_IOVS; i++)
memcpy(dup_iov[i].iov_base, iov[i].iov_base,
iov[i].iov_len);
dup_iov[TCP_IOV_PAYLOAD].iov_len = iov[TCP_IOV_PAYLOAD].iov_len;
}
if (CONN_V4(conn)) {
if (tcp4_flags_used > TCP_FRAMES_MEM - 2)
tcp_flags_flush(c);
} else {
if (tcp6_flags_used > TCP_FRAMES_MEM - 2)
tcp_flags_flush(c);
}
return 0;
}
@ -239,41 +357,40 @@ int tcp_buf_send_flag(const struct ctx *c, struct tcp_tap_conn *conn, int flags)
* @dlen: TCP payload length
* @no_csum: Don't compute IPv4 checksum, use the one from previous buffer
* @seq: Sequence number to be sent
* @push: Set PSH flag, last segment in a batch
*/
static void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn,
ssize_t dlen, int no_csum, uint32_t seq, bool push)
static void tcp_data_to_tap(struct ctx *c, struct tcp_tap_conn *conn,
ssize_t dlen, int no_csum, uint32_t seq)
{
struct tcp_payload_t *payload;
const uint16_t *check = NULL;
struct iovec *iov;
size_t l4len;
conn->seq_to_tap = seq + dlen;
tcp_frame_conns[tcp_payload_used] = conn;
iov = tcp_l2_iov[tcp_payload_used];
if (CONN_V4(conn)) {
if (no_csum) {
struct iovec *iov_prev = tcp_l2_iov[tcp_payload_used - 1];
struct iphdr *iph = iov_prev[TCP_IOV_IP].iov_base;
if (CONN_V4(conn)) {
struct iovec *iov_prev = tcp4_l2_iov[tcp4_payload_used - 1];
const uint16_t *check = NULL;
if (no_csum) {
struct iphdr *iph = iov_prev[TCP_IOV_IP].iov_base;
check = &iph->check;
}
iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp4_payload_ip[tcp_payload_used]);
iov[TCP_IOV_ETH].iov_base = &tcp4_eth_src;
tcp4_frame_conns[tcp4_payload_used] = conn;
iov = tcp4_l2_iov[tcp4_payload_used++];
l4len = tcp_l2_buf_fill_headers(conn, iov, dlen, check, seq);
iov[TCP_IOV_PAYLOAD].iov_len = l4len;
if (tcp4_payload_used > TCP_FRAMES_MEM - 1)
tcp_payload_flush(c);
} else if (CONN_V6(conn)) {
iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp6_payload_ip[tcp_payload_used]);
iov[TCP_IOV_ETH].iov_base = &tcp6_eth_src;
tcp6_frame_conns[tcp6_payload_used] = conn;
iov = tcp6_l2_iov[tcp6_payload_used++];
l4len = tcp_l2_buf_fill_headers(conn, iov, dlen, NULL, seq);
iov[TCP_IOV_PAYLOAD].iov_len = l4len;
if (tcp6_payload_used > TCP_FRAMES_MEM - 1)
tcp_payload_flush(c);
}
payload = iov[TCP_IOV_PAYLOAD].iov_base;
payload->th.th_off = sizeof(struct tcphdr) / 4;
payload->th.th_x2 = 0;
payload->th.th_flags = 0;
payload->th.ack = 1;
payload->th.psh = push;
iov[TCP_IOV_PAYLOAD].iov_len = dlen + sizeof(struct tcphdr);
tcp_l2_buf_fill_headers(conn, iov, check, seq, false);
if (++tcp_payload_used > TCP_FRAMES_MEM - 1)
tcp_payload_flush(c);
}
/**
@ -285,11 +402,12 @@ static void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn,
*
* #syscalls recvmsg
*/
int tcp_buf_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn)
int tcp_buf_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn)
{
uint32_t wnd_scaled = conn->wnd_from_tap << conn->ws_from_tap;
int fill_bufs, send_bufs = 0, last_len, iov_rem = 0;
int len, dlen, i, s = conn->sock;
int sendlen, len, dlen, v4 = CONN_V4(conn);
int s = conn->sock, i, ret = 0;
struct msghdr mh_sock = { 0 };
uint16_t mss = MSS_GET(conn);
uint32_t already_sent, seq;
@ -304,14 +422,13 @@ int tcp_buf_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn)
conn->seq_ack_from_tap, conn->seq_to_tap);
conn->seq_to_tap = conn->seq_ack_from_tap;
already_sent = 0;
if (tcp_set_peek_offset(conn, 0)) {
if (tcp_set_peek_offset(s, 0)) {
tcp_rst(c, conn);
return -1;
}
}
if (!wnd_scaled || already_sent >= wnd_scaled) {
conn_flag(c, conn, ACK_FROM_TAP_BLOCKS);
conn_flag(c, conn, STALLED);
conn_flag(c, conn, ACK_FROM_TAP_DUE);
return 0;
@ -337,15 +454,19 @@ int tcp_buf_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn)
mh_sock.msg_iovlen = fill_bufs;
}
if (tcp_payload_used + fill_bufs > TCP_FRAMES_MEM) {
if (( v4 && tcp4_payload_used + fill_bufs > TCP_FRAMES_MEM) ||
(!v4 && tcp6_payload_used + fill_bufs > TCP_FRAMES_MEM)) {
tcp_payload_flush(c);
/* Silence Coverity CWE-125 false positive */
tcp_payload_used = 0;
tcp4_payload_used = tcp6_payload_used = 0;
}
for (i = 0, iov = iov_sock + 1; i < fill_bufs; i++, iov++) {
iov->iov_base = &tcp_payload[tcp_payload_used + i].data;
if (v4)
iov->iov_base = &tcp4_payload[tcp4_payload_used + i].data;
else
iov->iov_base = &tcp6_payload[tcp6_payload_used + i].data;
iov->iov_len = mss;
}
if (iov_rem)
@ -356,22 +477,12 @@ int tcp_buf_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn)
len = recvmsg(s, &mh_sock, MSG_PEEK);
while (len < 0 && errno == EINTR);
if (len < 0) {
if (errno != EAGAIN && errno != EWOULDBLOCK) {
tcp_rst(c, conn);
return -errno;
}
if (already_sent) /* No new data and EAGAIN: set EPOLLET */
conn_flag(c, conn, STALLED);
return 0;
}
if (len < 0)
goto err;
if (!len) {
if ((conn->events & (SOCK_FIN_RCVD | TAP_FIN_SENT)) == SOCK_FIN_RCVD) {
int ret = tcp_buf_send_flag(c, conn, FIN | ACK);
if (ret) {
if ((ret = tcp_buf_send_flag(c, conn, FIN | ACK))) {
tcp_rst(c, conn);
return ret;
}
@ -382,40 +493,45 @@ int tcp_buf_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn)
return 0;
}
sendlen = len;
if (!peek_offset_cap)
len -= already_sent;
sendlen -= already_sent;
if (len <= 0) {
if (sendlen <= 0) {
conn_flag(c, conn, STALLED);
return 0;
}
conn_flag(c, conn, ~ACK_FROM_TAP_BLOCKS);
conn_flag(c, conn, ~STALLED);
send_bufs = DIV_ROUND_UP(len, mss);
last_len = len - (send_bufs - 1) * mss;
send_bufs = DIV_ROUND_UP(sendlen, mss);
last_len = sendlen - (send_bufs - 1) * mss;
/* Likely, some new data was acked too. */
tcp_update_seqack_wnd(c, conn, false, NULL);
tcp_update_seqack_wnd(c, conn, 0, NULL);
/* Finally, queue to tap */
dlen = mss;
seq = conn->seq_to_tap;
for (i = 0; i < send_bufs; i++) {
int no_csum = i && i != send_bufs - 1 && tcp_payload_used;
bool push = false;
int no_csum = i && i != send_bufs - 1 && tcp4_payload_used;
if (i == send_bufs - 1) {
if (i == send_bufs - 1)
dlen = last_len;
push = true;
}
tcp_data_to_tap(c, conn, dlen, no_csum, seq, push);
tcp_data_to_tap(c, conn, dlen, no_csum, seq);
seq += dlen;
}
conn_flag(c, conn, ACK_FROM_TAP_DUE);
return 0;
err:
if (errno != EAGAIN && errno != EWOULDBLOCK) {
ret = -errno;
tcp_rst(c, conn);
}
return ret;
}

View file

@ -6,9 +6,11 @@
#ifndef TCP_BUF_H
#define TCP_BUF_H
void tcp_sock_iov_init(const struct ctx *c);
void tcp_payload_flush(const struct ctx *c);
int tcp_buf_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn);
int tcp_buf_send_flag(const struct ctx *c, struct tcp_tap_conn *conn, int flags);
void tcp_sock4_iov_init(const struct ctx *c);
void tcp_sock6_iov_init(const struct ctx *c);
void tcp_flags_flush(const struct ctx *c);
void tcp_payload_flush(struct ctx *c);
int tcp_buf_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn);
int tcp_buf_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags);
#endif /*TCP_BUF_H */

View file

@ -19,7 +19,6 @@
* @tap_mss: MSS advertised by tap/guest, rounded to 2 ^ TCP_MSS_BITS
* @sock: Socket descriptor number
* @events: Connection events, implying connection states
* @listening_sock: Listening socket this socket was accept()ed from, or -1
* @timer: timerfd descriptor for timeout events
* @flags: Connection flags representing internal attributes
* @sndbuf: Sending buffer in kernel, rounded to 2 ^ SNDBUF_BITS
@ -69,7 +68,6 @@ struct tcp_tap_conn {
#define CONN_STATE_BITS /* Setting these clears other flags */ \
(SOCK_ACCEPTED | TAP_SYN_RCVD | ESTABLISHED)
int listening_sock;
int timer :FD_REF_BITS;
@ -79,7 +77,6 @@ struct tcp_tap_conn {
#define ACTIVE_CLOSE BIT(2)
#define ACK_TO_TAP_DUE BIT(3)
#define ACK_FROM_TAP_DUE BIT(4)
#define ACK_FROM_TAP_BLOCKS BIT(5)
#define SNDBUF_BITS 24
unsigned int sndbuf :SNDBUF_BITS;
@ -98,95 +95,6 @@ struct tcp_tap_conn {
uint32_t seq_init_from_tap;
};
/**
* struct tcp_tap_transfer - Migrated TCP data, flow table part, network order
* @pif: Interfaces for each side of the flow
* @side: Addresses and ports for each side of the flow
* @retrans: Number of retransmissions occurred due to ACK_TIMEOUT
* @ws_from_tap: Window scaling factor advertised from tap/guest
* @ws_to_tap: Window scaling factor advertised to tap/guest
* @events: Connection events, implying connection states
* @tap_mss: MSS advertised by tap/guest, rounded to 2 ^ TCP_MSS_BITS
* @sndbuf: Sending buffer in kernel, rounded to 2 ^ SNDBUF_BITS
* @flags: Connection flags representing internal attributes
* @seq_dup_ack_approx: Last duplicate ACK number sent to tap
* @wnd_from_tap: Last window size from tap, unscaled (as received)
* @wnd_to_tap: Sending window advertised to tap, unscaled (as sent)
* @seq_to_tap: Next sequence for packets to tap
* @seq_ack_from_tap: Last ACK number received from tap
* @seq_from_tap: Next sequence for packets from tap (not actually sent)
* @seq_ack_to_tap: Last ACK number sent to tap
* @seq_init_from_tap: Initial sequence number from tap
*/
struct tcp_tap_transfer {
uint8_t pif[SIDES];
struct flowside side[SIDES];
uint8_t retrans;
uint8_t ws_from_tap;
uint8_t ws_to_tap;
uint8_t events;
uint32_t tap_mss;
uint32_t sndbuf;
uint8_t flags;
uint8_t seq_dup_ack_approx;
uint16_t wnd_from_tap;
uint16_t wnd_to_tap;
uint32_t seq_to_tap;
uint32_t seq_ack_from_tap;
uint32_t seq_from_tap;
uint32_t seq_ack_to_tap;
uint32_t seq_init_from_tap;
} __attribute__((packed, aligned(__alignof__(uint32_t))));
/**
* struct tcp_tap_transfer_ext - Migrated TCP data, outside flow, network order
* @seq_snd: Socket-side send sequence
* @seq_rcv: Socket-side receive sequence
* @sndq: Length of pending send queue (unacknowledged / not sent)
* @notsent: Part of pending send queue that wasn't sent out yet
* @rcvq: Length of pending receive queue
* @mss: Socket-side MSS clamp
* @timestamp: RFC 7323 timestamp
* @snd_wl1: Next sequence used in window probe (next sequence - 1)
* @snd_wnd: Socket-side sending window
* @max_window: Window clamp
* @rcv_wnd: Socket-side receive window
* @rcv_wup: rcv_nxt on last window update sent
* @snd_ws: Window scaling factor, send
* @rcv_ws: Window scaling factor, receive
* @tcpi_state: Connection state in TCP_INFO style (enum, tcp_states.h)
* @tcpi_options: TCPI_OPT_* constants (timestamps, selective ACK)
*/
struct tcp_tap_transfer_ext {
uint32_t seq_snd;
uint32_t seq_rcv;
uint32_t sndq;
uint32_t notsent;
uint32_t rcvq;
uint32_t mss;
uint32_t timestamp;
/* We can't just use struct tcp_repair_window: we need network order */
uint32_t snd_wl1;
uint32_t snd_wnd;
uint32_t max_window;
uint32_t rcv_wnd;
uint32_t rcv_wup;
uint8_t snd_ws;
uint8_t rcv_ws;
uint8_t tcpi_state;
uint8_t tcpi_options;
} __attribute__((packed, aligned(__alignof__(uint32_t))));
/**
* struct tcp_splice_conn - Descriptor for a spliced TCP connection
* @f: Generic flow information
@ -231,23 +139,11 @@ extern int init_sock_pool4 [TCP_SOCK_POOL_SIZE];
extern int init_sock_pool6 [TCP_SOCK_POOL_SIZE];
bool tcp_flow_defer(const struct tcp_tap_conn *conn);
int tcp_flow_repair_on(struct ctx *c, const struct tcp_tap_conn *conn);
int tcp_flow_repair_off(struct ctx *c, const struct tcp_tap_conn *conn);
int tcp_flow_migrate_source(int fd, struct tcp_tap_conn *conn);
int tcp_flow_migrate_source_ext(int fd, const struct tcp_tap_conn *conn);
int tcp_flow_migrate_target(struct ctx *c, int fd);
int tcp_flow_migrate_target_ext(struct ctx *c, struct tcp_tap_conn *conn, int fd);
bool tcp_flow_is_established(const struct tcp_tap_conn *conn);
bool tcp_splice_flow_defer(struct tcp_splice_conn *conn);
void tcp_splice_timer(const struct ctx *c, struct tcp_splice_conn *conn);
int tcp_conn_pool_sock(int pool[]);
int tcp_conn_sock(sa_family_t af);
int tcp_sock_refill_pool(int pool[], sa_family_t af);
int tcp_conn_sock(const struct ctx *c, sa_family_t af);
int tcp_sock_refill_pool(const struct ctx *c, int pool[], sa_family_t af);
void tcp_splice_refill(const struct ctx *c);
#endif /* TCP_CONN_H */

View file

@ -33,18 +33,16 @@
#define OPT_EOL 0
#define OPT_NOP 1
#define OPT_MSS 2
#define OPT_MSS_LEN 4
#define OPT_WS 3
#define OPT_WS_LEN 3
#define OPT_SACKP 4
#define OPT_SACK 5
#define OPT_TS 8
#define TAPSIDE(conn_) ((conn_)->f.pif[1] == PIF_TAP)
#define TAPFLOW(conn_) (&((conn_)->f.side[TAPSIDE(conn_)]))
#define TAP_SIDX(conn_) (FLOW_SIDX((conn_), TAPSIDE(conn_)))
#define HOSTSIDE(conn_) ((conn_)->f.pif[1] == PIF_HOST)
#define HOSTFLOW(conn_) (&((conn_)->f.side[HOSTSIDE(conn_)]))
/* Flow side index of the host-facing side: select it with HOSTSIDE(), the same
 * selector HOSTFLOW() uses, not TAPSIDE() (which picks the tap-facing side)
 */
#define HOST_SIDX(conn_)	(FLOW_SIDX((conn_), HOSTSIDE(conn_)))
#define TAPSIDE(conn_) ((conn_)->f.pif[1] == PIF_TAP)
#define TAPFLOW(conn_) (&((conn_)->f.side[TAPSIDE(conn_)]))
#define TAP_SIDX(conn_) (FLOW_SIDX((conn_), TAPSIDE(conn_)))
#define CONN_V4(conn) (!!inany_v4(&TAPFLOW(conn)->oaddr))
#define CONN_V6(conn) (!CONN_V4(conn))
@ -65,79 +63,6 @@ enum tcp_iov_parts {
TCP_NUM_IOVS
};
/**
* struct tcp_payload_t - TCP header and data to send segments with payload
* @th: TCP header
* @data: TCP data
*/
struct tcp_payload_t {
struct tcphdr th;
uint8_t data[IP_MAX_MTU - sizeof(struct tcphdr)];
#ifdef __AVX2__
} __attribute__ ((packed, aligned(32))); /* For AVX2 checksum routines */
#else
} __attribute__ ((packed, aligned(__alignof__(unsigned int))));
#endif
/** struct tcp_opt_nop - TCP NOP option
* @kind: Option kind (OPT_NOP = 1)
*/
struct tcp_opt_nop {
uint8_t kind;
} __attribute__ ((packed));
#define TCP_OPT_NOP ((struct tcp_opt_nop){ .kind = OPT_NOP, })
/** struct tcp_opt_mss - TCP MSS option
* @kind: Option kind (OPT_MSS == 2)
* @len: Option length (4)
* @mss: Maximum Segment Size
*/
struct tcp_opt_mss {
uint8_t kind;
uint8_t len;
uint16_t mss;
} __attribute__ ((packed));
#define TCP_OPT_MSS(mss_) \
((struct tcp_opt_mss) { \
.kind = OPT_MSS, \
.len = sizeof(struct tcp_opt_mss), \
.mss = htons(mss_), \
})
/** struct tcp_opt_ws - TCP Window Scaling option
* @kind: Option kind (OPT_WS == 3)
* @len: Option length (3)
* @shift: Window scaling shift
*/
struct tcp_opt_ws {
uint8_t kind;
uint8_t len;
uint8_t shift;
} __attribute__ ((packed));
#define TCP_OPT_WS(shift_) \
((struct tcp_opt_ws) { \
.kind = OPT_WS, \
.len = sizeof(struct tcp_opt_ws), \
.shift = (shift_), \
})
/** struct tcp_syn_opts - TCP options we apply to SYN packets
* @mss: Maximum Segment Size (MSS) option
* @nop: NOP opt (for alignment)
* @ws: Window Scaling (WS) option
*/
struct tcp_syn_opts {
struct tcp_opt_mss mss;
struct tcp_opt_nop nop;
struct tcp_opt_ws ws;
} __attribute__ ((packed));
#define TCP_SYN_OPTS(mss_, ws_) \
((struct tcp_syn_opts){ \
.mss = TCP_OPT_MSS(mss_), \
.nop = TCP_OPT_NOP, \
.ws = TCP_OPT_WS(ws_), \
})
extern char tcp_buf_discard [MAX_WINDOW];
void conn_flag_do(const struct ctx *c, struct tcp_tap_conn *conn,
@ -157,26 +82,19 @@ void conn_event_do(const struct ctx *c, struct tcp_tap_conn *conn,
conn_event_do(c, conn, event); \
} while (0)
void tcp_rst_do(const struct ctx *c, struct tcp_tap_conn *conn);
void tcp_rst_do(struct ctx *c, struct tcp_tap_conn *conn);
#define tcp_rst(c, conn) \
do { \
flow_dbg((conn), "TCP reset at %s:%i", __func__, __LINE__); \
tcp_rst_do(c, conn); \
} while (0)
struct tcp_info_linux;
void tcp_fill_headers(const struct tcp_tap_conn *conn,
struct tap_hdr *taph,
struct iphdr *ip4h, struct ipv6hdr *ip6h,
struct tcphdr *th, struct iov_tail *payload,
const uint16_t *ip4_check, uint32_t seq, bool no_tcp_csum);
size_t tcp_l2_buf_fill_headers(const struct tcp_tap_conn *conn,
struct iovec *iov, size_t dlen,
const uint16_t *check, uint32_t seq);
int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_tap_conn *conn,
bool force_seq, struct tcp_info_linux *tinfo);
int tcp_prepare_flags(const struct ctx *c, struct tcp_tap_conn *conn,
int flags, struct tcphdr *th, struct tcp_syn_opts *opts,
size_t *optlen);
int tcp_set_peek_offset(const struct tcp_tap_conn *conn, int offset);
int force_seq, struct tcp_info *tinfo);
int tcp_prepare_flags(struct ctx *c, struct tcp_tap_conn *conn, int flags,
struct tcphdr *th, char *data, size_t *optlen);
#endif /* TCP_INTERNAL_H */

View file

@ -28,7 +28,7 @@
* - FIN_SENT_0: FIN (write shutdown) sent to accepted socket
* - FIN_SENT_1: FIN (write shutdown) sent to target socket
*
* #syscalls:pasta pipe2|pipe fcntl arm:fcntl64 ppc64:fcntl64|fcntl i686:fcntl64
* #syscalls:pasta pipe2|pipe fcntl arm:fcntl64 ppc64:fcntl64 i686:fcntl64
*/
#include <sched.h>
@ -95,7 +95,7 @@ static int tcp_conn_sock_ns(const struct ctx *c, sa_family_t af);
* conn_at_sidx() - Get spliced TCP connection specific flow at given sidx
* @sidx: Flow and side to retrieve
*
* Return: spliced TCP connection at @sidx, or NULL if @sidx is invalid.
* Return: Spliced TCP connection at @sidx, or NULL if @sidx is invalid.
* Asserts if the flow at @sidx is not FLOW_TCP_SPLICE.
*/
static struct tcp_splice_conn *conn_at_sidx(flow_sidx_t sidx)
@ -131,12 +131,8 @@ static void tcp_splice_conn_epoll_events(uint16_t events,
ev[1].events = EPOLLOUT;
}
flow_foreach_sidei(sidei) {
if (events & OUT_WAIT(sidei)) {
ev[sidei].events |= EPOLLOUT;
ev[!sidei].events &= ~EPOLLIN;
}
}
flow_foreach_sidei(sidei)
ev[sidei].events |= (events & OUT_WAIT(sidei)) ? EPOLLOUT : 0;
}
/**
@ -164,7 +160,7 @@ static int tcp_splice_epoll_ctl(const struct ctx *c,
if (epoll_ctl(c->epollfd, m, conn->s[0], &ev[0]) ||
epoll_ctl(c->epollfd, m, conn->s[1], &ev[1])) {
int ret = -errno;
flow_perror(conn, "ERROR on epoll_ctl()");
flow_err(conn, "ERROR on epoll_ctl(): %s", strerror(errno));
return ret;
}
@ -204,8 +200,8 @@ static void conn_flag_do(const struct ctx *c, struct tcp_splice_conn *conn,
}
if (flag == CLOSING) {
epoll_del(c, conn->s[0]);
epoll_del(c, conn->s[1]);
epoll_ctl(c->epollfd, EPOLL_CTL_DEL, conn->s[0], NULL);
epoll_ctl(c->epollfd, EPOLL_CTL_DEL, conn->s[1], NULL);
}
}
@ -317,14 +313,14 @@ static int tcp_splice_connect_finish(const struct ctx *c,
if (conn->pipe[sidei][0] < 0) {
if (pipe2(conn->pipe[sidei], O_NONBLOCK | O_CLOEXEC)) {
flow_perror(conn, "cannot create %d->%d pipe",
sidei, !sidei);
flow_err(conn, "cannot create %d->%d pipe: %s",
sidei, !sidei, strerror(errno));
conn_flag(c, conn, CLOSING);
return -EIO;
}
if (fcntl(conn->pipe[sidei][0], F_SETPIPE_SZ,
c->tcp.pipe_size) != (int)c->tcp.pipe_size) {
c->tcp.pipe_size)) {
flow_trace(conn,
"cannot set %d->%d pipe size to %zu",
sidei, !sidei, c->tcp.pipe_size);
@ -352,10 +348,9 @@ static int tcp_splice_connect(const struct ctx *c, struct tcp_splice_conn *conn)
uint8_t tgtpif = conn->f.pif[TGTSIDE];
union sockaddr_inany sa;
socklen_t sl;
int one = 1;
if (tgtpif == PIF_HOST)
conn->s[1] = tcp_conn_sock(af);
conn->s[1] = tcp_conn_sock(c, af);
else if (tgtpif == PIF_SPLICE)
conn->s[1] = tcp_conn_sock_ns(c, af);
else
@ -364,27 +359,18 @@ static int tcp_splice_connect(const struct ctx *c, struct tcp_splice_conn *conn)
if (conn->s[1] < 0)
return -1;
if (setsockopt(conn->s[1], SOL_TCP, TCP_QUICKACK, &one, sizeof(one))) {
if (setsockopt(conn->s[1], SOL_TCP, TCP_QUICKACK,
&((int){ 1 }), sizeof(int))) {
flow_trace(conn, "failed to set TCP_QUICKACK on socket %i",
conn->s[1]);
}
if (setsockopt(conn->s[0], SOL_TCP, TCP_NODELAY, &one, sizeof(one))) {
flow_trace(conn, "failed to set TCP_NODELAY on socket %i",
conn->s[0]);
}
if (setsockopt(conn->s[1], SOL_TCP, TCP_NODELAY, &one, sizeof(one))) {
flow_trace(conn, "failed to set TCP_NODELAY on socket %i",
conn->s[1]);
}
pif_sockaddr(c, &sa, &sl, tgtpif, &tgt->eaddr, tgt->eport);
if (connect(conn->s[1], &sa.sa, sl)) {
if (errno != EINPROGRESS) {
flow_trace(conn, "Couldn't connect socket for splice: %s",
strerror_(errno));
strerror(errno));
return -errno;
}
@ -402,7 +388,7 @@ static int tcp_splice_connect(const struct ctx *c, struct tcp_splice_conn *conn)
* @c: Execution context
* @af: Address family (AF_INET or AF_INET6)
*
* Return: socket fd in the namespace on success, -errno on failure
* Return: Socket fd in the namespace on success, -errno on failure
*/
static int tcp_conn_sock_ns(const struct ctx *c, sa_family_t af)
{
@ -482,10 +468,11 @@ void tcp_splice_sock_handler(struct ctx *c, union epoll_ref ref,
rc = getsockopt(ref.fd, SOL_SOCKET, SO_ERROR, &err, &sl);
if (rc)
flow_perror(conn, "Error retrieving SO_ERROR");
flow_err(conn, "Error retrieving SO_ERROR: %s",
strerror(errno));
else
flow_trace(conn, "Error event on socket: %s",
strerror_(err));
strerror(err));
goto close;
}
@ -516,27 +503,29 @@ swap:
lowat_act_flag = RCVLOWAT_ACT(fromsidei);
while (1) {
ssize_t readlen, written, pending;
ssize_t readlen, to_write = 0, written;
int more = 0;
retry:
do
readlen = splice(conn->s[fromsidei], NULL,
conn->pipe[fromsidei][1], NULL,
c->tcp.pipe_size,
SPLICE_F_MOVE | SPLICE_F_NONBLOCK);
while (readlen < 0 && errno == EINTR);
if (readlen < 0 && errno != EAGAIN)
goto close;
readlen = splice(conn->s[fromsidei], NULL,
conn->pipe[fromsidei][1], NULL,
c->tcp.pipe_size,
SPLICE_F_MOVE | SPLICE_F_NONBLOCK);
flow_trace(conn, "%zi from read-side call", readlen);
if (readlen < 0) {
if (errno == EINTR)
goto retry;
if (!readlen) {
if (errno != EAGAIN)
goto close;
to_write = c->tcp.pipe_size;
} else if (!readlen) {
eof = 1;
} else if (readlen > 0) {
to_write = c->tcp.pipe_size;
} else {
never_read = 0;
to_write += readlen;
if (readlen >= (long)c->tcp.pipe_size * 90 / 100)
more = SPLICE_F_MORE;
@ -544,25 +533,19 @@ retry:
conn_flag(c, conn, lowat_act_flag);
}
do
written = splice(conn->pipe[fromsidei][0], NULL,
conn->s[!fromsidei], NULL,
c->tcp.pipe_size,
SPLICE_F_MOVE | more | SPLICE_F_NONBLOCK);
while (written < 0 && errno == EINTR);
if (written < 0 && errno != EAGAIN)
goto close;
eintr:
written = splice(conn->pipe[fromsidei][0], NULL,
conn->s[!fromsidei], NULL, to_write,
SPLICE_F_MOVE | more | SPLICE_F_NONBLOCK);
flow_trace(conn, "%zi from write-side call (passed %zi)",
written, c->tcp.pipe_size);
written, to_write);
/* Most common case: skip updating counters. */
if (readlen > 0 && readlen == written) {
if (readlen >= (long)c->tcp.pipe_size * 10 / 100)
continue;
if (!(conn->flags & lowat_set_flag) &&
if (conn->flags & lowat_set_flag &&
readlen > (long)c->tcp.pipe_size / 10) {
int lowat = c->tcp.pipe_size / 4;
@ -571,7 +554,7 @@ retry:
&lowat, sizeof(lowat))) {
flow_trace(conn,
"Setting SO_RCVLOWAT %i: %s",
lowat, strerror_(errno));
lowat, strerror(errno));
} else {
conn_flag(c, conn, lowat_set_flag);
conn_flag(c, conn, lowat_act_flag);
@ -585,6 +568,12 @@ retry:
conn->written[fromsidei] += written > 0 ? written : 0;
if (written < 0) {
if (errno == EINTR)
goto eintr;
if (errno != EAGAIN)
goto close;
if (conn->read[fromsidei] == conn->written[fromsidei])
break;
@ -595,9 +584,10 @@ retry:
if (never_read && written == (long)(c->tcp.pipe_size))
goto retry;
pending = conn->read[fromsidei] - conn->written[fromsidei];
if (!never_read && written > 0 && written < pending)
if (!never_read && written < to_write) {
to_write -= written;
goto retry;
}
if (eof)
break;
@ -686,7 +676,7 @@ static void tcp_splice_pipe_refill(const struct ctx *c)
continue;
if (fcntl(splice_pipe_pool[i][0], F_SETPIPE_SZ,
c->tcp.pipe_size) != (int)c->tcp.pipe_size) {
c->tcp.pipe_size)) {
trace("TCP (spliced): cannot set pool pipe size to %zu",
c->tcp.pipe_size);
}
@ -707,16 +697,16 @@ static int tcp_sock_refill_ns(void *arg)
ns_enter(c);
if (c->ifi4) {
int rc = tcp_sock_refill_pool(ns_sock_pool4, AF_INET);
int rc = tcp_sock_refill_pool(c, ns_sock_pool4, AF_INET);
if (rc < 0)
warn("TCP: Error refilling IPv4 ns socket pool: %s",
strerror_(-rc));
strerror(-rc));
}
if (c->ifi6) {
int rc = tcp_sock_refill_pool(ns_sock_pool6, AF_INET6);
int rc = tcp_sock_refill_pool(c, ns_sock_pool6, AF_INET6);
if (rc < 0)
warn("TCP: Error refilling IPv6 ns socket pool: %s",
strerror_(-rc));
strerror(-rc));
}
return 0;

476
tcp_vu.c
View file

@ -1,476 +0,0 @@
// SPDX-License-Identifier: GPL-2.0-or-later
/* tcp_vu.c - TCP L2 vhost-user management functions
*
* Copyright Red Hat
* Author: Laurent Vivier <lvivier@redhat.com>
*/
#include <errno.h>
#include <stddef.h>
#include <stdint.h>
#include <netinet/ip.h>
#include <netinet/tcp.h>
#include <sys/socket.h>
#include <netinet/if_ether.h>
#include <linux/virtio_net.h>
#include "util.h"
#include "ip.h"
#include "passt.h"
#include "siphash.h"
#include "inany.h"
#include "vhost_user.h"
#include "tcp.h"
#include "pcap.h"
#include "flow.h"
#include "tcp_conn.h"
#include "flow_table.h"
#include "tcp_vu.h"
#include "tap.h"
#include "tcp_internal.h"
#include "checksum.h"
#include "vu_common.h"
#include <time.h>
static struct iovec iov_vu[VIRTQUEUE_MAX_SIZE + 1];
static struct vu_virtq_element elem[VIRTQUEUE_MAX_SIZE];
static int head[VIRTQUEUE_MAX_SIZE + 1];
/**
 * tcp_vu_hdrlen() - Length of all headers preceding TCP payload in a L2 frame
 * @v6:		Set for IPv6 packet
 *
 * Return: combined size of virtio-net (mergeable RX buffers), Ethernet, IP
 *	   (version selected by @v6) and TCP headers
 */
static size_t tcp_vu_hdrlen(bool v6)
{
	/* Only the network layer header depends on the IP version */
	const size_t l3len = v6 ? sizeof(struct ipv6hdr) : sizeof(struct iphdr);

	return sizeof(struct virtio_net_hdr_mrg_rxbuf) + sizeof(struct ethhdr) +
	       sizeof(struct tcphdr) + l3len;
}
/**
 * tcp_vu_send_flag() - Send segment with flags to vhost-user (no payload)
 * @c:		Execution context
 * @conn:	Connection pointer
 * @flags:	TCP flags: if not set, send segment only if ACK is due
 *
 * A single buffer is taken from the RX virtqueue, Ethernet, IP and TCP headers
 * (plus options, if any) are built in place, and the frame is flushed to the
 * guest. With DUP_ACK in @flags, a second buffer is requested and, if it's
 * large enough, filled with a copy of the same frame.
 *
 * Return: negative error code on connection reset, 0 otherwise. In detail: -1
 *	   if exactly one buffer couldn't be collected, the value returned by
 *	   tcp_prepare_flags() if that's zero or negative, 0 after flushing.
 */
int tcp_vu_send_flag(const struct ctx *c, struct tcp_tap_conn *conn, int flags)
{
	struct vu_dev *vdev = c->vdev;
	struct vu_virtq *vq = &vdev->vq[VHOST_USER_RX_QUEUE];
	size_t optlen, hdrlen;
	struct vu_virtq_element flags_elem[2];
	struct ipv6hdr *ip6h = NULL;
	struct iphdr *ip4h = NULL;
	struct iovec flags_iov[2];
	struct tcp_syn_opts *opts;
	struct iov_tail payload;
	struct tcphdr *th;
	struct ethhdr *eh;
	uint32_t seq;
	int elem_cnt;
	int nb_ack;
	int ret;

	hdrlen = tcp_vu_hdrlen(CONN_V6(conn));

	/* Ask for room for headers plus the largest set of options we might
	 * add (struct tcp_syn_opts): the actual length is set further down
	 */
	vu_set_element(&flags_elem[0], NULL, &flags_iov[0]);

	elem_cnt = vu_collect(vdev, vq, &flags_elem[0], 1,
			      hdrlen + sizeof(struct tcp_syn_opts), NULL);
	if (elem_cnt != 1)
		return -1;

	ASSERT(flags_elem[0].in_sg[0].iov_len >=
	       hdrlen + sizeof(struct tcp_syn_opts));

	/* Third argument is 1 as this frame uses a single buffer */
	vu_set_vnethdr(vdev, flags_elem[0].in_sg[0].iov_base, 1);

	eh = vu_eth(flags_elem[0].in_sg[0].iov_base);

	memcpy(eh->h_dest, c->guest_mac, sizeof(eh->h_dest));
	memcpy(eh->h_source, c->our_tap_mac, sizeof(eh->h_source));

	/* Only one of ip4h, ip6h is set: the other one stays NULL */
	if (CONN_V4(conn)) {
		eh->h_proto = htons(ETH_P_IP);

		ip4h = vu_ip(flags_elem[0].in_sg[0].iov_base);
		*ip4h = (struct iphdr)L2_BUF_IP4_INIT(IPPROTO_TCP);

		th = vu_payloadv4(flags_elem[0].in_sg[0].iov_base);
	} else {
		eh->h_proto = htons(ETH_P_IPV6);

		ip6h = vu_ip(flags_elem[0].in_sg[0].iov_base);
		*ip6h = (struct ipv6hdr)L2_BUF_IP6_INIT(IPPROTO_TCP);
		th = vu_payloadv6(flags_elem[0].in_sg[0].iov_base);
	}

	memset(th, 0, sizeof(*th));
	th->doff = sizeof(*th) / 4;
	th->ack = 1;

	/* Sequence is sampled before tcp_prepare_flags() is called.
	 * NOTE(review): presumably because that call might advance
	 * conn->seq_to_tap (e.g. for SYN or FIN) - confirm against callee
	 */
	seq = conn->seq_to_tap;
	/* Options, if any, are written right after the fixed TCP header */
	opts = (struct tcp_syn_opts *)(th + 1);
	ret = tcp_prepare_flags(c, conn, flags, th, opts, &optlen);
	if (ret <= 0) {
		/* Nothing is sent: give back the buffer collected above */
		vu_queue_rewind(vq, 1);
		return ret;
	}

	/* Shrink the buffer to the length actually used by this frame */
	flags_elem[0].in_sg[0].iov_len = hdrlen + optlen;
	payload = IOV_TAIL(flags_elem[0].in_sg, 1, hdrlen);

	/* Last argument is no_tcp_csum: the TCP checksum is skipped unless
	 * packets are being captured
	 */
	tcp_fill_headers(conn, NULL, ip4h, ip6h, th, &payload,
			 NULL, seq, !*c->pcap);

	if (*c->pcap) {
		/* Skip the virtio-net header in the capture */
		pcap_iov(&flags_elem[0].in_sg[0], 1,
			 sizeof(struct virtio_net_hdr_mrg_rxbuf));
	}
	nb_ack = 1;

	if (flags & DUP_ACK) {
		/* Best effort: the duplicate is only queued if a second buffer
		 * of sufficient size is available, otherwise a single frame is
		 * sent and no error is reported
		 */
		vu_set_element(&flags_elem[1], NULL, &flags_iov[1]);

		elem_cnt = vu_collect(vdev, vq, &flags_elem[1], 1,
				      flags_elem[0].in_sg[0].iov_len, NULL);
		if (elem_cnt == 1 &&
		    flags_elem[1].in_sg[0].iov_len >=
		    flags_elem[0].in_sg[0].iov_len) {
			memcpy(flags_elem[1].in_sg[0].iov_base,
			       flags_elem[0].in_sg[0].iov_base,
			       flags_elem[0].in_sg[0].iov_len);
			nb_ack++;

			if (*c->pcap) {
				pcap_iov(&flags_elem[1].in_sg[0], 1,
					 sizeof(struct virtio_net_hdr_mrg_rxbuf));
			}
		}
		/* NOTE(review): if a buffer was collected but is too small, it
		 * is neither flushed nor rewound here - check whether
		 * vu_collect() can actually return such a buffer
		 */
	}

	vu_flush(vdev, vq, flags_elem, nb_ack);

	return 0;
}
/**
 * tcp_vu_sock_recv() - Receive datastream from socket into vhost-user buffers
 * @c:			Execution context
 * @conn:		Connection pointer
 * @v6:			Set for IPv6 connections
 * @already_sent:	Number of bytes already sent
 * @fillsize:		Maximum bytes to fill in guest-side receiving window
 * @iov_cnt:		Number of buffers (elements) used by received data (output)
 * @head_cnt:		Pointer to store the count of head iov entries (output)
 *
 * Buffers are collected from the RX virtqueue one frame at a time (at most the
 * MSS, plus headers, per frame), then filled by a single recvmsg() with
 * MSG_PEEK. File-scope arrays are used as working storage: elem[] holds the
 * collected elements, iov_vu[] their buffers, and head[] the index in elem[]
 * of the first buffer of each frame, with head[*head_cnt] set to the number of
 * buffers used, so that frame i spans elem[head[i]] to elem[head[i + 1] - 1].
 *
 * Return: number of bytes received from the socket, or a negative error code
 *	   on failure.
 */
static ssize_t tcp_vu_sock_recv(const struct ctx *c,
				const struct tcp_tap_conn *conn, bool v6,
				uint32_t already_sent, size_t fillsize,
				int *iov_cnt, int *head_cnt)
{
	struct vu_dev *vdev = c->vdev;
	struct vu_virtq *vq = &vdev->vq[VHOST_USER_RX_QUEUE];
	struct msghdr mh_sock = { 0 };
	uint16_t mss = MSS_GET(conn);
	int s = conn->sock;
	ssize_t ret, len;
	size_t hdrlen;
	int elem_cnt;
	int i;

	*iov_cnt = 0;

	hdrlen = tcp_vu_hdrlen(v6);

	/* Buffers start from iov_vu[1]: iov_vu[0] is kept free for the discard
	 * buffer needed below if peek_offset_cap is not set.
	 * NOTE(review): presumably vu_init_elem() assigns one entry of the
	 * given iovec array to each element, in order - confirm against helper
	 */
	vu_init_elem(elem, &iov_vu[1], VIRTQUEUE_MAX_SIZE);

	elem_cnt = 0;
	*head_cnt = 0;
	/* One iteration per frame, until the window is filled or we run out of
	 * elements or of available buffers
	 */
	while (fillsize > 0 && elem_cnt < VIRTQUEUE_MAX_SIZE) {
		struct iovec *iov;
		size_t frame_size, dlen;
		int cnt;

		cnt = vu_collect(vdev, vq, &elem[elem_cnt],
				 VIRTQUEUE_MAX_SIZE - elem_cnt,
				 MIN(mss, fillsize) + hdrlen, &frame_size);
		if (cnt == 0)
			break;

		dlen = frame_size - hdrlen;

		/* reserve space for headers in iov */
		iov = &elem[elem_cnt].in_sg[0];
		ASSERT(iov->iov_len >= hdrlen);
		iov->iov_base = (char *)iov->iov_base + hdrlen;
		iov->iov_len -= hdrlen;
		/* Remember which element starts this frame */
		head[(*head_cnt)++] = elem_cnt;

		fillsize -= dlen;
		elem_cnt += cnt;
	}

	if (peek_offset_cap) {
		/* NOTE(review): presumably the socket peek offset already
		 * skips data that was sent, so we can read directly into the
		 * collected buffers - confirm what peek_offset_cap implies
		 */
		mh_sock.msg_iov = iov_vu + 1;
		mh_sock.msg_iovlen = elem_cnt;
	} else {
		/* Otherwise, the first @already_sent bytes are read into a
		 * scratch buffer, and dropped from the count below
		 */
		iov_vu[0].iov_base = tcp_buf_discard;
		iov_vu[0].iov_len = already_sent;

		mh_sock.msg_iov = iov_vu;
		mh_sock.msg_iovlen = elem_cnt + 1;
	}

	do
		ret = recvmsg(s, &mh_sock, MSG_PEEK);
	while (ret < 0 && errno == EINTR);

	if (ret < 0) {
		/* Nothing received: hand back all the collected buffers */
		vu_queue_rewind(vq, elem_cnt);
		return -errno;
	}

	if (!peek_offset_cap)
		ret -= already_sent;

	/* adjust iov number and length of the last iov */
	len = ret;
	for (i = 0; len && i < elem_cnt; i++) {
		struct iovec *iov = &elem[i].in_sg[0];

		if (iov->iov_len > (size_t)len)
			iov->iov_len = len;

		len -= iov->iov_len;
	}
	/* From here on, i is the number of buffers that contain data */

	/* adjust head count */
	while (*head_cnt > 0 && head[*head_cnt - 1] >= i)
		(*head_cnt)--;

	/* mark end of array */
	head[*head_cnt] = i;
	*iov_cnt = i;

	/* release unused buffers */
	vu_queue_rewind(vq, elem_cnt - i);

	/* restore space for headers in iov */
	for (i = 0; i < *head_cnt; i++) {
		struct iovec *iov = &elem[head[i]].in_sg[0];

		iov->iov_base = (char *)iov->iov_base - hdrlen;
		iov->iov_len += hdrlen;
	}

	return ret;
}
/**
* tcp_vu_prepare() - Prepare the frame header
* @c: Execution context
* @conn: Connection pointer
* @iov: Pointer to the array of IO vectors
* @iov_cnt: Number of entries in @iov
* @check: Checksum, if already known
* @no_tcp_csum: Do not set TCP checksum
* @push: Set PSH flag, last segment in a batch
*/
static void tcp_vu_prepare(const struct ctx *c, struct tcp_tap_conn *conn,
struct iovec *iov, size_t iov_cnt,
const uint16_t **check, bool no_tcp_csum, bool push)
{
const struct flowside *toside = TAPFLOW(conn);
bool v6 = !(inany_v4(&toside->eaddr) && inany_v4(&toside->oaddr));
size_t hdrlen = tcp_vu_hdrlen(v6);
struct iov_tail payload = IOV_TAIL(iov, iov_cnt, hdrlen);
char *base = iov[0].iov_base;
struct ipv6hdr *ip6h = NULL;
struct iphdr *ip4h = NULL;
struct tcphdr *th;
struct ethhdr *eh;
/* we guess the first iovec provided by the guest can embed
* all the headers needed by L2 frame
*/
ASSERT(iov[0].iov_len >= hdrlen);
eh = vu_eth(base);
memcpy(eh->h_dest, c->guest_mac, sizeof(eh->h_dest));
memcpy(eh->h_source, c->our_tap_mac, sizeof(eh->h_source));
/* initialize header */
if (!v6) {
eh->h_proto = htons(ETH_P_IP);
ip4h = vu_ip(base);
*ip4h = (struct iphdr)L2_BUF_IP4_INIT(IPPROTO_TCP);
th = vu_payloadv4(base);
} else {
eh->h_proto = htons(ETH_P_IPV6);
ip6h = vu_ip(base);
*ip6h = (struct ipv6hdr)L2_BUF_IP6_INIT(IPPROTO_TCP);
th = vu_payloadv6(base);
}
memset(th, 0, sizeof(*th));
th->doff = sizeof(*th) / 4;
th->ack = 1;
th->psh = push;
tcp_fill_headers(conn, NULL, ip4h, ip6h, th, &payload,
*check, conn->seq_to_tap, no_tcp_csum);
if (ip4h)
*check = &ip4h->check;
}
/**
 * tcp_vu_data_from_sock() - Handle new data from socket, queue to vhost-user,
 *			     in window
 * @c:		Execution context
 * @conn:	Connection pointer
 *
 * Data not yet sent to the guest is read from the socket (without dequeuing
 * it, see tcp_vu_sock_recv()) directly into buffers from the RX virtqueue, up
 * to the window advertised by the guest. Headers are then added to each frame
 * and the whole batch is flushed to the guest.
 *
 * Return: negative on connection reset, 0 otherwise
 */
int tcp_vu_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn)
{
	uint32_t wnd_scaled = conn->wnd_from_tap << conn->ws_from_tap;
	struct vu_dev *vdev = c->vdev;
	struct vu_virtq *vq = &vdev->vq[VHOST_USER_RX_QUEUE];
	ssize_t len, previous_dlen;
	int i, iov_cnt, head_cnt;
	size_t hdrlen, fillsize;
	int v6 = CONN_V6(conn);
	uint32_t already_sent;
	const uint16_t *check;

	if (!vu_queue_enabled(vq) || !vu_queue_started(vq)) {
		debug("Got packet, but RX virtqueue not usable yet");
		return 0;
	}

	/* Bytes sent to the guest but not acknowledged yet */
	already_sent = conn->seq_to_tap - conn->seq_ack_from_tap;

	/* NOTE(review): presumably SEQ_LT() is a wrap-aware comparison, true
	 * here if the guest acknowledged more than we sent - confirm macro
	 */
	if (SEQ_LT(already_sent, 0)) {
		/* RFC 761, section 2.1. */
		flow_trace(conn, "ACK sequence gap: ACK for %u, sent: %u",
			   conn->seq_ack_from_tap, conn->seq_to_tap);
		/* Resume sending from the sequence acknowledged by the guest */
		conn->seq_to_tap = conn->seq_ack_from_tap;
		already_sent = 0;
		if (tcp_set_peek_offset(conn, 0)) {
			tcp_rst(c, conn);
			return -1;
		}
	}

	/* Window is closed, or already full: nothing can be sent for now */
	if (!wnd_scaled || already_sent >= wnd_scaled) {
		conn_flag(c, conn, ACK_FROM_TAP_BLOCKS);
		conn_flag(c, conn, STALLED);
		conn_flag(c, conn, ACK_FROM_TAP_DUE);
		return 0;
	}

	/* Set up buffer descriptors we'll fill completely and partially. */

	fillsize = wnd_scaled - already_sent;

	/* collect the buffers from vhost-user and fill them with the
	 * data from the socket
	 */
	len = tcp_vu_sock_recv(c, conn, v6, already_sent, fillsize,
			       &iov_cnt, &head_cnt);
	if (len < 0) {
		/* Anything other than "no data for now" resets the connection */
		if (len != -EAGAIN && len != -EWOULDBLOCK) {
			tcp_rst(c, conn);
			return len;
		}

		if (already_sent) /* No new data and EAGAIN: set EPOLLET */
			conn_flag(c, conn, STALLED);

		return 0;
	}

	if (!len) {
		if (already_sent) {
			conn_flag(c, conn, STALLED);
		} else if ((conn->events & (SOCK_FIN_RCVD | TAP_FIN_SENT)) ==
			   SOCK_FIN_RCVD) {
			/* Socket side closed, everything was sent and
			 * acknowledged, and FIN wasn't sent to the guest yet
			 */
			int ret = tcp_vu_send_flag(c, conn, FIN | ACK);
			if (ret) {
				tcp_rst(c, conn);
				return ret;
			}

			conn_event(c, conn, TAP_FIN_SENT);
		}

		return 0;
	}

	/* We have data to send: clear the flags set when we couldn't */
	conn_flag(c, conn, ~ACK_FROM_TAP_BLOCKS);
	conn_flag(c, conn, ~STALLED);

	/* Likely, some new data was acked too. */
	tcp_update_seqack_wnd(c, conn, false, NULL);

	/* initialize headers */
	/* iov_vu is an array of buffers and the buffer size can be
	 * smaller than the frame size we want to use but with
	 * num_buffer we can merge several virtio iov buffers in one packet
	 * we need only to set the packet headers in the first iov and
	 * num_buffer to the number of iov entries
	 */

	hdrlen = tcp_vu_hdrlen(v6);
	/* One iteration per frame: frame i starts at elem[head[i]] and spans
	 * all the buffers up to the start of the next frame, with
	 * head[head_cnt] marking the end, as set by tcp_vu_sock_recv()
	 */
	for (i = 0, previous_dlen = -1, check = NULL; i < head_cnt; i++) {
		struct iovec *iov = &elem[head[i]].in_sg[0];
		int buf_cnt = head[i + 1] - head[i];
		ssize_t dlen = iov_size(iov, buf_cnt) - hdrlen;
		bool push = i == head_cnt - 1;	/* PSH on last frame only */

		vu_set_vnethdr(vdev, iov->iov_base, buf_cnt);

		/* The IPv4 header checksum varies only with dlen */
		if (previous_dlen != dlen)
			check = NULL;
		previous_dlen = dlen;

		/* TCP checksum is skipped unless packets are being captured.
		 * On return, for IPv4, check points to the header checksum
		 * just written, to be reused if the next frame has same dlen
		 */
		tcp_vu_prepare(c, conn, iov, buf_cnt, &check, !*c->pcap, push);

		if (*c->pcap) {
			/* Skip the virtio-net header in the capture */
			pcap_iov(iov, buf_cnt,
				 sizeof(struct virtio_net_hdr_mrg_rxbuf));
		}

		conn->seq_to_tap += dlen;
	}

	/* send packets */
	vu_flush(vdev, vq, elem, iov_cnt);

	conn_flag(c, conn, ACK_FROM_TAP_DUE);

	return 0;
}

View file

@ -1,12 +0,0 @@
// SPDX-License-Identifier: GPL-2.0-or-later
/* Copyright Red Hat
 * Author: Laurent Vivier <lvivier@redhat.com>
 */
#ifndef TCP_VU_H
#define TCP_VU_H
/* Prototypes of the tcp_vu_*() TCP entry points.
 * NOTE(review): presumably the vhost-user variants of the TCP tap path;
 * confirm against the definitions in tcp_vu.c.
 */

/* tcp_vu_send_flag() - takes the execution context, a connection and a
 * bitmask of flags (e.g. FIN | ACK).
 * NOTE(review): presumably returns 0 on success and non-zero on failure;
 * confirm against the definition.
 */
int tcp_vu_send_flag(const struct ctx *c, struct tcp_tap_conn *conn, int flags);

/* tcp_vu_data_from_sock() - takes the execution context and a connection.
 * NOTE(review): presumably moves pending socket data towards the guest and
 * returns 0 or a negative error code; confirm against the definition.
 */
int tcp_vu_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn);
#endif /* TCP_VU_H */

1
test/.gitignore vendored
View file

@ -8,6 +8,5 @@ QEMU_EFI.fd
*.raw.xz
*.bin
nstool
rampstream
guest-key
guest-key.pub

View file

@ -8,6 +8,7 @@
WGET = wget -c
DEBIAN_IMGS = debian-8.11.0-openstack-amd64.qcow2 \
debian-9-nocloud-amd64-daily-20200210-166.qcow2 \
debian-10-nocloud-amd64.qcow2 \
debian-10-generic-arm64.qcow2 \
debian-10-generic-ppc64el-20220911-1135.qcow2 \
@ -41,7 +42,8 @@ OPENSUSE_IMGS = openSUSE-Leap-15.1-JeOS.x86_64-kvm-and-xen.qcow2 \
openSUSE-Leap-15.2-JeOS.x86_64-kvm-and-xen.qcow2 \
openSUSE-Leap-15.3-JeOS.x86_64-kvm-and-xen.qcow2 \
openSUSE-Tumbleweed-ARM-JeOS-efi.aarch64.raw.xz \
openSUSE-Tumbleweed-ARM-JeOS-efi.armv7l.raw.xz
openSUSE-Tumbleweed-ARM-JeOS-efi.armv7l.raw.xz \
openSUSE-Tumbleweed-JeOS.x86_64-kvm-and-xen.qcow2
UBUNTU_OLD_IMGS = trusty-server-cloudimg-amd64-disk1.img \
trusty-server-cloudimg-i386-disk1.img \
@ -52,8 +54,7 @@ UBUNTU_IMGS = $(UBUNTU_OLD_IMGS) $(UBUNTU_NEW_IMGS)
DOWNLOAD_ASSETS = mbuto podman \
$(DEBIAN_IMGS) $(FEDORA_IMGS) $(OPENSUSE_IMGS) $(UBUNTU_IMGS)
TESTDATA_ASSETS = small.bin big.bin medium.bin \
rampstream
TESTDATA_ASSETS = small.bin big.bin medium.bin
LOCAL_ASSETS = mbuto.img mbuto.mem.img podman/bin/podman QEMU_EFI.fd \
$(DEBIAN_IMGS:%=prepared-%) $(FEDORA_IMGS:%=prepared-%) \
$(UBUNTU_NEW_IMGS:%=prepared-%) \
@ -86,7 +87,7 @@ podman/bin/podman: pull-podman
guest-key guest-key.pub:
ssh-keygen -f guest-key -N ''
mbuto.img: passt.mbuto mbuto/mbuto guest-key.pub rampstream-check.sh $(TESTDATA_ASSETS)
mbuto.img: passt.mbuto mbuto/mbuto guest-key.pub $(TESTDATA_ASSETS)
./mbuto/mbuto -p ./$< -c lz4 -f $@
mbuto.mem.img: passt.mem.mbuto mbuto ../passt.avx2
@ -134,6 +135,9 @@ realclean: clean
debian-8.11.0-openstack-%.qcow2:
$(WGET) -O $@ https://cloud.debian.org/images/cloud/OpenStack/archive/8.11.0/debian-8.11.0-openstack-$*.qcow2
debian-9-nocloud-%-daily-20200210-166.qcow2:
$(WGET) -O $@ https://cloud.debian.org/images/cloud/stretch/daily/20200210-166/debian-9-nocloud-$*-daily-20200210-166.qcow2
debian-10-nocloud-%.qcow2:
$(WGET) -O $@ https://cloud.debian.org/images/cloud/buster/latest/debian-10-nocloud-$*.qcow2
@ -199,6 +203,9 @@ openSUSE-Tumbleweed-ARM-JeOS-efi.aarch64.raw.xz:
openSUSE-Tumbleweed-ARM-JeOS-efi.armv7l.raw.xz:
$(WGET) -O $@ http://download.opensuse.org/ports/armv7hl/tumbleweed/appliances/openSUSE-Tumbleweed-ARM-JeOS-efi.armv7l.raw.xz
openSUSE-Tumbleweed-JeOS.x86_64-kvm-and-xen.qcow2:
$(WGET) -O $@ https://download.opensuse.org/tumbleweed/appliances/openSUSE-Tumbleweed-JeOS.x86_64-kvm-and-xen.qcow2
# Ubuntu downloads
trusty-server-cloudimg-%-disk1.img:
$(WGET) -O $@ https://cloud-images.ubuntu.com/trusty/current/trusty-server-cloudimg-$*-disk1.img

View file

@ -134,54 +134,6 @@ layout_two_guests() {
get_info_cols
pane_watch_contexts ${PANE_GUEST_1} "guest #1 in namespace #1" qemu_1 guest_1
pane_watch_contexts ${PANE_GUEST_2} "guest #2 in namespace #1" qemu_2 guest_2
tmux send-keys -l -t ${PANE_INFO} 'while cat '"$STATEBASE/log_pipe"'; do :; done'
tmux send-keys -t ${PANE_INFO} -N 100 C-m
tmux select-pane -t ${PANE_INFO} -T "test log"
pane_watch_contexts ${PANE_HOST} host host
pane_watch_contexts ${PANE_PASST_1} "passt #1 in namespace #1" pasta_1 passt_1
pane_watch_contexts ${PANE_PASST_2} "passt #2 in namespace #1" pasta_1 passt_2
info_layout "two guests, two passt instances, in namespaces"
sleep 1
}
# layout_migrate() - Two guest panes, two passt panes, two passt-repair panes,
# plus host and log
layout_migrate() {
sleep 1
tmux kill-pane -a -t 0
cmd_write 0 clear
tmux split-window -v -t passt_test
tmux split-window -h -l '33%'
tmux split-window -h -t passt_test:1.1
tmux split-window -h -l '35%' -t passt_test:1.0
tmux split-window -v -t passt_test:1.0
tmux split-window -v -t passt_test:1.4
tmux split-window -v -t passt_test:1.6
tmux split-window -v -t passt_test:1.3
PANE_GUEST_1=0
PANE_GUEST_2=1
PANE_INFO=2
PANE_MON=3
PANE_HOST=4
PANE_PASST_REPAIR_1=5
PANE_PASST_1=6
PANE_PASST_REPAIR_2=7
PANE_PASST_2=8
get_info_cols
pane_watch_contexts ${PANE_GUEST_1} "guest #1 in namespace #1" qemu_1 guest_1
pane_watch_contexts ${PANE_GUEST_2} "guest #2 in namespace #2" qemu_2 guest_2
@ -189,16 +141,11 @@ layout_migrate() {
tmux send-keys -t ${PANE_INFO} -N 100 C-m
tmux select-pane -t ${PANE_INFO} -T "test log"
pane_watch_contexts ${PANE_MON} "QEMU monitor" mon mon
pane_watch_contexts ${PANE_HOST} host host
pane_watch_contexts ${PANE_PASST_REPAIR_1} "passt-repair #1 in namespace #1" repair_1 passt_repair_1
pane_watch_contexts ${PANE_PASST_1} "passt #1 in namespace #1" pasta_1 passt_1
pane_watch_contexts ${PANE_PASST_REPAIR_2} "passt-repair #2 in namespace #2" repair_2 passt_repair_2
pane_watch_contexts ${PANE_PASST_2} "passt #2 in namespace #2" pasta_2 passt_2
info_layout "two guests, two passt + passt-repair instances, in namespaces"
info_layout "two guests, two passt instances, in namespaces"
sleep 1
}

View file

@ -49,21 +49,6 @@ td:empty { visibility: hidden; }
__passt_tcp_LINE__ __passt_udp_LINE__
</table>
</li><li><p>passt with vhost-user support</p>
<table class="passt" width="70%">
<tr>
<th/>
<th id="perf_passt_vu_tcp" colspan="__passt_vu_tcp_cols__">TCP, __passt_vu_tcp_threads__ at __passt_vu_tcp_freq__ GHz</th>
<th id="perf_passt_vu_udp" colspan="__passt_vu_udp_cols__">UDP, __passt_vu_udp_threads__ at __passt_vu_udp_freq__ GHz</th>
</tr>
<tr>
<td align="right">MTU:</td>
__passt_vu_tcp_header__
__passt_vu_udp_header__
</tr>
__passt_vu_tcp_LINE__ __passt_vu_udp_LINE__
</table>
<style type="text/CSS">
table.pasta_local td { border: 0px solid; padding: 6px; line-height: 1; }
table.pasta_local td { text-align: right; }

View file

@ -15,7 +15,8 @@
INITRAMFS="${BASEPATH}/mbuto.img"
VCPUS="$( [ $(nproc) -ge 8 ] && echo 6 || echo $(( $(nproc) / 2 + 1 )) )"
MEM_KIB="$(sed -n 's/MemTotal:[ ]*\([0-9]*\) kB/\1/p' /proc/meminfo)"
__mem_kib="$(sed -n 's/MemTotal:[ ]*\([0-9]*\) kB/\1/p' /proc/meminfo)"
VMEM="$((${__mem_kib} / 1024 / 4))"
QEMU_ARCH="$(uname -m)"
[ "${QEMU_ARCH}" = "i686" ] && QEMU_ARCH=i386
@ -45,38 +46,24 @@ setup_passt() {
[ ${PCAP} -eq 1 ] && __opts="${__opts} -p ${LOGDIR}/passt.pcap"
[ ${DEBUG} -eq 1 ] && __opts="${__opts} -d"
[ ${TRACE} -eq 1 ] && __opts="${__opts} --trace"
[ ${VHOST_USER} -eq 1 ] && __opts="${__opts} --vhost-user"
context_run passt "make clean"
context_run passt "make valgrind"
context_run_bg passt "valgrind --max-stackframe=$((4 * 1024 * 1024)) --trace-children=yes --vgdb=no --error-exitcode=1 --suppressions=test/valgrind.supp ./passt ${__opts} -s ${STATESETUP}/passt.socket -f -t 10001 -u 10001 -H hostname1 --fqdn fqdn1.passt.test -P ${STATESETUP}/passt.pid"
context_run_bg passt "valgrind --max-stackframe=$((4 * 1024 * 1024)) --trace-children=yes --vgdb=no --error-exitcode=1 --suppressions=test/valgrind.supp ./passt ${__opts} -s ${STATESETUP}/passt.socket -f -t 10001 -u 10001 -P ${STATESETUP}/passt.pid"
# pidfile isn't created until passt is listening
wait_for [ -f "${STATESETUP}/passt.pid" ]
__vmem="$((${MEM_KIB} / 1024 / 4))"
if [ ${VHOST_USER} -eq 1 ]; then
__vmem="$(((${__vmem} + 500) / 1000))G"
__qemu_netdev=" \
-chardev socket,id=c,path=${STATESETUP}/passt.socket \
-netdev vhost-user,id=v,chardev=c \
-device virtio-net,netdev=v \
-object memory-backend-memfd,id=m,share=on,size=${__vmem} \
-numa node,memdev=m"
else
__qemu_netdev="-device virtio-net-pci,netdev=s \
-netdev stream,id=s,server=off,addr.type=unix,addr.path=${STATESETUP}/passt.socket"
fi
GUEST_CID=94557
context_run_bg qemu 'qemu-system-'"${QEMU_ARCH}" \
' -machine accel=kvm' \
' -m '${__vmem}' -cpu host -smp '${VCPUS} \
' -kernel '"${KERNEL}" \
' -m '${VMEM}' -cpu host -smp '${VCPUS} \
' -kernel ' "/boot/vmlinuz-$(uname -r)" \
' -initrd '${INITRAMFS}' -nographic -serial stdio' \
' -nodefaults' \
' -append "console=ttyS0 mitigations=off apparmor=0" ' \
" ${__qemu_netdev}" \
' -device virtio-net-pci,netdev=s0 ' \
" -netdev stream,id=s0,server=off,addr.type=unix,addr.path=${STATESETUP}/passt.socket " \
" -pidfile ${STATESETUP}/qemu.pid" \
" -device vhost-vsock-pci,guest-cid=$GUEST_CID"
@ -155,43 +142,29 @@ setup_passt_in_ns() {
[ ${PCAP} -eq 1 ] && __opts="${__opts} -p ${LOGDIR}/passt_in_pasta.pcap"
[ ${DEBUG} -eq 1 ] && __opts="${__opts} -d"
[ ${TRACE} -eq 1 ] && __opts="${__opts} --trace"
[ ${VHOST_USER} -eq 1 ] && __opts="${__opts} --vhost-user"
if [ ${VALGRIND} -eq 1 ]; then
context_run passt "make clean"
context_run passt "make valgrind"
context_run_bg passt "valgrind --max-stackframe=$((4 * 1024 * 1024)) --trace-children=yes --vgdb=no --error-exitcode=1 --suppressions=test/valgrind.supp ./passt -f ${__opts} -s ${STATESETUP}/passt.socket -H hostname1 --fqdn fqdn1.passt.test -t 10001,10011,10021,10031 -u 10001,10011,10021,10031 -P ${STATESETUP}/passt.pid --map-host-loopback ${__map_ns4} --map-host-loopback ${__map_ns6}"
context_run_bg passt "valgrind --max-stackframe=$((4 * 1024 * 1024)) --trace-children=yes --vgdb=no --error-exitcode=1 --suppressions=test/valgrind.supp ./passt -f ${__opts} -s ${STATESETUP}/passt.socket -t 10001,10011,10021,10031 -u 10001,10011,10021,10031 -P ${STATESETUP}/passt.pid --map-host-loopback ${__map_ns4} --map-host-loopback ${__map_ns6}"
else
context_run passt "make clean"
context_run passt "make"
context_run_bg passt "./passt -f ${__opts} -s ${STATESETUP}/passt.socket -H hostname1 --fqdn fqdn1.passt.test -t 10001,10011,10021,10031 -u 10001,10011,10021,10031 -P ${STATESETUP}/passt.pid --map-host-loopback ${__map_ns4} --map-host-loopback ${__map_ns6}"
context_run_bg passt "./passt -f ${__opts} -s ${STATESETUP}/passt.socket -t 10001,10011,10021,10031 -u 10001,10011,10021,10031 -P ${STATESETUP}/passt.pid --map-host-loopback ${__map_ns4} --map-host-loopback ${__map_ns6}"
fi
wait_for [ -f "${STATESETUP}/passt.pid" ]
__vmem="$((${MEM_KIB} / 1024 / 4))"
if [ ${VHOST_USER} -eq 1 ]; then
__vmem="$(((${__vmem} + 500) / 1000))G"
__qemu_netdev=" \
-chardev socket,id=c,path=${STATESETUP}/passt.socket \
-netdev vhost-user,id=v,chardev=c \
-device virtio-net,netdev=v \
-object memory-backend-memfd,id=m,share=on,size=${__vmem} \
-numa node,memdev=m"
else
__qemu_netdev="-device virtio-net-pci,netdev=s \
-netdev stream,id=s,server=off,addr.type=unix,addr.path=${STATESETUP}/passt.socket"
fi
GUEST_CID=94557
context_run_bg qemu 'qemu-system-'"${QEMU_ARCH}" \
' -machine accel=kvm' \
' -M accel=kvm:tcg' \
' -m '${__vmem}' -cpu host -smp '${VCPUS} \
' -kernel '"${KERNEL}" \
' -m '${VMEM}' -cpu host -smp '${VCPUS} \
' -kernel ' "/boot/vmlinuz-$(uname -r)" \
' -initrd '${INITRAMFS}' -nographic -serial stdio' \
' -nodefaults' \
' -append "console=ttyS0 mitigations=off apparmor=0" ' \
" ${__qemu_netdev}" \
' -device virtio-net-pci,netdev=s0 ' \
" -netdev stream,id=s0,server=off,addr.type=unix,addr.path=${STATESETUP}/passt.socket " \
" -pidfile ${STATESETUP}/qemu.pid" \
" -device vhost-vsock-pci,guest-cid=$GUEST_CID"
@ -241,126 +214,11 @@ setup_two_guests() {
[ ${PCAP} -eq 1 ] && __opts="${__opts} -p ${LOGDIR}/passt_1.pcap"
[ ${DEBUG} -eq 1 ] && __opts="${__opts} -d"
[ ${TRACE} -eq 1 ] && __opts="${__opts} --trace"
[ ${VHOST_USER} -eq 1 ] && __opts="${__opts} --vhost-user"
context_run_bg passt_1 "./passt -s ${STATESETUP}/passt_1.socket -P ${STATESETUP}/passt_1.pid -f ${__opts} --fqdn fqdn1.passt.test -H hostname1 -t 10001 -u 10001"
wait_for [ -f "${STATESETUP}/passt_1.pid" ]
__opts=
[ ${PCAP} -eq 1 ] && __opts="${__opts} -p ${LOGDIR}/passt_2.pcap"
[ ${DEBUG} -eq 1 ] && __opts="${__opts} -d"
[ ${TRACE} -eq 1 ] && __opts="${__opts} --trace"
[ ${VHOST_USER} -eq 1 ] && __opts="${__opts} --vhost-user"
context_run_bg passt_2 "./passt -s ${STATESETUP}/passt_2.socket -P ${STATESETUP}/passt_2.pid -f ${__opts} --hostname hostname2 --fqdn fqdn2 -t 10004 -u 10004"
wait_for [ -f "${STATESETUP}/passt_2.pid" ]
__vmem="$((${MEM_KIB} / 1024 / 4))"
if [ ${VHOST_USER} -eq 1 ]; then
__vmem="$(((${__vmem} + 500) / 1000))G"
__qemu_netdev1=" \
-chardev socket,id=c,path=${STATESETUP}/passt_1.socket \
-netdev vhost-user,id=v,chardev=c \
-device virtio-net,netdev=v \
-object memory-backend-memfd,id=m,share=on,size=${__vmem} \
-numa node,memdev=m"
__qemu_netdev2=" \
-chardev socket,id=c,path=${STATESETUP}/passt_2.socket \
-netdev vhost-user,id=v,chardev=c \
-device virtio-net,netdev=v \
-object memory-backend-memfd,id=m,share=on,size=${__vmem} \
-numa node,memdev=m"
else
__qemu_netdev1="-device virtio-net-pci,netdev=s \
-netdev stream,id=s,server=off,addr.type=unix,addr.path=${STATESETUP}/passt_1.socket"
__qemu_netdev2="-device virtio-net-pci,netdev=s \
-netdev stream,id=s,server=off,addr.type=unix,addr.path=${STATESETUP}/passt_2.socket"
fi
GUEST_1_CID=94557
context_run_bg qemu_1 'qemu-system-'"${QEMU_ARCH}" \
' -M accel=kvm:tcg' \
' -m '${__vmem}' -cpu host -smp '${VCPUS} \
' -kernel '"${KERNEL}" \
' -initrd '${INITRAMFS}' -nographic -serial stdio' \
' -nodefaults' \
' -append "console=ttyS0 mitigations=off apparmor=0" ' \
" ${__qemu_netdev1}" \
" -pidfile ${STATESETUP}/qemu_1.pid" \
" -device vhost-vsock-pci,guest-cid=$GUEST_1_CID"
GUEST_2_CID=94558
context_run_bg qemu_2 'qemu-system-'"${QEMU_ARCH}" \
' -M accel=kvm:tcg' \
' -m '${__vmem}' -cpu host -smp '${VCPUS} \
' -kernel '"${KERNEL}" \
' -initrd '${INITRAMFS}' -nographic -serial stdio' \
' -nodefaults' \
' -append "console=ttyS0 mitigations=off apparmor=0" ' \
" ${__qemu_netdev2}" \
" -pidfile ${STATESETUP}/qemu_2.pid" \
" -device vhost-vsock-pci,guest-cid=$GUEST_2_CID"
context_setup_guest guest_1 ${GUEST_1_CID}
context_setup_guest guest_2 ${GUEST_2_CID}
}
# setup_migrate() - Set up two namespaces, run qemu, passt/passt-repair in both
setup_migrate() {
context_setup_host host
context_setup_host mon
context_setup_host pasta_1
context_setup_host pasta_2
layout_migrate
# Ports:
#
# guest #1 | guest #2 | ns #1 | host
# --------- |-----------|-----------|------------
# 10001 as server | | to guest | to ns #1
# 10002 | | as server | to ns #1
# 10003 | | to init | as server
# 10004 | as server | to guest | to ns #1
__opts=
[ ${PCAP} -eq 1 ] && __opts="${__opts} -p ${LOGDIR}/pasta_1.pcap"
[ ${DEBUG} -eq 1 ] && __opts="${__opts} -d"
[ ${TRACE} -eq 1 ] && __opts="${__opts} --trace"
__map_host4=192.0.2.1
__map_host6=2001:db8:9a55::1
__map_ns4=192.0.2.2
__map_ns6=2001:db8:9a55::2
# Option 1: send stuff via spliced path in pasta
# context_run_bg pasta_1 "./pasta ${__opts} -P ${STATESETUP}/pasta_1.pid -t 10001,10002 -T 10003 -u 10001,10002 -U 10003 --config-net ${NSTOOL} hold ${STATESETUP}/ns1.hold"
# Option 2: send stuff via tap (--map-guest-addr) instead (useful to see capture of full migration)
context_run_bg pasta_1 "./pasta ${__opts} -P ${STATESETUP}/pasta_1.pid -t 10001,10002,10004 -T 10003 -u 10001,10002,10004 -U 10003 --map-guest-addr ${__map_host4} --map-guest-addr ${__map_host6} --config-net ${NSTOOL} hold ${STATESETUP}/ns1.hold"
context_setup_nstool passt_1 ${STATESETUP}/ns1.hold
context_setup_nstool passt_repair_1 ${STATESETUP}/ns1.hold
context_setup_nstool passt_2 ${STATESETUP}/ns1.hold
context_setup_nstool passt_repair_2 ${STATESETUP}/ns1.hold
context_setup_nstool qemu_1 ${STATESETUP}/ns1.hold
context_setup_nstool qemu_2 ${STATESETUP}/ns1.hold
__ifname="$(context_run qemu_1 "ip -j link show | jq -rM '.[] | select(.link_type == \"ether\").ifname'")"
sleep 1
__opts="--vhost-user"
[ ${PCAP} -eq 1 ] && __opts="${__opts} -p ${LOGDIR}/passt_1.pcap"
[ ${DEBUG} -eq 1 ] && __opts="${__opts} -d"
[ ${TRACE} -eq 1 ] && __opts="${__opts} --trace"
context_run_bg passt_1 "./passt -s ${STATESETUP}/passt_1.socket -P ${STATESETUP}/passt_1.pid -f ${__opts} -t 10001 -u 10001"
wait_for [ -f "${STATESETUP}/passt_1.pid" ]
context_run_bg passt_repair_1 "./passt-repair ${STATESETUP}/passt_1.socket.repair"
__opts="--vhost-user"
__opts=
[ ${PCAP} -eq 1 ] && __opts="${__opts} -p ${LOGDIR}/passt_2.pcap"
[ ${DEBUG} -eq 1 ] && __opts="${__opts} -d"
[ ${TRACE} -eq 1 ] && __opts="${__opts} --trace"
@ -368,52 +226,34 @@ setup_migrate() {
context_run_bg passt_2 "./passt -s ${STATESETUP}/passt_2.socket -P ${STATESETUP}/passt_2.pid -f ${__opts} -t 10004 -u 10004"
wait_for [ -f "${STATESETUP}/passt_2.pid" ]
context_run_bg passt_repair_2 "./passt-repair ${STATESETUP}/passt_2.socket.repair"
__vmem="512M" # Keep migration fast
__qemu_netdev1=" \
-chardev socket,id=c,path=${STATESETUP}/passt_1.socket \
-netdev vhost-user,id=v,chardev=c \
-device virtio-net,netdev=v \
-object memory-backend-memfd,id=m,share=on,size=${__vmem} \
-numa node,memdev=m"
__qemu_netdev2=" \
-chardev socket,id=c,path=${STATESETUP}/passt_2.socket \
-netdev vhost-user,id=v,chardev=c \
-device virtio-net,netdev=v \
-object memory-backend-memfd,id=m,share=on,size=${__vmem} \
-numa node,memdev=m"
GUEST_1_CID=94557
context_run_bg qemu_1 'qemu-system-'"${QEMU_ARCH}" \
' -M accel=kvm:tcg' \
' -m '${__vmem}' -cpu host -smp '${VCPUS} \
' -kernel '"${KERNEL}" \
' -m '${VMEM}' -cpu host -smp '${VCPUS} \
' -kernel ' "/boot/vmlinuz-$(uname -r)" \
' -initrd '${INITRAMFS}' -nographic -serial stdio' \
' -nodefaults' \
' -append "console=ttyS0 mitigations=off apparmor=0" ' \
" ${__qemu_netdev1}" \
' -device virtio-net-pci,netdev=s0 ' \
" -netdev stream,id=s0,server=off,addr.type=unix,addr.path=${STATESETUP}/passt_1.socket " \
" -pidfile ${STATESETUP}/qemu_1.pid" \
" -device vhost-vsock-pci,guest-cid=$GUEST_1_CID" \
" -monitor unix:${STATESETUP}/qemu_1_mon.sock,server,nowait"
" -device vhost-vsock-pci,guest-cid=$GUEST_1_CID"
GUEST_2_CID=94558
context_run_bg qemu_2 'qemu-system-'"${QEMU_ARCH}" \
' -M accel=kvm:tcg' \
' -m '${__vmem}' -cpu host -smp '${VCPUS} \
' -kernel '"${KERNEL}" \
' -m '${VMEM}' -cpu host -smp '${VCPUS} \
' -kernel ' "/boot/vmlinuz-$(uname -r)" \
' -initrd '${INITRAMFS}' -nographic -serial stdio' \
' -nodefaults' \
' -append "console=ttyS0 mitigations=off apparmor=0" ' \
" ${__qemu_netdev2}" \
' -device virtio-net-pci,netdev=s0 ' \
" -netdev stream,id=s0,server=off,addr.type=unix,addr.path=${STATESETUP}/passt_2.socket " \
" -pidfile ${STATESETUP}/qemu_2.pid" \
" -device vhost-vsock-pci,guest-cid=$GUEST_2_CID" \
" -monitor unix:${STATESETUP}/qemu_2_mon.sock,server,nowait" \
" -incoming tcp:0:20005"
" -device vhost-vsock-pci,guest-cid=$GUEST_2_CID"
context_setup_guest guest_1 ${GUEST_1_CID}
# Only available after migration:
( context_setup_guest guest_2 ${GUEST_2_CID} & )
context_setup_guest guest_2 ${GUEST_2_CID}
}
# teardown_context_watch() - Remove contexts and stop panes watching them
@ -486,8 +326,7 @@ teardown_two_guests() {
context_wait pasta_1
context_wait pasta_2
rm "${STATESETUP}/passt_1.pid" "${STATESETUP}/passt_2.pid"
rm "${STATESETUP}/pasta_1.pid" "${STATESETUP}/pasta_2.pid"
rm -f "${STATESETUP}/passt__[12].pid" "${STATESETUP}/pasta_[12].pid"
teardown_context_watch ${PANE_HOST} host
teardown_context_watch ${PANE_GUEST_1} qemu_1 guest_1
@ -496,30 +335,6 @@ teardown_two_guests() {
teardown_context_watch ${PANE_PASST_2} pasta_2 passt_2
}
# teardown_migrate() - Exit namespaces, kill qemu processes, passt and pasta
teardown_migrate() {
# Kill both qemu instances by PID file, running kill through nstool against
# the ns1.hold namespace holder, then wait for their contexts to finish
${NSTOOL} exec ${STATESETUP}/ns1.hold -- kill $(cat "${STATESETUP}/qemu_1.pid")
${NSTOOL} exec ${STATESETUP}/ns1.hold -- kill $(cat "${STATESETUP}/qemu_2.pid")
context_wait qemu_1
context_wait qemu_2
# NOTE(review): only passt_2 is killed explicitly, passt_1 is just waited
# for -- presumably it has already exited once migration completed; confirm
${NSTOOL} exec ${STATESETUP}/ns1.hold -- kill $(cat "${STATESETUP}/passt_2.pid")
context_wait passt_1
context_wait passt_2
# Stop the namespace holder, then wait for the pasta_1 context
${NSTOOL} stop "${STATESETUP}/ns1.hold"
context_wait pasta_1
# -f: don't fail on PID files that don't exist
rm -f "${STATESETUP}/passt_1.pid" "${STATESETUP}/passt_2.pid"
rm -f "${STATESETUP}/pasta_1.pid" "${STATESETUP}/pasta_2.pid"
# Detach the panes from the contexts they were watching
teardown_context_watch ${PANE_HOST} host
teardown_context_watch ${PANE_GUEST_1} qemu_1 guest_1
teardown_context_watch ${PANE_GUEST_2} qemu_2 guest_2
teardown_context_watch ${PANE_PASST_1} pasta_1 passt_1
teardown_context_watch ${PANE_PASST_2} pasta_1 passt_2
}
# teardown_demo_passt() - Exit namespace, kill qemu, passt and pasta
teardown_demo_passt() {
tmux send-keys -t ${PANE_GUEST} "C-c"

View file

@ -33,7 +33,7 @@ setup_memory() {
pane_or_context_run guest 'qemu-system-$(uname -m)' \
' -machine accel=kvm' \
' -m '$((${MEM_KIB} / 1024 / 4))' -cpu host -smp '${VCPUS} \
' -m '${VMEM}' -cpu host -smp '${VCPUS} \
' -kernel ' "/boot/vmlinuz-$(uname -r)" \
' -initrd '${INITRAMFS_MEM}' -nographic -serial stdio' \
' -nodefaults' \

View file

@ -19,7 +19,6 @@ STATUS_FILE_INDEX=0
STATUS_COLS=
STATUS_PASS=0
STATUS_FAIL=0
STATUS_SKIPPED=0
PR_RED='\033[1;31m'
PR_GREEN='\033[1;32m'
@ -32,8 +31,8 @@ PR_DELAY_INIT=100 # ms
# $@: Message to print
info() {
tmux select-pane -t ${PANE_INFO}
printf "${@}\n" >> $STATEBASE/log_pipe
printf "${@}\n" >> "${LOGFILE}"
echo "${@}" >> $STATEBASE/log_pipe
echo "${@}" >> "${LOGFILE}"
}
# info_n() - Highlight, print message to pane and to log file without newline
@ -48,13 +47,13 @@ info_n() {
# $@: Message to print
info_nolog() {
tmux select-pane -t ${PANE_INFO}
printf "${@}\n" >> $STATEBASE/log_pipe
echo "${@}" >> $STATEBASE/log_pipe
}
# log() - Print message to log file
# $@: Message to print
log() {
printf "${@}\n" >> "${LOGFILE}"
echo "${@}" >> "${LOGFILE}"
}
# info_nolog_n() - Send message to pane without highlighting it, without newline
@ -440,21 +439,19 @@ info_layout() {
# status_test_ok() - Update counter of passed tests, log and display message
status_test_ok() {
STATUS_PASS=$((STATUS_PASS + 1))
tmux set status-right "PASS: ${STATUS_PASS} | FAIL: ${STATUS_FAIL} | SKIPPED: ${STATUS_SKIPPED} | #(TZ="UTC" date -Iseconds)"
tmux set status-right "PASS: ${STATUS_PASS} | FAIL: ${STATUS_FAIL} | #(TZ="UTC" date -Iseconds)"
info_passed
}
# status_test_fail() - Update counter of failed tests, log and display message
status_test_fail() {
STATUS_FAIL=$((STATUS_FAIL + 1))
tmux set status-right "PASS: ${STATUS_PASS} | FAIL: ${STATUS_FAIL} | SKIPPED: ${STATUS_SKIPPED} | #(TZ="UTC" date -Iseconds)"
tmux set status-right "PASS: ${STATUS_PASS} | FAIL: ${STATUS_FAIL} | #(TZ="UTC" date -Iseconds)"
info_failed
}
# status_test_skip() - Update counter of skipped tests, log and display message
status_test_skip() {
STATUS_SKIPPED=$((STATUS_SKIPPED + 1))
tmux set status-right "PASS: ${STATUS_PASS} | FAIL: ${STATUS_FAIL} | SKIPPED: ${STATUS_SKIPPED} | #(TZ="UTC" date -Iseconds)"
info_skipped
}
@ -667,7 +664,7 @@ pause_continue() {
# run_term() - Start tmux session, running entry point, with recording if needed
run_term() {
TMUX="tmux new-session -s passt_test -eSTATEBASE=$STATEBASE -ePCAP=$PCAP -eDEBUG=$DEBUG -eTRACE=$TRACE -eKERNEL=$KERNEL"
TMUX="tmux new-session -s passt_test -eSTATEBASE=$STATEBASE -ePCAP=$PCAP -eDEBUG=$DEBUG"
if [ ${CI} -eq 1 ]; then
printf '\e[8;50;240t'

View file

@ -20,7 +20,10 @@ test_iperf3s() {
__sctx="${1}"
__port="${2}"
pane_or_context_run "${__sctx}" 'iperf3 -s -p'${__port}' -D -I s.pid'
pane_or_context_run_bg "${__sctx}" \
'iperf3 -s -p'${__port}' & echo $! > s.pid' \
sleep 1 # Wait for server to be ready
}
# test_iperf3k() - Kill iperf3 server
@ -28,7 +31,7 @@ test_iperf3s() {
test_iperf3k() {
__sctx="${1}"
pane_or_context_run "${__sctx}" 'kill -INT $(cat s.pid)'
pane_or_context_run "${__sctx}" 'kill -INT $(cat s.pid); rm s.pid'
sleep 1 # Wait for kernel to free up ports
}
@ -65,45 +68,6 @@ test_iperf3() {
TEST_ONE_subs="$(list_add_pair "${TEST_ONE_subs}" "__${__var}__" "${__bw}" )"
}
# test_iperf3m() - Ugly helper for iperf3 directive, guest migration variant
# $1:	Variable name: to put the measured bandwidth into
# $2:	Initial source/client context
# $3:	Second source/client context the guest is moving to
# $4:	Destination name or address for client
# $5:	Port number for client
# $6:	Run time, in seconds
# $@:	Client options (all remaining arguments)
test_iperf3m() {
	__var="${1}"; shift
	__cctx="${1}"; shift
	__cctx2="${1}"; shift
	__dest="${1}"; shift
	__port="${1}"; shift
	__time="${1}"; shift

	# Drop any report left over from a previous run
	pane_or_context_run "${__cctx}" 'rm -f c.json'

	# A 1s wait for connection on what's basically a local link
	# indicates something is pretty wrong
	__timeout=1000

	# Start the client in the background in the first context, with its
	# JSON report redirected to c.json. The last argument must not end with
	# a line continuation: otherwise, the assignment that follows is passed
	# as an extra argument instead of being executed.
	pane_or_context_run_bg "${__cctx}" \
		'iperf3 -J -c '${__dest}' -p '${__port} \
		' --connect-timeout '${__timeout} \
		' -t'${__time}' -i0 '"${@}"' > c.json'

	# jq path of the value we sum up from the report
	__jval=".end.sum_received.bits_per_second"

	# Wait for the run time, plus some margin, before reading the report
	sleep $((${__time} + 3))

	# The report is read from the second context: show it, then extract
	# the bandwidth from it
	pane_or_context_output "${__cctx2}" \
		'cat c.json'
	__bw=$(pane_or_context_output "${__cctx2}" \
		'cat c.json | jq -rMs "map('${__jval}') | add"')

	# Make the result available as __<variable name>__ substitution
	TEST_ONE_subs="$(list_add_pair "${TEST_ONE_subs}" "__${__var}__" "${__bw}" )"
}
test_one_line() {
__line="${1}"
@ -213,12 +177,6 @@ test_one_line() {
"guest2w")
pane_or_context_wait guest_2 || TEST_ONE_nok=1
;;
"mon")
pane_or_context_run mon "${__arg}" || TEST_ONE_nok=1
;;
"monb")
pane_or_context_run_bg mon "${__arg}"
;;
"ns")
pane_or_context_run ns "${__arg}" || TEST_ONE_nok=1
;;
@ -334,9 +292,6 @@ test_one_line() {
"iperf3")
test_iperf3 ${__arg}
;;
"iperf3m")
test_iperf3m ${__arg}
;;
"set")
TEST_ONE_subs="$(list_add_pair "${TEST_ONE_subs}" "__${__arg%% *}__" "${__arg#* }")"
;;

View file

@ -1,59 +0,0 @@
# SPDX-License-Identifier: GPL-2.0-or-later
#
# PASST - Plug A Simple Socket Transport
# for qemu/UNIX domain socket mode
#
# PASTA - Pack A Subtle Tap Abstraction
# for network namespace/tap device mode
#
# test/migrate/basic - Check basic migration functionality
#
# Copyright (c) 2025 Red Hat GmbH
# Author: Stefano Brivio <sbrivio@redhat.com>
g1tools ip jq dhclient socat cat
htools ip jq
set MAP_HOST4 192.0.2.1
set MAP_HOST6 2001:db8:9a55::1
set MAP_NS4 192.0.2.2
set MAP_NS6 2001:db8:9a55::2
test Interface name
g1out IFNAME1 ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname'
hout HOST_IFNAME ip -j -4 route show|jq -rM '[.[] | select(.dst == "default").dev] | .[0]'
hout HOST_IFNAME6 ip -j -6 route show|jq -rM '[.[] | select(.dst == "default").dev] | .[0]'
check [ -n "__IFNAME1__" ]
test DHCP: address
guest1 ip link set dev __IFNAME1__ up
guest1 /sbin/dhclient -4 __IFNAME1__
g1out ADDR1 ip -j -4 addr show|jq -rM '.[] | select(.ifname == "__IFNAME1__").addr_info[0].local'
hout HOST_ADDR ip -j -4 addr show|jq -rM '.[] | select(.ifname == "__HOST_IFNAME__").addr_info[0].local'
check [ "__ADDR1__" = "__HOST_ADDR__" ]
test DHCPv6: address
# Link is up now, wait for DAD to complete
guest1 while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done
guest1 /sbin/dhclient -6 __IFNAME1__
# Wait for DAD to complete on the DHCP address
guest1 while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done
g1out ADDR1_6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__IFNAME1__").addr_info[] | select(.prefixlen == 128).local] | .[0]'
hout HOST_ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__HOST_IFNAME6__").addr_info[] | select(.scope == "global" and .deprecated != true).local] | .[0]'
check [ "__ADDR1_6__" = "__HOST_ADDR6__" ]
test TCP/IPv4: guest1/guest2 > host
g1out GW1 ip -j -4 route show|jq -rM '.[] | select(.dst == "default").gateway'
hostb socat -u TCP4-LISTEN:10006 OPEN:__STATESETUP__/msg,create,trunc
sleep 1
# Option 1: via spliced path in pasta, namespace to host
# guest1b { printf "Hello from guest 1"; sleep 10; printf " and from guest 2\n"; } | socat -u STDIN TCP4:__GW1__:10003
# Option 2: via --map-guest-addr (tap) in pasta, namespace to host
guest1b { printf "Hello from guest 1"; sleep 3; printf " and from guest 2\n"; } | socat -u STDIN TCP4:__MAP_HOST4__:10006
sleep 1
mon echo "migrate tcp:0:20005" | socat -u STDIN UNIX:__STATESETUP__/qemu_1_mon.sock
hostw
hout MSG cat __STATESETUP__/msg
check [ "__MSG__" = "Hello from guest 1 and from guest 2" ]

View file

@ -1,62 +0,0 @@
# SPDX-License-Identifier: GPL-2.0-or-later
#
# PASST - Plug A Simple Socket Transport
# for qemu/UNIX domain socket mode
#
# PASTA - Pack A Subtle Tap Abstraction
# for network namespace/tap device mode
#
# test/migrate/basic_fin - Outbound traffic across migration, half-closed socket
#
# Copyright (c) 2025 Red Hat GmbH
# Author: Stefano Brivio <sbrivio@redhat.com>
g1tools ip jq dhclient socat cat
htools ip jq
set MAP_HOST4 192.0.2.1
set MAP_HOST6 2001:db8:9a55::1
set MAP_NS4 192.0.2.2
set MAP_NS6 2001:db8:9a55::2
test Interface name
g1out IFNAME1 ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname'
hout HOST_IFNAME ip -j -4 route show|jq -rM '[.[] | select(.dst == "default").dev] | .[0]'
hout HOST_IFNAME6 ip -j -6 route show|jq -rM '[.[] | select(.dst == "default").dev] | .[0]'
check [ -n "__IFNAME1__" ]
test DHCP: address
guest1 ip link set dev __IFNAME1__ up
guest1 /sbin/dhclient -4 __IFNAME1__
g1out ADDR1 ip -j -4 addr show|jq -rM '.[] | select(.ifname == "__IFNAME1__").addr_info[0].local'
hout HOST_ADDR ip -j -4 addr show|jq -rM '.[] | select(.ifname == "__HOST_IFNAME__").addr_info[0].local'
check [ "__ADDR1__" = "__HOST_ADDR__" ]
test DHCPv6: address
# Link is up now, wait for DAD to complete
guest1 while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done
guest1 /sbin/dhclient -6 __IFNAME1__
# Wait for DAD to complete on the DHCP address
guest1 while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done
g1out ADDR1_6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__IFNAME1__").addr_info[] | select(.prefixlen == 128).local] | .[0]'
hout HOST_ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__HOST_IFNAME6__").addr_info[] | select(.scope == "global" and .deprecated != true).local] | .[0]'
check [ "__ADDR1_6__" = "__HOST_ADDR6__" ]
test TCP/IPv4: guest1, half-close, guest2 > host
g1out GW1 ip -j -4 route show|jq -rM '.[] | select(.dst == "default").gateway'
hostb echo FIN | socat TCP4-LISTEN:10006,shut-down STDIO,ignoreeof > __STATESETUP__/msg
#hostb socat -u TCP4-LISTEN:10006 OPEN:__STATESETUP__/msg,create,trunc
#sleep 20
# Option 1: via spliced path in pasta, namespace to host
# guest1b { printf "Hello from guest 1"; sleep 10; printf " and from guest 2\n"; } | socat -u STDIN TCP4:__GW1__:10003
# Option 2: via --map-guest-addr (tap) in pasta, namespace to host
guest1b { printf "Hello from guest 1"; sleep 3; printf " and from guest 2\n"; } | socat -u STDIN TCP4:__MAP_HOST4__:10006
sleep 1
mon echo "migrate tcp:0:20005" | socat -u STDIN UNIX:__STATESETUP__/qemu_1_mon.sock
hostw
hout MSG cat __STATESETUP__/msg
check [ "__MSG__" = "Hello from guest 1 and from guest 2" ]

View file

@ -1,64 +0,0 @@
# SPDX-License-Identifier: GPL-2.0-or-later
#
# PASST - Plug A Simple Socket Transport
# for qemu/UNIX domain socket mode
#
# PASTA - Pack A Subtle Tap Abstraction
# for network namespace/tap device mode
#
# test/migrate/bidirectional - Check migration with messages in both directions
#
# Copyright (c) 2025 Red Hat GmbH
# Author: Stefano Brivio <sbrivio@redhat.com>
g1tools ip jq dhclient socat cat
htools ip jq
set MAP_HOST4 192.0.2.1
set MAP_HOST6 2001:db8:9a55::1
set MAP_NS4 192.0.2.2
set MAP_NS6 2001:db8:9a55::2
test Interface name
g1out IFNAME1 ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname'
hout HOST_IFNAME ip -j -4 route show|jq -rM '[.[] | select(.dst == "default").dev] | .[0]'
hout HOST_IFNAME6 ip -j -6 route show|jq -rM '[.[] | select(.dst == "default").dev] | .[0]'
check [ -n "__IFNAME1__" ]
test DHCP: address
guest1 ip link set dev __IFNAME1__ up
guest1 /sbin/dhclient -4 __IFNAME1__
g1out ADDR1 ip -j -4 addr show|jq -rM '.[] | select(.ifname == "__IFNAME1__").addr_info[0].local'
hout HOST_ADDR ip -j -4 addr show|jq -rM '.[] | select(.ifname == "__HOST_IFNAME__").addr_info[0].local'
check [ "__ADDR1__" = "__HOST_ADDR__" ]
test TCP/IPv4: guest1/guest2 > host, host > guest1/guest2
g1out GW1 ip -j -4 route show|jq -rM '.[] | select(.dst == "default").gateway'
hostb socat -u TCP4-LISTEN:10006 OPEN:__STATESETUP__/msg,create,trunc
guest1b socat -u TCP4-LISTEN:10001 OPEN:msg,create,trunc
sleep 1
guest1b socat -u UNIX-RECV:proxy.sock,null-eof TCP4:__MAP_HOST4__:10006
hostb socat -u UNIX-RECV:__STATESETUP__/proxy.sock,null-eof TCP4:__ADDR1__:10001
sleep 1
guest1 printf "Hello from guest 1" | socat -u STDIN UNIX:proxy.sock
host printf "Dear guest 1," | socat -u STDIN UNIX:__STATESETUP__/proxy.sock
sleep 1
mon echo "migrate tcp:0:20005" | socat -u STDIN UNIX:__STATESETUP__/qemu_1_mon.sock
sleep 2
guest2 printf " and from guest 2" | socat -u STDIN UNIX:proxy.sock,shut-null
host printf " you are now guest 2" | socat -u STDIN UNIX:__STATESETUP__/proxy.sock,shut-null
hostw
# FIXME: guest2w doesn't work here because shell jobs are (also) from guest #1,
# use sleep 1 for the moment
sleep 1
hout MSG cat __STATESETUP__/msg
check [ "__MSG__" = "Hello from guest 1 and from guest 2" ]
g2out MSG cat msg
check [ "__MSG__" = "Dear guest 1, you are now guest 2" ]

View file

@ -1,64 +0,0 @@
# SPDX-License-Identifier: GPL-2.0-or-later
#
# PASST - Plug A Simple Socket Transport
# for qemu/UNIX domain socket mode
#
# PASTA - Pack A Subtle Tap Abstraction
# for network namespace/tap device mode
#
# test/migrate/bidirectional_fin - Both directions, half-closed sockets
#
# Copyright (c) 2025 Red Hat GmbH
# Author: Stefano Brivio <sbrivio@redhat.com>
# Tools used by guest and host commands below.
# NOTE(review): presumably the test is skipped when one is missing -- confirm
# against the test library. Host commands also use socat and cat.
g1tools ip jq dhclient socat cat
htools ip jq
# Variables referenced below as __NAME__; only MAP_HOST4 is used in this file
set MAP_HOST4 192.0.2.1
set MAP_HOST6 2001:db8:9a55::1
set MAP_NS4 192.0.2.2
set MAP_NS6 2001:db8:9a55::2
# Find the guest Ethernet interface and the host default route interfaces
test Interface name
g1out IFNAME1 ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname'
hout HOST_IFNAME ip -j -4 route show|jq -rM '[.[] | select(.dst == "default").dev] | .[0]'
hout HOST_IFNAME6 ip -j -6 route show|jq -rM '[.[] | select(.dst == "default").dev] | .[0]'
check [ -n "__IFNAME1__" ]
# Guest #1 must obtain, via DHCP, the same IPv4 address as the host interface
test DHCP: address
guest1 ip link set dev __IFNAME1__ up
guest1 /sbin/dhclient -4 __IFNAME1__
g1out ADDR1 ip -j -4 addr show|jq -rM '.[] | select(.ifname == "__IFNAME1__").addr_info[0].local'
hout HOST_ADDR ip -j -4 addr show|jq -rM '.[] | select(.ifname == "__HOST_IFNAME__").addr_info[0].local'
check [ "__ADDR1__" = "__HOST_ADDR__" ]
test TCP/IPv4: guest1/guest2 <- (half closed) -> host
# GW1 is captured here but not referenced again in this file
g1out GW1 ip -j -4 route show|jq -rM '.[] | select(.dst == "default").gateway'
# Background listeners, host on port 10006 and guest on port 10001: each is fed
# "FIN" on standard input and writes what it receives to a file named msg
hostb echo FIN | socat TCP4-LISTEN:10006,shut-down STDIO,ignoreeof > __STATESETUP__/msg
guest1b echo FIN | socat TCP4-LISTEN:10001,shut-down STDIO,ignoreeof > msg
sleep 1
# Background proxies: data written to proxy.sock is forwarded over a TCP
# connection to the listener on the other side (guest to host and host to guest)
guest1b socat -u UNIX-RECV:proxy.sock,null-eof TCP4:__MAP_HOST4__:10006
hostb socat -u UNIX-RECV:__STATESETUP__/proxy.sock,null-eof TCP4:__ADDR1__:10001
sleep 1
# First half of each message, sent before migration
guest1 printf "Hello from guest 1" | socat -u STDIN UNIX:proxy.sock
host printf "Dear guest 1," | socat -u STDIN UNIX:__STATESETUP__/proxy.sock
sleep 1
# Write the migrate command to the qemu_1_mon.sock monitor socket
mon echo "migrate tcp:0:20005" | socat -u STDIN UNIX:__STATESETUP__/qemu_1_mon.sock
sleep 2
# Second half of each message, sent after migration (guest side from guest #2)
guest2 printf " and from guest 2" | socat -u STDIN UNIX:proxy.sock,shut-null
host printf " you are now guest 2" | socat -u STDIN UNIX:__STATESETUP__/proxy.sock,shut-null
# NOTE(review): presumably waits for the host background jobs -- confirm
hostw
# FIXME: guest2w doesn't work here because shell jobs are (also) from guest #1,
# use sleep 1 for the moment
sleep 1
# Both halves must have arrived, in order, on each side
hout MSG cat __STATESETUP__/msg
check [ "__MSG__" = "Hello from guest 1 and from guest 2" ]
g2out MSG cat msg
check [ "__MSG__" = "Dear guest 1, you are now guest 2" ]

View file

@ -1,58 +0,0 @@
# SPDX-License-Identifier: GPL-2.0-or-later
#
# PASST - Plug A Simple Socket Transport
# for qemu/UNIX domain socket mode
#
# PASTA - Pack A Subtle Tap Abstraction
# for network namespace/tap device mode
#
# test/migrate/iperf3_bidir6 - Migration behaviour with many bidirectional flows
#
# Copyright (c) 2025 Red Hat GmbH
# Author: Stefano Brivio <sbrivio@redhat.com>
# Tools used by guest and host commands below.
# NOTE(review): presumably the test is skipped when one is missing -- confirm
g1tools ip jq dhclient socat cat
htools ip jq
# Variables referenced below as __NAME__; only MAP_HOST6 is used in this file
set MAP_HOST4 192.0.2.1
set MAP_HOST6 2001:db8:9a55::1
set MAP_NS4 192.0.2.2
set MAP_NS6 2001:db8:9a55::2
# Parameters passed to iperf3m in the last test: 128 parallel streams
# (-P __THREADS__), both directions at once (--bidir)
set THREADS 128
set TIME 3
set OMIT 0.1
set OPTS -Z -P __THREADS__ -O__OMIT__ -N --bidir
# Find the guest Ethernet interface and the host default route interfaces
test Interface name
g1out IFNAME1 ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname'
hout HOST_IFNAME ip -j -4 route show|jq -rM '[.[] | select(.dst == "default").dev] | .[0]'
hout HOST_IFNAME6 ip -j -6 route show|jq -rM '[.[] | select(.dst == "default").dev] | .[0]'
check [ -n "__IFNAME1__" ]
# Guest #1 must obtain, via DHCP, the same IPv4 address as the host interface
test DHCP: address
guest1 ip link set dev __IFNAME1__ up
guest1 /sbin/dhclient -4 __IFNAME1__
g1out ADDR1 ip -j -4 addr show|jq -rM '.[] | select(.ifname == "__IFNAME1__").addr_info[0].local'
hout HOST_ADDR ip -j -4 addr show|jq -rM '.[] | select(.ifname == "__HOST_IFNAME__").addr_info[0].local'
check [ "__ADDR1__" = "__HOST_ADDR__" ]
# The guest /128 address must match the first global, non-deprecated address
# of the host IPv6 default route interface
test DHCPv6: address
# Link is up now, wait for DAD to complete
guest1 while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done
guest1 /sbin/dhclient -6 __IFNAME1__
# Wait for DAD to complete on the DHCP address
guest1 while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done
g1out ADDR1_6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__IFNAME1__").addr_info[] | select(.prefixlen == 128).local] | .[0]'
hout HOST_ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__HOST_IFNAME6__").addr_info[] | select(.scope == "global" and .deprecated != true).local] | .[0]'
check [ "__ADDR1_6__" = "__HOST_ADDR6__" ]
test TCP/IPv6 host <-> guest flood, many flows, during migration
# In the background: after one second, write the migrate command to the
# qemu_1_mon.sock monitor socket, so that migration starts while traffic flows
monb sleep 1; echo "migrate tcp:0:20005" | socat -u STDIN UNIX:__STATESETUP__/qemu_1_mon.sock
# NOTE(review): presumably iperf3s starts a server on the host (port 10006),
# iperf3m runs the client across guest_1/guest_2 and stores the result in BW,
# bw checks __BW__ against the two thresholds, iperf3k stops the server --
# confirm against the test library
iperf3s host 10006
iperf3m BW guest_1 guest_2 __MAP_HOST6__ 10006 __TIME__ __OPTS__
bw __BW__ 1 2
iperf3k host

View file

@ -1,50 +0,0 @@
# SPDX-License-Identifier: GPL-2.0-or-later
#
# PASST - Plug A Simple Socket Transport
# for qemu/UNIX domain socket mode
#
# PASTA - Pack A Subtle Tap Abstraction
# for network namespace/tap device mode
#
# test/migrate/iperf3_in4 - Migration behaviour under inbound IPv4 flood
#
# Copyright (c) 2025 Red Hat GmbH
# Author: Stefano Brivio <sbrivio@redhat.com>
# Tools used by guest and host commands below.
# NOTE(review): presumably the test is skipped when one is missing -- confirm
g1tools ip jq dhclient socat cat
htools ip jq
# Variables referenced below as __NAME__; only MAP_HOST4 is used in this file
set MAP_HOST4 192.0.2.1
set MAP_HOST6 2001:db8:9a55::1
set MAP_NS4 192.0.2.2
set MAP_NS6 2001:db8:9a55::2
# Raise the maximum socket receive and send buffer sizes in guest #1 to 32 MiB
guest1 /sbin/sysctl -w net.core.rmem_max=33554432
guest1 /sbin/sysctl -w net.core.wmem_max=33554432
# Parameters passed to iperf3m in the last test: a single stream, in reverse
# mode (-R), matching the inbound direction named in the file header
set THREADS 1
set TIME 4
set OMIT 0.1
set OPTS -Z -P __THREADS__ -O__OMIT__ -N -R
# Find the guest Ethernet interface and the host IPv4 default route interface
test Interface name
g1out IFNAME1 ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname'
hout HOST_IFNAME ip -j -4 route show|jq -rM '[.[] | select(.dst == "default").dev] | .[0]'
check [ -n "__IFNAME1__" ]
# Guest #1 must obtain, via DHCP, the same IPv4 address as the host interface
test DHCP: address
guest1 ip link set dev __IFNAME1__ up
guest1 /sbin/dhclient -4 __IFNAME1__
g1out ADDR1 ip -j -4 addr show|jq -rM '.[] | select(.ifname == "__IFNAME1__").addr_info[0].local'
hout HOST_ADDR ip -j -4 addr show|jq -rM '.[] | select(.ifname == "__HOST_IFNAME__").addr_info[0].local'
check [ "__ADDR1__" = "__HOST_ADDR__" ]
test TCP/IPv4 host to guest throughput during migration
# In the background: after one second, write the migrate command to the
# qemu_1_mon.sock monitor socket, so that migration starts while traffic flows
monb sleep 1; echo "migrate tcp:0:20005" | socat -u STDIN UNIX:__STATESETUP__/qemu_1_mon.sock
# NOTE(review): presumably iperf3s starts a server on the host (port 10006),
# iperf3m runs the client across guest_1/guest_2 and stores the result in BW,
# bw checks __BW__ against the two thresholds, iperf3k stops the server --
# confirm against the test library
iperf3s host 10006
iperf3m BW guest_1 guest_2 __MAP_HOST4__ 10006 __TIME__ __OPTS__
bw __BW__ 1 2
iperf3k host

View file

@ -1,58 +0,0 @@
# SPDX-License-Identifier: GPL-2.0-or-later
#
# PASST - Plug A Simple Socket Transport
# for qemu/UNIX domain socket mode
#
# PASTA - Pack A Subtle Tap Abstraction
# for network namespace/tap device mode
#
# test/migrate/iperf3_in6 - Migration behaviour under inbound IPv6 flood
#
# Copyright (c) 2025 Red Hat GmbH
# Author: Stefano Brivio <sbrivio@redhat.com>
# Tools used by guest and host commands below.
# NOTE(review): presumably the test is skipped when one is missing -- confirm
g1tools ip jq dhclient socat cat
htools ip jq
# Variables referenced below as __NAME__; only MAP_HOST6 is used in this file
set MAP_HOST4 192.0.2.1
set MAP_HOST6 2001:db8:9a55::1
set MAP_NS4 192.0.2.2
set MAP_NS6 2001:db8:9a55::2
# Parameters passed to iperf3m in the last test: four parallel streams, in
# reverse mode (-R), matching the inbound direction named in the file header
set THREADS 4
set TIME 3
set OMIT 0.1
set OPTS -Z -P __THREADS__ -O__OMIT__ -N -R
# Find the guest Ethernet interface and the host default route interfaces
test Interface name
g1out IFNAME1 ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname'
hout HOST_IFNAME ip -j -4 route show|jq -rM '[.[] | select(.dst == "default").dev] | .[0]'
hout HOST_IFNAME6 ip -j -6 route show|jq -rM '[.[] | select(.dst == "default").dev] | .[0]'
check [ -n "__IFNAME1__" ]
# Guest #1 must obtain, via DHCP, the same IPv4 address as the host interface
test DHCP: address
guest1 ip link set dev __IFNAME1__ up
guest1 /sbin/dhclient -4 __IFNAME1__
g1out ADDR1 ip -j -4 addr show|jq -rM '.[] | select(.ifname == "__IFNAME1__").addr_info[0].local'
hout HOST_ADDR ip -j -4 addr show|jq -rM '.[] | select(.ifname == "__HOST_IFNAME__").addr_info[0].local'
check [ "__ADDR1__" = "__HOST_ADDR__" ]
# The guest /128 address must match the first global, non-deprecated address
# of the host IPv6 default route interface
test DHCPv6: address
# Link is up now, wait for DAD to complete
guest1 while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done
guest1 /sbin/dhclient -6 __IFNAME1__
# Wait for DAD to complete on the DHCP address
guest1 while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done
g1out ADDR1_6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__IFNAME1__").addr_info[] | select(.prefixlen == 128).local] | .[0]'
hout HOST_ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__HOST_IFNAME6__").addr_info[] | select(.scope == "global" and .deprecated != true).local] | .[0]'
check [ "__ADDR1_6__" = "__HOST_ADDR6__" ]
test TCP/IPv6 host to guest throughput during migration
# In the background: after one second, write the migrate command to the
# qemu_1_mon.sock monitor socket, so that migration starts while traffic flows
monb sleep 1; echo "migrate tcp:0:20005" | socat -u STDIN UNIX:__STATESETUP__/qemu_1_mon.sock
# NOTE(review): presumably iperf3s starts a server on the host (port 10006),
# iperf3m runs the client across guest_1/guest_2 and stores the result in BW,
# bw checks __BW__ against the two thresholds, iperf3k stops the server --
# confirm against the test library
iperf3s host 10006
iperf3m BW guest_1 guest_2 __MAP_HOST6__ 10006 __TIME__ __OPTS__
bw __BW__ 1 2
iperf3k host

View file

@ -1,60 +0,0 @@
# SPDX-License-Identifier: GPL-2.0-or-later
#
# PASST - Plug A Simple Socket Transport
# for qemu/UNIX domain socket mode
#
# PASTA - Pack A Subtle Tap Abstraction
# for network namespace/tap device mode
#
# test/migrate/iperf3_many_out6 - Migration behaviour with many outbound flows
#
# Copyright (c) 2025 Red Hat GmbH
# Author: Stefano Brivio <sbrivio@redhat.com>
# Tools used by guest and host commands below.
# NOTE(review): presumably the test is skipped when one is missing -- confirm
g1tools ip jq dhclient socat cat
htools ip jq
# Variables referenced below as __NAME__; only MAP_HOST6 is used in this file
set MAP_HOST4 192.0.2.1
set MAP_HOST6 2001:db8:9a55::1
set MAP_NS4 192.0.2.2
set MAP_NS6 2001:db8:9a55::2
# Parameters passed to iperf3m in the last test: 16 parallel streams
# (-P __THREADS__), no -R, so the guest is the sending side
set THREADS 16
set TIME 3
set OMIT 0.1
set OPTS -Z -P __THREADS__ -O__OMIT__ -N -l 1M
# Find the guest Ethernet interface and the host default route interfaces
test Interface name
g1out IFNAME1 ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname'
hout HOST_IFNAME ip -j -4 route show|jq -rM '[.[] | select(.dst == "default").dev] | .[0]'
hout HOST_IFNAME6 ip -j -6 route show|jq -rM '[.[] | select(.dst == "default").dev] | .[0]'
check [ -n "__IFNAME1__" ]
# Guest #1 must obtain, via DHCP, the same IPv4 address as the host interface
test DHCP: address
guest1 ip link set dev __IFNAME1__ up
guest1 /sbin/dhclient -4 __IFNAME1__
g1out ADDR1 ip -j -4 addr show|jq -rM '.[] | select(.ifname == "__IFNAME1__").addr_info[0].local'
hout HOST_ADDR ip -j -4 addr show|jq -rM '.[] | select(.ifname == "__HOST_IFNAME__").addr_info[0].local'
check [ "__ADDR1__" = "__HOST_ADDR__" ]
# The guest /128 address must match the first global, non-deprecated address
# of the host IPv6 default route interface
test DHCPv6: address
# Link is up now, wait for DAD to complete
guest1 while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done
guest1 /sbin/dhclient -6 __IFNAME1__
# Wait for DAD to complete on the DHCP address
guest1 while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done
g1out ADDR1_6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__IFNAME1__").addr_info[] | select(.prefixlen == 128).local] | .[0]'
hout HOST_ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__HOST_IFNAME6__").addr_info[] | select(.scope == "global" and .deprecated != true).local] | .[0]'
check [ "__ADDR1_6__" = "__HOST_ADDR6__" ]
# Outbound flood (no -R in OPTS, so the guest is the sending side) over many
# parallel flows, with migration triggered while the traffic is running
test TCP/IPv6 guest to host flood, many flows, during migration
# In the background: after one second, write the migrate command to the
# qemu_1_mon.sock monitor socket, so that migration starts while traffic flows
monb sleep 1; echo "migrate tcp:0:20005" | socat -u STDIN UNIX:__STATESETUP__/qemu_1_mon.sock
# NOTE(review): presumably iperf3s starts a server on the host (port 10006),
# iperf3m runs the client across guest_1/guest_2 and stores the result in BW,
# bw checks __BW__ against the two thresholds, iperf3k stops the server --
# confirm against the test library
iperf3s host 10006
iperf3m BW guest_1 guest_2 __MAP_HOST6__ 10006 __TIME__ __OPTS__
bw __BW__ 1 2
iperf3k host

View file

@ -1,47 +0,0 @@
# SPDX-License-Identifier: GPL-2.0-or-later
#
# PASST - Plug A Simple Socket Transport
# for qemu/UNIX domain socket mode
#
# PASTA - Pack A Subtle Tap Abstraction
# for network namespace/tap device mode
#
# test/migrate/iperf3_out4 - Migration behaviour under outbound IPv4 flood
#
# Copyright (c) 2025 Red Hat GmbH
# Author: Stefano Brivio <sbrivio@redhat.com>
# Tools used by guest and host commands below.
# NOTE(review): presumably the test is skipped when one is missing -- confirm
g1tools ip jq dhclient socat cat
htools ip jq
# Variables referenced below as __NAME__; only MAP_HOST4 is used in this file
set MAP_HOST4 192.0.2.1
set MAP_HOST6 2001:db8:9a55::1
set MAP_NS4 192.0.2.2
set MAP_NS6 2001:db8:9a55::2
# Parameters passed to iperf3m in the last test: six parallel streams
# (-P __THREADS__), no -R, so the guest is the sending side
set THREADS 6
set TIME 2
set OMIT 0.1
set OPTS -P __THREADS__ -O__OMIT__ -Z -N -l 1M
# Find the guest Ethernet interface and the host IPv4 default route interface
test Interface name
g1out IFNAME1 ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname'
hout HOST_IFNAME ip -j -4 route show|jq -rM '[.[] | select(.dst == "default").dev] | .[0]'
check [ -n "__IFNAME1__" ]
# Guest #1 must obtain, via DHCP, the same IPv4 address as the host interface
test DHCP: address
guest1 ip link set dev __IFNAME1__ up
guest1 /sbin/dhclient -4 __IFNAME1__
g1out ADDR1 ip -j -4 addr show|jq -rM '.[] | select(.ifname == "__IFNAME1__").addr_info[0].local'
hout HOST_ADDR ip -j -4 addr show|jq -rM '.[] | select(.ifname == "__HOST_IFNAME__").addr_info[0].local'
check [ "__ADDR1__" = "__HOST_ADDR__" ]
test TCP/IPv4 guest to host throughput during migration
# In the background: after one second, write the migrate command to the
# qemu_1_mon.sock monitor socket, so that migration starts while traffic flows
monb sleep 1; echo "migrate tcp:0:20005" | socat -u STDIN UNIX:__STATESETUP__/qemu_1_mon.sock
# NOTE(review): presumably iperf3s starts a server on the host (port 10006),
# iperf3m runs the client across guest_1/guest_2 and stores the result in BW,
# bw checks __BW__ against the two thresholds, iperf3k stops the server --
# confirm against the test library
iperf3s host 10006
iperf3m BW guest_1 guest_2 __MAP_HOST4__ 10006 __TIME__ __OPTS__
bw __BW__ 1 2
iperf3k host

View file

@ -1,58 +0,0 @@
# SPDX-License-Identifier: GPL-2.0-or-later
#
# PASST - Plug A Simple Socket Transport
# for qemu/UNIX domain socket mode
#
# PASTA - Pack A Subtle Tap Abstraction
# for network namespace/tap device mode
#
# test/migrate/iperf3_out6 - Migration behaviour under outbound IPv6 flood
#
# Copyright (c) 2025 Red Hat GmbH
# Author: Stefano Brivio <sbrivio@redhat.com>
# Tools used by guest and host commands below.
# NOTE(review): presumably the test is skipped when one is missing -- confirm
g1tools ip jq dhclient socat cat
htools ip jq
# Variables referenced below as __NAME__; only MAP_HOST6 is used in this file
set MAP_HOST4 192.0.2.1
set MAP_HOST6 2001:db8:9a55::1
set MAP_NS4 192.0.2.2
set MAP_NS6 2001:db8:9a55::2
# Parameters passed to iperf3m in the last test: six parallel streams
# (-P __THREADS__), no -R, so the guest is the sending side
set THREADS 6
set TIME 2
set OMIT 0.1
set OPTS -P __THREADS__ -O__OMIT__ -Z -N -l 1M
# Find the guest Ethernet interface and the host default route interfaces
test Interface name
g1out IFNAME1 ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname'
hout HOST_IFNAME ip -j -4 route show|jq -rM '[.[] | select(.dst == "default").dev] | .[0]'
hout HOST_IFNAME6 ip -j -6 route show|jq -rM '[.[] | select(.dst == "default").dev] | .[0]'
check [ -n "__IFNAME1__" ]
# Guest #1 must obtain, via DHCP, the same IPv4 address as the host interface
test DHCP: address
guest1 ip link set dev __IFNAME1__ up
guest1 /sbin/dhclient -4 __IFNAME1__
g1out ADDR1 ip -j -4 addr show|jq -rM '.[] | select(.ifname == "__IFNAME1__").addr_info[0].local'
hout HOST_ADDR ip -j -4 addr show|jq -rM '.[] | select(.ifname == "__HOST_IFNAME__").addr_info[0].local'
check [ "__ADDR1__" = "__HOST_ADDR__" ]
# The guest /128 address must match the first global, non-deprecated address
# of the host IPv6 default route interface
test DHCPv6: address
# Link is up now, wait for DAD to complete
guest1 while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done
guest1 /sbin/dhclient -6 __IFNAME1__
# Wait for DAD to complete on the DHCP address
guest1 while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done
g1out ADDR1_6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__IFNAME1__").addr_info[] | select(.prefixlen == 128).local] | .[0]'
hout HOST_ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__HOST_IFNAME6__").addr_info[] | select(.scope == "global" and .deprecated != true).local] | .[0]'
check [ "__ADDR1_6__" = "__HOST_ADDR6__" ]
test TCP/IPv6 guest to host throughput during migration
# In the background: after one second, write the migrate command to the
# qemu_1_mon.sock monitor socket, so that migration starts while traffic flows
monb sleep 1; echo "migrate tcp:0:20005" | socat -u STDIN UNIX:__STATESETUP__/qemu_1_mon.sock
# NOTE(review): presumably iperf3s starts a server on the host (port 10006),
# iperf3m runs the client across guest_1/guest_2 and stores the result in BW,
# bw checks __BW__ against the two thresholds, iperf3k stops the server --
# confirm against the test library
iperf3s host 10006
iperf3m BW guest_1 guest_2 __MAP_HOST6__ 10006 __TIME__ __OPTS__
bw __BW__ 1 2
iperf3k host

Some files were not shown because too many files have changed in this diff Show more