mirror of
https://passt.top/passt
synced 2025-07-26 11:28:00 +02:00
Compare commits
No commits in common. "master" and "2024_09_06.6b38f07" have entirely different histories.
master
...
2024_09_06
141 changed files with 2600 additions and 13161 deletions
.clang-format.clang-tidy.clangd.gitignoreMakefileREADME.mdarch.carp.cchecksum.cchecksum.hconf.cconf.hdhcp.cdhcpv6.cepoll_type.hflow.cflow.hflow_table.hfwd.cfwd.h
contrib
apparmor
fedora
selinux
doc
migration
platform-requirements
hooks
icmp.cinany.cinany.hiov.ciov.hip.hisolation.clineread.clinux_dep.hlog.clog.hmigrate.cmigrate.hndp.cndp.hnetlink.cpacket.cpacket.hpasst-repair.1passt-repair.cpasst.1passt.cpasst.hpasta.cpcap.cpcap.hpif.cpif.hrepair.crepair.hseccomp.shsiphash.htap.ctap.htcp.ctcp.htcp_buf.ctcp_buf.htcp_conn.htcp_internal.htcp_splice.ctcp_vu.ctcp_vu.htest
126
.clang-format
126
.clang-format
|
@ -1,126 +0,0 @@
|
|||
# SPDX-License-Identifier: GPL-2.0
|
||||
#
|
||||
# clang-format configuration file. Intended for clang-format >= 11.
|
||||
#
|
||||
# For more information, see:
|
||||
#
|
||||
# Documentation/dev-tools/clang-format.rst
|
||||
# https://clang.llvm.org/docs/ClangFormat.html
|
||||
# https://clang.llvm.org/docs/ClangFormatStyleOptions.html
|
||||
#
|
||||
---
|
||||
AccessModifierOffset: -4
|
||||
AlignAfterOpenBracket: Align
|
||||
AlignConsecutiveAssignments: false
|
||||
AlignConsecutiveDeclarations: false
|
||||
AlignEscapedNewlines: Left
|
||||
AlignOperands: true
|
||||
AlignTrailingComments: false
|
||||
AllowAllParametersOfDeclarationOnNextLine: false
|
||||
AllowShortBlocksOnASingleLine: false
|
||||
AllowShortCaseLabelsOnASingleLine: false
|
||||
AllowShortFunctionsOnASingleLine: None
|
||||
AllowShortIfStatementsOnASingleLine: false
|
||||
AllowShortLoopsOnASingleLine: false
|
||||
AlwaysBreakAfterDefinitionReturnType: None
|
||||
AlwaysBreakAfterReturnType: None
|
||||
AlwaysBreakBeforeMultilineStrings: false
|
||||
AlwaysBreakTemplateDeclarations: false
|
||||
BinPackArguments: true
|
||||
BinPackParameters: true
|
||||
BraceWrapping:
|
||||
AfterClass: false
|
||||
AfterControlStatement: false
|
||||
AfterEnum: false
|
||||
AfterFunction: true
|
||||
AfterNamespace: true
|
||||
AfterObjCDeclaration: false
|
||||
AfterStruct: false
|
||||
AfterUnion: false
|
||||
AfterExternBlock: false
|
||||
BeforeCatch: false
|
||||
BeforeElse: false
|
||||
IndentBraces: false
|
||||
SplitEmptyFunction: true
|
||||
SplitEmptyRecord: true
|
||||
SplitEmptyNamespace: true
|
||||
BreakBeforeBinaryOperators: None
|
||||
BreakBeforeBraces: Custom
|
||||
BreakBeforeInheritanceComma: false
|
||||
BreakBeforeTernaryOperators: false
|
||||
BreakConstructorInitializersBeforeComma: false
|
||||
BreakConstructorInitializers: BeforeComma
|
||||
BreakAfterJavaFieldAnnotations: false
|
||||
BreakStringLiterals: false
|
||||
ColumnLimit: 80
|
||||
CommentPragmas: '^ IWYU pragma:'
|
||||
CompactNamespaces: false
|
||||
ConstructorInitializerAllOnOneLineOrOnePerLine: false
|
||||
ConstructorInitializerIndentWidth: 8
|
||||
ContinuationIndentWidth: 8
|
||||
Cpp11BracedListStyle: false
|
||||
DerivePointerAlignment: false
|
||||
DisableFormat: false
|
||||
ExperimentalAutoDetectBinPacking: false
|
||||
FixNamespaceComments: false
|
||||
|
||||
# Taken from:
|
||||
# git grep -h '^#define [^[:space:]]*for_each[^[:space:]]*(' include/ tools/ \
|
||||
# | sed "s,^#define \([^[:space:]]*for_each[^[:space:]]*\)(.*$, - '\1'," \
|
||||
# | LC_ALL=C sort -u
|
||||
ForEachMacros:
|
||||
- 'for_each_nst'
|
||||
|
||||
IncludeBlocks: Preserve
|
||||
IncludeCategories:
|
||||
- Regex: '.*'
|
||||
Priority: 1
|
||||
IncludeIsMainRegex: '(Test)?$'
|
||||
IndentCaseLabels: false
|
||||
IndentGotoLabels: false
|
||||
IndentPPDirectives: None
|
||||
IndentWidth: 8
|
||||
IndentWrappedFunctionNames: false
|
||||
JavaScriptQuotes: Leave
|
||||
JavaScriptWrapImports: true
|
||||
KeepEmptyLinesAtTheStartOfBlocks: false
|
||||
MacroBlockBegin: ''
|
||||
MacroBlockEnd: ''
|
||||
MaxEmptyLinesToKeep: 1
|
||||
NamespaceIndentation: None
|
||||
ObjCBinPackProtocolList: Auto
|
||||
ObjCBlockIndentWidth: 8
|
||||
ObjCSpaceAfterProperty: true
|
||||
ObjCSpaceBeforeProtocolList: true
|
||||
|
||||
# Taken from git's rules
|
||||
PenaltyBreakAssignment: 10
|
||||
PenaltyBreakBeforeFirstCallParameter: 30
|
||||
PenaltyBreakComment: 10
|
||||
PenaltyBreakFirstLessLess: 0
|
||||
PenaltyBreakString: 10
|
||||
PenaltyExcessCharacter: 100
|
||||
PenaltyReturnTypeOnItsOwnLine: 60
|
||||
|
||||
PointerAlignment: Right
|
||||
ReflowComments: false
|
||||
SortIncludes: false
|
||||
SortUsingDeclarations: false
|
||||
SpaceAfterCStyleCast: false
|
||||
SpaceAfterTemplateKeyword: true
|
||||
SpaceBeforeAssignmentOperators: true
|
||||
SpaceBeforeCtorInitializerColon: true
|
||||
SpaceBeforeInheritanceColon: true
|
||||
SpaceBeforeParens: ControlStatementsExceptForEachMacros
|
||||
SpaceBeforeRangeBasedForLoopColon: true
|
||||
SpaceInEmptyParentheses: false
|
||||
SpacesBeforeTrailingComments: 1
|
||||
SpacesInAngles: false
|
||||
SpacesInContainerLiterals: false
|
||||
SpacesInCStyleCastParentheses: false
|
||||
SpacesInParentheses: false
|
||||
SpacesInSquareBrackets: false
|
||||
Standard: Cpp03
|
||||
TabWidth: 8
|
||||
UseTab: Always
|
||||
...
|
93
.clang-tidy
93
.clang-tidy
|
@ -1,93 +0,0 @@
|
|||
---
|
||||
Checks:
|
||||
- "clang-diagnostic-*,clang-analyzer-*,*,-modernize-*"
|
||||
|
||||
# TODO: enable once https://bugs.llvm.org/show_bug.cgi?id=41311 is fixed
|
||||
- "-clang-analyzer-valist.Uninitialized"
|
||||
|
||||
# Dubious value, would kill readability
|
||||
- "-cppcoreguidelines-init-variables"
|
||||
|
||||
# Dubious value over the compiler's built-in warning. Would
|
||||
# increase verbosity.
|
||||
- "-bugprone-assignment-in-if-condition"
|
||||
|
||||
# Debatable whether these improve readability, right now it would look
|
||||
# like a mess
|
||||
- "-google-readability-braces-around-statements"
|
||||
- "-hicpp-braces-around-statements"
|
||||
- "-readability-braces-around-statements"
|
||||
|
||||
# TODO: in most cases they are justified, but probably not everywhere
|
||||
#
|
||||
- "-readability-magic-numbers"
|
||||
- "-cppcoreguidelines-avoid-magic-numbers"
|
||||
|
||||
# TODO: this is Linux-only for the moment, nice to fix eventually
|
||||
- "-llvmlibc-restrict-system-libc-headers"
|
||||
|
||||
# Those are needed for syscalls, epoll_wait flags, etc.
|
||||
- "-hicpp-signed-bitwise"
|
||||
|
||||
# Probably not doable to implement this without plain memcpy(), memset()
|
||||
- "-clang-analyzer-security.insecureAPI.DeprecatedOrUnsafeBufferHandling"
|
||||
|
||||
# TODO: not really important, but nice to fix eventually
|
||||
- "-llvm-include-order"
|
||||
|
||||
# Dubious value, would kill readability
|
||||
- "-readability-isolate-declaration"
|
||||
|
||||
# TODO: nice to fix eventually
|
||||
- "-bugprone-narrowing-conversions"
|
||||
- "-cppcoreguidelines-narrowing-conversions"
|
||||
|
||||
# TODO: check, fix, and more in general constify wherever possible
|
||||
- "-cppcoreguidelines-avoid-non-const-global-variables"
|
||||
|
||||
# TODO: check paths where it might make sense to improve performance
|
||||
- "-altera-unroll-loops"
|
||||
- "-altera-id-dependent-backward-branch"
|
||||
|
||||
# Not much can be done about them other than being careful
|
||||
- "-bugprone-easily-swappable-parameters"
|
||||
|
||||
# TODO: split reported functions
|
||||
- "-readability-function-cognitive-complexity"
|
||||
|
||||
# "Poor" alignment needed for structs reflecting message formats/headers
|
||||
- "-altera-struct-pack-align"
|
||||
|
||||
# TODO: check again if multithreading is implemented
|
||||
- "-concurrency-mt-unsafe"
|
||||
|
||||
# Complains about any identifier <3 characters, reasonable for
|
||||
# globals, pointlessly verbose for locals and parameters.
|
||||
- "-readability-identifier-length"
|
||||
|
||||
# Wants to include headers which *directly* provide the things
|
||||
# we use. That sounds nice, but means it will often want an OS
|
||||
# specific header instead of a mostly standard one, such as
|
||||
# <linux/limits.h> instead of <limits.h>.
|
||||
- "-misc-include-cleaner"
|
||||
|
||||
# Want to replace all #defines of integers with enums. Kind of
|
||||
# makes sense when those defines form an enum-like set, but
|
||||
# weird for cases like standalone constants, and causes other
|
||||
# awkwardness for a bunch of cases we use
|
||||
- "-cppcoreguidelines-macro-to-enum"
|
||||
|
||||
# It's been a couple of centuries since multiplication has been granted
|
||||
# precedence over addition in modern mathematical notation. Adding
|
||||
# parentheses to reinforce that certainly won't improve readability.
|
||||
- "-readability-math-missing-parentheses"
|
||||
WarningsAsErrors: "*"
|
||||
HeaderFileExtensions:
|
||||
- h
|
||||
ImplementationFileExtensions:
|
||||
- c
|
||||
HeaderFilterRegex: ""
|
||||
FormatStyle: none
|
||||
CheckOptions:
|
||||
bugprone-suspicious-string-compare.WarnOnImplicitComparison: "false"
|
||||
SystemHeaders: false
|
3
.clangd
3
.clangd
|
@ -1,3 +0,0 @@
|
|||
CompileFlags:
|
||||
# Don't try to interpret our headers as C++
|
||||
Add: [-xc, -Wall]
|
2
.gitignore
vendored
2
.gitignore
vendored
|
@ -3,10 +3,8 @@
|
|||
/passt.avx2
|
||||
/pasta
|
||||
/pasta.avx2
|
||||
/passt-repair
|
||||
/qrap
|
||||
/pasta.1
|
||||
/seccomp.h
|
||||
/seccomp_repair.h
|
||||
/c*.json
|
||||
README.plain.md
|
||||
|
|
206
Makefile
206
Makefile
|
@ -15,13 +15,23 @@ VERSION ?= $(shell git describe --tags HEAD 2>/dev/null || echo "unknown\ versio
|
|||
# the IPv6 socket API? (Linux does)
|
||||
DUAL_STACK_SOCKETS := 1
|
||||
|
||||
RLIMIT_STACK_VAL := $(shell /bin/sh -c 'ulimit -s')
|
||||
ifeq ($(RLIMIT_STACK_VAL),unlimited)
|
||||
RLIMIT_STACK_VAL := 1024
|
||||
endif
|
||||
|
||||
TARGET ?= $(shell $(CC) -dumpmachine)
|
||||
$(if $(TARGET),,$(error Failed to get target architecture))
|
||||
# Get 'uname -m'-like architecture description for target
|
||||
TARGET_ARCH := $(firstword $(subst -, ,$(TARGET)))
|
||||
TARGET_ARCH := $(patsubst [:upper:],[:lower:],$(TARGET_ARCH))
|
||||
TARGET_ARCH := $(patsubst arm%,arm,$(TARGET_ARCH))
|
||||
TARGET_ARCH := $(subst powerpc,ppc,$(TARGET_ARCH))
|
||||
TARGET_ARCH := $(shell echo $(TARGET) | cut -f1 -d- | tr [A-Z] [a-z])
|
||||
TARGET_ARCH := $(shell echo $(TARGET_ARCH) | sed 's/powerpc/ppc/')
|
||||
|
||||
AUDIT_ARCH := $(shell echo $(TARGET_ARCH) | tr [a-z] [A-Z] | sed 's/^ARM.*/ARM/')
|
||||
AUDIT_ARCH := $(shell echo $(AUDIT_ARCH) | sed 's/I[456]86/I386/')
|
||||
AUDIT_ARCH := $(shell echo $(AUDIT_ARCH) | sed 's/PPC64/PPC/')
|
||||
AUDIT_ARCH := $(shell echo $(AUDIT_ARCH) | sed 's/PPCLE/PPC64LE/')
|
||||
AUDIT_ARCH := $(shell echo $(AUDIT_ARCH) | sed 's/MIPS64EL/MIPSEL64/')
|
||||
AUDIT_ARCH := $(shell echo $(AUDIT_ARCH) | sed 's/HPPA/PARISC/')
|
||||
AUDIT_ARCH := $(shell echo $(AUDIT_ARCH) | sed 's/SH4/SH/')
|
||||
|
||||
# On some systems enabling optimization also enables source fortification,
|
||||
# automagically. Do not override it.
|
||||
|
@ -30,32 +40,48 @@ ifeq ($(shell $(CC) -O2 -dM -E - < /dev/null 2>&1 | grep ' _FORTIFY_SOURCE ' > /
|
|||
FORTIFY_FLAG := -D_FORTIFY_SOURCE=2
|
||||
endif
|
||||
|
||||
FLAGS := -Wall -Wextra -Wno-format-zero-length -Wformat-security
|
||||
FLAGS := -Wall -Wextra -Wno-format-zero-length
|
||||
FLAGS += -pedantic -std=c11 -D_XOPEN_SOURCE=700 -D_GNU_SOURCE
|
||||
FLAGS += $(FORTIFY_FLAG) -O2 -pie -fPIE
|
||||
FLAGS += -DPAGE_SIZE=$(shell getconf PAGE_SIZE)
|
||||
FLAGS += -DNETNS_RUN_DIR=\"/run/netns\"
|
||||
FLAGS += -DPASST_AUDIT_ARCH=AUDIT_ARCH_$(AUDIT_ARCH)
|
||||
FLAGS += -DRLIMIT_STACK_VAL=$(RLIMIT_STACK_VAL)
|
||||
FLAGS += -DARCH=\"$(TARGET_ARCH)\"
|
||||
FLAGS += -DVERSION=\"$(VERSION)\"
|
||||
FLAGS += -DDUAL_STACK_SOCKETS=$(DUAL_STACK_SOCKETS)
|
||||
|
||||
PASST_SRCS = arch.c arp.c checksum.c conf.c dhcp.c dhcpv6.c flow.c fwd.c \
|
||||
icmp.c igmp.c inany.c iov.c ip.c isolation.c lineread.c log.c mld.c \
|
||||
ndp.c netlink.c migrate.c packet.c passt.c pasta.c pcap.c pif.c \
|
||||
repair.c tap.c tcp.c tcp_buf.c tcp_splice.c tcp_vu.c udp.c udp_flow.c \
|
||||
udp_vu.c util.c vhost_user.c virtio.c vu_common.c
|
||||
ndp.c netlink.c packet.c passt.c pasta.c pcap.c pif.c tap.c tcp.c \
|
||||
tcp_buf.c tcp_splice.c udp.c udp_flow.c util.c
|
||||
QRAP_SRCS = qrap.c
|
||||
PASST_REPAIR_SRCS = passt-repair.c
|
||||
SRCS = $(PASST_SRCS) $(QRAP_SRCS) $(PASST_REPAIR_SRCS)
|
||||
SRCS = $(PASST_SRCS) $(QRAP_SRCS)
|
||||
|
||||
MANPAGES = passt.1 pasta.1 qrap.1 passt-repair.1
|
||||
MANPAGES = passt.1 pasta.1 qrap.1
|
||||
|
||||
PASST_HEADERS = arch.h arp.h checksum.h conf.h dhcp.h dhcpv6.h flow.h fwd.h \
|
||||
flow_table.h icmp.h icmp_flow.h inany.h iov.h ip.h isolation.h \
|
||||
lineread.h log.h migrate.h ndp.h netlink.h packet.h passt.h pasta.h \
|
||||
pcap.h pif.h repair.h siphash.h tap.h tcp.h tcp_buf.h tcp_conn.h \
|
||||
tcp_internal.h tcp_splice.h tcp_vu.h udp.h udp_flow.h udp_internal.h \
|
||||
udp_vu.h util.h vhost_user.h virtio.h vu_common.h
|
||||
lineread.h log.h ndp.h netlink.h packet.h passt.h pasta.h pcap.h pif.h \
|
||||
siphash.h tap.h tcp.h tcp_buf.h tcp_conn.h tcp_internal.h tcp_splice.h \
|
||||
udp.h udp_flow.h util.h
|
||||
HEADERS = $(PASST_HEADERS) seccomp.h
|
||||
|
||||
C := \#include <linux/tcp.h>\nstruct tcp_info x = { .tcpi_snd_wnd = 0 };
|
||||
ifeq ($(shell printf "$(C)" | $(CC) -S -xc - -o - >/dev/null 2>&1; echo $$?),0)
|
||||
FLAGS += -DHAS_SND_WND
|
||||
endif
|
||||
|
||||
C := \#include <linux/tcp.h>\nstruct tcp_info x = { .tcpi_bytes_acked = 0 };
|
||||
ifeq ($(shell printf "$(C)" | $(CC) -S -xc - -o - >/dev/null 2>&1; echo $$?),0)
|
||||
FLAGS += -DHAS_BYTES_ACKED
|
||||
endif
|
||||
|
||||
C := \#include <linux/tcp.h>\nstruct tcp_info x = { .tcpi_min_rtt = 0 };
|
||||
ifeq ($(shell printf "$(C)" | $(CC) -S -xc - -o - >/dev/null 2>&1; echo $$?),0)
|
||||
FLAGS += -DHAS_MIN_RTT
|
||||
endif
|
||||
|
||||
C := \#include <sys/random.h>\nint main(){int a=getrandom(0, 0, 0);}
|
||||
ifeq ($(shell printf "$(C)" | $(CC) -S -xc - -o - >/dev/null 2>&1; echo $$?),0)
|
||||
FLAGS += -DHAS_GETRANDOM
|
||||
|
@ -65,6 +91,11 @@ ifeq ($(shell :|$(CC) -fstack-protector-strong -S -xc - -o - >/dev/null 2>&1; ec
|
|||
FLAGS += -fstack-protector-strong
|
||||
endif
|
||||
|
||||
C := \#define _GNU_SOURCE\n\#include <fcntl.h>\nint x = FALLOC_FL_COLLAPSE_RANGE;
|
||||
ifeq ($(shell printf "$(C)" | $(CC) -S -xc - -o - >/dev/null 2>&1; echo $$?),0)
|
||||
EXTRA_SYSCALLS += fallocate
|
||||
endif
|
||||
|
||||
prefix ?= /usr/local
|
||||
exec_prefix ?= $(prefix)
|
||||
bindir ?= $(exec_prefix)/bin
|
||||
|
@ -74,9 +105,9 @@ mandir ?= $(datarootdir)/man
|
|||
man1dir ?= $(mandir)/man1
|
||||
|
||||
ifeq ($(TARGET_ARCH),x86_64)
|
||||
BIN := passt passt.avx2 pasta pasta.avx2 qrap passt-repair
|
||||
BIN := passt passt.avx2 pasta pasta.avx2 qrap
|
||||
else
|
||||
BIN := passt pasta qrap passt-repair
|
||||
BIN := passt pasta qrap
|
||||
endif
|
||||
|
||||
all: $(BIN) $(MANPAGES) docs
|
||||
|
@ -85,10 +116,7 @@ static: FLAGS += -static -DGLIBC_NO_STATIC_NSS
|
|||
static: clean all
|
||||
|
||||
seccomp.h: seccomp.sh $(PASST_SRCS) $(PASST_HEADERS)
|
||||
@ EXTRA_SYSCALLS="$(EXTRA_SYSCALLS)" ARCH="$(TARGET_ARCH)" CC="$(CC)" ./seccomp.sh seccomp.h $(PASST_SRCS) $(PASST_HEADERS)
|
||||
|
||||
seccomp_repair.h: seccomp.sh $(PASST_REPAIR_SRCS)
|
||||
@ ARCH="$(TARGET_ARCH)" CC="$(CC)" ./seccomp.sh seccomp_repair.h $(PASST_REPAIR_SRCS)
|
||||
@ EXTRA_SYSCALLS="$(EXTRA_SYSCALLS)" ARCH="$(TARGET_ARCH)" CC="$(CC)" ./seccomp.sh $(PASST_SRCS) $(PASST_HEADERS)
|
||||
|
||||
passt: $(PASST_SRCS) $(HEADERS)
|
||||
$(CC) $(FLAGS) $(CFLAGS) $(CPPFLAGS) $(PASST_SRCS) -o passt $(LDFLAGS)
|
||||
|
@ -104,21 +132,17 @@ pasta.avx2 pasta.1 pasta: pasta%: passt%
|
|||
ln -sf $< $@
|
||||
|
||||
qrap: $(QRAP_SRCS) passt.h
|
||||
$(CC) $(FLAGS) $(CFLAGS) $(CPPFLAGS) -DARCH=\"$(TARGET_ARCH)\" $(QRAP_SRCS) -o qrap $(LDFLAGS)
|
||||
|
||||
passt-repair: $(PASST_REPAIR_SRCS) seccomp_repair.h
|
||||
$(CC) $(FLAGS) $(CFLAGS) $(CPPFLAGS) $(PASST_REPAIR_SRCS) -o passt-repair $(LDFLAGS)
|
||||
$(CC) $(FLAGS) $(CFLAGS) $(CPPFLAGS) $(QRAP_SRCS) -o qrap $(LDFLAGS)
|
||||
|
||||
valgrind: EXTRA_SYSCALLS += rt_sigprocmask rt_sigtimedwait rt_sigaction \
|
||||
rt_sigreturn getpid gettid kill clock_gettime \
|
||||
mmap|mmap2 munmap open unlink gettimeofday futex \
|
||||
statx readlink
|
||||
rt_sigreturn getpid gettid kill clock_gettime mmap \
|
||||
mmap2 munmap open unlink gettimeofday futex
|
||||
valgrind: FLAGS += -g -DVALGRIND
|
||||
valgrind: all
|
||||
|
||||
.PHONY: clean
|
||||
clean:
|
||||
$(RM) $(BIN) *~ *.o seccomp.h seccomp_repair.h pasta.1 \
|
||||
$(RM) $(BIN) *~ *.o seccomp.h pasta.1 \
|
||||
passt.tar passt.tar.gz *.deb *.rpm \
|
||||
passt.pid README.plain.md
|
||||
|
||||
|
@ -172,11 +196,116 @@ docs: README.md
|
|||
done < README.md; \
|
||||
) > README.plain.md
|
||||
|
||||
clang-tidy: $(PASST_SRCS) $(HEADERS)
|
||||
clang-tidy $(PASST_SRCS) -- $(filter-out -pie,$(FLAGS) $(CFLAGS) $(CPPFLAGS)) \
|
||||
-DCLANG_TIDY_58992
|
||||
# Checkers currently disabled for clang-tidy:
|
||||
# - llvmlibc-restrict-system-libc-headers
|
||||
# TODO: this is Linux-only for the moment, nice to fix eventually
|
||||
#
|
||||
# - google-readability-braces-around-statements
|
||||
# - hicpp-braces-around-statements
|
||||
# - readability-braces-around-statements
|
||||
# Debatable whether that improves readability, right now it would look
|
||||
# like a mess
|
||||
#
|
||||
# - readability-magic-numbers
|
||||
# - cppcoreguidelines-avoid-magic-numbers
|
||||
# TODO: in most cases they are justified, but probably not everywhere
|
||||
#
|
||||
# - clang-analyzer-valist.Uninitialized
|
||||
# TODO: enable once https://bugs.llvm.org/show_bug.cgi?id=41311 is fixed
|
||||
#
|
||||
# - clang-analyzer-security.insecureAPI.DeprecatedOrUnsafeBufferHandling
|
||||
# Probably not doable to implement this without plain memcpy(), memset()
|
||||
#
|
||||
# - cppcoreguidelines-init-variables
|
||||
# Dubious value, would kill readability
|
||||
#
|
||||
# - hicpp-signed-bitwise
|
||||
# Those are needed for syscalls, epoll_wait flags, etc.
|
||||
#
|
||||
# - llvm-include-order
|
||||
# TODO: not really important, but nice to fix eventually
|
||||
#
|
||||
# - readability-isolate-declaration
|
||||
# Dubious value, would kill readability
|
||||
#
|
||||
# - bugprone-narrowing-conversions
|
||||
# - cppcoreguidelines-narrowing-conversions
|
||||
# TODO: nice to fix eventually
|
||||
#
|
||||
# - cppcoreguidelines-avoid-non-const-global-variables
|
||||
# TODO: check, fix, and more in general constify wherever possible
|
||||
#
|
||||
# - altera-unroll-loops
|
||||
# - altera-id-dependent-backward-branch
|
||||
# TODO: check paths where it might make sense to improve performance
|
||||
#
|
||||
# - bugprone-easily-swappable-parameters
|
||||
# Not much can be done about them other than being careful
|
||||
#
|
||||
# - readability-function-cognitive-complexity
|
||||
# TODO: split reported functions
|
||||
#
|
||||
# - altera-struct-pack-align
|
||||
# "Poor" alignment needed for structs reflecting message formats/headers
|
||||
#
|
||||
# - concurrency-mt-unsafe
|
||||
# TODO: check again if multithreading is implemented
|
||||
#
|
||||
# - readability-identifier-length
|
||||
# Complains about any identifier <3 characters, reasonable for
|
||||
# globals, pointlessly verbose for locals and parameters.
|
||||
#
|
||||
# - bugprone-assignment-in-if-condition
|
||||
# Dubious value over the compiler's built-in warning. Would
|
||||
# increase verbosity.
|
||||
#
|
||||
# - misc-include-cleaner
|
||||
# Wants to include headers which *directly* provide the things
|
||||
# we use. That sounds nice, but means it will often want an OS
|
||||
# specific header instead of a mostly standard one, such as
|
||||
# <linux/limits.h> instead of <limits.h>.
|
||||
#
|
||||
# - cppcoreguidelines-macro-to-enum
|
||||
# Want to replace all #defines of integers with enums. Kind of
|
||||
# makes sense when those defines form an enum-like set, but
|
||||
# weird for cases like standalone constants, and causes other
|
||||
# awkwardness for a bunch of cases we use
|
||||
|
||||
cppcheck: $(PASST_SRCS) $(HEADERS)
|
||||
clang-tidy: $(SRCS) $(HEADERS)
|
||||
clang-tidy -checks=*,-modernize-*,\
|
||||
-clang-analyzer-valist.Uninitialized,\
|
||||
-cppcoreguidelines-init-variables,\
|
||||
-bugprone-assignment-in-if-condition,\
|
||||
-google-readability-braces-around-statements,\
|
||||
-hicpp-braces-around-statements,\
|
||||
-readability-braces-around-statements,\
|
||||
-readability-magic-numbers,\
|
||||
-llvmlibc-restrict-system-libc-headers,\
|
||||
-hicpp-signed-bitwise,\
|
||||
-clang-analyzer-security.insecureAPI.DeprecatedOrUnsafeBufferHandling,\
|
||||
-llvm-include-order,\
|
||||
-cppcoreguidelines-avoid-magic-numbers,\
|
||||
-readability-isolate-declaration,\
|
||||
-bugprone-narrowing-conversions,\
|
||||
-cppcoreguidelines-narrowing-conversions,\
|
||||
-cppcoreguidelines-avoid-non-const-global-variables,\
|
||||
-altera-unroll-loops,-altera-id-dependent-backward-branch,\
|
||||
-bugprone-easily-swappable-parameters,\
|
||||
-readability-function-cognitive-complexity,\
|
||||
-altera-struct-pack-align,\
|
||||
-concurrency-mt-unsafe,\
|
||||
-readability-identifier-length,\
|
||||
-misc-include-cleaner,\
|
||||
-cppcoreguidelines-macro-to-enum \
|
||||
-config='{CheckOptions: [{key: bugprone-suspicious-string-compare.WarnOnImplicitComparison, value: "false"}]}' \
|
||||
--warnings-as-errors=* $(SRCS) -- $(filter-out -pie,$(FLAGS) $(CFLAGS) $(CPPFLAGS)) -DCLANG_TIDY_58992
|
||||
|
||||
SYSTEM_INCLUDES := /usr/include $(wildcard /usr/include/$(TARGET))
|
||||
ifeq ($(shell $(CC) -v 2>&1 | grep -c "gcc version"),1)
|
||||
VER := $(shell $(CC) -dumpversion)
|
||||
SYSTEM_INCLUDES += /usr/lib/gcc/$(TARGET)/$(VER)/include
|
||||
endif
|
||||
cppcheck: $(SRCS) $(HEADERS)
|
||||
if cppcheck --check-level=exhaustive /dev/null > /dev/null 2>&1; then \
|
||||
CPPCHECK_EXHAUSTIVE="--check-level=exhaustive"; \
|
||||
else \
|
||||
|
@ -185,8 +314,11 @@ cppcheck: $(PASST_SRCS) $(HEADERS)
|
|||
cppcheck --std=c11 --error-exitcode=1 --enable=all --force \
|
||||
--inconclusive --library=posix --quiet \
|
||||
$${CPPCHECK_EXHAUSTIVE} \
|
||||
$(SYSTEM_INCLUDES:%=-I%) \
|
||||
$(SYSTEM_INCLUDES:%=--config-exclude=%) \
|
||||
$(SYSTEM_INCLUDES:%=--suppress=*:%/*) \
|
||||
$(SYSTEM_INCLUDES:%=--suppress=unmatchedSuppression:%/*) \
|
||||
--inline-suppr \
|
||||
--suppress=missingIncludeSystem \
|
||||
--suppress=unusedStructMember \
|
||||
$(filter -D%,$(FLAGS) $(CFLAGS) $(CPPFLAGS)) -D CPPCHECK_6936 \
|
||||
$(PASST_SRCS) $(HEADERS)
|
||||
$(filter -D%,$(FLAGS) $(CFLAGS) $(CPPFLAGS)) \
|
||||
$(SRCS) $(HEADERS)
|
||||
|
|
|
@ -321,7 +321,7 @@ speeding up local connections, and usually requiring NAT. _pasta_:
|
|||
protocol
|
||||
* ✅ 4 to 50 times IPv4 TCP throughput of existing, conceptually similar
|
||||
solutions depending on MTU (UDP and IPv6 hard to compare)
|
||||
* ✅ [_vhost-user_ support](https://bugs.passt.top/show_bug.cgi?id=25) for
|
||||
* 🛠 [_vhost-user_ support](https://bugs.passt.top/show_bug.cgi?id=25) for
|
||||
maximum one copy on every data path and lower request-response latency
|
||||
* ⌚ [multithreading](https://bugs.passt.top/show_bug.cgi?id=13)
|
||||
* ⌚ [raw IP socket support](https://bugs.passt.top/show_bug.cgi?id=14) if
|
||||
|
|
8
arch.c
8
arch.c
|
@ -19,7 +19,6 @@
|
|||
#include <unistd.h>
|
||||
|
||||
#include "log.h"
|
||||
#include "util.h"
|
||||
|
||||
/**
|
||||
* arch_avx2_exec() - Switch to AVX2 build if supported
|
||||
|
@ -41,11 +40,8 @@ void arch_avx2_exec(char **argv)
|
|||
if (__builtin_cpu_supports("avx2")) {
|
||||
char new_path[PATH_MAX + sizeof(".avx2")];
|
||||
|
||||
if (snprintf_check(new_path, PATH_MAX + sizeof(".avx2"),
|
||||
"%s.avx2", exe))
|
||||
die_perror("Can't build AVX2 executable path");
|
||||
|
||||
execv(new_path, argv);
|
||||
snprintf(new_path, PATH_MAX + sizeof(".avx2"), "%s.avx2", exe);
|
||||
execve(new_path, argv, environ);
|
||||
warn_perror("Can't run AVX2 build, using non-AVX2 version");
|
||||
}
|
||||
}
|
||||
|
|
8
arp.c
8
arp.c
|
@ -59,12 +59,14 @@ int arp(const struct ctx *c, const struct pool *p)
|
|||
ah->ar_op != htons(ARPOP_REQUEST))
|
||||
return 1;
|
||||
|
||||
/* Discard announcements, but not 0.0.0.0 "probes" */
|
||||
if (memcmp(am->sip, &in4addr_any, sizeof(am->sip)) &&
|
||||
/* Discard announcements (but not 0.0.0.0 "probes"): we might have the
|
||||
* same IP address, hide that.
|
||||
*/
|
||||
if (memcmp(am->sip, (unsigned char[4]){ 0 }, sizeof(am->tip)) &&
|
||||
!memcmp(am->sip, am->tip, sizeof(am->sip)))
|
||||
return 1;
|
||||
|
||||
/* Don't resolve the guest's assigned address, either. */
|
||||
/* Don't resolve our own address, either. */
|
||||
if (!memcmp(am->tip, &c->ip4.addr, sizeof(am->tip)))
|
||||
return 1;
|
||||
|
||||
|
|
93
checksum.c
93
checksum.c
|
@ -59,7 +59,6 @@
|
|||
#include "util.h"
|
||||
#include "ip.h"
|
||||
#include "checksum.h"
|
||||
#include "iov.h"
|
||||
|
||||
/* Checksums are optional for UDP over IPv4, so we usually just set
|
||||
* them to 0. Change this to 1 to calculate real UDP over IPv4
|
||||
|
@ -85,7 +84,7 @@
|
|||
*/
|
||||
/* NOLINTNEXTLINE(clang-diagnostic-unknown-attributes) */
|
||||
__attribute__((optimize("-fno-strict-aliasing")))
|
||||
static uint32_t sum_16b(const void *buf, size_t len)
|
||||
uint32_t sum_16b(const void *buf, size_t len)
|
||||
{
|
||||
const uint16_t *p = buf;
|
||||
uint32_t sum = 0;
|
||||
|
@ -107,7 +106,7 @@ static uint32_t sum_16b(const void *buf, size_t len)
|
|||
*
|
||||
* Return: 16-bit folded sum
|
||||
*/
|
||||
static uint16_t csum_fold(uint32_t sum)
|
||||
uint16_t csum_fold(uint32_t sum)
|
||||
{
|
||||
while (sum >> 16)
|
||||
sum = (sum & 0xffff) + (sum >> 16);
|
||||
|
@ -145,7 +144,7 @@ uint16_t csum_ip4_header(uint16_t l3len, uint8_t protocol,
|
|||
* @proto: Protocol number
|
||||
* @saddr: Source address
|
||||
* @daddr: Destination address
|
||||
* Return: partial checksum of the IPv4 header
|
||||
* Returns: Partial checksum of the IPv4 header
|
||||
*/
|
||||
uint32_t proto_ipv4_header_psum(uint16_t l4len, uint8_t protocol,
|
||||
struct in_addr saddr, struct in_addr daddr)
|
||||
|
@ -161,42 +160,27 @@ uint32_t proto_ipv4_header_psum(uint16_t l4len, uint8_t protocol,
|
|||
return psum;
|
||||
}
|
||||
|
||||
/**
|
||||
* csum() - Compute TCP/IP-style checksum
|
||||
* @buf: Input buffer
|
||||
* @len: Input length
|
||||
* @init: Initial 32-bit checksum, 0 for no pre-computed checksum
|
||||
*
|
||||
* Return: 16-bit folded, complemented checksum
|
||||
*/
|
||||
/* NOLINTNEXTLINE(clang-diagnostic-unknown-attributes) */
|
||||
__attribute__((optimize("-fno-strict-aliasing"))) /* See sum_16b() */
|
||||
static uint16_t csum(const void *buf, size_t len, uint32_t init)
|
||||
{
|
||||
return (uint16_t)~csum_fold(csum_unfolded(buf, len, init));
|
||||
}
|
||||
|
||||
/**
|
||||
* csum_udp4() - Calculate and set checksum for a UDP over IPv4 packet
|
||||
* @udp4hr: UDP header, initialised apart from checksum
|
||||
* @saddr: IPv4 source address
|
||||
* @daddr: IPv4 destination address
|
||||
* @data: UDP payload (as IO vector tail)
|
||||
* @payload: UDP packet payload
|
||||
* @dlen: Length of @payload (not including UDP header)
|
||||
*/
|
||||
void csum_udp4(struct udphdr *udp4hr,
|
||||
struct in_addr saddr, struct in_addr daddr,
|
||||
struct iov_tail *data)
|
||||
const void *payload, size_t dlen)
|
||||
{
|
||||
/* UDP checksums are optional, so don't bother */
|
||||
udp4hr->check = 0;
|
||||
|
||||
if (UDP4_REAL_CHECKSUMS) {
|
||||
uint16_t l4len = iov_tail_size(data) + sizeof(struct udphdr);
|
||||
uint16_t l4len = dlen + sizeof(struct udphdr);
|
||||
uint32_t psum = proto_ipv4_header_psum(l4len, IPPROTO_UDP,
|
||||
saddr, daddr);
|
||||
|
||||
psum = csum_unfolded(udp4hr, sizeof(struct udphdr), psum);
|
||||
udp4hr->check = csum_iov_tail(data, psum);
|
||||
udp4hr->check = csum(payload, dlen, psum);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -225,7 +209,7 @@ void csum_icmp4(struct icmphdr *icmp4hr, const void *payload, size_t dlen)
|
|||
* @proto: Protocol number
|
||||
* @saddr: Source address
|
||||
* @daddr: Destination address
|
||||
* Return: partial checksum of the IPv6 header
|
||||
* Returns: Partial checksum of the IPv6 header
|
||||
*/
|
||||
uint32_t proto_ipv6_header_psum(uint16_t payload_len, uint8_t protocol,
|
||||
const struct in6_addr *saddr,
|
||||
|
@ -242,22 +226,19 @@ uint32_t proto_ipv6_header_psum(uint16_t payload_len, uint8_t protocol,
|
|||
/**
|
||||
* csum_udp6() - Calculate and set checksum for a UDP over IPv6 packet
|
||||
* @udp6hr: UDP header, initialised apart from checksum
|
||||
* @saddr: Source address
|
||||
* @daddr: Destination address
|
||||
* @data: UDP payload (as IO vector tail)
|
||||
* @payload: UDP packet payload
|
||||
* @dlen: Length of @payload (not including UDP header)
|
||||
*/
|
||||
void csum_udp6(struct udphdr *udp6hr,
|
||||
const struct in6_addr *saddr, const struct in6_addr *daddr,
|
||||
struct iov_tail *data)
|
||||
const void *payload, size_t dlen)
|
||||
{
|
||||
uint16_t l4len = iov_tail_size(data) + sizeof(struct udphdr);
|
||||
uint32_t psum = proto_ipv6_header_psum(l4len, IPPROTO_UDP,
|
||||
saddr, daddr);
|
||||
|
||||
uint32_t psum = proto_ipv6_header_psum(dlen + sizeof(struct udphdr),
|
||||
IPPROTO_UDP, saddr, daddr);
|
||||
udp6hr->check = 0;
|
||||
|
||||
psum = csum_unfolded(udp6hr, sizeof(struct udphdr), psum);
|
||||
udp6hr->check = csum_iov_tail(data, psum);
|
||||
udp6hr->check = csum(payload, dlen, psum);
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -452,7 +433,7 @@ less_than_128_bytes:
|
|||
}
|
||||
|
||||
/**
|
||||
* csum_unfolded() - Calculate the unfolded checksum of a data buffer.
|
||||
* csum_unfolded - Calculate the unfolded checksum of a data buffer.
|
||||
*
|
||||
* @buf: Input buffer
|
||||
* @len: Input length
|
||||
|
@ -467,8 +448,7 @@ uint32_t csum_unfolded(const void *buf, size_t len, uint32_t init)
|
|||
intptr_t align = ROUND_UP((intptr_t)buf, sizeof(__m256i));
|
||||
unsigned int pad = align - (intptr_t)buf;
|
||||
|
||||
/* Don't mix sum_16b() and csum_avx2() with odd padding lengths */
|
||||
if (pad & 1 || len < pad)
|
||||
if (len < pad)
|
||||
pad = len;
|
||||
|
||||
if (pad)
|
||||
|
@ -481,7 +461,7 @@ uint32_t csum_unfolded(const void *buf, size_t len, uint32_t init)
|
|||
}
|
||||
#else /* __AVX2__ */
|
||||
/**
|
||||
* csum_unfolded() - Calculate the unfolded checksum of a data buffer.
|
||||
* csum_unfolded - Calculate the unfolded checksum of a data buffer.
|
||||
*
|
||||
* @buf: Input buffer
|
||||
* @len: Input length
|
||||
|
@ -498,23 +478,36 @@ uint32_t csum_unfolded(const void *buf, size_t len, uint32_t init)
|
|||
#endif /* !__AVX2__ */
|
||||
|
||||
/**
|
||||
* csum_iov_tail() - Calculate unfolded checksum for the tail of an IO vector
|
||||
* @tail: IO vector tail to checksum
|
||||
* csum() - Compute TCP/IP-style checksum
|
||||
* @buf: Input buffer
|
||||
* @len: Input length
|
||||
* @init: Initial 32-bit checksum, 0 for no pre-computed checksum
|
||||
*
|
||||
* Return: 16-bit folded, complemented checksum
|
||||
*/
|
||||
/* NOLINTNEXTLINE(clang-diagnostic-unknown-attributes) */
|
||||
__attribute__((optimize("-fno-strict-aliasing"))) /* See sum_16b() */
|
||||
uint16_t csum(const void *buf, size_t len, uint32_t init)
|
||||
{
|
||||
return (uint16_t)~csum_fold(csum_unfolded(buf, len, init));
|
||||
}
|
||||
|
||||
/**
|
||||
* csum_iov() - Calculates the unfolded checksum over an array of IO vectors
|
||||
*
|
||||
* @iov Pointer to the array of IO vectors
|
||||
* @n Length of the array
|
||||
* @init Initial 32-bit checksum, 0 for no pre-computed checksum
|
||||
*
|
||||
* Return: 16-bit folded, complemented checksum
|
||||
*/
|
||||
uint16_t csum_iov_tail(struct iov_tail *tail, uint32_t init)
|
||||
/* cppcheck-suppress unusedFunction */
|
||||
uint16_t csum_iov(const struct iovec *iov, size_t n, uint32_t init)
|
||||
{
|
||||
if (iov_tail_prune(tail)) {
|
||||
size_t i;
|
||||
unsigned int i;
|
||||
|
||||
for (i = 0; i < n; i++)
|
||||
init = csum_unfolded(iov[i].iov_base, iov[i].iov_len, init);
|
||||
|
||||
init = csum_unfolded((char *)tail->iov[0].iov_base + tail->off,
|
||||
tail->iov[0].iov_len - tail->off, init);
|
||||
for (i = 1; i < tail->cnt; i++) {
|
||||
const struct iovec *iov = &tail->iov[i];
|
||||
init = csum_unfolded(iov->iov_base, iov->iov_len, init);
|
||||
}
|
||||
}
|
||||
return (uint16_t)~csum_fold(init);
|
||||
}
|
||||
|
|
10
checksum.h
10
checksum.h
|
@ -9,8 +9,9 @@
|
|||
struct udphdr;
|
||||
struct icmphdr;
|
||||
struct icmp6hdr;
|
||||
struct iov_tail;
|
||||
|
||||
uint32_t sum_16b(const void *buf, size_t len);
|
||||
uint16_t csum_fold(uint32_t sum);
|
||||
uint16_t csum_unaligned(const void *buf, size_t len, uint32_t init);
|
||||
uint16_t csum_ip4_header(uint16_t l3len, uint8_t protocol,
|
||||
struct in_addr saddr, struct in_addr daddr);
|
||||
|
@ -18,18 +19,19 @@ uint32_t proto_ipv4_header_psum(uint16_t l4len, uint8_t protocol,
|
|||
struct in_addr saddr, struct in_addr daddr);
|
||||
void csum_udp4(struct udphdr *udp4hr,
|
||||
struct in_addr saddr, struct in_addr daddr,
|
||||
struct iov_tail *data);
|
||||
const void *payload, size_t dlen);
|
||||
void csum_icmp4(struct icmphdr *icmp4hr, const void *payload, size_t dlen);
|
||||
uint32_t proto_ipv6_header_psum(uint16_t payload_len, uint8_t protocol,
|
||||
const struct in6_addr *saddr,
|
||||
const struct in6_addr *daddr);
|
||||
void csum_udp6(struct udphdr *udp6hr,
|
||||
const struct in6_addr *saddr, const struct in6_addr *daddr,
|
||||
struct iov_tail *data);
|
||||
const void *payload, size_t dlen);
|
||||
void csum_icmp6(struct icmp6hdr *icmp6hr,
|
||||
const struct in6_addr *saddr, const struct in6_addr *daddr,
|
||||
const void *payload, size_t dlen);
|
||||
uint32_t csum_unfolded(const void *buf, size_t len, uint32_t init);
|
||||
uint16_t csum_iov_tail(struct iov_tail *tail, uint32_t init);
|
||||
uint16_t csum(const void *buf, size_t len, uint32_t init);
|
||||
uint16_t csum_iov(const struct iovec *iov, size_t n, uint32_t init);
|
||||
|
||||
#endif /* CHECKSUM_H */
|
||||
|
|
1
conf.h
1
conf.h
|
@ -6,7 +6,6 @@
|
|||
#ifndef CONF_H
|
||||
#define CONF_H
|
||||
|
||||
enum passt_modes conf_mode(int argc, char *argv[]);
|
||||
void conf(struct ctx *c, int argc, char **argv);
|
||||
|
||||
#endif /* CONF_H */
|
||||
|
|
|
@ -27,25 +27,4 @@ profile passt /usr/bin/passt{,.avx2} {
|
|||
|
||||
owner @{HOME}/** w, # pcap(), pidfile_open(),
|
||||
# pidfile_write()
|
||||
|
||||
# Workaround: libvirt's profile comes with a passt subprofile which includes,
|
||||
# in turn, <abstractions/passt>, and adds libvirt-specific rules on top, to
|
||||
# allow passt (when started by libvirtd) to write socket and PID files in the
|
||||
# location requested by libvirtd itself, and to execute passt itself.
|
||||
#
|
||||
# However, when libvirt runs as unprivileged user, the mechanism based on
|
||||
# virt-aa-helper, designed to build per-VM profiles as guests are started,
|
||||
# doesn't work. The helper needs to create and load profiles on the fly, which
|
||||
# can't be done by unprivileged users, of course.
|
||||
#
|
||||
# As a result, libvirtd runs unconfined if guests are started by unprivileged
|
||||
# users, starting passt unconfined as well, which means that passt runs under
|
||||
# its own stand-alone profile (this one), which implies in turn that execve()
|
||||
# of /usr/bin/passt is not allowed, and socket and PID files can't be written.
|
||||
#
|
||||
# Duplicate libvirt-specific rules here as long as this is not solved in
|
||||
# libvirt's profile itself.
|
||||
/usr/bin/passt r,
|
||||
owner @{run}/user/[0-9]*/libvirt/qemu/run/passt/* rw,
|
||||
owner @{run}/libvirt/qemu/passt/* rw,
|
||||
}
|
||||
|
|
|
@ -1,29 +0,0 @@
|
|||
# SPDX-License-Identifier: GPL-2.0-or-later
|
||||
#
|
||||
# PASST - Plug A Simple Socket Transport
|
||||
# for qemu/UNIX domain socket mode
|
||||
#
|
||||
# PASTA - Pack A Subtle Tap Abstraction
|
||||
# for network namespace/tap device mode
|
||||
#
|
||||
# contrib/apparmor/usr.bin.passt-repair - AppArmor profile for passt-repair(1)
|
||||
#
|
||||
# Copyright (c) 2025 Red Hat GmbH
|
||||
# Author: Stefano Brivio <sbrivio@redhat.com>
|
||||
|
||||
abi <abi/3.0>,
|
||||
|
||||
#include <tunables/global>
|
||||
|
||||
profile passt-repair /usr/bin/passt-repair {
|
||||
#include <abstractions/base>
|
||||
/** rw, # passt's ".repair" socket might be anywhere
|
||||
unix (connect, receive, send) type=stream,
|
||||
|
||||
capability dac_override, # connect to passt's socket as root
|
||||
capability net_admin, # currently needed for TCP_REPAIR socket option
|
||||
capability net_raw, # what TCP_REPAIR should require instead
|
||||
|
||||
network unix stream, # connect and use UNIX domain socket
|
||||
network inet stream, # use TCP sockets
|
||||
}
|
|
@ -9,7 +9,6 @@
|
|||
|
||||
%global git_hash {{{ git_head }}}
|
||||
%global selinuxtype targeted
|
||||
%global selinux_policy_version 41.41
|
||||
|
||||
Name: passt
|
||||
Version: {{{ git_version }}}
|
||||
|
@ -34,22 +33,18 @@ for network namespaces: traffic is forwarded using a tap interface inside the
|
|||
namespace, without the need to create further interfaces on the host, hence not
|
||||
requiring any capabilities or privileges.
|
||||
|
||||
%package selinux
|
||||
BuildArch: noarch
|
||||
Summary: SELinux support for passt and pasta
|
||||
Requires: selinux-policy-%{selinuxtype}
|
||||
Requires: container-selinux
|
||||
Requires(post): selinux-policy-%{selinuxtype}
|
||||
Requires(post): container-selinux
|
||||
Requires(post): policycoreutils
|
||||
Requires(post): libselinux-utils
|
||||
Requires(preun): policycoreutils
|
||||
BuildRequires: selinux-policy-devel
|
||||
BuildRequires: pkgconfig(systemd)
|
||||
Recommends: selinux-policy-%{selinuxtype} >= %{selinux_policy_version}
|
||||
%package selinux
|
||||
BuildArch: noarch
|
||||
Summary: SELinux support for passt and pasta
|
||||
Requires: %{name} = %{version}-%{release}
|
||||
Requires: selinux-policy
|
||||
Requires(post): %{name}
|
||||
Requires(post): policycoreutils
|
||||
Requires(preun): %{name}
|
||||
Requires(preun): policycoreutils
|
||||
|
||||
%description selinux
|
||||
This package adds SELinux enforcement to passt(1), pasta(1), passt-repair(1).
|
||||
This package adds SELinux enforcement to passt(1) and pasta(1).
|
||||
|
||||
%prep
|
||||
%setup -q -n passt-%{git_hash}
|
||||
|
@ -87,33 +82,23 @@ make -f %{_datadir}/selinux/devel/Makefile
|
|||
install -p -m 644 -D passt.pp %{buildroot}%{_datadir}/selinux/packages/%{selinuxtype}/passt.pp
|
||||
install -p -m 644 -D passt.if %{buildroot}%{_datadir}/selinux/devel/include/distributed/passt.if
|
||||
install -p -m 644 -D pasta.pp %{buildroot}%{_datadir}/selinux/packages/%{selinuxtype}/pasta.pp
|
||||
install -p -m 644 -D passt-repair.pp %{buildroot}%{_datadir}/selinux/packages/%{selinuxtype}/passt-repair.pp
|
||||
popd
|
||||
|
||||
%pre selinux
|
||||
%selinux_relabel_pre -s %{selinuxtype}
|
||||
|
||||
%post selinux
|
||||
%selinux_modules_install -s %{selinuxtype} %{_datadir}/selinux/packages/%{selinuxtype}/passt.pp %{_datadir}/selinux/packages/%{selinuxtype}/pasta.pp %{_datadir}/selinux/packages/%{selinuxtype}/passt-repair.pp
|
||||
%selinux_modules_install -s %{selinuxtype} %{_datadir}/selinux/packages/%{selinuxtype}/passt.pp
|
||||
%selinux_modules_install -s %{selinuxtype} %{_datadir}/selinux/packages/%{selinuxtype}/pasta.pp
|
||||
|
||||
%postun selinux
|
||||
if [ $1 -eq 0 ]; then
|
||||
%selinux_modules_uninstall -s %{selinuxtype} passt pasta passt-repair
|
||||
%selinux_modules_uninstall -s %{selinuxtype} passt
|
||||
%selinux_modules_uninstall -s %{selinuxtype} pasta
|
||||
fi
|
||||
|
||||
%posttrans selinux
|
||||
%selinux_relabel_post -s %{selinuxtype}
|
||||
# %selinux_relabel_post calls fixfiles(8) with the previous file_contexts file
|
||||
# (see selabel_file(5)) in order to restore only the file contexts which
|
||||
# actually changed. However, as file_contexts doesn't support %{USERID}
|
||||
# substitutions, this will not work for specific file contexts that pasta needs
|
||||
# to have under /run/user.
|
||||
#
|
||||
# Restore those explicitly, hiding errors from restorecon(8): we can't pass a
|
||||
# path that's more specific than this, but at the same time /run/user often
|
||||
# contains FUSE mountpoints that can't be accessed as root, leading to
|
||||
# "Permission denied" messages, but not failures.
|
||||
restorecon -R /run/user 2>/dev/null
|
||||
|
||||
%files
|
||||
%license LICENSES/{GPL-2.0-or-later.txt,BSD-3-Clause.txt}
|
||||
|
@ -123,11 +108,9 @@ restorecon -R /run/user 2>/dev/null
|
|||
%{_bindir}/passt
|
||||
%{_bindir}/pasta
|
||||
%{_bindir}/qrap
|
||||
%{_bindir}/passt-repair
|
||||
%{_mandir}/man1/passt.1*
|
||||
%{_mandir}/man1/pasta.1*
|
||||
%{_mandir}/man1/qrap.1*
|
||||
%{_mandir}/man1/passt-repair.1*
|
||||
%ifarch x86_64
|
||||
%{_bindir}/passt.avx2
|
||||
%{_mandir}/man1/passt.avx2.1*
|
||||
|
@ -139,7 +122,6 @@ restorecon -R /run/user 2>/dev/null
|
|||
%{_datadir}/selinux/packages/%{selinuxtype}/passt.pp
|
||||
%{_datadir}/selinux/devel/include/distributed/passt.if
|
||||
%{_datadir}/selinux/packages/%{selinuxtype}/pasta.pp
|
||||
%{_datadir}/selinux/packages/%{selinuxtype}/passt-repair.pp
|
||||
|
||||
%changelog
|
||||
{{{ passt_git_changelog }}}
|
||||
|
|
|
@ -1,11 +0,0 @@
|
|||
# SPDX-License-Identifier: GPL-2.0-or-later
|
||||
#
|
||||
# PASST - Plug A Simple Socket Transport
|
||||
# for qemu/UNIX domain socket mode
|
||||
#
|
||||
# contrib/selinux/passt-repair.fc - SELinux: File Context for passt-repair
|
||||
#
|
||||
# Copyright (c) 2025 Red Hat GmbH
|
||||
# Author: Stefano Brivio <sbrivio@redhat.com>
|
||||
|
||||
/usr/bin/passt-repair system_u:object_r:passt_repair_exec_t:s0
|
|
@ -1,87 +0,0 @@
|
|||
# SPDX-License-Identifier: GPL-2.0-or-later
|
||||
#
|
||||
# PASST - Plug A Simple Socket Transport
|
||||
# for qemu/UNIX domain socket mode
|
||||
#
|
||||
# contrib/selinux/passt-repair.te - SELinux: Type Enforcement for passt-repair
|
||||
#
|
||||
# Copyright (c) 2025 Red Hat GmbH
|
||||
# Author: Stefano Brivio <sbrivio@redhat.com>
|
||||
|
||||
policy_module(passt-repair, 0.1)
|
||||
|
||||
require {
|
||||
type unconfined_t;
|
||||
type passt_t;
|
||||
role unconfined_r;
|
||||
class process transition;
|
||||
|
||||
class file { read execute execute_no_trans entrypoint open map };
|
||||
class capability { dac_override net_admin net_raw };
|
||||
class chr_file { append open getattr read write ioctl };
|
||||
|
||||
class unix_stream_socket { create connect sendto };
|
||||
class sock_file { read write };
|
||||
|
||||
class tcp_socket { read setopt write };
|
||||
|
||||
type console_device_t;
|
||||
type user_devpts_t;
|
||||
type user_tmp_t;
|
||||
|
||||
# Workaround: passt-repair needs to access socket files
|
||||
# that passt, started by libvirt, might create under different
|
||||
# labels, depending on whether passt is started as root or not.
|
||||
#
|
||||
# However, libvirt doesn't maintain its own policy, which makes
|
||||
# updates particularly complicated. To avoid breakage in the short
|
||||
# term, deal with that in passt's own policy.
|
||||
type qemu_var_run_t;
|
||||
type virt_var_run_t;
|
||||
}
|
||||
|
||||
type passt_repair_t;
|
||||
domain_type(passt_repair_t);
|
||||
type passt_repair_exec_t;
|
||||
corecmd_executable_file(passt_repair_exec_t);
|
||||
|
||||
role unconfined_r types passt_repair_t;
|
||||
|
||||
allow passt_repair_t passt_repair_exec_t:file { read execute execute_no_trans entrypoint open map };
|
||||
type_transition unconfined_t passt_repair_exec_t:process passt_repair_t;
|
||||
allow unconfined_t passt_repair_t:process transition;
|
||||
|
||||
allow passt_repair_t self:capability { dac_override dac_read_search net_admin net_raw };
|
||||
allow passt_repair_t self:capability2 bpf;
|
||||
|
||||
allow passt_repair_t console_device_t:chr_file { append open getattr read write ioctl };
|
||||
allow passt_repair_t user_devpts_t:chr_file { append open getattr read write ioctl };
|
||||
|
||||
allow passt_repair_t unconfined_t:unix_stream_socket { connectto read write };
|
||||
allow passt_repair_t passt_t:unix_stream_socket { connectto read write };
|
||||
allow passt_repair_t user_tmp_t:unix_stream_socket { connectto read write };
|
||||
|
||||
allow passt_repair_t user_tmp_t:dir { getattr read search watch };
|
||||
|
||||
allow passt_repair_t unconfined_t:sock_file { getattr read write };
|
||||
allow passt_repair_t passt_t:sock_file { getattr read write };
|
||||
allow passt_repair_t user_tmp_t:sock_file { getattr read write };
|
||||
|
||||
allow passt_repair_t unconfined_t:tcp_socket { read setopt write };
|
||||
allow passt_repair_t passt_t:tcp_socket { read setopt write };
|
||||
|
||||
# Workaround: passt-repair needs to access socket files
|
||||
# that passt, started by libvirt, might create under different
|
||||
# labels, depending on whether passt is started as root or not.
|
||||
#
|
||||
# However, libvirt doesn't maintain its own policy, which makes
|
||||
# updates particularly complicated. To avoid breakage in the short
|
||||
# term, deal with that in passt's own policy.
|
||||
allow passt_repair_t qemu_var_run_t:unix_stream_socket { connectto read write };
|
||||
allow passt_repair_t virt_var_run_t:unix_stream_socket { connectto read write };
|
||||
|
||||
allow passt_repair_t qemu_var_run_t:dir { getattr read search watch };
|
||||
allow passt_repair_t virt_var_run_t:dir { getattr read search watch };
|
||||
|
||||
allow passt_repair_t qemu_var_run_t:sock_file { getattr read write };
|
||||
allow passt_repair_t virt_var_run_t:sock_file { getattr read write };
|
|
@ -20,19 +20,9 @@ require {
|
|||
type fs_t;
|
||||
type tmp_t;
|
||||
type user_tmp_t;
|
||||
type user_home_t;
|
||||
type tmpfs_t;
|
||||
type root_t;
|
||||
|
||||
# Workaround: passt --vhost-user needs to map guest memory, but
|
||||
# libvirt doesn't maintain its own policy, which makes updates
|
||||
# particularly complicated. To avoid breakage in the short term,
|
||||
# deal with it in passt's own policy.
|
||||
type svirt_image_t;
|
||||
type svirt_tmpfs_t;
|
||||
type svirt_t;
|
||||
type null_device_t;
|
||||
|
||||
class file { ioctl getattr setattr create read write unlink open relabelto execute execute_no_trans map };
|
||||
class dir { search write add_name remove_name mounton };
|
||||
class chr_file { append read write open getattr ioctl };
|
||||
|
@ -48,8 +38,8 @@ require {
|
|||
type net_conf_t;
|
||||
type proc_net_t;
|
||||
type node_t;
|
||||
class tcp_socket { create accept listen name_bind name_connect getattr ioctl };
|
||||
class udp_socket { create accept listen getattr };
|
||||
class tcp_socket { create accept listen name_bind name_connect };
|
||||
class udp_socket { create accept listen };
|
||||
class icmp_socket { bind create name_bind node_bind setopt read write };
|
||||
class sock_file { create unlink write };
|
||||
|
||||
|
@ -57,6 +47,8 @@ require {
|
|||
type port_t;
|
||||
type http_port_t;
|
||||
|
||||
type passwd_file_t;
|
||||
|
||||
class netlink_route_socket { bind create nlmsg_read };
|
||||
type sysctl_net_t;
|
||||
|
||||
|
@ -90,9 +82,6 @@ allow passt_t root_t:dir mounton;
|
|||
allow passt_t tmp_t:dir { add_name mounton remove_name write };
|
||||
allow passt_t tmpfs_t:filesystem mount;
|
||||
allow passt_t fs_t:filesystem unmount;
|
||||
allow passt_t user_home_t:dir search;
|
||||
allow passt_t user_tmp_t:fifo_file append;
|
||||
allow passt_t user_tmp_t:file map;
|
||||
|
||||
manage_files_pattern(passt_t, user_tmp_t, user_tmp_t)
|
||||
files_pid_filetrans(passt_t, user_tmp_t, file)
|
||||
|
@ -107,7 +96,8 @@ allow passt_t self:capability { sys_tty_config setpcap net_bind_service setuid s
|
|||
allow passt_t self:cap_userns { setpcap sys_admin sys_ptrace };
|
||||
allow passt_t self:user_namespace create;
|
||||
|
||||
auth_read_passwd(passt_t)
|
||||
allow passt_t passwd_file_t:file read_file_perms;
|
||||
sssd_search_lib(passt_t)
|
||||
|
||||
allow passt_t proc_net_t:file read;
|
||||
allow passt_t net_conf_t:file { open read };
|
||||
|
@ -132,19 +122,11 @@ corenet_udp_sendrecv_all_ports(passt_t)
|
|||
allow passt_t node_t:icmp_socket { name_bind node_bind };
|
||||
allow passt_t port_t:icmp_socket name_bind;
|
||||
|
||||
allow passt_t self:tcp_socket { create getopt setopt connect bind listen accept shutdown read write getattr ioctl };
|
||||
allow passt_t self:udp_socket { create getopt setopt connect bind read write getattr };
|
||||
allow passt_t self:tcp_socket { create getopt setopt connect bind listen accept shutdown read write };
|
||||
allow passt_t self:udp_socket { create getopt setopt connect bind read write };
|
||||
allow passt_t self:icmp_socket { bind create setopt read write };
|
||||
|
||||
allow passt_t user_tmp_t:dir { add_name write };
|
||||
allow passt_t user_tmp_t:file { create open };
|
||||
allow passt_t user_tmp_t:sock_file { create read write unlink };
|
||||
allow passt_t unconfined_t:unix_stream_socket { read write };
|
||||
|
||||
# Workaround: passt --vhost-user needs to map guest memory, but
|
||||
# libvirt doesn't maintain its own policy, which makes updates
|
||||
# particularly complicated. To avoid breakage in the short term,
|
||||
# deal with it in passt's own policy.
|
||||
allow passt_t svirt_image_t:file { read write map };
|
||||
allow passt_t svirt_tmpfs_t:file { read write map };
|
||||
allow passt_t null_device_t:chr_file map;
|
||||
|
|
|
@ -8,9 +8,7 @@
|
|||
# Copyright (c) 2022 Red Hat GmbH
|
||||
# Author: Stefano Brivio <sbrivio@redhat.com>
|
||||
|
||||
/usr/bin/pasta system_u:object_r:pasta_exec_t:s0
|
||||
/usr/bin/pasta.avx2 system_u:object_r:pasta_exec_t:s0
|
||||
/tmp/pasta\.pcap system_u:object_r:pasta_log_t:s0
|
||||
/var/run/pasta\.pid system_u:object_r:pasta_pid_t:s0
|
||||
/run/user/%{USERID}/netns system_u:object_r:ifconfig_var_run_t:s0
|
||||
/run/user/%{USERID}/containers/networks/rootless-netns system_u:object_r:ifconfig_var_run_t:s0
|
||||
/usr/bin/pasta system_u:object_r:pasta_exec_t:s0
|
||||
/usr/bin/pasta.avx2 system_u:object_r:pasta_exec_t:s0
|
||||
/tmp/pasta\.pcap system_u:object_r:pasta_log_t:s0
|
||||
/var/run/pasta\.pid system_u:object_r:pasta_pid_t:s0
|
||||
|
|
|
@ -18,7 +18,6 @@ require {
|
|||
type bin_t;
|
||||
type user_home_t;
|
||||
type user_home_dir_t;
|
||||
type user_tmp_t;
|
||||
type fs_t;
|
||||
type tmp_t;
|
||||
type tmpfs_t;
|
||||
|
@ -57,10 +56,8 @@ require {
|
|||
attribute port_type;
|
||||
type port_t;
|
||||
type http_port_t;
|
||||
type http_cache_port_t;
|
||||
type ssh_port_t;
|
||||
type reserved_port_t;
|
||||
type unreserved_port_t;
|
||||
type dns_port_t;
|
||||
type dhcpc_port_t;
|
||||
type chronyd_port_t;
|
||||
|
@ -71,6 +68,9 @@ require {
|
|||
type system_dbusd_t;
|
||||
type systemd_hostnamed_t;
|
||||
type systemd_systemctl_exec_t;
|
||||
type passwd_file_t;
|
||||
type sssd_public_t;
|
||||
type sssd_var_lib_t;
|
||||
class dbus send_msg;
|
||||
class system module_request;
|
||||
class system status;
|
||||
|
@ -89,15 +89,6 @@ require {
|
|||
class capability { sys_tty_config setuid setgid };
|
||||
class cap_userns { setpcap sys_admin sys_ptrace net_bind_service net_admin };
|
||||
class user_namespace create;
|
||||
|
||||
# Container requires
|
||||
attribute_role usernetctl_roles;
|
||||
role container_user_r;
|
||||
role staff_r;
|
||||
role user_r;
|
||||
type container_runtime_t;
|
||||
type container_t;
|
||||
type systemd_user_runtimedir_t;
|
||||
}
|
||||
|
||||
type pasta_t;
|
||||
|
@ -122,12 +113,10 @@ init_daemon_domain(pasta_t, pasta_exec_t)
|
|||
|
||||
allow pasta_t self:capability { setpcap net_bind_service sys_tty_config dac_read_search net_admin sys_resource setuid setgid };
|
||||
allow pasta_t self:cap_userns { setpcap sys_admin sys_ptrace net_admin net_bind_service };
|
||||
# pasta only calls setuid and setgid with the current UID and GID, so this
|
||||
# denial is harmless. See https://bugzilla.redhat.com/show_bug.cgi?id=2330512#c10
|
||||
dontaudit pasta_t self:cap_userns { setgid setuid };
|
||||
allow pasta_t self:user_namespace create;
|
||||
|
||||
auth_read_passwd(pasta_t)
|
||||
allow pasta_t passwd_file_t:file read_file_perms;
|
||||
sssd_search_lib(pasta_t)
|
||||
|
||||
domain_auto_trans(pasta_t, bin_t, unconfined_t);
|
||||
domain_auto_trans(pasta_t, shell_exec_t, unconfined_t);
|
||||
|
@ -137,22 +126,17 @@ domain_auto_trans(pasta_t, ping_exec_t, ping_t);
|
|||
|
||||
allow pasta_t nsfs_t:file { open read };
|
||||
|
||||
allow pasta_t user_home_t:dir { getattr search };
|
||||
allow pasta_t user_home_t:file { open read getattr setattr execute execute_no_trans map};
|
||||
allow pasta_t user_home_t:dir getattr;
|
||||
allow pasta_t user_home_t:file { open read getattr setattr };
|
||||
allow pasta_t user_home_dir_t:dir { search getattr open add_name read write };
|
||||
allow pasta_t user_home_dir_t:file { create open read write };
|
||||
allow pasta_t tmp_t:dir { add_name mounton remove_name write };
|
||||
allow pasta_t tmpfs_t:filesystem { getattr mount };
|
||||
allow pasta_t tmpfs_t:filesystem mount;
|
||||
allow pasta_t fs_t:filesystem unmount;
|
||||
allow pasta_t root_t:dir mounton;
|
||||
manage_files_pattern(pasta_t, pasta_pid_t, pasta_pid_t)
|
||||
files_pid_filetrans(pasta_t, pasta_pid_t, file)
|
||||
|
||||
allow pasta_t user_tmp_t:dir { add_name remove_name search write };
|
||||
allow pasta_t user_tmp_t:fifo_file append;
|
||||
allow pasta_t user_tmp_t:file { create open write };
|
||||
allow pasta_t user_tmp_t:sock_file { create unlink };
|
||||
|
||||
allow pasta_t console_device_t:chr_file { open write getattr ioctl };
|
||||
allow pasta_t user_devpts_t:chr_file { getattr read write ioctl };
|
||||
logging_send_syslog_msg(pasta_t)
|
||||
|
@ -168,11 +152,6 @@ allow pasta_t tmp_t:sock_file { create unlink write };
|
|||
allow pasta_t self:tcp_socket create_stream_socket_perms;
|
||||
corenet_tcp_sendrecv_generic_node(pasta_t)
|
||||
corenet_tcp_bind_generic_node(pasta_t)
|
||||
allow pasta_t container_runtime_t:dir { open read search };
|
||||
allow pasta_t container_runtime_t:fifo_file { getattr write };
|
||||
allow pasta_t container_runtime_t:file read;
|
||||
allow pasta_t container_runtime_t:lnk_file read;
|
||||
allow pasta_t container_t:lnk_file read;
|
||||
allow pasta_t pasta_port_t:tcp_socket { name_bind name_connect };
|
||||
allow pasta_t pasta_port_t:udp_socket { name_bind };
|
||||
allow pasta_t http_port_t:tcp_socket { name_bind name_connect };
|
||||
|
@ -185,8 +164,6 @@ allow pasta_t self:udp_socket create_stream_socket_perms;
|
|||
allow pasta_t reserved_port_t:udp_socket name_bind;
|
||||
allow pasta_t llmnr_port_t:tcp_socket name_bind;
|
||||
allow pasta_t llmnr_port_t:udp_socket name_bind;
|
||||
allow pasta_t http_cache_port_t:tcp_socket { name_bind name_connect };
|
||||
allow pasta_t unreserved_port_t:udp_socket name_bind;
|
||||
corenet_udp_sendrecv_generic_node(pasta_t)
|
||||
corenet_udp_bind_generic_node(pasta_t)
|
||||
allow pasta_t node_t:icmp_socket { name_bind node_bind };
|
||||
|
@ -198,12 +175,15 @@ allow pasta_t init_t:lnk_file read;
|
|||
allow pasta_t init_t:unix_stream_socket connectto;
|
||||
allow pasta_t init_t:dbus send_msg;
|
||||
allow pasta_t init_t:system status;
|
||||
allow pasta_t unconfined_t:dir { read search };
|
||||
allow pasta_t unconfined_t:dir search;
|
||||
allow pasta_t unconfined_t:file read;
|
||||
allow pasta_t unconfined_t:lnk_file read;
|
||||
allow pasta_t passwd_file_t:file { getattr open read };
|
||||
allow pasta_t self:process { setpgid setcap };
|
||||
allow pasta_t shell_exec_t:file { execute execute_no_trans map };
|
||||
|
||||
allow pasta_t sssd_var_lib_t:dir search;
|
||||
allow pasta_t sssd_public_t:dir search;
|
||||
allow pasta_t hostname_exec_t:file { execute execute_no_trans getattr open read map };
|
||||
allow pasta_t system_dbusd_t:unix_stream_socket connectto;
|
||||
allow pasta_t system_dbusd_t:dbus send_msg;
|
||||
|
@ -219,6 +199,8 @@ allow pasta_t sysctl_net_t:dir search;
|
|||
allow pasta_t sysctl_net_t:file { open read write };
|
||||
allow pasta_t kernel_t:system module_request;
|
||||
|
||||
allow pasta_t nsfs_t:file read;
|
||||
|
||||
allow pasta_t proc_t:dir mounton;
|
||||
allow pasta_t proc_t:filesystem mount;
|
||||
allow pasta_t net_conf_t:lnk_file read;
|
||||
|
@ -230,28 +212,3 @@ allow pasta_t netutils_t:process { noatsecure rlimitinh siginh };
|
|||
allow pasta_t ping_t:process { noatsecure rlimitinh siginh };
|
||||
allow pasta_t user_tty_device_t:chr_file { append read write };
|
||||
allow pasta_t user_devpts_t:chr_file { append read write };
|
||||
|
||||
# Allow network administration commands for non-privileged users
|
||||
roleattribute container_user_r usernetctl_roles;
|
||||
roleattribute staff_r usernetctl_roles;
|
||||
roleattribute user_r usernetctl_roles;
|
||||
role usernetctl_roles types pasta_t;
|
||||
|
||||
# Make pasta in a container run under the pasta_t context
|
||||
type_transition container_runtime_t pasta_exec_t : process pasta_t;
|
||||
allow container_runtime_t pasta_t:process transition;
|
||||
|
||||
# Label the user network namespace files
|
||||
type_transition container_runtime_t user_tmp_t : dir ifconfig_var_run_t "netns";
|
||||
type_transition container_runtime_t user_tmp_t : dir ifconfig_var_run_t "rootless-netns";
|
||||
allow pasta_t ifconfig_var_run_t:dir { add_name open rmdir write };
|
||||
allow pasta_t ifconfig_var_run_t:file { create open write };
|
||||
allow systemd_user_runtimedir_t ifconfig_var_run_t:dir rmdir;
|
||||
|
||||
# Allow pasta to bind to any port
|
||||
bool pasta_bind_all_ports true;
|
||||
if (pasta_bind_all_ports) {
|
||||
allow pasta_t port_type:icmp_socket { accept getopt name_bind };
|
||||
allow pasta_t port_type:tcp_socket { accept getopt name_bind name_connect };
|
||||
allow pasta_t port_type:udp_socket { accept getopt name_bind };
|
||||
}
|
||||
|
|
142
dhcp.c
142
dhcp.c
|
@ -36,9 +36,9 @@
|
|||
/**
|
||||
* struct opt - DHCP option
|
||||
* @sent: Convenience flag, set while filling replies
|
||||
* @slen: Length of option defined for server, -1 if not going to be sent
|
||||
* @slen: Length of option defined for server
|
||||
* @s: Option payload from server
|
||||
* @clen: Length of option received from client, -1 if not received
|
||||
* @clen: Length of option received from client
|
||||
* @c: Option payload from client
|
||||
*/
|
||||
struct opt {
|
||||
|
@ -63,21 +63,11 @@ static struct opt opts[255];
|
|||
|
||||
#define OPT_MIN 60 /* RFC 951 */
|
||||
|
||||
/* Total option size (excluding end option) is 576 (RFC 2131), minus
|
||||
* offset of options (268), minus end option (1).
|
||||
*/
|
||||
#define OPT_MAX 307
|
||||
|
||||
/**
|
||||
* dhcp_init() - Initialise DHCP options
|
||||
*/
|
||||
void dhcp_init(void)
|
||||
{
|
||||
int i;
|
||||
|
||||
for (i = 0; i < ARRAY_SIZE(opts); i++)
|
||||
opts[i].slen = -1;
|
||||
|
||||
opts[1] = (struct opt) { 0, 4, { 0 }, 0, { 0 }, }; /* Mask */
|
||||
opts[3] = (struct opt) { 0, 4, { 0 }, 0, { 0 }, }; /* Router */
|
||||
opts[51] = (struct opt) { 0, 4, { 0xff,
|
||||
|
@ -117,8 +107,6 @@ struct msg {
|
|||
uint32_t xid;
|
||||
uint16_t secs;
|
||||
uint16_t flags;
|
||||
#define FLAG_BROADCAST htons_constant(0x8000)
|
||||
|
||||
uint32_t ciaddr;
|
||||
struct in_addr yiaddr;
|
||||
uint32_t siaddr;
|
||||
|
@ -127,7 +115,7 @@ struct msg {
|
|||
uint8_t sname[64];
|
||||
uint8_t file[128];
|
||||
uint32_t magic;
|
||||
uint8_t o[OPT_MAX + 1 /* End option */ ];
|
||||
uint8_t o[308];
|
||||
} __attribute__((__packed__));
|
||||
|
||||
/**
|
||||
|
@ -135,28 +123,15 @@ struct msg {
|
|||
* @m: Message to fill
|
||||
* @o: Option number
|
||||
* @offset: Current offset within options field, updated on insertion
|
||||
*
|
||||
* Return: false if m has space to write the option, true otherwise
|
||||
*/
|
||||
static bool fill_one(struct msg *m, int o, int *offset)
|
||||
static void fill_one(struct msg *m, int o, int *offset)
|
||||
{
|
||||
size_t slen = opts[o].slen;
|
||||
|
||||
/* If we don't have space to write the option, then just skip */
|
||||
if (*offset + 2 /* code and length of option */ + slen > OPT_MAX)
|
||||
return true;
|
||||
|
||||
m->o[*offset] = o;
|
||||
m->o[*offset + 1] = slen;
|
||||
|
||||
/* Move to option */
|
||||
*offset += 2;
|
||||
|
||||
memcpy(&m->o[*offset], opts[o].s, slen);
|
||||
m->o[*offset + 1] = opts[o].slen;
|
||||
memcpy(&m->o[*offset + 2], opts[o].s, opts[o].slen);
|
||||
|
||||
opts[o].sent = 1;
|
||||
*offset += slen;
|
||||
return false;
|
||||
*offset += 2 + opts[o].slen;
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -169,6 +144,9 @@ static int fill(struct msg *m)
|
|||
{
|
||||
int i, o, offset = 0;
|
||||
|
||||
m->op = BOOTREPLY;
|
||||
m->secs = 0;
|
||||
|
||||
for (o = 0; o < 255; o++)
|
||||
opts[o].sent = 0;
|
||||
|
||||
|
@ -176,24 +154,22 @@ static int fill(struct msg *m)
|
|||
* option 53 at the beginning of the list.
|
||||
* Put it there explicitly, unless requested via option 55.
|
||||
*/
|
||||
if (opts[55].clen > 0 && !memchr(opts[55].c, 53, opts[55].clen))
|
||||
if (fill_one(m, 53, &offset))
|
||||
debug("DHCP: skipping option 53");
|
||||
if (!memchr(opts[55].c, 53, opts[55].clen))
|
||||
fill_one(m, 53, &offset);
|
||||
|
||||
for (i = 0; i < opts[55].clen; i++) {
|
||||
o = opts[55].c[i];
|
||||
if (opts[o].slen != -1)
|
||||
if (fill_one(m, o, &offset))
|
||||
debug("DHCP: skipping option %i", o);
|
||||
if (opts[o].slen)
|
||||
fill_one(m, o, &offset);
|
||||
}
|
||||
|
||||
for (o = 0; o < 255; o++) {
|
||||
if (opts[o].slen != -1 && !opts[o].sent)
|
||||
if (fill_one(m, o, &offset))
|
||||
debug("DHCP: skipping option %i", o);
|
||||
if (opts[o].slen && !opts[o].sent)
|
||||
fill_one(m, o, &offset);
|
||||
}
|
||||
|
||||
m->o[offset++] = 255;
|
||||
m->o[offset++] = 0;
|
||||
|
||||
if (offset < OPT_MIN) {
|
||||
memset(&m->o[offset], 0, OPT_MIN - offset);
|
||||
|
@ -288,9 +264,6 @@ static void opt_set_dns_search(const struct ctx *c, size_t max_len)
|
|||
".\xc0");
|
||||
}
|
||||
}
|
||||
|
||||
if (!opts[119].slen)
|
||||
opts[119].slen = -1;
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -304,13 +277,12 @@ int dhcp(const struct ctx *c, const struct pool *p)
|
|||
{
|
||||
size_t mlen, dlen, offset = 0, opt_len, opt_off = 0;
|
||||
char macstr[ETH_ADDRSTRLEN];
|
||||
struct in_addr mask, dst;
|
||||
const struct ethhdr *eh;
|
||||
const struct iphdr *iph;
|
||||
const struct udphdr *uh;
|
||||
struct msg const *m;
|
||||
struct msg reply;
|
||||
struct in_addr mask;
|
||||
unsigned int i;
|
||||
struct msg *m;
|
||||
|
||||
eh = packet_get(p, 0, offset, sizeof(*eh), NULL);
|
||||
offset += sizeof(*eh);
|
||||
|
@ -339,27 +311,8 @@ int dhcp(const struct ctx *c, const struct pool *p)
|
|||
m->op != BOOTREQUEST)
|
||||
return -1;
|
||||
|
||||
reply.op = BOOTREPLY;
|
||||
reply.htype = m->htype;
|
||||
reply.hlen = m->hlen;
|
||||
reply.hops = 0;
|
||||
reply.xid = m->xid;
|
||||
reply.secs = 0;
|
||||
reply.flags = m->flags;
|
||||
reply.ciaddr = m->ciaddr;
|
||||
reply.yiaddr = c->ip4.addr;
|
||||
reply.siaddr = 0;
|
||||
reply.giaddr = m->giaddr;
|
||||
memcpy(&reply.chaddr, m->chaddr, sizeof(reply.chaddr));
|
||||
memset(&reply.sname, 0, sizeof(reply.sname));
|
||||
memset(&reply.file, 0, sizeof(reply.file));
|
||||
reply.magic = m->magic;
|
||||
|
||||
offset += offsetof(struct msg, o);
|
||||
|
||||
for (i = 0; i < ARRAY_SIZE(opts); i++)
|
||||
opts[i].clen = -1;
|
||||
|
||||
while (opt_off + 2 < opt_len) {
|
||||
const uint8_t *olen, *val;
|
||||
uint8_t *type;
|
||||
|
@ -378,19 +331,11 @@ int dhcp(const struct ctx *c, const struct pool *p)
|
|||
opt_off += *olen + 2;
|
||||
}
|
||||
|
||||
opts[80].slen = -1;
|
||||
if (opts[53].clen > 0 && opts[53].c[0] == DHCPDISCOVER) {
|
||||
if (opts[80].clen == -1) {
|
||||
info("DHCP: offer to discover");
|
||||
opts[53].s[0] = DHCPOFFER;
|
||||
} else {
|
||||
info("DHCP: ack to discover (Rapid Commit)");
|
||||
opts[53].s[0] = DHCPACK;
|
||||
opts[80].slen = 0;
|
||||
}
|
||||
} else if (opts[53].clen <= 0 || opts[53].c[0] == DHCPREQUEST) {
|
||||
info("%s: ack to request", /* DHCP needs a valid message type */
|
||||
(opts[53].clen <= 0) ? "BOOTP" : "DHCP");
|
||||
if (opts[53].c[0] == DHCPDISCOVER) {
|
||||
info("DHCP: offer to discover");
|
||||
opts[53].s[0] = DHCPOFFER;
|
||||
} else if (opts[53].c[0] == DHCPREQUEST || !opts[53].clen) {
|
||||
info("%s: ack to request", opts[53].clen ? "DHCP" : "BOOTP");
|
||||
opts[53].s[0] = DHCPACK;
|
||||
} else {
|
||||
return -1;
|
||||
|
@ -398,6 +343,7 @@ int dhcp(const struct ctx *c, const struct pool *p)
|
|||
|
||||
info(" from %s", eth_ntop(m->chaddr, macstr, sizeof(macstr)));
|
||||
|
||||
m->yiaddr = c->ip4.addr;
|
||||
mask.s_addr = htonl(0xffffffff << (32 - c->ip4.prefix_len));
|
||||
memcpy(opts[1].s, &mask, sizeof(mask));
|
||||
memcpy(opts[3].s, &c->ip4.guest_gw, sizeof(c->ip4.guest_gw));
|
||||
|
@ -417,7 +363,7 @@ int dhcp(const struct ctx *c, const struct pool *p)
|
|||
&c->ip4.guest_gw, sizeof(c->ip4.guest_gw));
|
||||
}
|
||||
|
||||
if (c->mtu) {
|
||||
if (c->mtu != -1) {
|
||||
opts[26].slen = 2;
|
||||
opts[26].s[0] = c->mtu / 256;
|
||||
opts[26].s[1] = c->mtu % 256;
|
||||
|
@ -428,44 +374,12 @@ int dhcp(const struct ctx *c, const struct pool *p)
|
|||
((struct in_addr *)opts[6].s)[i] = c->ip4.dns[i];
|
||||
opts[6].slen += sizeof(uint32_t);
|
||||
}
|
||||
if (!opts[6].slen)
|
||||
opts[6].slen = -1;
|
||||
|
||||
opt_len = strlen(c->hostname);
|
||||
if (opt_len > 0) {
|
||||
opts[12].slen = opt_len;
|
||||
memcpy(opts[12].s, &c->hostname, opt_len);
|
||||
}
|
||||
|
||||
opt_len = strlen(c->fqdn);
|
||||
if (opt_len > 0) {
|
||||
opt_len += 3 /* flags */
|
||||
+ 2; /* Length byte for first label, and terminator */
|
||||
|
||||
if (sizeof(opts[81].s) >= opt_len) {
|
||||
opts[81].s[0] = 0x4; /* flags (E) */
|
||||
opts[81].s[1] = 0xff; /* RCODE1 */
|
||||
opts[81].s[2] = 0xff; /* RCODE2 */
|
||||
|
||||
encode_domain_name((char *)opts[81].s + 3, c->fqdn);
|
||||
|
||||
opts[81].slen = opt_len;
|
||||
} else {
|
||||
debug("DHCP: client FQDN option doesn't fit, skipping");
|
||||
}
|
||||
}
|
||||
|
||||
if (!c->no_dhcp_dns_search)
|
||||
opt_set_dns_search(c, sizeof(m->o));
|
||||
|
||||
dlen = offsetof(struct msg, o) + fill(&reply);
|
||||
|
||||
if (m->flags & FLAG_BROADCAST)
|
||||
dst = in4addr_broadcast;
|
||||
else
|
||||
dst = c->ip4.addr;
|
||||
|
||||
tap_udp4_send(c, c->ip4.our_tap_addr, 67, dst, 68, &reply, dlen);
|
||||
dlen = offsetof(struct msg, o) + fill(m);
|
||||
tap_udp4_send(c, c->ip4.our_tap_addr, 67, c->ip4.addr, 68, m, dlen);
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
|
158
dhcpv6.c
158
dhcpv6.c
|
@ -48,7 +48,6 @@ struct opt_hdr {
|
|||
# define STATUS_NOTONLINK htons_constant(4)
|
||||
# define OPT_DNS_SERVERS htons_constant(23)
|
||||
# define OPT_DNS_SEARCH htons_constant(24)
|
||||
# define OPT_CLIENT_FQDN htons_constant(39)
|
||||
#define STR_NOTONLINK "Prefix not appropriate for link."
|
||||
|
||||
uint16_t l;
|
||||
|
@ -59,9 +58,6 @@ struct opt_hdr {
|
|||
sizeof(struct opt_hdr))
|
||||
#define OPT_VSIZE(x) (sizeof(struct opt_##x) - \
|
||||
sizeof(struct opt_hdr))
|
||||
#define OPT_MAX_SIZE IPV6_MIN_MTU - (sizeof(struct ipv6hdr) + \
|
||||
sizeof(struct udphdr) + \
|
||||
sizeof(struct msg_hdr))
|
||||
|
||||
/**
|
||||
* struct opt_client_id - DHCPv6 Client Identifier option
|
||||
|
@ -144,9 +140,7 @@ struct opt_ia_addr {
|
|||
struct opt_status_code {
|
||||
struct opt_hdr hdr;
|
||||
uint16_t code;
|
||||
/* "nonstring" is only supported since clang 23 */
|
||||
/* NOLINTNEXTLINE(clang-diagnostic-unknown-attributes) */
|
||||
__attribute__((nonstring)) char status_msg[sizeof(STR_NOTONLINK) - 1];
|
||||
char status_msg[sizeof(STR_NOTONLINK) - 1];
|
||||
} __attribute__((packed));
|
||||
|
||||
/**
|
||||
|
@ -169,18 +163,6 @@ struct opt_dns_search {
|
|||
char list[MAXDNSRCH * NS_MAXDNAME];
|
||||
} __attribute__((packed));
|
||||
|
||||
/**
|
||||
* struct opt_client_fqdn - Client FQDN option (RFC 4704)
|
||||
* @hdr: Option header
|
||||
* @flags: Flags described by RFC 4704
|
||||
* @domain_name: Client FQDN
|
||||
*/
|
||||
struct opt_client_fqdn {
|
||||
struct opt_hdr hdr;
|
||||
uint8_t flags;
|
||||
char domain_name[PASST_MAXDNAME];
|
||||
} __attribute__((packed));
|
||||
|
||||
/**
|
||||
* struct msg_hdr - DHCPv6 client/server message header
|
||||
* @type: DHCP message type
|
||||
|
@ -211,7 +193,6 @@ struct msg_hdr {
|
|||
* @client_id: Client Identifier, variable length
|
||||
* @dns_servers: DNS Recursive Name Server, here just for storage size
|
||||
* @dns_search: Domain Search List, here just for storage size
|
||||
* @client_fqdn: Client FQDN, variable length
|
||||
*/
|
||||
static struct resp_t {
|
||||
struct msg_hdr hdr;
|
||||
|
@ -222,7 +203,6 @@ static struct resp_t {
|
|||
struct opt_client_id client_id;
|
||||
struct opt_dns_servers dns_servers;
|
||||
struct opt_dns_search dns_search;
|
||||
struct opt_client_fqdn client_fqdn;
|
||||
} __attribute__((__packed__)) resp = {
|
||||
{ 0 },
|
||||
SERVER_ID,
|
||||
|
@ -248,10 +228,6 @@ static struct resp_t {
|
|||
{ { OPT_DNS_SEARCH, 0, },
|
||||
{ 0 },
|
||||
},
|
||||
|
||||
{ { OPT_CLIENT_FQDN, 0, },
|
||||
0, { 0 },
|
||||
},
|
||||
};
|
||||
|
||||
static const struct opt_status_code sc_not_on_link = {
|
||||
|
@ -320,42 +296,47 @@ static struct opt_hdr *dhcpv6_opt(const struct pool *p, size_t *offset,
|
|||
static struct opt_hdr *dhcpv6_ia_notonlink(const struct pool *p,
|
||||
struct in6_addr *la)
|
||||
{
|
||||
int ia_types[2] = { OPT_IA_NA, OPT_IA_TA }, *ia_type;
|
||||
const struct opt_ia_addr *opt_addr;
|
||||
char buf[INET6_ADDRSTRLEN];
|
||||
struct in6_addr req_addr;
|
||||
const struct opt_hdr *h;
|
||||
struct opt_hdr *ia;
|
||||
size_t offset;
|
||||
int ia_type;
|
||||
|
||||
foreach(ia_type, ia_types) {
|
||||
offset = 0;
|
||||
while ((ia = dhcpv6_opt(p, &offset, *ia_type))) {
|
||||
if (ntohs(ia->l) < OPT_VSIZE(ia_na))
|
||||
ia_type = OPT_IA_NA;
|
||||
ia_ta:
|
||||
offset = 0;
|
||||
while ((ia = dhcpv6_opt(p, &offset, ia_type))) {
|
||||
if (ntohs(ia->l) < OPT_VSIZE(ia_na))
|
||||
return NULL;
|
||||
|
||||
offset += sizeof(struct opt_ia_na);
|
||||
|
||||
while ((h = dhcpv6_opt(p, &offset, OPT_IAAADR))) {
|
||||
const struct opt_ia_addr *opt_addr;
|
||||
|
||||
if (ntohs(h->l) != OPT_VSIZE(ia_addr))
|
||||
return NULL;
|
||||
|
||||
offset += sizeof(struct opt_ia_na);
|
||||
|
||||
while ((h = dhcpv6_opt(p, &offset, OPT_IAAADR))) {
|
||||
if (ntohs(h->l) != OPT_VSIZE(ia_addr))
|
||||
return NULL;
|
||||
|
||||
opt_addr = (const struct opt_ia_addr *)h;
|
||||
req_addr = opt_addr->addr;
|
||||
if (!IN6_ARE_ADDR_EQUAL(la, &req_addr))
|
||||
goto err;
|
||||
|
||||
offset += sizeof(struct opt_ia_addr);
|
||||
opt_addr = (const struct opt_ia_addr *)h;
|
||||
req_addr = opt_addr->addr;
|
||||
if (!IN6_ARE_ADDR_EQUAL(la, &req_addr)) {
|
||||
info("DHCPv6: requested address %s not on link",
|
||||
inet_ntop(AF_INET6, &req_addr,
|
||||
buf, sizeof(buf)));
|
||||
return ia;
|
||||
}
|
||||
|
||||
offset += sizeof(struct opt_ia_addr);
|
||||
}
|
||||
}
|
||||
|
||||
return NULL;
|
||||
if (ia_type == OPT_IA_NA) {
|
||||
ia_type = OPT_IA_TA;
|
||||
goto ia_ta;
|
||||
}
|
||||
|
||||
err:
|
||||
info("DHCPv6: requested address %s not on link",
|
||||
inet_ntop(AF_INET6, &req_addr, buf, sizeof(buf)));
|
||||
return ia;
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -370,6 +351,7 @@ static size_t dhcpv6_dns_fill(const struct ctx *c, char *buf, int offset)
|
|||
{
|
||||
struct opt_dns_servers *srv = NULL;
|
||||
struct opt_dns_search *srch = NULL;
|
||||
char *p = NULL;
|
||||
int i;
|
||||
|
||||
if (c->no_dhcp_dns)
|
||||
|
@ -406,81 +388,34 @@ search:
|
|||
if (!name_len)
|
||||
continue;
|
||||
|
||||
name_len += 2; /* Length byte for first label, and terminator */
|
||||
if (name_len >
|
||||
NS_MAXDNAME + 1 /* Length byte for first label */ ||
|
||||
name_len > 255) {
|
||||
debug("DHCP: DNS search name '%s' too long, skipping",
|
||||
c->dns_search[i].n);
|
||||
continue;
|
||||
}
|
||||
|
||||
if (!srch) {
|
||||
srch = (struct opt_dns_search *)(buf + offset);
|
||||
offset += sizeof(struct opt_hdr);
|
||||
srch->hdr.t = OPT_DNS_SEARCH;
|
||||
srch->hdr.l = 0;
|
||||
p = srch->list;
|
||||
}
|
||||
|
||||
encode_domain_name(buf + offset, c->dns_search[i].n);
|
||||
|
||||
srch->hdr.l += name_len;
|
||||
offset += name_len;
|
||||
|
||||
*p = '.';
|
||||
p = stpncpy(p + 1, c->dns_search[i].n, name_len);
|
||||
p++;
|
||||
srch->hdr.l += name_len + 2;
|
||||
offset += name_len + 2;
|
||||
}
|
||||
|
||||
if (srch)
|
||||
if (srch) {
|
||||
for (i = 0; i < srch->hdr.l; i++) {
|
||||
if (srch->list[i] == '.') {
|
||||
srch->list[i] = strcspn(srch->list + i + 1,
|
||||
".");
|
||||
}
|
||||
}
|
||||
srch->hdr.l = htons(srch->hdr.l);
|
||||
}
|
||||
|
||||
return offset;
|
||||
}
|
||||
|
||||
/**
|
||||
* dhcpv6_client_fqdn_fill() - Fill in client FQDN option
|
||||
* @p: Packet pool searched for the Client FQDN option of the client request
|
||||
* @c: Execution context
|
||||
* @buf: Response message buffer where options will be appended
|
||||
* @offset: Offset in message buffer for new options
|
||||
*
|
||||
* Return: updated length of response message buffer.
|
||||
*/
|
||||
static size_t dhcpv6_client_fqdn_fill(const struct pool *p, const struct ctx *c,
|
||||
char *buf, int offset)
|
||||
|
||||
{
|
||||
struct opt_client_fqdn const *req_opt;
|
||||
struct opt_client_fqdn *o;
|
||||
size_t opt_len;
|
||||
|
||||
opt_len = strlen(c->fqdn);
|
||||
if (opt_len == 0) {
|
||||
return offset;
|
||||
}
|
||||
|
||||
opt_len += 2; /* Length byte for first label, and terminator */
|
||||
if (opt_len > OPT_MAX_SIZE - (offset +
|
||||
sizeof(struct opt_hdr) +
|
||||
1 /* flags */ )) {
|
||||
debug("DHCPv6: client FQDN option doesn't fit, skipping");
|
||||
return offset;
|
||||
}
|
||||
|
||||
o = (struct opt_client_fqdn *)(buf + offset);
|
||||
encode_domain_name(o->domain_name, c->fqdn);
|
||||
req_opt = (struct opt_client_fqdn *)dhcpv6_opt(p, &(size_t){ 0 },
|
||||
OPT_CLIENT_FQDN);
|
||||
if (req_opt && req_opt->flags & 0x01 /* S flag */)
|
||||
o->flags = 0x02 /* O flag */;
|
||||
else
|
||||
o->flags = 0x00;
|
||||
|
||||
opt_len++;
|
||||
|
||||
o->hdr.t = OPT_CLIENT_FQDN;
|
||||
o->hdr.l = htons(opt_len);
|
||||
|
||||
return offset + sizeof(struct opt_hdr) + opt_len;
|
||||
}
|
||||
|
||||
/**
|
||||
* dhcpv6() - Check if this is a DHCPv6 message, reply as needed
|
||||
* @c: Execution context
|
||||
|
@ -493,11 +428,11 @@ static size_t dhcpv6_client_fqdn_fill(const struct pool *p, const struct ctx *c,
|
|||
int dhcpv6(struct ctx *c, const struct pool *p,
|
||||
const struct in6_addr *saddr, const struct in6_addr *daddr)
|
||||
{
|
||||
const struct opt_hdr *client_id, *server_id, *ia;
|
||||
struct opt_hdr *ia, *bad_ia, *client_id;
|
||||
const struct opt_hdr *server_id;
|
||||
const struct in6_addr *src;
|
||||
const struct msg_hdr *mh;
|
||||
const struct udphdr *uh;
|
||||
struct opt_hdr *bad_ia;
|
||||
size_t mlen, n;
|
||||
|
||||
uh = packet_get(p, 0, 0, sizeof(*uh), &mlen);
|
||||
|
@ -614,7 +549,6 @@ int dhcpv6(struct ctx *c, const struct pool *p,
|
|||
n = offsetof(struct resp_t, client_id) +
|
||||
sizeof(struct opt_hdr) + ntohs(client_id->l);
|
||||
n = dhcpv6_dns_fill(c, (char *)&resp, n);
|
||||
n = dhcpv6_client_fqdn_fill(p, c, (char *)&resp, n);
|
||||
|
||||
resp.hdr.xid = mh->xid;
|
||||
|
||||
|
|
2
doc/migration/.gitignore
vendored
2
doc/migration/.gitignore
vendored
|
@ -1,2 +0,0 @@
|
|||
/source
|
||||
/target
|
|
@ -1,20 +0,0 @@
|
|||
# SPDX-License-Identifier: GPL-2.0-or-later
|
||||
#
|
||||
# PASST - Plug A Simple Socket Transport
|
||||
# for qemu/UNIX domain socket mode
|
||||
#
|
||||
# PASTA - Pack A Subtle Tap Abstraction
|
||||
# for network namespace/tap device mode
|
||||
#
|
||||
# Copyright (c) 2025 Red Hat GmbH
|
||||
# Author: Stefano Brivio <sbrivio@redhat.com>
|
||||
|
||||
TARGETS = source target
|
||||
CFLAGS = -Wall -Wextra -pedantic
|
||||
|
||||
all: $(TARGETS)
|
||||
|
||||
$(TARGETS): %: %.c
|
||||
|
||||
clean:
|
||||
rm -f $(TARGETS)
|
|
@ -1,51 +0,0 @@
|
|||
<!---
|
||||
SPDX-License-Identifier: GPL-2.0-or-later
|
||||
Copyright (c) 2025 Red Hat GmbH
|
||||
Author: Stefano Brivio <sbrivio@redhat.com>
|
||||
-->
|
||||
|
||||
Migration
|
||||
=========
|
||||
|
||||
These test programs show a migration of a TCP connection from one process to
|
||||
another using the TCP_REPAIR socket option.
|
||||
|
||||
The two processes are a mock of the matching implementation in passt(1), and run
|
||||
unprivileged, so they rely on the passt-repair helper to connect to them and set
|
||||
or clear TCP_REPAIR on the connection socket, transferred to the helper using
|
||||
SCM_RIGHTS.
|
||||
|
||||
The passt-repair helper needs to have the CAP_NET_ADMIN capability, or run as
|
||||
root.
|
||||
|
||||
Example of usage
|
||||
----------------
|
||||
|
||||
* Start the test server
|
||||
|
||||
$ nc -l 9999
|
||||
|
||||
* Start the source side of the TCP client (mock of the source instance of passt)
|
||||
|
||||
$ ./source 127.0.0.1 9999 9998 /tmp/repair.sock
|
||||
|
||||
* The client sends a test string, and waits for a connection from passt-repair
|
||||
|
||||
# passt-repair /tmp/repair.sock
|
||||
|
||||
* The socket is now in repair mode, and `source` dumps sequences, then exits
|
||||
|
||||
sending sequence: 3244673313
|
||||
receiving sequence: 2250449386
|
||||
|
||||
* Continue the connection on the target side, restarting from those sequences
|
||||
|
||||
$ ./target 127.0.0.1 9999 9998 /tmp/repair.sock 3244673313 2250449386
|
||||
|
||||
* The target side now waits for a connection from passt-repair
|
||||
|
||||
# passt-repair /tmp/repair.sock
|
||||
|
||||
* The target side asks passt-repair to switch the socket to repair mode, sets up
|
||||
the TCP sequences, then asks passt-repair to clear repair mode, and sends a
|
||||
test string to the server
|
|
@ -1,92 +0,0 @@
|
|||
// SPDX-License-Identifier: GPL-2.0-or-later
|
||||
|
||||
/* PASST - Plug A Simple Socket Transport
|
||||
* for qemu/UNIX domain socket mode
|
||||
*
|
||||
* PASTA - Pack A Subtle Tap Abstraction
|
||||
* for network namespace/tap device mode
|
||||
*
|
||||
* doc/migration/source.c - Mock of TCP migration source, use with passt-repair
|
||||
*
|
||||
* Copyright (c) 2025 Red Hat GmbH
|
||||
* Author: Stefano Brivio <sbrivio@redhat.com>
|
||||
*/
|
||||
|
||||
#include <arpa/inet.h>
|
||||
#include <sys/types.h>
|
||||
#include <sys/socket.h>
|
||||
#include <sys/un.h>
|
||||
#include <stdint.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <limits.h>
|
||||
#include <unistd.h>
|
||||
#include <netdb.h>
|
||||
#include <netinet/tcp.h>
|
||||
|
||||
int main(int argc, char **argv)
|
||||
{
|
||||
struct sockaddr_in a = { AF_INET, htons(atoi(argv[3])), { 0 }, { 0 } };
|
||||
struct addrinfo hints = { 0, AF_UNSPEC, SOCK_STREAM, 0, 0,
|
||||
NULL, NULL, NULL };
|
||||
struct sockaddr_un a_helper = { AF_UNIX, { 0 } };
|
||||
int seq, s, s_helper;
|
||||
int8_t cmd;
|
||||
struct iovec iov = { &cmd, sizeof(cmd) };
|
||||
char buf[CMSG_SPACE(sizeof(int))];
|
||||
struct msghdr msg = { NULL, 0, &iov, 1, buf, sizeof(buf), 0 };
|
||||
struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);
|
||||
socklen_t seqlen = sizeof(int);
|
||||
struct addrinfo *r;
|
||||
|
||||
(void)argc;
|
||||
|
||||
if (argc != 5) {
|
||||
fprintf(stderr, "%s DST_ADDR DST_PORT SRC_PORT HELPER_PATH\n",
|
||||
argv[0]);
|
||||
return -1;
|
||||
}
|
||||
|
||||
strcpy(a_helper.sun_path, argv[4]);
|
||||
getaddrinfo(argv[1], argv[2], &hints, &r);
|
||||
|
||||
/* Connect socket to server and send some data */
|
||||
s = socket(r->ai_family, SOCK_STREAM, IPPROTO_TCP);
|
||||
setsockopt(s, SOL_SOCKET, SO_REUSEADDR, &((int){ 1 }), sizeof(int));
|
||||
bind(s, (struct sockaddr *)&a, sizeof(a));
|
||||
connect(s, r->ai_addr, r->ai_addrlen);
|
||||
send(s, "before migration\n", sizeof("before migration\n"), 0);
|
||||
|
||||
/* Wait for helper */
|
||||
s_helper = socket(AF_UNIX, SOCK_STREAM, 0);
|
||||
unlink(a_helper.sun_path);
|
||||
bind(s_helper, (struct sockaddr *)&a_helper, sizeof(a_helper));
|
||||
listen(s_helper, 1);
|
||||
s_helper = accept(s_helper, NULL, NULL);
|
||||
|
||||
/* Set up message for helper, with socket */
|
||||
cmsg->cmsg_level = SOL_SOCKET;
|
||||
cmsg->cmsg_type = SCM_RIGHTS;
|
||||
cmsg->cmsg_len = CMSG_LEN(sizeof(int));
|
||||
memcpy(CMSG_DATA(cmsg), &s, sizeof(s));
|
||||
|
||||
/* Send command to helper: turn repair mode on, wait for reply */
|
||||
cmd = TCP_REPAIR_ON;
|
||||
sendmsg(s_helper, &msg, 0);
|
||||
recv(s_helper, &((int8_t){ 0 }), 1, 0);
|
||||
|
||||
/* Terminate helper */
|
||||
close(s_helper);
|
||||
|
||||
/* Get sending sequence */
|
||||
seq = TCP_SEND_QUEUE;
|
||||
setsockopt(s, SOL_TCP, TCP_REPAIR_QUEUE, &seq, sizeof(seq));
|
||||
getsockopt(s, SOL_TCP, TCP_QUEUE_SEQ, &seq, &seqlen);
|
||||
fprintf(stdout, "%u ", seq);
|
||||
|
||||
/* Get receiving sequence */
|
||||
seq = TCP_RECV_QUEUE;
|
||||
setsockopt(s, SOL_TCP, TCP_REPAIR_QUEUE, &seq, sizeof(seq));
|
||||
getsockopt(s, SOL_TCP, TCP_QUEUE_SEQ, &seq, &seqlen);
|
||||
fprintf(stdout, "%u\n", seq);
|
||||
}
|
|
@ -1,102 +0,0 @@
|
|||
// SPDX-License-Identifier: GPL-2.0-or-later
|
||||
|
||||
/* PASST - Plug A Simple Socket Transport
|
||||
* for qemu/UNIX domain socket mode
|
||||
*
|
||||
* PASTA - Pack A Subtle Tap Abstraction
|
||||
* for network namespace/tap device mode
|
||||
*
|
||||
* doc/migration/target.c - Mock of TCP migration target, use with passt-repair
|
||||
*
|
||||
* Copyright (c) 2025 Red Hat GmbH
|
||||
* Author: Stefano Brivio <sbrivio@redhat.com>
|
||||
*/
|
||||
|
||||
#include <arpa/inet.h>
|
||||
#include <sys/types.h>
|
||||
#include <sys/socket.h>
|
||||
#include <sys/un.h>
|
||||
#include <stdint.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <limits.h>
|
||||
#include <unistd.h>
|
||||
#include <netdb.h>
|
||||
#include <netinet/tcp.h>
|
||||
|
||||
int main(int argc, char **argv)
|
||||
{
|
||||
struct sockaddr_in a = { AF_INET, htons(atoi(argv[3])), { 0 }, { 0 } };
|
||||
struct addrinfo hints = { 0, AF_UNSPEC, SOCK_STREAM, 0, 0,
|
||||
NULL, NULL, NULL };
|
||||
struct sockaddr_un a_helper = { AF_UNIX, { 0 } };
|
||||
int s, s_helper, seq;
|
||||
int8_t cmd;
|
||||
struct iovec iov = { &cmd, sizeof(cmd) };
|
||||
char buf[CMSG_SPACE(sizeof(int))];
|
||||
struct msghdr msg = { NULL, 0, &iov, 1, buf, sizeof(buf), 0 };
|
||||
struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);
|
||||
struct addrinfo *r;
|
||||
|
||||
(void)argc;
|
||||
|
||||
strcpy(a_helper.sun_path, argv[4]);
|
||||
getaddrinfo(argv[1], argv[2], &hints, &r);
|
||||
|
||||
if (argc != 7) {
|
||||
fprintf(stderr,
|
||||
"%s DST_ADDR DST_PORT SRC_PORT HELPER_PATH SSEQ RSEQ\n",
|
||||
argv[0]);
|
||||
return -1;
|
||||
}
|
||||
|
||||
/* Prepare socket, bind to source port */
|
||||
s = socket(r->ai_family, SOCK_STREAM, IPPROTO_TCP);
|
||||
setsockopt(s, SOL_SOCKET, SO_REUSEADDR, &((int){ 1 }), sizeof(int));
|
||||
bind(s, (struct sockaddr *)&a, sizeof(a));
|
||||
|
||||
/* Wait for helper */
|
||||
s_helper = socket(AF_UNIX, SOCK_STREAM, 0);
|
||||
unlink(a_helper.sun_path);
|
||||
bind(s_helper, (struct sockaddr *)&a_helper, sizeof(a_helper));
|
||||
listen(s_helper, 1);
|
||||
s_helper = accept(s_helper, NULL, NULL);
|
||||
|
||||
/* Set up message for helper, with socket */
|
||||
cmsg->cmsg_level = SOL_SOCKET;
|
||||
cmsg->cmsg_type = SCM_RIGHTS;
|
||||
cmsg->cmsg_len = CMSG_LEN(sizeof(int));
|
||||
memcpy(CMSG_DATA(cmsg), &s, sizeof(s));
|
||||
|
||||
/* Send command to helper: turn repair mode on, wait for reply */
|
||||
cmd = TCP_REPAIR_ON;
|
||||
sendmsg(s_helper, &msg, 0);
|
||||
recv(s_helper, &((int){ 0 }), 1, 0);
|
||||
|
||||
/* Set sending sequence */
|
||||
seq = TCP_SEND_QUEUE;
|
||||
setsockopt(s, SOL_TCP, TCP_REPAIR_QUEUE, &seq, sizeof(seq));
|
||||
seq = atoi(argv[5]);
|
||||
setsockopt(s, SOL_TCP, TCP_QUEUE_SEQ, &seq, sizeof(seq));
|
||||
|
||||
/* Set receiving sequence */
|
||||
seq = TCP_RECV_QUEUE;
|
||||
setsockopt(s, SOL_TCP, TCP_REPAIR_QUEUE, &seq, sizeof(seq));
|
||||
seq = atoi(argv[6]);
|
||||
setsockopt(s, SOL_TCP, TCP_QUEUE_SEQ, &seq, sizeof(seq));
|
||||
|
||||
/* Connect setting kernel state only, without actual SYN / handshake */
|
||||
connect(s, r->ai_addr, r->ai_addrlen);
|
||||
|
||||
/* Send command to helper: turn repair mode off, wait for reply */
|
||||
cmd = TCP_REPAIR_OFF;
|
||||
sendmsg(s_helper, &msg, 0);
|
||||
|
||||
recv(s_helper, &((int8_t){ 0 }), 1, 0);
|
||||
|
||||
/* Terminate helper */
|
||||
close(s_helper);
|
||||
|
||||
/* Send some more data */
|
||||
send(s, "after migration\n", sizeof("after migration\n"), 0);
|
||||
}
|
1
doc/platform-requirements/.gitignore
vendored
1
doc/platform-requirements/.gitignore
vendored
|
@ -1,4 +1,3 @@
|
|||
/listen-vs-repair
|
||||
/reuseaddr-priority
|
||||
/recv-zero
|
||||
/udp-close-dup
|
||||
|
|
|
@ -3,8 +3,8 @@
|
|||
# Copyright Red Hat
|
||||
# Author: David Gibson <david@gibson.dropbear.id.au>
|
||||
|
||||
TARGETS = reuseaddr-priority recv-zero udp-close-dup listen-vs-repair
|
||||
SRCS = reuseaddr-priority.c recv-zero.c udp-close-dup.c listen-vs-repair.c
|
||||
TARGETS = reuseaddr-priority recv-zero udp-close-dup
|
||||
SRCS = reuseaddr-priority.c recv-zero.c udp-close-dup.c
|
||||
CFLAGS = -Wall
|
||||
|
||||
all: cppcheck clang-tidy $(TARGETS:%=check-%)
|
||||
|
|
|
@ -15,7 +15,6 @@
|
|||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
__attribute__((format(printf, 1, 2), noreturn))
|
||||
static inline void die(const char *fmt, ...)
|
||||
{
|
||||
va_list ap;
|
||||
|
|
|
@ -1,128 +0,0 @@
|
|||
// SPDX-License-Identifier: GPL-2.0-or-later
|
||||
|
||||
/* listen-vs-repair.c
|
||||
*
|
||||
* Do listening sockets have address conflicts with sockets under repair
|
||||
* ====================================================================
|
||||
*
|
||||
* When we accept() an incoming connection the accept()ed socket will have the
|
||||
* same local address as the listening socket. This can be a complication on
|
||||
* migration. On the migration target we've already set up listening sockets
|
||||
* according to the command line. However to restore connections that we're
|
||||
* migrating in we need to bind the new sockets to the same address, which would
|
||||
* be an address conflict on the face of it. This test program verifies that
|
||||
* enabling repair mode before bind() correctly suppresses that conflict.
|
||||
*
|
||||
* Copyright Red Hat
|
||||
* Author: David Gibson <david@gibson.dropbear.id.au>
|
||||
*/
|
||||
|
||||
/* NOLINTNEXTLINE(bugprone-reserved-identifier,cert-dcl37-c,cert-dcl51-cpp) */
|
||||
#define _GNU_SOURCE
|
||||
|
||||
#include <arpa/inet.h>
|
||||
#include <errno.h>
|
||||
#include <linux/netlink.h>
|
||||
#include <linux/rtnetlink.h>
|
||||
#include <net/if.h>
|
||||
#include <netinet/in.h>
|
||||
#include <netinet/tcp.h>
|
||||
#include <sched.h>
|
||||
#include <stdbool.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <unistd.h>
|
||||
|
||||
#include "common.h"
|
||||
|
||||
#define PORT 13256U
|
||||
#define CPORT 13257U
|
||||
|
||||
/* 127.0.0.1:PORT */
|
||||
static const struct sockaddr_in addr = SOCKADDR_INIT(INADDR_LOOPBACK, PORT);
|
||||
|
||||
/* 127.0.0.1:CPORT */
|
||||
static const struct sockaddr_in caddr = SOCKADDR_INIT(INADDR_LOOPBACK, CPORT);
|
||||
|
||||
/* Put ourselves into a network sandbox */
|
||||
static void net_sandbox(void)
|
||||
{
|
||||
/* NOLINTNEXTLINE(altera-struct-pack-align) */
|
||||
const struct req_t {
|
||||
struct nlmsghdr nlh;
|
||||
struct ifinfomsg ifm;
|
||||
} __attribute__((packed)) req = {
|
||||
.nlh.nlmsg_type = RTM_NEWLINK,
|
||||
.nlh.nlmsg_flags = NLM_F_REQUEST,
|
||||
.nlh.nlmsg_len = sizeof(req),
|
||||
.nlh.nlmsg_seq = 1,
|
||||
.ifm.ifi_family = AF_UNSPEC,
|
||||
.ifm.ifi_index = 1,
|
||||
.ifm.ifi_flags = IFF_UP,
|
||||
.ifm.ifi_change = IFF_UP,
|
||||
};
|
||||
int nl;
|
||||
|
||||
if (unshare(CLONE_NEWUSER | CLONE_NEWNET))
|
||||
die("unshare(): %s\n", strerror(errno));
|
||||
|
||||
/* Bring up lo in the new netns */
|
||||
nl = socket(AF_NETLINK, SOCK_RAW | SOCK_CLOEXEC, NETLINK_ROUTE);
|
||||
if (nl < 0)
|
||||
die("Can't create netlink socket: %s\n", strerror(errno));
|
||||
|
||||
if (send(nl, &req, sizeof(req), 0) < 0)
|
||||
die("Netlink send(): %s\n", strerror(errno));
|
||||
close(nl);
|
||||
}
|
||||
|
||||
static void check(void)
|
||||
{
|
||||
int s1, s2, op;
|
||||
|
||||
s1 = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP);
|
||||
if (s1 < 0)
|
||||
die("socket() 1: %s\n", strerror(errno));
|
||||
|
||||
if (bind(s1, (struct sockaddr *)&addr, sizeof(addr)))
|
||||
die("bind() 1: %s\n", strerror(errno));
|
||||
|
||||
if (listen(s1, 0))
|
||||
die("listen(): %s\n", strerror(errno));
|
||||
|
||||
s2 = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP);
|
||||
if (s2 < 0)
|
||||
die("socket() 2: %s\n", strerror(errno));
|
||||
|
||||
op = TCP_REPAIR_ON;
|
||||
if (setsockopt(s2, SOL_TCP, TCP_REPAIR, &op, sizeof(op)))
|
||||
die("TCP_REPAIR: %s\n", strerror(errno));
|
||||
|
||||
if (bind(s2, (struct sockaddr *)&addr, sizeof(addr)))
|
||||
die("bind() 2: %s\n", strerror(errno));
|
||||
|
||||
if (connect(s2, (struct sockaddr *)&caddr, sizeof(caddr)))
|
||||
die("connect(): %s\n", strerror(errno));
|
||||
|
||||
op = TCP_REPAIR_OFF_NO_WP;
|
||||
if (setsockopt(s2, SOL_TCP, TCP_REPAIR, &op, sizeof(op)))
|
||||
die("TCP_REPAIR: %s\n", strerror(errno));
|
||||
|
||||
close(s1);
|
||||
close(s2);
|
||||
}
|
||||
|
||||
int main(int argc, char *argv[])
|
||||
{
|
||||
(void)argc;
|
||||
(void)argv;
|
||||
|
||||
net_sandbox();
|
||||
|
||||
check();
|
||||
|
||||
printf("Repair mode appears to properly suppress conflicts with listening sockets\n");
|
||||
|
||||
exit(0);
|
||||
}
|
|
@ -46,13 +46,13 @@
|
|||
/* Different cases for receiving socket configuration */
|
||||
enum sock_type {
|
||||
/* Socket is bound to 0.0.0.0:DSTPORT and not connected */
|
||||
SOCK_BOUND_ANY,
|
||||
SOCK_BOUND_ANY = 0,
|
||||
|
||||
/* Socket is bound to 127.0.0.1:DSTPORT and not connected */
|
||||
SOCK_BOUND_LO,
|
||||
SOCK_BOUND_LO = 1,
|
||||
|
||||
/* Socket is bound to 0.0.0.0:DSTPORT and connected to 127.0.0.1:SRCPORT */
|
||||
SOCK_CONNECTED,
|
||||
SOCK_CONNECTED = 2,
|
||||
|
||||
NUM_SOCK_TYPES,
|
||||
};
|
||||
|
|
12
epoll_type.h
12
epoll_type.h
|
@ -22,8 +22,8 @@ enum epoll_type {
|
|||
EPOLL_TYPE_TCP_TIMER,
|
||||
/* UDP "listening" sockets */
|
||||
EPOLL_TYPE_UDP_LISTEN,
|
||||
/* UDP socket for a specific flow */
|
||||
EPOLL_TYPE_UDP,
|
||||
/* UDP socket for replies on a specific flow */
|
||||
EPOLL_TYPE_UDP_REPLY,
|
||||
/* ICMP/ICMPv6 ping sockets */
|
||||
EPOLL_TYPE_PING,
|
||||
/* inotify fd watching for end of netns (pasta) */
|
||||
|
@ -36,14 +36,6 @@ enum epoll_type {
|
|||
EPOLL_TYPE_TAP_PASST,
|
||||
/* socket listening for qemu socket connections */
|
||||
EPOLL_TYPE_TAP_LISTEN,
|
||||
/* vhost-user command socket */
|
||||
EPOLL_TYPE_VHOST_CMD,
|
||||
/* vhost-user kick event socket */
|
||||
EPOLL_TYPE_VHOST_KICK,
|
||||
/* TCP_REPAIR helper listening socket */
|
||||
EPOLL_TYPE_REPAIR_LISTEN,
|
||||
/* TCP_REPAIR helper socket */
|
||||
EPOLL_TYPE_REPAIR,
|
||||
|
||||
EPOLL_NUM_TYPES,
|
||||
};
|
||||
|
|
441
flow.c
441
flow.c
|
@ -19,7 +19,6 @@
|
|||
#include "inany.h"
|
||||
#include "flow.h"
|
||||
#include "flow_table.h"
|
||||
#include "repair.h"
|
||||
|
||||
const char *flow_state_str[] = {
|
||||
[FLOW_STATE_FREE] = "FREE",
|
||||
|
@ -53,13 +52,6 @@ const uint8_t flow_proto[] = {
|
|||
static_assert(ARRAY_SIZE(flow_proto) == FLOW_NUM_TYPES,
|
||||
"flow_proto[] doesn't match enum flow_type");
|
||||
|
||||
#define foreach_established_tcp_flow(flow) \
|
||||
flow_foreach_of_type((flow), FLOW_TCP) \
|
||||
if (!tcp_flow_is_established(&(flow)->tcp)) \
|
||||
/* NOLINTNEXTLINE(bugprone-branch-clone) */ \
|
||||
continue; \
|
||||
else
|
||||
|
||||
/* Global Flow Table */
|
||||
|
||||
/**
|
||||
|
@ -81,7 +73,7 @@ static_assert(ARRAY_SIZE(flow_proto) == FLOW_NUM_TYPES,
|
|||
*
|
||||
* Free cluster list
|
||||
* flow_first_free gives the index of the first (lowest index) free cluster.
|
||||
* Each free cluster has the index of the next free cluster, or FLOW_MAX if
|
||||
* Each free cluster has the index of the next free cluster, or MAX_FLOW if
|
||||
* it is the last free cluster. Together these form a linked list of free
|
||||
* clusters, in strictly increasing order of index.
|
||||
*
|
||||
|
@ -267,13 +259,11 @@ int flowside_connect(const struct ctx *c, int s,
|
|||
|
||||
/** flow_log_ - Log flow-related message
|
||||
* @f: flow the message is related to
|
||||
* @newline: Append newline at the end of the message, if missing
|
||||
* @pri: Log priority
|
||||
* @fmt: Format string
|
||||
* @...: printf-arguments
|
||||
*/
|
||||
void flow_log_(const struct flow_common *f, bool newline, int pri,
|
||||
const char *fmt, ...)
|
||||
void flow_log_(const struct flow_common *f, int pri, const char *fmt, ...)
|
||||
{
|
||||
const char *type_or_state;
|
||||
char msg[BUFSIZ];
|
||||
|
@ -289,7 +279,7 @@ void flow_log_(const struct flow_common *f, bool newline, int pri,
|
|||
else
|
||||
type_or_state = FLOW_TYPE(f);
|
||||
|
||||
logmsg(newline, false, pri,
|
||||
logmsg(true, false, pri,
|
||||
"Flow %u (%s): %s", flow_idx(f), type_or_state, msg);
|
||||
}
|
||||
|
||||
|
@ -309,7 +299,7 @@ void flow_log_details_(const struct flow_common *f, int pri,
|
|||
const struct flowside *tgt = &f->side[TGTSIDE];
|
||||
|
||||
if (state >= FLOW_STATE_TGT)
|
||||
flow_log_(f, true, pri,
|
||||
flow_log_(f, pri,
|
||||
"%s [%s]:%hu -> [%s]:%hu => %s [%s]:%hu -> [%s]:%hu",
|
||||
pif_name(f->pif[INISIDE]),
|
||||
inany_ntop(&ini->eaddr, estr0, sizeof(estr0)),
|
||||
|
@ -322,7 +312,7 @@ void flow_log_details_(const struct flow_common *f, int pri,
|
|||
inany_ntop(&tgt->eaddr, estr1, sizeof(estr1)),
|
||||
tgt->eport);
|
||||
else if (state >= FLOW_STATE_INI)
|
||||
flow_log_(f, true, pri, "%s [%s]:%hu -> [%s]:%hu => ?",
|
||||
flow_log_(f, pri, "%s [%s]:%hu -> [%s]:%hu => ?",
|
||||
pif_name(f->pif[INISIDE]),
|
||||
inany_ntop(&ini->eaddr, estr0, sizeof(estr0)),
|
||||
ini->eport,
|
||||
|
@ -343,7 +333,7 @@ static void flow_set_state(struct flow_common *f, enum flow_state state)
|
|||
ASSERT(oldstate < FLOW_NUM_STATES);
|
||||
|
||||
f->state = state;
|
||||
flow_log_(f, true, LOG_DEBUG, "%s -> %s", flow_state_str[oldstate],
|
||||
flow_log_(f, LOG_DEBUG, "%s -> %s", flow_state_str[oldstate],
|
||||
FLOW_STATE(f));
|
||||
|
||||
flow_log_details_(f, LOG_DEBUG, MAX(state, oldstate));
|
||||
|
@ -396,27 +386,18 @@ const struct flowside *flow_initiate_af(union flow *flow, uint8_t pif,
|
|||
* @flow: Flow to change state
|
||||
* @pif: pif of the initiating side
|
||||
* @ssa: Source socket address
|
||||
* @daddr: Destination address (may be NULL)
|
||||
* @dport: Destination port
|
||||
*
|
||||
* Return: pointer to the initiating flowside information
|
||||
*/
|
||||
struct flowside *flow_initiate_sa(union flow *flow, uint8_t pif,
|
||||
const union sockaddr_inany *ssa,
|
||||
const union inany_addr *daddr,
|
||||
in_port_t dport)
|
||||
const struct flowside *flow_initiate_sa(union flow *flow, uint8_t pif,
|
||||
const union sockaddr_inany *ssa,
|
||||
in_port_t dport)
|
||||
{
|
||||
struct flowside *ini = &flow->f.side[INISIDE];
|
||||
|
||||
if (inany_from_sockaddr(&ini->eaddr, &ini->eport, ssa) < 0) {
|
||||
char str[SOCKADDR_STRLEN];
|
||||
|
||||
ASSERT_WITH_MSG(0, "Bad socket address %s",
|
||||
sockaddr_ntop(ssa, str, sizeof(str)));
|
||||
}
|
||||
if (daddr)
|
||||
ini->oaddr = *daddr;
|
||||
else if (inany_v4(&ini->eaddr))
|
||||
inany_from_sockaddr(&ini->eaddr, &ini->eport, ssa);
|
||||
if (inany_v4(&ini->eaddr))
|
||||
ini->oaddr = inany_any4;
|
||||
else
|
||||
ini->oaddr = inany_any6;
|
||||
|
@ -433,8 +414,8 @@ struct flowside *flow_initiate_sa(union flow *flow, uint8_t pif,
|
|||
*
|
||||
* Return: pointer to the target flowside information
|
||||
*/
|
||||
struct flowside *flow_target(const struct ctx *c, union flow *flow,
|
||||
uint8_t proto)
|
||||
const struct flowside *flow_target(const struct ctx *c, union flow *flow,
|
||||
uint8_t proto)
|
||||
{
|
||||
char estr[INANY_ADDRSTRLEN], fstr[INANY_ADDRSTRLEN];
|
||||
struct flow_common *f = &flow->f;
|
||||
|
@ -480,9 +461,7 @@ struct flowside *flow_target(const struct ctx *c, union flow *flow,
|
|||
/**
|
||||
* flow_set_type() - Set type and move to TYPED
|
||||
* @flow: Flow to change state
|
||||
* @type: New flow type to assign
|
||||
*
|
||||
* Return: pointer to the modified flow structure.
|
||||
* @pif: pif of the initiating side
|
||||
*/
|
||||
union flow *flow_set_type(union flow *flow, enum flow_type type)
|
||||
{
|
||||
|
@ -618,7 +597,12 @@ static uint64_t flow_sidx_hash(const struct ctx *c, flow_sidx_t sidx)
|
|||
const struct flowside *side = &f->side[sidx.sidei];
|
||||
uint8_t pif = f->pif[sidx.sidei];
|
||||
|
||||
ASSERT(pif != PIF_NONE);
|
||||
/* For the hash table to work, entries must have complete endpoint
|
||||
* information, and at least a forwarding port.
|
||||
*/
|
||||
ASSERT(pif != PIF_NONE && !inany_is_unspecified(&side->eaddr) &&
|
||||
side->eport != 0 && side->oport != 0);
|
||||
|
||||
return flow_hash(c, FLOW_PROTO(f), pif, side);
|
||||
}
|
||||
|
||||
|
@ -627,7 +611,7 @@ static uint64_t flow_sidx_hash(const struct ctx *c, flow_sidx_t sidx)
|
|||
* @hash: Raw hash value for flow & side
|
||||
* @sidx: Flow and side to find bucket for
|
||||
*
|
||||
* Return: if @sidx is in the hash table, its current bucket, otherwise a
|
||||
* Return: If @sidx is in the hash table, its current bucket, otherwise a
|
||||
* suitable free bucket for it.
|
||||
*/
|
||||
static inline unsigned flow_hash_probe_(uint64_t hash, flow_sidx_t sidx)
|
||||
|
@ -647,7 +631,7 @@ static inline unsigned flow_hash_probe_(uint64_t hash, flow_sidx_t sidx)
|
|||
* @c: Execution context
|
||||
* @sidx: Flow and side to find bucket for
|
||||
*
|
||||
* Return: if @sidx is in the hash table, its current bucket, otherwise a
|
||||
* Return: If @sidx is in the hash table, its current bucket, otherwise a
|
||||
* suitable free bucket for it.
|
||||
*/
|
||||
static inline unsigned flow_hash_probe(const struct ctx *c, flow_sidx_t sidx)
|
||||
|
@ -762,30 +746,19 @@ flow_sidx_t flow_lookup_af(const struct ctx *c,
|
|||
* @proto: Protocol of the flow (IP L4 protocol number)
|
||||
* @pif: Interface of the flow
|
||||
* @esa: Socket address of the endpoint
|
||||
* @oaddr: Our address (may be NULL)
|
||||
* @oport: Our port number
|
||||
*
|
||||
* Return: sidx of the matching flow & side, FLOW_SIDX_NONE if not found
|
||||
*/
|
||||
flow_sidx_t flow_lookup_sa(const struct ctx *c, uint8_t proto, uint8_t pif,
|
||||
const void *esa,
|
||||
const union inany_addr *oaddr, in_port_t oport)
|
||||
const void *esa, in_port_t oport)
|
||||
{
|
||||
struct flowside side = {
|
||||
.oport = oport,
|
||||
};
|
||||
|
||||
if (inany_from_sockaddr(&side.eaddr, &side.eport, esa) < 0) {
|
||||
char str[SOCKADDR_STRLEN];
|
||||
|
||||
warn("Flow lookup on bad socket address %s",
|
||||
sockaddr_ntop(esa, str, sizeof(str)));
|
||||
return FLOW_SIDX_NONE;
|
||||
}
|
||||
|
||||
if (oaddr)
|
||||
side.oaddr = *oaddr;
|
||||
else if (inany_v4(&side.eaddr))
|
||||
inany_from_sockaddr(&side.eaddr, &side.eport, esa);
|
||||
if (inany_v4(&side.eaddr))
|
||||
side.oaddr = inany_any4;
|
||||
else
|
||||
side.oaddr = inany_any6;
|
||||
|
@ -802,9 +775,8 @@ void flow_defer_handler(const struct ctx *c, const struct timespec *now)
|
|||
{
|
||||
struct flow_free_cluster *free_head = NULL;
|
||||
unsigned *last_next = &flow_first_free;
|
||||
bool to_free[FLOW_MAX] = { 0 };
|
||||
bool timer = false;
|
||||
union flow *flow;
|
||||
unsigned idx;
|
||||
|
||||
if (timespec_diff_ms(now, &flow_timer_run) >= FLOW_TIMER_INTERVAL) {
|
||||
timer = true;
|
||||
|
@ -813,12 +785,49 @@ void flow_defer_handler(const struct ctx *c, const struct timespec *now)
|
|||
|
||||
ASSERT(!flow_new_entry); /* Incomplete flow at end of cycle */
|
||||
|
||||
/* Check which flows we might need to close first, but don't free them
|
||||
* yet as it's not safe to do that in the middle of flow_foreach().
|
||||
*/
|
||||
flow_foreach(flow) {
|
||||
for (idx = 0; idx < FLOW_MAX; idx++) {
|
||||
union flow *flow = &flowtab[idx];
|
||||
bool closed = false;
|
||||
|
||||
switch (flow->f.state) {
|
||||
case FLOW_STATE_FREE: {
|
||||
unsigned skip = flow->free.n;
|
||||
|
||||
/* First entry of a free cluster must have n >= 1 */
|
||||
ASSERT(skip);
|
||||
|
||||
if (free_head) {
|
||||
/* Merge into preceding free cluster */
|
||||
free_head->n += flow->free.n;
|
||||
flow->free.n = flow->free.next = 0;
|
||||
} else {
|
||||
/* New free cluster, add to chain */
|
||||
free_head = &flow->free;
|
||||
*last_next = idx;
|
||||
last_next = &free_head->next;
|
||||
}
|
||||
|
||||
/* Skip remaining empty entries */
|
||||
idx += skip - 1;
|
||||
continue;
|
||||
}
|
||||
|
||||
case FLOW_STATE_NEW:
|
||||
case FLOW_STATE_INI:
|
||||
case FLOW_STATE_TGT:
|
||||
case FLOW_STATE_TYPED:
|
||||
/* Incomplete flow at end of cycle */
|
||||
ASSERT(false);
|
||||
break;
|
||||
|
||||
case FLOW_STATE_ACTIVE:
|
||||
/* Nothing to do */
|
||||
break;
|
||||
|
||||
default:
|
||||
ASSERT(false);
|
||||
}
|
||||
|
||||
switch (flow->f.type) {
|
||||
case FLOW_TYPE_NONE:
|
||||
ASSERT(false);
|
||||
|
@ -837,7 +846,7 @@ void flow_defer_handler(const struct ctx *c, const struct timespec *now)
|
|||
closed = icmp_ping_timer(c, &flow->ping, now);
|
||||
break;
|
||||
case FLOW_UDP:
|
||||
closed = udp_flow_defer(c, &flow->udp, now);
|
||||
closed = udp_flow_defer(&flow->udp);
|
||||
if (!closed && timer)
|
||||
closed = udp_flow_timer(c, &flow->udp, now);
|
||||
break;
|
||||
|
@ -846,322 +855,30 @@ void flow_defer_handler(const struct ctx *c, const struct timespec *now)
|
|||
;
|
||||
}
|
||||
|
||||
to_free[FLOW_IDX(flow)] = closed;
|
||||
}
|
||||
|
||||
/* Second step: actually free the flows */
|
||||
flow_foreach_slot(flow) {
|
||||
switch (flow->f.state) {
|
||||
case FLOW_STATE_FREE: {
|
||||
unsigned skip = flow->free.n;
|
||||
|
||||
/* First entry of a free cluster must have n >= 1 */
|
||||
ASSERT(skip);
|
||||
if (closed) {
|
||||
flow_set_state(&flow->f, FLOW_STATE_FREE);
|
||||
memset(flow, 0, sizeof(*flow));
|
||||
|
||||
if (free_head) {
|
||||
/* Merge into preceding free cluster */
|
||||
free_head->n += flow->free.n;
|
||||
/* Add slot to current free cluster */
|
||||
ASSERT(idx == FLOW_IDX(free_head) + free_head->n);
|
||||
free_head->n++;
|
||||
flow->free.n = flow->free.next = 0;
|
||||
} else {
|
||||
/* New free cluster, add to chain */
|
||||
/* Create new free cluster */
|
||||
free_head = &flow->free;
|
||||
*last_next = FLOW_IDX(flow);
|
||||
free_head->n = 1;
|
||||
*last_next = idx;
|
||||
last_next = &free_head->next;
|
||||
}
|
||||
|
||||
/* Skip remaining empty entries */
|
||||
flow += skip - 1;
|
||||
continue;
|
||||
}
|
||||
|
||||
case FLOW_STATE_NEW:
|
||||
case FLOW_STATE_INI:
|
||||
case FLOW_STATE_TGT:
|
||||
case FLOW_STATE_TYPED:
|
||||
/* Incomplete flow at end of cycle */
|
||||
ASSERT(false);
|
||||
break;
|
||||
|
||||
case FLOW_STATE_ACTIVE:
|
||||
if (to_free[FLOW_IDX(flow)]) {
|
||||
flow_set_state(&flow->f, FLOW_STATE_FREE);
|
||||
memset(flow, 0, sizeof(*flow));
|
||||
|
||||
if (free_head) {
|
||||
/* Add slot to current free cluster */
|
||||
ASSERT(FLOW_IDX(flow) ==
|
||||
FLOW_IDX(free_head) + free_head->n);
|
||||
free_head->n++;
|
||||
flow->free.n = flow->free.next = 0;
|
||||
} else {
|
||||
/* Create new free cluster */
|
||||
free_head = &flow->free;
|
||||
free_head->n = 1;
|
||||
*last_next = FLOW_IDX(flow);
|
||||
last_next = &free_head->next;
|
||||
}
|
||||
} else {
|
||||
free_head = NULL;
|
||||
}
|
||||
break;
|
||||
|
||||
default:
|
||||
ASSERT(false);
|
||||
} else {
|
||||
free_head = NULL;
|
||||
}
|
||||
}
|
||||
|
||||
*last_next = FLOW_MAX;
|
||||
}
|
||||
|
||||
/**
|
||||
* flow_migrate_source_rollback() - Disable repair mode, return failure
|
||||
* @c: Execution context
|
||||
* @bound: No need to roll back flow indices >= @bound
|
||||
* @ret: Negative error code
|
||||
*
|
||||
* Return: @ret
|
||||
*/
|
||||
static int flow_migrate_source_rollback(struct ctx *c, unsigned bound, int ret)
|
||||
{
|
||||
union flow *flow;
|
||||
|
||||
debug("...roll back migration");
|
||||
|
||||
foreach_established_tcp_flow(flow) {
|
||||
if (FLOW_IDX(flow) >= bound)
|
||||
break;
|
||||
if (tcp_flow_repair_off(c, &flow->tcp))
|
||||
die("Failed to roll back TCP_REPAIR mode");
|
||||
}
|
||||
|
||||
if (repair_flush(c))
|
||||
die("Failed to roll back TCP_REPAIR mode");
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
/**
|
||||
* flow_migrate_need_repair() - Do we need to set repair mode for any flow?
|
||||
*
|
||||
* Return: true if repair mode is needed, false otherwise
|
||||
*/
|
||||
static bool flow_migrate_need_repair(void)
|
||||
{
|
||||
union flow *flow;
|
||||
|
||||
foreach_established_tcp_flow(flow)
|
||||
return true;
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
/**
|
||||
* flow_migrate_repair_all() - Turn repair mode on or off for all flows
|
||||
* @c: Execution context
|
||||
* @enable: Switch repair mode on if set, off otherwise
|
||||
*
|
||||
* Return: 0 on success, negative error code on failure
|
||||
*/
|
||||
static int flow_migrate_repair_all(struct ctx *c, bool enable)
|
||||
{
|
||||
union flow *flow;
|
||||
int rc;
|
||||
|
||||
/* If we don't have a repair helper, there's nothing we can do */
|
||||
if (c->fd_repair < 0)
|
||||
return 0;
|
||||
|
||||
foreach_established_tcp_flow(flow) {
|
||||
if (enable)
|
||||
rc = tcp_flow_repair_on(c, &flow->tcp);
|
||||
else
|
||||
rc = tcp_flow_repair_off(c, &flow->tcp);
|
||||
|
||||
if (rc) {
|
||||
debug("Can't %s repair mode: %s",
|
||||
enable ? "enable" : "disable", strerror_(-rc));
|
||||
return flow_migrate_source_rollback(c, FLOW_IDX(flow),
|
||||
rc);
|
||||
}
|
||||
}
|
||||
|
||||
if ((rc = repair_flush(c))) {
|
||||
debug("Can't %s repair mode: %s",
|
||||
enable ? "enable" : "disable", strerror_(-rc));
|
||||
return flow_migrate_source_rollback(c, FLOW_IDX(flow), rc);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* flow_migrate_source_pre() - Prepare flows for migration: enable repair mode
|
||||
* @c: Execution context
|
||||
* @stage: Migration stage information (unused)
|
||||
* @fd: Migration file descriptor (unused)
|
||||
*
|
||||
* Return: 0 on success, positive error code on failure
|
||||
*/
|
||||
int flow_migrate_source_pre(struct ctx *c, const struct migrate_stage *stage,
|
||||
int fd)
|
||||
{
|
||||
int rc;
|
||||
|
||||
(void)stage;
|
||||
(void)fd;
|
||||
|
||||
if (flow_migrate_need_repair())
|
||||
repair_wait(c);
|
||||
|
||||
if ((rc = flow_migrate_repair_all(c, true)))
|
||||
return -rc;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* flow_migrate_source() - Dump all the remaining information and send data
|
||||
* @c: Execution context (unused)
|
||||
* @stage: Migration stage information (unused)
|
||||
* @fd: Migration file descriptor
|
||||
*
|
||||
* Return: 0 on success, positive error code on failure
|
||||
*/
|
||||
int flow_migrate_source(struct ctx *c, const struct migrate_stage *stage,
|
||||
int fd)
|
||||
{
|
||||
uint32_t count = 0;
|
||||
bool first = true;
|
||||
union flow *flow;
|
||||
int rc;
|
||||
|
||||
(void)c;
|
||||
(void)stage;
|
||||
|
||||
/* If we don't have a repair helper, we can't migrate TCP flows */
|
||||
if (c->fd_repair >= 0) {
|
||||
foreach_established_tcp_flow(flow)
|
||||
count++;
|
||||
}
|
||||
|
||||
count = htonl(count);
|
||||
if (write_all_buf(fd, &count, sizeof(count))) {
|
||||
rc = errno;
|
||||
err_perror("Can't send flow count (%u)", ntohl(count));
|
||||
return flow_migrate_source_rollback(c, FLOW_MAX, rc);
|
||||
}
|
||||
|
||||
debug("Sending %u flows", ntohl(count));
|
||||
|
||||
if (!count)
|
||||
return 0;
|
||||
|
||||
/* Dump and send information that can be stored in the flow table.
|
||||
*
|
||||
* Limited rollback options here: if we fail to transfer any data (that
|
||||
* is, on the first flow), undo everything and resume. Otherwise, the
|
||||
* stream might now be inconsistent, and we might have closed listening
|
||||
* TCP sockets, so just terminate.
|
||||
*/
|
||||
foreach_established_tcp_flow(flow) {
|
||||
rc = tcp_flow_migrate_source(fd, &flow->tcp);
|
||||
if (rc) {
|
||||
flow_err(flow, "Can't send data: %s",
|
||||
strerror_(-rc));
|
||||
if (!first)
|
||||
die("Inconsistent migration state, exiting");
|
||||
|
||||
return flow_migrate_source_rollback(c, FLOW_MAX, -rc);
|
||||
}
|
||||
|
||||
first = false;
|
||||
}
|
||||
|
||||
/* And then "extended" data (including window data we saved previously):
|
||||
* the target needs to set repair mode on sockets before it can set
|
||||
* this stuff, but it needs sockets (and flows) for that.
|
||||
*
|
||||
* This also closes sockets so that the target can start connecting
|
||||
* theirs: you can't sendmsg() to queues (using the socket) if the
|
||||
* socket is not connected (EPIPE), not even in repair mode. And the
|
||||
* target needs to restore queues now because we're sending the data.
|
||||
*
|
||||
* So, no rollback here, just try as hard as we can. Tolerate per-flow
|
||||
* failures but not if the stream might be inconsistent (reported here
|
||||
* as EIO).
|
||||
*/
|
||||
foreach_established_tcp_flow(flow) {
|
||||
rc = tcp_flow_migrate_source_ext(fd, &flow->tcp);
|
||||
if (rc) {
|
||||
flow_err(flow, "Can't send extended data: %s",
|
||||
strerror_(-rc));
|
||||
|
||||
if (rc == -EIO)
|
||||
die("Inconsistent migration state, exiting");
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* flow_migrate_target() - Receive flows and insert in flow table
|
||||
* @c: Execution context
|
||||
* @stage: Migration stage information (unused)
|
||||
* @fd: Migration file descriptor
|
||||
*
|
||||
* Return: 0 on success, positive error code on failure
|
||||
*/
|
||||
int flow_migrate_target(struct ctx *c, const struct migrate_stage *stage,
|
||||
int fd)
|
||||
{
|
||||
uint32_t count;
|
||||
unsigned i;
|
||||
int rc;
|
||||
|
||||
(void)stage;
|
||||
|
||||
if (read_all_buf(fd, &count, sizeof(count)))
|
||||
return errno;
|
||||
|
||||
count = ntohl(count);
|
||||
debug("Receiving %u flows", count);
|
||||
|
||||
if (!count)
|
||||
return 0;
|
||||
|
||||
if ((rc = repair_wait(c)))
|
||||
return -rc;
|
||||
|
||||
if ((rc = flow_migrate_repair_all(c, true)))
|
||||
return -rc;
|
||||
|
||||
repair_flush(c);
|
||||
|
||||
/* TODO: flow header with type, instead? */
|
||||
for (i = 0; i < count; i++) {
|
||||
rc = tcp_flow_migrate_target(c, fd);
|
||||
if (rc) {
|
||||
flow_dbg(FLOW(i), "Migration data failure, abort: %s",
|
||||
strerror_(-rc));
|
||||
return -rc;
|
||||
}
|
||||
}
|
||||
|
||||
repair_flush(c);
|
||||
|
||||
for (i = 0; i < count; i++) {
|
||||
rc = tcp_flow_migrate_target_ext(c, &flowtab[i].tcp, fd);
|
||||
if (rc) {
|
||||
flow_dbg(FLOW(i), "Migration data failure, abort: %s",
|
||||
strerror_(-rc));
|
||||
return -rc;
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* flow_init() - Initialise flow related data structures
|
||||
*/
|
||||
|
|
29
flow.h
29
flow.h
|
@ -243,27 +243,18 @@ flow_sidx_t flow_lookup_af(const struct ctx *c,
|
|||
const void *eaddr, const void *oaddr,
|
||||
in_port_t eport, in_port_t oport);
|
||||
flow_sidx_t flow_lookup_sa(const struct ctx *c, uint8_t proto, uint8_t pif,
|
||||
const void *esa,
|
||||
const union inany_addr *oaddr, in_port_t oport);
|
||||
const void *esa, in_port_t oport);
|
||||
|
||||
union flow;
|
||||
|
||||
void flow_init(void);
|
||||
void flow_defer_handler(const struct ctx *c, const struct timespec *now);
|
||||
int flow_migrate_source_early(struct ctx *c, const struct migrate_stage *stage,
|
||||
int fd);
|
||||
int flow_migrate_source_pre(struct ctx *c, const struct migrate_stage *stage,
|
||||
int fd);
|
||||
int flow_migrate_source(struct ctx *c, const struct migrate_stage *stage,
|
||||
int fd);
|
||||
int flow_migrate_target(struct ctx *c, const struct migrate_stage *stage,
|
||||
int fd);
|
||||
|
||||
void flow_log_(const struct flow_common *f, bool newline, int pri,
|
||||
const char *fmt, ...)
|
||||
__attribute__((format(printf, 4, 5)));
|
||||
void flow_log_(const struct flow_common *f, int pri, const char *fmt, ...)
|
||||
__attribute__((format(printf, 3, 4)));
|
||||
|
||||
#define flow_log(f_, pri, ...) flow_log_(&(f_)->f, (pri), __VA_ARGS__)
|
||||
|
||||
#define flow_log(f_, pri, ...) flow_log_(&(f_)->f, true, (pri), __VA_ARGS__)
|
||||
#define flow_dbg(f, ...) flow_log((f), LOG_DEBUG, __VA_ARGS__)
|
||||
#define flow_err(f, ...) flow_log((f), LOG_ERR, __VA_ARGS__)
|
||||
|
||||
|
@ -273,16 +264,6 @@ void flow_log_(const struct flow_common *f, bool newline, int pri,
|
|||
flow_dbg((f), __VA_ARGS__); \
|
||||
} while (0)
|
||||
|
||||
#define flow_log_perror_(f, pri, ...) \
|
||||
do { \
|
||||
int errno_ = errno; \
|
||||
flow_log_((f), false, (pri), __VA_ARGS__); \
|
||||
logmsg(true, true, (pri), ": %s", strerror_(errno_)); \
|
||||
} while (0)
|
||||
|
||||
#define flow_dbg_perror(f_, ...) flow_log_perror_(&(f_)->f, LOG_DEBUG, __VA_ARGS__)
|
||||
#define flow_perror(f_, ...) flow_log_perror_(&(f_)->f, LOG_ERR, __VA_ARGS__)
|
||||
|
||||
void flow_log_details_(const struct flow_common *f, int pri,
|
||||
enum flow_state state);
|
||||
#define flow_log_details(f_, pri) \
|
||||
|
|
52
flow_table.h
52
flow_table.h
|
@ -50,42 +50,6 @@ extern union flow flowtab[];
|
|||
#define flow_foreach_sidei(sidei_) \
|
||||
for ((sidei_) = INISIDE; (sidei_) < SIDES; (sidei_)++)
|
||||
|
||||
|
||||
/**
|
||||
* flow_foreach_slot() - Step through each flow table entry
|
||||
* @flow: Takes values of pointer to each flow table entry
|
||||
*
|
||||
* Includes FREE slots.
|
||||
*/
|
||||
#define flow_foreach_slot(flow) \
|
||||
for ((flow) = flowtab; FLOW_IDX(flow) < FLOW_MAX; (flow)++)
|
||||
|
||||
/**
|
||||
* flow_foreach() - Step through each active flow
|
||||
* @flow: Takes values of pointer to each active flow
|
||||
*/
|
||||
#define flow_foreach(flow) \
|
||||
flow_foreach_slot((flow)) \
|
||||
if ((flow)->f.state == FLOW_STATE_FREE) \
|
||||
(flow) += (flow)->free.n - 1; \
|
||||
else if ((flow)->f.state != FLOW_STATE_ACTIVE) { \
|
||||
flow_err((flow), "Bad flow state during traversal"); \
|
||||
continue; \
|
||||
} else
|
||||
|
||||
/**
|
||||
* flow_foreach_of_type() - Step through each active flow of given type
|
||||
* @flow: Takes values of pointer to each flow
|
||||
* @type_: Type of flow to traverse
|
||||
*/
|
||||
#define flow_foreach_of_type(flow, type_) \
|
||||
flow_foreach((flow)) \
|
||||
if ((flow)->f.type != (type_)) \
|
||||
/* NOLINTNEXTLINE(bugprone-branch-clone) */ \
|
||||
continue; \
|
||||
else
|
||||
|
||||
|
||||
/** flow_idx() - Index of flow from common structure
|
||||
* @f: Common flow fields pointer
|
||||
*
|
||||
|
@ -93,7 +57,6 @@ extern union flow flowtab[];
|
|||
*/
|
||||
static inline unsigned flow_idx(const struct flow_common *f)
|
||||
{
|
||||
/* NOLINTNEXTLINE(clang-analyzer-security.PointerSub) */
|
||||
return (union flow *)f - flowtab;
|
||||
}
|
||||
|
||||
|
@ -140,14 +103,14 @@ static inline uint8_t pif_at_sidx(flow_sidx_t sidx)
|
|||
/** flowside_at_sidx() - Retrieve a specific flowside
|
||||
* @sidx: Flow & side index
|
||||
*
|
||||
* Return: flowside for the flow & side given by @sidx
|
||||
* Return: Flowside for the flow & side given by @sidx
|
||||
*/
|
||||
static inline const struct flowside *flowside_at_sidx(flow_sidx_t sidx)
|
||||
{
|
||||
const union flow *flow = flow_at_sidx(sidx);
|
||||
|
||||
if (!flow)
|
||||
return NULL;
|
||||
return PIF_NONE;
|
||||
|
||||
return &flow->f.side[sidx.sidei];
|
||||
}
|
||||
|
@ -198,16 +161,15 @@ const struct flowside *flow_initiate_af(union flow *flow, uint8_t pif,
|
|||
sa_family_t af,
|
||||
const void *saddr, in_port_t sport,
|
||||
const void *daddr, in_port_t dport);
|
||||
struct flowside *flow_initiate_sa(union flow *flow, uint8_t pif,
|
||||
const union sockaddr_inany *ssa,
|
||||
const union inany_addr *daddr,
|
||||
in_port_t dport);
|
||||
const struct flowside *flow_initiate_sa(union flow *flow, uint8_t pif,
|
||||
const union sockaddr_inany *ssa,
|
||||
in_port_t dport);
|
||||
const struct flowside *flow_target_af(union flow *flow, uint8_t pif,
|
||||
sa_family_t af,
|
||||
const void *saddr, in_port_t sport,
|
||||
const void *daddr, in_port_t dport);
|
||||
struct flowside *flow_target(const struct ctx *c, union flow *flow,
|
||||
uint8_t proto);
|
||||
const struct flowside *flow_target(const struct ctx *c, union flow *flow,
|
||||
uint8_t proto);
|
||||
|
||||
union flow *flow_set_type(union flow *flow, enum flow_type type);
|
||||
#define FLOW_SET_TYPE(flow_, t_, var_) (&flow_set_type((flow_), (t_))->var_)
|
||||
|
|
126
fwd.c
126
fwd.c
|
@ -75,8 +75,8 @@ void fwd_probe_ephemeral(void)
|
|||
if (*end || errno)
|
||||
goto parse_err;
|
||||
|
||||
if (min < 0 || min >= (long)NUM_PORTS ||
|
||||
max < 0 || max >= (long)NUM_PORTS)
|
||||
if (min < 0 || min >= NUM_PORTS ||
|
||||
max < 0 || max >= NUM_PORTS)
|
||||
goto parse_err;
|
||||
|
||||
fwd_ephemeral_min = min;
|
||||
|
@ -323,30 +323,6 @@ static bool fwd_guest_accessible(const struct ctx *c,
|
|||
return fwd_guest_accessible6(c, &addr->a6);
|
||||
}
|
||||
|
||||
/**
|
||||
* nat_outbound() - Apply address translation for outbound (TAP to HOST)
|
||||
* @c: Execution context
|
||||
* @addr: Input address (as seen on TAP interface)
|
||||
* @translated: Output address (as seen on HOST interface)
|
||||
*
|
||||
* Only handles translations that depend *only* on the address. Anything
|
||||
* related to specific ports or flows is handled elsewhere.
|
||||
*/
|
||||
static void nat_outbound(const struct ctx *c, const union inany_addr *addr,
|
||||
union inany_addr *translated)
|
||||
{
|
||||
if (inany_equals4(addr, &c->ip4.map_host_loopback))
|
||||
*translated = inany_loopback4;
|
||||
else if (inany_equals6(addr, &c->ip6.map_host_loopback))
|
||||
*translated = inany_loopback6;
|
||||
else if (inany_equals4(addr, &c->ip4.map_guest_addr))
|
||||
*translated = inany_from_v4(c->ip4.addr);
|
||||
else if (inany_equals6(addr, &c->ip6.map_guest_addr))
|
||||
translated->a6 = c->ip6.addr;
|
||||
else
|
||||
*translated = *addr;
|
||||
}
|
||||
|
||||
/**
|
||||
* fwd_nat_from_tap() - Determine to forward a flow from the tap interface
|
||||
* @c: Execution context
|
||||
|
@ -366,8 +342,16 @@ uint8_t fwd_nat_from_tap(const struct ctx *c, uint8_t proto,
|
|||
else if (is_dns_flow(proto, ini) &&
|
||||
inany_equals6(&ini->oaddr, &c->ip6.dns_match))
|
||||
tgt->eaddr.a6 = c->ip6.dns_host;
|
||||
else if (inany_equals4(&ini->oaddr, &c->ip4.map_host_loopback))
|
||||
tgt->eaddr = inany_loopback4;
|
||||
else if (inany_equals6(&ini->oaddr, &c->ip6.map_host_loopback))
|
||||
tgt->eaddr = inany_loopback6;
|
||||
else if (inany_equals4(&ini->oaddr, &c->ip4.map_guest_addr))
|
||||
tgt->eaddr = inany_from_v4(c->ip4.addr);
|
||||
else if (inany_equals6(&ini->oaddr, &c->ip6.map_guest_addr))
|
||||
tgt->eaddr.a6 = c->ip6.addr;
|
||||
else
|
||||
nat_outbound(c, &ini->oaddr, &tgt->eaddr);
|
||||
tgt->eaddr = ini->oaddr;
|
||||
|
||||
tgt->eport = ini->oport;
|
||||
|
||||
|
@ -418,7 +402,7 @@ uint8_t fwd_nat_from_splice(const struct ctx *c, uint8_t proto,
|
|||
else
|
||||
tgt->eaddr = inany_loopback6;
|
||||
|
||||
/* Preserve the specific loopback address used, but let the kernel pick
|
||||
/* Preserve the specific loopback adddress used, but let the kernel pick
|
||||
* a source port on the target side
|
||||
*/
|
||||
tgt->oaddr = ini->eaddr;
|
||||
|
@ -439,42 +423,6 @@ uint8_t fwd_nat_from_splice(const struct ctx *c, uint8_t proto,
|
|||
return PIF_HOST;
|
||||
}
|
||||
|
||||
/**
|
||||
* nat_inbound() - Apply address translation for inbound (HOST to TAP)
|
||||
* @c: Execution context
|
||||
* @addr: Input address (as seen on HOST interface)
|
||||
* @translated: Output address (as seen on TAP interface)
|
||||
*
|
||||
* Return: true on success, false if it couldn't translate the address
|
||||
*
|
||||
* Only handles translations that depend *only* on the address. Anything
|
||||
* related to specific ports or flows is handled elsewhere.
|
||||
*/
|
||||
bool nat_inbound(const struct ctx *c, const union inany_addr *addr,
|
||||
union inany_addr *translated)
|
||||
{
|
||||
if (!IN4_IS_ADDR_UNSPECIFIED(&c->ip4.map_host_loopback) &&
|
||||
inany_equals4(addr, &in4addr_loopback)) {
|
||||
/* Specifically 127.0.0.1, not 127.0.0.0/8 */
|
||||
*translated = inany_from_v4(c->ip4.map_host_loopback);
|
||||
} else if (!IN6_IS_ADDR_UNSPECIFIED(&c->ip6.map_host_loopback) &&
|
||||
inany_equals6(addr, &in6addr_loopback)) {
|
||||
translated->a6 = c->ip6.map_host_loopback;
|
||||
} else if (!IN4_IS_ADDR_UNSPECIFIED(&c->ip4.map_guest_addr) &&
|
||||
inany_equals4(addr, &c->ip4.addr)) {
|
||||
*translated = inany_from_v4(c->ip4.map_guest_addr);
|
||||
} else if (!IN6_IS_ADDR_UNSPECIFIED(&c->ip6.map_guest_addr) &&
|
||||
inany_equals6(addr, &c->ip6.addr)) {
|
||||
translated->a6 = c->ip6.map_guest_addr;
|
||||
} else if (fwd_guest_accessible(c, addr)) {
|
||||
*translated = *addr;
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* fwd_nat_from_host() - Determine to forward a flow from the host interface
|
||||
* @c: Execution context
|
||||
|
@ -495,43 +443,41 @@ uint8_t fwd_nat_from_host(const struct ctx *c, uint8_t proto,
|
|||
else if (proto == IPPROTO_UDP)
|
||||
tgt->eport += c->udp.fwd_in.delta[tgt->eport];
|
||||
|
||||
if (!c->no_splice && inany_is_loopback(&ini->eaddr) &&
|
||||
if (c->mode == MODE_PASTA && inany_is_loopback(&ini->eaddr) &&
|
||||
(proto == IPPROTO_TCP || proto == IPPROTO_UDP)) {
|
||||
/* spliceable */
|
||||
|
||||
/* The traffic will go over the guest's 'lo' interface, but by
|
||||
* default use its external address, so we don't inadvertently
|
||||
* expose services that listen only on the guest's loopback
|
||||
* address. That can be overridden by --host-lo-to-ns-lo which
|
||||
* will instead forward to the loopback address in the guest.
|
||||
*
|
||||
* In either case, let the kernel pick the source address to
|
||||
* match.
|
||||
/* Preserve the specific loopback adddress used, but let the
|
||||
* kernel pick a source port on the target side
|
||||
*/
|
||||
if (inany_v4(&ini->eaddr)) {
|
||||
if (c->host_lo_to_ns_lo)
|
||||
tgt->eaddr = inany_loopback4;
|
||||
else
|
||||
tgt->eaddr = inany_from_v4(c->ip4.addr_seen);
|
||||
tgt->oaddr = inany_any4;
|
||||
} else {
|
||||
if (c->host_lo_to_ns_lo)
|
||||
tgt->eaddr = inany_loopback6;
|
||||
else
|
||||
tgt->eaddr.a6 = c->ip6.addr_seen;
|
||||
tgt->oaddr = inany_any6;
|
||||
}
|
||||
|
||||
/* Let the kernel pick source port */
|
||||
tgt->oaddr = ini->eaddr;
|
||||
tgt->oport = 0;
|
||||
if (proto == IPPROTO_UDP)
|
||||
/* But for UDP preserve the source port */
|
||||
tgt->oport = ini->eport;
|
||||
|
||||
if (inany_v4(&ini->eaddr))
|
||||
tgt->eaddr = inany_loopback4;
|
||||
else
|
||||
tgt->eaddr = inany_loopback6;
|
||||
|
||||
return PIF_SPLICE;
|
||||
}
|
||||
|
||||
if (!nat_inbound(c, &ini->eaddr, &tgt->oaddr)) {
|
||||
if (!IN4_IS_ADDR_UNSPECIFIED(&c->ip4.map_host_loopback) &&
|
||||
inany_equals4(&ini->eaddr, &in4addr_loopback)) {
|
||||
/* Specifically 127.0.0.1, not 127.0.0.0/8 */
|
||||
tgt->oaddr = inany_from_v4(c->ip4.map_host_loopback);
|
||||
} else if (!IN6_IS_ADDR_UNSPECIFIED(&c->ip6.map_host_loopback) &&
|
||||
inany_equals6(&ini->eaddr, &in6addr_loopback)) {
|
||||
tgt->oaddr.a6 = c->ip6.map_host_loopback;
|
||||
} else if (!IN4_IS_ADDR_UNSPECIFIED(&c->ip4.map_guest_addr) &&
|
||||
inany_equals4(&ini->eaddr, &c->ip4.addr)) {
|
||||
tgt->oaddr = inany_from_v4(c->ip4.map_guest_addr);
|
||||
} else if (!IN6_IS_ADDR_UNSPECIFIED(&c->ip6.map_guest_addr) &&
|
||||
inany_equals6(&ini->eaddr, &c->ip6.addr)) {
|
||||
tgt->oaddr.a6 = c->ip6.map_guest_addr;
|
||||
} else if (!fwd_guest_accessible(c, &ini->eaddr)) {
|
||||
if (inany_v4(&ini->eaddr)) {
|
||||
if (IN4_IS_ADDR_UNSPECIFIED(&c->ip4.our_tap_addr))
|
||||
/* No source address we can use */
|
||||
|
@ -540,6 +486,8 @@ uint8_t fwd_nat_from_host(const struct ctx *c, uint8_t proto,
|
|||
} else {
|
||||
tgt->oaddr.a6 = c->ip6.our_tap_ll;
|
||||
}
|
||||
} else {
|
||||
tgt->oaddr = ini->eaddr;
|
||||
}
|
||||
tgt->oport = ini->eport;
|
||||
|
||||
|
|
5
fwd.h
5
fwd.h
|
@ -7,7 +7,6 @@
|
|||
#ifndef FWD_H
|
||||
#define FWD_H
|
||||
|
||||
union inany_addr;
|
||||
struct flowside;
|
||||
|
||||
/* Number of ports for both TCP and UDP */
|
||||
|
@ -27,7 +26,7 @@ enum fwd_ports_mode {
|
|||
#define PORT_BITMAP_SIZE DIV_ROUND_UP(NUM_PORTS, 8)
|
||||
|
||||
/**
|
||||
* fwd_ports() - Describes port forwarding for one protocol and direction
|
||||
* fwd_ports - Describes port forwarding for one protocol and direction
|
||||
* @mode: Overall forwarding mode (all, none, auto, specific ports)
|
||||
* @scan4: /proc/net fd to scan for IPv4 ports when in AUTO mode
|
||||
* @scan6: /proc/net fd to scan for IPv6 ports when in AUTO mode
|
||||
|
@ -48,8 +47,6 @@ void fwd_scan_ports_udp(struct fwd_ports *fwd, const struct fwd_ports *rev,
|
|||
const struct fwd_ports *tcp_rev);
|
||||
void fwd_scan_ports_init(struct ctx *c);
|
||||
|
||||
bool nat_inbound(const struct ctx *c, const union inany_addr *addr,
|
||||
union inany_addr *translated);
|
||||
uint8_t fwd_nat_from_tap(const struct ctx *c, uint8_t proto,
|
||||
const struct flowside *ini, struct flowside *tgt);
|
||||
uint8_t fwd_nat_from_splice(const struct ctx *c, uint8_t proto,
|
||||
|
|
|
@ -56,7 +56,6 @@ cd ..
|
|||
make pkgs
|
||||
scp passt passt.avx2 passt.1 qrap qrap.1 "${USER_HOST}:${BIN}"
|
||||
scp pasta pasta.avx2 pasta.1 "${USER_HOST}:${BIN}"
|
||||
scp passt-repair passt-repair.1 "${USER_HOST}:${BIN}"
|
||||
|
||||
ssh "${USER_HOST}" "rm -f ${BIN}/*.deb"
|
||||
ssh "${USER_HOST}" "rm -f ${BIN}/*.rpm"
|
||||
|
|
9
icmp.c
9
icmp.c
|
@ -85,7 +85,7 @@ void icmp_sock_handler(const struct ctx *c, union epoll_ref ref)
|
|||
|
||||
n = recvfrom(ref.fd, buf, sizeof(buf), 0, &sr.sa, &sl);
|
||||
if (n < 0) {
|
||||
flow_perror(pingf, "recvfrom() error");
|
||||
flow_err(pingf, "recvfrom() error: %s", strerror(errno));
|
||||
return;
|
||||
}
|
||||
|
||||
|
@ -150,7 +150,7 @@ unexpected:
|
|||
static void icmp_ping_close(const struct ctx *c,
|
||||
const struct icmp_ping_flow *pingf)
|
||||
{
|
||||
epoll_del(c, pingf->sock);
|
||||
epoll_ctl(c->epollfd, EPOLL_CTL_DEL, pingf->sock, NULL);
|
||||
close(pingf->sock);
|
||||
flow_hash_remove(c, FLOW_SIDX(pingf, INISIDE));
|
||||
}
|
||||
|
@ -163,7 +163,7 @@ static void icmp_ping_close(const struct ctx *c,
|
|||
* @saddr: Source address
|
||||
* @daddr: Destination address
|
||||
*
|
||||
* Return: newly opened ping flow, or NULL on failure
|
||||
* Return: Newly opened ping flow, or NULL on failure
|
||||
*/
|
||||
static struct icmp_ping_flow *icmp_ping_new(const struct ctx *c,
|
||||
sa_family_t af, uint16_t id,
|
||||
|
@ -300,7 +300,8 @@ int icmp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af,
|
|||
|
||||
pif_sockaddr(c, &sa, &sl, PIF_HOST, &tgt->eaddr, 0);
|
||||
if (sendto(pingf->sock, pkt, l4len, MSG_NOSIGNAL, &sa.sa, sl) < 0) {
|
||||
flow_dbg_perror(pingf, "failed to relay request to socket");
|
||||
flow_dbg(pingf, "failed to relay request to socket: %s",
|
||||
strerror(errno));
|
||||
} else {
|
||||
flow_dbg(pingf,
|
||||
"echo request to socket, ID: %"PRIu16", seq: %"PRIu16,
|
||||
|
|
22
inany.c
22
inany.c
|
@ -25,7 +25,7 @@ const union inany_addr inany_any4 = INANY_INIT4(IN4ADDR_ANY_INIT);
|
|||
* @dst: output buffer, minimum INANY_ADDRSTRLEN bytes
|
||||
* @size: size of buffer at @dst
|
||||
*
|
||||
* Return: on success, a non-null pointer to @dst, NULL on failure
|
||||
* Return: On success, a non-null pointer to @dst, NULL on failure
|
||||
*/
|
||||
const char *inany_ntop(const union inany_addr *src, char *dst, socklen_t size)
|
||||
{
|
||||
|
@ -36,23 +36,3 @@ const char *inany_ntop(const union inany_addr *src, char *dst, socklen_t size)
|
|||
|
||||
return inet_ntop(AF_INET6, &src->a6, dst, size);
|
||||
}
|
||||
|
||||
/** inany_pton - Parse an IPv[46] address from text format
|
||||
* @src: IPv[46] address
|
||||
* @dst: output buffer, filled with parsed address
|
||||
*
|
||||
* Return: on success, 1, if no parseable address is found, 0
|
||||
*/
|
||||
int inany_pton(const char *src, union inany_addr *dst)
|
||||
{
|
||||
if (inet_pton(AF_INET, src, &dst->v4mapped.a4)) {
|
||||
memset(&dst->v4mapped.zero, 0, sizeof(dst->v4mapped.zero));
|
||||
memset(&dst->v4mapped.one, 0xff, sizeof(dst->v4mapped.one));
|
||||
return 1;
|
||||
}
|
||||
|
||||
if (inet_pton(AF_INET6, src, &dst->a6))
|
||||
return 1;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
|
28
inany.h
28
inany.h
|
@ -237,30 +237,23 @@ static inline void inany_from_af(union inany_addr *aa,
|
|||
}
|
||||
|
||||
/** inany_from_sockaddr - Extract IPv[46] address and port number from sockaddr
|
||||
* @dst: Pointer to store IPv[46] address (output)
|
||||
* @aa: Pointer to store IPv[46] address
|
||||
* @port: Pointer to store port number, host order
|
||||
* @addr: Socket address
|
||||
*
|
||||
* Return: 0 on success, -1 on error (bad address family)
|
||||
* @addr: AF_INET or AF_INET6 socket address
|
||||
*/
|
||||
static inline int inany_from_sockaddr(union inany_addr *dst, in_port_t *port,
|
||||
const void *addr)
|
||||
static inline void inany_from_sockaddr(union inany_addr *aa, in_port_t *port,
|
||||
const union sockaddr_inany *sa)
|
||||
{
|
||||
const union sockaddr_inany *sa = (const union sockaddr_inany *)addr;
|
||||
|
||||
if (sa->sa_family == AF_INET6) {
|
||||
inany_from_af(dst, AF_INET6, &sa->sa6.sin6_addr);
|
||||
inany_from_af(aa, AF_INET6, &sa->sa6.sin6_addr);
|
||||
*port = ntohs(sa->sa6.sin6_port);
|
||||
return 0;
|
||||
}
|
||||
|
||||
if (sa->sa_family == AF_INET) {
|
||||
inany_from_af(dst, AF_INET, &sa->sa4.sin_addr);
|
||||
} else if (sa->sa_family == AF_INET) {
|
||||
inany_from_af(aa, AF_INET, &sa->sa4.sin_addr);
|
||||
*port = ntohs(sa->sa4.sin_port);
|
||||
return 0;
|
||||
} else {
|
||||
/* Not valid to call with other address families */
|
||||
ASSERT(0);
|
||||
}
|
||||
|
||||
return -1;
|
||||
}
|
||||
|
||||
/** inany_siphash_feed- Fold IPv[46] address into an in-progress siphash
|
||||
|
@ -277,6 +270,5 @@ static inline void inany_siphash_feed(struct siphash_state *state,
|
|||
#define INANY_ADDRSTRLEN MAX(INET_ADDRSTRLEN, INET6_ADDRSTRLEN)
|
||||
|
||||
const char *inany_ntop(const union inany_addr *src, char *dst, socklen_t size);
|
||||
int inany_pton(const char *src, union inany_addr *dst);
|
||||
|
||||
#endif /* INANY_H */
|
||||
|
|
114
iov.c
114
iov.c
|
@ -26,8 +26,7 @@
|
|||
#include "iov.h"
|
||||
|
||||
|
||||
/**
|
||||
* iov_skip_bytes() - Skip leading bytes of an IO vector
|
||||
/* iov_skip_bytes() - Skip leading bytes of an IO vector
|
||||
* @iov: IO vector
|
||||
* @n: Number of entries in @iov
|
||||
* @skip: Number of leading bytes of @iov to skip
|
||||
|
@ -57,8 +56,8 @@ size_t iov_skip_bytes(const struct iovec *iov, size_t n,
|
|||
}
|
||||
|
||||
/**
|
||||
* iov_from_buf() - Copy data from a buffer to an I/O vector (struct iovec)
|
||||
* efficiently.
|
||||
* iov_from_buf - Copy data from a buffer to an I/O vector (struct iovec)
|
||||
* efficiently.
|
||||
*
|
||||
* @iov: Pointer to the array of struct iovec describing the
|
||||
* scatter/gather I/O vector.
|
||||
|
@ -67,8 +66,9 @@ size_t iov_skip_bytes(const struct iovec *iov, size_t n,
|
|||
* @buf: Pointer to the source buffer containing the data to copy.
|
||||
* @bytes: Total number of bytes to copy from buf to iov.
|
||||
*
|
||||
* Return: the number of bytes successfully copied.
|
||||
* Returns: The number of bytes successfully copied.
|
||||
*/
|
||||
/* cppcheck-suppress unusedFunction */
|
||||
size_t iov_from_buf(const struct iovec *iov, size_t iov_cnt,
|
||||
size_t offset, const void *buf, size_t bytes)
|
||||
{
|
||||
|
@ -97,8 +97,8 @@ size_t iov_from_buf(const struct iovec *iov, size_t iov_cnt,
|
|||
}
|
||||
|
||||
/**
|
||||
* iov_to_buf() - Copy data from a scatter/gather I/O vector (struct iovec) to
|
||||
* a buffer efficiently.
|
||||
* iov_to_buf - Copy data from a scatter/gather I/O vector (struct iovec) to
|
||||
* a buffer efficiently.
|
||||
*
|
||||
* @iov: Pointer to the array of struct iovec describing the scatter/gather
|
||||
* I/O vector.
|
||||
|
@ -107,7 +107,7 @@ size_t iov_from_buf(const struct iovec *iov, size_t iov_cnt,
|
|||
* @buf: Pointer to the destination buffer where data will be copied.
|
||||
* @bytes: Total number of bytes to copy from iov to buf.
|
||||
*
|
||||
* Return: the number of bytes successfully copied.
|
||||
* Returns: The number of bytes successfully copied.
|
||||
*/
|
||||
/* cppcheck-suppress unusedFunction */
|
||||
size_t iov_to_buf(const struct iovec *iov, size_t iov_cnt,
|
||||
|
@ -137,14 +137,14 @@ size_t iov_to_buf(const struct iovec *iov, size_t iov_cnt,
|
|||
}
|
||||
|
||||
/**
|
||||
* iov_size() - Calculate the total size of a scatter/gather I/O vector
|
||||
* (struct iovec).
|
||||
* iov_size - Calculate the total size of a scatter/gather I/O vector
|
||||
* (struct iovec).
|
||||
*
|
||||
* @iov: Pointer to the array of struct iovec describing the
|
||||
* scatter/gather I/O vector.
|
||||
* @iov_cnt: Number of elements in the iov array.
|
||||
*
|
||||
* Return: the total size in bytes.
|
||||
* Returns: The total size in bytes.
|
||||
*/
|
||||
size_t iov_size(const struct iovec *iov, size_t iov_cnt)
|
||||
{
|
||||
|
@ -156,95 +156,3 @@ size_t iov_size(const struct iovec *iov, size_t iov_cnt)
|
|||
|
||||
return len;
|
||||
}
|
||||
|
||||
/**
|
||||
* iov_tail_prune() - Remove any unneeded buffers from an IOV tail
|
||||
* @tail: IO vector tail (modified)
|
||||
*
|
||||
* If an IOV tail's offset is large enough, it may not include any bytes from
|
||||
* the first (or first several) buffers in the underlying IO vector. Modify the
|
||||
* tail's representation so it contains the same logical bytes, but only
|
||||
* includes buffers that are actually needed. This will avoid stepping through
|
||||
* unnecessary elements of the underlying IO vector on future operations.
|
||||
*
|
||||
* Return: true if the tail still contains any bytes, otherwise false
|
||||
*/
|
||||
bool iov_tail_prune(struct iov_tail *tail)
|
||||
{
|
||||
size_t i;
|
||||
|
||||
i = iov_skip_bytes(tail->iov, tail->cnt, tail->off, &tail->off);
|
||||
tail->iov += i;
|
||||
tail->cnt -= i;
|
||||
|
||||
return !!tail->cnt;
|
||||
}
|
||||
|
||||
/**
|
||||
* iov_tail_size() - Calculate the total size of an IO vector tail
|
||||
* @tail: IO vector tail
|
||||
*
|
||||
* Return: the total size in bytes.
|
||||
*/
|
||||
size_t iov_tail_size(struct iov_tail *tail)
|
||||
{
|
||||
iov_tail_prune(tail);
|
||||
return iov_size(tail->iov, tail->cnt) - tail->off;
|
||||
}
|
||||
|
||||
/**
|
||||
* iov_peek_header_() - Get pointer to a header from an IOV tail
|
||||
* @tail: IOV tail to get header from
|
||||
* @len: Length of header to get, in bytes
|
||||
* @align: Required alignment of header, in bytes
|
||||
*
|
||||
* @tail may be pruned, but will represent the same bytes as before.
|
||||
*
|
||||
* Return: pointer to the first @len logical bytes of the tail, NULL if that
|
||||
* overruns the IO vector, is not contiguous or doesn't have the
|
||||
* requested alignment.
|
||||
*/
|
||||
/* cppcheck-suppress [staticFunction,unmatchedSuppression] */
|
||||
void *iov_peek_header_(struct iov_tail *tail, size_t len, size_t align)
|
||||
{
|
||||
char *p;
|
||||
|
||||
if (!iov_tail_prune(tail))
|
||||
return NULL; /* Nothing left */
|
||||
|
||||
if (tail->off + len < tail->off)
|
||||
return NULL; /* Overflow */
|
||||
|
||||
if (tail->off + len > tail->iov[0].iov_len)
|
||||
return NULL; /* Not contiguous */
|
||||
|
||||
p = (char *)tail->iov[0].iov_base + tail->off;
|
||||
if ((uintptr_t)p % align)
|
||||
return NULL; /* not aligned */
|
||||
|
||||
return p;
|
||||
}
|
||||
|
||||
/**
|
||||
* iov_remove_header_() - Remove a header from an IOV tail
|
||||
* @tail: IOV tail to remove header from (modified)
|
||||
* @len: Length of header to remove, in bytes
|
||||
* @align: Required alignment of header, in bytes
|
||||
*
|
||||
* On success, @tail is updated so that it no longer includes the bytes of the
|
||||
* returned header.
|
||||
*
|
||||
* Return: pointer to the first @len logical bytes of the tail, NULL if that
|
||||
* overruns the IO vector, is not contiguous or doesn't have the
|
||||
* requested alignment.
|
||||
*/
|
||||
void *iov_remove_header_(struct iov_tail *tail, size_t len, size_t align)
|
||||
{
|
||||
char *p = iov_peek_header_(tail, len, align);
|
||||
|
||||
if (!p)
|
||||
return NULL;
|
||||
|
||||
tail->off = tail->off + len;
|
||||
return p;
|
||||
}
|
||||
|
|
76
iov.h
76
iov.h
|
@ -28,80 +28,4 @@ size_t iov_from_buf(const struct iovec *iov, size_t iov_cnt,
|
|||
size_t iov_to_buf(const struct iovec *iov, size_t iov_cnt,
|
||||
size_t offset, void *buf, size_t bytes);
|
||||
size_t iov_size(const struct iovec *iov, size_t iov_cnt);
|
||||
|
||||
/*
|
||||
* DOC: Theory of Operation, struct iov_tail
|
||||
*
|
||||
* Sometimes a single logical network frame is split across multiple buffers,
|
||||
* represented by an IO vector (struct iovec[]). We often want to process this
|
||||
* one header / network layer at a time. So, it's useful to maintain a "tail"
|
||||
* of the vector representing the parts we haven't yet extracted.
|
||||
*
|
||||
* The headers we extract need not line up with buffer boundaries (though we do
|
||||
* assume they're contiguous within a single buffer for now). So, we could
|
||||
* represent that tail as another struct iovec[], but that would mean copying
|
||||
* the whole array of struct iovecs, just so we can adjust the offset and length
|
||||
* on the first one.
|
||||
*
|
||||
* So, instead represent the tail as pointer into an existing struct iovec[],
|
||||
* with an explicit offset for where the "tail" starts within it. If we extract
|
||||
* enough headers that some buffers of the original vector no longer contain
|
||||
* part of the tail, we (lazily) advance our struct iovec * to the first buffer
|
||||
* we still need, and adjust the vector length and offset to match.
|
||||
*/
|
||||
|
||||
/**
|
||||
* struct iov_tail - An IO vector which may have some headers logically removed
|
||||
* @iov: IO vector
|
||||
* @cnt: Number of entries in @iov
|
||||
* @off: Current offset in @iov
|
||||
*/
|
||||
struct iov_tail {
|
||||
const struct iovec *iov;
|
||||
size_t cnt, off;
|
||||
};
|
||||
|
||||
/**
|
||||
* IOV_TAIL() - Create a new IOV tail
|
||||
* @iov_: IO vector to create tail from
|
||||
* @cnt_: Length of the IO vector at @iov_
|
||||
* @off_: Byte offset in the IO vector where the tail begins
|
||||
*/
|
||||
#define IOV_TAIL(iov_, cnt_, off_) \
|
||||
(struct iov_tail){ .iov = (iov_), .cnt = (cnt_), .off = (off_) }
|
||||
|
||||
bool iov_tail_prune(struct iov_tail *tail);
|
||||
size_t iov_tail_size(struct iov_tail *tail);
|
||||
void *iov_peek_header_(struct iov_tail *tail, size_t len, size_t align);
|
||||
void *iov_remove_header_(struct iov_tail *tail, size_t len, size_t align);
|
||||
|
||||
/**
|
||||
* IOV_PEEK_HEADER() - Get typed pointer to a header from an IOV tail
|
||||
* @tail_: IOV tail to get header from
|
||||
* @type_: Data type of the header
|
||||
*
|
||||
* @tail_ may be pruned, but will represent the same bytes as before.
|
||||
*
|
||||
* Return: pointer of type (@type_ *) located at the start of @tail_, NULL if
|
||||
* we can't get a contiguous and aligned pointer.
|
||||
*/
|
||||
#define IOV_PEEK_HEADER(tail_, type_) \
|
||||
((type_ *)(iov_peek_header_((tail_), \
|
||||
sizeof(type_), __alignof__(type_))))
|
||||
|
||||
/**
|
||||
* IOV_REMOVE_HEADER() - Remove and return typed header from an IOV tail
|
||||
* @tail_: IOV tail to remove header from (modified)
|
||||
* @type_: Data type of the header to remove
|
||||
*
|
||||
* On success, @tail_ is updated so that it no longer includes the bytes of the
|
||||
* returned header.
|
||||
*
|
||||
* Return: pointer of type (@type_ *) located at the old start of @tail_, NULL
|
||||
* if we can't get a contiguous and aligned pointer.
|
||||
*/
|
||||
#define IOV_REMOVE_HEADER(tail_, type_) \
|
||||
((type_ *)(iov_remove_header_((tail_), \
|
||||
sizeof(type_), __alignof__(type_))))
|
||||
|
||||
#endif /* IOVEC_H */
|
||||
|
|
46
ip.h
46
ip.h
|
@ -36,14 +36,13 @@
|
|||
.tos = 0, \
|
||||
.tot_len = 0, \
|
||||
.id = 0, \
|
||||
.frag_off = htons(IP_DF), \
|
||||
.frag_off = 0, \
|
||||
.ttl = 0xff, \
|
||||
.protocol = (proto), \
|
||||
.saddr = 0, \
|
||||
.daddr = 0, \
|
||||
}
|
||||
#define L2_BUF_IP4_PSUM(proto) ((uint32_t)htons_constant(0x4500) + \
|
||||
(uint32_t)htons_constant(IP_DF) + \
|
||||
(uint32_t)htons(0xff00 | (proto)))
|
||||
|
||||
|
||||
|
@ -91,49 +90,6 @@ struct ipv6_opt_hdr {
|
|||
*/
|
||||
} __attribute__((packed)); /* required for some archs */
|
||||
|
||||
/**
|
||||
* ip6_set_flow_lbl() - Set flow label in an IPv6 header
|
||||
* @ip6h: Pointer to IPv6 header, updated
|
||||
* @flow: Set @ip6h flow label to the low 20 bits of this integer
|
||||
*/
|
||||
static inline void ip6_set_flow_lbl(struct ipv6hdr *ip6h, uint32_t flow)
|
||||
{
|
||||
ip6h->flow_lbl[0] = (flow >> 16) & 0xf;
|
||||
ip6h->flow_lbl[1] = (flow >> 8) & 0xff;
|
||||
ip6h->flow_lbl[2] = (flow >> 0) & 0xff;
|
||||
}
|
||||
|
||||
/** ip6_get_flow_lbl() - Get flow label from an IPv6 header
|
||||
* @ip6h: Pointer to IPv6 header
|
||||
*
|
||||
* Return: flow label from @ip6h as an integer (<= 20 bits)
|
||||
*/
|
||||
static inline uint32_t ip6_get_flow_lbl(const struct ipv6hdr *ip6h)
|
||||
{
|
||||
return (ip6h->flow_lbl[0] & 0xf) << 16 |
|
||||
ip6h->flow_lbl[1] << 8 |
|
||||
ip6h->flow_lbl[2];
|
||||
}
|
||||
|
||||
char *ipv6_l4hdr(const struct pool *p, int idx, size_t offset, uint8_t *proto,
|
||||
size_t *dlen);
|
||||
|
||||
/* IPv6 link-local all-nodes multicast address, ff02::1 */
|
||||
static const struct in6_addr in6addr_ll_all_nodes = {
|
||||
.s6_addr = {
|
||||
0xff, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
|
||||
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01,
|
||||
},
|
||||
};
|
||||
|
||||
/* IPv4 Limited Broadcast (RFC 919, Section 7), 255.255.255.255 */
|
||||
static const struct in_addr in4addr_broadcast = { 0xffffffff };
|
||||
|
||||
#ifndef IPV4_MIN_MTU
|
||||
#define IPV4_MIN_MTU 68
|
||||
#endif
|
||||
#ifndef IPV6_MIN_MTU
|
||||
#define IPV6_MIN_MTU 1280
|
||||
#endif
|
||||
|
||||
#endif /* IP_H */
|
||||
|
|
25
isolation.c
25
isolation.c
|
@ -129,7 +129,7 @@ static void drop_caps_ep_except(uint64_t keep)
|
|||
* additional layer of protection. Executing this requires
|
||||
* CAP_SETPCAP, which we will have within our userns.
|
||||
*
|
||||
* Note that dropping capabilities from the bounding set limits
|
||||
* Note that dropping capabilites from the bounding set limits
|
||||
* exec()ed processes, but does not remove them from the effective or
|
||||
* permitted sets, so it doesn't reduce our own capabilities.
|
||||
*/
|
||||
|
@ -174,8 +174,8 @@ static void clamp_caps(void)
|
|||
* Should:
|
||||
* - drop unneeded capabilities
|
||||
* - close all open files except for standard streams and the one from --fd
|
||||
* Mustn't:
|
||||
* - remove filesystem access (we need to access files during setup)
|
||||
* Musn't:
|
||||
* - remove filesytem access (we need to access files during setup)
|
||||
*/
|
||||
void isolate_initial(int argc, char **argv)
|
||||
{
|
||||
|
@ -194,7 +194,7 @@ void isolate_initial(int argc, char **argv)
|
|||
*
|
||||
* It's debatable whether it's useful to drop caps when we
|
||||
* retain SETUID and SYS_ADMIN, but we might as well. We drop
|
||||
* further capabilities in isolate_user() and
|
||||
* further capabilites in isolate_user() and
|
||||
* isolate_prefork().
|
||||
*/
|
||||
keep = BIT(CAP_NET_BIND_SERVICE) | BIT(CAP_SETUID) | BIT(CAP_SETGID) |
|
||||
|
@ -379,21 +379,12 @@ void isolate_postfork(const struct ctx *c)
|
|||
|
||||
prctl(PR_SET_DUMPABLE, 0);
|
||||
|
||||
switch (c->mode) {
|
||||
case MODE_PASST:
|
||||
prog.len = (unsigned short)ARRAY_SIZE(filter_passt);
|
||||
prog.filter = filter_passt;
|
||||
break;
|
||||
case MODE_PASTA:
|
||||
if (c->mode == MODE_PASTA) {
|
||||
prog.len = (unsigned short)ARRAY_SIZE(filter_pasta);
|
||||
prog.filter = filter_pasta;
|
||||
break;
|
||||
case MODE_VU:
|
||||
prog.len = (unsigned short)ARRAY_SIZE(filter_vu);
|
||||
prog.filter = filter_vu;
|
||||
break;
|
||||
default:
|
||||
ASSERT(0);
|
||||
} else {
|
||||
prog.len = (unsigned short)ARRAY_SIZE(filter_passt);
|
||||
prog.filter = filter_passt;
|
||||
}
|
||||
|
||||
if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) ||
|
||||
|
|
|
@ -70,7 +70,7 @@ static ssize_t peek_line(struct lineread *lr, bool eof)
|
|||
* @lr: Line reader state structure
|
||||
* @line: Place a pointer to the next line in this variable
|
||||
*
|
||||
* Return: length of line read on success, 0 on EOF, negative on error
|
||||
* Return: Length of line read on success, 0 on EOF, negative on error
|
||||
*/
|
||||
ssize_t lineread_get(struct lineread *lr, char **line)
|
||||
{
|
||||
|
|
144
linux_dep.h
144
linux_dep.h
|
@ -1,144 +0,0 @@
|
|||
/* SPDX-License-Identifier: GPL-2.0-or-later
|
||||
* Copyright Red Hat
|
||||
*
|
||||
* Declarations for Linux specific dependencies
|
||||
*/
|
||||
|
||||
#ifndef LINUX_DEP_H
|
||||
#define LINUX_DEP_H
|
||||
|
||||
/* struct tcp_info_linux - Information from Linux TCP_INFO getsockopt()
|
||||
*
|
||||
* Largely derived from include/linux/tcp.h in the Linux kernel
|
||||
*
|
||||
* Some fields returned by TCP_INFO have been there for ages and are shared with
|
||||
* BSD. struct tcp_info from netinet/tcp.h has only those fields. There are
|
||||
* also many Linux-specific extensions to the structure, which are only found
|
||||
* in the linux/tcp.h version of struct tcp_info.
|
||||
*
|
||||
* We want to use some of those extension fields, when available. We can test
|
||||
* for availability in the runtime kernel using the length returned from
|
||||
* getsockopt(). However, we won't necessarily be compiled against the same
|
||||
* kernel headers as we'll run with, so compiling directly against linux/tcp.h
|
||||
* means wrapping every field access in an #ifdef whose #else does the same
|
||||
* thing as when the field is missing at runtime. This rapidly gets messy.
|
||||
*
|
||||
* Instead we define here struct tcp_info_linux which includes all the Linux
|
||||
* extensions that we want to use. This is taken from v6.11 of the kernel.
|
||||
*/
|
||||
struct tcp_info_linux {
|
||||
uint8_t tcpi_state;
|
||||
uint8_t tcpi_ca_state;
|
||||
uint8_t tcpi_retransmits;
|
||||
uint8_t tcpi_probes;
|
||||
uint8_t tcpi_backoff;
|
||||
uint8_t tcpi_options;
|
||||
uint8_t tcpi_snd_wscale : 4, tcpi_rcv_wscale : 4;
|
||||
uint8_t tcpi_delivery_rate_app_limited:1, tcpi_fastopen_client_fail:2;
|
||||
|
||||
uint32_t tcpi_rto;
|
||||
uint32_t tcpi_ato;
|
||||
uint32_t tcpi_snd_mss;
|
||||
uint32_t tcpi_rcv_mss;
|
||||
|
||||
uint32_t tcpi_unacked;
|
||||
uint32_t tcpi_sacked;
|
||||
uint32_t tcpi_lost;
|
||||
uint32_t tcpi_retrans;
|
||||
uint32_t tcpi_fackets;
|
||||
|
||||
/* Times. */
|
||||
uint32_t tcpi_last_data_sent;
|
||||
uint32_t tcpi_last_ack_sent;
|
||||
uint32_t tcpi_last_data_recv;
|
||||
uint32_t tcpi_last_ack_recv;
|
||||
|
||||
/* Metrics. */
|
||||
uint32_t tcpi_pmtu;
|
||||
uint32_t tcpi_rcv_ssthresh;
|
||||
uint32_t tcpi_rtt;
|
||||
uint32_t tcpi_rttvar;
|
||||
uint32_t tcpi_snd_ssthresh;
|
||||
uint32_t tcpi_snd_cwnd;
|
||||
uint32_t tcpi_advmss;
|
||||
uint32_t tcpi_reordering;
|
||||
|
||||
uint32_t tcpi_rcv_rtt;
|
||||
uint32_t tcpi_rcv_space;
|
||||
|
||||
uint32_t tcpi_total_retrans;
|
||||
|
||||
/* Linux extensions */
|
||||
uint64_t tcpi_pacing_rate;
|
||||
uint64_t tcpi_max_pacing_rate;
|
||||
uint64_t tcpi_bytes_acked; /* RFC4898 tcpEStatsAppHCThruOctetsAcked */
|
||||
uint64_t tcpi_bytes_received; /* RFC4898 tcpEStatsAppHCThruOctetsReceived */
|
||||
uint32_t tcpi_segs_out; /* RFC4898 tcpEStatsPerfSegsOut */
|
||||
uint32_t tcpi_segs_in; /* RFC4898 tcpEStatsPerfSegsIn */
|
||||
|
||||
uint32_t tcpi_notsent_bytes;
|
||||
uint32_t tcpi_min_rtt;
|
||||
uint32_t tcpi_data_segs_in; /* RFC4898 tcpEStatsDataSegsIn */
|
||||
uint32_t tcpi_data_segs_out; /* RFC4898 tcpEStatsDataSegsOut */
|
||||
|
||||
uint64_t tcpi_delivery_rate;
|
||||
|
||||
uint64_t tcpi_busy_time; /* Time (usec) busy sending data */
|
||||
uint64_t tcpi_rwnd_limited; /* Time (usec) limited by receive window */
|
||||
uint64_t tcpi_sndbuf_limited; /* Time (usec) limited by send buffer */
|
||||
|
||||
uint32_t tcpi_delivered;
|
||||
uint32_t tcpi_delivered_ce;
|
||||
|
||||
uint64_t tcpi_bytes_sent; /* RFC4898 tcpEStatsPerfHCDataOctetsOut */
|
||||
uint64_t tcpi_bytes_retrans; /* RFC4898 tcpEStatsPerfOctetsRetrans */
|
||||
uint32_t tcpi_dsack_dups; /* RFC4898 tcpEStatsStackDSACKDups */
|
||||
uint32_t tcpi_reord_seen; /* reordering events seen */
|
||||
|
||||
uint32_t tcpi_rcv_ooopack; /* Out-of-order packets received */
|
||||
|
||||
uint32_t tcpi_snd_wnd; /* peer's advertised receive window after
|
||||
* scaling (bytes)
|
||||
*/
|
||||
uint32_t tcpi_rcv_wnd; /* local advertised receive window after
|
||||
* scaling (bytes)
|
||||
*/
|
||||
|
||||
uint32_t tcpi_rehash; /* PLB or timeout triggered rehash attempts */
|
||||
|
||||
uint16_t tcpi_total_rto; /* Total number of RTO timeouts, including
|
||||
* SYN/SYN-ACK and recurring timeouts.
|
||||
*/
|
||||
uint16_t tcpi_total_rto_recoveries; /* Total number of RTO
|
||||
* recoveries, including any
|
||||
* unfinished recovery.
|
||||
*/
|
||||
uint32_t tcpi_total_rto_time; /* Total time spent in RTO recoveries
|
||||
* in milliseconds, including any
|
||||
* unfinished recovery.
|
||||
*/
|
||||
};
|
||||
|
||||
#include <linux/falloc.h>
|
||||
|
||||
#ifndef FALLOC_FL_COLLAPSE_RANGE
|
||||
#define FALLOC_FL_COLLAPSE_RANGE 0x08
|
||||
#endif
|
||||
|
||||
#include <linux/close_range.h>
|
||||
|
||||
/* glibc < 2.34 and musl as of 1.2.5 need these */
|
||||
#ifndef SYS_close_range
|
||||
#define SYS_close_range 436
|
||||
#endif
|
||||
#ifndef CLOSE_RANGE_UNSHARE /* Linux kernel < 5.9 */
|
||||
#define CLOSE_RANGE_UNSHARE (1U << 1)
|
||||
#endif
|
||||
|
||||
__attribute__ ((weak))
|
||||
/* cppcheck-suppress funcArgNamesDifferent */
|
||||
int close_range(unsigned int first, unsigned int last, int flags) {
|
||||
return syscall(SYS_close_range, first, last, flags);
|
||||
}
|
||||
|
||||
#endif /* LINUX_DEP_H */
|
75
log.c
75
log.c
|
@ -26,7 +26,6 @@
|
|||
#include <stdarg.h>
|
||||
#include <sys/socket.h>
|
||||
|
||||
#include "linux_dep.h"
|
||||
#include "log.h"
|
||||
#include "util.h"
|
||||
#include "passt.h"
|
||||
|
@ -54,10 +53,9 @@ bool log_stderr = true; /* Not daemonised, no shell spawned */
|
|||
* logtime() - Get the current time for logging purposes
|
||||
* @ts: Buffer into which to store the timestamp
|
||||
*
|
||||
* Return: pointer to @ts on success, or NULL if there was
|
||||
* an error retrieving the time
|
||||
* Return: pointer to @now, or NULL if there was an error retrieving the time
|
||||
*/
|
||||
static const struct timespec *logtime(struct timespec *ts)
|
||||
const struct timespec *logtime(struct timespec *ts)
|
||||
{
|
||||
if (clock_gettime(CLOCK_MONOTONIC, ts))
|
||||
return NULL;
|
||||
|
@ -94,6 +92,7 @@ const char *logfile_prefix[] = {
|
|||
" ", /* LOG_DEBUG */
|
||||
};
|
||||
|
||||
#ifdef FALLOC_FL_COLLAPSE_RANGE
|
||||
/**
|
||||
* logfile_rotate_fallocate() - Write header, set log_written after fallocate()
|
||||
* @fd: Log file descriptor
|
||||
|
@ -127,6 +126,7 @@ static void logfile_rotate_fallocate(int fd, const struct timespec *now)
|
|||
|
||||
log_written -= log_cut_size;
|
||||
}
|
||||
#endif /* FALLOC_FL_COLLAPSE_RANGE */
|
||||
|
||||
/**
|
||||
* logfile_rotate_move() - Fallback: move recent entries toward start, then cut
|
||||
|
@ -198,17 +198,21 @@ out:
|
|||
*
|
||||
* Return: 0 on success, negative error code on failure
|
||||
*
|
||||
* #syscalls fcntl fallocate
|
||||
* #syscalls fcntl
|
||||
*
|
||||
* fallocate() passed as EXTRA_SYSCALL only if FALLOC_FL_COLLAPSE_RANGE is there
|
||||
*/
|
||||
static int logfile_rotate(int fd, const struct timespec *now)
|
||||
{
|
||||
if (fcntl(fd, F_SETFL, O_RDWR /* Drop O_APPEND: explicit lseek() */))
|
||||
return -errno;
|
||||
|
||||
#ifdef FALLOC_FL_COLLAPSE_RANGE
|
||||
/* Only for Linux >= 3.15, extent-based ext4 or XFS, glibc >= 2.18 */
|
||||
if (!fallocate(fd, FALLOC_FL_COLLAPSE_RANGE, 0, log_cut_size))
|
||||
logfile_rotate_fallocate(fd, now);
|
||||
else
|
||||
#endif
|
||||
logfile_rotate_move(fd, now);
|
||||
|
||||
if (fcntl(fd, F_SETFL, O_RDWR | O_APPEND))
|
||||
|
@ -250,30 +254,6 @@ static void logfile_write(bool newline, bool cont, int pri,
|
|||
log_written += n;
|
||||
}
|
||||
|
||||
/**
|
||||
* passt_vsyslog() - vsyslog() implementation not using heap memory
|
||||
* @newline: Append newline at the end of the message, if missing
|
||||
* @pri: Facility and level map, same as priority for vsyslog()
|
||||
* @format: Same as vsyslog() format
|
||||
* @ap: Same as vsyslog() ap
|
||||
*/
|
||||
static void passt_vsyslog(bool newline, int pri, const char *format, va_list ap)
|
||||
{
|
||||
char buf[BUFSIZ];
|
||||
int n;
|
||||
|
||||
/* Send without timestamp, the system logger should add it */
|
||||
n = snprintf(buf, BUFSIZ, "<%i> %s: ", pri, log_ident);
|
||||
|
||||
n += vsnprintf(buf + n, BUFSIZ - n, format, ap);
|
||||
|
||||
if (newline && format[strlen(format)] != '\n')
|
||||
n += snprintf(buf + n, BUFSIZ - n, "\n");
|
||||
|
||||
if (log_sock >= 0 && send(log_sock, buf, n, 0) != n && log_stderr)
|
||||
FPRINTF(stderr, "Failed to send %i bytes to syslog\n", n);
|
||||
}
|
||||
|
||||
/**
|
||||
* vlogmsg() - Print or send messages to log or output files as configured
|
||||
* @newline: Append newline at the end of the message, if missing
|
||||
|
@ -282,7 +262,6 @@ static void passt_vsyslog(bool newline, int pri, const char *format, va_list ap)
|
|||
* @format: Message
|
||||
* @ap: Variable argument list
|
||||
*/
|
||||
/* cppcheck-suppress [staticFunction,unmatchedSuppression] */
|
||||
void vlogmsg(bool newline, bool cont, int pri, const char *format, va_list ap)
|
||||
{
|
||||
bool debug_print = (log_mask & LOG_MASK(LOG_DEBUG)) && log_file == -1;
|
||||
|
@ -295,7 +274,7 @@ void vlogmsg(bool newline, bool cont, int pri, const char *format, va_list ap)
|
|||
char timestr[LOGTIME_STRLEN];
|
||||
|
||||
logtime_fmt(timestr, sizeof(timestr), now);
|
||||
FPRINTF(stderr, "%s: ", timestr);
|
||||
fprintf(stderr, "%s: ", timestr);
|
||||
}
|
||||
|
||||
if ((log_mask & LOG_MASK(LOG_PRI(pri))) || !log_conf_parsed) {
|
||||
|
@ -314,7 +293,7 @@ void vlogmsg(bool newline, bool cont, int pri, const char *format, va_list ap)
|
|||
(log_stderr && (log_mask & LOG_MASK(LOG_PRI(pri))))) {
|
||||
(void)vfprintf(stderr, format, ap);
|
||||
if (newline && format[strlen(format)] != '\n')
|
||||
FPRINTF(stderr, "\n");
|
||||
fprintf(stderr, "\n");
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -348,7 +327,7 @@ void logmsg_perror(int pri, const char *format, ...)
|
|||
vlogmsg(false, false, pri, format, ap);
|
||||
va_end(ap);
|
||||
|
||||
logmsg(true, true, pri, ": %s", strerror_(errno_copy));
|
||||
logmsg(true, true, pri, ": %s", strerror(errno_copy));
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -399,11 +378,35 @@ void __setlogmask(int mask)
|
|||
setlogmask(mask);
|
||||
}
|
||||
|
||||
/**
|
||||
* passt_vsyslog() - vsyslog() implementation not using heap memory
|
||||
* @newline: Append newline at the end of the message, if missing
|
||||
* @pri: Facility and level map, same as priority for vsyslog()
|
||||
* @format: Same as vsyslog() format
|
||||
* @ap: Same as vsyslog() ap
|
||||
*/
|
||||
void passt_vsyslog(bool newline, int pri, const char *format, va_list ap)
|
||||
{
|
||||
char buf[BUFSIZ];
|
||||
int n;
|
||||
|
||||
/* Send without timestamp, the system logger should add it */
|
||||
n = snprintf(buf, BUFSIZ, "<%i> %s: ", pri, log_ident);
|
||||
|
||||
n += vsnprintf(buf + n, BUFSIZ - n, format, ap);
|
||||
|
||||
if (newline && format[strlen(format)] != '\n')
|
||||
n += snprintf(buf + n, BUFSIZ - n, "\n");
|
||||
|
||||
if (log_sock >= 0 && send(log_sock, buf, n, 0) != n && log_stderr)
|
||||
fprintf(stderr, "Failed to send %i bytes to syslog\n", n);
|
||||
}
|
||||
|
||||
/**
|
||||
* logfile_init() - Open log file and write header with PID, version, path
|
||||
* @name: Identifier for header: passt or pasta
|
||||
* @path: Path to log file
|
||||
* @size: Maximum size of log file: log_cut_size is calculated here
|
||||
* @size: Maximum size of log file: log_cut_size is calculatd here
|
||||
*/
|
||||
void logfile_init(const char *name, const char *path, size_t size)
|
||||
{
|
||||
|
@ -413,7 +416,8 @@ void logfile_init(const char *name, const char *path, size_t size)
|
|||
if (readlink("/proc/self/exe", exe, PATH_MAX - 1) < 0)
|
||||
die_perror("Failed to read own /proc/self/exe link");
|
||||
|
||||
log_file = output_file_open(path, O_APPEND | O_RDWR);
|
||||
log_file = open(path, O_CREAT | O_TRUNC | O_APPEND | O_RDWR | O_CLOEXEC,
|
||||
S_IRUSR | S_IWUSR);
|
||||
if (log_file == -1)
|
||||
die_perror("Couldn't open log file %s", path);
|
||||
|
||||
|
@ -429,3 +433,4 @@ void logfile_init(const char *name, const char *path, size_t size)
|
|||
/* For FALLOC_FL_COLLAPSE_RANGE: VFS block size can be up to one page */
|
||||
log_cut_size = ROUND_UP(log_size * LOGFILE_CUT_RATIO / 100, PAGE_SIZE);
|
||||
}
|
||||
|
||||
|
|
5
log.h
5
log.h
|
@ -32,13 +32,13 @@ void logmsg_perror(int pri, const char *format, ...)
|
|||
#define die(...) \
|
||||
do { \
|
||||
err(__VA_ARGS__); \
|
||||
_exit(EXIT_FAILURE); \
|
||||
exit(EXIT_FAILURE); \
|
||||
} while (0)
|
||||
|
||||
#define die_perror(...) \
|
||||
do { \
|
||||
err_perror(__VA_ARGS__); \
|
||||
_exit(EXIT_FAILURE); \
|
||||
exit(EXIT_FAILURE); \
|
||||
} while (0)
|
||||
|
||||
extern int log_trace;
|
||||
|
@ -55,6 +55,7 @@ void trace_init(int enable);
|
|||
|
||||
void __openlog(const char *ident, int option, int facility);
|
||||
void logfile_init(const char *name, const char *path, size_t size);
|
||||
void passt_vsyslog(bool newline, int pri, const char *format, va_list ap);
|
||||
void __setlogmask(int mask);
|
||||
|
||||
#endif /* LOG_H */
|
||||
|
|
304
migrate.c
304
migrate.c
|
@ -1,304 +0,0 @@
|
|||
// SPDX-License-Identifier: GPL-2.0-or-later
|
||||
|
||||
/* PASST - Plug A Simple Socket Transport
|
||||
* for qemu/UNIX domain socket mode
|
||||
*
|
||||
* PASTA - Pack A Subtle Tap Abstraction
|
||||
* for network namespace/tap device mode
|
||||
*
|
||||
* migrate.c - Migration sections, layout, and routines
|
||||
*
|
||||
* Copyright (c) 2025 Red Hat GmbH
|
||||
* Author: Stefano Brivio <sbrivio@redhat.com>
|
||||
*/
|
||||
|
||||
#include <errno.h>
|
||||
#include <sys/uio.h>
|
||||
|
||||
#include "util.h"
|
||||
#include "ip.h"
|
||||
#include "passt.h"
|
||||
#include "inany.h"
|
||||
#include "flow.h"
|
||||
#include "flow_table.h"
|
||||
|
||||
#include "migrate.h"
|
||||
#include "repair.h"
|
||||
|
||||
/* Magic identifier for migration data */
|
||||
#define MIGRATE_MAGIC 0xB1BB1D1B0BB1D1B0
|
||||
|
||||
/**
 * struct migrate_seen_addrs_v1 - Guest addresses carried in the state stream
 * @addr6:	Observed guest IPv6 address
 * @addr6_ll:	Observed guest IPv6 link-local address
 * @addr4:	Observed guest IPv4 address
 * @mac:	Observed guest MAC address
 *
 * The structure is packed because it is written to, and read back from, the
 * state transfer descriptor as a raw block of sizeof() bytes.
 */
struct migrate_seen_addrs_v1 {
	struct in6_addr addr6;
	struct in6_addr addr6_ll;
	struct in_addr addr4;
	unsigned char mac[ETH_ALEN];
} __attribute__((packed));
|
||||
|
||||
/**
|
||||
* seen_addrs_source_v1() - Copy and send guest observed addresses from source
|
||||
* @c: Execution context
|
||||
* @stage: Migration stage, unused
|
||||
* @fd: File descriptor for state transfer
|
||||
*
|
||||
* Return: 0 on success, positive error code on failure
|
||||
*/
|
||||
/* cppcheck-suppress [constParameterCallback, unmatchedSuppression] */
|
||||
static int seen_addrs_source_v1(struct ctx *c,
|
||||
const struct migrate_stage *stage, int fd)
|
||||
{
|
||||
struct migrate_seen_addrs_v1 addrs = {
|
||||
.addr6 = c->ip6.addr_seen,
|
||||
.addr6_ll = c->ip6.addr_ll_seen,
|
||||
.addr4 = c->ip4.addr_seen,
|
||||
};
|
||||
|
||||
(void)stage;
|
||||
|
||||
memcpy(addrs.mac, c->guest_mac, sizeof(addrs.mac));
|
||||
|
||||
if (write_all_buf(fd, &addrs, sizeof(addrs)))
|
||||
return errno;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* seen_addrs_target_v1() - Receive and use guest observed addresses on target
|
||||
* @c: Execution context
|
||||
* @stage: Migration stage, unused
|
||||
* @fd: File descriptor for state transfer
|
||||
*
|
||||
* Return: 0 on success, positive error code on failure
|
||||
*/
|
||||
static int seen_addrs_target_v1(struct ctx *c,
|
||||
const struct migrate_stage *stage, int fd)
|
||||
{
|
||||
struct migrate_seen_addrs_v1 addrs;
|
||||
|
||||
(void)stage;
|
||||
|
||||
if (read_all_buf(fd, &addrs, sizeof(addrs)))
|
||||
return errno;
|
||||
|
||||
c->ip6.addr_seen = addrs.addr6;
|
||||
c->ip6.addr_ll_seen = addrs.addr6_ll;
|
||||
c->ip4.addr_seen = addrs.addr4;
|
||||
memcpy(c->guest_mac, addrs.mac, sizeof(c->guest_mac));
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Stages for version 2 */
|
||||
static const struct migrate_stage stages_v2[] = {
|
||||
{
|
||||
.name = "observed addresses",
|
||||
.source = seen_addrs_source_v1,
|
||||
.target = seen_addrs_target_v1,
|
||||
},
|
||||
{
|
||||
.name = "prepare flows",
|
||||
.source = flow_migrate_source_pre,
|
||||
.target = NULL,
|
||||
},
|
||||
{
|
||||
.name = "transfer flows",
|
||||
.source = flow_migrate_source,
|
||||
.target = flow_migrate_target,
|
||||
},
|
||||
{ 0 },
|
||||
};
|
||||
|
||||
/* Supported encoding versions, from latest (most preferred) to oldest */
|
||||
static const struct migrate_version versions[] = {
|
||||
{ 2, stages_v2, },
|
||||
/* v1 was released, but not widely used. It had bad endianness for the
|
||||
* MSS and omitted timestamps, which meant it usually wouldn't work.
|
||||
* Therefore we don't attempt to support compatibility with it.
|
||||
*/
|
||||
{ 0 },
|
||||
};
|
||||
|
||||
/* Current encoding version */
|
||||
#define CURRENT_VERSION (&versions[0])
|
||||
|
||||
/**
|
||||
* migrate_source() - Migration as source, send state to hypervisor
|
||||
* @c: Execution context
|
||||
* @fd: File descriptor for state transfer
|
||||
*
|
||||
* Return: 0 on success, positive error code on failure
|
||||
*/
|
||||
static int migrate_source(struct ctx *c, int fd)
|
||||
{
|
||||
const struct migrate_version *v = CURRENT_VERSION;
|
||||
const struct migrate_header header = {
|
||||
.magic = htonll_constant(MIGRATE_MAGIC),
|
||||
.version = htonl(v->id),
|
||||
.compat_version = htonl(v->id),
|
||||
};
|
||||
const struct migrate_stage *s;
|
||||
int ret;
|
||||
|
||||
if (write_all_buf(fd, &header, sizeof(header))) {
|
||||
ret = errno;
|
||||
err("Can't send migration header: %s, abort", strerror_(ret));
|
||||
return ret;
|
||||
}
|
||||
|
||||
for (s = v->s; s->name; s++) {
|
||||
if (!s->source)
|
||||
continue;
|
||||
|
||||
debug("Source side migration stage: %s", s->name);
|
||||
|
||||
if ((ret = s->source(c, s, fd))) {
|
||||
err("Source migration stage: %s: %s, abort", s->name,
|
||||
strerror_(ret));
|
||||
return ret;
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* migrate_target_read_header() - Read header in target
|
||||
* @fd: Descriptor for state transfer
|
||||
*
|
||||
* Return: version structure on success, NULL on failure with errno set
|
||||
*/
|
||||
static const struct migrate_version *migrate_target_read_header(int fd)
|
||||
{
|
||||
const struct migrate_version *v;
|
||||
struct migrate_header h;
|
||||
uint32_t id, compat_id;
|
||||
|
||||
if (read_all_buf(fd, &h, sizeof(h)))
|
||||
return NULL;
|
||||
|
||||
id = ntohl(h.version);
|
||||
compat_id = ntohl(h.compat_version);
|
||||
|
||||
debug("Source magic: 0x%016" PRIx64 ", version: %u, compat: %u",
|
||||
ntohll(h.magic), id, compat_id);
|
||||
|
||||
if (ntohll(h.magic) != MIGRATE_MAGIC || !id || !compat_id) {
|
||||
err("Invalid incoming device state");
|
||||
errno = EINVAL;
|
||||
return NULL;
|
||||
}
|
||||
|
||||
for (v = versions; v->id; v++)
|
||||
if (v->id <= id && v->id >= compat_id)
|
||||
return v;
|
||||
|
||||
errno = ENOTSUP;
|
||||
err("Unsupported device state version: %u", id);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/**
|
||||
* migrate_target() - Migration as target, receive state from hypervisor
|
||||
* @c: Execution context
|
||||
* @fd: File descriptor for state transfer
|
||||
*
|
||||
* Return: 0 on success, positive error code on failure
|
||||
*/
|
||||
static int migrate_target(struct ctx *c, int fd)
|
||||
{
|
||||
const struct migrate_version *v;
|
||||
const struct migrate_stage *s;
|
||||
int ret;
|
||||
|
||||
if (!(v = migrate_target_read_header(fd)))
|
||||
return errno;
|
||||
|
||||
for (s = v->s; s->name; s++) {
|
||||
if (!s->target)
|
||||
continue;
|
||||
|
||||
debug("Target side migration stage: %s", s->name);
|
||||
|
||||
if ((ret = s->target(c, s, fd))) {
|
||||
err("Target migration stage: %s: %s, abort", s->name,
|
||||
strerror_(ret));
|
||||
return ret;
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* migrate_init() - Set up things necessary for migration
|
||||
* @c: Execution context
|
||||
*/
|
||||
void migrate_init(struct ctx *c)
|
||||
{
|
||||
c->device_state_result = -1;
|
||||
}
|
||||
|
||||
/**
|
||||
* migrate_close() - Close migration channel and connection to passt-repair
|
||||
* @c: Execution context
|
||||
*/
|
||||
void migrate_close(struct ctx *c)
|
||||
{
|
||||
if (c->device_state_fd != -1) {
|
||||
debug("Closing migration channel, fd: %d", c->device_state_fd);
|
||||
close(c->device_state_fd);
|
||||
c->device_state_fd = -1;
|
||||
c->device_state_result = -1;
|
||||
}
|
||||
|
||||
repair_close(c);
|
||||
}
|
||||
|
||||
/**
|
||||
* migrate_request() - Request a migration of device state
|
||||
* @c: Execution context
|
||||
* @fd: fd to transfer state
|
||||
* @target: Are we the target of the migration?
|
||||
*/
|
||||
void migrate_request(struct ctx *c, int fd, bool target)
|
||||
{
|
||||
debug("Migration requested, fd: %d (was %d)", fd, c->device_state_fd);
|
||||
|
||||
if (c->device_state_fd != -1)
|
||||
migrate_close(c);
|
||||
|
||||
c->device_state_fd = fd;
|
||||
c->migrate_target = target;
|
||||
}
|
||||
|
||||
/**
|
||||
* migrate_handler() - Send/receive passt internal state to/from hypervisor
|
||||
* @c: Execution context
|
||||
*/
|
||||
void migrate_handler(struct ctx *c)
|
||||
{
|
||||
int rc;
|
||||
|
||||
if (c->device_state_fd < 0)
|
||||
return;
|
||||
|
||||
debug("Handling migration request from fd: %d, target: %d",
|
||||
c->device_state_fd, c->migrate_target);
|
||||
|
||||
if (c->migrate_target)
|
||||
rc = migrate_target(c, c->device_state_fd);
|
||||
else
|
||||
rc = migrate_source(c, c->device_state_fd);
|
||||
|
||||
migrate_close(c);
|
||||
|
||||
c->device_state_result = rc;
|
||||
}
|
51
migrate.h
51
migrate.h
|
@ -1,51 +0,0 @@
|
|||
/* SPDX-License-Identifier: GPL-2.0-or-later
|
||||
* Copyright (c) 2025 Red Hat GmbH
|
||||
* Author: Stefano Brivio <sbrivio@redhat.com>
|
||||
*/
|
||||
|
||||
#ifndef MIGRATE_H
|
||||
#define MIGRATE_H
|
||||
|
||||
/**
 * struct migrate_header - Migration header sent by the source
 * @magic:		0xB1BB1D1B0BB1D1B0, network order
 * @version:		Highest known, target aborts if too old, network order
 * @compat_version:	Lowest version compatible with @version, target aborts
 *			if too new, network order
 *
 * Packed: this is the exact layout of the first bytes of the state stream.
 */
struct migrate_header {
	uint64_t magic;
	uint32_t version;
	uint32_t compat_version;
} __attribute__((packed));
|
||||
|
||||
/**
 * struct migrate_stage - Callbacks and parameters for one stage of migration
 * @name:	Stage name (for debugging)
 * @source:	Callback to implement this stage on the source
 * @target:	Callback to implement this stage on the target
 *
 * Both callbacks take the execution context, the stage itself, and the state
 * transfer descriptor, and return an int.
 */
struct migrate_stage {
	const char *name;
	int (*source)(struct ctx *c, const struct migrate_stage *stage, int fd);
	int (*target)(struct ctx *c, const struct migrate_stage *stage, int fd);

	/* Add here separate rollback callbacks if needed */
};
|
||||
|
||||
/**
 * struct migrate_version - Stages for a particular protocol version
 * @id:		Version number, host order
 * @s:		Ordered array of stages, terminated by an entry with NULL name
 */
struct migrate_version {
	uint32_t id;
	const struct migrate_stage *s;
};
|
||||
|
||||
void migrate_init(struct ctx *c);
|
||||
void migrate_close(struct ctx *c);
|
||||
void migrate_request(struct ctx *c, int fd, bool target);
|
||||
void migrate_handler(struct ctx *c);
|
||||
|
||||
#endif /* MIGRATE_H */
|
223
ndp.c
223
ndp.c
|
@ -33,8 +33,6 @@
|
|||
#include "tap.h"
|
||||
#include "log.h"
|
||||
|
||||
#define RT_LIFETIME 65535
|
||||
|
||||
#define RS 133
|
||||
#define RA 134
|
||||
#define NS 135
|
||||
|
@ -160,7 +158,7 @@ struct ndp_ra {
|
|||
|
||||
unsigned char var[sizeof(struct opt_mtu) + sizeof(struct opt_rdnss) +
|
||||
sizeof(struct opt_dnssl)];
|
||||
} __attribute__((packed, aligned(__alignof__(struct in6_addr))));
|
||||
} __attribute__((packed));
|
||||
|
||||
/**
|
||||
* struct ndp_ns - NDP Neighbor Solicitation (NS) message
|
||||
|
@ -170,31 +168,19 @@ struct ndp_ra {
|
|||
struct ndp_ns {
|
||||
struct icmp6hdr ih;
|
||||
struct in6_addr target_addr;
|
||||
} __attribute__((packed, aligned(__alignof__(struct in6_addr))));
|
||||
} __attribute__((packed));
|
||||
|
||||
/**
|
||||
* ndp_send() - Send an NDP message
|
||||
* ndp() - Check for NDP solicitations, reply as needed
|
||||
* @c: Execution context
|
||||
* @dst: IPv6 address to send the message to
|
||||
* @buf: ICMPv6 header + message payload
|
||||
* @l4len: Length of message, including ICMPv6 header
|
||||
* @ih: ICMPv6 header
|
||||
* @saddr: Source IPv6 address
|
||||
* @p: Packet pool
|
||||
*
|
||||
* Return: 0 if not handled here, 1 if handled, -1 on failure
|
||||
*/
|
||||
static void ndp_send(const struct ctx *c, const struct in6_addr *dst,
|
||||
const void *buf, size_t l4len)
|
||||
{
|
||||
const struct in6_addr *src = &c->ip6.our_tap_ll;
|
||||
|
||||
tap_icmp6_send(c, src, dst, buf, l4len);
|
||||
}
|
||||
|
||||
/**
|
||||
* ndp_na() - Send an NDP Neighbour Advertisement (NA) message
|
||||
* @c: Execution context
|
||||
* @dst: IPv6 address to send the NA to
|
||||
* @addr: IPv6 address to advertise
|
||||
*/
|
||||
static void ndp_na(const struct ctx *c, const struct in6_addr *dst,
|
||||
const struct in6_addr *addr)
|
||||
int ndp(struct ctx *c, const struct icmp6hdr *ih, const struct in6_addr *saddr,
|
||||
const struct pool *p)
|
||||
{
|
||||
struct ndp_na na = {
|
||||
.ih = {
|
||||
|
@ -204,7 +190,6 @@ static void ndp_na(const struct ctx *c, const struct in6_addr *dst,
|
|||
.icmp6_solicited = 1,
|
||||
.icmp6_override = 1,
|
||||
},
|
||||
.target_addr = *addr,
|
||||
.target_l2_addr = {
|
||||
.header = {
|
||||
.type = OPT_TARGET_L2_ADDR,
|
||||
|
@ -212,26 +197,13 @@ static void ndp_na(const struct ctx *c, const struct in6_addr *dst,
|
|||
},
|
||||
}
|
||||
};
|
||||
|
||||
memcpy(na.target_l2_addr.mac, c->our_tap_mac, ETH_ALEN);
|
||||
|
||||
ndp_send(c, dst, &na, sizeof(na));
|
||||
}
|
||||
|
||||
/**
|
||||
* ndp_ra() - Send an NDP Router Advertisement (RA) message
|
||||
* @c: Execution context
|
||||
* @dst: IPv6 address to send the RA to
|
||||
*/
|
||||
static void ndp_ra(const struct ctx *c, const struct in6_addr *dst)
|
||||
{
|
||||
struct ndp_ra ra = {
|
||||
.ih = {
|
||||
.icmp6_type = RA,
|
||||
.icmp6_code = 0,
|
||||
.icmp6_hop_limit = 255,
|
||||
/* RFC 8319 */
|
||||
.icmp6_rt_lifetime = htons_constant(RT_LIFETIME),
|
||||
.icmp6_rt_lifetime = htons_constant(65535),
|
||||
.icmp6_addrconf_managed = 1,
|
||||
},
|
||||
.prefix_info = {
|
||||
|
@ -244,7 +216,6 @@ static void ndp_ra(const struct ctx *c, const struct in6_addr *dst)
|
|||
.valid_lifetime = ~0U,
|
||||
.pref_lifetime = ~0U,
|
||||
},
|
||||
.prefix = c->ip6.addr,
|
||||
.source_ll = {
|
||||
.header = {
|
||||
.type = OPT_SRC_L2_ADDR,
|
||||
|
@ -252,26 +223,59 @@ static void ndp_ra(const struct ctx *c, const struct in6_addr *dst)
|
|||
},
|
||||
},
|
||||
};
|
||||
const struct in6_addr *rsaddr; /* src addr for reply */
|
||||
unsigned char *ptr = NULL;
|
||||
size_t dlen;
|
||||
|
||||
ptr = &ra.var[0];
|
||||
if (ih->icmp6_type < RS || ih->icmp6_type > NA)
|
||||
return 0;
|
||||
|
||||
if (c->mtu) {
|
||||
struct opt_mtu *mtu = (struct opt_mtu *)ptr;
|
||||
*mtu = (struct opt_mtu) {
|
||||
.header = {
|
||||
.type = OPT_MTU,
|
||||
.len = 1,
|
||||
},
|
||||
.value = htonl(c->mtu),
|
||||
};
|
||||
ptr += sizeof(struct opt_mtu);
|
||||
}
|
||||
if (c->no_ndp)
|
||||
return 1;
|
||||
|
||||
if (!c->no_dhcp_dns) {
|
||||
if (ih->icmp6_type == NS) {
|
||||
struct ndp_ns *ns = packet_get(p, 0, 0, sizeof(struct ndp_ns),
|
||||
NULL);
|
||||
|
||||
if (!ns)
|
||||
return -1;
|
||||
|
||||
if (IN6_IS_ADDR_UNSPECIFIED(saddr))
|
||||
return 1;
|
||||
|
||||
info("NDP: received NS, sending NA");
|
||||
|
||||
memcpy(&na.target_addr, &ns->target_addr,
|
||||
sizeof(na.target_addr));
|
||||
memcpy(na.target_l2_addr.mac, c->our_tap_mac, ETH_ALEN);
|
||||
|
||||
} else if (ih->icmp6_type == RS) {
|
||||
size_t dns_s_len = 0;
|
||||
int i, n;
|
||||
|
||||
if (c->no_ra)
|
||||
return 1;
|
||||
|
||||
info("NDP: received RS, sending RA");
|
||||
memcpy(&ra.prefix, &c->ip6.addr, sizeof(ra.prefix));
|
||||
|
||||
ptr = &ra.var[0];
|
||||
|
||||
if (c->mtu != -1) {
|
||||
struct opt_mtu *mtu = (struct opt_mtu *)ptr;
|
||||
*mtu = (struct opt_mtu) {
|
||||
.header = {
|
||||
.type = OPT_MTU,
|
||||
.len = 1,
|
||||
},
|
||||
.value = htonl(c->mtu),
|
||||
};
|
||||
ptr += sizeof(struct opt_mtu);
|
||||
}
|
||||
|
||||
if (c->no_dhcp_dns)
|
||||
goto dns_done;
|
||||
|
||||
for (n = 0; !IN6_IS_ADDR_UNSPECIFIED(&c->ip6.dns[n]); n++);
|
||||
if (n) {
|
||||
struct opt_rdnss *rdnss = (struct opt_rdnss *)ptr;
|
||||
|
@ -283,7 +287,8 @@ static void ndp_ra(const struct ctx *c, const struct in6_addr *dst)
|
|||
.lifetime = ~0U,
|
||||
};
|
||||
for (i = 0; i < n; i++) {
|
||||
rdnss->dns[i] = c->ip6.dns[i];
|
||||
memcpy(&rdnss->dns[i], &c->ip6.dns[i],
|
||||
sizeof(rdnss->dns[i]));
|
||||
}
|
||||
ptr += offsetof(struct opt_rdnss, dns) +
|
||||
i * sizeof(rdnss->dns[0]);
|
||||
|
@ -324,109 +329,27 @@ static void ndp_ra(const struct ctx *c, const struct in6_addr *dst)
|
|||
memset(ptr, 0, 8 - dns_s_len % 8); /* padding */
|
||||
ptr += 8 - dns_s_len % 8;
|
||||
}
|
||||
|
||||
dns_done:
|
||||
memcpy(&ra.source_ll.mac, c->our_tap_mac, ETH_ALEN);
|
||||
} else {
|
||||
return 1;
|
||||
}
|
||||
|
||||
memcpy(&ra.source_ll.mac, c->our_tap_mac, ETH_ALEN);
|
||||
if (IN6_IS_ADDR_LINKLOCAL(saddr))
|
||||
c->ip6.addr_ll_seen = *saddr;
|
||||
else
|
||||
c->ip6.addr_seen = *saddr;
|
||||
|
||||
/* NOLINTNEXTLINE(clang-analyzer-security.PointerSub) */
|
||||
ndp_send(c, dst, &ra, ptr - (unsigned char *)&ra);
|
||||
}
|
||||
|
||||
/**
|
||||
* ndp() - Check for NDP solicitations, reply as needed
|
||||
* @c: Execution context
|
||||
* @saddr: Source IPv6 address
|
||||
* @p: Packet pool
|
||||
*
|
||||
* Return: 0 if not handled here, 1 if handled, -1 on failure
|
||||
*/
|
||||
int ndp(const struct ctx *c, const struct icmp6hdr *ih,
|
||||
const struct in6_addr *saddr, const struct pool *p)
|
||||
{
|
||||
if (ih->icmp6_type < RS || ih->icmp6_type > NA)
|
||||
return 0;
|
||||
|
||||
if (c->no_ndp)
|
||||
return 1;
|
||||
rsaddr = &c->ip6.our_tap_ll;
|
||||
|
||||
if (ih->icmp6_type == NS) {
|
||||
const struct ndp_ns *ns;
|
||||
|
||||
ns = packet_get(p, 0, 0, sizeof(struct ndp_ns), NULL);
|
||||
if (!ns)
|
||||
return -1;
|
||||
|
||||
if (IN6_IS_ADDR_UNSPECIFIED(saddr))
|
||||
return 1;
|
||||
|
||||
info("NDP: received NS, sending NA");
|
||||
|
||||
ndp_na(c, saddr, &ns->target_addr);
|
||||
dlen = sizeof(struct ndp_na);
|
||||
tap_icmp6_send(c, rsaddr, saddr, &na, dlen);
|
||||
} else if (ih->icmp6_type == RS) {
|
||||
if (c->no_ra)
|
||||
return 1;
|
||||
|
||||
info("NDP: received RS, sending RA");
|
||||
ndp_ra(c, saddr);
|
||||
dlen = ptr - (unsigned char *)&ra;
|
||||
tap_icmp6_send(c, rsaddr, saddr, &ra, dlen);
|
||||
}
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
||||
/* Default interval between unsolicited RAs (seconds) */
|
||||
#define DEFAULT_MAX_RTR_ADV_INTERVAL 600 /* RFC 4861, 6.2.1 */
|
||||
|
||||
/* Minimum required interval between RAs (seconds) */
|
||||
#define MIN_DELAY_BETWEEN_RAS 3 /* RFC 4861, 10 */
|
||||
|
||||
static time_t next_ra;
|
||||
|
||||
/**
|
||||
* ndp_timer() - Send unsolicited NDP messages if necessary
|
||||
* @c: Execution context
|
||||
* @now: Current (monotonic) time
|
||||
*/
|
||||
void ndp_timer(const struct ctx *c, const struct timespec *now)
|
||||
{
|
||||
time_t max_rtr_adv_interval = DEFAULT_MAX_RTR_ADV_INTERVAL;
|
||||
time_t min_rtr_adv_interval, interval;
|
||||
|
||||
if (c->fd_tap < 0 || c->no_ra || now->tv_sec < next_ra)
|
||||
return;
|
||||
|
||||
/* We must advertise before the route's lifetime expires */
|
||||
max_rtr_adv_interval = MIN(max_rtr_adv_interval, RT_LIFETIME - 1);
|
||||
|
||||
/* But we must not go smaller than the minimum delay */
|
||||
max_rtr_adv_interval = MAX(max_rtr_adv_interval, MIN_DELAY_BETWEEN_RAS);
|
||||
|
||||
/* RFC 4861, 6.2.1 */
|
||||
min_rtr_adv_interval = MAX(max_rtr_adv_interval / 3,
|
||||
MIN_DELAY_BETWEEN_RAS);
|
||||
|
||||
/* As required by RFC 4861, we randomise the interval between
|
||||
* unsolicited RAs. This is to prevent multiple routers on a link
|
||||
* getting synchronised (e.g. after booting a bunch of routers at once)
|
||||
* and causing flurries of RAs at the same time.
|
||||
*
|
||||
* This random doesn't need to be cryptographically strong, so random(3)
|
||||
* is fine. Other routers on the link also want to avoid
|
||||
* synchronisation, and anything malicious has much easier ways to cause
|
||||
* trouble.
|
||||
*
|
||||
* The modulus also makes this not strictly a uniform distribution, but,
|
||||
* again, it's close enough for our purposes.
|
||||
*/
|
||||
interval = min_rtr_adv_interval +
|
||||
random() % (max_rtr_adv_interval - min_rtr_adv_interval);
|
||||
|
||||
if (!next_ra)
|
||||
goto first;
|
||||
|
||||
info("NDP: sending unsolicited RA, next in %llds", (long long)interval);
|
||||
|
||||
ndp_ra(c, &in6addr_ll_all_nodes);
|
||||
|
||||
first:
|
||||
next_ra = now->tv_sec + interval;
|
||||
}
|
||||
|
|
7
ndp.h
7
ndp.h
|
@ -6,10 +6,7 @@
|
|||
#ifndef NDP_H
|
||||
#define NDP_H
|
||||
|
||||
struct icmp6hdr;
|
||||
|
||||
int ndp(const struct ctx *c, const struct icmp6hdr *ih,
|
||||
const struct in6_addr *saddr, const struct pool *p);
|
||||
void ndp_timer(const struct ctx *c, const struct timespec *now);
|
||||
int ndp(struct ctx *c, const struct icmp6hdr *ih, const struct in6_addr *saddr,
|
||||
const struct pool *p);
|
||||
|
||||
#endif /* NDP_H */
|
||||
|
|
15
netlink.c
15
netlink.c
|
@ -199,7 +199,7 @@ static struct nlmsghdr *nl_next(int s, char *buf, struct nlmsghdr *nh, ssize_t *
|
|||
}
|
||||
|
||||
/**
|
||||
* nl_foreach() - 'for' type macro to step through netlink response messages
|
||||
* nl_foreach - 'for' type macro to step through netlink response messages
|
||||
* nl_foreach_oftype - as above, but only messages of expected type
|
||||
* @nh: Steps through each response header (struct nlmsghdr *)
|
||||
* @status: When loop exits indicates if there was an error (ssize_t)
|
||||
|
@ -297,10 +297,6 @@ unsigned int nl_get_ext_if(int s, sa_family_t af)
|
|||
if (!thisifi)
|
||||
continue; /* No interface for this route */
|
||||
|
||||
/* Skip 'lo': we should test IFF_LOOPBACK, but keep it simple */
|
||||
if (thisifi == 1)
|
||||
continue;
|
||||
|
||||
/* Skip routes to link-local addresses */
|
||||
if (af == AF_INET && dst &&
|
||||
IN4_IS_PREFIX_LINKLOCAL(dst, rtm->rtm_dst_len))
|
||||
|
@ -324,7 +320,7 @@ unsigned int nl_get_ext_if(int s, sa_family_t af)
|
|||
}
|
||||
|
||||
if (status < 0)
|
||||
warn("netlink: RTM_GETROUTE failed: %s", strerror_(-status));
|
||||
warn("netlink: RTM_GETROUTE failed: %s", strerror(-status));
|
||||
|
||||
if (defifi) {
|
||||
if (ndef > 1) {
|
||||
|
@ -355,9 +351,9 @@ unsigned int nl_get_ext_if(int s, sa_family_t af)
|
|||
*
|
||||
* Return: true if a gateway was found, false otherwise
|
||||
*/
|
||||
static bool nl_route_get_def_multipath(struct rtattr *rta, void *gw)
|
||||
bool nl_route_get_def_multipath(struct rtattr *rta, void *gw)
|
||||
{
|
||||
int nh_len = RTA_PAYLOAD(rta);
|
||||
size_t nh_len = RTA_PAYLOAD(rta);
|
||||
struct rtnexthop *rtnh;
|
||||
bool found = false;
|
||||
int hops = -1;
|
||||
|
@ -586,7 +582,7 @@ int nl_route_dup(int s_src, unsigned int ifi_src,
|
|||
|
||||
*(unsigned int *)RTA_DATA(rta) = ifi_dst;
|
||||
} else if (rta->rta_type == RTA_MULTIPATH) {
|
||||
int nh_len = RTA_PAYLOAD(rta);
|
||||
size_t nh_len = RTA_PAYLOAD(rta);
|
||||
struct rtnexthop *rtnh;
|
||||
|
||||
for (rtnh = (struct rtnexthop *)RTA_DATA(rta);
|
||||
|
@ -1024,6 +1020,7 @@ int nl_link_get_mac(int s, unsigned int ifi, void *mac)
|
|||
/**
|
||||
* nl_link_set_mac() - Set link MAC address
|
||||
* @s: Netlink socket
|
||||
* @ns: Use netlink socket in namespace
|
||||
* @ifi: Interface index
|
||||
* @mac: MAC address to set
|
||||
*
|
||||
|
|
185
packet.c
185
packet.c
|
@ -22,74 +22,12 @@
|
|||
#include "util.h"
|
||||
#include "log.h"
|
||||
|
||||
/**
|
||||
* packet_check_range() - Check if a memory range is valid for a pool
|
||||
* @p: Packet pool
|
||||
* @ptr: Start of desired data range
|
||||
* @len: Length of desired data range
|
||||
* @func: For tracing: name of calling function
|
||||
* @line: For tracing: caller line of function call
|
||||
*
|
||||
* Return: 0 if the range is valid, -1 otherwise
|
||||
*/
|
||||
static int packet_check_range(const struct pool *p, const char *ptr, size_t len,
|
||||
const char *func, int line)
|
||||
{
|
||||
if (len > PACKET_MAX_LEN) {
|
||||
debug("packet range length %zu (max %zu), %s:%i",
|
||||
len, PACKET_MAX_LEN, func, line);
|
||||
return -1;
|
||||
}
|
||||
|
||||
if (p->buf_size == 0) {
|
||||
int ret;
|
||||
|
||||
ret = vu_packet_check_range((void *)p->buf, ptr, len);
|
||||
|
||||
if (ret == -1)
|
||||
debug("cannot find region, %s:%i", func, line);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
if (ptr < p->buf) {
|
||||
debug("packet range start %p before buffer start %p, %s:%i",
|
||||
(void *)ptr, (void *)p->buf, func, line);
|
||||
return -1;
|
||||
}
|
||||
|
||||
if (len > p->buf_size) {
|
||||
debug("packet range length %zu larger than buffer %zu, %s:%i",
|
||||
len, p->buf_size, func, line);
|
||||
return -1;
|
||||
}
|
||||
|
||||
if ((size_t)(ptr - p->buf) > p->buf_size - len) {
|
||||
debug("packet range %p, len %zu after buffer end %p, %s:%i",
|
||||
(void *)ptr, len, (void *)(p->buf + p->buf_size),
|
||||
func, line);
|
||||
return -1;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
/**
|
||||
* pool_full() - Is a packet pool full?
|
||||
* @p: Pointer to packet pool
|
||||
*
|
||||
* Return: true if the pool is full, false if more packets can be added
|
||||
*/
|
||||
bool pool_full(const struct pool *p)
|
||||
{
|
||||
return p->count >= p->size;
|
||||
}
|
||||
|
||||
/**
|
||||
* packet_add_do() - Add data as packet descriptor to given pool
|
||||
* @p: Existing pool
|
||||
* @len: Length of new descriptor
|
||||
* @start: Start of data
|
||||
* @func: For tracing: name of calling function
|
||||
* @func: For tracing: name of calling function, NULL means no trace()
|
||||
* @line: For tracing: caller line of function call
|
||||
*/
|
||||
void packet_add_do(struct pool *p, size_t len, const char *start,
|
||||
|
@ -97,61 +35,42 @@ void packet_add_do(struct pool *p, size_t len, const char *start,
|
|||
{
|
||||
size_t idx = p->count;
|
||||
|
||||
if (pool_full(p)) {
|
||||
debug("add packet index %zu to pool with size %zu, %s:%i",
|
||||
if (idx >= p->size) {
|
||||
trace("add packet index %zu to pool with size %zu, %s:%i",
|
||||
idx, p->size, func, line);
|
||||
return;
|
||||
}
|
||||
|
||||
if (packet_check_range(p, start, len, func, line))
|
||||
if (start < p->buf) {
|
||||
trace("add packet start %p before buffer start %p, %s:%i",
|
||||
(void *)start, (void *)p->buf, func, line);
|
||||
return;
|
||||
|
||||
p->pkt[idx].iov_base = (void *)start;
|
||||
p->pkt[idx].iov_len = len;
|
||||
|
||||
p->count++;
|
||||
}
|
||||
|
||||
/**
|
||||
* packet_get_try_do() - Get data range from packet descriptor from given pool
|
||||
* @p: Packet pool
|
||||
* @idx: Index of packet descriptor in pool
|
||||
* @offset: Offset of data range in packet descriptor
|
||||
* @len: Length of desired data range
|
||||
* @left: Length of available data after range, set on return, can be NULL
|
||||
* @func: For tracing: name of calling function
|
||||
* @line: For tracing: caller line of function call
|
||||
*
|
||||
* Return: pointer to start of data range, NULL on invalid range or descriptor
|
||||
*/
|
||||
void *packet_get_try_do(const struct pool *p, size_t idx, size_t offset,
|
||||
size_t len, size_t *left, const char *func, int line)
|
||||
{
|
||||
char *ptr;
|
||||
|
||||
ASSERT_WITH_MSG(p->count <= p->size,
|
||||
"Corrupt pool count: %zu, size: %zu, %s:%i",
|
||||
p->count, p->size, func, line);
|
||||
|
||||
if (idx >= p->count) {
|
||||
debug("packet %zu from pool count: %zu, %s:%i",
|
||||
idx, p->count, func, line);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
if (offset > p->pkt[idx].iov_len ||
|
||||
len > (p->pkt[idx].iov_len - offset))
|
||||
return NULL;
|
||||
if (start + len > p->buf + p->buf_size) {
|
||||
trace("add packet start %p, length: %zu, buffer end %p, %s:%i",
|
||||
(void *)start, len, (void *)(p->buf + p->buf_size),
|
||||
func, line);
|
||||
return;
|
||||
}
|
||||
|
||||
ptr = (char *)p->pkt[idx].iov_base + offset;
|
||||
if (len > UINT16_MAX) {
|
||||
trace("add packet length %zu, %s:%i", len, func, line);
|
||||
return;
|
||||
}
|
||||
|
||||
ASSERT_WITH_MSG(!packet_check_range(p, ptr, len, func, line),
|
||||
"Corrupt packet pool, %s:%i", func, line);
|
||||
#if UINTPTR_MAX == UINT64_MAX
|
||||
if ((uintptr_t)start - (uintptr_t)p->buf > UINT32_MAX) {
|
||||
trace("add packet start %p, buffer start %p, %s:%i",
|
||||
(void *)start, (void *)p->buf, func, line);
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
|
||||
if (left)
|
||||
*left = p->pkt[idx].iov_len - offset - len;
|
||||
p->pkt[idx].offset = start - p->buf;
|
||||
p->pkt[idx].len = len;
|
||||
|
||||
return ptr;
|
||||
p->count++;
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -161,24 +80,52 @@ void *packet_get_try_do(const struct pool *p, size_t idx, size_t offset,
|
|||
* @offset: Offset of data range in packet descriptor
|
||||
* @len: Length of desired data range
|
||||
* @left: Length of available data after range, set on return, can be NULL
|
||||
* @func: For tracing: name of calling function
|
||||
* @func: For tracing: name of calling function, NULL means no trace()
|
||||
* @line: For tracing: caller line of function call
|
||||
*
|
||||
* Return: as packet_get_try_do() but log a trace message when returning NULL
|
||||
* Return: pointer to start of data range, NULL on invalid range or descriptor
|
||||
*/
|
||||
void *packet_get_do(const struct pool *p, const size_t idx,
|
||||
size_t offset, size_t len, size_t *left,
|
||||
const char *func, int line)
|
||||
void *packet_get_do(const struct pool *p, size_t idx, size_t offset,
|
||||
size_t len, size_t *left, const char *func, int line)
|
||||
{
|
||||
void *r = packet_get_try_do(p, idx, offset, len, left, func, line);
|
||||
|
||||
if (!r) {
|
||||
trace("missing packet data length %zu, offset %zu from "
|
||||
"length %zu, %s:%i",
|
||||
len, offset, p->pkt[idx].iov_len, func, line);
|
||||
if (idx >= p->size || idx >= p->count) {
|
||||
if (func) {
|
||||
trace("packet %zu from pool size: %zu, count: %zu, "
|
||||
"%s:%i", idx, p->size, p->count, func, line);
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
|
||||
return r;
|
||||
if (len > UINT16_MAX || len + offset > UINT32_MAX) {
|
||||
if (func) {
|
||||
trace("packet data length %zu, offset %zu, %s:%i",
|
||||
len, offset, func, line);
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
|
||||
if (p->pkt[idx].offset + len + offset > p->buf_size) {
|
||||
if (func) {
|
||||
trace("packet offset plus length %zu from size %zu, "
|
||||
"%s:%i", p->pkt[idx].offset + len + offset,
|
||||
p->buf_size, func, line);
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
|
||||
if (len + offset > p->pkt[idx].len) {
|
||||
if (func) {
|
||||
trace("data length %zu, offset %zu from length %u, "
|
||||
"%s:%i", len, offset, p->pkt[idx].len,
|
||||
func, line);
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
|
||||
if (left)
|
||||
*left = p->pkt[idx].len - offset - len;
|
||||
|
||||
return p->buf + p->pkt[idx].offset + offset;
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
33
packet.h
33
packet.h
|
@ -6,17 +6,20 @@
|
|||
#ifndef PACKET_H
|
||||
#define PACKET_H
|
||||
|
||||
#include <stdbool.h>
|
||||
|
||||
/* Maximum size of a single packet stored in pool, including headers */
|
||||
#define PACKET_MAX_LEN ((size_t)UINT16_MAX)
|
||||
/**
|
||||
* struct desc - Generic offset-based descriptor within buffer
|
||||
* @offset: Offset of descriptor relative to buffer start, 32-bit limit
|
||||
* @len: Length of descriptor, host order, 16-bit limit
|
||||
*/
|
||||
struct desc {
|
||||
uint32_t offset;
|
||||
uint16_t len;
|
||||
};
|
||||
|
||||
/**
|
||||
* struct pool - Generic pool of packets stored in a buffer
|
||||
* @buf: Buffer storing packet descriptors,
|
||||
* a struct vu_dev_region array for passt vhost-user mode
|
||||
* @buf_size: Total size of buffer,
|
||||
* 0 for passt vhost-user mode
|
||||
* @buf: Buffer storing packet descriptors
|
||||
* @buf_size: Total size of buffer
|
||||
* @size: Number of usable descriptors for the pool
|
||||
* @count: Number of used descriptors for the pool
|
||||
* @pkt: Descriptors: see macros below
|
||||
|
@ -26,36 +29,32 @@ struct pool {
|
|||
size_t buf_size;
|
||||
size_t size;
|
||||
size_t count;
|
||||
struct iovec pkt[];
|
||||
struct desc pkt[1];
|
||||
};
|
||||
|
||||
int vu_packet_check_range(void *buf, const char *ptr, size_t len);
|
||||
void packet_add_do(struct pool *p, size_t len, const char *start,
|
||||
const char *func, int line);
|
||||
void *packet_get_try_do(const struct pool *p, const size_t idx,
|
||||
size_t offset, size_t len, size_t *left,
|
||||
const char *func, int line);
|
||||
void *packet_get_do(const struct pool *p, const size_t idx,
|
||||
size_t offset, size_t len, size_t *left,
|
||||
const char *func, int line);
|
||||
bool pool_full(const struct pool *p);
|
||||
void pool_flush(struct pool *p);
|
||||
|
||||
#define packet_add(p, len, start) \
|
||||
packet_add_do(p, len, start, __func__, __LINE__)
|
||||
|
||||
#define packet_get_try(p, idx, offset, len, left) \
|
||||
packet_get_try_do(p, idx, offset, len, left, __func__, __LINE__)
|
||||
#define packet_get(p, idx, offset, len, left) \
|
||||
packet_get_do(p, idx, offset, len, left, __func__, __LINE__)
|
||||
|
||||
#define packet_get_try(p, idx, offset, len, left) \
|
||||
packet_get_do(p, idx, offset, len, left, NULL, 0)
|
||||
|
||||
#define PACKET_POOL_DECL(_name, _size, _buf) \
|
||||
struct _name ## _t { \
|
||||
char *buf; \
|
||||
size_t buf_size; \
|
||||
size_t size; \
|
||||
size_t count; \
|
||||
struct iovec pkt[_size]; \
|
||||
struct desc pkt[_size]; \
|
||||
}
|
||||
|
||||
#define PACKET_POOL_INIT_NOCAST(_size, _buf, _buf_size) \
|
||||
|
|
|
@ -1,74 +0,0 @@
|
|||
.\" SPDX-License-Identifier: GPL-2.0-or-later
|
||||
.\" Copyright (c) 2025 Red Hat GmbH
|
||||
.\" Author: Stefano Brivio <sbrivio@redhat.com>
|
||||
.TH passt-repair 1
|
||||
|
||||
.SH NAME
|
||||
.B passt-repair
|
||||
\- Helper setting TCP_REPAIR socket options for \fBpasst\fR(1)
|
||||
|
||||
.SH SYNOPSIS
|
||||
.B passt-repair
|
||||
\fIPATH\fR
|
||||
|
||||
.SH DESCRIPTION
|
||||
|
||||
.B passt-repair
|
||||
is a privileged helper setting and clearing repair mode on TCP sockets on behalf
|
||||
of \fBpasst\fR(1), as instructed via single-byte commands over a UNIX domain
|
||||
socket.
|
||||
|
||||
It can be used to migrate TCP connections between guests without granting
|
||||
additional capabilities to \fBpasst\fR(1) itself: to migrate TCP connections,
|
||||
\fBpasst\fR(1) leverages repair mode, which needs the \fBCAP_NET_ADMIN\fR
|
||||
capability (see \fBcapabilities\fR(7)) to be set or cleared.
|
||||
|
||||
If \fIPATH\fR represents a UNIX domain socket, \fBpasst-repair\fR(1) attempts to
|
||||
connect to it. If it is a directory, \fBpasst-repair\fR(1) waits until a file
|
||||
ending with \fI.repair\fR appears in it, and then attempts to connect to it.
|
||||
|
||||
.SH PROTOCOL
|
||||
|
||||
\fBpasst-repair\fR(1) connects to \fBpasst\fR(1) using the socket specified via
|
||||
\fI--repair-path\fR option in \fBpasst\fR(1) itself. By default, the name is the
|
||||
same as the UNIX domain socket used for guest communication, suffixed by
|
||||
\fI.repair\fR.
|
||||
|
||||
The messages consist of one 8-bit signed integer that can be \fITCP_REPAIR_ON\fR
|
||||
(1), \fITCP_REPAIR_OFF\fR (0), or \fITCP_REPAIR_OFF_NO_WP\fR (-1), as defined by
|
||||
the Linux kernel user API, and one to SCM_MAX_FD (253) sockets as SCM_RIGHTS
|
||||
(see \fBunix\fR(7)) ancillary message, sent by the server, \fBpasst\fR(1).
|
||||
|
||||
The client, \fBpasst-repair\fR(1), replies with the same byte (and no ancillary
|
||||
message) to indicate success, and closes the connection on failure.
|
||||
|
||||
The server closes the connection on error or completion.
|
||||
|
||||
.SH NOTES
|
||||
|
||||
\fBpasst-repair\fR(1) can be granted the \fBCAP_NET_ADMIN\fR capability
|
||||
(preferred, as it limits privileges to the strictly necessary ones), or it can
|
||||
be run as root.
|
||||
|
||||
.SH AUTHOR
|
||||
|
||||
Stefano Brivio <sbrivio@redhat.com>.
|
||||
|
||||
.SH REPORTING BUGS
|
||||
|
||||
Please report issues on the bug tracker at https://bugs.passt.top/, or
|
||||
send a message to the passt-user@passt.top mailing list, see
|
||||
https://lists.passt.top/.
|
||||
|
||||
.SH COPYRIGHT
|
||||
|
||||
Copyright (c) 2025 Red Hat GmbH.
|
||||
|
||||
\fBpasst-repair\fR is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by the Free
|
||||
Software Foundation, either version 2 of the License, or (at your option) any
|
||||
later version.
|
||||
|
||||
.SH SEE ALSO
|
||||
|
||||
\fBpasst\fR(1), \fBqemu\fR(1), \fBcapabilities\fR(7), \fBunix\fR(7).
|
266
passt-repair.c
266
passt-repair.c
|
@ -1,266 +0,0 @@
|
|||
// SPDX-License-Identifier: GPL-2.0-or-later
|
||||
|
||||
/* PASST - Plug A Simple Socket Transport
|
||||
* for qemu/UNIX domain socket mode
|
||||
*
|
||||
* PASTA - Pack A Subtle Tap Abstraction
|
||||
* for network namespace/tap device mode
|
||||
*
|
||||
* passt-repair.c - Privileged helper to set/clear TCP_REPAIR on sockets
|
||||
*
|
||||
* Copyright (c) 2025 Red Hat GmbH
|
||||
* Author: Stefano Brivio <sbrivio@redhat.com>
|
||||
*
|
||||
* Connect to passt via UNIX domain socket, receive sockets via SCM_RIGHTS along
|
||||
* with byte commands mapping to TCP_REPAIR values, and switch repair mode on or
|
||||
* off. Reply by echoing the command. Exit on EOF.
|
||||
*/
|
||||
|
||||
#include <sys/inotify.h>
|
||||
#include <sys/prctl.h>
|
||||
#include <sys/types.h>
|
||||
#include <sys/socket.h>
|
||||
#include <sys/stat.h>
|
||||
#include <sys/un.h>
|
||||
#include <errno.h>
|
||||
#include <stdbool.h>
|
||||
#include <stddef.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <limits.h>
|
||||
#include <unistd.h>
|
||||
#include <netdb.h>
|
||||
|
||||
#include <netinet/tcp.h>
|
||||
|
||||
#include <linux/audit.h>
|
||||
#include <linux/capability.h>
|
||||
#include <linux/filter.h>
|
||||
#include <linux/seccomp.h>
|
||||
|
||||
#include "seccomp_repair.h"
|
||||
|
||||
#define SCM_MAX_FD 253 /* From Linux kernel (include/net/scm.h), not in UAPI */
|
||||
#define REPAIR_EXT ".repair"
|
||||
#define REPAIR_EXT_LEN strlen(REPAIR_EXT)
|
||||
|
||||
/**
|
||||
* main() - Entry point and whole program with loop
|
||||
* @argc: Argument count, must be 2
|
||||
* @argv: Argument: path of UNIX domain socket to connect to
|
||||
*
|
||||
* Return: 0 on success (EOF), 1 on error, 2 on usage error
|
||||
*
|
||||
* #syscalls:repair connect setsockopt write close exit_group
|
||||
* #syscalls:repair socket s390x:socketcall i686:socketcall
|
||||
* #syscalls:repair recvfrom recvmsg arm:recv ppc64le:recv
|
||||
* #syscalls:repair sendto sendmsg arm:send ppc64le:send
|
||||
* #syscalls:repair stat|statx stat64|statx statx
|
||||
* #syscalls:repair fstat|fstat64 newfstatat|fstatat64
|
||||
* #syscalls:repair inotify_init1 inotify_add_watch
|
||||
*/
|
||||
int main(int argc, char **argv)
|
||||
{
|
||||
char buf[CMSG_SPACE(sizeof(int) * SCM_MAX_FD)]
|
||||
__attribute__ ((aligned(__alignof__(struct cmsghdr))));
|
||||
struct sockaddr_un a = { AF_UNIX, "" };
|
||||
int fds[SCM_MAX_FD], s, ret, i, n = 0;
|
||||
bool inotify_dir = false;
|
||||
struct sock_fprog prog;
|
||||
int8_t cmd = INT8_MAX;
|
||||
struct cmsghdr *cmsg;
|
||||
struct msghdr msg;
|
||||
struct iovec iov;
|
||||
size_t cmsg_len;
|
||||
struct stat sb;
|
||||
int op;
|
||||
|
||||
prctl(PR_SET_DUMPABLE, 0);
|
||||
|
||||
prog.len = (unsigned short)sizeof(filter_repair) /
|
||||
sizeof(filter_repair[0]);
|
||||
prog.filter = filter_repair;
|
||||
if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) ||
|
||||
prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog)) {
|
||||
fprintf(stderr, "Failed to apply seccomp filter\n");
|
||||
_exit(1);
|
||||
}
|
||||
|
||||
iov = (struct iovec){ &cmd, sizeof(cmd) };
|
||||
msg = (struct msghdr){ .msg_name = NULL, .msg_namelen = 0,
|
||||
.msg_iov = &iov, .msg_iovlen = 1,
|
||||
.msg_control = buf,
|
||||
.msg_controllen = sizeof(buf),
|
||||
.msg_flags = 0 };
|
||||
cmsg = CMSG_FIRSTHDR(&msg);
|
||||
|
||||
if (argc != 2) {
|
||||
fprintf(stderr, "Usage: %s PATH\n", argv[0]);
|
||||
_exit(2);
|
||||
}
|
||||
|
||||
if ((s = socket(AF_UNIX, SOCK_STREAM, 0)) < 0) {
|
||||
fprintf(stderr, "Failed to create AF_UNIX socket: %i\n", errno);
|
||||
_exit(1);
|
||||
}
|
||||
|
||||
if ((stat(argv[1], &sb))) {
|
||||
fprintf(stderr, "Can't stat() %s: %i\n", argv[1], errno);
|
||||
_exit(1);
|
||||
}
|
||||
|
||||
if ((sb.st_mode & S_IFMT) == S_IFDIR) {
|
||||
char buf[sizeof(struct inotify_event) + NAME_MAX + 1]
|
||||
__attribute__ ((aligned(__alignof__(struct inotify_event))));
|
||||
const struct inotify_event *ev = NULL;
|
||||
char path[PATH_MAX + 1];
|
||||
bool found = false;
|
||||
ssize_t n;
|
||||
int fd;
|
||||
|
||||
if ((fd = inotify_init1(IN_CLOEXEC)) < 0) {
|
||||
fprintf(stderr, "inotify_init1: %i\n", errno);
|
||||
_exit(1);
|
||||
}
|
||||
|
||||
if (inotify_add_watch(fd, argv[1], IN_CREATE) < 0) {
|
||||
fprintf(stderr, "inotify_add_watch: %i\n", errno);
|
||||
_exit(1);
|
||||
}
|
||||
|
||||
do {
|
||||
char *p;
|
||||
|
||||
n = read(fd, buf, sizeof(buf));
|
||||
if (n < 0) {
|
||||
fprintf(stderr, "inotify read: %i\n", errno);
|
||||
_exit(1);
|
||||
}
|
||||
buf[n - 1] = '\0';
|
||||
|
||||
if (n < (ssize_t)sizeof(*ev)) {
|
||||
fprintf(stderr, "Short inotify read: %zi\n", n);
|
||||
continue;
|
||||
}
|
||||
|
||||
for (p = buf; p < buf + n; p += sizeof(*ev) + ev->len) {
|
||||
ev = (const struct inotify_event *)p;
|
||||
|
||||
if (ev->len >= REPAIR_EXT_LEN &&
|
||||
!memcmp(ev->name +
|
||||
strnlen(ev->name, ev->len) -
|
||||
REPAIR_EXT_LEN,
|
||||
REPAIR_EXT, REPAIR_EXT_LEN)) {
|
||||
found = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
} while (!found);
|
||||
|
||||
if (ev->len > NAME_MAX + 1 || ev->name[ev->len - 1] != '\0') {
|
||||
fprintf(stderr, "Invalid filename from inotify\n");
|
||||
_exit(1);
|
||||
}
|
||||
|
||||
snprintf(path, sizeof(path), "%s/%s", argv[1], ev->name);
|
||||
if ((stat(path, &sb))) {
|
||||
fprintf(stderr, "Can't stat() %s: %i\n", path, errno);
|
||||
_exit(1);
|
||||
}
|
||||
|
||||
ret = snprintf(a.sun_path, sizeof(a.sun_path), "%s", path);
|
||||
inotify_dir = true;
|
||||
} else {
|
||||
ret = snprintf(a.sun_path, sizeof(a.sun_path), "%s", argv[1]);
|
||||
}
|
||||
|
||||
if (ret <= 0 || ret >= (int)sizeof(a.sun_path)) {
|
||||
fprintf(stderr, "Invalid socket path\n");
|
||||
_exit(2);
|
||||
}
|
||||
|
||||
if ((sb.st_mode & S_IFMT) != S_IFSOCK) {
|
||||
fprintf(stderr, "%s is not a socket\n", a.sun_path);
|
||||
_exit(2);
|
||||
}
|
||||
|
||||
while (connect(s, (struct sockaddr *)&a, sizeof(a))) {
|
||||
if (inotify_dir && errno == ECONNREFUSED)
|
||||
continue;
|
||||
|
||||
fprintf(stderr, "Failed to connect to %s: %s\n", a.sun_path,
|
||||
strerror(errno));
|
||||
_exit(1);
|
||||
}
|
||||
|
||||
loop:
|
||||
ret = recvmsg(s, &msg, 0);
|
||||
if (ret < 0) {
|
||||
if (errno == ECONNRESET) {
|
||||
ret = 0;
|
||||
} else {
|
||||
fprintf(stderr, "Failed to read message: %i\n", errno);
|
||||
_exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
if (!ret) /* Done */
|
||||
_exit(0);
|
||||
|
||||
if (!cmsg ||
|
||||
cmsg->cmsg_len < CMSG_LEN(sizeof(int)) ||
|
||||
cmsg->cmsg_len > CMSG_LEN(sizeof(int) * SCM_MAX_FD) ||
|
||||
cmsg->cmsg_type != SCM_RIGHTS) {
|
||||
fprintf(stderr, "No/bad ancillary data from peer\n");
|
||||
_exit(1);
|
||||
}
|
||||
|
||||
/* No inverse formula for CMSG_LEN(x), and building one with CMSG_LEN(0)
|
||||
* works but there's no guarantee it does. Search the whole domain.
|
||||
*/
|
||||
for (i = 1; i <= SCM_MAX_FD; i++) {
|
||||
if (CMSG_LEN(sizeof(int) * i) == cmsg->cmsg_len) {
|
||||
n = i;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (!n) {
|
||||
cmsg_len = cmsg->cmsg_len; /* socklen_t is 'unsigned' on musl */
|
||||
fprintf(stderr, "Invalid ancillary data length %zu from peer\n",
|
||||
cmsg_len);
|
||||
_exit(1);
|
||||
}
|
||||
|
||||
memcpy(fds, CMSG_DATA(cmsg), sizeof(int) * n);
|
||||
|
||||
if (cmd != TCP_REPAIR_ON && cmd != TCP_REPAIR_OFF &&
|
||||
cmd != TCP_REPAIR_OFF_NO_WP) {
|
||||
fprintf(stderr, "Unsupported command 0x%04x\n", cmd);
|
||||
_exit(1);
|
||||
}
|
||||
|
||||
op = cmd;
|
||||
|
||||
for (i = 0; i < n; i++) {
|
||||
if (setsockopt(fds[i], SOL_TCP, TCP_REPAIR, &op, sizeof(op))) {
|
||||
fprintf(stderr,
|
||||
"Setting TCP_REPAIR to %i on socket %i: %s\n",
|
||||
op, fds[i], strerror(errno));
|
||||
_exit(1);
|
||||
}
|
||||
|
||||
/* Close _our_ copy */
|
||||
close(fds[i]);
|
||||
}
|
||||
|
||||
/* Confirm setting by echoing the command back */
|
||||
if (send(s, &cmd, sizeof(cmd), 0) < 0) {
|
||||
fprintf(stderr, "Reply to %i: %s\n", op, strerror(errno));
|
||||
_exit(1);
|
||||
}
|
||||
|
||||
goto loop;
|
||||
|
||||
return 0;
|
||||
}
|
176
passt.1
176
passt.1
|
@ -95,7 +95,7 @@ detached PID namespace after starting, because the PID itself cannot change.
|
|||
Default is to fork into background.
|
||||
|
||||
.TP
|
||||
.BR \-e ", " \-\-stderr " " (DEPRECATED)
|
||||
.BR \-e ", " \-\-stderr
|
||||
This option has no effect, and is maintained for compatibility purposes only.
|
||||
|
||||
Note that this configuration option is \fBdeprecated\fR and will be removed in a
|
||||
|
@ -160,9 +160,7 @@ once for IPv6).
|
|||
By default, assigned IPv4 and IPv6 addresses are taken from the host interfaces
|
||||
with the first default route, if any, for the corresponding IP version. If no
|
||||
default routes are available and there is any interface with any route for a
|
||||
given IP version, the first of these interfaces will be chosen instead. If no
|
||||
such interface exists, the link-local address 169.254.2.1 is assigned for IPv4,
|
||||
and no additional address will be assigned for IPv6.
|
||||
given IP version, the first of these interfaces will be chosen instead.
|
||||
|
||||
.TP
|
||||
.BR \-n ", " \-\-netmask " " \fImask
|
||||
|
@ -176,7 +174,8 @@ according to the CIDR block of the assigned address (RFC 4632).
|
|||
.BR \-M ", " \-\-mac-addr " " \fIaddr
|
||||
Use source MAC address \fIaddr\fR when communicating to the guest or to the
|
||||
target namespace.
|
||||
Default is the locally administered MAC addresses 9a:55:9a:55:9a:55.
|
||||
Default is to use the MAC address of the interface with the first IPv4 default
|
||||
route on the host.
|
||||
|
||||
.TP
|
||||
.BR \-g ", " \-\-gateway " " \fIaddr
|
||||
|
@ -189,9 +188,7 @@ first default route, if any, for the corresponding IP version. If the default
|
|||
route is a multipath one, the gateway is the first nexthop router returned by
|
||||
the kernel which has the highest weight in the set of paths. If no default
|
||||
routes are available and there is just one interface with any route, that
|
||||
interface will be chosen instead. If no such interface exists, the link-local
|
||||
address 169.254.2.2 is used for IPv4, and the link-local address fe80::1 is used
|
||||
for IPv6.
|
||||
interface will be chosen instead.
|
||||
|
||||
Note: these addresses are also used as source address for packets directed to
|
||||
the guest or to the target namespace having a loopback or local source address,
|
||||
|
@ -206,9 +203,7 @@ Default is to use the interfaces specified by \fB--outbound-if4\fR and
|
|||
|
||||
If no interfaces are given, the interface with the first default routes for each
|
||||
IP version is selected. If no default routes are available and there is just one
|
||||
interface with any route, that interface will be chosen instead. If no such
|
||||
interface exists, host interfaces will be ignored for the purposes of assigning
|
||||
addresses and routes, and link-local addresses will be used instead.
|
||||
interface with any route, that interface will be chosen instead.
|
||||
|
||||
.TP
|
||||
.BR \-o ", " \-\-outbound " " \fIaddr
|
||||
|
@ -227,8 +222,7 @@ derive IPv4 addresses and routes.
|
|||
|
||||
By default, the interface given by the default route is selected. If no default
|
||||
routes are available and there is just one interface with any route, that
|
||||
interface will be chosen instead. If no such interface exists, outbound sockets
|
||||
will not be bound to any specific interface.
|
||||
interface will be chosen instead.
|
||||
|
||||
.TP
|
||||
.BR \-\-outbound-if6 " " \fIname
|
||||
|
@ -238,8 +232,7 @@ derive IPv6 addresses and routes.
|
|||
|
||||
By default, the interface given by the default route is selected. If no default
|
||||
routes are available and there is just one interface with any route, that
|
||||
interface will be chosen instead. If no such interface exists, outbound sockets
|
||||
will not be bound to any specific interface.
|
||||
interface will be chosen instead.
|
||||
|
||||
.TP
|
||||
.BR \-D ", " \-\-dns " " \fIaddr
|
||||
|
@ -256,19 +249,10 @@ the host.
|
|||
.TP
|
||||
.BR \-\-dns-forward " " \fIaddr
|
||||
Map \fIaddr\fR (IPv4 or IPv6) as seen from guest or namespace to the
|
||||
nameserver (with corresponding IP version) specified by the
|
||||
\fB\-\-dns-host\fR option. Maps only UDP and TCP traffic to port 53 or
|
||||
port 853. Replies are translated back with a reverse mapping. This
|
||||
option can be specified zero to two times (once for IPv4, once for
|
||||
IPv6).
|
||||
|
||||
.TP
|
||||
.BR \-\-dns-host " " \fIaddr
|
||||
Configure the host nameserver which guest or namespace queries to the
|
||||
\fB\-\-dns-forward\fR address will be redirected to. This option can
|
||||
be specified zero to two times (once for IPv4, once for IPv6).
|
||||
By default, the first nameserver from the host's
|
||||
\fI/etc/resolv.conf\fR.
|
||||
first configured DNS resolver (with corresponding IP version). Maps
|
||||
only UDP and TCP traffic to port 53 or port 853. Replies are
|
||||
translated back with a reverse mapping. This option can be specified
|
||||
zero to two times (once for IPv4, once for IPv6).
|
||||
|
||||
.TP
|
||||
.BR \-S ", " \-\-search " " \fIlist
|
||||
|
@ -343,16 +327,6 @@ namespace will be silently dropped.
|
|||
Disable Router Advertisements. Router Solicitations coming from guest or target
|
||||
namespace will be ignored.
|
||||
|
||||
.TP
|
||||
.BR \-\-freebind
|
||||
Allow any binding address to be specified for \fB-t\fR and \fB-u\fR
|
||||
options. Usually binding addresses must be addresses currently
|
||||
configured on the host. With \fB\-\-freebind\fR, the
|
||||
\fBIP_FREEBIND\fR or \fBIPV6_FREEBIND\fR socket option is enabled
|
||||
allowing any address to be used. This is typically used to bind
|
||||
addresses which might be configured on the host in future, at which
|
||||
point the forwarding will immediately start operating.
|
||||
|
||||
.TP
|
||||
.BR \-\-map-host-loopback " " \fIaddr
|
||||
Translate \fIaddr\fR to refer to the host. Packets from the guest to
|
||||
|
@ -380,14 +354,14 @@ Translate \fIaddr\fR in the guest to be equal to the guest's assigned
|
|||
address on the host. That is, packets from the guest to \fIaddr\fR
|
||||
will be redirected to the address assigned to the guest with \fB-a\fR,
|
||||
or by default the host's global address. This allows the guest to
|
||||
access services available on the host's global address, even though its
|
||||
access services availble on the host's global address, even though its
|
||||
own address shadows that of the host.
|
||||
|
||||
If \fIaddr\fR is 'none', no address is mapped. Only one IPv4 and one
|
||||
IPv6 address can be translated, and if the option is specified
|
||||
multiple times, the last one for each address type takes effect.
|
||||
|
||||
By default, mapping happens as described for the \-\-map-host-loopback option.
|
||||
Default is no mapping.
|
||||
|
||||
.TP
|
||||
.BR \-4 ", " \-\-ipv4-only
|
||||
|
@ -401,44 +375,15 @@ Enable IPv6-only operation. IPv4 traffic will be ignored.
|
|||
By default, IPv4 operation is enabled as long as at least an IPv4 route and an
|
||||
interface address are configured on a given host interface.
|
||||
|
||||
.TP
|
||||
.BR \-H ", " \-\-hostname " " \fIname
|
||||
Hostname to configure the client with.
|
||||
Send \fIname\fR as DHCP option 12 (hostname).
|
||||
|
||||
.TP
|
||||
.BR \-\-fqdn " " \fIname
|
||||
FQDN to configure the client with.
|
||||
Send \fIname\fR as Client FQDN: DHCP option 81 and DHCPv6 option 39.
|
||||
|
||||
.SS \fBpasst\fR-only options
|
||||
|
||||
.TP
|
||||
.BR \-s ", " \-\-socket-path ", " \-\-socket " " \fIpath
|
||||
.BR \-s ", " \-\-socket " " \fIpath
|
||||
Path for UNIX domain socket used by \fBqemu\fR(1) or \fBqrap\fR(1) to connect to
|
||||
\fBpasst\fR.
|
||||
Default is to probe a free socket, not accepting connections, starting from
|
||||
\fI/tmp/passt_1.socket\fR to \fI/tmp/passt_64.socket\fR.
|
||||
|
||||
.TP
|
||||
.BR \-\-vhost-user
|
||||
Enable vhost-user. The vhost-user command socket is provided by \fB--socket\fR.
|
||||
|
||||
.TP
|
||||
.BR \-\-print-capabilities
|
||||
Print back-end capabilities in JSON format, only meaningful for vhost-user mode.
|
||||
|
||||
.TP
|
||||
.BR \-\-repair-path " " \fIpath
|
||||
Path for UNIX domain socket used by the \fBpasst-repair\fR(1) helper to connect
|
||||
to \fBpasst\fR in order to set or clear the TCP_REPAIR option on sockets, during
|
||||
migration. \fB--repair-path none\fR disables this interface (if you need to
|
||||
specify a socket path called "none" you can prefix the path by \fI./\fR).
|
||||
|
||||
Default, for \-\-vhost-user mode only, is to append \fI.repair\fR to the path
|
||||
chosen for the hypervisor UNIX domain socket. No socket is created if not in
|
||||
\-\-vhost-user mode.
|
||||
|
||||
.TP
|
||||
.BR \-F ", " \-\-fd " " \fIFD
|
||||
Pass a pre-opened, connected socket to \fBpasst\fR. Usually the socket is opened
|
||||
|
@ -540,7 +485,6 @@ Default is \fBnone\fR.
|
|||
.BR \-I ", " \-\-ns-ifname " " \fIname
|
||||
Name of tap interface to be created in target namespace.
|
||||
By default, the same interface name as the external, routable interface is used.
|
||||
If no such interface exists, the name \fItap0\fR will be used instead.
|
||||
|
||||
.TP
|
||||
.BR \-t ", " \-\-tcp-ports " " \fIspec
|
||||
|
@ -642,13 +586,6 @@ Configure UDP port forwarding from target namespace to init namespace.
|
|||
|
||||
Default is \fBauto\fR.
|
||||
|
||||
.TP
|
||||
.BR \-\-host-lo-to-ns-lo
|
||||
If specified, connections forwarded with \fB\-t\fR and \fB\-u\fR from
|
||||
the host's loopback address will appear on the loopback address in the
|
||||
guest as well. Without this option such forwarded packets will appear
|
||||
to come from the guest's public address.
|
||||
|
||||
.TP
|
||||
.BR \-\-userns " " \fIspec
|
||||
Target user namespace to join, as a path. If PID is given, without this option,
|
||||
|
@ -716,11 +653,6 @@ Configure MAC address \fIaddr\fR on the tap interface in the namespace.
|
|||
|
||||
Default is to let the tap driver build a pseudorandom hardware address.
|
||||
|
||||
.TP
|
||||
.BR \-\-no-splice
|
||||
Disable the bypass path for inbound, local traffic. See the section \fBHandling
|
||||
of local traffic in pasta\fR in the \fBNOTES\fR for more details.
|
||||
|
||||
.SH EXAMPLES
|
||||
|
||||
.SS \fBpasta
|
||||
|
@ -931,31 +863,26 @@ root@localhost's password:
|
|||
|
||||
.SH NOTES
|
||||
|
||||
.SS Handling of traffic with loopback destination and source addresses
|
||||
.SS Handling of traffic with local destination and source addresses
|
||||
|
||||
Both \fBpasst\fR and \fBpasta\fR can bind on ports with a loopback
|
||||
address (127.0.0.0/8 or ::1), depending on the configuration. Loopback
|
||||
destination or source addresses need to be changed before packets are
|
||||
delivered to the guest or target namespace: most operating systems
|
||||
would drop packets received with loopback addresses on non-loopback
|
||||
interfaces, and it would also be impossible for guest or target
|
||||
namespace to route answers back.
|
||||
Both \fBpasst\fR and \fBpasta\fR can bind on ports with a local address,
|
||||
depending on the configuration. Local destination or source addresses need to be
|
||||
changed before packets are delivered to the guest or target namespace: most
|
||||
operating systems would drop packets received from non-loopback interfaces with
|
||||
local addresses, and it would also be impossible for guest or target namespace
|
||||
to route answers back.
|
||||
|
||||
For convenience, the source address on these packets is translated to
|
||||
the address specified by the \fB\-\-map-host-loopback\fR option (with
|
||||
some exceptions in pasta mode, see next section below). If not
|
||||
specified this defaults, somewhat arbitrarily, to the address of
|
||||
default IPv4 or IPv6 gateway (if any) -- this is known to be an
|
||||
existing, valid address on the same subnet. If \fB\-\-no-map-gw\fR or
|
||||
\fB\-\-map-host-loopback none\fR are specified this translation is
|
||||
disabled and packets with loopback addresses are simply dropped.
|
||||
For convenience, and somewhat arbitrarily, the source address on these packets
|
||||
is translated to the address of the default IPv4 or IPv6 gateway (if any) --
|
||||
this is known to be an existing, valid address on the same subnet.
|
||||
|
||||
Loopback destination addresses are translated to the observed external
|
||||
address of the guest or target namespace. For IPv6, the observed
|
||||
link-local address is used if the translated source address is
|
||||
link-local, otherwise the observed global address is used. For both
|
||||
IPv4 and IPv6, if no addresses have been seen yet, the configured
|
||||
addresses will be used instead.
|
||||
Loopback destination addresses are instead translated to the observed external
|
||||
address of the guest or target namespace. For IPv6 packets, if usage of a
|
||||
link-local address by guest or namespace has ever been observed, and the
|
||||
original destination address is also a link-local address, the observed
|
||||
link-local address is used. Otherwise, the observed global address is used. For
|
||||
both IPv4 and IPv6, if no addresses have been seen yet, the configured addresses
|
||||
will be used instead.
|
||||
|
||||
For example, if \fBpasst\fR or \fBpasta\fR receive a connection from 127.0.0.1,
|
||||
with destination 127.0.0.10, and the default IPv4 gateway is 192.0.2.1, while
|
||||
|
@ -963,15 +890,11 @@ the last observed source address from guest or namespace is 192.0.2.2, this will
|
|||
be translated to a connection from 192.0.2.1 to 192.0.2.2.
|
||||
|
||||
Similarly, for traffic coming from guest or namespace, packets with destination
|
||||
address corresponding to the \fB\-\-map-host-loopback\fR address will have their
|
||||
destination address translated to a loopback address.
|
||||
|
||||
As an exception, traffic identified as DNS, originally directed to the
|
||||
\fB\-\-map-host-loopback\fR address, if this address matches a resolver address
|
||||
on the host, is \fBnot\fR translated to loopback, but rather handled in the same
|
||||
way as if specified as \-\-dns-forward address, if no such option was given.
|
||||
In the common case where the host gateway also acts as a resolver, this avoids that
|
||||
the host mapping shadows the gateway/resolver itself.
|
||||
address corresponding to the default gateway will have their destination address
|
||||
translated to a loopback address, if and only if a packet, in the opposite
|
||||
direction, with a loopback destination or source address, port-wise matching for
|
||||
UDP, or connection-wise for TCP, has been recently forwarded to guest or
|
||||
namespace. This behaviour can be disabled with \-\-no\-map\-gw.
|
||||
|
||||
.SS Handling of local traffic in pasta
|
||||
|
||||
|
@ -987,15 +910,8 @@ and the new socket using the \fBsplice\fR(2) system call, and for UDP, a pair
|
|||
of \fBrecvmmsg\fR(2) and \fBsendmmsg\fR(2) system calls deals with packet
|
||||
transfers.
|
||||
|
||||
Because it's not possible to bind sockets to foreign addresses, this
|
||||
bypass only applies to local connections and traffic. It also means
|
||||
that the address translation differs slightly from passt mode.
|
||||
Connections from loopback to loopback on the host will appear to come
|
||||
from the target namespace's public address within the guest, unless
|
||||
\fB\-\-host-lo-to-ns-lo\fR is specified, in which case they will
|
||||
appear to come from loopback in the namespace as well. The latter
|
||||
behaviour used to be the default, but is usually undesirable, since it
|
||||
can unintentionally expose namespace local services to the host.
|
||||
This bypass only applies to local connections and traffic, because it's not
|
||||
possible to bind sockets to foreign addresses.
|
||||
|
||||
.SS Binding to low numbered ports (well-known or system ports, up to 1023)
|
||||
|
||||
|
@ -1080,20 +996,6 @@ If the sending window cannot be queried, it will always be announced as the
|
|||
current sending buffer size to guest or target namespace. This might affect
|
||||
throughput of TCP connections.
|
||||
|
||||
.SS Local mode for disconnected setups
|
||||
|
||||
If \fBpasst\fR and \fBpasta\fR fail to find a host interface with a configured
|
||||
address, other than loopback addresses, they will, obviously, not attempt to
|
||||
source addresses or routes from the host.
|
||||
|
||||
In this case, unless configured otherwise, they will assign the IPv4 link-local
|
||||
address 169.254.2.1 to the guest or target namespace, and no IPv6 address. The
|
||||
notion of the guest or target namespace IPv6 address is derived from the first
|
||||
link-local address observed.
|
||||
|
||||
Default gateways will be assigned as the link-local address 169.254.2.2 for
|
||||
IPv4, and as the link-local address fe80::1 for IPv6.
|
||||
|
||||
.SH LIMITATIONS
|
||||
|
||||
Currently, IGMP/MLD proxying (RFC 4605) and support for SCTP (RFC 4960) are not
|
||||
|
|
115
passt.c
115
passt.c
|
@ -36,6 +36,9 @@
|
|||
#include <sys/prctl.h>
|
||||
#include <netinet/if_ether.h>
|
||||
#include <libgen.h>
|
||||
#ifdef HAS_GETRANDOM
|
||||
#include <sys/random.h>
|
||||
#endif
|
||||
|
||||
#include "util.h"
|
||||
#include "passt.h"
|
||||
|
@ -49,10 +52,6 @@
|
|||
#include "arch.h"
|
||||
#include "log.h"
|
||||
#include "tcp_splice.h"
|
||||
#include "ndp.h"
|
||||
#include "vu_common.h"
|
||||
#include "migrate.h"
|
||||
#include "repair.h"
|
||||
|
||||
#define EPOLL_EVENTS 8
|
||||
|
||||
|
@ -68,17 +67,13 @@ char *epoll_type_str[] = {
|
|||
[EPOLL_TYPE_TCP_LISTEN] = "listening TCP socket",
|
||||
[EPOLL_TYPE_TCP_TIMER] = "TCP timer",
|
||||
[EPOLL_TYPE_UDP_LISTEN] = "listening UDP socket",
|
||||
[EPOLL_TYPE_UDP] = "UDP flow socket",
|
||||
[EPOLL_TYPE_UDP_REPLY] = "UDP reply socket",
|
||||
[EPOLL_TYPE_PING] = "ICMP/ICMPv6 ping socket",
|
||||
[EPOLL_TYPE_NSQUIT_INOTIFY] = "namespace inotify watch",
|
||||
[EPOLL_TYPE_NSQUIT_TIMER] = "namespace timer watch",
|
||||
[EPOLL_TYPE_TAP_PASTA] = "/dev/net/tun device",
|
||||
[EPOLL_TYPE_TAP_PASST] = "connected qemu socket",
|
||||
[EPOLL_TYPE_TAP_LISTEN] = "listening qemu socket",
|
||||
[EPOLL_TYPE_VHOST_CMD] = "vhost-user command socket",
|
||||
[EPOLL_TYPE_VHOST_KICK] = "vhost-user kick socket",
|
||||
[EPOLL_TYPE_REPAIR_LISTEN] = "TCP_REPAIR helper listening socket",
|
||||
[EPOLL_TYPE_REPAIR] = "TCP_REPAIR helper socket",
|
||||
};
|
||||
static_assert(ARRAY_SIZE(epoll_type_str) == EPOLL_NUM_TYPES,
|
||||
"epoll_type_str[] doesn't match enum epoll_type");
|
||||
|
@ -115,25 +110,40 @@ static void post_handler(struct ctx *c, const struct timespec *now)
|
|||
|
||||
flow_defer_handler(c, now);
|
||||
#undef CALL_PROTO_HANDLER
|
||||
|
||||
if (!c->no_ndp)
|
||||
ndp_timer(c, now);
|
||||
}
|
||||
|
||||
/**
|
||||
* random_init() - Initialise things based on random data
|
||||
* secret_init() - Create secret value for SipHash calculations
|
||||
* @c: Execution context
|
||||
*/
|
||||
static void random_init(struct ctx *c)
|
||||
static void secret_init(struct ctx *c)
|
||||
{
|
||||
unsigned int seed;
|
||||
#ifndef HAS_GETRANDOM
|
||||
int dev_random = open("/dev/random", O_RDONLY);
|
||||
unsigned int random_read = 0;
|
||||
|
||||
/* Create secret value for SipHash calculations */
|
||||
raw_random(&c->hash_secret, sizeof(c->hash_secret));
|
||||
while (dev_random && random_read < sizeof(c->hash_secret)) {
|
||||
int ret = read(dev_random,
|
||||
(uint8_t *)&c->hash_secret + random_read,
|
||||
sizeof(c->hash_secret) - random_read);
|
||||
|
||||
/* Seed pseudo-RNG for things that need non-cryptographic random */
|
||||
raw_random(&seed, sizeof(seed));
|
||||
srandom(seed);
|
||||
if (ret == -1 && errno == EINTR)
|
||||
continue;
|
||||
|
||||
if (ret <= 0)
|
||||
break;
|
||||
|
||||
random_read += ret;
|
||||
}
|
||||
if (dev_random >= 0)
|
||||
close(dev_random);
|
||||
|
||||
if (random_read < sizeof(c->hash_secret))
|
||||
#else
|
||||
if (getrandom(&c->hash_secret, sizeof(c->hash_secret),
|
||||
GRND_RANDOM) < 0)
|
||||
#endif /* !HAS_GETRANDOM */
|
||||
die_perror("Failed to get random bytes for hash table and TCP");
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -166,11 +176,11 @@ void proto_update_l2_buf(const unsigned char *eth_d, const unsigned char *eth_s)
|
|||
*
|
||||
* #syscalls exit_group
|
||||
*/
|
||||
static void exit_handler(int signal)
|
||||
void exit_handler(int signal)
|
||||
{
|
||||
(void)signal;
|
||||
|
||||
_exit(EXIT_SUCCESS);
|
||||
exit(EXIT_SUCCESS);
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -184,27 +194,26 @@ static void exit_handler(int signal)
|
|||
* #syscalls socket getsockopt setsockopt s390x:socketcall i686:socketcall close
|
||||
* #syscalls bind connect recvfrom sendto shutdown
|
||||
* #syscalls arm:recv ppc64le:recv arm:send ppc64le:send
|
||||
* #syscalls accept4 accept listen epoll_ctl epoll_wait|epoll_pwait epoll_pwait
|
||||
* #syscalls accept4|accept listen epoll_ctl epoll_wait|epoll_pwait epoll_pwait
|
||||
* #syscalls clock_gettime arm:clock_gettime64 i686:clock_gettime64
|
||||
*/
|
||||
int main(int argc, char **argv)
|
||||
{
|
||||
struct epoll_event events[EPOLL_EVENTS];
|
||||
int nfds, i, devnull_fd = -1;
|
||||
char argv0[PATH_MAX], *name;
|
||||
struct ctx c = { 0 };
|
||||
struct rlimit limit;
|
||||
struct timespec now;
|
||||
struct sigaction sa;
|
||||
|
||||
if (clock_gettime(CLOCK_MONOTONIC, &log_start))
|
||||
die_perror("Failed to get CLOCK_MONOTONIC time");
|
||||
clock_gettime(CLOCK_MONOTONIC, &log_start);
|
||||
|
||||
arch_avx2_exec(argv);
|
||||
|
||||
isolate_initial(argc, argv);
|
||||
|
||||
c.pasta_netns_fd = c.fd_tap = c.pidfile_fd = -1;
|
||||
c.device_state_fd = -1;
|
||||
|
||||
sigemptyset(&sa.sa_mask);
|
||||
sa.sa_flags = 0;
|
||||
|
@ -212,18 +221,27 @@ int main(int argc, char **argv)
|
|||
sigaction(SIGTERM, &sa, NULL);
|
||||
sigaction(SIGQUIT, &sa, NULL);
|
||||
|
||||
c.mode = conf_mode(argc, argv);
|
||||
if (argc < 1)
|
||||
exit(EXIT_FAILURE);
|
||||
|
||||
if (c.mode == MODE_PASTA) {
|
||||
strncpy(argv0, argv[0], PATH_MAX - 1);
|
||||
name = basename(argv0);
|
||||
if (strstr(name, "pasta")) {
|
||||
sa.sa_handler = pasta_child_handler;
|
||||
if (sigaction(SIGCHLD, &sa, NULL))
|
||||
die_perror("Couldn't install signal handlers");
|
||||
|
||||
if (signal(SIGPIPE, SIG_IGN) == SIG_ERR)
|
||||
die_perror("Couldn't set disposition for SIGPIPE");
|
||||
|
||||
c.mode = MODE_PASTA;
|
||||
} else if (strstr(name, "passt")) {
|
||||
c.mode = MODE_PASST;
|
||||
} else {
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
|
||||
if (signal(SIGPIPE, SIG_IGN) == SIG_ERR)
|
||||
die_perror("Couldn't set disposition for SIGPIPE");
|
||||
|
||||
madvise(pkt_buf, sizeof(pkt_buf), MADV_HUGEPAGE);
|
||||
madvise(pkt_buf, TAP_BUF_BYTES, MADV_HUGEPAGE);
|
||||
|
||||
c.epollfd = epoll_create1(EPOLL_CLOEXEC);
|
||||
if (c.epollfd == -1)
|
||||
|
@ -243,17 +261,16 @@ int main(int argc, char **argv)
|
|||
|
||||
pasta_netns_quit_init(&c);
|
||||
|
||||
tap_backend_init(&c);
|
||||
tap_sock_init(&c);
|
||||
|
||||
random_init(&c);
|
||||
secret_init(&c);
|
||||
|
||||
if (clock_gettime(CLOCK_MONOTONIC, &now))
|
||||
die_perror("Failed to get CLOCK_MONOTONIC time");
|
||||
clock_gettime(CLOCK_MONOTONIC, &now);
|
||||
|
||||
flow_init();
|
||||
|
||||
if ((!c.no_udp && udp_init(&c)) || (!c.no_tcp && tcp_init(&c)))
|
||||
_exit(EXIT_FAILURE);
|
||||
exit(EXIT_FAILURE);
|
||||
|
||||
proto_update_l2_buf(c.guest_mac, c.our_tap_mac);
|
||||
|
||||
|
@ -290,15 +307,13 @@ int main(int argc, char **argv)
|
|||
timer_init(&c, &now);
|
||||
|
||||
loop:
|
||||
/* NOLINTBEGIN(bugprone-branch-clone): intervals can be the same */
|
||||
/* NOLINTNEXTLINE(bugprone-branch-clone): intervals can be the same */
|
||||
/* cppcheck-suppress [duplicateValueTernary, unmatchedSuppression] */
|
||||
nfds = epoll_wait(c.epollfd, events, EPOLL_EVENTS, TIMER_INTERVAL);
|
||||
/* NOLINTEND(bugprone-branch-clone) */
|
||||
if (nfds == -1 && errno != EINTR)
|
||||
die_perror("epoll_wait() failed in main loop");
|
||||
|
||||
if (clock_gettime(CLOCK_MONOTONIC, &now))
|
||||
err_perror("Failed to get CLOCK_MONOTONIC time");
|
||||
clock_gettime(CLOCK_MONOTONIC, &now);
|
||||
|
||||
for (i = 0; i < nfds; i++) {
|
||||
union epoll_ref ref = *((union epoll_ref *)&events[i].data.u64);
|
||||
|
@ -339,24 +354,12 @@ loop:
|
|||
case EPOLL_TYPE_UDP_LISTEN:
|
||||
udp_listen_sock_handler(&c, ref, eventmask, &now);
|
||||
break;
|
||||
case EPOLL_TYPE_UDP:
|
||||
udp_sock_handler(&c, ref, eventmask, &now);
|
||||
case EPOLL_TYPE_UDP_REPLY:
|
||||
udp_reply_sock_handler(&c, ref, eventmask, &now);
|
||||
break;
|
||||
case EPOLL_TYPE_PING:
|
||||
icmp_sock_handler(&c, ref);
|
||||
break;
|
||||
case EPOLL_TYPE_VHOST_CMD:
|
||||
vu_control_handler(c.vdev, c.fd_tap, eventmask);
|
||||
break;
|
||||
case EPOLL_TYPE_VHOST_KICK:
|
||||
vu_kick_cb(c.vdev, ref, &now);
|
||||
break;
|
||||
case EPOLL_TYPE_REPAIR_LISTEN:
|
||||
repair_listen_handler(&c, eventmask);
|
||||
break;
|
||||
case EPOLL_TYPE_REPAIR:
|
||||
repair_handler(&c, eventmask);
|
||||
break;
|
||||
default:
|
||||
/* Can't happen */
|
||||
ASSERT(0);
|
||||
|
@ -365,7 +368,5 @@ loop:
|
|||
|
||||
post_handler(&c, &now);
|
||||
|
||||
migrate_handler(&c);
|
||||
|
||||
goto loop;
|
||||
}
|
||||
|
|
51
passt.h
51
passt.h
|
@ -20,13 +20,11 @@ union epoll_ref;
|
|||
#include "siphash.h"
|
||||
#include "ip.h"
|
||||
#include "inany.h"
|
||||
#include "migrate.h"
|
||||
#include "flow.h"
|
||||
#include "icmp.h"
|
||||
#include "fwd.h"
|
||||
#include "tcp.h"
|
||||
#include "udp.h"
|
||||
#include "vhost_user.h"
|
||||
|
||||
/* Default address for our end on the tap interface. Bit 0 of byte 0 must be 0
|
||||
* (unicast) and bit 1 of byte 1 must be 1 (locally administered). Otherwise
|
||||
|
@ -45,7 +43,6 @@ union epoll_ref;
|
|||
* @icmp: ICMP-specific reference part
|
||||
* @data: Data handled by protocol handlers
|
||||
* @nsdir_fd: netns dirfd for fallback timer checking if namespace is gone
|
||||
* @queue: vhost-user queue index for this fd
|
||||
* @u64: Opaque reference for epoll_ctl() and epoll_wait()
|
||||
*/
|
||||
union epoll_ref {
|
||||
|
@ -61,7 +58,6 @@ union epoll_ref {
|
|||
union udp_listen_epoll_ref udp;
|
||||
uint32_t data;
|
||||
int nsdir_fd;
|
||||
int queue;
|
||||
};
|
||||
};
|
||||
uint64_t u64;
|
||||
|
@ -69,9 +65,12 @@ union epoll_ref {
|
|||
static_assert(sizeof(union epoll_ref) <= sizeof(union epoll_data),
|
||||
"epoll_ref must have same size as epoll_data");
|
||||
|
||||
/* Large enough for ~128 maximum size frames */
|
||||
#define PKT_BUF_BYTES (8UL << 20)
|
||||
#define TAP_BUF_BYTES \
|
||||
ROUND_DOWN(((ETH_MAX_MTU + sizeof(uint32_t)) * 128), PAGE_SIZE)
|
||||
#define TAP_MSGS \
|
||||
DIV_ROUND_UP(TAP_BUF_BYTES, ETH_ZLEN - 2 * ETH_ALEN + sizeof(uint32_t))
|
||||
|
||||
#define PKT_BUF_BYTES MAX(TAP_BUF_BYTES, 0)
|
||||
extern char pkt_buf [PKT_BUF_BYTES];
|
||||
|
||||
extern char *epoll_type_str[];
|
||||
|
@ -95,7 +94,6 @@ struct fqdn {
|
|||
enum passt_modes {
|
||||
MODE_PASST,
|
||||
MODE_PASTA,
|
||||
MODE_VU,
|
||||
};
|
||||
|
||||
/**
|
||||
|
@ -191,7 +189,6 @@ struct ip6_ctx {
|
|||
* @foreground: Run in foreground, don't log to stderr by default
|
||||
* @nofile: Maximum number of open files (ulimit -n)
|
||||
* @sock_path: Path for UNIX domain socket
|
||||
* @repair_path: TCP_REPAIR helper path, can be "none", empty for default
|
||||
* @pcap: Path for packet capture file
|
||||
* @pidfile: Path to PID file, empty string if not configured
|
||||
* @pidfile_fd: File descriptor for PID file, -1 if none
|
||||
|
@ -202,17 +199,13 @@ struct ip6_ctx {
|
|||
* @epollfd: File descriptor for epoll instance
|
||||
* @fd_tap_listen: File descriptor for listening AF_UNIX socket, if any
|
||||
* @fd_tap: AF_UNIX socket, tuntap device, or pre-opened socket
|
||||
* @fd_repair_listen: File descriptor for listening TCP_REPAIR socket, if any
|
||||
* @fd_repair: Connected AF_UNIX socket for TCP_REPAIR helper
|
||||
* @our_tap_mac: Pasta/passt's MAC on the tap link
|
||||
* @guest_mac: MAC address of guest or namespace, seen or configured
|
||||
* @hash_secret: 128-bit secret for siphash functions
|
||||
* @ifi4: Template interface for IPv4, -1: none, 0: IPv4 disabled
|
||||
* @ifi4: Index of template interface for IPv4, 0 if IPv4 disabled
|
||||
* @ip: IPv4 configuration
|
||||
* @dns_search: DNS search list
|
||||
* @hostname: Guest hostname
|
||||
* @fqdn: Guest FQDN
|
||||
* @ifi6: Template interface for IPv6, -1: none, 0: IPv6 disabled
|
||||
* @ifi6: Index of template interface for IPv6, 0 if IPv6 disabled
|
||||
* @ip6: IPv6 configuration
|
||||
* @pasta_ifn: Name of namespace interface for pasta
|
||||
* @pasta_ifi: Index of namespace interface for pasta
|
||||
|
@ -232,15 +225,8 @@ struct ip6_ctx {
|
|||
* @no_dhcpv6: Disable DHCPv6 server
|
||||
* @no_ndp: Disable NDP handler altogether
|
||||
* @no_ra: Disable router advertisements
|
||||
* @no_splice: Disable socket splicing for inbound traffic
|
||||
* @host_lo_to_ns_lo: Map host loopback addresses to ns loopback addresses
|
||||
* @freebind: Allow binding of non-local addresses for forwarding
|
||||
* @low_wmem: Low probed net.core.wmem_max
|
||||
* @low_rmem: Low probed net.core.rmem_max
|
||||
* @vdev: vhost-user device
|
||||
* @device_state_fd: Device state migration channel
|
||||
* @device_state_result: Device state migration result
|
||||
* @migrate_target: Are we the target, on the next migration request?
|
||||
*/
|
||||
struct ctx {
|
||||
enum passt_modes mode;
|
||||
|
@ -250,7 +236,6 @@ struct ctx {
|
|||
int foreground;
|
||||
int nofile;
|
||||
char sock_path[UNIX_PATH_MAX];
|
||||
char repair_path[UNIX_PATH_MAX];
|
||||
char pcap[PATH_MAX];
|
||||
|
||||
char pidfile[PATH_MAX];
|
||||
|
@ -267,23 +252,16 @@ struct ctx {
|
|||
int epollfd;
|
||||
int fd_tap_listen;
|
||||
int fd_tap;
|
||||
int fd_repair_listen;
|
||||
int fd_repair;
|
||||
unsigned char our_tap_mac[ETH_ALEN];
|
||||
unsigned char guest_mac[ETH_ALEN];
|
||||
uint16_t mtu;
|
||||
|
||||
uint64_t hash_secret[2];
|
||||
|
||||
int ifi4;
|
||||
unsigned int ifi4;
|
||||
struct ip4_ctx ip4;
|
||||
|
||||
struct fqdn dns_search[MAXDNSRCH];
|
||||
|
||||
char hostname[PASST_MAXDNAME];
|
||||
char fqdn[PASST_MAXDNAME];
|
||||
|
||||
int ifi6;
|
||||
unsigned int ifi6;
|
||||
struct ip6_ctx ip6;
|
||||
|
||||
char pasta_ifn[IF_NAMESIZE];
|
||||
|
@ -297,6 +275,7 @@ struct ctx {
|
|||
int no_icmp;
|
||||
struct icmp_ctx icmp;
|
||||
|
||||
int mtu;
|
||||
int no_dns;
|
||||
int no_dns_search;
|
||||
int no_dhcp_dns;
|
||||
|
@ -305,19 +284,9 @@ struct ctx {
|
|||
int no_dhcpv6;
|
||||
int no_ndp;
|
||||
int no_ra;
|
||||
int no_splice;
|
||||
int host_lo_to_ns_lo;
|
||||
int freebind;
|
||||
|
||||
int low_wmem;
|
||||
int low_rmem;
|
||||
|
||||
struct vu_dev *vdev;
|
||||
|
||||
/* Migration */
|
||||
int device_state_fd;
|
||||
int device_state_result;
|
||||
bool migrate_target;
|
||||
};
|
||||
|
||||
void proto_update_l2_buf(const unsigned char *eth_d,
|
||||
|
|
89
pasta.c
89
pasta.c
|
@ -57,13 +57,15 @@ int pasta_child_pid;
|
|||
|
||||
/**
|
||||
* pasta_child_handler() - Exit once shell exits (if we started it), reap clones
|
||||
* @signal: Signal number; this handler deals with SIGCHLD only
|
||||
* @signal: Unused, handler deals with SIGCHLD only
|
||||
*/
|
||||
void pasta_child_handler(int signal)
|
||||
{
|
||||
int errno_save = errno;
|
||||
siginfo_t infop;
|
||||
|
||||
(void)signal;
|
||||
|
||||
if (signal != SIGCHLD)
|
||||
return;
|
||||
|
||||
|
@ -71,12 +73,12 @@ void pasta_child_handler(int signal)
|
|||
!waitid(P_PID, pasta_child_pid, &infop, WEXITED | WNOHANG)) {
|
||||
if (infop.si_pid == pasta_child_pid) {
|
||||
if (infop.si_code == CLD_EXITED)
|
||||
_exit(infop.si_status);
|
||||
exit(infop.si_status);
|
||||
|
||||
/* If killed by a signal, si_status is the number.
|
||||
* Follow common shell convention of returning it + 128.
|
||||
*/
|
||||
_exit(infop.si_status + 128);
|
||||
exit(infop.si_status + 128);
|
||||
|
||||
/* Nothing to do, detached PID namespace going away */
|
||||
}
|
||||
|
@ -100,9 +102,7 @@ static int pasta_wait_for_ns(void *arg)
|
|||
int flags = O_RDONLY | O_CLOEXEC;
|
||||
char ns[PATH_MAX];
|
||||
|
||||
if (snprintf_check(ns, PATH_MAX, "/proc/%i/ns/net", pasta_child_pid))
|
||||
die_perror("Can't build netns path");
|
||||
|
||||
snprintf(ns, PATH_MAX, "/proc/%i/ns/net", pasta_child_pid);
|
||||
do {
|
||||
while ((c->pasta_netns_fd = open(ns, flags)) < 0) {
|
||||
if (errno != ENOENT)
|
||||
|
@ -167,12 +167,10 @@ void pasta_open_ns(struct ctx *c, const char *netns)
|
|||
* struct pasta_spawn_cmd_arg - Argument for pasta_spawn_cmd()
|
||||
* @exe: Executable to run
|
||||
* @argv: Command and arguments to run
|
||||
* @ctx: Context to read config from
|
||||
*/
|
||||
struct pasta_spawn_cmd_arg {
|
||||
const char *exe;
|
||||
char *const *argv;
|
||||
struct ctx *c;
|
||||
};
|
||||
|
||||
/**
|
||||
|
@ -186,7 +184,6 @@ static int pasta_spawn_cmd(void *arg)
|
|||
{
|
||||
char hostname[HOST_NAME_MAX + 1] = HOSTNAME_PREFIX;
|
||||
const struct pasta_spawn_cmd_arg *a;
|
||||
size_t conf_hostname_len;
|
||||
sigset_t set;
|
||||
|
||||
/* We run in a detached PID and mount namespace: mount /proc over */
|
||||
|
@ -196,15 +193,9 @@ static int pasta_spawn_cmd(void *arg)
|
|||
if (write_file("/proc/sys/net/ipv4/ping_group_range", "0 0"))
|
||||
warn("Cannot set ping_group_range, ICMP requests might fail");
|
||||
|
||||
a = (const struct pasta_spawn_cmd_arg *)arg;
|
||||
|
||||
conf_hostname_len = strlen(a->c->hostname);
|
||||
if (conf_hostname_len > 0) {
|
||||
if (sethostname(a->c->hostname, conf_hostname_len))
|
||||
warn("Unable to set configured hostname");
|
||||
} else if (!gethostname(hostname + sizeof(HOSTNAME_PREFIX) - 1,
|
||||
HOST_NAME_MAX + 1 - sizeof(HOSTNAME_PREFIX)) ||
|
||||
errno == ENAMETOOLONG) {
|
||||
if (!gethostname(hostname + sizeof(HOSTNAME_PREFIX) - 1,
|
||||
HOST_NAME_MAX + 1 - sizeof(HOSTNAME_PREFIX)) ||
|
||||
errno == ENAMETOOLONG) {
|
||||
hostname[HOST_NAME_MAX] = '\0';
|
||||
if (sethostname(hostname, strlen(hostname)))
|
||||
warn("Unable to set pasta-prefixed hostname");
|
||||
|
@ -215,6 +206,7 @@ static int pasta_spawn_cmd(void *arg)
|
|||
sigaddset(&set, SIGUSR1);
|
||||
sigwaitinfo(&set, NULL);
|
||||
|
||||
a = (const struct pasta_spawn_cmd_arg *)arg;
|
||||
execvp(a->exe, a->argv);
|
||||
|
||||
die_perror("Failed to start command or shell");
|
||||
|
@ -236,7 +228,6 @@ void pasta_start_ns(struct ctx *c, uid_t uid, gid_t gid,
|
|||
struct pasta_spawn_cmd_arg arg = {
|
||||
.exe = argv[0],
|
||||
.argv = argv,
|
||||
.c = c,
|
||||
};
|
||||
char uidmap[BUFSIZ], gidmap[BUFSIZ];
|
||||
char *sh_argv[] = { NULL, NULL };
|
||||
|
@ -248,11 +239,8 @@ void pasta_start_ns(struct ctx *c, uid_t uid, gid_t gid,
|
|||
c->quiet = 1;
|
||||
|
||||
/* Configure user and group mappings */
|
||||
if (snprintf_check(uidmap, BUFSIZ, "0 %u 1", uid))
|
||||
die_perror("Can't build uidmap");
|
||||
|
||||
if (snprintf_check(gidmap, BUFSIZ, "0 %u 1", gid))
|
||||
die_perror("Can't build gidmap");
|
||||
snprintf(uidmap, BUFSIZ, "0 %u 1", uid);
|
||||
snprintf(gidmap, BUFSIZ, "0 %u 1", gid);
|
||||
|
||||
if (write_file("/proc/self/uid_map", uidmap) ||
|
||||
write_file("/proc/self/setgroups", "deny") ||
|
||||
|
@ -303,7 +291,7 @@ void pasta_ns_conf(struct ctx *c)
|
|||
rc = nl_link_set_flags(nl_sock_ns, 1 /* lo */, IFF_UP, IFF_UP);
|
||||
if (rc < 0)
|
||||
die("Couldn't bring up loopback interface in namespace: %s",
|
||||
strerror_(-rc));
|
||||
strerror(-rc));
|
||||
|
||||
/* Get or set MAC in target namespace */
|
||||
if (MAC_IS_ZERO(c->guest_mac))
|
||||
|
@ -312,12 +300,12 @@ void pasta_ns_conf(struct ctx *c)
|
|||
rc = nl_link_set_mac(nl_sock_ns, c->pasta_ifi, c->guest_mac);
|
||||
if (rc < 0)
|
||||
die("Couldn't set MAC address in namespace: %s",
|
||||
strerror_(-rc));
|
||||
strerror(-rc));
|
||||
|
||||
if (c->pasta_conf_ns) {
|
||||
unsigned int flags = IFF_UP;
|
||||
|
||||
if (c->mtu)
|
||||
if (c->mtu != -1)
|
||||
nl_link_set_mtu(nl_sock_ns, c->pasta_ifi, c->mtu);
|
||||
|
||||
if (c->ifi6) /* Avoid duplicate address detection on link up */
|
||||
|
@ -339,7 +327,7 @@ void pasta_ns_conf(struct ctx *c)
|
|||
|
||||
if (rc < 0) {
|
||||
die("Couldn't set IPv4 address(es) in namespace: %s",
|
||||
strerror_(-rc));
|
||||
strerror(-rc));
|
||||
}
|
||||
|
||||
if (c->ip4.no_copy_routes) {
|
||||
|
@ -353,7 +341,7 @@ void pasta_ns_conf(struct ctx *c)
|
|||
|
||||
if (rc < 0) {
|
||||
die("Couldn't set IPv4 route(s) in guest: %s",
|
||||
strerror_(-rc));
|
||||
strerror(-rc));
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -362,13 +350,13 @@ void pasta_ns_conf(struct ctx *c)
|
|||
&c->ip6.addr_ll_seen);
|
||||
if (rc < 0) {
|
||||
warn("Can't get LL address from namespace: %s",
|
||||
strerror_(-rc));
|
||||
strerror(-rc));
|
||||
}
|
||||
|
||||
rc = nl_addr_set_ll_nodad(nl_sock_ns, c->pasta_ifi);
|
||||
if (rc < 0) {
|
||||
warn("Can't set nodad for LL in namespace: %s",
|
||||
strerror_(-rc));
|
||||
strerror(-rc));
|
||||
}
|
||||
|
||||
/* We dodged DAD: re-enable neighbour solicitations */
|
||||
|
@ -376,11 +364,8 @@ void pasta_ns_conf(struct ctx *c)
|
|||
0, IFF_NOARP);
|
||||
|
||||
if (c->ip6.no_copy_addrs) {
|
||||
if (!IN6_IS_ADDR_UNSPECIFIED(&c->ip6.addr)) {
|
||||
rc = nl_addr_set(nl_sock_ns,
|
||||
c->pasta_ifi, AF_INET6,
|
||||
&c->ip6.addr, 64);
|
||||
}
|
||||
rc = nl_addr_set(nl_sock_ns, c->pasta_ifi,
|
||||
AF_INET6, &c->ip6.addr, 64);
|
||||
} else {
|
||||
rc = nl_addr_dup(nl_sock, c->ifi6,
|
||||
nl_sock_ns, c->pasta_ifi,
|
||||
|
@ -389,7 +374,7 @@ void pasta_ns_conf(struct ctx *c)
|
|||
|
||||
if (rc < 0) {
|
||||
die("Couldn't set IPv6 address(es) in namespace: %s",
|
||||
strerror_(-rc));
|
||||
strerror(-rc));
|
||||
}
|
||||
|
||||
if (c->ip6.no_copy_routes) {
|
||||
|
@ -404,7 +389,7 @@ void pasta_ns_conf(struct ctx *c)
|
|||
|
||||
if (rc < 0) {
|
||||
die("Couldn't set IPv6 route(s) in guest: %s",
|
||||
strerror_(-rc));
|
||||
strerror(-rc));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -453,18 +438,18 @@ void pasta_netns_quit_init(const struct ctx *c)
|
|||
return;
|
||||
|
||||
if ((dir_fd = open(c->netns_dir, O_CLOEXEC | O_RDONLY)) < 0)
|
||||
die("netns dir open: %s, exiting", strerror_(errno));
|
||||
die("netns dir open: %s, exiting", strerror(errno));
|
||||
|
||||
if (fstatfs(dir_fd, &s) || s.f_type == DEVPTS_SUPER_MAGIC ||
|
||||
s.f_type == PROC_SUPER_MAGIC || s.f_type == SYSFS_MAGIC)
|
||||
try_inotify = false;
|
||||
|
||||
if (try_inotify && (fd = inotify_init1(flags)) < 0)
|
||||
warn("inotify_init1(): %s, use a timer", strerror_(errno));
|
||||
warn("inotify_init1(): %s, use a timer", strerror(errno));
|
||||
|
||||
if (fd >= 0 && inotify_add_watch(fd, c->netns_dir, IN_DELETE) < 0) {
|
||||
warn("inotify_add_watch(): %s, use a timer",
|
||||
strerror_(errno));
|
||||
strerror(errno));
|
||||
close(fd);
|
||||
fd = -1;
|
||||
}
|
||||
|
@ -496,23 +481,17 @@ void pasta_netns_quit_init(const struct ctx *c)
|
|||
*/
|
||||
void pasta_netns_quit_inotify_handler(struct ctx *c, int inotify_fd)
|
||||
{
|
||||
char buf[sizeof(struct inotify_event) + NAME_MAX + 1]
|
||||
__attribute__ ((aligned(__alignof__(struct inotify_event))));
|
||||
const struct inotify_event *ev;
|
||||
ssize_t n;
|
||||
char *p;
|
||||
char buf[sizeof(struct inotify_event) + NAME_MAX + 1];
|
||||
const struct inotify_event *in_ev = (struct inotify_event *)buf;
|
||||
|
||||
if ((n = read(inotify_fd, buf, sizeof(buf))) < (ssize_t)sizeof(*ev))
|
||||
if (read(inotify_fd, buf, sizeof(buf)) < (ssize_t)sizeof(*in_ev))
|
||||
return;
|
||||
|
||||
for (p = buf; p < buf + n; p += sizeof(*ev) + ev->len) {
|
||||
ev = (const struct inotify_event *)p;
|
||||
if (strncmp(in_ev->name, c->netns_base, sizeof(c->netns_base)))
|
||||
return;
|
||||
|
||||
if (!strncmp(ev->name, c->netns_base, sizeof(c->netns_base))) {
|
||||
info("Namespace %s is gone, exiting", c->netns_base);
|
||||
_exit(EXIT_SUCCESS);
|
||||
}
|
||||
}
|
||||
info("Namespace %s is gone, exiting", c->netns_base);
|
||||
exit(EXIT_SUCCESS);
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -538,7 +517,7 @@ void pasta_netns_quit_timer_handler(struct ctx *c, union epoll_ref ref)
|
|||
return;
|
||||
|
||||
info("Namespace %s is gone, exiting", c->netns_base);
|
||||
_exit(EXIT_SUCCESS);
|
||||
exit(EXIT_SUCCESS);
|
||||
}
|
||||
|
||||
close(fd);
|
||||
|
|
84
pcap.c
84
pcap.c
|
@ -33,12 +33,33 @@
|
|||
#include "log.h"
|
||||
#include "pcap.h"
|
||||
#include "iov.h"
|
||||
#include "tap.h"
|
||||
|
||||
#define PCAP_VERSION_MINOR 4
|
||||
|
||||
static int pcap_fd = -1;
|
||||
|
||||
/* See pcap.h from libpcap, or pcap-savefile(5) */
|
||||
static const struct {
|
||||
uint32_t magic;
|
||||
#define PCAP_MAGIC 0xa1b2c3d4
|
||||
|
||||
uint16_t major;
|
||||
#define PCAP_VERSION_MAJOR 2
|
||||
|
||||
uint16_t minor;
|
||||
#define PCAP_VERSION_MINOR 4
|
||||
|
||||
int32_t thiszone;
|
||||
uint32_t sigfigs;
|
||||
uint32_t snaplen;
|
||||
|
||||
uint32_t linktype;
|
||||
#define PCAP_LINKTYPE_ETHERNET 1
|
||||
} pcap_hdr = {
|
||||
PCAP_MAGIC, PCAP_VERSION_MAJOR, PCAP_VERSION_MINOR, 0, 0, ETH_MAX_MTU,
|
||||
PCAP_LINKTYPE_ETHERNET
|
||||
};
|
||||
|
||||
struct pcap_pkthdr {
|
||||
uint32_t tv_sec;
|
||||
uint32_t tv_usec;
|
||||
|
@ -52,6 +73,8 @@ struct pcap_pkthdr {
|
|||
* @iovcnt: Number of buffers (@iov entries) in frame
|
||||
* @offset: Byte offset of the L2 headers within @iov
|
||||
* @now: Timestamp
|
||||
*
|
||||
* Returns: 0 on success, -errno on error writing to the file
|
||||
*/
|
||||
static void pcap_frame(const struct iovec *iov, size_t iovcnt,
|
||||
size_t offset, const struct timespec *now)
|
||||
|
@ -63,8 +86,9 @@ static void pcap_frame(const struct iovec *iov, size_t iovcnt,
|
|||
.caplen = l2len,
|
||||
.len = l2len
|
||||
};
|
||||
struct iovec hiov = { &h, sizeof(h) };
|
||||
|
||||
if (write_all_buf(pcap_fd, &h, sizeof(h)) < 0 ||
|
||||
if (write_remainder(pcap_fd, &hiov, 1, 0) < 0 ||
|
||||
write_remainder(pcap_fd, iov, iovcnt, offset) < 0)
|
||||
debug_perror("Cannot log packet, length %zu", l2len);
|
||||
}
|
||||
|
@ -77,14 +101,12 @@ static void pcap_frame(const struct iovec *iov, size_t iovcnt,
|
|||
void pcap(const char *pkt, size_t l2len)
|
||||
{
|
||||
struct iovec iov = { (char *)pkt, l2len };
|
||||
struct timespec now = { 0 };
|
||||
struct timespec now;
|
||||
|
||||
if (pcap_fd == -1)
|
||||
return;
|
||||
|
||||
if (clock_gettime(CLOCK_REALTIME, &now))
|
||||
err_perror("Failed to get CLOCK_REALTIME time");
|
||||
|
||||
clock_gettime(CLOCK_REALTIME, &now);
|
||||
pcap_frame(&iov, 1, 0, &now);
|
||||
}
|
||||
|
||||
|
@ -98,38 +120,36 @@ void pcap(const char *pkt, size_t l2len)
|
|||
void pcap_multiple(const struct iovec *iov, size_t frame_parts, unsigned int n,
|
||||
size_t offset)
|
||||
{
|
||||
struct timespec now = { 0 };
|
||||
struct timespec now;
|
||||
unsigned int i;
|
||||
|
||||
if (pcap_fd == -1)
|
||||
return;
|
||||
|
||||
if (clock_gettime(CLOCK_REALTIME, &now))
|
||||
err_perror("Failed to get CLOCK_REALTIME time");
|
||||
clock_gettime(CLOCK_REALTIME, &now);
|
||||
|
||||
for (i = 0; i < n; i++)
|
||||
pcap_frame(iov + i * frame_parts, frame_parts, offset, &now);
|
||||
}
|
||||
|
||||
/**
|
||||
* pcap_iov() - Write packet data described by an I/O vector
|
||||
/*
|
||||
* pcap_iov - Write packet data described by an I/O vector
|
||||
* to a pcap file descriptor.
|
||||
*
|
||||
* @iov: Pointer to the array of struct iovec describing the I/O vector
|
||||
* containing packet data to write, including L2 header
|
||||
* @iovcnt: Number of buffers (@iov entries)
|
||||
* @offset: Offset of the L2 frame within the full data length
|
||||
*/
|
||||
void pcap_iov(const struct iovec *iov, size_t iovcnt, size_t offset)
|
||||
/* cppcheck-suppress unusedFunction */
|
||||
void pcap_iov(const struct iovec *iov, size_t iovcnt)
|
||||
{
|
||||
struct timespec now = { 0 };
|
||||
struct timespec now;
|
||||
|
||||
if (pcap_fd == -1)
|
||||
return;
|
||||
|
||||
if (clock_gettime(CLOCK_REALTIME, &now))
|
||||
err_perror("Failed to get CLOCK_REALTIME time");
|
||||
|
||||
pcap_frame(iov, iovcnt, offset, &now);
|
||||
clock_gettime(CLOCK_REALTIME, &now);
|
||||
pcap_frame(iov, iovcnt, 0, &now);
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -138,28 +158,7 @@ void pcap_iov(const struct iovec *iov, size_t iovcnt, size_t offset)
|
|||
*/
|
||||
void pcap_init(struct ctx *c)
|
||||
{
|
||||
/* See pcap.h from libpcap, or pcap-savefile(5) */
|
||||
#define PCAP_MAGIC 0xa1b2c3d4
|
||||
#define PCAP_VERSION_MAJOR 2
|
||||
#define PCAP_VERSION_MINOR 4
|
||||
#define PCAP_LINKTYPE_ETHERNET 1
|
||||
const struct {
|
||||
uint32_t magic;
|
||||
uint16_t major;
|
||||
uint16_t minor;
|
||||
|
||||
int32_t thiszone;
|
||||
uint32_t sigfigs;
|
||||
uint32_t snaplen;
|
||||
|
||||
uint32_t linktype;
|
||||
} pcap_hdr = {
|
||||
.magic = PCAP_MAGIC,
|
||||
.major = PCAP_VERSION_MAJOR,
|
||||
.minor = PCAP_VERSION_MINOR,
|
||||
.snaplen = tap_l2_max_len(c),
|
||||
.linktype = PCAP_LINKTYPE_ETHERNET
|
||||
};
|
||||
int flags = O_WRONLY | O_CREAT | O_TRUNC;
|
||||
|
||||
if (pcap_fd != -1)
|
||||
return;
|
||||
|
@ -167,9 +166,10 @@ void pcap_init(struct ctx *c)
|
|||
if (!*c->pcap)
|
||||
return;
|
||||
|
||||
pcap_fd = output_file_open(c->pcap, O_WRONLY);
|
||||
flags |= c->foreground ? O_CLOEXEC : 0;
|
||||
pcap_fd = open(c->pcap, flags, S_IRUSR | S_IWUSR);
|
||||
if (pcap_fd == -1) {
|
||||
err_perror("Couldn't open pcap file %s", c->pcap);
|
||||
perror("open");
|
||||
return;
|
||||
}
|
||||
|
||||
|
|
2
pcap.h
2
pcap.h
|
@ -9,7 +9,7 @@
|
|||
void pcap(const char *pkt, size_t l2len);
|
||||
void pcap_multiple(const struct iovec *iov, size_t frame_parts, unsigned int n,
|
||||
size_t offset);
|
||||
void pcap_iov(const struct iovec *iov, size_t iovcnt, size_t offset);
|
||||
void pcap_iov(const struct iovec *iov, size_t iovcnt);
|
||||
void pcap_init(struct ctx *c);
|
||||
|
||||
#endif /* PCAP_H */
|
||||
|
|
42
pif.c
42
pif.c
|
@ -59,45 +59,3 @@ void pif_sockaddr(const struct ctx *c, union sockaddr_inany *sa, socklen_t *sl,
|
|||
*sl = sizeof(sa->sa6);
|
||||
}
|
||||
}
|
||||
|
||||
/** pif_sock_l4() - Open a socket bound to an address on a specified interface
|
||||
* @c: Execution context
|
||||
* @type: Socket epoll type
|
||||
* @pif: Interface for this socket
|
||||
* @addr: Address to bind to, or NULL for dual-stack any
|
||||
* @ifname: Interface for binding, NULL for any
|
||||
* @port: Port number to bind to (host byte order)
|
||||
* @data: epoll reference portion for protocol handlers
|
||||
*
|
||||
* NOTE: For namespace pifs, this must be called having already entered the
|
||||
* relevant namespace.
|
||||
*
|
||||
* Return: newly created socket, negative error code on failure
|
||||
*/
|
||||
int pif_sock_l4(const struct ctx *c, enum epoll_type type, uint8_t pif,
|
||||
const union inany_addr *addr, const char *ifname,
|
||||
in_port_t port, uint32_t data)
|
||||
{
|
||||
union sockaddr_inany sa = {
|
||||
.sa6.sin6_family = AF_INET6,
|
||||
.sa6.sin6_addr = in6addr_any,
|
||||
.sa6.sin6_port = htons(port),
|
||||
};
|
||||
socklen_t sl;
|
||||
|
||||
ASSERT(pif_is_socket(pif));
|
||||
|
||||
if (pif == PIF_SPLICE) {
|
||||
/* Sanity checks */
|
||||
ASSERT(!ifname);
|
||||
ASSERT(addr && inany_is_loopback(addr));
|
||||
}
|
||||
|
||||
if (!addr)
|
||||
return sock_l4_sa(c, type, &sa, sizeof(sa.sa6),
|
||||
ifname, false, data);
|
||||
|
||||
pif_sockaddr(c, &sa, &sl, pif, addr, port);
|
||||
return sock_l4_sa(c, type, &sa, sl,
|
||||
ifname, sa.sa_family == AF_INET6, data);
|
||||
}
|
||||
|
|
3
pif.h
3
pif.h
|
@ -59,8 +59,5 @@ static inline bool pif_is_socket(uint8_t pif)
|
|||
|
||||
void pif_sockaddr(const struct ctx *c, union sockaddr_inany *sa, socklen_t *sl,
|
||||
uint8_t pif, const union inany_addr *addr, in_port_t port);
|
||||
int pif_sock_l4(const struct ctx *c, enum epoll_type type, uint8_t pif,
|
||||
const union inany_addr *addr, const char *ifname,
|
||||
in_port_t port, uint32_t data);
|
||||
|
||||
#endif /* PIF_H */
|
||||
|
|
273
repair.c
273
repair.c
|
@ -1,273 +0,0 @@
|
|||
// SPDX-License-Identifier: GPL-2.0-or-later
|
||||
|
||||
/* PASST - Plug A Simple Socket Transport
|
||||
* for qemu/UNIX domain socket mode
|
||||
*
|
||||
* PASTA - Pack A Subtle Tap Abstraction
|
||||
* for network namespace/tap device mode
|
||||
*
|
||||
* repair.c - Interface (server) for passt-repair, set/clear TCP_REPAIR
|
||||
*
|
||||
* Copyright (c) 2025 Red Hat GmbH
|
||||
* Author: Stefano Brivio <sbrivio@redhat.com>
|
||||
*/
|
||||
|
||||
#include <errno.h>
|
||||
#include <sys/socket.h>
|
||||
#include <sys/uio.h>
|
||||
|
||||
#include "util.h"
|
||||
#include "ip.h"
|
||||
#include "passt.h"
|
||||
#include "inany.h"
|
||||
#include "flow.h"
|
||||
#include "flow_table.h"
|
||||
|
||||
#include "repair.h"
|
||||
|
||||
#define SCM_MAX_FD 253 /* From Linux kernel (include/net/scm.h), not in UAPI */
|
||||
|
||||
/* Wait for a while for TCP_REPAIR helper to connect if it's not there yet */
|
||||
#define REPAIR_ACCEPT_TIMEOUT_MS 10
|
||||
#define REPAIR_ACCEPT_TIMEOUT_US (REPAIR_ACCEPT_TIMEOUT_MS * 1000)
|
||||
|
||||
/* Pending file descriptors for next repair_flush() call, or command change */
|
||||
static int repair_fds[SCM_MAX_FD];
|
||||
|
||||
/* Pending command: flush pending file descriptors if it changes */
|
||||
static int8_t repair_cmd;
|
||||
|
||||
/* Number of pending file descriptors set in @repair_fds */
|
||||
static int repair_nfds;
|
||||
|
||||
/**
|
||||
* repair_sock_init() - Start listening for connections on helper socket
|
||||
* @c: Execution context
|
||||
*/
|
||||
void repair_sock_init(const struct ctx *c)
|
||||
{
|
||||
union epoll_ref ref = { .type = EPOLL_TYPE_REPAIR_LISTEN };
|
||||
struct epoll_event ev = { 0 };
|
||||
|
||||
if (c->fd_repair_listen == -1)
|
||||
return;
|
||||
|
||||
if (listen(c->fd_repair_listen, 0)) {
|
||||
err_perror("listen() on repair helper socket, won't migrate");
|
||||
return;
|
||||
}
|
||||
|
||||
ref.fd = c->fd_repair_listen;
|
||||
ev.events = EPOLLIN | EPOLLHUP | EPOLLET;
|
||||
ev.data.u64 = ref.u64;
|
||||
if (epoll_ctl(c->epollfd, EPOLL_CTL_ADD, c->fd_repair_listen, &ev))
|
||||
err_perror("repair helper socket epoll_ctl(), won't migrate");
|
||||
}
|
||||
|
||||
/**
|
||||
* repair_listen_handler() - Handle events on TCP_REPAIR helper listening socket
|
||||
* @c: Execution context
|
||||
* @events: epoll events
|
||||
*
|
||||
* Return: 0 on valid event with new connected socket, error code on failure
|
||||
*/
|
||||
int repair_listen_handler(struct ctx *c, uint32_t events)
|
||||
{
|
||||
union epoll_ref ref = { .type = EPOLL_TYPE_REPAIR };
|
||||
struct epoll_event ev = { 0 };
|
||||
struct ucred ucred;
|
||||
socklen_t len;
|
||||
int rc;
|
||||
|
||||
if (events != EPOLLIN) {
|
||||
debug("Spurious event 0x%04x on TCP_REPAIR helper socket",
|
||||
events);
|
||||
return EINVAL;
|
||||
}
|
||||
|
||||
len = sizeof(ucred);
|
||||
|
||||
/* Another client is already connected: accept and close right away. */
|
||||
if (c->fd_repair != -1) {
|
||||
int discard = accept4(c->fd_repair_listen, NULL, NULL,
|
||||
SOCK_NONBLOCK);
|
||||
|
||||
if (discard == -1)
|
||||
return errno;
|
||||
|
||||
if (!getsockopt(discard, SOL_SOCKET, SO_PEERCRED, &ucred, &len))
|
||||
info("Discarding TCP_REPAIR helper, PID %i", ucred.pid);
|
||||
|
||||
close(discard);
|
||||
return EEXIST;
|
||||
}
|
||||
|
||||
if ((c->fd_repair = accept4(c->fd_repair_listen, NULL, NULL, 0)) < 0) {
|
||||
rc = errno;
|
||||
debug_perror("accept4() on TCP_REPAIR helper listening socket");
|
||||
return rc;
|
||||
}
|
||||
|
||||
if (!getsockopt(c->fd_repair, SOL_SOCKET, SO_PEERCRED, &ucred, &len))
|
||||
info("Accepted TCP_REPAIR helper, PID %i", ucred.pid);
|
||||
|
||||
ref.fd = c->fd_repair;
|
||||
ev.events = EPOLLHUP | EPOLLET;
|
||||
ev.data.u64 = ref.u64;
|
||||
if (epoll_ctl(c->epollfd, EPOLL_CTL_ADD, c->fd_repair, &ev)) {
|
||||
rc = errno;
|
||||
debug_perror("epoll_ctl() on TCP_REPAIR helper socket");
|
||||
close(c->fd_repair);
|
||||
c->fd_repair = -1;
|
||||
return rc;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* repair_close() - Close connection to TCP_REPAIR helper
|
||||
* @c: Execution context
|
||||
*/
|
||||
void repair_close(struct ctx *c)
|
||||
{
|
||||
debug("Closing TCP_REPAIR helper socket");
|
||||
|
||||
epoll_ctl(c->epollfd, EPOLL_CTL_DEL, c->fd_repair, NULL);
|
||||
close(c->fd_repair);
|
||||
c->fd_repair = -1;
|
||||
}
|
||||
|
||||
/**
 * repair_handler() - Handle events (EPOLLHUP, EPOLLERR) on helper socket
 * @c:		Execution context
 * @events:	epoll events, not inspected
 *
 * Whatever event is reported, the helper connection is closed.
 */
void repair_handler(struct ctx *c, uint32_t events)
{
	(void)events;	/* Unused: every event leads to the same action */

	repair_close(c);
}
|
||||
|
||||
/**
|
||||
* repair_wait() - Wait (with timeout) for TCP_REPAIR helper to connect
|
||||
* @c: Execution context
|
||||
*
|
||||
* Return: 0 on success or if already connected, error code on failure
|
||||
*/
|
||||
int repair_wait(struct ctx *c)
|
||||
{
|
||||
struct timeval tv = { .tv_sec = 0,
|
||||
.tv_usec = (long)(REPAIR_ACCEPT_TIMEOUT_US) };
|
||||
int rc;
|
||||
|
||||
static_assert(REPAIR_ACCEPT_TIMEOUT_US < 1000 * 1000,
|
||||
".tv_usec is greater than 1000 * 1000");
|
||||
|
||||
if (c->fd_repair >= 0)
|
||||
return 0;
|
||||
|
||||
if (c->fd_repair_listen == -1)
|
||||
return ENOENT;
|
||||
|
||||
if (setsockopt(c->fd_repair_listen, SOL_SOCKET, SO_RCVTIMEO,
|
||||
&tv, sizeof(tv))) {
|
||||
rc = errno;
|
||||
err_perror("Set timeout on TCP_REPAIR listening socket");
|
||||
return rc;
|
||||
}
|
||||
|
||||
rc = repair_listen_handler(c, EPOLLIN);
|
||||
|
||||
tv.tv_usec = 0;
|
||||
if (setsockopt(c->fd_repair_listen, SOL_SOCKET, SO_RCVTIMEO,
|
||||
&tv, sizeof(tv)))
|
||||
err_perror("Clear timeout on TCP_REPAIR listening socket");
|
||||
|
||||
return rc;
|
||||
}
|
||||
|
||||
/**
|
||||
* repair_flush() - Flush current set of sockets to helper, with current command
|
||||
* @c: Execution context
|
||||
*
|
||||
* Return: 0 on success, negative error code on failure
|
||||
*/
|
||||
int repair_flush(struct ctx *c)
|
||||
{
|
||||
char buf[CMSG_SPACE(sizeof(int) * SCM_MAX_FD)]
|
||||
__attribute__ ((aligned(__alignof__(struct cmsghdr)))) = { 0 };
|
||||
struct iovec iov = { &repair_cmd, sizeof(repair_cmd) };
|
||||
struct cmsghdr *cmsg;
|
||||
struct msghdr msg;
|
||||
int8_t reply;
|
||||
|
||||
if (!repair_nfds)
|
||||
return 0;
|
||||
|
||||
msg = (struct msghdr){ .msg_name = NULL, .msg_namelen = 0,
|
||||
.msg_iov = &iov, .msg_iovlen = 1,
|
||||
.msg_control = buf,
|
||||
.msg_controllen = CMSG_SPACE(sizeof(int) *
|
||||
repair_nfds),
|
||||
.msg_flags = 0 };
|
||||
cmsg = CMSG_FIRSTHDR(&msg);
|
||||
|
||||
cmsg->cmsg_level = SOL_SOCKET;
|
||||
cmsg->cmsg_type = SCM_RIGHTS;
|
||||
cmsg->cmsg_len = CMSG_LEN(sizeof(int) * repair_nfds);
|
||||
memcpy(CMSG_DATA(cmsg), repair_fds, sizeof(int) * repair_nfds);
|
||||
|
||||
repair_nfds = 0;
|
||||
|
||||
if (sendmsg(c->fd_repair, &msg, 0) < 0) {
|
||||
int ret = -errno;
|
||||
err_perror("Failed to send sockets to TCP_REPAIR helper");
|
||||
repair_close(c);
|
||||
return ret;
|
||||
}
|
||||
|
||||
if (recv(c->fd_repair, &reply, sizeof(reply), 0) < 0) {
|
||||
int ret = -errno;
|
||||
err_perror("Failed to receive reply from TCP_REPAIR helper");
|
||||
repair_close(c);
|
||||
return ret;
|
||||
}
|
||||
|
||||
if (reply != repair_cmd) {
|
||||
err("Unexpected reply from TCP_REPAIR helper: %d", reply);
|
||||
repair_close(c);
|
||||
return -ENXIO;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* repair_set() - Add socket to TCP_REPAIR set with given command
|
||||
* @c: Execution context
|
||||
* @s: Socket to add
|
||||
* @cmd: TCP_REPAIR_ON, TCP_REPAIR_OFF, or TCP_REPAIR_OFF_NO_WP
|
||||
*
|
||||
* Return: 0 on success, negative error code on failure
|
||||
*/
|
||||
int repair_set(struct ctx *c, int s, int cmd)
|
||||
{
|
||||
int rc;
|
||||
|
||||
if (repair_nfds && repair_cmd != cmd) {
|
||||
if ((rc = repair_flush(c)))
|
||||
return rc;
|
||||
}
|
||||
|
||||
repair_cmd = cmd;
|
||||
repair_fds[repair_nfds++] = s;
|
||||
|
||||
if (repair_nfds >= SCM_MAX_FD) {
|
||||
if ((rc = repair_flush(c)))
|
||||
return rc;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
17
repair.h
17
repair.h
|
@ -1,17 +0,0 @@
|
|||
/* SPDX-License-Identifier: GPL-2.0-or-later
|
||||
* Copyright (c) 2025 Red Hat GmbH
|
||||
* Author: Stefano Brivio <sbrivio@redhat.com>
|
||||
*/
|
||||
|
||||
#ifndef REPAIR_H
|
||||
#define REPAIR_H
|
||||
|
||||
void repair_sock_init(const struct ctx *c);
|
||||
int repair_listen_handler(struct ctx *c, uint32_t events);
|
||||
void repair_handler(struct ctx *c, uint32_t events);
|
||||
void repair_close(struct ctx *c);
|
||||
int repair_wait(struct ctx *c);
|
||||
int repair_flush(struct ctx *c);
|
||||
int repair_set(struct ctx *c, int s, int cmd);
|
||||
|
||||
#endif /* REPAIR_H */
|
22
seccomp.sh
22
seccomp.sh
|
@ -14,23 +14,12 @@
|
|||
# Author: Stefano Brivio <sbrivio@redhat.com>
|
||||
|
||||
TMP="$(mktemp)"
|
||||
OUT="$(mktemp)"
|
||||
OUT_FINAL="${1}"
|
||||
shift
|
||||
IN="$@"
|
||||
OUT="$(mktemp)"
|
||||
|
||||
[ -z "${ARCH}" ] && ARCH="$(uname -m)"
|
||||
[ -z "${CC}" ] && CC="cc"
|
||||
|
||||
AUDIT_ARCH="AUDIT_ARCH_$(echo ${ARCH} | tr [a-z] [A-Z] \
|
||||
| sed 's/^ARM.*/ARM/' \
|
||||
| sed 's/I[456]86/I386/' \
|
||||
| sed 's/PPC64/PPC/' \
|
||||
| sed 's/PPCLE/PPC64LE/' \
|
||||
| sed 's/MIPS64EL/MIPSEL64/' \
|
||||
| sed 's/HPPA/PARISC/' \
|
||||
| sed 's/SH4/SH/')"
|
||||
|
||||
HEADER="/* This file was automatically generated by $(basename ${0}) */
|
||||
|
||||
#ifndef AUDIT_ARCH_PPC64LE
|
||||
|
@ -43,7 +32,7 @@ struct sock_filter filter_@PROFILE@[] = {
|
|||
/* cppcheck-suppress [badBitmaskCheck, unmatchedSuppression] */
|
||||
BPF_STMT(BPF_LD | BPF_W | BPF_ABS,
|
||||
(offsetof(struct seccomp_data, arch))),
|
||||
BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, @AUDIT_ARCH@, 0, @KILL@),
|
||||
BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, PASST_AUDIT_ARCH, 0, @KILL@),
|
||||
/* cppcheck-suppress [badBitmaskCheck, unmatchedSuppression] */
|
||||
BPF_STMT(BPF_LD | BPF_W | BPF_ABS,
|
||||
(offsetof(struct seccomp_data, nr))),
|
||||
|
@ -244,8 +233,7 @@ gen_profile() {
|
|||
sub ${__i} CALL "NR:${__nr}" "NAME:${__name}" "ALLOW:${__allow}"
|
||||
done
|
||||
|
||||
finish PRE "PROFILE:${__profile}" "KILL:$(( __statements + 1))" \
|
||||
"AUDIT_ARCH:${AUDIT_ARCH}"
|
||||
finish PRE "PROFILE:${__profile}" "KILL:$(( __statements + 1))"
|
||||
}
|
||||
|
||||
printf '%s\n' "${HEADER}" > "${OUT}"
|
||||
|
@ -255,7 +243,7 @@ for __p in ${__profiles}; do
|
|||
__calls="${__calls} ${EXTRA_SYSCALLS:-}"
|
||||
__calls="$(filter ${__calls})"
|
||||
|
||||
cols="$(stty -a 2>/dev/null | sed -n 's/.*columns \([0-9]*\).*/\1/p' || :)" 2>/dev/null
|
||||
cols="$(stty -a | sed -n 's/.*columns \([0-9]*\).*/\1/p' || :)" 2>/dev/null
|
||||
case $cols in [0-9]*) col_args="-w ${cols}";; *) col_args="";; esac
|
||||
echo "seccomp profile ${__p} allows: ${__calls}" | tr '\n' ' ' | fmt -t ${col_args}
|
||||
|
||||
|
@ -270,4 +258,4 @@ for __p in ${__profiles}; do
|
|||
gen_profile "${__p}" ${__calls}
|
||||
done
|
||||
|
||||
mv "${OUT}" "${OUT_FINAL}"
|
||||
mv "${OUT}" seccomp.h
|
||||
|
|
|
@ -99,7 +99,7 @@ static inline void siphash_feed(struct siphash_state *state, uint64_t in)
|
|||
}
|
||||
|
||||
/**
|
||||
* siphash_final() - Finalize SipHash calculations
|
||||
* siphash_final - Finalize SipHash calculations
|
||||
* @v: siphash state (4 x 64-bit integers)
|
||||
* @len: Total length of input data
|
||||
* @tail: Final data for the hash (<= 7 bytes)
|
||||
|
|
446
tap.c
446
tap.c
|
@ -56,72 +56,16 @@
|
|||
#include "netlink.h"
|
||||
#include "pasta.h"
|
||||
#include "packet.h"
|
||||
#include "repair.h"
|
||||
#include "tap.h"
|
||||
#include "log.h"
|
||||
#include "vhost_user.h"
|
||||
#include "vu_common.h"
|
||||
|
||||
/* Maximum allowed frame lengths (including L2 header) */
|
||||
|
||||
/* Verify that an L2 frame length limit is large enough to contain the header,
|
||||
* but small enough to fit in the packet pool
|
||||
*/
|
||||
#define CHECK_FRAME_LEN(len) \
|
||||
static_assert((len) >= ETH_HLEN && (len) <= PACKET_MAX_LEN, \
|
||||
#len " has bad value")
|
||||
|
||||
CHECK_FRAME_LEN(L2_MAX_LEN_PASTA);
|
||||
CHECK_FRAME_LEN(L2_MAX_LEN_PASST);
|
||||
CHECK_FRAME_LEN(L2_MAX_LEN_VU);
|
||||
|
||||
/* We try to size the packet pools so that we can use a single batch for the entire
|
||||
* packet buffer. This might be exceeded for vhost-user, though, which uses its
|
||||
* own buffers rather than pkt_buf.
|
||||
*
|
||||
* This is just a tuning parameter, the code will work with slightly more
|
||||
* overhead if it's incorrect. So, we estimate based on the minimum practical
|
||||
* frame size - an empty UDP datagram - rather than the minimum theoretical
|
||||
* frame size.
|
||||
*
|
||||
* FIXME: Profile to work out how big this actually needs to be to amortise
|
||||
* per-batch syscall overheads
|
||||
*/
|
||||
#define TAP_MSGS_IP4 \
|
||||
DIV_ROUND_UP(sizeof(pkt_buf), \
|
||||
ETH_HLEN + sizeof(struct iphdr) + sizeof(struct udphdr))
|
||||
#define TAP_MSGS_IP6 \
|
||||
DIV_ROUND_UP(sizeof(pkt_buf), \
|
||||
ETH_HLEN + sizeof(struct ipv6hdr) + sizeof(struct udphdr))
|
||||
|
||||
/* IPv4 (plus ARP) and IPv6 message batches from tap/guest to IP handlers */
|
||||
static PACKET_POOL_NOINIT(pool_tap4, TAP_MSGS_IP4, pkt_buf);
|
||||
static PACKET_POOL_NOINIT(pool_tap6, TAP_MSGS_IP6, pkt_buf);
|
||||
static PACKET_POOL_NOINIT(pool_tap4, TAP_MSGS, pkt_buf);
|
||||
static PACKET_POOL_NOINIT(pool_tap6, TAP_MSGS, pkt_buf);
|
||||
|
||||
#define TAP_SEQS 128 /* Different L4 tuples in one batch */
|
||||
#define FRAGMENT_MSG_RATE 10 /* # seconds between fragment warnings */
|
||||
|
||||
/**
|
||||
* tap_l2_max_len() - Maximum frame size (including L2 header) for current mode
|
||||
* @c: Execution context
|
||||
*/
|
||||
unsigned long tap_l2_max_len(const struct ctx *c)
|
||||
{
|
||||
/* NOLINTBEGIN(bugprone-branch-clone): values can be the same */
|
||||
switch (c->mode) {
|
||||
case MODE_PASST:
|
||||
return L2_MAX_LEN_PASST;
|
||||
case MODE_PASTA:
|
||||
return L2_MAX_LEN_PASTA;
|
||||
case MODE_VU:
|
||||
return L2_MAX_LEN_VU;
|
||||
}
|
||||
/* NOLINTEND(bugprone-branch-clone) */
|
||||
ASSERT(0);
|
||||
|
||||
return 0; /* Unreachable, for cppcheck's sake */
|
||||
}
|
||||
|
||||
/**
|
||||
* tap_send_single() - Send a single frame
|
||||
* @c: Execution context
|
||||
|
@ -134,22 +78,16 @@ void tap_send_single(const struct ctx *c, const void *data, size_t l2len)
|
|||
struct iovec iov[2];
|
||||
size_t iovcnt = 0;
|
||||
|
||||
switch (c->mode) {
|
||||
case MODE_PASST:
|
||||
if (c->mode == MODE_PASST) {
|
||||
iov[iovcnt] = IOV_OF_LVALUE(vnet_len);
|
||||
iovcnt++;
|
||||
/* fall through */
|
||||
case MODE_PASTA:
|
||||
iov[iovcnt].iov_base = (void *)data;
|
||||
iov[iovcnt].iov_len = l2len;
|
||||
iovcnt++;
|
||||
|
||||
tap_send_frames(c, iov, iovcnt, 1);
|
||||
break;
|
||||
case MODE_VU:
|
||||
vu_send_single(c, data, l2len);
|
||||
break;
|
||||
}
|
||||
|
||||
iov[iovcnt].iov_base = (void *)data;
|
||||
iov[iovcnt].iov_len = l2len;
|
||||
iovcnt++;
|
||||
|
||||
tap_send_frames(c, iov, iovcnt, 1);
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -175,7 +113,7 @@ const struct in6_addr *tap_ip6_daddr(const struct ctx *c,
|
|||
*
|
||||
* Return: pointer at which to write the packet's payload
|
||||
*/
|
||||
void *tap_push_l2h(const struct ctx *c, void *buf, uint16_t proto)
|
||||
static void *tap_push_l2h(const struct ctx *c, void *buf, uint16_t proto)
|
||||
{
|
||||
struct ethhdr *eh = (struct ethhdr *)buf;
|
||||
|
||||
|
@ -196,8 +134,8 @@ void *tap_push_l2h(const struct ctx *c, void *buf, uint16_t proto)
|
|||
*
|
||||
* Return: pointer at which to write the packet's payload
|
||||
*/
|
||||
void *tap_push_ip4h(struct iphdr *ip4h, struct in_addr src,
|
||||
struct in_addr dst, size_t l4len, uint8_t proto)
|
||||
static void *tap_push_ip4h(struct iphdr *ip4h, struct in_addr src,
|
||||
struct in_addr dst, size_t l4len, uint8_t proto)
|
||||
{
|
||||
uint16_t l3len = l4len + sizeof(*ip4h);
|
||||
|
||||
|
@ -206,43 +144,13 @@ void *tap_push_ip4h(struct iphdr *ip4h, struct in_addr src,
|
|||
ip4h->tos = 0;
|
||||
ip4h->tot_len = htons(l3len);
|
||||
ip4h->id = 0;
|
||||
ip4h->frag_off = htons(IP_DF);
|
||||
ip4h->frag_off = 0;
|
||||
ip4h->ttl = 255;
|
||||
ip4h->protocol = proto;
|
||||
ip4h->saddr = src.s_addr;
|
||||
ip4h->daddr = dst.s_addr;
|
||||
ip4h->check = csum_ip4_header(l3len, proto, src, dst);
|
||||
return (char *)ip4h + sizeof(*ip4h);
|
||||
}
|
||||
|
||||
/**
|
||||
* tap_push_uh4() - Build UDPv4 header with checksum
|
||||
* @c: Execution context
|
||||
* @src: IPv4 source address
|
||||
* @sport: UDP source port
|
||||
* @dst: IPv4 destination address
|
||||
* @dport: UDP destination port
|
||||
* @in: UDP payload contents (not including UDP header)
|
||||
* @dlen: UDP payload length (not including UDP header)
|
||||
*
|
||||
* Return: pointer at which to write the packet's payload
|
||||
*/
|
||||
void *tap_push_uh4(struct udphdr *uh, struct in_addr src, in_port_t sport,
|
||||
struct in_addr dst, in_port_t dport,
|
||||
const void *in, size_t dlen)
|
||||
{
|
||||
size_t l4len = dlen + sizeof(struct udphdr);
|
||||
const struct iovec iov = {
|
||||
.iov_base = (void *)in,
|
||||
.iov_len = dlen
|
||||
};
|
||||
struct iov_tail payload = IOV_TAIL(&iov, 1, 0);
|
||||
|
||||
uh->source = htons(sport);
|
||||
uh->dest = htons(dport);
|
||||
uh->len = htons(l4len);
|
||||
csum_udp4(uh, src, dst, &payload);
|
||||
return (char *)uh + sizeof(*uh);
|
||||
return ip4h + 1;
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -252,7 +160,7 @@ void *tap_push_uh4(struct udphdr *uh, struct in_addr src, in_port_t sport,
|
|||
* @sport: UDP source port
|
||||
* @dst: IPv4 destination address
|
||||
* @dport: UDP destination port
|
||||
* @in: UDP payload contents (not including UDP header)
|
||||
* @in: UDP payload contents (not including UDP header)
|
||||
* @dlen: UDP payload length (not including UDP header)
|
||||
*/
|
||||
void tap_udp4_send(const struct ctx *c, struct in_addr src, in_port_t sport,
|
||||
|
@ -263,9 +171,14 @@ void tap_udp4_send(const struct ctx *c, struct in_addr src, in_port_t sport,
|
|||
char buf[USHRT_MAX];
|
||||
struct iphdr *ip4h = tap_push_l2h(c, buf, ETH_P_IP);
|
||||
struct udphdr *uh = tap_push_ip4h(ip4h, src, dst, l4len, IPPROTO_UDP);
|
||||
char *data = tap_push_uh4(uh, src, sport, dst, dport, in, dlen);
|
||||
char *data = (char *)(uh + 1);
|
||||
|
||||
uh->source = htons(sport);
|
||||
uh->dest = htons(dport);
|
||||
uh->len = htons(l4len);
|
||||
csum_udp4(uh, src, dst, in, dlen);
|
||||
memcpy(data, in, dlen);
|
||||
|
||||
tap_send_single(c, buf, dlen + (data - buf));
|
||||
}
|
||||
|
||||
|
@ -302,9 +215,10 @@ void tap_icmp4_send(const struct ctx *c, struct in_addr src, struct in_addr dst,
|
|||
*
|
||||
* Return: pointer at which to write the packet's payload
|
||||
*/
|
||||
void *tap_push_ip6h(struct ipv6hdr *ip6h,
|
||||
const struct in6_addr *src, const struct in6_addr *dst,
|
||||
size_t l4len, uint8_t proto, uint32_t flow)
|
||||
static void *tap_push_ip6h(struct ipv6hdr *ip6h,
|
||||
const struct in6_addr *src,
|
||||
const struct in6_addr *dst,
|
||||
size_t l4len, uint8_t proto, uint32_t flow)
|
||||
{
|
||||
ip6h->payload_len = htons(l4len);
|
||||
ip6h->priority = 0;
|
||||
|
@ -313,40 +227,10 @@ void *tap_push_ip6h(struct ipv6hdr *ip6h,
|
|||
ip6h->hop_limit = 255;
|
||||
ip6h->saddr = *src;
|
||||
ip6h->daddr = *dst;
|
||||
ip6_set_flow_lbl(ip6h, flow);
|
||||
return (char *)ip6h + sizeof(*ip6h);
|
||||
}
|
||||
|
||||
/**
|
||||
* tap_push_uh6() - Build UDPv6 header with checksum
|
||||
* @c: Execution context
|
||||
* @src: IPv6 source address
|
||||
* @sport: UDP source port
|
||||
* @dst: IPv6 destination address
|
||||
* @dport: UDP destination port
|
||||
* @flow: Flow label
|
||||
* @in: UDP payload contents (not including UDP header)
|
||||
* @dlen: UDP payload length (not including UDP header)
|
||||
*
|
||||
* Return: pointer at which to write the packet's payload
|
||||
*/
|
||||
void *tap_push_uh6(struct udphdr *uh,
|
||||
const struct in6_addr *src, in_port_t sport,
|
||||
const struct in6_addr *dst, in_port_t dport,
|
||||
void *in, size_t dlen)
|
||||
{
|
||||
size_t l4len = dlen + sizeof(struct udphdr);
|
||||
const struct iovec iov = {
|
||||
.iov_base = in,
|
||||
.iov_len = dlen
|
||||
};
|
||||
struct iov_tail payload = IOV_TAIL(&iov, 1, 0);
|
||||
|
||||
uh->source = htons(sport);
|
||||
uh->dest = htons(dport);
|
||||
uh->len = htons(l4len);
|
||||
csum_udp6(uh, src, dst, &payload);
|
||||
return (char *)uh + sizeof(*uh);
|
||||
ip6h->flow_lbl[0] = (flow >> 16) & 0xf;
|
||||
ip6h->flow_lbl[1] = (flow >> 8) & 0xff;
|
||||
ip6h->flow_lbl[2] = (flow >> 0) & 0xff;
|
||||
return ip6h + 1;
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -357,22 +241,27 @@ void *tap_push_uh6(struct udphdr *uh,
|
|||
* @dst: IPv6 destination address
|
||||
* @dport: UDP destination port
|
||||
* @flow: Flow label
|
||||
* @in: UDP payload contents (not including UDP header)
|
||||
* @in: UDP payload contents (not including UDP header)
|
||||
* @dlen: UDP payload length (not including UDP header)
|
||||
*/
|
||||
void tap_udp6_send(const struct ctx *c,
|
||||
const struct in6_addr *src, in_port_t sport,
|
||||
const struct in6_addr *dst, in_port_t dport,
|
||||
uint32_t flow, void *in, size_t dlen)
|
||||
uint32_t flow, const void *in, size_t dlen)
|
||||
{
|
||||
size_t l4len = dlen + sizeof(struct udphdr);
|
||||
char buf[USHRT_MAX];
|
||||
struct ipv6hdr *ip6h = tap_push_l2h(c, buf, ETH_P_IPV6);
|
||||
struct udphdr *uh = tap_push_ip6h(ip6h, src, dst,
|
||||
l4len, IPPROTO_UDP, flow);
|
||||
char *data = tap_push_uh6(uh, src, sport, dst, dport, in, dlen);
|
||||
char *data = (char *)(uh + 1);
|
||||
|
||||
uh->source = htons(sport);
|
||||
uh->dest = htons(dport);
|
||||
uh->len = htons(l4len);
|
||||
csum_udp6(uh, src, dst, in, dlen);
|
||||
memcpy(data, in, dlen);
|
||||
|
||||
tap_send_single(c, buf, dlen + (data - buf));
|
||||
}
|
||||
|
||||
|
@ -517,18 +406,10 @@ size_t tap_send_frames(const struct ctx *c, const struct iovec *iov,
|
|||
if (!nframes)
|
||||
return 0;
|
||||
|
||||
switch (c->mode) {
|
||||
case MODE_PASTA:
|
||||
if (c->mode == MODE_PASTA)
|
||||
m = tap_send_frames_pasta(c, iov, bufs_per_frame, nframes);
|
||||
break;
|
||||
case MODE_PASST:
|
||||
else
|
||||
m = tap_send_frames_passt(c, iov, bufs_per_frame, nframes);
|
||||
break;
|
||||
case MODE_VU:
|
||||
/* fall through */
|
||||
default:
|
||||
ASSERT(0);
|
||||
}
|
||||
|
||||
if (m < nframes)
|
||||
debug("tap: failed to send %zu frames of %zu",
|
||||
|
@ -561,7 +442,6 @@ PACKET_POOL_DECL(pool_l4, UIO_MAXIOV, pkt_buf);
|
|||
* struct l4_seq4_t - Message sequence for one protocol handler call, IPv4
|
||||
* @msgs: Count of messages in sequence
|
||||
* @protocol: Protocol number
|
||||
* @ttl: Time to live
|
||||
* @source: Source port
|
||||
* @dest: Destination port
|
||||
* @saddr: Source address
|
||||
|
@ -570,7 +450,6 @@ PACKET_POOL_DECL(pool_l4, UIO_MAXIOV, pkt_buf);
|
|||
*/
|
||||
static struct tap4_l4_t {
|
||||
uint8_t protocol;
|
||||
uint8_t ttl;
|
||||
|
||||
uint16_t source;
|
||||
uint16_t dest;
|
||||
|
@ -585,17 +464,14 @@ static struct tap4_l4_t {
|
|||
* struct l4_seq6_t - Message sequence for one protocol handler call, IPv6
|
||||
* @msgs: Count of messages in sequence
|
||||
* @protocol: Protocol number
|
||||
* @flow_lbl: IPv6 flow label
|
||||
* @source: Source port
|
||||
* @dest: Destination port
|
||||
* @saddr: Source address
|
||||
* @daddr: Destination address
|
||||
* @hop_limit: Hop limit
|
||||
* @msg: Array of messages that can be handled in a single call
|
||||
*/
|
||||
static struct tap6_l4_t {
|
||||
uint8_t protocol;
|
||||
uint32_t flow_lbl :20;
|
||||
|
||||
uint16_t source;
|
||||
uint16_t dest;
|
||||
|
@ -603,8 +479,6 @@ static struct tap6_l4_t {
|
|||
struct in6_addr saddr;
|
||||
struct in6_addr daddr;
|
||||
|
||||
uint8_t hop_limit;
|
||||
|
||||
struct pool_l4_t p;
|
||||
} tap6_l4[TAP_SEQS /* Arbitrary: TAP_MSGS in theory, so limit in users */];
|
||||
|
||||
|
@ -793,8 +667,7 @@ resume:
|
|||
#define L4_MATCH(iph, uh, seq) \
|
||||
((seq)->protocol == (iph)->protocol && \
|
||||
(seq)->source == (uh)->source && (seq)->dest == (uh)->dest && \
|
||||
(seq)->saddr.s_addr == (iph)->saddr && \
|
||||
(seq)->daddr.s_addr == (iph)->daddr && (seq)->ttl == (iph)->ttl)
|
||||
(seq)->saddr.s_addr == (iph)->saddr && (seq)->daddr.s_addr == (iph)->daddr)
|
||||
|
||||
#define L4_SET(iph, uh, seq) \
|
||||
do { \
|
||||
|
@ -803,7 +676,6 @@ resume:
|
|||
(seq)->dest = (uh)->dest; \
|
||||
(seq)->saddr.s_addr = (iph)->saddr; \
|
||||
(seq)->daddr.s_addr = (iph)->daddr; \
|
||||
(seq)->ttl = (iph)->ttl; \
|
||||
} while (0)
|
||||
|
||||
if (seq && L4_MATCH(iph, uh, seq) && seq->p.count < UIO_MAXIOV)
|
||||
|
@ -845,14 +717,14 @@ append:
|
|||
for (k = 0; k < p->count; )
|
||||
k += tcp_tap_handler(c, PIF_TAP, AF_INET,
|
||||
&seq->saddr, &seq->daddr,
|
||||
0, p, k, now);
|
||||
p, k, now);
|
||||
} else if (seq->protocol == IPPROTO_UDP) {
|
||||
if (c->no_udp)
|
||||
continue;
|
||||
for (k = 0; k < p->count; )
|
||||
k += udp_tap_handler(c, PIF_TAP, AF_INET,
|
||||
&seq->saddr, &seq->daddr,
|
||||
seq->ttl, p, k, now);
|
||||
p, k, now);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -923,9 +795,6 @@ resume:
|
|||
if (IN6_IS_ADDR_UNSPECIFIED(&c->ip6.addr_seen)) {
|
||||
c->ip6.addr_seen = *saddr;
|
||||
}
|
||||
|
||||
if (IN6_IS_ADDR_UNSPECIFIED(&c->ip6.addr))
|
||||
c->ip6.addr = *saddr;
|
||||
} else if (!IN6_IS_ADDR_UNSPECIFIED(saddr)){
|
||||
c->ip6.addr_seen = *saddr;
|
||||
}
|
||||
|
@ -973,20 +842,16 @@ resume:
|
|||
((seq)->protocol == (proto) && \
|
||||
(seq)->source == (uh)->source && \
|
||||
(seq)->dest == (uh)->dest && \
|
||||
(seq)->flow_lbl == ip6_get_flow_lbl(ip6h) && \
|
||||
IN6_ARE_ADDR_EQUAL(&(seq)->saddr, saddr) && \
|
||||
IN6_ARE_ADDR_EQUAL(&(seq)->daddr, daddr) && \
|
||||
(seq)->hop_limit == (ip6h)->hop_limit)
|
||||
IN6_ARE_ADDR_EQUAL(&(seq)->daddr, daddr))
|
||||
|
||||
#define L4_SET(ip6h, proto, uh, seq) \
|
||||
do { \
|
||||
(seq)->protocol = (proto); \
|
||||
(seq)->source = (uh)->source; \
|
||||
(seq)->dest = (uh)->dest; \
|
||||
(seq)->flow_lbl = ip6_get_flow_lbl(ip6h); \
|
||||
(seq)->saddr = *saddr; \
|
||||
(seq)->daddr = *daddr; \
|
||||
(seq)->hop_limit = (ip6h)->hop_limit; \
|
||||
} while (0)
|
||||
|
||||
if (seq && L4_MATCH(ip6h, proto, uh, seq) &&
|
||||
|
@ -1030,14 +895,14 @@ append:
|
|||
for (k = 0; k < p->count; )
|
||||
k += tcp_tap_handler(c, PIF_TAP, AF_INET6,
|
||||
&seq->saddr, &seq->daddr,
|
||||
seq->flow_lbl, p, k, now);
|
||||
p, k, now);
|
||||
} else if (seq->protocol == IPPROTO_UDP) {
|
||||
if (c->no_udp)
|
||||
continue;
|
||||
for (k = 0; k < p->count; )
|
||||
k += udp_tap_handler(c, PIF_TAP, AF_INET6,
|
||||
&seq->saddr, &seq->daddr,
|
||||
seq->hop_limit, p, k, now);
|
||||
p, k, now);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -1072,10 +937,8 @@ void tap_handler(struct ctx *c, const struct timespec *now)
|
|||
* @c: Execution context
|
||||
* @l2len: Total L2 packet length
|
||||
* @p: Packet buffer
|
||||
* @now: Current timestamp
|
||||
*/
|
||||
void tap_add_packet(struct ctx *c, ssize_t l2len, char *p,
|
||||
const struct timespec *now)
|
||||
void tap_add_packet(struct ctx *c, ssize_t l2len, char *p)
|
||||
{
|
||||
const struct ethhdr *eh;
|
||||
|
||||
|
@ -1091,17 +954,9 @@ void tap_add_packet(struct ctx *c, ssize_t l2len, char *p,
|
|||
switch (ntohs(eh->h_proto)) {
|
||||
case ETH_P_ARP:
|
||||
case ETH_P_IP:
|
||||
if (pool_full(pool_tap4)) {
|
||||
tap4_handler(c, pool_tap4, now);
|
||||
pool_flush(pool_tap4);
|
||||
}
|
||||
packet_add(pool_tap4, l2len, p);
|
||||
break;
|
||||
case ETH_P_IPV6:
|
||||
if (pool_full(pool_tap6)) {
|
||||
tap6_handler(c, pool_tap6, now);
|
||||
pool_flush(pool_tap6);
|
||||
}
|
||||
packet_add(pool_tap6, l2len, p);
|
||||
break;
|
||||
default:
|
||||
|
@ -1113,19 +968,17 @@ void tap_add_packet(struct ctx *c, ssize_t l2len, char *p,
|
|||
* tap_sock_reset() - Handle closing or failure of connected AF_UNIX socket
|
||||
* @c: Execution context
|
||||
*/
|
||||
void tap_sock_reset(struct ctx *c)
|
||||
static void tap_sock_reset(struct ctx *c)
|
||||
{
|
||||
info("Client connection closed%s", c->one_off ? ", exiting" : "");
|
||||
|
||||
if (c->one_off)
|
||||
_exit(EXIT_SUCCESS);
|
||||
exit(EXIT_SUCCESS);
|
||||
|
||||
/* Close the connected socket, wait for a new connection */
|
||||
epoll_del(c, c->fd_tap);
|
||||
epoll_ctl(c->epollfd, EPOLL_CTL_DEL, c->fd_tap, NULL);
|
||||
close(c->fd_tap);
|
||||
c->fd_tap = -1;
|
||||
if (c->mode == MODE_VU)
|
||||
vu_cleanup(c->vdev);
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -1152,7 +1005,7 @@ static void tap_passt_input(struct ctx *c, const struct timespec *now)
|
|||
|
||||
do {
|
||||
n = recv(c->fd_tap, pkt_buf + partial_len,
|
||||
sizeof(pkt_buf) - partial_len, MSG_DONTWAIT);
|
||||
TAP_BUF_BYTES - partial_len, MSG_DONTWAIT);
|
||||
} while ((n < 0) && errno == EINTR);
|
||||
|
||||
if (n < 0) {
|
||||
|
@ -1169,7 +1022,7 @@ static void tap_passt_input(struct ctx *c, const struct timespec *now)
|
|||
while (n >= (ssize_t)sizeof(uint32_t)) {
|
||||
uint32_t l2len = ntohl_unaligned(p);
|
||||
|
||||
if (l2len < sizeof(struct ethhdr) || l2len > L2_MAX_LEN_PASST) {
|
||||
if (l2len < sizeof(struct ethhdr) || l2len > ETH_MAX_MTU) {
|
||||
err("Bad frame size from guest, resetting connection");
|
||||
tap_sock_reset(c);
|
||||
return;
|
||||
|
@ -1182,7 +1035,7 @@ static void tap_passt_input(struct ctx *c, const struct timespec *now)
|
|||
p += sizeof(uint32_t);
|
||||
n -= sizeof(uint32_t);
|
||||
|
||||
tap_add_packet(c, l2len, p, now);
|
||||
tap_add_packet(c, l2len, p);
|
||||
|
||||
p += l2len;
|
||||
n -= l2len;
|
||||
|
@ -1223,10 +1076,8 @@ static void tap_pasta_input(struct ctx *c, const struct timespec *now)
|
|||
|
||||
tap_flush_pools();
|
||||
|
||||
for (n = 0;
|
||||
n <= (ssize_t)(sizeof(pkt_buf) - L2_MAX_LEN_PASTA);
|
||||
n += len) {
|
||||
len = read(c->fd_tap, pkt_buf + n, L2_MAX_LEN_PASTA);
|
||||
for (n = 0; n <= (ssize_t)TAP_BUF_BYTES - ETH_MAX_MTU; n += len) {
|
||||
len = read(c->fd_tap, pkt_buf + n, ETH_MAX_MTU);
|
||||
|
||||
if (len == 0) {
|
||||
die("EOF on tap device, exiting");
|
||||
|
@ -1244,10 +1095,10 @@ static void tap_pasta_input(struct ctx *c, const struct timespec *now)
|
|||
|
||||
/* Ignore frames of bad length */
|
||||
if (len < (ssize_t)sizeof(struct ethhdr) ||
|
||||
len > (ssize_t)L2_MAX_LEN_PASTA)
|
||||
len > (ssize_t)ETH_MAX_MTU)
|
||||
continue;
|
||||
|
||||
tap_add_packet(c, len, pkt_buf + n, now);
|
||||
tap_add_packet(c, len, pkt_buf + n);
|
||||
}
|
||||
|
||||
tap_handler(c, now);
|
||||
|
@ -1270,35 +1121,70 @@ void tap_handler_pasta(struct ctx *c, uint32_t events,
|
|||
}
|
||||
|
||||
/**
|
||||
* tap_backend_show_hints() - Give help information to start QEMU
|
||||
* @c: Execution context
|
||||
* tap_sock_unix_open() - Create and bind AF_UNIX socket
|
||||
* @sock_path: Socket path. If empty, set on return (UNIX_SOCK_PATH as prefix)
|
||||
*
|
||||
* Return: socket descriptor on success, won't return on failure
|
||||
*/
|
||||
static void tap_backend_show_hints(struct ctx *c)
|
||||
int tap_sock_unix_open(char *sock_path)
|
||||
{
|
||||
switch (c->mode) {
|
||||
case MODE_PASTA:
|
||||
/* No hints */
|
||||
break;
|
||||
case MODE_PASST:
|
||||
info("\nYou can now start qemu (>= 7.2, with commit 13c6be96618c):");
|
||||
info(" kvm ... -device virtio-net-pci,netdev=s -netdev stream,id=s,server=off,addr.type=unix,addr.path=%s",
|
||||
c->sock_path);
|
||||
info("or qrap, for earlier qemu versions:");
|
||||
info(" ./qrap 5 kvm ... -net socket,fd=5 -net nic,model=virtio");
|
||||
break;
|
||||
case MODE_VU:
|
||||
info("You can start qemu with:");
|
||||
info(" kvm ... -chardev socket,id=chr0,path=%s -netdev vhost-user,id=netdev0,chardev=chr0 -device virtio-net,netdev=netdev0 -object memory-backend-memfd,id=memfd0,share=on,size=$RAMSIZE -numa node,memdev=memfd0\n",
|
||||
c->sock_path);
|
||||
break;
|
||||
int fd = socket(AF_UNIX, SOCK_STREAM, 0);
|
||||
struct sockaddr_un addr = {
|
||||
.sun_family = AF_UNIX,
|
||||
};
|
||||
int i;
|
||||
|
||||
if (fd < 0)
|
||||
die_perror("Failed to open UNIX domain socket");
|
||||
|
||||
for (i = 1; i < UNIX_SOCK_MAX; i++) {
|
||||
char *path = addr.sun_path;
|
||||
int ex, ret;
|
||||
|
||||
if (*sock_path)
|
||||
memcpy(path, sock_path, UNIX_PATH_MAX);
|
||||
else
|
||||
snprintf(path, UNIX_PATH_MAX - 1, UNIX_SOCK_PATH, i);
|
||||
|
||||
ex = socket(AF_UNIX, SOCK_STREAM | SOCK_NONBLOCK, 0);
|
||||
if (ex < 0)
|
||||
die_perror("Failed to check for UNIX domain conflicts");
|
||||
|
||||
ret = connect(ex, (const struct sockaddr *)&addr, sizeof(addr));
|
||||
if (!ret || (errno != ENOENT && errno != ECONNREFUSED &&
|
||||
errno != EACCES)) {
|
||||
if (*sock_path)
|
||||
die("Socket path %s already in use", path);
|
||||
|
||||
close(ex);
|
||||
continue;
|
||||
}
|
||||
close(ex);
|
||||
|
||||
unlink(path);
|
||||
ret = bind(fd, (const struct sockaddr *)&addr, sizeof(addr));
|
||||
if (*sock_path && ret)
|
||||
die_perror("Failed to bind UNIX domain socket");
|
||||
|
||||
if (!ret)
|
||||
break;
|
||||
}
|
||||
|
||||
if (i == UNIX_SOCK_MAX)
|
||||
die_perror("Failed to bind UNIX domain socket");
|
||||
|
||||
info("UNIX domain socket bound at %s", addr.sun_path);
|
||||
if (!*sock_path)
|
||||
memcpy(sock_path, addr.sun_path, UNIX_PATH_MAX);
|
||||
|
||||
return fd;
|
||||
}
|
||||
|
||||
/**
|
||||
* tap_sock_unix_init() - Start listening for connections on AF_UNIX socket
|
||||
* @c: Execution context
|
||||
*/
|
||||
static void tap_sock_unix_init(const struct ctx *c)
|
||||
static void tap_sock_unix_init(struct ctx *c)
|
||||
{
|
||||
union epoll_ref ref = { .type = EPOLL_TYPE_TAP_LISTEN };
|
||||
struct epoll_event ev = { 0 };
|
||||
|
@ -1309,33 +1195,12 @@ static void tap_sock_unix_init(const struct ctx *c)
|
|||
ev.events = EPOLLIN | EPOLLET;
|
||||
ev.data.u64 = ref.u64;
|
||||
epoll_ctl(c->epollfd, EPOLL_CTL_ADD, c->fd_tap_listen, &ev);
|
||||
}
|
||||
|
||||
/**
|
||||
* tap_start_connection() - start a new connection
|
||||
* @c: Execution context
|
||||
*/
|
||||
static void tap_start_connection(const struct ctx *c)
|
||||
{
|
||||
struct epoll_event ev = { 0 };
|
||||
union epoll_ref ref = { 0 };
|
||||
|
||||
ref.fd = c->fd_tap;
|
||||
switch (c->mode) {
|
||||
case MODE_PASST:
|
||||
ref.type = EPOLL_TYPE_TAP_PASST;
|
||||
break;
|
||||
case MODE_PASTA:
|
||||
ref.type = EPOLL_TYPE_TAP_PASTA;
|
||||
break;
|
||||
case MODE_VU:
|
||||
ref.type = EPOLL_TYPE_VHOST_CMD;
|
||||
break;
|
||||
}
|
||||
|
||||
ev.events = EPOLLIN | EPOLLRDHUP;
|
||||
ev.data.u64 = ref.u64;
|
||||
epoll_ctl(c->epollfd, EPOLL_CTL_ADD, c->fd_tap, &ev);
|
||||
info("\nYou can now start qemu (>= 7.2, with commit 13c6be96618c):");
|
||||
info(" kvm ... -device virtio-net-pci,netdev=s -netdev stream,id=s,server=off,addr.type=unix,addr.path=%s",
|
||||
c->sock_path);
|
||||
info("or qrap, for earlier qemu versions:");
|
||||
info(" ./qrap 5 kvm ... -net socket,fd=5 -net nic,model=virtio");
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -1345,6 +1210,8 @@ static void tap_start_connection(const struct ctx *c)
|
|||
*/
|
||||
void tap_listen_handler(struct ctx *c, uint32_t events)
|
||||
{
|
||||
union epoll_ref ref = { .type = EPOLL_TYPE_TAP_PASST };
|
||||
struct epoll_event ev = { 0 };
|
||||
int v = INT_MAX / 2;
|
||||
struct ucred ucred;
|
||||
socklen_t len;
|
||||
|
@ -1383,7 +1250,10 @@ void tap_listen_handler(struct ctx *c, uint32_t events)
|
|||
setsockopt(c->fd_tap, SOL_SOCKET, SO_SNDBUF, &v, sizeof(v)))
|
||||
trace("tap: failed to set SO_SNDBUF to %i", v);
|
||||
|
||||
tap_start_connection(c);
|
||||
ref.fd = c->fd_tap;
|
||||
ev.events = EPOLLIN | EPOLLRDHUP;
|
||||
ev.data.u64 = ref.u64;
|
||||
epoll_ctl(c->epollfd, EPOLL_CTL_ADD, c->fd_tap, &ev);
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -1409,7 +1279,7 @@ static int tap_ns_tun(void *arg)
|
|||
if (fd < 0)
|
||||
die_perror("Failed to open() /dev/net/tun");
|
||||
|
||||
rc = ioctl(fd, (int)TUNSETIFF, &ifr);
|
||||
rc = ioctl(fd, TUNSETIFF, &ifr);
|
||||
if (rc < 0)
|
||||
die_perror("TUNSETIFF ioctl on /dev/net/tun failed");
|
||||
|
||||
|
@ -1427,61 +1297,58 @@ static int tap_ns_tun(void *arg)
|
|||
*/
|
||||
static void tap_sock_tun_init(struct ctx *c)
|
||||
{
|
||||
union epoll_ref ref = { .type = EPOLL_TYPE_TAP_PASTA };
|
||||
struct epoll_event ev = { 0 };
|
||||
|
||||
NS_CALL(tap_ns_tun, c);
|
||||
if (c->fd_tap == -1)
|
||||
die("Failed to set up tap device in namespace");
|
||||
|
||||
pasta_ns_conf(c);
|
||||
|
||||
tap_start_connection(c);
|
||||
ref.fd = c->fd_tap;
|
||||
ev.events = EPOLLIN | EPOLLRDHUP;
|
||||
ev.data.u64 = ref.u64;
|
||||
epoll_ctl(c->epollfd, EPOLL_CTL_ADD, c->fd_tap, &ev);
|
||||
}
|
||||
|
||||
/**
|
||||
* tap_sock_update_pool() - Set the buffer base and size for the pool of packets
|
||||
* @base: Buffer base
|
||||
* @size Buffer size
|
||||
*/
|
||||
void tap_sock_update_pool(void *base, size_t size)
|
||||
{
|
||||
int i;
|
||||
|
||||
pool_tap4_storage = PACKET_INIT(pool_tap4, TAP_MSGS_IP4, base, size);
|
||||
pool_tap6_storage = PACKET_INIT(pool_tap6, TAP_MSGS_IP6, base, size);
|
||||
|
||||
for (i = 0; i < TAP_SEQS; i++) {
|
||||
tap4_l4[i].p = PACKET_INIT(pool_l4, UIO_MAXIOV, base, size);
|
||||
tap6_l4[i].p = PACKET_INIT(pool_l4, UIO_MAXIOV, base, size);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* tap_backend_init() - Create and set up AF_UNIX socket or
|
||||
* tuntap file descriptor
|
||||
* tap_sock_init() - Create and set up AF_UNIX socket or tuntap file descriptor
|
||||
* @c: Execution context
|
||||
*/
|
||||
void tap_backend_init(struct ctx *c)
|
||||
void tap_sock_init(struct ctx *c)
|
||||
{
|
||||
if (c->mode == MODE_VU) {
|
||||
tap_sock_update_pool(NULL, 0);
|
||||
vu_init(c);
|
||||
} else {
|
||||
tap_sock_update_pool(pkt_buf, sizeof(pkt_buf));
|
||||
size_t sz = sizeof(pkt_buf);
|
||||
int i;
|
||||
|
||||
pool_tap4_storage = PACKET_INIT(pool_tap4, TAP_MSGS, pkt_buf, sz);
|
||||
pool_tap6_storage = PACKET_INIT(pool_tap6, TAP_MSGS, pkt_buf, sz);
|
||||
|
||||
for (i = 0; i < TAP_SEQS; i++) {
|
||||
tap4_l4[i].p = PACKET_INIT(pool_l4, UIO_MAXIOV, pkt_buf, sz);
|
||||
tap6_l4[i].p = PACKET_INIT(pool_l4, UIO_MAXIOV, pkt_buf, sz);
|
||||
}
|
||||
|
||||
if (c->fd_tap != -1) { /* Passed as --fd */
|
||||
struct epoll_event ev = { 0 };
|
||||
union epoll_ref ref;
|
||||
|
||||
ASSERT(c->one_off);
|
||||
tap_start_connection(c);
|
||||
ref.fd = c->fd_tap;
|
||||
if (c->mode == MODE_PASST)
|
||||
ref.type = EPOLL_TYPE_TAP_PASST;
|
||||
else
|
||||
ref.type = EPOLL_TYPE_TAP_PASTA;
|
||||
|
||||
ev.events = EPOLLIN | EPOLLRDHUP;
|
||||
ev.data.u64 = ref.u64;
|
||||
epoll_ctl(c->epollfd, EPOLL_CTL_ADD, c->fd_tap, &ev);
|
||||
return;
|
||||
}
|
||||
|
||||
switch (c->mode) {
|
||||
case MODE_PASTA:
|
||||
if (c->mode == MODE_PASTA) {
|
||||
tap_sock_tun_init(c);
|
||||
break;
|
||||
case MODE_VU:
|
||||
repair_sock_init(c);
|
||||
/* fall through */
|
||||
case MODE_PASST:
|
||||
} else {
|
||||
tap_sock_unix_init(c);
|
||||
|
||||
/* In passt mode, we don't know the guest's MAC address until it
|
||||
|
@ -1489,8 +1356,5 @@ void tap_backend_init(struct ctx *c)
|
|||
* first packets will reach it.
|
||||
*/
|
||||
memset(&c->guest_mac, 0xff, sizeof(c->guest_mac));
|
||||
break;
|
||||
}
|
||||
|
||||
tap_backend_show_hints(c);
|
||||
}
|
||||
|
|
63
tap.h
63
tap.h
|
@ -6,32 +6,7 @@
|
|||
#ifndef TAP_H
|
||||
#define TAP_H
|
||||
|
||||
/** L2_MAX_LEN_PASTA - Maximum frame length for pasta mode (with L2 header)
|
||||
*
|
||||
* The kernel tuntap device imposes a maximum frame size of 65535 including
|
||||
* 'hard_header_len' (14 bytes for L2 Ethernet in the case of "tap" mode).
|
||||
*/
|
||||
#define L2_MAX_LEN_PASTA USHRT_MAX
|
||||
|
||||
/** L2_MAX_LEN_PASST - Maximum frame length for passt mode (with L2 header)
|
||||
*
|
||||
* The only structural limit the QEMU socket protocol imposes on frames is
|
||||
* (2^32-1) bytes, but that would be ludicrously long in practice. For now,
|
||||
* limit it somewhat arbitrarily to 65535 bytes. FIXME: Work out an appropriate
|
||||
* limit with more precision.
|
||||
*/
|
||||
#define L2_MAX_LEN_PASST USHRT_MAX
|
||||
|
||||
/** L2_MAX_LEN_VU - Maximum frame length for vhost-user mode (with L2 header)
|
||||
*
|
||||
* vhost-user allows multiple buffers per frame, each of which can be quite
|
||||
* large, so the inherent frame size limit is rather large. Much larger than is
|
||||
* actually useful for IP. For now limit arbitrarily to 65535 bytes. FIXME:
|
||||
* Work out an appropriate limit with more precision.
|
||||
*/
|
||||
#define L2_MAX_LEN_VU USHRT_MAX
|
||||
|
||||
struct udphdr;
|
||||
#define ETH_HDR_INIT(proto) { .h_proto = htons_constant(proto) }
|
||||
|
||||
/**
|
||||
* struct tap_hdr - tap backend specific headers
|
||||
|
@ -46,8 +21,8 @@ struct tap_hdr {
|
|||
* @c: Execution context
|
||||
* @taph: Pointer to tap specific header buffer
|
||||
*
|
||||
* Return: a struct iovec covering the correct portion of @taph to use as the
|
||||
* tap specific header in the current configuration.
|
||||
* Returns: A struct iovec covering the correct portion of @taph to use as the
|
||||
* tap specific header in the current configuration.
|
||||
*/
|
||||
static inline struct iovec tap_hdr_iov(const struct ctx *c,
|
||||
struct tap_hdr *thdr)
|
||||
|
@ -65,27 +40,9 @@ static inline struct iovec tap_hdr_iov(const struct ctx *c,
|
|||
*/
|
||||
static inline void tap_hdr_update(struct tap_hdr *thdr, size_t l2len)
|
||||
{
|
||||
if (thdr)
|
||||
thdr->vnet_len = htonl(l2len);
|
||||
thdr->vnet_len = htonl(l2len);
|
||||
}
|
||||
|
||||
unsigned long tap_l2_max_len(const struct ctx *c);
|
||||
void *tap_push_l2h(const struct ctx *c, void *buf, uint16_t proto);
|
||||
void *tap_push_ip4h(struct iphdr *ip4h, struct in_addr src,
|
||||
struct in_addr dst, size_t l4len, uint8_t proto);
|
||||
void *tap_push_uh4(struct udphdr *uh, struct in_addr src, in_port_t sport,
|
||||
struct in_addr dst, in_port_t dport,
|
||||
const void *in, size_t dlen);
|
||||
void *tap_push_uh6(struct udphdr *uh,
|
||||
const struct in6_addr *src, in_port_t sport,
|
||||
const struct in6_addr *dst, in_port_t dport,
|
||||
void *in, size_t dlen);
|
||||
void *tap_push_ip4h(struct iphdr *ip4h, struct in_addr src,
|
||||
struct in_addr dst, size_t l4len, uint8_t proto);
|
||||
void *tap_push_ip6h(struct ipv6hdr *ip6h,
|
||||
const struct in6_addr *src,
|
||||
const struct in6_addr *dst,
|
||||
size_t l4len, uint8_t proto, uint32_t flow);
|
||||
void tap_udp4_send(const struct ctx *c, struct in_addr src, in_port_t sport,
|
||||
struct in_addr dst, in_port_t dport,
|
||||
const void *in, size_t dlen);
|
||||
|
@ -93,13 +50,10 @@ void tap_icmp4_send(const struct ctx *c, struct in_addr src, struct in_addr dst,
|
|||
const void *in, size_t l4len);
|
||||
const struct in6_addr *tap_ip6_daddr(const struct ctx *c,
|
||||
const struct in6_addr *src);
|
||||
void *tap_push_ip6h(struct ipv6hdr *ip6h,
|
||||
const struct in6_addr *src, const struct in6_addr *dst,
|
||||
size_t l4len, uint8_t proto, uint32_t flow);
|
||||
void tap_udp6_send(const struct ctx *c,
|
||||
const struct in6_addr *src, in_port_t sport,
|
||||
const struct in6_addr *dst, in_port_t dport,
|
||||
uint32_t flow, void *in, size_t dlen);
|
||||
uint32_t flow, const void *in, size_t dlen);
|
||||
void tap_icmp6_send(const struct ctx *c,
|
||||
const struct in6_addr *src, const struct in6_addr *dst,
|
||||
const void *in, size_t l4len);
|
||||
|
@ -114,12 +68,9 @@ void tap_handler_pasta(struct ctx *c, uint32_t events,
|
|||
void tap_handler_passt(struct ctx *c, uint32_t events,
|
||||
const struct timespec *now);
|
||||
int tap_sock_unix_open(char *sock_path);
|
||||
void tap_sock_reset(struct ctx *c);
|
||||
void tap_sock_update_pool(void *base, size_t size);
|
||||
void tap_backend_init(struct ctx *c);
|
||||
void tap_sock_init(struct ctx *c);
|
||||
void tap_flush_pools(void);
|
||||
void tap_handler(struct ctx *c, const struct timespec *now);
|
||||
void tap_add_packet(struct ctx *c, ssize_t l2len, char *p,
|
||||
const struct timespec *now);
|
||||
void tap_add_packet(struct ctx *c, ssize_t l2len, char *p);
|
||||
|
||||
#endif /* TAP_H */
|
||||
|
|
18
tcp.h
18
tcp.h
|
@ -10,21 +10,21 @@
|
|||
|
||||
struct ctx;
|
||||
|
||||
void tcp_timer_handler(const struct ctx *c, union epoll_ref ref);
|
||||
void tcp_listen_handler(const struct ctx *c, union epoll_ref ref,
|
||||
void tcp_timer_handler(struct ctx *c, union epoll_ref ref);
|
||||
void tcp_listen_handler(struct ctx *c, union epoll_ref ref,
|
||||
const struct timespec *now);
|
||||
void tcp_sock_handler(const struct ctx *c, union epoll_ref ref,
|
||||
uint32_t events);
|
||||
int tcp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af,
|
||||
const void *saddr, const void *daddr, uint32_t flow_lbl,
|
||||
void tcp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events);
|
||||
int tcp_tap_handler(struct ctx *c, uint8_t pif, sa_family_t af,
|
||||
const void *saddr, const void *daddr,
|
||||
const struct pool *p, int idx, const struct timespec *now);
|
||||
int tcp_sock_init(const struct ctx *c, const union inany_addr *addr,
|
||||
int tcp_sock_init(const struct ctx *c, sa_family_t af, const void *addr,
|
||||
const char *ifname, in_port_t port);
|
||||
int tcp_init(struct ctx *c);
|
||||
void tcp_timer(struct ctx *c, const struct timespec *now);
|
||||
void tcp_defer_handler(struct ctx *c);
|
||||
|
||||
void tcp_update_l2_buf(const unsigned char *eth_d, const unsigned char *eth_s);
|
||||
int tcp_set_peek_offset(int s, int offset);
|
||||
|
||||
extern bool peek_offset_cap;
|
||||
|
||||
|
@ -58,12 +58,16 @@ union tcp_listen_epoll_ref {
|
|||
* @fwd_in: Port forwarding configuration for inbound packets
|
||||
* @fwd_out: Port forwarding configuration for outbound packets
|
||||
* @timer_run: Timestamp of most recent timer run
|
||||
* @kernel_snd_wnd: Kernel reports sending window (with commit 8f7baad7f035)
|
||||
* @pipe_size: Size of pipes for spliced connections
|
||||
*/
|
||||
struct tcp_ctx {
|
||||
struct fwd_ports fwd_in;
|
||||
struct fwd_ports fwd_out;
|
||||
struct timespec timer_run;
|
||||
#ifdef HAS_SND_WND
|
||||
int kernel_snd_wnd;
|
||||
#endif
|
||||
size_t pipe_size;
|
||||
};
|
||||
|
||||
|
|
422
tcp_buf.c
422
tcp_buf.c
|
@ -20,7 +20,7 @@
|
|||
|
||||
#include <netinet/ip.h>
|
||||
|
||||
#include <netinet/tcp.h>
|
||||
#include <linux/tcp.h>
|
||||
|
||||
#include "util.h"
|
||||
#include "ip.h"
|
||||
|
@ -38,32 +38,88 @@
|
|||
(c->mode == MODE_PASTA ? 1 : TCP_FRAMES_MEM)
|
||||
|
||||
/* Static buffers */
|
||||
/**
|
||||
* struct tcp_payload_t - TCP header and data to send segments with payload
|
||||
* @th: TCP header
|
||||
* @data: TCP data
|
||||
*/
|
||||
struct tcp_payload_t {
|
||||
struct tcphdr th;
|
||||
uint8_t data[IP_MAX_MTU - sizeof(struct tcphdr)];
|
||||
#ifdef __AVX2__
|
||||
} __attribute__ ((packed, aligned(32))); /* For AVX2 checksum routines */
|
||||
#else
|
||||
} __attribute__ ((packed, aligned(__alignof__(unsigned int))));
|
||||
#endif
|
||||
|
||||
/* Ethernet header for IPv4 and IPv6 frames */
|
||||
/**
|
||||
* struct tcp_flags_t - TCP header and data to send zero-length
|
||||
* segments (flags)
|
||||
* @th: TCP header
|
||||
* @opts TCP options
|
||||
*/
|
||||
struct tcp_flags_t {
|
||||
struct tcphdr th;
|
||||
char opts[OPT_MSS_LEN + OPT_WS_LEN + 1];
|
||||
#ifdef __AVX2__
|
||||
} __attribute__ ((packed, aligned(32)));
|
||||
#else
|
||||
} __attribute__ ((packed, aligned(__alignof__(unsigned int))));
|
||||
#endif
|
||||
|
||||
/* Ethernet header for IPv4 frames */
|
||||
static struct ethhdr tcp4_eth_src;
|
||||
static struct ethhdr tcp6_eth_src;
|
||||
|
||||
static struct tap_hdr tcp_payload_tap_hdr[TCP_FRAMES_MEM];
|
||||
static struct tap_hdr tcp4_payload_tap_hdr[TCP_FRAMES_MEM];
|
||||
/* IPv4 headers */
|
||||
static struct iphdr tcp4_payload_ip[TCP_FRAMES_MEM];
|
||||
/* TCP segments with payload for IPv4 frames */
|
||||
static struct tcp_payload_t tcp4_payload[TCP_FRAMES_MEM];
|
||||
|
||||
/* IP headers for IPv4 and IPv6 */
|
||||
struct iphdr tcp4_payload_ip[TCP_FRAMES_MEM];
|
||||
struct ipv6hdr tcp6_payload_ip[TCP_FRAMES_MEM];
|
||||
|
||||
/* TCP segments with payload for IPv4 and IPv6 frames */
|
||||
static struct tcp_payload_t tcp_payload[TCP_FRAMES_MEM];
|
||||
|
||||
static_assert(MSS4 <= sizeof(tcp_payload[0].data), "MSS4 is greater than 65516");
|
||||
static_assert(MSS6 <= sizeof(tcp_payload[0].data), "MSS6 is greater than 65516");
|
||||
static_assert(MSS4 <= sizeof(tcp4_payload[0].data), "MSS4 is greater than 65516");
|
||||
|
||||
/* References tracking the owner connection of frames in the tap outqueue */
|
||||
static struct tcp_tap_conn *tcp_frame_conns[TCP_FRAMES_MEM];
|
||||
static unsigned int tcp_payload_used;
|
||||
static struct tcp_tap_conn *tcp4_frame_conns[TCP_FRAMES_MEM];
|
||||
static unsigned int tcp4_payload_used;
|
||||
|
||||
static struct tap_hdr tcp4_flags_tap_hdr[TCP_FRAMES_MEM];
|
||||
/* IPv4 headers for TCP segment without payload */
|
||||
static struct iphdr tcp4_flags_ip[TCP_FRAMES_MEM];
|
||||
/* TCP segments without payload for IPv4 frames */
|
||||
static struct tcp_flags_t tcp4_flags[TCP_FRAMES_MEM];
|
||||
|
||||
static unsigned int tcp4_flags_used;
|
||||
|
||||
/* Ethernet header for IPv6 frames */
|
||||
static struct ethhdr tcp6_eth_src;
|
||||
|
||||
static struct tap_hdr tcp6_payload_tap_hdr[TCP_FRAMES_MEM];
|
||||
/* IPv6 headers */
|
||||
static struct ipv6hdr tcp6_payload_ip[TCP_FRAMES_MEM];
|
||||
/* TCP headers and data for IPv6 frames */
|
||||
static struct tcp_payload_t tcp6_payload[TCP_FRAMES_MEM];
|
||||
|
||||
static_assert(MSS6 <= sizeof(tcp6_payload[0].data), "MSS6 is greater than 65516");
|
||||
|
||||
/* References tracking the owner connection of frames in the tap outqueue */
|
||||
static struct tcp_tap_conn *tcp6_frame_conns[TCP_FRAMES_MEM];
|
||||
static unsigned int tcp6_payload_used;
|
||||
|
||||
static struct tap_hdr tcp6_flags_tap_hdr[TCP_FRAMES_MEM];
|
||||
/* IPv6 headers for TCP segment without payload */
|
||||
static struct ipv6hdr tcp6_flags_ip[TCP_FRAMES_MEM];
|
||||
/* TCP segment without payload for IPv6 frames */
|
||||
static struct tcp_flags_t tcp6_flags[TCP_FRAMES_MEM];
|
||||
|
||||
static unsigned int tcp6_flags_used;
|
||||
|
||||
/* recvmsg()/sendmsg() data for tap */
|
||||
static struct iovec iov_sock [TCP_FRAMES_MEM + 1];
|
||||
|
||||
static struct iovec tcp_l2_iov[TCP_FRAMES_MEM][TCP_NUM_IOVS];
|
||||
|
||||
static struct iovec tcp4_l2_iov [TCP_FRAMES_MEM][TCP_NUM_IOVS];
|
||||
static struct iovec tcp6_l2_iov [TCP_FRAMES_MEM][TCP_NUM_IOVS];
|
||||
static struct iovec tcp4_l2_flags_iov [TCP_FRAMES_MEM][TCP_NUM_IOVS];
|
||||
static struct iovec tcp6_l2_flags_iov [TCP_FRAMES_MEM][TCP_NUM_IOVS];
|
||||
/**
|
||||
* tcp_update_l2_buf() - Update Ethernet header buffers with addresses
|
||||
* @eth_d: Ethernet destination address, NULL if unchanged
|
||||
|
@ -76,40 +132,115 @@ void tcp_update_l2_buf(const unsigned char *eth_d, const unsigned char *eth_s)
|
|||
}
|
||||
|
||||
/**
|
||||
* tcp_sock_iov_init() - Initialise scatter-gather L2 buffers for IPv4 sockets
|
||||
* tcp_sock4_iov_init() - Initialise scatter-gather L2 buffers for IPv4 sockets
|
||||
* @c: Execution context
|
||||
*/
|
||||
void tcp_sock_iov_init(const struct ctx *c)
|
||||
void tcp_sock4_iov_init(const struct ctx *c)
|
||||
{
|
||||
struct ipv6hdr ip6 = L2_BUF_IP6_INIT(IPPROTO_TCP);
|
||||
struct iphdr iph = L2_BUF_IP4_INIT(IPPROTO_TCP);
|
||||
struct iovec *iov;
|
||||
int i;
|
||||
|
||||
tcp6_eth_src.h_proto = htons_constant(ETH_P_IPV6);
|
||||
tcp4_eth_src.h_proto = htons_constant(ETH_P_IP);
|
||||
|
||||
for (i = 0; i < ARRAY_SIZE(tcp_payload); i++) {
|
||||
tcp6_payload_ip[i] = ip6;
|
||||
for (i = 0; i < ARRAY_SIZE(tcp4_payload); i++) {
|
||||
tcp4_payload_ip[i] = iph;
|
||||
tcp4_payload[i].th.doff = sizeof(struct tcphdr) / 4;
|
||||
tcp4_payload[i].th.ack = 1;
|
||||
}
|
||||
|
||||
for (i = 0; i < ARRAY_SIZE(tcp4_flags); i++) {
|
||||
tcp4_flags_ip[i] = iph;
|
||||
tcp4_flags[i].th.doff = sizeof(struct tcphdr) / 4;
|
||||
tcp4_flags[i].th.ack = 1;
|
||||
}
|
||||
|
||||
for (i = 0; i < TCP_FRAMES_MEM; i++) {
|
||||
struct iovec *iov = tcp_l2_iov[i];
|
||||
iov = tcp4_l2_iov[i];
|
||||
|
||||
iov[TCP_IOV_TAP] = tap_hdr_iov(c, &tcp_payload_tap_hdr[i]);
|
||||
iov[TCP_IOV_ETH].iov_len = sizeof(struct ethhdr);
|
||||
iov[TCP_IOV_PAYLOAD].iov_base = &tcp_payload[i];
|
||||
iov[TCP_IOV_TAP] = tap_hdr_iov(c, &tcp4_payload_tap_hdr[i]);
|
||||
iov[TCP_IOV_ETH] = IOV_OF_LVALUE(tcp4_eth_src);
|
||||
iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp4_payload_ip[i]);
|
||||
iov[TCP_IOV_PAYLOAD].iov_base = &tcp4_payload[i];
|
||||
}
|
||||
|
||||
for (i = 0; i < TCP_FRAMES_MEM; i++) {
|
||||
iov = tcp4_l2_flags_iov[i];
|
||||
|
||||
iov[TCP_IOV_TAP] = tap_hdr_iov(c, &tcp4_flags_tap_hdr[i]);
|
||||
iov[TCP_IOV_ETH].iov_base = &tcp4_eth_src;
|
||||
iov[TCP_IOV_ETH] = IOV_OF_LVALUE(tcp4_eth_src);
|
||||
iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp4_flags_ip[i]);
|
||||
iov[TCP_IOV_PAYLOAD].iov_base = &tcp4_flags[i];
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* tcp_revert_seq() - Revert affected conn->seq_to_tap after failed transmission
|
||||
* tcp_sock6_iov_init() - Initialise scatter-gather L2 buffers for IPv6 sockets
|
||||
* @c: Execution context
|
||||
*/
|
||||
void tcp_sock6_iov_init(const struct ctx *c)
|
||||
{
|
||||
struct ipv6hdr ip6 = L2_BUF_IP6_INIT(IPPROTO_TCP);
|
||||
struct iovec *iov;
|
||||
int i;
|
||||
|
||||
tcp6_eth_src.h_proto = htons_constant(ETH_P_IPV6);
|
||||
|
||||
for (i = 0; i < ARRAY_SIZE(tcp6_payload); i++) {
|
||||
tcp6_payload_ip[i] = ip6;
|
||||
tcp6_payload[i].th.doff = sizeof(struct tcphdr) / 4;
|
||||
tcp6_payload[i].th.ack = 1;
|
||||
}
|
||||
|
||||
for (i = 0; i < ARRAY_SIZE(tcp6_flags); i++) {
|
||||
tcp6_flags_ip[i] = ip6;
|
||||
tcp6_flags[i].th.doff = sizeof(struct tcphdr) / 4;
|
||||
tcp6_flags[i].th .ack = 1;
|
||||
}
|
||||
|
||||
for (i = 0; i < TCP_FRAMES_MEM; i++) {
|
||||
iov = tcp6_l2_iov[i];
|
||||
|
||||
iov[TCP_IOV_TAP] = tap_hdr_iov(c, &tcp6_payload_tap_hdr[i]);
|
||||
iov[TCP_IOV_ETH] = IOV_OF_LVALUE(tcp6_eth_src);
|
||||
iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp6_payload_ip[i]);
|
||||
iov[TCP_IOV_PAYLOAD].iov_base = &tcp6_payload[i];
|
||||
}
|
||||
|
||||
for (i = 0; i < TCP_FRAMES_MEM; i++) {
|
||||
iov = tcp6_l2_flags_iov[i];
|
||||
|
||||
iov[TCP_IOV_TAP] = tap_hdr_iov(c, &tcp6_flags_tap_hdr[i]);
|
||||
iov[TCP_IOV_ETH] = IOV_OF_LVALUE(tcp6_eth_src);
|
||||
iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp6_flags_ip[i]);
|
||||
iov[TCP_IOV_PAYLOAD].iov_base = &tcp6_flags[i];
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* tcp_flags_flush() - Send out buffers for segments with no data (flags)
|
||||
* @c: Execution context
|
||||
*/
|
||||
void tcp_flags_flush(const struct ctx *c)
|
||||
{
|
||||
tap_send_frames(c, &tcp6_l2_flags_iov[0][0], TCP_NUM_IOVS,
|
||||
tcp6_flags_used);
|
||||
tcp6_flags_used = 0;
|
||||
|
||||
tap_send_frames(c, &tcp4_l2_flags_iov[0][0], TCP_NUM_IOVS,
|
||||
tcp4_flags_used);
|
||||
tcp4_flags_used = 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* tcp_revert_seq() - Revert affected conn->seq_to_tap after failed transmission
|
||||
* @ctx: Execution context
|
||||
* @conns: Array of connection pointers corresponding to queued frames
|
||||
* @frames: Two-dimensional array containing queued frames with sub-iovs
|
||||
* @num_frames: Number of entries in the two arrays to be compared
|
||||
*/
|
||||
static void tcp_revert_seq(const struct ctx *c, struct tcp_tap_conn **conns,
|
||||
static void tcp_revert_seq(struct ctx *c, struct tcp_tap_conn **conns,
|
||||
struct iovec (*frames)[TCP_NUM_IOVS], int num_frames)
|
||||
{
|
||||
int i;
|
||||
|
@ -125,55 +256,34 @@ static void tcp_revert_seq(const struct ctx *c, struct tcp_tap_conn **conns,
|
|||
|
||||
conn->seq_to_tap = seq;
|
||||
peek_offset = conn->seq_to_tap - conn->seq_ack_from_tap;
|
||||
if (tcp_set_peek_offset(conn, peek_offset))
|
||||
if (tcp_set_peek_offset(conn->sock, peek_offset))
|
||||
tcp_rst(c, conn);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* tcp_payload_flush() - Send out buffers for segments with data or flags
|
||||
* tcp_payload_flush() - Send out buffers for segments with data
|
||||
* @c: Execution context
|
||||
*/
|
||||
void tcp_payload_flush(const struct ctx *c)
|
||||
void tcp_payload_flush(struct ctx *c)
|
||||
{
|
||||
size_t m;
|
||||
|
||||
m = tap_send_frames(c, &tcp_l2_iov[0][0], TCP_NUM_IOVS,
|
||||
tcp_payload_used);
|
||||
if (m != tcp_payload_used) {
|
||||
tcp_revert_seq(c, &tcp_frame_conns[m], &tcp_l2_iov[m],
|
||||
tcp_payload_used - m);
|
||||
m = tap_send_frames(c, &tcp6_l2_iov[0][0], TCP_NUM_IOVS,
|
||||
tcp6_payload_used);
|
||||
if (m != tcp6_payload_used) {
|
||||
tcp_revert_seq(c, &tcp6_frame_conns[m], &tcp6_l2_iov[m],
|
||||
tcp6_payload_used - m);
|
||||
}
|
||||
tcp_payload_used = 0;
|
||||
}
|
||||
tcp6_payload_used = 0;
|
||||
|
||||
/**
|
||||
* tcp_l2_buf_fill_headers() - Fill 802.3, IP, TCP headers in pre-cooked buffers
|
||||
* @conn: Connection pointer
|
||||
* @iov: Pointer to an array of iovec of TCP pre-cooked buffers
|
||||
* @check: Checksum, if already known
|
||||
* @seq: Sequence number for this segment
|
||||
* @no_tcp_csum: Do not set TCP checksum
|
||||
*/
|
||||
static void tcp_l2_buf_fill_headers(const struct tcp_tap_conn *conn,
|
||||
struct iovec *iov, const uint16_t *check,
|
||||
uint32_t seq, bool no_tcp_csum)
|
||||
{
|
||||
struct iov_tail tail = IOV_TAIL(&iov[TCP_IOV_PAYLOAD], 1, 0);
|
||||
struct tcphdr *th = IOV_REMOVE_HEADER(&tail, struct tcphdr);
|
||||
struct tap_hdr *taph = iov[TCP_IOV_TAP].iov_base;
|
||||
const struct flowside *tapside = TAPFLOW(conn);
|
||||
const struct in_addr *a4 = inany_v4(&tapside->oaddr);
|
||||
struct ipv6hdr *ip6h = NULL;
|
||||
struct iphdr *ip4h = NULL;
|
||||
|
||||
if (a4)
|
||||
ip4h = iov[TCP_IOV_IP].iov_base;
|
||||
else
|
||||
ip6h = iov[TCP_IOV_IP].iov_base;
|
||||
|
||||
tcp_fill_headers(conn, taph, ip4h, ip6h, th, &tail,
|
||||
check, seq, no_tcp_csum);
|
||||
m = tap_send_frames(c, &tcp4_l2_iov[0][0], TCP_NUM_IOVS,
|
||||
tcp4_payload_used);
|
||||
if (m != tcp4_payload_used) {
|
||||
tcp_revert_seq(c, &tcp4_frame_conns[m], &tcp4_l2_iov[m],
|
||||
tcp4_payload_used - m);
|
||||
}
|
||||
tcp4_payload_used = 0;
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -184,50 +294,58 @@ static void tcp_l2_buf_fill_headers(const struct tcp_tap_conn *conn,
|
|||
*
|
||||
* Return: negative error code on connection reset, 0 otherwise
|
||||
*/
|
||||
int tcp_buf_send_flag(const struct ctx *c, struct tcp_tap_conn *conn, int flags)
|
||||
int tcp_buf_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags)
|
||||
{
|
||||
struct tcp_payload_t *payload;
|
||||
struct tcp_flags_t *payload;
|
||||
struct iovec *iov;
|
||||
size_t optlen;
|
||||
size_t l4len;
|
||||
uint32_t seq;
|
||||
int ret;
|
||||
|
||||
iov = tcp_l2_iov[tcp_payload_used];
|
||||
if (CONN_V4(conn)) {
|
||||
iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp4_payload_ip[tcp_payload_used]);
|
||||
iov[TCP_IOV_ETH].iov_base = &tcp4_eth_src;
|
||||
} else {
|
||||
iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp6_payload_ip[tcp_payload_used]);
|
||||
iov[TCP_IOV_ETH].iov_base = &tcp6_eth_src;
|
||||
}
|
||||
if (CONN_V4(conn))
|
||||
iov = tcp4_l2_flags_iov[tcp4_flags_used++];
|
||||
else
|
||||
iov = tcp6_l2_flags_iov[tcp6_flags_used++];
|
||||
|
||||
payload = iov[TCP_IOV_PAYLOAD].iov_base;
|
||||
|
||||
seq = conn->seq_to_tap;
|
||||
ret = tcp_prepare_flags(c, conn, flags, &payload->th,
|
||||
(struct tcp_syn_opts *)&payload->data, &optlen);
|
||||
if (ret <= 0)
|
||||
payload->opts, &optlen);
|
||||
if (ret <= 0) {
|
||||
if (CONN_V4(conn))
|
||||
tcp4_flags_used--;
|
||||
else
|
||||
tcp6_flags_used--;
|
||||
return ret;
|
||||
|
||||
tcp_payload_used++;
|
||||
l4len = optlen + sizeof(struct tcphdr);
|
||||
iov[TCP_IOV_PAYLOAD].iov_len = l4len;
|
||||
tcp_l2_buf_fill_headers(conn, iov, NULL, seq, false);
|
||||
|
||||
if (flags & DUP_ACK) {
|
||||
struct iovec *dup_iov = tcp_l2_iov[tcp_payload_used++];
|
||||
|
||||
memcpy(dup_iov[TCP_IOV_TAP].iov_base, iov[TCP_IOV_TAP].iov_base,
|
||||
iov[TCP_IOV_TAP].iov_len);
|
||||
dup_iov[TCP_IOV_ETH].iov_base = iov[TCP_IOV_ETH].iov_base;
|
||||
dup_iov[TCP_IOV_IP] = iov[TCP_IOV_IP];
|
||||
memcpy(dup_iov[TCP_IOV_PAYLOAD].iov_base,
|
||||
iov[TCP_IOV_PAYLOAD].iov_base, l4len);
|
||||
dup_iov[TCP_IOV_PAYLOAD].iov_len = l4len;
|
||||
}
|
||||
|
||||
if (tcp_payload_used > TCP_FRAMES_MEM - 2)
|
||||
tcp_payload_flush(c);
|
||||
l4len = tcp_l2_buf_fill_headers(conn, iov, optlen, NULL, seq);
|
||||
iov[TCP_IOV_PAYLOAD].iov_len = l4len;
|
||||
|
||||
if (flags & DUP_ACK) {
|
||||
struct iovec *dup_iov;
|
||||
int i;
|
||||
|
||||
if (CONN_V4(conn))
|
||||
dup_iov = tcp4_l2_flags_iov[tcp4_flags_used++];
|
||||
else
|
||||
dup_iov = tcp6_l2_flags_iov[tcp6_flags_used++];
|
||||
|
||||
for (i = 0; i < TCP_NUM_IOVS; i++)
|
||||
memcpy(dup_iov[i].iov_base, iov[i].iov_base,
|
||||
iov[i].iov_len);
|
||||
dup_iov[TCP_IOV_PAYLOAD].iov_len = iov[TCP_IOV_PAYLOAD].iov_len;
|
||||
}
|
||||
|
||||
if (CONN_V4(conn)) {
|
||||
if (tcp4_flags_used > TCP_FRAMES_MEM - 2)
|
||||
tcp_flags_flush(c);
|
||||
} else {
|
||||
if (tcp6_flags_used > TCP_FRAMES_MEM - 2)
|
||||
tcp_flags_flush(c);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
@ -239,41 +357,40 @@ int tcp_buf_send_flag(const struct ctx *c, struct tcp_tap_conn *conn, int flags)
|
|||
* @dlen: TCP payload length
|
||||
* @no_csum: Don't compute IPv4 checksum, use the one from previous buffer
|
||||
* @seq: Sequence number to be sent
|
||||
* @push: Set PSH flag, last segment in a batch
|
||||
*/
|
||||
static void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn,
|
||||
ssize_t dlen, int no_csum, uint32_t seq, bool push)
|
||||
static void tcp_data_to_tap(struct ctx *c, struct tcp_tap_conn *conn,
|
||||
ssize_t dlen, int no_csum, uint32_t seq)
|
||||
{
|
||||
struct tcp_payload_t *payload;
|
||||
const uint16_t *check = NULL;
|
||||
struct iovec *iov;
|
||||
size_t l4len;
|
||||
|
||||
conn->seq_to_tap = seq + dlen;
|
||||
tcp_frame_conns[tcp_payload_used] = conn;
|
||||
iov = tcp_l2_iov[tcp_payload_used];
|
||||
if (CONN_V4(conn)) {
|
||||
if (no_csum) {
|
||||
struct iovec *iov_prev = tcp_l2_iov[tcp_payload_used - 1];
|
||||
struct iphdr *iph = iov_prev[TCP_IOV_IP].iov_base;
|
||||
|
||||
if (CONN_V4(conn)) {
|
||||
struct iovec *iov_prev = tcp4_l2_iov[tcp4_payload_used - 1];
|
||||
const uint16_t *check = NULL;
|
||||
|
||||
if (no_csum) {
|
||||
struct iphdr *iph = iov_prev[TCP_IOV_IP].iov_base;
|
||||
check = &iph->check;
|
||||
}
|
||||
iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp4_payload_ip[tcp_payload_used]);
|
||||
iov[TCP_IOV_ETH].iov_base = &tcp4_eth_src;
|
||||
|
||||
tcp4_frame_conns[tcp4_payload_used] = conn;
|
||||
|
||||
iov = tcp4_l2_iov[tcp4_payload_used++];
|
||||
l4len = tcp_l2_buf_fill_headers(conn, iov, dlen, check, seq);
|
||||
iov[TCP_IOV_PAYLOAD].iov_len = l4len;
|
||||
if (tcp4_payload_used > TCP_FRAMES_MEM - 1)
|
||||
tcp_payload_flush(c);
|
||||
} else if (CONN_V6(conn)) {
|
||||
iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp6_payload_ip[tcp_payload_used]);
|
||||
iov[TCP_IOV_ETH].iov_base = &tcp6_eth_src;
|
||||
tcp6_frame_conns[tcp6_payload_used] = conn;
|
||||
|
||||
iov = tcp6_l2_iov[tcp6_payload_used++];
|
||||
l4len = tcp_l2_buf_fill_headers(conn, iov, dlen, NULL, seq);
|
||||
iov[TCP_IOV_PAYLOAD].iov_len = l4len;
|
||||
if (tcp6_payload_used > TCP_FRAMES_MEM - 1)
|
||||
tcp_payload_flush(c);
|
||||
}
|
||||
payload = iov[TCP_IOV_PAYLOAD].iov_base;
|
||||
payload->th.th_off = sizeof(struct tcphdr) / 4;
|
||||
payload->th.th_x2 = 0;
|
||||
payload->th.th_flags = 0;
|
||||
payload->th.ack = 1;
|
||||
payload->th.psh = push;
|
||||
iov[TCP_IOV_PAYLOAD].iov_len = dlen + sizeof(struct tcphdr);
|
||||
tcp_l2_buf_fill_headers(conn, iov, check, seq, false);
|
||||
if (++tcp_payload_used > TCP_FRAMES_MEM - 1)
|
||||
tcp_payload_flush(c);
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -285,11 +402,12 @@ static void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn,
|
|||
*
|
||||
* #syscalls recvmsg
|
||||
*/
|
||||
int tcp_buf_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn)
|
||||
int tcp_buf_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn)
|
||||
{
|
||||
uint32_t wnd_scaled = conn->wnd_from_tap << conn->ws_from_tap;
|
||||
int fill_bufs, send_bufs = 0, last_len, iov_rem = 0;
|
||||
int len, dlen, i, s = conn->sock;
|
||||
int sendlen, len, dlen, v4 = CONN_V4(conn);
|
||||
int s = conn->sock, i, ret = 0;
|
||||
struct msghdr mh_sock = { 0 };
|
||||
uint16_t mss = MSS_GET(conn);
|
||||
uint32_t already_sent, seq;
|
||||
|
@ -304,14 +422,13 @@ int tcp_buf_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn)
|
|||
conn->seq_ack_from_tap, conn->seq_to_tap);
|
||||
conn->seq_to_tap = conn->seq_ack_from_tap;
|
||||
already_sent = 0;
|
||||
if (tcp_set_peek_offset(conn, 0)) {
|
||||
if (tcp_set_peek_offset(s, 0)) {
|
||||
tcp_rst(c, conn);
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
if (!wnd_scaled || already_sent >= wnd_scaled) {
|
||||
conn_flag(c, conn, ACK_FROM_TAP_BLOCKS);
|
||||
conn_flag(c, conn, STALLED);
|
||||
conn_flag(c, conn, ACK_FROM_TAP_DUE);
|
||||
return 0;
|
||||
|
@ -337,15 +454,19 @@ int tcp_buf_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn)
|
|||
mh_sock.msg_iovlen = fill_bufs;
|
||||
}
|
||||
|
||||
if (tcp_payload_used + fill_bufs > TCP_FRAMES_MEM) {
|
||||
if (( v4 && tcp4_payload_used + fill_bufs > TCP_FRAMES_MEM) ||
|
||||
(!v4 && tcp6_payload_used + fill_bufs > TCP_FRAMES_MEM)) {
|
||||
tcp_payload_flush(c);
|
||||
|
||||
/* Silence Coverity CWE-125 false positive */
|
||||
tcp_payload_used = 0;
|
||||
tcp4_payload_used = tcp6_payload_used = 0;
|
||||
}
|
||||
|
||||
for (i = 0, iov = iov_sock + 1; i < fill_bufs; i++, iov++) {
|
||||
iov->iov_base = &tcp_payload[tcp_payload_used + i].data;
|
||||
if (v4)
|
||||
iov->iov_base = &tcp4_payload[tcp4_payload_used + i].data;
|
||||
else
|
||||
iov->iov_base = &tcp6_payload[tcp6_payload_used + i].data;
|
||||
iov->iov_len = mss;
|
||||
}
|
||||
if (iov_rem)
|
||||
|
@ -356,22 +477,12 @@ int tcp_buf_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn)
|
|||
len = recvmsg(s, &mh_sock, MSG_PEEK);
|
||||
while (len < 0 && errno == EINTR);
|
||||
|
||||
if (len < 0) {
|
||||
if (errno != EAGAIN && errno != EWOULDBLOCK) {
|
||||
tcp_rst(c, conn);
|
||||
return -errno;
|
||||
}
|
||||
|
||||
if (already_sent) /* No new data and EAGAIN: set EPOLLET */
|
||||
conn_flag(c, conn, STALLED);
|
||||
|
||||
return 0;
|
||||
}
|
||||
if (len < 0)
|
||||
goto err;
|
||||
|
||||
if (!len) {
|
||||
if ((conn->events & (SOCK_FIN_RCVD | TAP_FIN_SENT)) == SOCK_FIN_RCVD) {
|
||||
int ret = tcp_buf_send_flag(c, conn, FIN | ACK);
|
||||
if (ret) {
|
||||
if ((ret = tcp_buf_send_flag(c, conn, FIN | ACK))) {
|
||||
tcp_rst(c, conn);
|
||||
return ret;
|
||||
}
|
||||
|
@ -382,40 +493,45 @@ int tcp_buf_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn)
|
|||
return 0;
|
||||
}
|
||||
|
||||
sendlen = len;
|
||||
if (!peek_offset_cap)
|
||||
len -= already_sent;
|
||||
sendlen -= already_sent;
|
||||
|
||||
if (len <= 0) {
|
||||
if (sendlen <= 0) {
|
||||
conn_flag(c, conn, STALLED);
|
||||
return 0;
|
||||
}
|
||||
|
||||
conn_flag(c, conn, ~ACK_FROM_TAP_BLOCKS);
|
||||
conn_flag(c, conn, ~STALLED);
|
||||
|
||||
send_bufs = DIV_ROUND_UP(len, mss);
|
||||
last_len = len - (send_bufs - 1) * mss;
|
||||
send_bufs = DIV_ROUND_UP(sendlen, mss);
|
||||
last_len = sendlen - (send_bufs - 1) * mss;
|
||||
|
||||
/* Likely, some new data was acked too. */
|
||||
tcp_update_seqack_wnd(c, conn, false, NULL);
|
||||
tcp_update_seqack_wnd(c, conn, 0, NULL);
|
||||
|
||||
/* Finally, queue to tap */
|
||||
dlen = mss;
|
||||
seq = conn->seq_to_tap;
|
||||
for (i = 0; i < send_bufs; i++) {
|
||||
int no_csum = i && i != send_bufs - 1 && tcp_payload_used;
|
||||
bool push = false;
|
||||
int no_csum = i && i != send_bufs - 1 && tcp4_payload_used;
|
||||
|
||||
if (i == send_bufs - 1) {
|
||||
if (i == send_bufs - 1)
|
||||
dlen = last_len;
|
||||
push = true;
|
||||
}
|
||||
|
||||
tcp_data_to_tap(c, conn, dlen, no_csum, seq, push);
|
||||
tcp_data_to_tap(c, conn, dlen, no_csum, seq);
|
||||
seq += dlen;
|
||||
}
|
||||
|
||||
conn_flag(c, conn, ACK_FROM_TAP_DUE);
|
||||
|
||||
return 0;
|
||||
|
||||
err:
|
||||
if (errno != EAGAIN && errno != EWOULDBLOCK) {
|
||||
ret = -errno;
|
||||
tcp_rst(c, conn);
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
|
10
tcp_buf.h
10
tcp_buf.h
|
@ -6,9 +6,11 @@
|
|||
#ifndef TCP_BUF_H
|
||||
#define TCP_BUF_H
|
||||
|
||||
void tcp_sock_iov_init(const struct ctx *c);
|
||||
void tcp_payload_flush(const struct ctx *c);
|
||||
int tcp_buf_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn);
|
||||
int tcp_buf_send_flag(const struct ctx *c, struct tcp_tap_conn *conn, int flags);
|
||||
void tcp_sock4_iov_init(const struct ctx *c);
|
||||
void tcp_sock6_iov_init(const struct ctx *c);
|
||||
void tcp_flags_flush(const struct ctx *c);
|
||||
void tcp_payload_flush(struct ctx *c);
|
||||
int tcp_buf_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn);
|
||||
int tcp_buf_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags);
|
||||
|
||||
#endif /*TCP_BUF_H */
|
||||
|
|
108
tcp_conn.h
108
tcp_conn.h
|
@ -19,7 +19,6 @@
|
|||
* @tap_mss: MSS advertised by tap/guest, rounded to 2 ^ TCP_MSS_BITS
|
||||
* @sock: Socket descriptor number
|
||||
* @events: Connection events, implying connection states
|
||||
* @listening_sock: Listening socket this socket was accept()ed from, or -1
|
||||
* @timer: timerfd descriptor for timeout events
|
||||
* @flags: Connection flags representing internal attributes
|
||||
* @sndbuf: Sending buffer in kernel, rounded to 2 ^ SNDBUF_BITS
|
||||
|
@ -69,7 +68,6 @@ struct tcp_tap_conn {
|
|||
#define CONN_STATE_BITS /* Setting these clears other flags */ \
|
||||
(SOCK_ACCEPTED | TAP_SYN_RCVD | ESTABLISHED)
|
||||
|
||||
int listening_sock;
|
||||
|
||||
int timer :FD_REF_BITS;
|
||||
|
||||
|
@ -79,7 +77,6 @@ struct tcp_tap_conn {
|
|||
#define ACTIVE_CLOSE BIT(2)
|
||||
#define ACK_TO_TAP_DUE BIT(3)
|
||||
#define ACK_FROM_TAP_DUE BIT(4)
|
||||
#define ACK_FROM_TAP_BLOCKS BIT(5)
|
||||
|
||||
#define SNDBUF_BITS 24
|
||||
unsigned int sndbuf :SNDBUF_BITS;
|
||||
|
@ -98,95 +95,6 @@ struct tcp_tap_conn {
|
|||
uint32_t seq_init_from_tap;
|
||||
};
|
||||
|
||||
/**
|
||||
* struct tcp_tap_transfer - Migrated TCP data, flow table part, network order
|
||||
* @pif: Interfaces for each side of the flow
|
||||
* @side: Addresses and ports for each side of the flow
|
||||
* @retrans: Number of retransmissions occurred due to ACK_TIMEOUT
|
||||
* @ws_from_tap: Window scaling factor advertised from tap/guest
|
||||
* @ws_to_tap: Window scaling factor advertised to tap/guest
|
||||
* @events: Connection events, implying connection states
|
||||
* @tap_mss: MSS advertised by tap/guest, rounded to 2 ^ TCP_MSS_BITS
|
||||
* @sndbuf: Sending buffer in kernel, rounded to 2 ^ SNDBUF_BITS
|
||||
* @flags: Connection flags representing internal attributes
|
||||
* @seq_dup_ack_approx: Last duplicate ACK number sent to tap
|
||||
* @wnd_from_tap: Last window size from tap, unscaled (as received)
|
||||
* @wnd_to_tap: Sending window advertised to tap, unscaled (as sent)
|
||||
* @seq_to_tap: Next sequence for packets to tap
|
||||
* @seq_ack_from_tap: Last ACK number received from tap
|
||||
* @seq_from_tap: Next sequence for packets from tap (not actually sent)
|
||||
* @seq_ack_to_tap: Last ACK number sent to tap
|
||||
* @seq_init_from_tap: Initial sequence number from tap
|
||||
*/
|
||||
struct tcp_tap_transfer {
|
||||
uint8_t pif[SIDES];
|
||||
struct flowside side[SIDES];
|
||||
|
||||
uint8_t retrans;
|
||||
uint8_t ws_from_tap;
|
||||
uint8_t ws_to_tap;
|
||||
uint8_t events;
|
||||
|
||||
uint32_t tap_mss;
|
||||
|
||||
uint32_t sndbuf;
|
||||
|
||||
uint8_t flags;
|
||||
uint8_t seq_dup_ack_approx;
|
||||
|
||||
uint16_t wnd_from_tap;
|
||||
uint16_t wnd_to_tap;
|
||||
|
||||
uint32_t seq_to_tap;
|
||||
uint32_t seq_ack_from_tap;
|
||||
uint32_t seq_from_tap;
|
||||
uint32_t seq_ack_to_tap;
|
||||
uint32_t seq_init_from_tap;
|
||||
} __attribute__((packed, aligned(__alignof__(uint32_t))));
|
||||
|
||||
/**
|
||||
* struct tcp_tap_transfer_ext - Migrated TCP data, outside flow, network order
|
||||
* @seq_snd: Socket-side send sequence
|
||||
* @seq_rcv: Socket-side receive sequence
|
||||
* @sndq: Length of pending send queue (unacknowledged / not sent)
|
||||
* @notsent: Part of pending send queue that wasn't sent out yet
|
||||
* @rcvq: Length of pending receive queue
|
||||
* @mss: Socket-side MSS clamp
|
||||
* @timestamp: RFC 7323 timestamp
|
||||
* @snd_wl1: Next sequence used in window probe (next sequence - 1)
|
||||
* @snd_wnd: Socket-side sending window
|
||||
* @max_window: Window clamp
|
||||
* @rcv_wnd: Socket-side receive window
|
||||
* @rcv_wup: rcv_nxt on last window update sent
|
||||
* @snd_ws: Window scaling factor, send
|
||||
* @rcv_ws: Window scaling factor, receive
|
||||
* @tcpi_state: Connection state in TCP_INFO style (enum, tcp_states.h)
|
||||
* @tcpi_options: TCPI_OPT_* constants (timestamps, selective ACK)
|
||||
*/
|
||||
struct tcp_tap_transfer_ext {
|
||||
uint32_t seq_snd;
|
||||
uint32_t seq_rcv;
|
||||
|
||||
uint32_t sndq;
|
||||
uint32_t notsent;
|
||||
uint32_t rcvq;
|
||||
|
||||
uint32_t mss;
|
||||
uint32_t timestamp;
|
||||
|
||||
/* We can't just use struct tcp_repair_window: we need network order */
|
||||
uint32_t snd_wl1;
|
||||
uint32_t snd_wnd;
|
||||
uint32_t max_window;
|
||||
uint32_t rcv_wnd;
|
||||
uint32_t rcv_wup;
|
||||
|
||||
uint8_t snd_ws;
|
||||
uint8_t rcv_ws;
|
||||
uint8_t tcpi_state;
|
||||
uint8_t tcpi_options;
|
||||
} __attribute__((packed, aligned(__alignof__(uint32_t))));
|
||||
|
||||
/**
|
||||
* struct tcp_splice_conn - Descriptor for a spliced TCP connection
|
||||
* @f: Generic flow information
|
||||
|
@ -231,23 +139,11 @@ extern int init_sock_pool4 [TCP_SOCK_POOL_SIZE];
|
|||
extern int init_sock_pool6 [TCP_SOCK_POOL_SIZE];
|
||||
|
||||
bool tcp_flow_defer(const struct tcp_tap_conn *conn);
|
||||
|
||||
int tcp_flow_repair_on(struct ctx *c, const struct tcp_tap_conn *conn);
|
||||
int tcp_flow_repair_off(struct ctx *c, const struct tcp_tap_conn *conn);
|
||||
|
||||
int tcp_flow_migrate_source(int fd, struct tcp_tap_conn *conn);
|
||||
int tcp_flow_migrate_source_ext(int fd, const struct tcp_tap_conn *conn);
|
||||
|
||||
int tcp_flow_migrate_target(struct ctx *c, int fd);
|
||||
int tcp_flow_migrate_target_ext(struct ctx *c, struct tcp_tap_conn *conn, int fd);
|
||||
|
||||
bool tcp_flow_is_established(const struct tcp_tap_conn *conn);
|
||||
|
||||
bool tcp_splice_flow_defer(struct tcp_splice_conn *conn);
|
||||
void tcp_splice_timer(const struct ctx *c, struct tcp_splice_conn *conn);
|
||||
int tcp_conn_pool_sock(int pool[]);
|
||||
int tcp_conn_sock(sa_family_t af);
|
||||
int tcp_sock_refill_pool(int pool[], sa_family_t af);
|
||||
int tcp_conn_sock(const struct ctx *c, sa_family_t af);
|
||||
int tcp_sock_refill_pool(const struct ctx *c, int pool[], sa_family_t af);
|
||||
void tcp_splice_refill(const struct ctx *c);
|
||||
|
||||
#endif /* TCP_CONN_H */
|
||||
|
|
106
tcp_internal.h
106
tcp_internal.h
|
@ -33,18 +33,16 @@
|
|||
#define OPT_EOL 0
|
||||
#define OPT_NOP 1
|
||||
#define OPT_MSS 2
|
||||
#define OPT_MSS_LEN 4
|
||||
#define OPT_WS 3
|
||||
#define OPT_WS_LEN 3
|
||||
#define OPT_SACKP 4
|
||||
#define OPT_SACK 5
|
||||
#define OPT_TS 8
|
||||
|
||||
#define TAPSIDE(conn_) ((conn_)->f.pif[1] == PIF_TAP)
|
||||
#define TAPFLOW(conn_) (&((conn_)->f.side[TAPSIDE(conn_)]))
|
||||
#define TAP_SIDX(conn_) (FLOW_SIDX((conn_), TAPSIDE(conn_)))
|
||||
|
||||
#define HOSTSIDE(conn_) ((conn_)->f.pif[1] == PIF_HOST)
|
||||
#define HOSTFLOW(conn_) (&((conn_)->f.side[HOSTSIDE(conn_)]))
|
||||
/* Flow-side index of the host side; must use HOSTSIDE(), not TAPSIDE() */
#define HOST_SIDX(conn_) (FLOW_SIDX((conn_), HOSTSIDE(conn_)))
|
||||
#define TAPSIDE(conn_) ((conn_)->f.pif[1] == PIF_TAP)
|
||||
#define TAPFLOW(conn_) (&((conn_)->f.side[TAPSIDE(conn_)]))
|
||||
#define TAP_SIDX(conn_) (FLOW_SIDX((conn_), TAPSIDE(conn_)))
|
||||
|
||||
#define CONN_V4(conn) (!!inany_v4(&TAPFLOW(conn)->oaddr))
|
||||
#define CONN_V6(conn) (!CONN_V4(conn))
|
||||
|
@ -65,79 +63,6 @@ enum tcp_iov_parts {
|
|||
TCP_NUM_IOVS
|
||||
};
|
||||
|
||||
/**
|
||||
* struct tcp_payload_t - TCP header and data to send segments with payload
|
||||
* @th: TCP header
|
||||
* @data: TCP data
|
||||
*/
|
||||
struct tcp_payload_t {
|
||||
struct tcphdr th;
|
||||
uint8_t data[IP_MAX_MTU - sizeof(struct tcphdr)];
|
||||
#ifdef __AVX2__
|
||||
} __attribute__ ((packed, aligned(32))); /* For AVX2 checksum routines */
|
||||
#else
|
||||
} __attribute__ ((packed, aligned(__alignof__(unsigned int))));
|
||||
#endif
|
||||
|
||||
/** struct tcp_opt_nop - TCP NOP option
|
||||
* @kind: Option kind (OPT_NOP = 1)
|
||||
*/
|
||||
struct tcp_opt_nop {
|
||||
uint8_t kind;
|
||||
} __attribute__ ((packed));
|
||||
#define TCP_OPT_NOP ((struct tcp_opt_nop){ .kind = OPT_NOP, })
|
||||
|
||||
/** struct tcp_opt_mss - TCP MSS option
|
||||
* @kind: Option kind (OPT_MSS == 2)
|
||||
* @len: Option length (4)
|
||||
* @mss: Maximum Segment Size
|
||||
*/
|
||||
struct tcp_opt_mss {
|
||||
uint8_t kind;
|
||||
uint8_t len;
|
||||
uint16_t mss;
|
||||
} __attribute__ ((packed));
|
||||
#define TCP_OPT_MSS(mss_) \
|
||||
((struct tcp_opt_mss) { \
|
||||
.kind = OPT_MSS, \
|
||||
.len = sizeof(struct tcp_opt_mss), \
|
||||
.mss = htons(mss_), \
|
||||
})
|
||||
|
||||
/** struct tcp_opt_ws - TCP Window Scaling option
|
||||
* @kind: Option kind (OPT_WS == 3)
|
||||
* @len: Option length (3)
|
||||
* @shift: Window scaling shift
|
||||
*/
|
||||
struct tcp_opt_ws {
|
||||
uint8_t kind;
|
||||
uint8_t len;
|
||||
uint8_t shift;
|
||||
} __attribute__ ((packed));
|
||||
#define TCP_OPT_WS(shift_) \
|
||||
((struct tcp_opt_ws) { \
|
||||
.kind = OPT_WS, \
|
||||
.len = sizeof(struct tcp_opt_ws), \
|
||||
.shift = (shift_), \
|
||||
})
|
||||
|
||||
/** struct tcp_syn_opts - TCP options we apply to SYN packets
|
||||
* @mss: Maximum Segment Size (MSS) option
|
||||
* @nop: NOP opt (for alignment)
|
||||
* @ws: Window Scaling (WS) option
|
||||
*/
|
||||
struct tcp_syn_opts {
|
||||
struct tcp_opt_mss mss;
|
||||
struct tcp_opt_nop nop;
|
||||
struct tcp_opt_ws ws;
|
||||
} __attribute__ ((packed));
|
||||
#define TCP_SYN_OPTS(mss_, ws_) \
|
||||
((struct tcp_syn_opts){ \
|
||||
.mss = TCP_OPT_MSS(mss_), \
|
||||
.nop = TCP_OPT_NOP, \
|
||||
.ws = TCP_OPT_WS(ws_), \
|
||||
})
|
||||
|
||||
extern char tcp_buf_discard [MAX_WINDOW];
|
||||
|
||||
void conn_flag_do(const struct ctx *c, struct tcp_tap_conn *conn,
|
||||
|
@ -157,26 +82,19 @@ void conn_event_do(const struct ctx *c, struct tcp_tap_conn *conn,
|
|||
conn_event_do(c, conn, event); \
|
||||
} while (0)
|
||||
|
||||
void tcp_rst_do(const struct ctx *c, struct tcp_tap_conn *conn);
|
||||
void tcp_rst_do(struct ctx *c, struct tcp_tap_conn *conn);
|
||||
#define tcp_rst(c, conn) \
|
||||
do { \
|
||||
flow_dbg((conn), "TCP reset at %s:%i", __func__, __LINE__); \
|
||||
tcp_rst_do(c, conn); \
|
||||
} while (0)
|
||||
|
||||
struct tcp_info_linux;
|
||||
|
||||
void tcp_fill_headers(const struct tcp_tap_conn *conn,
|
||||
struct tap_hdr *taph,
|
||||
struct iphdr *ip4h, struct ipv6hdr *ip6h,
|
||||
struct tcphdr *th, struct iov_tail *payload,
|
||||
const uint16_t *ip4_check, uint32_t seq, bool no_tcp_csum);
|
||||
|
||||
size_t tcp_l2_buf_fill_headers(const struct tcp_tap_conn *conn,
|
||||
struct iovec *iov, size_t dlen,
|
||||
const uint16_t *check, uint32_t seq);
|
||||
int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_tap_conn *conn,
|
||||
bool force_seq, struct tcp_info_linux *tinfo);
|
||||
int tcp_prepare_flags(const struct ctx *c, struct tcp_tap_conn *conn,
|
||||
int flags, struct tcphdr *th, struct tcp_syn_opts *opts,
|
||||
size_t *optlen);
|
||||
int tcp_set_peek_offset(const struct tcp_tap_conn *conn, int offset);
|
||||
int force_seq, struct tcp_info *tinfo);
|
||||
int tcp_prepare_flags(struct ctx *c, struct tcp_tap_conn *conn, int flags,
|
||||
struct tcphdr *th, char *data, size_t *optlen);
|
||||
|
||||
#endif /* TCP_INTERNAL_H */
|
||||
|
|
120
tcp_splice.c
120
tcp_splice.c
|
@ -28,7 +28,7 @@
|
|||
* - FIN_SENT_0: FIN (write shutdown) sent to accepted socket
|
||||
* - FIN_SENT_1: FIN (write shutdown) sent to target socket
|
||||
*
|
||||
* #syscalls:pasta pipe2|pipe fcntl arm:fcntl64 ppc64:fcntl64|fcntl i686:fcntl64
|
||||
* #syscalls:pasta pipe2|pipe fcntl arm:fcntl64 ppc64:fcntl64 i686:fcntl64
|
||||
*/
|
||||
|
||||
#include <sched.h>
|
||||
|
@ -95,7 +95,7 @@ static int tcp_conn_sock_ns(const struct ctx *c, sa_family_t af);
|
|||
* conn_at_sidx() - Get spliced TCP connection specific flow at given sidx
|
||||
* @sidx: Flow and side to retrieve
|
||||
*
|
||||
* Return: spliced TCP connection at @sidx, or NULL if @sidx is invalid.
|
||||
* Return: Spliced TCP connection at @sidx, or NULL if @sidx is invalid.
|
||||
* Asserts if the flow at @sidx is not FLOW_TCP_SPLICE.
|
||||
*/
|
||||
static struct tcp_splice_conn *conn_at_sidx(flow_sidx_t sidx)
|
||||
|
@ -131,12 +131,8 @@ static void tcp_splice_conn_epoll_events(uint16_t events,
|
|||
ev[1].events = EPOLLOUT;
|
||||
}
|
||||
|
||||
flow_foreach_sidei(sidei) {
|
||||
if (events & OUT_WAIT(sidei)) {
|
||||
ev[sidei].events |= EPOLLOUT;
|
||||
ev[!sidei].events &= ~EPOLLIN;
|
||||
}
|
||||
}
|
||||
flow_foreach_sidei(sidei)
|
||||
ev[sidei].events |= (events & OUT_WAIT(sidei)) ? EPOLLOUT : 0;
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -164,7 +160,7 @@ static int tcp_splice_epoll_ctl(const struct ctx *c,
|
|||
if (epoll_ctl(c->epollfd, m, conn->s[0], &ev[0]) ||
|
||||
epoll_ctl(c->epollfd, m, conn->s[1], &ev[1])) {
|
||||
int ret = -errno;
|
||||
flow_perror(conn, "ERROR on epoll_ctl()");
|
||||
flow_err(conn, "ERROR on epoll_ctl(): %s", strerror(errno));
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
@ -204,8 +200,8 @@ static void conn_flag_do(const struct ctx *c, struct tcp_splice_conn *conn,
|
|||
}
|
||||
|
||||
if (flag == CLOSING) {
|
||||
epoll_del(c, conn->s[0]);
|
||||
epoll_del(c, conn->s[1]);
|
||||
epoll_ctl(c->epollfd, EPOLL_CTL_DEL, conn->s[0], NULL);
|
||||
epoll_ctl(c->epollfd, EPOLL_CTL_DEL, conn->s[1], NULL);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -317,14 +313,14 @@ static int tcp_splice_connect_finish(const struct ctx *c,
|
|||
|
||||
if (conn->pipe[sidei][0] < 0) {
|
||||
if (pipe2(conn->pipe[sidei], O_NONBLOCK | O_CLOEXEC)) {
|
||||
flow_perror(conn, "cannot create %d->%d pipe",
|
||||
sidei, !sidei);
|
||||
flow_err(conn, "cannot create %d->%d pipe: %s",
|
||||
sidei, !sidei, strerror(errno));
|
||||
conn_flag(c, conn, CLOSING);
|
||||
return -EIO;
|
||||
}
|
||||
|
||||
if (fcntl(conn->pipe[sidei][0], F_SETPIPE_SZ,
|
||||
c->tcp.pipe_size) != (int)c->tcp.pipe_size) {
|
||||
c->tcp.pipe_size)) {
|
||||
flow_trace(conn,
|
||||
"cannot set %d->%d pipe size to %zu",
|
||||
sidei, !sidei, c->tcp.pipe_size);
|
||||
|
@ -352,10 +348,9 @@ static int tcp_splice_connect(const struct ctx *c, struct tcp_splice_conn *conn)
|
|||
uint8_t tgtpif = conn->f.pif[TGTSIDE];
|
||||
union sockaddr_inany sa;
|
||||
socklen_t sl;
|
||||
int one = 1;
|
||||
|
||||
if (tgtpif == PIF_HOST)
|
||||
conn->s[1] = tcp_conn_sock(af);
|
||||
conn->s[1] = tcp_conn_sock(c, af);
|
||||
else if (tgtpif == PIF_SPLICE)
|
||||
conn->s[1] = tcp_conn_sock_ns(c, af);
|
||||
else
|
||||
|
@ -364,27 +359,18 @@ static int tcp_splice_connect(const struct ctx *c, struct tcp_splice_conn *conn)
|
|||
if (conn->s[1] < 0)
|
||||
return -1;
|
||||
|
||||
if (setsockopt(conn->s[1], SOL_TCP, TCP_QUICKACK, &one, sizeof(one))) {
|
||||
if (setsockopt(conn->s[1], SOL_TCP, TCP_QUICKACK,
|
||||
&((int){ 1 }), sizeof(int))) {
|
||||
flow_trace(conn, "failed to set TCP_QUICKACK on socket %i",
|
||||
conn->s[1]);
|
||||
}
|
||||
|
||||
if (setsockopt(conn->s[0], SOL_TCP, TCP_NODELAY, &one, sizeof(one))) {
|
||||
flow_trace(conn, "failed to set TCP_NODELAY on socket %i",
|
||||
conn->s[0]);
|
||||
}
|
||||
|
||||
if (setsockopt(conn->s[1], SOL_TCP, TCP_NODELAY, &one, sizeof(one))) {
|
||||
flow_trace(conn, "failed to set TCP_NODELAY on socket %i",
|
||||
conn->s[1]);
|
||||
}
|
||||
|
||||
pif_sockaddr(c, &sa, &sl, tgtpif, &tgt->eaddr, tgt->eport);
|
||||
|
||||
if (connect(conn->s[1], &sa.sa, sl)) {
|
||||
if (errno != EINPROGRESS) {
|
||||
flow_trace(conn, "Couldn't connect socket for splice: %s",
|
||||
strerror_(errno));
|
||||
strerror(errno));
|
||||
return -errno;
|
||||
}
|
||||
|
||||
|
@ -402,7 +388,7 @@ static int tcp_splice_connect(const struct ctx *c, struct tcp_splice_conn *conn)
|
|||
* @c: Execution context
|
||||
* @af: Address family (AF_INET or AF_INET6)
|
||||
*
|
||||
* Return: socket fd in the namespace on success, -errno on failure
|
||||
* Return: Socket fd in the namespace on success, -errno on failure
|
||||
*/
|
||||
static int tcp_conn_sock_ns(const struct ctx *c, sa_family_t af)
|
||||
{
|
||||
|
@ -482,10 +468,11 @@ void tcp_splice_sock_handler(struct ctx *c, union epoll_ref ref,
|
|||
|
||||
rc = getsockopt(ref.fd, SOL_SOCKET, SO_ERROR, &err, &sl);
|
||||
if (rc)
|
||||
flow_perror(conn, "Error retrieving SO_ERROR");
|
||||
flow_err(conn, "Error retrieving SO_ERROR: %s",
|
||||
strerror(errno));
|
||||
else
|
||||
flow_trace(conn, "Error event on socket: %s",
|
||||
strerror_(err));
|
||||
strerror(err));
|
||||
|
||||
goto close;
|
||||
}
|
||||
|
@ -516,27 +503,29 @@ swap:
|
|||
lowat_act_flag = RCVLOWAT_ACT(fromsidei);
|
||||
|
||||
while (1) {
|
||||
ssize_t readlen, written, pending;
|
||||
ssize_t readlen, to_write = 0, written;
|
||||
int more = 0;
|
||||
|
||||
retry:
|
||||
do
|
||||
readlen = splice(conn->s[fromsidei], NULL,
|
||||
conn->pipe[fromsidei][1], NULL,
|
||||
c->tcp.pipe_size,
|
||||
SPLICE_F_MOVE | SPLICE_F_NONBLOCK);
|
||||
while (readlen < 0 && errno == EINTR);
|
||||
|
||||
if (readlen < 0 && errno != EAGAIN)
|
||||
goto close;
|
||||
|
||||
readlen = splice(conn->s[fromsidei], NULL,
|
||||
conn->pipe[fromsidei][1], NULL,
|
||||
c->tcp.pipe_size,
|
||||
SPLICE_F_MOVE | SPLICE_F_NONBLOCK);
|
||||
flow_trace(conn, "%zi from read-side call", readlen);
|
||||
if (readlen < 0) {
|
||||
if (errno == EINTR)
|
||||
goto retry;
|
||||
|
||||
if (!readlen) {
|
||||
if (errno != EAGAIN)
|
||||
goto close;
|
||||
|
||||
to_write = c->tcp.pipe_size;
|
||||
} else if (!readlen) {
|
||||
eof = 1;
|
||||
} else if (readlen > 0) {
|
||||
to_write = c->tcp.pipe_size;
|
||||
} else {
|
||||
never_read = 0;
|
||||
|
||||
to_write += readlen;
|
||||
if (readlen >= (long)c->tcp.pipe_size * 90 / 100)
|
||||
more = SPLICE_F_MORE;
|
||||
|
||||
|
@ -544,25 +533,19 @@ retry:
|
|||
conn_flag(c, conn, lowat_act_flag);
|
||||
}
|
||||
|
||||
do
|
||||
written = splice(conn->pipe[fromsidei][0], NULL,
|
||||
conn->s[!fromsidei], NULL,
|
||||
c->tcp.pipe_size,
|
||||
SPLICE_F_MOVE | more | SPLICE_F_NONBLOCK);
|
||||
while (written < 0 && errno == EINTR);
|
||||
|
||||
if (written < 0 && errno != EAGAIN)
|
||||
goto close;
|
||||
|
||||
eintr:
|
||||
written = splice(conn->pipe[fromsidei][0], NULL,
|
||||
conn->s[!fromsidei], NULL, to_write,
|
||||
SPLICE_F_MOVE | more | SPLICE_F_NONBLOCK);
|
||||
flow_trace(conn, "%zi from write-side call (passed %zi)",
|
||||
written, c->tcp.pipe_size);
|
||||
written, to_write);
|
||||
|
||||
/* Most common case: skip updating counters. */
|
||||
if (readlen > 0 && readlen == written) {
|
||||
if (readlen >= (long)c->tcp.pipe_size * 10 / 100)
|
||||
continue;
|
||||
|
||||
if (!(conn->flags & lowat_set_flag) &&
|
||||
if (conn->flags & lowat_set_flag &&
|
||||
readlen > (long)c->tcp.pipe_size / 10) {
|
||||
int lowat = c->tcp.pipe_size / 4;
|
||||
|
||||
|
@ -571,7 +554,7 @@ retry:
|
|||
&lowat, sizeof(lowat))) {
|
||||
flow_trace(conn,
|
||||
"Setting SO_RCVLOWAT %i: %s",
|
||||
lowat, strerror_(errno));
|
||||
lowat, strerror(errno));
|
||||
} else {
|
||||
conn_flag(c, conn, lowat_set_flag);
|
||||
conn_flag(c, conn, lowat_act_flag);
|
||||
|
@ -585,6 +568,12 @@ retry:
|
|||
conn->written[fromsidei] += written > 0 ? written : 0;
|
||||
|
||||
if (written < 0) {
|
||||
if (errno == EINTR)
|
||||
goto eintr;
|
||||
|
||||
if (errno != EAGAIN)
|
||||
goto close;
|
||||
|
||||
if (conn->read[fromsidei] == conn->written[fromsidei])
|
||||
break;
|
||||
|
||||
|
@ -595,9 +584,10 @@ retry:
|
|||
if (never_read && written == (long)(c->tcp.pipe_size))
|
||||
goto retry;
|
||||
|
||||
pending = conn->read[fromsidei] - conn->written[fromsidei];
|
||||
if (!never_read && written > 0 && written < pending)
|
||||
if (!never_read && written < to_write) {
|
||||
to_write -= written;
|
||||
goto retry;
|
||||
}
|
||||
|
||||
if (eof)
|
||||
break;
|
||||
|
@ -686,7 +676,7 @@ static void tcp_splice_pipe_refill(const struct ctx *c)
|
|||
continue;
|
||||
|
||||
if (fcntl(splice_pipe_pool[i][0], F_SETPIPE_SZ,
|
||||
c->tcp.pipe_size) != (int)c->tcp.pipe_size) {
|
||||
c->tcp.pipe_size)) {
|
||||
trace("TCP (spliced): cannot set pool pipe size to %zu",
|
||||
c->tcp.pipe_size);
|
||||
}
|
||||
|
@ -707,16 +697,16 @@ static int tcp_sock_refill_ns(void *arg)
|
|||
ns_enter(c);
|
||||
|
||||
if (c->ifi4) {
|
||||
int rc = tcp_sock_refill_pool(ns_sock_pool4, AF_INET);
|
||||
int rc = tcp_sock_refill_pool(c, ns_sock_pool4, AF_INET);
|
||||
if (rc < 0)
|
||||
warn("TCP: Error refilling IPv4 ns socket pool: %s",
|
||||
strerror_(-rc));
|
||||
strerror(-rc));
|
||||
}
|
||||
if (c->ifi6) {
|
||||
int rc = tcp_sock_refill_pool(ns_sock_pool6, AF_INET6);
|
||||
int rc = tcp_sock_refill_pool(c, ns_sock_pool6, AF_INET6);
|
||||
if (rc < 0)
|
||||
warn("TCP: Error refilling IPv6 ns socket pool: %s",
|
||||
strerror_(-rc));
|
||||
strerror(-rc));
|
||||
}
|
||||
|
||||
return 0;
|
||||
|
|
476
tcp_vu.c
476
tcp_vu.c
|
@ -1,476 +0,0 @@
|
|||
// SPDX-License-Identifier: GPL-2.0-or-later
|
||||
/* tcp_vu.c - TCP L2 vhost-user management functions
|
||||
*
|
||||
* Copyright Red Hat
|
||||
* Author: Laurent Vivier <lvivier@redhat.com>
|
||||
*/
|
||||
|
||||
#include <errno.h>
|
||||
#include <stddef.h>
|
||||
#include <stdint.h>
|
||||
|
||||
#include <netinet/ip.h>
|
||||
#include <netinet/tcp.h>
|
||||
|
||||
#include <sys/socket.h>
|
||||
|
||||
#include <netinet/if_ether.h>
|
||||
#include <linux/virtio_net.h>
|
||||
|
||||
#include "util.h"
|
||||
#include "ip.h"
|
||||
#include "passt.h"
|
||||
#include "siphash.h"
|
||||
#include "inany.h"
|
||||
#include "vhost_user.h"
|
||||
#include "tcp.h"
|
||||
#include "pcap.h"
|
||||
#include "flow.h"
|
||||
#include "tcp_conn.h"
|
||||
#include "flow_table.h"
|
||||
#include "tcp_vu.h"
|
||||
#include "tap.h"
|
||||
#include "tcp_internal.h"
|
||||
#include "checksum.h"
|
||||
#include "vu_common.h"
|
||||
#include <time.h>
|
||||
|
||||
static struct iovec iov_vu[VIRTQUEUE_MAX_SIZE + 1];
|
||||
static struct vu_virtq_element elem[VIRTQUEUE_MAX_SIZE];
|
||||
static int head[VIRTQUEUE_MAX_SIZE + 1];
|
||||
|
||||
/**
|
||||
* tcp_vu_hdrlen() - return the size of the header in level 2 frame (TCP)
|
||||
* @v6: Set for IPv6 packet
|
||||
*
|
||||
* Return: size of the header
|
||||
*/
|
||||
static size_t tcp_vu_hdrlen(bool v6)
|
||||
{
|
||||
size_t hdrlen;
|
||||
|
||||
hdrlen = sizeof(struct virtio_net_hdr_mrg_rxbuf) +
|
||||
sizeof(struct ethhdr) + sizeof(struct tcphdr);
|
||||
|
||||
if (v6)
|
||||
hdrlen += sizeof(struct ipv6hdr);
|
||||
else
|
||||
hdrlen += sizeof(struct iphdr);
|
||||
|
||||
return hdrlen;
|
||||
}
|
||||
|
||||
/**
|
||||
* tcp_vu_send_flag() - Send segment with flags to vhost-user (no payload)
|
||||
* @c: Execution context
|
||||
* @conn: Connection pointer
|
||||
* @flags: TCP flags: if not set, send segment only if ACK is due
|
||||
*
|
||||
* Return: negative error code on connection reset, 0 otherwise
|
||||
*/
|
||||
int tcp_vu_send_flag(const struct ctx *c, struct tcp_tap_conn *conn, int flags)
|
||||
{
|
||||
struct vu_dev *vdev = c->vdev;
|
||||
struct vu_virtq *vq = &vdev->vq[VHOST_USER_RX_QUEUE];
|
||||
size_t optlen, hdrlen;
|
||||
struct vu_virtq_element flags_elem[2];
|
||||
struct ipv6hdr *ip6h = NULL;
|
||||
struct iphdr *ip4h = NULL;
|
||||
struct iovec flags_iov[2];
|
||||
struct tcp_syn_opts *opts;
|
||||
struct iov_tail payload;
|
||||
struct tcphdr *th;
|
||||
struct ethhdr *eh;
|
||||
uint32_t seq;
|
||||
int elem_cnt;
|
||||
int nb_ack;
|
||||
int ret;
|
||||
|
||||
hdrlen = tcp_vu_hdrlen(CONN_V6(conn));
|
||||
|
||||
vu_set_element(&flags_elem[0], NULL, &flags_iov[0]);
|
||||
|
||||
elem_cnt = vu_collect(vdev, vq, &flags_elem[0], 1,
|
||||
hdrlen + sizeof(struct tcp_syn_opts), NULL);
|
||||
if (elem_cnt != 1)
|
||||
return -1;
|
||||
|
||||
ASSERT(flags_elem[0].in_sg[0].iov_len >=
|
||||
hdrlen + sizeof(struct tcp_syn_opts));
|
||||
|
||||
vu_set_vnethdr(vdev, flags_elem[0].in_sg[0].iov_base, 1);
|
||||
|
||||
eh = vu_eth(flags_elem[0].in_sg[0].iov_base);
|
||||
|
||||
memcpy(eh->h_dest, c->guest_mac, sizeof(eh->h_dest));
|
||||
memcpy(eh->h_source, c->our_tap_mac, sizeof(eh->h_source));
|
||||
|
||||
if (CONN_V4(conn)) {
|
||||
eh->h_proto = htons(ETH_P_IP);
|
||||
|
||||
ip4h = vu_ip(flags_elem[0].in_sg[0].iov_base);
|
||||
*ip4h = (struct iphdr)L2_BUF_IP4_INIT(IPPROTO_TCP);
|
||||
|
||||
th = vu_payloadv4(flags_elem[0].in_sg[0].iov_base);
|
||||
} else {
|
||||
eh->h_proto = htons(ETH_P_IPV6);
|
||||
|
||||
ip6h = vu_ip(flags_elem[0].in_sg[0].iov_base);
|
||||
*ip6h = (struct ipv6hdr)L2_BUF_IP6_INIT(IPPROTO_TCP);
|
||||
th = vu_payloadv6(flags_elem[0].in_sg[0].iov_base);
|
||||
}
|
||||
|
||||
memset(th, 0, sizeof(*th));
|
||||
th->doff = sizeof(*th) / 4;
|
||||
th->ack = 1;
|
||||
|
||||
seq = conn->seq_to_tap;
|
||||
opts = (struct tcp_syn_opts *)(th + 1);
|
||||
ret = tcp_prepare_flags(c, conn, flags, th, opts, &optlen);
|
||||
if (ret <= 0) {
|
||||
vu_queue_rewind(vq, 1);
|
||||
return ret;
|
||||
}
|
||||
|
||||
flags_elem[0].in_sg[0].iov_len = hdrlen + optlen;
|
||||
payload = IOV_TAIL(flags_elem[0].in_sg, 1, hdrlen);
|
||||
|
||||
tcp_fill_headers(conn, NULL, ip4h, ip6h, th, &payload,
|
||||
NULL, seq, !*c->pcap);
|
||||
|
||||
if (*c->pcap) {
|
||||
pcap_iov(&flags_elem[0].in_sg[0], 1,
|
||||
sizeof(struct virtio_net_hdr_mrg_rxbuf));
|
||||
}
|
||||
nb_ack = 1;
|
||||
|
||||
if (flags & DUP_ACK) {
|
||||
vu_set_element(&flags_elem[1], NULL, &flags_iov[1]);
|
||||
|
||||
elem_cnt = vu_collect(vdev, vq, &flags_elem[1], 1,
|
||||
flags_elem[0].in_sg[0].iov_len, NULL);
|
||||
if (elem_cnt == 1 &&
|
||||
flags_elem[1].in_sg[0].iov_len >=
|
||||
flags_elem[0].in_sg[0].iov_len) {
|
||||
memcpy(flags_elem[1].in_sg[0].iov_base,
|
||||
flags_elem[0].in_sg[0].iov_base,
|
||||
flags_elem[0].in_sg[0].iov_len);
|
||||
nb_ack++;
|
||||
|
||||
if (*c->pcap) {
|
||||
pcap_iov(&flags_elem[1].in_sg[0], 1,
|
||||
sizeof(struct virtio_net_hdr_mrg_rxbuf));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
vu_flush(vdev, vq, flags_elem, nb_ack);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/** tcp_vu_sock_recv() - Receive datastream from socket into vhost-user buffers
|
||||
* @c: Execution context
|
||||
* @conn: Connection pointer
|
||||
* @v6: Set for IPv6 connections
|
||||
* @already_sent: Number of bytes already sent
|
||||
* @fillsize: Maximum bytes to fill in guest-side receiving window
|
||||
* @iov_cnt: number of iov (output)
|
||||
* @head_cnt: Pointer to store the count of head iov entries (output)
|
||||
*
|
||||
* Return: number of bytes received from the socket, or a negative error code
|
||||
* on failure.
|
||||
*/
|
||||
static ssize_t tcp_vu_sock_recv(const struct ctx *c,
|
||||
const struct tcp_tap_conn *conn, bool v6,
|
||||
uint32_t already_sent, size_t fillsize,
|
||||
int *iov_cnt, int *head_cnt)
|
||||
{
|
||||
struct vu_dev *vdev = c->vdev;
|
||||
struct vu_virtq *vq = &vdev->vq[VHOST_USER_RX_QUEUE];
|
||||
struct msghdr mh_sock = { 0 };
|
||||
uint16_t mss = MSS_GET(conn);
|
||||
int s = conn->sock;
|
||||
ssize_t ret, len;
|
||||
size_t hdrlen;
|
||||
int elem_cnt;
|
||||
int i;
|
||||
|
||||
*iov_cnt = 0;
|
||||
|
||||
hdrlen = tcp_vu_hdrlen(v6);
|
||||
|
||||
vu_init_elem(elem, &iov_vu[1], VIRTQUEUE_MAX_SIZE);
|
||||
|
||||
elem_cnt = 0;
|
||||
*head_cnt = 0;
|
||||
while (fillsize > 0 && elem_cnt < VIRTQUEUE_MAX_SIZE) {
|
||||
struct iovec *iov;
|
||||
size_t frame_size, dlen;
|
||||
int cnt;
|
||||
|
||||
cnt = vu_collect(vdev, vq, &elem[elem_cnt],
|
||||
VIRTQUEUE_MAX_SIZE - elem_cnt,
|
||||
MIN(mss, fillsize) + hdrlen, &frame_size);
|
||||
if (cnt == 0)
|
||||
break;
|
||||
|
||||
dlen = frame_size - hdrlen;
|
||||
|
||||
/* reserve space for headers in iov */
|
||||
iov = &elem[elem_cnt].in_sg[0];
|
||||
ASSERT(iov->iov_len >= hdrlen);
|
||||
iov->iov_base = (char *)iov->iov_base + hdrlen;
|
||||
iov->iov_len -= hdrlen;
|
||||
head[(*head_cnt)++] = elem_cnt;
|
||||
|
||||
fillsize -= dlen;
|
||||
elem_cnt += cnt;
|
||||
}
|
||||
|
||||
if (peek_offset_cap) {
|
||||
mh_sock.msg_iov = iov_vu + 1;
|
||||
mh_sock.msg_iovlen = elem_cnt;
|
||||
} else {
|
||||
iov_vu[0].iov_base = tcp_buf_discard;
|
||||
iov_vu[0].iov_len = already_sent;
|
||||
|
||||
mh_sock.msg_iov = iov_vu;
|
||||
mh_sock.msg_iovlen = elem_cnt + 1;
|
||||
}
|
||||
|
||||
do
|
||||
ret = recvmsg(s, &mh_sock, MSG_PEEK);
|
||||
while (ret < 0 && errno == EINTR);
|
||||
|
||||
if (ret < 0) {
|
||||
vu_queue_rewind(vq, elem_cnt);
|
||||
return -errno;
|
||||
}
|
||||
|
||||
if (!peek_offset_cap)
|
||||
ret -= already_sent;
|
||||
|
||||
/* adjust iov number and length of the last iov */
|
||||
len = ret;
|
||||
for (i = 0; len && i < elem_cnt; i++) {
|
||||
struct iovec *iov = &elem[i].in_sg[0];
|
||||
|
||||
if (iov->iov_len > (size_t)len)
|
||||
iov->iov_len = len;
|
||||
|
||||
len -= iov->iov_len;
|
||||
}
|
||||
/* adjust head count */
|
||||
while (*head_cnt > 0 && head[*head_cnt - 1] >= i)
|
||||
(*head_cnt)--;
|
||||
|
||||
/* mark end of array */
|
||||
head[*head_cnt] = i;
|
||||
*iov_cnt = i;
|
||||
|
||||
/* release unused buffers */
|
||||
vu_queue_rewind(vq, elem_cnt - i);
|
||||
|
||||
/* restore space for headers in iov */
|
||||
for (i = 0; i < *head_cnt; i++) {
|
||||
struct iovec *iov = &elem[head[i]].in_sg[0];
|
||||
|
||||
iov->iov_base = (char *)iov->iov_base - hdrlen;
|
||||
iov->iov_len += hdrlen;
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
/**
|
||||
* tcp_vu_prepare() - Fill in Ethernet, IP and TCP headers of one vhost-user frame
|
||||
* @c: Execution context
|
||||
* @conn: Connection pointer
|
||||
* @iov: Pointer to the array of IO vectors holding the frame; the
* first entry must be at least as long as the headers (asserted)
|
||||
* @iov_cnt: Number of entries in @iov
|
||||
* @check: In/out: on entry, *@check is handed to tcp_fill_headers()
* (NULL if no checksum is known yet); on return, for IPv4 only,
* it points to the checksum field of this frame's IPv4 header
|
||||
* @no_tcp_csum: Do not set TCP checksum (passed on to tcp_fill_headers())
|
||||
* @push: Set PSH flag, last segment in a batch
|
||||
*/
|
||||
static void tcp_vu_prepare(const struct ctx *c, struct tcp_tap_conn *conn,
|
||||
struct iovec *iov, size_t iov_cnt,
|
||||
const uint16_t **check, bool no_tcp_csum, bool push)
|
||||
{
|
||||
const struct flowside *toside = TAPFLOW(conn);
|
||||
/* IPv4 framing only if both eaddr and oaddr of the tap side are IPv4 */
bool v6 = !(inany_v4(&toside->eaddr) && inany_v4(&toside->oaddr));
|
||||
size_t hdrlen = tcp_vu_hdrlen(v6);
|
||||
/* NOTE(review): presumably the part of @iov after the first hdrlen
 * bytes, i.e. the TCP payload -- confirm against IOV_TAIL()
 */
struct iov_tail payload = IOV_TAIL(iov, iov_cnt, hdrlen);
|
||||
char *base = iov[0].iov_base;
|
||||
struct ipv6hdr *ip6h = NULL;
|
||||
struct iphdr *ip4h = NULL;
|
||||
struct tcphdr *th;
|
||||
struct ethhdr *eh;
|
||||
|
||||
/* We assume that the first iovec provided by the guest is large
|
||||
* enough to hold all the headers needed by the L2 frame
|
||||
*/
|
||||
ASSERT(iov[0].iov_len >= hdrlen);
|
||||
|
||||
eh = vu_eth(base);
|
||||
|
||||
memcpy(eh->h_dest, c->guest_mac, sizeof(eh->h_dest));
|
||||
memcpy(eh->h_source, c->our_tap_mac, sizeof(eh->h_source));
|
||||
|
||||
/* Initialize the EtherType and the IP header (from its template), and
 * locate the TCP header, for the IP version in use: exactly one of
 * ip4h and ip6h is non-NULL afterwards
 */
|
||||
|
||||
if (!v6) {
|
||||
eh->h_proto = htons(ETH_P_IP);
|
||||
|
||||
ip4h = vu_ip(base);
|
||||
*ip4h = (struct iphdr)L2_BUF_IP4_INIT(IPPROTO_TCP);
|
||||
th = vu_payloadv4(base);
|
||||
} else {
|
||||
eh->h_proto = htons(ETH_P_IPV6);
|
||||
|
||||
ip6h = vu_ip(base);
|
||||
*ip6h = (struct ipv6hdr)L2_BUF_IP6_INIT(IPPROTO_TCP);
|
||||
|
||||
th = vu_payloadv6(base);
|
||||
}
|
||||
|
||||
/* Start from a zeroed TCP header: data offset covers only the fixed
 * header (in 32-bit words, so no options), ACK is always set, PSH as
 * requested by the caller
 */
memset(th, 0, sizeof(*th));
|
||||
th->doff = sizeof(*th) / 4;
|
||||
th->ack = 1;
|
||||
th->psh = push;
|
||||
|
||||
tcp_fill_headers(conn, NULL, ip4h, ip6h, th, &payload,
|
||||
*check, conn->seq_to_tap, no_tcp_csum);
|
||||
/* Report where this frame's IPv4 header checksum is stored, so that the
 * caller can pass it back in for a following frame
 */
if (ip4h)
|
||||
*check = &ip4h->check;
|
||||
}
|
||||
|
||||
/**
|
||||
* tcp_vu_data_from_sock() - Handle new data from socket, queue to vhost-user,
|
||||
* in window
|
||||
* @c: Execution context
|
||||
* @conn: Connection pointer
|
||||
*
|
||||
* Return: negative on connection reset, 0 otherwise
|
||||
*/
|
||||
int tcp_vu_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn)
|
||||
{
|
||||
/* Window advertised from the tap side, with its scaling shift applied */
uint32_t wnd_scaled = conn->wnd_from_tap << conn->ws_from_tap;
|
||||
struct vu_dev *vdev = c->vdev;
|
||||
struct vu_virtq *vq = &vdev->vq[VHOST_USER_RX_QUEUE];
|
||||
ssize_t len, previous_dlen;
|
||||
int i, iov_cnt, head_cnt;
|
||||
size_t hdrlen, fillsize;
|
||||
int v6 = CONN_V6(conn);
|
||||
uint32_t already_sent;
|
||||
const uint16_t *check;
|
||||
|
||||
/* Nothing can be queued until the RX virtqueue is enabled and started */
if (!vu_queue_enabled(vq) || !vu_queue_started(vq)) {
|
||||
debug("Got packet, but RX virtqueue not usable yet");
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Bytes sent to the tap side that it has not acknowledged yet */
already_sent = conn->seq_to_tap - conn->seq_ack_from_tap;
|
||||
|
||||
/* NOTE(review): presumably true when the acknowledged sequence is ahead
 * of what we recorded as sent (see the trace message) -- confirm against
 * SEQ_LT(). Resynchronise on the acknowledged sequence and reset the
 * peek offset; if that fails, reset the connection.
 */
if (SEQ_LT(already_sent, 0)) {
|
||||
/* RFC 761, section 2.1. */
|
||||
flow_trace(conn, "ACK sequence gap: ACK for %u, sent: %u",
|
||||
conn->seq_ack_from_tap, conn->seq_to_tap);
|
||||
conn->seq_to_tap = conn->seq_ack_from_tap;
|
||||
already_sent = 0;
|
||||
if (tcp_set_peek_offset(conn, 0)) {
|
||||
tcp_rst(c, conn);
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
/* Window is zero or already filled by unacknowledged data: nothing can
 * be sent now, flag the connection accordingly and wait
 */
if (!wnd_scaled || already_sent >= wnd_scaled) {
|
||||
conn_flag(c, conn, ACK_FROM_TAP_BLOCKS);
|
||||
conn_flag(c, conn, STALLED);
|
||||
conn_flag(c, conn, ACK_FROM_TAP_DUE);
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Set up buffer descriptors we'll fill completely and partially. */
|
||||
|
||||
/* Room left in the window: the most we may read from the socket now */
fillsize = wnd_scaled - already_sent;
|
||||
|
||||
/* collect the buffers from vhost-user and fill them with the
|
||||
* data from the socket
|
||||
*/
|
||||
len = tcp_vu_sock_recv(c, conn, v6, already_sent, fillsize,
|
||||
&iov_cnt, &head_cnt);
|
||||
/* len is a negative errno on failure: anything other than "would block"
 * resets the connection and is returned to the caller
 */
if (len < 0) {
|
||||
if (len != -EAGAIN && len != -EWOULDBLOCK) {
|
||||
tcp_rst(c, conn);
|
||||
return len;
|
||||
}
|
||||
|
||||
if (already_sent) /* No new data and EAGAIN: set EPOLLET */
|
||||
conn_flag(c, conn, STALLED);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* No new data: with bytes still unacknowledged, just mark the
 * connection as stalled; otherwise, if the socket side sent a FIN that
 * we have not forwarded to the tap side yet, send FIN | ACK now
 */
if (!len) {
|
||||
if (already_sent) {
|
||||
conn_flag(c, conn, STALLED);
|
||||
} else if ((conn->events & (SOCK_FIN_RCVD | TAP_FIN_SENT)) ==
|
||||
SOCK_FIN_RCVD) {
|
||||
int ret = tcp_vu_send_flag(c, conn, FIN | ACK);
|
||||
if (ret) {
|
||||
tcp_rst(c, conn);
|
||||
return ret;
|
||||
}
|
||||
|
||||
conn_event(c, conn, TAP_FIN_SENT);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* NOTE(review): the ~FLAG form presumably clears the flag -- confirm
 * against conn_flag()
 */
conn_flag(c, conn, ~ACK_FROM_TAP_BLOCKS);
|
||||
conn_flag(c, conn, ~STALLED);
|
||||
|
||||
/* Likely, some new data was acked too. */
|
||||
tcp_update_seqack_wnd(c, conn, false, NULL);
|
||||
|
||||
/* initialize headers */
|
||||
/* iov_vu is an array of buffers, and the buffer size can be
|
||||
* smaller than the frame size we want to use. However, with
|
||||
* num_buffer, we can merge several virtio iov buffers in one packet:
|
||||
* we only need to set the packet headers in the first iov, and
|
||||
* num_buffer to the number of iov entries
|
||||
*/
|
||||
|
||||
hdrlen = tcp_vu_hdrlen(v6);
|
||||
/* One iteration per frame: head[i] is the index in elem of the frame's
 * first buffer, and the difference to the next head[] entry gives the
 * number of buffers it spans
 */
for (i = 0, previous_dlen = -1, check = NULL; i < head_cnt; i++) {
|
||||
struct iovec *iov = &elem[head[i]].in_sg[0];
|
||||
int buf_cnt = head[i + 1] - head[i];
|
||||
/* Payload length: size of the frame's buffers minus the headers */
ssize_t dlen = iov_size(iov, buf_cnt) - hdrlen;
|
||||
/* PSH is set on the last frame of this batch only */
bool push = i == head_cnt - 1;
|
||||
|
||||
vu_set_vnethdr(vdev, iov->iov_base, buf_cnt);
|
||||
|
||||
/* The IPv4 header checksum varies only with dlen */
|
||||
if (previous_dlen != dlen)
|
||||
check = NULL;
|
||||
previous_dlen = dlen;
|
||||
|
||||
/* TCP checksum is skipped (no_tcp_csum) unless we're capturing */
tcp_vu_prepare(c, conn, iov, buf_cnt, &check, !*c->pcap, push);
|
||||
|
||||
if (*c->pcap) {
|
||||
/* NOTE(review): last argument is presumably an offset
		 * skipping the virtio-net header -- confirm against
		 * pcap_iov()
		 */
pcap_iov(iov, buf_cnt,
|
||||
sizeof(struct virtio_net_hdr_mrg_rxbuf));
|
||||
}
|
||||
|
||||
conn->seq_to_tap += dlen;
|
||||
}
|
||||
|
||||
/* send packets */
|
||||
vu_flush(vdev, vq, elem, iov_cnt);
|
||||
|
||||
conn_flag(c, conn, ACK_FROM_TAP_DUE);
|
||||
|
||||
return 0;
|
||||
}
|
12
tcp_vu.h
12
tcp_vu.h
|
@ -1,12 +0,0 @@
|
|||
// SPDX-License-Identifier: GPL-2.0-or-later
|
||||
/* Copyright Red Hat
|
||||
* Author: Laurent Vivier <lvivier@redhat.com>
|
||||
*/
|
||||
|
||||
#ifndef TCP_VU_H
|
||||
#define TCP_VU_H
|
||||
|
||||
int tcp_vu_send_flag(const struct ctx *c, struct tcp_tap_conn *conn, int flags);
|
||||
int tcp_vu_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn);
|
||||
|
||||
#endif /*TCP_VU_H */
|
1
test/.gitignore
vendored
1
test/.gitignore
vendored
|
@ -8,6 +8,5 @@ QEMU_EFI.fd
|
|||
*.raw.xz
|
||||
*.bin
|
||||
nstool
|
||||
rampstream
|
||||
guest-key
|
||||
guest-key.pub
|
||||
|
|
|
@ -8,6 +8,7 @@
|
|||
WGET = wget -c
|
||||
|
||||
DEBIAN_IMGS = debian-8.11.0-openstack-amd64.qcow2 \
|
||||
debian-9-nocloud-amd64-daily-20200210-166.qcow2 \
|
||||
debian-10-nocloud-amd64.qcow2 \
|
||||
debian-10-generic-arm64.qcow2 \
|
||||
debian-10-generic-ppc64el-20220911-1135.qcow2 \
|
||||
|
@ -41,7 +42,8 @@ OPENSUSE_IMGS = openSUSE-Leap-15.1-JeOS.x86_64-kvm-and-xen.qcow2 \
|
|||
openSUSE-Leap-15.2-JeOS.x86_64-kvm-and-xen.qcow2 \
|
||||
openSUSE-Leap-15.3-JeOS.x86_64-kvm-and-xen.qcow2 \
|
||||
openSUSE-Tumbleweed-ARM-JeOS-efi.aarch64.raw.xz \
|
||||
openSUSE-Tumbleweed-ARM-JeOS-efi.armv7l.raw.xz
|
||||
openSUSE-Tumbleweed-ARM-JeOS-efi.armv7l.raw.xz \
|
||||
openSUSE-Tumbleweed-JeOS.x86_64-kvm-and-xen.qcow2
|
||||
|
||||
UBUNTU_OLD_IMGS = trusty-server-cloudimg-amd64-disk1.img \
|
||||
trusty-server-cloudimg-i386-disk1.img \
|
||||
|
@ -52,8 +54,7 @@ UBUNTU_IMGS = $(UBUNTU_OLD_IMGS) $(UBUNTU_NEW_IMGS)
|
|||
|
||||
DOWNLOAD_ASSETS = mbuto podman \
|
||||
$(DEBIAN_IMGS) $(FEDORA_IMGS) $(OPENSUSE_IMGS) $(UBUNTU_IMGS)
|
||||
TESTDATA_ASSETS = small.bin big.bin medium.bin \
|
||||
rampstream
|
||||
TESTDATA_ASSETS = small.bin big.bin medium.bin
|
||||
LOCAL_ASSETS = mbuto.img mbuto.mem.img podman/bin/podman QEMU_EFI.fd \
|
||||
$(DEBIAN_IMGS:%=prepared-%) $(FEDORA_IMGS:%=prepared-%) \
|
||||
$(UBUNTU_NEW_IMGS:%=prepared-%) \
|
||||
|
@ -86,7 +87,7 @@ podman/bin/podman: pull-podman
|
|||
guest-key guest-key.pub:
|
||||
ssh-keygen -f guest-key -N ''
|
||||
|
||||
mbuto.img: passt.mbuto mbuto/mbuto guest-key.pub rampstream-check.sh $(TESTDATA_ASSETS)
|
||||
mbuto.img: passt.mbuto mbuto/mbuto guest-key.pub $(TESTDATA_ASSETS)
|
||||
./mbuto/mbuto -p ./$< -c lz4 -f $@
|
||||
|
||||
mbuto.mem.img: passt.mem.mbuto mbuto ../passt.avx2
|
||||
|
@ -134,6 +135,9 @@ realclean: clean
|
|||
debian-8.11.0-openstack-%.qcow2:
|
||||
$(WGET) -O $@ https://cloud.debian.org/images/cloud/OpenStack/archive/8.11.0/debian-8.11.0-openstack-$*.qcow2
|
||||
|
||||
debian-9-nocloud-%-daily-20200210-166.qcow2:
|
||||
$(WGET) -O $@ https://cloud.debian.org/images/cloud/stretch/daily/20200210-166/debian-9-nocloud-$*-daily-20200210-166.qcow2
|
||||
|
||||
debian-10-nocloud-%.qcow2:
|
||||
$(WGET) -O $@ https://cloud.debian.org/images/cloud/buster/latest/debian-10-nocloud-$*.qcow2
|
||||
|
||||
|
@ -199,6 +203,9 @@ openSUSE-Tumbleweed-ARM-JeOS-efi.aarch64.raw.xz:
|
|||
openSUSE-Tumbleweed-ARM-JeOS-efi.armv7l.raw.xz:
|
||||
$(WGET) -O $@ http://download.opensuse.org/ports/armv7hl/tumbleweed/appliances/openSUSE-Tumbleweed-ARM-JeOS-efi.armv7l.raw.xz
|
||||
|
||||
openSUSE-Tumbleweed-JeOS.x86_64-kvm-and-xen.qcow2:
|
||||
$(WGET) -O $@ https://download.opensuse.org/tumbleweed/appliances/openSUSE-Tumbleweed-JeOS.x86_64-kvm-and-xen.qcow2
|
||||
|
||||
# Ubuntu downloads
|
||||
trusty-server-cloudimg-%-disk1.img:
|
||||
$(WGET) -O $@ https://cloud-images.ubuntu.com/trusty/current/trusty-server-cloudimg-$*-disk1.img
|
||||
|
|
|
@ -134,54 +134,6 @@ layout_two_guests() {
|
|||
|
||||
get_info_cols
|
||||
|
||||
pane_watch_contexts ${PANE_GUEST_1} "guest #1 in namespace #1" qemu_1 guest_1
|
||||
pane_watch_contexts ${PANE_GUEST_2} "guest #2 in namespace #1" qemu_2 guest_2
|
||||
|
||||
tmux send-keys -l -t ${PANE_INFO} 'while cat '"$STATEBASE/log_pipe"'; do :; done'
|
||||
tmux send-keys -t ${PANE_INFO} -N 100 C-m
|
||||
tmux select-pane -t ${PANE_INFO} -T "test log"
|
||||
|
||||
pane_watch_contexts ${PANE_HOST} host host
|
||||
pane_watch_contexts ${PANE_PASST_1} "passt #1 in namespace #1" pasta_1 passt_1
|
||||
pane_watch_contexts ${PANE_PASST_2} "passt #2 in namespace #1" pasta_1 passt_2
|
||||
|
||||
info_layout "two guests, two passt instances, in namespaces"
|
||||
|
||||
sleep 1
|
||||
}
|
||||
|
||||
# layout_migrate() - Two guest panes, two passt panes, two passt-repair panes,
|
||||
# plus host and log
|
||||
layout_migrate() {
|
||||
sleep 1
|
||||
|
||||
tmux kill-pane -a -t 0
|
||||
cmd_write 0 clear
|
||||
|
||||
tmux split-window -v -t passt_test
|
||||
tmux split-window -h -l '33%'
|
||||
tmux split-window -h -t passt_test:1.1
|
||||
|
||||
tmux split-window -h -l '35%' -t passt_test:1.0
|
||||
tmux split-window -v -t passt_test:1.0
|
||||
|
||||
tmux split-window -v -t passt_test:1.4
|
||||
tmux split-window -v -t passt_test:1.6
|
||||
|
||||
tmux split-window -v -t passt_test:1.3
|
||||
|
||||
PANE_GUEST_1=0
|
||||
PANE_GUEST_2=1
|
||||
PANE_INFO=2
|
||||
PANE_MON=3
|
||||
PANE_HOST=4
|
||||
PANE_PASST_REPAIR_1=5
|
||||
PANE_PASST_1=6
|
||||
PANE_PASST_REPAIR_2=7
|
||||
PANE_PASST_2=8
|
||||
|
||||
get_info_cols
|
||||
|
||||
pane_watch_contexts ${PANE_GUEST_1} "guest #1 in namespace #1" qemu_1 guest_1
|
||||
pane_watch_contexts ${PANE_GUEST_2} "guest #2 in namespace #2" qemu_2 guest_2
|
||||
|
||||
|
@ -189,16 +141,11 @@ layout_migrate() {
|
|||
tmux send-keys -t ${PANE_INFO} -N 100 C-m
|
||||
tmux select-pane -t ${PANE_INFO} -T "test log"
|
||||
|
||||
pane_watch_contexts ${PANE_MON} "QEMU monitor" mon mon
|
||||
|
||||
pane_watch_contexts ${PANE_HOST} host host
|
||||
pane_watch_contexts ${PANE_PASST_REPAIR_1} "passt-repair #1 in namespace #1" repair_1 passt_repair_1
|
||||
pane_watch_contexts ${PANE_PASST_1} "passt #1 in namespace #1" pasta_1 passt_1
|
||||
|
||||
pane_watch_contexts ${PANE_PASST_REPAIR_2} "passt-repair #2 in namespace #2" repair_2 passt_repair_2
|
||||
pane_watch_contexts ${PANE_PASST_2} "passt #2 in namespace #2" pasta_2 passt_2
|
||||
|
||||
info_layout "two guests, two passt + passt-repair instances, in namespaces"
|
||||
info_layout "two guests, two passt instances, in namespaces"
|
||||
|
||||
sleep 1
|
||||
}
|
||||
|
|
|
@ -49,21 +49,6 @@ td:empty { visibility: hidden; }
|
|||
__passt_tcp_LINE__ __passt_udp_LINE__
|
||||
</table>
|
||||
|
||||
</li><li><p>passt with vhost-user support</p>
|
||||
<table class="passt" width="70%">
|
||||
<tr>
|
||||
<th/>
|
||||
<th id="perf_passt_vu_tcp" colspan="__passt_vu_tcp_cols__">TCP, __passt_vu_tcp_threads__ at __passt_vu_tcp_freq__ GHz</th>
|
||||
<th id="perf_passt_vu_udp" colspan="__passt_vu_udp_cols__">UDP, __passt_vu_udp_threads__ at __passt_vu_udp_freq__ GHz</th>
|
||||
</tr>
|
||||
<tr>
|
||||
<td align="right">MTU:</td>
|
||||
__passt_vu_tcp_header__
|
||||
__passt_vu_udp_header__
|
||||
</tr>
|
||||
__passt_vu_tcp_LINE__ __passt_vu_udp_LINE__
|
||||
</table>
|
||||
|
||||
<style type="text/CSS">
|
||||
table.pasta_local td { border: 0px solid; padding: 6px; line-height: 1; }
|
||||
table.pasta_local td { text-align: right; }
|
||||
|
|
237
test/lib/setup
237
test/lib/setup
|
@ -15,7 +15,8 @@
|
|||
|
||||
INITRAMFS="${BASEPATH}/mbuto.img"
|
||||
VCPUS="$( [ $(nproc) -ge 8 ] && echo 6 || echo $(( $(nproc) / 2 + 1 )) )"
|
||||
MEM_KIB="$(sed -n 's/MemTotal:[ ]*\([0-9]*\) kB/\1/p' /proc/meminfo)"
|
||||
__mem_kib="$(sed -n 's/MemTotal:[ ]*\([0-9]*\) kB/\1/p' /proc/meminfo)"
|
||||
VMEM="$((${__mem_kib} / 1024 / 4))"
|
||||
QEMU_ARCH="$(uname -m)"
|
||||
[ "${QEMU_ARCH}" = "i686" ] && QEMU_ARCH=i386
|
||||
|
||||
|
@ -45,38 +46,24 @@ setup_passt() {
|
|||
[ ${PCAP} -eq 1 ] && __opts="${__opts} -p ${LOGDIR}/passt.pcap"
|
||||
[ ${DEBUG} -eq 1 ] && __opts="${__opts} -d"
|
||||
[ ${TRACE} -eq 1 ] && __opts="${__opts} --trace"
|
||||
[ ${VHOST_USER} -eq 1 ] && __opts="${__opts} --vhost-user"
|
||||
|
||||
context_run passt "make clean"
|
||||
context_run passt "make valgrind"
|
||||
context_run_bg passt "valgrind --max-stackframe=$((4 * 1024 * 1024)) --trace-children=yes --vgdb=no --error-exitcode=1 --suppressions=test/valgrind.supp ./passt ${__opts} -s ${STATESETUP}/passt.socket -f -t 10001 -u 10001 -H hostname1 --fqdn fqdn1.passt.test -P ${STATESETUP}/passt.pid"
|
||||
context_run_bg passt "valgrind --max-stackframe=$((4 * 1024 * 1024)) --trace-children=yes --vgdb=no --error-exitcode=1 --suppressions=test/valgrind.supp ./passt ${__opts} -s ${STATESETUP}/passt.socket -f -t 10001 -u 10001 -P ${STATESETUP}/passt.pid"
|
||||
|
||||
# pidfile isn't created until passt is listening
|
||||
wait_for [ -f "${STATESETUP}/passt.pid" ]
|
||||
|
||||
__vmem="$((${MEM_KIB} / 1024 / 4))"
|
||||
if [ ${VHOST_USER} -eq 1 ]; then
|
||||
__vmem="$(((${__vmem} + 500) / 1000))G"
|
||||
__qemu_netdev=" \
|
||||
-chardev socket,id=c,path=${STATESETUP}/passt.socket \
|
||||
-netdev vhost-user,id=v,chardev=c \
|
||||
-device virtio-net,netdev=v \
|
||||
-object memory-backend-memfd,id=m,share=on,size=${__vmem} \
|
||||
-numa node,memdev=m"
|
||||
else
|
||||
__qemu_netdev="-device virtio-net-pci,netdev=s \
|
||||
-netdev stream,id=s,server=off,addr.type=unix,addr.path=${STATESETUP}/passt.socket"
|
||||
fi
|
||||
|
||||
GUEST_CID=94557
|
||||
context_run_bg qemu 'qemu-system-'"${QEMU_ARCH}" \
|
||||
' -machine accel=kvm' \
|
||||
' -m '${__vmem}' -cpu host -smp '${VCPUS} \
|
||||
' -kernel '"${KERNEL}" \
|
||||
' -m '${VMEM}' -cpu host -smp '${VCPUS} \
|
||||
' -kernel ' "/boot/vmlinuz-$(uname -r)" \
|
||||
' -initrd '${INITRAMFS}' -nographic -serial stdio' \
|
||||
' -nodefaults' \
|
||||
' -append "console=ttyS0 mitigations=off apparmor=0" ' \
|
||||
" ${__qemu_netdev}" \
|
||||
' -device virtio-net-pci,netdev=s0 ' \
|
||||
" -netdev stream,id=s0,server=off,addr.type=unix,addr.path=${STATESETUP}/passt.socket " \
|
||||
" -pidfile ${STATESETUP}/qemu.pid" \
|
||||
" -device vhost-vsock-pci,guest-cid=$GUEST_CID"
|
||||
|
||||
|
@ -155,43 +142,29 @@ setup_passt_in_ns() {
|
|||
[ ${PCAP} -eq 1 ] && __opts="${__opts} -p ${LOGDIR}/passt_in_pasta.pcap"
|
||||
[ ${DEBUG} -eq 1 ] && __opts="${__opts} -d"
|
||||
[ ${TRACE} -eq 1 ] && __opts="${__opts} --trace"
|
||||
[ ${VHOST_USER} -eq 1 ] && __opts="${__opts} --vhost-user"
|
||||
|
||||
if [ ${VALGRIND} -eq 1 ]; then
|
||||
context_run passt "make clean"
|
||||
context_run passt "make valgrind"
|
||||
context_run_bg passt "valgrind --max-stackframe=$((4 * 1024 * 1024)) --trace-children=yes --vgdb=no --error-exitcode=1 --suppressions=test/valgrind.supp ./passt -f ${__opts} -s ${STATESETUP}/passt.socket -H hostname1 --fqdn fqdn1.passt.test -t 10001,10011,10021,10031 -u 10001,10011,10021,10031 -P ${STATESETUP}/passt.pid --map-host-loopback ${__map_ns4} --map-host-loopback ${__map_ns6}"
|
||||
context_run_bg passt "valgrind --max-stackframe=$((4 * 1024 * 1024)) --trace-children=yes --vgdb=no --error-exitcode=1 --suppressions=test/valgrind.supp ./passt -f ${__opts} -s ${STATESETUP}/passt.socket -t 10001,10011,10021,10031 -u 10001,10011,10021,10031 -P ${STATESETUP}/passt.pid --map-host-loopback ${__map_ns4} --map-host-loopback ${__map_ns6}"
|
||||
else
|
||||
context_run passt "make clean"
|
||||
context_run passt "make"
|
||||
context_run_bg passt "./passt -f ${__opts} -s ${STATESETUP}/passt.socket -H hostname1 --fqdn fqdn1.passt.test -t 10001,10011,10021,10031 -u 10001,10011,10021,10031 -P ${STATESETUP}/passt.pid --map-host-loopback ${__map_ns4} --map-host-loopback ${__map_ns6}"
|
||||
context_run_bg passt "./passt -f ${__opts} -s ${STATESETUP}/passt.socket -t 10001,10011,10021,10031 -u 10001,10011,10021,10031 -P ${STATESETUP}/passt.pid --map-host-loopback ${__map_ns4} --map-host-loopback ${__map_ns6}"
|
||||
fi
|
||||
wait_for [ -f "${STATESETUP}/passt.pid" ]
|
||||
|
||||
__vmem="$((${MEM_KIB} / 1024 / 4))"
|
||||
if [ ${VHOST_USER} -eq 1 ]; then
|
||||
__vmem="$(((${__vmem} + 500) / 1000))G"
|
||||
__qemu_netdev=" \
|
||||
-chardev socket,id=c,path=${STATESETUP}/passt.socket \
|
||||
-netdev vhost-user,id=v,chardev=c \
|
||||
-device virtio-net,netdev=v \
|
||||
-object memory-backend-memfd,id=m,share=on,size=${__vmem} \
|
||||
-numa node,memdev=m"
|
||||
else
|
||||
__qemu_netdev="-device virtio-net-pci,netdev=s \
|
||||
-netdev stream,id=s,server=off,addr.type=unix,addr.path=${STATESETUP}/passt.socket"
|
||||
fi
|
||||
|
||||
GUEST_CID=94557
|
||||
context_run_bg qemu 'qemu-system-'"${QEMU_ARCH}" \
|
||||
' -machine accel=kvm' \
|
||||
' -M accel=kvm:tcg' \
|
||||
' -m '${__vmem}' -cpu host -smp '${VCPUS} \
|
||||
' -kernel '"${KERNEL}" \
|
||||
' -m '${VMEM}' -cpu host -smp '${VCPUS} \
|
||||
' -kernel ' "/boot/vmlinuz-$(uname -r)" \
|
||||
' -initrd '${INITRAMFS}' -nographic -serial stdio' \
|
||||
' -nodefaults' \
|
||||
' -append "console=ttyS0 mitigations=off apparmor=0" ' \
|
||||
" ${__qemu_netdev}" \
|
||||
' -device virtio-net-pci,netdev=s0 ' \
|
||||
" -netdev stream,id=s0,server=off,addr.type=unix,addr.path=${STATESETUP}/passt.socket " \
|
||||
" -pidfile ${STATESETUP}/qemu.pid" \
|
||||
" -device vhost-vsock-pci,guest-cid=$GUEST_CID"
|
||||
|
||||
|
@ -241,126 +214,11 @@ setup_two_guests() {
|
|||
[ ${PCAP} -eq 1 ] && __opts="${__opts} -p ${LOGDIR}/passt_1.pcap"
|
||||
[ ${DEBUG} -eq 1 ] && __opts="${__opts} -d"
|
||||
[ ${TRACE} -eq 1 ] && __opts="${__opts} --trace"
|
||||
[ ${VHOST_USER} -eq 1 ] && __opts="${__opts} --vhost-user"
|
||||
|
||||
context_run_bg passt_1 "./passt -s ${STATESETUP}/passt_1.socket -P ${STATESETUP}/passt_1.pid -f ${__opts} --fqdn fqdn1.passt.test -H hostname1 -t 10001 -u 10001"
|
||||
wait_for [ -f "${STATESETUP}/passt_1.pid" ]
|
||||
|
||||
__opts=
|
||||
[ ${PCAP} -eq 1 ] && __opts="${__opts} -p ${LOGDIR}/passt_2.pcap"
|
||||
[ ${DEBUG} -eq 1 ] && __opts="${__opts} -d"
|
||||
[ ${TRACE} -eq 1 ] && __opts="${__opts} --trace"
|
||||
[ ${VHOST_USER} -eq 1 ] && __opts="${__opts} --vhost-user"
|
||||
|
||||
context_run_bg passt_2 "./passt -s ${STATESETUP}/passt_2.socket -P ${STATESETUP}/passt_2.pid -f ${__opts} --hostname hostname2 --fqdn fqdn2 -t 10004 -u 10004"
|
||||
wait_for [ -f "${STATESETUP}/passt_2.pid" ]
|
||||
|
||||
__vmem="$((${MEM_KIB} / 1024 / 4))"
|
||||
if [ ${VHOST_USER} -eq 1 ]; then
|
||||
__vmem="$(((${__vmem} + 500) / 1000))G"
|
||||
__qemu_netdev1=" \
|
||||
-chardev socket,id=c,path=${STATESETUP}/passt_1.socket \
|
||||
-netdev vhost-user,id=v,chardev=c \
|
||||
-device virtio-net,netdev=v \
|
||||
-object memory-backend-memfd,id=m,share=on,size=${__vmem} \
|
||||
-numa node,memdev=m"
|
||||
__qemu_netdev2=" \
|
||||
-chardev socket,id=c,path=${STATESETUP}/passt_2.socket \
|
||||
-netdev vhost-user,id=v,chardev=c \
|
||||
-device virtio-net,netdev=v \
|
||||
-object memory-backend-memfd,id=m,share=on,size=${__vmem} \
|
||||
-numa node,memdev=m"
|
||||
else
|
||||
__qemu_netdev1="-device virtio-net-pci,netdev=s \
|
||||
-netdev stream,id=s,server=off,addr.type=unix,addr.path=${STATESETUP}/passt_1.socket"
|
||||
__qemu_netdev2="-device virtio-net-pci,netdev=s \
|
||||
-netdev stream,id=s,server=off,addr.type=unix,addr.path=${STATESETUP}/passt_2.socket"
|
||||
fi
|
||||
|
||||
GUEST_1_CID=94557
|
||||
context_run_bg qemu_1 'qemu-system-'"${QEMU_ARCH}" \
|
||||
' -M accel=kvm:tcg' \
|
||||
' -m '${__vmem}' -cpu host -smp '${VCPUS} \
|
||||
' -kernel '"${KERNEL}" \
|
||||
' -initrd '${INITRAMFS}' -nographic -serial stdio' \
|
||||
' -nodefaults' \
|
||||
' -append "console=ttyS0 mitigations=off apparmor=0" ' \
|
||||
" ${__qemu_netdev1}" \
|
||||
" -pidfile ${STATESETUP}/qemu_1.pid" \
|
||||
" -device vhost-vsock-pci,guest-cid=$GUEST_1_CID"
|
||||
|
||||
GUEST_2_CID=94558
|
||||
context_run_bg qemu_2 'qemu-system-'"${QEMU_ARCH}" \
|
||||
' -M accel=kvm:tcg' \
|
||||
' -m '${__vmem}' -cpu host -smp '${VCPUS} \
|
||||
' -kernel '"${KERNEL}" \
|
||||
' -initrd '${INITRAMFS}' -nographic -serial stdio' \
|
||||
' -nodefaults' \
|
||||
' -append "console=ttyS0 mitigations=off apparmor=0" ' \
|
||||
" ${__qemu_netdev2}" \
|
||||
" -pidfile ${STATESETUP}/qemu_2.pid" \
|
||||
" -device vhost-vsock-pci,guest-cid=$GUEST_2_CID"
|
||||
|
||||
context_setup_guest guest_1 ${GUEST_1_CID}
|
||||
context_setup_guest guest_2 ${GUEST_2_CID}
|
||||
}
|
||||
|
||||
# setup_migrate() - Set up two namespace, run qemu, passt/passt-repair in both
|
||||
setup_migrate() {
|
||||
context_setup_host host
|
||||
context_setup_host mon
|
||||
context_setup_host pasta_1
|
||||
context_setup_host pasta_2
|
||||
|
||||
layout_migrate
|
||||
|
||||
# Ports:
|
||||
#
|
||||
# guest #1 | guest #2 | ns #1 | host
|
||||
# --------- |-----------|-----------|------------
|
||||
# 10001 as server | | to guest | to ns #1
|
||||
# 10002 | | as server | to ns #1
|
||||
# 10003 | | to init | as server
|
||||
# 10004 | as server | to guest | to ns #1
|
||||
|
||||
__opts=
|
||||
[ ${PCAP} -eq 1 ] && __opts="${__opts} -p ${LOGDIR}/pasta_1.pcap"
|
||||
[ ${DEBUG} -eq 1 ] && __opts="${__opts} -d"
|
||||
[ ${TRACE} -eq 1 ] && __opts="${__opts} --trace"
|
||||
|
||||
__map_host4=192.0.2.1
|
||||
__map_host6=2001:db8:9a55::1
|
||||
__map_ns4=192.0.2.2
|
||||
__map_ns6=2001:db8:9a55::2
|
||||
|
||||
# Option 1: send stuff via spliced path in pasta
|
||||
# context_run_bg pasta_1 "./pasta ${__opts} -P ${STATESETUP}/pasta_1.pid -t 10001,10002 -T 10003 -u 10001,10002 -U 10003 --config-net ${NSTOOL} hold ${STATESETUP}/ns1.hold"
|
||||
# Option 2: send stuff via tap (--map-guest-addr) instead (useful to see capture of full migration)
|
||||
context_run_bg pasta_1 "./pasta ${__opts} -P ${STATESETUP}/pasta_1.pid -t 10001,10002,10004 -T 10003 -u 10001,10002,10004 -U 10003 --map-guest-addr ${__map_host4} --map-guest-addr ${__map_host6} --config-net ${NSTOOL} hold ${STATESETUP}/ns1.hold"
|
||||
context_setup_nstool passt_1 ${STATESETUP}/ns1.hold
|
||||
context_setup_nstool passt_repair_1 ${STATESETUP}/ns1.hold
|
||||
|
||||
context_setup_nstool passt_2 ${STATESETUP}/ns1.hold
|
||||
context_setup_nstool passt_repair_2 ${STATESETUP}/ns1.hold
|
||||
|
||||
context_setup_nstool qemu_1 ${STATESETUP}/ns1.hold
|
||||
context_setup_nstool qemu_2 ${STATESETUP}/ns1.hold
|
||||
|
||||
__ifname="$(context_run qemu_1 "ip -j link show | jq -rM '.[] | select(.link_type == \"ether\").ifname'")"
|
||||
|
||||
sleep 1
|
||||
|
||||
__opts="--vhost-user"
|
||||
[ ${PCAP} -eq 1 ] && __opts="${__opts} -p ${LOGDIR}/passt_1.pcap"
|
||||
[ ${DEBUG} -eq 1 ] && __opts="${__opts} -d"
|
||||
[ ${TRACE} -eq 1 ] && __opts="${__opts} --trace"
|
||||
|
||||
context_run_bg passt_1 "./passt -s ${STATESETUP}/passt_1.socket -P ${STATESETUP}/passt_1.pid -f ${__opts} -t 10001 -u 10001"
|
||||
wait_for [ -f "${STATESETUP}/passt_1.pid" ]
|
||||
|
||||
context_run_bg passt_repair_1 "./passt-repair ${STATESETUP}/passt_1.socket.repair"
|
||||
|
||||
__opts="--vhost-user"
|
||||
__opts=
|
||||
[ ${PCAP} -eq 1 ] && __opts="${__opts} -p ${LOGDIR}/passt_2.pcap"
|
||||
[ ${DEBUG} -eq 1 ] && __opts="${__opts} -d"
|
||||
[ ${TRACE} -eq 1 ] && __opts="${__opts} --trace"
|
||||
|
@ -368,52 +226,34 @@ setup_migrate() {
|
|||
context_run_bg passt_2 "./passt -s ${STATESETUP}/passt_2.socket -P ${STATESETUP}/passt_2.pid -f ${__opts} -t 10004 -u 10004"
|
||||
wait_for [ -f "${STATESETUP}/passt_2.pid" ]
|
||||
|
||||
context_run_bg passt_repair_2 "./passt-repair ${STATESETUP}/passt_2.socket.repair"
|
||||
|
||||
__vmem="512M" # Keep migration fast
|
||||
__qemu_netdev1=" \
|
||||
-chardev socket,id=c,path=${STATESETUP}/passt_1.socket \
|
||||
-netdev vhost-user,id=v,chardev=c \
|
||||
-device virtio-net,netdev=v \
|
||||
-object memory-backend-memfd,id=m,share=on,size=${__vmem} \
|
||||
-numa node,memdev=m"
|
||||
__qemu_netdev2=" \
|
||||
-chardev socket,id=c,path=${STATESETUP}/passt_2.socket \
|
||||
-netdev vhost-user,id=v,chardev=c \
|
||||
-device virtio-net,netdev=v \
|
||||
-object memory-backend-memfd,id=m,share=on,size=${__vmem} \
|
||||
-numa node,memdev=m"
|
||||
|
||||
GUEST_1_CID=94557
|
||||
context_run_bg qemu_1 'qemu-system-'"${QEMU_ARCH}" \
|
||||
' -M accel=kvm:tcg' \
|
||||
' -m '${__vmem}' -cpu host -smp '${VCPUS} \
|
||||
' -kernel '"${KERNEL}" \
|
||||
' -m '${VMEM}' -cpu host -smp '${VCPUS} \
|
||||
' -kernel ' "/boot/vmlinuz-$(uname -r)" \
|
||||
' -initrd '${INITRAMFS}' -nographic -serial stdio' \
|
||||
' -nodefaults' \
|
||||
' -append "console=ttyS0 mitigations=off apparmor=0" ' \
|
||||
" ${__qemu_netdev1}" \
|
||||
' -device virtio-net-pci,netdev=s0 ' \
|
||||
" -netdev stream,id=s0,server=off,addr.type=unix,addr.path=${STATESETUP}/passt_1.socket " \
|
||||
" -pidfile ${STATESETUP}/qemu_1.pid" \
|
||||
" -device vhost-vsock-pci,guest-cid=$GUEST_1_CID" \
|
||||
" -monitor unix:${STATESETUP}/qemu_1_mon.sock,server,nowait"
|
||||
" -device vhost-vsock-pci,guest-cid=$GUEST_1_CID"
|
||||
|
||||
GUEST_2_CID=94558
|
||||
context_run_bg qemu_2 'qemu-system-'"${QEMU_ARCH}" \
|
||||
' -M accel=kvm:tcg' \
|
||||
' -m '${__vmem}' -cpu host -smp '${VCPUS} \
|
||||
' -kernel '"${KERNEL}" \
|
||||
' -m '${VMEM}' -cpu host -smp '${VCPUS} \
|
||||
' -kernel ' "/boot/vmlinuz-$(uname -r)" \
|
||||
' -initrd '${INITRAMFS}' -nographic -serial stdio' \
|
||||
' -nodefaults' \
|
||||
' -append "console=ttyS0 mitigations=off apparmor=0" ' \
|
||||
" ${__qemu_netdev2}" \
|
||||
' -device virtio-net-pci,netdev=s0 ' \
|
||||
" -netdev stream,id=s0,server=off,addr.type=unix,addr.path=${STATESETUP}/passt_2.socket " \
|
||||
" -pidfile ${STATESETUP}/qemu_2.pid" \
|
||||
" -device vhost-vsock-pci,guest-cid=$GUEST_2_CID" \
|
||||
" -monitor unix:${STATESETUP}/qemu_2_mon.sock,server,nowait" \
|
||||
" -incoming tcp:0:20005"
|
||||
" -device vhost-vsock-pci,guest-cid=$GUEST_2_CID"
|
||||
|
||||
context_setup_guest guest_1 ${GUEST_1_CID}
|
||||
# Only available after migration:
|
||||
( context_setup_guest guest_2 ${GUEST_2_CID} & )
|
||||
context_setup_guest guest_2 ${GUEST_2_CID}
|
||||
}
|
||||
|
||||
# teardown_context_watch() - Remove contexts and stop panes watching them
|
||||
|
@ -486,8 +326,7 @@ teardown_two_guests() {
|
|||
context_wait pasta_1
|
||||
context_wait pasta_2
|
||||
|
||||
rm "${STATESETUP}/passt_1.pid" "${STATESETUP}/passt_2.pid"
|
||||
rm "${STATESETUP}/pasta_1.pid" "${STATESETUP}/pasta_2.pid"
|
||||
rm -f "${STATESETUP}/passt__[12].pid" "${STATESETUP}/pasta_[12].pid"
|
||||
|
||||
teardown_context_watch ${PANE_HOST} host
|
||||
teardown_context_watch ${PANE_GUEST_1} qemu_1 guest_1
|
||||
|
@ -496,30 +335,6 @@ teardown_two_guests() {
|
|||
teardown_context_watch ${PANE_PASST_2} pasta_2 passt_2
|
||||
}
|
||||
|
||||
# teardown_migrate() - Exit namespaces, kill qemu processes, passt and pasta
|
||||
teardown_migrate() {
|
||||
${NSTOOL} exec ${STATESETUP}/ns1.hold -- kill $(cat "${STATESETUP}/qemu_1.pid")
|
||||
${NSTOOL} exec ${STATESETUP}/ns1.hold -- kill $(cat "${STATESETUP}/qemu_2.pid")
|
||||
context_wait qemu_1
|
||||
context_wait qemu_2
|
||||
|
||||
${NSTOOL} exec ${STATESETUP}/ns1.hold -- kill $(cat "${STATESETUP}/passt_2.pid")
|
||||
context_wait passt_1
|
||||
context_wait passt_2
|
||||
${NSTOOL} stop "${STATESETUP}/ns1.hold"
|
||||
context_wait pasta_1
|
||||
|
||||
rm -f "${STATESETUP}/passt_1.pid" "${STATESETUP}/passt_2.pid"
|
||||
rm -f "${STATESETUP}/pasta_1.pid" "${STATESETUP}/pasta_2.pid"
|
||||
|
||||
teardown_context_watch ${PANE_HOST} host
|
||||
|
||||
teardown_context_watch ${PANE_GUEST_1} qemu_1 guest_1
|
||||
teardown_context_watch ${PANE_GUEST_2} qemu_2 guest_2
|
||||
teardown_context_watch ${PANE_PASST_1} pasta_1 passt_1
|
||||
teardown_context_watch ${PANE_PASST_2} pasta_1 passt_2
|
||||
}
|
||||
|
||||
# teardown_demo_passt() - Exit namespace, kill qemu, passt and pasta
|
||||
teardown_demo_passt() {
|
||||
tmux send-keys -t ${PANE_GUEST} "C-c"
|
||||
|
|
|
@ -33,7 +33,7 @@ setup_memory() {
|
|||
|
||||
pane_or_context_run guest 'qemu-system-$(uname -m)' \
|
||||
' -machine accel=kvm' \
|
||||
' -m '$((${MEM_KIB} / 1024 / 4))' -cpu host -smp '${VCPUS} \
|
||||
' -m '${VMEM}' -cpu host -smp '${VCPUS} \
|
||||
' -kernel ' "/boot/vmlinuz-$(uname -r)" \
|
||||
' -initrd '${INITRAMFS_MEM}' -nographic -serial stdio' \
|
||||
' -nodefaults' \
|
||||
|
|
|
@ -19,7 +19,6 @@ STATUS_FILE_INDEX=0
|
|||
STATUS_COLS=
|
||||
STATUS_PASS=0
|
||||
STATUS_FAIL=0
|
||||
STATUS_SKIPPED=0
|
||||
|
||||
PR_RED='\033[1;31m'
|
||||
PR_GREEN='\033[1;32m'
|
||||
|
@ -32,8 +31,8 @@ PR_DELAY_INIT=100 # ms
|
|||
# $@: Message to print
|
||||
info() {
|
||||
tmux select-pane -t ${PANE_INFO}
|
||||
printf "${@}\n" >> $STATEBASE/log_pipe
|
||||
printf "${@}\n" >> "${LOGFILE}"
|
||||
echo "${@}" >> $STATEBASE/log_pipe
|
||||
echo "${@}" >> "${LOGFILE}"
|
||||
}
|
||||
|
||||
# info_n() - Highlight, print message to pane and to log file without newline
|
||||
|
@ -48,13 +47,13 @@ info_n() {
|
|||
# $@: Message to print
|
||||
info_nolog() {
|
||||
tmux select-pane -t ${PANE_INFO}
|
||||
printf "${@}\n" >> $STATEBASE/log_pipe
|
||||
echo "${@}" >> $STATEBASE/log_pipe
|
||||
}
|
||||
|
||||
# log() - Print message to log file
|
||||
# $@: Message to print
|
||||
log() {
|
||||
printf "${@}\n" >> "${LOGFILE}"
|
||||
echo "${@}" >> "${LOGFILE}"
|
||||
}
|
||||
|
||||
# info_nolog_n() - Send message to pane without highlighting it, without newline
|
||||
|
@ -440,21 +439,19 @@ info_layout() {
|
|||
# status_test_ok() - Update counter of passed tests, log and display message
|
||||
status_test_ok() {
|
||||
STATUS_PASS=$((STATUS_PASS + 1))
|
||||
tmux set status-right "PASS: ${STATUS_PASS} | FAIL: ${STATUS_FAIL} | SKIPPED: ${STATUS_SKIPPED} | #(TZ="UTC" date -Iseconds)"
|
||||
tmux set status-right "PASS: ${STATUS_PASS} | FAIL: ${STATUS_FAIL} | #(TZ="UTC" date -Iseconds)"
|
||||
info_passed
|
||||
}
|
||||
|
||||
# status_test_fail() - Update counter of failed tests, log and display message
|
||||
status_test_fail() {
|
||||
STATUS_FAIL=$((STATUS_FAIL + 1))
|
||||
tmux set status-right "PASS: ${STATUS_PASS} | FAIL: ${STATUS_FAIL} | SKIPPED: ${STATUS_SKIPPED} | #(TZ="UTC" date -Iseconds)"
|
||||
tmux set status-right "PASS: ${STATUS_PASS} | FAIL: ${STATUS_FAIL} | #(TZ="UTC" date -Iseconds)"
|
||||
info_failed
|
||||
}
|
||||
|
||||
# status_test_skip() - Update counter of skipped tests, log and display message
|
||||
status_test_skip() {
|
||||
STATUS_SKIPPED=$((STATUS_SKIPPED + 1))
|
||||
tmux set status-right "PASS: ${STATUS_PASS} | FAIL: ${STATUS_FAIL} | SKIPPED: ${STATUS_SKIPPED} | #(TZ="UTC" date -Iseconds)"
|
||||
info_skipped
|
||||
}
|
||||
|
||||
|
@ -667,7 +664,7 @@ pause_continue() {
|
|||
|
||||
# run_term() - Start tmux session, running entry point, with recording if needed
|
||||
run_term() {
|
||||
TMUX="tmux new-session -s passt_test -eSTATEBASE=$STATEBASE -ePCAP=$PCAP -eDEBUG=$DEBUG -eTRACE=$TRACE -eKERNEL=$KERNEL"
|
||||
TMUX="tmux new-session -s passt_test -eSTATEBASE=$STATEBASE -ePCAP=$PCAP -eDEBUG=$DEBUG"
|
||||
|
||||
if [ ${CI} -eq 1 ]; then
|
||||
printf '\e[8;50;240t'
|
||||
|
|
|
@ -20,7 +20,10 @@ test_iperf3s() {
|
|||
__sctx="${1}"
|
||||
__port="${2}"
|
||||
|
||||
pane_or_context_run "${__sctx}" 'iperf3 -s -p'${__port}' -D -I s.pid'
|
||||
pane_or_context_run_bg "${__sctx}" \
|
||||
'iperf3 -s -p'${__port}' & echo $! > s.pid' \
|
||||
|
||||
sleep 1 # Wait for server to be ready
|
||||
}
|
||||
|
||||
# test_iperf3k() - Kill iperf3 server
|
||||
|
@ -28,7 +31,7 @@ test_iperf3s() {
|
|||
test_iperf3k() {
|
||||
__sctx="${1}"
|
||||
|
||||
pane_or_context_run "${__sctx}" 'kill -INT $(cat s.pid)'
|
||||
pane_or_context_run "${__sctx}" 'kill -INT $(cat s.pid); rm s.pid'
|
||||
|
||||
sleep 1 # Wait for kernel to free up ports
|
||||
}
|
||||
|
@ -65,45 +68,6 @@ test_iperf3() {
|
|||
TEST_ONE_subs="$(list_add_pair "${TEST_ONE_subs}" "__${__var}__" "${__bw}" )"
|
||||
}
|
||||
|
||||
# test_iperf3m() - Ugly helper for iperf3 directive, guest migration variant
|
||||
# $1: Variable name to put the measured bandwidth into
|
||||
# $2: Initial source/client context
|
||||
# $3: Second source/client context the guest is moving to
|
||||
# $4: Destination name or address for client
|
||||
# $5: Port number, ${i} is translated to process index
|
||||
# $6: Run time, in seconds
|
||||
# $7: Client options
|
||||
test_iperf3m() {
|
||||
__var="${1}"; shift
|
||||
__cctx="${1}"; shift
|
||||
__cctx2="${1}"; shift
|
||||
__dest="${1}"; shift
|
||||
__port="${1}"; shift
|
||||
__time="${1}"; shift
|
||||
|
||||
pane_or_context_run "${__cctx}" 'rm -f c.json'
|
||||
|
||||
# A 1s wait for connection on what's basically a local link
|
||||
# indicates something is pretty wrong
|
||||
__timeout=1000
|
||||
pane_or_context_run_bg "${__cctx}" \
|
||||
'iperf3 -J -c '${__dest}' -p '${__port} \
|
||||
' --connect-timeout '${__timeout} \
|
||||
' -t'${__time}' -i0 '"${@}"' > c.json' \
|
||||
|
||||
__jval=".end.sum_received.bits_per_second"
|
||||
|
||||
sleep $((${__time} + 3))
|
||||
|
||||
pane_or_context_output "${__cctx2}" \
|
||||
'cat c.json'
|
||||
|
||||
__bw=$(pane_or_context_output "${__cctx2}" \
|
||||
'cat c.json | jq -rMs "map('${__jval}') | add"')
|
||||
|
||||
TEST_ONE_subs="$(list_add_pair "${TEST_ONE_subs}" "__${__var}__" "${__bw}" )"
|
||||
}
|
||||
|
||||
test_one_line() {
|
||||
__line="${1}"
|
||||
|
||||
|
@ -213,12 +177,6 @@ test_one_line() {
|
|||
"guest2w")
|
||||
pane_or_context_wait guest_2 || TEST_ONE_nok=1
|
||||
;;
|
||||
"mon")
|
||||
pane_or_context_run mon "${__arg}" || TEST_ONE_nok=1
|
||||
;;
|
||||
"monb")
|
||||
pane_or_context_run_bg mon "${__arg}"
|
||||
;;
|
||||
"ns")
|
||||
pane_or_context_run ns "${__arg}" || TEST_ONE_nok=1
|
||||
;;
|
||||
|
@ -334,9 +292,6 @@ test_one_line() {
|
|||
"iperf3")
|
||||
test_iperf3 ${__arg}
|
||||
;;
|
||||
"iperf3m")
|
||||
test_iperf3m ${__arg}
|
||||
;;
|
||||
"set")
|
||||
TEST_ONE_subs="$(list_add_pair "${TEST_ONE_subs}" "__${__arg%% *}__" "${__arg#* }")"
|
||||
;;
|
||||
|
|
|
@ -1,59 +0,0 @@
|
|||
# SPDX-License-Identifier: GPL-2.0-or-later
|
||||
#
|
||||
# PASST - Plug A Simple Socket Transport
|
||||
# for qemu/UNIX domain socket mode
|
||||
#
|
||||
# PASTA - Pack A Subtle Tap Abstraction
|
||||
# for network namespace/tap device mode
|
||||
#
|
||||
# test/migrate/basic - Check basic migration functionality
|
||||
#
|
||||
# Copyright (c) 2025 Red Hat GmbH
|
||||
# Author: Stefano Brivio <sbrivio@redhat.com>
|
||||
|
||||
g1tools ip jq dhclient socat cat
|
||||
htools ip jq
|
||||
|
||||
set MAP_HOST4 192.0.2.1
|
||||
set MAP_HOST6 2001:db8:9a55::1
|
||||
set MAP_NS4 192.0.2.2
|
||||
set MAP_NS6 2001:db8:9a55::2
|
||||
|
||||
test Interface name
|
||||
g1out IFNAME1 ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname'
|
||||
hout HOST_IFNAME ip -j -4 route show|jq -rM '[.[] | select(.dst == "default").dev] | .[0]'
|
||||
hout HOST_IFNAME6 ip -j -6 route show|jq -rM '[.[] | select(.dst == "default").dev] | .[0]'
|
||||
check [ -n "__IFNAME1__" ]
|
||||
|
||||
test DHCP: address
|
||||
guest1 ip link set dev __IFNAME1__ up
|
||||
guest1 /sbin/dhclient -4 __IFNAME1__
|
||||
g1out ADDR1 ip -j -4 addr show|jq -rM '.[] | select(.ifname == "__IFNAME1__").addr_info[0].local'
|
||||
hout HOST_ADDR ip -j -4 addr show|jq -rM '.[] | select(.ifname == "__HOST_IFNAME__").addr_info[0].local'
|
||||
check [ "__ADDR1__" = "__HOST_ADDR__" ]
|
||||
|
||||
test DHCPv6: address
|
||||
# Link is up now, wait for DAD to complete
|
||||
guest1 while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done
|
||||
guest1 /sbin/dhclient -6 __IFNAME1__
|
||||
# Wait for DAD to complete on the DHCP address
|
||||
guest1 while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done
|
||||
g1out ADDR1_6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__IFNAME1__").addr_info[] | select(.prefixlen == 128).local] | .[0]'
|
||||
hout HOST_ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__HOST_IFNAME6__").addr_info[] | select(.scope == "global" and .deprecated != true).local] | .[0]'
|
||||
check [ "__ADDR1_6__" = "__HOST_ADDR6__" ]
|
||||
|
||||
test TCP/IPv4: guest1/guest2 > host
|
||||
g1out GW1 ip -j -4 route show|jq -rM '.[] | select(.dst == "default").gateway'
|
||||
hostb socat -u TCP4-LISTEN:10006 OPEN:__STATESETUP__/msg,create,trunc
|
||||
sleep 1
|
||||
# Option 1: via spliced path in pasta, namespace to host
|
||||
# guest1b { printf "Hello from guest 1"; sleep 10; printf " and from guest 2\n"; } | socat -u STDIN TCP4:__GW1__:10003
|
||||
# Option 2: via --map-guest-addr (tap) in pasta, namespace to host
|
||||
guest1b { printf "Hello from guest 1"; sleep 3; printf " and from guest 2\n"; } | socat -u STDIN TCP4:__MAP_HOST4__:10006
|
||||
sleep 1
|
||||
|
||||
mon echo "migrate tcp:0:20005" | socat -u STDIN UNIX:__STATESETUP__/qemu_1_mon.sock
|
||||
|
||||
hostw
|
||||
hout MSG cat __STATESETUP__/msg
|
||||
check [ "__MSG__" = "Hello from guest 1 and from guest 2" ]
|
|
@ -1,62 +0,0 @@
|
|||
# SPDX-License-Identifier: GPL-2.0-or-later
|
||||
#
|
||||
# PASST - Plug A Simple Socket Transport
|
||||
# for qemu/UNIX domain socket mode
|
||||
#
|
||||
# PASTA - Pack A Subtle Tap Abstraction
|
||||
# for network namespace/tap device mode
|
||||
#
|
||||
# test/migrate/basic_fin - Outbound traffic across migration, half-closed socket
|
||||
#
|
||||
# Copyright (c) 2025 Red Hat GmbH
|
||||
# Author: Stefano Brivio <sbrivio@redhat.com>
|
||||
|
||||
g1tools ip jq dhclient socat cat
|
||||
htools ip jq
|
||||
|
||||
set MAP_HOST4 192.0.2.1
|
||||
set MAP_HOST6 2001:db8:9a55::1
|
||||
set MAP_NS4 192.0.2.2
|
||||
set MAP_NS6 2001:db8:9a55::2
|
||||
|
||||
test Interface name
|
||||
g1out IFNAME1 ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname'
|
||||
hout HOST_IFNAME ip -j -4 route show|jq -rM '[.[] | select(.dst == "default").dev] | .[0]'
|
||||
hout HOST_IFNAME6 ip -j -6 route show|jq -rM '[.[] | select(.dst == "default").dev] | .[0]'
|
||||
check [ -n "__IFNAME1__" ]
|
||||
|
||||
test DHCP: address
|
||||
guest1 ip link set dev __IFNAME1__ up
|
||||
guest1 /sbin/dhclient -4 __IFNAME1__
|
||||
g1out ADDR1 ip -j -4 addr show|jq -rM '.[] | select(.ifname == "__IFNAME1__").addr_info[0].local'
|
||||
hout HOST_ADDR ip -j -4 addr show|jq -rM '.[] | select(.ifname == "__HOST_IFNAME__").addr_info[0].local'
|
||||
check [ "__ADDR1__" = "__HOST_ADDR__" ]
|
||||
|
||||
test DHCPv6: address
|
||||
# Link is up now, wait for DAD to complete
|
||||
guest1 while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done
|
||||
guest1 /sbin/dhclient -6 __IFNAME1__
|
||||
# Wait for DAD to complete on the DHCP address
|
||||
guest1 while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done
|
||||
g1out ADDR1_6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__IFNAME1__").addr_info[] | select(.prefixlen == 128).local] | .[0]'
|
||||
hout HOST_ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__HOST_IFNAME6__").addr_info[] | select(.scope == "global" and .deprecated != true).local] | .[0]'
|
||||
check [ "__ADDR1_6__" = "__HOST_ADDR6__" ]
|
||||
|
||||
test TCP/IPv4: guest1, half-close, guest2 > host
|
||||
g1out GW1 ip -j -4 route show|jq -rM '.[] | select(.dst == "default").gateway'
|
||||
|
||||
hostb echo FIN | socat TCP4-LISTEN:10006,shut-down STDIO,ignoreeof > __STATESETUP__/msg
|
||||
#hostb socat -u TCP4-LISTEN:10006 OPEN:__STATESETUP__/msg,create,trunc
|
||||
|
||||
#sleep 20
|
||||
# Option 1: via spliced path in pasta, namespace to host
|
||||
# guest1b { printf "Hello from guest 1"; sleep 10; printf " and from guest 2\n"; } | socat -u STDIN TCP4:__GW1__:10003
|
||||
# Option 2: via --map-guest-addr (tap) in pasta, namespace to host
|
||||
guest1b { printf "Hello from guest 1"; sleep 3; printf " and from guest 2\n"; } | socat -u STDIN TCP4:__MAP_HOST4__:10006
|
||||
sleep 1
|
||||
|
||||
mon echo "migrate tcp:0:20005" | socat -u STDIN UNIX:__STATESETUP__/qemu_1_mon.sock
|
||||
|
||||
hostw
|
||||
hout MSG cat __STATESETUP__/msg
|
||||
check [ "__MSG__" = "Hello from guest 1 and from guest 2" ]
|
|
@ -1,64 +0,0 @@
|
|||
# SPDX-License-Identifier: GPL-2.0-or-later
|
||||
#
|
||||
# PASST - Plug A Simple Socket Transport
|
||||
# for qemu/UNIX domain socket mode
|
||||
#
|
||||
# PASTA - Pack A Subtle Tap Abstraction
|
||||
# for network namespace/tap device mode
|
||||
#
|
||||
# test/migrate/bidirectional - Check migration with messages in both directions
|
||||
#
|
||||
# Copyright (c) 2025 Red Hat GmbH
|
||||
# Author: Stefano Brivio <sbrivio@redhat.com>
|
||||
|
||||
g1tools ip jq dhclient socat cat
|
||||
htools ip jq
|
||||
|
||||
set MAP_HOST4 192.0.2.1
|
||||
set MAP_HOST6 2001:db8:9a55::1
|
||||
set MAP_NS4 192.0.2.2
|
||||
set MAP_NS6 2001:db8:9a55::2
|
||||
|
||||
test Interface name
|
||||
g1out IFNAME1 ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname'
|
||||
hout HOST_IFNAME ip -j -4 route show|jq -rM '[.[] | select(.dst == "default").dev] | .[0]'
|
||||
hout HOST_IFNAME6 ip -j -6 route show|jq -rM '[.[] | select(.dst == "default").dev] | .[0]'
|
||||
check [ -n "__IFNAME1__" ]
|
||||
|
||||
test DHCP: address
|
||||
guest1 ip link set dev __IFNAME1__ up
|
||||
guest1 /sbin/dhclient -4 __IFNAME1__
|
||||
g1out ADDR1 ip -j -4 addr show|jq -rM '.[] | select(.ifname == "__IFNAME1__").addr_info[0].local'
|
||||
hout HOST_ADDR ip -j -4 addr show|jq -rM '.[] | select(.ifname == "__HOST_IFNAME__").addr_info[0].local'
|
||||
check [ "__ADDR1__" = "__HOST_ADDR__" ]
|
||||
|
||||
test TCP/IPv4: guest1/guest2 > host, host > guest1/guest2
|
||||
g1out GW1 ip -j -4 route show|jq -rM '.[] | select(.dst == "default").gateway'
|
||||
|
||||
hostb socat -u TCP4-LISTEN:10006 OPEN:__STATESETUP__/msg,create,trunc
|
||||
guest1b socat -u TCP4-LISTEN:10001 OPEN:msg,create,trunc
|
||||
sleep 1
|
||||
|
||||
guest1b socat -u UNIX-RECV:proxy.sock,null-eof TCP4:__MAP_HOST4__:10006
|
||||
hostb socat -u UNIX-RECV:__STATESETUP__/proxy.sock,null-eof TCP4:__ADDR1__:10001
|
||||
sleep 1
|
||||
guest1 printf "Hello from guest 1" | socat -u STDIN UNIX:proxy.sock
|
||||
host printf "Dear guest 1," | socat -u STDIN UNIX:__STATESETUP__/proxy.sock
|
||||
sleep 1
|
||||
|
||||
mon echo "migrate tcp:0:20005" | socat -u STDIN UNIX:__STATESETUP__/qemu_1_mon.sock
|
||||
|
||||
sleep 2
|
||||
guest2 printf " and from guest 2" | socat -u STDIN UNIX:proxy.sock,shut-null
|
||||
host printf " you are now guest 2" | socat -u STDIN UNIX:__STATESETUP__/proxy.sock,shut-null
|
||||
|
||||
hostw
|
||||
# FIXME: guest2w doesn't work here because shell jobs are (also) from guest #1,
|
||||
# use sleep 1 for the moment
|
||||
sleep 1
|
||||
|
||||
hout MSG cat __STATESETUP__/msg
|
||||
check [ "__MSG__" = "Hello from guest 1 and from guest 2" ]
|
||||
|
||||
g2out MSG cat msg
|
||||
check [ "__MSG__" = "Dear guest 1, you are now guest 2" ]
|
|
@ -1,64 +0,0 @@
|
|||
# SPDX-License-Identifier: GPL-2.0-or-later
|
||||
#
|
||||
# PASST - Plug A Simple Socket Transport
|
||||
# for qemu/UNIX domain socket mode
|
||||
#
|
||||
# PASTA - Pack A Subtle Tap Abstraction
|
||||
# for network namespace/tap device mode
|
||||
#
|
||||
# test/migrate/bidirectional_fin - Both directions, half-closed sockets
|
||||
#
|
||||
# Copyright (c) 2025 Red Hat GmbH
|
||||
# Author: Stefano Brivio <sbrivio@redhat.com>
|
||||
|
||||
g1tools ip jq dhclient socat cat
|
||||
htools ip jq
|
||||
|
||||
set MAP_HOST4 192.0.2.1
|
||||
set MAP_HOST6 2001:db8:9a55::1
|
||||
set MAP_NS4 192.0.2.2
|
||||
set MAP_NS6 2001:db8:9a55::2
|
||||
|
||||
test Interface name
|
||||
g1out IFNAME1 ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname'
|
||||
hout HOST_IFNAME ip -j -4 route show|jq -rM '[.[] | select(.dst == "default").dev] | .[0]'
|
||||
hout HOST_IFNAME6 ip -j -6 route show|jq -rM '[.[] | select(.dst == "default").dev] | .[0]'
|
||||
check [ -n "__IFNAME1__" ]
|
||||
|
||||
test DHCP: address
|
||||
guest1 ip link set dev __IFNAME1__ up
|
||||
guest1 /sbin/dhclient -4 __IFNAME1__
|
||||
g1out ADDR1 ip -j -4 addr show|jq -rM '.[] | select(.ifname == "__IFNAME1__").addr_info[0].local'
|
||||
hout HOST_ADDR ip -j -4 addr show|jq -rM '.[] | select(.ifname == "__HOST_IFNAME__").addr_info[0].local'
|
||||
check [ "__ADDR1__" = "__HOST_ADDR__" ]
|
||||
|
||||
test TCP/IPv4: guest1/guest2 <- (half closed) -> host
|
||||
g1out GW1 ip -j -4 route show|jq -rM '.[] | select(.dst == "default").gateway'
|
||||
|
||||
hostb echo FIN | socat TCP4-LISTEN:10006,shut-down STDIO,ignoreeof > __STATESETUP__/msg
|
||||
guest1b echo FIN | socat TCP4-LISTEN:10001,shut-down STDIO,ignoreeof > msg
|
||||
sleep 1
|
||||
|
||||
guest1b socat -u UNIX-RECV:proxy.sock,null-eof TCP4:__MAP_HOST4__:10006
|
||||
hostb socat -u UNIX-RECV:__STATESETUP__/proxy.sock,null-eof TCP4:__ADDR1__:10001
|
||||
sleep 1
|
||||
guest1 printf "Hello from guest 1" | socat -u STDIN UNIX:proxy.sock
|
||||
host printf "Dear guest 1," | socat -u STDIN UNIX:__STATESETUP__/proxy.sock
|
||||
sleep 1
|
||||
|
||||
mon echo "migrate tcp:0:20005" | socat -u STDIN UNIX:__STATESETUP__/qemu_1_mon.sock
|
||||
|
||||
sleep 2
|
||||
guest2 printf " and from guest 2" | socat -u STDIN UNIX:proxy.sock,shut-null
|
||||
host printf " you are now guest 2" | socat -u STDIN UNIX:__STATESETUP__/proxy.sock,shut-null
|
||||
|
||||
hostw
|
||||
# FIXME: guest2w doesn't work here because shell jobs are (also) from guest #1,
|
||||
# use sleep 1 for the moment
|
||||
sleep 1
|
||||
|
||||
hout MSG cat __STATESETUP__/msg
|
||||
check [ "__MSG__" = "Hello from guest 1 and from guest 2" ]
|
||||
|
||||
g2out MSG cat msg
|
||||
check [ "__MSG__" = "Dear guest 1, you are now guest 2" ]
|
|
@ -1,58 +0,0 @@
|
|||
# SPDX-License-Identifier: GPL-2.0-or-later
|
||||
#
|
||||
# PASST - Plug A Simple Socket Transport
|
||||
# for qemu/UNIX domain socket mode
|
||||
#
|
||||
# PASTA - Pack A Subtle Tap Abstraction
|
||||
# for network namespace/tap device mode
|
||||
#
|
||||
# test/migrate/iperf3_bidir6 - Migration behaviour with many bidirectional flows
|
||||
#
|
||||
# Copyright (c) 2025 Red Hat GmbH
|
||||
# Author: Stefano Brivio <sbrivio@redhat.com>
|
||||
|
||||
g1tools ip jq dhclient socat cat
|
||||
htools ip jq
|
||||
|
||||
set MAP_HOST4 192.0.2.1
|
||||
set MAP_HOST6 2001:db8:9a55::1
|
||||
set MAP_NS4 192.0.2.2
|
||||
set MAP_NS6 2001:db8:9a55::2
|
||||
|
||||
set THREADS 128
|
||||
set TIME 3
|
||||
set OMIT 0.1
|
||||
set OPTS -Z -P __THREADS__ -O__OMIT__ -N --bidir
|
||||
|
||||
test Interface name
|
||||
g1out IFNAME1 ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname'
|
||||
hout HOST_IFNAME ip -j -4 route show|jq -rM '[.[] | select(.dst == "default").dev] | .[0]'
|
||||
hout HOST_IFNAME6 ip -j -6 route show|jq -rM '[.[] | select(.dst == "default").dev] | .[0]'
|
||||
check [ -n "__IFNAME1__" ]
|
||||
|
||||
test DHCP: address
|
||||
guest1 ip link set dev __IFNAME1__ up
|
||||
guest1 /sbin/dhclient -4 __IFNAME1__
|
||||
g1out ADDR1 ip -j -4 addr show|jq -rM '.[] | select(.ifname == "__IFNAME1__").addr_info[0].local'
|
||||
hout HOST_ADDR ip -j -4 addr show|jq -rM '.[] | select(.ifname == "__HOST_IFNAME__").addr_info[0].local'
|
||||
check [ "__ADDR1__" = "__HOST_ADDR__" ]
|
||||
|
||||
test DHCPv6: address
|
||||
# Link is up now, wait for DAD to complete
|
||||
guest1 while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done
|
||||
guest1 /sbin/dhclient -6 __IFNAME1__
|
||||
# Wait for DAD to complete on the DHCP address
|
||||
guest1 while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done
|
||||
g1out ADDR1_6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__IFNAME1__").addr_info[] | select(.prefixlen == 128).local] | .[0]'
|
||||
hout HOST_ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__HOST_IFNAME6__").addr_info[] | select(.scope == "global" and .deprecated != true).local] | .[0]'
|
||||
check [ "__ADDR1_6__" = "__HOST_ADDR6__" ]
|
||||
|
||||
test TCP/IPv6 host <-> guest flood, many flows, during migration
|
||||
|
||||
monb sleep 1; echo "migrate tcp:0:20005" | socat -u STDIN UNIX:__STATESETUP__/qemu_1_mon.sock
|
||||
|
||||
iperf3s host 10006
|
||||
iperf3m BW guest_1 guest_2 __MAP_HOST6__ 10006 __TIME__ __OPTS__
|
||||
bw __BW__ 1 2
|
||||
|
||||
iperf3k host
|
|
@ -1,50 +0,0 @@
|
|||
# SPDX-License-Identifier: GPL-2.0-or-later
|
||||
#
|
||||
# PASST - Plug A Simple Socket Transport
|
||||
# for qemu/UNIX domain socket mode
|
||||
#
|
||||
# PASTA - Pack A Subtle Tap Abstraction
|
||||
# for network namespace/tap device mode
|
||||
#
|
||||
# test/migrate/iperf3_in4 - Migration behaviour under inbound IPv4 flood
|
||||
#
|
||||
# Copyright (c) 2025 Red Hat GmbH
|
||||
# Author: Stefano Brivio <sbrivio@redhat.com>
|
||||
|
||||
g1tools ip jq dhclient socat cat
|
||||
htools ip jq
|
||||
|
||||
set MAP_HOST4 192.0.2.1
|
||||
set MAP_HOST6 2001:db8:9a55::1
|
||||
set MAP_NS4 192.0.2.2
|
||||
set MAP_NS6 2001:db8:9a55::2
|
||||
|
||||
guest1 /sbin/sysctl -w net.core.rmem_max=33554432
|
||||
guest1 /sbin/sysctl -w net.core.wmem_max=33554432
|
||||
|
||||
set THREADS 1
|
||||
set TIME 4
|
||||
set OMIT 0.1
|
||||
set OPTS -Z -P __THREADS__ -O__OMIT__ -N -R
|
||||
|
||||
test Interface name
|
||||
g1out IFNAME1 ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname'
|
||||
hout HOST_IFNAME ip -j -4 route show|jq -rM '[.[] | select(.dst == "default").dev] | .[0]'
|
||||
check [ -n "__IFNAME1__" ]
|
||||
|
||||
test DHCP: address
|
||||
guest1 ip link set dev __IFNAME1__ up
|
||||
guest1 /sbin/dhclient -4 __IFNAME1__
|
||||
g1out ADDR1 ip -j -4 addr show|jq -rM '.[] | select(.ifname == "__IFNAME1__").addr_info[0].local'
|
||||
hout HOST_ADDR ip -j -4 addr show|jq -rM '.[] | select(.ifname == "__HOST_IFNAME__").addr_info[0].local'
|
||||
check [ "__ADDR1__" = "__HOST_ADDR__" ]
|
||||
|
||||
test TCP/IPv4 host to guest throughput during migration
|
||||
|
||||
monb sleep 1; echo "migrate tcp:0:20005" | socat -u STDIN UNIX:__STATESETUP__/qemu_1_mon.sock
|
||||
|
||||
iperf3s host 10006
|
||||
iperf3m BW guest_1 guest_2 __MAP_HOST4__ 10006 __TIME__ __OPTS__
|
||||
bw __BW__ 1 2
|
||||
|
||||
iperf3k host
|
|
@ -1,58 +0,0 @@
|
|||
# SPDX-License-Identifier: GPL-2.0-or-later
|
||||
#
|
||||
# PASST - Plug A Simple Socket Transport
|
||||
# for qemu/UNIX domain socket mode
|
||||
#
|
||||
# PASTA - Pack A Subtle Tap Abstraction
|
||||
# for network namespace/tap device mode
|
||||
#
|
||||
# test/migrate/iperf3_in6 - Migration behaviour under inbound IPv6 flood
|
||||
#
|
||||
# Copyright (c) 2025 Red Hat GmbH
|
||||
# Author: Stefano Brivio <sbrivio@redhat.com>
|
||||
|
||||
g1tools ip jq dhclient socat cat
|
||||
htools ip jq
|
||||
|
||||
set MAP_HOST4 192.0.2.1
|
||||
set MAP_HOST6 2001:db8:9a55::1
|
||||
set MAP_NS4 192.0.2.2
|
||||
set MAP_NS6 2001:db8:9a55::2
|
||||
|
||||
set THREADS 4
|
||||
set TIME 3
|
||||
set OMIT 0.1
|
||||
set OPTS -Z -P __THREADS__ -O__OMIT__ -N -R
|
||||
|
||||
test Interface name
|
||||
g1out IFNAME1 ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname'
|
||||
hout HOST_IFNAME ip -j -4 route show|jq -rM '[.[] | select(.dst == "default").dev] | .[0]'
|
||||
hout HOST_IFNAME6 ip -j -6 route show|jq -rM '[.[] | select(.dst == "default").dev] | .[0]'
|
||||
check [ -n "__IFNAME1__" ]
|
||||
|
||||
test DHCP: address
|
||||
guest1 ip link set dev __IFNAME1__ up
|
||||
guest1 /sbin/dhclient -4 __IFNAME1__
|
||||
g1out ADDR1 ip -j -4 addr show|jq -rM '.[] | select(.ifname == "__IFNAME1__").addr_info[0].local'
|
||||
hout HOST_ADDR ip -j -4 addr show|jq -rM '.[] | select(.ifname == "__HOST_IFNAME__").addr_info[0].local'
|
||||
check [ "__ADDR1__" = "__HOST_ADDR__" ]
|
||||
|
||||
test DHCPv6: address
|
||||
# Link is up now, wait for DAD to complete
|
||||
guest1 while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done
|
||||
guest1 /sbin/dhclient -6 __IFNAME1__
|
||||
# Wait for DAD to complete on the DHCP address
|
||||
guest1 while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done
|
||||
g1out ADDR1_6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__IFNAME1__").addr_info[] | select(.prefixlen == 128).local] | .[0]'
|
||||
hout HOST_ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__HOST_IFNAME6__").addr_info[] | select(.scope == "global" and .deprecated != true).local] | .[0]'
|
||||
check [ "__ADDR1_6__" = "__HOST_ADDR6__" ]
|
||||
|
||||
test TCP/IPv6 host to guest throughput during migration
|
||||
|
||||
monb sleep 1; echo "migrate tcp:0:20005" | socat -u STDIN UNIX:__STATESETUP__/qemu_1_mon.sock
|
||||
|
||||
iperf3s host 10006
|
||||
iperf3m BW guest_1 guest_2 __MAP_HOST6__ 10006 __TIME__ __OPTS__
|
||||
bw __BW__ 1 2
|
||||
|
||||
iperf3k host
|
|
@ -1,60 +0,0 @@
|
|||
# SPDX-License-Identifier: GPL-2.0-or-later
|
||||
#
|
||||
# PASST - Plug A Simple Socket Transport
|
||||
# for qemu/UNIX domain socket mode
|
||||
#
|
||||
# PASTA - Pack A Subtle Tap Abstraction
|
||||
# for network namespace/tap device mode
|
||||
#
|
||||
# test/migrate/iperf3_many_out6 - Migration behaviour with many outbound flows
|
||||
#
|
||||
# Copyright (c) 2025 Red Hat GmbH
|
||||
# Author: Stefano Brivio <sbrivio@redhat.com>
|
||||
|
||||
g1tools ip jq dhclient socat cat
|
||||
htools ip jq
|
||||
|
||||
set MAP_HOST4 192.0.2.1
|
||||
set MAP_HOST6 2001:db8:9a55::1
|
||||
set MAP_NS4 192.0.2.2
|
||||
set MAP_NS6 2001:db8:9a55::2
|
||||
|
||||
set THREADS 16
|
||||
set TIME 3
|
||||
set OMIT 0.1
|
||||
set OPTS -Z -P __THREADS__ -O__OMIT__ -N -l 1M
|
||||
|
||||
test Interface name
|
||||
g1out IFNAME1 ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname'
|
||||
hout HOST_IFNAME ip -j -4 route show|jq -rM '[.[] | select(.dst == "default").dev] | .[0]'
|
||||
hout HOST_IFNAME6 ip -j -6 route show|jq -rM '[.[] | select(.dst == "default").dev] | .[0]'
|
||||
check [ -n "__IFNAME1__" ]
|
||||
|
||||
test DHCP: address
|
||||
guest1 ip link set dev __IFNAME1__ up
|
||||
guest1 /sbin/dhclient -4 __IFNAME1__
|
||||
g1out ADDR1 ip -j -4 addr show|jq -rM '.[] | select(.ifname == "__IFNAME1__").addr_info[0].local'
|
||||
hout HOST_ADDR ip -j -4 addr show|jq -rM '.[] | select(.ifname == "__HOST_IFNAME__").addr_info[0].local'
|
||||
check [ "__ADDR1__" = "__HOST_ADDR__" ]
|
||||
|
||||
test DHCPv6: address
|
||||
# Link is up now, wait for DAD to complete
|
||||
guest1 while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done
|
||||
guest1 /sbin/dhclient -6 __IFNAME1__
|
||||
# Wait for DAD to complete on the DHCP address
|
||||
guest1 while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done
|
||||
g1out ADDR1_6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__IFNAME1__").addr_info[] | select(.prefixlen == 128).local] | .[0]'
|
||||
hout HOST_ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__HOST_IFNAME6__").addr_info[] | select(.scope == "global" and .deprecated != true).local] | .[0]'
|
||||
check [ "__ADDR1_6__" = "__HOST_ADDR6__" ]
|
||||
|
||||
test TCP/IPv6 guest to host flood, many flows, during migration
|
||||
|
||||
test TCP/IPv6 host to guest throughput during migration
|
||||
|
||||
monb sleep 1; echo "migrate tcp:0:20005" | socat -u STDIN UNIX:__STATESETUP__/qemu_1_mon.sock
|
||||
|
||||
iperf3s host 10006
|
||||
iperf3m BW guest_1 guest_2 __MAP_HOST6__ 10006 __TIME__ __OPTS__
|
||||
bw __BW__ 1 2
|
||||
|
||||
iperf3k host
|
|
@ -1,47 +0,0 @@
|
|||
# SPDX-License-Identifier: GPL-2.0-or-later
|
||||
#
|
||||
# PASST - Plug A Simple Socket Transport
|
||||
# for qemu/UNIX domain socket mode
|
||||
#
|
||||
# PASTA - Pack A Subtle Tap Abstraction
|
||||
# for network namespace/tap device mode
|
||||
#
|
||||
# test/migrate/iperf3_out4 - Migration behaviour under outbound IPv4 flood
|
||||
#
|
||||
# Copyright (c) 2025 Red Hat GmbH
|
||||
# Author: Stefano Brivio <sbrivio@redhat.com>
|
||||
|
||||
g1tools ip jq dhclient socat cat
|
||||
htools ip jq
|
||||
|
||||
set MAP_HOST4 192.0.2.1
|
||||
set MAP_HOST6 2001:db8:9a55::1
|
||||
set MAP_NS4 192.0.2.2
|
||||
set MAP_NS6 2001:db8:9a55::2
|
||||
|
||||
set THREADS 6
|
||||
set TIME 2
|
||||
set OMIT 0.1
|
||||
set OPTS -P __THREADS__ -O__OMIT__ -Z -N -l 1M
|
||||
|
||||
test Interface name
|
||||
g1out IFNAME1 ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname'
|
||||
hout HOST_IFNAME ip -j -4 route show|jq -rM '[.[] | select(.dst == "default").dev] | .[0]'
|
||||
check [ -n "__IFNAME1__" ]
|
||||
|
||||
test DHCP: address
|
||||
guest1 ip link set dev __IFNAME1__ up
|
||||
guest1 /sbin/dhclient -4 __IFNAME1__
|
||||
g1out ADDR1 ip -j -4 addr show|jq -rM '.[] | select(.ifname == "__IFNAME1__").addr_info[0].local'
|
||||
hout HOST_ADDR ip -j -4 addr show|jq -rM '.[] | select(.ifname == "__HOST_IFNAME__").addr_info[0].local'
|
||||
check [ "__ADDR1__" = "__HOST_ADDR__" ]
|
||||
|
||||
test TCP/IPv4 guest to host throughput during migration
|
||||
|
||||
monb sleep 1; echo "migrate tcp:0:20005" | socat -u STDIN UNIX:__STATESETUP__/qemu_1_mon.sock
|
||||
|
||||
iperf3s host 10006
|
||||
iperf3m BW guest_1 guest_2 __MAP_HOST4__ 10006 __TIME__ __OPTS__
|
||||
bw __BW__ 1 2
|
||||
|
||||
iperf3k host
|
|
@ -1,58 +0,0 @@
|
|||
# SPDX-License-Identifier: GPL-2.0-or-later
|
||||
#
|
||||
# PASST - Plug A Simple Socket Transport
|
||||
# for qemu/UNIX domain socket mode
|
||||
#
|
||||
# PASTA - Pack A Subtle Tap Abstraction
|
||||
# for network namespace/tap device mode
|
||||
#
|
||||
# test/migrate/iperf3_out6 - Migration behaviour under outbound IPv6 flood
|
||||
#
|
||||
# Copyright (c) 2025 Red Hat GmbH
|
||||
# Author: Stefano Brivio <sbrivio@redhat.com>
|
||||
|
||||
g1tools ip jq dhclient socat cat
|
||||
htools ip jq
|
||||
|
||||
set MAP_HOST4 192.0.2.1
|
||||
set MAP_HOST6 2001:db8:9a55::1
|
||||
set MAP_NS4 192.0.2.2
|
||||
set MAP_NS6 2001:db8:9a55::2
|
||||
|
||||
set THREADS 6
|
||||
set TIME 2
|
||||
set OMIT 0.1
|
||||
set OPTS -P __THREADS__ -O__OMIT__ -Z -N -l 1M
|
||||
|
||||
test Interface name
|
||||
g1out IFNAME1 ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname'
|
||||
hout HOST_IFNAME ip -j -4 route show|jq -rM '[.[] | select(.dst == "default").dev] | .[0]'
|
||||
hout HOST_IFNAME6 ip -j -6 route show|jq -rM '[.[] | select(.dst == "default").dev] | .[0]'
|
||||
check [ -n "__IFNAME1__" ]
|
||||
|
||||
test DHCP: address
|
||||
guest1 ip link set dev __IFNAME1__ up
|
||||
guest1 /sbin/dhclient -4 __IFNAME1__
|
||||
g1out ADDR1 ip -j -4 addr show|jq -rM '.[] | select(.ifname == "__IFNAME1__").addr_info[0].local'
|
||||
hout HOST_ADDR ip -j -4 addr show|jq -rM '.[] | select(.ifname == "__HOST_IFNAME__").addr_info[0].local'
|
||||
check [ "__ADDR1__" = "__HOST_ADDR__" ]
|
||||
|
||||
test DHCPv6: address
|
||||
# Link is up now, wait for DAD to complete
|
||||
guest1 while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done
|
||||
guest1 /sbin/dhclient -6 __IFNAME1__
|
||||
# Wait for DAD to complete on the DHCP address
|
||||
guest1 while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done
|
||||
g1out ADDR1_6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__IFNAME1__").addr_info[] | select(.prefixlen == 128).local] | .[0]'
|
||||
hout HOST_ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__HOST_IFNAME6__").addr_info[] | select(.scope == "global" and .deprecated != true).local] | .[0]'
|
||||
check [ "__ADDR1_6__" = "__HOST_ADDR6__" ]
|
||||
|
||||
test TCP/IPv6 guest to host throughput during migration
|
||||
|
||||
monb sleep 1; echo "migrate tcp:0:20005" | socat -u STDIN UNIX:__STATESETUP__/qemu_1_mon.sock
|
||||
|
||||
iperf3s host 10006
|
||||
iperf3m BW guest_1 guest_2 __MAP_HOST6__ 10006 __TIME__ __OPTS__
|
||||
bw __BW__ 1 2
|
||||
|
||||
iperf3k host
|
Some files were not shown because too many files have changed in this diff Show more
Loading…
Add table
Add a link
Reference in a new issue