Compare commits
16 commits
master
...
vhost-user
Author | SHA1 | Date | |
---|---|---|---|
|
60e35ab2bd | ||
|
95aebad0a4 | ||
|
2d5528c9be | ||
|
1bf4abe402 | ||
|
37f457a76c | ||
|
b2229bd24f | ||
|
45b1403f42 | ||
|
bb3877dde3 | ||
|
27a713947c | ||
|
0938100596 | ||
|
72cadf34ad | ||
|
4d7ca742ef | ||
|
9cc20cbdb1 | ||
|
c38f260820 | ||
|
576c1cca2c | ||
|
a66fceb280 |
120 changed files with 7451 additions and 8209 deletions
126
.clang-format
126
.clang-format
|
@ -1,126 +0,0 @@
|
|||
# SPDX-License-Identifier: GPL-2.0
|
||||
#
|
||||
# clang-format configuration file. Intended for clang-format >= 11.
|
||||
#
|
||||
# For more information, see:
|
||||
#
|
||||
# Documentation/dev-tools/clang-format.rst
|
||||
# https://clang.llvm.org/docs/ClangFormat.html
|
||||
# https://clang.llvm.org/docs/ClangFormatStyleOptions.html
|
||||
#
|
||||
---
|
||||
AccessModifierOffset: -4
|
||||
AlignAfterOpenBracket: Align
|
||||
AlignConsecutiveAssignments: false
|
||||
AlignConsecutiveDeclarations: false
|
||||
AlignEscapedNewlines: Left
|
||||
AlignOperands: true
|
||||
AlignTrailingComments: false
|
||||
AllowAllParametersOfDeclarationOnNextLine: false
|
||||
AllowShortBlocksOnASingleLine: false
|
||||
AllowShortCaseLabelsOnASingleLine: false
|
||||
AllowShortFunctionsOnASingleLine: None
|
||||
AllowShortIfStatementsOnASingleLine: false
|
||||
AllowShortLoopsOnASingleLine: false
|
||||
AlwaysBreakAfterDefinitionReturnType: None
|
||||
AlwaysBreakAfterReturnType: None
|
||||
AlwaysBreakBeforeMultilineStrings: false
|
||||
AlwaysBreakTemplateDeclarations: false
|
||||
BinPackArguments: true
|
||||
BinPackParameters: true
|
||||
BraceWrapping:
|
||||
AfterClass: false
|
||||
AfterControlStatement: false
|
||||
AfterEnum: false
|
||||
AfterFunction: true
|
||||
AfterNamespace: true
|
||||
AfterObjCDeclaration: false
|
||||
AfterStruct: false
|
||||
AfterUnion: false
|
||||
AfterExternBlock: false
|
||||
BeforeCatch: false
|
||||
BeforeElse: false
|
||||
IndentBraces: false
|
||||
SplitEmptyFunction: true
|
||||
SplitEmptyRecord: true
|
||||
SplitEmptyNamespace: true
|
||||
BreakBeforeBinaryOperators: None
|
||||
BreakBeforeBraces: Custom
|
||||
BreakBeforeInheritanceComma: false
|
||||
BreakBeforeTernaryOperators: false
|
||||
BreakConstructorInitializersBeforeComma: false
|
||||
BreakConstructorInitializers: BeforeComma
|
||||
BreakAfterJavaFieldAnnotations: false
|
||||
BreakStringLiterals: false
|
||||
ColumnLimit: 80
|
||||
CommentPragmas: '^ IWYU pragma:'
|
||||
CompactNamespaces: false
|
||||
ConstructorInitializerAllOnOneLineOrOnePerLine: false
|
||||
ConstructorInitializerIndentWidth: 8
|
||||
ContinuationIndentWidth: 8
|
||||
Cpp11BracedListStyle: false
|
||||
DerivePointerAlignment: false
|
||||
DisableFormat: false
|
||||
ExperimentalAutoDetectBinPacking: false
|
||||
FixNamespaceComments: false
|
||||
|
||||
# Taken from:
|
||||
# git grep -h '^#define [^[:space:]]*for_each[^[:space:]]*(' include/ tools/ \
|
||||
# | sed "s,^#define \([^[:space:]]*for_each[^[:space:]]*\)(.*$, - '\1'," \
|
||||
# | LC_ALL=C sort -u
|
||||
ForEachMacros:
|
||||
- 'for_each_nst'
|
||||
|
||||
IncludeBlocks: Preserve
|
||||
IncludeCategories:
|
||||
- Regex: '.*'
|
||||
Priority: 1
|
||||
IncludeIsMainRegex: '(Test)?$'
|
||||
IndentCaseLabels: false
|
||||
IndentGotoLabels: false
|
||||
IndentPPDirectives: None
|
||||
IndentWidth: 8
|
||||
IndentWrappedFunctionNames: false
|
||||
JavaScriptQuotes: Leave
|
||||
JavaScriptWrapImports: true
|
||||
KeepEmptyLinesAtTheStartOfBlocks: false
|
||||
MacroBlockBegin: ''
|
||||
MacroBlockEnd: ''
|
||||
MaxEmptyLinesToKeep: 1
|
||||
NamespaceIndentation: None
|
||||
ObjCBinPackProtocolList: Auto
|
||||
ObjCBlockIndentWidth: 8
|
||||
ObjCSpaceAfterProperty: true
|
||||
ObjCSpaceBeforeProtocolList: true
|
||||
|
||||
# Taken from git's rules
|
||||
PenaltyBreakAssignment: 10
|
||||
PenaltyBreakBeforeFirstCallParameter: 30
|
||||
PenaltyBreakComment: 10
|
||||
PenaltyBreakFirstLessLess: 0
|
||||
PenaltyBreakString: 10
|
||||
PenaltyExcessCharacter: 100
|
||||
PenaltyReturnTypeOnItsOwnLine: 60
|
||||
|
||||
PointerAlignment: Right
|
||||
ReflowComments: false
|
||||
SortIncludes: false
|
||||
SortUsingDeclarations: false
|
||||
SpaceAfterCStyleCast: false
|
||||
SpaceAfterTemplateKeyword: true
|
||||
SpaceBeforeAssignmentOperators: true
|
||||
SpaceBeforeCtorInitializerColon: true
|
||||
SpaceBeforeInheritanceColon: true
|
||||
SpaceBeforeParens: ControlStatementsExceptForEachMacros
|
||||
SpaceBeforeRangeBasedForLoopColon: true
|
||||
SpaceInEmptyParentheses: false
|
||||
SpacesBeforeTrailingComments: 1
|
||||
SpacesInAngles: false
|
||||
SpacesInContainerLiterals: false
|
||||
SpacesInCStyleCastParentheses: false
|
||||
SpacesInParentheses: false
|
||||
SpacesInSquareBrackets: false
|
||||
Standard: Cpp03
|
||||
TabWidth: 8
|
||||
UseTab: Always
|
||||
...
|
93
.clang-tidy
93
.clang-tidy
|
@ -1,93 +0,0 @@
|
|||
---
|
||||
Checks:
|
||||
- "clang-diagnostic-*,clang-analyzer-*,*,-modernize-*"
|
||||
|
||||
# TODO: enable once https://bugs.llvm.org/show_bug.cgi?id=41311 is fixed
|
||||
- "-clang-analyzer-valist.Uninitialized"
|
||||
|
||||
# Dubious value, would kill readability
|
||||
- "-cppcoreguidelines-init-variables"
|
||||
|
||||
# Dubious value over the compiler's built-in warning. Would
|
||||
# increase verbosity.
|
||||
- "-bugprone-assignment-in-if-condition"
|
||||
|
||||
# Debatable whether these improve readability, right now it would look
|
||||
# like a mess
|
||||
- "-google-readability-braces-around-statements"
|
||||
- "-hicpp-braces-around-statements"
|
||||
- "-readability-braces-around-statements"
|
||||
|
||||
# TODO: in most cases they are justified, but probably not everywhere
|
||||
#
|
||||
- "-readability-magic-numbers"
|
||||
- "-cppcoreguidelines-avoid-magic-numbers"
|
||||
|
||||
# TODO: this is Linux-only for the moment, nice to fix eventually
|
||||
- "-llvmlibc-restrict-system-libc-headers"
|
||||
|
||||
# Those are needed for syscalls, epoll_wait flags, etc.
|
||||
- "-hicpp-signed-bitwise"
|
||||
|
||||
# Probably not doable to implement this without plain memcpy(), memset()
|
||||
- "-clang-analyzer-security.insecureAPI.DeprecatedOrUnsafeBufferHandling"
|
||||
|
||||
# TODO: not really important, but nice to fix eventually
|
||||
- "-llvm-include-order"
|
||||
|
||||
# Dubious value, would kill readability
|
||||
- "-readability-isolate-declaration"
|
||||
|
||||
# TODO: nice to fix eventually
|
||||
- "-bugprone-narrowing-conversions"
|
||||
- "-cppcoreguidelines-narrowing-conversions"
|
||||
|
||||
# TODO: check, fix, and more in general constify wherever possible
|
||||
- "-cppcoreguidelines-avoid-non-const-global-variables"
|
||||
|
||||
# TODO: check paths where it might make sense to improve performance
|
||||
- "-altera-unroll-loops"
|
||||
- "-altera-id-dependent-backward-branch"
|
||||
|
||||
# Not much can be done about them other than being careful
|
||||
- "-bugprone-easily-swappable-parameters"
|
||||
|
||||
# TODO: split reported functions
|
||||
- "-readability-function-cognitive-complexity"
|
||||
|
||||
# "Poor" alignment needed for structs reflecting message formats/headers
|
||||
- "-altera-struct-pack-align"
|
||||
|
||||
# TODO: check again if multithreading is implemented
|
||||
- "-concurrency-mt-unsafe"
|
||||
|
||||
# Complains about any identifier <3 characters, reasonable for
|
||||
# globals, pointlessly verbose for locals and parameters.
|
||||
- "-readability-identifier-length"
|
||||
|
||||
# Wants to include headers which *directly* provide the things
|
||||
# we use. That sounds nice, but means it will often want an OS
|
||||
# specific header instead of a mostly standard one, such as
|
||||
# <linux/limits.h> instead of <limits.h>.
|
||||
- "-misc-include-cleaner"
|
||||
|
||||
# Want to replace all #defines of integers with enums. Kind of
|
||||
# makes sense when those defines form an enum-like set, but
|
||||
# weird for cases like standalone constants, and causes other
|
||||
# awkwardness for a bunch of cases we use
|
||||
- "-cppcoreguidelines-macro-to-enum"
|
||||
|
||||
# It's been a couple of centuries since multiplication has been granted
|
||||
# precedence over addition in modern mathematical notation. Adding
|
||||
# parentheses to reinforce that certainly won't improve readability.
|
||||
- "-readability-math-missing-parentheses"
|
||||
WarningsAsErrors: "*"
|
||||
HeaderFileExtensions:
|
||||
- h
|
||||
ImplementationFileExtensions:
|
||||
- c
|
||||
HeaderFilterRegex: ""
|
||||
FormatStyle: none
|
||||
CheckOptions:
|
||||
bugprone-suspicious-string-compare.WarnOnImplicitComparison: "false"
|
||||
SystemHeaders: false
|
3
.clangd
3
.clangd
|
@ -1,3 +0,0 @@
|
|||
CompileFlags:
|
||||
# Don't try to interpret our headers as C++
|
||||
Add: [-xc, -Wall]
|
177
Makefile
177
Makefile
|
@ -15,41 +15,66 @@ VERSION ?= $(shell git describe --tags HEAD 2>/dev/null || echo "unknown\ versio
|
|||
# the IPv6 socket API? (Linux does)
|
||||
DUAL_STACK_SOCKETS := 1
|
||||
|
||||
RLIMIT_STACK_VAL := $(shell /bin/sh -c 'ulimit -s')
|
||||
ifeq ($(RLIMIT_STACK_VAL),unlimited)
|
||||
RLIMIT_STACK_VAL := 1024
|
||||
endif
|
||||
|
||||
TARGET ?= $(shell $(CC) -dumpmachine)
|
||||
# Get 'uname -m'-like architecture description for target
|
||||
TARGET_ARCH := $(shell echo $(TARGET) | cut -f1 -d- | tr [A-Z] [a-z])
|
||||
TARGET_ARCH := $(shell echo $(TARGET_ARCH) | sed 's/powerpc/ppc/')
|
||||
|
||||
# On some systems enabling optimization also enables source fortification,
|
||||
# automagically. Do not override it.
|
||||
FORTIFY_FLAG :=
|
||||
ifeq ($(shell $(CC) -O2 -dM -E - < /dev/null 2>&1 | grep ' _FORTIFY_SOURCE ' > /dev/null; echo $$?),1)
|
||||
FORTIFY_FLAG := -D_FORTIFY_SOURCE=2
|
||||
endif
|
||||
AUDIT_ARCH := $(shell echo $(TARGET_ARCH) | tr [a-z] [A-Z] | sed 's/^ARM.*/ARM/')
|
||||
AUDIT_ARCH := $(shell echo $(AUDIT_ARCH) | sed 's/I[456]86/I386/')
|
||||
AUDIT_ARCH := $(shell echo $(AUDIT_ARCH) | sed 's/PPC64/PPC/')
|
||||
AUDIT_ARCH := $(shell echo $(AUDIT_ARCH) | sed 's/PPCLE/PPC64LE/')
|
||||
AUDIT_ARCH := $(shell echo $(AUDIT_ARCH) | sed 's/MIPS64EL/MIPSEL64/')
|
||||
AUDIT_ARCH := $(shell echo $(AUDIT_ARCH) | sed 's/HPPA/PARISC/')
|
||||
AUDIT_ARCH := $(shell echo $(AUDIT_ARCH) | sed 's/SH4/SH/')
|
||||
|
||||
FLAGS := -Wall -Wextra -Wno-format-zero-length
|
||||
FLAGS += -pedantic -std=c11 -D_XOPEN_SOURCE=700 -D_GNU_SOURCE
|
||||
FLAGS += $(FORTIFY_FLAG) -O2 -pie -fPIE
|
||||
FLAGS += -D_FORTIFY_SOURCE=2 -O2 -pie -fPIE
|
||||
FLAGS += -DPAGE_SIZE=$(shell getconf PAGE_SIZE)
|
||||
FLAGS += -DNETNS_RUN_DIR=\"/run/netns\"
|
||||
FLAGS += -DPASST_AUDIT_ARCH=AUDIT_ARCH_$(AUDIT_ARCH)
|
||||
FLAGS += -DRLIMIT_STACK_VAL=$(RLIMIT_STACK_VAL)
|
||||
FLAGS += -DARCH=\"$(TARGET_ARCH)\"
|
||||
FLAGS += -DVERSION=\"$(VERSION)\"
|
||||
FLAGS += -DDUAL_STACK_SOCKETS=$(DUAL_STACK_SOCKETS)
|
||||
|
||||
PASST_SRCS = arch.c arp.c checksum.c conf.c dhcp.c dhcpv6.c flow.c fwd.c \
|
||||
icmp.c igmp.c inany.c iov.c ip.c isolation.c lineread.c log.c mld.c \
|
||||
ndp.c netlink.c packet.c passt.c pasta.c pcap.c pif.c tap.c tcp.c \
|
||||
tcp_buf.c tcp_splice.c udp.c udp_flow.c util.c
|
||||
tcp_buf.c tcp_splice.c tcp_vu.c udp.c udp_vu.c util.c vhost_user.c virtio.c
|
||||
QRAP_SRCS = qrap.c
|
||||
SRCS = $(PASST_SRCS) $(QRAP_SRCS)
|
||||
|
||||
MANPAGES = passt.1 pasta.1 qrap.1
|
||||
|
||||
PASST_HEADERS = arch.h arp.h checksum.h conf.h dhcp.h dhcpv6.h flow.h fwd.h \
|
||||
flow_table.h icmp.h icmp_flow.h inany.h iov.h ip.h isolation.h \
|
||||
lineread.h log.h ndp.h netlink.h packet.h passt.h pasta.h pcap.h pif.h \
|
||||
siphash.h tap.h tcp.h tcp_buf.h tcp_conn.h tcp_internal.h tcp_splice.h \
|
||||
udp.h udp_flow.h util.h
|
||||
flow_table.h icmp.h inany.h iov.h ip.h isolation.h lineread.h log.h \
|
||||
ndp.h netlink.h packet.h passt.h pasta.h pcap.h pif.h siphash.h tap.h \
|
||||
tcp.h tcp_buf.h tcp_conn.h tcp_splice.h tcp_vu.h udp.h udp_internal.h \
|
||||
udp_vu.h util.h vhost_user.h virtio.h
|
||||
HEADERS = $(PASST_HEADERS) seccomp.h
|
||||
|
||||
C := \#include <linux/tcp.h>\nstruct tcp_info x = { .tcpi_snd_wnd = 0 };
|
||||
ifeq ($(shell printf "$(C)" | $(CC) -S -xc - -o - >/dev/null 2>&1; echo $$?),0)
|
||||
FLAGS += -DHAS_SND_WND
|
||||
endif
|
||||
|
||||
C := \#include <linux/tcp.h>\nstruct tcp_info x = { .tcpi_bytes_acked = 0 };
|
||||
ifeq ($(shell printf "$(C)" | $(CC) -S -xc - -o - >/dev/null 2>&1; echo $$?),0)
|
||||
FLAGS += -DHAS_BYTES_ACKED
|
||||
endif
|
||||
|
||||
C := \#include <linux/tcp.h>\nstruct tcp_info x = { .tcpi_min_rtt = 0 };
|
||||
ifeq ($(shell printf "$(C)" | $(CC) -S -xc - -o - >/dev/null 2>&1; echo $$?),0)
|
||||
FLAGS += -DHAS_MIN_RTT
|
||||
endif
|
||||
|
||||
C := \#include <sys/random.h>\nint main(){int a=getrandom(0, 0, 0);}
|
||||
ifeq ($(shell printf "$(C)" | $(CC) -S -xc - -o - >/dev/null 2>&1; echo $$?),0)
|
||||
FLAGS += -DHAS_GETRANDOM
|
||||
|
@ -59,6 +84,11 @@ ifeq ($(shell :|$(CC) -fstack-protector-strong -S -xc - -o - >/dev/null 2>&1; ec
|
|||
FLAGS += -fstack-protector-strong
|
||||
endif
|
||||
|
||||
C := \#define _GNU_SOURCE\n\#include <fcntl.h>\nint x = FALLOC_FL_COLLAPSE_RANGE;
|
||||
ifeq ($(shell printf "$(C)" | $(CC) -S -xc - -o - >/dev/null 2>&1; echo $$?),0)
|
||||
EXTRA_SYSCALLS += fallocate
|
||||
endif
|
||||
|
||||
prefix ?= /usr/local
|
||||
exec_prefix ?= $(prefix)
|
||||
bindir ?= $(exec_prefix)/bin
|
||||
|
@ -95,11 +125,11 @@ pasta.avx2 pasta.1 pasta: pasta%: passt%
|
|||
ln -sf $< $@
|
||||
|
||||
qrap: $(QRAP_SRCS) passt.h
|
||||
$(CC) $(FLAGS) $(CFLAGS) $(CPPFLAGS) -DARCH=\"$(TARGET_ARCH)\" $(QRAP_SRCS) -o qrap $(LDFLAGS)
|
||||
$(CC) $(FLAGS) $(CFLAGS) $(CPPFLAGS) $(QRAP_SRCS) -o qrap $(LDFLAGS)
|
||||
|
||||
valgrind: EXTRA_SYSCALLS += rt_sigprocmask rt_sigtimedwait rt_sigaction \
|
||||
rt_sigreturn getpid gettid kill clock_gettime mmap \
|
||||
mmap2 munmap open unlink gettimeofday futex
|
||||
getpid gettid kill clock_gettime mmap \
|
||||
munmap open unlink gettimeofday futex
|
||||
valgrind: FLAGS += -g -DVALGRIND
|
||||
valgrind: all
|
||||
|
||||
|
@ -159,11 +189,111 @@ docs: README.md
|
|||
done < README.md; \
|
||||
) > README.plain.md
|
||||
|
||||
clang-tidy: $(PASST_SRCS) $(HEADERS)
|
||||
clang-tidy $(PASST_SRCS) -- $(filter-out -pie,$(FLAGS) $(CFLAGS) $(CPPFLAGS)) \
|
||||
-DCLANG_TIDY_58992
|
||||
# Checkers currently disabled for clang-tidy:
|
||||
# - llvmlibc-restrict-system-libc-headers
|
||||
# TODO: this is Linux-only for the moment, nice to fix eventually
|
||||
#
|
||||
# - bugprone-macro-parentheses
|
||||
# - google-readability-braces-around-statements
|
||||
# - hicpp-braces-around-statements
|
||||
# - readability-braces-around-statements
|
||||
# Debatable whether that improves readability, right now it would look
|
||||
# like a mess
|
||||
#
|
||||
# - readability-magic-numbers
|
||||
# - cppcoreguidelines-avoid-magic-numbers
|
||||
# TODO: in most cases they are justified, but probably not everywhere
|
||||
#
|
||||
# - clang-analyzer-valist.Uninitialized
|
||||
# TODO: enable once https://bugs.llvm.org/show_bug.cgi?id=41311 is fixed
|
||||
#
|
||||
# - clang-analyzer-security.insecureAPI.DeprecatedOrUnsafeBufferHandling
|
||||
# Probably not doable to implement this without plain memcpy(), memset()
|
||||
#
|
||||
# - cppcoreguidelines-init-variables
|
||||
# Dubious value, would kill readability
|
||||
#
|
||||
# - hicpp-signed-bitwise
|
||||
# Those are needed for syscalls, epoll_wait flags, etc.
|
||||
#
|
||||
# - llvm-include-order
|
||||
# TODO: not really important, but nice to fix eventually
|
||||
#
|
||||
# - readability-isolate-declaration
|
||||
# Dubious value, would kill readability
|
||||
#
|
||||
# - bugprone-narrowing-conversions
|
||||
# - cppcoreguidelines-narrowing-conversions
|
||||
# TODO: nice to fix eventually
|
||||
#
|
||||
# - cppcoreguidelines-avoid-non-const-global-variables
|
||||
# TODO: check, fix, and more in general constify wherever possible
|
||||
#
|
||||
# - altera-unroll-loops
|
||||
# - altera-id-dependent-backward-branch
|
||||
# TODO: check paths where it might make sense to improve performance
|
||||
#
|
||||
# - bugprone-easily-swappable-parameters
|
||||
# Not much can be done about them other than being careful
|
||||
#
|
||||
# - readability-function-cognitive-complexity
|
||||
# TODO: split reported functions
|
||||
#
|
||||
# - altera-struct-pack-align
|
||||
# "Poor" alignment needed for structs reflecting message formats/headers
|
||||
#
|
||||
# - concurrency-mt-unsafe
|
||||
# TODO: check again if multithreading is implemented
|
||||
#
|
||||
# - readability-identifier-length
|
||||
# Complains about any identifier <3 characters, reasonable for
|
||||
# globals, pointlessly verbose for locals and parameters.
|
||||
#
|
||||
# - bugprone-assignment-in-if-condition
|
||||
# Dubious value over the compiler's built-in warning. Would
|
||||
# increase verbosity.
|
||||
#
|
||||
# - misc-include-cleaner
|
||||
# Wants to include headers which *directly* provide the things
|
||||
# we use. That sounds nice, but means it will often want an OS
|
||||
# specific header instead of a mostly standard one, such as
|
||||
# <linux/limits.h> instead of <limits.h>.
|
||||
|
||||
cppcheck: $(PASST_SRCS) $(HEADERS)
|
||||
clang-tidy: $(SRCS) $(HEADERS)
|
||||
clang-tidy -checks=*,-modernize-*,\
|
||||
-clang-analyzer-valist.Uninitialized,\
|
||||
-cppcoreguidelines-init-variables,\
|
||||
-bugprone-assignment-in-if-condition,\
|
||||
-bugprone-macro-parentheses,\
|
||||
-google-readability-braces-around-statements,\
|
||||
-hicpp-braces-around-statements,\
|
||||
-readability-braces-around-statements,\
|
||||
-readability-magic-numbers,\
|
||||
-llvmlibc-restrict-system-libc-headers,\
|
||||
-hicpp-signed-bitwise,\
|
||||
-clang-analyzer-security.insecureAPI.DeprecatedOrUnsafeBufferHandling,\
|
||||
-llvm-include-order,\
|
||||
-cppcoreguidelines-avoid-magic-numbers,\
|
||||
-readability-isolate-declaration,\
|
||||
-bugprone-narrowing-conversions,\
|
||||
-cppcoreguidelines-narrowing-conversions,\
|
||||
-cppcoreguidelines-avoid-non-const-global-variables,\
|
||||
-altera-unroll-loops,-altera-id-dependent-backward-branch,\
|
||||
-bugprone-easily-swappable-parameters,\
|
||||
-readability-function-cognitive-complexity,\
|
||||
-altera-struct-pack-align,\
|
||||
-concurrency-mt-unsafe,\
|
||||
-readability-identifier-length,\
|
||||
-misc-include-cleaner \
|
||||
-config='{CheckOptions: [{key: bugprone-suspicious-string-compare.WarnOnImplicitComparison, value: "false"}]}' \
|
||||
--warnings-as-errors=* $(SRCS) -- $(filter-out -pie,$(FLAGS) $(CFLAGS) $(CPPFLAGS)) -DCLANG_TIDY_58992
|
||||
|
||||
SYSTEM_INCLUDES := /usr/include $(wildcard /usr/include/$(TARGET))
|
||||
ifeq ($(shell $(CC) -v 2>&1 | grep -c "gcc version"),1)
|
||||
VER := $(shell $(CC) -dumpversion)
|
||||
SYSTEM_INCLUDES += /usr/lib/gcc/$(TARGET)/$(VER)/include
|
||||
endif
|
||||
cppcheck: $(SRCS) $(HEADERS)
|
||||
if cppcheck --check-level=exhaustive /dev/null > /dev/null 2>&1; then \
|
||||
CPPCHECK_EXHAUSTIVE="--check-level=exhaustive"; \
|
||||
else \
|
||||
|
@ -172,8 +302,11 @@ cppcheck: $(PASST_SRCS) $(HEADERS)
|
|||
cppcheck --std=c11 --error-exitcode=1 --enable=all --force \
|
||||
--inconclusive --library=posix --quiet \
|
||||
$${CPPCHECK_EXHAUSTIVE} \
|
||||
$(SYSTEM_INCLUDES:%=-I%) \
|
||||
$(SYSTEM_INCLUDES:%=--config-exclude=%) \
|
||||
$(SYSTEM_INCLUDES:%=--suppress=*:%/*) \
|
||||
$(SYSTEM_INCLUDES:%=--suppress=unmatchedSuppression:%/*) \
|
||||
--inline-suppr \
|
||||
--suppress=missingIncludeSystem \
|
||||
--suppress=unusedStructMember \
|
||||
$(filter -D%,$(FLAGS) $(CFLAGS) $(CPPFLAGS)) -D CPPCHECK_6936 \
|
||||
$(PASST_SRCS) $(HEADERS)
|
||||
$(filter -D%,$(FLAGS) $(CFLAGS) $(CPPFLAGS)) \
|
||||
.
|
||||
|
|
10
README.md
10
README.md
|
@ -338,24 +338,20 @@ speeding up local connections, and usually requiring NAT. _pasta_:
|
|||
[_slirp4netns_ replacement](/passt/tree/slirp4netns.sh)
|
||||
* ✅ out-of-tree patch for
|
||||
[Kata Containers](/passt/tree/contrib/kata-containers) available
|
||||
* ✅ rootless Docker
|
||||
[network back-end](https://docs.docker.com/engine/security/rootless/#networking-errors)
|
||||
via moby/rootlesskit
|
||||
* ⌚ drop-in replacement for VPNKit (rootless Docker)
|
||||
|
||||
### Availability
|
||||
* official packages for:
|
||||
* ✅ [Alpine Linux](https://pkgs.alpinelinux.org/packages?name=passt)
|
||||
* ✅ [Arch Linux](https://archlinux.org/packages/extra/x86_64/passt/) ([aarch64](https://archlinuxarm.org/packages/aarch64/passt), [i486](https://www.archlinux32.org/packages/?q=passt))
|
||||
* ✅ [CentOS Stream](https://gitlab.com/redhat/centos-stream/rpms/passt)
|
||||
* ✅ [Debian](https://tracker.debian.org/pkg/passt)
|
||||
* ✅ [Fedora](https://src.fedoraproject.org/rpms/passt)
|
||||
* ✅ [Gentoo](https://packages.gentoo.org/packages/net-misc/passt)
|
||||
* ✅ [GNU Guix](https://packages.guix.gnu.org/packages/passt/)
|
||||
* ✅ [OpenSUSE](https://build.opensuse.org/package/requests/Virtualization:containers/passt)
|
||||
* ✅ [Ubuntu](https://launchpad.net/ubuntu/+source/passt)
|
||||
* ✅ [Void Linux](https://voidlinux.org/packages/?q=passt)
|
||||
* unofficial packages for:
|
||||
* ✅ [EPEL, Mageia](https://copr.fedorainfracloud.org/coprs/sbrivio/passt/)
|
||||
* 🛠 [openSUSE](https://build.opensuse.org/package/show/Virtualization:containers/passt)
|
||||
* ✅ unofficial [packages](https://passt.top/builds/latest/x86_64/) from x86_64
|
||||
static builds for other RPM-based distributions
|
||||
* ✅ unofficial [packages](https://passt.top/builds/latest/x86_64/) from x86_64
|
||||
|
@ -400,7 +396,7 @@ services:
|
|||
and nameserver using SLAAC
|
||||
* [DHCPv6 server](/passt/tree/dhcpv6.c): a simple
|
||||
implementation handing out one single IPv6 address to the guest or namespace,
|
||||
namely, the same address as the first one configured for the upstream host
|
||||
namely, the same address as the first one configured for the upstream host
|
||||
interface, and passing the nameservers configured on the host
|
||||
|
||||
## Addresses
|
||||
|
|
18
arch.c
18
arch.c
|
@ -18,9 +18,6 @@
|
|||
#include <string.h>
|
||||
#include <unistd.h>
|
||||
|
||||
#include "log.h"
|
||||
#include "util.h"
|
||||
|
||||
/**
|
||||
* arch_avx2_exec() - Switch to AVX2 build if supported
|
||||
* @argv: Arguments from command line
|
||||
|
@ -31,8 +28,10 @@ void arch_avx2_exec(char **argv)
|
|||
char exe[PATH_MAX] = { 0 };
|
||||
const char *p;
|
||||
|
||||
if (readlink("/proc/self/exe", exe, PATH_MAX - 1) < 0)
|
||||
die_perror("Failed to read own /proc/self/exe link");
|
||||
if (readlink("/proc/self/exe", exe, PATH_MAX - 1) < 0) {
|
||||
perror("readlink /proc/self/exe");
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
|
||||
p = strstr(exe, ".avx2");
|
||||
if (p && strlen(p) == strlen(".avx2"))
|
||||
|
@ -41,12 +40,9 @@ void arch_avx2_exec(char **argv)
|
|||
if (__builtin_cpu_supports("avx2")) {
|
||||
char new_path[PATH_MAX + sizeof(".avx2")];
|
||||
|
||||
if (snprintf_check(new_path, PATH_MAX + sizeof(".avx2"),
|
||||
"%s.avx2", exe))
|
||||
die_perror("Can't build AVX2 executable path");
|
||||
|
||||
execv(new_path, argv);
|
||||
warn_perror("Can't run AVX2 build, using non-AVX2 version");
|
||||
snprintf(new_path, PATH_MAX + sizeof(".avx2"), "%s.avx2", exe);
|
||||
execve(new_path, argv, environ);
|
||||
perror("Can't run AVX2 build, using non-AVX2 version");
|
||||
}
|
||||
}
|
||||
#else
|
||||
|
|
20
arp.c
20
arp.c
|
@ -43,7 +43,8 @@ int arp(const struct ctx *c, const struct pool *p)
|
|||
struct ethhdr *eh;
|
||||
struct arphdr *ah;
|
||||
struct arpmsg *am;
|
||||
size_t l2len;
|
||||
size_t len;
|
||||
int ret;
|
||||
|
||||
eh = packet_get(p, 0, 0, sizeof(*eh), NULL);
|
||||
ah = packet_get(p, 0, sizeof(*eh), sizeof(*ah), NULL);
|
||||
|
@ -59,28 +60,31 @@ int arp(const struct ctx *c, const struct pool *p)
|
|||
ah->ar_op != htons(ARPOP_REQUEST))
|
||||
return 1;
|
||||
|
||||
/* Discard announcements, but not 0.0.0.0 "probes" */
|
||||
if (memcmp(am->sip, &in4addr_any, sizeof(am->sip)) &&
|
||||
/* Discard announcements (but not 0.0.0.0 "probes"): we might have the
|
||||
* same IP address, hide that.
|
||||
*/
|
||||
if (memcmp(am->sip, (unsigned char[4]){ 0 }, sizeof(am->tip)) &&
|
||||
!memcmp(am->sip, am->tip, sizeof(am->sip)))
|
||||
return 1;
|
||||
|
||||
/* Don't resolve the guest's assigned address, either. */
|
||||
/* Don't resolve our own address, either. */
|
||||
if (!memcmp(am->tip, &c->ip4.addr, sizeof(am->tip)))
|
||||
return 1;
|
||||
|
||||
ah->ar_op = htons(ARPOP_REPLY);
|
||||
memcpy(am->tha, am->sha, sizeof(am->tha));
|
||||
memcpy(am->sha, c->our_tap_mac, sizeof(am->sha));
|
||||
memcpy(am->sha, c->mac, sizeof(am->sha));
|
||||
|
||||
memcpy(swap, am->tip, sizeof(am->tip));
|
||||
memcpy(am->tip, am->sip, sizeof(am->tip));
|
||||
memcpy(am->sip, swap, sizeof(am->sip));
|
||||
|
||||
l2len = sizeof(*eh) + sizeof(*ah) + sizeof(*am);
|
||||
len = sizeof(*eh) + sizeof(*ah) + sizeof(*am);
|
||||
memcpy(eh->h_dest, eh->h_source, sizeof(eh->h_dest));
|
||||
memcpy(eh->h_source, c->our_tap_mac, sizeof(eh->h_source));
|
||||
memcpy(eh->h_source, c->mac, sizeof(eh->h_source));
|
||||
|
||||
tap_send_single(c, eh, l2len);
|
||||
if ((ret = tap_send(c, eh, len)) < 0)
|
||||
warn("ARP: send: %s", strerror(ret));
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
|
96
checksum.c
96
checksum.c
|
@ -59,7 +59,6 @@
|
|||
#include "util.h"
|
||||
#include "ip.h"
|
||||
#include "checksum.h"
|
||||
#include "iov.h"
|
||||
|
||||
/* Checksums are optional for UDP over IPv4, so we usually just set
|
||||
* them to 0. Change this to 1 to calculate real UDP over IPv4
|
||||
|
@ -117,19 +116,19 @@ uint16_t csum_fold(uint32_t sum)
|
|||
|
||||
/**
|
||||
* csum_ip4_header() - Calculate IPv4 header checksum
|
||||
* @l3len: IPv4 packet length (host order)
|
||||
* @protocol: Protocol number
|
||||
* @saddr: IPv4 source address
|
||||
* @daddr: IPv4 destination address
|
||||
* @tot_len: IPv4 payload length (data + IP header, network order)
|
||||
* @protocol: Protocol number (network order)
|
||||
* @saddr: IPv4 source address (network order)
|
||||
* @daddr: IPv4 destination address (network order)
|
||||
*
|
||||
* Return: 16-bit folded sum of the IPv4 header
|
||||
*/
|
||||
uint16_t csum_ip4_header(uint16_t l3len, uint8_t protocol,
|
||||
uint16_t csum_ip4_header(uint16_t tot_len, uint8_t protocol,
|
||||
struct in_addr saddr, struct in_addr daddr)
|
||||
{
|
||||
uint32_t sum = L2_BUF_IP4_PSUM(protocol);
|
||||
|
||||
sum += htons(l3len);
|
||||
sum += tot_len;
|
||||
sum += (saddr.s_addr >> 16) & 0xffff;
|
||||
sum += saddr.s_addr & 0xffff;
|
||||
sum += (daddr.s_addr >> 16) & 0xffff;
|
||||
|
@ -141,13 +140,13 @@ uint16_t csum_ip4_header(uint16_t l3len, uint8_t protocol,
|
|||
/**
|
||||
* proto_ipv4_header_psum() - Calculates the partial checksum of an
|
||||
* IPv4 header for UDP or TCP
|
||||
* @l4len: IPv4 Payload length (host order)
|
||||
* @proto: Protocol number
|
||||
* @saddr: Source address
|
||||
* @daddr: Destination address
|
||||
* @tot_len: IPv4 Payload length (host order)
|
||||
* @proto: Protocol number (host order)
|
||||
* @saddr: Source address (network order)
|
||||
* @daddr: Destination address (network order)
|
||||
* Returns: Partial checksum of the IPv4 header
|
||||
*/
|
||||
uint32_t proto_ipv4_header_psum(uint16_t l4len, uint8_t protocol,
|
||||
uint32_t proto_ipv4_header_psum(uint16_t tot_len, uint8_t protocol,
|
||||
struct in_addr saddr, struct in_addr daddr)
|
||||
{
|
||||
uint32_t psum = htons(protocol);
|
||||
|
@ -156,7 +155,7 @@ uint32_t proto_ipv4_header_psum(uint16_t l4len, uint8_t protocol,
|
|||
psum += saddr.s_addr & 0xffff;
|
||||
psum += (daddr.s_addr >> 16) & 0xffff;
|
||||
psum += daddr.s_addr & 0xffff;
|
||||
psum += htons(l4len);
|
||||
psum += htons(tot_len);
|
||||
|
||||
return psum;
|
||||
}
|
||||
|
@ -166,24 +165,22 @@ uint32_t proto_ipv4_header_psum(uint16_t l4len, uint8_t protocol,
|
|||
* @udp4hr: UDP header, initialised apart from checksum
|
||||
* @saddr: IPv4 source address
|
||||
* @daddr: IPv4 destination address
|
||||
* @iov: Pointer to the array of IO vectors
|
||||
* @iov_cnt: Length of the array
|
||||
* @offset: UDP payload offset in the iovec array
|
||||
* @payload: UDP packet payload
|
||||
* @len: Length of @payload (not including UDP header)
|
||||
*/
|
||||
void csum_udp4(struct udphdr *udp4hr,
|
||||
struct in_addr saddr, struct in_addr daddr,
|
||||
const struct iovec *iov, int iov_cnt, size_t offset)
|
||||
const void *payload, size_t len)
|
||||
{
|
||||
/* UDP checksums are optional, so don't bother */
|
||||
udp4hr->check = 0;
|
||||
|
||||
if (UDP4_REAL_CHECKSUMS) {
|
||||
uint16_t l4len = iov_size(iov, iov_cnt) - offset +
|
||||
sizeof(struct udphdr);
|
||||
uint32_t psum = proto_ipv4_header_psum(l4len, IPPROTO_UDP,
|
||||
uint16_t tot_len = len + sizeof(struct udphdr);
|
||||
uint32_t psum = proto_ipv4_header_psum(tot_len, IPPROTO_UDP,
|
||||
saddr, daddr);
|
||||
psum = csum_unfolded(udp4hr, sizeof(struct udphdr), psum);
|
||||
udp4hr->check = csum_iov(iov, iov_cnt, offset, psum);
|
||||
udp4hr->check = csum(payload, len, psum);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -191,9 +188,9 @@ void csum_udp4(struct udphdr *udp4hr,
|
|||
* csum_icmp4() - Calculate and set checksum for an ICMP packet
|
||||
* @icmp4hr: ICMP header, initialised apart from checksum
|
||||
* @payload: ICMP packet payload
|
||||
* @dlen: Length of @payload (not including ICMP header)
|
||||
* @len: Length of @payload (not including ICMP header)
|
||||
*/
|
||||
void csum_icmp4(struct icmphdr *icmp4hr, const void *payload, size_t dlen)
|
||||
void csum_icmp4(struct icmphdr *icmp4hr, const void *payload, size_t len)
|
||||
{
|
||||
uint32_t psum;
|
||||
|
||||
|
@ -202,16 +199,16 @@ void csum_icmp4(struct icmphdr *icmp4hr, const void *payload, size_t dlen)
|
|||
/* Partial checksum for ICMP header alone */
|
||||
psum = sum_16b(icmp4hr, sizeof(*icmp4hr));
|
||||
|
||||
icmp4hr->checksum = csum(payload, dlen, psum);
|
||||
icmp4hr->checksum = csum(payload, len, psum);
|
||||
}
|
||||
|
||||
/**
|
||||
* proto_ipv6_header_psum() - Calculates the partial checksum of an
|
||||
* IPv6 header for UDP or TCP
|
||||
* @payload_len: IPv6 payload length (host order)
|
||||
* @proto: Protocol number
|
||||
* @saddr: Source address
|
||||
* @daddr: Destination address
|
||||
* @proto: Protocol number (host order)
|
||||
* @saddr: Source address (network order)
|
||||
* @daddr: Destination address (network order)
|
||||
* Returns: Partial checksum of the IPv6 header
|
||||
*/
|
||||
uint32_t proto_ipv6_header_psum(uint16_t payload_len, uint8_t protocol,
|
||||
|
@ -229,24 +226,19 @@ uint32_t proto_ipv6_header_psum(uint16_t payload_len, uint8_t protocol,
|
|||
/**
|
||||
* csum_udp6() - Calculate and set checksum for a UDP over IPv6 packet
|
||||
* @udp6hr: UDP header, initialised apart from checksum
|
||||
* @saddr: Source address
|
||||
* @daddr: Destination address
|
||||
* @iov: Pointer to the array of IO vectors
|
||||
* @iov_cnt: Length of the array
|
||||
* @offset: UDP payload offset in the iovec array
|
||||
* @payload: UDP packet payload
|
||||
* @len: Length of @payload (not including UDP header)
|
||||
*/
|
||||
void csum_udp6(struct udphdr *udp6hr,
|
||||
const struct in6_addr *saddr, const struct in6_addr *daddr,
|
||||
const struct iovec *iov, int iov_cnt, size_t offset)
|
||||
const void *payload, size_t len)
|
||||
{
|
||||
uint16_t l4len = iov_size(iov, iov_cnt) - offset +
|
||||
sizeof(struct udphdr);
|
||||
uint32_t psum = proto_ipv6_header_psum(l4len, IPPROTO_UDP,
|
||||
saddr, daddr);
|
||||
uint32_t psum = proto_ipv6_header_psum(len + sizeof(struct udphdr),
|
||||
IPPROTO_UDP, saddr, daddr);
|
||||
udp6hr->check = 0;
|
||||
|
||||
psum = csum_unfolded(udp6hr, sizeof(struct udphdr), psum);
|
||||
udp6hr->check = csum_iov(iov, iov_cnt, offset, psum);
|
||||
udp6hr->check = csum(payload, len, psum);
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -255,19 +247,21 @@ void csum_udp6(struct udphdr *udp6hr,
|
|||
* @saddr: IPv6 source address
|
||||
* @daddr: IPv6 destination address
|
||||
* @payload: ICMP packet payload
|
||||
* @dlen: Length of @payload (not including ICMPv6 header)
|
||||
* @len: Length of @payload (not including ICMPv6 header)
|
||||
*/
|
||||
void csum_icmp6(struct icmp6hdr *icmp6hr,
|
||||
const struct in6_addr *saddr, const struct in6_addr *daddr,
|
||||
const void *payload, size_t dlen)
|
||||
const void *payload, size_t len)
|
||||
{
|
||||
uint32_t psum = proto_ipv6_header_psum(dlen + sizeof(*icmp6hr),
|
||||
IPPROTO_ICMPV6, saddr, daddr);
|
||||
/* Partial checksum for the pseudo-IPv6 header */
|
||||
uint32_t psum = sum_16b(saddr, sizeof(*saddr)) +
|
||||
sum_16b(daddr, sizeof(*daddr)) +
|
||||
htons(len + sizeof(*icmp6hr)) + htons(IPPROTO_ICMPV6);
|
||||
|
||||
icmp6hr->icmp6_cksum = 0;
|
||||
/* Add in partial checksum for the ICMPv6 header alone */
|
||||
psum += sum_16b(icmp6hr, sizeof(*icmp6hr));
|
||||
icmp6hr->icmp6_cksum = csum(payload, dlen, psum);
|
||||
icmp6hr->icmp6_cksum = csum(payload, len, psum);
|
||||
}
|
||||
|
||||
#ifdef __AVX2__
|
||||
|
@ -505,26 +499,16 @@ uint16_t csum(const void *buf, size_t len, uint32_t init)
|
|||
*
|
||||
* @iov Pointer to the array of IO vectors
|
||||
* @n Length of the array
|
||||
* @offset: Offset of the data to checksum within the full data length
|
||||
* @init Initial 32-bit checksum, 0 for no pre-computed checksum
|
||||
*
|
||||
* Return: 16-bit folded, complemented checksum
|
||||
*/
|
||||
uint16_t csum_iov(const struct iovec *iov, size_t n, size_t offset,
|
||||
uint32_t init)
|
||||
/* cppcheck-suppress unusedFunction */
|
||||
uint16_t csum_iov(const struct iovec *iov, size_t n, uint32_t init)
|
||||
{
|
||||
unsigned int i;
|
||||
size_t first;
|
||||
|
||||
i = iov_skip_bytes(iov, n, offset, &first);
|
||||
if (i >= n)
|
||||
return (uint16_t)~csum_fold(init);
|
||||
|
||||
init = csum_unfolded((char *)iov[i].iov_base + first,
|
||||
iov[i].iov_len - first, init);
|
||||
i++;
|
||||
|
||||
for (; i < n; i++)
|
||||
for (i = 0; i < n; i++)
|
||||
init = csum_unfolded(iov[i].iov_base, iov[i].iov_len, init);
|
||||
|
||||
return (uint16_t)~csum_fold(init);
|
||||
|
|
15
checksum.h
15
checksum.h
|
@ -13,26 +13,25 @@ struct icmp6hdr;
|
|||
uint32_t sum_16b(const void *buf, size_t len);
|
||||
uint16_t csum_fold(uint32_t sum);
|
||||
uint16_t csum_unaligned(const void *buf, size_t len, uint32_t init);
|
||||
uint16_t csum_ip4_header(uint16_t l3len, uint8_t protocol,
|
||||
uint16_t csum_ip4_header(uint16_t tot_len, uint8_t protocol,
|
||||
struct in_addr saddr, struct in_addr daddr);
|
||||
uint32_t proto_ipv4_header_psum(uint16_t l4len, uint8_t protocol,
|
||||
uint32_t proto_ipv4_header_psum(uint16_t tot_len, uint8_t protocol,
|
||||
struct in_addr saddr, struct in_addr daddr);
|
||||
void csum_udp4(struct udphdr *udp4hr,
|
||||
struct in_addr saddr, struct in_addr daddr,
|
||||
const struct iovec *iov, int iov_cnt, size_t offset);
|
||||
void csum_icmp4(struct icmphdr *icmp4hr, const void *payload, size_t dlen);
|
||||
const void *payload, size_t len);
|
||||
void csum_icmp4(struct icmphdr *icmp4hr, const void *payload, size_t len);
|
||||
uint32_t proto_ipv6_header_psum(uint16_t payload_len, uint8_t protocol,
|
||||
const struct in6_addr *saddr,
|
||||
const struct in6_addr *daddr);
|
||||
void csum_udp6(struct udphdr *udp6hr,
|
||||
const struct in6_addr *saddr, const struct in6_addr *daddr,
|
||||
const struct iovec *iov, int iov_cnt, size_t offset);
|
||||
const void *payload, size_t len);
|
||||
void csum_icmp6(struct icmp6hdr *icmp6hr,
|
||||
const struct in6_addr *saddr, const struct in6_addr *daddr,
|
||||
const void *payload, size_t dlen);
|
||||
const void *payload, size_t len);
|
||||
uint32_t csum_unfolded(const void *buf, size_t len, uint32_t init);
|
||||
uint16_t csum(const void *buf, size_t len, uint32_t init);
|
||||
uint16_t csum_iov(const struct iovec *iov, size_t n, size_t offset,
|
||||
uint32_t init);
|
||||
uint16_t csum_iov(const struct iovec *iov, size_t n, uint32_t init);
|
||||
|
||||
#endif /* CHECKSUM_H */
|
||||
|
|
|
@ -26,16 +26,13 @@
|
|||
capability sys_ptrace,
|
||||
|
||||
/ r, # isolate_prefork(), isolation.c
|
||||
mount options=(rw, runbindable) -> /,
|
||||
mount "" -> "/",
|
||||
mount options=(rw, runbindable) /,
|
||||
mount "" -> "/tmp/",
|
||||
pivot_root "/tmp/" -> "/tmp/",
|
||||
umount "/",
|
||||
|
||||
owner @{PROC}/@{pid}/uid_map r, # conf_ugid()
|
||||
|
||||
@{PROC}/sys/net/ipv4/ip_local_port_range r, # fwd_probe_ephemeral()
|
||||
|
||||
network netlink raw, # nl_sock_init_do(), netlink.c
|
||||
|
||||
network inet stream, # tcp.c
|
||||
|
|
|
@ -27,9 +27,8 @@
|
|||
@{PROC}/@{pid}/net/udp r,
|
||||
@{PROC}/@{pid}/net/udp6 r,
|
||||
|
||||
@{run}/user/@{uid}/** rw, # pasta_open_ns()
|
||||
@{run}/user/@{uid}/netns/* r, # pasta_open_ns(), pasta.c
|
||||
|
||||
@{PROC}/[0-9]*/ns/ r, # pasta_netns_quit_init(),
|
||||
@{PROC}/[0-9]*/ns/net r, # pasta_wait_for_ns(),
|
||||
@{PROC}/[0-9]*/ns/user r, # conf_pasta_ns()
|
||||
|
||||
|
@ -43,5 +42,3 @@
|
|||
/{usr/,}bin/** Ux,
|
||||
|
||||
/usr/bin/pasta.avx2 ix, # arch_avx2_exec(), arch.c
|
||||
|
||||
ptrace r, # pasta_open_ns()
|
||||
|
|
|
@ -19,12 +19,9 @@ profile passt /usr/bin/passt{,.avx2} {
|
|||
include <abstractions/passt>
|
||||
|
||||
# Alternatively: include <abstractions/user-tmp>
|
||||
owner /tmp/** w, # tap_sock_unix_open(),
|
||||
# tap_sock_unix_init(), pcap(),
|
||||
# pidfile_open(),
|
||||
# pidfile_write(),
|
||||
owner /tmp/** w, # tap_sock_unix_init(), pcap(),
|
||||
# write_pidfile(),
|
||||
# logfile_init()
|
||||
|
||||
owner @{HOME}/** w, # pcap(), pidfile_open(),
|
||||
# pidfile_write()
|
||||
owner @{HOME}/** w, # pcap(), write_pidfile()
|
||||
}
|
||||
|
|
|
@ -19,13 +19,9 @@ profile pasta /usr/bin/pasta{,.avx2} flags=(attach_disconnected) {
|
|||
include <abstractions/pasta>
|
||||
|
||||
# Alternatively: include <abstractions/user-tmp>
|
||||
/tmp/** rw, # tap_sock_unix_open(),
|
||||
# tap_sock_unix_init(), pcap(),
|
||||
# pidfile_open(),
|
||||
# pidfile_write(),
|
||||
# logfile_init(),
|
||||
# pasta_open_ns()
|
||||
owner /tmp/** w, # tap_sock_unix_init(), pcap(),
|
||||
# write_pidfile(),
|
||||
# logfile_init()
|
||||
|
||||
owner @{HOME}/** w, # pcap(), pidfile_open(),
|
||||
# pidfile_write()
|
||||
owner @{HOME}/** w, # pcap(), write_pidfile()
|
||||
}
|
||||
|
|
|
@ -14,7 +14,7 @@ Name: passt
|
|||
Version: {{{ git_version }}}
|
||||
Release: 1%{?dist}
|
||||
Summary: User-mode networking daemons for virtual machines and namespaces
|
||||
License: GPL-2.0-or-later AND BSD-3-Clause
|
||||
License: GPLv2+ and BSD
|
||||
Group: System Environment/Daemons
|
||||
URL: https://passt.top/
|
||||
Source: https://passt.top/passt/snapshot/passt-%{git_hash}.tar.xz
|
||||
|
|
|
@ -29,11 +29,7 @@ function passt_git_changelog_entry {
|
|||
[ -z "${__from}" ] && __from="$(git rev-list --max-parents=0 HEAD)"
|
||||
|
||||
__date="$(git log --pretty="format:%cI" "${__to}" -1)"
|
||||
__author="Stefano Brivio <sbrivio@redhat.com>"
|
||||
# Use:
|
||||
# __author="$(git log -1 --pretty="format:%an <%ae>" ${__to} -- contrib/fedora)"
|
||||
# if you want the author of changelog entries to match the latest
|
||||
# author for contrib/fedora
|
||||
__author="$(git log -1 --pretty="format:%an <%ae>" ${__to} -- contrib/fedora)"
|
||||
|
||||
printf "* %s %s - %s\n" "$(date "+%a %b %e %Y" -d "${__date}")" "${__author}" "$(git_version "${__to}")-1"
|
||||
|
||||
|
|
|
@ -50,7 +50,6 @@ require {
|
|||
type passwd_file_t;
|
||||
|
||||
class netlink_route_socket { bind create nlmsg_read };
|
||||
type sysctl_net_t;
|
||||
|
||||
class capability { sys_tty_config setuid setgid };
|
||||
class cap_userns { setpcap sys_admin sys_ptrace };
|
||||
|
@ -105,8 +104,6 @@ allow passt_t net_conf_t:lnk_file read;
|
|||
allow passt_t tmp_t:sock_file { create unlink write };
|
||||
allow passt_t self:netlink_route_socket { bind create nlmsg_read read write setopt };
|
||||
kernel_search_network_sysctl(passt_t)
|
||||
allow passt_t sysctl_net_t:dir search;
|
||||
allow passt_t sysctl_net_t:file { open read };
|
||||
|
||||
corenet_tcp_bind_all_nodes(passt_t)
|
||||
corenet_udp_bind_all_nodes(passt_t)
|
||||
|
|
|
@ -196,7 +196,7 @@ allow pasta_t ifconfig_var_run_t:dir { read search watch };
|
|||
allow pasta_t self:tun_socket create;
|
||||
allow pasta_t tun_tap_device_t:chr_file { ioctl open read write };
|
||||
allow pasta_t sysctl_net_t:dir search;
|
||||
allow pasta_t sysctl_net_t:file { open read write };
|
||||
allow pasta_t sysctl_net_t:file { open write };
|
||||
allow pasta_t kernel_t:system module_request;
|
||||
|
||||
allow pasta_t nsfs_t:file read;
|
||||
|
@ -211,4 +211,3 @@ allow pasta_t ifconfig_t:process { noatsecure rlimitinh siginh };
|
|||
allow pasta_t netutils_t:process { noatsecure rlimitinh siginh };
|
||||
allow pasta_t ping_t:process { noatsecure rlimitinh siginh };
|
||||
allow pasta_t user_tty_device_t:chr_file { append read write };
|
||||
allow pasta_t user_devpts_t:chr_file { append read write };
|
||||
|
|
25
dhcp.c
25
dhcp.c
|
@ -275,8 +275,7 @@ static void opt_set_dns_search(const struct ctx *c, size_t max_len)
|
|||
*/
|
||||
int dhcp(const struct ctx *c, const struct pool *p)
|
||||
{
|
||||
size_t mlen, dlen, offset = 0, opt_len, opt_off = 0;
|
||||
char macstr[ETH_ADDRSTRLEN];
|
||||
size_t mlen, len, offset = 0, opt_len, opt_off = 0;
|
||||
const struct ethhdr *eh;
|
||||
const struct iphdr *iph;
|
||||
const struct udphdr *uh;
|
||||
|
@ -341,26 +340,26 @@ int dhcp(const struct ctx *c, const struct pool *p)
|
|||
return -1;
|
||||
}
|
||||
|
||||
info(" from %s", eth_ntop(m->chaddr, macstr, sizeof(macstr)));
|
||||
info(" from %02x:%02x:%02x:%02x:%02x:%02x",
|
||||
m->chaddr[0], m->chaddr[1], m->chaddr[2],
|
||||
m->chaddr[3], m->chaddr[4], m->chaddr[5]);
|
||||
|
||||
m->yiaddr = c->ip4.addr;
|
||||
mask.s_addr = htonl(0xffffffff << (32 - c->ip4.prefix_len));
|
||||
memcpy(opts[1].s, &mask, sizeof(mask));
|
||||
memcpy(opts[3].s, &c->ip4.guest_gw, sizeof(c->ip4.guest_gw));
|
||||
memcpy(opts[54].s, &c->ip4.our_tap_addr, sizeof(c->ip4.our_tap_addr));
|
||||
memcpy(opts[1].s, &mask, sizeof(mask));
|
||||
memcpy(opts[3].s, &c->ip4.gw, sizeof(c->ip4.gw));
|
||||
memcpy(opts[54].s, &c->ip4.gw, sizeof(c->ip4.gw));
|
||||
|
||||
/* If the gateway is not on the assigned subnet, send an option 121
|
||||
* (Classless Static Routing) adding a dummy route to it.
|
||||
*/
|
||||
if ((c->ip4.addr.s_addr & mask.s_addr)
|
||||
!= (c->ip4.guest_gw.s_addr & mask.s_addr)) {
|
||||
!= (c->ip4.gw.s_addr & mask.s_addr)) {
|
||||
/* a.b.c.d/32:0.0.0.0, 0:a.b.c.d */
|
||||
opts[121].slen = 14;
|
||||
opts[121].s[0] = 32;
|
||||
memcpy(opts[121].s + 1,
|
||||
&c->ip4.guest_gw, sizeof(c->ip4.guest_gw));
|
||||
memcpy(opts[121].s + 10,
|
||||
&c->ip4.guest_gw, sizeof(c->ip4.guest_gw));
|
||||
memcpy(opts[121].s + 1, &c->ip4.gw, sizeof(c->ip4.gw));
|
||||
memcpy(opts[121].s + 10, &c->ip4.gw, sizeof(c->ip4.gw));
|
||||
}
|
||||
|
||||
if (c->mtu != -1) {
|
||||
|
@ -378,8 +377,8 @@ int dhcp(const struct ctx *c, const struct pool *p)
|
|||
if (!c->no_dhcp_dns_search)
|
||||
opt_set_dns_search(c, sizeof(m->o));
|
||||
|
||||
dlen = offsetof(struct msg, o) + fill(m);
|
||||
tap_udp4_send(c, c->ip4.our_tap_addr, 67, c->ip4.addr, 68, m, dlen);
|
||||
len = offsetof(struct msg, o) + fill(m);
|
||||
tap_udp4_send(c, c->ip4.gw, 67, c->ip4.addr, 68, m, len);
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
|
70
dhcpv6.c
70
dhcpv6.c
|
@ -296,42 +296,45 @@ static struct opt_hdr *dhcpv6_opt(const struct pool *p, size_t *offset,
|
|||
static struct opt_hdr *dhcpv6_ia_notonlink(const struct pool *p,
|
||||
struct in6_addr *la)
|
||||
{
|
||||
int ia_types[2] = { OPT_IA_NA, OPT_IA_TA }, *ia_type;
|
||||
const struct opt_ia_addr *opt_addr;
|
||||
char buf[INET6_ADDRSTRLEN];
|
||||
struct in6_addr req_addr;
|
||||
const struct opt_hdr *h;
|
||||
struct opt_hdr *ia;
|
||||
struct opt_hdr *ia, *h;
|
||||
size_t offset;
|
||||
int ia_type;
|
||||
|
||||
foreach(ia_type, ia_types) {
|
||||
offset = 0;
|
||||
while ((ia = dhcpv6_opt(p, &offset, *ia_type))) {
|
||||
if (ntohs(ia->l) < OPT_VSIZE(ia_na))
|
||||
ia_type = OPT_IA_NA;
|
||||
ia_ta:
|
||||
offset = 0;
|
||||
while ((ia = dhcpv6_opt(p, &offset, ia_type))) {
|
||||
if (ntohs(ia->l) < OPT_VSIZE(ia_na))
|
||||
return NULL;
|
||||
|
||||
offset += sizeof(struct opt_ia_na);
|
||||
|
||||
while ((h = dhcpv6_opt(p, &offset, OPT_IAAADR))) {
|
||||
struct opt_ia_addr *opt_addr = (struct opt_ia_addr *)h;
|
||||
|
||||
if (ntohs(h->l) != OPT_VSIZE(ia_addr))
|
||||
return NULL;
|
||||
|
||||
offset += sizeof(struct opt_ia_na);
|
||||
|
||||
while ((h = dhcpv6_opt(p, &offset, OPT_IAAADR))) {
|
||||
if (ntohs(h->l) != OPT_VSIZE(ia_addr))
|
||||
return NULL;
|
||||
|
||||
opt_addr = (const struct opt_ia_addr *)h;
|
||||
req_addr = opt_addr->addr;
|
||||
if (!IN6_ARE_ADDR_EQUAL(la, &req_addr))
|
||||
goto err;
|
||||
|
||||
offset += sizeof(struct opt_ia_addr);
|
||||
memcpy(&req_addr, &opt_addr->addr, sizeof(req_addr));
|
||||
if (!IN6_ARE_ADDR_EQUAL(la, &req_addr)) {
|
||||
info("DHCPv6: requested address %s not on link",
|
||||
inet_ntop(AF_INET6, &req_addr,
|
||||
buf, sizeof(buf)));
|
||||
return ia;
|
||||
}
|
||||
|
||||
offset += sizeof(struct opt_ia_addr);
|
||||
}
|
||||
}
|
||||
|
||||
return NULL;
|
||||
if (ia_type == OPT_IA_NA) {
|
||||
ia_type = OPT_IA_TA;
|
||||
goto ia_ta;
|
||||
}
|
||||
|
||||
err:
|
||||
info("DHCPv6: requested address %s not on link",
|
||||
inet_ntop(AF_INET6, &req_addr, buf, sizeof(buf)));
|
||||
return ia;
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -360,7 +363,7 @@ static size_t dhcpv6_dns_fill(const struct ctx *c, char *buf, int offset)
|
|||
srv->hdr.l = 0;
|
||||
}
|
||||
|
||||
srv->addr[i] = c->ip6.dns[i];
|
||||
memcpy(&srv->addr[i], &c->ip6.dns[i], sizeof(srv->addr[i]));
|
||||
srv->hdr.l += sizeof(srv->addr[i]);
|
||||
offset += sizeof(srv->addr[i]);
|
||||
}
|
||||
|
@ -423,11 +426,11 @@ search:
|
|||
int dhcpv6(struct ctx *c, const struct pool *p,
|
||||
const struct in6_addr *saddr, const struct in6_addr *daddr)
|
||||
{
|
||||
const struct opt_hdr *client_id, *server_id, *ia;
|
||||
struct opt_hdr *ia, *bad_ia, *client_id;
|
||||
const struct opt_hdr *server_id;
|
||||
const struct in6_addr *src;
|
||||
const struct msg_hdr *mh;
|
||||
const struct udphdr *uh;
|
||||
struct opt_hdr *bad_ia;
|
||||
size_t mlen, n;
|
||||
|
||||
uh = packet_get(p, 0, 0, sizeof(*uh), &mlen);
|
||||
|
@ -448,7 +451,10 @@ int dhcpv6(struct ctx *c, const struct pool *p,
|
|||
|
||||
c->ip6.addr_ll_seen = *saddr;
|
||||
|
||||
src = &c->ip6.our_tap_ll;
|
||||
if (IN6_IS_ADDR_LINKLOCAL(&c->ip6.gw))
|
||||
src = &c->ip6.gw;
|
||||
else
|
||||
src = &c->ip6.addr_ll;
|
||||
|
||||
mh = packet_get(p, 0, sizeof(*uh), sizeof(*mh), NULL);
|
||||
if (!mh)
|
||||
|
@ -568,10 +574,8 @@ void dhcpv6_init(const struct ctx *c)
|
|||
resp.server_id.duid_time = duid_time;
|
||||
resp_not_on_link.server_id.duid_time = duid_time;
|
||||
|
||||
memcpy(resp.server_id.duid_lladdr,
|
||||
c->our_tap_mac, sizeof(c->our_tap_mac));
|
||||
memcpy(resp_not_on_link.server_id.duid_lladdr,
|
||||
c->our_tap_mac, sizeof(c->our_tap_mac));
|
||||
memcpy(resp.server_id.duid_lladdr, c->mac, sizeof(c->mac));
|
||||
memcpy(resp_not_on_link.server_id.duid_lladdr, c->mac, sizeof(c->mac));
|
||||
|
||||
resp.ia_addr.addr = c->ip6.addr;
|
||||
}
|
||||
|
|
3
doc/platform-requirements/.gitignore
vendored
3
doc/platform-requirements/.gitignore
vendored
|
@ -1,3 +0,0 @@
|
|||
/reuseaddr-priority
|
||||
/recv-zero
|
||||
/udp-close-dup
|
|
@ -1,45 +0,0 @@
|
|||
# SPDX-License-Identifier: GPL-2.0-or-later
|
||||
#
|
||||
# Copyright Red Hat
|
||||
# Author: David Gibson <david@gibson.dropbear.id.au>
|
||||
|
||||
TARGETS = reuseaddr-priority recv-zero udp-close-dup
|
||||
SRCS = reuseaddr-priority.c recv-zero.c udp-close-dup.c
|
||||
CFLAGS = -Wall
|
||||
|
||||
all: cppcheck clang-tidy $(TARGETS:%=check-%)
|
||||
|
||||
$(TARGETS): %: %.c common.c common.h
|
||||
|
||||
check-%: %
|
||||
./$<
|
||||
|
||||
cppcheck:
|
||||
cppcheck --std=c11 --error-exitcode=1 --enable=all --force \
|
||||
--check-level=exhaustive --inline-suppr \
|
||||
--inconclusive --library=posix --quiet \
|
||||
--suppress=missingIncludeSystem \
|
||||
$(SRCS)
|
||||
|
||||
clang-tidy:
|
||||
clang-tidy --checks=*,\
|
||||
-altera-id-dependent-backward-branch,\
|
||||
-altera-unroll-loops,\
|
||||
-bugprone-easily-swappable-parameters,\
|
||||
-clang-analyzer-security.insecureAPI.DeprecatedOrUnsafeBufferHandling,\
|
||||
-concurrency-mt-unsafe,\
|
||||
-cppcoreguidelines-avoid-non-const-global-variables,\
|
||||
-cppcoreguidelines-init-variables,\
|
||||
-cppcoreguidelines-macro-to-enum,\
|
||||
-google-readability-braces-around-statements,\
|
||||
-hicpp-braces-around-statements,\
|
||||
-llvmlibc-restrict-system-libc-headers,\
|
||||
-misc-include-cleaner,\
|
||||
-modernize-macro-to-enum,\
|
||||
-readability-braces-around-statements,\
|
||||
-readability-identifier-length,\
|
||||
-readability-isolate-declaration \
|
||||
$(SRCS)
|
||||
|
||||
clean:
|
||||
rm -f $(TARGETS) *.o *~
|
|
@ -1,18 +0,0 @@
|
|||
Platform Requirements
|
||||
=====================
|
||||
|
||||
TODO: document the various Linux specific features we currently require
|
||||
|
||||
|
||||
Test Programs
|
||||
-------------
|
||||
|
||||
In some places we rely on quite specific behaviour of sockets.
|
||||
Although Linux, at least, seems to behave as required, it's not always
|
||||
clear from the available documentation if this is required by POSIX or
|
||||
some other specification.
|
||||
|
||||
To specifically document those expectations this directory has some
|
||||
test programs which explicitly check for the behaviour we need.
|
||||
When/if we attempt a port to a new platform, running these to check
|
||||
behaviour would be a good place to start.
|
|
@ -1,66 +0,0 @@
|
|||
// SPDX-License-Identifier: GPL-2.0-or-later
|
||||
|
||||
/* common.c
|
||||
*
|
||||
* Common helper functions for testing SO_REUSEADDR behaviour
|
||||
*
|
||||
* Copyright Red Hat
|
||||
* Author: David Gibson <david@gibson.dropbear.id.au>
|
||||
*/
|
||||
|
||||
#include <errno.h>
|
||||
#include <netinet/in.h>
|
||||
#include <string.h>
|
||||
#include <sys/socket.h>
|
||||
|
||||
#include "common.h"
|
||||
|
||||
/**
 * sock_reuseaddr() - Create an IPv4 UDP socket with SO_REUSEADDR enabled
 *
 * Terminates the program through die() if the socket can't be created or if
 * the socket option can't be set.
 *
 * Return: file descriptor of the new socket
 */
int sock_reuseaddr(void)
{
	int y = 1;
	int s;

	s = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP);
	if (s < 0)
		die("socket(): %s\n", strerror(errno));

	/* The result must be compared against 0: a comma operator here would
	 * throw the return value away and never report a failure.
	 */
	if (setsockopt(s, SOL_SOCKET, SO_REUSEADDR, &y, sizeof(y)) < 0)
		die("SO_REUSEADDR: %s\n", strerror(errno));

	return s;
}
|
||||
|
||||
/* Send @token over the connected socket @s.
 *
 * Dies if send() fails or if fewer bytes than the whole token were sent.
 */
void send_token(int s, long token)
{
	ssize_t sent = send(s, &token, sizeof(token), 0);

	if (sent < 0)
		die("send(): %s\n", strerror(errno));

	if (sent < sizeof(token))
		die("short send()\n");
}
|
||||
|
||||
/* Try a non-blocking receive of @token from socket @s.
 *
 * Returns true if a datagram matching @token was read, false if recv() failed
 * with EWOULDBLOCK. Any other recv() error, a short read, or a payload that
 * differs from @token terminates the program.
 */
bool recv_token(int s, long token)
{
	long got;
	ssize_t n = recv(s, &got, sizeof(got), MSG_DONTWAIT);

	if (n < 0 && errno == EWOULDBLOCK)
		return false;

	if (n < 0)
		die("recv(): %s\n", strerror(errno));

	if (n < sizeof(got))
		die("short recv()\n");

	if (got != token)
		die("data mismatch\n");

	return true;
}
|
|
@ -1,47 +0,0 @@
|
|||
// SPDX-License-Identifier: GPL-2.0-or-later
|
||||
|
||||
/* common.h
|
||||
*
|
||||
* Useful shared functions
|
||||
*
|
||||
* Copyright Red Hat
|
||||
* Author: David Gibson <david@gibson.dropbear.id.au>
|
||||
*/
|
||||
#ifndef REUSEADDR_COMMON_H
|
||||
#define REUSEADDR_COMMON_H
|
||||
|
||||
#include <stdarg.h>
|
||||
#include <stdbool.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
/* Print a printf()-style message to stderr, then exit with EXIT_FAILURE.
 *
 * The message is printed as given: callers supply their own trailing newline.
 */
static inline void die(const char *fmt, ...)
{
	va_list args;

	va_start(args, fmt);
	(void)vfprintf(stderr, fmt, args);
	va_end(args);

	exit(EXIT_FAILURE);
}
|
||||
|
||||
#if __BYTE_ORDER == __BIG_ENDIAN
|
||||
#define htons_constant(x) (x)
|
||||
#define htonl_constant(x) (x)
|
||||
#else
|
||||
#define htons_constant(x) (__bswap_constant_16(x))
|
||||
#define htonl_constant(x) (__bswap_constant_32(x))
|
||||
#endif
|
||||
|
||||
#define SOCKADDR_INIT(addr, port) \
|
||||
{ \
|
||||
.sin_family = AF_INET, \
|
||||
.sin_addr = { .s_addr = htonl_constant(addr) }, \
|
||||
.sin_port = htons_constant(port), \
|
||||
}
|
||||
|
||||
int sock_reuseaddr(void);
|
||||
void send_token(int s, long token);
|
||||
bool recv_token(int s, long token);
|
||||
|
||||
#endif /* REUSEADDR_COMMON_H */
|
|
@ -1,118 +0,0 @@
|
|||
// SPDX-License-Identifier: GPL-2.0-or-later
|
||||
|
||||
/* recv-zero.c
|
||||
*
|
||||
* Verify that we're able to discard datagrams by recv()ing into a zero-length
|
||||
* buffer.
|
||||
*
|
||||
* Copyright Red Hat
|
||||
* Author: David Gibson <david@gibson.dropbear.id.au>
|
||||
*/
|
||||
|
||||
#include <arpa/inet.h>
|
||||
#include <errno.h>
|
||||
#include <net/if.h>
|
||||
#include <netinet/in.h>
|
||||
#include <stdbool.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <unistd.h>
|
||||
|
||||
#include "common.h"
|
||||
|
||||
#define DSTPORT 13257U
|
||||
|
||||
enum discard_method {
|
||||
DISCARD_NULL_BUF,
|
||||
DISCARD_ZERO_IOV,
|
||||
DISCARD_NULL_IOV,
|
||||
NUM_METHODS,
|
||||
};
|
||||
|
||||
/* 127.0.0.1:DSTPORT */
|
||||
static const struct sockaddr_in lo_dst = SOCKADDR_INIT(INADDR_LOOPBACK, DSTPORT);
|
||||
|
||||
static void test_discard(enum discard_method method)
|
||||
{
|
||||
struct iovec zero_iov = { .iov_base = NULL, .iov_len = 0, };
|
||||
struct msghdr mh_zero = {
|
||||
.msg_iov = &zero_iov,
|
||||
.msg_iovlen = 1,
|
||||
};
|
||||
struct msghdr mh_null = {
|
||||
.msg_iov = NULL,
|
||||
.msg_iovlen = 0,
|
||||
};
|
||||
long token1, token2;
|
||||
int recv_s, send_s;
|
||||
ssize_t rc;
|
||||
|
||||
token1 = random();
|
||||
token2 = random();
|
||||
|
||||
recv_s = sock_reuseaddr();
|
||||
if (bind(recv_s, (struct sockaddr *)&lo_dst, sizeof(lo_dst)) < 0)
|
||||
die("bind(): %s\n", strerror(errno));
|
||||
|
||||
send_s = sock_reuseaddr();
|
||||
if (connect(send_s, (struct sockaddr *)&lo_dst, sizeof(lo_dst)) < 0)
|
||||
die("connect(): %s\n", strerror(errno));
|
||||
|
||||
send_token(send_s, token1);
|
||||
send_token(send_s, token2);
|
||||
|
||||
switch (method) {
|
||||
case DISCARD_NULL_BUF:
|
||||
/* cppcheck-suppress nullPointer */
|
||||
rc = recv(recv_s, NULL, 0, MSG_DONTWAIT);
|
||||
if (rc < 0)
|
||||
die("discarding recv(): %s\n", strerror(errno));
|
||||
break;
|
||||
|
||||
case DISCARD_ZERO_IOV:
|
||||
rc = recvmsg(recv_s, &mh_zero, MSG_DONTWAIT);
|
||||
if (rc < 0)
|
||||
die("recvmsg() with zero-length buffer: %s\n",
|
||||
strerror(errno));
|
||||
if (!((unsigned)mh_zero.msg_flags & MSG_TRUNC))
|
||||
die("Missing MSG_TRUNC flag\n");
|
||||
break;
|
||||
|
||||
case DISCARD_NULL_IOV:
|
||||
rc = recvmsg(recv_s, &mh_null, MSG_DONTWAIT);
|
||||
if (rc < 0)
|
||||
die("recvmsg() with zero-length iov: %s\n",
|
||||
strerror(errno));
|
||||
if (!((unsigned)mh_null.msg_flags & MSG_TRUNC))
|
||||
die("Missing MSG_TRUNC flag\n");
|
||||
break;
|
||||
|
||||
default:
|
||||
die("Bad method\n");
|
||||
}
|
||||
|
||||
recv_token(recv_s, token2);
|
||||
|
||||
/* cppcheck-suppress nullPointer */
|
||||
rc = recv(recv_s, NULL, 0, MSG_DONTWAIT);
|
||||
if (rc < 0 && errno != EAGAIN)
|
||||
die("redundant discarding recv(): %s\n", strerror(errno));
|
||||
if (rc >= 0)
|
||||
die("Unexpected receive: rc=%zd\n", rc);
|
||||
}
|
||||
|
||||
int main(int argc, char *argv[])
|
||||
{
|
||||
enum discard_method method;
|
||||
|
||||
(void)argc;
|
||||
(void)argv;
|
||||
|
||||
for (method = 0; method < NUM_METHODS; method++)
|
||||
test_discard(method);
|
||||
|
||||
printf("Discarding datagrams with 0-length receives seems to work\n");
|
||||
|
||||
exit(0);
|
||||
}
|
|
@ -1,240 +0,0 @@
|
|||
// SPDX-License-Identifier: GPL-2.0-or-later
|
||||
|
||||
/* reuseaddr-priority.c
|
||||
*
|
||||
* Verify which SO_REUSEADDR UDP sockets get priority to receive
|
||||
* =============================================================
|
||||
*
|
||||
* SO_REUSEADDR allows multiple sockets to bind to overlapping addresses, so
|
||||
* there can be multiple sockets eligible to receive the same packet. The exact
|
||||
* semantics of which socket will receive in this circumstance isn't very well
|
||||
* documented.
|
||||
*
|
||||
* This program verifies that things behave the way we expect. Specifically we
|
||||
* expect:
|
||||
*
|
||||
* - If both a connected and an unconnected socket could receive a datagram, the
|
||||
* connected one will receive it in preference to the unconnected one.
|
||||
*
|
||||
* - If an unconnected socket bound to a specific address and an unconnected
|
||||
* socket bound to the "any" address (0.0.0.0 or ::) could receive a datagram,
|
||||
* then the one with a specific address will receive it in preference to the
|
||||
* other.
|
||||
*
|
||||
* These should be true regardless of the order the sockets are created in, or
|
||||
* the order they're polled in.
|
||||
*
|
||||
* Copyright Red Hat
|
||||
* Author: David Gibson <david@gibson.dropbear.id.au>
|
||||
*/
|
||||
|
||||
#include <arpa/inet.h>
|
||||
#include <errno.h>
|
||||
#include <net/if.h>
|
||||
#include <netinet/in.h>
|
||||
#include <stdbool.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <unistd.h>
|
||||
|
||||
#include "common.h"
|
||||
|
||||
#define SRCPORT 13246U
|
||||
#define DSTPORT 13247U
|
||||
|
||||
/* Different cases for receiving socket configuration */
|
||||
enum sock_type {
|
||||
/* Socket is bound to 0.0.0.0:DSTPORT and not connected */
|
||||
SOCK_BOUND_ANY = 0,
|
||||
|
||||
/* Socket is bound to 127.0.0.1:DSTPORT and not connected */
|
||||
SOCK_BOUND_LO = 1,
|
||||
|
||||
/* Socket is bound to 0.0.0.0:DSTPORT and connected to 127.0.0.1:SRCPORT */
|
||||
SOCK_CONNECTED = 2,
|
||||
|
||||
NUM_SOCK_TYPES,
|
||||
};
|
||||
|
||||
typedef enum sock_type order_t[NUM_SOCK_TYPES];
|
||||
|
||||
static order_t orders[] = {
|
||||
{0, 1, 2}, {0, 2, 1}, {1, 0, 2}, {1, 2, 0}, {2, 0, 1}, {2, 1, 0},
|
||||
};
|
||||
|
||||
/* 127.0.0.2 */
|
||||
#define INADDR_LOOPBACK2 ((in_addr_t)(0x7f000002))
|
||||
|
||||
/* 0.0.0.0:DSTPORT */
|
||||
static const struct sockaddr_in any_dst = SOCKADDR_INIT(INADDR_ANY, DSTPORT);
|
||||
/* 127.0.0.1:DSTPORT */
|
||||
static const struct sockaddr_in lo_dst = SOCKADDR_INIT(INADDR_LOOPBACK, DSTPORT);
|
||||
|
||||
/* 127.0.0.2:DSTPORT */
|
||||
static const struct sockaddr_in lo2_dst = SOCKADDR_INIT(INADDR_LOOPBACK2, DSTPORT);
|
||||
|
||||
/* 127.0.0.1:SRCPORT */
|
||||
static const struct sockaddr_in lo_src = SOCKADDR_INIT(INADDR_LOOPBACK, SRCPORT);
|
||||
|
||||
/* Random token to send in datagram */
|
||||
static long token;
|
||||
|
||||
/* Get a socket of the specified type for receiving */
|
||||
static int sock_recv(enum sock_type type)
|
||||
{
|
||||
const struct sockaddr *connect_sa = NULL;
|
||||
const struct sockaddr *bind_sa = NULL;
|
||||
int s;
|
||||
|
||||
s = sock_reuseaddr();
|
||||
|
||||
switch (type) {
|
||||
case SOCK_CONNECTED:
|
||||
connect_sa = (struct sockaddr *)&lo_src;
|
||||
/* fallthrough */
|
||||
case SOCK_BOUND_ANY:
|
||||
bind_sa = (struct sockaddr *)&any_dst;
|
||||
break;
|
||||
|
||||
case SOCK_BOUND_LO:
|
||||
bind_sa = (struct sockaddr *)&lo_dst;
|
||||
break;
|
||||
|
||||
default:
|
||||
die("bug");
|
||||
}
|
||||
|
||||
if (bind_sa)
|
||||
if (bind(s, bind_sa, sizeof(struct sockaddr_in)) < 0)
|
||||
die("bind(): %s\n", strerror(errno));
|
||||
if (connect_sa)
|
||||
if (connect(s, connect_sa, sizeof(struct sockaddr_in)) < 0)
|
||||
die("connect(): %s\n", strerror(errno));
|
||||
|
||||
return s;
|
||||
}
|
||||
|
||||
/* Get a socket suitable for sending to the given type of receiving socket */
|
||||
static int sock_send(enum sock_type type)
|
||||
{
|
||||
const struct sockaddr *connect_sa = NULL;
|
||||
const struct sockaddr *bind_sa = NULL;
|
||||
int s;
|
||||
|
||||
s = sock_reuseaddr();
|
||||
|
||||
switch (type) {
|
||||
case SOCK_BOUND_ANY:
|
||||
connect_sa = (struct sockaddr *)&lo2_dst;
|
||||
break;
|
||||
|
||||
case SOCK_CONNECTED:
|
||||
bind_sa = (struct sockaddr *)&lo_src;
|
||||
/* fallthrough */
|
||||
case SOCK_BOUND_LO:
|
||||
connect_sa = (struct sockaddr *)&lo_dst;
|
||||
break;
|
||||
|
||||
default:
|
||||
die("bug");
|
||||
}
|
||||
|
||||
if (bind_sa)
|
||||
if (bind(s, bind_sa, sizeof(struct sockaddr_in)) < 0)
|
||||
die("bind(): %s\n", strerror(errno));
|
||||
if (connect_sa)
|
||||
if (connect(s, connect_sa, sizeof(struct sockaddr_in)) < 0)
|
||||
die("connect(): %s\n", strerror(errno));
|
||||
|
||||
return s;
|
||||
}
|
||||
|
||||
/* Check for expected behaviour with one specific ordering for various operations:
|
||||
*
|
||||
* @recv_create_order: Order to create receiving sockets in
|
||||
* @send_create_order: Order to create sending sockets in
|
||||
* @test_order: Order to test the behaviour of different types
|
||||
* @recv_order: Order to check the receiving sockets
|
||||
*/
|
||||
static void check_one_order(const order_t recv_create_order,
|
||||
const order_t send_create_order,
|
||||
const order_t test_order,
|
||||
const order_t recv_order)
|
||||
{
|
||||
int rs[NUM_SOCK_TYPES];
|
||||
int ss[NUM_SOCK_TYPES];
|
||||
int nfds = 0;
|
||||
int i, j;
|
||||
|
||||
for (i = 0; i < NUM_SOCK_TYPES; i++) {
|
||||
enum sock_type t = recv_create_order[i];
|
||||
int s;
|
||||
|
||||
s = sock_recv(t);
|
||||
if (s >= nfds)
|
||||
nfds = s + 1;
|
||||
|
||||
rs[t] = s;
|
||||
}
|
||||
|
||||
for (i = 0; i < NUM_SOCK_TYPES; i++) {
|
||||
enum sock_type t = send_create_order[i];
|
||||
|
||||
ss[t] = sock_send(t);
|
||||
}
|
||||
|
||||
for (i = 0; i < NUM_SOCK_TYPES; i++) {
|
||||
enum sock_type ti = test_order[i];
|
||||
int recv_via = -1;
|
||||
|
||||
send_token(ss[ti], token);
|
||||
|
||||
for (j = 0; j < NUM_SOCK_TYPES; j++) {
|
||||
enum sock_type tj = recv_order[j];
|
||||
|
||||
if (recv_token(rs[tj], token)) {
|
||||
if (recv_via != -1)
|
||||
die("Received token more than once\n");
|
||||
recv_via = tj;
|
||||
}
|
||||
}
|
||||
|
||||
if (recv_via == -1)
|
||||
die("Didn't receive token at all\n");
|
||||
if (recv_via != ti)
|
||||
die("Received token via unexpected socket\n");
|
||||
}
|
||||
|
||||
for (i = 0; i < NUM_SOCK_TYPES; i++) {
|
||||
close(rs[i]);
|
||||
close(ss[i]);
|
||||
}
|
||||
}
|
||||
|
||||
static void check_all_orders(void)
|
||||
{
|
||||
int norders = sizeof(orders) / sizeof(orders[0]);
|
||||
int i, j, k, l;
|
||||
|
||||
for (i = 0; i < norders; i++)
|
||||
for (j = 0; j < norders; j++)
|
||||
for (k = 0; k < norders; k++)
|
||||
for (l = 0; l < norders; l++)
|
||||
check_one_order(orders[i], orders[j],
|
||||
orders[k], orders[l]);
|
||||
}
|
||||
|
||||
int main(int argc, char *argv[])
|
||||
{
|
||||
(void)argc;
|
||||
(void)argv;
|
||||
|
||||
token = random();
|
||||
|
||||
check_all_orders();
|
||||
|
||||
printf("SO_REUSEADDR receive priorities seem to work as expected\n");
|
||||
|
||||
exit(0);
|
||||
}
|
|
@ -1,105 +0,0 @@
|
|||
// SPDX-License-Identifier: GPL-2.0-or-later
|
||||
|
||||
/* udp-close-dup.c
|
||||
*
|
||||
* Verify that closing one dup() of a UDP socket won't stop other dups from
|
||||
* receiving packets.
|
||||
*
|
||||
* Copyright Red Hat
|
||||
* Author: David Gibson <david@gibson.dropbear.id.au>
|
||||
*/
|
||||
|
||||
#include <arpa/inet.h>
|
||||
#include <errno.h>
|
||||
#include <fcntl.h>
|
||||
#include <net/if.h>
|
||||
#include <netinet/in.h>
|
||||
#include <stdbool.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <unistd.h>
|
||||
|
||||
#include "common.h"
|
||||
|
||||
#define DSTPORT 13257U
|
||||
|
||||
/* 127.0.0.1:DSTPORT */
|
||||
static const struct sockaddr_in lo_dst = SOCKADDR_INIT(INADDR_LOOPBACK, DSTPORT);
|
||||
|
||||
enum dup_method {
|
||||
DUP_DUP,
|
||||
DUP_FCNTL,
|
||||
NUM_METHODS,
|
||||
};
|
||||
|
||||
static void test_close_dup(enum dup_method method)
|
||||
{
|
||||
long token;
|
||||
int s1, s2, send_s;
|
||||
ssize_t rc;
|
||||
|
||||
s1 = sock_reuseaddr();
|
||||
if (bind(s1, (struct sockaddr *)&lo_dst, sizeof(lo_dst)) < 0)
|
||||
die("bind(): %s\n", strerror(errno));
|
||||
|
||||
send_s = sock_reuseaddr();
|
||||
if (connect(send_s, (struct sockaddr *)&lo_dst, sizeof(lo_dst)) < 0)
|
||||
die("connect(): %s\n", strerror(errno));
|
||||
|
||||
/* Receive before duplicating */
|
||||
token = random();
|
||||
send_token(send_s, token);
|
||||
recv_token(s1, token);
|
||||
|
||||
switch (method) {
|
||||
case DUP_DUP:
|
||||
/* NOLINTNEXTLINE(android-cloexec-dup) */
|
||||
s2 = dup(s1);
|
||||
if (s2 < 0)
|
||||
die("dup(): %s\n", strerror(errno));
|
||||
break;
|
||||
case DUP_FCNTL:
|
||||
s2 = fcntl(s1, F_DUPFD_CLOEXEC, 0);
|
||||
if (s2 < 0)
|
||||
die("F_DUPFD_CLOEXEC: %s\n", strerror(errno));
|
||||
break;
|
||||
default:
|
||||
die("Bad method\n");
|
||||
}
|
||||
|
||||
/* Receive via original handle */
|
||||
token = random();
|
||||
send_token(send_s, token);
|
||||
recv_token(s1, token);
|
||||
|
||||
/* Receive via duplicated handle */
|
||||
token = random();
|
||||
send_token(send_s, token);
|
||||
recv_token(s2, token);
|
||||
|
||||
/* Close duplicate */
|
||||
rc = close(s2);
|
||||
if (rc < 0)
|
||||
die("close() dup: %s\n", strerror(errno));
|
||||
|
||||
/* Receive after closing duplicate */
|
||||
token = random();
|
||||
send_token(send_s, token);
|
||||
recv_token(s1, token);
|
||||
}
|
||||
|
||||
int main(int argc, char *argv[])
|
||||
{
|
||||
enum dup_method method;
|
||||
|
||||
(void)argc;
|
||||
(void)argv;
|
||||
|
||||
for (method = 0; method < NUM_METHODS; method++)
|
||||
test_close_dup(method);
|
||||
|
||||
printf("Closing dup()ed UDP sockets seems to work as expected\n");
|
||||
|
||||
exit(0);
|
||||
}
|
43
epoll_type.h
43
epoll_type.h
|
@ -1,43 +0,0 @@
|
|||
/* SPDX-License-Identifier: GPL-2.0-or-later
|
||||
* Copyright Red Hat
|
||||
* Author: David Gibson <david@gibson.dropbear.id.au>
|
||||
*/
|
||||
|
||||
#ifndef EPOLL_TYPE_H
|
||||
#define EPOLL_TYPE_H
|
||||
|
||||
/**
|
||||
* enum epoll_type - Different types of fds we poll over
|
||||
*/
|
||||
enum epoll_type {
|
||||
/* Special value to indicate an invalid type */
|
||||
EPOLL_TYPE_NONE = 0,
|
||||
/* Connected TCP sockets */
|
||||
EPOLL_TYPE_TCP,
|
||||
/* Connected TCP sockets (spliced) */
|
||||
EPOLL_TYPE_TCP_SPLICE,
|
||||
/* Listening TCP sockets */
|
||||
EPOLL_TYPE_TCP_LISTEN,
|
||||
/* timerfds used for TCP timers */
|
||||
EPOLL_TYPE_TCP_TIMER,
|
||||
/* UDP "listening" sockets */
|
||||
EPOLL_TYPE_UDP_LISTEN,
|
||||
/* UDP socket for replies on a specific flow */
|
||||
EPOLL_TYPE_UDP_REPLY,
|
||||
/* ICMP/ICMPv6 ping sockets */
|
||||
EPOLL_TYPE_PING,
|
||||
/* inotify fd watching for end of netns (pasta) */
|
||||
EPOLL_TYPE_NSQUIT_INOTIFY,
|
||||
/* timer fd watching for end of netns, fallback for inotify (pasta) */
|
||||
EPOLL_TYPE_NSQUIT_TIMER,
|
||||
/* tuntap character device */
|
||||
EPOLL_TYPE_TAP_PASTA,
|
||||
/* socket connected to qemu */
|
||||
EPOLL_TYPE_TAP_PASST,
|
||||
/* socket listening for qemu socket connections */
|
||||
EPOLL_TYPE_TAP_LISTEN,
|
||||
|
||||
EPOLL_NUM_TYPES,
|
||||
};
|
||||
|
||||
#endif /* EPOLL_TYPE_H */
|
695
flow.c
695
flow.c
|
@ -5,11 +5,9 @@
|
|||
* Tracking for logical "flows" of packets.
|
||||
*/
|
||||
|
||||
#include <errno.h>
|
||||
#include <stdint.h>
|
||||
#include <stdio.h>
|
||||
#include <unistd.h>
|
||||
#include <sched.h>
|
||||
#include <string.h>
|
||||
|
||||
#include "util.h"
|
||||
|
@ -20,24 +18,10 @@
|
|||
#include "flow.h"
|
||||
#include "flow_table.h"
|
||||
|
||||
const char *flow_state_str[] = {
|
||||
[FLOW_STATE_FREE] = "FREE",
|
||||
[FLOW_STATE_NEW] = "NEW",
|
||||
[FLOW_STATE_INI] = "INI",
|
||||
[FLOW_STATE_TGT] = "TGT",
|
||||
[FLOW_STATE_TYPED] = "TYPED",
|
||||
[FLOW_STATE_ACTIVE] = "ACTIVE",
|
||||
};
|
||||
static_assert(ARRAY_SIZE(flow_state_str) == FLOW_NUM_STATES,
|
||||
"flow_state_str[] doesn't match enum flow_state");
|
||||
|
||||
const char *flow_type_str[] = {
|
||||
[FLOW_TYPE_NONE] = "<none>",
|
||||
[FLOW_TCP] = "TCP connection",
|
||||
[FLOW_TCP_SPLICE] = "TCP connection (spliced)",
|
||||
[FLOW_PING4] = "ICMP ping sequence",
|
||||
[FLOW_PING6] = "ICMPv6 ping sequence",
|
||||
[FLOW_UDP] = "UDP flow",
|
||||
};
|
||||
static_assert(ARRAY_SIZE(flow_type_str) == FLOW_NUM_TYPES,
|
||||
"flow_type_str[] doesn't match enum flow_type");
|
||||
|
@ -45,15 +29,52 @@ static_assert(ARRAY_SIZE(flow_type_str) == FLOW_NUM_TYPES,
|
|||
const uint8_t flow_proto[] = {
|
||||
[FLOW_TCP] = IPPROTO_TCP,
|
||||
[FLOW_TCP_SPLICE] = IPPROTO_TCP,
|
||||
[FLOW_PING4] = IPPROTO_ICMP,
|
||||
[FLOW_PING6] = IPPROTO_ICMPV6,
|
||||
[FLOW_UDP] = IPPROTO_UDP,
|
||||
};
|
||||
static_assert(ARRAY_SIZE(flow_proto) == FLOW_NUM_TYPES,
|
||||
"flow_proto[] doesn't match enum flow_type");
|
||||
|
||||
/* Global Flow Table */
|
||||
|
||||
/**
|
||||
* DOC: Theory of Operation - flow entry life cycle
|
||||
*
|
||||
* An individual flow table entry moves through these logical states, usually in
|
||||
* this order.
|
||||
*
|
||||
* FREE - Part of the general pool of free flow table entries
|
||||
* Operations:
|
||||
* - flow_alloc() finds an entry and moves it to ALLOC state
|
||||
*
|
||||
* ALLOC - A tentatively allocated entry
|
||||
* Operations:
|
||||
* - flow_alloc_cancel() returns the entry to FREE state
|
||||
* - FLOW_START() sets the entry's type and moves to START state
|
||||
* Caveats:
|
||||
* - It's not safe to write fields in the flow entry
|
||||
* - It's not safe to allocate further entries with flow_alloc()
|
||||
* - It's not safe to return to the main epoll loop (use FLOW_START()
|
||||
* to move to START state before doing so)
|
||||
* - It's not safe to use flow_*() logging functions
|
||||
*
|
||||
* START - An entry being prepared by flow type specific code
|
||||
* Operations:
|
||||
* - Flow type specific fields may be accessed
|
||||
* - flow_*() logging functions
|
||||
* - flow_alloc_cancel() returns the entry to FREE state
|
||||
* Caveats:
|
||||
* - Returning to the main epoll loop or allocating another entry
|
||||
* with flow_alloc() implicitly moves the entry to ACTIVE state.
|
||||
*
|
||||
* ACTIVE - An active flow entry managed by flow type specific code
|
||||
* Operations:
|
||||
* - Flow type specific fields may be accessed
|
||||
* - flow_*() logging functions
|
||||
* - Flow may be expired by returning 'true' from flow type specific
|
||||
* deferred or timer handler. This will return it to FREE state.
|
||||
* Caveats:
|
||||
* - It's not safe to call flow_alloc_cancel()
|
||||
*/
|
||||
|
||||
/**
|
||||
* DOC: Theory of Operation - allocating and freeing flow entries
|
||||
*
|
||||
|
@ -107,156 +128,10 @@ static_assert(ARRAY_SIZE(flow_proto) == FLOW_NUM_TYPES,
|
|||
|
||||
unsigned flow_first_free;
|
||||
union flow flowtab[FLOW_MAX];
|
||||
static const union flow *flow_new_entry; /* = NULL */
|
||||
|
||||
/* Hash table to index it */
|
||||
#define FLOW_HASH_LOAD 70 /* % */
|
||||
#define FLOW_HASH_SIZE ((2 * FLOW_MAX * 100 / FLOW_HASH_LOAD))
|
||||
|
||||
/* Table for lookup from flowside information */
|
||||
static flow_sidx_t flow_hashtab[FLOW_HASH_SIZE];
|
||||
|
||||
static_assert(ARRAY_SIZE(flow_hashtab) >= 2 * FLOW_MAX,
|
||||
"Safe linear probing requires hash table with more entries than the number of sides in the flow table");
|
||||
|
||||
/* Last time the flow timers ran */
|
||||
static struct timespec flow_timer_run;
|
||||
|
||||
/** flowside_from_af() - Initialise flowside from addresses
|
||||
* @side: flowside to initialise
|
||||
* @af: Address family (AF_INET or AF_INET6)
|
||||
* @eaddr: Endpoint address (pointer to in_addr or in6_addr)
|
||||
* @eport: Endpoint port
|
||||
* @oaddr: Our address (pointer to in_addr or in6_addr)
|
||||
* @oport: Our port
|
||||
*/
|
||||
static void flowside_from_af(struct flowside *side, sa_family_t af,
|
||||
const void *eaddr, in_port_t eport,
|
||||
const void *oaddr, in_port_t oport)
|
||||
{
|
||||
if (oaddr)
|
||||
inany_from_af(&side->oaddr, af, oaddr);
|
||||
else
|
||||
side->oaddr = inany_any6;
|
||||
side->oport = oport;
|
||||
|
||||
if (eaddr)
|
||||
inany_from_af(&side->eaddr, af, eaddr);
|
||||
else
|
||||
side->eaddr = inany_any6;
|
||||
side->eport = eport;
|
||||
}
|
||||
|
||||
/**
|
||||
* struct flowside_sock_args - Parameters for flowside_sock_splice()
|
||||
* @c: Execution context
|
||||
* @fd: Filled in with new socket fd
|
||||
* @err: Filled in with errno if something failed
|
||||
* @type: Socket epoll type
|
||||
* @sa: Socket address
|
||||
* @sl: Length of @sa
|
||||
* @data: epoll reference data
|
||||
*/
|
||||
struct flowside_sock_args {
|
||||
const struct ctx *c;
|
||||
int fd;
|
||||
int err;
|
||||
enum epoll_type type;
|
||||
const struct sockaddr *sa;
|
||||
socklen_t sl;
|
||||
const char *path;
|
||||
uint32_t data;
|
||||
};
|
||||
|
||||
/** flowside_sock_splice() - Create and bind socket for PIF_SPLICE based on flowside
|
||||
* @arg: Argument as a struct flowside_sock_args
|
||||
*
|
||||
* Return: 0
|
||||
*/
|
||||
static int flowside_sock_splice(void *arg)
|
||||
{
|
||||
struct flowside_sock_args *a = arg;
|
||||
|
||||
ns_enter(a->c);
|
||||
|
||||
a->fd = sock_l4_sa(a->c, a->type, a->sa, a->sl, NULL,
|
||||
a->sa->sa_family == AF_INET6, a->data);
|
||||
a->err = errno;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/** flowside_sock_l4() - Create and bind socket based on flowside
|
||||
* @c: Execution context
|
||||
* @type: Socket epoll type
|
||||
* @pif: Interface for this socket
|
||||
* @tgt: Target flowside
|
||||
* @data: epoll reference portion for protocol handlers
|
||||
*
|
||||
* Return: socket fd of epoll type @type bound to our address and port from @tgt
|
||||
* (if specified).
|
||||
*/
|
||||
int flowside_sock_l4(const struct ctx *c, enum epoll_type type, uint8_t pif,
|
||||
const struct flowside *tgt, uint32_t data)
|
||||
{
|
||||
const char *ifname = NULL;
|
||||
union sockaddr_inany sa;
|
||||
socklen_t sl;
|
||||
|
||||
ASSERT(pif_is_socket(pif));
|
||||
|
||||
pif_sockaddr(c, &sa, &sl, pif, &tgt->oaddr, tgt->oport);
|
||||
|
||||
switch (pif) {
|
||||
case PIF_HOST:
|
||||
if (inany_is_loopback(&tgt->oaddr))
|
||||
ifname = NULL;
|
||||
else if (sa.sa_family == AF_INET)
|
||||
ifname = c->ip4.ifname_out;
|
||||
else if (sa.sa_family == AF_INET6)
|
||||
ifname = c->ip6.ifname_out;
|
||||
|
||||
return sock_l4_sa(c, type, &sa, sl, ifname,
|
||||
sa.sa_family == AF_INET6, data);
|
||||
|
||||
case PIF_SPLICE: {
|
||||
struct flowside_sock_args args = {
|
||||
.c = c, .type = type,
|
||||
.sa = &sa.sa, .sl = sl, .data = data,
|
||||
};
|
||||
NS_CALL(flowside_sock_splice, &args);
|
||||
errno = args.err;
|
||||
return args.fd;
|
||||
}
|
||||
|
||||
default:
|
||||
/* If we add new socket pifs, they'll need to be implemented
|
||||
* here
|
||||
*/
|
||||
ASSERT(0);
|
||||
}
|
||||
}
|
||||
|
||||
/** flowside_connect() - Connect a socket based on flowside
|
||||
* @c: Execution context
|
||||
* @s: Socket to connect
|
||||
* @pif: Target pif
|
||||
* @tgt: Target flowside
|
||||
*
|
||||
* Connect @s to the endpoint address and port from @tgt.
|
||||
*
|
||||
* Return: 0 on success, negative on error
|
||||
*/
|
||||
int flowside_connect(const struct ctx *c, int s,
|
||||
uint8_t pif, const struct flowside *tgt)
|
||||
{
|
||||
union sockaddr_inany sa;
|
||||
socklen_t sl;
|
||||
|
||||
pif_sockaddr(c, &sa, &sl, pif, &tgt->eaddr, tgt->eport);
|
||||
return connect(s, &sa.sa, sl);
|
||||
}
|
||||
|
||||
/** flow_log_() - Log flow-related message
|
||||
* @f: flow the message is related to
|
||||
* @pri: Log priority
|
||||
|
@ -265,7 +140,6 @@ int flowside_connect(const struct ctx *c, int s,
|
|||
*/
|
||||
void flow_log_(const struct flow_common *f, int pri, const char *fmt, ...)
|
||||
{
|
||||
const char *type_or_state;
|
||||
char msg[BUFSIZ];
|
||||
va_list args;
|
||||
|
||||
|
@ -273,221 +147,40 @@ void flow_log_(const struct flow_common *f, int pri, const char *fmt, ...)
|
|||
(void)vsnprintf(msg, sizeof(msg), fmt, args);
|
||||
va_end(args);
|
||||
|
||||
/* Show type if it's set, otherwise the state */
|
||||
if (f->state < FLOW_STATE_TYPED)
|
||||
type_or_state = FLOW_STATE(f);
|
||||
else
|
||||
type_or_state = FLOW_TYPE(f);
|
||||
|
||||
logmsg(true, false, pri,
|
||||
"Flow %u (%s): %s", flow_idx(f), type_or_state, msg);
|
||||
logmsg(pri, "Flow %u (%s): %s", flow_idx(f), FLOW_TYPE(f), msg);
|
||||
}
|
||||
|
||||
/** flow_log_details_() - Log the details of a flow
|
||||
* @f: flow to log
|
||||
* @pri: Log priority
|
||||
* @state: State to log details according to
|
||||
/**
|
||||
* flow_start() - Set flow type for new flow and log
|
||||
* @flow: Flow to set type for
|
||||
* @type: Type for new flow
|
||||
* @iniside: Which side initiated the new flow
|
||||
*
|
||||
* Logs the details of the flow: endpoints, interfaces, type etc.
|
||||
*/
|
||||
void flow_log_details_(const struct flow_common *f, int pri,
|
||||
enum flow_state state)
|
||||
{
|
||||
char estr0[INANY_ADDRSTRLEN], fstr0[INANY_ADDRSTRLEN];
|
||||
char estr1[INANY_ADDRSTRLEN], fstr1[INANY_ADDRSTRLEN];
|
||||
const struct flowside *ini = &f->side[INISIDE];
|
||||
const struct flowside *tgt = &f->side[TGTSIDE];
|
||||
|
||||
if (state >= FLOW_STATE_TGT)
|
||||
flow_log_(f, pri,
|
||||
"%s [%s]:%hu -> [%s]:%hu => %s [%s]:%hu -> [%s]:%hu",
|
||||
pif_name(f->pif[INISIDE]),
|
||||
inany_ntop(&ini->eaddr, estr0, sizeof(estr0)),
|
||||
ini->eport,
|
||||
inany_ntop(&ini->oaddr, fstr0, sizeof(fstr0)),
|
||||
ini->oport,
|
||||
pif_name(f->pif[TGTSIDE]),
|
||||
inany_ntop(&tgt->oaddr, fstr1, sizeof(fstr1)),
|
||||
tgt->oport,
|
||||
inany_ntop(&tgt->eaddr, estr1, sizeof(estr1)),
|
||||
tgt->eport);
|
||||
else if (state >= FLOW_STATE_INI)
|
||||
flow_log_(f, pri, "%s [%s]:%hu -> [%s]:%hu => ?",
|
||||
pif_name(f->pif[INISIDE]),
|
||||
inany_ntop(&ini->eaddr, estr0, sizeof(estr0)),
|
||||
ini->eport,
|
||||
inany_ntop(&ini->oaddr, fstr0, sizeof(fstr0)),
|
||||
ini->oport);
|
||||
}
|
||||
|
||||
/**
|
||||
* flow_set_state() - Change flow's state
|
||||
* @f: Flow changing state
|
||||
* @state: New state
|
||||
*/
|
||||
static void flow_set_state(struct flow_common *f, enum flow_state state)
|
||||
{
|
||||
uint8_t oldstate = f->state;
|
||||
|
||||
ASSERT(state < FLOW_NUM_STATES);
|
||||
ASSERT(oldstate < FLOW_NUM_STATES);
|
||||
|
||||
f->state = state;
|
||||
flow_log_(f, LOG_DEBUG, "%s -> %s", flow_state_str[oldstate],
|
||||
FLOW_STATE(f));
|
||||
|
||||
flow_log_details_(f, LOG_DEBUG, MAX(state, oldstate));
|
||||
}
|
||||
|
||||
/**
|
||||
* flow_initiate_() - Move flow to INI, setting pif[INISIDE]
|
||||
* @flow: Flow to change state
|
||||
* @pif: pif of the initiating side
|
||||
*/
|
||||
static void flow_initiate_(union flow *flow, uint8_t pif)
|
||||
{
|
||||
struct flow_common *f = &flow->f;
|
||||
|
||||
ASSERT(pif != PIF_NONE);
|
||||
ASSERT(flow_new_entry == flow && f->state == FLOW_STATE_NEW);
|
||||
ASSERT(f->type == FLOW_TYPE_NONE);
|
||||
ASSERT(f->pif[INISIDE] == PIF_NONE && f->pif[TGTSIDE] == PIF_NONE);
|
||||
|
||||
f->pif[INISIDE] = pif;
|
||||
flow_set_state(f, FLOW_STATE_INI);
|
||||
}
|
||||
|
||||
/**
|
||||
* flow_initiate_af() - Move flow to INI, setting INISIDE details
|
||||
* @flow: Flow to change state
|
||||
* @pif: pif of the initiating side
|
||||
* @af: Address family of @saddr and @daddr
|
||||
* @saddr: Source address (pointer to in_addr or in6_addr)
|
||||
* @sport: Source port
|
||||
* @daddr: Destination address (pointer to in_addr or in6_addr)
|
||||
* @dport: Destination port
|
||||
* Return: @flow
|
||||
*
|
||||
* Return: pointer to the initiating flowside information
|
||||
* Should be called before setting any flow type specific fields in the flow
|
||||
* table entry.
|
||||
*/
|
||||
const struct flowside *flow_initiate_af(union flow *flow, uint8_t pif,
|
||||
sa_family_t af,
|
||||
const void *saddr, in_port_t sport,
|
||||
const void *daddr, in_port_t dport)
|
||||
union flow *flow_start(union flow *flow, enum flow_type type,
|
||||
unsigned iniside)
|
||||
{
|
||||
struct flowside *ini = &flow->f.side[INISIDE];
|
||||
|
||||
flowside_from_af(ini, af, saddr, sport, daddr, dport);
|
||||
flow_initiate_(flow, pif);
|
||||
return ini;
|
||||
}
|
||||
|
||||
/**
|
||||
* flow_initiate_sa() - Move flow to INI, setting INISIDE details
|
||||
* @flow: Flow to change state
|
||||
* @pif: pif of the initiating side
|
||||
* @ssa: Source socket address
|
||||
* @dport: Destination port
|
||||
*
|
||||
* Return: pointer to the initiating flowside information
|
||||
*/
|
||||
const struct flowside *flow_initiate_sa(union flow *flow, uint8_t pif,
|
||||
const union sockaddr_inany *ssa,
|
||||
in_port_t dport)
|
||||
{
|
||||
struct flowside *ini = &flow->f.side[INISIDE];
|
||||
|
||||
inany_from_sockaddr(&ini->eaddr, &ini->eport, ssa);
|
||||
if (inany_v4(&ini->eaddr))
|
||||
ini->oaddr = inany_any4;
|
||||
else
|
||||
ini->oaddr = inany_any6;
|
||||
ini->oport = dport;
|
||||
flow_initiate_(flow, pif);
|
||||
return ini;
|
||||
}
|
||||
|
||||
/**
|
||||
* flow_target() - Determine where flow should forward to, and move to TGT
|
||||
* @c: Execution context
|
||||
* @flow: Flow to forward
|
||||
* @proto: Protocol
|
||||
*
|
||||
* Return: pointer to the target flowside information
|
||||
*/
|
||||
const struct flowside *flow_target(const struct ctx *c, union flow *flow,
|
||||
uint8_t proto)
|
||||
{
|
||||
char estr[INANY_ADDRSTRLEN], fstr[INANY_ADDRSTRLEN];
|
||||
struct flow_common *f = &flow->f;
|
||||
const struct flowside *ini = &f->side[INISIDE];
|
||||
struct flowside *tgt = &f->side[TGTSIDE];
|
||||
uint8_t tgtpif = PIF_NONE;
|
||||
|
||||
ASSERT(flow_new_entry == flow && f->state == FLOW_STATE_INI);
|
||||
ASSERT(f->type == FLOW_TYPE_NONE);
|
||||
ASSERT(f->pif[INISIDE] != PIF_NONE && f->pif[TGTSIDE] == PIF_NONE);
|
||||
ASSERT(flow->f.state == FLOW_STATE_INI);
|
||||
|
||||
switch (f->pif[INISIDE]) {
|
||||
case PIF_TAP:
|
||||
tgtpif = fwd_nat_from_tap(c, proto, ini, tgt);
|
||||
break;
|
||||
|
||||
case PIF_SPLICE:
|
||||
tgtpif = fwd_nat_from_splice(c, proto, ini, tgt);
|
||||
break;
|
||||
|
||||
case PIF_HOST:
|
||||
tgtpif = fwd_nat_from_host(c, proto, ini, tgt);
|
||||
break;
|
||||
|
||||
default:
|
||||
flow_err(flow, "No rules to forward %s [%s]:%hu -> [%s]:%hu",
|
||||
pif_name(f->pif[INISIDE]),
|
||||
inany_ntop(&ini->eaddr, estr, sizeof(estr)),
|
||||
ini->eport,
|
||||
inany_ntop(&ini->oaddr, fstr, sizeof(fstr)),
|
||||
ini->oport);
|
||||
}
|
||||
|
||||
if (tgtpif == PIF_NONE)
|
||||
return NULL;
|
||||
|
||||
f->pif[TGTSIDE] = tgtpif;
|
||||
flow_set_state(f, FLOW_STATE_TGT);
|
||||
return tgt;
|
||||
}
|
||||
|
||||
/**
|
||||
* flow_set_type() - Set type and move to TYPED
|
||||
* @flow: Flow to change state
|
||||
* @type: Type to set for the flow
|
||||
*/
|
||||
union flow *flow_set_type(union flow *flow, enum flow_type type)
|
||||
{
|
||||
struct flow_common *f = &flow->f;
|
||||
|
||||
ASSERT(type != FLOW_TYPE_NONE);
|
||||
ASSERT(flow_new_entry == flow && f->state == FLOW_STATE_TGT);
|
||||
ASSERT(f->type == FLOW_TYPE_NONE);
|
||||
ASSERT(f->pif[INISIDE] != PIF_NONE && f->pif[TGTSIDE] != PIF_NONE);
|
||||
|
||||
f->type = type;
|
||||
flow_set_state(f, FLOW_STATE_TYPED);
|
||||
(void)iniside;
|
||||
flow->f.type = type;
|
||||
flow_dbg(flow, "START %s", flow_type_str[flow->f.type]);
|
||||
return flow;
|
||||
}
|
||||
|
||||
/**
|
||||
* flow_activate() - Move flow to ACTIVE
|
||||
* @f: Flow to change state
|
||||
* flow_end() - Clear flow type for finished flow and log
|
||||
* @flow: Flow to clear
|
||||
*/
|
||||
void flow_activate(struct flow_common *f)
|
||||
static void flow_end(union flow *flow)
|
||||
{
|
||||
ASSERT(&flow_new_entry->f == f && f->state == FLOW_STATE_TYPED);
|
||||
ASSERT(f->pif[INISIDE] != PIF_NONE && f->pif[TGTSIDE] != PIF_NONE);
|
||||
if (flow->f.type == FLOW_TYPE_NONE)
|
||||
return; /* Nothing to do */
|
||||
|
||||
flow_set_state(f, FLOW_STATE_ACTIVE);
|
||||
flow_new_entry = NULL;
|
||||
flow_dbg(flow, "END %s", flow_type_str[flow->f.type]);
|
||||
flow->f.type = FLOW_TYPE_NONE;
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -499,12 +192,9 @@ union flow *flow_alloc(void)
|
|||
{
|
||||
union flow *flow = &flowtab[flow_first_free];
|
||||
|
||||
ASSERT(!flow_new_entry);
|
||||
|
||||
if (flow_first_free >= FLOW_MAX)
|
||||
return NULL;
|
||||
|
||||
ASSERT(flow->f.state == FLOW_STATE_FREE);
|
||||
ASSERT(flow->f.type == FLOW_TYPE_NONE);
|
||||
ASSERT(flow->free.n >= 1);
|
||||
ASSERT(flow_first_free + flow->free.n <= FLOW_MAX);
|
||||
|
@ -527,10 +217,7 @@ union flow *flow_alloc(void)
|
|||
flow_first_free = flow->free.next;
|
||||
}
|
||||
|
||||
flow_new_entry = flow;
|
||||
memset(flow, 0, sizeof(*flow));
|
||||
flow_set_state(&flow->f, FLOW_STATE_NEW);
|
||||
|
||||
return flow;
|
||||
}
|
||||
|
||||
|
@ -542,228 +229,15 @@ union flow *flow_alloc(void)
|
|||
*/
|
||||
void flow_alloc_cancel(union flow *flow)
|
||||
{
|
||||
ASSERT(flow_new_entry == flow);
|
||||
ASSERT(flow->f.state == FLOW_STATE_NEW ||
|
||||
flow->f.state == FLOW_STATE_INI ||
|
||||
flow->f.state == FLOW_STATE_TGT ||
|
||||
flow->f.state == FLOW_STATE_TYPED);
|
||||
ASSERT(flow_first_free > FLOW_IDX(flow));
|
||||
|
||||
flow_set_state(&flow->f, FLOW_STATE_FREE);
|
||||
memset(flow, 0, sizeof(*flow));
|
||||
|
||||
flow_end(flow);
|
||||
/* Put it back in a length 1 free cluster, don't attempt to fully
|
||||
* reverse flow_alloc()s steps. This will get folded together the next
|
||||
* time flow_defer_handler() runs anyway */
|
||||
flow->free.n = 1;
|
||||
flow->free.next = flow_first_free;
|
||||
flow_first_free = FLOW_IDX(flow);
|
||||
flow_new_entry = NULL;
|
||||
}
|
||||
|
||||
/**
|
||||
* flow_hash() - Calculate hash value for one side of a flow
|
||||
* @c: Execution context
|
||||
* @proto: Protocol of this flow (IP L4 protocol number)
|
||||
* @pif: pif of the side to hash
|
||||
* @side: Flowside (must not have unspecified parts)
|
||||
*
|
||||
* Return: hash value
|
||||
*/
|
||||
static uint64_t flow_hash(const struct ctx *c, uint8_t proto, uint8_t pif,
|
||||
const struct flowside *side)
|
||||
{
|
||||
struct siphash_state state = SIPHASH_INIT(c->hash_secret);
|
||||
|
||||
inany_siphash_feed(&state, &side->oaddr);
|
||||
inany_siphash_feed(&state, &side->eaddr);
|
||||
|
||||
return siphash_final(&state, 38, (uint64_t)proto << 40 |
|
||||
(uint64_t)pif << 32 |
|
||||
(uint64_t)side->oport << 16 |
|
||||
(uint64_t)side->eport);
|
||||
}
|
||||
|
||||
/**
|
||||
* flow_sidx_hash() - Calculate hash value for given side of a given flow
|
||||
* @c: Execution context
|
||||
* @sidx: Flow & side index to get hash for
|
||||
*
|
||||
* Return: hash value, of the flow & side represented by @sidx
|
||||
*/
|
||||
static uint64_t flow_sidx_hash(const struct ctx *c, flow_sidx_t sidx)
|
||||
{
|
||||
const struct flow_common *f = &flow_at_sidx(sidx)->f;
|
||||
const struct flowside *side = &f->side[sidx.sidei];
|
||||
uint8_t pif = f->pif[sidx.sidei];
|
||||
|
||||
/* For the hash table to work, entries must have complete endpoint
|
||||
* information, and at least a forwarding port.
|
||||
*/
|
||||
ASSERT(pif != PIF_NONE && !inany_is_unspecified(&side->eaddr) &&
|
||||
side->eport != 0 && side->oport != 0);
|
||||
|
||||
return flow_hash(c, FLOW_PROTO(f), pif, side);
|
||||
}
|
||||
|
||||
/**
|
||||
* flow_hash_probe_() - Find hash bucket for a flow, given hash
|
||||
* @hash: Raw hash value for flow & side
|
||||
* @sidx: Flow and side to find bucket for
|
||||
*
|
||||
* Return: If @sidx is in the hash table, its current bucket, otherwise a
|
||||
* suitable free bucket for it.
|
||||
*/
|
||||
static inline unsigned flow_hash_probe_(uint64_t hash, flow_sidx_t sidx)
|
||||
{
|
||||
unsigned b = hash % FLOW_HASH_SIZE;
|
||||
|
||||
/* Linear probing */
|
||||
while (flow_sidx_valid(flow_hashtab[b]) &&
|
||||
!flow_sidx_eq(flow_hashtab[b], sidx))
|
||||
b = mod_sub(b, 1, FLOW_HASH_SIZE);
|
||||
|
||||
return b;
|
||||
}
|
||||
|
||||
/**
|
||||
* flow_hash_probe() - Find hash bucket for a flow
|
||||
* @c: Execution context
|
||||
* @sidx: Flow and side to find bucket for
|
||||
*
|
||||
* Return: If @sidx is in the hash table, its current bucket, otherwise a
|
||||
* suitable free bucket for it.
|
||||
*/
|
||||
static inline unsigned flow_hash_probe(const struct ctx *c, flow_sidx_t sidx)
|
||||
{
|
||||
return flow_hash_probe_(flow_sidx_hash(c, sidx), sidx);
|
||||
}
|
||||
|
||||
/**
|
||||
* flow_hash_insert() - Insert side of a flow into hash table
|
||||
* @c: Execution context
|
||||
* @sidx: Flow & side index
|
||||
*
|
||||
* Return: raw (un-modded) hash value of side of flow
|
||||
*/
|
||||
uint64_t flow_hash_insert(const struct ctx *c, flow_sidx_t sidx)
|
||||
{
|
||||
uint64_t hash = flow_sidx_hash(c, sidx);
|
||||
unsigned b = flow_hash_probe_(hash, sidx);
|
||||
|
||||
flow_hashtab[b] = sidx;
|
||||
flow_dbg(flow_at_sidx(sidx), "Side %u hash table insert: bucket: %u",
|
||||
sidx.sidei, b);
|
||||
|
||||
return hash;
|
||||
}
|
||||
|
||||
/**
|
||||
* flow_hash_remove() - Drop side of a flow from the hash table
|
||||
* @c: Execution context
|
||||
* @sidx: Side of flow to remove
|
||||
*/
|
||||
void flow_hash_remove(const struct ctx *c, flow_sidx_t sidx)
|
||||
{
|
||||
unsigned b = flow_hash_probe(c, sidx), s;
|
||||
|
||||
if (!flow_sidx_valid(flow_hashtab[b]))
|
||||
return; /* Redundant remove */
|
||||
|
||||
flow_dbg(flow_at_sidx(sidx), "Side %u hash table remove: bucket: %u",
|
||||
sidx.sidei, b);
|
||||
|
||||
/* Scan the remainder of the cluster */
|
||||
for (s = mod_sub(b, 1, FLOW_HASH_SIZE);
|
||||
flow_sidx_valid(flow_hashtab[s]);
|
||||
s = mod_sub(s, 1, FLOW_HASH_SIZE)) {
|
||||
unsigned h = flow_sidx_hash(c, flow_hashtab[s]) % FLOW_HASH_SIZE;
|
||||
|
||||
if (!mod_between(h, s, b, FLOW_HASH_SIZE)) {
|
||||
/* flow_hashtab[s] can live in flow_hashtab[b]'s slot */
|
||||
debug("hash table remove: shuffle %u -> %u", s, b);
|
||||
flow_hashtab[b] = flow_hashtab[s];
|
||||
b = s;
|
||||
}
|
||||
}
|
||||
|
||||
flow_hashtab[b] = FLOW_SIDX_NONE;
|
||||
}
|
||||
|
||||
/**
|
||||
* flowside_lookup() - Look for a matching flowside in the flow table
|
||||
* @c: Execution context
|
||||
* @proto: Protocol of the flow (IP L4 protocol number)
|
||||
* @pif: pif to look for in the table
|
||||
* @side: Flowside to look for in the table
|
||||
*
|
||||
* Return: sidx of the matching flow & side, FLOW_SIDX_NONE if not found
|
||||
*/
|
||||
static flow_sidx_t flowside_lookup(const struct ctx *c, uint8_t proto,
|
||||
uint8_t pif, const struct flowside *side)
|
||||
{
|
||||
flow_sidx_t sidx;
|
||||
union flow *flow;
|
||||
unsigned b;
|
||||
|
||||
b = flow_hash(c, proto, pif, side) % FLOW_HASH_SIZE;
|
||||
while ((sidx = flow_hashtab[b], flow = flow_at_sidx(sidx)) &&
|
||||
!(FLOW_PROTO(&flow->f) == proto &&
|
||||
flow->f.pif[sidx.sidei] == pif &&
|
||||
flowside_eq(&flow->f.side[sidx.sidei], side)))
|
||||
b = mod_sub(b, 1, FLOW_HASH_SIZE);
|
||||
|
||||
return flow_hashtab[b];
|
||||
}
|
||||
|
||||
/**
|
||||
* flow_lookup_af() - Look up a flow given addressing information
|
||||
* @c: Execution context
|
||||
* @proto: Protocol of the flow (IP L4 protocol number)
|
||||
* @pif: Interface of the flow
|
||||
* @af: Address family, AF_INET or AF_INET6
|
||||
* @eaddr: Guest side endpoint address (guest local address)
|
||||
* @oaddr: Our guest side address (guest remote address)
|
||||
* @eport: Guest side endpoint port (guest local port)
|
||||
* @oport: Our guest side port (guest remote port)
|
||||
*
|
||||
* Return: sidx of the matching flow & side, FLOW_SIDX_NONE if not found
|
||||
*/
|
||||
flow_sidx_t flow_lookup_af(const struct ctx *c,
|
||||
uint8_t proto, uint8_t pif, sa_family_t af,
|
||||
const void *eaddr, const void *oaddr,
|
||||
in_port_t eport, in_port_t oport)
|
||||
{
|
||||
struct flowside side;
|
||||
|
||||
flowside_from_af(&side, af, eaddr, eport, oaddr, oport);
|
||||
return flowside_lookup(c, proto, pif, &side);
|
||||
}
|
||||
|
||||
/**
|
||||
* flow_lookup_sa() - Look up a flow given an endpoint socket address
|
||||
* @c: Execution context
|
||||
* @proto: Protocol of the flow (IP L4 protocol number)
|
||||
* @pif: Interface of the flow
|
||||
* @esa: Socket address of the endpoint
|
||||
* @oport: Our port number
|
||||
*
|
||||
* Return: sidx of the matching flow & side, FLOW_SIDX_NONE if not found
|
||||
*/
|
||||
flow_sidx_t flow_lookup_sa(const struct ctx *c, uint8_t proto, uint8_t pif,
|
||||
const void *esa, in_port_t oport)
|
||||
{
|
||||
struct flowside side = {
|
||||
.oport = oport,
|
||||
};
|
||||
|
||||
inany_from_sockaddr(&side.eaddr, &side.eport, esa);
|
||||
if (inany_v4(&side.eaddr))
|
||||
side.oaddr = inany_any4;
|
||||
else
|
||||
side.oaddr = inany_any6;
|
||||
|
||||
return flowside_lookup(c, proto, pif, &side);
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -783,14 +257,11 @@ void flow_defer_handler(const struct ctx *c, const struct timespec *now)
|
|||
flow_timer_run = *now;
|
||||
}
|
||||
|
||||
ASSERT(!flow_new_entry); /* Incomplete flow at end of cycle */
|
||||
|
||||
for (idx = 0; idx < FLOW_MAX; idx++) {
|
||||
union flow *flow = &flowtab[idx];
|
||||
bool closed = false;
|
||||
|
||||
switch (flow->f.state) {
|
||||
case FLOW_STATE_FREE: {
|
||||
if (flow->f.type == FLOW_TYPE_NONE) {
|
||||
unsigned skip = flow->free.n;
|
||||
|
||||
/* First entry of a free cluster must have n >= 1 */
|
||||
|
@ -812,43 +283,17 @@ void flow_defer_handler(const struct ctx *c, const struct timespec *now)
|
|||
continue;
|
||||
}
|
||||
|
||||
case FLOW_STATE_NEW:
|
||||
case FLOW_STATE_INI:
|
||||
case FLOW_STATE_TGT:
|
||||
case FLOW_STATE_TYPED:
|
||||
/* Incomplete flow at end of cycle */
|
||||
ASSERT(false);
|
||||
break;
|
||||
|
||||
case FLOW_STATE_ACTIVE:
|
||||
/* Nothing to do */
|
||||
break;
|
||||
|
||||
default:
|
||||
ASSERT(false);
|
||||
}
|
||||
|
||||
switch (flow->f.type) {
|
||||
case FLOW_TYPE_NONE:
|
||||
ASSERT(false);
|
||||
break;
|
||||
case FLOW_TCP:
|
||||
closed = tcp_flow_defer(&flow->tcp);
|
||||
closed = tcp_flow_defer(flow);
|
||||
break;
|
||||
case FLOW_TCP_SPLICE:
|
||||
closed = tcp_splice_flow_defer(&flow->tcp_splice);
|
||||
closed = tcp_splice_flow_defer(flow);
|
||||
if (!closed && timer)
|
||||
tcp_splice_timer(c, &flow->tcp_splice);
|
||||
break;
|
||||
case FLOW_PING4:
|
||||
case FLOW_PING6:
|
||||
if (timer)
|
||||
closed = icmp_ping_timer(c, &flow->ping, now);
|
||||
break;
|
||||
case FLOW_UDP:
|
||||
closed = udp_flow_defer(&flow->udp);
|
||||
if (!closed && timer)
|
||||
closed = udp_flow_timer(c, &flow->udp, now);
|
||||
tcp_splice_timer(c, flow);
|
||||
break;
|
||||
default:
|
||||
/* Assume other flow types don't need any handling */
|
||||
|
@ -856,8 +301,7 @@ void flow_defer_handler(const struct ctx *c, const struct timespec *now)
|
|||
}
|
||||
|
||||
if (closed) {
|
||||
flow_set_state(&flow->f, FLOW_STATE_FREE);
|
||||
memset(flow, 0, sizeof(*flow));
|
||||
flow_end(flow);
|
||||
|
||||
if (free_head) {
|
||||
/* Add slot to current free cluster */
|
||||
|
@ -884,12 +328,7 @@ void flow_defer_handler(const struct ctx *c, const struct timespec *now)
|
|||
*/
|
||||
void flow_init(void)
|
||||
{
|
||||
unsigned b;
|
||||
|
||||
/* Initial state is a single free cluster containing the whole table */
|
||||
flowtab[0].free.n = FLOW_MAX;
|
||||
flowtab[0].free.next = FLOW_MAX;
|
||||
|
||||
for (b = 0; b < FLOW_HASH_SIZE; b++)
|
||||
flow_hashtab[b] = FLOW_SIDX_NONE;
|
||||
}
|
||||
|
|
199
flow.h
199
flow.h
|
@ -9,98 +9,6 @@
|
|||
|
||||
#define FLOW_TIMER_INTERVAL 1000 /* ms */
|
||||
|
||||
/**
|
||||
* enum flow_state - States of a flow table entry
|
||||
*
|
||||
* An individual flow table entry moves through these states, usually in this
|
||||
* order.
|
||||
* General rules:
|
||||
* - Code outside flow.c should never write common fields of union flow.
|
||||
* - The state field may always be read.
|
||||
*
|
||||
* FREE - Part of the general pool of free flow table entries
|
||||
* Operations:
|
||||
* - flow_alloc() finds an entry and moves it to NEW
|
||||
*
|
||||
* NEW - Freshly allocated, uninitialised entry
|
||||
* Operations:
|
||||
* - flow_alloc_cancel() returns the entry to FREE
|
||||
* - flow_initiate() sets the entry's INISIDE details and moves to
|
||||
* INI
|
||||
* - FLOW_SET_TYPE() sets the entry's type and moves to TYPED
|
||||
* Caveats:
|
||||
* - No fields other than state may be accessed
|
||||
* - At most one entry may be NEW, INI, TGT or TYPED at a time, so
|
||||
* it's unsafe to use flow_alloc() again until this entry moves to
|
||||
* ACTIVE or FREE
|
||||
* - You may not return to the main epoll loop while any flow is NEW
|
||||
*
|
||||
* INI - An entry with INISIDE common information completed
|
||||
* Operations:
|
||||
* - Common fields related to INISIDE may be read
|
||||
* - flow_alloc_cancel() returns the entry to FREE
|
||||
* - flow_target() sets the entry's TGTSIDE details and moves to TGT
|
||||
* Caveats:
|
||||
* - Other common fields may not be read
|
||||
* - Type specific fields may not be read or written
|
||||
* - At most one entry may be NEW, INI, TGT or TYPED at a time, so
|
||||
* it's unsafe to use flow_alloc() again until this entry moves to
|
||||
* ACTIVE or FREE
|
||||
* - You may not return to the main epoll loop while any flow is INI
|
||||
*
|
||||
* TGT - An entry with only INISIDE and TGTSIDE common information completed
|
||||
* Operations:
|
||||
* - Common fields related to INISIDE & TGTSIDE may be read
|
||||
* - flow_alloc_cancel() returns the entry to FREE
|
||||
* - FLOW_SET_TYPE() sets the entry's type and moves to TYPED
|
||||
* Caveats:
|
||||
* - Other common fields may not be read
|
||||
* - Type specific fields may not be read or written
|
||||
* - At most one entry may be NEW, INI, TGT or TYPED at a time, so
|
||||
* it's unsafe to use flow_alloc() again until this entry moves to
|
||||
* ACTIVE or FREE
|
||||
* - You may not return to the main epoll loop while any flow is TGT
|
||||
*
|
||||
* TYPED - Generic info initialised, type specific initialisation underway
|
||||
* Operations:
|
||||
* - All common fields may be read
|
||||
* - Type specific fields may be read and written
|
||||
* - flow_alloc_cancel() returns the entry to FREE
|
||||
* - FLOW_ACTIVATE() moves the entry to ACTIVE
|
||||
* Caveats:
|
||||
* - At most one entry may be NEW, INI, TGT or TYPED at a time, so
|
||||
* it's unsafe to use flow_alloc() again until this entry moves to
|
||||
* ACTIVE or FREE
|
||||
* - You may not return to the main epoll loop while any flow is
|
||||
* TYPED
|
||||
*
|
||||
* ACTIVE - An active, fully-initialised flow entry
|
||||
* Operations:
|
||||
* - All common fields may be read
|
||||
* - Type specific fields may be read and written
|
||||
* - Flow returns to FREE when it expires, signalled by returning
|
||||
* 'true' from flow type specific deferred or timer handler
|
||||
* Caveats:
|
||||
* - flow_alloc_cancel() may not be called on it
|
||||
*/
|
||||
enum flow_state {
|
||||
FLOW_STATE_FREE,
|
||||
FLOW_STATE_NEW,
|
||||
FLOW_STATE_INI,
|
||||
FLOW_STATE_TGT,
|
||||
FLOW_STATE_TYPED,
|
||||
FLOW_STATE_ACTIVE,
|
||||
|
||||
FLOW_NUM_STATES,
|
||||
};
|
||||
#define FLOW_STATE_BITS 8
|
||||
static_assert(FLOW_NUM_STATES <= (1 << FLOW_STATE_BITS),
|
||||
"Too many flow states for FLOW_STATE_BITS");
|
||||
|
||||
extern const char *flow_state_str[];
|
||||
#define FLOW_STATE(f) \
|
||||
((f)->state < FLOW_NUM_STATES ? flow_state_str[(f)->state] : "?")
|
||||
|
||||
/**
|
||||
* enum flow_type - Different types of packet flows we track
|
||||
*/
|
||||
|
@ -111,18 +19,9 @@ enum flow_type {
|
|||
FLOW_TCP,
|
||||
/* A TCP connection between a host socket and ns socket */
|
||||
FLOW_TCP_SPLICE,
|
||||
/* ICMP echo requests from guest to host and matching replies back */
|
||||
FLOW_PING4,
|
||||
/* ICMPv6 echo requests from guest to host and matching replies back */
|
||||
FLOW_PING6,
|
||||
/* UDP pseudo-connection */
|
||||
FLOW_UDP,
|
||||
|
||||
FLOW_NUM_TYPES,
|
||||
};
|
||||
#define FLOW_TYPE_BITS 8
|
||||
static_assert(FLOW_NUM_TYPES <= (1 << FLOW_TYPE_BITS),
|
||||
"Too many flow types for FLOW_TYPE_BITS");
|
||||
|
||||
extern const char *flow_type_str[];
|
||||
#define FLOW_TYPE(f) \
|
||||
|
@ -132,66 +31,12 @@ extern const uint8_t flow_proto[];
|
|||
#define FLOW_PROTO(f) \
|
||||
((f)->type < FLOW_NUM_TYPES ? flow_proto[(f)->type] : 0)
|
||||
|
||||
#define SIDES 2
|
||||
|
||||
#define INISIDE 0 /* Initiating side index */
|
||||
#define TGTSIDE 1 /* Target side index */
|
||||
|
||||
/**
|
||||
* struct flowside - Address information for one side of a flow
|
||||
* @eaddr: Endpoint address (remote address from passt's PoV)
|
||||
* @oaddr: Our address (local address from passt's PoV)
|
||||
* @eport: Endpoint port
|
||||
* @oport: Our port
|
||||
*/
|
||||
struct flowside {
|
||||
union inany_addr oaddr;
|
||||
union inany_addr eaddr;
|
||||
in_port_t oport;
|
||||
in_port_t eport;
|
||||
};
|
||||
|
||||
/**
|
||||
* flowside_eq() - Check if two flowsides are equal
|
||||
* @left, @right: Flowsides to compare
|
||||
*
|
||||
* Return: true if equal, false otherwise
|
||||
*/
|
||||
static inline bool flowside_eq(const struct flowside *left,
|
||||
const struct flowside *right)
|
||||
{
|
||||
return inany_equals(&left->eaddr, &right->eaddr) &&
|
||||
left->eport == right->eport &&
|
||||
inany_equals(&left->oaddr, &right->oaddr) &&
|
||||
left->oport == right->oport;
|
||||
}
|
||||
|
||||
int flowside_sock_l4(const struct ctx *c, enum epoll_type type, uint8_t pif,
|
||||
const struct flowside *tgt, uint32_t data);
|
||||
int flowside_connect(const struct ctx *c, int s,
|
||||
uint8_t pif, const struct flowside *tgt);
|
||||
|
||||
/**
|
||||
* struct flow_common - Common fields for packet flows
|
||||
* @state: State of the flow table entry
|
||||
* @type: Type of packet flow
|
||||
* @pif[]: Interface for each side of the flow
|
||||
* @side[]: Information for each side of the flow
|
||||
*/
|
||||
struct flow_common {
|
||||
#ifdef __GNUC__
|
||||
enum flow_state state:FLOW_STATE_BITS;
|
||||
enum flow_type type:FLOW_TYPE_BITS;
|
||||
#else
|
||||
uint8_t state;
|
||||
static_assert(sizeof(uint8_t) * 8 >= FLOW_STATE_BITS,
|
||||
"Not enough bits for state field");
|
||||
uint8_t type;
|
||||
static_assert(sizeof(uint8_t) * 8 >= FLOW_TYPE_BITS,
|
||||
"Not enough bits for type field");
|
||||
#endif
|
||||
uint8_t pif[SIDES];
|
||||
struct flowside side[SIDES];
|
||||
};
|
||||
|
||||
#define FLOW_INDEX_BITS 17 /* 128k - 1 */
|
||||
|
@ -200,30 +45,24 @@ struct flow_common {
|
|||
#define FLOW_TABLE_PRESSURE 30 /* % of FLOW_MAX */
|
||||
#define FLOW_FILE_PRESSURE 30 /* % of c->nofile */
|
||||
|
||||
union flow *flow_start(union flow *flow, enum flow_type type,
|
||||
unsigned iniside);
|
||||
#define FLOW_START(flow_, t_, var_, i_) \
|
||||
(&flow_start((flow_), (t_), (i_))->var_)
|
||||
|
||||
/**
|
||||
* struct flow_sidx - ID for one side of a specific flow
|
||||
* @sidei: Index of side referenced (0 or 1)
|
||||
* @flowi: Index of flow referenced
|
||||
* @side: Side referenced (0 or 1)
|
||||
* @flow: Index of flow referenced
|
||||
*/
|
||||
typedef struct flow_sidx {
|
||||
unsigned sidei :1;
|
||||
unsigned flowi :FLOW_INDEX_BITS;
|
||||
unsigned side :1;
|
||||
unsigned flow :FLOW_INDEX_BITS;
|
||||
} flow_sidx_t;
|
||||
static_assert(sizeof(flow_sidx_t) <= sizeof(uint32_t),
|
||||
"flow_sidx_t must fit within 32 bits");
|
||||
|
||||
#define FLOW_SIDX_NONE ((flow_sidx_t){ .flowi = FLOW_MAX })
|
||||
|
||||
/**
|
||||
* flow_sidx_valid() - Test if a sidx is valid
|
||||
* @sidx: sidx value
|
||||
*
|
||||
* Return: true if @sidx refers to a valid flow & side
|
||||
*/
|
||||
static inline bool flow_sidx_valid(flow_sidx_t sidx)
|
||||
{
|
||||
return sidx.flowi < FLOW_MAX;
|
||||
}
|
||||
#define FLOW_SIDX_NONE ((flow_sidx_t){ .flow = FLOW_MAX })
|
||||
|
||||
/**
|
||||
* flow_sidx_eq() - Test if two sidx values are equal
|
||||
|
@ -233,18 +72,9 @@ static inline bool flow_sidx_valid(flow_sidx_t sidx)
|
|||
*/
|
||||
static inline bool flow_sidx_eq(flow_sidx_t a, flow_sidx_t b)
|
||||
{
|
||||
return (a.flowi == b.flowi) && (a.sidei == b.sidei);
|
||||
return (a.flow == b.flow) && (a.side == b.side);
|
||||
}
|
||||
|
||||
uint64_t flow_hash_insert(const struct ctx *c, flow_sidx_t sidx);
|
||||
void flow_hash_remove(const struct ctx *c, flow_sidx_t sidx);
|
||||
flow_sidx_t flow_lookup_af(const struct ctx *c,
|
||||
uint8_t proto, uint8_t pif, sa_family_t af,
|
||||
const void *eaddr, const void *oaddr,
|
||||
in_port_t eport, in_port_t oport);
|
||||
flow_sidx_t flow_lookup_sa(const struct ctx *c, uint8_t proto, uint8_t pif,
|
||||
const void *esa, in_port_t oport);
|
||||
|
||||
union flow;
|
||||
|
||||
void flow_init(void);
|
||||
|
@ -264,11 +94,4 @@ void flow_log_(const struct flow_common *f, int pri, const char *fmt, ...)
|
|||
flow_dbg((f), __VA_ARGS__); \
|
||||
} while (0)
|
||||
|
||||
void flow_log_details_(const struct flow_common *f, int pri,
|
||||
enum flow_state state);
|
||||
#define flow_log_details(f_, pri) \
|
||||
flow_log_details_(&((f_)->f), (pri), (f_)->f.state)
|
||||
#define flow_dbg_details(f_) flow_log_details((f_), LOG_DEBUG)
|
||||
#define flow_err_details(f_) flow_log_details((f_), LOG_ERR)
|
||||
|
||||
#endif /* FLOW_H */
|
||||
|
|
103
flow_table.h
103
flow_table.h
|
@ -8,8 +8,6 @@
|
|||
#define FLOW_TABLE_H
|
||||
|
||||
#include "tcp_conn.h"
|
||||
#include "icmp_flow.h"
|
||||
#include "udp_flow.h"
|
||||
|
||||
/**
|
||||
* struct flow_free_cluster - Information about a cluster of free entries
|
||||
|
@ -35,22 +33,14 @@ union flow {
|
|||
struct flow_free_cluster free;
|
||||
struct tcp_tap_conn tcp;
|
||||
struct tcp_splice_conn tcp_splice;
|
||||
struct icmp_ping_flow ping;
|
||||
struct udp_flow udp;
|
||||
};
|
||||
|
||||
/* Global Flow Table */
|
||||
extern unsigned flow_first_free;
|
||||
extern union flow flowtab[];
|
||||
|
||||
/**
|
||||
* flow_foreach_sidei() - 'for' type macro to step through each side of flow
|
||||
* @sidei_: Takes value INISIDE, then TGTSIDE
|
||||
*/
|
||||
#define flow_foreach_sidei(sidei_) \
|
||||
for ((sidei_) = INISIDE; (sidei_) < SIDES; (sidei_)++)
|
||||
|
||||
/** flow_idx() - Index of flow from common structure
|
||||
/** flow_idx - Index of flow from common structure
|
||||
* @f: Common flow fields pointer
|
||||
*
|
||||
* Return: index of @f in the flow table
|
||||
|
@ -60,122 +50,59 @@ static inline unsigned flow_idx(const struct flow_common *f)
|
|||
return (union flow *)f - flowtab;
|
||||
}
|
||||
|
||||
/** FLOW_IDX() - Find the index of a flow
|
||||
/** FLOW_IDX - Find the index of a flow
|
||||
* @f_: Flow pointer, either union flow * or protocol specific
|
||||
*
|
||||
* Return: index of @f in the flow table
|
||||
*/
|
||||
#define FLOW_IDX(f_) (flow_idx(&(f_)->f))
|
||||
|
||||
/** FLOW() - Flow entry at a given index
|
||||
/** FLOW - Flow entry at a given index
|
||||
* @idx: Flow index
|
||||
*
|
||||
* Return: pointer to entry @idx in the flow table
|
||||
*/
|
||||
#define FLOW(idx) (&flowtab[(idx)])
|
||||
|
||||
/** flow_at_sidx() - Flow entry for a given sidx
|
||||
/** flow_at_sidx - Flow entry for a given sidx
|
||||
* @sidx: Flow & side index
|
||||
*
|
||||
* Return: pointer to the corresponding flow entry, or NULL
|
||||
*/
|
||||
static inline union flow *flow_at_sidx(flow_sidx_t sidx)
|
||||
{
|
||||
if (!flow_sidx_valid(sidx))
|
||||
if (sidx.flow >= FLOW_MAX)
|
||||
return NULL;
|
||||
return FLOW(sidx.flowi);
|
||||
return FLOW(sidx.flow);
|
||||
}
|
||||
|
||||
/** pif_at_sidx() - Interface for a given flow and side
|
||||
* @sidx: Flow & side index
|
||||
*
|
||||
* Return: pif for the flow & side given by @sidx
|
||||
*/
|
||||
static inline uint8_t pif_at_sidx(flow_sidx_t sidx)
|
||||
{
|
||||
const union flow *flow = flow_at_sidx(sidx);
|
||||
|
||||
if (!flow)
|
||||
return PIF_NONE;
|
||||
return flow->f.pif[sidx.sidei];
|
||||
}
|
||||
|
||||
/** flowside_at_sidx() - Retrieve a specific flowside
|
||||
* @sidx: Flow & side index
|
||||
*
|
||||
* Return: Flowside for the flow & side given by @sidx
|
||||
*/
|
||||
static inline const struct flowside *flowside_at_sidx(flow_sidx_t sidx)
|
||||
{
|
||||
const union flow *flow = flow_at_sidx(sidx);
|
||||
|
||||
if (!flow)
|
||||
return NULL;
|
||||
|
||||
return &flow->f.side[sidx.sidei];
|
||||
}
|
||||
|
||||
/** flow_sidx_opposite() - Get the other side of the same flow
|
||||
* @sidx: Flow & side index
|
||||
*
|
||||
* Return: sidx for the other side of the same flow as @sidx
|
||||
*/
|
||||
static inline flow_sidx_t flow_sidx_opposite(flow_sidx_t sidx)
|
||||
{
|
||||
if (!flow_sidx_valid(sidx))
|
||||
return FLOW_SIDX_NONE;
|
||||
|
||||
return (flow_sidx_t){.flowi = sidx.flowi, .sidei = !sidx.sidei};
|
||||
}
|
||||
|
||||
/** flow_sidx() - Index of one side of a flow from common structure
|
||||
/** flow_sidx_t - Index of one side of a flow from common structure
|
||||
* @f: Common flow fields pointer
|
||||
* @sidei: Which side to refer to (0 or 1)
|
||||
* @side: Which side to refer to (0 or 1)
|
||||
*
|
||||
* Return: index of @f and @side in the flow table
|
||||
*/
|
||||
static inline flow_sidx_t flow_sidx(const struct flow_common *f,
|
||||
unsigned sidei)
|
||||
int side)
|
||||
{
|
||||
/* cppcheck-suppress [knownConditionTrueFalse, unmatchedSuppression] */
|
||||
ASSERT(sidei == !!sidei);
|
||||
ASSERT(side == !!side);
|
||||
|
||||
return (flow_sidx_t){
|
||||
.sidei = sidei,
|
||||
.flowi = flow_idx(f),
|
||||
.side = side,
|
||||
.flow = flow_idx(f),
|
||||
};
|
||||
}
|
||||
|
||||
/** FLOW_SIDX() - Find the index of one side of a flow
|
||||
/** FLOW_SIDX - Find the index of one side of a flow
|
||||
* @f_: Flow pointer, either union flow * or protocol specific
|
||||
* @sidei: Which side to index (0 or 1)
|
||||
* @side: Which side to index (0 or 1)
|
||||
*
|
||||
* Return: index of @f and @side in the flow table
|
||||
*/
|
||||
#define FLOW_SIDX(f_, sidei) (flow_sidx(&(f_)->f, (sidei)))
|
||||
#define FLOW_SIDX(f_, side) (flow_sidx(&(f_)->f, (side)))
|
||||
|
||||
union flow *flow_alloc(void);
|
||||
void flow_alloc_cancel(union flow *flow);
|
||||
|
||||
const struct flowside *flow_initiate_af(union flow *flow, uint8_t pif,
|
||||
sa_family_t af,
|
||||
const void *saddr, in_port_t sport,
|
||||
const void *daddr, in_port_t dport);
|
||||
const struct flowside *flow_initiate_sa(union flow *flow, uint8_t pif,
|
||||
const union sockaddr_inany *ssa,
|
||||
in_port_t dport);
|
||||
const struct flowside *flow_target_af(union flow *flow, uint8_t pif,
|
||||
sa_family_t af,
|
||||
const void *saddr, in_port_t sport,
|
||||
const void *daddr, in_port_t dport);
|
||||
const struct flowside *flow_target(const struct ctx *c, union flow *flow,
|
||||
uint8_t proto);
|
||||
|
||||
union flow *flow_set_type(union flow *flow, enum flow_type type);
|
||||
#define FLOW_SET_TYPE(flow_, t_, var_) (&flow_set_type((flow_), (t_))->var_)
|
||||
|
||||
void flow_activate(struct flow_common *f);
|
||||
#define FLOW_ACTIVATE(flow_) \
|
||||
(flow_activate(&(flow_)->f))
|
||||
|
||||
#endif /* FLOW_TABLE_H */
|
||||
|
|
387
fwd.c
387
fwd.c
|
@ -25,81 +25,6 @@
|
|||
#include "fwd.h"
|
||||
#include "passt.h"
|
||||
#include "lineread.h"
|
||||
#include "flow_table.h"
|
||||
|
||||
/* Ephemeral port range: values from RFC 6335 */
|
||||
static in_port_t fwd_ephemeral_min = (1 << 15) + (1 << 14);
|
||||
static in_port_t fwd_ephemeral_max = NUM_PORTS - 1;
|
||||
|
||||
#define PORT_RANGE_SYSCTL "/proc/sys/net/ipv4/ip_local_port_range"
|
||||
|
||||
/** fwd_probe_ephemeral() - Determine what ports this host considers ephemeral
|
||||
*
|
||||
* Work out what ports the host thinks are ephemeral and record it for later
|
||||
* use by fwd_port_is_ephemeral(). If we're unable to probe, assume the range
|
||||
* recommended by RFC 6335.
|
||||
*/
|
||||
void fwd_probe_ephemeral(void)
|
||||
{
|
||||
char *line, *tab, *end;
|
||||
struct lineread lr;
|
||||
long min, max;
|
||||
ssize_t len;
|
||||
int fd;
|
||||
|
||||
fd = open(PORT_RANGE_SYSCTL, O_RDONLY | O_CLOEXEC);
|
||||
if (fd < 0) {
|
||||
warn_perror("Unable to open %s", PORT_RANGE_SYSCTL);
|
||||
return;
|
||||
}
|
||||
|
||||
lineread_init(&lr, fd);
|
||||
len = lineread_get(&lr, &line);
|
||||
close(fd);
|
||||
|
||||
if (len < 0)
|
||||
goto parse_err;
|
||||
|
||||
tab = strchr(line, '\t');
|
||||
if (!tab)
|
||||
goto parse_err;
|
||||
*tab = '\0';
|
||||
|
||||
errno = 0;
|
||||
min = strtol(line, &end, 10);
|
||||
if (*end || errno)
|
||||
goto parse_err;
|
||||
|
||||
errno = 0;
|
||||
max = strtol(tab + 1, &end, 10);
|
||||
if (*end || errno)
|
||||
goto parse_err;
|
||||
|
||||
if (min < 0 || min >= (long)NUM_PORTS ||
|
||||
max < 0 || max >= (long)NUM_PORTS)
|
||||
goto parse_err;
|
||||
|
||||
fwd_ephemeral_min = min;
|
||||
fwd_ephemeral_max = max;
|
||||
|
||||
return;
|
||||
|
||||
parse_err:
|
||||
warn("Unable to parse %s", PORT_RANGE_SYSCTL);
|
||||
}
|
||||
|
||||
/**
|
||||
* fwd_port_is_ephemeral() - Is port number ephemeral?
|
||||
* @port: Port number
|
||||
*
|
||||
* Return: true if @port is ephemeral, that is may be allocated by the kernel as
|
||||
* a local port for outgoing connections or datagrams, but should not be
|
||||
* used for binding services to.
|
||||
*/
|
||||
bool fwd_port_is_ephemeral(in_port_t port)
|
||||
{
|
||||
return (port >= fwd_ephemeral_min) && (port <= fwd_ephemeral_max);
|
||||
}
|
||||
|
||||
/* See enum in kernel's include/net/tcp_states.h */
|
||||
#define UDP_LISTEN 0x07
|
||||
|
@ -113,7 +38,7 @@ bool fwd_port_is_ephemeral(in_port_t port)
|
|||
* @exclude: Bitmap of ports to exclude from setting (and clear)
|
||||
*
|
||||
* #syscalls:pasta lseek
|
||||
* #syscalls:pasta ppc64le:_llseek ppc64:_llseek arm:_llseek
|
||||
* #syscalls:pasta ppc64le:_llseek ppc64:_llseek armv6l:_llseek armv7l:_llseek
|
||||
*/
|
||||
static void procfs_scan_listen(int fd, unsigned int lstate,
|
||||
uint8_t *map, const uint8_t *exclude)
|
||||
|
@ -127,7 +52,7 @@ static void procfs_scan_listen(int fd, unsigned int lstate,
|
|||
return;
|
||||
|
||||
if (lseek(fd, 0, SEEK_SET)) {
|
||||
warn_perror("lseek() failed on /proc/net file");
|
||||
warn("lseek() failed on /proc/net file: %s", strerror(errno));
|
||||
return;
|
||||
}
|
||||
|
||||
|
@ -203,18 +128,18 @@ void fwd_scan_ports_init(struct ctx *c)
|
|||
|
||||
c->tcp.fwd_in.scan4 = c->tcp.fwd_in.scan6 = -1;
|
||||
c->tcp.fwd_out.scan4 = c->tcp.fwd_out.scan6 = -1;
|
||||
c->udp.fwd_in.scan4 = c->udp.fwd_in.scan6 = -1;
|
||||
c->udp.fwd_out.scan4 = c->udp.fwd_out.scan6 = -1;
|
||||
c->udp.fwd_in.f.scan4 = c->udp.fwd_in.f.scan6 = -1;
|
||||
c->udp.fwd_out.f.scan4 = c->udp.fwd_out.f.scan6 = -1;
|
||||
|
||||
if (c->tcp.fwd_in.mode == FWD_AUTO) {
|
||||
c->tcp.fwd_in.scan4 = open_in_ns(c, "/proc/net/tcp", flags);
|
||||
c->tcp.fwd_in.scan6 = open_in_ns(c, "/proc/net/tcp6", flags);
|
||||
fwd_scan_ports_tcp(&c->tcp.fwd_in, &c->tcp.fwd_out);
|
||||
}
|
||||
if (c->udp.fwd_in.mode == FWD_AUTO) {
|
||||
c->udp.fwd_in.scan4 = open_in_ns(c, "/proc/net/udp", flags);
|
||||
c->udp.fwd_in.scan6 = open_in_ns(c, "/proc/net/udp6", flags);
|
||||
fwd_scan_ports_udp(&c->udp.fwd_in, &c->udp.fwd_out,
|
||||
if (c->udp.fwd_in.f.mode == FWD_AUTO) {
|
||||
c->udp.fwd_in.f.scan4 = open_in_ns(c, "/proc/net/udp", flags);
|
||||
c->udp.fwd_in.f.scan6 = open_in_ns(c, "/proc/net/udp6", flags);
|
||||
fwd_scan_ports_udp(&c->udp.fwd_in.f, &c->udp.fwd_out.f,
|
||||
&c->tcp.fwd_in, &c->tcp.fwd_out);
|
||||
}
|
||||
if (c->tcp.fwd_out.mode == FWD_AUTO) {
|
||||
|
@ -222,298 +147,10 @@ void fwd_scan_ports_init(struct ctx *c)
|
|||
c->tcp.fwd_out.scan6 = open("/proc/net/tcp6", flags);
|
||||
fwd_scan_ports_tcp(&c->tcp.fwd_out, &c->tcp.fwd_in);
|
||||
}
|
||||
if (c->udp.fwd_out.mode == FWD_AUTO) {
|
||||
c->udp.fwd_out.scan4 = open("/proc/net/udp", flags);
|
||||
c->udp.fwd_out.scan6 = open("/proc/net/udp6", flags);
|
||||
fwd_scan_ports_udp(&c->udp.fwd_out, &c->udp.fwd_in,
|
||||
if (c->udp.fwd_out.f.mode == FWD_AUTO) {
|
||||
c->udp.fwd_out.f.scan4 = open("/proc/net/udp", flags);
|
||||
c->udp.fwd_out.f.scan6 = open("/proc/net/udp6", flags);
|
||||
fwd_scan_ports_udp(&c->udp.fwd_out.f, &c->udp.fwd_in.f,
|
||||
&c->tcp.fwd_out, &c->tcp.fwd_in);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* is_dns_flow() - Determine if flow appears to be a DNS request
|
||||
* @proto: Protocol (IP L4 protocol number)
|
||||
* @ini: Flow address information of the initiating side
|
||||
*
|
||||
* Return: true if the flow appears to be directed at a dns server, that is a
|
||||
* TCP or UDP flow to port 53 (domain) or port 853 (domain-s)
|
||||
*/
|
||||
static bool is_dns_flow(uint8_t proto, const struct flowside *ini)
|
||||
{
|
||||
return ((proto == IPPROTO_UDP) || (proto == IPPROTO_TCP)) &&
|
||||
((ini->oport == 53) || (ini->oport == 853));
|
||||
}
|
||||
|
||||
/**
|
||||
* fwd_guest_accessible4() - Is IPv4 address guest-accessible
|
||||
* @c: Execution context
|
||||
* @addr: Host visible IPv4 address
|
||||
*
|
||||
* Return: true if @addr on the host is accessible to the guest without
|
||||
* translation, false otherwise
|
||||
*/
|
||||
static bool fwd_guest_accessible4(const struct ctx *c,
|
||||
const struct in_addr *addr)
|
||||
{
|
||||
if (IN4_IS_ADDR_LOOPBACK(addr))
|
||||
return false;
|
||||
|
||||
/* In socket interfaces 0.0.0.0 generally means "any" or unspecified,
|
||||
* however on the wire it can mean "this host on this network". Since
|
||||
* that has a different meaning for host and guest, we can't let it
|
||||
* through untranslated.
|
||||
*/
|
||||
if (IN4_IS_ADDR_UNSPECIFIED(addr))
|
||||
return false;
|
||||
|
||||
/* For IPv4, addr_seen is initialised to addr, so is always a valid
|
||||
* address
|
||||
*/
|
||||
if (IN4_ARE_ADDR_EQUAL(addr, &c->ip4.addr) ||
|
||||
IN4_ARE_ADDR_EQUAL(addr, &c->ip4.addr_seen))
|
||||
return false;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* fwd_guest_accessible6() - Is IPv6 address guest-accessible
|
||||
* @c: Execution context
|
||||
* @addr: Host visible IPv6 address
|
||||
*
|
||||
* Return: true if @addr on the host is accessible to the guest without
|
||||
* translation, false otherwise
|
||||
*/
|
||||
static bool fwd_guest_accessible6(const struct ctx *c,
|
||||
const struct in6_addr *addr)
|
||||
{
|
||||
if (IN6_IS_ADDR_LOOPBACK(addr))
|
||||
return false;
|
||||
|
||||
if (IN6_ARE_ADDR_EQUAL(addr, &c->ip6.addr))
|
||||
return false;
|
||||
|
||||
/* For IPv6, addr_seen starts unspecified, because we don't know what LL
|
||||
* address the guest will take until we see it. Only check against it
|
||||
* if it has been set to a real address.
|
||||
*/
|
||||
if (!IN6_IS_ADDR_UNSPECIFIED(&c->ip6.addr_seen) &&
|
||||
IN6_ARE_ADDR_EQUAL(addr, &c->ip6.addr_seen))
|
||||
return false;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* fwd_guest_accessible() - Is IPv[46] address guest-accessible
|
||||
* @c: Execution context
|
||||
* @addr: Host visible IPv[46] address
|
||||
*
|
||||
* Return: true if @addr on the host is accessible to the guest without
|
||||
* translation, false otherwise
|
||||
*/
|
||||
static bool fwd_guest_accessible(const struct ctx *c,
|
||||
const union inany_addr *addr)
|
||||
{
|
||||
const struct in_addr *a4 = inany_v4(addr);
|
||||
|
||||
if (a4)
|
||||
return fwd_guest_accessible4(c, a4);
|
||||
|
||||
return fwd_guest_accessible6(c, &addr->a6);
|
||||
}
|
||||
|
||||
/**
|
||||
* fwd_nat_from_tap() - Determine how to forward a flow from the tap interface
|
||||
* @c: Execution context
|
||||
* @proto: Protocol (IP L4 protocol number)
|
||||
* @ini: Flow address information of the initiating side
|
||||
* @tgt: Flow address information on the target side (updated)
|
||||
*
|
||||
* Return: pif of the target interface to forward the flow to, PIF_NONE if the
|
||||
* flow cannot or should not be forwarded at all.
|
||||
*/
|
||||
uint8_t fwd_nat_from_tap(const struct ctx *c, uint8_t proto,
|
||||
const struct flowside *ini, struct flowside *tgt)
|
||||
{
|
||||
if (is_dns_flow(proto, ini) &&
|
||||
inany_equals4(&ini->oaddr, &c->ip4.dns_match))
|
||||
tgt->eaddr = inany_from_v4(c->ip4.dns_host);
|
||||
else if (is_dns_flow(proto, ini) &&
|
||||
inany_equals6(&ini->oaddr, &c->ip6.dns_match))
|
||||
tgt->eaddr.a6 = c->ip6.dns_host;
|
||||
else if (inany_equals4(&ini->oaddr, &c->ip4.map_host_loopback))
|
||||
tgt->eaddr = inany_loopback4;
|
||||
else if (inany_equals6(&ini->oaddr, &c->ip6.map_host_loopback))
|
||||
tgt->eaddr = inany_loopback6;
|
||||
else if (inany_equals4(&ini->oaddr, &c->ip4.map_guest_addr))
|
||||
tgt->eaddr = inany_from_v4(c->ip4.addr);
|
||||
else if (inany_equals6(&ini->oaddr, &c->ip6.map_guest_addr))
|
||||
tgt->eaddr.a6 = c->ip6.addr;
|
||||
else
|
||||
tgt->eaddr = ini->oaddr;
|
||||
|
||||
tgt->eport = ini->oport;
|
||||
|
||||
/* The relevant addr_out controls the host side source address. This
|
||||
* may be unspecified, which allows the kernel to pick an address.
|
||||
*/
|
||||
if (inany_v4(&tgt->eaddr))
|
||||
tgt->oaddr = inany_from_v4(c->ip4.addr_out);
|
||||
else
|
||||
tgt->oaddr.a6 = c->ip6.addr_out;
|
||||
|
||||
/* Let the kernel pick a host side source port */
|
||||
tgt->oport = 0;
|
||||
if (proto == IPPROTO_UDP) {
|
||||
/* But for UDP we preserve the source port */
|
||||
tgt->oport = ini->eport;
|
||||
}
|
||||
|
||||
return PIF_HOST;
|
||||
}
|
||||
|
||||
/**
|
||||
* fwd_nat_from_splice() - Determine how to forward a flow from the splice interface
|
||||
* @c: Execution context
|
||||
* @proto: Protocol (IP L4 protocol number)
|
||||
* @ini: Flow address information of the initiating side
|
||||
* @tgt: Flow address information on the target side (updated)
|
||||
*
|
||||
* Return: pif of the target interface to forward the flow to, PIF_NONE if the
|
||||
* flow cannot or should not be forwarded at all.
|
||||
*/
|
||||
uint8_t fwd_nat_from_splice(const struct ctx *c, uint8_t proto,
|
||||
const struct flowside *ini, struct flowside *tgt)
|
||||
{
|
||||
if (!inany_is_loopback(&ini->eaddr) ||
|
||||
(!inany_is_loopback(&ini->oaddr) && !inany_is_unspecified(&ini->oaddr))) {
|
||||
char estr[INANY_ADDRSTRLEN], fstr[INANY_ADDRSTRLEN];
|
||||
|
||||
debug("Non loopback address on %s: [%s]:%hu -> [%s]:%hu",
|
||||
pif_name(PIF_SPLICE),
|
||||
inany_ntop(&ini->eaddr, estr, sizeof(estr)), ini->eport,
|
||||
inany_ntop(&ini->oaddr, fstr, sizeof(fstr)), ini->oport);
|
||||
return PIF_NONE;
|
||||
}
|
||||
|
||||
if (inany_v4(&ini->eaddr))
|
||||
tgt->eaddr = inany_loopback4;
|
||||
else
|
||||
tgt->eaddr = inany_loopback6;
|
||||
|
||||
/* Preserve the specific loopback address used, but let the kernel pick
|
||||
* a source port on the target side
|
||||
*/
|
||||
tgt->oaddr = ini->eaddr;
|
||||
tgt->oport = 0;
|
||||
|
||||
tgt->eport = ini->oport;
|
||||
if (proto == IPPROTO_TCP)
|
||||
tgt->eport += c->tcp.fwd_out.delta[tgt->eport];
|
||||
else if (proto == IPPROTO_UDP)
|
||||
tgt->eport += c->udp.fwd_out.delta[tgt->eport];
|
||||
|
||||
/* Let the kernel pick a host side source port */
|
||||
tgt->oport = 0;
|
||||
if (proto == IPPROTO_UDP)
|
||||
/* But for UDP preserve the source port */
|
||||
tgt->oport = ini->eport;
|
||||
|
||||
return PIF_HOST;
|
||||
}
|
||||
|
||||
/**
|
||||
* fwd_nat_from_host() - Determine how to forward a flow from the host interface
|
||||
* @c: Execution context
|
||||
* @proto: Protocol (IP L4 protocol number)
|
||||
* @ini: Flow address information of the initiating side
|
||||
* @tgt: Flow address information on the target side (updated)
|
||||
*
|
||||
* Return: pif of the target interface to forward the flow to, PIF_NONE if the
|
||||
* flow cannot or should not be forwarded at all.
|
||||
*/
|
||||
uint8_t fwd_nat_from_host(const struct ctx *c, uint8_t proto,
|
||||
const struct flowside *ini, struct flowside *tgt)
|
||||
{
|
||||
/* Common for spliced and non-spliced cases */
|
||||
tgt->eport = ini->oport;
|
||||
if (proto == IPPROTO_TCP)
|
||||
tgt->eport += c->tcp.fwd_in.delta[tgt->eport];
|
||||
else if (proto == IPPROTO_UDP)
|
||||
tgt->eport += c->udp.fwd_in.delta[tgt->eport];
|
||||
|
||||
if (c->mode == MODE_PASTA && inany_is_loopback(&ini->eaddr) &&
|
||||
(proto == IPPROTO_TCP || proto == IPPROTO_UDP)) {
|
||||
/* spliceable */
|
||||
|
||||
/* The traffic will go over the guest's 'lo' interface, but by
|
||||
* default use its external address, so we don't inadvertently
|
||||
* expose services that listen only on the guest's loopback
|
||||
* address. That can be overridden by --host-lo-to-ns-lo which
|
||||
* will instead forward to the loopback address in the guest.
|
||||
*
|
||||
* In either case, let the kernel pick the source address to
|
||||
* match.
|
||||
*/
|
||||
if (inany_v4(&ini->eaddr)) {
|
||||
if (c->host_lo_to_ns_lo)
|
||||
tgt->eaddr = inany_loopback4;
|
||||
else
|
||||
tgt->eaddr = inany_from_v4(c->ip4.addr_seen);
|
||||
tgt->oaddr = inany_any4;
|
||||
} else {
|
||||
if (c->host_lo_to_ns_lo)
|
||||
tgt->eaddr = inany_loopback6;
|
||||
else
|
||||
tgt->eaddr.a6 = c->ip6.addr_seen;
|
||||
tgt->oaddr = inany_any6;
|
||||
}
|
||||
|
||||
/* Let the kernel pick source port */
|
||||
tgt->oport = 0;
|
||||
if (proto == IPPROTO_UDP)
|
||||
/* But for UDP preserve the source port */
|
||||
tgt->oport = ini->eport;
|
||||
|
||||
return PIF_SPLICE;
|
||||
}
|
||||
|
||||
if (!IN4_IS_ADDR_UNSPECIFIED(&c->ip4.map_host_loopback) &&
|
||||
inany_equals4(&ini->eaddr, &in4addr_loopback)) {
|
||||
/* Specifically 127.0.0.1, not 127.0.0.0/8 */
|
||||
tgt->oaddr = inany_from_v4(c->ip4.map_host_loopback);
|
||||
} else if (!IN6_IS_ADDR_UNSPECIFIED(&c->ip6.map_host_loopback) &&
|
||||
inany_equals6(&ini->eaddr, &in6addr_loopback)) {
|
||||
tgt->oaddr.a6 = c->ip6.map_host_loopback;
|
||||
} else if (!IN4_IS_ADDR_UNSPECIFIED(&c->ip4.map_guest_addr) &&
|
||||
inany_equals4(&ini->eaddr, &c->ip4.addr)) {
|
||||
tgt->oaddr = inany_from_v4(c->ip4.map_guest_addr);
|
||||
} else if (!IN6_IS_ADDR_UNSPECIFIED(&c->ip6.map_guest_addr) &&
|
||||
inany_equals6(&ini->eaddr, &c->ip6.addr)) {
|
||||
tgt->oaddr.a6 = c->ip6.map_guest_addr;
|
||||
} else if (!fwd_guest_accessible(c, &ini->eaddr)) {
|
||||
if (inany_v4(&ini->eaddr)) {
|
||||
if (IN4_IS_ADDR_UNSPECIFIED(&c->ip4.our_tap_addr))
|
||||
/* No source address we can use */
|
||||
return PIF_NONE;
|
||||
tgt->oaddr = inany_from_v4(c->ip4.our_tap_addr);
|
||||
} else {
|
||||
tgt->oaddr.a6 = c->ip6.our_tap_ll;
|
||||
}
|
||||
} else {
|
||||
tgt->oaddr = ini->eaddr;
|
||||
}
|
||||
tgt->oport = ini->eport;
|
||||
|
||||
if (inany_v4(&tgt->oaddr)) {
|
||||
tgt->eaddr = inany_from_v4(c->ip4.addr_seen);
|
||||
} else {
|
||||
if (inany_is_linklocal6(&tgt->oaddr))
|
||||
tgt->eaddr.a6 = c->ip6.addr_ll_seen;
|
||||
else
|
||||
tgt->eaddr.a6 = c->ip6.addr_seen;
|
||||
}
|
||||
|
||||
return PIF_TAP;
|
||||
}
|
||||
|
|
13
fwd.h
13
fwd.h
|
@ -7,16 +7,10 @@
|
|||
#ifndef FWD_H
|
||||
#define FWD_H
|
||||
|
||||
struct flowside;
|
||||
|
||||
/* Number of ports for both TCP and UDP */
|
||||
#define NUM_PORTS (1U << 16)
|
||||
|
||||
void fwd_probe_ephemeral(void);
|
||||
bool fwd_port_is_ephemeral(in_port_t port);
|
||||
|
||||
enum fwd_ports_mode {
|
||||
FWD_UNSET = 0,
|
||||
FWD_SPEC = 1,
|
||||
FWD_NONE,
|
||||
FWD_AUTO,
|
||||
|
@ -47,11 +41,4 @@ void fwd_scan_ports_udp(struct fwd_ports *fwd, const struct fwd_ports *rev,
|
|||
const struct fwd_ports *tcp_rev);
|
||||
void fwd_scan_ports_init(struct ctx *c);
|
||||
|
||||
uint8_t fwd_nat_from_tap(const struct ctx *c, uint8_t proto,
|
||||
const struct flowside *ini, struct flowside *tgt);
|
||||
uint8_t fwd_nat_from_splice(const struct ctx *c, uint8_t proto,
|
||||
const struct flowside *ini, struct flowside *tgt);
|
||||
uint8_t fwd_nat_from_host(const struct ctx *c, uint8_t proto,
|
||||
const struct flowside *ini, struct flowside *tgt);
|
||||
|
||||
#endif /* FWD_H */
|
||||
|
|
262
icmp.c
262
icmp.c
|
@ -40,38 +40,36 @@
|
|||
#include "siphash.h"
|
||||
#include "inany.h"
|
||||
#include "icmp.h"
|
||||
#include "flow_table.h"
|
||||
|
||||
#define ICMP_ECHO_TIMEOUT 60 /* s, timeout for ICMP socket activity */
|
||||
#define ICMP_NUM_IDS (1U << 16)
|
||||
|
||||
/**
|
||||
* ping_at_sidx() - Get ping specific flow at given sidx
|
||||
* @sidx: Flow and side to retrieve
|
||||
*
|
||||
* Return: ping specific flow at @sidx, or NULL if @sidx is invalid. Asserts if
|
||||
* the flow at @sidx is not FLOW_PING4 or FLOW_PING6
|
||||
* struct icmp_id_sock - Tracking information for single ICMP echo identifier
|
||||
* @sock: Bound socket for identifier
|
||||
* @seq: Last sequence number sent to tap, host order, -1: not sent yet
|
||||
* @ts: Last associated activity from tap, seconds
|
||||
*/
|
||||
static struct icmp_ping_flow *ping_at_sidx(flow_sidx_t sidx)
|
||||
{
|
||||
union flow *flow = flow_at_sidx(sidx);
|
||||
struct icmp_id_sock {
|
||||
int sock;
|
||||
int seq;
|
||||
time_t ts;
|
||||
};
|
||||
|
||||
if (!flow)
|
||||
return NULL;
|
||||
|
||||
ASSERT(flow->f.type == FLOW_PING4 || flow->f.type == FLOW_PING6);
|
||||
return &flow->ping;
|
||||
}
|
||||
/* Indexed by ICMP echo identifier */
|
||||
static struct icmp_id_sock icmp_id_map[IP_VERSIONS][ICMP_NUM_IDS];
|
||||
|
||||
/**
|
||||
* icmp_sock_handler() - Handle new data from ICMP or ICMPv6 socket
|
||||
* @c: Execution context
|
||||
* @af: Address family (AF_INET or AF_INET6)
|
||||
* @ref: epoll reference
|
||||
*/
|
||||
void icmp_sock_handler(const struct ctx *c, union epoll_ref ref)
|
||||
void icmp_sock_handler(const struct ctx *c, sa_family_t af, union epoll_ref ref)
|
||||
{
|
||||
struct icmp_ping_flow *pingf = ping_at_sidx(ref.flowside);
|
||||
const struct flowside *ini = &pingf->f.side[INISIDE];
|
||||
struct icmp_id_sock *const id_sock = af == AF_INET
|
||||
? &icmp_id_map[V4][ref.icmp.id] : &icmp_id_map[V6][ref.icmp.id];
|
||||
const char *const pname = af == AF_INET ? "ICMP" : "ICMPv6";
|
||||
union sockaddr_inany sr;
|
||||
socklen_t sl = sizeof(sr);
|
||||
char buf[USHRT_MAX];
|
||||
|
@ -81,33 +79,33 @@ void icmp_sock_handler(const struct ctx *c, union epoll_ref ref)
|
|||
if (c->no_icmp)
|
||||
return;
|
||||
|
||||
ASSERT(pingf);
|
||||
|
||||
n = recvfrom(ref.fd, buf, sizeof(buf), 0, &sr.sa, &sl);
|
||||
if (n < 0) {
|
||||
flow_err(pingf, "recvfrom() error: %s", strerror(errno));
|
||||
warn("%s: recvfrom() error on ping socket: %s",
|
||||
pname, strerror(errno));
|
||||
return;
|
||||
}
|
||||
if (sr.sa_family != af)
|
||||
goto unexpected;
|
||||
|
||||
if (pingf->f.type == FLOW_PING4) {
|
||||
if (af == AF_INET) {
|
||||
struct icmphdr *ih4 = (struct icmphdr *)buf;
|
||||
|
||||
if (sr.sa_family != AF_INET || (size_t)n < sizeof(*ih4) ||
|
||||
ih4->type != ICMP_ECHOREPLY)
|
||||
if ((size_t)n < sizeof(*ih4) || ih4->type != ICMP_ECHOREPLY)
|
||||
goto unexpected;
|
||||
|
||||
/* Adjust packet back to guest-side ID */
|
||||
ih4->un.echo.id = htons(ini->eport);
|
||||
ih4->un.echo.id = htons(ref.icmp.id);
|
||||
seq = ntohs(ih4->un.echo.sequence);
|
||||
} else if (pingf->f.type == FLOW_PING6) {
|
||||
} else if (af == AF_INET6) {
|
||||
struct icmp6hdr *ih6 = (struct icmp6hdr *)buf;
|
||||
|
||||
if (sr.sa_family != AF_INET6 || (size_t)n < sizeof(*ih6) ||
|
||||
if ((size_t)n < sizeof(*ih6) ||
|
||||
ih6->icmp6_type != ICMPV6_ECHO_REPLY)
|
||||
goto unexpected;
|
||||
|
||||
/* Adjust packet back to guest-side ID */
|
||||
ih6->icmp6_identifier = htons(ini->eport);
|
||||
ih6->icmp6_identifier = htons(ref.icmp.id);
|
||||
seq = ntohs(ih6->icmp6_sequence);
|
||||
} else {
|
||||
ASSERT(0);
|
||||
|
@ -115,111 +113,87 @@ void icmp_sock_handler(const struct ctx *c, union epoll_ref ref)
|
|||
|
||||
/* In PASTA mode, we'll get any reply we send, discard them. */
|
||||
if (c->mode == MODE_PASTA) {
|
||||
if (pingf->seq == seq)
|
||||
if (id_sock->seq == seq)
|
||||
return;
|
||||
|
||||
pingf->seq = seq;
|
||||
id_sock->seq = seq;
|
||||
}
|
||||
|
||||
flow_dbg(pingf, "echo reply to tap, ID: %"PRIu16", seq: %"PRIu16,
|
||||
ini->eport, seq);
|
||||
|
||||
if (pingf->f.type == FLOW_PING4) {
|
||||
const struct in_addr *saddr = inany_v4(&ini->oaddr);
|
||||
const struct in_addr *daddr = inany_v4(&ini->eaddr);
|
||||
|
||||
ASSERT(saddr && daddr); /* Must have IPv4 addresses */
|
||||
tap_icmp4_send(c, *saddr, *daddr, buf, n);
|
||||
} else if (pingf->f.type == FLOW_PING6) {
|
||||
const struct in6_addr *saddr = &ini->oaddr.a6;
|
||||
const struct in6_addr *daddr = &ini->eaddr.a6;
|
||||
|
||||
tap_icmp6_send(c, saddr, daddr, buf, n);
|
||||
}
|
||||
debug("%s: echo reply to tap, ID: %"PRIu16", seq: %"PRIu16, pname,
|
||||
ref.icmp.id, seq);
|
||||
if (af == AF_INET)
|
||||
tap_icmp4_send(c, sr.sa4.sin_addr, tap_ip4_daddr(c), buf, n);
|
||||
else if (af == AF_INET6)
|
||||
tap_icmp6_send(c, &sr.sa6.sin6_addr,
|
||||
tap_ip6_daddr(c, &sr.sa6.sin6_addr), buf, n);
|
||||
return;
|
||||
|
||||
unexpected:
|
||||
flow_err(pingf, "Unexpected packet on ping socket");
|
||||
warn("%s: Unexpected packet on ping socket", pname);
|
||||
}
|
||||
|
||||
/**
|
||||
* icmp_ping_close() - Close and clean up a ping flow
|
||||
* icmp_ping_close() - Close and clean up a ping socket
|
||||
* @c: Execution context
|
||||
* @pingf: ping flow entry to close
|
||||
* @id_sock: Socket number and other info
|
||||
*/
|
||||
static void icmp_ping_close(const struct ctx *c,
|
||||
const struct icmp_ping_flow *pingf)
|
||||
static void icmp_ping_close(const struct ctx *c, struct icmp_id_sock *id_sock)
|
||||
{
|
||||
epoll_ctl(c->epollfd, EPOLL_CTL_DEL, pingf->sock, NULL);
|
||||
close(pingf->sock);
|
||||
flow_hash_remove(c, FLOW_SIDX(pingf, INISIDE));
|
||||
epoll_ctl(c->epollfd, EPOLL_CTL_DEL, id_sock->sock, NULL);
|
||||
close(id_sock->sock);
|
||||
id_sock->sock = -1;
|
||||
id_sock->seq = -1;
|
||||
}
|
||||
|
||||
/**
|
||||
* icmp_ping_new() - Prepare a new ping socket for a new id
|
||||
* @c: Execution context
|
||||
* @id_sock: Socket fd and other information
|
||||
* @af: Address family, AF_INET or AF_INET6
|
||||
* @id: ICMP id for the new socket
|
||||
* @saddr: Source address
|
||||
* @daddr: Destination address
|
||||
*
|
||||
* Return: Newly opened ping flow, or NULL on failure
|
||||
* Return: Newly opened ping socket fd, or -1 on failure
|
||||
*/
|
||||
static struct icmp_ping_flow *icmp_ping_new(const struct ctx *c,
|
||||
sa_family_t af, uint16_t id,
|
||||
const void *saddr, const void *daddr)
|
||||
static int icmp_ping_new(const struct ctx *c, struct icmp_id_sock *id_sock,
|
||||
sa_family_t af, uint16_t id)
|
||||
{
|
||||
uint8_t proto = af == AF_INET ? IPPROTO_ICMP : IPPROTO_ICMPV6;
|
||||
uint8_t flowtype = af == AF_INET ? FLOW_PING4 : FLOW_PING6;
|
||||
union epoll_ref ref = { .type = EPOLL_TYPE_PING };
|
||||
union flow *flow = flow_alloc();
|
||||
struct icmp_ping_flow *pingf;
|
||||
const struct flowside *tgt;
|
||||
const char *const pname = af == AF_INET ? "ICMP" : "ICMPv6";
|
||||
union icmp_epoll_ref iref = { .id = id };
|
||||
const void *bind_addr;
|
||||
const char *bind_if;
|
||||
int s;
|
||||
|
||||
if (!flow)
|
||||
return NULL;
|
||||
|
||||
flow_initiate_af(flow, PIF_TAP, af, saddr, id, daddr, id);
|
||||
if (!(tgt = flow_target(c, flow, proto)))
|
||||
goto cancel;
|
||||
|
||||
if (flow->f.pif[TGTSIDE] != PIF_HOST) {
|
||||
flow_err(flow, "No support for forwarding %s from %s to %s",
|
||||
proto == IPPROTO_ICMP ? "ICMP" : "ICMPv6",
|
||||
pif_name(flow->f.pif[INISIDE]),
|
||||
pif_name(flow->f.pif[TGTSIDE]));
|
||||
goto cancel;
|
||||
if (af == AF_INET) {
|
||||
bind_addr = &c->ip4.addr_out;
|
||||
bind_if = c->ip4.ifname_out;
|
||||
} else {
|
||||
bind_addr = &c->ip6.addr_out;
|
||||
bind_if = c->ip6.ifname_out;
|
||||
}
|
||||
|
||||
pingf = FLOW_SET_TYPE(flow, flowtype, ping);
|
||||
s = sock_l4(c, af, proto, bind_addr, bind_if, 0, iref.u32);
|
||||
|
||||
pingf->seq = -1;
|
||||
|
||||
ref.flowside = FLOW_SIDX(flow, TGTSIDE);
|
||||
pingf->sock = flowside_sock_l4(c, EPOLL_TYPE_PING, PIF_HOST,
|
||||
tgt, ref.data);
|
||||
|
||||
if (pingf->sock < 0) {
|
||||
if (s < 0) {
|
||||
warn("Cannot open \"ping\" socket. You might need to:");
|
||||
warn(" sysctl -w net.ipv4.ping_group_range=\"0 2147483647\"");
|
||||
warn("...echo requests/replies will fail.");
|
||||
goto cancel;
|
||||
}
|
||||
|
||||
if (pingf->sock > FD_REF_MAX)
|
||||
if (s > FD_REF_MAX)
|
||||
goto cancel;
|
||||
|
||||
flow_dbg(pingf, "new socket %i for echo ID %"PRIu16, pingf->sock, id);
|
||||
id_sock->sock = s;
|
||||
|
||||
flow_hash_insert(c, FLOW_SIDX(pingf, INISIDE));
|
||||
debug("%s: new socket %i for echo ID %"PRIu16, pname, s, id);
|
||||
|
||||
FLOW_ACTIVATE(pingf);
|
||||
|
||||
return pingf;
|
||||
return s;
|
||||
|
||||
cancel:
|
||||
flow_alloc_cancel(flow);
|
||||
return NULL;
|
||||
if (s >= 0)
|
||||
close(s);
|
||||
return -1;
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -238,93 +212,111 @@ int icmp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af,
|
|||
const void *saddr, const void *daddr,
|
||||
const struct pool *p, const struct timespec *now)
|
||||
{
|
||||
struct icmp_ping_flow *pingf;
|
||||
const struct flowside *tgt;
|
||||
union sockaddr_inany sa;
|
||||
size_t dlen, l4len;
|
||||
const char *const pname = af == AF_INET ? "ICMP" : "ICMPv6";
|
||||
union sockaddr_inany sa = { .sa_family = af };
|
||||
const socklen_t sl = af == AF_INET ? sizeof(sa.sa4) : sizeof(sa.sa6);
|
||||
struct icmp_id_sock *id_sock;
|
||||
uint16_t id, seq;
|
||||
union flow *flow;
|
||||
uint8_t proto;
|
||||
socklen_t sl;
|
||||
size_t plen;
|
||||
void *pkt;
|
||||
int s;
|
||||
|
||||
(void)saddr;
|
||||
ASSERT(pif == PIF_TAP);
|
||||
(void)pif;
|
||||
|
||||
if (af == AF_INET) {
|
||||
const struct icmphdr *ih;
|
||||
|
||||
if (!(pkt = packet_get(p, 0, 0, sizeof(*ih), &dlen)))
|
||||
if (!(pkt = packet_get(p, 0, 0, sizeof(*ih), &plen)))
|
||||
return 1;
|
||||
|
||||
ih = (struct icmphdr *)pkt;
|
||||
l4len = dlen + sizeof(*ih);
|
||||
plen += sizeof(*ih);
|
||||
|
||||
if (ih->type != ICMP_ECHO)
|
||||
return 1;
|
||||
|
||||
proto = IPPROTO_ICMP;
|
||||
id = ntohs(ih->un.echo.id);
|
||||
id_sock = &icmp_id_map[V4][id];
|
||||
seq = ntohs(ih->un.echo.sequence);
|
||||
sa.sa4.sin_addr = *(struct in_addr *)daddr;
|
||||
} else if (af == AF_INET6) {
|
||||
const struct icmp6hdr *ih;
|
||||
|
||||
if (!(pkt = packet_get(p, 0, 0, sizeof(*ih), &dlen)))
|
||||
if (!(pkt = packet_get(p, 0, 0, sizeof(*ih), &plen)))
|
||||
return 1;
|
||||
|
||||
ih = (struct icmp6hdr *)pkt;
|
||||
l4len = dlen + sizeof(*ih);
|
||||
plen += sizeof(*ih);
|
||||
|
||||
if (ih->icmp6_type != ICMPV6_ECHO_REQUEST)
|
||||
return 1;
|
||||
|
||||
proto = IPPROTO_ICMPV6;
|
||||
id = ntohs(ih->icmp6_identifier);
|
||||
id_sock = &icmp_id_map[V6][id];
|
||||
seq = ntohs(ih->icmp6_sequence);
|
||||
sa.sa6.sin6_addr = *(struct in6_addr *)daddr;
|
||||
sa.sa6.sin6_scope_id = c->ifi6;
|
||||
} else {
|
||||
ASSERT(0);
|
||||
}
|
||||
|
||||
flow = flow_at_sidx(flow_lookup_af(c, proto, PIF_TAP,
|
||||
af, saddr, daddr, id, id));
|
||||
if ((s = id_sock->sock) < 0)
|
||||
if ((s = icmp_ping_new(c, id_sock, af, id)) < 0)
|
||||
return 1;
|
||||
|
||||
if (flow)
|
||||
pingf = &flow->ping;
|
||||
else if (!(pingf = icmp_ping_new(c, af, id, saddr, daddr)))
|
||||
return 1;
|
||||
id_sock->ts = now->tv_sec;
|
||||
|
||||
tgt = &pingf->f.side[TGTSIDE];
|
||||
|
||||
ASSERT(flow_proto[pingf->f.type] == proto);
|
||||
pingf->ts = now->tv_sec;
|
||||
|
||||
pif_sockaddr(c, &sa, &sl, PIF_HOST, &tgt->eaddr, 0);
|
||||
if (sendto(pingf->sock, pkt, l4len, MSG_NOSIGNAL, &sa.sa, sl) < 0) {
|
||||
flow_dbg(pingf, "failed to relay request to socket: %s",
|
||||
strerror(errno));
|
||||
if (sendto(s, pkt, plen, MSG_NOSIGNAL, &sa.sa, sl) < 0) {
|
||||
debug("%s: failed to relay request to socket: %s",
|
||||
pname, strerror(errno));
|
||||
} else {
|
||||
flow_dbg(pingf,
|
||||
"echo request to socket, ID: %"PRIu16", seq: %"PRIu16,
|
||||
id, seq);
|
||||
debug("%s: echo request to socket, ID: %"PRIu16", seq: %"PRIu16,
|
||||
pname, id, seq);
|
||||
}
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
||||
/**
|
||||
* icmp_ping_timer() - Handler for timed events related to a given flow
|
||||
* icmp_timer_one() - Handler for timed events related to a given identifier
|
||||
* @c: Execution context
|
||||
* @pingf: Ping flow to check for timeout
|
||||
* @id_sock: Socket fd and activity timestamp
|
||||
* @now: Current timestamp
|
||||
*
|
||||
* Return: true if the flow is ready to free, false otherwise
|
||||
*/
|
||||
bool icmp_ping_timer(const struct ctx *c, const struct icmp_ping_flow *pingf,
|
||||
const struct timespec *now)
|
||||
static void icmp_timer_one(const struct ctx *c, struct icmp_id_sock *id_sock,
|
||||
const struct timespec *now)
|
||||
{
|
||||
if (now->tv_sec - pingf->ts <= ICMP_ECHO_TIMEOUT)
|
||||
return false;
|
||||
if (id_sock->sock < 0 || now->tv_sec - id_sock->ts <= ICMP_ECHO_TIMEOUT)
|
||||
return;
|
||||
|
||||
icmp_ping_close(c, pingf);
|
||||
return true;
|
||||
icmp_ping_close(c, id_sock);
|
||||
}
|
||||
|
||||
/**
|
||||
* icmp_timer() - Scan ID map for identifiers with timed events
|
||||
* @c: Execution context
|
||||
* @now: Current timestamp
|
||||
*/
|
||||
void icmp_timer(const struct ctx *c, const struct timespec *now)
|
||||
{
|
||||
unsigned int i;
|
||||
|
||||
for (i = 0; i < ICMP_NUM_IDS; i++) {
|
||||
icmp_timer_one(c, &icmp_id_map[V4][i], now);
|
||||
icmp_timer_one(c, &icmp_id_map[V6][i], now);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* icmp_init() - Initialise sequences in ID map to -1 (no sequence sent yet)
|
||||
*/
|
||||
void icmp_init(void)
|
||||
{
|
||||
unsigned i;
|
||||
|
||||
for (i = 0; i < ICMP_NUM_IDS; i++) {
|
||||
icmp_id_map[V4][i].seq = icmp_id_map[V6][i].seq = -1;
|
||||
icmp_id_map[V4][i].sock = icmp_id_map[V6][i].sock = -1;
|
||||
}
|
||||
}
|
||||
|
|
15
icmp.h
15
icmp.h
|
@ -9,14 +9,25 @@
|
|||
#define ICMP_TIMER_INTERVAL 10000 /* ms */
|
||||
|
||||
struct ctx;
|
||||
struct icmp_ping_flow;
|
||||
|
||||
void icmp_sock_handler(const struct ctx *c, union epoll_ref ref);
|
||||
void icmp_sock_handler(const struct ctx *c, sa_family_t af, union epoll_ref ref);
|
||||
int icmp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af,
|
||||
const void *saddr, const void *daddr,
|
||||
const struct pool *p, const struct timespec *now);
|
||||
void icmp_timer(const struct ctx *c, const struct timespec *now);
|
||||
void icmp_init(void);
|
||||
|
||||
/**
|
||||
* union icmp_epoll_ref - epoll reference portion for ICMP tracking
|
||||
* @v6: Set for IPv6 sockets or connections
|
||||
* @u32: Opaque u32 value of reference
|
||||
* @id: Associated echo identifier, needed if bind() fails
|
||||
*/
|
||||
union icmp_epoll_ref {
|
||||
uint16_t id;
|
||||
uint32_t u32;
|
||||
};
|
||||
|
||||
/**
|
||||
* struct icmp_ctx - Execution context for ICMP routines
|
||||
* @timer_run: Timestamp of most recent timer run
|
||||
|
|
29
icmp_flow.h
29
icmp_flow.h
|
@ -1,29 +0,0 @@
|
|||
/* SPDX-License-Identifier: GPL-2.0-or-later
|
||||
* Copyright Red Hat
|
||||
* Author: David Gibson <david@gibson.dropbear.id.au>
|
||||
*
|
||||
* ICMP flow tracking data structures
|
||||
*/
|
||||
#ifndef ICMP_FLOW_H
|
||||
#define ICMP_FLOW_H
|
||||
|
||||
/**
|
||||
* struct icmp_ping_flow - Descriptor for a flow of ping requests/replies
|
||||
* @f: Generic flow information
|
||||
* @seq: Last sequence number sent to tap, host order, -1: not sent yet
|
||||
* @sock: "ping" socket
|
||||
* @ts: Last associated activity from tap, seconds
|
||||
*/
|
||||
struct icmp_ping_flow {
|
||||
/* Must be first element */
|
||||
struct flow_common f;
|
||||
|
||||
int seq;
|
||||
int sock;
|
||||
time_t ts;
|
||||
};
|
||||
|
||||
bool icmp_ping_timer(const struct ctx *c, const struct icmp_ping_flow *pingf,
|
||||
const struct timespec *now);
|
||||
|
||||
#endif /* ICMP_FLOW_H */
|
37
inany.c
37
inany.c
|
@ -17,8 +17,21 @@
|
|||
#include "siphash.h"
|
||||
#include "inany.h"
|
||||
|
||||
const union inany_addr inany_loopback4 = INANY_INIT4(IN4ADDR_LOOPBACK_INIT);
|
||||
const union inany_addr inany_any4 = INANY_INIT4(IN4ADDR_ANY_INIT);
|
||||
const union inany_addr inany_loopback4 = {
|
||||
.v4mapped = {
|
||||
.zero = { 0 },
|
||||
.one = { 0xff, 0xff, },
|
||||
.a4 = IN4ADDR_LOOPBACK_INIT,
|
||||
},
|
||||
};
|
||||
|
||||
const union inany_addr inany_any4 = {
|
||||
.v4mapped = {
|
||||
.zero = { 0 },
|
||||
.one = { 0xff, 0xff, },
|
||||
.a4 = IN4ADDR_ANY_INIT,
|
||||
},
|
||||
};
|
||||
|
||||
/** inany_ntop - Convert an IPv[46] address to text format
|
||||
* @src: IPv[46] address
|
||||
|
@ -36,23 +49,3 @@ const char *inany_ntop(const union inany_addr *src, char *dst, socklen_t size)
|
|||
|
||||
return inet_ntop(AF_INET6, &src->a6, dst, size);
|
||||
}
|
||||
|
||||
/** inany_pton - Parse an IPv[46] address from text format
|
||||
* @src: IPv[46] address
|
||||
* @dst: output buffer, filled with parsed address
|
||||
*
|
||||
* Return: On success, 1, if no parseable address is found, 0
|
||||
*/
|
||||
int inany_pton(const char *src, union inany_addr *dst)
|
||||
{
|
||||
if (inet_pton(AF_INET, src, &dst->v4mapped.a4)) {
|
||||
memset(&dst->v4mapped.zero, 0, sizeof(dst->v4mapped.zero));
|
||||
memset(&dst->v4mapped.one, 0xff, sizeof(dst->v4mapped.one));
|
||||
return 1;
|
||||
}
|
||||
|
||||
if (inet_pton(AF_INET6, src, &dst->a6))
|
||||
return 1;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
|
98
inany.h
98
inany.h
|
@ -43,17 +43,6 @@ extern const union inany_addr inany_any4;
|
|||
#define in4addr_loopback (inany_loopback4.v4mapped.a4)
|
||||
#define in4addr_any (inany_any4.v4mapped.a4)
|
||||
|
||||
#define INANY_INIT4(a4init) { \
|
||||
.v4mapped = { \
|
||||
.zero = { 0 }, \
|
||||
.one = { 0xff, 0xff }, \
|
||||
.a4 = a4init, \
|
||||
}, \
|
||||
}
|
||||
|
||||
#define inany_from_v4(a4) \
|
||||
((union inany_addr)INANY_INIT4((a4)))
|
||||
|
||||
/** union sockaddr_inany - Either a sockaddr_in or a sockaddr_in6
|
||||
* @sa_family: Address family, AF_INET or AF_INET6
|
||||
* @sa: Plain struct sockaddr (useful to avoid casts)
|
||||
|
@ -90,84 +79,16 @@ static inline bool inany_equals(const union inany_addr *a,
|
|||
return IN6_ARE_ADDR_EQUAL(&a->a6, &b->a6);
|
||||
}
|
||||
|
||||
/** inany_equals4 - Compare an IPv[46] address to an IPv4 address
|
||||
* @a: IPv[46] addresses
|
||||
* @b: IPv4 address
|
||||
*
|
||||
* Return: true if @a and @b are the same address
|
||||
*/
|
||||
static inline bool inany_equals4(const union inany_addr *a,
|
||||
const struct in_addr *b)
|
||||
{
|
||||
const struct in_addr *a4 = inany_v4(a);
|
||||
|
||||
return a4 && IN4_ARE_ADDR_EQUAL(a4, b);
|
||||
}
|
||||
|
||||
/** inany_equals6 - Compare an IPv[46] address to an IPv6 address
|
||||
* @a: IPv[46] addresses
|
||||
* @b: IPv6 address
|
||||
*
|
||||
* Return: true if @a and @b are the same address
|
||||
*/
|
||||
static inline bool inany_equals6(const union inany_addr *a,
|
||||
const struct in6_addr *b)
|
||||
{
|
||||
return IN6_ARE_ADDR_EQUAL(&a->a6, b);
|
||||
}
|
||||
|
||||
/** inany_is_loopback4() - Check if address is IPv4 loopback
|
||||
* @a: IPv[46] address
|
||||
*
|
||||
* Return: true if @a is in 127.0.0.0/8
|
||||
*/
|
||||
static inline bool inany_is_loopback4(const union inany_addr *a)
|
||||
{
|
||||
const struct in_addr *v4 = inany_v4(a);
|
||||
|
||||
return v4 && IN4_IS_ADDR_LOOPBACK(v4);
|
||||
}
|
||||
|
||||
/** inany_is_loopback6() - Check if address is IPv6 loopback
|
||||
* @a: IPv[46] address
|
||||
*
|
||||
* Return: true if @a is ::1
|
||||
*/
|
||||
static inline bool inany_is_loopback6(const union inany_addr *a)
|
||||
{
|
||||
return IN6_IS_ADDR_LOOPBACK(&a->a6);
|
||||
}
|
||||
|
||||
/** inany_is_loopback() - Check if address is loopback
|
||||
* @a: IPv[46] address
|
||||
*
|
||||
* Return: true if @a is either ::1 or in 127.0.0.0/8
|
||||
*/
|
||||
static inline bool inany_is_loopback(const union inany_addr *a)
|
||||
{
|
||||
return inany_is_loopback4(a) || inany_is_loopback6(a);
|
||||
}
|
||||
|
||||
/** inany_is_unspecified4() - Check if address is unspecified IPv4
|
||||
* @a: IPv[46] address
|
||||
*
|
||||
* Return: true if @a is 0.0.0.0
|
||||
*/
|
||||
static inline bool inany_is_unspecified4(const union inany_addr *a)
|
||||
{
|
||||
const struct in_addr *v4 = inany_v4(a);
|
||||
|
||||
return v4 && IN4_IS_ADDR_UNSPECIFIED(v4);
|
||||
}
|
||||
|
||||
/** inany_is_unspecified6() - Check if address is unspecified IPv6
|
||||
* @a: IPv[46] address
|
||||
*
|
||||
* Return: true if @a is ::
|
||||
*/
|
||||
static inline bool inany_is_unspecified6(const union inany_addr *a)
|
||||
{
|
||||
return IN6_IS_ADDR_UNSPECIFIED(&a->a6);
|
||||
return IN6_IS_ADDR_LOOPBACK(&a->a6) || (v4 && IN4_IS_ADDR_LOOPBACK(v4));
|
||||
}
|
||||
|
||||
/** inany_is_unspecified() - Check if address is unspecified
|
||||
|
@ -177,19 +98,10 @@ static inline bool inany_is_unspecified6(const union inany_addr *a)
|
|||
*/
|
||||
static inline bool inany_is_unspecified(const union inany_addr *a)
|
||||
{
|
||||
return inany_is_unspecified4(a) || inany_is_unspecified6(a);
|
||||
}
|
||||
const struct in_addr *v4 = inany_v4(a);
|
||||
|
||||
/* FIXME: consider handling of IPv4 link-local addresses */
|
||||
|
||||
/** inany_is_linklocal6() - Check if address is link-local IPv6
|
||||
* @a: IPv[46] address
|
||||
*
|
||||
* Return: true if @a is in fe80::/10 (IPv6 link local unicast)
|
||||
*/
|
||||
static inline bool inany_is_linklocal6(const union inany_addr *a)
|
||||
{
|
||||
return IN6_IS_ADDR_LINKLOCAL(&a->a6);
|
||||
return IN6_IS_ADDR_UNSPECIFIED(&a->a6) ||
|
||||
(v4 && IN4_IS_ADDR_UNSPECIFIED(v4));
|
||||
}
|
||||
|
||||
/** inany_is_multicast() - Check if address is multicast or broadcast
|
||||
|
@ -211,6 +123,7 @@ static inline bool inany_is_multicast(const union inany_addr *a)
|
|||
*
|
||||
* Return: true if @a is specified and a unicast address
|
||||
*/
|
||||
/* cppcheck-suppress unusedFunction */
|
||||
static inline bool inany_is_unicast(const union inany_addr *a)
|
||||
{
|
||||
return !inany_is_unspecified(a) && !inany_is_multicast(a);
|
||||
|
@ -270,6 +183,5 @@ static inline void inany_siphash_feed(struct siphash_state *state,
|
|||
#define INANY_ADDRSTRLEN MAX(INET_ADDRSTRLEN, INET6_ADDRSTRLEN)
|
||||
|
||||
const char *inany_ntop(const union inany_addr *src, char *dst, socklen_t size);
|
||||
int inany_pton(const char *src, union inany_addr *dst);
|
||||
|
||||
#endif /* INANY_H */
|
||||
|
|
3
iov.h
3
iov.h
|
@ -18,9 +18,6 @@
|
|||
#include <unistd.h>
|
||||
#include <string.h>
|
||||
|
||||
#define IOV_OF_LVALUE(lval) \
|
||||
(struct iovec){ .iov_base = &(lval), .iov_len = sizeof(lval) }
|
||||
|
||||
size_t iov_skip_bytes(const struct iovec *iov, size_t n,
|
||||
size_t skip, size_t *offset);
|
||||
size_t iov_from_buf(const struct iovec *iov, size_t iov_cnt,
|
||||
|
|
11
ip.h
11
ip.h
|
@ -24,11 +24,6 @@
|
|||
#define IN4ADDR_ANY_INIT \
|
||||
{ .s_addr = htonl_constant(INADDR_ANY) }
|
||||
|
||||
#define IN4_IS_ADDR_LINKLOCAL(a) \
|
||||
((ntohl(((struct in_addr *)(a))->s_addr) >> 16) == 0xa9fe)
|
||||
#define IN4_IS_PREFIX_LINKLOCAL(a, len) \
|
||||
((len) >= 16 && IN4_IS_ADDR_LINKLOCAL(a))
|
||||
|
||||
#define L2_BUF_IP4_INIT(proto) \
|
||||
{ \
|
||||
.version = 4, \
|
||||
|
@ -43,11 +38,7 @@
|
|||
.daddr = 0, \
|
||||
}
|
||||
#define L2_BUF_IP4_PSUM(proto) ((uint32_t)htons_constant(0x4500) + \
|
||||
(uint32_t)htons(0xff00 | (proto)))
|
||||
|
||||
|
||||
#define IN6_IS_PREFIX_LINKLOCAL(a, len) \
|
||||
((len) >= 10 && IN6_IS_ADDR_LINKLOCAL(a))
|
||||
(uint32_t)htons_constant(0xff00 | (proto)))
|
||||
|
||||
#define L2_BUF_IP6_INIT(proto) \
|
||||
{ \
|
||||
|
|
58
isolation.c
58
isolation.c
|
@ -29,8 +29,7 @@
|
|||
*
|
||||
* Executed immediately after startup, drops capabilities we don't
|
||||
* need at any point during execution (or which we gain back when we
|
||||
* need by joining other namespaces), and closes any leaked file we
|
||||
* might have inherited from the parent process.
|
||||
* need by joining other namespaces).
|
||||
*
|
||||
* 2. isolate_user()
|
||||
* =================
|
||||
|
@ -106,7 +105,7 @@ static void drop_caps_ep_except(uint64_t keep)
|
|||
int i;
|
||||
|
||||
if (syscall(SYS_capget, &hdr, data))
|
||||
die_perror("Couldn't get current capabilities");
|
||||
die("Couldn't get current capabilities: %s", strerror(errno));
|
||||
|
||||
for (i = 0; i < CAP_WORDS; i++) {
|
||||
uint32_t mask = keep >> (32 * i);
|
||||
|
@ -116,7 +115,7 @@ static void drop_caps_ep_except(uint64_t keep)
|
|||
}
|
||||
|
||||
if (syscall(SYS_capset, &hdr, data))
|
||||
die_perror("Couldn't drop capabilities");
|
||||
die("Couldn't drop capabilities: %s", strerror(errno));
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -153,31 +152,30 @@ static void clamp_caps(void)
|
|||
*/
|
||||
if (prctl(PR_CAPBSET_DROP, i, 0, 0, 0) &&
|
||||
errno != EINVAL && errno != EPERM)
|
||||
die_perror("Couldn't drop cap %i from bounding set", i);
|
||||
die("Couldn't drop cap %i from bounding set: %s",
|
||||
i, strerror(errno));
|
||||
}
|
||||
|
||||
if (syscall(SYS_capget, &hdr, data))
|
||||
die_perror("Couldn't get current capabilities");
|
||||
die("Couldn't get current capabilities: %s", strerror(errno));
|
||||
|
||||
for (i = 0; i < CAP_WORDS; i++)
|
||||
data[i].inheritable = 0;
|
||||
|
||||
if (syscall(SYS_capset, &hdr, data))
|
||||
die_perror("Couldn't drop inheritable capabilities");
|
||||
die("Couldn't drop inheritable capabilities: %s",
|
||||
strerror(errno));
|
||||
}
|
||||
|
||||
/**
|
||||
* isolate_initial() - Early, mostly config independent self isolation
|
||||
* @argc: Argument count
|
||||
* @argv: Command line options: only --fd (if present) is relevant here
|
||||
* isolate_initial() - Early, config independent self isolation
|
||||
*
|
||||
* Should:
|
||||
* - drop unneeded capabilities
|
||||
* - close all open files except for standard streams and the one from --fd
|
||||
* Mustn't:
|
||||
* - remove filesystem access (we need to access files during setup)
|
||||
*/
|
||||
void isolate_initial(int argc, char **argv)
|
||||
void isolate_initial(void)
|
||||
{
|
||||
uint64_t keep;
|
||||
|
||||
|
@ -211,8 +209,6 @@ void isolate_initial(int argc, char **argv)
|
|||
keep |= BIT(CAP_SETFCAP) | BIT(CAP_SYS_PTRACE);
|
||||
|
||||
drop_caps_ep_except(keep);
|
||||
|
||||
close_open_files(argc, argv);
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -238,30 +234,34 @@ void isolate_user(uid_t uid, gid_t gid, bool use_userns, const char *userns,
|
|||
if (setgroups(0, NULL)) {
|
||||
/* If we don't have CAP_SETGID, this will EPERM */
|
||||
if (errno != EPERM)
|
||||
die_perror("Can't drop supplementary groups");
|
||||
die("Can't drop supplementary groups: %s",
|
||||
strerror(errno));
|
||||
}
|
||||
|
||||
if (setgid(gid) != 0)
|
||||
die_perror("Can't set GID to %u", gid);
|
||||
die("Can't set GID to %u: %s", gid, strerror(errno));
|
||||
|
||||
if (setuid(uid) != 0)
|
||||
die_perror("Can't set UID to %u", uid);
|
||||
die("Can't set UID to %u: %s", uid, strerror(errno));
|
||||
|
||||
if (*userns) { /* If given a userns, join it */
|
||||
int ufd;
|
||||
|
||||
ufd = open(userns, O_RDONLY | O_CLOEXEC);
|
||||
if (ufd < 0)
|
||||
die_perror("Couldn't open user namespace %s", userns);
|
||||
die("Couldn't open user namespace %s: %s",
|
||||
userns, strerror(errno));
|
||||
|
||||
if (setns(ufd, CLONE_NEWUSER) != 0)
|
||||
die_perror("Couldn't enter user namespace %s", userns);
|
||||
die("Couldn't enter user namespace %s: %s",
|
||||
userns, strerror(errno));
|
||||
|
||||
close(ufd);
|
||||
|
||||
} else if (use_userns) { /* Create and join a new userns */
|
||||
if (unshare(CLONE_NEWUSER) != 0)
|
||||
die_perror("Couldn't create user namespace");
|
||||
die("Couldn't create user namespace: %s",
|
||||
strerror(errno));
|
||||
}
|
||||
|
||||
/* Joining a new userns gives us full capabilities; drop the
|
||||
|
@ -316,34 +316,34 @@ int isolate_prefork(const struct ctx *c)
|
|||
flags |= CLONE_NEWPID;
|
||||
|
||||
if (unshare(flags)) {
|
||||
err_perror("Failed to detach isolating namespaces");
|
||||
perror("unshare");
|
||||
return -errno;
|
||||
}
|
||||
|
||||
if (mount("", "/", "", MS_UNBINDABLE | MS_REC, NULL)) {
|
||||
err_perror("Failed to remount /");
|
||||
perror("mount /");
|
||||
return -errno;
|
||||
}
|
||||
|
||||
if (mount("", TMPDIR, "tmpfs",
|
||||
MS_NODEV | MS_NOEXEC | MS_NOSUID | MS_RDONLY,
|
||||
"nr_inodes=2,nr_blocks=0")) {
|
||||
err_perror("Failed to mount empty tmpfs for pivot_root()");
|
||||
perror("mount tmpfs");
|
||||
return -errno;
|
||||
}
|
||||
|
||||
if (chdir(TMPDIR)) {
|
||||
err_perror("Failed to change directory into empty tmpfs");
|
||||
perror("chdir");
|
||||
return -errno;
|
||||
}
|
||||
|
||||
if (syscall(SYS_pivot_root, ".", ".")) {
|
||||
err_perror("Failed to pivot_root() into empty tmpfs");
|
||||
perror("pivot_root");
|
||||
return -errno;
|
||||
}
|
||||
|
||||
if (umount2(".", MNT_DETACH | UMOUNT_NOFOLLOW)) {
|
||||
err_perror("Failed to unmount original root filesystem");
|
||||
perror("umount2");
|
||||
return -errno;
|
||||
}
|
||||
|
||||
|
@ -388,6 +388,8 @@ void isolate_postfork(const struct ctx *c)
|
|||
}
|
||||
|
||||
if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) ||
|
||||
prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog))
|
||||
die_perror("Failed to apply seccomp filter");
|
||||
prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog)) {
|
||||
perror("prctl");
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -7,7 +7,7 @@
|
|||
#ifndef ISOLATION_H
|
||||
#define ISOLATION_H
|
||||
|
||||
void isolate_initial(int argc, char **argv);
|
||||
void isolate_initial(void);
|
||||
void isolate_user(uid_t uid, gid_t gid, bool use_userns, const char *userns,
|
||||
enum passt_modes mode);
|
||||
int isolate_prefork(const struct ctx *c);
|
||||
|
|
10
lineread.c
10
lineread.c
|
@ -39,11 +39,13 @@ void lineread_init(struct lineread *lr, int fd)
|
|||
*
|
||||
* Return: length of line in bytes, -1 if no line was found
|
||||
*/
|
||||
static ssize_t peek_line(struct lineread *lr, bool eof)
|
||||
static int peek_line(struct lineread *lr, bool eof)
|
||||
{
|
||||
char *nl;
|
||||
|
||||
/* Sanity checks (which also document invariants) */
|
||||
ASSERT(lr->count >= 0);
|
||||
ASSERT(lr->next_line >= 0);
|
||||
ASSERT(lr->next_line + lr->count >= lr->next_line);
|
||||
ASSERT(lr->next_line + lr->count <= LINEREAD_BUFFER_SIZE);
|
||||
|
||||
|
@ -72,13 +74,13 @@ static ssize_t peek_line(struct lineread *lr, bool eof)
|
|||
*
|
||||
* Return: Length of line read on success, 0 on EOF, negative on error
|
||||
*/
|
||||
ssize_t lineread_get(struct lineread *lr, char **line)
|
||||
int lineread_get(struct lineread *lr, char **line)
|
||||
{
|
||||
bool eof = false;
|
||||
ssize_t line_len;
|
||||
int line_len;
|
||||
|
||||
while ((line_len = peek_line(lr, eof)) < 0) {
|
||||
ssize_t rc;
|
||||
int rc;
|
||||
|
||||
if ((lr->next_line + lr->count) == LINEREAD_BUFFER_SIZE) {
|
||||
/* No space at end */
|
||||
|
|
|
@ -18,15 +18,14 @@
|
|||
* @buf: Buffer storing data read from file.
|
||||
*/
|
||||
struct lineread {
|
||||
int fd;
|
||||
ssize_t next_line;
|
||||
ssize_t count;
|
||||
int fd; int next_line;
|
||||
int count;
|
||||
|
||||
/* One extra byte for possible trailing \0 */
|
||||
char buf[LINEREAD_BUFFER_SIZE+1];
|
||||
};
|
||||
|
||||
void lineread_init(struct lineread *lr, int fd);
|
||||
ssize_t lineread_get(struct lineread *lr, char **line);
|
||||
int lineread_get(struct lineread *lr, char **line);
|
||||
|
||||
#endif /* _LINEREAD_H */
|
||||
|
|
144
linux_dep.h
144
linux_dep.h
|
@ -1,144 +0,0 @@
|
|||
/* SPDX-License-Identifier: GPL-2.0-or-later
|
||||
* Copyright Red Hat
|
||||
*
|
||||
* Declarations for Linux specific dependencies
|
||||
*/
|
||||
|
||||
#ifndef LINUX_DEP_H
|
||||
#define LINUX_DEP_H
|
||||
|
||||
/* struct tcp_info_linux - Information from Linux TCP_INFO getsockopt()
|
||||
*
|
||||
* Largely derived from include/linux/tcp.h in the Linux kernel
|
||||
*
|
||||
* Some fields returned by TCP_INFO have been there for ages and are shared with
|
||||
* BSD. struct tcp_info from netinet/tcp.h has only those fields. There are
|
||||
* also many Linux specific extensions to the structure, which are only found
|
||||
* in the linux/tcp.h version of struct tcp_info.
|
||||
*
|
||||
* We want to use some of those extension fields, when available. We can test
|
||||
* for availability in the runtime kernel using the length returned from
|
||||
* getsockopt(). However, we won't necessarily be compiled against the same
|
||||
* kernel headers as we'll run with, so compiling directly against linux/tcp.h
|
||||
* means wrapping every field access in an #ifdef whose #else does the same
|
||||
* thing as when the field is missing at runtime. This rapidly gets messy.
|
||||
*
|
||||
* Instead we define here struct tcp_info_linux which includes all the Linux
|
||||
* extensions that we want to use. This is taken from v6.11 of the kernel.
|
||||
*/
|
||||
struct tcp_info_linux {
|
||||
uint8_t tcpi_state;
|
||||
uint8_t tcpi_ca_state;
|
||||
uint8_t tcpi_retransmits;
|
||||
uint8_t tcpi_probes;
|
||||
uint8_t tcpi_backoff;
|
||||
uint8_t tcpi_options;
|
||||
uint8_t tcpi_snd_wscale : 4, tcpi_rcv_wscale : 4;
|
||||
uint8_t tcpi_delivery_rate_app_limited:1, tcpi_fastopen_client_fail:2;
|
||||
|
||||
uint32_t tcpi_rto;
|
||||
uint32_t tcpi_ato;
|
||||
uint32_t tcpi_snd_mss;
|
||||
uint32_t tcpi_rcv_mss;
|
||||
|
||||
uint32_t tcpi_unacked;
|
||||
uint32_t tcpi_sacked;
|
||||
uint32_t tcpi_lost;
|
||||
uint32_t tcpi_retrans;
|
||||
uint32_t tcpi_fackets;
|
||||
|
||||
/* Times. */
|
||||
uint32_t tcpi_last_data_sent;
|
||||
uint32_t tcpi_last_ack_sent;
|
||||
uint32_t tcpi_last_data_recv;
|
||||
uint32_t tcpi_last_ack_recv;
|
||||
|
||||
/* Metrics. */
|
||||
uint32_t tcpi_pmtu;
|
||||
uint32_t tcpi_rcv_ssthresh;
|
||||
uint32_t tcpi_rtt;
|
||||
uint32_t tcpi_rttvar;
|
||||
uint32_t tcpi_snd_ssthresh;
|
||||
uint32_t tcpi_snd_cwnd;
|
||||
uint32_t tcpi_advmss;
|
||||
uint32_t tcpi_reordering;
|
||||
|
||||
uint32_t tcpi_rcv_rtt;
|
||||
uint32_t tcpi_rcv_space;
|
||||
|
||||
uint32_t tcpi_total_retrans;
|
||||
|
||||
/* Linux extensions */
|
||||
uint64_t tcpi_pacing_rate;
|
||||
uint64_t tcpi_max_pacing_rate;
|
||||
uint64_t tcpi_bytes_acked; /* RFC4898 tcpEStatsAppHCThruOctetsAcked */
|
||||
uint64_t tcpi_bytes_received; /* RFC4898 tcpEStatsAppHCThruOctetsReceived */
|
||||
uint32_t tcpi_segs_out; /* RFC4898 tcpEStatsPerfSegsOut */
|
||||
uint32_t tcpi_segs_in; /* RFC4898 tcpEStatsPerfSegsIn */
|
||||
|
||||
uint32_t tcpi_notsent_bytes;
|
||||
uint32_t tcpi_min_rtt;
|
||||
uint32_t tcpi_data_segs_in; /* RFC4898 tcpEStatsDataSegsIn */
|
||||
uint32_t tcpi_data_segs_out; /* RFC4898 tcpEStatsDataSegsOut */
|
||||
|
||||
uint64_t tcpi_delivery_rate;
|
||||
|
||||
uint64_t tcpi_busy_time; /* Time (usec) busy sending data */
|
||||
uint64_t tcpi_rwnd_limited; /* Time (usec) limited by receive window */
|
||||
uint64_t tcpi_sndbuf_limited; /* Time (usec) limited by send buffer */
|
||||
|
||||
uint32_t tcpi_delivered;
|
||||
uint32_t tcpi_delivered_ce;
|
||||
|
||||
uint64_t tcpi_bytes_sent; /* RFC4898 tcpEStatsPerfHCDataOctetsOut */
|
||||
uint64_t tcpi_bytes_retrans; /* RFC4898 tcpEStatsPerfOctetsRetrans */
|
||||
uint32_t tcpi_dsack_dups; /* RFC4898 tcpEStatsStackDSACKDups */
|
||||
uint32_t tcpi_reord_seen; /* reordering events seen */
|
||||
|
||||
uint32_t tcpi_rcv_ooopack; /* Out-of-order packets received */
|
||||
|
||||
uint32_t tcpi_snd_wnd; /* peer's advertised receive window after
|
||||
* scaling (bytes)
|
||||
*/
|
||||
uint32_t tcpi_rcv_wnd; /* local advertised receive window after
|
||||
* scaling (bytes)
|
||||
*/
|
||||
|
||||
uint32_t tcpi_rehash; /* PLB or timeout triggered rehash attempts */
|
||||
|
||||
uint16_t tcpi_total_rto; /* Total number of RTO timeouts, including
|
||||
* SYN/SYN-ACK and recurring timeouts.
|
||||
*/
|
||||
uint16_t tcpi_total_rto_recoveries; /* Total number of RTO
|
||||
* recoveries, including any
|
||||
* unfinished recovery.
|
||||
*/
|
||||
uint32_t tcpi_total_rto_time; /* Total time spent in RTO recoveries
|
||||
* in milliseconds, including any
|
||||
* unfinished recovery.
|
||||
*/
|
||||
};
|
||||
|
||||
#include <linux/falloc.h>
|
||||
|
||||
#ifndef FALLOC_FL_COLLAPSE_RANGE
|
||||
#define FALLOC_FL_COLLAPSE_RANGE 0x08
|
||||
#endif
|
||||
|
||||
#include <linux/close_range.h>
|
||||
|
||||
/* glibc < 2.34 and musl as of 1.2.5 need these */
|
||||
#ifndef SYS_close_range
|
||||
#define SYS_close_range 436
|
||||
#endif
|
||||
#ifndef CLOSE_RANGE_UNSHARE /* Linux kernel < 5.9 */
|
||||
#define CLOSE_RANGE_UNSHARE (1U << 1)
|
||||
#endif
|
||||
|
||||
__attribute__ ((weak))
|
||||
/* cppcheck-suppress funcArgNamesDifferent */
|
||||
int close_range(unsigned int first, unsigned int last, int flags) {
|
||||
return syscall(SYS_close_range, first, last, flags);
|
||||
}
|
||||
|
||||
#endif /* LINUX_DEP_H */
|
423
log.c
423
log.c
|
@ -26,14 +26,17 @@
|
|||
#include <stdarg.h>
|
||||
#include <sys/socket.h>
|
||||
|
||||
#include "linux_dep.h"
|
||||
#include "log.h"
|
||||
#include "util.h"
|
||||
#include "passt.h"
|
||||
|
||||
/* LOG_EARLY means we don't know yet: log everything. LOG_EMERG is unused */
|
||||
#define LOG_EARLY LOG_MASK(LOG_EMERG)
|
||||
|
||||
static int log_sock = -1; /* Optional socket to system logger */
|
||||
static char log_ident[BUFSIZ]; /* Identifier string for openlog() */
|
||||
static int log_mask; /* Current log priority mask */
|
||||
static int log_mask = LOG_EARLY; /* Current log priority mask */
|
||||
static int log_opt; /* Options for openlog() */
|
||||
|
||||
static int log_file = -1; /* Optional log file descriptor */
|
||||
static size_t log_size; /* Maximum log file size in bytes */
|
||||
|
@ -41,46 +44,50 @@ static size_t log_written; /* Currently used bytes in log file */
|
|||
static size_t log_cut_size; /* Bytes to cut at start on rotation */
|
||||
static char log_header[BUFSIZ]; /* File header, written back on cuts */
|
||||
|
||||
struct timespec log_start; /* Start timestamp */
|
||||
|
||||
static time_t log_start; /* Start timestamp */
|
||||
int log_trace; /* --trace mode enabled */
|
||||
bool log_conf_parsed; /* Logging options already parsed */
|
||||
bool log_stderr = true; /* Not daemonised, no shell spawned */
|
||||
int log_to_stdout; /* Print to stdout instead of stderr */
|
||||
|
||||
#define LL_STRLEN (sizeof("-9223372036854775808"))
|
||||
#define LOGTIME_STRLEN (LL_STRLEN + 5)
|
||||
|
||||
/**
|
||||
* logtime() - Get the current time for logging purposes
|
||||
* @ts: Buffer into which to store the timestamp
|
||||
*
|
||||
* Return: pointer to @ts, or NULL if there was an error retrieving the time
|
||||
*/
|
||||
const struct timespec *logtime(struct timespec *ts)
|
||||
void vlogmsg(int pri, const char *format, va_list ap)
|
||||
{
|
||||
if (clock_gettime(CLOCK_MONOTONIC, ts))
|
||||
return NULL;
|
||||
return ts;
|
||||
}
|
||||
bool debug_print = (log_mask & LOG_MASK(LOG_DEBUG)) && log_file == -1;
|
||||
bool early_print = LOG_PRI(log_mask) == LOG_EARLY;
|
||||
FILE *out = log_to_stdout ? stdout : stderr;
|
||||
struct timespec tp;
|
||||
|
||||
/**
|
||||
* logtime_fmt() - Format timestamp into a string for the log
|
||||
* @buf: Buffer into which to format the time
|
||||
* @size: Size of @buf
|
||||
* @ts: Time to format (or NULL on error)
|
||||
*
|
||||
* Return: number of characters written to @buf (excluding \0)
|
||||
*/
|
||||
static int logtime_fmt(char *buf, size_t size, const struct timespec *ts)
|
||||
{
|
||||
if (ts) {
|
||||
int64_t delta = timespec_diff_us(ts, &log_start);
|
||||
|
||||
return snprintf(buf, size, "%lli.%04lli", delta / 1000000LL,
|
||||
(delta / 100LL) % 10000);
|
||||
if (debug_print) {
|
||||
clock_gettime(CLOCK_REALTIME, &tp);
|
||||
fprintf(out, "%lli.%04lli: ",
|
||||
(long long int)tp.tv_sec - log_start,
|
||||
(long long int)tp.tv_nsec / (100L * 1000));
|
||||
}
|
||||
|
||||
return snprintf(buf, size, "<error>");
|
||||
if ((log_mask & LOG_MASK(LOG_PRI(pri))) || early_print) {
|
||||
va_list ap2;
|
||||
|
||||
va_copy(ap2, ap); /* Don't clobber ap, we need it again */
|
||||
if (log_file != -1)
|
||||
logfile_write(pri, format, ap2);
|
||||
else if (!(log_mask & LOG_MASK(LOG_DEBUG)))
|
||||
passt_vsyslog(pri, format, ap2);
|
||||
|
||||
va_end(ap2);
|
||||
}
|
||||
|
||||
if (debug_print || (early_print && !(log_opt & LOG_PERROR))) {
|
||||
(void)vfprintf(out, format, ap);
|
||||
if (format[strlen(format)] != '\n')
|
||||
fprintf(out, "\n");
|
||||
}
|
||||
}
|
||||
|
||||
void logmsg(int pri, const char *format, ...)
|
||||
{
|
||||
va_list ap;
|
||||
|
||||
va_start(ap, format);
|
||||
vlogmsg(pri, format, ap);
|
||||
va_end(ap);
|
||||
}
|
||||
|
||||
/* Prefixes for log file messages, indexed by priority */
|
||||
|
@ -93,12 +100,127 @@ const char *logfile_prefix[] = {
|
|||
" ", /* LOG_DEBUG */
|
||||
};
|
||||
|
||||
/**
|
||||
* trace_init() - Set log_trace depending on trace (debug) mode
|
||||
* @enable: Tracing debug mode enabled if non-zero
|
||||
*/
|
||||
void trace_init(int enable)
|
||||
{
|
||||
log_trace = enable;
|
||||
}
|
||||
|
||||
/**
|
||||
* __openlog() - Non-optional openlog() implementation, for custom vsyslog()
|
||||
* @ident: openlog() identity (program name)
|
||||
* @option: openlog() options
|
||||
* @facility: openlog() facility (LOG_DAEMON)
|
||||
*/
|
||||
void __openlog(const char *ident, int option, int facility)
|
||||
{
|
||||
struct timespec tp;
|
||||
|
||||
clock_gettime(CLOCK_REALTIME, &tp);
|
||||
log_start = tp.tv_sec;
|
||||
|
||||
if (log_sock < 0) {
|
||||
struct sockaddr_un a = { .sun_family = AF_UNIX, };
|
||||
|
||||
log_sock = socket(AF_UNIX, SOCK_DGRAM | SOCK_CLOEXEC, 0);
|
||||
if (log_sock < 0)
|
||||
return;
|
||||
|
||||
strncpy(a.sun_path, _PATH_LOG, sizeof(a.sun_path));
|
||||
if (connect(log_sock, (const struct sockaddr *)&a, sizeof(a))) {
|
||||
close(log_sock);
|
||||
log_sock = -1;
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
log_mask |= facility;
|
||||
strncpy(log_ident, ident, sizeof(log_ident) - 1);
|
||||
log_opt = option;
|
||||
}
|
||||
|
||||
/**
|
||||
* __setlogmask() - setlogmask() wrapper, to allow custom vsyslog()
|
||||
* @mask: Same as setlogmask() mask
|
||||
*/
|
||||
void __setlogmask(int mask)
|
||||
{
|
||||
log_mask = mask;
|
||||
setlogmask(mask);
|
||||
}
|
||||
|
||||
/**
|
||||
* passt_vsyslog() - vsyslog() implementation not using heap memory
|
||||
* @pri: Facility and level map, same as priority for vsyslog()
|
||||
* @format: Same as vsyslog() format
|
||||
* @ap: Same as vsyslog() ap
|
||||
*/
|
||||
void passt_vsyslog(int pri, const char *format, va_list ap)
|
||||
{
|
||||
int prefix_len, n;
|
||||
char buf[BUFSIZ];
|
||||
|
||||
/* Send without timestamp, the system logger should add it */
|
||||
n = prefix_len = snprintf(buf, BUFSIZ, "<%i> %s: ", pri, log_ident);
|
||||
|
||||
n += vsnprintf(buf + n, BUFSIZ - n, format, ap);
|
||||
|
||||
if (format[strlen(format)] != '\n')
|
||||
n += snprintf(buf + n, BUFSIZ - n, "\n");
|
||||
|
||||
if (log_opt & LOG_PERROR)
|
||||
fprintf(stderr, "%s", buf + prefix_len);
|
||||
|
||||
if (send(log_sock, buf, n, 0) != n)
|
||||
fprintf(stderr, "Failed to send %i bytes to syslog\n", n);
|
||||
}
|
||||
|
||||
/**
|
||||
* logfile_init() - Open log file and write header with PID, version, path
|
||||
* @name: Identifier for header: passt or pasta
|
||||
* @path: Path to log file
|
||||
* @size: Maximum size of log file: log_cut_size is calculated here
|
||||
*/
|
||||
void logfile_init(const char *name, const char *path, size_t size)
|
||||
{
|
||||
char nl = '\n', exe[PATH_MAX] = { 0 };
|
||||
int n;
|
||||
|
||||
if (readlink("/proc/self/exe", exe, PATH_MAX - 1) < 0) {
|
||||
perror("readlink /proc/self/exe");
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
|
||||
log_file = open(path, O_CREAT | O_TRUNC | O_APPEND | O_RDWR | O_CLOEXEC,
|
||||
S_IRUSR | S_IWUSR);
|
||||
if (log_file == -1)
|
||||
die("Couldn't open log file %s: %s", path, strerror(errno));
|
||||
|
||||
log_size = size ? size : LOGFILE_SIZE_DEFAULT;
|
||||
|
||||
n = snprintf(log_header, sizeof(log_header), "%s " VERSION ": %s (%i)",
|
||||
name, exe, getpid());
|
||||
|
||||
if (write(log_file, log_header, n) <= 0 ||
|
||||
write(log_file, &nl, 1) <= 0) {
|
||||
perror("Couldn't write to log file\n");
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
|
||||
/* For FALLOC_FL_COLLAPSE_RANGE: VFS block size can be up to one page */
|
||||
log_cut_size = ROUND_UP(log_size * LOGFILE_CUT_RATIO / 100, PAGE_SIZE);
|
||||
}
|
||||
|
||||
#ifdef FALLOC_FL_COLLAPSE_RANGE
|
||||
/**
|
||||
* logfile_rotate_fallocate() - Write header, set log_written after fallocate()
|
||||
* @fd: Log file descriptor
|
||||
* @now: Current timestamp
|
||||
*
|
||||
* #syscalls lseek ppc64le:_llseek ppc64:_llseek arm:_llseek i686:_llseek
|
||||
* #syscalls lseek ppc64le:_llseek ppc64:_llseek armv6l:_llseek armv7l:_llseek
|
||||
*/
|
||||
static void logfile_rotate_fallocate(int fd, const struct timespec *now)
|
||||
{
|
||||
|
@ -111,8 +233,10 @@ static void logfile_rotate_fallocate(int fd, const struct timespec *now)
|
|||
if (read(fd, buf, BUFSIZ) == -1)
|
||||
return;
|
||||
|
||||
n = snprintf(buf, BUFSIZ, "%s - log truncated at ", log_header);
|
||||
n += logtime_fmt(buf + n, BUFSIZ - n, now);
|
||||
n = snprintf(buf, BUFSIZ,
|
||||
"%s - log truncated at %lli.%04lli", log_header,
|
||||
(long long int)(now->tv_sec - log_start),
|
||||
(long long int)(now->tv_nsec / (100L * 1000)));
|
||||
|
||||
/* Avoid partial lines by padding the header with spaces */
|
||||
nl = memchr(buf + n + 1, '\n', BUFSIZ - n - 1);
|
||||
|
@ -126,13 +250,14 @@ static void logfile_rotate_fallocate(int fd, const struct timespec *now)
|
|||
|
||||
log_written -= log_cut_size;
|
||||
}
|
||||
#endif /* FALLOC_FL_COLLAPSE_RANGE */
|
||||
|
||||
/**
|
||||
* logfile_rotate_move() - Fallback: move recent entries toward start, then cut
|
||||
* @fd: Log file descriptor
|
||||
* @now: Current timestamp
|
||||
*
|
||||
* #syscalls lseek ppc64le:_llseek ppc64:_llseek arm:_llseek
|
||||
* #syscalls lseek ppc64le:_llseek ppc64:_llseek armv6l:_llseek armv7l:_llseek
|
||||
* #syscalls ftruncate
|
||||
*/
|
||||
static void logfile_rotate_move(int fd, const struct timespec *now)
|
||||
|
@ -141,10 +266,10 @@ static void logfile_rotate_move(int fd, const struct timespec *now)
|
|||
char buf[BUFSIZ];
|
||||
const char *nl;
|
||||
|
||||
header_len = snprintf(buf, BUFSIZ, "%s - log truncated at ",
|
||||
log_header);
|
||||
header_len += logtime_fmt(buf + header_len, BUFSIZ - header_len, now);
|
||||
|
||||
header_len = snprintf(buf, BUFSIZ,
|
||||
"%s - log truncated at %lli.%04lli\n", log_header,
|
||||
(long long int)(now->tv_sec - log_start),
|
||||
(long long int)(now->tv_nsec / (100L * 1000)));
|
||||
if (lseek(fd, 0, SEEK_SET) == -1)
|
||||
return;
|
||||
if (write(fd, buf, header_len) == -1)
|
||||
|
@ -197,17 +322,21 @@ out:
|
|||
*
|
||||
* Return: 0 on success, negative error code on failure
|
||||
*
|
||||
* #syscalls fcntl fallocate
|
||||
* #syscalls fcntl
|
||||
*
|
||||
* fallocate() passed as EXTRA_SYSCALL only if FALLOC_FL_COLLAPSE_RANGE is there
|
||||
*/
|
||||
static int logfile_rotate(int fd, const struct timespec *now)
|
||||
{
|
||||
if (fcntl(fd, F_SETFL, O_RDWR /* Drop O_APPEND: explicit lseek() */))
|
||||
return -errno;
|
||||
|
||||
#ifdef FALLOC_FL_COLLAPSE_RANGE
|
||||
/* Only for Linux >= 3.15, extent-based ext4 or XFS, glibc >= 2.18 */
|
||||
if (!fallocate(fd, FALLOC_FL_COLLAPSE_RANGE, 0, log_cut_size))
|
||||
logfile_rotate_fallocate(fd, now);
|
||||
else
|
||||
#endif
|
||||
logfile_rotate_move(fd, now);
|
||||
|
||||
if (fcntl(fd, F_SETFL, O_RDWR | O_APPEND))
|
||||
|
@ -218,212 +347,32 @@ static int logfile_rotate(int fd, const struct timespec *now)
|
|||
|
||||
/**
|
||||
* logfile_write() - Write entry to log file, trigger rotation if full
|
||||
* @newline: Append newline at the end of the message, if missing
|
||||
* @cont: Continuation of a previous message, on the same line
|
||||
* @pri: Facility and level map, same as priority for vsyslog()
|
||||
* @now: Timestamp
|
||||
* @format: Same as vsyslog() format
|
||||
* @ap: Same as vsyslog() ap
|
||||
*/
|
||||
static void logfile_write(bool newline, bool cont, int pri,
|
||||
const struct timespec *now,
|
||||
const char *format, va_list ap)
|
||||
void logfile_write(int pri, const char *format, va_list ap)
|
||||
{
|
||||
struct timespec now;
|
||||
char buf[BUFSIZ];
|
||||
int n = 0;
|
||||
int n;
|
||||
|
||||
if (!cont) {
|
||||
n += logtime_fmt(buf, BUFSIZ, now);
|
||||
n += snprintf(buf + n, BUFSIZ - n, ": %s", logfile_prefix[pri]);
|
||||
}
|
||||
if (clock_gettime(CLOCK_REALTIME, &now))
|
||||
return;
|
||||
|
||||
n = snprintf(buf, BUFSIZ, "%lli.%04lli: %s",
|
||||
(long long int)(now.tv_sec - log_start),
|
||||
(long long int)(now.tv_nsec / (100L * 1000)),
|
||||
logfile_prefix[pri]);
|
||||
|
||||
n += vsnprintf(buf + n, BUFSIZ - n, format, ap);
|
||||
|
||||
if (newline && format[strlen(format)] != '\n')
|
||||
if (format[strlen(format)] != '\n')
|
||||
n += snprintf(buf + n, BUFSIZ - n, "\n");
|
||||
|
||||
if ((log_written + n >= log_size) && logfile_rotate(log_file, now))
|
||||
if ((log_written + n >= log_size) && logfile_rotate(log_file, &now))
|
||||
return;
|
||||
|
||||
if ((n = write(log_file, buf, n)) >= 0)
|
||||
log_written += n;
|
||||
}
|
||||
|
||||
/**
|
||||
* vlogmsg() - Print or send messages to log or output files as configured
|
||||
* @newline: Append newline at the end of the message, if missing
|
||||
* @cont: Continuation of a previous message, on the same line
|
||||
* @pri: Facility and level map, same as priority for vsyslog()
|
||||
* @format: Message
|
||||
* @ap: Variable argument list
|
||||
*/
|
||||
void vlogmsg(bool newline, bool cont, int pri, const char *format, va_list ap)
|
||||
{
|
||||
bool debug_print = (log_mask & LOG_MASK(LOG_DEBUG)) && log_file == -1;
|
||||
const struct timespec *now;
|
||||
struct timespec ts;
|
||||
|
||||
now = logtime(&ts);
|
||||
|
||||
if (debug_print && !cont) {
|
||||
char timestr[LOGTIME_STRLEN];
|
||||
|
||||
logtime_fmt(timestr, sizeof(timestr), now);
|
||||
FPRINTF(stderr, "%s: ", timestr);
|
||||
}
|
||||
|
||||
if ((log_mask & LOG_MASK(LOG_PRI(pri))) || !log_conf_parsed) {
|
||||
va_list ap2;
|
||||
|
||||
va_copy(ap2, ap); /* Don't clobber ap, we need it again */
|
||||
if (log_file != -1)
|
||||
logfile_write(newline, cont, pri, now, format, ap2);
|
||||
else if (!(log_mask & LOG_MASK(LOG_DEBUG)))
|
||||
passt_vsyslog(newline, pri, format, ap2);
|
||||
|
||||
va_end(ap2);
|
||||
}
|
||||
|
||||
if (debug_print || !log_conf_parsed ||
|
||||
(log_stderr && (log_mask & LOG_MASK(LOG_PRI(pri))))) {
|
||||
(void)vfprintf(stderr, format, ap);
|
||||
if (newline && format[strlen(format)] != '\n')
|
||||
FPRINTF(stderr, "\n");
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* logmsg() - vlogmsg() wrapper for variable argument lists
|
||||
* @newline: Append newline at the end of the message, if missing
|
||||
* @cont: Continuation of a previous message, on the same line
|
||||
* @pri: Facility and level map, same as priority for vsyslog()
|
||||
* @format: Message
|
||||
*/
|
||||
void logmsg(bool newline, bool cont, int pri, const char *format, ...)
|
||||
{
|
||||
va_list ap;
|
||||
|
||||
va_start(ap, format);
|
||||
vlogmsg(newline, cont, pri, format, ap);
|
||||
va_end(ap);
|
||||
}
|
||||
|
||||
/**
|
||||
* logmsg_perror() - vlogmsg() wrapper with perror()-like functionality
|
||||
* @pri: Facility and level map, same as priority for vsyslog()
|
||||
* @format: Message
|
||||
*/
|
||||
void logmsg_perror(int pri, const char *format, ...)
|
||||
{
|
||||
int errno_copy = errno;
|
||||
va_list ap;
|
||||
|
||||
va_start(ap, format);
|
||||
vlogmsg(false, false, pri, format, ap);
|
||||
va_end(ap);
|
||||
|
||||
logmsg(true, true, pri, ": %s", strerror(errno_copy));
|
||||
}
|
||||
|
||||
/**
|
||||
* trace_init() - Set log_trace depending on trace (debug) mode
|
||||
* @enable: Tracing debug mode enabled if non-zero
|
||||
*/
|
||||
void trace_init(int enable)
|
||||
{
|
||||
log_trace = enable;
|
||||
}
|
||||
|
||||
/**
|
||||
* __openlog() - Non-optional openlog() implementation, for custom vsyslog()
|
||||
* @ident: openlog() identity (program name)
|
||||
* @option: openlog() options, unused
|
||||
* @facility: openlog() facility (LOG_DAEMON)
|
||||
*/
|
||||
void __openlog(const char *ident, int option, int facility)
|
||||
{
|
||||
(void)option;
|
||||
|
||||
if (log_sock < 0) {
|
||||
struct sockaddr_un a = { .sun_family = AF_UNIX, };
|
||||
|
||||
log_sock = socket(AF_UNIX, SOCK_DGRAM | SOCK_CLOEXEC, 0);
|
||||
if (log_sock < 0)
|
||||
return;
|
||||
|
||||
strncpy(a.sun_path, _PATH_LOG, sizeof(a.sun_path));
|
||||
if (connect(log_sock, (const struct sockaddr *)&a, sizeof(a))) {
|
||||
close(log_sock);
|
||||
log_sock = -1;
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
log_mask |= facility;
|
||||
strncpy(log_ident, ident, sizeof(log_ident) - 1);
|
||||
}
|
||||
|
||||
/**
|
||||
* __setlogmask() - setlogmask() wrapper, to allow custom vsyslog()
|
||||
* @mask: Same as setlogmask() mask
|
||||
*/
|
||||
void __setlogmask(int mask)
|
||||
{
|
||||
log_mask = mask;
|
||||
setlogmask(mask);
|
||||
}
|
||||
|
||||
/**
|
||||
* passt_vsyslog() - vsyslog() implementation not using heap memory
|
||||
* @newline: Append newline at the end of the message, if missing
|
||||
* @pri: Facility and level map, same as priority for vsyslog()
|
||||
* @format: Same as vsyslog() format
|
||||
* @ap: Same as vsyslog() ap
|
||||
*/
|
||||
void passt_vsyslog(bool newline, int pri, const char *format, va_list ap)
|
||||
{
|
||||
char buf[BUFSIZ];
|
||||
int n;
|
||||
|
||||
/* Send without timestamp, the system logger should add it */
|
||||
n = snprintf(buf, BUFSIZ, "<%i> %s: ", pri, log_ident);
|
||||
|
||||
n += vsnprintf(buf + n, BUFSIZ - n, format, ap);
|
||||
|
||||
if (newline && format[strlen(format)] != '\n')
|
||||
n += snprintf(buf + n, BUFSIZ - n, "\n");
|
||||
|
||||
if (log_sock >= 0 && send(log_sock, buf, n, 0) != n && log_stderr)
|
||||
FPRINTF(stderr, "Failed to send %i bytes to syslog\n", n);
|
||||
}
|
||||
|
||||
/**
|
||||
* logfile_init() - Open log file and write header with PID, version, path
|
||||
* @name: Identifier for header: passt or pasta
|
||||
* @path: Path to log file
|
||||
* @size: Maximum size of log file: log_cut_size is calculated here
|
||||
*/
|
||||
void logfile_init(const char *name, const char *path, size_t size)
|
||||
{
|
||||
char nl = '\n', exe[PATH_MAX] = { 0 };
|
||||
int n;
|
||||
|
||||
if (readlink("/proc/self/exe", exe, PATH_MAX - 1) < 0)
|
||||
die_perror("Failed to read own /proc/self/exe link");
|
||||
|
||||
log_file = output_file_open(path, O_APPEND | O_RDWR);
|
||||
if (log_file == -1)
|
||||
die_perror("Couldn't open log file %s", path);
|
||||
|
||||
log_size = size ? size : LOGFILE_SIZE_DEFAULT;
|
||||
|
||||
n = snprintf(log_header, sizeof(log_header), "%s " VERSION ": %s (%i)",
|
||||
name, exe, getpid());
|
||||
|
||||
if (write(log_file, log_header, n) <= 0 ||
|
||||
write(log_file, &nl, 1) <= 0)
|
||||
die_perror("Couldn't write to log file");
|
||||
|
||||
/* For FALLOC_FL_COLLAPSE_RANGE: VFS block size can be up to one page */
|
||||
log_cut_size = ROUND_UP(log_size * LOGFILE_CUT_RATIO / 100, PAGE_SIZE);
|
||||
}
|
||||
|
|
34
log.h
34
log.h
|
@ -6,28 +6,20 @@
|
|||
#ifndef LOG_H
|
||||
#define LOG_H
|
||||
|
||||
#include <stdbool.h>
|
||||
#include <syslog.h>
|
||||
|
||||
#define LOGFILE_SIZE_DEFAULT (1024 * 1024UL)
|
||||
#define LOGFILE_CUT_RATIO 30 /* When full, cut ~30% size */
|
||||
#define LOGFILE_SIZE_MIN (5UL * MAX(BUFSIZ, PAGE_SIZE))
|
||||
|
||||
void vlogmsg(bool newline, bool cont, int pri, const char *format, va_list ap);
|
||||
void logmsg(bool newline, bool cont, int pri, const char *format, ...)
|
||||
__attribute__((format(printf, 4, 5)));
|
||||
void logmsg_perror(int pri, const char *format, ...)
|
||||
void vlogmsg(int pri, const char *format, va_list ap);
|
||||
void logmsg(int pri, const char *format, ...)
|
||||
__attribute__((format(printf, 2, 3)));
|
||||
|
||||
#define err(...) logmsg(true, false, LOG_ERR, __VA_ARGS__)
|
||||
#define warn(...) logmsg(true, false, LOG_WARNING, __VA_ARGS__)
|
||||
#define info(...) logmsg(true, false, LOG_INFO, __VA_ARGS__)
|
||||
#define debug(...) logmsg(true, false, LOG_DEBUG, __VA_ARGS__)
|
||||
|
||||
#define err_perror(...) logmsg_perror( LOG_ERR, __VA_ARGS__)
|
||||
#define warn_perror(...) logmsg_perror( LOG_WARNING, __VA_ARGS__)
|
||||
#define info_perror(...) logmsg_perror( LOG_INFO, __VA_ARGS__)
|
||||
#define debug_perror(...) logmsg_perror( LOG_DEBUG, __VA_ARGS__)
|
||||
#define err(...) logmsg(LOG_ERR, __VA_ARGS__)
|
||||
#define warn(...) logmsg(LOG_WARNING, __VA_ARGS__)
|
||||
#define info(...) logmsg(LOG_INFO, __VA_ARGS__)
|
||||
#define debug(...) logmsg(LOG_DEBUG, __VA_ARGS__)
|
||||
|
||||
#define die(...) \
|
||||
do { \
|
||||
|
@ -35,17 +27,8 @@ void logmsg_perror(int pri, const char *format, ...)
|
|||
exit(EXIT_FAILURE); \
|
||||
} while (0)
|
||||
|
||||
#define die_perror(...) \
|
||||
do { \
|
||||
err_perror(__VA_ARGS__); \
|
||||
exit(EXIT_FAILURE); \
|
||||
} while (0)
|
||||
|
||||
extern int log_trace;
|
||||
extern bool log_conf_parsed;
|
||||
extern bool log_stderr;
|
||||
extern struct timespec log_start;
|
||||
|
||||
extern int log_to_stdout;
|
||||
void trace_init(int enable);
|
||||
#define trace(...) \
|
||||
do { \
|
||||
|
@ -55,7 +38,8 @@ void trace_init(int enable);
|
|||
|
||||
void __openlog(const char *ident, int option, int facility);
|
||||
void logfile_init(const char *name, const char *path, size_t size);
|
||||
void passt_vsyslog(bool newline, int pri, const char *format, va_list ap);
|
||||
void passt_vsyslog(int pri, const char *format, va_list ap);
|
||||
void logfile_write(int pri, const char *format, va_list ap);
|
||||
void __setlogmask(int mask);
|
||||
|
||||
#endif /* LOG_H */
|
||||
|
|
320
ndp.c
320
ndp.c
|
@ -38,194 +38,23 @@
|
|||
#define NS 135
|
||||
#define NA 136
|
||||
|
||||
enum ndp_option_types {
|
||||
OPT_SRC_L2_ADDR = 1,
|
||||
OPT_TARGET_L2_ADDR = 2,
|
||||
OPT_PREFIX_INFO = 3,
|
||||
OPT_MTU = 5,
|
||||
OPT_RDNSS_TYPE = 25,
|
||||
OPT_DNSSL_TYPE = 31,
|
||||
};
|
||||
|
||||
/**
|
||||
* struct opt_header - Option header
|
||||
* @type: Option type
|
||||
* @len: Option length, in units of 8 bytes
|
||||
*/
|
||||
struct opt_header {
|
||||
uint8_t type;
|
||||
uint8_t len;
|
||||
} __attribute__((packed));
|
||||
|
||||
/**
|
||||
* struct opt_l2_addr - Link-layer address
|
||||
* @header: Option header
|
||||
* @mac: MAC address
|
||||
*/
|
||||
struct opt_l2_addr {
|
||||
struct opt_header header;
|
||||
unsigned char mac[ETH_ALEN];
|
||||
} __attribute__((packed));
|
||||
|
||||
/**
|
||||
* struct ndp_na - NDP Neighbor Advertisement (NA) message
|
||||
* @ih: ICMPv6 header
|
||||
* @target_addr: Target IPv6 address
|
||||
* @target_l2_addr: Target link-layer address
|
||||
*/
|
||||
struct ndp_na {
|
||||
struct icmp6hdr ih;
|
||||
struct in6_addr target_addr;
|
||||
struct opt_l2_addr target_l2_addr;
|
||||
} __attribute__((packed));
|
||||
|
||||
/**
|
||||
* struct opt_prefix_info - Prefix Information option
|
||||
* @header: Option header
|
||||
* @prefix_len: The number of leading bits in the Prefix that are valid
|
||||
* @prefix_flags: Flags associated with the prefix
|
||||
* @valid_lifetime: Valid lifetime (ms)
|
||||
* @pref_lifetime: Preferred lifetime (ms)
|
||||
* @reserved: Unused
|
||||
*/
|
||||
struct opt_prefix_info {
|
||||
struct opt_header header;
|
||||
uint8_t prefix_len;
|
||||
uint8_t prefix_flags;
|
||||
uint32_t valid_lifetime;
|
||||
uint32_t pref_lifetime;
|
||||
uint32_t reserved;
|
||||
} __attribute__((packed));
|
||||
|
||||
/**
|
||||
* struct opt_mtu - Maximum transmission unit (MTU) option
|
||||
* @header: Option header
|
||||
* @reserved: Unused
|
||||
* @value: MTU value, network order
|
||||
*/
|
||||
struct opt_mtu {
|
||||
struct opt_header header;
|
||||
uint16_t reserved;
|
||||
uint32_t value;
|
||||
} __attribute__((packed));
|
||||
|
||||
/**
|
||||
* struct opt_rdnss - Recursive DNS Server (RDNSS) option
|
||||
* @header: Option header
|
||||
* @reserved: Unused
|
||||
* @lifetime: Validity time (s)
|
||||
* @dns: List of DNS server addresses
|
||||
*/
|
||||
struct opt_rdnss {
|
||||
struct opt_header header;
|
||||
uint16_t reserved;
|
||||
uint32_t lifetime;
|
||||
struct in6_addr dns[MAXNS + 1];
|
||||
} __attribute__((packed));
|
||||
|
||||
/**
|
||||
* struct opt_dnssl - DNS Search List (DNSSL) option
|
||||
* @header: Option header
|
||||
* @reserved: Unused
|
||||
* @lifetime: Validity time (s)
|
||||
* @domains: List of NULL-separated search domains
|
||||
*/
|
||||
struct opt_dnssl {
|
||||
struct opt_header header;
|
||||
uint16_t reserved;
|
||||
uint32_t lifetime;
|
||||
unsigned char domains[MAXDNSRCH * NS_MAXDNAME];
|
||||
} __attribute__((packed));
|
||||
|
||||
/**
|
||||
* struct ndp_ra - NDP Router Advertisement (RA) message
|
||||
* @ih: ICMPv6 header
|
||||
* @reachable: Reachability time, after confirmation (ms)
|
||||
* @retrans: Time between retransmitted NS messages (ms)
|
||||
* @prefix_info: Prefix Information option
|
||||
* @prefix: IPv6 prefix
|
||||
* @mtu: MTU option
|
||||
* @source_ll: Source link-layer address
|
||||
* @var: Variable fields
|
||||
*/
|
||||
struct ndp_ra {
|
||||
struct icmp6hdr ih;
|
||||
uint32_t reachable;
|
||||
uint32_t retrans;
|
||||
struct opt_prefix_info prefix_info;
|
||||
struct in6_addr prefix;
|
||||
struct opt_l2_addr source_ll;
|
||||
|
||||
unsigned char var[sizeof(struct opt_mtu) + sizeof(struct opt_rdnss) +
|
||||
sizeof(struct opt_dnssl)];
|
||||
} __attribute__((packed));
|
||||
|
||||
/**
|
||||
* struct ndp_ns - NDP Neighbor Solicitation (NS) message
|
||||
* @ih: ICMPv6 header
|
||||
* @target_addr: Target IPv6 address
|
||||
*/
|
||||
struct ndp_ns {
|
||||
struct icmp6hdr ih;
|
||||
struct in6_addr target_addr;
|
||||
} __attribute__((packed));
|
||||
|
||||
/**
|
||||
* ndp() - Check for NDP solicitations, reply as needed
|
||||
* @c: Execution context
|
||||
* @ih: ICMPv6 header
|
||||
* @saddr: Source IPv6 address
|
||||
* @p: Packet pool
|
||||
* @saddr: Source IPv6 address
|
||||
*
|
||||
* Return: 0 if not handled here, 1 if handled, -1 on failure
|
||||
*/
|
||||
int ndp(struct ctx *c, const struct icmp6hdr *ih, const struct in6_addr *saddr,
|
||||
const struct pool *p)
|
||||
int ndp(struct ctx *c, const struct icmp6hdr *ih, const struct in6_addr *saddr)
|
||||
{
|
||||
struct ndp_na na = {
|
||||
.ih = {
|
||||
.icmp6_type = NA,
|
||||
.icmp6_code = 0,
|
||||
.icmp6_router = 1,
|
||||
.icmp6_solicited = 1,
|
||||
.icmp6_override = 1,
|
||||
},
|
||||
.target_l2_addr = {
|
||||
.header = {
|
||||
.type = OPT_TARGET_L2_ADDR,
|
||||
.len = 1,
|
||||
},
|
||||
}
|
||||
};
|
||||
struct ndp_ra ra = {
|
||||
.ih = {
|
||||
.icmp6_type = RA,
|
||||
.icmp6_code = 0,
|
||||
.icmp6_hop_limit = 255,
|
||||
/* RFC 8319 */
|
||||
.icmp6_rt_lifetime = htons_constant(65535),
|
||||
.icmp6_addrconf_managed = 1,
|
||||
},
|
||||
.prefix_info = {
|
||||
.header = {
|
||||
.type = OPT_PREFIX_INFO,
|
||||
.len = 4,
|
||||
},
|
||||
.prefix_len = 64,
|
||||
.prefix_flags = 0xc0, /* prefix flags: L, A */
|
||||
.valid_lifetime = ~0U,
|
||||
.pref_lifetime = ~0U,
|
||||
},
|
||||
.source_ll = {
|
||||
.header = {
|
||||
.type = OPT_SRC_L2_ADDR,
|
||||
.len = 1,
|
||||
},
|
||||
},
|
||||
};
|
||||
const struct in6_addr *rsaddr; /* src addr for reply */
|
||||
unsigned char *ptr = NULL;
|
||||
size_t dlen;
|
||||
char buf[BUFSIZ] = { 0 };
|
||||
struct ipv6hdr *ip6hr;
|
||||
struct icmp6hdr *ihr;
|
||||
struct ethhdr *ehr;
|
||||
unsigned char *p;
|
||||
size_t len;
|
||||
|
||||
if (ih->icmp6_type < RS || ih->icmp6_type > NA)
|
||||
return 0;
|
||||
|
@ -233,22 +62,28 @@ int ndp(struct ctx *c, const struct icmp6hdr *ih, const struct in6_addr *saddr,
|
|||
if (c->no_ndp)
|
||||
return 1;
|
||||
|
||||
ehr = (struct ethhdr *)buf;
|
||||
ip6hr = (struct ipv6hdr *)(ehr + 1);
|
||||
ihr = (struct icmp6hdr *)(ip6hr + 1);
|
||||
|
||||
if (ih->icmp6_type == NS) {
|
||||
const struct ndp_ns *ns =
|
||||
packet_get(p, 0, 0, sizeof(struct ndp_ns), NULL);
|
||||
|
||||
if (!ns)
|
||||
return -1;
|
||||
|
||||
if (IN6_IS_ADDR_UNSPECIFIED(saddr))
|
||||
return 1;
|
||||
|
||||
info("NDP: received NS, sending NA");
|
||||
ihr->icmp6_type = NA;
|
||||
ihr->icmp6_code = 0;
|
||||
ihr->icmp6_router = 1;
|
||||
ihr->icmp6_solicited = 1;
|
||||
ihr->icmp6_override = 1;
|
||||
|
||||
memcpy(&na.target_addr, &ns->target_addr,
|
||||
sizeof(na.target_addr));
|
||||
memcpy(na.target_l2_addr.mac, c->our_tap_mac, ETH_ALEN);
|
||||
|
||||
p = (unsigned char *)(ihr + 1);
|
||||
memcpy(p, ih + 1, sizeof(struct in6_addr)); /* target address */
|
||||
p += 16;
|
||||
*p++ = 2; /* target ll */
|
||||
*p++ = 1; /* length */
|
||||
memcpy(p, c->mac, ETH_ALEN);
|
||||
p += 6;
|
||||
} else if (ih->icmp6_type == RS) {
|
||||
size_t dns_s_len = 0;
|
||||
int i, n;
|
||||
|
@ -257,20 +92,31 @@ int ndp(struct ctx *c, const struct icmp6hdr *ih, const struct in6_addr *saddr,
|
|||
return 1;
|
||||
|
||||
info("NDP: received RS, sending RA");
|
||||
memcpy(&ra.prefix, &c->ip6.addr, sizeof(ra.prefix));
|
||||
ihr->icmp6_type = RA;
|
||||
ihr->icmp6_code = 0;
|
||||
ihr->icmp6_hop_limit = 255;
|
||||
ihr->icmp6_rt_lifetime = htons(65535); /* RFC 8319 */
|
||||
ihr->icmp6_addrconf_managed = 1;
|
||||
|
||||
ptr = &ra.var[0];
|
||||
p = (unsigned char *)(ihr + 1);
|
||||
p += 8; /* reachable, retrans time */
|
||||
*p++ = 3; /* prefix */
|
||||
*p++ = 4; /* length */
|
||||
*p++ = 64; /* prefix length */
|
||||
*p++ = 0xc0; /* prefix flags: L, A */
|
||||
*(uint32_t *)p = (uint32_t)~0U; /* lifetime */
|
||||
p += 4;
|
||||
*(uint32_t *)p = (uint32_t)~0U; /* preferred lifetime */
|
||||
p += 8;
|
||||
memcpy(p, &c->ip6.addr, 8); /* prefix */
|
||||
p += 16;
|
||||
|
||||
if (c->mtu != -1) {
|
||||
struct opt_mtu *mtu = (struct opt_mtu *)ptr;
|
||||
*mtu = (struct opt_mtu) {
|
||||
.header = {
|
||||
.type = OPT_MTU,
|
||||
.len = 1,
|
||||
},
|
||||
.value = htonl(c->mtu),
|
||||
};
|
||||
ptr += sizeof(struct opt_mtu);
|
||||
*p++ = 5; /* type */
|
||||
*p++ = 1; /* length */
|
||||
p += 2; /* reserved */
|
||||
*(uint32_t *)p = htonl(c->mtu); /* MTU */
|
||||
p += 4;
|
||||
}
|
||||
|
||||
if (c->no_dhcp_dns)
|
||||
|
@ -278,78 +124,70 @@ int ndp(struct ctx *c, const struct icmp6hdr *ih, const struct in6_addr *saddr,
|
|||
|
||||
for (n = 0; !IN6_IS_ADDR_UNSPECIFIED(&c->ip6.dns[n]); n++);
|
||||
if (n) {
|
||||
struct opt_rdnss *rdnss = (struct opt_rdnss *)ptr;
|
||||
*rdnss = (struct opt_rdnss) {
|
||||
.header = {
|
||||
.type = OPT_RDNSS_TYPE,
|
||||
.len = 1 + 2 * n,
|
||||
},
|
||||
.lifetime = ~0U,
|
||||
};
|
||||
*p++ = 25; /* RDNSS */
|
||||
*p++ = 1 + 2 * n; /* length */
|
||||
p += 2; /* reserved */
|
||||
*(uint32_t *)p = (uint32_t)~0U; /* lifetime */
|
||||
p += 4;
|
||||
|
||||
for (i = 0; i < n; i++) {
|
||||
memcpy(&rdnss->dns[i], &c->ip6.dns[i],
|
||||
sizeof(rdnss->dns[i]));
|
||||
memcpy(p, &c->ip6.dns[i], 16); /* address */
|
||||
p += 16;
|
||||
}
|
||||
ptr += offsetof(struct opt_rdnss, dns) +
|
||||
i * sizeof(rdnss->dns[0]);
|
||||
|
||||
for (n = 0; *c->dns_search[n].n; n++)
|
||||
dns_s_len += strlen(c->dns_search[n].n) + 2;
|
||||
}
|
||||
|
||||
if (!c->no_dhcp_dns_search && dns_s_len) {
|
||||
struct opt_dnssl *dnssl = (struct opt_dnssl *)ptr;
|
||||
*dnssl = (struct opt_dnssl) {
|
||||
.header = {
|
||||
.type = OPT_DNSSL_TYPE,
|
||||
.len = DIV_ROUND_UP(dns_s_len, 8) + 1,
|
||||
},
|
||||
.lifetime = ~0U,
|
||||
};
|
||||
ptr = dnssl->domains;
|
||||
*p++ = 31; /* DNSSL */
|
||||
*p++ = (dns_s_len + 8 - 1) / 8 + 1; /* length */
|
||||
p += 2; /* reserved */
|
||||
*(uint32_t *)p = (uint32_t)~0U; /* lifetime */
|
||||
p += 4;
|
||||
|
||||
for (i = 0; i < n; i++) {
|
||||
size_t len;
|
||||
char *dot;
|
||||
|
||||
*(ptr++) = '.';
|
||||
*(p++) = '.';
|
||||
|
||||
len = sizeof(dnssl->domains) -
|
||||
(ptr - dnssl->domains);
|
||||
|
||||
strncpy((char *)ptr, c->dns_search[i].n, len);
|
||||
for (dot = (char *)ptr - 1; *dot; dot++) {
|
||||
strncpy((char *)p, c->dns_search[i].n,
|
||||
sizeof(buf) -
|
||||
((intptr_t)p - (intptr_t)buf));
|
||||
for (dot = (char *)p - 1; *dot; dot++) {
|
||||
if (*dot == '.')
|
||||
*dot = strcspn(dot + 1, ".");
|
||||
}
|
||||
ptr += strlen(c->dns_search[i].n);
|
||||
*(ptr++) = 0;
|
||||
p += strlen(c->dns_search[i].n);
|
||||
*(p++) = 0;
|
||||
}
|
||||
|
||||
memset(ptr, 0, 8 - dns_s_len % 8); /* padding */
|
||||
ptr += 8 - dns_s_len % 8;
|
||||
memset(p, 0, 8 - dns_s_len % 8); /* padding */
|
||||
p += 8 - dns_s_len % 8;
|
||||
}
|
||||
|
||||
dns_done:
|
||||
memcpy(&ra.source_ll.mac, c->our_tap_mac, ETH_ALEN);
|
||||
*p++ = 1; /* source ll */
|
||||
*p++ = 1; /* length */
|
||||
memcpy(p, c->mac, ETH_ALEN);
|
||||
p += 6;
|
||||
} else {
|
||||
return 1;
|
||||
}
|
||||
|
||||
len = (uintptr_t)p - (uintptr_t)ihr - sizeof(*ihr);
|
||||
|
||||
if (IN6_IS_ADDR_LINKLOCAL(saddr))
|
||||
c->ip6.addr_ll_seen = *saddr;
|
||||
else
|
||||
c->ip6.addr_seen = *saddr;
|
||||
|
||||
rsaddr = &c->ip6.our_tap_ll;
|
||||
if (IN6_IS_ADDR_LINKLOCAL(&c->ip6.gw))
|
||||
rsaddr = &c->ip6.gw;
|
||||
else
|
||||
rsaddr = &c->ip6.addr_ll;
|
||||
|
||||
if (ih->icmp6_type == NS) {
|
||||
dlen = sizeof(struct ndp_na);
|
||||
tap_icmp6_send(c, rsaddr, saddr, &na, dlen);
|
||||
} else if (ih->icmp6_type == RS) {
|
||||
dlen = ptr - (unsigned char *)&ra;
|
||||
tap_icmp6_send(c, rsaddr, saddr, &ra, dlen);
|
||||
}
|
||||
tap_icmp6_send(c, rsaddr, saddr, ihr, len + sizeof(*ihr));
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
|
3
ndp.h
3
ndp.h
|
@ -6,7 +6,6 @@
|
|||
#ifndef NDP_H
|
||||
#define NDP_H
|
||||
|
||||
int ndp(struct ctx *c, const struct icmp6hdr *ih, const struct in6_addr *saddr,
|
||||
const struct pool *p);
|
||||
int ndp(struct ctx *c, const struct icmp6hdr *ih, const struct in6_addr *saddr);
|
||||
|
||||
#endif /* NDP_H */
|
||||
|
|
326
netlink.c
326
netlink.c
|
@ -33,13 +33,8 @@
|
|||
#include "util.h"
|
||||
#include "passt.h"
|
||||
#include "log.h"
|
||||
#include "ip.h"
|
||||
#include "netlink.h"
|
||||
|
||||
/* Same as RTA_NEXT() but for nexthops: RTNH_NEXT() doesn't take 'attrlen' */
|
||||
#define RTNH_NEXT_AND_DEC(rtnh, attrlen) \
|
||||
((attrlen) -= RTNH_ALIGN((rtnh)->rtnh_len), RTNH_NEXT(rtnh))
|
||||
|
||||
/* Netlink expects a buffer of at least 8kiB or the system page size,
|
||||
* whichever is larger. 32kiB is recommended for better efficiency.
|
||||
* Since the largest page size on any remotely common Linux setup is
|
||||
|
@ -133,7 +128,7 @@ static uint32_t nl_send(int s, void *req, uint16_t type,
|
|||
|
||||
n = send(s, req, len, 0);
|
||||
if (n < 0)
|
||||
die_perror("netlink: Failed to send()");
|
||||
die("netlink: Failed to send(): %s", strerror(errno));
|
||||
else if (n < len)
|
||||
die("netlink: Short send (%zd of %zd bytes)", n, len);
|
||||
|
||||
|
@ -189,7 +184,7 @@ static struct nlmsghdr *nl_next(int s, char *buf, struct nlmsghdr *nh, ssize_t *
|
|||
|
||||
*n = recv(s, buf, NLBUFSIZ, 0);
|
||||
if (*n < 0)
|
||||
die_perror("netlink: Failed to recv()");
|
||||
die("netlink: Failed to recv(): %s", strerror(errno));
|
||||
|
||||
nh = (struct nlmsghdr *)buf;
|
||||
if (!NLMSG_OK(nh, *n))
|
||||
|
@ -259,8 +254,7 @@ unsigned int nl_get_ext_if(int s, sa_family_t af)
|
|||
.rtm.rtm_type = RTN_UNICAST,
|
||||
.rtm.rtm_family = af,
|
||||
};
|
||||
unsigned defifi = 0, anyifi = 0;
|
||||
unsigned ndef = 0, nany = 0;
|
||||
unsigned int ifi = 0;
|
||||
struct nlmsghdr *nh;
|
||||
struct rtattr *rta;
|
||||
char buf[NLBUFSIZ];
|
||||
|
@ -268,80 +262,30 @@ unsigned int nl_get_ext_if(int s, sa_family_t af)
|
|||
uint32_t seq;
|
||||
size_t na;
|
||||
|
||||
/* Look for an interface with a default route first, failing that, look
|
||||
* for any interface with a route, and pick the first one, if any.
|
||||
*/
|
||||
seq = nl_send(s, &req, RTM_GETROUTE, NLM_F_DUMP, sizeof(req));
|
||||
nl_foreach_oftype(nh, status, s, buf, seq, RTM_NEWROUTE) {
|
||||
struct rtmsg *rtm = (struct rtmsg *)NLMSG_DATA(nh);
|
||||
const void *dst = NULL;
|
||||
unsigned thisifi = 0;
|
||||
|
||||
if (rtm->rtm_family != af)
|
||||
if (ifi || rtm->rtm_dst_len || rtm->rtm_family != af)
|
||||
continue;
|
||||
|
||||
for (rta = RTM_RTA(rtm), na = RTM_PAYLOAD(nh); RTA_OK(rta, na);
|
||||
rta = RTA_NEXT(rta, na)) {
|
||||
if (rta->rta_type == RTA_OIF) {
|
||||
thisifi = *(unsigned int *)RTA_DATA(rta);
|
||||
ifi = *(unsigned int *)RTA_DATA(rta);
|
||||
} else if (rta->rta_type == RTA_MULTIPATH) {
|
||||
const struct rtnexthop *rtnh;
|
||||
|
||||
rtnh = (struct rtnexthop *)RTA_DATA(rta);
|
||||
thisifi = rtnh->rtnh_ifindex;
|
||||
} else if (rta->rta_type == RTA_DST) {
|
||||
dst = RTA_DATA(rta);
|
||||
ifi = rtnh->rtnh_ifindex;
|
||||
}
|
||||
}
|
||||
|
||||
if (!thisifi)
|
||||
continue; /* No interface for this route */
|
||||
|
||||
/* Skip routes to link-local addresses */
|
||||
if (af == AF_INET && dst &&
|
||||
IN4_IS_PREFIX_LINKLOCAL(dst, rtm->rtm_dst_len))
|
||||
continue;
|
||||
|
||||
if (af == AF_INET6 && dst &&
|
||||
IN6_IS_PREFIX_LINKLOCAL(dst, rtm->rtm_dst_len))
|
||||
continue;
|
||||
|
||||
if (rtm->rtm_dst_len == 0) {
|
||||
/* Default route */
|
||||
ndef++;
|
||||
if (!defifi)
|
||||
defifi = thisifi;
|
||||
} else {
|
||||
/* Non-default route */
|
||||
nany++;
|
||||
if (!anyifi)
|
||||
anyifi = thisifi;
|
||||
}
|
||||
}
|
||||
|
||||
if (status < 0)
|
||||
warn("netlink: RTM_GETROUTE failed: %s", strerror(-status));
|
||||
|
||||
if (defifi) {
|
||||
if (ndef > 1) {
|
||||
info("Multiple default %s routes, picked first",
|
||||
af_name(af));
|
||||
}
|
||||
return defifi;
|
||||
}
|
||||
|
||||
if (anyifi) {
|
||||
if (nany > 1) {
|
||||
info("Multiple interfaces with %s routes, picked first",
|
||||
af_name(af));
|
||||
}
|
||||
return anyifi;
|
||||
}
|
||||
|
||||
if (!nany)
|
||||
info("No interfaces with usable %s routes", af_name(af));
|
||||
|
||||
return 0;
|
||||
return ifi;
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -353,13 +297,12 @@ unsigned int nl_get_ext_if(int s, sa_family_t af)
|
|||
*/
|
||||
bool nl_route_get_def_multipath(struct rtattr *rta, void *gw)
|
||||
{
|
||||
int nh_len = RTA_PAYLOAD(rta);
|
||||
struct rtnexthop *rtnh;
|
||||
bool found = false;
|
||||
int hops = -1;
|
||||
|
||||
for (rtnh = (struct rtnexthop *)RTA_DATA(rta);
|
||||
RTNH_OK(rtnh, nh_len); rtnh = RTNH_NEXT_AND_DEC(rtnh, nh_len)) {
|
||||
RTNH_OK(rtnh, RTA_PAYLOAD(rta)); rtnh = RTNH_NEXT(rtnh)) {
|
||||
size_t len = rtnh->rtnh_len - sizeof(*rtnh);
|
||||
struct rtattr *rta_inner;
|
||||
|
||||
|
@ -389,7 +332,7 @@ bool nl_route_get_def_multipath(struct rtattr *rta, void *gw)
|
|||
* @af: Address family
|
||||
* @gw: Default gateway to fill on NL_GET
|
||||
*
|
||||
* Return: error on netlink failure, or 0 (gw unset if default route not found)
|
||||
* Return: 0 on success, negative error code on failure
|
||||
*/
|
||||
int nl_route_get_def(int s, unsigned int ifi, sa_family_t af, void *gw)
|
||||
{
|
||||
|
@ -536,7 +479,7 @@ int nl_route_dup(int s_src, unsigned int ifi_src,
|
|||
.rta.rta_len = RTA_LENGTH(sizeof(unsigned int)),
|
||||
.ifi = ifi_src,
|
||||
};
|
||||
ssize_t nlmsgs_size, left, status;
|
||||
ssize_t nlmsgs_size, status;
|
||||
unsigned dup_routes = 0;
|
||||
struct nlmsghdr *nh;
|
||||
char buf[NLBUFSIZ];
|
||||
|
@ -550,83 +493,39 @@ int nl_route_dup(int s_src, unsigned int ifi_src,
|
|||
* routes in the buffer at once.
|
||||
*/
|
||||
nh = nl_next(s_src, buf, NULL, &nlmsgs_size);
|
||||
for (left = nlmsgs_size;
|
||||
NLMSG_OK(nh, left) && (status = nl_status(nh, left, seq)) > 0;
|
||||
nh = NLMSG_NEXT(nh, left)) {
|
||||
for (status = nlmsgs_size;
|
||||
NLMSG_OK(nh, status) && (status = nl_status(nh, status, seq)) > 0;
|
||||
nh = NLMSG_NEXT(nh, status)) {
|
||||
struct rtmsg *rtm = (struct rtmsg *)NLMSG_DATA(nh);
|
||||
bool discard = false;
|
||||
struct rtattr *rta;
|
||||
size_t na;
|
||||
|
||||
if (nh->nlmsg_type != RTM_NEWROUTE)
|
||||
continue;
|
||||
|
||||
dup_routes++;
|
||||
|
||||
for (rta = RTM_RTA(rtm), na = RTM_PAYLOAD(nh); RTA_OK(rta, na);
|
||||
rta = RTA_NEXT(rta, na)) {
|
||||
/* RTA_OIF and RTA_MULTIPATH attributes carry the
|
||||
* identifier of a host interface. If they match the
|
||||
* host interface we're copying from, change them to
|
||||
* match the corresponding identifier in the target
|
||||
* namespace.
|
||||
*
|
||||
* If RTA_OIF doesn't match (NETLINK_GET_STRICT_CHK not
|
||||
* available), or if any interface index in nexthop
|
||||
* objects differ from the host interface, discard the
|
||||
* route altogether.
|
||||
*/
|
||||
if (rta->rta_type == RTA_OIF) {
|
||||
if (*(unsigned int *)RTA_DATA(rta) != ifi_src) {
|
||||
discard = true;
|
||||
break;
|
||||
}
|
||||
|
||||
*(unsigned int *)RTA_DATA(rta) = ifi_dst;
|
||||
} else if (rta->rta_type == RTA_MULTIPATH) {
|
||||
int nh_len = RTA_PAYLOAD(rta);
|
||||
struct rtnexthop *rtnh;
|
||||
|
||||
for (rtnh = (struct rtnexthop *)RTA_DATA(rta);
|
||||
RTNH_OK(rtnh, nh_len);
|
||||
rtnh = RTNH_NEXT_AND_DEC(rtnh, nh_len)) {
|
||||
int src = (int)ifi_src;
|
||||
|
||||
if (rtnh->rtnh_ifindex != src) {
|
||||
discard = true;
|
||||
break;
|
||||
}
|
||||
|
||||
rtnh->rtnh_ifindex = ifi_dst;
|
||||
}
|
||||
|
||||
if (discard)
|
||||
break;
|
||||
} else if (rta->rta_type == RTA_PREFSRC ||
|
||||
rta->rta_type == RTA_NH_ID) {
|
||||
/* Strip RTA_PREFSRC attributes: host routes
|
||||
* might include a preferred source address,
|
||||
* which must be one of the host's addresses.
|
||||
* However, with -a, pasta will use a different
|
||||
* namespace address, making such a route
|
||||
* invalid in the namespace.
|
||||
*
|
||||
* Strip RTA_NH_ID attributes: host routes set
|
||||
* up via routing protocols (e.g. OSPF) might
|
||||
* contain a nexthop ID (and not nexthop
|
||||
* objects, which are taken care of in the
|
||||
* RTA_MULTIPATH case above) that's not valid
|
||||
* in the target namespace.
|
||||
/* The host obviously lists the host interface
|
||||
* id here, we need to change it to the
|
||||
* namespace's interface id
|
||||
*/
|
||||
*(unsigned int *)RTA_DATA(rta) = ifi_dst;
|
||||
} else if (rta->rta_type == RTA_PREFSRC) {
|
||||
/* Host routes might include a preferred source
|
||||
* address, which must be one of the host's
|
||||
* addresses. However, with -a pasta will use a
|
||||
* different namespace address, making such a
|
||||
* route invalid in the namespace. Strip off
|
||||
* RTA_PREFSRC attributes to avoid that. */
|
||||
rta->rta_type = RTA_UNSPEC;
|
||||
}
|
||||
}
|
||||
|
||||
if (discard)
|
||||
nh->nlmsg_type = NLMSG_NOOP;
|
||||
else
|
||||
dup_routes++;
|
||||
}
|
||||
|
||||
if (!NLMSG_OK(nh, left)) {
|
||||
if (!NLMSG_OK(nh, status) || status > 0) {
|
||||
/* Process any remaining datagrams in a different
|
||||
* buffer so we don't overwrite the first one.
|
||||
*/
|
||||
|
@ -652,9 +551,9 @@ int nl_route_dup(int s_src, unsigned int ifi_src,
|
|||
* to calculate dependencies: let the kernel do that.
|
||||
*/
|
||||
for (i = 0; i < dup_routes; i++) {
|
||||
for (nh = (struct nlmsghdr *)buf, left = nlmsgs_size;
|
||||
NLMSG_OK(nh, left);
|
||||
nh = NLMSG_NEXT(nh, left)) {
|
||||
for (nh = (struct nlmsghdr *)buf, status = nlmsgs_size;
|
||||
NLMSG_OK(nh, status);
|
||||
nh = NLMSG_NEXT(nh, status)) {
|
||||
uint16_t flags = nh->nlmsg_flags;
|
||||
int rc;
|
||||
|
||||
|
@ -664,8 +563,7 @@ int nl_route_dup(int s_src, unsigned int ifi_src,
|
|||
rc = nl_do(s_dst, nh, RTM_NEWROUTE,
|
||||
(flags & ~NLM_F_DUMP_FILTERED) | NLM_F_CREATE,
|
||||
nh->nlmsg_len);
|
||||
if (rc < 0 && rc != -EEXIST &&
|
||||
rc != -ENETUNREACH && rc != -EHOSTUNREACH)
|
||||
if (rc < 0 && rc != -ENETUNREACH && rc != -EEXIST)
|
||||
return rc;
|
||||
}
|
||||
}
|
||||
|
@ -673,63 +571,6 @@ int nl_route_dup(int s_src, unsigned int ifi_src,
|
|||
return 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* nl_addr_set_ll_nodad() - Set IFA_F_NODAD on IPv6 link-local addresses
|
||||
* @s: Netlink socket
|
||||
* @ifi: Interface index in target namespace
|
||||
*
|
||||
* Return: 0 on success, negative error code on failure
|
||||
*/
|
||||
int nl_addr_set_ll_nodad(int s, unsigned int ifi)
|
||||
{
|
||||
struct req_t {
|
||||
struct nlmsghdr nlh;
|
||||
struct ifaddrmsg ifa;
|
||||
} req = {
|
||||
.ifa.ifa_family = AF_INET6,
|
||||
.ifa.ifa_index = ifi,
|
||||
};
|
||||
uint32_t seq, last_seq = 0;
|
||||
ssize_t status, ret = 0;
|
||||
struct nlmsghdr *nh;
|
||||
char buf[NLBUFSIZ];
|
||||
|
||||
seq = nl_send(s, &req, RTM_GETADDR, NLM_F_DUMP, sizeof(req));
|
||||
nl_foreach_oftype(nh, status, s, buf, seq, RTM_NEWADDR) {
|
||||
struct ifaddrmsg *ifa = (struct ifaddrmsg *)NLMSG_DATA(nh);
|
||||
struct rtattr *rta;
|
||||
size_t na;
|
||||
|
||||
if (ifa->ifa_index != ifi || ifa->ifa_scope != RT_SCOPE_LINK)
|
||||
continue;
|
||||
|
||||
ifa->ifa_flags |= IFA_F_NODAD;
|
||||
|
||||
for (rta = IFA_RTA(ifa), na = IFA_PAYLOAD(nh); RTA_OK(rta, na);
|
||||
rta = RTA_NEXT(rta, na)) {
|
||||
/* If 32-bit flags are used, add IFA_F_NODAD there */
|
||||
if (rta->rta_type == IFA_FLAGS)
|
||||
*(uint32_t *)RTA_DATA(rta) |= IFA_F_NODAD;
|
||||
}
|
||||
|
||||
last_seq = nl_send(s, nh, RTM_NEWADDR, NLM_F_REPLACE,
|
||||
nh->nlmsg_len);
|
||||
}
|
||||
|
||||
if (status < 0)
|
||||
ret = status;
|
||||
|
||||
for (seq = seq + 1; seq <= last_seq; seq++) {
|
||||
nl_foreach(nh, status, s, buf, seq)
|
||||
warn("netlink: Unexpected response message");
|
||||
|
||||
if (!ret && status < 0)
|
||||
ret = status;
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
/**
|
||||
* nl_addr_get() - Get most specific global address, given interface and family
|
||||
* @s: Netlink socket
|
||||
|
@ -739,7 +580,7 @@ int nl_addr_set_ll_nodad(int s, unsigned int ifi)
|
|||
* @prefix_len: Mask or prefix length, to fill (for IPv4)
|
||||
* @addr_l: Link-scoped address to fill (for IPv6)
|
||||
*
|
||||
* Return: 0 on success, negative error code on failure
|
||||
* Return: 0 on success, negative error code on failure
|
||||
*/
|
||||
int nl_addr_get(int s, unsigned int ifi, sa_family_t af,
|
||||
void *addr, int *prefix_len, void *addr_l)
|
||||
|
@ -763,13 +604,12 @@ int nl_addr_get(int s, unsigned int ifi, sa_family_t af,
|
|||
struct rtattr *rta;
|
||||
size_t na;
|
||||
|
||||
if (ifa->ifa_index != ifi || ifa->ifa_flags & IFA_F_DEPRECATED)
|
||||
if (ifa->ifa_index != ifi)
|
||||
continue;
|
||||
|
||||
for (rta = IFA_RTA(ifa), na = IFA_PAYLOAD(nh); RTA_OK(rta, na);
|
||||
rta = RTA_NEXT(rta, na)) {
|
||||
if ((af == AF_INET && rta->rta_type != IFA_LOCAL) ||
|
||||
(af == AF_INET6 && rta->rta_type != IFA_ADDRESS))
|
||||
if (rta->rta_type != IFA_ADDRESS)
|
||||
continue;
|
||||
|
||||
if (af == AF_INET && ifa->ifa_prefixlen > prefix_max) {
|
||||
|
@ -797,54 +637,7 @@ int nl_addr_get(int s, unsigned int ifi, sa_family_t af,
|
|||
}
|
||||
|
||||
/**
|
||||
* nl_addr_get_ll() - Get first IPv6 link-local address for a given interface
|
||||
* @s: Netlink socket
|
||||
* @ifi: Interface index in outer network namespace
|
||||
* @addr: Link-local address to fill
|
||||
*
|
||||
* Return: 0 on success, negative error code on failure
|
||||
*/
|
||||
int nl_addr_get_ll(int s, unsigned int ifi, struct in6_addr *addr)
|
||||
{
|
||||
struct req_t {
|
||||
struct nlmsghdr nlh;
|
||||
struct ifaddrmsg ifa;
|
||||
} req = {
|
||||
.ifa.ifa_family = AF_INET6,
|
||||
.ifa.ifa_index = ifi,
|
||||
};
|
||||
struct nlmsghdr *nh;
|
||||
bool found = false;
|
||||
char buf[NLBUFSIZ];
|
||||
ssize_t status;
|
||||
uint32_t seq;
|
||||
|
||||
seq = nl_send(s, &req, RTM_GETADDR, NLM_F_DUMP, sizeof(req));
|
||||
nl_foreach_oftype(nh, status, s, buf, seq, RTM_NEWADDR) {
|
||||
struct ifaddrmsg *ifa = (struct ifaddrmsg *)NLMSG_DATA(nh);
|
||||
struct rtattr *rta;
|
||||
size_t na;
|
||||
|
||||
if (ifa->ifa_index != ifi || ifa->ifa_scope != RT_SCOPE_LINK ||
|
||||
found)
|
||||
continue;
|
||||
|
||||
for (rta = IFA_RTA(ifa), na = IFA_PAYLOAD(nh); RTA_OK(rta, na);
|
||||
rta = RTA_NEXT(rta, na)) {
|
||||
if (rta->rta_type != IFA_ADDRESS)
|
||||
continue;
|
||||
|
||||
if (!found) {
|
||||
memcpy(addr, RTA_DATA(rta), RTA_PAYLOAD(rta));
|
||||
found = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
return status;
|
||||
}
|
||||
|
||||
/**
|
||||
* nl_addr_set() - Set IP addresses for given interface and address family
|
||||
* nl_addr_set() - Set IP addresses for given interface and address family
|
||||
* @s: Netlink socket
|
||||
* @ifi: Interface index
|
||||
* @af: Address family
|
||||
|
@ -947,13 +740,10 @@ int nl_addr_dup(int s_src, unsigned int ifi_src,
|
|||
ifa = (struct ifaddrmsg *)NLMSG_DATA(nh);
|
||||
|
||||
if (rc < 0 || ifa->ifa_scope == RT_SCOPE_LINK ||
|
||||
ifa->ifa_index != ifi_src ||
|
||||
ifa->ifa_flags & IFA_F_DEPRECATED)
|
||||
ifa->ifa_index != ifi_src)
|
||||
continue;
|
||||
|
||||
ifa->ifa_index = ifi_dst;
|
||||
/* Same as nl_addr_set(), but here it's more than a default */
|
||||
ifa->ifa_flags |= IFA_F_NODAD;
|
||||
|
||||
for (rta = IFA_RTA(ifa), na = IFA_PAYLOAD(nh); RTA_OK(rta, na);
|
||||
rta = RTA_NEXT(rta, na)) {
|
||||
|
@ -961,10 +751,6 @@ int nl_addr_dup(int s_src, unsigned int ifi_src,
|
|||
if (rta->rta_type == IFA_LABEL ||
|
||||
rta->rta_type == IFA_CACHEINFO)
|
||||
rta->rta_type = IFA_UNSPEC;
|
||||
|
||||
/* If 32-bit flags are used, add IFA_F_NODAD there */
|
||||
if (rta->rta_type == IFA_FLAGS)
|
||||
*(uint32_t *)RTA_DATA(rta) |= IFA_F_NODAD;
|
||||
}
|
||||
|
||||
rc = nl_do(s_dst, nh, RTM_NEWADDR,
|
||||
|
@ -1046,14 +832,14 @@ int nl_link_set_mac(int s, unsigned int ifi, const void *mac)
|
|||
}
|
||||
|
||||
/**
|
||||
* nl_link_set_mtu() - Set link MTU
|
||||
* nl_link_up() - Bring link up
|
||||
* @s: Netlink socket
|
||||
* @ifi: Interface index
|
||||
* @mtu: Interface MTU
|
||||
* @mtu: If non-zero, set interface MTU
|
||||
*
|
||||
* Return: 0 on success, negative error code on failure
|
||||
*/
|
||||
int nl_link_set_mtu(int s, unsigned int ifi, int mtu)
|
||||
int nl_link_up(int s, unsigned int ifi, int mtu)
|
||||
{
|
||||
struct req_t {
|
||||
struct nlmsghdr nlh;
|
||||
|
@ -1063,35 +849,17 @@ int nl_link_set_mtu(int s, unsigned int ifi, int mtu)
|
|||
} req = {
|
||||
.ifm.ifi_family = AF_UNSPEC,
|
||||
.ifm.ifi_index = ifi,
|
||||
.ifm.ifi_flags = IFF_UP,
|
||||
.ifm.ifi_change = IFF_UP,
|
||||
.rta.rta_type = IFLA_MTU,
|
||||
.rta.rta_len = RTA_LENGTH(sizeof(unsigned int)),
|
||||
.mtu = mtu,
|
||||
};
|
||||
ssize_t len = sizeof(req);
|
||||
|
||||
return nl_do(s, &req, RTM_NEWLINK, 0, sizeof(req));
|
||||
}
|
||||
|
||||
/**
|
||||
* nl_link_set_flags() - Set link flags
|
||||
* @s: Netlink socket
|
||||
* @ifi: Interface index
|
||||
* @set: Device flags to set
|
||||
* @change: Mask of device flag changes
|
||||
*
|
||||
* Return: 0 on success, negative error code on failure
|
||||
*/
|
||||
int nl_link_set_flags(int s, unsigned int ifi,
|
||||
unsigned int set, unsigned int change)
|
||||
{
|
||||
struct req_t {
|
||||
struct nlmsghdr nlh;
|
||||
struct ifinfomsg ifm;
|
||||
} req = {
|
||||
.ifm.ifi_family = AF_UNSPEC,
|
||||
.ifm.ifi_index = ifi,
|
||||
.ifm.ifi_flags = set,
|
||||
.ifm.ifi_change = change,
|
||||
};
|
||||
|
||||
return nl_do(s, &req, RTM_NEWLINK, 0, sizeof(req));
|
||||
if (!mtu)
|
||||
/* Shorten request to drop MTU attribute */
|
||||
len = offsetof(struct req_t, rta);
|
||||
|
||||
return nl_do(s, &req, RTM_NEWLINK, 0, len);
|
||||
}
|
||||
|
|
|
@ -19,14 +19,10 @@ int nl_addr_get(int s, unsigned int ifi, sa_family_t af,
|
|||
void *addr, int *prefix_len, void *addr_l);
|
||||
int nl_addr_set(int s, unsigned int ifi, sa_family_t af,
|
||||
const void *addr, int prefix_len);
|
||||
int nl_addr_get_ll(int s, unsigned int ifi, struct in6_addr *addr);
|
||||
int nl_addr_set_ll_nodad(int s, unsigned int ifi);
|
||||
int nl_addr_dup(int s_src, unsigned int ifi_src,
|
||||
int s_dst, unsigned int ifi_dst, sa_family_t af);
|
||||
int nl_link_get_mac(int s, unsigned int ifi, void *mac);
|
||||
int nl_link_set_mac(int s, unsigned int ifi, const void *mac);
|
||||
int nl_link_set_mtu(int s, unsigned int ifi, int mtu);
|
||||
int nl_link_set_flags(int s, unsigned int ifi,
|
||||
unsigned int set, unsigned int change);
|
||||
int nl_link_up(int s, unsigned int ifi, int mtu);
|
||||
|
||||
#endif /* NETLINK_H */
|
||||
|
|
85
packet.c
85
packet.c
|
@ -22,6 +22,42 @@
|
|||
#include "util.h"
|
||||
#include "log.h"
|
||||
|
||||
static int packet_check_range(const struct pool *p, size_t offset, size_t len,
|
||||
const char *start, const char *func, int line)
|
||||
{
|
||||
ASSERT(p->buf);
|
||||
|
||||
if (p->buf_size == 0)
|
||||
return vu_packet_check_range((void *)p->buf, offset, len, start,
|
||||
func, line);
|
||||
|
||||
if (start < p->buf) {
|
||||
if (func) {
|
||||
trace("add packet start %p before buffer start %p, "
|
||||
"%s:%i", (void *)start, (void *)p->buf, func, line);
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
|
||||
if (start + len + offset > p->buf + p->buf_size) {
|
||||
if (func) {
|
||||
trace("packet offset plus length %lu from size %lu, "
|
||||
"%s:%i", start - p->buf + len + offset,
|
||||
p->buf_size, func, line);
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
|
||||
#if UINTPTR_MAX == UINT64_MAX
|
||||
if ((uintptr_t)start - (uintptr_t)p->buf > UINT32_MAX) {
|
||||
trace("add packet start %p, buffer start %p, %s:%i",
|
||||
(void *)start, (void *)p->buf, func, line);
|
||||
return -1;
|
||||
}
|
||||
#endif
|
||||
|
||||
return 0;
|
||||
}
|
||||
/**
|
||||
* packet_add_do() - Add data as packet descriptor to given pool
|
||||
* @p: Existing pool
|
||||
|
@ -41,34 +77,16 @@ void packet_add_do(struct pool *p, size_t len, const char *start,
|
|||
return;
|
||||
}
|
||||
|
||||
if (start < p->buf) {
|
||||
trace("add packet start %p before buffer start %p, %s:%i",
|
||||
(void *)start, (void *)p->buf, func, line);
|
||||
if (packet_check_range(p, 0, len, start, func, line))
|
||||
return;
|
||||
}
|
||||
|
||||
if (start + len > p->buf + p->buf_size) {
|
||||
trace("add packet start %p, length: %zu, buffer end %p, %s:%i",
|
||||
(void *)start, len, (void *)(p->buf + p->buf_size),
|
||||
func, line);
|
||||
return;
|
||||
}
|
||||
|
||||
if (len > UINT16_MAX) {
|
||||
trace("add packet length %zu, %s:%i", len, func, line);
|
||||
return;
|
||||
}
|
||||
|
||||
#if UINTPTR_MAX == UINT64_MAX
|
||||
if ((uintptr_t)start - (uintptr_t)p->buf > UINT32_MAX) {
|
||||
trace("add packet start %p, buffer start %p, %s:%i",
|
||||
(void *)start, (void *)p->buf, func, line);
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
|
||||
p->pkt[idx].offset = start - p->buf;
|
||||
p->pkt[idx].len = len;
|
||||
p->pkt[idx].iov_base = (void *)start;
|
||||
p->pkt[idx].iov_len = len;
|
||||
|
||||
p->count++;
|
||||
}
|
||||
|
@ -104,28 +122,23 @@ void *packet_get_do(const struct pool *p, size_t idx, size_t offset,
|
|||
return NULL;
|
||||
}
|
||||
|
||||
if (p->pkt[idx].offset + len + offset > p->buf_size) {
|
||||
if (len + offset > p->pkt[idx].iov_len) {
|
||||
if (func) {
|
||||
trace("packet offset plus length %zu from size %zu, "
|
||||
"%s:%i", p->pkt[idx].offset + len + offset,
|
||||
p->buf_size, func, line);
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
|
||||
if (len + offset > p->pkt[idx].len) {
|
||||
if (func) {
|
||||
trace("data length %zu, offset %zu from length %u, "
|
||||
"%s:%i", len, offset, p->pkt[idx].len,
|
||||
trace("data length %zu, offset %zu from length %zu, "
|
||||
"%s:%i", len, offset, p->pkt[idx].iov_len,
|
||||
func, line);
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
|
||||
if (left)
|
||||
*left = p->pkt[idx].len - offset - len;
|
||||
if (packet_check_range(p, offset, len, p->pkt[idx].iov_base,
|
||||
func, line))
|
||||
return NULL;
|
||||
|
||||
return p->buf + p->pkt[idx].offset + offset;
|
||||
if (left)
|
||||
*left = p->pkt[idx].iov_len - offset - len;
|
||||
|
||||
return (char *)p->pkt[idx].iov_base + offset;
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
16
packet.h
16
packet.h
|
@ -6,16 +6,6 @@
|
|||
#ifndef PACKET_H
|
||||
#define PACKET_H
|
||||
|
||||
/**
|
||||
* struct desc - Generic offset-based descriptor within buffer
|
||||
* @offset: Offset of descriptor relative to buffer start, 32-bit limit
|
||||
* @len: Length of descriptor, host order, 16-bit limit
|
||||
*/
|
||||
struct desc {
|
||||
uint32_t offset;
|
||||
uint16_t len;
|
||||
};
|
||||
|
||||
/**
|
||||
* struct pool - Generic pool of packets stored in a buffer
|
||||
* @buf: Buffer storing packet descriptors
|
||||
|
@ -29,9 +19,11 @@ struct pool {
|
|||
size_t buf_size;
|
||||
size_t size;
|
||||
size_t count;
|
||||
struct desc pkt[1];
|
||||
struct iovec pkt[1];
|
||||
};
|
||||
|
||||
int vu_packet_check_range(void *buf, size_t offset, size_t len,
|
||||
const char *start, const char *func, int line);
|
||||
void packet_add_do(struct pool *p, size_t len, const char *start,
|
||||
const char *func, int line);
|
||||
void *packet_get_do(const struct pool *p, const size_t idx,
|
||||
|
@ -54,7 +46,7 @@ struct _name ## _t { \
|
|||
size_t buf_size; \
|
||||
size_t size; \
|
||||
size_t count; \
|
||||
struct desc pkt[_size]; \
|
||||
struct iovec pkt[_size]; \
|
||||
}
|
||||
|
||||
#define PACKET_POOL_INIT_NOCAST(_size, _buf, _buf_size) \
|
||||
|
|
216
passt.1
216
passt.1
|
@ -73,9 +73,6 @@ for performance reasons.
|
|||
|
||||
.SH OPTIONS
|
||||
|
||||
Unless otherwise noted below, \fBif conflicting or multiple options are given,
|
||||
the last one takes effect.\fR
|
||||
|
||||
.TP
|
||||
.BR \-d ", " \-\-debug
|
||||
Be verbose, don't log to the system logger.
|
||||
|
@ -95,18 +92,14 @@ detached PID namespace after starting, because the PID itself cannot change.
|
|||
Default is to fork into background.
|
||||
|
||||
.TP
|
||||
.BR \-e ", " \-\-stderr " " (DEPRECATED)
|
||||
This option has no effect, and is maintained for compatibility purposes only.
|
||||
|
||||
Note that this configuration option is \fBdeprecated\fR and will be removed in a
|
||||
future version.
|
||||
.BR \-e ", " \-\-stderr
|
||||
Log to standard error too.
|
||||
Default is to log to the system logger only, if started from an interactive
|
||||
terminal, and to both system logger and standard error otherwise.
|
||||
|
||||
.TP
|
||||
.BR \-l ", " \-\-log-file " " \fIPATH\fR
|
||||
Log to file \fIPATH\fR, and not to the system logger.
|
||||
|
||||
Specifying this option multiple times does \fInot\fR lead to multiple log files:
|
||||
the last given option takes effect.
|
||||
Log to file \fIPATH\fR, not to standard error, and not to the system logger.
|
||||
|
||||
.TP
|
||||
.BR \-\-log-size " " \fISIZE\fR
|
||||
|
@ -135,9 +128,6 @@ Show version and exit.
|
|||
Capture tap-facing (that is, guest-side or namespace-side) network packets to
|
||||
\fIfile\fR in \fBpcap\fR format.
|
||||
|
||||
Specifying this option multiple times does \fInot\fR lead to multiple capture
|
||||
files: the last given option takes effect.
|
||||
|
||||
.TP
|
||||
.BR \-P ", " \-\-pid " " \fIfile
|
||||
Write own PID to \fIfile\fR once initialisation is done, before forking to
|
||||
|
@ -158,9 +148,7 @@ for an IPv6 \fIaddr\fR.
|
|||
This option can be specified zero (for defaults) to two times (once for IPv4,
|
||||
once for IPv6).
|
||||
By default, assigned IPv4 and IPv6 addresses are taken from the host interfaces
|
||||
with the first default route, if any, for the corresponding IP version. If no
|
||||
default routes are available and there is any interface with any route for a
|
||||
given IP version, the first of these interfaces will be chosen instead.
|
||||
with the first default route for the corresponding IP version.
|
||||
|
||||
.TP
|
||||
.BR \-n ", " \-\-netmask " " \fImask
|
||||
|
@ -184,11 +172,9 @@ Assign IPv4 \fIaddr\fR as default gateway via DHCP (option 3), or IPv6
|
|||
This option can be specified zero (for defaults) to two times (once for IPv4,
|
||||
once for IPv6).
|
||||
By default, IPv4 and IPv6 gateways are taken from the host interface with the
|
||||
first default route, if any, for the corresponding IP version. If the default
|
||||
route is a multipath one, the gateway is the first nexthop router returned by
|
||||
the kernel which has the highest weight in the set of paths. If no default
|
||||
routes are available and there is just one interface with any route, that
|
||||
interface will be chosen instead.
|
||||
first default route for the corresponding IP version. If the default route is a
|
||||
multipath one, the gateway is the first nexthop router returned by the kernel
|
||||
which has the highest weight in the set of paths.
|
||||
|
||||
Note: these addresses are also used as source address for packets directed to
|
||||
the guest or to the target namespace having a loopback or local source address,
|
||||
|
@ -199,11 +185,9 @@ to allow mapping of local traffic to guest and target namespace. See the
|
|||
.BR \-i ", " \-\-interface " " \fIname
|
||||
Use host interface \fIname\fR to derive addresses and routes.
|
||||
Default is to use the interfaces specified by \fB--outbound-if4\fR and
|
||||
\fB--outbound-if6\fR, for IPv4 and IPv6 addresses and routes, respectively.
|
||||
|
||||
If no interfaces are given, the interface with the first default routes for each
|
||||
IP version is selected. If no default routes are available and there is just one
|
||||
interface with any route, that interface will be chosen instead.
|
||||
\fB--outbound-if6\fR, for IPv4 and IPv6 addresses and routes, respectively. If
|
||||
no interfaces are given, the interface with the first default routes for each IP
|
||||
version is selected.
|
||||
|
||||
.TP
|
||||
.BR \-o ", " \-\-outbound " " \fIaddr
|
||||
|
@ -219,49 +203,30 @@ By default, the source address is selected by the routing tables.
|
|||
Bind IPv4 outbound sockets to host interface \fIname\fR, and, unless another
|
||||
interface is specified via \fB-i\fR, \fB--interface\fR, use this interface to
|
||||
derive IPv4 addresses and routes.
|
||||
|
||||
By default, the interface given by the default route is selected. If no default
|
||||
routes are available and there is just one interface with any route, that
|
||||
interface will be chosen instead.
|
||||
By default, the interface given by the default route is selected.
|
||||
|
||||
.TP
|
||||
.BR \-\-outbound-if6 " " \fIname
|
||||
Bind IPv6 outbound sockets to host interface \fIname\fR, and, unless another
|
||||
interface is specified via \fB-i\fR, \fB--interface\fR, use this interface to
|
||||
derive IPv6 addresses and routes.
|
||||
|
||||
By default, the interface given by the default route is selected. If no default
|
||||
routes are available and there is just one interface with any route, that
|
||||
interface will be chosen instead.
|
||||
By default, the interface given by the default route is selected.
|
||||
|
||||
.TP
|
||||
.BR \-D ", " \-\-dns " " \fIaddr
|
||||
Instruct the guest (via DHCP, DHCPv6 or NDP) to use \fIaddr\fR (IPv4
|
||||
or IPv6) as a nameserver, as configured (see options
|
||||
\fB--no-dhcp-dns\fR, \fB--dhcp-dns\fR) instead of reading addresses
|
||||
from \fI/etc/resolv.conf\fR. This option can be specified multiple
|
||||
times. Specifying \fB-D none\fR disables usage of DNS addresses
|
||||
altogether. Unlike addresses from \fI/etc/resolv.conf\fR, \fIaddr\fR
|
||||
is given to the guest without remapping. For example \fB--dns
|
||||
127.0.0.1\fR will instruct the guest to use itself as nameserver, not
|
||||
the host.
|
||||
Use \fIaddr\fR (IPv4 or IPv6) for DHCP, DHCPv6, NDP or DNS forwarding, as
|
||||
configured (see options \fB--no-dhcp-dns\fR, \fB--dhcp-dns\fR,
|
||||
\fB--dns-forward\fR) instead of reading addresses from \fI/etc/resolv.conf\fR.
|
||||
This option can be specified multiple times. Specifying \fB-D none\fR disables
|
||||
usage of DNS addresses altogether.
|
||||
|
||||
.TP
|
||||
.BR \-\-dns-forward " " \fIaddr
|
||||
Map \fIaddr\fR (IPv4 or IPv6) as seen from guest or namespace to the
|
||||
nameserver (with corresponding IP version) specified by the
|
||||
\fB\-\-dns-host\fR option. Maps only UDP and TCP traffic to port 53 or
|
||||
port 853. Replies are translated back with a reverse mapping. This
|
||||
option can be specified zero to two times (once for IPv4, once for
|
||||
IPv6).
|
||||
|
||||
.TP
|
||||
.BR \-\-dns-host " " \fIaddr
|
||||
Configure the host nameserver which guest or namespace queries to the
|
||||
\fB\-\-dns-forward\fR address will be redirected to. This option can
|
||||
be specified zero to two times (once for IPv4, once for IPv6).
|
||||
By default, the first nameserver from the host's
|
||||
\fI/etc/resolv.conf\fR.
|
||||
Map \fIaddr\fR (IPv4 or IPv6) as seen from guest or namespace to the first
|
||||
configured DNS resolver (with corresponding IP version). Mapping is limited to
|
||||
UDP traffic directed to port 53, and DNS answers are translated back with a
|
||||
reverse mapping.
|
||||
This option can be specified zero to two times (once for IPv4, once for IPv6).
|
||||
|
||||
.TP
|
||||
.BR \-S ", " \-\-search " " \fIlist
|
||||
|
@ -272,28 +237,28 @@ list altogether (if you need to search a domain called "none" you can use
|
|||
\fB--search none.\fR).
|
||||
|
||||
.TP
|
||||
.BR \-\-no-dhcp-dns
|
||||
.BR \-\-no-dhcp-dns " " \fIaddr
|
||||
In \fIpasst\fR mode, do not assign IPv4 addresses via DHCP (option 23) or IPv6
|
||||
addresses via NDP Router Advertisement (option type 25) and DHCPv6 (option 23)
|
||||
as DNS resolvers.
|
||||
By default, all the configured addresses are passed.
|
||||
|
||||
.TP
|
||||
.BR \-\-dhcp-dns
|
||||
.BR \-\-dhcp-dns " " \fIaddr
|
||||
In \fIpasta\fR mode, assign IPv4 addresses via DHCP (option 23) or IPv6
|
||||
addresses via NDP Router Advertisement (option type 25) and DHCPv6 (option 23)
|
||||
as DNS resolvers.
|
||||
By default, configured addresses, if any, are not passed.
|
||||
|
||||
.TP
|
||||
.BR \-\-no-dhcp-search
|
||||
.BR \-\-no-dhcp-search " " \fIaddr
|
||||
In \fIpasst\fR mode, do not send the DNS domain search list addresses via DHCP
|
||||
(option 119), via NDP Router Advertisement (option type 31) and DHCPv6 (option
|
||||
24).
|
||||
By default, the DNS domain search list resulting from configuration is passed.
|
||||
|
||||
.TP
|
||||
.BR \-\-dhcp-search
|
||||
.BR \-\-dhcp-search " " \fIaddr
|
||||
In \fIpasta\fR mode, send the DNS domain search list addresses via DHCP (option
|
||||
119), via NDP Router Advertisement (option type 31) and DHCPv6 (option 24).
|
||||
By default, the DNS domain search list resulting from configuration is not
|
||||
|
@ -336,63 +301,23 @@ namespace will be silently dropped.
|
|||
Disable Router Advertisements. Router Solicitations coming from guest or target
|
||||
namespace will be ignored.
|
||||
|
||||
.TP
|
||||
.BR \-\-freebind
|
||||
Allow any binding address to be specified for \fB-t\fR and \fB-u\fR
|
||||
options. Usually binding addresses must be addresses currently
|
||||
configured on the host. With \fB\-\-freebind\fR, the
|
||||
\fBIP_FREEBIND\fR or \fBIPV6_FREEBIND\fR socket option is enabled
|
||||
allowing any address to be used. This is typically used to bind
|
||||
addresses which might be configured on the host in future, at which
|
||||
point the forwarding will immediately start operating.
|
||||
|
||||
.TP
|
||||
.BR \-\-map-host-loopback " " \fIaddr
|
||||
Translate \fIaddr\fR to refer to the host. Packets from the guest to
|
||||
\fIaddr\fR will be redirected to the host. On the host such packets
|
||||
will appear to have both source and destination of 127.0.0.1 or ::1.
|
||||
|
||||
If \fIaddr\fR is 'none', no address is mapped (this implies
|
||||
\fB--no-map-gw\fR). Only one IPv4 and one IPv6 address can be
|
||||
translated; if the option is specified multiple times, the last one
|
||||
takes effect.
|
||||
|
||||
Default is to translate the guest's default gateway address, unless
|
||||
\fB--no-map-gw\fR is given, in which case no address is mapped.
|
||||
|
||||
.TP
|
||||
.BR \-\-no-map-gw
|
||||
Don't remap TCP connections and untracked UDP traffic, with the gateway address
|
||||
as destination, to the host. Implied if there is no gateway on the selected
|
||||
default route, or if there is no default route, for any of the enabled address
|
||||
families.
|
||||
|
||||
.TP
|
||||
.BR \-\-map-guest-addr " " \fIaddr
|
||||
Translate \fIaddr\fR in the guest to be equal to the guest's assigned
|
||||
address on the host. That is, packets from the guest to \fIaddr\fR
|
||||
will be redirected to the address assigned to the guest with \fB-a\fR,
|
||||
or by default the host's global address. This allows the guest to
|
||||
access services available on the host's global address, even though its
|
||||
own address shadows that of the host.
|
||||
|
||||
If \fIaddr\fR is 'none', no address is mapped. Only one IPv4 and one
|
||||
IPv6 address can be translated, and if the option is specified
|
||||
multiple times, the last one for each address type takes effect.
|
||||
|
||||
Default is no mapping.
|
||||
default route for any of the enabled address families.
|
||||
|
||||
.TP
|
||||
.BR \-4 ", " \-\-ipv4-only
|
||||
Enable IPv4-only operation. IPv6 traffic will be ignored.
|
||||
By default, IPv6 operation is enabled as long as at least an IPv6 route and an
|
||||
interface address are configured on a given host interface.
|
||||
By default, IPv6 operation is enabled as long as at least an IPv6 default route
|
||||
and an interface address are configured on a given host interface.
|
||||
|
||||
.TP
|
||||
.BR \-6 ", " \-\-ipv6-only
|
||||
Enable IPv6-only operation. IPv4 traffic will be ignored.
|
||||
By default, IPv4 operation is enabled as long as at least an IPv4 route and an
|
||||
interface address are configured on a given host interface.
|
||||
By default, IPv4 operation is enabled as long as at least an IPv4 default route
|
||||
and an interface address are configured on a given host interface.
|
||||
|
||||
.SS \fBpasst\fR-only options
|
||||
|
||||
|
@ -605,13 +530,6 @@ Configure UDP port forwarding from target namespace to init namespace.
|
|||
|
||||
Default is \fBauto\fR.
|
||||
|
||||
.TP
|
||||
.BR \-\-host-lo-to-ns-lo " " (DEPRECATED)
|
||||
If specified, connections forwarded with \fB\-t\fR and \fB\-u\fR from
|
||||
the host's loopback address will appear on the loopback address in the
|
||||
guest as well. Without this option such forwarded packets will appear
|
||||
to come from the guest's public address.
|
||||
|
||||
.TP
|
||||
.BR \-\-userns " " \fIspec
|
||||
Target user namespace to join, as a path. If PID is given, without this option,
|
||||
|
@ -648,7 +566,7 @@ or sourced from the host, and bring up the tap interface.
|
|||
.BR \-\-no-copy-routes " " (DEPRECATED)
|
||||
With \-\-config-net, do not copy all the routes associated to the interface we
|
||||
derive addresses and routes from: set up only the default gateway. Implied by
|
||||
-g, \-\-gateway, for the corresponding IP version only.
|
||||
-g, \-\-gateway.
|
||||
|
||||
Default is to copy all the routing entries from the interface in the outer
|
||||
namespace to the target namespace, translating the output interface attribute to
|
||||
|
@ -663,7 +581,7 @@ below.
|
|||
.BR \-\-no-copy-addrs " " (DEPRECATED)
|
||||
With \-\-config-net, do not copy all the addresses associated to the interface
|
||||
we derive addresses and routes from: set up a single one. Implied by \-a,
|
||||
\-\-address, for the corresponding IP version only.
|
||||
\-\-address.
|
||||
|
||||
Default is to copy all the addresses, except for link-local ones, from the
|
||||
interface from the outer namespace to the target namespace.
|
||||
|
@ -889,41 +807,38 @@ root@localhost's password:
|
|||
|
||||
.SH NOTES
|
||||
|
||||
.SS Handling of traffic with loopback destination and source addresses
|
||||
.SS Handling of traffic with local destination and source addresses
|
||||
|
||||
Both \fBpasst\fR and \fBpasta\fR can bind on ports with a loopback
|
||||
address (127.0.0.0/8 or ::1), depending on the configuration. Loopback
|
||||
destination or source addresses need to be changed before packets are
|
||||
delivered to the guest or target namespace: most operating systems
|
||||
would drop packets received with loopback addresses on non-loopback
|
||||
interfaces, and it would also be impossible for guest or target
|
||||
namespace to route answers back.
|
||||
Both \fBpasst\fR and \fBpasta\fR can bind on ports with a local address,
|
||||
depending on the configuration. Local destination or source addresses need to be
|
||||
changed before packets are delivered to the guest or target namespace: most
|
||||
operating systems would drop packets received from non-loopback interfaces with
|
||||
local addresses, and it would also be impossible for guest or target namespace
|
||||
to route answers back.
|
||||
|
||||
For convenience, the source address on these packets is translated to
|
||||
the address specified by the \fB\-\-map-host-loopback\fR option (with
|
||||
some exceptions in pasta mode, see next section below). If not
|
||||
specified this defaults, somewhat arbitrarily, to the address of
|
||||
default IPv4 or IPv6 gateway (if any) -- this is known to be an
|
||||
existing, valid address on the same subnet. If \fB\-\-no-map-gw\fR or
|
||||
\fB\-\-map-host-loopback none\fR are specified this translation is
|
||||
disabled and packets with loopback addresses are simply dropped.
|
||||
For convenience, and somewhat arbitrarily, the source address on these packets
|
||||
is translated to the address of the default IPv4 or IPv6 gateway -- this is
|
||||
known to be an existing, valid address on the same subnet.
|
||||
|
||||
Loopback destination addresses are translated to the observed external
|
||||
address of the guest or target namespace. For IPv6, the observed
|
||||
link-local address is used if the translated source address is
|
||||
link-local, otherwise the observed global address is used. For both
|
||||
IPv4 and IPv6, if no addresses have been seen yet, the configured
|
||||
addresses will be used instead.
|
||||
Loopback destination addresses are instead translated to the observed external
|
||||
address of the guest or target namespace. For IPv6 packets, if usage of a
|
||||
link-local address by guest or namespace has ever been observed, and the
|
||||
original destination address is also a link-local address, the observed
|
||||
link-local address is used. Otherwise, the observed global address is used. For
|
||||
both IPv4 and IPv6, if no addresses have been seen yet, the configured addresses
|
||||
will be used instead.
|
||||
|
||||
For example, if \fBpasst\fR or \fBpasta\fR receive a connection from 127.0.0.1,
|
||||
with destination 127.0.0.10, and the default IPv4 gateway is 192.0.2.1, while
|
||||
the last observed source address from guest or namespace is 192.0.2.2, this will
|
||||
be translated to a connection from 192.0.2.1 to 192.0.2.2.
|
||||
|
||||
Similarly, for traffic coming from guest or namespace, packets with
|
||||
destination address corresponding to the \fB\-\-map-host-loopback\fR
|
||||
address will have their destination address translated to a loopback
|
||||
address.
|
||||
Similarly, for traffic coming from guest or namespace, packets with destination
|
||||
address corresponding to the default gateway will have their destination address
|
||||
translated to a loopback address, if and only if a packet, in the opposite
|
||||
direction, with a loopback destination or source address, port-wise matching for
|
||||
UDP, or connection-wise for TCP, has been recently forwarded to guest or
|
||||
namespace. This behaviour can be disabled with \-\-no\-map\-gw.
|
||||
|
||||
.SS Handling of local traffic in pasta
|
||||
|
||||
|
@ -939,15 +854,8 @@ and the new socket using the \fBsplice\fR(2) system call, and for UDP, a pair
|
|||
of \fBrecvmmsg\fR(2) and \fBsendmmsg\fR(2) system calls deals with packet
|
||||
transfers.
|
||||
|
||||
Because it's not possible to bind sockets to foreign addresses, this
|
||||
bypass only applies to local connections and traffic. It also means
|
||||
that the address translation differs slightly from passt mode.
|
||||
Connections from loopback to loopback on the host will appear to come
|
||||
from the target namespace's public address within the guest, unless
|
||||
\fB\-\-host-lo-to-ns-lo\fR is specified, in which case they will
|
||||
appear to come from loopback in the namespace as well. The latter
|
||||
behaviour used to be the default, but is usually undesirable, since it
|
||||
can unintentionally expose namespace local services to the host.
|
||||
This bypass only applies to local connections and traffic, because it's not
|
||||
possible to bind sockets to foreign addresses.
|
||||
|
||||
.SS Binding to low numbered ports (well-known or system ports, up to 1023)
|
||||
|
||||
|
@ -1056,8 +964,8 @@ https://passt.top/passt/lists.
|
|||
Copyright (c) 2020-2022 Red Hat GmbH.
|
||||
|
||||
\fBpasst\fR and \fBpasta\fR are free software: you can redistribute them and/or
|
||||
modify them under the terms of the GNU General Public License as
|
||||
published by the Free Software Foundation, either version 2 of the License, or
|
||||
modify them under the terms of the GNU Affero General Public License as
|
||||
published by the Free Software Foundation, either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
.SH SEE ALSO
|
||||
|
|
158
passt.c
158
passt.c
|
@ -35,7 +35,6 @@
|
|||
#include <syslog.h>
|
||||
#include <sys/prctl.h>
|
||||
#include <netinet/if_ether.h>
|
||||
#include <libgen.h>
|
||||
#ifdef HAS_GETRANDOM
|
||||
#include <sys/random.h>
|
||||
#endif
|
||||
|
@ -66,14 +65,16 @@ char *epoll_type_str[] = {
|
|||
[EPOLL_TYPE_TCP_SPLICE] = "connected spliced TCP socket",
|
||||
[EPOLL_TYPE_TCP_LISTEN] = "listening TCP socket",
|
||||
[EPOLL_TYPE_TCP_TIMER] = "TCP timer",
|
||||
[EPOLL_TYPE_UDP_LISTEN] = "listening UDP socket",
|
||||
[EPOLL_TYPE_UDP_REPLY] = "UDP reply socket",
|
||||
[EPOLL_TYPE_PING] = "ICMP/ICMPv6 ping socket",
|
||||
[EPOLL_TYPE_UDP] = "UDP socket",
|
||||
[EPOLL_TYPE_ICMP] = "ICMP socket",
|
||||
[EPOLL_TYPE_ICMPV6] = "ICMPv6 socket",
|
||||
[EPOLL_TYPE_NSQUIT_INOTIFY] = "namespace inotify watch",
|
||||
[EPOLL_TYPE_NSQUIT_TIMER] = "namespace timer watch",
|
||||
[EPOLL_TYPE_TAP_PASTA] = "/dev/net/tun device",
|
||||
[EPOLL_TYPE_TAP_PASST] = "connected qemu socket",
|
||||
[EPOLL_TYPE_TAP_LISTEN] = "listening qemu socket",
|
||||
[EPOLL_TYPE_VHOST_CMD] = "vhost-user command socket",
|
||||
[EPOLL_TYPE_VHOST_KICK] = "vhost-user kick socket",
|
||||
};
|
||||
static_assert(ARRAY_SIZE(epoll_type_str) == EPOLL_NUM_TYPES,
|
||||
"epoll_type_str[] doesn't match enum epoll_type");
|
||||
|
@ -85,7 +86,7 @@ static_assert(ARRAY_SIZE(epoll_type_str) == EPOLL_NUM_TYPES,
|
|||
*/
|
||||
static void post_handler(struct ctx *c, const struct timespec *now)
|
||||
{
|
||||
#define CALL_PROTO_HANDLER(lc, uc) \
|
||||
#define CALL_PROTO_HANDLER(c, now, lc, uc) \
|
||||
do { \
|
||||
extern void \
|
||||
lc ## _defer_handler (struct ctx *c) \
|
||||
|
@ -104,9 +105,11 @@ static void post_handler(struct ctx *c, const struct timespec *now)
|
|||
} while (0)
|
||||
|
||||
/* NOLINTNEXTLINE(bugprone-branch-clone): intervals can be the same */
|
||||
CALL_PROTO_HANDLER(tcp, TCP);
|
||||
CALL_PROTO_HANDLER(c, now, tcp, TCP);
|
||||
/* NOLINTNEXTLINE(bugprone-branch-clone): intervals can be the same */
|
||||
CALL_PROTO_HANDLER(udp, UDP);
|
||||
CALL_PROTO_HANDLER(c, now, udp, UDP);
|
||||
/* NOLINTNEXTLINE(bugprone-branch-clone): intervals can be the same */
|
||||
CALL_PROTO_HANDLER(c, now, icmp, ICMP);
|
||||
|
||||
flow_defer_handler(c, now);
|
||||
#undef CALL_PROTO_HANDLER
|
||||
|
@ -137,13 +140,14 @@ static void secret_init(struct ctx *c)
|
|||
}
|
||||
if (dev_random >= 0)
|
||||
close(dev_random);
|
||||
|
||||
if (random_read < sizeof(c->hash_secret))
|
||||
if (random_read < sizeof(c->hash_secret)) {
|
||||
#else
|
||||
if (getrandom(&c->hash_secret, sizeof(c->hash_secret),
|
||||
GRND_RANDOM) < 0)
|
||||
GRND_RANDOM) < 0) {
|
||||
#endif /* !HAS_GETRANDOM */
|
||||
die_perror("Failed to get random bytes for hash table and TCP");
|
||||
perror("TCP initial sequence getrandom");
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -163,7 +167,7 @@ static void timer_init(struct ctx *c, const struct timespec *now)
|
|||
*/
|
||||
void proto_update_l2_buf(const unsigned char *eth_d, const unsigned char *eth_s)
|
||||
{
|
||||
tcp_update_l2_buf(eth_d, eth_s);
|
||||
tcp_buf_update_l2(eth_d, eth_s);
|
||||
udp_update_l2_buf(eth_d, eth_s);
|
||||
}
|
||||
|
||||
|
@ -191,30 +195,28 @@ void exit_handler(int signal)
|
|||
* Return: non-zero on failure
|
||||
*
|
||||
* #syscalls read write writev
|
||||
* #syscalls socket getsockopt setsockopt s390x:socketcall i686:socketcall close
|
||||
* #syscalls bind connect recvfrom sendto shutdown
|
||||
* #syscalls arm:recv ppc64le:recv arm:send ppc64le:send
|
||||
* #syscalls socket bind connect getsockopt setsockopt s390x:socketcall close
|
||||
* #syscalls recvfrom sendto shutdown
|
||||
* #syscalls armv6l:recv armv7l:recv ppc64le:recv
|
||||
* #syscalls armv6l:send armv7l:send ppc64le:send
|
||||
* #syscalls accept4|accept listen epoll_ctl epoll_wait|epoll_pwait epoll_pwait
|
||||
* #syscalls clock_gettime arm:clock_gettime64 i686:clock_gettime64
|
||||
* #syscalls clock_gettime armv6l:clock_gettime64 armv7l:clock_gettime64
|
||||
*/
|
||||
int main(int argc, char **argv)
|
||||
{
|
||||
int nfds, i, devnull_fd = -1, pidfile_fd = -1;
|
||||
struct epoll_event events[EPOLL_EVENTS];
|
||||
int nfds, i, devnull_fd = -1;
|
||||
char argv0[PATH_MAX], *name;
|
||||
char *log_name, argv0[PATH_MAX], *name;
|
||||
struct ctx c = { 0 };
|
||||
struct rlimit limit;
|
||||
struct timespec now;
|
||||
struct sigaction sa;
|
||||
|
||||
if (clock_gettime(CLOCK_MONOTONIC, &log_start))
|
||||
die_perror("Failed to get CLOCK_MONOTONIC time");
|
||||
|
||||
arch_avx2_exec(argv);
|
||||
|
||||
isolate_initial(argc, argv);
|
||||
isolate_initial();
|
||||
|
||||
c.pasta_netns_fd = c.fd_tap = c.pidfile_fd = -1;
|
||||
c.pasta_netns_fd = c.fd_tap = c.fd_tap_listen = -1;
|
||||
|
||||
sigemptyset(&sa.sa_mask);
|
||||
sa.sa_flags = 0;
|
||||
|
@ -229,52 +231,70 @@ int main(int argc, char **argv)
|
|||
name = basename(argv0);
|
||||
if (strstr(name, "pasta")) {
|
||||
sa.sa_handler = pasta_child_handler;
|
||||
if (sigaction(SIGCHLD, &sa, NULL))
|
||||
die_perror("Couldn't install signal handlers");
|
||||
if (sigaction(SIGCHLD, &sa, NULL)) {
|
||||
die("Couldn't install signal handlers: %s",
|
||||
strerror(errno));
|
||||
}
|
||||
|
||||
if (signal(SIGPIPE, SIG_IGN) == SIG_ERR)
|
||||
die_perror("Couldn't set disposition for SIGPIPE");
|
||||
if (signal(SIGPIPE, SIG_IGN) == SIG_ERR) {
|
||||
die("Couldn't set disposition for SIGPIPE: %s",
|
||||
strerror(errno));
|
||||
}
|
||||
|
||||
c.mode = MODE_PASTA;
|
||||
log_name = "pasta";
|
||||
} else if (strstr(name, "passt")) {
|
||||
c.mode = MODE_PASST;
|
||||
log_name = "passt";
|
||||
} else {
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
|
||||
madvise(pkt_buf, TAP_BUF_BYTES, MADV_HUGEPAGE);
|
||||
|
||||
__openlog(log_name, 0, LOG_DAEMON);
|
||||
|
||||
c.epollfd = epoll_create1(EPOLL_CLOEXEC);
|
||||
if (c.epollfd == -1)
|
||||
die_perror("Failed to create epoll file descriptor");
|
||||
|
||||
if (getrlimit(RLIMIT_NOFILE, &limit))
|
||||
die_perror("Failed to get maximum value of open files limit");
|
||||
if (c.epollfd == -1) {
|
||||
perror("epoll_create1");
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
|
||||
if (getrlimit(RLIMIT_NOFILE, &limit)) {
|
||||
perror("getrlimit");
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
c.nofile = limit.rlim_cur = limit.rlim_max;
|
||||
if (setrlimit(RLIMIT_NOFILE, &limit))
|
||||
die_perror("Failed to set current limit for open files");
|
||||
|
||||
if (setrlimit(RLIMIT_NOFILE, &limit)) {
|
||||
perror("setrlimit");
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
sock_probe_mem(&c);
|
||||
|
||||
conf(&c, argc, argv);
|
||||
trace_init(c.trace);
|
||||
|
||||
if (c.force_stderr || isatty(fileno(stdout)))
|
||||
__openlog(log_name, LOG_PERROR, LOG_DAEMON);
|
||||
|
||||
pasta_netns_quit_init(&c);
|
||||
|
||||
tap_sock_init(&c);
|
||||
vu_init(&c);
|
||||
|
||||
secret_init(&c);
|
||||
|
||||
if (clock_gettime(CLOCK_MONOTONIC, &now))
|
||||
die_perror("Failed to get CLOCK_MONOTONIC time");
|
||||
clock_gettime(CLOCK_MONOTONIC, &now);
|
||||
|
||||
flow_init();
|
||||
|
||||
if ((!c.no_udp && udp_init(&c)) || (!c.no_tcp && tcp_init(&c)))
|
||||
exit(EXIT_FAILURE);
|
||||
|
||||
proto_update_l2_buf(c.guest_mac, c.our_tap_mac);
|
||||
if (!c.no_icmp)
|
||||
icmp_init();
|
||||
|
||||
proto_update_l2_buf(c.mac_guest, c.mac);
|
||||
|
||||
if (c.ifi4 && !c.no_dhcp)
|
||||
dhcp_init();
|
||||
|
@ -285,39 +305,46 @@ int main(int argc, char **argv)
|
|||
pcap_init(&c);
|
||||
|
||||
if (!c.foreground) {
|
||||
if ((devnull_fd = open("/dev/null", O_RDWR | O_CLOEXEC)) < 0)
|
||||
die_perror("Failed to open /dev/null");
|
||||
if ((devnull_fd = open("/dev/null", O_RDWR | O_CLOEXEC)) < 0) {
|
||||
perror("/dev/null open");
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
}
|
||||
|
||||
if (*c.pid_file) {
|
||||
if ((pidfile_fd = open(c.pid_file,
|
||||
O_CREAT | O_TRUNC | O_WRONLY | O_CLOEXEC,
|
||||
S_IRUSR | S_IWUSR)) < 0) {
|
||||
perror("PID file open");
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
}
|
||||
|
||||
if (isolate_prefork(&c))
|
||||
die("Failed to sandbox process, exiting");
|
||||
|
||||
if (!c.foreground) {
|
||||
__daemon(c.pidfile_fd, devnull_fd);
|
||||
log_stderr = false;
|
||||
} else {
|
||||
pidfile_write(c.pidfile_fd, getpid());
|
||||
}
|
||||
if (!c.foreground)
|
||||
__daemon(pidfile_fd, devnull_fd);
|
||||
else
|
||||
write_pidfile(pidfile_fd, getpid());
|
||||
|
||||
if (pasta_child_pid) {
|
||||
if (pasta_child_pid)
|
||||
kill(pasta_child_pid, SIGUSR1);
|
||||
log_stderr = false;
|
||||
}
|
||||
|
||||
isolate_postfork(&c);
|
||||
|
||||
timer_init(&c, &now);
|
||||
|
||||
loop:
|
||||
/* NOLINTBEGIN(bugprone-branch-clone): intervals can be the same */
|
||||
/* NOLINTNEXTLINE(bugprone-branch-clone): intervals can be the same */
|
||||
/* cppcheck-suppress [duplicateValueTernary, unmatchedSuppression] */
|
||||
nfds = epoll_wait(c.epollfd, events, EPOLL_EVENTS, TIMER_INTERVAL);
|
||||
/* NOLINTEND(bugprone-branch-clone) */
|
||||
if (nfds == -1 && errno != EINTR)
|
||||
die_perror("epoll_wait() failed in main loop");
|
||||
if (nfds == -1 && errno != EINTR) {
|
||||
perror("epoll_wait");
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
|
||||
if (clock_gettime(CLOCK_MONOTONIC, &now))
|
||||
err_perror("Failed to get CLOCK_MONOTONIC time");
|
||||
clock_gettime(CLOCK_MONOTONIC, &now);
|
||||
|
||||
for (i = 0; i < nfds; i++) {
|
||||
union epoll_ref ref = *((union epoll_ref *)&events[i].data.u64);
|
||||
|
@ -355,14 +382,23 @@ loop:
|
|||
case EPOLL_TYPE_TCP_TIMER:
|
||||
tcp_timer_handler(&c, ref);
|
||||
break;
|
||||
case EPOLL_TYPE_UDP_LISTEN:
|
||||
udp_listen_sock_handler(&c, ref, eventmask, &now);
|
||||
case EPOLL_TYPE_UDP:
|
||||
if (c.mode == MODE_VU)
|
||||
udp_vu_sock_handler(&c, ref, eventmask, &now);
|
||||
else
|
||||
udp_buf_sock_handler(&c, ref, eventmask, &now);
|
||||
break;
|
||||
case EPOLL_TYPE_UDP_REPLY:
|
||||
udp_reply_sock_handler(&c, ref, eventmask, &now);
|
||||
case EPOLL_TYPE_ICMP:
|
||||
icmp_sock_handler(&c, AF_INET, ref);
|
||||
break;
|
||||
case EPOLL_TYPE_PING:
|
||||
icmp_sock_handler(&c, ref);
|
||||
case EPOLL_TYPE_ICMPV6:
|
||||
icmp_sock_handler(&c, AF_INET6, ref);
|
||||
break;
|
||||
case EPOLL_TYPE_VHOST_CMD:
|
||||
tap_handler_vu(&c, eventmask);
|
||||
break;
|
||||
case EPOLL_TYPE_VHOST_KICK:
|
||||
vu_kick_cb(&c, ref);
|
||||
break;
|
||||
default:
|
||||
/* Can't happen */
|
||||
|
|
158
passt.h
158
passt.h
|
@ -9,6 +9,26 @@
|
|||
#define UNIX_SOCK_MAX 100
|
||||
#define UNIX_SOCK_PATH "/tmp/passt_%i.socket"
|
||||
|
||||
/**
|
||||
* struct tap_msg - Generic message descriptor for arrays of messages
|
||||
* @pkt_buf_offset: Offset from @pkt_buf
|
||||
* @len: Message length, with L2 headers
|
||||
*/
|
||||
struct tap_msg {
|
||||
uint32_t pkt_buf_offset;
|
||||
uint16_t len;
|
||||
};
|
||||
|
||||
/**
|
||||
* struct tap_l4_msg - Layer-4 message descriptor for protocol handlers
|
||||
* @pkt_buf_offset: Offset of message from @pkt_buf
|
||||
* @l4_len: Length of Layer-4 payload, host order
|
||||
*/
|
||||
struct tap_l4_msg {
|
||||
uint32_t pkt_buf_offset;
|
||||
uint16_t l4_len;
|
||||
};
|
||||
|
||||
union epoll_ref;
|
||||
|
||||
#include <stdbool.h>
|
||||
|
@ -17,21 +37,51 @@ union epoll_ref;
|
|||
|
||||
#include "pif.h"
|
||||
#include "packet.h"
|
||||
#include "siphash.h"
|
||||
#include "ip.h"
|
||||
#include "inany.h"
|
||||
#include "flow.h"
|
||||
#include "icmp.h"
|
||||
#include "fwd.h"
|
||||
#include "tcp.h"
|
||||
#include "udp.h"
|
||||
#include "udp_vu.h"
|
||||
#include "vhost_user.h"
|
||||
|
||||
/* Default address for our end on the tap interface. Bit 0 of byte 0 must be 0
|
||||
* (unicast) and bit 1 of byte 1 must be 1 (locally administered). Otherwise
|
||||
* it's arbitrary.
|
||||
/**
|
||||
* enum epoll_type - Different types of fds we poll over
|
||||
*/
|
||||
#define MAC_OUR_LAA \
|
||||
((uint8_t [ETH_ALEN]){0x9a, 0x55, 0x9a, 0x55, 0x9a, 0x55})
|
||||
enum epoll_type {
|
||||
/* Special value to indicate an invalid type */
|
||||
EPOLL_TYPE_NONE = 0,
|
||||
/* Connected TCP sockets */
|
||||
EPOLL_TYPE_TCP,
|
||||
/* Connected TCP sockets (spliced) */
|
||||
EPOLL_TYPE_TCP_SPLICE,
|
||||
/* Listening TCP sockets */
|
||||
EPOLL_TYPE_TCP_LISTEN,
|
||||
/* timerfds used for TCP timers */
|
||||
EPOLL_TYPE_TCP_TIMER,
|
||||
/* UDP sockets */
|
||||
EPOLL_TYPE_UDP,
|
||||
/* IPv4 ICMP sockets */
|
||||
EPOLL_TYPE_ICMP,
|
||||
/* ICMPv6 sockets */
|
||||
EPOLL_TYPE_ICMPV6,
|
||||
/* inotify fd watching for end of netns (pasta) */
|
||||
EPOLL_TYPE_NSQUIT_INOTIFY,
|
||||
/* timer fd watching for end of netns, fallback for inotify (pasta) */
|
||||
EPOLL_TYPE_NSQUIT_TIMER,
|
||||
/* tuntap character device */
|
||||
EPOLL_TYPE_TAP_PASTA,
|
||||
/* socket connected to qemu */
|
||||
EPOLL_TYPE_TAP_PASST,
|
||||
/* socket listening for qemu socket connections */
|
||||
EPOLL_TYPE_TAP_LISTEN,
|
||||
/* vhost-user command socket */
|
||||
EPOLL_TYPE_VHOST_CMD,
|
||||
/* vhost-user kick event socket */
|
||||
EPOLL_TYPE_VHOST_KICK,
|
||||
|
||||
EPOLL_NUM_TYPES,
|
||||
};
|
||||
|
||||
/**
|
||||
* union epoll_ref - Breakdown of reference for epoll fd bookkeeping
|
||||
|
@ -55,7 +105,8 @@ union epoll_ref {
|
|||
uint32_t flow;
|
||||
flow_sidx_t flowside;
|
||||
union tcp_listen_epoll_ref tcp_listen;
|
||||
union udp_listen_epoll_ref udp;
|
||||
union udp_epoll_ref udp;
|
||||
union icmp_epoll_ref icmp;
|
||||
uint32_t data;
|
||||
int nsdir_fd;
|
||||
};
|
||||
|
@ -67,6 +118,7 @@ static_assert(sizeof(union epoll_ref) <= sizeof(union epoll_data),
|
|||
|
||||
#define TAP_BUF_BYTES \
|
||||
ROUND_DOWN(((ETH_MAX_MTU + sizeof(uint32_t)) * 128), PAGE_SIZE)
|
||||
#define TAP_BUF_FILL (TAP_BUF_BYTES - ETH_MAX_MTU - sizeof(uint32_t))
|
||||
#define TAP_MSGS \
|
||||
DIV_ROUND_UP(TAP_BUF_BYTES, ETH_ZLEN - 2 * ETH_ALEN + sizeof(uint32_t))
|
||||
|
||||
|
@ -94,88 +146,59 @@ struct fqdn {
|
|||
enum passt_modes {
|
||||
MODE_PASST,
|
||||
MODE_PASTA,
|
||||
MODE_VU,
|
||||
};
|
||||
|
||||
/**
|
||||
* struct ip4_ctx - IPv4 execution context
|
||||
* @addr: IPv4 address assigned to guest
|
||||
* @addr: IPv4 address for external, routable interface
|
||||
* @addr_seen: Latest IPv4 address seen as source from tap
|
||||
* @prefix_len: IPv4 prefix length (netmask)
|
||||
* @guest_gw: IPv4 gateway as seen by the guest
|
||||
* @map_host_loopback: Outbound connections to this address are NATted to the
|
||||
* host's 127.0.0.1
|
||||
* @map_guest_addr: Outbound connections to this address are NATted to the
|
||||
* guest's assigned address
|
||||
* @dns: DNS addresses for DHCP, zero-terminated
|
||||
* @dns_match: Forward DNS query if sent to this address
|
||||
* @our_tap_addr: IPv4 address for passt's use on tap
|
||||
* @dns_host: Use this DNS on the host for forwarding
|
||||
* @gw: Default IPv4 gateway, network order
|
||||
* @dns: DNS addresses for DHCP, zero-terminated, network order
|
||||
* @dns_match: Forward DNS query if sent to this address, network order
|
||||
* @dns_host: Use this DNS on the host for forwarding, network order
|
||||
* @addr_out: Optional source address for outbound traffic
|
||||
* @ifname_out: Optional interface name to bind outbound sockets to
|
||||
* @no_copy_routes: Don't copy all routes when configuring target namespace
|
||||
* @no_copy_addrs: Don't copy all addresses when configuring namespace
|
||||
*/
|
||||
struct ip4_ctx {
|
||||
/* PIF_TAP addresses */
|
||||
struct in_addr addr;
|
||||
struct in_addr addr_seen;
|
||||
int prefix_len;
|
||||
struct in_addr guest_gw;
|
||||
struct in_addr map_host_loopback;
|
||||
struct in_addr map_guest_addr;
|
||||
struct in_addr gw;
|
||||
struct in_addr dns[MAXNS + 1];
|
||||
struct in_addr dns_match;
|
||||
struct in_addr our_tap_addr;
|
||||
|
||||
/* PIF_HOST addresses */
|
||||
struct in_addr dns_host;
|
||||
|
||||
struct in_addr addr_out;
|
||||
|
||||
char ifname_out[IFNAMSIZ];
|
||||
|
||||
bool no_copy_routes;
|
||||
bool no_copy_addrs;
|
||||
};
|
||||
|
||||
/**
|
||||
* struct ip6_ctx - IPv6 execution context
|
||||
* @addr: IPv6 address assigned to guest
|
||||
* @addr: IPv6 address for external, routable interface
|
||||
* @addr_ll: Link-local IPv6 address on external, routable interface
|
||||
* @addr_seen: Latest IPv6 global/site address seen as source from tap
|
||||
* @addr_ll_seen: Latest IPv6 link-local address seen as source from tap
|
||||
* @guest_gw: IPv6 gateway as seen by the guest
|
||||
* @map_host_loopback: Outbound connections to this address are NATted to the
|
||||
* host's [::1]
|
||||
* @map_guest_addr: Outbound connections to this address are NATted to the
|
||||
* guest's assigned address
|
||||
* @gw: Default IPv6 gateway
|
||||
* @dns: DNS addresses for DHCPv6 and NDP, zero-terminated
|
||||
* @dns_match: Forward DNS query if sent to this address
|
||||
* @our_tap_ll: Link-local IPv6 address for passt's use on tap
|
||||
* @dns_host: Use this DNS on the host for forwarding
|
||||
* @addr_out: Optional source address for outbound traffic
|
||||
* @ifname_out: Optional interface name to bind outbound sockets to
|
||||
* @no_copy_routes: Don't copy all routes when configuring target namespace
|
||||
* @no_copy_addrs: Don't copy all addresses when configuring namespace
|
||||
*/
|
||||
struct ip6_ctx {
|
||||
/* PIF_TAP addresses */
|
||||
struct in6_addr addr;
|
||||
struct in6_addr addr_ll;
|
||||
struct in6_addr addr_seen;
|
||||
struct in6_addr addr_ll_seen;
|
||||
struct in6_addr guest_gw;
|
||||
struct in6_addr map_host_loopback;
|
||||
struct in6_addr map_guest_addr;
|
||||
struct in6_addr gw;
|
||||
struct in6_addr dns[MAXNS + 1];
|
||||
struct in6_addr dns_match;
|
||||
struct in6_addr our_tap_ll;
|
||||
|
||||
/* PIF_HOST addresses */
|
||||
struct in6_addr dns_host;
|
||||
|
||||
struct in6_addr addr_out;
|
||||
|
||||
char ifname_out[IFNAMSIZ];
|
||||
|
||||
bool no_copy_routes;
|
||||
bool no_copy_addrs;
|
||||
};
|
||||
|
||||
#include <netinet/if_ether.h>
|
||||
|
@ -187,11 +210,11 @@ struct ip6_ctx {
|
|||
* @trace: Enable tracing (extra debug) mode
|
||||
* @quiet: Don't print informational messages
|
||||
* @foreground: Run in foreground, don't log to stderr by default
|
||||
* @force_stderr: Force logging to stderr
|
||||
* @nofile: Maximum number of open files (ulimit -n)
|
||||
* @sock_path: Path for UNIX domain socket
|
||||
* @pcap: Path for packet capture file
|
||||
* @pidfile: Path to PID file, empty string if not configured
|
||||
* @pidfile_fd: File descriptor for PID file, -1 if none
|
||||
* @pid_file: Path to PID file, empty string if not configured
|
||||
* @pasta_netns_fd: File descriptor for network namespace in pasta mode
|
||||
* @no_netns_quit: In pasta mode, don't exit if fs-bound namespace is gone
|
||||
* @netns_base: Base name for fs-bound namespace, if any, in pasta mode
|
||||
|
@ -199,8 +222,8 @@ struct ip6_ctx {
|
|||
* @epollfd: File descriptor for epoll instance
|
||||
* @fd_tap_listen: File descriptor for listening AF_UNIX socket, if any
|
||||
* @fd_tap: AF_UNIX socket, tuntap device, or pre-opened socket
|
||||
* @our_tap_mac: Pasta/passt's MAC on the tap link
|
||||
* @guest_mac: MAC address of guest or namespace, seen or configured
|
||||
* @mac: Host MAC address
|
||||
* @mac_guest: MAC address of guest or namespace, seen or configured
|
||||
* @hash_secret: 128-bit secret for siphash functions
|
||||
* @ifi4: Index of template interface for IPv4, 0 if IPv4 disabled
|
||||
* @ip: IPv4 configuration
|
||||
|
@ -210,6 +233,8 @@ struct ip6_ctx {
|
|||
* @pasta_ifn: Name of namespace interface for pasta
|
||||
* @pasta_ifi: Index of namespace interface for pasta
|
||||
* @pasta_conf_ns: Configure namespace after creating it
|
||||
* @no_copy_routes: Don't copy all routes when configuring target namespace
|
||||
* @no_copy_addrs: Don't copy all addresses when configuring namespace
|
||||
* @no_tcp: Disable TCP operation
|
||||
* @tcp: Context for TCP protocol handler
|
||||
* @no_udp: Disable UDP operation
|
||||
|
@ -225,8 +250,7 @@ struct ip6_ctx {
|
|||
* @no_dhcpv6: Disable DHCPv6 server
|
||||
* @no_ndp: Disable NDP handler altogether
|
||||
* @no_ra: Disable router advertisements
|
||||
* @host_lo_to_ns_lo: Map host loopback addresses to ns loopback addresses
|
||||
* @freebind: Allow binding of non-local addresses for forwarding
|
||||
* @no_map_gw: Don't map connections, untracked UDP to gateway to host
|
||||
* @low_wmem: Low probed net.core.wmem_max
|
||||
* @low_rmem: Low probed net.core.rmem_max
|
||||
*/
|
||||
|
@ -236,13 +260,11 @@ struct ctx {
|
|||
int trace;
|
||||
int quiet;
|
||||
int foreground;
|
||||
int force_stderr;
|
||||
int nofile;
|
||||
char sock_path[UNIX_PATH_MAX];
|
||||
char pcap[PATH_MAX];
|
||||
|
||||
char pidfile[PATH_MAX];
|
||||
int pidfile_fd;
|
||||
|
||||
char pid_file[PATH_MAX];
|
||||
int one_off;
|
||||
|
||||
int pasta_netns_fd;
|
||||
|
@ -254,8 +276,8 @@ struct ctx {
|
|||
int epollfd;
|
||||
int fd_tap_listen;
|
||||
int fd_tap;
|
||||
unsigned char our_tap_mac[ETH_ALEN];
|
||||
unsigned char guest_mac[ETH_ALEN];
|
||||
unsigned char mac[ETH_ALEN];
|
||||
unsigned char mac_guest[ETH_ALEN];
|
||||
uint64_t hash_secret[2];
|
||||
|
||||
unsigned int ifi4;
|
||||
|
@ -269,6 +291,8 @@ struct ctx {
|
|||
char pasta_ifn[IF_NAMESIZE];
|
||||
unsigned int pasta_ifi;
|
||||
int pasta_conf_ns;
|
||||
int no_copy_routes;
|
||||
int no_copy_addrs;
|
||||
|
||||
int no_tcp;
|
||||
struct tcp_ctx tcp;
|
||||
|
@ -286,11 +310,13 @@ struct ctx {
|
|||
int no_dhcpv6;
|
||||
int no_ndp;
|
||||
int no_ra;
|
||||
int host_lo_to_ns_lo;
|
||||
int freebind;
|
||||
int no_map_gw;
|
||||
|
||||
int low_wmem;
|
||||
int low_rmem;
|
||||
|
||||
/* vhost-user */
|
||||
struct VuDev vdev;
|
||||
};
|
||||
|
||||
void proto_update_l2_buf(const unsigned char *eth_d,
|
||||
|
|
114
pasta.c
114
pasta.c
|
@ -12,8 +12,8 @@
|
|||
* Author: Stefano Brivio <sbrivio@redhat.com>
|
||||
*
|
||||
* #syscalls:pasta clone waitid exit exit_group rt_sigprocmask
|
||||
* #syscalls:pasta rt_sigreturn|sigreturn
|
||||
* #syscalls:pasta arm:sigreturn ppc64:sigreturn s390x:sigreturn i686:sigreturn
|
||||
* #syscalls:pasta rt_sigreturn|sigreturn armv6l:sigreturn armv7l:sigreturn
|
||||
* #syscalls:pasta ppc64:sigreturn s390x:sigreturn
|
||||
*/
|
||||
|
||||
#include <sched.h>
|
||||
|
@ -50,8 +50,6 @@
|
|||
#include "netlink.h"
|
||||
#include "log.h"
|
||||
|
||||
#define HOSTNAME_PREFIX "pasta-"
|
||||
|
||||
/* PID of child, in case we created a namespace */
|
||||
int pasta_child_pid;
|
||||
|
||||
|
@ -61,7 +59,6 @@ int pasta_child_pid;
|
|||
*/
|
||||
void pasta_child_handler(int signal)
|
||||
{
|
||||
int errno_save = errno;
|
||||
siginfo_t infop;
|
||||
|
||||
(void)signal;
|
||||
|
@ -86,8 +83,6 @@ void pasta_child_handler(int signal)
|
|||
|
||||
waitid(P_ALL, 0, NULL, WEXITED | WNOHANG);
|
||||
waitid(P_ALL, 0, NULL, WEXITED | WNOHANG);
|
||||
|
||||
errno = errno_save;
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -102,9 +97,7 @@ static int pasta_wait_for_ns(void *arg)
|
|||
int flags = O_RDONLY | O_CLOEXEC;
|
||||
char ns[PATH_MAX];
|
||||
|
||||
if (snprintf_check(ns, PATH_MAX, "/proc/%i/ns/net", pasta_child_pid))
|
||||
die_perror("Can't build netns path");
|
||||
|
||||
snprintf(ns, PATH_MAX, "/proc/%i/ns/net", pasta_child_pid);
|
||||
do {
|
||||
while ((c->pasta_netns_fd = open(ns, flags)) < 0) {
|
||||
if (errno != ENOENT)
|
||||
|
@ -145,15 +138,17 @@ void pasta_open_ns(struct ctx *c, const char *netns)
|
|||
int nfd = -1;
|
||||
|
||||
nfd = open(netns, O_RDONLY | O_CLOEXEC);
|
||||
if (nfd < 0)
|
||||
die_perror("Couldn't open network namespace %s", netns);
|
||||
if (nfd < 0) {
|
||||
die("Couldn't open network namespace %s: %s",
|
||||
netns, strerror(errno));
|
||||
}
|
||||
|
||||
c->pasta_netns_fd = nfd;
|
||||
|
||||
NS_CALL(ns_check, c);
|
||||
|
||||
if (c->pasta_netns_fd < 0)
|
||||
die_perror("Couldn't switch to pasta namespaces");
|
||||
die("Couldn't switch to pasta namespaces: %s", strerror(errno));
|
||||
|
||||
if (!c->no_netns_quit) {
|
||||
char buf[PATH_MAX] = { 0 };
|
||||
|
@ -181,28 +176,18 @@ struct pasta_spawn_cmd_arg {
|
|||
*
|
||||
* Return: this function never returns
|
||||
*/
|
||||
/* cppcheck-suppress [constParameterCallback, unmatchedSuppression] */
|
||||
static int pasta_spawn_cmd(void *arg)
|
||||
{
|
||||
char hostname[HOST_NAME_MAX + 1] = HOSTNAME_PREFIX;
|
||||
const struct pasta_spawn_cmd_arg *a;
|
||||
sigset_t set;
|
||||
|
||||
/* We run in a detached PID and mount namespace: mount /proc over */
|
||||
if (mount("", "/proc", "proc", 0, NULL))
|
||||
warn_perror("Couldn't mount /proc");
|
||||
warn("Couldn't mount /proc: %s", strerror(errno));
|
||||
|
||||
if (write_file("/proc/sys/net/ipv4/ping_group_range", "0 0"))
|
||||
warn("Cannot set ping_group_range, ICMP requests might fail");
|
||||
|
||||
if (!gethostname(hostname + sizeof(HOSTNAME_PREFIX) - 1,
|
||||
HOST_NAME_MAX + 1 - sizeof(HOSTNAME_PREFIX)) ||
|
||||
errno == ENAMETOOLONG) {
|
||||
hostname[HOST_NAME_MAX] = '\0';
|
||||
if (sethostname(hostname, strlen(hostname)))
|
||||
warn("Unable to set pasta-prefixed hostname");
|
||||
}
|
||||
|
||||
/* Wait for the parent to be ready: see main() */
|
||||
sigemptyset(&set);
|
||||
sigaddset(&set, SIGUSR1);
|
||||
|
@ -211,7 +196,8 @@ static int pasta_spawn_cmd(void *arg)
|
|||
a = (const struct pasta_spawn_cmd_arg *)arg;
|
||||
execvp(a->exe, a->argv);
|
||||
|
||||
die_perror("Failed to start command or shell");
|
||||
perror("execvp");
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -225,13 +211,12 @@ static int pasta_spawn_cmd(void *arg)
|
|||
void pasta_start_ns(struct ctx *c, uid_t uid, gid_t gid,
|
||||
int argc, char *argv[])
|
||||
{
|
||||
char ns_fn_stack[NS_FN_STACK_SIZE]
|
||||
__attribute__ ((aligned(__alignof__(max_align_t))));
|
||||
struct pasta_spawn_cmd_arg arg = {
|
||||
.exe = argv[0],
|
||||
.argv = argv,
|
||||
};
|
||||
char uidmap[BUFSIZ], gidmap[BUFSIZ];
|
||||
char ns_fn_stack[NS_FN_STACK_SIZE];
|
||||
char *sh_argv[] = { NULL, NULL };
|
||||
char sh_arg0[PATH_MAX + 1];
|
||||
sigset_t set;
|
||||
|
@ -241,11 +226,8 @@ void pasta_start_ns(struct ctx *c, uid_t uid, gid_t gid,
|
|||
c->quiet = 1;
|
||||
|
||||
/* Configure user and group mappings */
|
||||
if (snprintf_check(uidmap, BUFSIZ, "0 %u 1", uid))
|
||||
die_perror("Can't build uidmap");
|
||||
|
||||
if (snprintf_check(gidmap, BUFSIZ, "0 %u 1", gid))
|
||||
die_perror("Can't build gidmap");
|
||||
snprintf(uidmap, BUFSIZ, "0 %u 1", uid);
|
||||
snprintf(gidmap, BUFSIZ, "0 %u 1", gid);
|
||||
|
||||
if (write_file("/proc/self/uid_map", uidmap) ||
|
||||
write_file("/proc/self/setgroups", "deny") ||
|
||||
|
@ -277,12 +259,14 @@ void pasta_start_ns(struct ctx *c, uid_t uid, gid_t gid,
|
|||
CLONE_NEWUTS | CLONE_NEWNS | SIGCHLD,
|
||||
(void *)&arg);
|
||||
|
||||
if (pasta_child_pid == -1)
|
||||
die_perror("Failed to clone process with detached namespaces");
|
||||
if (pasta_child_pid == -1) {
|
||||
perror("clone");
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
|
||||
NS_CALL(pasta_wait_for_ns, c);
|
||||
if (c->pasta_netns_fd < 0)
|
||||
die_perror("Failed to join network namespace");
|
||||
die("Failed to join network namespace: %s", strerror(errno));
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -293,33 +277,25 @@ void pasta_ns_conf(struct ctx *c)
|
|||
{
|
||||
int rc = 0;
|
||||
|
||||
rc = nl_link_set_flags(nl_sock_ns, 1 /* lo */, IFF_UP, IFF_UP);
|
||||
rc = nl_link_up(nl_sock_ns, 1 /* lo */, 0);
|
||||
if (rc < 0)
|
||||
die("Couldn't bring up loopback interface in namespace: %s",
|
||||
strerror(-rc));
|
||||
|
||||
/* Get or set MAC in target namespace */
|
||||
if (MAC_IS_ZERO(c->guest_mac))
|
||||
nl_link_get_mac(nl_sock_ns, c->pasta_ifi, c->guest_mac);
|
||||
if (MAC_IS_ZERO(c->mac_guest))
|
||||
nl_link_get_mac(nl_sock_ns, c->pasta_ifi, c->mac_guest);
|
||||
else
|
||||
rc = nl_link_set_mac(nl_sock_ns, c->pasta_ifi, c->guest_mac);
|
||||
rc = nl_link_set_mac(nl_sock_ns, c->pasta_ifi, c->mac_guest);
|
||||
if (rc < 0)
|
||||
die("Couldn't set MAC address in namespace: %s",
|
||||
strerror(-rc));
|
||||
|
||||
if (c->pasta_conf_ns) {
|
||||
unsigned int flags = IFF_UP;
|
||||
|
||||
if (c->mtu != -1)
|
||||
nl_link_set_mtu(nl_sock_ns, c->pasta_ifi, c->mtu);
|
||||
|
||||
if (c->ifi6) /* Avoid duplicate address detection on link up */
|
||||
flags |= IFF_NOARP;
|
||||
|
||||
nl_link_set_flags(nl_sock_ns, c->pasta_ifi, flags, flags);
|
||||
nl_link_up(nl_sock_ns, c->pasta_ifi, c->mtu);
|
||||
|
||||
if (c->ifi4) {
|
||||
if (c->ip4.no_copy_addrs) {
|
||||
if (c->no_copy_addrs) {
|
||||
rc = nl_addr_set(nl_sock_ns, c->pasta_ifi,
|
||||
AF_INET,
|
||||
&c->ip4.addr,
|
||||
|
@ -335,10 +311,9 @@ void pasta_ns_conf(struct ctx *c)
|
|||
strerror(-rc));
|
||||
}
|
||||
|
||||
if (c->ip4.no_copy_routes) {
|
||||
if (c->no_copy_routes) {
|
||||
rc = nl_route_set_def(nl_sock_ns, c->pasta_ifi,
|
||||
AF_INET,
|
||||
&c->ip4.guest_gw);
|
||||
AF_INET, &c->ip4.gw);
|
||||
} else {
|
||||
rc = nl_route_dup(nl_sock, c->ifi4, nl_sock_ns,
|
||||
c->pasta_ifi, AF_INET);
|
||||
|
@ -351,24 +326,7 @@ void pasta_ns_conf(struct ctx *c)
|
|||
}
|
||||
|
||||
if (c->ifi6) {
|
||||
rc = nl_addr_get_ll(nl_sock_ns, c->pasta_ifi,
|
||||
&c->ip6.addr_ll_seen);
|
||||
if (rc < 0) {
|
||||
warn("Can't get LL address from namespace: %s",
|
||||
strerror(-rc));
|
||||
}
|
||||
|
||||
rc = nl_addr_set_ll_nodad(nl_sock_ns, c->pasta_ifi);
|
||||
if (rc < 0) {
|
||||
warn("Can't set nodad for LL in namespace: %s",
|
||||
strerror(-rc));
|
||||
}
|
||||
|
||||
/* We dodged DAD: re-enable neighbour solicitations */
|
||||
nl_link_set_flags(nl_sock_ns, c->pasta_ifi,
|
||||
0, IFF_NOARP);
|
||||
|
||||
if (c->ip6.no_copy_addrs) {
|
||||
if (c->no_copy_addrs) {
|
||||
rc = nl_addr_set(nl_sock_ns, c->pasta_ifi,
|
||||
AF_INET6, &c->ip6.addr, 64);
|
||||
} else {
|
||||
|
@ -382,10 +340,9 @@ void pasta_ns_conf(struct ctx *c)
|
|||
strerror(-rc));
|
||||
}
|
||||
|
||||
if (c->ip6.no_copy_routes) {
|
||||
if (c->no_copy_routes) {
|
||||
rc = nl_route_set_def(nl_sock_ns, c->pasta_ifi,
|
||||
AF_INET6,
|
||||
&c->ip6.guest_gw);
|
||||
AF_INET6, &c->ip6.gw);
|
||||
} else {
|
||||
rc = nl_route_dup(nl_sock, c->ifi6,
|
||||
nl_sock_ns, c->pasta_ifi,
|
||||
|
@ -399,7 +356,7 @@ void pasta_ns_conf(struct ctx *c)
|
|||
}
|
||||
}
|
||||
|
||||
proto_update_l2_buf(c->guest_mac, NULL);
|
||||
proto_update_l2_buf(c->mac_guest, NULL);
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -413,12 +370,12 @@ static int pasta_netns_quit_timer(void)
|
|||
struct itimerspec it = { { 1, 0 }, { 1, 0 } }; /* one-second interval */
|
||||
|
||||
if (fd == -1) {
|
||||
err_perror("Failed to create timerfd for quit timer");
|
||||
err("timerfd_create(): %s", strerror(errno));
|
||||
return -errno;
|
||||
}
|
||||
|
||||
if (timerfd_settime(fd, 0, &it, NULL) < 0) {
|
||||
err_perror("Failed to set interval for quit timer");
|
||||
err("timerfd_settime(): %s", strerror(errno));
|
||||
close(fd);
|
||||
return -errno;
|
||||
}
|
||||
|
@ -432,12 +389,12 @@ static int pasta_netns_quit_timer(void)
|
|||
*/
|
||||
void pasta_netns_quit_init(const struct ctx *c)
|
||||
{
|
||||
union epoll_ref ref = { .type = EPOLL_TYPE_NSQUIT_INOTIFY };
|
||||
struct epoll_event ev = { .events = EPOLLIN };
|
||||
int flags = O_NONBLOCK | O_CLOEXEC;
|
||||
struct statfs s = { 0 };
|
||||
bool try_inotify = true;
|
||||
int fd = -1, dir_fd;
|
||||
union epoll_ref ref;
|
||||
|
||||
if (c->mode != MODE_PASTA || c->no_netns_quit || !*c->netns_base)
|
||||
return;
|
||||
|
@ -468,7 +425,6 @@ void pasta_netns_quit_init(const struct ctx *c)
|
|||
ref.type = EPOLL_TYPE_NSQUIT_TIMER;
|
||||
} else {
|
||||
close(dir_fd);
|
||||
ref.type = EPOLL_TYPE_NSQUIT_INOTIFY;
|
||||
}
|
||||
|
||||
if (fd > FD_REF_MAX)
|
||||
|
@ -512,7 +468,7 @@ void pasta_netns_quit_timer_handler(struct ctx *c, union epoll_ref ref)
|
|||
|
||||
n = read(ref.fd, &expirations, sizeof(expirations));
|
||||
if (n < 0)
|
||||
die_perror("Namespace watch timer read() error");
|
||||
die("Namespace watch timer read() error: %s", strerror(errno));
|
||||
if ((size_t)n < sizeof(expirations))
|
||||
warn("Namespace watch timer: short read(): %zi", n);
|
||||
|
||||
|
|
64
pcap.c
64
pcap.c
|
@ -72,43 +72,44 @@ struct pcap_pkthdr {
|
|||
* @iov: IO vector containing frame (with L2 headers and tap headers)
|
||||
* @iovcnt: Number of buffers (@iov entries) in frame
|
||||
* @offset: Byte offset of the L2 headers within @iov
|
||||
* @now: Timestamp
|
||||
* @tv: Timestamp
|
||||
*
|
||||
* Returns: 0 on success, -errno on error writing to the file
|
||||
*/
|
||||
static void pcap_frame(const struct iovec *iov, size_t iovcnt,
|
||||
size_t offset, const struct timespec *now)
|
||||
size_t offset, const struct timeval *tv)
|
||||
{
|
||||
size_t l2len = iov_size(iov, iovcnt) - offset;
|
||||
size_t len = iov_size(iov, iovcnt) - offset;
|
||||
struct pcap_pkthdr h = {
|
||||
.tv_sec = now->tv_sec,
|
||||
.tv_usec = DIV_ROUND_CLOSEST(now->tv_nsec, 1000),
|
||||
.caplen = l2len,
|
||||
.len = l2len
|
||||
.tv_sec = tv->tv_sec,
|
||||
.tv_usec = tv->tv_usec,
|
||||
.caplen = len,
|
||||
.len = len
|
||||
};
|
||||
struct iovec hiov = { &h, sizeof(h) };
|
||||
|
||||
if (write_all_buf(pcap_fd, &h, sizeof(h)) < 0 ||
|
||||
write_remainder(pcap_fd, iov, iovcnt, offset) < 0)
|
||||
debug_perror("Cannot log packet, length %zu", l2len);
|
||||
if (write_remainder(pcap_fd, &hiov, 1, 0) < 0 ||
|
||||
write_remainder(pcap_fd, iov, iovcnt, offset) < 0) {
|
||||
debug("Cannot log packet, length %zu: %s",
|
||||
len, strerror(errno));
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* pcap() - Capture a single frame to pcap file
|
||||
* @pkt: Pointer to data buffer, including L2 headers
|
||||
* @l2len: L2 frame length
|
||||
* @len: L2 packet length
|
||||
*/
|
||||
void pcap(const char *pkt, size_t l2len)
|
||||
void pcap(const char *pkt, size_t len)
|
||||
{
|
||||
struct iovec iov = { (char *)pkt, l2len };
|
||||
struct timespec now = { 0 };
|
||||
struct iovec iov = { (char *)pkt, len };
|
||||
struct timeval tv;
|
||||
|
||||
if (pcap_fd == -1)
|
||||
return;
|
||||
|
||||
if (clock_gettime(CLOCK_REALTIME, &now))
|
||||
err_perror("Failed to get CLOCK_REALTIME time");
|
||||
|
||||
pcap_frame(&iov, 1, 0, &now);
|
||||
gettimeofday(&tv, NULL);
|
||||
pcap_frame(&iov, 1, 0, &tv);
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -121,17 +122,16 @@ void pcap(const char *pkt, size_t l2len)
|
|||
void pcap_multiple(const struct iovec *iov, size_t frame_parts, unsigned int n,
|
||||
size_t offset)
|
||||
{
|
||||
struct timespec now = { 0 };
|
||||
struct timeval tv;
|
||||
unsigned int i;
|
||||
|
||||
if (pcap_fd == -1)
|
||||
return;
|
||||
|
||||
if (clock_gettime(CLOCK_REALTIME, &now))
|
||||
err_perror("Failed to get CLOCK_REALTIME time");
|
||||
gettimeofday(&tv, NULL);
|
||||
|
||||
for (i = 0; i < n; i++)
|
||||
pcap_frame(iov + i * frame_parts, frame_parts, offset, &now);
|
||||
pcap_frame(iov + i * frame_parts, frame_parts, offset, &tv);
|
||||
}
|
||||
|
||||
/*
|
||||
|
@ -141,20 +141,17 @@ void pcap_multiple(const struct iovec *iov, size_t frame_parts, unsigned int n,
|
|||
* @iov: Pointer to the array of struct iovec describing the I/O vector
|
||||
* containing packet data to write, including L2 header
|
||||
* @iovcnt: Number of buffers (@iov entries)
|
||||
* @offset: Offset of the L2 frame within the full data length
|
||||
*/
|
||||
/* cppcheck-suppress unusedFunction */
|
||||
void pcap_iov(const struct iovec *iov, size_t iovcnt, size_t offset)
|
||||
void pcap_iov(const struct iovec *iov, size_t iovcnt)
|
||||
{
|
||||
struct timespec now = { 0 };
|
||||
struct timeval tv;
|
||||
|
||||
if (pcap_fd == -1)
|
||||
return;
|
||||
|
||||
if (clock_gettime(CLOCK_REALTIME, &now))
|
||||
err_perror("Failed to get CLOCK_REALTIME time");
|
||||
|
||||
pcap_frame(iov, iovcnt, offset, &now);
|
||||
gettimeofday(&tv, NULL);
|
||||
pcap_frame(iov, iovcnt, 0, &tv);
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -163,20 +160,23 @@ void pcap_iov(const struct iovec *iov, size_t iovcnt, size_t offset)
|
|||
*/
|
||||
void pcap_init(struct ctx *c)
|
||||
{
|
||||
int flags = O_WRONLY | O_CREAT | O_TRUNC;
|
||||
|
||||
if (pcap_fd != -1)
|
||||
return;
|
||||
|
||||
if (!*c->pcap)
|
||||
return;
|
||||
|
||||
pcap_fd = output_file_open(c->pcap, O_WRONLY);
|
||||
flags |= c->foreground ? O_CLOEXEC : 0;
|
||||
pcap_fd = open(c->pcap, flags, S_IRUSR | S_IWUSR);
|
||||
if (pcap_fd == -1) {
|
||||
err_perror("Couldn't open pcap file %s", c->pcap);
|
||||
perror("open");
|
||||
return;
|
||||
}
|
||||
|
||||
info("Saving packet capture to %s", c->pcap);
|
||||
|
||||
if (write(pcap_fd, &pcap_hdr, sizeof(pcap_hdr)) < 0)
|
||||
warn_perror("Cannot write PCAP header");
|
||||
warn("Cannot write PCAP header: %s", strerror(errno));
|
||||
}
|
||||
|
|
4
pcap.h
4
pcap.h
|
@ -6,10 +6,10 @@
|
|||
#ifndef PCAP_H
|
||||
#define PCAP_H
|
||||
|
||||
void pcap(const char *pkt, size_t l2len);
|
||||
void pcap(const char *pkt, size_t len);
|
||||
void pcap_multiple(const struct iovec *iov, size_t frame_parts, unsigned int n,
|
||||
size_t offset);
|
||||
void pcap_iov(const struct iovec *iov, size_t iovcnt, size_t offset);
|
||||
void pcap_iov(const struct iovec *iov, size_t iovcnt);
|
||||
void pcap_init(struct ctx *c);
|
||||
|
||||
#endif /* PCAP_H */
|
||||
|
|
82
pif.c
82
pif.c
|
@ -7,14 +7,9 @@
|
|||
|
||||
#include <stdint.h>
|
||||
#include <assert.h>
|
||||
#include <netinet/in.h>
|
||||
|
||||
#include "util.h"
|
||||
#include "pif.h"
|
||||
#include "siphash.h"
|
||||
#include "ip.h"
|
||||
#include "inany.h"
|
||||
#include "passt.h"
|
||||
|
||||
const char *pif_type_str[] = {
|
||||
[PIF_NONE] = "<none>",
|
||||
|
@ -24,80 +19,3 @@ const char *pif_type_str[] = {
|
|||
};
|
||||
static_assert(ARRAY_SIZE(pif_type_str) == PIF_NUM_TYPES,
|
||||
"pif_type_str[] doesn't match enum pif_type");
|
||||
|
||||
|
||||
/** pif_sockaddr() - Construct a socket address suitable for an interface
|
||||
* @c: Execution context
|
||||
* @sa: Pointer to sockaddr to fill in
|
||||
* @sl: Updated to relevant length of initialised @sa
|
||||
* @pif: Interface to create the socket address
|
||||
* @addr: IPv[46] address
|
||||
* @port: Port (host byte order)
|
||||
*/
|
||||
void pif_sockaddr(const struct ctx *c, union sockaddr_inany *sa, socklen_t *sl,
|
||||
uint8_t pif, const union inany_addr *addr, in_port_t port)
|
||||
{
|
||||
const struct in_addr *v4 = inany_v4(addr);
|
||||
|
||||
ASSERT(pif_is_socket(pif));
|
||||
|
||||
if (v4) {
|
||||
sa->sa_family = AF_INET;
|
||||
sa->sa4.sin_addr = *v4;
|
||||
sa->sa4.sin_port = htons(port);
|
||||
memset(&sa->sa4.sin_zero, 0, sizeof(sa->sa4.sin_zero));
|
||||
*sl = sizeof(sa->sa4);
|
||||
} else {
|
||||
sa->sa_family = AF_INET6;
|
||||
sa->sa6.sin6_addr = addr->a6;
|
||||
sa->sa6.sin6_port = htons(port);
|
||||
if (pif == PIF_HOST && IN6_IS_ADDR_LINKLOCAL(&addr->a6))
|
||||
sa->sa6.sin6_scope_id = c->ifi6;
|
||||
else
|
||||
sa->sa6.sin6_scope_id = 0;
|
||||
sa->sa6.sin6_flowinfo = 0;
|
||||
*sl = sizeof(sa->sa6);
|
||||
}
|
||||
}
|
||||
|
||||
/** pif_sock_l4() - Open a socket bound to an address on a specified interface
|
||||
* @c: Execution context
|
||||
* @type: Socket epoll type
|
||||
* @pif: Interface for this socket
|
||||
* @addr: Address to bind to, or NULL for dual-stack any
|
||||
* @ifname: Interface for binding, NULL for any
|
||||
* @port: Port number to bind to (host byte order)
|
||||
* @data: epoll reference portion for protocol handlers
|
||||
*
|
||||
* NOTE: For namespace pifs, this must be called having already entered the
|
||||
* relevant namespace.
|
||||
*
|
||||
* Return: newly created socket, negative error code on failure
|
||||
*/
|
||||
int pif_sock_l4(const struct ctx *c, enum epoll_type type, uint8_t pif,
|
||||
const union inany_addr *addr, const char *ifname,
|
||||
in_port_t port, uint32_t data)
|
||||
{
|
||||
union sockaddr_inany sa = {
|
||||
.sa6.sin6_family = AF_INET6,
|
||||
.sa6.sin6_addr = in6addr_any,
|
||||
.sa6.sin6_port = htons(port),
|
||||
};
|
||||
socklen_t sl;
|
||||
|
||||
ASSERT(pif_is_socket(pif));
|
||||
|
||||
if (pif == PIF_SPLICE) {
|
||||
/* Sanity checks */
|
||||
ASSERT(!ifname);
|
||||
ASSERT(addr && inany_is_loopback(addr));
|
||||
}
|
||||
|
||||
if (!addr)
|
||||
return sock_l4_sa(c, type, &sa, sizeof(sa.sa6),
|
||||
ifname, false, data);
|
||||
|
||||
pif_sockaddr(c, &sa, &sl, pif, addr, port);
|
||||
return sock_l4_sa(c, type, &sa, sl,
|
||||
ifname, sa.sa_family == AF_INET6, data);
|
||||
}
|
||||
|
|
21
pif.h
21
pif.h
|
@ -7,9 +7,6 @@
|
|||
#ifndef PIF_H
|
||||
#define PIF_H
|
||||
|
||||
union inany_addr;
|
||||
union sockaddr_inany;
|
||||
|
||||
/**
|
||||
* enum pif_type - Type of passt/pasta interface ("pif")
|
||||
*
|
||||
|
@ -41,26 +38,10 @@ static inline const char *pif_type(enum pif_type pt)
|
|||
return "?";
|
||||
}
|
||||
|
||||
/* cppcheck-suppress unusedFunction */
|
||||
static inline const char *pif_name(uint8_t pif)
|
||||
{
|
||||
return pif_type(pif);
|
||||
}
|
||||
|
||||
/**
|
||||
* pif_is_socket() - Is interface implemented via L4 sockets?
|
||||
* @pif: pif to check
|
||||
*
|
||||
* Return: true if @pif is an L4 socket based interface, otherwise false
|
||||
*/
|
||||
static inline bool pif_is_socket(uint8_t pif)
|
||||
{
|
||||
return pif == PIF_HOST || pif == PIF_SPLICE;
|
||||
}
|
||||
|
||||
void pif_sockaddr(const struct ctx *c, union sockaddr_inany *sa, socklen_t *sl,
|
||||
uint8_t pif, const union inany_addr *addr, in_port_t port);
|
||||
int pif_sock_l4(const struct ctx *c, enum epoll_type type, uint8_t pif,
|
||||
const union inany_addr *addr, const char *ifname,
|
||||
in_port_t port, uint32_t data);
|
||||
|
||||
#endif /* PIF_H */
|
||||
|
|
4
qrap.1
4
qrap.1
|
@ -66,8 +66,8 @@ issues to Stefano Brivio <sbrivio@redhat.com>.
|
|||
Copyright (c) 2020-2021 Red Hat GmbH.
|
||||
|
||||
\fBqrap\fR is free software: you can redistribute it and/or modify it under the
|
||||
terms of the GNU General Public License as published by the Free Software
|
||||
Foundation, either version 2 of the License, or (at your option) any later
|
||||
terms of the GNU Affero General Public License as published by the Free Software
|
||||
Foundation, either version 3 of the License, or (at your option) any later
|
||||
version.
|
||||
|
||||
.SH SEE ALSO
|
||||
|
|
23
seccomp.sh
23
seccomp.sh
|
@ -20,15 +20,6 @@ OUT="$(mktemp)"
|
|||
[ -z "${ARCH}" ] && ARCH="$(uname -m)"
|
||||
[ -z "${CC}" ] && CC="cc"
|
||||
|
||||
AUDIT_ARCH="AUDIT_ARCH_$(echo ${ARCH} | tr [a-z] [A-Z] \
|
||||
| sed 's/^ARM.*/ARM/' \
|
||||
| sed 's/I[456]86/I386/' \
|
||||
| sed 's/PPC64/PPC/' \
|
||||
| sed 's/PPCLE/PPC64LE/' \
|
||||
| sed 's/MIPS64EL/MIPSEL64/' \
|
||||
| sed 's/HPPA/PARISC/' \
|
||||
| sed 's/SH4/SH/')"
|
||||
|
||||
HEADER="/* This file was automatically generated by $(basename ${0}) */
|
||||
|
||||
#ifndef AUDIT_ARCH_PPC64LE
|
||||
|
@ -38,11 +29,11 @@ HEADER="/* This file was automatically generated by $(basename ${0}) */
|
|||
# Prefix for each profile: check that 'arch' in seccomp_data is matching
|
||||
PRE='
|
||||
struct sock_filter filter_@PROFILE@[] = {
|
||||
/* cppcheck-suppress [badBitmaskCheck, unmatchedSuppression] */
|
||||
/* cppcheck-suppress badBitmaskCheck */
|
||||
BPF_STMT(BPF_LD | BPF_W | BPF_ABS,
|
||||
(offsetof(struct seccomp_data, arch))),
|
||||
BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, @AUDIT_ARCH@, 0, @KILL@),
|
||||
/* cppcheck-suppress [badBitmaskCheck, unmatchedSuppression] */
|
||||
BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, PASST_AUDIT_ARCH, 0, @KILL@),
|
||||
/* cppcheck-suppress badBitmaskCheck */
|
||||
BPF_STMT(BPF_LD | BPF_W | BPF_ABS,
|
||||
(offsetof(struct seccomp_data, nr))),
|
||||
|
||||
|
@ -242,8 +233,7 @@ gen_profile() {
|
|||
sub ${__i} CALL "NR:${__nr}" "NAME:${__name}" "ALLOW:${__allow}"
|
||||
done
|
||||
|
||||
finish PRE "PROFILE:${__profile}" "KILL:$(( __statements + 1))" \
|
||||
"AUDIT_ARCH:${AUDIT_ARCH}"
|
||||
finish PRE "PROFILE:${__profile}" "KILL:$(( __statements + 1))"
|
||||
}
|
||||
|
||||
printf '%s\n' "${HEADER}" > "${OUT}"
|
||||
|
@ -252,10 +242,7 @@ for __p in ${__profiles}; do
|
|||
__calls="$(sed -n 's/[\t ]*\*[\t ]*#syscalls\(:'"${__p}"'\|\)[\t ]\{1,\}\(.*\)/\2/p' ${IN})"
|
||||
__calls="${__calls} ${EXTRA_SYSCALLS:-}"
|
||||
__calls="$(filter ${__calls})"
|
||||
|
||||
cols="$(stty -a | sed -n 's/.*columns \([0-9]*\).*/\1/p' || :)" 2>/dev/null
|
||||
case $cols in [0-9]*) col_args="-w ${cols}";; *) col_args="";; esac
|
||||
echo "seccomp profile ${__p} allows: ${__calls}" | tr '\n' ' ' | fmt -t ${col_args}
|
||||
echo "seccomp profile ${__p} allows: ${__calls}" | tr '\n' ' ' | fmt -t
|
||||
|
||||
# Pad here to keep gen_profile() "simple"
|
||||
__count=0
|
||||
|
|
|
@ -115,4 +115,10 @@ static inline uint64_t siphash_final(struct siphash_state *state,
|
|||
return state->v[0] ^ state->v[1] ^ state->v[2] ^ state->v[3];
|
||||
}
|
||||
|
||||
uint64_t siphash_8b(const uint8_t *in, const uint64_t *k);
|
||||
uint64_t siphash_12b(const uint8_t *in, const uint64_t *k);
|
||||
uint64_t siphash_20b(const uint8_t *in, const uint64_t *k);
|
||||
uint64_t siphash_32b(const uint8_t *in, const uint64_t *k);
|
||||
uint64_t siphash_36b(const uint8_t *in, const uint64_t *k);
|
||||
|
||||
#endif /* SIPHASH_H */
|
||||
|
|
93
tap.h
93
tap.h
|
@ -6,60 +6,90 @@
|
|||
#ifndef TAP_H
|
||||
#define TAP_H
|
||||
|
||||
#define ETH_HDR_INIT(proto) { .h_proto = htons_constant(proto) }
|
||||
/*
|
||||
* TCP frame iovec array:
|
||||
* TCP_IOV_VNET vnet length
|
||||
* TCP_IOV_ETH ethernet header
|
||||
* TCP_IOV_IP IP (v4/v6) header
|
||||
* TCP_IOV_PAYLOAD IP payload (TCP header + data)
|
||||
* TCP_IOV_NUM is the number of entries in the iovec array
|
||||
*/
|
||||
#define TCP_IOV_VNET 0
|
||||
#define TCP_IOV_ETH 1
|
||||
#define TCP_IOV_IP 2
|
||||
#define TCP_IOV_PAYLOAD 3
|
||||
#define TCP_IOV_NUM 4
|
||||
|
||||
/**
|
||||
* struct tap_hdr - tap backend specific headers
|
||||
* struct tap_hdr - L2 and tap specific headers
|
||||
* @vnet_len: Frame length (for qemu socket transport)
|
||||
* @eh: Ethernet header
|
||||
*/
|
||||
struct tap_hdr {
|
||||
uint32_t vnet_len;
|
||||
struct ethhdr eh;
|
||||
} __attribute__((packed));
|
||||
|
||||
#define TAP_HDR_INIT(proto) { .eh.h_proto = htons_constant(proto) }
|
||||
|
||||
static inline size_t tap_hdr_len_(const struct ctx *c)
|
||||
{
|
||||
if (c->mode == MODE_PASST)
|
||||
return sizeof(struct tap_hdr);
|
||||
else
|
||||
return sizeof(struct ethhdr);
|
||||
}
|
||||
|
||||
/**
|
||||
* tap_hdr_iov() - struct iovec for a tap header
|
||||
* tap_iov_base() - Find start of tap frame
|
||||
* @c: Execution context
|
||||
* @taph: Pointer to tap specific header buffer
|
||||
* @taph: Pointer to L2 header buffer
|
||||
*
|
||||
* Returns: A struct iovec covering the correct portion of @taph to use as the
|
||||
* tap specific header in the current configuration.
|
||||
* Returns: pointer to the start of tap frame - suitable for an
|
||||
* iov_base to be passed to tap_send_frames()
|
||||
*/
|
||||
static inline struct iovec tap_hdr_iov(const struct ctx *c,
|
||||
struct tap_hdr *thdr)
|
||||
static inline void *tap_iov_base(const struct ctx *c, struct tap_hdr *taph)
|
||||
{
|
||||
return (struct iovec){
|
||||
.iov_base = thdr,
|
||||
.iov_len = c->mode == MODE_PASST ? sizeof(*thdr) : 0,
|
||||
};
|
||||
return (char *)(taph + 1) - tap_hdr_len_(c);
|
||||
}
|
||||
|
||||
/**
|
||||
* tap_hdr_update() - Update the tap specific header for a frame
|
||||
* @taph: Tap specific header buffer to update
|
||||
* @l2len: Frame length (including L2 headers)
|
||||
* tap_iov_len() - Finalize tap frame and return total length
|
||||
* @c: Execution context
|
||||
* @taph: Tap header to finalize
|
||||
* @plen: L2 payload length (excludes L2 and tap specific headers)
|
||||
*
|
||||
* Returns: length of the tap frame including L2 and tap specific
|
||||
* headers - suitable for an iov_len to be passed to
|
||||
* tap_send_frames()
|
||||
*/
|
||||
static inline void tap_hdr_update(struct tap_hdr *thdr, size_t l2len)
|
||||
static inline size_t tap_iov_len(const struct ctx *c, struct tap_hdr *taph,
|
||||
size_t plen)
|
||||
{
|
||||
thdr->vnet_len = htonl(l2len);
|
||||
if (c->mode == MODE_PASST)
|
||||
taph->vnet_len = htonl(plen + sizeof(taph->eh));
|
||||
return plen + tap_hdr_len_(c);
|
||||
}
|
||||
|
||||
struct in_addr tap_ip4_daddr(const struct ctx *c);
|
||||
void tap_udp4_send(const struct ctx *c, struct in_addr src, in_port_t sport,
|
||||
struct in_addr dst, in_port_t dport,
|
||||
const void *in, size_t dlen);
|
||||
const void *in, size_t len);
|
||||
void tap_icmp4_send(const struct ctx *c, struct in_addr src, struct in_addr dst,
|
||||
const void *in, size_t l4len);
|
||||
const void *in, size_t len);
|
||||
const struct in6_addr *tap_ip6_daddr(const struct ctx *c,
|
||||
const struct in6_addr *src);
|
||||
void tap_udp6_send(const struct ctx *c,
|
||||
const struct in6_addr *src, in_port_t sport,
|
||||
const struct in6_addr *dst, in_port_t dport,
|
||||
uint32_t flow, void *in, size_t dlen);
|
||||
uint32_t flow, const void *in, size_t len);
|
||||
void tap_icmp6_send(const struct ctx *c,
|
||||
const struct in6_addr *src, const struct in6_addr *dst,
|
||||
const void *in, size_t l4len);
|
||||
void tap_send_single(const struct ctx *c, const void *data, size_t l2len);
|
||||
size_t tap_send_frames(const struct ctx *c, const struct iovec *iov,
|
||||
size_t bufs_per_frame, size_t nframes);
|
||||
const void *in, size_t len);
|
||||
int tap_send(const struct ctx *c, const void *data, size_t len);
|
||||
size_t tap_send_frames(const struct ctx *c, const struct iovec *iov, size_t n);
|
||||
size_t tap_send_iov(const struct ctx *c, struct iovec iov[][TCP_IOV_NUM],
|
||||
size_t n);
|
||||
void eth_update_mac(struct ethhdr *eh,
|
||||
const unsigned char *eth_d, const unsigned char *eth_s);
|
||||
void tap_listen_handler(struct ctx *c, uint32_t events);
|
||||
|
@ -67,10 +97,17 @@ void tap_handler_pasta(struct ctx *c, uint32_t events,
|
|||
const struct timespec *now);
|
||||
void tap_handler_passt(struct ctx *c, uint32_t events,
|
||||
const struct timespec *now);
|
||||
int tap_sock_unix_open(char *sock_path);
|
||||
void tap_sock_reset(struct ctx *c);
|
||||
void tap_sock_update_buf(void *base, size_t size);
|
||||
void tap_sock_init(struct ctx *c);
|
||||
void tap_flush_pools(void);
|
||||
void tap_handler(struct ctx *c, const struct timespec *now);
|
||||
void tap_add_packet(struct ctx *c, ssize_t l2len, char *p);
|
||||
void pool_flush_all(void);
|
||||
void tap_handler_all(struct ctx *c, const struct timespec *now);
|
||||
|
||||
void packet_add_do(struct pool *p, size_t len, const char *start,
|
||||
const char *func, int line);
|
||||
void packet_add_all_do(struct ctx *c, ssize_t len, char *p,
|
||||
const char *func, int line);
|
||||
#define packet_add_all(p, len, start) \
|
||||
packet_add_all_do(p, len, start, __func__, __LINE__)
|
||||
|
||||
#endif /* TAP_H */
|
||||
|
|
20
tcp.h
20
tcp.h
|
@ -10,24 +10,20 @@
|
|||
|
||||
struct ctx;
|
||||
|
||||
void tcp_timer_handler(const struct ctx *c, union epoll_ref ref);
|
||||
void tcp_listen_handler(const struct ctx *c, union epoll_ref ref,
|
||||
void tcp_timer_handler(struct ctx *c, union epoll_ref ref);
|
||||
void tcp_listen_handler(struct ctx *c, union epoll_ref ref,
|
||||
const struct timespec *now);
|
||||
void tcp_sock_handler(const struct ctx *c, union epoll_ref ref,
|
||||
uint32_t events);
|
||||
int tcp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af,
|
||||
void tcp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events);
|
||||
int tcp_tap_handler(struct ctx *c, uint8_t pif, sa_family_t af,
|
||||
const void *saddr, const void *daddr,
|
||||
const struct pool *p, int idx, const struct timespec *now);
|
||||
int tcp_sock_init(const struct ctx *c, const union inany_addr *addr,
|
||||
int tcp_sock_init(const struct ctx *c, sa_family_t af, const void *addr,
|
||||
const char *ifname, in_port_t port);
|
||||
int tcp_init(struct ctx *c);
|
||||
void tcp_timer(struct ctx *c, const struct timespec *now);
|
||||
void tcp_defer_handler(struct ctx *c);
|
||||
|
||||
void tcp_update_l2_buf(const unsigned char *eth_d, const unsigned char *eth_s);
|
||||
int tcp_set_peek_offset(int s, int offset);
|
||||
|
||||
extern bool peek_offset_cap;
|
||||
void tcp_buf_update_l2(const unsigned char *eth_d, const unsigned char *eth_s);
|
||||
|
||||
/**
|
||||
* union tcp_epoll_ref - epoll reference portion for TCP connections
|
||||
|
@ -59,12 +55,16 @@ union tcp_listen_epoll_ref {
|
|||
* @fwd_in: Port forwarding configuration for inbound packets
|
||||
* @fwd_out: Port forwarding configuration for outbound packets
|
||||
* @timer_run: Timestamp of most recent timer run
|
||||
* @kernel_snd_wnd: Kernel reports sending window (with commit 8f7baad7f035)
|
||||
* @pipe_size: Size of pipes for spliced connections
|
||||
*/
|
||||
struct tcp_ctx {
|
||||
struct fwd_ports fwd_in;
|
||||
struct fwd_ports fwd_out;
|
||||
struct timespec timer_run;
|
||||
#ifdef HAS_SND_WND
|
||||
int kernel_snd_wnd;
|
||||
#endif
|
||||
size_t pipe_size;
|
||||
};
|
||||
|
||||
|
|
459
tcp_buf.c
459
tcp_buf.c
|
@ -6,9 +6,9 @@
|
|||
* PASTA - Pack A Subtle Tap Abstraction
|
||||
* for network namespace/tap device mode
|
||||
*
|
||||
* tcp_buf.c - TCP L2 buffer management functions
|
||||
* tcp_buf.c - TCP L2-L4 translation state machine
|
||||
*
|
||||
* Copyright Red Hat
|
||||
* Copyright (c) 2020-2022 Red Hat GmbH
|
||||
* Author: Stefano Brivio <sbrivio@redhat.com>
|
||||
*/
|
||||
|
||||
|
@ -20,11 +20,10 @@
|
|||
|
||||
#include <netinet/ip.h>
|
||||
|
||||
#include <netinet/tcp.h>
|
||||
#include <linux/tcp.h>
|
||||
|
||||
#include "util.h"
|
||||
#include "ip.h"
|
||||
#include "iov.h"
|
||||
#include "passt.h"
|
||||
#include "tap.h"
|
||||
#include "siphash.h"
|
||||
|
@ -34,169 +33,283 @@
|
|||
#include "tcp_buf.h"
|
||||
|
||||
#define TCP_FRAMES_MEM 128
|
||||
#define TCP_FRAMES \
|
||||
#define TCP_FRAMES \
|
||||
(c->mode == MODE_PASTA ? 1 : TCP_FRAMES_MEM)
|
||||
|
||||
/* Static buffers */
|
||||
/**
|
||||
* struct tcp_buf_seq_update - Sequences to update with length of frames once sent
|
||||
* @seq: Pointer to sequence number sent to tap-side, to be updated
|
||||
* @len: TCP payload length
|
||||
*/
|
||||
struct tcp_buf_seq_update {
|
||||
uint32_t *seq;
|
||||
uint16_t len;
|
||||
};
|
||||
|
||||
/* Ethernet header for IPv4 and IPv6 frames */
|
||||
/* Static buffers */
|
||||
/**
|
||||
* struct tcp_l2_flags_t - TCP header and options for segments without payload
|
||||
* @th: TCP header
|
||||
* @opts: TCP option flags
|
||||
*/
|
||||
struct tcp_l2_flags_t {
|
||||
struct tcphdr th;
|
||||
char opts[OPT_MSS_LEN + OPT_WS_LEN + 1];
|
||||
};
|
||||
/**
|
||||
* struct tcp_l2_payload_t - TCP header and payload for segments with data
|
||||
* 32 bytes aligned to be able to use AVX2 checksum
|
||||
* @th: TCP header
|
||||
* @data: TCP data
|
||||
*/
|
||||
struct tcp_l2_payload_t {
|
||||
struct tcphdr th; /* 20 bytes */
|
||||
uint8_t data[MSS]; /* 65516 bytes */
|
||||
#ifdef __AVX2__
|
||||
} __attribute__ ((packed, aligned(32)));
|
||||
#else
|
||||
} __attribute__ ((packed, aligned(__alignof__(unsigned int))));
|
||||
#endif
|
||||
|
||||
/* Ethernet header for IPv4 frames */
|
||||
static struct ethhdr tcp4_eth_src;
|
||||
|
||||
/* IPv4 headers */
|
||||
static struct iphdr tcp4_l2_ip[TCP_FRAMES_MEM];
|
||||
/* TCP headers and data for IPv4 frames */
|
||||
static struct tcp_l2_payload_t tcp4_l2_payload[TCP_FRAMES_MEM];
|
||||
|
||||
static struct tcp_buf_seq_update tcp4_l2_buf_seq_update[TCP_FRAMES_MEM];
|
||||
static unsigned int tcp4_l2_buf_used;
|
||||
|
||||
/* IPv4 headers for TCP option flags frames */
|
||||
static struct iphdr tcp4_l2_flags_ip[TCP_FRAMES_MEM];
|
||||
/* TCP headers and option flags for IPv4 frames */
|
||||
static struct tcp_l2_flags_t tcp4_l2_flags[TCP_FRAMES_MEM];
|
||||
|
||||
static unsigned int tcp4_l2_flags_buf_used;
|
||||
|
||||
/* Ethernet header for IPv6 frames */
|
||||
static struct ethhdr tcp6_eth_src;
|
||||
|
||||
static struct tap_hdr tcp_payload_tap_hdr[TCP_FRAMES_MEM];
|
||||
/* IPv6 headers */
|
||||
static struct ipv6hdr tcp6_l2_ip[TCP_FRAMES_MEM];
|
||||
/* TCP headers and data for IPv6 frames */
|
||||
static struct tcp_l2_payload_t tcp6_l2_payload[TCP_FRAMES_MEM];
|
||||
|
||||
/* IP headers for IPv4 and IPv6 */
|
||||
struct iphdr tcp4_payload_ip[TCP_FRAMES_MEM];
|
||||
struct ipv6hdr tcp6_payload_ip[TCP_FRAMES_MEM];
|
||||
static struct tcp_buf_seq_update tcp6_l2_buf_seq_update[TCP_FRAMES_MEM];
|
||||
static unsigned int tcp6_l2_buf_used;
|
||||
|
||||
/* TCP segments with payload for IPv4 and IPv6 frames */
|
||||
static struct tcp_payload_t tcp_payload[TCP_FRAMES_MEM];
|
||||
/* IPv6 headers for TCP option flags frames */
|
||||
static struct ipv6hdr tcp6_l2_flags_ip[TCP_FRAMES_MEM];
|
||||
/* TCP headers and option flags for IPv6 frames */
|
||||
static struct tcp_l2_flags_t tcp6_l2_flags[TCP_FRAMES_MEM];
|
||||
|
||||
static_assert(MSS4 <= sizeof(tcp_payload[0].data), "MSS4 is greater than 65516");
|
||||
static_assert(MSS6 <= sizeof(tcp_payload[0].data), "MSS6 is greater than 65516");
|
||||
|
||||
/* References tracking the owner connection of frames in the tap outqueue */
|
||||
static struct tcp_tap_conn *tcp_frame_conns[TCP_FRAMES_MEM];
|
||||
static unsigned int tcp_payload_used;
|
||||
static unsigned int tcp6_l2_flags_buf_used;
|
||||
|
||||
/* recvmsg()/sendmsg() data for tap */
|
||||
static struct iovec iov_sock [TCP_FRAMES_MEM + 1];
|
||||
|
||||
static struct iovec tcp_l2_iov[TCP_FRAMES_MEM][TCP_NUM_IOVS];
|
||||
static struct iovec tcp4_l2_iov [TCP_FRAMES_MEM][TCP_IOV_NUM];
|
||||
static struct iovec tcp6_l2_iov [TCP_FRAMES_MEM][TCP_IOV_NUM];
|
||||
static struct iovec tcp4_l2_flags_iov [TCP_FRAMES_MEM][TCP_IOV_NUM];
|
||||
static struct iovec tcp6_l2_flags_iov [TCP_FRAMES_MEM][TCP_IOV_NUM];
|
||||
|
||||
/**
|
||||
* tcp_update_l2_buf() - Update Ethernet header buffers with addresses
|
||||
* tcp_buf_update_l2() - Update Ethernet header buffers with MAC addresses
|
||||
* @eth_d: Ethernet destination address, NULL if unchanged
|
||||
* @eth_s: Ethernet source address, NULL if unchanged
|
||||
*/
|
||||
void tcp_update_l2_buf(const unsigned char *eth_d, const unsigned char *eth_s)
|
||||
void tcp_buf_update_l2(const unsigned char *eth_d, const unsigned char *eth_s)
|
||||
{
|
||||
eth_update_mac(&tcp4_eth_src, eth_d, eth_s);
|
||||
eth_update_mac(&tcp6_eth_src, eth_d, eth_s);
|
||||
}
|
||||
|
||||
/**
|
||||
* tcp_sock_iov_init() - Initialise scatter-gather L2 buffers for IPv4 sockets
|
||||
* tcp_buf_sock4_iov_init() - Initialise scatter-gather L2 buffers for IPv4 sockets
|
||||
* @c: Execution context
|
||||
*/
|
||||
void tcp_sock_iov_init(const struct ctx *c)
|
||||
void tcp_buf_sock4_iov_init(const struct ctx *c)
|
||||
{
|
||||
struct ipv6hdr ip6 = L2_BUF_IP6_INIT(IPPROTO_TCP);
|
||||
struct iphdr iph = L2_BUF_IP4_INIT(IPPROTO_TCP);
|
||||
int i;
|
||||
|
||||
tcp6_eth_src.h_proto = htons_constant(ETH_P_IPV6);
|
||||
(void)c;
|
||||
|
||||
tcp4_eth_src.h_proto = htons_constant(ETH_P_IP);
|
||||
|
||||
for (i = 0; i < ARRAY_SIZE(tcp_payload); i++) {
|
||||
tcp6_payload_ip[i] = ip6;
|
||||
tcp4_payload_ip[i] = iph;
|
||||
}
|
||||
|
||||
for (i = 0; i < TCP_FRAMES_MEM; i++) {
|
||||
struct iovec *iov = tcp_l2_iov[i];
|
||||
struct iovec *iov;
|
||||
|
||||
iov[TCP_IOV_TAP] = tap_hdr_iov(c, &tcp_payload_tap_hdr[i]);
|
||||
/* headers */
|
||||
tcp4_l2_ip[i] = iph;
|
||||
tcp4_l2_payload[i].th = (struct tcphdr){
|
||||
.doff = sizeof(struct tcphdr) / 4,
|
||||
.ack = 1
|
||||
};
|
||||
|
||||
tcp4_l2_flags_ip[i] = iph;
|
||||
tcp4_l2_flags[i].th = (struct tcphdr){
|
||||
.doff = sizeof(struct tcphdr) / 4,
|
||||
.ack = 1
|
||||
};
|
||||
|
||||
/* iovecs */
|
||||
iov = tcp4_l2_iov[i];
|
||||
iov[TCP_IOV_ETH].iov_base = &tcp4_eth_src;
|
||||
iov[TCP_IOV_ETH].iov_len = sizeof(struct ethhdr);
|
||||
iov[TCP_IOV_PAYLOAD].iov_base = &tcp_payload[i];
|
||||
iov[TCP_IOV_IP].iov_base = &tcp4_l2_ip[i];
|
||||
iov[TCP_IOV_IP].iov_len = sizeof(struct iphdr);
|
||||
iov[TCP_IOV_PAYLOAD].iov_base = &tcp4_l2_payload[i];
|
||||
|
||||
iov = tcp4_l2_flags_iov[i];
|
||||
iov[TCP_IOV_ETH].iov_base = &tcp4_eth_src;
|
||||
iov[TCP_IOV_ETH].iov_len = sizeof(struct ethhdr);
|
||||
iov[TCP_IOV_IP].iov_base = &tcp4_l2_flags_ip[i];
|
||||
iov[TCP_IOV_IP].iov_len = sizeof(struct iphdr);
|
||||
iov[TCP_IOV_PAYLOAD].iov_base = &tcp4_l2_flags[i];
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* tcp_revert_seq() - Revert affected conn->seq_to_tap after failed transmission
|
||||
* @c: Execution context
|
||||
* @conns: Array of connection pointers corresponding to queued frames
|
||||
* @frames: Two-dimensional array containing queued frames with sub-iovs
|
||||
* @num_frames: Number of entries in the two arrays to be compared
|
||||
*/
|
||||
static void tcp_revert_seq(const struct ctx *c, struct tcp_tap_conn **conns,
|
||||
struct iovec (*frames)[TCP_NUM_IOVS], int num_frames)
|
||||
{
|
||||
int i;
|
||||
|
||||
for (i = 0; i < num_frames; i++) {
|
||||
const struct tcphdr *th = frames[i][TCP_IOV_PAYLOAD].iov_base;
|
||||
struct tcp_tap_conn *conn = conns[i];
|
||||
uint32_t seq = ntohl(th->seq);
|
||||
uint32_t peek_offset;
|
||||
|
||||
if (SEQ_LE(conn->seq_to_tap, seq))
|
||||
continue;
|
||||
|
||||
conn->seq_to_tap = seq;
|
||||
peek_offset = conn->seq_to_tap - conn->seq_ack_from_tap;
|
||||
if (tcp_set_peek_offset(conn->sock, peek_offset))
|
||||
tcp_rst(c, conn);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* tcp_payload_flush() - Send out buffers for segments with data or flags
|
||||
* tcp_buf_sock6_iov_init() - Initialise scatter-gather L2 buffers for IPv6 sockets
|
||||
* @c: Execution context
|
||||
*/
|
||||
void tcp_payload_flush(const struct ctx *c)
|
||||
void tcp_buf_sock6_iov_init(const struct ctx *c)
|
||||
{
|
||||
size_t m;
|
||||
struct ipv6hdr ip6 = L2_BUF_IP6_INIT(IPPROTO_TCP);
|
||||
int i;
|
||||
|
||||
m = tap_send_frames(c, &tcp_l2_iov[0][0], TCP_NUM_IOVS,
|
||||
tcp_payload_used);
|
||||
if (m != tcp_payload_used) {
|
||||
tcp_revert_seq(c, &tcp_frame_conns[m], &tcp_l2_iov[m],
|
||||
tcp_payload_used - m);
|
||||
(void)c;
|
||||
|
||||
tcp6_eth_src.h_proto = htons_constant(ETH_P_IPV6);
|
||||
for (i = 0; i < TCP_FRAMES_MEM; i++) {
|
||||
struct iovec *iov;
|
||||
|
||||
/* headers */
|
||||
tcp6_l2_ip[i] = ip6;
|
||||
tcp6_l2_payload[i].th = (struct tcphdr){
|
||||
.doff = sizeof(struct tcphdr) / 4,
|
||||
.ack = 1
|
||||
};
|
||||
|
||||
tcp6_l2_flags_ip[i] = ip6;
|
||||
tcp6_l2_flags[i].th = (struct tcphdr){
|
||||
.doff = sizeof(struct tcphdr) / 4,
|
||||
.ack = 1
|
||||
};
|
||||
|
||||
/* iovecs */
|
||||
iov = tcp6_l2_iov[i];
|
||||
iov[TCP_IOV_ETH].iov_base = &tcp6_eth_src;
|
||||
iov[TCP_IOV_ETH].iov_len = sizeof(struct ethhdr);
|
||||
iov[TCP_IOV_IP].iov_base = &tcp6_l2_ip[i];
|
||||
iov[TCP_IOV_IP].iov_len = sizeof(struct ipv6hdr);
|
||||
iov[TCP_IOV_PAYLOAD].iov_base = &tcp6_l2_payload[i];
|
||||
|
||||
iov = tcp6_l2_flags_iov[i];
|
||||
iov[TCP_IOV_ETH].iov_base = &tcp6_eth_src;
|
||||
iov[TCP_IOV_ETH].iov_len = sizeof(struct ethhdr);
|
||||
iov[TCP_IOV_IP].iov_base = &tcp6_l2_flags_ip[i];
|
||||
iov[TCP_IOV_IP].iov_len = sizeof(struct ipv6hdr);
|
||||
iov[TCP_IOV_PAYLOAD].iov_base = &tcp6_l2_flags[i];
|
||||
}
|
||||
tcp_payload_used = 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* tcp_buf_send_flag() - Send segment with flags to tap (no payload)
|
||||
* @c: Execution context
|
||||
* @conn: Connection pointer
|
||||
* @flags: TCP flags: if not set, send segment only if ACK is due
|
||||
*
|
||||
* Return: negative error code on connection reset, 0 otherwise
|
||||
* tcp_buf_l2_flags_flush() - Send out buffers for segments with no data (flags)
|
||||
* @c: Execution context
|
||||
*/
|
||||
int tcp_buf_send_flag(const struct ctx *c, struct tcp_tap_conn *conn, int flags)
|
||||
void tcp_buf_l2_flags_flush(const struct ctx *c)
|
||||
{
|
||||
struct tcp_payload_t *payload;
|
||||
tap_send_iov(c, tcp6_l2_flags_iov, tcp6_l2_flags_buf_used);
|
||||
tcp6_l2_flags_buf_used = 0;
|
||||
|
||||
tap_send_iov(c, tcp4_l2_flags_iov, tcp4_l2_flags_buf_used);
|
||||
tcp4_l2_flags_buf_used = 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* tcp_buf_l2_data_flush() - Send out buffers for segments with data
|
||||
* @c: Execution context
|
||||
*/
|
||||
void tcp_buf_l2_data_flush(const struct ctx *c)
|
||||
{
|
||||
unsigned i;
|
||||
size_t m;
|
||||
|
||||
m = tap_send_iov(c, tcp6_l2_iov, tcp6_l2_buf_used);
|
||||
for (i = 0; i < m; i++)
|
||||
*tcp6_l2_buf_seq_update[i].seq += tcp6_l2_buf_seq_update[i].len;
|
||||
tcp6_l2_buf_used = 0;
|
||||
|
||||
m = tap_send_iov(c, tcp4_l2_iov, tcp4_l2_buf_used);
|
||||
for (i = 0; i < m; i++)
|
||||
*tcp4_l2_buf_seq_update[i].seq += tcp4_l2_buf_seq_update[i].len;
|
||||
tcp4_l2_buf_used = 0;
|
||||
}
|
||||
|
||||
int tcp_buf_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags)
|
||||
{
|
||||
struct tcp_l2_flags_t *payload;
|
||||
struct iovec *dup_iov;
|
||||
struct iovec *iov;
|
||||
size_t optlen;
|
||||
size_t l4len;
|
||||
uint32_t seq;
|
||||
struct tcphdr *th;
|
||||
size_t optlen = 0;
|
||||
size_t ip_len;
|
||||
char *data;
|
||||
int ret;
|
||||
|
||||
iov = tcp_l2_iov[tcp_payload_used];
|
||||
if (CONN_V4(conn)) {
|
||||
iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp4_payload_ip[tcp_payload_used]);
|
||||
iov[TCP_IOV_ETH].iov_base = &tcp4_eth_src;
|
||||
iov = tcp4_l2_flags_iov[tcp4_l2_flags_buf_used++];
|
||||
dup_iov = tcp4_l2_flags_iov[tcp4_l2_flags_buf_used];
|
||||
} else {
|
||||
iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp6_payload_ip[tcp_payload_used]);
|
||||
iov[TCP_IOV_ETH].iov_base = &tcp6_eth_src;
|
||||
iov = tcp6_l2_flags_iov[tcp6_l2_flags_buf_used++];
|
||||
dup_iov = tcp6_l2_flags_iov[tcp6_l2_flags_buf_used];
|
||||
}
|
||||
|
||||
payload = iov[TCP_IOV_PAYLOAD].iov_base;
|
||||
seq = conn->seq_to_tap;
|
||||
ret = tcp_prepare_flags(c, conn, flags, &payload->th,
|
||||
(struct tcp_syn_opts *)&payload->data, &optlen);
|
||||
th = &payload->th;
|
||||
data = payload->opts;
|
||||
|
||||
ret = tcp_fill_flag_header(c, conn, flags, th, data, &optlen);
|
||||
if (ret <= 0)
|
||||
return ret;
|
||||
|
||||
tcp_payload_used++;
|
||||
l4len = tcp_l2_buf_fill_headers(conn, iov, optlen, NULL, seq, false);
|
||||
iov[TCP_IOV_PAYLOAD].iov_len = l4len;
|
||||
if (flags & DUP_ACK) {
|
||||
struct iovec *dup_iov = tcp_l2_iov[tcp_payload_used++];
|
||||
if (CONN_V4(conn)) {
|
||||
struct iphdr *iph = iov[TCP_IOV_IP].iov_base;
|
||||
|
||||
memcpy(dup_iov[TCP_IOV_TAP].iov_base, iov[TCP_IOV_TAP].iov_base,
|
||||
iov[TCP_IOV_TAP].iov_len);
|
||||
dup_iov[TCP_IOV_ETH].iov_base = iov[TCP_IOV_ETH].iov_base;
|
||||
dup_iov[TCP_IOV_IP] = iov[TCP_IOV_IP];
|
||||
memcpy(dup_iov[TCP_IOV_PAYLOAD].iov_base,
|
||||
iov[TCP_IOV_PAYLOAD].iov_base, l4len);
|
||||
dup_iov[TCP_IOV_PAYLOAD].iov_len = l4len;
|
||||
ip_len = tcp_fill_headers4(c, conn, iph, th, optlen, NULL,
|
||||
conn->seq_to_tap);
|
||||
} else {
|
||||
struct ipv6hdr *ip6h = iov[TCP_IOV_IP].iov_base;
|
||||
|
||||
ip_len = tcp_fill_headers6(c, conn, ip6h, th, optlen,
|
||||
conn->seq_to_tap);
|
||||
}
|
||||
iov[TCP_IOV_PAYLOAD].iov_len = ip_len;
|
||||
|
||||
if (flags & DUP_ACK) {
|
||||
int i;
|
||||
for (i = 0; i < TCP_IOV_NUM; i++) {
|
||||
memcpy(dup_iov[i].iov_base, iov[i].iov_base,
|
||||
iov[i].iov_len);
|
||||
dup_iov[i].iov_len = iov[i].iov_len;
|
||||
}
|
||||
}
|
||||
|
||||
if (tcp_payload_used > TCP_FRAMES_MEM - 2)
|
||||
tcp_payload_flush(c);
|
||||
if (CONN_V4(conn)) {
|
||||
if (flags & DUP_ACK)
|
||||
tcp4_l2_flags_buf_used++;
|
||||
|
||||
if (tcp4_l2_flags_buf_used > TCP_FRAMES_MEM - 2)
|
||||
tcp_buf_l2_flags_flush(c);
|
||||
} else {
|
||||
if (flags & DUP_ACK)
|
||||
tcp6_l2_flags_buf_used++;
|
||||
|
||||
if (tcp6_l2_flags_buf_used > TCP_FRAMES_MEM - 2)
|
||||
tcp_buf_l2_flags_flush(c);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
@ -205,43 +318,49 @@ int tcp_buf_send_flag(const struct ctx *c, struct tcp_tap_conn *conn, int flags)
|
|||
* tcp_data_to_tap() - Finalise (queue) highest-numbered scatter-gather buffer
|
||||
* @c: Execution context
|
||||
* @conn: Connection pointer
|
||||
* @dlen: TCP payload length
|
||||
* @plen: Payload length at L4
|
||||
* @no_csum: Don't compute IPv4 checksum, use the one from previous buffer
|
||||
* @seq: Sequence number to be sent
|
||||
*/
|
||||
static void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn,
|
||||
ssize_t dlen, int no_csum, uint32_t seq)
|
||||
ssize_t plen, int no_csum, uint32_t seq)
|
||||
{
|
||||
struct tcp_payload_t *payload;
|
||||
const uint16_t *check = NULL;
|
||||
uint32_t *seq_update = &conn->seq_to_tap;
|
||||
struct iovec *iov;
|
||||
size_t l4len;
|
||||
|
||||
conn->seq_to_tap = seq + dlen;
|
||||
tcp_frame_conns[tcp_payload_used] = conn;
|
||||
iov = tcp_l2_iov[tcp_payload_used];
|
||||
if (CONN_V4(conn)) {
|
||||
if (no_csum) {
|
||||
struct iovec *iov_prev = tcp_l2_iov[tcp_payload_used - 1];
|
||||
struct iphdr *iph = iov_prev[TCP_IOV_IP].iov_base;
|
||||
struct iovec *iov_prev = tcp4_l2_iov[tcp4_l2_buf_used - 1];
|
||||
const uint16_t *check = NULL;
|
||||
|
||||
if (no_csum) {
|
||||
struct iphdr *iph = iov_prev[TCP_IOV_IP].iov_base;
|
||||
check = &iph->check;
|
||||
}
|
||||
iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp4_payload_ip[tcp_payload_used]);
|
||||
iov[TCP_IOV_ETH].iov_base = &tcp4_eth_src;
|
||||
|
||||
tcp4_l2_buf_seq_update[tcp4_l2_buf_used].seq = seq_update;
|
||||
tcp4_l2_buf_seq_update[tcp4_l2_buf_used].len = plen;
|
||||
|
||||
iov = tcp4_l2_iov[tcp4_l2_buf_used++];
|
||||
iov[TCP_IOV_PAYLOAD].iov_len = tcp_fill_headers4(c, conn,
|
||||
iov[TCP_IOV_IP].iov_base,
|
||||
iov[TCP_IOV_PAYLOAD].iov_base,
|
||||
plen, check, seq);
|
||||
|
||||
if (tcp4_l2_buf_used > TCP_FRAMES_MEM - 1)
|
||||
tcp_buf_l2_data_flush(c);
|
||||
} else if (CONN_V6(conn)) {
|
||||
iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp6_payload_ip[tcp_payload_used]);
|
||||
iov[TCP_IOV_ETH].iov_base = &tcp6_eth_src;
|
||||
tcp6_l2_buf_seq_update[tcp6_l2_buf_used].seq = seq_update;
|
||||
tcp6_l2_buf_seq_update[tcp6_l2_buf_used].len = plen;
|
||||
|
||||
iov = tcp6_l2_iov[tcp6_l2_buf_used++];
|
||||
iov[TCP_IOV_PAYLOAD].iov_len = tcp_fill_headers6(c, conn,
|
||||
iov[TCP_IOV_IP].iov_base,
|
||||
iov[TCP_IOV_PAYLOAD].iov_base,
|
||||
plen, seq);
|
||||
|
||||
if (tcp6_l2_buf_used > TCP_FRAMES_MEM - 1)
|
||||
tcp_buf_l2_data_flush(c);
|
||||
}
|
||||
payload = iov[TCP_IOV_PAYLOAD].iov_base;
|
||||
payload->th.th_off = sizeof(struct tcphdr) / 4;
|
||||
payload->th.th_x2 = 0;
|
||||
payload->th.th_flags = 0;
|
||||
payload->th.ack = 1;
|
||||
l4len = tcp_l2_buf_fill_headers(conn, iov, dlen, check, seq, false);
|
||||
iov[TCP_IOV_PAYLOAD].iov_len = l4len;
|
||||
if (++tcp_payload_used > TCP_FRAMES_MEM - 1)
|
||||
tcp_payload_flush(c);
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -253,17 +372,17 @@ static void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn,
|
|||
*
|
||||
* #syscalls recvmsg
|
||||
*/
|
||||
int tcp_buf_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn)
|
||||
int tcp_buf_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn)
|
||||
{
|
||||
uint32_t wnd_scaled = conn->wnd_from_tap << conn->ws_from_tap;
|
||||
int fill_bufs, send_bufs = 0, last_len, iov_rem = 0;
|
||||
int len, dlen, i, s = conn->sock;
|
||||
int sendlen, len, plen, v4 = CONN_V4(conn);
|
||||
int s = conn->sock, i, ret = 0;
|
||||
struct msghdr mh_sock = { 0 };
|
||||
uint16_t mss = MSS_GET(conn);
|
||||
uint32_t already_sent, seq;
|
||||
struct iovec *iov;
|
||||
|
||||
/* How much have we read/sent since the last received ACK? */
|
||||
already_sent = conn->seq_to_tap - conn->seq_ack_from_tap;
|
||||
|
||||
if (SEQ_LT(already_sent, 0)) {
|
||||
|
@ -272,10 +391,6 @@ int tcp_buf_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn)
|
|||
conn->seq_ack_from_tap, conn->seq_to_tap);
|
||||
conn->seq_to_tap = conn->seq_ack_from_tap;
|
||||
already_sent = 0;
|
||||
if (tcp_set_peek_offset(s, 0)) {
|
||||
tcp_rst(c, conn);
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
if (!wnd_scaled || already_sent >= wnd_scaled) {
|
||||
|
@ -293,26 +408,25 @@ int tcp_buf_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn)
|
|||
iov_rem = (wnd_scaled - already_sent) % mss;
|
||||
}
|
||||
|
||||
/* Prepare iov according to kernel capability */
|
||||
if (!peek_offset_cap) {
|
||||
mh_sock.msg_iov = iov_sock;
|
||||
iov_sock[0].iov_base = tcp_buf_discard;
|
||||
iov_sock[0].iov_len = already_sent;
|
||||
mh_sock.msg_iovlen = fill_bufs + 1;
|
||||
} else {
|
||||
mh_sock.msg_iov = &iov_sock[1];
|
||||
mh_sock.msg_iovlen = fill_bufs;
|
||||
}
|
||||
mh_sock.msg_iov = iov_sock;
|
||||
mh_sock.msg_iovlen = fill_bufs + 1;
|
||||
|
||||
if (tcp_payload_used + fill_bufs > TCP_FRAMES_MEM) {
|
||||
tcp_payload_flush(c);
|
||||
iov_sock[0].iov_base = tcp_buf_discard;
|
||||
iov_sock[0].iov_len = already_sent;
|
||||
|
||||
if (( v4 && tcp4_l2_buf_used + fill_bufs > TCP_FRAMES_MEM) ||
|
||||
(!v4 && tcp6_l2_buf_used + fill_bufs > TCP_FRAMES_MEM)) {
|
||||
tcp_buf_l2_data_flush(c);
|
||||
|
||||
/* Silence Coverity CWE-125 false positive */
|
||||
tcp_payload_used = 0;
|
||||
tcp4_l2_buf_used = tcp6_l2_buf_used = 0;
|
||||
}
|
||||
|
||||
for (i = 0, iov = iov_sock + 1; i < fill_bufs; i++, iov++) {
|
||||
iov->iov_base = &tcp_payload[tcp_payload_used + i].data;
|
||||
if (v4)
|
||||
iov->iov_base = &tcp4_l2_payload[tcp4_l2_buf_used + i].data;
|
||||
else
|
||||
iov->iov_base = &tcp6_l2_payload[tcp6_l2_buf_used + i].data;
|
||||
iov->iov_len = mss;
|
||||
}
|
||||
if (iov_rem)
|
||||
|
@ -323,19 +437,12 @@ int tcp_buf_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn)
|
|||
len = recvmsg(s, &mh_sock, MSG_PEEK);
|
||||
while (len < 0 && errno == EINTR);
|
||||
|
||||
if (len < 0) {
|
||||
if (errno != EAGAIN && errno != EWOULDBLOCK) {
|
||||
tcp_rst(c, conn);
|
||||
return -errno;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
if (len < 0)
|
||||
goto err;
|
||||
|
||||
if (!len) {
|
||||
if ((conn->events & (SOCK_FIN_RCVD | TAP_FIN_SENT)) == SOCK_FIN_RCVD) {
|
||||
int ret = tcp_buf_send_flag(c, conn, FIN | ACK);
|
||||
if (ret) {
|
||||
if ((ret = tcp_buf_send_flag(c, conn, FIN | ACK))) {
|
||||
tcp_rst(c, conn);
|
||||
return ret;
|
||||
}
|
||||
|
@ -346,36 +453,42 @@ int tcp_buf_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn)
|
|||
return 0;
|
||||
}
|
||||
|
||||
if (!peek_offset_cap)
|
||||
len -= already_sent;
|
||||
|
||||
if (len <= 0) {
|
||||
sendlen = len - already_sent;
|
||||
if (sendlen <= 0) {
|
||||
conn_flag(c, conn, STALLED);
|
||||
return 0;
|
||||
}
|
||||
|
||||
conn_flag(c, conn, ~STALLED);
|
||||
|
||||
send_bufs = DIV_ROUND_UP(len, mss);
|
||||
last_len = len - (send_bufs - 1) * mss;
|
||||
send_bufs = DIV_ROUND_UP(sendlen, mss);
|
||||
last_len = sendlen - (send_bufs - 1) * mss;
|
||||
|
||||
/* Likely, some new data was acked too. */
|
||||
tcp_update_seqack_wnd(c, conn, false, NULL);
|
||||
tcp_update_seqack_wnd(c, conn, 0, NULL);
|
||||
|
||||
/* Finally, queue to tap */
|
||||
dlen = mss;
|
||||
plen = mss;
|
||||
seq = conn->seq_to_tap;
|
||||
for (i = 0; i < send_bufs; i++) {
|
||||
int no_csum = i && i != send_bufs - 1 && tcp_payload_used;
|
||||
int no_csum = i && i != send_bufs - 1 && tcp4_l2_buf_used;
|
||||
|
||||
if (i == send_bufs - 1)
|
||||
dlen = last_len;
|
||||
plen = last_len;
|
||||
|
||||
tcp_data_to_tap(c, conn, dlen, no_csum, seq);
|
||||
seq += dlen;
|
||||
tcp_data_to_tap(c, conn, plen, no_csum, seq);
|
||||
seq += plen;
|
||||
}
|
||||
|
||||
conn_flag(c, conn, ACK_FROM_TAP_DUE);
|
||||
|
||||
return 0;
|
||||
|
||||
err:
|
||||
if (errno != EAGAIN && errno != EWOULDBLOCK) {
|
||||
ret = -errno;
|
||||
tcp_rst(c, conn);
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
|
11
tcp_buf.h
11
tcp_buf.h
|
@ -6,9 +6,12 @@
|
|||
#ifndef TCP_BUF_H
|
||||
#define TCP_BUF_H
|
||||
|
||||
void tcp_sock_iov_init(const struct ctx *c);
|
||||
void tcp_payload_flush(const struct ctx *c);
|
||||
int tcp_buf_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn);
|
||||
int tcp_buf_send_flag(const struct ctx *c, struct tcp_tap_conn *conn, int flags);
|
||||
void tcp_buf_sock4_iov_init(const struct ctx *c);
|
||||
void tcp_buf_sock6_iov_init(const struct ctx *c);
|
||||
void tcp_buf_l2_flags_flush(const struct ctx *c);
|
||||
void tcp_buf_l2_data_flush(const struct ctx *c);
|
||||
uint16_t tcp_buf_conn_tap_mss(const struct tcp_tap_conn *conn);
|
||||
int tcp_buf_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn);
|
||||
int tcp_buf_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags);
|
||||
|
||||
#endif /* TCP_BUF_H */
|
||||
|
|
65
tcp_conn.h
65
tcp_conn.h
|
@ -13,16 +13,19 @@
|
|||
* struct tcp_tap_conn - Descriptor for a TCP connection (not spliced)
|
||||
* @f: Generic flow information
|
||||
* @in_epoll: Is the connection in the epoll set?
|
||||
* @retrans: Number of retransmissions occurred due to ACK_TIMEOUT
|
||||
* @ws_from_tap: Window scaling factor advertised from tap/guest
|
||||
* @ws_to_tap: Window scaling factor advertised to tap/guest
|
||||
* @tap_mss: MSS advertised by tap/guest, rounded to 2 ^ TCP_MSS_BITS
|
||||
* @sock: Socket descriptor number
|
||||
* @events: Connection events, implying connection states
|
||||
* @timer: timerfd descriptor for timeout events
|
||||
* @flags: Connection flags representing internal attributes
|
||||
* @retrans: Number of retransmissions occurred due to ACK_TIMEOUT
|
||||
* @ws_from_tap: Window scaling factor advertised from tap/guest
|
||||
* @ws_to_tap: Window scaling factor advertised to tap/guest
|
||||
* @sndbuf: Sending buffer in kernel, rounded to 2 ^ SNDBUF_BITS
|
||||
* @seq_dup_ack_approx: Last duplicate ACK number sent to tap
|
||||
* @faddr: Guest side forwarding address (guest's remote address)
|
||||
* @eport: Guest side endpoint port (guest's local port)
|
||||
* @fport: Guest side forwarding port (guest's remote port)
|
||||
* @wnd_from_tap: Last window size from tap, unscaled (as received)
|
||||
* @wnd_to_tap: Sending window advertised to tap, unscaled (as sent)
|
||||
* @seq_to_tap: Next sequence for packets to tap
|
||||
|
@ -46,10 +49,6 @@ struct tcp_tap_conn {
|
|||
unsigned int ws_from_tap :TCP_WS_BITS;
|
||||
unsigned int ws_to_tap :TCP_WS_BITS;
|
||||
|
||||
#define TCP_MSS_BITS 14
|
||||
unsigned int tap_mss :TCP_MSS_BITS;
|
||||
#define MSS_SET(conn, mss) (conn->tap_mss = (mss >> (16 - TCP_MSS_BITS)))
|
||||
#define MSS_GET(conn) (conn->tap_mss << (16 - TCP_MSS_BITS))
|
||||
|
||||
int sock :FD_REF_BITS;
|
||||
|
||||
|
@ -78,6 +77,13 @@ struct tcp_tap_conn {
|
|||
#define ACK_TO_TAP_DUE BIT(3)
|
||||
#define ACK_FROM_TAP_DUE BIT(4)
|
||||
|
||||
|
||||
#define TCP_MSS_BITS 14
|
||||
unsigned int tap_mss :TCP_MSS_BITS;
|
||||
#define MSS_SET(conn, mss) (conn->tap_mss = (mss >> (16 - TCP_MSS_BITS)))
|
||||
#define MSS_GET(conn) (conn->tap_mss << (16 - TCP_MSS_BITS))
|
||||
|
||||
|
||||
#define SNDBUF_BITS 24
|
||||
unsigned int sndbuf :SNDBUF_BITS;
|
||||
#define SNDBUF_SET(conn, bytes) (conn->sndbuf = ((bytes) >> (32 - SNDBUF_BITS)))
|
||||
|
@ -85,6 +91,11 @@ struct tcp_tap_conn {
|
|||
|
||||
uint8_t seq_dup_ack_approx;
|
||||
|
||||
|
||||
union inany_addr faddr;
|
||||
in_port_t eport;
|
||||
in_port_t fport;
|
||||
|
||||
uint16_t wnd_from_tap;
|
||||
uint16_t wnd_to_tap;
|
||||
|
||||
|
@ -95,41 +106,47 @@ struct tcp_tap_conn {
|
|||
uint32_t seq_init_from_tap;
|
||||
};
|
||||
|
||||
#define SIDES 2
|
||||
/**
|
||||
* struct tcp_splice_conn - Descriptor for a spliced TCP connection
|
||||
* @f: Generic flow information
|
||||
* @in_epoll: Is the connection in the epoll set?
|
||||
* @s: File descriptor for sockets
|
||||
* @pipe: File descriptors for pipes
|
||||
* @read: Bytes read (not fully written to other side in one shot)
|
||||
* @written: Bytes written (not fully written from one other side read)
|
||||
* @events: Events observed/actions performed on connection
|
||||
* @flags: Connection flags (attributes, not events)
|
||||
* @in_epoll: Is the connection in the epoll set?
|
||||
*/
|
||||
* @read: Bytes read (not fully written to other side in one shot)
|
||||
* @written: Bytes written (not fully written from one other side read)
|
||||
*/
|
||||
struct tcp_splice_conn {
|
||||
/* Must be first element */
|
||||
struct flow_common f;
|
||||
|
||||
bool in_epoll :1;
|
||||
int s[SIDES];
|
||||
int pipe[SIDES][2];
|
||||
|
||||
uint32_t read[SIDES];
|
||||
uint32_t written[SIDES];
|
||||
|
||||
uint8_t events;
|
||||
#define SPLICE_CLOSED 0
|
||||
#define SPLICE_CONNECT BIT(0)
|
||||
#define SPLICE_ESTABLISHED BIT(1)
|
||||
#define OUT_WAIT(sidei_) ((sidei_) ? BIT(3) : BIT(2))
|
||||
#define FIN_RCVD(sidei_) ((sidei_) ? BIT(5) : BIT(4))
|
||||
#define FIN_SENT(sidei_) ((sidei_) ? BIT(7) : BIT(6))
|
||||
#define OUT_WAIT_0 BIT(2)
|
||||
#define OUT_WAIT_1 BIT(3)
|
||||
#define FIN_RCVD_0 BIT(4)
|
||||
#define FIN_RCVD_1 BIT(5)
|
||||
#define FIN_SENT_0 BIT(6)
|
||||
#define FIN_SENT_1 BIT(7)
|
||||
|
||||
uint8_t flags;
|
||||
#define RCVLOWAT_SET(sidei_) ((sidei_) ? BIT(1) : BIT(0))
|
||||
#define RCVLOWAT_ACT(sidei_) ((sidei_) ? BIT(3) : BIT(2))
|
||||
#define CLOSING BIT(4)
|
||||
#define SPLICE_V6 BIT(0)
|
||||
#define RCVLOWAT_SET_0 BIT(1)
|
||||
#define RCVLOWAT_SET_1 BIT(2)
|
||||
#define RCVLOWAT_ACT_0 BIT(3)
|
||||
#define RCVLOWAT_ACT_1 BIT(4)
|
||||
#define CLOSING BIT(5)
|
||||
|
||||
bool in_epoll :1;
|
||||
uint32_t read[SIDES];
|
||||
uint32_t written[SIDES];
|
||||
};
|
||||
|
||||
/* Socket pools */
|
||||
|
@ -138,9 +155,9 @@ struct tcp_splice_conn {
|
|||
extern int init_sock_pool4 [TCP_SOCK_POOL_SIZE];
|
||||
extern int init_sock_pool6 [TCP_SOCK_POOL_SIZE];
|
||||
|
||||
bool tcp_flow_defer(const struct tcp_tap_conn *conn);
|
||||
bool tcp_splice_flow_defer(struct tcp_splice_conn *conn);
|
||||
void tcp_splice_timer(const struct ctx *c, struct tcp_splice_conn *conn);
|
||||
bool tcp_flow_defer(union flow *flow);
|
||||
bool tcp_splice_flow_defer(union flow *flow);
|
||||
void tcp_splice_timer(const struct ctx *c, union flow *flow);
|
||||
int tcp_conn_pool_sock(int pool[]);
|
||||
int tcp_conn_sock(const struct ctx *c, sa_family_t af);
|
||||
int tcp_sock_refill_pool(const struct ctx *c, int pool[], sa_family_t af);
|
||||
|
|
134
tcp_internal.h
134
tcp_internal.h
|
@ -8,15 +8,7 @@
|
|||
|
||||
#define MAX_WS 8
|
||||
#define MAX_WINDOW (1 << (16 + (MAX_WS)))
|
||||
|
||||
#define MSS4 ROUND_DOWN(IP_MAX_MTU - \
|
||||
sizeof(struct tcphdr) - \
|
||||
sizeof(struct iphdr), \
|
||||
sizeof(uint32_t))
|
||||
#define MSS6 ROUND_DOWN(IP_MAX_MTU - \
|
||||
sizeof(struct tcphdr) - \
|
||||
sizeof(struct ipv6hdr), \
|
||||
sizeof(uint32_t))
|
||||
#define MSS (USHRT_MAX - sizeof(struct tcphdr))
|
||||
|
||||
#define SEQ_LE(a, b) ((b) - (a) < MAX_WINDOW)
|
||||
#define SEQ_LT(a, b) ((b) - (a) - 1 < MAX_WINDOW)
|
||||
|
@ -33,108 +25,17 @@
|
|||
#define OPT_EOL 0
|
||||
#define OPT_NOP 1
|
||||
#define OPT_MSS 2
|
||||
#define OPT_MSS_LEN 4
|
||||
#define OPT_WS 3
|
||||
#define OPT_WS_LEN 3
|
||||
#define OPT_SACKP 4
|
||||
#define OPT_SACK 5
|
||||
#define OPT_TS 8
|
||||
|
||||
#define TAPSIDE(conn_) ((conn_)->f.pif[1] == PIF_TAP)
|
||||
#define TAPFLOW(conn_) (&((conn_)->f.side[TAPSIDE(conn_)]))
|
||||
#define TAP_SIDX(conn_) (FLOW_SIDX((conn_), TAPSIDE(conn_)))
|
||||
|
||||
#define CONN_V4(conn) (!!inany_v4(&TAPFLOW(conn)->oaddr))
|
||||
#define CONN_V4(conn) (!!inany_v4(&(conn)->faddr))
|
||||
#define CONN_V6(conn) (!CONN_V4(conn))
|
||||
|
||||
/*
|
||||
* enum tcp_iov_parts - I/O vector parts for one TCP frame
|
||||
* @TCP_IOV_TAP tap backend specific header
|
||||
* @TCP_IOV_ETH Ethernet header
|
||||
* @TCP_IOV_IP IP (v4/v6) header
|
||||
* @TCP_IOV_PAYLOAD IP payload (TCP header + data)
|
||||
* @TCP_NUM_IOVS the number of entries in the iovec array
|
||||
*/
|
||||
enum tcp_iov_parts {
|
||||
TCP_IOV_TAP = 0,
|
||||
TCP_IOV_ETH = 1,
|
||||
TCP_IOV_IP = 2,
|
||||
TCP_IOV_PAYLOAD = 3,
|
||||
TCP_NUM_IOVS
|
||||
};
|
||||
|
||||
/**
|
||||
* struct tcp_payload_t - TCP header and data to send segments with payload
|
||||
* @th: TCP header
|
||||
* @data: TCP data
|
||||
*/
|
||||
struct tcp_payload_t {
|
||||
struct tcphdr th;
|
||||
uint8_t data[IP_MAX_MTU - sizeof(struct tcphdr)];
|
||||
#ifdef __AVX2__
|
||||
} __attribute__ ((packed, aligned(32))); /* For AVX2 checksum routines */
|
||||
#else
|
||||
} __attribute__ ((packed, aligned(__alignof__(unsigned int))));
|
||||
#endif
|
||||
|
||||
/** struct tcp_opt_nop - TCP NOP option
|
||||
* @kind: Option kind (OPT_NOP = 1)
|
||||
*/
|
||||
struct tcp_opt_nop {
|
||||
uint8_t kind;
|
||||
} __attribute__ ((packed));
|
||||
#define TCP_OPT_NOP ((struct tcp_opt_nop){ .kind = OPT_NOP, })
|
||||
|
||||
/** struct tcp_opt_mss - TCP MSS option
|
||||
* @kind: Option kind (OPT_MSS == 2)
|
||||
* @len: Option length (4)
|
||||
* @mss: Maximum Segment Size
|
||||
*/
|
||||
struct tcp_opt_mss {
|
||||
uint8_t kind;
|
||||
uint8_t len;
|
||||
uint16_t mss;
|
||||
} __attribute__ ((packed));
|
||||
#define TCP_OPT_MSS(mss_) \
|
||||
((struct tcp_opt_mss) { \
|
||||
.kind = OPT_MSS, \
|
||||
.len = sizeof(struct tcp_opt_mss), \
|
||||
.mss = htons(mss_), \
|
||||
})
|
||||
|
||||
/** struct tcp_opt_ws - TCP Window Scaling option
|
||||
* @kind: Option kind (OPT_WS == 3)
|
||||
* @len: Option length (3)
|
||||
* @shift: Window scaling shift
|
||||
*/
|
||||
struct tcp_opt_ws {
|
||||
uint8_t kind;
|
||||
uint8_t len;
|
||||
uint8_t shift;
|
||||
} __attribute__ ((packed));
|
||||
#define TCP_OPT_WS(shift_) \
|
||||
((struct tcp_opt_ws) { \
|
||||
.kind = OPT_WS, \
|
||||
.len = sizeof(struct tcp_opt_ws), \
|
||||
.shift = (shift_), \
|
||||
})
|
||||
|
||||
/** struct tcp_syn_opts - TCP options we apply to SYN packets
|
||||
* @mss: Maximum Segment Size (MSS) option
|
||||
* @nop: NOP opt (for alignment)
|
||||
* @ws: Window Scaling (WS) option
|
||||
*/
|
||||
struct tcp_syn_opts {
|
||||
struct tcp_opt_mss mss;
|
||||
struct tcp_opt_nop nop;
|
||||
struct tcp_opt_ws ws;
|
||||
} __attribute__ ((packed));
|
||||
#define TCP_SYN_OPTS(mss_, ws_) \
|
||||
((struct tcp_syn_opts){ \
|
||||
.mss = TCP_OPT_MSS(mss_), \
|
||||
.nop = TCP_OPT_NOP, \
|
||||
.ws = TCP_OPT_WS(ws_), \
|
||||
})
|
||||
|
||||
extern char tcp_buf_discard [MAX_WINDOW];
|
||||
extern char tcp_buf_discard[MAX_WINDOW];
|
||||
|
||||
void conn_flag_do(const struct ctx *c, struct tcp_tap_conn *conn,
|
||||
unsigned long flag);
|
||||
|
@ -153,23 +54,28 @@ void conn_event_do(const struct ctx *c, struct tcp_tap_conn *conn,
|
|||
conn_event_do(c, conn, event); \
|
||||
} while (0)
|
||||
|
||||
void tcp_rst_do(const struct ctx *c, struct tcp_tap_conn *conn);
|
||||
void tcp_rst_do(struct ctx *c, struct tcp_tap_conn *conn);
|
||||
#define tcp_rst(c, conn) \
|
||||
do { \
|
||||
flow_dbg((conn), "TCP reset at %s:%i", __func__, __LINE__); \
|
||||
tcp_rst_do(c, conn); \
|
||||
} while (0)
|
||||
|
||||
struct tcp_info_linux;
|
||||
|
||||
size_t tcp_l2_buf_fill_headers(const struct tcp_tap_conn *conn,
|
||||
struct iovec *iov, size_t dlen,
|
||||
const uint16_t *check, uint32_t seq,
|
||||
bool no_tcp_csum);
|
||||
|
||||
size_t tcp_fill_headers4(const struct ctx *c,
|
||||
const struct tcp_tap_conn *conn,
|
||||
struct iphdr *iph, struct tcphdr *th,
|
||||
size_t plen, const uint16_t *check,
|
||||
uint32_t seq);
|
||||
size_t tcp_fill_headers6(const struct ctx *c,
|
||||
const struct tcp_tap_conn *conn,
|
||||
struct ipv6hdr *ip6h, struct tcphdr *th,
|
||||
size_t plen, uint32_t seq);
|
||||
|
||||
int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_tap_conn *conn,
|
||||
bool force_seq, struct tcp_info_linux *tinfo);
|
||||
int tcp_prepare_flags(const struct ctx *c, struct tcp_tap_conn *conn,
|
||||
int flags, struct tcphdr *th, struct tcp_syn_opts *opts,
|
||||
size_t *optlen);
|
||||
int force_seq, struct tcp_info *tinfo);
|
||||
int tcp_fill_flag_header(struct ctx *c, struct tcp_tap_conn *conn, int flags,
|
||||
struct tcphdr *th, char *opts, size_t *optlen);
|
||||
|
||||
#endif /* TCP_INTERNAL_H */
|
||||
|
|
299
tcp_splice.c
299
tcp_splice.c
|
@ -28,7 +28,7 @@
|
|||
* - FIN_SENT_0: FIN (write shutdown) sent to accepted socket
|
||||
* - FIN_SENT_1: FIN (write shutdown) sent to target socket
|
||||
*
|
||||
* #syscalls:pasta pipe2|pipe fcntl arm:fcntl64 ppc64:fcntl64 i686:fcntl64
|
||||
* #syscalls:pasta pipe2|pipe fcntl armv6l:fcntl64 armv7l:fcntl64 ppc64:fcntl64
|
||||
*/
|
||||
|
||||
#include <sched.h>
|
||||
|
@ -73,7 +73,10 @@ static int ns_sock_pool6 [TCP_SOCK_POOL_SIZE];
|
|||
/* Pool of pre-opened pipes */
|
||||
static int splice_pipe_pool [TCP_SPLICE_PIPE_POOL_SIZE][2];
|
||||
|
||||
#define CONN_HAS(conn, set) (((conn)->events & (set)) == (set))
|
||||
#define CONN_V6(x) (x->flags & SPLICE_V6)
|
||||
#define CONN_V4(x) (!CONN_V6(x))
|
||||
#define CONN_HAS(conn, set) ((conn->events & (set)) == (set))
|
||||
#define CONN(idx) (&FLOW(idx)->tcp_splice)
|
||||
|
||||
/* Display strings for connection events */
|
||||
static const char *tcp_splice_event_str[] __attribute((__unused__)) = {
|
||||
|
@ -91,24 +94,6 @@ static const char *tcp_splice_flag_str[] __attribute((__unused__)) = {
|
|||
static int tcp_sock_refill_ns(void *arg);
|
||||
static int tcp_conn_sock_ns(const struct ctx *c, sa_family_t af);
|
||||
|
||||
/**
|
||||
* conn_at_sidx() - Get spliced TCP connection specific flow at given sidx
|
||||
* @sidx: Flow and side to retrieve
|
||||
*
|
||||
* Return: Spliced TCP connection at @sidx, or NULL if @sidx is invalid.
|
||||
* Asserts if the flow at @sidx is not FLOW_TCP_SPLICE.
|
||||
*/
|
||||
static struct tcp_splice_conn *conn_at_sidx(flow_sidx_t sidx)
|
||||
{
|
||||
union flow *flow = flow_at_sidx(sidx);
|
||||
|
||||
if (!flow)
|
||||
return NULL;
|
||||
|
||||
ASSERT(flow->f.type == FLOW_TCP_SPLICE);
|
||||
return &flow->tcp_splice;
|
||||
}
|
||||
|
||||
/**
|
||||
* tcp_splice_conn_epoll_events() - epoll events masks for given state
|
||||
* @events: Connection event flags
|
||||
|
@ -117,22 +102,19 @@ static struct tcp_splice_conn *conn_at_sidx(flow_sidx_t sidx)
|
|||
static void tcp_splice_conn_epoll_events(uint16_t events,
|
||||
struct epoll_event ev[])
|
||||
{
|
||||
unsigned sidei;
|
||||
|
||||
flow_foreach_sidei(sidei)
|
||||
ev[sidei].events = 0;
|
||||
ev[0].events = ev[1].events = 0;
|
||||
|
||||
if (events & SPLICE_ESTABLISHED) {
|
||||
flow_foreach_sidei(sidei) {
|
||||
if (!(events & FIN_SENT(!sidei)))
|
||||
ev[sidei].events = EPOLLIN | EPOLLRDHUP;
|
||||
}
|
||||
if (!(events & FIN_SENT_1))
|
||||
ev[0].events = EPOLLIN | EPOLLRDHUP;
|
||||
if (!(events & FIN_SENT_0))
|
||||
ev[1].events = EPOLLIN | EPOLLRDHUP;
|
||||
} else if (events & SPLICE_CONNECT) {
|
||||
ev[1].events = EPOLLOUT;
|
||||
}
|
||||
|
||||
flow_foreach_sidei(sidei)
|
||||
ev[sidei].events |= (events & OUT_WAIT(sidei)) ? EPOLLOUT : 0;
|
||||
ev[0].events |= (events & OUT_WAIT_0) ? EPOLLOUT : 0;
|
||||
ev[1].events |= (events & OUT_WAIT_1) ? EPOLLOUT : 0;
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -253,31 +235,32 @@ static void conn_event_do(const struct ctx *c, struct tcp_splice_conn *conn,
|
|||
|
||||
/**
|
||||
* tcp_splice_flow_defer() - Deferred per-flow handling (clean up closed)
|
||||
* @conn: Connection entry to handle
|
||||
* @flow: Flow table entry for this connection
|
||||
*
|
||||
* Return: true if the flow is ready to free, false otherwise
|
||||
*/
|
||||
bool tcp_splice_flow_defer(struct tcp_splice_conn *conn)
|
||||
bool tcp_splice_flow_defer(union flow *flow)
|
||||
{
|
||||
unsigned sidei;
|
||||
struct tcp_splice_conn *conn = &flow->tcp_splice;
|
||||
unsigned side;
|
||||
|
||||
if (!(conn->flags & CLOSING))
|
||||
if (!(flow->tcp_splice.flags & CLOSING))
|
||||
return false;
|
||||
|
||||
flow_foreach_sidei(sidei) {
|
||||
for (side = 0; side < SIDES; side++) {
|
||||
/* Flushing might need to block: don't recycle them. */
|
||||
if (conn->pipe[sidei][0] >= 0) {
|
||||
close(conn->pipe[sidei][0]);
|
||||
close(conn->pipe[sidei][1]);
|
||||
conn->pipe[sidei][0] = conn->pipe[sidei][1] = -1;
|
||||
if (conn->pipe[side][0] >= 0) {
|
||||
close(conn->pipe[side][0]);
|
||||
close(conn->pipe[side][1]);
|
||||
conn->pipe[side][0] = conn->pipe[side][1] = -1;
|
||||
}
|
||||
|
||||
if (conn->s[sidei] >= 0) {
|
||||
close(conn->s[sidei]);
|
||||
conn->s[sidei] = -1;
|
||||
if (conn->s[side] >= 0) {
|
||||
close(conn->s[side]);
|
||||
conn->s[side] = -1;
|
||||
}
|
||||
|
||||
conn->read[sidei] = conn->written[sidei] = 0;
|
||||
conn->read[side] = conn->written[side] = 0;
|
||||
}
|
||||
|
||||
conn->events = SPLICE_CLOSED;
|
||||
|
@ -297,33 +280,33 @@ bool tcp_splice_flow_defer(struct tcp_splice_conn *conn)
|
|||
static int tcp_splice_connect_finish(const struct ctx *c,
|
||||
struct tcp_splice_conn *conn)
|
||||
{
|
||||
unsigned sidei;
|
||||
unsigned side;
|
||||
int i = 0;
|
||||
|
||||
flow_foreach_sidei(sidei) {
|
||||
for (side = 0; side < SIDES; side++) {
|
||||
for (; i < TCP_SPLICE_PIPE_POOL_SIZE; i++) {
|
||||
if (splice_pipe_pool[i][0] >= 0) {
|
||||
SWAP(conn->pipe[sidei][0],
|
||||
SWAP(conn->pipe[side][0],
|
||||
splice_pipe_pool[i][0]);
|
||||
SWAP(conn->pipe[sidei][1],
|
||||
SWAP(conn->pipe[side][1],
|
||||
splice_pipe_pool[i][1]);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (conn->pipe[sidei][0] < 0) {
|
||||
if (pipe2(conn->pipe[sidei], O_NONBLOCK | O_CLOEXEC)) {
|
||||
if (conn->pipe[side][0] < 0) {
|
||||
if (pipe2(conn->pipe[side], O_NONBLOCK | O_CLOEXEC)) {
|
||||
flow_err(conn, "cannot create %d->%d pipe: %s",
|
||||
sidei, !sidei, strerror(errno));
|
||||
side, !side, strerror(errno));
|
||||
conn_flag(c, conn, CLOSING);
|
||||
return -EIO;
|
||||
}
|
||||
|
||||
if (fcntl(conn->pipe[sidei][0], F_SETPIPE_SZ,
|
||||
c->tcp.pipe_size) != (int)c->tcp.pipe_size) {
|
||||
if (fcntl(conn->pipe[side][0], F_SETPIPE_SZ,
|
||||
c->tcp.pipe_size)) {
|
||||
flow_trace(conn,
|
||||
"cannot set %d->%d pipe size to %zu",
|
||||
sidei, !sidei, c->tcp.pipe_size);
|
||||
side, !side, c->tcp.pipe_size);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -338,20 +321,31 @@ static int tcp_splice_connect_finish(const struct ctx *c,
|
|||
* tcp_splice_connect() - Create and connect socket for new spliced connection
|
||||
* @c: Execution context
|
||||
* @conn: Connection pointer
|
||||
* @af: Address family
|
||||
* @pif: pif on which to create socket
|
||||
* @port: Destination port, host order
|
||||
*
|
||||
* Return: 0 for connect() succeeded or in progress, negative value on error
|
||||
*/
|
||||
static int tcp_splice_connect(const struct ctx *c, struct tcp_splice_conn *conn)
|
||||
static int tcp_splice_connect(const struct ctx *c, struct tcp_splice_conn *conn,
|
||||
sa_family_t af, uint8_t pif, in_port_t port)
|
||||
{
|
||||
const struct flowside *tgt = &conn->f.side[TGTSIDE];
|
||||
sa_family_t af = inany_v4(&tgt->eaddr) ? AF_INET : AF_INET6;
|
||||
uint8_t tgtpif = conn->f.pif[TGTSIDE];
|
||||
union sockaddr_inany sa;
|
||||
struct sockaddr_in6 addr6 = {
|
||||
.sin6_family = AF_INET6,
|
||||
.sin6_port = htons(port),
|
||||
.sin6_addr = IN6ADDR_LOOPBACK_INIT,
|
||||
};
|
||||
struct sockaddr_in addr4 = {
|
||||
.sin_family = AF_INET,
|
||||
.sin_port = htons(port),
|
||||
.sin_addr = IN4ADDR_LOOPBACK_INIT,
|
||||
};
|
||||
const struct sockaddr *sa;
|
||||
socklen_t sl;
|
||||
|
||||
if (tgtpif == PIF_HOST)
|
||||
if (pif == PIF_HOST)
|
||||
conn->s[1] = tcp_conn_sock(c, af);
|
||||
else if (tgtpif == PIF_SPLICE)
|
||||
else if (pif == PIF_SPLICE)
|
||||
conn->s[1] = tcp_conn_sock_ns(c, af);
|
||||
else
|
||||
ASSERT(0);
|
||||
|
@ -365,9 +359,15 @@ static int tcp_splice_connect(const struct ctx *c, struct tcp_splice_conn *conn)
|
|||
conn->s[1]);
|
||||
}
|
||||
|
||||
pif_sockaddr(c, &sa, &sl, tgtpif, &tgt->eaddr, tgt->eport);
|
||||
if (CONN_V6(conn)) {
|
||||
sa = (struct sockaddr *)&addr6;
|
||||
sl = sizeof(addr6);
|
||||
} else {
|
||||
sa = (struct sockaddr *)&addr4;
|
||||
sl = sizeof(addr4);
|
||||
}
|
||||
|
||||
if (connect(conn->s[1], &sa.sa, sl)) {
|
||||
if (connect(conn->s[1], sa, sl)) {
|
||||
if (errno != EINPROGRESS) {
|
||||
flow_trace(conn, "Couldn't connect socket for splice: %s",
|
||||
strerror(errno));
|
||||
|
@ -414,19 +414,67 @@ static int tcp_conn_sock_ns(const struct ctx *c, sa_family_t af)
|
|||
/**
|
||||
* tcp_splice_conn_from_sock() - Attempt to init state for a spliced connection
|
||||
* @c: Execution context
|
||||
* @pif0: pif id of side 0
|
||||
* @dstport: Side 0 destination port of connection
|
||||
* @flow: flow to initialise
|
||||
* @s0: Accepted (side 0) socket
|
||||
* @sa: Peer address of connection
|
||||
*
|
||||
* Return: true if able to create a spliced connection, false otherwise
|
||||
* #syscalls:pasta setsockopt
|
||||
*/
|
||||
void tcp_splice_conn_from_sock(const struct ctx *c, union flow *flow, int s0)
|
||||
bool tcp_splice_conn_from_sock(const struct ctx *c,
|
||||
uint8_t pif0, in_port_t dstport,
|
||||
union flow *flow, int s0,
|
||||
const union sockaddr_inany *sa)
|
||||
{
|
||||
struct tcp_splice_conn *conn = FLOW_SET_TYPE(flow, FLOW_TCP_SPLICE,
|
||||
tcp_splice);
|
||||
struct tcp_splice_conn *conn;
|
||||
union inany_addr src;
|
||||
in_port_t srcport;
|
||||
sa_family_t af;
|
||||
uint8_t pif1;
|
||||
|
||||
ASSERT(c->mode == MODE_PASTA);
|
||||
if (c->mode != MODE_PASTA)
|
||||
return false;
|
||||
|
||||
inany_from_sockaddr(&src, &srcport, sa);
|
||||
af = inany_v4(&src) ? AF_INET : AF_INET6;
|
||||
|
||||
switch (pif0) {
|
||||
case PIF_SPLICE:
|
||||
if (!inany_is_loopback(&src)) {
|
||||
char str[INANY_ADDRSTRLEN];
|
||||
|
||||
/* We can't use flow_err() etc. because we haven't set
|
||||
* the flow type yet
|
||||
*/
|
||||
warn("Bad source address %s for splice, closing",
|
||||
inany_ntop(&src, str, sizeof(str)));
|
||||
|
||||
/* We *don't* want to fall back to tap */
|
||||
flow_alloc_cancel(flow);
|
||||
return true;
|
||||
}
|
||||
|
||||
pif1 = PIF_HOST;
|
||||
dstport += c->tcp.fwd_out.delta[dstport];
|
||||
break;
|
||||
|
||||
case PIF_HOST:
|
||||
if (!inany_is_loopback(&src))
|
||||
return false;
|
||||
|
||||
pif1 = PIF_SPLICE;
|
||||
dstport += c->tcp.fwd_in.delta[dstport];
|
||||
break;
|
||||
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
|
||||
conn = FLOW_START(flow, FLOW_TCP_SPLICE, tcp_splice, 0);
|
||||
|
||||
conn->flags = af == AF_INET ? 0 : SPLICE_V6;
|
||||
conn->s[0] = s0;
|
||||
conn->s[1] = -1;
|
||||
conn->pipe[0][0] = conn->pipe[0][1] = -1;
|
||||
|
@ -435,10 +483,10 @@ void tcp_splice_conn_from_sock(const struct ctx *c, union flow *flow, int s0)
|
|||
if (setsockopt(s0, SOL_TCP, TCP_QUICKACK, &((int){ 1 }), sizeof(int)))
|
||||
flow_trace(conn, "failed to set TCP_QUICKACK on %i", s0);
|
||||
|
||||
if (tcp_splice_connect(c, conn))
|
||||
if (tcp_splice_connect(c, conn, af, pif1, dstport))
|
||||
conn_flag(c, conn, CLOSING);
|
||||
|
||||
FLOW_ACTIVATE(conn);
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -452,8 +500,8 @@ void tcp_splice_conn_from_sock(const struct ctx *c, union flow *flow, int s0)
|
|||
void tcp_splice_sock_handler(struct ctx *c, union epoll_ref ref,
|
||||
uint32_t events)
|
||||
{
|
||||
struct tcp_splice_conn *conn = conn_at_sidx(ref.flowside);
|
||||
unsigned evsidei = ref.flowside.sidei, fromsidei;
|
||||
struct tcp_splice_conn *conn = CONN(ref.flowside.flow);
|
||||
unsigned side = ref.flowside.side, fromside;
|
||||
uint8_t lowat_set_flag, lowat_act_flag;
|
||||
int eof, never_read;
|
||||
|
||||
|
@ -485,31 +533,30 @@ void tcp_splice_sock_handler(struct ctx *c, union epoll_ref ref,
|
|||
}
|
||||
|
||||
if (events & EPOLLOUT) {
|
||||
fromsidei = !evsidei;
|
||||
conn_event(c, conn, ~OUT_WAIT(evsidei));
|
||||
fromside = !side;
|
||||
conn_event(c, conn, side == 0 ? ~OUT_WAIT_0 : ~OUT_WAIT_1);
|
||||
} else {
|
||||
fromsidei = evsidei;
|
||||
fromside = side;
|
||||
}
|
||||
|
||||
if (events & EPOLLRDHUP)
|
||||
/* For side 0 this is fake, but implied */
|
||||
conn_event(c, conn, FIN_RCVD(evsidei));
|
||||
conn_event(c, conn, side == 0 ? FIN_RCVD_0 : FIN_RCVD_1);
|
||||
|
||||
swap:
|
||||
eof = 0;
|
||||
never_read = 1;
|
||||
|
||||
lowat_set_flag = RCVLOWAT_SET(fromsidei);
|
||||
lowat_act_flag = RCVLOWAT_ACT(fromsidei);
|
||||
lowat_set_flag = fromside == 0 ? RCVLOWAT_SET_0 : RCVLOWAT_SET_1;
|
||||
lowat_act_flag = fromside == 0 ? RCVLOWAT_ACT_0 : RCVLOWAT_ACT_1;
|
||||
|
||||
while (1) {
|
||||
ssize_t readlen, written, pending;
|
||||
ssize_t readlen, to_write = 0, written;
|
||||
int more = 0;
|
||||
|
||||
retry:
|
||||
readlen = splice(conn->s[fromsidei], NULL,
|
||||
conn->pipe[fromsidei][1], NULL,
|
||||
c->tcp.pipe_size,
|
||||
readlen = splice(conn->s[fromside], NULL,
|
||||
conn->pipe[fromside][1], NULL, c->tcp.pipe_size,
|
||||
SPLICE_F_MOVE | SPLICE_F_NONBLOCK);
|
||||
flow_trace(conn, "%zi from read-side call", readlen);
|
||||
if (readlen < 0) {
|
||||
|
@ -518,11 +565,14 @@ retry:
|
|||
|
||||
if (errno != EAGAIN)
|
||||
goto close;
|
||||
|
||||
to_write = c->tcp.pipe_size;
|
||||
} else if (!readlen) {
|
||||
eof = 1;
|
||||
to_write = c->tcp.pipe_size;
|
||||
} else {
|
||||
never_read = 0;
|
||||
|
||||
to_write += readlen;
|
||||
if (readlen >= (long)c->tcp.pipe_size * 90 / 100)
|
||||
more = SPLICE_F_MORE;
|
||||
|
||||
|
@ -531,11 +581,11 @@ retry:
|
|||
}
|
||||
|
||||
eintr:
|
||||
written = splice(conn->pipe[fromsidei][0], NULL,
|
||||
conn->s[!fromsidei], NULL, c->tcp.pipe_size,
|
||||
written = splice(conn->pipe[fromside][0], NULL,
|
||||
conn->s[!fromside], NULL, to_write,
|
||||
SPLICE_F_MOVE | more | SPLICE_F_NONBLOCK);
|
||||
flow_trace(conn, "%zi from write-side call (passed %zi)",
|
||||
written, c->tcp.pipe_size);
|
||||
written, to_write);
|
||||
|
||||
/* Most common case: skip updating counters. */
|
||||
if (readlen > 0 && readlen == written) {
|
||||
|
@ -546,23 +596,18 @@ eintr:
|
|||
readlen > (long)c->tcp.pipe_size / 10) {
|
||||
int lowat = c->tcp.pipe_size / 4;
|
||||
|
||||
if (setsockopt(conn->s[fromsidei], SOL_SOCKET,
|
||||
SO_RCVLOWAT,
|
||||
&lowat, sizeof(lowat))) {
|
||||
flow_trace(conn,
|
||||
"Setting SO_RCVLOWAT %i: %s",
|
||||
lowat, strerror(errno));
|
||||
} else {
|
||||
conn_flag(c, conn, lowat_set_flag);
|
||||
conn_flag(c, conn, lowat_act_flag);
|
||||
}
|
||||
setsockopt(conn->s[fromside], SOL_SOCKET,
|
||||
SO_RCVLOWAT, &lowat, sizeof(lowat));
|
||||
|
||||
conn_flag(c, conn, lowat_set_flag);
|
||||
conn_flag(c, conn, lowat_act_flag);
|
||||
}
|
||||
|
||||
break;
|
||||
}
|
||||
|
||||
conn->read[fromsidei] += readlen > 0 ? readlen : 0;
|
||||
conn->written[fromsidei] += written > 0 ? written : 0;
|
||||
conn->read[fromside] += readlen > 0 ? readlen : 0;
|
||||
conn->written[fromside] += written > 0 ? written : 0;
|
||||
|
||||
if (written < 0) {
|
||||
if (errno == EINTR)
|
||||
|
@ -571,43 +616,47 @@ eintr:
|
|||
if (errno != EAGAIN)
|
||||
goto close;
|
||||
|
||||
if (conn->read[fromsidei] == conn->written[fromsidei])
|
||||
if (never_read)
|
||||
break;
|
||||
|
||||
conn_event(c, conn, OUT_WAIT(!fromsidei));
|
||||
conn_event(c, conn,
|
||||
fromside == 0 ? OUT_WAIT_1 : OUT_WAIT_0);
|
||||
break;
|
||||
}
|
||||
|
||||
if (never_read && written == (long)(c->tcp.pipe_size))
|
||||
goto retry;
|
||||
|
||||
pending = conn->read[fromsidei] - conn->written[fromsidei];
|
||||
if (!never_read && written > 0 && written < pending)
|
||||
if (!never_read && written < to_write) {
|
||||
to_write -= written;
|
||||
goto retry;
|
||||
}
|
||||
|
||||
if (eof)
|
||||
break;
|
||||
}
|
||||
|
||||
if (conn->read[fromsidei] == conn->written[fromsidei] && eof) {
|
||||
unsigned sidei;
|
||||
|
||||
flow_foreach_sidei(sidei) {
|
||||
if ((conn->events & FIN_RCVD(sidei)) &&
|
||||
!(conn->events & FIN_SENT(!sidei))) {
|
||||
shutdown(conn->s[!sidei], SHUT_WR);
|
||||
conn_event(c, conn, FIN_SENT(!sidei));
|
||||
}
|
||||
if ((conn->events & FIN_RCVD_0) && !(conn->events & FIN_SENT_1)) {
|
||||
if (conn->read[fromside] == conn->written[fromside] && eof) {
|
||||
shutdown(conn->s[1], SHUT_WR);
|
||||
conn_event(c, conn, FIN_SENT_1);
|
||||
}
|
||||
}
|
||||
|
||||
if (CONN_HAS(conn, FIN_SENT(0) | FIN_SENT(1)))
|
||||
if ((conn->events & FIN_RCVD_1) && !(conn->events & FIN_SENT_0)) {
|
||||
if (conn->read[fromside] == conn->written[fromside] && eof) {
|
||||
shutdown(conn->s[0], SHUT_WR);
|
||||
conn_event(c, conn, FIN_SENT_0);
|
||||
}
|
||||
}
|
||||
|
||||
if (CONN_HAS(conn, FIN_SENT_0 | FIN_SENT_1))
|
||||
goto close;
|
||||
|
||||
if ((events & (EPOLLIN | EPOLLOUT)) == (EPOLLIN | EPOLLOUT)) {
|
||||
events = EPOLLIN;
|
||||
|
||||
fromsidei = !fromsidei;
|
||||
fromside = !fromside;
|
||||
goto swap;
|
||||
}
|
||||
|
||||
|
@ -672,7 +721,7 @@ static void tcp_splice_pipe_refill(const struct ctx *c)
|
|||
continue;
|
||||
|
||||
if (fcntl(splice_pipe_pool[i][0], F_SETPIPE_SZ,
|
||||
c->tcp.pipe_size) != (int)c->tcp.pipe_size) {
|
||||
c->tcp.pipe_size)) {
|
||||
trace("TCP (spliced): cannot set pool pipe size to %zu",
|
||||
c->tcp.pipe_size);
|
||||
}
|
||||
|
@ -685,7 +734,6 @@ static void tcp_splice_pipe_refill(const struct ctx *c)
|
|||
*
|
||||
* Return: 0
|
||||
*/
|
||||
/* cppcheck-suppress [constParameterCallback, unmatchedSuppression] */
|
||||
static int tcp_sock_refill_ns(void *arg)
|
||||
{
|
||||
const struct ctx *c = (const struct ctx *)arg;
|
||||
|
@ -738,26 +786,29 @@ void tcp_splice_init(struct ctx *c)
|
|||
/**
|
||||
* tcp_splice_timer() - Timer for spliced connections
|
||||
* @c: Execution context
|
||||
* @conn: Connection to handle
|
||||
* @flow: Flow table entry
|
||||
*/
|
||||
void tcp_splice_timer(const struct ctx *c, struct tcp_splice_conn *conn)
|
||||
void tcp_splice_timer(const struct ctx *c, union flow *flow)
|
||||
{
|
||||
unsigned sidei;
|
||||
struct tcp_splice_conn *conn = &flow->tcp_splice;
|
||||
int side;
|
||||
|
||||
ASSERT(!(conn->flags & CLOSING));
|
||||
|
||||
flow_foreach_sidei(sidei) {
|
||||
if ((conn->flags & RCVLOWAT_SET(sidei)) &&
|
||||
!(conn->flags & RCVLOWAT_ACT(sidei))) {
|
||||
if (setsockopt(conn->s[sidei], SOL_SOCKET, SO_RCVLOWAT,
|
||||
for (side = 0; side < SIDES; side++) {
|
||||
uint8_t set = side == 0 ? RCVLOWAT_SET_0 : RCVLOWAT_SET_1;
|
||||
uint8_t act = side == 0 ? RCVLOWAT_ACT_0 : RCVLOWAT_ACT_1;
|
||||
|
||||
if ((conn->flags & set) && !(conn->flags & act)) {
|
||||
if (setsockopt(conn->s[side], SOL_SOCKET, SO_RCVLOWAT,
|
||||
&((int){ 1 }), sizeof(int))) {
|
||||
flow_trace(conn, "can't set SO_RCVLOWAT on %d",
|
||||
conn->s[sidei]);
|
||||
conn->s[side]);
|
||||
}
|
||||
conn_flag(c, conn, ~RCVLOWAT_SET(sidei));
|
||||
conn_flag(c, conn, ~set);
|
||||
}
|
||||
}
|
||||
|
||||
flow_foreach_sidei(sidei)
|
||||
conn_flag(c, conn, ~RCVLOWAT_ACT(sidei));
|
||||
conn_flag(c, conn, ~RCVLOWAT_ACT_0);
|
||||
conn_flag(c, conn, ~RCVLOWAT_ACT_1);
|
||||
}
|
||||
|
|
|
@ -11,7 +11,10 @@ union sockaddr_inany;
|
|||
|
||||
void tcp_splice_sock_handler(struct ctx *c, union epoll_ref ref,
|
||||
uint32_t events);
|
||||
void tcp_splice_conn_from_sock(const struct ctx *c, union flow *flow, int s0);
|
||||
bool tcp_splice_conn_from_sock(const struct ctx *c,
|
||||
uint8_t pif0, in_port_t dstport,
|
||||
union flow *flow, int s0,
|
||||
const union sockaddr_inany *sa);
|
||||
void tcp_splice_init(struct ctx *c);
|
||||
|
||||
#endif /* TCP_SPLICE_H */
|
||||
|
|
460
tcp_vu.c
Normal file
460
tcp_vu.c
Normal file
|
@ -0,0 +1,460 @@
|
|||
// SPDX-License-Identifier: GPL-2.0-or-later
|
||||
|
||||
#include <errno.h>
|
||||
#include <stddef.h>
|
||||
#include <stdint.h>
|
||||
|
||||
#include <netinet/ip.h>
|
||||
|
||||
#include <sys/socket.h>
|
||||
|
||||
#include <linux/tcp.h>
|
||||
#include <linux/virtio_net.h>
|
||||
|
||||
#include "util.h"
|
||||
#include "ip.h"
|
||||
#include "passt.h"
|
||||
#include "siphash.h"
|
||||
#include "inany.h"
|
||||
#include "vhost_user.h"
|
||||
#include "tcp.h"
|
||||
#include "pcap.h"
|
||||
#include "flow.h"
|
||||
#include "tcp_conn.h"
|
||||
#include "flow_table.h"
|
||||
#include "tcp_vu.h"
|
||||
#include "tcp_internal.h"
|
||||
#include "checksum.h"
|
||||
|
||||
#define CONN_V4(conn) (!!inany_v4(&(conn)->faddr))
|
||||
#define CONN_V6(conn) (!CONN_V4(conn))
|
||||
|
||||
/* vhost-user */
|
||||
static const struct virtio_net_hdr vu_header = {
|
||||
.flags = VIRTIO_NET_HDR_F_DATA_VALID,
|
||||
.gso_type = VIRTIO_NET_HDR_GSO_NONE,
|
||||
};
|
||||
|
||||
static unsigned char buffer[65536];
|
||||
static struct iovec iov_vu [VIRTQUEUE_MAX_SIZE];
|
||||
static unsigned int indexes [VIRTQUEUE_MAX_SIZE];
|
||||
|
||||
uint16_t tcp_vu_conn_tap_mss(const struct tcp_tap_conn *conn)
|
||||
{
|
||||
(void)conn;
|
||||
return USHRT_MAX;
|
||||
}
|
||||
|
||||
int tcp_vu_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags)
|
||||
{
|
||||
VuDev *vdev = (VuDev *)&c->vdev;
|
||||
VuVirtq *vq = &vdev->vq[VHOST_USER_RX_QUEUE];
|
||||
size_t tlen, vnet_hdrlen, ip_len, optlen = 0;
|
||||
struct virtio_net_hdr_mrg_rxbuf *vh;
|
||||
VuVirtqElement *elem;
|
||||
struct ethhdr *eh;
|
||||
int nb_ack;
|
||||
int ret;
|
||||
|
||||
elem = vu_queue_pop(vdev, vq, sizeof(VuVirtqElement), buffer);
|
||||
if (!elem)
|
||||
return 0;
|
||||
|
||||
if (elem->in_num < 1) {
|
||||
err("virtio-net receive queue contains no in buffers");
|
||||
vu_queue_rewind(vdev, vq, 1);
|
||||
return 0;
|
||||
}
|
||||
|
||||
vh = elem->in_sg[0].iov_base;
|
||||
|
||||
vh->hdr = vu_header;
|
||||
if (vu_has_feature(vdev, VIRTIO_NET_F_MRG_RXBUF)) {
|
||||
vnet_hdrlen = sizeof(struct virtio_net_hdr_mrg_rxbuf);
|
||||
vh->num_buffers = htole16(1);
|
||||
} else {
|
||||
vnet_hdrlen = sizeof(struct virtio_net_hdr);
|
||||
}
|
||||
eh = (struct ethhdr *)((char *)elem->in_sg[0].iov_base + vnet_hdrlen);
|
||||
|
||||
memcpy(eh->h_dest, c->mac_guest, sizeof(eh->h_dest));
|
||||
memcpy(eh->h_source, c->mac, sizeof(eh->h_source));
|
||||
|
||||
if (CONN_V4(conn)) {
|
||||
struct iphdr *iph = (struct iphdr *)(eh + 1);
|
||||
struct tcphdr *th = (struct tcphdr *)(iph + 1);
|
||||
char *data = (char *)(th + 1);
|
||||
|
||||
eh->h_proto = htons(ETH_P_IP);
|
||||
|
||||
*th = (struct tcphdr){
|
||||
.doff = sizeof(struct tcphdr) / 4,
|
||||
.ack = 1
|
||||
};
|
||||
|
||||
*iph = (struct iphdr)L2_BUF_IP4_INIT(IPPROTO_TCP);
|
||||
|
||||
ret = tcp_fill_flag_header(c, conn, flags, th, data, &optlen);
|
||||
if (ret <= 0) {
|
||||
vu_queue_rewind(vdev, vq, 1);
|
||||
return ret;
|
||||
}
|
||||
|
||||
ip_len = tcp_fill_headers4(c, conn, iph,
|
||||
(struct tcphdr *)(iph + 1), optlen,
|
||||
NULL, conn->seq_to_tap);
|
||||
|
||||
tlen = ip_len + sizeof(struct ethhdr);
|
||||
|
||||
if (*c->pcap) {
|
||||
uint32_t sum = proto_ipv4_header_psum(iph->tot_len,
|
||||
IPPROTO_TCP,
|
||||
(struct in_addr){ .s_addr = iph->saddr },
|
||||
(struct in_addr){ .s_addr = iph->daddr });
|
||||
|
||||
th->check = csum(th, optlen + sizeof(struct tcphdr), sum);
|
||||
}
|
||||
} else {
|
||||
struct ipv6hdr *ip6h = (struct ipv6hdr *)(eh + 1);
|
||||
struct tcphdr *th = (struct tcphdr *)(ip6h + 1);
|
||||
char *data = (char *)(th + 1);
|
||||
|
||||
eh->h_proto = htons(ETH_P_IPV6);
|
||||
|
||||
*th = (struct tcphdr){
|
||||
.doff = sizeof(struct tcphdr) / 4,
|
||||
.ack = 1
|
||||
};
|
||||
|
||||
*ip6h = (struct ipv6hdr)L2_BUF_IP6_INIT(IPPROTO_TCP);
|
||||
|
||||
ret = tcp_fill_flag_header(c, conn, flags, th, data, &optlen);
|
||||
if (ret <= 0) {
|
||||
vu_queue_rewind(vdev, vq, 1);
|
||||
return ret;
|
||||
}
|
||||
|
||||
ip_len = tcp_fill_headers6(c, conn, ip6h,
|
||||
(struct tcphdr *)(ip6h + 1),
|
||||
optlen, conn->seq_to_tap);
|
||||
|
||||
tlen = ip_len + sizeof(struct ethhdr);
|
||||
|
||||
if (*c->pcap) {
|
||||
uint32_t sum = proto_ipv6_header_psum(ip6h->payload_len,
|
||||
IPPROTO_TCP,
|
||||
&ip6h->saddr,
|
||||
&ip6h->daddr);
|
||||
|
||||
th->check = csum(th, optlen + sizeof(struct tcphdr), sum);
|
||||
}
|
||||
}
|
||||
|
||||
pcap((void *)eh, tlen);
|
||||
|
||||
tlen += vnet_hdrlen;
|
||||
vu_queue_fill(vdev, vq, elem, tlen, 0);
|
||||
nb_ack = 1;
|
||||
|
||||
if (flags & DUP_ACK) {
|
||||
elem = vu_queue_pop(vdev, vq, sizeof(VuVirtqElement), buffer);
|
||||
if (elem) {
|
||||
if (elem->in_num < 1 || elem->in_sg[0].iov_len < tlen) {
|
||||
vu_queue_rewind(vdev, vq, 1);
|
||||
} else {
|
||||
memcpy(elem->in_sg[0].iov_base, vh, tlen);
|
||||
nb_ack++;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
vu_queue_flush(vdev, vq, nb_ack);
|
||||
vu_queue_notify(vdev, vq);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int tcp_vu_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn)
|
||||
{
|
||||
uint32_t wnd_scaled = conn->wnd_from_tap << conn->ws_from_tap;
|
||||
uint32_t already_sent;
|
||||
VuDev *vdev = (VuDev *)&c->vdev;
|
||||
VuVirtq *vq = &vdev->vq[VHOST_USER_RX_QUEUE];
|
||||
int s = conn->sock, v4 = CONN_V4(conn);
|
||||
int i, ret = 0, iov_count, iov_used;
|
||||
struct msghdr mh_sock = { 0 };
|
||||
size_t l2_hdrlen, vnet_hdrlen, fillsize;
|
||||
ssize_t len;
|
||||
uint16_t *check;
|
||||
uint16_t mss = MSS_GET(conn);
|
||||
int num_buffers;
|
||||
int segment_size;
|
||||
struct iovec *first;
|
||||
bool has_mrg_rxbuf;
|
||||
|
||||
if (!vu_queue_enabled(vq) || !vu_queue_started(vq)) {
|
||||
err("Got packet, but no available descriptors on RX virtq.");
|
||||
return 0;
|
||||
}
|
||||
|
||||
already_sent = conn->seq_to_tap - conn->seq_ack_from_tap;
|
||||
|
||||
if (SEQ_LT(already_sent, 0)) {
|
||||
/* RFC 761, section 2.1. */
|
||||
flow_trace(conn, "ACK sequence gap: ACK for %u, sent: %u",
|
||||
conn->seq_ack_from_tap, conn->seq_to_tap);
|
||||
conn->seq_to_tap = conn->seq_ack_from_tap;
|
||||
already_sent = 0;
|
||||
}
|
||||
|
||||
if (!wnd_scaled || already_sent >= wnd_scaled) {
|
||||
conn_flag(c, conn, STALLED);
|
||||
conn_flag(c, conn, ACK_FROM_TAP_DUE);
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Set up buffer descriptors we'll fill completely and partially. */
|
||||
|
||||
fillsize = wnd_scaled;
|
||||
|
||||
iov_vu[0].iov_base = tcp_buf_discard;
|
||||
iov_vu[0].iov_len = already_sent;
|
||||
fillsize -= already_sent;
|
||||
|
||||
has_mrg_rxbuf = vu_has_feature(vdev, VIRTIO_NET_F_MRG_RXBUF);
|
||||
if (has_mrg_rxbuf) {
|
||||
vnet_hdrlen = sizeof(struct virtio_net_hdr_mrg_rxbuf);
|
||||
} else {
|
||||
vnet_hdrlen = sizeof(struct virtio_net_hdr);
|
||||
}
|
||||
l2_hdrlen = vnet_hdrlen + sizeof(struct ethhdr) + sizeof(struct tcphdr);
|
||||
if (v4) {
|
||||
l2_hdrlen += sizeof(struct iphdr);
|
||||
} else {
|
||||
l2_hdrlen += sizeof(struct ipv6hdr);
|
||||
}
|
||||
|
||||
iov_count = 0;
|
||||
segment_size = 0;
|
||||
while (fillsize > 0 && iov_count < VIRTQUEUE_MAX_SIZE - 1) {
|
||||
VuVirtqElement *elem;
|
||||
|
||||
elem = vu_queue_pop(vdev, vq, sizeof(VuVirtqElement), buffer);
|
||||
if (!elem)
|
||||
break;
|
||||
|
||||
if (elem->in_num < 1) {
|
||||
err("virtio-net receive queue contains no in buffers");
|
||||
goto err;
|
||||
}
|
||||
|
||||
ASSERT(elem->in_num == 1);
|
||||
ASSERT(elem->in_sg[0].iov_len >= l2_hdrlen);
|
||||
|
||||
indexes[iov_count] = elem->index;
|
||||
|
||||
if (segment_size == 0) {
|
||||
iov_vu[iov_count + 1].iov_base =
|
||||
(char *)elem->in_sg[0].iov_base + l2_hdrlen;
|
||||
iov_vu[iov_count + 1].iov_len =
|
||||
elem->in_sg[0].iov_len - l2_hdrlen;
|
||||
} else {
|
||||
iov_vu[iov_count + 1].iov_base = elem->in_sg[0].iov_base;
|
||||
iov_vu[iov_count + 1].iov_len = elem->in_sg[0].iov_len;
|
||||
}
|
||||
|
||||
if (iov_vu[iov_count + 1].iov_len > fillsize)
|
||||
iov_vu[iov_count + 1].iov_len = fillsize;
|
||||
|
||||
segment_size += iov_vu[iov_count + 1].iov_len;
|
||||
if (!has_mrg_rxbuf) {
|
||||
segment_size = 0;
|
||||
} else if (segment_size >= mss) {
|
||||
iov_vu[iov_count + 1].iov_len -= segment_size - mss;
|
||||
segment_size = 0;
|
||||
}
|
||||
fillsize -= iov_vu[iov_count + 1].iov_len;
|
||||
|
||||
iov_count++;
|
||||
}
|
||||
if (iov_count == 0)
|
||||
return 0;
|
||||
|
||||
mh_sock.msg_iov = iov_vu;
|
||||
mh_sock.msg_iovlen = iov_count + 1;
|
||||
|
||||
do
|
||||
len = recvmsg(s, &mh_sock, MSG_PEEK);
|
||||
while (len < 0 && errno == EINTR);
|
||||
|
||||
if (len < 0)
|
||||
goto err;
|
||||
|
||||
if (!len) {
|
||||
vu_queue_rewind(vdev, vq, iov_count);
|
||||
if ((conn->events & (SOCK_FIN_RCVD | TAP_FIN_SENT)) == SOCK_FIN_RCVD) {
|
||||
if ((ret = tcp_vu_send_flag(c, conn, FIN | ACK))) {
|
||||
tcp_rst(c, conn);
|
||||
return ret;
|
||||
}
|
||||
|
||||
conn_event(c, conn, TAP_FIN_SENT);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
len -= already_sent;
|
||||
if (len <= 0) {
|
||||
conn_flag(c, conn, STALLED);
|
||||
vu_queue_rewind(vdev, vq, iov_count);
|
||||
return 0;
|
||||
}
|
||||
|
||||
conn_flag(c, conn, ~STALLED);
|
||||
|
||||
/* Likely, some new data was acked too. */
|
||||
tcp_update_seqack_wnd(c, conn, 0, NULL);
|
||||
|
||||
/* initialize headers */
|
||||
iov_used = 0;
|
||||
num_buffers = 0;
|
||||
check = NULL;
|
||||
segment_size = 0;
|
||||
for (i = 0; i < iov_count && len; i++) {
|
||||
|
||||
if (segment_size == 0)
|
||||
first = &iov_vu[i + 1];
|
||||
|
||||
if (iov_vu[i + 1].iov_len > (size_t)len)
|
||||
iov_vu[i + 1].iov_len = len;
|
||||
|
||||
len -= iov_vu[i + 1].iov_len;
|
||||
iov_used++;
|
||||
|
||||
segment_size += iov_vu[i + 1].iov_len;
|
||||
num_buffers++;
|
||||
|
||||
if (segment_size >= mss || len == 0 ||
|
||||
i + 1 == iov_count || !has_mrg_rxbuf) {
|
||||
|
||||
struct ethhdr *eh;
|
||||
struct virtio_net_hdr_mrg_rxbuf *vh;
|
||||
char *base = (char *)first->iov_base - l2_hdrlen;
|
||||
size_t size = first->iov_len + l2_hdrlen;
|
||||
|
||||
vh = (struct virtio_net_hdr_mrg_rxbuf *)base;
|
||||
|
||||
vh->hdr = vu_header;
|
||||
if (has_mrg_rxbuf)
|
||||
vh->num_buffers = htole16(num_buffers);
|
||||
|
||||
eh = (struct ethhdr *)((char *)base + vnet_hdrlen);
|
||||
|
||||
memcpy(eh->h_dest, c->mac_guest, sizeof(eh->h_dest));
|
||||
memcpy(eh->h_source, c->mac, sizeof(eh->h_source));
|
||||
|
||||
/* initialize header */
|
||||
if (v4) {
|
||||
struct iphdr *iph = (struct iphdr *)(eh + 1);
|
||||
struct tcphdr *th = (struct tcphdr *)(iph + 1);
|
||||
|
||||
eh->h_proto = htons(ETH_P_IP);
|
||||
|
||||
*th = (struct tcphdr){
|
||||
.doff = sizeof(struct tcphdr) / 4,
|
||||
.ack = 1
|
||||
};
|
||||
|
||||
*iph = (struct iphdr)L2_BUF_IP4_INIT(IPPROTO_TCP);
|
||||
|
||||
tcp_fill_headers4(c, conn, iph,
|
||||
(struct tcphdr *)(iph + 1),
|
||||
segment_size, len ? check : NULL,
|
||||
conn->seq_to_tap);
|
||||
|
||||
if (*c->pcap) {
|
||||
uint32_t sum = proto_ipv4_header_psum(iph->tot_len,
|
||||
IPPROTO_TCP,
|
||||
(struct in_addr){ .s_addr = iph->saddr },
|
||||
(struct in_addr){ .s_addr = iph->daddr });
|
||||
|
||||
first->iov_base = th;
|
||||
first->iov_len = size - l2_hdrlen + sizeof(*th);
|
||||
|
||||
th->check = csum_iov(first, num_buffers, sum);
|
||||
}
|
||||
|
||||
check = &iph->check;
|
||||
} else {
|
||||
struct ipv6hdr *ip6h = (struct ipv6hdr *)(eh + 1);
|
||||
struct tcphdr *th = (struct tcphdr *)(ip6h + 1);
|
||||
|
||||
eh->h_proto = htons(ETH_P_IPV6);
|
||||
|
||||
*th = (struct tcphdr){
|
||||
.doff = sizeof(struct tcphdr) / 4,
|
||||
.ack = 1
|
||||
};
|
||||
|
||||
*ip6h = (struct ipv6hdr)L2_BUF_IP6_INIT(IPPROTO_TCP);
|
||||
|
||||
tcp_fill_headers6(c, conn, ip6h,
|
||||
(struct tcphdr *)(ip6h + 1),
|
||||
segment_size, conn->seq_to_tap);
|
||||
if (*c->pcap) {
|
||||
uint32_t sum = proto_ipv6_header_psum(ip6h->payload_len,
|
||||
IPPROTO_TCP,
|
||||
&ip6h->saddr,
|
||||
&ip6h->daddr);
|
||||
|
||||
first->iov_base = th;
|
||||
first->iov_len = size - l2_hdrlen + sizeof(*th);
|
||||
|
||||
th->check = csum_iov(first, num_buffers, sum);
|
||||
}
|
||||
}
|
||||
|
||||
/* set iov for pcap logging */
|
||||
first->iov_base = eh;
|
||||
first->iov_len = size - vnet_hdrlen;
|
||||
|
||||
pcap_iov(first, num_buffers);
|
||||
|
||||
/* set iov_len for vu_queue_fill_by_index(); */
|
||||
|
||||
first->iov_base = base;
|
||||
first->iov_len = size;
|
||||
|
||||
conn->seq_to_tap += segment_size;
|
||||
|
||||
segment_size = 0;
|
||||
num_buffers = 0;
|
||||
}
|
||||
}
|
||||
|
||||
/* release unused buffers */
|
||||
vu_queue_rewind(vdev, vq, iov_count - iov_used);
|
||||
|
||||
/* send packets */
|
||||
for (i = 0; i < iov_used; i++) {
|
||||
vu_queue_fill_by_index(vdev, vq, indexes[i],
|
||||
iov_vu[i + 1].iov_len, i);
|
||||
}
|
||||
|
||||
vu_queue_flush(vdev, vq, iov_used);
|
||||
vu_queue_notify(vdev, vq);
|
||||
|
||||
conn_flag(c, conn, ACK_FROM_TAP_DUE);
|
||||
|
||||
return 0;
|
||||
err:
|
||||
vu_queue_rewind(vdev, vq, iov_count);
|
||||
|
||||
if (errno != EAGAIN && errno != EWOULDBLOCK) {
|
||||
ret = -errno;
|
||||
tcp_rst(c, conn);
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
9
tcp_vu.h
Normal file
9
tcp_vu.h
Normal file
|
@ -0,0 +1,9 @@
|
|||
// SPDX-License-Identifier: GPL-2.0-or-later
|
||||
|
||||
#ifndef TCP_VU_H
|
||||
#define TCP_VU_H
|
||||
|
||||
int tcp_vu_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags);
|
||||
int tcp_vu_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn);
|
||||
|
||||
#endif /* TCP_VU_H */
|
1
test/.gitignore
vendored
1
test/.gitignore
vendored
|
@ -1,6 +1,5 @@
|
|||
test_logs/
|
||||
mbuto/
|
||||
podman/
|
||||
*.img
|
||||
QEMU_EFI.fd
|
||||
*.qcow2
|
||||
|
|
|
@ -8,6 +8,7 @@
|
|||
WGET = wget -c
|
||||
|
||||
DEBIAN_IMGS = debian-8.11.0-openstack-amd64.qcow2 \
|
||||
debian-9-nocloud-amd64-daily-20200210-166.qcow2 \
|
||||
debian-10-nocloud-amd64.qcow2 \
|
||||
debian-10-generic-arm64.qcow2 \
|
||||
debian-10-generic-ppc64el-20220911-1135.qcow2 \
|
||||
|
@ -41,7 +42,8 @@ OPENSUSE_IMGS = openSUSE-Leap-15.1-JeOS.x86_64-kvm-and-xen.qcow2 \
|
|||
openSUSE-Leap-15.2-JeOS.x86_64-kvm-and-xen.qcow2 \
|
||||
openSUSE-Leap-15.3-JeOS.x86_64-kvm-and-xen.qcow2 \
|
||||
openSUSE-Tumbleweed-ARM-JeOS-efi.aarch64.raw.xz \
|
||||
openSUSE-Tumbleweed-ARM-JeOS-efi.armv7l.raw.xz
|
||||
openSUSE-Tumbleweed-ARM-JeOS-efi.armv7l.raw.xz \
|
||||
openSUSE-Tumbleweed-JeOS.x86_64-kvm-and-xen.qcow2
|
||||
|
||||
UBUNTU_OLD_IMGS = trusty-server-cloudimg-amd64-disk1.img \
|
||||
trusty-server-cloudimg-i386-disk1.img \
|
||||
|
@ -50,10 +52,10 @@ UBUNTU_NEW_IMGS = xenial-server-cloudimg-powerpc-disk1.img \
|
|||
jammy-server-cloudimg-s390x.img
|
||||
UBUNTU_IMGS = $(UBUNTU_OLD_IMGS) $(UBUNTU_NEW_IMGS)
|
||||
|
||||
DOWNLOAD_ASSETS = mbuto podman \
|
||||
DOWNLOAD_ASSETS = mbuto \
|
||||
$(DEBIAN_IMGS) $(FEDORA_IMGS) $(OPENSUSE_IMGS) $(UBUNTU_IMGS)
|
||||
TESTDATA_ASSETS = small.bin big.bin medium.bin
|
||||
LOCAL_ASSETS = mbuto.img mbuto.mem.img podman/bin/podman QEMU_EFI.fd \
|
||||
LOCAL_ASSETS = mbuto.img mbuto.mem.img QEMU_EFI.fd \
|
||||
$(DEBIAN_IMGS:%=prepared-%) $(FEDORA_IMGS:%=prepared-%) \
|
||||
$(UBUNTU_NEW_IMGS:%=prepared-%) \
|
||||
nstool guest-key guest-key.pub \
|
||||
|
@ -65,27 +67,13 @@ CFLAGS = -Wall -Werror -Wextra -pedantic -std=c99
|
|||
|
||||
assets: $(ASSETS)
|
||||
|
||||
.PHONY: pull-%
|
||||
pull-%: %
|
||||
git -C $* pull
|
||||
|
||||
mbuto:
|
||||
git clone git://mbuto.sh/mbuto
|
||||
|
||||
mbuto/mbuto: pull-mbuto
|
||||
|
||||
podman:
|
||||
git clone https://github.com/containers/podman.git
|
||||
|
||||
# To successfully build podman, you will need gpgme and systemd
|
||||
# development packages
|
||||
podman/bin/podman: pull-podman
|
||||
$(MAKE) -C podman
|
||||
|
||||
guest-key guest-key.pub:
|
||||
ssh-keygen -f guest-key -N ''
|
||||
|
||||
mbuto.img: passt.mbuto mbuto/mbuto guest-key.pub $(TESTDATA_ASSETS)
|
||||
mbuto.img: passt.mbuto mbuto guest-key.pub $(TESTDATA_ASSETS)
|
||||
./mbuto/mbuto -p ./$< -c lz4 -f $@
|
||||
|
||||
mbuto.mem.img: passt.mem.mbuto mbuto ../passt.avx2
|
||||
|
@ -133,6 +121,9 @@ realclean: clean
|
|||
debian-8.11.0-openstack-%.qcow2:
|
||||
$(WGET) -O $@ https://cloud.debian.org/images/cloud/OpenStack/archive/8.11.0/debian-8.11.0-openstack-$*.qcow2
|
||||
|
||||
debian-9-nocloud-%-daily-20200210-166.qcow2:
|
||||
$(WGET) -O $@ https://cloud.debian.org/images/cloud/stretch/daily/20200210-166/debian-9-nocloud-$*-daily-20200210-166.qcow2
|
||||
|
||||
debian-10-nocloud-%.qcow2:
|
||||
$(WGET) -O $@ https://cloud.debian.org/images/cloud/buster/latest/debian-10-nocloud-$*.qcow2
|
||||
|
||||
|
@ -198,6 +189,9 @@ openSUSE-Tumbleweed-ARM-JeOS-efi.aarch64.raw.xz:
|
|||
openSUSE-Tumbleweed-ARM-JeOS-efi.armv7l.raw.xz:
|
||||
$(WGET) -O $@ http://download.opensuse.org/ports/armv7hl/tumbleweed/appliances/openSUSE-Tumbleweed-ARM-JeOS-efi.armv7l.raw.xz
|
||||
|
||||
openSUSE-Tumbleweed-JeOS.x86_64-kvm-and-xen.qcow2:
|
||||
$(WGET) -O $@ https://download.opensuse.org/tumbleweed/appliances/openSUSE-Tumbleweed-JeOS.x86_64-kvm-and-xen.qcow2
|
||||
|
||||
# Ubuntu downloads
|
||||
trusty-server-cloudimg-%-disk1.img:
|
||||
$(WGET) -O $@ https://cloud-images.ubuntu.com/trusty/current/trusty-server-cloudimg-$*-disk1.img
|
||||
|
|
|
@ -28,11 +28,10 @@ on a system, i.e. common utilities such as a shell are not included here.
|
|||
|
||||
Example for Debian, and possibly most Debian-based distributions:
|
||||
|
||||
bats bc build-essential catatonit clang-tidy conmon cppcheck crun fakeroot
|
||||
git go iperf3 isc-dhcp-common jq libgpgme-dev libseccomp-dev linux-cpupower
|
||||
lm-sensors lz4 netavark netcat-openbsd psmisc qemu-efi-aarch64
|
||||
qemu-system-arm qemu-system-misc qemu-system-ppc qemu-system-x86
|
||||
qemu-system-x86 sipcalc socat strace tmux uidmap valgrind
|
||||
build-essential git jq strace iperf3 qemu-system-x86 tmux sipcalc bats bc
|
||||
catatonit clang-tidy cppcheck go isc-dhcp-common psmisc linux-cpupower socat
|
||||
netcat-openbsd fakeroot lz4 lm-sensors qemu-system-arm qemu-system-ppc
|
||||
qemu-system-misc qemu-system-x86 valgrind
|
||||
|
||||
NOTE: the tests need a qemu version >= 7.2, or one that contains commit
|
||||
13c6be96618c ("net: stream: add unix socket"): this change introduces support
|
||||
|
|
|
@ -15,7 +15,7 @@
|
|||
|
||||
# layout_pasta() - Panes for host, pasta, and separate one for namespace
|
||||
layout_pasta() {
|
||||
sleep 1
|
||||
sleep 3
|
||||
|
||||
tmux kill-pane -a -t 0
|
||||
cmd_write 0 clear
|
||||
|
@ -46,7 +46,7 @@ layout_pasta() {
|
|||
|
||||
# layout_passt() - Panes for host, passt, and guest
|
||||
layout_passt() {
|
||||
sleep 1
|
||||
sleep 3
|
||||
|
||||
tmux kill-pane -a -t 0
|
||||
cmd_write 0 clear
|
||||
|
@ -77,7 +77,7 @@ layout_passt() {
|
|||
|
||||
# layout_passt_in_pasta() - Host, passt within pasta, namespace and guest
|
||||
layout_passt_in_pasta() {
|
||||
sleep 1
|
||||
sleep 3
|
||||
|
||||
tmux kill-pane -a -t 0
|
||||
cmd_write 0 clear
|
||||
|
@ -113,7 +113,7 @@ layout_passt_in_pasta() {
|
|||
|
||||
# layout_two_guests() - Two guest panes, two passt panes, plus host and log
|
||||
layout_two_guests() {
|
||||
sleep 1
|
||||
sleep 3
|
||||
|
||||
tmux kill-pane -a -t 0
|
||||
cmd_write 0 clear
|
||||
|
@ -152,7 +152,7 @@ layout_two_guests() {
|
|||
|
||||
# layout_demo_pasta() - Four panes for pasta demo
|
||||
layout_demo_pasta() {
|
||||
sleep 1
|
||||
sleep 3
|
||||
|
||||
cmd_write 0 cd ${BASEPATH}
|
||||
cmd_write 0 clear
|
||||
|
@ -188,7 +188,7 @@ layout_demo_pasta() {
|
|||
|
||||
# layout_demo_passt() - Four panes for passt demo
|
||||
layout_demo_passt() {
|
||||
sleep 1
|
||||
sleep 3
|
||||
|
||||
cmd_write 0 cd ${BASEPATH}
|
||||
cmd_write 0 clear
|
||||
|
@ -224,7 +224,7 @@ layout_demo_passt() {
|
|||
|
||||
# layout_demo_podman() - Four panes for pasta demo with Podman
|
||||
layout_demo_podman() {
|
||||
sleep 1
|
||||
sleep 3
|
||||
|
||||
cmd_write 0 cd ${BASEPATH}
|
||||
cmd_write 0 clear
|
||||
|
|
|
@ -18,7 +18,7 @@ PERF_LINK_COUNT=0
|
|||
PERF_JS="${LOGDIR}/web/perf.js"
|
||||
|
||||
PERF_TEMPLATE_HTML="document.write('"'
|
||||
Throughput in Gbps, latency in µs. Threads are <span style="font-family: monospace;">iperf3</span> threads, <i>passt</i> and <i>pasta</i> are currently single-threaded.<br/>
|
||||
Throughput in Gbps, latency in µs. Threads are <span style="font-family: monospace;">iperf3</span> processes, <i>passt</i> and <i>pasta</i> are currently single-threaded.<br/>
|
||||
Click on numbers to show test execution. Measured at head, commit <span style="font-family: monospace;">__commit__</span>.
|
||||
|
||||
<style type="text/CSS">
|
||||
|
@ -56,7 +56,7 @@ table.pasta_local th { text-align: center; font-weight: bold; }
|
|||
table.pasta_local tr:not(:first-of-type) td:not(:first-of-type) { font-family: monospace; font-weight: bolder; }
|
||||
table.pasta_local tr:nth-child(3n+0) { background-color: #112315; }
|
||||
table.pasta_local tr:not(:nth-child(3n+0)) td { background-color: #101010; }
|
||||
table.pasta_local td:nth-child(4n+2) { background-color: #603302; }
|
||||
table.pasta_local td:nth-child(3n+2) { background-color: #603302; }
|
||||
table.pasta_local tr:nth-child(1) { background-color: #363e61; }
|
||||
table.pasta td { border: 0px solid; padding: 6px; line-height: 1; }
|
||||
table.pasta td { text-align: right; }
|
||||
|
|
|
@ -17,8 +17,6 @@ INITRAMFS="${BASEPATH}/mbuto.img"
|
|||
VCPUS="$( [ $(nproc) -ge 8 ] && echo 6 || echo $(( $(nproc) / 2 + 1 )) )"
|
||||
__mem_kib="$(sed -n 's/MemTotal:[ ]*\([0-9]*\) kB/\1/p' /proc/meminfo)"
|
||||
VMEM="$((${__mem_kib} / 1024 / 4))"
|
||||
QEMU_ARCH="$(uname -m)"
|
||||
[ "${QEMU_ARCH}" = "i686" ] && QEMU_ARCH=i386
|
||||
|
||||
# setup_build() - Set up pane layout for build tests
|
||||
setup_build() {
|
||||
|
@ -55,10 +53,10 @@ setup_passt() {
|
|||
wait_for [ -f "${STATESETUP}/passt.pid" ]
|
||||
|
||||
GUEST_CID=94557
|
||||
context_run_bg qemu 'qemu-system-'"${QEMU_ARCH}" \
|
||||
context_run_bg qemu 'qemu-system-$(uname -m)' \
|
||||
' -machine accel=kvm' \
|
||||
' -m '${VMEM}' -cpu host -smp '${VCPUS} \
|
||||
' -kernel '"${KERNEL}" \
|
||||
' -kernel ' "/boot/vmlinuz-$(uname -r)" \
|
||||
' -initrd '${INITRAMFS}' -nographic -serial stdio' \
|
||||
' -nodefaults' \
|
||||
' -append "console=ttyS0 mitigations=off apparmor=0" ' \
|
||||
|
@ -126,12 +124,7 @@ setup_passt_in_ns() {
|
|||
[ ${DEBUG} -eq 1 ] && __opts="${__opts} -d"
|
||||
[ ${TRACE} -eq 1 ] && __opts="${__opts} --trace"
|
||||
|
||||
__map_host4=192.0.2.1
|
||||
__map_host6=2001:db8:9a55::1
|
||||
__map_ns4=192.0.2.2
|
||||
__map_ns6=2001:db8:9a55::2
|
||||
|
||||
context_run_bg pasta "./pasta ${__opts} -t 10001,10002,10011,10012 -T 10003,10013 -u 10001,10002,10011,10012 -U 10003,10013 -P ${STATESETUP}/pasta.pid --map-host-loopback ${__map_host4} --map-host-loopback ${__map_host6} --config-net ${NSTOOL} hold ${STATESETUP}/ns.hold"
|
||||
context_run_bg pasta "./pasta ${__opts} -t 10001,10002,10011,10012 -T 10003,10013 -u 10001,10002,10011,10012 -U 10003,10013 -P ${STATESETUP}/pasta.pid --config-net ${NSTOOL} hold ${STATESETUP}/ns.hold"
|
||||
wait_for [ -f "${STATESETUP}/pasta.pid" ]
|
||||
|
||||
context_setup_nstool qemu ${STATESETUP}/ns.hold
|
||||
|
@ -146,20 +139,20 @@ setup_passt_in_ns() {
|
|||
if [ ${VALGRIND} -eq 1 ]; then
|
||||
context_run passt "make clean"
|
||||
context_run passt "make valgrind"
|
||||
context_run_bg passt "valgrind --max-stackframe=$((4 * 1024 * 1024)) --trace-children=yes --vgdb=no --error-exitcode=1 --suppressions=test/valgrind.supp ./passt -f ${__opts} -s ${STATESETUP}/passt.socket -t 10001,10011,10021,10031 -u 10001,10011,10021,10031 -P ${STATESETUP}/passt.pid --map-host-loopback ${__map_ns4} --map-host-loopback ${__map_ns6}"
|
||||
context_run_bg passt "valgrind --max-stackframe=$((4 * 1024 * 1024)) --trace-children=yes --vgdb=no --error-exitcode=1 --suppressions=test/valgrind.supp ./passt -f ${__opts} -s ${STATESETUP}/passt.socket -t 10001,10011,10021,10031 -u 10001,10011,10021,10031 -P ${STATESETUP}/passt.pid"
|
||||
else
|
||||
context_run passt "make clean"
|
||||
context_run passt "make"
|
||||
context_run_bg passt "./passt -f ${__opts} -s ${STATESETUP}/passt.socket -t 10001,10011,10021,10031 -u 10001,10011,10021,10031 -P ${STATESETUP}/passt.pid --map-host-loopback ${__map_ns4} --map-host-loopback ${__map_ns6}"
|
||||
context_run_bg passt "./passt -f ${__opts} -s ${STATESETUP}/passt.socket -t 10001,10011,10021,10031 -u 10001,10011,10021,10031 -P ${STATESETUP}/passt.pid"
|
||||
fi
|
||||
wait_for [ -f "${STATESETUP}/passt.pid" ]
|
||||
|
||||
GUEST_CID=94557
|
||||
context_run_bg qemu 'qemu-system-'"${QEMU_ARCH}" \
|
||||
context_run_bg qemu 'qemu-system-$(uname -m)' \
|
||||
' -machine accel=kvm' \
|
||||
' -M accel=kvm:tcg' \
|
||||
' -m '${VMEM}' -cpu host -smp '${VCPUS} \
|
||||
' -kernel '"${KERNEL}" \
|
||||
' -kernel ' "/boot/vmlinuz-$(uname -r)" \
|
||||
' -initrd '${INITRAMFS}' -nographic -serial stdio' \
|
||||
' -nodefaults' \
|
||||
' -append "console=ttyS0 mitigations=off apparmor=0" ' \
|
||||
|
@ -227,10 +220,10 @@ setup_two_guests() {
|
|||
wait_for [ -f "${STATESETUP}/passt_2.pid" ]
|
||||
|
||||
GUEST_1_CID=94557
|
||||
context_run_bg qemu_1 'qemu-system-'"${QEMU_ARCH}" \
|
||||
context_run_bg qemu_1 'qemu-system-$(uname -m)' \
|
||||
' -M accel=kvm:tcg' \
|
||||
' -m '${VMEM}' -cpu host -smp '${VCPUS} \
|
||||
' -kernel '"${KERNEL}" \
|
||||
' -kernel ' "/boot/vmlinuz-$(uname -r)" \
|
||||
' -initrd '${INITRAMFS}' -nographic -serial stdio' \
|
||||
' -nodefaults' \
|
||||
' -append "console=ttyS0 mitigations=off apparmor=0" ' \
|
||||
|
@ -240,10 +233,10 @@ setup_two_guests() {
|
|||
" -device vhost-vsock-pci,guest-cid=$GUEST_1_CID"
|
||||
|
||||
GUEST_2_CID=94558
|
||||
context_run_bg qemu_2 'qemu-system-'"${QEMU_ARCH}" \
|
||||
context_run_bg qemu_2 'qemu-system-$(uname -m)' \
|
||||
' -M accel=kvm:tcg' \
|
||||
' -m '${VMEM}' -cpu host -smp '${VCPUS} \
|
||||
' -kernel '"${KERNEL}" \
|
||||
' -kernel ' "/boot/vmlinuz-$(uname -r)" \
|
||||
' -initrd '${INITRAMFS}' -nographic -serial stdio' \
|
||||
' -nodefaults' \
|
||||
' -append "console=ttyS0 mitigations=off apparmor=0" ' \
|
||||
|
|
|
@ -31,8 +31,8 @@ PR_DELAY_INIT=100 # ms
|
|||
# $@: Message to print
|
||||
info() {
|
||||
tmux select-pane -t ${PANE_INFO}
|
||||
printf "${@}\n" >> $STATEBASE/log_pipe
|
||||
printf "${@}\n" >> "${LOGFILE}"
|
||||
echo "${@}" >> $STATEBASE/log_pipe
|
||||
echo "${@}" >> "${LOGFILE}"
|
||||
}
|
||||
|
||||
# info_n() - Highlight, print message to pane and to log file without newline
|
||||
|
@ -47,13 +47,13 @@ info_n() {
|
|||
# $@: Message to print
|
||||
info_nolog() {
|
||||
tmux select-pane -t ${PANE_INFO}
|
||||
printf "${@}\n" >> $STATEBASE/log_pipe
|
||||
echo "${@}" >> $STATEBASE/log_pipe
|
||||
}
|
||||
|
||||
# log() - Print message to log file
|
||||
# $@: Message to print
|
||||
log() {
|
||||
printf "${@}\n" >> "${LOGFILE}"
|
||||
echo "${@}" >> "${LOGFILE}"
|
||||
}
|
||||
|
||||
# info_nolog_n() - Send message to pane without highlighting it, without newline
|
||||
|
@ -97,6 +97,7 @@ display_delay() {
|
|||
switch_pane() {
|
||||
tmux select-pane -t ${1}
|
||||
PR_DELAY=${PR_DELAY_INIT}
|
||||
display_delay "0.2"
|
||||
}
|
||||
|
||||
# cmd_write() - Write a command to a pane, letter by letter, and execute it
|
||||
|
@ -198,7 +199,7 @@ pane_run() {
|
|||
# $1: Pane name
|
||||
pane_wait() {
|
||||
__lc="$(echo "${1}" | tr [A-Z] [a-z])"
|
||||
sleep 0.01 || sleep 1
|
||||
sleep 0.1 || sleep 1
|
||||
|
||||
__done=0
|
||||
while
|
||||
|
@ -206,7 +207,7 @@ pane_wait() {
|
|||
case ${__l} in
|
||||
*"$ " | *"# ") return ;;
|
||||
esac
|
||||
do sleep 0.01 || sleep 1; done
|
||||
do sleep 0.1 || sleep 1; done
|
||||
}
|
||||
|
||||
# pane_parse() - Print last line, @EMPTY@ if command had no output
|
||||
|
@ -230,7 +231,7 @@ pane_status() {
|
|||
|
||||
__status="$(pane_parse "${1}")"
|
||||
while ! [ "${__status}" -eq "${__status}" ] 2>/dev/null; do
|
||||
sleep 0.01 || sleep 1
|
||||
sleep 1
|
||||
pane_run "${1}" 'echo $?'
|
||||
pane_wait "${1}"
|
||||
__status="$(pane_parse "${1}")"
|
||||
|
@ -382,16 +383,6 @@ info_check_failed() {
|
|||
printf " < failed.\n" >> "${LOGFILE}"
|
||||
}
|
||||
|
||||
# status_bar_blink() - Make status bar blink
|
||||
status_bar_blink() {
|
||||
for i in `seq 1 3`; do
|
||||
tmux set status-right-style 'bg=colour1 fg=colour196 bold'
|
||||
sleep 0.1 || sleep 1
|
||||
tmux set status-right-style 'bg=colour1 fg=colour233 bold'
|
||||
sleep 0.1 || sleep 1
|
||||
done
|
||||
}
|
||||
|
||||
# info_passed() - Display, log, and make status bar blink when a test passes
|
||||
info_passed() {
|
||||
switch_pane ${PANE_INFO}
|
||||
|
@ -400,7 +391,12 @@ info_passed() {
|
|||
log "...passed."
|
||||
log
|
||||
|
||||
[ ${FAST} -eq 1 ] || status_bar_blink
|
||||
for i in `seq 1 3`; do
|
||||
tmux set status-right-style 'bg=colour1 fg=colour2 bold'
|
||||
sleep "0.1"
|
||||
tmux set status-right-style 'bg=colour1 fg=colour233 bold'
|
||||
sleep "0.1"
|
||||
done
|
||||
}
|
||||
|
||||
# info_failed() - Display, log, and make status bar blink when a test fails
|
||||
|
@ -411,7 +407,12 @@ info_failed() {
|
|||
log "...failed."
|
||||
log
|
||||
|
||||
[ ${FAST} -eq 1 ] || status_bar_blink
|
||||
for i in `seq 1 3`; do
|
||||
tmux set status-right-style 'bg=colour1 fg=colour196 bold'
|
||||
sleep "0.1"
|
||||
tmux set status-right-style 'bg=colour1 fg=colour233 bold'
|
||||
sleep "0.1"
|
||||
done
|
||||
|
||||
pause_continue \
|
||||
"Press any key to pause test session" \
|
||||
|
@ -664,7 +665,7 @@ pause_continue() {
|
|||
|
||||
# run_term() - Start tmux session, running entry point, with recording if needed
|
||||
run_term() {
|
||||
TMUX="tmux new-session -s passt_test -eSTATEBASE=$STATEBASE -ePCAP=$PCAP -eDEBUG=$DEBUG -eTRACE=$TRACE -eKERNEL=$KERNEL"
|
||||
TMUX="tmux new-session -s passt_test -eSTATEBASE=$STATEBASE -ePCAP=$PCAP -eDEBUG=$DEBUG"
|
||||
|
||||
if [ ${CI} -eq 1 ]; then
|
||||
printf '\e[8;50;240t'
|
||||
|
|
|
@ -15,13 +15,18 @@
|
|||
|
||||
# test_iperf3s() - Start iperf3 server
|
||||
# $1: Destination/server context
|
||||
# $2: Port number
|
||||
# $2: Port number, ${i} is translated to process index
|
||||
# $3: Number of processes to run in parallel
|
||||
test_iperf3s() {
|
||||
__sctx="${1}"
|
||||
__port="${2}"
|
||||
__procs="$((${3} - 1))"
|
||||
|
||||
pane_or_context_run_bg "${__sctx}" \
|
||||
'iperf3 -s -p'${__port}' & echo $! > s.pid' \
|
||||
'for i in $(seq 0 '${__procs}'); do' \
|
||||
' iperf3 -s -p'${__port}' &' \
|
||||
' echo $! > s${i}.pid; ' \
|
||||
'done' \
|
||||
|
||||
sleep 1 # Wait for server to be ready
|
||||
}
|
||||
|
@ -31,9 +36,9 @@ test_iperf3s() {
|
|||
test_iperf3k() {
|
||||
__sctx="${1}"
|
||||
|
||||
pane_or_context_run "${__sctx}" 'kill -INT $(cat s.pid); rm s.pid'
|
||||
pane_or_context_run "${__sctx}" 'kill -INT $(cat s*.pid); rm s*.pid'
|
||||
|
||||
sleep 1 # Wait for kernel to free up ports
|
||||
sleep 3 # Wait for kernel to free up ports
|
||||
}
|
||||
|
||||
# test_iperf3() - Ugly helper for iperf3 directive
|
||||
|
@ -41,29 +46,37 @@ test_iperf3k() {
|
|||
# $2: Source/client context
|
||||
# $3: Destination name or address for client
|
||||
# $4: Port number, ${i} is translated to process index
|
||||
# $5: Run time, in seconds
|
||||
# $5: Number of processes to run in parallel
|
||||
# $6: Run time, in seconds
|
||||
# $@: Client options
|
||||
test_iperf3() {
|
||||
__var="${1}"; shift
|
||||
__cctx="${1}"; shift
|
||||
__dest="${1}"; shift
|
||||
__port="${1}"; shift
|
||||
__procs="$((${1} - 1))"; shift
|
||||
__time="${1}"; shift
|
||||
|
||||
pane_or_context_run "${__cctx}" 'rm -f c.json'
|
||||
pane_or_context_run "${__cctx}" 'rm -f c*.json'
|
||||
|
||||
# A 1s wait for connection on what's basically a local link
|
||||
# indicates something is pretty wrong
|
||||
__timeout=1000
|
||||
pane_or_context_run "${__cctx}" \
|
||||
'iperf3 -J -c '${__dest}' -p '${__port} \
|
||||
' --connect-timeout '${__timeout} \
|
||||
' -t'${__time}' -i0 '"${@}"' > c.json' \
|
||||
'(' \
|
||||
' for i in $(seq 0 '${__procs}'); do' \
|
||||
' iperf3 -J -c '${__dest}' -p '${__port} \
|
||||
' --connect-timeout '${__timeout} \
|
||||
' -t'${__time}' -i0 -T c${i} '"${@}" \
|
||||
' > c${i}.json &' \
|
||||
' done;' \
|
||||
' wait' \
|
||||
')'
|
||||
|
||||
__jval=".end.sum_received.bits_per_second"
|
||||
|
||||
__bw=$(pane_or_context_output "${__cctx}" \
|
||||
'cat c.json | jq -rMs "map('${__jval}') | add"')
|
||||
'cat c*.json | jq -rMs "map('${__jval}') | add"')
|
||||
|
||||
TEST_ONE_subs="$(list_add_pair "${TEST_ONE_subs}" "__${__var}__" "${__bw}" )"
|
||||
}
|
||||
|
|
|
@ -44,7 +44,7 @@ endef
|
|||
def start_stop_diff
|
||||
guest sed /proc/slabinfo -ne 's/^\([^ ]* *[^ ]* *[^ ]* *[^ ]*\).*/\\\1/p' > /tmp/slabinfo.before
|
||||
guest cat /proc/meminfo > /tmp/meminfo.before
|
||||
guest /bin/passt.avx2 -l /tmp/log -s /tmp/sock -P /tmp/pid __OPTS__
|
||||
guest /bin/passt.avx2 -l /tmp/log -s /tmp/sock -P /tmp/pid __OPTS__ --netns-only
|
||||
sleep 2
|
||||
guest cat /proc/meminfo > /tmp/meminfo.after
|
||||
guest sed /proc/slabinfo -ne 's/^\([^ ]* *[^ ]* *[^ ]* *[^ ]*\).*/\\\1/p' > /tmp/slabinfo.after
|
||||
|
@ -78,16 +78,9 @@ guest mount -o bind /proc /test/proc
|
|||
guest mount -o bind /dev /test/dev
|
||||
guest cp -Lr /bin /lib /lib64 /usr /sbin /test/
|
||||
|
||||
guest exec switch_root /test /bin/sh
|
||||
|
||||
guest ulimit -Hn 300000
|
||||
guest unshare -rUn
|
||||
guest ip link add eth0 type dummy
|
||||
guest ip link set eth0 up
|
||||
guest ip address add 192.0.2.2/24 dev eth0
|
||||
guest ip address add 2001:db8::2/64 dev eth0
|
||||
guest ip route add default via 192.0.2.1
|
||||
guest ip -6 route add default via 2001:db8::1 dev eth0
|
||||
guest unshare -rUm -R /test
|
||||
guest chroot .
|
||||
|
||||
guest meminfo_size() { grep "^$2:" $1 | tr -s ' ' | cut -f2 -d ' '; }
|
||||
guest meminfo_diff() { echo $(( $(meminfo_size $2 $3) - $(meminfo_size $1 $3) )); }
|
||||
|
@ -110,17 +103,27 @@ info
|
|||
th symbol MiB
|
||||
set WHAT tcp_buf_discard
|
||||
nm_row
|
||||
set WHAT flowtab
|
||||
set WHAT tcp6_l2_buf
|
||||
nm_row
|
||||
set WHAT tcp6_payload
|
||||
set WHAT tcp4_l2_buf
|
||||
nm_row
|
||||
set WHAT tcp4_payload
|
||||
set WHAT tc
|
||||
nm_row
|
||||
set WHAT pkt_buf
|
||||
nm_row
|
||||
set WHAT udp_payload
|
||||
set WHAT udp_splice_map
|
||||
nm_row
|
||||
set WHAT flow_hashtab
|
||||
set WHAT udp6_l2_buf
|
||||
nm_row
|
||||
set WHAT udp4_l2_buf
|
||||
nm_row
|
||||
set WHAT udp_tap_map
|
||||
nm_row
|
||||
set WHAT icmp_id_map
|
||||
nm_row
|
||||
set WHAT udp_splice_buf
|
||||
nm_row
|
||||
set WHAT tc_hash
|
||||
nm_row
|
||||
set WHAT pool_tap6_storage
|
||||
nm_row
|
||||
|
@ -139,6 +142,8 @@ set WHAT pid
|
|||
slab_row
|
||||
set WHAT dentry
|
||||
slab_row
|
||||
set WHAT Acpi-Parse
|
||||
slab_row
|
||||
set WHAT kmalloc-64
|
||||
slab_row
|
||||
set WHAT kmalloc-32
|
||||
|
|
|
@ -31,15 +31,10 @@
|
|||
|
||||
#define ARRAY_SIZE(a) ((int)(sizeof(a) / sizeof((a)[0])))
|
||||
|
||||
#define die(...) \
|
||||
do { \
|
||||
fprintf(stderr, "nstool: " __VA_ARGS__); \
|
||||
exit(1); \
|
||||
} while (0)
|
||||
|
||||
#define err(...) \
|
||||
do { \
|
||||
fprintf(stderr, "nstool: " __VA_ARGS__); \
|
||||
#define die(...) \
|
||||
do { \
|
||||
fprintf(stderr, __VA_ARGS__); \
|
||||
exit(1); \
|
||||
} while (0)
|
||||
|
||||
struct ns_type {
|
||||
|
@ -161,9 +156,6 @@ static int connect_ctl(const char *sockpath, bool wait,
|
|||
|
||||
static void cmd_hold(int argc, char *argv[])
|
||||
{
|
||||
struct sigaction sa = {
|
||||
.sa_handler = SIG_IGN,
|
||||
};
|
||||
int fd = socket(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, PF_UNIX);
|
||||
struct sockaddr_un addr;
|
||||
const char *sockpath = argv[1];
|
||||
|
@ -193,10 +185,6 @@ static void cmd_hold(int argc, char *argv[])
|
|||
if (!getcwd(info.cwd, sizeof(info.cwd)))
|
||||
die("getcwd(): %s\n", strerror(errno));
|
||||
|
||||
rc = sigaction(SIGPIPE, &sa, NULL);
|
||||
if (rc)
|
||||
die("sigaction(SIGPIPE): %s\n", strerror(errno));
|
||||
|
||||
do {
|
||||
int afd = accept(fd, NULL, NULL);
|
||||
char buf;
|
||||
|
@ -205,21 +193,17 @@ static void cmd_hold(int argc, char *argv[])
|
|||
die("accept(): %s\n", strerror(errno));
|
||||
|
||||
rc = write(afd, &info, sizeof(info));
|
||||
if (rc < 0) {
|
||||
err("holder write() to control socket: %s\n",
|
||||
strerror(errno));
|
||||
}
|
||||
if (rc < 0)
|
||||
die("write(): %s\n", strerror(errno));
|
||||
if ((size_t)rc < sizeof(info))
|
||||
err("holder short write() on control socket\n");
|
||||
die("short write() on control socket\n");
|
||||
|
||||
rc = read(afd, &buf, sizeof(buf));
|
||||
if (rc < 0) {
|
||||
err("holder read() on control socket: %s\n",
|
||||
strerror(errno));
|
||||
}
|
||||
if (rc < 0)
|
||||
die("read(): %s\n", strerror(errno));
|
||||
|
||||
close(afd);
|
||||
} while (rc <= 0);
|
||||
} while (rc == 0);
|
||||
|
||||
unlink(sockpath);
|
||||
}
|
||||
|
@ -361,43 +345,21 @@ static int openns(const char *fmt, ...)
|
|||
return fd;
|
||||
}
|
||||
|
||||
static pid_t sig_pid;
|
||||
static void sig_propagate(int signum)
|
||||
{
|
||||
int err;
|
||||
|
||||
err = kill(sig_pid, signum);
|
||||
if (err)
|
||||
die("Propagating %s: %s\n", strsignal(signum), strerror(errno));
|
||||
}
|
||||
|
||||
static void wait_for_child(pid_t pid)
|
||||
{
|
||||
struct sigaction sa = {
|
||||
.sa_handler = sig_propagate,
|
||||
.sa_flags = SA_RESETHAND,
|
||||
};
|
||||
int status, err;
|
||||
|
||||
sig_pid = pid;
|
||||
err = sigaction(SIGTERM, &sa, NULL);
|
||||
if (err)
|
||||
die("sigaction(SIGTERM): %s\n", strerror(errno));
|
||||
int status;
|
||||
|
||||
/* Match the child's exit status, if possible */
|
||||
for (;;) {
|
||||
pid_t rc;
|
||||
|
||||
rc = waitpid(pid, &status, WUNTRACED);
|
||||
if (rc < 0) {
|
||||
if (errno == EINTR)
|
||||
continue;
|
||||
if (rc < 0)
|
||||
die("waitpid() on %d: %s\n", pid, strerror(errno));
|
||||
}
|
||||
if (rc != pid)
|
||||
die("waitpid() on %d returned %d", pid, rc);
|
||||
if (WIFSTOPPED(status)) {
|
||||
/* Stop the parent to match */
|
||||
/* Stop the parent to patch */
|
||||
kill(getpid(), SIGSTOP);
|
||||
/* We must have resumed, resume the child */
|
||||
kill(pid, SIGCONT);
|
||||
|
@ -546,7 +508,7 @@ static void cmd_exec(int argc, char *argv[])
|
|||
/* CHILD */
|
||||
if (argc > optind + 1) {
|
||||
exe = argv[optind + 1];
|
||||
xargs = (const char *const *)(argv + optind + 1);
|
||||
xargs = (const char * const*)(argv + optind + 1);
|
||||
} else {
|
||||
exe = getenv("SHELL");
|
||||
if (!exe)
|
||||
|
|
|
@ -15,14 +15,6 @@ PROGS="${PROGS:-ash,dash,bash ip mount ls insmod mkdir ln cat chmod lsmod
|
|||
sed tr chown sipcalc cut socat dd strace ping tail killall sleep sysctl
|
||||
nproc tcp_rr tcp_crr udp_rr which tee seq bc sshd ssh-keygen cmp}"
|
||||
|
||||
# OpenSSH 9.8 introduced split binaries, with sshd being the daemon, and
|
||||
# sshd-session the per-session program. We need the latter as well, and the path
|
||||
# depends on the distribution. It doesn't exist on older versions.
|
||||
for bin in /usr/lib/openssh/sshd-session /usr/lib/ssh/sshd-session \
|
||||
/usr/libexec/openssh/sshd-session; do
|
||||
command -v "${bin}" >/dev/null && PROGS="${PROGS} ${bin}"
|
||||
done
|
||||
|
||||
KMODS="${KMODS:- virtio_net virtio_pci vmw_vsock_virtio_transport}"
|
||||
|
||||
LINKS="${LINKS:-
|
||||
|
@ -62,7 +54,7 @@ EOF
|
|||
ln -s /run /var/run
|
||||
:> /etc/fstab
|
||||
|
||||
# sshd via vsock
|
||||
# sshd(dropbear) via vsock
|
||||
cat > /etc/passwd << EOF
|
||||
root:x:0:0:root:/root:/bin/sh
|
||||
sshd:x:100:100:Privilege-separated SSH:/var/empty/sshd:/sbin/nologin
|
||||
|
@ -72,9 +64,7 @@ root:::0:99999:7:::
|
|||
EOF
|
||||
chmod 000 /etc/shadow
|
||||
|
||||
cat > /etc/ssh/sshd_config << EOF
|
||||
Subsystem sftp internal-sftp
|
||||
EOF
|
||||
:> /etc/ssh/sshd_config
|
||||
ssh-keygen -A
|
||||
chmod 700 /root/.ssh
|
||||
chmod 700 /run/sshd
|
||||
|
@ -86,7 +76,7 @@ EOF
|
|||
EOF
|
||||
chmod 600 /root/.ssh/authorized_keys
|
||||
chmod 700 /root
|
||||
socat VSOCK-LISTEN:22,fork EXEC:"/sbin/sshd -i -e" 2> /var/log/vsock-ssh.log &
|
||||
socat VSOCK-LISTEN:22,fork EXEC:"sshd -i -e" 2> /var/log/vsock-ssh.log &
|
||||
sh +m
|
||||
'
|
||||
|
||||
|
|
|
@ -12,7 +12,7 @@
|
|||
|
||||
PROGS="${PROGS:-ash,dash,bash chmod ip mount insmod mkdir ln cat chmod modprobe
|
||||
grep mknod sed chown sleep bc ls ps mount unshare chroot cp kill diff
|
||||
head tail sort tr tee cut nm which switch_root}"
|
||||
head tail sort tr tee cut nm which}"
|
||||
|
||||
KMODS="${KMODS:- dummy}"
|
||||
|
||||
|
@ -29,6 +29,13 @@ COPIES="${COPIES} ../passt.avx2,/bin/passt.avx2"
|
|||
FIXUP="${FIXUP}"'
|
||||
ln -s /bin /usr/bin
|
||||
chmod 777 /tmp
|
||||
ip link add eth0 type dummy
|
||||
ip link set eth0 up
|
||||
ip address add 192.0.2.2/24 dev eth0
|
||||
ip address add 2001:db8::2/64 dev eth0
|
||||
ip route add default via 192.0.2.1
|
||||
ip -6 route add default via 2001:db8::1 dev eth0
|
||||
sleep 2
|
||||
sh +m
|
||||
'
|
||||
|
||||
|
|
|
@ -38,7 +38,7 @@ check [ __MTU__ = 65520 ]
|
|||
test DHCP: DNS
|
||||
gout DNS sed -n 's/^nameserver \([0-9]*\.\)\(.*\)/\1\2/p' /etc/resolv.conf | tr '\n' ',' | sed 's/,$//;s/$/\n/'
|
||||
hout HOST_DNS sed -n 's/^nameserver \([0-9]*\.\)\(.*\)/\1\2/p' /etc/resolv.conf | head -n3 | tr '\n' ',' | sed 's/,$//;s/$/\n/'
|
||||
check [ "__DNS__" = "__HOST_DNS__" ] || ( [ "__DNS__" = "__HOST_GW__" ] && expr "__HOST_DNS__" : "127[.]" )
|
||||
check [ "__DNS__" = "__HOST_DNS__" ] || [ "__DNS__" = "__HOST_GW__" -a "__HOST_DNS__" = "127.0.0.1" ]
|
||||
|
||||
# FQDNs should be terminated by dots, but the guest DHCP client might omit them:
|
||||
# strip them first
|
||||
|
@ -49,10 +49,8 @@ check [ "__SEARCH__" = "__HOST_SEARCH__" ]
|
|||
|
||||
test DHCPv6: address
|
||||
guest /sbin/dhclient -6 __IFNAME__
|
||||
# Wait for DAD to complete
|
||||
guest while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done
|
||||
gout ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__IFNAME__").addr_info[] | select(.prefixlen == 128).local] | .[0]'
|
||||
hout HOST_ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__HOST_IFNAME6__").addr_info[] | select(.scope == "global" and .deprecated != true).local] | .[0]'
|
||||
hout HOST_ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__HOST_IFNAME6__").addr_info[] | select(.scope == "global").local] | .[0]'
|
||||
check [ "__ADDR6__" = "__HOST_ADDR6__" ]
|
||||
|
||||
test DHCPv6: route
|
||||
|
|
|
@ -16,16 +16,14 @@ htools ip jq sipcalc grep cut
|
|||
|
||||
test Interface name
|
||||
gout IFNAME ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname'
|
||||
guest ip link set dev __IFNAME__ up
|
||||
# Wait for DAD to complete
|
||||
guest while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done
|
||||
guest ip link set dev __IFNAME__ up && sleep 2
|
||||
hout HOST_IFNAME6 ip -j -6 route show|jq -rM '[.[] | select(.dst == "default").dev] | .[0]'
|
||||
check [ -n "__IFNAME__" ]
|
||||
|
||||
test SLAAC: prefix
|
||||
gout ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__IFNAME__").addr_info[] | select(.scope == "global" and .protocol == "kernel_ra") | .local + "/" + (.prefixlen | tostring)] | .[0]'
|
||||
gout PREFIX6 sipcalc __ADDR6__ | grep prefix | cut -d' ' -f4
|
||||
hout HOST_ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__HOST_IFNAME6__").addr_info[] | select(.scope == "global" and .deprecated != true).local] | .[0]'
|
||||
gout ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__IFNAME__").addr_info[] | select(.scope == "global" and .prefixlen == 64).local] | .[0]'
|
||||
gout PREFIX6 sipcalc __ADDR6__/64 | grep prefix | cut -d' ' -f4
|
||||
hout HOST_ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__HOST_IFNAME6__").addr_info[] | select(.scope == "global").local] | .[0]'
|
||||
hout HOST_PREFIX6 sipcalc __HOST_ADDR6__/64 | grep prefix | cut -d' ' -f4
|
||||
check [ "__PREFIX6__" = "__HOST_PREFIX6__" ]
|
||||
|
||||
|
|
|
@ -1,75 +0,0 @@
|
|||
# SPDX-License-Identifier: GPL-2.0-or-later
|
||||
#
|
||||
# PASST - Plug A Simple Socket Transport
|
||||
# for qemu/UNIX domain socket mode
|
||||
#
|
||||
# PASTA - Pack A Subtle Tap Abstraction
|
||||
# for network namespace/tap device mode
|
||||
#
|
||||
# test/passt/dhcp - Check DHCP and DHCPv6 functionality in passt mode
|
||||
#
|
||||
# Copyright (c) 2021 Red Hat GmbH
|
||||
# Author: Stefano Brivio <sbrivio@redhat.com>
|
||||
|
||||
gtools ip jq dhclient sed tr
|
||||
htools ip jq sed tr head
|
||||
|
||||
set MAP_NS4 192.0.2.2
|
||||
set MAP_NS6 2001:db8:9a55::2
|
||||
|
||||
test Interface name
|
||||
gout IFNAME ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname'
|
||||
hout HOST_IFNAME ip -j -4 route show|jq -rM '[.[] | select(.dst == "default").dev] | .[0]'
|
||||
hout HOST_IFNAME6 ip -j -6 route show|jq -rM '[.[] | select(.dst == "default").dev] | .[0]'
|
||||
check [ -n "__IFNAME__" ]
|
||||
|
||||
test DHCP: address
|
||||
guest /sbin/dhclient -4 __IFNAME__
|
||||
gout ADDR ip -j -4 addr show|jq -rM '.[] | select(.ifname == "__IFNAME__").addr_info[0].local'
|
||||
hout HOST_ADDR ip -j -4 addr show|jq -rM '.[] | select(.ifname == "__HOST_IFNAME__").addr_info[0].local'
|
||||
check [ "__ADDR__" = "__HOST_ADDR__" ]
|
||||
|
||||
test DHCP: route
|
||||
gout GW ip -j -4 route show|jq -rM '.[] | select(.dst == "default").gateway'
|
||||
hout HOST_GW ip -j -4 route show|jq -rM '[.[] | select(.dst == "default").gateway] | .[0]'
|
||||
check [ "__GW__" = "__HOST_GW__" ]
|
||||
|
||||
test DHCP: MTU
|
||||
gout MTU ip -j link show | jq -rM '.[] | select(.ifname == "__IFNAME__").mtu'
|
||||
check [ __MTU__ = 65520 ]
|
||||
|
||||
test DHCP: DNS
|
||||
gout DNS sed -n 's/^nameserver \([0-9]*\.\)\(.*\)/\1\2/p' /etc/resolv.conf | tr '\n' ',' | sed 's/,$//;s/$/\n/'
|
||||
hout HOST_DNS sed -n 's/^nameserver \([0-9]*\.\)\(.*\)/\1\2/p' /etc/resolv.conf | head -n3 | tr '\n' ',' | sed 's/,$//;s/$/\n/'
|
||||
check [ "__DNS__" = "__HOST_DNS__" ] || ( [ "__DNS__" = "__MAP_NS4__" ] && expr "__HOST_DNS__" : "127[.]" )
|
||||
|
||||
# FQDNs should be terminated by dots, but the guest DHCP client might omit them:
|
||||
# strip them first
|
||||
test DHCP: search list
|
||||
gout SEARCH sed 's/\. / /g' /etc/resolv.conf | sed 's/\.$//g' | sed -n 's/^search \(.*\)/\1/p' | tr ' \n' ',' | sed 's/,$//;s/$/\n/'
|
||||
hout HOST_SEARCH sed 's/\. / /g' /etc/resolv.conf | sed 's/\.$//g' | sed -n 's/^search \(.*\)/\1/p' | tr ' \n' ',' | sed 's/,$//;s/$/\n/'
|
||||
check [ "__SEARCH__" = "__HOST_SEARCH__" ]
|
||||
|
||||
test DHCPv6: address
|
||||
guest /sbin/dhclient -6 __IFNAME__
|
||||
# Wait for DAD to complete
|
||||
guest while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done
|
||||
gout ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__IFNAME__").addr_info[] | select(.prefixlen == 128).local] | .[0]'
|
||||
hout HOST_ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__HOST_IFNAME6__").addr_info[] | select(.scope == "global" and .deprecated != true).local] | .[0]'
|
||||
check [ "__ADDR6__" = "__HOST_ADDR6__" ]
|
||||
|
||||
test DHCPv6: route
|
||||
gout GW6 ip -j -6 route show|jq -rM '.[] | select(.dst == "default").gateway'
|
||||
hout HOST_GW6 ip -j -6 route show|jq -rM '[.[] | select(.dst == "default").gateway] | .[0]'
|
||||
check [ "__GW6__" = "__HOST_GW6__" ]
|
||||
|
||||
# Strip interface specifier: interface names might differ between host and guest
|
||||
test DHCPv6: DNS
|
||||
gout DNS6 sed -n 's/^nameserver \([^:]*:\)\([^%]*\).*/\1\2/p' /etc/resolv.conf | tr '\n' ',' | sed 's/,$//;s/$/\n/'
|
||||
hout HOST_DNS6 sed -n 's/^nameserver \([^:]*:\)\([^%]*\).*/\1\2/p' /etc/resolv.conf | tr '\n' ',' | sed 's/,$//;s/$/\n/'
|
||||
check [ "__DNS6__" = "__HOST_DNS6__" ] || [ "__DNS6__" = "__MAP_NS6__" -a "__HOST_DNS6__" = "::1" ]
|
||||
|
||||
test DHCPv6: search list
|
||||
gout SEARCH6 sed 's/\. / /g' /etc/resolv.conf | sed 's/\.$//g' | sed -n 's/^search \(.*\)/\1/p' | tr ' \n' ',' | sed 's/,$//;s/$/\n/'
|
||||
hout HOST_SEARCH6 sed 's/\. / /g' /etc/resolv.conf | sed 's/\.$//g' | sed -n 's/^search \(.*\)/\1/p' | tr ' \n' ',' | sed 's/,$//;s/$/\n/'
|
||||
check [ "__SEARCH6__" = "__HOST_SEARCH6__" ]
|
|
@ -15,11 +15,6 @@ gtools socat ip jq
|
|||
htools socat ip jq
|
||||
nstools socat ip jq
|
||||
|
||||
set MAP_HOST4 192.0.2.1
|
||||
set MAP_HOST6 2001:db8:9a55::1
|
||||
set MAP_NS4 192.0.2.2
|
||||
set MAP_NS6 2001:db8:9a55::2
|
||||
|
||||
set TEMP_BIG __STATEDIR__/test_big.bin
|
||||
set TEMP_SMALL __STATEDIR__/test_small.bin
|
||||
set TEMP_NS_BIG __STATEDIR__/test_ns_big.bin
|
||||
|
@ -32,7 +27,7 @@ host socat -u OPEN:__BASEPATH__/big.bin TCP4:127.0.0.1:10001
|
|||
guestw
|
||||
guest cmp test_big.bin /root/big.bin
|
||||
|
||||
test TCP/IPv4: host to ns (spliced): big transfer
|
||||
test TCP/IPv4: host to ns: big transfer
|
||||
nsb socat -u TCP4-LISTEN:10002 OPEN:__TEMP_NS_BIG__,create,trunc
|
||||
sleep 1
|
||||
host socat -u OPEN:__BASEPATH__/big.bin TCP4:127.0.0.1:10002
|
||||
|
@ -41,15 +36,16 @@ check cmp __TEMP_NS_BIG__ __BASEPATH__/big.bin
|
|||
|
||||
test TCP/IPv4: guest to host: big transfer
|
||||
hostb socat -u TCP4-LISTEN:10003 OPEN:__TEMP_BIG__,create,trunc
|
||||
gout GW ip -j -4 route show|jq -rM '.[] | select(.dst == "default").gateway'
|
||||
sleep 1
|
||||
guest socat -u OPEN:/root/big.bin TCP4:__MAP_HOST4__:10003
|
||||
guest socat -u OPEN:/root/big.bin TCP4:__GW__:10003
|
||||
hostw
|
||||
check cmp __TEMP_BIG__ __BASEPATH__/big.bin
|
||||
|
||||
test TCP/IPv4: guest to ns: big transfer
|
||||
nsb socat -u TCP4-LISTEN:10002 OPEN:__TEMP_NS_BIG__,create,trunc
|
||||
sleep 1
|
||||
guest socat -u OPEN:/root/big.bin TCP4:__MAP_NS4__:10002
|
||||
guest socat -u OPEN:/root/big.bin TCP4:__GW__:10002
|
||||
nsw
|
||||
check cmp __TEMP_NS_BIG__ __BASEPATH__/big.bin
|
||||
|
||||
|
@ -63,7 +59,7 @@ check cmp __TEMP_BIG__ __BASEPATH__/big.bin
|
|||
test TCP/IPv4: ns to host (via tap): big transfer
|
||||
hostb socat -u TCP4-LISTEN:10003 OPEN:__TEMP_BIG__,create,trunc
|
||||
sleep 1
|
||||
ns socat -u OPEN:__BASEPATH__/big.bin TCP4:__MAP_HOST4__:10003
|
||||
ns socat -u OPEN:__BASEPATH__/big.bin TCP4:__GW__:10003
|
||||
hostw
|
||||
check cmp __TEMP_BIG__ __BASEPATH__/big.bin
|
||||
|
||||
|
@ -90,7 +86,7 @@ host socat -u OPEN:__BASEPATH__/small.bin TCP4:127.0.0.1:10001
|
|||
guestw
|
||||
guest cmp test_small.bin /root/small.bin
|
||||
|
||||
test TCP/IPv4: host to ns (spliced): small transfer
|
||||
test TCP/IPv4: host to ns: small transfer
|
||||
nsb socat -u TCP4-LISTEN:10002 OPEN:__TEMP_NS_SMALL__,create,trunc
|
||||
sleep 1
|
||||
host socat -u OPEN:__BASEPATH__/small.bin TCP4:127.0.0.1:10002
|
||||
|
@ -99,15 +95,16 @@ check cmp __TEMP_NS_SMALL__ __BASEPATH__/small.bin
|
|||
|
||||
test TCP/IPv4: guest to host: small transfer
|
||||
hostb socat -u TCP4-LISTEN:10003 OPEN:__TEMP_SMALL__,create,trunc
|
||||
gout GW ip -j -4 route show|jq -rM '.[] | select(.dst == "default").gateway'
|
||||
sleep 1
|
||||
guest socat -u OPEN:/root/small.bin TCP4:__MAP_HOST4__:10003
|
||||
guest socat -u OPEN:/root/small.bin TCP4:__GW__:10003
|
||||
hostw
|
||||
check cmp __TEMP_SMALL__ __BASEPATH__/small.bin
|
||||
|
||||
test TCP/IPv4: guest to ns: small transfer
|
||||
nsb socat -u TCP4-LISTEN:10002 OPEN:__TEMP_NS_SMALL__,create,trunc
|
||||
sleep 1
|
||||
guest socat -u OPEN:/root/small.bin TCP4:__MAP_NS4__:10002
|
||||
guest socat -u OPEN:/root/small.bin TCP4:__GW__:10002
|
||||
nsw
|
||||
check cmp __TEMP_NS_SMALL__ __BASEPATH__/small.bin
|
||||
|
||||
|
@ -121,7 +118,7 @@ check cmp __TEMP_SMALL__ __BASEPATH__/small.bin
|
|||
test TCP/IPv4: ns to host (via tap): small transfer
|
||||
hostb socat -u TCP4-LISTEN:10003 OPEN:__TEMP_SMALL__,create,trunc
|
||||
sleep 1
|
||||
ns socat -u OPEN:__BASEPATH__/small.bin TCP4:__MAP_HOST4__:10003
|
||||
ns socat -u OPEN:__BASEPATH__/small.bin TCP4:__GW__:10003
|
||||
hostw
|
||||
check cmp __TEMP_SMALL__ __BASEPATH__/small.bin
|
||||
|
||||
|
@ -146,7 +143,7 @@ host socat -u OPEN:__BASEPATH__/big.bin TCP6:[::1]:10001
|
|||
guestw
|
||||
guest cmp test_big.bin /root/big.bin
|
||||
|
||||
test TCP/IPv6: host to ns (spliced): big transfer
|
||||
test TCP/IPv6: host to ns: big transfer
|
||||
nsb socat -u TCP6-LISTEN:10002 OPEN:__TEMP_NS_BIG__,create,trunc
|
||||
sleep 1
|
||||
host socat -u OPEN:__BASEPATH__/big.bin TCP6:[::1]:10002
|
||||
|
@ -155,15 +152,17 @@ check cmp __TEMP_NS_BIG__ __BASEPATH__/big.bin
|
|||
|
||||
test TCP/IPv6: guest to host: big transfer
|
||||
hostb socat -u TCP6-LISTEN:10003 OPEN:__TEMP_BIG__,create,trunc
|
||||
gout GW6 ip -j -6 route show|jq -rM '.[] | select(.dst == "default").gateway'
|
||||
gout IFNAME ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname'
|
||||
sleep 1
|
||||
guest socat -u OPEN:/root/big.bin TCP6:[__MAP_HOST6__]:10003
|
||||
guest socat -u OPEN:/root/big.bin TCP6:[__GW6__%__IFNAME__]:10003
|
||||
hostw
|
||||
check cmp __TEMP_BIG__ __BASEPATH__/big.bin
|
||||
|
||||
test TCP/IPv6: guest to ns: big transfer
|
||||
nsb socat -u TCP6-LISTEN:10002 OPEN:__TEMP_NS_BIG__,create,trunc
|
||||
sleep 1
|
||||
guest socat -u OPEN:/root/big.bin TCP6:[__MAP_NS6__]:10002
|
||||
guest socat -u OPEN:/root/big.bin TCP6:[__GW6__%__IFNAME__]:10002
|
||||
nsw
|
||||
check cmp __TEMP_NS_BIG__ __BASEPATH__/big.bin
|
||||
|
||||
|
@ -176,8 +175,9 @@ check cmp __TEMP_BIG__ __BASEPATH__/big.bin
|
|||
|
||||
test TCP/IPv6: ns to host (via tap): big transfer
|
||||
hostb socat -u TCP6-LISTEN:10003 OPEN:__TEMP_BIG__,create,trunc
|
||||
nsout IFNAME ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname'
|
||||
sleep 1
|
||||
ns socat -u OPEN:__BASEPATH__/big.bin TCP6:[__MAP_HOST6__]:10003
|
||||
ns socat -u OPEN:__BASEPATH__/big.bin TCP6:[__GW6__%__IFNAME__]:10003
|
||||
hostw
|
||||
check cmp __TEMP_BIG__ __BASEPATH__/big.bin
|
||||
|
||||
|
@ -190,7 +190,6 @@ guest cmp test_big.bin /root/big.bin
|
|||
|
||||
test TCP/IPv6: ns to guest (using namespace address): big transfer
|
||||
guestb socat -u TCP6-LISTEN:10001 OPEN:test_big.bin,create,trunc
|
||||
nsout IFNAME ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname'
|
||||
nsout ADDR6 ip -j -6 addr show|jq -rM '.[] | select(.ifname == "__IFNAME__").addr_info[0].local'
|
||||
sleep 1
|
||||
ns socat -u OPEN:__BASEPATH__/big.bin TCP6:[__ADDR6__]:10001
|
||||
|
@ -204,7 +203,7 @@ host socat -u OPEN:__BASEPATH__/small.bin TCP6:[::1]:10001
|
|||
guestw
|
||||
guest cmp test_small.bin /root/small.bin
|
||||
|
||||
test TCP/IPv6: host to ns (spliced): small transfer
|
||||
test TCP/IPv6: host to ns: small transfer
|
||||
nsb socat -u TCP6-LISTEN:10002 OPEN:__TEMP_NS_SMALL__,create,trunc
|
||||
sleep 1
|
||||
host socat -u OPEN:__BASEPATH__/small.bin TCP6:[::1]:10002
|
||||
|
@ -213,15 +212,17 @@ check cmp __TEMP_NS_SMALL__ __BASEPATH__/small.bin
|
|||
|
||||
test TCP/IPv6: guest to host: small transfer
|
||||
hostb socat -u TCP6-LISTEN:10003 OPEN:__TEMP_SMALL__,create,trunc
|
||||
gout GW6 ip -j -6 route show|jq -rM '.[] | select(.dst == "default").gateway'
|
||||
gout IFNAME ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname'
|
||||
sleep 1
|
||||
guest socat -u OPEN:/root/small.bin TCP6:[__MAP_HOST6__]:10003
|
||||
guest socat -u OPEN:/root/small.bin TCP6:[__GW6__%__IFNAME__]:10003
|
||||
hostw
|
||||
check cmp __TEMP_SMALL__ __BASEPATH__/small.bin
|
||||
|
||||
test TCP/IPv6: guest to ns: small transfer
|
||||
nsb socat -u TCP6-LISTEN:10002 OPEN:__TEMP_NS_SMALL__
|
||||
sleep 1
|
||||
guest socat -u OPEN:/root/small.bin TCP6:[__MAP_NS6__]:10002
|
||||
guest socat -u OPEN:/root/small.bin TCP6:[__GW6__%__IFNAME__]:10002
|
||||
nsw
|
||||
check cmp __TEMP_NS_SMALL__ __BASEPATH__/small.bin
|
||||
|
||||
|
@ -234,8 +235,9 @@ check cmp __TEMP_SMALL__ __BASEPATH__/small.bin
|
|||
|
||||
test TCP/IPv6: ns to host (via tap): small transfer
|
||||
hostb socat -u TCP6-LISTEN:10003 OPEN:__TEMP_SMALL__,create,trunc
|
||||
nsout IFNAME ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname'
|
||||
sleep 1
|
||||
ns socat -u OPEN:__BASEPATH__/small.bin TCP6:[__MAP_HOST6__]:10003
|
||||
ns socat -u OPEN:__BASEPATH__/small.bin TCP6:[__GW6__%__IFNAME__]:10003
|
||||
hostw
|
||||
check cmp __TEMP_SMALL__ __BASEPATH__/small.bin
|
||||
|
||||
|
|
|
@ -15,11 +15,6 @@ gtools socat ip jq
|
|||
nstools socat ip jq
|
||||
htools socat ip jq
|
||||
|
||||
set MAP_HOST4 192.0.2.1
|
||||
set MAP_HOST6 2001:db8:9a55::1
|
||||
set MAP_NS4 192.0.2.2
|
||||
set MAP_NS6 2001:db8:9a55::2
|
||||
|
||||
set TEMP __STATEDIR__/test.bin
|
||||
set TEMP_NS __STATEDIR__/test_ns.bin
|
||||
|
||||
|
@ -30,7 +25,7 @@ host socat -u OPEN:__BASEPATH__/medium.bin UDP4:127.0.0.1:10001,shut-null
|
|||
guestw
|
||||
guest cmp test.bin /root/medium.bin
|
||||
|
||||
test UDP/IPv4: host to ns (recvmmsg/sendmmsg)
|
||||
test UDP/IPv4: host to ns
|
||||
nsb socat -u UDP4-LISTEN:10002,null-eof OPEN:__TEMP_NS__,create,trunc
|
||||
sleep 1
|
||||
host socat -u OPEN:__BASEPATH__/medium.bin UDP4:127.0.0.1:10002,shut-null
|
||||
|
@ -39,15 +34,16 @@ check cmp __TEMP_NS__ __BASEPATH__/medium.bin
|
|||
|
||||
test UDP/IPv4: guest to host
|
||||
hostb socat -u UDP4-LISTEN:10003,null-eof OPEN:__TEMP__,create,trunc
|
||||
gout GW ip -j -4 route show|jq -rM '.[] | select(.dst == "default").gateway'
|
||||
sleep 1
|
||||
guest socat -u OPEN:/root/medium.bin UDP4:__MAP_HOST4__:10003,shut-null
|
||||
guest socat -u OPEN:/root/medium.bin UDP4:__GW__:10003,shut-null
|
||||
hostw
|
||||
check cmp __TEMP__ __BASEPATH__/medium.bin
|
||||
|
||||
test UDP/IPv4: guest to ns
|
||||
nsb socat -u UDP4-LISTEN:10002,null-eof OPEN:__TEMP_NS__,create,trunc
|
||||
sleep 1
|
||||
guest socat -u OPEN:/root/medium.bin UDP4:__MAP_NS4__:10002,shut-null
|
||||
guest socat -u OPEN:/root/medium.bin UDP4:__GW__:10002,shut-null
|
||||
nsw
|
||||
check cmp __TEMP_NS__ __BASEPATH__/medium.bin
|
||||
|
||||
|
@ -61,7 +57,7 @@ check cmp __TEMP__ __BASEPATH__/medium.bin
|
|||
test UDP/IPv4: ns to host (via tap)
|
||||
hostb socat -u UDP4-LISTEN:10003,null-eof OPEN:__TEMP__,create,trunc
|
||||
sleep 1
|
||||
ns socat -u OPEN:__BASEPATH__/medium.bin UDP4:__MAP_HOST4__:10003,shut-null
|
||||
ns socat -u OPEN:__BASEPATH__/medium.bin UDP4:__GW__:10003,shut-null
|
||||
hostw
|
||||
check cmp __TEMP__ __BASEPATH__/medium.bin
|
||||
|
||||
|
@ -88,7 +84,7 @@ host socat -u OPEN:__BASEPATH__/medium.bin UDP6:[::1]:10001,shut-null
|
|||
guestw
|
||||
guest cmp test.bin /root/medium.bin
|
||||
|
||||
test UDP/IPv6: host to ns (recvmmsg/sendmmsg)
|
||||
test UDP/IPv6: host to ns
|
||||
nsb socat -u UDP6-LISTEN:10002,null-eof OPEN:__TEMP_NS__,create,trunc
|
||||
sleep 1
|
||||
host socat -u OPEN:__BASEPATH__/medium.bin UDP6:[::1]:10002,shut-null
|
||||
|
@ -97,15 +93,17 @@ check cmp __TEMP_NS__ __BASEPATH__/medium.bin
|
|||
|
||||
test UDP/IPv6: guest to host
|
||||
hostb socat -u UDP6-LISTEN:10003,null-eof OPEN:__TEMP__,create,trunc
|
||||
gout GW6 ip -j -6 route show|jq -rM '.[] | select(.dst == "default").gateway'
|
||||
gout IFNAME ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname'
|
||||
sleep 1
|
||||
guest socat -u OPEN:/root/medium.bin UDP6:[__MAP_HOST6__]:10003,shut-null
|
||||
guest socat -u OPEN:/root/medium.bin UDP6:[__GW6__%__IFNAME__]:10003,shut-null
|
||||
hostw
|
||||
check cmp __TEMP__ __BASEPATH__/medium.bin
|
||||
|
||||
test UDP/IPv6: guest to ns
|
||||
nsb socat -u UDP6-LISTEN:10002,null-eof OPEN:__TEMP_NS__,create,trunc
|
||||
sleep 1
|
||||
guest socat -u OPEN:/root/medium.bin UDP6:[__MAP_NS6__]:10002,shut-null
|
||||
guest socat -u OPEN:/root/medium.bin UDP6:[__GW6__%__IFNAME__]:10002,shut-null
|
||||
nsw
|
||||
check cmp __TEMP_NS__ __BASEPATH__/medium.bin
|
||||
|
||||
|
@ -118,8 +116,9 @@ check cmp __TEMP__ __BASEPATH__/medium.bin
|
|||
|
||||
test UDP/IPv6: ns to host (via tap)
|
||||
hostb socat -u UDP6-LISTEN:10003,null-eof OPEN:__TEMP__,create,trunc
|
||||
nsout IFNAME ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname'
|
||||
sleep 1
|
||||
ns socat -u OPEN:__BASEPATH__/medium.bin UDP6:[__MAP_HOST6__]:10003,shut-null
|
||||
ns socat -u OPEN:__BASEPATH__/medium.bin UDP6:[__GW6__%__IFNAME__]:10003,shut-null
|
||||
hostw
|
||||
check cmp __TEMP__ __BASEPATH__/medium.bin
|
||||
|
||||
|
@ -132,7 +131,6 @@ guest cmp test.bin /root/medium.bin
|
|||
|
||||
test UDP/IPv6: ns to guest (using namespace address)
|
||||
guestb socat -u UDP6-LISTEN:10001,null-eof OPEN:test.bin,create,trunc
|
||||
nsout IFNAME ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname'
|
||||
nsout ADDR6 ip -j -6 addr show|jq -rM '.[] | select(.ifname == "__IFNAME__").addr_info[0].local'
|
||||
sleep 1
|
||||
ns socat -u OPEN:__BASEPATH__/medium.bin UDP6:[__ADDR6__]:10001,shut-null
|
||||
|
|
|
@ -35,11 +35,9 @@ check [ __MTU__ = 65520 ]
|
|||
|
||||
test DHCPv6: address
|
||||
ns /sbin/dhclient -6 --no-pid __IFNAME__
|
||||
# Wait for DAD to complete
|
||||
ns while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done
|
||||
hout HOST_IFNAME6 ip -j -6 route show|jq -rM '[.[] | select(.dst == "default").dev] | .[0]'
|
||||
nsout ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__IFNAME__").addr_info[] | select(.prefixlen == 128).local] | .[0]'
|
||||
hout HOST_ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__HOST_IFNAME6__").addr_info[] | select(.scope == "global" and .deprecated != true).local] | .[0]'
|
||||
hout HOST_ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__HOST_IFNAME6__").addr_info[] | select(.scope == "global").local] | .[0]'
|
||||
check [ __ADDR6__ = __HOST_ADDR6__ ]
|
||||
|
||||
test DHCPv6: route
|
||||
|
|
|
@ -18,13 +18,12 @@ test Interface name
|
|||
nsout IFNAME ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname'
|
||||
check [ -n "__IFNAME__" ]
|
||||
ns ip link set dev __IFNAME__ up
|
||||
# Wait for DAD to complete
|
||||
ns while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done
|
||||
sleep 2
|
||||
|
||||
test SLAAC: prefix
|
||||
nsout ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__IFNAME__").addr_info[] | select(.scope == "global" and .protocol == "kernel_ra") | .local + "/" + (.prefixlen | tostring)] | .[0]'
|
||||
nsout PREFIX6 sipcalc __ADDR6__ | grep prefix | cut -d' ' -f4
|
||||
hout HOST_ADDR6 ip -j -6 addr show|jq -rM ['.[] | select(.ifname == "__IFNAME__").addr_info[] | select(.scope == "global" and .deprecated != true).local] | .[0]'
|
||||
nsout ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__IFNAME__").addr_info[] | select(.scope == "global" and .prefixlen == 64).local] | .[0]'
|
||||
nsout PREFIX6 sipcalc __ADDR6__/64 | grep prefix | cut -d' ' -f4
|
||||
hout HOST_ADDR6 ip -j -6 addr show|jq -rM ['.[] | select(.ifname == "__IFNAME__").addr_info[] | select(.scope == "global").local] | .[0]'
|
||||
hout HOST_PREFIX6 sipcalc __HOST_ADDR6__/64 | grep prefix | cut -d' ' -f4
|
||||
check [ "__PREFIX6__" = "__HOST_PREFIX6__" ]
|
||||
|
||||
|
|
|
@ -19,8 +19,8 @@ set TEMP_NS_BIG __STATEDIR__/test_ns_big.bin
|
|||
set TEMP_SMALL __STATEDIR__/test_small.bin
|
||||
set TEMP_NS_SMALL __STATEDIR__/test_ns_small.bin
|
||||
|
||||
test TCP/IPv4: host to ns (spliced): big transfer
|
||||
nsb socat -u TCP4-LISTEN:10002 OPEN:__TEMP_NS_BIG__,create,trunc
|
||||
test TCP/IPv4: host to ns: big transfer
|
||||
nsb socat -u TCP4-LISTEN:10002,bind=127.0.0.1 OPEN:__TEMP_NS_BIG__,create,trunc
|
||||
host socat -u OPEN:__BASEPATH__/big.bin TCP4:127.0.0.1:10002
|
||||
nsw
|
||||
check cmp __BASEPATH__/big.bin __TEMP_NS_BIG__
|
||||
|
@ -38,8 +38,8 @@ ns socat -u OPEN:__BASEPATH__/big.bin TCP4:__GW__:10003
|
|||
hostw
|
||||
check cmp __BASEPATH__/big.bin __TEMP_BIG__
|
||||
|
||||
test TCP/IPv4: host to ns (spliced): small transfer
|
||||
nsb socat -u TCP4-LISTEN:10002 OPEN:__TEMP_NS_SMALL__,create,trunc
|
||||
test TCP/IPv4: host to ns: small transfer
|
||||
nsb socat -u TCP4-LISTEN:10002,bind=127.0.0.1 OPEN:__TEMP_NS_SMALL__,create,trunc
|
||||
host socat OPEN:__BASEPATH__/small.bin TCP4:127.0.0.1:10002
|
||||
nsw
|
||||
check cmp __BASEPATH__/small.bin __TEMP_NS_SMALL__
|
||||
|
@ -57,8 +57,8 @@ ns socat -u OPEN:__BASEPATH__/small.bin TCP4:__GW__:10003
|
|||
hostw
|
||||
check cmp __BASEPATH__/small.bin __TEMP_SMALL__
|
||||
|
||||
test TCP/IPv6: host to ns (spliced): big transfer
|
||||
nsb socat -u TCP6-LISTEN:10002 OPEN:__TEMP_NS_BIG__,create,trunc
|
||||
test TCP/IPv6: host to ns: big transfer
|
||||
nsb socat -u TCP6-LISTEN:10002,bind=[::1] OPEN:__TEMP_NS_BIG__,create,trunc
|
||||
host socat -u OPEN:__BASEPATH__/big.bin TCP6:[::1]:10002
|
||||
nsw
|
||||
check cmp __BASEPATH__/big.bin __TEMP_NS_BIG__
|
||||
|
@ -77,8 +77,8 @@ ns socat -u OPEN:__BASEPATH__/big.bin TCP6:[__GW6__%__IFNAME__]:10003
|
|||
hostw
|
||||
check cmp __BASEPATH__/big.bin __TEMP_BIG__
|
||||
|
||||
test TCP/IPv6: host to ns (spliced): small transfer
|
||||
nsb socat -u TCP6-LISTEN:10002 OPEN:__TEMP_NS_SMALL__,create,trunc
|
||||
test TCP/IPv6: host to ns: small transfer
|
||||
nsb socat -u TCP6-LISTEN:10002,bind=[::1] OPEN:__TEMP_NS_SMALL__,create,trunc
|
||||
host socat -u OPEN:__BASEPATH__/small.bin TCP6:[::1]:10002
|
||||
nsw
|
||||
check cmp __BASEPATH__/small.bin __TEMP_NS_SMALL__
|
||||
|
|
|
@ -17,8 +17,8 @@ htools dd socat ip jq
|
|||
set TEMP __STATEDIR__/test.bin
|
||||
set TEMP_NS __STATEDIR__/test_ns.bin
|
||||
|
||||
test UDP/IPv4: host to ns (recvmmsg/sendmmsg)
|
||||
nsb socat -u UDP4-LISTEN:10002,null-eof OPEN:__TEMP_NS__,create,trunc
|
||||
test UDP/IPv4: host to ns
|
||||
nsb socat -u UDP4-LISTEN:10002,bind=127.0.0.1,null-eof OPEN:__TEMP_NS__,create,trunc
|
||||
host socat OPEN:__BASEPATH__/medium.bin UDP4:127.0.0.1:10002,shut-null
|
||||
nsw
|
||||
check cmp __BASEPATH__/medium.bin __TEMP_NS__
|
||||
|
@ -37,8 +37,8 @@ ns socat -u OPEN:__BASEPATH__/medium.bin UDP4:__GW__:10003,shut-null
|
|||
hostw
|
||||
check cmp __BASEPATH__/medium.bin __TEMP__
|
||||
|
||||
test UDP/IPv6: host to ns (recvmmsg/sendmmsg)
|
||||
nsb socat -u UDP6-LISTEN:10002,null-eof OPEN:__TEMP_NS__,create,trunc
|
||||
test UDP/IPv6: host to ns
|
||||
nsb socat -u UDP6-LISTEN:10002,bind=[::1],null-eof OPEN:__TEMP_NS__,create,trunc
|
||||
host socat -u OPEN:__BASEPATH__/medium.bin UDP6:[::1]:10002,shut-null
|
||||
nsw
|
||||
check cmp __BASEPATH__/medium.bin __TEMP_NS__
|
||||
|
|
|
@ -19,7 +19,7 @@ sleep 1
|
|||
endef
|
||||
|
||||
def flood_log_client
|
||||
host tcp_crr --nolog -l1 -P 10001 -C 10002 -6 -c -H ::1
|
||||
host tcp_crr --nolog -P 10001 -C 10002 -6 -c -H ::1
|
||||
endef
|
||||
|
||||
def check_log_size_mountns
|
||||
|
@ -33,16 +33,19 @@ test Log creation
|
|||
set PORTS -t 10001,10002 -u 10001,10002
|
||||
set LOG_FILE __STATEDIR__/pasta.log
|
||||
|
||||
passt ./pasta -l __LOG_FILE__ -- /bin/true
|
||||
passt ./pasta -l __LOG_FILE__
|
||||
passtb exit
|
||||
sleep 1
|
||||
check [ -s __LOG_FILE__ ]
|
||||
|
||||
test Log truncated on creation
|
||||
passt ./pasta -l __LOG_FILE__ -- /bin/true & wait
|
||||
pout PID2 echo $!
|
||||
check head -1 __LOG_FILE__ | grep '^pasta .* [(]__PID2__[)]$'
|
||||
passt ./pasta -l __LOG_FILE__
|
||||
passtb exit
|
||||
sleep 1
|
||||
check [ $(cat __LOG_FILE__ | wc -l) -eq 1 ]
|
||||
|
||||
test Maximum log size
|
||||
passtb ./pasta --config-net -d -f -l __LOG_FILE__ --log-size $((100 * 1024)) -- sh -c 'while true; do tcp_crr --nolog -l1 -P 10001 -C 10002 -6; done'
|
||||
passtb ./pasta --config-net -d -f -l __LOG_FILE__ --log-size $((100 * 1024)) -- sh -c 'while true; do tcp_crr --nolog -P 10001 -C 10002 -6; done'
|
||||
sleep 1
|
||||
|
||||
flood_log_client
|
||||
|
|
|
@ -11,16 +11,11 @@
|
|||
# Copyright (c) 2022 Red Hat GmbH
|
||||
# Author: Stefano Brivio <sbrivio@redhat.com>
|
||||
|
||||
htools git make go bats ip jq socat ./test/podman/bin/podman
|
||||
|
||||
set PODMAN test/podman/bin/podman
|
||||
hout WD pwd
|
||||
|
||||
test Podman pasta path
|
||||
|
||||
hout PASTA_BIN CONTAINERS_HELPER_BINARY_DIR="__WD__" __PODMAN__ info --format "{{.Host.Pasta.Executable}}"
|
||||
check [ "__PASTA_BIN__" = "__WD__/pasta" ]
|
||||
htools git make go bats catatonit ip jq socat
|
||||
|
||||
test Podman system test with bats
|
||||
|
||||
host PODMAN="__PODMAN__" CONTAINERS_HELPER_BINARY_DIR="__WD__" bats test/podman/test/system/505-networking-pasta.bats
|
||||
host git -C __STATEDIR__ clone https://github.com/containers/podman.git
|
||||
host make -C __STATEDIR__/podman
|
||||
hout WD pwd
|
||||
host PODMAN="__STATEDIR__/podman/bin/podman" CONTAINERS_HELPER_BINARY_DIR="__WD__" bats __STATEDIR__/podman/test/system/505-networking-pasta.bats
|
||||
|
|
Some files were not shown because too many files have changed in this diff Show more
Loading…
Reference in a new issue