Compare commits
4 commits
master
...
podman2368
Author | SHA1 | Date | |
---|---|---|---|
|
0c6c20dee5 | ||
|
d098e0527a | ||
|
026fb71d1d | ||
|
232e12529e |
64 changed files with 1186 additions and 1762 deletions
126
.clang-format
126
.clang-format
|
@ -1,126 +0,0 @@
|
|||
# SPDX-License-Identifier: GPL-2.0
|
||||
#
|
||||
# clang-format configuration file. Intended for clang-format >= 11.
|
||||
#
|
||||
# For more information, see:
|
||||
#
|
||||
# Documentation/dev-tools/clang-format.rst
|
||||
# https://clang.llvm.org/docs/ClangFormat.html
|
||||
# https://clang.llvm.org/docs/ClangFormatStyleOptions.html
|
||||
#
|
||||
---
|
||||
AccessModifierOffset: -4
|
||||
AlignAfterOpenBracket: Align
|
||||
AlignConsecutiveAssignments: false
|
||||
AlignConsecutiveDeclarations: false
|
||||
AlignEscapedNewlines: Left
|
||||
AlignOperands: true
|
||||
AlignTrailingComments: false
|
||||
AllowAllParametersOfDeclarationOnNextLine: false
|
||||
AllowShortBlocksOnASingleLine: false
|
||||
AllowShortCaseLabelsOnASingleLine: false
|
||||
AllowShortFunctionsOnASingleLine: None
|
||||
AllowShortIfStatementsOnASingleLine: false
|
||||
AllowShortLoopsOnASingleLine: false
|
||||
AlwaysBreakAfterDefinitionReturnType: None
|
||||
AlwaysBreakAfterReturnType: None
|
||||
AlwaysBreakBeforeMultilineStrings: false
|
||||
AlwaysBreakTemplateDeclarations: false
|
||||
BinPackArguments: true
|
||||
BinPackParameters: true
|
||||
BraceWrapping:
|
||||
AfterClass: false
|
||||
AfterControlStatement: false
|
||||
AfterEnum: false
|
||||
AfterFunction: true
|
||||
AfterNamespace: true
|
||||
AfterObjCDeclaration: false
|
||||
AfterStruct: false
|
||||
AfterUnion: false
|
||||
AfterExternBlock: false
|
||||
BeforeCatch: false
|
||||
BeforeElse: false
|
||||
IndentBraces: false
|
||||
SplitEmptyFunction: true
|
||||
SplitEmptyRecord: true
|
||||
SplitEmptyNamespace: true
|
||||
BreakBeforeBinaryOperators: None
|
||||
BreakBeforeBraces: Custom
|
||||
BreakBeforeInheritanceComma: false
|
||||
BreakBeforeTernaryOperators: false
|
||||
BreakConstructorInitializersBeforeComma: false
|
||||
BreakConstructorInitializers: BeforeComma
|
||||
BreakAfterJavaFieldAnnotations: false
|
||||
BreakStringLiterals: false
|
||||
ColumnLimit: 80
|
||||
CommentPragmas: '^ IWYU pragma:'
|
||||
CompactNamespaces: false
|
||||
ConstructorInitializerAllOnOneLineOrOnePerLine: false
|
||||
ConstructorInitializerIndentWidth: 8
|
||||
ContinuationIndentWidth: 8
|
||||
Cpp11BracedListStyle: false
|
||||
DerivePointerAlignment: false
|
||||
DisableFormat: false
|
||||
ExperimentalAutoDetectBinPacking: false
|
||||
FixNamespaceComments: false
|
||||
|
||||
# Taken from:
|
||||
# git grep -h '^#define [^[:space:]]*for_each[^[:space:]]*(' include/ tools/ \
|
||||
# | sed "s,^#define \([^[:space:]]*for_each[^[:space:]]*\)(.*$, - '\1'," \
|
||||
# | LC_ALL=C sort -u
|
||||
ForEachMacros:
|
||||
- 'for_each_nst'
|
||||
|
||||
IncludeBlocks: Preserve
|
||||
IncludeCategories:
|
||||
- Regex: '.*'
|
||||
Priority: 1
|
||||
IncludeIsMainRegex: '(Test)?$'
|
||||
IndentCaseLabels: false
|
||||
IndentGotoLabels: false
|
||||
IndentPPDirectives: None
|
||||
IndentWidth: 8
|
||||
IndentWrappedFunctionNames: false
|
||||
JavaScriptQuotes: Leave
|
||||
JavaScriptWrapImports: true
|
||||
KeepEmptyLinesAtTheStartOfBlocks: false
|
||||
MacroBlockBegin: ''
|
||||
MacroBlockEnd: ''
|
||||
MaxEmptyLinesToKeep: 1
|
||||
NamespaceIndentation: None
|
||||
ObjCBinPackProtocolList: Auto
|
||||
ObjCBlockIndentWidth: 8
|
||||
ObjCSpaceAfterProperty: true
|
||||
ObjCSpaceBeforeProtocolList: true
|
||||
|
||||
# Taken from git's rules
|
||||
PenaltyBreakAssignment: 10
|
||||
PenaltyBreakBeforeFirstCallParameter: 30
|
||||
PenaltyBreakComment: 10
|
||||
PenaltyBreakFirstLessLess: 0
|
||||
PenaltyBreakString: 10
|
||||
PenaltyExcessCharacter: 100
|
||||
PenaltyReturnTypeOnItsOwnLine: 60
|
||||
|
||||
PointerAlignment: Right
|
||||
ReflowComments: false
|
||||
SortIncludes: false
|
||||
SortUsingDeclarations: false
|
||||
SpaceAfterCStyleCast: false
|
||||
SpaceAfterTemplateKeyword: true
|
||||
SpaceBeforeAssignmentOperators: true
|
||||
SpaceBeforeCtorInitializerColon: true
|
||||
SpaceBeforeInheritanceColon: true
|
||||
SpaceBeforeParens: ControlStatementsExceptForEachMacros
|
||||
SpaceBeforeRangeBasedForLoopColon: true
|
||||
SpaceInEmptyParentheses: false
|
||||
SpacesBeforeTrailingComments: 1
|
||||
SpacesInAngles: false
|
||||
SpacesInContainerLiterals: false
|
||||
SpacesInCStyleCastParentheses: false
|
||||
SpacesInParentheses: false
|
||||
SpacesInSquareBrackets: false
|
||||
Standard: Cpp03
|
||||
TabWidth: 8
|
||||
UseTab: Always
|
||||
...
|
93
.clang-tidy
93
.clang-tidy
|
@ -1,93 +0,0 @@
|
|||
---
|
||||
Checks:
|
||||
- "clang-diagnostic-*,clang-analyzer-*,*,-modernize-*"
|
||||
|
||||
# TODO: enable once https://bugs.llvm.org/show_bug.cgi?id=41311 is fixed
|
||||
- "-clang-analyzer-valist.Uninitialized"
|
||||
|
||||
# Dubious value, would kill readability
|
||||
- "-cppcoreguidelines-init-variables"
|
||||
|
||||
# Dubious value over the compiler's built-in warning. Would
|
||||
# increase verbosity.
|
||||
- "-bugprone-assignment-in-if-condition"
|
||||
|
||||
# Debatable whether these improve readability, right now it would look
|
||||
# like a mess
|
||||
- "-google-readability-braces-around-statements"
|
||||
- "-hicpp-braces-around-statements"
|
||||
- "-readability-braces-around-statements"
|
||||
|
||||
# TODO: in most cases they are justified, but probably not everywhere
|
||||
#
|
||||
- "-readability-magic-numbers"
|
||||
- "-cppcoreguidelines-avoid-magic-numbers"
|
||||
|
||||
# TODO: this is Linux-only for the moment, nice to fix eventually
|
||||
- "-llvmlibc-restrict-system-libc-headers"
|
||||
|
||||
# Those are needed for syscalls, epoll_wait flags, etc.
|
||||
- "-hicpp-signed-bitwise"
|
||||
|
||||
# Probably not doable to implement this without plain memcpy(), memset()
|
||||
- "-clang-analyzer-security.insecureAPI.DeprecatedOrUnsafeBufferHandling"
|
||||
|
||||
# TODO: not really important, but nice to fix eventually
|
||||
- "-llvm-include-order"
|
||||
|
||||
# Dubious value, would kill readability
|
||||
- "-readability-isolate-declaration"
|
||||
|
||||
# TODO: nice to fix eventually
|
||||
- "-bugprone-narrowing-conversions"
|
||||
- "-cppcoreguidelines-narrowing-conversions"
|
||||
|
||||
# TODO: check, fix, and more in general constify wherever possible
|
||||
- "-cppcoreguidelines-avoid-non-const-global-variables"
|
||||
|
||||
# TODO: check paths where it might make sense to improve performance
|
||||
- "-altera-unroll-loops"
|
||||
- "-altera-id-dependent-backward-branch"
|
||||
|
||||
# Not much can be done about them other than being careful
|
||||
- "-bugprone-easily-swappable-parameters"
|
||||
|
||||
# TODO: split reported functions
|
||||
- "-readability-function-cognitive-complexity"
|
||||
|
||||
# "Poor" alignment needed for structs reflecting message formats/headers
|
||||
- "-altera-struct-pack-align"
|
||||
|
||||
# TODO: check again if multithreading is implemented
|
||||
- "-concurrency-mt-unsafe"
|
||||
|
||||
# Complains about any identifier <3 characters, reasonable for
|
||||
# globals, pointlessly verbose for locals and parameters.
|
||||
- "-readability-identifier-length"
|
||||
|
||||
# Wants to include headers which *directly* provide the things
|
||||
# we use. That sounds nice, but means it will often want an OS
|
||||
# specific header instead of a mostly standard one, such as
|
||||
# <linux/limits.h> instead of <limits.h>.
|
||||
- "-misc-include-cleaner"
|
||||
|
||||
# Wants to replace all #defines of integers with enums. Kind of
|
||||
# makes sense when those defines form an enum-like set, but
|
||||
# weird for cases like standalone constants, and causes other
|
||||
# awkwardness for a bunch of cases we use
|
||||
- "-cppcoreguidelines-macro-to-enum"
|
||||
|
||||
# It's been a couple of centuries since multiplication has been granted
|
||||
# precedence over addition in modern mathematical notation. Adding
|
||||
# parentheses to reinforce that certainly won't improve readability.
|
||||
- "-readability-math-missing-parentheses"
|
||||
WarningsAsErrors: "*"
|
||||
HeaderFileExtensions:
|
||||
- h
|
||||
ImplementationFileExtensions:
|
||||
- c
|
||||
HeaderFilterRegex: ""
|
||||
FormatStyle: none
|
||||
CheckOptions:
|
||||
bugprone-suspicious-string-compare.WarnOnImplicitComparison: "false"
|
||||
SystemHeaders: false
|
3
.clangd
3
.clangd
|
@ -1,3 +0,0 @@
|
|||
CompileFlags:
|
||||
# Don't try to interpret our headers as C++
|
||||
Add: [-xc, -Wall]
|
161
Makefile
161
Makefile
|
@ -15,11 +15,24 @@ VERSION ?= $(shell git describe --tags HEAD 2>/dev/null || echo "unknown\ versio
|
|||
# the IPv6 socket API? (Linux does)
|
||||
DUAL_STACK_SOCKETS := 1
|
||||
|
||||
RLIMIT_STACK_VAL := $(shell /bin/sh -c 'ulimit -s')
|
||||
ifeq ($(RLIMIT_STACK_VAL),unlimited)
|
||||
RLIMIT_STACK_VAL := 1024
|
||||
endif
|
||||
|
||||
TARGET ?= $(shell $(CC) -dumpmachine)
|
||||
# Get 'uname -m'-like architecture description for target
|
||||
TARGET_ARCH := $(shell echo $(TARGET) | cut -f1 -d- | tr [A-Z] [a-z])
|
||||
TARGET_ARCH := $(shell echo $(TARGET_ARCH) | sed 's/powerpc/ppc/')
|
||||
|
||||
AUDIT_ARCH := $(shell echo $(TARGET_ARCH) | tr [a-z] [A-Z] | sed 's/^ARM.*/ARM/')
|
||||
AUDIT_ARCH := $(shell echo $(AUDIT_ARCH) | sed 's/I[456]86/I386/')
|
||||
AUDIT_ARCH := $(shell echo $(AUDIT_ARCH) | sed 's/PPC64/PPC/')
|
||||
AUDIT_ARCH := $(shell echo $(AUDIT_ARCH) | sed 's/PPCLE/PPC64LE/')
|
||||
AUDIT_ARCH := $(shell echo $(AUDIT_ARCH) | sed 's/MIPS64EL/MIPSEL64/')
|
||||
AUDIT_ARCH := $(shell echo $(AUDIT_ARCH) | sed 's/HPPA/PARISC/')
|
||||
AUDIT_ARCH := $(shell echo $(AUDIT_ARCH) | sed 's/SH4/SH/')
|
||||
|
||||
# On some systems enabling optimization also enables source fortification,
|
||||
# automagically. Do not override it.
|
||||
FORTIFY_FLAG :=
|
||||
|
@ -31,6 +44,10 @@ FLAGS := -Wall -Wextra -Wno-format-zero-length
|
|||
FLAGS += -pedantic -std=c11 -D_XOPEN_SOURCE=700 -D_GNU_SOURCE
|
||||
FLAGS += $(FORTIFY_FLAG) -O2 -pie -fPIE
|
||||
FLAGS += -DPAGE_SIZE=$(shell getconf PAGE_SIZE)
|
||||
FLAGS += -DNETNS_RUN_DIR=\"/run/netns\"
|
||||
FLAGS += -DPASST_AUDIT_ARCH=AUDIT_ARCH_$(AUDIT_ARCH)
|
||||
FLAGS += -DRLIMIT_STACK_VAL=$(RLIMIT_STACK_VAL)
|
||||
FLAGS += -DARCH=\"$(TARGET_ARCH)\"
|
||||
FLAGS += -DVERSION=\"$(VERSION)\"
|
||||
FLAGS += -DDUAL_STACK_SOCKETS=$(DUAL_STACK_SOCKETS)
|
||||
|
||||
|
@ -50,6 +67,21 @@ PASST_HEADERS = arch.h arp.h checksum.h conf.h dhcp.h dhcpv6.h flow.h fwd.h \
|
|||
udp.h udp_flow.h util.h
|
||||
HEADERS = $(PASST_HEADERS) seccomp.h
|
||||
|
||||
C := \#include <linux/tcp.h>\nstruct tcp_info x = { .tcpi_snd_wnd = 0 };
|
||||
ifeq ($(shell printf "$(C)" | $(CC) -S -xc - -o - >/dev/null 2>&1; echo $$?),0)
|
||||
FLAGS += -DHAS_SND_WND
|
||||
endif
|
||||
|
||||
C := \#include <linux/tcp.h>\nstruct tcp_info x = { .tcpi_bytes_acked = 0 };
|
||||
ifeq ($(shell printf "$(C)" | $(CC) -S -xc - -o - >/dev/null 2>&1; echo $$?),0)
|
||||
FLAGS += -DHAS_BYTES_ACKED
|
||||
endif
|
||||
|
||||
C := \#include <linux/tcp.h>\nstruct tcp_info x = { .tcpi_min_rtt = 0 };
|
||||
ifeq ($(shell printf "$(C)" | $(CC) -S -xc - -o - >/dev/null 2>&1; echo $$?),0)
|
||||
FLAGS += -DHAS_MIN_RTT
|
||||
endif
|
||||
|
||||
C := \#include <sys/random.h>\nint main(){int a=getrandom(0, 0, 0);}
|
||||
ifeq ($(shell printf "$(C)" | $(CC) -S -xc - -o - >/dev/null 2>&1; echo $$?),0)
|
||||
FLAGS += -DHAS_GETRANDOM
|
||||
|
@ -59,6 +91,11 @@ ifeq ($(shell :|$(CC) -fstack-protector-strong -S -xc - -o - >/dev/null 2>&1; ec
|
|||
FLAGS += -fstack-protector-strong
|
||||
endif
|
||||
|
||||
C := \#define _GNU_SOURCE\n\#include <fcntl.h>\nint x = FALLOC_FL_COLLAPSE_RANGE;
|
||||
ifeq ($(shell printf "$(C)" | $(CC) -S -xc - -o - >/dev/null 2>&1; echo $$?),0)
|
||||
EXTRA_SYSCALLS += fallocate
|
||||
endif
|
||||
|
||||
prefix ?= /usr/local
|
||||
exec_prefix ?= $(prefix)
|
||||
bindir ?= $(exec_prefix)/bin
|
||||
|
@ -95,7 +132,7 @@ pasta.avx2 pasta.1 pasta: pasta%: passt%
|
|||
ln -sf $< $@
|
||||
|
||||
qrap: $(QRAP_SRCS) passt.h
|
||||
$(CC) $(FLAGS) $(CFLAGS) $(CPPFLAGS) -DARCH=\"$(TARGET_ARCH)\" $(QRAP_SRCS) -o qrap $(LDFLAGS)
|
||||
$(CC) $(FLAGS) $(CFLAGS) $(CPPFLAGS) $(QRAP_SRCS) -o qrap $(LDFLAGS)
|
||||
|
||||
valgrind: EXTRA_SYSCALLS += rt_sigprocmask rt_sigtimedwait rt_sigaction \
|
||||
rt_sigreturn getpid gettid kill clock_gettime mmap \
|
||||
|
@ -159,11 +196,116 @@ docs: README.md
|
|||
done < README.md; \
|
||||
) > README.plain.md
|
||||
|
||||
clang-tidy: $(PASST_SRCS) $(HEADERS)
|
||||
clang-tidy $(PASST_SRCS) -- $(filter-out -pie,$(FLAGS) $(CFLAGS) $(CPPFLAGS)) \
|
||||
-DCLANG_TIDY_58992
|
||||
# Checkers currently disabled for clang-tidy:
|
||||
# - llvmlibc-restrict-system-libc-headers
|
||||
# TODO: this is Linux-only for the moment, nice to fix eventually
|
||||
#
|
||||
# - google-readability-braces-around-statements
|
||||
# - hicpp-braces-around-statements
|
||||
# - readability-braces-around-statements
|
||||
# Debatable whether that improves readability, right now it would look
|
||||
# like a mess
|
||||
#
|
||||
# - readability-magic-numbers
|
||||
# - cppcoreguidelines-avoid-magic-numbers
|
||||
# TODO: in most cases they are justified, but probably not everywhere
|
||||
#
|
||||
# - clang-analyzer-valist.Uninitialized
|
||||
# TODO: enable once https://bugs.llvm.org/show_bug.cgi?id=41311 is fixed
|
||||
#
|
||||
# - clang-analyzer-security.insecureAPI.DeprecatedOrUnsafeBufferHandling
|
||||
# Probably not doable to implement this without plain memcpy(), memset()
|
||||
#
|
||||
# - cppcoreguidelines-init-variables
|
||||
# Dubious value, would kill readability
|
||||
#
|
||||
# - hicpp-signed-bitwise
|
||||
# Those are needed for syscalls, epoll_wait flags, etc.
|
||||
#
|
||||
# - llvm-include-order
|
||||
# TODO: not really important, but nice to fix eventually
|
||||
#
|
||||
# - readability-isolate-declaration
|
||||
# Dubious value, would kill readability
|
||||
#
|
||||
# - bugprone-narrowing-conversions
|
||||
# - cppcoreguidelines-narrowing-conversions
|
||||
# TODO: nice to fix eventually
|
||||
#
|
||||
# - cppcoreguidelines-avoid-non-const-global-variables
|
||||
# TODO: check, fix, and more in general constify wherever possible
|
||||
#
|
||||
# - altera-unroll-loops
|
||||
# - altera-id-dependent-backward-branch
|
||||
# TODO: check paths where it might make sense to improve performance
|
||||
#
|
||||
# - bugprone-easily-swappable-parameters
|
||||
# Not much can be done about them other than being careful
|
||||
#
|
||||
# - readability-function-cognitive-complexity
|
||||
# TODO: split reported functions
|
||||
#
|
||||
# - altera-struct-pack-align
|
||||
# "Poor" alignment needed for structs reflecting message formats/headers
|
||||
#
|
||||
# - concurrency-mt-unsafe
|
||||
# TODO: check again if multithreading is implemented
|
||||
#
|
||||
# - readability-identifier-length
|
||||
# Complains about any identifier <3 characters, reasonable for
|
||||
# globals, pointlessly verbose for locals and parameters.
|
||||
#
|
||||
# - bugprone-assignment-in-if-condition
|
||||
# Dubious value over the compiler's built-in warning. Would
|
||||
# increase verbosity.
|
||||
#
|
||||
# - misc-include-cleaner
|
||||
# Wants to include headers which *directly* provide the things
|
||||
# we use. That sounds nice, but means it will often want an OS
|
||||
# specific header instead of a mostly standard one, such as
|
||||
# <linux/limits.h> instead of <limits.h>.
|
||||
#
|
||||
# - cppcoreguidelines-macro-to-enum
|
||||
# Wants to replace all #defines of integers with enums. Kind of
|
||||
# makes sense when those defines form an enum-like set, but
|
||||
# weird for cases like standalone constants, and causes other
|
||||
# awkwardness for a bunch of cases we use
|
||||
|
||||
cppcheck: $(PASST_SRCS) $(HEADERS)
|
||||
clang-tidy: $(SRCS) $(HEADERS)
|
||||
clang-tidy -checks=*,-modernize-*,\
|
||||
-clang-analyzer-valist.Uninitialized,\
|
||||
-cppcoreguidelines-init-variables,\
|
||||
-bugprone-assignment-in-if-condition,\
|
||||
-google-readability-braces-around-statements,\
|
||||
-hicpp-braces-around-statements,\
|
||||
-readability-braces-around-statements,\
|
||||
-readability-magic-numbers,\
|
||||
-llvmlibc-restrict-system-libc-headers,\
|
||||
-hicpp-signed-bitwise,\
|
||||
-clang-analyzer-security.insecureAPI.DeprecatedOrUnsafeBufferHandling,\
|
||||
-llvm-include-order,\
|
||||
-cppcoreguidelines-avoid-magic-numbers,\
|
||||
-readability-isolate-declaration,\
|
||||
-bugprone-narrowing-conversions,\
|
||||
-cppcoreguidelines-narrowing-conversions,\
|
||||
-cppcoreguidelines-avoid-non-const-global-variables,\
|
||||
-altera-unroll-loops,-altera-id-dependent-backward-branch,\
|
||||
-bugprone-easily-swappable-parameters,\
|
||||
-readability-function-cognitive-complexity,\
|
||||
-altera-struct-pack-align,\
|
||||
-concurrency-mt-unsafe,\
|
||||
-readability-identifier-length,\
|
||||
-misc-include-cleaner,\
|
||||
-cppcoreguidelines-macro-to-enum \
|
||||
-config='{CheckOptions: [{key: bugprone-suspicious-string-compare.WarnOnImplicitComparison, value: "false"}]}' \
|
||||
--warnings-as-errors=* $(SRCS) -- $(filter-out -pie,$(FLAGS) $(CFLAGS) $(CPPFLAGS)) -DCLANG_TIDY_58992
|
||||
|
||||
SYSTEM_INCLUDES := /usr/include $(wildcard /usr/include/$(TARGET))
|
||||
ifeq ($(shell $(CC) -v 2>&1 | grep -c "gcc version"),1)
|
||||
VER := $(shell $(CC) -dumpversion)
|
||||
SYSTEM_INCLUDES += /usr/lib/gcc/$(TARGET)/$(VER)/include
|
||||
endif
|
||||
cppcheck: $(SRCS) $(HEADERS)
|
||||
if cppcheck --check-level=exhaustive /dev/null > /dev/null 2>&1; then \
|
||||
CPPCHECK_EXHAUSTIVE="--check-level=exhaustive"; \
|
||||
else \
|
||||
|
@ -172,8 +314,11 @@ cppcheck: $(PASST_SRCS) $(HEADERS)
|
|||
cppcheck --std=c11 --error-exitcode=1 --enable=all --force \
|
||||
--inconclusive --library=posix --quiet \
|
||||
$${CPPCHECK_EXHAUSTIVE} \
|
||||
$(SYSTEM_INCLUDES:%=-I%) \
|
||||
$(SYSTEM_INCLUDES:%=--config-exclude=%) \
|
||||
$(SYSTEM_INCLUDES:%=--suppress=*:%/*) \
|
||||
$(SYSTEM_INCLUDES:%=--suppress=unmatchedSuppression:%/*) \
|
||||
--inline-suppr \
|
||||
--suppress=missingIncludeSystem \
|
||||
--suppress=unusedStructMember \
|
||||
$(filter -D%,$(FLAGS) $(CFLAGS) $(CPPFLAGS)) -D CPPCHECK_6936 \
|
||||
$(PASST_SRCS) $(HEADERS)
|
||||
$(filter -D%,$(FLAGS) $(CFLAGS) $(CPPFLAGS)) \
|
||||
$(SRCS) $(HEADERS)
|
||||
|
|
8
arch.c
8
arch.c
|
@ -19,7 +19,6 @@
|
|||
#include <unistd.h>
|
||||
|
||||
#include "log.h"
|
||||
#include "util.h"
|
||||
|
||||
/**
|
||||
* arch_avx2_exec() - Switch to AVX2 build if supported
|
||||
|
@ -41,11 +40,8 @@ void arch_avx2_exec(char **argv)
|
|||
if (__builtin_cpu_supports("avx2")) {
|
||||
char new_path[PATH_MAX + sizeof(".avx2")];
|
||||
|
||||
if (snprintf_check(new_path, PATH_MAX + sizeof(".avx2"),
|
||||
"%s.avx2", exe))
|
||||
die_perror("Can't build AVX2 executable path");
|
||||
|
||||
execv(new_path, argv);
|
||||
snprintf(new_path, PATH_MAX + sizeof(".avx2"), "%s.avx2", exe);
|
||||
execve(new_path, argv, environ);
|
||||
warn_perror("Can't run AVX2 build, using non-AVX2 version");
|
||||
}
|
||||
}
|
||||
|
|
8
arp.c
8
arp.c
|
@ -59,12 +59,14 @@ int arp(const struct ctx *c, const struct pool *p)
|
|||
ah->ar_op != htons(ARPOP_REQUEST))
|
||||
return 1;
|
||||
|
||||
/* Discard announcements, but not 0.0.0.0 "probes" */
|
||||
if (memcmp(am->sip, &in4addr_any, sizeof(am->sip)) &&
|
||||
/* Discard announcements (but not 0.0.0.0 "probes"): we might have the
|
||||
* same IP address, hide that.
|
||||
*/
|
||||
if (memcmp(am->sip, (unsigned char[4]){ 0 }, sizeof(am->tip)) &&
|
||||
!memcmp(am->sip, am->tip, sizeof(am->sip)))
|
||||
return 1;
|
||||
|
||||
/* Don't resolve the guest's assigned address, either. */
|
||||
/* Don't resolve our own address, either. */
|
||||
if (!memcmp(am->tip, &c->ip4.addr, sizeof(am->tip)))
|
||||
return 1;
|
||||
|
||||
|
|
46
checksum.c
46
checksum.c
|
@ -59,7 +59,6 @@
|
|||
#include "util.h"
|
||||
#include "ip.h"
|
||||
#include "checksum.h"
|
||||
#include "iov.h"
|
||||
|
||||
/* Checksums are optional for UDP over IPv4, so we usually just set
|
||||
* them to 0. Change this to 1 to calculate real UDP over IPv4
|
||||
|
@ -166,24 +165,22 @@ uint32_t proto_ipv4_header_psum(uint16_t l4len, uint8_t protocol,
|
|||
* @udp4hr: UDP header, initialised apart from checksum
|
||||
* @saddr: IPv4 source address
|
||||
* @daddr: IPv4 destination address
|
||||
* @iov: Pointer to the array of IO vectors
|
||||
* @iov_cnt: Length of the array
|
||||
* @offset: UDP payload offset in the iovec array
|
||||
* @payload: UDP packet payload
|
||||
* @dlen: Length of @payload (not including UDP header)
|
||||
*/
|
||||
void csum_udp4(struct udphdr *udp4hr,
|
||||
struct in_addr saddr, struct in_addr daddr,
|
||||
const struct iovec *iov, int iov_cnt, size_t offset)
|
||||
const void *payload, size_t dlen)
|
||||
{
|
||||
/* UDP checksums are optional, so don't bother */
|
||||
udp4hr->check = 0;
|
||||
|
||||
if (UDP4_REAL_CHECKSUMS) {
|
||||
uint16_t l4len = iov_size(iov, iov_cnt) - offset +
|
||||
sizeof(struct udphdr);
|
||||
uint16_t l4len = dlen + sizeof(struct udphdr);
|
||||
uint32_t psum = proto_ipv4_header_psum(l4len, IPPROTO_UDP,
|
||||
saddr, daddr);
|
||||
psum = csum_unfolded(udp4hr, sizeof(struct udphdr), psum);
|
||||
udp4hr->check = csum_iov(iov, iov_cnt, offset, psum);
|
||||
udp4hr->check = csum(payload, dlen, psum);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -229,24 +226,19 @@ uint32_t proto_ipv6_header_psum(uint16_t payload_len, uint8_t protocol,
|
|||
/**
|
||||
* csum_udp6() - Calculate and set checksum for a UDP over IPv6 packet
|
||||
* @udp6hr: UDP header, initialised apart from checksum
|
||||
* @saddr: Source address
|
||||
* @daddr: Destination address
|
||||
* @iov: Pointer to the array of IO vectors
|
||||
* @iov_cnt: Length of the array
|
||||
* @offset: UDP payload offset in the iovec array
|
||||
* @payload: UDP packet payload
|
||||
* @dlen: Length of @payload (not including UDP header)
|
||||
*/
|
||||
void csum_udp6(struct udphdr *udp6hr,
|
||||
const struct in6_addr *saddr, const struct in6_addr *daddr,
|
||||
const struct iovec *iov, int iov_cnt, size_t offset)
|
||||
const void *payload, size_t dlen)
|
||||
{
|
||||
uint16_t l4len = iov_size(iov, iov_cnt) - offset +
|
||||
sizeof(struct udphdr);
|
||||
uint32_t psum = proto_ipv6_header_psum(l4len, IPPROTO_UDP,
|
||||
saddr, daddr);
|
||||
uint32_t psum = proto_ipv6_header_psum(dlen + sizeof(struct udphdr),
|
||||
IPPROTO_UDP, saddr, daddr);
|
||||
udp6hr->check = 0;
|
||||
|
||||
psum = csum_unfolded(udp6hr, sizeof(struct udphdr), psum);
|
||||
udp6hr->check = csum_iov(iov, iov_cnt, offset, psum);
|
||||
udp6hr->check = csum(payload, dlen, psum);
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -505,26 +497,16 @@ uint16_t csum(const void *buf, size_t len, uint32_t init)
|
|||
*
|
||||
* @iov Pointer to the array of IO vectors
|
||||
* @n Length of the array
|
||||
* @offset: Offset of the data to checksum within the full data length
|
||||
* @init Initial 32-bit checksum, 0 for no pre-computed checksum
|
||||
*
|
||||
* Return: 16-bit folded, complemented checksum
|
||||
*/
|
||||
uint16_t csum_iov(const struct iovec *iov, size_t n, size_t offset,
|
||||
uint32_t init)
|
||||
/* cppcheck-suppress unusedFunction */
|
||||
uint16_t csum_iov(const struct iovec *iov, size_t n, uint32_t init)
|
||||
{
|
||||
unsigned int i;
|
||||
size_t first;
|
||||
|
||||
i = iov_skip_bytes(iov, n, offset, &first);
|
||||
if (i >= n)
|
||||
return (uint16_t)~csum_fold(init);
|
||||
|
||||
init = csum_unfolded((char *)iov[i].iov_base + first,
|
||||
iov[i].iov_len - first, init);
|
||||
i++;
|
||||
|
||||
for (; i < n; i++)
|
||||
for (i = 0; i < n; i++)
|
||||
init = csum_unfolded(iov[i].iov_base, iov[i].iov_len, init);
|
||||
|
||||
return (uint16_t)~csum_fold(init);
|
||||
|
|
|
@ -19,20 +19,19 @@ uint32_t proto_ipv4_header_psum(uint16_t l4len, uint8_t protocol,
|
|||
struct in_addr saddr, struct in_addr daddr);
|
||||
void csum_udp4(struct udphdr *udp4hr,
|
||||
struct in_addr saddr, struct in_addr daddr,
|
||||
const struct iovec *iov, int iov_cnt, size_t offset);
|
||||
const void *payload, size_t dlen);
|
||||
void csum_icmp4(struct icmphdr *icmp4hr, const void *payload, size_t dlen);
|
||||
uint32_t proto_ipv6_header_psum(uint16_t payload_len, uint8_t protocol,
|
||||
const struct in6_addr *saddr,
|
||||
const struct in6_addr *daddr);
|
||||
void csum_udp6(struct udphdr *udp6hr,
|
||||
const struct in6_addr *saddr, const struct in6_addr *daddr,
|
||||
const struct iovec *iov, int iov_cnt, size_t offset);
|
||||
const void *payload, size_t dlen);
|
||||
void csum_icmp6(struct icmp6hdr *icmp6hr,
|
||||
const struct in6_addr *saddr, const struct in6_addr *daddr,
|
||||
const void *payload, size_t dlen);
|
||||
uint32_t csum_unfolded(const void *buf, size_t len, uint32_t init);
|
||||
uint16_t csum(const void *buf, size_t len, uint32_t init);
|
||||
uint16_t csum_iov(const struct iovec *iov, size_t n, size_t offset,
|
||||
uint32_t init);
|
||||
uint16_t csum_iov(const struct iovec *iov, size_t n, uint32_t init);
|
||||
|
||||
#endif /* CHECKSUM_H */
|
||||
|
|
117
conf.c
117
conf.c
|
@ -46,8 +46,6 @@
|
|||
#include "isolation.h"
|
||||
#include "log.h"
|
||||
|
||||
#define NETNS_RUN_DIR "/run/netns"
|
||||
|
||||
/**
|
||||
* next_chunk - Return the next piece of a string delimited by a character
|
||||
* @s: String to search
|
||||
|
@ -118,10 +116,11 @@ static int parse_port_range(const char *s, char **endptr,
|
|||
static void conf_ports(const struct ctx *c, char optname, const char *optarg,
|
||||
struct fwd_ports *fwd)
|
||||
{
|
||||
union inany_addr addr_buf = inany_any6, *addr = &addr_buf;
|
||||
char addr_buf[sizeof(struct in6_addr)] = { 0 }, *addr = addr_buf;
|
||||
char buf[BUFSIZ], *spec, *ifname = NULL, *p;
|
||||
bool exclude_only = true, bound_one = false;
|
||||
uint8_t exclude[PORT_BITMAP_SIZE] = { 0 };
|
||||
sa_family_t af = AF_UNSPEC;
|
||||
unsigned i;
|
||||
int ret;
|
||||
|
||||
|
@ -167,13 +166,15 @@ static void conf_ports(const struct ctx *c, char optname, const char *optarg,
|
|||
|
||||
bitmap_set(fwd->map, i);
|
||||
if (optname == 't') {
|
||||
ret = tcp_sock_init(c, NULL, NULL, i);
|
||||
ret = tcp_sock_init(c, AF_UNSPEC, NULL, NULL,
|
||||
i);
|
||||
if (ret == -ENFILE || ret == -EMFILE)
|
||||
goto enfile;
|
||||
if (!ret)
|
||||
bound_one = true;
|
||||
} else if (optname == 'u') {
|
||||
ret = udp_sock_init(c, 0, NULL, NULL, i);
|
||||
ret = udp_sock_init(c, 0, AF_UNSPEC, NULL, NULL,
|
||||
i);
|
||||
if (ret == -ENFILE || ret == -EMFILE)
|
||||
goto enfile;
|
||||
if (!ret)
|
||||
|
@ -225,7 +226,11 @@ static void conf_ports(const struct ctx *c, char optname, const char *optarg,
|
|||
p++;
|
||||
}
|
||||
|
||||
if (!inany_pton(p, addr))
|
||||
if (inet_pton(AF_INET, p, addr))
|
||||
af = AF_INET;
|
||||
else if (inet_pton(AF_INET6, p, addr))
|
||||
af = AF_INET6;
|
||||
else
|
||||
goto bad;
|
||||
}
|
||||
} else {
|
||||
|
@ -271,13 +276,13 @@ static void conf_ports(const struct ctx *c, char optname, const char *optarg,
|
|||
bitmap_set(fwd->map, i);
|
||||
|
||||
if (optname == 't') {
|
||||
ret = tcp_sock_init(c, addr, ifname, i);
|
||||
ret = tcp_sock_init(c, af, addr, ifname, i);
|
||||
if (ret == -ENFILE || ret == -EMFILE)
|
||||
goto enfile;
|
||||
if (!ret)
|
||||
bound_one = true;
|
||||
} else if (optname == 'u') {
|
||||
ret = udp_sock_init(c, 0, addr, ifname, i);
|
||||
ret = udp_sock_init(c, 0, af, addr, ifname, i);
|
||||
if (ret == -ENFILE || ret == -EMFILE)
|
||||
goto enfile;
|
||||
if (!ret)
|
||||
|
@ -333,9 +338,9 @@ static void conf_ports(const struct ctx *c, char optname, const char *optarg,
|
|||
|
||||
ret = 0;
|
||||
if (optname == 't')
|
||||
ret = tcp_sock_init(c, addr, ifname, i);
|
||||
ret = tcp_sock_init(c, af, addr, ifname, i);
|
||||
else if (optname == 'u')
|
||||
ret = udp_sock_init(c, 0, addr, ifname, i);
|
||||
ret = udp_sock_init(c, 0, af, addr, ifname, i);
|
||||
if (ret)
|
||||
goto bind_fail;
|
||||
}
|
||||
|
@ -576,15 +581,10 @@ static void conf_pasta_ns(int *netns_only, char *userns, char *netns,
|
|||
if (pidval < 0 || pidval > INT_MAX)
|
||||
die("Invalid PID %s", argv[optind]);
|
||||
|
||||
if (snprintf_check(netns, PATH_MAX,
|
||||
"/proc/%ld/ns/net", pidval))
|
||||
die_perror("Can't build netns path");
|
||||
|
||||
if (!*userns) {
|
||||
if (snprintf_check(userns, PATH_MAX,
|
||||
"/proc/%ld/ns/user", pidval))
|
||||
die_perror("Can't build userns path");
|
||||
}
|
||||
snprintf(netns, PATH_MAX, "/proc/%ld/ns/net", pidval);
|
||||
if (!*userns)
|
||||
snprintf(userns, PATH_MAX, "/proc/%ld/ns/user",
|
||||
pidval);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -735,19 +735,19 @@ static unsigned int conf_ip6(unsigned int ifi, struct ip6_ctx *ip6)
|
|||
static void usage(const char *name, FILE *f, int status)
|
||||
{
|
||||
if (strstr(name, "pasta")) {
|
||||
FPRINTF(f, "Usage: %s [OPTION]... [COMMAND] [ARGS]...\n", name);
|
||||
FPRINTF(f, " %s [OPTION]... PID\n", name);
|
||||
FPRINTF(f, " %s [OPTION]... --netns [PATH|NAME]\n", name);
|
||||
FPRINTF(f,
|
||||
fprintf(f, "Usage: %s [OPTION]... [COMMAND] [ARGS]...\n", name);
|
||||
fprintf(f, " %s [OPTION]... PID\n", name);
|
||||
fprintf(f, " %s [OPTION]... --netns [PATH|NAME]\n", name);
|
||||
fprintf(f,
|
||||
"\n"
|
||||
"Without PID or --netns, run the given command or a\n"
|
||||
"default shell in a new network and user namespace, and\n"
|
||||
"connect it via pasta.\n");
|
||||
} else {
|
||||
FPRINTF(f, "Usage: %s [OPTION]...\n", name);
|
||||
fprintf(f, "Usage: %s [OPTION]...\n", name);
|
||||
}
|
||||
|
||||
FPRINTF(f,
|
||||
fprintf(f,
|
||||
"\n"
|
||||
" -d, --debug Be verbose\n"
|
||||
" --trace Be extra verbose, implies --debug\n"
|
||||
|
@ -764,17 +764,17 @@ static void usage(const char *name, FILE *f, int status)
|
|||
" --version Show version and exit\n");
|
||||
|
||||
if (strstr(name, "pasta")) {
|
||||
FPRINTF(f,
|
||||
fprintf(f,
|
||||
" -I, --ns-ifname NAME namespace interface name\n"
|
||||
" default: same interface name as external one\n");
|
||||
} else {
|
||||
FPRINTF(f,
|
||||
fprintf(f,
|
||||
" -s, --socket PATH UNIX domain socket path\n"
|
||||
" default: probe free path starting from "
|
||||
UNIX_SOCK_PATH "\n", 1);
|
||||
}
|
||||
|
||||
FPRINTF(f,
|
||||
fprintf(f,
|
||||
" -F, --fd FD Use FD as pre-opened connected socket\n"
|
||||
" -p, --pcap FILE Log tap-facing traffic to pcap file\n"
|
||||
" -P, --pid FILE Write own PID to the given file\n"
|
||||
|
@ -805,28 +805,28 @@ static void usage(const char *name, FILE *f, int status)
|
|||
" can be specified multiple times\n"
|
||||
" a single, empty option disables DNS information\n");
|
||||
if (strstr(name, "pasta"))
|
||||
FPRINTF(f, " default: don't use any addresses\n");
|
||||
fprintf(f, " default: don't use any addresses\n");
|
||||
else
|
||||
FPRINTF(f, " default: use addresses from /etc/resolv.conf\n");
|
||||
FPRINTF(f,
|
||||
fprintf(f, " default: use addresses from /etc/resolv.conf\n");
|
||||
fprintf(f,
|
||||
" -S, --search LIST Space-separated list, search domains\n"
|
||||
" a single, empty option disables the DNS search list\n");
|
||||
if (strstr(name, "pasta"))
|
||||
FPRINTF(f, " default: don't use any search list\n");
|
||||
fprintf(f, " default: don't use any search list\n");
|
||||
else
|
||||
FPRINTF(f, " default: use search list from /etc/resolv.conf\n");
|
||||
fprintf(f, " default: use search list from /etc/resolv.conf\n");
|
||||
|
||||
if (strstr(name, "pasta"))
|
||||
FPRINTF(f, " --dhcp-dns \tPass DNS list via DHCP/DHCPv6/NDP\n");
|
||||
fprintf(f, " --dhcp-dns \tPass DNS list via DHCP/DHCPv6/NDP\n");
|
||||
else
|
||||
FPRINTF(f, " --no-dhcp-dns No DNS list in DHCP/DHCPv6/NDP\n");
|
||||
fprintf(f, " --no-dhcp-dns No DNS list in DHCP/DHCPv6/NDP\n");
|
||||
|
||||
if (strstr(name, "pasta"))
|
||||
FPRINTF(f, " --dhcp-search Pass list via DHCP/DHCPv6/NDP\n");
|
||||
fprintf(f, " --dhcp-search Pass list via DHCP/DHCPv6/NDP\n");
|
||||
else
|
||||
FPRINTF(f, " --no-dhcp-search No list in DHCP/DHCPv6/NDP\n");
|
||||
fprintf(f, " --no-dhcp-search No list in DHCP/DHCPv6/NDP\n");
|
||||
|
||||
FPRINTF(f,
|
||||
fprintf(f,
|
||||
" --map-host-loopback ADDR Translate ADDR to refer to host\n"
|
||||
" can be specified zero to two times (for IPv4 and IPv6)\n"
|
||||
" default: gateway address\n"
|
||||
|
@ -836,9 +836,6 @@ static void usage(const char *name, FILE *f, int status)
|
|||
" --dns-forward ADDR Forward DNS queries sent to ADDR\n"
|
||||
" can be specified zero to two times (for IPv4 and IPv6)\n"
|
||||
" default: don't forward DNS queries\n"
|
||||
" --dns-host ADDR Host nameserver to direct queries to\n"
|
||||
" can be specified zero to two times (for IPv4 and IPv6)\n"
|
||||
" default: first nameserver from host's /etc/resolv.conf\n"
|
||||
" --no-tcp Disable TCP protocol handler\n"
|
||||
" --no-udp Disable UDP protocol handler\n"
|
||||
" --no-icmp Disable ICMP/ICMPv6 protocol handler\n"
|
||||
|
@ -846,7 +843,6 @@ static void usage(const char *name, FILE *f, int status)
|
|||
" --no-ndp Disable NDP responses\n"
|
||||
" --no-dhcpv6 Disable DHCPv6 server\n"
|
||||
" --no-ra Disable router advertisements\n"
|
||||
" --freebind Bind to any address for forwarding\n"
|
||||
" --no-map-gw Don't map gateway address to host\n"
|
||||
" -4, --ipv4-only Enable IPv4 operation only\n"
|
||||
" -6, --ipv6-only Enable IPv6 operation only\n");
|
||||
|
@ -854,7 +850,7 @@ static void usage(const char *name, FILE *f, int status)
|
|||
if (strstr(name, "pasta"))
|
||||
goto pasta_opts;
|
||||
|
||||
FPRINTF(f,
|
||||
fprintf(f,
|
||||
" -1, --one-off Quit after handling one single client\n"
|
||||
" -t, --tcp-ports SPEC TCP port forwarding to guest\n"
|
||||
" can be specified multiple times\n"
|
||||
|
@ -885,7 +881,7 @@ static void usage(const char *name, FILE *f, int status)
|
|||
|
||||
pasta_opts:
|
||||
|
||||
FPRINTF(f,
|
||||
fprintf(f,
|
||||
" -t, --tcp-ports SPEC TCP port forwarding to namespace\n"
|
||||
" can be specified multiple times\n"
|
||||
" SPEC can be:\n"
|
||||
|
@ -919,9 +915,6 @@ pasta_opts:
|
|||
" -U, --udp-ns SPEC UDP port forwarding to init namespace\n"
|
||||
" SPEC is as described above\n"
|
||||
" default: auto\n"
|
||||
" --host-lo-to-ns-lo DEPRECATED:\n"
|
||||
" Translate host-loopback forwards to\n"
|
||||
" namespace loopback\n"
|
||||
" --userns NSPATH Target user namespace to join\n"
|
||||
" --netns PATH|NAME Target network namespace to join\n"
|
||||
" --netns-only Don't join existing user namespace\n"
|
||||
|
@ -1196,11 +1189,7 @@ static void conf_open_files(struct ctx *c)
|
|||
if (c->mode != MODE_PASTA && c->fd_tap == -1)
|
||||
c->fd_tap_listen = tap_sock_unix_open(c->sock_path);
|
||||
|
||||
if (*c->pidfile) {
|
||||
c->pidfile_fd = output_file_open(c->pidfile, O_WRONLY);
|
||||
if (c->pidfile_fd < 0)
|
||||
die_perror("Couldn't open PID file %s", c->pidfile);
|
||||
}
|
||||
c->pidfile_fd = pidfile_open(c->pidfile);
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -1273,7 +1262,6 @@ void conf(struct ctx *c, int argc, char **argv)
|
|||
{"no-dhcpv6", no_argument, &c->no_dhcpv6, 1 },
|
||||
{"no-ndp", no_argument, &c->no_ndp, 1 },
|
||||
{"no-ra", no_argument, &c->no_ra, 1 },
|
||||
{"freebind", no_argument, &c->freebind, 1 },
|
||||
{"no-map-gw", no_argument, &no_map_gw, 1 },
|
||||
{"ipv4-only", no_argument, NULL, '4' },
|
||||
{"ipv6-only", no_argument, NULL, '6' },
|
||||
|
@ -1303,8 +1291,6 @@ void conf(struct ctx *c, int argc, char **argv)
|
|||
{"netns-only", no_argument, NULL, 20 },
|
||||
{"map-host-loopback", required_argument, NULL, 21 },
|
||||
{"map-guest-addr", required_argument, NULL, 22 },
|
||||
{"host-lo-to-ns-lo", no_argument, NULL, 23 },
|
||||
{"dns-host", required_argument, NULL, 24 },
|
||||
{ 0 },
|
||||
};
|
||||
const char *logname = (c->mode == MODE_PASTA) ? "pasta" : "passt";
|
||||
|
@ -1427,9 +1413,9 @@ void conf(struct ctx *c, int argc, char **argv)
|
|||
|
||||
break;
|
||||
case 14:
|
||||
FPRINTF(stdout,
|
||||
fprintf(stdout,
|
||||
c->mode == MODE_PASTA ? "pasta " : "passt ");
|
||||
FPRINTF(stdout, VERSION_BLOB);
|
||||
fprintf(stdout, VERSION_BLOB);
|
||||
exit(EXIT_SUCCESS);
|
||||
case 15:
|
||||
ret = snprintf(c->ip4.ifname_out,
|
||||
|
@ -1482,23 +1468,6 @@ void conf(struct ctx *c, int argc, char **argv)
|
|||
conf_nat(optarg, &c->ip4.map_guest_addr,
|
||||
&c->ip6.map_guest_addr, NULL);
|
||||
break;
|
||||
case 23:
|
||||
if (c->mode != MODE_PASTA)
|
||||
die("--host-lo-to-ns-lo is for pasta mode only");
|
||||
c->host_lo_to_ns_lo = 1;
|
||||
break;
|
||||
case 24:
|
||||
if (inet_pton(AF_INET6, optarg, &c->ip6.dns_host) &&
|
||||
!IN6_IS_ADDR_UNSPECIFIED(&c->ip6.dns_host))
|
||||
break;
|
||||
|
||||
if (inet_pton(AF_INET, optarg, &c->ip4.dns_host) &&
|
||||
!IN4_IS_ADDR_UNSPECIFIED(&c->ip4.dns_host) &&
|
||||
!IN4_IS_ADDR_BROADCAST(&c->ip4.dns_host))
|
||||
break;
|
||||
|
||||
die("Invalid host nameserver address: %s", optarg);
|
||||
break;
|
||||
case 'd':
|
||||
c->debug = 1;
|
||||
c->quiet = 0;
|
||||
|
|
|
@ -34,8 +34,6 @@
|
|||
|
||||
owner @{PROC}/@{pid}/uid_map r, # conf_ugid()
|
||||
|
||||
@{PROC}/sys/net/ipv4/ip_local_port_range r, # fwd_probe_ephemeral()
|
||||
|
||||
network netlink raw, # nl_sock_init_do(), netlink.c
|
||||
|
||||
network inet stream, # tcp.c
|
||||
|
|
|
@ -50,7 +50,6 @@ require {
|
|||
type passwd_file_t;
|
||||
|
||||
class netlink_route_socket { bind create nlmsg_read };
|
||||
type sysctl_net_t;
|
||||
|
||||
class capability { sys_tty_config setuid setgid };
|
||||
class cap_userns { setpcap sys_admin sys_ptrace };
|
||||
|
@ -105,8 +104,6 @@ allow passt_t net_conf_t:lnk_file read;
|
|||
allow passt_t tmp_t:sock_file { create unlink write };
|
||||
allow passt_t self:netlink_route_socket { bind create nlmsg_read read write setopt };
|
||||
kernel_search_network_sysctl(passt_t)
|
||||
allow passt_t sysctl_net_t:dir search;
|
||||
allow passt_t sysctl_net_t:file { open read };
|
||||
|
||||
corenet_tcp_bind_all_nodes(passt_t)
|
||||
corenet_udp_bind_all_nodes(passt_t)
|
||||
|
|
|
@ -196,7 +196,7 @@ allow pasta_t ifconfig_var_run_t:dir { read search watch };
|
|||
allow pasta_t self:tun_socket create;
|
||||
allow pasta_t tun_tap_device_t:chr_file { ioctl open read write };
|
||||
allow pasta_t sysctl_net_t:dir search;
|
||||
allow pasta_t sysctl_net_t:file { open read write };
|
||||
allow pasta_t sysctl_net_t:file { open write };
|
||||
allow pasta_t kernel_t:system module_request;
|
||||
|
||||
allow pasta_t nsfs_t:file read;
|
||||
|
|
55
dhcpv6.c
55
dhcpv6.c
|
@ -296,42 +296,47 @@ static struct opt_hdr *dhcpv6_opt(const struct pool *p, size_t *offset,
|
|||
static struct opt_hdr *dhcpv6_ia_notonlink(const struct pool *p,
|
||||
struct in6_addr *la)
|
||||
{
|
||||
int ia_types[2] = { OPT_IA_NA, OPT_IA_TA }, *ia_type;
|
||||
const struct opt_ia_addr *opt_addr;
|
||||
char buf[INET6_ADDRSTRLEN];
|
||||
struct in6_addr req_addr;
|
||||
const struct opt_hdr *h;
|
||||
struct opt_hdr *ia;
|
||||
size_t offset;
|
||||
int ia_type;
|
||||
|
||||
foreach(ia_type, ia_types) {
|
||||
offset = 0;
|
||||
while ((ia = dhcpv6_opt(p, &offset, *ia_type))) {
|
||||
if (ntohs(ia->l) < OPT_VSIZE(ia_na))
|
||||
ia_type = OPT_IA_NA;
|
||||
ia_ta:
|
||||
offset = 0;
|
||||
while ((ia = dhcpv6_opt(p, &offset, ia_type))) {
|
||||
if (ntohs(ia->l) < OPT_VSIZE(ia_na))
|
||||
return NULL;
|
||||
|
||||
offset += sizeof(struct opt_ia_na);
|
||||
|
||||
while ((h = dhcpv6_opt(p, &offset, OPT_IAAADR))) {
|
||||
const struct opt_ia_addr *opt_addr;
|
||||
|
||||
if (ntohs(h->l) != OPT_VSIZE(ia_addr))
|
||||
return NULL;
|
||||
|
||||
offset += sizeof(struct opt_ia_na);
|
||||
|
||||
while ((h = dhcpv6_opt(p, &offset, OPT_IAAADR))) {
|
||||
if (ntohs(h->l) != OPT_VSIZE(ia_addr))
|
||||
return NULL;
|
||||
|
||||
opt_addr = (const struct opt_ia_addr *)h;
|
||||
req_addr = opt_addr->addr;
|
||||
if (!IN6_ARE_ADDR_EQUAL(la, &req_addr))
|
||||
goto err;
|
||||
|
||||
offset += sizeof(struct opt_ia_addr);
|
||||
opt_addr = (const struct opt_ia_addr *)h;
|
||||
req_addr = opt_addr->addr;
|
||||
if (!IN6_ARE_ADDR_EQUAL(la, &req_addr)) {
|
||||
info("DHCPv6: requested address %s not on link",
|
||||
inet_ntop(AF_INET6, &req_addr,
|
||||
buf, sizeof(buf)));
|
||||
return ia;
|
||||
}
|
||||
|
||||
offset += sizeof(struct opt_ia_addr);
|
||||
}
|
||||
}
|
||||
|
||||
return NULL;
|
||||
if (ia_type == OPT_IA_NA) {
|
||||
ia_type = OPT_IA_TA;
|
||||
goto ia_ta;
|
||||
}
|
||||
|
||||
err:
|
||||
info("DHCPv6: requested address %s not on link",
|
||||
inet_ntop(AF_INET6, &req_addr, buf, sizeof(buf)));
|
||||
return ia;
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -423,11 +428,11 @@ search:
|
|||
int dhcpv6(struct ctx *c, const struct pool *p,
|
||||
const struct in6_addr *saddr, const struct in6_addr *daddr)
|
||||
{
|
||||
const struct opt_hdr *client_id, *server_id, *ia;
|
||||
struct opt_hdr *ia, *bad_ia, *client_id;
|
||||
const struct opt_hdr *server_id;
|
||||
const struct in6_addr *src;
|
||||
const struct msg_hdr *mh;
|
||||
const struct udphdr *uh;
|
||||
struct opt_hdr *bad_ia;
|
||||
size_t mlen, n;
|
||||
|
||||
uh = packet_get(p, 0, 0, sizeof(*uh), &mlen);
|
||||
|
|
53
flow.c
53
flow.c
|
@ -283,23 +283,28 @@ void flow_log_(const struct flow_common *f, int pri, const char *fmt, ...)
|
|||
"Flow %u (%s): %s", flow_idx(f), type_or_state, msg);
|
||||
}
|
||||
|
||||
/** flow_log_details_() - Log the details of a flow
|
||||
* @f: flow to log
|
||||
* @pri: Log priority
|
||||
* @state: State to log details according to
|
||||
*
|
||||
* Logs the details of the flow: endpoints, interfaces, type etc.
|
||||
/**
|
||||
* flow_set_state() - Change flow's state
|
||||
* @f: Flow changing state
|
||||
* @state: New state
|
||||
*/
|
||||
void flow_log_details_(const struct flow_common *f, int pri,
|
||||
enum flow_state state)
|
||||
static void flow_set_state(struct flow_common *f, enum flow_state state)
|
||||
{
|
||||
char estr0[INANY_ADDRSTRLEN], fstr0[INANY_ADDRSTRLEN];
|
||||
char estr1[INANY_ADDRSTRLEN], fstr1[INANY_ADDRSTRLEN];
|
||||
const struct flowside *ini = &f->side[INISIDE];
|
||||
const struct flowside *tgt = &f->side[TGTSIDE];
|
||||
uint8_t oldstate = f->state;
|
||||
|
||||
if (state >= FLOW_STATE_TGT)
|
||||
flow_log_(f, pri,
|
||||
ASSERT(state < FLOW_NUM_STATES);
|
||||
ASSERT(oldstate < FLOW_NUM_STATES);
|
||||
|
||||
f->state = state;
|
||||
flow_log_(f, LOG_DEBUG, "%s -> %s", flow_state_str[oldstate],
|
||||
FLOW_STATE(f));
|
||||
|
||||
if (MAX(state, oldstate) >= FLOW_STATE_TGT)
|
||||
flow_log_(f, LOG_DEBUG,
|
||||
"%s [%s]:%hu -> [%s]:%hu => %s [%s]:%hu -> [%s]:%hu",
|
||||
pif_name(f->pif[INISIDE]),
|
||||
inany_ntop(&ini->eaddr, estr0, sizeof(estr0)),
|
||||
|
@ -311,8 +316,8 @@ void flow_log_details_(const struct flow_common *f, int pri,
|
|||
tgt->oport,
|
||||
inany_ntop(&tgt->eaddr, estr1, sizeof(estr1)),
|
||||
tgt->eport);
|
||||
else if (state >= FLOW_STATE_INI)
|
||||
flow_log_(f, pri, "%s [%s]:%hu -> [%s]:%hu => ?",
|
||||
else if (MAX(state, oldstate) >= FLOW_STATE_INI)
|
||||
flow_log_(f, LOG_DEBUG, "%s [%s]:%hu -> [%s]:%hu => ?",
|
||||
pif_name(f->pif[INISIDE]),
|
||||
inany_ntop(&ini->eaddr, estr0, sizeof(estr0)),
|
||||
ini->eport,
|
||||
|
@ -320,25 +325,6 @@ void flow_log_details_(const struct flow_common *f, int pri,
|
|||
ini->oport);
|
||||
}
|
||||
|
||||
/**
|
||||
* flow_set_state() - Change flow's state
|
||||
* @f: Flow changing state
|
||||
* @state: New state
|
||||
*/
|
||||
static void flow_set_state(struct flow_common *f, enum flow_state state)
|
||||
{
|
||||
uint8_t oldstate = f->state;
|
||||
|
||||
ASSERT(state < FLOW_NUM_STATES);
|
||||
ASSERT(oldstate < FLOW_NUM_STATES);
|
||||
|
||||
f->state = state;
|
||||
flow_log_(f, LOG_DEBUG, "%s -> %s", flow_state_str[oldstate],
|
||||
FLOW_STATE(f));
|
||||
|
||||
flow_log_details_(f, LOG_DEBUG, MAX(state, oldstate));
|
||||
}
|
||||
|
||||
/**
|
||||
* flow_initiate_() - Move flow to INI, setting pif[INISIDE]
|
||||
* @flow: Flow to change state
|
||||
|
@ -711,7 +697,7 @@ static flow_sidx_t flowside_lookup(const struct ctx *c, uint8_t proto,
|
|||
!(FLOW_PROTO(&flow->f) == proto &&
|
||||
flow->f.pif[sidx.sidei] == pif &&
|
||||
flowside_eq(&flow->f.side[sidx.sidei], side)))
|
||||
b = mod_sub(b, 1, FLOW_HASH_SIZE);
|
||||
b = (b + 1) % FLOW_HASH_SIZE;
|
||||
|
||||
return flow_hashtab[b];
|
||||
}
|
||||
|
@ -846,8 +832,7 @@ void flow_defer_handler(const struct ctx *c, const struct timespec *now)
|
|||
closed = icmp_ping_timer(c, &flow->ping, now);
|
||||
break;
|
||||
case FLOW_UDP:
|
||||
closed = udp_flow_defer(&flow->udp);
|
||||
if (!closed && timer)
|
||||
if (timer)
|
||||
closed = udp_flow_timer(c, &flow->udp, now);
|
||||
break;
|
||||
default:
|
||||
|
|
7
flow.h
7
flow.h
|
@ -264,11 +264,4 @@ void flow_log_(const struct flow_common *f, int pri, const char *fmt, ...)
|
|||
flow_dbg((f), __VA_ARGS__); \
|
||||
} while (0)
|
||||
|
||||
void flow_log_details_(const struct flow_common *f, int pri,
|
||||
enum flow_state state);
|
||||
#define flow_log_details(f_, pri) \
|
||||
flow_log_details_(&((f_)->f), (pri), (f_)->f.state)
|
||||
#define flow_dbg_details(f_) flow_log_details((f_), LOG_DEBUG)
|
||||
#define flow_err_details(f_) flow_log_details((f_), LOG_ERR)
|
||||
|
||||
#endif /* FLOW_H */
|
||||
|
|
|
@ -110,7 +110,7 @@ static inline const struct flowside *flowside_at_sidx(flow_sidx_t sidx)
|
|||
const union flow *flow = flow_at_sidx(sidx);
|
||||
|
||||
if (!flow)
|
||||
return NULL;
|
||||
return PIF_NONE;
|
||||
|
||||
return &flow->f.side[sidx.sidei];
|
||||
}
|
||||
|
|
35
fwd.c
35
fwd.c
|
@ -75,8 +75,8 @@ void fwd_probe_ephemeral(void)
|
|||
if (*end || errno)
|
||||
goto parse_err;
|
||||
|
||||
if (min < 0 || min >= (long)NUM_PORTS ||
|
||||
max < 0 || max >= (long)NUM_PORTS)
|
||||
if (min < 0 || min >= NUM_PORTS ||
|
||||
max < 0 || max >= NUM_PORTS)
|
||||
goto parse_err;
|
||||
|
||||
fwd_ephemeral_min = min;
|
||||
|
@ -447,35 +447,20 @@ uint8_t fwd_nat_from_host(const struct ctx *c, uint8_t proto,
|
|||
(proto == IPPROTO_TCP || proto == IPPROTO_UDP)) {
|
||||
/* spliceable */
|
||||
|
||||
/* The traffic will go over the guest's 'lo' interface, but by
|
||||
* default use its external address, so we don't inadvertently
|
||||
* expose services that listen only on the guest's loopback
|
||||
* address. That can be overridden by --host-lo-to-ns-lo which
|
||||
* will instead forward to the loopback address in the guest.
|
||||
*
|
||||
* In either case, let the kernel pick the source address to
|
||||
* match.
|
||||
/* Preserve the specific loopback address used, but let the
|
||||
* kernel pick a source port on the target side
|
||||
*/
|
||||
if (inany_v4(&ini->eaddr)) {
|
||||
if (c->host_lo_to_ns_lo)
|
||||
tgt->eaddr = inany_loopback4;
|
||||
else
|
||||
tgt->eaddr = inany_from_v4(c->ip4.addr_seen);
|
||||
tgt->oaddr = inany_any4;
|
||||
} else {
|
||||
if (c->host_lo_to_ns_lo)
|
||||
tgt->eaddr = inany_loopback6;
|
||||
else
|
||||
tgt->eaddr.a6 = c->ip6.addr_seen;
|
||||
tgt->oaddr = inany_any6;
|
||||
}
|
||||
|
||||
/* Let the kernel pick source port */
|
||||
tgt->oaddr = ini->eaddr;
|
||||
tgt->oport = 0;
|
||||
if (proto == IPPROTO_UDP)
|
||||
/* But for UDP preserve the source port */
|
||||
tgt->oport = ini->eport;
|
||||
|
||||
if (inany_v4(&ini->eaddr))
|
||||
tgt->eaddr = inany_loopback4;
|
||||
else
|
||||
tgt->eaddr = inany_loopback6;
|
||||
|
||||
return PIF_SPLICE;
|
||||
}
|
||||
|
||||
|
|
20
inany.c
20
inany.c
|
@ -36,23 +36,3 @@ const char *inany_ntop(const union inany_addr *src, char *dst, socklen_t size)
|
|||
|
||||
return inet_ntop(AF_INET6, &src->a6, dst, size);
|
||||
}
|
||||
|
||||
/** inany_pton - Parse an IPv[46] address from text format
|
||||
* @src: IPv[46] address
|
||||
* @dst: output buffer, filled with parsed address
|
||||
*
|
||||
* Return: On success, 1, if no parseable address is found, 0
|
||||
*/
|
||||
int inany_pton(const char *src, union inany_addr *dst)
|
||||
{
|
||||
if (inet_pton(AF_INET, src, &dst->v4mapped.a4)) {
|
||||
memset(&dst->v4mapped.zero, 0, sizeof(dst->v4mapped.zero));
|
||||
memset(&dst->v4mapped.one, 0xff, sizeof(dst->v4mapped.one));
|
||||
return 1;
|
||||
}
|
||||
|
||||
if (inet_pton(AF_INET6, src, &dst->a6))
|
||||
return 1;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
|
1
inany.h
1
inany.h
|
@ -270,6 +270,5 @@ static inline void inany_siphash_feed(struct siphash_state *state,
|
|||
#define INANY_ADDRSTRLEN MAX(INET_ADDRSTRLEN, INET6_ADDRSTRLEN)
|
||||
|
||||
const char *inany_ntop(const union inany_addr *src, char *dst, socklen_t size);
|
||||
int inany_pton(const char *src, union inany_addr *dst);
|
||||
|
||||
#endif /* INANY_H */
|
||||
|
|
144
linux_dep.h
144
linux_dep.h
|
@ -1,144 +0,0 @@
|
|||
/* SPDX-License-Identifier: GPL-2.0-or-later
|
||||
* Copyright Red Hat
|
||||
*
|
||||
* Declarations for Linux specific dependencies
|
||||
*/
|
||||
|
||||
#ifndef LINUX_DEP_H
|
||||
#define LINUX_DEP_H
|
||||
|
||||
/* struct tcp_info_linux - Information from Linux TCP_INFO getsockopt()
|
||||
*
|
||||
* Largely derived from include/linux/tcp.h in the Linux kernel
|
||||
*
|
||||
* Some fields returned by TCP_INFO have been there for ages and are shared with
|
||||
* BSD. struct tcp_info from netinet/tcp.h has only those fields. There are
|
||||
* also many Linux-specific extensions to the structure, which are only found
|
||||
* in the linux/tcp.h version of struct tcp_info.
|
||||
*
|
||||
* We want to use some of those extension fields, when available. We can test
|
||||
* for availability in the runtime kernel using the length returned from
|
||||
* getsockopt(). However, we won't necessarily be compiled against the same
|
||||
* kernel headers as we'll run with, so compiling directly against linux/tcp.h
|
||||
* means wrapping every field access in an #ifdef whose #else does the same
|
||||
* thing as when the field is missing at runtime. This rapidly gets messy.
|
||||
*
|
||||
* Instead we define here struct tcp_info_linux which includes all the Linux
|
||||
* extensions that we want to use. This is taken from v6.11 of the kernel.
|
||||
*/
|
||||
struct tcp_info_linux {
|
||||
uint8_t tcpi_state;
|
||||
uint8_t tcpi_ca_state;
|
||||
uint8_t tcpi_retransmits;
|
||||
uint8_t tcpi_probes;
|
||||
uint8_t tcpi_backoff;
|
||||
uint8_t tcpi_options;
|
||||
uint8_t tcpi_snd_wscale : 4, tcpi_rcv_wscale : 4;
|
||||
uint8_t tcpi_delivery_rate_app_limited:1, tcpi_fastopen_client_fail:2;
|
||||
|
||||
uint32_t tcpi_rto;
|
||||
uint32_t tcpi_ato;
|
||||
uint32_t tcpi_snd_mss;
|
||||
uint32_t tcpi_rcv_mss;
|
||||
|
||||
uint32_t tcpi_unacked;
|
||||
uint32_t tcpi_sacked;
|
||||
uint32_t tcpi_lost;
|
||||
uint32_t tcpi_retrans;
|
||||
uint32_t tcpi_fackets;
|
||||
|
||||
/* Times. */
|
||||
uint32_t tcpi_last_data_sent;
|
||||
uint32_t tcpi_last_ack_sent;
|
||||
uint32_t tcpi_last_data_recv;
|
||||
uint32_t tcpi_last_ack_recv;
|
||||
|
||||
/* Metrics. */
|
||||
uint32_t tcpi_pmtu;
|
||||
uint32_t tcpi_rcv_ssthresh;
|
||||
uint32_t tcpi_rtt;
|
||||
uint32_t tcpi_rttvar;
|
||||
uint32_t tcpi_snd_ssthresh;
|
||||
uint32_t tcpi_snd_cwnd;
|
||||
uint32_t tcpi_advmss;
|
||||
uint32_t tcpi_reordering;
|
||||
|
||||
uint32_t tcpi_rcv_rtt;
|
||||
uint32_t tcpi_rcv_space;
|
||||
|
||||
uint32_t tcpi_total_retrans;
|
||||
|
||||
/* Linux extensions */
|
||||
uint64_t tcpi_pacing_rate;
|
||||
uint64_t tcpi_max_pacing_rate;
|
||||
uint64_t tcpi_bytes_acked; /* RFC4898 tcpEStatsAppHCThruOctetsAcked */
|
||||
uint64_t tcpi_bytes_received; /* RFC4898 tcpEStatsAppHCThruOctetsReceived */
|
||||
uint32_t tcpi_segs_out; /* RFC4898 tcpEStatsPerfSegsOut */
|
||||
uint32_t tcpi_segs_in; /* RFC4898 tcpEStatsPerfSegsIn */
|
||||
|
||||
uint32_t tcpi_notsent_bytes;
|
||||
uint32_t tcpi_min_rtt;
|
||||
uint32_t tcpi_data_segs_in; /* RFC4898 tcpEStatsDataSegsIn */
|
||||
uint32_t tcpi_data_segs_out; /* RFC4898 tcpEStatsDataSegsOut */
|
||||
|
||||
uint64_t tcpi_delivery_rate;
|
||||
|
||||
uint64_t tcpi_busy_time; /* Time (usec) busy sending data */
|
||||
uint64_t tcpi_rwnd_limited; /* Time (usec) limited by receive window */
|
||||
uint64_t tcpi_sndbuf_limited; /* Time (usec) limited by send buffer */
|
||||
|
||||
uint32_t tcpi_delivered;
|
||||
uint32_t tcpi_delivered_ce;
|
||||
|
||||
uint64_t tcpi_bytes_sent; /* RFC4898 tcpEStatsPerfHCDataOctetsOut */
|
||||
uint64_t tcpi_bytes_retrans; /* RFC4898 tcpEStatsPerfOctetsRetrans */
|
||||
uint32_t tcpi_dsack_dups; /* RFC4898 tcpEStatsStackDSACKDups */
|
||||
uint32_t tcpi_reord_seen; /* reordering events seen */
|
||||
|
||||
uint32_t tcpi_rcv_ooopack; /* Out-of-order packets received */
|
||||
|
||||
uint32_t tcpi_snd_wnd; /* peer's advertised receive window after
|
||||
* scaling (bytes)
|
||||
*/
|
||||
uint32_t tcpi_rcv_wnd; /* local advertised receive window after
|
||||
* scaling (bytes)
|
||||
*/
|
||||
|
||||
uint32_t tcpi_rehash; /* PLB or timeout triggered rehash attempts */
|
||||
|
||||
uint16_t tcpi_total_rto; /* Total number of RTO timeouts, including
|
||||
* SYN/SYN-ACK and recurring timeouts.
|
||||
*/
|
||||
uint16_t tcpi_total_rto_recoveries; /* Total number of RTO
|
||||
* recoveries, including any
|
||||
* unfinished recovery.
|
||||
*/
|
||||
uint32_t tcpi_total_rto_time; /* Total time spent in RTO recoveries
|
||||
* in milliseconds, including any
|
||||
* unfinished recovery.
|
||||
*/
|
||||
};
|
||||
|
||||
#include <linux/falloc.h>
|
||||
|
||||
#ifndef FALLOC_FL_COLLAPSE_RANGE
|
||||
#define FALLOC_FL_COLLAPSE_RANGE 0x08
|
||||
#endif
|
||||
|
||||
#include <linux/close_range.h>
|
||||
|
||||
/* glibc < 2.34 and musl as of 1.2.5 need these */
|
||||
#ifndef SYS_close_range
|
||||
#define SYS_close_range 436
|
||||
#endif
|
||||
#ifndef CLOSE_RANGE_UNSHARE /* Linux kernel < 5.9 */
|
||||
#define CLOSE_RANGE_UNSHARE (1U << 1)
|
||||
#endif
|
||||
|
||||
__attribute__ ((weak))
|
||||
/* cppcheck-suppress funcArgNamesDifferent */
|
||||
int close_range(unsigned int first, unsigned int last, int flags) {
|
||||
return syscall(SYS_close_range, first, last, flags);
|
||||
}
|
||||
|
||||
#endif /* LINUX_DEP_H */
|
19
log.c
19
log.c
|
@ -26,7 +26,6 @@
|
|||
#include <stdarg.h>
|
||||
#include <sys/socket.h>
|
||||
|
||||
#include "linux_dep.h"
|
||||
#include "log.h"
|
||||
#include "util.h"
|
||||
#include "passt.h"
|
||||
|
@ -93,6 +92,7 @@ const char *logfile_prefix[] = {
|
|||
" ", /* LOG_DEBUG */
|
||||
};
|
||||
|
||||
#ifdef FALLOC_FL_COLLAPSE_RANGE
|
||||
/**
|
||||
* logfile_rotate_fallocate() - Write header, set log_written after fallocate()
|
||||
* @fd: Log file descriptor
|
||||
|
@ -126,6 +126,7 @@ static void logfile_rotate_fallocate(int fd, const struct timespec *now)
|
|||
|
||||
log_written -= log_cut_size;
|
||||
}
|
||||
#endif /* FALLOC_FL_COLLAPSE_RANGE */
|
||||
|
||||
/**
|
||||
* logfile_rotate_move() - Fallback: move recent entries toward start, then cut
|
||||
|
@ -197,17 +198,21 @@ out:
|
|||
*
|
||||
* Return: 0 on success, negative error code on failure
|
||||
*
|
||||
* #syscalls fcntl fallocate
|
||||
* #syscalls fcntl
|
||||
*
|
||||
* fallocate() passed as EXTRA_SYSCALL only if FALLOC_FL_COLLAPSE_RANGE is there
|
||||
*/
|
||||
static int logfile_rotate(int fd, const struct timespec *now)
|
||||
{
|
||||
if (fcntl(fd, F_SETFL, O_RDWR /* Drop O_APPEND: explicit lseek() */))
|
||||
return -errno;
|
||||
|
||||
#ifdef FALLOC_FL_COLLAPSE_RANGE
|
||||
/* Only for Linux >= 3.15, extent-based ext4 or XFS, glibc >= 2.18 */
|
||||
if (!fallocate(fd, FALLOC_FL_COLLAPSE_RANGE, 0, log_cut_size))
|
||||
logfile_rotate_fallocate(fd, now);
|
||||
else
|
||||
#endif
|
||||
logfile_rotate_move(fd, now);
|
||||
|
||||
if (fcntl(fd, F_SETFL, O_RDWR | O_APPEND))
|
||||
|
@ -269,7 +274,7 @@ void vlogmsg(bool newline, bool cont, int pri, const char *format, va_list ap)
|
|||
char timestr[LOGTIME_STRLEN];
|
||||
|
||||
logtime_fmt(timestr, sizeof(timestr), now);
|
||||
FPRINTF(stderr, "%s: ", timestr);
|
||||
fprintf(stderr, "%s: ", timestr);
|
||||
}
|
||||
|
||||
if ((log_mask & LOG_MASK(LOG_PRI(pri))) || !log_conf_parsed) {
|
||||
|
@ -288,7 +293,7 @@ void vlogmsg(bool newline, bool cont, int pri, const char *format, va_list ap)
|
|||
(log_stderr && (log_mask & LOG_MASK(LOG_PRI(pri))))) {
|
||||
(void)vfprintf(stderr, format, ap);
|
||||
if (newline && format[strlen(format)] != '\n')
|
||||
FPRINTF(stderr, "\n");
|
||||
fprintf(stderr, "\n");
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -394,7 +399,7 @@ void passt_vsyslog(bool newline, int pri, const char *format, va_list ap)
|
|||
n += snprintf(buf + n, BUFSIZ - n, "\n");
|
||||
|
||||
if (log_sock >= 0 && send(log_sock, buf, n, 0) != n && log_stderr)
|
||||
FPRINTF(stderr, "Failed to send %i bytes to syslog\n", n);
|
||||
fprintf(stderr, "Failed to send %i bytes to syslog\n", n);
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -411,7 +416,8 @@ void logfile_init(const char *name, const char *path, size_t size)
|
|||
if (readlink("/proc/self/exe", exe, PATH_MAX - 1) < 0)
|
||||
die_perror("Failed to read own /proc/self/exe link");
|
||||
|
||||
log_file = output_file_open(path, O_APPEND | O_RDWR);
|
||||
log_file = open(path, O_CREAT | O_TRUNC | O_APPEND | O_RDWR | O_CLOEXEC,
|
||||
S_IRUSR | S_IWUSR);
|
||||
if (log_file == -1)
|
||||
die_perror("Couldn't open log file %s", path);
|
||||
|
||||
|
@ -427,3 +433,4 @@ void logfile_init(const char *name, const char *path, size_t size)
|
|||
/* For FALLOC_FL_COLLAPSE_RANGE: VFS block size can be up to one page */
|
||||
log_cut_size = ROUND_UP(log_size * LOGFILE_CUT_RATIO / 100, PAGE_SIZE);
|
||||
}
|
||||
|
||||
|
|
4
ndp.c
4
ndp.c
|
@ -234,8 +234,8 @@ int ndp(struct ctx *c, const struct icmp6hdr *ih, const struct in6_addr *saddr,
|
|||
return 1;
|
||||
|
||||
if (ih->icmp6_type == NS) {
|
||||
const struct ndp_ns *ns =
|
||||
packet_get(p, 0, 0, sizeof(struct ndp_ns), NULL);
|
||||
struct ndp_ns *ns = packet_get(p, 0, 0, sizeof(struct ndp_ns),
|
||||
NULL);
|
||||
|
||||
if (!ns)
|
||||
return -1;
|
||||
|
|
|
@ -353,7 +353,7 @@ unsigned int nl_get_ext_if(int s, sa_family_t af)
|
|||
*/
|
||||
bool nl_route_get_def_multipath(struct rtattr *rta, void *gw)
|
||||
{
|
||||
int nh_len = RTA_PAYLOAD(rta);
|
||||
size_t nh_len = RTA_PAYLOAD(rta);
|
||||
struct rtnexthop *rtnh;
|
||||
bool found = false;
|
||||
int hops = -1;
|
||||
|
@ -582,7 +582,7 @@ int nl_route_dup(int s_src, unsigned int ifi_src,
|
|||
|
||||
*(unsigned int *)RTA_DATA(rta) = ifi_dst;
|
||||
} else if (rta->rta_type == RTA_MULTIPATH) {
|
||||
int nh_len = RTA_PAYLOAD(rta);
|
||||
size_t nh_len = RTA_PAYLOAD(rta);
|
||||
struct rtnexthop *rtnh;
|
||||
|
||||
for (rtnh = (struct rtnexthop *)RTA_DATA(rta);
|
||||
|
|
96
passt.1
96
passt.1
|
@ -95,7 +95,7 @@ detached PID namespace after starting, because the PID itself cannot change.
|
|||
Default is to fork into background.
|
||||
|
||||
.TP
|
||||
.BR \-e ", " \-\-stderr " " (DEPRECATED)
|
||||
.BR \-e ", " \-\-stderr
|
||||
This option has no effect, and is maintained for compatibility purposes only.
|
||||
|
||||
Note that this configuration option is \fBdeprecated\fR and will be removed in a
|
||||
|
@ -249,19 +249,10 @@ the host.
|
|||
.TP
|
||||
.BR \-\-dns-forward " " \fIaddr
|
||||
Map \fIaddr\fR (IPv4 or IPv6) as seen from guest or namespace to the
|
||||
nameserver (with corresponding IP version) specified by the
|
||||
\fB\-\-dns-host\fR option. Maps only UDP and TCP traffic to port 53 or
|
||||
port 853. Replies are translated back with a reverse mapping. This
|
||||
option can be specified zero to two times (once for IPv4, once for
|
||||
IPv6).
|
||||
|
||||
.TP
|
||||
.BR \-\-dns-host " " \fIaddr
|
||||
Configure the host nameserver which guest or namespace queries to the
|
||||
\fB\-\-dns-forward\fR address will be redirected to. This option can
|
||||
be specified zero to two times (once for IPv4, once for IPv6).
|
||||
By default, the first nameserver from the host's
|
||||
\fI/etc/resolv.conf\fR.
|
||||
first configured DNS resolver (with corresponding IP version). Maps
|
||||
only UDP and TCP traffic to port 53 or port 853. Replies are
|
||||
translated back with a reverse mapping. This option can be specified
|
||||
zero to two times (once for IPv4, once for IPv6).
|
||||
|
||||
.TP
|
||||
.BR \-S ", " \-\-search " " \fIlist
|
||||
|
@ -336,16 +327,6 @@ namespace will be silently dropped.
|
|||
Disable Router Advertisements. Router Solicitations coming from guest or target
|
||||
namespace will be ignored.
|
||||
|
||||
.TP
|
||||
.BR \-\-freebind
|
||||
Allow any binding address to be specified for \fB-t\fR and \fB-u\fR
|
||||
options. Usually binding addresses must be addresses currently
|
||||
configured on the host. With \fB\-\-freebind\fR, the
|
||||
\fBIP_FREEBIND\fR or \fBIPV6_FREEBIND\fR socket option is enabled
|
||||
allowing any address to be used. This is typically used to bind
|
||||
addresses which might be configured on the host in future, at which
|
||||
point the forwarding will immediately start operating.
|
||||
|
||||
.TP
|
||||
.BR \-\-map-host-loopback " " \fIaddr
|
||||
Translate \fIaddr\fR to refer to the host. Packets from the guest to
|
||||
|
@ -605,13 +586,6 @@ Configure UDP port forwarding from target namespace to init namespace.
|
|||
|
||||
Default is \fBauto\fR.
|
||||
|
||||
.TP
|
||||
.BR \-\-host-lo-to-ns-lo " " (DEPRECATED)
|
||||
If specified, connections forwarded with \fB\-t\fR and \fB\-u\fR from
|
||||
the host's loopback address will appear on the loopback address in the
|
||||
guest as well. Without this option such forwarded packets will appear
|
||||
to come from the guest's public address.
|
||||
|
||||
.TP
|
||||
.BR \-\-userns " " \fIspec
|
||||
Target user namespace to join, as a path. If PID is given, without this option,
|
||||
|
@ -889,41 +863,38 @@ root@localhost's password:
|
|||
|
||||
.SH NOTES
|
||||
|
||||
.SS Handling of traffic with loopback destination and source addresses
|
||||
.SS Handling of traffic with local destination and source addresses
|
||||
|
||||
Both \fBpasst\fR and \fBpasta\fR can bind on ports with a loopback
|
||||
address (127.0.0.0/8 or ::1), depending on the configuration. Loopback
|
||||
destination or source addresses need to be changed before packets are
|
||||
delivered to the guest or target namespace: most operating systems
|
||||
would drop packets received with loopback addresses on non-loopback
|
||||
interfaces, and it would also be impossible for guest or target
|
||||
namespace to route answers back.
|
||||
Both \fBpasst\fR and \fBpasta\fR can bind on ports with a local address,
|
||||
depending on the configuration. Local destination or source addresses need to be
|
||||
changed before packets are delivered to the guest or target namespace: most
|
||||
operating systems would drop packets received from non-loopback interfaces with
|
||||
local addresses, and it would also be impossible for guest or target namespace
|
||||
to route answers back.
|
||||
|
||||
For convenience, the source address on these packets is translated to
|
||||
the address specified by the \fB\-\-map-host-loopback\fR option (with
|
||||
some exceptions in pasta mode, see next section below). If not
|
||||
specified this defaults, somewhat arbitrarily, to the address of
|
||||
default IPv4 or IPv6 gateway (if any) -- this is known to be an
|
||||
existing, valid address on the same subnet. If \fB\-\-no-map-gw\fR or
|
||||
\fB\-\-map-host-loopback none\fR are specified this translation is
|
||||
disabled and packets with loopback addresses are simply dropped.
|
||||
For convenience, and somewhat arbitrarily, the source address on these packets
|
||||
is translated to the address of the default IPv4 or IPv6 gateway (if any) --
|
||||
this is known to be an existing, valid address on the same subnet.
|
||||
|
||||
Loopback destination addresses are translated to the observed external
|
||||
address of the guest or target namespace. For IPv6, the observed
|
||||
link-local address is used if the translated source address is
|
||||
link-local, otherwise the observed global address is used. For both
|
||||
IPv4 and IPv6, if no addresses have been seen yet, the configured
|
||||
addresses will be used instead.
|
||||
Loopback destination addresses are instead translated to the observed external
|
||||
address of the guest or target namespace. For IPv6 packets, if usage of a
|
||||
link-local address by guest or namespace has ever been observed, and the
|
||||
original destination address is also a link-local address, the observed
|
||||
link-local address is used. Otherwise, the observed global address is used. For
|
||||
both IPv4 and IPv6, if no addresses have been seen yet, the configured addresses
|
||||
will be used instead.
|
||||
|
||||
For example, if \fBpasst\fR or \fBpasta\fR receive a connection from 127.0.0.1,
|
||||
with destination 127.0.0.10, and the default IPv4 gateway is 192.0.2.1, while
|
||||
the last observed source address from guest or namespace is 192.0.2.2, this will
|
||||
be translated to a connection from 192.0.2.1 to 192.0.2.2.
|
||||
|
||||
Similarly, for traffic coming from guest or namespace, packets with
|
||||
destination address corresponding to the \fB\-\-map-host-loopback\fR
|
||||
address will have their destination address translated to a loopback
|
||||
address.
|
||||
Similarly, for traffic coming from guest or namespace, packets with destination
|
||||
address corresponding to the default gateway will have their destination address
|
||||
translated to a loopback address, if and only if a packet, in the opposite
|
||||
direction, with a loopback destination or source address, port-wise matching for
|
||||
UDP, or connection-wise for TCP, has been recently forwarded to guest or
|
||||
namespace. This behaviour can be disabled with \-\-no\-map\-gw.
|
||||
|
||||
.SS Handling of local traffic in pasta
|
||||
|
||||
|
@ -939,15 +910,8 @@ and the new socket using the \fBsplice\fR(2) system call, and for UDP, a pair
|
|||
of \fBrecvmmsg\fR(2) and \fBsendmmsg\fR(2) system calls deals with packet
|
||||
transfers.
|
||||
|
||||
Because it's not possible to bind sockets to foreign addresses, this
|
||||
bypass only applies to local connections and traffic. It also means
|
||||
that the address translation differs slightly from passt mode.
|
||||
Connections from loopback to loopback on the host will appear to come
|
||||
from the target namespace's public address within the guest, unless
|
||||
\fB\-\-host-lo-to-ns-lo\fR is specified, in which case they will
|
||||
appear to come from loopback in the namespace as well. The latter
|
||||
behaviour used to be the default, but is usually undesirable, since it
|
||||
can unintentionally expose namespace local services to the host.
|
||||
This bypass only applies to local connections and traffic, because it's not
|
||||
possible to bind sockets to foreign addresses.
|
||||
|
||||
.SS Binding to low numbered ports (well-known or system ports, up to 1023)
|
||||
|
||||
|
|
12
passt.c
12
passt.c
|
@ -207,8 +207,7 @@ int main(int argc, char **argv)
|
|||
struct timespec now;
|
||||
struct sigaction sa;
|
||||
|
||||
if (clock_gettime(CLOCK_MONOTONIC, &log_start))
|
||||
die_perror("Failed to get CLOCK_MONOTONIC time");
|
||||
clock_gettime(CLOCK_MONOTONIC, &log_start);
|
||||
|
||||
arch_avx2_exec(argv);
|
||||
|
||||
|
@ -266,8 +265,7 @@ int main(int argc, char **argv)
|
|||
|
||||
secret_init(&c);
|
||||
|
||||
if (clock_gettime(CLOCK_MONOTONIC, &now))
|
||||
die_perror("Failed to get CLOCK_MONOTONIC time");
|
||||
clock_gettime(CLOCK_MONOTONIC, &now);
|
||||
|
||||
flow_init();
|
||||
|
||||
|
@ -309,15 +307,13 @@ int main(int argc, char **argv)
|
|||
timer_init(&c, &now);
|
||||
|
||||
loop:
|
||||
/* NOLINTBEGIN(bugprone-branch-clone): intervals can be the same */
|
||||
/* NOLINTNEXTLINE(bugprone-branch-clone): intervals can be the same */
|
||||
/* cppcheck-suppress [duplicateValueTernary, unmatchedSuppression] */
|
||||
nfds = epoll_wait(c.epollfd, events, EPOLL_EVENTS, TIMER_INTERVAL);
|
||||
/* NOLINTEND(bugprone-branch-clone) */
|
||||
if (nfds == -1 && errno != EINTR)
|
||||
die_perror("epoll_wait() failed in main loop");
|
||||
|
||||
if (clock_gettime(CLOCK_MONOTONIC, &now))
|
||||
err_perror("Failed to get CLOCK_MONOTONIC time");
|
||||
clock_gettime(CLOCK_MONOTONIC, &now);
|
||||
|
||||
for (i = 0; i < nfds; i++) {
|
||||
union epoll_ref ref = *((union epoll_ref *)&events[i].data.u64);
|
||||
|
|
4
passt.h
4
passt.h
|
@ -225,8 +225,6 @@ struct ip6_ctx {
|
|||
* @no_dhcpv6: Disable DHCPv6 server
|
||||
* @no_ndp: Disable NDP handler altogether
|
||||
* @no_ra: Disable router advertisements
|
||||
* @host_lo_to_ns_lo: Map host loopback addresses to ns loopback addresses
|
||||
* @freebind: Allow binding of non-local addresses for forwarding
|
||||
* @low_wmem: Low probed net.core.wmem_max
|
||||
* @low_rmem: Low probed net.core.rmem_max
|
||||
*/
|
||||
|
@ -286,8 +284,6 @@ struct ctx {
|
|||
int no_dhcpv6;
|
||||
int no_ndp;
|
||||
int no_ra;
|
||||
int host_lo_to_ns_lo;
|
||||
int freebind;
|
||||
|
||||
int low_wmem;
|
||||
int low_rmem;
|
||||
|
|
14
pasta.c
14
pasta.c
|
@ -102,9 +102,7 @@ static int pasta_wait_for_ns(void *arg)
|
|||
int flags = O_RDONLY | O_CLOEXEC;
|
||||
char ns[PATH_MAX];
|
||||
|
||||
if (snprintf_check(ns, PATH_MAX, "/proc/%i/ns/net", pasta_child_pid))
|
||||
die_perror("Can't build netns path");
|
||||
|
||||
snprintf(ns, PATH_MAX, "/proc/%i/ns/net", pasta_child_pid);
|
||||
do {
|
||||
while ((c->pasta_netns_fd = open(ns, flags)) < 0) {
|
||||
if (errno != ENOENT)
|
||||
|
@ -241,11 +239,8 @@ void pasta_start_ns(struct ctx *c, uid_t uid, gid_t gid,
|
|||
c->quiet = 1;
|
||||
|
||||
/* Configure user and group mappings */
|
||||
if (snprintf_check(uidmap, BUFSIZ, "0 %u 1", uid))
|
||||
die_perror("Can't build uidmap");
|
||||
|
||||
if (snprintf_check(gidmap, BUFSIZ, "0 %u 1", gid))
|
||||
die_perror("Can't build gidmap");
|
||||
snprintf(uidmap, BUFSIZ, "0 %u 1", uid);
|
||||
snprintf(gidmap, BUFSIZ, "0 %u 1", gid);
|
||||
|
||||
if (write_file("/proc/self/uid_map", uidmap) ||
|
||||
write_file("/proc/self/setgroups", "deny") ||
|
||||
|
@ -432,12 +427,12 @@ static int pasta_netns_quit_timer(void)
|
|||
*/
|
||||
void pasta_netns_quit_init(const struct ctx *c)
|
||||
{
|
||||
union epoll_ref ref = { .type = EPOLL_TYPE_NSQUIT_INOTIFY };
|
||||
struct epoll_event ev = { .events = EPOLLIN };
|
||||
int flags = O_NONBLOCK | O_CLOEXEC;
|
||||
struct statfs s = { 0 };
|
||||
bool try_inotify = true;
|
||||
int fd = -1, dir_fd;
|
||||
union epoll_ref ref;
|
||||
|
||||
if (c->mode != MODE_PASTA || c->no_netns_quit || !*c->netns_base)
|
||||
return;
|
||||
|
@ -468,7 +463,6 @@ void pasta_netns_quit_init(const struct ctx *c)
|
|||
ref.type = EPOLL_TYPE_NSQUIT_TIMER;
|
||||
} else {
|
||||
close(dir_fd);
|
||||
ref.type = EPOLL_TYPE_NSQUIT_INOTIFY;
|
||||
}
|
||||
|
||||
if (fd > FD_REF_MAX)
|
||||
|
|
32
pcap.c
32
pcap.c
|
@ -86,8 +86,9 @@ static void pcap_frame(const struct iovec *iov, size_t iovcnt,
|
|||
.caplen = l2len,
|
||||
.len = l2len
|
||||
};
|
||||
struct iovec hiov = { &h, sizeof(h) };
|
||||
|
||||
if (write_all_buf(pcap_fd, &h, sizeof(h)) < 0 ||
|
||||
if (write_remainder(pcap_fd, &hiov, 1, 0) < 0 ||
|
||||
write_remainder(pcap_fd, iov, iovcnt, offset) < 0)
|
||||
debug_perror("Cannot log packet, length %zu", l2len);
|
||||
}
|
||||
|
@ -100,14 +101,12 @@ static void pcap_frame(const struct iovec *iov, size_t iovcnt,
|
|||
void pcap(const char *pkt, size_t l2len)
|
||||
{
|
||||
struct iovec iov = { (char *)pkt, l2len };
|
||||
struct timespec now = { 0 };
|
||||
struct timespec now;
|
||||
|
||||
if (pcap_fd == -1)
|
||||
return;
|
||||
|
||||
if (clock_gettime(CLOCK_REALTIME, &now))
|
||||
err_perror("Failed to get CLOCK_REALTIME time");
|
||||
|
||||
clock_gettime(CLOCK_REALTIME, &now);
|
||||
pcap_frame(&iov, 1, 0, &now);
|
||||
}
|
||||
|
||||
|
@ -121,14 +120,13 @@ void pcap(const char *pkt, size_t l2len)
|
|||
void pcap_multiple(const struct iovec *iov, size_t frame_parts, unsigned int n,
|
||||
size_t offset)
|
||||
{
|
||||
struct timespec now = { 0 };
|
||||
struct timespec now;
|
||||
unsigned int i;
|
||||
|
||||
if (pcap_fd == -1)
|
||||
return;
|
||||
|
||||
if (clock_gettime(CLOCK_REALTIME, &now))
|
||||
err_perror("Failed to get CLOCK_REALTIME time");
|
||||
clock_gettime(CLOCK_REALTIME, &now);
|
||||
|
||||
for (i = 0; i < n; i++)
|
||||
pcap_frame(iov + i * frame_parts, frame_parts, offset, &now);
|
||||
|
@ -141,20 +139,17 @@ void pcap_multiple(const struct iovec *iov, size_t frame_parts, unsigned int n,
|
|||
* @iov: Pointer to the array of struct iovec describing the I/O vector
|
||||
* containing packet data to write, including L2 header
|
||||
* @iovcnt: Number of buffers (@iov entries)
|
||||
* @offset: Offset of the L2 frame within the full data length
|
||||
*/
|
||||
/* cppcheck-suppress unusedFunction */
|
||||
void pcap_iov(const struct iovec *iov, size_t iovcnt, size_t offset)
|
||||
void pcap_iov(const struct iovec *iov, size_t iovcnt)
|
||||
{
|
||||
struct timespec now = { 0 };
|
||||
struct timespec now;
|
||||
|
||||
if (pcap_fd == -1)
|
||||
return;
|
||||
|
||||
if (clock_gettime(CLOCK_REALTIME, &now))
|
||||
err_perror("Failed to get CLOCK_REALTIME time");
|
||||
|
||||
pcap_frame(iov, iovcnt, offset, &now);
|
||||
clock_gettime(CLOCK_REALTIME, &now);
|
||||
pcap_frame(iov, iovcnt, 0, &now);
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -163,15 +158,18 @@ void pcap_iov(const struct iovec *iov, size_t iovcnt, size_t offset)
|
|||
*/
|
||||
void pcap_init(struct ctx *c)
|
||||
{
|
||||
int flags = O_WRONLY | O_CREAT | O_TRUNC;
|
||||
|
||||
if (pcap_fd != -1)
|
||||
return;
|
||||
|
||||
if (!*c->pcap)
|
||||
return;
|
||||
|
||||
pcap_fd = output_file_open(c->pcap, O_WRONLY);
|
||||
flags |= c->foreground ? O_CLOEXEC : 0;
|
||||
pcap_fd = open(c->pcap, flags, S_IRUSR | S_IWUSR);
|
||||
if (pcap_fd == -1) {
|
||||
err_perror("Couldn't open pcap file %s", c->pcap);
|
||||
perror("open");
|
||||
return;
|
||||
}
|
||||
|
||||
|
|
2
pcap.h
2
pcap.h
|
@ -9,7 +9,7 @@
|
|||
void pcap(const char *pkt, size_t l2len);
|
||||
void pcap_multiple(const struct iovec *iov, size_t frame_parts, unsigned int n,
|
||||
size_t offset);
|
||||
void pcap_iov(const struct iovec *iov, size_t iovcnt, size_t offset);
|
||||
void pcap_iov(const struct iovec *iov, size_t iovcnt);
|
||||
void pcap_init(struct ctx *c);
|
||||
|
||||
#endif /* PCAP_H */
|
||||
|
|
42
pif.c
42
pif.c
|
@ -59,45 +59,3 @@ void pif_sockaddr(const struct ctx *c, union sockaddr_inany *sa, socklen_t *sl,
|
|||
*sl = sizeof(sa->sa6);
|
||||
}
|
||||
}
|
||||
|
||||
/** pif_sock_l4() - Open a socket bound to an address on a specified interface
|
||||
* @c: Execution context
|
||||
* @type: Socket epoll type
|
||||
* @pif: Interface for this socket
|
||||
* @addr: Address to bind to, or NULL for dual-stack any
|
||||
* @ifname: Interface for binding, NULL for any
|
||||
* @port: Port number to bind to (host byte order)
|
||||
* @data: epoll reference portion for protocol handlers
|
||||
*
|
||||
* NOTE: For namespace pifs, this must be called having already entered the
|
||||
* relevant namespace.
|
||||
*
|
||||
* Return: newly created socket, negative error code on failure
|
||||
*/
|
||||
int pif_sock_l4(const struct ctx *c, enum epoll_type type, uint8_t pif,
|
||||
const union inany_addr *addr, const char *ifname,
|
||||
in_port_t port, uint32_t data)
|
||||
{
|
||||
union sockaddr_inany sa = {
|
||||
.sa6.sin6_family = AF_INET6,
|
||||
.sa6.sin6_addr = in6addr_any,
|
||||
.sa6.sin6_port = htons(port),
|
||||
};
|
||||
socklen_t sl;
|
||||
|
||||
ASSERT(pif_is_socket(pif));
|
||||
|
||||
if (pif == PIF_SPLICE) {
|
||||
/* Sanity checks */
|
||||
ASSERT(!ifname);
|
||||
ASSERT(addr && inany_is_loopback(addr));
|
||||
}
|
||||
|
||||
if (!addr)
|
||||
return sock_l4_sa(c, type, &sa, sizeof(sa.sa6),
|
||||
ifname, false, data);
|
||||
|
||||
pif_sockaddr(c, &sa, &sl, pif, addr, port);
|
||||
return sock_l4_sa(c, type, &sa, sl,
|
||||
ifname, sa.sa_family == AF_INET6, data);
|
||||
}
|
||||
|
|
3
pif.h
3
pif.h
|
@ -59,8 +59,5 @@ static inline bool pif_is_socket(uint8_t pif)
|
|||
|
||||
void pif_sockaddr(const struct ctx *c, union sockaddr_inany *sa, socklen_t *sl,
|
||||
uint8_t pif, const union inany_addr *addr, in_port_t port);
|
||||
int pif_sock_l4(const struct ctx *c, enum epoll_type type, uint8_t pif,
|
||||
const union inany_addr *addr, const char *ifname,
|
||||
in_port_t port, uint32_t data);
|
||||
|
||||
#endif /* PIF_H */
|
||||
|
|
14
seccomp.sh
14
seccomp.sh
|
@ -20,15 +20,6 @@ OUT="$(mktemp)"
|
|||
[ -z "${ARCH}" ] && ARCH="$(uname -m)"
|
||||
[ -z "${CC}" ] && CC="cc"
|
||||
|
||||
# Derive the AUDIT_ARCH_* constant name from the machine name in ${ARCH}:
# upper-case it, then rewrite the names that don't match it literally. The tr
# sets and ${ARCH} are quoted so that the shell can't glob-expand or split them.
AUDIT_ARCH="AUDIT_ARCH_$(echo "${ARCH}" | tr '[a-z]' '[A-Z]' \
	| sed 's/^ARM.*/ARM/' \
	| sed 's/I[456]86/I386/' \
	| sed 's/PPC64/PPC/' \
	| sed 's/PPCLE/PPC64LE/' \
	| sed 's/MIPS64EL/MIPSEL64/' \
	| sed 's/HPPA/PARISC/' \
	| sed 's/SH4/SH/')"
|
||||
|
||||
HEADER="/* This file was automatically generated by $(basename ${0}) */
|
||||
|
||||
#ifndef AUDIT_ARCH_PPC64LE
|
||||
|
@ -41,7 +32,7 @@ struct sock_filter filter_@PROFILE@[] = {
|
|||
/* cppcheck-suppress [badBitmaskCheck, unmatchedSuppression] */
|
||||
BPF_STMT(BPF_LD | BPF_W | BPF_ABS,
|
||||
(offsetof(struct seccomp_data, arch))),
|
||||
BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, @AUDIT_ARCH@, 0, @KILL@),
|
||||
BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, PASST_AUDIT_ARCH, 0, @KILL@),
|
||||
/* cppcheck-suppress [badBitmaskCheck, unmatchedSuppression] */
|
||||
BPF_STMT(BPF_LD | BPF_W | BPF_ABS,
|
||||
(offsetof(struct seccomp_data, nr))),
|
||||
|
@ -242,8 +233,7 @@ gen_profile() {
|
|||
sub ${__i} CALL "NR:${__nr}" "NAME:${__name}" "ALLOW:${__allow}"
|
||||
done
|
||||
|
||||
finish PRE "PROFILE:${__profile}" "KILL:$(( __statements + 1))" \
|
||||
"AUDIT_ARCH:${AUDIT_ARCH}"
|
||||
finish PRE "PROFILE:${__profile}" "KILL:$(( __statements + 1))"
|
||||
}
|
||||
|
||||
printf '%s\n' "${HEADER}" > "${OUT}"
|
||||
|
|
142
tap.c
142
tap.c
|
@ -172,15 +172,11 @@ void tap_udp4_send(const struct ctx *c, struct in_addr src, in_port_t sport,
|
|||
struct iphdr *ip4h = tap_push_l2h(c, buf, ETH_P_IP);
|
||||
struct udphdr *uh = tap_push_ip4h(ip4h, src, dst, l4len, IPPROTO_UDP);
|
||||
char *data = (char *)(uh + 1);
|
||||
const struct iovec iov = {
|
||||
.iov_base = (void *)in,
|
||||
.iov_len = dlen
|
||||
};
|
||||
|
||||
uh->source = htons(sport);
|
||||
uh->dest = htons(dport);
|
||||
uh->len = htons(l4len);
|
||||
csum_udp4(uh, src, dst, &iov, 1, 0);
|
||||
csum_udp4(uh, src, dst, in, dlen);
|
||||
memcpy(data, in, dlen);
|
||||
|
||||
tap_send_single(c, buf, dlen + (data - buf));
|
||||
|
@ -251,7 +247,7 @@ static void *tap_push_ip6h(struct ipv6hdr *ip6h,
|
|||
void tap_udp6_send(const struct ctx *c,
|
||||
const struct in6_addr *src, in_port_t sport,
|
||||
const struct in6_addr *dst, in_port_t dport,
|
||||
uint32_t flow, void *in, size_t dlen)
|
||||
uint32_t flow, const void *in, size_t dlen)
|
||||
{
|
||||
size_t l4len = dlen + sizeof(struct udphdr);
|
||||
char buf[USHRT_MAX];
|
||||
|
@ -259,15 +255,11 @@ void tap_udp6_send(const struct ctx *c,
|
|||
struct udphdr *uh = tap_push_ip6h(ip6h, src, dst,
|
||||
l4len, IPPROTO_UDP, flow);
|
||||
char *data = (char *)(uh + 1);
|
||||
const struct iovec iov = {
|
||||
.iov_base = in,
|
||||
.iov_len = dlen
|
||||
};
|
||||
|
||||
uh->source = htons(sport);
|
||||
uh->dest = htons(dport);
|
||||
uh->len = htons(l4len);
|
||||
csum_udp6(uh, src, dst, &iov, 1, 0);
|
||||
csum_udp6(uh, src, dst, in, dlen);
|
||||
memcpy(data, in, dlen);
|
||||
|
||||
tap_send_single(c, buf, dlen + (data - buf));
|
||||
|
@ -990,17 +982,24 @@ static void tap_sock_reset(struct ctx *c)
|
|||
}
|
||||
|
||||
/**
|
||||
* tap_passt_input() - Handler for new data on the socket to qemu
|
||||
* tap_handler_passt() - Packet handler for AF_UNIX file descriptor
|
||||
* @c: Execution context
|
||||
* @events: epoll events
|
||||
* @now: Current timestamp
|
||||
*/
|
||||
static void tap_passt_input(struct ctx *c, const struct timespec *now)
|
||||
void tap_handler_passt(struct ctx *c, uint32_t events,
|
||||
const struct timespec *now)
|
||||
{
|
||||
static const char *partial_frame;
|
||||
static ssize_t partial_len = 0;
|
||||
ssize_t n;
|
||||
char *p;
|
||||
|
||||
if (events & (EPOLLRDHUP | EPOLLHUP | EPOLLERR)) {
|
||||
tap_sock_reset(c);
|
||||
return;
|
||||
}
|
||||
|
||||
tap_flush_pools();
|
||||
|
||||
if (partial_len) {
|
||||
|
@ -1011,13 +1010,10 @@ static void tap_passt_input(struct ctx *c, const struct timespec *now)
|
|||
memmove(pkt_buf, partial_frame, partial_len);
|
||||
}
|
||||
|
||||
do {
|
||||
n = recv(c->fd_tap, pkt_buf + partial_len,
|
||||
TAP_BUF_BYTES - partial_len, MSG_DONTWAIT);
|
||||
} while ((n < 0) && errno == EINTR);
|
||||
|
||||
n = recv(c->fd_tap, pkt_buf + partial_len, TAP_BUF_BYTES - partial_len,
|
||||
MSG_DONTWAIT);
|
||||
if (n < 0) {
|
||||
if (errno != EAGAIN && errno != EWOULDBLOCK) {
|
||||
if (errno != EINTR && errno != EAGAIN && errno != EWOULDBLOCK) {
|
||||
err_perror("Receive error on guest connection, reset");
|
||||
tap_sock_reset(c);
|
||||
}
|
||||
|
@ -1055,63 +1051,6 @@ static void tap_passt_input(struct ctx *c, const struct timespec *now)
|
|||
tap_handler(c, now);
|
||||
}
|
||||
|
||||
/**
 * tap_handler_passt() - Event handler for AF_UNIX file descriptor
 * @c:		Execution context
 * @events:	epoll events
 * @now:	Current timestamp
 *
 * Hangup and error events take precedence: the tap socket is reset and any
 * pending input is not read. Otherwise, input is processed if EPOLLIN is set.
 */
void tap_handler_passt(struct ctx *c, uint32_t events,
		       const struct timespec *now)
{
	const uint32_t gone = EPOLLRDHUP | EPOLLHUP | EPOLLERR;

	if (!(events & gone)) {
		if (events & EPOLLIN)
			tap_passt_input(c, now);

		return;
	}

	tap_sock_reset(c);
}
|
||||
|
||||
/**
|
||||
* tap_pasta_input() - Handler for new data on the socket to hypervisor
|
||||
* @c: Execution context
|
||||
* @now: Current timestamp
|
||||
*/
|
||||
static void tap_pasta_input(struct ctx *c, const struct timespec *now)
|
||||
{
|
||||
ssize_t n, len;
|
||||
|
||||
tap_flush_pools();
|
||||
|
||||
for (n = 0; n <= (ssize_t)(TAP_BUF_BYTES - ETH_MAX_MTU); n += len) {
|
||||
len = read(c->fd_tap, pkt_buf + n, ETH_MAX_MTU);
|
||||
|
||||
if (len == 0) {
|
||||
die("EOF on tap device, exiting");
|
||||
} else if (len < 0) {
|
||||
if (errno == EINTR) {
|
||||
len = 0;
|
||||
continue;
|
||||
}
|
||||
|
||||
if (errno == EAGAIN && errno == EWOULDBLOCK)
|
||||
break; /* all done for now */
|
||||
|
||||
die("Error on tap device, exiting");
|
||||
}
|
||||
|
||||
/* Ignore frames of bad length */
|
||||
if (len < (ssize_t)sizeof(struct ethhdr) ||
|
||||
len > (ssize_t)ETH_MAX_MTU)
|
||||
continue;
|
||||
|
||||
tap_add_packet(c, len, pkt_buf + n);
|
||||
}
|
||||
|
||||
tap_handler(c, now);
|
||||
}
|
||||
|
||||
/**
|
||||
* tap_handler_pasta() - Packet handler for /dev/net/tun file descriptor
|
||||
* @c: Execution context
|
||||
|
@ -1121,11 +1060,46 @@ static void tap_pasta_input(struct ctx *c, const struct timespec *now)
|
|||
void tap_handler_pasta(struct ctx *c, uint32_t events,
|
||||
const struct timespec *now)
|
||||
{
|
||||
ssize_t n, len;
|
||||
int ret;
|
||||
|
||||
if (events & (EPOLLRDHUP | EPOLLHUP | EPOLLERR))
|
||||
die("Disconnect event on /dev/net/tun device, exiting");
|
||||
|
||||
if (events & EPOLLIN)
|
||||
tap_pasta_input(c, now);
|
||||
redo:
|
||||
n = 0;
|
||||
|
||||
tap_flush_pools();
|
||||
restart:
|
||||
while ((len = read(c->fd_tap, pkt_buf + n, TAP_BUF_BYTES - n)) > 0) {
|
||||
|
||||
if (len < (ssize_t)sizeof(struct ethhdr) ||
|
||||
len > (ssize_t)ETH_MAX_MTU) {
|
||||
n += len;
|
||||
continue;
|
||||
}
|
||||
|
||||
|
||||
tap_add_packet(c, len, pkt_buf + n);
|
||||
|
||||
if ((n += len) == TAP_BUF_BYTES)
|
||||
break;
|
||||
}
|
||||
|
||||
if (len < 0 && errno == EINTR)
|
||||
goto restart;
|
||||
|
||||
ret = errno;
|
||||
|
||||
tap_handler(c, now);
|
||||
|
||||
if (len > 0 || ret == EAGAIN)
|
||||
return;
|
||||
|
||||
if (n == TAP_BUF_BYTES)
|
||||
goto redo;
|
||||
|
||||
die("Error on tap device, exiting");
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -1136,7 +1110,7 @@ void tap_handler_pasta(struct ctx *c, uint32_t events,
|
|||
*/
|
||||
int tap_sock_unix_open(char *sock_path)
|
||||
{
|
||||
int fd = socket(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0);
|
||||
int fd = socket(AF_UNIX, SOCK_STREAM, 0);
|
||||
struct sockaddr_un addr = {
|
||||
.sun_family = AF_UNIX,
|
||||
};
|
||||
|
@ -1151,12 +1125,10 @@ int tap_sock_unix_open(char *sock_path)
|
|||
|
||||
if (*sock_path)
|
||||
memcpy(path, sock_path, UNIX_PATH_MAX);
|
||||
else if (snprintf_check(path, UNIX_PATH_MAX - 1,
|
||||
UNIX_SOCK_PATH, i))
|
||||
die_perror("Can't build UNIX domain socket path");
|
||||
else
|
||||
snprintf(path, UNIX_PATH_MAX - 1, UNIX_SOCK_PATH, i);
|
||||
|
||||
ex = socket(AF_UNIX, SOCK_STREAM | SOCK_NONBLOCK | SOCK_CLOEXEC,
|
||||
0);
|
||||
ex = socket(AF_UNIX, SOCK_STREAM | SOCK_NONBLOCK, 0);
|
||||
if (ex < 0)
|
||||
die_perror("Failed to check for UNIX domain conflicts");
|
||||
|
||||
|
@ -1289,7 +1261,7 @@ static int tap_ns_tun(void *arg)
|
|||
if (fd < 0)
|
||||
die_perror("Failed to open() /dev/net/tun");
|
||||
|
||||
rc = ioctl(fd, (int)TUNSETIFF, &ifr);
|
||||
rc = ioctl(fd, TUNSETIFF, &ifr);
|
||||
if (rc < 0)
|
||||
die_perror("TUNSETIFF ioctl on /dev/net/tun failed");
|
||||
|
||||
|
|
2
tap.h
2
tap.h
|
@ -53,7 +53,7 @@ const struct in6_addr *tap_ip6_daddr(const struct ctx *c,
|
|||
void tap_udp6_send(const struct ctx *c,
|
||||
const struct in6_addr *src, in_port_t sport,
|
||||
const struct in6_addr *dst, in_port_t dport,
|
||||
uint32_t flow, void *in, size_t dlen);
|
||||
uint32_t flow, const void *in, size_t dlen);
|
||||
void tap_icmp6_send(const struct ctx *c,
|
||||
const struct in6_addr *src, const struct in6_addr *dst,
|
||||
const void *in, size_t l4len);
|
||||
|
|
447
tcp.c
447
tcp.c
|
@ -274,7 +274,6 @@
|
|||
#include <net/if.h>
|
||||
#include <netinet/in.h>
|
||||
#include <netinet/ip.h>
|
||||
#include <netinet/tcp.h>
|
||||
#include <stdint.h>
|
||||
#include <stdbool.h>
|
||||
#include <stddef.h>
|
||||
|
@ -287,6 +286,8 @@
|
|||
#include <time.h>
|
||||
#include <arpa/inet.h>
|
||||
|
||||
#include <linux/tcp.h> /* For struct tcp_info */
|
||||
|
||||
#include "checksum.h"
|
||||
#include "util.h"
|
||||
#include "iov.h"
|
||||
|
@ -299,7 +300,6 @@
|
|||
#include "log.h"
|
||||
#include "inany.h"
|
||||
#include "flow.h"
|
||||
#include "linux_dep.h"
|
||||
|
||||
#include "flow_table.h"
|
||||
#include "tcp_internal.h"
|
||||
|
@ -308,6 +308,11 @@
|
|||
/* MSS rounding: see SET_MSS() */
|
||||
#define MSS_DEFAULT 536
|
||||
#define WINDOW_DEFAULT 14600 /* RFC 6928 */
|
||||
#ifdef HAS_SND_WND
|
||||
# define KERNEL_REPORTS_SND_WND(c) ((c)->tcp.kernel_snd_wnd)
|
||||
#else
|
||||
# define KERNEL_REPORTS_SND_WND(c) (0 && (c))
|
||||
#endif
|
||||
|
||||
#define ACK_INTERVAL 10 /* ms */
|
||||
#define SYN_TIMEOUT 10 /* s */
|
||||
|
@ -318,6 +323,11 @@
|
|||
#define LOW_RTT_TABLE_SIZE 8
|
||||
#define LOW_RTT_THRESHOLD 10 /* us */
|
||||
|
||||
/* We need to include <linux/tcp.h> for tcpi_bytes_acked, instead of
|
||||
* <netinet/tcp.h>, but that doesn't include a definition for SOL_TCP
|
||||
*/
|
||||
#define SOL_TCP IPPROTO_TCP
|
||||
|
||||
#define ACK_IF_NEEDED 0 /* See tcp_send_flag() */
|
||||
|
||||
#define CONN_IS_CLOSING(conn) \
|
||||
|
@ -361,20 +371,6 @@ char tcp_buf_discard [MAX_WINDOW];
|
|||
/* Does the kernel support TCP_PEEK_OFF? */
|
||||
bool peek_offset_cap;
|
||||
|
||||
/* Size of data returned by TCP_INFO getsockopt() */
|
||||
socklen_t tcp_info_size;
|
||||
|
||||
#define tcp_info_cap(f_) \
|
||||
((offsetof(struct tcp_info_linux, tcpi_##f_) + \
|
||||
sizeof(((struct tcp_info_linux *)NULL)->tcpi_##f_)) <= tcp_info_size)
|
||||
|
||||
/* Kernel reports sending window in TCP_INFO (kernel commit 8f7baad7f035) */
|
||||
#define snd_wnd_cap tcp_info_cap(snd_wnd)
|
||||
/* Kernel reports bytes acked in TCP_INFO (kernel commit 0df48c26d84) */
|
||||
#define bytes_acked_cap tcp_info_cap(bytes_acked)
|
||||
/* Kernel reports minimum RTT in TCP_INFO (kernel commit cd9b266095f4) */
|
||||
#define min_rtt_cap tcp_info_cap(min_rtt)
|
||||
|
||||
/* sendmsg() to socket */
|
||||
static struct iovec tcp_iov [UIO_MAXIOV];
|
||||
|
||||
|
@ -428,23 +424,27 @@ int tcp_set_peek_offset(int s, int offset)
|
|||
*/
|
||||
static uint32_t tcp_conn_epoll_events(uint8_t events, uint8_t conn_flags)
|
||||
{
|
||||
uint32_t rdhup;
|
||||
|
||||
if (!events)
|
||||
return 0;
|
||||
|
||||
rdhup = (events & SOCK_FIN_RCVD) ? 0 : EPOLLRDHUP;
|
||||
|
||||
if (events & ESTABLISHED) {
|
||||
if (events & TAP_FIN_SENT)
|
||||
return EPOLLET;
|
||||
|
||||
if (conn_flags & STALLED)
|
||||
return EPOLLIN | EPOLLOUT | EPOLLRDHUP | EPOLLET;
|
||||
return EPOLLIN | EPOLLOUT | rdhup | EPOLLET;
|
||||
|
||||
return EPOLLIN | EPOLLRDHUP;
|
||||
return EPOLLIN | rdhup;
|
||||
}
|
||||
|
||||
if (events == TAP_SYN_RCVD)
|
||||
return EPOLLOUT | EPOLLET | EPOLLRDHUP;
|
||||
return EPOLLOUT | EPOLLET | rdhup;
|
||||
|
||||
return EPOLLET | EPOLLRDHUP;
|
||||
return rdhup;
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -549,8 +549,7 @@ static void tcp_timer_ctl(const struct ctx *c, struct tcp_tap_conn *conn)
|
|||
(unsigned long long)it.it_value.tv_sec,
|
||||
(unsigned long long)it.it_value.tv_nsec / 1000 / 1000);
|
||||
|
||||
if (timerfd_settime(conn->timer, 0, &it, NULL))
|
||||
flow_err(conn, "failed to set timer: %s", strerror(errno));
|
||||
timerfd_settime(conn->timer, 0, &it, NULL);
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -680,12 +679,13 @@ static int tcp_rtt_dst_low(const struct tcp_tap_conn *conn)
|
|||
* @tinfo: Pointer to struct tcp_info for socket
|
||||
*/
|
||||
static void tcp_rtt_dst_check(const struct tcp_tap_conn *conn,
|
||||
const struct tcp_info_linux *tinfo)
|
||||
const struct tcp_info *tinfo)
|
||||
{
|
||||
#ifdef HAS_MIN_RTT
|
||||
const struct flowside *tapside = TAPFLOW(conn);
|
||||
int i, hole = -1;
|
||||
|
||||
if (!min_rtt_cap ||
|
||||
if (!tinfo->tcpi_min_rtt ||
|
||||
(int)tinfo->tcpi_min_rtt > LOW_RTT_THRESHOLD)
|
||||
return;
|
||||
|
||||
|
@ -706,6 +706,10 @@ static void tcp_rtt_dst_check(const struct tcp_tap_conn *conn,
|
|||
if (hole == LOW_RTT_TABLE_SIZE)
|
||||
hole = 0;
|
||||
inany_from_af(low_rtt_dst + hole, AF_INET6, &in6addr_any);
|
||||
#else
|
||||
(void)conn;
|
||||
(void)tinfo;
|
||||
#endif /* HAS_MIN_RTT */
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -752,106 +756,34 @@ static void tcp_sock_set_bufsize(const struct ctx *c, int s)
|
|||
}
|
||||
|
||||
/**
|
||||
* tcp_update_check_tcp4() - Calculate TCP checksum for IPv4
|
||||
* tcp_update_check_tcp4() - Update TCP checksum from stored one
|
||||
* @iph: IPv4 header
|
||||
* @iov: Pointer to the array of IO vectors
|
||||
* @iov_cnt: Length of the array
|
||||
* @l4offset: IPv4 payload offset in the iovec array
|
||||
* @th: TCP header followed by TCP payload
|
||||
*/
|
||||
static void tcp_update_check_tcp4(const struct iphdr *iph,
|
||||
const struct iovec *iov, int iov_cnt,
|
||||
size_t l4offset)
|
||||
static void tcp_update_check_tcp4(const struct iphdr *iph, struct tcphdr *th)
|
||||
{
|
||||
uint16_t l4len = ntohs(iph->tot_len) - sizeof(struct iphdr);
|
||||
struct in_addr saddr = { .s_addr = iph->saddr };
|
||||
struct in_addr daddr = { .s_addr = iph->daddr };
|
||||
size_t check_ofs;
|
||||
uint16_t *check;
|
||||
int check_idx;
|
||||
uint32_t sum;
|
||||
char *ptr;
|
||||
uint32_t sum = proto_ipv4_header_psum(l4len, IPPROTO_TCP, saddr, daddr);
|
||||
|
||||
sum = proto_ipv4_header_psum(l4len, IPPROTO_TCP, saddr, daddr);
|
||||
|
||||
check_idx = iov_skip_bytes(iov, iov_cnt,
|
||||
l4offset + offsetof(struct tcphdr, check),
|
||||
&check_ofs);
|
||||
|
||||
if (check_idx >= iov_cnt) {
|
||||
err("TCP4 buffer is too small, iov size %zd, check offset %zd",
|
||||
iov_size(iov, iov_cnt),
|
||||
l4offset + offsetof(struct tcphdr, check));
|
||||
return;
|
||||
}
|
||||
|
||||
if (check_ofs + sizeof(*check) > iov[check_idx].iov_len) {
|
||||
err("TCP4 checksum field memory is not contiguous "
|
||||
"check_ofs %zd check_idx %d iov_len %zd",
|
||||
check_ofs, check_idx, iov[check_idx].iov_len);
|
||||
return;
|
||||
}
|
||||
|
||||
ptr = (char *)iov[check_idx].iov_base + check_ofs;
|
||||
if ((uintptr_t)ptr & (__alignof__(*check) - 1)) {
|
||||
err("TCP4 checksum field is not correctly aligned in memory");
|
||||
return;
|
||||
}
|
||||
|
||||
check = (uint16_t *)ptr;
|
||||
|
||||
*check = 0;
|
||||
*check = csum_iov(iov, iov_cnt, l4offset, sum);
|
||||
th->check = 0;
|
||||
th->check = csum(th, l4len, sum);
|
||||
}
|
||||
|
||||
/**
|
||||
* tcp_update_check_tcp6() - Calculate TCP checksum for IPv6
|
||||
* @ip6h: IPv6 header
|
||||
* @iov: Pointer to the array of IO vectors
|
||||
* @iov_cnt: Length of the array
|
||||
* @l4offset: IPv6 payload offset in the iovec array
|
||||
* @th: TCP header followed by TCP payload
|
||||
*/
|
||||
static void tcp_update_check_tcp6(const struct ipv6hdr *ip6h,
|
||||
const struct iovec *iov, int iov_cnt,
|
||||
size_t l4offset)
|
||||
static void tcp_update_check_tcp6(struct ipv6hdr *ip6h, struct tcphdr *th)
|
||||
{
|
||||
uint16_t l4len = ntohs(ip6h->payload_len);
|
||||
size_t check_ofs;
|
||||
uint16_t *check;
|
||||
int check_idx;
|
||||
uint32_t sum;
|
||||
char *ptr;
|
||||
uint32_t sum = proto_ipv6_header_psum(l4len, IPPROTO_TCP,
|
||||
&ip6h->saddr, &ip6h->daddr);
|
||||
|
||||
sum = proto_ipv6_header_psum(l4len, IPPROTO_TCP, &ip6h->saddr,
|
||||
&ip6h->daddr);
|
||||
|
||||
check_idx = iov_skip_bytes(iov, iov_cnt,
|
||||
l4offset + offsetof(struct tcphdr, check),
|
||||
&check_ofs);
|
||||
|
||||
if (check_idx >= iov_cnt) {
|
||||
err("TCP6 buffer is too small, iov size %zd, check offset %zd",
|
||||
iov_size(iov, iov_cnt),
|
||||
l4offset + offsetof(struct tcphdr, check));
|
||||
return;
|
||||
}
|
||||
|
||||
if (check_ofs + sizeof(*check) > iov[check_idx].iov_len) {
|
||||
err("TCP6 checksum field memory is not contiguous "
|
||||
"check_ofs %zd check_idx %d iov_len %zd",
|
||||
check_ofs, check_idx, iov[check_idx].iov_len);
|
||||
return;
|
||||
}
|
||||
|
||||
ptr = (char *)iov[check_idx].iov_base + check_ofs;
|
||||
if ((uintptr_t)ptr & (__alignof__(*check) - 1)) {
|
||||
err("TCP6 checksum field is not correctly aligned in memory");
|
||||
return;
|
||||
}
|
||||
|
||||
check = (uint16_t *)ptr;
|
||||
|
||||
*check = 0;
|
||||
*check = csum_iov(iov, iov_cnt, l4offset, sum);
|
||||
th->check = 0;
|
||||
th->check = csum(th, l4len, sum);
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -937,6 +869,7 @@ bool tcp_flow_defer(const struct tcp_tap_conn *conn)
|
|||
/* cppcheck-suppress [constParameterPointer, unmatchedSuppression] */
|
||||
void tcp_defer_handler(struct ctx *c)
|
||||
{
|
||||
tcp_flags_flush(c);
|
||||
tcp_payload_flush(c);
|
||||
}
|
||||
|
||||
|
@ -967,27 +900,26 @@ static void tcp_fill_header(struct tcphdr *th,
|
|||
|
||||
/**
|
||||
* tcp_fill_headers4() - Fill 802.3, IPv4, TCP headers in pre-cooked buffers
|
||||
* @conn: Connection pointer
|
||||
* @taph: tap backend specific header
|
||||
* @iph: Pointer to IPv4 header
|
||||
* @bp: Pointer to TCP header followed by TCP payload
|
||||
* @dlen: TCP payload length
|
||||
* @check: Checksum, if already known
|
||||
* @seq: Sequence number for this segment
|
||||
* @no_tcp_csum: Do not set TCP checksum
|
||||
* @conn: Connection pointer
|
||||
* @taph: tap backend specific header
|
||||
* @iph: Pointer to IPv4 header
|
||||
* @th: Pointer to TCP header
|
||||
* @dlen: TCP payload length
|
||||
* @check: Checksum, if already known
|
||||
* @seq: Sequence number for this segment
|
||||
*
|
||||
* Return: The IPv4 payload length, host order
|
||||
*/
|
||||
static size_t tcp_fill_headers4(const struct tcp_tap_conn *conn,
|
||||
struct tap_hdr *taph,
|
||||
struct iphdr *iph, struct tcp_payload_t *bp,
|
||||
struct iphdr *iph, struct tcphdr *th,
|
||||
size_t dlen, const uint16_t *check,
|
||||
uint32_t seq, bool no_tcp_csum)
|
||||
uint32_t seq)
|
||||
{
|
||||
const struct flowside *tapside = TAPFLOW(conn);
|
||||
const struct in_addr *src4 = inany_v4(&tapside->oaddr);
|
||||
const struct in_addr *dst4 = inany_v4(&tapside->eaddr);
|
||||
size_t l4len = dlen + sizeof(bp->th);
|
||||
size_t l4len = dlen + sizeof(*th);
|
||||
size_t l3len = l4len + sizeof(*iph);
|
||||
|
||||
ASSERT(src4 && dst4);
|
||||
|
@ -999,18 +931,9 @@ static size_t tcp_fill_headers4(const struct tcp_tap_conn *conn,
|
|||
iph->check = check ? *check :
|
||||
csum_ip4_header(l3len, IPPROTO_TCP, *src4, *dst4);
|
||||
|
||||
tcp_fill_header(&bp->th, conn, seq);
|
||||
tcp_fill_header(th, conn, seq);
|
||||
|
||||
if (no_tcp_csum) {
|
||||
bp->th.check = 0;
|
||||
} else {
|
||||
const struct iovec iov = {
|
||||
.iov_base = bp,
|
||||
.iov_len = ntohs(iph->tot_len) - sizeof(struct iphdr),
|
||||
};
|
||||
|
||||
tcp_update_check_tcp4(iph, &iov, 1, 0);
|
||||
}
|
||||
tcp_update_check_tcp4(iph, th);
|
||||
|
||||
tap_hdr_update(taph, l3len + sizeof(struct ethhdr));
|
||||
|
||||
|
@ -1019,24 +942,23 @@ static size_t tcp_fill_headers4(const struct tcp_tap_conn *conn,
|
|||
|
||||
/**
|
||||
* tcp_fill_headers6() - Fill 802.3, IPv6, TCP headers in pre-cooked buffers
|
||||
* @conn: Connection pointer
|
||||
* @taph: tap backend specific header
|
||||
* @ip6h: Pointer to IPv6 header
|
||||
* @bp: Pointer to TCP header followed by TCP payload
|
||||
* @dlen: TCP payload length
|
||||
* @check: Checksum, if already known
|
||||
* @seq: Sequence number for this segment
|
||||
* @no_tcp_csum: Do not set TCP checksum
|
||||
* @conn: Connection pointer
|
||||
* @taph: tap backend specific header
|
||||
* @ip6h: Pointer to IPv6 header
|
||||
* @th: Pointer to TCP header
|
||||
* @dlen: TCP payload length
|
||||
* @check: Checksum, if already known
|
||||
* @seq: Sequence number for this segment
|
||||
*
|
||||
* Return: The IPv6 payload length, host order
|
||||
*/
|
||||
static size_t tcp_fill_headers6(const struct tcp_tap_conn *conn,
|
||||
struct tap_hdr *taph,
|
||||
struct ipv6hdr *ip6h, struct tcp_payload_t *bp,
|
||||
size_t dlen, uint32_t seq, bool no_tcp_csum)
|
||||
struct ipv6hdr *ip6h, struct tcphdr *th,
|
||||
size_t dlen, uint32_t seq)
|
||||
{
|
||||
const struct flowside *tapside = TAPFLOW(conn);
|
||||
size_t l4len = dlen + sizeof(bp->th);
|
||||
size_t l4len = dlen + sizeof(*th);
|
||||
|
||||
ip6h->payload_len = htons(l4len);
|
||||
ip6h->saddr = tapside->oaddr.a6;
|
||||
|
@ -1050,18 +972,9 @@ static size_t tcp_fill_headers6(const struct tcp_tap_conn *conn,
|
|||
ip6h->flow_lbl[1] = (conn->sock >> 8) & 0xff;
|
||||
ip6h->flow_lbl[2] = (conn->sock >> 0) & 0xff;
|
||||
|
||||
tcp_fill_header(&bp->th, conn, seq);
|
||||
tcp_fill_header(th, conn, seq);
|
||||
|
||||
if (no_tcp_csum) {
|
||||
bp->th.check = 0;
|
||||
} else {
|
||||
const struct iovec iov = {
|
||||
.iov_base = bp,
|
||||
.iov_len = ntohs(ip6h->payload_len)
|
||||
};
|
||||
|
||||
tcp_update_check_tcp6(ip6h, &iov, 1, 0);
|
||||
}
|
||||
tcp_update_check_tcp6(ip6h, th);
|
||||
|
||||
tap_hdr_update(taph, l4len + sizeof(*ip6h) + sizeof(struct ethhdr));
|
||||
|
||||
|
@ -1075,14 +988,12 @@ static size_t tcp_fill_headers6(const struct tcp_tap_conn *conn,
|
|||
* @dlen: TCP payload length
|
||||
* @check: Checksum, if already known
|
||||
* @seq: Sequence number for this segment
|
||||
* @no_tcp_csum: Do not set TCP checksum
|
||||
*
|
||||
* Return: IP payload length, host order
|
||||
*/
|
||||
size_t tcp_l2_buf_fill_headers(const struct tcp_tap_conn *conn,
|
||||
struct iovec *iov, size_t dlen,
|
||||
const uint16_t *check, uint32_t seq,
|
||||
bool no_tcp_csum)
|
||||
const uint16_t *check, uint32_t seq)
|
||||
{
|
||||
const struct flowside *tapside = TAPFLOW(conn);
|
||||
const struct in_addr *a4 = inany_v4(&tapside->oaddr);
|
||||
|
@ -1091,13 +1002,13 @@ size_t tcp_l2_buf_fill_headers(const struct tcp_tap_conn *conn,
|
|||
return tcp_fill_headers4(conn, iov[TCP_IOV_TAP].iov_base,
|
||||
iov[TCP_IOV_IP].iov_base,
|
||||
iov[TCP_IOV_PAYLOAD].iov_base, dlen,
|
||||
check, seq, no_tcp_csum);
|
||||
check, seq);
|
||||
}
|
||||
|
||||
return tcp_fill_headers6(conn, iov[TCP_IOV_TAP].iov_base,
|
||||
iov[TCP_IOV_IP].iov_base,
|
||||
iov[TCP_IOV_PAYLOAD].iov_base, dlen,
|
||||
seq, no_tcp_csum);
|
||||
seq);
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -1110,41 +1021,42 @@ size_t tcp_l2_buf_fill_headers(const struct tcp_tap_conn *conn,
|
|||
* Return: 1 if sequence or window were updated, 0 otherwise
|
||||
*/
|
||||
int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_tap_conn *conn,
|
||||
bool force_seq, struct tcp_info_linux *tinfo)
|
||||
int force_seq, struct tcp_info *tinfo)
|
||||
{
|
||||
uint32_t prev_wnd_to_tap = conn->wnd_to_tap << conn->ws_to_tap;
|
||||
uint32_t prev_ack_to_tap = conn->seq_ack_to_tap;
|
||||
/* cppcheck-suppress [ctunullpointer, unmatchedSuppression] */
|
||||
socklen_t sl = sizeof(*tinfo);
|
||||
struct tcp_info_linux tinfo_new;
|
||||
struct tcp_info tinfo_new;
|
||||
uint32_t new_wnd_to_tap = prev_wnd_to_tap;
|
||||
int s = conn->sock;
|
||||
|
||||
if (!bytes_acked_cap) {
|
||||
#ifndef HAS_BYTES_ACKED
|
||||
(void)force_seq;
|
||||
|
||||
conn->seq_ack_to_tap = conn->seq_from_tap;
|
||||
if (SEQ_LT(conn->seq_ack_to_tap, prev_ack_to_tap))
|
||||
conn->seq_ack_to_tap = prev_ack_to_tap;
|
||||
#else
|
||||
if ((unsigned)SNDBUF_GET(conn) < SNDBUF_SMALL || tcp_rtt_dst_low(conn)
|
||||
|| CONN_IS_CLOSING(conn) || (conn->flags & LOCAL) || force_seq) {
|
||||
conn->seq_ack_to_tap = conn->seq_from_tap;
|
||||
} else if (conn->seq_ack_to_tap != conn->seq_from_tap) {
|
||||
if (!tinfo) {
|
||||
tinfo = &tinfo_new;
|
||||
if (getsockopt(s, SOL_TCP, TCP_INFO, tinfo, &sl))
|
||||
return 0;
|
||||
}
|
||||
|
||||
conn->seq_ack_to_tap = tinfo->tcpi_bytes_acked +
|
||||
conn->seq_init_from_tap;
|
||||
|
||||
if (SEQ_LT(conn->seq_ack_to_tap, prev_ack_to_tap))
|
||||
conn->seq_ack_to_tap = prev_ack_to_tap;
|
||||
} else {
|
||||
if ((unsigned)SNDBUF_GET(conn) < SNDBUF_SMALL ||
|
||||
tcp_rtt_dst_low(conn) || CONN_IS_CLOSING(conn) ||
|
||||
(conn->flags & LOCAL) || force_seq) {
|
||||
conn->seq_ack_to_tap = conn->seq_from_tap;
|
||||
} else if (conn->seq_ack_to_tap != conn->seq_from_tap) {
|
||||
if (!tinfo) {
|
||||
tinfo = &tinfo_new;
|
||||
if (getsockopt(s, SOL_TCP, TCP_INFO, tinfo, &sl))
|
||||
return 0;
|
||||
}
|
||||
|
||||
conn->seq_ack_to_tap = tinfo->tcpi_bytes_acked +
|
||||
conn->seq_init_from_tap;
|
||||
|
||||
if (SEQ_LT(conn->seq_ack_to_tap, prev_ack_to_tap))
|
||||
conn->seq_ack_to_tap = prev_ack_to_tap;
|
||||
}
|
||||
}
|
||||
#endif /* !HAS_BYTES_ACKED */
|
||||
|
||||
if (!snd_wnd_cap) {
|
||||
if (!KERNEL_REPORTS_SND_WND(c)) {
|
||||
tcp_get_sndbuf(conn);
|
||||
new_wnd_to_tap = MIN(SNDBUF_GET(conn), MAX_WINDOW);
|
||||
conn->wnd_to_tap = MIN(new_wnd_to_tap >> conn->ws_to_tap,
|
||||
|
@ -1155,13 +1067,14 @@ int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_tap_conn *conn,
|
|||
if (!tinfo) {
|
||||
if (prev_wnd_to_tap > WINDOW_DEFAULT) {
|
||||
goto out;
|
||||
}
|
||||
}
|
||||
tinfo = &tinfo_new;
|
||||
if (getsockopt(s, SOL_TCP, TCP_INFO, tinfo, &sl)) {
|
||||
goto out;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef HAS_SND_WND
|
||||
if ((conn->flags & LOCAL) || tcp_rtt_dst_low(conn)) {
|
||||
new_wnd_to_tap = tinfo->tcpi_snd_wnd;
|
||||
} else {
|
||||
|
@ -1169,6 +1082,7 @@ int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_tap_conn *conn,
|
|||
new_wnd_to_tap = MIN((int)tinfo->tcpi_snd_wnd,
|
||||
SNDBUF_GET(conn));
|
||||
}
|
||||
#endif
|
||||
|
||||
new_wnd_to_tap = MIN(new_wnd_to_tap, MAX_WINDOW);
|
||||
if (!(conn->events & ESTABLISHED))
|
||||
|
@ -1226,11 +1140,11 @@ static void tcp_update_seqack_from_tap(const struct ctx *c,
|
|||
* 0 if there is no flag to send
|
||||
* 1 otherwise
|
||||
*/
|
||||
int tcp_prepare_flags(const struct ctx *c, struct tcp_tap_conn *conn,
|
||||
int flags, struct tcphdr *th, struct tcp_syn_opts *opts,
|
||||
int tcp_prepare_flags(struct ctx *c, struct tcp_tap_conn *conn,
|
||||
int flags, struct tcphdr *th, char *data,
|
||||
size_t *optlen)
|
||||
{
|
||||
struct tcp_info_linux tinfo = { 0 };
|
||||
struct tcp_info tinfo = { 0 };
|
||||
socklen_t sl = sizeof(tinfo);
|
||||
int s = conn->sock;
|
||||
|
||||
|
@ -1243,16 +1157,27 @@ int tcp_prepare_flags(const struct ctx *c, struct tcp_tap_conn *conn,
|
|||
return -ECONNRESET;
|
||||
}
|
||||
|
||||
#ifdef HAS_SND_WND
|
||||
if (!c->tcp.kernel_snd_wnd && tinfo.tcpi_snd_wnd)
|
||||
c->tcp.kernel_snd_wnd = 1;
|
||||
#endif
|
||||
|
||||
if (!(conn->flags & LOCAL))
|
||||
tcp_rtt_dst_check(conn, &tinfo);
|
||||
|
||||
if (!tcp_update_seqack_wnd(c, conn, !!flags, &tinfo) && !flags)
|
||||
if (!tcp_update_seqack_wnd(c, conn, flags, &tinfo) && !flags)
|
||||
return 0;
|
||||
|
||||
*optlen = 0;
|
||||
if (flags & SYN) {
|
||||
int mss;
|
||||
|
||||
/* Options: MSS, NOP and window scale (8 bytes) */
|
||||
*optlen = OPT_MSS_LEN + 1 + OPT_WS_LEN;
|
||||
|
||||
*data++ = OPT_MSS;
|
||||
*data++ = OPT_MSS_LEN;
|
||||
|
||||
if (c->mtu == -1) {
|
||||
mss = tinfo.tcpi_snd_mss;
|
||||
} else {
|
||||
|
@ -1268,11 +1193,16 @@ int tcp_prepare_flags(const struct ctx *c, struct tcp_tap_conn *conn,
|
|||
else if (mss > PAGE_SIZE)
|
||||
mss = ROUND_DOWN(mss, PAGE_SIZE);
|
||||
}
|
||||
*(uint16_t *)data = htons(MIN(USHRT_MAX, mss));
|
||||
|
||||
data += OPT_MSS_LEN - 2;
|
||||
|
||||
conn->ws_to_tap = MIN(MAX_WS, tinfo.tcpi_snd_wscale);
|
||||
|
||||
*opts = TCP_SYN_OPTS(mss, conn->ws_to_tap);
|
||||
*optlen = sizeof(*opts);
|
||||
*data++ = OPT_NOP;
|
||||
*data++ = OPT_WS;
|
||||
*data++ = OPT_WS_LEN;
|
||||
*data++ = conn->ws_to_tap;
|
||||
} else if (!(flags & RST)) {
|
||||
flags |= ACK;
|
||||
}
|
||||
|
@ -1309,8 +1239,7 @@ int tcp_prepare_flags(const struct ctx *c, struct tcp_tap_conn *conn,
|
|||
*
|
||||
* Return: negative error code on connection reset, 0 otherwise
|
||||
*/
|
||||
static int tcp_send_flag(const struct ctx *c, struct tcp_tap_conn *conn,
|
||||
int flags)
|
||||
int tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags)
|
||||
{
|
||||
return tcp_buf_send_flag(c, conn, flags);
|
||||
}
|
||||
|
@ -1320,7 +1249,7 @@ static int tcp_send_flag(const struct ctx *c, struct tcp_tap_conn *conn,
|
|||
* @c: Execution context
|
||||
* @conn: Connection pointer
|
||||
*/
|
||||
void tcp_rst_do(const struct ctx *c, struct tcp_tap_conn *conn)
|
||||
void tcp_rst_do(struct ctx *c, struct tcp_tap_conn *conn)
|
||||
{
|
||||
if (conn->events == CLOSED)
|
||||
return;
|
||||
|
@ -1410,7 +1339,7 @@ static int tcp_conn_new_sock(const struct ctx *c, sa_family_t af)
|
|||
{
|
||||
int s;
|
||||
|
||||
s = socket(af, SOCK_STREAM | SOCK_NONBLOCK | SOCK_CLOEXEC, IPPROTO_TCP);
|
||||
s = socket(af, SOCK_STREAM | SOCK_NONBLOCK, IPPROTO_TCP);
|
||||
|
||||
if (s > FD_REF_MAX) {
|
||||
close(s);
|
||||
|
@ -1538,7 +1467,7 @@ static void tcp_bind_outbound(const struct ctx *c,
|
|||
* @optlen: Bytes in options: caller MUST ensure available length
|
||||
* @now: Current timestamp
|
||||
*/
|
||||
static void tcp_conn_from_tap(const struct ctx *c, sa_family_t af,
|
||||
static void tcp_conn_from_tap(struct ctx *c, sa_family_t af,
|
||||
const void *saddr, const void *daddr,
|
||||
const struct tcphdr *th, const char *opts,
|
||||
size_t optlen, const struct timespec *now)
|
||||
|
@ -1703,7 +1632,7 @@ static int tcp_sock_consume(const struct tcp_tap_conn *conn, uint32_t ack_seq)
|
|||
*
|
||||
* #syscalls recvmsg
|
||||
*/
|
||||
static int tcp_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn)
|
||||
static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn)
|
||||
{
|
||||
return tcp_buf_data_from_sock(c, conn);
|
||||
}
|
||||
|
@ -1719,8 +1648,8 @@ static int tcp_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn)
|
|||
*
|
||||
* Return: count of consumed packets
|
||||
*/
|
||||
static int tcp_data_from_tap(const struct ctx *c, struct tcp_tap_conn *conn,
|
||||
const struct pool *p, int idx)
|
||||
static int tcp_data_from_tap(struct ctx *c, struct tcp_tap_conn *conn,
|
||||
const struct pool *p, int idx)
|
||||
{
|
||||
int i, iov_i, ack = 0, fin = 0, retr = 0, keep = -1, partial_send = 0;
|
||||
uint16_t max_ack_seq_wnd = conn->wnd_from_tap;
|
||||
|
@ -1917,8 +1846,7 @@ out:
|
|||
* @opts: Pointer to start of options
|
||||
* @optlen: Bytes in options: caller MUST ensure available length
|
||||
*/
|
||||
static void tcp_conn_from_sock_finish(const struct ctx *c,
|
||||
struct tcp_tap_conn *conn,
|
||||
static void tcp_conn_from_sock_finish(struct ctx *c, struct tcp_tap_conn *conn,
|
||||
const struct tcphdr *th,
|
||||
const char *opts, size_t optlen)
|
||||
{
|
||||
|
@ -1941,12 +1869,11 @@ static void tcp_conn_from_sock_finish(const struct ctx *c,
|
|||
return;
|
||||
}
|
||||
|
||||
tcp_send_flag(c, conn, ACK);
|
||||
|
||||
/* The client might have sent data already, which we didn't
|
||||
* dequeue waiting for SYN,ACK from tap -- check now.
|
||||
*/
|
||||
tcp_data_from_sock(c, conn);
|
||||
tcp_send_flag(c, conn, ACK);
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -1962,7 +1889,7 @@ static void tcp_conn_from_sock_finish(const struct ctx *c,
|
|||
*
|
||||
* Return: count of consumed packets
|
||||
*/
|
||||
int tcp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af,
|
||||
int tcp_tap_handler(struct ctx *c, uint8_t pif, sa_family_t af,
|
||||
const void *saddr, const void *daddr,
|
||||
const struct pool *p, int idx, const struct timespec *now)
|
||||
{
|
||||
|
@ -2100,7 +2027,7 @@ reset:
|
|||
* @c: Execution context
|
||||
* @conn: Connection pointer
|
||||
*/
|
||||
static void tcp_connect_finish(const struct ctx *c, struct tcp_tap_conn *conn)
|
||||
static void tcp_connect_finish(struct ctx *c, struct tcp_tap_conn *conn)
|
||||
{
|
||||
socklen_t sl;
|
||||
int so;
|
||||
|
@ -2126,8 +2053,8 @@ static void tcp_connect_finish(const struct ctx *c, struct tcp_tap_conn *conn)
|
|||
* @sa: Peer socket address (from accept())
|
||||
* @now: Current timestamp
|
||||
*/
|
||||
static void tcp_tap_conn_from_sock(const struct ctx *c, union flow *flow,
|
||||
int s, const struct timespec *now)
|
||||
static void tcp_tap_conn_from_sock(struct ctx *c, union flow *flow, int s,
|
||||
const struct timespec *now)
|
||||
{
|
||||
struct tcp_tap_conn *conn = FLOW_SET_TYPE(flow, FLOW_TCP, tcp);
|
||||
uint64_t hash;
|
||||
|
@ -2158,7 +2085,7 @@ static void tcp_tap_conn_from_sock(const struct ctx *c, union flow *flow,
|
|||
* @ref: epoll reference of listening socket
|
||||
* @now: Current timestamp
|
||||
*/
|
||||
void tcp_listen_handler(const struct ctx *c, union epoll_ref ref,
|
||||
void tcp_listen_handler(struct ctx *c, union epoll_ref ref,
|
||||
const struct timespec *now)
|
||||
{
|
||||
const struct flowside *ini;
|
||||
|
@ -2223,7 +2150,7 @@ cancel:
|
|||
*
|
||||
* #syscalls timerfd_gettime arm:timerfd_gettime64 i686:timerfd_gettime64
|
||||
*/
|
||||
void tcp_timer_handler(const struct ctx *c, union epoll_ref ref)
|
||||
void tcp_timer_handler(struct ctx *c, union epoll_ref ref)
|
||||
{
|
||||
struct itimerspec check_armed = { { 0 }, { 0 } };
|
||||
struct tcp_tap_conn *conn = &FLOW(ref.flow)->tcp;
|
||||
|
@ -2235,9 +2162,7 @@ void tcp_timer_handler(const struct ctx *c, union epoll_ref ref)
|
|||
* timer is currently armed, this event came from a previous setting,
|
||||
* and we just set the timer to a new point in the future: discard it.
|
||||
*/
|
||||
if (timerfd_gettime(conn->timer, &check_armed))
|
||||
flow_err(conn, "failed to read timer: %s", strerror(errno));
|
||||
|
||||
timerfd_gettime(conn->timer, &check_armed);
|
||||
if (check_armed.it_value.tv_sec || check_armed.it_value.tv_nsec)
|
||||
return;
|
||||
|
||||
|
@ -2275,10 +2200,7 @@ void tcp_timer_handler(const struct ctx *c, union epoll_ref ref)
|
|||
* case. This avoids having to preemptively reset the timer on
|
||||
* ~ACK_TO_TAP_DUE or ~ACK_FROM_TAP_DUE.
|
||||
*/
|
||||
if (timerfd_settime(conn->timer, 0, &new, &old))
|
||||
flow_err(conn, "failed to set timer: %s",
|
||||
strerror(errno));
|
||||
|
||||
timerfd_settime(conn->timer, 0, &new, &old);
|
||||
if (old.it_value.tv_sec == ACT_TIMEOUT) {
|
||||
flow_dbg(conn, "activity timeout");
|
||||
tcp_rst(c, conn);
|
||||
|
@ -2292,14 +2214,19 @@ void tcp_timer_handler(const struct ctx *c, union epoll_ref ref)
|
|||
* @ref: epoll reference
|
||||
* @events: epoll events bitmap
|
||||
*/
|
||||
void tcp_sock_handler(const struct ctx *c, union epoll_ref ref,
|
||||
uint32_t events)
|
||||
void tcp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events)
|
||||
{
|
||||
struct tcp_tap_conn *conn = conn_at_sidx(ref.flowside);
|
||||
|
||||
ASSERT(!c->no_tcp);
|
||||
ASSERT(pif_at_sidx(ref.flowside) != PIF_TAP);
|
||||
|
||||
if (events & EPOLLRDHUP) {
|
||||
flow_err(conn, "EPOLLRDHUP: events=0x%x conn->events=0x%x "
|
||||
"conn->flags=0x%x\n", events, conn->events,
|
||||
conn->flags);
|
||||
}
|
||||
|
||||
if (conn->events == CLOSED)
|
||||
return;
|
||||
|
||||
|
@ -2324,7 +2251,7 @@ void tcp_sock_handler(const struct ctx *c, union epoll_ref ref,
|
|||
tcp_data_from_sock(c, conn);
|
||||
|
||||
if (events & EPOLLOUT)
|
||||
tcp_update_seqack_wnd(c, conn, false, NULL);
|
||||
tcp_update_seqack_wnd(c, conn, 0, NULL);
|
||||
|
||||
return;
|
||||
}
|
||||
|
@ -2347,16 +2274,17 @@ void tcp_sock_handler(const struct ctx *c, union epoll_ref ref,
|
|||
}
|
||||
|
||||
/**
|
||||
* tcp_sock_init_one() - Initialise listening socket for address and port
|
||||
* tcp_sock_init_af() - Initialise listening socket for a given af and port
|
||||
* @c: Execution context
|
||||
* @addr: Pointer to address for binding, NULL for dual stack any
|
||||
* @ifname: Name of interface to bind to, NULL if not configured
|
||||
* @af: Address family to listen on
|
||||
* @port: Port, host order
|
||||
* @addr: Pointer to address for binding, NULL if not configured
|
||||
* @ifname: Name of interface to bind to, NULL if not configured
|
||||
*
|
||||
* Return: fd for the new listening socket, negative error code on failure
|
||||
*/
|
||||
static int tcp_sock_init_one(const struct ctx *c, const union inany_addr *addr,
|
||||
const char *ifname, in_port_t port)
|
||||
static int tcp_sock_init_af(const struct ctx *c, sa_family_t af, in_port_t port,
|
||||
const void *addr, const char *ifname)
|
||||
{
|
||||
union tcp_listen_epoll_ref tref = {
|
||||
.port = port,
|
||||
|
@ -2364,13 +2292,12 @@ static int tcp_sock_init_one(const struct ctx *c, const union inany_addr *addr,
|
|||
};
|
||||
int s;
|
||||
|
||||
s = pif_sock_l4(c, EPOLL_TYPE_TCP_LISTEN, PIF_HOST, addr,
|
||||
ifname, port, tref.u32);
|
||||
s = sock_l4(c, af, EPOLL_TYPE_TCP_LISTEN, addr, ifname, port, tref.u32);
|
||||
|
||||
if (c->tcp.fwd_in.mode == FWD_AUTO) {
|
||||
if (!addr || inany_v4(addr))
|
||||
if (af == AF_INET || af == AF_UNSPEC)
|
||||
tcp_sock_init_ext[port][V4] = s < 0 ? -1 : s;
|
||||
if (!addr || !inany_v4(addr))
|
||||
if (af == AF_INET6 || af == AF_UNSPEC)
|
||||
tcp_sock_init_ext[port][V6] = s < 0 ? -1 : s;
|
||||
}
|
||||
|
||||
|
@ -2384,32 +2311,31 @@ static int tcp_sock_init_one(const struct ctx *c, const union inany_addr *addr,
|
|||
/**
|
||||
* tcp_sock_init() - Create listening sockets for a given host ("inbound") port
|
||||
* @c: Execution context
|
||||
* @af: Address family to select a specific IP version, or AF_UNSPEC
|
||||
* @addr: Pointer to address for binding, NULL if not configured
|
||||
* @ifname: Name of interface to bind to, NULL if not configured
|
||||
* @port: Port, host order
|
||||
*
|
||||
* Return: 0 on (partial) success, negative error code on (complete) failure
|
||||
*/
|
||||
int tcp_sock_init(const struct ctx *c, const union inany_addr *addr,
|
||||
int tcp_sock_init(const struct ctx *c, sa_family_t af, const void *addr,
|
||||
const char *ifname, in_port_t port)
|
||||
{
|
||||
int r4 = FD_REF_MAX + 1, r6 = FD_REF_MAX + 1;
|
||||
|
||||
ASSERT(!c->no_tcp);
|
||||
|
||||
if (!addr && c->ifi4 && c->ifi6)
|
||||
if (af == AF_UNSPEC && c->ifi4 && c->ifi6)
|
||||
/* Attempt to get a dual stack socket */
|
||||
if (tcp_sock_init_one(c, NULL, ifname, port) >= 0)
|
||||
if (tcp_sock_init_af(c, AF_UNSPEC, port, addr, ifname) >= 0)
|
||||
return 0;
|
||||
|
||||
/* Otherwise create a socket per IP version */
|
||||
if ((!addr || inany_v4(addr)) && c->ifi4)
|
||||
r4 = tcp_sock_init_one(c, addr ? addr : &inany_any4,
|
||||
ifname, port);
|
||||
if ((af == AF_INET || af == AF_UNSPEC) && c->ifi4)
|
||||
r4 = tcp_sock_init_af(c, AF_INET, port, addr, ifname);
|
||||
|
||||
if ((!addr || !inany_v4(addr)) && c->ifi6)
|
||||
r6 = tcp_sock_init_one(c, addr ? addr : &inany_any6,
|
||||
ifname, port);
|
||||
if ((af == AF_INET6 || af == AF_UNSPEC) && c->ifi6)
|
||||
r6 = tcp_sock_init_af(c, AF_INET6, port, addr, ifname);
|
||||
|
||||
if (IN_INTERVAL(0, FD_REF_MAX, r4) || IN_INTERVAL(0, FD_REF_MAX, r6))
|
||||
return 0;
|
||||
|
@ -2432,8 +2358,8 @@ static void tcp_ns_sock_init4(const struct ctx *c, in_port_t port)
|
|||
|
||||
ASSERT(c->mode == MODE_PASTA);
|
||||
|
||||
s = pif_sock_l4(c, EPOLL_TYPE_TCP_LISTEN, PIF_SPLICE, &inany_loopback4,
|
||||
NULL, port, tref.u32);
|
||||
s = sock_l4(c, AF_INET, EPOLL_TYPE_TCP_LISTEN, &in4addr_loopback,
|
||||
NULL, port, tref.u32);
|
||||
if (s >= 0)
|
||||
tcp_sock_set_bufsize(c, s);
|
||||
else
|
||||
|
@ -2458,8 +2384,8 @@ static void tcp_ns_sock_init6(const struct ctx *c, in_port_t port)
|
|||
|
||||
ASSERT(c->mode == MODE_PASTA);
|
||||
|
||||
s = pif_sock_l4(c, EPOLL_TYPE_TCP_LISTEN, PIF_SPLICE, &inany_loopback6,
|
||||
NULL, port, tref.u32);
|
||||
s = sock_l4(c, AF_INET6, EPOLL_TYPE_TCP_LISTEN, &in6addr_loopback,
|
||||
NULL, port, tref.u32);
|
||||
if (s >= 0)
|
||||
tcp_sock_set_bufsize(c, s);
|
||||
else
|
||||
|
@ -2561,7 +2487,7 @@ static void tcp_sock_refill_init(const struct ctx *c)
|
|||
*
|
||||
* Return: true if supported, false otherwise
|
||||
*/
|
||||
static bool tcp_probe_peek_offset_cap(sa_family_t af)
|
||||
bool tcp_probe_peek_offset_cap(sa_family_t af)
|
||||
{
|
||||
bool ret = false;
|
||||
int s, optv = 0;
|
||||
|
@ -2578,34 +2504,6 @@ static bool tcp_probe_peek_offset_cap(sa_family_t af)
|
|||
return ret;
|
||||
}
|
||||
|
||||
/**
|
||||
* tcp_probe_tcp_info() - Check what data TCP_INFO reports
|
||||
*
|
||||
* Return: Number of bytes returned by TCP_INFO getsockopt(), 0 on failure
|
||||
*/
|
||||
static socklen_t tcp_probe_tcp_info(void)
|
||||
{
|
||||
struct tcp_info_linux tinfo;
|
||||
socklen_t sl = sizeof(tinfo);
|
||||
int s;
|
||||
|
||||
s = socket(AF_INET, SOCK_STREAM | SOCK_CLOEXEC, IPPROTO_TCP);
|
||||
if (s < 0) {
|
||||
warn_perror("Temporary TCP socket creation failed");
|
||||
return false;
|
||||
}
|
||||
|
||||
if (getsockopt(s, SOL_TCP, TCP_INFO, &tinfo, &sl)) {
|
||||
warn_perror("Failed to get TCP_INFO on temporary socket");
|
||||
close(s);
|
||||
return false;
|
||||
}
|
||||
|
||||
close(s);
|
||||
|
||||
return sl;
|
||||
}
|
||||
|
||||
/**
|
||||
* tcp_init() - Get initial sequence, hash secret, initialise per-socket data
|
||||
* @c: Execution context
|
||||
|
@ -2616,7 +2514,11 @@ int tcp_init(struct ctx *c)
|
|||
{
|
||||
ASSERT(!c->no_tcp);
|
||||
|
||||
tcp_sock_iov_init(c);
|
||||
if (c->ifi4)
|
||||
tcp_sock4_iov_init(c);
|
||||
|
||||
if (c->ifi6)
|
||||
tcp_sock6_iov_init(c);
|
||||
|
||||
memset(init_sock_pool4, 0xff, sizeof(init_sock_pool4));
|
||||
memset(init_sock_pool6, 0xff, sizeof(init_sock_pool6));
|
||||
|
@ -2635,15 +2537,6 @@ int tcp_init(struct ctx *c)
|
|||
(!c->ifi6 || tcp_probe_peek_offset_cap(AF_INET6));
|
||||
debug("SO_PEEK_OFF%ssupported", peek_offset_cap ? " " : " not ");
|
||||
|
||||
tcp_info_size = tcp_probe_tcp_info();
|
||||
|
||||
#define dbg_tcpi(f_) debug("TCP_INFO tcpi_%s field%s supported", \
|
||||
STRINGIFY(f_), tcp_info_cap(f_) ? " " : " not ")
|
||||
dbg_tcpi(snd_wnd);
|
||||
dbg_tcpi(bytes_acked);
|
||||
dbg_tcpi(min_rtt);
|
||||
#undef dbg_tcpi
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
@ -2685,7 +2578,7 @@ static void tcp_port_rebind(struct ctx *c, bool outbound)
|
|||
if (outbound)
|
||||
tcp_ns_sock_init(c, port);
|
||||
else
|
||||
tcp_sock_init(c, NULL, NULL, port);
|
||||
tcp_sock_init(c, AF_UNSPEC, NULL, NULL, port);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
15
tcp.h
15
tcp.h
|
@ -10,15 +10,14 @@
|
|||
|
||||
struct ctx;
|
||||
|
||||
void tcp_timer_handler(const struct ctx *c, union epoll_ref ref);
|
||||
void tcp_listen_handler(const struct ctx *c, union epoll_ref ref,
|
||||
void tcp_timer_handler(struct ctx *c, union epoll_ref ref);
|
||||
void tcp_listen_handler(struct ctx *c, union epoll_ref ref,
|
||||
const struct timespec *now);
|
||||
void tcp_sock_handler(const struct ctx *c, union epoll_ref ref,
|
||||
uint32_t events);
|
||||
int tcp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af,
|
||||
void tcp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events);
|
||||
int tcp_tap_handler(struct ctx *c, uint8_t pif, sa_family_t af,
|
||||
const void *saddr, const void *daddr,
|
||||
const struct pool *p, int idx, const struct timespec *now);
|
||||
int tcp_sock_init(const struct ctx *c, const union inany_addr *addr,
|
||||
int tcp_sock_init(const struct ctx *c, sa_family_t af, const void *addr,
|
||||
const char *ifname, in_port_t port);
|
||||
int tcp_init(struct ctx *c);
|
||||
void tcp_timer(struct ctx *c, const struct timespec *now);
|
||||
|
@ -59,12 +58,16 @@ union tcp_listen_epoll_ref {
|
|||
* @fwd_in: Port forwarding configuration for inbound packets
|
||||
* @fwd_out: Port forwarding configuration for outbound packets
|
||||
* @timer_run: Timestamp of most recent timer run
|
||||
* @kernel_snd_wnd: Kernel reports sending window (with commit 8f7baad7f035)
|
||||
* @pipe_size: Size of pipes for spliced connections
|
||||
*/
|
||||
struct tcp_ctx {
|
||||
struct fwd_ports fwd_in;
|
||||
struct fwd_ports fwd_out;
|
||||
struct timespec timer_run;
|
||||
#ifdef HAS_SND_WND
|
||||
int kernel_snd_wnd;
|
||||
#endif
|
||||
size_t pipe_size;
|
||||
};
|
||||
|
||||
|
|
370
tcp_buf.c
370
tcp_buf.c
|
@ -20,7 +20,7 @@
|
|||
|
||||
#include <netinet/ip.h>
|
||||
|
||||
#include <netinet/tcp.h>
|
||||
#include <linux/tcp.h>
|
||||
|
||||
#include "util.h"
|
||||
#include "ip.h"
|
||||
|
@ -38,32 +38,88 @@
|
|||
(c->mode == MODE_PASTA ? 1 : TCP_FRAMES_MEM)
|
||||
|
||||
/* Static buffers */
|
||||
/**
|
||||
* struct tcp_payload_t - TCP header and data to send segments with payload
|
||||
* @th: TCP header
|
||||
* @data: TCP data
|
||||
*/
|
||||
struct tcp_payload_t {
|
||||
struct tcphdr th;
|
||||
uint8_t data[IP_MAX_MTU - sizeof(struct tcphdr)];
|
||||
#ifdef __AVX2__
|
||||
} __attribute__ ((packed, aligned(32))); /* For AVX2 checksum routines */
|
||||
#else
|
||||
} __attribute__ ((packed, aligned(__alignof__(unsigned int))));
|
||||
#endif
|
||||
|
||||
/* Ethernet header for IPv4 and IPv6 frames */
|
||||
/**
|
||||
* struct tcp_flags_t - TCP header and data to send zero-length
|
||||
* segments (flags)
|
||||
* @th: TCP header
|
||||
* @opts: TCP options
|
||||
*/
|
||||
struct tcp_flags_t {
|
||||
struct tcphdr th;
|
||||
char opts[OPT_MSS_LEN + OPT_WS_LEN + 1];
|
||||
#ifdef __AVX2__
|
||||
} __attribute__ ((packed, aligned(32)));
|
||||
#else
|
||||
} __attribute__ ((packed, aligned(__alignof__(unsigned int))));
|
||||
#endif
|
||||
|
||||
/* Ethernet header for IPv4 frames */
|
||||
static struct ethhdr tcp4_eth_src;
|
||||
static struct ethhdr tcp6_eth_src;
|
||||
|
||||
static struct tap_hdr tcp_payload_tap_hdr[TCP_FRAMES_MEM];
|
||||
static struct tap_hdr tcp4_payload_tap_hdr[TCP_FRAMES_MEM];
|
||||
/* IPv4 headers */
|
||||
static struct iphdr tcp4_payload_ip[TCP_FRAMES_MEM];
|
||||
/* TCP segments with payload for IPv4 frames */
|
||||
static struct tcp_payload_t tcp4_payload[TCP_FRAMES_MEM];
|
||||
|
||||
/* IP headers for IPv4 and IPv6 */
|
||||
struct iphdr tcp4_payload_ip[TCP_FRAMES_MEM];
|
||||
struct ipv6hdr tcp6_payload_ip[TCP_FRAMES_MEM];
|
||||
|
||||
/* TCP segments with payload for IPv4 and IPv6 frames */
|
||||
static struct tcp_payload_t tcp_payload[TCP_FRAMES_MEM];
|
||||
|
||||
static_assert(MSS4 <= sizeof(tcp_payload[0].data), "MSS4 is greater than 65516");
|
||||
static_assert(MSS6 <= sizeof(tcp_payload[0].data), "MSS6 is greater than 65516");
|
||||
static_assert(MSS4 <= sizeof(tcp4_payload[0].data), "MSS4 is greater than 65516");
|
||||
|
||||
/* References tracking the owner connection of frames in the tap outqueue */
|
||||
static struct tcp_tap_conn *tcp_frame_conns[TCP_FRAMES_MEM];
|
||||
static unsigned int tcp_payload_used;
|
||||
static struct tcp_tap_conn *tcp4_frame_conns[TCP_FRAMES_MEM];
|
||||
static unsigned int tcp4_payload_used;
|
||||
|
||||
static struct tap_hdr tcp4_flags_tap_hdr[TCP_FRAMES_MEM];
|
||||
/* IPv4 headers for TCP segment without payload */
|
||||
static struct iphdr tcp4_flags_ip[TCP_FRAMES_MEM];
|
||||
/* TCP segments without payload for IPv4 frames */
|
||||
static struct tcp_flags_t tcp4_flags[TCP_FRAMES_MEM];
|
||||
|
||||
static unsigned int tcp4_flags_used;
|
||||
|
||||
/* Ethernet header for IPv6 frames */
|
||||
static struct ethhdr tcp6_eth_src;
|
||||
|
||||
static struct tap_hdr tcp6_payload_tap_hdr[TCP_FRAMES_MEM];
|
||||
/* IPv6 headers */
|
||||
static struct ipv6hdr tcp6_payload_ip[TCP_FRAMES_MEM];
|
||||
/* TCP headers and data for IPv6 frames */
|
||||
static struct tcp_payload_t tcp6_payload[TCP_FRAMES_MEM];
|
||||
|
||||
static_assert(MSS6 <= sizeof(tcp6_payload[0].data), "MSS6 is greater than 65516");
|
||||
|
||||
/* References tracking the owner connection of frames in the tap outqueue */
|
||||
static struct tcp_tap_conn *tcp6_frame_conns[TCP_FRAMES_MEM];
|
||||
static unsigned int tcp6_payload_used;
|
||||
|
||||
static struct tap_hdr tcp6_flags_tap_hdr[TCP_FRAMES_MEM];
|
||||
/* IPv6 headers for TCP segment without payload */
|
||||
static struct ipv6hdr tcp6_flags_ip[TCP_FRAMES_MEM];
|
||||
/* TCP segment without payload for IPv6 frames */
|
||||
static struct tcp_flags_t tcp6_flags[TCP_FRAMES_MEM];
|
||||
|
||||
static unsigned int tcp6_flags_used;
|
||||
|
||||
/* recvmsg()/sendmsg() data for tap */
|
||||
static struct iovec iov_sock [TCP_FRAMES_MEM + 1];
|
||||
|
||||
static struct iovec tcp_l2_iov[TCP_FRAMES_MEM][TCP_NUM_IOVS];
|
||||
|
||||
static struct iovec tcp4_l2_iov [TCP_FRAMES_MEM][TCP_NUM_IOVS];
|
||||
static struct iovec tcp6_l2_iov [TCP_FRAMES_MEM][TCP_NUM_IOVS];
|
||||
static struct iovec tcp4_l2_flags_iov [TCP_FRAMES_MEM][TCP_NUM_IOVS];
|
||||
static struct iovec tcp6_l2_flags_iov [TCP_FRAMES_MEM][TCP_NUM_IOVS];
|
||||
/**
|
||||
* tcp_update_l2_buf() - Update Ethernet header buffers with addresses
|
||||
* @eth_d: Ethernet destination address, NULL if unchanged
|
||||
|
@ -76,30 +132,105 @@ void tcp_update_l2_buf(const unsigned char *eth_d, const unsigned char *eth_s)
|
|||
}
|
||||
|
||||
/**
|
||||
* tcp_sock_iov_init() - Initialise scatter-gather L2 buffers for IPv4 and IPv6 sockets
|
||||
* tcp_sock4_iov_init() - Initialise scatter-gather L2 buffers for IPv4 sockets
|
||||
* @c: Execution context
|
||||
*/
|
||||
void tcp_sock_iov_init(const struct ctx *c)
|
||||
void tcp_sock4_iov_init(const struct ctx *c)
|
||||
{
|
||||
struct ipv6hdr ip6 = L2_BUF_IP6_INIT(IPPROTO_TCP);
|
||||
struct iphdr iph = L2_BUF_IP4_INIT(IPPROTO_TCP);
|
||||
struct iovec *iov;
|
||||
int i;
|
||||
|
||||
tcp6_eth_src.h_proto = htons_constant(ETH_P_IPV6);
|
||||
tcp4_eth_src.h_proto = htons_constant(ETH_P_IP);
|
||||
|
||||
for (i = 0; i < ARRAY_SIZE(tcp_payload); i++) {
|
||||
tcp6_payload_ip[i] = ip6;
|
||||
for (i = 0; i < ARRAY_SIZE(tcp4_payload); i++) {
|
||||
tcp4_payload_ip[i] = iph;
|
||||
tcp4_payload[i].th.doff = sizeof(struct tcphdr) / 4;
|
||||
tcp4_payload[i].th.ack = 1;
|
||||
}
|
||||
|
||||
for (i = 0; i < ARRAY_SIZE(tcp4_flags); i++) {
|
||||
tcp4_flags_ip[i] = iph;
|
||||
tcp4_flags[i].th.doff = sizeof(struct tcphdr) / 4;
|
||||
tcp4_flags[i].th.ack = 1;
|
||||
}
|
||||
|
||||
for (i = 0; i < TCP_FRAMES_MEM; i++) {
|
||||
struct iovec *iov = tcp_l2_iov[i];
|
||||
iov = tcp4_l2_iov[i];
|
||||
|
||||
iov[TCP_IOV_TAP] = tap_hdr_iov(c, &tcp_payload_tap_hdr[i]);
|
||||
iov[TCP_IOV_ETH].iov_len = sizeof(struct ethhdr);
|
||||
iov[TCP_IOV_PAYLOAD].iov_base = &tcp_payload[i];
|
||||
iov[TCP_IOV_TAP] = tap_hdr_iov(c, &tcp4_payload_tap_hdr[i]);
|
||||
iov[TCP_IOV_ETH] = IOV_OF_LVALUE(tcp4_eth_src);
|
||||
iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp4_payload_ip[i]);
|
||||
iov[TCP_IOV_PAYLOAD].iov_base = &tcp4_payload[i];
|
||||
}
|
||||
|
||||
for (i = 0; i < TCP_FRAMES_MEM; i++) {
|
||||
iov = tcp4_l2_flags_iov[i];
|
||||
|
||||
iov[TCP_IOV_TAP] = tap_hdr_iov(c, &tcp4_flags_tap_hdr[i]);
|
||||
iov[TCP_IOV_ETH].iov_base = &tcp4_eth_src;
|
||||
iov[TCP_IOV_ETH] = IOV_OF_LVALUE(tcp4_eth_src);
|
||||
iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp4_flags_ip[i]);
|
||||
iov[TCP_IOV_PAYLOAD].iov_base = &tcp4_flags[i];
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* tcp_sock6_iov_init() - Initialise scatter-gather L2 buffers for IPv6 sockets
|
||||
* @c: Execution context
|
||||
*/
|
||||
void tcp_sock6_iov_init(const struct ctx *c)
|
||||
{
|
||||
struct ipv6hdr ip6 = L2_BUF_IP6_INIT(IPPROTO_TCP);
|
||||
struct iovec *iov;
|
||||
int i;
|
||||
|
||||
tcp6_eth_src.h_proto = htons_constant(ETH_P_IPV6);
|
||||
|
||||
for (i = 0; i < ARRAY_SIZE(tcp6_payload); i++) {
|
||||
tcp6_payload_ip[i] = ip6;
|
||||
tcp6_payload[i].th.doff = sizeof(struct tcphdr) / 4;
|
||||
tcp6_payload[i].th.ack = 1;
|
||||
}
|
||||
|
||||
for (i = 0; i < ARRAY_SIZE(tcp6_flags); i++) {
|
||||
tcp6_flags_ip[i] = ip6;
|
||||
tcp6_flags[i].th.doff = sizeof(struct tcphdr) / 4;
|
||||
tcp6_flags[i].th .ack = 1;
|
||||
}
|
||||
|
||||
for (i = 0; i < TCP_FRAMES_MEM; i++) {
|
||||
iov = tcp6_l2_iov[i];
|
||||
|
||||
iov[TCP_IOV_TAP] = tap_hdr_iov(c, &tcp6_payload_tap_hdr[i]);
|
||||
iov[TCP_IOV_ETH] = IOV_OF_LVALUE(tcp6_eth_src);
|
||||
iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp6_payload_ip[i]);
|
||||
iov[TCP_IOV_PAYLOAD].iov_base = &tcp6_payload[i];
|
||||
}
|
||||
|
||||
for (i = 0; i < TCP_FRAMES_MEM; i++) {
|
||||
iov = tcp6_l2_flags_iov[i];
|
||||
|
||||
iov[TCP_IOV_TAP] = tap_hdr_iov(c, &tcp6_flags_tap_hdr[i]);
|
||||
iov[TCP_IOV_ETH] = IOV_OF_LVALUE(tcp6_eth_src);
|
||||
iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp6_flags_ip[i]);
|
||||
iov[TCP_IOV_PAYLOAD].iov_base = &tcp6_flags[i];
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* tcp_flags_flush() - Send out buffers for segments with no data (flags)
|
||||
* @c: Execution context
|
||||
*/
|
||||
void tcp_flags_flush(const struct ctx *c)
|
||||
{
|
||||
tap_send_frames(c, &tcp6_l2_flags_iov[0][0], TCP_NUM_IOVS,
|
||||
tcp6_flags_used);
|
||||
tcp6_flags_used = 0;
|
||||
|
||||
tap_send_frames(c, &tcp4_l2_flags_iov[0][0], TCP_NUM_IOVS,
|
||||
tcp4_flags_used);
|
||||
tcp4_flags_used = 0;
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -109,7 +240,7 @@ void tcp_sock_iov_init(const struct ctx *c)
|
|||
* @frames: Two-dimensional array containing queued frames with sub-iovs
|
||||
* @num_frames: Number of entries in the two arrays to be compared
|
||||
*/
|
||||
static void tcp_revert_seq(const struct ctx *c, struct tcp_tap_conn **conns,
|
||||
static void tcp_revert_seq(struct ctx *c, struct tcp_tap_conn **conns,
|
||||
struct iovec (*frames)[TCP_NUM_IOVS], int num_frames)
|
||||
{
|
||||
int i;
|
||||
|
@ -131,20 +262,28 @@ static void tcp_revert_seq(const struct ctx *c, struct tcp_tap_conn **conns,
|
|||
}
|
||||
|
||||
/**
|
||||
* tcp_payload_flush() - Send out buffers for segments with data or flags
|
||||
* tcp_payload_flush() - Send out buffers for segments with data
|
||||
* @c: Execution context
|
||||
*/
|
||||
void tcp_payload_flush(const struct ctx *c)
|
||||
void tcp_payload_flush(struct ctx *c)
|
||||
{
|
||||
size_t m;
|
||||
|
||||
m = tap_send_frames(c, &tcp_l2_iov[0][0], TCP_NUM_IOVS,
|
||||
tcp_payload_used);
|
||||
if (m != tcp_payload_used) {
|
||||
tcp_revert_seq(c, &tcp_frame_conns[m], &tcp_l2_iov[m],
|
||||
tcp_payload_used - m);
|
||||
m = tap_send_frames(c, &tcp6_l2_iov[0][0], TCP_NUM_IOVS,
|
||||
tcp6_payload_used);
|
||||
if (m != tcp6_payload_used) {
|
||||
tcp_revert_seq(c, &tcp6_frame_conns[m], &tcp6_l2_iov[m],
|
||||
tcp6_payload_used - m);
|
||||
}
|
||||
tcp_payload_used = 0;
|
||||
tcp6_payload_used = 0;
|
||||
|
||||
m = tap_send_frames(c, &tcp4_l2_iov[0][0], TCP_NUM_IOVS,
|
||||
tcp4_payload_used);
|
||||
if (m != tcp4_payload_used) {
|
||||
tcp_revert_seq(c, &tcp4_frame_conns[m], &tcp4_l2_iov[m],
|
||||
tcp4_payload_used - m);
|
||||
}
|
||||
tcp4_payload_used = 0;
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -155,48 +294,58 @@ void tcp_payload_flush(const struct ctx *c)
|
|||
*
|
||||
* Return: negative error code on connection reset, 0 otherwise
|
||||
*/
|
||||
int tcp_buf_send_flag(const struct ctx *c, struct tcp_tap_conn *conn, int flags)
|
||||
int tcp_buf_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags)
|
||||
{
|
||||
struct tcp_payload_t *payload;
|
||||
struct tcp_flags_t *payload;
|
||||
struct iovec *iov;
|
||||
size_t optlen;
|
||||
size_t l4len;
|
||||
uint32_t seq;
|
||||
int ret;
|
||||
|
||||
iov = tcp_l2_iov[tcp_payload_used];
|
||||
if (CONN_V4(conn)) {
|
||||
iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp4_payload_ip[tcp_payload_used]);
|
||||
iov[TCP_IOV_ETH].iov_base = &tcp4_eth_src;
|
||||
} else {
|
||||
iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp6_payload_ip[tcp_payload_used]);
|
||||
iov[TCP_IOV_ETH].iov_base = &tcp6_eth_src;
|
||||
}
|
||||
if (CONN_V4(conn))
|
||||
iov = tcp4_l2_flags_iov[tcp4_flags_used++];
|
||||
else
|
||||
iov = tcp6_l2_flags_iov[tcp6_flags_used++];
|
||||
|
||||
payload = iov[TCP_IOV_PAYLOAD].iov_base;
|
||||
|
||||
seq = conn->seq_to_tap;
|
||||
ret = tcp_prepare_flags(c, conn, flags, &payload->th,
|
||||
(struct tcp_syn_opts *)&payload->data, &optlen);
|
||||
if (ret <= 0)
|
||||
payload->opts, &optlen);
|
||||
if (ret <= 0) {
|
||||
if (CONN_V4(conn))
|
||||
tcp4_flags_used--;
|
||||
else
|
||||
tcp6_flags_used--;
|
||||
return ret;
|
||||
|
||||
tcp_payload_used++;
|
||||
l4len = tcp_l2_buf_fill_headers(conn, iov, optlen, NULL, seq, false);
|
||||
iov[TCP_IOV_PAYLOAD].iov_len = l4len;
|
||||
if (flags & DUP_ACK) {
|
||||
struct iovec *dup_iov = tcp_l2_iov[tcp_payload_used++];
|
||||
|
||||
memcpy(dup_iov[TCP_IOV_TAP].iov_base, iov[TCP_IOV_TAP].iov_base,
|
||||
iov[TCP_IOV_TAP].iov_len);
|
||||
dup_iov[TCP_IOV_ETH].iov_base = iov[TCP_IOV_ETH].iov_base;
|
||||
dup_iov[TCP_IOV_IP] = iov[TCP_IOV_IP];
|
||||
memcpy(dup_iov[TCP_IOV_PAYLOAD].iov_base,
|
||||
iov[TCP_IOV_PAYLOAD].iov_base, l4len);
|
||||
dup_iov[TCP_IOV_PAYLOAD].iov_len = l4len;
|
||||
}
|
||||
|
||||
if (tcp_payload_used > TCP_FRAMES_MEM - 2)
|
||||
tcp_payload_flush(c);
|
||||
l4len = tcp_l2_buf_fill_headers(conn, iov, optlen, NULL, seq);
|
||||
iov[TCP_IOV_PAYLOAD].iov_len = l4len;
|
||||
|
||||
if (flags & DUP_ACK) {
|
||||
struct iovec *dup_iov;
|
||||
int i;
|
||||
|
||||
if (CONN_V4(conn))
|
||||
dup_iov = tcp4_l2_flags_iov[tcp4_flags_used++];
|
||||
else
|
||||
dup_iov = tcp6_l2_flags_iov[tcp6_flags_used++];
|
||||
|
||||
for (i = 0; i < TCP_NUM_IOVS; i++)
|
||||
memcpy(dup_iov[i].iov_base, iov[i].iov_base,
|
||||
iov[i].iov_len);
|
||||
dup_iov[TCP_IOV_PAYLOAD].iov_len = iov[TCP_IOV_PAYLOAD].iov_len;
|
||||
}
|
||||
|
||||
if (CONN_V4(conn)) {
|
||||
if (tcp4_flags_used > TCP_FRAMES_MEM - 2)
|
||||
tcp_flags_flush(c);
|
||||
} else {
|
||||
if (tcp6_flags_used > TCP_FRAMES_MEM - 2)
|
||||
tcp_flags_flush(c);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
@ -209,39 +358,39 @@ int tcp_buf_send_flag(const struct ctx *c, struct tcp_tap_conn *conn, int flags)
|
|||
* @no_csum: Don't compute IPv4 checksum, use the one from previous buffer
|
||||
* @seq: Sequence number to be sent
|
||||
*/
|
||||
static void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn,
|
||||
static void tcp_data_to_tap(struct ctx *c, struct tcp_tap_conn *conn,
|
||||
ssize_t dlen, int no_csum, uint32_t seq)
|
||||
{
|
||||
struct tcp_payload_t *payload;
|
||||
const uint16_t *check = NULL;
|
||||
struct iovec *iov;
|
||||
size_t l4len;
|
||||
|
||||
conn->seq_to_tap = seq + dlen;
|
||||
tcp_frame_conns[tcp_payload_used] = conn;
|
||||
iov = tcp_l2_iov[tcp_payload_used];
|
||||
if (CONN_V4(conn)) {
|
||||
if (no_csum) {
|
||||
struct iovec *iov_prev = tcp_l2_iov[tcp_payload_used - 1];
|
||||
struct iphdr *iph = iov_prev[TCP_IOV_IP].iov_base;
|
||||
|
||||
if (CONN_V4(conn)) {
|
||||
struct iovec *iov_prev = tcp4_l2_iov[tcp4_payload_used - 1];
|
||||
const uint16_t *check = NULL;
|
||||
|
||||
if (no_csum) {
|
||||
struct iphdr *iph = iov_prev[TCP_IOV_IP].iov_base;
|
||||
check = &iph->check;
|
||||
}
|
||||
iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp4_payload_ip[tcp_payload_used]);
|
||||
iov[TCP_IOV_ETH].iov_base = &tcp4_eth_src;
|
||||
|
||||
tcp4_frame_conns[tcp4_payload_used] = conn;
|
||||
|
||||
iov = tcp4_l2_iov[tcp4_payload_used++];
|
||||
l4len = tcp_l2_buf_fill_headers(conn, iov, dlen, check, seq);
|
||||
iov[TCP_IOV_PAYLOAD].iov_len = l4len;
|
||||
if (tcp4_payload_used > TCP_FRAMES_MEM - 1)
|
||||
tcp_payload_flush(c);
|
||||
} else if (CONN_V6(conn)) {
|
||||
iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp6_payload_ip[tcp_payload_used]);
|
||||
iov[TCP_IOV_ETH].iov_base = &tcp6_eth_src;
|
||||
tcp6_frame_conns[tcp6_payload_used] = conn;
|
||||
|
||||
iov = tcp6_l2_iov[tcp6_payload_used++];
|
||||
l4len = tcp_l2_buf_fill_headers(conn, iov, dlen, NULL, seq);
|
||||
iov[TCP_IOV_PAYLOAD].iov_len = l4len;
|
||||
if (tcp6_payload_used > TCP_FRAMES_MEM - 1)
|
||||
tcp_payload_flush(c);
|
||||
}
|
||||
payload = iov[TCP_IOV_PAYLOAD].iov_base;
|
||||
payload->th.th_off = sizeof(struct tcphdr) / 4;
|
||||
payload->th.th_x2 = 0;
|
||||
payload->th.th_flags = 0;
|
||||
payload->th.ack = 1;
|
||||
l4len = tcp_l2_buf_fill_headers(conn, iov, dlen, check, seq, false);
|
||||
iov[TCP_IOV_PAYLOAD].iov_len = l4len;
|
||||
if (++tcp_payload_used > TCP_FRAMES_MEM - 1)
|
||||
tcp_payload_flush(c);
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -253,11 +402,12 @@ static void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn,
|
|||
*
|
||||
* #syscalls recvmsg
|
||||
*/
|
||||
int tcp_buf_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn)
|
||||
int tcp_buf_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn)
|
||||
{
|
||||
uint32_t wnd_scaled = conn->wnd_from_tap << conn->ws_from_tap;
|
||||
int fill_bufs, send_bufs = 0, last_len, iov_rem = 0;
|
||||
int len, dlen, i, s = conn->sock;
|
||||
int sendlen, len, dlen, v4 = CONN_V4(conn);
|
||||
int s = conn->sock, i, ret = 0;
|
||||
struct msghdr mh_sock = { 0 };
|
||||
uint16_t mss = MSS_GET(conn);
|
||||
uint32_t already_sent, seq;
|
||||
|
@ -304,15 +454,19 @@ int tcp_buf_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn)
|
|||
mh_sock.msg_iovlen = fill_bufs;
|
||||
}
|
||||
|
||||
if (tcp_payload_used + fill_bufs > TCP_FRAMES_MEM) {
|
||||
if (( v4 && tcp4_payload_used + fill_bufs > TCP_FRAMES_MEM) ||
|
||||
(!v4 && tcp6_payload_used + fill_bufs > TCP_FRAMES_MEM)) {
|
||||
tcp_payload_flush(c);
|
||||
|
||||
/* Silence Coverity CWE-125 false positive */
|
||||
tcp_payload_used = 0;
|
||||
tcp4_payload_used = tcp6_payload_used = 0;
|
||||
}
|
||||
|
||||
for (i = 0, iov = iov_sock + 1; i < fill_bufs; i++, iov++) {
|
||||
iov->iov_base = &tcp_payload[tcp_payload_used + i].data;
|
||||
if (v4)
|
||||
iov->iov_base = &tcp4_payload[tcp4_payload_used + i].data;
|
||||
else
|
||||
iov->iov_base = &tcp6_payload[tcp6_payload_used + i].data;
|
||||
iov->iov_len = mss;
|
||||
}
|
||||
if (iov_rem)
|
||||
|
@ -323,19 +477,12 @@ int tcp_buf_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn)
|
|||
len = recvmsg(s, &mh_sock, MSG_PEEK);
|
||||
while (len < 0 && errno == EINTR);
|
||||
|
||||
if (len < 0) {
|
||||
if (errno != EAGAIN && errno != EWOULDBLOCK) {
|
||||
tcp_rst(c, conn);
|
||||
return -errno;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
if (len < 0)
|
||||
goto err;
|
||||
|
||||
if (!len) {
|
||||
if ((conn->events & (SOCK_FIN_RCVD | TAP_FIN_SENT)) == SOCK_FIN_RCVD) {
|
||||
int ret = tcp_buf_send_flag(c, conn, FIN | ACK);
|
||||
if (ret) {
|
||||
if ((ret = tcp_buf_send_flag(c, conn, FIN | ACK))) {
|
||||
tcp_rst(c, conn);
|
||||
return ret;
|
||||
}
|
||||
|
@ -346,27 +493,28 @@ int tcp_buf_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn)
|
|||
return 0;
|
||||
}
|
||||
|
||||
sendlen = len;
|
||||
if (!peek_offset_cap)
|
||||
len -= already_sent;
|
||||
sendlen -= already_sent;
|
||||
|
||||
if (len <= 0) {
|
||||
if (sendlen <= 0) {
|
||||
conn_flag(c, conn, STALLED);
|
||||
return 0;
|
||||
}
|
||||
|
||||
conn_flag(c, conn, ~STALLED);
|
||||
|
||||
send_bufs = DIV_ROUND_UP(len, mss);
|
||||
last_len = len - (send_bufs - 1) * mss;
|
||||
send_bufs = DIV_ROUND_UP(sendlen, mss);
|
||||
last_len = sendlen - (send_bufs - 1) * mss;
|
||||
|
||||
/* Likely, some new data was acked too. */
|
||||
tcp_update_seqack_wnd(c, conn, false, NULL);
|
||||
tcp_update_seqack_wnd(c, conn, 0, NULL);
|
||||
|
||||
/* Finally, queue to tap */
|
||||
dlen = mss;
|
||||
seq = conn->seq_to_tap;
|
||||
for (i = 0; i < send_bufs; i++) {
|
||||
int no_csum = i && i != send_bufs - 1 && tcp_payload_used;
|
||||
int no_csum = i && i != send_bufs - 1 && tcp4_payload_used;
|
||||
|
||||
if (i == send_bufs - 1)
|
||||
dlen = last_len;
|
||||
|
@ -378,4 +526,12 @@ int tcp_buf_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn)
|
|||
conn_flag(c, conn, ACK_FROM_TAP_DUE);
|
||||
|
||||
return 0;
|
||||
|
||||
err:
|
||||
if (errno != EAGAIN && errno != EWOULDBLOCK) {
|
||||
ret = -errno;
|
||||
tcp_rst(c, conn);
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
|
10
tcp_buf.h
10
tcp_buf.h
|
@ -6,9 +6,11 @@
|
|||
#ifndef TCP_BUF_H
|
||||
#define TCP_BUF_H
|
||||
|
||||
void tcp_sock_iov_init(const struct ctx *c);
|
||||
void tcp_payload_flush(const struct ctx *c);
|
||||
int tcp_buf_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn);
|
||||
int tcp_buf_send_flag(const struct ctx *c, struct tcp_tap_conn *conn, int flags);
|
||||
void tcp_sock4_iov_init(const struct ctx *c);
|
||||
void tcp_sock6_iov_init(const struct ctx *c);
|
||||
void tcp_flags_flush(const struct ctx *c);
|
||||
void tcp_payload_flush(struct ctx *c);
|
||||
int tcp_buf_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn);
|
||||
int tcp_buf_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags);
|
||||
|
||||
#endif /* TCP_BUF_H */
|
||||
|
|
|
@ -33,7 +33,9 @@
|
|||
#define OPT_EOL 0
|
||||
#define OPT_NOP 1
|
||||
#define OPT_MSS 2
|
||||
#define OPT_MSS_LEN 4
|
||||
#define OPT_WS 3
|
||||
#define OPT_WS_LEN 3
|
||||
#define OPT_SACKP 4
|
||||
#define OPT_SACK 5
|
||||
#define OPT_TS 8
|
||||
|
@ -61,79 +63,6 @@ enum tcp_iov_parts {
|
|||
TCP_NUM_IOVS
|
||||
};
|
||||
|
||||
/**
|
||||
* struct tcp_payload_t - TCP header and data to send segments with payload
|
||||
* @th: TCP header
|
||||
* @data: TCP data
|
||||
*/
|
||||
struct tcp_payload_t {
|
||||
struct tcphdr th;
|
||||
uint8_t data[IP_MAX_MTU - sizeof(struct tcphdr)];
|
||||
#ifdef __AVX2__
|
||||
} __attribute__ ((packed, aligned(32))); /* For AVX2 checksum routines */
|
||||
#else
|
||||
} __attribute__ ((packed, aligned(__alignof__(unsigned int))));
|
||||
#endif
|
||||
|
||||
/** struct tcp_opt_nop - TCP NOP option
|
||||
* @kind: Option kind (OPT_NOP = 1)
|
||||
*/
|
||||
struct tcp_opt_nop {
|
||||
uint8_t kind;
|
||||
} __attribute__ ((packed));
|
||||
#define TCP_OPT_NOP ((struct tcp_opt_nop){ .kind = OPT_NOP, })
|
||||
|
||||
/** struct tcp_opt_mss - TCP MSS option
|
||||
* @kind: Option kind (OPT_MSS == 2)
|
||||
* @len: Option length (4)
|
||||
* @mss: Maximum Segment Size
|
||||
*/
|
||||
struct tcp_opt_mss {
|
||||
uint8_t kind;
|
||||
uint8_t len;
|
||||
uint16_t mss;
|
||||
} __attribute__ ((packed));
|
||||
#define TCP_OPT_MSS(mss_) \
|
||||
((struct tcp_opt_mss) { \
|
||||
.kind = OPT_MSS, \
|
||||
.len = sizeof(struct tcp_opt_mss), \
|
||||
.mss = htons(mss_), \
|
||||
})
|
||||
|
||||
/** struct tcp_opt_ws - TCP Window Scaling option
|
||||
* @kind: Option kind (OPT_WS == 3)
|
||||
* @len: Option length (3)
|
||||
* @shift: Window scaling shift
|
||||
*/
|
||||
struct tcp_opt_ws {
|
||||
uint8_t kind;
|
||||
uint8_t len;
|
||||
uint8_t shift;
|
||||
} __attribute__ ((packed));
|
||||
#define TCP_OPT_WS(shift_) \
|
||||
((struct tcp_opt_ws) { \
|
||||
.kind = OPT_WS, \
|
||||
.len = sizeof(struct tcp_opt_ws), \
|
||||
.shift = (shift_), \
|
||||
})
|
||||
|
||||
/** struct tcp_syn_opts - TCP options we apply to SYN packets
|
||||
* @mss: Maximum Segment Size (MSS) option
|
||||
* @nop: NOP opt (for alignment)
|
||||
* @ws: Window Scaling (WS) option
|
||||
*/
|
||||
struct tcp_syn_opts {
|
||||
struct tcp_opt_mss mss;
|
||||
struct tcp_opt_nop nop;
|
||||
struct tcp_opt_ws ws;
|
||||
} __attribute__ ((packed));
|
||||
#define TCP_SYN_OPTS(mss_, ws_) \
|
||||
((struct tcp_syn_opts){ \
|
||||
.mss = TCP_OPT_MSS(mss_), \
|
||||
.nop = TCP_OPT_NOP, \
|
||||
.ws = TCP_OPT_WS(ws_), \
|
||||
})
|
||||
|
||||
extern char tcp_buf_discard [MAX_WINDOW];
|
||||
|
||||
void conn_flag_do(const struct ctx *c, struct tcp_tap_conn *conn,
|
||||
|
@ -153,23 +82,19 @@ void conn_event_do(const struct ctx *c, struct tcp_tap_conn *conn,
|
|||
conn_event_do(c, conn, event); \
|
||||
} while (0)
|
||||
|
||||
void tcp_rst_do(const struct ctx *c, struct tcp_tap_conn *conn);
|
||||
void tcp_rst_do(struct ctx *c, struct tcp_tap_conn *conn);
|
||||
#define tcp_rst(c, conn) \
|
||||
do { \
|
||||
flow_dbg((conn), "TCP reset at %s:%i", __func__, __LINE__); \
|
||||
tcp_rst_do(c, conn); \
|
||||
} while (0)
|
||||
|
||||
struct tcp_info_linux;
|
||||
|
||||
size_t tcp_l2_buf_fill_headers(const struct tcp_tap_conn *conn,
|
||||
struct iovec *iov, size_t dlen,
|
||||
const uint16_t *check, uint32_t seq,
|
||||
bool no_tcp_csum);
|
||||
const uint16_t *check, uint32_t seq);
|
||||
int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_tap_conn *conn,
|
||||
bool force_seq, struct tcp_info_linux *tinfo);
|
||||
int tcp_prepare_flags(const struct ctx *c, struct tcp_tap_conn *conn,
|
||||
int flags, struct tcphdr *th, struct tcp_syn_opts *opts,
|
||||
size_t *optlen);
|
||||
int force_seq, struct tcp_info *tinfo);
|
||||
int tcp_prepare_flags(struct ctx *c, struct tcp_tap_conn *conn, int flags,
|
||||
struct tcphdr *th, char *data, size_t *optlen);
|
||||
|
||||
#endif /* TCP_INTERNAL_H */
|
||||
|
|
20
tcp_splice.c
20
tcp_splice.c
|
@ -320,7 +320,7 @@ static int tcp_splice_connect_finish(const struct ctx *c,
|
|||
}
|
||||
|
||||
if (fcntl(conn->pipe[sidei][0], F_SETPIPE_SZ,
|
||||
c->tcp.pipe_size) != (int)c->tcp.pipe_size) {
|
||||
c->tcp.pipe_size)) {
|
||||
flow_trace(conn,
|
||||
"cannot set %d->%d pipe size to %zu",
|
||||
sidei, !sidei, c->tcp.pipe_size);
|
||||
|
@ -503,7 +503,7 @@ swap:
|
|||
lowat_act_flag = RCVLOWAT_ACT(fromsidei);
|
||||
|
||||
while (1) {
|
||||
ssize_t readlen, written, pending;
|
||||
ssize_t readlen, to_write = 0, written;
|
||||
int more = 0;
|
||||
|
||||
retry:
|
||||
|
@ -518,11 +518,14 @@ retry:
|
|||
|
||||
if (errno != EAGAIN)
|
||||
goto close;
|
||||
|
||||
to_write = c->tcp.pipe_size;
|
||||
} else if (!readlen) {
|
||||
eof = 1;
|
||||
to_write = c->tcp.pipe_size;
|
||||
} else {
|
||||
never_read = 0;
|
||||
|
||||
to_write += readlen;
|
||||
if (readlen >= (long)c->tcp.pipe_size * 90 / 100)
|
||||
more = SPLICE_F_MORE;
|
||||
|
||||
|
@ -532,10 +535,10 @@ retry:
|
|||
|
||||
eintr:
|
||||
written = splice(conn->pipe[fromsidei][0], NULL,
|
||||
conn->s[!fromsidei], NULL, c->tcp.pipe_size,
|
||||
conn->s[!fromsidei], NULL, to_write,
|
||||
SPLICE_F_MOVE | more | SPLICE_F_NONBLOCK);
|
||||
flow_trace(conn, "%zi from write-side call (passed %zi)",
|
||||
written, c->tcp.pipe_size);
|
||||
written, to_write);
|
||||
|
||||
/* Most common case: skip updating counters. */
|
||||
if (readlen > 0 && readlen == written) {
|
||||
|
@ -581,9 +584,10 @@ eintr:
|
|||
if (never_read && written == (long)(c->tcp.pipe_size))
|
||||
goto retry;
|
||||
|
||||
pending = conn->read[fromsidei] - conn->written[fromsidei];
|
||||
if (!never_read && written > 0 && written < pending)
|
||||
if (!never_read && written < to_write) {
|
||||
to_write -= written;
|
||||
goto retry;
|
||||
}
|
||||
|
||||
if (eof)
|
||||
break;
|
||||
|
@ -672,7 +676,7 @@ static void tcp_splice_pipe_refill(const struct ctx *c)
|
|||
continue;
|
||||
|
||||
if (fcntl(splice_pipe_pool[i][0], F_SETPIPE_SZ,
|
||||
c->tcp.pipe_size) != (int)c->tcp.pipe_size) {
|
||||
c->tcp.pipe_size)) {
|
||||
trace("TCP (spliced): cannot set pool pipe size to %zu",
|
||||
c->tcp.pipe_size);
|
||||
}
|
||||
|
|
|
@ -8,6 +8,7 @@
|
|||
WGET = wget -c
|
||||
|
||||
DEBIAN_IMGS = debian-8.11.0-openstack-amd64.qcow2 \
|
||||
debian-9-nocloud-amd64-daily-20200210-166.qcow2 \
|
||||
debian-10-nocloud-amd64.qcow2 \
|
||||
debian-10-generic-arm64.qcow2 \
|
||||
debian-10-generic-ppc64el-20220911-1135.qcow2 \
|
||||
|
@ -41,7 +42,8 @@ OPENSUSE_IMGS = openSUSE-Leap-15.1-JeOS.x86_64-kvm-and-xen.qcow2 \
|
|||
openSUSE-Leap-15.2-JeOS.x86_64-kvm-and-xen.qcow2 \
|
||||
openSUSE-Leap-15.3-JeOS.x86_64-kvm-and-xen.qcow2 \
|
||||
openSUSE-Tumbleweed-ARM-JeOS-efi.aarch64.raw.xz \
|
||||
openSUSE-Tumbleweed-ARM-JeOS-efi.armv7l.raw.xz
|
||||
openSUSE-Tumbleweed-ARM-JeOS-efi.armv7l.raw.xz \
|
||||
openSUSE-Tumbleweed-JeOS.x86_64-kvm-and-xen.qcow2
|
||||
|
||||
UBUNTU_OLD_IMGS = trusty-server-cloudimg-amd64-disk1.img \
|
||||
trusty-server-cloudimg-i386-disk1.img \
|
||||
|
@ -133,6 +135,9 @@ realclean: clean
|
|||
debian-8.11.0-openstack-%.qcow2:
|
||||
$(WGET) -O $@ https://cloud.debian.org/images/cloud/OpenStack/archive/8.11.0/debian-8.11.0-openstack-$*.qcow2
|
||||
|
||||
debian-9-nocloud-%-daily-20200210-166.qcow2:
|
||||
$(WGET) -O $@ https://cloud.debian.org/images/cloud/stretch/daily/20200210-166/debian-9-nocloud-$*-daily-20200210-166.qcow2
|
||||
|
||||
debian-10-nocloud-%.qcow2:
|
||||
$(WGET) -O $@ https://cloud.debian.org/images/cloud/buster/latest/debian-10-nocloud-$*.qcow2
|
||||
|
||||
|
@ -198,6 +203,9 @@ openSUSE-Tumbleweed-ARM-JeOS-efi.aarch64.raw.xz:
|
|||
openSUSE-Tumbleweed-ARM-JeOS-efi.armv7l.raw.xz:
|
||||
$(WGET) -O $@ http://download.opensuse.org/ports/armv7hl/tumbleweed/appliances/openSUSE-Tumbleweed-ARM-JeOS-efi.armv7l.raw.xz
|
||||
|
||||
openSUSE-Tumbleweed-JeOS.x86_64-kvm-and-xen.qcow2:
|
||||
$(WGET) -O $@ https://download.opensuse.org/tumbleweed/appliances/openSUSE-Tumbleweed-JeOS.x86_64-kvm-and-xen.qcow2
|
||||
|
||||
# Ubuntu downloads
|
||||
trusty-server-cloudimg-%-disk1.img:
|
||||
$(WGET) -O $@ https://cloud-images.ubuntu.com/trusty/current/trusty-server-cloudimg-$*-disk1.img
|
||||
|
|
|
@ -58,7 +58,7 @@ setup_passt() {
|
|||
context_run_bg qemu 'qemu-system-'"${QEMU_ARCH}" \
|
||||
' -machine accel=kvm' \
|
||||
' -m '${VMEM}' -cpu host -smp '${VCPUS} \
|
||||
' -kernel '"${KERNEL}" \
|
||||
' -kernel ' "/boot/vmlinuz-$(uname -r)" \
|
||||
' -initrd '${INITRAMFS}' -nographic -serial stdio' \
|
||||
' -nodefaults' \
|
||||
' -append "console=ttyS0 mitigations=off apparmor=0" ' \
|
||||
|
@ -159,7 +159,7 @@ setup_passt_in_ns() {
|
|||
' -machine accel=kvm' \
|
||||
' -M accel=kvm:tcg' \
|
||||
' -m '${VMEM}' -cpu host -smp '${VCPUS} \
|
||||
' -kernel '"${KERNEL}" \
|
||||
' -kernel ' "/boot/vmlinuz-$(uname -r)" \
|
||||
' -initrd '${INITRAMFS}' -nographic -serial stdio' \
|
||||
' -nodefaults' \
|
||||
' -append "console=ttyS0 mitigations=off apparmor=0" ' \
|
||||
|
@ -230,7 +230,7 @@ setup_two_guests() {
|
|||
context_run_bg qemu_1 'qemu-system-'"${QEMU_ARCH}" \
|
||||
' -M accel=kvm:tcg' \
|
||||
' -m '${VMEM}' -cpu host -smp '${VCPUS} \
|
||||
' -kernel '"${KERNEL}" \
|
||||
' -kernel ' "/boot/vmlinuz-$(uname -r)" \
|
||||
' -initrd '${INITRAMFS}' -nographic -serial stdio' \
|
||||
' -nodefaults' \
|
||||
' -append "console=ttyS0 mitigations=off apparmor=0" ' \
|
||||
|
@ -243,7 +243,7 @@ setup_two_guests() {
|
|||
context_run_bg qemu_2 'qemu-system-'"${QEMU_ARCH}" \
|
||||
' -M accel=kvm:tcg' \
|
||||
' -m '${VMEM}' -cpu host -smp '${VCPUS} \
|
||||
' -kernel '"${KERNEL}" \
|
||||
' -kernel ' "/boot/vmlinuz-$(uname -r)" \
|
||||
' -initrd '${INITRAMFS}' -nographic -serial stdio' \
|
||||
' -nodefaults' \
|
||||
' -append "console=ttyS0 mitigations=off apparmor=0" ' \
|
||||
|
|
|
@ -31,8 +31,8 @@ PR_DELAY_INIT=100 # ms
|
|||
# $@: Message to print
|
||||
info() {
|
||||
tmux select-pane -t ${PANE_INFO}
|
||||
printf "${@}\n" >> $STATEBASE/log_pipe
|
||||
printf "${@}\n" >> "${LOGFILE}"
|
||||
echo "${@}" >> $STATEBASE/log_pipe
|
||||
echo "${@}" >> "${LOGFILE}"
|
||||
}
|
||||
|
||||
# info_n() - Highlight, print message to pane and to log file without newline
|
||||
|
@ -47,13 +47,13 @@ info_n() {
|
|||
# $@: Message to print
|
||||
info_nolog() {
|
||||
tmux select-pane -t ${PANE_INFO}
|
||||
printf "${@}\n" >> $STATEBASE/log_pipe
|
||||
echo "${@}" >> $STATEBASE/log_pipe
|
||||
}
|
||||
|
||||
# log() - Print message to log file
|
||||
# $@: Message to print
|
||||
log() {
|
||||
printf "${@}\n" >> "${LOGFILE}"
|
||||
echo "${@}" >> "${LOGFILE}"
|
||||
}
|
||||
|
||||
# info_nolog_n() - Send message to pane without highlighting it, without newline
|
||||
|
@ -664,7 +664,7 @@ pause_continue() {
|
|||
|
||||
# run_term() - Start tmux session, running entry point, with recording if needed
|
||||
run_term() {
|
||||
TMUX="tmux new-session -s passt_test -eSTATEBASE=$STATEBASE -ePCAP=$PCAP -eDEBUG=$DEBUG -eTRACE=$TRACE -eKERNEL=$KERNEL"
|
||||
TMUX="tmux new-session -s passt_test -eSTATEBASE=$STATEBASE -ePCAP=$PCAP -eDEBUG=$DEBUG"
|
||||
|
||||
if [ ${CI} -eq 1 ]; then
|
||||
printf '\e[8;50;240t'
|
||||
|
|
|
@ -31,15 +31,10 @@
|
|||
|
||||
#define ARRAY_SIZE(a) ((int)(sizeof(a) / sizeof((a)[0])))
|
||||
|
||||
#define die(...) \
|
||||
do { \
|
||||
fprintf(stderr, "nstool: " __VA_ARGS__); \
|
||||
exit(1); \
|
||||
} while (0)
|
||||
|
||||
#define err(...) \
|
||||
do { \
|
||||
fprintf(stderr, "nstool: " __VA_ARGS__); \
|
||||
#define die(...) \
|
||||
do { \
|
||||
fprintf(stderr, __VA_ARGS__); \
|
||||
exit(1); \
|
||||
} while (0)
|
||||
|
||||
struct ns_type {
|
||||
|
@ -161,9 +156,6 @@ static int connect_ctl(const char *sockpath, bool wait,
|
|||
|
||||
static void cmd_hold(int argc, char *argv[])
|
||||
{
|
||||
struct sigaction sa = {
|
||||
.sa_handler = SIG_IGN,
|
||||
};
|
||||
int fd = socket(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, PF_UNIX);
|
||||
struct sockaddr_un addr;
|
||||
const char *sockpath = argv[1];
|
||||
|
@ -193,10 +185,6 @@ static void cmd_hold(int argc, char *argv[])
|
|||
if (!getcwd(info.cwd, sizeof(info.cwd)))
|
||||
die("getcwd(): %s\n", strerror(errno));
|
||||
|
||||
rc = sigaction(SIGPIPE, &sa, NULL);
|
||||
if (rc)
|
||||
die("sigaction(SIGPIPE): %s\n", strerror(errno));
|
||||
|
||||
do {
|
||||
int afd = accept(fd, NULL, NULL);
|
||||
char buf;
|
||||
|
@ -205,21 +193,17 @@ static void cmd_hold(int argc, char *argv[])
|
|||
die("accept(): %s\n", strerror(errno));
|
||||
|
||||
rc = write(afd, &info, sizeof(info));
|
||||
if (rc < 0) {
|
||||
err("holder write() to control socket: %s\n",
|
||||
strerror(errno));
|
||||
}
|
||||
if (rc < 0)
|
||||
die("write(): %s\n", strerror(errno));
|
||||
if ((size_t)rc < sizeof(info))
|
||||
err("holder short write() on control socket\n");
|
||||
die("short write() on control socket\n");
|
||||
|
||||
rc = read(afd, &buf, sizeof(buf));
|
||||
if (rc < 0) {
|
||||
err("holder read() on control socket: %s\n",
|
||||
strerror(errno));
|
||||
}
|
||||
if (rc < 0)
|
||||
die("read(): %s\n", strerror(errno));
|
||||
|
||||
close(afd);
|
||||
} while (rc <= 0);
|
||||
} while (rc == 0);
|
||||
|
||||
unlink(sockpath);
|
||||
}
|
||||
|
@ -362,7 +346,7 @@ static int openns(const char *fmt, ...)
|
|||
}
|
||||
|
||||
static pid_t sig_pid;
|
||||
static void sig_propagate(int signum)
|
||||
static void sig_handler(int signum)
|
||||
{
|
||||
int err;
|
||||
|
||||
|
@ -374,7 +358,7 @@ static void sig_propagate(int signum)
|
|||
static void wait_for_child(pid_t pid)
|
||||
{
|
||||
struct sigaction sa = {
|
||||
.sa_handler = sig_propagate,
|
||||
.sa_handler = sig_handler,
|
||||
.sa_flags = SA_RESETHAND,
|
||||
};
|
||||
int status, err;
|
||||
|
|
|
@ -49,8 +49,6 @@ check [ "__SEARCH__" = "__HOST_SEARCH__" ]
|
|||
|
||||
test DHCPv6: address
|
||||
guest /sbin/dhclient -6 __IFNAME__
|
||||
# Wait for DAD to complete
|
||||
guest while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done
|
||||
gout ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__IFNAME__").addr_info[] | select(.prefixlen == 128).local] | .[0]'
|
||||
hout HOST_ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__HOST_IFNAME6__").addr_info[] | select(.scope == "global" and .deprecated != true).local] | .[0]'
|
||||
check [ "__ADDR6__" = "__HOST_ADDR6__" ]
|
||||
|
|
|
@ -16,15 +16,13 @@ htools ip jq sipcalc grep cut
|
|||
|
||||
test Interface name
|
||||
gout IFNAME ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname'
|
||||
guest ip link set dev __IFNAME__ up
|
||||
# Wait for DAD to complete
|
||||
guest while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done
|
||||
guest ip link set dev __IFNAME__ up && sleep 2
|
||||
hout HOST_IFNAME6 ip -j -6 route show|jq -rM '[.[] | select(.dst == "default").dev] | .[0]'
|
||||
check [ -n "__IFNAME__" ]
|
||||
|
||||
test SLAAC: prefix
|
||||
gout ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__IFNAME__").addr_info[] | select(.scope == "global" and .protocol == "kernel_ra") | .local + "/" + (.prefixlen | tostring)] | .[0]'
|
||||
gout PREFIX6 sipcalc __ADDR6__ | grep prefix | cut -d' ' -f4
|
||||
gout ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__IFNAME__").addr_info[] | select(.scope == "global" and .prefixlen == 64).local] | .[0]'
|
||||
gout PREFIX6 sipcalc __ADDR6__/64 | grep prefix | cut -d' ' -f4
|
||||
hout HOST_ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__HOST_IFNAME6__").addr_info[] | select(.scope == "global" and .deprecated != true).local] | .[0]'
|
||||
hout HOST_PREFIX6 sipcalc __HOST_ADDR6__/64 | grep prefix | cut -d' ' -f4
|
||||
check [ "__PREFIX6__" = "__HOST_PREFIX6__" ]
|
||||
|
|
|
@ -52,8 +52,6 @@ check [ "__SEARCH__" = "__HOST_SEARCH__" ]
|
|||
|
||||
test DHCPv6: address
|
||||
guest /sbin/dhclient -6 __IFNAME__
|
||||
# Wait for DAD to complete
|
||||
guest while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done
|
||||
gout ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__IFNAME__").addr_info[] | select(.prefixlen == 128).local] | .[0]'
|
||||
hout HOST_ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__HOST_IFNAME6__").addr_info[] | select(.scope == "global" and .deprecated != true).local] | .[0]'
|
||||
check [ "__ADDR6__" = "__HOST_ADDR6__" ]
|
||||
|
|
|
@ -32,7 +32,7 @@ host socat -u OPEN:__BASEPATH__/big.bin TCP4:127.0.0.1:10001
|
|||
guestw
|
||||
guest cmp test_big.bin /root/big.bin
|
||||
|
||||
test TCP/IPv4: host to ns (spliced): big transfer
|
||||
test TCP/IPv4: host to ns: big transfer
|
||||
nsb socat -u TCP4-LISTEN:10002 OPEN:__TEMP_NS_BIG__,create,trunc
|
||||
sleep 1
|
||||
host socat -u OPEN:__BASEPATH__/big.bin TCP4:127.0.0.1:10002
|
||||
|
@ -90,7 +90,7 @@ host socat -u OPEN:__BASEPATH__/small.bin TCP4:127.0.0.1:10001
|
|||
guestw
|
||||
guest cmp test_small.bin /root/small.bin
|
||||
|
||||
test TCP/IPv4: host to ns (spliced): small transfer
|
||||
test TCP/IPv4: host to ns: small transfer
|
||||
nsb socat -u TCP4-LISTEN:10002 OPEN:__TEMP_NS_SMALL__,create,trunc
|
||||
sleep 1
|
||||
host socat -u OPEN:__BASEPATH__/small.bin TCP4:127.0.0.1:10002
|
||||
|
@ -146,7 +146,7 @@ host socat -u OPEN:__BASEPATH__/big.bin TCP6:[::1]:10001
|
|||
guestw
|
||||
guest cmp test_big.bin /root/big.bin
|
||||
|
||||
test TCP/IPv6: host to ns (spliced): big transfer
|
||||
test TCP/IPv6: host to ns: big transfer
|
||||
nsb socat -u TCP6-LISTEN:10002 OPEN:__TEMP_NS_BIG__,create,trunc
|
||||
sleep 1
|
||||
host socat -u OPEN:__BASEPATH__/big.bin TCP6:[::1]:10002
|
||||
|
@ -204,7 +204,7 @@ host socat -u OPEN:__BASEPATH__/small.bin TCP6:[::1]:10001
|
|||
guestw
|
||||
guest cmp test_small.bin /root/small.bin
|
||||
|
||||
test TCP/IPv6: host to ns (spliced): small transfer
|
||||
test TCP/IPv6: host to ns: small transfer
|
||||
nsb socat -u TCP6-LISTEN:10002 OPEN:__TEMP_NS_SMALL__,create,trunc
|
||||
sleep 1
|
||||
host socat -u OPEN:__BASEPATH__/small.bin TCP6:[::1]:10002
|
||||
|
|
|
@ -30,7 +30,7 @@ host socat -u OPEN:__BASEPATH__/medium.bin UDP4:127.0.0.1:10001,shut-null
|
|||
guestw
|
||||
guest cmp test.bin /root/medium.bin
|
||||
|
||||
test UDP/IPv4: host to ns (recvmmsg/sendmmsg)
|
||||
test UDP/IPv4: host to ns
|
||||
nsb socat -u UDP4-LISTEN:10002,null-eof OPEN:__TEMP_NS__,create,trunc
|
||||
sleep 1
|
||||
host socat -u OPEN:__BASEPATH__/medium.bin UDP4:127.0.0.1:10002,shut-null
|
||||
|
@ -88,7 +88,7 @@ host socat -u OPEN:__BASEPATH__/medium.bin UDP6:[::1]:10001,shut-null
|
|||
guestw
|
||||
guest cmp test.bin /root/medium.bin
|
||||
|
||||
test UDP/IPv6: host to ns (recvmmsg/sendmmsg)
|
||||
test UDP/IPv6: host to ns
|
||||
nsb socat -u UDP6-LISTEN:10002,null-eof OPEN:__TEMP_NS__,create,trunc
|
||||
sleep 1
|
||||
host socat -u OPEN:__BASEPATH__/medium.bin UDP6:[::1]:10002,shut-null
|
||||
|
|
|
@ -35,8 +35,6 @@ check [ __MTU__ = 65520 ]
|
|||
|
||||
test DHCPv6: address
|
||||
ns /sbin/dhclient -6 --no-pid __IFNAME__
|
||||
# Wait for DAD to complete
|
||||
ns while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done
|
||||
hout HOST_IFNAME6 ip -j -6 route show|jq -rM '[.[] | select(.dst == "default").dev] | .[0]'
|
||||
nsout ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__IFNAME__").addr_info[] | select(.prefixlen == 128).local] | .[0]'
|
||||
hout HOST_ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__HOST_IFNAME6__").addr_info[] | select(.scope == "global" and .deprecated != true).local] | .[0]'
|
||||
|
|
|
@ -18,12 +18,11 @@ test Interface name
|
|||
nsout IFNAME ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname'
|
||||
check [ -n "__IFNAME__" ]
|
||||
ns ip link set dev __IFNAME__ up
|
||||
# Wait for DAD to complete
|
||||
ns while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done
|
||||
sleep 2
|
||||
|
||||
test SLAAC: prefix
|
||||
nsout ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__IFNAME__").addr_info[] | select(.scope == "global" and .protocol == "kernel_ra") | .local + "/" + (.prefixlen | tostring)] | .[0]'
|
||||
nsout PREFIX6 sipcalc __ADDR6__ | grep prefix | cut -d' ' -f4
|
||||
nsout ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__IFNAME__").addr_info[] | select(.scope == "global" and .prefixlen == 64).local] | .[0]'
|
||||
nsout PREFIX6 sipcalc __ADDR6__/64 | grep prefix | cut -d' ' -f4
|
||||
hout HOST_ADDR6 ip -j -6 addr show|jq -rM ['.[] | select(.ifname == "__IFNAME__").addr_info[] | select(.scope == "global" and .deprecated != true).local] | .[0]'
|
||||
hout HOST_PREFIX6 sipcalc __HOST_ADDR6__/64 | grep prefix | cut -d' ' -f4
|
||||
check [ "__PREFIX6__" = "__HOST_PREFIX6__" ]
|
||||
|
|
|
@ -19,8 +19,8 @@ set TEMP_NS_BIG __STATEDIR__/test_ns_big.bin
|
|||
set TEMP_SMALL __STATEDIR__/test_small.bin
|
||||
set TEMP_NS_SMALL __STATEDIR__/test_ns_small.bin
|
||||
|
||||
test TCP/IPv4: host to ns (spliced): big transfer
|
||||
nsb socat -u TCP4-LISTEN:10002 OPEN:__TEMP_NS_BIG__,create,trunc
|
||||
test TCP/IPv4: host to ns: big transfer
|
||||
nsb socat -u TCP4-LISTEN:10002,bind=127.0.0.1 OPEN:__TEMP_NS_BIG__,create,trunc
|
||||
host socat -u OPEN:__BASEPATH__/big.bin TCP4:127.0.0.1:10002
|
||||
nsw
|
||||
check cmp __BASEPATH__/big.bin __TEMP_NS_BIG__
|
||||
|
@ -38,8 +38,8 @@ ns socat -u OPEN:__BASEPATH__/big.bin TCP4:__GW__:10003
|
|||
hostw
|
||||
check cmp __BASEPATH__/big.bin __TEMP_BIG__
|
||||
|
||||
test TCP/IPv4: host to ns (spliced): small transfer
|
||||
nsb socat -u TCP4-LISTEN:10002 OPEN:__TEMP_NS_SMALL__,create,trunc
|
||||
test TCP/IPv4: host to ns: small transfer
|
||||
nsb socat -u TCP4-LISTEN:10002,bind=127.0.0.1 OPEN:__TEMP_NS_SMALL__,create,trunc
|
||||
host socat OPEN:__BASEPATH__/small.bin TCP4:127.0.0.1:10002
|
||||
nsw
|
||||
check cmp __BASEPATH__/small.bin __TEMP_NS_SMALL__
|
||||
|
@ -57,8 +57,8 @@ ns socat -u OPEN:__BASEPATH__/small.bin TCP4:__GW__:10003
|
|||
hostw
|
||||
check cmp __BASEPATH__/small.bin __TEMP_SMALL__
|
||||
|
||||
test TCP/IPv6: host to ns (spliced): big transfer
|
||||
nsb socat -u TCP6-LISTEN:10002 OPEN:__TEMP_NS_BIG__,create,trunc
|
||||
test TCP/IPv6: host to ns: big transfer
|
||||
nsb socat -u TCP6-LISTEN:10002,bind=[::1] OPEN:__TEMP_NS_BIG__,create,trunc
|
||||
host socat -u OPEN:__BASEPATH__/big.bin TCP6:[::1]:10002
|
||||
nsw
|
||||
check cmp __BASEPATH__/big.bin __TEMP_NS_BIG__
|
||||
|
@ -77,8 +77,8 @@ ns socat -u OPEN:__BASEPATH__/big.bin TCP6:[__GW6__%__IFNAME__]:10003
|
|||
hostw
|
||||
check cmp __BASEPATH__/big.bin __TEMP_BIG__
|
||||
|
||||
test TCP/IPv6: host to ns (spliced): small transfer
|
||||
nsb socat -u TCP6-LISTEN:10002 OPEN:__TEMP_NS_SMALL__,create,trunc
|
||||
test TCP/IPv6: host to ns: small transfer
|
||||
nsb socat -u TCP6-LISTEN:10002,bind=[::1] OPEN:__TEMP_NS_SMALL__,create,trunc
|
||||
host socat -u OPEN:__BASEPATH__/small.bin TCP6:[::1]:10002
|
||||
nsw
|
||||
check cmp __BASEPATH__/small.bin __TEMP_NS_SMALL__
|
||||
|
|
|
@ -17,8 +17,8 @@ htools dd socat ip jq
|
|||
set TEMP __STATEDIR__/test.bin
|
||||
set TEMP_NS __STATEDIR__/test_ns.bin
|
||||
|
||||
test UDP/IPv4: host to ns (recvmmsg/sendmmsg)
|
||||
nsb socat -u UDP4-LISTEN:10002,null-eof OPEN:__TEMP_NS__,create,trunc
|
||||
test UDP/IPv4: host to ns
|
||||
nsb socat -u UDP4-LISTEN:10002,bind=127.0.0.1,null-eof OPEN:__TEMP_NS__,create,trunc
|
||||
host socat OPEN:__BASEPATH__/medium.bin UDP4:127.0.0.1:10002,shut-null
|
||||
nsw
|
||||
check cmp __BASEPATH__/medium.bin __TEMP_NS__
|
||||
|
@ -37,8 +37,8 @@ ns socat -u OPEN:__BASEPATH__/medium.bin UDP4:__GW__:10003,shut-null
|
|||
hostw
|
||||
check cmp __BASEPATH__/medium.bin __TEMP__
|
||||
|
||||
test UDP/IPv6: host to ns (recvmmsg/sendmmsg)
|
||||
nsb socat -u UDP6-LISTEN:10002,null-eof OPEN:__TEMP_NS__,create,trunc
|
||||
test UDP/IPv6: host to ns
|
||||
nsb socat -u UDP6-LISTEN:10002,bind=[::1],null-eof OPEN:__TEMP_NS__,create,trunc
|
||||
host socat -u OPEN:__BASEPATH__/medium.bin UDP6:[::1]:10002,shut-null
|
||||
nsw
|
||||
check cmp __BASEPATH__/medium.bin __TEMP_NS__
|
||||
|
|
|
@ -116,8 +116,6 @@ iperf3k ns
|
|||
# Reducing MTU below 1280 deconfigures IPv6, get our address back
|
||||
guest dhclient -6 -x
|
||||
guest dhclient -6 __IFNAME__
|
||||
# Wait for DAD to complete
|
||||
guest while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done
|
||||
|
||||
tl TCP RR latency over IPv4: guest to host
|
||||
lat -
|
||||
|
|
|
@ -211,7 +211,7 @@ tr TCP throughput over IPv6: host to ns
|
|||
iperf3s ns 10002
|
||||
|
||||
nsout IFNAME ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname'
|
||||
nsout ADDR6 ip -j -6 addr show|jq -rM '.[] | select(.ifname == "__IFNAME__").addr_info[] | select(.scope == "global").local'
|
||||
nsout ADDR6 ip -j -6 addr show|jq -rM '.[] | select(.ifname == "__IFNAME__").addr_info[] | select(.scope == "global" and .prefixlen == 64).local'
|
||||
bw -
|
||||
bw -
|
||||
bw -
|
||||
|
|
|
@ -196,7 +196,7 @@ tr UDP throughput over IPv6: host to ns
|
|||
iperf3s ns 10002
|
||||
|
||||
nsout IFNAME ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname'
|
||||
nsout ADDR6 ip -j -6 addr show|jq -rM '.[] | select(.ifname == "__IFNAME__").addr_info[] | select(.scope == "global").local'
|
||||
nsout ADDR6 ip -j -6 addr show|jq -rM '.[] | select(.ifname == "__IFNAME__").addr_info[] | select(.scope == "global" and .prefixlen == 64).local'
|
||||
iperf3 BW host __ADDR6__ 10002 __TIME__ __OPTS__ -b 8G -l 1472
|
||||
bw __BW__ 0.3 0.5
|
||||
iperf3 BW host __ADDR6__ 10002 __TIME__ __OPTS__ -b 12G -l 3972
|
||||
|
|
3
test/run
3
test/run
|
@ -38,9 +38,6 @@ TRACE=${TRACE:-0}
|
|||
# If set, tell passt and pasta to take packet captures
|
||||
PCAP=${PCAP:-0}
|
||||
|
||||
# Custom kernel to boot guests with, if given
|
||||
KERNEL=${KERNEL:-"/boot/vmlinuz-$(uname -r)"}
|
||||
|
||||
COMMIT="$(git log --oneline --no-decorate -1)"
|
||||
|
||||
. lib/util
|
||||
|
|
|
@ -36,13 +36,9 @@ check [ "__ADDR2__" = "__HOST_ADDR__" ]
|
|||
|
||||
test DHCPv6: addresses
|
||||
# Link is up now, wait for DAD to complete
|
||||
guest1 while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done
|
||||
guest2 while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done
|
||||
sleep 2
|
||||
guest1 /sbin/dhclient -6 __IFNAME1__
|
||||
guest2 /sbin/dhclient -6 __IFNAME2__
|
||||
# Wait for DAD to complete on the DHCP address
|
||||
guest1 while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done
|
||||
guest2 while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done
|
||||
g1out ADDR1_6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__IFNAME1__").addr_info[] | select(.prefixlen == 128).local] | .[0]'
|
||||
g2out ADDR2_6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__IFNAME2__").addr_info[] | select(.prefixlen == 128).local] | .[0]'
|
||||
hout HOST_ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__HOST_IFNAME6__").addr_info[] | select(.scope == "global" and .deprecated != true).local] | .[0]'
|
||||
|
@ -52,33 +48,33 @@ check [ "__ADDR2_6__" = "__HOST_ADDR6__" ]
|
|||
test TCP/IPv4: guest 1 > guest 2
|
||||
g1out GW1 ip -j -4 route show|jq -rM '.[] | select(.dst == "default").gateway'
|
||||
guest2b socat -u TCP4-LISTEN:10004 OPEN:msg,create,trunc
|
||||
sleep 1
|
||||
guest1 echo "Hello_from_guest_1" | socat -u STDIN TCP4:__GW1__:10004
|
||||
guest2w
|
||||
sleep 1
|
||||
g2out MSG2 cat msg
|
||||
check [ "__MSG2__" = "Hello_from_guest_1" ]
|
||||
|
||||
test TCP/IPv6: guest 2 > guest 1
|
||||
g2out GW2_6 ip -j -6 route show|jq -rM '.[] | select(.dst == "default").gateway'
|
||||
guest1b socat -u TCP6-LISTEN:10001 OPEN:msg,create,trunc
|
||||
sleep 1
|
||||
guest2 echo "Hello_from_guest_2" | socat -u STDIN TCP6:[__GW2_6__%__IFNAME2__]:10001
|
||||
guest1w
|
||||
sleep 1
|
||||
g1out MSG1 cat msg
|
||||
check [ "__MSG1__" = "Hello_from_guest_2" ]
|
||||
|
||||
test UDP/IPv4: guest 1 > guest 2
|
||||
guest2b socat -u TCP4-LISTEN:10004 OPEN:msg,create,trunc
|
||||
sleep 1
|
||||
guest1 echo "Hello_from_guest_1" | socat -u STDIN TCP4:__GW1__:10004
|
||||
guest2w
|
||||
sleep 1
|
||||
g2out MSG2 cat msg
|
||||
check [ "__MSG2__" = "Hello_from_guest_1" ]
|
||||
|
||||
test UDP/IPv6: guest 2 > guest 1
|
||||
guest1b socat -u TCP6-LISTEN:10001 OPEN:msg,create,trunc
|
||||
sleep 1
|
||||
guest2 echo "Hello_from_guest_2" | socat -u STDIN TCP6:[__GW2_6__%__IFNAME2__]:10001
|
||||
guest1w
|
||||
sleep 1
|
||||
g1out MSG1 cat msg
|
||||
check [ "__MSG1__" = "Hello_from_guest_2" ]
|
||||
|
|
270
udp.c
270
udp.c
|
@ -169,11 +169,11 @@ udp_meta[UDP_MAX_FRAMES];
|
|||
* @UDP_NUM_IOVS the number of entries in the iovec array
|
||||
*/
|
||||
enum udp_iov_idx {
|
||||
UDP_IOV_TAP,
|
||||
UDP_IOV_ETH,
|
||||
UDP_IOV_IP,
|
||||
UDP_IOV_PAYLOAD,
|
||||
UDP_NUM_IOVS,
|
||||
UDP_IOV_TAP = 0,
|
||||
UDP_IOV_ETH = 1,
|
||||
UDP_IOV_IP = 2,
|
||||
UDP_IOV_PAYLOAD = 3,
|
||||
UDP_NUM_IOVS
|
||||
};
|
||||
|
||||
/* IOVs and msghdr arrays for receiving datagrams from sockets */
|
||||
|
@ -294,17 +294,15 @@ static void udp_splice_send(const struct ctx *c, size_t start, size_t n,
|
|||
|
||||
/**
|
||||
* udp_update_hdr4() - Update headers for one IPv4 datagram
|
||||
* @ip4h: Pre-filled IPv4 header (except for tot_len and saddr)
|
||||
* @bp: Pointer to udp_payload_t to update
|
||||
* @toside: Flowside for destination side
|
||||
* @dlen: Length of UDP payload
|
||||
* @no_udp_csum: Do not set UDP checksum
|
||||
* @ip4h: Pre-filled IPv4 header (except for tot_len and saddr)
|
||||
* @bp: Pointer to udp_payload_t to update
|
||||
* @toside: Flowside for destination side
|
||||
* @dlen: Length of UDP payload
|
||||
*
|
||||
* Return: size of IPv4 payload (UDP header + data)
|
||||
*/
|
||||
static size_t udp_update_hdr4(struct iphdr *ip4h, struct udp_payload_t *bp,
|
||||
const struct flowside *toside, size_t dlen,
|
||||
bool no_udp_csum)
|
||||
const struct flowside *toside, size_t dlen)
|
||||
{
|
||||
const struct in_addr *src = inany_v4(&toside->oaddr);
|
||||
const struct in_addr *dst = inany_v4(&toside->eaddr);
|
||||
|
@ -321,33 +319,22 @@ static size_t udp_update_hdr4(struct iphdr *ip4h, struct udp_payload_t *bp,
|
|||
bp->uh.source = htons(toside->oport);
|
||||
bp->uh.dest = htons(toside->eport);
|
||||
bp->uh.len = htons(l4len);
|
||||
if (no_udp_csum) {
|
||||
bp->uh.check = 0;
|
||||
} else {
|
||||
const struct iovec iov = {
|
||||
.iov_base = bp->data,
|
||||
.iov_len = dlen
|
||||
};
|
||||
csum_udp4(&bp->uh, *src, *dst, &iov, 1, 0);
|
||||
}
|
||||
csum_udp4(&bp->uh, *src, *dst, bp->data, dlen);
|
||||
|
||||
return l4len;
|
||||
}
|
||||
|
||||
/**
|
||||
* udp_update_hdr6() - Update headers for one IPv6 datagram
|
||||
* @ip6h: Pre-filled IPv6 header (except for payload_len and
|
||||
* addresses)
|
||||
* @bp: Pointer to udp_payload_t to update
|
||||
* @toside: Flowside for destination side
|
||||
* @dlen: Length of UDP payload
|
||||
* @no_udp_csum: Do not set UDP checksum
|
||||
* @ip6h: Pre-filled IPv6 header (except for payload_len and addresses)
|
||||
* @bp: Pointer to udp_payload_t to update
|
||||
* @toside: Flowside for destination side
|
||||
* @dlen: Length of UDP payload
|
||||
*
|
||||
* Return: size of IPv6 payload (UDP header + data)
|
||||
*/
|
||||
static size_t udp_update_hdr6(struct ipv6hdr *ip6h, struct udp_payload_t *bp,
|
||||
const struct flowside *toside, size_t dlen,
|
||||
bool no_udp_csum)
|
||||
const struct flowside *toside, size_t dlen)
|
||||
{
|
||||
uint16_t l4len = dlen + sizeof(bp->uh);
|
||||
|
||||
|
@ -361,20 +348,7 @@ static size_t udp_update_hdr6(struct ipv6hdr *ip6h, struct udp_payload_t *bp,
|
|||
bp->uh.source = htons(toside->oport);
|
||||
bp->uh.dest = htons(toside->eport);
|
||||
bp->uh.len = ip6h->payload_len;
|
||||
if (no_udp_csum) {
|
||||
/* 0 is an invalid checksum for UDP IPv6 and dropped by
|
||||
* the kernel stack, even if the checksum is disabled by virtio
|
||||
* flags. We need to put any non-zero value here.
|
||||
*/
|
||||
bp->uh.check = 0xffff;
|
||||
} else {
|
||||
const struct iovec iov = {
|
||||
.iov_base = bp->data,
|
||||
.iov_len = dlen
|
||||
};
|
||||
csum_udp6(&bp->uh, &toside->oaddr.a6, &toside->eaddr.a6,
|
||||
&iov, 1, 0);
|
||||
}
|
||||
csum_udp6(&bp->uh, &toside->oaddr.a6, &toside->eaddr.a6, bp->data, dlen);
|
||||
|
||||
return l4len;
|
||||
}
|
||||
|
@ -384,11 +358,9 @@ static size_t udp_update_hdr6(struct ipv6hdr *ip6h, struct udp_payload_t *bp,
|
|||
* @mmh: Receiving mmsghdr array
|
||||
* @idx: Index of the datagram to prepare
|
||||
* @toside: Flowside for destination side
|
||||
* @no_udp_csum: Do not set UDP checksum
|
||||
*/
|
||||
static void udp_tap_prepare(const struct mmsghdr *mmh,
|
||||
unsigned idx, const struct flowside *toside,
|
||||
bool no_udp_csum)
|
||||
static void udp_tap_prepare(const struct mmsghdr *mmh, unsigned idx,
|
||||
const struct flowside *toside)
|
||||
{
|
||||
struct iovec (*tap_iov)[UDP_NUM_IOVS] = &udp_l2_iov[idx];
|
||||
struct udp_payload_t *bp = &udp_payload[idx];
|
||||
|
@ -396,15 +368,13 @@ static void udp_tap_prepare(const struct mmsghdr *mmh,
|
|||
size_t l4len;
|
||||
|
||||
if (!inany_v4(&toside->eaddr) || !inany_v4(&toside->oaddr)) {
|
||||
l4len = udp_update_hdr6(&bm->ip6h, bp, toside,
|
||||
mmh[idx].msg_len, no_udp_csum);
|
||||
l4len = udp_update_hdr6(&bm->ip6h, bp, toside, mmh[idx].msg_len);
|
||||
tap_hdr_update(&bm->taph, l4len + sizeof(bm->ip6h) +
|
||||
sizeof(udp6_eth_hdr));
|
||||
(*tap_iov)[UDP_IOV_ETH] = IOV_OF_LVALUE(udp6_eth_hdr);
|
||||
(*tap_iov)[UDP_IOV_IP] = IOV_OF_LVALUE(bm->ip6h);
|
||||
} else {
|
||||
l4len = udp_update_hdr4(&bm->ip4h, bp, toside,
|
||||
mmh[idx].msg_len, no_udp_csum);
|
||||
l4len = udp_update_hdr4(&bm->ip4h, bp, toside, mmh[idx].msg_len);
|
||||
tap_hdr_update(&bm->taph, l4len + sizeof(bm->ip4h) +
|
||||
sizeof(udp4_eth_hdr));
|
||||
(*tap_iov)[UDP_IOV_ETH] = IOV_OF_LVALUE(udp4_eth_hdr);
|
||||
|
@ -417,8 +387,7 @@ static void udp_tap_prepare(const struct mmsghdr *mmh,
|
|||
* udp_sock_recverr() - Receive and clear an error from a socket
|
||||
* @s: Socket to receive from
|
||||
*
|
||||
* Return: 1 if error received and processed, 0 if no more errors in queue, < 0
|
||||
* if there was an error reading the queue
|
||||
* Return: ee_errno, 0 on empty queue
|
||||
*
|
||||
* #syscalls recvmsg
|
||||
*/
|
||||
|
@ -439,16 +408,15 @@ static int udp_sock_recverr(int s)
|
|||
|
||||
rc = recvmsg(s, &mh, MSG_ERRQUEUE);
|
||||
if (rc < 0) {
|
||||
if (errno == EAGAIN || errno == EWOULDBLOCK)
|
||||
return 0;
|
||||
if (errno != EAGAIN && errno != EWOULDBLOCK)
|
||||
err_perror("Failed to read error queue");
|
||||
|
||||
err_perror("UDP: Failed to read error queue");
|
||||
return -1;
|
||||
return 0;
|
||||
}
|
||||
|
||||
if (!(mh.msg_flags & MSG_ERRQUEUE)) {
|
||||
err("Missing MSG_ERRQUEUE flag reading error queue");
|
||||
return -1;
|
||||
return 0;
|
||||
}
|
||||
|
||||
hdr = CMSG_FIRSTHDR(&mh);
|
||||
|
@ -457,7 +425,7 @@ static int udp_sock_recverr(int s)
|
|||
(hdr->cmsg_level == IPPROTO_IPV6 &&
|
||||
hdr->cmsg_type == IPV6_RECVERR))) {
|
||||
err("Unexpected cmsg reading error queue");
|
||||
return -1;
|
||||
return 0;
|
||||
}
|
||||
|
||||
ee = (const struct sock_extended_err *)CMSG_DATA(hdr);
|
||||
|
@ -466,54 +434,7 @@ static int udp_sock_recverr(int s)
|
|||
debug("%s error on UDP socket %i: %s",
|
||||
str_ee_origin(ee), s, strerror(ee->ee_errno));
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
||||
/**
|
||||
* udp_sock_errs() - Process errors on a socket
|
||||
* @c: Execution context
|
||||
* @s: Socket to receive from
|
||||
* @events: epoll events bitmap
|
||||
*
|
||||
* Return: Number of errors handled, or < 0 if we have an unrecoverable error
|
||||
*/
|
||||
static int udp_sock_errs(const struct ctx *c, int s, uint32_t events)
|
||||
{
|
||||
unsigned n_err = 0;
|
||||
socklen_t errlen;
|
||||
int rc, err;
|
||||
|
||||
ASSERT(!c->no_udp);
|
||||
|
||||
if (!(events & EPOLLERR))
|
||||
return 0; /* Nothing to do */
|
||||
|
||||
/* Empty the error queue */
|
||||
while ((rc = udp_sock_recverr(s)) > 0)
|
||||
n_err += rc;
|
||||
|
||||
if (rc < 0)
|
||||
return -1; /* error reading error, unrecoverable */
|
||||
|
||||
errlen = sizeof(err);
|
||||
if (getsockopt(s, SOL_SOCKET, SO_ERROR, &err, &errlen) < 0 ||
|
||||
errlen != sizeof(err)) {
|
||||
err_perror("Error reading SO_ERROR");
|
||||
return -1; /* error reading error, unrecoverable */
|
||||
}
|
||||
|
||||
if (err) {
|
||||
debug("Unqueued error on UDP socket %i: %s", s, strerror(err));
|
||||
n_err++;
|
||||
}
|
||||
|
||||
if (!n_err) {
|
||||
/* EPOLLERR, but no errors to clear !? */
|
||||
err("EPOLLERR event without reported errors on socket %i", s);
|
||||
return -1; /* no way to clear, unrecoverable */
|
||||
}
|
||||
|
||||
return n_err;
|
||||
return ee->ee_errno;
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -521,14 +442,15 @@ static int udp_sock_errs(const struct ctx *c, int s, uint32_t events)
|
|||
* @c: Execution context
|
||||
* @s: Socket to receive from
|
||||
* @events: epoll events bitmap
|
||||
* @mmh mmsghdr array to receive into
|
||||
* @mmh: mmsghdr array to receive into
|
||||
* @recv_err: Set to last error in queue. If none: -1 on EPOLLERR, 0 otherwise
|
||||
*
|
||||
* Return: Number of datagrams received
|
||||
* Return: count of datagrams received
|
||||
*
|
||||
* #syscalls recvmmsg arm:recvmmsg_time64 i686:recvmmsg_time64
|
||||
*/
|
||||
static int udp_sock_recv(const struct ctx *c, int s, uint32_t events,
|
||||
struct mmsghdr *mmh)
|
||||
struct mmsghdr *mmh, int *recv_err)
|
||||
{
|
||||
/* For not entirely clear reasons (data locality?) pasta gets better
|
||||
* throughput if we receive tap datagrams one at a time. For small
|
||||
|
@ -541,6 +463,17 @@ static int udp_sock_recv(const struct ctx *c, int s, uint32_t events,
|
|||
|
||||
ASSERT(!c->no_udp);
|
||||
|
||||
/* Clear any errors first */
|
||||
if (events & EPOLLERR) {
|
||||
bool found = false;
|
||||
int ret;
|
||||
|
||||
while ((ret = udp_sock_recverr(s)))
|
||||
found = true;
|
||||
|
||||
*recv_err = found ? ret : -1;
|
||||
}
|
||||
|
||||
if (!(events & EPOLLIN))
|
||||
return 0;
|
||||
|
||||
|
@ -566,16 +499,10 @@ void udp_listen_sock_handler(const struct ctx *c, union epoll_ref ref,
|
|||
uint32_t events, const struct timespec *now)
|
||||
{
|
||||
const socklen_t sasize = sizeof(udp_meta[0].s_in);
|
||||
int recv_err = 0;
|
||||
int n, i;
|
||||
|
||||
if (udp_sock_errs(c, ref.fd, events) < 0) {
|
||||
err("UDP: Unrecoverable error on listening socket:"
|
||||
" (%s port %hu)", pif_name(ref.udp.pif), ref.udp.port);
|
||||
/* FIXME: what now? close/re-open socket? */
|
||||
return;
|
||||
}
|
||||
|
||||
if ((n = udp_sock_recv(c, ref.fd, events, udp_mh_recv)) <= 0)
|
||||
if ((n = udp_sock_recv(c, ref.fd, events, udp_mh_recv, &recv_err)) <= 0)
|
||||
return;
|
||||
|
||||
/* We divide datagrams into batches based on how we need to send them,
|
||||
|
@ -595,8 +522,7 @@ void udp_listen_sock_handler(const struct ctx *c, union epoll_ref ref,
|
|||
udp_splice_prepare(udp_mh_recv, i);
|
||||
} else if (batchpif == PIF_TAP) {
|
||||
udp_tap_prepare(udp_mh_recv, i,
|
||||
flowside_at_sidx(batchsidx),
|
||||
false);
|
||||
flowside_at_sidx(batchsidx));
|
||||
}
|
||||
|
||||
if (++i >= n)
|
||||
|
@ -644,21 +570,51 @@ void udp_reply_sock_handler(const struct ctx *c, union epoll_ref ref,
|
|||
flow_sidx_t tosidx = flow_sidx_opposite(ref.flowside);
|
||||
const struct flowside *toside = flowside_at_sidx(tosidx);
|
||||
struct udp_flow *uflow = udp_at_sidx(ref.flowside);
|
||||
int from_s = uflow->s[ref.flowside.sidei];
|
||||
uint8_t topif = pif_at_sidx(tosidx);
|
||||
int n, i, from_s;
|
||||
int recv_err = 0;
|
||||
int n, i;
|
||||
|
||||
ASSERT(!c->no_udp && uflow);
|
||||
|
||||
from_s = uflow->s[ref.flowside.sidei];
|
||||
n = udp_sock_recv(c, from_s, events, udp_mh_recv, &recv_err);
|
||||
if (recv_err == -1) {
|
||||
struct flow_common *f = &uflow->f;
|
||||
char estr0[INANY_ADDRSTRLEN], fstr0[INANY_ADDRSTRLEN];
|
||||
char estr1[INANY_ADDRSTRLEN], fstr1[INANY_ADDRSTRLEN];
|
||||
const struct flowside *ini = &f->side[INISIDE];
|
||||
const struct flowside *tgt = &f->side[TGTSIDE];
|
||||
|
||||
flow_err(uflow, "EPOLLERR without error queue, closing flow");
|
||||
err("Last recorded errno was: %i (%s)", uflow->last_errno,
|
||||
strerror(uflow->last_errno));
|
||||
|
||||
flow_log_(f, LOG_ERR,
|
||||
"%s [%s]:%hu -> [%s]:%hu => %s [%s]:%hu -> [%s]:%hu",
|
||||
pif_name(f->pif[INISIDE]),
|
||||
inany_ntop(&ini->eaddr, estr0, sizeof(estr0)),
|
||||
ini->eport,
|
||||
inany_ntop(&ini->oaddr, fstr0, sizeof(fstr0)),
|
||||
ini->oport,
|
||||
pif_name(f->pif[TGTSIDE]),
|
||||
inany_ntop(&tgt->oaddr, fstr1, sizeof(fstr1)),
|
||||
tgt->oport,
|
||||
inany_ntop(&tgt->eaddr, estr1, sizeof(estr1)),
|
||||
tgt->eport);
|
||||
|
||||
if (udp_sock_errs(c, from_s, events) < 0) {
|
||||
flow_err(uflow, "Unrecoverable error on reply socket");
|
||||
flow_err_details(uflow);
|
||||
udp_flow_close(c, uflow);
|
||||
return;
|
||||
}
|
||||
|
||||
if ((n = udp_sock_recv(c, from_s, events, udp_mh_recv)) <= 0)
|
||||
if (recv_err) {
|
||||
struct udp_flow *uflow = udp_at_sidx(udp_meta[0].tosidx);
|
||||
|
||||
uflow->last_errno = recv_err;
|
||||
flow_err(uflow, "Recorded errno %i (%s)", recv_err,
|
||||
strerror(recv_err));
|
||||
}
|
||||
|
||||
if (n <= 0)
|
||||
return;
|
||||
|
||||
flow_trace(uflow, "Received %d datagrams on reply socket", n);
|
||||
|
@ -668,7 +624,7 @@ void udp_reply_sock_handler(const struct ctx *c, union epoll_ref ref,
|
|||
if (pif_is_socket(topif))
|
||||
udp_splice_prepare(udp_mh_recv, i);
|
||||
else if (topif == PIF_TAP)
|
||||
udp_tap_prepare(udp_mh_recv, i, toside, false);
|
||||
udp_tap_prepare(udp_mh_recv, i, toside);
|
||||
/* Restore sockaddr length clobbered by recvmsg() */
|
||||
udp_mh_recv[i].msg_hdr.msg_namelen = sizeof(udp_meta[i].s_in);
|
||||
}
|
||||
|
@ -795,61 +751,69 @@ int udp_tap_handler(const struct ctx *c, uint8_t pif,
|
|||
* udp_sock_init() - Initialise listening sockets for a given port
|
||||
* @c: Execution context
|
||||
* @ns: In pasta mode, if set, bind with loopback address in namespace
|
||||
* @af: Address family to select a specific IP version, or AF_UNSPEC
|
||||
* @addr: Pointer to address for binding, NULL if not configured
|
||||
* @ifname: Name of interface to bind to, NULL if not configured
|
||||
* @port: Port, host order
|
||||
*
|
||||
* Return: 0 on (partial) success, negative error code on (complete) failure
|
||||
*/
|
||||
int udp_sock_init(const struct ctx *c, int ns, const union inany_addr *addr,
|
||||
const char *ifname, in_port_t port)
|
||||
int udp_sock_init(const struct ctx *c, int ns, sa_family_t af,
|
||||
const void *addr, const char *ifname, in_port_t port)
|
||||
{
|
||||
union udp_listen_epoll_ref uref = {
|
||||
.pif = ns ? PIF_SPLICE : PIF_HOST,
|
||||
.port = port,
|
||||
};
|
||||
union udp_listen_epoll_ref uref = { .port = port };
|
||||
int r4 = FD_REF_MAX + 1, r6 = FD_REF_MAX + 1;
|
||||
|
||||
ASSERT(!c->no_udp);
|
||||
|
||||
if (!addr && c->ifi4 && c->ifi6 && !ns) {
|
||||
if (ns)
|
||||
uref.pif = PIF_SPLICE;
|
||||
else
|
||||
uref.pif = PIF_HOST;
|
||||
|
||||
if (af == AF_UNSPEC && c->ifi4 && c->ifi6) {
|
||||
int s;
|
||||
|
||||
/* Attempt to get a dual stack socket */
|
||||
s = pif_sock_l4(c, EPOLL_TYPE_UDP_LISTEN, PIF_HOST,
|
||||
NULL, ifname, port, uref.u32);
|
||||
udp_splice_init[V4][port] = s < 0 ? -1 : s;
|
||||
udp_splice_init[V6][port] = s < 0 ? -1 : s;
|
||||
if (!ns) {
|
||||
s = sock_l4(c, AF_UNSPEC, EPOLL_TYPE_UDP_LISTEN,
|
||||
addr, ifname, port, uref.u32);
|
||||
udp_splice_init[V4][port] = s < 0 ? -1 : s;
|
||||
udp_splice_init[V6][port] = s < 0 ? -1 : s;
|
||||
} else {
|
||||
s = sock_l4(c, AF_UNSPEC, EPOLL_TYPE_UDP_LISTEN,
|
||||
&in4addr_loopback, ifname, port, uref.u32);
|
||||
udp_splice_ns[V4][port] = s < 0 ? -1 : s;
|
||||
udp_splice_ns[V6][port] = s < 0 ? -1 : s;
|
||||
}
|
||||
if (IN_INTERVAL(0, FD_REF_MAX, s))
|
||||
return 0;
|
||||
}
|
||||
|
||||
if ((!addr || inany_v4(addr)) && c->ifi4) {
|
||||
if ((af == AF_INET || af == AF_UNSPEC) && c->ifi4) {
|
||||
if (!ns) {
|
||||
r4 = pif_sock_l4(c, EPOLL_TYPE_UDP_LISTEN, PIF_HOST,
|
||||
addr ? addr : &inany_any4, ifname,
|
||||
port, uref.u32);
|
||||
r4 = sock_l4(c, AF_INET, EPOLL_TYPE_UDP_LISTEN,
|
||||
addr, ifname, port, uref.u32);
|
||||
|
||||
udp_splice_init[V4][port] = r4 < 0 ? -1 : r4;
|
||||
} else {
|
||||
r4 = pif_sock_l4(c, EPOLL_TYPE_UDP_LISTEN, PIF_SPLICE,
|
||||
&inany_loopback4, ifname,
|
||||
port, uref.u32);
|
||||
r4 = sock_l4(c, AF_INET, EPOLL_TYPE_UDP_LISTEN,
|
||||
&in4addr_loopback,
|
||||
ifname, port, uref.u32);
|
||||
udp_splice_ns[V4][port] = r4 < 0 ? -1 : r4;
|
||||
}
|
||||
}
|
||||
|
||||
if ((!addr || !inany_v4(addr)) && c->ifi6) {
|
||||
if ((af == AF_INET6 || af == AF_UNSPEC) && c->ifi6) {
|
||||
if (!ns) {
|
||||
r6 = pif_sock_l4(c, EPOLL_TYPE_UDP_LISTEN, PIF_HOST,
|
||||
addr ? addr : &inany_any6, ifname,
|
||||
port, uref.u32);
|
||||
r6 = sock_l4(c, AF_INET6, EPOLL_TYPE_UDP_LISTEN,
|
||||
addr, ifname, port, uref.u32);
|
||||
|
||||
udp_splice_init[V6][port] = r6 < 0 ? -1 : r6;
|
||||
} else {
|
||||
r6 = pif_sock_l4(c, EPOLL_TYPE_UDP_LISTEN, PIF_SPLICE,
|
||||
&inany_loopback6, ifname,
|
||||
port, uref.u32);
|
||||
r6 = sock_l4(c, AF_INET6, EPOLL_TYPE_UDP_LISTEN,
|
||||
&in6addr_loopback,
|
||||
ifname, port, uref.u32);
|
||||
udp_splice_ns[V6][port] = r6 < 0 ? -1 : r6;
|
||||
}
|
||||
}
|
||||
|
@ -917,7 +881,7 @@ static void udp_port_rebind(struct ctx *c, bool outbound)
|
|||
|
||||
if ((c->ifi4 && socks[V4][port] == -1) ||
|
||||
(c->ifi6 && socks[V6][port] == -1))
|
||||
udp_sock_init(c, outbound, NULL, NULL, port);
|
||||
udp_sock_init(c, outbound, AF_UNSPEC, NULL, NULL, port);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
4
udp.h
4
udp.h
|
@ -16,8 +16,8 @@ void udp_reply_sock_handler(const struct ctx *c, union epoll_ref ref,
|
|||
int udp_tap_handler(const struct ctx *c, uint8_t pif,
|
||||
sa_family_t af, const void *saddr, const void *daddr,
|
||||
const struct pool *p, int idx, const struct timespec *now);
|
||||
int udp_sock_init(const struct ctx *c, int ns, const union inany_addr *addr,
|
||||
const char *ifname, in_port_t port);
|
||||
int udp_sock_init(const struct ctx *c, int ns, sa_family_t af,
|
||||
const void *addr, const char *ifname, in_port_t port);
|
||||
int udp_init(struct ctx *c);
|
||||
void udp_timer(struct ctx *c, const struct timespec *now);
|
||||
void udp_update_l2_buf(const unsigned char *eth_d, const unsigned char *eth_s);
|
||||
|
|
21
udp_flow.c
21
udp_flow.c
|
@ -34,16 +34,13 @@ struct udp_flow *udp_at_sidx(flow_sidx_t sidx)
|
|||
return &flow->udp;
|
||||
}
|
||||
|
||||
/*
|
||||
/**
|
||||
* udp_flow_close() - Close and clean up UDP flow
|
||||
* @c: Execution context
|
||||
* @uflow: UDP flow
|
||||
*/
|
||||
void udp_flow_close(const struct ctx *c, struct udp_flow *uflow)
|
||||
{
|
||||
if (uflow->closed)
|
||||
return; /* Nothing to do */
|
||||
|
||||
if (uflow->s[INISIDE] >= 0) {
|
||||
/* The listening socket needs to stay in epoll */
|
||||
close(uflow->s[INISIDE]);
|
||||
|
@ -56,11 +53,12 @@ void udp_flow_close(const struct ctx *c, struct udp_flow *uflow)
|
|||
close(uflow->s[TGTSIDE]);
|
||||
uflow->s[TGTSIDE] = -1;
|
||||
}
|
||||
|
||||
uflow->last_errno = 0;
|
||||
|
||||
flow_hash_remove(c, FLOW_SIDX(uflow, INISIDE));
|
||||
if (!pif_is_socket(uflow->f.pif[TGTSIDE]))
|
||||
flow_hash_remove(c, FLOW_SIDX(uflow, TGTSIDE));
|
||||
|
||||
uflow->closed = true;
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -261,17 +259,6 @@ flow_sidx_t udp_flow_from_tap(const struct ctx *c,
|
|||
return udp_flow_new(c, flow, -1, now);
|
||||
}
|
||||
|
||||
/**
|
||||
* udp_flow_defer() - Deferred per-flow handling (clean up aborted flows)
|
||||
* @uflow: Flow to handle
|
||||
*
|
||||
* Return: true if the connection is ready to free, false otherwise
|
||||
*/
|
||||
bool udp_flow_defer(const struct udp_flow *uflow)
|
||||
{
|
||||
return uflow->closed;
|
||||
}
|
||||
|
||||
/**
|
||||
* udp_flow_timer() - Handler for timed events related to a given flow
|
||||
* @c: Execution context
|
||||
|
|
|
@ -10,7 +10,6 @@
|
|||
/**
|
||||
* struct udp - Descriptor for a flow of UDP packets
|
||||
* @f: Generic flow information
|
||||
* @closed: Flow is already closed
|
||||
* @ts: Activity timestamp
|
||||
* @s: Socket fd (or -1) for each side of the flow
|
||||
*/
|
||||
|
@ -18,9 +17,10 @@ struct udp_flow {
|
|||
/* Must be first element */
|
||||
struct flow_common f;
|
||||
|
||||
bool closed :1;
|
||||
time_t ts;
|
||||
int s[SIDES];
|
||||
|
||||
int last_errno;
|
||||
};
|
||||
|
||||
struct udp_flow *udp_at_sidx(flow_sidx_t sidx);
|
||||
|
@ -33,7 +33,6 @@ flow_sidx_t udp_flow_from_tap(const struct ctx *c,
|
|||
in_port_t srcport, in_port_t dstport,
|
||||
const struct timespec *now);
|
||||
void udp_flow_close(const struct ctx *c, struct udp_flow *uflow);
|
||||
bool udp_flow_defer(const struct udp_flow *uflow);
|
||||
bool udp_flow_timer(const struct ctx *c, struct udp_flow *uflow,
|
||||
const struct timespec *now);
|
||||
|
||||
|
|
212
util.c
212
util.c
|
@ -28,7 +28,6 @@
|
|||
#include <linux/errqueue.h>
|
||||
#include <getopt.h>
|
||||
|
||||
#include "linux_dep.h"
|
||||
#include "util.h"
|
||||
#include "iov.h"
|
||||
#include "passt.h"
|
||||
|
@ -53,7 +52,6 @@ int sock_l4_sa(const struct ctx *c, enum epoll_type type,
|
|||
{
|
||||
sa_family_t af = ((const struct sockaddr *)sa)->sa_family;
|
||||
union epoll_ref ref = { .type = type, .data = data };
|
||||
bool freebind = false;
|
||||
struct epoll_event ev;
|
||||
int fd, y = 1, ret;
|
||||
uint8_t proto;
|
||||
|
@ -63,11 +61,8 @@ int sock_l4_sa(const struct ctx *c, enum epoll_type type,
|
|||
case EPOLL_TYPE_TCP_LISTEN:
|
||||
proto = IPPROTO_TCP;
|
||||
socktype = SOCK_STREAM | SOCK_NONBLOCK;
|
||||
freebind = c->freebind;
|
||||
break;
|
||||
case EPOLL_TYPE_UDP_LISTEN:
|
||||
freebind = c->freebind;
|
||||
/* fallthrough */
|
||||
case EPOLL_TYPE_UDP_REPLY:
|
||||
proto = IPPROTO_UDP;
|
||||
socktype = SOCK_DGRAM | SOCK_NONBLOCK;
|
||||
|
@ -132,18 +127,6 @@ int sock_l4_sa(const struct ctx *c, enum epoll_type type,
|
|||
}
|
||||
}
|
||||
|
||||
if (freebind) {
|
||||
int level = af == AF_INET ? IPPROTO_IP : IPPROTO_IPV6;
|
||||
int opt = af == AF_INET ? IP_FREEBIND : IPV6_FREEBIND;
|
||||
|
||||
if (setsockopt(fd, level, opt, &y, sizeof(y))) {
|
||||
err_perror("Failed to set %s on socket %i",
|
||||
af == AF_INET ? "IP_FREEBIND"
|
||||
: "IPV6_FREEBIND",
|
||||
fd);
|
||||
}
|
||||
}
|
||||
|
||||
if (bind(fd, sa, sl) < 0) {
|
||||
/* We'll fail to bind to low ports if we don't have enough
|
||||
* capabilities, and we'll fail to bind on already bound ports,
|
||||
|
@ -174,6 +157,58 @@ int sock_l4_sa(const struct ctx *c, enum epoll_type type,
|
|||
|
||||
return fd;
|
||||
}
|
||||
/**
|
||||
* sock_l4() - Create and bind socket for given L4, add to epoll list
|
||||
* @c: Execution context
|
||||
* @af: Address family: AF_INET, AF_INET6, or AF_UNSPEC (needs DUAL_STACK_SOCKETS and no @bind_addr)
|
||||
* @type: epoll type
|
||||
* @bind_addr: Address for binding, NULL for any
|
||||
* @ifname: Interface for binding, NULL for any
|
||||
* @port: Port, host order
|
||||
* @data: epoll reference portion for protocol handlers
|
||||
*
|
||||
* Return: newly created socket, negative error code on failure
|
||||
*/
|
||||
int sock_l4(const struct ctx *c, sa_family_t af, enum epoll_type type,
|
||||
const void *bind_addr, const char *ifname, uint16_t port,
|
||||
uint32_t data)
|
||||
{
|
||||
switch (af) {
|
||||
case AF_INET: {
|
||||
struct sockaddr_in addr4 = {
|
||||
.sin_family = AF_INET,
|
||||
.sin_port = htons(port),
|
||||
{ 0 }, { 0 },
|
||||
};
|
||||
if (bind_addr)
|
||||
addr4.sin_addr = *(struct in_addr *)bind_addr;
|
||||
return sock_l4_sa(c, type, &addr4, sizeof(addr4), ifname,
|
||||
false, data);
|
||||
}
|
||||
|
||||
case AF_UNSPEC:
|
||||
if (!DUAL_STACK_SOCKETS || bind_addr)
|
||||
return -EINVAL;
|
||||
/* fallthrough */
|
||||
case AF_INET6: {
|
||||
struct sockaddr_in6 addr6 = {
|
||||
.sin6_family = AF_INET6,
|
||||
.sin6_port = htons(port),
|
||||
0, IN6ADDR_ANY_INIT, 0,
|
||||
};
|
||||
if (bind_addr) {
|
||||
addr6.sin6_addr = *(struct in6_addr *)bind_addr;
|
||||
|
||||
if (IN6_IS_ADDR_LINKLOCAL(bind_addr))
|
||||
addr6.sin6_scope_id = c->ifi6;
|
||||
}
|
||||
return sock_l4_sa(c, type, &addr6, sizeof(addr6), ifname,
|
||||
af == AF_INET6, data);
|
||||
}
|
||||
default:
|
||||
return -EINVAL;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* sock_probe_mem() - Check if setting high SO_SNDBUF and SO_RCVBUF is allowed
|
||||
|
@ -184,8 +219,7 @@ void sock_probe_mem(struct ctx *c)
|
|||
int v = INT_MAX / 2, s;
|
||||
socklen_t sl;
|
||||
|
||||
s = socket(AF_INET, SOCK_STREAM | SOCK_CLOEXEC, IPPROTO_TCP);
|
||||
if (s < 0) {
|
||||
if ((s = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP)) < 0) {
|
||||
c->low_wmem = c->low_rmem = 1;
|
||||
return;
|
||||
}
|
||||
|
@ -215,7 +249,7 @@ void sock_probe_mem(struct ctx *c)
|
|||
int64_t timespec_diff_us(const struct timespec *a, const struct timespec *b)
|
||||
{
|
||||
if (a->tv_nsec < b->tv_nsec) {
|
||||
return (a->tv_nsec + 1000000000 - b->tv_nsec) / 1000 +
|
||||
return (b->tv_nsec - a->tv_nsec) / 1000 +
|
||||
(a->tv_sec - b->tv_sec - 1) * 1000000;
|
||||
}
|
||||
|
||||
|
@ -409,20 +443,25 @@ void pidfile_write(int fd, pid_t pid)
|
|||
}
|
||||
|
||||
/**
|
||||
* output_file_open() - Open file for output, if needed
|
||||
* @path: Path for output file
|
||||
* @flags: Flags for open() other than O_CREAT, O_TRUNC, O_CLOEXEC
|
||||
* pidfile_open() - Open PID file if needed
|
||||
* @path: Path for PID file, empty string if no PID file is requested
|
||||
*
|
||||
* Return: file descriptor on success, -1 on failure with errno set by open()
|
||||
* Return: descriptor for PID file, -1 if path is NULL, won't return on failure
|
||||
*/
|
||||
int output_file_open(const char *path, int flags)
|
||||
int pidfile_open(const char *path)
|
||||
{
|
||||
/* We use O_CLOEXEC here, but clang-tidy as of LLVM 16 to 19 looks for
|
||||
* it in the 'mode' argument if we have one
|
||||
*/
|
||||
return open(path, O_CREAT | O_TRUNC | O_CLOEXEC | flags,
|
||||
/* NOLINTNEXTLINE(android-cloexec-open) */
|
||||
S_IRUSR | S_IWUSR);
|
||||
int fd;
|
||||
|
||||
if (!*path)
|
||||
return -1;
|
||||
|
||||
if ((fd = open(path, O_CREAT | O_TRUNC | O_WRONLY | O_CLOEXEC,
|
||||
S_IRUSR | S_IWUSR)) < 0) {
|
||||
perror("PID file open");
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
|
||||
return fd;
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -446,11 +485,16 @@ int __daemon(int pidfile_fd, int devnull_fd)
|
|||
exit(EXIT_SUCCESS);
|
||||
}
|
||||
|
||||
if (setsid() < 0 ||
|
||||
dup2(devnull_fd, STDIN_FILENO) < 0 ||
|
||||
dup2(devnull_fd, STDOUT_FILENO) < 0 ||
|
||||
dup2(devnull_fd, STDERR_FILENO) < 0 ||
|
||||
close(devnull_fd))
|
||||
errno = 0;
|
||||
|
||||
setsid();
|
||||
|
||||
dup2(devnull_fd, STDIN_FILENO);
|
||||
dup2(devnull_fd, STDOUT_FILENO);
|
||||
dup2(devnull_fd, STDERR_FILENO);
|
||||
close(devnull_fd);
|
||||
|
||||
if (errno)
|
||||
exit(EXIT_FAILURE);
|
||||
|
||||
return 0;
|
||||
|
@ -538,36 +582,6 @@ int do_clone(int (*fn)(void *), char *stack_area, size_t stack_size, int flags,
|
|||
#endif
|
||||
}
|
||||
|
||||
/* write_all_buf() - write all of a buffer to an fd
|
||||
* @fd: File descriptor
|
||||
* @buf: Pointer to base of buffer
|
||||
* @len: Length of buffer
|
||||
*
|
||||
* Return: 0 on success, -1 on error (with errno set)
|
||||
*
|
||||
* #syscalls write
|
||||
*/
|
||||
int write_all_buf(int fd, const void *buf, size_t len)
|
||||
{
|
||||
const char *p = buf;
|
||||
size_t left = len;
|
||||
|
||||
while (left) {
|
||||
ssize_t rc;
|
||||
|
||||
do
|
||||
rc = write(fd, p, left);
|
||||
while ((rc < 0) && errno == EINTR);
|
||||
|
||||
if (rc < 0)
|
||||
return -1;
|
||||
|
||||
p += rc;
|
||||
left -= rc;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* write_remainder() - write the tail of an IO vector to an fd
|
||||
* @fd: File descriptor
|
||||
* @iov: IO vector
|
||||
|
@ -576,30 +590,28 @@ int write_all_buf(int fd, const void *buf, size_t len)
|
|||
*
|
||||
* Return: 0 on success, -1 on error (with errno set)
|
||||
*
|
||||
* #syscalls writev
|
||||
* #syscalls write writev
|
||||
*/
|
||||
int write_remainder(int fd, const struct iovec *iov, size_t iovcnt, size_t skip)
|
||||
{
|
||||
size_t i = 0, offset;
|
||||
size_t offset, i;
|
||||
|
||||
while ((i += iov_skip_bytes(iov + i, iovcnt - i, skip, &offset)) < iovcnt) {
|
||||
while ((i = iov_skip_bytes(iov, iovcnt, skip, &offset)) < iovcnt) {
|
||||
ssize_t rc;
|
||||
|
||||
if (offset) {
|
||||
/* Write the remainder of the partially written buffer */
|
||||
if (write_all_buf(fd, (char *)iov[i].iov_base + offset,
|
||||
iov[i].iov_len - offset) < 0)
|
||||
return -1;
|
||||
i++;
|
||||
rc = write(fd, (char *)iov[i].iov_base + offset,
|
||||
iov[i].iov_len - offset);
|
||||
} else {
|
||||
rc = writev(fd, &iov[i], iovcnt - i);
|
||||
}
|
||||
|
||||
/* Write as much of the remaining whole buffers as we can */
|
||||
rc = writev(fd, &iov[i], iovcnt - i);
|
||||
if (rc < 0)
|
||||
return -1;
|
||||
|
||||
skip = rc;
|
||||
skip += rc;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
@ -738,48 +750,6 @@ void close_open_files(int argc, char **argv)
|
|||
rc = close_range(fd + 1, ~0U, CLOSE_RANGE_UNSHARE);
|
||||
}
|
||||
|
||||
if (rc) {
|
||||
if (errno == ENOSYS || errno == EINVAL) {
|
||||
/* This probably means close_range() or the
|
||||
* CLOSE_RANGE_UNSHARE flag is not supported by the
|
||||
* kernel. Not much we can do here except carry on and
|
||||
* hope for the best.
|
||||
*/
|
||||
warn(
|
||||
"Can't use close_range() to ensure no files leaked by parent");
|
||||
} else {
|
||||
die_perror("Failed to close files leaked by parent");
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* snprintf_check() - snprintf() wrapper, checking for truncation and errors
|
||||
* @str: Output buffer
|
||||
* @size: Maximum size to write to @str
|
||||
* @format: Message
|
||||
*
|
||||
* Return: false on success, true on truncation or error, sets errno on failure
|
||||
*/
|
||||
bool snprintf_check(char *str, size_t size, const char *format, ...)
|
||||
{
|
||||
va_list ap;
|
||||
int rc;
|
||||
|
||||
va_start(ap, format);
|
||||
rc = vsnprintf(str, size, format, ap);
|
||||
va_end(ap);
|
||||
|
||||
if (rc < 0) {
|
||||
errno = EIO;
|
||||
return true;
|
||||
}
|
||||
|
||||
if ((size_t)rc >= size) {
|
||||
errno = ENOBUFS;
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
if (rc)
|
||||
die_perror("Failed to close files leaked by parent");
|
||||
}
|
||||
|
|
54
util.h
54
util.h
|
@ -11,12 +11,12 @@
|
|||
#include <stdbool.h>
|
||||
#include <stddef.h>
|
||||
#include <stdint.h>
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
#include <signal.h>
|
||||
#include <arpa/inet.h>
|
||||
#include <unistd.h>
|
||||
#include <sys/syscall.h>
|
||||
#include <linux/close_range.h>
|
||||
|
||||
#include "log.h"
|
||||
|
||||
|
@ -67,15 +67,6 @@
|
|||
#define STRINGIFY(x) #x
|
||||
#define STR(x) STRINGIFY(x)
|
||||
|
||||
#ifdef CPPCHECK_6936
|
||||
/* Some cppcheck versions get confused by aborts inside a loop, causing
|
||||
* it to give false positive uninitialised variable warnings later in
|
||||
* the function, because it doesn't realise the non-initialising path
|
||||
* already exited. See https://trac.cppcheck.net/ticket/13227
|
||||
*/
|
||||
#define ASSERT(expr) \
|
||||
((expr) ? (void)0 : abort())
|
||||
#else
|
||||
#define ASSERT(expr) \
|
||||
do { \
|
||||
if (!(expr)) { \
|
||||
|
@ -87,7 +78,6 @@
|
|||
abort(); \
|
||||
} \
|
||||
} while (0)
|
||||
#endif
|
||||
|
||||
#ifdef P_tmpdir
|
||||
#define TMPDIR P_tmpdir
|
||||
|
@ -101,9 +91,6 @@
|
|||
|
||||
#define ARRAY_SIZE(a) ((int)(sizeof(a) / sizeof((a)[0])))
|
||||
|
||||
#define foreach(item, array) \
|
||||
for ((item) = (array); (item) - (array) < ARRAY_SIZE(array); (item)++)
|
||||
|
||||
#define IN_INTERVAL(a, b, x) ((x) >= (a) && (x) <= (b))
|
||||
#define FD_PROTO(x, proto) \
|
||||
(IN_INTERVAL(c->proto.fd_min, c->proto.fd_max, (x)))
|
||||
|
@ -144,7 +131,7 @@ static inline uint32_t ntohl_unaligned(const void *p)
|
|||
return ntohl(val);
|
||||
}
|
||||
|
||||
#define NS_FN_STACK_SIZE (1024 * 1024) /* 1MiB */
|
||||
#define NS_FN_STACK_SIZE (RLIMIT_STACK_VAL * 1024 / 8)
|
||||
int do_clone(int (*fn)(void *), char *stack_area, size_t stack_size, int flags,
|
||||
void *arg);
|
||||
#define NS_CALL(fn, arg) \
|
||||
|
@ -157,9 +144,9 @@ int do_clone(int (*fn)(void *), char *stack_area, size_t stack_size, int flags,
|
|||
(void *)(arg)); \
|
||||
} while (0)
|
||||
|
||||
#define RCVBUF_BIG (2ULL * 1024 * 1024)
|
||||
#define SNDBUF_BIG (4ULL * 1024 * 1024)
|
||||
#define SNDBUF_SMALL (128ULL * 1024)
|
||||
#define RCVBUF_BIG (2UL * 1024 * 1024)
|
||||
#define SNDBUF_BIG (4UL * 1024 * 1024)
|
||||
#define SNDBUF_SMALL (128UL * 1024)
|
||||
|
||||
#include <net/if.h>
|
||||
#include <limits.h>
|
||||
|
@ -170,9 +157,33 @@ int do_clone(int (*fn)(void *), char *stack_area, size_t stack_size, int flags,
|
|||
|
||||
struct ctx;
|
||||
|
||||
/* cppcheck-suppress funcArgNamesDifferent */
|
||||
__attribute__ ((weak)) int ffsl(long int i) { return __builtin_ffsl(i); }
|
||||
|
||||
#ifdef CLOSE_RANGE_UNSHARE /* Linux kernel >= 5.9 */
|
||||
/* glibc < 2.34 and musl as of 1.2.5 need these */
|
||||
#ifndef SYS_close_range
|
||||
#define SYS_close_range 436
|
||||
#endif
|
||||
__attribute__ ((weak))
|
||||
/* cppcheck-suppress funcArgNamesDifferent */
|
||||
int close_range(unsigned int first, unsigned int last, int flags) {
|
||||
return syscall(SYS_close_range, first, last, flags);
|
||||
}
|
||||
#else
|
||||
/* No reasonable fallback option */
|
||||
/* cppcheck-suppress funcArgNamesDifferent */
|
||||
int close_range(unsigned int first, unsigned int last, int flags) {
|
||||
return 0;
|
||||
}
|
||||
#endif
|
||||
|
||||
int sock_l4_sa(const struct ctx *c, enum epoll_type type,
|
||||
const void *sa, socklen_t sl,
|
||||
const char *ifname, bool v6only, uint32_t data);
|
||||
int sock_l4(const struct ctx *c, sa_family_t af, enum epoll_type type,
|
||||
const void *bind_addr, const char *ifname, uint16_t port,
|
||||
uint32_t data);
|
||||
void sock_probe_mem(struct ctx *c);
|
||||
long timespec_diff_ms(const struct timespec *a, const struct timespec *b);
|
||||
int64_t timespec_diff_us(const struct timespec *a, const struct timespec *b);
|
||||
|
@ -184,15 +195,13 @@ char *line_read(char *buf, size_t len, int fd);
|
|||
void ns_enter(const struct ctx *c);
|
||||
bool ns_is_init(void);
|
||||
int open_in_ns(const struct ctx *c, const char *path, int flags);
|
||||
int output_file_open(const char *path, int flags);
|
||||
int pidfile_open(const char *path);
|
||||
void pidfile_write(int fd, pid_t pid);
|
||||
int __daemon(int pidfile_fd, int devnull_fd);
|
||||
int fls(unsigned long x);
|
||||
int write_file(const char *path, const char *buf);
|
||||
int write_all_buf(int fd, const void *buf, size_t len);
|
||||
int write_remainder(int fd, const struct iovec *iov, size_t iovcnt, size_t skip);
|
||||
void close_open_files(int argc, char **argv);
|
||||
bool snprintf_check(char *str, size_t size, const char *format, ...);
|
||||
|
||||
/**
|
||||
* af_name() - Return name of an address family
|
||||
|
@ -260,9 +269,6 @@ static inline bool mod_between(unsigned x, unsigned i, unsigned j, unsigned m)
|
|||
return mod_sub(x, i, m) < mod_sub(j, i, m);
|
||||
}
|
||||
|
||||
/* FPRINTF() intentionally silences cert-err33-c clang-tidy warnings */
|
||||
#define FPRINTF(f, ...) (void)fprintf(f, __VA_ARGS__)
|
||||
|
||||
/*
|
||||
* Workarounds for https://github.com/llvm/llvm-project/issues/58992
|
||||
*
|
||||
|
|
Loading…
Reference in a new issue